Diffstat
-rw-r--r-- | libbb/bitops.c | 128
1 file changed, 128 insertions, 0 deletions
diff --git a/libbb/bitops.c b/libbb/bitops.c
new file mode 100644
index 000000000..467e1a2d9
--- /dev/null
+++ b/libbb/bitops.c
@@ -0,0 +1,128 @@
/*
 * Utility routines.
 *
 * Copyright (C) 2025 by Denys Vlasenko <vda.linux@googlemail.com>
 *
 * Licensed under GPLv2, see file LICENSE in this source tree.
 */
//kbuild:lib-y += bitops.o

#include "libbb.h"
/* dst[i] = src1[i] ^ src2[i] for count bytes */
void FAST_FUNC xorbuf_3(void *dst, const void *src1, const void *src2, unsigned count)
{
	uint8_t *d = dst;
	const uint8_t *s1 = src1;
	const uint8_t *s2 = src2;
#if BB_UNALIGNED_MEMACCESS_OK
	/* XOR a long's worth of bytes at a time */
	while (count >= sizeof(long)) {
		*(long*)d = *(long*)s1 ^ *(long*)s2;
		count -= sizeof(long);
		d += sizeof(long);
		s1 += sizeof(long);
		s2 += sizeof(long);
	}
#endif
	/* the tail (or the whole buffer, if unaligned accesses are not ok) */
	while (count--)
		*d++ = *s1++ ^ *s2++;
}

/* dst[i] ^= src[i] for count bytes */
void FAST_FUNC xorbuf(void *dst, const void *src, unsigned count)
{
	xorbuf_3(dst, dst, src, count);
}

/* XOR 16 bytes: dst[0..15] ^= src[0..15], both pointers long-aligned */
void FAST_FUNC xorbuf16_aligned_long(void *dst, const void *src)
{
#if defined(__SSE__) /* any x86_64 has it */
	asm volatile(
"\n	movups	(%0),%%xmm0"
"\n	movups	(%1),%%xmm1"	// can't just "xorps (%1),%%xmm0":
"\n	xorps	%%xmm1,%%xmm0"	// SSE requires 16-byte alignment
"\n	movups	%%xmm0,(%0)"
"\n"
	: "=r" (dst), "=r" (src)
	: "0" (dst), "1" (src)
	: "xmm0", "xmm1", "memory"
	);
#else
	unsigned long *d = dst;
	const unsigned long *s = src;
	d[0] ^= s[0];
# if LONG_MAX <= 0x7fffffffffffffff /* long is 64-bit or smaller: need d[1] too */
	d[1] ^= s[1];
#  if LONG_MAX == 0x7fffffff /* long is 32-bit: need d[2] and d[3] as well */
	d[2] ^= s[2];
	d[3] ^= s[3];
#  endif
# endif
#endif
}
// The above can be inlined in libbb.h, in a way where the compiler
// is even free to use better addressing modes than (%reg), and
// to keep the result in a register
// (to not store it to memory after each XOR):
//#if defined(__SSE__)
//#include <xmmintrin.h>
//^^^ or just: typedef float __m128_u __attribute__((__vector_size__(16),__may_alias__,__aligned__(1)));
//static ALWAYS_INLINE void xorbuf16_aligned_long(void *dst, const void *src)
//{
//	__m128_u xmm0, xmm1;
//	asm volatile(
//"\n	xorps	%1,%0"
//	: "=x" (xmm0), "=x" (xmm1)
//	: "0" (*(__m128_u*)dst), "1" (*(__m128_u*)src)
//	);
//	*(__m128_u*)dst = xmm0; // this store may be optimized out!
//}
//#endif
// but I don't trust the gcc optimizer enough to not generate some monstrosity.
// See the GMULT() function in the TLS code as an example.

/* XOR 64 bytes: dst[i] = src1[i] ^ src2[i]; all pointers 8-byte aligned */
void FAST_FUNC xorbuf64_3_aligned64(void *dst, const void *src1, const void *src2)
{
#if defined(__SSE__) /* any x86_64 has it */
	asm volatile(
"\n	movups	0*16(%1),%%xmm0"
"\n	movups	0*16(%2),%%xmm1"	// can't just "xorps 0*16(%2),%%xmm0":
"\n	xorps	%%xmm1,%%xmm0"	// SSE requires 16-byte alignment, we have only 8-byte
"\n	movups	%%xmm0,0*16(%0)"
"\n	movups	1*16(%1),%%xmm0"
"\n	movups	1*16(%2),%%xmm1"
"\n	xorps	%%xmm1,%%xmm0"
"\n	movups	%%xmm0,1*16(%0)"
"\n	movups	2*16(%1),%%xmm0"
"\n	movups	2*16(%2),%%xmm1"
"\n	xorps	%%xmm1,%%xmm0"
"\n	movups	%%xmm0,2*16(%0)"
"\n	movups	3*16(%1),%%xmm0"
"\n	movups	3*16(%2),%%xmm1"
"\n	xorps	%%xmm1,%%xmm0"
"\n	movups	%%xmm0,3*16(%0)"
"\n"
	: "=r" (dst), "=r" (src1), "=r" (src2)
	: "0" (dst), "1" (src1), "2" (src2)
	: "xmm0", "xmm1", "memory"
	);
#else
	long *d = dst;
	const long *s1 = src1;
	const long *s2 = src2;
	unsigned count = 64 / sizeof(long);
	do {
		*d++ = *s1++ ^ *s2++;
	} while (--count != 0);
#endif
}

#if !BB_UNALIGNED_MEMACCESS_OK
/* XOR 16 bytes at arbitrarily aligned pointers */
void FAST_FUNC xorbuf16(void *dst, const void *src)
{
#define p_aligned(a) (((uintptr_t)(a) & (sizeof(long)-1)) == 0)
	if (p_aligned(src) && p_aligned(dst)) {
		xorbuf16_aligned_long(dst, src);
		return;
	}
	xorbuf_3(dst, dst, src, 16);
}
#endif
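
For context (not part of the commit): helpers like these are typically used to XOR a cipher keystream into a data block, as stream- and CTR-mode code does. Below is a minimal standalone sketch of that calling pattern; the byte-wise xorbuf() stub stands in for the libbb version so the snippet compiles on its own, and the keystream bytes are made up for illustration.

#include <stdio.h>
#include <stdint.h>

/* stand-in for libbb's xorbuf(); the real one lives in libbb/bitops.c */
static void xorbuf(void *dst, const void *src, unsigned count)
{
	uint8_t *d = dst;
	const uint8_t *s = src;
	while (count--)
		*d++ ^= *s++;
}

int main(void)
{
	uint8_t block[16] = "attack at dawn!";	/* 15 chars + NUL = one 16-byte block */
	uint8_t keystream[16] = {		/* made-up keystream bytes */
		0x1f,0x2e,0x3d,0x4c,0x5b,0x6a,0x79,0x88,
		0x97,0xa6,0xb5,0xc4,0xd3,0xe2,0xf1,0x0f,
	};

	xorbuf(block, keystream, 16);	/* "encrypt": block ^= keystream */
	xorbuf(block, keystream, 16);	/* "decrypt": XOR is its own inverse */
	printf("%s\n", block);		/* prints the original plaintext */
	return 0;
}

Applying the same keystream twice restores the plaintext, which is why stream-cipher encryption and decryption are the same operation, and why a single xorbuf() serves both directions.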