Diffstat:
 -rw-r--r--  libbb/bitops.c  128
 1 file changed, 128 insertions, 0 deletions
diff --git a/libbb/bitops.c b/libbb/bitops.c
new file mode 100644
index 000000000..467e1a2d9
--- /dev/null
+++ b/libbb/bitops.c
@@ -0,0 +1,128 @@
/*
 * Utility routines.
 *
 * Copyright (C) 2025 by Denys Vlasenko <vda.linux@googlemail.com>
 *
 * Licensed under GPLv2, see file LICENSE in this source tree.
 */
//kbuild:lib-y += bitops.o

#include "libbb.h"

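/* XOR 'count' bytes of src1 with src2 into dst. dst may be the same
 * pointer as src1 (xorbuf() below relies on that). Uses long-sized XORs
 * where unaligned accesses are known to be safe, byte XORs otherwise.
 */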
void FAST_FUNC xorbuf_3(void *dst, const void *src1, const void *src2, unsigned count)
{
	uint8_t *d = dst;
	const uint8_t *s1 = src1;
	const uint8_t *s2 = src2;
#if BB_UNALIGNED_MEMACCESS_OK
	while (count >= sizeof(long)) {
		*(long*)d = *(long*)s1 ^ *(long*)s2;
		count -= sizeof(long);
		d += sizeof(long);
		s1 += sizeof(long);
		s2 += sizeof(long);
	}
#endif
	while (count--)
		*d++ = *s1++ ^ *s2++;
}

void FAST_FUNC xorbuf(void *dst, const void *src, unsigned count)
{
	xorbuf_3(dst, dst, src, count);
}

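/* XOR 16 bytes of src into dst. Both pointers must be at least
 * long-aligned: the non-SSE fallback dereferences them as unsigned long[],
 * while the SSE path uses movups, which tolerates any alignment.
 */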
void FAST_FUNC xorbuf16_aligned_long(void *dst, const void *src)
{
#if defined(__SSE__) /* any x86_64 has it */
	asm volatile(
"\n	movups	(%0),%%xmm0"
"\n	movups	(%1),%%xmm1"	// can't just xorps(%1),%%xmm0:
"\n	xorps	%%xmm1,%%xmm0"	// SSE requires 16-byte alignment
"\n	movups	%%xmm0,(%0)"
"\n"
	: "=r" (dst), "=r" (src)
	: "0" (dst), "1" (src)
	: "xmm0", "xmm1", "memory"
	);
#else
	unsigned long *d = dst;
	const unsigned long *s = src;
	d[0] ^= s[0];
# if LONG_MAX <= 0x7fffffffffffffff /* long is 64 bits or narrower: need d[1] */
	d[1] ^= s[1];
#  if LONG_MAX == 0x7fffffff /* 32-bit long: d[2] and d[3] as well */
	d[2] ^= s[2];
	d[3] ^= s[3];
#  endif
# endif
#endif
}
// The above can be inlined in libbb.h, in a way where the compiler
// is even free to use better addressing modes than (%reg), and
// to keep the result in a register
// (to not store it to memory after each XOR):
//#if defined(__SSE__)
//#include <xmmintrin.h>
//^^^ or just: typedef float __m128_u __attribute__((__vector_size__(16),__may_alias__,__aligned__(1)));
//static ALWAYS_INLINE void xorbuf16_aligned_long(void *dst, const void *src)
//{
//	__m128_u xmm0, xmm1;
//	asm volatile(
//"\n	xorps	%1,%0"
//	: "=x" (xmm0), "=x" (xmm1)
//	: "0" (*(__m128_u*)dst), "1" (*(__m128_u*)src)
//	);
//	*(__m128_u*)dst = xmm0; // this store may be optimized out!
//}
//#endif
// but I don't trust gcc optimizer enough to not generate some monstrosity.
// See GMULT() function in TLS code as an example.

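/* XOR 64 bytes of src1 with src2 into dst. All three pointers must be
 * long-aligned; only 8-byte alignment is guaranteed, hence movups
 * rather than movaps on the SSE path.
 */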
void FAST_FUNC xorbuf64_3_aligned64(void *dst, const void *src1, const void *src2)
{
#if defined(__SSE__) /* any x86_64 has it */
	asm volatile(
"\n	movups	0*16(%1),%%xmm0"
"\n	movups	0*16(%2),%%xmm1"	// can't just xorps(%2),%%xmm0:
"\n	xorps	%%xmm1,%%xmm0"	// SSE requires 16-byte alignment, we have only 8-byte
"\n	movups	%%xmm0,0*16(%0)"
"\n	movups	1*16(%1),%%xmm0"
"\n	movups	1*16(%2),%%xmm1"
"\n	xorps	%%xmm1,%%xmm0"
"\n	movups	%%xmm0,1*16(%0)"
"\n	movups	2*16(%1),%%xmm0"
"\n	movups	2*16(%2),%%xmm1"
"\n	xorps	%%xmm1,%%xmm0"
"\n	movups	%%xmm0,2*16(%0)"
"\n	movups	3*16(%1),%%xmm0"
"\n	movups	3*16(%2),%%xmm1"
"\n	xorps	%%xmm1,%%xmm0"
"\n	movups	%%xmm0,3*16(%0)"
"\n"
	: "=r" (dst), "=r" (src1), "=r" (src2)
	: "0" (dst), "1" (src1), "2" (src2)
	: "xmm0", "xmm1", "memory"
	);
#else
	long *d = dst;
	const long *s1 = src1;
	const long *s2 = src2;
	unsigned count = 64 / sizeof(long);
	do {
		*d++ = *s1++ ^ *s2++;
	} while (--count != 0);
#endif
}

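/* Run-time-checked 16-byte XOR for targets where unaligned long accesses
 * are not safe: take the word-sized path only when both pointers happen
 * to be long-aligned, else XOR byte-by-byte via xorbuf_3().
 */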
#if !BB_UNALIGNED_MEMACCESS_OK
void FAST_FUNC xorbuf16(void *dst, const void *src)
{
#define p_aligned(a) (((uintptr_t)(a) & (sizeof(long)-1)) == 0)
	if (p_aligned(src) && p_aligned(dst)) {
		xorbuf16_aligned_long(dst, src);
		return;
	}
	xorbuf_3(dst, dst, src, 16);
}
#endif
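
For context, a minimal caller sketch (the function and buffer names below are illustrative assumptions, not part of this file): xorbuf() handles arbitrary lengths and alignments, while the 16-byte variant suits hot per-block paths such as applying a cipher keystream.

#include "libbb.h"

/* Hypothetical example: XOR a keystream into a message buffer.
 * Whole 16-byte blocks go through the fast path when both pointers
 * are long-aligned (mirroring the check in xorbuf16()); the tail and
 * any unaligned case fall back to the generic routine.
 */
static void xor_keystream(uint8_t *msg, const uint8_t *stream, unsigned len)
{
	while (len >= 16
	 && ((uintptr_t)msg | (uintptr_t)stream) % sizeof(long) == 0
	) {
		xorbuf16_aligned_long(msg, stream);
		msg += 16;
		stream += 16;
		len -= 16;
	}
	xorbuf(msg, stream, len); /* remaining bytes, any alignment */
}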