/*
* Utility routines.
*
* Copyright (C) 2025 by Denys Vlasenko <vda.linux@googlemail.com>
*
* Licensed under GPLv2, see file LICENSE in this source tree.
*/
//kbuild:lib-y += bitops.o
#include "libbb.h"
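
/* xorbuf_3: dst = src1 ^ src2 over 'count' bytes.
 * dst may be the same pointer as src1 and/or src2.
 * If unaligned word accesses are ok, XOR a long at a time,
 * then finish the tail (or do everything) bytewise.
 */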
void FAST_FUNC xorbuf_3(void *dst, const void *src1, const void *src2, unsigned count)
{
	uint8_t *d = dst;
	const uint8_t *s1 = src1;
	const uint8_t *s2 = src2;
#if BB_UNALIGNED_MEMACCESS_OK
	while (count >= sizeof(long)) {
		*(long*)d = *(long*)s1 ^ *(long*)s2;
		count -= sizeof(long);
		d += sizeof(long);
		s1 += sizeof(long);
		s2 += sizeof(long);
	}
#endif
	while (count--)
		*d++ = *s1++ ^ *s2++;
}
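
/* xorbuf: dst ^= src over 'count' bytes (in-place variant of xorbuf_3) */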
void FAST_FUNC xorbuf(void *dst, const void *src, unsigned count)
{
	xorbuf_3(dst, dst, src, count);
}
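
/* xorbuf16_aligned_long: XOR 16 bytes of src into dst.
 * Callers guarantee only long (word) alignment, not 16-byte alignment,
 * hence movups + register xorps on SSE instead of a memory-operand xorps.
 */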
void FAST_FUNC xorbuf16_aligned_long(void *dst, const void *src)
{
#if defined(__SSE__) /* any x86_64 has it */
	asm volatile(
"\n movups (%0),%%xmm0"
"\n movups (%1),%%xmm1" // can't just xorps(%1),%%xmm0:
"\n xorps %%xmm1,%%xmm0" // SSE requires 16-byte alignment
"\n movups %%xmm0,(%0)"
"\n"
: "=r" (dst), "=r" (src)
: "0" (dst), "1" (src)
: "xmm0", "xmm1", "memory"
);
#else
	unsigned long *d = dst;
	const unsigned long *s = src;
	d[0] ^= s[0];
# if LONG_MAX <= 0x7fffffffffffffff
	d[1] ^= s[1];
# if LONG_MAX == 0x7fffffff
	d[2] ^= s[2];
	d[3] ^= s[3];
# endif
# endif
#endif
}

// The above can be inlined in libbb.h, in a way where the compiler
// is even free to use better addressing modes than (%reg), and
// to keep the result in a register
// (so that it is not stored to memory after each XOR):
//#if defined(__SSE__)
//#include <xmmintrin.h>
//^^^ or just: typedef float __m128_u __attribute__((__vector_size__(16),__may_alias__,__aligned__(1)));
//static ALWAYS_INLINE void xorbuf16_aligned_long(void *dst, const void *src)
//{
// __m128_u xmm0, xmm1;
// asm volatile(
//"\n xorps %1,%0"
// : "=x" (xmm0), "=x" (xmm1)
// : "0" (*(__m128_u*)dst), "1" (*(__m128_u*)src)
// );
// *(__m128_u*)dst = xmm0; // this store may be optimized out!
//}
//#endif
// but I don't trust the gcc optimizer enough not to generate some monstrosity.
// See GMULT() function in TLS code as an example.
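
/* xorbuf64_3_aligned64: dst = src1 ^ src2 over 64 bytes.
 * Buffers are 64-bit (8-byte) aligned, which is not enough for a
 * memory-operand xorps, hence the movups loads.
 */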
void FAST_FUNC xorbuf64_3_aligned64(void *dst, const void *src1, const void *src2)
{
#if defined(__SSE__) /* any x86_64 has it */
	asm volatile(
"\n movups 0*16(%1),%%xmm0"
"\n movups 0*16(%2),%%xmm1" // can't just xorps(%2),%%xmm0:
"\n xorps %%xmm1,%%xmm0" // SSE requires 16-byte alignment, we have only 8-byte
"\n movups %%xmm0,0*16(%0)"
"\n movups 1*16(%1),%%xmm0"
"\n movups 1*16(%2),%%xmm1"
"\n xorps %%xmm1,%%xmm0"
"\n movups %%xmm0,1*16(%0)"
"\n movups 2*16(%1),%%xmm0"
"\n movups 2*16(%2),%%xmm1"
"\n xorps %%xmm1,%%xmm0"
"\n movups %%xmm0,2*16(%0)"
"\n movups 3*16(%1),%%xmm0"
"\n movups 3*16(%2),%%xmm1"
"\n xorps %%xmm1,%%xmm0"
"\n movups %%xmm0,3*16(%0)"
"\n"
: "=r" (dst), "=r" (src1), "=r" (src2)
: "0" (dst), "1" (src1), "2" (src2)
: "xmm0", "xmm1", "memory"
);
#else
	long *d = dst;
	const long *s1 = src1;
	const long *s2 = src2;
	unsigned count = 64 / sizeof(long);
	do {
		*d++ = *s1++ ^ *s2++;
	} while (--count != 0);
#endif
}
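
/* On platforms where unaligned word accesses are not ok, provide an
 * out-of-line xorbuf16() which checks pointer alignment at run time and
 * falls back to the bytewise xorbuf_3(). (Where unaligned accesses are ok,
 * xorbuf16() is presumably provided elsewhere, e.g. in libbb.h; that
 * definition is not part of this file.)
 */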
#if !BB_UNALIGNED_MEMACCESS_OK
void FAST_FUNC xorbuf16(void *dst, const void *src)
{
#define p_aligned(a) (((uintptr_t)(a) & (sizeof(long)-1)) == 0)
	if (p_aligned(src) && p_aligned(dst)) {
		xorbuf16_aligned_long(dst, src);
		return;
	}
	xorbuf_3(dst, dst, src, 16);
}
#endif
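
// Usage sketch (illustration only, not part of this file; buffer names
// are hypothetical):
//
//	uint8_t data[16], keystream[16];
//	uint8_t out[64], a[64], b[64];
//	...fill the buffers...
//	xorbuf(data, keystream, 16);      // data ^= keystream, any alignment
//	xorbuf_3(out, a, b, sizeof(out)); // out = a ^ b, any alignment and size
//	// The fixed-size helpers additionally require aligned buffers:
//	//xorbuf16_aligned_long(data, keystream);
//	//xorbuf64_3_aligned64(out, a, b);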