Diffstat
-rw-r--r-- | libbb/bitops.c | 128
1 file changed, 128 insertions, 0 deletions
diff --git a/libbb/bitops.c b/libbb/bitops.c
new file mode 100644
index 000000000..467e1a2d9
--- /dev/null
+++ b/libbb/bitops.c
@@ -0,0 +1,128 @@
/*
 * Utility routines.
 *
 * Copyright (C) 2025 by Denys Vlasenko <vda.linux@googlemail.com>
 *
 * Licensed under GPLv2, see file LICENSE in this source tree.
 */
//kbuild:lib-y += bitops.o

#include "libbb.h"
/* dst[i] = src1[i] ^ src2[i] for count bytes */
void FAST_FUNC xorbuf_3(void *dst, const void *src1, const void *src2, unsigned count)
{
	uint8_t *d = dst;
	const uint8_t *s1 = src1;
	const uint8_t *s2 = src2;
#if BB_UNALIGNED_MEMACCESS_OK
	/* XOR a long's worth of bytes at a time */
	while (count >= sizeof(long)) {
		*(long*)d = *(long*)s1 ^ *(long*)s2;
		count -= sizeof(long);
		d += sizeof(long);
		s1 += sizeof(long);
		s2 += sizeof(long);
	}
#endif
	/* the tail (or the whole buffer, if unaligned accesses are not ok) */
	while (count--)
		*d++ = *s1++ ^ *s2++;
}

/* dst[i] ^= src[i] for count bytes */
void FAST_FUNC xorbuf(void *dst, const void *src, unsigned count)
{
	xorbuf_3(dst, dst, src, count);
}

/* XOR 16 bytes: dst[0..15] ^= src[0..15], both pointers long-aligned */
void FAST_FUNC xorbuf16_aligned_long(void *dst, const void *src)
{
#if defined(__SSE__) /* any x86_64 has it */
	asm volatile(
"\n	movups	(%0),%%xmm0"
"\n	movups	(%1),%%xmm1"	// can't just "xorps (%1),%%xmm0":
"\n	xorps	%%xmm1,%%xmm0"	// SSE requires 16-byte alignment
"\n	movups	%%xmm0,(%0)"
"\n"
	: "=r" (dst), "=r" (src)
	: "0" (dst), "1" (src)
	: "xmm0", "xmm1", "memory"
	);
#else
	unsigned long *d = dst;
	const unsigned long *s = src;
	d[0] ^= s[0];
# if LONG_MAX <= 0x7fffffffffffffff /* long is 64-bit or smaller: need d[1] too */
	d[1] ^= s[1];
#  if LONG_MAX == 0x7fffffff /* long is 32-bit: need d[2] and d[3] as well */
	d[2] ^= s[2];
	d[3] ^= s[3];
#  endif
# endif
#endif
}
// The above can be inlined in libbb.h, in a way where the compiler
// is even free to use better addressing modes than (%reg), and
// to keep the result in a register
// (to not store it to memory after each XOR):
//#if defined(__SSE__)
//#include <xmmintrin.h>
//^^^ or just: typedef float __m128_u __attribute__((__vector_size__(16),__may_alias__,__aligned__(1)));
//static ALWAYS_INLINE void xorbuf16_aligned_long(void *dst, const void *src)
//{
//	__m128_u xmm0, xmm1;
//	asm volatile(
//"\n	xorps	%1,%0"
//	: "=x" (xmm0), "=x" (xmm1)
//	: "0" (*(__m128_u*)dst), "1" (*(__m128_u*)src)
//	);
//	*(__m128_u*)dst = xmm0; // this store may be optimized out!
//}
//#endif
// but I don't trust the gcc optimizer enough to not generate some monstrosity.
// See the GMULT() function in the TLS code as an example.

/* XOR 64 bytes: dst[i] = src1[i] ^ src2[i]; all pointers 8-byte aligned */
void FAST_FUNC xorbuf64_3_aligned64(void *dst, const void *src1, const void *src2)
{
#if defined(__SSE__) /* any x86_64 has it */
	asm volatile(
"\n	movups	0*16(%1),%%xmm0"
"\n	movups	0*16(%2),%%xmm1"	// can't just "xorps 0*16(%2),%%xmm0":
"\n	xorps	%%xmm1,%%xmm0"	// SSE requires 16-byte alignment, we have only 8-byte
"\n	movups	%%xmm0,0*16(%0)"
"\n	movups	1*16(%1),%%xmm0"
"\n	movups	1*16(%2),%%xmm1"
"\n	xorps	%%xmm1,%%xmm0"
"\n	movups	%%xmm0,1*16(%0)"
"\n	movups	2*16(%1),%%xmm0"
"\n	movups	2*16(%2),%%xmm1"
"\n	xorps	%%xmm1,%%xmm0"
"\n	movups	%%xmm0,2*16(%0)"
"\n	movups	3*16(%1),%%xmm0"
"\n	movups	3*16(%2),%%xmm1"
"\n	xorps	%%xmm1,%%xmm0"
"\n	movups	%%xmm0,3*16(%0)"
"\n"
	: "=r" (dst), "=r" (src1), "=r" (src2)
	: "0" (dst), "1" (src1), "2" (src2)
	: "xmm0", "xmm1", "memory"
	);
#else
	long *d = dst;
	const long *s1 = src1;
	const long *s2 = src2;
	unsigned count = 64 / sizeof(long);
	do {
		*d++ = *s1++ ^ *s2++;
	} while (--count != 0);
#endif
}

#if !BB_UNALIGNED_MEMACCESS_OK
/* XOR 16 bytes at arbitrarily aligned pointers */
void FAST_FUNC xorbuf16(void *dst, const void *src)
{
#define p_aligned(a) (((uintptr_t)(a) & (sizeof(long)-1)) == 0)
	if (p_aligned(src) && p_aligned(dst)) {
		xorbuf16_aligned_long(dst, src);
		return;
	}
	xorbuf_3(dst, dst, src, 16);
}
#endif
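
For context (not part of the commit): helpers like these are typically used to XOR a cipher keystream into a data block, as stream- and CTR-mode code does. Below is a minimal standalone sketch of that calling pattern; the byte-wise xorbuf() stub stands in for the libbb version so the snippet compiles on its own, and the keystream bytes are made up for illustration.

#include <stdio.h>
#include <stdint.h>

/* stand-in for libbb's xorbuf(); the real one lives in libbb/bitops.c */
static void xorbuf(void *dst, const void *src, unsigned count)
{
	uint8_t *d = dst;
	const uint8_t *s = src;
	while (count--)
		*d++ ^= *s++;
}

int main(void)
{
	uint8_t block[16] = "attack at dawn!";	/* 15 chars + NUL = one 16-byte block */
	uint8_t keystream[16] = {		/* made-up keystream bytes */
		0x1f,0x2e,0x3d,0x4c,0x5b,0x6a,0x79,0x88,
		0x97,0xa6,0xb5,0xc4,0xd3,0xe2,0xf1,0x0f,
	};

	xorbuf(block, keystream, 16);	/* "encrypt": block ^= keystream */
	xorbuf(block, keystream, 16);	/* "decrypt": XOR is its own inverse */
	printf("%s\n", block);		/* prints the original plaintext */
	return 0;
}

Applying the same keystream twice restores the plaintext, which is why stream-cipher encryption and decryption are the same operation, and why a single xorbuf() serves both directions.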