/*
 * Utility routines.
 *
 * Copyright (C) 2025 by Denys Vlasenko
 *
 * Licensed under GPLv2, see file LICENSE in this source tree.
 */
//kbuild:lib-y += bitops.o

#include "libbb.h"

void FAST_FUNC xorbuf_3(void *dst, const void *src1, const void *src2, unsigned count)
{
	uint8_t *d = dst;
	const uint8_t *s1 = src1;
	const uint8_t *s2 = src2;
#if BB_UNALIGNED_MEMACCESS_OK
	while (count >= sizeof(long)) {
		*(long*)d = *(long*)s1 ^ *(long*)s2;
		count -= sizeof(long);
		d += sizeof(long);
		s1 += sizeof(long);
		s2 += sizeof(long);
	}
#endif
	while (count--)
		*d++ = *s1++ ^ *s2++;
}

void FAST_FUNC xorbuf(void *dst, const void *src, unsigned count)
{
	xorbuf_3(dst, dst, src, count);
}

void FAST_FUNC xorbuf16_aligned_long(void *dst, const void *src)
{
#if defined(__SSE__) /* any x86_64 has it */
	asm volatile(
"\n		movups	(%0),%%xmm0"
"\n		movups	(%1),%%xmm1"	// can't just xorps (%1),%%xmm0:
"\n		xorps	%%xmm1,%%xmm0"	// SSE requires 16-byte alignment
"\n		movups	%%xmm0,(%0)"
"\n"
		: "=r" (dst), "=r" (src)
		: "0" (dst), "1" (src)
		: "xmm0", "xmm1", "memory"
	);
#else
	unsigned long *d = dst;
	const unsigned long *s = src;
	d[0] ^= s[0];
# if LONG_MAX <= 0x7fffffffffffffff
	d[1] ^= s[1];
#  if LONG_MAX == 0x7fffffff
	d[2] ^= s[2];
	d[3] ^= s[3];
#  endif
# endif
#endif
}
// The above can be inlined in libbb.h, in a way where compiler
// is even free to use better addressing modes than (%reg), and
// to keep the result in a register
// (to not store it to memory after each XOR):
//#if defined(__SSE__)
//#include <xmmintrin.h>
//^^^ or just: typedef float __m128_u attribute((__vector_size__(16),__may_alias__,__aligned__(1)));
//static ALWAYS_INLINE void xorbuf16_aligned_long(void *dst, const void *src)
//{
//	__m128_u xmm0, xmm1;
//	asm volatile(
//"\n		xorps	%1,%0"
//		: "=x" (xmm0), "=x" (xmm1)
//		: "0" (*(__m128_u*)dst), "1" (*(__m128_u*)src)
//	);
//	*(__m128_u*)dst = xmm0; // this store may be optimized out!
//}
//#endif
// but I don't trust gcc optimizer enough to not generate some monstrosity.
// See GMULT() function in TLS code as an example.

void FAST_FUNC xorbuf64_3_aligned64(void *dst, const void *src1, const void *src2)
{
#if defined(__SSE__) /* any x86_64 has it */
	asm volatile(
"\n		movups	0*16(%1),%%xmm0"
"\n		movups	0*16(%2),%%xmm1"	// can't just xorps 0*16(%2),%%xmm0:
"\n		xorps	%%xmm1,%%xmm0"		// SSE requires 16-byte alignment, we have only 8-byte
"\n		movups	%%xmm0,0*16(%0)"
"\n		movups	1*16(%1),%%xmm0"
"\n		movups	1*16(%2),%%xmm1"
"\n		xorps	%%xmm1,%%xmm0"
"\n		movups	%%xmm0,1*16(%0)"
"\n		movups	2*16(%1),%%xmm0"
"\n		movups	2*16(%2),%%xmm1"
"\n		xorps	%%xmm1,%%xmm0"
"\n		movups	%%xmm0,2*16(%0)"
"\n		movups	3*16(%1),%%xmm0"
"\n		movups	3*16(%2),%%xmm1"
"\n		xorps	%%xmm1,%%xmm0"
"\n		movups	%%xmm0,3*16(%0)"
"\n"
		: "=r" (dst), "=r" (src1), "=r" (src2)
		: "0" (dst), "1" (src1), "2" (src2)
		: "xmm0", "xmm1", "memory"
	);
#else
	long *d = dst;
	const long *s1 = src1;
	const long *s2 = src2;
	unsigned count = 64 / sizeof(long);
	do {
		*d++ = *s1++ ^ *s2++;
	} while (--count != 0);
#endif
}

#if !BB_UNALIGNED_MEMACCESS_OK
void FAST_FUNC xorbuf16(void *dst, const void *src)
{
#define p_aligned(a) (((uintptr_t)(a) & (sizeof(long)-1)) == 0)
	if (p_aligned(src) && p_aligned(dst)) {
		xorbuf16_aligned_long(dst, src);
		return;
	}
	xorbuf_3(dst, dst, src, 16);
}
#endif
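
// Usage sketch (illustrative only; example_xor16 is hypothetical and not part
// of busybox): a caller holding a 16-byte block and a 16-byte mask can take
// the long-aligned fast path when both pointers allow it, and fall back to
// the generic byte loop otherwise. Kept commented out, like the inlining
// sketch above.
//static void example_xor16(uint8_t block[16], const uint8_t mask[16])
//{
//	if ((((uintptr_t)block | (uintptr_t)mask) & (sizeof(long)-1)) == 0)
//		xorbuf16_aligned_long(block, mask);	/* both pointers long-aligned */
//	else
//		xorbuf(block, mask, 16);		/* any alignment */
//}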