From f8e9bd30d73f2acf6818da71a2ba44748151b716 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 7 Jul 2025 22:34:31 +0200 Subject: libbb/yescrypt: disable unrolling in two places Also, make many define macros safer function old new delta blockmix 2300 814 -1486 blockmix_xor 4606 1543 -3063 blockmix_xor_save 4737 1620 -3117 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-7666) Total: -7666 bytes Signed-off-by: Denys Vlasenko --- libbb/yescrypt/alg-yescrypt-kdf.c | 255 ++++++++++++++++++++++++-------------- 1 file changed, 159 insertions(+), 96 deletions(-) (limited to 'libbb') diff --git a/libbb/yescrypt/alg-yescrypt-kdf.c b/libbb/yescrypt/alg-yescrypt-kdf.c index d24b05150..ab095eae1 100644 --- a/libbb/yescrypt/alg-yescrypt-kdf.c +++ b/libbb/yescrypt/alg-yescrypt-kdf.c @@ -42,6 +42,15 @@ #define unlikely(exp) (exp) #endif +// Not a size win if 0 +#define UNROLL_COPY 1 + +// -5324 bytes if 0: +#define UNROLL_PWXFORM_ROUND 0 +// -4864 bytes if 0: +#define UNROLL_PWXFORM 0 +// both 0: -7666 bytes + typedef union { uint32_t w[16]; uint64_t d[8]; @@ -52,15 +61,17 @@ static void salsa20_simd_shuffle( salsa20_blk_t *Bout) { #define COMBINE(out, in1, in2) \ - Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); - COMBINE(0, 0, 2) - COMBINE(1, 5, 7) - COMBINE(2, 2, 4) - COMBINE(3, 7, 1) - COMBINE(4, 4, 6) - COMBINE(5, 1, 3) - COMBINE(6, 6, 0) - COMBINE(7, 3, 5) +do { \ + Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); \ +} while (0) + COMBINE(0, 0, 2); + COMBINE(1, 5, 7); + COMBINE(2, 2, 4); + COMBINE(3, 7, 1); + COMBINE(4, 4, 6); + COMBINE(5, 1, 3); + COMBINE(6, 6, 0); + COMBINE(7, 3, 5); #undef COMBINE } @@ -69,25 +80,29 @@ static void salsa20_simd_unshuffle( salsa20_blk_t *Bout) { #define UNCOMBINE(out, in1, in2) \ +do { \ Bout->w[out * 2] = Bin->d[in1]; \ - Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; - UNCOMBINE(0, 0, 6) - UNCOMBINE(1, 5, 3) - UNCOMBINE(2, 2, 0) - UNCOMBINE(3, 7, 5) - UNCOMBINE(4, 4, 2) - UNCOMBINE(5, 1, 7) - UNCOMBINE(6, 6, 4) - UNCOMBINE(7, 3, 1) + Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; \ +} while (0) + UNCOMBINE(0, 0, 6); + UNCOMBINE(1, 5, 3); + UNCOMBINE(2, 2, 0); + UNCOMBINE(3, 7, 5); + UNCOMBINE(4, 4, 2); + UNCOMBINE(5, 1, 7); + UNCOMBINE(6, 6, 4); + UNCOMBINE(7, 3, 1); #undef UNCOMBINE } #define DECL_X \ - salsa20_blk_t X; + salsa20_blk_t X #define DECL_Y \ - salsa20_blk_t Y; + salsa20_blk_t Y +#if UNROLL_COPY #define COPY(out, in) \ +do { \ (out).d[0] = (in).d[0]; \ (out).d[1] = (in).d[1]; \ (out).d[2] = (in).d[2]; \ @@ -95,9 +110,17 @@ static void salsa20_simd_unshuffle( (out).d[4] = (in).d[4]; \ (out).d[5] = (in).d[5]; \ (out).d[6] = (in).d[6]; \ - (out).d[7] = (in).d[7]; + (out).d[7] = (in).d[7]; \ +} while (0) +#else +#define COPY(out, in) \ +do { \ + for (int copyi=0; copyi<8; copyi++) \ + (out).d[copyi] = (in).d[copyi]; \ +} while (0) +#endif -#define READ_X(in) COPY(X, in) +#define READ_X(in) COPY(X, in) #define WRITE_X(out) COPY(out, X) /** @@ -154,7 +177,6 @@ static void salsa20(salsa20_blk_t *restrict B, B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3]; } } - #if 0 /* Too expensive */ explicit_bzero(&X, sizeof(X)); @@ -165,9 +187,10 @@ static void salsa20(salsa20_blk_t *restrict B, * Apply the Salsa20/2 core to the block provided in X. */ #define SALSA20_2(out) \ - salsa20(&X, &out, 1); + salsa20(&X, &out, 1) #define XOR(out, in1, in2) \ +do { \ (out).d[0] = (in1).d[0] ^ (in2).d[0]; \ (out).d[1] = (in1).d[1] ^ (in2).d[1]; \ (out).d[2] = (in1).d[2] ^ (in2).d[2]; \ @@ -175,23 +198,28 @@ static void salsa20(salsa20_blk_t *restrict B, (out).d[4] = (in1).d[4] ^ (in2).d[4]; \ (out).d[5] = (in1).d[5] ^ (in2).d[5]; \ (out).d[6] = (in1).d[6] ^ (in2).d[6]; \ - (out).d[7] = (in1).d[7] ^ (in2).d[7]; + (out).d[7] = (in1).d[7] ^ (in2).d[7]; \ +} while (0) -#define XOR_X(in) XOR(X, X, in) +#define XOR_X(in) XOR(X, X, in) #define XOR_X_2(in1, in2) XOR(X, in1, in2) #define XOR_X_WRITE_XOR_Y_2(out, in) \ - XOR(Y, out, in) \ - COPY(out, Y) \ - XOR(X, X, Y) +do { \ + XOR(Y, out, in); \ + COPY(out, Y); \ + XOR(X, X, Y); \ +} while (0) /** * Apply the Salsa20/8 core to the block provided in X ^ in. */ #define SALSA20_8_XOR_MEM(in, out) \ +do { \ XOR_X(in); \ - salsa20(&X, &out, 4); + salsa20(&X, &out, 4); \ +} while (0) -#define INTEGERIFY (uint32_t)X.d[0] +#define INTEGERIFY ((uint32_t)X.d[0]) /** * blockmix_salsa8(Bin, Bout, r): @@ -204,12 +232,12 @@ static void blockmix_salsa8( size_t r) { size_t i; - DECL_X + DECL_X; - READ_X(Bin[r * 2 - 1]) + READ_X(Bin[r * 2 - 1]); for (i = 0; i < r; i++) { - SALSA20_8_XOR_MEM(Bin[i * 2], Bout[i]) - SALSA20_8_XOR_MEM(Bin[i * 2 + 1], Bout[r + i]) + SALSA20_8_XOR_MEM(Bin[i * 2], Bout[i]); + SALSA20_8_XOR_MEM(Bin[i * 2 + 1], Bout[r + i]); } } @@ -220,14 +248,14 @@ static uint32_t blockmix_salsa8_xor( size_t r) { size_t i; - DECL_X + DECL_X; - XOR_X_2(Bin1[r * 2 - 1], Bin2[r * 2 - 1]) + XOR_X_2(Bin1[r * 2 - 1], Bin2[r * 2 - 1]); for (i = 0; i < r; i++) { - XOR_X(Bin1[i * 2]) - SALSA20_8_XOR_MEM(Bin2[i * 2], Bout[i]) - XOR_X(Bin1[i * 2 + 1]) - SALSA20_8_XOR_MEM(Bin2[i * 2 + 1], Bout[r + i]) + XOR_X(Bin1[i * 2]); + SALSA20_8_XOR_MEM(Bin2[i * 2], Bout[i]); + XOR_X(Bin1[i * 2 + 1]); + SALSA20_8_XOR_MEM(Bin2[i * 2 + 1], Bout[r + i]); } return INTEGERIFY; @@ -242,27 +270,38 @@ static uint32_t blockmix_salsa8_xor( /* Derived values. Not tunable except via Swidth above. */ #define PWXbytes (PWXgather * PWXsimple * 8) -#define Sbytes (3 * (1 << Swidth) * PWXsimple * 8) -#define Smask (((1 << Swidth) - 1) * PWXsimple * 8) -#define Smask2 (((uint64_t)Smask << 32) | Smask) +#define Sbytes (3 * (1 << Swidth) * PWXsimple * 8) +#define Smask (((1 << Swidth) - 1) * PWXsimple * 8) +#define Smask2 (((uint64_t)Smask << 32) | Smask) -#define DECL_SMASK2REG /* empty */ -#define FORCE_REGALLOC_3 /* empty */ -#define MAYBE_MEMORY_BARRIER /* empty */ +#define DECL_SMASK2REG do {} while (0) +#define FORCE_REGALLOC_3 do {} while (0) +#define MAYBE_MEMORY_BARRIER do {} while (0) -#define PWXFORM_SIMD(x0, x1) { \ +#define PWXFORM_SIMD(x0, x1) \ +do { \ uint64_t x = x0 & Smask2; \ uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \ uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \ x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \ x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \ -} +} while (0) +#if UNROLL_PWXFORM_ROUND +#define PWXFORM_ROUND \ +do { \ + PWXFORM_SIMD(X.d[0], X.d[1]); \ + PWXFORM_SIMD(X.d[2], X.d[3]); \ + PWXFORM_SIMD(X.d[4], X.d[5]); \ + PWXFORM_SIMD(X.d[6], X.d[7]); \ +} while (0) +#else #define PWXFORM_ROUND \ - PWXFORM_SIMD(X.d[0], X.d[1]) \ - PWXFORM_SIMD(X.d[2], X.d[3]) \ - PWXFORM_SIMD(X.d[4], X.d[5]) \ - PWXFORM_SIMD(X.d[6], X.d[7]) +do { \ + for (int pwxi=0; pwxi<8; pwxi+=2) \ + PWXFORM_SIMD(X.d[pwxi], X.d[pwxi + 1]); \ +} while (0) +#endif /* * This offset helps address the 256-byte write block via the single-byte @@ -275,19 +314,23 @@ static uint32_t blockmix_salsa8_xor( #define PWXFORM_WRITE_OFFSET 0x7c #define PWXFORM_WRITE \ - WRITE_X(*(salsa20_blk_t *)(Sw - PWXFORM_WRITE_OFFSET)) \ - Sw += 64; - -#define PWXFORM { \ +do { \ + WRITE_X(*(salsa20_blk_t *)(Sw - PWXFORM_WRITE_OFFSET)); \ + Sw += 64; \ +} while (0) + +#if UNROLL_PWXFORM +#define PWXFORM \ +do { \ uint8_t *Sw = S2 + w + PWXFORM_WRITE_OFFSET; \ - FORCE_REGALLOC_3 \ - MAYBE_MEMORY_BARRIER \ - PWXFORM_ROUND \ - PWXFORM_ROUND PWXFORM_WRITE \ - PWXFORM_ROUND PWXFORM_WRITE \ - PWXFORM_ROUND PWXFORM_WRITE \ - PWXFORM_ROUND PWXFORM_WRITE \ - PWXFORM_ROUND \ + FORCE_REGALLOC_3; \ + MAYBE_MEMORY_BARRIER; \ + PWXFORM_ROUND; \ + PWXFORM_ROUND; PWXFORM_WRITE; \ + PWXFORM_ROUND; PWXFORM_WRITE; \ + PWXFORM_ROUND; PWXFORM_WRITE; \ + PWXFORM_ROUND; PWXFORM_WRITE; \ + PWXFORM_ROUND; \ w = (w + 64 * 4) & Smask2; \ { \ uint8_t *Stmp = S2; \ @@ -295,7 +338,27 @@ static uint32_t blockmix_salsa8_xor( S1 = S0; \ S0 = Stmp; \ } \ -} +} while (0) +#else +#define PWXFORM \ +do { \ + uint8_t *Sw = S2 + w + PWXFORM_WRITE_OFFSET; \ + FORCE_REGALLOC_3; \ + MAYBE_MEMORY_BARRIER; \ + PWXFORM_ROUND; \ + for (int pwxj=0; pwxj<4; pwxj++) {\ + PWXFORM_ROUND; PWXFORM_WRITE; \ + } \ + PWXFORM_ROUND; \ + w = (w + 64 * 4) & Smask2; \ + { \ + uint8_t *Stmp = S2; \ + S2 = S1; \ + S1 = S0; \ + S0 = Stmp; \ + } \ +} while (0) +#endif typedef struct { uint8_t *S0, *S1, *S2; @@ -318,29 +381,29 @@ static void blockmix( uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; size_t w = ctx->w; size_t i; - DECL_X + DECL_X; /* Convert count of 128-byte blocks to max index of 64-byte block */ r = r * 2 - 1; - READ_X(Bin[r]) + READ_X(Bin[r]); - DECL_SMASK2REG + DECL_SMASK2REG; i = 0; do { - XOR_X(Bin[i]) - PWXFORM + XOR_X(Bin[i]); + PWXFORM; if (unlikely(i >= r)) break; - WRITE_X(Bout[i]) + WRITE_X(Bout[i]); i++; } while (1); ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; ctx->w = w; - SALSA20_2(Bout[i]) + SALSA20_2(Bout[i]); } static uint32_t blockmix_xor(const salsa20_blk_t *Bin1, @@ -352,31 +415,31 @@ static uint32_t blockmix_xor(const salsa20_blk_t *Bin1, uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; size_t w = ctx->w; size_t i; - DECL_X + DECL_X; /* Convert count of 128-byte blocks to max index of 64-byte block */ r = r * 2 - 1; - XOR_X_2(Bin1[r], Bin2[r]) + XOR_X_2(Bin1[r], Bin2[r]); - DECL_SMASK2REG + DECL_SMASK2REG; i = 0; r--; do { - XOR_X(Bin1[i]) - XOR_X(Bin2[i]) - PWXFORM - WRITE_X(Bout[i]) + XOR_X(Bin1[i]); + XOR_X(Bin2[i]); + PWXFORM; + WRITE_X(Bout[i]); - XOR_X(Bin1[i + 1]) - XOR_X(Bin2[i + 1]) - PWXFORM + XOR_X(Bin1[i + 1]); + XOR_X(Bin2[i + 1]); + PWXFORM; if (unlikely(i >= r)) break; - WRITE_X(Bout[i + 1]) + WRITE_X(Bout[i + 1]); i += 2; } while (1); @@ -385,7 +448,7 @@ static uint32_t blockmix_xor(const salsa20_blk_t *Bin1, ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; ctx->w = w; - SALSA20_2(Bout[i]) + SALSA20_2(Bout[i]); return INTEGERIFY; } @@ -399,30 +462,30 @@ static uint32_t blockmix_xor_save( uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; size_t w = ctx->w; size_t i; - DECL_X - DECL_Y + DECL_X; + DECL_Y; /* Convert count of 128-byte blocks to max index of 64-byte block */ r = r * 2 - 1; - XOR_X_2(Bin1out[r], Bin2[r]) + XOR_X_2(Bin1out[r], Bin2[r]); - DECL_SMASK2REG + DECL_SMASK2REG; i = 0; r--; do { - XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i]) - PWXFORM - WRITE_X(Bin1out[i]) + XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i]); + PWXFORM; + WRITE_X(Bin1out[i]); - XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1]) - PWXFORM + XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1]); + PWXFORM; if (unlikely(i >= r)) break; - WRITE_X(Bin1out[i + 1]) + WRITE_X(Bin1out[i + 1]); i += 2; } while (1); @@ -431,7 +494,7 @@ static uint32_t blockmix_xor_save( ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; ctx->w = w; - SALSA20_2(Bin1out[i]) + SALSA20_2(Bin1out[i]); return INTEGERIFY; } -- cgit v1.2.3-55-g6feb