diff options
| author | Denys Vlasenko <vda.linux@googlemail.com> | 2025-07-07 22:34:31 +0200 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2025-07-07 22:34:31 +0200 |
| commit | f8e9bd30d73f2acf6818da71a2ba44748151b716 (patch) | |
| tree | cc0855615b0e876253445c900e2d96fe04bf3d50 | |
| parent | e5d3a87633eac2a8a17d909b98a1e6dd21f80489 (diff) | |
| download | busybox-w32-f8e9bd30d73f2acf6818da71a2ba44748151b716.tar.gz busybox-w32-f8e9bd30d73f2acf6818da71a2ba44748151b716.tar.bz2 busybox-w32-f8e9bd30d73f2acf6818da71a2ba44748151b716.zip | |
libbb/yescrypt: disable unrolling in two places
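Also, make many #define macros safer: wrap multi-statement macro bodies in do { ... } while (0), require a trailing semicolon at each use, and parenthesize expression macros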
function old new delta
blockmix 2300 814 -1486
blockmix_xor 4606 1543 -3063
blockmix_xor_save 4737 1620 -3117
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-7666) Total: -7666 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
| -rw-r--r-- | libbb/yescrypt/alg-yescrypt-kdf.c | 255 |
1 file changed, 159 insertions, 96 deletions
diff --git a/libbb/yescrypt/alg-yescrypt-kdf.c b/libbb/yescrypt/alg-yescrypt-kdf.c index d24b05150..ab095eae1 100644 --- a/libbb/yescrypt/alg-yescrypt-kdf.c +++ b/libbb/yescrypt/alg-yescrypt-kdf.c | |||
| @@ -42,6 +42,15 @@ | |||
| 42 | #define unlikely(exp) (exp) | 42 | #define unlikely(exp) (exp) |
| 43 | #endif | 43 | #endif |
| 44 | 44 | ||
| 45 | // Not a size win if 0 | ||
| 46 | #define UNROLL_COPY 1 | ||
| 47 | |||
| 48 | // -5324 bytes if 0: | ||
| 49 | #define UNROLL_PWXFORM_ROUND 0 | ||
| 50 | // -4864 bytes if 0: | ||
| 51 | #define UNROLL_PWXFORM 0 | ||
| 52 | // both 0: -7666 bytes | ||
| 53 | |||
| 45 | typedef union { | 54 | typedef union { |
| 46 | uint32_t w[16]; | 55 | uint32_t w[16]; |
| 47 | uint64_t d[8]; | 56 | uint64_t d[8]; |
| @@ -52,15 +61,17 @@ static void salsa20_simd_shuffle( | |||
| 52 | salsa20_blk_t *Bout) | 61 | salsa20_blk_t *Bout) |
| 53 | { | 62 | { |
| 54 | #define COMBINE(out, in1, in2) \ | 63 | #define COMBINE(out, in1, in2) \ |
| 55 | Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); | 64 | do { \ |
| 56 | COMBINE(0, 0, 2) | 65 | Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); \ |
| 57 | COMBINE(1, 5, 7) | 66 | } while (0) |
| 58 | COMBINE(2, 2, 4) | 67 | COMBINE(0, 0, 2); |
| 59 | COMBINE(3, 7, 1) | 68 | COMBINE(1, 5, 7); |
| 60 | COMBINE(4, 4, 6) | 69 | COMBINE(2, 2, 4); |
| 61 | COMBINE(5, 1, 3) | 70 | COMBINE(3, 7, 1); |
| 62 | COMBINE(6, 6, 0) | 71 | COMBINE(4, 4, 6); |
| 63 | COMBINE(7, 3, 5) | 72 | COMBINE(5, 1, 3); |
| 73 | COMBINE(6, 6, 0); | ||
| 74 | COMBINE(7, 3, 5); | ||
| 64 | #undef COMBINE | 75 | #undef COMBINE |
| 65 | } | 76 | } |
| 66 | 77 | ||
| @@ -69,25 +80,29 @@ static void salsa20_simd_unshuffle( | |||
| 69 | salsa20_blk_t *Bout) | 80 | salsa20_blk_t *Bout) |
| 70 | { | 81 | { |
| 71 | #define UNCOMBINE(out, in1, in2) \ | 82 | #define UNCOMBINE(out, in1, in2) \ |
| 83 | do { \ | ||
| 72 | Bout->w[out * 2] = Bin->d[in1]; \ | 84 | Bout->w[out * 2] = Bin->d[in1]; \ |
| 73 | Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; | 85 | Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; \ |
| 74 | UNCOMBINE(0, 0, 6) | 86 | } while (0) |
| 75 | UNCOMBINE(1, 5, 3) | 87 | UNCOMBINE(0, 0, 6); |
| 76 | UNCOMBINE(2, 2, 0) | 88 | UNCOMBINE(1, 5, 3); |
| 77 | UNCOMBINE(3, 7, 5) | 89 | UNCOMBINE(2, 2, 0); |
| 78 | UNCOMBINE(4, 4, 2) | 90 | UNCOMBINE(3, 7, 5); |
| 79 | UNCOMBINE(5, 1, 7) | 91 | UNCOMBINE(4, 4, 2); |
| 80 | UNCOMBINE(6, 6, 4) | 92 | UNCOMBINE(5, 1, 7); |
| 81 | UNCOMBINE(7, 3, 1) | 93 | UNCOMBINE(6, 6, 4); |
| 94 | UNCOMBINE(7, 3, 1); | ||
| 82 | #undef UNCOMBINE | 95 | #undef UNCOMBINE |
| 83 | } | 96 | } |
| 84 | 97 | ||
| 85 | #define DECL_X \ | 98 | #define DECL_X \ |
| 86 | salsa20_blk_t X; | 99 | salsa20_blk_t X |
| 87 | #define DECL_Y \ | 100 | #define DECL_Y \ |
| 88 | salsa20_blk_t Y; | 101 | salsa20_blk_t Y |
| 89 | 102 | ||
| 103 | #if UNROLL_COPY | ||
| 90 | #define COPY(out, in) \ | 104 | #define COPY(out, in) \ |
| 105 | do { \ | ||
| 91 | (out).d[0] = (in).d[0]; \ | 106 | (out).d[0] = (in).d[0]; \ |
| 92 | (out).d[1] = (in).d[1]; \ | 107 | (out).d[1] = (in).d[1]; \ |
| 93 | (out).d[2] = (in).d[2]; \ | 108 | (out).d[2] = (in).d[2]; \ |
| @@ -95,9 +110,17 @@ static void salsa20_simd_unshuffle( | |||
| 95 | (out).d[4] = (in).d[4]; \ | 110 | (out).d[4] = (in).d[4]; \ |
| 96 | (out).d[5] = (in).d[5]; \ | 111 | (out).d[5] = (in).d[5]; \ |
| 97 | (out).d[6] = (in).d[6]; \ | 112 | (out).d[6] = (in).d[6]; \ |
| 98 | (out).d[7] = (in).d[7]; | 113 | (out).d[7] = (in).d[7]; \ |
| 114 | } while (0) | ||
| 115 | #else | ||
| 116 | #define COPY(out, in) \ | ||
| 117 | do { \ | ||
| 118 | for (int copyi=0; copyi<8; copyi++) \ | ||
| 119 | (out).d[copyi] = (in).d[copyi]; \ | ||
| 120 | } while (0) | ||
| 121 | #endif | ||
| 99 | 122 | ||
| 100 | #define READ_X(in) COPY(X, in) | 123 | #define READ_X(in) COPY(X, in) |
| 101 | #define WRITE_X(out) COPY(out, X) | 124 | #define WRITE_X(out) COPY(out, X) |
| 102 | 125 | ||
| 103 | /** | 126 | /** |
| @@ -154,7 +177,6 @@ static void salsa20(salsa20_blk_t *restrict B, | |||
| 154 | B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3]; | 177 | B->w[i + 3] = Bout->w[i + 3] += B->w[i + 3]; |
| 155 | } | 178 | } |
| 156 | } | 179 | } |
| 157 | |||
| 158 | #if 0 | 180 | #if 0 |
| 159 | /* Too expensive */ | 181 | /* Too expensive */ |
| 160 | explicit_bzero(&X, sizeof(X)); | 182 | explicit_bzero(&X, sizeof(X)); |
| @@ -165,9 +187,10 @@ static void salsa20(salsa20_blk_t *restrict B, | |||
| 165 | * Apply the Salsa20/2 core to the block provided in X. | 187 | * Apply the Salsa20/2 core to the block provided in X. |
| 166 | */ | 188 | */ |
| 167 | #define SALSA20_2(out) \ | 189 | #define SALSA20_2(out) \ |
| 168 | salsa20(&X, &out, 1); | 190 | salsa20(&X, &out, 1) |
| 169 | 191 | ||
| 170 | #define XOR(out, in1, in2) \ | 192 | #define XOR(out, in1, in2) \ |
| 193 | do { \ | ||
| 171 | (out).d[0] = (in1).d[0] ^ (in2).d[0]; \ | 194 | (out).d[0] = (in1).d[0] ^ (in2).d[0]; \ |
| 172 | (out).d[1] = (in1).d[1] ^ (in2).d[1]; \ | 195 | (out).d[1] = (in1).d[1] ^ (in2).d[1]; \ |
| 173 | (out).d[2] = (in1).d[2] ^ (in2).d[2]; \ | 196 | (out).d[2] = (in1).d[2] ^ (in2).d[2]; \ |
| @@ -175,23 +198,28 @@ static void salsa20(salsa20_blk_t *restrict B, | |||
| 175 | (out).d[4] = (in1).d[4] ^ (in2).d[4]; \ | 198 | (out).d[4] = (in1).d[4] ^ (in2).d[4]; \ |
| 176 | (out).d[5] = (in1).d[5] ^ (in2).d[5]; \ | 199 | (out).d[5] = (in1).d[5] ^ (in2).d[5]; \ |
| 177 | (out).d[6] = (in1).d[6] ^ (in2).d[6]; \ | 200 | (out).d[6] = (in1).d[6] ^ (in2).d[6]; \ |
| 178 | (out).d[7] = (in1).d[7] ^ (in2).d[7]; | 201 | (out).d[7] = (in1).d[7] ^ (in2).d[7]; \ |
| 202 | } while (0) | ||
| 179 | 203 | ||
| 180 | #define XOR_X(in) XOR(X, X, in) | 204 | #define XOR_X(in) XOR(X, X, in) |
| 181 | #define XOR_X_2(in1, in2) XOR(X, in1, in2) | 205 | #define XOR_X_2(in1, in2) XOR(X, in1, in2) |
| 182 | #define XOR_X_WRITE_XOR_Y_2(out, in) \ | 206 | #define XOR_X_WRITE_XOR_Y_2(out, in) \ |
| 183 | XOR(Y, out, in) \ | 207 | do { \ |
| 184 | COPY(out, Y) \ | 208 | XOR(Y, out, in); \ |
| 185 | XOR(X, X, Y) | 209 | COPY(out, Y); \ |
| 210 | XOR(X, X, Y); \ | ||
| 211 | } while (0) | ||
| 186 | 212 | ||
| 187 | /** | 213 | /** |
| 188 | * Apply the Salsa20/8 core to the block provided in X ^ in. | 214 | * Apply the Salsa20/8 core to the block provided in X ^ in. |
| 189 | */ | 215 | */ |
| 190 | #define SALSA20_8_XOR_MEM(in, out) \ | 216 | #define SALSA20_8_XOR_MEM(in, out) \ |
| 217 | do { \ | ||
| 191 | XOR_X(in); \ | 218 | XOR_X(in); \ |
| 192 | salsa20(&X, &out, 4); | 219 | salsa20(&X, &out, 4); \ |
| 220 | } while (0) | ||
| 193 | 221 | ||
| 194 | #define INTEGERIFY (uint32_t)X.d[0] | 222 | #define INTEGERIFY ((uint32_t)X.d[0]) |
| 195 | 223 | ||
| 196 | /** | 224 | /** |
| 197 | * blockmix_salsa8(Bin, Bout, r): | 225 | * blockmix_salsa8(Bin, Bout, r): |
| @@ -204,12 +232,12 @@ static void blockmix_salsa8( | |||
| 204 | size_t r) | 232 | size_t r) |
| 205 | { | 233 | { |
| 206 | size_t i; | 234 | size_t i; |
| 207 | DECL_X | 235 | DECL_X; |
| 208 | 236 | ||
| 209 | READ_X(Bin[r * 2 - 1]) | 237 | READ_X(Bin[r * 2 - 1]); |
| 210 | for (i = 0; i < r; i++) { | 238 | for (i = 0; i < r; i++) { |
| 211 | SALSA20_8_XOR_MEM(Bin[i * 2], Bout[i]) | 239 | SALSA20_8_XOR_MEM(Bin[i * 2], Bout[i]); |
| 212 | SALSA20_8_XOR_MEM(Bin[i * 2 + 1], Bout[r + i]) | 240 | SALSA20_8_XOR_MEM(Bin[i * 2 + 1], Bout[r + i]); |
| 213 | } | 241 | } |
| 214 | } | 242 | } |
| 215 | 243 | ||
| @@ -220,14 +248,14 @@ static uint32_t blockmix_salsa8_xor( | |||
| 220 | size_t r) | 248 | size_t r) |
| 221 | { | 249 | { |
| 222 | size_t i; | 250 | size_t i; |
| 223 | DECL_X | 251 | DECL_X; |
| 224 | 252 | ||
| 225 | XOR_X_2(Bin1[r * 2 - 1], Bin2[r * 2 - 1]) | 253 | XOR_X_2(Bin1[r * 2 - 1], Bin2[r * 2 - 1]); |
| 226 | for (i = 0; i < r; i++) { | 254 | for (i = 0; i < r; i++) { |
| 227 | XOR_X(Bin1[i * 2]) | 255 | XOR_X(Bin1[i * 2]); |
| 228 | SALSA20_8_XOR_MEM(Bin2[i * 2], Bout[i]) | 256 | SALSA20_8_XOR_MEM(Bin2[i * 2], Bout[i]); |
| 229 | XOR_X(Bin1[i * 2 + 1]) | 257 | XOR_X(Bin1[i * 2 + 1]); |
| 230 | SALSA20_8_XOR_MEM(Bin2[i * 2 + 1], Bout[r + i]) | 258 | SALSA20_8_XOR_MEM(Bin2[i * 2 + 1], Bout[r + i]); |
| 231 | } | 259 | } |
| 232 | 260 | ||
| 233 | return INTEGERIFY; | 261 | return INTEGERIFY; |
| @@ -242,27 +270,38 @@ static uint32_t blockmix_salsa8_xor( | |||
| 242 | 270 | ||
| 243 | /* Derived values. Not tunable except via Swidth above. */ | 271 | /* Derived values. Not tunable except via Swidth above. */ |
| 244 | #define PWXbytes (PWXgather * PWXsimple * 8) | 272 | #define PWXbytes (PWXgather * PWXsimple * 8) |
| 245 | #define Sbytes (3 * (1 << Swidth) * PWXsimple * 8) | 273 | #define Sbytes (3 * (1 << Swidth) * PWXsimple * 8) |
| 246 | #define Smask (((1 << Swidth) - 1) * PWXsimple * 8) | 274 | #define Smask (((1 << Swidth) - 1) * PWXsimple * 8) |
| 247 | #define Smask2 (((uint64_t)Smask << 32) | Smask) | 275 | #define Smask2 (((uint64_t)Smask << 32) | Smask) |
| 248 | 276 | ||
| 249 | #define DECL_SMASK2REG /* empty */ | 277 | #define DECL_SMASK2REG do {} while (0) |
| 250 | #define FORCE_REGALLOC_3 /* empty */ | 278 | #define FORCE_REGALLOC_3 do {} while (0) |
| 251 | #define MAYBE_MEMORY_BARRIER /* empty */ | 279 | #define MAYBE_MEMORY_BARRIER do {} while (0) |
| 252 | 280 | ||
| 253 | #define PWXFORM_SIMD(x0, x1) { \ | 281 | #define PWXFORM_SIMD(x0, x1) \ |
| 282 | do { \ | ||
| 254 | uint64_t x = x0 & Smask2; \ | 283 | uint64_t x = x0 & Smask2; \ |
| 255 | uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \ | 284 | uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \ |
| 256 | uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \ | 285 | uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \ |
| 257 | x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \ | 286 | x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \ |
| 258 | x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \ | 287 | x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \ |
| 259 | } | 288 | } while (0) |
| 260 | 289 | ||
| 290 | #if UNROLL_PWXFORM_ROUND | ||
| 291 | #define PWXFORM_ROUND \ | ||
| 292 | do { \ | ||
| 293 | PWXFORM_SIMD(X.d[0], X.d[1]); \ | ||
| 294 | PWXFORM_SIMD(X.d[2], X.d[3]); \ | ||
| 295 | PWXFORM_SIMD(X.d[4], X.d[5]); \ | ||
| 296 | PWXFORM_SIMD(X.d[6], X.d[7]); \ | ||
| 297 | } while (0) | ||
| 298 | #else | ||
| 261 | #define PWXFORM_ROUND \ | 299 | #define PWXFORM_ROUND \ |
| 262 | PWXFORM_SIMD(X.d[0], X.d[1]) \ | 300 | do { \ |
| 263 | PWXFORM_SIMD(X.d[2], X.d[3]) \ | 301 | for (int pwxi=0; pwxi<8; pwxi+=2) \ |
| 264 | PWXFORM_SIMD(X.d[4], X.d[5]) \ | 302 | PWXFORM_SIMD(X.d[pwxi], X.d[pwxi + 1]); \ |
| 265 | PWXFORM_SIMD(X.d[6], X.d[7]) | 303 | } while (0) |
| 304 | #endif | ||
| 266 | 305 | ||
| 267 | /* | 306 | /* |
| 268 | * This offset helps address the 256-byte write block via the single-byte | 307 | * This offset helps address the 256-byte write block via the single-byte |
| @@ -275,19 +314,23 @@ static uint32_t blockmix_salsa8_xor( | |||
| 275 | #define PWXFORM_WRITE_OFFSET 0x7c | 314 | #define PWXFORM_WRITE_OFFSET 0x7c |
| 276 | 315 | ||
| 277 | #define PWXFORM_WRITE \ | 316 | #define PWXFORM_WRITE \ |
| 278 | WRITE_X(*(salsa20_blk_t *)(Sw - PWXFORM_WRITE_OFFSET)) \ | 317 | do { \ |
| 279 | Sw += 64; | 318 | WRITE_X(*(salsa20_blk_t *)(Sw - PWXFORM_WRITE_OFFSET)); \ |
| 280 | 319 | Sw += 64; \ | |
| 281 | #define PWXFORM { \ | 320 | } while (0) |
| 321 | |||
| 322 | #if UNROLL_PWXFORM | ||
| 323 | #define PWXFORM \ | ||
| 324 | do { \ | ||
| 282 | uint8_t *Sw = S2 + w + PWXFORM_WRITE_OFFSET; \ | 325 | uint8_t *Sw = S2 + w + PWXFORM_WRITE_OFFSET; \ |
| 283 | FORCE_REGALLOC_3 \ | 326 | FORCE_REGALLOC_3; \ |
| 284 | MAYBE_MEMORY_BARRIER \ | 327 | MAYBE_MEMORY_BARRIER; \ |
| 285 | PWXFORM_ROUND \ | 328 | PWXFORM_ROUND; \ |
| 286 | PWXFORM_ROUND PWXFORM_WRITE \ | 329 | PWXFORM_ROUND; PWXFORM_WRITE; \ |
| 287 | PWXFORM_ROUND PWXFORM_WRITE \ | 330 | PWXFORM_ROUND; PWXFORM_WRITE; \ |
| 288 | PWXFORM_ROUND PWXFORM_WRITE \ | 331 | PWXFORM_ROUND; PWXFORM_WRITE; \ |
| 289 | PWXFORM_ROUND PWXFORM_WRITE \ | 332 | PWXFORM_ROUND; PWXFORM_WRITE; \ |
| 290 | PWXFORM_ROUND \ | 333 | PWXFORM_ROUND; \ |
| 291 | w = (w + 64 * 4) & Smask2; \ | 334 | w = (w + 64 * 4) & Smask2; \ |
| 292 | { \ | 335 | { \ |
| 293 | uint8_t *Stmp = S2; \ | 336 | uint8_t *Stmp = S2; \ |
| @@ -295,7 +338,27 @@ static uint32_t blockmix_salsa8_xor( | |||
| 295 | S1 = S0; \ | 338 | S1 = S0; \ |
| 296 | S0 = Stmp; \ | 339 | S0 = Stmp; \ |
| 297 | } \ | 340 | } \ |
| 298 | } | 341 | } while (0) |
| 342 | #else | ||
| 343 | #define PWXFORM \ | ||
| 344 | do { \ | ||
| 345 | uint8_t *Sw = S2 + w + PWXFORM_WRITE_OFFSET; \ | ||
| 346 | FORCE_REGALLOC_3; \ | ||
| 347 | MAYBE_MEMORY_BARRIER; \ | ||
| 348 | PWXFORM_ROUND; \ | ||
| 349 | for (int pwxj=0; pwxj<4; pwxj++) {\ | ||
| 350 | PWXFORM_ROUND; PWXFORM_WRITE; \ | ||
| 351 | } \ | ||
| 352 | PWXFORM_ROUND; \ | ||
| 353 | w = (w + 64 * 4) & Smask2; \ | ||
| 354 | { \ | ||
| 355 | uint8_t *Stmp = S2; \ | ||
| 356 | S2 = S1; \ | ||
| 357 | S1 = S0; \ | ||
| 358 | S0 = Stmp; \ | ||
| 359 | } \ | ||
| 360 | } while (0) | ||
| 361 | #endif | ||
| 299 | 362 | ||
| 300 | typedef struct { | 363 | typedef struct { |
| 301 | uint8_t *S0, *S1, *S2; | 364 | uint8_t *S0, *S1, *S2; |
| @@ -318,29 +381,29 @@ static void blockmix( | |||
| 318 | uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; | 381 | uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; |
| 319 | size_t w = ctx->w; | 382 | size_t w = ctx->w; |
| 320 | size_t i; | 383 | size_t i; |
| 321 | DECL_X | 384 | DECL_X; |
| 322 | 385 | ||
| 323 | /* Convert count of 128-byte blocks to max index of 64-byte block */ | 386 | /* Convert count of 128-byte blocks to max index of 64-byte block */ |
| 324 | r = r * 2 - 1; | 387 | r = r * 2 - 1; |
| 325 | 388 | ||
| 326 | READ_X(Bin[r]) | 389 | READ_X(Bin[r]); |
| 327 | 390 | ||
| 328 | DECL_SMASK2REG | 391 | DECL_SMASK2REG; |
| 329 | 392 | ||
| 330 | i = 0; | 393 | i = 0; |
| 331 | do { | 394 | do { |
| 332 | XOR_X(Bin[i]) | 395 | XOR_X(Bin[i]); |
| 333 | PWXFORM | 396 | PWXFORM; |
| 334 | if (unlikely(i >= r)) | 397 | if (unlikely(i >= r)) |
| 335 | break; | 398 | break; |
| 336 | WRITE_X(Bout[i]) | 399 | WRITE_X(Bout[i]); |
| 337 | i++; | 400 | i++; |
| 338 | } while (1); | 401 | } while (1); |
| 339 | 402 | ||
| 340 | ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; | 403 | ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; |
| 341 | ctx->w = w; | 404 | ctx->w = w; |
| 342 | 405 | ||
| 343 | SALSA20_2(Bout[i]) | 406 | SALSA20_2(Bout[i]); |
| 344 | } | 407 | } |
| 345 | 408 | ||
| 346 | static uint32_t blockmix_xor(const salsa20_blk_t *Bin1, | 409 | static uint32_t blockmix_xor(const salsa20_blk_t *Bin1, |
| @@ -352,31 +415,31 @@ static uint32_t blockmix_xor(const salsa20_blk_t *Bin1, | |||
| 352 | uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; | 415 | uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; |
| 353 | size_t w = ctx->w; | 416 | size_t w = ctx->w; |
| 354 | size_t i; | 417 | size_t i; |
| 355 | DECL_X | 418 | DECL_X; |
| 356 | 419 | ||
| 357 | /* Convert count of 128-byte blocks to max index of 64-byte block */ | 420 | /* Convert count of 128-byte blocks to max index of 64-byte block */ |
| 358 | r = r * 2 - 1; | 421 | r = r * 2 - 1; |
| 359 | 422 | ||
| 360 | XOR_X_2(Bin1[r], Bin2[r]) | 423 | XOR_X_2(Bin1[r], Bin2[r]); |
| 361 | 424 | ||
| 362 | DECL_SMASK2REG | 425 | DECL_SMASK2REG; |
| 363 | 426 | ||
| 364 | i = 0; | 427 | i = 0; |
| 365 | r--; | 428 | r--; |
| 366 | do { | 429 | do { |
| 367 | XOR_X(Bin1[i]) | 430 | XOR_X(Bin1[i]); |
| 368 | XOR_X(Bin2[i]) | 431 | XOR_X(Bin2[i]); |
| 369 | PWXFORM | 432 | PWXFORM; |
| 370 | WRITE_X(Bout[i]) | 433 | WRITE_X(Bout[i]); |
| 371 | 434 | ||
| 372 | XOR_X(Bin1[i + 1]) | 435 | XOR_X(Bin1[i + 1]); |
| 373 | XOR_X(Bin2[i + 1]) | 436 | XOR_X(Bin2[i + 1]); |
| 374 | PWXFORM | 437 | PWXFORM; |
| 375 | 438 | ||
| 376 | if (unlikely(i >= r)) | 439 | if (unlikely(i >= r)) |
| 377 | break; | 440 | break; |
| 378 | 441 | ||
| 379 | WRITE_X(Bout[i + 1]) | 442 | WRITE_X(Bout[i + 1]); |
| 380 | 443 | ||
| 381 | i += 2; | 444 | i += 2; |
| 382 | } while (1); | 445 | } while (1); |
| @@ -385,7 +448,7 @@ static uint32_t blockmix_xor(const salsa20_blk_t *Bin1, | |||
| 385 | ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; | 448 | ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; |
| 386 | ctx->w = w; | 449 | ctx->w = w; |
| 387 | 450 | ||
| 388 | SALSA20_2(Bout[i]) | 451 | SALSA20_2(Bout[i]); |
| 389 | 452 | ||
| 390 | return INTEGERIFY; | 453 | return INTEGERIFY; |
| 391 | } | 454 | } |
| @@ -399,30 +462,30 @@ static uint32_t blockmix_xor_save( | |||
| 399 | uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; | 462 | uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; |
| 400 | size_t w = ctx->w; | 463 | size_t w = ctx->w; |
| 401 | size_t i; | 464 | size_t i; |
| 402 | DECL_X | 465 | DECL_X; |
| 403 | DECL_Y | 466 | DECL_Y; |
| 404 | 467 | ||
| 405 | /* Convert count of 128-byte blocks to max index of 64-byte block */ | 468 | /* Convert count of 128-byte blocks to max index of 64-byte block */ |
| 406 | r = r * 2 - 1; | 469 | r = r * 2 - 1; |
| 407 | 470 | ||
| 408 | XOR_X_2(Bin1out[r], Bin2[r]) | 471 | XOR_X_2(Bin1out[r], Bin2[r]); |
| 409 | 472 | ||
| 410 | DECL_SMASK2REG | 473 | DECL_SMASK2REG; |
| 411 | 474 | ||
| 412 | i = 0; | 475 | i = 0; |
| 413 | r--; | 476 | r--; |
| 414 | do { | 477 | do { |
| 415 | XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i]) | 478 | XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i]); |
| 416 | PWXFORM | 479 | PWXFORM; |
| 417 | WRITE_X(Bin1out[i]) | 480 | WRITE_X(Bin1out[i]); |
| 418 | 481 | ||
| 419 | XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1]) | 482 | XOR_X_WRITE_XOR_Y_2(Bin2[i + 1], Bin1out[i + 1]); |
| 420 | PWXFORM | 483 | PWXFORM; |
| 421 | 484 | ||
| 422 | if (unlikely(i >= r)) | 485 | if (unlikely(i >= r)) |
| 423 | break; | 486 | break; |
| 424 | 487 | ||
| 425 | WRITE_X(Bin1out[i + 1]) | 488 | WRITE_X(Bin1out[i + 1]); |
| 426 | 489 | ||
| 427 | i += 2; | 490 | i += 2; |
| 428 | } while (1); | 491 | } while (1); |
| @@ -431,7 +494,7 @@ static uint32_t blockmix_xor_save( | |||
| 431 | ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; | 494 | ctx->S0 = S0; ctx->S1 = S1; ctx->S2 = S2; |
| 432 | ctx->w = w; | 495 | ctx->w = w; |
| 433 | 496 | ||
| 434 | SALSA20_2(Bin1out[i]) | 497 | SALSA20_2(Bin1out[i]); |
| 435 | 498 | ||
| 436 | return INTEGERIFY; | 499 | return INTEGERIFY; |
| 437 | } | 500 | } |
