diff options
Diffstat (limited to '')
-rw-r--r-- | libbb/yescrypt/alg-yescrypt-kdf.c | 1212 |
1 files changed, 1212 insertions, 0 deletions
diff --git a/libbb/yescrypt/alg-yescrypt-kdf.c b/libbb/yescrypt/alg-yescrypt-kdf.c new file mode 100644 index 000000000..a9a1bd591 --- /dev/null +++ b/libbb/yescrypt/alg-yescrypt-kdf.c | |||
@@ -0,0 +1,1212 @@ | |||
1 | /*- | ||
2 | * Copyright 2009 Colin Percival | ||
3 | * Copyright 2012-2018 Alexander Peslyak | ||
4 | * All rights reserved. | ||
5 | * | ||
6 | * Redistribution and use in source and binary forms, with or without | ||
7 | * modification, are permitted provided that the following conditions | ||
8 | * are met: | ||
9 | * 1. Redistributions of source code must retain the above copyright | ||
10 | * notice, this list of conditions and the following disclaimer. | ||
11 | * 2. Redistributions in binary form must reproduce the above copyright | ||
12 | * notice, this list of conditions and the following disclaimer in the | ||
13 | * documentation and/or other materials provided with the distribution. | ||
14 | * | ||
15 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | ||
16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
18 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | ||
19 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | ||
20 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | ||
21 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
22 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | ||
23 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | ||
24 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | ||
25 | * SUCH DAMAGE. | ||
26 | * | ||
27 | * This file was originally written by Colin Percival as part of the Tarsnap | ||
28 | * online backup system. | ||
29 | */ | ||
30 | |||
31 | #if __STDC_VERSION__ >= 199901L | ||
32 | /* Have restrict */ | ||
33 | #elif defined(__GNUC__) | ||
34 | #define restrict __restrict | ||
35 | #else | ||
36 | #define restrict | ||
37 | #endif | ||
38 | |||
39 | #ifdef __GNUC__ | ||
40 | #define unlikely(exp) __builtin_expect(exp, 0) | ||
41 | #else | ||
42 | #define unlikely(exp) (exp) | ||
43 | #endif | ||
44 | |||
45 | typedef union { | ||
46 | uint32_t w[16]; | ||
47 | uint64_t d[8]; | ||
48 | } salsa20_blk_t; | ||
49 | |||
50 | static void salsa20_simd_shuffle( | ||
51 | const salsa20_blk_t *Bin, | ||
52 | salsa20_blk_t *Bout) | ||
53 | { | ||
54 | #define COMBINE(out, in1, in2) \ | ||
55 | do { \ | ||
56 | Bout->d[out] = Bin->w[in1 * 2] | ((uint64_t)Bin->w[in2 * 2 + 1] << 32); \ | ||
57 | } while (0) | ||
58 | COMBINE(0, 0, 2); | ||
59 | COMBINE(1, 5, 7); | ||
60 | COMBINE(2, 2, 4); | ||
61 | COMBINE(3, 7, 1); | ||
62 | COMBINE(4, 4, 6); | ||
63 | COMBINE(5, 1, 3); | ||
64 | COMBINE(6, 6, 0); | ||
65 | COMBINE(7, 3, 5); | ||
66 | #undef COMBINE | ||
67 | } | ||
68 | |||
69 | static void salsa20_simd_unshuffle( | ||
70 | const salsa20_blk_t *Bin, | ||
71 | salsa20_blk_t *Bout) | ||
72 | { | ||
73 | #define UNCOMBINE(out, in1, in2) \ | ||
74 | do { \ | ||
75 | Bout->w[out * 2] = Bin->d[in1]; \ | ||
76 | Bout->w[out * 2 + 1] = Bin->d[in2] >> 32; \ | ||
77 | } while (0) | ||
78 | UNCOMBINE(0, 0, 6); | ||
79 | UNCOMBINE(1, 5, 3); | ||
80 | UNCOMBINE(2, 2, 0); | ||
81 | UNCOMBINE(3, 7, 5); | ||
82 | UNCOMBINE(4, 4, 2); | ||
83 | UNCOMBINE(5, 1, 7); | ||
84 | UNCOMBINE(6, 6, 4); | ||
85 | UNCOMBINE(7, 3, 1); | ||
86 | #undef UNCOMBINE | ||
87 | } | ||
88 | |||
89 | #define DECL_X \ | ||
90 | salsa20_blk_t X | ||
91 | #define DECL_Y \ | ||
92 | salsa20_blk_t Y | ||
93 | |||
94 | #if KDF_UNROLL_COPY | ||
95 | #define COPY(out, in) \ | ||
96 | do { \ | ||
97 | (out).d[0] = (in).d[0]; \ | ||
98 | (out).d[1] = (in).d[1]; \ | ||
99 | (out).d[2] = (in).d[2]; \ | ||
100 | (out).d[3] = (in).d[3]; \ | ||
101 | (out).d[4] = (in).d[4]; \ | ||
102 | (out).d[5] = (in).d[5]; \ | ||
103 | (out).d[6] = (in).d[6]; \ | ||
104 | (out).d[7] = (in).d[7]; \ | ||
105 | } while (0) | ||
106 | #else | ||
107 | #define COPY(out, in) \ | ||
108 | do { \ | ||
109 | memcpy((out).d, (in).d, sizeof((in).d)); \ | ||
110 | } while (0) | ||
111 | #endif | ||
112 | |||
113 | #define READ_X(in) COPY(X, in) | ||
114 | #define WRITE_X(out) COPY(out, X) | ||
115 | |||
116 | /** | ||
117 | * salsa20(B): | ||
118 | * Apply the Salsa20 core to the provided block. | ||
119 | */ | ||
120 | static void salsa20(salsa20_blk_t *restrict B, | ||
121 | salsa20_blk_t *restrict Bout, | ||
122 | uint32_t doublerounds) | ||
123 | { | ||
124 | salsa20_blk_t X; | ||
125 | #define x X.w | ||
126 | |||
127 | salsa20_simd_unshuffle(B, &X); | ||
128 | |||
129 | do { | ||
130 | #define R(a,b) (((a) << (b)) | ((a) >> (32 - (b)))) | ||
131 | /* Operate on columns */ | ||
132 | #if KDF_UNROLL_SALSA20 | ||
133 | x[ 4] ^= R(x[ 0]+x[12], 7); // x[j] ^= R(x[k]+x[l], CONST) | ||
134 | x[ 8] ^= R(x[ 4]+x[ 0], 9); | ||
135 | x[12] ^= R(x[ 8]+x[ 4],13); | ||
136 | x[ 0] ^= R(x[12]+x[ 8],18); | ||
137 | |||
138 | x[ 9] ^= R(x[ 5]+x[ 1], 7); | ||
139 | x[13] ^= R(x[ 9]+x[ 5], 9); | ||
140 | x[ 1] ^= R(x[13]+x[ 9],13); | ||
141 | x[ 5] ^= R(x[ 1]+x[13],18); | ||
142 | |||
143 | x[14] ^= R(x[10]+x[ 6], 7); | ||
144 | x[ 2] ^= R(x[14]+x[10], 9); | ||
145 | x[ 6] ^= R(x[ 2]+x[14],13); | ||
146 | x[10] ^= R(x[ 6]+x[ 2],18); | ||
147 | |||
148 | x[ 3] ^= R(x[15]+x[11], 7); | ||
149 | x[ 7] ^= R(x[ 3]+x[15], 9); | ||
150 | x[11] ^= R(x[ 7]+x[ 3],13); | ||
151 | x[15] ^= R(x[11]+x[ 7],18); | ||
152 | #else | ||
153 | { | ||
154 | unsigned j, k, l; | ||
155 | j = 4; k = 0; l = 12; | ||
156 | for (;;) { | ||
157 | uint32_t t; | ||
158 | x[j] ^= ({ t = x[k] + x[l]; R(t, 7); }); l = k; k = j; j = (j+4) & 0xf; | ||
159 | x[j] ^= ({ t = x[k] + x[l]; R(t, 9); }); l = k; k = j; j = (j+4) & 0xf; | ||
160 | x[j] ^= ({ t = x[k] + x[l]; R(t,13); }); l = k; k = j; j = (j+4) & 0xf; | ||
161 | x[j] ^= ({ t = x[k] + x[l]; R(t,18); }); | ||
162 | if (j == 15) break; | ||
163 | l = j + 1; k = j + 5; j = (j+9) & 0xf; | ||
164 | } | ||
165 | } | ||
166 | #endif | ||
167 | /* Operate on rows */ | ||
168 | #if KDF_UNROLL_SALSA20 | ||
169 | // i=0 n=0 | ||
170 | x[ 1] ^= R(x[ 0]+x[ 3], 7); // [i + (n+1)&3] [i + (n+0)&3] [i + (n+3)&3] | ||
171 | x[ 2] ^= R(x[ 1]+x[ 0], 9); // [i + (n+2)&3] [i + (n+1)&3] [i + (n+0)&3] | ||
172 | x[ 3] ^= R(x[ 2]+x[ 1],13); // [i + (n+3)&3] [i + (n+2)&3] [i + (n+1)&3] | ||
173 | x[ 0] ^= R(x[ 3]+x[ 2],18); // [i + (n+0)&3] [i + (n+3)&3] [i + (n+2)&3] | ||
174 | // i=4 n=1 ^^^j^^^ ^^^k^^^ ^^^l^^^ | ||
175 | x[ 6] ^= R(x[ 5]+x[ 4], 7); // [i + (n+1)&3] [i + (n+0)&3] [i + (n+3)&3] | ||
176 | x[ 7] ^= R(x[ 6]+x[ 5], 9); // [i + (n+2)&3] [i + (n+1)&3] [i + (n+0)&3] | ||
177 | x[ 4] ^= R(x[ 7]+x[ 6],13); // [i + (n+3)&3] [i + (n+2)&3] [i + (n+1)&3] | ||
178 | x[ 5] ^= R(x[ 4]+x[ 7],18); // [i + (n+0)&3] [i + (n+3)&3] [i + (n+2)&3] | ||
179 | // i=8 n=2 | ||
180 | x[11] ^= R(x[10]+x[ 9], 7); // [i + (n+1)&3] [i + (n+0)&3] [i + (n+3)&3] | ||
181 | x[ 8] ^= R(x[11]+x[10], 9); // [i + (n+2)&3] [i + (n+1)&3] [i + (n+0)&3] | ||
182 | x[ 9] ^= R(x[ 8]+x[11],13); // [i + (n+3)&3] [i + (n+2)&3] [i + (n+1)&3] | ||
183 | x[10] ^= R(x[ 9]+x[ 8],18); // [i + (n+0)&3] [i + (n+3)&3] [i + (n+2)&3] | ||
184 | // i=12 n=3 | ||
185 | x[12] ^= R(x[15]+x[14], 7); // [i + (n+1)&3] [i + (n+0)&3] [i + (n+3)&3] | ||
186 | x[13] ^= R(x[12]+x[15], 9); // [i + (n+2)&3] [i + (n+1)&3] [i + (n+0)&3] | ||
187 | x[14] ^= R(x[13]+x[12],13); // [i + (n+3)&3] [i + (n+2)&3] [i + (n+1)&3] | ||
188 | x[15] ^= R(x[14]+x[13],18); // [i + (n+0)&3] [i + (n+3)&3] [i + (n+2)&3] | ||
189 | #else | ||
190 | { | ||
191 | unsigned j, k, l; | ||
192 | uint32_t *xrow; | ||
193 | j = 1; k = 0; l = 3; | ||
194 | xrow = &x[0]; | ||
195 | for (;;) { | ||
196 | uint32_t t; | ||
197 | xrow[j] ^= ({ t = xrow[k] + xrow[l]; R(t, 7); }); l = k; k = j; j = (j+1) & 3; | ||
198 | xrow[j] ^= ({ t = xrow[k] + xrow[l]; R(t, 9); }); l = k; k = j; j = (j+1) & 3; | ||
199 | xrow[j] ^= ({ t = xrow[k] + xrow[l]; R(t,13); }); l = k; k = j; j = (j+1) & 3; | ||
200 | xrow[j] ^= ({ t = xrow[k] + xrow[l]; R(t,18); }); | ||
201 | if (j == 3) break; | ||
202 | l = j; k = j + 1; j = (j+2) & 3; | ||
203 | xrow += 4; | ||
204 | } | ||
205 | } | ||
206 | #endif | ||
207 | |||
208 | #undef R | ||
209 | } while (--doublerounds); | ||
210 | #undef x | ||
211 | |||
212 | { | ||
213 | uint32_t i; | ||
214 | salsa20_simd_shuffle(&X, Bout); | ||
215 | for (i = 0; i < 16; i++) { | ||
216 | // bbox: note: was unrolled x4 | ||
217 | B->w[i] = Bout->w[i] += B->w[i]; | ||
218 | } | ||
219 | } | ||
220 | #if 0 | ||
221 | /* Too expensive */ | ||
222 | explicit_bzero(&X, sizeof(X)); | ||
223 | #endif | ||
224 | } | ||
225 | |||
226 | /** | ||
227 | * Apply the Salsa20/2 core to the block provided in X. | ||
228 | */ | ||
229 | #define SALSA20_2(out) \ | ||
230 | salsa20(&X, &out, 1) | ||
231 | |||
232 | #if 0 | ||
233 | #define XOR(out, in1, in2) \ | ||
234 | do { \ | ||
235 | (out).d[0] = (in1).d[0] ^ (in2).d[0]; \ | ||
236 | (out).d[1] = (in1).d[1] ^ (in2).d[1]; \ | ||
237 | (out).d[2] = (in1).d[2] ^ (in2).d[2]; \ | ||
238 | (out).d[3] = (in1).d[3] ^ (in2).d[3]; \ | ||
239 | (out).d[4] = (in1).d[4] ^ (in2).d[4]; \ | ||
240 | (out).d[5] = (in1).d[5] ^ (in2).d[5]; \ | ||
241 | (out).d[6] = (in1).d[6] ^ (in2).d[6]; \ | ||
242 | (out).d[7] = (in1).d[7] ^ (in2).d[7]; \ | ||
243 | } while (0) | ||
244 | #else | ||
245 | #define XOR(out, in1, in2) \ | ||
246 | do { \ | ||
247 | xorbuf64_3_aligned64(&(out).d, &(in1).d, &(in2).d); \ | ||
248 | } while (0) | ||
249 | #endif | ||
250 | |||
251 | #define XOR_X(in) XOR(X, X, in) | ||
252 | #define XOR_X_2(in1, in2) XOR(X, in1, in2) | ||
253 | #define XOR_X_WRITE_XOR_Y_2(out, in) \ | ||
254 | do { \ | ||
255 | XOR(Y, out, in); \ | ||
256 | COPY(out, Y); \ | ||
257 | XOR(X, X, Y); \ | ||
258 | } while (0) | ||
259 | |||
260 | /** | ||
261 | * Apply the Salsa20/8 core to the block provided in X ^ in. | ||
262 | */ | ||
263 | #define SALSA20_8_XOR_MEM(in, out) \ | ||
264 | do { \ | ||
265 | XOR_X(in); \ | ||
266 | salsa20(&X, &out, 4); \ | ||
267 | } while (0) | ||
268 | |||
269 | #define INTEGERIFY ((uint32_t)X.d[0]) | ||
270 | |||
271 | /** | ||
272 | * blockmix_salsa8(Bin, Bout, r): | ||
273 | * Compute Bout = BlockMix_{salsa20/8, r}(Bin). The input Bin must be 128r | ||
274 | * bytes in length; the output Bout must also be the same size. | ||
275 | */ | ||
276 | static void blockmix_salsa8( | ||
277 | const salsa20_blk_t *restrict Bin, | ||
278 | salsa20_blk_t *restrict Bout, | ||
279 | size_t r) | ||
280 | { | ||
281 | size_t i; | ||
282 | DECL_X; | ||
283 | |||
284 | READ_X(Bin[r * 2 - 1]); | ||
285 | for (i = 0; i < r; i++) { | ||
286 | SALSA20_8_XOR_MEM(Bin[i * 2], Bout[i]); | ||
287 | SALSA20_8_XOR_MEM(Bin[i * 2 + 1], Bout[r + i]); | ||
288 | } | ||
289 | } | ||
290 | |||
291 | static uint32_t blockmix_salsa8_xor( | ||
292 | const salsa20_blk_t *restrict Bin1, | ||
293 | const salsa20_blk_t *restrict Bin2, | ||
294 | salsa20_blk_t *restrict Bout, | ||
295 | size_t r) | ||
296 | { | ||
297 | size_t i; | ||
298 | DECL_X; | ||
299 | |||
300 | XOR_X_2(Bin1[r * 2 - 1], Bin2[r * 2 - 1]); | ||
301 | for (i = 0; i < r; i++) { | ||
302 | XOR_X(Bin1[i * 2]); | ||
303 | SALSA20_8_XOR_MEM(Bin2[i * 2], Bout[i]); | ||
304 | XOR_X(Bin1[i * 2 + 1]); | ||
305 | SALSA20_8_XOR_MEM(Bin2[i * 2 + 1], Bout[r + i]); | ||
306 | } | ||
307 | |||
308 | return INTEGERIFY; | ||
309 | } | ||
310 | |||
311 | /* This is tunable */ | ||
312 | #define Swidth 8 | ||
313 | |||
314 | /* Not tunable in this implementation, hard-coded in a few places */ | ||
315 | #define PWXsimple 2 | ||
316 | #define PWXgather 4 | ||
317 | |||
318 | /* Derived values. Not tunable except via Swidth above. */ | ||
319 | #define PWXbytes (PWXgather * PWXsimple * 8) | ||
320 | #define Sbytes (3 * (1 << Swidth) * PWXsimple * 8) | ||
321 | #define Smask (((1 << Swidth) - 1) * PWXsimple * 8) | ||
322 | #define Smask2 (((uint64_t)Smask << 32) | Smask) | ||
323 | |||
324 | #define DECL_SMASK2REG do {} while (0) | ||
325 | #define FORCE_REGALLOC_3 do {} while (0) | ||
326 | #define MAYBE_MEMORY_BARRIER do {} while (0) | ||
327 | |||
328 | #define PWXFORM_SIMD(x0, x1) \ | ||
329 | do { \ | ||
330 | uint64_t x = x0 & Smask2; \ | ||
331 | uint64_t *p0 = (uint64_t *)(S0 + (uint32_t)x); \ | ||
332 | uint64_t *p1 = (uint64_t *)(S1 + (x >> 32)); \ | ||
333 | x0 = ((x0 >> 32) * (uint32_t)x0 + p0[0]) ^ p1[0]; \ | ||
334 | x1 = ((x1 >> 32) * (uint32_t)x1 + p0[1]) ^ p1[1]; \ | ||
335 | } while (0) | ||
336 | |||
337 | #if KDF_UNROLL_PWXFORM_ROUND | ||
338 | #define PWXFORM_ROUND \ | ||
339 | do { \ | ||
340 | PWXFORM_SIMD(X.d[0], X.d[1]); \ | ||
341 | PWXFORM_SIMD(X.d[2], X.d[3]); \ | ||
342 | PWXFORM_SIMD(X.d[4], X.d[5]); \ | ||
343 | PWXFORM_SIMD(X.d[6], X.d[7]); \ | ||
344 | } while (0) | ||
345 | #else | ||
346 | #define PWXFORM_ROUND \ | ||
347 | do { \ | ||
348 | for (int pwxi=0; pwxi<8; pwxi+=2) \ | ||
349 | PWXFORM_SIMD(X.d[pwxi], X.d[pwxi + 1]); \ | ||
350 | } while (0) | ||
351 | #endif | ||
352 | |||
353 | /* | ||
354 | * This offset helps address the 256-byte write block via the single-byte | ||
355 | * displacements encodable in x86(-64) instructions. It is needed because the | ||
356 | * displacements are signed. Without it, we'd get 4-byte displacements for | ||
357 | * half of the writes. Setting it to 0x80 instead of 0x7c would avoid needing | ||
358 | * a displacement for one of the writes, but then the LEA instruction would | ||
359 | * need a 4-byte displacement. | ||
360 | */ | ||
361 | #define PWXFORM_WRITE_OFFSET 0x7c | ||
362 | |||
363 | #define PWXFORM_WRITE \ | ||
364 | do { \ | ||
365 | WRITE_X(*(salsa20_blk_t *)(Sw - PWXFORM_WRITE_OFFSET)); \ | ||
366 | Sw += 64; \ | ||
367 | } while (0) | ||
368 | |||
369 | #if KDF_UNROLL_PWXFORM | ||
370 | #define PWXFORM \ | ||
371 | do { \ | ||
372 | uint8_t *Sw = S2 + w + PWXFORM_WRITE_OFFSET; \ | ||
373 | FORCE_REGALLOC_3; \ | ||
374 | MAYBE_MEMORY_BARRIER; \ | ||
375 | PWXFORM_ROUND; \ | ||
376 | PWXFORM_ROUND; PWXFORM_WRITE; \ | ||
377 | PWXFORM_ROUND; PWXFORM_WRITE; \ | ||
378 | PWXFORM_ROUND; PWXFORM_WRITE; \ | ||
379 | PWXFORM_ROUND; PWXFORM_WRITE; \ | ||
380 | PWXFORM_ROUND; \ | ||
381 | w = (w + 64 * 4) & Smask2; \ | ||
382 | { \ | ||
383 | uint8_t *Stmp = S2; \ | ||
384 | S2 = S1; \ | ||
385 | S1 = S0; \ | ||
386 | S0 = Stmp; \ | ||
387 | } \ | ||
388 | } while (0) | ||
389 | #else | ||
390 | #define PWXFORM \ | ||
391 | do { \ | ||
392 | uint8_t *Sw = S2 + w + PWXFORM_WRITE_OFFSET; \ | ||
393 | FORCE_REGALLOC_3; \ | ||
394 | MAYBE_MEMORY_BARRIER; \ | ||
395 | PWXFORM_ROUND; \ | ||
396 | for (int pwxj=0; pwxj<4; pwxj++) {\ | ||
397 | PWXFORM_ROUND; PWXFORM_WRITE; \ | ||
398 | } \ | ||
399 | PWXFORM_ROUND; \ | ||
400 | w = (w + 64 * 4) & Smask2; \ | ||
401 | { \ | ||
402 | uint8_t *Stmp = S2; \ | ||
403 | S2 = S1; \ | ||
404 | S1 = S0; \ | ||
405 | S0 = Stmp; \ | ||
406 | } \ | ||
407 | } while (0) | ||
408 | #endif | ||
409 | |||
410 | typedef struct { | ||
411 | uint8_t *S0, *S1, *S2; | ||
412 | size_t w; | ||
413 | } pwxform_ctx_t; | ||
414 | |||
415 | #define Salloc (Sbytes + ((sizeof(pwxform_ctx_t) + 63) & ~63U)) | ||
416 | |||
417 | /** | ||
418 | * blockmix_pwxform(Bin, Bout, r, S): | ||
419 | * Compute Bout = BlockMix_pwxform{salsa20/2, r, S}(Bin). The input Bin must | ||
420 | * be 128r bytes in length; the output Bout must also be the same size. | ||
421 | */ | ||
422 | static void blockmix( | ||
423 | const salsa20_blk_t *restrict Bin, | ||
424 | salsa20_blk_t *restrict Bout, | ||
425 | size_t r, | ||
426 | pwxform_ctx_t *restrict ctx) | ||
427 | { | ||
428 | uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; | ||
429 | size_t w = ctx->w; | ||
430 | size_t i; | ||
431 | DECL_X; | ||
432 | |||
433 | /* Convert count of 128-byte blocks to max index of 64-byte block */ | ||
434 | r = r * 2 - 1; | ||
435 | |||
436 | READ_X(Bin[r]); | ||
437 | |||
438 | DECL_SMASK2REG; | ||
439 | |||
440 | i = 0; | ||
441 | for (;;) { | ||
442 | XOR_X(Bin[i]); | ||
443 | PWXFORM; | ||
444 | if (unlikely(i >= r)) | ||
445 | break; | ||
446 | WRITE_X(Bout[i]); | ||
447 | i++; | ||
448 | } | ||
449 | |||
450 | ctx->S0 = S0; | ||
451 | ctx->S1 = S1; | ||
452 | ctx->S2 = S2; | ||
453 | ctx->w = w; | ||
454 | |||
455 | SALSA20_2(Bout[i]); | ||
456 | } | ||
457 | |||
458 | static uint32_t blockmix_xor( | ||
459 | const salsa20_blk_t *Bin1, | ||
460 | const salsa20_blk_t *restrict Bin2, | ||
461 | salsa20_blk_t *Bout, | ||
462 | size_t r, | ||
463 | pwxform_ctx_t *restrict ctx) | ||
464 | { | ||
465 | uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; | ||
466 | size_t w = ctx->w; | ||
467 | size_t i; | ||
468 | DECL_X; | ||
469 | |||
470 | /* Convert count of 128-byte blocks to max index of 64-byte block */ | ||
471 | r = r * 2 - 1; | ||
472 | |||
473 | XOR_X_2(Bin1[r], Bin2[r]); | ||
474 | |||
475 | DECL_SMASK2REG; | ||
476 | |||
477 | i = 0; | ||
478 | r--; | ||
479 | for (;;) { | ||
480 | XOR_X(Bin1[i]); | ||
481 | XOR_X(Bin2[i]); | ||
482 | PWXFORM; | ||
483 | if (unlikely(i > r)) | ||
484 | break; | ||
485 | WRITE_X(Bout[i]); | ||
486 | i++; | ||
487 | } | ||
488 | |||
489 | ctx->S0 = S0; | ||
490 | ctx->S1 = S1; | ||
491 | ctx->S2 = S2; | ||
492 | ctx->w = w; | ||
493 | |||
494 | SALSA20_2(Bout[i]); | ||
495 | |||
496 | return INTEGERIFY; | ||
497 | } | ||
498 | |||
499 | static uint32_t blockmix_xor_save( | ||
500 | salsa20_blk_t *restrict Bin1out, | ||
501 | salsa20_blk_t *restrict Bin2, | ||
502 | size_t r, | ||
503 | pwxform_ctx_t *restrict ctx) | ||
504 | { | ||
505 | uint8_t *S0 = ctx->S0, *S1 = ctx->S1, *S2 = ctx->S2; | ||
506 | size_t w = ctx->w; | ||
507 | size_t i; | ||
508 | DECL_X; | ||
509 | DECL_Y; | ||
510 | |||
511 | /* Convert count of 128-byte blocks to max index of 64-byte block */ | ||
512 | r = r * 2 - 1; | ||
513 | |||
514 | XOR_X_2(Bin1out[r], Bin2[r]); | ||
515 | |||
516 | DECL_SMASK2REG; | ||
517 | |||
518 | i = 0; | ||
519 | r--; | ||
520 | for (;;) { | ||
521 | XOR_X_WRITE_XOR_Y_2(Bin2[i], Bin1out[i]); | ||
522 | PWXFORM; | ||
523 | if (unlikely(i > r)) | ||
524 | break; | ||
525 | WRITE_X(Bin1out[i]); | ||
526 | i++; | ||
527 | } | ||
528 | |||
529 | ctx->S0 = S0; | ||
530 | ctx->S1 = S1; | ||
531 | ctx->S2 = S2; | ||
532 | ctx->w = w; | ||
533 | |||
534 | SALSA20_2(Bin1out[i]); | ||
535 | |||
536 | return INTEGERIFY; | ||
537 | } | ||
538 | |||
539 | /** | ||
540 | * integerify(B, r): | ||
541 | * Return the result of parsing B_{2r-1} as a little-endian integer. | ||
542 | */ | ||
543 | static inline uint32_t integerify(const salsa20_blk_t *B, size_t r) | ||
544 | { | ||
545 | /* | ||
546 | * Our 64-bit words are in host byte order, which is why we don't just read | ||
547 | * w[0] here (would be wrong on big-endian). Also, our 32-bit words are | ||
548 | * SIMD-shuffled (so the next 32 bits would be part of d[6]), but currently | ||
549 | * this does not matter as we only care about the least significant 32 bits. | ||
550 | */ | ||
551 | return (uint32_t)B[2 * r - 1].d[0]; | ||
552 | } | ||
553 | |||
554 | /** | ||
555 | * smix1(B, r, N, flags, V, NROM, VROM, XY, ctx): | ||
556 | * Compute first loop of B = SMix_r(B, N). The input B must be 128r bytes in | ||
557 | * length; the temporary storage V must be 128rN bytes in length; the temporary | ||
558 | * storage XY must be 128r+64 bytes in length. N must be even and at least 4. | ||
559 | * The array V must be aligned to a multiple of 64 bytes, and arrays B and XY | ||
560 | * to a multiple of at least 16 bytes. | ||
561 | */ | ||
562 | #if DISABLE_NROM_CODE | ||
563 | #define smix1(B,r,N,flags,V,NROM,VROM,XY,ctx) \ | ||
564 | smix1(B,r,N,flags,V,XY,ctx) | ||
565 | #endif | ||
566 | static void smix1(uint8_t *B, size_t r, uint32_t N, | ||
567 | uint32_t flags, | ||
568 | salsa20_blk_t *V, | ||
569 | uint32_t NROM, const salsa20_blk_t *VROM, | ||
570 | salsa20_blk_t *XY, | ||
571 | pwxform_ctx_t *ctx) | ||
572 | { | ||
573 | #if DISABLE_NROM_CODE | ||
574 | uint32_t NROM = 0; | ||
575 | const salsa20_blk_t *VROM = NULL; | ||
576 | #endif | ||
577 | size_t s = 2 * r; | ||
578 | salsa20_blk_t *X = V, *Y = &V[s]; | ||
579 | uint32_t i, j; | ||
580 | |||
581 | for (i = 0; i < 2 * r; i++) { | ||
582 | const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64]; | ||
583 | salsa20_blk_t *tmp = Y; | ||
584 | salsa20_blk_t *dst = &X[i]; | ||
585 | size_t k; | ||
586 | for (k = 0; k < 16; k++) | ||
587 | tmp->w[k] = SWAP_LE32(src->w[k]); | ||
588 | salsa20_simd_shuffle(tmp, dst); | ||
589 | } | ||
590 | |||
591 | if (VROM) { | ||
592 | uint32_t n; | ||
593 | const salsa20_blk_t *V_j; | ||
594 | |||
595 | V_j = &VROM[(NROM - 1) * s]; | ||
596 | j = blockmix_xor(X, V_j, Y, r, ctx) & (NROM - 1); | ||
597 | V_j = &VROM[j * s]; | ||
598 | X = Y + s; | ||
599 | j = blockmix_xor(Y, V_j, X, r, ctx); | ||
600 | |||
601 | for (n = 2; n < N; n <<= 1) { | ||
602 | uint32_t m = (n < N / 2) ? n : (N - 1 - n); | ||
603 | for (i = 1; i < m; i += 2) { | ||
604 | j &= n - 1; | ||
605 | j += i - 1; | ||
606 | V_j = &V[j * s]; | ||
607 | Y = X + s; | ||
608 | j = blockmix_xor(X, V_j, Y, r, ctx) & (NROM - 1); | ||
609 | V_j = &VROM[j * s]; | ||
610 | X = Y + s; | ||
611 | j = blockmix_xor(Y, V_j, X, r, ctx); | ||
612 | } | ||
613 | } | ||
614 | n >>= 1; | ||
615 | |||
616 | j &= n - 1; | ||
617 | j += N - 2 - n; | ||
618 | V_j = &V[j * s]; | ||
619 | Y = X + s; | ||
620 | j = blockmix_xor(X, V_j, Y, r, ctx) & (NROM - 1); | ||
621 | V_j = &VROM[j * s]; | ||
622 | blockmix_xor(Y, V_j, XY, r, ctx); | ||
623 | } else if (flags & YESCRYPT_RW) { | ||
624 | //can't use flags___YESCRYPT_RW, smix1() may be called with flags = 0 | ||
625 | uint32_t n; | ||
626 | salsa20_blk_t *V_j; | ||
627 | |||
628 | blockmix(X, Y, r, ctx); | ||
629 | X = Y + s; | ||
630 | blockmix(Y, X, r, ctx); | ||
631 | j = integerify(X, r); | ||
632 | |||
633 | for (n = 2; n < N; n <<= 1) { | ||
634 | uint32_t m = (n < N / 2) ? n : (N - 1 - n); | ||
635 | for (i = 1; i < m; i += 2) { | ||
636 | Y = X + s; | ||
637 | j &= n - 1; | ||
638 | j += i - 1; | ||
639 | V_j = &V[j * s]; | ||
640 | j = blockmix_xor(X, V_j, Y, r, ctx); | ||
641 | j &= n - 1; | ||
642 | j += i; | ||
643 | V_j = &V[j * s]; | ||
644 | X = Y + s; | ||
645 | j = blockmix_xor(Y, V_j, X, r, ctx); | ||
646 | } | ||
647 | } | ||
648 | n >>= 1; | ||
649 | |||
650 | j &= n - 1; | ||
651 | j += N - 2 - n; | ||
652 | V_j = &V[j * s]; | ||
653 | Y = X + s; | ||
654 | j = blockmix_xor(X, V_j, Y, r, ctx); | ||
655 | j &= n - 1; | ||
656 | j += N - 1 - n; | ||
657 | V_j = &V[j * s]; | ||
658 | blockmix_xor(Y, V_j, XY, r, ctx); | ||
659 | } else { | ||
660 | N -= 2; | ||
661 | do { | ||
662 | blockmix_salsa8(X, Y, r); | ||
663 | X = Y + s; | ||
664 | blockmix_salsa8(Y, X, r); | ||
665 | Y = X + s; | ||
666 | } while ((N -= 2)); | ||
667 | |||
668 | blockmix_salsa8(X, Y, r); | ||
669 | blockmix_salsa8(Y, XY, r); | ||
670 | } | ||
671 | |||
672 | for (i = 0; i < 2 * r; i++) { | ||
673 | const salsa20_blk_t *src = &XY[i]; | ||
674 | salsa20_blk_t *tmp = &XY[s]; | ||
675 | salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; | ||
676 | size_t k; | ||
677 | for (k = 0; k < 16; k++) | ||
678 | tmp->w[k] = SWAP_LE32(src->w[k]); | ||
679 | salsa20_simd_unshuffle(tmp, dst); | ||
680 | } | ||
681 | } | ||
682 | |||
683 | /** | ||
684 | * smix2(B, r, N, Nloop, flags, V, NROM, VROM, XY, ctx): | ||
685 | * Compute second loop of B = SMix_r(B, N). The input B must be 128r bytes in | ||
686 | * length; the temporary storage V must be 128rN bytes in length; the temporary | ||
687 | * storage XY must be 256r bytes in length. N must be a power of 2 and at | ||
688 | * least 2. Nloop must be even. The array V must be aligned to a multiple of | ||
689 | * 64 bytes, and arrays B and XY to a multiple of at least 16 bytes. | ||
690 | */ | ||
691 | #if DISABLE_NROM_CODE | ||
692 | #define smix2(B,r,N,Nloop,flags,V,NROM,VROM,XY,ctx) \ | ||
693 | smix2(B,r,N,Nloop,flags,V,XY,ctx) | ||
694 | #endif | ||
695 | static void smix2(uint8_t *B, size_t r, uint32_t N, uint64_t Nloop, | ||
696 | uint32_t flags, | ||
697 | salsa20_blk_t *V, | ||
698 | uint32_t NROM, const salsa20_blk_t *VROM, | ||
699 | salsa20_blk_t *XY, | ||
700 | pwxform_ctx_t *ctx) | ||
701 | { | ||
702 | #if DISABLE_NROM_CODE | ||
703 | uint32_t NROM = 0; | ||
704 | const salsa20_blk_t *VROM = NULL; | ||
705 | #endif | ||
706 | size_t s = 2 * r; | ||
707 | salsa20_blk_t *X = XY, *Y = &XY[s]; | ||
708 | uint32_t i, j; | ||
709 | |||
710 | if (Nloop == 0) | ||
711 | return; | ||
712 | |||
713 | for (i = 0; i < 2 * r; i++) { | ||
714 | const salsa20_blk_t *src = (salsa20_blk_t *)&B[i * 64]; | ||
715 | salsa20_blk_t *tmp = Y; | ||
716 | salsa20_blk_t *dst = &X[i]; | ||
717 | size_t k; | ||
718 | for (k = 0; k < 16; k++) | ||
719 | tmp->w[k] = SWAP_LE32(src->w[k]); | ||
720 | salsa20_simd_shuffle(tmp, dst); | ||
721 | } | ||
722 | |||
723 | j = integerify(X, r) & (N - 1); | ||
724 | |||
725 | /* | ||
726 | * Normally, VROM implies YESCRYPT_RW, but we check for these separately | ||
727 | * because our SMix resets YESCRYPT_RW for the smix2() calls operating on the | ||
728 | * entire V when p > 1. | ||
729 | */ | ||
730 | //and this is why bbox can't use flags___YESCRYPT_RW in this function | ||
731 | if (VROM && (flags & YESCRYPT_RW)) { | ||
732 | do { | ||
733 | salsa20_blk_t *V_j = &V[j * s]; | ||
734 | const salsa20_blk_t *VROM_j; | ||
735 | j = blockmix_xor_save(X, V_j, r, ctx) & (NROM - 1); | ||
736 | VROM_j = &VROM[j * s]; | ||
737 | j = blockmix_xor(X, VROM_j, X, r, ctx) & (N - 1); | ||
738 | } while (Nloop -= 2); | ||
739 | } else if (VROM) { | ||
740 | do { | ||
741 | const salsa20_blk_t *V_j = &V[j * s]; | ||
742 | j = blockmix_xor(X, V_j, X, r, ctx) & (NROM - 1); | ||
743 | V_j = &VROM[j * s]; | ||
744 | j = blockmix_xor(X, V_j, X, r, ctx) & (N - 1); | ||
745 | } while (Nloop -= 2); | ||
746 | } else if (flags & YESCRYPT_RW) { | ||
747 | do { | ||
748 | salsa20_blk_t *V_j = &V[j * s]; | ||
749 | j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1); | ||
750 | V_j = &V[j * s]; | ||
751 | j = blockmix_xor_save(X, V_j, r, ctx) & (N - 1); | ||
752 | } while (Nloop -= 2); | ||
753 | } else if (ctx) { | ||
754 | do { | ||
755 | const salsa20_blk_t *V_j = &V[j * s]; | ||
756 | j = blockmix_xor(X, V_j, X, r, ctx) & (N - 1); | ||
757 | V_j = &V[j * s]; | ||
758 | j = blockmix_xor(X, V_j, X, r, ctx) & (N - 1); | ||
759 | } while (Nloop -= 2); | ||
760 | } else { | ||
761 | do { | ||
762 | const salsa20_blk_t *V_j = &V[j * s]; | ||
763 | j = blockmix_salsa8_xor(X, V_j, Y, r) & (N - 1); | ||
764 | V_j = &V[j * s]; | ||
765 | j = blockmix_salsa8_xor(Y, V_j, X, r) & (N - 1); | ||
766 | } while (Nloop -= 2); | ||
767 | } | ||
768 | |||
769 | for (i = 0; i < 2 * r; i++) { | ||
770 | const salsa20_blk_t *src = &X[i]; | ||
771 | salsa20_blk_t *tmp = Y; | ||
772 | salsa20_blk_t *dst = (salsa20_blk_t *)&B[i * 64]; | ||
773 | size_t k; | ||
774 | for (k = 0; k < 16; k++) | ||
775 | tmp->w[k] = SWAP_LE32(src->w[k]); | ||
776 | salsa20_simd_unshuffle(tmp, dst); | ||
777 | } | ||
778 | } | ||
779 | |||
780 | /** | ||
781 | * p2floor(x): | ||
782 | * Largest power of 2 not greater than argument. | ||
783 | */ | ||
784 | static uint64_t p2floor(uint64_t x) | ||
785 | { | ||
786 | uint64_t y; | ||
787 | while ((y = x & (x - 1))) | ||
788 | x = y; | ||
789 | return x; | ||
790 | } | ||
791 | |||
792 | /** | ||
793 | * smix(B, r, N, p, t, flags, V, NROM, VROM, XY, S, passwd): | ||
794 | * Compute B = SMix_r(B, N). The input B must be 128rp bytes in length; the | ||
795 | * temporary storage V must be 128rN bytes in length; the temporary storage | ||
796 | * XY must be 256r or 256rp bytes in length (the larger size is required with | ||
797 | * OpenMP-enabled builds). N must be a power of 2 and at least 4. The array V | ||
798 | * must be aligned to a multiple of 64 bytes, and arrays B and XY to a multiple | ||
799 | * of at least 16 bytes (aligning them to 64 bytes as well saves cache lines | ||
800 | * and helps avoid false sharing in OpenMP-enabled builds when p > 1, but it | ||
801 | * might also result in cache bank conflicts). | ||
802 | */ | ||
803 | #if DISABLE_NROM_CODE | ||
804 | #define smix(B,r,N,p,t,flags,V,NROM,VROM,XY,S,passwd) \ | ||
805 | smix(B,r,N,p,t,flags,V,XY,S,passwd) | ||
806 | #endif | ||
807 | static void smix(uint8_t *B, size_t r, uint32_t N, uint32_t p, uint32_t t, | ||
808 | uint32_t flags, | ||
809 | salsa20_blk_t *V, | ||
810 | uint32_t NROM, const salsa20_blk_t *VROM, | ||
811 | salsa20_blk_t *XY, | ||
812 | uint8_t *S, uint8_t *passwd) | ||
813 | { | ||
814 | size_t s = 2 * r; | ||
815 | uint32_t Nchunk; | ||
816 | uint64_t Nloop_all, Nloop_rw; | ||
817 | uint32_t i; | ||
818 | |||
819 | Nchunk = N / p; | ||
820 | Nloop_all = Nchunk; | ||
821 | if (flags___YESCRYPT_RW) { | ||
822 | if (t <= 1) { | ||
823 | if (t) | ||
824 | Nloop_all *= 2; /* 2/3 */ | ||
825 | Nloop_all = (Nloop_all + 2) / 3; /* 1/3, round up */ | ||
826 | } else { | ||
827 | Nloop_all *= t - 1; | ||
828 | } | ||
829 | } else if (t) { | ||
830 | if (t == 1) | ||
831 | Nloop_all += (Nloop_all + 1) / 2; /* 1.5, round up */ | ||
832 | Nloop_all *= t; | ||
833 | } | ||
834 | |||
835 | Nloop_rw = 0; | ||
836 | if (flags___YESCRYPT_RW) | ||
837 | Nloop_rw = Nloop_all / p; | ||
838 | |||
839 | Nchunk &= ~(uint32_t)1; /* round down to even */ | ||
840 | Nloop_all++; Nloop_all &= ~(uint64_t)1; /* round up to even */ | ||
841 | Nloop_rw++; Nloop_rw &= ~(uint64_t)1; /* round up to even */ | ||
842 | |||
843 | for (i = 0; i < p; i++) { | ||
844 | uint32_t Vchunk = i * Nchunk; | ||
845 | uint32_t Np = (i < p - 1) ? Nchunk : (N - Vchunk); | ||
846 | uint8_t *Bp = &B[128 * r * i]; | ||
847 | salsa20_blk_t *Vp = &V[Vchunk * s]; | ||
848 | salsa20_blk_t *XYp = XY; | ||
849 | pwxform_ctx_t *ctx_i = NULL; | ||
850 | if (flags___YESCRYPT_RW) { | ||
851 | uint8_t *Si = S + i * Salloc; | ||
852 | smix1(Bp, 1, Sbytes / 128, 0 /* no flags */, | ||
853 | (salsa20_blk_t *)Si, 0, NULL, XYp, NULL); | ||
854 | ctx_i = (pwxform_ctx_t *)(Si + Sbytes); | ||
855 | ctx_i->S2 = Si; | ||
856 | ctx_i->S1 = Si + Sbytes / 3; | ||
857 | ctx_i->S0 = Si + Sbytes / 3 * 2; | ||
858 | ctx_i->w = 0; | ||
859 | if (i == 0) | ||
860 | hmac_block( | ||
861 | /* key,len: */ Bp + (128 * r - 64), 64, | ||
862 | /* hash fn: */ sha256_begin, | ||
863 | /* in,len: */ passwd, 32, | ||
864 | /* outbuf: */ passwd | ||
865 | ); | ||
866 | } | ||
867 | smix1(Bp, r, Np, flags, Vp, NROM, VROM, XYp, ctx_i); | ||
868 | smix2(Bp, r, p2floor(Np), Nloop_rw, flags, Vp, | ||
869 | NROM, VROM, XYp, ctx_i); | ||
870 | } | ||
871 | |||
872 | if (Nloop_all > Nloop_rw) { | ||
873 | for (i = 0; i < p; i++) { | ||
874 | uint8_t *Bp = &B[128 * r * i]; | ||
875 | salsa20_blk_t *XYp = XY; | ||
876 | pwxform_ctx_t *ctx_i = NULL; | ||
877 | if (flags___YESCRYPT_RW) { | ||
878 | uint8_t *Si = S + i * Salloc; | ||
879 | ctx_i = (pwxform_ctx_t *)(Si + Sbytes); | ||
880 | } | ||
881 | smix2(Bp, r, N, Nloop_all - Nloop_rw, | ||
882 | flags & (uint32_t)~YESCRYPT_RW, | ||
883 | V, NROM, VROM, XYp, ctx_i); | ||
884 | } | ||
885 | } | ||
886 | } | ||
887 | |||
888 | /* Allocator code */ | ||
889 | |||
890 | static void alloc_region(yescrypt_region_t *region, size_t size) | ||
891 | { | ||
892 | uint8_t *base; | ||
893 | int flags = | ||
894 | # ifdef MAP_NOCORE /* huh? */ | ||
895 | MAP_NOCORE | | ||
896 | # endif | ||
897 | MAP_ANON | MAP_PRIVATE; | ||
898 | |||
899 | base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0); | ||
900 | if (base == MAP_FAILED) | ||
901 | bb_die_memory_exhausted(); | ||
902 | |||
903 | #if defined(MADV_HUGEPAGE) | ||
904 | /* Reduces mkpasswd qweRTY123@-+ '$y$jHT$123' | ||
905 | * (which allocates 4 Gbytes) | ||
906 | * run time from 10.543s to 5.635s | ||
907 | * Seen on linux-5.18.0. | ||
908 | */ | ||
909 | madvise(base, size, MADV_HUGEPAGE); | ||
910 | #endif | ||
911 | //region->base = base; | ||
912 | //region->base_size = size; | ||
913 | region->aligned = base; | ||
914 | region->aligned_size = size; | ||
915 | } | ||
916 | |||
917 | static void free_region(yescrypt_region_t *region) | ||
918 | { | ||
919 | if (region->aligned) | ||
920 | munmap(region->aligned, region->aligned_size); | ||
921 | //region->base = NULL; | ||
922 | //region->base_size = 0; | ||
923 | region->aligned = NULL; | ||
924 | region->aligned_size = 0; | ||
925 | } | ||
926 | /** | ||
927 | * yescrypt_kdf_body(shared, local, passwd, passwdlen, salt, saltlen, | ||
928 | * flags, N, r, p, t, NROM, buf, buflen): | ||
929 | * Compute scrypt(passwd[0 .. passwdlen - 1], salt[0 .. saltlen - 1], N, r, | ||
930 | * p, buflen), or a revision of scrypt as requested by flags and shared, and | ||
931 | * write the result into buf. | ||
932 | * | ||
933 | * shared and flags may request special modes as described in yescrypt.h. | ||
934 | * | ||
935 | * local is the thread-local data structure, allowing to preserve and reuse a | ||
936 | * memory allocation across calls, thereby reducing its overhead. | ||
937 | * | ||
938 | * t controls computation time while not affecting peak memory usage. | ||
939 | * | ||
940 | * Return 0 on success; or -1 on error. | ||
941 | * | ||
942 | * This optimized implementation currently limits N to the range from 4 to | ||
943 | * 2^31, but other implementations might not. | ||
944 | */ | ||
945 | static int yescrypt_kdf32_body( | ||
946 | yescrypt_ctx_t *yctx, | ||
947 | const uint8_t *passwd, size_t passwdlen, | ||
948 | uint32_t flags, uint64_t N, uint32_t t, | ||
949 | uint8_t *buf32) | ||
950 | { | ||
951 | #if !DISABLE_NROM_CODE | ||
952 | const salsa20_blk_t *VROM; | ||
953 | #endif | ||
954 | size_t B_size, V_size, XY_size, need; | ||
955 | uint8_t *B, *S; | ||
956 | salsa20_blk_t *V, *XY; | ||
957 | struct { | ||
958 | uint8_t sha256[32]; | ||
959 | uint8_t dk[32]; | ||
960 | } u; | ||
961 | #define sha256 u.sha256 | ||
962 | #define dk u.dk | ||
963 | uint8_t *dkp = buf32; | ||
964 | uint32_t r, p; | ||
965 | |||
966 | /* Sanity-check parameters */ | ||
967 | switch (flags___YESCRYPT_MODE_MASK) { | ||
968 | case 0: /* classic scrypt - can't have anything non-standard */ | ||
969 | if (flags || t || YCTX_param_NROM) | ||
970 | goto out_EINVAL; | ||
971 | break; | ||
972 | case YESCRYPT_WORM: | ||
973 | if (flags != YESCRYPT_WORM || YCTX_param_NROM) | ||
974 | goto out_EINVAL; | ||
975 | break; | ||
976 | case YESCRYPT_RW: | ||
977 | if (flags != (flags & YESCRYPT_KNOWN_FLAGS)) | ||
978 | goto out_EINVAL; | ||
979 | #if PWXsimple == 2 && PWXgather == 4 && Sbytes == 12288 | ||
980 | if ((flags & YESCRYPT_RW_FLAVOR_MASK) == | ||
981 | (YESCRYPT_ROUNDS_6 | YESCRYPT_GATHER_4 | | ||
982 | YESCRYPT_SIMPLE_2 | YESCRYPT_SBOX_12K)) | ||
983 | break; | ||
984 | #else | ||
985 | #error "Unsupported pwxform settings" | ||
986 | #endif | ||
987 | /* FALLTHRU */ | ||
988 | default: | ||
989 | goto out_EINVAL; | ||
990 | } | ||
991 | |||
992 | r = YCTX_param_r; | ||
993 | p = YCTX_param_p; | ||
994 | if ((uint64_t)r * (uint64_t)p >= 1 << 30) { | ||
995 | dbg("r * n >= 2^30"); | ||
996 | goto out_EINVAL; | ||
997 | } | ||
998 | if (N > UINT32_MAX) { | ||
999 | dbg("N > 0x%lx", (long)UINT32_MAX); | ||
1000 | goto out_EINVAL; | ||
1001 | } | ||
1002 | if (N <= 3 | ||
1003 | || r < 1 | ||
1004 | || p < 1 | ||
1005 | ) { | ||
1006 | dbg("bad N, r or p"); | ||
1007 | goto out_EINVAL; | ||
1008 | } | ||
1009 | if (r > SIZE_MAX / 256 / p | ||
1010 | || N > SIZE_MAX / 128 / r | ||
1011 | ) { | ||
1012 | /* 32-bit testcase: mkpasswd qweRTY123@-+ '$y$jHT$123' | ||
1013 | * (works on 64-bit, needs buffer > 4Gbytes) | ||
1014 | */ | ||
1015 | dbg("r > SIZE_MAX / 256 / p? %c", "NY"[r > SIZE_MAX / 256 / p]); | ||
1016 | dbg("N > SIZE_MAX / 128 / r? %c", "NY"[N > SIZE_MAX / 128 / r]); | ||
1017 | goto out_EINVAL; | ||
1018 | } | ||
1019 | if (flags___YESCRYPT_RW) { | ||
1020 | /* p cannot be greater than SIZE_MAX/Salloc on 64-bit systems, | ||
1021 | but it can on 32-bit systems. */ | ||
1022 | #pragma GCC diagnostic push | ||
1023 | #pragma GCC diagnostic ignored "-Wtype-limits" | ||
1024 | if (N / p <= 3 || p > SIZE_MAX / Salloc) { | ||
1025 | dbg("bad p:%ld", (long)p); | ||
1026 | goto out_EINVAL; | ||
1027 | } | ||
1028 | #pragma GCC diagnostic pop | ||
1029 | } | ||
1030 | |||
1031 | #if !DISABLE_NROM_CODE | ||
1032 | VROM = NULL; | ||
1033 | if (YCTX_param_NROM) | ||
1034 | goto out_EINVAL; | ||
1035 | #endif | ||
1036 | |||
1037 | /* Allocate memory */ | ||
1038 | V = NULL; | ||
1039 | V_size = (size_t)128 * r * N; | ||
1040 | need = V_size; | ||
1041 | B_size = (size_t)128 * r * p; | ||
1042 | need += B_size; | ||
1043 | if (need < B_size) { | ||
1044 | dbg("integer overflow at += B_size(%lu)", (long)B_size); | ||
1045 | goto out_EINVAL; | ||
1046 | } | ||
1047 | XY_size = (size_t)256 * r; | ||
1048 | need += XY_size; | ||
1049 | if (need < XY_size) { | ||
1050 | dbg("integer overflow at += XY_size(%lu)", (long)XY_size); | ||
1051 | goto out_EINVAL; | ||
1052 | } | ||
1053 | if (flags___YESCRYPT_RW) { | ||
1054 | size_t S_size = (size_t)Salloc * p; | ||
1055 | need += S_size; | ||
1056 | if (need < S_size) { | ||
1057 | dbg("integer overflow at += S_size(%lu)", (long)S_size); | ||
1058 | goto out_EINVAL; | ||
1059 | } | ||
1060 | } | ||
1061 | if (yctx->local->aligned_size < need) { | ||
1062 | free_region(yctx->local); | ||
1063 | alloc_region(yctx->local, need); | ||
1064 | dbg("allocated local:%lu 0x%lx", (long)need, (long)need); | ||
1065 | /* standard "j9T" params allocate 16Mbytes here */ | ||
1066 | } | ||
1067 | if (flags & YESCRYPT_ALLOC_ONLY) | ||
1068 | return -3; /* expected "failure" */ | ||
1069 | B = (uint8_t *)yctx->local->aligned; | ||
1070 | V = (salsa20_blk_t *)((uint8_t *)B + B_size); | ||
1071 | XY = (salsa20_blk_t *)((uint8_t *)V + V_size); | ||
1072 | S = NULL; | ||
1073 | if (flags___YESCRYPT_RW) | ||
1074 | S = (uint8_t *)XY + XY_size; | ||
1075 | |||
1076 | if (flags) { | ||
1077 | hmac_block( | ||
1078 | /* key,len: */ (const void*)"yescrypt-prehash", (flags & YESCRYPT_PREHASH) ? 16 : 8, | ||
1079 | /* hash fn: */ sha256_begin, | ||
1080 | /* in,len: */ passwd, passwdlen, | ||
1081 | /* outbuf: */ sha256 | ||
1082 | ); | ||
1083 | passwd = sha256; | ||
1084 | passwdlen = sizeof(sha256); | ||
1085 | } | ||
1086 | |||
1087 | PBKDF2_SHA256(passwd, passwdlen, yctx->salt, yctx->saltlen, 1, B, B_size); | ||
1088 | |||
1089 | if (flags) | ||
1090 | memcpy(sha256, B, sizeof(sha256)); | ||
1091 | |||
1092 | if (p == 1 || (flags___YESCRYPT_RW)) { | ||
1093 | smix(B, r, N, p, t, flags, V, YCTX_param_NROM, VROM, XY, S, sha256); | ||
1094 | } else { | ||
1095 | uint32_t i; | ||
1096 | for (i = 0; i < p; i++) { | ||
1097 | smix(&B[(size_t)128 * r * i], r, N, 1, t, flags, V, | ||
1098 | YCTX_param_NROM, VROM, XY, NULL, NULL); | ||
1099 | } | ||
1100 | } | ||
1101 | |||
1102 | dkp = buf32; | ||
1103 | if (flags && /*buflen:*/32 < sizeof(dk)) { | ||
1104 | PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, dk, sizeof(dk)); | ||
1105 | dkp = dk; | ||
1106 | } | ||
1107 | |||
1108 | PBKDF2_SHA256(passwd, passwdlen, B, B_size, 1, buf32, /*buflen:*/32); | ||
1109 | |||
1110 | /* | ||
1111 | * Except when computing classic scrypt, allow all computation so far | ||
1112 | * to be performed on the client. The final steps below match those of | ||
1113 | * SCRAM (RFC 5802), so that an extension of SCRAM (with the steps so | ||
1114 | * far in place of SCRAM's use of PBKDF2 and with SHA-256 in place of | ||
1115 | * SCRAM's use of SHA-1) would be usable with yescrypt hashes. | ||
1116 | */ | ||
1117 | if (flags && !(flags & YESCRYPT_PREHASH)) { | ||
1118 | /* Compute ClientKey */ | ||
1119 | hmac_block( | ||
1120 | /* key,len: */ dkp, sizeof(dk), | ||
1121 | /* hash fn: */ sha256_begin, | ||
1122 | /* in,len: */ "Client Key", 10, | ||
1123 | /* outbuf: */ sha256 | ||
1124 | ); | ||
1125 | /* Compute StoredKey */ | ||
1126 | { | ||
1127 | size_t clen = /*buflen:*/32; | ||
1128 | if (clen > sizeof(dk)) | ||
1129 | clen = sizeof(dk); | ||
1130 | if (sizeof(dk) != 32) { /* not true, optimize it out */ | ||
1131 | sha256_block(sha256, sizeof(sha256), dk); | ||
1132 | memcpy(buf32, dk, clen); | ||
1133 | } else { | ||
1134 | sha256_block(sha256, sizeof(sha256), buf32); | ||
1135 | } | ||
1136 | } | ||
1137 | } | ||
1138 | |||
1139 | explicit_bzero(&u, sizeof(u)); | ||
1140 | |||
1141 | /* Success! */ | ||
1142 | return 0; | ||
1143 | |||
1144 | out_EINVAL: | ||
1145 | //bbox does not need this: errno = EINVAL; | ||
1146 | return -1; | ||
1147 | #undef sha256 | ||
1148 | #undef dk | ||
1149 | } | ||
1150 | |||
1151 | /** | ||
1152 | * yescrypt_kdf(shared, local, passwd, passwdlen, salt, saltlen, params, | ||
1153 | * buf, buflen): | ||
1154 | * Compute scrypt or its revision as requested by the parameters. The inputs | ||
1155 | * to this function are the same as those for yescrypt_kdf_body() above, with | ||
1156 | * the addition of g, which controls hash upgrades (0 for no upgrades so far). | ||
1157 | */ | ||
1158 | static | ||
1159 | int yescrypt_kdf32( | ||
1160 | yescrypt_ctx_t *yctx, | ||
1161 | const uint8_t *passwd, size_t passwdlen, | ||
1162 | uint8_t *buf32) | ||
1163 | { | ||
1164 | uint32_t flags = YCTX_param_flags; | ||
1165 | uint64_t N = YCTX_param_N; | ||
1166 | uint32_t r = YCTX_param_r; | ||
1167 | uint32_t p = YCTX_param_p; | ||
1168 | uint32_t t = YCTX_param_t; | ||
1169 | uint32_t g = YCTX_param_g; | ||
1170 | uint8_t dk32[32]; | ||
1171 | int retval; | ||
1172 | |||
1173 | /* Support for hash upgrades has been temporarily removed */ | ||
1174 | if (g) { | ||
1175 | //bbox does not need this: errno = EINVAL; | ||
1176 | return -1; | ||
1177 | } | ||
1178 | |||
1179 | if ((flags___YESCRYPT_RW) | ||
1180 | && p >= 1 | ||
1181 | && N / p >= 0x100 | ||
1182 | && N / p * r >= 0x20000 | ||
1183 | ) { | ||
1184 | if (yescrypt_kdf32_body(yctx, | ||
1185 | passwd, passwdlen, | ||
1186 | flags | YESCRYPT_ALLOC_ONLY, N, t, | ||
1187 | buf32) != -3 | ||
1188 | ) { | ||
1189 | dbg("yescrypt_kdf32_body: not -3"); | ||
1190 | return -1; | ||
1191 | } | ||
1192 | retval = yescrypt_kdf32_body(yctx, | ||
1193 | passwd, passwdlen, | ||
1194 | flags | YESCRYPT_PREHASH, N >> 6, 0, | ||
1195 | dk32); | ||
1196 | if (retval) { | ||
1197 | dbg("yescrypt_kdf32_body(PREHASH):%d", retval); | ||
1198 | return retval; | ||
1199 | } | ||
1200 | passwd = dk32; | ||
1201 | passwdlen = sizeof(dk32); | ||
1202 | } | ||
1203 | |||
1204 | retval = yescrypt_kdf32_body(yctx, | ||
1205 | passwd, passwdlen, | ||
1206 | flags, N, t, buf32); | ||
1207 | |||
1208 | explicit_bzero(dk32, sizeof(dk32)); | ||
1209 | |||
1210 | dbg("yescrypt_kdf32_body:%d", retval); | ||
1211 | return retval; | ||
1212 | } | ||