Diffstat (limited to 'C/Blake2s.c')
-rw-r--r--  C/Blake2s.c | 2693 +++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 2544 insertions(+), 149 deletions(-)
diff --git a/C/Blake2s.c b/C/Blake2s.c
index 2a84b57..459e76b 100644
--- a/C/Blake2s.c
+++ b/C/Blake2s.c
@@ -1,250 +1,2645 @@
-/* Blake2s.c -- BLAKE2s and BLAKE2sp Hash
-2023-03-04 : Igor Pavlov : Public domain
-2015 : Samuel Neves : Public domain */
+/* Blake2s.c -- BLAKE2sp Hash
+2024-01-29 : Igor Pavlov : Public domain
+2015-2019 : Samuel Neves : original code : CC0 1.0 Universal (CC0 1.0). */
 
 #include "Precomp.h"
 
+// #include <stdio.h>
 #include <string.h>
 
 #include "Blake2.h"
-#include "CpuArch.h"
 #include "RotateDefs.h"
+#include "Compiler.h"
+#include "CpuArch.h"
+
+#if defined(__SSE2__)
+    #define Z7_BLAKE2S_USE_VECTORS
+#elif defined(MY_CPU_X86_OR_AMD64)
+  #if defined(_MSC_VER) && _MSC_VER > 1200 \
+     || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
+     || defined(__clang__) \
+     || defined(__INTEL_COMPILER)
+    #define Z7_BLAKE2S_USE_VECTORS
+  #endif
+#endif
+
+#ifdef Z7_BLAKE2S_USE_VECTORS
+
+#define Z7_BLAKE2SP_USE_FUNCTIONS
+
+// define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED, if CBlake2sp can be unaligned for 32 bytes.
+// #define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
+
+// SSSE3 : for _mm_shuffle_epi8 (pshufb) that improves the performance by 5-15%.
+#if defined(__SSSE3__)
+  #define Z7_BLAKE2S_USE_SSSE3
+#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
+   || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
+   || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
+   || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
+   || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
+  #define Z7_BLAKE2S_USE_SSSE3
+#endif
+
+#ifdef Z7_BLAKE2S_USE_SSSE3
+/* SSE41 : for _mm_insert_epi32 (pinsrd)
+   it can slightly reduce code size and improve the performance in some cases.
+   it's used only for the last 512-1024 bytes, if FAST versions (2 or 3) of vector algos are used.
+   it can be used for all blocks in other algos (4+).
+*/
+#if defined(__SSE4_1__)
+  #define Z7_BLAKE2S_USE_SSE41
+#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
+   || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
+   || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
+   || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
+   || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
+  #define Z7_BLAKE2S_USE_SSE41
+#endif
+#endif // SSSE3
+
+#if defined(__GNUC__) || defined(__clang__)
+  #if defined(Z7_BLAKE2S_USE_SSE41)
+    #define BLAKE2S_ATTRIB_128BIT  __attribute__((__target__("sse4.1")))
+  #elif defined(Z7_BLAKE2S_USE_SSSE3)
+    #define BLAKE2S_ATTRIB_128BIT  __attribute__((__target__("ssse3")))
+  #else
+    #define BLAKE2S_ATTRIB_128BIT  __attribute__((__target__("sse2")))
+  #endif
+#endif
+
+
+#if defined(__AVX2__)
+  #define Z7_BLAKE2S_USE_AVX2
+#else
+  #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
+     || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
+     || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100)
+    #define Z7_BLAKE2S_USE_AVX2
+    #ifdef Z7_BLAKE2S_USE_AVX2
+      #define BLAKE2S_ATTRIB_AVX2  __attribute__((__target__("avx2")))
+    #endif
+  #elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
+     || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
+    #if (Z7_MSC_VER_ORIGINAL == 1900)
+      #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
+    #endif
+    #define Z7_BLAKE2S_USE_AVX2
+  #endif
+#endif
+
+#ifdef Z7_BLAKE2S_USE_SSE41
+#include <smmintrin.h> // SSE4.1
+#elif defined(Z7_BLAKE2S_USE_SSSE3)
+#include <tmmintrin.h> // SSSE3
+#else
+#include <emmintrin.h> // SSE2
+#endif
+
+#ifdef Z7_BLAKE2S_USE_AVX2
+#include <immintrin.h>
+#if defined(__clang__)
+#include <avxintrin.h>
+#include <avx2intrin.h>
+#endif
+#endif // avx2
+
+
+#if defined(__AVX512F__) && defined(__AVX512VL__)
+    // && defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL > 1930)
+  #define Z7_BLAKE2S_USE_AVX512_ALWAYS
+  // #pragma message ("=== Blake2s AVX512")
+#endif
 
-#define rotr32 rotrFixed
 
-#define BLAKE2S_NUM_ROUNDS 10
-#define BLAKE2S_FINAL_FLAG (~(UInt32)0)
+#define Z7_BLAKE2S_USE_V128_FAST
+// for speed optimization for small messages:
+// #define Z7_BLAKE2S_USE_V128_WAY2
 
+#ifdef Z7_BLAKE2S_USE_AVX2
+
+// for debug:
+// gather is slow
+// #define Z7_BLAKE2S_USE_GATHER
+
+#define Z7_BLAKE2S_USE_AVX2_FAST
+// for speed optimization for small messages:
+// #define Z7_BLAKE2S_USE_AVX2_WAY2
+// #define Z7_BLAKE2S_USE_AVX2_WAY4
+#if defined(Z7_BLAKE2S_USE_AVX2_WAY2) || \
+    defined(Z7_BLAKE2S_USE_AVX2_WAY4)
+  #define Z7_BLAKE2S_USE_AVX2_WAY_SLOW
+#endif
+#endif
+
+#define Z7_BLAKE2SP_ALGO_DEFAULT    0
+#define Z7_BLAKE2SP_ALGO_SCALAR     1
+#ifdef Z7_BLAKE2S_USE_V128_FAST
+#define Z7_BLAKE2SP_ALGO_V128_FAST  2
+#endif
+#ifdef Z7_BLAKE2S_USE_AVX2_FAST
+#define Z7_BLAKE2SP_ALGO_V256_FAST  3
+#endif
+#define Z7_BLAKE2SP_ALGO_V128_WAY1  4
+#ifdef Z7_BLAKE2S_USE_V128_WAY2
+#define Z7_BLAKE2SP_ALGO_V128_WAY2  5
+#endif
+#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
+#define Z7_BLAKE2SP_ALGO_V256_WAY2  6
+#endif
+#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
+#define Z7_BLAKE2SP_ALGO_V256_WAY4  7
+#endif
+
+#endif // Z7_BLAKE2S_USE_VECTORS
+
+
+
+
+#define BLAKE2S_FINAL_FLAG  (~(UInt32)0)
+#define NSW                 Z7_BLAKE2SP_NUM_STRUCT_WORDS
+#define SUPER_BLOCK_SIZE    (Z7_BLAKE2S_BLOCK_SIZE * Z7_BLAKE2SP_PARALLEL_DEGREE)
+#define SUPER_BLOCK_MASK    (SUPER_BLOCK_SIZE - 1)
+
+#define V_INDEX_0_0   0
+#define V_INDEX_1_0   1
+#define V_INDEX_2_0   2
+#define V_INDEX_3_0   3
+#define V_INDEX_0_1   4
+#define V_INDEX_1_1   5
+#define V_INDEX_2_1   6
+#define V_INDEX_3_1   7
+#define V_INDEX_0_2   8
+#define V_INDEX_1_2   9
+#define V_INDEX_2_2  10
+#define V_INDEX_3_2  11
+#define V_INDEX_0_3  12
+#define V_INDEX_1_3  13
+#define V_INDEX_2_3  14
+#define V_INDEX_3_3  15
+#define V_INDEX_4_0   0
+#define V_INDEX_5_0   1
+#define V_INDEX_6_0   2
+#define V_INDEX_7_0   3
+#define V_INDEX_7_1   4
+#define V_INDEX_4_1   5
+#define V_INDEX_5_1   6
+#define V_INDEX_6_1   7
+#define V_INDEX_6_2   8
+#define V_INDEX_7_2   9
+#define V_INDEX_4_2  10
+#define V_INDEX_5_2  11
+#define V_INDEX_5_3  12
+#define V_INDEX_6_3  13
+#define V_INDEX_7_3  14
+#define V_INDEX_4_3  15
+
+#define V(row, col)  v[V_INDEX_ ## row ## _ ## col]
+
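Note: the V_INDEX_4_* .. V_INDEX_7_* rows above encode BLAKE2s's diagonal step
on the same 16-word working array: after the column step, G-call 4 touches
words (0,5,10,15), call 5 touches (1,6,11,12), and so on. A minimal standalone
check of that mapping (an editorial sketch, not part of the commit; the
defines are restated so it compiles on its own):

    #include <assert.h>
    #define V_INDEX_4_0   0
    #define V_INDEX_4_1   5
    #define V_INDEX_4_2  10
    #define V_INDEX_4_3  15
    int main(void)
    {
      /* the scalar reference's first diagonal call is
         G(r,4, v[0], v[5], v[10], v[15]), so row 4 must index 0,5,10,15 */
      assert(V_INDEX_4_0 == 0 && V_INDEX_4_1 == 5 &&
             V_INDEX_4_2 == 10 && V_INDEX_4_3 == 15);
      return 0;
    }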
+#define k_Blake2s_IV_0  0x6A09E667UL
+#define k_Blake2s_IV_1  0xBB67AE85UL
+#define k_Blake2s_IV_2  0x3C6EF372UL
+#define k_Blake2s_IV_3  0xA54FF53AUL
+#define k_Blake2s_IV_4  0x510E527FUL
+#define k_Blake2s_IV_5  0x9B05688CUL
+#define k_Blake2s_IV_6  0x1F83D9ABUL
+#define k_Blake2s_IV_7  0x5BE0CD19UL
+
+#define KIV(n)  (k_Blake2s_IV_## n)
+
+#ifdef Z7_BLAKE2S_USE_VECTORS
+MY_ALIGN(16)
 static const UInt32 k_Blake2s_IV[8] =
 {
-  0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
-  0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
+  KIV(0), KIV(1), KIV(2), KIV(3), KIV(4), KIV(5), KIV(6), KIV(7)
 };
+#endif
 
-static const Byte k_Blake2s_Sigma[BLAKE2S_NUM_ROUNDS][16] =
-{
-  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
-  { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
-  { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
-  { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
-  { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
-  { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
-  { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
-  { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
-  { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
-  { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
-};
+#define STATE_T(s)        ((s) + 8)
+#define STATE_F(s)        ((s) + 10)
+
+#ifdef Z7_BLAKE2S_USE_VECTORS
 
+#define LOAD_128(p)    _mm_load_si128 ((const __m128i *)(const void *)(p))
+#define LOADU_128(p)   _mm_loadu_si128((const __m128i *)(const void *)(p))
+#ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
+// here we use unaligned loads and stores.
+// use this branch if CBlake2sp can be unaligned for 16 bytes.
+#define STOREU_128(p, r)  _mm_storeu_si128((__m128i *)(void *)(p), r)
+#define LOAD_128_FROM_STRUCT(p)     LOADU_128(p)
+#define STORE_128_TO_STRUCT(p, r)   STOREU_128(p, r)
+#else
+// here we use aligned loads and stores.
+// use this branch if CBlake2sp is aligned for 16 bytes.
+#define STORE_128(p, r)   _mm_store_si128((__m128i *)(void *)(p), r)
+#define LOAD_128_FROM_STRUCT(p)     LOAD_128(p)
+#define STORE_128_TO_STRUCT(p, r)   STORE_128(p, r)
+#endif
 
-static void Blake2s_Init0(CBlake2s *p)
+#endif // Z7_BLAKE2S_USE_VECTORS
+
+
+#if 0
+static void PrintState(const UInt32 *s, unsigned num)
+{
+  unsigned i;
+  printf("\n");
+  for (i = 0; i < num; i++)
+    printf(" %08x", (unsigned)s[i]);
+}
+static void PrintStates2(const UInt32 *s, unsigned x, unsigned y)
 {
   unsigned i;
-  for (i = 0; i < 8; i++)
-    p->h[i] = k_Blake2s_IV[i];
-  p->t[0] = 0;
-  p->t[1] = 0;
-  p->f[0] = 0;
-  p->f[1] = 0;
-  p->bufPos = 0;
-  p->lastNode_f1 = 0;
+  for (i = 0; i < y; i++)
+    PrintState(s + i * x, x);
+  printf("\n");
 }
+#endif
+
+
+#define REP8_MACRO(m)  { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) }
+
+#define BLAKE2S_NUM_ROUNDS 10
+
+#if defined(Z7_BLAKE2S_USE_VECTORS)
+#define ROUNDS_LOOP(mac) \
+  { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++)  mac(r) }
+#endif
+/*
+#define ROUNDS_LOOP_2(mac) \
+  { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r += 2)  { mac(r)  mac(r + 1) } }
+*/
+#if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
+#define ROUNDS_LOOP_UNROLLED(m) \
+  { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) }
+#endif
+
+#define SIGMA_TABLE(M) \
+  M(  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 ), \
+  M( 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 ), \
+  M( 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 ), \
+  M(  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 ), \
+  M(  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 ), \
+  M(  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 ), \
+  M( 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 ), \
+  M( 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 ), \
+  M(  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 ), \
+  M( 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 )
+
+#define SIGMA_TABLE_MULT(m, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
+  { a0*m,a1*m,a2*m,a3*m,a4*m,a5*m,a6*m,a7*m,a8*m,a9*m,a10*m,a11*m,a12*m,a13*m,a14*m,a15*m }
+#define SIGMA_TABLE_MULT_4( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
+  SIGMA_TABLE_MULT(4, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
+
+// MY_ALIGN(32)
+MY_ALIGN(16)
+static const Byte k_Blake2s_Sigma_4[BLAKE2S_NUM_ROUNDS][16] =
+  { SIGMA_TABLE(SIGMA_TABLE_MULT_4) };
+
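Note: each SIGMA_TABLE row is one round's message-word permutation; the *_MULT
variants pre-scale the indexes (by 4, 16, or 32) so the vector code can add
them straight to a byte pointer instead of scaling at run time. For reference,
this is the scalar G mixing function that consumes one sigma pair per call
(per RFC 7693, and the same shape as the G macro the commit deletes; an
editorial sketch, not part of the commit):

    #include <stdint.h>

    #define ROTR32(x, n)  (((x) >> (n)) | ((x) << (32 - (n))))

    /* one quarter-round: i selects the sigma pair 2*i, 2*i+1 */
    static void G(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                  const uint32_t m[16], const uint8_t *sigma, unsigned i)
    {
      *a += *b + m[sigma[2 * i]];      *d = ROTR32(*d ^ *a, 16);
      *c += *d;                        *b = ROTR32(*b ^ *c, 12);
      *a += *b + m[sigma[2 * i + 1]];  *d = ROTR32(*d ^ *a,  8);
      *c += *d;                        *b = ROTR32(*b ^ *c,  7);
    }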
+#define GET_SIGMA_PTR(p, index) \
+    ((const void *)((const Byte *)(const void *)(p) + (index)))
 
+#define GET_STATE_TABLE_PTR_FROM_BYTE_POS(s, pos) \
+    ((UInt32 *)(void *)((Byte *)(void *)(s) + (pos)))
 
-static void Blake2s_Compress(CBlake2s *p)
+
+#ifdef Z7_BLAKE2S_USE_VECTORS
+
+
+#if 0
+// use loading constants from memory
+// is faster for some compilers.
+#define KK4(n)  KIV(n), KIV(n), KIV(n), KIV(n)
+MY_ALIGN(64)
+static const UInt32 k_Blake2s_IV_WAY4[] =
 {
-  UInt32 m[16];
-  UInt32 v[16];
-
+  KK4(0), KK4(1), KK4(2), KK4(3), KK4(4), KK4(5), KK4(6), KK4(7)
+};
+#define GET_128_IV_WAY4(i)  LOAD_128(k_Blake2s_IV_WAY4 + 4 * (i))
+#else
+// use constant generation:
+#define GET_128_IV_WAY4(i)  _mm_set1_epi32((Int32)KIV(i))
+#endif
+
+
+#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
+#define GET_CONST_128_FROM_ARRAY32(k) \
+    _mm_set_epi32((Int32)(k)[3], (Int32)(k)[2], (Int32)(k)[1], (Int32)(k)[0])
+#endif
+
+
+#if 0
+#define k_r8    _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
+#define k_r16   _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
+#define k_inc   _mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)
+#define k_iv0_128  GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0)
+#define k_iv4_128  GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4)
+#else
+#if defined(Z7_BLAKE2S_USE_SSSE3) && \
+   !defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
+MY_ALIGN(16) static const Byte k_r8_arr [16] = { 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 };
+MY_ALIGN(16) static const Byte k_r16_arr[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 };
+#define k_r8    LOAD_128(k_r8_arr)
+#define k_r16   LOAD_128(k_r16_arr)
+#endif
+MY_ALIGN(16) static const UInt32 k_inc_arr[4] = { Z7_BLAKE2S_BLOCK_SIZE, 0, 0, 0 };
+#define k_inc      LOAD_128(k_inc_arr)
+#define k_iv0_128  LOAD_128(k_Blake2s_IV + 0)
+#define k_iv4_128  LOAD_128(k_Blake2s_IV + 4)
+#endif
+
+
+#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
+
+#ifdef Z7_BLAKE2S_USE_AVX2
+#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
+  #define MY_mm256_set_m128i(hi, lo)  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
+#else
+  #define MY_mm256_set_m128i  _mm256_set_m128i
+#endif
+
+#define SET_FROM_128(a)  MY_mm256_set_m128i(a, a)
+
+#ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS
+MY_ALIGN(32) static const Byte k_r8_arr_256 [32] =
+{
+  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12,
+  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+};
+MY_ALIGN(32) static const Byte k_r16_arr_256[32] =
+{
+  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13,
+  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+};
+#define k_r8_256   LOAD_256(k_r8_arr_256)
+#define k_r16_256  LOAD_256(k_r16_arr_256)
+#endif
+
+// #define k_r8_256   SET_FROM_128(_mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1))
+// #define k_r16_256  SET_FROM_128(_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2))
+// #define k_inc_256  SET_FROM_128(_mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE))
+// #define k_iv0_256  SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0))
+#define k_iv4_256  SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4))
+#endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
+#endif
+
+
+/*
+IPC(TP) ports:
+1   p__5       : skl-      : SSE  : shufps  : _mm_shuffle_ps
+2   p_15       : icl+
+1   p__5       : nhm-bdw   : SSE  : xorps   : _mm_xor_ps
+3   p015       : skl+
+
+3   p015       :           : SSE2 : pxor    : _mm_xor_si128
+2   p_15       : snb-bdw   : SSE2 : padd    : _mm_add_epi32
+2   p0_5       : mrm-wsm   :
+3   p015       : skl+
+
+2   p_15       : ivb-,icl+ : SSE2 : punpcklqdq, punpckhqdq, punpckldq, punpckhdq
+2   p_15       :           : SSE2 : pshufd  : _mm_shuffle_epi32
+2   p_15       :           : SSE2 : pshuflw : _mm_shufflelo_epi16
+2   p_15       :           : SSE2 : psrldq
+2   p_15       :           : SSE3 : pshufb  : _mm_shuffle_epi8
+2   p_15       :           : SSE4 : pblendw : _mm_blend_epi16
+1   p__5       : hsw-skl   : *
+
+1   p0         :           : SSE2 : pslld (i8) : _mm_slli_si128
+2   p01        : skl+
+
+2   p_15       : ivb-      : SSE3 : palignr
+1   p__5       : hsw+
+
+2   p_15 + p23 : ivb-,icl+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, m32, i8)
+1   p__5 + p23 : hsw-skl
+1   p_15 + p5  : ivb-,ice+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, r32, i8)
+0.5 2*p5       : hsw-skl
+
+2   p23        :           : SSE2 : movd (m32)
+3   p23A       : adl
+1   p5         :           : SSE2 : movd (r32)
+*/
+
+#if 0 && defined(__XOP__)
+// we must debug and test the __XOP__ instructions
+#include <x86intrin.h>
+#include <ammintrin.h>
+  #define LOAD_ROTATE_CONSTS
+  #define MM_ROR_EPI32(r, c)  _mm_roti_epi32(r, -(c))
+  #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
+#elif 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
+  #define LOAD_ROTATE_CONSTS
+  #define MM_ROR_EPI32(r, c)  _mm_ror_epi32(r, c)
+  #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
+#else
+
+// MSVC_1937+ uses the "orps" instruction for _mm_or_si128().
+// But "orps" has low throughput: TP=1 for bdw-nhm.
+// So it can be better to use _mm_add_epi32()/"paddd" (TP=2 for bdw-nhm) instead of "orps".
+// But "orps" is fast for modern cpus (skl+).
+// So we default to the "or" version:
+#if 0 || 0 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1937
+  // minor optimization for some old cpus, if "orps" is slow.
+  #define MM128_EPI32_OR_or_ADD  _mm_add_epi32
+#else
+  #define MM128_EPI32_OR_or_ADD  _mm_or_si128
+#endif
+
+#define MM_ROR_EPI32_VIA_SHIFT(r, c) ( \
+    MM128_EPI32_OR_or_ADD( \
+      _mm_srli_epi32((r), (c)), \
+      _mm_slli_epi32((r), 32 - (c))))
+#if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
+  #define LOAD_ROTATE_CONSTS \
+      const __m128i r8  = k_r8; \
+      const __m128i r16 = k_r16;
+  #define MM_ROR_EPI32(r, c) ( \
+      ( 8 == (c)) ? _mm_shuffle_epi8(r, r8) \
+    : (16 == (c)) ? _mm_shuffle_epi8(r, r16) \
+    : MM_ROR_EPI32_VIA_SHIFT(r, c))
+#else
+  #define LOAD_ROTATE_CONSTS
+  #define MM_ROR_EPI32(r, c) ( \
+      (16 == (c)) ? _mm_shufflehi_epi16(_mm_shufflelo_epi16(r, 0xb1), 0xb1) \
+    : MM_ROR_EPI32_VIA_SHIFT(r, c))
+#endif
+#endif
+
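Note: a quick standalone check (an editorial sketch, not part of the commit)
that the two rotate strategies above agree: rotate-right by 8 via shift+or
matches the SSSE3 pshufb byte shuffle that the k_r8 pattern encodes.

    #include <emmintrin.h>
    #include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */
    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    int main(void)
    {
      const __m128i x = _mm_set1_epi32(0x12345678);
      /* generic SSE2 rotate right by 8: (x >> 8) | (x << 24) */
      const __m128i a = _mm_or_si128(_mm_srli_epi32(x, 8),
                                     _mm_slli_epi32(x, 24));
      /* SSSE3 rotate right by 8 as a byte shuffle (the k_r8 pattern above) */
      const __m128i r8 = _mm_setr_epi8(1, 2, 3, 0, 5, 6, 7, 4,
                                       9, 10, 11, 8, 13, 14, 15, 12);
      const __m128i b = _mm_shuffle_epi8(x, r8);
      uint32_t ua, ub;
      memcpy(&ua, &a, 4);
      memcpy(&ub, &b, 4);
      printf("%08x %08x\n", ua, ub);  /* both print 78123456 */
      return 0;
    }

(Compile with SSSE3 enabled, e.g. gcc -mssse3.)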
+/*
+we have 3 main ways to load 4 32-bit integers to __m128i:
+  1) SSE2:  _mm_set_epi32()
+  2) SSE2:  _mm_unpacklo_epi64() / _mm_unpacklo_epi32() / _mm_cvtsi32_si128()
+  3) SSE41: _mm_insert_epi32() and _mm_cvtsi32_si128()
+a good compiler for _mm_set_epi32() generates these instructions:
+{
+  movd xmm, [m32]; vpunpckldq; vpunpckldq; vpunpcklqdq;
+}
+a good new compiler generates one instruction:
+{
+  for _mm_insert_epi32()  : { pinsrd xmm, [m32], i }
+  for _mm_cvtsi32_si128() : { movd xmm, [m32] }
+}
+but vc2010 generates a slow pair of instructions:
+{
+  for _mm_insert_epi32()  : { mov r32, [m32]; pinsrd xmm, r32, i }
+  for _mm_cvtsi32_si128() : { mov r32, [m32]; movd xmm, r32 }
+}
+_mm_insert_epi32() (pinsrd) code reduces xmm register pressure
+in comparison with _mm_set_epi32() (movd + vpunpckld) code.
+Note that the variant with "movd xmm, r32" can be slower,
+but register pressure can be more important.
+So we can force "pinsrd" always.
+*/
+// #if !defined(Z7_MSC_VER_ORIGINAL) || Z7_MSC_VER_ORIGINAL > 1600 || defined(MY_CPU_X86)
+#ifdef Z7_BLAKE2S_USE_SSE41
+  /* _mm_set_epi32()    can be more effective for GCC and CLANG
+     _mm_insert_epi32() is more effective for MSVC */
+  #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
+    #define Z7_BLAKE2S_USE_INSERT_INSTRUCTION
+  #endif
+#endif // USE_SSE41
+// #endif
+
+#ifdef Z7_BLAKE2S_USE_INSERT_INSTRUCTION
+// for SSE4.1
+#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
+    _mm_insert_epi32( \
+    _mm_insert_epi32( \
+    _mm_insert_epi32( \
+    _mm_cvtsi32_si128( \
+        *(const Int32 *)p0), \
+        *(const Int32 *)p1, 1), \
+        *(const Int32 *)p2, 2), \
+        *(const Int32 *)p3, 3)
+#elif 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
+/* MSVC 1400 implements _mm_set_epi32() via a slow memory write/read.
+   Also _mm_unpacklo_epi32 is more effective for other MSVC compilers.
+   But _mm_set_epi32() is more effective for GCC and CLANG.
+   So we use _mm_unpacklo_epi32 for MSVC only: */
+#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
+    _mm_unpacklo_epi64( \
+        _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p0), \
+                            _mm_cvtsi32_si128(*(const Int32 *)p1)), \
+        _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p2), \
+                            _mm_cvtsi32_si128(*(const Int32 *)p3)))
+#else
+#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
+    _mm_set_epi32( \
+        *(const Int32 *)p3, \
+        *(const Int32 *)p2, \
+        *(const Int32 *)p1, \
+        *(const Int32 *)p0)
+#endif
+
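Note: a standalone sketch (not part of the commit) of what these loaders do.
The sigma tables above are pre-scaled to byte offsets, so building one message
row is four independent 32-bit loads from non-contiguous positions; as in the
macros above, this assumes unaligned 32-bit loads are acceptable (as on x86).

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
      uint8_t block[64];
      uint32_t out[4];
      int i;
      for (i = 0; i < 64; i++)
        block[i] = (uint8_t)i;
      {
        /* round-0 even sigma entries {0,2,4,6} pre-scaled by 4: */
        const unsigned o0 = 0, o1 = 8, o2 = 16, o3 = 24;
        const __m128i row = _mm_set_epi32(
            *(const int32_t *)(const void *)(block + o3),
            *(const int32_t *)(const void *)(block + o2),
            *(const int32_t *)(const void *)(block + o1),
            *(const int32_t *)(const void *)(block + o0));
        _mm_storeu_si128((__m128i *)(void *)out, row);
      }
      /* prints 03020100 0b0a0908 13121110 1b1a1918 */
      printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
      return 0;
    }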
+#define SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3) \
+    MM_LOAD_EPI32_FROM_4_POINTERS( \
+        GET_SIGMA_PTR(input, i0), \
+        GET_SIGMA_PTR(input, i1), \
+        GET_SIGMA_PTR(input, i2), \
+        GET_SIGMA_PTR(input, i3))
+
+#define SET_ROW_FROM_SIGMA(input, sigma_index) \
+    SET_ROW_FROM_SIGMA_BASE(input, \
+        sigma[(sigma_index)        ], \
+        sigma[(sigma_index) + 2 * 1], \
+        sigma[(sigma_index) + 2 * 2], \
+        sigma[(sigma_index) + 2 * 3]) \
+
+
+#define ADD_128(a, b)  _mm_add_epi32(a, b)
+#define XOR_128(a, b)  _mm_xor_si128(a, b)
+
+#define D_ADD_128(dest, src)        dest = ADD_128(dest, src)
+#define D_XOR_128(dest, src)        dest = XOR_128(dest, src)
+#define D_ROR_128(dest, shift)      dest = MM_ROR_EPI32(dest, shift)
+#define D_ADD_EPI64_128(dest, src)  dest = _mm_add_epi64(dest, src)
+
+
+#define AXR(a, b, d, shift) \
+    D_ADD_128(a, b); \
+    D_XOR_128(d, a); \
+    D_ROR_128(d, shift);
+
+#define AXR2(a, b, c, d, input, sigma_index, shift1, shift2) \
+    a = _mm_add_epi32 (a, SET_ROW_FROM_SIGMA(input, sigma_index)); \
+    AXR(a, b, d, shift1) \
+    AXR(c, d, b, shift2)
+
+#define ROTATE_WORDS_TO_RIGHT(a, n) \
+    a = _mm_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
+
+#define AXR4(a, b, c, d, input, sigma_index) \
+    AXR2(a, b, c, d, input, sigma_index,     16, 12) \
+    AXR2(a, b, c, d, input, sigma_index + 1,  8,  7) \
+
+#define RR2(a, b, c, d, input) \
+{ \
+    AXR4(a, b, c, d, input, 0) \
+      ROTATE_WORDS_TO_RIGHT(b, 1) \
+      ROTATE_WORDS_TO_RIGHT(c, 2) \
+      ROTATE_WORDS_TO_RIGHT(d, 3) \
+    AXR4(a, b, c, d, input, 8) \
+      ROTATE_WORDS_TO_RIGHT(b, 3) \
+      ROTATE_WORDS_TO_RIGHT(c, 2) \
+      ROTATE_WORDS_TO_RIGHT(d, 1) \
+}
+
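Note: ROTATE_WORDS_TO_RIGHT is the diagonalization trick: instead of
addressing the diagonal (0,5,10,15) directly, rows b/c/d are lane-rotated so
the same column-step code mixes the diagonals, and the inverse rotation
restores them. A tiny sketch (not part of the commit) of the lane rotation for
n = 1:

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void)
    {
      __m128i a = _mm_setr_epi32(10, 11, 12, 13);
      unsigned out[4];
      /* ROTATE_WORDS_TO_RIGHT(a, 1) expands to this shuffle: */
      a = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 3, 2, 1));
      _mm_storeu_si128((__m128i *)(void *)out, a);
      printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); /* 11 12 13 10 */
      return 0;
    }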
+
+/*
+Way1:
+per 64-byte block:
+  10 rounds * 4 iters * (7 + 2) = 360 cycles, if pslld TP=1
+               * (7 + 1) = 320 cycles, if pslld TP=2 (skl+)
+additional operations per 7_op_iter:
+  4   movzx  byte mem
+  1   movd   mem
+  3   pinsrd mem
+  1.5 pshufd
+*/
+
+static
+#if 0 || 0 && (defined(Z7_BLAKE2S_USE_V128_WAY2) || \
+               defined(Z7_BLAKE2S_USE_V256_WAY2))
+  Z7_NO_INLINE
+#else
+  Z7_FORCE_INLINE
+#endif
+#ifdef BLAKE2S_ATTRIB_128BIT
+       BLAKE2S_ATTRIB_128BIT
+#endif
+void
+Z7_FASTCALL
+Blake2s_Compress_V128_Way1(UInt32 * const s, const Byte * const input)
+{
+  __m128i a, b, c, d;
+  __m128i f0, f1;
+
+  LOAD_ROTATE_CONSTS
+  d = LOAD_128_FROM_STRUCT(STATE_T(s));
+  c = k_iv0_128;
+  a = f0 = LOAD_128_FROM_STRUCT(s);
+  b = f1 = LOAD_128_FROM_STRUCT(s + 4);
+  D_ADD_EPI64_128(d, k_inc);
+  STORE_128_TO_STRUCT (STATE_T(s), d);
+  D_XOR_128(d, k_iv4_128);
+
+#define RR(r) { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
+    RR2(a, b, c, d, input) }
+
+  ROUNDS_LOOP(RR)
+#undef RR
+
+  STORE_128_TO_STRUCT(s    , XOR_128(f0, XOR_128(a, c)));
+  STORE_128_TO_STRUCT(s + 4, XOR_128(f1, XOR_128(b, d)));
+}
+
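Note: the single _mm_add_epi64 on the "T" row above is the whole counter
update: t[0]/t[1] sit in lanes 0-1 and are treated as one 64-bit integer, so
the carry from t[0] into t[1] is free. A sketch (not part of the commit):

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void)
    {
      /* t0 close to wrap-around, t1 = 0 (lanes 0 and 1) */
      __m128i t = _mm_setr_epi32((int)0xFFFFFFC0, 0, 0, 0);
      unsigned out[4];
      t = _mm_add_epi64(t, _mm_setr_epi32(64, 0, 0, 0)); /* like k_inc */
      _mm_storeu_si128((__m128i *)(void *)out, t);
      printf("t0=%08x t1=%08x\n", out[0], out[1]); /* t0=00000000 t1=00000001 */
      return 0;
    }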
+
+static
+Z7_NO_INLINE
+#ifdef BLAKE2S_ATTRIB_128BIT
+       BLAKE2S_ATTRIB_128BIT
+#endif
+void
+Z7_FASTCALL
+Blake2sp_Compress2_V128_Way1(UInt32 *s_items, const Byte *data, const Byte *end)
+{
+  size_t pos = 0;
+  do
 {
-  unsigned i;
+    UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
+    Blake2s_Compress_V128_Way1(s, data);
+    data += Z7_BLAKE2S_BLOCK_SIZE;
+    pos  += Z7_BLAKE2S_BLOCK_SIZE;
+    pos  &= SUPER_BLOCK_MASK;
+  }
+  while (data != end);
+}
+
+
+#if defined(Z7_BLAKE2S_USE_V128_WAY2) || \
+    defined(Z7_BLAKE2S_USE_AVX2_WAY2)
+#if 1
+  #define Z7_BLAKE2S_CompressSingleBlock(s, data) \
+      Blake2sp_Compress2_V128_Way1(s, data, \
+          (const Byte *)(const void *)(data) + Z7_BLAKE2S_BLOCK_SIZE)
+#else
+  #define Z7_BLAKE2S_CompressSingleBlock  Blake2s_Compress_V128_Way1
+#endif
+#endif
+
+
+#if (defined(Z7_BLAKE2S_USE_AVX2_WAY_SLOW) || \
+     defined(Z7_BLAKE2S_USE_V128_WAY2)) && \
+    !defined(Z7_BLAKE2S_USE_GATHER)
+#define AXR2_LOAD_INDEXES(sigma_index) \
+    const unsigned i0 = sigma[(sigma_index)        ]; \
+    const unsigned i1 = sigma[(sigma_index) + 2 * 1]; \
+    const unsigned i2 = sigma[(sigma_index) + 2 * 2]; \
+    const unsigned i3 = sigma[(sigma_index) + 2 * 3]; \
+
+#define SET_ROW_FROM_SIGMA_W(input) \
+    SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3)
+#endif
+
+
+#ifdef Z7_BLAKE2S_USE_V128_WAY2
+
+#if 1 || !defined(Z7_BLAKE2S_USE_SSE41)
+/* we use SET_ROW_FROM_SIGMA_BASE, that uses
+     (SSE4) _mm_insert_epi32(), if Z7_BLAKE2S_USE_INSERT_INSTRUCTION is defined
+     (SSE2) _mm_set_epi32()
+   MSVC can be faster for this branch:
+*/
+#define AXR2_W(sigma_index, shift1, shift2) \
+{ \
+  AXR2_LOAD_INDEXES(sigma_index) \
+  a0 = _mm_add_epi32(a0, SET_ROW_FROM_SIGMA_W(data)); \
+  a1 = _mm_add_epi32(a1, SET_ROW_FROM_SIGMA_W(data + Z7_BLAKE2S_BLOCK_SIZE)); \
+  AXR(a0, b0, d0, shift1) \
+  AXR(a1, b1, d1, shift1) \
+  AXR(c0, d0, b0, shift2) \
+  AXR(c1, d1, b1, shift2) \
+}
+#else
+/* we use interleaved _mm_insert_epi32():
+   GCC can be faster for this branch:
+*/
+#define AXR2_W_PRE_INSERT(sigma_index, i) \
+{ const unsigned ii = sigma[(sigma_index) + i * 2]; \
+  t0 = _mm_insert_epi32(t0, *(const Int32 *)GET_SIGMA_PTR(data, ii), i); \
+  t1 = _mm_insert_epi32(t1, *(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii), i); \
+}
+#define AXR2_W(sigma_index, shift1, shift2) \
+{ __m128i t0, t1; \
+  { const unsigned ii = sigma[sigma_index]; \
+    t0 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, ii)); \
+    t1 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii)); \
+  } \
+  AXR2_W_PRE_INSERT(sigma_index, 1) \
+  AXR2_W_PRE_INSERT(sigma_index, 2) \
+  AXR2_W_PRE_INSERT(sigma_index, 3) \
+  a0 = _mm_add_epi32(a0, t0); \
+  a1 = _mm_add_epi32(a1, t1); \
+  AXR(a0, b0, d0, shift1) \
+  AXR(a1, b1, d1, shift1) \
+  AXR(c0, d0, b0, shift2) \
+  AXR(c1, d1, b1, shift2) \
+}
+#endif
+
+
+#define AXR4_W(sigma_index) \
+    AXR2_W(sigma_index,     16, 12) \
+    AXR2_W(sigma_index + 1,  8,  7) \
+
+#define WW(r) \
+{ const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
+  AXR4_W(0) \
+    ROTATE_WORDS_TO_RIGHT(b0, 1) \
+    ROTATE_WORDS_TO_RIGHT(b1, 1) \
+    ROTATE_WORDS_TO_RIGHT(c0, 2) \
+    ROTATE_WORDS_TO_RIGHT(c1, 2) \
+    ROTATE_WORDS_TO_RIGHT(d0, 3) \
+    ROTATE_WORDS_TO_RIGHT(d1, 3) \
+  AXR4_W(8) \
+    ROTATE_WORDS_TO_RIGHT(b0, 3) \
+    ROTATE_WORDS_TO_RIGHT(b1, 3) \
+    ROTATE_WORDS_TO_RIGHT(c0, 2) \
+    ROTATE_WORDS_TO_RIGHT(c1, 2) \
+    ROTATE_WORDS_TO_RIGHT(d0, 1) \
+    ROTATE_WORDS_TO_RIGHT(d1, 1) \
+}
+
+
+static
+Z7_NO_INLINE
+#ifdef BLAKE2S_ATTRIB_128BIT
+       BLAKE2S_ATTRIB_128BIT
+#endif
+void
+Z7_FASTCALL
+Blake2sp_Compress2_V128_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
+{
+  size_t pos = 0;
+  end -= Z7_BLAKE2S_BLOCK_SIZE;
+
+  if (data != end)
+  {
+    LOAD_ROTATE_CONSTS
+    do
+    {
+      UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
+      __m128i a0, b0, c0, d0;
+      __m128i a1, b1, c1, d1;
+      {
+        const __m128i inc = k_inc;
+        const __m128i temp = k_iv4_128;
+        d0 = LOAD_128_FROM_STRUCT (STATE_T(s));
+        d1 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
+        D_ADD_EPI64_128(d0, inc);
+        D_ADD_EPI64_128(d1, inc);
+        STORE_128_TO_STRUCT (STATE_T(s      ), d0);
+        STORE_128_TO_STRUCT (STATE_T(s + NSW), d1);
+        D_XOR_128(d0, temp);
+        D_XOR_128(d1, temp);
+      }
+      c1 = c0 = k_iv0_128;
+      a0 = LOAD_128_FROM_STRUCT(s);
+      b0 = LOAD_128_FROM_STRUCT(s + 4);
+      a1 = LOAD_128_FROM_STRUCT(s + NSW);
+      b1 = LOAD_128_FROM_STRUCT(s + NSW + 4);
+
+      ROUNDS_LOOP (WW)
+
+#undef WW
+
+      D_XOR_128(a0, c0);
+      D_XOR_128(b0, d0);
+      D_XOR_128(a1, c1);
+      D_XOR_128(b1, d1);
+
+      D_XOR_128(a0, LOAD_128_FROM_STRUCT(s));
+      D_XOR_128(b0, LOAD_128_FROM_STRUCT(s + 4));
+      D_XOR_128(a1, LOAD_128_FROM_STRUCT(s + NSW));
+      D_XOR_128(b1, LOAD_128_FROM_STRUCT(s + NSW + 4));
+
+      STORE_128_TO_STRUCT(s,           a0);
+      STORE_128_TO_STRUCT(s + 4,       b0);
+      STORE_128_TO_STRUCT(s + NSW,     a1);
+      STORE_128_TO_STRUCT(s + NSW + 4, b1);
+
+      data += Z7_BLAKE2S_BLOCK_SIZE * 2;
+      pos  += Z7_BLAKE2S_BLOCK_SIZE * 2;
+      pos  &= SUPER_BLOCK_MASK;
+    }
+    while (data < end);
+    if (data != end)
+      return;
+  }
+  {
+    UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
+    Z7_BLAKE2S_CompressSingleBlock(s, data);
+  }
+}
+#endif // Z7_BLAKE2S_USE_V128_WAY2
+
+
+#ifdef Z7_BLAKE2S_USE_V128_WAY2
+  #define Z7_BLAKE2S_Compress2_V128  Blake2sp_Compress2_V128_Way2
+#else
+  #define Z7_BLAKE2S_Compress2_V128  Blake2sp_Compress2_V128_Way1
+#endif
+
+
+
+#ifdef Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
+  #define ROT_128_8(x)   MM_ROR_EPI32(x, 8)
+  #define ROT_128_16(x)  MM_ROR_EPI32(x, 16)
+  #define ROT_128_7(x)   MM_ROR_EPI32(x, 7)
+  #define ROT_128_12(x)  MM_ROR_EPI32(x, 12)
+#else
+#if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
+  #define ROT_128_8(x)   _mm_shuffle_epi8(x, r8)   // k_r8
+  #define ROT_128_16(x)  _mm_shuffle_epi8(x, r16)  // k_r16
+#else
+  #define ROT_128_8(x)   MM_ROR_EPI32_VIA_SHIFT(x, 8)
+  #define ROT_128_16(x)  MM_ROR_EPI32_VIA_SHIFT(x, 16)
+#endif
+  #define ROT_128_7(x)   MM_ROR_EPI32_VIA_SHIFT(x, 7)
+  #define ROT_128_12(x)  MM_ROR_EPI32_VIA_SHIFT(x, 12)
+#endif
+
+
+#if 1
+// this branch can provide similar speed on x86* in most cases,
+// because [base + index*4] provides the same speed as [base + index].
+// but some compilers can generate different code with this branch, which can sometimes be faster.
+// this branch uses an additional table of 10*16=160 bytes.
+#define SIGMA_TABLE_MULT_16( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
+    SIGMA_TABLE_MULT(16, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
+MY_ALIGN(16)
+static const Byte k_Blake2s_Sigma_16[BLAKE2S_NUM_ROUNDS][16] =
+  { SIGMA_TABLE(SIGMA_TABLE_MULT_16) };
+#define GET_SIGMA_PTR_128(r)  const Byte * const sigma = k_Blake2s_Sigma_16[r];
+#define GET_SIGMA_VAL_128(n)  (sigma[n])
+#else
+#define GET_SIGMA_PTR_128(r)  const Byte * const sigma = k_Blake2s_Sigma_4[r];
+#define GET_SIGMA_VAL_128(n)  (4 * (size_t)sigma[n])
+#endif
+
+
+#ifdef Z7_BLAKE2S_USE_AVX2_FAST
+#if 1
+#define SIGMA_TABLE_MULT_32( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
+    SIGMA_TABLE_MULT(32, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
+MY_ALIGN(64)
+static const UInt16 k_Blake2s_Sigma_32[BLAKE2S_NUM_ROUNDS][16] =
+  { SIGMA_TABLE(SIGMA_TABLE_MULT_32) };
+#define GET_SIGMA_PTR_256(r)  const UInt16 * const sigma = k_Blake2s_Sigma_32[r];
+#define GET_SIGMA_VAL_256(n)  (sigma[n])
+#else
+#define GET_SIGMA_PTR_256(r)  const Byte * const sigma = k_Blake2s_Sigma_4[r];
+#define GET_SIGMA_VAL_256(n)  (8 * (size_t)sigma[n])
+#endif
+#endif // Z7_BLAKE2S_USE_AVX2_FAST
+
+
+#define D_ROT_128_7(dest)   dest = ROT_128_7(dest)
+#define D_ROT_128_8(dest)   dest = ROT_128_8(dest)
+#define D_ROT_128_12(dest)  dest = ROT_128_12(dest)
+#define D_ROT_128_16(dest)  dest = ROT_128_16(dest)
+
+#define OP_L(a, i)  D_ADD_128 (V(a, 0), \
+    LOAD_128((const Byte *)(w) + GET_SIGMA_VAL_128(2*(a)+(i))));
+
+#define OP_0(a)  OP_L(a, 0)
+#define OP_7(a)  OP_L(a, 1)
+
+#define OP_1(a)  D_ADD_128 (V(a, 0), V(a, 1));
+#define OP_2(a)  D_XOR_128 (V(a, 3), V(a, 0));
+#define OP_4(a)  D_ADD_128 (V(a, 2), V(a, 3));
+#define OP_5(a)  D_XOR_128 (V(a, 1), V(a, 2));
+
+#define OP_3(a)  D_ROT_128_16 (V(a, 3));
+#define OP_6(a)  D_ROT_128_12 (V(a, 1));
+#define OP_8(a)  D_ROT_128_8  (V(a, 3));
+#define OP_9(a)  D_ROT_128_7  (V(a, 1));
+
+// for 32-bit x86 : interleave mode works slower, because of register pressure.
+
+#if 0 || 1 && (defined(MY_CPU_X86) \
+    || defined(__GNUC__) && !defined(__clang__))
+// non-interleaved version:
+//   is fast for x86 32-bit.
+//   is fast for GCC x86-64.
+
+#define V4G(a) \
+  OP_0 (a) \
+  OP_1 (a) \
+  OP_2 (a) \
+  OP_3 (a) \
+  OP_4 (a) \
+  OP_5 (a) \
+  OP_6 (a) \
+  OP_7 (a) \
+  OP_1 (a) \
+  OP_2 (a) \
+  OP_8 (a) \
+  OP_4 (a) \
+  OP_5 (a) \
+  OP_9 (a) \
+
+#define V4R \
+{ \
+  V4G (0) \
+  V4G (1) \
+  V4G (2) \
+  V4G (3) \
+  V4G (4) \
+  V4G (5) \
+  V4G (6) \
+  V4G (7) \
+}
+
+#elif 0 || 1 && defined(MY_CPU_X86)
+
+#define OP_INTER_2(op, a,b) \
+  op (a) \
+  op (b) \
+
+#define V4G(a,b) \
+  OP_INTER_2 (OP_0, a,b) \
+  OP_INTER_2 (OP_1, a,b) \
+  OP_INTER_2 (OP_2, a,b) \
+  OP_INTER_2 (OP_3, a,b) \
+  OP_INTER_2 (OP_4, a,b) \
+  OP_INTER_2 (OP_5, a,b) \
+  OP_INTER_2 (OP_6, a,b) \
+  OP_INTER_2 (OP_7, a,b) \
+  OP_INTER_2 (OP_1, a,b) \
+  OP_INTER_2 (OP_2, a,b) \
+  OP_INTER_2 (OP_8, a,b) \
+  OP_INTER_2 (OP_4, a,b) \
+  OP_INTER_2 (OP_5, a,b) \
+  OP_INTER_2 (OP_9, a,b) \
+
+#define V4R \
+{ \
+  V4G (0, 1) \
+  V4G (2, 3) \
+  V4G (4, 5) \
+  V4G (6, 7) \
+}
+
+#else
+// the interleave-4 version is fast for x64 (MSVC/CLANG)
+
+#define OP_INTER_4(op, a,b,c,d) \
+  op (a) \
+  op (b) \
+  op (c) \
+  op (d) \
+
+#define V4G(a,b,c,d) \
+  OP_INTER_4 (OP_0, a,b,c,d) \
+  OP_INTER_4 (OP_1, a,b,c,d) \
+  OP_INTER_4 (OP_2, a,b,c,d) \
+  OP_INTER_4 (OP_3, a,b,c,d) \
+  OP_INTER_4 (OP_4, a,b,c,d) \
+  OP_INTER_4 (OP_5, a,b,c,d) \
+  OP_INTER_4 (OP_6, a,b,c,d) \
+  OP_INTER_4 (OP_7, a,b,c,d) \
+  OP_INTER_4 (OP_1, a,b,c,d) \
+  OP_INTER_4 (OP_2, a,b,c,d) \
+  OP_INTER_4 (OP_8, a,b,c,d) \
+  OP_INTER_4 (OP_4, a,b,c,d) \
+  OP_INTER_4 (OP_5, a,b,c,d) \
+  OP_INTER_4 (OP_9, a,b,c,d) \
+
+#define V4R \
+{ \
+  V4G (0, 1, 2, 3) \
+  V4G (4, 5, 6, 7) \
+}
+
+#endif
+
+#define V4_ROUND(r)  { GET_SIGMA_PTR_128(r);  V4R }
+
+
+#define V4_LOAD_MSG_1(w, m, i) \
+{ \
+  __m128i m0, m1, m2, m3; \
+  __m128i t0, t1, t2, t3; \
+  m0 = LOADU_128((m) + ((i) + 0 * 4) * 16); \
+  m1 = LOADU_128((m) + ((i) + 1 * 4) * 16); \
+  m2 = LOADU_128((m) + ((i) + 2 * 4) * 16); \
+  m3 = LOADU_128((m) + ((i) + 3 * 4) * 16); \
+  t0 = _mm_unpacklo_epi32(m0, m1); \
+  t1 = _mm_unpackhi_epi32(m0, m1); \
+  t2 = _mm_unpacklo_epi32(m2, m3); \
+  t3 = _mm_unpackhi_epi32(m2, m3); \
+  w[(i) * 4 + 0] = _mm_unpacklo_epi64(t0, t2); \
+  w[(i) * 4 + 1] = _mm_unpackhi_epi64(t0, t2); \
+  w[(i) * 4 + 2] = _mm_unpacklo_epi64(t1, t3); \
+  w[(i) * 4 + 3] = _mm_unpackhi_epi64(t1, t3); \
+}
+
+#define V4_LOAD_MSG(w, m) \
+{ \
+  V4_LOAD_MSG_1 (w, m, 0) \
+  V4_LOAD_MSG_1 (w, m, 1) \
+  V4_LOAD_MSG_1 (w, m, 2) \
+  V4_LOAD_MSG_1 (w, m, 3) \
+}
+
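Note: V4_LOAD_MSG_1 above is a 4x4 transpose of 32-bit words built from
unpacklo/unpackhi, so word k of four independent input blocks ends up in the
four lanes of w[k]. A standalone sketch of one output row (not part of the
commit):

    #include <emmintrin.h>
    #include <stdio.h>

    int main(void)
    {
      /* words 0..3 of four "blocks" 0..3 */
      const __m128i m0 = _mm_setr_epi32(0x00, 0x01, 0x02, 0x03);
      const __m128i m1 = _mm_setr_epi32(0x10, 0x11, 0x12, 0x13);
      const __m128i m2 = _mm_setr_epi32(0x20, 0x21, 0x22, 0x23);
      const __m128i m3 = _mm_setr_epi32(0x30, 0x31, 0x32, 0x33);
      const __m128i t0 = _mm_unpacklo_epi32(m0, m1); /* 00 10 01 11 */
      const __m128i t2 = _mm_unpacklo_epi32(m2, m3); /* 20 30 21 31 */
      const __m128i w0 = _mm_unpacklo_epi64(t0, t2); /* 00 10 20 30 */
      unsigned out[4];
      _mm_storeu_si128((__m128i *)(void *)out, w0);
      /* word 0 of blocks 0..3 gathered into one row: prints 00 10 20 30 */
      printf("%02x %02x %02x %02x\n", out[0], out[1], out[2], out[3]);
      return 0;
    }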
+#define V4_LOAD_UNPACK_PAIR_128(src32, i, d0, d1) \
+{ \
+  const __m128i v0 = LOAD_128_FROM_STRUCT((src32) + (i    ) * 4); \
+  const __m128i v1 = LOAD_128_FROM_STRUCT((src32) + (i + 1) * 4); \
+  d0 = _mm_unpacklo_epi32(v0, v1); \
+  d1 = _mm_unpackhi_epi32(v0, v1); \
+}
+
+#define V4_UNPACK_PAIR_128(dest32, i, s0, s1) \
+{ \
+  STORE_128_TO_STRUCT((dest32) + i * 4     , _mm_unpacklo_epi64(s0, s1)); \
+  STORE_128_TO_STRUCT((dest32) + i * 4 + 16, _mm_unpackhi_epi64(s0, s1)); \
+}
+
+#define V4_UNPACK_STATE(dest32, src32) \
+{ \
+  __m128i t0, t1, t2, t3, t4, t5, t6, t7; \
+  V4_LOAD_UNPACK_PAIR_128(src32, 0, t0, t1) \
+  V4_LOAD_UNPACK_PAIR_128(src32, 2, t2, t3) \
+  V4_LOAD_UNPACK_PAIR_128(src32, 4, t4, t5) \
+  V4_LOAD_UNPACK_PAIR_128(src32, 6, t6, t7) \
+  V4_UNPACK_PAIR_128(dest32, 0, t0, t2) \
+  V4_UNPACK_PAIR_128(dest32, 8, t1, t3) \
+  V4_UNPACK_PAIR_128(dest32, 1, t4, t6) \
+  V4_UNPACK_PAIR_128(dest32, 9, t5, t7) \
+}
+
+
+static
+Z7_NO_INLINE
+#ifdef BLAKE2S_ATTRIB_128BIT
+       BLAKE2S_ATTRIB_128BIT
+#endif
+void
+Z7_FASTCALL
+Blake2sp_Compress2_V128_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
+{
+  // PrintStates2(s_items, 8, 16);
+  size_t pos = 0;
+  pos /= 2;
+  do
+  {
+#if defined(Z7_BLAKE2S_USE_SSSE3) && \
+   !defined(Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED)
+    const __m128i r8  = k_r8;
+    const __m128i r16 = k_r16;
+#endif
+    __m128i w[16];
+    __m128i v[16];
+    UInt32 *s;
+    V4_LOAD_MSG(w, data)
+    s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
+    {
+      __m128i ctr = LOAD_128_FROM_STRUCT(s + 64);
+      D_ADD_EPI64_128 (ctr, k_inc);
+      STORE_128_TO_STRUCT(s + 64, ctr);
+      v[12] = XOR_128 (GET_128_IV_WAY4(4), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
+      v[13] = XOR_128 (GET_128_IV_WAY4(5), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
+    }
+    v[ 8] = GET_128_IV_WAY4(0);
+    v[ 9] = GET_128_IV_WAY4(1);
+    v[10] = GET_128_IV_WAY4(2);
+    v[11] = GET_128_IV_WAY4(3);
+    v[14] = GET_128_IV_WAY4(6);
+    v[15] = GET_128_IV_WAY4(7);
 
-  for (i = 0; i < 16; i++)
-    m[i] = GetUi32(p->buf + i * sizeof(m[i]));
+#define LOAD_STATE_128_FROM_STRUCT(i) \
+    v[i] = LOAD_128_FROM_STRUCT(s + (i) * 4);
+
+#define UPDATE_STATE_128_IN_STRUCT(i) \
+    STORE_128_TO_STRUCT(s + (i) * 4, XOR_128( \
+        XOR_128(v[i], v[(i) + 8]), \
+        LOAD_128_FROM_STRUCT(s + (i) * 4)));
 
-  for (i = 0; i < 8; i++)
-    v[i] = p->h[i];
+    REP8_MACRO (LOAD_STATE_128_FROM_STRUCT)
+    ROUNDS_LOOP (V4_ROUND)
+    REP8_MACRO (UPDATE_STATE_128_IN_STRUCT)
+
+    data += Z7_BLAKE2S_BLOCK_SIZE * 4;
+    pos  += Z7_BLAKE2S_BLOCK_SIZE * 4 / 2;
+    pos  &= SUPER_BLOCK_SIZE / 2 - 1;
   }
+  while (data != end);
+}
 
-  v[ 8] = k_Blake2s_IV[0];
-  v[ 9] = k_Blake2s_IV[1];
-  v[10] = k_Blake2s_IV[2];
-  v[11] = k_Blake2s_IV[3];
-
-  v[12] = p->t[0] ^ k_Blake2s_IV[4];
-  v[13] = p->t[1] ^ k_Blake2s_IV[5];
-  v[14] = p->f[0] ^ k_Blake2s_IV[6];
-  v[15] = p->f[1] ^ k_Blake2s_IV[7];
 
-#define G(r,i,a,b,c,d) \
-  a += b + m[sigma[2*i+0]];  d ^= a;  d = rotr32(d, 16);  c += d;  b ^= c;  b = rotr32(b, 12); \
-  a += b + m[sigma[2*i+1]];  d ^= a;  d = rotr32(d,  8);  c += d;  b ^= c;  b = rotr32(b,  7); \
+static
+Z7_NO_INLINE
+#ifdef BLAKE2S_ATTRIB_128BIT
+       BLAKE2S_ATTRIB_128BIT
+#endif
+void
+Z7_FASTCALL
+Blake2sp_Final_V128_Fast(UInt32 *states)
+{
+  const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
+  // printf("\nBlake2sp_Compress2_V128_Fast_Final4\n");
+  // PrintStates2(states, 8, 16);
+  {
+    ptrdiff_t pos = 8 * 4;
+    do
+    {
+      UInt32 *src32  = states + (size_t)(pos * 1);
+      UInt32 *dest32 = states + (size_t)(pos * 2);
+      V4_UNPACK_STATE(dest32, src32)
+      pos -= 8 * 4;
+    }
+    while (pos >= 0);
+  }
+  {
+    unsigned k;
+    for (k = 0; k < 8; k++)
+    {
+      UInt32 *s = states + (size_t)k * 16;
+      STORE_128_TO_STRUCT (STATE_T(s), ctr);
+    }
+  }
+  // PrintStates2(states, 8, 16);
+}
+
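Note (editorial): in the _Fast layout the lane states are stored
column-interleaved (word k of four lanes shares one __m128i), which is why
Blake2sp_Final_V128_Fast must unpack them back to the per-lane layout before
the generic final stage can read them. The counter is read from states + 64
first because the expansion overwrites that area, and the loop runs from
pos = 32 down to 0 because each 8-word source group expands in place into a
16-word destination group, so working backwards avoids clobbering input that
is still needed.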
+
+
+#ifdef Z7_BLAKE2S_USE_AVX2
+
+#define ADD_256(a, b)  _mm256_add_epi32(a, b)
+#define XOR_256(a, b)  _mm256_xor_si256(a, b)
+
+#if 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
+  #define MM256_ROR_EPI32  _mm256_ror_epi32
+  #define Z7_MM256_ROR_EPI32_IS_SUPPORTED
+  #define LOAD_ROTATE_CONSTS_256
+#else
+#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
+#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
+  #define LOAD_ROTATE_CONSTS_256 \
+      const __m256i r8  = k_r8_256; \
+      const __m256i r16 = k_r16_256;
+#endif // AVX2_WAY2
+
+#define MM256_ROR_EPI32(r, c) ( \
+    ( 8 == (c)) ? _mm256_shuffle_epi8(r, r8) \
+  : (16 == (c)) ? _mm256_shuffle_epi8(r, r16) \
+  : _mm256_or_si256( \
+      _mm256_srli_epi32((r), (c)), \
+      _mm256_slli_epi32((r), 32 - (c))))
+#endif // WAY_SLOW
+#endif
+
+
+#define D_ADD_256(dest, src)  dest = ADD_256(dest, src)
+#define D_XOR_256(dest, src)  dest = XOR_256(dest, src)
+
+#define LOADU_256(p)  _mm256_loadu_si256((const __m256i *)(const void *)(p))
+
+#ifdef Z7_BLAKE2S_USE_AVX2_FAST
+
+#ifdef Z7_MM256_ROR_EPI32_IS_SUPPORTED
+#define ROT_256_16(x)  MM256_ROR_EPI32((x), 16)
+#define ROT_256_12(x)  MM256_ROR_EPI32((x), 12)
+#define ROT_256_8(x)   MM256_ROR_EPI32((x), 8)
+#define ROT_256_7(x)   MM256_ROR_EPI32((x), 7)
+#else
+#define ROTATE8   _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, \
+                                  12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
+#define ROTATE16  _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, \
+                                  13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
+#define ROT_256_16(x)  _mm256_shuffle_epi8((x), ROTATE16)
+#define ROT_256_12(x)  _mm256_or_si256(_mm256_srli_epi32((x), 12), _mm256_slli_epi32((x), 20))
+#define ROT_256_8(x)   _mm256_shuffle_epi8((x), ROTATE8)
+#define ROT_256_7(x)   _mm256_or_si256(_mm256_srli_epi32((x),  7), _mm256_slli_epi32((x), 25))
+#endif
+
+#define D_ROT_256_7(dest)   dest = ROT_256_7(dest)
+#define D_ROT_256_8(dest)   dest = ROT_256_8(dest)
+#define D_ROT_256_12(dest)  dest = ROT_256_12(dest)
+#define D_ROT_256_16(dest)  dest = ROT_256_16(dest)
+
+#define LOAD_256(p)  _mm256_load_si256((const __m256i *)(const void *)(p))
+#ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
+#define STOREU_256(p, r)  _mm256_storeu_si256((__m256i *)(void *)(p), r)
+#define LOAD_256_FROM_STRUCT(p)    LOADU_256(p)
+#define STORE_256_TO_STRUCT(p, r)  STOREU_256(p, r)
+#else
+// if the struct is aligned for 32 bytes
+#define STORE_256(p, r)  _mm256_store_si256((__m256i *)(void *)(p), r)
+#define LOAD_256_FROM_STRUCT(p)    LOAD_256(p)
+#define STORE_256_TO_STRUCT(p, r)  STORE_256(p, r)
+#endif
+
+#endif // Z7_BLAKE2S_USE_AVX2_FAST
+
+
+
+#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
+
+#if 0
+#define DIAG_PERM2(s) \
+{ \
+  const __m256i a = LOAD_256_FROM_STRUCT((s)      ); \
+  const __m256i b = LOAD_256_FROM_STRUCT((s) + NSW); \
+  STORE_256_TO_STRUCT((s      ), _mm256_permute2x128_si256(a, b, 0x20)); \
+  STORE_256_TO_STRUCT((s + NSW), _mm256_permute2x128_si256(a, b, 0x31)); \
+}
+#else
+#define DIAG_PERM2(s) \
+{ \
+  const __m128i a = LOAD_128_FROM_STRUCT((s) + 4); \
+  const __m128i b = LOAD_128_FROM_STRUCT((s) + NSW); \
+  STORE_128_TO_STRUCT((s) + NSW, a); \
+  STORE_128_TO_STRUCT((s) + 4  , b); \
+}
+#endif
+#define DIAG_PERM8(s_items) \
+{ \
+  DIAG_PERM2(s_items) \
+  DIAG_PERM2(s_items + NSW * 2) \
+  DIAG_PERM2(s_items + NSW * 4) \
+  DIAG_PERM2(s_items + NSW * 6) \
+}
+
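Note (editorial): DIAG_PERM2 swaps the second row (s + 4) of one lane state
with the first row of its neighbor (s + NSW), so matching rows of two lanes
become adjacent in memory: after the swap, one 256-bit load at s + NSW*0
fetches the "a" rows of both lanes and a load at s + NSW*1 fetches both "b"
rows, which is the layout the WAY2 kernel below consumes. Running DIAG_PERM8
again after the loop undoes the permutation, since the swap is its own
inverse.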
+
+#define AXR256(a, b, d, shift) \
+    D_ADD_256(a, b); \
+    D_XOR_256(d, a); \
+    d = MM256_ROR_EPI32(d, shift); \
+
+
+
+#ifdef Z7_BLAKE2S_USE_GATHER
+
+#define TABLE_GATHER_256_4(a0,a1,a2,a3) \
+    a0,a1,a2,a3, a0+16,a1+16,a2+16,a3+16
+#define TABLE_GATHER_256( \
+    a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
+  { TABLE_GATHER_256_4(a0,a2,a4,a6), \
+    TABLE_GATHER_256_4(a1,a3,a5,a7), \
+    TABLE_GATHER_256_4(a8,a10,a12,a14), \
+    TABLE_GATHER_256_4(a9,a11,a13,a15) }
+MY_ALIGN(64)
+static const UInt32 k_Blake2s_Sigma_gather256[BLAKE2S_NUM_ROUNDS][16 * 2] =
+  { SIGMA_TABLE(TABLE_GATHER_256) };
+#define GET_SIGMA(r) \
+    const UInt32 * const sigma = k_Blake2s_Sigma_gather256[r];
+#define AXR2_LOAD_INDEXES_AVX(sigma_index) \
+    const __m256i i01234567 = LOAD_256(sigma + (sigma_index));
+#define SET_ROW_FROM_SIGMA_AVX(in) \
+    _mm256_i32gather_epi32((const void *)(in), i01234567, 4)
+#define SIGMA_INTERLEAVE     8
+#define SIGMA_HALF_ROW_SIZE  16
+
+#else // !Z7_BLAKE2S_USE_GATHER
+
+#define GET_SIGMA(r) \
+    const Byte * const sigma = k_Blake2s_Sigma_4[r];
+#define AXR2_LOAD_INDEXES_AVX(sigma_index) \
+    AXR2_LOAD_INDEXES(sigma_index)
+#define SET_ROW_FROM_SIGMA_AVX(in) \
+    MY_mm256_set_m128i( \
+        SET_ROW_FROM_SIGMA_W((in) + Z7_BLAKE2S_BLOCK_SIZE), \
+        SET_ROW_FROM_SIGMA_W(in))
+#define SIGMA_INTERLEAVE     1
+#define SIGMA_HALF_ROW_SIZE  8
+#endif // !Z7_BLAKE2S_USE_GATHER
+
81 | 1301 | ||
82 | #define R(r) \ | 1302 | #define ROTATE_WORDS_TO_RIGHT_256(a, n) \ |
83 | G(r,0,v[ 0],v[ 4],v[ 8],v[12]) \ | 1303 | a = _mm256_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3)); |
84 | G(r,1,v[ 1],v[ 5],v[ 9],v[13]) \ | ||
85 | G(r,2,v[ 2],v[ 6],v[10],v[14]) \ | ||
86 | G(r,3,v[ 3],v[ 7],v[11],v[15]) \ | ||
87 | G(r,4,v[ 0],v[ 5],v[10],v[15]) \ | ||
88 | G(r,5,v[ 1],v[ 6],v[11],v[12]) \ | ||
89 | G(r,6,v[ 2],v[ 7],v[ 8],v[13]) \ | ||
90 | G(r,7,v[ 3],v[ 4],v[ 9],v[14]) \ | ||
91 | 1304 | ||
1305 | |||
1306 | |||
1307 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY2 | ||
1308 | |||
1309 | #define AXR2_A(sigma_index, shift1, shift2) \ | ||
1310 | AXR2_LOAD_INDEXES_AVX(sigma_index) \ | ||
1311 | D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \ | ||
1312 | AXR256(a0, b0, d0, shift1) \ | ||
1313 | AXR256(c0, d0, b0, shift2) \ | ||
1314 | |||
1315 | #define AXR4_A(sigma_index) \ | ||
1316 | { AXR2_A(sigma_index, 16, 12) } \ | ||
1317 | { AXR2_A(sigma_index + SIGMA_INTERLEAVE, 8, 7) } | ||
1318 | |||
1319 | #define EE1(r) \ | ||
1320 | { GET_SIGMA(r) \ | ||
1321 | AXR4_A(0) \ | ||
1322 | ROTATE_WORDS_TO_RIGHT_256(b0, 1) \ | ||
1323 | ROTATE_WORDS_TO_RIGHT_256(c0, 2) \ | ||
1324 | ROTATE_WORDS_TO_RIGHT_256(d0, 3) \ | ||
1325 | AXR4_A(SIGMA_HALF_ROW_SIZE) \ | ||
1326 | ROTATE_WORDS_TO_RIGHT_256(b0, 3) \ | ||
1327 | ROTATE_WORDS_TO_RIGHT_256(c0, 2) \ | ||
1328 | ROTATE_WORDS_TO_RIGHT_256(d0, 1) \ | ||
1329 | } | ||
1330 | |||
1331 | static | ||
1332 | Z7_NO_INLINE | ||
1333 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
1334 | BLAKE2S_ATTRIB_AVX2 | ||
1335 | #endif | ||
1336 | void | ||
1337 | Z7_FASTCALL | ||
1338 | Blake2sp_Compress2_AVX2_Way2(UInt32 *s_items, const Byte *data, const Byte *end) | ||
1339 | { | ||
1340 | size_t pos = 0; | ||
1341 | end -= Z7_BLAKE2S_BLOCK_SIZE; | ||
1342 | |||
1343 | if (data != end) | ||
92 | { | 1344 | { |
93 | unsigned r; | 1345 | LOAD_ROTATE_CONSTS_256 |
94 | for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) | 1346 | DIAG_PERM8(s_items) |
1347 | do | ||
95 | { | 1348 | { |
96 | const Byte *sigma = k_Blake2s_Sigma[r]; | 1349 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); |
97 | R(r) | 1350 | __m256i a0, b0, c0, d0; |
1351 | { | ||
1352 | const __m128i inc = k_inc; | ||
1353 | __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s)); | ||
1354 | __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW)); | ||
1355 | D_ADD_EPI64_128(d0_128, inc); | ||
1356 | D_ADD_EPI64_128(d1_128, inc); | ||
1357 | STORE_128_TO_STRUCT (STATE_T(s ), d0_128); | ||
1358 | STORE_128_TO_STRUCT (STATE_T(s + NSW), d1_128); | ||
1359 | d0 = MY_mm256_set_m128i(d1_128, d0_128); | ||
1360 | D_XOR_256(d0, k_iv4_256); | ||
1361 | } | ||
1362 | c0 = SET_FROM_128(k_iv0_128); | ||
1363 | a0 = LOAD_256_FROM_STRUCT(s + NSW * 0); | ||
1364 | b0 = LOAD_256_FROM_STRUCT(s + NSW * 1); | ||
1365 | |||
1366 | ROUNDS_LOOP (EE1) | ||
1367 | |||
1368 | D_XOR_256(a0, c0); | ||
1369 | D_XOR_256(b0, d0); | ||
1370 | |||
1371 | D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0)); | ||
1372 | D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1)); | ||
1373 | |||
1374 | STORE_256_TO_STRUCT(s + NSW * 0, a0); | ||
1375 | STORE_256_TO_STRUCT(s + NSW * 1, b0); | ||
1376 | |||
1377 | data += Z7_BLAKE2S_BLOCK_SIZE * 2; | ||
1378 | pos += Z7_BLAKE2S_BLOCK_SIZE * 2; | ||
1379 | pos &= SUPER_BLOCK_MASK; | ||
98 | } | 1380 | } |
99 | /* R(0); R(1); R(2); R(3); R(4); R(5); R(6); R(7); R(8); R(9); */ | 1381 | while (data < end); |
1382 | DIAG_PERM8(s_items) | ||
1383 | if (data != end) | ||
1384 | return; | ||
1385 | } | ||
1386 | { | ||
1387 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
1388 | Z7_BLAKE2S_CompressSingleBlock(s, data); | ||
100 | } | 1389 | } |
1390 | } | ||
1391 | |||
1392 | #endif // Z7_BLAKE2S_USE_AVX2_WAY2 | ||
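As an aside, a minimal standalone sketch of the add-xor-rotate step that the AXR256 pattern applies to two BLAKE2s states packed in one 256-bit register (rotr32_256 and axr_step are illustrative names, not part of this file):

#include <immintrin.h>

// rotate each 32-bit word right by n (n should be a compile-time constant)
static inline __m256i rotr32_256(__m256i x, int n)
{
  return _mm256_or_si256(_mm256_srli_epi32(x, n),
                         _mm256_slli_epi32(x, 32 - n));
}

// one half of the BLAKE2s G step, done for both packed states at once:
// a += b; d ^= a; d = rotr(d, shift)
static inline void axr_step(__m256i *a, const __m256i *b, __m256i *d, int shift)
{
  *a = _mm256_add_epi32(*a, *b);
  *d = _mm256_xor_si256(*d, *a);
  *d = rotr32_256(*d, shift);
}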
101 | 1393 | ||
102 | #undef G | ||
103 | #undef R | ||
104 | 1394 | ||
1395 | |||
1396 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY4 | ||
1397 | |||
1398 | #define AXR2_X(sigma_index, shift1, shift2) \ | ||
1399 | AXR2_LOAD_INDEXES_AVX(sigma_index) \ | ||
1400 | D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \ | ||
1401 | D_ADD_256( a1, SET_ROW_FROM_SIGMA_AVX((data) + Z7_BLAKE2S_BLOCK_SIZE * 2)); \ | ||
1402 | AXR256(a0, b0, d0, shift1) \ | ||
1403 | AXR256(a1, b1, d1, shift1) \ | ||
1404 | AXR256(c0, d0, b0, shift2) \ | ||
1405 | AXR256(c1, d1, b1, shift2) \ | ||
1406 | |||
1407 | #define AXR4_X(sigma_index) \ | ||
1408 | { AXR2_X(sigma_index, 16, 12) } \ | ||
1409 | { AXR2_X(sigma_index + SIGMA_INTERLEAVE, 8, 7) } | ||
1410 | |||
1411 | #define EE2(r) \ | ||
1412 | { GET_SIGMA(r) \ | ||
1413 | AXR4_X(0) \ | ||
1414 | ROTATE_WORDS_TO_RIGHT_256(b0, 1) \ | ||
1415 | ROTATE_WORDS_TO_RIGHT_256(b1, 1) \ | ||
1416 | ROTATE_WORDS_TO_RIGHT_256(c0, 2) \ | ||
1417 | ROTATE_WORDS_TO_RIGHT_256(c1, 2) \ | ||
1418 | ROTATE_WORDS_TO_RIGHT_256(d0, 3) \ | ||
1419 | ROTATE_WORDS_TO_RIGHT_256(d1, 3) \ | ||
1420 | AXR4_X(SIGMA_HALF_ROW_SIZE) \ | ||
1421 | ROTATE_WORDS_TO_RIGHT_256(b0, 3) \ | ||
1422 | ROTATE_WORDS_TO_RIGHT_256(b1, 3) \ | ||
1423 | ROTATE_WORDS_TO_RIGHT_256(c0, 2) \ | ||
1424 | ROTATE_WORDS_TO_RIGHT_256(c1, 2) \ | ||
1425 | ROTATE_WORDS_TO_RIGHT_256(d0, 1) \ | ||
1426 | ROTATE_WORDS_TO_RIGHT_256(d1, 1) \ | ||
1427 | } | ||
1428 | |||
1429 | static | ||
1430 | Z7_NO_INLINE | ||
1431 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
1432 | BLAKE2S_ATTRIB_AVX2 | ||
1433 | #endif | ||
1434 | void | ||
1435 | Z7_FASTCALL | ||
1436 | Blake2sp_Compress2_AVX2_Way4(UInt32 *s_items, const Byte *data, const Byte *end) | ||
1437 | { | ||
1438 | size_t pos = 0; | ||
1439 | |||
1440 | if ((size_t)(end - data) >= Z7_BLAKE2S_BLOCK_SIZE * 4) | ||
105 | { | 1441 | { |
106 | unsigned i; | 1442 | #ifndef Z7_MM256_ROR_EPI32_IS_SUPPORTED |
107 | for (i = 0; i < 8; i++) | 1443 | const __m256i r8 = k_r8_256; |
108 | p->h[i] ^= v[i] ^ v[i + 8]; | 1444 | const __m256i r16 = k_r16_256; |
1445 | #endif | ||
1446 | end -= Z7_BLAKE2S_BLOCK_SIZE * 3; | ||
1447 | DIAG_PERM8(s_items) | ||
1448 | do | ||
1449 | { | ||
1450 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
1451 | __m256i a0, b0, c0, d0; | ||
1452 | __m256i a1, b1, c1, d1; | ||
1453 | { | ||
1454 | const __m128i inc = k_inc; | ||
1455 | __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s)); | ||
1456 | __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW)); | ||
1457 | __m128i d2_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 2)); | ||
1458 | __m128i d3_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 3)); | ||
1459 | D_ADD_EPI64_128(d0_128, inc); | ||
1460 | D_ADD_EPI64_128(d1_128, inc); | ||
1461 | D_ADD_EPI64_128(d2_128, inc); | ||
1462 | D_ADD_EPI64_128(d3_128, inc); | ||
1463 | STORE_128_TO_STRUCT (STATE_T(s ), d0_128); | ||
1464 | STORE_128_TO_STRUCT (STATE_T(s + NSW * 1), d1_128); | ||
1465 | STORE_128_TO_STRUCT (STATE_T(s + NSW * 2), d2_128); | ||
1466 | STORE_128_TO_STRUCT (STATE_T(s + NSW * 3), d3_128); | ||
1467 | d0 = MY_mm256_set_m128i(d1_128, d0_128); | ||
1468 | d1 = MY_mm256_set_m128i(d3_128, d2_128); | ||
1469 | D_XOR_256(d0, k_iv4_256); | ||
1470 | D_XOR_256(d1, k_iv4_256); | ||
1471 | } | ||
1472 | c1 = c0 = SET_FROM_128(k_iv0_128); | ||
1473 | a0 = LOAD_256_FROM_STRUCT(s + NSW * 0); | ||
1474 | b0 = LOAD_256_FROM_STRUCT(s + NSW * 1); | ||
1475 | a1 = LOAD_256_FROM_STRUCT(s + NSW * 2); | ||
1476 | b1 = LOAD_256_FROM_STRUCT(s + NSW * 3); | ||
1477 | |||
1478 | ROUNDS_LOOP (EE2) | ||
1479 | |||
1480 | D_XOR_256(a0, c0); | ||
1481 | D_XOR_256(b0, d0); | ||
1482 | D_XOR_256(a1, c1); | ||
1483 | D_XOR_256(b1, d1); | ||
1484 | |||
1485 | D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0)); | ||
1486 | D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1)); | ||
1487 | D_XOR_256(a1, LOAD_256_FROM_STRUCT(s + NSW * 2)); | ||
1488 | D_XOR_256(b1, LOAD_256_FROM_STRUCT(s + NSW * 3)); | ||
1489 | |||
1490 | STORE_256_TO_STRUCT(s + NSW * 0, a0); | ||
1491 | STORE_256_TO_STRUCT(s + NSW * 1, b0); | ||
1492 | STORE_256_TO_STRUCT(s + NSW * 2, a1); | ||
1493 | STORE_256_TO_STRUCT(s + NSW * 3, b1); | ||
1494 | |||
1495 | data += Z7_BLAKE2S_BLOCK_SIZE * 4; | ||
1496 | pos += Z7_BLAKE2S_BLOCK_SIZE * 4; | ||
1497 | pos &= SUPER_BLOCK_MASK; | ||
1498 | } | ||
1499 | while (data < end); | ||
1500 | DIAG_PERM8(s_items) | ||
1501 | end += Z7_BLAKE2S_BLOCK_SIZE * 3; | ||
109 | } | 1502 | } |
1503 | if (data == end) | ||
1504 | return; | ||
1505 | // Z7_BLAKE2S_Compress2_V128(s_items, data, end, pos); | ||
1506 | do | ||
1507 | { | ||
1508 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
1509 | Z7_BLAKE2S_CompressSingleBlock(s, data); | ||
1510 | data += Z7_BLAKE2S_BLOCK_SIZE; | ||
1511 | pos += Z7_BLAKE2S_BLOCK_SIZE; | ||
1512 | pos &= SUPER_BLOCK_MASK; | ||
1513 | } | ||
1514 | while (data != end); | ||
1515 | } | ||
1516 | |||
1517 | #endif // Z7_BLAKE2S_USE_AVX2_WAY4 | ||
1518 | #endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
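The point of the Way4 variant is instruction-level parallelism: the (a0,b0,c0,d0) and (a1,b1,c1,d1) chains in EE2 are independent, so the CPU can overlap their latencies. A scalar toy model of the same idea (illustrative only, not code from this file):

#include <stddef.h>
#include <stdint.h>

// two independent accumulator chains can execute in parallel,
// just as EE2 interleaves the two AVX2 dependency chains above.
static uint32_t sum_two_chains(const uint32_t *p, size_t n /* even */)
{
  uint32_t s0 = 0, s1 = 0;
  size_t i;
  for (i = 0; i < n; i += 2)
  {
    s0 += p[i];     // chain 0
    s1 += p[i + 1]; // chain 1: no dependency on chain 0
  }
  return s0 + s1;
}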
1519 | |||
1520 | |||
1521 | // --------------------------------------------------------- | ||
1522 | |||
1523 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
1524 | |||
1525 | #define OP256_L(a, i) D_ADD_256 (V(a, 0), \ | ||
1526 | LOAD_256((const Byte *)(w) + GET_SIGMA_VAL_256(2*(a)+(i)))); | ||
1527 | |||
1528 | #define OP256_0(a) OP256_L(a, 0) | ||
1529 | #define OP256_7(a) OP256_L(a, 1) | ||
1530 | |||
1531 | #define OP256_1(a) D_ADD_256 (V(a, 0), V(a, 1)); | ||
1532 | #define OP256_2(a) D_XOR_256 (V(a, 3), V(a, 0)); | ||
1533 | #define OP256_4(a) D_ADD_256 (V(a, 2), V(a, 3)); | ||
1534 | #define OP256_5(a) D_XOR_256 (V(a, 1), V(a, 2)); | ||
1535 | |||
1536 | #define OP256_3(a) D_ROT_256_16 (V(a, 3)); | ||
1537 | #define OP256_6(a) D_ROT_256_12 (V(a, 1)); | ||
1538 | #define OP256_8(a) D_ROT_256_8 (V(a, 3)); | ||
1539 | #define OP256_9(a) D_ROT_256_7 (V(a, 1)); | ||
1540 | |||
1541 | |||
1542 | #if 0 || 1 && defined(MY_CPU_X86) | ||
1543 | |||
1544 | #define V8_G(a) \ | ||
1545 | OP256_0 (a) \ | ||
1546 | OP256_1 (a) \ | ||
1547 | OP256_2 (a) \ | ||
1548 | OP256_3 (a) \ | ||
1549 | OP256_4 (a) \ | ||
1550 | OP256_5 (a) \ | ||
1551 | OP256_6 (a) \ | ||
1552 | OP256_7 (a) \ | ||
1553 | OP256_1 (a) \ | ||
1554 | OP256_2 (a) \ | ||
1555 | OP256_8 (a) \ | ||
1556 | OP256_4 (a) \ | ||
1557 | OP256_5 (a) \ | ||
1558 | OP256_9 (a) \ | ||
1559 | |||
1560 | #define V8R { \ | ||
1561 | V8_G (0); \ | ||
1562 | V8_G (1); \ | ||
1563 | V8_G (2); \ | ||
1564 | V8_G (3); \ | ||
1565 | V8_G (4); \ | ||
1566 | V8_G (5); \ | ||
1567 | V8_G (6); \ | ||
1568 | V8_G (7); \ | ||
1569 | } | ||
1570 | |||
1571 | #else | ||
1572 | |||
1573 | #define OP256_INTER_4(op, a,b,c,d) \ | ||
1574 | op (a) \ | ||
1575 | op (b) \ | ||
1576 | op (c) \ | ||
1577 | op (d) \ | ||
1578 | |||
1579 | #define V8_G(a,b,c,d) \ | ||
1580 | OP256_INTER_4 (OP256_0, a,b,c,d) \ | ||
1581 | OP256_INTER_4 (OP256_1, a,b,c,d) \ | ||
1582 | OP256_INTER_4 (OP256_2, a,b,c,d) \ | ||
1583 | OP256_INTER_4 (OP256_3, a,b,c,d) \ | ||
1584 | OP256_INTER_4 (OP256_4, a,b,c,d) \ | ||
1585 | OP256_INTER_4 (OP256_5, a,b,c,d) \ | ||
1586 | OP256_INTER_4 (OP256_6, a,b,c,d) \ | ||
1587 | OP256_INTER_4 (OP256_7, a,b,c,d) \ | ||
1588 | OP256_INTER_4 (OP256_1, a,b,c,d) \ | ||
1589 | OP256_INTER_4 (OP256_2, a,b,c,d) \ | ||
1590 | OP256_INTER_4 (OP256_8, a,b,c,d) \ | ||
1591 | OP256_INTER_4 (OP256_4, a,b,c,d) \ | ||
1592 | OP256_INTER_4 (OP256_5, a,b,c,d) \ | ||
1593 | OP256_INTER_4 (OP256_9, a,b,c,d) \ | ||
1594 | |||
1595 | #define V8R { \ | ||
1596 | V8_G (0, 1, 2, 3) \ | ||
1597 | V8_G (4, 5, 6, 7) \ | ||
1598 | } | ||
1599 | #endif | ||
1600 | |||
1601 | #define V8_ROUND(r) { GET_SIGMA_PTR_256(r); V8R } | ||
1602 | |||
1603 | |||
1604 | // for debug: | ||
1605 | // #define Z7_BLAKE2S_PERMUTE_WITH_GATHER | ||
1606 | #if defined(Z7_BLAKE2S_PERMUTE_WITH_GATHER) | ||
1607 | // the gather instruction is slow. | ||
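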
1608 | #define V8_LOAD_MSG(w, m) \ | ||
1609 | { \ | ||
1610 | unsigned i; \ | ||
1611 | for (i = 0; i < 16; ++i) { \ | ||
1612 | w[i] = _mm256_i32gather_epi32( \ | ||
1613 | (const void *)((m) + i * sizeof(UInt32)),\ | ||
1614 | _mm256_set_epi32(0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00), \ | ||
1615 | sizeof(UInt32)); \ | ||
1616 | } \ | ||
1617 | } | ||
1618 | #else // !Z7_BLAKE2S_PERMUTE_WITH_GATHER | ||
1619 | |||
1620 | #define V8_LOAD_MSG_2(w, a0, a1) \ | ||
1621 | { \ | ||
1622 | (w)[0] = _mm256_permute2x128_si256(a0, a1, 0x20); \ | ||
1623 | (w)[4] = _mm256_permute2x128_si256(a0, a1, 0x31); \ | ||
1624 | } | ||
1625 | |||
1626 | #define V8_LOAD_MSG_4(w, z0, z1, z2, z3) \ | ||
1627 | { \ | ||
1628 | __m256i s0, s1, s2, s3; \ | ||
1629 | s0 = _mm256_unpacklo_epi64(z0, z1); \ | ||
1630 | s1 = _mm256_unpackhi_epi64(z0, z1); \ | ||
1631 | s2 = _mm256_unpacklo_epi64(z2, z3); \ | ||
1632 | s3 = _mm256_unpackhi_epi64(z2, z3); \ | ||
1633 | V8_LOAD_MSG_2((w) + 0, s0, s2) \ | ||
1634 | V8_LOAD_MSG_2((w) + 1, s1, s3) \ | ||
1635 | } | ||
1636 | |||
1637 | #define V8_LOAD_MSG_0(t0, t1, m) \ | ||
1638 | { \ | ||
1639 | __m256i m0, m1; \ | ||
1640 | m0 = LOADU_256(m); \ | ||
1641 | m1 = LOADU_256((m) + 2 * 32); \ | ||
1642 | t0 = _mm256_unpacklo_epi32(m0, m1); \ | ||
1643 | t1 = _mm256_unpackhi_epi32(m0, m1); \ | ||
1644 | } | ||
1645 | |||
1646 | #define V8_LOAD_MSG_8(w, m) \ | ||
1647 | { \ | ||
1648 | __m256i t0, t1, t2, t3, t4, t5, t6, t7; \ | ||
1649 | V8_LOAD_MSG_0(t0, t4, (m) + 0 * 4 * 32) \ | ||
1650 | V8_LOAD_MSG_0(t1, t5, (m) + 1 * 4 * 32) \ | ||
1651 | V8_LOAD_MSG_0(t2, t6, (m) + 2 * 4 * 32) \ | ||
1652 | V8_LOAD_MSG_0(t3, t7, (m) + 3 * 4 * 32) \ | ||
1653 | V8_LOAD_MSG_4((w) , t0, t1, t2, t3) \ | ||
1654 | V8_LOAD_MSG_4((w) + 2, t4, t5, t6, t7) \ | ||
1655 | } | ||
1656 | |||
1657 | #define V8_LOAD_MSG(w, m) \ | ||
1658 | { \ | ||
1659 | V8_LOAD_MSG_8(w, m) \ | ||
1660 | V8_LOAD_MSG_8((w) + 8, (m) + 32) \ | ||
1661 | } | ||
1662 | |||
1663 | #endif // !Z7_BLAKE2S_PERMUTE_WITH_GATHER | ||
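The V8_LOAD_MSG unpack/permute sequence builds the same layout as the slow gather variant above: vector w[i], lane j, holds the i-th 32-bit word of the j-th consecutive 64-byte block. A scalar reference sketch (assuming little-endian input; names are illustrative):

#include <stdint.h>
#include <string.h>

static void load_msg_ref(uint32_t w[16][8], const unsigned char *m)
{
  unsigned i, lane;
  for (i = 0; i < 16; i++)
    for (lane = 0; lane < 8; lane++)
      // word i of block 'lane': consecutive blocks are 64 bytes apart
      memcpy(&w[i][lane], m + lane * 64 + i * 4, sizeof(uint32_t));
}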
1664 | |||
1665 | |||
1666 | #define V8_PERM_PAIR_STORE(u, a0, a2) \ | ||
1667 | { \ | ||
1668 | STORE_256_TO_STRUCT((u), _mm256_permute2x128_si256(a0, a2, 0x20)); \ | ||
1669 | STORE_256_TO_STRUCT((u) + 8, _mm256_permute2x128_si256(a0, a2, 0x31)); \ | ||
1670 | } | ||
1671 | |||
1672 | #define V8_UNPACK_STORE_4(u, z0, z1, z2, z3) \ | ||
1673 | { \ | ||
1674 | __m256i s0, s1, s2, s3; \ | ||
1675 | s0 = _mm256_unpacklo_epi64(z0, z1); \ | ||
1676 | s1 = _mm256_unpackhi_epi64(z0, z1); \ | ||
1677 | s2 = _mm256_unpacklo_epi64(z2, z3); \ | ||
1678 | s3 = _mm256_unpackhi_epi64(z2, z3); \ | ||
1679 | V8_PERM_PAIR_STORE(u + 0, s0, s2) \ | ||
1680 | V8_PERM_PAIR_STORE(u + 2, s1, s3) \ | ||
1681 | } | ||
1682 | |||
1683 | #define V8_UNPACK_STORE_0(src32, d0, d1) \ | ||
1684 | { \ | ||
1685 | const __m256i v0 = LOAD_256_FROM_STRUCT ((src32) ); \ | ||
1686 | const __m256i v1 = LOAD_256_FROM_STRUCT ((src32) + 8); \ | ||
1687 | d0 = _mm256_unpacklo_epi32(v0, v1); \ | ||
1688 | d1 = _mm256_unpackhi_epi32(v0, v1); \ | ||
1689 | } | ||
1690 | |||
1691 | #define V8_UNPACK_STATE(dest32, src32) \ | ||
1692 | { \ | ||
1693 | __m256i t0, t1, t2, t3, t4, t5, t6, t7; \ | ||
1694 | V8_UNPACK_STORE_0 ((src32) + 16 * 0, t0, t4) \ | ||
1695 | V8_UNPACK_STORE_0 ((src32) + 16 * 1, t1, t5) \ | ||
1696 | V8_UNPACK_STORE_0 ((src32) + 16 * 2, t2, t6) \ | ||
1697 | V8_UNPACK_STORE_0 ((src32) + 16 * 3, t3, t7) \ | ||
1698 | V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) , t0, t1, t2, t3) \ | ||
1699 | V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) + 4, t4, t5, t6, t7) \ | ||
110 | } | 1700 | } |
111 | 1701 | ||
112 | 1702 | ||
113 | #define Blake2s_Increment_Counter(S, inc) \ | ||
114 | { p->t[0] += (inc); p->t[1] += (p->t[0] < (inc)); } | ||
115 | 1703 | ||
116 | #define Blake2s_Set_LastBlock(p) \ | 1704 | #define V8_LOAD_STATE_256_FROM_STRUCT(i) \ |
117 | { p->f[0] = BLAKE2S_FINAL_FLAG; p->f[1] = p->lastNode_f1; } | 1705 | v[i] = LOAD_256_FROM_STRUCT(s_items + (i) * 8); |
1706 | |||
1707 | #if 0 || 0 && defined(MY_CPU_X86) | ||
1708 | #define Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
1709 | #endif | ||
1710 | |||
1711 | #ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
1712 | // this branch doesn't use the (iv) array, | ||
1713 | // so register pressure can be lower. | ||
1714 | // it can be faster in some cases. | ||
1715 | #define V8_LOAD_STATE_256(i) V8_LOAD_STATE_256_FROM_STRUCT(i) | ||
1716 | #define V8_UPDATE_STATE_256(i) \ | ||
1717 | { \ | ||
1718 | STORE_256_TO_STRUCT(s_items + (i) * 8, XOR_256( \ | ||
1719 | XOR_256(v[i], v[(i) + 8]), \ | ||
1720 | LOAD_256_FROM_STRUCT(s_items + (i) * 8))); \ | ||
1721 | } | ||
1722 | #else | ||
1723 | // this branch uses more variables (iv), so more registers. | ||
1724 | // it's better for gcc. | ||
1725 | // this branch may be better, if register pressure is lower (avx512). | ||
1726 | #define V8_LOAD_STATE_256(i) { iv[i] = v[i]; } | ||
1727 | #define V8_UPDATE_STATE_256(i) { v[i] = XOR_256(XOR_256(v[i], v[i + 8]), iv[i]); } | ||
1728 | #define V8_STORE_STATE_256(i) { STORE_256_TO_STRUCT(s_items + (i) * 8, v[i]); } | ||
1729 | #endif | ||
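Both branches implement the same per-lane BLAKE2s state update; written out in scalar form (a sketch, not code from this file):

#include <stdint.h>

// fold the two work rows back into the chaining value:
// h[i] ^= v[i] ^ v[i + 8]
static void update_state_ref(uint32_t h[8], const uint32_t v[16])
{
  unsigned i;
  for (i = 0; i < 8; i++)
    h[i] ^= v[i] ^ v[i + 8];
}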
118 | 1730 | ||
119 | 1731 | ||
120 | static void Blake2s_Update(CBlake2s *p, const Byte *data, size_t size) | 1732 | #if 0 |
1733 | // use loading constants from memory | ||
1734 | #define KK8(n) KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n) | ||
1735 | MY_ALIGN(64) | ||
1736 | static const UInt32 k_Blake2s_IV_WAY8[]= | ||
121 | { | 1737 | { |
122 | while (size != 0) | 1738 | KK8(0), KK8(1), KK8(2), KK8(3), KK8(4), KK8(5), KK8(6), KK8(7) |
123 | { | 1739 | }; |
124 | unsigned pos = (unsigned)p->bufPos; | 1740 | #define GET_256_IV_WAY8(i) LOAD_256(k_Blake2s_IV_WAY8 + 8 * (i)) |
125 | unsigned rem = BLAKE2S_BLOCK_SIZE - pos; | 1741 | #else |
1742 | // use constant generation: | ||
1743 | #define GET_256_IV_WAY8(i) _mm256_set1_epi32((Int32)KIV(i)) | ||
1744 | #endif | ||
126 | 1745 | ||
127 | if (size <= rem) | 1746 | |
1747 | static | ||
1748 | Z7_NO_INLINE | ||
1749 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
1750 | BLAKE2S_ATTRIB_AVX2 | ||
1751 | #endif | ||
1752 | void | ||
1753 | Z7_FASTCALL | ||
1754 | Blake2sp_Compress2_AVX2_Fast(UInt32 *s_items, const Byte *data, const Byte *end) | ||
1755 | { | ||
1756 | #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
1757 | __m256i v[16]; | ||
1758 | #endif | ||
1759 | |||
1760 | // PrintStates2(s_items, 8, 16); | ||
1761 | |||
1762 | #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
1763 | REP8_MACRO (V8_LOAD_STATE_256_FROM_STRUCT) | ||
1764 | #endif | ||
1765 | |||
1766 | do | ||
1767 | { | ||
1768 | __m256i w[16]; | ||
1769 | #ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
1770 | __m256i v[16]; | ||
1771 | #else | ||
1772 | __m256i iv[8]; | ||
1773 | #endif | ||
1774 | V8_LOAD_MSG(w, data) | ||
128 | { | 1775 | { |
129 | memcpy(p->buf + pos, data, size); | 1776 | // we use load/store of ctr inside the loop to reduce register pressure: |
130 | p->bufPos += (UInt32)size; | 1777 | #if 1 || 1 && defined(MY_CPU_X86) |
131 | return; | 1778 | const __m256i ctr = _mm256_add_epi64( |
1779 | LOAD_256_FROM_STRUCT(s_items + 64), | ||
1780 | _mm256_set_epi32( | ||
1781 | 0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE, | ||
1782 | 0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)); | ||
1783 | STORE_256_TO_STRUCT(s_items + 64, ctr); | ||
1784 | #else | ||
1785 | const UInt64 ctr64 = *(const UInt64 *)(const void *)(s_items + 64) | ||
1786 | + Z7_BLAKE2S_BLOCK_SIZE; | ||
1787 | const __m256i ctr = _mm256_set_epi64x(0, (Int64)ctr64, 0, (Int64)ctr64); | ||
1788 | *(UInt64 *)(void *)(s_items + 64) = ctr64; | ||
1789 | #endif | ||
1790 | v[12] = XOR_256 (GET_256_IV_WAY8(4), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0))); | ||
1791 | v[13] = XOR_256 (GET_256_IV_WAY8(5), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1))); | ||
132 | } | 1792 | } |
1793 | v[ 8] = GET_256_IV_WAY8(0); | ||
1794 | v[ 9] = GET_256_IV_WAY8(1); | ||
1795 | v[10] = GET_256_IV_WAY8(2); | ||
1796 | v[11] = GET_256_IV_WAY8(3); | ||
1797 | v[14] = GET_256_IV_WAY8(6); | ||
1798 | v[15] = GET_256_IV_WAY8(7); | ||
133 | 1799 | ||
134 | memcpy(p->buf + pos, data, rem); | 1800 | REP8_MACRO (V8_LOAD_STATE_256) |
135 | Blake2s_Increment_Counter(S, BLAKE2S_BLOCK_SIZE) | 1801 | ROUNDS_LOOP (V8_ROUND) |
136 | Blake2s_Compress(p); | 1802 | REP8_MACRO (V8_UPDATE_STATE_256) |
137 | p->bufPos = 0; | 1803 | data += SUPER_BLOCK_SIZE; |
138 | data += rem; | ||
139 | size -= rem; | ||
140 | } | 1804 | } |
1805 | while (data != end); | ||
1806 | |||
1807 | #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
1808 | REP8_MACRO (V8_STORE_STATE_256) | ||
1809 | #endif | ||
141 | } | 1810 | } |
142 | 1811 | ||
143 | 1812 | ||
144 | static void Blake2s_Final(CBlake2s *p, Byte *digest) | 1813 | static |
1814 | Z7_NO_INLINE | ||
1815 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
1816 | BLAKE2S_ATTRIB_AVX2 | ||
1817 | #endif | ||
1818 | void | ||
1819 | Z7_FASTCALL | ||
1820 | Blake2sp_Final_AVX2_Fast(UInt32 *states) | ||
145 | { | 1821 | { |
146 | unsigned i; | 1822 | const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64); |
1823 | // PrintStates2(states, 8, 16); | ||
1824 | V8_UNPACK_STATE(states, states) | ||
1825 | // PrintStates2(states, 8, 16); | ||
1826 | { | ||
1827 | unsigned k; | ||
1828 | for (k = 0; k < 8; k++) | ||
1829 | { | ||
1830 | UInt32 *s = states + (size_t)k * 16; | ||
1831 | STORE_128_TO_STRUCT (STATE_T(s), ctr); | ||
1832 | } | ||
1833 | } | ||
1834 | // PrintStates2(states, 8, 16); | ||
1835 | // printf("\nafter V8_UNPACK_STATE \n"); | ||
1836 | } | ||
1837 | |||
1838 | #endif // Z7_BLAKE2S_USE_AVX2_FAST | ||
1839 | #endif // avx2 | ||
1840 | #endif // vector | ||
1841 | |||
1842 | |||
1843 | /* | ||
1844 | #define Blake2s_Increment_Counter(s, inc) \ | ||
1845 | { STATE_T(s)[0] += (inc); STATE_T(s)[1] += (STATE_T(s)[0] < (inc)); } | ||
1846 | #define Blake2s_Increment_Counter_Small(s, inc) \ | ||
1847 | { STATE_T(s)[0] += (inc); } | ||
1848 | */ | ||
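The commented-out macro above keeps a 64-bit block counter split across two 32-bit words; a minimal sketch of the carry logic:

#include <stdint.h>

static void inc_counter(uint32_t t[2], uint32_t inc)
{
  t[0] += inc;
  t[1] += (t[0] < inc); // carry out: t[0] wrapped around iff it is now below inc
}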
1849 | |||
1850 | #define Blake2s_Set_LastBlock(s) \ | ||
1851 | { STATE_F(s)[0] = BLAKE2S_FINAL_FLAG; /* STATE_F(s)[1] = p->u.header.lastNode_f1; */ } | ||
1852 | |||
1853 | |||
1854 | #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL >= 1600 | ||
1855 | // good for vs2022 | ||
1856 | #define LOOP_8(mac) { unsigned kkk; for (kkk = 0; kkk < 8; kkk++) mac(kkk) } | ||
1857 | #else | ||
1858 | // good for Z7_BLAKE2S_UNROLL for GCC9 (arm*/x86*) and MSC_VER_1400-x64. | ||
1859 | #define LOOP_8(mac) { REP8_MACRO(mac) } | ||
1860 | #endif | ||
1861 | |||
1862 | |||
1863 | static | ||
1864 | Z7_FORCE_INLINE | ||
1865 | // Z7_NO_INLINE | ||
1866 | void | ||
1867 | Z7_FASTCALL | ||
1868 | Blake2s_Compress(UInt32 *s, const Byte *input) | ||
1869 | { | ||
1870 | UInt32 m[16]; | ||
1871 | UInt32 v[16]; | ||
1872 | { | ||
1873 | unsigned i; | ||
1874 | for (i = 0; i < 16; i++) | ||
1875 | m[i] = GetUi32(input + i * 4); | ||
1876 | } | ||
1877 | |||
1878 | #define INIT_v_FROM_s(i) v[i] = s[i]; | ||
1879 | |||
1880 | LOOP_8(INIT_v_FROM_s) | ||
1881 | |||
1882 | // Blake2s_Increment_Counter(s, Z7_BLAKE2S_BLOCK_SIZE) | ||
1883 | { | ||
1884 | const UInt32 t0 = STATE_T(s)[0] + Z7_BLAKE2S_BLOCK_SIZE; | ||
1885 | const UInt32 t1 = STATE_T(s)[1] + (t0 < Z7_BLAKE2S_BLOCK_SIZE); | ||
1886 | STATE_T(s)[0] = t0; | ||
1887 | STATE_T(s)[1] = t1; | ||
1888 | v[12] = t0 ^ KIV(4); | ||
1889 | v[13] = t1 ^ KIV(5); | ||
1890 | } | ||
1891 | // v[12] = STATE_T(s)[0] ^ KIV(4); | ||
1892 | // v[13] = STATE_T(s)[1] ^ KIV(5); | ||
1893 | v[14] = STATE_F(s)[0] ^ KIV(6); | ||
1894 | v[15] = STATE_F(s)[1] ^ KIV(7); | ||
1895 | |||
1896 | v[ 8] = KIV(0); | ||
1897 | v[ 9] = KIV(1); | ||
1898 | v[10] = KIV(2); | ||
1899 | v[11] = KIV(3); | ||
1900 | // PrintStates2((const UInt32 *)v, 1, 16); | ||
1901 | |||
1902 | #define ADD_SIGMA(a, index) V(a, 0) += *(const UInt32 *)GET_SIGMA_PTR(m, sigma[index]); | ||
1903 | #define ADD32M(dest, src, a) V(a, dest) += V(a, src); | ||
1904 | #define XOR32M(dest, src, a) V(a, dest) ^= V(a, src); | ||
1905 | #define RTR32M(dest, shift, a) V(a, dest) = rotrFixed(V(a, dest), shift); | ||
1906 | |||
1907 | // big interleaving can provide a big performance gain, if scheduler queues are small. | ||
1908 | #if 0 || 1 && defined(MY_CPU_X86) | ||
1909 | // interleave-1: for a small number of registers (x86, 32-bit) | ||
1910 | #define G2(index, a, x, y) \ | ||
1911 | ADD_SIGMA (a, (index) + 2 * 0) \ | ||
1912 | ADD32M (0, 1, a) \ | ||
1913 | XOR32M (3, 0, a) \ | ||
1914 | RTR32M (3, x, a) \ | ||
1915 | ADD32M (2, 3, a) \ | ||
1916 | XOR32M (1, 2, a) \ | ||
1917 | RTR32M (1, y, a) \ | ||
1918 | |||
1919 | #define G(a) \ | ||
1920 | G2(a * 2 , a, 16, 12) \ | ||
1921 | G2(a * 2 + 1, a, 8, 7) \ | ||
1922 | |||
1923 | #define R2 \ | ||
1924 | G(0) \ | ||
1925 | G(1) \ | ||
1926 | G(2) \ | ||
1927 | G(3) \ | ||
1928 | G(4) \ | ||
1929 | G(5) \ | ||
1930 | G(6) \ | ||
1931 | G(7) \ | ||
1932 | |||
1933 | #elif 0 || 1 && defined(MY_CPU_X86_OR_AMD64) | ||
1934 | // interleave-2: good if the number of registers is not big (x86-64). | ||
1935 | |||
1936 | #define REP2(mac, dest, src, a, b) \ | ||
1937 | mac(dest, src, a) \ | ||
1938 | mac(dest, src, b) | ||
1939 | |||
1940 | #define G2(index, a, b, x, y) \ | ||
1941 | ADD_SIGMA (a, (index) + 2 * 0) \ | ||
1942 | ADD_SIGMA (b, (index) + 2 * 1) \ | ||
1943 | REP2 (ADD32M, 0, 1, a, b) \ | ||
1944 | REP2 (XOR32M, 3, 0, a, b) \ | ||
1945 | REP2 (RTR32M, 3, x, a, b) \ | ||
1946 | REP2 (ADD32M, 2, 3, a, b) \ | ||
1947 | REP2 (XOR32M, 1, 2, a, b) \ | ||
1948 | REP2 (RTR32M, 1, y, a, b) \ | ||
1949 | |||
1950 | #define G(a, b) \ | ||
1951 | G2(a * 2 , a, b, 16, 12) \ | ||
1952 | G2(a * 2 + 1, a, b, 8, 7) \ | ||
1953 | |||
1954 | #define R2 \ | ||
1955 | G(0, 1) \ | ||
1956 | G(2, 3) \ | ||
1957 | G(4, 5) \ | ||
1958 | G(6, 7) \ | ||
147 | 1959 | ||
148 | Blake2s_Increment_Counter(S, (UInt32)p->bufPos) | 1960 | #else |
149 | Blake2s_Set_LastBlock(p) | 1961 | // interleave-4: |
150 | memset(p->buf + p->bufPos, 0, BLAKE2S_BLOCK_SIZE - p->bufPos); | 1962 | // it has high register pressure on x86/x64, |
151 | Blake2s_Compress(p); | 1963 | // and MSVC compilers for x86/x64 are slow for this branch. |
1964 | // but with a big number of registers, this branch can be faster. |
152 | 1965 | ||
153 | for (i = 0; i < 8; i++) | 1966 | #define REP4(mac, dest, src, a, b, c, d) \ |
1967 | mac(dest, src, a) \ | ||
1968 | mac(dest, src, b) \ | ||
1969 | mac(dest, src, c) \ | ||
1970 | mac(dest, src, d) | ||
1971 | |||
1972 | #define G2(index, a, b, c, d, x, y) \ | ||
1973 | ADD_SIGMA (a, (index) + 2 * 0) \ | ||
1974 | ADD_SIGMA (b, (index) + 2 * 1) \ | ||
1975 | ADD_SIGMA (c, (index) + 2 * 2) \ | ||
1976 | ADD_SIGMA (d, (index) + 2 * 3) \ | ||
1977 | REP4 (ADD32M, 0, 1, a, b, c, d) \ | ||
1978 | REP4 (XOR32M, 3, 0, a, b, c, d) \ | ||
1979 | REP4 (RTR32M, 3, x, a, b, c, d) \ | ||
1980 | REP4 (ADD32M, 2, 3, a, b, c, d) \ | ||
1981 | REP4 (XOR32M, 1, 2, a, b, c, d) \ | ||
1982 | REP4 (RTR32M, 1, y, a, b, c, d) \ | ||
1983 | |||
1984 | #define G(a, b, c, d) \ | ||
1985 | G2(a * 2 , a, b, c, d, 16, 12) \ | ||
1986 | G2(a * 2 + 1, a, b, c, d, 8, 7) \ | ||
1987 | |||
1988 | #define R2 \ | ||
1989 | G(0, 1, 2, 3) \ | ||
1990 | G(4, 5, 6, 7) \ | ||
1991 | |||
1992 | #endif | ||
1993 | |||
1994 | #define R(r) { const Byte *sigma = k_Blake2s_Sigma_4[r]; R2 } | ||
1995 | |||
1996 | // Z7_BLAKE2S_UNROLL gives 5-6 KB larger code, but faster: | ||
1997 | // 20-40% faster for (x86/x64) VC2010+/GCC/CLANG. | ||
1998 | // 30-60% faster for (arm64-arm32) GCC. | ||
1999 | // 5-11% faster for (arm64) CLANG-MAC. | ||
2000 | // so Z7_BLAKE2S_UNROLL is a good optimization, if there is no vector branch. | ||
2001 | // But if there is a vector branch (for x86*), this scalar code will be mostly unused. | ||
2002 | // So we want smaller code (without unrolling) in that case (x86*). | ||
2003 | #if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS) | ||
2004 | #define Z7_BLAKE2S_UNROLL | ||
2005 | #endif | ||
2006 | |||
2007 | #ifdef Z7_BLAKE2S_UNROLL | ||
2008 | ROUNDS_LOOP_UNROLLED (R) | ||
2009 | #else | ||
2010 | ROUNDS_LOOP (R) | ||
2011 | #endif | ||
2012 | |||
2013 | #undef G | ||
2014 | #undef G2 | ||
2015 | #undef R | ||
2016 | #undef R2 | ||
2017 | |||
2018 | // printf("\n v after: \n"); | ||
2019 | // PrintStates2((const UInt32 *)v, 1, 16); | ||
2020 | #define XOR_s_PAIR_v(i) s[i] ^= v[i] ^ v[i + 8]; | ||
2021 | |||
2022 | LOOP_8(XOR_s_PAIR_v) | ||
2023 | // printf("\n s after:\n"); | ||
2024 | // PrintStates2((const UInt32 *)s, 1, 16); | ||
2025 | } | ||
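For comparison, the standard BLAKE2s G function that the ADD_SIGMA / ADD32M / XOR32M / RTR32M macro chains expand to, written out plainly (a reference sketch; rotr32 plays the role of rotrFixed from RotateDefs.h):

#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned n)
{
  return (x >> n) | (x << (32 - n)); // n is in {16, 12, 8, 7} here
}

// one quarter-round on state words a,b,c,d with message words mx,my
static void g_ref(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                  uint32_t mx, uint32_t my)
{
  *a += *b + mx;  *d = rotr32(*d ^ *a, 16);
  *c += *d;       *b = rotr32(*b ^ *c, 12);
  *a += *b + my;  *d = rotr32(*d ^ *a, 8);
  *c += *d;       *b = rotr32(*b ^ *c, 7);
}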
2026 | |||
2027 | |||
2028 | static | ||
2029 | Z7_NO_INLINE | ||
2030 | void | ||
2031 | Z7_FASTCALL | ||
2032 | Blake2sp_Compress2(UInt32 *s_items, const Byte *data, const Byte *end) | ||
2033 | { | ||
2034 | size_t pos = 0; | ||
2035 | // PrintStates2(s_items, 8, 16); | ||
2036 | do | ||
154 | { | 2037 | { |
155 | SetUi32(digest + sizeof(p->h[i]) * i, p->h[i]) | 2038 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); |
2039 | Blake2s_Compress(s, data); | ||
2040 | data += Z7_BLAKE2S_BLOCK_SIZE; | ||
2041 | pos += Z7_BLAKE2S_BLOCK_SIZE; | ||
2042 | pos &= SUPER_BLOCK_MASK; | ||
156 | } | 2043 | } |
2044 | while (data != end); | ||
157 | } | 2045 | } |
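Blake2sp_Compress2 walks the input in 64-byte blocks while pos cycles through the superblock, so consecutive blocks are distributed round-robin over the 8 leaf states. A sketch of that mapping (the constants are illustrative stand-ins for Z7_BLAKE2S_BLOCK_SIZE and SUPER_BLOCK_MASK):

#define BLOCK_ILL 64               /* stand-in for Z7_BLAKE2S_BLOCK_SIZE */
#define SUPER_ILL (BLOCK_ILL * 8)  /* one block for each of the 8 leaves */

/* index of the leaf state that owns the block at byte offset pos */
static unsigned lane_of_pos(unsigned long pos)
{
  return (unsigned)((pos & (SUPER_ILL - 1)) / BLOCK_ILL);
}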
158 | 2046 | ||
159 | 2047 | ||
160 | /* ---------- BLAKE2s ---------- */ | 2048 | #ifdef Z7_BLAKE2S_USE_VECTORS |
2049 | |||
2050 | static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = Blake2sp_Compress2; | ||
2051 | static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = Blake2sp_Compress2; | ||
2052 | static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Init; | ||
2053 | static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Final; | ||
2054 | static unsigned g_z7_Blake2sp_SupportedFlags; | ||
2055 | |||
2056 | #define Z7_BLAKE2SP_Compress_Fast(p) (p)->u.header.func_Compress_Fast | ||
2057 | #define Z7_BLAKE2SP_Compress_Single(p) (p)->u.header.func_Compress_Single | ||
2058 | #else | ||
2059 | #define Z7_BLAKE2SP_Compress_Fast(p) Blake2sp_Compress2 | ||
2060 | #define Z7_BLAKE2SP_Compress_Single(p) Blake2sp_Compress2 | ||
2061 | #endif // Z7_BLAKE2S_USE_VECTORS | ||
2062 | |||
161 | 2063 | ||
162 | /* we need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0() */ | 2064 | #if 1 && defined(MY_CPU_LE) |
2065 | #define GET_DIGEST(_s, _digest) \ | ||
2066 | { memcpy(_digest, _s, Z7_BLAKE2S_DIGEST_SIZE); } | ||
2067 | #else | ||
2068 | #define GET_DIGEST(_s, _digest) \ | ||
2069 | { unsigned _i; for (_i = 0; _i < 8; _i++) \ | ||
2070 | { SetUi32((_digest) + 4 * _i, (_s)[_i]) } \ | ||
2071 | } | ||
2072 | #endif | ||
2073 | |||
2074 | |||
2075 | /* ---------- BLAKE2s ---------- */ | ||
163 | /* | 2076 | /* |
2077 | // we need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0() | ||
164 | typedef struct | 2078 | typedef struct |
165 | { | 2079 | { |
166 | Byte digest_length; | 2080 | Byte digest_length; |
167 | Byte key_length; | 2081 | Byte key_length; |
168 | Byte fanout; | 2082 | Byte fanout; // = 1 : in sequential mode |
169 | Byte depth; | 2083 | Byte depth; // = 1 : in sequential mode |
170 | UInt32 leaf_length; | 2084 | UInt32 leaf_length; |
171 | Byte node_offset[6]; | 2085 | Byte node_offset[6]; // 0 for the first, leftmost, leaf, or in sequential mode |
172 | Byte node_depth; | 2086 | Byte node_depth; // 0 for the leaves, or in sequential mode |
173 | Byte inner_length; | 2087 | Byte inner_length; // [0, 32], 0 in sequential mode |
174 | Byte salt[BLAKE2S_SALTBYTES]; | 2088 | Byte salt[BLAKE2S_SALTBYTES]; |
175 | Byte personal[BLAKE2S_PERSONALBYTES]; | 2089 | Byte personal[BLAKE2S_PERSONALBYTES]; |
176 | } CBlake2sParam; | 2090 | } CBlake2sParam; |
177 | */ | 2091 | */ |
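The k_Blake2sp_IV_0 constant below folds the first parameter-block word into IV(0); for BLAKE2sp leaves the parameters are digest_length=32, key_length=0, fanout=8, depth=2. A sketch of the spec formula (not code from this file):

#include <stdint.h>

static uint32_t param_word0(uint32_t iv0, unsigned digest_len,
                            unsigned key_len, unsigned fanout, unsigned depth)
{
  return iv0 ^ ((uint32_t)digest_len
             | ((uint32_t)key_len << 8)
             | ((uint32_t)fanout  << 16)
             | ((uint32_t)depth   << 24));
}
// param_word0(KIV(0), 32, 0, 8, 2) matches k_Blake2sp_IV_0 below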
178 | 2092 | ||
2093 | #define k_Blake2sp_IV_0 \ | ||
2094 | (KIV(0) ^ (Z7_BLAKE2S_DIGEST_SIZE | ((UInt32)Z7_BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24))) | ||
2095 | #define k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth) \ | ||
2096 | (KIV(3) ^ ((UInt32)(node_depth) << 16) ^ ((UInt32)Z7_BLAKE2S_DIGEST_SIZE << 24)) | ||
179 | 2097 | ||
180 | static void Blake2sp_Init_Spec(CBlake2s *p, unsigned node_offset, unsigned node_depth) | 2098 | Z7_FORCE_INLINE |
2099 | static void Blake2sp_Init_Spec(UInt32 *s, unsigned node_offset, unsigned node_depth) | ||
181 | { | 2100 | { |
182 | Blake2s_Init0(p); | 2101 | s[0] = k_Blake2sp_IV_0; |
183 | 2102 | s[1] = KIV(1); | |
184 | p->h[0] ^= (BLAKE2S_DIGEST_SIZE | ((UInt32)BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24)); | 2103 | s[2] = KIV(2) ^ (UInt32)node_offset; |
185 | p->h[2] ^= ((UInt32)node_offset); | 2104 | s[3] = k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth); |
186 | p->h[3] ^= ((UInt32)node_depth << 16) | ((UInt32)BLAKE2S_DIGEST_SIZE << 24); | 2105 | s[4] = KIV(4); |
187 | /* | 2106 | s[5] = KIV(5); |
188 | P->digest_length = BLAKE2S_DIGEST_SIZE; | 2107 | s[6] = KIV(6); |
189 | P->key_length = 0; | 2108 | s[7] = KIV(7); |
190 | P->fanout = BLAKE2SP_PARALLEL_DEGREE; | 2109 | |
191 | P->depth = 2; | 2110 | STATE_T(s)[0] = 0; |
192 | P->leaf_length = 0; | 2111 | STATE_T(s)[1] = 0; |
193 | store48(P->node_offset, node_offset); | 2112 | STATE_F(s)[0] = 0; |
194 | P->node_depth = node_depth; | 2113 | STATE_F(s)[1] = 0; |
195 | P->inner_length = BLAKE2S_DIGEST_SIZE; | ||
196 | */ | ||
197 | } | 2114 | } |
198 | 2115 | ||
199 | 2116 | ||
2117 | #ifdef Z7_BLAKE2S_USE_V128_FAST | ||
2118 | |||
2119 | static | ||
2120 | Z7_NO_INLINE | ||
2121 | #ifdef BLAKE2S_ATTRIB_128BIT | ||
2122 | BLAKE2S_ATTRIB_128BIT | ||
2123 | #endif | ||
2124 | void | ||
2125 | Z7_FASTCALL | ||
2126 | Blake2sp_InitState_V128_Fast(UInt32 *states) | ||
2127 | { | ||
2128 | #define STORE_128_PAIR_INIT_STATES_2(i, t0, t1) \ | ||
2129 | { STORE_128_TO_STRUCT(states + 0 + 4 * (i), (t0)); \ | ||
2130 | STORE_128_TO_STRUCT(states + 32 + 4 * (i), (t1)); \ | ||
2131 | } | ||
2132 | #define STORE_128_PAIR_INIT_STATES_1(i, mac) \ | ||
2133 | { const __m128i t = mac; \ | ||
2134 | STORE_128_PAIR_INIT_STATES_2(i, t, t) \ | ||
2135 | } | ||
2136 | #define STORE_128_PAIR_INIT_STATES_IV(i) \ | ||
2137 | STORE_128_PAIR_INIT_STATES_1(i, GET_128_IV_WAY4(i)) | ||
2138 | |||
2139 | STORE_128_PAIR_INIT_STATES_1 (0, _mm_set1_epi32((Int32)k_Blake2sp_IV_0)) | ||
2140 | STORE_128_PAIR_INIT_STATES_IV (1) | ||
2141 | { | ||
2142 | const __m128i t = GET_128_IV_WAY4(2); | ||
2143 | STORE_128_PAIR_INIT_STATES_2 (2, | ||
2144 | XOR_128(t, _mm_set_epi32(3, 2, 1, 0)), | ||
2145 | XOR_128(t, _mm_set_epi32(7, 6, 5, 4))) | ||
2146 | } | ||
2147 | STORE_128_PAIR_INIT_STATES_1 (3, _mm_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0))) | ||
2148 | STORE_128_PAIR_INIT_STATES_IV (4) | ||
2149 | STORE_128_PAIR_INIT_STATES_IV (5) | ||
2150 | STORE_128_PAIR_INIT_STATES_IV (6) | ||
2151 | STORE_128_PAIR_INIT_STATES_IV (7) | ||
2152 | STORE_128_PAIR_INIT_STATES_1 (16, _mm_set_epi32(0, 0, 0, 0)) | ||
2153 | // printf("\n== exit Blake2sp_InitState_V128_Fast ctr=%d\n", states[64]); | ||
2154 | } | ||
2155 | |||
2156 | #endif // Z7_BLAKE2S_USE_V128_FAST | ||
2157 | |||
2158 | |||
2159 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
2160 | |||
2161 | static | ||
2162 | Z7_NO_INLINE | ||
2163 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
2164 | BLAKE2S_ATTRIB_AVX2 | ||
2165 | #endif | ||
2166 | void | ||
2167 | Z7_FASTCALL | ||
2168 | Blake2sp_InitState_AVX2_Fast(UInt32 *states) | ||
2169 | { | ||
2170 | #define STORE_256_INIT_STATES(i, t) \ | ||
2171 | STORE_256_TO_STRUCT(states + 8 * (i), t); | ||
2172 | #define STORE_256_INIT_STATES_IV(i) \ | ||
2173 | STORE_256_INIT_STATES(i, GET_256_IV_WAY8(i)) | ||
2174 | |||
2175 | STORE_256_INIT_STATES (0, _mm256_set1_epi32((Int32)k_Blake2sp_IV_0)) | ||
2176 | STORE_256_INIT_STATES_IV (1) | ||
2177 | STORE_256_INIT_STATES (2, XOR_256( GET_256_IV_WAY8(2), | ||
2178 | _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0))) | ||
2179 | STORE_256_INIT_STATES (3, _mm256_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0))) | ||
2180 | STORE_256_INIT_STATES_IV (4) | ||
2181 | STORE_256_INIT_STATES_IV (5) | ||
2182 | STORE_256_INIT_STATES_IV (6) | ||
2183 | STORE_256_INIT_STATES_IV (7) | ||
2184 | STORE_256_INIT_STATES (8, _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0)) | ||
2185 | // printf("\n== exit Blake2sp_InitState_AVX2_Fast\n"); | ||
2186 | } | ||
2187 | |||
2188 | #endif // Z7_BLAKE2S_USE_AVX2_FAST | ||
2189 | |||
2190 | |||
2191 | |||
2192 | Z7_NO_INLINE | ||
2193 | void Blake2sp_InitState(CBlake2sp *p) | ||
2194 | { | ||
2195 | size_t i; | ||
2196 | // memset(p->states, 0, sizeof(p->states)); // for debug | ||
2197 | p->u.header.cycPos = 0; | ||
2198 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS | ||
2199 | if (p->u.header.func_Init) | ||
2200 | { | ||
2201 | p->u.header.func_Init(p->states); | ||
2202 | return; | ||
2203 | } | ||
2204 | #endif | ||
2205 | for (i = 0; i < Z7_BLAKE2SP_PARALLEL_DEGREE; i++) | ||
2206 | Blake2sp_Init_Spec(p->states + i * NSW, (unsigned)i, 0); | ||
2207 | } | ||
2208 | |||
200 | void Blake2sp_Init(CBlake2sp *p) | 2209 | void Blake2sp_Init(CBlake2sp *p) |
201 | { | 2210 | { |
202 | unsigned i; | 2211 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS |
203 | 2212 | p->u.header.func_Compress_Fast = | |
204 | p->bufPos = 0; | 2213 | #ifdef Z7_BLAKE2S_USE_VECTORS |
2214 | g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast; | ||
2215 | #else | ||
2216 | NULL; | ||
2217 | #endif | ||
2218 | |||
2219 | p->u.header.func_Compress_Single = | ||
2220 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
2221 | g_Z7_BLAKE2SP_FUNC_COMPRESS_Single; | ||
2222 | #else | ||
2223 | NULL; | ||
2224 | #endif | ||
2225 | |||
2226 | p->u.header.func_Init = | ||
2227 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
2228 | g_Z7_BLAKE2SP_FUNC_INIT_Init; | ||
2229 | #else | ||
2230 | NULL; | ||
2231 | #endif | ||
205 | 2232 | ||
206 | for (i = 0; i < BLAKE2SP_PARALLEL_DEGREE; i++) | 2233 | p->u.header.func_Final = |
207 | Blake2sp_Init_Spec(&p->S[i], i, 0); | 2234 | #ifdef Z7_BLAKE2S_USE_VECTORS |
2235 | g_Z7_BLAKE2SP_FUNC_INIT_Final; | ||
2236 | #else | ||
2237 | NULL; | ||
2238 | #endif | ||
2239 | #endif | ||
208 | 2240 | ||
209 | p->S[BLAKE2SP_PARALLEL_DEGREE - 1].lastNode_f1 = BLAKE2S_FINAL_FLAG; | 2241 | Blake2sp_InitState(p); |
210 | } | 2242 | } |
211 | 2243 | ||
212 | 2244 | ||
213 | void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size) | 2245 | void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size) |
214 | { | 2246 | { |
215 | unsigned pos = p->bufPos; | 2247 | size_t pos; |
216 | while (size != 0) | 2248 | // printf("\nsize = 0x%6x, cycPos = %5u data = %p\n", (unsigned)size, (unsigned)p->u.header.cycPos, data); |
2249 | if (size == 0) | ||
2250 | return; | ||
2251 | pos = p->u.header.cycPos; | ||
2252 | // pos < SUPER_BLOCK_SIZE * 2 : is expected | ||
2253 | // pos == SUPER_BLOCK_SIZE * 2 : is not expected, but is also supported | ||
2254 | { | ||
2255 | const size_t pos2 = pos & SUPER_BLOCK_MASK; | ||
2256 | if (pos2) | ||
2257 | { | ||
2258 | const size_t rem = SUPER_BLOCK_SIZE - pos2; | ||
2259 | if (rem > size) | ||
2260 | { | ||
2261 | p->u.header.cycPos = (unsigned)(pos + size); | ||
2262 | // cycPos < SUPER_BLOCK_SIZE * 2 | ||
2263 | memcpy((Byte *)(void *)p->buf32 + pos, data, size); | ||
2264 | /* to simplify the code here we don't try to process the first superblock, | ||
2265 | if (cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) */ | ||
2266 | return; | ||
2267 | } | ||
2268 | // (rem <= size) | ||
2269 | memcpy((Byte *)(void *)p->buf32 + pos, data, rem); | ||
2270 | pos += rem; | ||
2271 | data += rem; | ||
2272 | size -= rem; | ||
2273 | } | ||
2274 | } | ||
2275 | |||
2276 | // pos <= SUPER_BLOCK_SIZE * 2 | ||
2277 | // pos % SUPER_BLOCK_SIZE == 0 | ||
2278 | if (pos) | ||
2279 | { | ||
2280 | /* pos == SUPER_BLOCK_SIZE || | ||
2281 | pos == SUPER_BLOCK_SIZE * 2 */ | ||
2282 | size_t end = pos; | ||
2283 | if (size > SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE | ||
2284 | || (end -= SUPER_BLOCK_SIZE)) | ||
2285 | { | ||
2286 | Z7_BLAKE2SP_Compress_Fast(p)(p->states, | ||
2287 | (const Byte *)(const void *)p->buf32, | ||
2288 | (const Byte *)(const void *)p->buf32 + end); | ||
2289 | if (pos -= end) | ||
2290 | memcpy(p->buf32, (const Byte *)(const void *)p->buf32 | ||
2291 | + SUPER_BLOCK_SIZE, SUPER_BLOCK_SIZE); | ||
2292 | } | ||
2293 | } | ||
2294 | |||
2295 | // pos == 0 || (pos == SUPER_BLOCK_SIZE && size <= SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE) | ||
2296 | if (size > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) | ||
2297 | { | ||
2298 | // pos == 0 | ||
2299 | const Byte *end; | ||
2300 | const size_t size2 = (size - (SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE + 1)) | ||
2301 | & ~(size_t)SUPER_BLOCK_MASK; | ||
2302 | size -= size2; | ||
2303 | // size < SUPER_BLOCK_SIZE * 2 | ||
2304 | end = data + size2; | ||
2305 | Z7_BLAKE2SP_Compress_Fast(p)(p->states, data, end); | ||
2306 | data = end; | ||
2307 | } | ||
2308 | |||
2309 | if (size != 0) | ||
217 | { | 2310 | { |
218 | unsigned index = pos / BLAKE2S_BLOCK_SIZE; | 2311 | memcpy((Byte *)(void *)p->buf32 + pos, data, size); |
219 | unsigned rem = BLAKE2S_BLOCK_SIZE - (pos & (BLAKE2S_BLOCK_SIZE - 1)); | 2312 | pos += size; |
220 | if (rem > size) | ||
221 | rem = (unsigned)size; | ||
222 | Blake2s_Update(&p->S[index], data, rem); | ||
223 | size -= rem; | ||
224 | data += rem; | ||
225 | pos += rem; | ||
226 | pos &= (BLAKE2S_BLOCK_SIZE * BLAKE2SP_PARALLEL_DEGREE - 1); | ||
227 | } | 2313 | } |
228 | p->bufPos = pos; | 2314 | p->u.header.cycPos = (unsigned)pos; |
2315 | // cycPos < SUPER_BLOCK_SIZE * 2 | ||
229 | } | 2316 | } |
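A simplified model of the double-superblock buffering above (an approximate sketch under stated assumptions, not the exact logic: the real code also defers the tail so the last block of each lane can carry the final-block flag):

#include <stddef.h>
#include <string.h>

#define SUPER_ILL2 512   /* illustrative stand-in for SUPER_BLOCK_SIZE */

typedef struct
{
  unsigned char buf[2 * SUPER_ILL2];
  size_t pos;
} RingIll;

/* feed data; flush() stands in for the vectorized compress call */
static void ring_feed(RingIll *r, const unsigned char *data, size_t size,
                      void (*flush)(const unsigned char *super_block))
{
  while (size)
  {
    size_t n = sizeof(r->buf) - r->pos;
    if (n > size)
      n = size;
    memcpy(r->buf + r->pos, data, n);
    r->pos += n;  data += n;  size -= n;
    if (r->pos == sizeof(r->buf))
    {
      flush(r->buf);                                   /* first superblock */
      memcpy(r->buf, r->buf + SUPER_ILL2, SUPER_ILL2); /* keep the second  */
      r->pos = SUPER_ILL2;
    }
  }
}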
230 | 2317 | ||
231 | 2318 | ||
232 | void Blake2sp_Final(CBlake2sp *p, Byte *digest) | 2319 | void Blake2sp_Final(CBlake2sp *p, Byte *digest) |
233 | { | 2320 | { |
234 | CBlake2s R; | 2321 | // UInt32 * const R_states = p->states; |
235 | unsigned i; | 2322 | // printf("\nBlake2sp_Final \n"); |
2323 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS | ||
2324 | if (p->u.header.func_Final) | ||
2325 | p->u.header.func_Final(p->states); | ||
2326 | #endif | ||
2327 | // printf("\n=====\nBlake2sp_Final \n"); | ||
2328 | // PrintStates(p->states, 32); | ||
2329 | |||
2330 | // (p->u.header.cycPos == SUPER_BLOCK_SIZE) can be processed in any branch: | ||
2331 | if (p->u.header.cycPos <= SUPER_BLOCK_SIZE) | ||
2332 | { | ||
2333 | unsigned pos; | ||
2334 | memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, | ||
2335 | 0, SUPER_BLOCK_SIZE - p->u.header.cycPos); | ||
2336 | STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG; | ||
2337 | for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE) | ||
2338 | { | ||
2339 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos); | ||
2340 | Blake2s_Set_LastBlock(s) | ||
2341 | if (pos + Z7_BLAKE2S_BLOCK_SIZE > p->u.header.cycPos) | ||
2342 | { | ||
2343 | UInt32 delta = Z7_BLAKE2S_BLOCK_SIZE; | ||
2344 | if (pos < p->u.header.cycPos) | ||
2345 | delta -= p->u.header.cycPos & (Z7_BLAKE2S_BLOCK_SIZE - 1); | ||
2346 | // 0 < delta <= Z7_BLAKE2S_BLOCK_SIZE | ||
2347 | { | ||
2348 | const UInt32 v = STATE_T(s)[0]; | ||
2349 | STATE_T(s)[1] -= v < delta; // (v < delta) is the same condition here as (v == 0) | ||
2350 | STATE_T(s)[0] = v - delta; | ||
2351 | } | ||
2352 | } | ||
2353 | } | ||
2354 | // PrintStates(p->states, 16); | ||
2355 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
2356 | (Byte *)(void *)p->buf32, | ||
2357 | (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE); | ||
2358 | // PrintStates(p->states, 16); | ||
2359 | } | ||
2360 | else | ||
2361 | { | ||
2362 | // (p->u.header.cycPos > SUPER_BLOCK_SIZE) | ||
2363 | unsigned pos; | ||
2364 | for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE) | ||
2365 | { | ||
2366 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos); | ||
2367 | if (pos + SUPER_BLOCK_SIZE >= p->u.header.cycPos) | ||
2368 | Blake2s_Set_LastBlock(s) | ||
2369 | } | ||
2370 | if (p->u.header.cycPos <= SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) | ||
2371 | STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG; | ||
2372 | |||
2373 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
2374 | (Byte *)(void *)p->buf32, | ||
2375 | (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE); | ||
236 | 2376 | ||
237 | Blake2sp_Init_Spec(&R, 0, 1); | 2377 | // if (p->u.header.cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) |
238 | R.lastNode_f1 = BLAKE2S_FINAL_FLAG; | 2378 | STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG; |
2379 | |||
2380 | // if (p->u.header.cycPos != SUPER_BLOCK_SIZE) | ||
2381 | { | ||
2382 | pos = SUPER_BLOCK_SIZE; | ||
2383 | for (;;) | ||
2384 | { | ||
2385 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos & SUPER_BLOCK_MASK); | ||
2386 | Blake2s_Set_LastBlock(s) | ||
2387 | pos += Z7_BLAKE2S_BLOCK_SIZE; | ||
2388 | if (pos >= p->u.header.cycPos) | ||
2389 | { | ||
2390 | if (pos != p->u.header.cycPos) | ||
2391 | { | ||
2392 | const UInt32 delta = pos - p->u.header.cycPos; | ||
2393 | const UInt32 v = STATE_T(s)[0]; | ||
2394 | STATE_T(s)[1] -= v < delta; | ||
2395 | STATE_T(s)[0] = v - delta; | ||
2396 | memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, 0, delta); | ||
2397 | } | ||
2398 | break; | ||
2399 | } | ||
2400 | } | ||
2401 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
2402 | (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE, | ||
2403 | (Byte *)(void *)p->buf32 + pos); | ||
2404 | } | ||
2405 | } | ||
239 | 2406 | ||
240 | for (i = 0; i < BLAKE2SP_PARALLEL_DEGREE; i++) | ||
241 | { | 2407 | { |
242 | Byte hash[BLAKE2S_DIGEST_SIZE]; | 2408 | size_t pos; |
243 | Blake2s_Final(&p->S[i], hash); | 2409 | for (pos = 0; pos < SUPER_BLOCK_SIZE / 2; pos += Z7_BLAKE2S_BLOCK_SIZE / 2) |
244 | Blake2s_Update(&R, hash, BLAKE2S_DIGEST_SIZE); | 2410 | { |
2411 | const UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, (pos * 2)); | ||
2412 | Byte *dest = (Byte *)(void *)p->buf32 + pos; | ||
2413 | GET_DIGEST(s, dest) | ||
2414 | } | ||
245 | } | 2415 | } |
2416 | Blake2sp_Init_Spec(p->states, 0, 1); | ||
2417 | { | ||
2418 | size_t pos; | ||
2419 | for (pos = 0; pos < (Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2S_DIGEST_SIZE) | ||
2420 | - Z7_BLAKE2S_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE) | ||
2421 | { | ||
2422 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
2423 | (const Byte *)(const void *)p->buf32 + pos, | ||
2424 | (const Byte *)(const void *)p->buf32 + pos + Z7_BLAKE2S_BLOCK_SIZE); | ||
2425 | } | ||
2426 | } | ||
2427 | // Blake2s_Final(p->states, 0, digest, p, (Byte *)(void *)p->buf32 + i); | ||
2428 | Blake2s_Set_LastBlock(p->states) | ||
2429 | STATE_F(p->states)[1] = BLAKE2S_FINAL_FLAG; | ||
2430 | { | ||
2431 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
2432 | (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE, | ||
2433 | (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE); | ||
2434 | } | ||
2435 | GET_DIGEST(p->states, digest) | ||
2436 | // printf("\n Blake2sp_Final 555 numDataInBufs = %5u\n", (unsigned)p->u.header.numDataInBufs); | ||
2437 | } | ||
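Typical use of the public API defined in this file (a usage sketch, assuming the declarations from Blake2.h):

#include "Blake2.h"

static void hash_buffer(const Byte *data, size_t size,
                        Byte digest[Z7_BLAKE2S_DIGEST_SIZE])
{
  CBlake2sp p;
  // z7_Black2sp_Prepare() should have been called once at startup,
  // to select the CPU-specific compress functions.
  Blake2sp_Init(&p);
  Blake2sp_Update(&p, data, size);
  Blake2sp_Final(&p, digest);
}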
2438 | |||
246 | 2439 | ||
247 | Blake2s_Final(&R, digest); | 2440 | BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo) |
2441 | { | ||
2442 | // printf("\n========== setfunction = %d ======== \n", algo); | ||
2443 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS | ||
2444 | Z7_BLAKE2SP_FUNC_COMPRESS func = NULL; | ||
2445 | Z7_BLAKE2SP_FUNC_COMPRESS func_Single = NULL; | ||
2446 | Z7_BLAKE2SP_FUNC_INIT func_Final = NULL; | ||
2447 | Z7_BLAKE2SP_FUNC_INIT func_Init = NULL; | ||
2448 | #else | ||
2449 | UNUSED_VAR(p) | ||
2450 | #endif | ||
2451 | |||
2452 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
2453 | |||
2454 | func = func_Single = Blake2sp_Compress2; | ||
2455 | |||
2456 | if (algo != Z7_BLAKE2SP_ALGO_SCALAR) | ||
2457 | { | ||
2458 | // printf("\n========== setfunction NON-SCALAR ======== \n"); |
2459 | if (algo == Z7_BLAKE2SP_ALGO_DEFAULT) | ||
2460 | { | ||
2461 | func = g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast; | ||
2462 | func_Single = g_Z7_BLAKE2SP_FUNC_COMPRESS_Single; | ||
2463 | func_Init = g_Z7_BLAKE2SP_FUNC_INIT_Init; | ||
2464 | func_Final = g_Z7_BLAKE2SP_FUNC_INIT_Final; | ||
2465 | } | ||
2466 | else | ||
2467 | { | ||
2468 | if ((g_z7_Blake2sp_SupportedFlags & (1u << algo)) == 0) | ||
2469 | return False; | ||
2470 | |||
2471 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
2472 | |||
2473 | func_Single = | ||
2474 | #if defined(Z7_BLAKE2S_USE_AVX2_WAY2) | ||
2475 | Blake2sp_Compress2_AVX2_Way2; | ||
2476 | #else | ||
2477 | Z7_BLAKE2S_Compress2_V128; | ||
2478 | #endif | ||
2479 | |||
2480 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
2481 | if (algo == Z7_BLAKE2SP_ALGO_V256_FAST) | ||
2482 | { | ||
2483 | func = Blake2sp_Compress2_AVX2_Fast; | ||
2484 | func_Final = Blake2sp_Final_AVX2_Fast; | ||
2485 | func_Init = Blake2sp_InitState_AVX2_Fast; | ||
2486 | } | ||
2487 | else | ||
2488 | #endif | ||
2489 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY2 | ||
2490 | if (algo == Z7_BLAKE2SP_ALGO_V256_WAY2) | ||
2491 | func = Blake2sp_Compress2_AVX2_Way2; | ||
2492 | else | ||
2493 | #endif | ||
2494 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY4 | ||
2495 | if (algo == Z7_BLAKE2SP_ALGO_V256_WAY4) | ||
2496 | { | ||
2497 | func_Single = func = Blake2sp_Compress2_AVX2_Way4; | ||
2498 | } | ||
2499 | else | ||
2500 | #endif | ||
2501 | #endif // avx2 | ||
2502 | { | ||
2503 | if (algo == Z7_BLAKE2SP_ALGO_V128_FAST) | ||
2504 | { | ||
2505 | func = Blake2sp_Compress2_V128_Fast; | ||
2506 | func_Final = Blake2sp_Final_V128_Fast; | ||
2507 | func_Init = Blake2sp_InitState_V128_Fast; | ||
2508 | func_Single = Z7_BLAKE2S_Compress2_V128; | ||
2509 | } | ||
2510 | else | ||
2511 | #ifdef Z7_BLAKE2S_USE_V128_WAY2 | ||
2512 | if (algo == Z7_BLAKE2SP_ALGO_V128_WAY2) | ||
2513 | func = func_Single = Blake2sp_Compress2_V128_Way2; | ||
2514 | else | ||
2515 | #endif | ||
2516 | { | ||
2517 | if (algo != Z7_BLAKE2SP_ALGO_V128_WAY1) | ||
2518 | return False; | ||
2519 | func = func_Single = Blake2sp_Compress2_V128_Way1; | ||
2520 | } | ||
2521 | } | ||
2522 | } | ||
2523 | } | ||
2524 | #else // !VECTORS | ||
2525 | if (algo > 1) // Z7_BLAKE2SP_ALGO_SCALAR | ||
2526 | return False; | ||
2527 | #endif // !VECTORS | ||
2528 | |||
2529 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS | ||
2530 | p->u.header.func_Compress_Fast = func; | ||
2531 | p->u.header.func_Compress_Single = func_Single; | ||
2532 | p->u.header.func_Final = func_Final; | ||
2533 | p->u.header.func_Init = func_Init; | ||
2534 | #endif | ||
2535 | // printf("\n p->u.header.func_Compress = %p", p->u.header.func_Compress); | ||
2536 | return True; | ||
2537 | } | ||
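A usage sketch for forcing a particular code path (the algo constant is assumed to come from Blake2.h):

#include "Blake2.h"

static BoolInt force_scalar(CBlake2sp *p)
{
  Blake2sp_Init(p);
  // returns False if the requested algo is not supported on this CPU
  return Blake2sp_SetFunction(p, Z7_BLAKE2SP_ALGO_SCALAR);
}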
2538 | |||
2539 | |||
2540 | void z7_Black2sp_Prepare(void) | ||
2541 | { | ||
2542 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
2543 | unsigned flags = 0; // (1u << Z7_BLAKE2SP_ALGO_V128_SCALAR); | ||
2544 | |||
2545 | Z7_BLAKE2SP_FUNC_COMPRESS func_Fast = Blake2sp_Compress2; | ||
2546 | Z7_BLAKE2SP_FUNC_COMPRESS func_Single = Blake2sp_Compress2; | ||
2547 | Z7_BLAKE2SP_FUNC_INIT func_Init = NULL; | ||
2548 | Z7_BLAKE2SP_FUNC_INIT func_Final = NULL; | ||
2549 | |||
2550 | #if defined(MY_CPU_X86_OR_AMD64) | ||
2551 | #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) | ||
2552 | if (CPU_IsSupported_AVX512F_AVX512VL()) | ||
2553 | #endif | ||
2554 | #if defined(Z7_BLAKE2S_USE_SSE41) | ||
2555 | if (CPU_IsSupported_SSE41()) | ||
2556 | #elif defined(Z7_BLAKE2S_USE_SSSE3) | ||
2557 | if (CPU_IsSupported_SSSE3()) | ||
2558 | #elif !defined(MY_CPU_AMD64) | ||
2559 | if (CPU_IsSupported_SSE2()) | ||
2560 | #endif | ||
2561 | #endif | ||
2562 | { | ||
2563 | #if defined(Z7_BLAKE2S_USE_SSE41) | ||
2564 | // printf("\n========== Blake2s SSE41 128-bit\n"); | ||
2565 | #elif defined(Z7_BLAKE2S_USE_SSSE3) | ||
2566 | // printf("\n========== Blake2s SSSE3 128-bit\n"); | ||
2567 | #else | ||
2568 | // printf("\n========== Blake2s SSE2 128-bit\n"); | ||
2569 | #endif | ||
2570 | // func_Fast = f_vector = Blake2sp_Compress2_V128_Way2; | ||
2571 | // printf("\n========== Blake2sp_Compress2_V128_Way2\n"); | ||
2572 | func_Fast = | ||
2573 | func_Single = Z7_BLAKE2S_Compress2_V128; | ||
2574 | flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY1); | ||
2575 | #ifdef Z7_BLAKE2S_USE_V128_WAY2 | ||
2576 | flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY2); | ||
2577 | #endif | ||
2578 | #ifdef Z7_BLAKE2S_USE_V128_FAST | ||
2579 | flags |= (1u << Z7_BLAKE2SP_ALGO_V128_FAST); | ||
2580 | func_Fast = Blake2sp_Compress2_V128_Fast; | ||
2581 | func_Init = Blake2sp_InitState_V128_Fast; | ||
2582 | func_Final = Blake2sp_Final_V128_Fast; | ||
2583 | #endif | ||
2584 | |||
2585 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
2586 | #if defined(MY_CPU_X86_OR_AMD64) | ||
2587 | if ( | ||
2588 | #if 0 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) | ||
2589 | CPU_IsSupported_AVX512F_AVX512VL() && | ||
2590 | #endif | ||
2591 | CPU_IsSupported_AVX2() | ||
2592 | ) | ||
2593 | #endif | ||
2594 | { | ||
2595 | // #pragma message ("=== Blake2s AVX2") | ||
2596 | // printf("\n========== Blake2s AVX2\n"); | ||
2597 | |||
2598 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY2 | ||
2599 | func_Single = Blake2sp_Compress2_AVX2_Way2; | ||
2600 | flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY2); | ||
2601 | #endif | ||
2602 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY4 | ||
2603 | flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY4); | ||
2604 | #endif | ||
2605 | |||
2606 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
2607 | flags |= (1u << Z7_BLAKE2SP_ALGO_V256_FAST); | ||
2608 | func_Fast = Blake2sp_Compress2_AVX2_Fast; | ||
2609 | func_Init = Blake2sp_InitState_AVX2_Fast; | ||
2610 | func_Final = Blake2sp_Final_AVX2_Fast; | ||
2611 | #elif defined(Z7_BLAKE2S_USE_AVX2_WAY4) | ||
2612 | func_Fast = Blake2sp_Compress2_AVX2_Way4; | ||
2613 | #elif defined(Z7_BLAKE2S_USE_AVX2_WAY2) | ||
2614 | func_Fast = Blake2sp_Compress2_AVX2_Way2; | ||
2615 | #endif | ||
2616 | } // avx2 | ||
2617 | #endif // avx2 | ||
2618 | } // sse* | ||
2619 | g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = func_Fast; | ||
2620 | g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = func_Single; | ||
2621 | g_Z7_BLAKE2SP_FUNC_INIT_Init = func_Init; | ||
2622 | g_Z7_BLAKE2SP_FUNC_INIT_Final = func_Final; | ||
2623 | g_z7_Blake2sp_SupportedFlags = flags; | ||
2624 | // printf("\nflags=%x\n", flags); | ||
2625 | #endif // vectors | ||
248 | } | 2626 | } |
249 | 2627 | ||
250 | #undef rotr32 | 2628 | /* |
2629 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
2630 | void align_test2(CBlake2sp *sp); | ||
2631 | void align_test2(CBlake2sp *sp) | ||
2632 | { | ||
2633 | __m128i a = LOAD_128(sp->states); | ||
2634 | D_XOR_128(a, LOAD_128(sp->states + 4)); | ||
2635 | STORE_128(sp->states, a); | ||
2636 | } | ||
2637 | void align_test2(void); | ||
2638 | void align_test2(void) | ||
2639 | { | ||
2640 | CBlake2sp sp; | ||
2641 | Blake2sp_Init(&sp); | ||
2642 | Blake2sp_Update(&sp, NULL, 0); | ||
2643 | } | ||
2644 | #endif | ||
2645 | */ | ||