path: root/C/Blake2s.c
Diffstat (limited to 'C/Blake2s.c')
-rw-r--r--  C/Blake2s.c  2693
1 file changed, 2544 insertions, 149 deletions
diff --git a/C/Blake2s.c b/C/Blake2s.c
index 2a84b57..459e76b 100644
--- a/C/Blake2s.c
+++ b/C/Blake2s.c
@@ -1,250 +1,2645 @@
1/* Blake2s.c -- BLAKE2s and BLAKE2sp Hash
22023-03-04 : Igor Pavlov : Public domain
32015 : Samuel Neves : Public domain */
 1/* Blake2s.c -- BLAKE2sp Hash
 22024-01-29 : Igor Pavlov : Public domain
 32015-2019 : Samuel Neves : original code : CC0 1.0 Universal (CC0 1.0). */
4 4
5#include "Precomp.h" 5#include "Precomp.h"
6 6
7// #include <stdio.h>
7#include <string.h> 8#include <string.h>
8 9
9#include "Blake2.h" 10#include "Blake2.h"
10#include "CpuArch.h"
11#include "RotateDefs.h" 11#include "RotateDefs.h"
12#include "Compiler.h"
13#include "CpuArch.h"
14
15#if defined(__SSE2__)
16 #define Z7_BLAKE2S_USE_VECTORS
17#elif defined(MY_CPU_X86_OR_AMD64)
18 #if defined(_MSC_VER) && _MSC_VER > 1200 \
19 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
20 || defined(__clang__) \
21 || defined(__INTEL_COMPILER)
22 #define Z7_BLAKE2S_USE_VECTORS
23 #endif
24#endif
25
26#ifdef Z7_BLAKE2S_USE_VECTORS
27
28#define Z7_BLAKE2SP_USE_FUNCTIONS
29
30// define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED, if CBlake2sp can be unaligned (not 32-byte aligned).
31// #define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
32
33// SSSE3 : for _mm_shuffle_epi8 (pshufb), which improves performance by 5-15%.
34#if defined(__SSSE3__)
35 #define Z7_BLAKE2S_USE_SSSE3
36#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
37 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
38 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
39 || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
40 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
41 #define Z7_BLAKE2S_USE_SSSE3
42#endif
43
44#ifdef Z7_BLAKE2S_USE_SSSE3
45/* SSE41 : for _mm_insert_epi32 (pinsrd)
46 it can slightly reduce code size and improve performance in some cases.
47 it's used only for the last 512-1024 bytes, if the FAST versions (2 or 3) of the vector algos are used.
48 it can be used for all blocks in the other algos (4+).
49*/
50#if defined(__SSE4_1__)
51 #define Z7_BLAKE2S_USE_SSE41
52#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
53 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
54 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
55 || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
56 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
57 #define Z7_BLAKE2S_USE_SSE41
58#endif
59#endif // SSSE3
60
61#if defined(__GNUC__) || defined(__clang__)
62 #if defined(Z7_BLAKE2S_USE_SSE41)
63 #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse4.1")))
64 #elif defined(Z7_BLAKE2S_USE_SSSE3)
65 #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("ssse3")))
66 #else
67 #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse2")))
68 #endif
69#endif
70
71
72#if defined(__AVX2__)
73 #define Z7_BLAKE2S_USE_AVX2
74#else
75 #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
76 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
77 || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100)
78 #define Z7_BLAKE2S_USE_AVX2
79 #ifdef Z7_BLAKE2S_USE_AVX2
80 #define BLAKE2S_ATTRIB_AVX2 __attribute__((__target__("avx2")))
81 #endif
82 #elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
83 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
84 #if (Z7_MSC_VER_ORIGINAL == 1900)
85 #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
86 #endif
87 #define Z7_BLAKE2S_USE_AVX2
88 #endif
89#endif
90
91#ifdef Z7_BLAKE2S_USE_SSE41
92#include <smmintrin.h> // SSE4.1
93#elif defined(Z7_BLAKE2S_USE_SSSE3)
94#include <tmmintrin.h> // SSSE3
95#else
96#include <emmintrin.h> // SSE2
97#endif
98
99#ifdef Z7_BLAKE2S_USE_AVX2
100#include <immintrin.h>
101#if defined(__clang__)
102#include <avxintrin.h>
103#include <avx2intrin.h>
104#endif
105#endif // avx2
106
107
108#if defined(__AVX512F__) && defined(__AVX512VL__)
109 // && defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL > 1930)
110 #define Z7_BLAKE2S_USE_AVX512_ALWAYS
111 // #pragma message ("=== Blake2s AVX512")
112#endif
12 113
13#define rotr32 rotrFixed
14 114
15#define BLAKE2S_NUM_ROUNDS 10 115#define Z7_BLAKE2S_USE_V128_FAST
16#define BLAKE2S_FINAL_FLAG (~(UInt32)0) 116// for speed optimization for small messages:
117// #define Z7_BLAKE2S_USE_V128_WAY2
17 118
119#ifdef Z7_BLAKE2S_USE_AVX2
120
121// for debug:
122// gather is slow
123// #define Z7_BLAKE2S_USE_GATHER
124
125 #define Z7_BLAKE2S_USE_AVX2_FAST
126// for speed optimization for small messages:
127// #define Z7_BLAKE2S_USE_AVX2_WAY2
128// #define Z7_BLAKE2S_USE_AVX2_WAY4
129#if defined(Z7_BLAKE2S_USE_AVX2_WAY2) || \
130 defined(Z7_BLAKE2S_USE_AVX2_WAY4)
131 #define Z7_BLAKE2S_USE_AVX2_WAY_SLOW
132#endif
133#endif
134
135 #define Z7_BLAKE2SP_ALGO_DEFAULT 0
136 #define Z7_BLAKE2SP_ALGO_SCALAR 1
137#ifdef Z7_BLAKE2S_USE_V128_FAST
138 #define Z7_BLAKE2SP_ALGO_V128_FAST 2
139#endif
140#ifdef Z7_BLAKE2S_USE_AVX2_FAST
141 #define Z7_BLAKE2SP_ALGO_V256_FAST 3
142#endif
143 #define Z7_BLAKE2SP_ALGO_V128_WAY1 4
144#ifdef Z7_BLAKE2S_USE_V128_WAY2
145 #define Z7_BLAKE2SP_ALGO_V128_WAY2 5
146#endif
147#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
148 #define Z7_BLAKE2SP_ALGO_V256_WAY2 6
149#endif
150#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
151 #define Z7_BLAKE2SP_ALGO_V256_WAY4 7
152#endif
153
154#endif // Z7_BLAKE2S_USE_VECTORS
155
156
157
158
159#define BLAKE2S_FINAL_FLAG (~(UInt32)0)
160#define NSW Z7_BLAKE2SP_NUM_STRUCT_WORDS
161#define SUPER_BLOCK_SIZE (Z7_BLAKE2S_BLOCK_SIZE * Z7_BLAKE2SP_PARALLEL_DEGREE)
162#define SUPER_BLOCK_MASK (SUPER_BLOCK_SIZE - 1)
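/* Reference sketch, assuming the standard BLAKE2sp parallel degree of 8 leaves:
   SUPER_BLOCK_SIZE = 8 * 64 = 512 bytes and SUPER_BLOCK_MASK = 511, so the
   "pos &= SUPER_BLOCK_MASK" steps below walk the leaf states in round-robin order. */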
163
164#define V_INDEX_0_0 0
165#define V_INDEX_1_0 1
166#define V_INDEX_2_0 2
167#define V_INDEX_3_0 3
168#define V_INDEX_0_1 4
169#define V_INDEX_1_1 5
170#define V_INDEX_2_1 6
171#define V_INDEX_3_1 7
172#define V_INDEX_0_2 8
173#define V_INDEX_1_2 9
174#define V_INDEX_2_2 10
175#define V_INDEX_3_2 11
176#define V_INDEX_0_3 12
177#define V_INDEX_1_3 13
178#define V_INDEX_2_3 14
179#define V_INDEX_3_3 15
180#define V_INDEX_4_0 0
181#define V_INDEX_5_0 1
182#define V_INDEX_6_0 2
183#define V_INDEX_7_0 3
184#define V_INDEX_7_1 4
185#define V_INDEX_4_1 5
186#define V_INDEX_5_1 6
187#define V_INDEX_6_1 7
188#define V_INDEX_6_2 8
189#define V_INDEX_7_2 9
190#define V_INDEX_4_2 10
191#define V_INDEX_5_2 11
192#define V_INDEX_5_3 12
193#define V_INDEX_6_3 13
194#define V_INDEX_7_3 14
195#define V_INDEX_4_3 15
196
197#define V(row, col) v[V_INDEX_ ## row ## _ ## col]
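/* Reference sketch (for clarity): with the V_INDEX mapping above, V(row, col)
   selects the same quartets as the classic scalar round: rows 0-3 are the
   column G steps and rows 4-7 are the diagonal G steps of BLAKE2s:
     row 0: v[0] v[4] v[ 8] v[12]      row 4: v[0] v[5] v[10] v[15]
     row 1: v[1] v[5] v[ 9] v[13]      row 5: v[1] v[6] v[11] v[12]
     row 2: v[2] v[6] v[10] v[14]      row 6: v[2] v[7] v[ 8] v[13]
     row 3: v[3] v[7] v[11] v[15]      row 7: v[3] v[4] v[ 9] v[14]
*/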
198
199#define k_Blake2s_IV_0 0x6A09E667UL
200#define k_Blake2s_IV_1 0xBB67AE85UL
201#define k_Blake2s_IV_2 0x3C6EF372UL
202#define k_Blake2s_IV_3 0xA54FF53AUL
203#define k_Blake2s_IV_4 0x510E527FUL
204#define k_Blake2s_IV_5 0x9B05688CUL
205#define k_Blake2s_IV_6 0x1F83D9ABUL
206#define k_Blake2s_IV_7 0x5BE0CD19UL
207
208#define KIV(n) (k_Blake2s_IV_## n)
209
210#ifdef Z7_BLAKE2S_USE_VECTORS
211MY_ALIGN(16)
18static const UInt32 k_Blake2s_IV[8] = 212static const UInt32 k_Blake2s_IV[8] =
19{ 213{
20 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, 214 KIV(0), KIV(1), KIV(2), KIV(3), KIV(4), KIV(5), KIV(6), KIV(7)
21 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
22}; 215};
216#endif
23 217
24static const Byte k_Blake2s_Sigma[BLAKE2S_NUM_ROUNDS][16] = 218#define STATE_T(s) ((s) + 8)
25{ 219#define STATE_F(s) ((s) + 10)
26 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , 220
27 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , 221#ifdef Z7_BLAKE2S_USE_VECTORS
28 { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
29 { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
30 { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
31 { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
32 { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
33 { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
34 { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
35 { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
36};
37 222
223#define LOAD_128(p) _mm_load_si128 ((const __m128i *)(const void *)(p))
224#define LOADU_128(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
225#ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
226 // here we use unaligned loads and stores
227 // use this branch if CBlake2sp can be unaligned (not 16-byte aligned)
228 #define STOREU_128(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r)
229 #define LOAD_128_FROM_STRUCT(p) LOADU_128(p)
230 #define STORE_128_TO_STRUCT(p, r) STOREU_128(p, r)
231#else
232 // here we use aligned loads and stores
233 // use this branch if CBlake2sp is 16-byte aligned
234 #define STORE_128(p, r) _mm_store_si128((__m128i *)(void *)(p), r)
235 #define LOAD_128_FROM_STRUCT(p) LOAD_128(p)
236 #define STORE_128_TO_STRUCT(p, r) STORE_128(p, r)
237#endif
38 238
39static void Blake2s_Init0(CBlake2s *p) 239#endif // Z7_BLAKE2S_USE_VECTORS
240
241
242#if 0
243static void PrintState(const UInt32 *s, unsigned num)
244{
245 unsigned i;
246 printf("\n");
247 for (i = 0; i < num; i++)
248 printf(" %08x", (unsigned)s[i]);
249}
250static void PrintStates2(const UInt32 *s, unsigned x, unsigned y)
40{ 251{
41 unsigned i; 252 unsigned i;
42 for (i = 0; i < 8; i++) 253 for (i = 0; i < y; i++)
43 p->h[i] = k_Blake2s_IV[i]; 254 PrintState(s + i * x, x);
44 p->t[0] = 0; 255 printf("\n");
45 p->t[1] = 0;
46 p->f[0] = 0;
47 p->f[1] = 0;
48 p->bufPos = 0;
49 p->lastNode_f1 = 0;
50} 256}
257#endif
258
259
260#define REP8_MACRO(m) { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) }
261
262#define BLAKE2S_NUM_ROUNDS 10
263
264#if defined(Z7_BLAKE2S_USE_VECTORS)
265#define ROUNDS_LOOP(mac) \
266 { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) mac(r) }
267#endif
268/*
269#define ROUNDS_LOOP_2(mac) \
270 { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r += 2) { mac(r) mac(r + 1) } }
271*/
272#if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
273#define ROUNDS_LOOP_UNROLLED(m) \
274 { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) }
275#endif
276
277#define SIGMA_TABLE(M) \
278 M( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ), \
279 M( 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 ), \
280 M( 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 ), \
281 M( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 ), \
282 M( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 ), \
283 M( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 ), \
284 M( 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 ), \
285 M( 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 ), \
286 M( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 ), \
287 M( 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 )
288
289#define SIGMA_TABLE_MULT(m, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
290 { a0*m,a1*m,a2*m,a3*m,a4*m,a5*m,a6*m,a7*m,a8*m,a9*m,a10*m,a11*m,a12*m,a13*m,a14*m,a15*m }
291#define SIGMA_TABLE_MULT_4( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
292 SIGMA_TABLE_MULT(4, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
293
294// MY_ALIGN(32)
295MY_ALIGN(16)
296static const Byte k_Blake2s_Sigma_4[BLAKE2S_NUM_ROUNDS][16] =
297 { SIGMA_TABLE(SIGMA_TABLE_MULT_4) };
298
299#define GET_SIGMA_PTR(p, index) \
300 ((const void *)((const Byte *)(const void *)(p) + (index)))
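/* Reference sketch (for clarity): k_Blake2s_Sigma_4 stores the BLAKE2s sigma
   permutation with every index pre-multiplied by 4, so each entry is already the
   byte offset of a 32-bit word inside the 64-byte message block. Loading a word
   through GET_SIGMA_PTR is therefore equivalent (on little-endian x86) to the
   scalar access GetUi32(input + 4 * sigma[i]) used by the removed scalar code. */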
51 301
302#define GET_STATE_TABLE_PTR_FROM_BYTE_POS(s, pos) \
303 ((UInt32 *)(void *)((Byte *)(void *)(s) + (pos)))
52 304
53static void Blake2s_Compress(CBlake2s *p) 305
306#ifdef Z7_BLAKE2S_USE_VECTORS
307
308
309#if 0
310 // loading the constants from memory
311 // is faster for some compilers.
312 #define KK4(n) KIV(n), KIV(n), KIV(n), KIV(n)
313MY_ALIGN(64)
314static const UInt32 k_Blake2s_IV_WAY4[]=
54{ 315{
55 UInt32 m[16]; 316 KK4(0), KK4(1), KK4(2), KK4(3), KK4(4), KK4(5), KK4(6), KK4(7)
56 UInt32 v[16]; 317};
57 318 #define GET_128_IV_WAY4(i) LOAD_128(k_Blake2s_IV_WAY4 + 4 * (i))
319#else
320 // use constant generation:
321 #define GET_128_IV_WAY4(i) _mm_set1_epi32((Int32)KIV(i))
322#endif
323
324
325#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
326#define GET_CONST_128_FROM_ARRAY32(k) \
327 _mm_set_epi32((Int32)(k)[3], (Int32)(k)[2], (Int32)(k)[1], (Int32)(k)[0])
328#endif
329
330
331#if 0
332#define k_r8 _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
333#define k_r16 _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
334#define k_inc _mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)
335#define k_iv0_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0)
336#define k_iv4_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4)
337#else
338#if defined(Z7_BLAKE2S_USE_SSSE3) && \
339 !defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
340MY_ALIGN(16) static const Byte k_r8_arr [16] = { 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12 };
341MY_ALIGN(16) static const Byte k_r16_arr[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 };
342#define k_r8 LOAD_128(k_r8_arr)
343#define k_r16 LOAD_128(k_r16_arr)
344#endif
345MY_ALIGN(16) static const UInt32 k_inc_arr[4] = { Z7_BLAKE2S_BLOCK_SIZE, 0, 0, 0 };
346#define k_inc LOAD_128(k_inc_arr)
347#define k_iv0_128 LOAD_128(k_Blake2s_IV + 0)
348#define k_iv4_128 LOAD_128(k_Blake2s_IV + 4)
349#endif
350
351
352#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
353
354#ifdef Z7_BLAKE2S_USE_AVX2
355#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
356 #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
357#else
358 #define MY_mm256_set_m128i _mm256_set_m128i
359#endif
360
361#define SET_FROM_128(a) MY_mm256_set_m128i(a, a)
362
363#ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS
364MY_ALIGN(32) static const Byte k_r8_arr_256 [32] =
365{
366 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12,
367 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12
368};
369MY_ALIGN(32) static const Byte k_r16_arr_256[32] =
370{
371 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13,
372 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
373};
374#define k_r8_256 LOAD_256(k_r8_arr_256)
375#define k_r16_256 LOAD_256(k_r16_arr_256)
376#endif
377
378// #define k_r8_256 SET_FROM_128(_mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1))
379// #define k_r16_256 SET_FROM_128(_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2))
380// #define k_inc_256 SET_FROM_128(_mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE))
381// #define k_iv0_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0))
382#define k_iv4_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4))
383#endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
384#endif
385
386
387/*
388IPC(TP) ports:
3891 p__5 : skl- : SSE : shufps : _mm_shuffle_ps
3902 p_15 : icl+
3911 p__5 : nhm-bdw : SSE : xorps : _mm_xor_ps
3923 p015 : skl+
393
3943 p015 : SSE2 : pxor : _mm_xor_si128
3952 p_15: snb-bdw : SSE2 : padd : _mm_add_epi32
3962 p0_5: mrm-wsm :
3973 p015 : skl+
398
3992 p_15 : ivb-,icl+ : SSE2 : punpcklqdq, punpckhqdq, punpckldq, punpckhdq
4002 p_15 : : SSE2 : pshufd : _mm_shuffle_epi32
4012 p_15 : : SSE2 : pshuflw : _mm_shufflelo_epi16
4022 p_15 : : SSE2 : psrldq :
4032 p_15 : : SSE3 : pshufb : _mm_shuffle_epi8
4042 p_15 : : SSE4 : pblendw : _mm_blend_epi16
4051 p__5 : hsw-skl : *
406
4071 p0 : SSE2 : pslld (i8) : _mm_slli_si128
4082 p01 : skl+ :
409
4102 p_15 : ivb- : SSE3 : palignr
4111 p__5 : hsw+
412
4132 p_15 + p23 : ivb-, icl+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, m32, i8)
4141 p__5 + p23 : hsw-skl
4151 p_15 + p5 : ivb-, ice+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, r32, i8)
4160.5 2*p5 : hsw-skl
417
4182 p23 : SSE2 : movd (m32)
4193 p23A : adl :
4201 p5: : SSE2 : movd (r32)
421*/
422
423#if 0 && defined(__XOP__)
424// we must debug and test the __XOP__ instructions
425#include <x86intrin.h>
426#include <ammintrin.h>
427 #define LOAD_ROTATE_CONSTS
428 #define MM_ROR_EPI32(r, c) _mm_roti_epi32(r, -(c))
429 #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
430#elif 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
431 #define LOAD_ROTATE_CONSTS
432 #define MM_ROR_EPI32(r, c) _mm_ror_epi32(r, c)
433 #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
434#else
435
436// MSVC_1937+ uses the "orps" instruction for _mm_or_si128().
437// But "orps" has low throughput: TP=1 for bdw-nhm.
438// So it can be better to use _mm_add_epi32()/"paddd" (TP=2 for bdw-nhm) instead of "orps".
439// But "orps" is fast for modern CPUs (skl+).
440// So we default to the "or" version:
441#if 0 || 0 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1937
442 // minor optimization for some old cpus, if "xorps" is slow.
443 #define MM128_EPI32_OR_or_ADD _mm_add_epi32
444#else
445 #define MM128_EPI32_OR_or_ADD _mm_or_si128
446#endif
447
448 #define MM_ROR_EPI32_VIA_SHIFT(r, c)( \
449 MM128_EPI32_OR_or_ADD( \
450 _mm_srli_epi32((r), (c)), \
451 _mm_slli_epi32((r), 32-(c))))
452 #if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
453 #define LOAD_ROTATE_CONSTS \
454 const __m128i r8 = k_r8; \
455 const __m128i r16 = k_r16;
456 #define MM_ROR_EPI32(r, c) ( \
457 ( 8==(c)) ? _mm_shuffle_epi8(r,r8) \
458 : (16==(c)) ? _mm_shuffle_epi8(r,r16) \
459 : MM_ROR_EPI32_VIA_SHIFT(r, c))
460 #else
461 #define LOAD_ROTATE_CONSTS
462 #define MM_ROR_EPI32(r, c) ( \
463 (16==(c)) ? _mm_shufflehi_epi16(_mm_shufflelo_epi16(r, 0xb1), 0xb1) \
464 : MM_ROR_EPI32_VIA_SHIFT(r, c))
465 #endif
466#endif
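/* Reference sketch (per 32-bit lane, for clarity): every MM_ROR_EPI32 variant
   above is an ordinary rotate-right. The shift form computes
     (x >> c) | (x << (32 - c))
   (OR and ADD give the same result here because the two shifted halves have no
   overlapping bits). The pshufb form relies on the fact that, for a little-endian
   lane with bytes b0..b3, rotate-right by 8 is the byte permutation {1,2,3,0}
   (k_r8_arr) and rotate-right by 16 is {2,3,0,1} (k_r16_arr). */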
467
468/*
469we have 3 main ways to load 4 32-bit integers to __m128i:
470 1) SSE2: _mm_set_epi32()
471 2) SSE2: _mm_unpacklo_epi64() / _mm_unpacklo_epi32 / _mm_cvtsi32_si128()
472 3) SSE41: _mm_insert_epi32() and _mm_cvtsi32_si128()
473a good compiler generates these instructions for _mm_set_epi32():
474{
475 movd xmm, [m32]; vpunpckldq; vpunpckldq; vpunpcklqdq;
476}
477a good newer compiler generates one instruction:
478{
479 for _mm_insert_epi32() : { pinsrd xmm, [m32], i }
480 for _mm_cvtsi32_si128() : { movd xmm, [m32] }
481}
482but vc2010 generates a slow pair of instructions:
483{
484 for _mm_insert_epi32() : { mov r32, [m32]; pinsrd xmm, r32, i }
485 for _mm_cvtsi32_si128() : { mov r32, [m32]; movd xmm, r32 }
486}
487_mm_insert_epi32() (pinsrd) code reduces xmm register pressure
488in comparison with _mm_set_epi32() (movd + vpunpckld) code.
489Note that the variant with "movd xmm, r32" can be slower,
490but register pressure can be more important.
491So we can force "pinsrd" always.
492*/
493// #if !defined(Z7_MSC_VER_ORIGINAL) || Z7_MSC_VER_ORIGINAL > 1600 || defined(MY_CPU_X86)
494#ifdef Z7_BLAKE2S_USE_SSE41
495 /* _mm_set_epi32() can be more effective for GCC and CLANG
496 _mm_insert_epi32() is more effective for MSVC */
497 #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
498 #define Z7_BLAKE2S_USE_INSERT_INSTRUCTION
499 #endif
500#endif // USE_SSE41
501// #endif
502
503#ifdef Z7_BLAKE2S_USE_INSERT_INSTRUCTION
504 // for SSE4.1
505#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
506 _mm_insert_epi32( \
507 _mm_insert_epi32( \
508 _mm_insert_epi32( \
509 _mm_cvtsi32_si128( \
510 *(const Int32 *)p0), \
511 *(const Int32 *)p1, 1), \
512 *(const Int32 *)p2, 2), \
513 *(const Int32 *)p3, 3)
514#elif 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
515/* MSVC 1400 implements _mm_set_epi32() via slow memory write/read.
516 Also _mm_unpacklo_epi32 is more effective for other MSVC compilers.
517 But _mm_set_epi32() is more effective for GCC and CLANG.
518 So we use _mm_unpacklo_epi32 for MSVC only */
519#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
520 _mm_unpacklo_epi64( \
521 _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p0), \
522 _mm_cvtsi32_si128(*(const Int32 *)p1)), \
523 _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p2), \
524 _mm_cvtsi32_si128(*(const Int32 *)p3)))
525#else
526#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
527 _mm_set_epi32( \
528 *(const Int32 *)p3, \
529 *(const Int32 *)p2, \
530 *(const Int32 *)p1, \
531 *(const Int32 *)p0)
532#endif
533
534#define SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3) \
535 MM_LOAD_EPI32_FROM_4_POINTERS( \
536 GET_SIGMA_PTR(input, i0), \
537 GET_SIGMA_PTR(input, i1), \
538 GET_SIGMA_PTR(input, i2), \
539 GET_SIGMA_PTR(input, i3))
540
541#define SET_ROW_FROM_SIGMA(input, sigma_index) \
542 SET_ROW_FROM_SIGMA_BASE(input, \
543 sigma[(sigma_index) ], \
544 sigma[(sigma_index) + 2 * 1], \
545 sigma[(sigma_index) + 2 * 2], \
546 sigma[(sigma_index) + 2 * 3]) \
547
548
549#define ADD_128(a, b) _mm_add_epi32(a, b)
550#define XOR_128(a, b) _mm_xor_si128(a, b)
551
552#define D_ADD_128(dest, src) dest = ADD_128(dest, src)
553#define D_XOR_128(dest, src) dest = XOR_128(dest, src)
554#define D_ROR_128(dest, shift) dest = MM_ROR_EPI32(dest, shift)
555#define D_ADD_EPI64_128(dest, src) dest = _mm_add_epi64(dest, src)
556
557
558#define AXR(a, b, d, shift) \
559 D_ADD_128(a, b); \
560 D_XOR_128(d, a); \
561 D_ROR_128(d, shift);
562
563#define AXR2(a, b, c, d, input, sigma_index, shift1, shift2) \
564 a = _mm_add_epi32 (a, SET_ROW_FROM_SIGMA(input, sigma_index)); \
565 AXR(a, b, d, shift1) \
566 AXR(c, d, b, shift2)
567
568#define ROTATE_WORDS_TO_RIGHT(a, n) \
569 a = _mm_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
570
571#define AXR4(a, b, c, d, input, sigma_index) \
572 AXR2(a, b, c, d, input, sigma_index, 16, 12) \
573 AXR2(a, b, c, d, input, sigma_index + 1, 8, 7) \
574
575#define RR2(a, b, c, d, input) \
576 { \
577 AXR4(a, b, c, d, input, 0) \
578 ROTATE_WORDS_TO_RIGHT(b, 1) \
579 ROTATE_WORDS_TO_RIGHT(c, 2) \
580 ROTATE_WORDS_TO_RIGHT(d, 3) \
581 AXR4(a, b, c, d, input, 8) \
582 ROTATE_WORDS_TO_RIGHT(b, 3) \
583 ROTATE_WORDS_TO_RIGHT(c, 2) \
584 ROTATE_WORDS_TO_RIGHT(d, 1) \
585 }
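/* Reference sketch (for clarity): RR2 is one full BLAKE2s round with the four
   state rows held in a/b/c/d. Each AXR4 applies the scalar G function to all
   four columns at once:
     a += b + m[sigma[2*i]];   d ^= a; d = rotr32(d, 16); c += d; b ^= c; b = rotr32(b, 12);
     a += b + m[sigma[2*i+1]]; d ^= a; d = rotr32(d, 8);  c += d; b ^= c; b = rotr32(b, 7);
   The ROTATE_WORDS_TO_RIGHT calls shuffle the lanes of b/c/d so that the same
   column code performs the diagonal G steps, and then shuffle them back. */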
586
587
588/*
589Way1:
590per 64-byte block:
59110 rounds * 4 iters * (7 + 2) = 360 cycles, if pslld TP=1
592 * (7 + 1) = 320 cycles, if pslld TP=2 (skl+)
593additional operations per 7_op_iter :
5944 movzx byte mem
5951 movd mem
5963 pinsrd mem
5971.5 pshufd
598*/
599
600static
601#if 0 || 0 && (defined(Z7_BLAKE2S_USE_V128_WAY2) || \
602 defined(Z7_BLAKE2S_USE_V256_WAY2))
603 Z7_NO_INLINE
604#else
605 Z7_FORCE_INLINE
606#endif
607#ifdef BLAKE2S_ATTRIB_128BIT
608 BLAKE2S_ATTRIB_128BIT
609#endif
610void
611Z7_FASTCALL
612Blake2s_Compress_V128_Way1(UInt32 * const s, const Byte * const input)
613{
614 __m128i a, b, c, d;
615 __m128i f0, f1;
616
617 LOAD_ROTATE_CONSTS
618 d = LOAD_128_FROM_STRUCT(STATE_T(s));
619 c = k_iv0_128;
620 a = f0 = LOAD_128_FROM_STRUCT(s);
621 b = f1 = LOAD_128_FROM_STRUCT(s + 4);
622 D_ADD_EPI64_128(d, k_inc);
623 STORE_128_TO_STRUCT (STATE_T(s), d);
624 D_XOR_128(d, k_iv4_128);
625
626#define RR(r) { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
627 RR2(a, b, c, d, input) }
628
629 ROUNDS_LOOP(RR)
630#undef RR
631
632 STORE_128_TO_STRUCT(s , XOR_128(f0, XOR_128(a, c)));
633 STORE_128_TO_STRUCT(s + 4, XOR_128(f1, XOR_128(b, d)));
634}
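/* Reference sketch (for clarity): per 64-byte block this computes the standard
   BLAKE2s compression, matching the removed scalar Blake2s_Compress():
     v[0..7]  = h[0..7];  v[8..11] = IV[0..3];
     t += 64;                                  // 64-bit counter stored at STATE_T(s)
     v[12] = t[0] ^ IV[4];  v[13] = t[1] ^ IV[5];
     v[14] = f[0] ^ IV[6];  v[15] = f[1] ^ IV[7];
     10 rounds of RR;
     h[i] ^= v[i] ^ v[i + 8];                  // i = 0..7
*/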
635
636
637static
638Z7_NO_INLINE
639#ifdef BLAKE2S_ATTRIB_128BIT
640 BLAKE2S_ATTRIB_128BIT
641#endif
642void
643Z7_FASTCALL
644Blake2sp_Compress2_V128_Way1(UInt32 *s_items, const Byte *data, const Byte *end)
645{
646 size_t pos = 0;
647 do
58 { 648 {
59 unsigned i; 649 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
650 Blake2s_Compress_V128_Way1(s, data);
651 data += Z7_BLAKE2S_BLOCK_SIZE;
652 pos += Z7_BLAKE2S_BLOCK_SIZE;
653 pos &= SUPER_BLOCK_MASK;
654 }
655 while (data != end);
656}
657
658
659#if defined(Z7_BLAKE2S_USE_V128_WAY2) || \
660 defined(Z7_BLAKE2S_USE_AVX2_WAY2)
661#if 1
662 #define Z7_BLAKE2S_CompressSingleBlock(s, data) \
663 Blake2sp_Compress2_V128_Way1(s, data, \
664 (const Byte *)(const void *)(data) + Z7_BLAKE2S_BLOCK_SIZE)
665#else
666 #define Z7_BLAKE2S_CompressSingleBlock Blake2s_Compress_V128_Way1
667#endif
668#endif
669
670
671#if (defined(Z7_BLAKE2S_USE_AVX2_WAY_SLOW) || \
672 defined(Z7_BLAKE2S_USE_V128_WAY2)) && \
673 !defined(Z7_BLAKE2S_USE_GATHER)
674#define AXR2_LOAD_INDEXES(sigma_index) \
675 const unsigned i0 = sigma[(sigma_index)]; \
676 const unsigned i1 = sigma[(sigma_index) + 2 * 1]; \
677 const unsigned i2 = sigma[(sigma_index) + 2 * 2]; \
678 const unsigned i3 = sigma[(sigma_index) + 2 * 3]; \
679
680#define SET_ROW_FROM_SIGMA_W(input) \
681 SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3)
682#endif
683
684
685#ifdef Z7_BLAKE2S_USE_V128_WAY2
686
687#if 1 || !defined(Z7_BLAKE2S_USE_SSE41)
688/* we use SET_ROW_FROM_SIGMA_BASE, which uses
689 (SSE4) _mm_insert_epi32(), if Z7_BLAKE2S_USE_INSERT_INSTRUCTION is defined
690 (SSE2) _mm_set_epi32()
691 MSVC can be faster with this branch:
692*/
693#define AXR2_W(sigma_index, shift1, shift2) \
694 { \
695 AXR2_LOAD_INDEXES(sigma_index) \
696 a0 = _mm_add_epi32(a0, SET_ROW_FROM_SIGMA_W(data)); \
697 a1 = _mm_add_epi32(a1, SET_ROW_FROM_SIGMA_W(data + Z7_BLAKE2S_BLOCK_SIZE)); \
698 AXR(a0, b0, d0, shift1) \
699 AXR(a1, b1, d1, shift1) \
700 AXR(c0, d0, b0, shift2) \
701 AXR(c1, d1, b1, shift2) \
702 }
703#else
704/* we use interleaved _mm_insert_epi32():
705 GCC can be faster for this branch:
706*/
707#define AXR2_W_PRE_INSERT(sigma_index, i) \
708 { const unsigned ii = sigma[(sigma_index) + i * 2]; \
709 t0 = _mm_insert_epi32(t0, *(const Int32 *)GET_SIGMA_PTR(data, ii), i); \
710 t1 = _mm_insert_epi32(t1, *(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii), i); \
711 }
712#define AXR2_W(sigma_index, shift1, shift2) \
713 { __m128i t0, t1; \
714 { const unsigned ii = sigma[sigma_index]; \
715 t0 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, ii)); \
716 t1 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii)); \
717 } \
718 AXR2_W_PRE_INSERT(sigma_index, 1) \
719 AXR2_W_PRE_INSERT(sigma_index, 2) \
720 AXR2_W_PRE_INSERT(sigma_index, 3) \
721 a0 = _mm_add_epi32(a0, t0); \
722 a1 = _mm_add_epi32(a1, t1); \
723 AXR(a0, b0, d0, shift1) \
724 AXR(a1, b1, d1, shift1) \
725 AXR(c0, d0, b0, shift2) \
726 AXR(c1, d1, b1, shift2) \
727 }
728#endif
729
730
731#define AXR4_W(sigma_index) \
732 AXR2_W(sigma_index, 16, 12) \
733 AXR2_W(sigma_index + 1, 8, 7) \
734
735#define WW(r) \
736 { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
737 AXR4_W(0) \
738 ROTATE_WORDS_TO_RIGHT(b0, 1) \
739 ROTATE_WORDS_TO_RIGHT(b1, 1) \
740 ROTATE_WORDS_TO_RIGHT(c0, 2) \
741 ROTATE_WORDS_TO_RIGHT(c1, 2) \
742 ROTATE_WORDS_TO_RIGHT(d0, 3) \
743 ROTATE_WORDS_TO_RIGHT(d1, 3) \
744 AXR4_W(8) \
745 ROTATE_WORDS_TO_RIGHT(b0, 3) \
746 ROTATE_WORDS_TO_RIGHT(b1, 3) \
747 ROTATE_WORDS_TO_RIGHT(c0, 2) \
748 ROTATE_WORDS_TO_RIGHT(c1, 2) \
749 ROTATE_WORDS_TO_RIGHT(d0, 1) \
750 ROTATE_WORDS_TO_RIGHT(d1, 1) \
751 }
752
753
754static
755Z7_NO_INLINE
756#ifdef BLAKE2S_ATTRIB_128BIT
757 BLAKE2S_ATTRIB_128BIT
758#endif
759void
760Z7_FASTCALL
761Blake2sp_Compress2_V128_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
762{
763 size_t pos = 0;
764 end -= Z7_BLAKE2S_BLOCK_SIZE;
765
766 if (data != end)
767 {
768 LOAD_ROTATE_CONSTS
769 do
770 {
771 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
772 __m128i a0, b0, c0, d0;
773 __m128i a1, b1, c1, d1;
774 {
775 const __m128i inc = k_inc;
776 const __m128i temp = k_iv4_128;
777 d0 = LOAD_128_FROM_STRUCT (STATE_T(s));
778 d1 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
779 D_ADD_EPI64_128(d0, inc);
780 D_ADD_EPI64_128(d1, inc);
781 STORE_128_TO_STRUCT (STATE_T(s ), d0);
782 STORE_128_TO_STRUCT (STATE_T(s + NSW), d1);
783 D_XOR_128(d0, temp);
784 D_XOR_128(d1, temp);
785 }
786 c1 = c0 = k_iv0_128;
787 a0 = LOAD_128_FROM_STRUCT(s);
788 b0 = LOAD_128_FROM_STRUCT(s + 4);
789 a1 = LOAD_128_FROM_STRUCT(s + NSW);
790 b1 = LOAD_128_FROM_STRUCT(s + NSW + 4);
791
792 ROUNDS_LOOP (WW)
793
794#undef WW
795
796 D_XOR_128(a0, c0);
797 D_XOR_128(b0, d0);
798 D_XOR_128(a1, c1);
799 D_XOR_128(b1, d1);
800
801 D_XOR_128(a0, LOAD_128_FROM_STRUCT(s));
802 D_XOR_128(b0, LOAD_128_FROM_STRUCT(s + 4));
803 D_XOR_128(a1, LOAD_128_FROM_STRUCT(s + NSW));
804 D_XOR_128(b1, LOAD_128_FROM_STRUCT(s + NSW + 4));
805
806 STORE_128_TO_STRUCT(s, a0);
807 STORE_128_TO_STRUCT(s + 4, b0);
808 STORE_128_TO_STRUCT(s + NSW, a1);
809 STORE_128_TO_STRUCT(s + NSW + 4, b1);
810
811 data += Z7_BLAKE2S_BLOCK_SIZE * 2;
812 pos += Z7_BLAKE2S_BLOCK_SIZE * 2;
813 pos &= SUPER_BLOCK_MASK;
814 }
815 while (data < end);
816 if (data != end)
817 return;
818 }
819 {
820 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
821 Z7_BLAKE2S_CompressSingleBlock(s, data);
822 }
823}
824#endif // Z7_BLAKE2S_USE_V128_WAY2
825
826
827#ifdef Z7_BLAKE2S_USE_V128_WAY2
828 #define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way2
829#else
830 #define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way1
831#endif
832
833
834
835#ifdef Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
836 #define ROT_128_8(x) MM_ROR_EPI32(x, 8)
837 #define ROT_128_16(x) MM_ROR_EPI32(x, 16)
838 #define ROT_128_7(x) MM_ROR_EPI32(x, 7)
839 #define ROT_128_12(x) MM_ROR_EPI32(x, 12)
840#else
841#if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
842 #define ROT_128_8(x) _mm_shuffle_epi8(x, r8) // k_r8
843 #define ROT_128_16(x) _mm_shuffle_epi8(x, r16) // k_r16
844#else
845 #define ROT_128_8(x) MM_ROR_EPI32_VIA_SHIFT(x, 8)
846 #define ROT_128_16(x) MM_ROR_EPI32_VIA_SHIFT(x, 16)
847#endif
848 #define ROT_128_7(x) MM_ROR_EPI32_VIA_SHIFT(x, 7)
849 #define ROT_128_12(x) MM_ROR_EPI32_VIA_SHIFT(x, 12)
850#endif
851
852
853#if 1
854// this branch can provide similar speed on x86* in most cases,
855// because [base + index*4] provides the same speed as [base + index].
856// but some compilers can generate different code for this branch, which can sometimes be faster.
857// this branch uses an additional table of 10*16 = 160 bytes.
858#define SIGMA_TABLE_MULT_16( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
859 SIGMA_TABLE_MULT(16, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
860MY_ALIGN(16)
861static const Byte k_Blake2s_Sigma_16[BLAKE2S_NUM_ROUNDS][16] =
862 { SIGMA_TABLE(SIGMA_TABLE_MULT_16) };
863#define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_16[r];
864#define GET_SIGMA_VAL_128(n) (sigma[n])
865#else
866#define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_4[r];
867#define GET_SIGMA_VAL_128(n) (4 * (size_t)sigma[n])
868#endif
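/* Reference sketch (for clarity): in the 4-way code below the message block is
   kept transposed as __m128i w[16], one vector per message word, so word n
   starts at byte offset 16 * n inside w. k_Blake2s_Sigma_16 just pre-multiplies
   the sigma indices by 16, which makes OP_L equivalent to adding
   w[sigma[2*a + i]] (with the original, unscaled sigma row) to V(a, 0). */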
869
870
871#ifdef Z7_BLAKE2S_USE_AVX2_FAST
872#if 1
873#define SIGMA_TABLE_MULT_32( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
874 SIGMA_TABLE_MULT(32, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
875MY_ALIGN(64)
876static const UInt16 k_Blake2s_Sigma_32[BLAKE2S_NUM_ROUNDS][16] =
877 { SIGMA_TABLE(SIGMA_TABLE_MULT_32) };
878#define GET_SIGMA_PTR_256(r) const UInt16 * const sigma = k_Blake2s_Sigma_32[r];
879#define GET_SIGMA_VAL_256(n) (sigma[n])
880#else
881#define GET_SIGMA_PTR_256(r) const Byte * const sigma = k_Blake2s_Sigma_4[r];
882#define GET_SIGMA_VAL_256(n) (8 * (size_t)sigma[n])
883#endif
884#endif // Z7_BLAKE2S_USE_AVX2_FAST
885
886
887#define D_ROT_128_7(dest) dest = ROT_128_7(dest)
888#define D_ROT_128_8(dest) dest = ROT_128_8(dest)
889#define D_ROT_128_12(dest) dest = ROT_128_12(dest)
890#define D_ROT_128_16(dest) dest = ROT_128_16(dest)
891
892#define OP_L(a, i) D_ADD_128 (V(a, 0), \
893 LOAD_128((const Byte *)(w) + GET_SIGMA_VAL_128(2*(a)+(i))));
894
895#define OP_0(a) OP_L(a, 0)
896#define OP_7(a) OP_L(a, 1)
897
898#define OP_1(a) D_ADD_128 (V(a, 0), V(a, 1));
899#define OP_2(a) D_XOR_128 (V(a, 3), V(a, 0));
900#define OP_4(a) D_ADD_128 (V(a, 2), V(a, 3));
901#define OP_5(a) D_XOR_128 (V(a, 1), V(a, 2));
902
903#define OP_3(a) D_ROT_128_16 (V(a, 3));
904#define OP_6(a) D_ROT_128_12 (V(a, 1));
905#define OP_8(a) D_ROT_128_8 (V(a, 3));
906#define OP_9(a) D_ROT_128_7 (V(a, 1));
907
908
909// for 32-bit x86 : interleave mode is slower because of register pressure.
910
911#if 0 || 1 && (defined(MY_CPU_X86) \
912 || defined(__GNUC__) && !defined(__clang__))
913// non-interleaved version:
914// is fast for 32-bit x86.
915// is fast for GCC x86-64.
916
917#define V4G(a) \
918 OP_0 (a) \
919 OP_1 (a) \
920 OP_2 (a) \
921 OP_3 (a) \
922 OP_4 (a) \
923 OP_5 (a) \
924 OP_6 (a) \
925 OP_7 (a) \
926 OP_1 (a) \
927 OP_2 (a) \
928 OP_8 (a) \
929 OP_4 (a) \
930 OP_5 (a) \
931 OP_9 (a) \
932
933#define V4R \
934{ \
935 V4G (0) \
936 V4G (1) \
937 V4G (2) \
938 V4G (3) \
939 V4G (4) \
940 V4G (5) \
941 V4G (6) \
942 V4G (7) \
943}
944
945#elif 0 || 1 && defined(MY_CPU_X86)
946
947#define OP_INTER_2(op, a,b) \
948 op (a) \
949 op (b) \
950
951#define V4G(a,b) \
952 OP_INTER_2 (OP_0, a,b) \
953 OP_INTER_2 (OP_1, a,b) \
954 OP_INTER_2 (OP_2, a,b) \
955 OP_INTER_2 (OP_3, a,b) \
956 OP_INTER_2 (OP_4, a,b) \
957 OP_INTER_2 (OP_5, a,b) \
958 OP_INTER_2 (OP_6, a,b) \
959 OP_INTER_2 (OP_7, a,b) \
960 OP_INTER_2 (OP_1, a,b) \
961 OP_INTER_2 (OP_2, a,b) \
962 OP_INTER_2 (OP_8, a,b) \
963 OP_INTER_2 (OP_4, a,b) \
964 OP_INTER_2 (OP_5, a,b) \
965 OP_INTER_2 (OP_9, a,b) \
966
967#define V4R \
968{ \
969 V4G (0, 1) \
970 V4G (2, 3) \
971 V4G (4, 5) \
972 V4G (6, 7) \
973}
974
975#else
976// interleave-4 version is fast for x64 (MSVC/CLANG)
977
978#define OP_INTER_4(op, a,b,c,d) \
979 op (a) \
980 op (b) \
981 op (c) \
982 op (d) \
983
984#define V4G(a,b,c,d) \
985 OP_INTER_4 (OP_0, a,b,c,d) \
986 OP_INTER_4 (OP_1, a,b,c,d) \
987 OP_INTER_4 (OP_2, a,b,c,d) \
988 OP_INTER_4 (OP_3, a,b,c,d) \
989 OP_INTER_4 (OP_4, a,b,c,d) \
990 OP_INTER_4 (OP_5, a,b,c,d) \
991 OP_INTER_4 (OP_6, a,b,c,d) \
992 OP_INTER_4 (OP_7, a,b,c,d) \
993 OP_INTER_4 (OP_1, a,b,c,d) \
994 OP_INTER_4 (OP_2, a,b,c,d) \
995 OP_INTER_4 (OP_8, a,b,c,d) \
996 OP_INTER_4 (OP_4, a,b,c,d) \
997 OP_INTER_4 (OP_5, a,b,c,d) \
998 OP_INTER_4 (OP_9, a,b,c,d) \
999
1000#define V4R \
1001{ \
1002 V4G (0, 1, 2, 3) \
1003 V4G (4, 5, 6, 7) \
1004}
1005
1006#endif
1007
1008#define V4_ROUND(r) { GET_SIGMA_PTR_128(r); V4R }
1009
1010
1011#define V4_LOAD_MSG_1(w, m, i) \
1012{ \
1013 __m128i m0, m1, m2, m3; \
1014 __m128i t0, t1, t2, t3; \
1015 m0 = LOADU_128((m) + ((i) + 0 * 4) * 16); \
1016 m1 = LOADU_128((m) + ((i) + 1 * 4) * 16); \
1017 m2 = LOADU_128((m) + ((i) + 2 * 4) * 16); \
1018 m3 = LOADU_128((m) + ((i) + 3 * 4) * 16); \
1019 t0 = _mm_unpacklo_epi32(m0, m1); \
1020 t1 = _mm_unpackhi_epi32(m0, m1); \
1021 t2 = _mm_unpacklo_epi32(m2, m3); \
1022 t3 = _mm_unpackhi_epi32(m2, m3); \
1023 w[(i) * 4 + 0] = _mm_unpacklo_epi64(t0, t2); \
1024 w[(i) * 4 + 1] = _mm_unpackhi_epi64(t0, t2); \
1025 w[(i) * 4 + 2] = _mm_unpacklo_epi64(t1, t3); \
1026 w[(i) * 4 + 3] = _mm_unpackhi_epi64(t1, t3); \
1027}
1028
1029#define V4_LOAD_MSG(w, m) \
1030{ \
1031 V4_LOAD_MSG_1 (w, m, 0) \
1032 V4_LOAD_MSG_1 (w, m, 1) \
1033 V4_LOAD_MSG_1 (w, m, 2) \
1034 V4_LOAD_MSG_1 (w, m, 3) \
1035}
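/* Reference sketch (for clarity): V4_LOAD_MSG transposes four consecutive
   64-byte blocks so that each w[k] holds message word k of all four blocks,
   one block per 32-bit lane; element-wise it is equivalent to
     ((UInt32 *)&w[k])[lane] = GetUi32(m + 64 * lane + 4 * k);
   for k = 0..15 and lane = 0..3. */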
1036
1037#define V4_LOAD_UNPACK_PAIR_128(src32, i, d0, d1) \
1038{ \
1039 const __m128i v0 = LOAD_128_FROM_STRUCT((src32) + (i ) * 4); \
1040 const __m128i v1 = LOAD_128_FROM_STRUCT((src32) + (i + 1) * 4); \
1041 d0 = _mm_unpacklo_epi32(v0, v1); \
1042 d1 = _mm_unpackhi_epi32(v0, v1); \
1043}
1044
1045#define V4_UNPACK_PAIR_128(dest32, i, s0, s1) \
1046{ \
1047 STORE_128_TO_STRUCT((dest32) + i * 4 , _mm_unpacklo_epi64(s0, s1)); \
1048 STORE_128_TO_STRUCT((dest32) + i * 4 + 16, _mm_unpackhi_epi64(s0, s1)); \
1049}
1050
1051#define V4_UNPACK_STATE(dest32, src32) \
1052{ \
1053 __m128i t0, t1, t2, t3, t4, t5, t6, t7; \
1054 V4_LOAD_UNPACK_PAIR_128(src32, 0, t0, t1) \
1055 V4_LOAD_UNPACK_PAIR_128(src32, 2, t2, t3) \
1056 V4_LOAD_UNPACK_PAIR_128(src32, 4, t4, t5) \
1057 V4_LOAD_UNPACK_PAIR_128(src32, 6, t6, t7) \
1058 V4_UNPACK_PAIR_128(dest32, 0, t0, t2) \
1059 V4_UNPACK_PAIR_128(dest32, 8, t1, t3) \
1060 V4_UNPACK_PAIR_128(dest32, 1, t4, t6) \
1061 V4_UNPACK_PAIR_128(dest32, 9, t5, t7) \
1062}
1063
1064
1065static
1066Z7_NO_INLINE
1067#ifdef BLAKE2S_ATTRIB_128BIT
1068 BLAKE2S_ATTRIB_128BIT
1069#endif
1070void
1071Z7_FASTCALL
1072Blake2sp_Compress2_V128_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
1073{
1074 // PrintStates2(s_items, 8, 16);
1075 size_t pos = 0;
1076 pos /= 2;
1077 do
1078 {
1079#if defined(Z7_BLAKE2S_USE_SSSE3) && \
1080 !defined(Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED)
1081 const __m128i r8 = k_r8;
1082 const __m128i r16 = k_r16;
1083#endif
1084 __m128i w[16];
1085 __m128i v[16];
1086 UInt32 *s;
1087 V4_LOAD_MSG(w, data)
1088 s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1089 {
1090 __m128i ctr = LOAD_128_FROM_STRUCT(s + 64);
1091 D_ADD_EPI64_128 (ctr, k_inc);
1092 STORE_128_TO_STRUCT(s + 64, ctr);
1093 v[12] = XOR_128 (GET_128_IV_WAY4(4), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
1094 v[13] = XOR_128 (GET_128_IV_WAY4(5), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
1095 }
1096 v[ 8] = GET_128_IV_WAY4(0);
1097 v[ 9] = GET_128_IV_WAY4(1);
1098 v[10] = GET_128_IV_WAY4(2);
1099 v[11] = GET_128_IV_WAY4(3);
1100 v[14] = GET_128_IV_WAY4(6);
1101 v[15] = GET_128_IV_WAY4(7);
60 1102
61 for (i = 0; i < 16; i++) 1103#define LOAD_STATE_128_FROM_STRUCT(i) \
62 m[i] = GetUi32(p->buf + i * sizeof(m[i])); 1104 v[i] = LOAD_128_FROM_STRUCT(s + (i) * 4);
1105
1106#define UPDATE_STATE_128_IN_STRUCT(i) \
1107 STORE_128_TO_STRUCT(s + (i) * 4, XOR_128( \
1108 XOR_128(v[i], v[(i) + 8]), \
1109 LOAD_128_FROM_STRUCT(s + (i) * 4)));
63 1110
64 for (i = 0; i < 8; i++) 1111 REP8_MACRO (LOAD_STATE_128_FROM_STRUCT)
65 v[i] = p->h[i]; 1112 ROUNDS_LOOP (V4_ROUND)
1113 REP8_MACRO (UPDATE_STATE_128_IN_STRUCT)
1114
1115 data += Z7_BLAKE2S_BLOCK_SIZE * 4;
1116 pos += Z7_BLAKE2S_BLOCK_SIZE * 4 / 2;
1117 pos &= SUPER_BLOCK_SIZE / 2 - 1;
66 } 1118 }
1119 while (data != end);
1120}
67 1121
68 v[ 8] = k_Blake2s_IV[0];
69 v[ 9] = k_Blake2s_IV[1];
70 v[10] = k_Blake2s_IV[2];
71 v[11] = k_Blake2s_IV[3];
72
73 v[12] = p->t[0] ^ k_Blake2s_IV[4];
74 v[13] = p->t[1] ^ k_Blake2s_IV[5];
75 v[14] = p->f[0] ^ k_Blake2s_IV[6];
76 v[15] = p->f[1] ^ k_Blake2s_IV[7];
77 1122
78 #define G(r,i,a,b,c,d) \ 1123static
79 a += b + m[sigma[2*i+0]]; d ^= a; d = rotr32(d, 16); c += d; b ^= c; b = rotr32(b, 12); \ 1124Z7_NO_INLINE
80 a += b + m[sigma[2*i+1]]; d ^= a; d = rotr32(d, 8); c += d; b ^= c; b = rotr32(b, 7); \ 1125#ifdef BLAKE2S_ATTRIB_128BIT
1126 BLAKE2S_ATTRIB_128BIT
1127#endif
1128void
1129Z7_FASTCALL
1130Blake2sp_Final_V128_Fast(UInt32 *states)
1131{
1132 const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
1133 // printf("\nBlake2sp_Compress2_V128_Fast_Final4\n");
1134 // PrintStates2(states, 8, 16);
1135 {
1136 ptrdiff_t pos = 8 * 4;
1137 do
1138 {
1139 UInt32 *src32 = states + (size_t)(pos * 1);
1140 UInt32 *dest32 = states + (size_t)(pos * 2);
1141 V4_UNPACK_STATE(dest32, src32)
1142 pos -= 8 * 4;
1143 }
1144 while (pos >= 0);
1145 }
1146 {
1147 unsigned k;
1148 for (k = 0; k < 8; k++)
1149 {
1150 UInt32 *s = states + (size_t)k * 16;
1151 STORE_128_TO_STRUCT (STATE_T(s), ctr);
1152 }
1153 }
1154 // PrintStates2(states, 8, 16);
1155}
1156
1157
1158
1159#ifdef Z7_BLAKE2S_USE_AVX2
1160
1161#define ADD_256(a, b) _mm256_add_epi32(a, b)
1162#define XOR_256(a, b) _mm256_xor_si256(a, b)
1163
1164#if 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
1165 #define MM256_ROR_EPI32 _mm256_ror_epi32
1166 #define Z7_MM256_ROR_EPI32_IS_SUPPORTED
1167 #define LOAD_ROTATE_CONSTS_256
1168#else
1169#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1170#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
1171 #define LOAD_ROTATE_CONSTS_256 \
1172 const __m256i r8 = k_r8_256; \
1173 const __m256i r16 = k_r16_256;
1174#endif // AVX2_WAY2
1175
1176 #define MM256_ROR_EPI32(r, c) ( \
1177 ( 8==(c)) ? _mm256_shuffle_epi8(r,r8) \
1178 : (16==(c)) ? _mm256_shuffle_epi8(r,r16) \
1179 : _mm256_or_si256( \
1180 _mm256_srli_epi32((r), (c)), \
1181 _mm256_slli_epi32((r), 32-(c))))
1182#endif // WAY_SLOW
1183#endif
1184
1185
1186#define D_ADD_256(dest, src) dest = ADD_256(dest, src)
1187#define D_XOR_256(dest, src) dest = XOR_256(dest, src)
1188
1189#define LOADU_256(p) _mm256_loadu_si256((const __m256i *)(const void *)(p))
1190
1191#ifdef Z7_BLAKE2S_USE_AVX2_FAST
1192
1193#ifdef Z7_MM256_ROR_EPI32_IS_SUPPORTED
1194#define ROT_256_16(x) MM256_ROR_EPI32((x), 16)
1195#define ROT_256_12(x) MM256_ROR_EPI32((x), 12)
1196#define ROT_256_8(x) MM256_ROR_EPI32((x), 8)
1197#define ROT_256_7(x) MM256_ROR_EPI32((x), 7)
1198#else
1199#define ROTATE8 _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, \
1200 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
1201#define ROTATE16 _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, \
1202 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
1203#define ROT_256_16(x) _mm256_shuffle_epi8((x), ROTATE16)
1204#define ROT_256_12(x) _mm256_or_si256(_mm256_srli_epi32((x), 12), _mm256_slli_epi32((x), 20))
1205#define ROT_256_8(x) _mm256_shuffle_epi8((x), ROTATE8)
1206#define ROT_256_7(x) _mm256_or_si256(_mm256_srli_epi32((x), 7), _mm256_slli_epi32((x), 25))
1207#endif
1208
1209#define D_ROT_256_7(dest) dest = ROT_256_7(dest)
1210#define D_ROT_256_8(dest) dest = ROT_256_8(dest)
1211#define D_ROT_256_12(dest) dest = ROT_256_12(dest)
1212#define D_ROT_256_16(dest) dest = ROT_256_16(dest)
1213
1214#define LOAD_256(p) _mm256_load_si256((const __m256i *)(const void *)(p))
1215#ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
1216 #define STOREU_256(p, r) _mm256_storeu_si256((__m256i *)(void *)(p), r)
1217 #define LOAD_256_FROM_STRUCT(p) LOADU_256(p)
1218 #define STORE_256_TO_STRUCT(p, r) STOREU_256(p, r)
1219#else
1220 // if struct is aligned for 32-bytes
1221 #define STORE_256(p, r) _mm256_store_si256((__m256i *)(void *)(p), r)
1222 #define LOAD_256_FROM_STRUCT(p) LOAD_256(p)
1223 #define STORE_256_TO_STRUCT(p, r) STORE_256(p, r)
1224#endif
1225
1226#endif // Z7_BLAKE2S_USE_AVX2_FAST
1227
1228
1229
1230#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1231
1232#if 0
1233 #define DIAG_PERM2(s) \
1234 { \
1235 const __m256i a = LOAD_256_FROM_STRUCT((s) ); \
1236 const __m256i b = LOAD_256_FROM_STRUCT((s) + NSW); \
1237 STORE_256_TO_STRUCT((s ), _mm256_permute2x128_si256(a, b, 0x20)); \
1238 STORE_256_TO_STRUCT((s + NSW), _mm256_permute2x128_si256(a, b, 0x31)); \
1239 }
1240#else
1241 #define DIAG_PERM2(s) \
1242 { \
1243 const __m128i a = LOAD_128_FROM_STRUCT((s) + 4); \
1244 const __m128i b = LOAD_128_FROM_STRUCT((s) + NSW); \
1245 STORE_128_TO_STRUCT((s) + NSW, a); \
1246 STORE_128_TO_STRUCT((s) + 4 , b); \
1247 }
1248#endif
1249 #define DIAG_PERM8(s_items) \
1250 { \
1251 DIAG_PERM2(s_items) \
1252 DIAG_PERM2(s_items + NSW * 2) \
1253 DIAG_PERM2(s_items + NSW * 4) \
1254 DIAG_PERM2(s_items + NSW * 6) \
1255 }
1256
1257
1258#define AXR256(a, b, d, shift) \
1259 D_ADD_256(a, b); \
1260 D_XOR_256(d, a); \
1261 d = MM256_ROR_EPI32(d, shift); \
1262
1263
1264
1265#ifdef Z7_BLAKE2S_USE_GATHER
1266
1267 #define TABLE_GATHER_256_4(a0,a1,a2,a3) \
1268 a0,a1,a2,a3, a0+16,a1+16,a2+16,a3+16
1269 #define TABLE_GATHER_256( \
1270 a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
1271 { TABLE_GATHER_256_4(a0,a2,a4,a6), \
1272 TABLE_GATHER_256_4(a1,a3,a5,a7), \
1273 TABLE_GATHER_256_4(a8,a10,a12,a14), \
1274 TABLE_GATHER_256_4(a9,a11,a13,a15) }
1275MY_ALIGN(64)
1276static const UInt32 k_Blake2s_Sigma_gather256[BLAKE2S_NUM_ROUNDS][16 * 2] =
1277 { SIGMA_TABLE(TABLE_GATHER_256) };
1278 #define GET_SIGMA(r) \
1279 const UInt32 * const sigma = k_Blake2s_Sigma_gather256[r];
1280 #define AXR2_LOAD_INDEXES_AVX(sigma_index) \
1281 const __m256i i01234567 = LOAD_256(sigma + (sigma_index));
1282 #define SET_ROW_FROM_SIGMA_AVX(in) \
1283 _mm256_i32gather_epi32((const void *)(in), i01234567, 4)
1284 #define SIGMA_INTERLEAVE 8
1285 #define SIGMA_HALF_ROW_SIZE 16
1286
1287#else // !Z7_BLAKE2S_USE_GATHER
1288
1289 #define GET_SIGMA(r) \
1290 const Byte * const sigma = k_Blake2s_Sigma_4[r];
1291 #define AXR2_LOAD_INDEXES_AVX(sigma_index) \
1292 AXR2_LOAD_INDEXES(sigma_index)
1293 #define SET_ROW_FROM_SIGMA_AVX(in) \
1294 MY_mm256_set_m128i( \
1295 SET_ROW_FROM_SIGMA_W((in) + Z7_BLAKE2S_BLOCK_SIZE), \
1296 SET_ROW_FROM_SIGMA_W(in))
1297 #define SIGMA_INTERLEAVE 1
1298 #define SIGMA_HALF_ROW_SIZE 8
1299#endif // !Z7_BLAKE2S_USE_GATHER
1300
81 1301
82 #define R(r) \ 1302#define ROTATE_WORDS_TO_RIGHT_256(a, n) \
83 G(r,0,v[ 0],v[ 4],v[ 8],v[12]) \ 1303 a = _mm256_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
84 G(r,1,v[ 1],v[ 5],v[ 9],v[13]) \
85 G(r,2,v[ 2],v[ 6],v[10],v[14]) \
86 G(r,3,v[ 3],v[ 7],v[11],v[15]) \
87 G(r,4,v[ 0],v[ 5],v[10],v[15]) \
88 G(r,5,v[ 1],v[ 6],v[11],v[12]) \
89 G(r,6,v[ 2],v[ 7],v[ 8],v[13]) \
90 G(r,7,v[ 3],v[ 4],v[ 9],v[14]) \
91 1304
1305
1306
1307#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
1308
1309#define AXR2_A(sigma_index, shift1, shift2) \
1310 AXR2_LOAD_INDEXES_AVX(sigma_index) \
1311 D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
1312 AXR256(a0, b0, d0, shift1) \
1313 AXR256(c0, d0, b0, shift2) \
1314
1315#define AXR4_A(sigma_index) \
1316 { AXR2_A(sigma_index, 16, 12) } \
1317 { AXR2_A(sigma_index + SIGMA_INTERLEAVE, 8, 7) }
1318
1319#define EE1(r) \
1320 { GET_SIGMA(r) \
1321 AXR4_A(0) \
1322 ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
1323 ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1324 ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
1325 AXR4_A(SIGMA_HALF_ROW_SIZE) \
1326 ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
1327 ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1328 ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
1329 }
1330
1331static
1332Z7_NO_INLINE
1333#ifdef BLAKE2S_ATTRIB_AVX2
1334 BLAKE2S_ATTRIB_AVX2
1335#endif
1336void
1337Z7_FASTCALL
1338Blake2sp_Compress2_AVX2_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
1339{
1340 size_t pos = 0;
1341 end -= Z7_BLAKE2S_BLOCK_SIZE;
1342
1343 if (data != end)
92 { 1344 {
93 unsigned r; 1345 LOAD_ROTATE_CONSTS_256
94 for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) 1346 DIAG_PERM8(s_items)
1347 do
95 { 1348 {
96 const Byte *sigma = k_Blake2s_Sigma[r]; 1349 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
97 R(r) 1350 __m256i a0, b0, c0, d0;
1351 {
1352 const __m128i inc = k_inc;
1353 __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
1354 __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
1355 D_ADD_EPI64_128(d0_128, inc);
1356 D_ADD_EPI64_128(d1_128, inc);
1357 STORE_128_TO_STRUCT (STATE_T(s ), d0_128);
1358 STORE_128_TO_STRUCT (STATE_T(s + NSW), d1_128);
1359 d0 = MY_mm256_set_m128i(d1_128, d0_128);
1360 D_XOR_256(d0, k_iv4_256);
1361 }
1362 c0 = SET_FROM_128(k_iv0_128);
1363 a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
1364 b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);
1365
1366 ROUNDS_LOOP (EE1)
1367
1368 D_XOR_256(a0, c0);
1369 D_XOR_256(b0, d0);
1370
1371 D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
1372 D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
1373
1374 STORE_256_TO_STRUCT(s + NSW * 0, a0);
1375 STORE_256_TO_STRUCT(s + NSW * 1, b0);
1376
1377 data += Z7_BLAKE2S_BLOCK_SIZE * 2;
1378 pos += Z7_BLAKE2S_BLOCK_SIZE * 2;
1379 pos &= SUPER_BLOCK_MASK;
98 } 1380 }
99 /* R(0); R(1); R(2); R(3); R(4); R(5); R(6); R(7); R(8); R(9); */ 1381 while (data < end);
1382 DIAG_PERM8(s_items)
1383 if (data != end)
1384 return;
1385 }
1386 {
1387 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1388 Z7_BLAKE2S_CompressSingleBlock(s, data);
100 } 1389 }
1390}
1391
1392#endif // Z7_BLAKE2S_USE_AVX2_WAY2
101 1393
102 #undef G
103 #undef R
104 1394
1395
1396#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
1397
1398#define AXR2_X(sigma_index, shift1, shift2) \
1399 AXR2_LOAD_INDEXES_AVX(sigma_index) \
1400 D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
1401 D_ADD_256( a1, SET_ROW_FROM_SIGMA_AVX((data) + Z7_BLAKE2S_BLOCK_SIZE * 2)); \
1402 AXR256(a0, b0, d0, shift1) \
1403 AXR256(a1, b1, d1, shift1) \
1404 AXR256(c0, d0, b0, shift2) \
1405 AXR256(c1, d1, b1, shift2) \
1406
1407#define AXR4_X(sigma_index) \
1408 { AXR2_X(sigma_index, 16, 12) } \
1409 { AXR2_X(sigma_index + SIGMA_INTERLEAVE, 8, 7) }
1410
1411#define EE2(r) \
1412 { GET_SIGMA(r) \
1413 AXR4_X(0) \
1414 ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
1415 ROTATE_WORDS_TO_RIGHT_256(b1, 1) \
1416 ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1417 ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
1418 ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
1419 ROTATE_WORDS_TO_RIGHT_256(d1, 3) \
1420 AXR4_X(SIGMA_HALF_ROW_SIZE) \
1421 ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
1422 ROTATE_WORDS_TO_RIGHT_256(b1, 3) \
1423 ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1424 ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
1425 ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
1426 ROTATE_WORDS_TO_RIGHT_256(d1, 1) \
1427 }
1428
1429static
1430Z7_NO_INLINE
1431#ifdef BLAKE2S_ATTRIB_AVX2
1432 BLAKE2S_ATTRIB_AVX2
1433#endif
1434void
1435Z7_FASTCALL
1436Blake2sp_Compress2_AVX2_Way4(UInt32 *s_items, const Byte *data, const Byte *end)
1437{
1438 size_t pos = 0;
1439
1440 if ((size_t)(end - data) >= Z7_BLAKE2S_BLOCK_SIZE * 4)
105 { 1441 {
106 unsigned i; 1442#ifndef Z7_MM256_ROR_EPI32_IS_SUPPORTED
107 for (i = 0; i < 8; i++) 1443 const __m256i r8 = k_r8_256;
108 p->h[i] ^= v[i] ^ v[i + 8]; 1444 const __m256i r16 = k_r16_256;
1445#endif
1446 end -= Z7_BLAKE2S_BLOCK_SIZE * 3;
1447 DIAG_PERM8(s_items)
1448 do
1449 {
1450 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1451 __m256i a0, b0, c0, d0;
1452 __m256i a1, b1, c1, d1;
1453 {
1454 const __m128i inc = k_inc;
1455 __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
1456 __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
1457 __m128i d2_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 2));
1458 __m128i d3_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 3));
1459 D_ADD_EPI64_128(d0_128, inc);
1460 D_ADD_EPI64_128(d1_128, inc);
1461 D_ADD_EPI64_128(d2_128, inc);
1462 D_ADD_EPI64_128(d3_128, inc);
1463 STORE_128_TO_STRUCT (STATE_T(s ), d0_128);
1464 STORE_128_TO_STRUCT (STATE_T(s + NSW * 1), d1_128);
1465 STORE_128_TO_STRUCT (STATE_T(s + NSW * 2), d2_128);
1466 STORE_128_TO_STRUCT (STATE_T(s + NSW * 3), d3_128);
1467 d0 = MY_mm256_set_m128i(d1_128, d0_128);
1468 d1 = MY_mm256_set_m128i(d3_128, d2_128);
1469 D_XOR_256(d0, k_iv4_256);
1470 D_XOR_256(d1, k_iv4_256);
1471 }
1472 c1 = c0 = SET_FROM_128(k_iv0_128);
1473 a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
1474 b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);
1475 a1 = LOAD_256_FROM_STRUCT(s + NSW * 2);
1476 b1 = LOAD_256_FROM_STRUCT(s + NSW * 3);
1477
1478 ROUNDS_LOOP (EE2)
1479
1480 D_XOR_256(a0, c0);
1481 D_XOR_256(b0, d0);
1482 D_XOR_256(a1, c1);
1483 D_XOR_256(b1, d1);
1484
1485 D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
1486 D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
1487 D_XOR_256(a1, LOAD_256_FROM_STRUCT(s + NSW * 2));
1488 D_XOR_256(b1, LOAD_256_FROM_STRUCT(s + NSW * 3));
1489
1490 STORE_256_TO_STRUCT(s + NSW * 0, a0);
1491 STORE_256_TO_STRUCT(s + NSW * 1, b0);
1492 STORE_256_TO_STRUCT(s + NSW * 2, a1);
1493 STORE_256_TO_STRUCT(s + NSW * 3, b1);
1494
1495 data += Z7_BLAKE2S_BLOCK_SIZE * 4;
1496 pos += Z7_BLAKE2S_BLOCK_SIZE * 4;
1497 pos &= SUPER_BLOCK_MASK;
1498 }
1499 while (data < end);
1500 DIAG_PERM8(s_items)
1501 end += Z7_BLAKE2S_BLOCK_SIZE * 3;
109 } 1502 }
1503 if (data == end)
1504 return;
1505 // Z7_BLAKE2S_Compress2_V128(s_items, data, end, pos);
1506 do
1507 {
1508 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1509 Z7_BLAKE2S_CompressSingleBlock(s, data);
1510 data += Z7_BLAKE2S_BLOCK_SIZE;
1511 pos += Z7_BLAKE2S_BLOCK_SIZE;
1512 pos &= SUPER_BLOCK_MASK;
1513 }
1514 while (data != end);
1515}
1516
1517#endif // Z7_BLAKE2S_USE_AVX2_WAY4
1518#endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1519
1520
1521// ---------------------------------------------------------
1522
1523#ifdef Z7_BLAKE2S_USE_AVX2_FAST
1524
1525#define OP256_L(a, i) D_ADD_256 (V(a, 0), \
1526 LOAD_256((const Byte *)(w) + GET_SIGMA_VAL_256(2*(a)+(i))));
1527
1528#define OP256_0(a) OP256_L(a, 0)
1529#define OP256_7(a) OP256_L(a, 1)
1530
1531#define OP256_1(a) D_ADD_256 (V(a, 0), V(a, 1));
1532#define OP256_2(a) D_XOR_256 (V(a, 3), V(a, 0));
1533#define OP256_4(a) D_ADD_256 (V(a, 2), V(a, 3));
1534#define OP256_5(a) D_XOR_256 (V(a, 1), V(a, 2));
1535
1536#define OP256_3(a) D_ROT_256_16 (V(a, 3));
1537#define OP256_6(a) D_ROT_256_12 (V(a, 1));
1538#define OP256_8(a) D_ROT_256_8 (V(a, 3));
1539#define OP256_9(a) D_ROT_256_7 (V(a, 1));
1540
1541
1542#if 0 || 1 && defined(MY_CPU_X86)
1543
1544#define V8_G(a) \
1545 OP256_0 (a) \
1546 OP256_1 (a) \
1547 OP256_2 (a) \
1548 OP256_3 (a) \
1549 OP256_4 (a) \
1550 OP256_5 (a) \
1551 OP256_6 (a) \
1552 OP256_7 (a) \
1553 OP256_1 (a) \
1554 OP256_2 (a) \
1555 OP256_8 (a) \
1556 OP256_4 (a) \
1557 OP256_5 (a) \
1558 OP256_9 (a) \
1559
1560#define V8R { \
1561 V8_G (0); \
1562 V8_G (1); \
1563 V8_G (2); \
1564 V8_G (3); \
1565 V8_G (4); \
1566 V8_G (5); \
1567 V8_G (6); \
1568 V8_G (7); \
1569}
1570
1571#else
1572
1573#define OP256_INTER_4(op, a,b,c,d) \
1574 op (a) \
1575 op (b) \
1576 op (c) \
1577 op (d) \
1578
1579#define V8_G(a,b,c,d) \
1580 OP256_INTER_4 (OP256_0, a,b,c,d) \
1581 OP256_INTER_4 (OP256_1, a,b,c,d) \
1582 OP256_INTER_4 (OP256_2, a,b,c,d) \
1583 OP256_INTER_4 (OP256_3, a,b,c,d) \
1584 OP256_INTER_4 (OP256_4, a,b,c,d) \
1585 OP256_INTER_4 (OP256_5, a,b,c,d) \
1586 OP256_INTER_4 (OP256_6, a,b,c,d) \
1587 OP256_INTER_4 (OP256_7, a,b,c,d) \
1588 OP256_INTER_4 (OP256_1, a,b,c,d) \
1589 OP256_INTER_4 (OP256_2, a,b,c,d) \
1590 OP256_INTER_4 (OP256_8, a,b,c,d) \
1591 OP256_INTER_4 (OP256_4, a,b,c,d) \
1592 OP256_INTER_4 (OP256_5, a,b,c,d) \
1593 OP256_INTER_4 (OP256_9, a,b,c,d) \
1594
1595#define V8R { \
1596 V8_G (0, 1, 2, 3) \
1597 V8_G (4, 5, 6, 7) \
1598}
1599#endif
1600
1601#define V8_ROUND(r) { GET_SIGMA_PTR_256(r); V8R }
1602
1603
1604// for debug:
1605// #define Z7_BLAKE2S_PERMUTE_WITH_GATHER
1606#if defined(Z7_BLAKE2S_PERMUTE_WITH_GATHER)
1607// gather instruction is slow.
1608#define V8_LOAD_MSG(w, m) \
1609{ \
1610 unsigned i; \
1611 for (i = 0; i < 16; ++i) { \
1612 w[i] = _mm256_i32gather_epi32( \
1613 (const void *)((m) + i * sizeof(UInt32)),\
1614 _mm256_set_epi32(0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00), \
1615 sizeof(UInt32)); \
1616 } \
1617}
1618#else // !Z7_BLAKE2S_PERMUTE_WITH_GATHER
1619
1620#define V8_LOAD_MSG_2(w, a0, a1) \
1621{ \
1622 (w)[0] = _mm256_permute2x128_si256(a0, a1, 0x20); \
1623 (w)[4] = _mm256_permute2x128_si256(a0, a1, 0x31); \
1624}
1625
1626#define V8_LOAD_MSG_4(w, z0, z1, z2, z3) \
1627{ \
1628 __m256i s0, s1, s2, s3; \
1629 s0 = _mm256_unpacklo_epi64(z0, z1); \
1630 s1 = _mm256_unpackhi_epi64(z0, z1); \
1631 s2 = _mm256_unpacklo_epi64(z2, z3); \
1632 s3 = _mm256_unpackhi_epi64(z2, z3); \
1633 V8_LOAD_MSG_2((w) + 0, s0, s2) \
1634 V8_LOAD_MSG_2((w) + 1, s1, s3) \
1635}
1636
1637#define V8_LOAD_MSG_0(t0, t1, m) \
1638{ \
1639 __m256i m0, m1; \
1640 m0 = LOADU_256(m); \
1641 m1 = LOADU_256((m) + 2 * 32); \
1642 t0 = _mm256_unpacklo_epi32(m0, m1); \
1643 t1 = _mm256_unpackhi_epi32(m0, m1); \
1644}
1645
1646#define V8_LOAD_MSG_8(w, m) \
1647{ \
1648 __m256i t0, t1, t2, t3, t4, t5, t6, t7; \
1649 V8_LOAD_MSG_0(t0, t4, (m) + 0 * 4 * 32) \
1650 V8_LOAD_MSG_0(t1, t5, (m) + 1 * 4 * 32) \
1651 V8_LOAD_MSG_0(t2, t6, (m) + 2 * 4 * 32) \
1652 V8_LOAD_MSG_0(t3, t7, (m) + 3 * 4 * 32) \
1653 V8_LOAD_MSG_4((w) , t0, t1, t2, t3) \
1654 V8_LOAD_MSG_4((w) + 2, t4, t5, t6, t7) \
1655}
1656
1657#define V8_LOAD_MSG(w, m) \
1658{ \
1659 V8_LOAD_MSG_8(w, m) \
1660 V8_LOAD_MSG_8((w) + 8, (m) + 32) \
1661}
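/* Reference sketch (for clarity): V8_LOAD_MSG is the 8-way version of
   V4_LOAD_MSG: it transposes eight consecutive 64-byte blocks so that each
   w[k] (k = 0..15) holds message word k from all eight blocks, one block per
   32-bit lane of the 256-bit register. The gather branch above produces the
   same layout directly, but the gather instruction is slower on most CPUs. */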
1662
1663#endif // !Z7_BLAKE2S_PERMUTE_WITH_GATHER
1664
1665
1666#define V8_PERM_PAIR_STORE(u, a0, a2) \
1667{ \
1668 STORE_256_TO_STRUCT((u), _mm256_permute2x128_si256(a0, a2, 0x20)); \
1669 STORE_256_TO_STRUCT((u) + 8, _mm256_permute2x128_si256(a0, a2, 0x31)); \
1670}
1671
1672#define V8_UNPACK_STORE_4(u, z0, z1, z2, z3) \
1673{ \
1674 __m256i s0, s1, s2, s3; \
1675 s0 = _mm256_unpacklo_epi64(z0, z1); \
1676 s1 = _mm256_unpackhi_epi64(z0, z1); \
1677 s2 = _mm256_unpacklo_epi64(z2, z3); \
1678 s3 = _mm256_unpackhi_epi64(z2, z3); \
1679 V8_PERM_PAIR_STORE(u + 0, s0, s2) \
1680 V8_PERM_PAIR_STORE(u + 2, s1, s3) \
1681}
1682
1683#define V8_UNPACK_STORE_0(src32, d0, d1) \
1684{ \
1685 const __m256i v0 = LOAD_256_FROM_STRUCT ((src32) ); \
1686 const __m256i v1 = LOAD_256_FROM_STRUCT ((src32) + 8); \
1687 d0 = _mm256_unpacklo_epi32(v0, v1); \
1688 d1 = _mm256_unpackhi_epi32(v0, v1); \
1689}
1690
1691#define V8_UNPACK_STATE(dest32, src32) \
1692{ \
1693 __m256i t0, t1, t2, t3, t4, t5, t6, t7; \
1694 V8_UNPACK_STORE_0 ((src32) + 16 * 0, t0, t4) \
1695 V8_UNPACK_STORE_0 ((src32) + 16 * 1, t1, t5) \
1696 V8_UNPACK_STORE_0 ((src32) + 16 * 2, t2, t6) \
1697 V8_UNPACK_STORE_0 ((src32) + 16 * 3, t3, t7) \
1698 V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) , t0, t1, t2, t3) \
1699 V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) + 4, t4, t5, t6, t7) \
110} 1700}
111 1701
112 1702
113#define Blake2s_Increment_Counter(S, inc) \
114 { p->t[0] += (inc); p->t[1] += (p->t[0] < (inc)); }
115 1703
116#define Blake2s_Set_LastBlock(p) \ 1704#define V8_LOAD_STATE_256_FROM_STRUCT(i) \
117 { p->f[0] = BLAKE2S_FINAL_FLAG; p->f[1] = p->lastNode_f1; } 1705 v[i] = LOAD_256_FROM_STRUCT(s_items + (i) * 8);
1706
1707#if 0 || 0 && defined(MY_CPU_X86)
1708#define Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1709#endif
1710
1711#ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1712// this branch doesn't use the (iv) array,
1713// so register pressure can be lower.
1714// it can be faster sometimes.
1715#define V8_LOAD_STATE_256(i) V8_LOAD_STATE_256_FROM_STRUCT(i)
1716#define V8_UPDATE_STATE_256(i) \
1717{ \
1718 STORE_256_TO_STRUCT(s_items + (i) * 8, XOR_256( \
1719 XOR_256(v[i], v[(i) + 8]), \
1720 LOAD_256_FROM_STRUCT(s_items + (i) * 8))); \
1721}
1722#else
1723// this branch uses more variables ((iv) registers);
1724// it's better for gcc.
1725// that branch may be better, if register pressure is lower (avx512).
1726#define V8_LOAD_STATE_256(i) { iv[i] = v[i]; }
1727#define V8_UPDATE_STATE_256(i) { v[i] = XOR_256(XOR_256(v[i], v[i + 8]), iv[i]); }
1728#define V8_STORE_STATE_256(i) { STORE_256_TO_STRUCT(s_items + (i) * 8, v[i]); }
1729#endif
118 1730
119 1731
120static void Blake2s_Update(CBlake2s *p, const Byte *data, size_t size) 1732#if 0
1733 // use loading constants from memory
1734 #define KK8(n) KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n)
1735MY_ALIGN(64)
1736static const UInt32 k_Blake2s_IV_WAY8[]=
121{ 1737{
122 while (size != 0) 1738 KK8(0), KK8(1), KK8(2), KK8(3), KK8(4), KK8(5), KK8(6), KK8(7)
123 { 1739};
124 unsigned pos = (unsigned)p->bufPos; 1740 #define GET_256_IV_WAY8(i) LOAD_256(k_Blake2s_IV_WAY8 + 8 * (i))
125 unsigned rem = BLAKE2S_BLOCK_SIZE - pos; 1741#else
1742 // use constant generation:
1743 #define GET_256_IV_WAY8(i) _mm256_set1_epi32((Int32)KIV(i))
1744#endif
126 1745
127 if (size <= rem) 1746
1747static
1748Z7_NO_INLINE
1749#ifdef BLAKE2S_ATTRIB_AVX2
1750 BLAKE2S_ATTRIB_AVX2
1751#endif
1752void
1753Z7_FASTCALL
1754Blake2sp_Compress2_AVX2_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
1755{
1756#ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1757 __m256i v[16];
1758#endif
1759
1760 // PrintStates2(s_items, 8, 16);
1761
1762#ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1763 REP8_MACRO (V8_LOAD_STATE_256_FROM_STRUCT)
1764#endif
1765
1766 do
1767 {
1768 __m256i w[16];
1769#ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1770 __m256i v[16];
1771#else
1772 __m256i iv[8];
1773#endif
1774 V8_LOAD_MSG(w, data)
128 { 1775 {
129 memcpy(p->buf + pos, data, size); 1776 // we use load/store ctr inside loop to reduce register pressure:
130 p->bufPos += (UInt32)size; 1777#if 1 || 1 && defined(MY_CPU_X86)
131 return; 1778 const __m256i ctr = _mm256_add_epi64(
1779 LOAD_256_FROM_STRUCT(s_items + 64),
1780 _mm256_set_epi32(
1781 0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE,
1782 0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE));
1783 STORE_256_TO_STRUCT(s_items + 64, ctr);
1784#else
1785 const UInt64 ctr64 = *(const UInt64 *)(const void *)(s_items + 64)
1786 + Z7_BLAKE2S_BLOCK_SIZE;
1787 const __m256i ctr = _mm256_set_epi64x(0, (Int64)ctr64, 0, (Int64)ctr64);
1788 *(UInt64 *)(void *)(s_items + 64) = ctr64;
1789#endif
1790 v[12] = XOR_256 (GET_256_IV_WAY8(4), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
1791 v[13] = XOR_256 (GET_256_IV_WAY8(5), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
132 } 1792 }
1793 v[ 8] = GET_256_IV_WAY8(0);
1794 v[ 9] = GET_256_IV_WAY8(1);
1795 v[10] = GET_256_IV_WAY8(2);
1796 v[11] = GET_256_IV_WAY8(3);
1797 v[14] = GET_256_IV_WAY8(6);
1798 v[15] = GET_256_IV_WAY8(7);
133 1799
134 memcpy(p->buf + pos, data, rem); 1800 REP8_MACRO (V8_LOAD_STATE_256)
135 Blake2s_Increment_Counter(S, BLAKE2S_BLOCK_SIZE) 1801 ROUNDS_LOOP (V8_ROUND)
136 Blake2s_Compress(p); 1802 REP8_MACRO (V8_UPDATE_STATE_256)
137 p->bufPos = 0; 1803 data += SUPER_BLOCK_SIZE;
138 data += rem;
139 size -= rem;
140 } 1804 }
1805 while (data != end);
1806
1807#ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1808 REP8_MACRO (V8_STORE_STATE_256)
1809#endif
141} 1810}
142 1811
143 1812
144static void Blake2s_Final(CBlake2s *p, Byte *digest) 1813static
1814Z7_NO_INLINE
1815#ifdef BLAKE2S_ATTRIB_AVX2
1816 BLAKE2S_ATTRIB_AVX2
1817#endif
1818void
1819Z7_FASTCALL
1820Blake2sp_Final_AVX2_Fast(UInt32 *states)
145{ 1821{
146 unsigned i; 1822 const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
1823 // PrintStates2(states, 8, 16);
1824 V8_UNPACK_STATE(states, states)
1825 // PrintStates2(states, 8, 16);
1826 {
1827 unsigned k;
1828 for (k = 0; k < 8; k++)
1829 {
1830 UInt32 *s = states + (size_t)k * 16;
1831 STORE_128_TO_STRUCT (STATE_T(s), ctr);
1832 }
1833 }
1834 // PrintStates2(states, 8, 16);
1835 // printf("\nafter V8_UNPACK_STATE \n");
1836}
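/* Hedged sketch (not part of the original source): the transposed state layout
   used by the AVX2 "fast" path, as implied by V8_LOAD_STATE_256_FROM_STRUCT and
   by the (k * 16) stride in Blake2sp_Final_AVX2_Fast:
     s_items[8 * i + j] : state word i of leaf j   (i, j = 0..7)
     s_items[64 ..]     : block counter shared by all 8 leaves
   A plain-index (out-of-place) equivalent of the V8_UNPACK_STATE conversion;
   the function name is hypothetical. */

static void Blake2sp_UnpackStates_Reference(UInt32 *dest, const UInt32 *src)
{
  unsigned i, j;
  for (j = 0; j < 8; j++)      /* leaf (lane) index */
    for (i = 0; i < 8; i++)    /* state word index */
      dest[j * 16 + i] = src[i * 8 + j];
}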
1837
1838#endif // Z7_BLAKE2S_USE_AVX2_FAST
1839#endif // avx2
1840#endif // vector
1841
1842
1843/*
1844#define Blake2s_Increment_Counter(s, inc) \
1845 { STATE_T(s)[0] += (inc); STATE_T(s)[1] += (STATE_T(s)[0] < (inc)); }
1846#define Blake2s_Increment_Counter_Small(s, inc) \
1847 { STATE_T(s)[0] += (inc); }
1848*/
1849
1850#define Blake2s_Set_LastBlock(s) \
1851 { STATE_F(s)[0] = BLAKE2S_FINAL_FLAG; /* STATE_F(s)[1] = p->u.header.lastNode_f1; */ }
1852
1853
1854#if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL >= 1600
1855 // good for vs2022
1856 #define LOOP_8(mac) { unsigned kkk; for (kkk = 0; kkk < 8; kkk++) mac(kkk) }
1857#else
1858 // good for Z7_BLAKE2S_UNROLL for GCC9 (arm*/x86*) and MSC_VER_1400-x64.
1859 #define LOOP_8(mac) { REP8_MACRO(mac) }
1860#endif
1861
1862
1863static
1864Z7_FORCE_INLINE
1865// Z7_NO_INLINE
1866void
1867Z7_FASTCALL
1868Blake2s_Compress(UInt32 *s, const Byte *input)
1869{
1870 UInt32 m[16];
1871 UInt32 v[16];
1872 {
1873 unsigned i;
1874 for (i = 0; i < 16; i++)
1875 m[i] = GetUi32(input + i * 4);
1876 }
1877
1878#define INIT_v_FROM_s(i) v[i] = s[i];
1879
1880 LOOP_8(INIT_v_FROM_s)
1881
1882 // Blake2s_Increment_Counter(s, Z7_BLAKE2S_BLOCK_SIZE)
1883 {
1884 const UInt32 t0 = STATE_T(s)[0] + Z7_BLAKE2S_BLOCK_SIZE;
1885 const UInt32 t1 = STATE_T(s)[1] + (t0 < Z7_BLAKE2S_BLOCK_SIZE);
1886 STATE_T(s)[0] = t0;
1887 STATE_T(s)[1] = t1;
1888 v[12] = t0 ^ KIV(4);
1889 v[13] = t1 ^ KIV(5);
1890 }
1891 // v[12] = STATE_T(s)[0] ^ KIV(4);
1892 // v[13] = STATE_T(s)[1] ^ KIV(5);
1893 v[14] = STATE_F(s)[0] ^ KIV(6);
1894 v[15] = STATE_F(s)[1] ^ KIV(7);
1895
1896 v[ 8] = KIV(0);
1897 v[ 9] = KIV(1);
1898 v[10] = KIV(2);
1899 v[11] = KIV(3);
1900 // PrintStates2((const UInt32 *)v, 1, 16);
1901
1902 #define ADD_SIGMA(a, index) V(a, 0) += *(const UInt32 *)GET_SIGMA_PTR(m, sigma[index]);
1903 #define ADD32M(dest, src, a) V(a, dest) += V(a, src);
1904 #define XOR32M(dest, src, a) V(a, dest) ^= V(a, src);
1905 #define RTR32M(dest, shift, a) V(a, dest) = rotrFixed(V(a, dest), shift);
1906
1907// big interleaving can provide a big performance gain, if scheduler queues are small.
1908#if 0 || 1 && defined(MY_CPU_X86)
 1909  // interleave-1: for a small number of registers (x86-32bit)
1910 #define G2(index, a, x, y) \
1911 ADD_SIGMA (a, (index) + 2 * 0) \
1912 ADD32M (0, 1, a) \
1913 XOR32M (3, 0, a) \
1914 RTR32M (3, x, a) \
1915 ADD32M (2, 3, a) \
1916 XOR32M (1, 2, a) \
1917 RTR32M (1, y, a) \
1918
1919 #define G(a) \
1920 G2(a * 2 , a, 16, 12) \
1921 G2(a * 2 + 1, a, 8, 7) \
1922
1923 #define R2 \
1924 G(0) \
1925 G(1) \
1926 G(2) \
1927 G(3) \
1928 G(4) \
1929 G(5) \
1930 G(6) \
1931 G(7) \
1932
1933#elif 0 || 1 && defined(MY_CPU_X86_OR_AMD64)
 1934  // interleave-2: good if the number of registers is not large (x86-64).
1935
1936 #define REP2(mac, dest, src, a, b) \
1937 mac(dest, src, a) \
1938 mac(dest, src, b)
1939
1940 #define G2(index, a, b, x, y) \
1941 ADD_SIGMA (a, (index) + 2 * 0) \
1942 ADD_SIGMA (b, (index) + 2 * 1) \
1943 REP2 (ADD32M, 0, 1, a, b) \
1944 REP2 (XOR32M, 3, 0, a, b) \
1945 REP2 (RTR32M, 3, x, a, b) \
1946 REP2 (ADD32M, 2, 3, a, b) \
1947 REP2 (XOR32M, 1, 2, a, b) \
1948 REP2 (RTR32M, 1, y, a, b) \
1949
1950 #define G(a, b) \
1951 G2(a * 2 , a, b, 16, 12) \
1952 G2(a * 2 + 1, a, b, 8, 7) \
1953
1954 #define R2 \
1955 G(0, 1) \
1956 G(2, 3) \
1957 G(4, 5) \
1958 G(6, 7) \
147 1959
148 Blake2s_Increment_Counter(S, (UInt32)p->bufPos) 1960#else
149 Blake2s_Set_LastBlock(p) 1961 // interleave-4:
150 memset(p->buf + p->bufPos, 0, BLAKE2S_BLOCK_SIZE - p->bufPos); 1962 // it has big register pressure for x86/x64.
151 Blake2s_Compress(p); 1963 // and MSVC compilers for x86/x64 are slow for this branch.
 1964  // but if we have a large number of registers, this branch can be faster.
152 1965
153 for (i = 0; i < 8; i++) 1966 #define REP4(mac, dest, src, a, b, c, d) \
1967 mac(dest, src, a) \
1968 mac(dest, src, b) \
1969 mac(dest, src, c) \
1970 mac(dest, src, d)
1971
1972 #define G2(index, a, b, c, d, x, y) \
1973 ADD_SIGMA (a, (index) + 2 * 0) \
1974 ADD_SIGMA (b, (index) + 2 * 1) \
1975 ADD_SIGMA (c, (index) + 2 * 2) \
1976 ADD_SIGMA (d, (index) + 2 * 3) \
1977 REP4 (ADD32M, 0, 1, a, b, c, d) \
1978 REP4 (XOR32M, 3, 0, a, b, c, d) \
1979 REP4 (RTR32M, 3, x, a, b, c, d) \
1980 REP4 (ADD32M, 2, 3, a, b, c, d) \
1981 REP4 (XOR32M, 1, 2, a, b, c, d) \
1982 REP4 (RTR32M, 1, y, a, b, c, d) \
1983
1984 #define G(a, b, c, d) \
1985 G2(a * 2 , a, b, c, d, 16, 12) \
1986 G2(a * 2 + 1, a, b, c, d, 8, 7) \
1987
1988 #define R2 \
1989 G(0, 1, 2, 3) \
1990 G(4, 5, 6, 7) \
1991
1992#endif
1993
1994 #define R(r) { const Byte *sigma = k_Blake2s_Sigma_4[r]; R2 }
1995
1996 // Z7_BLAKE2S_UNROLL gives 5-6 KB larger code, but faster:
1997 // 20-40% faster for (x86/x64) VC2010+/GCC/CLANG.
1998 // 30-60% faster for (arm64-arm32) GCC.
1999 // 5-11% faster for (arm64) CLANG-MAC.
 2000  // so Z7_BLAKE2S_UNROLL is a good optimization, if there is no vector branch.
 2001  // But if there is a vector branch (for x86*), this scalar code will mostly be unused.
2002 // So we want smaller code (without unrolling) in that case (x86*).
2003#if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
2004 #define Z7_BLAKE2S_UNROLL
2005#endif
2006
2007#ifdef Z7_BLAKE2S_UNROLL
2008 ROUNDS_LOOP_UNROLLED (R)
2009#else
2010 ROUNDS_LOOP (R)
2011#endif
2012
2013 #undef G
2014 #undef G2
2015 #undef R
2016 #undef R2
2017
2018 // printf("\n v after: \n");
2019 // PrintStates2((const UInt32 *)v, 1, 16);
2020#define XOR_s_PAIR_v(i) s[i] ^= v[i] ^ v[i + 8];
2021
2022 LOOP_8(XOR_s_PAIR_v)
2023 // printf("\n s after:\n");
2024 // PrintStates2((const UInt32 *)s, 1, 16);
2025}
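/* For reference, a hedged scalar sketch (not part of the original source) of the
   BLAKE2s G mixing function that each ADD_SIGMA / ADD32M / XOR32M / RTR32M chain
   above expands to, for one column or diagonal (a, b, c, d) and two message
   words mx, my; the rotation constants are 16, 12, 8, 7. The helper name is
   hypothetical. */

static void Blake2s_G_Reference(UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d,
    UInt32 mx, UInt32 my)
{
  *a += *b + mx;  *d = rotrFixed(*d ^ *a, 16);
  *c += *d;       *b = rotrFixed(*b ^ *c, 12);
  *a += *b + my;  *d = rotrFixed(*d ^ *a, 8);
  *c += *d;       *b = rotrFixed(*b ^ *c, 7);
}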
2026
2027
2028static
2029Z7_NO_INLINE
2030void
2031Z7_FASTCALL
2032Blake2sp_Compress2(UInt32 *s_items, const Byte *data, const Byte *end)
2033{
2034 size_t pos = 0;
2035 // PrintStates2(s_items, 8, 16);
2036 do
154 { 2037 {
155 SetUi32(digest + sizeof(p->h[i]) * i, p->h[i]) 2038 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
2039 Blake2s_Compress(s, data);
2040 data += Z7_BLAKE2S_BLOCK_SIZE;
2041 pos += Z7_BLAKE2S_BLOCK_SIZE;
2042 pos &= SUPER_BLOCK_MASK;
156 } 2043 }
2044 while (data != end);
157} 2045}
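/* Hedged sketch (not part of the original source): the block striping performed
   by Blake2sp_Compress2, assuming GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos)
   resolves to s_items + (pos / Z7_BLAKE2S_BLOCK_SIZE) * NSW. Block number k of
   the data passed in (always superblock-aligned in the stream) is absorbed by
   leaf state (k % Z7_BLAKE2SP_PARALLEL_DEGREE). The function name is hypothetical. */

static void Blake2sp_Compress2_Reference(UInt32 *s_items, const Byte *data, size_t numBlocks)
{
  size_t k;
  for (k = 0; k < numBlocks; k++)
    Blake2s_Compress(s_items + (k % Z7_BLAKE2SP_PARALLEL_DEGREE) * NSW,
        data + k * Z7_BLAKE2S_BLOCK_SIZE);
}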
158 2046
159 2047
160/* ---------- BLAKE2s ---------- */ 2048#ifdef Z7_BLAKE2S_USE_VECTORS
2049
2050static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = Blake2sp_Compress2;
2051static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = Blake2sp_Compress2;
2052static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Init;
2053static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Final;
2054static unsigned g_z7_Blake2sp_SupportedFlags;
2055
2056 #define Z7_BLAKE2SP_Compress_Fast(p) (p)->u.header.func_Compress_Fast
2057 #define Z7_BLAKE2SP_Compress_Single(p) (p)->u.header.func_Compress_Single
2058#else
2059 #define Z7_BLAKE2SP_Compress_Fast(p) Blake2sp_Compress2
2060 #define Z7_BLAKE2SP_Compress_Single(p) Blake2sp_Compress2
2061#endif // Z7_BLAKE2S_USE_VECTORS
2062
161 2063
162/* we need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0() */ 2064#if 1 && defined(MY_CPU_LE)
2065 #define GET_DIGEST(_s, _digest) \
2066 { memcpy(_digest, _s, Z7_BLAKE2S_DIGEST_SIZE); }
2067#else
2068 #define GET_DIGEST(_s, _digest) \
2069 { unsigned _i; for (_i = 0; _i < 8; _i++) \
2070 { SetUi32((_digest) + 4 * _i, (_s)[_i]) } \
2071 }
2072#endif
2073
2074
2075/* ---------- BLAKE2s ---------- */
163/* 2076/*
 2077// we need to xor CBlake2s::h[i] with the input parameter block after Blake2s_Init0()
164typedef struct 2078typedef struct
165{ 2079{
166 Byte digest_length; 2080 Byte digest_length;
167 Byte key_length; 2081 Byte key_length;
168 Byte fanout; 2082 Byte fanout; // = 1 : in sequential mode
169 Byte depth; 2083 Byte depth; // = 1 : in sequential mode
170 UInt32 leaf_length; 2084 UInt32 leaf_length;
171 Byte node_offset[6]; 2085 Byte node_offset[6]; // 0 for the first, leftmost, leaf, or in sequential mode
172 Byte node_depth; 2086 Byte node_depth; // 0 for the leaves, or in sequential mode
173 Byte inner_length; 2087 Byte inner_length; // [0, 32], 0 in sequential mode
174 Byte salt[BLAKE2S_SALTBYTES]; 2088 Byte salt[BLAKE2S_SALTBYTES];
175 Byte personal[BLAKE2S_PERSONALBYTES]; 2089 Byte personal[BLAKE2S_PERSONALBYTES];
176} CBlake2sParam; 2090} CBlake2sParam;
177*/ 2091*/
178 2092
2093#define k_Blake2sp_IV_0 \
2094 (KIV(0) ^ (Z7_BLAKE2S_DIGEST_SIZE | ((UInt32)Z7_BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24)))
2095#define k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth) \
2096 (KIV(3) ^ ((UInt32)(node_depth) << 16) ^ ((UInt32)Z7_BLAKE2S_DIGEST_SIZE << 24))
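/* Hedged note (not part of the original source): how the two constants above map
   onto the BLAKE2s parameter block shown in the commented-out CBlake2sParam
   struct above. Parameter word 0 packs
   digest_length | (key_length << 8) | (fanout << 16) | (depth << 24),
   and h[i] = IV[i] ^ param_word[i]. The helper name is hypothetical. */

static UInt32 Blake2s_ParamWord0(unsigned digestLen, unsigned keyLen,
    unsigned fanout, unsigned depth)
{
  return (UInt32)digestLen
       | ((UInt32)keyLen << 8)
       | ((UInt32)fanout << 16)
       | ((UInt32)depth  << 24);
}
/* so k_Blake2sp_IV_0 == KIV(0) ^ Blake2s_ParamWord0(Z7_BLAKE2S_DIGEST_SIZE, 0,
   Z7_BLAKE2SP_PARALLEL_DEGREE, 2) for the leaf states (fanout = 8, depth = 2). */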
179 2097
180static void Blake2sp_Init_Spec(CBlake2s *p, unsigned node_offset, unsigned node_depth) 2098Z7_FORCE_INLINE
2099static void Blake2sp_Init_Spec(UInt32 *s, unsigned node_offset, unsigned node_depth)
181{ 2100{
182 Blake2s_Init0(p); 2101 s[0] = k_Blake2sp_IV_0;
183 2102 s[1] = KIV(1);
184 p->h[0] ^= (BLAKE2S_DIGEST_SIZE | ((UInt32)BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24)); 2103 s[2] = KIV(2) ^ (UInt32)node_offset;
185 p->h[2] ^= ((UInt32)node_offset); 2104 s[3] = k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth);
186 p->h[3] ^= ((UInt32)node_depth << 16) | ((UInt32)BLAKE2S_DIGEST_SIZE << 24); 2105 s[4] = KIV(4);
187 /* 2106 s[5] = KIV(5);
188 P->digest_length = BLAKE2S_DIGEST_SIZE; 2107 s[6] = KIV(6);
189 P->key_length = 0; 2108 s[7] = KIV(7);
190 P->fanout = BLAKE2SP_PARALLEL_DEGREE; 2109
191 P->depth = 2; 2110 STATE_T(s)[0] = 0;
192 P->leaf_length = 0; 2111 STATE_T(s)[1] = 0;
193 store48(P->node_offset, node_offset); 2112 STATE_F(s)[0] = 0;
194 P->node_depth = node_depth; 2113 STATE_F(s)[1] = 0;
195 P->inner_length = BLAKE2S_DIGEST_SIZE;
196 */
197} 2114}
198 2115
199 2116
2117#ifdef Z7_BLAKE2S_USE_V128_FAST
2118
2119static
2120Z7_NO_INLINE
2121#ifdef BLAKE2S_ATTRIB_128BIT
2122 BLAKE2S_ATTRIB_128BIT
2123#endif
2124void
2125Z7_FASTCALL
2126Blake2sp_InitState_V128_Fast(UInt32 *states)
2127{
2128#define STORE_128_PAIR_INIT_STATES_2(i, t0, t1) \
2129 { STORE_128_TO_STRUCT(states + 0 + 4 * (i), (t0)); \
2130 STORE_128_TO_STRUCT(states + 32 + 4 * (i), (t1)); \
2131 }
2132#define STORE_128_PAIR_INIT_STATES_1(i, mac) \
2133 { const __m128i t = mac; \
2134 STORE_128_PAIR_INIT_STATES_2(i, t, t) \
2135 }
2136#define STORE_128_PAIR_INIT_STATES_IV(i) \
2137 STORE_128_PAIR_INIT_STATES_1(i, GET_128_IV_WAY4(i))
2138
2139 STORE_128_PAIR_INIT_STATES_1 (0, _mm_set1_epi32((Int32)k_Blake2sp_IV_0))
2140 STORE_128_PAIR_INIT_STATES_IV (1)
2141 {
2142 const __m128i t = GET_128_IV_WAY4(2);
2143 STORE_128_PAIR_INIT_STATES_2 (2,
2144 XOR_128(t, _mm_set_epi32(3, 2, 1, 0)),
2145 XOR_128(t, _mm_set_epi32(7, 6, 5, 4)))
2146 }
2147 STORE_128_PAIR_INIT_STATES_1 (3, _mm_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
2148 STORE_128_PAIR_INIT_STATES_IV (4)
2149 STORE_128_PAIR_INIT_STATES_IV (5)
2150 STORE_128_PAIR_INIT_STATES_IV (6)
2151 STORE_128_PAIR_INIT_STATES_IV (7)
2152 STORE_128_PAIR_INIT_STATES_1 (16, _mm_set_epi32(0, 0, 0, 0))
2153 // printf("\n== exit Blake2sp_InitState_V128_Fast ctr=%d\n", states[64]);
2154}
2155
2156#endif // Z7_BLAKE2S_USE_V128_FAST
2157
2158
2159#ifdef Z7_BLAKE2S_USE_AVX2_FAST
2160
2161static
2162Z7_NO_INLINE
2163#ifdef BLAKE2S_ATTRIB_AVX2
2164 BLAKE2S_ATTRIB_AVX2
2165#endif
2166void
2167Z7_FASTCALL
2168Blake2sp_InitState_AVX2_Fast(UInt32 *states)
2169{
2170#define STORE_256_INIT_STATES(i, t) \
2171 STORE_256_TO_STRUCT(states + 8 * (i), t);
2172#define STORE_256_INIT_STATES_IV(i) \
2173 STORE_256_INIT_STATES(i, GET_256_IV_WAY8(i))
2174
2175 STORE_256_INIT_STATES (0, _mm256_set1_epi32((Int32)k_Blake2sp_IV_0))
2176 STORE_256_INIT_STATES_IV (1)
2177 STORE_256_INIT_STATES (2, XOR_256( GET_256_IV_WAY8(2),
2178 _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)))
2179 STORE_256_INIT_STATES (3, _mm256_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
2180 STORE_256_INIT_STATES_IV (4)
2181 STORE_256_INIT_STATES_IV (5)
2182 STORE_256_INIT_STATES_IV (6)
2183 STORE_256_INIT_STATES_IV (7)
2184 STORE_256_INIT_STATES (8, _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0))
2185 // printf("\n== exit Blake2sp_InitState_AVX2_Fast\n");
2186}
2187
2188#endif // Z7_BLAKE2S_USE_AVX2_FAST
2189
2190
2191
2192Z7_NO_INLINE
2193void Blake2sp_InitState(CBlake2sp *p)
2194{
2195 size_t i;
2196 // memset(p->states, 0, sizeof(p->states)); // for debug
2197 p->u.header.cycPos = 0;
2198#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2199 if (p->u.header.func_Init)
2200 {
2201 p->u.header.func_Init(p->states);
2202 return;
2203 }
2204#endif
2205 for (i = 0; i < Z7_BLAKE2SP_PARALLEL_DEGREE; i++)
2206 Blake2sp_Init_Spec(p->states + i * NSW, (unsigned)i, 0);
2207}
2208
200void Blake2sp_Init(CBlake2sp *p) 2209void Blake2sp_Init(CBlake2sp *p)
201{ 2210{
202 unsigned i; 2211#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
203 2212 p->u.header.func_Compress_Fast =
204 p->bufPos = 0; 2213#ifdef Z7_BLAKE2S_USE_VECTORS
2214 g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
2215#else
2216 NULL;
2217#endif
2218
2219 p->u.header.func_Compress_Single =
2220#ifdef Z7_BLAKE2S_USE_VECTORS
2221 g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
2222#else
2223 NULL;
2224#endif
2225
2226 p->u.header.func_Init =
2227#ifdef Z7_BLAKE2S_USE_VECTORS
2228 g_Z7_BLAKE2SP_FUNC_INIT_Init;
2229#else
2230 NULL;
2231#endif
205 2232
206 for (i = 0; i < BLAKE2SP_PARALLEL_DEGREE; i++) 2233 p->u.header.func_Final =
207 Blake2sp_Init_Spec(&p->S[i], i, 0); 2234#ifdef Z7_BLAKE2S_USE_VECTORS
2235 g_Z7_BLAKE2SP_FUNC_INIT_Final;
2236#else
2237 NULL;
2238#endif
2239#endif
208 2240
209 p->S[BLAKE2SP_PARALLEL_DEGREE - 1].lastNode_f1 = BLAKE2S_FINAL_FLAG; 2241 Blake2sp_InitState(p);
210} 2242}
211 2243
212 2244
213void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size) 2245void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size)
214{ 2246{
215 unsigned pos = p->bufPos; 2247 size_t pos;
216 while (size != 0) 2248 // printf("\nsize = 0x%6x, cycPos = %5u data = %p\n", (unsigned)size, (unsigned)p->u.header.cycPos, data);
2249 if (size == 0)
2250 return;
2251 pos = p->u.header.cycPos;
2252 // pos < SUPER_BLOCK_SIZE * 2 : is expected
 2253  // pos == SUPER_BLOCK_SIZE * 2 : is not expected, but is also supported
2254 {
2255 const size_t pos2 = pos & SUPER_BLOCK_MASK;
2256 if (pos2)
2257 {
2258 const size_t rem = SUPER_BLOCK_SIZE - pos2;
2259 if (rem > size)
2260 {
2261 p->u.header.cycPos = (unsigned)(pos + size);
2262 // cycPos < SUPER_BLOCK_SIZE * 2
2263 memcpy((Byte *)(void *)p->buf32 + pos, data, size);
 2264  /* to simplify the code here we don't try to process the first superblock,
2265 if (cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) */
2266 return;
2267 }
2268 // (rem <= size)
2269 memcpy((Byte *)(void *)p->buf32 + pos, data, rem);
2270 pos += rem;
2271 data += rem;
2272 size -= rem;
2273 }
2274 }
2275
2276 // pos <= SUPER_BLOCK_SIZE * 2
2277 // pos % SUPER_BLOCK_SIZE == 0
2278 if (pos)
2279 {
2280 /* pos == SUPER_BLOCK_SIZE ||
2281 pos == SUPER_BLOCK_SIZE * 2 */
2282 size_t end = pos;
2283 if (size > SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE
2284 || (end -= SUPER_BLOCK_SIZE))
2285 {
2286 Z7_BLAKE2SP_Compress_Fast(p)(p->states,
2287 (const Byte *)(const void *)p->buf32,
2288 (const Byte *)(const void *)p->buf32 + end);
2289 if (pos -= end)
2290 memcpy(p->buf32, (const Byte *)(const void *)p->buf32
2291 + SUPER_BLOCK_SIZE, SUPER_BLOCK_SIZE);
2292 }
2293 }
2294
2295 // pos == 0 || (pos == SUPER_BLOCK_SIZE && size <= SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE)
2296 if (size > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
2297 {
2298 // pos == 0
2299 const Byte *end;
2300 const size_t size2 = (size - (SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE + 1))
2301 & ~(size_t)SUPER_BLOCK_MASK;
2302 size -= size2;
2303 // size < SUPER_BLOCK_SIZE * 2
2304 end = data + size2;
2305 Z7_BLAKE2SP_Compress_Fast(p)(p->states, data, end);
2306 data = end;
2307 }
2308
2309 if (size != 0)
217 { 2310 {
218 unsigned index = pos / BLAKE2S_BLOCK_SIZE; 2311 memcpy((Byte *)(void *)p->buf32 + pos, data, size);
219 unsigned rem = BLAKE2S_BLOCK_SIZE - (pos & (BLAKE2S_BLOCK_SIZE - 1)); 2312 pos += size;
220 if (rem > size)
221 rem = (unsigned)size;
222 Blake2s_Update(&p->S[index], data, rem);
223 size -= rem;
224 data += rem;
225 pos += rem;
226 pos &= (BLAKE2S_BLOCK_SIZE * BLAKE2SP_PARALLEL_DEGREE - 1);
227 } 2313 }
228 p->bufPos = pos; 2314 p->u.header.cycPos = (unsigned)pos;
2315 // cycPos < SUPER_BLOCK_SIZE * 2
229} 2316}
230 2317
231 2318
232void Blake2sp_Final(CBlake2sp *p, Byte *digest) 2319void Blake2sp_Final(CBlake2sp *p, Byte *digest)
233{ 2320{
234 CBlake2s R; 2321 // UInt32 * const R_states = p->states;
235 unsigned i; 2322 // printf("\nBlake2sp_Final \n");
2323#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2324 if (p->u.header.func_Final)
2325 p->u.header.func_Final(p->states);
2326#endif
2327 // printf("\n=====\nBlake2sp_Final \n");
2328 // PrintStates(p->states, 32);
2329
2330 // (p->u.header.cycPos == SUPER_BLOCK_SIZE) can be processed in any branch:
2331 if (p->u.header.cycPos <= SUPER_BLOCK_SIZE)
2332 {
2333 unsigned pos;
2334 memset((Byte *)(void *)p->buf32 + p->u.header.cycPos,
2335 0, SUPER_BLOCK_SIZE - p->u.header.cycPos);
2336 STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2337 for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2338 {
2339 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
2340 Blake2s_Set_LastBlock(s)
2341 if (pos + Z7_BLAKE2S_BLOCK_SIZE > p->u.header.cycPos)
2342 {
2343 UInt32 delta = Z7_BLAKE2S_BLOCK_SIZE;
2344 if (pos < p->u.header.cycPos)
2345 delta -= p->u.header.cycPos & (Z7_BLAKE2S_BLOCK_SIZE - 1);
2346 // 0 < delta <= Z7_BLAKE2S_BLOCK_SIZE
2347 {
2348 const UInt32 v = STATE_T(s)[0];
 2349  STATE_T(s)[1] -= v < delta; // (v < delta) is the same condition here as (v == 0)
2350 STATE_T(s)[0] = v - delta;
2351 }
2352 }
2353 }
2354 // PrintStates(p->states, 16);
2355 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2356 (Byte *)(void *)p->buf32,
2357 (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);
2358 // PrintStates(p->states, 16);
2359 }
2360 else
2361 {
2362 // (p->u.header.cycPos > SUPER_BLOCK_SIZE)
2363 unsigned pos;
2364 for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2365 {
2366 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
2367 if (pos + SUPER_BLOCK_SIZE >= p->u.header.cycPos)
2368 Blake2s_Set_LastBlock(s)
2369 }
2370 if (p->u.header.cycPos <= SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
2371 STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2372
2373 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2374 (Byte *)(void *)p->buf32,
2375 (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);
236 2376
237 Blake2sp_Init_Spec(&R, 0, 1); 2377 // if (p->u.header.cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
238 R.lastNode_f1 = BLAKE2S_FINAL_FLAG; 2378 STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2379
2380 // if (p->u.header.cycPos != SUPER_BLOCK_SIZE)
2381 {
2382 pos = SUPER_BLOCK_SIZE;
2383 for (;;)
2384 {
2385 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos & SUPER_BLOCK_MASK);
2386 Blake2s_Set_LastBlock(s)
2387 pos += Z7_BLAKE2S_BLOCK_SIZE;
2388 if (pos >= p->u.header.cycPos)
2389 {
2390 if (pos != p->u.header.cycPos)
2391 {
2392 const UInt32 delta = pos - p->u.header.cycPos;
2393 const UInt32 v = STATE_T(s)[0];
2394 STATE_T(s)[1] -= v < delta;
2395 STATE_T(s)[0] = v - delta;
2396 memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, 0, delta);
2397 }
2398 break;
2399 }
2400 }
2401 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2402 (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE,
2403 (Byte *)(void *)p->buf32 + pos);
2404 }
2405 }
239 2406
240 for (i = 0; i < BLAKE2SP_PARALLEL_DEGREE; i++)
241 { 2407 {
242 Byte hash[BLAKE2S_DIGEST_SIZE]; 2408 size_t pos;
243 Blake2s_Final(&p->S[i], hash); 2409 for (pos = 0; pos < SUPER_BLOCK_SIZE / 2; pos += Z7_BLAKE2S_BLOCK_SIZE / 2)
244 Blake2s_Update(&R, hash, BLAKE2S_DIGEST_SIZE); 2410 {
2411 const UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, (pos * 2));
2412 Byte *dest = (Byte *)(void *)p->buf32 + pos;
2413 GET_DIGEST(s, dest)
2414 }
245 } 2415 }
2416 Blake2sp_Init_Spec(p->states, 0, 1);
2417 {
2418 size_t pos;
2419 for (pos = 0; pos < (Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2S_DIGEST_SIZE)
2420 - Z7_BLAKE2S_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2421 {
2422 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2423 (const Byte *)(const void *)p->buf32 + pos,
2424 (const Byte *)(const void *)p->buf32 + pos + Z7_BLAKE2S_BLOCK_SIZE);
2425 }
2426 }
2427 // Blake2s_Final(p->states, 0, digest, p, (Byte *)(void *)p->buf32 + i);
2428 Blake2s_Set_LastBlock(p->states)
2429 STATE_F(p->states)[1] = BLAKE2S_FINAL_FLAG;
2430 {
2431 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2432 (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE,
2433 (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE);
2434 }
2435 GET_DIGEST(p->states, digest)
2436 // printf("\n Blake2sp_Final 555 numDataInBufs = %5u\n", (unsigned)p->u.header.numDataInBufs);
2437}
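/* A minimal usage sketch (not part of the original source), using only the
   functions defined in this file and declared in Blake2.h; the example function
   name is hypothetical. The digest buffer must hold Z7_BLAKE2S_DIGEST_SIZE (32)
   bytes. */

static void Blake2sp_HashBuf_Example(Byte *digest, const Byte *data, size_t size)
{
  CBlake2sp p;
  Blake2sp_Init(&p);            /* installs the default code path and initializes the state */
  Blake2sp_Update(&p, data, size);
  Blake2sp_Final(&p, digest);
}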
2438
246 2439
247 Blake2s_Final(&R, digest); 2440BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo)
2441{
2442 // printf("\n========== setfunction = %d ======== \n", algo);
2443#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2444 Z7_BLAKE2SP_FUNC_COMPRESS func = NULL;
2445 Z7_BLAKE2SP_FUNC_COMPRESS func_Single = NULL;
2446 Z7_BLAKE2SP_FUNC_INIT func_Final = NULL;
2447 Z7_BLAKE2SP_FUNC_INIT func_Init = NULL;
2448#else
2449 UNUSED_VAR(p)
2450#endif
2451
2452#ifdef Z7_BLAKE2S_USE_VECTORS
2453
2454 func = func_Single = Blake2sp_Compress2;
2455
2456 if (algo != Z7_BLAKE2SP_ALGO_SCALAR)
2457 {
 2458  // printf("\n========== setfunction NON-SCALAR ======== \n");
2459 if (algo == Z7_BLAKE2SP_ALGO_DEFAULT)
2460 {
2461 func = g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
2462 func_Single = g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
2463 func_Init = g_Z7_BLAKE2SP_FUNC_INIT_Init;
2464 func_Final = g_Z7_BLAKE2SP_FUNC_INIT_Final;
2465 }
2466 else
2467 {
2468 if ((g_z7_Blake2sp_SupportedFlags & (1u << algo)) == 0)
2469 return False;
2470
2471#ifdef Z7_BLAKE2S_USE_AVX2
2472
2473 func_Single =
2474#if defined(Z7_BLAKE2S_USE_AVX2_WAY2)
2475 Blake2sp_Compress2_AVX2_Way2;
2476#else
2477 Z7_BLAKE2S_Compress2_V128;
2478#endif
2479
2480#ifdef Z7_BLAKE2S_USE_AVX2_FAST
2481 if (algo == Z7_BLAKE2SP_ALGO_V256_FAST)
2482 {
2483 func = Blake2sp_Compress2_AVX2_Fast;
2484 func_Final = Blake2sp_Final_AVX2_Fast;
2485 func_Init = Blake2sp_InitState_AVX2_Fast;
2486 }
2487 else
2488#endif
2489#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
2490 if (algo == Z7_BLAKE2SP_ALGO_V256_WAY2)
2491 func = Blake2sp_Compress2_AVX2_Way2;
2492 else
2493#endif
2494#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
2495 if (algo == Z7_BLAKE2SP_ALGO_V256_WAY4)
2496 {
2497 func_Single = func = Blake2sp_Compress2_AVX2_Way4;
2498 }
2499 else
2500#endif
2501#endif // avx2
2502 {
2503 if (algo == Z7_BLAKE2SP_ALGO_V128_FAST)
2504 {
2505 func = Blake2sp_Compress2_V128_Fast;
2506 func_Final = Blake2sp_Final_V128_Fast;
2507 func_Init = Blake2sp_InitState_V128_Fast;
2508 func_Single = Z7_BLAKE2S_Compress2_V128;
2509 }
2510 else
2511#ifdef Z7_BLAKE2S_USE_V128_WAY2
2512 if (algo == Z7_BLAKE2SP_ALGO_V128_WAY2)
2513 func = func_Single = Blake2sp_Compress2_V128_Way2;
2514 else
2515#endif
2516 {
2517 if (algo != Z7_BLAKE2SP_ALGO_V128_WAY1)
2518 return False;
2519 func = func_Single = Blake2sp_Compress2_V128_Way1;
2520 }
2521 }
2522 }
2523 }
2524#else // !VECTORS
2525 if (algo > 1) // Z7_BLAKE2SP_ALGO_SCALAR
2526 return False;
2527#endif // !VECTORS
2528
2529#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2530 p->u.header.func_Compress_Fast = func;
2531 p->u.header.func_Compress_Single = func_Single;
2532 p->u.header.func_Final = func_Final;
2533 p->u.header.func_Init = func_Init;
2534#endif
2535 // printf("\n p->u.header.func_Compress = %p", p->u.header.func_Compress);
2536 return True;
2537}
2538
2539
2540void z7_Black2sp_Prepare(void)
2541{
2542#ifdef Z7_BLAKE2S_USE_VECTORS
2543 unsigned flags = 0; // (1u << Z7_BLAKE2SP_ALGO_V128_SCALAR);
2544
2545 Z7_BLAKE2SP_FUNC_COMPRESS func_Fast = Blake2sp_Compress2;
2546 Z7_BLAKE2SP_FUNC_COMPRESS func_Single = Blake2sp_Compress2;
2547 Z7_BLAKE2SP_FUNC_INIT func_Init = NULL;
2548 Z7_BLAKE2SP_FUNC_INIT func_Final = NULL;
2549
2550#if defined(MY_CPU_X86_OR_AMD64)
2551 #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
2552 if (CPU_IsSupported_AVX512F_AVX512VL())
2553 #endif
2554 #if defined(Z7_BLAKE2S_USE_SSE41)
2555 if (CPU_IsSupported_SSE41())
2556 #elif defined(Z7_BLAKE2S_USE_SSSE3)
2557 if (CPU_IsSupported_SSSE3())
2558 #elif !defined(MY_CPU_AMD64)
2559 if (CPU_IsSupported_SSE2())
2560 #endif
2561#endif
2562 {
2563 #if defined(Z7_BLAKE2S_USE_SSE41)
2564 // printf("\n========== Blake2s SSE41 128-bit\n");
2565 #elif defined(Z7_BLAKE2S_USE_SSSE3)
2566 // printf("\n========== Blake2s SSSE3 128-bit\n");
2567 #else
2568 // printf("\n========== Blake2s SSE2 128-bit\n");
2569 #endif
2570 // func_Fast = f_vector = Blake2sp_Compress2_V128_Way2;
2571 // printf("\n========== Blake2sp_Compress2_V128_Way2\n");
2572 func_Fast =
2573 func_Single = Z7_BLAKE2S_Compress2_V128;
2574 flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY1);
2575#ifdef Z7_BLAKE2S_USE_V128_WAY2
2576 flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY2);
2577#endif
2578#ifdef Z7_BLAKE2S_USE_V128_FAST
2579 flags |= (1u << Z7_BLAKE2SP_ALGO_V128_FAST);
2580 func_Fast = Blake2sp_Compress2_V128_Fast;
2581 func_Init = Blake2sp_InitState_V128_Fast;
2582 func_Final = Blake2sp_Final_V128_Fast;
2583#endif
2584
2585#ifdef Z7_BLAKE2S_USE_AVX2
2586#if defined(MY_CPU_X86_OR_AMD64)
2587 if (
2588 #if 0 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
2589 CPU_IsSupported_AVX512F_AVX512VL() &&
2590 #endif
2591 CPU_IsSupported_AVX2()
2592 )
2593#endif
2594 {
2595 // #pragma message ("=== Blake2s AVX2")
2596 // printf("\n========== Blake2s AVX2\n");
2597
2598#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
2599 func_Single = Blake2sp_Compress2_AVX2_Way2;
2600 flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY2);
2601#endif
2602#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
2603 flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY4);
2604#endif
2605
2606#ifdef Z7_BLAKE2S_USE_AVX2_FAST
2607 flags |= (1u << Z7_BLAKE2SP_ALGO_V256_FAST);
2608 func_Fast = Blake2sp_Compress2_AVX2_Fast;
2609 func_Init = Blake2sp_InitState_AVX2_Fast;
2610 func_Final = Blake2sp_Final_AVX2_Fast;
2611#elif defined(Z7_BLAKE2S_USE_AVX2_WAY4)
2612 func_Fast = Blake2sp_Compress2_AVX2_Way4;
2613#elif defined(Z7_BLAKE2S_USE_AVX2_WAY2)
2614 func_Fast = Blake2sp_Compress2_AVX2_Way2;
2615#endif
2616 } // avx2
2617#endif // avx2
2618 } // sse*
2619 g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = func_Fast;
2620 g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = func_Single;
2621 g_Z7_BLAKE2SP_FUNC_INIT_Init = func_Init;
2622 g_Z7_BLAKE2SP_FUNC_INIT_Final = func_Final;
2623 g_z7_Blake2sp_SupportedFlags = flags;
2624 // printf("\nflags=%x\n", flags);
2625#endif // vectors
248} 2626}
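/* A hedged sketch (not part of the original source) of selecting a specific code
   path at run time with the functions above; the Z7_BLAKE2SP_ALGO_* values are
   the ones referenced in Blake2sp_SetFunction, and the example function name is
   hypothetical. */

static BoolInt Blake2sp_SelectAvx2Fast_Example(CBlake2sp *p)
{
  z7_Black2sp_Prepare();     /* detect CPU features and set the global defaults */
  Blake2sp_Init(p);          /* install the detected default function pointers */
  if (!Blake2sp_SetFunction(p, Z7_BLAKE2SP_ALGO_V256_FAST))
    return False;            /* algo not supported by this CPU / build */
  Blake2sp_InitState(p);     /* re-initialize the state with the selected func_Init */
  return True;
}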
249 2627
250#undef rotr32 2628/*
2629#ifdef Z7_BLAKE2S_USE_VECTORS
2630void align_test2(CBlake2sp *sp);
2631void align_test2(CBlake2sp *sp)
2632{
2633 __m128i a = LOAD_128(sp->states);
2634 D_XOR_128(a, LOAD_128(sp->states + 4));
2635 STORE_128(sp->states, a);
2636}
2637void align_test2(void);
2638void align_test2(void)
2639{
2640 CBlake2sp sp;
2641 Blake2sp_Init(&sp);
2642 Blake2sp_Update(&sp, NULL, 0);
2643}
2644#endif
2645*/