Diffstat (limited to 'C/Sha256Opt.c')
-rw-r--r-- | C/Sha256Opt.c | 172
1 file changed, 78 insertions, 94 deletions
diff --git a/C/Sha256Opt.c b/C/Sha256Opt.c
index eb38166..1c6b50f 100644
--- a/C/Sha256Opt.c
+++ b/C/Sha256Opt.c
@@ -1,18 +1,11 @@
 /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
-2024-03-01 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */

 #include "Precomp.h"
 #include "Compiler.h"
 #include "CpuArch.h"

-#if defined(_MSC_VER)
-#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
-// #define USE_MY_MM
-#endif
-#endif
-
 // #define Z7_USE_HW_SHA_STUB // for debug
-
 #ifdef MY_CPU_X86_OR_AMD64
 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
 #define USE_HW_SHA
@@ -20,19 +13,14 @@
 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
 #define USE_HW_SHA
-#if !defined(_INTEL_COMPILER)
+#if !defined(__INTEL_COMPILER)
 // icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
 #if !defined(__SHA__) || !defined(__SSSE3__)
 #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
 #endif
 #endif
 #elif defined(_MSC_VER)
-#ifdef USE_MY_MM
-#define USE_VER_MIN 1300
-#else
-#define USE_VER_MIN 1900
-#endif
-#if (_MSC_VER >= USE_VER_MIN)
+#if (_MSC_VER >= 1900)
 #define USE_HW_SHA
 #else
 #define Z7_USE_HW_SHA_STUB
@@ -47,23 +35,20 @@

 // #pragma message("Sha256 HW")

+
+
+
 // sse/sse2/ssse3:
 #include <tmmintrin.h>
 // sha*:
 #include <immintrin.h>

 #if defined (__clang__) && defined(_MSC_VER)
-// #if !defined(__SSSE3__)
-// #endif
 #if !defined(__SHA__)
 #include <shaintrin.h>
 #endif
 #else

-#ifdef USE_MY_MM
-#include "My_mm.h"
-#endif
-
 #endif

 /*
@@ -91,60 +76,44 @@ SHA:
 extern
 MY_ALIGN(64)
 const UInt32 SHA256_K_ARRAY[64];
-
 #define K SHA256_K_ARRAY


 #define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
 #define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src);
-#define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);
-
+#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);

 #define LOAD_SHUFFLE(m, k) \
 m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
 m = _mm_shuffle_epi8(m, mask); \

-#define SM1(g0, g1, g2, g3) \
-SHA256_MSG1(g3, g0); \
+#define NNN(m0, m1, m2, m3)

-#define SM2(g0, g1, g2, g3) \
-tmp = _mm_alignr_epi8(g1, g0, 4); \
-ADD_EPI32(g2, tmp) \
-SHA25G_MSG2(g2, g1); \
-
-// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
-// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)
-
-
-#define NNN(g0, g1, g2, g3)
+#define SM1(m1, m2, m3, m0) \
+SHA256_MSG1(m0, m1); \

+#define SM2(m2, m3, m0, m1) \
+ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \
+SHA256_MSG2(m0, m3); \

 #define RND2(t0, t1) \
 t0 = _mm_sha256rnds2_epu32(t0, t1, msg);

-#define RND2_0(m, k) \
-msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
-RND2(state0, state1); \
-msg = _mm_shuffle_epi32(msg, 0x0E); \


-#define RND2_1 \
+#define R4(k, m0, m1, m2, m3, OP0, OP1) \
+msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \
+RND2(state0, state1); \
+msg = _mm_shuffle_epi32(msg, 0x0E); \
+OP0(m0, m1, m2, m3) \
 RND2(state1, state0); \
-
-
-// We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2
-
-#define R4(k, g0, g1, g2, g3, OP0, OP1) \
-RND2_0(g0, k) \
-OP0(g0, g1, g2, g3) \
-RND2_1 \
-OP1(g0, g1, g2, g3) \
+OP1(m0, m1, m2, m3) \

 #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
 R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
 R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
 R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
 R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \

 #define PREPARE_STATE \
 tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
@@ -161,8 +130,9 @@ ATTRIB_SHA
 void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
 {
 const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
-__m128i tmp;
-__m128i state0, state1;
+
+
+__m128i tmp, state0, state1;

 if (numBlocks == 0)
 return;
@@ -262,22 +232,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
 #define _ARM_USE_NEW_NEON_INTRINSICS
 #endif

-
-
-
-
 #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
 #include <arm64_neon.h>
 #else

-
-
-
-
-
-
-
-
 #if defined(__clang__) && __clang_major__ < 16
 #if !defined(__ARM_FEATURE_SHA2) && \
 !defined(__ARM_FEATURE_CRYPTO)
@@ -324,41 +282,70 @@ typedef uint32x4_t v128;
 // typedef __n128 v128; // MSVC

 #ifdef MY_CPU_BE
-#define MY_rev32_for_LE(x)
+#define MY_rev32_for_LE(x) x
 #else
-#define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
+#define MY_rev32_for_LE(x) vrev32q_u8(x)
 #endif

-#define LOAD_128(_p) (*(const v128 *)(const void *)(_p))
-#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)
+#if 1 // 0 for debug
+// for arm32: it works slower by some reason than direct code
+/*
+for arm32 it generates:
+MSVC-2022, GCC-9:
+vld1.32 {d18,d19}, [r10]
+vst1.32 {d4,d5}, [r3]
+vld1.8 {d20-d21}, [r4]
+there is no align hint (like [r10:128]). So instruction allows unaligned access
+*/
+#define LOAD_128_32(_p) vld1q_u32(_p)
+#define LOAD_128_8(_p) vld1q_u8 (_p)
+#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
+#else
+/*
+for arm32:
+MSVC-2022:
+vldm r10,{d18,d19}
+vstm r3,{d4,d5}
+does it require strict alignment?
+GCC-9:
+vld1.64 {d30-d31}, [r0:64]
+vldr d28, [r0, #16]
+vldr d29, [r0, #24]
+vst1.64 {d30-d31}, [r0:64]
+vstr d28, [r0, #16]
+vstr d29, [r0, #24]
+there is hint [r0:64], so does it requires 64-bit alignment.
+*/
+#define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p))
+#define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p))
+#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v)
+#endif

 #define LOAD_SHUFFLE(m, k) \
-m = LOAD_128((data + (k) * 16)); \
-MY_rev32_for_LE(m); \
+m = vreinterpretq_u32_u8( \
+MY_rev32_for_LE( \
+LOAD_128_8(data + (k) * 16))); \

 // K array must be aligned for 16-bytes at least.
 extern
 MY_ALIGN(64)
 const UInt32 SHA256_K_ARRAY[64];
-
 #define K SHA256_K_ARRAY

-
 #define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src);
-#define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
+#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);

-#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0)
-#define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1)
-#define NNN(g0, g1, g2, g3)
+#define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0)
+#define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1)
+#define NNN(m0, m1, m2, m3)

-
-#define R4(k, g0, g1, g2, g3, OP0, OP1) \
-msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
+#define R4(k, m0, m1, m2, m3, OP0, OP1) \
+msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \
 tmp = state0; \
 state0 = vsha256hq_u32( state0, state1, msg ); \
 state1 = vsha256h2q_u32( state1, tmp, msg ); \
-OP0(g0, g1, g2, g3); \
-OP1(g0, g1, g2, g3); \
+OP0(m0, m1, m2, m3); \
+OP1(m0, m1, m2, m3); \


 #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
@@ -379,8 +366,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
 if (numBlocks == 0)
 return;

-state0 = LOAD_128(&state[0]);
-state1 = LOAD_128(&state[4]);
+state0 = LOAD_128_32(&state[0]);
+state1 = LOAD_128_32(&state[4]);

 do
 {
@@ -408,8 +395,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
 }
 while (--numBlocks);

-STORE_128(&state[0], state0);
-STORE_128(&state[4], state1);
+STORE_128_32(&state[0], state0);
+STORE_128_32(&state[4], state1);
 }

 #endif // USE_HW_SHA
@@ -443,13 +430,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
 #endif


-
 #undef K
 #undef RND2
-#undef RND2_0
-#undef RND2_1
-
 #undef MY_rev32_for_LE
+
 #undef NNN
 #undef LOAD_128
 #undef STORE_128
@@ -457,7 +441,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
 #undef SM1
 #undef SM2

-#undef NNN
+
 #undef R4
 #undef R16
 #undef PREPARE_STATE
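
A minimal sketch (not part of the commit): the reworked x86 R4 macro above folds the old RND2_0/RND2_1 pair into a single four-round group. The standalone function below illustrates what R4(k, m0,m1,m2,m3, NNN, NNN) performs; the name Sha256_R4_sketch, the pointer-based signature, and the k4 parameter are illustrative only. state0/state1 are the two packed state halves as arranged by PREPARE_STATE, and k4 would point at &K[(k) * 4].

#include <immintrin.h>

// Illustrative expansion of R4(k, m0,m1,m2,m3, NNN, NNN) from the diff above
// (NNN expands to nothing; SM1/SM2 would interleave message-schedule updates
// between the two _mm_sha256rnds2_epu32 calls).
static inline void Sha256_R4_sketch(__m128i *state0, __m128i *state1,
                                    __m128i m0, const __m128i *k4)
{
  __m128i msg = _mm_add_epi32(m0, *k4);                    // W[4k..4k+3] + K[4k..4k+3]
  *state0 = _mm_sha256rnds2_epu32(*state0, *state1, msg);  // RND2(state0, state1): rounds 4k, 4k+1
  msg = _mm_shuffle_epi32(msg, 0x0E);                      // move the upper two W+K words into the low lanes
  *state1 = _mm_sha256rnds2_epu32(*state1, *state0, msg);  // RND2(state1, state0): rounds 4k+2, 4k+3
}

The NEON R4 in the second half of the diff is the direct analogue: vaddq_u32 for the W+K sum, then vsha256hq_u32 and vsha256h2q_u32 applied to state0/state1 with the saved tmp.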