path: root/C/Sha256Opt.c
author     Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2024-11-29 00:00:00 +0000
committer  Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2024-11-30 15:27:15 +0500
commit     e5431fa6f5505e385c6f9367260717e9c47dc2ee (patch)
tree       4cd2c2c3b225b48c8e7053432c41d7b6b6a3d5f8 /C/Sha256Opt.c
parent     e008ce3976c087bfd21344af8f00a23cf69d4174 (diff)
Diffstat (limited to 'C/Sha256Opt.c')
-rw-r--r--  C/Sha256Opt.c | 172
1 file changed, 78 insertions(+), 94 deletions(-)
diff --git a/C/Sha256Opt.c b/C/Sha256Opt.c
index eb38166..1c6b50f 100644
--- a/C/Sha256Opt.c
+++ b/C/Sha256Opt.c
@@ -1,18 +1,11 @@
 /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
-2024-03-01 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 #include "Compiler.h"
 #include "CpuArch.h"
 
-#if defined(_MSC_VER)
-#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
-// #define USE_MY_MM
-#endif
-#endif
-
 // #define Z7_USE_HW_SHA_STUB // for debug
-
 #ifdef MY_CPU_X86_OR_AMD64
   #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
     #define USE_HW_SHA
@@ -20,19 +13,14 @@
        || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
        || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
     #define USE_HW_SHA
-    #if !defined(_INTEL_COMPILER)
+    #if !defined(__INTEL_COMPILER)
     // icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
     #if !defined(__SHA__) || !defined(__SSSE3__)
       #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
     #endif
     #endif
   #elif defined(_MSC_VER)
-    #ifdef USE_MY_MM
-      #define USE_VER_MIN 1300
-    #else
-      #define USE_VER_MIN 1900
-    #endif
-    #if (_MSC_VER >= USE_VER_MIN)
+    #if (_MSC_VER >= 1900)
       #define USE_HW_SHA
     #else
       #define Z7_USE_HW_SHA_STUB
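
Note on the gate above (an illustrative sketch, not part of the commit): GCC and Clang reject SHA intrinsics unless either the whole translation unit is built with -msha -mssse3 (in which case __SHA__ and __SSSE3__ are predefined and no attribute is needed) or the individual function carries a target attribute; icc is excluded because it defines __GNUC__ yet does not support __attribute__((__target__)). A minimal demonstration of the pattern:

#include <immintrin.h>

#if defined(__GNUC__) && (!defined(__SHA__) || !defined(__SSSE3__))
#define ATTRIB_SHA_SKETCH __attribute__((__target__("sha,ssse3")))
#else
#define ATTRIB_SHA_SKETCH
#endif

// The attribute enables SHA/SSSE3 code generation for this one function,
// so the rest of the file can be compiled for a baseline CPU.
ATTRIB_SHA_SKETCH
static __m128i TwoRounds_Sketch(__m128i cdgh, __m128i abef, __m128i wk)
{
  return _mm_sha256rnds2_epu32(cdgh, abef, wk);
}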
@@ -47,23 +35,20 @@
 
 // #pragma message("Sha256 HW")
 
+
+
+
 // sse/sse2/ssse3:
 #include <tmmintrin.h>
 // sha*:
 #include <immintrin.h>
 
 #if defined (__clang__) && defined(_MSC_VER)
-  // #if !defined(__SSSE3__)
-  // #endif
   #if !defined(__SHA__)
   #include <shaintrin.h>
   #endif
 #else
 
-#ifdef USE_MY_MM
-#include "My_mm.h"
-#endif
-
 #endif
 
 /*
@@ -91,60 +76,44 @@ SHA:
 extern
 MY_ALIGN(64)
 const UInt32 SHA256_K_ARRAY[64];
-
 #define K SHA256_K_ARRAY
 
 
 #define ADD_EPI32(dest, src)      dest = _mm_add_epi32(dest, src);
 #define SHA256_MSG1(dest, src)    dest = _mm_sha256msg1_epu32(dest, src);
-#define SHA25G_MSG2(dest, src)    dest = _mm_sha256msg2_epu32(dest, src);
-
+#define SHA256_MSG2(dest, src)    dest = _mm_sha256msg2_epu32(dest, src);
 
 #define LOAD_SHUFFLE(m, k) \
     m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
     m = _mm_shuffle_epi8(m, mask); \
 
-#define SM1(g0, g1, g2, g3) \
-    SHA256_MSG1(g3, g0); \
+#define NNN(m0, m1, m2, m3)
 
-#define SM2(g0, g1, g2, g3) \
-    tmp = _mm_alignr_epi8(g1, g0, 4); \
-    ADD_EPI32(g2, tmp) \
-    SHA25G_MSG2(g2, g1); \
-
-// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
-// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)
-
-
-#define NNN(g0, g1, g2, g3)
+#define SM1(m1, m2, m3, m0) \
+    SHA256_MSG1(m0, m1); \
 
+#define SM2(m2, m3, m0, m1) \
+    ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \
+    SHA256_MSG2(m0, m3); \
 
 #define RND2(t0, t1) \
     t0 = _mm_sha256rnds2_epu32(t0, t1, msg);
 
-#define RND2_0(m, k) \
-    msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
-    RND2(state0, state1); \
-    msg = _mm_shuffle_epi32(msg, 0x0E); \
 
 
-#define RND2_1 \
+#define R4(k, m0, m1, m2, m3, OP0, OP1) \
+    msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \
+    RND2(state0, state1); \
+    msg = _mm_shuffle_epi32(msg, 0x0E); \
+    OP0(m0, m1, m2, m3) \
     RND2(state1, state0); \
-
-
-// We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2
-
-#define R4(k, g0, g1, g2, g3, OP0, OP1) \
-    RND2_0(g0, k) \
-    OP0(g0, g1, g2, g3) \
-    RND2_1 \
-    OP1(g0, g1, g2, g3) \
+    OP1(m0, m1, m2, m3) \
 
 #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
     R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
     R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
     R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
     R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \
 
 #define PREPARE_STATE \
     tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
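
Reading aid for the hunk above (an illustrative sketch, not code from the commit): the new R4 macro inlines the former RND2_0/RND2_1 pair into a single four-round step. _mm_sha256rnds2_epu32(a, b, k) performs two SHA-256 rounds, taking the (C,D,G,H) half of the state in a, the (A,B,E,F) half in b, and W+K for both rounds in the low 64 bits of k. Since the post-round (C,D,G,H) equals the pre-round (A,B,E,F), the two halves can ping-pong between state0 and state1 with no copies. One R4 step is roughly:

#include <immintrin.h>

// Illustrative only; build with -msha -mssse3 (or a target attribute).
// On entry state0 holds (C,D,G,H) and state1 holds (A,B,E,F), as arranged
// by PREPARE_STATE; the same invariant holds again on exit.
static void Sha256_R4_Sketch(__m128i *state0, __m128i *state1,
                             __m128i w,          // W[i..i+3], already byte-swapped
                             const __m128i *k4)  // &K[i], 16-byte aligned
{
  __m128i msg = _mm_add_epi32(w, *k4);                     // W+K for 4 rounds
  *state0 = _mm_sha256rnds2_epu32(*state0, *state1, msg);  // rounds i, i+1
  msg = _mm_shuffle_epi32(msg, 0x0E);                      // lanes 2,3 -> 0,1
  *state1 = _mm_sha256rnds2_epu32(*state1, *state0, msg);  // rounds i+2, i+3
}

The OP0/OP1 slots splice the message-schedule macros (SM1/SM2, or NNN for no-ops) between the two round calls; the comment deleted above described this schedule as running SHA256_MSG1 three rounds ahead and SHA256_MSG2 two rounds ahead.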
@@ -161,8 +130,9 @@ ATTRIB_SHA
 void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
 {
   const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
-  __m128i tmp;
-  __m128i state0, state1;
+
+
+  __m128i tmp, state0, state1;
 
   if (numBlocks == 0)
     return;
@@ -262,22 +232,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
   #define _ARM_USE_NEW_NEON_INTRINSICS
 #endif
 
-
-
-
-
 #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
 #include <arm64_neon.h>
 #else
 
-
-
-
-
-
-
-
-
 #if defined(__clang__) && __clang_major__ < 16
 #if !defined(__ARM_FEATURE_SHA2) && \
     !defined(__ARM_FEATURE_CRYPTO)
@@ -324,41 +282,70 @@ typedef uint32x4_t v128;
 // typedef __n128 v128; // MSVC
 
 #ifdef MY_CPU_BE
-  #define MY_rev32_for_LE(x)
+  #define MY_rev32_for_LE(x) x
 #else
-  #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
+  #define MY_rev32_for_LE(x) vrev32q_u8(x)
 #endif
 
-#define LOAD_128(_p)      (*(const v128 *)(const void *)(_p))
-#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)
+#if 1 // 0 for debug
+// for arm32: it works slower by some reason than direct code
+/*
+for arm32 it generates:
+MSVC-2022, GCC-9:
+    vld1.32 {d18,d19}, [r10]
+    vst1.32 {d4,d5}, [r3]
+    vld1.8  {d20-d21}, [r4]
+there is no align hint (like [r10:128]). So instruction allows unaligned access
+*/
+#define LOAD_128_32(_p)      vld1q_u32(_p)
+#define LOAD_128_8(_p)       vld1q_u8 (_p)
+#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
+#else
+/*
+for arm32:
+MSVC-2022:
+    vldm r10,{d18,d19}
+    vstm r3,{d4,d5}
+    does it require strict alignment?
+GCC-9:
+    vld1.64 {d30-d31}, [r0:64]
+    vldr    d28, [r0, #16]
+    vldr    d29, [r0, #24]
+    vst1.64 {d30-d31}, [r0:64]
+    vstr    d28, [r0, #16]
+    vstr    d29, [r0, #24]
+there is hint [r0:64], so does it requires 64-bit alignment.
+*/
+#define LOAD_128_32(_p)      (*(const v128 *)(const void *)(_p))
+#define LOAD_128_8(_p)       vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p))
+#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v)
+#endif
 
 #define LOAD_SHUFFLE(m, k) \
-    m = LOAD_128((data + (k) * 16)); \
-    MY_rev32_for_LE(m); \
+    m = vreinterpretq_u32_u8( \
+        MY_rev32_for_LE( \
+        LOAD_128_8(data + (k) * 16))); \
 
 // K array must be aligned for 16-bytes at least.
 extern
 MY_ALIGN(64)
 const UInt32 SHA256_K_ARRAY[64];
-
 #define K SHA256_K_ARRAY
 
-
 #define SHA256_SU0(dest, src)        dest = vsha256su0q_u32(dest, src);
-#define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
+#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
 
-#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0)
-#define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1)
-#define NNN(g0, g1, g2, g3)
+#define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0)
+#define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1)
+#define NNN(m0, m1, m2, m3)
 
-
-#define R4(k, g0, g1, g2, g3, OP0, OP1) \
-    msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
+#define R4(k, m0, m1, m2, m3, OP0, OP1) \
+    msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \
     tmp = state0; \
     state0 = vsha256hq_u32( state0, state1, msg ); \
     state1 = vsha256h2q_u32( state1, tmp, msg ); \
-    OP0(g0, g1, g2, g3); \
-    OP1(g0, g1, g2, g3); \
+    OP0(m0, m1, m2, m3); \
+    OP1(m0, m1, m2, m3); \
 
 
 #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
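
The ARM path mirrors the same shape (again an illustrative sketch, not commit code): vsha256hq_u32 and vsha256h2q_u32 each advance four rounds, updating the (a,b,c,d) and (e,f,g,h) halves respectively, and both must observe the pre-round state0, hence the tmp copy inside R4. The switch from pointer-cast loads to vld1q_u32/vld1q_u8 follows the codegen notes above: vld1.32 with no alignment hint tolerates unaligned pointers, while the [r0:64]-hinted forms GCC emits for dereferenced casts may not.

#include <arm_neon.h>

// Illustrative only; requires the ARMv8 SHA2/Crypto extension
// (e.g. -march=armv8-a+crypto with GCC/Clang).
static void Sha256_R4_Neon_Sketch(uint32x4_t *state0,   // hash words a,b,c,d
                                  uint32x4_t *state1,   // hash words e,f,g,h
                                  uint32x4_t w,         // W[i..i+3]
                                  const uint32_t *k4)   // &K[i]
{
  const uint32x4_t msg  = vaddq_u32(w, vld1q_u32(k4));  // W+K for 4 rounds
  const uint32x4_t abcd = *state0;                      // pre-round copy (tmp)
  *state0 = vsha256hq_u32 (abcd, *state1, msg);         // new a,b,c,d
  *state1 = vsha256h2q_u32(*state1, abcd, msg);         // new e,f,g,h
}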
@@ -379,8 +366,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
   if (numBlocks == 0)
     return;
 
-  state0 = LOAD_128(&state[0]);
-  state1 = LOAD_128(&state[4]);
+  state0 = LOAD_128_32(&state[0]);
+  state1 = LOAD_128_32(&state[4]);
 
   do
   {
@@ -408,8 +395,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
   }
   while (--numBlocks);
 
-  STORE_128(&state[0], state0);
-  STORE_128(&state[4], state1);
+  STORE_128_32(&state[0], state0);
+  STORE_128_32(&state[4], state1);
 }
 
 #endif // USE_HW_SHA
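
Usage context (a hedged sketch under assumptions, not from this commit): Sha256_UpdateBlocks_HW exists only when USE_HW_SHA is defined, so a caller must select it at run time after a CPU-feature probe. The identifiers CPU_IsSupported_SHA, Sha256_UpdateBlocks, and Z7_COMPILER_SHA256_SUPPORTED below follow my reading of CpuArch.h and Sha256.h and should be verified against the tree:

#include "CpuArch.h"
#include "Sha256.h"

typedef void (Z7_FASTCALL *UPDATE_BLOCKS_FUNC)(
    UInt32 state[8], const Byte *data, size_t numBlocks);

// Pick the hardware block function only when the CPU actually has the
// SHA extensions; otherwise fall back to the portable C rounds.
static UPDATE_BLOCKS_FUNC Sha256_ChooseUpdateBlocks(void)
{
#ifdef Z7_COMPILER_SHA256_SUPPORTED  // assumed: set when Sha256Opt.c is usable
  if (CPU_IsSupported_SHA())         // assumed: runtime CPUID / hwcap probe
    return Sha256_UpdateBlocks_HW;
#endif
  return Sha256_UpdateBlocks;        // portable software implementation
}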
@@ -443,13 +430,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
 #endif
 
 
-
 #undef K
 #undef RND2
-#undef RND2_0
-#undef RND2_1
-
 #undef MY_rev32_for_LE
+
 #undef NNN
 #undef LOAD_128
 #undef STORE_128
@@ -457,7 +441,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
 #undef SM1
 #undef SM2
 
-#undef NNN
+
 #undef R4
 #undef R16
 #undef PREPARE_STATE