diff options
Diffstat (limited to 'C/Sha256Opt.c')
-rw-r--r-- | C/Sha256Opt.c | 129 |
1 files changed, 71 insertions, 58 deletions
diff --git a/C/Sha256Opt.c b/C/Sha256Opt.c index decc138..e4465e3 100644 --- a/C/Sha256Opt.c +++ b/C/Sha256Opt.c | |||
@@ -1,7 +1,9 @@ | |||
1 | /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions | 1 | /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions |
2 | 2021-04-01 : Igor Pavlov : Public domain */ | 2 | 2023-04-02 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | #include "Compiler.h" | ||
6 | #include "CpuArch.h" | ||
5 | 7 | ||
6 | #if defined(_MSC_VER) | 8 | #if defined(_MSC_VER) |
7 | #if (_MSC_VER < 1900) && (_MSC_VER >= 1200) | 9 | #if (_MSC_VER < 1900) && (_MSC_VER >= 1200) |
@@ -9,41 +11,26 @@ | |||
9 | #endif | 11 | #endif |
10 | #endif | 12 | #endif |
11 | 13 | ||
12 | #include "CpuArch.h" | ||
13 | |||
14 | #ifdef MY_CPU_X86_OR_AMD64 | 14 | #ifdef MY_CPU_X86_OR_AMD64 |
15 | #if defined(__clang__) | 15 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check |
16 | #if (__clang_major__ >= 8) // fix that check | ||
17 | #define USE_HW_SHA | 16 | #define USE_HW_SHA |
18 | #ifndef __SHA__ | 17 | #elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ |
19 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) | 18 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ |
20 | #if defined(_MSC_VER) | 19 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) |
21 | // SSSE3: for clang-cl: | ||
22 | #include <tmmintrin.h> | ||
23 | #define __SHA__ | ||
24 | #endif | ||
25 | #endif | ||
26 | |||
27 | #endif | ||
28 | #elif defined(__GNUC__) | ||
29 | #if (__GNUC__ >= 8) // fix that check | ||
30 | #define USE_HW_SHA | 20 | #define USE_HW_SHA |
31 | #ifndef __SHA__ | 21 | #if !defined(_INTEL_COMPILER) |
22 | // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) | ||
23 | #if !defined(__SHA__) || !defined(__SSSE3__) | ||
32 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) | 24 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) |
33 | // #pragma GCC target("sha,ssse3") | ||
34 | #endif | 25 | #endif |
35 | #endif | 26 | #endif |
36 | #elif defined(__INTEL_COMPILER) | ||
37 | #if (__INTEL_COMPILER >= 1800) // fix that check | ||
38 | #define USE_HW_SHA | ||
39 | #endif | ||
40 | #elif defined(_MSC_VER) | 27 | #elif defined(_MSC_VER) |
41 | #ifdef USE_MY_MM | 28 | #ifdef USE_MY_MM |
42 | #define USE_VER_MIN 1300 | 29 | #define USE_VER_MIN 1300 |
43 | #else | 30 | #else |
44 | #define USE_VER_MIN 1910 | 31 | #define USE_VER_MIN 1900 |
45 | #endif | 32 | #endif |
46 | #if _MSC_VER >= USE_VER_MIN | 33 | #if (_MSC_VER >= USE_VER_MIN) |
47 | #define USE_HW_SHA | 34 | #define USE_HW_SHA |
48 | #endif | 35 | #endif |
49 | #endif | 36 | #endif |
@@ -52,16 +39,19 @@ | |||
52 | #ifdef USE_HW_SHA | 39 | #ifdef USE_HW_SHA |
53 | 40 | ||
54 | // #pragma message("Sha256 HW") | 41 | // #pragma message("Sha256 HW") |
55 | // #include <wmmintrin.h> | ||
56 | 42 | ||
57 | #if !defined(_MSC_VER) || (_MSC_VER >= 1900) | 43 | // sse/sse2/ssse3: |
44 | #include <tmmintrin.h> | ||
45 | // sha*: | ||
58 | #include <immintrin.h> | 46 | #include <immintrin.h> |
59 | #else | ||
60 | #include <emmintrin.h> | ||
61 | 47 | ||
62 | #if defined(_MSC_VER) && (_MSC_VER >= 1600) | 48 | #if defined (__clang__) && defined(_MSC_VER) |
63 | // #include <intrin.h> | 49 | // #if !defined(__SSSE3__) |
64 | #endif | 50 | // #endif |
51 | #if !defined(__SHA__) | ||
52 | #include <shaintrin.h> | ||
53 | #endif | ||
54 | #else | ||
65 | 55 | ||
66 | #ifdef USE_MY_MM | 56 | #ifdef USE_MY_MM |
67 | #include "My_mm.h" | 57 | #include "My_mm.h" |
@@ -98,9 +88,9 @@ const UInt32 SHA256_K_ARRAY[64]; | |||
98 | #define K SHA256_K_ARRAY | 88 | #define K SHA256_K_ARRAY |
99 | 89 | ||
100 | 90 | ||
101 | #define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); | 91 | #define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); |
102 | #define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); | 92 | #define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); |
103 | #define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); | 93 | #define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); |
104 | 94 | ||
105 | 95 | ||
106 | #define LOAD_SHUFFLE(m, k) \ | 96 | #define LOAD_SHUFFLE(m, k) \ |
@@ -112,7 +102,7 @@ const UInt32 SHA256_K_ARRAY[64]; | |||
112 | 102 | ||
113 | #define SM2(g0, g1, g2, g3) \ | 103 | #define SM2(g0, g1, g2, g3) \ |
114 | tmp = _mm_alignr_epi8(g1, g0, 4); \ | 104 | tmp = _mm_alignr_epi8(g1, g0, 4); \ |
115 | ADD_EPI32(g2, tmp); \ | 105 | ADD_EPI32(g2, tmp) \ |
116 | SHA25G_MSG2(g2, g1); \ | 106 | SHA25G_MSG2(g2, g1); \ |
117 | 107 | ||
118 | // #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k) | 108 | // #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k) |
@@ -138,16 +128,16 @@ const UInt32 SHA256_K_ARRAY[64]; | |||
138 | // We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2 | 128 | // We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2 |
139 | 129 | ||
140 | #define R4(k, g0, g1, g2, g3, OP0, OP1) \ | 130 | #define R4(k, g0, g1, g2, g3, OP0, OP1) \ |
141 | RND2_0(g0, k); \ | 131 | RND2_0(g0, k) \ |
142 | OP0(g0, g1, g2, g3); \ | 132 | OP0(g0, g1, g2, g3) \ |
143 | RND2_1; \ | 133 | RND2_1 \ |
144 | OP1(g0, g1, g2, g3); \ | 134 | OP1(g0, g1, g2, g3) \ |
145 | 135 | ||
146 | #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ | 136 | #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ |
147 | R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \ | 137 | R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ |
148 | R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \ | 138 | R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ |
149 | R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \ | 139 | R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ |
150 | R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \ | 140 | R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ |
151 | 141 | ||
152 | #define PREPARE_STATE \ | 142 | #define PREPARE_STATE \ |
153 | tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ | 143 | tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ |
@@ -157,11 +147,11 @@ const UInt32 SHA256_K_ARRAY[64]; | |||
157 | state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \ | 147 | state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \ |
158 | 148 | ||
159 | 149 | ||
160 | void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); | 150 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); |
161 | #ifdef ATTRIB_SHA | 151 | #ifdef ATTRIB_SHA |
162 | ATTRIB_SHA | 152 | ATTRIB_SHA |
163 | #endif | 153 | #endif |
164 | void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) | 154 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) |
165 | { | 155 | { |
166 | const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); | 156 | const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); |
167 | __m128i tmp; | 157 | __m128i tmp; |
@@ -192,13 +182,13 @@ void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size | |||
192 | 182 | ||
193 | 183 | ||
194 | 184 | ||
195 | R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ); | 185 | R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ) |
196 | R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); | 186 | R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) |
197 | R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); | 187 | R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) |
198 | R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ); | 188 | R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ) |
199 | 189 | ||
200 | ADD_EPI32(state0, state0_save); | 190 | ADD_EPI32(state0, state0_save) |
201 | ADD_EPI32(state1, state1_save); | 191 | ADD_EPI32(state1, state1_save) |
202 | 192 | ||
203 | data += 64; | 193 | data += 64; |
204 | } | 194 | } |
@@ -298,11 +288,11 @@ const UInt32 SHA256_K_ARRAY[64]; | |||
298 | R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \ | 288 | R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \ |
299 | 289 | ||
300 | 290 | ||
301 | void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); | 291 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); |
302 | #ifdef ATTRIB_SHA | 292 | #ifdef ATTRIB_SHA |
303 | ATTRIB_SHA | 293 | ATTRIB_SHA |
304 | #endif | 294 | #endif |
305 | void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) | 295 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) |
306 | { | 296 | { |
307 | v128 state0, state1; | 297 | v128 state0, state1; |
308 | 298 | ||
@@ -353,12 +343,12 @@ void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size | |||
353 | // #include <stdlib.h> | 343 | // #include <stdlib.h> |
354 | 344 | ||
355 | // #include "Sha256.h" | 345 | // #include "Sha256.h" |
356 | void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); | 346 | void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); |
357 | 347 | ||
358 | #pragma message("Sha256 HW-SW stub was used") | 348 | #pragma message("Sha256 HW-SW stub was used") |
359 | 349 | ||
360 | void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); | 350 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); |
361 | void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) | 351 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) |
362 | { | 352 | { |
363 | Sha256_UpdateBlocks(state, data, numBlocks); | 353 | Sha256_UpdateBlocks(state, data, numBlocks); |
364 | /* | 354 | /* |
@@ -371,3 +361,26 @@ void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size | |||
371 | } | 361 | } |
372 | 362 | ||
373 | #endif | 363 | #endif |
364 | |||
365 | |||
366 | |||
367 | #undef K | ||
368 | #undef RND2 | ||
369 | #undef RND2_0 | ||
370 | #undef RND2_1 | ||
371 | |||
372 | #undef MY_rev32_for_LE | ||
373 | #undef NNN | ||
374 | #undef LOAD_128 | ||
375 | #undef STORE_128 | ||
376 | #undef LOAD_SHUFFLE | ||
377 | #undef SM1 | ||
378 | #undef SM2 | ||
379 | |||
380 | #undef NNN | ||
381 | #undef R4 | ||
382 | #undef R16 | ||
383 | #undef PREPARE_STATE | ||
384 | #undef USE_HW_SHA | ||
385 | #undef ATTRIB_SHA | ||
386 | #undef USE_VER_MIN | ||