diff options
Diffstat (limited to '')
-rw-r--r-- | C/Sha1Opt.c | 146 |
1 files changed, 49 insertions, 97 deletions
diff --git a/C/Sha1Opt.c b/C/Sha1Opt.c index 4e835f1..8738b94 100644 --- a/C/Sha1Opt.c +++ b/C/Sha1Opt.c | |||
@@ -1,18 +1,11 @@ | |||
1 | /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions | 1 | /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions |
2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | #include "Compiler.h" | 5 | #include "Compiler.h" |
6 | #include "CpuArch.h" | 6 | #include "CpuArch.h" |
7 | 7 | ||
8 | #if defined(_MSC_VER) | ||
9 | #if (_MSC_VER < 1900) && (_MSC_VER >= 1200) | ||
10 | // #define USE_MY_MM | ||
11 | #endif | ||
12 | #endif | ||
13 | |||
14 | // #define Z7_USE_HW_SHA_STUB // for debug | 8 | // #define Z7_USE_HW_SHA_STUB // for debug |
15 | |||
16 | #ifdef MY_CPU_X86_OR_AMD64 | 9 | #ifdef MY_CPU_X86_OR_AMD64 |
17 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check | 10 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check |
18 | #define USE_HW_SHA | 11 | #define USE_HW_SHA |
@@ -20,19 +13,14 @@ | |||
20 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ | 13 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ |
21 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) | 14 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) |
22 | #define USE_HW_SHA | 15 | #define USE_HW_SHA |
23 | #if !defined(_INTEL_COMPILER) | 16 | #if !defined(__INTEL_COMPILER) |
24 | // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) | 17 | // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) |
25 | #if !defined(__SHA__) || !defined(__SSSE3__) | 18 | #if !defined(__SHA__) || !defined(__SSSE3__) |
26 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) | 19 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) |
27 | #endif | 20 | #endif |
28 | #endif | 21 | #endif |
29 | #elif defined(_MSC_VER) | 22 | #elif defined(_MSC_VER) |
30 | #ifdef USE_MY_MM | 23 | #if (_MSC_VER >= 1900) |
31 | #define USE_VER_MIN 1300 | ||
32 | #else | ||
33 | #define USE_VER_MIN 1900 | ||
34 | #endif | ||
35 | #if (_MSC_VER >= USE_VER_MIN) | ||
36 | #define USE_HW_SHA | 24 | #define USE_HW_SHA |
37 | #else | 25 | #else |
38 | #define Z7_USE_HW_SHA_STUB | 26 | #define Z7_USE_HW_SHA_STUB |
@@ -47,23 +35,20 @@ | |||
47 | 35 | ||
48 | // #pragma message("Sha1 HW") | 36 | // #pragma message("Sha1 HW") |
49 | 37 | ||
38 | |||
39 | |||
40 | |||
50 | // sse/sse2/ssse3: | 41 | // sse/sse2/ssse3: |
51 | #include <tmmintrin.h> | 42 | #include <tmmintrin.h> |
52 | // sha*: | 43 | // sha*: |
53 | #include <immintrin.h> | 44 | #include <immintrin.h> |
54 | 45 | ||
55 | #if defined (__clang__) && defined(_MSC_VER) | 46 | #if defined (__clang__) && defined(_MSC_VER) |
56 | // #if !defined(__SSSE3__) | ||
57 | // #endif | ||
58 | #if !defined(__SHA__) | 47 | #if !defined(__SHA__) |
59 | #include <shaintrin.h> | 48 | #include <shaintrin.h> |
60 | #endif | 49 | #endif |
61 | #else | 50 | #else |
62 | 51 | ||
63 | #ifdef USE_MY_MM | ||
64 | #include "My_mm.h" | ||
65 | #endif | ||
66 | |||
67 | #endif | 52 | #endif |
68 | 53 | ||
69 | /* | 54 | /* |
@@ -84,7 +69,6 @@ SHA: | |||
84 | _mm_sha1* | 69 | _mm_sha1* |
85 | */ | 70 | */ |
86 | 71 | ||
87 | |||
88 | #define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src); | 72 | #define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src); |
89 | #define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask); | 73 | #define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask); |
90 | #define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask); | 74 | #define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask); |
@@ -99,11 +83,12 @@ SHA: | |||
99 | #define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src); | 83 | #define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src); |
100 | #define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src); | 84 | #define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src); |
101 | 85 | ||
102 | |||
103 | #define LOAD_SHUFFLE(m, k) \ | 86 | #define LOAD_SHUFFLE(m, k) \ |
104 | m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ | 87 | m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ |
105 | SHUFFLE_EPI8(m, mask) \ | 88 | SHUFFLE_EPI8(m, mask) \ |
106 | 89 | ||
90 | #define NNN(m0, m1, m2, m3) | ||
91 | |||
107 | #define SM1(m0, m1, m2, m3) \ | 92 | #define SM1(m0, m1, m2, m3) \ |
108 | SHA1_MSG1(m0, m1) \ | 93 | SHA1_MSG1(m0, m1) \ |
109 | 94 | ||
@@ -116,35 +101,19 @@ SHA: | |||
116 | SM1(m0, m1, m2, m3) \ | 101 | SM1(m0, m1, m2, m3) \ |
117 | SHA1_MSG2(m3, m2) \ | 102 | SHA1_MSG2(m3, m2) \ |
118 | 103 | ||
119 | #define NNN(m0, m1, m2, m3) | 104 | #define R4(k, m0, m1, m2, m3, e0, e1, OP) \ |
120 | |||
121 | |||
122 | |||
123 | |||
124 | |||
125 | |||
126 | |||
127 | |||
128 | |||
129 | |||
130 | |||
131 | |||
132 | |||
133 | |||
134 | |||
135 | |||
136 | |||
137 | #define R4(k, e0, e1, m0, m1, m2, m3, OP) \ | ||
138 | e1 = abcd; \ | 105 | e1 = abcd; \ |
139 | SHA1_RND4(abcd, e0, (k) / 5) \ | 106 | SHA1_RND4(abcd, e0, (k) / 5) \ |
140 | SHA1_NEXTE(e1, m1) \ | 107 | SHA1_NEXTE(e1, m1) \ |
141 | OP(m0, m1, m2, m3) \ | 108 | OP(m0, m1, m2, m3) \ |
142 | 109 | ||
110 | |||
111 | |||
143 | #define R16(k, mx, OP0, OP1, OP2, OP3) \ | 112 | #define R16(k, mx, OP0, OP1, OP2, OP3) \ |
144 | R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \ | 113 | R4 ( (k)*4+0, m0,m1,m2,m3, e0,e1, OP0 ) \ |
145 | R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \ | 114 | R4 ( (k)*4+1, m1,m2,m3,m0, e1,e0, OP1 ) \ |
146 | R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \ | 115 | R4 ( (k)*4+2, m2,m3,m0,m1, e0,e1, OP2 ) \ |
147 | R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 ) \ | 116 | R4 ( (k)*4+3, m3,mx,m1,m2, e1,e0, OP3 ) \ |
148 | 117 | ||
149 | #define PREPARE_STATE \ | 118 | #define PREPARE_STATE \ |
150 | SHUFFLE_EPI32 (abcd, 0x1B) \ | 119 | SHUFFLE_EPI32 (abcd, 0x1B) \ |
@@ -162,8 +131,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
162 | { | 131 | { |
163 | const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); | 132 | const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); |
164 | 133 | ||
165 | __m128i abcd, e0; | ||
166 | 134 | ||
135 | __m128i abcd, e0; | ||
136 | |||
167 | if (numBlocks == 0) | 137 | if (numBlocks == 0) |
168 | return; | 138 | return; |
169 | 139 | ||
@@ -204,7 +174,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
204 | PREPARE_STATE | 174 | PREPARE_STATE |
205 | 175 | ||
206 | _mm_storeu_si128((__m128i *) (void *) state, abcd); | 176 | _mm_storeu_si128((__m128i *) (void *) state, abcd); |
207 | *(state+4) = (UInt32)_mm_cvtsi128_si32(e0); | 177 | *(state + 4) = (UInt32)_mm_cvtsi128_si32(e0); |
208 | } | 178 | } |
209 | 179 | ||
210 | #endif // USE_HW_SHA | 180 | #endif // USE_HW_SHA |
@@ -262,22 +232,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
262 | #define _ARM_USE_NEW_NEON_INTRINSICS | 232 | #define _ARM_USE_NEW_NEON_INTRINSICS |
263 | #endif | 233 | #endif |
264 | 234 | ||
265 | |||
266 | |||
267 | |||
268 | |||
269 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) | 235 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) |
270 | #include <arm64_neon.h> | 236 | #include <arm64_neon.h> |
271 | #else | 237 | #else |
272 | 238 | ||
273 | |||
274 | |||
275 | |||
276 | |||
277 | |||
278 | |||
279 | |||
280 | |||
281 | #if defined(__clang__) && __clang_major__ < 16 | 239 | #if defined(__clang__) && __clang_major__ < 16 |
282 | #if !defined(__ARM_FEATURE_SHA2) && \ | 240 | #if !defined(__ARM_FEATURE_SHA2) && \ |
283 | !defined(__ARM_FEATURE_CRYPTO) | 241 | !defined(__ARM_FEATURE_CRYPTO) |
@@ -329,26 +287,37 @@ typedef uint32x4_t v128; | |||
329 | #endif | 287 | #endif |
330 | 288 | ||
331 | #ifdef MY_CPU_BE | 289 | #ifdef MY_CPU_BE |
332 | #define MY_rev32_for_LE(x) | 290 | #define MY_rev32_for_LE(x) x |
333 | #else | 291 | #else |
334 | #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) | 292 | #define MY_rev32_for_LE(x) vrev32q_u8(x) |
335 | #endif | 293 | #endif |
336 | 294 | ||
337 | #define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) | 295 | #define LOAD_128_32(_p) vld1q_u32(_p) |
338 | #define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) | 296 | #define LOAD_128_8(_p) vld1q_u8 (_p) |
297 | #define STORE_128_32(_p, _v) vst1q_u32(_p, _v) | ||
339 | 298 | ||
340 | #define LOAD_SHUFFLE(m, k) \ | 299 | #define LOAD_SHUFFLE(m, k) \ |
341 | m = LOAD_128((data + (k) * 16)); \ | 300 | m = vreinterpretq_u32_u8( \ |
342 | MY_rev32_for_LE(m); \ | 301 | MY_rev32_for_LE( \ |
343 | 302 | LOAD_128_8(data + (k) * 16))); \ | |
344 | #define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3) | 303 | |
345 | #define SU1(dest, src) dest = vsha1su1q_u32(dest, src) | 304 | #define N0(dest, src2, src3) |
305 | #define N1(dest, src) | ||
306 | #define U0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3); | ||
307 | #define U1(dest, src) dest = vsha1su1q_u32(dest, src); | ||
346 | #define C(e) abcd = vsha1cq_u32(abcd, e, t) | 308 | #define C(e) abcd = vsha1cq_u32(abcd, e, t) |
347 | #define P(e) abcd = vsha1pq_u32(abcd, e, t) | 309 | #define P(e) abcd = vsha1pq_u32(abcd, e, t) |
348 | #define M(e) abcd = vsha1mq_u32(abcd, e, t) | 310 | #define M(e) abcd = vsha1mq_u32(abcd, e, t) |
349 | #define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) | 311 | #define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) |
350 | #define T(m, c) t = vaddq_u32(m, c) | 312 | #define T(m, c) t = vaddq_u32(m, c) |
351 | 313 | ||
314 | #define R16(d0,d1,d2,d3, f0,z0, f1,z1, f2,z2, f3,z3, w0,w1,w2,w3) \ | ||
315 | T(m0, d0); f0(m3, m0, m1) z0(m2, m1) H(e1); w0(e0); \ | ||
316 | T(m1, d1); f1(m0, m1, m2) z1(m3, m2) H(e0); w1(e1); \ | ||
317 | T(m2, d2); f2(m1, m2, m3) z2(m0, m3) H(e1); w2(e0); \ | ||
318 | T(m3, d3); f3(m2, m3, m0) z3(m1, m0) H(e0); w3(e1); \ | ||
319 | |||
320 | |||
352 | void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); | 321 | void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); |
353 | #ifdef ATTRIB_SHA | 322 | #ifdef ATTRIB_SHA |
354 | ATTRIB_SHA | 323 | ATTRIB_SHA |
@@ -367,7 +336,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
367 | c2 = vdupq_n_u32(0x8f1bbcdc); | 336 | c2 = vdupq_n_u32(0x8f1bbcdc); |
368 | c3 = vdupq_n_u32(0xca62c1d6); | 337 | c3 = vdupq_n_u32(0xca62c1d6); |
369 | 338 | ||
370 | abcd = LOAD_128(&state[0]); | 339 | abcd = LOAD_128_32(&state[0]); |
371 | e0 = state[4]; | 340 | e0 = state[4]; |
372 | 341 | ||
373 | do | 342 | do |
@@ -385,26 +354,11 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
385 | LOAD_SHUFFLE (m2, 2) | 354 | LOAD_SHUFFLE (m2, 2) |
386 | LOAD_SHUFFLE (m3, 3) | 355 | LOAD_SHUFFLE (m3, 3) |
387 | 356 | ||
388 | T(m0, c0); H(e1); C(e0); | 357 | R16 ( c0,c0,c0,c0, N0,N1, U0,N1, U0,U1, U0,U1, C,C,C,C ) |
389 | T(m1, c0); SU0(m0, m1, m2); H(e0); C(e1); | 358 | R16 ( c0,c1,c1,c1, U0,U1, U0,U1, U0,U1, U0,U1, C,P,P,P ) |
390 | T(m2, c0); SU0(m1, m2, m3); SU1(m0, m3); H(e1); C(e0); | 359 | R16 ( c1,c1,c2,c2, U0,U1, U0,U1, U0,U1, U0,U1, P,P,M,M ) |
391 | T(m3, c0); SU0(m2, m3, m0); SU1(m1, m0); H(e0); C(e1); | 360 | R16 ( c2,c2,c2,c3, U0,U1, U0,U1, U0,U1, U0,U1, M,M,M,P ) |
392 | T(m0, c0); SU0(m3, m0, m1); SU1(m2, m1); H(e1); C(e0); | 361 | R16 ( c3,c3,c3,c3, U0,U1, N0,U1, N0,N1, N0,N1, P,P,P,P ) |
393 | T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1); | ||
394 | T(m2, c1); SU0(m1, m2, m3); SU1(m0, m3); H(e1); P(e0); | ||
395 | T(m3, c1); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1); | ||
396 | T(m0, c1); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0); | ||
397 | T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1); | ||
398 | T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0); | ||
399 | T(m3, c2); SU0(m2, m3, m0); SU1(m1, m0); H(e0); M(e1); | ||
400 | T(m0, c2); SU0(m3, m0, m1); SU1(m2, m1); H(e1); M(e0); | ||
401 | T(m1, c2); SU0(m0, m1, m2); SU1(m3, m2); H(e0); M(e1); | ||
402 | T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0); | ||
403 | T(m3, c3); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1); | ||
404 | T(m0, c3); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0); | ||
405 | T(m1, c3); SU1(m3, m2); H(e0); P(e1); | ||
406 | T(m2, c3); H(e1); P(e0); | ||
407 | T(m3, c3); H(e0); P(e1); | ||
408 | 362 | ||
409 | abcd = vaddq_u32(abcd, abcd_save); | 363 | abcd = vaddq_u32(abcd, abcd_save); |
410 | e0 += e0_save; | 364 | e0 += e0_save; |
@@ -413,7 +367,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
413 | } | 367 | } |
414 | while (--numBlocks); | 368 | while (--numBlocks); |
415 | 369 | ||
416 | STORE_128(&state[0], abcd); | 370 | STORE_128_32(&state[0], abcd); |
417 | state[4] = e0; | 371 | state[4] = e0; |
418 | } | 372 | } |
419 | 373 | ||
@@ -421,13 +375,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
421 | 375 | ||
422 | #endif // MY_CPU_ARM_OR_ARM64 | 376 | #endif // MY_CPU_ARM_OR_ARM64 |
423 | 377 | ||
424 | |||
425 | #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) | 378 | #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) |
426 | // #error Stop_Compiling_UNSUPPORTED_SHA | 379 | // #error Stop_Compiling_UNSUPPORTED_SHA |
427 | // #include <stdlib.h> | 380 | // #include <stdlib.h> |
428 | |||
429 | |||
430 | |||
431 | // #include "Sha1.h" | 381 | // #include "Sha1.h" |
432 | // #if defined(_MSC_VER) | 382 | // #if defined(_MSC_VER) |
433 | #pragma message("Sha1 HW-SW stub was used") | 383 | #pragma message("Sha1 HW-SW stub was used") |
@@ -447,8 +397,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
447 | } | 397 | } |
448 | #endif | 398 | #endif |
449 | 399 | ||
450 | #undef SU0 | 400 | #undef U0 |
451 | #undef SU1 | 401 | #undef U1 |
402 | #undef N0 | ||
403 | #undef N1 | ||
452 | #undef C | 404 | #undef C |
453 | #undef P | 405 | #undef P |
454 | #undef M | 406 | #undef M |