aboutsummaryrefslogtreecommitdiff
path: root/C/Sha1Opt.c
diff options
context:
space:
mode:
authorIgor Pavlov <87184205+ip7z@users.noreply.github.com>2024-11-29 00:00:00 +0000
committerIgor Pavlov <87184205+ip7z@users.noreply.github.com>2024-11-30 15:27:15 +0500
commite5431fa6f5505e385c6f9367260717e9c47dc2ee (patch)
tree4cd2c2c3b225b48c8e7053432c41d7b6b6a3d5f8 /C/Sha1Opt.c
parente008ce3976c087bfd21344af8f00a23cf69d4174 (diff)
download7zip-main.tar.gz
7zip-main.tar.bz2
7zip-main.zip
Diffstat (limited to 'C/Sha1Opt.c')
-rw-r--r--C/Sha1Opt.c146
1 files changed, 49 insertions, 97 deletions
diff --git a/C/Sha1Opt.c b/C/Sha1Opt.c
index 4e835f1..8738b94 100644
--- a/C/Sha1Opt.c
+++ b/C/Sha1Opt.c
@@ -1,18 +1,11 @@
1/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions 1/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions
22024-03-01 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5#include "Compiler.h" 5#include "Compiler.h"
6#include "CpuArch.h" 6#include "CpuArch.h"
7 7
8#if defined(_MSC_VER)
9#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
10// #define USE_MY_MM
11#endif
12#endif
13
14// #define Z7_USE_HW_SHA_STUB // for debug 8// #define Z7_USE_HW_SHA_STUB // for debug
15
16#ifdef MY_CPU_X86_OR_AMD64 9#ifdef MY_CPU_X86_OR_AMD64
17 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check 10 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
18 #define USE_HW_SHA 11 #define USE_HW_SHA
@@ -20,19 +13,14 @@
20 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ 13 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
21 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) 14 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
22 #define USE_HW_SHA 15 #define USE_HW_SHA
23 #if !defined(_INTEL_COMPILER) 16 #if !defined(__INTEL_COMPILER)
24 // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) 17 // icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
25 #if !defined(__SHA__) || !defined(__SSSE3__) 18 #if !defined(__SHA__) || !defined(__SSSE3__)
26 #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) 19 #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
27 #endif 20 #endif
28 #endif 21 #endif
29 #elif defined(_MSC_VER) 22 #elif defined(_MSC_VER)
30 #ifdef USE_MY_MM 23 #if (_MSC_VER >= 1900)
31 #define USE_VER_MIN 1300
32 #else
33 #define USE_VER_MIN 1900
34 #endif
35 #if (_MSC_VER >= USE_VER_MIN)
36 #define USE_HW_SHA 24 #define USE_HW_SHA
37 #else 25 #else
38 #define Z7_USE_HW_SHA_STUB 26 #define Z7_USE_HW_SHA_STUB
@@ -47,23 +35,20 @@
47 35
48// #pragma message("Sha1 HW") 36// #pragma message("Sha1 HW")
49 37
38
39
40
50// sse/sse2/ssse3: 41// sse/sse2/ssse3:
51#include <tmmintrin.h> 42#include <tmmintrin.h>
52// sha*: 43// sha*:
53#include <immintrin.h> 44#include <immintrin.h>
54 45
55#if defined (__clang__) && defined(_MSC_VER) 46#if defined (__clang__) && defined(_MSC_VER)
56 // #if !defined(__SSSE3__)
57 // #endif
58 #if !defined(__SHA__) 47 #if !defined(__SHA__)
59 #include <shaintrin.h> 48 #include <shaintrin.h>
60 #endif 49 #endif
61#else 50#else
62 51
63#ifdef USE_MY_MM
64#include "My_mm.h"
65#endif
66
67#endif 52#endif
68 53
69/* 54/*
@@ -84,7 +69,6 @@ SHA:
84 _mm_sha1* 69 _mm_sha1*
85*/ 70*/
86 71
87
88#define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src); 72#define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src);
89#define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask); 73#define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask);
90#define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask); 74#define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask);
@@ -99,11 +83,12 @@ SHA:
99#define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src); 83#define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src);
100#define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src); 84#define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src);
101 85
102
103#define LOAD_SHUFFLE(m, k) \ 86#define LOAD_SHUFFLE(m, k) \
104 m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ 87 m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
105 SHUFFLE_EPI8(m, mask) \ 88 SHUFFLE_EPI8(m, mask) \
106 89
90#define NNN(m0, m1, m2, m3)
91
107#define SM1(m0, m1, m2, m3) \ 92#define SM1(m0, m1, m2, m3) \
108 SHA1_MSG1(m0, m1) \ 93 SHA1_MSG1(m0, m1) \
109 94
@@ -116,35 +101,19 @@ SHA:
116 SM1(m0, m1, m2, m3) \ 101 SM1(m0, m1, m2, m3) \
117 SHA1_MSG2(m3, m2) \ 102 SHA1_MSG2(m3, m2) \
118 103
119#define NNN(m0, m1, m2, m3) 104#define R4(k, m0, m1, m2, m3, e0, e1, OP) \
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137#define R4(k, e0, e1, m0, m1, m2, m3, OP) \
138 e1 = abcd; \ 105 e1 = abcd; \
139 SHA1_RND4(abcd, e0, (k) / 5) \ 106 SHA1_RND4(abcd, e0, (k) / 5) \
140 SHA1_NEXTE(e1, m1) \ 107 SHA1_NEXTE(e1, m1) \
141 OP(m0, m1, m2, m3) \ 108 OP(m0, m1, m2, m3) \
142 109
110
111
143#define R16(k, mx, OP0, OP1, OP2, OP3) \ 112#define R16(k, mx, OP0, OP1, OP2, OP3) \
144 R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \ 113 R4 ( (k)*4+0, m0,m1,m2,m3, e0,e1, OP0 ) \
145 R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \ 114 R4 ( (k)*4+1, m1,m2,m3,m0, e1,e0, OP1 ) \
146 R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \ 115 R4 ( (k)*4+2, m2,m3,m0,m1, e0,e1, OP2 ) \
147 R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 ) \ 116 R4 ( (k)*4+3, m3,mx,m1,m2, e1,e0, OP3 ) \
148 117
149#define PREPARE_STATE \ 118#define PREPARE_STATE \
150 SHUFFLE_EPI32 (abcd, 0x1B) \ 119 SHUFFLE_EPI32 (abcd, 0x1B) \
@@ -162,8 +131,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
162{ 131{
163 const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); 132 const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
164 133
165 __m128i abcd, e0;
166 134
135 __m128i abcd, e0;
136
167 if (numBlocks == 0) 137 if (numBlocks == 0)
168 return; 138 return;
169 139
@@ -204,7 +174,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
204 PREPARE_STATE 174 PREPARE_STATE
205 175
206 _mm_storeu_si128((__m128i *) (void *) state, abcd); 176 _mm_storeu_si128((__m128i *) (void *) state, abcd);
207 *(state+4) = (UInt32)_mm_cvtsi128_si32(e0); 177 *(state + 4) = (UInt32)_mm_cvtsi128_si32(e0);
208} 178}
209 179
210#endif // USE_HW_SHA 180#endif // USE_HW_SHA
@@ -262,22 +232,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
262 #define _ARM_USE_NEW_NEON_INTRINSICS 232 #define _ARM_USE_NEW_NEON_INTRINSICS
263#endif 233#endif
264 234
265
266
267
268
269#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) 235#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
270#include <arm64_neon.h> 236#include <arm64_neon.h>
271#else 237#else
272 238
273
274
275
276
277
278
279
280
281#if defined(__clang__) && __clang_major__ < 16 239#if defined(__clang__) && __clang_major__ < 16
282#if !defined(__ARM_FEATURE_SHA2) && \ 240#if !defined(__ARM_FEATURE_SHA2) && \
283 !defined(__ARM_FEATURE_CRYPTO) 241 !defined(__ARM_FEATURE_CRYPTO)
@@ -329,26 +287,37 @@ typedef uint32x4_t v128;
329#endif 287#endif
330 288
331#ifdef MY_CPU_BE 289#ifdef MY_CPU_BE
332 #define MY_rev32_for_LE(x) 290 #define MY_rev32_for_LE(x) x
333#else 291#else
334 #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) 292 #define MY_rev32_for_LE(x) vrev32q_u8(x)
335#endif 293#endif
336 294
337#define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) 295#define LOAD_128_32(_p) vld1q_u32(_p)
338#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) 296#define LOAD_128_8(_p) vld1q_u8 (_p)
297#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
339 298
340#define LOAD_SHUFFLE(m, k) \ 299#define LOAD_SHUFFLE(m, k) \
341 m = LOAD_128((data + (k) * 16)); \ 300 m = vreinterpretq_u32_u8( \
342 MY_rev32_for_LE(m); \ 301 MY_rev32_for_LE( \
343 302 LOAD_128_8(data + (k) * 16))); \
344#define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3) 303
345#define SU1(dest, src) dest = vsha1su1q_u32(dest, src) 304#define N0(dest, src2, src3)
305#define N1(dest, src)
306#define U0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3);
307#define U1(dest, src) dest = vsha1su1q_u32(dest, src);
346#define C(e) abcd = vsha1cq_u32(abcd, e, t) 308#define C(e) abcd = vsha1cq_u32(abcd, e, t)
347#define P(e) abcd = vsha1pq_u32(abcd, e, t) 309#define P(e) abcd = vsha1pq_u32(abcd, e, t)
348#define M(e) abcd = vsha1mq_u32(abcd, e, t) 310#define M(e) abcd = vsha1mq_u32(abcd, e, t)
349#define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) 311#define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0))
350#define T(m, c) t = vaddq_u32(m, c) 312#define T(m, c) t = vaddq_u32(m, c)
351 313
314#define R16(d0,d1,d2,d3, f0,z0, f1,z1, f2,z2, f3,z3, w0,w1,w2,w3) \
315 T(m0, d0); f0(m3, m0, m1) z0(m2, m1) H(e1); w0(e0); \
316 T(m1, d1); f1(m0, m1, m2) z1(m3, m2) H(e0); w1(e1); \
317 T(m2, d2); f2(m1, m2, m3) z2(m0, m3) H(e1); w2(e0); \
318 T(m3, d3); f3(m2, m3, m0) z3(m1, m0) H(e0); w3(e1); \
319
320
352void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); 321void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
353#ifdef ATTRIB_SHA 322#ifdef ATTRIB_SHA
354ATTRIB_SHA 323ATTRIB_SHA
@@ -367,7 +336,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
367 c2 = vdupq_n_u32(0x8f1bbcdc); 336 c2 = vdupq_n_u32(0x8f1bbcdc);
368 c3 = vdupq_n_u32(0xca62c1d6); 337 c3 = vdupq_n_u32(0xca62c1d6);
369 338
370 abcd = LOAD_128(&state[0]); 339 abcd = LOAD_128_32(&state[0]);
371 e0 = state[4]; 340 e0 = state[4];
372 341
373 do 342 do
@@ -385,26 +354,11 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
385 LOAD_SHUFFLE (m2, 2) 354 LOAD_SHUFFLE (m2, 2)
386 LOAD_SHUFFLE (m3, 3) 355 LOAD_SHUFFLE (m3, 3)
387 356
388 T(m0, c0); H(e1); C(e0); 357 R16 ( c0,c0,c0,c0, N0,N1, U0,N1, U0,U1, U0,U1, C,C,C,C )
389 T(m1, c0); SU0(m0, m1, m2); H(e0); C(e1); 358 R16 ( c0,c1,c1,c1, U0,U1, U0,U1, U0,U1, U0,U1, C,P,P,P )
390 T(m2, c0); SU0(m1, m2, m3); SU1(m0, m3); H(e1); C(e0); 359 R16 ( c1,c1,c2,c2, U0,U1, U0,U1, U0,U1, U0,U1, P,P,M,M )
391 T(m3, c0); SU0(m2, m3, m0); SU1(m1, m0); H(e0); C(e1); 360 R16 ( c2,c2,c2,c3, U0,U1, U0,U1, U0,U1, U0,U1, M,M,M,P )
392 T(m0, c0); SU0(m3, m0, m1); SU1(m2, m1); H(e1); C(e0); 361 R16 ( c3,c3,c3,c3, U0,U1, N0,U1, N0,N1, N0,N1, P,P,P,P )
393 T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1);
394 T(m2, c1); SU0(m1, m2, m3); SU1(m0, m3); H(e1); P(e0);
395 T(m3, c1); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1);
396 T(m0, c1); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0);
397 T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1);
398 T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0);
399 T(m3, c2); SU0(m2, m3, m0); SU1(m1, m0); H(e0); M(e1);
400 T(m0, c2); SU0(m3, m0, m1); SU1(m2, m1); H(e1); M(e0);
401 T(m1, c2); SU0(m0, m1, m2); SU1(m3, m2); H(e0); M(e1);
402 T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0);
403 T(m3, c3); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1);
404 T(m0, c3); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0);
405 T(m1, c3); SU1(m3, m2); H(e0); P(e1);
406 T(m2, c3); H(e1); P(e0);
407 T(m3, c3); H(e0); P(e1);
408 362
409 abcd = vaddq_u32(abcd, abcd_save); 363 abcd = vaddq_u32(abcd, abcd_save);
410 e0 += e0_save; 364 e0 += e0_save;
@@ -413,7 +367,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
413 } 367 }
414 while (--numBlocks); 368 while (--numBlocks);
415 369
416 STORE_128(&state[0], abcd); 370 STORE_128_32(&state[0], abcd);
417 state[4] = e0; 371 state[4] = e0;
418} 372}
419 373
@@ -421,13 +375,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
421 375
422#endif // MY_CPU_ARM_OR_ARM64 376#endif // MY_CPU_ARM_OR_ARM64
423 377
424
425#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) 378#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
426// #error Stop_Compiling_UNSUPPORTED_SHA 379// #error Stop_Compiling_UNSUPPORTED_SHA
427// #include <stdlib.h> 380// #include <stdlib.h>
428
429
430
431// #include "Sha1.h" 381// #include "Sha1.h"
432// #if defined(_MSC_VER) 382// #if defined(_MSC_VER)
433#pragma message("Sha1 HW-SW stub was used") 383#pragma message("Sha1 HW-SW stub was used")
@@ -447,8 +397,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
447} 397}
448#endif 398#endif
449 399
450#undef SU0 400#undef U0
451#undef SU1 401#undef U1
402#undef N0
403#undef N1
452#undef C 404#undef C
453#undef P 405#undef P
454#undef M 406#undef M