aboutsummaryrefslogtreecommitdiff
path: root/C/Sha1Opt.c
diff options
context:
space:
mode:
authorIgor Pavlov <87184205+ip7z@users.noreply.github.com>2021-12-27 00:00:00 +0000
committerIgor Pavlov <87184205+ip7z@users.noreply.github.com>2022-03-18 15:35:13 +0500
commitf19f813537c7aea1c20749c914e756b54a9c3cf5 (patch)
tree816ba62ca7c0fa19f2eb46d9e9d6f7dd7c3a744d /C/Sha1Opt.c
parent98e06a519b63b81986abe76d28887f6984a7732b (diff)
download7zip-21.07.tar.gz
7zip-21.07.tar.bz2
7zip-21.07.zip
'21.07'21.07
Diffstat (limited to 'C/Sha1Opt.c')
-rw-r--r--C/Sha1Opt.c373
1 file changed, 373 insertions, 0 deletions
diff --git a/C/Sha1Opt.c b/C/Sha1Opt.c
new file mode 100644
index 0000000..63132da
--- /dev/null
+++ b/C/Sha1Opt.c
@@ -0,0 +1,373 @@
1/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions
22021-04-01 : Igor Pavlov : Public domain */
3
4#include "Precomp.h"
5
6#if defined(_MSC_VER)
7#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
8// #define USE_MY_MM
9#endif
10#endif
11
12#include "CpuArch.h"
13
// x86/x64: decide whether the SHA-1 hardware-instruction path can be
// compiled (USE_HW_SHA), and define ATTRIB_SHA where the compiler needs a
// per-function __target__("sha,ssse3") attribute instead of global flags.
// The upstream "fix that check" notes mark version thresholds that are
// conservative guesses rather than exact minimums.
14#ifdef MY_CPU_X86_OR_AMD64
15 #if defined(__clang__)
16 #if (__clang_major__ >= 8) // fix that check
17 #define USE_HW_SHA
18 #ifndef __SHA__
19 #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
20 #if defined(_MSC_VER)
21 // SSSE3: for clang-cl:
22 #include <tmmintrin.h>
23 #define __SHA__
24 #endif
25 #endif
26 #pragma clang diagnostic ignored "-Wvector-conversion"
27 #endif
28 #elif defined(__GNUC__)
29 #if (__GNUC__ >= 8) // fix that check
30 #define USE_HW_SHA
31 #ifndef __SHA__
32 #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
33 // #pragma GCC target("sha,ssse3")
34 #endif
35 #endif
36 #elif defined(__INTEL_COMPILER)
37 #if (__INTEL_COMPILER >= 1800) // fix that check
38 #define USE_HW_SHA
39 #endif
40 #elif defined(_MSC_VER)
// MSVC: the USE_MY_MM emulation layer (see My_mm.h include below) lowers
// the required _MSC_VER from 1910 (VS2017) to 1300.
41 #ifdef USE_MY_MM
42 #define USE_VER_MIN 1300
43 #else
44 #define USE_VER_MIN 1910
45 #endif
46 #if _MSC_VER >= USE_VER_MIN
47 #define USE_HW_SHA
48 #endif
49 #endif
50// #endif // MY_CPU_X86_OR_AMD64
51
52#ifdef USE_HW_SHA
53
54// #pragma message("Sha1 HW")
55// #include <wmmintrin.h>
56
57#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
58#include <immintrin.h>
59#else
60#include <emmintrin.h>
61
62#if defined(_MSC_VER) && (_MSC_VER >= 1600)
63// #include <intrin.h>
64#endif
65
66#ifdef USE_MY_MM
67#include "My_mm.h"
68#endif
69
70#endif
71
72/*
73SHA1 uses:
74SSE2:
75 _mm_loadu_si128
76 _mm_storeu_si128
77 _mm_set_epi32
78 _mm_add_epi32
79 _mm_shuffle_epi32 / pshufd
80 _mm_xor_si128
81 _mm_cvtsi128_si32
82 _mm_cvtsi32_si128
83SSSE3:
84 _mm_shuffle_epi8 / pshufb
85
86SHA:
87 _mm_sha1*
88*/
89
// Thin wrappers so the round code below reads as destination-updating
// statements rather than nested intrinsic calls.
90#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
91#define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src);
92#define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask);
93#define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask);
94
// SHA1_RND4: four SHA-1 rounds; the immediate (0..3) selects the round
// function/constant group.  SHA1_NEXTE: computes the next e value from
// the current state and a message vector.
95#define SHA1_RND4(abcd, e0, f) abcd = _mm_sha1rnds4_epu32(abcd, e0, f);
96#define SHA1_NEXTE(e, m) e = _mm_sha1nexte_epu32(e, m);
97
98
99
100
101
// Message-schedule extension halves (W[t] = rot of W[t-3]^W[t-8]^W[t-14]^W[t-16]).
102#define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src);
103#define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src);
104
105
// Load 16 message bytes for block word-group (k) and byte-swap them with
// the `mask` shuffle (defined in the function below).
106#define LOAD_SHUFFLE(m, k) \
107 m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
108 SHUFFLE_EPI8(m, mask); \
109
// SM1/SM2/SM3: per-4-round message-schedule operations; SM3 is the full
// steady-state combination, SM1/SM2 are the ramp-up/ramp-down variants,
// NNN is a no-op for the final rounds where no further schedule is needed.
110#define SM1(m0, m1, m2, m3) \
111 SHA1_MSG1(m0, m1); \
112
113#define SM2(m0, m1, m2, m3) \
114 XOR_SI128(m3, m1); \
115 SHA1_MSG2(m3, m2); \
116
117#define SM3(m0, m1, m2, m3) \
118 XOR_SI128(m3, m1); \
119 SM1(m0, m1, m2, m3) \
120 SHA1_MSG2(m3, m2); \
121
122#define NNN(m0, m1, m2, m3)
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
// R4: one 4-round step (k = 0..19).  (k)/5 maps the 20 steps onto the
// four round-function groups (0..3) expected by _mm_sha1rnds4_epu32.
// e0/e1 ping-pong: the new e is derived from the pre-round abcd.
140#define R4(k, e0, e1, m0, m1, m2, m3, OP) \
141 e1 = abcd; \
142 SHA1_RND4(abcd, e0, (k) / 5); \
143 SHA1_NEXTE(e1, m1); \
144 OP(m0, m1, m2, m3); \
145
// R16: sixteen rounds with the message vectors rotated through m0..m3.
// `mx` replaces m0 in the last step -- the final R16 call passes the saved
// initial e (e2) there so SHA1_NEXTE folds it back in.
146#define R16(k, mx, OP0, OP1, OP2, OP3) \
147 R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \
148 R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \
149 R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \
150 R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 ) \
151
// 0x1B reverses the four 32-bit words of a vector (self-inverse): used to
// convert between memory order and the abcd layout the SHA ISA expects.
152#define PREPARE_STATE \
153 SHUFFLE_EPI32 (abcd, 0x1B); \
154 SHUFFLE_EPI32 (e0, 0x1B); \
155
156
157
158
159
// SHA-1 block update using x86 SHA-NI instructions.
// state[0..3] = a,b,c,d; state[4] = e.  Processes numBlocks 64-byte
// blocks starting at `data`; numBlocks == 0 is a no-op.
160void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
161#ifdef ATTRIB_SHA
162ATTRIB_SHA
163#endif
164void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
165{
// pshufb mask that byte-swaps each 32-bit word: SHA-1 message words are
// big-endian in the input stream.
166 const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
167
168 __m128i abcd, e0;
169
170 if (numBlocks == 0)
171 return;
172
173 abcd = _mm_loadu_si128((const __m128i *) (const void *) &state[0]); // dbca
174 e0 = _mm_cvtsi32_si128((int)state[4]); // 000e
175
// Reverse word order into the register layout the SHA instructions use
// (undone again after the loop before storing back).
176 PREPARE_STATE
177
178 do
179 {
180 __m128i abcd_save, e2;
181 __m128i m0, m1, m2, m3;
182 __m128i e1;
183
184
// Save the incoming state for the Davies-Meyer feed-forward add;
// e2 keeps the initial e, fed back through the last R16's `mx` slot.
185 abcd_save = abcd;
186 e2 = e0;
187
188 LOAD_SHUFFLE (m0, 0)
189 LOAD_SHUFFLE (m1, 1)
190 LOAD_SHUFFLE (m2, 2)
191 LOAD_SHUFFLE (m3, 3)
192
// Fold the first message vector into e before the first round group.
193 ADD_EPI32(e0, m0);
194
// 5 x 16 = 80 rounds; the OP arguments drive the message-schedule
// ramp-up (SM1/SM2), steady state (SM3), and tail (NNN).
195 R16 ( 0, m0, SM1, SM3, SM3, SM3 );
196 R16 ( 1, m0, SM3, SM3, SM3, SM3 );
197 R16 ( 2, m0, SM3, SM3, SM3, SM3 );
198 R16 ( 3, m0, SM3, SM3, SM3, SM3 );
199 R16 ( 4, e2, SM2, NNN, NNN, NNN );
200
201 ADD_EPI32(abcd, abcd_save);
202
203 data += 64;
204 }
205 while (--numBlocks);
206
207 PREPARE_STATE
208
209 _mm_storeu_si128((__m128i *) (void *) state, abcd);
210 *(state+4) = (UInt32)_mm_cvtsi128_si32(e0);
211}
212
213#endif // USE_HW_SHA
214
// ARM / ARM64: same detection scheme as the x86 branch above -- enable
// USE_HW_SHA where the compiler can emit the ARMv8 Crypto Extension
// SHA-1 instructions, and define ATTRIB_SHA for per-function targeting.
215#elif defined(MY_CPU_ARM_OR_ARM64)
216
217 #if defined(__clang__)
218 #if (__clang_major__ >= 8) // fix that check
219 #define USE_HW_SHA
220 #endif
221 #elif defined(__GNUC__)
222 #if (__GNUC__ >= 6) // fix that check
223 #define USE_HW_SHA
224 #endif
225 #elif defined(_MSC_VER)
226 #if _MSC_VER >= 1910
227 #define USE_HW_SHA
228 #endif
229 #endif
230
231#ifdef USE_HW_SHA
232
233// #pragma message("=== Sha1 HW === ")
234
235#if defined(__clang__) || defined(__GNUC__)
236 #ifdef MY_CPU_ARM64
237 #define ATTRIB_SHA __attribute__((__target__("+crypto")))
238 #else
239 #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
240 #endif
241#else
242 // _MSC_VER
243 // for arm32
244 #define _ARM_USE_NEW_NEON_INTRINSICS
245#endif
246
247#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
248#include <arm64_neon.h>
249#else
250#include <arm_neon.h>
251#endif
252
253typedef uint32x4_t v128;
254// typedef __n128 v128; // MSVC
255
// SHA-1 message words are big-endian: byte-swap each 32-bit word on
// little-endian targets, no-op on big-endian.
256#ifdef MY_CPU_BE
257 #define MY_rev32_for_LE(x)
258#else
259 #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
260#endif
261
// NOTE(review): these deref casts assume the pointer is suitably aligned
// for a 128-bit access -- callers pass the UInt32 state array; confirm
// alignment guarantees against Sha1.h.
262#define LOAD_128(_p) (*(const v128 *)(const void *)(_p))
263#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)
264
265#define LOAD_SHUFFLE(m, k) \
266 m = LOAD_128((data + (k) * 16)); \
267 MY_rev32_for_LE(m); \
268
// Shorthand for the ACLE SHA-1 intrinsics:
// SU0/SU1 = schedule update, C/P/M = 4 rounds with Choose/Parity/Majority,
// H = rotate-left-30 of lane 0 (next e), T = message + round constant.
269#define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3);
270#define SU1(dest, src) dest = vsha1su1q_u32(dest, src);
271#define C(e) abcd = vsha1cq_u32(abcd, e, t);
272#define P(e) abcd = vsha1pq_u32(abcd, e, t);
273#define M(e) abcd = vsha1mq_u32(abcd, e, t);
274#define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0))
275#define T(m, c) t = vaddq_u32(m, c)
276
277void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
278#ifdef ATTRIB_SHA
279ATTRIB_SHA
280#endif
281void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
282{
283 v128 abcd;
284 v128 c0, c1, c2, c3;
285 uint32_t e0;
286
287 if (numBlocks == 0)
288 return;
289
290 c0 = vdupq_n_u32(0x5a827999);
291 c1 = vdupq_n_u32(0x6ed9eba1);
292 c2 = vdupq_n_u32(0x8f1bbcdc);
293 c3 = vdupq_n_u32(0xca62c1d6);
294
295 abcd = LOAD_128(&state[0]);
296 e0 = state[4];
297
298 do
299 {
300 v128 abcd_save;
301 v128 m0, m1, m2, m3;
302 v128 t;
303 uint32_t e0_save, e1;
304
305 abcd_save = abcd;
306 e0_save = e0;
307
308 LOAD_SHUFFLE (m0, 0)
309 LOAD_SHUFFLE (m1, 1)
310 LOAD_SHUFFLE (m2, 2)
311 LOAD_SHUFFLE (m3, 3)
312
313 T(m0, c0); H(e1); C(e0);
314 T(m1, c0); SU0(m0, m1, m2); H(e0); C(e1);
315 T(m2, c0); SU0(m1, m2, m3); SU1(m0, m3); H(e1); C(e0);
316 T(m3, c0); SU0(m2, m3, m0); SU1(m1, m0); H(e0); C(e1);
317 T(m0, c0); SU0(m3, m0, m1); SU1(m2, m1); H(e1); C(e0);
318 T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1);
319 T(m2, c1); SU0(m1, m2, m3); SU1(m0, m3); H(e1); P(e0);
320 T(m3, c1); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1);
321 T(m0, c1); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0);
322 T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1);
323 T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0);
324 T(m3, c2); SU0(m2, m3, m0); SU1(m1, m0); H(e0); M(e1);
325 T(m0, c2); SU0(m3, m0, m1); SU1(m2, m1); H(e1); M(e0);
326 T(m1, c2); SU0(m0, m1, m2); SU1(m3, m2); H(e0); M(e1);
327 T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0);
328 T(m3, c3); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1);
329 T(m0, c3); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0);
330 T(m1, c3); SU1(m3, m2); H(e0); P(e1);
331 T(m2, c3); H(e1); P(e0);
332 T(m3, c3); H(e0); P(e1);
333
334 abcd = vaddq_u32(abcd, abcd_save);
335 e0 += e0_save;
336
337 data += 64;
338 }
339 while (--numBlocks);
340
341 STORE_128(&state[0], abcd);
342 state[4] = e0;
343}
344
345#endif // USE_HW_SHA
346
347#endif // MY_CPU_ARM_OR_ARM64
348
349
// Software fallback: no hardware SHA-1 support was detected for this
// compiler/target combination, so Sha1_UpdateBlocks_HW simply forwards
// to the portable implementation.
350#ifndef USE_HW_SHA
351
352// #error Stop_Compiling_UNSUPPORTED_SHA
353// #include <stdlib.h>
354
// Portable implementation -- presumably defined in Sha1.c; declared
// locally instead of including "Sha1.h" (see the commented include).
355// #include "Sha1.h"
356void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks);
357
358#pragma message("Sha1 HW-SW stub was used")
359
360void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
361void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
362{
363 Sha1_UpdateBlocks(state, data, numBlocks);
// Alternative (disabled): fail loudly instead of falling back.
364 /*
365 UNUSED_VAR(state);
366 UNUSED_VAR(data);
367 UNUSED_VAR(numBlocks);
368 exit(1);
369 return;
370 */
371}
372
373#endif