diff options
Diffstat (limited to '')
-rw-r--r-- | C/Sha1Opt.c | 373 |
1 file changed, 373 insertions, 0 deletions
diff --git a/C/Sha1Opt.c b/C/Sha1Opt.c new file mode 100644 index 0000000..63132da --- /dev/null +++ b/C/Sha1Opt.c | |||
@@ -0,0 +1,373 @@ | |||
1 | /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions | ||
2 | 2021-04-01 : Igor Pavlov : Public domain */ | ||
3 | |||
4 | #include "Precomp.h" | ||
5 | |||
6 | #if defined(_MSC_VER) | ||
7 | #if (_MSC_VER < 1900) && (_MSC_VER >= 1200) | ||
8 | // #define USE_MY_MM | ||
9 | #endif | ||
10 | #endif | ||
11 | |||
12 | #include "CpuArch.h" | ||
13 | |||
14 | #ifdef MY_CPU_X86_OR_AMD64 | ||
15 | #if defined(__clang__) | ||
16 | #if (__clang_major__ >= 8) // fix that check | ||
17 | #define USE_HW_SHA | ||
18 | #ifndef __SHA__ | ||
19 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) | ||
20 | #if defined(_MSC_VER) | ||
21 | // SSSE3: for clang-cl: | ||
22 | #include <tmmintrin.h> | ||
23 | #define __SHA__ | ||
24 | #endif | ||
25 | #endif | ||
26 | #pragma clang diagnostic ignored "-Wvector-conversion" | ||
27 | #endif | ||
28 | #elif defined(__GNUC__) | ||
29 | #if (__GNUC__ >= 8) // fix that check | ||
30 | #define USE_HW_SHA | ||
31 | #ifndef __SHA__ | ||
32 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) | ||
33 | // #pragma GCC target("sha,ssse3") | ||
34 | #endif | ||
35 | #endif | ||
36 | #elif defined(__INTEL_COMPILER) | ||
37 | #if (__INTEL_COMPILER >= 1800) // fix that check | ||
38 | #define USE_HW_SHA | ||
39 | #endif | ||
40 | #elif defined(_MSC_VER) | ||
41 | #ifdef USE_MY_MM | ||
42 | #define USE_VER_MIN 1300 | ||
43 | #else | ||
44 | #define USE_VER_MIN 1910 | ||
45 | #endif | ||
46 | #if _MSC_VER >= USE_VER_MIN | ||
47 | #define USE_HW_SHA | ||
48 | #endif | ||
49 | #endif | ||
50 | // #endif // MY_CPU_X86_OR_AMD64 | ||
51 | |||
52 | #ifdef USE_HW_SHA | ||
53 | |||
54 | // #pragma message("Sha1 HW") | ||
55 | // #include <wmmintrin.h> | ||
56 | |||
57 | #if !defined(_MSC_VER) || (_MSC_VER >= 1900) | ||
58 | #include <immintrin.h> | ||
59 | #else | ||
60 | #include <emmintrin.h> | ||
61 | |||
62 | #if defined(_MSC_VER) && (_MSC_VER >= 1600) | ||
63 | // #include <intrin.h> | ||
64 | #endif | ||
65 | |||
66 | #ifdef USE_MY_MM | ||
67 | #include "My_mm.h" | ||
68 | #endif | ||
69 | |||
70 | #endif | ||
71 | |||
72 | /* | ||
73 | SHA1 uses: | ||
74 | SSE2: | ||
75 | _mm_loadu_si128 | ||
76 | _mm_storeu_si128 | ||
77 | _mm_set_epi32 | ||
78 | _mm_add_epi32 | ||
79 | _mm_shuffle_epi32 / pshufd | ||
80 | _mm_xor_si128 | ||
81 | _mm_cvtsi128_si32 | ||
82 | _mm_cvtsi32_si128 | ||
83 | SSSE3: | ||
84 | _mm_shuffle_epi8 / pshufb | ||
85 | |||
86 | SHA: | ||
87 | _mm_sha1* | ||
88 | */ | ||
89 | |||
/* Thin statement macros over the SSE2/SSSE3/SHA intrinsics.
   NOTE(review): several of these capture surrounding locals by name
   (abcd, e0, e1, mask, data) — they are only usable inside
   Sha1_UpdateBlocks_HW below. */

#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
#define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src);
#define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask);
#define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask);

// 4 SHA-1 rounds in one instruction; f in 0..3 selects the round
// function/constant for rounds 0-19 / 20-39 / 40-59 / 60-79
#define SHA1_RND4(abcd, e0, f) abcd = _mm_sha1rnds4_epu32(abcd, e0, f);
// combine state with the next 4 message words to form the next E input
#define SHA1_NEXTE(e, m) e = _mm_sha1nexte_epu32(e, m);

// message-schedule (W[t]) expansion: each 16-word extension needs
// an msg1 step, an xor, and an msg2 step spread across the SM* macros
#define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src);
#define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src);

// load 16 input bytes and byte-swap each 32-bit lane (pshufb with the
// local `mask`) so the big-endian message words are in native order
#define LOAD_SHUFFLE(m, k) \
    m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
    SHUFFLE_EPI8(m, mask); \

// SM1: first (partial) schedule step only — used while the pipeline fills
#define SM1(m0, m1, m2, m3) \
    SHA1_MSG1(m0, m1); \

// SM2: finishing schedule steps only — used while the pipeline drains
#define SM2(m0, m1, m2, m3) \
    XOR_SI128(m3, m1); \
    SHA1_MSG2(m3, m2); \

// SM3: full steady-state schedule update (xor + msg1 + msg2)
#define SM3(m0, m1, m2, m3) \
    XOR_SI128(m3, m1); \
    SM1(m0, m1, m2, m3) \
    SHA1_MSG2(m3, m2); \

// NNN: no schedule work (final rounds need no further W expansion)
#define NNN(m0, m1, m2, m3)

// One 4-round step: snapshot the state into e1, run 4 rounds, then
// derive the next E from the snapshot plus the next message words.
// k counts 4-round groups, so (k)/5 yields the constant index 0..3
// (each SHA-1 round function covers 20 rounds = five R4 groups).
#define R4(k, e0, e1, m0, m1, m2, m3, OP) \
    e1 = abcd; \
    SHA1_RND4(abcd, e0, (k) / 5); \
    SHA1_NEXTE(e1, m1); \
    OP(m0, m1, m2, m3); \

// 16 rounds: four R4 steps with the message registers rotating roles;
// the caller chooses mx (m0 in steady state, e2 in the final group —
// see the e2 trick in Sha1_UpdateBlocks_HW)
#define R16(k, mx, OP0, OP1, OP2, OP3) \
    R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \
    R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \
    R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \
    R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 ) \

// 0x1B reverses the order of the four 32-bit lanes; converts between the
// memory layout of state[] and the lane order the SHA instructions use
// (applied once on entry and once on exit — it is its own inverse)
#define PREPARE_STATE \
    SHUFFLE_EPI32 (abcd, 0x1B); \
    SHUFFLE_EPI32 (e0, 0x1B); \

156 | |||
157 | |||
158 | |||
159 | |||
// Process numBlocks 64-byte blocks using the x86 SHA instruction set.
// state = { A, B, C, D, E } (5 x UInt32), updated in place.
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
{
  // pshufb control that byte-reverses each 32-bit lane (the SHA-1 message
  // is big-endian in memory)
  const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);

  __m128i abcd, e0;

  if (numBlocks == 0)
    return;

  abcd = _mm_loadu_si128((const __m128i *) (const void *) &state[0]); // dbca
  e0 = _mm_cvtsi32_si128((int)state[4]); // 000e

  PREPARE_STATE   // reorder lanes into the layout sha1rnds4 expects

  do
  {
    __m128i abcd_save, e2;
    __m128i m0, m1, m2, m3;
    __m128i e1;


    abcd_save = abcd;  // saved for the feed-forward addition after 80 rounds
    e2 = e0;           // saved E; fed back in as mx of the last R16, so the
                       // final SHA1_NEXTE adds it into the output E

    // load the 64-byte block as four word vectors W[0..15]
    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    // sha1rnds4's second operand carries E pre-added to the message words
    ADD_EPI32(e0, m0);

    // 80 rounds = five 16-round groups; schedule fills (SM1), runs at
    // steady state (SM3), then drains (SM2, NNN)
    R16 ( 0, m0, SM1, SM3, SM3, SM3 );
    R16 ( 1, m0, SM3, SM3, SM3, SM3 );
    R16 ( 2, m0, SM3, SM3, SM3, SM3 );
    R16 ( 3, m0, SM3, SM3, SM3, SM3 );
    R16 ( 4, e2, SM2, NNN, NNN, NNN );

    // feed-forward: add the block's input state back into the result
    ADD_EPI32(abcd, abcd_save);

    data += 64;
  }
  while (--numBlocks);

  PREPARE_STATE   // back to memory lane order

  _mm_storeu_si128((__m128i *) (void *) state, abcd);
  *(state+4) = (UInt32)_mm_cvtsi128_si32(e0);
}
212 | |||
213 | #endif // USE_HW_SHA | ||
214 | |||
215 | #elif defined(MY_CPU_ARM_OR_ARM64) | ||
216 | |||
217 | #if defined(__clang__) | ||
218 | #if (__clang_major__ >= 8) // fix that check | ||
219 | #define USE_HW_SHA | ||
220 | #endif | ||
221 | #elif defined(__GNUC__) | ||
222 | #if (__GNUC__ >= 6) // fix that check | ||
223 | #define USE_HW_SHA | ||
224 | #endif | ||
225 | #elif defined(_MSC_VER) | ||
226 | #if _MSC_VER >= 1910 | ||
227 | #define USE_HW_SHA | ||
228 | #endif | ||
229 | #endif | ||
230 | |||
231 | #ifdef USE_HW_SHA | ||
232 | |||
233 | // #pragma message("=== Sha1 HW === ") | ||
234 | |||
235 | #if defined(__clang__) || defined(__GNUC__) | ||
236 | #ifdef MY_CPU_ARM64 | ||
237 | #define ATTRIB_SHA __attribute__((__target__("+crypto"))) | ||
238 | #else | ||
239 | #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | ||
240 | #endif | ||
241 | #else | ||
242 | // _MSC_VER | ||
243 | // for arm32 | ||
244 | #define _ARM_USE_NEW_NEON_INTRINSICS | ||
245 | #endif | ||
246 | |||
247 | #if defined(_MSC_VER) && defined(MY_CPU_ARM64) | ||
248 | #include <arm64_neon.h> | ||
249 | #else | ||
250 | #include <arm_neon.h> | ||
251 | #endif | ||
252 | |||
typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC

#ifdef MY_CPU_BE
#define MY_rev32_for_LE(x)
#else
// byte-swap each 32-bit lane on little-endian targets (message is big-endian)
#define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#endif

#define LOAD_128(_p) (*(const v128 *)(const void *)(_p))
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)

// load 16 input bytes and normalize word byte order
#define LOAD_SHUFFLE(m, k) \
    m = LOAD_128((data + (k) * 16)); \
    MY_rev32_for_LE(m); \

/* Round/schedule macros. NOTE(review): these capture the locals
   abcd, t of Sha1_UpdateBlocks_HW by name. */
// SU0/SU1: two-step message-schedule (W[t]) extension
#define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3);
#define SU1(dest, src) dest = vsha1su1q_u32(dest, src);
// 4 rounds with Choose / Parity / Majority round function
#define C(e) abcd = vsha1cq_u32(abcd, e, t);
#define P(e) abcd = vsha1pq_u32(abcd, e, t);
#define M(e) abcd = vsha1mq_u32(abcd, e, t);
// SHA1H: fixed-rotate (by 30) of lane 0, producing the next E value
#define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0))
// add the round constant to 4 message words
#define T(m, c) t = vaddq_u32(m, c)
276 | |||
277 | void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); | ||
278 | #ifdef ATTRIB_SHA | ||
279 | ATTRIB_SHA | ||
280 | #endif | ||
281 | void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) | ||
282 | { | ||
283 | v128 abcd; | ||
284 | v128 c0, c1, c2, c3; | ||
285 | uint32_t e0; | ||
286 | |||
287 | if (numBlocks == 0) | ||
288 | return; | ||
289 | |||
290 | c0 = vdupq_n_u32(0x5a827999); | ||
291 | c1 = vdupq_n_u32(0x6ed9eba1); | ||
292 | c2 = vdupq_n_u32(0x8f1bbcdc); | ||
293 | c3 = vdupq_n_u32(0xca62c1d6); | ||
294 | |||
295 | abcd = LOAD_128(&state[0]); | ||
296 | e0 = state[4]; | ||
297 | |||
298 | do | ||
299 | { | ||
300 | v128 abcd_save; | ||
301 | v128 m0, m1, m2, m3; | ||
302 | v128 t; | ||
303 | uint32_t e0_save, e1; | ||
304 | |||
305 | abcd_save = abcd; | ||
306 | e0_save = e0; | ||
307 | |||
308 | LOAD_SHUFFLE (m0, 0) | ||
309 | LOAD_SHUFFLE (m1, 1) | ||
310 | LOAD_SHUFFLE (m2, 2) | ||
311 | LOAD_SHUFFLE (m3, 3) | ||
312 | |||
313 | T(m0, c0); H(e1); C(e0); | ||
314 | T(m1, c0); SU0(m0, m1, m2); H(e0); C(e1); | ||
315 | T(m2, c0); SU0(m1, m2, m3); SU1(m0, m3); H(e1); C(e0); | ||
316 | T(m3, c0); SU0(m2, m3, m0); SU1(m1, m0); H(e0); C(e1); | ||
317 | T(m0, c0); SU0(m3, m0, m1); SU1(m2, m1); H(e1); C(e0); | ||
318 | T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1); | ||
319 | T(m2, c1); SU0(m1, m2, m3); SU1(m0, m3); H(e1); P(e0); | ||
320 | T(m3, c1); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1); | ||
321 | T(m0, c1); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0); | ||
322 | T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1); | ||
323 | T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0); | ||
324 | T(m3, c2); SU0(m2, m3, m0); SU1(m1, m0); H(e0); M(e1); | ||
325 | T(m0, c2); SU0(m3, m0, m1); SU1(m2, m1); H(e1); M(e0); | ||
326 | T(m1, c2); SU0(m0, m1, m2); SU1(m3, m2); H(e0); M(e1); | ||
327 | T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0); | ||
328 | T(m3, c3); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1); | ||
329 | T(m0, c3); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0); | ||
330 | T(m1, c3); SU1(m3, m2); H(e0); P(e1); | ||
331 | T(m2, c3); H(e1); P(e0); | ||
332 | T(m3, c3); H(e0); P(e1); | ||
333 | |||
334 | abcd = vaddq_u32(abcd, abcd_save); | ||
335 | e0 += e0_save; | ||
336 | |||
337 | data += 64; | ||
338 | } | ||
339 | while (--numBlocks); | ||
340 | |||
341 | STORE_128(&state[0], abcd); | ||
342 | state[4] = e0; | ||
343 | } | ||
344 | |||
345 | #endif // USE_HW_SHA | ||
346 | |||
347 | #endif // MY_CPU_ARM_OR_ARM64 | ||
348 | |||
349 | |||
350 | #ifndef USE_HW_SHA | ||
351 | |||
352 | // #error Stop_Compiling_UNSUPPORTED_SHA | ||
353 | // #include <stdlib.h> | ||
354 | |||
355 | // #include "Sha1.h" | ||
356 | void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks); | ||
357 | |||
358 | #pragma message("Sha1 HW-SW stub was used") | ||
359 | |||
360 | void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks); | ||
361 | void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks) | ||
362 | { | ||
363 | Sha1_UpdateBlocks(state, data, numBlocks); | ||
364 | /* | ||
365 | UNUSED_VAR(state); | ||
366 | UNUSED_VAR(data); | ||
367 | UNUSED_VAR(numBlocks); | ||
368 | exit(1); | ||
369 | return; | ||
370 | */ | ||
371 | } | ||
372 | |||
373 | #endif | ||