| author    | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2021-12-27 00:00:00 +0000 |
| --- | --- | --- |
| committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2022-03-18 15:35:13 +0500 |
| commit    | f19f813537c7aea1c20749c914e756b54a9c3cf5 (patch) | |
| tree      | 816ba62ca7c0fa19f2eb46d9e9d6f7dd7c3a744d /C/Sha256Opt.c | |
| parent    | 98e06a519b63b81986abe76d28887f6984a7732b (diff) | |
Diffstat (limited to 'C/Sha256Opt.c')

-rw-r--r--  C/Sha256Opt.c  373

1 file changed, 373 insertions, 0 deletions
diff --git a/C/Sha256Opt.c b/C/Sha256Opt.c
new file mode 100644
index 0000000..decc138
--- /dev/null
+++ b/C/Sha256Opt.c
@@ -0,0 +1,373 @@
/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
2021-04-01 : Igor Pavlov : Public domain */

#include "Precomp.h"

#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif

#include "CpuArch.h"

#ifdef MY_CPU_X86_OR_AMD64
  #if defined(__clang__)
    #if (__clang_major__ >= 8) // fix that check
      #define USE_HW_SHA
      #ifndef __SHA__
        #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
        #if defined(_MSC_VER)
          // SSSE3: for clang-cl:
          #include <tmmintrin.h>
          #define __SHA__
        #endif
      #endif

    #endif
  #elif defined(__GNUC__)
    #if (__GNUC__ >= 8) // fix that check
      #define USE_HW_SHA
      #ifndef __SHA__
        #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
        // #pragma GCC target("sha,ssse3")
      #endif
    #endif
  #elif defined(__INTEL_COMPILER)
    #if (__INTEL_COMPILER >= 1800) // fix that check
      #define USE_HW_SHA
    #endif
  #elif defined(_MSC_VER)
    #ifdef USE_MY_MM
      #define USE_VER_MIN 1300
    #else
      #define USE_VER_MIN 1910
    #endif
    #if _MSC_VER >= USE_VER_MIN
      #define USE_HW_SHA
    #endif
  #endif
// #endif // MY_CPU_X86_OR_AMD64
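
// USE_HW_SHA records that the compiler can emit the SHA-256 instructions at
// all; ATTRIB_SHA (when it is defined) applies
// __attribute__((target("sha,ssse3"))) to the functions below, so only those
// functions are compiled for the SHA / SSSE3 instruction sets and the rest of
// the build does not need -msha / -mssse3.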
51 | |||
52 | #ifdef USE_HW_SHA | ||
53 | |||
54 | // #pragma message("Sha256 HW") | ||
55 | // #include <wmmintrin.h> | ||
56 | |||
57 | #if !defined(_MSC_VER) || (_MSC_VER >= 1900) | ||
58 | #include <immintrin.h> | ||
59 | #else | ||
60 | #include <emmintrin.h> | ||
61 | |||
62 | #if defined(_MSC_VER) && (_MSC_VER >= 1600) | ||
63 | // #include <intrin.h> | ||
64 | #endif | ||
65 | |||
66 | #ifdef USE_MY_MM | ||
67 | #include "My_mm.h" | ||
68 | #endif | ||
69 | |||
70 | #endif | ||
71 | |||
72 | /* | ||
73 | SHA256 uses: | ||
74 | SSE2: | ||
75 | _mm_loadu_si128 | ||
76 | _mm_storeu_si128 | ||
77 | _mm_set_epi32 | ||
78 | _mm_add_epi32 | ||
79 | _mm_shuffle_epi32 / pshufd | ||
80 | |||
81 | |||
82 | |||
83 | SSSE3: | ||
84 | _mm_shuffle_epi8 / pshufb | ||
85 | _mm_alignr_epi8 | ||
86 | SHA: | ||
87 | _mm_sha256* | ||
88 | */ | ||
89 | |||
// The K array must be aligned to at least 16 bytes.
// The compiler can use the align attribute to select the load instruction:
//   movdqu - for code compiled without the align attribute
//   movdqa - for code compiled with the align attribute
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY


#define ADD_EPI32(dest, src)      dest = _mm_add_epi32(dest, src);
#define SHA256_MSG1(dest, src)    dest = _mm_sha256msg1_epu32(dest, src);
#define SHA256_MSG2(dest, src)    dest = _mm_sha256msg2_epu32(dest, src);
104 | |||
105 | |||
106 | #define LOAD_SHUFFLE(m, k) \ | ||
107 | m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ | ||
108 | m = _mm_shuffle_epi8(m, mask); \ | ||
109 | |||
110 | #define SM1(g0, g1, g2, g3) \ | ||
111 | SHA256_MSG1(g3, g0); \ | ||
112 | |||
113 | #define SM2(g0, g1, g2, g3) \ | ||
114 | tmp = _mm_alignr_epi8(g1, g0, 4); \ | ||
115 | ADD_EPI32(g2, tmp); \ | ||
    SHA256_MSG2(g2, g1); \
117 | |||
118 | // #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k) | ||
119 | // #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1) | ||
120 | |||
121 | |||
122 | #define NNN(g0, g1, g2, g3) | ||
123 | |||
124 | |||
125 | #define RND2(t0, t1) \ | ||
126 | t0 = _mm_sha256rnds2_epu32(t0, t1, msg); | ||
127 | |||
128 | #define RND2_0(m, k) \ | ||
129 | msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \ | ||
130 | RND2(state0, state1); \ | ||
131 | msg = _mm_shuffle_epi32(msg, 0x0E); \ | ||
132 | |||
133 | |||
134 | #define RND2_1 \ | ||
135 | RND2(state1, state0); \ | ||
136 | |||
137 | |||
// We use a scheme with 3 rounds of lookahead for SHA256_MSG1 and 2 rounds for SHA256_MSG2.
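//
// Interleaving the message-schedule updates (SM1 / SM2) with the round
// computation lets the sha256msg1 / sha256msg2 latency overlap with
// sha256rnds2 instead of running the two phases back to back; the OP0 / OP1
// slots of R4 / R16 below select which schedule update is issued next to
// each pair of rounds (NNN = no update).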
139 | |||
140 | #define R4(k, g0, g1, g2, g3, OP0, OP1) \ | ||
141 | RND2_0(g0, k); \ | ||
142 | OP0(g0, g1, g2, g3); \ | ||
143 | RND2_1; \ | ||
144 | OP1(g0, g1, g2, g3); \ | ||
145 | |||
146 | #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ | ||
147 | R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \ | ||
148 | R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \ | ||
149 | R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \ | ||
150 | R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \ | ||
151 | |||
152 | #define PREPARE_STATE \ | ||
153 | tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ | ||
154 | state0 = _mm_shuffle_epi32(state1, 0x1B); /* efgh */ \ | ||
155 | state1 = state0; \ | ||
156 | state0 = _mm_unpacklo_epi64(state0, tmp); /* cdgh */ \ | ||
157 | state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \ | ||
158 | |||
159 | |||
160 | void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); | ||
161 | #ifdef ATTRIB_SHA | ||
162 | ATTRIB_SHA | ||
163 | #endif | ||
164 | void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) | ||
165 | { | ||
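  // The pshufb mask below byte-swaps each 32-bit lane of a loaded block:
  // SHA-256 message words are big-endian, while the block bytes are loaded
  // as little-endian 32-bit values.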
  const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
  __m128i tmp;
  __m128i state0, state1;

  if (numBlocks == 0)
    return;

  state0 = _mm_loadu_si128((const __m128i *) (const void *) &state[0]);
  state1 = _mm_loadu_si128((const __m128i *) (const void *) &state[4]);

  PREPARE_STATE

  do
  {
    __m128i state0_save, state1_save;
    __m128i m0, m1, m2, m3;
    __m128i msg;
    // #define msg tmp

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)



    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 );
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN );

    ADD_EPI32(state0, state0_save);
    ADD_EPI32(state1, state1_save);

    data += 64;
  }
  while (--numBlocks);

  PREPARE_STATE

  _mm_storeu_si128((__m128i *) (void *) &state[0], state0);
  _mm_storeu_si128((__m128i *) (void *) &state[4], state1);
}

#endif // USE_HW_SHA

#elif defined(MY_CPU_ARM_OR_ARM64)

  #if defined(__clang__)
    #if (__clang_major__ >= 8) // fix that check
      #define USE_HW_SHA
    #endif
  #elif defined(__GNUC__)
    #if (__GNUC__ >= 6) // fix that check
      #define USE_HW_SHA
    #endif
  #elif defined(_MSC_VER)
    #if _MSC_VER >= 1910
      #define USE_HW_SHA
    #endif
  #endif

#ifdef USE_HW_SHA

// #pragma message("=== Sha256 HW === ")

#if defined(__clang__) || defined(__GNUC__)
  #ifdef MY_CPU_ARM64
    #define ATTRIB_SHA __attribute__((__target__("+crypto")))
  #else
    #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
  #endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC

#ifdef MY_CPU_BE
  #define MY_rev32_for_LE(x)
#else
  #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#endif
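
// SHA-256 treats each 16-byte block as four big-endian 32-bit words.
// On little-endian targets the loaded lanes are therefore byte-swapped with
// vrev32q_u8; on big-endian targets the load already gives the required lane
// contents and MY_rev32_for_LE expands to nothing.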
261 | |||
262 | #define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) | ||
263 | #define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) | ||
264 | |||
265 | #define LOAD_SHUFFLE(m, k) \ | ||
266 | m = LOAD_128((data + (k) * 16)); \ | ||
267 | MY_rev32_for_LE(m); \ | ||
268 | |||
// The K array must be aligned to at least 16 bytes.
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY


#define SHA256_SU0(dest, src)        dest = vsha256su0q_u32(dest, src);
#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);

#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0)
#define SM2(g0, g1, g2, g3) SHA256_SU1(g2, g0, g1)
#define NNN(g0, g1, g2, g3)
283 | |||
284 | |||
285 | #define R4(k, g0, g1, g2, g3, OP0, OP1) \ | ||
286 | msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \ | ||
287 | tmp = state0; \ | ||
288 | state0 = vsha256hq_u32( state0, state1, msg ); \ | ||
289 | state1 = vsha256h2q_u32( state1, tmp, msg ); \ | ||
290 | OP0(g0, g1, g2, g3); \ | ||
291 | OP1(g0, g1, g2, g3); \ | ||
292 | |||
293 | |||
#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \


void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  v128 state0, state1;

  if (numBlocks == 0)
    return;

  state0 = LOAD_128(&state[0]);
  state1 = LOAD_128(&state[4]);

  do
  {
    v128 state0_save, state1_save;
    v128 m0, m1, m2, m3;
    v128 msg, tmp;

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 );
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN );

    state0 = vaddq_u32(state0, state0_save);
    state1 = vaddq_u32(state1, state1_save);

    data += 64;
  }
  while (--numBlocks);

  STORE_128(&state[0], state0);
  STORE_128(&state[4], state1);
}

#endif // USE_HW_SHA

#endif // MY_CPU_ARM_OR_ARM64


#ifndef USE_HW_SHA

// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>

// #include "Sha256.h"
void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks);

#pragma message("Sha256 HW-SW stub was used")

void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  Sha256_UpdateBlocks(state, data, numBlocks);
  /*
  UNUSED_VAR(state);
  UNUSED_VAR(data);
  UNUSED_VAR(numBlocks);
  exit(1);
  return;
  */
}

#endif
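
/*
The block functions above are not selected automatically: the caller picks
between the generic Sha256_UpdateBlocks and Sha256_UpdateBlocks_HW at run
time. A minimal sketch of such a dispatcher is shown below; it is only an
illustration, and the CPU_IsSupported_SHA() / CPU_IsSupported_SSSE3() checks
are assumed to be provided by CpuArch.h (they are not defined in this file).

static unsigned g_UseHwSha;  // hypothetical flag, set once at startup

static void Sha256_SelectFunction(void)
{
  g_UseHwSha = 0;
  #if defined(USE_HW_SHA) && defined(MY_CPU_X86_OR_AMD64)
  if (CPU_IsSupported_SHA() && CPU_IsSupported_SSSE3())
    g_UseHwSha = 1;
  #endif
}

static void Sha256_UpdateBlocks_Auto(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  if (g_UseHwSha)
    Sha256_UpdateBlocks_HW(state, data, numBlocks);
  else
    Sha256_UpdateBlocks(state, data, numBlocks);
}
*/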