Diffstat (limited to 'C/Sha512Opt.c')
 -rw-r--r--  C/Sha512Opt.c  395
 1 file changed, 395 insertions, 0 deletions

diff --git a/C/Sha512Opt.c b/C/Sha512Opt.c
new file mode 100644
index 0000000..3a13868
--- /dev/null
+++ b/C/Sha512Opt.c
@@ -0,0 +1,395 @@
/* Sha512Opt.c -- SHA-512 optimized code for SHA-512 hardware instructions
: Igor Pavlov : Public domain */

#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"

// #define Z7_USE_HW_SHA_STUB // for debug
#ifdef MY_CPU_X86_OR_AMD64
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) // fix it
  #define USE_HW_SHA
#elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \
   || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \
   || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000)
  #define USE_HW_SHA
  #if !defined(__INTEL_COMPILER)
  // icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
  #if !defined(__SHA512__) || !defined(__AVX2__)
    #define ATTRIB_SHA512 __attribute__((__target__("sha512,avx2")))
  #endif
  #endif
#elif defined(Z7_MSC_VER_ORIGINAL)
  #if (_MSC_VER >= 1940)
    #define USE_HW_SHA
  #else
    // #define Z7_USE_HW_SHA_STUB
  #endif
#endif
// #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
  // #define Z7_USE_HW_SHA_STUB // for debug
#endif

#ifdef USE_HW_SHA

// #pragma message("Sha512 HW")

#include <immintrin.h>

#if defined (__clang__) && defined(_MSC_VER)
  #if !defined(__AVX__)
    #include <avxintrin.h>
  #endif
  #if !defined(__AVX2__)
    #include <avx2intrin.h>
  #endif
  #if !defined(__SHA512__)
    #include <sha512intrin.h>
  #endif
#else

#endif

/*
SHA512 uses:
AVX:
  _mm256_loadu_si256 (vmovdqu)
  _mm256_storeu_si256
  _mm256_set_epi32 (unused)
AVX2:
  _mm256_add_epi64 : vpaddq
  _mm256_shuffle_epi8 : vpshufb
  _mm256_shuffle_epi32 : pshufd
  _mm256_blend_epi32 : vpblendd
  _mm256_permute4x64_epi64 : vpermq : 3c
  _mm256_permute2x128_si256: vperm2i128 : 3c
  _mm256_extracti128_si256 : vextracti128 : 3c
SHA512:
  _mm256_sha512*
*/

// The K array must be aligned to at least 32 bytes.
// The compiler can look at the align attribute and select
//   vmovdqu - for code without the align attribute
//   vmovdqa - for code with the align attribute
extern
MY_ALIGN(64)
const UInt64 SHA512_K_ARRAY[80];
#define K SHA512_K_ARRAY
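/*
A hedged reference sketch (not part of this file): the table itself is expected
to be defined once, with matching alignment, in the portable SHA-512 source
(presumably Sha512.c in this codebase), starting with the standard FIPS 180-4
round constants:

  MY_ALIGN(64)
  const UInt64 SHA512_K_ARRAY[80] = {
    0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc,
    // ... 76 more constants ...
  };
*/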


#define ADD_EPI64(dest, src)    dest = _mm256_add_epi64(dest, src);
#define SHA512_MSG1(dest, src)  dest = _mm256_sha512msg1_epi64(dest, _mm256_extracti128_si256(src, 0));
#define SHA512_MSG2(dest, src)  dest = _mm256_sha512msg2_epi64(dest, src);

#define LOAD_SHUFFLE(m, k) \
    m = _mm256_loadu_si256((const __m256i *)(const void *)(data + (k) * 32)); \
    m = _mm256_shuffle_epi8(m, mask); \

#define NNN(m0, m1, m2, m3)

#define SM1(m1, m2, m3, m0) \
    SHA512_MSG1(m0, m1); \

#define SM2(m2, m3, m0, m1) \
    ADD_EPI64(m0, _mm256_permute4x64_epi64(_mm256_blend_epi32(m2, m3, 3), 0x39)); \
    SHA512_MSG2(m0, m3); \

#define RND2(t0, t1, lane) \
    t0 = _mm256_sha512rnds2_epi64(t0, t1, _mm256_extracti128_si256(msg, lane));
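
// Note on RND2: vsha512rnds2 performs two SHA-512 rounds, and its 128-bit
// operand carries the pre-computed sum (message word + round constant) for
// those two rounds. That is why R4 below first adds K into msg and then feeds
// lane 0 and lane 1 of msg to two RND2 invocations.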



#define R4(k, m0, m1, m2, m3, OP0, OP1) \
    msg = _mm256_add_epi64(m0, *(const __m256i *) (const void *) &K[(k) * 4]); \
    RND2(state0, state1, 0); OP0(m0, m1, m2, m3) \
    RND2(state1, state0, 1); OP1(m0, m1, m2, m3) \




#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \

#define PREPARE_STATE \
    state0 = _mm256_shuffle_epi32(state0, 0x4e);               /* cdab */ \
    state1 = _mm256_shuffle_epi32(state1, 0x4e);               /* ghef */ \
    tmp = state0; \
    state0 = _mm256_permute2x128_si256(state0, state1, 0x13);  /* cdgh */ \
    state1 = _mm256_permute2x128_si256(tmp, state1, 2);        /* abef */ \

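// PREPARE_STATE reorders the eight 64-bit state words from their natural
// (a,b,c,d,e,f,g,h) memory order into the (c,d,g,h) / (a,b,e,f) pairing used
// by the vsha512rnds2 rounds. The permutation is its own inverse, so running
// it a second time after the block loop restores the natural word order
// before the final store.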

void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA512
ATTRIB_SHA512
#endif
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
  const __m256i mask = _mm256_set_epi32(
      0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607,
      0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607);
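  // The mask above, used with _mm256_shuffle_epi8 in LOAD_SHUFFLE, reverses
  // the byte order of each 64-bit word, converting the big-endian message
  // words of the input block to the little-endian order used by the rounds.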
  __m256i tmp, state0, state1;

  if (numBlocks == 0)
    return;

  state0 = _mm256_loadu_si256((const __m256i *) (const void *) &state[0]);
  state1 = _mm256_loadu_si256((const __m256i *) (const void *) &state[4]);

  PREPARE_STATE

  do
  {
    __m256i state0_save, state1_save;
    __m256i m0, m1, m2, m3;
    __m256i msg;
    // #define msg tmp

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)



    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 3, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 4, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )
    ADD_EPI64(state0, state0_save)
    ADD_EPI64(state1, state1_save)

    data += 128;
  }
  while (--numBlocks);

  PREPARE_STATE

  _mm256_storeu_si256((__m256i *) (void *) &state[0], state0);
  _mm256_storeu_si256((__m256i *) (void *) &state[4], state1);
}
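
/*
A hedged usage sketch (illustration only, not part of this file): callers are
expected to select the HW routine at run time only when the CPU reports
SHA-512 support. The detection helper name (CPU_IsSupported_SHA512) is an
assumption based on the CpuArch naming convention; Sha512_UpdateBlocks is the
portable C implementation declared elsewhere.

  typedef void (Z7_FASTCALL *SHA512_FUNC_UPDATE_BLOCKS)(
      UInt64 state[8], const Byte *data, size_t numBlocks);

  SHA512_FUNC_UPDATE_BLOCKS f = Sha512_UpdateBlocks;  // portable C version
  if (CPU_IsSupported_SHA512())                       // assumed helper name
    f = Sha512_UpdateBlocks_HW;
  f(state, data, numBlocks);
*/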

#endif // USE_HW_SHA

// gcc 8.5 also supports sha512, but we also need support in the assembler that is called by gcc
#elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE)

#if defined(__ARM_FEATURE_SHA512)
  #define USE_HW_SHA
#else
  #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \
       || defined(__GNUC__) && (__GNUC__ >= 9) \
      ) \
      || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it
    #define USE_HW_SHA
  #endif
#endif

#ifdef USE_HW_SHA

// #pragma message("=== Sha512 HW === ")


#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA512)
// #pragma message("=== we define SHA3 ATTRIB_SHA512 === ")
#if defined(__clang__)
  #define ATTRIB_SHA512 __attribute__((__target__("sha3"))) // "armv8.2-a,sha3"
#else
  #define ATTRIB_SHA512 __attribute__((__target__("arch=armv8.2-a+sha3")))
#endif
#endif
#endif


#if defined(Z7_MSC_VER_ORIGINAL)
#include <arm64_neon.h>
#else

#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA512)
// #pragma message("=== we set __ARM_FEATURE_SHA512 1 === ")
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define Z7_ARM_FEATURE_SHA512_WAS_SET 1
#define __ARM_FEATURE_SHA512 1
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang

#include <arm_neon.h>

#if defined(Z7_ARM_FEATURE_SHA512_WAS_SET) && \
    defined(__ARM_FEATURE_SHA512)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#undef __ARM_FEATURE_SHA512
#undef Z7_ARM_FEATURE_SHA512_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
// #pragma message("=== we undefine __ARM_FEATURE_SHA512 === ")
#endif

#endif // Z7_MSC_VER_ORIGINAL

typedef uint64x2_t v128_64;
// typedef __n128 v128_64; // MSVC

#ifdef MY_CPU_BE
  #define MY_rev64_for_LE(x) x
#else
  #define MY_rev64_for_LE(x) vrev64q_u8(x)
#endif
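// On little-endian targets vrev64q_u8 byte-reverses each 64-bit lane, so the
// big-endian 64-bit message words of the block are loaded in host order;
// on big-endian targets no swap is needed.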

#define LOAD_128_64(_p)      vld1q_u64(_p)
#define LOAD_128_8(_p)       vld1q_u8 (_p)
#define STORE_128_64(_p, _v) vst1q_u64(_p, _v)

#define LOAD_SHUFFLE(m, k) \
    m = vreinterpretq_u64_u8( \
        MY_rev64_for_LE( \
        LOAD_128_8(data + (k) * 16))); \

// The K array must be aligned to at least 16 bytes.
extern
MY_ALIGN(64)
const UInt64 SHA512_K_ARRAY[80];
#define K SHA512_K_ARRAY

#define NN(m0, m1, m4, m5, m7)
#define SM(m0, m1, m4, m5, m7) \
    m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1));
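
// SM performs one step of the SHA-512 message schedule: vsha512su0q_u64 and
// vsha512su1q_u64 together expand two new schedule words from earlier words.
// R2 below then runs two rounds: vsha512hq_u64 / vsha512h2q_u64 consume the
// pre-added (message + K) pair and update the working state.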

#define R2(k, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP) \
    OP(m0, m1, m4, m5, m7) \
    t = vaddq_u64(m0, vld1q_u64(k)); \
    t = vaddq_u64(vextq_u64(t, t, 1), a3); \
    t = vsha512hq_u64(t, vextq_u64(a2, a3, 1), vextq_u64(a1, a2, 1)); \
    a3 = vsha512h2q_u64(t, a1, a0); \
    a1 = vaddq_u64(a1, t); \

#define R8(k, m0,m1,m2,m3,m4,m5,m6,m7, OP) \
    R2 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP ) \
    R2 ( (k)+1*2, m1,m2,m3,m4,m5,m6,m7,m0, a3,a0,a1,a2, OP ) \
    R2 ( (k)+2*2, m2,m3,m4,m5,m6,m7,m0,m1, a2,a3,a0,a1, OP ) \
    R2 ( (k)+3*2, m3,m4,m5,m6,m7,m0,m1,m2, a1,a2,a3,a0, OP ) \

#define R16(k, OP) \
    R8 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, OP ) \
    R8 ( (k)+4*2, m4,m5,m6,m7,m0,m1,m2,m3, OP ) \


void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA512
ATTRIB_SHA512
#endif
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
  v128_64 a0, a1, a2, a3;

  if (numBlocks == 0)
    return;
  a0 = LOAD_128_64(&state[0]);
  a1 = LOAD_128_64(&state[2]);
  a2 = LOAD_128_64(&state[4]);
  a3 = LOAD_128_64(&state[6]);
  do
  {
    v128_64 a0_save, a1_save, a2_save, a3_save;
    v128_64 m0, m1, m2, m3, m4, m5, m6, m7;
    v128_64 t;
    unsigned i;
    const UInt64 *k_ptr;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)
    LOAD_SHUFFLE (m4, 4)
    LOAD_SHUFFLE (m5, 5)
    LOAD_SHUFFLE (m6, 6)
    LOAD_SHUFFLE (m7, 7)

    a0_save = a0;
    a1_save = a1;
    a2_save = a2;
    a3_save = a3;

    R16 ( K, NN )
    k_ptr = K + 16;
    for (i = 0; i < 4; i++)
    {
      R16 ( k_ptr, SM )
      k_ptr += 16;
    }

    a0 = vaddq_u64(a0, a0_save);
    a1 = vaddq_u64(a1, a1_save);
    a2 = vaddq_u64(a2, a2_save);
    a3 = vaddq_u64(a3, a3_save);

    data += 128;
  }
  while (--numBlocks);

  STORE_128_64(&state[0], a0);
  STORE_128_64(&state[2], a1);
  STORE_128_64(&state[4], a2);
  STORE_128_64(&state[6], a3);
}
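
// Unlike the x86 path, the NEON path keeps the state in its natural order:
// a0..a3 simply hold state[0..1], state[2..3], state[4..5], state[6..7].
// The round macros rotate register roles instead of permuting data, so no
// PREPARE_STATE-style reordering is needed before or after the loop.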

#endif // USE_HW_SHA

#endif // MY_CPU_ARM_OR_ARM64


#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>
// We can compile this file with another C compiler,
// or we can compile an asm version.
// So we can generate real code instead of this stub function.
// #include "Sha512.h"
// #if defined(_MSC_VER)
#pragma message("Sha512 HW-SW stub was used")
// #endif
void Z7_FASTCALL Sha512_UpdateBlocks   (UInt64 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
  Sha512_UpdateBlocks(state, data, numBlocks);
  /*
  UNUSED_VAR(state);
  UNUSED_VAR(data);
  UNUSED_VAR(numBlocks);
  exit(1);
  return;
  */
}
#endif


#undef K
#undef RND2
#undef MY_rev64_for_LE
#undef NN
#undef NNN
#undef LOAD_128_64
#undef LOAD_128_8
#undef STORE_128_64
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2
#undef SM
#undef R2
#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA512
#undef USE_VER_MIN
#undef Z7_USE_HW_SHA_STUB