path: root/C/Sha512Opt.c
Diffstat (limited to 'C/Sha512Opt.c')
-rw-r--r--  C/Sha512Opt.c  395
1 file changed, 395 insertions, 0 deletions
diff --git a/C/Sha512Opt.c b/C/Sha512Opt.c
new file mode 100644
index 0000000..3a13868
--- /dev/null
+++ b/C/Sha512Opt.c
@@ -0,0 +1,395 @@
/* Sha512Opt.c -- SHA-512 optimized code for SHA-512 hardware instructions
: Igor Pavlov : Public domain */

#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"

// #define Z7_USE_HW_SHA_STUB // for debug
#ifdef MY_CPU_X86_OR_AMD64
  #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) // fix it
    #define USE_HW_SHA
  #elif defined(Z7_LLVM_CLANG_VERSION)  && (Z7_LLVM_CLANG_VERSION  >= 170001) \
     || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \
     || defined(Z7_GCC_VERSION)         && (Z7_GCC_VERSION         >= 140000)
    #define USE_HW_SHA
    #if !defined(__INTEL_COMPILER)
    // icc defines __GNUC__, but icc doesn't support __attribute__((__target__))
    #if !defined(__SHA512__) || !defined(__AVX2__)
      #define ATTRIB_SHA512 __attribute__((__target__("sha512,avx2")))
    #endif
    #endif
  #elif defined(Z7_MSC_VER_ORIGINAL)
    #if (_MSC_VER >= 1940)
      #define USE_HW_SHA
    #else
      // #define Z7_USE_HW_SHA_STUB
    #endif
  #endif
// #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
  // #define Z7_USE_HW_SHA_STUB // for debug
#endif

#ifdef USE_HW_SHA

// #pragma message("Sha512 HW")

#include <immintrin.h>

#if defined (__clang__) && defined(_MSC_VER)
  #if !defined(__AVX__)
    #include <avxintrin.h>
  #endif
  #if !defined(__AVX2__)
    #include <avx2intrin.h>
  #endif
  #if !defined(__SHA512__)
    #include <sha512intrin.h>
  #endif
#endif

/*
SHA512 uses:
AVX:
  _mm256_loadu_si256 (vmovdqu)
  _mm256_storeu_si256
  _mm256_set_epi32 (used here for the shuffle mask constant)
AVX2:
  _mm256_add_epi64 : vpaddq
  _mm256_shuffle_epi8 : vpshufb
  _mm256_shuffle_epi32 : vpshufd
  _mm256_blend_epi32 : vpblendd
  _mm256_permute4x64_epi64 : vpermq : 3c
  _mm256_permute2x128_si256 : vperm2i128 : 3c
  _mm256_extracti128_si256 : vextracti128 : 3c
SHA512:
  _mm256_sha512*
*/

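// Data layout: each __m256i holds four 64-bit words, so one 128-byte
// SHA-512 block loads into four message registers (m0..m3) and the eight
// 64-bit state words fit in two registers (state0/state1). The ": 3c"
// marks above appear to note ~3-cycle latency for the cross-lane
// instructions.
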
// The K array must be aligned to at least 32 bytes.
// The compiler can see the align attribute and select:
//   vmovdqu - for code without the align attribute
//   vmovdqa - for code with the align attribute
extern
MY_ALIGN(64)
const UInt64 SHA512_K_ARRAY[80];
#define K SHA512_K_ARRAY
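// SHA512_K_ARRAY holds the 80 standard FIPS 180-4 round constants (the
// first 64 bits of the fractional parts of the cube roots of the first 80
// primes); the array itself is presumably defined in Sha512.c, mirroring
// SHA256_K_ARRAY in Sha256.c.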

#define ADD_EPI64(dest, src)    dest = _mm256_add_epi64(dest, src);
#define SHA512_MSG1(dest, src)  dest = _mm256_sha512msg1_epi64(dest, _mm256_extracti128_si256(src, 0));
#define SHA512_MSG2(dest, src)  dest = _mm256_sha512msg2_epi64(dest, src);

#define LOAD_SHUFFLE(m, k) \
    m = _mm256_loadu_si256((const __m256i *)(const void *)(data + (k) * 32)); \
    m = _mm256_shuffle_epi8(m, mask); \

#define NNN(m0, m1, m2, m3)

#define SM1(m1, m2, m3, m0) \
    SHA512_MSG1(m0, m1); \

#define SM2(m2, m3, m0, m1) \
    ADD_EPI64(m0, _mm256_permute4x64_epi64(_mm256_blend_epi32(m2, m3, 3), 0x39)); \
    SHA512_MSG2(m0, m3); \

#define RND2(t0, t1, lane) \
    t0 = _mm256_sha512rnds2_epi64(t0, t1, _mm256_extracti128_si256(msg, lane));
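
// How these map to the ISA: sha512msg1 adds the sigma0 terms and
// sha512msg2 the sigma1 terms of the schedule recurrence, while the
// blend/permute in SM2 supplies the W[t-7] words, so an SM1+SM2 pair
// advances the schedule by the four words of one register. sha512rnds2
// performs two rounds from the two precomputed (W + K) qwords in the
// selected 128-bit lane of msg; two RND2 calls consume one msg register.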

#define R4(k, m0, m1, m2, m3, OP0, OP1) \
    msg = _mm256_add_epi64(m0, *(const __m256i *) (const void *) &K[(k) * 4]); \
    RND2(state0, state1, 0);  OP0(m0, m1, m2, m3) \
    RND2(state1, state0, 1);  OP1(m0, m1, m2, m3) \


#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \

#define PREPARE_STATE \
    state0 = _mm256_shuffle_epi32(state0, 0x4e); /* cdab */ \
    state1 = _mm256_shuffle_epi32(state1, 0x4e); /* ghef */ \
    tmp = state0; \
    state0 = _mm256_permute2x128_si256(state0, state1, 0x13); /* cdgh */ \
    state1 = _mm256_permute2x128_si256(tmp,    state1, 2);    /* abef */ \
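
// Why PREPARE_STATE exists: in memory the hash state is the linear
// a,b,c,d,e,f,g,h order of the SHA-512 spec, but sha512rnds2 expects the
// words split across two registers as {c,d,g,h} and {a,b,e,f}. The same
// permutation applied again after the block loop restores the linear
// order before the final stores.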

void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA512
ATTRIB_SHA512
#endif
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
  const __m256i mask = _mm256_set_epi32(
      0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607,
      0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607);
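  // The message words are big-endian; this mask makes vpshufb byte-swap
  // each 64-bit lane at load time (the same pattern repeated for both
  // 128-bit halves of the register).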
  __m256i tmp, state0, state1;

  if (numBlocks == 0)
    return;

  state0 = _mm256_loadu_si256((const __m256i *) (const void *) &state[0]);
  state1 = _mm256_loadu_si256((const __m256i *) (const void *) &state[4]);

  PREPARE_STATE

  do
  {
    __m256i state0_save, state1_save;
    __m256i m0, m1, m2, m3;
    __m256i msg;
    // #define msg tmp

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

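    // 80 rounds as 5 x R16: the opening rounds read the raw block words,
    // so schedule updates start as NNN (no-op) until enough words exist
    // for SM1/SM2, and the final rounds taper off the same way because
    // schedule words past round 79 are never needed.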
    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 3, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 4, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )
    ADD_EPI64(state0, state0_save)
    ADD_EPI64(state1, state1_save)

    data += 128;
  }
  while (--numBlocks);

  PREPARE_STATE

  _mm256_storeu_si256((__m256i *) (void *) &state[0], state0);
  _mm256_storeu_si256((__m256i *) (void *) &state[4], state1);
}
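
// A minimal runtime-dispatch sketch (illustrative only, not part of the
// original file): Sha512_UpdateBlocks_HW must only be called when the CPU
// supports the SHA-512 extension. The helper name CPU_IsSupported_SHA512
// and the function-pointer pattern below are assumptions modeled on how
// this codebase selects other hashers (the real logic lives in Sha512.c).
#if 0
typedef void (Z7_FASTCALL *SHA512_FUNC_UPDATE_BLOCKS)(UInt64 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks);
// start with the portable implementation:
static SHA512_FUNC_UPDATE_BLOCKS g_Sha512_UpdateBlocks = Sha512_UpdateBlocks;
static void Sha512_SelectImpl(void)
{
  // switch to the hardware path only if CPUID/hwcaps report SHA-512 support
  if (CPU_IsSupported_SHA512())
    g_Sha512_UpdateBlocks = Sha512_UpdateBlocks_HW;
}
#endif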

#endif // USE_HW_SHA

// gcc 8.5 also supports sha512, but we also need sha512 support in the assembler that gcc invokes
#elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE)

  #if defined(__ARM_FEATURE_SHA512)
    #define USE_HW_SHA
  #else
    #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \
        || defined(__GNUC__) && (__GNUC__ >= 9) \
        ) \
      || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it
      #define USE_HW_SHA
    #endif
  #endif

#ifdef USE_HW_SHA

// #pragma message("=== Sha512 HW === ")


#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA512)
// #pragma message("=== we define SHA3 ATTRIB_SHA512 === ")
#if defined(__clang__)
  #define ATTRIB_SHA512 __attribute__((__target__("sha3"))) // "armv8.2-a,sha3"
#else
  #define ATTRIB_SHA512 __attribute__((__target__("arch=armv8.2-a+sha3")))
#endif
#endif
#endif


#if defined(Z7_MSC_VER_ORIGINAL)
#include <arm64_neon.h>
#else

#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA512)
// #pragma message("=== we set __ARM_FEATURE_SHA512 1 === ")
  Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
  #define Z7_ARM_FEATURE_SHA512_WAS_SET 1
  #define __ARM_FEATURE_SHA512 1
  Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang

#include <arm_neon.h>

#if defined(Z7_ARM_FEATURE_SHA512_WAS_SET) && \
    defined(__ARM_FEATURE_SHA512)
  Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
  #undef __ARM_FEATURE_SHA512
  #undef Z7_ARM_FEATURE_SHA512_WAS_SET
  Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
// #pragma message("=== we undefine __ARM_FEATURE_SHA512 === ")
#endif

#endif // Z7_MSC_VER_ORIGINAL
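
// Why the force-define above: before clang 16, arm_neon.h exposed the
// vsha512* intrinsics only when __ARM_FEATURE_SHA512 was already defined,
// even though code generation is gated separately by the function-level
// target attribute. So the macro is defined just for the #include and
// then undone, leaving the predefined-macro state untouched.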

typedef uint64x2_t v128_64;
// typedef __n128 v128_64; // MSVC

#ifdef MY_CPU_BE
  #define MY_rev64_for_LE(x) x
#else
  #define MY_rev64_for_LE(x) vrev64q_u8(x)
#endif
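
// SHA-512 message words are big-endian in the input block, so on
// little-endian targets each 64-bit lane is byte-reversed with vrev64q_u8
// at load time; big-endian targets can use the loaded value as-is.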

#define LOAD_128_64(_p)      vld1q_u64(_p)
#define LOAD_128_8(_p)       vld1q_u8 (_p)
#define STORE_128_64(_p, _v) vst1q_u64(_p, _v)

#define LOAD_SHUFFLE(m, k) \
    m = vreinterpretq_u64_u8( \
        MY_rev64_for_LE( \
        LOAD_128_8(data + (k) * 16))); \

// The K array must be aligned to at least 16 bytes.
extern
MY_ALIGN(64)
const UInt64 SHA512_K_ARRAY[80];
#define K SHA512_K_ARRAY

#define NN(m0, m1, m4, m5, m7)
#define SM(m0, m1, m4, m5, m7) \
    m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1));
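
// SM advances the schedule by the two words in m0: vsha512su0q_u64 folds
// in the sigma0 terms, vextq_u64(m4, m5, 1) forms the W[t-7] pair that
// spans two registers, and vsha512su1q_u64 completes
// W[t+16] = sigma1(W[t+14]) + W[t+9] + sigma0(W[t+1]) + W[t].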

#define R2(k, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP) \
    OP(m0, m1, m4, m5, m7) \
    t = vaddq_u64(m0, vld1q_u64(k)); \
    t = vaddq_u64(vextq_u64(t, t, 1), a3); \
    t = vsha512hq_u64(t, vextq_u64(a2, a3, 1), vextq_u64(a1, a2, 1)); \
    a3 = vsha512h2q_u64(t, a1, a0); \
    a1 = vaddq_u64(a1, t); \

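// R2 performs two rounds per call: t accumulates the (W + K) pair for the
// rounds, then vsha512hq_u64 and vsha512h2q_u64 split the round function
// between them. The a0..a3 arguments rotate across the four R2 calls in
// R8, so after 8 rounds every register is back in its starting role.
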
#define R8(k, m0,m1,m2,m3,m4,m5,m6,m7, OP) \
    R2 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP ) \
    R2 ( (k)+1*2, m1,m2,m3,m4,m5,m6,m7,m0, a3,a0,a1,a2, OP ) \
    R2 ( (k)+2*2, m2,m3,m4,m5,m6,m7,m0,m1, a2,a3,a0,a1, OP ) \
    R2 ( (k)+3*2, m3,m4,m5,m6,m7,m0,m1,m2, a1,a2,a3,a0, OP ) \

#define R16(k, OP) \
    R8 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, OP ) \
    R8 ( (k)+4*2, m4,m5,m6,m7,m0,m1,m2,m3, OP ) \


void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA512
ATTRIB_SHA512
#endif
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
  v128_64 a0, a1, a2, a3;

  if (numBlocks == 0)
    return;
  a0 = LOAD_128_64(&state[0]);
  a1 = LOAD_128_64(&state[2]);
  a2 = LOAD_128_64(&state[4]);
  a3 = LOAD_128_64(&state[6]);
  do
  {
    v128_64 a0_save, a1_save, a2_save, a3_save;
    v128_64 m0, m1, m2, m3, m4, m5, m6, m7;
    v128_64 t;
    unsigned i;
    const UInt64 *k_ptr;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)
    LOAD_SHUFFLE (m4, 4)
    LOAD_SHUFFLE (m5, 5)
    LOAD_SHUFFLE (m6, 6)
    LOAD_SHUFFLE (m7, 7)

    a0_save = a0;
    a1_save = a1;
    a2_save = a2;
    a3_save = a3;

    R16 ( K, NN )
    k_ptr = K + 16;
    for (i = 0; i < 4; i++)
    {
      R16 ( k_ptr, SM )
      k_ptr += 16;
    }
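    // 80 rounds in total: the first R16 consumes the 16 raw block words
    // (NN = no schedule update), and each of the 4 loop iterations
    // computes its next 16 schedule words just in time (SM) from the 16
    // words still held in m0..m7.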

    a0 = vaddq_u64(a0, a0_save);
    a1 = vaddq_u64(a1, a1_save);
    a2 = vaddq_u64(a2, a2_save);
    a3 = vaddq_u64(a3, a3_save);

    data += 128;
  }
  while (--numBlocks);

  STORE_128_64(&state[0], a0);
  STORE_128_64(&state[2], a1);
  STORE_128_64(&state[4], a2);
  STORE_128_64(&state[6], a3);
}
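
// Runtime selection note: this routine must only be called when the CPU
// actually implements the SHA-512 instructions; on arm64 Linux that is
// reported via getauxval(AT_HWCAP) & HWCAP_SHA512, and on Apple platforms
// via the "hw.optional.armv8_2_sha512" sysctl, which is presumably what
// the CpuArch detection code checks.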

#endif // USE_HW_SHA

#endif // MY_CPU_ARM_OR_ARM64


#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>
// This file can be compiled with another C compiler, or the asm version
// can be compiled instead, so that real code is generated in place of
// this stub function.
// #include "Sha512.h"
// #if defined(_MSC_VER)
#pragma message("Sha512 HW-SW stub was used")
// #endif
void Z7_FASTCALL Sha512_UpdateBlocks   (UInt64 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
  Sha512_UpdateBlocks(state, data, numBlocks);
  /*
  UNUSED_VAR(state);
  UNUSED_VAR(data);
  UNUSED_VAR(numBlocks);
  exit(1);
  return;
  */
}
#endif


#undef K
#undef ADD_EPI64
#undef SHA512_MSG1
#undef SHA512_MSG2
#undef RND2
#undef MY_rev64_for_LE
#undef NN
#undef NNN
#undef LOAD_128_64
#undef LOAD_128_8
#undef STORE_128_64
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2
#undef SM
#undef R2
#undef R4
#undef R8
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA512
#undef Z7_USE_HW_SHA_STUB