author     Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2024-11-29 00:00:00 +0000
committer  Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2024-11-30 15:27:15 +0500
commit     e5431fa6f5505e385c6f9367260717e9c47dc2ee (patch)
tree       4cd2c2c3b225b48c8e7053432c41d7b6b6a3d5f8 /C
parent     e008ce3976c087bfd21344af8f00a23cf69d4174 (diff)
Diffstat (limited to 'C')
-rw-r--r--  C/7zDec.c        5
-rw-r--r--  C/7zVersion.h    6
-rw-r--r--  C/AesOpt.c     233
-rw-r--r--  C/CpuArch.c    109
-rw-r--r--  C/CpuArch.h     33
-rw-r--r--  C/LzmaEnc.c     16
-rw-r--r--  C/Md5.c        206
-rw-r--r--  C/Md5.h         34
-rw-r--r--  C/Sha1.c       125
-rw-r--r--  C/Sha1.h        18
-rw-r--r--  C/Sha1Opt.c    146
-rw-r--r--  C/Sha256.c     162
-rw-r--r--  C/Sha256.h      18
-rw-r--r--  C/Sha256Opt.c  172
-rw-r--r--  C/Sha3.c       359
-rw-r--r--  C/Sha3.h        36
-rw-r--r--  C/Sha512.c     618
-rw-r--r--  C/Sha512.h      86
-rw-r--r--  C/Sha512Opt.c  395
19 files changed, 2273 insertions, 504 deletions
diff --git a/C/7zDec.c b/C/7zDec.c
index c9b4064..520cbfd 100644
--- a/C/7zDec.c
+++ b/C/7zDec.c
@@ -1,5 +1,5 @@
1/* 7zDec.c -- Decoding from 7z folder 1/* 7zDec.c -- Decoding from 7z folder
22024-03-01 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -312,8 +312,9 @@ static BoolInt IS_MAIN_METHOD(UInt32 m)
312 case k_PPMD: 312 case k_PPMD:
313 #endif 313 #endif
314 return True; 314 return True;
315 default:
316 return False;
315 } 317 }
316 return False;
317} 318}
318 319
319static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c) 320static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c)
diff --git a/C/7zVersion.h b/C/7zVersion.h
index 1ddef80..e82ba0b 100644
--- a/C/7zVersion.h
+++ b/C/7zVersion.h
@@ -1,7 +1,7 @@
1#define MY_VER_MAJOR 24 1#define MY_VER_MAJOR 24
2#define MY_VER_MINOR 8 2#define MY_VER_MINOR 9
3#define MY_VER_BUILD 0 3#define MY_VER_BUILD 0
4#define MY_VERSION_NUMBERS "24.08" 4#define MY_VERSION_NUMBERS "24.09"
5#define MY_VERSION MY_VERSION_NUMBERS 5#define MY_VERSION MY_VERSION_NUMBERS
6 6
7#ifdef MY_CPU_NAME 7#ifdef MY_CPU_NAME
@@ -10,7 +10,7 @@
10 #define MY_VERSION_CPU MY_VERSION 10 #define MY_VERSION_CPU MY_VERSION
11#endif 11#endif
12 12
13#define MY_DATE "2024-08-11" 13#define MY_DATE "2024-11-29"
14#undef MY_COPYRIGHT 14#undef MY_COPYRIGHT
15#undef MY_VERSION_COPYRIGHT_DATE 15#undef MY_VERSION_COPYRIGHT_DATE
16#define MY_AUTHOR_NAME "Igor Pavlov" 16#define MY_AUTHOR_NAME "Igor Pavlov"
diff --git a/C/AesOpt.c b/C/AesOpt.c
index 58769ea..b281807 100644
--- a/C/AesOpt.c
+++ b/C/AesOpt.c
@@ -1,5 +1,5 @@
1/* AesOpt.c -- AES optimized code for x86 AES hardware instructions 1/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
22024-03-01 : Igor Pavlov : Public domain */ 2Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -80,19 +80,39 @@ AES_FUNC_START (name)
80 80
81#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) 81#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src)
82 82
83#if 1
84// use aligned SSE load/store for data.
85// Our Aes functions require data to be 16-byte aligned,
86// so we can use this branch of code,
87// and the compiler can emit fused load-op SSE instructions:
88// xorps xmm0, XMMWORD PTR [rdx]
89#define LOAD_128(pp) (*(__m128i *)(void *)(pp))
90#define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v
91// alternative: the same aligned SSE load/store via intrinsics:
92// #define LOAD_128(pp) _mm_load_si128(pp)
93// #define STORE_128(pp, _v) _mm_store_si128(pp, _v)
94#else
95// use unaligned load/store for data: movdqu XMMWORD PTR [rdx]
96#define LOAD_128(pp) _mm_loadu_si128(pp)
97#define STORE_128(pp, _v) _mm_storeu_si128(pp, _v)
98#endif
99
83AES_FUNC_START2 (AesCbc_Encode_HW) 100AES_FUNC_START2 (AesCbc_Encode_HW)
84{ 101{
102 if (numBlocks == 0)
103 return;
104 {
85 __m128i *p = (__m128i *)(void *)ivAes; 105 __m128i *p = (__m128i *)(void *)ivAes;
86 __m128i *data = (__m128i *)(void *)data8; 106 __m128i *data = (__m128i *)(void *)data8;
87 __m128i m = *p; 107 __m128i m = *p;
88 const __m128i k0 = p[2]; 108 const __m128i k0 = p[2];
89 const __m128i k1 = p[3]; 109 const __m128i k1 = p[3];
90 const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; 110 const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
91 for (; numBlocks != 0; numBlocks--, data++) 111 do
92 { 112 {
93 UInt32 r = numRounds2; 113 UInt32 r = numRounds2;
94 const __m128i *w = p + 4; 114 const __m128i *w = p + 4;
95 __m128i temp = *data; 115 __m128i temp = LOAD_128(data);
96 MM_XOR (temp, k0) 116 MM_XOR (temp, k0)
97 MM_XOR (m, temp) 117 MM_XOR (m, temp)
98 MM_OP_m (_mm_aesenc_si128, k1) 118 MM_OP_m (_mm_aesenc_si128, k1)
@@ -104,9 +124,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
104 } 124 }
105 while (--r); 125 while (--r);
106 MM_OP_m (_mm_aesenclast_si128, w[0]) 126 MM_OP_m (_mm_aesenclast_si128, w[0])
107 *data = m; 127 STORE_128(data, m);
128 data++;
108 } 129 }
130 while (--numBlocks);
109 *p = m; 131 *p = m;
132 }
110} 133}
111 134
112 135
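
The LOAD_128/STORE_128 switch above is the point of this hunk: dereferencing an aligned __m128i pointer lets the compiler fold the load into the consuming ALU instruction, while the unaligned intrinsic forces a separate movdqu. A minimal sketch of the difference (illustrative helpers, not from the commit; compile with SSE2):

#include <emmintrin.h>

// aligned deref: the load can fuse into the xor ("xorps xmm0, [rdx]")
static __m128i xor_block_aligned(const __m128i *p, __m128i k)
{
  return _mm_xor_si128(*p, k);
}

// unaligned intrinsic: emitted as a separate movdqu load
static __m128i xor_block_unaligned(const void *p, __m128i k)
{
  return _mm_xor_si128(_mm_loadu_si128((const __m128i *)p), k);
}
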
@@ -139,12 +162,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
139 162
140#define WOP(op) op (m0, 0) WOP_M1(op) 163#define WOP(op) op (m0, 0) WOP_M1(op)
141 164
142
143#define DECLARE_VAR(reg, ii) __m128i reg; 165#define DECLARE_VAR(reg, ii) __m128i reg;
144#define LOAD_data( reg, ii) reg = data[ii]; 166#define LOAD_data_ii(ii) LOAD_128(data + (ii))
145#define STORE_data( reg, ii) data[ii] = reg; 167#define LOAD_data( reg, ii) reg = LOAD_data_ii(ii);
168#define STORE_data( reg, ii) STORE_128(data + (ii), reg);
146#if (NUM_WAYS > 1) 169#if (NUM_WAYS > 1)
147#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) 170#define XOR_data_M1(reg, ii) MM_XOR (reg, LOAD_128(data + (ii- 1)))
148#endif 171#endif
149 172
150#define MM_OP_key(op, reg) MM_OP(op, reg, key); 173#define MM_OP_key(op, reg) MM_OP(op, reg, key);
@@ -156,25 +179,22 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
156#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) 179#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg)
157 180
158#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; 181#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr;
159#define CTR_END( reg, ii) MM_XOR (data[ii], reg) 182#define CTR_END( reg, ii) STORE_128(data + (ii), _mm_xor_si128(reg, \
160 183 LOAD_128 (data + (ii))));
161#define WOP_KEY(op, n) { \ 184#define WOP_KEY(op, n) { \
162 const __m128i key = w[n]; \ 185 const __m128i key = w[n]; \
163 WOP(op); } 186 WOP(op) }
164
165 187
166#define WIDE_LOOP_START \ 188#define WIDE_LOOP_START \
167 dataEnd = data + numBlocks; \ 189 dataEnd = data + numBlocks; \
168 if (numBlocks >= NUM_WAYS) \ 190 if (numBlocks >= NUM_WAYS) \
169 { dataEnd -= NUM_WAYS; do { \ 191 { dataEnd -= NUM_WAYS; do { \
170 192
171
172#define WIDE_LOOP_END \ 193#define WIDE_LOOP_END \
173 data += NUM_WAYS; \ 194 data += NUM_WAYS; \
174 } while (data <= dataEnd); \ 195 } while (data <= dataEnd); \
175 dataEnd += NUM_WAYS; } \ 196 dataEnd += NUM_WAYS; } \
176 197
177
178#define SINGLE_LOOP \ 198#define SINGLE_LOOP \
179 for (; data < dataEnd; data++) 199 for (; data < dataEnd; data++)
180 200
@@ -184,54 +204,73 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
184 204
185#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) 205#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src)
186#define AVX_DECLARE_VAR(reg, ii) __m256i reg; 206#define AVX_DECLARE_VAR(reg, ii) __m256i reg;
187#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; 207
188#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; 208#if 1
209// use unaligned AVX load/store for data.
210// Our Aes functions only require data to be 16-byte aligned,
211// but here we read 32 bytes at a time,
212// so we use intrinsics for unaligned AVX load/store.
213// notes for _mm256_storeu_si256:
214// msvc2022: uses vmovdqu and keeps the order of the instruction sequence.
215// newer gcc (11) uses vmovdqu
216// older gcc (9) could use a pair of instructions:
217// vmovups %xmm7, -224(%rax)
218// vextracti128 $0x1, %ymm7, -208(%rax)
219#define AVX_LOAD(p) _mm256_loadu_si256((const __m256i *)(const void *)(p))
220#define AVX_STORE(p, _v) _mm256_storeu_si256((__m256i *)(void *)(p), _v);
221#else
222// use aligned AVX load/store for data.
223// for debug: we can use this branch if we are sure that data is 32-byte aligned.
224// msvc2022 still uses vmovdqu
225// gcc uses vmovdqa (which requires 32-byte alignment)
226#define AVX_LOAD(p) (*(const __m256i *)(const void *)(p))
227#define AVX_STORE(p, _v) (*(__m256i *)(void *)(p)) = _v;
228#endif
229
230#define AVX_LOAD_data( reg, ii) reg = AVX_LOAD((const __m256i *)(const void *)data + (ii));
231#define AVX_STORE_data( reg, ii) AVX_STORE((__m256i *)(void *)data + (ii), reg)
189/* 232/*
190AVX_XOR_data_M1() needs unaligned memory load 233AVX_XOR_data_M1() needs an unaligned memory load, even if (data)
191if (we don't use _mm256_loadu_si256() here) 234is 256-bit aligned, because we read a 32-byte chunk that
192{ 235crosses the (data) position: from (data - 16 bytes) to (data + 16 bytes).
193 Most compilers with enabled optimizations generate fused AVX (LOAD + OP)
194 instruction that can load unaligned data.
195 But GCC and CLANG without -O2 or -O1 optimizations can generate separated
196 LOAD-ALIGNED (vmovdqa) instruction that will fail on execution.
197}
198Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here.
199v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler.
200*/ 236*/
201#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii]))) 237#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii)))
202// for debug only: the following code will fail on execution, if compiled by some compilers:
203// #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))
204 238
205#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) 239#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
206#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) 240#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
207#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) 241#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
208#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) 242#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
209#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) 243#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)
210#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); 244#define AVX_CTR_START(reg, ii) \
211#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) 245 MM_OP (_mm256_add_epi64, ctr2, two) \
246 reg = _mm256_xor_si256(ctr2, key);
247
248#define AVX_CTR_END(reg, ii) \
249 AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \
250 AVX_LOAD ((__m256i *)(void *)data + (ii))));
251
212#define AVX_WOP_KEY(op, n) { \ 252#define AVX_WOP_KEY(op, n) { \
213 const __m256i key = w[n]; \ 253 const __m256i key = w[n]; \
214 WOP(op); } 254 WOP(op) }
215 255
216#define NUM_AES_KEYS_MAX 15 256#define NUM_AES_KEYS_MAX 15
217 257
218#define WIDE_LOOP_START_AVX(OP) \ 258#define WIDE_LOOP_START_AVX(OP) \
219 dataEnd = data + numBlocks; \ 259 dataEnd = data + numBlocks; \
220 if (numBlocks >= NUM_WAYS * 2) \ 260 if (numBlocks >= NUM_WAYS * 2) \
221 { __m256i keys[NUM_AES_KEYS_MAX]; \ 261 { __m256i keys[NUM_AES_KEYS_MAX]; \
222 UInt32 ii; \ 262 OP \
223 OP \ 263 { UInt32 ii; for (ii = 0; ii < numRounds; ii++) \
224 for (ii = 0; ii < numRounds; ii++) \ 264 keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \
225 keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \ 265 dataEnd -= NUM_WAYS * 2; \
226 dataEnd -= NUM_WAYS * 2; do { \ 266 do { \
227
228 267
229#define WIDE_LOOP_END_AVX(OP) \ 268#define WIDE_LOOP_END_AVX(OP) \
230 data += NUM_WAYS * 2; \ 269 data += NUM_WAYS * 2; \
231 } while (data <= dataEnd); \ 270 } while (data <= dataEnd); \
232 dataEnd += NUM_WAYS * 2; \ 271 dataEnd += NUM_WAYS * 2; \
233 OP \ 272 OP \
234 _mm256_zeroupper(); \ 273 _mm256_zeroupper(); \
235 } \ 274 } \
236 275
237/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, 276/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
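
The AVX_XOR_data_M1() note above covers the one access that can never be aligned: the buffer is only guaranteed 16-byte alignment, and the macro reads a 32-byte chunk starting one 128-bit block back. A sketch of that access (illustrative helper, assumes AVX):

#include <immintrin.h>

// 32-byte read from (data - 16 bytes): 16-byte aligned at best,
// so the unaligned load (vmovdqu) is mandatory
static __m256i load_prev_pair(const __m128i *data)
{
  return _mm256_loadu_si256((const __m256i *)(const void *)(data - 1));
}
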
@@ -246,21 +285,20 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
246 __m128i *p = (__m128i *)(void *)ivAes; 285 __m128i *p = (__m128i *)(void *)ivAes;
247 __m128i *data = (__m128i *)(void *)data8; 286 __m128i *data = (__m128i *)(void *)data8;
248 __m128i iv = *p; 287 __m128i iv = *p;
249 const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1; 288 const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1;
250 const __m128i *dataEnd; 289 const __m128i *dataEnd;
251 p += 2; 290 p += 2;
252 291
253 WIDE_LOOP_START 292 WIDE_LOOP_START
254 { 293 {
255 const __m128i *w = wStart; 294 const __m128i *w = wStart;
256
257 WOP (DECLARE_VAR) 295 WOP (DECLARE_VAR)
258 WOP (LOAD_data) 296 WOP (LOAD_data)
259 WOP_KEY (AES_XOR, 1) 297 WOP_KEY (AES_XOR, 1)
260
261 do 298 do
262 { 299 {
263 WOP_KEY (AES_DEC, 0) 300 WOP_KEY (AES_DEC, 0)
301
264 w--; 302 w--;
265 } 303 }
266 while (w != p); 304 while (w != p);
@@ -268,7 +306,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
268 306
269 MM_XOR (m0, iv) 307 MM_XOR (m0, iv)
270 WOP_M1 (XOR_data_M1) 308 WOP_M1 (XOR_data_M1)
271 iv = data[NUM_WAYS - 1]; 309 LOAD_data(iv, NUM_WAYS - 1)
272 WOP (STORE_data) 310 WOP (STORE_data)
273 } 311 }
274 WIDE_LOOP_END 312 WIDE_LOOP_END
@@ -276,7 +314,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
276 SINGLE_LOOP 314 SINGLE_LOOP
277 { 315 {
278 const __m128i *w = wStart - 1; 316 const __m128i *w = wStart - 1;
279 __m128i m = _mm_xor_si128 (w[2], *data); 317 __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
318
280 do 319 do
281 { 320 {
282 MM_OP_m (_mm_aesdec_si128, w[1]) 321 MM_OP_m (_mm_aesdec_si128, w[1])
@@ -286,10 +325,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
286 while (w != p); 325 while (w != p);
287 MM_OP_m (_mm_aesdec_si128, w[1]) 326 MM_OP_m (_mm_aesdec_si128, w[1])
288 MM_OP_m (_mm_aesdeclast_si128, w[0]) 327 MM_OP_m (_mm_aesdeclast_si128, w[0])
289
290 MM_XOR (m, iv) 328 MM_XOR (m, iv)
291 iv = *data; 329 LOAD_data(iv, 0)
292 *data = m; 330 STORE_data(m, 0)
293 } 331 }
294 332
295 p[-2] = iv; 333 p[-2] = iv;
@@ -301,9 +339,9 @@ AES_FUNC_START2 (AesCtr_Code_HW)
301 __m128i *p = (__m128i *)(void *)ivAes; 339 __m128i *p = (__m128i *)(void *)ivAes;
302 __m128i *data = (__m128i *)(void *)data8; 340 __m128i *data = (__m128i *)(void *)data8;
303 __m128i ctr = *p; 341 __m128i ctr = *p;
304 UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; 342 const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
305 const __m128i *dataEnd; 343 const __m128i *dataEnd;
306 __m128i one = _mm_cvtsi32_si128(1); 344 const __m128i one = _mm_cvtsi32_si128(1);
307 345
308 p += 2; 346 p += 2;
309 347
@@ -322,7 +360,6 @@ AES_FUNC_START2 (AesCtr_Code_HW)
322 } 360 }
323 while (--r); 361 while (--r);
324 WOP_KEY (AES_ENC_LAST, 0) 362 WOP_KEY (AES_ENC_LAST, 0)
325
326 WOP (CTR_END) 363 WOP (CTR_END)
327 } 364 }
328 WIDE_LOOP_END 365 WIDE_LOOP_END
@@ -344,7 +381,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
344 while (--numRounds2); 381 while (--numRounds2);
345 MM_OP_m (_mm_aesenc_si128, w[0]) 382 MM_OP_m (_mm_aesenc_si128, w[0])
346 MM_OP_m (_mm_aesenclast_si128, w[1]) 383 MM_OP_m (_mm_aesenclast_si128, w[1])
347 MM_XOR (*data, m) 384 CTR_END (m, 0)
348 } 385 }
349 386
350 p[-2] = ctr; 387 p[-2] = ctr;
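
For orientation, CTR_START/CTR_END above implement the standard CTR construction: increment the counter, encrypt it to get a keystream block, XOR the keystream into the data in place. One block spelled out (hypothetical helper, assumes an expanded key schedule w[0..nr]; the real code keeps everything inside the macro-generated loops):

#include <wmmintrin.h>  // AES-NI intrinsics

static __m128i aes_ctr_block(__m128i *ctr, const __m128i *w, unsigned nr)
{
  __m128i m;
  unsigned i;
  *ctr = _mm_add_epi64(*ctr, _mm_cvtsi32_si128(1)); // increment counter
  m = _mm_xor_si128(*ctr, w[0]);                    // initial whitening
  for (i = 1; i < nr; i++)
    m = _mm_aesenc_si128(m, w[i]);                  // middle rounds
  return _mm_aesenclast_si128(m, w[nr]);            // keystream block
}
// caller: STORE_128(data, _mm_xor_si128(LOAD_128(data), keystream));
// the same code both encrypts and decrypts, since CTR is a pure XOR stream.
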
@@ -421,7 +458,7 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
421 __m128i *data = (__m128i *)(void *)data8; 458 __m128i *data = (__m128i *)(void *)data8;
422 __m128i iv = *p; 459 __m128i iv = *p;
423 const __m128i *dataEnd; 460 const __m128i *dataEnd;
424 UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; 461 const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
425 p += 2; 462 p += 2;
426 463
427 WIDE_LOOP_START_AVX(;) 464 WIDE_LOOP_START_AVX(;)
@@ -440,17 +477,17 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
440 while (w != keys); 477 while (w != keys);
441 AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) 478 AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)
442 479
443 AVX_XOR (m0, _mm256_setr_m128i(iv, data[0])) 480 AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0)))
444 WOP_M1 (AVX_XOR_data_M1) 481 WOP_M1 (AVX_XOR_data_M1)
445 iv = data[NUM_WAYS * 2 - 1]; 482 LOAD_data (iv, NUM_WAYS * 2 - 1)
446 WOP (AVX_STORE_data) 483 WOP (AVX_STORE_data)
447 } 484 }
448 WIDE_LOOP_END_AVX(;) 485 WIDE_LOOP_END_AVX(;)
449 486
450 SINGLE_LOOP 487 SINGLE_LOOP
451 { 488 {
452 const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3; 489 const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2;
453 __m128i m = _mm_xor_si128 (w[2], *data); 490 __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
454 do 491 do
455 { 492 {
456 MM_OP_m (_mm_aesdec_si128, w[1]) 493 MM_OP_m (_mm_aesdec_si128, w[1])
@@ -462,8 +499,8 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
462 MM_OP_m (_mm_aesdeclast_si128, w[0]) 499 MM_OP_m (_mm_aesdeclast_si128, w[0])
463 500
464 MM_XOR (m, iv) 501 MM_XOR (m, iv)
465 iv = *data; 502 LOAD_data(iv, 0)
466 *data = m; 503 STORE_data(m, 0)
467 } 504 }
468 505
469 p[-2] = iv; 506 p[-2] = iv;
@@ -493,9 +530,9 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
493 __m128i *p = (__m128i *)(void *)ivAes; 530 __m128i *p = (__m128i *)(void *)ivAes;
494 __m128i *data = (__m128i *)(void *)data8; 531 __m128i *data = (__m128i *)(void *)data8;
495 __m128i ctr = *p; 532 __m128i ctr = *p;
496 UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; 533 const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
497 const __m128i *dataEnd; 534 const __m128i *dataEnd;
498 __m128i one = _mm_cvtsi32_si128(1); 535 const __m128i one = _mm_cvtsi32_si128(1);
499 __m256i ctr2, two; 536 __m256i ctr2, two;
500 p += 2; 537 p += 2;
501 538
@@ -536,7 +573,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
536 while (--numRounds2); 573 while (--numRounds2);
537 MM_OP_m (_mm_aesenc_si128, w[0]) 574 MM_OP_m (_mm_aesenc_si128, w[0])
538 MM_OP_m (_mm_aesenclast_si128, w[1]) 575 MM_OP_m (_mm_aesenclast_si128, w[1])
539 MM_XOR (*data, m) 576 CTR_END (m, 0)
540 } 577 }
541 578
542 p[-2] = ctr; 579 p[-2] = ctr;
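
The _256 variants rely on VAES: each ymm register carries two independent AES blocks, and WIDE_LOOP_START_AVX broadcasts every 128-bit round key into both lanes once, up front. A minimal sketch of one double-block round (illustrative; requires VAES and AVX2):

#include <immintrin.h>

// one AES round applied to two blocks at once; the round key is
// broadcast so both 128-bit lanes use the same key material
static __m256i vaes_round_pair(__m256i two_blocks, __m128i round_key)
{
  const __m256i k2 = _mm256_broadcastsi128_si256(round_key);
  return _mm256_aesenc_epi128(two_blocks, k2);
}
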
@@ -731,9 +768,14 @@ AES_FUNC_START (name)
731 768
732AES_FUNC_START2 (AesCbc_Encode_HW) 769AES_FUNC_START2 (AesCbc_Encode_HW)
733{ 770{
734 v128 * const p = (v128*)(void*)ivAes; 771 if (numBlocks == 0)
735 v128 *data = (v128*)(void*)data8; 772 return;
773 {
774 v128 * const p = (v128 *)(void *)ivAes;
775 v128 *data = (v128 *)(void *)data8;
736 v128 m = *p; 776 v128 m = *p;
777 const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
778 const v128 *w = p + (size_t)numRounds2 * 2;
737 const v128 k0 = p[2]; 779 const v128 k0 = p[2];
738 const v128 k1 = p[3]; 780 const v128 k1 = p[3];
739 const v128 k2 = p[4]; 781 const v128 k2 = p[4];
@@ -744,11 +786,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
744 const v128 k7 = p[9]; 786 const v128 k7 = p[9];
745 const v128 k8 = p[10]; 787 const v128 k8 = p[10];
746 const v128 k9 = p[11]; 788 const v128 k9 = p[11];
747 const UInt32 numRounds2 = *(const UInt32 *)(p + 1); 789 const v128 k_z4 = w[-2];
748 const v128 *w = p + ((size_t)numRounds2 * 2); 790 const v128 k_z3 = w[-1];
791 const v128 k_z2 = w[0];
749 const v128 k_z1 = w[1]; 792 const v128 k_z1 = w[1];
750 const v128 k_z0 = w[2]; 793 const v128 k_z0 = w[2];
751 for (; numBlocks != 0; numBlocks--, data++) 794 // we don't use the veorq_u8(*data, k_z0) optimization that could save one cycle,
795 // because gcc/clang compilers don't handle that optimization well.
796 do
752 { 797 {
753 MM_XOR_m (*data) 798 MM_XOR_m (*data)
754 AES_E_MC_m (k0) 799 AES_E_MC_m (k0)
@@ -757,24 +802,26 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
757 AES_E_MC_m (k3) 802 AES_E_MC_m (k3)
758 AES_E_MC_m (k4) 803 AES_E_MC_m (k4)
759 AES_E_MC_m (k5) 804 AES_E_MC_m (k5)
760 AES_E_MC_m (k6)
761 AES_E_MC_m (k7)
762 AES_E_MC_m (k8)
763 if (numRounds2 >= 6) 805 if (numRounds2 >= 6)
764 { 806 {
765 AES_E_MC_m (k9) 807 AES_E_MC_m (k6)
766 AES_E_MC_m (p[12]) 808 AES_E_MC_m (k7)
767 if (numRounds2 != 6) 809 if (numRounds2 != 6)
768 { 810 {
769 AES_E_MC_m (p[13]) 811 AES_E_MC_m (k8)
770 AES_E_MC_m (p[14]) 812 AES_E_MC_m (k9)
771 } 813 }
772 } 814 }
773 AES_E_m (k_z1) 815 AES_E_MC_m (k_z4)
774 MM_XOR_m (k_z0) 816 AES_E_MC_m (k_z3)
775 *data = m; 817 AES_E_MC_m (k_z2)
818 AES_E_m (k_z1)
819 MM_XOR_m (k_z0)
820 *data++ = m;
776 } 821 }
822 while (--numBlocks);
777 *p = m; 823 *p = m;
824 }
778} 825}
779 826
780 827
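
On ARMv8, the AES_E_MC_m macro used above pairs AESE with AESMC, a sequence most cores fuse into a single operation; the restructured round chain keeps that pair intact for every round except the last. The pair itself (sketch, assumes the NEON crypto extension):

#include <arm_neon.h>

// AESE (AddRoundKey + SubBytes + ShiftRows) followed by AESMC (MixColumns);
// keeping the two adjacent lets the core fuse them
static uint8x16_t aes_round_e_mc(uint8x16_t m, uint8x16_t k)
{
  return vaesmcq_u8(vaeseq_u8(m, k));
}
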
@@ -834,10 +881,10 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
834 881
835AES_FUNC_START2 (AesCbc_Decode_HW) 882AES_FUNC_START2 (AesCbc_Decode_HW)
836{ 883{
837 v128 *p = (v128*)(void*)ivAes; 884 v128 *p = (v128 *)(void *)ivAes;
838 v128 *data = (v128*)(void*)data8; 885 v128 *data = (v128 *)(void *)data8;
839 v128 iv = *p; 886 v128 iv = *p;
840 const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; 887 const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2;
841 const v128 *dataEnd; 888 const v128 *dataEnd;
842 p += 2; 889 p += 2;
843 890
@@ -858,7 +905,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
858 WOP_KEY (AES_XOR, 0) 905 WOP_KEY (AES_XOR, 0)
859 MM_XOR (m0, iv) 906 MM_XOR (m0, iv)
860 WOP_M1 (XOR_data_M1) 907 WOP_M1 (XOR_data_M1)
861 iv = data[NUM_WAYS - 1]; 908 LOAD_data(iv, NUM_WAYS - 1)
862 WOP (STORE_data) 909 WOP (STORE_data)
863 } 910 }
864 WIDE_LOOP_END 911 WIDE_LOOP_END
@@ -866,7 +913,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
866 SINGLE_LOOP 913 SINGLE_LOOP
867 { 914 {
868 const v128 *w = wStart; 915 const v128 *w = wStart;
869 v128 m = *data; 916 v128 m; LOAD_data(m, 0)
870 AES_D_IMC_m (w[2]) 917 AES_D_IMC_m (w[2])
871 do 918 do
872 { 919 {
@@ -878,8 +925,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
878 AES_D_m (w[1]) 925 AES_D_m (w[1])
879 MM_XOR_m (w[0]) 926 MM_XOR_m (w[0])
880 MM_XOR_m (iv) 927 MM_XOR_m (iv)
881 iv = *data; 928 LOAD_data(iv, 0)
882 *data = m; 929 STORE_data(m, 0)
883 } 930 }
884 931
885 p[-2] = iv; 932 p[-2] = iv;
@@ -888,19 +935,17 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
888 935
889AES_FUNC_START2 (AesCtr_Code_HW) 936AES_FUNC_START2 (AesCtr_Code_HW)
890{ 937{
891 v128 *p = (v128*)(void*)ivAes; 938 v128 *p = (v128 *)(void *)ivAes;
892 v128 *data = (v128*)(void*)data8; 939 v128 *data = (v128 *)(void *)data8;
893 uint64x2_t ctr = vreinterpretq_u64_u8(*p); 940 uint64x2_t ctr = vreinterpretq_u64_u8(*p);
894 const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; 941 const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2;
895 const v128 *dataEnd; 942 const v128 *dataEnd;
896 uint64x2_t one = vdupq_n_u64(0);
897
898// the bug in clang: 943// the bug in clang:
899// __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); 944// __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
900#if defined(__clang__) && (__clang_major__ <= 9) 945#if defined(__clang__) && (__clang_major__ <= 9)
901#pragma GCC diagnostic ignored "-Wvector-conversion" 946#pragma GCC diagnostic ignored "-Wvector-conversion"
902#endif 947#endif
903 one = vsetq_lane_u64(1, one, 0); 948 const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0);
904 p += 2; 949 p += 2;
905 950
906 WIDE_LOOP_START 951 WIDE_LOOP_START
diff --git a/C/CpuArch.c b/C/CpuArch.c
index e792f39..6e02551 100644
--- a/C/CpuArch.c
+++ b/C/CpuArch.c
@@ -1,5 +1,5 @@
1/* CpuArch.c -- CPU specific code 1/* CpuArch.c -- CPU specific code
22024-07-04 : Igor Pavlov : Public domain */ 2Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -17,7 +17,7 @@
17/* 17/*
18 cpuid instruction supports (subFunction) parameter in ECX, 18 cpuid instruction supports (subFunction) parameter in ECX,
19 that is used only with some specific (function) parameter values. 19 that is used only with some specific (function) parameter values.
20 But we always use only (subFunction==0). 20 most functions use only (subFunction==0).
21*/ 21*/
22/* 22/*
23 __cpuid(): MSVC and GCC/CLANG use same function/macro name 23 __cpuid(): MSVC and GCC/CLANG use same function/macro name
@@ -49,43 +49,49 @@
49#if defined(MY_CPU_AMD64) && defined(__PIC__) \ 49#if defined(MY_CPU_AMD64) && defined(__PIC__) \
50 && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) 50 && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
51 51
52#define x86_cpuid_MACRO(p, func) { \ 52 /* "=&r" selects a free register. It can even select rbx, if that register is free.
53 "=&D" for (RDI) also works, but the code can be larger with "=&D".
54 "2"(subFunc) : 2 is the (zero-based) index in the output constraint list "=c" (ECX). */
55
56#define x86_cpuid_MACRO_2(p, func, subFunc) { \
53 __asm__ __volatile__ ( \ 57 __asm__ __volatile__ ( \
54 ASM_LN "mov %%rbx, %q1" \ 58 ASM_LN "mov %%rbx, %q1" \
55 ASM_LN "cpuid" \ 59 ASM_LN "cpuid" \
56 ASM_LN "xchg %%rbx, %q1" \ 60 ASM_LN "xchg %%rbx, %q1" \
57 : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } 61 : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
58
59 /* "=&r" selects free register. It can select even rbx, if that register is free.
60 "=&D" for (RDI) also works, but the code can be larger with "=&D"
61 "2"(0) means (subFunction = 0),
62 2 is (zero-based) index in the output constraint list "=c" (ECX). */
63 62
64#elif defined(MY_CPU_X86) && defined(__PIC__) \ 63#elif defined(MY_CPU_X86) && defined(__PIC__) \
65 && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) 64 && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
66 65
67#define x86_cpuid_MACRO(p, func) { \ 66#define x86_cpuid_MACRO_2(p, func, subFunc) { \
68 __asm__ __volatile__ ( \ 67 __asm__ __volatile__ ( \
69 ASM_LN "mov %%ebx, %k1" \ 68 ASM_LN "mov %%ebx, %k1" \
70 ASM_LN "cpuid" \ 69 ASM_LN "cpuid" \
71 ASM_LN "xchg %%ebx, %k1" \ 70 ASM_LN "xchg %%ebx, %k1" \
72 : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } 71 : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
73 72
74#else 73#else
75 74
76#define x86_cpuid_MACRO(p, func) { \ 75#define x86_cpuid_MACRO_2(p, func, subFunc) { \
77 __asm__ __volatile__ ( \ 76 __asm__ __volatile__ ( \
78 ASM_LN "cpuid" \ 77 ASM_LN "cpuid" \
79 : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } 78 : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
80 79
81#endif 80#endif
82 81
82#define x86_cpuid_MACRO(p, func) x86_cpuid_MACRO_2(p, func, 0)
83 83
84void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) 84void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
85{ 85{
86 x86_cpuid_MACRO(p, func) 86 x86_cpuid_MACRO(p, func)
87} 87}
88 88
89static
90void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
91{
92 x86_cpuid_MACRO_2(p, func, subFunc)
93}
94
89 95
90Z7_NO_INLINE 96Z7_NO_INLINE
91UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) 97UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
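
The mov/xchg dance in the macros above exists because cpuid clobbers EBX/RBX, which older PIC-mode GCC/Clang reserve as the GOT pointer, so the register is parked in a scratch register around the instruction. On compilers new enough to ship <cpuid.h> with subleaf support, the same query is simply (hedged sketch, not the code 7-Zip uses):

#include <cpuid.h>

// leaf 7, subleaf 1 via the compiler builtin, which handles the
// PIC rbx problem internally; returns 0 if the leaf is unsupported
static int get_cpuid_7_1_eax(unsigned *eax)
{
  unsigned a, b, c, d;
  if (!__get_cpuid_count(7, 1, &a, &b, &c, &d))
    return 0;
  *eax = a;
  return 1;
}
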
@@ -205,11 +211,39 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
205 __asm ret 0 211 __asm ret 0
206} 212}
207 213
214static
215void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
216{
217 UNUSED_VAR(p)
218 UNUSED_VAR(func)
219 UNUSED_VAR(subFunc)
220 __asm push ebx
221 __asm push edi
222 __asm mov edi, ecx // p
223 __asm mov eax, edx // func
224 __asm mov ecx, [esp + 12] // subFunc
225 __asm cpuid
226 __asm mov [edi ], eax
227 __asm mov [edi + 4], ebx
228 __asm mov [edi + 8], ecx
229 __asm mov [edi + 12], edx
230 __asm pop edi
231 __asm pop ebx
232 __asm ret 4
233}
234
208#else // MY_CPU_AMD64 235#else // MY_CPU_AMD64
209 236
210 #if _MSC_VER >= 1600 237 #if _MSC_VER >= 1600
211 #include <intrin.h> 238 #include <intrin.h>
212 #define MY_cpuidex __cpuidex 239 #define MY_cpuidex __cpuidex
240
241static
242void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
243{
244 __cpuidex((int *)p, func, subFunc);
245}
246
213 #else 247 #else
214/* 248/*
215 __cpuid (func == (0 or 7)) requires subfunction number in ECX. 249 __cpuid (func == (0 or 7)) requires subfunction number in ECX.
@@ -219,7 +253,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
219 We still can use __cpuid for low (func) values that don't require ECX, 253 We still can use __cpuid for low (func) values that don't require ECX,
220 but __cpuid() in old MSVC will be incorrect for some func values: (func == 7). 254 but __cpuid() in old MSVC will be incorrect for some func values: (func == 7).
221 So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction, 255 So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction,
222 where ECX value is first parameter for FASTCALL / NO_INLINE func, 256 where ECX value is first parameter for FASTCALL / NO_INLINE func.
223 So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and 257 So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and
224 old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value. 258 old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value.
225 259
@@ -233,6 +267,11 @@ Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int
233} 267}
234 #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info) 268 #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info)
235 #pragma message("======== MY_cpuidex_HACK WAS USED ========") 269 #pragma message("======== MY_cpuidex_HACK WAS USED ========")
270static
271void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
272{
273 MY_cpuidex_HACK(subFunc, func, (Int32 *)p);
274}
236 #endif // _MSC_VER >= 1600 275 #endif // _MSC_VER >= 1600
237 276
238#if !defined(MY_CPU_AMD64) 277#if !defined(MY_CPU_AMD64)
@@ -445,6 +484,23 @@ BoolInt CPU_IsSupported_SHA(void)
445 } 484 }
446} 485}
447 486
487
488BoolInt CPU_IsSupported_SHA512(void)
489{
490 if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here
491
492 if (z7_x86_cpuid_GetMaxFunc() < 7)
493 return False;
494 {
495 UInt32 d[4];
496 z7_x86_cpuid_subFunc(d, 7, 0);
497 if (d[0] < 1) // d[0] is the max supported subleaf value
498 return False;
499 z7_x86_cpuid_subFunc(d, 7, 1);
500 return (BoolInt)(d[0]) & 1;
501 }
502}
503
448/* 504/*
449MSVC: _xgetbv() intrinsic is available since VS2010SP1. 505MSVC: _xgetbv() intrinsic is available since VS2010SP1.
450 MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in 506 MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in
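
CPU_IsSupported_SHA512() above encodes the published rule: the x86 SHA512 extension is reported in CPUID.(EAX=7,ECX=1):EAX bit 0, and leaf 7 subleaf 0 returns the maximum valid subleaf in EAX. The same check written directly against the MSVC intrinsics, as a standalone sketch (the real function additionally requires AVX2, omitted here):

#include <intrin.h>

static int has_sha512_x86(void)
{
  int d[4];
  __cpuid(d, 0);
  if (d[0] < 7) return 0;   // max basic leaf must reach 7
  __cpuidex(d, 7, 0);
  if (d[0] < 1) return 0;   // EAX: max supported subleaf of leaf 7
  __cpuidex(d, 7, 1);
  return d[0] & 1;          // EAX bit 0: SHA512 instructions
}
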
@@ -776,6 +832,18 @@ BoolInt CPU_IsSupported_NEON(void)
776 return z7_sysctlbyname_Get_BoolInt("hw.optional.neon"); 832 return z7_sysctlbyname_Get_BoolInt("hw.optional.neon");
777} 833}
778 834
835BoolInt CPU_IsSupported_SHA512(void)
836{
837 return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512");
838}
839
840/*
841BoolInt CPU_IsSupported_SHA3(void)
842{
843 return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3");
844}
845*/
846
779#ifdef MY_CPU_ARM64 847#ifdef MY_CPU_ARM64
780#define APPLE_CRYPTO_SUPPORT_VAL 1 848#define APPLE_CRYPTO_SUPPORT_VAL 1
781#else 849#else
@@ -860,6 +928,19 @@ MY_HWCAP_CHECK_FUNC (CRC32)
860MY_HWCAP_CHECK_FUNC (SHA1) 928MY_HWCAP_CHECK_FUNC (SHA1)
861MY_HWCAP_CHECK_FUNC (SHA2) 929MY_HWCAP_CHECK_FUNC (SHA2)
862MY_HWCAP_CHECK_FUNC (AES) 930MY_HWCAP_CHECK_FUNC (AES)
931#ifdef MY_CPU_ARM64
932// <hwcap.h> supports HWCAP_SHA512 and HWCAP_SHA3 since 2017.
933// we define them here, if they are not defined
934#ifndef HWCAP_SHA3
935// #define HWCAP_SHA3 (1 << 17)
936#endif
937#ifndef HWCAP_SHA512
938// #pragma message("=== HWCAP_SHA512 define === ")
939#define HWCAP_SHA512 (1 << 21)
940#endif
941MY_HWCAP_CHECK_FUNC (SHA512)
942// MY_HWCAP_CHECK_FUNC (SHA3)
943#endif
863 944
864#endif // __APPLE__ 945#endif // __APPLE__
865#endif // _WIN32 946#endif // _WIN32
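
On Linux/arm64 the MY_HWCAP_CHECK_FUNC(SHA512) line expands to the usual auxiliary-vector probe. Spelled out, the check looks roughly like this (sketch following the HWCAP pattern used elsewhere in CpuArch.c):

#include <sys/auxv.h>

#ifndef HWCAP_SHA512
#define HWCAP_SHA512 (1 << 21)   // bit position from <asm/hwcap.h>
#endif

static int has_sha512_arm64_linux(void)
{
  return (getauxval(AT_HWCAP) & HWCAP_SHA512) != 0;
}
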
diff --git a/C/CpuArch.h b/C/CpuArch.h
index 683cfaa..a6297ea 100644
--- a/C/CpuArch.h
+++ b/C/CpuArch.h
@@ -1,5 +1,5 @@
1/* CpuArch.h -- CPU specific code 1/* CpuArch.h -- CPU specific code
22024-06-17 : Igor Pavlov : Public domain */ 2Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_CPU_ARCH_H 4#ifndef ZIP7_INC_CPU_ARCH_H
5#define ZIP7_INC_CPU_ARCH_H 5#define ZIP7_INC_CPU_ARCH_H
@@ -509,11 +509,19 @@ problem-4 : performance:
509 509
510#if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) 510#if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED)
511 511
512#if 0
513// Z7_BSWAP16 can be slow for x86-msvc
514#define GetBe16_to32(p) (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p)))
515#else
516#define GetBe16_to32(p) (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16)
517#endif
518
512#define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p)) 519#define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p))
513#define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); } 520#define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); }
514 521
515#if defined(MY_CPU_LE_UNALIGN_64) 522#if defined(MY_CPU_LE_UNALIGN_64)
516#define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p)) 523#define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p))
524#define SetBe64(p, v) { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); }
517#endif 525#endif
518 526
519#else 527#else
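
The GetBe16_to32 definition above dodges a 16-bit bswap (slow on x86-msvc per the comment) by widening: a little-endian 16-bit load leaves the value in the low half of a 32-bit word, Z7_BSWAP32 moves it byte-reversed to the high half, and the shift brings it back down. Traced on bytes {0x12, 0x34}, with a portable reference (sketch, assumes a little-endian host):

#include <stdint.h>

//   *(const UInt16 *)p   -> 0x3412      (LE load of bytes 12 34)
//   Z7_BSWAP32(0x3412)   -> 0x12340000
//   >> 16                -> 0x00001234  == the big-endian value
static uint32_t be16_to32_ref(const unsigned char *p)
{
  return ((uint32_t)p[0] << 8) | p[1];
}
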
@@ -536,11 +544,27 @@ problem-4 : performance:
536#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4)) 544#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4))
537#endif 545#endif
538 546
547#ifndef SetBe64
548#define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \
549 _ppp_[0] = (Byte)(_vvv_ >> 56); \
550 _ppp_[1] = (Byte)(_vvv_ >> 48); \
551 _ppp_[2] = (Byte)(_vvv_ >> 40); \
552 _ppp_[3] = (Byte)(_vvv_ >> 32); \
553 _ppp_[4] = (Byte)(_vvv_ >> 24); \
554 _ppp_[5] = (Byte)(_vvv_ >> 16); \
555 _ppp_[6] = (Byte)(_vvv_ >> 8); \
556 _ppp_[7] = (Byte)_vvv_; }
557#endif
558
539#ifndef GetBe16 559#ifndef GetBe16
560#ifdef GetBe16_to32
561#define GetBe16(p) ( (UInt16) GetBe16_to32(p))
562#else
540#define GetBe16(p) ( (UInt16) ( \ 563#define GetBe16(p) ( (UInt16) ( \
541 ((UInt16)((const Byte *)(p))[0] << 8) | \ 564 ((UInt16)((const Byte *)(p))[0] << 8) | \
542 ((const Byte *)(p))[1] )) 565 ((const Byte *)(p))[1] ))
543#endif 566#endif
567#endif
544 568
545 569
546#if defined(MY_CPU_BE) 570#if defined(MY_CPU_BE)
@@ -589,6 +613,11 @@ problem-4 : performance:
589#endif 613#endif
590 614
591 615
616#ifndef GetBe16_to32
617#define GetBe16_to32(p) GetBe16(p)
618#endif
619
620
592#if defined(MY_CPU_X86_OR_AMD64) \ 621#if defined(MY_CPU_X86_OR_AMD64) \
593 || defined(MY_CPU_ARM_OR_ARM64) \ 622 || defined(MY_CPU_ARM_OR_ARM64) \
594 || defined(MY_CPU_PPC_OR_PPC64) 623 || defined(MY_CPU_PPC_OR_PPC64)
@@ -617,6 +646,7 @@ BoolInt CPU_IsSupported_SSE2(void);
617BoolInt CPU_IsSupported_SSSE3(void); 646BoolInt CPU_IsSupported_SSSE3(void);
618BoolInt CPU_IsSupported_SSE41(void); 647BoolInt CPU_IsSupported_SSE41(void);
619BoolInt CPU_IsSupported_SHA(void); 648BoolInt CPU_IsSupported_SHA(void);
649BoolInt CPU_IsSupported_SHA512(void);
620BoolInt CPU_IsSupported_PageGB(void); 650BoolInt CPU_IsSupported_PageGB(void);
621 651
622#elif defined(MY_CPU_ARM_OR_ARM64) 652#elif defined(MY_CPU_ARM_OR_ARM64)
@@ -634,6 +664,7 @@ BoolInt CPU_IsSupported_SHA1(void);
634BoolInt CPU_IsSupported_SHA2(void); 664BoolInt CPU_IsSupported_SHA2(void);
635BoolInt CPU_IsSupported_AES(void); 665BoolInt CPU_IsSupported_AES(void);
636#endif 666#endif
667BoolInt CPU_IsSupported_SHA512(void);
637 668
638#endif 669#endif
639 670
diff --git a/C/LzmaEnc.c b/C/LzmaEnc.c
index 37b2787..088b78f 100644
--- a/C/LzmaEnc.c
+++ b/C/LzmaEnc.c
@@ -1,5 +1,5 @@
1/* LzmaEnc.c -- LZMA Encoder 1/* LzmaEnc.c -- LZMA Encoder
22024-01-24: Igor Pavlov : Public domain */ 2Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -72,11 +72,11 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p)
72 p->level = level; 72 p->level = level;
73 73
74 if (p->dictSize == 0) 74 if (p->dictSize == 0)
75 p->dictSize = 75 p->dictSize = (unsigned)level <= 4 ?
76 ( level <= 3 ? ((UInt32)1 << (level * 2 + 16)) : 76 (UInt32)1 << (level * 2 + 16) :
77 ( level <= 6 ? ((UInt32)1 << (level + 19)) : 77 (unsigned)level <= sizeof(size_t) / 2 + 4 ?
78 ( level <= 7 ? ((UInt32)1 << 25) : ((UInt32)1 << 26) 78 (UInt32)1 << (level + 20) :
79 ))); 79 (UInt32)1 << (sizeof(size_t) / 2 + 24);
80 80
81 if (p->dictSize > p->reduceSize) 81 if (p->dictSize > p->reduceSize)
82 { 82 {
@@ -92,8 +92,8 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p)
92 if (p->lp < 0) p->lp = 0; 92 if (p->lp < 0) p->lp = 0;
93 if (p->pb < 0) p->pb = 2; 93 if (p->pb < 0) p->pb = 2;
94 94
95 if (p->algo < 0) p->algo = (level < 5 ? 0 : 1); 95 if (p->algo < 0) p->algo = (unsigned)level < 5 ? 0 : 1;
96 if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); 96 if (p->fb < 0) p->fb = (unsigned)level < 7 ? 32 : 64;
97 if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); 97 if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1);
98 if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5); 98 if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5);
99 if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1); 99 if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1);
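
The rewritten dictSize rule is easier to read as a table. A reference re-implementation with the resulting defaults (sketch; the reduceSize clamp is applied afterwards in the real function):

#include <stdint.h>
#include <stddef.h>

static uint32_t default_dict_size(int level)
{
  if ((unsigned)level <= 4)
    return (uint32_t)1 << (level * 2 + 16);
  if ((unsigned)level <= sizeof(size_t) / 2 + 4)
    return (uint32_t)1 << (level + 20);
  return (uint32_t)1 << (sizeof(size_t) / 2 + 24);
}
// 64-bit build, levels 0..9:
//   64K, 256K, 1M, 4M, 16M, 32M, 64M, 128M, 256M, 256M
// 32-bit build: capped at 1 << 26 (64M) from level 7 up.
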
diff --git a/C/Md5.c b/C/Md5.c
new file mode 100644
index 0000000..1b745d7
--- /dev/null
+++ b/C/Md5.c
@@ -0,0 +1,206 @@
1/* Md5.c -- MD5 Hash
2: Igor Pavlov : Public domain
3This code is based on Colin Plumb's public domain md5.c code */
4
5#include "Precomp.h"
6
7#include <string.h>
8
9#include "Md5.h"
10#include "RotateDefs.h"
11#include "CpuArch.h"
12
13#define MD5_UPDATE_BLOCKS(p) Md5_UpdateBlocks
14
15Z7_NO_INLINE
16void Md5_Init(CMd5 *p)
17{
18 p->count = 0;
19 p->state[0] = 0x67452301;
20 p->state[1] = 0xefcdab89;
21 p->state[2] = 0x98badcfe;
22 p->state[3] = 0x10325476;
23}
24
25#if 0 && !defined(MY_CPU_LE_UNALIGN)
26// optional optimization for big-endian processors or processors without unaligned access:
27// it is intended to reduce the number of complex LE32 memory reads from 64 to 16.
28// But some compilers (sparc, armt) do better without this optimization.
29#define Z7_MD5_USE_DATA32_ARRAY
30#endif
31
32#define LOAD_DATA(i) GetUi32((const UInt32 *)(const void *)data + (i))
33
34#ifdef Z7_MD5_USE_DATA32_ARRAY
35#define D(i) data32[i]
36#else
37#define D(i) LOAD_DATA(i)
38#endif
39
40#define F1(x, y, z) (z ^ (x & (y ^ z)))
41#define F2(x, y, z) F1(z, x, y)
42#define F3(x, y, z) (x ^ y ^ z)
43#define F4(x, y, z) (y ^ (x | ~z))
44
45#define R1(i, f, start, step, w, x, y, z, s, k) \
46 w += D((start + step * (i)) % 16) + k; \
47 w += f(x, y, z); \
48 w = rotlFixed(w, s) + x; \
49
50#define R4(i4, f, start, step, s0,s1,s2,s3, k0,k1,k2,k3) \
51 R1 (i4*4+0, f, start, step, a,b,c,d, s0, k0) \
52 R1 (i4*4+1, f, start, step, d,a,b,c, s1, k1) \
53 R1 (i4*4+2, f, start, step, c,d,a,b, s2, k2) \
54 R1 (i4*4+3, f, start, step, b,c,d,a, s3, k3) \
55
56#define R16(f, start, step, s0,s1,s2,s3, k00,k01,k02,k03, k10,k11,k12,k13, k20,k21,k22,k23, k30,k31,k32,k33) \
57 R4 (0, f, start, step, s0,s1,s2,s3, k00,k01,k02,k03) \
58 R4 (1, f, start, step, s0,s1,s2,s3, k10,k11,k12,k13) \
59 R4 (2, f, start, step, s0,s1,s2,s3, k20,k21,k22,k23) \
60 R4 (3, f, start, step, s0,s1,s2,s3, k30,k31,k32,k33) \
61
62static
63Z7_NO_INLINE
64void Z7_FASTCALL Md5_UpdateBlocks(UInt32 state[4], const Byte *data, size_t numBlocks)
65{
66 UInt32 a, b, c, d;
67 // if (numBlocks == 0) return;
68 a = state[0];
69 b = state[1];
70 c = state[2];
71 d = state[3];
72 do
73 {
74#ifdef Z7_MD5_USE_DATA32_ARRAY
75 UInt32 data32[MD5_NUM_BLOCK_WORDS];
76 {
77#define LOAD_data32_x4(i) { \
78 data32[i ] = LOAD_DATA(i ); \
79 data32[i + 1] = LOAD_DATA(i + 1); \
80 data32[i + 2] = LOAD_DATA(i + 2); \
81 data32[i + 3] = LOAD_DATA(i + 3); }
82#if 1
83 LOAD_data32_x4 (0 * 4)
84 LOAD_data32_x4 (1 * 4)
85 LOAD_data32_x4 (2 * 4)
86 LOAD_data32_x4 (3 * 4)
87#else
88 unsigned i;
89 for (i = 0; i < MD5_NUM_BLOCK_WORDS; i += 4)
90 {
91 LOAD_data32_x4(i)
92 }
93#endif
94 }
95#endif
96
97 R16 (F1, 0, 1, 7,12,17,22, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
98 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
99 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
100 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821)
101 R16 (F2, 1, 5, 5, 9,14,20, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
102 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
103 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
104 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a)
105 R16 (F3, 5, 3, 4,11,16,23, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
106 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
107 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
108 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665)
109 R16 (F4, 0, 7, 6,10,15,21, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
110 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
111 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
112 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391)
113
114 a += state[0];
115 b += state[1];
116 c += state[2];
117 d += state[3];
118
119 state[0] = a;
120 state[1] = b;
121 state[2] = c;
122 state[3] = d;
123
124 data += MD5_BLOCK_SIZE;
125 }
126 while (--numBlocks);
127}
128
129
130#define Md5_UpdateBlock(p) MD5_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
131
132void Md5_Update(CMd5 *p, const Byte *data, size_t size)
133{
134 if (size == 0)
135 return;
136 {
137 const unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1);
138 const unsigned num = MD5_BLOCK_SIZE - pos;
139 p->count += size;
140 if (num > size)
141 {
142 memcpy(p->buffer + pos, data, size);
143 return;
144 }
145 if (pos != 0)
146 {
147 size -= num;
148 memcpy(p->buffer + pos, data, num);
149 data += num;
150 Md5_UpdateBlock(p);
151 }
152 }
153 {
154 const size_t numBlocks = size >> 6;
155 if (numBlocks)
156 MD5_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
157 size &= MD5_BLOCK_SIZE - 1;
158 if (size == 0)
159 return;
160 data += (numBlocks << 6);
161 memcpy(p->buffer, data, size);
162 }
163}
164
165
166void Md5_Final(CMd5 *p, Byte *digest)
167{
168 unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1);
169 p->buffer[pos++] = 0x80;
170 if (pos > (MD5_BLOCK_SIZE - 4 * 2))
171 {
172 while (pos != MD5_BLOCK_SIZE) { p->buffer[pos++] = 0; }
173 // memset(&p->buf.buffer[pos], 0, MD5_BLOCK_SIZE - pos);
174 Md5_UpdateBlock(p);
175 pos = 0;
176 }
177 memset(&p->buffer[pos], 0, (MD5_BLOCK_SIZE - 4 * 2) - pos);
178 {
179 const UInt64 numBits = p->count << 3;
180#if defined(MY_CPU_LE_UNALIGN)
181 SetUi64 (p->buffer + MD5_BLOCK_SIZE - 4 * 2, numBits)
182#else
183 SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 2, (UInt32)(numBits))
184 SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 1, (UInt32)(numBits >> 32))
185#endif
186 }
187 Md5_UpdateBlock(p);
188
189 SetUi32(digest, p->state[0])
190 SetUi32(digest + 4, p->state[1])
191 SetUi32(digest + 8, p->state[2])
192 SetUi32(digest + 12, p->state[3])
193
194 Md5_Init(p);
195}
196
197#undef R1
198#undef R4
199#undef R16
200#undef D
201#undef LOAD_DATA
202#undef LOAD_data32_x4
203#undef F1
204#undef F2
205#undef F3
206#undef F4
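
Md5_Final above follows the standard MD5 padding: append 0x80, zero-fill, then store the message length in bits as a 64-bit little-endian value in the last 8 bytes of the block. This is the one place MD5 differs from SHA-1/SHA-2, which store the length big-endian. The length encoding as a reference sketch (hypothetical helper):

#include <stdint.h>

// write the MD5 length field: bit count, little-endian,
// into bytes 56..63 of the final 64-byte block
static void md5_put_length(unsigned char *block64, uint64_t total_bytes)
{
  const uint64_t numBits = total_bytes << 3;
  unsigned i;
  for (i = 0; i < 8; i++)
    block64[56 + i] = (unsigned char)(numBits >> (8 * i));
}
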
diff --git a/C/Md5.h b/C/Md5.h
new file mode 100644
index 0000000..49c0741
--- /dev/null
+++ b/C/Md5.h
@@ -0,0 +1,34 @@
1/* Md5.h -- MD5 Hash
2: Igor Pavlov : Public domain */
3
4#ifndef ZIP7_INC_MD5_H
5#define ZIP7_INC_MD5_H
6
7#include "7zTypes.h"
8
9EXTERN_C_BEGIN
10
11#define MD5_NUM_BLOCK_WORDS 16
12#define MD5_NUM_DIGEST_WORDS 4
13
14#define MD5_BLOCK_SIZE (MD5_NUM_BLOCK_WORDS * 4)
15#define MD5_DIGEST_SIZE (MD5_NUM_DIGEST_WORDS * 4)
16
17typedef struct
18{
19 UInt64 count;
20 UInt64 _pad_1;
21 // we want 16-byte alignment here
22 UInt32 state[MD5_NUM_DIGEST_WORDS];
23 UInt64 _pad_2[4];
24 // we want 64-byte alignment here
25 Byte buffer[MD5_BLOCK_SIZE];
26} CMd5;
27
28void Md5_Init(CMd5 *p);
29void Md5_Update(CMd5 *p, const Byte *data, size_t size);
30void Md5_Final(CMd5 *p, Byte *digest);
31
32EXTERN_C_END
33
34#endif
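
Usage of the new API mirrors the existing Sha1/Sha256 pattern: Init, any number of Updates, then Final, which also re-initializes the context. A quick self-check against the RFC 1321 "abc" test vector:

#include <stdio.h>
#include "Md5.h"

int main(void)
{
  CMd5 md5;
  Byte digest[MD5_DIGEST_SIZE];
  unsigned i;
  Md5_Init(&md5);
  Md5_Update(&md5, (const Byte *)"abc", 3);
  Md5_Final(&md5, digest);
  for (i = 0; i < MD5_DIGEST_SIZE; i++)
    printf("%02x", digest[i]);
  printf("\n");   // expected: 900150983cd24fb0d6963f7d28e17f72
  return 0;
}
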
diff --git a/C/Sha1.c b/C/Sha1.c
index 4c92892..4ca21d7 100644
--- a/C/Sha1.c
+++ b/C/Sha1.c
@@ -1,18 +1,14 @@
1/* Sha1.c -- SHA-1 Hash 1/* Sha1.c -- SHA-1 Hash
22024-03-01 : Igor Pavlov : Public domain 2: Igor Pavlov : Public domain
3This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */ 3This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */
4 4
5#include "Precomp.h" 5#include "Precomp.h"
6 6
7#include <string.h> 7#include <string.h>
8 8
9#include "CpuArch.h"
10#include "RotateDefs.h"
11#include "Sha1.h" 9#include "Sha1.h"
12 10#include "RotateDefs.h"
13#if defined(_MSC_VER) && (_MSC_VER < 1900) 11#include "CpuArch.h"
14// #define USE_MY_MM
15#endif
16 12
17#ifdef MY_CPU_X86_OR_AMD64 13#ifdef MY_CPU_X86_OR_AMD64
18 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ 14 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
@@ -56,7 +52,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num
56 static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS = Sha1_UpdateBlocks; 52 static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS = Sha1_UpdateBlocks;
57 static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS_HW; 53 static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS_HW;
58 54
59 #define SHA1_UPDATE_BLOCKS(p) p->func_UpdateBlocks 55 #define SHA1_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
60#else 56#else
61 #define SHA1_UPDATE_BLOCKS(p) Sha1_UpdateBlocks 57 #define SHA1_UPDATE_BLOCKS(p) Sha1_UpdateBlocks
62#endif 58#endif
@@ -85,7 +81,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
85 return False; 81 return False;
86 #endif 82 #endif
87 83
88 p->func_UpdateBlocks = func; 84 p->v.vars.func_UpdateBlocks = func;
89 return True; 85 return True;
90} 86}
91 87
@@ -225,7 +221,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
225 221
226void Sha1_InitState(CSha1 *p) 222void Sha1_InitState(CSha1 *p)
227{ 223{
228 p->count = 0; 224 p->v.vars.count = 0;
229 p->state[0] = 0x67452301; 225 p->state[0] = 0x67452301;
230 p->state[1] = 0xEFCDAB89; 226 p->state[1] = 0xEFCDAB89;
231 p->state[2] = 0x98BADCFE; 227 p->state[2] = 0x98BADCFE;
@@ -235,7 +231,7 @@ void Sha1_InitState(CSha1 *p)
235 231
236void Sha1_Init(CSha1 *p) 232void Sha1_Init(CSha1 *p)
237{ 233{
238 p->func_UpdateBlocks = 234 p->v.vars.func_UpdateBlocks =
239 #ifdef Z7_COMPILER_SHA1_SUPPORTED 235 #ifdef Z7_COMPILER_SHA1_SUPPORTED
240 g_SHA1_FUNC_UPDATE_BLOCKS; 236 g_SHA1_FUNC_UPDATE_BLOCKS;
241 #else 237 #else
@@ -250,7 +246,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num
250{ 246{
251 UInt32 a, b, c, d, e; 247 UInt32 a, b, c, d, e;
252 UInt32 W[kNumW]; 248 UInt32 W[kNumW];
253 // if (numBlocks != 0x1264378347) return; 249
254 if (numBlocks == 0) 250 if (numBlocks == 0)
255 return; 251 return;
256 252
@@ -283,7 +279,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num
283 state[3] = d; 279 state[3] = d;
284 state[4] = e; 280 state[4] = e;
285 281
286 data += 64; 282 data += SHA1_BLOCK_SIZE;
287 } 283 }
288 while (--numBlocks); 284 while (--numBlocks);
289} 285}
@@ -295,20 +291,15 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size)
295{ 291{
296 if (size == 0) 292 if (size == 0)
297 return; 293 return;
298
299 { 294 {
300 unsigned pos = (unsigned)p->count & 0x3F; 295 const unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1);
301 unsigned num; 296 const unsigned num = SHA1_BLOCK_SIZE - pos;
302 297 p->v.vars.count += size;
303 p->count += size;
304
305 num = 64 - pos;
306 if (num > size) 298 if (num > size)
307 { 299 {
308 memcpy(p->buffer + pos, data, size); 300 memcpy(p->buffer + pos, data, size);
309 return; 301 return;
310 } 302 }
311
312 if (pos != 0) 303 if (pos != 0)
313 { 304 {
314 size -= num; 305 size -= num;
@@ -318,9 +309,10 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size)
318 } 309 }
319 } 310 }
320 { 311 {
321 size_t numBlocks = size >> 6; 312 const size_t numBlocks = size >> 6;
313 // if (numBlocks)
322 SHA1_UPDATE_BLOCKS(p)(p->state, data, numBlocks); 314 SHA1_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
323 size &= 0x3F; 315 size &= SHA1_BLOCK_SIZE - 1;
324 if (size == 0) 316 if (size == 0)
325 return; 317 return;
326 data += (numBlocks << 6); 318 data += (numBlocks << 6);
@@ -331,42 +323,21 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size)
331 323
332void Sha1_Final(CSha1 *p, Byte *digest) 324void Sha1_Final(CSha1 *p, Byte *digest)
333{ 325{
334 unsigned pos = (unsigned)p->count & 0x3F; 326 unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1);
335
336
337 p->buffer[pos++] = 0x80; 327 p->buffer[pos++] = 0x80;
338 328 if (pos > (SHA1_BLOCK_SIZE - 4 * 2))
339 if (pos > (64 - 8))
340 { 329 {
341 while (pos != 64) { p->buffer[pos++] = 0; } 330 while (pos != SHA1_BLOCK_SIZE) { p->buffer[pos++] = 0; }
342 // memset(&p->buf.buffer[pos], 0, 64 - pos); 331 // memset(&p->buf.buffer[pos], 0, SHA1_BLOCK_SIZE - pos);
343 Sha1_UpdateBlock(p); 332 Sha1_UpdateBlock(p);
344 pos = 0; 333 pos = 0;
345 } 334 }
346 335 memset(&p->buffer[pos], 0, (SHA1_BLOCK_SIZE - 4 * 2) - pos);
347 /*
348 if (pos & 3)
349 {
350 p->buffer[pos] = 0;
351 p->buffer[pos + 1] = 0;
352 p->buffer[pos + 2] = 0;
353 pos += 3;
354 pos &= ~3;
355 }
356 {
357 for (; pos < 64 - 8; pos += 4)
358 *(UInt32 *)(&p->buffer[pos]) = 0;
359 }
360 */
361
362 memset(&p->buffer[pos], 0, (64 - 8) - pos);
363
364 { 336 {
365 const UInt64 numBits = (p->count << 3); 337 const UInt64 numBits = p->v.vars.count << 3;
366 SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32)) 338 SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32))
367 SetBe32(p->buffer + 64 - 4, (UInt32)(numBits)) 339 SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 1, (UInt32)(numBits))
368 } 340 }
369
370 Sha1_UpdateBlock(p); 341 Sha1_UpdateBlock(p);
371 342
372 SetBe32(digest, p->state[0]) 343 SetBe32(digest, p->state[0])
@@ -375,16 +346,13 @@ void Sha1_Final(CSha1 *p, Byte *digest)
375 SetBe32(digest + 12, p->state[3]) 346 SetBe32(digest + 12, p->state[3])
376 SetBe32(digest + 16, p->state[4]) 347 SetBe32(digest + 16, p->state[4])
377 348
378
379
380
381 Sha1_InitState(p); 349 Sha1_InitState(p);
382} 350}
383 351
384 352
385void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size) 353void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size)
386{ 354{
387 const UInt64 numBits = (p->count + size) << 3; 355 const UInt64 numBits = (p->v.vars.count + size) << 3;
388 SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 2], (UInt32)(numBits >> 32)) 356 SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 2], (UInt32)(numBits >> 32))
389 SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 1], (UInt32)(numBits)) 357 SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 1], (UInt32)(numBits))
390 // SetBe32((UInt32 *)(block + size), 0x80000000); 358 // SetBe32((UInt32 *)(block + size), 0x80000000);
@@ -420,57 +388,32 @@ void Sha1_GetBlockDigest(const CSha1 *p, const Byte *data, Byte *destDigest)
420 388
421void Sha1Prepare(void) 389void Sha1Prepare(void)
422{ 390{
423 #ifdef Z7_COMPILER_SHA1_SUPPORTED 391#ifdef Z7_COMPILER_SHA1_SUPPORTED
424 SHA1_FUNC_UPDATE_BLOCKS f, f_hw; 392 SHA1_FUNC_UPDATE_BLOCKS f, f_hw;
425 f = Sha1_UpdateBlocks; 393 f = Sha1_UpdateBlocks;
426 f_hw = NULL; 394 f_hw = NULL;
427 #ifdef MY_CPU_X86_OR_AMD64 395#ifdef MY_CPU_X86_OR_AMD64
428 #ifndef USE_MY_MM
429 if (CPU_IsSupported_SHA() 396 if (CPU_IsSupported_SHA()
430 && CPU_IsSupported_SSSE3() 397 && CPU_IsSupported_SSSE3()
431 // && CPU_IsSupported_SSE41()
432 ) 398 )
433 #endif 399#else
434 #else
435 if (CPU_IsSupported_SHA1()) 400 if (CPU_IsSupported_SHA1())
436 #endif 401#endif
437 { 402 {
438 // printf("\n========== HW SHA1 ======== \n"); 403 // printf("\n========== HW SHA1 ======== \n");
439 #if 0 && defined(MY_CPU_ARM_OR_ARM64) && defined(_MSC_VER) 404#if 1 && defined(MY_CPU_ARM_OR_ARM64) && defined(Z7_MSC_VER_ORIGINAL) && (_MSC_FULL_VER < 192930037)
440 /* there was a bug in MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037). 405 /* there was a bug in MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037).
441 It generated incorrect SHA-1 code. 406 It generated incorrect SHA-1 code. */
442 21.03 : we test sha1-hardware code at runtime initialization */ 407 #pragma message("== SHA1 code can work incorrectly with this compiler")
443 408 #error Stop_Compiling_MSC_Compiler_BUG_SHA1
444 #pragma message("== SHA1 code: MSC compiler : failure-check code was inserted") 409#endif
445
446 UInt32 state[5] = { 0, 1, 2, 3, 4 } ;
447 Byte data[64];
448 unsigned i;
449 for (i = 0; i < sizeof(data); i += 2)
450 {
451 data[i ] = (Byte)(i);
452 data[i + 1] = (Byte)(i + 1);
453 }
454
455 Sha1_UpdateBlocks_HW(state, data, sizeof(data) / 64);
456
457 if ( state[0] != 0x9acd7297
458 || state[1] != 0x4624d898
459 || state[2] != 0x0bf079f0
460 || state[3] != 0x031e61b3
461 || state[4] != 0x8323fe20)
462 {
463 // printf("\n========== SHA-1 hardware version failure ======== \n");
464 }
465 else
466 #endif
467 { 410 {
468 f = f_hw = Sha1_UpdateBlocks_HW; 411 f = f_hw = Sha1_UpdateBlocks_HW;
469 } 412 }
470 } 413 }
471 g_SHA1_FUNC_UPDATE_BLOCKS = f; 414 g_SHA1_FUNC_UPDATE_BLOCKS = f;
472 g_SHA1_FUNC_UPDATE_BLOCKS_HW = f_hw; 415 g_SHA1_FUNC_UPDATE_BLOCKS_HW = f_hw;
473 #endif 416#endif
474} 417}
475 418
476#undef kNumW 419#undef kNumW
diff --git a/C/Sha1.h b/C/Sha1.h
index fecd9d3..529be4d 100644
--- a/C/Sha1.h
+++ b/C/Sha1.h
@@ -1,5 +1,5 @@
1/* Sha1.h -- SHA-1 Hash 1/* Sha1.h -- SHA-1 Hash
22023-04-02 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_SHA1_H 4#ifndef ZIP7_INC_SHA1_H
5#define ZIP7_INC_SHA1_H 5#define ZIP7_INC_SHA1_H
@@ -14,6 +14,9 @@ EXTERN_C_BEGIN
14#define SHA1_BLOCK_SIZE (SHA1_NUM_BLOCK_WORDS * 4) 14#define SHA1_BLOCK_SIZE (SHA1_NUM_BLOCK_WORDS * 4)
15#define SHA1_DIGEST_SIZE (SHA1_NUM_DIGEST_WORDS * 4) 15#define SHA1_DIGEST_SIZE (SHA1_NUM_DIGEST_WORDS * 4)
16 16
17
18
19
17typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte *data, size_t numBlocks); 20typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte *data, size_t numBlocks);
18 21
19/* 22/*
@@ -32,9 +35,16 @@ typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte
32 35
33typedef struct 36typedef struct
34{ 37{
35 SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks; 38 union
36 UInt64 count; 39 {
37 UInt64 _pad_2[2]; 40 struct
41 {
42 SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
43 UInt64 count;
44 } vars;
45 UInt64 _pad_64bit[4];
46 void *_pad_align_ptr[2];
47 } v;
38 UInt32 state[SHA1_NUM_DIGEST_WORDS]; 48 UInt32 state[SHA1_NUM_DIGEST_WORDS];
39 UInt32 _pad_3[3]; 49 UInt32 _pad_3[3];
40 Byte buffer[SHA1_BLOCK_SIZE]; 50 Byte buffer[SHA1_BLOCK_SIZE];
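
The union replaces the old explicit pads: whichever member is largest (four UInt64s, or two pointers) fixes the header at the same size on 32- and 64-bit targets, so state and buffer keep stable offsets regardless of the function-pointer width. A quick compile-time probe of that layout claim (not part of the commit):

#include <stddef.h>
#include "Sha1.h"

// fails to compile if state stops being 16-byte aligned
// relative to the start of CSha1
typedef char sha1_state_offset_check[
    (offsetof(CSha1, state) % 16 == 0) ? 1 : -1];
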
diff --git a/C/Sha1Opt.c b/C/Sha1Opt.c
index 4e835f1..8738b94 100644
--- a/C/Sha1Opt.c
+++ b/C/Sha1Opt.c
@@ -1,18 +1,11 @@
1/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions 1/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions
22024-03-01 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5#include "Compiler.h" 5#include "Compiler.h"
6#include "CpuArch.h" 6#include "CpuArch.h"
7 7
8#if defined(_MSC_VER)
9#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
10// #define USE_MY_MM
11#endif
12#endif
13
14// #define Z7_USE_HW_SHA_STUB // for debug 8// #define Z7_USE_HW_SHA_STUB // for debug
15
16#ifdef MY_CPU_X86_OR_AMD64 9#ifdef MY_CPU_X86_OR_AMD64
17 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check 10 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
18 #define USE_HW_SHA 11 #define USE_HW_SHA
@@ -20,19 +13,14 @@
20 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ 13 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
21 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) 14 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
22 #define USE_HW_SHA 15 #define USE_HW_SHA
23 #if !defined(_INTEL_COMPILER) 16 #if !defined(__INTEL_COMPILER)
24 // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) 17 // icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
25 #if !defined(__SHA__) || !defined(__SSSE3__) 18 #if !defined(__SHA__) || !defined(__SSSE3__)
26 #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) 19 #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
27 #endif 20 #endif
28 #endif 21 #endif
29 #elif defined(_MSC_VER) 22 #elif defined(_MSC_VER)
30 #ifdef USE_MY_MM 23 #if (_MSC_VER >= 1900)
31 #define USE_VER_MIN 1300
32 #else
33 #define USE_VER_MIN 1900
34 #endif
35 #if (_MSC_VER >= USE_VER_MIN)
36 #define USE_HW_SHA 24 #define USE_HW_SHA
37 #else 25 #else
38 #define Z7_USE_HW_SHA_STUB 26 #define Z7_USE_HW_SHA_STUB
@@ -47,23 +35,20 @@
47 35
48// #pragma message("Sha1 HW") 36// #pragma message("Sha1 HW")
49 37
38
39
40
50// sse/sse2/ssse3: 41// sse/sse2/ssse3:
51#include <tmmintrin.h> 42#include <tmmintrin.h>
52// sha*: 43// sha*:
53#include <immintrin.h> 44#include <immintrin.h>
54 45
55#if defined (__clang__) && defined(_MSC_VER) 46#if defined (__clang__) && defined(_MSC_VER)
56 // #if !defined(__SSSE3__)
57 // #endif
58 #if !defined(__SHA__) 47 #if !defined(__SHA__)
59 #include <shaintrin.h> 48 #include <shaintrin.h>
60 #endif 49 #endif
61#else 50#else
62 51
63#ifdef USE_MY_MM
64#include "My_mm.h"
65#endif
66
67#endif 52#endif
68 53
69/* 54/*
@@ -84,7 +69,6 @@ SHA:
84 _mm_sha1* 69 _mm_sha1*
85*/ 70*/
86 71
87
88#define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src); 72#define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src);
89#define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask); 73#define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask);
90#define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask); 74#define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask);
@@ -99,11 +83,12 @@ SHA:
99#define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src); 83#define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src);
100#define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src); 84#define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src);
101 85
102
103#define LOAD_SHUFFLE(m, k) \ 86#define LOAD_SHUFFLE(m, k) \
104 m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ 87 m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
105 SHUFFLE_EPI8(m, mask) \ 88 SHUFFLE_EPI8(m, mask) \
106 89
90#define NNN(m0, m1, m2, m3)
91
107#define SM1(m0, m1, m2, m3) \ 92#define SM1(m0, m1, m2, m3) \
108 SHA1_MSG1(m0, m1) \ 93 SHA1_MSG1(m0, m1) \
109 94
@@ -116,35 +101,19 @@ SHA:
116 SM1(m0, m1, m2, m3) \ 101 SM1(m0, m1, m2, m3) \
117 SHA1_MSG2(m3, m2) \ 102 SHA1_MSG2(m3, m2) \
118 103
119#define NNN(m0, m1, m2, m3) 104#define R4(k, m0, m1, m2, m3, e0, e1, OP) \
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137#define R4(k, e0, e1, m0, m1, m2, m3, OP) \
138 e1 = abcd; \ 105 e1 = abcd; \
139 SHA1_RND4(abcd, e0, (k) / 5) \ 106 SHA1_RND4(abcd, e0, (k) / 5) \
140 SHA1_NEXTE(e1, m1) \ 107 SHA1_NEXTE(e1, m1) \
141 OP(m0, m1, m2, m3) \ 108 OP(m0, m1, m2, m3) \
142 109
110
111
143#define R16(k, mx, OP0, OP1, OP2, OP3) \ 112#define R16(k, mx, OP0, OP1, OP2, OP3) \
144 R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \ 113 R4 ( (k)*4+0, m0,m1,m2,m3, e0,e1, OP0 ) \
145 R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \ 114 R4 ( (k)*4+1, m1,m2,m3,m0, e1,e0, OP1 ) \
146 R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \ 115 R4 ( (k)*4+2, m2,m3,m0,m1, e0,e1, OP2 ) \
147 R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 ) \ 116 R4 ( (k)*4+3, m3,mx,m1,m2, e1,e0, OP3 ) \
148 117
149#define PREPARE_STATE \ 118#define PREPARE_STATE \
150 SHUFFLE_EPI32 (abcd, 0x1B) \ 119 SHUFFLE_EPI32 (abcd, 0x1B) \
@@ -162,8 +131,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
162{ 131{
163 const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); 132 const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
164 133
165 __m128i abcd, e0;
166 134
135 __m128i abcd, e0;
136
167 if (numBlocks == 0) 137 if (numBlocks == 0)
168 return; 138 return;
169 139
@@ -204,7 +174,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
204 PREPARE_STATE 174 PREPARE_STATE
205 175
206 _mm_storeu_si128((__m128i *) (void *) state, abcd); 176 _mm_storeu_si128((__m128i *) (void *) state, abcd);
207 *(state+4) = (UInt32)_mm_cvtsi128_si32(e0); 177 *(state + 4) = (UInt32)_mm_cvtsi128_si32(e0);
208} 178}
209 179
210#endif // USE_HW_SHA 180#endif // USE_HW_SHA
@@ -262,22 +232,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
262 #define _ARM_USE_NEW_NEON_INTRINSICS 232 #define _ARM_USE_NEW_NEON_INTRINSICS
263#endif 233#endif
264 234
265
266
267
268
269#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) 235#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
270#include <arm64_neon.h> 236#include <arm64_neon.h>
271#else 237#else
272 238
273
274
275
276
277
278
279
280
281#if defined(__clang__) && __clang_major__ < 16 239#if defined(__clang__) && __clang_major__ < 16
282#if !defined(__ARM_FEATURE_SHA2) && \ 240#if !defined(__ARM_FEATURE_SHA2) && \
283 !defined(__ARM_FEATURE_CRYPTO) 241 !defined(__ARM_FEATURE_CRYPTO)
@@ -329,26 +287,37 @@ typedef uint32x4_t v128;
329#endif 287#endif
330 288
331#ifdef MY_CPU_BE 289#ifdef MY_CPU_BE
332 #define MY_rev32_for_LE(x) 290 #define MY_rev32_for_LE(x) x
333#else 291#else
334 #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) 292 #define MY_rev32_for_LE(x) vrev32q_u8(x)
335#endif 293#endif
336 294
337#define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) 295#define LOAD_128_32(_p) vld1q_u32(_p)
338#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) 296#define LOAD_128_8(_p) vld1q_u8 (_p)
297#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
339 298
340#define LOAD_SHUFFLE(m, k) \ 299#define LOAD_SHUFFLE(m, k) \
341 m = LOAD_128((data + (k) * 16)); \ 300 m = vreinterpretq_u32_u8( \
342 MY_rev32_for_LE(m); \ 301 MY_rev32_for_LE( \
343 302 LOAD_128_8(data + (k) * 16))); \
344#define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3) 303
345#define SU1(dest, src) dest = vsha1su1q_u32(dest, src) 304#define N0(dest, src2, src3)
305#define N1(dest, src)
306#define U0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3);
307#define U1(dest, src) dest = vsha1su1q_u32(dest, src);
346#define C(e) abcd = vsha1cq_u32(abcd, e, t) 308#define C(e) abcd = vsha1cq_u32(abcd, e, t)
347#define P(e) abcd = vsha1pq_u32(abcd, e, t) 309#define P(e) abcd = vsha1pq_u32(abcd, e, t)
348#define M(e) abcd = vsha1mq_u32(abcd, e, t) 310#define M(e) abcd = vsha1mq_u32(abcd, e, t)
349#define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) 311#define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0))
350#define T(m, c) t = vaddq_u32(m, c) 312#define T(m, c) t = vaddq_u32(m, c)
351 313
314#define R16(d0,d1,d2,d3, f0,z0, f1,z1, f2,z2, f3,z3, w0,w1,w2,w3) \
315 T(m0, d0); f0(m3, m0, m1) z0(m2, m1) H(e1); w0(e0); \
316 T(m1, d1); f1(m0, m1, m2) z1(m3, m2) H(e0); w1(e1); \
317 T(m2, d2); f2(m1, m2, m3) z2(m0, m3) H(e1); w2(e0); \
318 T(m3, d3); f3(m2, m3, m0) z3(m1, m0) H(e0); w3(e1); \
319
320
352void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); 321void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
353#ifdef ATTRIB_SHA 322#ifdef ATTRIB_SHA
354ATTRIB_SHA 323ATTRIB_SHA
@@ -367,7 +336,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
367 c2 = vdupq_n_u32(0x8f1bbcdc); 336 c2 = vdupq_n_u32(0x8f1bbcdc);
368 c3 = vdupq_n_u32(0xca62c1d6); 337 c3 = vdupq_n_u32(0xca62c1d6);
369 338
370 abcd = LOAD_128(&state[0]); 339 abcd = LOAD_128_32(&state[0]);
371 e0 = state[4]; 340 e0 = state[4];
372 341
373 do 342 do
@@ -385,26 +354,11 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
385 LOAD_SHUFFLE (m2, 2) 354 LOAD_SHUFFLE (m2, 2)
386 LOAD_SHUFFLE (m3, 3) 355 LOAD_SHUFFLE (m3, 3)
387 356
388 T(m0, c0); H(e1); C(e0); 357 R16 ( c0,c0,c0,c0, N0,N1, U0,N1, U0,U1, U0,U1, C,C,C,C )
389 T(m1, c0); SU0(m0, m1, m2); H(e0); C(e1); 358 R16 ( c0,c1,c1,c1, U0,U1, U0,U1, U0,U1, U0,U1, C,P,P,P )
390 T(m2, c0); SU0(m1, m2, m3); SU1(m0, m3); H(e1); C(e0); 359 R16 ( c1,c1,c2,c2, U0,U1, U0,U1, U0,U1, U0,U1, P,P,M,M )
391 T(m3, c0); SU0(m2, m3, m0); SU1(m1, m0); H(e0); C(e1); 360 R16 ( c2,c2,c2,c3, U0,U1, U0,U1, U0,U1, U0,U1, M,M,M,P )
392 T(m0, c0); SU0(m3, m0, m1); SU1(m2, m1); H(e1); C(e0); 361 R16 ( c3,c3,c3,c3, U0,U1, N0,U1, N0,N1, N0,N1, P,P,P,P )
393 T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1);
394 T(m2, c1); SU0(m1, m2, m3); SU1(m0, m3); H(e1); P(e0);
395 T(m3, c1); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1);
396 T(m0, c1); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0);
397 T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1);
398 T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0);
399 T(m3, c2); SU0(m2, m3, m0); SU1(m1, m0); H(e0); M(e1);
400 T(m0, c2); SU0(m3, m0, m1); SU1(m2, m1); H(e1); M(e0);
401 T(m1, c2); SU0(m0, m1, m2); SU1(m3, m2); H(e0); M(e1);
402 T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0);
403 T(m3, c3); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1);
404 T(m0, c3); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0);
405 T(m1, c3); SU1(m3, m2); H(e0); P(e1);
406 T(m2, c3); H(e1); P(e0);
407 T(m3, c3); H(e0); P(e1);
408 362
409 abcd = vaddq_u32(abcd, abcd_save); 363 abcd = vaddq_u32(abcd, abcd_save);
410 e0 += e0_save; 364 e0 += e0_save;
@@ -413,7 +367,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
413 } 367 }
414 while (--numBlocks); 368 while (--numBlocks);
415 369
416 STORE_128(&state[0], abcd); 370 STORE_128_32(&state[0], abcd);
417 state[4] = e0; 371 state[4] = e0;
418} 372}
419 373
@@ -421,13 +375,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
421 375
422#endif // MY_CPU_ARM_OR_ARM64 376#endif // MY_CPU_ARM_OR_ARM64
423 377
424
425#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) 378#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
426// #error Stop_Compiling_UNSUPPORTED_SHA 379// #error Stop_Compiling_UNSUPPORTED_SHA
427// #include <stdlib.h> 380// #include <stdlib.h>
428
429
430
431// #include "Sha1.h" 381// #include "Sha1.h"
432// #if defined(_MSC_VER) 382// #if defined(_MSC_VER)
433#pragma message("Sha1 HW-SW stub was used") 383#pragma message("Sha1 HW-SW stub was used")
@@ -447,8 +397,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
447} 397}
448#endif 398#endif
449 399
450#undef SU0 400#undef U0
451#undef SU1 401#undef U1
402#undef N0
403#undef N1
452#undef C 404#undef C
453#undef P 405#undef P
454#undef M 406#undef M
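
A side note on the `mask` constant used by LOAD_SHUFFLE in the x86 path above: _mm_shuffle_epi8 with _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f) reverses all 16 bytes, which makes every 32-bit word big-endian and also reverses the word order, matching the dword layout the sha1 round intrinsics consume. A standalone sketch (editor's illustration; compile with SSSE3 enabled, e.g. gcc -mssse3):

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

int main(void)
{
  const uint8_t data[16] =
    { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 };
  const __m128i mask = _mm_set_epi32(
      0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
  __m128i m = _mm_loadu_si128((const __m128i *)(const void *)data);
  uint32_t w[4];
  m = _mm_shuffle_epi8(m, mask);   /* reverse all 16 bytes */
  _mm_storeu_si128((__m128i *)(void *)w, m);
  /* prints: 0c0d0e0f 08090a0b 04050607 00010203 */
  printf("%08x %08x %08x %08x\n", w[0], w[1], w[2], w[3]);
  return 0;
}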
diff --git a/C/Sha256.c b/C/Sha256.c
index 14d3be9..ea7ed8e 100644
--- a/C/Sha256.c
+++ b/C/Sha256.c
@@ -1,18 +1,14 @@
1/* Sha256.c -- SHA-256 Hash 1/* Sha256.c -- SHA-256 Hash
22024-03-01 : Igor Pavlov : Public domain 2: Igor Pavlov : Public domain
3This code is based on public domain code from Wei Dai's Crypto++ library. */ 3This code is based on public domain code from Wei Dai's Crypto++ library. */
4 4
5#include "Precomp.h" 5#include "Precomp.h"
6 6
7#include <string.h> 7#include <string.h>
8 8
9#include "CpuArch.h"
10#include "RotateDefs.h"
11#include "Sha256.h" 9#include "Sha256.h"
12 10#include "RotateDefs.h"
13#if defined(_MSC_VER) && (_MSC_VER < 1900) 11#include "CpuArch.h"
14// #define USE_MY_MM
15#endif
16 12
17#ifdef MY_CPU_X86_OR_AMD64 13#ifdef MY_CPU_X86_OR_AMD64
18 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ 14 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
@@ -56,7 +52,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
56 static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks; 52 static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks;
57 static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW; 53 static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW;
58 54
59 #define SHA256_UPDATE_BLOCKS(p) p->func_UpdateBlocks 55 #define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
60#else 56#else
61 #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks 57 #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks
62#endif 58#endif
@@ -85,7 +81,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
85 return False; 81 return False;
86 #endif 82 #endif
87 83
88 p->func_UpdateBlocks = func; 84 p->v.vars.func_UpdateBlocks = func;
89 return True; 85 return True;
90} 86}
91 87
@@ -111,7 +107,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
111 107
112void Sha256_InitState(CSha256 *p) 108void Sha256_InitState(CSha256 *p)
113{ 109{
114 p->count = 0; 110 p->v.vars.count = 0;
115 p->state[0] = 0x6a09e667; 111 p->state[0] = 0x6a09e667;
116 p->state[1] = 0xbb67ae85; 112 p->state[1] = 0xbb67ae85;
117 p->state[2] = 0x3c6ef372; 113 p->state[2] = 0x3c6ef372;
@@ -122,9 +118,16 @@ void Sha256_InitState(CSha256 *p)
122 p->state[7] = 0x5be0cd19; 118 p->state[7] = 0x5be0cd19;
123} 119}
124 120
121
122
123
124
125
126
127
125void Sha256_Init(CSha256 *p) 128void Sha256_Init(CSha256 *p)
126{ 129{
127 p->func_UpdateBlocks = 130 p->v.vars.func_UpdateBlocks =
128 #ifdef Z7_COMPILER_SHA256_SUPPORTED 131 #ifdef Z7_COMPILER_SHA256_SUPPORTED
129 g_SHA256_FUNC_UPDATE_BLOCKS; 132 g_SHA256_FUNC_UPDATE_BLOCKS;
130 #else 133 #else
@@ -133,10 +136,10 @@ void Sha256_Init(CSha256 *p)
133 Sha256_InitState(p); 136 Sha256_InitState(p);
134} 137}
135 138
136#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) 139#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x,22))
137#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25)) 140#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x,25))
138#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) 141#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3))
139#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) 142#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >>10))
140 143
141#define Ch(x,y,z) (z^(x&(y^z))) 144#define Ch(x,y,z) (z^(x&(y^z)))
142#define Maj(x,y,z) ((x&y)|(z&(x|y))) 145#define Maj(x,y,z) ((x&y)|(z&(x|y)))
@@ -224,12 +227,10 @@ void Sha256_Init(CSha256 *p)
224 227
225#endif 228#endif
226 229
227// static
228extern MY_ALIGN(64)
229const UInt32 SHA256_K_ARRAY[64];
230 230
231MY_ALIGN(64) 231extern
232const UInt32 SHA256_K_ARRAY[64] = { 232MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64];
233MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = {
233 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 234 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
234 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 235 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
235 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 236 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -248,27 +249,29 @@ const UInt32 SHA256_K_ARRAY[64] = {
248 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 249 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
249}; 250};
250 251
251#define K SHA256_K_ARRAY
252 252
253 253
254
255
256#define K SHA256_K_ARRAY
257
254Z7_NO_INLINE 258Z7_NO_INLINE
255void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks) 259void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks)
256{ 260{
257 UInt32 W 261 UInt32 W
258 #ifdef Z7_SHA256_BIG_W 262#ifdef Z7_SHA256_BIG_W
259 [64]; 263 [64];
260 #else 264#else
261 [16]; 265 [16];
262 #endif 266#endif
263
264 unsigned j; 267 unsigned j;
265
266 UInt32 a,b,c,d,e,f,g,h; 268 UInt32 a,b,c,d,e,f,g,h;
267 269#if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
268 #if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
269 UInt32 tmp; 270 UInt32 tmp;
270 #endif 271#endif
271 272
273 if (numBlocks == 0) return;
274
272 a = state[0]; 275 a = state[0];
273 b = state[1]; 276 b = state[1];
274 c = state[2]; 277 c = state[2];
@@ -278,7 +281,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
278 g = state[6]; 281 g = state[6];
279 h = state[7]; 282 h = state[7];
280 283
281 while (numBlocks) 284 do
282 { 285 {
283 286
284 for (j = 0; j < 16; j += STEP_PRE) 287 for (j = 0; j < 16; j += STEP_PRE)
@@ -352,19 +355,11 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
352 g += state[6]; state[6] = g; 355 g += state[6]; state[6] = g;
353 h += state[7]; state[7] = h; 356 h += state[7]; state[7] = h;
354 357
355 data += 64; 358 data += SHA256_BLOCK_SIZE;
356 numBlocks--;
357 } 359 }
358 360 while (--numBlocks);
359 /* Wipe variables */
360 /* memset(W, 0, sizeof(W)); */
361} 361}
362 362
363#undef S0
364#undef S1
365#undef s0
366#undef s1
367#undef K
368 363
369#define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) 364#define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
370 365
@@ -372,20 +367,15 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
372{ 367{
373 if (size == 0) 368 if (size == 0)
374 return; 369 return;
375
376 { 370 {
377 unsigned pos = (unsigned)p->count & 0x3F; 371 const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
378 unsigned num; 372 const unsigned num = SHA256_BLOCK_SIZE - pos;
379 373 p->v.vars.count += size;
380 p->count += size;
381
382 num = 64 - pos;
383 if (num > size) 374 if (num > size)
384 { 375 {
385 memcpy(p->buffer + pos, data, size); 376 memcpy(p->buffer + pos, data, size);
386 return; 377 return;
387 } 378 }
388
389 if (pos != 0) 379 if (pos != 0)
390 { 380 {
391 size -= num; 381 size -= num;
@@ -395,9 +385,10 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
395 } 385 }
396 } 386 }
397 { 387 {
398 size_t numBlocks = size >> 6; 388 const size_t numBlocks = size >> 6;
389 // if (numBlocks)
399 SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks); 390 SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
400 size &= 0x3F; 391 size &= SHA256_BLOCK_SIZE - 1;
401 if (size == 0) 392 if (size == 0)
402 return; 393 return;
403 data += (numBlocks << 6); 394 data += (numBlocks << 6);
@@ -408,82 +399,69 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
408 399
409void Sha256_Final(CSha256 *p, Byte *digest) 400void Sha256_Final(CSha256 *p, Byte *digest)
410{ 401{
411 unsigned pos = (unsigned)p->count & 0x3F; 402 unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
412 unsigned i;
413
414 p->buffer[pos++] = 0x80; 403 p->buffer[pos++] = 0x80;
415 404 if (pos > (SHA256_BLOCK_SIZE - 4 * 2))
416 if (pos > (64 - 8))
417 { 405 {
418 while (pos != 64) { p->buffer[pos++] = 0; } 406 while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; }
419 // memset(&p->buf.buffer[pos], 0, 64 - pos); 407 // memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos);
420 Sha256_UpdateBlock(p); 408 Sha256_UpdateBlock(p);
421 pos = 0; 409 pos = 0;
422 } 410 }
423 411 memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos);
424 /*
425 if (pos & 3)
426 { 412 {
427 p->buffer[pos] = 0; 413 const UInt64 numBits = p->v.vars.count << 3;
428 p->buffer[pos + 1] = 0; 414 SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32))
429 p->buffer[pos + 2] = 0; 415 SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits))
430 pos += 3;
431 pos &= ~3;
432 } 416 }
417 Sha256_UpdateBlock(p);
418#if 1 && defined(MY_CPU_BE)
419 memcpy(digest, p->state, SHA256_DIGEST_SIZE);
420#else
433 { 421 {
434 for (; pos < 64 - 8; pos += 4) 422 unsigned i;
435 *(UInt32 *)(&p->buffer[pos]) = 0; 423 for (i = 0; i < 8; i += 2)
424 {
425 const UInt32 v0 = p->state[i];
426 const UInt32 v1 = p->state[(size_t)i + 1];
427 SetBe32(digest , v0)
428 SetBe32(digest + 4, v1)
429 digest += 4 * 2;
430 }
436 } 431 }
437 */
438 432
439 memset(&p->buffer[pos], 0, (64 - 8) - pos);
440 433
441 {
442 UInt64 numBits = (p->count << 3);
443 SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32))
444 SetBe32(p->buffer + 64 - 4, (UInt32)(numBits))
445 }
446
447 Sha256_UpdateBlock(p);
448 434
449 for (i = 0; i < 8; i += 2) 435
450 { 436#endif
451 UInt32 v0 = p->state[i];
452 UInt32 v1 = p->state[(size_t)i + 1];
453 SetBe32(digest , v0)
454 SetBe32(digest + 4, v1)
455 digest += 8;
456 }
457
458 Sha256_InitState(p); 437 Sha256_InitState(p);
459} 438}
460 439
461 440
462void Sha256Prepare(void) 441void Sha256Prepare(void)
463{ 442{
464 #ifdef Z7_COMPILER_SHA256_SUPPORTED 443#ifdef Z7_COMPILER_SHA256_SUPPORTED
465 SHA256_FUNC_UPDATE_BLOCKS f, f_hw; 444 SHA256_FUNC_UPDATE_BLOCKS f, f_hw;
466 f = Sha256_UpdateBlocks; 445 f = Sha256_UpdateBlocks;
467 f_hw = NULL; 446 f_hw = NULL;
468 #ifdef MY_CPU_X86_OR_AMD64 447#ifdef MY_CPU_X86_OR_AMD64
469 #ifndef USE_MY_MM
470 if (CPU_IsSupported_SHA() 448 if (CPU_IsSupported_SHA()
471 && CPU_IsSupported_SSSE3() 449 && CPU_IsSupported_SSSE3()
472 // && CPU_IsSupported_SSE41()
473 ) 450 )
474 #endif 451#else
475 #else
476 if (CPU_IsSupported_SHA2()) 452 if (CPU_IsSupported_SHA2())
477 #endif 453#endif
478 { 454 {
479 // printf("\n========== HW SHA256 ======== \n"); 455 // printf("\n========== HW SHA256 ======== \n");
480 f = f_hw = Sha256_UpdateBlocks_HW; 456 f = f_hw = Sha256_UpdateBlocks_HW;
481 } 457 }
482 g_SHA256_FUNC_UPDATE_BLOCKS = f; 458 g_SHA256_FUNC_UPDATE_BLOCKS = f;
483 g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw; 459 g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw;
484 #endif 460#endif
485} 461}
486 462
463#undef U64C
464#undef K
487#undef S0 465#undef S0
488#undef S1 466#undef S1
489#undef s0 467#undef s0
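
A minimal usage sketch for the streaming API whose internals are patched above (editor's illustration; it assumes the CSha256 / Sha256_* declarations from Sha256.h):

#include <stdio.h>
#include "Sha256.h"

int main(void)
{
  CSha256 ctx;
  Byte digest[SHA256_DIGEST_SIZE];
  unsigned i;
  Sha256Prepare();  /* select SW or HW UpdateBlocks, as in the code above */
  Sha256_Init(&ctx);
  Sha256_Update(&ctx, (const Byte *)"abc", 3);
  Sha256_Final(&ctx, digest);  /* note: Final also re-inits the state */
  for (i = 0; i < SHA256_DIGEST_SIZE; i++)
    printf("%02x", digest[i]);
  printf("\n");
  /* expected:
     ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad */
  return 0;
}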
diff --git a/C/Sha256.h b/C/Sha256.h
index 9e04223..75329cd 100644
--- a/C/Sha256.h
+++ b/C/Sha256.h
@@ -1,5 +1,5 @@
1/* Sha256.h -- SHA-256 Hash 1/* Sha256.h -- SHA-256 Hash
22023-04-02 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_SHA256_H 4#ifndef ZIP7_INC_SHA256_H
5#define ZIP7_INC_SHA256_H 5#define ZIP7_INC_SHA256_H
@@ -14,6 +14,9 @@ EXTERN_C_BEGIN
14#define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4) 14#define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4)
15#define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4) 15#define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4)
16 16
17
18
19
17typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks); 20typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks);
18 21
19/* 22/*
@@ -32,9 +35,16 @@ typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byt
32 35
33typedef struct 36typedef struct
34{ 37{
35 SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; 38 union
36 UInt64 count; 39 {
37 UInt64 _pad_2[2]; 40 struct
41 {
42 SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
43 UInt64 count;
44 } vars;
45 UInt64 _pad_64bit[4];
46 void *_pad_align_ptr[2];
47 } v;
38 UInt32 state[SHA256_NUM_DIGEST_WORDS]; 48 UInt32 state[SHA256_NUM_DIGEST_WORDS];
39 49
40 Byte buffer[SHA256_BLOCK_SIZE]; 50 Byte buffer[SHA256_BLOCK_SIZE];
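
As in Sha1.h, the padded union keeps the context layout stable across 32-bit and 64-bit builds: 32 bytes for v, 32 for state, 64 for buffer, so sizeof(CSha256) == 128 on common ABIs. A minimal compile-time sketch of that assumption (editor's illustration):

#include "Sha256.h"

/* invalid array type (size -1) if the layout assumption fails */
typedef char z7_check_sha256_ctx_size[(sizeof(CSha256) == 128) ? 1 : -1];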
diff --git a/C/Sha256Opt.c b/C/Sha256Opt.c
index eb38166..1c6b50f 100644
--- a/C/Sha256Opt.c
+++ b/C/Sha256Opt.c
@@ -1,18 +1,11 @@
1/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions 1/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
22024-03-01 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5#include "Compiler.h" 5#include "Compiler.h"
6#include "CpuArch.h" 6#include "CpuArch.h"
7 7
8#if defined(_MSC_VER)
9#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
10// #define USE_MY_MM
11#endif
12#endif
13
14// #define Z7_USE_HW_SHA_STUB // for debug 8// #define Z7_USE_HW_SHA_STUB // for debug
15
16#ifdef MY_CPU_X86_OR_AMD64 9#ifdef MY_CPU_X86_OR_AMD64
17 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check 10 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
18 #define USE_HW_SHA 11 #define USE_HW_SHA
@@ -20,19 +13,14 @@
20 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ 13 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
21 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) 14 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
22 #define USE_HW_SHA 15 #define USE_HW_SHA
23 #if !defined(_INTEL_COMPILER) 16 #if !defined(__INTEL_COMPILER)
24 // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) 17 // icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
25 #if !defined(__SHA__) || !defined(__SSSE3__) 18 #if !defined(__SHA__) || !defined(__SSSE3__)
26 #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) 19 #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
27 #endif 20 #endif
28 #endif 21 #endif
29 #elif defined(_MSC_VER) 22 #elif defined(_MSC_VER)
30 #ifdef USE_MY_MM 23 #if (_MSC_VER >= 1900)
31 #define USE_VER_MIN 1300
32 #else
33 #define USE_VER_MIN 1900
34 #endif
35 #if (_MSC_VER >= USE_VER_MIN)
36 #define USE_HW_SHA 24 #define USE_HW_SHA
37 #else 25 #else
38 #define Z7_USE_HW_SHA_STUB 26 #define Z7_USE_HW_SHA_STUB
@@ -47,23 +35,20 @@
47 35
48// #pragma message("Sha256 HW") 36// #pragma message("Sha256 HW")
49 37
38
39
40
50// sse/sse2/ssse3: 41// sse/sse2/ssse3:
51#include <tmmintrin.h> 42#include <tmmintrin.h>
52// sha*: 43// sha*:
53#include <immintrin.h> 44#include <immintrin.h>
54 45
55#if defined (__clang__) && defined(_MSC_VER) 46#if defined (__clang__) && defined(_MSC_VER)
56 // #if !defined(__SSSE3__)
57 // #endif
58 #if !defined(__SHA__) 47 #if !defined(__SHA__)
59 #include <shaintrin.h> 48 #include <shaintrin.h>
60 #endif 49 #endif
61#else 50#else
62 51
63#ifdef USE_MY_MM
64#include "My_mm.h"
65#endif
66
67#endif 52#endif
68 53
69/* 54/*
@@ -91,60 +76,44 @@ SHA:
91extern 76extern
92MY_ALIGN(64) 77MY_ALIGN(64)
93const UInt32 SHA256_K_ARRAY[64]; 78const UInt32 SHA256_K_ARRAY[64];
94
95#define K SHA256_K_ARRAY 79#define K SHA256_K_ARRAY
96 80
97 81
98#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); 82#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
99#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); 83#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src);
100#define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); 84#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);
101
102 85
103#define LOAD_SHUFFLE(m, k) \ 86#define LOAD_SHUFFLE(m, k) \
104 m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ 87 m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
105 m = _mm_shuffle_epi8(m, mask); \ 88 m = _mm_shuffle_epi8(m, mask); \
106 89
107#define SM1(g0, g1, g2, g3) \ 90#define NNN(m0, m1, m2, m3)
108 SHA256_MSG1(g3, g0); \
109 91
110#define SM2(g0, g1, g2, g3) \ 92#define SM1(m1, m2, m3, m0) \
111 tmp = _mm_alignr_epi8(g1, g0, 4); \ 93 SHA256_MSG1(m0, m1); \
112 ADD_EPI32(g2, tmp) \
113 SHA25G_MSG2(g2, g1); \
114
115// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
116// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)
117
118
119#define NNN(g0, g1, g2, g3)
120 94
95#define SM2(m2, m3, m0, m1) \
96 ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \
97 SHA256_MSG2(m0, m3); \
121 98
122#define RND2(t0, t1) \ 99#define RND2(t0, t1) \
123 t0 = _mm_sha256rnds2_epu32(t0, t1, msg); 100 t0 = _mm_sha256rnds2_epu32(t0, t1, msg);
124 101
125#define RND2_0(m, k) \
126 msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
127 RND2(state0, state1); \
128 msg = _mm_shuffle_epi32(msg, 0x0E); \
129 102
130 103
131#define RND2_1 \ 104#define R4(k, m0, m1, m2, m3, OP0, OP1) \
105 msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \
106 RND2(state0, state1); \
107 msg = _mm_shuffle_epi32(msg, 0x0E); \
108 OP0(m0, m1, m2, m3) \
132 RND2(state1, state0); \ 109 RND2(state1, state0); \
133 110 OP1(m0, m1, m2, m3) \
134
135// We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2
136
137#define R4(k, g0, g1, g2, g3, OP0, OP1) \
138 RND2_0(g0, k) \
139 OP0(g0, g1, g2, g3) \
140 RND2_1 \
141 OP1(g0, g1, g2, g3) \
142 111
143#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ 112#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
144 R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ 113 R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
145 R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ 114 R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
146 R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ 115 R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
147 R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ 116 R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \
148 117
149#define PREPARE_STATE \ 118#define PREPARE_STATE \
150 tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ 119 tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
@@ -161,8 +130,9 @@ ATTRIB_SHA
161void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) 130void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
162{ 131{
163 const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); 132 const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
164 __m128i tmp; 133
165 __m128i state0, state1; 134
135 __m128i tmp, state0, state1;
166 136
167 if (numBlocks == 0) 137 if (numBlocks == 0)
168 return; 138 return;
@@ -262,22 +232,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
262 #define _ARM_USE_NEW_NEON_INTRINSICS 232 #define _ARM_USE_NEW_NEON_INTRINSICS
263#endif 233#endif
264 234
265
266
267
268
269#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) 235#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
270#include <arm64_neon.h> 236#include <arm64_neon.h>
271#else 237#else
272 238
273
274
275
276
277
278
279
280
281#if defined(__clang__) && __clang_major__ < 16 239#if defined(__clang__) && __clang_major__ < 16
282#if !defined(__ARM_FEATURE_SHA2) && \ 240#if !defined(__ARM_FEATURE_SHA2) && \
283 !defined(__ARM_FEATURE_CRYPTO) 241 !defined(__ARM_FEATURE_CRYPTO)
@@ -324,41 +282,70 @@ typedef uint32x4_t v128;
324// typedef __n128 v128; // MSVC 282// typedef __n128 v128; // MSVC
325 283
326#ifdef MY_CPU_BE 284#ifdef MY_CPU_BE
327 #define MY_rev32_for_LE(x) 285 #define MY_rev32_for_LE(x) x
328#else 286#else
329 #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) 287 #define MY_rev32_for_LE(x) vrev32q_u8(x)
330#endif 288#endif
331 289
332#define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) 290#if 1 // 0 for debug
333#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) 291// for arm32: for some reason it works slower than direct code
292/*
293for arm32 it generates:
294MSVC-2022, GCC-9:
295 vld1.32 {d18,d19}, [r10]
296 vst1.32 {d4,d5}, [r3]
297 vld1.8 {d20-d21}, [r4]
298there is no align hint (like [r10:128]), so the instruction allows unaligned access
299*/
300#define LOAD_128_32(_p) vld1q_u32(_p)
301#define LOAD_128_8(_p) vld1q_u8 (_p)
302#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
303#else
304/*
305for arm32:
306MSVC-2022:
307 vldm r10,{d18,d19}
308 vstm r3,{d4,d5}
309 does it require strict alignment?
310GCC-9:
311 vld1.64 {d30-d31}, [r0:64]
312 vldr d28, [r0, #16]
313 vldr d29, [r0, #24]
314 vst1.64 {d30-d31}, [r0:64]
315 vstr d28, [r0, #16]
316 vstr d29, [r0, #24]
317there is a hint [r0:64], so does it require 64-bit alignment?
318*/
319#define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p))
320#define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p))
321#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v)
322#endif
334 323
335#define LOAD_SHUFFLE(m, k) \ 324#define LOAD_SHUFFLE(m, k) \
336 m = LOAD_128((data + (k) * 16)); \ 325 m = vreinterpretq_u32_u8( \
337 MY_rev32_for_LE(m); \ 326 MY_rev32_for_LE( \
327 LOAD_128_8(data + (k) * 16))); \
338 328
339// K array must be aligned to at least 16 bytes. 329// K array must be aligned to at least 16 bytes.
340extern 330extern
341MY_ALIGN(64) 331MY_ALIGN(64)
342const UInt32 SHA256_K_ARRAY[64]; 332const UInt32 SHA256_K_ARRAY[64];
343
344#define K SHA256_K_ARRAY 333#define K SHA256_K_ARRAY
345 334
346
347#define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src); 335#define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src);
348#define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); 336#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
349 337
350#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0) 338#define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0)
351#define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1) 339#define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1)
352#define NNN(g0, g1, g2, g3) 340#define NNN(m0, m1, m2, m3)
353 341
354 342#define R4(k, m0, m1, m2, m3, OP0, OP1) \
355#define R4(k, g0, g1, g2, g3, OP0, OP1) \ 343 msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \
356 msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
357 tmp = state0; \ 344 tmp = state0; \
358 state0 = vsha256hq_u32( state0, state1, msg ); \ 345 state0 = vsha256hq_u32( state0, state1, msg ); \
359 state1 = vsha256h2q_u32( state1, tmp, msg ); \ 346 state1 = vsha256h2q_u32( state1, tmp, msg ); \
360 OP0(g0, g1, g2, g3); \ 347 OP0(m0, m1, m2, m3); \
361 OP1(g0, g1, g2, g3); \ 348 OP1(m0, m1, m2, m3); \
362 349
363 350
364#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ 351#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
@@ -379,8 +366,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
379 if (numBlocks == 0) 366 if (numBlocks == 0)
380 return; 367 return;
381 368
382 state0 = LOAD_128(&state[0]); 369 state0 = LOAD_128_32(&state[0]);
383 state1 = LOAD_128(&state[4]); 370 state1 = LOAD_128_32(&state[4]);
384 371
385 do 372 do
386 { 373 {
@@ -408,8 +395,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
408 } 395 }
409 while (--numBlocks); 396 while (--numBlocks);
410 397
411 STORE_128(&state[0], state0); 398 STORE_128_32(&state[0], state0);
412 STORE_128(&state[4], state1); 399 STORE_128_32(&state[4], state1);
413} 400}
414 401
415#endif // USE_HW_SHA 402#endif // USE_HW_SHA
@@ -443,13 +430,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
443#endif 430#endif
444 431
445 432
446
447#undef K 433#undef K
448#undef RND2 434#undef RND2
449#undef RND2_0
450#undef RND2_1
451
452#undef MY_rev32_for_LE 435#undef MY_rev32_for_LE
436
453#undef NNN 437#undef NNN
454#undef LOAD_128 438#undef LOAD_128
455#undef STORE_128 439#undef STORE_128
@@ -457,7 +441,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
457#undef SM1 441#undef SM1
458#undef SM2 442#undef SM2
459 443
460#undef NNN 444
461#undef R4 445#undef R4
462#undef R16 446#undef R16
463#undef PREPARE_STATE 447#undef PREPARE_STATE
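
For the NEON path above, LOAD_SHUFFLE now composes a byte load with vrev32q_u8, which on a little-endian CPU is exactly a big-endian load of four 32-bit words. A standalone sketch (editor's illustration; ARM/AArch64 with NEON):

#include <stdio.h>
#include <stdint.h>
#include <arm_neon.h>

int main(void)
{
  const uint8_t data[16] =
    { 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 };
  /* byte load, then reverse bytes within each 32-bit lane */
  const uint32x4_t m = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data)));
  uint32_t w[4];
  vst1q_u32(w, m);
  /* prints: 00010203 04050607 08090a0b 0c0d0e0f
     i.e. w[k] == GetBe32(data + 4 * k); word order is preserved */
  printf("%08x %08x %08x %08x\n", w[0], w[1], w[2], w[3]);
  return 0;
}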
diff --git a/C/Sha3.c b/C/Sha3.c
new file mode 100644
index 0000000..be972d6
--- /dev/null
+++ b/C/Sha3.c
@@ -0,0 +1,359 @@
1/* Sha3.c -- SHA-3 Hash
2: Igor Pavlov : Public domain
3This code is based on public domain code from Wei Dai's Crypto++ library. */
4
5#include "Precomp.h"
6
7#include <string.h>
8
9#include "Sha3.h"
10#include "RotateDefs.h"
11#include "CpuArch.h"
12
13#define U64C(x) UINT64_CONST(x)
14
15static
16MY_ALIGN(64)
17const UInt64 SHA3_K_ARRAY[24] =
18{
19 U64C(0x0000000000000001), U64C(0x0000000000008082),
20 U64C(0x800000000000808a), U64C(0x8000000080008000),
21 U64C(0x000000000000808b), U64C(0x0000000080000001),
22 U64C(0x8000000080008081), U64C(0x8000000000008009),
23 U64C(0x000000000000008a), U64C(0x0000000000000088),
24 U64C(0x0000000080008009), U64C(0x000000008000000a),
25 U64C(0x000000008000808b), U64C(0x800000000000008b),
26 U64C(0x8000000000008089), U64C(0x8000000000008003),
27 U64C(0x8000000000008002), U64C(0x8000000000000080),
28 U64C(0x000000000000800a), U64C(0x800000008000000a),
29 U64C(0x8000000080008081), U64C(0x8000000000008080),
30 U64C(0x0000000080000001), U64C(0x8000000080008008)
31};
32
33void Sha3_Init(CSha3 *p)
34{
35 p->count = 0;
36 memset(p->state, 0, sizeof(p->state));
37}
38
39#define GET_state(i, a) UInt64 a = state[i];
40#define SET_state(i, a) state[i] = a;
41
42#define LS_5(M, i, a0,a1,a2,a3,a4) \
43 M ((i) * 5 , a0) \
44 M ((i) * 5 + 1, a1) \
45 M ((i) * 5 + 2, a2) \
46 M ((i) * 5 + 3, a3) \
47 M ((i) * 5 + 4, a4) \
48
49#define LS_25(M) \
50 LS_5 (M, 0, a50, a51, a52, a53, a54) \
51 LS_5 (M, 1, a60, a61, a62, a63, a64) \
52 LS_5 (M, 2, a70, a71, a72, a73, a74) \
53 LS_5 (M, 3, a80, a81, a82, a83, a84) \
54 LS_5 (M, 4, a90, a91, a92, a93, a94) \
55
56
57#define XOR_1(i, a0) \
58 a0 ^= GetUi64(data + (i) * 8); \
59
60#define XOR_4(i, a0,a1,a2,a3) \
61 XOR_1 ((i) , a0); \
62 XOR_1 ((i) + 1, a1); \
63 XOR_1 ((i) + 2, a2); \
64 XOR_1 ((i) + 3, a3); \
65
66#define D(d,b1,b2) \
67 d = b1 ^ Z7_ROTL64(b2, 1);
68
69#define D5 \
70 D (d0, c4, c1) \
71 D (d1, c0, c2) \
72 D (d2, c1, c3) \
73 D (d3, c2, c4) \
74 D (d4, c3, c0) \
75
76#define C0(c,a,d) \
77 c = a ^ d; \
78
79#define C(c,a,d,k) \
80 c = a ^ d; \
81 c = Z7_ROTL64(c, k); \
82
83#define E4(e1,e2,e3,e4) \
84 e1 = c1 ^ (~c2 & c3); \
85 e2 = c2 ^ (~c3 & c4); \
86 e3 = c3 ^ (~c4 & c0); \
87 e4 = c4 ^ (~c0 & c1); \
88
89#define CK( v0,w0, \
90 v1,w1,k1, \
91 v2,w2,k2, \
92 v3,w3,k3, \
93 v4,w4,k4, e0,e1,e2,e3,e4, keccak_c) \
94 C0(c0,v0,w0) \
95 C (c1,v1,w1,k1) \
96 C (c2,v2,w2,k2) \
97 C (c3,v3,w3,k3) \
98 C (c4,v4,w4,k4) \
99 e0 = c0 ^ (~c1 & c2) ^ keccak_c; \
100 E4(e1,e2,e3,e4) \
101
102#define CE( v0,w0,k0, \
103 v1,w1,k1, \
104 v2,w2,k2, \
105 v3,w3,k3, \
106 v4,w4,k4, e0,e1,e2,e3,e4) \
107 C (c0,v0,w0,k0) \
108 C (c1,v1,w1,k1) \
109 C (c2,v2,w2,k2) \
110 C (c3,v3,w3,k3) \
111 C (c4,v4,w4,k4) \
112 e0 = c0 ^ (~c1 & c2); \
113 E4(e1,e2,e3,e4) \
114
115// numBlocks != 0
116static
117Z7_NO_INLINE
118void Z7_FASTCALL Sha3_UpdateBlocks(UInt64 state[SHA3_NUM_STATE_WORDS],
119 const Byte *data, size_t numBlocks, size_t blockSize)
120{
121 LS_25 (GET_state)
122
123 do
124 {
125 unsigned round;
126 XOR_4 ( 0, a50, a51, a52, a53)
127 XOR_4 ( 4, a54, a60, a61, a62)
128 XOR_1 ( 8, a63)
129 if (blockSize > 8 * 9) { XOR_4 ( 9, a64, a70, a71, a72) // sha3-384
130 if (blockSize > 8 * 13) { XOR_4 (13, a73, a74, a80, a81) // sha3-256
131 if (blockSize > 8 * 17) { XOR_1 (17, a82) // sha3-224
132 if (blockSize > 8 * 18) { XOR_1 (18, a83) // shake128
133 XOR_1 (19, a84)
134 XOR_1 (20, a90) }}}}
135 data += blockSize;
136
137 for (round = 0; round < 24; round += 2)
138 {
139 UInt64 c0, c1, c2, c3, c4;
140 UInt64 d0, d1, d2, d3, d4;
141 UInt64 e50, e51, e52, e53, e54;
142 UInt64 e60, e61, e62, e63, e64;
143 UInt64 e70, e71, e72, e73, e74;
144 UInt64 e80, e81, e82, e83, e84;
145 UInt64 e90, e91, e92, e93, e94;
146
147 c0 = a50^a60^a70^a80^a90;
148 c1 = a51^a61^a71^a81^a91;
149 c2 = a52^a62^a72^a82^a92;
150 c3 = a53^a63^a73^a83^a93;
151 c4 = a54^a64^a74^a84^a94;
152 D5
153 CK( a50, d0,
154 a61, d1, 44,
155 a72, d2, 43,
156 a83, d3, 21,
157 a94, d4, 14, e50, e51, e52, e53, e54, SHA3_K_ARRAY[round])
158 CE( a53, d3, 28,
159 a64, d4, 20,
160 a70, d0, 3,
161 a81, d1, 45,
162 a92, d2, 61, e60, e61, e62, e63, e64)
163 CE( a51, d1, 1,
164 a62, d2, 6,
165 a73, d3, 25,
166 a84, d4, 8,
167 a90, d0, 18, e70, e71, e72, e73, e74)
168 CE( a54, d4, 27,
169 a60, d0, 36,
170 a71, d1, 10,
171 a82, d2, 15,
172 a93, d3, 56, e80, e81, e82, e83, e84)
173 CE( a52, d2, 62,
174 a63, d3, 55,
175 a74, d4, 39,
176 a80, d0, 41,
177 a91, d1, 2, e90, e91, e92, e93, e94)
178
179 // ---------- ROUND + 1 ----------
180
181 c0 = e50^e60^e70^e80^e90;
182 c1 = e51^e61^e71^e81^e91;
183 c2 = e52^e62^e72^e82^e92;
184 c3 = e53^e63^e73^e83^e93;
185 c4 = e54^e64^e74^e84^e94;
186 D5
187 CK( e50, d0,
188 e61, d1, 44,
189 e72, d2, 43,
190 e83, d3, 21,
191 e94, d4, 14, a50, a51, a52, a53, a54, SHA3_K_ARRAY[(size_t)round + 1])
192 CE( e53, d3, 28,
193 e64, d4, 20,
194 e70, d0, 3,
195 e81, d1, 45,
196 e92, d2, 61, a60, a61, a62, a63, a64)
197 CE( e51, d1, 1,
198 e62, d2, 6,
199 e73, d3, 25,
200 e84, d4, 8,
201 e90, d0, 18, a70, a71, a72, a73, a74)
202 CE (e54, d4, 27,
203 e60, d0, 36,
204 e71, d1, 10,
205 e82, d2, 15,
206 e93, d3, 56, a80, a81, a82, a83, a84)
207 CE (e52, d2, 62,
208 e63, d3, 55,
209 e74, d4, 39,
210 e80, d0, 41,
211 e91, d1, 2, a90, a91, a92, a93, a94)
212 }
213 }
214 while (--numBlocks);
215
216 LS_25 (SET_state)
217}
218
219
220#define Sha3_UpdateBlock(p) \
221 Sha3_UpdateBlocks(p->state, p->buffer, 1, p->blockSize)
222
223void Sha3_Update(CSha3 *p, const Byte *data, size_t size)
224{
225/*
226 for (;;)
227 {
228 if (size == 0)
229 return;
230 unsigned cur = p->blockSize - p->count;
231 if (cur > size)
232 cur = (unsigned)size;
233 size -= cur;
234 unsigned pos = p->count;
235 p->count = pos + cur;
236 while (pos & 7)
237 {
238 if (cur == 0)
239 return;
240 Byte *pb = &(((Byte *)p->state)[pos]);
241 *pb = (Byte)(*pb ^ *data++);
242 cur--;
243 pos++;
244 }
245 if (cur >= 8)
246 {
247 do
248 {
249 *(UInt64 *)(void *)&(((Byte *)p->state)[pos]) ^= GetUi64(data);
250 data += 8;
251 pos += 8;
252 cur -= 8;
253 }
254 while (cur >= 8);
255 }
256 if (pos != p->blockSize)
257 {
258 if (cur)
259 {
260 Byte *pb = &(((Byte *)p->state)[pos]);
261 do
262 {
263 *pb = (Byte)(*pb ^ *data++);
264 pb++;
265 }
266 while (--cur);
267 }
268 return;
269 }
270 Sha3_UpdateBlock(p->state);
271 p->count = 0;
272 }
273*/
274 if (size == 0)
275 return;
276 {
277 const unsigned pos = p->count;
278 const unsigned num = p->blockSize - pos;
279 if (num > size)
280 {
281 p->count = pos + (unsigned)size;
282 memcpy(p->buffer + pos, data, size);
283 return;
284 }
285 if (pos != 0)
286 {
287 size -= num;
288 memcpy(p->buffer + pos, data, num);
289 data += num;
290 Sha3_UpdateBlock(p);
291 }
292 }
293 if (size >= p->blockSize)
294 {
295 const size_t numBlocks = size / p->blockSize;
296 const Byte *dataOld = data;
297 data += numBlocks * p->blockSize;
298 size = (size_t)(dataOld + size - data);
299 Sha3_UpdateBlocks(p->state, dataOld, numBlocks, p->blockSize);
300 }
301 p->count = (unsigned)size;
302 if (size)
303 memcpy(p->buffer, data, size);
304}
305
306
307// we support only (digestSize % 4 == 0) cases
308void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake)
309{
310 memset(p->buffer + p->count, 0, p->blockSize - p->count);
311 // we write bit markers from low to high in the current byte:
312 // - if sha-3 : 2 bits : 0,1
313 // - if shake : 4 bits : 1111
314 // then we write bit 1 to the same byte.
315 // And we set the highest bit of the last byte of the block.
316 p->buffer[p->count] = (Byte)(shake ? 0x1f : 0x06);
317 // we need the xor operation (^= 0x80) here, because we must write the 0x80 bit
318 // to the same byte as (0x1f : 0x06) if (p->count == p->blockSize - 1) !!!
319 p->buffer[p->blockSize - 1] ^= 0x80;
320/*
321 ((Byte *)p->state)[p->count] ^= (Byte)(shake ? 0x1f : 0x06);
322 ((Byte *)p->state)[p->blockSize - 1] ^= 0x80;
323*/
324 Sha3_UpdateBlock(p);
325#if 1 && defined(MY_CPU_LE)
326 memcpy(digest, p->state, digestSize);
327#else
328 {
329 const unsigned numWords = digestSize >> 3;
330 unsigned i;
331 for (i = 0; i < numWords; i++)
332 {
333 const UInt64 v = p->state[i];
334 SetUi64(digest, v)
335 digest += 8;
336 }
337 if (digestSize & 4) // for SHA3-224
338 {
339 const UInt32 v = (UInt32)p->state[numWords];
340 SetUi32(digest, v)
341 }
342 }
343#endif
344 Sha3_Init(p);
345}
346
347#undef GET_state
348#undef SET_state
349#undef LS_5
350#undef LS_25
351#undef XOR_1
352#undef XOR_4
353#undef D
354#undef D5
355#undef C0
356#undef C
357#undef E4
358#undef CK
359#undef CE
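
A minimal usage sketch for the new SHA-3 API (editor's illustration; it relies on the CSha3 declarations from Sha3.h below). The caller must set blockSize before hashing; for SHA3-256 the rate is 200 - 2 * 32 = 136 bytes:

#include <stdio.h>
#include "Sha3.h"

int main(void)
{
  CSha3 ctx;
  Byte digest[32];
  unsigned i;
  Sha3_SET_blockSize(&ctx, SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(32));  /* 136 */
  Sha3_Init(&ctx);
  Sha3_Update(&ctx, (const Byte *)"abc", 3);
  Sha3_Final(&ctx, digest, 32, 0);  /* 0: sha-3 domain bits, not shake */
  for (i = 0; i < 32; i++)
    printf("%02x", digest[i]);
  printf("\n");
  /* expected SHA3-256("abc"):
     3a985da74fe225b2045c172d6bd390bd855f086e3e9d525b46bfe24511431532 */
  return 0;
}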
diff --git a/C/Sha3.h b/C/Sha3.h
new file mode 100644
index 0000000..c5909c9
--- /dev/null
+++ b/C/Sha3.h
@@ -0,0 +1,36 @@
1/* Sha3.h -- SHA-3 Hash
2: Igor Pavlov : Public domain */
3
4#ifndef ZIP7_INC_SHA3_H
5#define ZIP7_INC_SHA3_H
6
7#include "7zTypes.h"
8
9EXTERN_C_BEGIN
10
11#define SHA3_NUM_STATE_WORDS 25
12
13#define SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(digestSize) \
14 (SHA3_NUM_STATE_WORDS * 8 - (digestSize) * 2)
15
16typedef struct
17{
18 UInt32 count; // < blockSize
19 UInt32 blockSize; // <= SHA3_NUM_STATE_WORDS * 8
20 UInt64 _pad1[3];
21 // we want 32-byte alignment here
22 UInt64 state[SHA3_NUM_STATE_WORDS];
23 UInt64 _pad2[3];
24 // we want 64-byte alignment here
25 Byte buffer[SHA3_NUM_STATE_WORDS * 8]; // the last bytes are unused with the predefined blockSize values
26} CSha3;
27
28#define Sha3_SET_blockSize(p, blockSize) { (p)->blockSize = (blockSize); }
29
30void Sha3_Init(CSha3 *p);
31void Sha3_Update(CSha3 *p, const Byte *data, size_t size);
32void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake);
33
34EXTERN_C_END
35
36#endif
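
The block-size macro above encodes the Keccak rate: 200 state bytes minus the capacity (twice the digest size). A quick sketch of the resulting rates for the standard variants (editor's illustration; for SHAKE128 a nominal digest size of 16 is assumed):

#include <stdio.h>
#include "Sha3.h"

int main(void)
{
  printf("SHA3-224 : %d\n", SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(28));  /* 144 */
  printf("SHA3-256 : %d\n", SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(32));  /* 136 */
  printf("SHA3-384 : %d\n", SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(48));  /* 104 */
  printf("SHA3-512 : %d\n", SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(64));  /*  72 */
  printf("SHAKE128 : %d\n", SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(16));  /* 168 */
  return 0;
}

These values match the blockSize thresholds tested in Sha3_UpdateBlocks above.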
diff --git a/C/Sha512.c b/C/Sha512.c
new file mode 100644
index 0000000..04827d6
--- /dev/null
+++ b/C/Sha512.c
@@ -0,0 +1,618 @@
1/* Sha512.c -- SHA-512 Hash
2: Igor Pavlov : Public domain
3This code is based on public domain code from Wei Dai's Crypto++ library. */
4
5#include "Precomp.h"
6
7#include <string.h>
8
9#include "Sha512.h"
10#include "RotateDefs.h"
11#include "CpuArch.h"
12
13#ifdef MY_CPU_X86_OR_AMD64
14 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \
15 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \
16 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000) \
17 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) \
18 || defined(_MSC_VER) && (_MSC_VER >= 1940)
19 #define Z7_COMPILER_SHA512_SUPPORTED
20 #endif
21#elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE)
22 #if defined(__ARM_FEATURE_SHA512)
23 #define Z7_COMPILER_SHA512_SUPPORTED
24 #else
25 #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \
26 || defined(__GNUC__) && (__GNUC__ >= 9) \
27 ) \
28 || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it
29 #define Z7_COMPILER_SHA512_SUPPORTED
30 #endif
31 #endif
32#endif
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks);
48
49#ifdef Z7_COMPILER_SHA512_SUPPORTED
50 void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
51
52 static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS = Sha512_UpdateBlocks;
53 static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS_HW;
54
55 #define SHA512_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
56#else
57 #define SHA512_UPDATE_BLOCKS(p) Sha512_UpdateBlocks
58#endif
59
60
61BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo)
62{
63 SHA512_FUNC_UPDATE_BLOCKS func = Sha512_UpdateBlocks;
64
65 #ifdef Z7_COMPILER_SHA512_SUPPORTED
66 if (algo != SHA512_ALGO_SW)
67 {
68 if (algo == SHA512_ALGO_DEFAULT)
69 func = g_SHA512_FUNC_UPDATE_BLOCKS;
70 else
71 {
72 if (algo != SHA512_ALGO_HW)
73 return False;
74 func = g_SHA512_FUNC_UPDATE_BLOCKS_HW;
75 if (!func)
76 return False;
77 }
78 }
79 #else
80 if (algo > 1)
81 return False;
82 #endif
83
84 p->v.vars.func_UpdateBlocks = func;
85 return True;
86}
87
88
89/* define it for speed optimization */
90
91#if 0 // 1 for size optimization
92 #define STEP_PRE 1
93 #define STEP_MAIN 1
94#else
95 #define STEP_PRE 2
96 #define STEP_MAIN 4
97 // #define Z7_SHA512_UNROLL
98#endif
99
100#undef Z7_SHA512_BIG_W
101#if STEP_MAIN != 16
102 #define Z7_SHA512_BIG_W
103#endif
104
105
106#define U64C(x) UINT64_CONST(x)
107
108static MY_ALIGN(64) const UInt64 SHA512_INIT_ARRAYS[4][8] = {
109{ U64C(0x8c3d37c819544da2), U64C(0x73e1996689dcd4d6), U64C(0x1dfab7ae32ff9c82), U64C(0x679dd514582f9fcf),
110 U64C(0x0f6d2b697bd44da8), U64C(0x77e36f7304c48942), U64C(0x3f9d85a86a1d36c8), U64C(0x1112e6ad91d692a1)
111},
112{ U64C(0x22312194fc2bf72c), U64C(0x9f555fa3c84c64c2), U64C(0x2393b86b6f53b151), U64C(0x963877195940eabd),
113 U64C(0x96283ee2a88effe3), U64C(0xbe5e1e2553863992), U64C(0x2b0199fc2c85b8aa), U64C(0x0eb72ddc81c52ca2)
114},
115{ U64C(0xcbbb9d5dc1059ed8), U64C(0x629a292a367cd507), U64C(0x9159015a3070dd17), U64C(0x152fecd8f70e5939),
116 U64C(0x67332667ffc00b31), U64C(0x8eb44a8768581511), U64C(0xdb0c2e0d64f98fa7), U64C(0x47b5481dbefa4fa4)
117},
118{ U64C(0x6a09e667f3bcc908), U64C(0xbb67ae8584caa73b), U64C(0x3c6ef372fe94f82b), U64C(0xa54ff53a5f1d36f1),
119 U64C(0x510e527fade682d1), U64C(0x9b05688c2b3e6c1f), U64C(0x1f83d9abfb41bd6b), U64C(0x5be0cd19137e2179)
120}};
121
122void Sha512_InitState(CSha512 *p, unsigned digestSize)
123{
124 p->v.vars.count = 0;
125 memcpy(p->state, SHA512_INIT_ARRAYS[(size_t)(digestSize >> 4) - 1], sizeof(p->state));
126}
127
128void Sha512_Init(CSha512 *p, unsigned digestSize)
129{
130 p->v.vars.func_UpdateBlocks =
131 #ifdef Z7_COMPILER_SHA512_SUPPORTED
132 g_SHA512_FUNC_UPDATE_BLOCKS;
133 #else
134 NULL;
135 #endif
136 Sha512_InitState(p, digestSize);
137}
138
139#define S0(x) (Z7_ROTR64(x,28) ^ Z7_ROTR64(x,34) ^ Z7_ROTR64(x,39))
140#define S1(x) (Z7_ROTR64(x,14) ^ Z7_ROTR64(x,18) ^ Z7_ROTR64(x,41))
141#define s0(x) (Z7_ROTR64(x, 1) ^ Z7_ROTR64(x, 8) ^ (x >> 7))
142#define s1(x) (Z7_ROTR64(x,19) ^ Z7_ROTR64(x,61) ^ (x >> 6))
143
144#define Ch(x,y,z) (z^(x&(y^z)))
145#define Maj(x,y,z) ((x&y)|(z&(x|y)))
146
147
148#define W_PRE(i) (W[(i) + (size_t)(j)] = GetBe64(data + ((size_t)(j) + i) * 8))
149
150#define blk2_main(j, i) s1(w(j, (i)-2)) + w(j, (i)-7) + s0(w(j, (i)-15))
151
152#ifdef Z7_SHA512_BIG_W
153 // we use +i instead of +(i) to change the order to solve CLANG compiler warning for signed/unsigned.
154 #define w(j, i) W[(size_t)(j) + i]
155 #define blk2(j, i) (w(j, i) = w(j, (i)-16) + blk2_main(j, i))
156#else
157 #if STEP_MAIN == 16
158 #define w(j, i) W[(i) & 15]
159 #else
160 #define w(j, i) W[((size_t)(j) + (i)) & 15]
161 #endif
162 #define blk2(j, i) (w(j, i) += blk2_main(j, i))
163#endif
164
165#define W_MAIN(i) blk2(j, i)
166
167
168#define T1(wx, i) \
169 tmp = h + S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
170 h = g; \
171 g = f; \
172 f = e; \
173 e = d + tmp; \
174 tmp += S0(a) + Maj(a, b, c); \
175 d = c; \
176 c = b; \
177 b = a; \
178 a = tmp; \
179
180#define R1_PRE(i) T1( W_PRE, i)
181#define R1_MAIN(i) T1( W_MAIN, i)
182
183#if (!defined(Z7_SHA512_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4)
184#define R2_MAIN(i) \
185 R1_MAIN(i) \
186 R1_MAIN(i + 1) \
187
188#endif
189
190
191
192#if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8
193
194#define T4( a,b,c,d,e,f,g,h, wx, i) \
195 h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
196 tmp = h; \
197 h += d; \
198 d = tmp + S0(a) + Maj(a, b, c); \
199
200#define R4( wx, i) \
201 T4 ( a,b,c,d,e,f,g,h, wx, (i )); \
202 T4 ( d,a,b,c,h,e,f,g, wx, (i+1)); \
203 T4 ( c,d,a,b,g,h,e,f, wx, (i+2)); \
204 T4 ( b,c,d,a,f,g,h,e, wx, (i+3)); \
205
206#define R4_PRE(i) R4( W_PRE, i)
207#define R4_MAIN(i) R4( W_MAIN, i)
208
209
210#define T8( a,b,c,d,e,f,g,h, wx, i) \
211 h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
212 d += h; \
213 h += S0(a) + Maj(a, b, c); \
214
215#define R8( wx, i) \
216 T8 ( a,b,c,d,e,f,g,h, wx, i ); \
217 T8 ( h,a,b,c,d,e,f,g, wx, i+1); \
218 T8 ( g,h,a,b,c,d,e,f, wx, i+2); \
219 T8 ( f,g,h,a,b,c,d,e, wx, i+3); \
220 T8 ( e,f,g,h,a,b,c,d, wx, i+4); \
221 T8 ( d,e,f,g,h,a,b,c, wx, i+5); \
222 T8 ( c,d,e,f,g,h,a,b, wx, i+6); \
223 T8 ( b,c,d,e,f,g,h,a, wx, i+7); \
224
225#define R8_PRE(i) R8( W_PRE, i)
226#define R8_MAIN(i) R8( W_MAIN, i)
227
228#endif
229
230
231extern
232MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80];
233MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80] = {
234 U64C(0x428a2f98d728ae22), U64C(0x7137449123ef65cd), U64C(0xb5c0fbcfec4d3b2f), U64C(0xe9b5dba58189dbbc),
235 U64C(0x3956c25bf348b538), U64C(0x59f111f1b605d019), U64C(0x923f82a4af194f9b), U64C(0xab1c5ed5da6d8118),
236 U64C(0xd807aa98a3030242), U64C(0x12835b0145706fbe), U64C(0x243185be4ee4b28c), U64C(0x550c7dc3d5ffb4e2),
237 U64C(0x72be5d74f27b896f), U64C(0x80deb1fe3b1696b1), U64C(0x9bdc06a725c71235), U64C(0xc19bf174cf692694),
238 U64C(0xe49b69c19ef14ad2), U64C(0xefbe4786384f25e3), U64C(0x0fc19dc68b8cd5b5), U64C(0x240ca1cc77ac9c65),
239 U64C(0x2de92c6f592b0275), U64C(0x4a7484aa6ea6e483), U64C(0x5cb0a9dcbd41fbd4), U64C(0x76f988da831153b5),
240 U64C(0x983e5152ee66dfab), U64C(0xa831c66d2db43210), U64C(0xb00327c898fb213f), U64C(0xbf597fc7beef0ee4),
241 U64C(0xc6e00bf33da88fc2), U64C(0xd5a79147930aa725), U64C(0x06ca6351e003826f), U64C(0x142929670a0e6e70),
242 U64C(0x27b70a8546d22ffc), U64C(0x2e1b21385c26c926), U64C(0x4d2c6dfc5ac42aed), U64C(0x53380d139d95b3df),
243 U64C(0x650a73548baf63de), U64C(0x766a0abb3c77b2a8), U64C(0x81c2c92e47edaee6), U64C(0x92722c851482353b),
244 U64C(0xa2bfe8a14cf10364), U64C(0xa81a664bbc423001), U64C(0xc24b8b70d0f89791), U64C(0xc76c51a30654be30),
245 U64C(0xd192e819d6ef5218), U64C(0xd69906245565a910), U64C(0xf40e35855771202a), U64C(0x106aa07032bbd1b8),
246 U64C(0x19a4c116b8d2d0c8), U64C(0x1e376c085141ab53), U64C(0x2748774cdf8eeb99), U64C(0x34b0bcb5e19b48a8),
247 U64C(0x391c0cb3c5c95a63), U64C(0x4ed8aa4ae3418acb), U64C(0x5b9cca4f7763e373), U64C(0x682e6ff3d6b2b8a3),
248 U64C(0x748f82ee5defb2fc), U64C(0x78a5636f43172f60), U64C(0x84c87814a1f0ab72), U64C(0x8cc702081a6439ec),
249 U64C(0x90befffa23631e28), U64C(0xa4506cebde82bde9), U64C(0xbef9a3f7b2c67915), U64C(0xc67178f2e372532b),
250 U64C(0xca273eceea26619c), U64C(0xd186b8c721c0c207), U64C(0xeada7dd6cde0eb1e), U64C(0xf57d4f7fee6ed178),
251 U64C(0x06f067aa72176fba), U64C(0x0a637dc5a2c898a6), U64C(0x113f9804bef90dae), U64C(0x1b710b35131c471b),
252 U64C(0x28db77f523047d84), U64C(0x32caab7b40c72493), U64C(0x3c9ebe0a15c9bebc), U64C(0x431d67c49c100d4c),
253 U64C(0x4cc5d4becb3e42b6), U64C(0x597f299cfc657e2a), U64C(0x5fcb6fab3ad6faec), U64C(0x6c44198c4a475817)
254};
255
256#define K SHA512_K_ARRAY
257
258Z7_NO_INLINE
259void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks)
260{
261 UInt64 W
262#ifdef Z7_SHA512_BIG_W
263 [80];
264#else
265 [16];
266#endif
267 unsigned j;
268 UInt64 a,b,c,d,e,f,g,h;
269#if !defined(Z7_SHA512_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
270 UInt64 tmp;
271#endif
272
273 if (numBlocks == 0) return;
274
275 a = state[0];
276 b = state[1];
277 c = state[2];
278 d = state[3];
279 e = state[4];
280 f = state[5];
281 g = state[6];
282 h = state[7];
283
284 do
285 {
286
287 for (j = 0; j < 16; j += STEP_PRE)
288 {
289 #if STEP_PRE > 4
290
291 #if STEP_PRE < 8
292 R4_PRE(0);
293 #else
294 R8_PRE(0);
295 #if STEP_PRE == 16
296 R8_PRE(8);
297 #endif
298 #endif
299
300 #else
301
302 R1_PRE(0)
303 #if STEP_PRE >= 2
304 R1_PRE(1)
305 #if STEP_PRE >= 4
306 R1_PRE(2)
307 R1_PRE(3)
308 #endif
309 #endif
310
311 #endif
312 }
313
314 for (j = 16; j < 80; j += STEP_MAIN)
315 {
316 #if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8
317
318 #if STEP_MAIN < 8
319 R4_MAIN(0)
320 #else
321 R8_MAIN(0)
322 #if STEP_MAIN == 16
323 R8_MAIN(8)
324 #endif
325 #endif
326
327 #else
328
329 R1_MAIN(0)
330 #if STEP_MAIN >= 2
331 R1_MAIN(1)
332 #if STEP_MAIN >= 4
333 R2_MAIN(2)
334 #if STEP_MAIN >= 8
335 R2_MAIN(4)
336 R2_MAIN(6)
337 #if STEP_MAIN >= 16
338 R2_MAIN(8)
339 R2_MAIN(10)
340 R2_MAIN(12)
341 R2_MAIN(14)
342 #endif
343 #endif
344 #endif
345 #endif
346 #endif
347 }
348
349 a += state[0]; state[0] = a;
350 b += state[1]; state[1] = b;
351 c += state[2]; state[2] = c;
352 d += state[3]; state[3] = d;
353 e += state[4]; state[4] = e;
354 f += state[5]; state[5] = f;
355 g += state[6]; state[6] = g;
356 h += state[7]; state[7] = h;
357
358 data += SHA512_BLOCK_SIZE;
359 }
360 while (--numBlocks);
361}
362
363
364#define Sha512_UpdateBlock(p) SHA512_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
365
366void Sha512_Update(CSha512 *p, const Byte *data, size_t size)
367{
368 if (size == 0)
369 return;
370 {
371 const unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1);
372 const unsigned num = SHA512_BLOCK_SIZE - pos;
373 p->v.vars.count += size;
374 if (num > size)
375 {
376 memcpy(p->buffer + pos, data, size);
377 return;
378 }
379 if (pos != 0)
380 {
381 size -= num;
382 memcpy(p->buffer + pos, data, num);
383 data += num;
384 Sha512_UpdateBlock(p);
385 }
386 }
387 {
388 const size_t numBlocks = size >> 7;
389 // if (numBlocks)
390 SHA512_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
391 size &= SHA512_BLOCK_SIZE - 1;
392 if (size == 0)
393 return;
394 data += (numBlocks << 7);
395 memcpy(p->buffer, data, size);
396 }
397}
398
399
400void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize)
401{
402 unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1);
403 p->buffer[pos++] = 0x80;
404 if (pos > (SHA512_BLOCK_SIZE - 8 * 2))
405 {
406 while (pos != SHA512_BLOCK_SIZE) { p->buffer[pos++] = 0; }
407 // memset(&p->buf.buffer[pos], 0, SHA512_BLOCK_SIZE - pos);
408 Sha512_UpdateBlock(p);
409 pos = 0;
410 }
411 memset(&p->buffer[pos], 0, (SHA512_BLOCK_SIZE - 8 * 2) - pos);
412 {
413 const UInt64 numBits = p->v.vars.count << 3;
414 SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 2, 0) // high 64 bits of the 128-bit length: (p->v.vars.count >> (64 - 3)), effectively 0 here
415 SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 1, numBits)
416 }
417 Sha512_UpdateBlock(p);
418#if 1 && defined(MY_CPU_BE)
419 memcpy(digest, p->state, digestSize);
420#else
421 {
422 const unsigned numWords = digestSize >> 3;
423 unsigned i;
424 for (i = 0; i < numWords; i++)
425 {
426 const UInt64 v = p->state[i];
427 SetBe64(digest, v)
428 digest += 8;
429 }
430 if (digestSize & 4) // digestSize == SHA512_224_DIGEST_SIZE
431 {
432 const UInt32 v = (UInt32)((p->state[numWords]) >> 32);
433 SetBe32(digest, v)
434 }
435 }
436#endif
437 Sha512_InitState(p, digestSize);
438}
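
/* Editorial worked example (not part of the patch): for a 3-byte message the
   final block built above is laid out as:
     buffer[0..2]     = the message bytes
     buffer[3]        = 0x80
     buffer[4..111]   = 0x00
     buffer[112..119] = 0 (high 64 bits of the 128-bit bit count)
     buffer[120..127] = 24 as a big-endian UInt64 (3 bytes * 8 bits)
   SHA-512 defines a 128-bit length field, but v.vars.count is a 64-bit byte
   counter, so the high word written here is always zero. */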
439
440
441
442
443#if defined(_WIN32) && defined(Z7_COMPILER_SHA512_SUPPORTED) \
444 && defined(MY_CPU_ARM64) // we can disable this check to debug in x64
445
446#if 1 // 0 for debug
447
448#include "7zWindows.h"
449// #include <stdio.h>
450#if 0 && defined(MY_CPU_X86_OR_AMD64)
451#include <intrin.h> // for debug : for __ud2()
452#endif
453
454BoolInt CPU_IsSupported_SHA512(void)
455{
456#if defined(MY_CPU_ARM64)
457 // Windows still provides no SHA512 feature flag for IsProcessorFeaturePresent().
458 if (!CPU_IsSupported_CRYPTO())
459 return False;
460#endif
461 // printf("\nCPU_IsSupported_SHA512\n");
462 {
463 // We can't read the ID_AA64ISAR0_EL1 register from user mode,
464 // but Windows maps that register to the "CP 4030" registry value.
465 HKEY key = NULL;
466 LONG res = RegOpenKeyEx(HKEY_LOCAL_MACHINE,
467 TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
468 0, KEY_READ, &key);
469 if (res != ERROR_SUCCESS)
470 return False;
471 {
472 DWORD type = 0;
473 DWORD count = sizeof(UInt64);
474 UInt64 val = 0;
475 res = RegQueryValueEx(key, TEXT("CP 4030"), NULL,
476 &type, (LPBYTE)&val, &count);
477 RegCloseKey(key);
478 if (res != ERROR_SUCCESS
479 || type != REG_QWORD
480 || count != sizeof(UInt64)
481 || ((unsigned)(val >> 12) & 0xf) != 2)
482 return False;
483 // We parse the SHA2 field of the ID_AA64ISAR0_EL1 register:
484 // 0 : No SHA2 instructions implemented
485 // 1 : SHA256 implemented
486 // 2 : SHA256 and SHA512 implemented
487 }
488 }
489
490
491#if 1 // 0 for debug to disable SHA512 PROBE code
492
493/*
494----- SHA512 PROBE -----
495
496We assume that reading the "CP 4030" registry value is sufficient.
497But we also run this additional SHA512 PROBE code, because here
498we can catch an exception, while we don't catch exceptions
499when the Sha512 functions are called from the main code.
500
501NOTE: the arm64 PROBE code doesn't work if we call it via Wine on linux-arm64:
502the program just stops.
503The x64 version of the PROBE code also fails if we run it under the Intel SDE
504emulator without SHA512 support (-skl switch):
505the program stops, and SDE prints the message:
506 TID 0 SDE-ERROR: Executed instruction not valid for specified chip (SKYLAKE): vsha512msg1
507But we would rather catch that exception than have the process stop.
508Does this PROBE code work on native Windows-arm64 (with/without sha512 hw instructions)?
509Is there a way to fix the problems in the arm64-wine and x64-SDE cases?
510*/
511
512 // printf("\n========== CPU_IsSupported_SHA512 PROBE ========\n");
513 {
514#ifdef __clang_major__
515 #pragma GCC diagnostic ignored "-Wlanguage-extension-token"
516#endif
517 __try
518 {
519#if 0 // 1 : for debug (reduced version to detect sha512)
520 const uint64x2_t a = vdupq_n_u64(1);
521 const uint64x2_t b = vsha512hq_u64(a, a, a);
522 if ((UInt32)vgetq_lane_u64(b, 0) == 0x11800002)
523 return True;
524#else
525 MY_ALIGN(16)
526 UInt64 temp[SHA512_NUM_DIGEST_WORDS + SHA512_NUM_BLOCK_WORDS];
527 memset(temp, 0x5a, sizeof(temp));
528#if 0 && defined(MY_CPU_X86_OR_AMD64)
529 __ud2(); // for debug : that exception is not problem for SDE
530#endif
531#if 1
532 Sha512_UpdateBlocks_HW(temp,
533 (const Byte *)(const void *)(temp + SHA512_NUM_DIGEST_WORDS), 1);
534 // printf("\n==== t = %x\n", (UInt32)temp[0]);
535 if ((UInt32)temp[0] == 0xa33cfdf7)
536 {
537 // printf("\n=== PROBE SHA512: SHA512 supported\n");
538 return True;
539 }
540#endif
541#endif
542 }
543 __except (EXCEPTION_EXECUTE_HANDLER)
544 {
545 // printf("\n==== CPU_IsSupported_SHA512 EXCEPTION_EXECUTE_HANDLER\n");
546 }
547 }
548 return False;
549#else
550 // without SHA512 PROBE code
551 return True;
552#endif
553
554}
555
556#else
557
558BoolInt CPU_IsSupported_SHA512(void)
559{
560 return False;
561}
562
563#endif
564#endif // WIN32 arm64
565
566
567void Sha512Prepare(void)
568{
569#ifdef Z7_COMPILER_SHA512_SUPPORTED
570 SHA512_FUNC_UPDATE_BLOCKS f, f_hw;
571 f = Sha512_UpdateBlocks;
572 f_hw = NULL;
573#ifdef MY_CPU_X86_OR_AMD64
574 if (CPU_IsSupported_SHA512()
575 && CPU_IsSupported_AVX2()
576 )
577#else
578 if (CPU_IsSupported_SHA512())
579#endif
580 {
581 // printf("\n========== HW SHA512 ======== \n");
582 f = f_hw = Sha512_UpdateBlocks_HW;
583 }
584 g_SHA512_FUNC_UPDATE_BLOCKS = f;
585 g_SHA512_FUNC_UPDATE_BLOCKS_HW = f_hw;
586#endif
587}
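
/* Editorial note (not part of the patch): Sha512Prepare() publishes its
   choice through the two function pointers above, and Sha512_Update() /
   Sha512_Final() dispatch through the SHA512_UPDATE_BLOCKS(p) macro, so a
   caller never selects a variant directly. A minimal sketch, assuming the
   Sha512.h API shown below:

     Sha512Prepare();                        // once, at program start
     Sha512_Init(&ctx, SHA512_DIGEST_SIZE);  // then hash as usual

   A fuller runnable example follows the Sha512.h listing below. */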
588
589
590#undef K
591#undef S0
592#undef S1
593#undef s0
594#undef s1
595#undef Ch
596#undef Maj
597#undef W_MAIN
598#undef W_PRE
599#undef w
600#undef blk2_main
601#undef blk2
602#undef T1
603#undef T4
604#undef T8
605#undef R1_PRE
606#undef R1_MAIN
607#undef R2_MAIN
608#undef R4
609#undef R4_PRE
610#undef R4_MAIN
611#undef R8
612#undef R8_PRE
613#undef R8_MAIN
614#undef STEP_PRE
615#undef STEP_MAIN
616#undef Z7_SHA512_BIG_W
617#undef Z7_SHA512_UNROLL
618#undef Z7_COMPILER_SHA512_SUPPORTED
diff --git a/C/Sha512.h b/C/Sha512.h
new file mode 100644
index 0000000..1f3a4d1
--- /dev/null
+++ b/C/Sha512.h
@@ -0,0 +1,86 @@
1/* Sha512.h -- SHA-512 Hash
2: Igor Pavlov : Public domain */
3
4#ifndef ZIP7_INC_SHA512_H
5#define ZIP7_INC_SHA512_H
6
7#include "7zTypes.h"
8
9EXTERN_C_BEGIN
10
11#define SHA512_NUM_BLOCK_WORDS 16
12#define SHA512_NUM_DIGEST_WORDS 8
13
14#define SHA512_BLOCK_SIZE (SHA512_NUM_BLOCK_WORDS * 8)
15#define SHA512_DIGEST_SIZE (SHA512_NUM_DIGEST_WORDS * 8)
16#define SHA512_224_DIGEST_SIZE (224 / 8)
17#define SHA512_256_DIGEST_SIZE (256 / 8)
18#define SHA512_384_DIGEST_SIZE (384 / 8)
19
20typedef void (Z7_FASTCALL *SHA512_FUNC_UPDATE_BLOCKS)(UInt64 state[8], const Byte *data, size_t numBlocks);
21
22/*
23 if (the system supports different SHA512 code implementations)
24 {
25 (CSha512::func_UpdateBlocks) will be used
26 (CSha512::func_UpdateBlocks) can be set by
27 Sha512_Init() - to default (fastest)
28 Sha512_SetFunction() - to any algo
29 }
30 else
31 {
32 (CSha512::func_UpdateBlocks) is ignored.
33 }
34*/
35
36typedef struct
37{
38 union
39 {
40 struct
41 {
42 SHA512_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
43 UInt64 count;
44 } vars;
45 UInt64 _pad_64bit[8];
46 void *_pad_align_ptr[2];
47 } v;
48 UInt64 state[SHA512_NUM_DIGEST_WORDS];
49
50 Byte buffer[SHA512_BLOCK_SIZE];
51} CSha512;
52
53
54#define SHA512_ALGO_DEFAULT 0
55#define SHA512_ALGO_SW 1
56#define SHA512_ALGO_HW 2
57
58/*
59Sha512_SetFunction()
60return:
61 0 - (algo) value is not supported, and func_UpdateBlocks was not changed
62 1 - func_UpdateBlocks was set according to the (algo) value.
63*/
64
65BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo);
66// we support only these (digestSize) values: 224/8, 256/8, 384/8, 512/8
67void Sha512_InitState(CSha512 *p, unsigned digestSize);
68void Sha512_Init(CSha512 *p, unsigned digestSize);
69void Sha512_Update(CSha512 *p, const Byte *data, size_t size);
70void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize);
71
72
73
74
75// void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks);
76
77/*
78call Sha512Prepare() once at program start.
79It prepares all supported implementations and selects the fastest one.
80*/
81
82void Sha512Prepare(void);
83
84EXTERN_C_END
85
86#endif
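
Editorial aside (not part of the patch): a minimal, self-contained usage
sketch of the API declared above. The expected output prefix is the standard
FIPS 180-4 "abc" test vector; the file name and printing code are
illustrative only.

  /* sha512_demo.c -- hash "abc" and print the digest */
  #include <stdio.h>
  #include "Sha512.h"

  int main(void)
  {
    CSha512 ctx;
    Byte digest[SHA512_DIGEST_SIZE];
    unsigned i;
    Sha512Prepare();                       /* once at program start */
    Sha512_Init(&ctx, SHA512_DIGEST_SIZE);
    Sha512_Update(&ctx, (const Byte *)"abc", 3);
    Sha512_Final(&ctx, digest, SHA512_DIGEST_SIZE);
    for (i = 0; i < SHA512_DIGEST_SIZE; i++)
      printf("%02x", digest[i]);
    printf("\n");  /* expected to start: ddaf35a193617aba... */
    return 0;
  }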
diff --git a/C/Sha512Opt.c b/C/Sha512Opt.c
new file mode 100644
index 0000000..3a13868
--- /dev/null
+++ b/C/Sha512Opt.c
@@ -0,0 +1,395 @@
1/* Sha512Opt.c -- SHA-512 optimized code for SHA-512 hardware instructions
2: Igor Pavlov : Public domain */
3
4#include "Precomp.h"
5#include "Compiler.h"
6#include "CpuArch.h"
7
8// #define Z7_USE_HW_SHA_STUB // for debug
9#ifdef MY_CPU_X86_OR_AMD64
10 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) // fix it
11 #define USE_HW_SHA
12 #elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \
13 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \
14 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000)
15 #define USE_HW_SHA
16 #if !defined(__INTEL_COMPILER)
17 // icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
18 #if !defined(__SHA512__) || !defined(__AVX2__)
19 #define ATTRIB_SHA512 __attribute__((__target__("sha512,avx2")))
20 #endif
21 #endif
22 #elif defined(Z7_MSC_VER_ORIGINAL)
23 #if (_MSC_VER >= 1940)
24 #define USE_HW_SHA
25 #else
26 // #define Z7_USE_HW_SHA_STUB
27 #endif
28 #endif
29// #endif // MY_CPU_X86_OR_AMD64
30#ifndef USE_HW_SHA
31 // #define Z7_USE_HW_SHA_STUB // for debug
32#endif
33
34#ifdef USE_HW_SHA
35
36// #pragma message("Sha512 HW")
37
38#include <immintrin.h>
39
40#if defined (__clang__) && defined(_MSC_VER)
41 #if !defined(__AVX__)
42 #include <avxintrin.h>
43 #endif
44 #if !defined(__AVX2__)
45 #include <avx2intrin.h>
46 #endif
47 #if !defined(__SHA512__)
48 #include <sha512intrin.h>
49 #endif
50#else
51
52#endif
53
54/*
55SHA512 uses:
56AVX:
57 _mm256_loadu_si256 (vmovdqu)
58 _mm256_storeu_si256
59 _mm256_set_epi32 (for the mask constant; no dedicated instruction)
60AVX2:
61 _mm256_add_epi64 : vpaddq
62 _mm256_shuffle_epi8 : vpshufb
63 _mm256_shuffle_epi32 : pshufd
64 _mm256_blend_epi32 : vpblendd
65 _mm256_permute4x64_epi64 : vpermq : 3c
66 _mm256_permute2x128_si256: vperm2i128 : 3c
67 _mm256_extracti128_si256 : vextracti128 : 3c
68SHA512:
69 _mm256_sha512*
70*/
71
72// The K array must be aligned to at least 32 bytes.
73// The compiler can see the align attribute and select:
74// vmovdqu - for code without the align attribute
75// vmovdqa - for code with the align attribute
76extern
77MY_ALIGN(64)
78const UInt64 SHA512_K_ARRAY[80];
79#define K SHA512_K_ARRAY
80
81
82#define ADD_EPI64(dest, src) dest = _mm256_add_epi64(dest, src);
83#define SHA512_MSG1(dest, src) dest = _mm256_sha512msg1_epi64(dest, _mm256_extracti128_si256(src, 0));
84#define SHA512_MSG2(dest, src) dest = _mm256_sha512msg2_epi64(dest, src);
85
86#define LOAD_SHUFFLE(m, k) \
87 m = _mm256_loadu_si256((const __m256i *)(const void *)(data + (k) * 32)); \
88 m = _mm256_shuffle_epi8(m, mask); \
89
90#define NNN(m0, m1, m2, m3)
91
92#define SM1(m1, m2, m3, m0) \
93 SHA512_MSG1(m0, m1); \
94
95#define SM2(m2, m3, m0, m1) \
96 ADD_EPI64(m0, _mm256_permute4x64_epi64(_mm256_blend_epi32(m2, m3, 3), 0x39)); \
97 SHA512_MSG2(m0, m3); \
98
99#define RND2(t0, t1, lane) \
100 t0 = _mm256_sha512rnds2_epi64(t0, t1, _mm256_extracti128_si256(msg, lane));
101
102
103
104#define R4(k, m0, m1, m2, m3, OP0, OP1) \
105 msg = _mm256_add_epi64(m0, *(const __m256i *) (const void *) &K[(k) * 4]); \
106 RND2(state0, state1, 0); OP0(m0, m1, m2, m3) \
107 RND2(state1, state0, 1); OP1(m0, m1, m2, m3) \
108
109
110
111
112#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
113 R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
114 R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
115 R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
116 R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \
117
118#define PREPARE_STATE \
119 state0 = _mm256_shuffle_epi32(state0, 0x4e); /* cdab */ \
120 state1 = _mm256_shuffle_epi32(state1, 0x4e); /* ghef */ \
121 tmp = state0; \
122 state0 = _mm256_permute2x128_si256(state0, state1, 0x13); /* cdgh */ \
123 state1 = _mm256_permute2x128_si256(tmp, state1, 2); /* abef */ \
124
125
126void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
127#ifdef ATTRIB_SHA512
128ATTRIB_SHA512
129#endif
130void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
131{
132 const __m256i mask = _mm256_set_epi32(
133 0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607,
134 0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607);
135 __m256i tmp, state0, state1;
136
137 if (numBlocks == 0)
138 return;
139
140 state0 = _mm256_loadu_si256((const __m256i *) (const void *) &state[0]);
141 state1 = _mm256_loadu_si256((const __m256i *) (const void *) &state[4]);
142
143 PREPARE_STATE
144
145 do
146 {
147 __m256i state0_save, state1_save;
148 __m256i m0, m1, m2, m3;
149 __m256i msg;
150 // #define msg tmp
151
152 state0_save = state0;
153 state1_save = state1;
154
155 LOAD_SHUFFLE (m0, 0)
156 LOAD_SHUFFLE (m1, 1)
157 LOAD_SHUFFLE (m2, 2)
158 LOAD_SHUFFLE (m3, 3)
159
160
161
162 R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
163 R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
164 R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
165 R16 ( 3, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
166 R16 ( 4, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )
167 ADD_EPI64(state0, state0_save)
168 ADD_EPI64(state1, state1_save)
169
170 data += 128;
171 }
172 while (--numBlocks);
173
174 PREPARE_STATE
175
176 _mm256_storeu_si256((__m256i *) (void *) &state[0], state0);
177 _mm256_storeu_si256((__m256i *) (void *) &state[4], state1);
178}
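
/* Editorial reference (not part of the patch): each RND2 above
   (vsha512rnds2) advances the state by two FIPS 180-4 rounds. As a scalar
   sketch, one round on the working variables a..h is:

     T1 = h + S1(e) + Ch(e,f,g) + K[t] + W[t];
     T2 = S0(a) + Maj(a,b,c);
     h = g; g = f; f = e; e = d + T1;
     d = c; c = b; b = a; a = T1 + T2;

   with, on 64-bit words:
     S0(x)      = rotr(x,28) ^ rotr(x,34) ^ rotr(x,39)
     S1(x)      = rotr(x,14) ^ rotr(x,18) ^ rotr(x,41)
     Ch(x,y,z)  = (x & y) ^ (~x & z)
     Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z)
*/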
179
180#endif // USE_HW_SHA
181
182// gcc 8.5 also supports sha512, but we also need support in the assembler that gcc invokes
183#elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE)
184
185 #if defined(__ARM_FEATURE_SHA512)
186 #define USE_HW_SHA
187 #else
188 #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \
189 || defined(__GNUC__) && (__GNUC__ >= 9) \
190 ) \
191 || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it
192 #define USE_HW_SHA
193 #endif
194 #endif
195
196#ifdef USE_HW_SHA
197
198// #pragma message("=== Sha512 HW === ")
199
200
201#if defined(__clang__) || defined(__GNUC__)
202#if !defined(__ARM_FEATURE_SHA512)
203// #pragma message("=== we define SHA3 ATTRIB_SHA512 === ")
204#if defined(__clang__)
205 #define ATTRIB_SHA512 __attribute__((__target__("sha3"))) // "armv8.2-a,sha3"
206#else
207 #define ATTRIB_SHA512 __attribute__((__target__("arch=armv8.2-a+sha3")))
208#endif
209#endif
210#endif
211
212
213#if defined(Z7_MSC_VER_ORIGINAL)
214#include <arm64_neon.h>
215#else
216
217#if defined(__clang__) && __clang_major__ < 16
218#if !defined(__ARM_FEATURE_SHA512)
219// #pragma message("=== we set __ARM_FEATURE_SHA512 1 === ")
220 Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
221 #define Z7_ARM_FEATURE_SHA512_WAS_SET 1
222 #define __ARM_FEATURE_SHA512 1
223 Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
224#endif
225#endif // clang
226
227#include <arm_neon.h>
228
229#if defined(Z7_ARM_FEATURE_SHA512_WAS_SET) && \
230 defined(__ARM_FEATURE_SHA512)
231 Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
232 #undef __ARM_FEATURE_SHA512
233 #undef Z7_ARM_FEATURE_SHA512_WAS_SET
234 Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
235// #pragma message("=== we undefine __ARM_FEATURE_SHA512 === ")
236#endif
237
238#endif // Z7_MSC_VER_ORIGINAL
239
240typedef uint64x2_t v128_64;
241// typedef __n128 v128_64; // MSVC
242
243#ifdef MY_CPU_BE
244 #define MY_rev64_for_LE(x) x
245#else
246 #define MY_rev64_for_LE(x) vrev64q_u8(x)
247#endif
248
249#define LOAD_128_64(_p) vld1q_u64(_p)
250#define LOAD_128_8(_p) vld1q_u8 (_p)
251#define STORE_128_64(_p, _v) vst1q_u64(_p, _v)
252
253#define LOAD_SHUFFLE(m, k) \
254 m = vreinterpretq_u64_u8( \
255 MY_rev64_for_LE( \
256 LOAD_128_8(data + (k) * 16))); \
257
258// The K array must be aligned to at least 16 bytes.
259extern
260MY_ALIGN(64)
261const UInt64 SHA512_K_ARRAY[80];
262#define K SHA512_K_ARRAY
263
264#define NN(m0, m1, m4, m5, m7)
265#define SM(m0, m1, m4, m5, m7) \
266 m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1));
267
268#define R2(k, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP) \
269 OP(m0, m1, m4, m5, m7) \
270 t = vaddq_u64(m0, vld1q_u64(k)); \
271 t = vaddq_u64(vextq_u64(t, t, 1), a3); \
272 t = vsha512hq_u64(t, vextq_u64(a2, a3, 1), vextq_u64(a1, a2, 1)); \
273 a3 = vsha512h2q_u64(t, a1, a0); \
274 a1 = vaddq_u64(a1, t); \
275
276#define R8(k, m0,m1,m2,m3,m4,m5,m6,m7, OP) \
277 R2 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP ) \
278 R2 ( (k)+1*2, m1,m2,m3,m4,m5,m6,m7,m0, a3,a0,a1,a2, OP ) \
279 R2 ( (k)+2*2, m2,m3,m4,m5,m6,m7,m0,m1, a2,a3,a0,a1, OP ) \
280 R2 ( (k)+3*2, m3,m4,m5,m6,m7,m0,m1,m2, a1,a2,a3,a0, OP ) \
281
282#define R16(k, OP) \
283 R8 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, OP ) \
284 R8 ( (k)+4*2, m4,m5,m6,m7,m0,m1,m2,m3, OP ) \
285
286
287void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
288#ifdef ATTRIB_SHA512
289ATTRIB_SHA512
290#endif
291void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
292{
293 v128_64 a0, a1, a2, a3;
294
295 if (numBlocks == 0)
296 return;
297 a0 = LOAD_128_64(&state[0]);
298 a1 = LOAD_128_64(&state[2]);
299 a2 = LOAD_128_64(&state[4]);
300 a3 = LOAD_128_64(&state[6]);
301 do
302 {
303 v128_64 a0_save, a1_save, a2_save, a3_save;
304 v128_64 m0, m1, m2, m3, m4, m5, m6, m7;
305 v128_64 t;
306 unsigned i;
307 const UInt64 *k_ptr;
308
309 LOAD_SHUFFLE (m0, 0)
310 LOAD_SHUFFLE (m1, 1)
311 LOAD_SHUFFLE (m2, 2)
312 LOAD_SHUFFLE (m3, 3)
313 LOAD_SHUFFLE (m4, 4)
314 LOAD_SHUFFLE (m5, 5)
315 LOAD_SHUFFLE (m6, 6)
316 LOAD_SHUFFLE (m7, 7)
317
318 a0_save = a0;
319 a1_save = a1;
320 a2_save = a2;
321 a3_save = a3;
322
323 R16 ( K, NN )
324 k_ptr = K + 16;
325 for (i = 0; i < 4; i++)
326 {
327 R16 ( k_ptr, SM )
328 k_ptr += 16;
329 }
330
331 a0 = vaddq_u64(a0, a0_save);
332 a1 = vaddq_u64(a1, a1_save);
333 a2 = vaddq_u64(a2, a2_save);
334 a3 = vaddq_u64(a3, a3_save);
335
336 data += 128;
337 }
338 while (--numBlocks);
339
340 STORE_128_64(&state[0], a0);
341 STORE_128_64(&state[2], a1);
342 STORE_128_64(&state[4], a2);
343 STORE_128_64(&state[6], a3);
344}
345
346#endif // USE_HW_SHA
347
348#endif // MY_CPU_ARM64
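
/* Editorial debug sketch (not part of the patch): where both the portable
   and the HW kernels are compiled in, they can be cross-checked on arbitrary
   input. The function name is hypothetical, and <string.h> is assumed. */
#if 0
static int Sha512_SelfCheck(void)
{
  UInt64 s_sw[8], s_hw[8];
  Byte block[128];                    /* SHA512_BLOCK_SIZE */
  unsigned i;
  for (i = 0; i < 8; i++)
    s_sw[i] = s_hw[i] = (UInt64)0x0123456789abcdefULL + i;  /* arbitrary state */
  memset(block, 0x5a, sizeof(block));                       /* arbitrary data  */
  Sha512_UpdateBlocks   (s_sw, block, 1);
  Sha512_UpdateBlocks_HW(s_hw, block, 1);
  return memcmp(s_sw, s_hw, sizeof(s_sw)) == 0;  /* 1 if the kernels agree */
}
#endif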
349
350
351#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
352// #error Stop_Compiling_UNSUPPORTED_SHA
353// #include <stdlib.h>
354// We can compile this file with another C compiler,
355// or we can compile an asm version,
356// so that real code is generated instead of this stub function.
357// #include "Sha512.h"
358// #if defined(_MSC_VER)
359#pragma message("Sha512 HW-SW stub was used")
360// #endif
361void Z7_FASTCALL Sha512_UpdateBlocks (UInt64 state[8], const Byte *data, size_t numBlocks);
362void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
363void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
364{
365 Sha512_UpdateBlocks(state, data, numBlocks);
366 /*
367 UNUSED_VAR(state);
368 UNUSED_VAR(data);
369 UNUSED_VAR(numBlocks);
370 exit(1);
371 return;
372 */
373}
374#endif
375
376
377#undef K
378#undef RND2
379#undef MY_rev64_for_LE
380#undef NN
381#undef NNN
382#undef LOAD_128
383#undef STORE_128
384#undef LOAD_SHUFFLE
385#undef SM1
386#undef SM2
387#undef SM
388#undef R2
389#undef R4
390#undef R16
391#undef PREPARE_STATE
392#undef USE_HW_SHA
393#undef ATTRIB_SHA512
394#undef USE_VER_MIN
395#undef Z7_USE_HW_SHA_STUB