diff options
author | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2024-11-29 00:00:00 +0000 |
---|---|---|
committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2024-11-30 15:27:15 +0500 |
commit | e5431fa6f5505e385c6f9367260717e9c47dc2ee (patch) | |
tree | 4cd2c2c3b225b48c8e7053432c41d7b6b6a3d5f8 /C | |
parent | e008ce3976c087bfd21344af8f00a23cf69d4174 (diff) | |
download | 7zip-main.tar.gz 7zip-main.tar.bz2 7zip-main.zip |
Diffstat (limited to 'C')
-rw-r--r-- | C/7zDec.c | 5 | ||||
-rw-r--r-- | C/7zVersion.h | 6 | ||||
-rw-r--r-- | C/AesOpt.c | 233 | ||||
-rw-r--r-- | C/CpuArch.c | 109 | ||||
-rw-r--r-- | C/CpuArch.h | 33 | ||||
-rw-r--r-- | C/LzmaEnc.c | 16 | ||||
-rw-r--r-- | C/Md5.c | 206 | ||||
-rw-r--r-- | C/Md5.h | 34 | ||||
-rw-r--r-- | C/Sha1.c | 125 | ||||
-rw-r--r-- | C/Sha1.h | 18 | ||||
-rw-r--r-- | C/Sha1Opt.c | 146 | ||||
-rw-r--r-- | C/Sha256.c | 162 | ||||
-rw-r--r-- | C/Sha256.h | 18 | ||||
-rw-r--r-- | C/Sha256Opt.c | 172 | ||||
-rw-r--r-- | C/Sha3.c | 359 | ||||
-rw-r--r-- | C/Sha3.h | 36 | ||||
-rw-r--r-- | C/Sha512.c | 618 | ||||
-rw-r--r-- | C/Sha512.h | 86 | ||||
-rw-r--r-- | C/Sha512Opt.c | 395 |
19 files changed, 2273 insertions(+), 504 deletions(-)
diff --git a/C/7zDec.c b/C/7zDec.c --- a/C/7zDec.c +++ b/C/7zDec.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* 7zDec.c -- Decoding from 7z folder | 1 | /* 7zDec.c -- Decoding from 7z folder |
2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -312,8 +312,9 @@ static BoolInt IS_MAIN_METHOD(UInt32 m) | |||
312 | case k_PPMD: | 312 | case k_PPMD: |
313 | #endif | 313 | #endif |
314 | return True; | 314 | return True; |
315 | default: | ||
316 | return False; | ||
315 | } | 317 | } |
316 | return False; | ||
317 | } | 318 | } |
318 | 319 | ||
319 | static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c) | 320 | static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c) |
diff --git a/C/7zVersion.h b/C/7zVersion.h index 1ddef80..e82ba0b 100644 --- a/C/7zVersion.h +++ b/C/7zVersion.h | |||
@@ -1,7 +1,7 @@ | |||
1 | #define MY_VER_MAJOR 24 | 1 | #define MY_VER_MAJOR 24 |
2 | #define MY_VER_MINOR 8 | 2 | #define MY_VER_MINOR 9 |
3 | #define MY_VER_BUILD 0 | 3 | #define MY_VER_BUILD 0 |
4 | #define MY_VERSION_NUMBERS "24.08" | 4 | #define MY_VERSION_NUMBERS "24.09" |
5 | #define MY_VERSION MY_VERSION_NUMBERS | 5 | #define MY_VERSION MY_VERSION_NUMBERS |
6 | 6 | ||
7 | #ifdef MY_CPU_NAME | 7 | #ifdef MY_CPU_NAME |
@@ -10,7 +10,7 @@ | |||
10 | #define MY_VERSION_CPU MY_VERSION | 10 | #define MY_VERSION_CPU MY_VERSION |
11 | #endif | 11 | #endif |
12 | 12 | ||
13 | #define MY_DATE "2024-08-11" | 13 | #define MY_DATE "2024-11-29" |
14 | #undef MY_COPYRIGHT | 14 | #undef MY_COPYRIGHT |
15 | #undef MY_VERSION_COPYRIGHT_DATE | 15 | #undef MY_VERSION_COPYRIGHT_DATE |
16 | #define MY_AUTHOR_NAME "Igor Pavlov" | 16 | #define MY_AUTHOR_NAME "Igor Pavlov" |
diff --git a/C/AesOpt.c b/C/AesOpt.c --- a/C/AesOpt.c +++ b/C/AesOpt.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions | 1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions |
2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -80,19 +80,39 @@ AES_FUNC_START (name) | |||
80 | 80 | ||
81 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) | 81 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) |
82 | 82 | ||
83 | #if 1 | ||
84 | // use aligned SSE load/store for data. | ||
85 | // It is required for our Aes functions, that data is aligned for 16-bytes. | ||
86 | // So we can use this branch of code. | ||
87 | // and compiler can use fused load-op SSE instructions: | ||
88 | // xorps xmm0, XMMWORD PTR [rdx] | ||
89 | #define LOAD_128(pp) (*(__m128i *)(void *)(pp)) | ||
90 | #define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v | ||
91 | // use aligned SSE load/store for data. Alternative code with direct access | ||
92 | // #define LOAD_128(pp) _mm_load_si128(pp) | ||
93 | // #define STORE_128(pp, _v) _mm_store_si128(pp, _v) | ||
94 | #else | ||
95 | // use unaligned load/store for data: movdqu XMMWORD PTR [rdx] | ||
96 | #define LOAD_128(pp) _mm_loadu_si128(pp) | ||
97 | #define STORE_128(pp, _v) _mm_storeu_si128(pp, _v) | ||
98 | #endif | ||
99 | |||
83 | AES_FUNC_START2 (AesCbc_Encode_HW) | 100 | AES_FUNC_START2 (AesCbc_Encode_HW) |
84 | { | 101 | { |
102 | if (numBlocks == 0) | ||
103 | return; | ||
104 | { | ||
85 | __m128i *p = (__m128i *)(void *)ivAes; | 105 | __m128i *p = (__m128i *)(void *)ivAes; |
86 | __m128i *data = (__m128i *)(void *)data8; | 106 | __m128i *data = (__m128i *)(void *)data8; |
87 | __m128i m = *p; | 107 | __m128i m = *p; |
88 | const __m128i k0 = p[2]; | 108 | const __m128i k0 = p[2]; |
89 | const __m128i k1 = p[3]; | 109 | const __m128i k1 = p[3]; |
90 | const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; | 110 | const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; |
91 | for (; numBlocks != 0; numBlocks--, data++) | 111 | do |
92 | { | 112 | { |
93 | UInt32 r = numRounds2; | 113 | UInt32 r = numRounds2; |
94 | const __m128i *w = p + 4; | 114 | const __m128i *w = p + 4; |
95 | __m128i temp = *data; | 115 | __m128i temp = LOAD_128(data); |
96 | MM_XOR (temp, k0) | 116 | MM_XOR (temp, k0) |
97 | MM_XOR (m, temp) | 117 | MM_XOR (m, temp) |
98 | MM_OP_m (_mm_aesenc_si128, k1) | 118 | MM_OP_m (_mm_aesenc_si128, k1) |
@@ -104,9 +124,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
104 | } | 124 | } |
105 | while (--r); | 125 | while (--r); |
106 | MM_OP_m (_mm_aesenclast_si128, w[0]) | 126 | MM_OP_m (_mm_aesenclast_si128, w[0]) |
107 | *data = m; | 127 | STORE_128(data, m); |
128 | data++; | ||
108 | } | 129 | } |
130 | while (--numBlocks); | ||
109 | *p = m; | 131 | *p = m; |
132 | } | ||
110 | } | 133 | } |
111 | 134 | ||
112 | 135 | ||
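For context on the LOAD_128/STORE_128 choice above: because the Aes functions guarantee 16-byte alignment of (data), the plain-dereference form lets the compiler fold the load into the consuming instruction. A minimal standalone sketch of the two variants (helper names are illustrative, not part of the patch):

#include <emmintrin.h>  // SSE2

// Aligned form: the dereference is an ordinary 16-byte load that the
// compiler may fuse into a memory operand, e.g. "xorps xmm0, [rdx]".
// Only legal because the caller guarantees 16-byte alignment.
static __m128i xor_block_aligned(const void *p, __m128i k)
{
  return _mm_xor_si128(*(const __m128i *)p, k);
}

// Unaligned form: movdqu accepts any address, but on pre-AVX targets it
// cannot be folded into the consuming instruction as a memory operand.
static __m128i xor_block_unaligned(const void *p, __m128i k)
{
  return _mm_xor_si128(_mm_loadu_si128((const __m128i *)p), k);
}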
@@ -139,12 +162,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
139 | 162 | ||
140 | #define WOP(op) op (m0, 0) WOP_M1(op) | 163 | #define WOP(op) op (m0, 0) WOP_M1(op) |
141 | 164 | ||
142 | |||
143 | #define DECLARE_VAR(reg, ii) __m128i reg; | 165 | #define DECLARE_VAR(reg, ii) __m128i reg; |
144 | #define LOAD_data( reg, ii) reg = data[ii]; | 166 | #define LOAD_data_ii(ii) LOAD_128(data + (ii)) |
145 | #define STORE_data( reg, ii) data[ii] = reg; | 167 | #define LOAD_data( reg, ii) reg = LOAD_data_ii(ii); |
168 | #define STORE_data( reg, ii) STORE_128(data + (ii), reg); | ||
146 | #if (NUM_WAYS > 1) | 169 | #if (NUM_WAYS > 1) |
147 | #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) | 170 | #define XOR_data_M1(reg, ii) MM_XOR (reg, LOAD_128(data + (ii- 1))) |
148 | #endif | 171 | #endif |
149 | 172 | ||
150 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); | 173 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); |
@@ -156,25 +179,22 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
156 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) | 179 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) |
157 | 180 | ||
158 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; | 181 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; |
159 | #define CTR_END( reg, ii) MM_XOR (data[ii], reg) | 182 | #define CTR_END( reg, ii) STORE_128(data + (ii), _mm_xor_si128(reg, \ |
160 | 183 | LOAD_128 (data + (ii)))); | |
161 | #define WOP_KEY(op, n) { \ | 184 | #define WOP_KEY(op, n) { \ |
162 | const __m128i key = w[n]; \ | 185 | const __m128i key = w[n]; \ |
163 | WOP(op); } | 186 | WOP(op) } |
164 | |||
165 | 187 | ||
166 | #define WIDE_LOOP_START \ | 188 | #define WIDE_LOOP_START \ |
167 | dataEnd = data + numBlocks; \ | 189 | dataEnd = data + numBlocks; \ |
168 | if (numBlocks >= NUM_WAYS) \ | 190 | if (numBlocks >= NUM_WAYS) \ |
169 | { dataEnd -= NUM_WAYS; do { \ | 191 | { dataEnd -= NUM_WAYS; do { \ |
170 | 192 | ||
171 | |||
172 | #define WIDE_LOOP_END \ | 193 | #define WIDE_LOOP_END \ |
173 | data += NUM_WAYS; \ | 194 | data += NUM_WAYS; \ |
174 | } while (data <= dataEnd); \ | 195 | } while (data <= dataEnd); \ |
175 | dataEnd += NUM_WAYS; } \ | 196 | dataEnd += NUM_WAYS; } \ |
176 | 197 | ||
177 | |||
178 | #define SINGLE_LOOP \ | 198 | #define SINGLE_LOOP \ |
179 | for (; data < dataEnd; data++) | 199 | for (; data < dataEnd; data++) |
180 | 200 | ||
@@ -184,54 +204,73 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
184 | 204 | ||
185 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) | 205 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) |
186 | #define AVX_DECLARE_VAR(reg, ii) __m256i reg; | 206 | #define AVX_DECLARE_VAR(reg, ii) __m256i reg; |
187 | #define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; | 207 | |
188 | #define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; | 208 | #if 1 |
209 | // use unaligned AVX load/store for data. | ||
210 | // It is required for our Aes functions, that data is aligned for 16-bytes. | ||
211 | // But we need 32-bytes reading. | ||
212 | // So we use intrinsics for unaligned AVX load/store. | ||
213 | // notes for _mm256_storeu_si256: | ||
214 | // msvc2022: uses vmovdqu and keeps the order of instruction sequence. | ||
215 | // new gcc11 uses vmovdqu | ||
216 | // old gcc9 could use pair of instructions: | ||
217 | // vmovups %xmm7, -224(%rax) | ||
218 | // vextracti128 $0x1, %ymm7, -208(%rax) | ||
219 | #define AVX_LOAD(p) _mm256_loadu_si256((const __m256i *)(const void *)(p)) | ||
220 | #define AVX_STORE(p, _v) _mm256_storeu_si256((__m256i *)(void *)(p), _v); | ||
221 | #else | ||
222 | // use aligned AVX load/store for data. | ||
223 | // for debug: we can use this branch, if we are sure that data is aligned for 32-bytes. | ||
224 | // msvc2022 uses vmovdqu still | ||
225 | // gcc uses vmovdqa (that requires 32-bytes alignment) | ||
226 | #define AVX_LOAD(p) (*(const __m256i *)(const void *)(p)) | ||
227 | #define AVX_STORE(p, _v) (*(__m256i *)(void *)(p)) = _v; | ||
228 | #endif | ||
229 | |||
230 | #define AVX_LOAD_data( reg, ii) reg = AVX_LOAD((const __m256i *)(const void *)data + (ii)); | ||
231 | #define AVX_STORE_data( reg, ii) AVX_STORE((__m256i *)(void *)data + (ii), reg) | ||
189 | /* | 232 | /* |
190 | AVX_XOR_data_M1() needs unaligned memory load | 233 | AVX_XOR_data_M1() needs unaligned memory load, even if (data) |
191 | if (we don't use _mm256_loadu_si256() here) | 234 | is aligned for 256-bits, because we read 32-bytes chunk that |
192 | { | 235 | crosses (data) position: from (data - 16bytes) to (data + 16bytes). |
193 | Most compilers with enabled optimizations generate fused AVX (LOAD + OP) | ||
194 | instruction that can load unaligned data. | ||
195 | But GCC and CLANG without -O2 or -O1 optimizations can generate separated | ||
196 | LOAD-ALIGNED (vmovdqa) instruction that will fail on execution. | ||
197 | } | ||
198 | Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here. | ||
199 | v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler. | ||
200 | */ | 236 | */ |
201 | #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii]))) | 237 | #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii))) |
202 | // for debug only: the following code will fail on execution, if compiled by some compilers: | ||
203 | // #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) | ||
204 | 238 | ||
205 | #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) | 239 | #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) |
206 | #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) | 240 | #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) |
207 | #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) | 241 | #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) |
208 | #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) | 242 | #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) |
209 | #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) | 243 | #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) |
210 | #define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); | 244 | #define AVX_CTR_START(reg, ii) \ |
211 | #define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) | 245 | MM_OP (_mm256_add_epi64, ctr2, two) \ |
246 | reg = _mm256_xor_si256(ctr2, key); | ||
247 | |||
248 | #define AVX_CTR_END(reg, ii) \ | ||
249 | AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \ | ||
250 | AVX_LOAD ((__m256i *)(void *)data + (ii)))); | ||
251 | |||
212 | #define AVX_WOP_KEY(op, n) { \ | 252 | #define AVX_WOP_KEY(op, n) { \ |
213 | const __m256i key = w[n]; \ | 253 | const __m256i key = w[n]; \ |
214 | WOP(op); } | 254 | WOP(op) } |
215 | 255 | ||
216 | #define NUM_AES_KEYS_MAX 15 | 256 | #define NUM_AES_KEYS_MAX 15 |
217 | 257 | ||
218 | #define WIDE_LOOP_START_AVX(OP) \ | 258 | #define WIDE_LOOP_START_AVX(OP) \ |
219 | dataEnd = data + numBlocks; \ | 259 | dataEnd = data + numBlocks; \ |
220 | if (numBlocks >= NUM_WAYS * 2) \ | 260 | if (numBlocks >= NUM_WAYS * 2) \ |
221 | { __m256i keys[NUM_AES_KEYS_MAX]; \ | 261 | { __m256i keys[NUM_AES_KEYS_MAX]; \ |
222 | UInt32 ii; \ | 262 | OP \ |
223 | OP \ | 263 | { UInt32 ii; for (ii = 0; ii < numRounds; ii++) \ |
224 | for (ii = 0; ii < numRounds; ii++) \ | 264 | keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \ |
225 | keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \ | 265 | dataEnd -= NUM_WAYS * 2; \ |
226 | dataEnd -= NUM_WAYS * 2; do { \ | 266 | do { \ |
227 | |||
228 | 267 | ||
229 | #define WIDE_LOOP_END_AVX(OP) \ | 268 | #define WIDE_LOOP_END_AVX(OP) \ |
230 | data += NUM_WAYS * 2; \ | 269 | data += NUM_WAYS * 2; \ |
231 | } while (data <= dataEnd); \ | 270 | } while (data <= dataEnd); \ |
232 | dataEnd += NUM_WAYS * 2; \ | 271 | dataEnd += NUM_WAYS * 2; \ |
233 | OP \ | 272 | OP \ |
234 | _mm256_zeroupper(); \ | 273 | _mm256_zeroupper(); \ |
235 | } \ | 274 | } \ |
236 | 275 | ||
237 | /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, | 276 | /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, |
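A note on why AVX_XOR_data_M1 must stay unaligned even though (data) is 16-byte aligned: the wide CBC loop XORs each 32-byte pair against the pair shifted back by one 16-byte block, so every load straddles two 32-byte slots. A sketch of that access pattern (helper name is illustrative):

#include <immintrin.h>  // AVX/AVX2

// Reads 32 bytes starting at (data - 16 bytes): at best 16-byte aligned,
// never 32-byte aligned, so vmovdqu (the unaligned intrinsic) is required.
static __m256i load_prev_pair(const __m128i *data, size_t ii)
{
  return _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + ii);
}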
@@ -246,21 +285,20 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
246 | __m128i *p = (__m128i *)(void *)ivAes; | 285 | __m128i *p = (__m128i *)(void *)ivAes; |
247 | __m128i *data = (__m128i *)(void *)data8; | 286 | __m128i *data = (__m128i *)(void *)data8; |
248 | __m128i iv = *p; | 287 | __m128i iv = *p; |
249 | const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1; | 288 | const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1; |
250 | const __m128i *dataEnd; | 289 | const __m128i *dataEnd; |
251 | p += 2; | 290 | p += 2; |
252 | 291 | ||
253 | WIDE_LOOP_START | 292 | WIDE_LOOP_START |
254 | { | 293 | { |
255 | const __m128i *w = wStart; | 294 | const __m128i *w = wStart; |
256 | |||
257 | WOP (DECLARE_VAR) | 295 | WOP (DECLARE_VAR) |
258 | WOP (LOAD_data) | 296 | WOP (LOAD_data) |
259 | WOP_KEY (AES_XOR, 1) | 297 | WOP_KEY (AES_XOR, 1) |
260 | |||
261 | do | 298 | do |
262 | { | 299 | { |
263 | WOP_KEY (AES_DEC, 0) | 300 | WOP_KEY (AES_DEC, 0) |
301 | |||
264 | w--; | 302 | w--; |
265 | } | 303 | } |
266 | while (w != p); | 304 | while (w != p); |
@@ -268,7 +306,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
268 | 306 | ||
269 | MM_XOR (m0, iv) | 307 | MM_XOR (m0, iv) |
270 | WOP_M1 (XOR_data_M1) | 308 | WOP_M1 (XOR_data_M1) |
271 | iv = data[NUM_WAYS - 1]; | 309 | LOAD_data(iv, NUM_WAYS - 1) |
272 | WOP (STORE_data) | 310 | WOP (STORE_data) |
273 | } | 311 | } |
274 | WIDE_LOOP_END | 312 | WIDE_LOOP_END |
@@ -276,7 +314,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
276 | SINGLE_LOOP | 314 | SINGLE_LOOP |
277 | { | 315 | { |
278 | const __m128i *w = wStart - 1; | 316 | const __m128i *w = wStart - 1; |
279 | __m128i m = _mm_xor_si128 (w[2], *data); | 317 | __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); |
318 | |||
280 | do | 319 | do |
281 | { | 320 | { |
282 | MM_OP_m (_mm_aesdec_si128, w[1]) | 321 | MM_OP_m (_mm_aesdec_si128, w[1]) |
@@ -286,10 +325,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
286 | while (w != p); | 325 | while (w != p); |
287 | MM_OP_m (_mm_aesdec_si128, w[1]) | 326 | MM_OP_m (_mm_aesdec_si128, w[1]) |
288 | MM_OP_m (_mm_aesdeclast_si128, w[0]) | 327 | MM_OP_m (_mm_aesdeclast_si128, w[0]) |
289 | |||
290 | MM_XOR (m, iv) | 328 | MM_XOR (m, iv) |
291 | iv = *data; | 329 | LOAD_data(iv, 0) |
292 | *data = m; | 330 | STORE_data(m, 0) |
293 | } | 331 | } |
294 | 332 | ||
295 | p[-2] = iv; | 333 | p[-2] = iv; |
@@ -301,9 +339,9 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
301 | __m128i *p = (__m128i *)(void *)ivAes; | 339 | __m128i *p = (__m128i *)(void *)ivAes; |
302 | __m128i *data = (__m128i *)(void *)data8; | 340 | __m128i *data = (__m128i *)(void *)data8; |
303 | __m128i ctr = *p; | 341 | __m128i ctr = *p; |
304 | UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; | 342 | const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; |
305 | const __m128i *dataEnd; | 343 | const __m128i *dataEnd; |
306 | __m128i one = _mm_cvtsi32_si128(1); | 344 | const __m128i one = _mm_cvtsi32_si128(1); |
307 | 345 | ||
308 | p += 2; | 346 | p += 2; |
309 | 347 | ||
@@ -322,7 +360,6 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
322 | } | 360 | } |
323 | while (--r); | 361 | while (--r); |
324 | WOP_KEY (AES_ENC_LAST, 0) | 362 | WOP_KEY (AES_ENC_LAST, 0) |
325 | |||
326 | WOP (CTR_END) | 363 | WOP (CTR_END) |
327 | } | 364 | } |
328 | WIDE_LOOP_END | 365 | WIDE_LOOP_END |
@@ -344,7 +381,7 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
344 | while (--numRounds2); | 381 | while (--numRounds2); |
345 | MM_OP_m (_mm_aesenc_si128, w[0]) | 382 | MM_OP_m (_mm_aesenc_si128, w[0]) |
346 | MM_OP_m (_mm_aesenclast_si128, w[1]) | 383 | MM_OP_m (_mm_aesenclast_si128, w[1]) |
347 | MM_XOR (*data, m) | 384 | CTR_END (m, 0) |
348 | } | 385 | } |
349 | 386 | ||
350 | p[-2] = ctr; | 387 | p[-2] = ctr; |
@@ -421,7 +458,7 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) | |||
421 | __m128i *data = (__m128i *)(void *)data8; | 458 | __m128i *data = (__m128i *)(void *)data8; |
422 | __m128i iv = *p; | 459 | __m128i iv = *p; |
423 | const __m128i *dataEnd; | 460 | const __m128i *dataEnd; |
424 | UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; | 461 | const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; |
425 | p += 2; | 462 | p += 2; |
426 | 463 | ||
427 | WIDE_LOOP_START_AVX(;) | 464 | WIDE_LOOP_START_AVX(;) |
@@ -440,17 +477,17 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) | |||
440 | while (w != keys); | 477 | while (w != keys); |
441 | AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) | 478 | AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) |
442 | 479 | ||
443 | AVX_XOR (m0, _mm256_setr_m128i(iv, data[0])) | 480 | AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0))) |
444 | WOP_M1 (AVX_XOR_data_M1) | 481 | WOP_M1 (AVX_XOR_data_M1) |
445 | iv = data[NUM_WAYS * 2 - 1]; | 482 | LOAD_data (iv, NUM_WAYS * 2 - 1) |
446 | WOP (AVX_STORE_data) | 483 | WOP (AVX_STORE_data) |
447 | } | 484 | } |
448 | WIDE_LOOP_END_AVX(;) | 485 | WIDE_LOOP_END_AVX(;) |
449 | 486 | ||
450 | SINGLE_LOOP | 487 | SINGLE_LOOP |
451 | { | 488 | { |
452 | const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3; | 489 | const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2; |
453 | __m128i m = _mm_xor_si128 (w[2], *data); | 490 | __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); |
454 | do | 491 | do |
455 | { | 492 | { |
456 | MM_OP_m (_mm_aesdec_si128, w[1]) | 493 | MM_OP_m (_mm_aesdec_si128, w[1]) |
@@ -462,8 +499,8 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) | |||
462 | MM_OP_m (_mm_aesdeclast_si128, w[0]) | 499 | MM_OP_m (_mm_aesdeclast_si128, w[0]) |
463 | 500 | ||
464 | MM_XOR (m, iv) | 501 | MM_XOR (m, iv) |
465 | iv = *data; | 502 | LOAD_data(iv, 0) |
466 | *data = m; | 503 | STORE_data(m, 0) |
467 | } | 504 | } |
468 | 505 | ||
469 | p[-2] = iv; | 506 | p[-2] = iv; |
@@ -493,9 +530,9 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
493 | __m128i *p = (__m128i *)(void *)ivAes; | 530 | __m128i *p = (__m128i *)(void *)ivAes; |
494 | __m128i *data = (__m128i *)(void *)data8; | 531 | __m128i *data = (__m128i *)(void *)data8; |
495 | __m128i ctr = *p; | 532 | __m128i ctr = *p; |
496 | UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; | 533 | const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; |
497 | const __m128i *dataEnd; | 534 | const __m128i *dataEnd; |
498 | __m128i one = _mm_cvtsi32_si128(1); | 535 | const __m128i one = _mm_cvtsi32_si128(1); |
499 | __m256i ctr2, two; | 536 | __m256i ctr2, two; |
500 | p += 2; | 537 | p += 2; |
501 | 538 | ||
@@ -536,7 +573,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
536 | while (--numRounds2); | 573 | while (--numRounds2); |
537 | MM_OP_m (_mm_aesenc_si128, w[0]) | 574 | MM_OP_m (_mm_aesenc_si128, w[0]) |
538 | MM_OP_m (_mm_aesenclast_si128, w[1]) | 575 | MM_OP_m (_mm_aesenclast_si128, w[1]) |
539 | MM_XOR (*data, m) | 576 | CTR_END (m, 0) |
540 | } | 577 | } |
541 | 578 | ||
542 | p[-2] = ctr; | 579 | p[-2] = ctr; |
@@ -731,9 +768,14 @@ AES_FUNC_START (name) | |||
731 | 768 | ||
732 | AES_FUNC_START2 (AesCbc_Encode_HW) | 769 | AES_FUNC_START2 (AesCbc_Encode_HW) |
733 | { | 770 | { |
734 | v128 * const p = (v128*)(void*)ivAes; | 771 | if (numBlocks == 0) |
735 | v128 *data = (v128*)(void*)data8; | 772 | return; |
773 | { | ||
774 | v128 * const p = (v128 *)(void *)ivAes; | ||
775 | v128 *data = (v128 *)(void *)data8; | ||
736 | v128 m = *p; | 776 | v128 m = *p; |
777 | const UInt32 numRounds2 = *(const UInt32 *)(p + 1); | ||
778 | const v128 *w = p + (size_t)numRounds2 * 2; | ||
737 | const v128 k0 = p[2]; | 779 | const v128 k0 = p[2]; |
738 | const v128 k1 = p[3]; | 780 | const v128 k1 = p[3]; |
739 | const v128 k2 = p[4]; | 781 | const v128 k2 = p[4]; |
@@ -744,11 +786,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
744 | const v128 k7 = p[9]; | 786 | const v128 k7 = p[9]; |
745 | const v128 k8 = p[10]; | 787 | const v128 k8 = p[10]; |
746 | const v128 k9 = p[11]; | 788 | const v128 k9 = p[11]; |
747 | const UInt32 numRounds2 = *(const UInt32 *)(p + 1); | 789 | const v128 k_z4 = w[-2]; |
748 | const v128 *w = p + ((size_t)numRounds2 * 2); | 790 | const v128 k_z3 = w[-1]; |
791 | const v128 k_z2 = w[0]; | ||
749 | const v128 k_z1 = w[1]; | 792 | const v128 k_z1 = w[1]; |
750 | const v128 k_z0 = w[2]; | 793 | const v128 k_z0 = w[2]; |
751 | for (; numBlocks != 0; numBlocks--, data++) | 794 | // we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle, |
795 | // because gcc/clang compilers are not good for that optimization. | ||
796 | do | ||
752 | { | 797 | { |
753 | MM_XOR_m (*data) | 798 | MM_XOR_m (*data) |
754 | AES_E_MC_m (k0) | 799 | AES_E_MC_m (k0) |
@@ -757,24 +802,26 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
757 | AES_E_MC_m (k3) | 802 | AES_E_MC_m (k3) |
758 | AES_E_MC_m (k4) | 803 | AES_E_MC_m (k4) |
759 | AES_E_MC_m (k5) | 804 | AES_E_MC_m (k5) |
760 | AES_E_MC_m (k6) | ||
761 | AES_E_MC_m (k7) | ||
762 | AES_E_MC_m (k8) | ||
763 | if (numRounds2 >= 6) | 805 | if (numRounds2 >= 6) |
764 | { | 806 | { |
765 | AES_E_MC_m (k9) | 807 | AES_E_MC_m (k6) |
766 | AES_E_MC_m (p[12]) | 808 | AES_E_MC_m (k7) |
767 | if (numRounds2 != 6) | 809 | if (numRounds2 != 6) |
768 | { | 810 | { |
769 | AES_E_MC_m (p[13]) | 811 | AES_E_MC_m (k8) |
770 | AES_E_MC_m (p[14]) | 812 | AES_E_MC_m (k9) |
771 | } | 813 | } |
772 | } | 814 | } |
773 | AES_E_m (k_z1) | 815 | AES_E_MC_m (k_z4) |
774 | MM_XOR_m (k_z0) | 816 | AES_E_MC_m (k_z3) |
775 | *data = m; | 817 | AES_E_MC_m (k_z2) |
818 | AES_E_m (k_z1) | ||
819 | MM_XOR_m (k_z0) | ||
820 | *data++ = m; | ||
776 | } | 821 | } |
822 | while (--numBlocks); | ||
777 | *p = m; | 823 | *p = m; |
824 | } | ||
778 | } | 825 | } |
779 | 826 | ||
780 | 827 | ||
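Round bookkeeping for the restructured NEON encoder, derived from the code above (numRounds2 is the key-schedule word at p[1], i.e. rounds / 2; each AES_E_MC is one AESE+AESMC pair):

// AES-128: numRounds2 = 5 -> k0..k5 + k_z4..k_z2  =  9 AES_E_MC, + AES_E(k_z1) + XOR(k_z0) = 11 keys, 10 rounds
// AES-192: numRounds2 = 6 -> adds k6, k7          = 11 AES_E_MC + the same tail            = 13 keys, 12 rounds
// AES-256: numRounds2 = 7 -> adds k8, k9 as well  = 13 AES_E_MC + the same tail            = 15 keys, 14 rounds

Moving the k6..k9 rounds under the numRounds2 tests lets AES-128 skip them entirely while the four tail keys stay in registers for every key size.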
@@ -834,10 +881,10 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
834 | 881 | ||
835 | AES_FUNC_START2 (AesCbc_Decode_HW) | 882 | AES_FUNC_START2 (AesCbc_Decode_HW) |
836 | { | 883 | { |
837 | v128 *p = (v128*)(void*)ivAes; | 884 | v128 *p = (v128 *)(void *)ivAes; |
838 | v128 *data = (v128*)(void*)data8; | 885 | v128 *data = (v128 *)(void *)data8; |
839 | v128 iv = *p; | 886 | v128 iv = *p; |
840 | const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; | 887 | const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2; |
841 | const v128 *dataEnd; | 888 | const v128 *dataEnd; |
842 | p += 2; | 889 | p += 2; |
843 | 890 | ||
@@ -858,7 +905,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
858 | WOP_KEY (AES_XOR, 0) | 905 | WOP_KEY (AES_XOR, 0) |
859 | MM_XOR (m0, iv) | 906 | MM_XOR (m0, iv) |
860 | WOP_M1 (XOR_data_M1) | 907 | WOP_M1 (XOR_data_M1) |
861 | iv = data[NUM_WAYS - 1]; | 908 | LOAD_data(iv, NUM_WAYS - 1) |
862 | WOP (STORE_data) | 909 | WOP (STORE_data) |
863 | } | 910 | } |
864 | WIDE_LOOP_END | 911 | WIDE_LOOP_END |
@@ -866,7 +913,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
866 | SINGLE_LOOP | 913 | SINGLE_LOOP |
867 | { | 914 | { |
868 | const v128 *w = wStart; | 915 | const v128 *w = wStart; |
869 | v128 m = *data; | 916 | v128 m; LOAD_data(m, 0) |
870 | AES_D_IMC_m (w[2]) | 917 | AES_D_IMC_m (w[2]) |
871 | do | 918 | do |
872 | { | 919 | { |
@@ -878,8 +925,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
878 | AES_D_m (w[1]) | 925 | AES_D_m (w[1]) |
879 | MM_XOR_m (w[0]) | 926 | MM_XOR_m (w[0]) |
880 | MM_XOR_m (iv) | 927 | MM_XOR_m (iv) |
881 | iv = *data; | 928 | LOAD_data(iv, 0) |
882 | *data = m; | 929 | STORE_data(m, 0) |
883 | } | 930 | } |
884 | 931 | ||
885 | p[-2] = iv; | 932 | p[-2] = iv; |
@@ -888,19 +935,17 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
888 | 935 | ||
889 | AES_FUNC_START2 (AesCtr_Code_HW) | 936 | AES_FUNC_START2 (AesCtr_Code_HW) |
890 | { | 937 | { |
891 | v128 *p = (v128*)(void*)ivAes; | 938 | v128 *p = (v128 *)(void *)ivAes; |
892 | v128 *data = (v128*)(void*)data8; | 939 | v128 *data = (v128 *)(void *)data8; |
893 | uint64x2_t ctr = vreinterpretq_u64_u8(*p); | 940 | uint64x2_t ctr = vreinterpretq_u64_u8(*p); |
894 | const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; | 941 | const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2; |
895 | const v128 *dataEnd; | 942 | const v128 *dataEnd; |
896 | uint64x2_t one = vdupq_n_u64(0); | ||
897 | |||
898 | // the bug in clang: | 943 | // the bug in clang: |
899 | // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); | 944 | // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); |
900 | #if defined(__clang__) && (__clang_major__ <= 9) | 945 | #if defined(__clang__) && (__clang_major__ <= 9) |
901 | #pragma GCC diagnostic ignored "-Wvector-conversion" | 946 | #pragma GCC diagnostic ignored "-Wvector-conversion" |
902 | #endif | 947 | #endif |
903 | one = vsetq_lane_u64(1, one, 0); | 948 | const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0); |
904 | p += 2; | 949 | p += 2; |
905 | 950 | ||
906 | WIDE_LOOP_START | 951 | WIDE_LOOP_START |
diff --git a/C/CpuArch.c b/C/CpuArch.c index e792f39..6e02551 100644 --- a/C/CpuArch.c +++ b/C/CpuArch.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* CpuArch.c -- CPU specific code | 1 | /* CpuArch.c -- CPU specific code |
2 | 2024-07-04 : Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -17,7 +17,7 @@ | |||
17 | /* | 17 | /* |
18 | cpuid instruction supports (subFunction) parameter in ECX, | 18 | cpuid instruction supports (subFunction) parameter in ECX, |
19 | that is used only with some specific (function) parameter values. | 19 | that is used only with some specific (function) parameter values. |
20 | But we always use only (subFunction==0). | 20 | most functions use only (subFunction==0). |
21 | */ | 21 | */ |
22 | /* | 22 | /* |
23 | __cpuid(): MSVC and GCC/CLANG use same function/macro name | 23 | __cpuid(): MSVC and GCC/CLANG use same function/macro name |
@@ -49,43 +49,49 @@ | |||
49 | #if defined(MY_CPU_AMD64) && defined(__PIC__) \ | 49 | #if defined(MY_CPU_AMD64) && defined(__PIC__) \ |
50 | && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) | 50 | && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) |
51 | 51 | ||
52 | #define x86_cpuid_MACRO(p, func) { \ | 52 | /* "=&r" selects free register. It can select even rbx, if that register is free. |
53 | "=&D" for (RDI) also works, but the code can be larger with "=&D" | ||
54 | "2"(subFun) : 2 is (zero-based) index in the output constraint list "=c" (ECX). */ | ||
55 | |||
56 | #define x86_cpuid_MACRO_2(p, func, subFunc) { \ | ||
53 | __asm__ __volatile__ ( \ | 57 | __asm__ __volatile__ ( \ |
54 | ASM_LN "mov %%rbx, %q1" \ | 58 | ASM_LN "mov %%rbx, %q1" \ |
55 | ASM_LN "cpuid" \ | 59 | ASM_LN "cpuid" \ |
56 | ASM_LN "xchg %%rbx, %q1" \ | 60 | ASM_LN "xchg %%rbx, %q1" \ |
57 | : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } | 61 | : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } |
58 | |||
59 | /* "=&r" selects free register. It can select even rbx, if that register is free. | ||
60 | "=&D" for (RDI) also works, but the code can be larger with "=&D" | ||
61 | "2"(0) means (subFunction = 0), | ||
62 | 2 is (zero-based) index in the output constraint list "=c" (ECX). */ | ||
63 | 62 | ||
64 | #elif defined(MY_CPU_X86) && defined(__PIC__) \ | 63 | #elif defined(MY_CPU_X86) && defined(__PIC__) \ |
65 | && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) | 64 | && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) |
66 | 65 | ||
67 | #define x86_cpuid_MACRO(p, func) { \ | 66 | #define x86_cpuid_MACRO_2(p, func, subFunc) { \ |
68 | __asm__ __volatile__ ( \ | 67 | __asm__ __volatile__ ( \ |
69 | ASM_LN "mov %%ebx, %k1" \ | 68 | ASM_LN "mov %%ebx, %k1" \ |
70 | ASM_LN "cpuid" \ | 69 | ASM_LN "cpuid" \ |
71 | ASM_LN "xchg %%ebx, %k1" \ | 70 | ASM_LN "xchg %%ebx, %k1" \ |
72 | : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } | 71 | : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } |
73 | 72 | ||
74 | #else | 73 | #else |
75 | 74 | ||
76 | #define x86_cpuid_MACRO(p, func) { \ | 75 | #define x86_cpuid_MACRO_2(p, func, subFunc) { \ |
77 | __asm__ __volatile__ ( \ | 76 | __asm__ __volatile__ ( \ |
78 | ASM_LN "cpuid" \ | 77 | ASM_LN "cpuid" \ |
79 | : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } | 78 | : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } |
80 | 79 | ||
81 | #endif | 80 | #endif |
82 | 81 | ||
82 | #define x86_cpuid_MACRO(p, func) x86_cpuid_MACRO_2(p, func, 0) | ||
83 | 83 | ||
84 | void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) | 84 | void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) |
85 | { | 85 | { |
86 | x86_cpuid_MACRO(p, func) | 86 | x86_cpuid_MACRO(p, func) |
87 | } | 87 | } |
88 | 88 | ||
89 | static | ||
90 | void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) | ||
91 | { | ||
92 | x86_cpuid_MACRO_2(p, func, subFunc) | ||
93 | } | ||
94 | |||
89 | 95 | ||
90 | Z7_NO_INLINE | 96 | Z7_NO_INLINE |
91 | UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) | 97 | UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) |
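Downstream use of the new subfunction plumbing, mirroring the CPU_IsSupported_SHA512() caller added later in this patch (leaf 7 keeps its subleaf in ECX, which is what the "2"(subFunc) constraint wires up):

UInt32 abcd[4];
z7_x86_cpuid_subFunc(abcd, 7, 0);  // leaf 7, subleaf 0: abcd[0] = max supported subleaf
z7_x86_cpuid_subFunc(abcd, 7, 1);  // leaf 7, subleaf 1: extended feature flags in abcd[0]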
@@ -205,11 +211,39 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) | |||
205 | __asm ret 0 | 211 | __asm ret 0 |
206 | } | 212 | } |
207 | 213 | ||
214 | static | ||
215 | void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) | ||
216 | { | ||
217 | UNUSED_VAR(p) | ||
218 | UNUSED_VAR(func) | ||
219 | UNUSED_VAR(subFunc) | ||
220 | __asm push ebx | ||
221 | __asm push edi | ||
222 | __asm mov edi, ecx // p | ||
223 | __asm mov eax, edx // func | ||
224 | __asm mov ecx, [esp + 12] // subFunc | ||
225 | __asm cpuid | ||
226 | __asm mov [edi ], eax | ||
227 | __asm mov [edi + 4], ebx | ||
228 | __asm mov [edi + 8], ecx | ||
229 | __asm mov [edi + 12], edx | ||
230 | __asm pop edi | ||
231 | __asm pop ebx | ||
232 | __asm ret 4 | ||
233 | } | ||
234 | |||
208 | #else // MY_CPU_AMD64 | 235 | #else // MY_CPU_AMD64 |
209 | 236 | ||
210 | #if _MSC_VER >= 1600 | 237 | #if _MSC_VER >= 1600 |
211 | #include <intrin.h> | 238 | #include <intrin.h> |
212 | #define MY_cpuidex __cpuidex | 239 | #define MY_cpuidex __cpuidex |
240 | |||
241 | static | ||
242 | void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) | ||
243 | { | ||
244 | __cpuidex((int *)p, func, subFunc); | ||
245 | } | ||
246 | |||
213 | #else | 247 | #else |
214 | /* | 248 | /* |
215 | __cpuid (func == (0 or 7)) requires subfunction number in ECX. | 249 | __cpuid (func == (0 or 7)) requires subfunction number in ECX. |
@@ -219,7 +253,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) | |||
219 | We still can use __cpuid for low (func) values that don't require ECX, | 253 | We still can use __cpuid for low (func) values that don't require ECX, |
220 | but __cpuid() in old MSVC will be incorrect for some func values: (func == 7). | 254 | but __cpuid() in old MSVC will be incorrect for some func values: (func == 7). |
221 | So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction, | 255 | So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction, |
222 | where ECX value is first parameter for FASTCALL / NO_INLINE func, | 256 | where ECX value is first parameter for FASTCALL / NO_INLINE func. |
223 | So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and | 257 | So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and |
224 | old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value. | 258 | old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value. |
225 | 259 | ||
@@ -233,6 +267,11 @@ Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int | |||
233 | } | 267 | } |
234 | #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info) | 268 | #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info) |
235 | #pragma message("======== MY_cpuidex_HACK WAS USED ========") | 269 | #pragma message("======== MY_cpuidex_HACK WAS USED ========") |
270 | static | ||
271 | void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) | ||
272 | { | ||
273 | MY_cpuidex_HACK(subFunc, func, (Int32 *)p); | ||
274 | } | ||
236 | #endif // _MSC_VER >= 1600 | 275 | #endif // _MSC_VER >= 1600 |
237 | 276 | ||
238 | #if !defined(MY_CPU_AMD64) | 277 | #if !defined(MY_CPU_AMD64) |
@@ -445,6 +484,23 @@ BoolInt CPU_IsSupported_SHA(void) | |||
445 | } | 484 | } |
446 | } | 485 | } |
447 | 486 | ||
487 | |||
488 | BoolInt CPU_IsSupported_SHA512(void) | ||
489 | { | ||
490 | if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here | ||
491 | |||
492 | if (z7_x86_cpuid_GetMaxFunc() < 7) | ||
493 | return False; | ||
494 | { | ||
495 | UInt32 d[4]; | ||
496 | z7_x86_cpuid_subFunc(d, 7, 0); | ||
497 | if (d[0] < 1) // d[0] - is max supported subleaf value | ||
498 | return False; | ||
499 | z7_x86_cpuid_subFunc(d, 7, 1); | ||
500 | return (BoolInt)(d[0]) & 1; | ||
501 | } | ||
502 | } | ||
503 | |||
448 | /* | 504 | /* |
449 | MSVC: _xgetbv() intrinsic is available since VS2010SP1. | 505 | MSVC: _xgetbv() intrinsic is available since VS2010SP1. |
450 | MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in | 506 | MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in |
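The new CPU_IsSupported_SHA512() check, annotated step by step (bit positions per Intel's SHA512 extension definition; worth re-verifying against the current SDM):

// z7_x86_cpuid_subFunc(d, 7, 0);  // d[0] = highest supported subleaf of leaf 7
// if (d[0] < 1) return False;     // subleaf 1 not reported -> no SHA512 flag to read
// z7_x86_cpuid_subFunc(d, 7, 1);  // CPUID.(EAX=7,ECX=1)
// return d[0] & 1;                // EAX bit 0 = SHA512 instruction extensions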
@@ -776,6 +832,18 @@ BoolInt CPU_IsSupported_NEON(void) | |||
776 | return z7_sysctlbyname_Get_BoolInt("hw.optional.neon"); | 832 | return z7_sysctlbyname_Get_BoolInt("hw.optional.neon"); |
777 | } | 833 | } |
778 | 834 | ||
835 | BoolInt CPU_IsSupported_SHA512(void) | ||
836 | { | ||
837 | return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512"); | ||
838 | } | ||
839 | |||
840 | /* | ||
841 | BoolInt CPU_IsSupported_SHA3(void) | ||
842 | { | ||
843 | return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3"); | ||
844 | } | ||
845 | */ | ||
846 | |||
779 | #ifdef MY_CPU_ARM64 | 847 | #ifdef MY_CPU_ARM64 |
780 | #define APPLE_CRYPTO_SUPPORT_VAL 1 | 848 | #define APPLE_CRYPTO_SUPPORT_VAL 1 |
781 | #else | 849 | #else |
@@ -860,6 +928,19 @@ MY_HWCAP_CHECK_FUNC (CRC32) | |||
860 | MY_HWCAP_CHECK_FUNC (SHA1) | 928 | MY_HWCAP_CHECK_FUNC (SHA1) |
861 | MY_HWCAP_CHECK_FUNC (SHA2) | 929 | MY_HWCAP_CHECK_FUNC (SHA2) |
862 | MY_HWCAP_CHECK_FUNC (AES) | 930 | MY_HWCAP_CHECK_FUNC (AES) |
931 | #ifdef MY_CPU_ARM64 | ||
932 | // <hwcap.h> supports HWCAP_SHA512 and HWCAP_SHA3 since 2017. | ||
933 | // we define them here, if they are not defined | ||
934 | #ifndef HWCAP_SHA3 | ||
935 | // #define HWCAP_SHA3 (1 << 17) | ||
936 | #endif | ||
937 | #ifndef HWCAP_SHA512 | ||
938 | // #pragma message("=== HWCAP_SHA512 define === ") | ||
939 | #define HWCAP_SHA512 (1 << 21) | ||
940 | #endif | ||
941 | MY_HWCAP_CHECK_FUNC (SHA512) | ||
942 | // MY_HWCAP_CHECK_FUNC (SHA3) | ||
943 | #endif | ||
863 | 944 | ||
864 | #endif // __APPLE__ | 945 | #endif // __APPLE__ |
865 | #endif // _WIN32 | 946 | #endif // _WIN32 |
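On Linux/arm64 the MY_HWCAP_CHECK_FUNC(SHA512) line presumably expands to a getauxval probe along these lines (a sketch based on the macro name and the HWCAP_SHA512 definition above, not the literal expansion):

#include <sys/auxv.h>

BoolInt CPU_IsSupported_SHA512(void)
{
  // HWCAP_SHA512 = (1 << 21), defined by the patch when <hwcap.h> lacks it
  return (getauxval(AT_HWCAP) & HWCAP_SHA512) ? True : False;
}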
diff --git a/C/CpuArch.h b/C/CpuArch.h index 683cfaa..a6297ea 100644 --- a/C/CpuArch.h +++ b/C/CpuArch.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* CpuArch.h -- CPU specific code | 1 | /* CpuArch.h -- CPU specific code |
2 | 2024-06-17 : Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_CPU_ARCH_H | 4 | #ifndef ZIP7_INC_CPU_ARCH_H |
5 | #define ZIP7_INC_CPU_ARCH_H | 5 | #define ZIP7_INC_CPU_ARCH_H |
@@ -509,11 +509,19 @@ problem-4 : performace: | |||
509 | 509 | ||
510 | #if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) | 510 | #if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) |
511 | 511 | ||
512 | #if 0 | ||
513 | // Z7_BSWAP16 can be slow for x86-msvc | ||
514 | #define GetBe16_to32(p) (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p))) | ||
515 | #else | ||
516 | #define GetBe16_to32(p) (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16) | ||
517 | #endif | ||
518 | |||
512 | #define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p)) | 519 | #define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p)) |
513 | #define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); } | 520 | #define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); } |
514 | 521 | ||
515 | #if defined(MY_CPU_LE_UNALIGN_64) | 522 | #if defined(MY_CPU_LE_UNALIGN_64) |
516 | #define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p)) | 523 | #define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p)) |
524 | #define SetBe64(p, v) { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); } | ||
517 | #endif | 525 | #endif |
518 | 526 | ||
519 | #else | 527 | #else |
@@ -536,11 +544,27 @@ problem-4 : performace: | |||
536 | #define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4)) | 544 | #define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4)) |
537 | #endif | 545 | #endif |
538 | 546 | ||
547 | #ifndef SetBe64 | ||
548 | #define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \ | ||
549 | _ppp_[0] = (Byte)(_vvv_ >> 56); \ | ||
550 | _ppp_[1] = (Byte)(_vvv_ >> 48); \ | ||
551 | _ppp_[2] = (Byte)(_vvv_ >> 40); \ | ||
552 | _ppp_[3] = (Byte)(_vvv_ >> 32); \ | ||
553 | _ppp_[4] = (Byte)(_vvv_ >> 24); \ | ||
554 | _ppp_[5] = (Byte)(_vvv_ >> 16); \ | ||
555 | _ppp_[6] = (Byte)(_vvv_ >> 8); \ | ||
556 | _ppp_[7] = (Byte)_vvv_; } | ||
557 | #endif | ||
558 | |||
539 | #ifndef GetBe16 | 559 | #ifndef GetBe16 |
560 | #ifdef GetBe16_to32 | ||
561 | #define GetBe16(p) ( (UInt16) GetBe16_to32(p)) | ||
562 | #else | ||
540 | #define GetBe16(p) ( (UInt16) ( \ | 563 | #define GetBe16(p) ( (UInt16) ( \ |
541 | ((UInt16)((const Byte *)(p))[0] << 8) | \ | 564 | ((UInt16)((const Byte *)(p))[0] << 8) | \ |
542 | ((const Byte *)(p))[1] )) | 565 | ((const Byte *)(p))[1] )) |
543 | #endif | 566 | #endif |
567 | #endif | ||
544 | 568 | ||
545 | 569 | ||
546 | #if defined(MY_CPU_BE) | 570 | #if defined(MY_CPU_BE) |
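Worked example of the GetBe16_to32 trick on a little-endian CPU, for bytes p[0] = 0x12, p[1] = 0x34 (big-endian value 0x1234):

// *(const UInt16 *)p       -> 0x3412      (little-endian 16-bit load)
// zero-extended to UInt32  -> 0x00003412
// Z7_BSWAP32(0x00003412)   -> 0x12340000
// >> 16                    -> 0x00001234  (= GetBe16, without a slow 16-bit bswap)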
@@ -589,6 +613,11 @@ problem-4 : performace: | |||
589 | #endif | 613 | #endif |
590 | 614 | ||
591 | 615 | ||
616 | #ifndef GetBe16_to32 | ||
617 | #define GetBe16_to32(p) GetBe16(p) | ||
618 | #endif | ||
619 | |||
620 | |||
592 | #if defined(MY_CPU_X86_OR_AMD64) \ | 621 | #if defined(MY_CPU_X86_OR_AMD64) \ |
593 | || defined(MY_CPU_ARM_OR_ARM64) \ | 622 | || defined(MY_CPU_ARM_OR_ARM64) \ |
594 | || defined(MY_CPU_PPC_OR_PPC64) | 623 | || defined(MY_CPU_PPC_OR_PPC64) |
@@ -617,6 +646,7 @@ BoolInt CPU_IsSupported_SSE2(void); | |||
617 | BoolInt CPU_IsSupported_SSSE3(void); | 646 | BoolInt CPU_IsSupported_SSSE3(void); |
618 | BoolInt CPU_IsSupported_SSE41(void); | 647 | BoolInt CPU_IsSupported_SSE41(void); |
619 | BoolInt CPU_IsSupported_SHA(void); | 648 | BoolInt CPU_IsSupported_SHA(void); |
649 | BoolInt CPU_IsSupported_SHA512(void); | ||
620 | BoolInt CPU_IsSupported_PageGB(void); | 650 | BoolInt CPU_IsSupported_PageGB(void); |
621 | 651 | ||
622 | #elif defined(MY_CPU_ARM_OR_ARM64) | 652 | #elif defined(MY_CPU_ARM_OR_ARM64) |
@@ -634,6 +664,7 @@ BoolInt CPU_IsSupported_SHA1(void); | |||
634 | BoolInt CPU_IsSupported_SHA2(void); | 664 | BoolInt CPU_IsSupported_SHA2(void); |
635 | BoolInt CPU_IsSupported_AES(void); | 665 | BoolInt CPU_IsSupported_AES(void); |
636 | #endif | 666 | #endif |
667 | BoolInt CPU_IsSupported_SHA512(void); | ||
637 | 668 | ||
638 | #endif | 669 | #endif |
639 | 670 | ||
diff --git a/C/LzmaEnc.c b/C/LzmaEnc.c index 37b2787..088b78f 100644 --- a/C/LzmaEnc.c +++ b/C/LzmaEnc.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* LzmaEnc.c -- LZMA Encoder | 1 | /* LzmaEnc.c -- LZMA Encoder |
2 | 2024-01-24: Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -72,11 +72,11 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) | |||
72 | p->level = level; | 72 | p->level = level; |
73 | 73 | ||
74 | if (p->dictSize == 0) | 74 | if (p->dictSize == 0) |
75 | p->dictSize = | 75 | p->dictSize = (unsigned)level <= 4 ? |
76 | ( level <= 3 ? ((UInt32)1 << (level * 2 + 16)) : | 76 | (UInt32)1 << (level * 2 + 16) : |
77 | ( level <= 6 ? ((UInt32)1 << (level + 19)) : | 77 | (unsigned)level <= sizeof(size_t) / 2 + 4 ? |
78 | ( level <= 7 ? ((UInt32)1 << 25) : ((UInt32)1 << 26) | 78 | (UInt32)1 << (level + 20) : |
79 | ))); | 79 | (UInt32)1 << (sizeof(size_t) / 2 + 24); |
80 | 80 | ||
81 | if (p->dictSize > p->reduceSize) | 81 | if (p->dictSize > p->reduceSize) |
82 | { | 82 | { |
@@ -92,8 +92,8 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) | |||
92 | if (p->lp < 0) p->lp = 0; | 92 | if (p->lp < 0) p->lp = 0; |
93 | if (p->pb < 0) p->pb = 2; | 93 | if (p->pb < 0) p->pb = 2; |
94 | 94 | ||
95 | if (p->algo < 0) p->algo = (level < 5 ? 0 : 1); | 95 | if (p->algo < 0) p->algo = (unsigned)level < 5 ? 0 : 1; |
96 | if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); | 96 | if (p->fb < 0) p->fb = (unsigned)level < 7 ? 32 : 64; |
97 | if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); | 97 | if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); |
98 | if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5); | 98 | if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5); |
99 | if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1); | 99 | if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1); |
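Default dictionary sizes produced by the rewritten formula on a 64-bit build (sizeof(size_t) == 8, so the branch thresholds land at level 4 and level 8):

// level 0..4: 1 << (level*2 + 16) -> 64 KiB, 256 KiB, 1 MiB, 4 MiB, 16 MiB
// level 5..8: 1 << (level + 20)   -> 32 MiB, 64 MiB, 128 MiB, 256 MiB
// level 9+ :  1 << 28             -> 256 MiB cap (= 1 << (sizeof(size_t)/2 + 24))
// On 32-bit builds the second branch ends at level 6 and the cap is 1 << 26 = 64 MiB.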
diff --git a/C/Md5.c b/C/Md5.c new file mode 100644 --- /dev/null +++ b/C/Md5.c | |||
@@ -0,0 +1,206 @@ | |||
1 | /* Md5.c -- MD5 Hash | ||
2 | : Igor Pavlov : Public domain | ||
3 | This code is based on Colin Plumb's public domain md5.c code */ | ||
4 | |||
5 | #include "Precomp.h" | ||
6 | |||
7 | #include <string.h> | ||
8 | |||
9 | #include "Md5.h" | ||
10 | #include "RotateDefs.h" | ||
11 | #include "CpuArch.h" | ||
12 | |||
13 | #define MD5_UPDATE_BLOCKS(p) Md5_UpdateBlocks | ||
14 | |||
15 | Z7_NO_INLINE | ||
16 | void Md5_Init(CMd5 *p) | ||
17 | { | ||
18 | p->count = 0; | ||
19 | p->state[0] = 0x67452301; | ||
20 | p->state[1] = 0xefcdab89; | ||
21 | p->state[2] = 0x98badcfe; | ||
22 | p->state[3] = 0x10325476; | ||
23 | } | ||
24 | |||
25 | #if 0 && !defined(MY_CPU_LE_UNALIGN) | ||
26 | // optional optimization for Big-endian processors or processors without unaligned access: | ||
27 | // it is intended to reduce the number of complex LE32 memory reading from 64 to 16. | ||
28 | // But some compilers (sparc, armt) are better without this optimization. | ||
29 | #define Z7_MD5_USE_DATA32_ARRAY | ||
30 | #endif | ||
31 | |||
32 | #define LOAD_DATA(i) GetUi32((const UInt32 *)(const void *)data + (i)) | ||
33 | |||
34 | #ifdef Z7_MD5_USE_DATA32_ARRAY | ||
35 | #define D(i) data32[i] | ||
36 | #else | ||
37 | #define D(i) LOAD_DATA(i) | ||
38 | #endif | ||
39 | |||
40 | #define F1(x, y, z) (z ^ (x & (y ^ z))) | ||
41 | #define F2(x, y, z) F1(z, x, y) | ||
42 | #define F3(x, y, z) (x ^ y ^ z) | ||
43 | #define F4(x, y, z) (y ^ (x | ~z)) | ||
44 | |||
45 | #define R1(i, f, start, step, w, x, y, z, s, k) \ | ||
46 | w += D((start + step * (i)) % 16) + k; \ | ||
47 | w += f(x, y, z); \ | ||
48 | w = rotlFixed(w, s) + x; \ | ||
49 | |||
50 | #define R4(i4, f, start, step, s0,s1,s2,s3, k0,k1,k2,k3) \ | ||
51 | R1 (i4*4+0, f, start, step, a,b,c,d, s0, k0) \ | ||
52 | R1 (i4*4+1, f, start, step, d,a,b,c, s1, k1) \ | ||
53 | R1 (i4*4+2, f, start, step, c,d,a,b, s2, k2) \ | ||
54 | R1 (i4*4+3, f, start, step, b,c,d,a, s3, k3) \ | ||
55 | |||
56 | #define R16(f, start, step, s0,s1,s2,s3, k00,k01,k02,k03, k10,k11,k12,k13, k20,k21,k22,k23, k30,k31,k32,k33) \ | ||
57 | R4 (0, f, start, step, s0,s1,s2,s3, k00,k01,k02,k03) \ | ||
58 | R4 (1, f, start, step, s0,s1,s2,s3, k10,k11,k12,k13) \ | ||
59 | R4 (2, f, start, step, s0,s1,s2,s3, k20,k21,k22,k23) \ | ||
60 | R4 (3, f, start, step, s0,s1,s2,s3, k30,k31,k32,k33) \ | ||
61 | |||
62 | static | ||
63 | Z7_NO_INLINE | ||
64 | void Z7_FASTCALL Md5_UpdateBlocks(UInt32 state[4], const Byte *data, size_t numBlocks) | ||
65 | { | ||
66 | UInt32 a, b, c, d; | ||
67 | // if (numBlocks == 0) return; | ||
68 | a = state[0]; | ||
69 | b = state[1]; | ||
70 | c = state[2]; | ||
71 | d = state[3]; | ||
72 | do | ||
73 | { | ||
74 | #ifdef Z7_MD5_USE_DATA32_ARRAY | ||
75 | UInt32 data32[MD5_NUM_BLOCK_WORDS]; | ||
76 | { | ||
77 | #define LOAD_data32_x4(i) { \ | ||
78 | data32[i ] = LOAD_DATA(i ); \ | ||
79 | data32[i + 1] = LOAD_DATA(i + 1); \ | ||
80 | data32[i + 2] = LOAD_DATA(i + 2); \ | ||
81 | data32[i + 3] = LOAD_DATA(i + 3); } | ||
82 | #if 1 | ||
83 | LOAD_data32_x4 (0 * 4) | ||
84 | LOAD_data32_x4 (1 * 4) | ||
85 | LOAD_data32_x4 (2 * 4) | ||
86 | LOAD_data32_x4 (3 * 4) | ||
87 | #else | ||
88 | unsigned i; | ||
89 | for (i = 0; i < MD5_NUM_BLOCK_WORDS; i += 4) | ||
90 | { | ||
91 | LOAD_data32_x4(i) | ||
92 | } | ||
93 | #endif | ||
94 | } | ||
95 | #endif | ||
96 | |||
97 | R16 (F1, 0, 1, 7,12,17,22, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, | ||
98 | 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, | ||
99 | 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, | ||
100 | 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821) | ||
101 | R16 (F2, 1, 5, 5, 9,14,20, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, | ||
102 | 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, | ||
103 | 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, | ||
104 | 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a) | ||
105 | R16 (F3, 5, 3, 4,11,16,23, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, | ||
106 | 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, | ||
107 | 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, | ||
108 | 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665) | ||
109 | R16 (F4, 0, 7, 6,10,15,21, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, | ||
110 | 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, | ||
111 | 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, | ||
112 | 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391) | ||
113 | |||
114 | a += state[0]; | ||
115 | b += state[1]; | ||
116 | c += state[2]; | ||
117 | d += state[3]; | ||
118 | |||
119 | state[0] = a; | ||
120 | state[1] = b; | ||
121 | state[2] = c; | ||
122 | state[3] = d; | ||
123 | |||
124 | data += MD5_BLOCK_SIZE; | ||
125 | } | ||
126 | while (--numBlocks); | ||
127 | } | ||
128 | |||
129 | |||
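Sanity check of the (start + step*(i)) % 16 message-word schedule against RFC 1321: round 2 uses start=1, step=5, so the words are visited as

// (1 + 5*i) % 16 = 1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12

which is exactly the RFC's round-2 order; rounds 3 (start=5, step=3) and 4 (start=0, step=7) reproduce the RFC orders the same way.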
130 | #define Md5_UpdateBlock(p) MD5_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) | ||
131 | |||
132 | void Md5_Update(CMd5 *p, const Byte *data, size_t size) | ||
133 | { | ||
134 | if (size == 0) | ||
135 | return; | ||
136 | { | ||
137 | const unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1); | ||
138 | const unsigned num = MD5_BLOCK_SIZE - pos; | ||
139 | p->count += size; | ||
140 | if (num > size) | ||
141 | { | ||
142 | memcpy(p->buffer + pos, data, size); | ||
143 | return; | ||
144 | } | ||
145 | if (pos != 0) | ||
146 | { | ||
147 | size -= num; | ||
148 | memcpy(p->buffer + pos, data, num); | ||
149 | data += num; | ||
150 | Md5_UpdateBlock(p); | ||
151 | } | ||
152 | } | ||
153 | { | ||
154 | const size_t numBlocks = size >> 6; | ||
155 | if (numBlocks) | ||
156 | MD5_UPDATE_BLOCKS(p)(p->state, data, numBlocks); | ||
157 | size &= MD5_BLOCK_SIZE - 1; | ||
158 | if (size == 0) | ||
159 | return; | ||
160 | data += (numBlocks << 6); | ||
161 | memcpy(p->buffer, data, size); | ||
162 | } | ||
163 | } | ||
164 | |||
165 | |||
166 | void Md5_Final(CMd5 *p, Byte *digest) | ||
167 | { | ||
168 | unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1); | ||
169 | p->buffer[pos++] = 0x80; | ||
170 | if (pos > (MD5_BLOCK_SIZE - 4 * 2)) | ||
171 | { | ||
172 | while (pos != MD5_BLOCK_SIZE) { p->buffer[pos++] = 0; } | ||
173 | // memset(&p->buf.buffer[pos], 0, MD5_BLOCK_SIZE - pos); | ||
174 | Md5_UpdateBlock(p); | ||
175 | pos = 0; | ||
176 | } | ||
177 | memset(&p->buffer[pos], 0, (MD5_BLOCK_SIZE - 4 * 2) - pos); | ||
178 | { | ||
179 | const UInt64 numBits = p->count << 3; | ||
180 | #if defined(MY_CPU_LE_UNALIGN) | ||
181 | SetUi64 (p->buffer + MD5_BLOCK_SIZE - 4 * 2, numBits) | ||
182 | #else | ||
183 | SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 2, (UInt32)(numBits)) | ||
184 | SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 1, (UInt32)(numBits >> 32)) | ||
185 | #endif | ||
186 | } | ||
187 | Md5_UpdateBlock(p); | ||
188 | |||
189 | SetUi32(digest, p->state[0]) | ||
190 | SetUi32(digest + 4, p->state[1]) | ||
191 | SetUi32(digest + 8, p->state[2]) | ||
192 | SetUi32(digest + 12, p->state[3]) | ||
193 | |||
194 | Md5_Init(p); | ||
195 | } | ||
196 | |||
197 | #undef R1 | ||
198 | #undef R4 | ||
199 | #undef R16 | ||
200 | #undef D | ||
201 | #undef LOAD_DATA | ||
202 | #undef LOAD_data32_x4 | ||
203 | #undef F1 | ||
204 | #undef F2 | ||
205 | #undef F3 | ||
206 | #undef F4 | ||
diff --git a/C/Md5.h b/C/Md5.h new file mode 100644 --- /dev/null +++ b/C/Md5.h | |||
@@ -0,0 +1,34 @@ | |||
1 | /* Md5.h -- MD5 Hash | ||
2 | : Igor Pavlov : Public domain */ | ||
3 | |||
4 | #ifndef ZIP7_INC_MD5_H | ||
5 | #define ZIP7_INC_MD5_H | ||
6 | |||
7 | #include "7zTypes.h" | ||
8 | |||
9 | EXTERN_C_BEGIN | ||
10 | |||
11 | #define MD5_NUM_BLOCK_WORDS 16 | ||
12 | #define MD5_NUM_DIGEST_WORDS 4 | ||
13 | |||
14 | #define MD5_BLOCK_SIZE (MD5_NUM_BLOCK_WORDS * 4) | ||
15 | #define MD5_DIGEST_SIZE (MD5_NUM_DIGEST_WORDS * 4) | ||
16 | |||
17 | typedef struct | ||
18 | { | ||
19 | UInt64 count; | ||
20 | UInt64 _pad_1; | ||
21 | // we want 16-bytes alignment here | ||
22 | UInt32 state[MD5_NUM_DIGEST_WORDS]; | ||
23 | UInt64 _pad_2[4]; | ||
24 | // we want 64-bytes alignment here | ||
25 | Byte buffer[MD5_BLOCK_SIZE]; | ||
26 | } CMd5; | ||
27 | |||
28 | void Md5_Init(CMd5 *p); | ||
29 | void Md5_Update(CMd5 *p, const Byte *data, size_t size); | ||
30 | void Md5_Final(CMd5 *p, Byte *digest); | ||
31 | |||
32 | EXTERN_C_END | ||
33 | |||
34 | #endif | ||
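A minimal usage sketch for the new API, hashing a short buffer in one pass (RFC 1321 gives MD5("abc") = 900150983cd24fb0d6963f7d28e17f72 as the expected digest):

#include "Md5.h"

void md5_of_abc(Byte digest[MD5_DIGEST_SIZE])
{
  CMd5 md5;
  Md5_Init(&md5);
  Md5_Update(&md5, (const Byte *)"abc", 3);
  Md5_Final(&md5, digest);  // writes 16 bytes and re-inits the context
}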
diff --git a/C/Sha1.c b/C/Sha1.c --- a/C/Sha1.c +++ b/C/Sha1.c | |||
@@ -1,18 +1,14 @@ | |||
1 | /* Sha1.c -- SHA-1 Hash | 1 | /* Sha1.c -- SHA-1 Hash |
2 | 2024-03-01 : Igor Pavlov : Public domain | 2 | : Igor Pavlov : Public domain |
3 | This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */ | 3 | This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */ |
4 | 4 | ||
5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
6 | 6 | ||
7 | #include <string.h> | 7 | #include <string.h> |
8 | 8 | ||
9 | #include "CpuArch.h" | ||
10 | #include "RotateDefs.h" | ||
11 | #include "Sha1.h" | 9 | #include "Sha1.h" |
12 | 10 | #include "RotateDefs.h" | |
13 | #if defined(_MSC_VER) && (_MSC_VER < 1900) | 11 | #include "CpuArch.h" |
14 | // #define USE_MY_MM | ||
15 | #endif | ||
16 | 12 | ||
17 | #ifdef MY_CPU_X86_OR_AMD64 | 13 | #ifdef MY_CPU_X86_OR_AMD64 |
18 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ | 14 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ |
@@ -56,7 +52,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num | |||
56 | static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS = Sha1_UpdateBlocks; | 52 | static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS = Sha1_UpdateBlocks; |
57 | static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS_HW; | 53 | static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS_HW; |
58 | 54 | ||
59 | #define SHA1_UPDATE_BLOCKS(p) p->func_UpdateBlocks | 55 | #define SHA1_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks |
60 | #else | 56 | #else |
61 | #define SHA1_UPDATE_BLOCKS(p) Sha1_UpdateBlocks | 57 | #define SHA1_UPDATE_BLOCKS(p) Sha1_UpdateBlocks |
62 | #endif | 58 | #endif |
@@ -85,7 +81,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo) | |||
85 | return False; | 81 | return False; |
86 | #endif | 82 | #endif |
87 | 83 | ||
88 | p->func_UpdateBlocks = func; | 84 | p->v.vars.func_UpdateBlocks = func; |
89 | return True; | 85 | return True; |
90 | } | 86 | } |
91 | 87 | ||
@@ -225,7 +221,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo) | |||
225 | 221 | ||
226 | void Sha1_InitState(CSha1 *p) | 222 | void Sha1_InitState(CSha1 *p) |
227 | { | 223 | { |
228 | p->count = 0; | 224 | p->v.vars.count = 0; |
229 | p->state[0] = 0x67452301; | 225 | p->state[0] = 0x67452301; |
230 | p->state[1] = 0xEFCDAB89; | 226 | p->state[1] = 0xEFCDAB89; |
231 | p->state[2] = 0x98BADCFE; | 227 | p->state[2] = 0x98BADCFE; |
@@ -235,7 +231,7 @@ void Sha1_InitState(CSha1 *p) | |||
235 | 231 | ||
236 | void Sha1_Init(CSha1 *p) | 232 | void Sha1_Init(CSha1 *p) |
237 | { | 233 | { |
238 | p->func_UpdateBlocks = | 234 | p->v.vars.func_UpdateBlocks = |
239 | #ifdef Z7_COMPILER_SHA1_SUPPORTED | 235 | #ifdef Z7_COMPILER_SHA1_SUPPORTED |
240 | g_SHA1_FUNC_UPDATE_BLOCKS; | 236 | g_SHA1_FUNC_UPDATE_BLOCKS; |
241 | #else | 237 | #else |
@@ -250,7 +246,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num | |||
250 | { | 246 | { |
251 | UInt32 a, b, c, d, e; | 247 | UInt32 a, b, c, d, e; |
252 | UInt32 W[kNumW]; | 248 | UInt32 W[kNumW]; |
253 | // if (numBlocks != 0x1264378347) return; | 249 | |
254 | if (numBlocks == 0) | 250 | if (numBlocks == 0) |
255 | return; | 251 | return; |
256 | 252 | ||
@@ -283,7 +279,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num | |||
283 | state[3] = d; | 279 | state[3] = d; |
284 | state[4] = e; | 280 | state[4] = e; |
285 | 281 | ||
286 | data += 64; | 282 | data += SHA1_BLOCK_SIZE; |
287 | } | 283 | } |
288 | while (--numBlocks); | 284 | while (--numBlocks); |
289 | } | 285 | } |
@@ -295,20 +291,15 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size) | |||
295 | { | 291 | { |
296 | if (size == 0) | 292 | if (size == 0) |
297 | return; | 293 | return; |
298 | |||
299 | { | 294 | { |
300 | unsigned pos = (unsigned)p->count & 0x3F; | 295 | const unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1); |
301 | unsigned num; | 296 | const unsigned num = SHA1_BLOCK_SIZE - pos; |
302 | 297 | p->v.vars.count += size; | |
303 | p->count += size; | ||
304 | |||
305 | num = 64 - pos; | ||
306 | if (num > size) | 298 | if (num > size) |
307 | { | 299 | { |
308 | memcpy(p->buffer + pos, data, size); | 300 | memcpy(p->buffer + pos, data, size); |
309 | return; | 301 | return; |
310 | } | 302 | } |
311 | |||
312 | if (pos != 0) | 303 | if (pos != 0) |
313 | { | 304 | { |
314 | size -= num; | 305 | size -= num; |
@@ -318,9 +309,10 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size) | |||
318 | } | 309 | } |
319 | } | 310 | } |
320 | { | 311 | { |
321 | size_t numBlocks = size >> 6; | 312 | const size_t numBlocks = size >> 6; |
313 | // if (numBlocks) | ||
322 | SHA1_UPDATE_BLOCKS(p)(p->state, data, numBlocks); | 314 | SHA1_UPDATE_BLOCKS(p)(p->state, data, numBlocks); |
323 | size &= 0x3F; | 315 | size &= SHA1_BLOCK_SIZE - 1; |
324 | if (size == 0) | 316 | if (size == 0) |
325 | return; | 317 | return; |
326 | data += (numBlocks << 6); | 318 | data += (numBlocks << 6); |
@@ -331,42 +323,21 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size) | |||
331 | 323 | ||
332 | void Sha1_Final(CSha1 *p, Byte *digest) | 324 | void Sha1_Final(CSha1 *p, Byte *digest) |
333 | { | 325 | { |
334 | unsigned pos = (unsigned)p->count & 0x3F; | 326 | unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1); |
335 | |||
336 | |||
337 | p->buffer[pos++] = 0x80; | 327 | p->buffer[pos++] = 0x80; |
338 | 328 | if (pos > (SHA1_BLOCK_SIZE - 4 * 2)) | |
339 | if (pos > (64 - 8)) | ||
340 | { | 329 | { |
341 | while (pos != 64) { p->buffer[pos++] = 0; } | 330 | while (pos != SHA1_BLOCK_SIZE) { p->buffer[pos++] = 0; } |
342 | // memset(&p->buf.buffer[pos], 0, 64 - pos); | 331 | // memset(&p->buf.buffer[pos], 0, SHA1_BLOCK_SIZE - pos); |
343 | Sha1_UpdateBlock(p); | 332 | Sha1_UpdateBlock(p); |
344 | pos = 0; | 333 | pos = 0; |
345 | } | 334 | } |
346 | 335 | memset(&p->buffer[pos], 0, (SHA1_BLOCK_SIZE - 4 * 2) - pos); | |
347 | /* | ||
348 | if (pos & 3) | ||
349 | { | ||
350 | p->buffer[pos] = 0; | ||
351 | p->buffer[pos + 1] = 0; | ||
352 | p->buffer[pos + 2] = 0; | ||
353 | pos += 3; | ||
354 | pos &= ~3; | ||
355 | } | ||
356 | { | ||
357 | for (; pos < 64 - 8; pos += 4) | ||
358 | *(UInt32 *)(&p->buffer[pos]) = 0; | ||
359 | } | ||
360 | */ | ||
361 | |||
362 | memset(&p->buffer[pos], 0, (64 - 8) - pos); | ||
363 | |||
364 | { | 336 | { |
365 | const UInt64 numBits = (p->count << 3); | 337 | const UInt64 numBits = p->v.vars.count << 3; |
366 | SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32)) | 338 | SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32)) |
367 | SetBe32(p->buffer + 64 - 4, (UInt32)(numBits)) | 339 | SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 1, (UInt32)(numBits)) |
368 | } | 340 | } |
369 | |||
370 | Sha1_UpdateBlock(p); | 341 | Sha1_UpdateBlock(p); |
371 | 342 | ||
372 | SetBe32(digest, p->state[0]) | 343 | SetBe32(digest, p->state[0]) |
@@ -375,16 +346,13 @@ void Sha1_Final(CSha1 *p, Byte *digest) | |||
375 | SetBe32(digest + 12, p->state[3]) | 346 | SetBe32(digest + 12, p->state[3]) |
376 | SetBe32(digest + 16, p->state[4]) | 347 | SetBe32(digest + 16, p->state[4]) |
377 | 348 | ||
378 | |||
379 | |||
380 | |||
381 | Sha1_InitState(p); | 349 | Sha1_InitState(p); |
382 | } | 350 | } |
383 | 351 | ||
384 | 352 | ||
385 | void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size) | 353 | void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size) |
386 | { | 354 | { |
387 | const UInt64 numBits = (p->count + size) << 3; | 355 | const UInt64 numBits = (p->v.vars.count + size) << 3; |
388 | SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 2], (UInt32)(numBits >> 32)) | 356 | SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 2], (UInt32)(numBits >> 32)) |
389 | SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 1], (UInt32)(numBits)) | 357 | SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 1], (UInt32)(numBits)) |
390 | // SetBe32((UInt32 *)(block + size), 0x80000000); | 358 | // SetBe32((UInt32 *)(block + size), 0x80000000); |
@@ -420,57 +388,32 @@ void Sha1_GetBlockDigest(const CSha1 *p, const Byte *data, Byte *destDigest) | |||
420 | 388 | ||
421 | void Sha1Prepare(void) | 389 | void Sha1Prepare(void) |
422 | { | 390 | { |
423 | #ifdef Z7_COMPILER_SHA1_SUPPORTED | 391 | #ifdef Z7_COMPILER_SHA1_SUPPORTED |
424 | SHA1_FUNC_UPDATE_BLOCKS f, f_hw; | 392 | SHA1_FUNC_UPDATE_BLOCKS f, f_hw; |
425 | f = Sha1_UpdateBlocks; | 393 | f = Sha1_UpdateBlocks; |
426 | f_hw = NULL; | 394 | f_hw = NULL; |
427 | #ifdef MY_CPU_X86_OR_AMD64 | 395 | #ifdef MY_CPU_X86_OR_AMD64 |
428 | #ifndef USE_MY_MM | ||
429 | if (CPU_IsSupported_SHA() | 396 | if (CPU_IsSupported_SHA() |
430 | && CPU_IsSupported_SSSE3() | 397 | && CPU_IsSupported_SSSE3() |
431 | // && CPU_IsSupported_SSE41() | ||
432 | ) | 398 | ) |
433 | #endif | 399 | #else |
434 | #else | ||
435 | if (CPU_IsSupported_SHA1()) | 400 | if (CPU_IsSupported_SHA1()) |
436 | #endif | 401 | #endif |
437 | { | 402 | { |
438 | // printf("\n========== HW SHA1 ======== \n"); | 403 | // printf("\n========== HW SHA1 ======== \n"); |
439 | #if 0 && defined(MY_CPU_ARM_OR_ARM64) && defined(_MSC_VER) | 404 | #if 1 && defined(MY_CPU_ARM_OR_ARM64) && defined(Z7_MSC_VER_ORIGINAL) && (_MSC_FULL_VER < 192930037) |
440 | /* there was a bug in the MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037). | 405 | /* there was a bug in the MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037).
441 | It generated incorrect SHA-1 code. | 406 | It generated incorrect SHA-1 code. */ |
442 | 21.03 : we test sha1-hardware code at runtime initialization */ | 407 | #pragma message("== SHA1 code can work incorrectly with this compiler") |
443 | 408 | #error Stop_Compiling_MSC_Compiler_BUG_SHA1 | |
444 | #pragma message("== SHA1 code: MSC compiler : failure-check code was inserted") | 409 | #endif |
445 | |||
446 | UInt32 state[5] = { 0, 1, 2, 3, 4 } ; | ||
447 | Byte data[64]; | ||
448 | unsigned i; | ||
449 | for (i = 0; i < sizeof(data); i += 2) | ||
450 | { | ||
451 | data[i ] = (Byte)(i); | ||
452 | data[i + 1] = (Byte)(i + 1); | ||
453 | } | ||
454 | |||
455 | Sha1_UpdateBlocks_HW(state, data, sizeof(data) / 64); | ||
456 | |||
457 | if ( state[0] != 0x9acd7297 | ||
458 | || state[1] != 0x4624d898 | ||
459 | || state[2] != 0x0bf079f0 | ||
460 | || state[3] != 0x031e61b3 | ||
461 | || state[4] != 0x8323fe20) | ||
462 | { | ||
463 | // printf("\n========== SHA-1 hardware version failure ======== \n"); | ||
464 | } | ||
465 | else | ||
466 | #endif | ||
467 | { | 410 | { |
468 | f = f_hw = Sha1_UpdateBlocks_HW; | 411 | f = f_hw = Sha1_UpdateBlocks_HW; |
469 | } | 412 | } |
470 | } | 413 | } |
471 | g_SHA1_FUNC_UPDATE_BLOCKS = f; | 414 | g_SHA1_FUNC_UPDATE_BLOCKS = f; |
472 | g_SHA1_FUNC_UPDATE_BLOCKS_HW = f_hw; | 415 | g_SHA1_FUNC_UPDATE_BLOCKS_HW = f_hw; |
473 | #endif | 416 | #endif |
474 | } | 417 | } |
475 | 418 | ||
476 | #undef kNumW | 419 | #undef kNumW |
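Note on the Sha1_Final hunk above: it only swaps the literal 64/8 offsets for SHA1_BLOCK_SIZE-derived expressions; the padding itself is unchanged. A minimal standalone sketch of that padding for a short message (the helper name and the assumption len <= 55 are illustrative, not from the source):

#include <string.h>

/* Sketch of the final-block padding done by Sha1_Final: 0x80 terminator,
   zero fill, then the 64-bit big-endian bit count in the last 8 bytes
   (the two SetBe32 calls above). Assumes the message fits in one block. */
static void demo_sha1_pad_block(const unsigned char *msg, unsigned len,
                                unsigned char block[64])
{
    const unsigned long long numBits = (unsigned long long)len << 3;
    unsigned i;
    memset(block, 0, 64);
    memcpy(block, msg, len);
    block[len] = 0x80;
    for (i = 0; i < 8; i++)
        block[64 - 1 - i] = (unsigned char)(numBits >> (8 * i));
}

For "abc" (len == 3) this puts 0x80 at offset 3 and 0x18 (24 bits) in the last byte, matching what the memset/SetBe32 sequence above produces.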
@@ -1,5 +1,5 @@ | |||
1 | /* Sha1.h -- SHA-1 Hash | 1 | /* Sha1.h -- SHA-1 Hash |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_SHA1_H | 4 | #ifndef ZIP7_INC_SHA1_H |
5 | #define ZIP7_INC_SHA1_H | 5 | #define ZIP7_INC_SHA1_H |
@@ -14,6 +14,9 @@ EXTERN_C_BEGIN | |||
14 | #define SHA1_BLOCK_SIZE (SHA1_NUM_BLOCK_WORDS * 4) | 14 | #define SHA1_BLOCK_SIZE (SHA1_NUM_BLOCK_WORDS * 4) |
15 | #define SHA1_DIGEST_SIZE (SHA1_NUM_DIGEST_WORDS * 4) | 15 | #define SHA1_DIGEST_SIZE (SHA1_NUM_DIGEST_WORDS * 4) |
16 | 16 | ||
17 | |||
18 | |||
19 | |||
17 | typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte *data, size_t numBlocks); | 20 | typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte *data, size_t numBlocks); |
18 | 21 | ||
19 | /* | 22 | /* |
@@ -32,9 +35,16 @@ typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte | |||
32 | 35 | ||
33 | typedef struct | 36 | typedef struct |
34 | { | 37 | { |
35 | SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks; | 38 | union |
36 | UInt64 count; | 39 | { |
37 | UInt64 _pad_2[2]; | 40 | struct |
41 | { | ||
42 | SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks; | ||
43 | UInt64 count; | ||
44 | } vars; | ||
45 | UInt64 _pad_64bit[4]; | ||
46 | void *_pad_align_ptr[2]; | ||
47 | } v; | ||
38 | UInt32 state[SHA1_NUM_DIGEST_WORDS]; | 48 | UInt32 state[SHA1_NUM_DIGEST_WORDS]; |
39 | UInt32 _pad_3[3]; | 49 | UInt32 _pad_3[3]; |
40 | Byte buffer[SHA1_BLOCK_SIZE]; | 50 | Byte buffer[SHA1_BLOCK_SIZE]; |
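The union above changes only the private layout of CSha1 (the padding now also guarantees pointer-sized alignment); the streaming API is untouched. A minimal usage sketch built from the functions visible in these hunks:

#include <stdio.h>
#include "Sha1.h"

int main(void)
{
    CSha1 p;
    Byte digest[SHA1_DIGEST_SIZE];
    unsigned i;
    Sha1Prepare();    /* selects the SW or HW block function once at startup */
    Sha1_Init(&p);
    Sha1_Update(&p, (const Byte *)"abc", 3);
    Sha1_Final(&p, digest);    /* writes 20 bytes, then re-inits the state */
    for (i = 0; i < SHA1_DIGEST_SIZE; i++)
        printf("%02x", digest[i]);
    printf("\n");
    return 0;
}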
diff --git a/C/Sha1Opt.c b/C/Sha1Opt.c index 4e835f1..8738b94 100644 --- a/C/Sha1Opt.c +++ b/C/Sha1Opt.c | |||
@@ -1,18 +1,11 @@ | |||
1 | /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions | 1 | /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions |
2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | #include "Compiler.h" | 5 | #include "Compiler.h" |
6 | #include "CpuArch.h" | 6 | #include "CpuArch.h" |
7 | 7 | ||
8 | #if defined(_MSC_VER) | ||
9 | #if (_MSC_VER < 1900) && (_MSC_VER >= 1200) | ||
10 | // #define USE_MY_MM | ||
11 | #endif | ||
12 | #endif | ||
13 | |||
14 | // #define Z7_USE_HW_SHA_STUB // for debug | 8 | // #define Z7_USE_HW_SHA_STUB // for debug |
15 | |||
16 | #ifdef MY_CPU_X86_OR_AMD64 | 9 | #ifdef MY_CPU_X86_OR_AMD64 |
17 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check | 10 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check |
18 | #define USE_HW_SHA | 11 | #define USE_HW_SHA |
@@ -20,19 +13,14 @@ | |||
20 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ | 13 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ |
21 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) | 14 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) |
22 | #define USE_HW_SHA | 15 | #define USE_HW_SHA |
23 | #if !defined(_INTEL_COMPILER) | 16 | #if !defined(__INTEL_COMPILER) |
24 | // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) | 17 | // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) |
25 | #if !defined(__SHA__) || !defined(__SSSE3__) | 18 | #if !defined(__SHA__) || !defined(__SSSE3__) |
26 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) | 19 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) |
27 | #endif | 20 | #endif |
28 | #endif | 21 | #endif |
29 | #elif defined(_MSC_VER) | 22 | #elif defined(_MSC_VER) |
30 | #ifdef USE_MY_MM | 23 | #if (_MSC_VER >= 1900) |
31 | #define USE_VER_MIN 1300 | ||
32 | #else | ||
33 | #define USE_VER_MIN 1900 | ||
34 | #endif | ||
35 | #if (_MSC_VER >= USE_VER_MIN) | ||
36 | #define USE_HW_SHA | 24 | #define USE_HW_SHA |
37 | #else | 25 | #else |
38 | #define Z7_USE_HW_SHA_STUB | 26 | #define Z7_USE_HW_SHA_STUB |
@@ -47,23 +35,20 @@ | |||
47 | 35 | ||
48 | // #pragma message("Sha1 HW") | 36 | // #pragma message("Sha1 HW") |
49 | 37 | ||
38 | |||
39 | |||
40 | |||
50 | // sse/sse2/ssse3: | 41 | // sse/sse2/ssse3: |
51 | #include <tmmintrin.h> | 42 | #include <tmmintrin.h> |
52 | // sha*: | 43 | // sha*: |
53 | #include <immintrin.h> | 44 | #include <immintrin.h> |
54 | 45 | ||
55 | #if defined (__clang__) && defined(_MSC_VER) | 46 | #if defined (__clang__) && defined(_MSC_VER) |
56 | // #if !defined(__SSSE3__) | ||
57 | // #endif | ||
58 | #if !defined(__SHA__) | 47 | #if !defined(__SHA__) |
59 | #include <shaintrin.h> | 48 | #include <shaintrin.h> |
60 | #endif | 49 | #endif |
61 | #else | 50 | #else |
62 | 51 | ||
63 | #ifdef USE_MY_MM | ||
64 | #include "My_mm.h" | ||
65 | #endif | ||
66 | |||
67 | #endif | 52 | #endif |
68 | 53 | ||
69 | /* | 54 | /* |
@@ -84,7 +69,6 @@ SHA: | |||
84 | _mm_sha1* | 69 | _mm_sha1* |
85 | */ | 70 | */ |
86 | 71 | ||
87 | |||
88 | #define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src); | 72 | #define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src); |
89 | #define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask); | 73 | #define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask); |
90 | #define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask); | 74 | #define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask); |
@@ -99,11 +83,12 @@ SHA: | |||
99 | #define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src); | 83 | #define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src); |
100 | #define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src); | 84 | #define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src); |
101 | 85 | ||
102 | |||
103 | #define LOAD_SHUFFLE(m, k) \ | 86 | #define LOAD_SHUFFLE(m, k) \ |
104 | m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ | 87 | m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ |
105 | SHUFFLE_EPI8(m, mask) \ | 88 | SHUFFLE_EPI8(m, mask) \ |
106 | 89 | ||
90 | #define NNN(m0, m1, m2, m3) | ||
91 | |||
107 | #define SM1(m0, m1, m2, m3) \ | 92 | #define SM1(m0, m1, m2, m3) \ |
108 | SHA1_MSG1(m0, m1) \ | 93 | SHA1_MSG1(m0, m1) \ |
109 | 94 | ||
@@ -116,35 +101,19 @@ SHA: | |||
116 | SM1(m0, m1, m2, m3) \ | 101 | SM1(m0, m1, m2, m3) \ |
117 | SHA1_MSG2(m3, m2) \ | 102 | SHA1_MSG2(m3, m2) \ |
118 | 103 | ||
119 | #define NNN(m0, m1, m2, m3) | 104 | #define R4(k, m0, m1, m2, m3, e0, e1, OP) \ |
120 | |||
121 | |||
122 | |||
123 | |||
124 | |||
125 | |||
126 | |||
127 | |||
128 | |||
129 | |||
130 | |||
131 | |||
132 | |||
133 | |||
134 | |||
135 | |||
136 | |||
137 | #define R4(k, e0, e1, m0, m1, m2, m3, OP) \ | ||
138 | e1 = abcd; \ | 105 | e1 = abcd; \ |
139 | SHA1_RND4(abcd, e0, (k) / 5) \ | 106 | SHA1_RND4(abcd, e0, (k) / 5) \ |
140 | SHA1_NEXTE(e1, m1) \ | 107 | SHA1_NEXTE(e1, m1) \ |
141 | OP(m0, m1, m2, m3) \ | 108 | OP(m0, m1, m2, m3) \ |
142 | 109 | ||
110 | |||
111 | |||
143 | #define R16(k, mx, OP0, OP1, OP2, OP3) \ | 112 | #define R16(k, mx, OP0, OP1, OP2, OP3) \ |
144 | R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \ | 113 | R4 ( (k)*4+0, m0,m1,m2,m3, e0,e1, OP0 ) \ |
145 | R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \ | 114 | R4 ( (k)*4+1, m1,m2,m3,m0, e1,e0, OP1 ) \ |
146 | R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \ | 115 | R4 ( (k)*4+2, m2,m3,m0,m1, e0,e1, OP2 ) \ |
147 | R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 ) \ | 116 | R4 ( (k)*4+3, m3,mx,m1,m2, e1,e0, OP3 ) \ |
148 | 117 | ||
149 | #define PREPARE_STATE \ | 118 | #define PREPARE_STATE \ |
150 | SHUFFLE_EPI32 (abcd, 0x1B) \ | 119 | SHUFFLE_EPI32 (abcd, 0x1B) \ |
@@ -162,8 +131,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
162 | { | 131 | { |
163 | const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); | 132 | const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); |
164 | 133 | ||
165 | __m128i abcd, e0; | ||
166 | 134 | ||
135 | __m128i abcd, e0; | ||
136 | |||
167 | if (numBlocks == 0) | 137 | if (numBlocks == 0) |
168 | return; | 138 | return; |
169 | 139 | ||
@@ -204,7 +174,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
204 | PREPARE_STATE | 174 | PREPARE_STATE |
205 | 175 | ||
206 | _mm_storeu_si128((__m128i *) (void *) state, abcd); | 176 | _mm_storeu_si128((__m128i *) (void *) state, abcd); |
207 | *(state+4) = (UInt32)_mm_cvtsi128_si32(e0); | 177 | *(state + 4) = (UInt32)_mm_cvtsi128_si32(e0); |
208 | } | 178 | } |
209 | 179 | ||
210 | #endif // USE_HW_SHA | 180 | #endif // USE_HW_SHA |
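The mask constant above, fed to _mm_shuffle_epi8 in LOAD_SHUFFLE, reverses all 16 bytes of each loaded block word: a per-word byte swap of the four big-endian UInt32 message words plus a word-order reversal, matching the SHUFFLE_EPI32(abcd, 0x1B) state layout. A scalar sketch of the same permutation (the helper name is illustrative):

/* dest[i] = src[15 - i]: full 16-byte reversal, the scalar equivalent of
   _mm_shuffle_epi8 with the 0x0f..0x00 mask used above */
static void demo_reverse16(unsigned char *dest, const unsigned char *src)
{
    unsigned i;
    for (i = 0; i < 16; i++)
        dest[i] = src[15 - i];
}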
@@ -262,22 +232,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
262 | #define _ARM_USE_NEW_NEON_INTRINSICS | 232 | #define _ARM_USE_NEW_NEON_INTRINSICS |
263 | #endif | 233 | #endif |
264 | 234 | ||
265 | |||
266 | |||
267 | |||
268 | |||
269 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) | 235 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) |
270 | #include <arm64_neon.h> | 236 | #include <arm64_neon.h> |
271 | #else | 237 | #else |
272 | 238 | ||
273 | |||
274 | |||
275 | |||
276 | |||
277 | |||
278 | |||
279 | |||
280 | |||
281 | #if defined(__clang__) && __clang_major__ < 16 | 239 | #if defined(__clang__) && __clang_major__ < 16 |
282 | #if !defined(__ARM_FEATURE_SHA2) && \ | 240 | #if !defined(__ARM_FEATURE_SHA2) && \ |
283 | !defined(__ARM_FEATURE_CRYPTO) | 241 | !defined(__ARM_FEATURE_CRYPTO) |
@@ -329,26 +287,37 @@ typedef uint32x4_t v128; | |||
329 | #endif | 287 | #endif |
330 | 288 | ||
331 | #ifdef MY_CPU_BE | 289 | #ifdef MY_CPU_BE |
332 | #define MY_rev32_for_LE(x) | 290 | #define MY_rev32_for_LE(x) x |
333 | #else | 291 | #else |
334 | #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) | 292 | #define MY_rev32_for_LE(x) vrev32q_u8(x) |
335 | #endif | 293 | #endif |
336 | 294 | ||
337 | #define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) | 295 | #define LOAD_128_32(_p) vld1q_u32(_p) |
338 | #define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) | 296 | #define LOAD_128_8(_p) vld1q_u8 (_p) |
297 | #define STORE_128_32(_p, _v) vst1q_u32(_p, _v) | ||
339 | 298 | ||
340 | #define LOAD_SHUFFLE(m, k) \ | 299 | #define LOAD_SHUFFLE(m, k) \ |
341 | m = LOAD_128((data + (k) * 16)); \ | 300 | m = vreinterpretq_u32_u8( \ |
342 | MY_rev32_for_LE(m); \ | 301 | MY_rev32_for_LE( \ |
343 | 302 | LOAD_128_8(data + (k) * 16))); \ | |
344 | #define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3) | 303 | |
345 | #define SU1(dest, src) dest = vsha1su1q_u32(dest, src) | 304 | #define N0(dest, src2, src3) |
305 | #define N1(dest, src) | ||
306 | #define U0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3); | ||
307 | #define U1(dest, src) dest = vsha1su1q_u32(dest, src); | ||
346 | #define C(e) abcd = vsha1cq_u32(abcd, e, t) | 308 | #define C(e) abcd = vsha1cq_u32(abcd, e, t) |
347 | #define P(e) abcd = vsha1pq_u32(abcd, e, t) | 309 | #define P(e) abcd = vsha1pq_u32(abcd, e, t) |
348 | #define M(e) abcd = vsha1mq_u32(abcd, e, t) | 310 | #define M(e) abcd = vsha1mq_u32(abcd, e, t) |
349 | #define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) | 311 | #define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) |
350 | #define T(m, c) t = vaddq_u32(m, c) | 312 | #define T(m, c) t = vaddq_u32(m, c) |
351 | 313 | ||
314 | #define R16(d0,d1,d2,d3, f0,z0, f1,z1, f2,z2, f3,z3, w0,w1,w2,w3) \ | ||
315 | T(m0, d0); f0(m3, m0, m1) z0(m2, m1) H(e1); w0(e0); \ | ||
316 | T(m1, d1); f1(m0, m1, m2) z1(m3, m2) H(e0); w1(e1); \ | ||
317 | T(m2, d2); f2(m1, m2, m3) z2(m0, m3) H(e1); w2(e0); \ | ||
318 | T(m3, d3); f3(m2, m3, m0) z3(m1, m0) H(e0); w3(e1); \ | ||
319 | |||
320 | |||
352 | void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks); | 321 |
353 | #ifdef ATTRIB_SHA | 322 | #ifdef ATTRIB_SHA |
354 | ATTRIB_SHA | 323 | ATTRIB_SHA |
@@ -367,7 +336,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
367 | c2 = vdupq_n_u32(0x8f1bbcdc); | 336 | c2 = vdupq_n_u32(0x8f1bbcdc); |
368 | c3 = vdupq_n_u32(0xca62c1d6); | 337 | c3 = vdupq_n_u32(0xca62c1d6); |
369 | 338 | ||
370 | abcd = LOAD_128(&state[0]); | 339 | abcd = LOAD_128_32(&state[0]); |
371 | e0 = state[4]; | 340 | e0 = state[4]; |
372 | 341 | ||
373 | do | 342 | do |
@@ -385,26 +354,11 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
385 | LOAD_SHUFFLE (m2, 2) | 354 | LOAD_SHUFFLE (m2, 2) |
386 | LOAD_SHUFFLE (m3, 3) | 355 | LOAD_SHUFFLE (m3, 3) |
387 | 356 | ||
388 | T(m0, c0); H(e1); C(e0); | 357 | R16 ( c0,c0,c0,c0, N0,N1, U0,N1, U0,U1, U0,U1, C,C,C,C ) |
389 | T(m1, c0); SU0(m0, m1, m2); H(e0); C(e1); | 358 | R16 ( c0,c1,c1,c1, U0,U1, U0,U1, U0,U1, U0,U1, C,P,P,P ) |
390 | T(m2, c0); SU0(m1, m2, m3); SU1(m0, m3); H(e1); C(e0); | 359 | R16 ( c1,c1,c2,c2, U0,U1, U0,U1, U0,U1, U0,U1, P,P,M,M ) |
391 | T(m3, c0); SU0(m2, m3, m0); SU1(m1, m0); H(e0); C(e1); | 360 | R16 ( c2,c2,c2,c3, U0,U1, U0,U1, U0,U1, U0,U1, M,M,M,P ) |
392 | T(m0, c0); SU0(m3, m0, m1); SU1(m2, m1); H(e1); C(e0); | 361 | R16 ( c3,c3,c3,c3, U0,U1, N0,U1, N0,N1, N0,N1, P,P,P,P ) |
393 | T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1); | ||
394 | T(m2, c1); SU0(m1, m2, m3); SU1(m0, m3); H(e1); P(e0); | ||
395 | T(m3, c1); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1); | ||
396 | T(m0, c1); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0); | ||
397 | T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1); | ||
398 | T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0); | ||
399 | T(m3, c2); SU0(m2, m3, m0); SU1(m1, m0); H(e0); M(e1); | ||
400 | T(m0, c2); SU0(m3, m0, m1); SU1(m2, m1); H(e1); M(e0); | ||
401 | T(m1, c2); SU0(m0, m1, m2); SU1(m3, m2); H(e0); M(e1); | ||
402 | T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0); | ||
403 | T(m3, c3); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1); | ||
404 | T(m0, c3); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0); | ||
405 | T(m1, c3); SU1(m3, m2); H(e0); P(e1); | ||
406 | T(m2, c3); H(e1); P(e0); | ||
407 | T(m3, c3); H(e0); P(e1); | ||
408 | 362 | ||
409 | abcd = vaddq_u32(abcd, abcd_save); | 363 | abcd = vaddq_u32(abcd, abcd_save); |
410 | e0 += e0_save; | 364 | e0 += e0_save; |
@@ -413,7 +367,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
413 | } | 367 | } |
414 | while (--numBlocks); | 368 | while (--numBlocks); |
415 | 369 | ||
416 | STORE_128(&state[0], abcd); | 370 | STORE_128_32(&state[0], abcd); |
417 | state[4] = e0; | 371 | state[4] = e0; |
418 | } | 372 | } |
419 | 373 | ||
@@ -421,13 +375,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
421 | 375 | ||
422 | #endif // MY_CPU_ARM_OR_ARM64 | 376 | #endif // MY_CPU_ARM_OR_ARM64 |
423 | 377 | ||
424 | |||
425 | #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) | 378 | #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) |
426 | // #error Stop_Compiling_UNSUPPORTED_SHA | 379 | // #error Stop_Compiling_UNSUPPORTED_SHA |
427 | // #include <stdlib.h> | 380 | // #include <stdlib.h> |
428 | |||
429 | |||
430 | |||
431 | // #include "Sha1.h" | 381 | // #include "Sha1.h" |
432 | // #if defined(_MSC_VER) | 382 | // #if defined(_MSC_VER) |
433 | #pragma message("Sha1 HW-SW stub was used") | 383 | #pragma message("Sha1 HW-SW stub was used") |
@@ -447,8 +397,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
447 | } | 397 | } |
448 | #endif | 398 | #endif |
449 | 399 | ||
450 | #undef SU0 | 400 | #undef U0 |
451 | #undef SU1 | 401 | #undef U1 |
402 | #undef N0 | ||
403 | #undef N1 | ||
452 | #undef C | 404 | #undef C |
453 | #undef P | 405 | #undef P |
454 | #undef M | 406 | #undef M |
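The new R16 macro above is a table-driven rewrite of the deleted explicit round list, not a behavioral change. Expanding its first invocation, R16(c0,c0,c0,c0, N0,N1, U0,N1, U0,U1, U0,U1, C,C,C,C), with the N0/N1 no-ops elided, reproduces the first four deleted lines exactly:

T(m0, c0);                            H(e1); C(e0);
T(m1, c0); U0(m0, m1, m2)             H(e0); C(e1);
T(m2, c0); U0(m1, m2, m3) U1(m0, m3)  H(e1); C(e0);
T(m3, c0); U0(m2, m3, m0) U1(m1, m0)  H(e0); C(e1);

(U0/U1 are the renamed SU0/SU1 schedule updates; the remaining R16 rows cover the later rounds the same way.)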
@@ -1,18 +1,14 @@ | |||
1 | /* Sha256.c -- SHA-256 Hash | 1 | /* Sha256.c -- SHA-256 Hash |
2 | 2024-03-01 : Igor Pavlov : Public domain | 2 | : Igor Pavlov : Public domain |
3 | This code is based on public domain code from Wei Dai's Crypto++ library. */ | 3 | This code is based on public domain code from Wei Dai's Crypto++ library. */ |
4 | 4 | ||
5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
6 | 6 | ||
7 | #include <string.h> | 7 | #include <string.h> |
8 | 8 | ||
9 | #include "CpuArch.h" | ||
10 | #include "RotateDefs.h" | ||
11 | #include "Sha256.h" | 9 | #include "Sha256.h" |
12 | 10 | #include "RotateDefs.h" | |
13 | #if defined(_MSC_VER) && (_MSC_VER < 1900) | 11 | #include "CpuArch.h" |
14 | // #define USE_MY_MM | ||
15 | #endif | ||
16 | 12 | ||
17 | #ifdef MY_CPU_X86_OR_AMD64 | 13 | #ifdef MY_CPU_X86_OR_AMD64 |
18 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ | 14 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ |
@@ -56,7 +52,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n | |||
56 | static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks; | 52 | static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks; |
57 | static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW; | 53 | static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW; |
58 | 54 | ||
59 | #define SHA256_UPDATE_BLOCKS(p) p->func_UpdateBlocks | 55 | #define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks |
60 | #else | 56 | #else |
61 | #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks | 57 | #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks |
62 | #endif | 58 | #endif |
@@ -85,7 +81,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo) | |||
85 | return False; | 81 | return False; |
86 | #endif | 82 | #endif |
87 | 83 | ||
88 | p->func_UpdateBlocks = func; | 84 | p->v.vars.func_UpdateBlocks = func; |
89 | return True; | 85 | return True; |
90 | } | 86 | } |
91 | 87 | ||
@@ -111,7 +107,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo) | |||
111 | 107 | ||
112 | void Sha256_InitState(CSha256 *p) | 108 | void Sha256_InitState(CSha256 *p) |
113 | { | 109 | { |
114 | p->count = 0; | 110 | p->v.vars.count = 0; |
115 | p->state[0] = 0x6a09e667; | 111 | p->state[0] = 0x6a09e667; |
116 | p->state[1] = 0xbb67ae85; | 112 | p->state[1] = 0xbb67ae85; |
117 | p->state[2] = 0x3c6ef372; | 113 | p->state[2] = 0x3c6ef372; |
@@ -122,9 +118,16 @@ void Sha256_InitState(CSha256 *p) | |||
122 | p->state[7] = 0x5be0cd19; | 118 | p->state[7] = 0x5be0cd19; |
123 | } | 119 | } |
124 | 120 | ||
121 | |||
122 | |||
123 | |||
124 | |||
125 | |||
126 | |||
127 | |||
125 | void Sha256_Init(CSha256 *p) | 128 | void Sha256_Init(CSha256 *p) |
126 | { | 129 | { |
127 | p->func_UpdateBlocks = | 130 | p->v.vars.func_UpdateBlocks = |
128 | #ifdef Z7_COMPILER_SHA256_SUPPORTED | 131 | #ifdef Z7_COMPILER_SHA256_SUPPORTED |
129 | g_SHA256_FUNC_UPDATE_BLOCKS; | 132 | g_SHA256_FUNC_UPDATE_BLOCKS; |
130 | #else | 133 | #else |
@@ -133,10 +136,10 @@ void Sha256_Init(CSha256 *p) | |||
133 | Sha256_InitState(p); | 136 | Sha256_InitState(p); |
134 | } | 137 | } |
135 | 138 | ||
136 | #define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) | 139 | #define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x,22)) |
137 | #define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25)) | 140 | #define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x,25)) |
138 | #define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) | 141 | #define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) |
139 | #define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) | 142 | #define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >>10)) |
140 | 143 | ||
141 | #define Ch(x,y,z) (z^(x&(y^z))) | 144 | #define Ch(x,y,z) (z^(x&(y^z))) |
142 | #define Maj(x,y,z) ((x&y)|(z&(x|y))) | 145 | #define Maj(x,y,z) ((x&y)|(z&(x|y))) |
@@ -224,12 +227,10 @@ void Sha256_Init(CSha256 *p) | |||
224 | 227 | ||
225 | #endif | 228 | #endif |
226 | 229 | ||
227 | // static | ||
228 | extern MY_ALIGN(64) | ||
229 | const UInt32 SHA256_K_ARRAY[64]; | ||
230 | 230 | ||
231 | MY_ALIGN(64) | 231 | extern |
232 | const UInt32 SHA256_K_ARRAY[64] = { | 232 | MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; |
233 | MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = { | ||
233 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, | 234 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, |
234 | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, | 235 | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, |
235 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, | 236 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
@@ -248,27 +249,29 @@ const UInt32 SHA256_K_ARRAY[64] = { | |||
248 | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 | 249 | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 |
249 | }; | 250 | }; |
250 | 251 | ||
251 | #define K SHA256_K_ARRAY | ||
252 | 252 | ||
253 | 253 | ||
254 | |||
255 | |||
256 | #define K SHA256_K_ARRAY | ||
257 | |||
254 | Z7_NO_INLINE | 258 | Z7_NO_INLINE |
255 | void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks) | 259 | void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks) |
256 | { | 260 | { |
257 | UInt32 W | 261 | UInt32 W |
258 | #ifdef Z7_SHA256_BIG_W | 262 | #ifdef Z7_SHA256_BIG_W |
259 | [64]; | 263 | [64]; |
260 | #else | 264 | #else |
261 | [16]; | 265 | [16]; |
262 | #endif | 266 | #endif |
263 | |||
264 | unsigned j; | 267 | unsigned j; |
265 | |||
266 | UInt32 a,b,c,d,e,f,g,h; | 268 | UInt32 a,b,c,d,e,f,g,h; |
267 | 269 | #if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) | |
268 | #if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) | ||
269 | UInt32 tmp; | 270 | UInt32 tmp; |
270 | #endif | 271 | #endif |
271 | 272 | ||
273 | if (numBlocks == 0) return; | ||
274 | |||
272 | a = state[0]; | 275 | a = state[0]; |
273 | b = state[1]; | 276 | b = state[1]; |
274 | c = state[2]; | 277 | c = state[2]; |
@@ -278,7 +281,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n | |||
278 | g = state[6]; | 281 | g = state[6]; |
279 | h = state[7]; | 282 | h = state[7]; |
280 | 283 | ||
281 | while (numBlocks) | 284 | do |
282 | { | 285 | { |
283 | 286 | ||
284 | for (j = 0; j < 16; j += STEP_PRE) | 287 | for (j = 0; j < 16; j += STEP_PRE) |
@@ -352,19 +355,11 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n | |||
352 | g += state[6]; state[6] = g; | 355 | g += state[6]; state[6] = g; |
353 | h += state[7]; state[7] = h; | 356 | h += state[7]; state[7] = h; |
354 | 357 | ||
355 | data += 64; | 358 | data += SHA256_BLOCK_SIZE; |
356 | numBlocks--; | ||
357 | } | 359 | } |
358 | 360 | while (--numBlocks); | |
359 | /* Wipe variables */ | ||
360 | /* memset(W, 0, sizeof(W)); */ | ||
361 | } | 361 | } |
362 | 362 | ||
363 | #undef S0 | ||
364 | #undef S1 | ||
365 | #undef s0 | ||
366 | #undef s1 | ||
367 | #undef K | ||
368 | 363 | ||
369 | #define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) | 364 | #define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) |
370 | 365 | ||
@@ -372,20 +367,15 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) | |||
372 | { | 367 | { |
373 | if (size == 0) | 368 | if (size == 0) |
374 | return; | 369 | return; |
375 | |||
376 | { | 370 | { |
377 | unsigned pos = (unsigned)p->count & 0x3F; | 371 | const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); |
378 | unsigned num; | 372 | const unsigned num = SHA256_BLOCK_SIZE - pos; |
379 | 373 | p->v.vars.count += size; | |
380 | p->count += size; | ||
381 | |||
382 | num = 64 - pos; | ||
383 | if (num > size) | 374 | if (num > size) |
384 | { | 375 | { |
385 | memcpy(p->buffer + pos, data, size); | 376 | memcpy(p->buffer + pos, data, size); |
386 | return; | 377 | return; |
387 | } | 378 | } |
388 | |||
389 | if (pos != 0) | 379 | if (pos != 0) |
390 | { | 380 | { |
391 | size -= num; | 381 | size -= num; |
@@ -395,9 +385,10 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) | |||
395 | } | 385 | } |
396 | } | 386 | } |
397 | { | 387 | { |
398 | size_t numBlocks = size >> 6; | 388 | const size_t numBlocks = size >> 6; |
389 | // if (numBlocks) | ||
399 | SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks); | 390 | SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks); |
400 | size &= 0x3F; | 391 | size &= SHA256_BLOCK_SIZE - 1; |
401 | if (size == 0) | 392 | if (size == 0) |
402 | return; | 393 | return; |
403 | data += (numBlocks << 6); | 394 | data += (numBlocks << 6); |
@@ -408,82 +399,69 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) | |||
408 | 399 | ||
409 | void Sha256_Final(CSha256 *p, Byte *digest) | 400 | void Sha256_Final(CSha256 *p, Byte *digest) |
410 | { | 401 | { |
411 | unsigned pos = (unsigned)p->count & 0x3F; | 402 | unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); |
412 | unsigned i; | ||
413 | |||
414 | p->buffer[pos++] = 0x80; | 403 | p->buffer[pos++] = 0x80; |
415 | 404 | if (pos > (SHA256_BLOCK_SIZE - 4 * 2)) | |
416 | if (pos > (64 - 8)) | ||
417 | { | 405 | { |
418 | while (pos != 64) { p->buffer[pos++] = 0; } | 406 | while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; } |
419 | // memset(&p->buf.buffer[pos], 0, 64 - pos); | 407 | // memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos); |
420 | Sha256_UpdateBlock(p); | 408 | Sha256_UpdateBlock(p); |
421 | pos = 0; | 409 | pos = 0; |
422 | } | 410 | } |
423 | 411 | memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos); | |
424 | /* | ||
425 | if (pos & 3) | ||
426 | { | 412 | { |
427 | p->buffer[pos] = 0; | 413 | const UInt64 numBits = p->v.vars.count << 3; |
428 | p->buffer[pos + 1] = 0; | 414 | SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32)) |
429 | p->buffer[pos + 2] = 0; | 415 | SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits)) |
430 | pos += 3; | ||
431 | pos &= ~3; | ||
432 | } | 416 | } |
417 | Sha256_UpdateBlock(p); | ||
418 | #if 1 && defined(MY_CPU_BE) | ||
419 | memcpy(digest, p->state, SHA256_DIGEST_SIZE); | ||
420 | #else | ||
433 | { | 421 | { |
434 | for (; pos < 64 - 8; pos += 4) | 422 | unsigned i; |
435 | *(UInt32 *)(&p->buffer[pos]) = 0; | 423 | for (i = 0; i < 8; i += 2) |
424 | { | ||
425 | const UInt32 v0 = p->state[i]; | ||
426 | const UInt32 v1 = p->state[(size_t)i + 1]; | ||
427 | SetBe32(digest , v0) | ||
428 | SetBe32(digest + 4, v1) | ||
429 | digest += 4 * 2; | ||
430 | } | ||
436 | } | 431 | } |
437 | */ | ||
438 | 432 | ||
439 | memset(&p->buffer[pos], 0, (64 - 8) - pos); | ||
440 | 433 | ||
441 | { | ||
442 | UInt64 numBits = (p->count << 3); | ||
443 | SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32)) | ||
444 | SetBe32(p->buffer + 64 - 4, (UInt32)(numBits)) | ||
445 | } | ||
446 | |||
447 | Sha256_UpdateBlock(p); | ||
448 | 434 | ||
449 | for (i = 0; i < 8; i += 2) | 435 | |
450 | { | 436 | #endif |
451 | UInt32 v0 = p->state[i]; | ||
452 | UInt32 v1 = p->state[(size_t)i + 1]; | ||
453 | SetBe32(digest , v0) | ||
454 | SetBe32(digest + 4, v1) | ||
455 | digest += 8; | ||
456 | } | ||
457 | |||
458 | Sha256_InitState(p); | 437 | Sha256_InitState(p); |
459 | } | 438 | } |
460 | 439 | ||
461 | 440 | ||
462 | void Sha256Prepare(void) | 441 | void Sha256Prepare(void) |
463 | { | 442 | { |
464 | #ifdef Z7_COMPILER_SHA256_SUPPORTED | 443 | #ifdef Z7_COMPILER_SHA256_SUPPORTED |
465 | SHA256_FUNC_UPDATE_BLOCKS f, f_hw; | 444 | SHA256_FUNC_UPDATE_BLOCKS f, f_hw; |
466 | f = Sha256_UpdateBlocks; | 445 | f = Sha256_UpdateBlocks; |
467 | f_hw = NULL; | 446 | f_hw = NULL; |
468 | #ifdef MY_CPU_X86_OR_AMD64 | 447 | #ifdef MY_CPU_X86_OR_AMD64 |
469 | #ifndef USE_MY_MM | ||
470 | if (CPU_IsSupported_SHA() | 448 | if (CPU_IsSupported_SHA() |
471 | && CPU_IsSupported_SSSE3() | 449 | && CPU_IsSupported_SSSE3() |
472 | // && CPU_IsSupported_SSE41() | ||
473 | ) | 450 | ) |
474 | #endif | 451 | #else |
475 | #else | ||
476 | if (CPU_IsSupported_SHA2()) | 452 | if (CPU_IsSupported_SHA2()) |
477 | #endif | 453 | #endif |
478 | { | 454 | { |
479 | // printf("\n========== HW SHA256 ======== \n"); | 455 | // printf("\n========== HW SHA256 ======== \n"); |
480 | f = f_hw = Sha256_UpdateBlocks_HW; | 456 | f = f_hw = Sha256_UpdateBlocks_HW; |
481 | } | 457 | } |
482 | g_SHA256_FUNC_UPDATE_BLOCKS = f; | 458 | g_SHA256_FUNC_UPDATE_BLOCKS = f; |
483 | g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw; | 459 | g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw; |
484 | #endif | 460 | #endif |
485 | } | 461 | } |
486 | 462 | ||
463 | #undef U64C | ||
464 | #undef K | ||
487 | #undef S0 | 465 | #undef S0 |
488 | #undef S1 | 466 | #undef S1 |
489 | #undef s0 | 467 | #undef s0 |
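The MY_CPU_BE shortcut in Sha256_Final above is valid because SetBe32 stores a word in big-endian byte order, which on a big-endian CPU is already the state words' native memory layout, so a plain memcpy of the state yields the same digest bytes as the SetBe32 loop. A portable scalar sketch of one such store (the helper name is illustrative):

/* stores v to dest in big-endian byte order, like the SetBe32 macro */
static void demo_store_be32(unsigned char *dest, unsigned v)
{
    dest[0] = (unsigned char)(v >> 24);
    dest[1] = (unsigned char)(v >> 16);
    dest[2] = (unsigned char)(v >> 8);
    dest[3] = (unsigned char)(v);
}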
@@ -1,5 +1,5 @@ | |||
1 | /* Sha256.h -- SHA-256 Hash | 1 | /* Sha256.h -- SHA-256 Hash |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_SHA256_H | 4 | #ifndef ZIP7_INC_SHA256_H |
5 | #define ZIP7_INC_SHA256_H | 5 | #define ZIP7_INC_SHA256_H |
@@ -14,6 +14,9 @@ EXTERN_C_BEGIN | |||
14 | #define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4) | 14 | #define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4) |
15 | #define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4) | 15 | #define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4) |
16 | 16 | ||
17 | |||
18 | |||
19 | |||
17 | typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks); | 20 | typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks); |
18 | 21 | ||
19 | /* | 22 | /* |
@@ -32,9 +35,16 @@ typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byt | |||
32 | 35 | ||
33 | typedef struct | 36 | typedef struct |
34 | { | 37 | { |
35 | SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; | 38 | union |
36 | UInt64 count; | 39 | { |
37 | UInt64 _pad_2[2]; | 40 | struct |
41 | { | ||
42 | SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; | ||
43 | UInt64 count; | ||
44 | } vars; | ||
45 | UInt64 _pad_64bit[4]; | ||
46 | void *_pad_align_ptr[2]; | ||
47 | } v; | ||
38 | UInt32 state[SHA256_NUM_DIGEST_WORDS]; | 48 | UInt32 state[SHA256_NUM_DIGEST_WORDS]; |
39 | 49 | ||
40 | Byte buffer[SHA256_BLOCK_SIZE]; | 50 | Byte buffer[SHA256_BLOCK_SIZE]; |
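As with CSha1, the union above only repacks the private fields of CSha256; callers keep the same streaming sequence. A minimal sketch from the functions in these hunks:

#include <stdio.h>
#include "Sha256.h"

int main(void)
{
    CSha256 p;
    Byte digest[SHA256_DIGEST_SIZE];
    unsigned i;
    Sha256Prepare();    /* choose SW or HW Sha256_UpdateBlocks once */
    Sha256_Init(&p);
    Sha256_Update(&p, (const Byte *)"abc", 3);
    Sha256_Final(&p, digest);    /* writes 32 bytes, then re-inits */
    for (i = 0; i < SHA256_DIGEST_SIZE; i++)
        printf("%02x", digest[i]);
    printf("\n");
    return 0;
}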
diff --git a/C/Sha256Opt.c b/C/Sha256Opt.c index eb38166..1c6b50f 100644 --- a/C/Sha256Opt.c +++ b/C/Sha256Opt.c | |||
@@ -1,18 +1,11 @@ | |||
1 | /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions | 1 | /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions |
2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | #include "Compiler.h" | 5 | #include "Compiler.h" |
6 | #include "CpuArch.h" | 6 | #include "CpuArch.h" |
7 | 7 | ||
8 | #if defined(_MSC_VER) | ||
9 | #if (_MSC_VER < 1900) && (_MSC_VER >= 1200) | ||
10 | // #define USE_MY_MM | ||
11 | #endif | ||
12 | #endif | ||
13 | |||
14 | // #define Z7_USE_HW_SHA_STUB // for debug | 8 | // #define Z7_USE_HW_SHA_STUB // for debug |
15 | |||
16 | #ifdef MY_CPU_X86_OR_AMD64 | 9 | #ifdef MY_CPU_X86_OR_AMD64 |
17 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check | 10 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check |
18 | #define USE_HW_SHA | 11 | #define USE_HW_SHA |
@@ -20,19 +13,14 @@ | |||
20 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ | 13 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ |
21 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) | 14 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) |
22 | #define USE_HW_SHA | 15 | #define USE_HW_SHA |
23 | #if !defined(_INTEL_COMPILER) | 16 | #if !defined(__INTEL_COMPILER) |
24 | // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) | 17 | // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) |
25 | #if !defined(__SHA__) || !defined(__SSSE3__) | 18 | #if !defined(__SHA__) || !defined(__SSSE3__) |
26 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) | 19 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) |
27 | #endif | 20 | #endif |
28 | #endif | 21 | #endif |
29 | #elif defined(_MSC_VER) | 22 | #elif defined(_MSC_VER) |
30 | #ifdef USE_MY_MM | 23 | #if (_MSC_VER >= 1900) |
31 | #define USE_VER_MIN 1300 | ||
32 | #else | ||
33 | #define USE_VER_MIN 1900 | ||
34 | #endif | ||
35 | #if (_MSC_VER >= USE_VER_MIN) | ||
36 | #define USE_HW_SHA | 24 | #define USE_HW_SHA |
37 | #else | 25 | #else |
38 | #define Z7_USE_HW_SHA_STUB | 26 | #define Z7_USE_HW_SHA_STUB |
@@ -47,23 +35,20 @@ | |||
47 | 35 | ||
48 | // #pragma message("Sha256 HW") | 36 | // #pragma message("Sha256 HW") |
49 | 37 | ||
38 | |||
39 | |||
40 | |||
50 | // sse/sse2/ssse3: | 41 | // sse/sse2/ssse3: |
51 | #include <tmmintrin.h> | 42 | #include <tmmintrin.h> |
52 | // sha*: | 43 | // sha*: |
53 | #include <immintrin.h> | 44 | #include <immintrin.h> |
54 | 45 | ||
55 | #if defined (__clang__) && defined(_MSC_VER) | 46 | #if defined (__clang__) && defined(_MSC_VER) |
56 | // #if !defined(__SSSE3__) | ||
57 | // #endif | ||
58 | #if !defined(__SHA__) | 47 | #if !defined(__SHA__) |
59 | #include <shaintrin.h> | 48 | #include <shaintrin.h> |
60 | #endif | 49 | #endif |
61 | #else | 50 | #else |
62 | 51 | ||
63 | #ifdef USE_MY_MM | ||
64 | #include "My_mm.h" | ||
65 | #endif | ||
66 | |||
67 | #endif | 52 | #endif |
68 | 53 | ||
69 | /* | 54 | /* |
@@ -91,60 +76,44 @@ SHA: | |||
91 | extern | 76 | extern |
92 | MY_ALIGN(64) | 77 | MY_ALIGN(64) |
93 | const UInt32 SHA256_K_ARRAY[64]; | 78 | const UInt32 SHA256_K_ARRAY[64]; |
94 | |||
95 | #define K SHA256_K_ARRAY | 79 | #define K SHA256_K_ARRAY |
96 | 80 | ||
97 | 81 | ||
98 | #define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); | 82 | #define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); |
99 | #define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); | 83 | #define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); |
100 | #define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); | 84 | #define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); |
101 | |||
102 | 85 | ||
103 | #define LOAD_SHUFFLE(m, k) \ | 86 | #define LOAD_SHUFFLE(m, k) \ |
104 | m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ | 87 | m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ |
105 | m = _mm_shuffle_epi8(m, mask); \ | 88 | m = _mm_shuffle_epi8(m, mask); \ |
106 | 89 | ||
107 | #define SM1(g0, g1, g2, g3) \ | 90 | #define NNN(m0, m1, m2, m3) |
108 | SHA256_MSG1(g3, g0); \ | ||
109 | 91 | ||
110 | #define SM2(g0, g1, g2, g3) \ | 92 | #define SM1(m1, m2, m3, m0) \ |
111 | tmp = _mm_alignr_epi8(g1, g0, 4); \ | 93 | SHA256_MSG1(m0, m1); \ |
112 | ADD_EPI32(g2, tmp) \ | ||
113 | SHA25G_MSG2(g2, g1); \ | ||
114 | |||
115 | // #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k) | ||
116 | // #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1) | ||
117 | |||
118 | |||
119 | #define NNN(g0, g1, g2, g3) | ||
120 | 94 | ||
95 | #define SM2(m2, m3, m0, m1) \ | ||
96 | ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \ | ||
97 | SHA256_MSG2(m0, m3); \ | ||
121 | 98 | ||
122 | #define RND2(t0, t1) \ | 99 | #define RND2(t0, t1) \ |
123 | t0 = _mm_sha256rnds2_epu32(t0, t1, msg); | 100 | t0 = _mm_sha256rnds2_epu32(t0, t1, msg); |
124 | 101 | ||
125 | #define RND2_0(m, k) \ | ||
126 | msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \ | ||
127 | RND2(state0, state1); \ | ||
128 | msg = _mm_shuffle_epi32(msg, 0x0E); \ | ||
129 | 102 | ||
130 | 103 | ||
131 | #define RND2_1 \ | 104 | #define R4(k, m0, m1, m2, m3, OP0, OP1) \ |
105 | msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \ | ||
106 | RND2(state0, state1); \ | ||
107 | msg = _mm_shuffle_epi32(msg, 0x0E); \ | ||
108 | OP0(m0, m1, m2, m3) \ | ||
132 | RND2(state1, state0); \ | 109 | RND2(state1, state0); \ |
133 | 110 | OP1(m0, m1, m2, m3) \ | |
134 | |||
135 | // We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2 | ||
136 | |||
137 | #define R4(k, g0, g1, g2, g3, OP0, OP1) \ | ||
138 | RND2_0(g0, k) \ | ||
139 | OP0(g0, g1, g2, g3) \ | ||
140 | RND2_1 \ | ||
141 | OP1(g0, g1, g2, g3) \ | ||
142 | 111 | ||
143 | #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ | 112 | #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ |
144 | R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ | 113 | R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ |
145 | R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ | 114 | R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ |
146 | R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ | 115 | R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ |
147 | R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ | 116 | R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ |
148 | 117 | ||
149 | #define PREPARE_STATE \ | 118 | #define PREPARE_STATE \ |
150 | tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ | 119 | tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ |
@@ -161,8 +130,9 @@ ATTRIB_SHA | |||
161 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) | 130 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) |
162 | { | 131 | { |
163 | const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); | 132 | const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); |
164 | __m128i tmp; | 133 | |
165 | __m128i state0, state1; | 134 | |
135 | __m128i tmp, state0, state1; | ||
166 | 136 | ||
167 | if (numBlocks == 0) | 137 | if (numBlocks == 0) |
168 | return; | 138 | return; |
@@ -262,22 +232,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
262 | #define _ARM_USE_NEW_NEON_INTRINSICS | 232 | #define _ARM_USE_NEW_NEON_INTRINSICS |
263 | #endif | 233 | #endif |
264 | 234 | ||
265 | |||
266 | |||
267 | |||
268 | |||
269 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) | 235 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) |
270 | #include <arm64_neon.h> | 236 | #include <arm64_neon.h> |
271 | #else | 237 | #else |
272 | 238 | ||
273 | |||
274 | |||
275 | |||
276 | |||
277 | |||
278 | |||
279 | |||
280 | |||
281 | #if defined(__clang__) && __clang_major__ < 16 | 239 | #if defined(__clang__) && __clang_major__ < 16 |
282 | #if !defined(__ARM_FEATURE_SHA2) && \ | 240 | #if !defined(__ARM_FEATURE_SHA2) && \ |
283 | !defined(__ARM_FEATURE_CRYPTO) | 241 | !defined(__ARM_FEATURE_CRYPTO) |
@@ -324,41 +282,70 @@ typedef uint32x4_t v128; | |||
324 | // typedef __n128 v128; // MSVC | 282 | // typedef __n128 v128; // MSVC |
325 | 283 | ||
326 | #ifdef MY_CPU_BE | 284 | #ifdef MY_CPU_BE |
327 | #define MY_rev32_for_LE(x) | 285 | #define MY_rev32_for_LE(x) x |
328 | #else | 286 | #else |
329 | #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) | 287 | #define MY_rev32_for_LE(x) vrev32q_u8(x) |
330 | #endif | 288 | #endif |
331 | 289 | ||
332 | #define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) | 290 | #if 1 // 0 for debug |
333 | #define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) | 291 | // for arm32: for some reason it works slower than direct code
292 | /* | ||
293 | for arm32 it generates: | ||
294 | MSVC-2022, GCC-9: | ||
295 | vld1.32 {d18,d19}, [r10] | ||
296 | vst1.32 {d4,d5}, [r3] | ||
297 | vld1.8 {d20-d21}, [r4] | ||
298 | there is no align hint (like [r10:128]), so the instruction allows unaligned access
299 | */ | ||
300 | #define LOAD_128_32(_p) vld1q_u32(_p) | ||
301 | #define LOAD_128_8(_p) vld1q_u8 (_p) | ||
302 | #define STORE_128_32(_p, _v) vst1q_u32(_p, _v) | ||
303 | #else | ||
304 | /* | ||
305 | for arm32: | ||
306 | MSVC-2022: | ||
307 | vldm r10,{d18,d19} | ||
308 | vstm r3,{d4,d5} | ||
309 | does it require strict alignment? | ||
310 | GCC-9: | ||
311 | vld1.64 {d30-d31}, [r0:64] | ||
312 | vldr d28, [r0, #16] | ||
313 | vldr d29, [r0, #24] | ||
314 | vst1.64 {d30-d31}, [r0:64] | ||
315 | vstr d28, [r0, #16] | ||
316 | vstr d29, [r0, #24] | ||
317 | there is an align hint [r0:64], so it seems to require 64-bit alignment.
318 | */ | ||
319 | #define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p)) | ||
320 | #define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p)) | ||
321 | #define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v) | ||
322 | #endif | ||
334 | 323 | ||
335 | #define LOAD_SHUFFLE(m, k) \ | 324 | #define LOAD_SHUFFLE(m, k) \ |
336 | m = LOAD_128((data + (k) * 16)); \ | 325 | m = vreinterpretq_u32_u8( \ |
337 | MY_rev32_for_LE(m); \ | 326 | MY_rev32_for_LE( \ |
327 | LOAD_128_8(data + (k) * 16))); \ | ||
338 | 328 | ||
339 | // K array must be aligned to at least 16 bytes. | 329 | // K array must be aligned to at least 16 bytes.
340 | extern | 330 | extern |
341 | MY_ALIGN(64) | 331 | MY_ALIGN(64) |
342 | const UInt32 SHA256_K_ARRAY[64]; | 332 | const UInt32 SHA256_K_ARRAY[64]; |
343 | |||
344 | #define K SHA256_K_ARRAY | 333 | #define K SHA256_K_ARRAY |
345 | 334 | ||
346 | |||
347 | #define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src); | 335 | #define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src); |
348 | #define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); | 336 | #define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); |
349 | 337 | ||
350 | #define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0) | 338 | #define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0) |
351 | #define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1) | 339 | #define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1) |
352 | #define NNN(g0, g1, g2, g3) | 340 | #define NNN(m0, m1, m2, m3) |
353 | 341 | ||
354 | 342 | #define R4(k, m0, m1, m2, m3, OP0, OP1) \ | |
355 | #define R4(k, g0, g1, g2, g3, OP0, OP1) \ | 343 | msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \ |
356 | msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \ | ||
357 | tmp = state0; \ | 344 | tmp = state0; \ |
358 | state0 = vsha256hq_u32( state0, state1, msg ); \ | 345 | state0 = vsha256hq_u32( state0, state1, msg ); \ |
359 | state1 = vsha256h2q_u32( state1, tmp, msg ); \ | 346 | state1 = vsha256h2q_u32( state1, tmp, msg ); \ |
360 | OP0(g0, g1, g2, g3); \ | 347 | OP0(m0, m1, m2, m3); \ |
361 | OP1(g0, g1, g2, g3); \ | 348 | OP1(m0, m1, m2, m3); \ |
362 | 349 | ||
363 | 350 | ||
364 | #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ | 351 | #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ |
@@ -379,8 +366,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
379 | if (numBlocks == 0) | 366 | if (numBlocks == 0) |
380 | return; | 367 | return; |
381 | 368 | ||
382 | state0 = LOAD_128(&state[0]); | 369 | state0 = LOAD_128_32(&state[0]); |
383 | state1 = LOAD_128(&state[4]); | 370 | state1 = LOAD_128_32(&state[4]); |
384 | 371 | ||
385 | do | 372 | do |
386 | { | 373 | { |
@@ -408,8 +395,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
408 | } | 395 | } |
409 | while (--numBlocks); | 396 | while (--numBlocks); |
410 | 397 | ||
411 | STORE_128(&state[0], state0); | 398 | STORE_128_32(&state[0], state0); |
412 | STORE_128(&state[4], state1); | 399 | STORE_128_32(&state[4], state1); |
413 | } | 400 | } |
414 | 401 | ||
415 | #endif // USE_HW_SHA | 402 | #endif // USE_HW_SHA |
@@ -443,13 +430,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
443 | #endif | 430 | #endif |
444 | 431 | ||
445 | 432 | ||
446 | |||
447 | #undef K | 433 | #undef K |
448 | #undef RND2 | 434 | #undef RND2 |
449 | #undef RND2_0 | ||
450 | #undef RND2_1 | ||
451 | |||
452 | #undef MY_rev32_for_LE | 435 | #undef MY_rev32_for_LE |
436 | |||
453 | #undef NNN | 437 | #undef NNN |
454 | #undef LOAD_128 | 438 | #undef LOAD_128 |
455 | #undef STORE_128 | 439 | #undef STORE_128 |
@@ -457,7 +441,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
457 | #undef SM1 | 441 | #undef SM1 |
458 | #undef SM2 | 442 | #undef SM2 |
459 | 443 | ||
460 | #undef NNN | 444 | |
461 | #undef R4 | 445 | #undef R4 |
462 | #undef R16 | 446 | #undef R16 |
463 | #undef PREPARE_STATE | 447 | #undef PREPARE_STATE |
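The deleted comment ("3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2") still describes the scheduling after this rewrite. Expanding one possible instantiation, R4(k, m0,m1,m2,m3, SM1, SM2), under the macro definitions in this hunk (the actual R16 call sites sit outside this diff):

msg = _mm_add_epi32(m0, *(const __m128i *)(const void *)&K[(k) * 4]);
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);  /* rounds 4k, 4k+1 */
msg = _mm_shuffle_epi32(msg, 0x0E);                   /* upper half down */
m3 = _mm_sha256msg1_epu32(m3, m0);                    /* SM1(m0,m1,m2,m3) */
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);  /* rounds 4k+2, 4k+3 */
m2 = _mm_add_epi32(m2, _mm_alignr_epi8(m1, m0, 4));   /* SM2(m0,m1,m2,m3) */
m2 = _mm_sha256msg2_epu32(m2, m1);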
diff --git a/C/Sha3.c b/C/Sha3.c new file mode 100644 index 0000000..be972d6 --- /dev/null +++ b/C/Sha3.c | |||
@@ -0,0 +1,359 @@ | |||
1 | /* Sha3.c -- SHA-3 Hash | ||
2 | : Igor Pavlov : Public domain | ||
3 | This code is based on public domain code from Wei Dai's Crypto++ library. */ | ||
4 | |||
5 | #include "Precomp.h" | ||
6 | |||
7 | #include <string.h> | ||
8 | |||
9 | #include "Sha3.h" | ||
10 | #include "RotateDefs.h" | ||
11 | #include "CpuArch.h" | ||
12 | |||
13 | #define U64C(x) UINT64_CONST(x) | ||
14 | |||
15 | static | ||
16 | MY_ALIGN(64) | ||
17 | const UInt64 SHA3_K_ARRAY[24] = | ||
18 | { | ||
19 | U64C(0x0000000000000001), U64C(0x0000000000008082), | ||
20 | U64C(0x800000000000808a), U64C(0x8000000080008000), | ||
21 | U64C(0x000000000000808b), U64C(0x0000000080000001), | ||
22 | U64C(0x8000000080008081), U64C(0x8000000000008009), | ||
23 | U64C(0x000000000000008a), U64C(0x0000000000000088), | ||
24 | U64C(0x0000000080008009), U64C(0x000000008000000a), | ||
25 | U64C(0x000000008000808b), U64C(0x800000000000008b), | ||
26 | U64C(0x8000000000008089), U64C(0x8000000000008003), | ||
27 | U64C(0x8000000000008002), U64C(0x8000000000000080), | ||
28 | U64C(0x000000000000800a), U64C(0x800000008000000a), | ||
29 | U64C(0x8000000080008081), U64C(0x8000000000008080), | ||
30 | U64C(0x0000000080000001), U64C(0x8000000080008008) | ||
31 | }; | ||
32 | |||
33 | void Sha3_Init(CSha3 *p) | ||
34 | { | ||
35 | p->count = 0; | ||
36 | memset(p->state, 0, sizeof(p->state)); | ||
37 | } | ||
38 | |||
39 | #define GET_state(i, a) UInt64 a = state[i]; | ||
40 | #define SET_state(i, a) state[i] = a; | ||
41 | |||
42 | #define LS_5(M, i, a0,a1,a2,a3,a4) \ | ||
43 | M ((i) * 5 , a0) \ | ||
44 | M ((i) * 5 + 1, a1) \ | ||
45 | M ((i) * 5 + 2, a2) \ | ||
46 | M ((i) * 5 + 3, a3) \ | ||
47 | M ((i) * 5 + 4, a4) \ | ||
48 | |||
49 | #define LS_25(M) \ | ||
50 | LS_5 (M, 0, a50, a51, a52, a53, a54) \ | ||
51 | LS_5 (M, 1, a60, a61, a62, a63, a64) \ | ||
52 | LS_5 (M, 2, a70, a71, a72, a73, a74) \ | ||
53 | LS_5 (M, 3, a80, a81, a82, a83, a84) \ | ||
54 | LS_5 (M, 4, a90, a91, a92, a93, a94) \ | ||
55 | |||
56 | |||
57 | #define XOR_1(i, a0) \ | ||
58 | a0 ^= GetUi64(data + (i) * 8); \ | ||
59 | |||
60 | #define XOR_4(i, a0,a1,a2,a3) \ | ||
61 | XOR_1 ((i) , a0); \ | ||
62 | XOR_1 ((i) + 1, a1); \ | ||
63 | XOR_1 ((i) + 2, a2); \ | ||
64 | XOR_1 ((i) + 3, a3); \ | ||
65 | |||
66 | #define D(d,b1,b2) \ | ||
67 | d = b1 ^ Z7_ROTL64(b2, 1); | ||
68 | |||
69 | #define D5 \ | ||
70 | D (d0, c4, c1) \ | ||
71 | D (d1, c0, c2) \ | ||
72 | D (d2, c1, c3) \ | ||
73 | D (d3, c2, c4) \ | ||
74 | D (d4, c3, c0) \ | ||
75 | |||
76 | #define C0(c,a,d) \ | ||
77 | c = a ^ d; \ | ||
78 | |||
79 | #define C(c,a,d,k) \ | ||
80 | c = a ^ d; \ | ||
81 | c = Z7_ROTL64(c, k); \ | ||
82 | |||
83 | #define E4(e1,e2,e3,e4) \ | ||
84 | e1 = c1 ^ (~c2 & c3); \ | ||
85 | e2 = c2 ^ (~c3 & c4); \ | ||
86 | e3 = c3 ^ (~c4 & c0); \ | ||
87 | e4 = c4 ^ (~c0 & c1); \ | ||
88 | |||
89 | #define CK( v0,w0, \ | ||
90 | v1,w1,k1, \ | ||
91 | v2,w2,k2, \ | ||
92 | v3,w3,k3, \ | ||
93 | v4,w4,k4, e0,e1,e2,e3,e4, keccak_c) \ | ||
94 | C0(c0,v0,w0) \ | ||
95 | C (c1,v1,w1,k1) \ | ||
96 | C (c2,v2,w2,k2) \ | ||
97 | C (c3,v3,w3,k3) \ | ||
98 | C (c4,v4,w4,k4) \ | ||
99 | e0 = c0 ^ (~c1 & c2) ^ keccak_c; \ | ||
100 | E4(e1,e2,e3,e4) \ | ||
101 | |||
102 | #define CE( v0,w0,k0, \ | ||
103 | v1,w1,k1, \ | ||
104 | v2,w2,k2, \ | ||
105 | v3,w3,k3, \ | ||
106 | v4,w4,k4, e0,e1,e2,e3,e4) \ | ||
107 | C (c0,v0,w0,k0) \ | ||
108 | C (c1,v1,w1,k1) \ | ||
109 | C (c2,v2,w2,k2) \ | ||
110 | C (c3,v3,w3,k3) \ | ||
111 | C (c4,v4,w4,k4) \ | ||
112 | e0 = c0 ^ (~c1 & c2); \ | ||
113 | E4(e1,e2,e3,e4) \ | ||
114 | |||
115 | // numBlocks != 0 | ||
116 | static | ||
117 | Z7_NO_INLINE | ||
118 | void Z7_FASTCALL Sha3_UpdateBlocks(UInt64 state[SHA3_NUM_STATE_WORDS], | ||
119 | const Byte *data, size_t numBlocks, size_t blockSize) | ||
120 | { | ||
121 | LS_25 (GET_state) | ||
122 | |||
123 | do | ||
124 | { | ||
125 | unsigned round; | ||
126 | XOR_4 ( 0, a50, a51, a52, a53) | ||
127 | XOR_4 ( 4, a54, a60, a61, a62) | ||
128 | XOR_1 ( 8, a63) | ||
129 | if (blockSize > 8 * 9) { XOR_4 ( 9, a64, a70, a71, a72) // sha3-384 | ||
130 | if (blockSize > 8 * 13) { XOR_4 (13, a73, a74, a80, a81) // sha3-256 | ||
131 | if (blockSize > 8 * 17) { XOR_1 (17, a82) // sha3-224 | ||
132 | if (blockSize > 8 * 18) { XOR_1 (18, a83) // shake128 | ||
133 | XOR_1 (19, a84) | ||
134 | XOR_1 (20, a90) }}}} | ||
135 | data += blockSize; | ||
136 | |||
137 | for (round = 0; round < 24; round += 2) | ||
138 | { | ||
139 | UInt64 c0, c1, c2, c3, c4; | ||
140 | UInt64 d0, d1, d2, d3, d4; | ||
141 | UInt64 e50, e51, e52, e53, e54; | ||
142 | UInt64 e60, e61, e62, e63, e64; | ||
143 | UInt64 e70, e71, e72, e73, e74; | ||
144 | UInt64 e80, e81, e82, e83, e84; | ||
145 | UInt64 e90, e91, e92, e93, e94; | ||
146 | |||
147 | c0 = a50^a60^a70^a80^a90; | ||
148 | c1 = a51^a61^a71^a81^a91; | ||
149 | c2 = a52^a62^a72^a82^a92; | ||
150 | c3 = a53^a63^a73^a83^a93; | ||
151 | c4 = a54^a64^a74^a84^a94; | ||
152 | D5 | ||
153 | CK( a50, d0, | ||
154 | a61, d1, 44, | ||
155 | a72, d2, 43, | ||
156 | a83, d3, 21, | ||
157 | a94, d4, 14, e50, e51, e52, e53, e54, SHA3_K_ARRAY[round]) | ||
158 | CE( a53, d3, 28, | ||
159 | a64, d4, 20, | ||
160 | a70, d0, 3, | ||
161 | a81, d1, 45, | ||
162 | a92, d2, 61, e60, e61, e62, e63, e64) | ||
163 | CE( a51, d1, 1, | ||
164 | a62, d2, 6, | ||
165 | a73, d3, 25, | ||
166 | a84, d4, 8, | ||
167 | a90, d0, 18, e70, e71, e72, e73, e74) | ||
168 | CE( a54, d4, 27, | ||
169 | a60, d0, 36, | ||
170 | a71, d1, 10, | ||
171 | a82, d2, 15, | ||
172 | a93, d3, 56, e80, e81, e82, e83, e84) | ||
173 | CE( a52, d2, 62, | ||
174 | a63, d3, 55, | ||
175 | a74, d4, 39, | ||
176 | a80, d0, 41, | ||
177 | a91, d1, 2, e90, e91, e92, e93, e94) | ||
178 | |||
179 | // ---------- ROUND + 1 ---------- | ||
180 | |||
181 | c0 = e50^e60^e70^e80^e90; | ||
182 | c1 = e51^e61^e71^e81^e91; | ||
183 | c2 = e52^e62^e72^e82^e92; | ||
184 | c3 = e53^e63^e73^e83^e93; | ||
185 | c4 = e54^e64^e74^e84^e94; | ||
186 | D5 | ||
187 | CK( e50, d0, | ||
188 | e61, d1, 44, | ||
189 | e72, d2, 43, | ||
190 | e83, d3, 21, | ||
191 | e94, d4, 14, a50, a51, a52, a53, a54, SHA3_K_ARRAY[(size_t)round + 1]) | ||
192 | CE( e53, d3, 28, | ||
193 | e64, d4, 20, | ||
194 | e70, d0, 3, | ||
195 | e81, d1, 45, | ||
196 | e92, d2, 61, a60, a61, a62, a63, a64) | ||
197 | CE( e51, d1, 1, | ||
198 | e62, d2, 6, | ||
199 | e73, d3, 25, | ||
200 | e84, d4, 8, | ||
201 | e90, d0, 18, a70, a71, a72, a73, a74) | ||
202 | CE (e54, d4, 27, | ||
203 | e60, d0, 36, | ||
204 | e71, d1, 10, | ||
205 | e82, d2, 15, | ||
206 | e93, d3, 56, a80, a81, a82, a83, a84) | ||
207 | CE (e52, d2, 62, | ||
208 | e63, d3, 55, | ||
209 | e74, d4, 39, | ||
210 | e80, d0, 41, | ||
211 | e91, d1, 2, a90, a91, a92, a93, a94) | ||
212 | } | ||
213 | } | ||
214 | while (--numBlocks); | ||
215 | |||
216 | LS_25 (SET_state) | ||
217 | } | ||
218 | |||
219 | |||
220 | #define Sha3_UpdateBlock(p) \ | ||
221 | Sha3_UpdateBlocks(p->state, p->buffer, 1, p->blockSize) | ||
222 | |||
223 | void Sha3_Update(CSha3 *p, const Byte *data, size_t size) | ||
224 | { | ||
225 | /* | ||
226 | for (;;) | ||
227 | { | ||
228 | if (size == 0) | ||
229 | return; | ||
230 | unsigned cur = p->blockSize - p->count; | ||
231 | if (cur > size) | ||
232 | cur = (unsigned)size; | ||
233 | size -= cur; | ||
234 | unsigned pos = p->count; | ||
235 | p->count = pos + cur; | ||
236 | while (pos & 7) | ||
237 | { | ||
238 | if (cur == 0) | ||
239 | return; | ||
240 | Byte *pb = &(((Byte *)p->state)[pos]); | ||
241 | *pb = (Byte)(*pb ^ *data++); | ||
242 | cur--; | ||
243 | pos++; | ||
244 | } | ||
245 | if (cur >= 8) | ||
246 | { | ||
247 | do | ||
248 | { | ||
249 | *(UInt64 *)(void *)&(((Byte *)p->state)[pos]) ^= GetUi64(data); | ||
250 | data += 8; | ||
251 | pos += 8; | ||
252 | cur -= 8; | ||
253 | } | ||
254 | while (cur >= 8); | ||
255 | } | ||
256 | if (pos != p->blockSize) | ||
257 | { | ||
258 | if (cur) | ||
259 | { | ||
260 | Byte *pb = &(((Byte *)p->state)[pos]); | ||
261 | do | ||
262 | { | ||
263 | *pb = (Byte)(*pb ^ *data++); | ||
264 | pb++; | ||
265 | } | ||
266 | while (--cur); | ||
267 | } | ||
268 | return; | ||
269 | } | ||
270 | Sha3_UpdateBlock(p->state); | ||
271 | p->count = 0; | ||
272 | } | ||
273 | */ | ||
274 | if (size == 0) | ||
275 | return; | ||
276 | { | ||
277 | const unsigned pos = p->count; | ||
278 | const unsigned num = p->blockSize - pos; | ||
279 | if (num > size) | ||
280 | { | ||
281 | p->count = pos + (unsigned)size; | ||
282 | memcpy(p->buffer + pos, data, size); | ||
283 | return; | ||
284 | } | ||
285 | if (pos != 0) | ||
286 | { | ||
287 | size -= num; | ||
288 | memcpy(p->buffer + pos, data, num); | ||
289 | data += num; | ||
290 | Sha3_UpdateBlock(p); | ||
291 | } | ||
292 | } | ||
293 | if (size >= p->blockSize) | ||
294 | { | ||
295 | const size_t numBlocks = size / p->blockSize; | ||
296 | const Byte *dataOld = data; | ||
297 | data += numBlocks * p->blockSize; | ||
298 | size = (size_t)(dataOld + size - data); | ||
299 | Sha3_UpdateBlocks(p->state, dataOld, numBlocks, p->blockSize); | ||
300 | } | ||
301 | p->count = (unsigned)size; | ||
302 | if (size) | ||
303 | memcpy(p->buffer, data, size); | ||
304 | } | ||
305 | |||
306 | |||
307 | // we support only (digestSize % 4 == 0) cases | ||
308 | void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake) | ||
309 | { | ||
310 | memset(p->buffer + p->count, 0, p->blockSize - p->count); | ||
311 | // we write the domain-separation bit markers from low to high in the current byte: | ||
312 | // - if sha-3 : 2 bits : 0,1 | ||
313 | // - if shake : 4 bits : 1,1,1,1 | ||
314 | // then we write the first padding bit 1 to the same byte (giving 0x06 or 0x1f). | ||
315 | // And we write the final padding bit 1 to the highest bit of the last byte of the block. | ||
316 | p->buffer[p->count] = (Byte)(shake ? 0x1f : 0x06); | ||
317 | // we need the xor operation (^= 0x80) here, because the 0x80 bit must go | ||
318 | // into the same byte as (0x1f : 0x06) when (p->count == p->blockSize - 1) !!! | ||
319 | p->buffer[p->blockSize - 1] ^= 0x80; | ||
320 | /* | ||
321 | ((Byte *)p->state)[p->count] ^= (Byte)(shake ? 0x1f : 0x06); | ||
322 | ((Byte *)p->state)[p->blockSize - 1] ^= 0x80; | ||
323 | */ | ||
324 | Sha3_UpdateBlock(p); | ||
325 | #if 1 && defined(MY_CPU_LE) | ||
326 | memcpy(digest, p->state, digestSize); | ||
327 | #else | ||
328 | { | ||
329 | const unsigned numWords = digestSize >> 3; | ||
330 | unsigned i; | ||
331 | for (i = 0; i < numWords; i++) | ||
332 | { | ||
333 | const UInt64 v = p->state[i]; | ||
334 | SetUi64(digest, v) | ||
335 | digest += 8; | ||
336 | } | ||
337 | if (digestSize & 4) // for SHA3-224 | ||
338 | { | ||
339 | const UInt32 v = (UInt32)p->state[numWords]; | ||
340 | SetUi32(digest, v) | ||
341 | } | ||
342 | } | ||
343 | #endif | ||
344 | Sha3_Init(p); | ||
345 | } | ||
346 | |||
347 | #undef GET_state | ||
348 | #undef SET_state | ||
349 | #undef LS_5 | ||
350 | #undef LS_25 | ||
351 | #undef XOR_1 | ||
352 | #undef XOR_4 | ||
353 | #undef D | ||
354 | #undef D5 | ||
355 | #undef C0 | ||
356 | #undef C | ||
357 | #undef E4 | ||
358 | #undef CK | ||
359 | #undef CE | ||
diff --git a/C/Sha3.h b/C/Sha3.h new file mode 100644 index 0000000..c5909c9 --- /dev/null +++ b/C/Sha3.h | |||
@@ -0,0 +1,36 @@ | |||
1 | /* Sha3.h -- SHA-3 Hash | ||
2 | : Igor Pavlov : Public domain */ | ||
3 | |||
4 | #ifndef ZIP7_INC_SHA3_H | ||
5 | #define ZIP7_INC_SHA3_H | ||
6 | |||
7 | #include "7zTypes.h" | ||
8 | |||
9 | EXTERN_C_BEGIN | ||
10 | |||
11 | #define SHA3_NUM_STATE_WORDS 25 | ||
12 | |||
13 | #define SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(digestSize) \ | ||
14 | (SHA3_NUM_STATE_WORDS * 8 - (digestSize) * 2) | ||
15 | |||
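// For reference (standard Keccak rates, 200 - 2 * digestSize, in bytes):
//   SHA3-224: 144, SHA3-256: 136, SHA3-384: 104, SHA3-512: 72,
//   SHAKE128 (digestSize 16): 168, SHAKE256 (digestSize 32): 136.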
16 | typedef struct | ||
17 | { | ||
18 | UInt32 count; // < blockSize | ||
19 | UInt32 blockSize; // <= SHA3_NUM_STATE_WORDS * 8 | ||
20 | UInt64 _pad1[3]; | ||
21 | // we want 32-byte alignment here | ||
22 | UInt64 state[SHA3_NUM_STATE_WORDS]; | ||
23 | UInt64 _pad2[3]; | ||
24 | // we want 64-byte alignment here | ||
25 | Byte buffer[SHA3_NUM_STATE_WORDS * 8]; // last bytes will be unused with predefined blockSize values | ||
26 | } CSha3; | ||
27 | |||
28 | #define Sha3_SET_blockSize(p, blockSize) { (p)->blockSize = (blockSize); } | ||
29 | |||
30 | void Sha3_Init(CSha3 *p); | ||
31 | void Sha3_Update(CSha3 *p, const Byte *data, size_t size); | ||
32 | void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake); | ||
33 | |||
34 | EXTERN_C_END | ||
35 | |||
36 | #endif | ||
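A minimal usage sketch for the header above, computing a SHA3-256 digest. It assumes Sha3_Init() resets count and state while leaving blockSize as set by Sha3_SET_blockSize(); the helper name example_sha3_256 is hypothetical:

#include "Sha3.h"

static void example_sha3_256(const Byte *data, size_t size, Byte digest[32])
{
  CSha3 c;
  Sha3_SET_blockSize(&c, SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(32)); // rate = 136 bytes
  Sha3_Init(&c);
  Sha3_Update(&c, data, size);
  Sha3_Final(&c, digest, 32, 0); // shake = 0 : SHA-3 domain-separation padding
}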
diff --git a/C/Sha512.c b/C/Sha512.c new file mode 100644 index 0000000..04827d6 --- /dev/null +++ b/C/Sha512.c | |||
@@ -0,0 +1,618 @@ | |||
1 | /* Sha512.c -- SHA-512 Hash | ||
2 | : Igor Pavlov : Public domain | ||
3 | This code is based on public domain code from Wei Dai's Crypto++ library. */ | ||
4 | |||
5 | #include "Precomp.h" | ||
6 | |||
7 | #include <string.h> | ||
8 | |||
9 | #include "Sha512.h" | ||
10 | #include "RotateDefs.h" | ||
11 | #include "CpuArch.h" | ||
12 | |||
13 | #ifdef MY_CPU_X86_OR_AMD64 | ||
14 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \ | ||
15 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \ | ||
16 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000) \ | ||
17 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) \ | ||
18 | || defined(_MSC_VER) && (_MSC_VER >= 1940) | ||
19 | #define Z7_COMPILER_SHA512_SUPPORTED | ||
20 | #endif | ||
21 | #elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE) | ||
22 | #if defined(__ARM_FEATURE_SHA512) | ||
23 | #define Z7_COMPILER_SHA512_SUPPORTED | ||
24 | #else | ||
25 | #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \ | ||
26 | || defined(__GNUC__) && (__GNUC__ >= 9) \ | ||
27 | ) \ | ||
28 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it | ||
29 | #define Z7_COMPILER_SHA512_SUPPORTED | ||
30 | #endif | ||
31 | #endif | ||
32 | #endif | ||
33 | |||
34 | |||
35 | |||
36 | |||
37 | |||
38 | |||
39 | |||
40 | |||
41 | |||
42 | |||
43 | |||
44 | |||
45 | |||
46 | |||
47 | void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
48 | |||
49 | #ifdef Z7_COMPILER_SHA512_SUPPORTED | ||
50 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
51 | |||
52 | static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS = Sha512_UpdateBlocks; | ||
53 | static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS_HW; | ||
54 | |||
55 | #define SHA512_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks | ||
56 | #else | ||
57 | #define SHA512_UPDATE_BLOCKS(p) Sha512_UpdateBlocks | ||
58 | #endif | ||
59 | |||
60 | |||
61 | BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo) | ||
62 | { | ||
63 | SHA512_FUNC_UPDATE_BLOCKS func = Sha512_UpdateBlocks; | ||
64 | |||
65 | #ifdef Z7_COMPILER_SHA512_SUPPORTED | ||
66 | if (algo != SHA512_ALGO_SW) | ||
67 | { | ||
68 | if (algo == SHA512_ALGO_DEFAULT) | ||
69 | func = g_SHA512_FUNC_UPDATE_BLOCKS; | ||
70 | else | ||
71 | { | ||
72 | if (algo != SHA512_ALGO_HW) | ||
73 | return False; | ||
74 | func = g_SHA512_FUNC_UPDATE_BLOCKS_HW; | ||
75 | if (!func) | ||
76 | return False; | ||
77 | } | ||
78 | } | ||
79 | #else | ||
80 | if (algo > 1) | ||
81 | return False; | ||
82 | #endif | ||
83 | |||
84 | p->v.vars.func_UpdateBlocks = func; | ||
85 | return True; | ||
86 | } | ||
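A hedged sketch of caller-side dispatch using Sha512_SetFunction(), per the return-value contract documented in Sha512.h; the helper name example_select_hw is hypothetical:

#include "Sha512.h"

// prefer the HW path when it was detected, else keep the default dispatch
static void example_select_hw(CSha512 *p)
{
  Sha512_Init(p, SHA512_DIGEST_SIZE);
  if (!Sha512_SetFunction(p, SHA512_ALGO_HW))
    Sha512_SetFunction(p, SHA512_ALGO_DEFAULT); // DEFAULT cannot fail
}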
87 | |||
88 | |||
89 | /* define Z7_SHA512_UNROLL (below) for speed optimization */ | ||
90 | |||
91 | #if 0 // 1 for size optimization | ||
92 | #define STEP_PRE 1 | ||
93 | #define STEP_MAIN 1 | ||
94 | #else | ||
95 | #define STEP_PRE 2 | ||
96 | #define STEP_MAIN 4 | ||
97 | // #define Z7_SHA512_UNROLL | ||
98 | #endif | ||
99 | |||
100 | #undef Z7_SHA512_BIG_W | ||
101 | #if STEP_MAIN != 16 | ||
102 | #define Z7_SHA512_BIG_W | ||
103 | #endif | ||
104 | |||
105 | |||
106 | #define U64C(x) UINT64_CONST(x) | ||
107 | |||
108 | static MY_ALIGN(64) const UInt64 SHA512_INIT_ARRAYS[4][8] = { | ||
109 | { U64C(0x8c3d37c819544da2), U64C(0x73e1996689dcd4d6), U64C(0x1dfab7ae32ff9c82), U64C(0x679dd514582f9fcf), | ||
110 | U64C(0x0f6d2b697bd44da8), U64C(0x77e36f7304c48942), U64C(0x3f9d85a86a1d36c8), U64C(0x1112e6ad91d692a1) | ||
111 | }, | ||
112 | { U64C(0x22312194fc2bf72c), U64C(0x9f555fa3c84c64c2), U64C(0x2393b86b6f53b151), U64C(0x963877195940eabd), | ||
113 | U64C(0x96283ee2a88effe3), U64C(0xbe5e1e2553863992), U64C(0x2b0199fc2c85b8aa), U64C(0x0eb72ddc81c52ca2) | ||
114 | }, | ||
115 | { U64C(0xcbbb9d5dc1059ed8), U64C(0x629a292a367cd507), U64C(0x9159015a3070dd17), U64C(0x152fecd8f70e5939), | ||
116 | U64C(0x67332667ffc00b31), U64C(0x8eb44a8768581511), U64C(0xdb0c2e0d64f98fa7), U64C(0x47b5481dbefa4fa4) | ||
117 | }, | ||
118 | { U64C(0x6a09e667f3bcc908), U64C(0xbb67ae8584caa73b), U64C(0x3c6ef372fe94f82b), U64C(0xa54ff53a5f1d36f1), | ||
119 | U64C(0x510e527fade682d1), U64C(0x9b05688c2b3e6c1f), U64C(0x1f83d9abfb41bd6b), U64C(0x5be0cd19137e2179) | ||
120 | }}; | ||
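// The four rows above are the standard FIPS 180-4 initial hash values,
// selected by Sha512_InitState() via (digestSize >> 4) - 1:
// index 0: SHA-512/224, 1: SHA-512/256, 2: SHA-384, 3: SHA-512.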
121 | |||
122 | void Sha512_InitState(CSha512 *p, unsigned digestSize) | ||
123 | { | ||
124 | p->v.vars.count = 0; | ||
125 | memcpy(p->state, SHA512_INIT_ARRAYS[(size_t)(digestSize >> 4) - 1], sizeof(p->state)); | ||
126 | } | ||
127 | |||
128 | void Sha512_Init(CSha512 *p, unsigned digestSize) | ||
129 | { | ||
130 | p->v.vars.func_UpdateBlocks = | ||
131 | #ifdef Z7_COMPILER_SHA512_SUPPORTED | ||
132 | g_SHA512_FUNC_UPDATE_BLOCKS; | ||
133 | #else | ||
134 | NULL; | ||
135 | #endif | ||
136 | Sha512_InitState(p, digestSize); | ||
137 | } | ||
138 | |||
139 | #define S0(x) (Z7_ROTR64(x,28) ^ Z7_ROTR64(x,34) ^ Z7_ROTR64(x,39)) | ||
140 | #define S1(x) (Z7_ROTR64(x,14) ^ Z7_ROTR64(x,18) ^ Z7_ROTR64(x,41)) | ||
141 | #define s0(x) (Z7_ROTR64(x, 1) ^ Z7_ROTR64(x, 8) ^ (x >> 7)) | ||
142 | #define s1(x) (Z7_ROTR64(x,19) ^ Z7_ROTR64(x,61) ^ (x >> 6)) | ||
143 | |||
144 | #define Ch(x,y,z) (z^(x&(y^z))) | ||
145 | #define Maj(x,y,z) ((x&y)|(z&(x|y))) | ||
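// Note: these are the standard FIPS 180-4 SHA-512 functions:
// S0/S1 are the "big sigma" mixers of the working variables (a) and (e),
// s0/s1 are the "small sigma" functions of the message schedule,
// Ch is "choose" and Maj is "majority".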
146 | |||
147 | |||
148 | #define W_PRE(i) (W[(i) + (size_t)(j)] = GetBe64(data + ((size_t)(j) + i) * 8)) | ||
149 | |||
150 | #define blk2_main(j, i) s1(w(j, (i)-2)) + w(j, (i)-7) + s0(w(j, (i)-15)) | ||
151 | |||
152 | #ifdef Z7_SHA512_BIG_W | ||
153 | // we use +i instead of +(i) to change the operand order, which avoids a CLANG signed/unsigned compiler warning. | ||
154 | #define w(j, i) W[(size_t)(j) + i] | ||
155 | #define blk2(j, i) (w(j, i) = w(j, (i)-16) + blk2_main(j, i)) | ||
156 | #else | ||
157 | #if STEP_MAIN == 16 | ||
158 | #define w(j, i) W[(i) & 15] | ||
159 | #else | ||
160 | #define w(j, i) W[((size_t)(j) + (i)) & 15] | ||
161 | #endif | ||
162 | #define blk2(j, i) (w(j, i) += blk2_main(j, i)) | ||
163 | #endif | ||
164 | |||
165 | #define W_MAIN(i) blk2(j, i) | ||
166 | |||
167 | |||
168 | #define T1(wx, i) \ | ||
169 | tmp = h + S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ | ||
170 | h = g; \ | ||
171 | g = f; \ | ||
172 | f = e; \ | ||
173 | e = d + tmp; \ | ||
174 | tmp += S0(a) + Maj(a, b, c); \ | ||
175 | d = c; \ | ||
176 | c = b; \ | ||
177 | b = a; \ | ||
178 | a = tmp; \ | ||
179 | |||
180 | #define R1_PRE(i) T1( W_PRE, i) | ||
181 | #define R1_MAIN(i) T1( W_MAIN, i) | ||
182 | |||
183 | #if (!defined(Z7_SHA512_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4) | ||
184 | #define R2_MAIN(i) \ | ||
185 | R1_MAIN(i) \ | ||
186 | R1_MAIN(i + 1) \ | ||
187 | |||
188 | #endif | ||
189 | |||
190 | |||
191 | |||
192 | #if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8 | ||
193 | |||
194 | #define T4( a,b,c,d,e,f,g,h, wx, i) \ | ||
195 | h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ | ||
196 | tmp = h; \ | ||
197 | h += d; \ | ||
198 | d = tmp + S0(a) + Maj(a, b, c); \ | ||
199 | |||
200 | #define R4( wx, i) \ | ||
201 | T4 ( a,b,c,d,e,f,g,h, wx, (i )); \ | ||
202 | T4 ( d,a,b,c,h,e,f,g, wx, (i+1)); \ | ||
203 | T4 ( c,d,a,b,g,h,e,f, wx, (i+2)); \ | ||
204 | T4 ( b,c,d,a,f,g,h,e, wx, (i+3)); \ | ||
205 | |||
206 | #define R4_PRE(i) R4( W_PRE, i) | ||
207 | #define R4_MAIN(i) R4( W_MAIN, i) | ||
208 | |||
209 | |||
210 | #define T8( a,b,c,d,e,f,g,h, wx, i) \ | ||
211 | h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ | ||
212 | d += h; \ | ||
213 | h += S0(a) + Maj(a, b, c); \ | ||
214 | |||
215 | #define R8( wx, i) \ | ||
216 | T8 ( a,b,c,d,e,f,g,h, wx, i ); \ | ||
217 | T8 ( h,a,b,c,d,e,f,g, wx, i+1); \ | ||
218 | T8 ( g,h,a,b,c,d,e,f, wx, i+2); \ | ||
219 | T8 ( f,g,h,a,b,c,d,e, wx, i+3); \ | ||
220 | T8 ( e,f,g,h,a,b,c,d, wx, i+4); \ | ||
221 | T8 ( d,e,f,g,h,a,b,c, wx, i+5); \ | ||
222 | T8 ( c,d,e,f,g,h,a,b, wx, i+6); \ | ||
223 | T8 ( b,c,d,e,f,g,h,a, wx, i+7); \ | ||
224 | |||
225 | #define R8_PRE(i) R8( W_PRE, i) | ||
226 | #define R8_MAIN(i) R8( W_MAIN, i) | ||
227 | |||
228 | #endif | ||
229 | |||
230 | |||
231 | extern | ||
232 | MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80]; | ||
233 | MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80] = { | ||
234 | U64C(0x428a2f98d728ae22), U64C(0x7137449123ef65cd), U64C(0xb5c0fbcfec4d3b2f), U64C(0xe9b5dba58189dbbc), | ||
235 | U64C(0x3956c25bf348b538), U64C(0x59f111f1b605d019), U64C(0x923f82a4af194f9b), U64C(0xab1c5ed5da6d8118), | ||
236 | U64C(0xd807aa98a3030242), U64C(0x12835b0145706fbe), U64C(0x243185be4ee4b28c), U64C(0x550c7dc3d5ffb4e2), | ||
237 | U64C(0x72be5d74f27b896f), U64C(0x80deb1fe3b1696b1), U64C(0x9bdc06a725c71235), U64C(0xc19bf174cf692694), | ||
238 | U64C(0xe49b69c19ef14ad2), U64C(0xefbe4786384f25e3), U64C(0x0fc19dc68b8cd5b5), U64C(0x240ca1cc77ac9c65), | ||
239 | U64C(0x2de92c6f592b0275), U64C(0x4a7484aa6ea6e483), U64C(0x5cb0a9dcbd41fbd4), U64C(0x76f988da831153b5), | ||
240 | U64C(0x983e5152ee66dfab), U64C(0xa831c66d2db43210), U64C(0xb00327c898fb213f), U64C(0xbf597fc7beef0ee4), | ||
241 | U64C(0xc6e00bf33da88fc2), U64C(0xd5a79147930aa725), U64C(0x06ca6351e003826f), U64C(0x142929670a0e6e70), | ||
242 | U64C(0x27b70a8546d22ffc), U64C(0x2e1b21385c26c926), U64C(0x4d2c6dfc5ac42aed), U64C(0x53380d139d95b3df), | ||
243 | U64C(0x650a73548baf63de), U64C(0x766a0abb3c77b2a8), U64C(0x81c2c92e47edaee6), U64C(0x92722c851482353b), | ||
244 | U64C(0xa2bfe8a14cf10364), U64C(0xa81a664bbc423001), U64C(0xc24b8b70d0f89791), U64C(0xc76c51a30654be30), | ||
245 | U64C(0xd192e819d6ef5218), U64C(0xd69906245565a910), U64C(0xf40e35855771202a), U64C(0x106aa07032bbd1b8), | ||
246 | U64C(0x19a4c116b8d2d0c8), U64C(0x1e376c085141ab53), U64C(0x2748774cdf8eeb99), U64C(0x34b0bcb5e19b48a8), | ||
247 | U64C(0x391c0cb3c5c95a63), U64C(0x4ed8aa4ae3418acb), U64C(0x5b9cca4f7763e373), U64C(0x682e6ff3d6b2b8a3), | ||
248 | U64C(0x748f82ee5defb2fc), U64C(0x78a5636f43172f60), U64C(0x84c87814a1f0ab72), U64C(0x8cc702081a6439ec), | ||
249 | U64C(0x90befffa23631e28), U64C(0xa4506cebde82bde9), U64C(0xbef9a3f7b2c67915), U64C(0xc67178f2e372532b), | ||
250 | U64C(0xca273eceea26619c), U64C(0xd186b8c721c0c207), U64C(0xeada7dd6cde0eb1e), U64C(0xf57d4f7fee6ed178), | ||
251 | U64C(0x06f067aa72176fba), U64C(0x0a637dc5a2c898a6), U64C(0x113f9804bef90dae), U64C(0x1b710b35131c471b), | ||
252 | U64C(0x28db77f523047d84), U64C(0x32caab7b40c72493), U64C(0x3c9ebe0a15c9bebc), U64C(0x431d67c49c100d4c), | ||
253 | U64C(0x4cc5d4becb3e42b6), U64C(0x597f299cfc657e2a), U64C(0x5fcb6fab3ad6faec), U64C(0x6c44198c4a475817) | ||
254 | }; | ||
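// The table above holds the standard FIPS 180-4 SHA-512 round constants:
// the first 64 bits of the fractional parts of the cube roots of the first 80 primes.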
255 | |||
256 | #define K SHA512_K_ARRAY | ||
257 | |||
258 | Z7_NO_INLINE | ||
259 | void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks) | ||
260 | { | ||
261 | UInt64 W | ||
262 | #ifdef Z7_SHA512_BIG_W | ||
263 | [80]; | ||
264 | #else | ||
265 | [16]; | ||
266 | #endif | ||
267 | unsigned j; | ||
268 | UInt64 a,b,c,d,e,f,g,h; | ||
269 | #if !defined(Z7_SHA512_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) | ||
270 | UInt64 tmp; | ||
271 | #endif | ||
272 | |||
273 | if (numBlocks == 0) return; | ||
274 | |||
275 | a = state[0]; | ||
276 | b = state[1]; | ||
277 | c = state[2]; | ||
278 | d = state[3]; | ||
279 | e = state[4]; | ||
280 | f = state[5]; | ||
281 | g = state[6]; | ||
282 | h = state[7]; | ||
283 | |||
284 | do | ||
285 | { | ||
286 | |||
287 | for (j = 0; j < 16; j += STEP_PRE) | ||
288 | { | ||
289 | #if STEP_PRE > 4 | ||
290 | |||
291 | #if STEP_PRE < 8 | ||
292 | R4_PRE(0); | ||
293 | #else | ||
294 | R8_PRE(0); | ||
295 | #if STEP_PRE == 16 | ||
296 | R8_PRE(8); | ||
297 | #endif | ||
298 | #endif | ||
299 | |||
300 | #else | ||
301 | |||
302 | R1_PRE(0) | ||
303 | #if STEP_PRE >= 2 | ||
304 | R1_PRE(1) | ||
305 | #if STEP_PRE >= 4 | ||
306 | R1_PRE(2) | ||
307 | R1_PRE(3) | ||
308 | #endif | ||
309 | #endif | ||
310 | |||
311 | #endif | ||
312 | } | ||
313 | |||
314 | for (j = 16; j < 80; j += STEP_MAIN) | ||
315 | { | ||
316 | #if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8 | ||
317 | |||
318 | #if STEP_MAIN < 8 | ||
319 | R4_MAIN(0) | ||
320 | #else | ||
321 | R8_MAIN(0) | ||
322 | #if STEP_MAIN == 16 | ||
323 | R8_MAIN(8) | ||
324 | #endif | ||
325 | #endif | ||
326 | |||
327 | #else | ||
328 | |||
329 | R1_MAIN(0) | ||
330 | #if STEP_MAIN >= 2 | ||
331 | R1_MAIN(1) | ||
332 | #if STEP_MAIN >= 4 | ||
333 | R2_MAIN(2) | ||
334 | #if STEP_MAIN >= 8 | ||
335 | R2_MAIN(4) | ||
336 | R2_MAIN(6) | ||
337 | #if STEP_MAIN >= 16 | ||
338 | R2_MAIN(8) | ||
339 | R2_MAIN(10) | ||
340 | R2_MAIN(12) | ||
341 | R2_MAIN(14) | ||
342 | #endif | ||
343 | #endif | ||
344 | #endif | ||
345 | #endif | ||
346 | #endif | ||
347 | } | ||
348 | |||
349 | a += state[0]; state[0] = a; | ||
350 | b += state[1]; state[1] = b; | ||
351 | c += state[2]; state[2] = c; | ||
352 | d += state[3]; state[3] = d; | ||
353 | e += state[4]; state[4] = e; | ||
354 | f += state[5]; state[5] = f; | ||
355 | g += state[6]; state[6] = g; | ||
356 | h += state[7]; state[7] = h; | ||
357 | |||
358 | data += SHA512_BLOCK_SIZE; | ||
359 | } | ||
360 | while (--numBlocks); | ||
361 | } | ||
362 | |||
363 | |||
364 | #define Sha512_UpdateBlock(p) SHA512_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) | ||
365 | |||
366 | void Sha512_Update(CSha512 *p, const Byte *data, size_t size) | ||
367 | { | ||
368 | if (size == 0) | ||
369 | return; | ||
370 | { | ||
371 | const unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1); | ||
372 | const unsigned num = SHA512_BLOCK_SIZE - pos; | ||
373 | p->v.vars.count += size; | ||
374 | if (num > size) | ||
375 | { | ||
376 | memcpy(p->buffer + pos, data, size); | ||
377 | return; | ||
378 | } | ||
379 | if (pos != 0) | ||
380 | { | ||
381 | size -= num; | ||
382 | memcpy(p->buffer + pos, data, num); | ||
383 | data += num; | ||
384 | Sha512_UpdateBlock(p); | ||
385 | } | ||
386 | } | ||
387 | { | ||
388 | const size_t numBlocks = size >> 7; | ||
389 | // if (numBlocks) | ||
390 | SHA512_UPDATE_BLOCKS(p)(p->state, data, numBlocks); | ||
391 | size &= SHA512_BLOCK_SIZE - 1; | ||
392 | if (size == 0) | ||
393 | return; | ||
394 | data += (numBlocks << 7); | ||
395 | memcpy(p->buffer, data, size); | ||
396 | } | ||
397 | } | ||
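// Illustration of the branches above: with count == 0 and size == 300,
// pos is 0, so no buffered flush happens; numBlocks = 300 >> 7 = 2, so
// 256 bytes go through SHA512_UPDATE_BLOCKS and the remaining 44 bytes
// are copied into p->buffer for the next call.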
398 | |||
399 | |||
400 | void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize) | ||
401 | { | ||
402 | unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1); | ||
403 | p->buffer[pos++] = 0x80; | ||
404 | if (pos > (SHA512_BLOCK_SIZE - 8 * 2)) | ||
405 | { | ||
406 | while (pos != SHA512_BLOCK_SIZE) { p->buffer[pos++] = 0; } | ||
407 | // memset(&p->buf.buffer[pos], 0, SHA512_BLOCK_SIZE - pos); | ||
408 | Sha512_UpdateBlock(p); | ||
409 | pos = 0; | ||
410 | } | ||
411 | memset(&p->buffer[pos], 0, (SHA512_BLOCK_SIZE - 8 * 2) - pos); | ||
412 | { | ||
413 | const UInt64 numBits = p->v.vars.count << 3; | ||
414 | SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 2, 0) // high 64 bits of the 128-bit length: (p->v.vars.count >> (64 - 3)) == 0 here | ||
415 | SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 1, numBits) | ||
416 | } | ||
417 | Sha512_UpdateBlock(p); | ||
418 | #if 1 && defined(MY_CPU_BE) | ||
419 | memcpy(digest, p->state, digestSize); | ||
420 | #else | ||
421 | { | ||
422 | const unsigned numWords = digestSize >> 3; | ||
423 | unsigned i; | ||
424 | for (i = 0; i < numWords; i++) | ||
425 | { | ||
426 | const UInt64 v = p->state[i]; | ||
427 | SetBe64(digest, v) | ||
428 | digest += 8; | ||
429 | } | ||
430 | if (digestSize & 4) // digestSize == SHA512_224_DIGEST_SIZE | ||
431 | { | ||
432 | const UInt32 v = (UInt32)((p->state[numWords]) >> 32); | ||
433 | SetBe32(digest, v) | ||
434 | } | ||
435 | } | ||
436 | #endif | ||
437 | Sha512_InitState(p, digestSize); | ||
438 | } | ||
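// Padding illustration: for a 3-byte message, pos becomes 4 after the 0x80
// byte, bytes 4..111 are zeroed, bytes 112..119 get the (zero) high half of
// the 128-bit length, and bytes 120..127 get the big-endian bit count 24.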
439 | |||
440 | |||
441 | |||
442 | |||
443 | #if defined(_WIN32) && defined(Z7_COMPILER_SHA512_SUPPORTED) \ | ||
444 | && defined(MY_CPU_ARM64) // we can disable this check to debug in x64 | ||
445 | |||
446 | #if 1 // 0 for debug | ||
447 | |||
448 | #include "7zWindows.h" | ||
449 | // #include <stdio.h> | ||
450 | #if 0 && defined(MY_CPU_X86_OR_AMD64) | ||
451 | #include <intrin.h> // for debug : for __ud2() | ||
452 | #endif | ||
453 | |||
454 | BoolInt CPU_IsSupported_SHA512(void) | ||
455 | { | ||
456 | #if defined(MY_CPU_ARM64) | ||
457 | // IsProcessorFeaturePresent() still has no SHA512 feature flag. | ||
458 | if (!CPU_IsSupported_CRYPTO()) | ||
459 | return False; | ||
460 | #endif | ||
461 | // printf("\nCPU_IsSupported_SHA512\n"); | ||
462 | { | ||
463 | // we can't read the ID_AA64ISAR0_EL1 register from an application, | ||
464 | // but Windows maps the ID_AA64ISAR0_EL1 register to the "CP 4030" registry value. | ||
465 | HKEY key = NULL; | ||
466 | LONG res = RegOpenKeyEx(HKEY_LOCAL_MACHINE, | ||
467 | TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"), | ||
468 | 0, KEY_READ, &key); | ||
469 | if (res != ERROR_SUCCESS) | ||
470 | return False; | ||
471 | { | ||
472 | DWORD type = 0; | ||
473 | DWORD count = sizeof(UInt64); | ||
474 | UInt64 val = 0; | ||
475 | res = RegQueryValueEx(key, TEXT("CP 4030"), NULL, | ||
476 | &type, (LPBYTE)&val, &count); | ||
477 | RegCloseKey(key); | ||
478 | if (res != ERROR_SUCCESS | ||
479 | || type != REG_QWORD | ||
480 | || count != sizeof(UInt64) | ||
481 | || ((unsigned)(val >> 12) & 0xf) != 2) | ||
482 | return False; | ||
483 | // we parse SHA2 field of ID_AA64ISAR0_EL1 register: | ||
484 | // 0 : No SHA2 instructions implemented | ||
485 | // 1 : SHA256 implemented | ||
486 | // 2 : SHA256 and SHA512 implemented | ||
487 | } | ||
488 | } | ||
489 | |||
490 | |||
491 | #if 1 // 0 for debug to disable SHA512 PROBE code | ||
492 | |||
493 | /* | ||
494 | ----- SHA512 PROBE ----- | ||
495 | |||
496 | We suppose that reading the "CP 4030" registry value is enough. | ||
497 | But we use this additional SHA512 PROBE code, because here | ||
498 | we can catch an exception, while we don't catch exceptions | ||
499 | when the Sha512 functions are called from the main code. | ||
500 | |||
501 | NOTE: the arm64 PROBE code doesn't work if we call it via Wine in linux-arm64: | ||
502 | the program just stops. | ||
503 | The x64 version of the PROBE code also doesn't work if we run it via the Intel SDE emulator | ||
504 | without SHA512 support (-skl switch): | ||
505 | the program stops, and we get this message from SDE: | ||
506 | TID 0 SDE-ERROR: Executed instruction not valid for specified chip (SKYLAKE): vsha512msg1 | ||
507 | But we still want to catch that exception instead of having the process stop. | ||
508 | Does this PROBE code work in native Windows-arm64 (with/without sha512 hw instructions)? | ||
509 | Are there any ways to fix the problems in the arm64-wine and x64-SDE cases? | ||
510 | */ | ||
511 | |||
512 | // printf("\n========== CPU_IsSupported_SHA512 PROBE ========\n"); | ||
513 | { | ||
514 | #ifdef __clang_major__ | ||
515 | #pragma GCC diagnostic ignored "-Wlanguage-extension-token" | ||
516 | #endif | ||
517 | __try | ||
518 | { | ||
519 | #if 0 // 1 : for debug (reduced version to detect sha512) | ||
520 | const uint64x2_t a = vdupq_n_u64(1); | ||
521 | const uint64x2_t b = vsha512hq_u64(a, a, a); | ||
522 | if ((UInt32)vgetq_lane_u64(b, 0) == 0x11800002) | ||
523 | return True; | ||
524 | #else | ||
525 | MY_ALIGN(16) | ||
526 | UInt64 temp[SHA512_NUM_DIGEST_WORDS + SHA512_NUM_BLOCK_WORDS]; | ||
527 | memset(temp, 0x5a, sizeof(temp)); | ||
528 | #if 0 && defined(MY_CPU_X86_OR_AMD64) | ||
529 | __ud2(); // for debug : that exception is not problem for SDE | ||
530 | #endif | ||
531 | #if 1 | ||
532 | Sha512_UpdateBlocks_HW(temp, | ||
533 | (const Byte *)(const void *)(temp + SHA512_NUM_DIGEST_WORDS), 1); | ||
534 | // printf("\n==== t = %x\n", (UInt32)temp[0]); | ||
535 | if ((UInt32)temp[0] == 0xa33cfdf7) | ||
536 | { | ||
537 | // printf("\n=== PROBE SHA512: SHA512 supported\n"); | ||
538 | return True; | ||
539 | } | ||
540 | #endif | ||
541 | #endif | ||
542 | } | ||
543 | __except (EXCEPTION_EXECUTE_HANDLER) | ||
544 | { | ||
545 | // printf("\n==== CPU_IsSupported_SHA512 EXCEPTION_EXECUTE_HANDLER\n"); | ||
546 | } | ||
547 | } | ||
548 | return False; | ||
549 | #else | ||
550 | // without SHA512 PROBE code | ||
551 | return True; | ||
552 | #endif | ||
553 | |||
554 | } | ||
555 | |||
556 | #else | ||
557 | |||
558 | BoolInt CPU_IsSupported_SHA512(void) | ||
559 | { | ||
560 | return False; | ||
561 | } | ||
562 | |||
563 | #endif | ||
564 | #endif // WIN32 arm64 | ||
565 | |||
566 | |||
567 | void Sha512Prepare(void) | ||
568 | { | ||
569 | #ifdef Z7_COMPILER_SHA512_SUPPORTED | ||
570 | SHA512_FUNC_UPDATE_BLOCKS f, f_hw; | ||
571 | f = Sha512_UpdateBlocks; | ||
572 | f_hw = NULL; | ||
573 | #ifdef MY_CPU_X86_OR_AMD64 | ||
574 | if (CPU_IsSupported_SHA512() | ||
575 | && CPU_IsSupported_AVX2() | ||
576 | ) | ||
577 | #else | ||
578 | if (CPU_IsSupported_SHA512()) | ||
579 | #endif | ||
580 | { | ||
581 | // printf("\n========== HW SHA512 ======== \n"); | ||
582 | f = f_hw = Sha512_UpdateBlocks_HW; | ||
583 | } | ||
584 | g_SHA512_FUNC_UPDATE_BLOCKS = f; | ||
585 | g_SHA512_FUNC_UPDATE_BLOCKS_HW = f_hw; | ||
586 | #endif | ||
587 | } | ||
588 | |||
589 | |||
590 | #undef K | ||
591 | #undef S0 | ||
592 | #undef S1 | ||
593 | #undef s0 | ||
594 | #undef s1 | ||
595 | #undef Ch | ||
596 | #undef Maj | ||
597 | #undef W_MAIN | ||
598 | #undef W_PRE | ||
599 | #undef w | ||
600 | #undef blk2_main | ||
601 | #undef blk2 | ||
602 | #undef T1 | ||
603 | #undef T4 | ||
604 | #undef T8 | ||
605 | #undef R1_PRE | ||
606 | #undef R1_MAIN | ||
607 | #undef R2_MAIN | ||
608 | #undef R4 | ||
609 | #undef R4_PRE | ||
610 | #undef R4_MAIN | ||
611 | #undef R8 | ||
612 | #undef R8_PRE | ||
613 | #undef R8_MAIN | ||
614 | #undef STEP_PRE | ||
615 | #undef STEP_MAIN | ||
616 | #undef Z7_SHA512_BIG_W | ||
617 | #undef Z7_SHA512_UNROLL | ||
618 | #undef Z7_COMPILER_SHA512_SUPPORTED | ||
diff --git a/C/Sha512.h b/C/Sha512.h new file mode 100644 index 0000000..1f3a4d1 --- /dev/null +++ b/C/Sha512.h | |||
@@ -0,0 +1,86 @@ | |||
1 | /* Sha512.h -- SHA-512 Hash | ||
2 | : Igor Pavlov : Public domain */ | ||
3 | |||
4 | #ifndef ZIP7_INC_SHA512_H | ||
5 | #define ZIP7_INC_SHA512_H | ||
6 | |||
7 | #include "7zTypes.h" | ||
8 | |||
9 | EXTERN_C_BEGIN | ||
10 | |||
11 | #define SHA512_NUM_BLOCK_WORDS 16 | ||
12 | #define SHA512_NUM_DIGEST_WORDS 8 | ||
13 | |||
14 | #define SHA512_BLOCK_SIZE (SHA512_NUM_BLOCK_WORDS * 8) | ||
15 | #define SHA512_DIGEST_SIZE (SHA512_NUM_DIGEST_WORDS * 8) | ||
16 | #define SHA512_224_DIGEST_SIZE (224 / 8) | ||
17 | #define SHA512_256_DIGEST_SIZE (256 / 8) | ||
18 | #define SHA512_384_DIGEST_SIZE (384 / 8) | ||
19 | |||
20 | typedef void (Z7_FASTCALL *SHA512_FUNC_UPDATE_BLOCKS)(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
21 | |||
22 | /* | ||
23 | if (the system supports different SHA512 code implementations) | ||
24 | { | ||
25 | (CSha512::func_UpdateBlocks) will be used | ||
26 | (CSha512::func_UpdateBlocks) can be set by | ||
27 | Sha512_Init() - to default (fastest) | ||
28 | Sha512_SetFunction() - to any algo | ||
29 | } | ||
30 | else | ||
31 | { | ||
32 | (CSha512::func_UpdateBlocks) is ignored. | ||
33 | } | ||
34 | */ | ||
35 | |||
36 | typedef struct | ||
37 | { | ||
38 | union | ||
39 | { | ||
40 | struct | ||
41 | { | ||
42 | SHA512_FUNC_UPDATE_BLOCKS func_UpdateBlocks; | ||
43 | UInt64 count; | ||
44 | } vars; | ||
45 | UInt64 _pad_64bit[8]; | ||
46 | void *_pad_align_ptr[2]; | ||
47 | } v; | ||
48 | UInt64 state[SHA512_NUM_DIGEST_WORDS]; | ||
49 | |||
50 | Byte buffer[SHA512_BLOCK_SIZE]; | ||
51 | } CSha512; | ||
52 | |||
53 | |||
54 | #define SHA512_ALGO_DEFAULT 0 | ||
55 | #define SHA512_ALGO_SW 1 | ||
56 | #define SHA512_ALGO_HW 2 | ||
57 | |||
58 | /* | ||
59 | Sha512_SetFunction() | ||
60 | return: | ||
61 | 0 - (algo) value is not supported, and func_UpdateBlocks was not changed | ||
62 | 1 - func_UpdateBlocks was set according to the (algo) value. | ||
63 | */ | ||
64 | |||
65 | BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo); | ||
66 | // we support only these (digestSize) values: 224/8, 256/8, 384/8, 512/8 | ||
67 | void Sha512_InitState(CSha512 *p, unsigned digestSize); | ||
68 | void Sha512_Init(CSha512 *p, unsigned digestSize); | ||
69 | void Sha512_Update(CSha512 *p, const Byte *data, size_t size); | ||
70 | void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize); | ||
71 | |||
72 | |||
73 | |||
74 | |||
75 | // void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
76 | |||
77 | /* | ||
78 | call Sha512Prepare() once at program start. | ||
79 | It prepares all supported implementations, and detects the fastest implementation. | ||
80 | */ | ||
81 | |||
82 | void Sha512Prepare(void); | ||
83 | |||
84 | EXTERN_C_END | ||
85 | |||
86 | #endif | ||
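An end-to-end sketch of the API above; the helper name example_sha512 is hypothetical, and Sha512Prepare() would normally run once at program start, as the header comment recommends:

#include "Sha512.h"

static void example_sha512(const Byte *data, size_t size,
    Byte digest[SHA512_DIGEST_SIZE])
{
  CSha512 c;
  Sha512Prepare(); // once per process is enough
  Sha512_Init(&c, SHA512_DIGEST_SIZE);
  Sha512_Update(&c, data, size);
  Sha512_Final(&c, digest, SHA512_DIGEST_SIZE);
}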
diff --git a/C/Sha512Opt.c b/C/Sha512Opt.c new file mode 100644 index 0000000..3a13868 --- /dev/null +++ b/C/Sha512Opt.c | |||
@@ -0,0 +1,395 @@ | |||
1 | /* Sha512Opt.c -- SHA-512 optimized code for SHA-512 hardware instructions | ||
2 | : Igor Pavlov : Public domain */ | ||
3 | |||
4 | #include "Precomp.h" | ||
5 | #include "Compiler.h" | ||
6 | #include "CpuArch.h" | ||
7 | |||
8 | // #define Z7_USE_HW_SHA_STUB // for debug | ||
9 | #ifdef MY_CPU_X86_OR_AMD64 | ||
10 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) // fix it | ||
11 | #define USE_HW_SHA | ||
12 | #elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \ | ||
13 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \ | ||
14 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000) | ||
15 | #define USE_HW_SHA | ||
16 | #if !defined(__INTEL_COMPILER) | ||
17 | // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) | ||
18 | #if !defined(__SHA512__) || !defined(__AVX2__) | ||
19 | #define ATTRIB_SHA512 __attribute__((__target__("sha512,avx2"))) | ||
20 | #endif | ||
21 | #endif | ||
22 | #elif defined(Z7_MSC_VER_ORIGINAL) | ||
23 | #if (_MSC_VER >= 1940) | ||
24 | #define USE_HW_SHA | ||
25 | #else | ||
26 | // #define Z7_USE_HW_SHA_STUB | ||
27 | #endif | ||
28 | #endif | ||
29 | // #endif // MY_CPU_X86_OR_AMD64 | ||
30 | #ifndef USE_HW_SHA | ||
31 | // #define Z7_USE_HW_SHA_STUB // for debug | ||
32 | #endif | ||
33 | |||
34 | #ifdef USE_HW_SHA | ||
35 | |||
36 | // #pragma message("Sha512 HW") | ||
37 | |||
38 | #include <immintrin.h> | ||
39 | |||
40 | #if defined (__clang__) && defined(_MSC_VER) | ||
41 | #if !defined(__AVX__) | ||
42 | #include <avxintrin.h> | ||
43 | #endif | ||
44 | #if !defined(__AVX2__) | ||
45 | #include <avx2intrin.h> | ||
46 | #endif | ||
47 | #if !defined(__SHA512__) | ||
48 | #include <sha512intrin.h> | ||
49 | #endif | ||
50 | #else | ||
51 | |||
52 | #endif | ||
53 | |||
54 | /* | ||
55 | SHA512 uses: | ||
56 | AVX: | ||
57 | _mm256_loadu_si256 (vmovdqu) | ||
58 | _mm256_storeu_si256 | ||
59 | _mm256_set_epi32 (for the byte-shuffle mask constant) | ||
60 | AVX2: | ||
61 | _mm256_add_epi64 : vpaddq | ||
62 | _mm256_shuffle_epi8 : vpshufb | ||
63 | _mm256_shuffle_epi32 : pshufd | ||
64 | _mm256_blend_epi32 : vpblendd | ||
65 | _mm256_permute4x64_epi64 : vpermq : 3c | ||
66 | _mm256_permute2x128_si256: vperm2i128 : 3c | ||
67 | _mm256_extracti128_si256 : vextracti128 : 3c | ||
68 | SHA512: | ||
69 | _mm256_sha512* | ||
70 | */ | ||
71 | |||
72 | // The K array must be aligned to at least 32 bytes. | ||
73 | // The compiler can look at the align attribute and select | ||
74 | // vmovdqu - for code without the align attribute | ||
75 | // vmovdqa - for code with the align attribute | ||
76 | extern | ||
77 | MY_ALIGN(64) | ||
78 | const UInt64 SHA512_K_ARRAY[80]; | ||
79 | #define K SHA512_K_ARRAY | ||
80 | |||
81 | |||
82 | #define ADD_EPI64(dest, src) dest = _mm256_add_epi64(dest, src); | ||
83 | #define SHA512_MSG1(dest, src) dest = _mm256_sha512msg1_epi64(dest, _mm256_extracti128_si256(src, 0)); | ||
84 | #define SHA512_MSG2(dest, src) dest = _mm256_sha512msg2_epi64(dest, src); | ||
85 | |||
86 | #define LOAD_SHUFFLE(m, k) \ | ||
87 | m = _mm256_loadu_si256((const __m256i *)(const void *)(data + (k) * 32)); \ | ||
88 | m = _mm256_shuffle_epi8(m, mask); \ | ||
89 | |||
90 | #define NNN(m0, m1, m2, m3) | ||
91 | |||
92 | #define SM1(m1, m2, m3, m0) \ | ||
93 | SHA512_MSG1(m0, m1); \ | ||
94 | |||
95 | #define SM2(m2, m3, m0, m1) \ | ||
96 | ADD_EPI64(m0, _mm256_permute4x64_epi64(_mm256_blend_epi32(m2, m3, 3), 0x39)); \ | ||
97 | SHA512_MSG2(m0, m3); \ | ||
98 | |||
99 | #define RND2(t0, t1, lane) \ | ||
100 | t0 = _mm256_sha512rnds2_epi64(t0, t1, _mm256_extracti128_si256(msg, lane)); | ||
101 | |||
102 | |||
103 | |||
104 | #define R4(k, m0, m1, m2, m3, OP0, OP1) \ | ||
105 | msg = _mm256_add_epi64(m0, *(const __m256i *) (const void *) &K[(k) * 4]); \ | ||
106 | RND2(state0, state1, 0); OP0(m0, m1, m2, m3) \ | ||
107 | RND2(state1, state0, 1); OP1(m0, m1, m2, m3) \ | ||
108 | |||
109 | |||
110 | |||
111 | |||
112 | #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ | ||
113 | R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ | ||
114 | R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ | ||
115 | R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ | ||
116 | R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ | ||
117 | |||
118 | #define PREPARE_STATE \ | ||
119 | state0 = _mm256_shuffle_epi32(state0, 0x4e); /* cdab */ \ | ||
120 | state1 = _mm256_shuffle_epi32(state1, 0x4e); /* ghef */ \ | ||
121 | tmp = state0; \ | ||
122 | state0 = _mm256_permute2x128_si256(state0, state1, 0x13); /* cdgh */ \ | ||
123 | state1 = _mm256_permute2x128_si256(tmp, state1, 2); /* abef */ \ | ||
124 | |||
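// PREPARE_STATE reorders the eight hash words from the linear a..h layout
// into the paired register layout that vsha512rnds2 expects; the same macro
// runs again after the block loop to restore the linear order before the store.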
125 | |||
126 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
127 | #ifdef ATTRIB_SHA512 | ||
128 | ATTRIB_SHA512 | ||
129 | #endif | ||
130 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks) | ||
131 | { | ||
132 | const __m256i mask = _mm256_set_epi32( | ||
133 | 0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607, | ||
134 | 0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607); | ||
135 | __m256i tmp, state0, state1; | ||
136 | |||
137 | if (numBlocks == 0) | ||
138 | return; | ||
139 | |||
140 | state0 = _mm256_loadu_si256((const __m256i *) (const void *) &state[0]); | ||
141 | state1 = _mm256_loadu_si256((const __m256i *) (const void *) &state[4]); | ||
142 | |||
143 | PREPARE_STATE | ||
144 | |||
145 | do | ||
146 | { | ||
147 | __m256i state0_save, state1_save; | ||
148 | __m256i m0, m1, m2, m3; | ||
149 | __m256i msg; | ||
150 | // #define msg tmp | ||
151 | |||
152 | state0_save = state0; | ||
153 | state1_save = state1; | ||
154 | |||
155 | LOAD_SHUFFLE (m0, 0) | ||
156 | LOAD_SHUFFLE (m1, 1) | ||
157 | LOAD_SHUFFLE (m2, 2) | ||
158 | LOAD_SHUFFLE (m3, 3) | ||
159 | |||
160 | |||
161 | |||
162 | R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ) | ||
163 | R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) | ||
164 | R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) | ||
165 | R16 ( 3, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) | ||
166 | R16 ( 4, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ) | ||
167 | ADD_EPI64(state0, state0_save) | ||
168 | ADD_EPI64(state1, state1_save) | ||
169 | |||
170 | data += 128; | ||
171 | } | ||
172 | while (--numBlocks); | ||
173 | |||
174 | PREPARE_STATE | ||
175 | |||
176 | _mm256_storeu_si256((__m256i *) (void *) &state[0], state0); | ||
177 | _mm256_storeu_si256((__m256i *) (void *) &state[4], state1); | ||
178 | } | ||
179 | |||
180 | #endif // USE_HW_SHA | ||
181 | |||
182 | // gcc 8.5 also supports sha512, but we also need support in the assembler that gcc invokes | ||
183 | #elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE) | ||
184 | |||
185 | #if defined(__ARM_FEATURE_SHA512) | ||
186 | #define USE_HW_SHA | ||
187 | #else | ||
188 | #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \ | ||
189 | || defined(__GNUC__) && (__GNUC__ >= 9) \ | ||
190 | ) \ | ||
191 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it | ||
192 | #define USE_HW_SHA | ||
193 | #endif | ||
194 | #endif | ||
195 | |||
196 | #ifdef USE_HW_SHA | ||
197 | |||
198 | // #pragma message("=== Sha512 HW === ") | ||
199 | |||
200 | |||
201 | #if defined(__clang__) || defined(__GNUC__) | ||
202 | #if !defined(__ARM_FEATURE_SHA512) | ||
203 | // #pragma message("=== we define SHA3 ATTRIB_SHA512 === ") | ||
204 | #if defined(__clang__) | ||
205 | #define ATTRIB_SHA512 __attribute__((__target__("sha3"))) // "armv8.2-a,sha3" | ||
206 | #else | ||
207 | #define ATTRIB_SHA512 __attribute__((__target__("arch=armv8.2-a+sha3"))) | ||
208 | #endif | ||
209 | #endif | ||
210 | #endif | ||
211 | |||
212 | |||
213 | #if defined(Z7_MSC_VER_ORIGINAL) | ||
214 | #include <arm64_neon.h> | ||
215 | #else | ||
216 | |||
217 | #if defined(__clang__) && __clang_major__ < 16 | ||
218 | #if !defined(__ARM_FEATURE_SHA512) | ||
219 | // #pragma message("=== we set __ARM_FEATURE_SHA512 1 === ") | ||
220 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
221 | #define Z7_ARM_FEATURE_SHA512_WAS_SET 1 | ||
222 | #define __ARM_FEATURE_SHA512 1 | ||
223 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
224 | #endif | ||
225 | #endif // clang | ||
226 | |||
227 | #include <arm_neon.h> | ||
228 | |||
229 | #if defined(Z7_ARM_FEATURE_SHA512_WAS_SET) && \ | ||
230 | defined(__ARM_FEATURE_SHA512) | ||
231 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
232 | #undef __ARM_FEATURE_SHA512 | ||
233 | #undef Z7_ARM_FEATURE_SHA512_WAS_SET | ||
234 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
235 | // #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") | ||
236 | #endif | ||
237 | |||
238 | #endif // Z7_MSC_VER_ORIGINAL | ||
239 | |||
240 | typedef uint64x2_t v128_64; | ||
241 | // typedef __n128 v128_64; // MSVC | ||
242 | |||
243 | #ifdef MY_CPU_BE | ||
244 | #define MY_rev64_for_LE(x) x | ||
245 | #else | ||
246 | #define MY_rev64_for_LE(x) vrev64q_u8(x) | ||
247 | #endif | ||
248 | |||
249 | #define LOAD_128_64(_p) vld1q_u64(_p) | ||
250 | #define LOAD_128_8(_p) vld1q_u8 (_p) | ||
251 | #define STORE_128_64(_p, _v) vst1q_u64(_p, _v) | ||
252 | |||
253 | #define LOAD_SHUFFLE(m, k) \ | ||
254 | m = vreinterpretq_u64_u8( \ | ||
255 | MY_rev64_for_LE( \ | ||
256 | LOAD_128_8(data + (k) * 16))); \ | ||
257 | |||
258 | // The K array must be aligned to at least 16 bytes. | ||
259 | extern | ||
260 | MY_ALIGN(64) | ||
261 | const UInt64 SHA512_K_ARRAY[80]; | ||
262 | #define K SHA512_K_ARRAY | ||
263 | |||
264 | #define NN(m0, m1, m4, m5, m7) | ||
265 | #define SM(m0, m1, m4, m5, m7) \ | ||
266 | m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1)); | ||
267 | |||
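// A hedged reading of SM: vsha512su0q_u64 contributes the sigma0 terms of the
// message-schedule update, and vsha512su1q_u64 folds in the sigma1 terms plus
// the w[t-7] pair built by vextq_u64(m4, m5, 1).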
268 | #define R2(k, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP) \ | ||
269 | OP(m0, m1, m4, m5, m7) \ | ||
270 | t = vaddq_u64(m0, vld1q_u64(k)); \ | ||
271 | t = vaddq_u64(vextq_u64(t, t, 1), a3); \ | ||
272 | t = vsha512hq_u64(t, vextq_u64(a2, a3, 1), vextq_u64(a1, a2, 1)); \ | ||
273 | a3 = vsha512h2q_u64(t, a1, a0); \ | ||
274 | a1 = vaddq_u64(a1, t); \ | ||
275 | |||
276 | #define R8(k, m0,m1,m2,m3,m4,m5,m6,m7, OP) \ | ||
277 | R2 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP ) \ | ||
278 | R2 ( (k)+1*2, m1,m2,m3,m4,m5,m6,m7,m0, a3,a0,a1,a2, OP ) \ | ||
279 | R2 ( (k)+2*2, m2,m3,m4,m5,m6,m7,m0,m1, a2,a3,a0,a1, OP ) \ | ||
280 | R2 ( (k)+3*2, m3,m4,m5,m6,m7,m0,m1,m2, a1,a2,a3,a0, OP ) \ | ||
281 | |||
282 | #define R16(k, OP) \ | ||
283 | R8 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, OP ) \ | ||
284 | R8 ( (k)+4*2, m4,m5,m6,m7,m0,m1,m2,m3, OP ) \ | ||
285 | |||
286 | |||
287 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
288 | #ifdef ATTRIB_SHA512 | ||
289 | ATTRIB_SHA512 | ||
290 | #endif | ||
291 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks) | ||
292 | { | ||
293 | v128_64 a0, a1, a2, a3; | ||
294 | |||
295 | if (numBlocks == 0) | ||
296 | return; | ||
297 | a0 = LOAD_128_64(&state[0]); | ||
298 | a1 = LOAD_128_64(&state[2]); | ||
299 | a2 = LOAD_128_64(&state[4]); | ||
300 | a3 = LOAD_128_64(&state[6]); | ||
301 | do | ||
302 | { | ||
303 | v128_64 a0_save, a1_save, a2_save, a3_save; | ||
304 | v128_64 m0, m1, m2, m3, m4, m5, m6, m7; | ||
305 | v128_64 t; | ||
306 | unsigned i; | ||
307 | const UInt64 *k_ptr; | ||
308 | |||
309 | LOAD_SHUFFLE (m0, 0) | ||
310 | LOAD_SHUFFLE (m1, 1) | ||
311 | LOAD_SHUFFLE (m2, 2) | ||
312 | LOAD_SHUFFLE (m3, 3) | ||
313 | LOAD_SHUFFLE (m4, 4) | ||
314 | LOAD_SHUFFLE (m5, 5) | ||
315 | LOAD_SHUFFLE (m6, 6) | ||
316 | LOAD_SHUFFLE (m7, 7) | ||
317 | |||
318 | a0_save = a0; | ||
319 | a1_save = a1; | ||
320 | a2_save = a2; | ||
321 | a3_save = a3; | ||
322 | |||
323 | R16 ( K, NN ) | ||
324 | k_ptr = K + 16; | ||
325 | for (i = 0; i < 4; i++) | ||
326 | { | ||
327 | R16 ( k_ptr, SM ) | ||
328 | k_ptr += 16; | ||
329 | } | ||
330 | |||
331 | a0 = vaddq_u64(a0, a0_save); | ||
332 | a1 = vaddq_u64(a1, a1_save); | ||
333 | a2 = vaddq_u64(a2, a2_save); | ||
334 | a3 = vaddq_u64(a3, a3_save); | ||
335 | |||
336 | data += 128; | ||
337 | } | ||
338 | while (--numBlocks); | ||
339 | |||
340 | STORE_128_64(&state[0], a0); | ||
341 | STORE_128_64(&state[2], a1); | ||
342 | STORE_128_64(&state[4], a2); | ||
343 | STORE_128_64(&state[6], a3); | ||
344 | } | ||
345 | |||
346 | #endif // USE_HW_SHA | ||
347 | |||
348 | #endif // MY_CPU_ARM_OR_ARM64 | ||
349 | |||
350 | |||
351 | #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) | ||
352 | // #error Stop_Compiling_UNSUPPORTED_SHA | ||
353 | // #include <stdlib.h> | ||
354 | // We can compile this file with another C compiler, | ||
355 | // or we can compile an asm version, | ||
356 | // so that real code is generated instead of this stub function. | ||
357 | // #include "Sha512.h" | ||
358 | // #if defined(_MSC_VER) | ||
359 | #pragma message("Sha512 HW-SW stub was used") | ||
360 | // #endif | ||
361 | void Z7_FASTCALL Sha512_UpdateBlocks (UInt64 state[8], const Byte *data, size_t numBlocks); | ||
362 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
363 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks) | ||
364 | { | ||
365 | Sha512_UpdateBlocks(state, data, numBlocks); | ||
366 | /* | ||
367 | UNUSED_VAR(state); | ||
368 | UNUSED_VAR(data); | ||
369 | UNUSED_VAR(numBlocks); | ||
370 | exit(1); | ||
371 | return; | ||
372 | */ | ||
373 | } | ||
374 | #endif | ||
375 | |||
376 | |||
377 | #undef K | ||
378 | #undef RND2 | ||
379 | #undef MY_rev64_for_LE | ||
380 | #undef NN | ||
381 | #undef NNN | ||
382 | #undef LOAD_128_64 | ||
383 | #undef LOAD_128_8 | ||
384 | #undef STORE_128_64 | ||
384 | #undef LOAD_SHUFFLE | ||
385 | #undef SM1 | ||
386 | #undef SM2 | ||
387 | #undef SM | ||
388 | #undef R2 | ||
389 | #undef R4 | ||
390 | #undef R16 | ||
391 | #undef PREPARE_STATE | ||
392 | #undef USE_HW_SHA | ||
393 | #undef ATTRIB_SHA512 | ||
394 | #undef USE_VER_MIN | ||
395 | #undef Z7_USE_HW_SHA_STUB | ||