Diffstat (limited to 'C')
-rw-r--r-- | C/7zDec.c | 5
-rw-r--r-- | C/7zVersion.h | 10
-rw-r--r-- | C/AesOpt.c | 233
-rw-r--r-- | C/BwtSort.c | 468
-rw-r--r-- | C/BwtSort.h | 7
-rw-r--r-- | C/Compiler.h | 12
-rw-r--r-- | C/CpuArch.c | 109
-rw-r--r-- | C/CpuArch.h | 41
-rw-r--r-- | C/HuffEnc.c | 384
-rw-r--r-- | C/HuffEnc.h | 8
-rw-r--r-- | C/LzFind.c | 26
-rw-r--r-- | C/LzFindMt.c | 10
-rw-r--r-- | C/LzFindMt.h | 6
-rw-r--r-- | C/Lzma2Enc.c | 4
-rw-r--r-- | C/Lzma2Enc.h | 1
-rw-r--r-- | C/LzmaEnc.c | 22
-rw-r--r-- | C/LzmaEnc.h | 4
-rw-r--r-- | C/Md5.c | 206
-rw-r--r-- | C/Md5.h | 34
-rw-r--r-- | C/MtCoder.c | 61
-rw-r--r-- | C/MtCoder.h | 7
-rw-r--r-- | C/Sha1.c | 125
-rw-r--r-- | C/Sha1.h | 18
-rw-r--r-- | C/Sha1Opt.c | 146
-rw-r--r-- | C/Sha256.c | 162
-rw-r--r-- | C/Sha256.h | 18
-rw-r--r-- | C/Sha256Opt.c | 172
-rw-r--r-- | C/Sha3.c | 359
-rw-r--r-- | C/Sha3.h | 36
-rw-r--r-- | C/Sha512.c | 711
-rw-r--r-- | C/Sha512.h | 86
-rw-r--r-- | C/Sha512Opt.c | 395
-rw-r--r-- | C/Sort.c | 355
-rw-r--r-- | C/Sort.h | 7
-rw-r--r-- | C/Threads.c | 237
-rw-r--r-- | C/Threads.h | 12
-rw-r--r-- | C/Util/Lzma/LzmaUtil.dsp | 4
-rw-r--r-- | C/Util/LzmaLib/LzmaLib.dsp | 8
-rw-r--r-- | C/Xz.h | 12
-rw-r--r-- | C/XzCrc64Opt.c | 4
-rw-r--r-- | C/XzDec.c | 29
-rw-r--r-- | C/XzEnc.c | 8
-rw-r--r-- | C/XzEnc.h | 3
-rw-r--r-- | C/XzIn.c | 265 |
44 files changed, 3758 insertions, 1072 deletions
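diff --git a/C/7zDec.c b/C/7zDec.c
--- a/C/7zDec.c
+++ b/C/7zDec.c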
@@ -1,5 +1,5 @@ | |||
1 | /* 7zDec.c -- Decoding from 7z folder | 1 | /* 7zDec.c -- Decoding from 7z folder |
2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -312,8 +312,9 @@ static BoolInt IS_MAIN_METHOD(UInt32 m) | |||
312 | case k_PPMD: | 312 | case k_PPMD: |
313 | #endif | 313 | #endif |
314 | return True; | 314 | return True; |
315 | default: | ||
316 | return False; | ||
315 | } | 317 | } |
316 | return False; | ||
317 | } | 318 | } |
318 | 319 | ||
319 | static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c) | 320 | static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c) |
diff --git a/C/7zVersion.h b/C/7zVersion.h
index 1ddef80..b6142e9 100644
--- a/C/7zVersion.h
+++ b/C/7zVersion.h
@@ -1,7 +1,7 @@ | |||
1 | #define MY_VER_MAJOR 24 | 1 | #define MY_VER_MAJOR 25 |
2 | #define MY_VER_MINOR 8 | 2 | #define MY_VER_MINOR 1 |
3 | #define MY_VER_BUILD 0 | 3 | #define MY_VER_BUILD 0 |
4 | #define MY_VERSION_NUMBERS "24.08" | 4 | #define MY_VERSION_NUMBERS "25.01" |
5 | #define MY_VERSION MY_VERSION_NUMBERS | 5 | #define MY_VERSION MY_VERSION_NUMBERS |
6 | 6 | ||
7 | #ifdef MY_CPU_NAME | 7 | #ifdef MY_CPU_NAME |
@@ -10,12 +10,12 @@ | |||
10 | #define MY_VERSION_CPU MY_VERSION | 10 | #define MY_VERSION_CPU MY_VERSION |
11 | #endif | 11 | #endif |
12 | 12 | ||
13 | #define MY_DATE "2024-08-11" | 13 | #define MY_DATE "2025-08-03" |
14 | #undef MY_COPYRIGHT | 14 | #undef MY_COPYRIGHT |
15 | #undef MY_VERSION_COPYRIGHT_DATE | 15 | #undef MY_VERSION_COPYRIGHT_DATE |
16 | #define MY_AUTHOR_NAME "Igor Pavlov" | 16 | #define MY_AUTHOR_NAME "Igor Pavlov" |
17 | #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" | 17 | #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" |
18 | #define MY_COPYRIGHT_CR "Copyright (c) 1999-2024 Igor Pavlov" | 18 | #define MY_COPYRIGHT_CR "Copyright (c) 1999-2025 Igor Pavlov" |
19 | 19 | ||
20 | #ifdef USE_COPYRIGHT_CR | 20 | #ifdef USE_COPYRIGHT_CR |
21 | #define MY_COPYRIGHT MY_COPYRIGHT_CR | 21 | #define MY_COPYRIGHT MY_COPYRIGHT_CR |
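diff --git a/C/AesOpt.c b/C/AesOpt.c
--- a/C/AesOpt.c
+++ b/C/AesOpt.c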
@@ -1,5 +1,5 @@ | |||
1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions | 1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions |
2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -80,19 +80,39 @@ AES_FUNC_START (name) | |||
80 | 80 | ||
81 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) | 81 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) |
82 | 82 | ||
83 | #if 1 | ||
84 | // use aligned SSE load/store for data. | ||
85 | // It is required for our Aes functions, that data is aligned for 16-bytes. | ||
86 | // So we can use this branch of code. | ||
87 | // and compiler can use fused load-op SSE instructions: | ||
88 | // xorps xmm0, XMMWORD PTR [rdx] | ||
89 | #define LOAD_128(pp) (*(__m128i *)(void *)(pp)) | ||
90 | #define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v | ||
91 | // use aligned SSE load/store for data. Alternative code with direct access | ||
92 | // #define LOAD_128(pp) _mm_load_si128(pp) | ||
93 | // #define STORE_128(pp, _v) _mm_store_si128(pp, _v) | ||
94 | #else | ||
95 | // use unaligned load/store for data: movdqu XMMWORD PTR [rdx] | ||
96 | #define LOAD_128(pp) _mm_loadu_si128(pp) | ||
97 | #define STORE_128(pp, _v) _mm_storeu_si128(pp, _v) | ||
98 | #endif | ||
99 | |||
83 | AES_FUNC_START2 (AesCbc_Encode_HW) | 100 | AES_FUNC_START2 (AesCbc_Encode_HW) |
84 | { | 101 | { |
102 | if (numBlocks == 0) | ||
103 | return; | ||
104 | { | ||
85 | __m128i *p = (__m128i *)(void *)ivAes; | 105 | __m128i *p = (__m128i *)(void *)ivAes; |
86 | __m128i *data = (__m128i *)(void *)data8; | 106 | __m128i *data = (__m128i *)(void *)data8; |
87 | __m128i m = *p; | 107 | __m128i m = *p; |
88 | const __m128i k0 = p[2]; | 108 | const __m128i k0 = p[2]; |
89 | const __m128i k1 = p[3]; | 109 | const __m128i k1 = p[3]; |
90 | const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; | 110 | const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; |
91 | for (; numBlocks != 0; numBlocks--, data++) | 111 | do |
92 | { | 112 | { |
93 | UInt32 r = numRounds2; | 113 | UInt32 r = numRounds2; |
94 | const __m128i *w = p + 4; | 114 | const __m128i *w = p + 4; |
95 | __m128i temp = *data; | 115 | __m128i temp = LOAD_128(data); |
96 | MM_XOR (temp, k0) | 116 | MM_XOR (temp, k0) |
97 | MM_XOR (m, temp) | 117 | MM_XOR (m, temp) |
98 | MM_OP_m (_mm_aesenc_si128, k1) | 118 | MM_OP_m (_mm_aesenc_si128, k1) |
@@ -104,9 +124,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
104 | } | 124 | } |
105 | while (--r); | 125 | while (--r); |
106 | MM_OP_m (_mm_aesenclast_si128, w[0]) | 126 | MM_OP_m (_mm_aesenclast_si128, w[0]) |
107 | *data = m; | 127 | STORE_128(data, m); |
128 | data++; | ||
108 | } | 129 | } |
130 | while (--numBlocks); | ||
109 | *p = m; | 131 | *p = m; |
132 | } | ||
110 | } | 133 | } |
111 | 134 | ||
112 | 135 | ||
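Editor's aside (not part of the patch): a minimal standalone sketch of the two LOAD_128 branches above. A plain dereference of a __m128i pointer is an aligned access that the compiler can fuse into the ALU instruction, while _mm_loadu_si128 tolerates any address; the patch can take the aligned branch because the Aes functions require 16-byte-aligned data. Compiles with any SSE2-capable compiler:

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

// aligned path: dereference; may fuse into e.g. "pxor xmm0, XMMWORD PTR [rdx]",
// but is invalid if p is not 16-byte aligned
static __m128i xor16_aligned(const void *p, __m128i k)
{
  return _mm_xor_si128(*(const __m128i *)p, k);
}

// unaligned path: explicit movdqu-style load, valid for any address
static __m128i xor16_unaligned(const void *p, __m128i k)
{
  return _mm_xor_si128(_mm_loadu_si128((const __m128i *)p), k);
}

int main(void)
{
  _Alignas(16) uint8_t buf[16] = { 1, 2, 3 };
  const __m128i k = _mm_set1_epi8(0x55);
  uint8_t out[16];
  // both paths give identical results on aligned input
  _mm_storeu_si128((__m128i *)(void *)out,
      _mm_xor_si128(xor16_aligned(buf, k), xor16_unaligned(buf, k)));
  printf("%d\n", out[0]);  // prints 0: the two loads read the same bytes
  return 0;
}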
@@ -139,12 +162,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
139 | 162 | ||
140 | #define WOP(op) op (m0, 0) WOP_M1(op) | 163 | #define WOP(op) op (m0, 0) WOP_M1(op) |
141 | 164 | ||
142 | |||
143 | #define DECLARE_VAR(reg, ii) __m128i reg; | 165 | #define DECLARE_VAR(reg, ii) __m128i reg; |
144 | #define LOAD_data( reg, ii) reg = data[ii]; | 166 | #define LOAD_data_ii(ii) LOAD_128(data + (ii)) |
145 | #define STORE_data( reg, ii) data[ii] = reg; | 167 | #define LOAD_data( reg, ii) reg = LOAD_data_ii(ii); |
168 | #define STORE_data( reg, ii) STORE_128(data + (ii), reg); | ||
146 | #if (NUM_WAYS > 1) | 169 | #if (NUM_WAYS > 1) |
147 | #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) | 170 | #define XOR_data_M1(reg, ii) MM_XOR (reg, LOAD_128(data + (ii- 1))) |
148 | #endif | 171 | #endif |
149 | 172 | ||
150 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); | 173 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); |
@@ -156,25 +179,22 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
156 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) | 179 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) |
157 | 180 | ||
158 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; | 181 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; |
159 | #define CTR_END( reg, ii) MM_XOR (data[ii], reg) | 182 | #define CTR_END( reg, ii) STORE_128(data + (ii), _mm_xor_si128(reg, \ |
160 | 183 | LOAD_128 (data + (ii)))); | |
161 | #define WOP_KEY(op, n) { \ | 184 | #define WOP_KEY(op, n) { \ |
162 | const __m128i key = w[n]; \ | 185 | const __m128i key = w[n]; \ |
163 | WOP(op); } | 186 | WOP(op) } |
164 | |||
165 | 187 | ||
166 | #define WIDE_LOOP_START \ | 188 | #define WIDE_LOOP_START \ |
167 | dataEnd = data + numBlocks; \ | 189 | dataEnd = data + numBlocks; \ |
168 | if (numBlocks >= NUM_WAYS) \ | 190 | if (numBlocks >= NUM_WAYS) \ |
169 | { dataEnd -= NUM_WAYS; do { \ | 191 | { dataEnd -= NUM_WAYS; do { \ |
170 | 192 | ||
171 | |||
172 | #define WIDE_LOOP_END \ | 193 | #define WIDE_LOOP_END \ |
173 | data += NUM_WAYS; \ | 194 | data += NUM_WAYS; \ |
174 | } while (data <= dataEnd); \ | 195 | } while (data <= dataEnd); \ |
175 | dataEnd += NUM_WAYS; } \ | 196 | dataEnd += NUM_WAYS; } \ |
176 | 197 | ||
177 | |||
178 | #define SINGLE_LOOP \ | 198 | #define SINGLE_LOOP \ |
179 | for (; data < dataEnd; data++) | 199 | for (; data < dataEnd; data++) |
180 | 200 | ||
@@ -184,54 +204,73 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
184 | 204 | ||
185 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) | 205 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) |
186 | #define AVX_DECLARE_VAR(reg, ii) __m256i reg; | 206 | #define AVX_DECLARE_VAR(reg, ii) __m256i reg; |
187 | #define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; | 207 | |
188 | #define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; | 208 | #if 1 |
209 | // use unaligned AVX load/store for data. | ||
210 | // It is required for our Aes functions, that data is aligned for 16-bytes. | ||
211 | // But we need 32-bytes reading. | ||
212 | // So we use intrinsics for unaligned AVX load/store. | ||
213 | // notes for _mm256_storeu_si256: | ||
214 | // msvc2022: uses vmovdqu and keeps the order of instruction sequence. | ||
215 | // new gcc11 uses vmovdqu | ||
216 | // old gcc9 could use pair of instructions: | ||
217 | // vmovups %xmm7, -224(%rax) | ||
218 | // vextracti128 $0x1, %ymm7, -208(%rax) | ||
219 | #define AVX_LOAD(p) _mm256_loadu_si256((const __m256i *)(const void *)(p)) | ||
220 | #define AVX_STORE(p, _v) _mm256_storeu_si256((__m256i *)(void *)(p), _v); | ||
221 | #else | ||
222 | // use aligned AVX load/store for data. | ||
223 | // for debug: we can use this branch, if we are sure that data is aligned for 32-bytes. | ||
224 | // msvc2022 uses vmovdqu still | ||
225 | // gcc uses vmovdqa (that requires 32-bytes alignment) | ||
226 | #define AVX_LOAD(p) (*(const __m256i *)(const void *)(p)) | ||
227 | #define AVX_STORE(p, _v) (*(__m256i *)(void *)(p)) = _v; | ||
228 | #endif | ||
229 | |||
230 | #define AVX_LOAD_data( reg, ii) reg = AVX_LOAD((const __m256i *)(const void *)data + (ii)); | ||
231 | #define AVX_STORE_data( reg, ii) AVX_STORE((__m256i *)(void *)data + (ii), reg) | ||
189 | /* | 232 | /* |
190 | AVX_XOR_data_M1() needs unaligned memory load | 233 | AVX_XOR_data_M1() needs unaligned memory load, even if (data) |
191 | if (we don't use _mm256_loadu_si256() here) | 234 | is aligned for 256-bits, because we read 32-bytes chunk that |
192 | { | 235 | crosses (data) position: from (data - 16bytes) to (data + 16bytes). |
193 | Most compilers with enabled optimizations generate fused AVX (LOAD + OP) | ||
194 | instruction that can load unaligned data. | ||
195 | But GCC and CLANG without -O2 or -O1 optimizations can generate separated | ||
196 | LOAD-ALIGNED (vmovdqa) instruction that will fail on execution. | ||
197 | } | ||
198 | Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here. | ||
199 | v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler. | ||
200 | */ | 236 | */ |
201 | #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii]))) | 237 | #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii))) |
202 | // for debug only: the following code will fail on execution, if compiled by some compilers: | ||
203 | // #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) | ||
204 | 238 | ||
205 | #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) | 239 | #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) |
206 | #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) | 240 | #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) |
207 | #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) | 241 | #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) |
208 | #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) | 242 | #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) |
209 | #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) | 243 | #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) |
210 | #define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); | 244 | #define AVX_CTR_START(reg, ii) \ |
211 | #define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) | 245 | MM_OP (_mm256_add_epi64, ctr2, two) \ |
246 | reg = _mm256_xor_si256(ctr2, key); | ||
247 | |||
248 | #define AVX_CTR_END(reg, ii) \ | ||
249 | AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \ | ||
250 | AVX_LOAD ((__m256i *)(void *)data + (ii)))); | ||
251 | |||
212 | #define AVX_WOP_KEY(op, n) { \ | 252 | #define AVX_WOP_KEY(op, n) { \ |
213 | const __m256i key = w[n]; \ | 253 | const __m256i key = w[n]; \ |
214 | WOP(op); } | 254 | WOP(op) } |
215 | 255 | ||
216 | #define NUM_AES_KEYS_MAX 15 | 256 | #define NUM_AES_KEYS_MAX 15 |
217 | 257 | ||
218 | #define WIDE_LOOP_START_AVX(OP) \ | 258 | #define WIDE_LOOP_START_AVX(OP) \ |
219 | dataEnd = data + numBlocks; \ | 259 | dataEnd = data + numBlocks; \ |
220 | if (numBlocks >= NUM_WAYS * 2) \ | 260 | if (numBlocks >= NUM_WAYS * 2) \ |
221 | { __m256i keys[NUM_AES_KEYS_MAX]; \ | 261 | { __m256i keys[NUM_AES_KEYS_MAX]; \ |
222 | UInt32 ii; \ | 262 | OP \ |
223 | OP \ | 263 | { UInt32 ii; for (ii = 0; ii < numRounds; ii++) \ |
224 | for (ii = 0; ii < numRounds; ii++) \ | 264 | keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \ |
225 | keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \ | 265 | dataEnd -= NUM_WAYS * 2; \ |
226 | dataEnd -= NUM_WAYS * 2; do { \ | 266 | do { \ |
227 | |||
228 | 267 | ||
229 | #define WIDE_LOOP_END_AVX(OP) \ | 268 | #define WIDE_LOOP_END_AVX(OP) \ |
230 | data += NUM_WAYS * 2; \ | 269 | data += NUM_WAYS * 2; \ |
231 | } while (data <= dataEnd); \ | 270 | } while (data <= dataEnd); \ |
232 | dataEnd += NUM_WAYS * 2; \ | 271 | dataEnd += NUM_WAYS * 2; \ |
233 | OP \ | 272 | OP \ |
234 | _mm256_zeroupper(); \ | 273 | _mm256_zeroupper(); \ |
235 | } \ | 274 | } \ |
236 | 275 | ||
237 | /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, | 276 | /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, |
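Editor's aside on AVX_XOR_data_M1 above: CBC decode XORs block i with ciphertext block i-1, so the 32-byte vector of "previous blocks" starts at (data - 1), 16 bytes before the current lane. Even with 16-byte-aligned data that read is never 32-byte aligned, hence _mm256_loadu_si256. A standalone illustration (assumes an AVX-capable build, e.g. -mavx; the buffer layout is only illustrative):

#include <immintrin.h>
#include <stdalign.h>
#include <stdio.h>

int main(void)
{
  alignas(32) unsigned char buf[64] = { 0 };
  buf[16] = 0x55;
  // data is 32-byte aligned here, but (data - 1) = buf + 16 is only 16-byte
  // aligned, so an aligned 32-byte load (vmovdqa) could fault; vmovdqu cannot:
  const __m128i *data = (const __m128i *)(const void *)(buf + 32);
  const __m256i prev = _mm256_loadu_si256((const __m256i *)(const void *)(data - 1));
  unsigned char out[32];
  _mm256_storeu_si256((__m256i *)(void *)out, prev);
  printf("%d\n", out[0]);  // prints 85 (0x55): the read started at buf + 16
  return 0;
}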
@@ -246,21 +285,20 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
246 | __m128i *p = (__m128i *)(void *)ivAes; | 285 | __m128i *p = (__m128i *)(void *)ivAes; |
247 | __m128i *data = (__m128i *)(void *)data8; | 286 | __m128i *data = (__m128i *)(void *)data8; |
248 | __m128i iv = *p; | 287 | __m128i iv = *p; |
249 | const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1; | 288 | const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1; |
250 | const __m128i *dataEnd; | 289 | const __m128i *dataEnd; |
251 | p += 2; | 290 | p += 2; |
252 | 291 | ||
253 | WIDE_LOOP_START | 292 | WIDE_LOOP_START |
254 | { | 293 | { |
255 | const __m128i *w = wStart; | 294 | const __m128i *w = wStart; |
256 | |||
257 | WOP (DECLARE_VAR) | 295 | WOP (DECLARE_VAR) |
258 | WOP (LOAD_data) | 296 | WOP (LOAD_data) |
259 | WOP_KEY (AES_XOR, 1) | 297 | WOP_KEY (AES_XOR, 1) |
260 | |||
261 | do | 298 | do |
262 | { | 299 | { |
263 | WOP_KEY (AES_DEC, 0) | 300 | WOP_KEY (AES_DEC, 0) |
301 | |||
264 | w--; | 302 | w--; |
265 | } | 303 | } |
266 | while (w != p); | 304 | while (w != p); |
@@ -268,7 +306,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
268 | 306 | ||
269 | MM_XOR (m0, iv) | 307 | MM_XOR (m0, iv) |
270 | WOP_M1 (XOR_data_M1) | 308 | WOP_M1 (XOR_data_M1) |
271 | iv = data[NUM_WAYS - 1]; | 309 | LOAD_data(iv, NUM_WAYS - 1) |
272 | WOP (STORE_data) | 310 | WOP (STORE_data) |
273 | } | 311 | } |
274 | WIDE_LOOP_END | 312 | WIDE_LOOP_END |
@@ -276,7 +314,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
276 | SINGLE_LOOP | 314 | SINGLE_LOOP |
277 | { | 315 | { |
278 | const __m128i *w = wStart - 1; | 316 | const __m128i *w = wStart - 1; |
279 | __m128i m = _mm_xor_si128 (w[2], *data); | 317 | __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); |
318 | |||
280 | do | 319 | do |
281 | { | 320 | { |
282 | MM_OP_m (_mm_aesdec_si128, w[1]) | 321 | MM_OP_m (_mm_aesdec_si128, w[1]) |
@@ -286,10 +325,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
286 | while (w != p); | 325 | while (w != p); |
287 | MM_OP_m (_mm_aesdec_si128, w[1]) | 326 | MM_OP_m (_mm_aesdec_si128, w[1]) |
288 | MM_OP_m (_mm_aesdeclast_si128, w[0]) | 327 | MM_OP_m (_mm_aesdeclast_si128, w[0]) |
289 | |||
290 | MM_XOR (m, iv) | 328 | MM_XOR (m, iv) |
291 | iv = *data; | 329 | LOAD_data(iv, 0) |
292 | *data = m; | 330 | STORE_data(m, 0) |
293 | } | 331 | } |
294 | 332 | ||
295 | p[-2] = iv; | 333 | p[-2] = iv; |
@@ -301,9 +339,9 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
301 | __m128i *p = (__m128i *)(void *)ivAes; | 339 | __m128i *p = (__m128i *)(void *)ivAes; |
302 | __m128i *data = (__m128i *)(void *)data8; | 340 | __m128i *data = (__m128i *)(void *)data8; |
303 | __m128i ctr = *p; | 341 | __m128i ctr = *p; |
304 | UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; | 342 | const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; |
305 | const __m128i *dataEnd; | 343 | const __m128i *dataEnd; |
306 | __m128i one = _mm_cvtsi32_si128(1); | 344 | const __m128i one = _mm_cvtsi32_si128(1); |
307 | 345 | ||
308 | p += 2; | 346 | p += 2; |
309 | 347 | ||
@@ -322,7 +360,6 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
322 | } | 360 | } |
323 | while (--r); | 361 | while (--r); |
324 | WOP_KEY (AES_ENC_LAST, 0) | 362 | WOP_KEY (AES_ENC_LAST, 0) |
325 | |||
326 | WOP (CTR_END) | 363 | WOP (CTR_END) |
327 | } | 364 | } |
328 | WIDE_LOOP_END | 365 | WIDE_LOOP_END |
@@ -344,7 +381,7 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
344 | while (--numRounds2); | 381 | while (--numRounds2); |
345 | MM_OP_m (_mm_aesenc_si128, w[0]) | 382 | MM_OP_m (_mm_aesenc_si128, w[0]) |
346 | MM_OP_m (_mm_aesenclast_si128, w[1]) | 383 | MM_OP_m (_mm_aesenclast_si128, w[1]) |
347 | MM_XOR (*data, m) | 384 | CTR_END (m, 0) |
348 | } | 385 | } |
349 | 386 | ||
350 | p[-2] = ctr; | 387 | p[-2] = ctr; |
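Editor's aside: the reworked CTR_END spells out the load-XOR-store that CTR mode performs on each block. Stripped of intrinsics (a plain-C sketch; aes_encrypt_block is a placeholder, not the 7-Zip API), the whole mode is: increment the counter, encrypt it, XOR the keystream into the data. Encryption and decryption are the same operation:

#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t lo, hi; } Block128;

// stand-in for the AES round chain applied to the counter block
static Block128 aes_encrypt_block(Block128 b) { return b; /* placeholder */ }

static void ctr_code(Block128 *data, size_t numBlocks, Block128 *ctr)
{
  for (size_t i = 0; i < numBlocks; i++)
  {
    Block128 ks;
    ctr->lo++;                    // matches _mm_add_epi64(ctr, one): low 64-bit lane + 1
    ks = aes_encrypt_block(*ctr);
    data[i].lo ^= ks.lo;          // CTR_END: load block, XOR keystream, store back
    data[i].hi ^= ks.hi;
  }
}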
@@ -421,7 +458,7 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) | |||
421 | __m128i *data = (__m128i *)(void *)data8; | 458 | __m128i *data = (__m128i *)(void *)data8; |
422 | __m128i iv = *p; | 459 | __m128i iv = *p; |
423 | const __m128i *dataEnd; | 460 | const __m128i *dataEnd; |
424 | UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; | 461 | const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; |
425 | p += 2; | 462 | p += 2; |
426 | 463 | ||
427 | WIDE_LOOP_START_AVX(;) | 464 | WIDE_LOOP_START_AVX(;) |
@@ -440,17 +477,17 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) | |||
440 | while (w != keys); | 477 | while (w != keys); |
441 | AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) | 478 | AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) |
442 | 479 | ||
443 | AVX_XOR (m0, _mm256_setr_m128i(iv, data[0])) | 480 | AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0))) |
444 | WOP_M1 (AVX_XOR_data_M1) | 481 | WOP_M1 (AVX_XOR_data_M1) |
445 | iv = data[NUM_WAYS * 2 - 1]; | 482 | LOAD_data (iv, NUM_WAYS * 2 - 1) |
446 | WOP (AVX_STORE_data) | 483 | WOP (AVX_STORE_data) |
447 | } | 484 | } |
448 | WIDE_LOOP_END_AVX(;) | 485 | WIDE_LOOP_END_AVX(;) |
449 | 486 | ||
450 | SINGLE_LOOP | 487 | SINGLE_LOOP |
451 | { | 488 | { |
452 | const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3; | 489 | const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2; |
453 | __m128i m = _mm_xor_si128 (w[2], *data); | 490 | __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); |
454 | do | 491 | do |
455 | { | 492 | { |
456 | MM_OP_m (_mm_aesdec_si128, w[1]) | 493 | MM_OP_m (_mm_aesdec_si128, w[1]) |
@@ -462,8 +499,8 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) | |||
462 | MM_OP_m (_mm_aesdeclast_si128, w[0]) | 499 | MM_OP_m (_mm_aesdeclast_si128, w[0]) |
463 | 500 | ||
464 | MM_XOR (m, iv) | 501 | MM_XOR (m, iv) |
465 | iv = *data; | 502 | LOAD_data(iv, 0) |
466 | *data = m; | 503 | STORE_data(m, 0) |
467 | } | 504 | } |
468 | 505 | ||
469 | p[-2] = iv; | 506 | p[-2] = iv; |
@@ -493,9 +530,9 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
493 | __m128i *p = (__m128i *)(void *)ivAes; | 530 | __m128i *p = (__m128i *)(void *)ivAes; |
494 | __m128i *data = (__m128i *)(void *)data8; | 531 | __m128i *data = (__m128i *)(void *)data8; |
495 | __m128i ctr = *p; | 532 | __m128i ctr = *p; |
496 | UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; | 533 | const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; |
497 | const __m128i *dataEnd; | 534 | const __m128i *dataEnd; |
498 | __m128i one = _mm_cvtsi32_si128(1); | 535 | const __m128i one = _mm_cvtsi32_si128(1); |
499 | __m256i ctr2, two; | 536 | __m256i ctr2, two; |
500 | p += 2; | 537 | p += 2; |
501 | 538 | ||
@@ -536,7 +573,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
536 | while (--numRounds2); | 573 | while (--numRounds2); |
537 | MM_OP_m (_mm_aesenc_si128, w[0]) | 574 | MM_OP_m (_mm_aesenc_si128, w[0]) |
538 | MM_OP_m (_mm_aesenclast_si128, w[1]) | 575 | MM_OP_m (_mm_aesenclast_si128, w[1]) |
539 | MM_XOR (*data, m) | 576 | CTR_END (m, 0) |
540 | } | 577 | } |
541 | 578 | ||
542 | p[-2] = ctr; | 579 | p[-2] = ctr; |
@@ -731,9 +768,14 @@ AES_FUNC_START (name) | |||
731 | 768 | ||
732 | AES_FUNC_START2 (AesCbc_Encode_HW) | 769 | AES_FUNC_START2 (AesCbc_Encode_HW) |
733 | { | 770 | { |
734 | v128 * const p = (v128*)(void*)ivAes; | 771 | if (numBlocks == 0) |
735 | v128 *data = (v128*)(void*)data8; | 772 | return; |
773 | { | ||
774 | v128 * const p = (v128 *)(void *)ivAes; | ||
775 | v128 *data = (v128 *)(void *)data8; | ||
736 | v128 m = *p; | 776 | v128 m = *p; |
777 | const UInt32 numRounds2 = *(const UInt32 *)(p + 1); | ||
778 | const v128 *w = p + (size_t)numRounds2 * 2; | ||
737 | const v128 k0 = p[2]; | 779 | const v128 k0 = p[2]; |
738 | const v128 k1 = p[3]; | 780 | const v128 k1 = p[3]; |
739 | const v128 k2 = p[4]; | 781 | const v128 k2 = p[4]; |
@@ -744,11 +786,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
744 | const v128 k7 = p[9]; | 786 | const v128 k7 = p[9]; |
745 | const v128 k8 = p[10]; | 787 | const v128 k8 = p[10]; |
746 | const v128 k9 = p[11]; | 788 | const v128 k9 = p[11]; |
747 | const UInt32 numRounds2 = *(const UInt32 *)(p + 1); | 789 | const v128 k_z4 = w[-2]; |
748 | const v128 *w = p + ((size_t)numRounds2 * 2); | 790 | const v128 k_z3 = w[-1]; |
791 | const v128 k_z2 = w[0]; | ||
749 | const v128 k_z1 = w[1]; | 792 | const v128 k_z1 = w[1]; |
750 | const v128 k_z0 = w[2]; | 793 | const v128 k_z0 = w[2]; |
751 | for (; numBlocks != 0; numBlocks--, data++) | 794 | // we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle, |
795 | // because gcc/clang compilers are not good for that optimization. | ||
796 | do | ||
752 | { | 797 | { |
753 | MM_XOR_m (*data) | 798 | MM_XOR_m (*data) |
754 | AES_E_MC_m (k0) | 799 | AES_E_MC_m (k0) |
@@ -757,24 +802,26 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
757 | AES_E_MC_m (k3) | 802 | AES_E_MC_m (k3) |
758 | AES_E_MC_m (k4) | 803 | AES_E_MC_m (k4) |
759 | AES_E_MC_m (k5) | 804 | AES_E_MC_m (k5) |
760 | AES_E_MC_m (k6) | ||
761 | AES_E_MC_m (k7) | ||
762 | AES_E_MC_m (k8) | ||
763 | if (numRounds2 >= 6) | 805 | if (numRounds2 >= 6) |
764 | { | 806 | { |
765 | AES_E_MC_m (k9) | 807 | AES_E_MC_m (k6) |
766 | AES_E_MC_m (p[12]) | 808 | AES_E_MC_m (k7) |
767 | if (numRounds2 != 6) | 809 | if (numRounds2 != 6) |
768 | { | 810 | { |
769 | AES_E_MC_m (p[13]) | 811 | AES_E_MC_m (k8) |
770 | AES_E_MC_m (p[14]) | 812 | AES_E_MC_m (k9) |
771 | } | 813 | } |
772 | } | 814 | } |
773 | AES_E_m (k_z1) | 815 | AES_E_MC_m (k_z4) |
774 | MM_XOR_m (k_z0) | 816 | AES_E_MC_m (k_z3) |
775 | *data = m; | 817 | AES_E_MC_m (k_z2) |
818 | AES_E_m (k_z1) | ||
819 | MM_XOR_m (k_z0) | ||
820 | *data++ = m; | ||
776 | } | 821 | } |
822 | while (--numBlocks); | ||
777 | *p = m; | 823 | *p = m; |
824 | } | ||
778 | } | 825 | } |
779 | 826 | ||
780 | 827 | ||
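Editor's aside: the restructured NEON round chain above dispatches on numRounds2 = rounds/2, stored at p[1] (5, 6, 7 for AES-128/192/256). The arithmetic below checks that the fixed six-round prologue (k0..k5), the optional two or four extra rounds (k6..k9), and the three-round epilogue (k_z4..k_z2) plus the final AES_E/XOR round add up to the standard 10/12/14 rounds:

#include <stdio.h>

int main(void)
{
  const struct { int bits, rounds; } aes[] = { {128, 10}, {192, 12}, {256, 14} };
  for (int i = 0; i < 3; i++)
  {
    const int numRounds2 = aes[i].rounds / 2;
    const int extra = numRounds2 < 6 ? 0 : (numRounds2 == 6 ? 2 : 4);
    const int e_mc = 6 + extra + 3;       // AES_E_MC_m invocations
    printf("AES-%d: %d E+MC rounds + 1 final = %d (expected %d)\n",
           aes[i].bits, e_mc, e_mc + 1, aes[i].rounds);
  }
  return 0;
}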
@@ -834,10 +881,10 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
834 | 881 | ||
835 | AES_FUNC_START2 (AesCbc_Decode_HW) | 882 | AES_FUNC_START2 (AesCbc_Decode_HW) |
836 | { | 883 | { |
837 | v128 *p = (v128*)(void*)ivAes; | 884 | v128 *p = (v128 *)(void *)ivAes; |
838 | v128 *data = (v128*)(void*)data8; | 885 | v128 *data = (v128 *)(void *)data8; |
839 | v128 iv = *p; | 886 | v128 iv = *p; |
840 | const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; | 887 | const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2; |
841 | const v128 *dataEnd; | 888 | const v128 *dataEnd; |
842 | p += 2; | 889 | p += 2; |
843 | 890 | ||
@@ -858,7 +905,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
858 | WOP_KEY (AES_XOR, 0) | 905 | WOP_KEY (AES_XOR, 0) |
859 | MM_XOR (m0, iv) | 906 | MM_XOR (m0, iv) |
860 | WOP_M1 (XOR_data_M1) | 907 | WOP_M1 (XOR_data_M1) |
861 | iv = data[NUM_WAYS - 1]; | 908 | LOAD_data(iv, NUM_WAYS - 1) |
862 | WOP (STORE_data) | 909 | WOP (STORE_data) |
863 | } | 910 | } |
864 | WIDE_LOOP_END | 911 | WIDE_LOOP_END |
@@ -866,7 +913,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
866 | SINGLE_LOOP | 913 | SINGLE_LOOP |
867 | { | 914 | { |
868 | const v128 *w = wStart; | 915 | const v128 *w = wStart; |
869 | v128 m = *data; | 916 | v128 m; LOAD_data(m, 0) |
870 | AES_D_IMC_m (w[2]) | 917 | AES_D_IMC_m (w[2]) |
871 | do | 918 | do |
872 | { | 919 | { |
@@ -878,8 +925,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
878 | AES_D_m (w[1]) | 925 | AES_D_m (w[1]) |
879 | MM_XOR_m (w[0]) | 926 | MM_XOR_m (w[0]) |
880 | MM_XOR_m (iv) | 927 | MM_XOR_m (iv) |
881 | iv = *data; | 928 | LOAD_data(iv, 0) |
882 | *data = m; | 929 | STORE_data(m, 0) |
883 | } | 930 | } |
884 | 931 | ||
885 | p[-2] = iv; | 932 | p[-2] = iv; |
@@ -888,19 +935,17 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
888 | 935 | ||
889 | AES_FUNC_START2 (AesCtr_Code_HW) | 936 | AES_FUNC_START2 (AesCtr_Code_HW) |
890 | { | 937 | { |
891 | v128 *p = (v128*)(void*)ivAes; | 938 | v128 *p = (v128 *)(void *)ivAes; |
892 | v128 *data = (v128*)(void*)data8; | 939 | v128 *data = (v128 *)(void *)data8; |
893 | uint64x2_t ctr = vreinterpretq_u64_u8(*p); | 940 | uint64x2_t ctr = vreinterpretq_u64_u8(*p); |
894 | const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; | 941 | const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2; |
895 | const v128 *dataEnd; | 942 | const v128 *dataEnd; |
896 | uint64x2_t one = vdupq_n_u64(0); | ||
897 | |||
898 | // the bug in clang: | 943 | // the bug in clang: |
899 | // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); | 944 | // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); |
900 | #if defined(__clang__) && (__clang_major__ <= 9) | 945 | #if defined(__clang__) && (__clang_major__ <= 9) |
901 | #pragma GCC diagnostic ignored "-Wvector-conversion" | 946 | #pragma GCC diagnostic ignored "-Wvector-conversion" |
902 | #endif | 947 | #endif |
903 | one = vsetq_lane_u64(1, one, 0); | 948 | const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0); |
904 | p += 2; | 949 | p += 2; |
905 | 950 | ||
906 | WIDE_LOOP_START | 951 | WIDE_LOOP_START |
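Editor's aside: the last hunk above folds the two-step construction of the NEON counter increment into one const initializer. A standalone check (AArch64/NEON only) that the constant is {1, 0} and steps only the low 64-bit lane:

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
  const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0);  // lanes {1, 0}
  uint64x2_t ctr = vdupq_n_u64(0);
  ctr = vaddq_u64(ctr, one);   // counter advances by 1 in the low lane only
  printf("%llu %llu\n",
         (unsigned long long)vgetq_lane_u64(ctr, 0),
         (unsigned long long)vgetq_lane_u64(ctr, 1));  // prints: 1 0
  return 0;
}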
diff --git a/C/BwtSort.c b/C/BwtSort.c
index 05ad6de..8f64f9d 100644
--- a/C/BwtSort.c
+++ b/C/BwtSort.c
@@ -1,5 +1,5 @@ | |||
1 | /* BwtSort.c -- BWT block sorting | 1 | /* BwtSort.c -- BWT block sorting |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -7,6 +7,44 @@ | |||
7 | #include "Sort.h" | 7 | #include "Sort.h" |
8 | 8 | ||
9 | /* #define BLOCK_SORT_USE_HEAP_SORT */ | 9 | /* #define BLOCK_SORT_USE_HEAP_SORT */ |
10 | // #define BLOCK_SORT_USE_HEAP_SORT | ||
11 | |||
12 | #ifdef BLOCK_SORT_USE_HEAP_SORT | ||
13 | |||
14 | #define HeapSortRefDown(p, vals, n, size, temp) \ | ||
15 | { size_t k = n; UInt32 val = vals[temp]; for (;;) { \ | ||
16 | size_t s = k << 1; \ | ||
17 | if (s > size) break; \ | ||
18 | if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \ | ||
19 | if (val >= vals[p[s]]) break; \ | ||
20 | p[k] = p[s]; k = s; \ | ||
21 | } p[k] = temp; } | ||
22 | |||
23 | void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size) | ||
24 | { | ||
25 | if (size <= 1) | ||
26 | return; | ||
27 | p--; | ||
28 | { | ||
29 | size_t i = size / 2; | ||
30 | do | ||
31 | { | ||
32 | UInt32 temp = p[i]; | ||
33 | HeapSortRefDown(p, vals, i, size, temp); | ||
34 | } | ||
35 | while (--i != 0); | ||
36 | } | ||
37 | do | ||
38 | { | ||
39 | UInt32 temp = p[size]; | ||
40 | p[size--] = p[1]; | ||
41 | HeapSortRefDown(p, vals, 1, size, temp); | ||
42 | } | ||
43 | while (size > 1); | ||
44 | } | ||
45 | |||
46 | #endif // BLOCK_SORT_USE_HEAP_SORT | ||
47 | |||
10 | 48 | ||
11 | /* Don't change it !!! */ | 49 | /* Don't change it !!! */ |
12 | #define kNumHashBytes 2 | 50 | #define kNumHashBytes 2 |
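Editor's usage sketch for the HeapSortRef added above (compiled only under BLOCK_SORT_USE_HEAP_SORT): p[] holds indices into vals[], and the sort reorders p so that vals[p[0]] <= vals[p[1]] <= ... Link against BwtSort.c built with that macro defined:

#include <stddef.h>
#include <stdio.h>

typedef unsigned int UInt32;
void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size);  // from BwtSort.c

int main(void)
{
  UInt32 vals[] = { 50, 10, 40, 20, 30 };
  UInt32 p[]    = { 0, 1, 2, 3, 4 };
  HeapSortRef(p, vals, 5);
  for (int i = 0; i < 5; i++)
    printf("%u ", vals[p[i]]);   // prints: 10 20 30 40 50
  printf("\n");
  return 0;
}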
@@ -27,26 +65,27 @@ | |||
27 | 65 | ||
28 | #else | 66 | #else |
29 | 67 | ||
30 | #define kNumBitsMax 20 | 68 | #define kNumBitsMax 20 |
31 | #define kIndexMask ((1 << kNumBitsMax) - 1) | 69 | #define kIndexMask (((UInt32)1 << kNumBitsMax) - 1) |
32 | #define kNumExtraBits (32 - kNumBitsMax) | 70 | #define kNumExtraBits (32 - kNumBitsMax) |
33 | #define kNumExtra0Bits (kNumExtraBits - 2) | 71 | #define kNumExtra0Bits (kNumExtraBits - 2) |
34 | #define kNumExtra0Mask ((1 << kNumExtra0Bits) - 1) | 72 | #define kNumExtra0Mask ((1 << kNumExtra0Bits) - 1) |
35 | 73 | ||
36 | #define SetFinishedGroupSize(p, size) \ | 74 | #define SetFinishedGroupSize(p, size) \ |
37 | { *(p) |= ((((size) - 1) & kNumExtra0Mask) << kNumBitsMax); \ | 75 | { *(p) |= ((((UInt32)(size) - 1) & kNumExtra0Mask) << kNumBitsMax); \ |
38 | if ((size) > (1 << kNumExtra0Bits)) { \ | 76 | if ((size) > (1 << kNumExtra0Bits)) { \ |
39 | *(p) |= 0x40000000; *((p) + 1) |= ((((size) - 1)>> kNumExtra0Bits) << kNumBitsMax); } } \ | 77 | *(p) |= 0x40000000; \ |
78 | *((p) + 1) |= (((UInt32)(size) - 1) >> kNumExtra0Bits) << kNumBitsMax; } } \ | ||
40 | 79 | ||
41 | static void SetGroupSize(UInt32 *p, UInt32 size) | 80 | static void SetGroupSize(UInt32 *p, size_t size) |
42 | { | 81 | { |
43 | if (--size == 0) | 82 | if (--size == 0) |
44 | return; | 83 | return; |
45 | *p |= 0x80000000 | ((size & kNumExtra0Mask) << kNumBitsMax); | 84 | *p |= 0x80000000 | (((UInt32)size & kNumExtra0Mask) << kNumBitsMax); |
46 | if (size >= (1 << kNumExtra0Bits)) | 85 | if (size >= (1 << kNumExtra0Bits)) |
47 | { | 86 | { |
48 | *p |= 0x40000000; | 87 | *p |= 0x40000000; |
49 | p[1] |= ((size >> kNumExtra0Bits) << kNumBitsMax); | 88 | p[1] |= (((UInt32)size >> kNumExtra0Bits) << kNumBitsMax); |
50 | } | 89 | } |
51 | } | 90 | } |
52 | 91 | ||
@@ -59,12 +98,14 @@ returns: 1 - if there are groups, 0 - no more groups | |||
59 | */ | 98 | */ |
60 | 99 | ||
61 | static | 100 | static |
62 | UInt32 | 101 | unsigned |
63 | Z7_FASTCALL | 102 | Z7_FASTCALL |
64 | SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 groupSize, int NumRefBits, UInt32 *Indices | 103 | SortGroup(size_t BlockSize, size_t NumSortedBytes, |
65 | #ifndef BLOCK_SORT_USE_HEAP_SORT | 104 | size_t groupOffset, size_t groupSize, |
66 | , UInt32 left, UInt32 range | 105 | unsigned NumRefBits, UInt32 *Indices |
67 | #endif | 106 | #ifndef BLOCK_SORT_USE_HEAP_SORT |
107 | , size_t left, size_t range | ||
108 | #endif | ||
68 | ) | 109 | ) |
69 | { | 110 | { |
70 | UInt32 *ind2 = Indices + groupOffset; | 111 | UInt32 *ind2 = Indices + groupOffset; |
@@ -79,90 +120,93 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr | |||
79 | return 0; | 120 | return 0; |
80 | } | 121 | } |
81 | Groups = Indices + BlockSize + BS_TEMP_SIZE; | 122 | Groups = Indices + BlockSize + BS_TEMP_SIZE; |
82 | if (groupSize <= ((UInt32)1 << NumRefBits) | 123 | if (groupSize <= ((size_t)1 << NumRefBits) |
83 | #ifndef BLOCK_SORT_USE_HEAP_SORT | 124 | #ifndef BLOCK_SORT_USE_HEAP_SORT |
84 | && groupSize <= range | 125 | && groupSize <= range |
85 | #endif | 126 | #endif |
86 | ) | 127 | ) |
87 | { | 128 | { |
88 | UInt32 *temp = Indices + BlockSize; | 129 | UInt32 *temp = Indices + BlockSize; |
89 | UInt32 j; | 130 | size_t j, group; |
90 | UInt32 mask, thereAreGroups, group, cg; | 131 | UInt32 mask, cg; |
132 | unsigned thereAreGroups; | ||
91 | { | 133 | { |
92 | UInt32 gPrev; | 134 | UInt32 gPrev; |
93 | UInt32 gRes = 0; | 135 | UInt32 gRes = 0; |
94 | { | 136 | { |
95 | UInt32 sp = ind2[0] + NumSortedBytes; | 137 | size_t sp = ind2[0] + NumSortedBytes; |
96 | if (sp >= BlockSize) sp -= BlockSize; | 138 | if (sp >= BlockSize) |
139 | sp -= BlockSize; | ||
97 | gPrev = Groups[sp]; | 140 | gPrev = Groups[sp]; |
98 | temp[0] = (gPrev << NumRefBits); | 141 | temp[0] = gPrev << NumRefBits; |
99 | } | 142 | } |
100 | 143 | ||
101 | for (j = 1; j < groupSize; j++) | 144 | for (j = 1; j < groupSize; j++) |
102 | { | 145 | { |
103 | UInt32 sp = ind2[j] + NumSortedBytes; | 146 | size_t sp = ind2[j] + NumSortedBytes; |
104 | UInt32 g; | 147 | UInt32 g; |
105 | if (sp >= BlockSize) sp -= BlockSize; | 148 | if (sp >= BlockSize) |
149 | sp -= BlockSize; | ||
106 | g = Groups[sp]; | 150 | g = Groups[sp]; |
107 | temp[j] = (g << NumRefBits) | j; | 151 | temp[j] = (g << NumRefBits) | (UInt32)j; |
108 | gRes |= (gPrev ^ g); | 152 | gRes |= (gPrev ^ g); |
109 | } | 153 | } |
110 | if (gRes == 0) | 154 | if (gRes == 0) |
111 | { | 155 | { |
112 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 156 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
113 | SetGroupSize(ind2, groupSize); | 157 | SetGroupSize(ind2, groupSize); |
114 | #endif | 158 | #endif |
115 | return 1; | 159 | return 1; |
116 | } | 160 | } |
117 | } | 161 | } |
118 | 162 | ||
119 | HeapSort(temp, groupSize); | 163 | HeapSort(temp, groupSize); |
120 | mask = (((UInt32)1 << NumRefBits) - 1); | 164 | mask = ((UInt32)1 << NumRefBits) - 1; |
121 | thereAreGroups = 0; | 165 | thereAreGroups = 0; |
122 | 166 | ||
123 | group = groupOffset; | 167 | group = groupOffset; |
124 | cg = (temp[0] >> NumRefBits); | 168 | cg = temp[0] >> NumRefBits; |
125 | temp[0] = ind2[temp[0] & mask]; | 169 | temp[0] = ind2[temp[0] & mask]; |
126 | 170 | ||
127 | { | 171 | { |
128 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 172 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
129 | UInt32 *Flags = Groups + BlockSize; | 173 | UInt32 *Flags = Groups + BlockSize; |
130 | #else | 174 | #else |
131 | UInt32 prevGroupStart = 0; | 175 | size_t prevGroupStart = 0; |
132 | #endif | 176 | #endif |
133 | 177 | ||
134 | for (j = 1; j < groupSize; j++) | 178 | for (j = 1; j < groupSize; j++) |
135 | { | 179 | { |
136 | UInt32 val = temp[j]; | 180 | const UInt32 val = temp[j]; |
137 | UInt32 cgCur = (val >> NumRefBits); | 181 | const UInt32 cgCur = val >> NumRefBits; |
138 | 182 | ||
139 | if (cgCur != cg) | 183 | if (cgCur != cg) |
140 | { | 184 | { |
141 | cg = cgCur; | 185 | cg = cgCur; |
142 | group = groupOffset + j; | 186 | group = groupOffset + j; |
143 | 187 | ||
144 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 188 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
145 | { | 189 | { |
146 | UInt32 t = group - 1; | 190 | const size_t t = group - 1; |
147 | Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); | 191 | Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask)); |
148 | } | 192 | } |
149 | #else | 193 | #else |
150 | SetGroupSize(temp + prevGroupStart, j - prevGroupStart); | 194 | SetGroupSize(temp + prevGroupStart, j - prevGroupStart); |
151 | prevGroupStart = j; | 195 | prevGroupStart = j; |
152 | #endif | 196 | #endif |
153 | } | 197 | } |
154 | else | 198 | else |
155 | thereAreGroups = 1; | 199 | thereAreGroups = 1; |
156 | { | 200 | { |
157 | UInt32 ind = ind2[val & mask]; | 201 | const UInt32 ind = ind2[val & mask]; |
158 | temp[j] = ind; | 202 | temp[j] = ind; |
159 | Groups[ind] = group; | 203 | Groups[ind] = (UInt32)group; |
160 | } | 204 | } |
161 | } | 205 | } |
162 | 206 | ||
163 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 207 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
164 | SetGroupSize(temp + prevGroupStart, j - prevGroupStart); | 208 | SetGroupSize(temp + prevGroupStart, j - prevGroupStart); |
165 | #endif | 209 | #endif |
166 | } | 210 | } |
167 | 211 | ||
168 | for (j = 0; j < groupSize; j++) | 212 | for (j = 0; j < groupSize; j++) |
@@ -172,37 +216,42 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr | |||
172 | 216 | ||
173 | /* Check that all strings are in one group (cannot sort) */ | 217 | /* Check that all strings are in one group (cannot sort) */ |
174 | { | 218 | { |
175 | UInt32 group, j; | 219 | UInt32 group; |
176 | UInt32 sp = ind2[0] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; | 220 | size_t j; |
221 | size_t sp = ind2[0] + NumSortedBytes; | ||
222 | if (sp >= BlockSize) | ||
223 | sp -= BlockSize; | ||
177 | group = Groups[sp]; | 224 | group = Groups[sp]; |
178 | for (j = 1; j < groupSize; j++) | 225 | for (j = 1; j < groupSize; j++) |
179 | { | 226 | { |
180 | sp = ind2[j] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; | 227 | sp = ind2[j] + NumSortedBytes; |
228 | if (sp >= BlockSize) | ||
229 | sp -= BlockSize; | ||
181 | if (Groups[sp] != group) | 230 | if (Groups[sp] != group) |
182 | break; | 231 | break; |
183 | } | 232 | } |
184 | if (j == groupSize) | 233 | if (j == groupSize) |
185 | { | 234 | { |
186 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 235 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
187 | SetGroupSize(ind2, groupSize); | 236 | SetGroupSize(ind2, groupSize); |
188 | #endif | 237 | #endif |
189 | return 1; | 238 | return 1; |
190 | } | 239 | } |
191 | } | 240 | } |
192 | 241 | ||
193 | #ifndef BLOCK_SORT_USE_HEAP_SORT | 242 | #ifndef BLOCK_SORT_USE_HEAP_SORT |
194 | { | 243 | { |
195 | /* ---------- Range Sort ---------- */ | 244 | /* ---------- Range Sort ---------- */ |
196 | UInt32 i; | 245 | size_t i; |
197 | UInt32 mid; | 246 | size_t mid; |
198 | for (;;) | 247 | for (;;) |
199 | { | 248 | { |
200 | UInt32 j; | 249 | size_t j; |
201 | if (range <= 1) | 250 | if (range <= 1) |
202 | { | 251 | { |
203 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 252 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
204 | SetGroupSize(ind2, groupSize); | 253 | SetGroupSize(ind2, groupSize); |
205 | #endif | 254 | #endif |
206 | return 1; | 255 | return 1; |
207 | } | 256 | } |
208 | mid = left + ((range + 1) >> 1); | 257 | mid = left + ((range + 1) >> 1); |
@@ -210,7 +259,7 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr | |||
210 | i = 0; | 259 | i = 0; |
211 | do | 260 | do |
212 | { | 261 | { |
213 | UInt32 sp = ind2[i] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; | 262 | size_t sp = ind2[i] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; |
214 | if (Groups[sp] >= mid) | 263 | if (Groups[sp] >= mid) |
215 | { | 264 | { |
216 | for (j--; j > i; j--) | 265 | for (j--; j > i; j--) |
@@ -238,51 +287,53 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr | |||
238 | break; | 287 | break; |
239 | } | 288 | } |
240 | 289 | ||
241 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 290 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
242 | { | 291 | { |
243 | UInt32 t = (groupOffset + i - 1); | 292 | const size_t t = groupOffset + i - 1; |
244 | UInt32 *Flags = Groups + BlockSize; | 293 | UInt32 *Flags = Groups + BlockSize; |
245 | Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); | 294 | Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask)); |
246 | } | 295 | } |
247 | #endif | 296 | #endif |
248 | 297 | ||
249 | { | 298 | { |
250 | UInt32 j; | 299 | size_t j; |
251 | for (j = i; j < groupSize; j++) | 300 | for (j = i; j < groupSize; j++) |
252 | Groups[ind2[j]] = groupOffset + i; | 301 | Groups[ind2[j]] = (UInt32)(groupOffset + i); |
253 | } | 302 | } |
254 | 303 | ||
255 | { | 304 | { |
256 | UInt32 res = SortGroup(BlockSize, NumSortedBytes, groupOffset, i, NumRefBits, Indices, left, mid - left); | 305 | unsigned res = SortGroup(BlockSize, NumSortedBytes, groupOffset, i, NumRefBits, Indices, left, mid - left); |
257 | return res | SortGroup(BlockSize, NumSortedBytes, groupOffset + i, groupSize - i, NumRefBits, Indices, mid, range - (mid - left)); | 306 | return res | SortGroup(BlockSize, NumSortedBytes, groupOffset + i, groupSize - i, NumRefBits, Indices, mid, range - (mid - left)); |
258 | } | 307 | } |
259 | 308 | ||
260 | } | 309 | } |
261 | 310 | ||
262 | #else | 311 | #else // BLOCK_SORT_USE_HEAP_SORT |
263 | 312 | ||
264 | /* ---------- Heap Sort ---------- */ | 313 | /* ---------- Heap Sort ---------- */ |
265 | 314 | ||
266 | { | 315 | { |
267 | UInt32 j; | 316 | size_t j; |
268 | for (j = 0; j < groupSize; j++) | 317 | for (j = 0; j < groupSize; j++) |
269 | { | 318 | { |
270 | UInt32 sp = ind2[j] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; | 319 | size_t sp = ind2[j] + NumSortedBytes; |
271 | ind2[j] = sp; | 320 | if (sp >= BlockSize) |
321 | sp -= BlockSize; | ||
322 | ind2[j] = (UInt32)sp; | ||
272 | } | 323 | } |
273 | 324 | ||
274 | HeapSortRef(ind2, Groups, groupSize); | 325 | HeapSortRef(ind2, Groups, groupSize); |
275 | 326 | ||
276 | /* Write Flags */ | 327 | /* Write Flags */ |
277 | { | 328 | { |
278 | UInt32 sp = ind2[0]; | 329 | size_t sp = ind2[0]; |
279 | UInt32 group = Groups[sp]; | 330 | UInt32 group = Groups[sp]; |
280 | 331 | ||
281 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 332 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
282 | UInt32 *Flags = Groups + BlockSize; | 333 | UInt32 *Flags = Groups + BlockSize; |
283 | #else | 334 | #else |
284 | UInt32 prevGroupStart = 0; | 335 | size_t prevGroupStart = 0; |
285 | #endif | 336 | #endif |
286 | 337 | ||
287 | for (j = 1; j < groupSize; j++) | 338 | for (j = 1; j < groupSize; j++) |
288 | { | 339 | { |
@@ -290,149 +341,210 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr | |||
290 | if (Groups[sp] != group) | 341 | if (Groups[sp] != group) |
291 | { | 342 | { |
292 | group = Groups[sp]; | 343 | group = Groups[sp]; |
293 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 344 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
294 | { | 345 | { |
295 | UInt32 t = groupOffset + j - 1; | 346 | const size_t t = groupOffset + j - 1; |
296 | Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); | 347 | Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); |
297 | } | 348 | } |
298 | #else | 349 | #else |
299 | SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart); | 350 | SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart); |
300 | prevGroupStart = j; | 351 | prevGroupStart = j; |
301 | #endif | 352 | #endif |
302 | } | 353 | } |
303 | } | 354 | } |
304 | 355 | ||
305 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 356 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
306 | SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart); | 357 | SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart); |
307 | #endif | 358 | #endif |
308 | } | 359 | } |
309 | { | 360 | { |
310 | /* Write new Groups values and Check that there are groups */ | 361 | /* Write new Groups values and Check that there are groups */ |
311 | UInt32 thereAreGroups = 0; | 362 | unsigned thereAreGroups = 0; |
312 | for (j = 0; j < groupSize; j++) | 363 | for (j = 0; j < groupSize; j++) |
313 | { | 364 | { |
314 | UInt32 group = groupOffset + j; | 365 | size_t group = groupOffset + j; |
315 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 366 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
316 | UInt32 subGroupSize = ((ind2[j] & ~0xC0000000) >> kNumBitsMax); | 367 | UInt32 subGroupSize = ((ind2[j] & ~0xC0000000) >> kNumBitsMax); |
317 | if ((ind2[j] & 0x40000000) != 0) | 368 | if (ind2[j] & 0x40000000) |
318 | subGroupSize += ((ind2[(size_t)j + 1] >> kNumBitsMax) << kNumExtra0Bits); | 369 | subGroupSize += ((ind2[(size_t)j + 1] >> kNumBitsMax) << kNumExtra0Bits); |
319 | subGroupSize++; | 370 | subGroupSize++; |
320 | for (;;) | 371 | for (;;) |
321 | { | 372 | { |
322 | UInt32 original = ind2[j]; | 373 | const UInt32 original = ind2[j]; |
323 | UInt32 sp = original & kIndexMask; | 374 | size_t sp = original & kIndexMask; |
324 | if (sp < NumSortedBytes) sp += BlockSize; sp -= NumSortedBytes; | 375 | if (sp < NumSortedBytes) |
325 | ind2[j] = sp | (original & ~kIndexMask); | 376 | sp += BlockSize; |
326 | Groups[sp] = group; | 377 | sp -= NumSortedBytes; |
378 | ind2[j] = (UInt32)sp | (original & ~kIndexMask); | ||
379 | Groups[sp] = (UInt32)group; | ||
327 | if (--subGroupSize == 0) | 380 | if (--subGroupSize == 0) |
328 | break; | 381 | break; |
329 | j++; | 382 | j++; |
330 | thereAreGroups = 1; | 383 | thereAreGroups = 1; |
331 | } | 384 | } |
332 | #else | 385 | #else |
333 | UInt32 *Flags = Groups + BlockSize; | 386 | UInt32 *Flags = Groups + BlockSize; |
334 | for (;;) | 387 | for (;;) |
335 | { | 388 | { |
336 | UInt32 sp = ind2[j]; if (sp < NumSortedBytes) sp += BlockSize; sp -= NumSortedBytes; | 389 | size_t sp = ind2[j]; |
337 | ind2[j] = sp; | 390 | if (sp < NumSortedBytes) |
338 | Groups[sp] = group; | 391 | sp += BlockSize; |
392 | sp -= NumSortedBytes; | ||
393 | ind2[j] = (UInt32)sp; | ||
394 | Groups[sp] = (UInt32)group; | ||
339 | if ((Flags[(groupOffset + j) >> kNumFlagsBits] & (1 << ((groupOffset + j) & kFlagsMask))) == 0) | 395 | if ((Flags[(groupOffset + j) >> kNumFlagsBits] & (1 << ((groupOffset + j) & kFlagsMask))) == 0) |
340 | break; | 396 | break; |
341 | j++; | 397 | j++; |
342 | thereAreGroups = 1; | 398 | thereAreGroups = 1; |
343 | } | 399 | } |
344 | #endif | 400 | #endif |
345 | } | 401 | } |
346 | return thereAreGroups; | 402 | return thereAreGroups; |
347 | } | 403 | } |
348 | } | 404 | } |
349 | #endif | 405 | #endif // BLOCK_SORT_USE_HEAP_SORT |
350 | } | 406 | } |
351 | 407 | ||
408 | |||
352 | /* conditions: blockSize > 0 */ | 409 | /* conditions: blockSize > 0 */ |
353 | UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize) | 410 | UInt32 BlockSort(UInt32 *Indices, const Byte *data, size_t blockSize) |
354 | { | 411 | { |
355 | UInt32 *counters = Indices + blockSize; | 412 | UInt32 *counters = Indices + blockSize; |
356 | UInt32 i; | 413 | size_t i; |
357 | UInt32 *Groups; | 414 | UInt32 *Groups; |
358 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 415 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
359 | UInt32 *Flags; | 416 | UInt32 *Flags; |
360 | #endif | 417 | #endif |
361 | 418 | ||
362 | /* Radix-Sort for 2 bytes */ | 419 | /* Radix-Sort for 2 bytes */ |
420 | // { UInt32 yyy; for (yyy = 0; yyy < 100; yyy++) { | ||
363 | for (i = 0; i < kNumHashValues; i++) | 421 | for (i = 0; i < kNumHashValues; i++) |
364 | counters[i] = 0; | 422 | counters[i] = 0; |
365 | for (i = 0; i < blockSize - 1; i++) | 423 | { |
366 | counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]++; | 424 | const Byte *data2 = data; |
367 | counters[((UInt32)data[i] << 8) | data[0]]++; | 425 | size_t a = data[(size_t)blockSize - 1]; |
426 | const Byte *data_lim = data + blockSize; | ||
427 | if (blockSize >= 4) | ||
428 | { | ||
429 | data_lim -= 3; | ||
430 | do | ||
431 | { | ||
432 | size_t b; | ||
433 | b = data2[0]; counters[(a << 8) | b]++; | ||
434 | a = data2[1]; counters[(b << 8) | a]++; | ||
435 | b = data2[2]; counters[(a << 8) | b]++; | ||
436 | a = data2[3]; counters[(b << 8) | a]++; | ||
437 | data2 += 4; | ||
438 | } | ||
439 | while (data2 < data_lim); | ||
440 | data_lim += 3; | ||
441 | } | ||
442 | while (data2 != data_lim) | ||
443 | { | ||
444 | size_t b = *data2++; | ||
445 | counters[(a << 8) | b]++; | ||
446 | a = b; | ||
447 | } | ||
448 | } | ||
449 | // }} | ||
368 | 450 | ||
369 | Groups = counters + BS_TEMP_SIZE; | 451 | Groups = counters + BS_TEMP_SIZE; |
370 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 452 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
371 | Flags = Groups + blockSize; | 453 | Flags = Groups + blockSize; |
372 | { | 454 | { |
373 | UInt32 numWords = (blockSize + kFlagsMask) >> kNumFlagsBits; | 455 | const size_t numWords = (blockSize + kFlagsMask) >> kNumFlagsBits; |
374 | for (i = 0; i < numWords; i++) | 456 | for (i = 0; i < numWords; i++) |
375 | Flags[i] = kAllFlags; | 457 | Flags[i] = kAllFlags; |
376 | } | 458 | } |
377 | #endif | 459 | #endif |
378 | 460 | ||
379 | { | 461 | { |
380 | UInt32 sum = 0; | 462 | UInt32 sum = 0; |
381 | for (i = 0; i < kNumHashValues; i++) | 463 | for (i = 0; i < kNumHashValues; i++) |
382 | { | 464 | { |
383 | UInt32 groupSize = counters[i]; | 465 | const UInt32 groupSize = counters[i]; |
384 | if (groupSize > 0) | 466 | counters[i] = sum; |
467 | sum += groupSize; | ||
468 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | ||
469 | if (groupSize) | ||
385 | { | 470 | { |
386 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 471 | const UInt32 t = sum - 1; |
387 | UInt32 t = sum + groupSize - 1; | 472 | Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask)); |
388 | Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); | ||
389 | #endif | ||
390 | sum += groupSize; | ||
391 | } | 473 | } |
392 | counters[i] = sum - groupSize; | 474 | #endif |
393 | } | 475 | } |
476 | } | ||
394 | 477 | ||
395 | for (i = 0; i < blockSize - 1; i++) | 478 | for (i = 0; i < blockSize - 1; i++) |
396 | Groups[i] = counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]; | 479 | Groups[i] = counters[((unsigned)data[i] << 8) | data[(size_t)i + 1]]; |
397 | Groups[i] = counters[((UInt32)data[i] << 8) | data[0]]; | 480 | Groups[i] = counters[((unsigned)data[i] << 8) | data[0]]; |
481 | |||
482 | { | ||
483 | #define SET_Indices(a, b, i) \ | ||
484 | { UInt32 c; \ | ||
485 | a = (a << 8) | (b); \ | ||
486 | c = counters[a]; \ | ||
487 | Indices[c] = (UInt32)i++; \ | ||
488 | counters[a] = c + 1; \ | ||
489 | } | ||
398 | 490 | ||
399 | for (i = 0; i < blockSize - 1; i++) | 491 | size_t a = data[0]; |
400 | Indices[counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]++] = i; | 492 | const Byte *data_ptr = data + 1; |
401 | Indices[counters[((UInt32)data[i] << 8) | data[0]]++] = i; | 493 | i = 0; |
402 | 494 | if (blockSize >= 3) | |
403 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 495 | { |
496 | blockSize -= 2; | ||
497 | do | ||
498 | { | ||
499 | size_t b; | ||
500 | b = data_ptr[0]; SET_Indices(a, b, i) | ||
501 | a = data_ptr[1]; SET_Indices(b, a, i) | ||
502 | data_ptr += 2; | ||
503 | } | ||
504 | while (i < blockSize); | ||
505 | blockSize += 2; | ||
506 | } | ||
507 | if (i < blockSize - 1) | ||
404 | { | 508 | { |
509 | SET_Indices(a, data[(size_t)i + 1], i) | ||
510 | a = (Byte)a; | ||
511 | } | ||
512 | SET_Indices(a, data[0], i) | ||
513 | } | ||
514 | |||
515 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | ||
516 | { | ||
405 | UInt32 prev = 0; | 517 | UInt32 prev = 0; |
406 | for (i = 0; i < kNumHashValues; i++) | 518 | for (i = 0; i < kNumHashValues; i++) |
407 | { | 519 | { |
408 | UInt32 prevGroupSize = counters[i] - prev; | 520 | const UInt32 prevGroupSize = counters[i] - prev; |
409 | if (prevGroupSize == 0) | 521 | if (prevGroupSize == 0) |
410 | continue; | 522 | continue; |
411 | SetGroupSize(Indices + prev, prevGroupSize); | 523 | SetGroupSize(Indices + prev, prevGroupSize); |
412 | prev = counters[i]; | 524 | prev = counters[i]; |
413 | } | 525 | } |
414 | } | ||
415 | #endif | ||
416 | } | 526 | } |
527 | #endif | ||
417 | 528 | ||
418 | { | 529 | { |
419 | int NumRefBits; | 530 | unsigned NumRefBits; |
420 | UInt32 NumSortedBytes; | 531 | size_t NumSortedBytes; |
421 | for (NumRefBits = 0; ((blockSize - 1) >> NumRefBits) != 0; NumRefBits++); | 532 | for (NumRefBits = 0; ((blockSize - 1) >> NumRefBits) != 0; NumRefBits++) |
533 | {} | ||
422 | NumRefBits = 32 - NumRefBits; | 534 | NumRefBits = 32 - NumRefBits; |
423 | if (NumRefBits > kNumRefBitsMax) | 535 | if (NumRefBits > kNumRefBitsMax) |
424 | NumRefBits = kNumRefBitsMax; | 536 | NumRefBits = kNumRefBitsMax; |
425 | 537 | ||
426 | for (NumSortedBytes = kNumHashBytes; ; NumSortedBytes <<= 1) | 538 | for (NumSortedBytes = kNumHashBytes; ; NumSortedBytes <<= 1) |
427 | { | 539 | { |
428 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 540 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
429 | UInt32 finishedGroupSize = 0; | 541 | size_t finishedGroupSize = 0; |
430 | #endif | 542 | #endif |
431 | UInt32 newLimit = 0; | 543 | size_t newLimit = 0; |
432 | for (i = 0; i < blockSize;) | 544 | for (i = 0; i < blockSize;) |
433 | { | 545 | { |
434 | UInt32 groupSize; | 546 | size_t groupSize; |
435 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 547 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
436 | 548 | ||
437 | if ((Flags[i >> kNumFlagsBits] & (1 << (i & kFlagsMask))) == 0) | 549 | if ((Flags[i >> kNumFlagsBits] & (1 << (i & kFlagsMask))) == 0) |
438 | { | 550 | { |
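Editor's aside: the unrolled counting loop and the SET_Indices scatter in the hunk above implement a counting sort over cyclic 2-byte hashes: count every (data[i], data[i+1 mod n]) pair, convert the counts to exclusive prefix sums, then scatter each position into Indices. A standalone simplification, without the 4x unrolling or the wrap-around special-casing:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

typedef unsigned int UInt32;

static void radix2(UInt32 *Indices, const unsigned char *data, size_t n)
{
  static UInt32 counters[1 << 16];
  UInt32 sum = 0;
  size_t i;
  memset(counters, 0, sizeof(counters));
  for (i = 0; i < n; i++)                       // count cyclic byte pairs
    counters[((unsigned)data[i] << 8) | data[(i + 1) % n]]++;
  for (i = 0; i < (1 << 16); i++)               // exclusive prefix sum
  {
    const UInt32 c = counters[i];
    counters[i] = sum;
    sum += c;
  }
  for (i = 0; i < n; i++)                       // stable scatter, as SET_Indices does
    Indices[counters[((unsigned)data[i] << 8) | data[(i + 1) % n]]++] = (UInt32)i;
}

int main(void)
{
  const unsigned char txt[] = "banana";
  UInt32 idx[6];
  radix2(idx, txt, 6);
  for (int i = 0; i < 6; i++)
    printf("%u ", idx[i]);  // prints: 5 1 3 0 2 4 (ordered by cyclic 2-byte prefix)
  printf("\n");
  return 0;
}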
@@ -441,56 +553,56 @@ UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize) | |||
441 | } | 553 | } |
442 | for (groupSize = 1; | 554 | for (groupSize = 1; |
443 | (Flags[(i + groupSize) >> kNumFlagsBits] & (1 << ((i + groupSize) & kFlagsMask))) != 0; | 555 | (Flags[(i + groupSize) >> kNumFlagsBits] & (1 << ((i + groupSize) & kFlagsMask))) != 0; |
444 | groupSize++); | 556 | groupSize++) |
445 | 557 | {} | |
446 | groupSize++; | 558 | groupSize++; |
447 | 559 | ||
448 | #else | 560 | #else |
449 | 561 | ||
450 | groupSize = ((Indices[i] & ~0xC0000000) >> kNumBitsMax); | 562 | groupSize = (Indices[i] & ~0xC0000000) >> kNumBitsMax; |
451 | { | 563 | { |
452 | BoolInt finishedGroup = ((Indices[i] & 0x80000000) == 0); | 564 | const BoolInt finishedGroup = ((Indices[i] & 0x80000000) == 0); |
453 | if ((Indices[i] & 0x40000000) != 0) | 565 | if (Indices[i] & 0x40000000) |
454 | { | ||
455 | groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits); | ||
456 | Indices[(size_t)i + 1] &= kIndexMask; | ||
457 | } | ||
458 | Indices[i] &= kIndexMask; | ||
459 | groupSize++; | ||
460 | if (finishedGroup || groupSize == 1) | ||
461 | { | ||
462 | Indices[i - finishedGroupSize] &= kIndexMask; | ||
463 | if (finishedGroupSize > 1) | ||
464 | Indices[(size_t)(i - finishedGroupSize) + 1] &= kIndexMask; | ||
465 | { | 566 | { |
466 | UInt32 newGroupSize = groupSize + finishedGroupSize; | 567 | groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits); |
467 | SetFinishedGroupSize(Indices + i - finishedGroupSize, newGroupSize) | 568 | Indices[(size_t)i + 1] &= kIndexMask; |
468 | finishedGroupSize = newGroupSize; | ||
469 | } | 569 | } |
470 | i += groupSize; | 570 | Indices[i] &= kIndexMask; |
471 | continue; | 571 | groupSize++; |
472 | } | 572 | if (finishedGroup || groupSize == 1) |
473 | finishedGroupSize = 0; | 573 | { |
574 | Indices[i - finishedGroupSize] &= kIndexMask; | ||
575 | if (finishedGroupSize > 1) | ||
576 | Indices[(size_t)(i - finishedGroupSize) + 1] &= kIndexMask; | ||
577 | { | ||
578 | const size_t newGroupSize = groupSize + finishedGroupSize; | ||
579 | SetFinishedGroupSize(Indices + i - finishedGroupSize, newGroupSize) | ||
580 | finishedGroupSize = newGroupSize; | ||
581 | } | ||
582 | i += groupSize; | ||
583 | continue; | ||
584 | } | ||
585 | finishedGroupSize = 0; | ||
474 | } | 586 | } |
475 | 587 | ||
476 | #endif | 588 | #endif |
477 | 589 | ||
478 | if (NumSortedBytes >= blockSize) | 590 | if (NumSortedBytes >= blockSize) |
479 | { | 591 | { |
480 | UInt32 j; | 592 | size_t j; |
481 | for (j = 0; j < groupSize; j++) | 593 | for (j = 0; j < groupSize; j++) |
482 | { | 594 | { |
483 | UInt32 t = (i + j); | 595 | size_t t = i + j; |
484 | /* Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); */ | 596 | /* Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); */ |
485 | Groups[Indices[t]] = t; | 597 | Groups[Indices[t]] = (UInt32)t; |
486 | } | 598 | } |
487 | } | 599 | } |
488 | else | 600 | else |
489 | if (SortGroup(blockSize, NumSortedBytes, i, groupSize, NumRefBits, Indices | 601 | if (SortGroup(blockSize, NumSortedBytes, i, groupSize, NumRefBits, Indices |
490 | #ifndef BLOCK_SORT_USE_HEAP_SORT | 602 | #ifndef BLOCK_SORT_USE_HEAP_SORT |
491 | , 0, blockSize | 603 | , 0, blockSize |
492 | #endif | 604 | #endif |
493 | ) != 0) | 605 | )) |
494 | newLimit = i + groupSize; | 606 | newLimit = i + groupSize; |
495 | i += groupSize; | 607 | i += groupSize; |
496 | } | 608 | } |
@@ -498,19 +610,19 @@ UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize) | |||
498 | break; | 610 | break; |
499 | } | 611 | } |
500 | } | 612 | } |
501 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 613 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
502 | for (i = 0; i < blockSize;) | 614 | for (i = 0; i < blockSize;) |
503 | { | 615 | { |
504 | UInt32 groupSize = ((Indices[i] & ~0xC0000000) >> kNumBitsMax); | 616 | size_t groupSize = (Indices[i] & ~0xC0000000) >> kNumBitsMax; |
505 | if ((Indices[i] & 0x40000000) != 0) | 617 | if (Indices[i] & 0x40000000) |
506 | { | 618 | { |
507 | groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits); | 619 | groupSize += (Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits; |
508 | Indices[(size_t)i + 1] &= kIndexMask; | 620 | Indices[(size_t)i + 1] &= kIndexMask; |
509 | } | 621 | } |
510 | Indices[i] &= kIndexMask; | 622 | Indices[i] &= kIndexMask; |
511 | groupSize++; | 623 | groupSize++; |
512 | i += groupSize; | 624 | i += groupSize; |
513 | } | 625 | } |
514 | #endif | 626 | #endif |
515 | return Groups[0]; | 627 | return Groups[0]; |
516 | } | 628 | } |
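For context on the loop above: when BLOCK_SORT_EXTERNAL_FLAGS is not defined, BlockSort packs a group-size field and two flag bits into the high bits of each Indices[] entry (0x80000000 marks a group that is still unfinished, 0x40000000 marks a size that spills into the following entry), keeping the sort index in the low kNumBitsMax bits. A minimal decoding sketch; the constant values here are assumptions standing in for the definitions inside BwtSort.c:

    #include <stdint.h>
    #include <stddef.h>

    /* assumed placeholders for the constants defined in BwtSort.c */
    #define kNumBitsMax     20u
    #define kIndexMask      ((1u << kNumBitsMax) - 1)   /* low bits: sort index */
    #define kNumExtra0Bits  (32u - 2 - kNumBitsMax)     /* spill-field shift */

    /* decode the group size stored at Indices[i], mirroring the hunk above */
    static uint32_t GetGroupSize(const uint32_t *Indices, size_t i)
    {
      uint32_t groupSize = (Indices[i] & ~0xC0000000u) >> kNumBitsMax;
      if (Indices[i] & 0x40000000u)          /* size overflowed into next entry */
        groupSize += (Indices[i + 1] >> kNumBitsMax) << kNumExtra0Bits;
      return groupSize + 1;                  /* field stores (size - 1) */
    }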
diff --git a/C/BwtSort.h b/C/BwtSort.h index a34b243..1bd2316 100644 --- a/C/BwtSort.h +++ b/C/BwtSort.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* BwtSort.h -- BWT block sorting | 1 | /* BwtSort.h -- BWT block sorting |
2 | 2023-03-03 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_BWT_SORT_H | 4 | #ifndef ZIP7_INC_BWT_SORT_H |
5 | #define ZIP7_INC_BWT_SORT_H | 5 | #define ZIP7_INC_BWT_SORT_H |
@@ -10,16 +10,17 @@ EXTERN_C_BEGIN | |||
10 | 10 | ||
11 | /* use BLOCK_SORT_EXTERNAL_FLAGS if blockSize can be > 1M */ | 11 | /* use BLOCK_SORT_EXTERNAL_FLAGS if blockSize can be > 1M */ |
12 | /* #define BLOCK_SORT_EXTERNAL_FLAGS */ | 12 | /* #define BLOCK_SORT_EXTERNAL_FLAGS */ |
13 | // #define BLOCK_SORT_EXTERNAL_FLAGS | ||
13 | 14 | ||
14 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 15 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
15 | #define BLOCK_SORT_EXTERNAL_SIZE(blockSize) ((((blockSize) + 31) >> 5)) | 16 | #define BLOCK_SORT_EXTERNAL_SIZE(blockSize) (((blockSize) + 31) >> 5) |
16 | #else | 17 | #else |
17 | #define BLOCK_SORT_EXTERNAL_SIZE(blockSize) 0 | 18 | #define BLOCK_SORT_EXTERNAL_SIZE(blockSize) 0 |
18 | #endif | 19 | #endif |
19 | 20 | ||
20 | #define BLOCK_SORT_BUF_SIZE(blockSize) ((blockSize) * 2 + BLOCK_SORT_EXTERNAL_SIZE(blockSize) + (1 << 16)) | 21 | #define BLOCK_SORT_BUF_SIZE(blockSize) ((blockSize) * 2 + BLOCK_SORT_EXTERNAL_SIZE(blockSize) + (1 << 16)) |
21 | 22 | ||
22 | UInt32 BlockSort(UInt32 *indices, const Byte *data, UInt32 blockSize); | 23 | UInt32 BlockSort(UInt32 *indices, const Byte *data, size_t blockSize); |
23 | 24 | ||
24 | EXTERN_C_END | 25 | EXTERN_C_END |
25 | 26 | ||
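For reference, a hedged allocation sketch built on the contract above; it assumes the 7z types from 7zTypes.h and that BLOCK_SORT_BUF_SIZE counts UInt32 entries, as the (blockSize * 2 + 64K) arithmetic suggests:

    #include <stdlib.h>
    #include "7zTypes.h"
    #include "BwtSort.h"

    UInt32 Bwt_SortBlock(const Byte *block, size_t blockSize)
    {
      UInt32 origin = 0;
      UInt32 *buf = (UInt32 *)malloc(BLOCK_SORT_BUF_SIZE(blockSize) * sizeof(UInt32));
      if (buf)
      {
        origin = BlockSort(buf, block, blockSize);  /* Groups[0] : BWT origin index */
        free(buf);
      }
      return origin;
    }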
diff --git a/C/Compiler.h b/C/Compiler.h index 2a9c2b7..b266b27 100644 --- a/C/Compiler.h +++ b/C/Compiler.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Compiler.h : Compiler specific defines and pragmas | 1 | /* Compiler.h : Compiler specific defines and pragmas |
2 | 2024-01-22 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_COMPILER_H | 4 | #ifndef ZIP7_INC_COMPILER_H |
5 | #define ZIP7_INC_COMPILER_H | 5 | #define ZIP7_INC_COMPILER_H |
@@ -183,6 +183,16 @@ typedef void (*Z7_void_Function)(void); | |||
183 | #define Z7_ATTRIB_NO_VECTORIZE | 183 | #define Z7_ATTRIB_NO_VECTORIZE |
184 | #endif | 184 | #endif |
185 | 185 | ||
186 | #if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920) | ||
187 | #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE _Pragma("optimize ( \"s\", on )") | ||
188 | #define Z7_PRAGMA_OPTIMIZE_DEFAULT _Pragma("optimize ( \"\", on )") | ||
189 | #else | ||
190 | #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE | ||
191 | #define Z7_PRAGMA_OPTIMIZE_DEFAULT | ||
192 | #endif | ||
193 | |||
194 | |||
195 | |||
186 | #if defined(MY_CPU_X86_OR_AMD64) && ( \ | 196 | #if defined(MY_CPU_X86_OR_AMD64) && ( \ |
187 | defined(__clang__) && (__clang_major__ >= 4) \ | 197 | defined(__clang__) && (__clang_major__ >= 4) \ |
188 | || defined(__GNUC__) && (__GNUC__ >= 5)) | 198 | || defined(__GNUC__) && (__GNUC__ >= 5)) |
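The new pragma pair brackets code that should be compiled for size on MSVC (Z7_MSC_VER_ORIGINAL >= 1920 is VS2019); on other compilers both macros expand to nothing. A usage sketch, placed at file scope as MSVC's optimize pragma requires:

    Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE      /* _Pragma("optimize(\"s\", on)") on MSVC */
    static void Cold_ReportError(int code)
    {
      /* rarely executed path: trade speed for smaller code */
      (void)code;
    }
    Z7_PRAGMA_OPTIMIZE_DEFAULT            /* restore the command-line settings */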
diff --git a/C/CpuArch.c b/C/CpuArch.c index e792f39..6e02551 100644 --- a/C/CpuArch.c +++ b/C/CpuArch.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* CpuArch.c -- CPU specific code | 1 | /* CpuArch.c -- CPU specific code |
2 | 2024-07-04 : Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -17,7 +17,7 @@ | |||
17 | /* | 17 | /* |
18 | cpuid instruction supports (subFunction) parameter in ECX, | 18 | cpuid instruction supports (subFunction) parameter in ECX, |
19 | that is used only with some specific (function) parameter values. | 19 | that is used only with some specific (function) parameter values. |
20 | But we always use only (subFunction==0). | 20 | Most functions use only (subFunction==0).
21 | */ | 21 | */ |
22 | /* | 22 | /* |
23 | __cpuid(): MSVC and GCC/CLANG use same function/macro name | 23 | __cpuid(): MSVC and GCC/CLANG use same function/macro name |
@@ -49,43 +49,49 @@ | |||
49 | #if defined(MY_CPU_AMD64) && defined(__PIC__) \ | 49 | #if defined(MY_CPU_AMD64) && defined(__PIC__) \ |
50 | && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) | 50 | && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) |
51 | 51 | ||
52 | #define x86_cpuid_MACRO(p, func) { \ | 52 | /* "=&r" selects free register. It can select even rbx, if that register is free. |
53 | "=&D" for (RDI) also works, but the code can be larger with "=&D" | ||
54 | "2"(subFun) : 2 is (zero-based) index in the output constraint list "=c" (ECX). */ | ||
55 | |||
56 | #define x86_cpuid_MACRO_2(p, func, subFunc) { \ | ||
53 | __asm__ __volatile__ ( \ | 57 | __asm__ __volatile__ ( \ |
54 | ASM_LN "mov %%rbx, %q1" \ | 58 | ASM_LN "mov %%rbx, %q1" \ |
55 | ASM_LN "cpuid" \ | 59 | ASM_LN "cpuid" \ |
56 | ASM_LN "xchg %%rbx, %q1" \ | 60 | ASM_LN "xchg %%rbx, %q1" \ |
57 | : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } | 61 | : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } |
58 | |||
59 | /* "=&r" selects free register. It can select even rbx, if that register is free. | ||
60 | "=&D" for (RDI) also works, but the code can be larger with "=&D" | ||
61 | "2"(0) means (subFunction = 0), | ||
62 | 2 is (zero-based) index in the output constraint list "=c" (ECX). */ | ||
63 | 62 | ||
64 | #elif defined(MY_CPU_X86) && defined(__PIC__) \ | 63 | #elif defined(MY_CPU_X86) && defined(__PIC__) \ |
65 | && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) | 64 | && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) |
66 | 65 | ||
67 | #define x86_cpuid_MACRO(p, func) { \ | 66 | #define x86_cpuid_MACRO_2(p, func, subFunc) { \ |
68 | __asm__ __volatile__ ( \ | 67 | __asm__ __volatile__ ( \ |
69 | ASM_LN "mov %%ebx, %k1" \ | 68 | ASM_LN "mov %%ebx, %k1" \ |
70 | ASM_LN "cpuid" \ | 69 | ASM_LN "cpuid" \ |
71 | ASM_LN "xchg %%ebx, %k1" \ | 70 | ASM_LN "xchg %%ebx, %k1" \ |
72 | : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } | 71 | : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } |
73 | 72 | ||
74 | #else | 73 | #else |
75 | 74 | ||
76 | #define x86_cpuid_MACRO(p, func) { \ | 75 | #define x86_cpuid_MACRO_2(p, func, subFunc) { \ |
77 | __asm__ __volatile__ ( \ | 76 | __asm__ __volatile__ ( \ |
78 | ASM_LN "cpuid" \ | 77 | ASM_LN "cpuid" \ |
79 | : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } | 78 | : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } |
80 | 79 | ||
81 | #endif | 80 | #endif |
82 | 81 | ||
82 | #define x86_cpuid_MACRO(p, func) x86_cpuid_MACRO_2(p, func, 0) | ||
83 | 83 | ||
84 | void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) | 84 | void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) |
85 | { | 85 | { |
86 | x86_cpuid_MACRO(p, func) | 86 | x86_cpuid_MACRO(p, func) |
87 | } | 87 | } |
88 | 88 | ||
89 | static | ||
90 | void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) | ||
91 | { | ||
92 | x86_cpuid_MACRO_2(p, func, subFunc) | ||
93 | } | ||
94 | |||
89 | 95 | ||
90 | Z7_NO_INLINE | 96 | Z7_NO_INLINE |
91 | UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) | 97 | UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) |
@@ -205,11 +211,39 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) | |||
205 | __asm ret 0 | 211 | __asm ret 0 |
206 | } | 212 | } |
207 | 213 | ||
214 | static | ||
215 | void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) | ||
216 | { | ||
217 | UNUSED_VAR(p) | ||
218 | UNUSED_VAR(func) | ||
219 | UNUSED_VAR(subFunc) | ||
220 | __asm push ebx | ||
221 | __asm push edi | ||
222 | __asm mov edi, ecx // p | ||
223 | __asm mov eax, edx // func | ||
224 | __asm mov ecx, [esp + 12] // subFunc | ||
225 | __asm cpuid | ||
226 | __asm mov [edi ], eax | ||
227 | __asm mov [edi + 4], ebx | ||
228 | __asm mov [edi + 8], ecx | ||
229 | __asm mov [edi + 12], edx | ||
230 | __asm pop edi | ||
231 | __asm pop ebx | ||
232 | __asm ret 4 | ||
233 | } | ||
234 | |||
208 | #else // MY_CPU_AMD64 | 235 | #else // MY_CPU_AMD64 |
209 | 236 | ||
210 | #if _MSC_VER >= 1600 | 237 | #if _MSC_VER >= 1600 |
211 | #include <intrin.h> | 238 | #include <intrin.h> |
212 | #define MY_cpuidex __cpuidex | 239 | #define MY_cpuidex __cpuidex |
240 | |||
241 | static | ||
242 | void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) | ||
243 | { | ||
244 | __cpuidex((int *)p, func, subFunc); | ||
245 | } | ||
246 | |||
213 | #else | 247 | #else |
214 | /* | 248 | /* |
215 | __cpuid (func == (0 or 7)) requires subfunction number in ECX. | 249 | __cpuid (func == (0 or 7)) requires subfunction number in ECX. |
@@ -219,7 +253,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) | |||
219 | We still can use __cpuid for low (func) values that don't require ECX, | 253 | We still can use __cpuid for low (func) values that don't require ECX, |
220 | but __cpuid() in old MSVC will be incorrect for some func values: (func == 7). | 254 | but __cpuid() in old MSVC will be incorrect for some func values: (func == 7). |
221 | So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction, | 255 | So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction, |
222 | where ECX value is first parameter for FASTCALL / NO_INLINE func, | 256 | where ECX value is first parameter for FASTCALL / NO_INLINE func. |
223 | So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and | 257 | So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and |
224 | old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value. | 258 | old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value. |
225 | 259 | ||
@@ -233,6 +267,11 @@ Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int | |||
233 | } | 267 | } |
234 | #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info) | 268 | #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info) |
235 | #pragma message("======== MY_cpuidex_HACK WAS USED ========") | 269 | #pragma message("======== MY_cpuidex_HACK WAS USED ========") |
270 | static | ||
271 | void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) | ||
272 | { | ||
273 | MY_cpuidex_HACK(subFunc, func, (Int32 *)p); | ||
274 | } | ||
236 | #endif // _MSC_VER >= 1600 | 275 | #endif // _MSC_VER >= 1600 |
237 | 276 | ||
238 | #if !defined(MY_CPU_AMD64) | 277 | #if !defined(MY_CPU_AMD64) |
@@ -445,6 +484,23 @@ BoolInt CPU_IsSupported_SHA(void) | |||
445 | } | 484 | } |
446 | } | 485 | } |
447 | 486 | ||
487 | |||
488 | BoolInt CPU_IsSupported_SHA512(void) | ||
489 | { | ||
490 | if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here | ||
491 | |||
492 | if (z7_x86_cpuid_GetMaxFunc() < 7) | ||
493 | return False; | ||
494 | { | ||
495 | UInt32 d[4]; | ||
496 | z7_x86_cpuid_subFunc(d, 7, 0); | ||
497 | if (d[0] < 1) // d[0] - is max supported subleaf value | ||
498 | return False; | ||
499 | z7_x86_cpuid_subFunc(d, 7, 1); | ||
500 | return (BoolInt)(d[0]) & 1; | ||
501 | } | ||
502 | } | ||
503 | |||
448 | /* | 504 | /* |
449 | MSVC: _xgetbv() intrinsic is available since VS2010SP1. | 505 | MSVC: _xgetbv() intrinsic is available since VS2010SP1. |
450 | MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in | 506 | MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in |
@@ -776,6 +832,18 @@ BoolInt CPU_IsSupported_NEON(void) | |||
776 | return z7_sysctlbyname_Get_BoolInt("hw.optional.neon"); | 832 | return z7_sysctlbyname_Get_BoolInt("hw.optional.neon"); |
777 | } | 833 | } |
778 | 834 | ||
835 | BoolInt CPU_IsSupported_SHA512(void) | ||
836 | { | ||
837 | return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512"); | ||
838 | } | ||
839 | |||
840 | /* | ||
841 | BoolInt CPU_IsSupported_SHA3(void) | ||
842 | { | ||
843 | return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3"); | ||
844 | } | ||
845 | */ | ||
846 | |||
779 | #ifdef MY_CPU_ARM64 | 847 | #ifdef MY_CPU_ARM64 |
780 | #define APPLE_CRYPTO_SUPPORT_VAL 1 | 848 | #define APPLE_CRYPTO_SUPPORT_VAL 1 |
781 | #else | 849 | #else |
@@ -860,6 +928,19 @@ MY_HWCAP_CHECK_FUNC (CRC32) | |||
860 | MY_HWCAP_CHECK_FUNC (SHA1) | 928 | MY_HWCAP_CHECK_FUNC (SHA1) |
861 | MY_HWCAP_CHECK_FUNC (SHA2) | 929 | MY_HWCAP_CHECK_FUNC (SHA2) |
862 | MY_HWCAP_CHECK_FUNC (AES) | 930 | MY_HWCAP_CHECK_FUNC (AES) |
931 | #ifdef MY_CPU_ARM64 | ||
932 | // <hwcap.h> has defined HWCAP_SHA512 and HWCAP_SHA3 since 2017. | ||
933 | // we define them here if they are not defined | ||
934 | #ifndef HWCAP_SHA3 | ||
935 | // #define HWCAP_SHA3 (1 << 17) | ||
936 | #endif | ||
937 | #ifndef HWCAP_SHA512 | ||
938 | // #pragma message("=== HWCAP_SHA512 define === ") | ||
939 | #define HWCAP_SHA512 (1 << 21) | ||
940 | #endif | ||
941 | MY_HWCAP_CHECK_FUNC (SHA512) | ||
942 | // MY_HWCAP_CHECK_FUNC (SHA3) | ||
943 | #endif | ||
863 | 944 | ||
864 | #endif // __APPLE__ | 945 | #endif // __APPLE__ |
865 | #endif // _WIN32 | 946 | #endif // _WIN32 |
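The x86 path of the new CPU_IsSupported_SHA512 reads cpuid leaf 7: subleaf 0 EAX gives the maximum supported subleaf, and subleaf 1 EAX bit 0 reports the SHA512 extension. A standalone restatement assuming GCC/Clang's <cpuid.h> (the library version additionally gates on CPU_IsSupported_AVX2, as noted in the hunk):

    #include <cpuid.h>

    static int Has_SHA512_x86(void)
    {
      unsigned a, b, c, d;
      if (!__get_cpuid_count(7, 0, &a, &b, &c, &d) || a < 1)
        return 0;                     /* leaf 7 subleaf 1 not available */
      __get_cpuid_count(7, 1, &a, &b, &c, &d);
      return (int)(a & 1);            /* EAX bit 0 : SHA512 instructions */
    }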
diff --git a/C/CpuArch.h b/C/CpuArch.h index 683cfaa..1690a5b 100644 --- a/C/CpuArch.h +++ b/C/CpuArch.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* CpuArch.h -- CPU specific code | 1 | /* CpuArch.h -- CPU specific code |
2 | 2024-06-17 : Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_CPU_ARCH_H | 4 | #ifndef ZIP7_INC_CPU_ARCH_H |
5 | #define ZIP7_INC_CPU_ARCH_H | 5 | #define ZIP7_INC_CPU_ARCH_H |
@@ -47,6 +47,12 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
47 | #define MY_CPU_SIZEOF_POINTER 4 | 47 | #define MY_CPU_SIZEOF_POINTER 4 |
48 | #endif | 48 | #endif |
49 | 49 | ||
50 | #if defined(__SSE2__) \ | ||
51 | || defined(MY_CPU_AMD64) \ | ||
52 | || defined(_M_IX86_FP) && (_M_IX86_FP >= 2) | ||
53 | #define MY_CPU_SSE2 | ||
54 | #endif | ||
55 | |||
50 | 56 | ||
51 | #if defined(_M_ARM64) \ | 57 | #if defined(_M_ARM64) \ |
52 | || defined(_M_ARM64EC) \ | 58 | || defined(_M_ARM64EC) \ |
@@ -509,11 +515,19 @@ problem-4 : performance: | |||
509 | 515 | ||
510 | #if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) | 516 | #if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) |
511 | 517 | ||
518 | #if 0 | ||
519 | // Z7_BSWAP16 can be slow for x86-msvc | ||
520 | #define GetBe16_to32(p) (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p))) | ||
521 | #else | ||
522 | #define GetBe16_to32(p) (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16) | ||
523 | #endif | ||
524 | |||
512 | #define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p)) | 525 | #define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p)) |
513 | #define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); } | 526 | #define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); } |
514 | 527 | ||
515 | #if defined(MY_CPU_LE_UNALIGN_64) | 528 | #if defined(MY_CPU_LE_UNALIGN_64) |
516 | #define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p)) | 529 | #define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p)) |
530 | #define SetBe64(p, v) { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); } | ||
517 | #endif | 531 | #endif |
518 | 532 | ||
519 | #else | 533 | #else |
@@ -536,21 +550,39 @@ problem-4 : performace: | |||
536 | #define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4)) | 550 | #define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4)) |
537 | #endif | 551 | #endif |
538 | 552 | ||
553 | #ifndef SetBe64 | ||
554 | #define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \ | ||
555 | _ppp_[0] = (Byte)(_vvv_ >> 56); \ | ||
556 | _ppp_[1] = (Byte)(_vvv_ >> 48); \ | ||
557 | _ppp_[2] = (Byte)(_vvv_ >> 40); \ | ||
558 | _ppp_[3] = (Byte)(_vvv_ >> 32); \ | ||
559 | _ppp_[4] = (Byte)(_vvv_ >> 24); \ | ||
560 | _ppp_[5] = (Byte)(_vvv_ >> 16); \ | ||
561 | _ppp_[6] = (Byte)(_vvv_ >> 8); \ | ||
562 | _ppp_[7] = (Byte)_vvv_; } | ||
563 | #endif | ||
564 | |||
539 | #ifndef GetBe16 | 565 | #ifndef GetBe16 |
566 | #ifdef GetBe16_to32 | ||
567 | #define GetBe16(p) ( (UInt16) GetBe16_to32(p)) | ||
568 | #else | ||
540 | #define GetBe16(p) ( (UInt16) ( \ | 569 | #define GetBe16(p) ( (UInt16) ( \ |
541 | ((UInt16)((const Byte *)(p))[0] << 8) | \ | 570 | ((UInt16)((const Byte *)(p))[0] << 8) | \ |
542 | ((const Byte *)(p))[1] )) | 571 | ((const Byte *)(p))[1] )) |
543 | #endif | 572 | #endif |
573 | #endif | ||
544 | 574 | ||
545 | 575 | ||
546 | #if defined(MY_CPU_BE) | 576 | #if defined(MY_CPU_BE) |
547 | #define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v) | 577 | #define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v) |
548 | #define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) | 578 | #define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) |
549 | #define Z7_CONV_NATIVE_TO_BE_32(v) (v) | 579 | #define Z7_CONV_NATIVE_TO_BE_32(v) (v) |
580 | // #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b1) | ((b0) << 8)) | ||
550 | #elif defined(MY_CPU_LE) | 581 | #elif defined(MY_CPU_LE) |
551 | #define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) | 582 | #define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) |
552 | #define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v) | 583 | #define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v) |
553 | #define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v) | 584 | #define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v) |
585 | // #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b0) | ((b1) << 8)) | ||
554 | #else | 586 | #else |
555 | #error Stop_Compiling_Unknown_Endian_CONV | 587 | #error Stop_Compiling_Unknown_Endian_CONV |
556 | #endif | 588 | #endif |
@@ -589,6 +621,11 @@ problem-4 : performace: | |||
589 | #endif | 621 | #endif |
590 | 622 | ||
591 | 623 | ||
624 | #ifndef GetBe16_to32 | ||
625 | #define GetBe16_to32(p) GetBe16(p) | ||
626 | #endif | ||
627 | |||
628 | |||
592 | #if defined(MY_CPU_X86_OR_AMD64) \ | 629 | #if defined(MY_CPU_X86_OR_AMD64) \ |
593 | || defined(MY_CPU_ARM_OR_ARM64) \ | 630 | || defined(MY_CPU_ARM_OR_ARM64) \ |
594 | || defined(MY_CPU_PPC_OR_PPC64) | 631 | || defined(MY_CPU_PPC_OR_PPC64) |
@@ -617,6 +654,7 @@ BoolInt CPU_IsSupported_SSE2(void); | |||
617 | BoolInt CPU_IsSupported_SSSE3(void); | 654 | BoolInt CPU_IsSupported_SSSE3(void); |
618 | BoolInt CPU_IsSupported_SSE41(void); | 655 | BoolInt CPU_IsSupported_SSE41(void); |
619 | BoolInt CPU_IsSupported_SHA(void); | 656 | BoolInt CPU_IsSupported_SHA(void); |
657 | BoolInt CPU_IsSupported_SHA512(void); | ||
620 | BoolInt CPU_IsSupported_PageGB(void); | 658 | BoolInt CPU_IsSupported_PageGB(void); |
621 | 659 | ||
622 | #elif defined(MY_CPU_ARM_OR_ARM64) | 660 | #elif defined(MY_CPU_ARM_OR_ARM64) |
@@ -634,6 +672,7 @@ BoolInt CPU_IsSupported_SHA1(void); | |||
634 | BoolInt CPU_IsSupported_SHA2(void); | 672 | BoolInt CPU_IsSupported_SHA2(void); |
635 | BoolInt CPU_IsSupported_AES(void); | 673 | BoolInt CPU_IsSupported_AES(void); |
636 | #endif | 674 | #endif |
675 | BoolInt CPU_IsSupported_SHA512(void); | ||
637 | 676 | ||
638 | #endif | 677 | #endif |
639 | 678 | ||
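The contracts added above are: GetBe16_to32 widens a big-endian 16-bit load into a UInt32 without a separate mask, and SetBe64 stores the most significant byte first on every path. A small self-check, assuming CpuArch.h is included:

    #include <assert.h>

    static void ByteOrder_SelfTest(void)
    {
      Byte buf[8];
      SetBe64(buf, 0x0102030405060708ULL)     /* macro expands to a block */
      assert(buf[0] == 0x01 && buf[7] == 0x08);
      assert(GetBe16_to32(buf) == 0x0102);
      assert(GetBe64(buf) == 0x0102030405060708ULL);
    }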
diff --git a/C/HuffEnc.c b/C/HuffEnc.c index 996da30..cbf8c22 100644 --- a/C/HuffEnc.c +++ b/C/HuffEnc.c | |||
@@ -1,60 +1,125 @@ | |||
1 | /* HuffEnc.c -- functions for Huffman encoding | 1 | /* HuffEnc.c -- functions for Huffman encoding |
2 | 2023-09-07 : Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
6 | #include <string.h> | ||
7 | |||
6 | #include "HuffEnc.h" | 8 | #include "HuffEnc.h" |
7 | #include "Sort.h" | 9 | #include "Sort.h" |
10 | #include "CpuArch.h" | ||
8 | 11 | ||
9 | #define kMaxLen 16 | 12 | #define kMaxLen Z7_HUFFMAN_LEN_MAX |
10 | #define NUM_BITS 10 | 13 | #define NUM_BITS 10 |
11 | #define MASK ((1u << NUM_BITS) - 1) | 14 | #define MASK ((1u << NUM_BITS) - 1) |
12 | 15 | #define FREQ_MASK (~(UInt32)MASK) | |
13 | #define NUM_COUNTERS 64 | 16 | #define NUM_COUNTERS (48 * 2) |
14 | 17 | ||
15 | #define HUFFMAN_SPEED_OPT | 18 | #if 1 && (defined(MY_CPU_LE) || defined(MY_CPU_BE)) |
19 | #if defined(MY_CPU_LE) | ||
20 | #define HI_HALF_OFFSET 1 | ||
21 | #else | ||
22 | #define HI_HALF_OFFSET 0 | ||
23 | #endif | ||
24 | #define LOAD_PARENT(p) ((unsigned)*((const UInt16 *)(p) + HI_HALF_OFFSET)) | ||
25 | #define STORE_PARENT(p, fb, val) *((UInt16 *)(p) + HI_HALF_OFFSET) = (UInt16)(val); | ||
26 | #define STORE_PARENT_DIRECT(p, fb, hi) STORE_PARENT(p, fb, hi) | ||
27 | #define UPDATE_E(eHi) eHi++; | ||
28 | #else | ||
29 | #define LOAD_PARENT(p) ((unsigned)(*(p) >> NUM_BITS)) | ||
30 | #define STORE_PARENT_DIRECT(p, fb, hi) *(p) = ((fb) & MASK) | (hi); // set parent field | ||
31 | #define STORE_PARENT(p, fb, val) STORE_PARENT_DIRECT(p, fb, ((UInt32)(val) << NUM_BITS)) | ||
32 | #define UPDATE_E(eHi) eHi += 1 << NUM_BITS; | ||
33 | #endif | ||
16 | 34 | ||
17 | void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymbols, UInt32 maxLen) | 35 | void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, unsigned numSymbols, unsigned maxLen) |
18 | { | 36 | { |
19 | UInt32 num = 0; | 37 | #if NUM_COUNTERS > 2 |
20 | /* if (maxLen > 10) maxLen = 10; */ | 38 | unsigned counters[NUM_COUNTERS]; |
39 | #endif | ||
40 | #if 1 && NUM_COUNTERS > (kMaxLen + 4) * 2 | ||
41 | #define lenCounters (counters) | ||
42 | #define codes (counters + kMaxLen + 4) | ||
43 | #else | ||
44 | unsigned lenCounters[kMaxLen + 1]; | ||
45 | UInt32 codes[kMaxLen + 1]; | ||
46 | #endif | ||
47 | |||
48 | unsigned num; | ||
21 | { | 49 | { |
22 | UInt32 i; | 50 | unsigned i; |
23 | 51 | // UInt32 sum = 0; | |
24 | #ifdef HUFFMAN_SPEED_OPT | 52 | |
53 | #if NUM_COUNTERS > 2 | ||
25 | 54 | ||
26 | UInt32 counters[NUM_COUNTERS]; | 55 | #define CTR_ITEM_FOR_FREQ(freq) \ |
56 | counters[(freq) >= NUM_COUNTERS - 1 ? NUM_COUNTERS - 1 : (unsigned)(freq)] | ||
57 | |||
27 | for (i = 0; i < NUM_COUNTERS; i++) | 58 | for (i = 0; i < NUM_COUNTERS; i++) |
28 | counters[i] = 0; | 59 | counters[i] = 0; |
29 | for (i = 0; i < numSymbols; i++) | 60 | memset(lens, 0, numSymbols); |
30 | { | 61 | { |
31 | UInt32 freq = freqs[i]; | 62 | const UInt32 *fp = freqs + numSymbols; |
32 | counters[(freq < NUM_COUNTERS - 1) ? freq : NUM_COUNTERS - 1]++; | 63 | #define NUM_UNROLLS 1 |
64 | #if NUM_UNROLLS > 1 // use 1 if odd (numSymbols) is possible | ||
65 | if (numSymbols & 1) | ||
66 | { | ||
67 | UInt32 f; | ||
68 | f = *--fp; CTR_ITEM_FOR_FREQ(f)++; | ||
69 | // sum += f; | ||
70 | } | ||
71 | #endif | ||
72 | do | ||
73 | { | ||
74 | UInt32 f; | ||
75 | fp -= NUM_UNROLLS; | ||
76 | f = fp[0]; CTR_ITEM_FOR_FREQ(f)++; | ||
77 | // sum += f; | ||
78 | #if NUM_UNROLLS > 1 | ||
79 | f = fp[1]; CTR_ITEM_FOR_FREQ(f)++; | ||
80 | // sum += f; | ||
81 | #endif | ||
82 | } | ||
83 | while (fp != freqs); | ||
33 | } | 84 | } |
34 | 85 | #if 0 | |
35 | for (i = 1; i < NUM_COUNTERS; i++) | 86 | printf("\nsum=%8u numSymbols =%3u ctrs:", sum, numSymbols); |
36 | { | 87 | { |
37 | UInt32 temp = counters[i]; | 88 | unsigned k = 0; |
38 | counters[i] = num; | 89 | for (k = 0; k < NUM_COUNTERS; k++) |
39 | num += temp; | 90 | printf(" %u", counters[k]); |
40 | } | 91 | } |
41 | 92 | #endif | |
42 | for (i = 0; i < numSymbols; i++) | 93 | |
94 | num = counters[1]; | ||
95 | counters[1] = 0; | ||
96 | for (i = 2; i != NUM_COUNTERS; i += 2) | ||
43 | { | 97 | { |
44 | UInt32 freq = freqs[i]; | 98 | unsigned c; |
45 | if (freq == 0) | 99 | c = (counters )[i]; (counters )[i] = num; num += c; |
46 | lens[i] = 0; | 100 | c = (counters + 1)[i]; (counters + 1)[i] = num; num += c; |
47 | else | 101 | } |
48 | p[counters[((freq < NUM_COUNTERS - 1) ? freq : NUM_COUNTERS - 1)]++] = i | (freq << NUM_BITS); | 102 | counters[0] = num; // we want to write (freq==0) symbols to the end of (p) array |
103 | { | ||
104 | i = 0; | ||
105 | do | ||
106 | { | ||
107 | const UInt32 f = freqs[i]; | ||
108 | #if 0 | ||
109 | if (f == 0) lens[i] = 0; else | ||
110 | #endif | ||
111 | p[CTR_ITEM_FOR_FREQ(f)++] = i | (f << NUM_BITS); | ||
112 | } | ||
113 | while (++i != numSymbols); | ||
49 | } | 114 | } |
50 | counters[0] = 0; | ||
51 | HeapSort(p + counters[NUM_COUNTERS - 2], counters[NUM_COUNTERS - 1] - counters[NUM_COUNTERS - 2]); | 115 | HeapSort(p + counters[NUM_COUNTERS - 2], counters[NUM_COUNTERS - 1] - counters[NUM_COUNTERS - 2]); |
52 | 116 | ||
53 | #else | 117 | #else // NUM_COUNTERS <= 2 |
54 | 118 | ||
119 | num = 0; | ||
55 | for (i = 0; i < numSymbols; i++) | 120 | for (i = 0; i < numSymbols; i++) |
56 | { | 121 | { |
57 | UInt32 freq = freqs[i]; | 122 | const UInt32 freq = freqs[i]; |
58 | if (freq == 0) | 123 | if (freq == 0) |
59 | lens[i] = 0; | 124 | lens[i] = 0; |
60 | else | 125 | else |
@@ -62,17 +127,27 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb | |||
62 | } | 127 | } |
63 | HeapSort(p, num); | 128 | HeapSort(p, num); |
64 | 129 | ||
65 | #endif | 130 | #endif |
66 | } | 131 | } |
67 | 132 | ||
68 | if (num < 2) | 133 | if (num <= 2) |
69 | { | 134 | { |
70 | unsigned minCode = 0; | 135 | unsigned minCode = 0; |
71 | unsigned maxCode = 1; | 136 | unsigned maxCode = 1; |
72 | if (num == 1) | 137 | if (num) |
73 | { | 138 | { |
74 | maxCode = (unsigned)p[0] & MASK; | 139 | maxCode = (unsigned)p[(size_t)num - 1] & MASK; |
75 | if (maxCode == 0) | 140 | if (num == 2) |
141 | { | ||
142 | minCode = (unsigned)p[0] & MASK; | ||
143 | if (minCode > maxCode) | ||
144 | { | ||
145 | const unsigned temp = minCode; | ||
146 | minCode = maxCode; | ||
147 | maxCode = temp; | ||
148 | } | ||
149 | } | ||
150 | else if (maxCode == 0) | ||
76 | maxCode++; | 151 | maxCode++; |
77 | } | 152 | } |
78 | p[minCode] = 0; | 153 | p[minCode] = 0; |
@@ -80,69 +155,191 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb | |||
80 | lens[minCode] = lens[maxCode] = 1; | 155 | lens[minCode] = lens[maxCode] = 1; |
81 | return; | 156 | return; |
82 | } | 157 | } |
83 | |||
84 | { | 158 | { |
85 | UInt32 b, e, i; | 159 | unsigned i; |
86 | 160 | for (i = 0; i <= kMaxLen; i++) | |
87 | i = b = e = 0; | 161 | lenCounters[i] = 0; |
88 | do | 162 | lenCounters[1] = 2; // by default root node has 2 child leaves at level 1. |
163 | } | ||
164 | // if (num != 2) | ||
165 | { | ||
166 | // num > 2 | ||
167 | // the binary tree will contain (num - 1) internal nodes. | ||
168 | // p[num - 2] will be root node of binary tree. | ||
169 | UInt32 *b; | ||
170 | UInt32 *n; | ||
171 | // first node will have two leaf children: p[0] and p[1]: | ||
172 | // p[0] += p[1] & FREQ_MASK; // set frequency sum of child leaves | ||
173 | // if (pi == n) exit(0); | ||
174 | // if (pi != n) | ||
89 | { | 175 | { |
90 | UInt32 n, m, freq; | 176 | UInt32 fb = (p[1] & FREQ_MASK) + p[0]; |
91 | n = (i != num && (b == e || (p[i] >> NUM_BITS) <= (p[b] >> NUM_BITS))) ? i++ : b++; | 177 | UInt32 f = p[2] & FREQ_MASK; |
92 | freq = (p[n] & ~MASK); | 178 | const UInt32 *pi = p + 2; |
93 | p[n] = (p[n] & MASK) | (e << NUM_BITS); | 179 | UInt32 *e = p; |
94 | m = (i != num && (b == e || (p[i] >> NUM_BITS) <= (p[b] >> NUM_BITS))) ? i++ : b++; | 180 | UInt32 eHi = 0; |
95 | freq += (p[m] & ~MASK); | 181 | n = p + num; |
96 | p[m] = (p[m] & MASK) | (e << NUM_BITS); | 182 | b = p; |
97 | p[e] = (p[e] & MASK) | freq; | 183 | // p[0] = fb; |
98 | e++; | 184 | for (;;) |
185 | { | ||
186 | // (b <= e) | ||
187 | UInt32 sum; | ||
188 | e++; | ||
189 | UPDATE_E(eHi) | ||
190 | |||
191 | // (b < e) | ||
192 | |||
193 | // p range : high bits | ||
194 | // [0, b) : parent : processed nodes that have parent and children | ||
195 | // [b, e) : FREQ : non-processed nodes that have no parent but have children | ||
196 | // [e, pi) : FREQ : processed leaves for which parent node was created | ||
197 | // [pi, n) : FREQ : non-processed leaves for which parent node was not created | ||
198 | |||
199 | // first child | ||
200 | // note : (*b < f) is same result as ((*b & FREQ_MASK) < f) | ||
201 | if (fb < f) | ||
202 | { | ||
203 | // node freq is smaller | ||
204 | sum = fb & FREQ_MASK; | ||
205 | STORE_PARENT_DIRECT (b, fb, eHi) | ||
206 | b++; | ||
207 | fb = *b; | ||
208 | if (b == e) | ||
209 | { | ||
210 | if (++pi == n) | ||
211 | break; | ||
212 | sum += f; | ||
213 | fb &= MASK; | ||
214 | fb |= sum; | ||
215 | *e = fb; | ||
216 | f = *pi & FREQ_MASK; | ||
217 | continue; | ||
218 | } | ||
219 | } | ||
220 | else if (++pi == n) | ||
221 | { | ||
222 | STORE_PARENT_DIRECT (b, fb, eHi) | ||
223 | b++; | ||
224 | break; | ||
225 | } | ||
226 | else | ||
227 | { | ||
228 | sum = f; | ||
229 | f = *pi & FREQ_MASK; | ||
230 | } | ||
231 | |||
232 | // (b < e) | ||
233 | |||
234 | // second child | ||
235 | if (fb < f) | ||
236 | { | ||
237 | sum += fb; | ||
238 | sum &= FREQ_MASK; | ||
239 | STORE_PARENT_DIRECT (b, fb, eHi) | ||
240 | b++; | ||
241 | *e = (*e & MASK) | sum; // set frequency sum | ||
242 | // (b <= e) is possible here | ||
243 | fb = *b; | ||
244 | } | ||
245 | else if (++pi == n) | ||
246 | break; | ||
247 | else | ||
248 | { | ||
249 | sum += f; | ||
250 | f = *pi & FREQ_MASK; | ||
251 | *e = (*e & MASK) | sum; // set frequency sum | ||
252 | } | ||
253 | } | ||
99 | } | 254 | } |
100 | while (num - e > 1); | ||
101 | 255 | ||
256 | // printf("\nnum-e=%3u, numSymbols=%3u, num=%3u, b=%3u", n - e, numSymbols, n - p, b - p); | ||
102 | { | 257 | { |
103 | UInt32 lenCounters[kMaxLen + 1]; | 258 | n -= 2; |
104 | for (i = 0; i <= kMaxLen; i++) | 259 | *n &= MASK; // root node : we clear high bits (zero bits mean level == 0) |
105 | lenCounters[i] = 0; | 260 | if (n != b) |
106 | |||
107 | p[--e] &= MASK; | ||
108 | lenCounters[1] = 2; | ||
109 | while (e != 0) | ||
110 | { | ||
111 | UInt32 len = (p[p[--e] >> NUM_BITS] >> NUM_BITS) + 1; | ||
112 | p[e] = (p[e] & MASK) | (len << NUM_BITS); | ||
113 | if (len >= maxLen) | ||
114 | for (len = maxLen - 1; lenCounters[len] == 0; len--); | ||
115 | lenCounters[len]--; | ||
116 | lenCounters[(size_t)len + 1] += 2; | ||
117 | } | ||
118 | |||
119 | { | 261 | { |
120 | UInt32 len; | 262 | // We get here if we have some number of non-created nodes up to root.
121 | i = 0; | 263 | // We process them in simplified code: |
122 | for (len = maxLen; len != 0; len--) | 264 | // position of parent for each pair of nodes is known. |
265 | // n[-2], n[-1] : current pair of child nodes | ||
266 | // (p1) : parent node for current pair. | ||
267 | UInt32 *p1 = n; | ||
268 | do | ||
123 | { | 269 | { |
124 | UInt32 k; | 270 | const unsigned len = LOAD_PARENT(p1) + 1; |
125 | for (k = lenCounters[len]; k != 0; k--) | 271 | p1--; |
126 | lens[p[i++] & MASK] = (Byte)len; | 272 | (lenCounters )[len] -= 2; // we remove 2 leaves from level (len) |
273 | (lenCounters + 1)[len] += 2 * 2; // we add 4 leaves at level (len + 1) | ||
274 | n -= 2; | ||
275 | STORE_PARENT (n , n[0], len) | ||
276 | STORE_PARENT (n + 1, n[1], len) | ||
127 | } | 277 | } |
278 | while (n != b); | ||
128 | } | 279 | } |
129 | 280 | } | |
281 | |||
282 | if (b != p) | ||
283 | { | ||
284 | // we detect level of each node (relative to root), | ||
285 | // and update lenCounters[]. | ||
286 | // We process only intermediate nodes and we don't process leaves. | ||
287 | do | ||
130 | { | 288 | { |
131 | UInt32 nextCodes[kMaxLen + 1]; | 289 | // if (ii < b) : parent_bits_of (p[ii]) == index of parent node : ii < (p[ii]) |
132 | { | 290 | // if (ii >= b) : parent_bits_of (p[ii]) == level of this (ii) node in tree |
133 | UInt32 code = 0; | 291 | unsigned len; |
134 | UInt32 len; | 292 | b--; |
135 | for (len = 1; len <= kMaxLen; len++) | 293 | len = (unsigned)LOAD_PARENT(p + LOAD_PARENT(b)) + 1; |
136 | nextCodes[len] = code = (code + lenCounters[(size_t)len - 1]) << 1; | 294 | STORE_PARENT (b, *b, len) |
137 | } | 295 | if (len >= maxLen) |
138 | /* if (code + lenCounters[kMaxLen] - 1 != (1 << kMaxLen) - 1) throw 1; */ | ||
139 | |||
140 | { | 296 | { |
141 | UInt32 k; | 297 | // We are not allowed to create node at level (maxLen) and higher, |
142 | for (k = 0; k < numSymbols; k++) | 298 | // because all leaves must be placed to level (maxLen) or lower. |
143 | p[k] = nextCodes[lens[k]]++; | 299 | // We find nearest allowed leaf and place current node to level of that leaf: |
300 | for (len = maxLen - 1; lenCounters[len] == 0; len--) {} | ||
144 | } | 301 | } |
302 | lenCounters[len]--; // we remove 1 leaf from level (len) | ||
303 | (lenCounters + 1)[len] += 2; // we add 2 leaves at level (len + 1) | ||
304 | } | ||
305 | while (b != p); | ||
306 | } | ||
307 | } | ||
308 | { | ||
309 | { | ||
310 | unsigned len = maxLen; | ||
311 | const UInt32 *p2 = p; | ||
312 | do | ||
313 | { | ||
314 | unsigned k = lenCounters[len]; | ||
315 | if (k) | ||
316 | do | ||
317 | lens[(unsigned)*p2++ & MASK] = (Byte)len; | ||
318 | while (--k); | ||
319 | } | ||
320 | while (--len); | ||
321 | } | ||
322 | codes[0] = 0; // we don't want garbage values to be written to p[] array. | ||
323 | // codes[1] = 0; | ||
324 | { | ||
325 | UInt32 code = 0; | ||
326 | unsigned len; | ||
327 | for (len = 0; len < kMaxLen; len++) | ||
328 | (codes + 1)[len] = code = (code + lenCounters[len]) << 1; | ||
329 | } | ||
330 | /* if (code + lenCounters[kMaxLen] - 1 != (1 << kMaxLen) - 1) throw 1; */ | ||
331 | { | ||
332 | const Byte * const limit = lens + numSymbols; | ||
333 | do | ||
334 | { | ||
335 | unsigned len; | ||
336 | UInt32 c; | ||
337 | len = lens[0]; c = codes[len]; p[0] = c; codes[len] = c + 1; | ||
338 | // len = lens[1]; c = codes[len]; p[1] = c; codes[len] = c + 1; | ||
339 | p += 1; | ||
340 | lens += 1; | ||
145 | } | 341 | } |
342 | while (lens != limit); | ||
146 | } | 343 | } |
147 | } | 344 | } |
148 | } | 345 | } |
@@ -150,5 +347,14 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb | |||
150 | #undef kMaxLen | 347 | #undef kMaxLen |
151 | #undef NUM_BITS | 348 | #undef NUM_BITS |
152 | #undef MASK | 349 | #undef MASK |
350 | #undef FREQ_MASK | ||
153 | #undef NUM_COUNTERS | 351 | #undef NUM_COUNTERS |
154 | #undef HUFFMAN_SPEED_OPT | 352 | #undef CTR_ITEM_FOR_FREQ |
353 | #undef LOAD_PARENT | ||
354 | #undef STORE_PARENT | ||
355 | #undef STORE_PARENT_DIRECT | ||
356 | #undef UPDATE_E | ||
357 | #undef HI_HALF_OFFSET | ||
358 | #undef NUM_UNROLLS | ||
359 | #undef lenCounters | ||
360 | #undef codes | ||
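The tail of the rewritten function assigns canonical codes: after lens[] is filled from lenCounters, the first code of each length comes from the classic recurrence code = (code + lenCounters[len - 1]) << 1, and symbols of equal length receive consecutive codes. A distilled sketch of just this step, assuming lens[] already holds valid lengths:

    static void AssignCanonicalCodes(UInt32 *p, const Byte *lens,
        const unsigned *lenCounters, unsigned numSymbols, unsigned maxLen)
    {
      UInt32 nextCode[Z7_HUFFMAN_LEN_MAX + 1];
      UInt32 code = 0;
      unsigned len, i;
      nextCode[0] = 0;               /* lens[i] == 0 : symbol with zero frequency */
      for (len = 1; len <= maxLen; len++)
        nextCode[len] = code = (code + lenCounters[len - 1]) << 1;
      for (i = 0; i < numSymbols; i++)
        p[i] = nextCode[lens[i]]++;  /* equal-length codes are consecutive */
    }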
diff --git a/C/HuffEnc.h b/C/HuffEnc.h index cbc5d11..2217f55 100644 --- a/C/HuffEnc.h +++ b/C/HuffEnc.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* HuffEnc.h -- Huffman encoding | 1 | /* HuffEnc.h -- Huffman encoding |
2 | 2023-03-05 : Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_HUFF_ENC_H | 4 | #ifndef ZIP7_INC_HUFF_ENC_H |
5 | #define ZIP7_INC_HUFF_ENC_H | 5 | #define ZIP7_INC_HUFF_ENC_H |
@@ -8,14 +8,14 @@ | |||
8 | 8 | ||
9 | EXTERN_C_BEGIN | 9 | EXTERN_C_BEGIN |
10 | 10 | ||
11 | #define Z7_HUFFMAN_LEN_MAX 16 | ||
11 | /* | 12 | /* |
12 | Conditions: | 13 | Conditions: |
13 | num <= 1024 = 2 ^ NUM_BITS | 14 | 2 <= num <= 1024 = 2 ^ NUM_BITS |
14 | Sum(freqs) < 4M = 2 ^ (32 - NUM_BITS) | 15 | Sum(freqs) < 4M = 2 ^ (32 - NUM_BITS) |
15 | maxLen <= 16 = kMaxLen | 16 | 1 <= maxLen <= 16 = Z7_HUFFMAN_LEN_MAX |
16 | Num_Items(p) >= HUFFMAN_TEMP_SIZE(num) | 17 | Num_Items(p) >= HUFFMAN_TEMP_SIZE(num) |
17 | */ | 18 | */ |
18 | |||
19 | void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 num, UInt32 maxLen); | 19 | void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 num, UInt32 maxLen); |
20 | 20 | ||
21 | EXTERN_C_END | 21 | EXTERN_C_END |
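A hedged usage sketch honoring the documented constraints (2 <= num <= 1024, Sum(freqs) < 2^22, maxLen <= 16); the temp/output array must satisfy HUFFMAN_TEMP_SIZE(num):

    #define NUM_SYMS 286   /* example alphabet size, within 2..1024 */

    static void BuildTable(const UInt32 freqs[NUM_SYMS],
        UInt32 codes[NUM_SYMS], Byte lens[NUM_SYMS])
    {
      /* codes[] doubles as the work area and receives the final codes */
      Huffman_Generate(freqs, codes, lens, NUM_SYMS, 15);
    }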
@@ -1,5 +1,5 @@ | |||
1 | /* LzFind.c -- Match finder for LZ algorithms | 1 | /* LzFind.c -- Match finder for LZ algorithms |
2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -404,7 +404,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, | |||
404 | const unsigned nbMax = | 404 | const unsigned nbMax = |
405 | (p->numHashBytes == 2 ? 16 : | 405 | (p->numHashBytes == 2 ? 16 : |
406 | (p->numHashBytes == 3 ? 24 : 32)); | 406 | (p->numHashBytes == 3 ? 24 : 32)); |
407 | if (numBits > nbMax) | 407 | if (numBits >= nbMax) |
408 | numBits = nbMax; | 408 | numBits = nbMax; |
409 | if (numBits >= 32) | 409 | if (numBits >= 32) |
410 | hs = (UInt32)0 - 1; | 410 | hs = (UInt32)0 - 1; |
@@ -416,14 +416,14 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, | |||
416 | hs |= (256 << kLzHash_CrcShift_2) - 1; | 416 | hs |= (256 << kLzHash_CrcShift_2) - 1; |
417 | { | 417 | { |
418 | const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize); | 418 | const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize); |
419 | if (hs > hs2) | 419 | if (hs >= hs2) |
420 | hs = hs2; | 420 | hs = hs2; |
421 | } | 421 | } |
422 | hsCur = hs; | 422 | hsCur = hs; |
423 | if (p->expectedDataSize < historySize) | 423 | if (p->expectedDataSize < historySize) |
424 | { | 424 | { |
425 | const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize); | 425 | const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize); |
426 | if (hsCur > hs2) | 426 | if (hsCur >= hs2) |
427 | hsCur = hs2; | 427 | hsCur = hs2; |
428 | } | 428 | } |
429 | } | 429 | } |
@@ -434,7 +434,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, | |||
434 | if (p->expectedDataSize < historySize) | 434 | if (p->expectedDataSize < historySize) |
435 | { | 435 | { |
436 | hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize); | 436 | hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize); |
437 | if (hsCur > hs) // is it possible? | 437 | if (hsCur >= hs) // is it possible? |
438 | hsCur = hs; | 438 | hsCur = hs; |
439 | } | 439 | } |
440 | } | 440 | } |
@@ -598,7 +598,7 @@ void MatchFinder_Init(void *_p) | |||
598 | 598 | ||
599 | #ifdef MY_CPU_X86_OR_AMD64 | 599 | #ifdef MY_CPU_X86_OR_AMD64 |
600 | #if defined(__clang__) && (__clang_major__ >= 4) \ | 600 | #if defined(__clang__) && (__clang_major__ >= 4) \ |
601 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701) | 601 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) |
602 | // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) | 602 | // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) |
603 | 603 | ||
604 | #define USE_LZFIND_SATUR_SUB_128 | 604 | #define USE_LZFIND_SATUR_SUB_128 |
@@ -890,7 +890,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, | |||
890 | return d; | 890 | return d; |
891 | { | 891 | { |
892 | const Byte *pb = cur - delta; | 892 | const Byte *pb = cur - delta; |
893 | curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; | 893 | curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)]; |
894 | if (pb[maxLen] == cur[maxLen] && *pb == *cur) | 894 | if (pb[maxLen] == cur[maxLen] && *pb == *cur) |
895 | { | 895 | { |
896 | UInt32 len = 0; | 896 | UInt32 len = 0; |
@@ -925,7 +925,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, | |||
925 | break; | 925 | break; |
926 | { | 926 | { |
927 | ptrdiff_t diff; | 927 | ptrdiff_t diff; |
928 | curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; | 928 | curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)]; |
929 | diff = (ptrdiff_t)0 - (ptrdiff_t)delta; | 929 | diff = (ptrdiff_t)0 - (ptrdiff_t)delta; |
930 | if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff]) | 930 | if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff]) |
931 | { | 931 | { |
@@ -972,7 +972,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt | |||
972 | // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } | 972 | // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } |
973 | 973 | ||
974 | cmCheck = (UInt32)(pos - _cyclicBufferSize); | 974 | cmCheck = (UInt32)(pos - _cyclicBufferSize); |
975 | if ((UInt32)pos <= _cyclicBufferSize) | 975 | if ((UInt32)pos < _cyclicBufferSize) |
976 | cmCheck = 0; | 976 | cmCheck = 0; |
977 | 977 | ||
978 | if (cmCheck < curMatch) | 978 | if (cmCheck < curMatch) |
@@ -980,7 +980,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt | |||
980 | { | 980 | { |
981 | const UInt32 delta = pos - curMatch; | 981 | const UInt32 delta = pos - curMatch; |
982 | { | 982 | { |
983 | CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); | 983 | CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1); |
984 | const Byte *pb = cur - delta; | 984 | const Byte *pb = cur - delta; |
985 | unsigned len = (len0 < len1 ? len0 : len1); | 985 | unsigned len = (len0 < len1 ? len0 : len1); |
986 | const UInt32 pair0 = pair[0]; | 986 | const UInt32 pair0 = pair[0]; |
@@ -1039,7 +1039,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const | |||
1039 | UInt32 cmCheck; | 1039 | UInt32 cmCheck; |
1040 | 1040 | ||
1041 | cmCheck = (UInt32)(pos - _cyclicBufferSize); | 1041 | cmCheck = (UInt32)(pos - _cyclicBufferSize); |
1042 | if ((UInt32)pos <= _cyclicBufferSize) | 1042 | if ((UInt32)pos < _cyclicBufferSize) |
1043 | cmCheck = 0; | 1043 | cmCheck = 0; |
1044 | 1044 | ||
1045 | if (// curMatch >= pos || // failure | 1045 | if (// curMatch >= pos || // failure |
@@ -1048,7 +1048,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const | |||
1048 | { | 1048 | { |
1049 | const UInt32 delta = pos - curMatch; | 1049 | const UInt32 delta = pos - curMatch; |
1050 | { | 1050 | { |
1051 | CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); | 1051 | CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1); |
1052 | const Byte *pb = cur - delta; | 1052 | const Byte *pb = cur - delta; |
1053 | unsigned len = (len0 < len1 ? len0 : len1); | 1053 | unsigned len = (len0 < len1 ? len0 : len1); |
1054 | if (pb[len] == cur[len]) | 1054 | if (pb[len] == cur[len]) |
@@ -1595,7 +1595,7 @@ static void Bt5_MatchFinder_Skip(void *_p, UInt32 num) | |||
1595 | UInt32 pos = p->pos; \ | 1595 | UInt32 pos = p->pos; \ |
1596 | UInt32 num2 = num; \ | 1596 | UInt32 num2 = num; \ |
1597 | /* (p->pos == p->posLimit) is not allowed here !!! */ \ | 1597 | /* (p->pos == p->posLimit) is not allowed here !!! */ \ |
1598 | { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \ | 1598 | { const UInt32 rem = p->posLimit - pos; if (num2 >= rem) num2 = rem; } \ |
1599 | num -= num2; \ | 1599 | num -= num2; \ |
1600 | { const UInt32 cycPos = p->cyclicBufferPos; \ | 1600 | { const UInt32 cycPos = p->cyclicBufferPos; \ |
1601 | son = p->son + cycPos; \ | 1601 | son = p->son + cycPos; \ |
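Several hunks above rewrite the wrap test from (delta > _cyclicBufferPos) to the equivalent (_cyclicBufferPos < delta); the computed index is unchanged. For reference, the circular-buffer step in isolation:

    /* position (pos - delta) inside a circular window of cyclicBufferSize slots;
       delta is in (0, cyclicBufferSize], so one added cycle prevents underflow */
    static UInt32 CyclicBack(UInt32 cyclicBufferPos, UInt32 delta,
        UInt32 cyclicBufferSize)
    {
      return cyclicBufferPos - delta
          + (cyclicBufferPos < delta ? cyclicBufferSize : 0);
    }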
diff --git a/C/LzFindMt.c b/C/LzFindMt.c index ac9d59d..25fcc46 100644 --- a/C/LzFindMt.c +++ b/C/LzFindMt.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* LzFindMt.c -- multithreaded Match finder for LZ algorithms | 1 | /* LzFindMt.c -- multithreaded Match finder for LZ algorithms |
2 | 2024-01-22 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -82,6 +82,8 @@ extern UInt64 g_NumIters_Bytes; | |||
82 | Z7_NO_INLINE | 82 | Z7_NO_INLINE |
83 | static void MtSync_Construct(CMtSync *p) | 83 | static void MtSync_Construct(CMtSync *p) |
84 | { | 84 | { |
85 | p->affinityGroup = -1; | ||
86 | p->affinityInGroup = 0; | ||
85 | p->affinity = 0; | 87 | p->affinity = 0; |
86 | p->wasCreated = False; | 88 | p->wasCreated = False; |
87 | p->csWasInitialized = False; | 89 | p->csWasInitialized = False; |
@@ -259,6 +261,12 @@ static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void * | |||
259 | // return ERROR_TOO_MANY_POSTS; // for debug | 261 | // return ERROR_TOO_MANY_POSTS; // for debug |
260 | // return EINVAL; // for debug | 262 | // return EINVAL; // for debug |
261 | 263 | ||
264 | #ifdef _WIN32 | ||
265 | if (p->affinityGroup >= 0) | ||
266 | wres = Thread_Create_With_Group(&p->thread, startAddress, obj, | ||
267 | (unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup); | ||
268 | else | ||
269 | #endif | ||
262 | if (p->affinity != 0) | 270 | if (p->affinity != 0) |
263 | wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity); | 271 | wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity); |
264 | else | 272 | else |
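The creation path now prefers an explicit Windows processor group when affinityGroup is set (-1, the default, disables it) and only then falls back to the flat affinity mask. A condensed restatement, assuming the Threads.h declarations this file already uses:

    static WRes MtSync_CreateThread_Sketch(CMtSync *p,
        THREAD_FUNC_TYPE startAddress, void *obj)
    {
    #ifdef _WIN32
      if (p->affinityGroup >= 0)
        return Thread_Create_With_Group(&p->thread, startAddress, obj,
            (unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup);
    #endif
      if (p->affinity != 0)
        return Thread_Create_With_Affinity(&p->thread, startAddress, obj,
            (CAffinityMask)p->affinity);
      return Thread_Create(&p->thread, startAddress, obj);
    }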
diff --git a/C/LzFindMt.h b/C/LzFindMt.h index fcb479d..89984f5 100644 --- a/C/LzFindMt.h +++ b/C/LzFindMt.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* LzFindMt.h -- multithreaded Match finder for LZ algorithms | 1 | /* LzFindMt.h -- multithreaded Match finder for LZ algorithms |
2 | 2024-01-22 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_LZ_FIND_MT_H | 4 | #ifndef ZIP7_INC_LZ_FIND_MT_H |
5 | #define ZIP7_INC_LZ_FIND_MT_H | 5 | #define ZIP7_INC_LZ_FIND_MT_H |
@@ -12,8 +12,10 @@ EXTERN_C_BEGIN | |||
12 | typedef struct | 12 | typedef struct |
13 | { | 13 | { |
14 | UInt32 numProcessedBlocks; | 14 | UInt32 numProcessedBlocks; |
15 | CThread thread; | 15 | Int32 affinityGroup; |
16 | UInt64 affinityInGroup; | ||
16 | UInt64 affinity; | 17 | UInt64 affinity; |
18 | CThread thread; | ||
17 | 19 | ||
18 | BoolInt wasCreated; | 20 | BoolInt wasCreated; |
19 | BoolInt needStart; | 21 | BoolInt needStart; |
diff --git a/C/Lzma2Enc.c b/C/Lzma2Enc.c index 703e146..72aec69 100644 --- a/C/Lzma2Enc.c +++ b/C/Lzma2Enc.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Lzma2Enc.c -- LZMA2 Encoder | 1 | /* Lzma2Enc.c -- LZMA2 Encoder |
2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -235,6 +235,7 @@ void Lzma2EncProps_Init(CLzma2EncProps *p) | |||
235 | p->numBlockThreads_Reduced = -1; | 235 | p->numBlockThreads_Reduced = -1; |
236 | p->numBlockThreads_Max = -1; | 236 | p->numBlockThreads_Max = -1; |
237 | p->numTotalThreads = -1; | 237 | p->numTotalThreads = -1; |
238 | p->numThreadGroups = 0; | ||
238 | } | 239 | } |
239 | 240 | ||
240 | void Lzma2EncProps_Normalize(CLzma2EncProps *p) | 241 | void Lzma2EncProps_Normalize(CLzma2EncProps *p) |
@@ -781,6 +782,7 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle p, | |||
781 | } | 782 | } |
782 | 783 | ||
783 | p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max; | 784 | p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max; |
785 | p->mtCoder.numThreadGroups = p->props.numThreadGroups; | ||
784 | p->mtCoder.expectedDataSize = p->expectedDataSize; | 786 | p->mtCoder.expectedDataSize = p->expectedDataSize; |
785 | 787 | ||
786 | { | 788 | { |
diff --git a/C/Lzma2Enc.h b/C/Lzma2Enc.h index cb25275..1e6b50c 100644 --- a/C/Lzma2Enc.h +++ b/C/Lzma2Enc.h | |||
@@ -18,6 +18,7 @@ typedef struct | |||
18 | int numBlockThreads_Reduced; | 18 | int numBlockThreads_Reduced; |
19 | int numBlockThreads_Max; | 19 | int numBlockThreads_Max; |
20 | int numTotalThreads; | 20 | int numTotalThreads; |
21 | unsigned numThreadGroups; // 0 : no groups | ||
21 | } CLzma2EncProps; | 22 | } CLzma2EncProps; |
22 | 23 | ||
23 | void Lzma2EncProps_Init(CLzma2EncProps *p); | 24 | void Lzma2EncProps_Init(CLzma2EncProps *p); |
diff --git a/C/LzmaEnc.c b/C/LzmaEnc.c index 37b2787..84a29a5 100644 --- a/C/LzmaEnc.c +++ b/C/LzmaEnc.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* LzmaEnc.c -- LZMA Encoder | 1 | /* LzmaEnc.c -- LZMA Encoder |
2 | 2024-01-24: Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -62,7 +62,9 @@ void LzmaEncProps_Init(CLzmaEncProps *p) | |||
62 | p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1; | 62 | p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1; |
63 | p->numHashOutBits = 0; | 63 | p->numHashOutBits = 0; |
64 | p->writeEndMark = 0; | 64 | p->writeEndMark = 0; |
65 | p->affinityGroup = -1; | ||
65 | p->affinity = 0; | 66 | p->affinity = 0; |
67 | p->affinityInGroup = 0; | ||
66 | } | 68 | } |
67 | 69 | ||
68 | void LzmaEncProps_Normalize(CLzmaEncProps *p) | 70 | void LzmaEncProps_Normalize(CLzmaEncProps *p) |
@@ -72,11 +74,11 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) | |||
72 | p->level = level; | 74 | p->level = level; |
73 | 75 | ||
74 | if (p->dictSize == 0) | 76 | if (p->dictSize == 0) |
75 | p->dictSize = | 77 | p->dictSize = (unsigned)level <= 4 ? |
76 | ( level <= 3 ? ((UInt32)1 << (level * 2 + 16)) : | 78 | (UInt32)1 << (level * 2 + 16) : |
77 | ( level <= 6 ? ((UInt32)1 << (level + 19)) : | 79 | (unsigned)level <= sizeof(size_t) / 2 + 4 ? |
78 | ( level <= 7 ? ((UInt32)1 << 25) : ((UInt32)1 << 26) | 80 | (UInt32)1 << (level + 20) : |
79 | ))); | 81 | (UInt32)1 << (sizeof(size_t) / 2 + 24); |
80 | 82 | ||
81 | if (p->dictSize > p->reduceSize) | 83 | if (p->dictSize > p->reduceSize) |
82 | { | 84 | { |
@@ -92,8 +94,8 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) | |||
92 | if (p->lp < 0) p->lp = 0; | 94 | if (p->lp < 0) p->lp = 0; |
93 | if (p->pb < 0) p->pb = 2; | 95 | if (p->pb < 0) p->pb = 2; |
94 | 96 | ||
95 | if (p->algo < 0) p->algo = (level < 5 ? 0 : 1); | 97 | if (p->algo < 0) p->algo = (unsigned)level < 5 ? 0 : 1; |
96 | if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); | 98 | if (p->fb < 0) p->fb = (unsigned)level < 7 ? 32 : 64; |
97 | if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); | 99 | if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); |
98 | if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5); | 100 | if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5); |
99 | if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1); | 101 | if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1); |
@@ -598,6 +600,10 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props2) | |||
598 | p->multiThread = (props.numThreads > 1); | 600 | p->multiThread = (props.numThreads > 1); |
599 | p->matchFinderMt.btSync.affinity = | 601 | p->matchFinderMt.btSync.affinity = |
600 | p->matchFinderMt.hashSync.affinity = props.affinity; | 602 | p->matchFinderMt.hashSync.affinity = props.affinity; |
603 | p->matchFinderMt.btSync.affinityGroup = | ||
604 | p->matchFinderMt.hashSync.affinityGroup = props.affinityGroup; | ||
605 | p->matchFinderMt.btSync.affinityInGroup = | ||
606 | p->matchFinderMt.hashSync.affinityInGroup = props.affinityInGroup; | ||
601 | #endif | 607 | #endif |
602 | 608 | ||
603 | return SZ_OK; | 609 | return SZ_OK; |
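Worked out, the new default-dictionary rule gives 64 KB / 256 KB / 1 MB / 4 MB / 16 MB for levels 0-4, then doubles per level up to a pointer-size-dependent cap: 256 MB (level 8) on 64-bit targets, 64 MB (level 6) on 32-bit targets. A restatement of the hunk:

    static UInt32 DefaultDictSize(unsigned level)      /* level 0..9 */
    {
      if (level <= 4)
        return (UInt32)1 << (level * 2 + 16);          /* 64 KB .. 16 MB */
      if (level <= sizeof(size_t) / 2 + 4)             /* <= 8 (64-bit), <= 6 (32-bit) */
        return (UInt32)1 << (level + 20);              /* 32 MB .. 256 MB */
      return (UInt32)1 << (sizeof(size_t) / 2 + 24);   /* cap: 256 MB or 64 MB */
    }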
diff --git a/C/LzmaEnc.h b/C/LzmaEnc.h index 9f8039a..3feb5b4 100644 --- a/C/LzmaEnc.h +++ b/C/LzmaEnc.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* LzmaEnc.h -- LZMA Encoder | 1 | /* LzmaEnc.h -- LZMA Encoder |
2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_LZMA_ENC_H | 4 | #ifndef ZIP7_INC_LZMA_ENC_H |
5 | #define ZIP7_INC_LZMA_ENC_H | 5 | #define ZIP7_INC_LZMA_ENC_H |
@@ -29,11 +29,13 @@ typedef struct | |||
29 | int numThreads; /* 1 or 2, default = 2 */ | 29 | int numThreads; /* 1 or 2, default = 2 */ |
30 | 30 | ||
31 | // int _pad; | 31 | // int _pad; |
32 | Int32 affinityGroup; | ||
32 | 33 | ||
33 | UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1. | 34 | UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1. |
34 | Encoder uses this value to reduce dictionary size */ | 35 | Encoder uses this value to reduce dictionary size */ |
35 | 36 | ||
36 | UInt64 affinity; | 37 | UInt64 affinity; |
38 | UInt64 affinityInGroup; | ||
37 | } CLzmaEncProps; | 39 | } CLzmaEncProps; |
38 | 40 | ||
39 | void LzmaEncProps_Init(CLzmaEncProps *p); | 41 | void LzmaEncProps_Init(CLzmaEncProps *p); |
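A hedged sketch of driving the new fields: pin the two LZMA helper threads to a processor group on a multi-group NUMA machine; the group number and mask here are illustrative:

    CLzmaEncProps props;
    LzmaEncProps_Init(&props);
    props.level = 9;
    props.numThreads = 2;
    props.affinityGroup = 1;        /* -1 (default) leaves group pinning off */
    props.affinityInGroup = 0xF;    /* affinity mask within the chosen group */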
@@ -0,0 +1,206 @@ | |||
1 | /* Md5.c -- MD5 Hash | ||
2 | : Igor Pavlov : Public domain | ||
3 | This code is based on Colin Plumb's public domain md5.c code */ | ||
4 | |||
5 | #include "Precomp.h" | ||
6 | |||
7 | #include <string.h> | ||
8 | |||
9 | #include "Md5.h" | ||
10 | #include "RotateDefs.h" | ||
11 | #include "CpuArch.h" | ||
12 | |||
13 | #define MD5_UPDATE_BLOCKS(p) Md5_UpdateBlocks | ||
14 | |||
15 | Z7_NO_INLINE | ||
16 | void Md5_Init(CMd5 *p) | ||
17 | { | ||
18 | p->count = 0; | ||
19 | p->state[0] = 0x67452301; | ||
20 | p->state[1] = 0xefcdab89; | ||
21 | p->state[2] = 0x98badcfe; | ||
22 | p->state[3] = 0x10325476; | ||
23 | } | ||
24 | |||
25 | #if 0 && !defined(MY_CPU_LE_UNALIGN) | ||
26 | // optional optimization for Big-endian processors or processors without unaligned access: | ||
27 | // it is intended to reduce the number of complex LE32 memory reading from 64 to 16. | ||
28 | // But some compilers (sparc, armt) are better without this optimization. | ||
29 | #define Z7_MD5_USE_DATA32_ARRAY | ||
30 | #endif | ||
31 | |||
32 | #define LOAD_DATA(i) GetUi32((const UInt32 *)(const void *)data + (i)) | ||
33 | |||
34 | #ifdef Z7_MD5_USE_DATA32_ARRAY | ||
35 | #define D(i) data32[i] | ||
36 | #else | ||
37 | #define D(i) LOAD_DATA(i) | ||
38 | #endif | ||
39 | |||
40 | #define F1(x, y, z) (z ^ (x & (y ^ z))) | ||
41 | #define F2(x, y, z) F1(z, x, y) | ||
42 | #define F3(x, y, z) (x ^ y ^ z) | ||
43 | #define F4(x, y, z) (y ^ (x | ~z)) | ||
44 | |||
45 | #define R1(i, f, start, step, w, x, y, z, s, k) \ | ||
46 | w += D((start + step * (i)) % 16) + k; \ | ||
47 | w += f(x, y, z); \ | ||
48 | w = rotlFixed(w, s) + x; \ | ||
49 | |||
50 | #define R4(i4, f, start, step, s0,s1,s2,s3, k0,k1,k2,k3) \ | ||
51 | R1 (i4*4+0, f, start, step, a,b,c,d, s0, k0) \ | ||
52 | R1 (i4*4+1, f, start, step, d,a,b,c, s1, k1) \ | ||
53 | R1 (i4*4+2, f, start, step, c,d,a,b, s2, k2) \ | ||
54 | R1 (i4*4+3, f, start, step, b,c,d,a, s3, k3) \ | ||
55 | |||
56 | #define R16(f, start, step, s0,s1,s2,s3, k00,k01,k02,k03, k10,k11,k12,k13, k20,k21,k22,k23, k30,k31,k32,k33) \ | ||
57 | R4 (0, f, start, step, s0,s1,s2,s3, k00,k01,k02,k03) \ | ||
58 | R4 (1, f, start, step, s0,s1,s2,s3, k10,k11,k12,k13) \ | ||
59 | R4 (2, f, start, step, s0,s1,s2,s3, k20,k21,k22,k23) \ | ||
60 | R4 (3, f, start, step, s0,s1,s2,s3, k30,k31,k32,k33) \ | ||
61 | |||
62 | static | ||
63 | Z7_NO_INLINE | ||
64 | void Z7_FASTCALL Md5_UpdateBlocks(UInt32 state[4], const Byte *data, size_t numBlocks) | ||
65 | { | ||
66 | UInt32 a, b, c, d; | ||
67 | // if (numBlocks == 0) return; | ||
68 | a = state[0]; | ||
69 | b = state[1]; | ||
70 | c = state[2]; | ||
71 | d = state[3]; | ||
72 | do | ||
73 | { | ||
74 | #ifdef Z7_MD5_USE_DATA32_ARRAY | ||
75 | UInt32 data32[MD5_NUM_BLOCK_WORDS]; | ||
76 | { | ||
77 | #define LOAD_data32_x4(i) { \ | ||
78 | data32[i ] = LOAD_DATA(i ); \ | ||
79 | data32[i + 1] = LOAD_DATA(i + 1); \ | ||
80 | data32[i + 2] = LOAD_DATA(i + 2); \ | ||
81 | data32[i + 3] = LOAD_DATA(i + 3); } | ||
82 | #if 1 | ||
83 | LOAD_data32_x4 (0 * 4) | ||
84 | LOAD_data32_x4 (1 * 4) | ||
85 | LOAD_data32_x4 (2 * 4) | ||
86 | LOAD_data32_x4 (3 * 4) | ||
87 | #else | ||
88 | unsigned i; | ||
89 | for (i = 0; i < MD5_NUM_BLOCK_WORDS; i += 4) | ||
90 | { | ||
91 | LOAD_data32_x4(i) | ||
92 | } | ||
93 | #endif | ||
94 | } | ||
95 | #endif | ||
96 | |||
97 | R16 (F1, 0, 1, 7,12,17,22, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, | ||
98 | 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, | ||
99 | 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, | ||
100 | 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821) | ||
101 | R16 (F2, 1, 5, 5, 9,14,20, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, | ||
102 | 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, | ||
103 | 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, | ||
104 | 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a) | ||
105 | R16 (F3, 5, 3, 4,11,16,23, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, | ||
106 | 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, | ||
107 | 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, | ||
108 | 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665) | ||
109 | R16 (F4, 0, 7, 6,10,15,21, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, | ||
110 | 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, | ||
111 | 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, | ||
112 | 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391) | ||
113 | |||
114 | a += state[0]; | ||
115 | b += state[1]; | ||
116 | c += state[2]; | ||
117 | d += state[3]; | ||
118 | |||
119 | state[0] = a; | ||
120 | state[1] = b; | ||
121 | state[2] = c; | ||
122 | state[3] = d; | ||
123 | |||
124 | data += MD5_BLOCK_SIZE; | ||
125 | } | ||
126 | while (--numBlocks); | ||
127 | } | ||
128 | |||
129 | |||
130 | #define Md5_UpdateBlock(p) MD5_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) | ||
131 | |||
132 | void Md5_Update(CMd5 *p, const Byte *data, size_t size) | ||
133 | { | ||
134 | if (size == 0) | ||
135 | return; | ||
136 | { | ||
137 | const unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1); | ||
138 | const unsigned num = MD5_BLOCK_SIZE - pos; | ||
139 | p->count += size; | ||
140 | if (num > size) | ||
141 | { | ||
142 | memcpy(p->buffer + pos, data, size); | ||
143 | return; | ||
144 | } | ||
145 | if (pos != 0) | ||
146 | { | ||
147 | size -= num; | ||
148 | memcpy(p->buffer + pos, data, num); | ||
149 | data += num; | ||
150 | Md5_UpdateBlock(p); | ||
151 | } | ||
152 | } | ||
153 | { | ||
154 | const size_t numBlocks = size >> 6; | ||
155 | if (numBlocks) | ||
156 | MD5_UPDATE_BLOCKS(p)(p->state, data, numBlocks); | ||
157 | size &= MD5_BLOCK_SIZE - 1; | ||
158 | if (size == 0) | ||
159 | return; | ||
160 | data += (numBlocks << 6); | ||
161 | memcpy(p->buffer, data, size); | ||
162 | } | ||
163 | } | ||
164 | |||
165 | |||
166 | void Md5_Final(CMd5 *p, Byte *digest) | ||
167 | { | ||
168 | unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1); | ||
169 | p->buffer[pos++] = 0x80; | ||
170 | if (pos > (MD5_BLOCK_SIZE - 4 * 2)) | ||
171 | { | ||
172 | while (pos != MD5_BLOCK_SIZE) { p->buffer[pos++] = 0; } | ||
173 | // memset(&p->buf.buffer[pos], 0, MD5_BLOCK_SIZE - pos); | ||
174 | Md5_UpdateBlock(p); | ||
175 | pos = 0; | ||
176 | } | ||
177 | memset(&p->buffer[pos], 0, (MD5_BLOCK_SIZE - 4 * 2) - pos); | ||
178 | { | ||
179 | const UInt64 numBits = p->count << 3; | ||
180 | #if defined(MY_CPU_LE_UNALIGN) | ||
181 | SetUi64 (p->buffer + MD5_BLOCK_SIZE - 4 * 2, numBits) | ||
182 | #else | ||
183 | SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 2, (UInt32)(numBits)) | ||
184 | SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 1, (UInt32)(numBits >> 32)) | ||
185 | #endif | ||
186 | } | ||
187 | Md5_UpdateBlock(p); | ||
188 | |||
189 | SetUi32(digest, p->state[0]) | ||
190 | SetUi32(digest + 4, p->state[1]) | ||
191 | SetUi32(digest + 8, p->state[2]) | ||
192 | SetUi32(digest + 12, p->state[3]) | ||
193 | |||
194 | Md5_Init(p); | ||
195 | } | ||
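
Unlike the SHA family later in this commit (which stores the length big-endian via SetBe32), Md5_Final stores the 64-bit bit count little-endian in the last 8 bytes of the block. A worked example of the final block for the 3-byte message "abc" (count = 3, numBits = 24 = 0x18):

  bytes 0..55 : 61 62 63 80 00 00 ... 00    (message, 0x80 marker, zero padding)
  bytes 56..63: 18 00 00 00 00 00 00 00     (bit count, little-endian)
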
196 | |||
197 | #undef R1 | ||
198 | #undef R4 | ||
199 | #undef R16 | ||
200 | #undef D | ||
201 | #undef LOAD_DATA | ||
202 | #undef LOAD_data32_x4 | ||
203 | #undef F1 | ||
204 | #undef F2 | ||
205 | #undef F3 | ||
206 | #undef F4 | ||
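
To make the macro scheme concrete: the first R1 step of round 1 (i = 0, shift 7, constant 0xd76aa478) computes the following. This standalone sketch mirrors the macros with portable stdint types; it is an illustration, not part of the file:

#include <stdint.h>

/* One MD5 step, as expanded from R1(0, F1, 0, 1, a,b,c,d, 7, 0xd76aa478);
   x0 is the first little-endian message word, i.e. LOAD_DATA(0). */
static uint32_t Md5_Step0_Sketch(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
    uint32_t x0)
{
  a += x0 + 0xd76aa478u;            /* w += D(0) + k      */
  a += d ^ (b & (c ^ d));           /* w += F1(b, c, d)   */
  a = ((a << 7) | (a >> 25)) + b;   /* w = rotl(w, 7) + x */
  return a;
}
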
diff --git a/C/Md5.h b/C/Md5.h new file mode 100644 --- /dev/null +++ b/C/Md5.h | |||
@@ -0,0 +1,34 @@ | |||
1 | /* Md5.h -- MD5 Hash | ||
2 | : Igor Pavlov : Public domain */ | ||
3 | |||
4 | #ifndef ZIP7_INC_MD5_H | ||
5 | #define ZIP7_INC_MD5_H | ||
6 | |||
7 | #include "7zTypes.h" | ||
8 | |||
9 | EXTERN_C_BEGIN | ||
10 | |||
11 | #define MD5_NUM_BLOCK_WORDS 16 | ||
12 | #define MD5_NUM_DIGEST_WORDS 4 | ||
13 | |||
14 | #define MD5_BLOCK_SIZE (MD5_NUM_BLOCK_WORDS * 4) | ||
15 | #define MD5_DIGEST_SIZE (MD5_NUM_DIGEST_WORDS * 4) | ||
16 | |||
17 | typedef struct | ||
18 | { | ||
19 | UInt64 count; | ||
20 | UInt64 _pad_1; | ||
21 | // we want 16-bytes alignment here | ||
22 | UInt32 state[MD5_NUM_DIGEST_WORDS]; | ||
23 | UInt64 _pad_2[4]; | ||
24 | // we want 64-bytes alignment here | ||
25 | Byte buffer[MD5_BLOCK_SIZE]; | ||
26 | } CMd5; | ||
27 | |||
28 | void Md5_Init(CMd5 *p); | ||
29 | void Md5_Update(CMd5 *p, const Byte *data, size_t size); | ||
30 | void Md5_Final(CMd5 *p, Byte *digest); | ||
31 | |||
32 | EXTERN_C_END | ||
33 | |||
34 | #endif | ||
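
A minimal usage sketch for the API declared above (compile and link C/Md5.c from this source tree; the include path assumption is mine). The expected output for "abc" is the well-known digest 900150983cd24fb0d6963f7d28e17f72:

#include <stdio.h>
#include "Md5.h"

int main(void)
{
  CMd5 md5;
  Byte digest[MD5_DIGEST_SIZE];
  unsigned i;
  Md5_Init(&md5);
  Md5_Update(&md5, (const Byte *)"abc", 3);
  Md5_Final(&md5, digest);
  for (i = 0; i < MD5_DIGEST_SIZE; i++)
    printf("%02x", digest[i]);
  printf("\n");
  return 0;
}
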
diff --git a/C/MtCoder.c b/C/MtCoder.c index 03959b6..923b19a 100644 --- a/C/MtCoder.c +++ b/C/MtCoder.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* MtCoder.c -- Multi-thread Coder | 1 | /* MtCoder.c -- Multi-thread Coder |
2 | 2023-09-07 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -39,14 +39,28 @@ void MtProgressThunk_CreateVTable(CMtProgressThunk *p) | |||
39 | static THREAD_FUNC_DECL ThreadFunc(void *pp); | 39 | static THREAD_FUNC_DECL ThreadFunc(void *pp); |
40 | 40 | ||
41 | 41 | ||
42 | static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) | 42 | static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t |
43 | #ifdef _WIN32 | ||
44 | , CMtCoder * const mtc | ||
45 | #endif | ||
46 | ) | ||
43 | { | 47 | { |
44 | WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent); | 48 | WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent); |
49 | // printf("\n====== MtCoderThread_CreateAndStart : \n"); | ||
45 | if (wres == 0) | 50 | if (wres == 0) |
46 | { | 51 | { |
47 | t->stop = False; | 52 | t->stop = False; |
48 | if (!Thread_WasCreated(&t->thread)) | 53 | if (!Thread_WasCreated(&t->thread)) |
49 | wres = Thread_Create(&t->thread, ThreadFunc, t); | 54 | { |
55 | #ifdef _WIN32 | ||
56 | if (mtc->numThreadGroups) | ||
57 | wres = Thread_Create_With_Group(&t->thread, ThreadFunc, t, | ||
58 | ThreadNextGroup_GetNext(&mtc->nextGroup), // group | ||
59 | 0); // affinityMask | ||
60 | else | ||
61 | #endif | ||
62 | wres = Thread_Create(&t->thread, ThreadFunc, t); | ||
63 | } | ||
50 | if (wres == 0) | 64 | if (wres == 0) |
51 | wres = Event_Set(&t->startEvent); | 65 | wres = Event_Set(&t->startEvent); |
52 | } | 66 | } |
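
ThreadNextGroup_GetNext supplies the processor-group index for each newly created thread. Its implementation lives in Threads.c (also changed in this release) and is not shown in this diff; a plausible reading, given here only as a hedged sketch with invented names, is a simple round-robin over the configured number of groups:

/* Hypothetical round-robin selector; CThreadNextGroup_Sketch and
   ThreadNextGroup_GetNext_Sketch are illustrative names, not project API. */
typedef struct
{
  unsigned numGroups;   /* 0 means "no group scheduling" */
  unsigned nextGroup;
} CThreadNextGroup_Sketch;

static unsigned ThreadNextGroup_GetNext_Sketch(CThreadNextGroup_Sketch *p)
{
  const unsigned group = p->nextGroup;
  if (p->numGroups == 0)
    return 0;
  p->nextGroup = (group + 1) % p->numGroups;
  return group;
}
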
@@ -56,6 +70,7 @@ static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) | |||
56 | } | 70 | } |
57 | 71 | ||
58 | 72 | ||
73 | Z7_FORCE_INLINE | ||
59 | static void MtCoderThread_Destruct(CMtCoderThread *t) | 74 | static void MtCoderThread_Destruct(CMtCoderThread *t) |
60 | { | 75 | { |
61 | if (Thread_WasCreated(&t->thread)) | 76 | if (Thread_WasCreated(&t->thread)) |
@@ -85,7 +100,7 @@ static void MtCoderThread_Destruct(CMtCoderThread *t) | |||
85 | 100 | ||
86 | static SRes ThreadFunc2(CMtCoderThread *t) | 101 | static SRes ThreadFunc2(CMtCoderThread *t) |
87 | { | 102 | { |
88 | CMtCoder *mtc = t->mtCoder; | 103 | CMtCoder * const mtc = t->mtCoder; |
89 | 104 | ||
90 | for (;;) | 105 | for (;;) |
91 | { | 106 | { |
@@ -185,7 +200,11 @@ static SRes ThreadFunc2(CMtCoderThread *t) | |||
185 | if (mtc->numStartedThreads < mtc->numStartedThreadsLimit | 200 | if (mtc->numStartedThreads < mtc->numStartedThreadsLimit |
186 | && mtc->expectedDataSize != readProcessed) | 201 | && mtc->expectedDataSize != readProcessed) |
187 | { | 202 | { |
188 | res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]); | 203 | res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads] |
204 | #ifdef _WIN32 | ||
205 | , mtc | ||
206 | #endif | ||
207 | ); | ||
189 | if (res == SZ_OK) | 208 | if (res == SZ_OK) |
190 | mtc->numStartedThreads++; | 209 | mtc->numStartedThreads++; |
191 | else | 210 | else |
@@ -221,7 +240,7 @@ static SRes ThreadFunc2(CMtCoderThread *t) | |||
221 | } | 240 | } |
222 | 241 | ||
223 | { | 242 | { |
224 | CMtCoderBlock *block = &mtc->blocks[bi]; | 243 | CMtCoderBlock * const block = &mtc->blocks[bi]; |
225 | block->res = res; | 244 | block->res = res; |
226 | block->bufIndex = bufIndex; | 245 | block->bufIndex = bufIndex; |
227 | block->finished = finished; | 246 | block->finished = finished; |
@@ -311,7 +330,7 @@ static SRes ThreadFunc2(CMtCoderThread *t) | |||
311 | 330 | ||
312 | static THREAD_FUNC_DECL ThreadFunc(void *pp) | 331 | static THREAD_FUNC_DECL ThreadFunc(void *pp) |
313 | { | 332 | { |
314 | CMtCoderThread *t = (CMtCoderThread *)pp; | 333 | CMtCoderThread * const t = (CMtCoderThread *)pp; |
315 | for (;;) | 334 | for (;;) |
316 | { | 335 | { |
317 | if (Event_Wait(&t->startEvent) != 0) | 336 | if (Event_Wait(&t->startEvent) != 0) |
@@ -319,7 +338,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp) | |||
319 | if (t->stop) | 338 | if (t->stop) |
320 | return 0; | 339 | return 0; |
321 | { | 340 | { |
322 | SRes res = ThreadFunc2(t); | 341 | const SRes res = ThreadFunc2(t); |
323 | CMtCoder *mtc = t->mtCoder; | 342 | CMtCoder *mtc = t->mtCoder; |
324 | if (res != SZ_OK) | 343 | if (res != SZ_OK) |
325 | { | 344 | { |
@@ -328,7 +347,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp) | |||
328 | 347 | ||
329 | #ifndef MTCODER_USE_WRITE_THREAD | 348 | #ifndef MTCODER_USE_WRITE_THREAD |
330 | { | 349 | { |
331 | unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); | 350 | const unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); |
332 | if (numFinished == mtc->numStartedThreads) | 351 | if (numFinished == mtc->numStartedThreads) |
333 | if (Event_Set(&mtc->finishedEvent) != 0) | 352 | if (Event_Set(&mtc->finishedEvent) != 0) |
334 | return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; | 353 | return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; |
@@ -346,6 +365,7 @@ void MtCoder_Construct(CMtCoder *p) | |||
346 | 365 | ||
347 | p->blockSize = 0; | 366 | p->blockSize = 0; |
348 | p->numThreadsMax = 0; | 367 | p->numThreadsMax = 0; |
368 | p->numThreadGroups = 0; | ||
349 | p->expectedDataSize = (UInt64)(Int64)-1; | 369 | p->expectedDataSize = (UInt64)(Int64)-1; |
350 | 370 | ||
351 | p->inStream = NULL; | 371 | p->inStream = NULL; |
@@ -429,6 +449,8 @@ SRes MtCoder_Code(CMtCoder *p) | |||
429 | unsigned i; | 449 | unsigned i; |
430 | SRes res = SZ_OK; | 450 | SRes res = SZ_OK; |
431 | 451 | ||
452 | // printf("\n====== MtCoder_Code : \n"); | ||
453 | |||
432 | if (numThreads > MTCODER_THREADS_MAX) | 454 | if (numThreads > MTCODER_THREADS_MAX) |
433 | numThreads = MTCODER_THREADS_MAX; | 455 | numThreads = MTCODER_THREADS_MAX; |
434 | numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); | 456 | numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); |
@@ -492,11 +514,22 @@ SRes MtCoder_Code(CMtCoder *p) | |||
492 | 514 | ||
493 | p->numStartedThreadsLimit = numThreads; | 515 | p->numStartedThreadsLimit = numThreads; |
494 | p->numStartedThreads = 0; | 516 | p->numStartedThreads = 0; |
517 | ThreadNextGroup_Init(&p->nextGroup, p->numThreadGroups, 0); // startGroup | ||
495 | 518 | ||
496 | // for (i = 0; i < numThreads; i++) | 519 | // for (i = 0; i < numThreads; i++) |
497 | { | 520 | { |
521 | // Here we create a new thread for the first block. | ||
522 | // Each new thread will create another new thread after reading a block, | ||
523 | // until numStartedThreadsLimit is reached. | ||
498 | CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++]; | 524 | CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++]; |
499 | RINOK(MtCoderThread_CreateAndStart(nextThread)) | 525 | { |
526 | const SRes res2 = MtCoderThread_CreateAndStart(nextThread | ||
527 | #ifdef _WIN32 | ||
528 | , p | ||
529 | #endif | ||
530 | ); | ||
531 | RINOK(res2) | ||
532 | } | ||
500 | } | 533 | } |
501 | 534 | ||
502 | RINOK_THREAD(Event_Set(&p->readEvent)) | 535 | RINOK_THREAD(Event_Set(&p->readEvent)) |
@@ -513,9 +546,9 @@ SRes MtCoder_Code(CMtCoder *p) | |||
513 | RINOK_THREAD(Event_Wait(&p->writeEvents[bi])) | 546 | RINOK_THREAD(Event_Wait(&p->writeEvents[bi])) |
514 | 547 | ||
515 | { | 548 | { |
516 | const CMtCoderBlock *block = &p->blocks[bi]; | 549 | const CMtCoderBlock * const block = &p->blocks[bi]; |
517 | unsigned bufIndex = block->bufIndex; | 550 | const unsigned bufIndex = block->bufIndex; |
518 | BoolInt finished = block->finished; | 551 | const BoolInt finished = block->finished; |
519 | if (res == SZ_OK && block->res != SZ_OK) | 552 | if (res == SZ_OK && block->res != SZ_OK) |
520 | res = block->res; | 553 | res = block->res; |
521 | 554 | ||
@@ -545,7 +578,7 @@ SRes MtCoder_Code(CMtCoder *p) | |||
545 | } | 578 | } |
546 | #else | 579 | #else |
547 | { | 580 | { |
548 | WRes wres = Event_Wait(&p->finishedEvent); | 581 | const WRes wres = Event_Wait(&p->finishedEvent); |
549 | res = MY_SRes_HRESULT_FROM_WRes(wres); | 582 | res = MY_SRes_HRESULT_FROM_WRes(wres); |
550 | } | 583 | } |
551 | #endif | 584 | #endif |
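
The staged-start comment in MtCoder_Code (each worker starts the next one after claiming its first block) can be illustrated with a small self-contained sketch. Plain pthreads and the fixed NUM_WORKERS limit here stand in for the project's Threads.h wrappers; this is an illustration, not project code. Build with -pthread:

#include <stdio.h>
#include <pthread.h>

#define NUM_WORKERS 4

static pthread_t g_threads[NUM_WORKERS];
static unsigned g_numStarted = 1;  /* worker 0 is started by main() */
static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;

static void *worker(void *arg)
{
  const unsigned id = (unsigned)(size_t)arg;
  /* ... read/claim an input block here ... */
  pthread_mutex_lock(&g_lock);
  if (g_numStarted < NUM_WORKERS)
  {
    const unsigned next = g_numStarted++;
    pthread_create(&g_threads[next], NULL, worker, (void *)(size_t)next);
  }
  pthread_mutex_unlock(&g_lock);
  printf("worker %u: compressing its block\n", id);
  return NULL;
}

int main(void)
{
  unsigned i;
  pthread_create(&g_threads[0], NULL, worker, (void *)(size_t)0);
  for (i = 0; i < NUM_WORKERS; i++)
  {
    unsigned started;
    /* thread i stored g_threads[i + 1] before exiting, so this join is safe */
    pthread_join(g_threads[i], NULL);
    pthread_mutex_lock(&g_lock);
    started = g_numStarted;
    pthread_mutex_unlock(&g_lock);
    if (i + 1 >= started)
      break;  /* no further workers were started */
  }
  return 0;
}
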
diff --git a/C/MtCoder.h b/C/MtCoder.h index 1231d3c..8166cca 100644 --- a/C/MtCoder.h +++ b/C/MtCoder.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* MtCoder.h -- Multi-thread Coder | 1 | /* MtCoder.h -- Multi-thread Coder |
2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_MT_CODER_H | 4 | #ifndef ZIP7_INC_MT_CODER_H |
5 | #define ZIP7_INC_MT_CODER_H | 5 | #define ZIP7_INC_MT_CODER_H |
@@ -16,7 +16,7 @@ EXTERN_C_BEGIN | |||
16 | 16 | ||
17 | #ifndef Z7_ST | 17 | #ifndef Z7_ST |
18 | #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1) | 18 | #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1) |
19 | #define MTCODER_THREADS_MAX 64 | 19 | #define MTCODER_THREADS_MAX 256 |
20 | #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3) | 20 | #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3) |
21 | #else | 21 | #else |
22 | #define MTCODER_THREADS_MAX 1 | 22 | #define MTCODER_THREADS_MAX 1 |
@@ -77,6 +77,7 @@ typedef struct CMtCoder_ | |||
77 | 77 | ||
78 | size_t blockSize; /* size of input block */ | 78 | size_t blockSize; /* size of input block */ |
79 | unsigned numThreadsMax; | 79 | unsigned numThreadsMax; |
80 | unsigned numThreadGroups; | ||
80 | UInt64 expectedDataSize; | 81 | UInt64 expectedDataSize; |
81 | 82 | ||
82 | ISeqInStreamPtr inStream; | 83 | ISeqInStreamPtr inStream; |
@@ -125,6 +126,8 @@ typedef struct CMtCoder_ | |||
125 | CMtProgress mtProgress; | 126 | CMtProgress mtProgress; |
126 | CMtCoderBlock blocks[MTCODER_BLOCKS_MAX]; | 127 | CMtCoderBlock blocks[MTCODER_BLOCKS_MAX]; |
127 | CMtCoderThread threads[MTCODER_THREADS_MAX]; | 128 | CMtCoderThread threads[MTCODER_THREADS_MAX]; |
129 | |||
130 | CThreadNextGroup nextGroup; | ||
128 | } CMtCoder; | 131 | } CMtCoder; |
129 | 132 | ||
130 | 133 | ||
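
Worked numbers for the raised limit: MTCODER_GET_NUM_BLOCKS_FROM_THREADS(256) = 256 + 256/8 + 1 = 289, so MTCODER_BLOCKS_MAX = 289 + 3 = 292. Under the previous 64-thread limit it was (64 + 8 + 1) + 3 = 76.
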
diff --git a/C/Sha1.c b/C/Sha1.c --- a/C/Sha1.c +++ b/C/Sha1.c | |||
@@ -1,18 +1,14 @@ | |||
1 | /* Sha1.c -- SHA-1 Hash | 1 | /* Sha1.c -- SHA-1 Hash |
2 | 2024-03-01 : Igor Pavlov : Public domain | 2 | : Igor Pavlov : Public domain |
3 | This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */ | 3 | This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */ |
4 | 4 | ||
5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
6 | 6 | ||
7 | #include <string.h> | 7 | #include <string.h> |
8 | 8 | ||
9 | #include "CpuArch.h" | ||
10 | #include "RotateDefs.h" | ||
11 | #include "Sha1.h" | 9 | #include "Sha1.h" |
12 | 10 | #include "RotateDefs.h" | |
13 | #if defined(_MSC_VER) && (_MSC_VER < 1900) | 11 | #include "CpuArch.h" |
14 | // #define USE_MY_MM | ||
15 | #endif | ||
16 | 12 | ||
17 | #ifdef MY_CPU_X86_OR_AMD64 | 13 | #ifdef MY_CPU_X86_OR_AMD64 |
18 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ | 14 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ |
@@ -56,7 +52,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num | |||
56 | static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS = Sha1_UpdateBlocks; | 52 | static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS = Sha1_UpdateBlocks; |
57 | static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS_HW; | 53 | static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS_HW; |
58 | 54 | ||
59 | #define SHA1_UPDATE_BLOCKS(p) p->func_UpdateBlocks | 55 | #define SHA1_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks |
60 | #else | 56 | #else |
61 | #define SHA1_UPDATE_BLOCKS(p) Sha1_UpdateBlocks | 57 | #define SHA1_UPDATE_BLOCKS(p) Sha1_UpdateBlocks |
62 | #endif | 58 | #endif |
@@ -85,7 +81,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo) | |||
85 | return False; | 81 | return False; |
86 | #endif | 82 | #endif |
87 | 83 | ||
88 | p->func_UpdateBlocks = func; | 84 | p->v.vars.func_UpdateBlocks = func; |
89 | return True; | 85 | return True; |
90 | } | 86 | } |
91 | 87 | ||
@@ -225,7 +221,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo) | |||
225 | 221 | ||
226 | void Sha1_InitState(CSha1 *p) | 222 | void Sha1_InitState(CSha1 *p) |
227 | { | 223 | { |
228 | p->count = 0; | 224 | p->v.vars.count = 0; |
229 | p->state[0] = 0x67452301; | 225 | p->state[0] = 0x67452301; |
230 | p->state[1] = 0xEFCDAB89; | 226 | p->state[1] = 0xEFCDAB89; |
231 | p->state[2] = 0x98BADCFE; | 227 | p->state[2] = 0x98BADCFE; |
@@ -235,7 +231,7 @@ void Sha1_InitState(CSha1 *p) | |||
235 | 231 | ||
236 | void Sha1_Init(CSha1 *p) | 232 | void Sha1_Init(CSha1 *p) |
237 | { | 233 | { |
238 | p->func_UpdateBlocks = | 234 | p->v.vars.func_UpdateBlocks = |
239 | #ifdef Z7_COMPILER_SHA1_SUPPORTED | 235 | #ifdef Z7_COMPILER_SHA1_SUPPORTED |
240 | g_SHA1_FUNC_UPDATE_BLOCKS; | 236 | g_SHA1_FUNC_UPDATE_BLOCKS; |
241 | #else | 237 | #else |
@@ -250,7 +246,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num | |||
250 | { | 246 | { |
251 | UInt32 a, b, c, d, e; | 247 | UInt32 a, b, c, d, e; |
252 | UInt32 W[kNumW]; | 248 | UInt32 W[kNumW]; |
253 | // if (numBlocks != 0x1264378347) return; | 249 | |
254 | if (numBlocks == 0) | 250 | if (numBlocks == 0) |
255 | return; | 251 | return; |
256 | 252 | ||
@@ -283,7 +279,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num | |||
283 | state[3] = d; | 279 | state[3] = d; |
284 | state[4] = e; | 280 | state[4] = e; |
285 | 281 | ||
286 | data += 64; | 282 | data += SHA1_BLOCK_SIZE; |
287 | } | 283 | } |
288 | while (--numBlocks); | 284 | while (--numBlocks); |
289 | } | 285 | } |
@@ -295,20 +291,15 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size) | |||
295 | { | 291 | { |
296 | if (size == 0) | 292 | if (size == 0) |
297 | return; | 293 | return; |
298 | |||
299 | { | 294 | { |
300 | unsigned pos = (unsigned)p->count & 0x3F; | 295 | const unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1); |
301 | unsigned num; | 296 | const unsigned num = SHA1_BLOCK_SIZE - pos; |
302 | 297 | p->v.vars.count += size; | |
303 | p->count += size; | ||
304 | |||
305 | num = 64 - pos; | ||
306 | if (num > size) | 298 | if (num > size) |
307 | { | 299 | { |
308 | memcpy(p->buffer + pos, data, size); | 300 | memcpy(p->buffer + pos, data, size); |
309 | return; | 301 | return; |
310 | } | 302 | } |
311 | |||
312 | if (pos != 0) | 303 | if (pos != 0) |
313 | { | 304 | { |
314 | size -= num; | 305 | size -= num; |
@@ -318,9 +309,10 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size) | |||
318 | } | 309 | } |
319 | } | 310 | } |
320 | { | 311 | { |
321 | size_t numBlocks = size >> 6; | 312 | const size_t numBlocks = size >> 6; |
313 | // if (numBlocks) | ||
322 | SHA1_UPDATE_BLOCKS(p)(p->state, data, numBlocks); | 314 | SHA1_UPDATE_BLOCKS(p)(p->state, data, numBlocks); |
323 | size &= 0x3F; | 315 | size &= SHA1_BLOCK_SIZE - 1; |
324 | if (size == 0) | 316 | if (size == 0) |
325 | return; | 317 | return; |
326 | data += (numBlocks << 6); | 318 | data += (numBlocks << 6); |
@@ -331,42 +323,21 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size) | |||
331 | 323 | ||
332 | void Sha1_Final(CSha1 *p, Byte *digest) | 324 | void Sha1_Final(CSha1 *p, Byte *digest) |
333 | { | 325 | { |
334 | unsigned pos = (unsigned)p->count & 0x3F; | 326 | unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1); |
335 | |||
336 | |||
337 | p->buffer[pos++] = 0x80; | 327 | p->buffer[pos++] = 0x80; |
338 | 328 | if (pos > (SHA1_BLOCK_SIZE - 4 * 2)) | |
339 | if (pos > (64 - 8)) | ||
340 | { | 329 | { |
341 | while (pos != 64) { p->buffer[pos++] = 0; } | 330 | while (pos != SHA1_BLOCK_SIZE) { p->buffer[pos++] = 0; } |
342 | // memset(&p->buf.buffer[pos], 0, 64 - pos); | 331 | // memset(&p->buf.buffer[pos], 0, SHA1_BLOCK_SIZE - pos); |
343 | Sha1_UpdateBlock(p); | 332 | Sha1_UpdateBlock(p); |
344 | pos = 0; | 333 | pos = 0; |
345 | } | 334 | } |
346 | 335 | memset(&p->buffer[pos], 0, (SHA1_BLOCK_SIZE - 4 * 2) - pos); | |
347 | /* | ||
348 | if (pos & 3) | ||
349 | { | ||
350 | p->buffer[pos] = 0; | ||
351 | p->buffer[pos + 1] = 0; | ||
352 | p->buffer[pos + 2] = 0; | ||
353 | pos += 3; | ||
354 | pos &= ~3; | ||
355 | } | ||
356 | { | ||
357 | for (; pos < 64 - 8; pos += 4) | ||
358 | *(UInt32 *)(&p->buffer[pos]) = 0; | ||
359 | } | ||
360 | */ | ||
361 | |||
362 | memset(&p->buffer[pos], 0, (64 - 8) - pos); | ||
363 | |||
364 | { | 336 | { |
365 | const UInt64 numBits = (p->count << 3); | 337 | const UInt64 numBits = p->v.vars.count << 3; |
366 | SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32)) | 338 | SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32)) |
367 | SetBe32(p->buffer + 64 - 4, (UInt32)(numBits)) | 339 | SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 1, (UInt32)(numBits)) |
368 | } | 340 | } |
369 | |||
370 | Sha1_UpdateBlock(p); | 341 | Sha1_UpdateBlock(p); |
371 | 342 | ||
372 | SetBe32(digest, p->state[0]) | 343 | SetBe32(digest, p->state[0]) |
@@ -375,16 +346,13 @@ void Sha1_Final(CSha1 *p, Byte *digest) | |||
375 | SetBe32(digest + 12, p->state[3]) | 346 | SetBe32(digest + 12, p->state[3]) |
376 | SetBe32(digest + 16, p->state[4]) | 347 | SetBe32(digest + 16, p->state[4]) |
377 | 348 | ||
378 | |||
379 | |||
380 | |||
381 | Sha1_InitState(p); | 349 | Sha1_InitState(p); |
382 | } | 350 | } |
383 | 351 | ||
384 | 352 | ||
385 | void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size) | 353 | void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size) |
386 | { | 354 | { |
387 | const UInt64 numBits = (p->count + size) << 3; | 355 | const UInt64 numBits = (p->v.vars.count + size) << 3; |
388 | SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 2], (UInt32)(numBits >> 32)) | 356 | SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 2], (UInt32)(numBits >> 32)) |
389 | SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 1], (UInt32)(numBits)) | 357 | SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 1], (UInt32)(numBits)) |
390 | // SetBe32((UInt32 *)(block + size), 0x80000000); | 358 | // SetBe32((UInt32 *)(block + size), 0x80000000); |
@@ -420,57 +388,32 @@ void Sha1_GetBlockDigest(const CSha1 *p, const Byte *data, Byte *destDigest) | |||
420 | 388 | ||
421 | void Sha1Prepare(void) | 389 | void Sha1Prepare(void) |
422 | { | 390 | { |
423 | #ifdef Z7_COMPILER_SHA1_SUPPORTED | 391 | #ifdef Z7_COMPILER_SHA1_SUPPORTED |
424 | SHA1_FUNC_UPDATE_BLOCKS f, f_hw; | 392 | SHA1_FUNC_UPDATE_BLOCKS f, f_hw; |
425 | f = Sha1_UpdateBlocks; | 393 | f = Sha1_UpdateBlocks; |
426 | f_hw = NULL; | 394 | f_hw = NULL; |
427 | #ifdef MY_CPU_X86_OR_AMD64 | 395 | #ifdef MY_CPU_X86_OR_AMD64 |
428 | #ifndef USE_MY_MM | ||
429 | if (CPU_IsSupported_SHA() | 396 | if (CPU_IsSupported_SHA() |
430 | && CPU_IsSupported_SSSE3() | 397 | && CPU_IsSupported_SSSE3() |
431 | // && CPU_IsSupported_SSE41() | ||
432 | ) | 398 | ) |
433 | #endif | 399 | #else |
434 | #else | ||
435 | if (CPU_IsSupported_SHA1()) | 400 | if (CPU_IsSupported_SHA1()) |
436 | #endif | 401 | #endif |
437 | { | 402 | { |
438 | // printf("\n========== HW SHA1 ======== \n"); | 403 | // printf("\n========== HW SHA1 ======== \n"); |
439 | #if 0 && defined(MY_CPU_ARM_OR_ARM64) && defined(_MSC_VER) | 404 | #if 1 && defined(MY_CPU_ARM_OR_ARM64) && defined(Z7_MSC_VER_ORIGINAL) && (_MSC_FULL_VER < 192930037) |
440 | /* there was a bug in the MSVC compiler for ARM64 -O2 before VS2019 16.10 (19.29.30037). | 405 | /* there was a bug in the MSVC compiler for ARM64 -O2 before VS2019 16.10 (19.29.30037).
441 | It generated incorrect SHA-1 code. | 406 | It generated incorrect SHA-1 code. */ |
442 | 21.03 : we test sha1-hardware code at runtime initialization */ | 407 | #pragma message("== SHA1 code can work incorrectly with this compiler") |
443 | 408 | #error Stop_Compiling_MSC_Compiler_BUG_SHA1 | |
444 | #pragma message("== SHA1 code: MSC compiler : failure-check code was inserted") | 409 | #endif |
445 | |||
446 | UInt32 state[5] = { 0, 1, 2, 3, 4 } ; | ||
447 | Byte data[64]; | ||
448 | unsigned i; | ||
449 | for (i = 0; i < sizeof(data); i += 2) | ||
450 | { | ||
451 | data[i ] = (Byte)(i); | ||
452 | data[i + 1] = (Byte)(i + 1); | ||
453 | } | ||
454 | |||
455 | Sha1_UpdateBlocks_HW(state, data, sizeof(data) / 64); | ||
456 | |||
457 | if ( state[0] != 0x9acd7297 | ||
458 | || state[1] != 0x4624d898 | ||
459 | || state[2] != 0x0bf079f0 | ||
460 | || state[3] != 0x031e61b3 | ||
461 | || state[4] != 0x8323fe20) | ||
462 | { | ||
463 | // printf("\n========== SHA-1 hardware version failure ======== \n"); | ||
464 | } | ||
465 | else | ||
466 | #endif | ||
467 | { | 410 | { |
468 | f = f_hw = Sha1_UpdateBlocks_HW; | 411 | f = f_hw = Sha1_UpdateBlocks_HW; |
469 | } | 412 | } |
470 | } | 413 | } |
471 | g_SHA1_FUNC_UPDATE_BLOCKS = f; | 414 | g_SHA1_FUNC_UPDATE_BLOCKS = f; |
472 | g_SHA1_FUNC_UPDATE_BLOCKS_HW = f_hw; | 415 | g_SHA1_FUNC_UPDATE_BLOCKS_HW = f_hw; |
473 | #endif | 416 | #endif |
474 | } | 417 | } |
475 | 418 | ||
476 | #undef kNumW | 419 | #undef kNumW |
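
A minimal usage sketch for this module (link C/Sha1.c, C/Sha1Opt.c, and C/CpuArch.c from this source tree; the include path assumption is mine). Sha1Prepare() selects the hardware or software implementation once at startup; SHA-1("abc") is the well-known a9993e364706816aba3e25717850c26c9cd0d89d:

#include <stdio.h>
#include "Sha1.h"

int main(void)
{
  CSha1 sha;
  Byte digest[SHA1_DIGEST_SIZE];
  unsigned i;
  Sha1Prepare();
  Sha1_Init(&sha);
  Sha1_Update(&sha, (const Byte *)"abc", 3);
  Sha1_Final(&sha, digest);
  for (i = 0; i < SHA1_DIGEST_SIZE; i++)
    printf("%02x", digest[i]);
  printf("\n");
  return 0;
}
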
diff --git a/C/Sha1.h b/C/Sha1.h --- a/C/Sha1.h +++ b/C/Sha1.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Sha1.h -- SHA-1 Hash | 1 | /* Sha1.h -- SHA-1 Hash |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_SHA1_H | 4 | #ifndef ZIP7_INC_SHA1_H |
5 | #define ZIP7_INC_SHA1_H | 5 | #define ZIP7_INC_SHA1_H |
@@ -14,6 +14,9 @@ EXTERN_C_BEGIN | |||
14 | #define SHA1_BLOCK_SIZE (SHA1_NUM_BLOCK_WORDS * 4) | 14 | #define SHA1_BLOCK_SIZE (SHA1_NUM_BLOCK_WORDS * 4) |
15 | #define SHA1_DIGEST_SIZE (SHA1_NUM_DIGEST_WORDS * 4) | 15 | #define SHA1_DIGEST_SIZE (SHA1_NUM_DIGEST_WORDS * 4) |
16 | 16 | ||
17 | |||
18 | |||
19 | |||
17 | typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte *data, size_t numBlocks); | 20 | typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte *data, size_t numBlocks); |
18 | 21 | ||
19 | /* | 22 | /* |
@@ -32,9 +35,16 @@ typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte | |||
32 | 35 | ||
33 | typedef struct | 36 | typedef struct |
34 | { | 37 | { |
35 | SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks; | 38 | union |
36 | UInt64 count; | 39 | { |
37 | UInt64 _pad_2[2]; | 40 | struct |
41 | { | ||
42 | SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks; | ||
43 | UInt64 count; | ||
44 | } vars; | ||
45 | UInt64 _pad_64bit[4]; | ||
46 | void *_pad_align_ptr[2]; | ||
47 | } v; | ||
38 | UInt32 state[SHA1_NUM_DIGEST_WORDS]; | 48 | UInt32 state[SHA1_NUM_DIGEST_WORDS]; |
39 | UInt32 _pad_3[3]; | 49 | UInt32 _pad_3[3]; |
40 | Byte buffer[SHA1_BLOCK_SIZE]; | 50 | Byte buffer[SHA1_BLOCK_SIZE]; |
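
The union gives the header a fixed size regardless of pointer width, so that state and buffer land on aligned boundaries. A compile-time check sketch (C11): on common 32- and 64-bit targets the union pads the header to 32 bytes, placing buffer at offset 64; that offset is derived from the struct as shown, not a documented guarantee:

#include <stddef.h>
#include "Sha1.h"

/* derived expectation, not project code */
_Static_assert(offsetof(CSha1, buffer) % 16 == 0,
    "CSha1.buffer expected on a 16-byte boundary");
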
diff --git a/C/Sha1Opt.c b/C/Sha1Opt.c index 4e835f1..8738b94 100644 --- a/C/Sha1Opt.c +++ b/C/Sha1Opt.c | |||
@@ -1,18 +1,11 @@ | |||
1 | /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions | 1 | /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions |
2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | #include "Compiler.h" | 5 | #include "Compiler.h" |
6 | #include "CpuArch.h" | 6 | #include "CpuArch.h" |
7 | 7 | ||
8 | #if defined(_MSC_VER) | ||
9 | #if (_MSC_VER < 1900) && (_MSC_VER >= 1200) | ||
10 | // #define USE_MY_MM | ||
11 | #endif | ||
12 | #endif | ||
13 | |||
14 | // #define Z7_USE_HW_SHA_STUB // for debug | 8 | // #define Z7_USE_HW_SHA_STUB // for debug |
15 | |||
16 | #ifdef MY_CPU_X86_OR_AMD64 | 9 | #ifdef MY_CPU_X86_OR_AMD64 |
17 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check | 10 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check |
18 | #define USE_HW_SHA | 11 | #define USE_HW_SHA |
@@ -20,19 +13,14 @@ | |||
20 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ | 13 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ |
21 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) | 14 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) |
22 | #define USE_HW_SHA | 15 | #define USE_HW_SHA |
23 | #if !defined(_INTEL_COMPILER) | 16 | #if !defined(__INTEL_COMPILER) |
24 | // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) | 17 | // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) |
25 | #if !defined(__SHA__) || !defined(__SSSE3__) | 18 | #if !defined(__SHA__) || !defined(__SSSE3__) |
26 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) | 19 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) |
27 | #endif | 20 | #endif |
28 | #endif | 21 | #endif |
29 | #elif defined(_MSC_VER) | 22 | #elif defined(_MSC_VER) |
30 | #ifdef USE_MY_MM | 23 | #if (_MSC_VER >= 1900) |
31 | #define USE_VER_MIN 1300 | ||
32 | #else | ||
33 | #define USE_VER_MIN 1900 | ||
34 | #endif | ||
35 | #if (_MSC_VER >= USE_VER_MIN) | ||
36 | #define USE_HW_SHA | 24 | #define USE_HW_SHA |
37 | #else | 25 | #else |
38 | #define Z7_USE_HW_SHA_STUB | 26 | #define Z7_USE_HW_SHA_STUB |
@@ -47,23 +35,20 @@ | |||
47 | 35 | ||
48 | // #pragma message("Sha1 HW") | 36 | // #pragma message("Sha1 HW") |
49 | 37 | ||
38 | |||
39 | |||
40 | |||
50 | // sse/sse2/ssse3: | 41 | // sse/sse2/ssse3: |
51 | #include <tmmintrin.h> | 42 | #include <tmmintrin.h> |
52 | // sha*: | 43 | // sha*: |
53 | #include <immintrin.h> | 44 | #include <immintrin.h> |
54 | 45 | ||
55 | #if defined (__clang__) && defined(_MSC_VER) | 46 | #if defined (__clang__) && defined(_MSC_VER) |
56 | // #if !defined(__SSSE3__) | ||
57 | // #endif | ||
58 | #if !defined(__SHA__) | 47 | #if !defined(__SHA__) |
59 | #include <shaintrin.h> | 48 | #include <shaintrin.h> |
60 | #endif | 49 | #endif |
61 | #else | 50 | #else |
62 | 51 | ||
63 | #ifdef USE_MY_MM | ||
64 | #include "My_mm.h" | ||
65 | #endif | ||
66 | |||
67 | #endif | 52 | #endif |
68 | 53 | ||
69 | /* | 54 | /* |
@@ -84,7 +69,6 @@ SHA: | |||
84 | _mm_sha1* | 69 | _mm_sha1* |
85 | */ | 70 | */ |
86 | 71 | ||
87 | |||
88 | #define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src); | 72 | #define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src); |
89 | #define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask); | 73 | #define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask); |
90 | #define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask); | 74 | #define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask); |
@@ -99,11 +83,12 @@ SHA: | |||
99 | #define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src); | 83 | #define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src); |
100 | #define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src); | 84 | #define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src); |
101 | 85 | ||
102 | |||
103 | #define LOAD_SHUFFLE(m, k) \ | 86 | #define LOAD_SHUFFLE(m, k) \ |
104 | m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ | 87 | m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ |
105 | SHUFFLE_EPI8(m, mask) \ | 88 | SHUFFLE_EPI8(m, mask) \ |
106 | 89 | ||
90 | #define NNN(m0, m1, m2, m3) | ||
91 | |||
107 | #define SM1(m0, m1, m2, m3) \ | 92 | #define SM1(m0, m1, m2, m3) \ |
108 | SHA1_MSG1(m0, m1) \ | 93 | SHA1_MSG1(m0, m1) \ |
109 | 94 | ||
@@ -116,35 +101,19 @@ SHA: | |||
116 | SM1(m0, m1, m2, m3) \ | 101 | SM1(m0, m1, m2, m3) \ |
117 | SHA1_MSG2(m3, m2) \ | 102 | SHA1_MSG2(m3, m2) \ |
118 | 103 | ||
119 | #define NNN(m0, m1, m2, m3) | 104 | #define R4(k, m0, m1, m2, m3, e0, e1, OP) \ |
120 | |||
121 | |||
122 | |||
123 | |||
124 | |||
125 | |||
126 | |||
127 | |||
128 | |||
129 | |||
130 | |||
131 | |||
132 | |||
133 | |||
134 | |||
135 | |||
136 | |||
137 | #define R4(k, e0, e1, m0, m1, m2, m3, OP) \ | ||
138 | e1 = abcd; \ | 105 | e1 = abcd; \ |
139 | SHA1_RND4(abcd, e0, (k) / 5) \ | 106 | SHA1_RND4(abcd, e0, (k) / 5) \ |
140 | SHA1_NEXTE(e1, m1) \ | 107 | SHA1_NEXTE(e1, m1) \ |
141 | OP(m0, m1, m2, m3) \ | 108 | OP(m0, m1, m2, m3) \ |
142 | 109 | ||
110 | |||
111 | |||
143 | #define R16(k, mx, OP0, OP1, OP2, OP3) \ | 112 | #define R16(k, mx, OP0, OP1, OP2, OP3) \ |
144 | R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \ | 113 | R4 ( (k)*4+0, m0,m1,m2,m3, e0,e1, OP0 ) \ |
145 | R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \ | 114 | R4 ( (k)*4+1, m1,m2,m3,m0, e1,e0, OP1 ) \ |
146 | R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \ | 115 | R4 ( (k)*4+2, m2,m3,m0,m1, e0,e1, OP2 ) \ |
147 | R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 ) \ | 116 | R4 ( (k)*4+3, m3,mx,m1,m2, e1,e0, OP3 ) \ |
148 | 117 | ||
149 | #define PREPARE_STATE \ | 118 | #define PREPARE_STATE \ |
150 | SHUFFLE_EPI32 (abcd, 0x1B) \ | 119 | SHUFFLE_EPI32 (abcd, 0x1B) \ |
@@ -162,8 +131,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
162 | { | 131 | { |
163 | const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); | 132 | const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); |
164 | 133 | ||
165 | __m128i abcd, e0; | ||
166 | 134 | ||
135 | __m128i abcd, e0; | ||
136 | |||
167 | if (numBlocks == 0) | 137 | if (numBlocks == 0) |
168 | return; | 138 | return; |
169 | 139 | ||
@@ -204,7 +174,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
204 | PREPARE_STATE | 174 | PREPARE_STATE |
205 | 175 | ||
206 | _mm_storeu_si128((__m128i *) (void *) state, abcd); | 176 | _mm_storeu_si128((__m128i *) (void *) state, abcd); |
207 | *(state+4) = (UInt32)_mm_cvtsi128_si32(e0); | 177 | *(state + 4) = (UInt32)_mm_cvtsi128_si32(e0); |
208 | } | 178 | } |
209 | 179 | ||
210 | #endif // USE_HW_SHA | 180 | #endif // USE_HW_SHA |
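
The mask used with _mm_shuffle_epi8 in this function reverses all 16 bytes of the register, which both byte-swaps each 32-bit word and reverses the word order, the layout the SHA-NI rounds expect. A small standalone check (compile with -mssse3; prints 15 14 ... 0):

#include <stdio.h>
#include <tmmintrin.h>

int main(void)
{
  const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
  unsigned char in[16], out[16];
  int i;
  for (i = 0; i < 16; i++)
    in[i] = (unsigned char)i;
  {
    __m128i v = _mm_loadu_si128((const __m128i *)(const void *)in);
    v = _mm_shuffle_epi8(v, mask);  /* full 16-byte reversal */
    _mm_storeu_si128((__m128i *)(void *)out, v);
  }
  for (i = 0; i < 16; i++)
    printf("%d ", out[i]);
  printf("\n");
  return 0;
}
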
@@ -262,22 +232,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
262 | #define _ARM_USE_NEW_NEON_INTRINSICS | 232 | #define _ARM_USE_NEW_NEON_INTRINSICS |
263 | #endif | 233 | #endif |
264 | 234 | ||
265 | |||
266 | |||
267 | |||
268 | |||
269 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) | 235 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) |
270 | #include <arm64_neon.h> | 236 | #include <arm64_neon.h> |
271 | #else | 237 | #else |
272 | 238 | ||
273 | |||
274 | |||
275 | |||
276 | |||
277 | |||
278 | |||
279 | |||
280 | |||
281 | #if defined(__clang__) && __clang_major__ < 16 | 239 | #if defined(__clang__) && __clang_major__ < 16 |
282 | #if !defined(__ARM_FEATURE_SHA2) && \ | 240 | #if !defined(__ARM_FEATURE_SHA2) && \ |
283 | !defined(__ARM_FEATURE_CRYPTO) | 241 | !defined(__ARM_FEATURE_CRYPTO) |
@@ -329,26 +287,37 @@ typedef uint32x4_t v128; | |||
329 | #endif | 287 | #endif |
330 | 288 | ||
331 | #ifdef MY_CPU_BE | 289 | #ifdef MY_CPU_BE |
332 | #define MY_rev32_for_LE(x) | 290 | #define MY_rev32_for_LE(x) x |
333 | #else | 291 | #else |
334 | #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) | 292 | #define MY_rev32_for_LE(x) vrev32q_u8(x) |
335 | #endif | 293 | #endif |
336 | 294 | ||
337 | #define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) | 295 | #define LOAD_128_32(_p) vld1q_u32(_p) |
338 | #define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) | 296 | #define LOAD_128_8(_p) vld1q_u8 (_p) |
297 | #define STORE_128_32(_p, _v) vst1q_u32(_p, _v) | ||
339 | 298 | ||
340 | #define LOAD_SHUFFLE(m, k) \ | 299 | #define LOAD_SHUFFLE(m, k) \ |
341 | m = LOAD_128((data + (k) * 16)); \ | 300 | m = vreinterpretq_u32_u8( \ |
342 | MY_rev32_for_LE(m); \ | 301 | MY_rev32_for_LE( \ |
343 | 302 | LOAD_128_8(data + (k) * 16))); \ | |
344 | #define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3) | 303 | |
345 | #define SU1(dest, src) dest = vsha1su1q_u32(dest, src) | 304 | #define N0(dest, src2, src3) |
305 | #define N1(dest, src) | ||
306 | #define U0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3); | ||
307 | #define U1(dest, src) dest = vsha1su1q_u32(dest, src); | ||
346 | #define C(e) abcd = vsha1cq_u32(abcd, e, t) | 308 | #define C(e) abcd = vsha1cq_u32(abcd, e, t) |
347 | #define P(e) abcd = vsha1pq_u32(abcd, e, t) | 309 | #define P(e) abcd = vsha1pq_u32(abcd, e, t) |
348 | #define M(e) abcd = vsha1mq_u32(abcd, e, t) | 310 | #define M(e) abcd = vsha1mq_u32(abcd, e, t) |
349 | #define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) | 311 | #define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) |
350 | #define T(m, c) t = vaddq_u32(m, c) | 312 | #define T(m, c) t = vaddq_u32(m, c) |
351 | 313 | ||
314 | #define R16(d0,d1,d2,d3, f0,z0, f1,z1, f2,z2, f3,z3, w0,w1,w2,w3) \ | ||
315 | T(m0, d0); f0(m3, m0, m1) z0(m2, m1) H(e1); w0(e0); \ | ||
316 | T(m1, d1); f1(m0, m1, m2) z1(m3, m2) H(e0); w1(e1); \ | ||
317 | T(m2, d2); f2(m1, m2, m3) z2(m0, m3) H(e1); w2(e0); \ | ||
318 | T(m3, d3); f3(m2, m3, m0) z3(m1, m0) H(e0); w3(e1); \ | ||
319 | |||
320 | |||
352 | void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); | 321 | void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); |
353 | #ifdef ATTRIB_SHA | 322 | #ifdef ATTRIB_SHA |
354 | ATTRIB_SHA | 323 | ATTRIB_SHA |
@@ -367,7 +336,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
367 | c2 = vdupq_n_u32(0x8f1bbcdc); | 336 | c2 = vdupq_n_u32(0x8f1bbcdc); |
368 | c3 = vdupq_n_u32(0xca62c1d6); | 337 | c3 = vdupq_n_u32(0xca62c1d6); |
369 | 338 | ||
370 | abcd = LOAD_128(&state[0]); | 339 | abcd = LOAD_128_32(&state[0]); |
371 | e0 = state[4]; | 340 | e0 = state[4]; |
372 | 341 | ||
373 | do | 342 | do |
@@ -385,26 +354,11 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
385 | LOAD_SHUFFLE (m2, 2) | 354 | LOAD_SHUFFLE (m2, 2) |
386 | LOAD_SHUFFLE (m3, 3) | 355 | LOAD_SHUFFLE (m3, 3) |
387 | 356 | ||
388 | T(m0, c0); H(e1); C(e0); | 357 | R16 ( c0,c0,c0,c0, N0,N1, U0,N1, U0,U1, U0,U1, C,C,C,C ) |
389 | T(m1, c0); SU0(m0, m1, m2); H(e0); C(e1); | 358 | R16 ( c0,c1,c1,c1, U0,U1, U0,U1, U0,U1, U0,U1, C,P,P,P ) |
390 | T(m2, c0); SU0(m1, m2, m3); SU1(m0, m3); H(e1); C(e0); | 359 | R16 ( c1,c1,c2,c2, U0,U1, U0,U1, U0,U1, U0,U1, P,P,M,M ) |
391 | T(m3, c0); SU0(m2, m3, m0); SU1(m1, m0); H(e0); C(e1); | 360 | R16 ( c2,c2,c2,c3, U0,U1, U0,U1, U0,U1, U0,U1, M,M,M,P ) |
392 | T(m0, c0); SU0(m3, m0, m1); SU1(m2, m1); H(e1); C(e0); | 361 | R16 ( c3,c3,c3,c3, U0,U1, N0,U1, N0,N1, N0,N1, P,P,P,P ) |
393 | T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1); | ||
394 | T(m2, c1); SU0(m1, m2, m3); SU1(m0, m3); H(e1); P(e0); | ||
395 | T(m3, c1); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1); | ||
396 | T(m0, c1); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0); | ||
397 | T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1); | ||
398 | T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0); | ||
399 | T(m3, c2); SU0(m2, m3, m0); SU1(m1, m0); H(e0); M(e1); | ||
400 | T(m0, c2); SU0(m3, m0, m1); SU1(m2, m1); H(e1); M(e0); | ||
401 | T(m1, c2); SU0(m0, m1, m2); SU1(m3, m2); H(e0); M(e1); | ||
402 | T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0); | ||
403 | T(m3, c3); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1); | ||
404 | T(m0, c3); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0); | ||
405 | T(m1, c3); SU1(m3, m2); H(e0); P(e1); | ||
406 | T(m2, c3); H(e1); P(e0); | ||
407 | T(m3, c3); H(e0); P(e1); | ||
408 | 362 | ||
409 | abcd = vaddq_u32(abcd, abcd_save); | 363 | abcd = vaddq_u32(abcd, abcd_save); |
410 | e0 += e0_save; | 364 | e0 += e0_save; |
@@ -413,7 +367,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
413 | } | 367 | } |
414 | while (--numBlocks); | 368 | while (--numBlocks); |
415 | 369 | ||
416 | STORE_128(&state[0], abcd); | 370 | STORE_128_32(&state[0], abcd); |
417 | state[4] = e0; | 371 | state[4] = e0; |
418 | } | 372 | } |
419 | 373 | ||
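
The condensed R16 schedule above is line-for-line equivalent to the explicit sequence it replaces. For example, the first two lines of R16 ( c0,c0,c0,c0, N0,N1, U0,N1, U0,U1, U0,U1, C,C,C,C ) expand, with N0/N1 as no-ops, to:

  T(m0, c0); H(e1); C(e0);
  T(m1, c0); U0(m0, m1, m2) H(e0); C(e1);

which match the first two removed lines; U0/U1 are the old SU0/SU1 with the trailing semicolon folded into the macro.
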
@@ -421,13 +375,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
421 | 375 | ||
422 | #endif // MY_CPU_ARM_OR_ARM64 | 376 | #endif // MY_CPU_ARM_OR_ARM64 |
423 | 377 | ||
424 | |||
425 | #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) | 378 | #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) |
426 | // #error Stop_Compiling_UNSUPPORTED_SHA | 379 | // #error Stop_Compiling_UNSUPPORTED_SHA |
427 | // #include <stdlib.h> | 380 | // #include <stdlib.h> |
428 | |||
429 | |||
430 | |||
431 | // #include "Sha1.h" | 381 | // #include "Sha1.h" |
432 | // #if defined(_MSC_VER) | 382 | // #if defined(_MSC_VER) |
433 | #pragma message("Sha1 HW-SW stub was used") | 383 | #pragma message("Sha1 HW-SW stub was used") |
@@ -447,8 +397,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
447 | } | 397 | } |
448 | #endif | 398 | #endif |
449 | 399 | ||
450 | #undef SU0 | 400 | #undef U0 |
451 | #undef SU1 | 401 | #undef U1 |
402 | #undef N0 | ||
403 | #undef N1 | ||
452 | #undef C | 404 | #undef C |
453 | #undef P | 405 | #undef P |
454 | #undef M | 406 | #undef M |
diff --git a/C/Sha256.c b/C/Sha256.c --- a/C/Sha256.c +++ b/C/Sha256.c | |||
@@ -1,18 +1,14 @@ | |||
1 | /* Sha256.c -- SHA-256 Hash | 1 | /* Sha256.c -- SHA-256 Hash |
2 | 2024-03-01 : Igor Pavlov : Public domain | 2 | : Igor Pavlov : Public domain |
3 | This code is based on public domain code from Wei Dai's Crypto++ library. */ | 3 | This code is based on public domain code from Wei Dai's Crypto++ library. */ |
4 | 4 | ||
5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
6 | 6 | ||
7 | #include <string.h> | 7 | #include <string.h> |
8 | 8 | ||
9 | #include "CpuArch.h" | ||
10 | #include "RotateDefs.h" | ||
11 | #include "Sha256.h" | 9 | #include "Sha256.h" |
12 | 10 | #include "RotateDefs.h" | |
13 | #if defined(_MSC_VER) && (_MSC_VER < 1900) | 11 | #include "CpuArch.h" |
14 | // #define USE_MY_MM | ||
15 | #endif | ||
16 | 12 | ||
17 | #ifdef MY_CPU_X86_OR_AMD64 | 13 | #ifdef MY_CPU_X86_OR_AMD64 |
18 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ | 14 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ |
@@ -56,7 +52,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n | |||
56 | static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks; | 52 | static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks; |
57 | static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW; | 53 | static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW; |
58 | 54 | ||
59 | #define SHA256_UPDATE_BLOCKS(p) p->func_UpdateBlocks | 55 | #define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks |
60 | #else | 56 | #else |
61 | #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks | 57 | #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks |
62 | #endif | 58 | #endif |
@@ -85,7 +81,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo) | |||
85 | return False; | 81 | return False; |
86 | #endif | 82 | #endif |
87 | 83 | ||
88 | p->func_UpdateBlocks = func; | 84 | p->v.vars.func_UpdateBlocks = func; |
89 | return True; | 85 | return True; |
90 | } | 86 | } |
91 | 87 | ||
@@ -111,7 +107,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo) | |||
111 | 107 | ||
112 | void Sha256_InitState(CSha256 *p) | 108 | void Sha256_InitState(CSha256 *p) |
113 | { | 109 | { |
114 | p->count = 0; | 110 | p->v.vars.count = 0; |
115 | p->state[0] = 0x6a09e667; | 111 | p->state[0] = 0x6a09e667; |
116 | p->state[1] = 0xbb67ae85; | 112 | p->state[1] = 0xbb67ae85; |
117 | p->state[2] = 0x3c6ef372; | 113 | p->state[2] = 0x3c6ef372; |
@@ -122,9 +118,16 @@ void Sha256_InitState(CSha256 *p) | |||
122 | p->state[7] = 0x5be0cd19; | 118 | p->state[7] = 0x5be0cd19; |
123 | } | 119 | } |
124 | 120 | ||
121 | |||
122 | |||
123 | |||
124 | |||
125 | |||
126 | |||
127 | |||
125 | void Sha256_Init(CSha256 *p) | 128 | void Sha256_Init(CSha256 *p) |
126 | { | 129 | { |
127 | p->func_UpdateBlocks = | 130 | p->v.vars.func_UpdateBlocks = |
128 | #ifdef Z7_COMPILER_SHA256_SUPPORTED | 131 | #ifdef Z7_COMPILER_SHA256_SUPPORTED |
129 | g_SHA256_FUNC_UPDATE_BLOCKS; | 132 | g_SHA256_FUNC_UPDATE_BLOCKS; |
130 | #else | 133 | #else |
@@ -133,10 +136,10 @@ void Sha256_Init(CSha256 *p) | |||
133 | Sha256_InitState(p); | 136 | Sha256_InitState(p); |
134 | } | 137 | } |
135 | 138 | ||
136 | #define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) | 139 | #define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x,22)) |
137 | #define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25)) | 140 | #define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x,25)) |
138 | #define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) | 141 | #define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) |
139 | #define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) | 142 | #define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >>10)) |
140 | 143 | ||
141 | #define Ch(x,y,z) (z^(x&(y^z))) | 144 | #define Ch(x,y,z) (z^(x&(y^z))) |
142 | #define Maj(x,y,z) ((x&y)|(z&(x|y))) | 145 | #define Maj(x,y,z) ((x&y)|(z&(x|y))) |
@@ -224,12 +227,10 @@ void Sha256_Init(CSha256 *p) | |||
224 | 227 | ||
225 | #endif | 228 | #endif |
226 | 229 | ||
227 | // static | ||
228 | extern MY_ALIGN(64) | ||
229 | const UInt32 SHA256_K_ARRAY[64]; | ||
230 | 230 | ||
231 | MY_ALIGN(64) | 231 | extern |
232 | const UInt32 SHA256_K_ARRAY[64] = { | 232 | MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; |
233 | MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = { | ||
233 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, | 234 | 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, |
234 | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, | 235 | 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, |
235 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, | 236 | 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, |
@@ -248,27 +249,29 @@ const UInt32 SHA256_K_ARRAY[64] = { | |||
248 | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 | 249 | 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 |
249 | }; | 250 | }; |
250 | 251 | ||
251 | #define K SHA256_K_ARRAY | ||
252 | 252 | ||
253 | 253 | ||
254 | |||
255 | |||
256 | #define K SHA256_K_ARRAY | ||
257 | |||
254 | Z7_NO_INLINE | 258 | Z7_NO_INLINE |
255 | void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks) | 259 | void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks) |
256 | { | 260 | { |
257 | UInt32 W | 261 | UInt32 W |
258 | #ifdef Z7_SHA256_BIG_W | 262 | #ifdef Z7_SHA256_BIG_W |
259 | [64]; | 263 | [64]; |
260 | #else | 264 | #else |
261 | [16]; | 265 | [16]; |
262 | #endif | 266 | #endif |
263 | |||
264 | unsigned j; | 267 | unsigned j; |
265 | |||
266 | UInt32 a,b,c,d,e,f,g,h; | 268 | UInt32 a,b,c,d,e,f,g,h; |
267 | 269 | #if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) | |
268 | #if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) | ||
269 | UInt32 tmp; | 270 | UInt32 tmp; |
270 | #endif | 271 | #endif |
271 | 272 | ||
273 | if (numBlocks == 0) return; | ||
274 | |||
272 | a = state[0]; | 275 | a = state[0]; |
273 | b = state[1]; | 276 | b = state[1]; |
274 | c = state[2]; | 277 | c = state[2]; |
@@ -278,7 +281,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n | |||
278 | g = state[6]; | 281 | g = state[6]; |
279 | h = state[7]; | 282 | h = state[7]; |
280 | 283 | ||
281 | while (numBlocks) | 284 | do |
282 | { | 285 | { |
283 | 286 | ||
284 | for (j = 0; j < 16; j += STEP_PRE) | 287 | for (j = 0; j < 16; j += STEP_PRE) |
@@ -352,19 +355,11 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n | |||
352 | g += state[6]; state[6] = g; | 355 | g += state[6]; state[6] = g; |
353 | h += state[7]; state[7] = h; | 356 | h += state[7]; state[7] = h; |
354 | 357 | ||
355 | data += 64; | 358 | data += SHA256_BLOCK_SIZE; |
356 | numBlocks--; | ||
357 | } | 359 | } |
358 | 360 | while (--numBlocks); | |
359 | /* Wipe variables */ | ||
360 | /* memset(W, 0, sizeof(W)); */ | ||
361 | } | 361 | } |
362 | 362 | ||
363 | #undef S0 | ||
364 | #undef S1 | ||
365 | #undef s0 | ||
366 | #undef s1 | ||
367 | #undef K | ||
368 | 363 | ||
369 | #define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) | 364 | #define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) |
370 | 365 | ||
@@ -372,20 +367,15 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) | |||
372 | { | 367 | { |
373 | if (size == 0) | 368 | if (size == 0) |
374 | return; | 369 | return; |
375 | |||
376 | { | 370 | { |
377 | unsigned pos = (unsigned)p->count & 0x3F; | 371 | const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); |
378 | unsigned num; | 372 | const unsigned num = SHA256_BLOCK_SIZE - pos; |
379 | 373 | p->v.vars.count += size; | |
380 | p->count += size; | ||
381 | |||
382 | num = 64 - pos; | ||
383 | if (num > size) | 374 | if (num > size) |
384 | { | 375 | { |
385 | memcpy(p->buffer + pos, data, size); | 376 | memcpy(p->buffer + pos, data, size); |
386 | return; | 377 | return; |
387 | } | 378 | } |
388 | |||
389 | if (pos != 0) | 379 | if (pos != 0) |
390 | { | 380 | { |
391 | size -= num; | 381 | size -= num; |
@@ -395,9 +385,10 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) | |||
395 | } | 385 | } |
396 | } | 386 | } |
397 | { | 387 | { |
398 | size_t numBlocks = size >> 6; | 388 | const size_t numBlocks = size >> 6; |
389 | // if (numBlocks) | ||
399 | SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks); | 390 | SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks); |
400 | size &= 0x3F; | 391 | size &= SHA256_BLOCK_SIZE - 1; |
401 | if (size == 0) | 392 | if (size == 0) |
402 | return; | 393 | return; |
403 | data += (numBlocks << 6); | 394 | data += (numBlocks << 6); |
@@ -408,82 +399,69 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) | |||
408 | 399 | ||
409 | void Sha256_Final(CSha256 *p, Byte *digest) | 400 | void Sha256_Final(CSha256 *p, Byte *digest) |
410 | { | 401 | { |
411 | unsigned pos = (unsigned)p->count & 0x3F; | 402 | unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); |
412 | unsigned i; | ||
413 | |||
414 | p->buffer[pos++] = 0x80; | 403 | p->buffer[pos++] = 0x80; |
415 | 404 | if (pos > (SHA256_BLOCK_SIZE - 4 * 2)) | |
416 | if (pos > (64 - 8)) | ||
417 | { | 405 | { |
418 | while (pos != 64) { p->buffer[pos++] = 0; } | 406 | while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; } |
419 | // memset(&p->buf.buffer[pos], 0, 64 - pos); | 407 | // memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos); |
420 | Sha256_UpdateBlock(p); | 408 | Sha256_UpdateBlock(p); |
421 | pos = 0; | 409 | pos = 0; |
422 | } | 410 | } |
423 | 411 | memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos); | |
424 | /* | ||
425 | if (pos & 3) | ||
426 | { | 412 | { |
427 | p->buffer[pos] = 0; | 413 | const UInt64 numBits = p->v.vars.count << 3; |
428 | p->buffer[pos + 1] = 0; | 414 | SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32)) |
429 | p->buffer[pos + 2] = 0; | 415 | SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits)) |
430 | pos += 3; | ||
431 | pos &= ~3; | ||
432 | } | 416 | } |
417 | Sha256_UpdateBlock(p); | ||
418 | #if 1 && defined(MY_CPU_BE) | ||
419 | memcpy(digest, p->state, SHA256_DIGEST_SIZE); | ||
420 | #else | ||
433 | { | 421 | { |
434 | for (; pos < 64 - 8; pos += 4) | 422 | unsigned i; |
435 | *(UInt32 *)(&p->buffer[pos]) = 0; | 423 | for (i = 0; i < 8; i += 2) |
424 | { | ||
425 | const UInt32 v0 = p->state[i]; | ||
426 | const UInt32 v1 = p->state[(size_t)i + 1]; | ||
427 | SetBe32(digest , v0) | ||
428 | SetBe32(digest + 4, v1) | ||
429 | digest += 4 * 2; | ||
430 | } | ||
436 | } | 431 | } |
437 | */ | ||
438 | 432 | ||
439 | memset(&p->buffer[pos], 0, (64 - 8) - pos); | ||
440 | 433 | ||
441 | { | ||
442 | UInt64 numBits = (p->count << 3); | ||
443 | SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32)) | ||
444 | SetBe32(p->buffer + 64 - 4, (UInt32)(numBits)) | ||
445 | } | ||
446 | |||
447 | Sha256_UpdateBlock(p); | ||
448 | 434 | ||
449 | for (i = 0; i < 8; i += 2) | 435 | |
450 | { | 436 | #endif |
451 | UInt32 v0 = p->state[i]; | ||
452 | UInt32 v1 = p->state[(size_t)i + 1]; | ||
453 | SetBe32(digest , v0) | ||
454 | SetBe32(digest + 4, v1) | ||
455 | digest += 8; | ||
456 | } | ||
457 | |||
458 | Sha256_InitState(p); | 437 | Sha256_InitState(p); |
459 | } | 438 | } |
460 | 439 | ||
461 | 440 | ||
462 | void Sha256Prepare(void) | 441 | void Sha256Prepare(void) |
463 | { | 442 | { |
464 | #ifdef Z7_COMPILER_SHA256_SUPPORTED | 443 | #ifdef Z7_COMPILER_SHA256_SUPPORTED |
465 | SHA256_FUNC_UPDATE_BLOCKS f, f_hw; | 444 | SHA256_FUNC_UPDATE_BLOCKS f, f_hw; |
466 | f = Sha256_UpdateBlocks; | 445 | f = Sha256_UpdateBlocks; |
467 | f_hw = NULL; | 446 | f_hw = NULL; |
468 | #ifdef MY_CPU_X86_OR_AMD64 | 447 | #ifdef MY_CPU_X86_OR_AMD64 |
469 | #ifndef USE_MY_MM | ||
470 | if (CPU_IsSupported_SHA() | 448 | if (CPU_IsSupported_SHA() |
471 | && CPU_IsSupported_SSSE3() | 449 | && CPU_IsSupported_SSSE3() |
472 | // && CPU_IsSupported_SSE41() | ||
473 | ) | 450 | ) |
474 | #endif | 451 | #else |
475 | #else | ||
476 | if (CPU_IsSupported_SHA2()) | 452 | if (CPU_IsSupported_SHA2()) |
477 | #endif | 453 | #endif |
478 | { | 454 | { |
479 | // printf("\n========== HW SHA256 ======== \n"); | 455 | // printf("\n========== HW SHA256 ======== \n"); |
480 | f = f_hw = Sha256_UpdateBlocks_HW; | 456 | f = f_hw = Sha256_UpdateBlocks_HW; |
481 | } | 457 | } |
482 | g_SHA256_FUNC_UPDATE_BLOCKS = f; | 458 | g_SHA256_FUNC_UPDATE_BLOCKS = f; |
483 | g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw; | 459 | g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw; |
484 | #endif | 460 | #endif |
485 | } | 461 | } |
486 | 462 | ||
463 | #undef U64C | ||
464 | #undef K | ||
487 | #undef S0 | 465 | #undef S0 |
488 | #undef S1 | 466 | #undef S1 |
489 | #undef s0 | 467 | #undef s0 |
diff --git a/C/Sha256.h b/C/Sha256.h --- a/C/Sha256.h +++ b/C/Sha256.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Sha256.h -- SHA-256 Hash | 1 | /* Sha256.h -- SHA-256 Hash |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_SHA256_H | 4 | #ifndef ZIP7_INC_SHA256_H |
5 | #define ZIP7_INC_SHA256_H | 5 | #define ZIP7_INC_SHA256_H |
@@ -14,6 +14,9 @@ EXTERN_C_BEGIN | |||
14 | #define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4) | 14 | #define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4) |
15 | #define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4) | 15 | #define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4) |
16 | 16 | ||
17 | |||
18 | |||
19 | |||
17 | typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks); | 20 | typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks); |
18 | 21 | ||
19 | /* | 22 | /* |
@@ -32,9 +35,16 @@ typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byt | |||
32 | 35 | ||
33 | typedef struct | 36 | typedef struct |
34 | { | 37 | { |
35 | SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; | 38 | union |
36 | UInt64 count; | 39 | { |
37 | UInt64 _pad_2[2]; | 40 | struct |
41 | { | ||
42 | SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; | ||
43 | UInt64 count; | ||
44 | } vars; | ||
45 | UInt64 _pad_64bit[4]; | ||
46 | void *_pad_align_ptr[2]; | ||
47 | } v; | ||
38 | UInt32 state[SHA256_NUM_DIGEST_WORDS]; | 48 | UInt32 state[SHA256_NUM_DIGEST_WORDS]; |
39 | 49 | ||
40 | Byte buffer[SHA256_BLOCK_SIZE]; | 50 | Byte buffer[SHA256_BLOCK_SIZE]; |
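
A minimal usage sketch, same pattern as CSha1 above (link C/Sha256.c, C/Sha256Opt.c, and C/CpuArch.c from this source tree; the include path assumption is mine). SHA-256("abc") is the well-known ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad:

#include <stdio.h>
#include "Sha256.h"

int main(void)
{
  CSha256 sha;
  Byte digest[SHA256_DIGEST_SIZE];
  unsigned i;
  Sha256Prepare();
  Sha256_Init(&sha);
  Sha256_Update(&sha, (const Byte *)"abc", 3);
  Sha256_Final(&sha, digest);
  for (i = 0; i < SHA256_DIGEST_SIZE; i++)
    printf("%02x", digest[i]);
  printf("\n");
  return 0;
}
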
diff --git a/C/Sha256Opt.c b/C/Sha256Opt.c index eb38166..1c6b50f 100644 --- a/C/Sha256Opt.c +++ b/C/Sha256Opt.c | |||
@@ -1,18 +1,11 @@ | |||
1 | /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions | 1 | /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions |
2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | #include "Compiler.h" | 5 | #include "Compiler.h" |
6 | #include "CpuArch.h" | 6 | #include "CpuArch.h" |
7 | 7 | ||
8 | #if defined(_MSC_VER) | ||
9 | #if (_MSC_VER < 1900) && (_MSC_VER >= 1200) | ||
10 | // #define USE_MY_MM | ||
11 | #endif | ||
12 | #endif | ||
13 | |||
14 | // #define Z7_USE_HW_SHA_STUB // for debug | 8 | // #define Z7_USE_HW_SHA_STUB // for debug |
15 | |||
16 | #ifdef MY_CPU_X86_OR_AMD64 | 9 | #ifdef MY_CPU_X86_OR_AMD64 |
17 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check | 10 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check |
18 | #define USE_HW_SHA | 11 | #define USE_HW_SHA |
@@ -20,19 +13,14 @@ | |||
20 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ | 13 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ |
21 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) | 14 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) |
22 | #define USE_HW_SHA | 15 | #define USE_HW_SHA |
23 | #if !defined(_INTEL_COMPILER) | 16 | #if !defined(__INTEL_COMPILER) |
24 | // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) | 17 | // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) |
25 | #if !defined(__SHA__) || !defined(__SSSE3__) | 18 | #if !defined(__SHA__) || !defined(__SSSE3__) |
26 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) | 19 | #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) |
27 | #endif | 20 | #endif |
28 | #endif | 21 | #endif |
29 | #elif defined(_MSC_VER) | 22 | #elif defined(_MSC_VER) |
30 | #ifdef USE_MY_MM | 23 | #if (_MSC_VER >= 1900) |
31 | #define USE_VER_MIN 1300 | ||
32 | #else | ||
33 | #define USE_VER_MIN 1900 | ||
34 | #endif | ||
35 | #if (_MSC_VER >= USE_VER_MIN) | ||
36 | #define USE_HW_SHA | 24 | #define USE_HW_SHA |
37 | #else | 25 | #else |
38 | #define Z7_USE_HW_SHA_STUB | 26 | #define Z7_USE_HW_SHA_STUB |
@@ -47,23 +35,20 @@ | |||
47 | 35 | ||
48 | // #pragma message("Sha256 HW") | 36 | // #pragma message("Sha256 HW") |
49 | 37 | ||
38 | |||
39 | |||
40 | |||
50 | // sse/sse2/ssse3: | 41 | // sse/sse2/ssse3: |
51 | #include <tmmintrin.h> | 42 | #include <tmmintrin.h> |
52 | // sha*: | 43 | // sha*: |
53 | #include <immintrin.h> | 44 | #include <immintrin.h> |
54 | 45 | ||
55 | #if defined (__clang__) && defined(_MSC_VER) | 46 | #if defined (__clang__) && defined(_MSC_VER) |
56 | // #if !defined(__SSSE3__) | ||
57 | // #endif | ||
58 | #if !defined(__SHA__) | 47 | #if !defined(__SHA__) |
59 | #include <shaintrin.h> | 48 | #include <shaintrin.h> |
60 | #endif | 49 | #endif |
61 | #else | 50 | #else |
62 | 51 | ||
63 | #ifdef USE_MY_MM | ||
64 | #include "My_mm.h" | ||
65 | #endif | ||
66 | |||
67 | #endif | 52 | #endif |
68 | 53 | ||
69 | /* | 54 | /* |
@@ -91,60 +76,44 @@ SHA: | |||
91 | extern | 76 | extern |
92 | MY_ALIGN(64) | 77 | MY_ALIGN(64) |
93 | const UInt32 SHA256_K_ARRAY[64]; | 78 | const UInt32 SHA256_K_ARRAY[64]; |
94 | |||
95 | #define K SHA256_K_ARRAY | 79 | #define K SHA256_K_ARRAY |
96 | 80 | ||
97 | 81 | ||
98 | #define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); | 82 | #define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); |
99 | #define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); | 83 | #define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); |
100 | #define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); | 84 | #define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); |
101 | |||
102 | 85 | ||
103 | #define LOAD_SHUFFLE(m, k) \ | 86 | #define LOAD_SHUFFLE(m, k) \ |
104 | m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ | 87 | m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ |
105 | m = _mm_shuffle_epi8(m, mask); \ | 88 | m = _mm_shuffle_epi8(m, mask); \ |
106 | 89 | ||
107 | #define SM1(g0, g1, g2, g3) \ | 90 | #define NNN(m0, m1, m2, m3) |
108 | SHA256_MSG1(g3, g0); \ | ||
109 | 91 | ||
110 | #define SM2(g0, g1, g2, g3) \ | 92 | #define SM1(m1, m2, m3, m0) \ |
111 | tmp = _mm_alignr_epi8(g1, g0, 4); \ | 93 | SHA256_MSG1(m0, m1); \ |
112 | ADD_EPI32(g2, tmp) \ | ||
113 | SHA25G_MSG2(g2, g1); \ | ||
114 | |||
115 | // #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k) | ||
116 | // #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1) | ||
117 | |||
118 | |||
119 | #define NNN(g0, g1, g2, g3) | ||
120 | 94 | ||
95 | #define SM2(m2, m3, m0, m1) \ | ||
96 | ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \ | ||
97 | SHA256_MSG2(m0, m3); \ | ||
121 | 98 | ||
122 | #define RND2(t0, t1) \ | 99 | #define RND2(t0, t1) \ |
123 | t0 = _mm_sha256rnds2_epu32(t0, t1, msg); | 100 | t0 = _mm_sha256rnds2_epu32(t0, t1, msg); |
124 | 101 | ||
125 | #define RND2_0(m, k) \ | ||
126 | msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \ | ||
127 | RND2(state0, state1); \ | ||
128 | msg = _mm_shuffle_epi32(msg, 0x0E); \ | ||
129 | 102 | ||
130 | 103 | ||
131 | #define RND2_1 \ | 104 | #define R4(k, m0, m1, m2, m3, OP0, OP1) \ |
105 | msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \ | ||
106 | RND2(state0, state1); \ | ||
107 | msg = _mm_shuffle_epi32(msg, 0x0E); \ | ||
108 | OP0(m0, m1, m2, m3) \ | ||
132 | RND2(state1, state0); \ | 109 | RND2(state1, state0); \ |
133 | 110 | OP1(m0, m1, m2, m3) \ | |
134 | |||
135 | // We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2 | ||
136 | |||
137 | #define R4(k, g0, g1, g2, g3, OP0, OP1) \ | ||
138 | RND2_0(g0, k) \ | ||
139 | OP0(g0, g1, g2, g3) \ | ||
140 | RND2_1 \ | ||
141 | OP1(g0, g1, g2, g3) \ | ||
142 | 111 | ||
143 | #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ | 112 | #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ |
144 | R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ | 113 | R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ |
145 | R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ | 114 | R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ |
146 | R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ | 115 | R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ |
147 | R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ | 116 | R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ |
148 | 117 | ||
149 | #define PREPARE_STATE \ | 118 | #define PREPARE_STATE \ |
150 | tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ | 119 | tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ |
@@ -161,8 +130,9 @@ ATTRIB_SHA | |||
161 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) | 130 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) |
162 | { | 131 | { |
163 | const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); | 132 | const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); |
164 | __m128i tmp; | 133 | |
165 | __m128i state0, state1; | 134 | |
135 | __m128i tmp, state0, state1; | ||
166 | 136 | ||
167 | if (numBlocks == 0) | 137 | if (numBlocks == 0) |
168 | return; | 138 | return; |
@@ -262,22 +232,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
262 | #define _ARM_USE_NEW_NEON_INTRINSICS | 232 | #define _ARM_USE_NEW_NEON_INTRINSICS |
263 | #endif | 233 | #endif |
264 | 234 | ||
265 | |||
266 | |||
267 | |||
268 | |||
269 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) | 235 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) |
270 | #include <arm64_neon.h> | 236 | #include <arm64_neon.h> |
271 | #else | 237 | #else |
272 | 238 | ||
273 | |||
274 | |||
275 | |||
276 | |||
277 | |||
278 | |||
279 | |||
280 | |||
281 | #if defined(__clang__) && __clang_major__ < 16 | 239 | #if defined(__clang__) && __clang_major__ < 16 |
282 | #if !defined(__ARM_FEATURE_SHA2) && \ | 240 | #if !defined(__ARM_FEATURE_SHA2) && \ |
283 | !defined(__ARM_FEATURE_CRYPTO) | 241 | !defined(__ARM_FEATURE_CRYPTO) |
@@ -324,41 +282,70 @@ typedef uint32x4_t v128; | |||
324 | // typedef __n128 v128; // MSVC | 282 | // typedef __n128 v128; // MSVC |
325 | 283 | ||
326 | #ifdef MY_CPU_BE | 284 | #ifdef MY_CPU_BE |
327 | #define MY_rev32_for_LE(x) | 285 | #define MY_rev32_for_LE(x) x |
328 | #else | 286 | #else |
329 | #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) | 287 | #define MY_rev32_for_LE(x) vrev32q_u8(x) |
330 | #endif | 288 | #endif |
331 | 289 | ||
332 | #define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) | 290 | #if 1 // 0 for debug |
333 | #define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) | 291 | // for arm32: for some reason it works slower than direct code |
292 | /* | ||
293 | for arm32 it generates: | ||
294 | MSVC-2022, GCC-9: | ||
295 | vld1.32 {d18,d19}, [r10] | ||
296 | vst1.32 {d4,d5}, [r3] | ||
297 | vld1.8 {d20-d21}, [r4] | ||
298 | there is no align hint (like [r10:128]), so the instruction allows unaligned access | ||
299 | */ | ||
300 | #define LOAD_128_32(_p) vld1q_u32(_p) | ||
301 | #define LOAD_128_8(_p) vld1q_u8 (_p) | ||
302 | #define STORE_128_32(_p, _v) vst1q_u32(_p, _v) | ||
303 | #else | ||
304 | /* | ||
305 | for arm32: | ||
306 | MSVC-2022: | ||
307 | vldm r10,{d18,d19} | ||
308 | vstm r3,{d4,d5} | ||
309 | does it require strict alignment? | ||
310 | GCC-9: | ||
311 | vld1.64 {d30-d31}, [r0:64] | ||
312 | vldr d28, [r0, #16] | ||
313 | vldr d29, [r0, #24] | ||
314 | vst1.64 {d30-d31}, [r0:64] | ||
315 | vstr d28, [r0, #16] | ||
316 | vstr d29, [r0, #24] | ||
317 | there is an align hint [r0:64], so it may require 64-bit alignment. | ||
318 | */ | ||
319 | #define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p)) | ||
320 | #define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p)) | ||
321 | #define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v) | ||
322 | #endif | ||
334 | 323 | ||
335 | #define LOAD_SHUFFLE(m, k) \ | 324 | #define LOAD_SHUFFLE(m, k) \ |
336 | m = LOAD_128((data + (k) * 16)); \ | 325 | m = vreinterpretq_u32_u8( \ |
337 | MY_rev32_for_LE(m); \ | 326 | MY_rev32_for_LE( \ |
327 | LOAD_128_8(data + (k) * 16))); \ | ||
338 | 328 | ||
339 | // K array must be aligned for 16-bytes at least. | 329 | // K array must be aligned for 16-bytes at least. |
340 | extern | 330 | extern |
341 | MY_ALIGN(64) | 331 | MY_ALIGN(64) |
342 | const UInt32 SHA256_K_ARRAY[64]; | 332 | const UInt32 SHA256_K_ARRAY[64]; |
343 | |||
344 | #define K SHA256_K_ARRAY | 333 | #define K SHA256_K_ARRAY |
345 | 334 | ||
346 | |||
347 | #define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src); | 335 | #define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src); |
348 | #define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); | 336 | #define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); |
349 | 337 | ||
350 | #define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0) | 338 | #define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0) |
351 | #define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1) | 339 | #define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1) |
352 | #define NNN(g0, g1, g2, g3) | 340 | #define NNN(m0, m1, m2, m3) |
353 | 341 | ||
354 | 342 | #define R4(k, m0, m1, m2, m3, OP0, OP1) \ | |
355 | #define R4(k, g0, g1, g2, g3, OP0, OP1) \ | 343 | msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \ |
356 | msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \ | ||
357 | tmp = state0; \ | 344 | tmp = state0; \ |
358 | state0 = vsha256hq_u32( state0, state1, msg ); \ | 345 | state0 = vsha256hq_u32( state0, state1, msg ); \ |
359 | state1 = vsha256h2q_u32( state1, tmp, msg ); \ | 346 | state1 = vsha256h2q_u32( state1, tmp, msg ); \ |
360 | OP0(g0, g1, g2, g3); \ | 347 | OP0(m0, m1, m2, m3); \ |
361 | OP1(g0, g1, g2, g3); \ | 348 | OP1(m0, m1, m2, m3); \ |
362 | 349 | ||
363 | 350 | ||
364 | #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ | 351 | #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ |
@@ -379,8 +366,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
379 | if (numBlocks == 0) | 366 | if (numBlocks == 0) |
380 | return; | 367 | return; |
381 | 368 | ||
382 | state0 = LOAD_128(&state[0]); | 369 | state0 = LOAD_128_32(&state[0]); |
383 | state1 = LOAD_128(&state[4]); | 370 | state1 = LOAD_128_32(&state[4]); |
384 | 371 | ||
385 | do | 372 | do |
386 | { | 373 | { |
@@ -408,8 +395,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
408 | } | 395 | } |
409 | while (--numBlocks); | 396 | while (--numBlocks); |
410 | 397 | ||
411 | STORE_128(&state[0], state0); | 398 | STORE_128_32(&state[0], state0); |
412 | STORE_128(&state[4], state1); | 399 | STORE_128_32(&state[4], state1); |
413 | } | 400 | } |
414 | 401 | ||
415 | #endif // USE_HW_SHA | 402 | #endif // USE_HW_SHA |
@@ -443,13 +430,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
443 | #endif | 430 | #endif |
444 | 431 | ||
445 | 432 | ||
446 | |||
447 | #undef K | 433 | #undef K |
448 | #undef RND2 | 434 | #undef RND2 |
449 | #undef RND2_0 | ||
450 | #undef RND2_1 | ||
451 | |||
452 | #undef MY_rev32_for_LE | 435 | #undef MY_rev32_for_LE |
436 | |||
453 | #undef NNN | 437 | #undef NNN |
454 | #undef LOAD_128 | 438 | #undef LOAD_128 |
455 | #undef STORE_128 | 439 | #undef STORE_128 |
@@ -457,7 +441,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
457 | #undef SM1 | 441 | #undef SM1 |
458 | #undef SM2 | 442 | #undef SM2 |
459 | 443 | ||
460 | #undef NNN | 444 | |
461 | #undef R4 | 445 | #undef R4 |
462 | #undef R16 | 446 | #undef R16 |
463 | #undef PREPARE_STATE | 447 | #undef PREPARE_STATE |
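For reference, the SM1/SM2 macros above (via _mm_sha256msg1_epu32 and _mm_sha256msg2_epu32 on x86, or vsha256su0q_u32 and vsha256su1q_u32 on arm64) evaluate the standard SHA-256 message schedule four words at a time. A scalar sketch of the same recurrence (the names ROTR32, sigma0, sigma1, expand_schedule are illustrative, not from the sources):

  #include <stdint.h>

  #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

  static uint32_t sigma0(uint32_t x) { return ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3); }
  static uint32_t sigma1(uint32_t x) { return ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10); }

  /* W[0..15] hold the block words (big-endian loads); expand to W[16..63] */
  static void expand_schedule(uint32_t W[64])
  {
    unsigned t;
    for (t = 16; t < 64; t++)
      W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
  }

The R16 scheme interleaves these updates with the rounds, which is why each R4 step carries two OP slots (SM1 running three rounds ahead, SM2 two rounds ahead, and NNN where no schedule update is due, as the old-side comment noted).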
diff --git a/C/Sha3.c b/C/Sha3.c new file mode 100644 index 0000000..be972d6 --- /dev/null +++ b/C/Sha3.c | |||
@@ -0,0 +1,359 @@ | |||
1 | /* Sha3.c -- SHA-3 Hash | ||
2 | : Igor Pavlov : Public domain | ||
3 | This code is based on public domain code from Wei Dai's Crypto++ library. */ | ||
4 | |||
5 | #include "Precomp.h" | ||
6 | |||
7 | #include <string.h> | ||
8 | |||
9 | #include "Sha3.h" | ||
10 | #include "RotateDefs.h" | ||
11 | #include "CpuArch.h" | ||
12 | |||
13 | #define U64C(x) UINT64_CONST(x) | ||
14 | |||
15 | static | ||
16 | MY_ALIGN(64) | ||
17 | const UInt64 SHA3_K_ARRAY[24] = | ||
18 | { | ||
19 | U64C(0x0000000000000001), U64C(0x0000000000008082), | ||
20 | U64C(0x800000000000808a), U64C(0x8000000080008000), | ||
21 | U64C(0x000000000000808b), U64C(0x0000000080000001), | ||
22 | U64C(0x8000000080008081), U64C(0x8000000000008009), | ||
23 | U64C(0x000000000000008a), U64C(0x0000000000000088), | ||
24 | U64C(0x0000000080008009), U64C(0x000000008000000a), | ||
25 | U64C(0x000000008000808b), U64C(0x800000000000008b), | ||
26 | U64C(0x8000000000008089), U64C(0x8000000000008003), | ||
27 | U64C(0x8000000000008002), U64C(0x8000000000000080), | ||
28 | U64C(0x000000000000800a), U64C(0x800000008000000a), | ||
29 | U64C(0x8000000080008081), U64C(0x8000000000008080), | ||
30 | U64C(0x0000000080000001), U64C(0x8000000080008008) | ||
31 | }; | ||
32 | |||
33 | void Sha3_Init(CSha3 *p) | ||
34 | { | ||
35 | p->count = 0; | ||
36 | memset(p->state, 0, sizeof(p->state)); | ||
37 | } | ||
38 | |||
39 | #define GET_state(i, a) UInt64 a = state[i]; | ||
40 | #define SET_state(i, a) state[i] = a; | ||
41 | |||
42 | #define LS_5(M, i, a0,a1,a2,a3,a4) \ | ||
43 | M ((i) * 5 , a0) \ | ||
44 | M ((i) * 5 + 1, a1) \ | ||
45 | M ((i) * 5 + 2, a2) \ | ||
46 | M ((i) * 5 + 3, a3) \ | ||
47 | M ((i) * 5 + 4, a4) \ | ||
48 | |||
49 | #define LS_25(M) \ | ||
50 | LS_5 (M, 0, a50, a51, a52, a53, a54) \ | ||
51 | LS_5 (M, 1, a60, a61, a62, a63, a64) \ | ||
52 | LS_5 (M, 2, a70, a71, a72, a73, a74) \ | ||
53 | LS_5 (M, 3, a80, a81, a82, a83, a84) \ | ||
54 | LS_5 (M, 4, a90, a91, a92, a93, a94) \ | ||
55 | |||
56 | |||
57 | #define XOR_1(i, a0) \ | ||
58 | a0 ^= GetUi64(data + (i) * 8); \ | ||
59 | |||
60 | #define XOR_4(i, a0,a1,a2,a3) \ | ||
61 | XOR_1 ((i) , a0); \ | ||
62 | XOR_1 ((i) + 1, a1); \ | ||
63 | XOR_1 ((i) + 2, a2); \ | ||
64 | XOR_1 ((i) + 3, a3); \ | ||
65 | |||
66 | #define D(d,b1,b2) \ | ||
67 | d = b1 ^ Z7_ROTL64(b2, 1); | ||
68 | |||
69 | #define D5 \ | ||
70 | D (d0, c4, c1) \ | ||
71 | D (d1, c0, c2) \ | ||
72 | D (d2, c1, c3) \ | ||
73 | D (d3, c2, c4) \ | ||
74 | D (d4, c3, c0) \ | ||
75 | |||
76 | #define C0(c,a,d) \ | ||
77 | c = a ^ d; \ | ||
78 | |||
79 | #define C(c,a,d,k) \ | ||
80 | c = a ^ d; \ | ||
81 | c = Z7_ROTL64(c, k); \ | ||
82 | |||
83 | #define E4(e1,e2,e3,e4) \ | ||
84 | e1 = c1 ^ (~c2 & c3); \ | ||
85 | e2 = c2 ^ (~c3 & c4); \ | ||
86 | e3 = c3 ^ (~c4 & c0); \ | ||
87 | e4 = c4 ^ (~c0 & c1); \ | ||
88 | |||
89 | #define CK( v0,w0, \ | ||
90 | v1,w1,k1, \ | ||
91 | v2,w2,k2, \ | ||
92 | v3,w3,k3, \ | ||
93 | v4,w4,k4, e0,e1,e2,e3,e4, keccak_c) \ | ||
94 | C0(c0,v0,w0) \ | ||
95 | C (c1,v1,w1,k1) \ | ||
96 | C (c2,v2,w2,k2) \ | ||
97 | C (c3,v3,w3,k3) \ | ||
98 | C (c4,v4,w4,k4) \ | ||
99 | e0 = c0 ^ (~c1 & c2) ^ keccak_c; \ | ||
100 | E4(e1,e2,e3,e4) \ | ||
101 | |||
102 | #define CE( v0,w0,k0, \ | ||
103 | v1,w1,k1, \ | ||
104 | v2,w2,k2, \ | ||
105 | v3,w3,k3, \ | ||
106 | v4,w4,k4, e0,e1,e2,e3,e4) \ | ||
107 | C (c0,v0,w0,k0) \ | ||
108 | C (c1,v1,w1,k1) \ | ||
109 | C (c2,v2,w2,k2) \ | ||
110 | C (c3,v3,w3,k3) \ | ||
111 | C (c4,v4,w4,k4) \ | ||
112 | e0 = c0 ^ (~c1 & c2); \ | ||
113 | E4(e1,e2,e3,e4) \ | ||
114 | |||
115 | // numBlocks != 0 | ||
116 | static | ||
117 | Z7_NO_INLINE | ||
118 | void Z7_FASTCALL Sha3_UpdateBlocks(UInt64 state[SHA3_NUM_STATE_WORDS], | ||
119 | const Byte *data, size_t numBlocks, size_t blockSize) | ||
120 | { | ||
121 | LS_25 (GET_state) | ||
122 | |||
123 | do | ||
124 | { | ||
125 | unsigned round; | ||
126 | XOR_4 ( 0, a50, a51, a52, a53) | ||
127 | XOR_4 ( 4, a54, a60, a61, a62) | ||
128 | XOR_1 ( 8, a63) | ||
129 | if (blockSize > 8 * 9) { XOR_4 ( 9, a64, a70, a71, a72) // sha3-384 | ||
130 | if (blockSize > 8 * 13) { XOR_4 (13, a73, a74, a80, a81) // sha3-256 | ||
131 | if (blockSize > 8 * 17) { XOR_1 (17, a82) // sha3-224 | ||
132 | if (blockSize > 8 * 18) { XOR_1 (18, a83) // shake128 | ||
133 | XOR_1 (19, a84) | ||
134 | XOR_1 (20, a90) }}}} | ||
135 | data += blockSize; | ||
136 | |||
137 | for (round = 0; round < 24; round += 2) | ||
138 | { | ||
139 | UInt64 c0, c1, c2, c3, c4; | ||
140 | UInt64 d0, d1, d2, d3, d4; | ||
141 | UInt64 e50, e51, e52, e53, e54; | ||
142 | UInt64 e60, e61, e62, e63, e64; | ||
143 | UInt64 e70, e71, e72, e73, e74; | ||
144 | UInt64 e80, e81, e82, e83, e84; | ||
145 | UInt64 e90, e91, e92, e93, e94; | ||
146 | |||
147 | c0 = a50^a60^a70^a80^a90; | ||
148 | c1 = a51^a61^a71^a81^a91; | ||
149 | c2 = a52^a62^a72^a82^a92; | ||
150 | c3 = a53^a63^a73^a83^a93; | ||
151 | c4 = a54^a64^a74^a84^a94; | ||
152 | D5 | ||
153 | CK( a50, d0, | ||
154 | a61, d1, 44, | ||
155 | a72, d2, 43, | ||
156 | a83, d3, 21, | ||
157 | a94, d4, 14, e50, e51, e52, e53, e54, SHA3_K_ARRAY[round]) | ||
158 | CE( a53, d3, 28, | ||
159 | a64, d4, 20, | ||
160 | a70, d0, 3, | ||
161 | a81, d1, 45, | ||
162 | a92, d2, 61, e60, e61, e62, e63, e64) | ||
163 | CE( a51, d1, 1, | ||
164 | a62, d2, 6, | ||
165 | a73, d3, 25, | ||
166 | a84, d4, 8, | ||
167 | a90, d0, 18, e70, e71, e72, e73, e74) | ||
168 | CE( a54, d4, 27, | ||
169 | a60, d0, 36, | ||
170 | a71, d1, 10, | ||
171 | a82, d2, 15, | ||
172 | a93, d3, 56, e80, e81, e82, e83, e84) | ||
173 | CE( a52, d2, 62, | ||
174 | a63, d3, 55, | ||
175 | a74, d4, 39, | ||
176 | a80, d0, 41, | ||
177 | a91, d1, 2, e90, e91, e92, e93, e94) | ||
178 | |||
179 | // ---------- ROUND + 1 ---------- | ||
180 | |||
181 | c0 = e50^e60^e70^e80^e90; | ||
182 | c1 = e51^e61^e71^e81^e91; | ||
183 | c2 = e52^e62^e72^e82^e92; | ||
184 | c3 = e53^e63^e73^e83^e93; | ||
185 | c4 = e54^e64^e74^e84^e94; | ||
186 | D5 | ||
187 | CK( e50, d0, | ||
188 | e61, d1, 44, | ||
189 | e72, d2, 43, | ||
190 | e83, d3, 21, | ||
191 | e94, d4, 14, a50, a51, a52, a53, a54, SHA3_K_ARRAY[(size_t)round + 1]) | ||
192 | CE( e53, d3, 28, | ||
193 | e64, d4, 20, | ||
194 | e70, d0, 3, | ||
195 | e81, d1, 45, | ||
196 | e92, d2, 61, a60, a61, a62, a63, a64) | ||
197 | CE( e51, d1, 1, | ||
198 | e62, d2, 6, | ||
199 | e73, d3, 25, | ||
200 | e84, d4, 8, | ||
201 | e90, d0, 18, a70, a71, a72, a73, a74) | ||
202 | CE (e54, d4, 27, | ||
203 | e60, d0, 36, | ||
204 | e71, d1, 10, | ||
205 | e82, d2, 15, | ||
206 | e93, d3, 56, a80, a81, a82, a83, a84) | ||
207 | CE (e52, d2, 62, | ||
208 | e63, d3, 55, | ||
209 | e74, d4, 39, | ||
210 | e80, d0, 41, | ||
211 | e91, d1, 2, a90, a91, a92, a93, a94) | ||
212 | } | ||
213 | } | ||
214 | while (--numBlocks); | ||
215 | |||
216 | LS_25 (SET_state) | ||
217 | } | ||
218 | |||
219 | |||
220 | #define Sha3_UpdateBlock(p) \ | ||
221 | Sha3_UpdateBlocks(p->state, p->buffer, 1, p->blockSize) | ||
222 | |||
223 | void Sha3_Update(CSha3 *p, const Byte *data, size_t size) | ||
224 | { | ||
225 | /* | ||
226 | for (;;) | ||
227 | { | ||
228 | if (size == 0) | ||
229 | return; | ||
230 | unsigned cur = p->blockSize - p->count; | ||
231 | if (cur > size) | ||
232 | cur = (unsigned)size; | ||
233 | size -= cur; | ||
234 | unsigned pos = p->count; | ||
235 | p->count = pos + cur; | ||
236 | while (pos & 7) | ||
237 | { | ||
238 | if (cur == 0) | ||
239 | return; | ||
240 | Byte *pb = &(((Byte *)p->state)[pos]); | ||
241 | *pb = (Byte)(*pb ^ *data++); | ||
242 | cur--; | ||
243 | pos++; | ||
244 | } | ||
245 | if (cur >= 8) | ||
246 | { | ||
247 | do | ||
248 | { | ||
249 | *(UInt64 *)(void *)&(((Byte *)p->state)[pos]) ^= GetUi64(data); | ||
250 | data += 8; | ||
251 | pos += 8; | ||
252 | cur -= 8; | ||
253 | } | ||
254 | while (cur >= 8); | ||
255 | } | ||
256 | if (pos != p->blockSize) | ||
257 | { | ||
258 | if (cur) | ||
259 | { | ||
260 | Byte *pb = &(((Byte *)p->state)[pos]); | ||
261 | do | ||
262 | { | ||
263 | *pb = (Byte)(*pb ^ *data++); | ||
264 | pb++; | ||
265 | } | ||
266 | while (--cur); | ||
267 | } | ||
268 | return; | ||
269 | } | ||
270 | Sha3_UpdateBlock(p->state); | ||
271 | p->count = 0; | ||
272 | } | ||
273 | */ | ||
274 | if (size == 0) | ||
275 | return; | ||
276 | { | ||
277 | const unsigned pos = p->count; | ||
278 | const unsigned num = p->blockSize - pos; | ||
279 | if (num > size) | ||
280 | { | ||
281 | p->count = pos + (unsigned)size; | ||
282 | memcpy(p->buffer + pos, data, size); | ||
283 | return; | ||
284 | } | ||
285 | if (pos != 0) | ||
286 | { | ||
287 | size -= num; | ||
288 | memcpy(p->buffer + pos, data, num); | ||
289 | data += num; | ||
290 | Sha3_UpdateBlock(p); | ||
291 | } | ||
292 | } | ||
293 | if (size >= p->blockSize) | ||
294 | { | ||
295 | const size_t numBlocks = size / p->blockSize; | ||
296 | const Byte *dataOld = data; | ||
297 | data += numBlocks * p->blockSize; | ||
298 | size = (size_t)(dataOld + size - data); | ||
299 | Sha3_UpdateBlocks(p->state, dataOld, numBlocks, p->blockSize); | ||
300 | } | ||
301 | p->count = (unsigned)size; | ||
302 | if (size) | ||
303 | memcpy(p->buffer, data, size); | ||
304 | } | ||
305 | |||
306 | |||
307 | // we support only (digestSize % 4 == 0) cases | ||
308 | void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake) | ||
309 | { | ||
310 | memset(p->buffer + p->count, 0, p->blockSize - p->count); | ||
311 | // we write the bit markers from low to high in the current byte: | ||
312 | // - if sha-3 : 2 bits : 0,1 | ||
313 | // - if shake : 4 bits : 1111 | ||
314 | // then we write a 1 bit to the same byte (the first padding bit). | ||
315 | // And we write a 1 bit to the highest bit of the last byte of the block. | ||
316 | p->buffer[p->count] = (Byte)(shake ? 0x1f : 0x06); | ||
317 | // we need the xor operation (^= 0x80) here because the 0x80 bit must be written | ||
318 | // to the same byte as (0x1f : 0x06) if (p->count == p->blockSize - 1) !!! | ||
319 | p->buffer[p->blockSize - 1] ^= 0x80; | ||
320 | /* | ||
321 | ((Byte *)p->state)[p->count] ^= (Byte)(shake ? 0x1f : 0x06); | ||
322 | ((Byte *)p->state)[p->blockSize - 1] ^= 0x80; | ||
323 | */ | ||
324 | Sha3_UpdateBlock(p); | ||
325 | #if 1 && defined(MY_CPU_LE) | ||
326 | memcpy(digest, p->state, digestSize); | ||
327 | #else | ||
328 | { | ||
329 | const unsigned numWords = digestSize >> 3; | ||
330 | unsigned i; | ||
331 | for (i = 0; i < numWords; i++) | ||
332 | { | ||
333 | const UInt64 v = p->state[i]; | ||
334 | SetUi64(digest, v) | ||
335 | digest += 8; | ||
336 | } | ||
337 | if (digestSize & 4) // for SHA3-224 | ||
338 | { | ||
339 | const UInt32 v = (UInt32)p->state[numWords]; | ||
340 | SetUi32(digest, v) | ||
341 | } | ||
342 | } | ||
343 | #endif | ||
344 | Sha3_Init(p); | ||
345 | } | ||
346 | |||
347 | #undef GET_state | ||
348 | #undef SET_state | ||
349 | #undef LS_5 | ||
350 | #undef LS_25 | ||
351 | #undef XOR_1 | ||
352 | #undef XOR_4 | ||
353 | #undef D | ||
354 | #undef D5 | ||
355 | #undef C0 | ||
356 | #undef C | ||
357 | #undef E4 | ||
358 | #undef CK | ||
359 | #undef CE | ||
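The padding written by Sha3_Final above can be condensed into a few lines; this hypothetical helper (not in the sources) mirrors it exactly:

  #include <string.h>

  /* SHA-3: suffix bits 01 plus the first pad bit 1  -> 0x06 in the first pad byte
     SHAKE: suffix bits 1111 plus the first pad bit 1 -> 0x1f in the first pad byte
     the last pad bit 1 goes into the top bit of the final byte (0x80),
     xor-ed because both bytes coincide when count == blockSize - 1 */
  static void sha3_pad_block(unsigned char *block, unsigned count,
      unsigned blockSize, int shake)
  {
    memset(block + count, 0, blockSize - count);
    block[count] = (unsigned char)(shake ? 0x1f : 0x06);
    block[blockSize - 1] ^= 0x80;
  }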
diff --git a/C/Sha3.h b/C/Sha3.h new file mode 100644 index 0000000..c5909c9 --- /dev/null +++ b/C/Sha3.h | |||
@@ -0,0 +1,36 @@ | |||
1 | /* Sha3.h -- SHA-3 Hash | ||
2 | : Igor Pavlov : Public domain */ | ||
3 | |||
4 | #ifndef ZIP7_INC_SHA3_H | ||
5 | #define ZIP7_INC_SHA3_H | ||
6 | |||
7 | #include "7zTypes.h" | ||
8 | |||
9 | EXTERN_C_BEGIN | ||
10 | |||
11 | #define SHA3_NUM_STATE_WORDS 25 | ||
12 | |||
13 | #define SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(digestSize) \ | ||
14 | (SHA3_NUM_STATE_WORDS * 8 - (digestSize) * 2) | ||
15 | |||
16 | typedef struct | ||
17 | { | ||
18 | UInt32 count; // < blockSize | ||
19 | UInt32 blockSize; // <= SHA3_NUM_STATE_WORDS * 8 | ||
20 | UInt64 _pad1[3]; | ||
21 | // we want 32-byte alignment here | ||
22 | UInt64 state[SHA3_NUM_STATE_WORDS]; | ||
23 | UInt64 _pad2[3]; | ||
24 | // we want 64-byte alignment here | ||
25 | Byte buffer[SHA3_NUM_STATE_WORDS * 8]; // last bytes will be unused with predefined blockSize values | ||
26 | } CSha3; | ||
27 | |||
28 | #define Sha3_SET_blockSize(p, blockSize) { (p)->blockSize = (blockSize); } | ||
29 | |||
30 | void Sha3_Init(CSha3 *p); | ||
31 | void Sha3_Update(CSha3 *p, const Byte *data, size_t size); | ||
32 | void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake); | ||
33 | |||
34 | EXTERN_C_END | ||
35 | |||
36 | #endif | ||
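A usage sketch for this header (hypothetical caller code, not from the sources). SHA3-256 has a 32-byte digest, so its rate is SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(32) = 25 * 8 - 2 * 32 = 136 bytes:

  #include "Sha3.h"

  static void Sha3_256_OneShot(const Byte *data, size_t size, Byte digest[32])
  {
    CSha3 p;
    Sha3_SET_blockSize(&p, SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(32))
    Sha3_Init(&p);
    Sha3_Update(&p, data, size);
    Sha3_Final(&p, digest, 32, 0); // shake = 0 selects SHA-3 padding (0x06)
  }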
diff --git a/C/Sha512.c b/C/Sha512.c new file mode 100644 index 0000000..f0787fd --- /dev/null +++ b/C/Sha512.c | |||
@@ -0,0 +1,711 @@ | |||
1 | /* Sha512.c -- SHA-512 Hash | ||
2 | : Igor Pavlov : Public domain | ||
3 | This code is based on public domain code from Wei Dai's Crypto++ library. */ | ||
4 | |||
5 | #include "Precomp.h" | ||
6 | |||
7 | #include <string.h> | ||
8 | |||
9 | #include "Sha512.h" | ||
10 | #include "RotateDefs.h" | ||
11 | #include "CpuArch.h" | ||
12 | |||
13 | #ifdef MY_CPU_X86_OR_AMD64 | ||
14 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \ | ||
15 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \ | ||
16 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000) \ | ||
17 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) \ | ||
18 | || defined(_MSC_VER) && (_MSC_VER >= 1940) | ||
19 | #define Z7_COMPILER_SHA512_SUPPORTED | ||
20 | #endif | ||
21 | #elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE) | ||
22 | #if defined(__ARM_FEATURE_SHA512) | ||
23 | #define Z7_COMPILER_SHA512_SUPPORTED | ||
24 | #else | ||
25 | #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \ | ||
26 | || defined(__GNUC__) && (__GNUC__ >= 9) \ | ||
27 | ) \ | ||
28 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it | ||
29 | #define Z7_COMPILER_SHA512_SUPPORTED | ||
30 | #endif | ||
31 | #endif | ||
32 | #endif | ||
33 | |||
34 | |||
35 | |||
36 | |||
37 | |||
38 | |||
39 | |||
40 | |||
41 | |||
42 | |||
43 | |||
44 | |||
45 | |||
46 | |||
47 | void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
48 | |||
49 | #ifdef Z7_COMPILER_SHA512_SUPPORTED | ||
50 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
51 | |||
52 | static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS = Sha512_UpdateBlocks; | ||
53 | static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS_HW; | ||
54 | |||
55 | #define SHA512_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks | ||
56 | #else | ||
57 | #define SHA512_UPDATE_BLOCKS(p) Sha512_UpdateBlocks | ||
58 | #endif | ||
59 | |||
60 | |||
61 | BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo) | ||
62 | { | ||
63 | SHA512_FUNC_UPDATE_BLOCKS func = Sha512_UpdateBlocks; | ||
64 | |||
65 | #ifdef Z7_COMPILER_SHA512_SUPPORTED | ||
66 | if (algo != SHA512_ALGO_SW) | ||
67 | { | ||
68 | if (algo == SHA512_ALGO_DEFAULT) | ||
69 | func = g_SHA512_FUNC_UPDATE_BLOCKS; | ||
70 | else | ||
71 | { | ||
72 | if (algo != SHA512_ALGO_HW) | ||
73 | return False; | ||
74 | func = g_SHA512_FUNC_UPDATE_BLOCKS_HW; | ||
75 | if (!func) | ||
76 | return False; | ||
77 | } | ||
78 | } | ||
79 | #else | ||
80 | if (algo > 1) | ||
81 | return False; | ||
82 | #endif | ||
83 | |||
84 | p->v.vars.func_UpdateBlocks = func; | ||
85 | return True; | ||
86 | } | ||
87 | |||
88 | |||
89 | /* define it for speed optimization */ | ||
90 | |||
91 | #if 0 // 1 for size optimization | ||
92 | #define STEP_PRE 1 | ||
93 | #define STEP_MAIN 1 | ||
94 | #else | ||
95 | #define STEP_PRE 2 | ||
96 | #define STEP_MAIN 4 | ||
97 | // #define Z7_SHA512_UNROLL | ||
98 | #endif | ||
99 | |||
100 | #undef Z7_SHA512_BIG_W | ||
101 | #if STEP_MAIN != 16 | ||
102 | #define Z7_SHA512_BIG_W | ||
103 | #endif | ||
104 | |||
105 | |||
106 | #define U64C(x) UINT64_CONST(x) | ||
107 | |||
108 | static MY_ALIGN(64) const UInt64 SHA512_INIT_ARRAYS[4][8] = { | ||
109 | { U64C(0x8c3d37c819544da2), U64C(0x73e1996689dcd4d6), U64C(0x1dfab7ae32ff9c82), U64C(0x679dd514582f9fcf), | ||
110 | U64C(0x0f6d2b697bd44da8), U64C(0x77e36f7304c48942), U64C(0x3f9d85a86a1d36c8), U64C(0x1112e6ad91d692a1) | ||
111 | }, | ||
112 | { U64C(0x22312194fc2bf72c), U64C(0x9f555fa3c84c64c2), U64C(0x2393b86b6f53b151), U64C(0x963877195940eabd), | ||
113 | U64C(0x96283ee2a88effe3), U64C(0xbe5e1e2553863992), U64C(0x2b0199fc2c85b8aa), U64C(0x0eb72ddc81c52ca2) | ||
114 | }, | ||
115 | { U64C(0xcbbb9d5dc1059ed8), U64C(0x629a292a367cd507), U64C(0x9159015a3070dd17), U64C(0x152fecd8f70e5939), | ||
116 | U64C(0x67332667ffc00b31), U64C(0x8eb44a8768581511), U64C(0xdb0c2e0d64f98fa7), U64C(0x47b5481dbefa4fa4) | ||
117 | }, | ||
118 | { U64C(0x6a09e667f3bcc908), U64C(0xbb67ae8584caa73b), U64C(0x3c6ef372fe94f82b), U64C(0xa54ff53a5f1d36f1), | ||
119 | U64C(0x510e527fade682d1), U64C(0x9b05688c2b3e6c1f), U64C(0x1f83d9abfb41bd6b), U64C(0x5be0cd19137e2179) | ||
120 | }}; | ||
121 | |||
122 | void Sha512_InitState(CSha512 *p, unsigned digestSize) | ||
123 | { | ||
124 | p->v.vars.count = 0; | ||
125 | memcpy(p->state, SHA512_INIT_ARRAYS[(size_t)(digestSize >> 4) - 1], sizeof(p->state)); | ||
126 | } | ||
127 | |||
128 | void Sha512_Init(CSha512 *p, unsigned digestSize) | ||
129 | { | ||
130 | p->v.vars.func_UpdateBlocks = | ||
131 | #ifdef Z7_COMPILER_SHA512_SUPPORTED | ||
132 | g_SHA512_FUNC_UPDATE_BLOCKS; | ||
133 | #else | ||
134 | NULL; | ||
135 | #endif | ||
136 | Sha512_InitState(p, digestSize); | ||
137 | } | ||
138 | |||
139 | #define S0(x) (Z7_ROTR64(x,28) ^ Z7_ROTR64(x,34) ^ Z7_ROTR64(x,39)) | ||
140 | #define S1(x) (Z7_ROTR64(x,14) ^ Z7_ROTR64(x,18) ^ Z7_ROTR64(x,41)) | ||
141 | #define s0(x) (Z7_ROTR64(x, 1) ^ Z7_ROTR64(x, 8) ^ (x >> 7)) | ||
142 | #define s1(x) (Z7_ROTR64(x,19) ^ Z7_ROTR64(x,61) ^ (x >> 6)) | ||
143 | |||
144 | #define Ch(x,y,z) (z^(x&(y^z))) | ||
145 | #define Maj(x,y,z) ((x&y)|(z&(x|y))) | ||
146 | |||
147 | |||
148 | #define W_PRE(i) (W[(i) + (size_t)(j)] = GetBe64(data + ((size_t)(j) + i) * 8)) | ||
149 | |||
150 | #define blk2_main(j, i) s1(w(j, (i)-2)) + w(j, (i)-7) + s0(w(j, (i)-15)) | ||
151 | |||
152 | #ifdef Z7_SHA512_BIG_W | ||
153 | // we use +i instead of +(i) to change the evaluation order and avoid a CLANG signed/unsigned warning. | ||
154 | #define w(j, i) W[(size_t)(j) + i] | ||
155 | #define blk2(j, i) (w(j, i) = w(j, (i)-16) + blk2_main(j, i)) | ||
156 | #else | ||
157 | #if STEP_MAIN == 16 | ||
158 | #define w(j, i) W[(i) & 15] | ||
159 | #else | ||
160 | #define w(j, i) W[((size_t)(j) + (i)) & 15] | ||
161 | #endif | ||
162 | #define blk2(j, i) (w(j, i) += blk2_main(j, i)) | ||
163 | #endif | ||
164 | |||
165 | #define W_MAIN(i) blk2(j, i) | ||
166 | |||
167 | |||
168 | #define T1(wx, i) \ | ||
169 | tmp = h + S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ | ||
170 | h = g; \ | ||
171 | g = f; \ | ||
172 | f = e; \ | ||
173 | e = d + tmp; \ | ||
174 | tmp += S0(a) + Maj(a, b, c); \ | ||
175 | d = c; \ | ||
176 | c = b; \ | ||
177 | b = a; \ | ||
178 | a = tmp; \ | ||
179 | |||
180 | #define R1_PRE(i) T1( W_PRE, i) | ||
181 | #define R1_MAIN(i) T1( W_MAIN, i) | ||
182 | |||
183 | #if (!defined(Z7_SHA512_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4) | ||
184 | #define R2_MAIN(i) \ | ||
185 | R1_MAIN(i) \ | ||
186 | R1_MAIN(i + 1) \ | ||
187 | |||
188 | #endif | ||
189 | |||
190 | |||
191 | |||
192 | #if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8 | ||
193 | |||
194 | #define T4( a,b,c,d,e,f,g,h, wx, i) \ | ||
195 | h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ | ||
196 | tmp = h; \ | ||
197 | h += d; \ | ||
198 | d = tmp + S0(a) + Maj(a, b, c); \ | ||
199 | |||
200 | #define R4( wx, i) \ | ||
201 | T4 ( a,b,c,d,e,f,g,h, wx, (i )); \ | ||
202 | T4 ( d,a,b,c,h,e,f,g, wx, (i+1)); \ | ||
203 | T4 ( c,d,a,b,g,h,e,f, wx, (i+2)); \ | ||
204 | T4 ( b,c,d,a,f,g,h,e, wx, (i+3)); \ | ||
205 | |||
206 | #define R4_PRE(i) R4( W_PRE, i) | ||
207 | #define R4_MAIN(i) R4( W_MAIN, i) | ||
208 | |||
209 | |||
210 | #define T8( a,b,c,d,e,f,g,h, wx, i) \ | ||
211 | h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ | ||
212 | d += h; \ | ||
213 | h += S0(a) + Maj(a, b, c); \ | ||
214 | |||
215 | #define R8( wx, i) \ | ||
216 | T8 ( a,b,c,d,e,f,g,h, wx, i ); \ | ||
217 | T8 ( h,a,b,c,d,e,f,g, wx, i+1); \ | ||
218 | T8 ( g,h,a,b,c,d,e,f, wx, i+2); \ | ||
219 | T8 ( f,g,h,a,b,c,d,e, wx, i+3); \ | ||
220 | T8 ( e,f,g,h,a,b,c,d, wx, i+4); \ | ||
221 | T8 ( d,e,f,g,h,a,b,c, wx, i+5); \ | ||
222 | T8 ( c,d,e,f,g,h,a,b, wx, i+6); \ | ||
223 | T8 ( b,c,d,e,f,g,h,a, wx, i+7); \ | ||
224 | |||
225 | #define R8_PRE(i) R8( W_PRE, i) | ||
226 | #define R8_MAIN(i) R8( W_MAIN, i) | ||
227 | |||
228 | #endif | ||
229 | |||
230 | |||
231 | extern | ||
232 | MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80]; | ||
233 | MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80] = { | ||
234 | U64C(0x428a2f98d728ae22), U64C(0x7137449123ef65cd), U64C(0xb5c0fbcfec4d3b2f), U64C(0xe9b5dba58189dbbc), | ||
235 | U64C(0x3956c25bf348b538), U64C(0x59f111f1b605d019), U64C(0x923f82a4af194f9b), U64C(0xab1c5ed5da6d8118), | ||
236 | U64C(0xd807aa98a3030242), U64C(0x12835b0145706fbe), U64C(0x243185be4ee4b28c), U64C(0x550c7dc3d5ffb4e2), | ||
237 | U64C(0x72be5d74f27b896f), U64C(0x80deb1fe3b1696b1), U64C(0x9bdc06a725c71235), U64C(0xc19bf174cf692694), | ||
238 | U64C(0xe49b69c19ef14ad2), U64C(0xefbe4786384f25e3), U64C(0x0fc19dc68b8cd5b5), U64C(0x240ca1cc77ac9c65), | ||
239 | U64C(0x2de92c6f592b0275), U64C(0x4a7484aa6ea6e483), U64C(0x5cb0a9dcbd41fbd4), U64C(0x76f988da831153b5), | ||
240 | U64C(0x983e5152ee66dfab), U64C(0xa831c66d2db43210), U64C(0xb00327c898fb213f), U64C(0xbf597fc7beef0ee4), | ||
241 | U64C(0xc6e00bf33da88fc2), U64C(0xd5a79147930aa725), U64C(0x06ca6351e003826f), U64C(0x142929670a0e6e70), | ||
242 | U64C(0x27b70a8546d22ffc), U64C(0x2e1b21385c26c926), U64C(0x4d2c6dfc5ac42aed), U64C(0x53380d139d95b3df), | ||
243 | U64C(0x650a73548baf63de), U64C(0x766a0abb3c77b2a8), U64C(0x81c2c92e47edaee6), U64C(0x92722c851482353b), | ||
244 | U64C(0xa2bfe8a14cf10364), U64C(0xa81a664bbc423001), U64C(0xc24b8b70d0f89791), U64C(0xc76c51a30654be30), | ||
245 | U64C(0xd192e819d6ef5218), U64C(0xd69906245565a910), U64C(0xf40e35855771202a), U64C(0x106aa07032bbd1b8), | ||
246 | U64C(0x19a4c116b8d2d0c8), U64C(0x1e376c085141ab53), U64C(0x2748774cdf8eeb99), U64C(0x34b0bcb5e19b48a8), | ||
247 | U64C(0x391c0cb3c5c95a63), U64C(0x4ed8aa4ae3418acb), U64C(0x5b9cca4f7763e373), U64C(0x682e6ff3d6b2b8a3), | ||
248 | U64C(0x748f82ee5defb2fc), U64C(0x78a5636f43172f60), U64C(0x84c87814a1f0ab72), U64C(0x8cc702081a6439ec), | ||
249 | U64C(0x90befffa23631e28), U64C(0xa4506cebde82bde9), U64C(0xbef9a3f7b2c67915), U64C(0xc67178f2e372532b), | ||
250 | U64C(0xca273eceea26619c), U64C(0xd186b8c721c0c207), U64C(0xeada7dd6cde0eb1e), U64C(0xf57d4f7fee6ed178), | ||
251 | U64C(0x06f067aa72176fba), U64C(0x0a637dc5a2c898a6), U64C(0x113f9804bef90dae), U64C(0x1b710b35131c471b), | ||
252 | U64C(0x28db77f523047d84), U64C(0x32caab7b40c72493), U64C(0x3c9ebe0a15c9bebc), U64C(0x431d67c49c100d4c), | ||
253 | U64C(0x4cc5d4becb3e42b6), U64C(0x597f299cfc657e2a), U64C(0x5fcb6fab3ad6faec), U64C(0x6c44198c4a475817) | ||
254 | }; | ||
255 | |||
256 | #define K SHA512_K_ARRAY | ||
257 | |||
258 | Z7_NO_INLINE | ||
259 | void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks) | ||
260 | { | ||
261 | UInt64 W | ||
262 | #ifdef Z7_SHA512_BIG_W | ||
263 | [80]; | ||
264 | #else | ||
265 | [16]; | ||
266 | #endif | ||
267 | unsigned j; | ||
268 | UInt64 a,b,c,d,e,f,g,h; | ||
269 | #if !defined(Z7_SHA512_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) | ||
270 | UInt64 tmp; | ||
271 | #endif | ||
272 | |||
273 | if (numBlocks == 0) return; | ||
274 | |||
275 | a = state[0]; | ||
276 | b = state[1]; | ||
277 | c = state[2]; | ||
278 | d = state[3]; | ||
279 | e = state[4]; | ||
280 | f = state[5]; | ||
281 | g = state[6]; | ||
282 | h = state[7]; | ||
283 | |||
284 | do | ||
285 | { | ||
286 | |||
287 | for (j = 0; j < 16; j += STEP_PRE) | ||
288 | { | ||
289 | #if STEP_PRE > 4 | ||
290 | |||
291 | #if STEP_PRE < 8 | ||
292 | R4_PRE(0); | ||
293 | #else | ||
294 | R8_PRE(0); | ||
295 | #if STEP_PRE == 16 | ||
296 | R8_PRE(8); | ||
297 | #endif | ||
298 | #endif | ||
299 | |||
300 | #else | ||
301 | |||
302 | R1_PRE(0) | ||
303 | #if STEP_PRE >= 2 | ||
304 | R1_PRE(1) | ||
305 | #if STEP_PRE >= 4 | ||
306 | R1_PRE(2) | ||
307 | R1_PRE(3) | ||
308 | #endif | ||
309 | #endif | ||
310 | |||
311 | #endif | ||
312 | } | ||
313 | |||
314 | for (j = 16; j < 80; j += STEP_MAIN) | ||
315 | { | ||
316 | #if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8 | ||
317 | |||
318 | #if STEP_MAIN < 8 | ||
319 | R4_MAIN(0) | ||
320 | #else | ||
321 | R8_MAIN(0) | ||
322 | #if STEP_MAIN == 16 | ||
323 | R8_MAIN(8) | ||
324 | #endif | ||
325 | #endif | ||
326 | |||
327 | #else | ||
328 | |||
329 | R1_MAIN(0) | ||
330 | #if STEP_MAIN >= 2 | ||
331 | R1_MAIN(1) | ||
332 | #if STEP_MAIN >= 4 | ||
333 | R2_MAIN(2) | ||
334 | #if STEP_MAIN >= 8 | ||
335 | R2_MAIN(4) | ||
336 | R2_MAIN(6) | ||
337 | #if STEP_MAIN >= 16 | ||
338 | R2_MAIN(8) | ||
339 | R2_MAIN(10) | ||
340 | R2_MAIN(12) | ||
341 | R2_MAIN(14) | ||
342 | #endif | ||
343 | #endif | ||
344 | #endif | ||
345 | #endif | ||
346 | #endif | ||
347 | } | ||
348 | |||
349 | a += state[0]; state[0] = a; | ||
350 | b += state[1]; state[1] = b; | ||
351 | c += state[2]; state[2] = c; | ||
352 | d += state[3]; state[3] = d; | ||
353 | e += state[4]; state[4] = e; | ||
354 | f += state[5]; state[5] = f; | ||
355 | g += state[6]; state[6] = g; | ||
356 | h += state[7]; state[7] = h; | ||
357 | |||
358 | data += SHA512_BLOCK_SIZE; | ||
359 | } | ||
360 | while (--numBlocks); | ||
361 | } | ||
362 | |||
363 | |||
364 | #define Sha512_UpdateBlock(p) SHA512_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) | ||
365 | |||
366 | void Sha512_Update(CSha512 *p, const Byte *data, size_t size) | ||
367 | { | ||
368 | if (size == 0) | ||
369 | return; | ||
370 | { | ||
371 | const unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1); | ||
372 | const unsigned num = SHA512_BLOCK_SIZE - pos; | ||
373 | p->v.vars.count += size; | ||
374 | if (num > size) | ||
375 | { | ||
376 | memcpy(p->buffer + pos, data, size); | ||
377 | return; | ||
378 | } | ||
379 | if (pos != 0) | ||
380 | { | ||
381 | size -= num; | ||
382 | memcpy(p->buffer + pos, data, num); | ||
383 | data += num; | ||
384 | Sha512_UpdateBlock(p); | ||
385 | } | ||
386 | } | ||
387 | { | ||
388 | const size_t numBlocks = size >> 7; | ||
389 | // if (numBlocks) | ||
390 | SHA512_UPDATE_BLOCKS(p)(p->state, data, numBlocks); | ||
391 | size &= SHA512_BLOCK_SIZE - 1; | ||
392 | if (size == 0) | ||
393 | return; | ||
394 | data += (numBlocks << 7); | ||
395 | memcpy(p->buffer, data, size); | ||
396 | } | ||
397 | } | ||
398 | |||
399 | |||
400 | void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize) | ||
401 | { | ||
402 | unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1); | ||
403 | p->buffer[pos++] = 0x80; | ||
404 | if (pos > (SHA512_BLOCK_SIZE - 8 * 2)) | ||
405 | { | ||
406 | while (pos != SHA512_BLOCK_SIZE) { p->buffer[pos++] = 0; } | ||
407 | // memset(&p->buf.buffer[pos], 0, SHA512_BLOCK_SIZE - pos); | ||
408 | Sha512_UpdateBlock(p); | ||
409 | pos = 0; | ||
410 | } | ||
411 | memset(&p->buffer[pos], 0, (SHA512_BLOCK_SIZE - 8 * 2) - pos); | ||
412 | { | ||
413 | const UInt64 numBits = p->v.vars.count << 3; | ||
414 | SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 2, 0) // = (p->v.vars.count >> (64 - 3)); (high 64-bits) | ||
415 | SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 1, numBits) | ||
416 | } | ||
417 | Sha512_UpdateBlock(p); | ||
418 | #if 1 && defined(MY_CPU_BE) | ||
419 | memcpy(digest, p->state, digestSize); | ||
420 | #else | ||
421 | { | ||
422 | const unsigned numWords = digestSize >> 3; | ||
423 | unsigned i; | ||
424 | for (i = 0; i < numWords; i++) | ||
425 | { | ||
426 | const UInt64 v = p->state[i]; | ||
427 | SetBe64(digest, v) | ||
428 | digest += 8; | ||
429 | } | ||
430 | if (digestSize & 4) // digestSize == SHA512_224_DIGEST_SIZE | ||
431 | { | ||
432 | const UInt32 v = (UInt32)((p->state[numWords]) >> 32); | ||
433 | SetBe32(digest, v) | ||
434 | } | ||
435 | } | ||
436 | #endif | ||
437 | Sha512_InitState(p, digestSize); | ||
438 | } | ||
439 | |||
440 | |||
441 | |||
442 | // #define Z7_SHA512_PROBE_DEBUG // for debug | ||
443 | |||
444 | #if defined(Z7_SHA512_PROBE_DEBUG) || defined(Z7_COMPILER_SHA512_SUPPORTED) | ||
445 | |||
446 | #if defined(Z7_SHA512_PROBE_DEBUG) \ | ||
447 | || defined(_WIN32) && defined(MY_CPU_ARM64) | ||
448 | #ifndef Z7_SHA512_USE_PROBE | ||
449 | #define Z7_SHA512_USE_PROBE | ||
450 | #endif | ||
451 | #endif | ||
452 | |||
453 | #ifdef Z7_SHA512_USE_PROBE | ||
454 | |||
455 | #ifdef Z7_SHA512_PROBE_DEBUG | ||
456 | #include <stdio.h> | ||
457 | #define PRF(x) x | ||
458 | #else | ||
459 | #define PRF(x) | ||
460 | #endif | ||
461 | |||
462 | #if 0 || !defined(_MSC_VER) // 1 || : for debug LONGJMP mode | ||
463 | // MINGW doesn't support __try. So we use signal() / longjmp(). | ||
464 | // Note: signal() / longjmp() is probably not thread-safe. | ||
465 | // So we must call Sha512Prepare() from the main thread at program start. | ||
466 | #ifndef Z7_SHA512_USE_LONGJMP | ||
467 | #define Z7_SHA512_USE_LONGJMP | ||
468 | #endif | ||
469 | #endif | ||
470 | |||
471 | #ifdef Z7_SHA512_USE_LONGJMP | ||
472 | #include <signal.h> | ||
473 | #include <setjmp.h> | ||
474 | static jmp_buf g_Sha512_jmp_buf; | ||
475 | // static int g_Sha512_Unsupported; | ||
476 | |||
477 | #if defined(__GNUC__) && (__GNUC__ >= 8) \ | ||
478 | || defined(__clang__) && (__clang_major__ >= 3) | ||
479 | __attribute__((noreturn)) | ||
480 | #endif | ||
481 | static void Z7_CDECL Sha512_signal_Handler(int v) | ||
482 | { | ||
483 | PRF(printf("======== Sha512_signal_Handler = %x\n", (unsigned)v);) | ||
484 | // g_Sha512_Unsupported = 1; | ||
485 | longjmp(g_Sha512_jmp_buf, 1); | ||
486 | } | ||
487 | #endif // Z7_SHA512_USE_LONGJMP | ||
488 | |||
489 | |||
490 | #if defined(_WIN32) | ||
491 | #include "7zWindows.h" | ||
492 | #endif | ||
493 | |||
494 | #if defined(MY_CPU_ARM64) | ||
495 | // #define Z7_SHA512_USE_SIMPLIFIED_PROBE // for debug | ||
496 | #endif | ||
497 | |||
498 | #ifdef Z7_SHA512_USE_SIMPLIFIED_PROBE | ||
499 | #include <arm_neon.h> | ||
500 | #if defined(__clang__) | ||
501 | __attribute__((__target__("sha3"))) | ||
502 | #elif !defined(_MSC_VER) | ||
503 | __attribute__((__target__("arch=armv8.2-a+sha3"))) | ||
504 | #endif | ||
505 | #endif | ||
506 | static BoolInt CPU_IsSupported_SHA512_Probe(void) | ||
507 | { | ||
508 | PRF(printf("\n== CPU_IsSupported_SHA512_Probe\n");) | ||
509 | #if defined(_WIN32) && defined(MY_CPU_ARM64) | ||
510 | // there is still no SHA512 flag for IsProcessorFeaturePresent(). | ||
511 | if (!CPU_IsSupported_CRYPTO()) | ||
512 | return False; | ||
513 | PRF(printf("==== Registry check\n");) | ||
514 | { | ||
515 | // we can't read the ID_AA64ISAR0_EL1 register from an application, | ||
516 | // but the ID_AA64ISAR0_EL1 register is mapped to the "CP 4030" registry value. | ||
517 | HKEY key = NULL; | ||
518 | LONG res = RegOpenKeyEx(HKEY_LOCAL_MACHINE, | ||
519 | TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"), | ||
520 | 0, KEY_READ, &key); | ||
521 | if (res != ERROR_SUCCESS) | ||
522 | return False; | ||
523 | { | ||
524 | DWORD type = 0; | ||
525 | DWORD count = sizeof(UInt64); | ||
526 | UInt64 val = 0; | ||
527 | res = RegQueryValueEx(key, TEXT("CP 4030"), NULL, | ||
528 | &type, (LPBYTE)&val, &count); | ||
529 | RegCloseKey(key); | ||
530 | if (res != ERROR_SUCCESS | ||
531 | || type != REG_QWORD | ||
532 | || count != sizeof(UInt64) | ||
533 | || ((unsigned)(val >> 12) & 0xf) != 2) | ||
534 | return False; | ||
535 | // we parse SHA2 field of ID_AA64ISAR0_EL1 register: | ||
536 | // 0 : No SHA2 instructions implemented | ||
537 | // 1 : SHA256 implemented | ||
538 | // 2 : SHA256 and SHA512 implemented | ||
539 | } | ||
540 | } | ||
541 | #endif // defined(_WIN32) && defined(MY_CPU_ARM64) | ||
542 | |||
543 | |||
544 | #if 1 // 0 for debug to disable SHA512 PROBE code | ||
545 | |||
546 | /* | ||
547 | ----- SHA512 PROBE ----- | ||
548 | |||
549 | We suppose that reading the "CP 4030" registry value is enough. | ||
550 | But we use additional SHA512 PROBE code, because here we can | ||
551 | catch an exception, while we don't catch exceptions | ||
552 | if we call the Sha512 functions from the main code. | ||
553 | |||
554 | NOTE: the arm64 PROBE code doesn't work if we call it via Wine in linux-arm64. | ||
555 | The program just stops. | ||
556 | Also the x64 version of the PROBE code doesn't work if we run it via the Intel SDE emulator | ||
557 | without SHA512 support (-skl switch). | ||
558 | The program stops, and we get a message from SDE: | ||
559 | TID 0 SDE-ERROR: Executed instruction not valid for specified chip (SKYLAKE): vsha512msg1 | ||
560 | But we still want to catch that exception instead of the process stopping. | ||
561 | Does this PROBE code work in native Windows-arm64 (with/without sha512 hw instructions)? | ||
562 | Are there any ways to fix the problems in the arm64-wine and x64-SDE cases? | ||
563 | */ | ||
564 | |||
565 | PRF(printf("==== CPU_IsSupported_SHA512 PROBE\n");) | ||
566 | { | ||
567 | BoolInt isSupported = False; | ||
568 | #ifdef Z7_SHA512_USE_LONGJMP | ||
569 | void (Z7_CDECL *signal_prev)(int); | ||
570 | /* | ||
571 | if (g_Sha512_Unsupported) | ||
572 | { | ||
573 | PRF(printf("==== g_Sha512_Unsupported\n");) | ||
574 | return False; | ||
575 | } | ||
576 | */ | ||
577 | PRF(printf("====== signal(SIGILL)\n");) | ||
578 | signal_prev = signal(SIGILL, Sha512_signal_Handler); | ||
579 | if (signal_prev == SIG_ERR) | ||
580 | { | ||
581 | PRF(printf("====== signal fail\n");) | ||
582 | return False; | ||
583 | } | ||
584 | // PRF(printf("==== signal_prev = %p\n", (void *)signal_prev);) | ||
585 | // docs: Before the specified function is executed, | ||
586 | // the value of func is set to SIG_DFL. | ||
587 | // So we can exit if (setjmp(g_Sha512_jmp_buf) != 0). | ||
588 | PRF(printf("====== setjmp\n");) | ||
589 | if (!setjmp(g_Sha512_jmp_buf)) | ||
590 | #else // Z7_SHA512_USE_LONGJMP | ||
591 | |||
592 | #ifdef _MSC_VER | ||
593 | #ifdef __clang_major__ | ||
594 | #pragma GCC diagnostic ignored "-Wlanguage-extension-token" | ||
595 | #endif | ||
596 | __try | ||
597 | #endif | ||
598 | #endif // Z7_SHA512_USE_LONGJMP | ||
599 | |||
600 | { | ||
601 | #if defined(Z7_COMPILER_SHA512_SUPPORTED) | ||
602 | #ifdef Z7_SHA512_USE_SIMPLIFIED_PROBE | ||
603 | // simplified sha512 check for arm64: | ||
604 | const uint64x2_t a = vdupq_n_u64(1); | ||
605 | const uint64x2_t b = vsha512hq_u64(a, a, a); | ||
606 | PRF(printf("======== vsha512hq_u64 probe\n");) | ||
607 | if ((UInt32)vgetq_lane_u64(b, 0) == 0x11800002) | ||
608 | #else | ||
609 | MY_ALIGN(16) | ||
610 | UInt64 temp[SHA512_NUM_DIGEST_WORDS + SHA512_NUM_BLOCK_WORDS]; | ||
611 | memset(temp, 0x5a, sizeof(temp)); | ||
612 | PRF(printf("======== Sha512_UpdateBlocks_HW\n");) | ||
613 | Sha512_UpdateBlocks_HW(temp, | ||
614 | (const Byte *)(const void *)(temp + SHA512_NUM_DIGEST_WORDS), 1); | ||
615 | // PRF(printf("======== t = %x\n", (UInt32)temp[0]);) | ||
616 | if ((UInt32)temp[0] == 0xa33cfdf7) | ||
617 | #endif | ||
618 | { | ||
619 | PRF(printf("======== PROBE SHA512: SHA512 is supported\n");) | ||
620 | isSupported = True; | ||
621 | } | ||
622 | #else // Z7_COMPILER_SHA512_SUPPORTED | ||
623 | // for debug : we generate a bad instruction or raise an exception. | ||
624 | // __except() doesn't catch raise() calls. | ||
625 | #ifdef Z7_SHA512_USE_LONGJMP | ||
626 | PRF(printf("====== raise(SIGILL)\n");) | ||
627 | raise(SIGILL); | ||
628 | #else | ||
629 | #if defined(_MSC_VER) && defined(MY_CPU_X86) | ||
630 | __asm ud2 | ||
631 | #endif | ||
632 | #endif // Z7_SHA512_USE_LONGJMP | ||
633 | #endif // Z7_COMPILER_SHA512_SUPPORTED | ||
634 | } | ||
635 | |||
636 | #ifdef Z7_SHA512_USE_LONGJMP | ||
637 | PRF(printf("====== restore signal SIGILL\n");) | ||
638 | signal(SIGILL, signal_prev); | ||
639 | #elif _MSC_VER | ||
640 | __except (EXCEPTION_EXECUTE_HANDLER) | ||
641 | { | ||
642 | PRF(printf("==== CPU_IsSupported_SHA512 __except(EXCEPTION_EXECUTE_HANDLER)\n");) | ||
643 | } | ||
644 | #endif | ||
645 | PRF(printf("== return (sha512 supported) = %d\n", isSupported);) | ||
646 | return isSupported; | ||
647 | } | ||
648 | #else | ||
649 | // without SHA512 PROBE code | ||
650 | return True; | ||
651 | #endif | ||
652 | } | ||
653 | |||
654 | #endif // Z7_SHA512_USE_PROBE | ||
655 | #endif // defined(Z7_SHA512_PROBE_DEBUG) || defined(Z7_COMPILER_SHA512_SUPPORTED) | ||
656 | |||
657 | |||
658 | void Sha512Prepare(void) | ||
659 | { | ||
660 | #ifdef Z7_COMPILER_SHA512_SUPPORTED | ||
661 | SHA512_FUNC_UPDATE_BLOCKS f, f_hw; | ||
662 | f = Sha512_UpdateBlocks; | ||
663 | f_hw = NULL; | ||
664 | #ifdef Z7_SHA512_USE_PROBE | ||
665 | if (CPU_IsSupported_SHA512_Probe()) | ||
666 | #elif defined(MY_CPU_X86_OR_AMD64) | ||
667 | if (CPU_IsSupported_SHA512() && CPU_IsSupported_AVX2()) | ||
668 | #else | ||
669 | if (CPU_IsSupported_SHA512()) | ||
670 | #endif | ||
671 | { | ||
672 | // printf("\n========== HW SHA512 ======== \n"); | ||
673 | f = f_hw = Sha512_UpdateBlocks_HW; | ||
674 | } | ||
675 | g_SHA512_FUNC_UPDATE_BLOCKS = f; | ||
676 | g_SHA512_FUNC_UPDATE_BLOCKS_HW = f_hw; | ||
677 | #elif defined(Z7_SHA512_PROBE_DEBUG) | ||
678 | CPU_IsSupported_SHA512_Probe(); // for debug | ||
679 | #endif | ||
680 | } | ||
681 | |||
682 | |||
683 | #undef K | ||
684 | #undef S0 | ||
685 | #undef S1 | ||
686 | #undef s0 | ||
687 | #undef s1 | ||
688 | #undef Ch | ||
689 | #undef Maj | ||
690 | #undef W_MAIN | ||
691 | #undef W_PRE | ||
692 | #undef w | ||
693 | #undef blk2_main | ||
694 | #undef blk2 | ||
695 | #undef T1 | ||
696 | #undef T4 | ||
697 | #undef T8 | ||
698 | #undef R1_PRE | ||
699 | #undef R1_MAIN | ||
700 | #undef R2_MAIN | ||
701 | #undef R4 | ||
702 | #undef R4_PRE | ||
703 | #undef R4_MAIN | ||
704 | #undef R8 | ||
705 | #undef R8_PRE | ||
706 | #undef R8_MAIN | ||
707 | #undef STEP_PRE | ||
708 | #undef STEP_MAIN | ||
709 | #undef Z7_SHA512_BIG_W | ||
710 | #undef Z7_SHA512_UNROLL | ||
711 | #undef Z7_COMPILER_SHA512_SUPPORTED | ||
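When Z7_SHA512_BIG_W is not defined (i.e. STEP_MAIN == 16), the w()/blk2 macros above keep only a 16-word ring instead of the full 80-word W array. A scalar sketch of that rolling-window schedule (the names ROTR64, sig0, sig1, next_w are illustrative, not from the sources):

  #include <stdint.h>

  #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

  static uint64_t sig0(uint64_t x) { return ROTR64(x, 1) ^ ROTR64(x, 8) ^ (x >> 7); }
  static uint64_t sig1(uint64_t x) { return ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6); }

  /* W[t & 15] still holds W[t-16]; update it in place to W[t], for t >= 16 */
  static uint64_t next_w(uint64_t W[16], unsigned t)
  {
    W[t & 15] += sig1(W[(t - 2) & 15]) + W[(t - 7) & 15] + sig0(W[(t - 15) & 15]);
    return W[t & 15];
  }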
diff --git a/C/Sha512.h b/C/Sha512.h new file mode 100644 index 0000000..1f3a4d1 --- /dev/null +++ b/C/Sha512.h | |||
@@ -0,0 +1,86 @@ | |||
1 | /* Sha512.h -- SHA-512 Hash | ||
2 | : Igor Pavlov : Public domain */ | ||
3 | |||
4 | #ifndef ZIP7_INC_SHA512_H | ||
5 | #define ZIP7_INC_SHA512_H | ||
6 | |||
7 | #include "7zTypes.h" | ||
8 | |||
9 | EXTERN_C_BEGIN | ||
10 | |||
11 | #define SHA512_NUM_BLOCK_WORDS 16 | ||
12 | #define SHA512_NUM_DIGEST_WORDS 8 | ||
13 | |||
14 | #define SHA512_BLOCK_SIZE (SHA512_NUM_BLOCK_WORDS * 8) | ||
15 | #define SHA512_DIGEST_SIZE (SHA512_NUM_DIGEST_WORDS * 8) | ||
16 | #define SHA512_224_DIGEST_SIZE (224 / 8) | ||
17 | #define SHA512_256_DIGEST_SIZE (256 / 8) | ||
18 | #define SHA512_384_DIGEST_SIZE (384 / 8) | ||
19 | |||
20 | typedef void (Z7_FASTCALL *SHA512_FUNC_UPDATE_BLOCKS)(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
21 | |||
22 | /* | ||
23 | if (the system supports different SHA512 code implementations) | ||
24 | { | ||
25 | (CSha512::func_UpdateBlocks) will be used | ||
26 | (CSha512::func_UpdateBlocks) can be set by | ||
27 | Sha512_Init() - to default (fastest) | ||
28 | Sha512_SetFunction() - to any algo | ||
29 | } | ||
30 | else | ||
31 | { | ||
32 | (CSha512::func_UpdateBlocks) is ignored. | ||
33 | } | ||
34 | */ | ||
35 | |||
36 | typedef struct | ||
37 | { | ||
38 | union | ||
39 | { | ||
40 | struct | ||
41 | { | ||
42 | SHA512_FUNC_UPDATE_BLOCKS func_UpdateBlocks; | ||
43 | UInt64 count; | ||
44 | } vars; | ||
45 | UInt64 _pad_64bit[8]; | ||
46 | void *_pad_align_ptr[2]; | ||
47 | } v; | ||
48 | UInt64 state[SHA512_NUM_DIGEST_WORDS]; | ||
49 | |||
50 | Byte buffer[SHA512_BLOCK_SIZE]; | ||
51 | } CSha512; | ||
52 | |||
53 | |||
54 | #define SHA512_ALGO_DEFAULT 0 | ||
55 | #define SHA512_ALGO_SW 1 | ||
56 | #define SHA512_ALGO_HW 2 | ||
57 | |||
58 | /* | ||
59 | Sha512_SetFunction() | ||
60 | return: | ||
61 | 0 - (algo) value is not supported, and func_UpdateBlocks was not changed | ||
62 | 1 - func_UpdateBlocks was set according to the (algo) value. | ||
63 | */ | ||
64 | |||
65 | BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo); | ||
66 | // we support only these (digestSize) values: 224/8, 256/8, 384/8, 512/8 | ||
67 | void Sha512_InitState(CSha512 *p, unsigned digestSize); | ||
68 | void Sha512_Init(CSha512 *p, unsigned digestSize); | ||
69 | void Sha512_Update(CSha512 *p, const Byte *data, size_t size); | ||
70 | void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize); | ||
71 | |||
72 | |||
73 | |||
74 | |||
75 | // void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
76 | |||
77 | /* | ||
78 | call Sha512Prepare() once at program start. | ||
79 | It prepares all supported implementations, and detects the fastest implementation. | ||
80 | */ | ||
81 | |||
82 | void Sha512Prepare(void); | ||
83 | |||
84 | EXTERN_C_END | ||
85 | |||
86 | #endif | ||
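A usage sketch for this header (hypothetical caller code, not from the sources):

  #include "Sha512.h"

  static void Sha512_256_OneShot(const Byte *data, size_t size,
      Byte digest[SHA512_256_DIGEST_SIZE])
  {
    CSha512 p;
    Sha512Prepare(); // once at program start; detects the fastest implementation
    Sha512_Init(&p, SHA512_256_DIGEST_SIZE); // selects the SHA-512/256 IV
    Sha512_Update(&p, data, size);
    Sha512_Final(&p, digest, SHA512_256_DIGEST_SIZE);
  }

Sha512_SetFunction(&p, SHA512_ALGO_SW) can be called after Sha512_Init() to force the portable code path, for example when comparing the hardware version against it.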
diff --git a/C/Sha512Opt.c b/C/Sha512Opt.c new file mode 100644 index 0000000..3a13868 --- /dev/null +++ b/C/Sha512Opt.c | |||
@@ -0,0 +1,395 @@ | |||
1 | /* Sha512Opt.c -- SHA-512 optimized code for SHA-512 hardware instructions | ||
2 | : Igor Pavlov : Public domain */ | ||
3 | |||
4 | #include "Precomp.h" | ||
5 | #include "Compiler.h" | ||
6 | #include "CpuArch.h" | ||
7 | |||
8 | // #define Z7_USE_HW_SHA_STUB // for debug | ||
9 | #ifdef MY_CPU_X86_OR_AMD64 | ||
10 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) // fix it | ||
11 | #define USE_HW_SHA | ||
12 | #elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \ | ||
13 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \ | ||
14 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000) | ||
15 | #define USE_HW_SHA | ||
16 | #if !defined(__INTEL_COMPILER) | ||
17 | // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) | ||
18 | #if !defined(__SHA512__) || !defined(__AVX2__) | ||
19 | #define ATTRIB_SHA512 __attribute__((__target__("sha512,avx2"))) | ||
20 | #endif | ||
21 | #endif | ||
22 | #elif defined(Z7_MSC_VER_ORIGINAL) | ||
23 | #if (_MSC_VER >= 1940) | ||
24 | #define USE_HW_SHA | ||
25 | #else | ||
26 | // #define Z7_USE_HW_SHA_STUB | ||
27 | #endif | ||
28 | #endif | ||
29 | // #endif // MY_CPU_X86_OR_AMD64 | ||
30 | #ifndef USE_HW_SHA | ||
31 | // #define Z7_USE_HW_SHA_STUB // for debug | ||
32 | #endif | ||
33 | |||
34 | #ifdef USE_HW_SHA | ||
35 | |||
36 | // #pragma message("Sha512 HW") | ||
37 | |||
38 | #include <immintrin.h> | ||
39 | |||
40 | #if defined (__clang__) && defined(_MSC_VER) | ||
41 | #if !defined(__AVX__) | ||
42 | #include <avxintrin.h> | ||
43 | #endif | ||
44 | #if !defined(__AVX2__) | ||
45 | #include <avx2intrin.h> | ||
46 | #endif | ||
47 | #if !defined(__SHA512__) | ||
48 | #include <sha512intrin.h> | ||
49 | #endif | ||
50 | #else | ||
51 | |||
52 | #endif | ||
53 | |||
54 | /* | ||
55 | SHA512 uses: | ||
56 | AVX: | ||
57 | _mm256_loadu_si256 (vmovdqu) | ||
58 | _mm256_storeu_si256 | ||
59 | _mm256_set_epi32 (unused) | ||
60 | AVX2: | ||
61 | _mm256_add_epi64 : vpaddq | ||
62 | _mm256_shuffle_epi8 : vpshufb | ||
63 | _mm256_shuffle_epi32 : pshufd | ||
64 | _mm256_blend_epi32 : vpblendd | ||
65 | _mm256_permute4x64_epi64 : vpermq : 3c | ||
66 | _mm256_permute2x128_si256: vperm2i128 : 3c | ||
67 | _mm256_extracti128_si256 : vextracti128 : 3c | ||
68 | SHA512: | ||
69 | _mm256_sha512* | ||
70 | */ | ||
71 | |||
72 | // K array must be aligned to at least 32 bytes. | ||
73 | // The compiler can inspect the align attribute and select | ||
74 | // vmovdqu - for code without the align attribute | ||
75 | // vmovdqa - for code with the align attribute | ||
76 | extern | ||
77 | MY_ALIGN(64) | ||
78 | const UInt64 SHA512_K_ARRAY[80]; | ||
79 | #define K SHA512_K_ARRAY | ||
80 | |||
81 | |||
82 | #define ADD_EPI64(dest, src) dest = _mm256_add_epi64(dest, src); | ||
83 | #define SHA512_MSG1(dest, src) dest = _mm256_sha512msg1_epi64(dest, _mm256_extracti128_si256(src, 0)); | ||
84 | #define SHA512_MSG2(dest, src) dest = _mm256_sha512msg2_epi64(dest, src); | ||
85 | |||
86 | #define LOAD_SHUFFLE(m, k) \ | ||
87 | m = _mm256_loadu_si256((const __m256i *)(const void *)(data + (k) * 32)); \ | ||
88 | m = _mm256_shuffle_epi8(m, mask); \ | ||
89 | |||
90 | #define NNN(m0, m1, m2, m3) | ||
91 | |||
92 | #define SM1(m1, m2, m3, m0) \ | ||
93 | SHA512_MSG1(m0, m1); \ | ||
94 | |||
95 | #define SM2(m2, m3, m0, m1) \ | ||
96 | ADD_EPI64(m0, _mm256_permute4x64_epi64(_mm256_blend_epi32(m2, m3, 3), 0x39)); \ | ||
97 | SHA512_MSG2(m0, m3); \ | ||
98 | |||
99 | #define RND2(t0, t1, lane) \ | ||
100 | t0 = _mm256_sha512rnds2_epi64(t0, t1, _mm256_extracti128_si256(msg, lane)); | ||
101 | |||
102 | |||
103 | |||
104 | #define R4(k, m0, m1, m2, m3, OP0, OP1) \ | ||
105 | msg = _mm256_add_epi64(m0, *(const __m256i *) (const void *) &K[(k) * 4]); \ | ||
106 | RND2(state0, state1, 0); OP0(m0, m1, m2, m3) \ | ||
107 | RND2(state1, state0, 1); OP1(m0, m1, m2, m3) \ | ||
108 | |||
109 | |||
110 | |||
111 | |||
112 | #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ | ||
113 | R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ | ||
114 | R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ | ||
115 | R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ | ||
116 | R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ | ||
117 | |||
118 | #define PREPARE_STATE \ | ||
119 | state0 = _mm256_shuffle_epi32(state0, 0x4e); /* cdab */ \ | ||
120 | state1 = _mm256_shuffle_epi32(state1, 0x4e); /* ghef */ \ | ||
121 | tmp = state0; \ | ||
122 | state0 = _mm256_permute2x128_si256(state0, state1, 0x13); /* cdgh */ \ | ||
123 | state1 = _mm256_permute2x128_si256(tmp, state1, 2); /* abef */ \ | ||
124 | |||
125 | |||
126 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
127 | #ifdef ATTRIB_SHA512 | ||
128 | ATTRIB_SHA512 | ||
129 | #endif | ||
130 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks) | ||
131 | { | ||
132 | const __m256i mask = _mm256_set_epi32( | ||
133 | 0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607, | ||
134 | 0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607); | ||
135 | __m256i tmp, state0, state1; | ||
136 | |||
137 | if (numBlocks == 0) | ||
138 | return; | ||
139 | |||
140 | state0 = _mm256_loadu_si256((const __m256i *) (const void *) &state[0]); | ||
141 | state1 = _mm256_loadu_si256((const __m256i *) (const void *) &state[4]); | ||
142 | |||
143 | PREPARE_STATE | ||
144 | |||
145 | do | ||
146 | { | ||
147 | __m256i state0_save, state1_save; | ||
148 | __m256i m0, m1, m2, m3; | ||
149 | __m256i msg; | ||
150 | // #define msg tmp | ||
151 | |||
152 | state0_save = state0; | ||
153 | state1_save = state1; | ||
154 | |||
155 | LOAD_SHUFFLE (m0, 0) | ||
156 | LOAD_SHUFFLE (m1, 1) | ||
157 | LOAD_SHUFFLE (m2, 2) | ||
158 | LOAD_SHUFFLE (m3, 3) | ||
159 | |||
160 | |||
161 | |||
162 | R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ) | ||
163 | R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) | ||
164 | R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) | ||
165 | R16 ( 3, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) | ||
166 | R16 ( 4, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ) | ||
167 | ADD_EPI64(state0, state0_save) | ||
168 | ADD_EPI64(state1, state1_save) | ||
169 | |||
170 | data += 128; | ||
171 | } | ||
172 | while (--numBlocks); | ||
173 | |||
174 | PREPARE_STATE | ||
175 | |||
176 | _mm256_storeu_si256((__m256i *) (void *) &state[0], state0); | ||
177 | _mm256_storeu_si256((__m256i *) (void *) &state[4], state1); | ||
178 | } | ||
179 | |||
180 | #endif // USE_HW_SHA | ||
181 | |||
182 | // gcc 8.5 also supports sha512, but we also need support in the assembler that is called by gcc | ||
183 | #elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE) | ||
184 | |||
185 | #if defined(__ARM_FEATURE_SHA512) | ||
186 | #define USE_HW_SHA | ||
187 | #else | ||
188 | #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \ | ||
189 | || defined(__GNUC__) && (__GNUC__ >= 9) \ | ||
190 | ) \ | ||
191 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it | ||
192 | #define USE_HW_SHA | ||
193 | #endif | ||
194 | #endif | ||
195 | |||
196 | #ifdef USE_HW_SHA | ||
197 | |||
198 | // #pragma message("=== Sha512 HW === ") | ||
199 | |||
200 | |||
201 | #if defined(__clang__) || defined(__GNUC__) | ||
202 | #if !defined(__ARM_FEATURE_SHA512) | ||
203 | // #pragma message("=== we define SHA3 ATTRIB_SHA512 === ") | ||
204 | #if defined(__clang__) | ||
205 | #define ATTRIB_SHA512 __attribute__((__target__("sha3"))) // "armv8.2-a,sha3" | ||
206 | #else | ||
207 | #define ATTRIB_SHA512 __attribute__((__target__("arch=armv8.2-a+sha3"))) | ||
208 | #endif | ||
209 | #endif | ||
210 | #endif | ||
211 | |||
212 | |||
213 | #if defined(Z7_MSC_VER_ORIGINAL) | ||
214 | #include <arm64_neon.h> | ||
215 | #else | ||
216 | |||
217 | #if defined(__clang__) && __clang_major__ < 16 | ||
218 | #if !defined(__ARM_FEATURE_SHA512) | ||
219 | // #pragma message("=== we set __ARM_FEATURE_SHA512 1 === ") | ||
220 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
221 | #define Z7_ARM_FEATURE_SHA512_WAS_SET 1 | ||
222 | #define __ARM_FEATURE_SHA512 1 | ||
223 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
224 | #endif | ||
225 | #endif // clang | ||
226 | |||
227 | #include <arm_neon.h> | ||
228 | |||
229 | #if defined(Z7_ARM_FEATURE_SHA512_WAS_SET) && \ | ||
230 | defined(__ARM_FEATURE_SHA512) | ||
231 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
232 | #undef __ARM_FEATURE_SHA512 | ||
233 | #undef Z7_ARM_FEATURE_SHA512_WAS_SET | ||
234 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
235 | // #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") | ||
236 | #endif | ||
237 | |||
238 | #endif // Z7_MSC_VER_ORIGINAL | ||
239 | |||
240 | typedef uint64x2_t v128_64; | ||
241 | // typedef __n128 v128_64; // MSVC | ||
242 | |||
243 | #ifdef MY_CPU_BE | ||
244 | #define MY_rev64_for_LE(x) x | ||
245 | #else | ||
246 | #define MY_rev64_for_LE(x) vrev64q_u8(x) | ||
247 | #endif | ||
248 | |||
249 | #define LOAD_128_64(_p) vld1q_u64(_p) | ||
250 | #define LOAD_128_8(_p) vld1q_u8 (_p) | ||
251 | #define STORE_128_64(_p, _v) vst1q_u64(_p, _v) | ||
252 | |||
253 | #define LOAD_SHUFFLE(m, k) \ | ||
254 | m = vreinterpretq_u64_u8( \ | ||
255 | MY_rev64_for_LE( \ | ||
256 | LOAD_128_8(data + (k) * 16))); \ | ||
257 | |||
258 | // K array must be aligned to at least 16 bytes. | ||
259 | extern | ||
260 | MY_ALIGN(64) | ||
261 | const UInt64 SHA512_K_ARRAY[80]; | ||
262 | #define K SHA512_K_ARRAY | ||
263 | |||
264 | #define NN(m0, m1, m4, m5, m7) | ||
265 | #define SM(m0, m1, m4, m5, m7) \ | ||
266 | m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1)); | ||
267 | |||
268 | #define R2(k, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP) \ | ||
269 | OP(m0, m1, m4, m5, m7) \ | ||
270 | t = vaddq_u64(m0, vld1q_u64(k)); \ | ||
271 | t = vaddq_u64(vextq_u64(t, t, 1), a3); \ | ||
272 | t = vsha512hq_u64(t, vextq_u64(a2, a3, 1), vextq_u64(a1, a2, 1)); \ | ||
273 | a3 = vsha512h2q_u64(t, a1, a0); \ | ||
274 | a1 = vaddq_u64(a1, t); \ | ||
275 | |||
276 | #define R8(k, m0,m1,m2,m3,m4,m5,m6,m7, OP) \ | ||
277 | R2 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP ) \ | ||
278 | R2 ( (k)+1*2, m1,m2,m3,m4,m5,m6,m7,m0, a3,a0,a1,a2, OP ) \ | ||
279 | R2 ( (k)+2*2, m2,m3,m4,m5,m6,m7,m0,m1, a2,a3,a0,a1, OP ) \ | ||
280 | R2 ( (k)+3*2, m3,m4,m5,m6,m7,m0,m1,m2, a1,a2,a3,a0, OP ) \ | ||
281 | |||
282 | #define R16(k, OP) \ | ||
283 | R8 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, OP ) \ | ||
284 | R8 ( (k)+4*2, m4,m5,m6,m7,m0,m1,m2,m3, OP ) \ | ||
285 | |||
286 | |||
287 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
288 | #ifdef ATTRIB_SHA512 | ||
289 | ATTRIB_SHA512 | ||
290 | #endif | ||
291 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks) | ||
292 | { | ||
293 | v128_64 a0, a1, a2, a3; | ||
294 | |||
295 | if (numBlocks == 0) | ||
296 | return; | ||
297 | a0 = LOAD_128_64(&state[0]); | ||
298 | a1 = LOAD_128_64(&state[2]); | ||
299 | a2 = LOAD_128_64(&state[4]); | ||
300 | a3 = LOAD_128_64(&state[6]); | ||
301 | do | ||
302 | { | ||
303 | v128_64 a0_save, a1_save, a2_save, a3_save; | ||
304 | v128_64 m0, m1, m2, m3, m4, m5, m6, m7; | ||
305 | v128_64 t; | ||
306 | unsigned i; | ||
307 | const UInt64 *k_ptr; | ||
308 | |||
309 | LOAD_SHUFFLE (m0, 0) | ||
310 | LOAD_SHUFFLE (m1, 1) | ||
311 | LOAD_SHUFFLE (m2, 2) | ||
312 | LOAD_SHUFFLE (m3, 3) | ||
313 | LOAD_SHUFFLE (m4, 4) | ||
314 | LOAD_SHUFFLE (m5, 5) | ||
315 | LOAD_SHUFFLE (m6, 6) | ||
316 | LOAD_SHUFFLE (m7, 7) | ||
317 | |||
318 | a0_save = a0; | ||
319 | a1_save = a1; | ||
320 | a2_save = a2; | ||
321 | a3_save = a3; | ||
322 | |||
323 | R16 ( K, NN ) | ||
324 | k_ptr = K + 16; | ||
325 | for (i = 0; i < 4; i++) | ||
326 | { | ||
327 | R16 ( k_ptr, SM ) | ||
328 | k_ptr += 16; | ||
329 | } | ||
330 | |||
331 | a0 = vaddq_u64(a0, a0_save); | ||
332 | a1 = vaddq_u64(a1, a1_save); | ||
333 | a2 = vaddq_u64(a2, a2_save); | ||
334 | a3 = vaddq_u64(a3, a3_save); | ||
335 | |||
336 | data += 128; | ||
337 | } | ||
338 | while (--numBlocks); | ||
339 | |||
340 | STORE_128_64(&state[0], a0); | ||
341 | STORE_128_64(&state[2], a1); | ||
342 | STORE_128_64(&state[4], a2); | ||
343 | STORE_128_64(&state[6], a3); | ||
344 | } | ||
345 | |||
346 | #endif // USE_HW_SHA | ||
347 | |||
348 | #endif // MY_CPU_ARM64 | ||
349 | |||
350 | |||
351 | #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) | ||
352 | // #error Stop_Compiling_UNSUPPORTED_SHA | ||
353 | // #include <stdlib.h> | ||
354 | // We can compile this file with another C compiler, | ||
355 | // or we can compile an asm version. | ||
356 | // So we can generate real code instead of this stub function. | ||
357 | // #include "Sha512.h" | ||
358 | // #if defined(_MSC_VER) | ||
359 | #pragma message("Sha512 HW-SW stub was used") | ||
360 | // #endif | ||
361 | void Z7_FASTCALL Sha512_UpdateBlocks (UInt64 state[8], const Byte *data, size_t numBlocks); | ||
362 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks); | ||
363 | void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks) | ||
364 | { | ||
365 | Sha512_UpdateBlocks(state, data, numBlocks); | ||
366 | /* | ||
367 | UNUSED_VAR(state); | ||
368 | UNUSED_VAR(data); | ||
369 | UNUSED_VAR(numBlocks); | ||
370 | exit(1); | ||
371 | return; | ||
372 | */ | ||
373 | } | ||
374 | #endif | ||
375 | |||
376 | |||
377 | #undef K | ||
378 | #undef RND2 | ||
379 | #undef MY_rev64_for_LE | ||
380 | #undef NN | ||
381 | #undef NNN | ||
382 | #undef LOAD_128 | ||
383 | #undef STORE_128 | ||
384 | #undef LOAD_SHUFFLE | ||
385 | #undef SM1 | ||
386 | #undef SM2 | ||
387 | #undef SM | ||
388 | #undef R2 | ||
389 | #undef R4 | ||
390 | #undef R16 | ||
391 | #undef PREPARE_STATE | ||
392 | #undef USE_HW_SHA | ||
393 | #undef ATTRIB_SHA512 | ||
394 | #undef USE_VER_MIN | ||
395 | #undef Z7_USE_HW_SHA_STUB | ||
@@ -1,141 +1,268 @@ | |||
1 | /* Sort.c -- Sort functions | 1 | /* Sort.c -- Sort functions |
2 | 2014-04-05 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
6 | #include "Sort.h" | 6 | #include "Sort.h" |
7 | #include "CpuArch.h" | ||
7 | 8 | ||
8 | #define HeapSortDown(p, k, size, temp) \ | 9 | #if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ |
9 | { for (;;) { \ | 10 | || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \ |
10 | size_t s = (k << 1); \ | 11 | ) |
11 | if (s > size) break; \ | 12 | // the code with prefetch is slow for small arrays on x86. |
12 | if (s < size && p[s + 1] > p[s]) s++; \ | 13 | // So we disable prefetch for x86. |
13 | if (temp >= p[s]) break; \ | 14 | #ifndef MY_CPU_X86 |
14 | p[k] = p[s]; k = s; \ | 15 | // #pragma message("Z7_PREFETCH : __builtin_prefetch") |
15 | } p[k] = temp; } | 16 | #define Z7_PREFETCH(a) __builtin_prefetch((a)) |
17 | #endif | ||
16 | 18 | ||
17 | void HeapSort(UInt32 *p, size_t size) | 19 | #elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200) |
18 | { | 20 | |
19 | if (size <= 1) | 21 | #include "7zWindows.h" |
20 | return; | 22 | |
21 | p--; | 23 | // NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1. |
22 | { | 24 | // For example, clang-cl can generate "prefetcht2" instruction for |
23 | size_t i = size / 2; | 25 | // PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call. |
24 | do | 26 | // But we want to generate "prefetcht0" instruction. |
25 | { | 27 | // So for CLANG/GCC we must use __builtin_prefetch() in code branch above |
26 | UInt32 temp = p[i]; | 28 | // instead of PreFetchCacheLine() / _mm_prefetch(). |
27 | size_t k = i; | 29 | |
28 | HeapSortDown(p, k, size, temp) | 30 | // New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call. |
29 | } | 31 | // But old x86 cpus don't support "prefetcht0". |
30 | while (--i != 0); | 32 | // So we will use PreFetchCacheLine() only if we are sure that the |
31 | } | 33 | // generated instruction is supported by all cpus of that isa. |
32 | /* | 34 | #if defined(MY_CPU_AMD64) \ |
33 | do | 35 | || defined(MY_CPU_ARM64) \ |
34 | { | 36 | || defined(MY_CPU_IA64) |
35 | size_t k = 1; | 37 | // we need to use additional parentheses for (a) in the PreFetchCacheLine call, because |
36 | UInt32 temp = p[size]; | 38 | // the PreFetchCacheLine macro doesn't use parentheses: |
37 | p[size--] = p[1]; | 39 | // #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l) |
38 | HeapSortDown(p, k, size, temp) | 40 | // #pragma message("Z7_PREFETCH : PreFetchCacheLine") |
39 | } | 41 | #define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a)) |
40 | while (size > 1); | 42 | #endif |
41 | */ | 43 | |
42 | while (size > 3) | 44 | #endif // _WIN32 |
43 | { | 45 | |
44 | UInt32 temp = p[size]; | 46 | |
45 | size_t k = (p[3] > p[2]) ? 3 : 2; | 47 | #define PREFETCH_NO(p,k,s,size) |
46 | p[size--] = p[1]; | 48 | |
47 | p[1] = p[k]; | 49 | #ifndef Z7_PREFETCH |
48 | HeapSortDown(p, k, size, temp) | 50 | #define SORT_PREFETCH(p,k,s,size) |
49 | } | 51 | #else |
50 | { | 52 | |
51 | UInt32 temp = p[size]; | 53 | // #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes |
52 | p[size] = p[1]; | 54 | #define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch) |
53 | if (size > 2 && p[2] < temp) | 55 | // #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch) |
54 | { | 56 | |
55 | p[1] = p[2]; | 57 | #if PREFETCH_LEVEL == 0 |
56 | p[2] = temp; | 58 | |
57 | } | 59 | #define SORT_PREFETCH(p,k,s,size) |
58 | else | 60 | |
59 | p[1] = temp; | 61 | #else // PREFETCH_LEVEL != 0 |
60 | } | 62 | |
63 | /* | ||
64 | if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
65 | we prefetch one value per cache line. | ||
66 | Use it if the array is aligned to the cache line size (64 bytes) | ||
67 | or if the array is small (less than the L1 cache size). | ||
68 | | ||
69 | if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
70 | we prefetch all cache lines that can be required. | ||
71 | it can be faster for big unaligned arrays. | ||
72 | */ | ||
73 | #define USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
74 | |||
75 | // s == k * 2 | ||
76 | #if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64) | ||
77 | // x86 supports (lea r1*8+offset) | ||
78 | #define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL) | ||
79 | #else | ||
80 | #define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1)) | ||
81 | #endif | ||
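
To make the shift arithmetic concrete: with PREFETCH_LEVEL == 3, PREFETCH_OFFSET(k,s) is (k << 4), and since s == k * 2, the disabled ((s) << PREFETCH_LEVEL) variant computes the same value. Each sift-down iteration roughly doubles the index, so the nodes reachable from k after four more iterations occupy indices [k * 16, k * 16 + 15] — a span of 16 UInt32 values, exactly one 64-byte cache line when p is line-aligned. That is why PREFETCH_RANGE is (2 << PREFETCH_LEVEL) - 1 = 15 and the aligned-array variant below issues a single prefetch per step.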
82 | |||
83 | #if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
84 | #define PREFETCH_ADD_OFFSET 0 | ||
85 | #else | ||
86 | // last offset that can be required in a PREFETCH_LEVEL step: | ||
87 | #define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1) | ||
88 | #define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2 | ||
89 | #endif | ||
90 | |||
91 | #if PREFETCH_LEVEL <= 3 | ||
92 | |||
93 | #ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
94 | #define SORT_PREFETCH(p,k,s,size) \ | ||
95 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \ | ||
96 | if (s2 <= size) { \ | ||
97 | Z7_PREFETCH((p + s2)); \ | ||
98 | }} | ||
99 | #else /* for unaligned array */ | ||
100 | #define SORT_PREFETCH(p,k,s,size) \ | ||
101 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ | ||
102 | if (s2 <= size) { \ | ||
103 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ | ||
104 | Z7_PREFETCH((p + s2)); \ | ||
105 | }} | ||
106 | #endif | ||
107 | |||
108 | #else // PREFETCH_LEVEL > 3 | ||
109 | |||
110 | #ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
111 | #define SORT_PREFETCH(p,k,s,size) \ | ||
112 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \ | ||
113 | if (s2 <= size) { \ | ||
114 | Z7_PREFETCH((p + s2 - 16)); \ | ||
115 | Z7_PREFETCH((p + s2)); \ | ||
116 | }} | ||
117 | #else /* for unaligned array */ | ||
118 | #define SORT_PREFETCH(p,k,s,size) \ | ||
119 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ | ||
120 | if (s2 <= size) { \ | ||
121 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ | ||
122 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \ | ||
123 | Z7_PREFETCH((p + s2)); \ | ||
124 | }} | ||
125 | #endif | ||
126 | |||
127 | #endif // PREFETCH_LEVEL > 3 | ||
128 | #endif // PREFETCH_LEVEL != 0 | ||
129 | #endif // Z7_PREFETCH | ||
130 | |||
131 | |||
132 | #if defined(MY_CPU_ARM64) \ | ||
133 | /* || defined(MY_CPU_AMD64) */ \ | ||
134 | /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */ | ||
135 | // we want to use cmov, if cmov is very fast: | ||
136 | // - this cmov version is slower for clang-x64. | ||
137 | // - this cmov version is faster for gcc-arm64 for some fast arm64 cpus. | ||
138 | #define Z7_FAST_CMOV_SUPPORTED | ||
139 | #endif | ||
140 | |||
141 | #ifdef Z7_FAST_CMOV_SUPPORTED | ||
142 | // we use cmov here if cmov is fast (new arm64 cpus): | ||
143 | // we want the compiler to emit a conditional move for this branch | ||
144 | #define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1; | ||
145 | #else | ||
146 | // use this branch, if cpu doesn't support fast conditional move. | ||
147 | // it uses slow array access reading: | ||
148 | #define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow; | ||
149 | #endif | ||
150 | |||
151 | #define HeapSortDown(p, k, size, temp, macro_prefetch) \ | ||
152 | { \ | ||
153 | for (;;) { \ | ||
154 | UInt32 n0, n1; \ | ||
155 | size_t s = k * 2; \ | ||
156 | if (s >= size) { \ | ||
157 | if (s == size) { \ | ||
158 | n0 = p[s]; \ | ||
159 | p[k] = n0; \ | ||
160 | if (temp < n0) k = s; \ | ||
161 | } \ | ||
162 | break; \ | ||
163 | } \ | ||
164 | n0 = p[k * 2]; \ | ||
165 | n1 = p[k * 2 + 1]; \ | ||
166 | s += n0 < n1; \ | ||
167 | GET_MAX_VAL(n0, n1, p[s]) \ | ||
168 | if (temp >= n0) break; \ | ||
169 | macro_prefetch(p, k, s, size) \ | ||
170 | p[k] = n0; \ | ||
171 | k = s; \ | ||
172 | } \ | ||
173 | p[k] = temp; \ | ||
61 | } | 174 | } |
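
The macro above combines two branch-free idioms: (s += n0 < n1) computes the index of the larger child without a jump, and GET_MAX_VAL obtains its value either via a conditional move or via a re-read of p[s]. A standalone sketch (the function name is illustrative; UInt32 comes from 7zTypes.h):

    // pick the larger child of node k without a data-dependent branch;
    // valid only when both children exist (k * 2 + 1 <= size)
    static size_t LargerChild(const UInt32 *p, size_t k, UInt32 *val)
    {
      size_t s = k * 2;          // left child
      const UInt32 n0 = p[s];
      const UInt32 n1 = p[s + 1];
      s += (n0 < n1);            // +1 when the right child is larger
      // on Z7_FAST_CMOV_SUPPORTED targets this compiles to cmov/csel;
      // the non-cmov path of GET_MAX_VAL re-reads p[s] instead
      *val = (n0 < n1) ? n1 : n0;
      return s;
    }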
62 | 175 | ||
63 | void HeapSort64(UInt64 *p, size_t size) | 176 | |
177 | /* | ||
178 | stage-1 : O(n) : | ||
179 | we generate an intermediate partially sorted binary tree: | ||
180 | p[0] : an additional item for better alignment of the tree structure in memory. | ||
181 | p[1] | ||
182 | p[2] p[3] | ||
183 | p[4] p[5] p[6] p[7] | ||
184 | ... | ||
185 | p[x] >= p[x * 2] | ||
186 | p[x] >= p[x * 2 + 1] | ||
187 | |||
188 | stage-2 : O(n * log2(n)): | ||
189 | we move the largest item p[0] from the head of the tree to the end of the array | ||
190 | and insert the last item into the sorted binary tree. | ||
191 | */ | ||
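
A minimal caller of the HeapSort() defined below (the input values are arbitrary test data):

    #include "Sort.h"

    static void HeapSort_Demo(void)
    {
      UInt32 a[8] = { 5, 1, 9, 3, 7, 2, 8, 4 };
      HeapSort(a, 8);  // in-place ascending sort
      // a == { 1, 2, 3, 4, 5, 7, 8, 9 }
    }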
192 | |||
193 | // (p) must be aligned for cache line size (64-bytes) for best performance | ||
194 | |||
195 | void Z7_FASTCALL HeapSort(UInt32 *p, size_t size) | ||
64 | { | 196 | { |
65 | if (size <= 1) | 197 | if (size < 2) |
66 | return; | 198 | return; |
67 | p--; | 199 | if (size == 2) |
68 | { | ||
69 | size_t i = size / 2; | ||
70 | do | ||
71 | { | ||
72 | UInt64 temp = p[i]; | ||
73 | size_t k = i; | ||
74 | HeapSortDown(p, k, size, temp) | ||
75 | } | ||
76 | while (--i != 0); | ||
77 | } | ||
78 | /* | ||
79 | do | ||
80 | { | 200 | { |
81 | size_t k = 1; | 201 | const UInt32 a0 = p[0]; |
82 | UInt64 temp = p[size]; | 202 | const UInt32 a1 = p[1]; |
83 | p[size--] = p[1]; | 203 | const unsigned k = a1 < a0; |
84 | HeapSortDown(p, k, size, temp) | 204 | p[k] = a0; |
85 | } | 205 | p[k ^ 1] = a1; |
86 | while (size > 1); | 206 | return; |
87 | */ | ||
88 | while (size > 3) | ||
89 | { | ||
90 | UInt64 temp = p[size]; | ||
91 | size_t k = (p[3] > p[2]) ? 3 : 2; | ||
92 | p[size--] = p[1]; | ||
93 | p[1] = p[k]; | ||
94 | HeapSortDown(p, k, size, temp) | ||
95 | } | 207 | } |
96 | { | 208 | { |
97 | UInt64 temp = p[size]; | 209 | // stage-1 : O(n) |
98 | p[size] = p[1]; | 210 | // we transform array to partially sorted binary tree. |
99 | if (size > 2 && p[2] < temp) | 211 | size_t i = --size / 2; |
212 | // (size) now is the index of the last item in tree, | ||
213 | // if (i) | ||
100 | { | 214 | { |
101 | p[1] = p[2]; | 215 | do |
102 | p[2] = temp; | 216 | { |
217 | const UInt32 temp = p[i]; | ||
218 | size_t k = i; | ||
219 | HeapSortDown(p, k, size, temp, PREFETCH_NO) | ||
220 | } | ||
221 | while (--i); | ||
222 | } | ||
223 | { | ||
224 | const UInt32 temp = p[0]; | ||
225 | const UInt32 a1 = p[1]; | ||
226 | if (temp < a1) | ||
227 | { | ||
228 | size_t k = 1; | ||
229 | p[0] = a1; | ||
230 | HeapSortDown(p, k, size, temp, PREFETCH_NO) | ||
231 | } | ||
103 | } | 232 | } |
104 | else | ||
105 | p[1] = temp; | ||
106 | } | 233 | } |
107 | } | ||
108 | 234 | ||
109 | /* | 235 | if (size < 3) |
110 | #define HeapSortRefDown(p, vals, n, size, temp) \ | 236 | { |
111 | { size_t k = n; UInt32 val = vals[temp]; for (;;) { \ | 237 | // size == 2 |
112 | size_t s = (k << 1); \ | 238 | const UInt32 a0 = p[0]; |
113 | if (s > size) break; \ | 239 | p[0] = p[2]; |
114 | if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \ | 240 | p[2] = a0; |
115 | if (val >= vals[p[s]]) break; \ | ||
116 | p[k] = p[s]; k = s; \ | ||
117 | } p[k] = temp; } | ||
118 | |||
119 | void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size) | ||
120 | { | ||
121 | if (size <= 1) | ||
122 | return; | 241 | return; |
123 | p--; | 242 | } |
243 | if (size != 3) | ||
124 | { | 244 | { |
125 | size_t i = size / 2; | 245 | // stage-2 : O(size) * log2(size): |
246 | // we move largest item p[0] from head to the end of array, | ||
247 | // and insert last item to sorted binary tree. | ||
126 | do | 248 | do |
127 | { | 249 | { |
128 | UInt32 temp = p[i]; | 250 | const UInt32 temp = p[size]; |
129 | HeapSortRefDown(p, vals, i, size, temp); | 251 | size_t k = p[2] < p[3] ? 3 : 2; |
252 | p[size--] = p[0]; | ||
253 | p[0] = p[1]; | ||
254 | p[1] = p[k]; | ||
255 | HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO | ||
130 | } | 256 | } |
131 | while (--i != 0); | 257 | while (size != 3); |
132 | } | 258 | } |
133 | do | ||
134 | { | 259 | { |
135 | UInt32 temp = p[size]; | 260 | const UInt32 a2 = p[2]; |
136 | p[size--] = p[1]; | 261 | const UInt32 a3 = p[3]; |
137 | HeapSortRefDown(p, vals, 1, size, temp); | 262 | const size_t k = a2 < a3; |
263 | p[2] = p[1]; | ||
264 | p[3] = p[0]; | ||
265 | p[k] = a3; | ||
266 | p[k ^ 1] = a2; | ||
138 | } | 267 | } |
139 | while (size > 1); | ||
140 | } | 268 | } |
141 | */ | ||
@@ -1,5 +1,5 @@ | |||
1 | /* Sort.h -- Sort functions | 1 | /* Sort.h -- Sort functions |
2 | 2023-03-05 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_SORT_H | 4 | #ifndef ZIP7_INC_SORT_H |
5 | #define ZIP7_INC_SORT_H | 5 | #define ZIP7_INC_SORT_H |
@@ -8,10 +8,7 @@ | |||
8 | 8 | ||
9 | EXTERN_C_BEGIN | 9 | EXTERN_C_BEGIN |
10 | 10 | ||
11 | void HeapSort(UInt32 *p, size_t size); | 11 | void Z7_FASTCALL HeapSort(UInt32 *p, size_t size); |
12 | void HeapSort64(UInt64 *p, size_t size); | ||
13 | |||
14 | /* void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size); */ | ||
15 | 12 | ||
16 | EXTERN_C_END | 13 | EXTERN_C_END |
17 | 14 | ||
diff --git a/C/Threads.c b/C/Threads.c index 464efec..177d1d9 100644 --- a/C/Threads.c +++ b/C/Threads.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Threads.c -- multithreading library | 1 | /* Threads.c -- multithreading library |
2 | 2024-03-28 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -59,6 +59,100 @@ WRes Thread_Wait_Close(CThread *p) | |||
59 | return (res != 0 ? res : res2); | 59 | return (res != 0 ? res : res2); |
60 | } | 60 | } |
61 | 61 | ||
62 | typedef struct MY_PROCESSOR_NUMBER { | ||
63 | WORD Group; | ||
64 | BYTE Number; | ||
65 | BYTE Reserved; | ||
66 | } MY_PROCESSOR_NUMBER, *MY_PPROCESSOR_NUMBER; | ||
67 | |||
68 | typedef struct MY_GROUP_AFFINITY { | ||
69 | #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 100000) | ||
70 | // KAFFINITY is not defined in old mingw | ||
71 | ULONG_PTR | ||
72 | #else | ||
73 | KAFFINITY | ||
74 | #endif | ||
75 | Mask; | ||
76 | WORD Group; | ||
77 | WORD Reserved[3]; | ||
78 | } MY_GROUP_AFFINITY, *MY_PGROUP_AFFINITY; | ||
79 | |||
80 | typedef BOOL (WINAPI *Func_SetThreadGroupAffinity)( | ||
81 | HANDLE hThread, | ||
82 | CONST MY_GROUP_AFFINITY *GroupAffinity, | ||
83 | MY_PGROUP_AFFINITY PreviousGroupAffinity); | ||
84 | |||
85 | typedef BOOL (WINAPI *Func_GetThreadGroupAffinity)( | ||
86 | HANDLE hThread, | ||
87 | MY_PGROUP_AFFINITY GroupAffinity); | ||
88 | |||
89 | typedef BOOL (WINAPI *Func_GetProcessGroupAffinity)( | ||
90 | HANDLE hProcess, | ||
91 | PUSHORT GroupCount, | ||
92 | PUSHORT GroupArray); | ||
93 | |||
94 | Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION | ||
95 | |||
96 | #if 0 | ||
97 | #include <stdio.h> | ||
98 | #define PRF(x) x | ||
99 | /* | ||
100 | -- | ||
101 | before a call of SetThreadGroupAffinity(), | ||
102 | GetProcessGroupAffinity() returns one group. | ||
103 | after a call of SetThreadGroupAffinity(), | ||
104 | GetProcessGroupAffinity() returns more than one group, | ||
105 | if SetThreadGroupAffinity() was set to another group. | ||
106 | -- | ||
107 | GetProcessAffinityMask MS DOCs: | ||
108 | { | ||
109 | If the calling process contains threads in multiple groups, | ||
110 | the function returns zero for both affinity masks. | ||
111 | } | ||
112 | but tests in win10 with 2 groups (less than 64 cores total) show: | ||
113 | GetProcessAffinityMask() still returns non-zero affinity masks | ||
114 | even after SetThreadGroupAffinity() calls. | ||
115 | */ | ||
116 | static void PrintProcess_Info(void) | ||
117 | { | ||
118 | { | ||
119 | const | ||
120 | Func_GetProcessGroupAffinity fn_GetProcessGroupAffinity = | ||
121 | (Func_GetProcessGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), | ||
122 | "GetProcessGroupAffinity"); | ||
123 | if (fn_GetProcessGroupAffinity) | ||
124 | { | ||
125 | unsigned i; | ||
126 | USHORT GroupCounts[64]; | ||
127 | USHORT GroupCount = Z7_ARRAY_SIZE(GroupCounts); | ||
128 | BOOL boolRes = fn_GetProcessGroupAffinity(GetCurrentProcess(), | ||
129 | &GroupCount, GroupCounts); | ||
130 | printf("\n====== GetProcessGroupAffinity : " | ||
131 | "boolRes=%u GroupCounts = %u :", | ||
132 | boolRes, (unsigned)GroupCount); | ||
133 | for (i = 0; i < GroupCount; i++) | ||
134 | printf(" %u", GroupCounts[i]); | ||
135 | printf("\n"); | ||
136 | } | ||
137 | } | ||
138 | { | ||
139 | DWORD_PTR processAffinityMask, systemAffinityMask; | ||
140 | if (GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask)) | ||
141 | { | ||
142 | PRF(printf("\n====== GetProcessAffinityMask : " | ||
143 | ": processAffinityMask=%x, systemAffinityMask=%x\n", | ||
144 | (UInt32)processAffinityMask, (UInt32)systemAffinityMask);) | ||
145 | } | ||
146 | else | ||
147 | printf("\n==GetProcessAffinityMask FAIL"); | ||
148 | } | ||
149 | } | ||
150 | #else | ||
151 | #ifndef USE_THREADS_CreateThread | ||
152 | // #define PRF(x) | ||
153 | #endif | ||
154 | #endif | ||
155 | |||
62 | WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) | 156 | WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) |
63 | { | 157 | { |
64 | /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ | 158 | /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ |
@@ -72,7 +166,43 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) | |||
72 | 166 | ||
73 | unsigned threadId; | 167 | unsigned threadId; |
74 | *p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId)); | 168 | *p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId)); |
75 | 169 | ||
170 | #if 0 // 1 : for debug | ||
171 | { | ||
172 | DWORD_PTR prevMask; | ||
173 | DWORD_PTR affinity = 1 << 0; | ||
174 | prevMask = SetThreadAffinityMask(*p, (DWORD_PTR)affinity); | ||
175 | prevMask = prevMask; | ||
176 | } | ||
177 | #endif | ||
178 | #if 0 // 1 : for debug | ||
179 | { | ||
180 | /* win10: a new thread will be created in the same group that is assigned to the parent thread, | ||
181 | but the affinity mask will contain all allowed threads of that group, | ||
182 | even if the affinity mask of the parent is not full. | ||
183 | win11: in what group will it be created, if we have set | ||
184 | the affinity of the parent thread with ThreadGroupAffinity? | ||
185 | */ | ||
186 | const | ||
187 | Func_GetThreadGroupAffinity fn = | ||
188 | (Func_GetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), | ||
189 | "GetThreadGroupAffinity"); | ||
190 | if (fn) | ||
191 | { | ||
192 | // BOOL wres2; | ||
193 | MY_GROUP_AFFINITY groupAffinity; | ||
194 | memset(&groupAffinity, 0, sizeof(groupAffinity)); | ||
195 | /* wres2 = */ fn(*p, &groupAffinity); | ||
196 | PRF(printf("\n==Thread_Create cur = %6u GetThreadGroupAffinity(): " | ||
197 | "wres2_BOOL = %u, group=%u mask=%x\n", | ||
198 | GetCurrentThreadId(), | ||
199 | wres2, | ||
200 | groupAffinity.Group, | ||
201 | (UInt32)groupAffinity.Mask);) | ||
202 | } | ||
203 | } | ||
204 | #endif | ||
205 | |||
76 | #endif | 206 | #endif |
77 | 207 | ||
78 | /* maybe we must use errno here, but probably GetLastError() is also OK. */ | 208 | /* maybe we must use errno here, but probably GetLastError() is also OK. */ |
@@ -110,7 +240,84 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param | |||
110 | */ | 240 | */ |
111 | } | 241 | } |
112 | { | 242 | { |
113 | DWORD prevSuspendCount = ResumeThread(h); | 243 | const DWORD prevSuspendCount = ResumeThread(h); |
244 | /* ResumeThread() returns: | ||
245 | 0 : was_not_suspended | ||
246 | 1 : was_resumed | ||
247 | -1 : error | ||
248 | */ | ||
249 | if (prevSuspendCount == (DWORD)-1) | ||
250 | wres = GetError(); | ||
251 | } | ||
252 | } | ||
253 | |||
254 | /* maybe we must use errno here, but probably GetLastError() is also OK. */ | ||
255 | return wres; | ||
256 | |||
257 | #endif | ||
258 | } | ||
259 | |||
260 | |||
261 | WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask) | ||
262 | { | ||
263 | #ifdef USE_THREADS_CreateThread | ||
264 | |||
265 | UNUSED_VAR(group) | ||
266 | UNUSED_VAR(affinityMask) | ||
267 | return Thread_Create(p, func, param); | ||
268 | |||
269 | #else | ||
270 | |||
271 | /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ | ||
272 | HANDLE h; | ||
273 | WRes wres; | ||
274 | unsigned threadId; | ||
275 | h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId)); | ||
276 | *p = h; | ||
277 | wres = HandleToWRes(h); | ||
278 | if (h) | ||
279 | { | ||
280 | // PrintProcess_Info(); | ||
281 | { | ||
282 | const | ||
283 | Func_SetThreadGroupAffinity fn = | ||
284 | (Func_SetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), | ||
285 | "SetThreadGroupAffinity"); | ||
286 | if (fn) | ||
287 | { | ||
288 | // WRes wres2; | ||
289 | MY_GROUP_AFFINITY groupAffinity, prev_groupAffinity; | ||
290 | memset(&groupAffinity, 0, sizeof(groupAffinity)); | ||
291 | // groupAffinity.Mask must use only bits that are supported by the current group | ||
292 | // (groupAffinity.Mask = 0) means all allowed bits | ||
293 | groupAffinity.Mask = affinityMask; | ||
294 | groupAffinity.Group = (WORD)group; | ||
295 | // wres2 = | ||
296 | fn(h, &groupAffinity, &prev_groupAffinity); | ||
297 | /* | ||
298 | if (groupAffinity.Group == prev_groupAffinity.Group) | ||
299 | wres2 = wres2; | ||
300 | else | ||
301 | wres2 = wres2; | ||
302 | if (wres2 == 0) | ||
303 | { | ||
304 | wres2 = GetError(); | ||
305 | PRF(printf("\n==SetThreadGroupAffinity error: %u\n", wres2);) | ||
306 | } | ||
307 | else | ||
308 | { | ||
309 | PRF(printf("\n==Thread_Create_With_Group::SetThreadGroupAffinity()" | ||
310 | " threadId = %6u" | ||
311 | " group=%u mask=%x\n", | ||
312 | threadId, | ||
313 | prev_groupAffinity.Group, | ||
314 | (UInt32)prev_groupAffinity.Mask);) | ||
315 | } | ||
316 | */ | ||
317 | } | ||
318 | } | ||
319 | { | ||
320 | const DWORD prevSuspendCount = ResumeThread(h); | ||
114 | /* ResumeThread() returns: | 321 | /* ResumeThread() returns: |
115 | 0 : was_not_suspended | 322 | 0 : was_not_suspended |
116 | 1 : was_resumed | 323 | 1 : was_resumed |
@@ -297,6 +504,13 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) | |||
297 | return Thread_Create_With_CpuSet(p, func, param, NULL); | 504 | return Thread_Create_With_CpuSet(p, func, param, NULL); |
298 | } | 505 | } |
299 | 506 | ||
507 | /* | ||
508 | WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinity) | ||
509 | { | ||
510 | UNUSED_VAR(group) | ||
511 | return Thread_Create_With_Affinity(p, func, param, affinity); | ||
512 | } | ||
513 | */ | ||
300 | 514 | ||
301 | WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) | 515 | WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) |
302 | { | 516 | { |
@@ -577,5 +791,22 @@ WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p) | |||
577 | return AutoResetEvent_CreateNotSignaled(p); | 791 | return AutoResetEvent_CreateNotSignaled(p); |
578 | } | 792 | } |
579 | 793 | ||
794 | void ThreadNextGroup_Init(CThreadNextGroup *p, UInt32 numGroups, UInt32 startGroup) | ||
795 | { | ||
796 | // printf("\n====== ThreadNextGroup_Init numGroups = %x: startGroup=%x\n", numGroups, startGroup); | ||
797 | if (numGroups == 0) | ||
798 | numGroups = 1; | ||
799 | p->NumGroups = numGroups; | ||
800 | p->NextGroup = startGroup % numGroups; | ||
801 | } | ||
802 | |||
803 | |||
804 | UInt32 ThreadNextGroup_GetNext(CThreadNextGroup *p) | ||
805 | { | ||
806 | const UInt32 next = p->NextGroup; | ||
807 | p->NextGroup = (next + 1) % p->NumGroups; | ||
808 | return next; | ||
809 | } | ||
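
A sketch of how these helpers compose with the Windows-only Thread_Create_With_Group() declared in Threads.h (the helper name and loop are illustrative; affinity mask 0 means "all allowed bits of the group", per the comment in Thread_Create_With_Group):

    #include "Threads.h"

    // spread worker threads across processor groups, round-robin
    static WRes StartWorkers(CThread *threads, unsigned numThreads,
        THREAD_FUNC_TYPE func, unsigned numGroups)
    {
      CThreadNextGroup ng;
      unsigned i;
      ThreadNextGroup_Init(&ng, numGroups, 0);  // start from group 0
      for (i = 0; i < numThreads; i++)
      {
        const unsigned group = ThreadNextGroup_GetNext(&ng);
        const WRes wres = Thread_Create_With_Group(&threads[i], func, NULL,
            group, 0);  // mask 0 : all allowed bits of that group
        if (wres != 0)
          return wres;
      }
      return 0;
    }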
810 | |||
580 | #undef PRF | 811 | #undef PRF |
581 | #undef Print | 812 | #undef Print |
diff --git a/C/Threads.h b/C/Threads.h index c1484a2..be12e6e 100644 --- a/C/Threads.h +++ b/C/Threads.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Threads.h -- multithreading library | 1 | /* Threads.h -- multithreading library |
2 | 2024-03-28 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_THREADS_H | 4 | #ifndef ZIP7_INC_THREADS_H |
5 | #define ZIP7_INC_THREADS_H | 5 | #define ZIP7_INC_THREADS_H |
@@ -140,12 +140,22 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param | |||
140 | WRes Thread_Wait_Close(CThread *p); | 140 | WRes Thread_Wait_Close(CThread *p); |
141 | 141 | ||
142 | #ifdef _WIN32 | 142 | #ifdef _WIN32 |
143 | WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask); | ||
143 | #define Thread_Create_With_CpuSet(p, func, param, cs) \ | 144 | #define Thread_Create_With_CpuSet(p, func, param, cs) \ |
144 | Thread_Create_With_Affinity(p, func, param, *cs) | 145 | Thread_Create_With_Affinity(p, func, param, *cs) |
145 | #else | 146 | #else |
146 | WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet); | 147 | WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet); |
147 | #endif | 148 | #endif |
148 | 149 | ||
150 | typedef struct | ||
151 | { | ||
152 | unsigned NumGroups; | ||
153 | unsigned NextGroup; | ||
154 | } CThreadNextGroup; | ||
155 | |||
156 | void ThreadNextGroup_Init(CThreadNextGroup *p, unsigned numGroups, unsigned startGroup); | ||
157 | unsigned ThreadNextGroup_GetNext(CThreadNextGroup *p); | ||
158 | |||
149 | 159 | ||
150 | #ifdef _WIN32 | 160 | #ifdef _WIN32 |
151 | 161 | ||
diff --git a/C/Util/Lzma/LzmaUtil.dsp b/C/Util/Lzma/LzmaUtil.dsp index e2e7d42..71de950 100644 --- a/C/Util/Lzma/LzmaUtil.dsp +++ b/C/Util/Lzma/LzmaUtil.dsp | |||
@@ -122,6 +122,10 @@ SOURCE=..\..\Compiler.h | |||
122 | # End Source File | 122 | # End Source File |
123 | # Begin Source File | 123 | # Begin Source File |
124 | 124 | ||
125 | SOURCE=..\..\CpuArch.c | ||
126 | # End Source File | ||
127 | # Begin Source File | ||
128 | |||
125 | SOURCE=..\..\CpuArch.h | 129 | SOURCE=..\..\CpuArch.h |
126 | # End Source File | 130 | # End Source File |
127 | # Begin Source File | 131 | # Begin Source File |
diff --git a/C/Util/LzmaLib/LzmaLib.dsp b/C/Util/LzmaLib/LzmaLib.dsp index bacd967..f413137 100644 --- a/C/Util/LzmaLib/LzmaLib.dsp +++ b/C/Util/LzmaLib/LzmaLib.dsp | |||
@@ -43,7 +43,7 @@ RSC=rc.exe | |||
43 | # PROP Ignore_Export_Lib 0 | 43 | # PROP Ignore_Export_Lib 0 |
44 | # PROP Target_Dir "" | 44 | # PROP Target_Dir "" |
45 | # ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /c | 45 | # ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /c |
46 | # ADD CPP /nologo /Gr /MT /W3 /O2 /D "NDEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /FD /c | 46 | # ADD CPP /nologo /Gr /MT /W4 /WX /O2 /D "NDEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /FD /c |
47 | # SUBTRACT CPP /YX | 47 | # SUBTRACT CPP /YX |
48 | # ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32 | 48 | # ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32 |
49 | # ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32 | 49 | # ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32 |
@@ -71,7 +71,7 @@ LINK32=link.exe | |||
71 | # PROP Ignore_Export_Lib 0 | 71 | # PROP Ignore_Export_Lib 0 |
72 | # PROP Target_Dir "" | 72 | # PROP Target_Dir "" |
73 | # ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /GZ /c | 73 | # ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /GZ /c |
74 | # ADD CPP /nologo /MTd /W3 /Gm /ZI /Od /D "_DEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /D "COMPRESS_MF_MT" /FD /GZ /c | 74 | # ADD CPP /nologo /MTd /W4 /WX /Gm /ZI /Od /D "_DEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /D "COMPRESS_MF_MT" /FD /GZ /c |
75 | # SUBTRACT CPP /YX | 75 | # SUBTRACT CPP /YX |
76 | # ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32 | 76 | # ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32 |
77 | # ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32 | 77 | # ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32 |
@@ -128,6 +128,10 @@ SOURCE=..\..\Compiler.h | |||
128 | # End Source File | 128 | # End Source File |
129 | # Begin Source File | 129 | # Begin Source File |
130 | 130 | ||
131 | SOURCE=..\..\CpuArch.c | ||
132 | # End Source File | ||
133 | # Begin Source File | ||
134 | |||
131 | SOURCE=..\..\CpuArch.h | 135 | SOURCE=..\..\CpuArch.h |
132 | # End Source File | 136 | # End Source File |
133 | # Begin Source File | 137 | # Begin Source File |
@@ -1,5 +1,5 @@ | |||
1 | /* Xz.h - Xz interface | 1 | /* Xz.h - Xz interface |
2 | 2024-01-26 : Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_XZ_H | 4 | #ifndef ZIP7_INC_XZ_H |
5 | #define ZIP7_INC_XZ_H | 5 | #define ZIP7_INC_XZ_H |
@@ -121,6 +121,7 @@ typedef struct | |||
121 | UInt64 startOffset; | 121 | UInt64 startOffset; |
122 | } CXzStream; | 122 | } CXzStream; |
123 | 123 | ||
124 | #define Xz_CONSTRUCT(p) { (p)->numBlocks = 0; (p)->blocks = NULL; (p)->flags = 0; } | ||
124 | void Xz_Construct(CXzStream *p); | 125 | void Xz_Construct(CXzStream *p); |
125 | void Xz_Free(CXzStream *p, ISzAllocPtr alloc); | 126 | void Xz_Free(CXzStream *p, ISzAllocPtr alloc); |
126 | 127 | ||
@@ -136,8 +137,13 @@ typedef struct | |||
136 | CXzStream *streams; | 137 | CXzStream *streams; |
137 | } CXzs; | 138 | } CXzs; |
138 | 139 | ||
140 | #define Xzs_CONSTRUCT(p) { (p)->num = 0; (p)->numAllocated = 0; (p)->streams = NULL; } | ||
139 | void Xzs_Construct(CXzs *p); | 141 | void Xzs_Construct(CXzs *p); |
140 | void Xzs_Free(CXzs *p, ISzAllocPtr alloc); | 142 | void Xzs_Free(CXzs *p, ISzAllocPtr alloc); |
143 | /* | ||
144 | Xzs_ReadBackward() must be called for an empty CXzs object. | ||
145 | Xzs_ReadBackward() can return a non-empty object with (p->num != 0) even in case of error. | ||
146 | */ | ||
141 | SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc); | 147 | SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc); |
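
Given the contract above, a hedged calling pattern (assuming startOffset starts at the end offset of the xz data, since parsing runs backward; stream and allocator setup are omitted):

    static SRes ReadIndexDemo(ILookInStreamPtr lookStream, Int64 endOffset, ISzAllocPtr alloc)
    {
      CXzs xzs;
      Int64 startOffset = endOffset;
      SRes res;
      Xzs_Construct(&xzs);  // the object must be empty before the call
      res = Xzs_ReadBackward(&xzs, lookStream, &startOffset, NULL, alloc);
      // free unconditionally: (p->num != 0) is possible even on error
      Xzs_Free(&xzs, alloc);
      return res;
    }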
142 | 148 | ||
143 | UInt64 Xzs_GetNumBlocks(const CXzs *p); | 149 | UInt64 Xzs_GetNumBlocks(const CXzs *p); |
@@ -268,8 +274,8 @@ typedef struct | |||
268 | size_t outBufSize; | 274 | size_t outBufSize; |
269 | size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked | 275 | size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked |
270 | 276 | ||
271 | Byte shaDigest[SHA256_DIGEST_SIZE]; | 277 | UInt32 shaDigest32[SHA256_DIGEST_SIZE / 4]; |
272 | Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; | 278 | Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; // it must be aligned to 4 bytes
273 | } CXzUnpacker; | 279 | } CXzUnpacker; |
274 | 280 | ||
275 | /* alloc : aligned for cache line allocation is better */ | 281 | /* alloc : aligned for cache line allocation is better */ |
diff --git a/C/XzCrc64Opt.c b/C/XzCrc64Opt.c index 0c1fc2f..6eea4a3 100644 --- a/C/XzCrc64Opt.c +++ b/C/XzCrc64Opt.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* XzCrc64Opt.c -- CRC64 calculation (optimized functions) | 1 | /* XzCrc64Opt.c -- CRC64 calculation (optimized functions) |
2 | 2023-12-08 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -235,7 +235,7 @@ CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE) | |||
235 | v = Q32BE(1, w1) ^ Q32BE(0, w0); | 235 | v = Q32BE(1, w1) ^ Q32BE(0, w0); |
236 | v ^= Q32BE(3, d1) ^ Q32BE(2, d0); | 236 | v ^= Q32BE(3, d1) ^ Q32BE(2, d0); |
237 | #endif | 237 | #endif |
238 | #elif | 238 | #else |
239 | #error Stop_Compiling_Bad_CRC64_NUM_TABLES | 239 | #error Stop_Compiling_Bad_CRC64_NUM_TABLES |
240 | #endif | 240 | #endif |
241 | p += Z7_CRC64_NUM_TABLES_USE; | 241 | p += Z7_CRC64_NUM_TABLES_USE; |
@@ -1,5 +1,5 @@ | |||
1 | /* XzDec.c -- Xz Decode | 1 | /* XzDec.c -- Xz Decode |
2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -59,7 +59,7 @@ unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value) | |||
59 | 59 | ||
60 | for (i = 0; i < limit;) | 60 | for (i = 0; i < limit;) |
61 | { | 61 | { |
62 | Byte b = p[i]; | 62 | const unsigned b = p[i]; |
63 | *value |= (UInt64)(b & 0x7F) << (7 * i++); | 63 | *value |= (UInt64)(b & 0x7F) << (7 * i++); |
64 | if ((b & 0x80) == 0) | 64 | if ((b & 0x80) == 0) |
65 | return (b == 0 && i != 1) ? 0 : i; | 65 | return (b == 0 && i != 1) ? 0 : i; |
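
For reference, the loop above decodes the xz variable-length integer format: 7 payload bits per byte, least-significant group first, with bit 7 set meaning "continue". A worked example: the value 0x4000 (16384) is encoded as the bytes 80 80 01 — payloads 0, 0, 1, giving 0 | (0 << 7) | (1 << 14) = 0x4000. The (b == 0 && i != 1) check rejects non-minimal encodings whose final byte is zero.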
@@ -796,11 +796,10 @@ SRes Xz_ParseHeader(CXzStreamFlags *p, const Byte *buf) | |||
796 | 796 | ||
797 | static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf) | 797 | static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf) |
798 | { | 798 | { |
799 | return indexSize == (((UInt64)GetUi32(buf + 4) + 1) << 2) | 799 | return indexSize == (((UInt64)GetUi32a(buf + 4) + 1) << 2) |
800 | && GetUi32(buf) == CrcCalc(buf + 4, 6) | 800 | && GetUi32a(buf) == CrcCalc(buf + 4, 6) |
801 | && flags == GetBe16(buf + 8) | 801 | && flags == GetBe16a(buf + 8) |
802 | && buf[10] == XZ_FOOTER_SIG_0 | 802 | && GetUi16a(buf + 10) == (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8)); |
803 | && buf[11] == XZ_FOOTER_SIG_1; | ||
804 | } | 803 | } |
805 | 804 | ||
806 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ | 805 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ |
@@ -1166,7 +1165,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
1166 | p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks); | 1165 | p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks); |
1167 | p->indexPos = p->indexPreSize; | 1166 | p->indexPos = p->indexPreSize; |
1168 | p->indexSize += p->indexPreSize; | 1167 | p->indexSize += p->indexPreSize; |
1169 | Sha256_Final(&p->sha, p->shaDigest); | 1168 | Sha256_Final(&p->sha, (Byte *)(void *)p->shaDigest32); |
1170 | Sha256_Init(&p->sha); | 1169 | Sha256_Init(&p->sha); |
1171 | p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize); | 1170 | p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize); |
1172 | p->state = XZ_STATE_STREAM_INDEX; | 1171 | p->state = XZ_STATE_STREAM_INDEX; |
@@ -1241,10 +1240,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
1241 | break; | 1240 | break; |
1242 | } | 1241 | } |
1243 | { | 1242 | { |
1244 | Byte digest[XZ_CHECK_SIZE_MAX]; | 1243 | UInt32 digest32[XZ_CHECK_SIZE_MAX / 4]; |
1245 | p->state = XZ_STATE_BLOCK_HEADER; | 1244 | p->state = XZ_STATE_BLOCK_HEADER; |
1246 | p->pos = 0; | 1245 | p->pos = 0; |
1247 | if (XzCheck_Final(&p->check, digest) && memcmp(digest, p->buf, checkSize) != 0) | 1246 | if (XzCheck_Final(&p->check, (void *)digest32) && memcmp(digest32, p->buf, checkSize) != 0) |
1248 | return SZ_ERROR_CRC; | 1247 | return SZ_ERROR_CRC; |
1249 | if (p->decodeOnlyOneBlock) | 1248 | if (p->decodeOnlyOneBlock) |
1250 | { | 1249 | { |
@@ -1289,12 +1288,12 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
1289 | } | 1288 | } |
1290 | else | 1289 | else |
1291 | { | 1290 | { |
1292 | Byte digest[SHA256_DIGEST_SIZE]; | 1291 | UInt32 digest32[SHA256_DIGEST_SIZE / 4]; |
1293 | p->state = XZ_STATE_STREAM_INDEX_CRC; | 1292 | p->state = XZ_STATE_STREAM_INDEX_CRC; |
1294 | p->indexSize += 4; | 1293 | p->indexSize += 4; |
1295 | p->pos = 0; | 1294 | p->pos = 0; |
1296 | Sha256_Final(&p->sha, digest); | 1295 | Sha256_Final(&p->sha, (void *)digest32); |
1297 | if (memcmp(digest, p->shaDigest, SHA256_DIGEST_SIZE) != 0) | 1296 | if (memcmp(digest32, p->shaDigest32, SHA256_DIGEST_SIZE) != 0) |
1298 | return SZ_ERROR_CRC; | 1297 | return SZ_ERROR_CRC; |
1299 | } | 1298 | } |
1300 | } | 1299 | } |
@@ -1313,7 +1312,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
1313 | const Byte *ptr = p->buf; | 1312 | const Byte *ptr = p->buf; |
1314 | p->state = XZ_STATE_STREAM_FOOTER; | 1313 | p->state = XZ_STATE_STREAM_FOOTER; |
1315 | p->pos = 0; | 1314 | p->pos = 0; |
1316 | if (CRC_GET_DIGEST(p->crc) != GetUi32(ptr)) | 1315 | if (CRC_GET_DIGEST(p->crc) != GetUi32a(ptr)) |
1317 | return SZ_ERROR_CRC; | 1316 | return SZ_ERROR_CRC; |
1318 | } | 1317 | } |
1319 | break; | 1318 | break; |
@@ -1343,7 +1342,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
1343 | { | 1342 | { |
1344 | if (*src != 0) | 1343 | if (*src != 0) |
1345 | { | 1344 | { |
1346 | if (((UInt32)p->padSize & 3) != 0) | 1345 | if ((unsigned)p->padSize & 3) |
1347 | return SZ_ERROR_NO_ARCHIVE; | 1346 | return SZ_ERROR_NO_ARCHIVE; |
1348 | p->pos = 0; | 1347 | p->pos = 0; |
1349 | p->state = XZ_STATE_STREAM_HEADER; | 1348 | p->state = XZ_STATE_STREAM_HEADER; |
@@ -1,5 +1,5 @@ | |||
1 | /* XzEnc.c -- Xz Encode | 1 | /* XzEnc.c -- Xz Encode |
2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -411,6 +411,7 @@ static SRes SeqInFilter_Read(ISeqInStreamPtr pp, void *data, size_t *size) | |||
411 | } | 411 | } |
412 | } | 412 | } |
413 | 413 | ||
414 | Z7_FORCE_INLINE | ||
414 | static void SeqInFilter_Construct(CSeqInFilter *p) | 415 | static void SeqInFilter_Construct(CSeqInFilter *p) |
415 | { | 416 | { |
416 | p->buf = NULL; | 417 | p->buf = NULL; |
@@ -418,6 +419,7 @@ static void SeqInFilter_Construct(CSeqInFilter *p) | |||
418 | p->vt.Read = SeqInFilter_Read; | 419 | p->vt.Read = SeqInFilter_Read; |
419 | } | 420 | } |
420 | 421 | ||
422 | Z7_FORCE_INLINE | ||
421 | static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc) | 423 | static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc) |
422 | { | 424 | { |
423 | if (p->StateCoder.p) | 425 | if (p->StateCoder.p) |
@@ -507,6 +509,7 @@ void XzFilterProps_Init(CXzFilterProps *p) | |||
507 | void XzProps_Init(CXzProps *p) | 509 | void XzProps_Init(CXzProps *p) |
508 | { | 510 | { |
509 | p->checkId = XZ_CHECK_CRC32; | 511 | p->checkId = XZ_CHECK_CRC32; |
512 | p->numThreadGroups = 0; | ||
510 | p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO; | 513 | p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO; |
511 | p->numBlockThreads_Reduced = -1; | 514 | p->numBlockThreads_Reduced = -1; |
512 | p->numBlockThreads_Max = -1; | 515 | p->numBlockThreads_Max = -1; |
@@ -689,6 +692,7 @@ typedef struct | |||
689 | } CLzma2WithFilters; | 692 | } CLzma2WithFilters; |
690 | 693 | ||
691 | 694 | ||
695 | Z7_FORCE_INLINE | ||
692 | static void Lzma2WithFilters_Construct(CLzma2WithFilters *p) | 696 | static void Lzma2WithFilters_Construct(CLzma2WithFilters *p) |
693 | { | 697 | { |
694 | p->lzma2 = NULL; | 698 | p->lzma2 = NULL; |
@@ -712,6 +716,7 @@ static SRes Lzma2WithFilters_Create(CLzma2WithFilters *p, ISzAllocPtr alloc, ISz | |||
712 | } | 716 | } |
713 | 717 | ||
714 | 718 | ||
719 | Z7_FORCE_INLINE | ||
715 | static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc) | 720 | static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc) |
716 | { | 721 | { |
717 | #ifdef USE_SUBBLOCK | 722 | #ifdef USE_SUBBLOCK |
@@ -1236,6 +1241,7 @@ SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr in | |||
1236 | } | 1241 | } |
1237 | 1242 | ||
1238 | p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max; | 1243 | p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max; |
1244 | p->mtCoder.numThreadGroups = props->numThreadGroups; | ||
1239 | p->mtCoder.expectedDataSize = p->expectedDataSize; | 1245 | p->mtCoder.expectedDataSize = p->expectedDataSize; |
1240 | 1246 | ||
1241 | RINOK(MtCoder_Code(&p->mtCoder)) | 1247 | RINOK(MtCoder_Code(&p->mtCoder)) |
@@ -1,5 +1,5 @@ | |||
1 | /* XzEnc.h -- Xz Encode | 1 | /* XzEnc.h -- Xz Encode |
2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_XZ_ENC_H | 4 | #ifndef ZIP7_INC_XZ_ENC_H |
5 | #define ZIP7_INC_XZ_ENC_H | 5 | #define ZIP7_INC_XZ_ENC_H |
@@ -31,6 +31,7 @@ typedef struct | |||
31 | CLzma2EncProps lzma2Props; | 31 | CLzma2EncProps lzma2Props; |
32 | CXzFilterProps filterProps; | 32 | CXzFilterProps filterProps; |
33 | unsigned checkId; | 33 | unsigned checkId; |
34 | unsigned numThreadGroups; // 0 : no groups | ||
34 | UInt64 blockSize; | 35 | UInt64 blockSize; |
35 | int numBlockThreads_Reduced; | 36 | int numBlockThreads_Reduced; |
36 | int numBlockThreads_Max; | 37 | int numBlockThreads_Max; |
@@ -1,38 +1,39 @@ | |||
1 | /* XzIn.c - Xz input | 1 | /* XzIn.c - Xz input |
2 | 2023-09-07 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
6 | #include <string.h> | 6 | #include <string.h> |
7 | 7 | ||
8 | #include "7zCrc.h" | 8 | #include "7zCrc.h" |
9 | #include "CpuArch.h" | ||
10 | #include "Xz.h" | 9 | #include "Xz.h" |
10 | #include "CpuArch.h" | ||
11 | 11 | ||
12 | /* | 12 | #define XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(p) \ |
13 | #define XZ_FOOTER_SIG_CHECK(p) (memcmp((p), XZ_FOOTER_SIG, XZ_FOOTER_SIG_SIZE) == 0) | 13 | (GetUi16a((const Byte *)(const void *)(p) + 10) == \ |
14 | */ | 14 | (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8))) |
15 | #define XZ_FOOTER_SIG_CHECK(p) ((p)[0] == XZ_FOOTER_SIG_0 && (p)[1] == XZ_FOOTER_SIG_1) | ||
16 | |||
17 | 15 | ||
18 | SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream) | 16 | SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream) |
19 | { | 17 | { |
20 | Byte sig[XZ_STREAM_HEADER_SIZE]; | 18 | UInt32 data32[XZ_STREAM_HEADER_SIZE / 4]; |
21 | size_t processedSize = XZ_STREAM_HEADER_SIZE; | 19 | size_t processedSize = XZ_STREAM_HEADER_SIZE; |
22 | RINOK(SeqInStream_ReadMax(inStream, sig, &processedSize)) | 20 | RINOK(SeqInStream_ReadMax(inStream, data32, &processedSize)) |
23 | if (processedSize != XZ_STREAM_HEADER_SIZE | 21 | if (processedSize != XZ_STREAM_HEADER_SIZE |
24 | || memcmp(sig, XZ_SIG, XZ_SIG_SIZE) != 0) | 22 | || memcmp(data32, XZ_SIG, XZ_SIG_SIZE) != 0) |
25 | return SZ_ERROR_NO_ARCHIVE; | 23 | return SZ_ERROR_NO_ARCHIVE; |
26 | return Xz_ParseHeader(p, sig); | 24 | return Xz_ParseHeader(p, (const Byte *)(const void *)data32); |
27 | } | 25 | } |
28 | 26 | ||
29 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ | 27 | #define READ_VARINT_AND_CHECK(buf, size, res) \ |
30 | { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ | 28 | { const unsigned s = Xz_ReadVarInt(buf, size, res); \ |
31 | if (s == 0) return SZ_ERROR_ARCHIVE; \ | 29 | if (s == 0) return SZ_ERROR_ARCHIVE; \ |
32 | pos += s; } | 30 | size -= s; \ |
31 | buf += s; \ | ||
32 | } | ||
33 | 33 | ||
34 | SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes) | 34 | SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes) |
35 | { | 35 | { |
36 | MY_ALIGN(4) | ||
36 | Byte header[XZ_BLOCK_HEADER_SIZE_MAX]; | 37 | Byte header[XZ_BLOCK_HEADER_SIZE_MAX]; |
37 | unsigned headerSize; | 38 | unsigned headerSize; |
38 | *headerSizeRes = 0; | 39 | *headerSizeRes = 0; |
@@ -57,8 +58,12 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, | |||
57 | return XzBlock_Parse(p, header); | 58 | return XzBlock_Parse(p, header); |
58 | } | 59 | } |
59 | 60 | ||
61 | |||
60 | #define ADD_SIZE_CHECK(size, val) \ | 62 | #define ADD_SIZE_CHECK(size, val) \ |
61 | { const UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } | 63 | { const UInt64 newSize = size + (val); \ |
64 | if (newSize < size) return XZ_SIZE_OVERFLOW; \ | ||
65 | size = newSize; \ | ||
66 | } | ||
62 | 67 | ||
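ADD_SIZE_CHECK uses the standard unsigned-wraparound test: for UInt64 operands, (size + val) < size holds exactly when the mathematical sum does not fit in 64 bits. A tiny standalone illustration:

static int AddSizeOverflows(UInt64 size, UInt64 val)
{
  const UInt64 newSize = size + val; /* wraps modulo 2^64 on overflow */
  return newSize < size;             /* 1 exactly when the sum overflowed */
}
/* e.g. AddSizeOverflows((UInt64)1 << 63, (UInt64)1 << 63) == 1 */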
63 | UInt64 Xz_GetUnpackSize(const CXzStream *p) | 68 | UInt64 Xz_GetUnpackSize(const CXzStream *p) |
64 | { | 69 | { |
@@ -82,76 +87,85 @@ UInt64 Xz_GetPackSize(const CXzStream *p) | |||
82 | return size; | 87 | return size; |
83 | } | 88 | } |
84 | 89 | ||
85 | /* | ||
86 | SRes XzBlock_ReadFooter(CXzBlock *p, CXzStreamFlags f, ISeqInStreamPtr inStream) | ||
87 | { | ||
88 | return SeqInStream_Read(inStream, p->check, XzFlags_GetCheckSize(f)); | ||
89 | } | ||
90 | */ | ||
91 | 90 | ||
92 | static SRes Xz_ReadIndex2(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc) | 91 | // input: |
92 | // CXzStream (p) is an empty object. | ||
93 | // size != 0 | ||
94 | // (size & 3) == 0 | ||
95 | // (buf) is aligned to at least 4 bytes. | ||
96 | // output: | ||
97 | // p->numBlocks is the number of allocated items in p->blocks. | ||
98 | // p->blocks[*] values must be ignored if the function returns an error. | ||
99 | static SRes Xz_ParseIndex(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc) | ||
93 | { | 100 | { |
94 | size_t numBlocks, pos = 1; | 101 | size_t numBlocks; |
95 | UInt32 crc; | ||
96 | |||
97 | if (size < 5 || buf[0] != 0) | 102 | if (size < 5 || buf[0] != 0) |
98 | return SZ_ERROR_ARCHIVE; | 103 | return SZ_ERROR_ARCHIVE; |
99 | |||
100 | size -= 4; | 104 | size -= 4; |
101 | crc = CrcCalc(buf, size); | 105 | { |
102 | if (crc != GetUi32(buf + size)) | 106 | const UInt32 crc = CrcCalc(buf, size); |
103 | return SZ_ERROR_ARCHIVE; | 107 | if (crc != GetUi32a(buf + size)) |
104 | 108 | return SZ_ERROR_ARCHIVE; | |
109 | } | ||
110 | buf++; | ||
111 | size--; | ||
105 | { | 112 | { |
106 | UInt64 numBlocks64; | 113 | UInt64 numBlocks64; |
107 | READ_VARINT_AND_CHECK(buf, pos, size, &numBlocks64) | 114 | READ_VARINT_AND_CHECK(buf, size, &numBlocks64) |
108 | numBlocks = (size_t)numBlocks64; | 115 | // (numBlocks64) is a 63-bit value, so (numBlocks64 * 2) cannot overflow: |
109 | if (numBlocks != numBlocks64 || numBlocks * 2 > size) | 116 | if (numBlocks64 * 2 > size) |
110 | return SZ_ERROR_ARCHIVE; | 117 | return SZ_ERROR_ARCHIVE; |
118 | if (numBlocks64 >= ((size_t)1 << (sizeof(size_t) * 8 - 1)) / sizeof(CXzBlockSizes)) | ||
119 | return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE | ||
120 | numBlocks = (size_t)numBlocks64; | ||
111 | } | 121 | } |
112 | 122 | // Xz_Free(p, alloc); // optional, because (p) is already empty |
113 | Xz_Free(p, alloc); | 123 | if (numBlocks) |
114 | if (numBlocks != 0) | ||
115 | { | 124 | { |
116 | size_t i; | 125 | CXzBlockSizes *blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks); |
117 | p->numBlocks = numBlocks; | 126 | if (!blocks) |
118 | p->blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks); | ||
119 | if (!p->blocks) | ||
120 | return SZ_ERROR_MEM; | 127 | return SZ_ERROR_MEM; |
121 | for (i = 0; i < numBlocks; i++) | 128 | p->blocks = blocks; |
129 | p->numBlocks = numBlocks; | ||
130 | // the caller will call Xz_Free() in case of error | ||
131 | do | ||
122 | { | 132 | { |
123 | CXzBlockSizes *block = &p->blocks[i]; | 133 | READ_VARINT_AND_CHECK(buf, size, &blocks->totalSize) |
124 | READ_VARINT_AND_CHECK(buf, pos, size, &block->totalSize) | 134 | READ_VARINT_AND_CHECK(buf, size, &blocks->unpackSize) |
125 | READ_VARINT_AND_CHECK(buf, pos, size, &block->unpackSize) | 135 | if (blocks->totalSize == 0) |
126 | if (block->totalSize == 0) | ||
127 | return SZ_ERROR_ARCHIVE; | 136 | return SZ_ERROR_ARCHIVE; |
137 | blocks++; | ||
128 | } | 138 | } |
139 | while (--numBlocks); | ||
129 | } | 140 | } |
130 | while ((pos & 3) != 0) | 141 | if (size >= 4) |
131 | if (buf[pos++] != 0) | 142 | return SZ_ERROR_ARCHIVE; |
143 | while (size) | ||
144 | if (buf[--size]) | ||
132 | return SZ_ERROR_ARCHIVE; | 145 | return SZ_ERROR_ARCHIVE; |
133 | return (pos == size) ? SZ_OK : SZ_ERROR_ARCHIVE; | 146 | return SZ_OK; |
134 | } | 147 | } |
135 | 148 | ||
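For reference, the buffer that Xz_ParseIndex() consumes follows the xz index layout: a 0x00 indicator byte, a varint record count, one (totalSize, unpackSize) varint pair per block, zero padding to a 4-byte boundary, and a trailing CRC32 over everything before it. A sketch that builds the smallest valid index (one block; the helper name is illustrative, and CrcGenerateTable() is assumed to have been called):

#include "7zCrc.h"
#include "CpuArch.h"

static size_t BuildTinyIndex(Byte *index /* >= 8 bytes, 4-aligned */)
{
  index[0] = 0x00; /* index indicator */
  index[1] = 1;    /* varint: number of records */
  index[2] = 1;    /* varint: record 0 total (packed) size */
  index[3] = 1;    /* varint: record 0 unpack size */
  /* already at a 4-byte boundary here, so no padding bytes are needed */
  SetUi32(index + 4, CrcCalc(index, 4)) /* trailing CRC32 */
  return 8;
}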
149 | |||
150 | /* | ||
136 | static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc) | 151 | static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc) |
137 | { | 152 | { |
138 | SRes res; | 153 | SRes res; |
139 | size_t size; | 154 | size_t size; |
140 | Byte *buf; | 155 | Byte *buf; |
141 | if (indexSize > ((UInt32)1 << 31)) | 156 | if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1))) |
142 | return SZ_ERROR_UNSUPPORTED; | 157 | return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE |
143 | size = (size_t)indexSize; | 158 | size = (size_t)indexSize; |
144 | if (size != indexSize) | ||
145 | return SZ_ERROR_UNSUPPORTED; | ||
146 | buf = (Byte *)ISzAlloc_Alloc(alloc, size); | 159 | buf = (Byte *)ISzAlloc_Alloc(alloc, size); |
147 | if (!buf) | 160 | if (!buf) |
148 | return SZ_ERROR_MEM; | 161 | return SZ_ERROR_MEM; |
149 | res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); | 162 | res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); |
150 | if (res == SZ_OK) | 163 | if (res == SZ_OK) |
151 | res = Xz_ReadIndex2(p, buf, size, alloc); | 164 | res = Xz_ParseIndex(p, buf, size, alloc); |
152 | ISzAlloc_Free(alloc, buf); | 165 | ISzAlloc_Free(alloc, buf); |
153 | return res; | 166 | return res; |
154 | } | 167 | } |
168 | */ | ||
155 | 169 | ||
156 | static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size) | 170 | static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size) |
157 | { | 171 | { |
@@ -160,84 +174,102 @@ static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, | |||
160 | /* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */ | 174 | /* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */ |
161 | } | 175 | } |
162 | 176 | ||
177 | |||
178 | /* | ||
179 | in: | ||
180 | (*startOffset) is the position in (stream) where the xz_stream must end. | ||
181 | out: | ||
182 | if it returns SZ_OK, then (*startOffset) is the position in the stream where the xz_stream starts. | ||
183 | */ | ||
163 | static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc) | 184 | static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc) |
164 | { | 185 | { |
165 | UInt64 indexSize; | 186 | #define TEMP_BUF_SIZE (1 << 10) |
166 | Byte buf[XZ_STREAM_FOOTER_SIZE]; | 187 | UInt32 buf32[TEMP_BUF_SIZE / 4]; |
167 | UInt64 pos = (UInt64)*startOffset; | 188 | UInt64 pos = (UInt64)*startOffset; |
168 | 189 | ||
169 | if ((pos & 3) != 0 || pos < XZ_STREAM_FOOTER_SIZE) | 190 | if ((pos & 3) || pos < XZ_STREAM_FOOTER_SIZE) |
170 | return SZ_ERROR_NO_ARCHIVE; | 191 | return SZ_ERROR_NO_ARCHIVE; |
171 | |||
172 | pos -= XZ_STREAM_FOOTER_SIZE; | 192 | pos -= XZ_STREAM_FOOTER_SIZE; |
173 | RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)) | 193 | RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE)) |
174 | 194 | ||
175 | if (!XZ_FOOTER_SIG_CHECK(buf + 10)) | 195 | if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32)) |
176 | { | 196 | { |
177 | UInt32 total = 0; | ||
178 | pos += XZ_STREAM_FOOTER_SIZE; | 197 | pos += XZ_STREAM_FOOTER_SIZE; |
179 | |||
180 | for (;;) | 198 | for (;;) |
181 | { | 199 | { |
182 | size_t i; | 200 | // pos != 0 |
183 | #define TEMP_BUF_SIZE (1 << 10) | 201 | // (pos & 3) == 0 |
184 | Byte temp[TEMP_BUF_SIZE]; | 202 | size_t i = pos >= TEMP_BUF_SIZE ? TEMP_BUF_SIZE : (size_t)pos; |
185 | |||
186 | i = (pos > TEMP_BUF_SIZE) ? TEMP_BUF_SIZE : (size_t)pos; | ||
187 | pos -= i; | 203 | pos -= i; |
188 | RINOK(LookInStream_SeekRead_ForArc(stream, pos, temp, i)) | 204 | RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, i)) |
189 | total += (UInt32)i; | 205 | i /= 4; |
190 | for (; i != 0; i--) | 206 | do |
191 | if (temp[i - 1] != 0) | 207 | if (buf32[i - 1] != 0) |
192 | break; | 208 | break; |
193 | if (i != 0) | 209 | while (--i); |
194 | { | 210 | |
195 | if ((i & 3) != 0) | 211 | pos += i * 4; |
196 | return SZ_ERROR_NO_ARCHIVE; | 212 | #define XZ_STREAM_BACKWARD_READING_PAD_MAX (1 << 16) |
197 | pos += i; | 213 | // here we don't support rare case with big padding for xz stream. |
198 | break; | 214 | // so we have padding limit for backward reading. |
199 | } | 215 | if ((UInt64)*startOffset - pos > XZ_STREAM_BACKWARD_READING_PAD_MAX) |
200 | if (pos < XZ_STREAM_FOOTER_SIZE || total > (1 << 16)) | ||
201 | return SZ_ERROR_NO_ARCHIVE; | 216 | return SZ_ERROR_NO_ARCHIVE; |
217 | if (i) | ||
218 | break; | ||
202 | } | 219 | } |
203 | 220 | // we try to open the xz stream after skipping the zero padding. |
221 | // ((UInt64)*startOffset == pos) is possible here! | ||
204 | if (pos < XZ_STREAM_FOOTER_SIZE) | 222 | if (pos < XZ_STREAM_FOOTER_SIZE) |
205 | return SZ_ERROR_NO_ARCHIVE; | 223 | return SZ_ERROR_NO_ARCHIVE; |
206 | pos -= XZ_STREAM_FOOTER_SIZE; | 224 | pos -= XZ_STREAM_FOOTER_SIZE; |
207 | RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)) | 225 | RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE)) |
208 | if (!XZ_FOOTER_SIG_CHECK(buf + 10)) | 226 | if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32)) |
209 | return SZ_ERROR_NO_ARCHIVE; | 227 | return SZ_ERROR_NO_ARCHIVE; |
210 | } | 228 | } |
211 | 229 | ||
212 | p->flags = (CXzStreamFlags)GetBe16(buf + 8); | 230 | p->flags = (CXzStreamFlags)GetBe16a(buf32 + 2); |
213 | |||
214 | if (!XzFlags_IsSupported(p->flags)) | 231 | if (!XzFlags_IsSupported(p->flags)) |
215 | return SZ_ERROR_UNSUPPORTED; | 232 | return SZ_ERROR_UNSUPPORTED; |
216 | |||
217 | { | 233 | { |
218 | /* to eliminate GCC 6.3 warning: | 234 | /* to eliminate GCC 6.3 warning: |
219 | dereferencing type-punned pointer will break strict-aliasing rules */ | 235 | dereferencing type-punned pointer will break strict-aliasing rules */ |
220 | const Byte *buf_ptr = buf; | 236 | const UInt32 *buf_ptr = buf32; |
221 | if (GetUi32(buf_ptr) != CrcCalc(buf + 4, 6)) | 237 | if (GetUi32a(buf_ptr) != CrcCalc(buf32 + 1, 6)) |
222 | return SZ_ERROR_ARCHIVE; | 238 | return SZ_ERROR_ARCHIVE; |
223 | } | 239 | } |
224 | |||
225 | indexSize = ((UInt64)GetUi32(buf + 4) + 1) << 2; | ||
226 | |||
227 | if (pos < indexSize) | ||
228 | return SZ_ERROR_ARCHIVE; | ||
229 | |||
230 | pos -= indexSize; | ||
231 | RINOK(LookInStream_SeekTo(stream, pos)) | ||
232 | RINOK(Xz_ReadIndex(p, stream, indexSize, alloc)) | ||
233 | |||
234 | { | 240 | { |
235 | UInt64 totalSize = Xz_GetPackSize(p); | 241 | const UInt64 indexSize = ((UInt64)GetUi32a(buf32 + 1) + 1) << 2; |
236 | if (totalSize == XZ_SIZE_OVERFLOW | 242 | if (pos < indexSize) |
237 | || totalSize >= ((UInt64)1 << 63) | ||
238 | || pos < totalSize + XZ_STREAM_HEADER_SIZE) | ||
239 | return SZ_ERROR_ARCHIVE; | 243 | return SZ_ERROR_ARCHIVE; |
240 | pos -= (totalSize + XZ_STREAM_HEADER_SIZE); | 244 | pos -= indexSize; |
245 | // v25.00: relaxed the indexSize check: we now allow a big index table. | ||
246 | // if (indexSize > ((UInt32)1 << 31)) | ||
247 | if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1))) | ||
248 | return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE | ||
249 | RINOK(LookInStream_SeekTo(stream, pos)) | ||
250 | // RINOK(Xz_ReadIndex(p, stream, indexSize, alloc)) | ||
251 | { | ||
252 | SRes res; | ||
253 | const size_t size = (size_t)indexSize; | ||
254 | // if (size != indexSize) return SZ_ERROR_UNSUPPORTED; | ||
255 | Byte *buf = (Byte *)ISzAlloc_Alloc(alloc, size); | ||
256 | if (!buf) | ||
257 | return SZ_ERROR_MEM; | ||
258 | res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); | ||
259 | if (res == SZ_OK) | ||
260 | res = Xz_ParseIndex(p, buf, size, alloc); | ||
261 | ISzAlloc_Free(alloc, buf); | ||
262 | RINOK(res) | ||
263 | } | ||
264 | } | ||
265 | { | ||
266 | UInt64 total = Xz_GetPackSize(p); | ||
267 | if (total == XZ_SIZE_OVERFLOW || total >= ((UInt64)1 << 63)) | ||
268 | return SZ_ERROR_ARCHIVE; | ||
269 | total += XZ_STREAM_HEADER_SIZE; | ||
270 | if (pos < total) | ||
271 | return SZ_ERROR_ARCHIVE; | ||
272 | pos -= total; | ||
241 | RINOK(LookInStream_SeekTo(stream, pos)) | 273 | RINOK(LookInStream_SeekTo(stream, pos)) |
242 | *startOffset = (Int64)pos; | 274 | *startOffset = (Int64)pos; |
243 | } | 275 | } |
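For orientation, this is the 12-byte stream footer that the backward reader above decodes, viewed as three aligned 32-bit words (per the xz file-format spec; the helper is a sketch, not part of the source):

/* buf32[0] : CRC32 of the 6 bytes that follow it
   buf32[1] : backwardSize; real index size = (backwardSize + 1) * 4
   buf32[2] : 2 bytes of stream flags, then the 2-byte "YZ" magic */
static UInt64 IndexSizeFromFooter(const UInt32 *buf32 /* 4-aligned, 12 bytes */)
{
  return ((UInt64)GetUi32a(buf32 + 1) + 1) << 2;
}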
@@ -246,7 +278,6 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO | |||
246 | CSecToRead secToRead; | 278 | CSecToRead secToRead; |
247 | SecToRead_CreateVTable(&secToRead); | 279 | SecToRead_CreateVTable(&secToRead); |
248 | secToRead.realStream = stream; | 280 | secToRead.realStream = stream; |
249 | |||
250 | RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt)) | 281 | RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt)) |
251 | return (p->flags == headerFlags) ? SZ_OK : SZ_ERROR_ARCHIVE; | 282 | return (p->flags == headerFlags) ? SZ_OK : SZ_ERROR_ARCHIVE; |
252 | } | 283 | } |
@@ -257,8 +288,7 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO | |||
257 | 288 | ||
258 | void Xzs_Construct(CXzs *p) | 289 | void Xzs_Construct(CXzs *p) |
259 | { | 290 | { |
260 | p->num = p->numAllocated = 0; | 291 | Xzs_CONSTRUCT(p) |
261 | p->streams = 0; | ||
262 | } | 292 | } |
263 | 293 | ||
264 | void Xzs_Free(CXzs *p, ISzAllocPtr alloc) | 294 | void Xzs_Free(CXzs *p, ISzAllocPtr alloc) |
@@ -268,7 +298,7 @@ void Xzs_Free(CXzs *p, ISzAllocPtr alloc) | |||
268 | Xz_Free(&p->streams[i], alloc); | 298 | Xz_Free(&p->streams[i], alloc); |
269 | ISzAlloc_Free(alloc, p->streams); | 299 | ISzAlloc_Free(alloc, p->streams); |
270 | p->num = p->numAllocated = 0; | 300 | p->num = p->numAllocated = 0; |
271 | p->streams = 0; | 301 | p->streams = NULL; |
272 | } | 302 | } |
273 | 303 | ||
274 | UInt64 Xzs_GetNumBlocks(const CXzs *p) | 304 | UInt64 Xzs_GetNumBlocks(const CXzs *p) |
@@ -307,34 +337,49 @@ UInt64 Xzs_GetPackSize(const CXzs *p) | |||
307 | SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc) | 337 | SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc) |
308 | { | 338 | { |
309 | Int64 endOffset = 0; | 339 | Int64 endOffset = 0; |
340 | // the CXzs object is supposed to be empty here. | ||
341 | // if it is not empty, new streams are appended to that non-empty object. | ||
342 | // Xzs_Free(p, alloc); // optional call that empties the CXzs object. | ||
310 | RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END)) | 343 | RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END)) |
311 | *startOffset = endOffset; | 344 | *startOffset = endOffset; |
312 | for (;;) | 345 | for (;;) |
313 | { | 346 | { |
314 | CXzStream st; | 347 | CXzStream st; |
315 | SRes res; | 348 | SRes res; |
316 | Xz_Construct(&st); | 349 | Xz_CONSTRUCT(&st) |
317 | res = Xz_ReadBackward(&st, stream, startOffset, alloc); | 350 | res = Xz_ReadBackward(&st, stream, startOffset, alloc); |
351 | // if (res == SZ_OK), then (*startOffset) is the start offset of the new stream. | ||
352 | // if (res != SZ_OK), then (*startOffset) is unchanged or is the expected start offset of the stream with the error. | ||
318 | st.startOffset = (UInt64)*startOffset; | 353 | st.startOffset = (UInt64)*startOffset; |
319 | RINOK(res) | 354 | // we must either store the (st) object in the array or free the local (st) object. |
355 | if (res != SZ_OK) | ||
356 | { | ||
357 | Xz_Free(&st, alloc); | ||
358 | return res; | ||
359 | } | ||
320 | if (p->num == p->numAllocated) | 360 | if (p->num == p->numAllocated) |
321 | { | 361 | { |
322 | const size_t newNum = p->num + p->num / 4 + 1; | 362 | const size_t newNum = p->num + p->num / 4 + 1; |
323 | void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); | 363 | void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); |
324 | if (!data) | 364 | if (!data) |
365 | { | ||
366 | Xz_Free(&st, alloc); | ||
325 | return SZ_ERROR_MEM; | 367 | return SZ_ERROR_MEM; |
368 | } | ||
326 | p->numAllocated = newNum; | 369 | p->numAllocated = newNum; |
327 | if (p->num != 0) | 370 | if (p->num != 0) |
328 | memcpy(data, p->streams, p->num * sizeof(CXzStream)); | 371 | memcpy(data, p->streams, p->num * sizeof(CXzStream)); |
329 | ISzAlloc_Free(alloc, p->streams); | 372 | ISzAlloc_Free(alloc, p->streams); |
330 | p->streams = (CXzStream *)data; | 373 | p->streams = (CXzStream *)data; |
331 | } | 374 | } |
375 | // we copy the raw data of the local variable (st) directly into the array element, | ||
376 | // so we don't need to call Xz_Free(&st, alloc) after the copy and the p->num++. | ||
332 | p->streams[p->num++] = st; | 377 | p->streams[p->num++] = st; |
333 | if (*startOffset == 0) | 378 | if (*startOffset == 0) |
334 | break; | 379 | return SZ_OK; |
335 | RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset)) | 380 | // the seek operation is optional: |
381 | // RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset)) | ||
336 | if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK) | 382 | if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK) |
337 | return SZ_ERROR_PROGRESS; | 383 | return SZ_ERROR_PROGRESS; |
338 | } | 384 | } |
339 | return SZ_OK; | ||
340 | } | 385 | } |
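A minimal caller sketch for Xzs_ReadBackward() as changed above (the look-in stream setup is elided; everything shown uses declarations from Xz.h):

static SRes ListXzStreams(ILookInStreamPtr stream, ISzAllocPtr alloc)
{
  CXzs xzs;
  Int64 startOffset = 0;
  SRes res;
  Xzs_Construct(&xzs);
  /* seeks to the end itself and walks the concatenated xz streams backward */
  res = Xzs_ReadBackward(&xzs, stream, &startOffset, NULL, alloc);
  if (res == SZ_OK)
  {
    /* (startOffset) now points at the header of the first stream */
    const UInt64 numBlocks = Xzs_GetNumBlocks(&xzs);
    const UInt64 packSize = Xzs_GetPackSize(&xzs);
    (void)numBlocks;
    (void)packSize;
  }
  Xzs_Free(&xzs, alloc);
  return res;
}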