Diffstat (limited to 'C')
-rw-r--r--   C/7zDec.c                      5
-rw-r--r--   C/7zVersion.h                 10
-rw-r--r--   C/AesOpt.c                   233
-rw-r--r--   C/BwtSort.c                  468
-rw-r--r--   C/BwtSort.h                    7
-rw-r--r--   C/Compiler.h                  12
-rw-r--r--   C/CpuArch.c                  109
-rw-r--r--   C/CpuArch.h                   41
-rw-r--r--   C/HuffEnc.c                  384
-rw-r--r--   C/HuffEnc.h                    8
-rw-r--r--   C/LzFind.c                    26
-rw-r--r--   C/LzFindMt.c                  10
-rw-r--r--   C/LzFindMt.h                   6
-rw-r--r--   C/Lzma2Enc.c                   4
-rw-r--r--   C/Lzma2Enc.h                   1
-rw-r--r--   C/LzmaEnc.c                   22
-rw-r--r--   C/LzmaEnc.h                    4
-rw-r--r--   C/Md5.c                      206
-rw-r--r--   C/Md5.h                       34
-rw-r--r--   C/MtCoder.c                   61
-rw-r--r--   C/MtCoder.h                    7
-rw-r--r--   C/Sha1.c                     125
-rw-r--r--   C/Sha1.h                      18
-rw-r--r--   C/Sha1Opt.c                  146
-rw-r--r--   C/Sha256.c                   162
-rw-r--r--   C/Sha256.h                    18
-rw-r--r--   C/Sha256Opt.c                172
-rw-r--r--   C/Sha3.c                     359
-rw-r--r--   C/Sha3.h                      36
-rw-r--r--   C/Sha512.c                   711
-rw-r--r--   C/Sha512.h                    86
-rw-r--r--   C/Sha512Opt.c                395
-rw-r--r--   C/Sort.c                     355
-rw-r--r--   C/Sort.h                       7
-rw-r--r--   C/Threads.c                  237
-rw-r--r--   C/Threads.h                   12
-rw-r--r--   C/Util/Lzma/LzmaUtil.dsp       4
-rw-r--r--   C/Util/LzmaLib/LzmaLib.dsp     8
-rw-r--r--   C/Xz.h                        12
-rw-r--r--   C/XzCrc64Opt.c                 4
-rw-r--r--   C/XzDec.c                     29
-rw-r--r--   C/XzEnc.c                      8
-rw-r--r--   C/XzEnc.h                      3
-rw-r--r--   C/XzIn.c                     265
44 files changed, 3758 insertions(+), 1072 deletions(-)
diff --git a/C/7zDec.c b/C/7zDec.c
index c9b4064..520cbfd 100644
--- a/C/7zDec.c
+++ b/C/7zDec.c
@@ -1,5 +1,5 @@
 /* 7zDec.c -- Decoding from 7z folder
-2024-03-01 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -312,8 +312,9 @@ static BoolInt IS_MAIN_METHOD(UInt32 m)
     case k_PPMD:
     #endif
       return True;
+    default:
+      return False;
   }
-  return False;
 }
 
 static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c)
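
Note on the hunk above: the fall-through `return False` after the switch moves into an explicit `default:` case, so every path now returns from inside the switch. A minimal sketch of the resulting shape (the other `k_*` case labels are elided here):

    static BoolInt IS_MAIN_METHOD(UInt32 m)
    {
      switch (m)
      {
        /* ... other supported coder IDs ... */
        case k_PPMD:
          return True;
        default:
          return False;
      }
    }
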
diff --git a/C/7zVersion.h b/C/7zVersion.h
index 1ddef80..b6142e9 100644
--- a/C/7zVersion.h
+++ b/C/7zVersion.h
@@ -1,7 +1,7 @@
-#define MY_VER_MAJOR 24
-#define MY_VER_MINOR 8
+#define MY_VER_MAJOR 25
+#define MY_VER_MINOR 1
 #define MY_VER_BUILD 0
-#define MY_VERSION_NUMBERS "24.08"
+#define MY_VERSION_NUMBERS "25.01"
 #define MY_VERSION MY_VERSION_NUMBERS
 
 #ifdef MY_CPU_NAME
@@ -10,12 +10,12 @@
   #define MY_VERSION_CPU MY_VERSION
 #endif
 
-#define MY_DATE "2024-08-11"
+#define MY_DATE "2025-08-03"
 #undef MY_COPYRIGHT
 #undef MY_VERSION_COPYRIGHT_DATE
 #define MY_AUTHOR_NAME "Igor Pavlov"
 #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain"
-#define MY_COPYRIGHT_CR "Copyright (c) 1999-2024 Igor Pavlov"
+#define MY_COPYRIGHT_CR "Copyright (c) 1999-2025 Igor Pavlov"
 
 #ifdef USE_COPYRIGHT_CR
   #define MY_COPYRIGHT MY_COPYRIGHT_CR
diff --git a/C/AesOpt.c b/C/AesOpt.c
index 58769ea..b281807 100644
--- a/C/AesOpt.c
+++ b/C/AesOpt.c
@@ -1,5 +1,5 @@
 /* AesOpt.c -- AES optimized code for x86 AES hardware instructions
-2024-03-01 : Igor Pavlov : Public domain */
+Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -80,19 +80,39 @@ AES_FUNC_START (name)
 
 #define MM_XOR( dest, src)   MM_OP(_mm_xor_si128, dest, src)
 
+#if 1
+// use aligned SSE load/store for data.
+// It is required for our Aes functions, that data is aligned for 16-bytes.
+// So we can use this branch of code.
+// and compiler can use fused load-op SSE instructions:
+//   xorps xmm0, XMMWORD PTR [rdx]
+#define LOAD_128(pp)      (*(__m128i *)(void *)(pp))
+#define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v
+// use aligned SSE load/store for data. Alternative code with direct access
+// #define LOAD_128(pp)      _mm_load_si128(pp)
+// #define STORE_128(pp, _v) _mm_store_si128(pp, _v)
+#else
+// use unaligned load/store for data: movdqu XMMWORD PTR [rdx]
+#define LOAD_128(pp)      _mm_loadu_si128(pp)
+#define STORE_128(pp, _v) _mm_storeu_si128(pp, _v)
+#endif
+
 AES_FUNC_START2 (AesCbc_Encode_HW)
 {
+  if (numBlocks == 0)
+    return;
+  {
   __m128i *p = (__m128i *)(void *)ivAes;
   __m128i *data = (__m128i *)(void *)data8;
   __m128i m = *p;
   const __m128i k0 = p[2];
   const __m128i k1 = p[3];
   const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
-  for (; numBlocks != 0; numBlocks--, data++)
+  do
   {
     UInt32 r = numRounds2;
     const __m128i *w = p + 4;
-    __m128i temp = *data;
+    __m128i temp = LOAD_128(data);
     MM_XOR (temp, k0)
     MM_XOR (m, temp)
     MM_OP_m (_mm_aesenc_si128, k1)
@@ -104,9 +124,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
     }
     while (--r);
     MM_OP_m (_mm_aesenclast_si128, w[0])
-    *data = m;
+    STORE_128(data, m);
+    data++;
   }
+  while (--numBlocks);
   *p = m;
+  }
 }
 
 
@@ -139,12 +162,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 
 #define WOP(op)  op (m0, 0)  WOP_M1(op)
 
-
 #define DECLARE_VAR(reg, ii)  __m128i reg;
-#define LOAD_data(  reg, ii)  reg = data[ii];
-#define STORE_data( reg, ii)  data[ii] = reg;
+#define LOAD_data_ii(ii)      LOAD_128(data + (ii))
+#define LOAD_data(  reg, ii)  reg = LOAD_data_ii(ii);
+#define STORE_data( reg, ii)  STORE_128(data + (ii), reg);
 #if (NUM_WAYS > 1)
-#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii- 1])
+#define XOR_data_M1(reg, ii)  MM_XOR (reg, LOAD_128(data + (ii- 1)))
 #endif
 
 #define MM_OP_key(op, reg)  MM_OP(op, reg, key);
@@ -156,25 +179,22 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 #define AES_XOR( reg, ii)   MM_OP_key (_mm_xor_si128, reg)
 
 #define CTR_START(reg, ii)  MM_OP (_mm_add_epi64, ctr, one)  reg = ctr;
-#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg)
-
+#define CTR_END(  reg, ii)  STORE_128(data + (ii), _mm_xor_si128(reg, \
+    LOAD_128 (data + (ii))));
 #define WOP_KEY(op, n) { \
     const __m128i key = w[n]; \
-    WOP(op); }
-
+    WOP(op) }
 
 #define WIDE_LOOP_START  \
     dataEnd = data + numBlocks;  \
     if (numBlocks >= NUM_WAYS)  \
     { dataEnd -= NUM_WAYS; do {  \
 
-
 #define WIDE_LOOP_END  \
     data += NUM_WAYS;  \
     } while (data <= dataEnd);  \
     dataEnd += NUM_WAYS;  } \
 
-
 #define SINGLE_LOOP \
     for (; data < dataEnd; data++)
 
@@ -184,54 +204,73 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 
 #define AVX_XOR(dest, src)        MM_OP(_mm256_xor_si256, dest, src)
 #define AVX_DECLARE_VAR(reg, ii)  __m256i reg;
-#define AVX_LOAD_data(  reg, ii)  reg = ((const __m256i *)(const void *)data)[ii];
-#define AVX_STORE_data( reg, ii)  ((__m256i *)(void *)data)[ii] = reg;
+
+#if 1
+// use unaligned AVX load/store for data.
+// It is required for our Aes functions, that data is aligned for 16-bytes.
+// But we need 32-bytes reading.
+// So we use intrinsics for unaligned AVX load/store.
+// notes for _mm256_storeu_si256:
+// msvc2022: uses vmovdqu and keeps the order of instruction sequence.
+// new gcc11 uses vmovdqu
+// old gcc9 could use pair of instructions:
+//   vmovups %xmm7, -224(%rax)
+//   vextracti128 $0x1, %ymm7, -208(%rax)
+#define AVX_LOAD(p)       _mm256_loadu_si256((const __m256i *)(const void *)(p))
+#define AVX_STORE(p, _v)  _mm256_storeu_si256((__m256i *)(void *)(p), _v);
+#else
+// use aligned AVX load/store for data.
+// for debug: we can use this branch, if we are sure that data is aligned for 32-bytes.
+// msvc2022 uses vmovdqu still
+// gcc uses vmovdqa (that requires 32-bytes alignment)
+#define AVX_LOAD(p)       (*(const __m256i *)(const void *)(p))
+#define AVX_STORE(p, _v)  (*(__m256i *)(void *)(p)) = _v;
+#endif
+
+#define AVX_LOAD_data(  reg, ii)  reg = AVX_LOAD((const __m256i *)(const void *)data + (ii));
+#define AVX_STORE_data( reg, ii)  AVX_STORE((__m256i *)(void *)data + (ii), reg)
 /*
-AVX_XOR_data_M1() needs unaligned memory load
-if (we don't use _mm256_loadu_si256() here)
-{
-  Most compilers with enabled optimizations generate fused AVX (LOAD + OP)
-  instruction that can load unaligned data.
-  But GCC and CLANG without -O2 or -O1 optimizations can generate separated
-  LOAD-ALIGNED (vmovdqa) instruction that will fail on execution.
-}
-Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here.
-v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler.
+AVX_XOR_data_M1() needs unaligned memory load, even if (data)
+is aligned for 256-bits, because we read 32-bytes chunk that
+crosses (data) position: from (data - 16bytes) to (data + 16bytes).
 */
-#define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii])))
-// for debug only: the following code will fail on execution, if compiled by some compilers:
-// #define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))
+#define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii)))
 
 #define AVX_AES_DEC(      reg, ii)  MM_OP_key (_mm256_aesdec_epi128,     reg)
 #define AVX_AES_DEC_LAST( reg, ii)  MM_OP_key (_mm256_aesdeclast_epi128, reg)
 #define AVX_AES_ENC(      reg, ii)  MM_OP_key (_mm256_aesenc_epi128,     reg)
 #define AVX_AES_ENC_LAST( reg, ii)  MM_OP_key (_mm256_aesenclast_epi128, reg)
 #define AVX_AES_XOR(      reg, ii)  MM_OP_key (_mm256_xor_si256,         reg)
-#define AVX_CTR_START(reg, ii)  MM_OP (_mm256_add_epi64, ctr2, two)  reg = _mm256_xor_si256(ctr2, key);
-#define AVX_CTR_END(  reg, ii)  AVX_XOR (((__m256i *)(void *)data)[ii], reg)
+#define AVX_CTR_START(reg, ii) \
+    MM_OP (_mm256_add_epi64, ctr2, two) \
+    reg = _mm256_xor_si256(ctr2, key);
+
+#define AVX_CTR_END(reg, ii) \
+    AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \
+    AVX_LOAD ((__m256i *)(void *)data + (ii))));
+
 #define AVX_WOP_KEY(op, n) { \
     const __m256i key = w[n]; \
-    WOP(op); }
+    WOP(op) }
 
 #define NUM_AES_KEYS_MAX 15
 
 #define WIDE_LOOP_START_AVX(OP) \
     dataEnd = data + numBlocks;  \
     if (numBlocks >= NUM_WAYS * 2) \
     {   __m256i keys[NUM_AES_KEYS_MAX]; \
-        UInt32 ii; \
-        OP \
-        for (ii = 0; ii < numRounds; ii++) \
-          keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \
-        dataEnd -= NUM_WAYS * 2; do { \
-
+        OP \
+        { UInt32 ii; for (ii = 0; ii < numRounds; ii++) \
+            keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \
+        dataEnd -= NUM_WAYS * 2; \
+        do { \
 
 #define WIDE_LOOP_END_AVX(OP) \
         data += NUM_WAYS * 2;  \
     } while (data <= dataEnd); \
     dataEnd += NUM_WAYS * 2;  \
     OP \
     _mm256_zeroupper(); \
     } \
 
 /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
@@ -246,21 +285,20 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
   __m128i *p = (__m128i *)(void *)ivAes;
   __m128i *data = (__m128i *)(void *)data8;
   __m128i iv = *p;
-  const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
+  const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1;
   const __m128i *dataEnd;
   p += 2;
 
   WIDE_LOOP_START
   {
     const __m128i *w = wStart;
-
     WOP (DECLARE_VAR)
     WOP (LOAD_data)
     WOP_KEY (AES_XOR, 1)
-
     do
     {
       WOP_KEY (AES_DEC, 0)
+
       w--;
     }
     while (w != p);
@@ -268,7 +306,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
 
     MM_XOR (m0, iv)
     WOP_M1 (XOR_data_M1)
-    iv = data[NUM_WAYS - 1];
+    LOAD_data(iv, NUM_WAYS - 1)
     WOP (STORE_data)
   }
   WIDE_LOOP_END
@@ -276,7 +314,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
   SINGLE_LOOP
   {
     const __m128i *w = wStart - 1;
-    __m128i m = _mm_xor_si128 (w[2], *data);
+    __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
+
     do
     {
       MM_OP_m (_mm_aesdec_si128, w[1])
@@ -286,10 +325,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
     while (w != p);
     MM_OP_m (_mm_aesdec_si128, w[1])
     MM_OP_m (_mm_aesdeclast_si128, w[0])
-
     MM_XOR (m, iv)
-    iv = *data;
-    *data = m;
+    LOAD_data(iv, 0)
+    STORE_data(m, 0)
   }
 
   p[-2] = iv;
@@ -301,9 +339,9 @@ AES_FUNC_START2 (AesCtr_Code_HW)
   __m128i *p = (__m128i *)(void *)ivAes;
   __m128i *data = (__m128i *)(void *)data8;
   __m128i ctr = *p;
-  UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
+  const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
   const __m128i *dataEnd;
-  __m128i one = _mm_cvtsi32_si128(1);
+  const __m128i one = _mm_cvtsi32_si128(1);
 
   p += 2;
 
@@ -322,7 +360,6 @@ AES_FUNC_START2 (AesCtr_Code_HW)
     }
     while (--r);
     WOP_KEY (AES_ENC_LAST, 0)
-
     WOP (CTR_END)
   }
   WIDE_LOOP_END
@@ -344,7 +381,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
     while (--numRounds2);
     MM_OP_m (_mm_aesenc_si128, w[0])
     MM_OP_m (_mm_aesenclast_si128, w[1])
-    MM_XOR (*data, m)
+    CTR_END (m, 0)
   }
 
   p[-2] = ctr;
@@ -421,7 +458,7 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
   __m128i *data = (__m128i *)(void *)data8;
   __m128i iv = *p;
   const __m128i *dataEnd;
-  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
+  const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
   p += 2;
 
   WIDE_LOOP_START_AVX(;)
@@ -440,17 +477,17 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
     while (w != keys);
     AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)
 
-    AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]))
+    AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0)))
     WOP_M1 (AVX_XOR_data_M1)
-    iv = data[NUM_WAYS * 2 - 1];
+    LOAD_data (iv, NUM_WAYS * 2 - 1)
     WOP (AVX_STORE_data)
   }
   WIDE_LOOP_END_AVX(;)
 
   SINGLE_LOOP
   {
-    const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
-    __m128i m = _mm_xor_si128 (w[2], *data);
+    const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2;
+    __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
     do
     {
       MM_OP_m (_mm_aesdec_si128, w[1])
@@ -462,8 +499,8 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
     MM_OP_m (_mm_aesdeclast_si128, w[0])
 
     MM_XOR (m, iv)
-    iv = *data;
-    *data = m;
+    LOAD_data(iv, 0)
+    STORE_data(m, 0)
   }
 
   p[-2] = iv;
@@ -493,9 +530,9 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
   __m128i *p = (__m128i *)(void *)ivAes;
   __m128i *data = (__m128i *)(void *)data8;
   __m128i ctr = *p;
-  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
+  const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
   const __m128i *dataEnd;
-  __m128i one = _mm_cvtsi32_si128(1);
+  const __m128i one = _mm_cvtsi32_si128(1);
   __m256i ctr2, two;
   p += 2;
 
@@ -536,7 +573,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
     while (--numRounds2);
     MM_OP_m (_mm_aesenc_si128, w[0])
     MM_OP_m (_mm_aesenclast_si128, w[1])
-    MM_XOR (*data, m)
+    CTR_END (m, 0)
   }
 
   p[-2] = ctr;
@@ -731,9 +768,14 @@ AES_FUNC_START (name)
 
 AES_FUNC_START2 (AesCbc_Encode_HW)
 {
-  v128 * const p = (v128*)(void*)ivAes;
-  v128 *data = (v128*)(void*)data8;
+  if (numBlocks == 0)
+    return;
+  {
+  v128 * const p = (v128 *)(void *)ivAes;
+  v128 *data = (v128 *)(void *)data8;
   v128 m = *p;
+  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
+  const v128 *w = p + (size_t)numRounds2 * 2;
   const v128 k0 = p[2];
   const v128 k1 = p[3];
   const v128 k2 = p[4];
@@ -744,11 +786,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
   const v128 k7 = p[9];
   const v128 k8 = p[10];
   const v128 k9 = p[11];
-  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
-  const v128 *w = p + ((size_t)numRounds2 * 2);
+  const v128 k_z4 = w[-2];
+  const v128 k_z3 = w[-1];
+  const v128 k_z2 = w[0];
   const v128 k_z1 = w[1];
   const v128 k_z0 = w[2];
-  for (; numBlocks != 0; numBlocks--, data++)
+  // we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle,
+  // because gcc/clang compilers are not good for that optimization.
+  do
   {
     MM_XOR_m (*data)
     AES_E_MC_m (k0)
@@ -757,24 +802,26 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
     AES_E_MC_m (k3)
     AES_E_MC_m (k4)
     AES_E_MC_m (k5)
-    AES_E_MC_m (k6)
-    AES_E_MC_m (k7)
-    AES_E_MC_m (k8)
     if (numRounds2 >= 6)
     {
-      AES_E_MC_m (k9)
-      AES_E_MC_m (p[12])
+      AES_E_MC_m (k6)
+      AES_E_MC_m (k7)
       if (numRounds2 != 6)
       {
-        AES_E_MC_m (p[13])
-        AES_E_MC_m (p[14])
+        AES_E_MC_m (k8)
+        AES_E_MC_m (k9)
       }
     }
-    AES_E_m  (k_z1)
-    MM_XOR_m (k_z0)
-    *data = m;
+    AES_E_MC_m (k_z4)
+    AES_E_MC_m (k_z3)
+    AES_E_MC_m (k_z2)
+    AES_E_m  (k_z1)
+    MM_XOR_m (k_z0)
+    *data++ = m;
   }
+  while (--numBlocks);
   *p = m;
+  }
 }
 
 
@@ -834,10 +881,10 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
 
 AES_FUNC_START2 (AesCbc_Decode_HW)
 {
-  v128 *p = (v128*)(void*)ivAes;
-  v128 *data = (v128*)(void*)data8;
+  v128 *p = (v128 *)(void *)ivAes;
+  v128 *data = (v128 *)(void *)data8;
   v128 iv = *p;
-  const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
+  const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2;
   const v128 *dataEnd;
   p += 2;
 
@@ -858,7 +905,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
     WOP_KEY (AES_XOR, 0)
     MM_XOR (m0, iv)
     WOP_M1 (XOR_data_M1)
-    iv = data[NUM_WAYS - 1];
+    LOAD_data(iv, NUM_WAYS - 1)
     WOP (STORE_data)
   }
   WIDE_LOOP_END
@@ -866,7 +913,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
   SINGLE_LOOP
   {
     const v128 *w = wStart;
-    v128 m = *data;
+    v128 m;  LOAD_data(m, 0)
     AES_D_IMC_m (w[2])
     do
     {
@@ -878,8 +925,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
     AES_D_m  (w[1])
     MM_XOR_m (w[0])
     MM_XOR_m (iv)
-    iv = *data;
-    *data = m;
+    LOAD_data(iv, 0)
+    STORE_data(m, 0)
   }
 
   p[-2] = iv;
@@ -888,19 +935,17 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
 
 AES_FUNC_START2 (AesCtr_Code_HW)
 {
-  v128 *p = (v128*)(void*)ivAes;
-  v128 *data = (v128*)(void*)data8;
+  v128 *p = (v128 *)(void *)ivAes;
+  v128 *data = (v128 *)(void *)data8;
   uint64x2_t ctr = vreinterpretq_u64_u8(*p);
-  const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
+  const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2;
   const v128 *dataEnd;
-  uint64x2_t one = vdupq_n_u64(0);
-
 // the bug in clang:
 // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
 #if defined(__clang__) && (__clang_major__ <= 9)
 #pragma GCC diagnostic ignored "-Wvector-conversion"
 #endif
-  one = vsetq_lane_u64(1, one, 0);
+  const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0);
   p += 2;
 
   WIDE_LOOP_START
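
The new LOAD_128/STORE_128 and AVX_LOAD/AVX_STORE macros above encode one recurring trade-off: an aligned access lets the compiler fuse the load into the consuming AES instruction, while an unaligned access is the only safe choice when a 32-byte read can straddle the 16-byte alignment guarantee (as AVX_XOR_data_M1 does). A standalone sketch of the two access styles, not taken from the 7-Zip sources (assumes only SSE2):

    #include <emmintrin.h>  /* SSE2 intrinsics */

    /* Aligned access: a plain dereference compiles to movdqa, or gets fused
       into the consuming instruction; it faults if p is not 16-byte aligned. */
    static __m128i load_aligned(const void *p)
    {
      return *(const __m128i *)p;
    }

    /* Unaligned access: movdqu accepts any address, historically at some
       extra cost on older CPUs. */
    static __m128i load_unaligned(const void *p)
    {
      return _mm_loadu_si128((const __m128i *)p);
    }
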
diff --git a/C/BwtSort.c b/C/BwtSort.c
index 05ad6de..8f64f9d 100644
--- a/C/BwtSort.c
+++ b/C/BwtSort.c
@@ -1,5 +1,5 @@
 /* BwtSort.c -- BWT block sorting
-2023-04-02 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -7,6 +7,44 @@
 #include "Sort.h"
 
 /* #define BLOCK_SORT_USE_HEAP_SORT */
+// #define BLOCK_SORT_USE_HEAP_SORT
+
+#ifdef BLOCK_SORT_USE_HEAP_SORT
+
+#define HeapSortRefDown(p, vals, n, size, temp) \
+  { size_t k = n;  UInt32 val = vals[temp];  for (;;) { \
+    size_t s = k << 1; \
+    if (s > size) break; \
+    if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \
+    if (val >= vals[p[s]]) break; \
+    p[k] = p[s];  k = s; \
+  }  p[k] = temp; }
+
+void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size)
+{
+  if (size <= 1)
+    return;
+  p--;
+  {
+    size_t i = size / 2;
+    do
+    {
+      UInt32 temp = p[i];
+      HeapSortRefDown(p, vals, i, size, temp);
+    }
+    while (--i != 0);
+  }
+  do
+  {
+    UInt32 temp = p[size];
+    p[size--] = p[1];
+    HeapSortRefDown(p, vals, 1, size, temp);
+  }
+  while (size > 1);
+}
+
+#endif // BLOCK_SORT_USE_HEAP_SORT
+
 
 /* Don't change it !!! */
 #define kNumHashBytes 2
@@ -27,26 +65,27 @@
 
 #else
 
 #define kNumBitsMax 20
-#define kIndexMask ((1 << kNumBitsMax) - 1)
+#define kIndexMask (((UInt32)1 << kNumBitsMax) - 1)
 #define kNumExtraBits (32 - kNumBitsMax)
 #define kNumExtra0Bits (kNumExtraBits - 2)
 #define kNumExtra0Mask ((1 << kNumExtra0Bits) - 1)
 
 #define SetFinishedGroupSize(p, size) \
-  { *(p) |= ((((size) - 1) & kNumExtra0Mask) << kNumBitsMax); \
+  { *(p) |= ((((UInt32)(size) - 1) & kNumExtra0Mask) << kNumBitsMax); \
     if ((size) > (1 << kNumExtra0Bits)) { \
-      *(p) |= 0x40000000; *((p) + 1) |= ((((size) - 1)>> kNumExtra0Bits) << kNumBitsMax); } } \
+      *(p) |= 0x40000000; \
+      *((p) + 1) |= (((UInt32)(size) - 1) >> kNumExtra0Bits) << kNumBitsMax; } } \
 
-static void SetGroupSize(UInt32 *p, UInt32 size)
+static void SetGroupSize(UInt32 *p, size_t size)
 {
   if (--size == 0)
     return;
-  *p |= 0x80000000 | ((size & kNumExtra0Mask) << kNumBitsMax);
+  *p |= 0x80000000 | (((UInt32)size & kNumExtra0Mask) << kNumBitsMax);
   if (size >= (1 << kNumExtra0Bits))
   {
     *p |= 0x40000000;
-    p[1] |= ((size >> kNumExtra0Bits) << kNumBitsMax);
+    p[1] |= (((UInt32)size >> kNumExtra0Bits) << kNumBitsMax);
   }
 }
 
@@ -59,12 +98,14 @@ returns: 1 - if there are groups, 0 - no more groups
 */
 
 static
-UInt32
+unsigned
 Z7_FASTCALL
-SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 groupSize, int NumRefBits, UInt32 *Indices
-  #ifndef BLOCK_SORT_USE_HEAP_SORT
-  , UInt32 left, UInt32 range
-  #endif
+SortGroup(size_t BlockSize, size_t NumSortedBytes,
+    size_t groupOffset, size_t groupSize,
+    unsigned NumRefBits, UInt32 *Indices
+#ifndef BLOCK_SORT_USE_HEAP_SORT
+    , size_t left, size_t range
+#endif
   )
 {
   UInt32 *ind2 = Indices + groupOffset;
@@ -79,90 +120,93 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr
     return 0;
   }
   Groups = Indices + BlockSize + BS_TEMP_SIZE;
-  if (groupSize <= ((UInt32)1 << NumRefBits)
-  #ifndef BLOCK_SORT_USE_HEAP_SORT
+  if (groupSize <= ((size_t)1 << NumRefBits)
+#ifndef BLOCK_SORT_USE_HEAP_SORT
       && groupSize <= range
-  #endif
+#endif
       )
   {
     UInt32 *temp = Indices + BlockSize;
-    UInt32 j;
-    UInt32 mask, thereAreGroups, group, cg;
+    size_t j, group;
+    UInt32 mask, cg;
+    unsigned thereAreGroups;
     {
       UInt32 gPrev;
       UInt32 gRes = 0;
       {
-        UInt32 sp = ind2[0] + NumSortedBytes;
-        if (sp >= BlockSize) sp -= BlockSize;
+        size_t sp = ind2[0] + NumSortedBytes;
+        if (sp >= BlockSize)
+          sp -= BlockSize;
         gPrev = Groups[sp];
-        temp[0] = (gPrev << NumRefBits);
+        temp[0] = gPrev << NumRefBits;
       }
 
       for (j = 1; j < groupSize; j++)
       {
-        UInt32 sp = ind2[j] + NumSortedBytes;
+        size_t sp = ind2[j] + NumSortedBytes;
         UInt32 g;
-        if (sp >= BlockSize) sp -= BlockSize;
+        if (sp >= BlockSize)
+          sp -= BlockSize;
         g = Groups[sp];
-        temp[j] = (g << NumRefBits) | j;
+        temp[j] = (g << NumRefBits) | (UInt32)j;
         gRes |= (gPrev ^ g);
       }
       if (gRes == 0)
       {
-        #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
         SetGroupSize(ind2, groupSize);
-        #endif
+#endif
         return 1;
       }
     }
 
     HeapSort(temp, groupSize);
-    mask = (((UInt32)1 << NumRefBits) - 1);
+    mask = ((UInt32)1 << NumRefBits) - 1;
     thereAreGroups = 0;
 
     group = groupOffset;
-    cg = (temp[0] >> NumRefBits);
+    cg = temp[0] >> NumRefBits;
     temp[0] = ind2[temp[0] & mask];
 
     {
-      #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
       UInt32 *Flags = Groups + BlockSize;
-      #else
-      UInt32 prevGroupStart = 0;
-      #endif
+#else
+      size_t prevGroupStart = 0;
+#endif
 
       for (j = 1; j < groupSize; j++)
       {
-        UInt32 val = temp[j];
-        UInt32 cgCur = (val >> NumRefBits);
+        const UInt32 val = temp[j];
+        const UInt32 cgCur = val >> NumRefBits;
 
         if (cgCur != cg)
         {
           cg = cgCur;
           group = groupOffset + j;
 
-          #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
           {
-            UInt32 t = group - 1;
-            Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
+            const size_t t = group - 1;
+            Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask));
           }
-          #else
+#else
           SetGroupSize(temp + prevGroupStart, j - prevGroupStart);
           prevGroupStart = j;
-          #endif
+#endif
         }
         else
           thereAreGroups = 1;
         {
-          UInt32 ind = ind2[val & mask];
+          const UInt32 ind = ind2[val & mask];
           temp[j] = ind;
-          Groups[ind] = group;
+          Groups[ind] = (UInt32)group;
         }
       }
 
-      #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
       SetGroupSize(temp + prevGroupStart, j - prevGroupStart);
-      #endif
+#endif
     }
 
     for (j = 0; j < groupSize; j++)
@@ -172,37 +216,42 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr
 
   /* Check that all strings are in one group (cannot sort) */
   {
-    UInt32 group, j;
-    UInt32 sp = ind2[0] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
+    UInt32 group;
+    size_t j;
+    size_t sp = ind2[0] + NumSortedBytes;
+    if (sp >= BlockSize)
+      sp -= BlockSize;
     group = Groups[sp];
     for (j = 1; j < groupSize; j++)
     {
-      sp = ind2[j] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
+      sp = ind2[j] + NumSortedBytes;
+      if (sp >= BlockSize)
+        sp -= BlockSize;
       if (Groups[sp] != group)
         break;
     }
     if (j == groupSize)
     {
-      #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
       SetGroupSize(ind2, groupSize);
-      #endif
+#endif
       return 1;
     }
   }
 
-  #ifndef BLOCK_SORT_USE_HEAP_SORT
+#ifndef BLOCK_SORT_USE_HEAP_SORT
   {
     /* ---------- Range Sort ---------- */
-    UInt32 i;
-    UInt32 mid;
+    size_t i;
+    size_t mid;
     for (;;)
     {
-      UInt32 j;
+      size_t j;
       if (range <= 1)
       {
-        #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
         SetGroupSize(ind2, groupSize);
-        #endif
+#endif
         return 1;
       }
       mid = left + ((range + 1) >> 1);
@@ -210,7 +259,7 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr
       i = 0;
       do
       {
-        UInt32 sp = ind2[i] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
+        size_t sp = ind2[i] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
         if (Groups[sp] >= mid)
         {
           for (j--; j > i; j--)
@@ -238,51 +287,53 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr
           break;
       }
 
-      #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
       {
-        UInt32 t = (groupOffset + i - 1);
+        const size_t t = groupOffset + i - 1;
         UInt32 *Flags = Groups + BlockSize;
-        Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
+        Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask));
       }
-      #endif
+#endif
 
       {
-        UInt32 j;
+        size_t j;
         for (j = i; j < groupSize; j++)
-          Groups[ind2[j]] = groupOffset + i;
+          Groups[ind2[j]] = (UInt32)(groupOffset + i);
       }
 
       {
-        UInt32 res = SortGroup(BlockSize, NumSortedBytes, groupOffset, i, NumRefBits, Indices, left, mid - left);
+        unsigned res = SortGroup(BlockSize, NumSortedBytes, groupOffset, i, NumRefBits, Indices, left, mid - left);
         return res | SortGroup(BlockSize, NumSortedBytes, groupOffset + i, groupSize - i, NumRefBits, Indices, mid, range - (mid - left));
       }
 
   }
 
-  #else
+#else // BLOCK_SORT_USE_HEAP_SORT
 
   /* ---------- Heap Sort ---------- */
 
   {
-    UInt32 j;
+    size_t j;
     for (j = 0; j < groupSize; j++)
     {
-      UInt32 sp = ind2[j] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
-      ind2[j] = sp;
+      size_t sp = ind2[j] + NumSortedBytes;
+      if (sp >= BlockSize)
+        sp -= BlockSize;
+      ind2[j] = (UInt32)sp;
     }
 
     HeapSortRef(ind2, Groups, groupSize);
 
     /* Write Flags */
     {
-      UInt32 sp = ind2[0];
+      size_t sp = ind2[0];
       UInt32 group = Groups[sp];
 
-      #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
       UInt32 *Flags = Groups + BlockSize;
-      #else
-      UInt32 prevGroupStart = 0;
-      #endif
+#else
+      size_t prevGroupStart = 0;
+#endif
 
       for (j = 1; j < groupSize; j++)
       {
@@ -290,149 +341,210 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr
         if (Groups[sp] != group)
         {
           group = Groups[sp];
-          #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
           {
-            UInt32 t = groupOffset + j - 1;
+            const size_t t = groupOffset + j - 1;
             Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
           }
-          #else
+#else
           SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart);
           prevGroupStart = j;
-          #endif
+#endif
         }
       }
 
-      #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
       SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart);
-      #endif
+#endif
     }
     {
       /* Write new Groups values and Check that there are groups */
-      UInt32 thereAreGroups = 0;
+      unsigned thereAreGroups = 0;
       for (j = 0; j < groupSize; j++)
       {
-        UInt32 group = groupOffset + j;
-        #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+        size_t group = groupOffset + j;
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
         UInt32 subGroupSize = ((ind2[j] & ~0xC0000000) >> kNumBitsMax);
-        if ((ind2[j] & 0x40000000) != 0)
+        if (ind2[j] & 0x40000000)
           subGroupSize += ((ind2[(size_t)j + 1] >> kNumBitsMax) << kNumExtra0Bits);
         subGroupSize++;
         for (;;)
         {
-          UInt32 original = ind2[j];
-          UInt32 sp = original & kIndexMask;
-          if (sp < NumSortedBytes) sp += BlockSize; sp -= NumSortedBytes;
-          ind2[j] = sp | (original & ~kIndexMask);
-          Groups[sp] = group;
+          const UInt32 original = ind2[j];
+          size_t sp = original & kIndexMask;
+          if (sp < NumSortedBytes)
+            sp += BlockSize;
+          sp -= NumSortedBytes;
+          ind2[j] = (UInt32)sp | (original & ~kIndexMask);
+          Groups[sp] = (UInt32)group;
           if (--subGroupSize == 0)
             break;
           j++;
           thereAreGroups = 1;
         }
-        #else
+#else
         UInt32 *Flags = Groups + BlockSize;
         for (;;)
         {
-          UInt32 sp = ind2[j]; if (sp < NumSortedBytes) sp += BlockSize; sp -= NumSortedBytes;
-          ind2[j] = sp;
-          Groups[sp] = group;
+          size_t sp = ind2[j];
+          if (sp < NumSortedBytes)
+            sp += BlockSize;
+          sp -= NumSortedBytes;
+          ind2[j] = (UInt32)sp;
+          Groups[sp] = (UInt32)group;
           if ((Flags[(groupOffset + j) >> kNumFlagsBits] & (1 << ((groupOffset + j) & kFlagsMask))) == 0)
             break;
           j++;
           thereAreGroups = 1;
         }
-        #endif
+#endif
       }
       return thereAreGroups;
     }
   }
-  #endif
+#endif // BLOCK_SORT_USE_HEAP_SORT
 }
 
+
 /* conditions: blockSize > 0 */
-UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize)
+UInt32 BlockSort(UInt32 *Indices, const Byte *data, size_t blockSize)
 {
   UInt32 *counters = Indices + blockSize;
-  UInt32 i;
+  size_t i;
   UInt32 *Groups;
-  #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
   UInt32 *Flags;
-  #endif
+#endif
 
-  /* Radix-Sort for 2 bytes */
+/* Radix-Sort for 2 bytes */
+// { UInt32 yyy; for (yyy = 0; yyy < 100; yyy++) {
   for (i = 0; i < kNumHashValues; i++)
     counters[i] = 0;
-  for (i = 0; i < blockSize - 1; i++)
-    counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]++;
-  counters[((UInt32)data[i] << 8) | data[0]]++;
+  {
+    const Byte *data2 = data;
+    size_t a = data[(size_t)blockSize - 1];
+    const Byte *data_lim = data + blockSize;
+    if (blockSize >= 4)
+    {
+      data_lim -= 3;
+      do
+      {
+        size_t b;
+        b = data2[0];  counters[(a << 8) | b]++;
+        a = data2[1];  counters[(b << 8) | a]++;
+        b = data2[2];  counters[(a << 8) | b]++;
+        a = data2[3];  counters[(b << 8) | a]++;
+        data2 += 4;
+      }
+      while (data2 < data_lim);
+      data_lim += 3;
+    }
+    while (data2 != data_lim)
+    {
+      size_t b = *data2++;
+      counters[(a << 8) | b]++;
+      a = b;
+    }
+  }
+// }}
 
   Groups = counters + BS_TEMP_SIZE;
-  #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
   Flags = Groups + blockSize;
   {
-    UInt32 numWords = (blockSize + kFlagsMask) >> kNumFlagsBits;
+    const size_t numWords = (blockSize + kFlagsMask) >> kNumFlagsBits;
     for (i = 0; i < numWords; i++)
       Flags[i] = kAllFlags;
   }
-  #endif
+#endif
 
   {
     UInt32 sum = 0;
     for (i = 0; i < kNumHashValues; i++)
     {
-      UInt32 groupSize = counters[i];
-      if (groupSize > 0)
+      const UInt32 groupSize = counters[i];
+      counters[i] = sum;
+      sum += groupSize;
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
+      if (groupSize)
       {
-        #ifdef BLOCK_SORT_EXTERNAL_FLAGS
-        UInt32 t = sum + groupSize - 1;
-        Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
-        #endif
-        sum += groupSize;
+        const UInt32 t = sum - 1;
+        Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask));
       }
-      counters[i] = sum - groupSize;
+#endif
     }
+  }
 
   for (i = 0; i < blockSize - 1; i++)
-    Groups[i] = counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]];
-  Groups[i] = counters[((UInt32)data[i] << 8) | data[0]];
+    Groups[i] = counters[((unsigned)data[i] << 8) | data[(size_t)i + 1]];
+  Groups[i] = counters[((unsigned)data[i] << 8) | data[0]];
+
+  {
+#define SET_Indices(a, b, i) \
+  { UInt32 c; \
+    a = (a << 8) | (b); \
+    c = counters[a]; \
+    Indices[c] = (UInt32)i++; \
+    counters[a] = c + 1; \
+  }
 
-  for (i = 0; i < blockSize - 1; i++)
-    Indices[counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]++] = i;
-  Indices[counters[((UInt32)data[i] << 8) | data[0]]++] = i;
-
-  #ifndef BLOCK_SORT_EXTERNAL_FLAGS
-  {
+    size_t a = data[0];
+    const Byte *data_ptr = data + 1;
+    i = 0;
+    if (blockSize >= 3)
+    {
+      blockSize -= 2;
+      do
+      {
+        size_t b;
+        b = data_ptr[0];  SET_Indices(a, b, i)
+        a = data_ptr[1];  SET_Indices(b, a, i)
+        data_ptr += 2;
+      }
+      while (i < blockSize);
+      blockSize += 2;
+    }
+    if (i < blockSize - 1)
+    {
+      SET_Indices(a, data[(size_t)i + 1], i)
+      a = (Byte)a;
+    }
+    SET_Indices(a, data[0], i)
+  }
+
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
+  {
     UInt32 prev = 0;
     for (i = 0; i < kNumHashValues; i++)
     {
-      UInt32 prevGroupSize = counters[i] - prev;
+      const UInt32 prevGroupSize = counters[i] - prev;
       if (prevGroupSize == 0)
         continue;
       SetGroupSize(Indices + prev, prevGroupSize);
       prev = counters[i];
     }
-    }
-  #endif
   }
+#endif
 
   {
-    int NumRefBits;
-    UInt32 NumSortedBytes;
-    for (NumRefBits = 0; ((blockSize - 1) >> NumRefBits) != 0; NumRefBits++);
+    unsigned NumRefBits;
+    size_t NumSortedBytes;
+    for (NumRefBits = 0; ((blockSize - 1) >> NumRefBits) != 0; NumRefBits++)
+      {}
     NumRefBits = 32 - NumRefBits;
     if (NumRefBits > kNumRefBitsMax)
       NumRefBits = kNumRefBitsMax;
 
     for (NumSortedBytes = kNumHashBytes; ; NumSortedBytes <<= 1)
     {
-      #ifndef BLOCK_SORT_EXTERNAL_FLAGS
-      UInt32 finishedGroupSize = 0;
-      #endif
-      UInt32 newLimit = 0;
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
+      size_t finishedGroupSize = 0;
+#endif
+      size_t newLimit = 0;
       for (i = 0; i < blockSize;)
       {
-        UInt32 groupSize;
-        #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+        size_t groupSize;
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
 
         if ((Flags[i >> kNumFlagsBits] & (1 << (i & kFlagsMask))) == 0)
         {
@@ -441,56 +553,56 @@ UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize)
         }
         for (groupSize = 1;
             (Flags[(i + groupSize) >> kNumFlagsBits] & (1 << ((i + groupSize) & kFlagsMask))) != 0;
-            groupSize++);
-
+            groupSize++)
+          {}
         groupSize++;
 
-        #else
+#else
 
-        groupSize = ((Indices[i] & ~0xC0000000) >> kNumBitsMax);
+        groupSize = (Indices[i] & ~0xC0000000) >> kNumBitsMax;
         {
-          BoolInt finishedGroup = ((Indices[i] & 0x80000000) == 0);
-          if ((Indices[i] & 0x40000000) != 0)
-          {
-            groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits);
-            Indices[(size_t)i + 1] &= kIndexMask;
-          }
-          Indices[i] &= kIndexMask;
-          groupSize++;
-          if (finishedGroup || groupSize == 1)
-          {
-            Indices[i - finishedGroupSize] &= kIndexMask;
-            if (finishedGroupSize > 1)
-              Indices[(size_t)(i - finishedGroupSize) + 1] &= kIndexMask;
-            {
-              UInt32 newGroupSize = groupSize + finishedGroupSize;
-              SetFinishedGroupSize(Indices + i - finishedGroupSize, newGroupSize)
-              finishedGroupSize = newGroupSize;
-            }
-            i += groupSize;
-            continue;
-          }
-          finishedGroupSize = 0;
+          const BoolInt finishedGroup = ((Indices[i] & 0x80000000) == 0);
+          if (Indices[i] & 0x40000000)
+          {
+            groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits);
+            Indices[(size_t)i + 1] &= kIndexMask;
+          }
+          Indices[i] &= kIndexMask;
+          groupSize++;
+          if (finishedGroup || groupSize == 1)
+          {
+            Indices[i - finishedGroupSize] &= kIndexMask;
+            if (finishedGroupSize > 1)
+              Indices[(size_t)(i - finishedGroupSize) + 1] &= kIndexMask;
+            {
+              const size_t newGroupSize = groupSize + finishedGroupSize;
+              SetFinishedGroupSize(Indices + i - finishedGroupSize, newGroupSize)
+              finishedGroupSize = newGroupSize;
+            }
+            i += groupSize;
+            continue;
+          }
+          finishedGroupSize = 0;
         }
 
-        #endif
+#endif
 
         if (NumSortedBytes >= blockSize)
         {
-          UInt32 j;
+          size_t j;
           for (j = 0; j < groupSize; j++)
           {
-            UInt32 t = (i + j);
+            size_t t = i + j;
             /* Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); */
-            Groups[Indices[t]] = t;
+            Groups[Indices[t]] = (UInt32)t;
           }
         }
         else
         if (SortGroup(blockSize, NumSortedBytes, i, groupSize, NumRefBits, Indices
             #ifndef BLOCK_SORT_USE_HEAP_SORT
             , 0, blockSize
             #endif
-            ) != 0)
+            ))
           newLimit = i + groupSize;
         i += groupSize;
       }
@@ -498,19 +610,19 @@ UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize)
         break;
     }
   }
-  #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
   for (i = 0; i < blockSize;)
   {
-    UInt32 groupSize = ((Indices[i] & ~0xC0000000) >> kNumBitsMax);
-    if ((Indices[i] & 0x40000000) != 0)
+    size_t groupSize = (Indices[i] & ~0xC0000000) >> kNumBitsMax;
+    if (Indices[i] & 0x40000000)
     {
-      groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits);
+      groupSize += (Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits;
       Indices[(size_t)i + 1] &= kIndexMask;
     }
     Indices[i] &= kIndexMask;
     groupSize++;
     i += groupSize;
   }
-  #endif
+#endif
   return Groups[0];
 }
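
The new counting pass in BlockSort() above unrolls a circular 2-byte histogram: every position i contributes the pair (data[i], data[i+1]), with the last position wrapping around to data[0]. A compact, un-unrolled sketch of the same histogram (not the 7-Zip code, which processes four pairs per loop iteration):

    #include <stddef.h>

    /* counters must hold (1 << 16) entries, zero-initialized. */
    static void count_pairs(unsigned *counters,
        const unsigned char *data, size_t blockSize)
    {
      size_t i;
      for (i = 0; i < blockSize; i++)
      {
        /* the pair at the last position wraps to data[0] */
        const unsigned char next = data[(i + 1 == blockSize) ? 0 : i + 1];
        counters[((unsigned)data[i] << 8) | next]++;
      }
    }
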
diff --git a/C/BwtSort.h b/C/BwtSort.h
index a34b243..1bd2316 100644
--- a/C/BwtSort.h
+++ b/C/BwtSort.h
@@ -1,5 +1,5 @@
 /* BwtSort.h -- BWT block sorting
-2023-03-03 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_BWT_SORT_H
 #define ZIP7_INC_BWT_SORT_H
@@ -10,16 +10,17 @@ EXTERN_C_BEGIN
 
 /* use BLOCK_SORT_EXTERNAL_FLAGS if blockSize can be > 1M */
 /* #define BLOCK_SORT_EXTERNAL_FLAGS */
+// #define BLOCK_SORT_EXTERNAL_FLAGS
 
 #ifdef BLOCK_SORT_EXTERNAL_FLAGS
-#define BLOCK_SORT_EXTERNAL_SIZE(blockSize) ((((blockSize) + 31) >> 5))
+#define BLOCK_SORT_EXTERNAL_SIZE(blockSize) (((blockSize) + 31) >> 5)
 #else
 #define BLOCK_SORT_EXTERNAL_SIZE(blockSize) 0
 #endif
 
 #define BLOCK_SORT_BUF_SIZE(blockSize) ((blockSize) * 2 + BLOCK_SORT_EXTERNAL_SIZE(blockSize) + (1 << 16))
 
-UInt32 BlockSort(UInt32 *indices, const Byte *data, UInt32 blockSize);
+UInt32 BlockSort(UInt32 *indices, const Byte *data, size_t blockSize);
 
 EXTERN_C_END
 
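
For scale, a worked example of the buffer macro above (assuming, as BlockSort()'s pointer arithmetic suggests, that the buffer is measured in UInt32 elements): with BLOCK_SORT_EXTERNAL_FLAGS enabled and a 1 MiB block (blockSize = 1 << 20), BLOCK_SORT_BUF_SIZE gives 2*2^20 + ((2^20 + 31) >> 5) + 2^16 = 2097152 + 32768 + 65536 = 2195456 UInt32 entries, i.e. roughly 8.4 MiB of working memory.
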
diff --git a/C/Compiler.h b/C/Compiler.h
index 2a9c2b7..b266b27 100644
--- a/C/Compiler.h
+++ b/C/Compiler.h
@@ -1,5 +1,5 @@
 /* Compiler.h : Compiler specific defines and pragmas
-2024-01-22 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_COMPILER_H
 #define ZIP7_INC_COMPILER_H
@@ -183,6 +183,16 @@ typedef void (*Z7_void_Function)(void);
   #define Z7_ATTRIB_NO_VECTORIZE
 #endif
 
+#if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920)
+  #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE  _Pragma("optimize ( \"s\", on )")
+  #define Z7_PRAGMA_OPTIMIZE_DEFAULT        _Pragma("optimize ( \"\", on )")
+#else
+  #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE
+  #define Z7_PRAGMA_OPTIMIZE_DEFAULT
+#endif
+
+
+
 #if defined(MY_CPU_X86_OR_AMD64) && ( \
        defined(__clang__) && (__clang_major__ >= 4) \
     || defined(__GNUC__) && (__GNUC__ >= 5))
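
The new pragma pair wraps MSVC's `#pragma optimize`: the "s" form asks the compiler to favor small code from that point on, and the "" form restores the command-line /O settings; on other compilers both expand to nothing. A hypothetical usage sketch (the function is illustrative, not from the sources):

    Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE
    /* cold path: built for size, not speed */
    static void Rarely_Called_Setup(int *state)
    {
      *state = 0;
    }
    Z7_PRAGMA_OPTIMIZE_DEFAULT
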
diff --git a/C/CpuArch.c b/C/CpuArch.c
index e792f39..6e02551 100644
--- a/C/CpuArch.c
+++ b/C/CpuArch.c
@@ -1,5 +1,5 @@
 /* CpuArch.c -- CPU specific code
-2024-07-04 : Igor Pavlov : Public domain */
+Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -17,7 +17,7 @@
 /*
   cpuid instruction supports (subFunction) parameter in ECX,
   that is used only with some specific (function) parameter values.
-  But we always use only (subFunction==0).
+  most functions use only (subFunction==0).
 */
 /*
   __cpuid(): MSVC and GCC/CLANG use same function/macro name
@@ -49,43 +49,49 @@
 #if defined(MY_CPU_AMD64) && defined(__PIC__) \
     && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
 
-#define x86_cpuid_MACRO(p, func) { \
+  /* "=&r" selects free register. It can select even rbx, if that register is free.
+     "=&D" for (RDI) also works, but the code can be larger with "=&D"
+     "2"(subFun) : 2 is (zero-based) index in the output constraint list "=c" (ECX). */
+
+#define x86_cpuid_MACRO_2(p, func, subFunc) { \
   __asm__ __volatile__ ( \
     ASM_LN   "mov     %%rbx, %q1"  \
     ASM_LN   "cpuid"               \
     ASM_LN   "xchg    %%rbx, %q1"  \
-    : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
-
-  /* "=&r" selects free register. It can select even rbx, if that register is free.
-     "=&D" for (RDI) also works, but the code can be larger with "=&D"
-     "2"(0) means (subFunction = 0),
-     2 is (zero-based) index in the output constraint list "=c" (ECX). */
+    : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
 
 #elif defined(MY_CPU_X86) && defined(__PIC__) \
     && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__))
 
-#define x86_cpuid_MACRO(p, func) { \
+#define x86_cpuid_MACRO_2(p, func, subFunc) { \
   __asm__ __volatile__ ( \
     ASM_LN   "mov     %%ebx, %k1"  \
     ASM_LN   "cpuid"               \
     ASM_LN   "xchg    %%ebx, %k1"  \
-    : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
+    : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
 
 #else
 
-#define x86_cpuid_MACRO(p, func) { \
+#define x86_cpuid_MACRO_2(p, func, subFunc) { \
   __asm__ __volatile__ ( \
     ASM_LN   "cpuid"               \
-    : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); }
+    : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); }
 
 #endif
 
+#define x86_cpuid_MACRO(p, func)  x86_cpuid_MACRO_2(p, func, 0)
 
 void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
 {
   x86_cpuid_MACRO(p, func)
 }
 
+static
+void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
+{
+  x86_cpuid_MACRO_2(p, func, subFunc)
+}
+
 
 Z7_NO_INLINE
 UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
@@ -205,11 +211,39 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
   __asm   ret 0
 }
 
+static
+void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
+{
+  UNUSED_VAR(p)
+  UNUSED_VAR(func)
+  UNUSED_VAR(subFunc)
+  __asm   push    ebx
+  __asm   push    edi
+  __asm   mov     edi, ecx          // p
+  __asm   mov     eax, edx          // func
+  __asm   mov     ecx, [esp + 12]   // subFunc
+  __asm   cpuid
+  __asm   mov     [edi     ], eax
+  __asm   mov     [edi +  4], ebx
+  __asm   mov     [edi +  8], ecx
+  __asm   mov     [edi + 12], edx
+  __asm   pop     edi
+  __asm   pop     ebx
+  __asm   ret     4
+}
+
 #else // MY_CPU_AMD64
 
     #if _MSC_VER >= 1600
       #include <intrin.h>
       #define MY_cpuidex  __cpuidex
+
+static
+void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
+{
+  __cpuidex((int *)p, func, subFunc);
+}
+
     #else
 /*
  __cpuid (func == (0 or 7)) requires subfunction number in ECX.
@@ -219,7 +253,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
219 We still can use __cpuid for low (func) values that don't require ECX, 253 We still can use __cpuid for low (func) values that don't require ECX,
220 but __cpuid() in old MSVC will be incorrect for some func values: (func == 7). 254 but __cpuid() in old MSVC will be incorrect for some func values: (func == 7).
221 So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction, 255 So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction,
222 where ECX value is first parameter for FASTCALL / NO_INLINE func, 256 where ECX value is first parameter for FASTCALL / NO_INLINE func.
223 So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and 257 So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and
224 old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value. 258 old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value.
225 259
@@ -233,6 +267,11 @@ Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int
233} 267}
234 #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info) 268 #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info)
235 #pragma message("======== MY_cpuidex_HACK WAS USED ========") 269 #pragma message("======== MY_cpuidex_HACK WAS USED ========")
270static
271void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc)
272{
273 MY_cpuidex_HACK(subFunc, func, (Int32 *)p);
274}
236 #endif // _MSC_VER >= 1600 275 #endif // _MSC_VER >= 1600
237 276
238#if !defined(MY_CPU_AMD64) 277#if !defined(MY_CPU_AMD64)
@@ -445,6 +484,23 @@ BoolInt CPU_IsSupported_SHA(void)
445 } 484 }
446} 485}
447 486
487
488BoolInt CPU_IsSupported_SHA512(void)
489{
490 if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here
491
492 if (z7_x86_cpuid_GetMaxFunc() < 7)
493 return False;
494 {
495 UInt32 d[4];
496 z7_x86_cpuid_subFunc(d, 7, 0);
497 if (d[0] < 1) // d[0] is the max supported subleaf value
498 return False;
499 z7_x86_cpuid_subFunc(d, 7, 1);
500 return (BoolInt)(d[0]) & 1; // EAX bit 0 of subleaf 1 reports SHA512
501 }
502}
503
448/* 504/*
449MSVC: _xgetbv() intrinsic is available since VS2010SP1. 505MSVC: _xgetbv() intrinsic is available since VS2010SP1.
450 MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in 506 MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in
@@ -776,6 +832,18 @@ BoolInt CPU_IsSupported_NEON(void)
776 return z7_sysctlbyname_Get_BoolInt("hw.optional.neon"); 832 return z7_sysctlbyname_Get_BoolInt("hw.optional.neon");
777} 833}
778 834
835BoolInt CPU_IsSupported_SHA512(void)
836{
837 return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512");
838}
839
840/*
841BoolInt CPU_IsSupported_SHA3(void)
842{
843 return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3");
844}
845*/
846
779#ifdef MY_CPU_ARM64 847#ifdef MY_CPU_ARM64
780#define APPLE_CRYPTO_SUPPORT_VAL 1 848#define APPLE_CRYPTO_SUPPORT_VAL 1
781#else 849#else
@@ -860,6 +928,19 @@ MY_HWCAP_CHECK_FUNC (CRC32)
860MY_HWCAP_CHECK_FUNC (SHA1) 928MY_HWCAP_CHECK_FUNC (SHA1)
861MY_HWCAP_CHECK_FUNC (SHA2) 929MY_HWCAP_CHECK_FUNC (SHA2)
862MY_HWCAP_CHECK_FUNC (AES) 930MY_HWCAP_CHECK_FUNC (AES)
931#ifdef MY_CPU_ARM64
932// <hwcap.h> supports HWCAP_SHA512 and HWCAP_SHA3 since 2017.
933// we define them here if they are not defined
934#ifndef HWCAP_SHA3
935// #define HWCAP_SHA3 (1 << 17)
936#endif
937#ifndef HWCAP_SHA512
938// #pragma message("=== HWCAP_SHA512 define === ")
939#define HWCAP_SHA512 (1 << 21)
940#endif
941MY_HWCAP_CHECK_FUNC (SHA512)
942// MY_HWCAP_CHECK_FUNC (SHA3)
943#endif
863 944
864#endif // __APPLE__ 945#endif // __APPLE__
865#endif // _WIN32 946#endif // _WIN32
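MY_HWCAP_CHECK_FUNC(SHA512) is defined outside this hunk; a plausible expansion on Linux/arm64, judging from the surrounding HWCAP usage, is a getauxval() test like this sketch (an assumption, not the shipped macro body):

  #include <sys/auxv.h>  // getauxval, AT_HWCAP

  BoolInt CPU_IsSupported_SHA512(void)
  {
    // HWCAP_SHA512 == (1 << 21), as defined above when <hwcap.h> lacks it
    return (getauxval(AT_HWCAP) & HWCAP_SHA512) ? True : False;
  }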
diff --git a/C/CpuArch.h b/C/CpuArch.h
index 683cfaa..1690a5b 100644
--- a/C/CpuArch.h
+++ b/C/CpuArch.h
@@ -1,5 +1,5 @@
1/* CpuArch.h -- CPU specific code 1/* CpuArch.h -- CPU specific code
22024-06-17 : Igor Pavlov : Public domain */ 2Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_CPU_ARCH_H 4#ifndef ZIP7_INC_CPU_ARCH_H
5#define ZIP7_INC_CPU_ARCH_H 5#define ZIP7_INC_CPU_ARCH_H
@@ -47,6 +47,12 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
47 #define MY_CPU_SIZEOF_POINTER 4 47 #define MY_CPU_SIZEOF_POINTER 4
48#endif 48#endif
49 49
50#if defined(__SSE2__) \
51 || defined(MY_CPU_AMD64) \
52 || defined(_M_IX86_FP) && (_M_IX86_FP >= 2)
53#define MY_CPU_SSE2
54#endif
55
50 56
51#if defined(_M_ARM64) \ 57#if defined(_M_ARM64) \
52 || defined(_M_ARM64EC) \ 58 || defined(_M_ARM64EC) \
@@ -509,11 +515,19 @@ problem-4 : performance: 515
509 515
510#if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) 516#if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED)
511 517
518#if 0
519// Z7_BSWAP16 can be slow for x86-msvc
520#define GetBe16_to32(p) (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p)))
521#else
522#define GetBe16_to32(p) (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16)
523#endif
524
512#define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p)) 525#define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p))
513#define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); } 526#define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); }
514 527
515#if defined(MY_CPU_LE_UNALIGN_64) 528#if defined(MY_CPU_LE_UNALIGN_64)
516#define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p)) 529#define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p))
530#define SetBe64(p, v) { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); }
517#endif 531#endif
518 532
519#else 533#else
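A worked check of the GetBe16_to32 form introduced above, assuming the little-endian unaligned-access case that guards this branch (byte values are illustrative):

  /* p points at bytes { 0xAB, 0xCD }:
     16-bit LE load:   *(const UInt16 *)p  == 0xCDAB
     widen to 32-bit:                         0x0000CDAB
     Z7_BSWAP32:                              0xABCD0000
     >> 16:                                   0x0000ABCD == (b0 << 8) | b1
     so one 32-bit byte swap plus a shift yields the big-endian 16-bit value
     without the 16-bit swap (Z7_BSWAP16) that can be slow with x86 MSVC. */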
@@ -536,21 +550,39 @@ problem-4 : performace:
536#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4)) 550#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4))
537#endif 551#endif
538 552
553#ifndef SetBe64
554#define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \
555 _ppp_[0] = (Byte)(_vvv_ >> 56); \
556 _ppp_[1] = (Byte)(_vvv_ >> 48); \
557 _ppp_[2] = (Byte)(_vvv_ >> 40); \
558 _ppp_[3] = (Byte)(_vvv_ >> 32); \
559 _ppp_[4] = (Byte)(_vvv_ >> 24); \
560 _ppp_[5] = (Byte)(_vvv_ >> 16); \
561 _ppp_[6] = (Byte)(_vvv_ >> 8); \
562 _ppp_[7] = (Byte)_vvv_; }
563#endif
564
539#ifndef GetBe16 565#ifndef GetBe16
566#ifdef GetBe16_to32
567#define GetBe16(p) ( (UInt16) GetBe16_to32(p))
568#else
540#define GetBe16(p) ( (UInt16) ( \ 569#define GetBe16(p) ( (UInt16) ( \
541 ((UInt16)((const Byte *)(p))[0] << 8) | \ 570 ((UInt16)((const Byte *)(p))[0] << 8) | \
542 ((const Byte *)(p))[1] )) 571 ((const Byte *)(p))[1] ))
543#endif 572#endif
573#endif
544 574
545 575
546#if defined(MY_CPU_BE) 576#if defined(MY_CPU_BE)
547#define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v) 577#define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v)
548#define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) 578#define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v)
549#define Z7_CONV_NATIVE_TO_BE_32(v) (v) 579#define Z7_CONV_NATIVE_TO_BE_32(v) (v)
580// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b1) | ((b0) << 8))
550#elif defined(MY_CPU_LE) 581#elif defined(MY_CPU_LE)
551#define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) 582#define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v)
552#define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v) 583#define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v)
553#define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v) 584#define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v)
585// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b0) | ((b1) << 8))
554#else 586#else
555#error Stop_Compiling_Unknown_Endian_CONV 587#error Stop_Compiling_Unknown_Endian_CONV
556#endif 588#endif
@@ -589,6 +621,11 @@ problem-4 : performance: 621
589#endif 621#endif
590 622
591 623
624#ifndef GetBe16_to32
625#define GetBe16_to32(p) GetBe16(p)
626#endif
627
628
592#if defined(MY_CPU_X86_OR_AMD64) \ 629#if defined(MY_CPU_X86_OR_AMD64) \
593 || defined(MY_CPU_ARM_OR_ARM64) \ 630 || defined(MY_CPU_ARM_OR_ARM64) \
594 || defined(MY_CPU_PPC_OR_PPC64) 631 || defined(MY_CPU_PPC_OR_PPC64)
@@ -617,6 +654,7 @@ BoolInt CPU_IsSupported_SSE2(void);
617BoolInt CPU_IsSupported_SSSE3(void); 654BoolInt CPU_IsSupported_SSSE3(void);
618BoolInt CPU_IsSupported_SSE41(void); 655BoolInt CPU_IsSupported_SSE41(void);
619BoolInt CPU_IsSupported_SHA(void); 656BoolInt CPU_IsSupported_SHA(void);
657BoolInt CPU_IsSupported_SHA512(void);
620BoolInt CPU_IsSupported_PageGB(void); 658BoolInt CPU_IsSupported_PageGB(void);
621 659
622#elif defined(MY_CPU_ARM_OR_ARM64) 660#elif defined(MY_CPU_ARM_OR_ARM64)
@@ -634,6 +672,7 @@ BoolInt CPU_IsSupported_SHA1(void);
634BoolInt CPU_IsSupported_SHA2(void); 672BoolInt CPU_IsSupported_SHA2(void);
635BoolInt CPU_IsSupported_AES(void); 673BoolInt CPU_IsSupported_AES(void);
636#endif 674#endif
675BoolInt CPU_IsSupported_SHA512(void);
637 676
638#endif 677#endif
639 678
diff --git a/C/HuffEnc.c b/C/HuffEnc.c
index 996da30..cbf8c22 100644
--- a/C/HuffEnc.c
+++ b/C/HuffEnc.c
@@ -1,60 +1,125 @@
1/* HuffEnc.c -- functions for Huffman encoding 1/* HuffEnc.c -- functions for Huffman encoding
22023-09-07 : Igor Pavlov : Public domain */ 2Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
6#include <string.h>
7
6#include "HuffEnc.h" 8#include "HuffEnc.h"
7#include "Sort.h" 9#include "Sort.h"
10#include "CpuArch.h"
8 11
9#define kMaxLen 16 12#define kMaxLen Z7_HUFFMAN_LEN_MAX
10#define NUM_BITS 10 13#define NUM_BITS 10
11#define MASK ((1u << NUM_BITS) - 1) 14#define MASK ((1u << NUM_BITS) - 1)
12 15#define FREQ_MASK (~(UInt32)MASK)
13#define NUM_COUNTERS 64 16#define NUM_COUNTERS (48 * 2)
14 17
15#define HUFFMAN_SPEED_OPT 18#if 1 && (defined(MY_CPU_LE) || defined(MY_CPU_BE))
19#if defined(MY_CPU_LE)
20 #define HI_HALF_OFFSET 1
21#else
22 #define HI_HALF_OFFSET 0
23#endif
24#define LOAD_PARENT(p) ((unsigned)*((const UInt16 *)(p) + HI_HALF_OFFSET))
25#define STORE_PARENT(p, fb, val) *((UInt16 *)(p) + HI_HALF_OFFSET) = (UInt16)(val);
26#define STORE_PARENT_DIRECT(p, fb, hi) STORE_PARENT(p, fb, hi)
27#define UPDATE_E(eHi) eHi++;
28#else
29#define LOAD_PARENT(p) ((unsigned)(*(p) >> NUM_BITS))
30#define STORE_PARENT_DIRECT(p, fb, hi) *(p) = ((fb) & MASK) | (hi); // set parent field
31#define STORE_PARENT(p, fb, val) STORE_PARENT_DIRECT(p, fb, ((UInt32)(val) << NUM_BITS))
32#define UPDATE_E(eHi) eHi += 1 << NUM_BITS;
33#endif
16 34
17void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymbols, UInt32 maxLen) 35void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, unsigned numSymbols, unsigned maxLen)
18{ 36{
19 UInt32 num = 0; 37#if NUM_COUNTERS > 2
20 /* if (maxLen > 10) maxLen = 10; */ 38 unsigned counters[NUM_COUNTERS];
39#endif
40#if 1 && NUM_COUNTERS > (kMaxLen + 4) * 2
41 #define lenCounters (counters)
42 #define codes (counters + kMaxLen + 4)
43#else
44 unsigned lenCounters[kMaxLen + 1];
45 UInt32 codes[kMaxLen + 1];
46#endif
47
48 unsigned num;
21 { 49 {
22 UInt32 i; 50 unsigned i;
23 51 // UInt32 sum = 0;
24 #ifdef HUFFMAN_SPEED_OPT 52
53#if NUM_COUNTERS > 2
25 54
26 UInt32 counters[NUM_COUNTERS]; 55#define CTR_ITEM_FOR_FREQ(freq) \
56 counters[(freq) >= NUM_COUNTERS - 1 ? NUM_COUNTERS - 1 : (unsigned)(freq)]
57
27 for (i = 0; i < NUM_COUNTERS; i++) 58 for (i = 0; i < NUM_COUNTERS; i++)
28 counters[i] = 0; 59 counters[i] = 0;
29 for (i = 0; i < numSymbols; i++) 60 memset(lens, 0, numSymbols);
30 { 61 {
31 UInt32 freq = freqs[i]; 62 const UInt32 *fp = freqs + numSymbols;
32 counters[(freq < NUM_COUNTERS - 1) ? freq : NUM_COUNTERS - 1]++; 63#define NUM_UNROLLS 1
64#if NUM_UNROLLS > 1 // use 1 if odd (numSymbols) is possible
65 if (numSymbols & 1)
66 {
67 UInt32 f;
68 f = *--fp; CTR_ITEM_FOR_FREQ(f)++;
69 // sum += f;
70 }
71#endif
72 do
73 {
74 UInt32 f;
75 fp -= NUM_UNROLLS;
76 f = fp[0]; CTR_ITEM_FOR_FREQ(f)++;
77 // sum += f;
78#if NUM_UNROLLS > 1
79 f = fp[1]; CTR_ITEM_FOR_FREQ(f)++;
80 // sum += f;
81#endif
82 }
83 while (fp != freqs);
33 } 84 }
34 85#if 0
35 for (i = 1; i < NUM_COUNTERS; i++) 86 printf("\nsum=%8u numSymbols =%3u ctrs:", sum, numSymbols);
36 { 87 {
37 UInt32 temp = counters[i]; 88 unsigned k = 0;
38 counters[i] = num; 89 for (k = 0; k < NUM_COUNTERS; k++)
39 num += temp; 90 printf(" %u", counters[k]);
40 } 91 }
41 92#endif
42 for (i = 0; i < numSymbols; i++) 93
94 num = counters[1];
95 counters[1] = 0;
96 for (i = 2; i != NUM_COUNTERS; i += 2)
43 { 97 {
44 UInt32 freq = freqs[i]; 98 unsigned c;
45 if (freq == 0) 99 c = (counters )[i]; (counters )[i] = num; num += c;
46 lens[i] = 0; 100 c = (counters + 1)[i]; (counters + 1)[i] = num; num += c;
47 else 101 }
48 p[counters[((freq < NUM_COUNTERS - 1) ? freq : NUM_COUNTERS - 1)]++] = i | (freq << NUM_BITS); 102 counters[0] = num; // we want to write (freq==0) symbols to the end of (p) array
103 {
104 i = 0;
105 do
106 {
107 const UInt32 f = freqs[i];
108#if 0
109 if (f == 0) lens[i] = 0; else
110#endif
111 p[CTR_ITEM_FOR_FREQ(f)++] = i | (f << NUM_BITS);
112 }
113 while (++i != numSymbols);
49 } 114 }
50 counters[0] = 0;
51 HeapSort(p + counters[NUM_COUNTERS - 2], counters[NUM_COUNTERS - 1] - counters[NUM_COUNTERS - 2]); 115 HeapSort(p + counters[NUM_COUNTERS - 2], counters[NUM_COUNTERS - 1] - counters[NUM_COUNTERS - 2]);
52 116
53 #else 117#else // NUM_COUNTERS <= 2
54 118
119 num = 0;
55 for (i = 0; i < numSymbols; i++) 120 for (i = 0; i < numSymbols; i++)
56 { 121 {
57 UInt32 freq = freqs[i]; 122 const UInt32 freq = freqs[i];
58 if (freq == 0) 123 if (freq == 0)
59 lens[i] = 0; 124 lens[i] = 0;
60 else 125 else
@@ -62,17 +127,27 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb
62 } 127 }
63 HeapSort(p, num); 128 HeapSort(p, num);
64 129
65 #endif 130#endif
66 } 131 }
67 132
68 if (num < 2) 133 if (num <= 2)
69 { 134 {
70 unsigned minCode = 0; 135 unsigned minCode = 0;
71 unsigned maxCode = 1; 136 unsigned maxCode = 1;
72 if (num == 1) 137 if (num)
73 { 138 {
74 maxCode = (unsigned)p[0] & MASK; 139 maxCode = (unsigned)p[(size_t)num - 1] & MASK;
75 if (maxCode == 0) 140 if (num == 2)
141 {
142 minCode = (unsigned)p[0] & MASK;
143 if (minCode > maxCode)
144 {
145 const unsigned temp = minCode;
146 minCode = maxCode;
147 maxCode = temp;
148 }
149 }
150 else if (maxCode == 0)
76 maxCode++; 151 maxCode++;
77 } 152 }
78 p[minCode] = 0; 153 p[minCode] = 0;
@@ -80,69 +155,191 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb
80 lens[minCode] = lens[maxCode] = 1; 155 lens[minCode] = lens[maxCode] = 1;
81 return; 156 return;
82 } 157 }
83
84 { 158 {
85 UInt32 b, e, i; 159 unsigned i;
86 160 for (i = 0; i <= kMaxLen; i++)
87 i = b = e = 0; 161 lenCounters[i] = 0;
88 do 162 lenCounters[1] = 2; // by default root node has 2 child leaves at level 1.
163 }
164 // if (num != 2)
165 {
166 // num > 2
167 // the binary tree will contain (num - 1) internal nodes.
168 // p[num - 2] will be the root node of the binary tree.
169 UInt32 *b;
170 UInt32 *n;
171 // first node will have two leaf children: p[0] and p[1]:
172 // p[0] += p[1] & FREQ_MASK; // set frequency sum of child leaves
173 // if (pi == n) exit(0);
174 // if (pi != n)
89 { 175 {
90 UInt32 n, m, freq; 176 UInt32 fb = (p[1] & FREQ_MASK) + p[0];
91 n = (i != num && (b == e || (p[i] >> NUM_BITS) <= (p[b] >> NUM_BITS))) ? i++ : b++; 177 UInt32 f = p[2] & FREQ_MASK;
92 freq = (p[n] & ~MASK); 178 const UInt32 *pi = p + 2;
93 p[n] = (p[n] & MASK) | (e << NUM_BITS); 179 UInt32 *e = p;
94 m = (i != num && (b == e || (p[i] >> NUM_BITS) <= (p[b] >> NUM_BITS))) ? i++ : b++; 180 UInt32 eHi = 0;
95 freq += (p[m] & ~MASK); 181 n = p + num;
96 p[m] = (p[m] & MASK) | (e << NUM_BITS); 182 b = p;
97 p[e] = (p[e] & MASK) | freq; 183 // p[0] = fb;
98 e++; 184 for (;;)
185 {
186 // (b <= e)
187 UInt32 sum;
188 e++;
189 UPDATE_E(eHi)
190
191 // (b < e)
192
193 // p range : high bits
194 // [0, b) : parent : processed nodes that have a parent and children
195 // [b, e) : FREQ : non-processed nodes that have no parent but have children
196 // [e, pi) : FREQ : processed leaves for which parent node was created
197 // [pi, n) : FREQ : non-processed leaves for which parent node was not created
198
199 // first child
200 // note : (*b < f) gives the same result as ((*b & FREQ_MASK) < f)
201 if (fb < f)
202 {
203 // node freq is smaller
204 sum = fb & FREQ_MASK;
205 STORE_PARENT_DIRECT (b, fb, eHi)
206 b++;
207 fb = *b;
208 if (b == e)
209 {
210 if (++pi == n)
211 break;
212 sum += f;
213 fb &= MASK;
214 fb |= sum;
215 *e = fb;
216 f = *pi & FREQ_MASK;
217 continue;
218 }
219 }
220 else if (++pi == n)
221 {
222 STORE_PARENT_DIRECT (b, fb, eHi)
223 b++;
224 break;
225 }
226 else
227 {
228 sum = f;
229 f = *pi & FREQ_MASK;
230 }
231
232 // (b < e)
233
234 // second child
235 if (fb < f)
236 {
237 sum += fb;
238 sum &= FREQ_MASK;
239 STORE_PARENT_DIRECT (b, fb, eHi)
240 b++;
241 *e = (*e & MASK) | sum; // set frequency sum
242 // (b <= e) is possible here
243 fb = *b;
244 }
245 else if (++pi == n)
246 break;
247 else
248 {
249 sum += f;
250 f = *pi & FREQ_MASK;
251 *e = (*e & MASK) | sum; // set frequency sum
252 }
253 }
99 } 254 }
100 while (num - e > 1);
101 255
256 // printf("\nnum-e=%3u, numSymbols=%3u, num=%3u, b=%3u", n - e, numSymbols, n - p, b - p);
102 { 257 {
103 UInt32 lenCounters[kMaxLen + 1]; 258 n -= 2;
104 for (i = 0; i <= kMaxLen; i++) 259 *n &= MASK; // root node : we clear high bits (zero bits mean level == 0)
105 lenCounters[i] = 0; 260 if (n != b)
106
107 p[--e] &= MASK;
108 lenCounters[1] = 2;
109 while (e != 0)
110 {
111 UInt32 len = (p[p[--e] >> NUM_BITS] >> NUM_BITS) + 1;
112 p[e] = (p[e] & MASK) | (len << NUM_BITS);
113 if (len >= maxLen)
114 for (len = maxLen - 1; lenCounters[len] == 0; len--);
115 lenCounters[len]--;
116 lenCounters[(size_t)len + 1] += 2;
117 }
118
119 { 261 {
120 UInt32 len; 262 // We get here if some nodes up to the root have not been created yet.
121 i = 0; 263 // We process them with simplified code:
122 for (len = maxLen; len != 0; len--) 264 // position of parent for each pair of nodes is known.
265 // n[-2], n[-1] : current pair of child nodes
266 // (p1) : parent node for current pair.
267 UInt32 *p1 = n;
268 do
123 { 269 {
124 UInt32 k; 270 const unsigned len = LOAD_PARENT(p1) + 1;
125 for (k = lenCounters[len]; k != 0; k--) 271 p1--;
126 lens[p[i++] & MASK] = (Byte)len; 272 (lenCounters )[len] -= 2; // we remove 2 leaves from level (len)
273 (lenCounters + 1)[len] += 2 * 2; // we add 4 leaves at level (len + 1)
274 n -= 2;
275 STORE_PARENT (n , n[0], len)
276 STORE_PARENT (n + 1, n[1], len)
127 } 277 }
278 while (n != b);
128 } 279 }
129 280 }
281
282 if (b != p)
283 {
284 // we detect the level of each node (relative to root),
285 // and update lenCounters[].
286 // We process only intermediate nodes and we don't process leaves.
287 do
130 { 288 {
131 UInt32 nextCodes[kMaxLen + 1]; 289 // if (ii < b) : parent_bits_of (p[ii]) == index of parent node : ii < (p[ii])
132 { 290 // if (ii >= b) : parent_bits_of (p[ii]) == level of this (ii) node in tree
133 UInt32 code = 0; 291 unsigned len;
134 UInt32 len; 292 b--;
135 for (len = 1; len <= kMaxLen; len++) 293 len = (unsigned)LOAD_PARENT(p + LOAD_PARENT(b)) + 1;
136 nextCodes[len] = code = (code + lenCounters[(size_t)len - 1]) << 1; 294 STORE_PARENT (b, *b, len)
137 } 295 if (len >= maxLen)
138 /* if (code + lenCounters[kMaxLen] - 1 != (1 << kMaxLen) - 1) throw 1; */
139
140 { 296 {
141 UInt32 k; 297 // We are not allowed to create a node at level (maxLen) or higher,
142 for (k = 0; k < numSymbols; k++) 298 // because all leaves must be placed at level (maxLen) or lower.
143 p[k] = nextCodes[lens[k]]++; 299 // We find the nearest allowed leaf and place the current node at that leaf's level:
300 for (len = maxLen - 1; lenCounters[len] == 0; len--) {}
144 } 301 }
302 lenCounters[len]--; // we remove 1 leaf from level (len)
303 (lenCounters + 1)[len] += 2; // we add 2 leaves at level (len + 1)
304 }
305 while (b != p);
306 }
307 }
308 {
309 {
310 unsigned len = maxLen;
311 const UInt32 *p2 = p;
312 do
313 {
314 unsigned k = lenCounters[len];
315 if (k)
316 do
317 lens[(unsigned)*p2++ & MASK] = (Byte)len;
318 while (--k);
319 }
320 while (--len);
321 }
322 codes[0] = 0; // we don't want garbage values to be written to the p[] array.
323 // codes[1] = 0;
324 {
325 UInt32 code = 0;
326 unsigned len;
327 for (len = 0; len < kMaxLen; len++)
328 (codes + 1)[len] = code = (code + lenCounters[len]) << 1;
329 }
330 /* if (code + lenCounters[kMaxLen] - 1 != (1 << kMaxLen) - 1) throw 1; */
331 {
332 const Byte * const limit = lens + numSymbols;
333 do
334 {
335 unsigned len;
336 UInt32 c;
337 len = lens[0]; c = codes[len]; p[0] = c; codes[len] = c + 1;
338 // len = lens[1]; c = codes[len]; p[1] = c; codes[len] = c + 1;
339 p += 1;
340 lens += 1;
145 } 341 }
342 while (lens != limit);
146 } 343 }
147 } 344 }
148} 345}
@@ -150,5 +347,14 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb
150#undef kMaxLen 347#undef kMaxLen
151#undef NUM_BITS 348#undef NUM_BITS
152#undef MASK 349#undef MASK
350#undef FREQ_MASK
153#undef NUM_COUNTERS 351#undef NUM_COUNTERS
154#undef HUFFMAN_SPEED_OPT 352#undef CTR_ITEM_FOR_FREQ
353#undef LOAD_PARENT
354#undef STORE_PARENT
355#undef STORE_PARENT_DIRECT
356#undef UPDATE_E
357#undef HI_HALF_OFFSET
358#undef NUM_UNROLLS
359#undef lenCounters
360#undef codes
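The closing loops of the rewritten Huffman_Generate() apply the standard canonical-code rule: the first code of each length follows from the counts of shorter codes. A minimal standalone sketch with demo values (7zTypes.h types assumed; not the 7-Zip code path itself):

  unsigned lenCounters[4] = { 0, 1, 2, 0 };  /* one 1-bit code, two 2-bit codes */
  UInt32 codes[4];
  UInt32 code = 0;
  unsigned len;
  codes[0] = 0;
  for (len = 0; len < 3; len++)
    (codes + 1)[len] = code = (code + lenCounters[len]) << 1;
  /* codes[1] == 0 -> "0"; codes[2] == 2 -> "10"; the per-symbol pass then
     hands out 2 ("10") and 3 ("11") via the codes[len]++ post-increment. */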
diff --git a/C/HuffEnc.h b/C/HuffEnc.h
index cbc5d11..2217f55 100644
--- a/C/HuffEnc.h
+++ b/C/HuffEnc.h
@@ -1,5 +1,5 @@
1/* HuffEnc.h -- Huffman encoding 1/* HuffEnc.h -- Huffman encoding
22023-03-05 : Igor Pavlov : Public domain */ 2Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_HUFF_ENC_H 4#ifndef ZIP7_INC_HUFF_ENC_H
5#define ZIP7_INC_HUFF_ENC_H 5#define ZIP7_INC_HUFF_ENC_H
@@ -8,14 +8,14 @@
8 8
9EXTERN_C_BEGIN 9EXTERN_C_BEGIN
10 10
11#define Z7_HUFFMAN_LEN_MAX 16
11/* 12/*
12Conditions: 13Conditions:
13 num <= 1024 = 2 ^ NUM_BITS 14 2 <= num <= 1024 = 2 ^ NUM_BITS
14 Sum(freqs) < 4M = 2 ^ (32 - NUM_BITS) 15 Sum(freqs) < 4M = 2 ^ (32 - NUM_BITS)
15 maxLen <= 16 = kMaxLen 16 1 <= maxLen <= 16 = Z7_HUFFMAN_LEN_MAX
16 Num_Items(p) >= HUFFMAN_TEMP_SIZE(num) 17 Num_Items(p) >= HUFFMAN_TEMP_SIZE(num)
17*/ 18*/
18
19void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 num, UInt32 maxLen); 19void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 num, UInt32 maxLen);
20 20
21EXTERN_C_END 21EXTERN_C_END
diff --git a/C/LzFind.c b/C/LzFind.c
index 1ce4046..330bc17 100644
--- a/C/LzFind.c
+++ b/C/LzFind.c
@@ -1,5 +1,5 @@
1/* LzFind.c -- Match finder for LZ algorithms 1/* LzFind.c -- Match finder for LZ algorithms
22024-03-01 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -404,7 +404,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
404 const unsigned nbMax = 404 const unsigned nbMax =
405 (p->numHashBytes == 2 ? 16 : 405 (p->numHashBytes == 2 ? 16 :
406 (p->numHashBytes == 3 ? 24 : 32)); 406 (p->numHashBytes == 3 ? 24 : 32));
407 if (numBits > nbMax) 407 if (numBits >= nbMax)
408 numBits = nbMax; 408 numBits = nbMax;
409 if (numBits >= 32) 409 if (numBits >= 32)
410 hs = (UInt32)0 - 1; 410 hs = (UInt32)0 - 1;
@@ -416,14 +416,14 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
416 hs |= (256 << kLzHash_CrcShift_2) - 1; 416 hs |= (256 << kLzHash_CrcShift_2) - 1;
417 { 417 {
418 const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize); 418 const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
419 if (hs > hs2) 419 if (hs >= hs2)
420 hs = hs2; 420 hs = hs2;
421 } 421 }
422 hsCur = hs; 422 hsCur = hs;
423 if (p->expectedDataSize < historySize) 423 if (p->expectedDataSize < historySize)
424 { 424 {
425 const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize); 425 const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
426 if (hsCur > hs2) 426 if (hsCur >= hs2)
427 hsCur = hs2; 427 hsCur = hs2;
428 } 428 }
429 } 429 }
@@ -434,7 +434,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
434 if (p->expectedDataSize < historySize) 434 if (p->expectedDataSize < historySize)
435 { 435 {
436 hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize); 436 hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
437 if (hsCur > hs) // is it possible? 437 if (hsCur >= hs) // is it possible?
438 hsCur = hs; 438 hsCur = hs;
439 } 439 }
440 } 440 }
@@ -598,7 +598,7 @@ void MatchFinder_Init(void *_p)
598 598
599#ifdef MY_CPU_X86_OR_AMD64 599#ifdef MY_CPU_X86_OR_AMD64
600 #if defined(__clang__) && (__clang_major__ >= 4) \ 600 #if defined(__clang__) && (__clang_major__ >= 4) \
601 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701) 601 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
602 // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) 602 // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
603 603
604 #define USE_LZFIND_SATUR_SUB_128 604 #define USE_LZFIND_SATUR_SUB_128
@@ -890,7 +890,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos,
890 return d; 890 return d;
891 { 891 {
892 const Byte *pb = cur - delta; 892 const Byte *pb = cur - delta;
893 curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; 893 curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)];
894 if (pb[maxLen] == cur[maxLen] && *pb == *cur) 894 if (pb[maxLen] == cur[maxLen] && *pb == *cur)
895 { 895 {
896 UInt32 len = 0; 896 UInt32 len = 0;
@@ -925,7 +925,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos,
925 break; 925 break;
926 { 926 {
927 ptrdiff_t diff; 927 ptrdiff_t diff;
928 curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; 928 curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)];
929 diff = (ptrdiff_t)0 - (ptrdiff_t)delta; 929 diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
930 if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff]) 930 if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
931 { 931 {
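The index expression rewritten in these hunks is a branch-friendly modular step back through the circular window; only the comparison direction changes. A small sketch with illustrative values:

  static UInt32 CyclicIndex(UInt32 pos, UInt32 delta, UInt32 size)
  {
    /* e.g. size = 8, pos = 2, delta = 5: since 2 < 5, index = 2 - 5 + 8 = 5,
       i.e. 5 positions behind (pos), modulo the buffer size. */
    return pos - delta + (pos < delta ? size : 0);
  }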
@@ -972,7 +972,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt
972 // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } 972 // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }
973 973
974 cmCheck = (UInt32)(pos - _cyclicBufferSize); 974 cmCheck = (UInt32)(pos - _cyclicBufferSize);
975 if ((UInt32)pos <= _cyclicBufferSize) 975 if ((UInt32)pos < _cyclicBufferSize)
976 cmCheck = 0; 976 cmCheck = 0;
977 977
978 if (cmCheck < curMatch) 978 if (cmCheck < curMatch)
@@ -980,7 +980,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt
980 { 980 {
981 const UInt32 delta = pos - curMatch; 981 const UInt32 delta = pos - curMatch;
982 { 982 {
983 CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); 983 CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1);
984 const Byte *pb = cur - delta; 984 const Byte *pb = cur - delta;
985 unsigned len = (len0 < len1 ? len0 : len1); 985 unsigned len = (len0 < len1 ? len0 : len1);
986 const UInt32 pair0 = pair[0]; 986 const UInt32 pair0 = pair[0];
@@ -1039,7 +1039,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const
1039 UInt32 cmCheck; 1039 UInt32 cmCheck;
1040 1040
1041 cmCheck = (UInt32)(pos - _cyclicBufferSize); 1041 cmCheck = (UInt32)(pos - _cyclicBufferSize);
1042 if ((UInt32)pos <= _cyclicBufferSize) 1042 if ((UInt32)pos < _cyclicBufferSize)
1043 cmCheck = 0; 1043 cmCheck = 0;
1044 1044
1045 if (// curMatch >= pos || // failure 1045 if (// curMatch >= pos || // failure
@@ -1048,7 +1048,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const
1048 { 1048 {
1049 const UInt32 delta = pos - curMatch; 1049 const UInt32 delta = pos - curMatch;
1050 { 1050 {
1051 CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); 1051 CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1);
1052 const Byte *pb = cur - delta; 1052 const Byte *pb = cur - delta;
1053 unsigned len = (len0 < len1 ? len0 : len1); 1053 unsigned len = (len0 < len1 ? len0 : len1);
1054 if (pb[len] == cur[len]) 1054 if (pb[len] == cur[len])
@@ -1595,7 +1595,7 @@ static void Bt5_MatchFinder_Skip(void *_p, UInt32 num)
1595 UInt32 pos = p->pos; \ 1595 UInt32 pos = p->pos; \
1596 UInt32 num2 = num; \ 1596 UInt32 num2 = num; \
1597 /* (p->pos == p->posLimit) is not allowed here !!! */ \ 1597 /* (p->pos == p->posLimit) is not allowed here !!! */ \
1598 { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \ 1598 { const UInt32 rem = p->posLimit - pos; if (num2 >= rem) num2 = rem; } \
1599 num -= num2; \ 1599 num -= num2; \
1600 { const UInt32 cycPos = p->cyclicBufferPos; \ 1600 { const UInt32 cycPos = p->cyclicBufferPos; \
1601 son = p->son + cycPos; \ 1601 son = p->son + cycPos; \
diff --git a/C/LzFindMt.c b/C/LzFindMt.c
index ac9d59d..25fcc46 100644
--- a/C/LzFindMt.c
+++ b/C/LzFindMt.c
@@ -1,5 +1,5 @@
1/* LzFindMt.c -- multithreaded Match finder for LZ algorithms 1/* LzFindMt.c -- multithreaded Match finder for LZ algorithms
22024-01-22 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -82,6 +82,8 @@ extern UInt64 g_NumIters_Bytes;
82Z7_NO_INLINE 82Z7_NO_INLINE
83static void MtSync_Construct(CMtSync *p) 83static void MtSync_Construct(CMtSync *p)
84{ 84{
85 p->affinityGroup = -1;
86 p->affinityInGroup = 0;
85 p->affinity = 0; 87 p->affinity = 0;
86 p->wasCreated = False; 88 p->wasCreated = False;
87 p->csWasInitialized = False; 89 p->csWasInitialized = False;
@@ -259,6 +261,12 @@ static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *
259 // return ERROR_TOO_MANY_POSTS; // for debug 261 // return ERROR_TOO_MANY_POSTS; // for debug
260 // return EINVAL; // for debug 262 // return EINVAL; // for debug
261 263
264#ifdef _WIN32
265 if (p->affinityGroup >= 0)
266 wres = Thread_Create_With_Group(&p->thread, startAddress, obj,
267 (unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup);
268 else
269#endif
262 if (p->affinity != 0) 270 if (p->affinity != 0)
263 wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity); 271 wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity);
264 else 272 else
diff --git a/C/LzFindMt.h b/C/LzFindMt.h
index fcb479d..89984f5 100644
--- a/C/LzFindMt.h
+++ b/C/LzFindMt.h
@@ -1,5 +1,5 @@
1/* LzFindMt.h -- multithreaded Match finder for LZ algorithms 1/* LzFindMt.h -- multithreaded Match finder for LZ algorithms
22024-01-22 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_LZ_FIND_MT_H 4#ifndef ZIP7_INC_LZ_FIND_MT_H
5#define ZIP7_INC_LZ_FIND_MT_H 5#define ZIP7_INC_LZ_FIND_MT_H
@@ -12,8 +12,10 @@ EXTERN_C_BEGIN
12typedef struct 12typedef struct
13{ 13{
14 UInt32 numProcessedBlocks; 14 UInt32 numProcessedBlocks;
15 CThread thread; 15 Int32 affinityGroup;
16 UInt64 affinityInGroup;
16 UInt64 affinity; 17 UInt64 affinity;
18 CThread thread;
17 19
18 BoolInt wasCreated; 20 BoolInt wasCreated;
19 BoolInt needStart; 21 BoolInt needStart;
diff --git a/C/Lzma2Enc.c b/C/Lzma2Enc.c
index 703e146..72aec69 100644
--- a/C/Lzma2Enc.c
+++ b/C/Lzma2Enc.c
@@ -1,5 +1,5 @@
1/* Lzma2Enc.c -- LZMA2 Encoder 1/* Lzma2Enc.c -- LZMA2 Encoder
22023-04-13 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -235,6 +235,7 @@ void Lzma2EncProps_Init(CLzma2EncProps *p)
235 p->numBlockThreads_Reduced = -1; 235 p->numBlockThreads_Reduced = -1;
236 p->numBlockThreads_Max = -1; 236 p->numBlockThreads_Max = -1;
237 p->numTotalThreads = -1; 237 p->numTotalThreads = -1;
238 p->numThreadGroups = 0;
238} 239}
239 240
240void Lzma2EncProps_Normalize(CLzma2EncProps *p) 241void Lzma2EncProps_Normalize(CLzma2EncProps *p)
@@ -781,6 +782,7 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle p,
781 } 782 }
782 783
783 p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max; 784 p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max;
785 p->mtCoder.numThreadGroups = p->props.numThreadGroups;
784 p->mtCoder.expectedDataSize = p->expectedDataSize; 786 p->mtCoder.expectedDataSize = p->expectedDataSize;
785 787
786 { 788 {
diff --git a/C/Lzma2Enc.h b/C/Lzma2Enc.h
index cb25275..1e6b50c 100644
--- a/C/Lzma2Enc.h
+++ b/C/Lzma2Enc.h
@@ -18,6 +18,7 @@ typedef struct
18 int numBlockThreads_Reduced; 18 int numBlockThreads_Reduced;
19 int numBlockThreads_Max; 19 int numBlockThreads_Max;
20 int numTotalThreads; 20 int numTotalThreads;
21 unsigned numThreadGroups; // 0 : no groups
21} CLzma2EncProps; 22} CLzma2EncProps;
22 23
23void Lzma2EncProps_Init(CLzma2EncProps *p); 24void Lzma2EncProps_Init(CLzma2EncProps *p);
diff --git a/C/LzmaEnc.c b/C/LzmaEnc.c
index 37b2787..84a29a5 100644
--- a/C/LzmaEnc.c
+++ b/C/LzmaEnc.c
@@ -1,5 +1,5 @@
1/* LzmaEnc.c -- LZMA Encoder 1/* LzmaEnc.c -- LZMA Encoder
22024-01-24: Igor Pavlov : Public domain */ 2Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -62,7 +62,9 @@ void LzmaEncProps_Init(CLzmaEncProps *p)
62 p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1; 62 p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1;
63 p->numHashOutBits = 0; 63 p->numHashOutBits = 0;
64 p->writeEndMark = 0; 64 p->writeEndMark = 0;
65 p->affinityGroup = -1;
65 p->affinity = 0; 66 p->affinity = 0;
67 p->affinityInGroup = 0;
66} 68}
67 69
68void LzmaEncProps_Normalize(CLzmaEncProps *p) 70void LzmaEncProps_Normalize(CLzmaEncProps *p)
@@ -72,11 +74,11 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p)
72 p->level = level; 74 p->level = level;
73 75
74 if (p->dictSize == 0) 76 if (p->dictSize == 0)
75 p->dictSize = 77 p->dictSize = (unsigned)level <= 4 ?
76 ( level <= 3 ? ((UInt32)1 << (level * 2 + 16)) : 78 (UInt32)1 << (level * 2 + 16) :
77 ( level <= 6 ? ((UInt32)1 << (level + 19)) : 79 (unsigned)level <= sizeof(size_t) / 2 + 4 ?
78 ( level <= 7 ? ((UInt32)1 << 25) : ((UInt32)1 << 26) 80 (UInt32)1 << (level + 20) :
79 ))); 81 (UInt32)1 << (sizeof(size_t) / 2 + 24);
80 82
81 if (p->dictSize > p->reduceSize) 83 if (p->dictSize > p->reduceSize)
82 { 84 {
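Worked defaults produced by the new dictSize expression (arithmetic read off the formula above, not additional patch code):

  /* 64-bit build (sizeof(size_t) == 8):
       level 1..4 : 1 << (level * 2 + 16) -> 256 KiB, 1 MiB, 4 MiB, 16 MiB
       level 5..8 : 1 << (level + 20)     -> 32, 64, 128, 256 MiB
       level >= 9 : 1 << (8/2 + 24)       -> 256 MiB cap
     32-bit build (sizeof(size_t) == 4):
       level 5..6 : 1 << (level + 20)     -> 32, 64 MiB
       level >= 7 : 1 << (4/2 + 24)       -> 64 MiB cap */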
@@ -92,8 +94,8 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p)
92 if (p->lp < 0) p->lp = 0; 94 if (p->lp < 0) p->lp = 0;
93 if (p->pb < 0) p->pb = 2; 95 if (p->pb < 0) p->pb = 2;
94 96
95 if (p->algo < 0) p->algo = (level < 5 ? 0 : 1); 97 if (p->algo < 0) p->algo = (unsigned)level < 5 ? 0 : 1;
96 if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); 98 if (p->fb < 0) p->fb = (unsigned)level < 7 ? 32 : 64;
97 if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); 99 if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1);
98 if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5); 100 if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5);
99 if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1); 101 if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1);
@@ -598,6 +600,10 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props2)
598 p->multiThread = (props.numThreads > 1); 600 p->multiThread = (props.numThreads > 1);
599 p->matchFinderMt.btSync.affinity = 601 p->matchFinderMt.btSync.affinity =
600 p->matchFinderMt.hashSync.affinity = props.affinity; 602 p->matchFinderMt.hashSync.affinity = props.affinity;
603 p->matchFinderMt.btSync.affinityGroup =
604 p->matchFinderMt.hashSync.affinityGroup = props.affinityGroup;
605 p->matchFinderMt.btSync.affinityInGroup =
606 p->matchFinderMt.hashSync.affinityInGroup = props.affinityInGroup;
601 #endif 607 #endif
602 608
603 return SZ_OK; 609 return SZ_OK;
diff --git a/C/LzmaEnc.h b/C/LzmaEnc.h
index 9f8039a..3feb5b4 100644
--- a/C/LzmaEnc.h
+++ b/C/LzmaEnc.h
@@ -1,5 +1,5 @@
1/* LzmaEnc.h -- LZMA Encoder 1/* LzmaEnc.h -- LZMA Encoder
22023-04-13 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_LZMA_ENC_H 4#ifndef ZIP7_INC_LZMA_ENC_H
5#define ZIP7_INC_LZMA_ENC_H 5#define ZIP7_INC_LZMA_ENC_H
@@ -29,11 +29,13 @@ typedef struct
29 int numThreads; /* 1 or 2, default = 2 */ 29 int numThreads; /* 1 or 2, default = 2 */
30 30
31 // int _pad; 31 // int _pad;
32 Int32 affinityGroup;
32 33
33 UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1. 34 UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1.
34 Encoder uses this value to reduce dictionary size */ 35 Encoder uses this value to reduce dictionary size */
35 36
36 UInt64 affinity; 37 UInt64 affinity;
38 UInt64 affinityInGroup;
37} CLzmaEncProps; 39} CLzmaEncProps;
38 40
39void LzmaEncProps_Init(CLzmaEncProps *p); 41void LzmaEncProps_Init(CLzmaEncProps *p);
diff --git a/C/Md5.c b/C/Md5.c
new file mode 100644
index 0000000..1b745d7
--- /dev/null
+++ b/C/Md5.c
@@ -0,0 +1,206 @@
1/* Md5.c -- MD5 Hash
2: Igor Pavlov : Public domain
3This code is based on Colin Plumb's public domain md5.c code */
4
5#include "Precomp.h"
6
7#include <string.h>
8
9#include "Md5.h"
10#include "RotateDefs.h"
11#include "CpuArch.h"
12
13#define MD5_UPDATE_BLOCKS(p) Md5_UpdateBlocks
14
15Z7_NO_INLINE
16void Md5_Init(CMd5 *p)
17{
18 p->count = 0;
19 p->state[0] = 0x67452301;
20 p->state[1] = 0xefcdab89;
21 p->state[2] = 0x98badcfe;
22 p->state[3] = 0x10325476;
23}
24
25#if 0 && !defined(MY_CPU_LE_UNALIGN)
26// optional optimization for Big-endian processors or processors without unaligned access:
27// it is intended to reduce the number of complex LE32 memory reads from 64 to 16.
28// But some compilers (sparc, armt) are better without this optimization.
29#define Z7_MD5_USE_DATA32_ARRAY
30#endif
31
32#define LOAD_DATA(i) GetUi32((const UInt32 *)(const void *)data + (i))
33
34#ifdef Z7_MD5_USE_DATA32_ARRAY
35#define D(i) data32[i]
36#else
37#define D(i) LOAD_DATA(i)
38#endif
39
40#define F1(x, y, z) (z ^ (x & (y ^ z)))
41#define F2(x, y, z) F1(z, x, y)
42#define F3(x, y, z) (x ^ y ^ z)
43#define F4(x, y, z) (y ^ (x | ~z))
44
45#define R1(i, f, start, step, w, x, y, z, s, k) \
46 w += D((start + step * (i)) % 16) + k; \
47 w += f(x, y, z); \
48 w = rotlFixed(w, s) + x; \
49
50#define R4(i4, f, start, step, s0,s1,s2,s3, k0,k1,k2,k3) \
51 R1 (i4*4+0, f, start, step, a,b,c,d, s0, k0) \
52 R1 (i4*4+1, f, start, step, d,a,b,c, s1, k1) \
53 R1 (i4*4+2, f, start, step, c,d,a,b, s2, k2) \
54 R1 (i4*4+3, f, start, step, b,c,d,a, s3, k3) \
55
56#define R16(f, start, step, s0,s1,s2,s3, k00,k01,k02,k03, k10,k11,k12,k13, k20,k21,k22,k23, k30,k31,k32,k33) \
57 R4 (0, f, start, step, s0,s1,s2,s3, k00,k01,k02,k03) \
58 R4 (1, f, start, step, s0,s1,s2,s3, k10,k11,k12,k13) \
59 R4 (2, f, start, step, s0,s1,s2,s3, k20,k21,k22,k23) \
60 R4 (3, f, start, step, s0,s1,s2,s3, k30,k31,k32,k33) \
61
62static
63Z7_NO_INLINE
64void Z7_FASTCALL Md5_UpdateBlocks(UInt32 state[4], const Byte *data, size_t numBlocks)
65{
66 UInt32 a, b, c, d;
67 // if (numBlocks == 0) return;
68 a = state[0];
69 b = state[1];
70 c = state[2];
71 d = state[3];
72 do
73 {
74#ifdef Z7_MD5_USE_DATA32_ARRAY
75 UInt32 data32[MD5_NUM_BLOCK_WORDS];
76 {
77#define LOAD_data32_x4(i) { \
78 data32[i ] = LOAD_DATA(i ); \
79 data32[i + 1] = LOAD_DATA(i + 1); \
80 data32[i + 2] = LOAD_DATA(i + 2); \
81 data32[i + 3] = LOAD_DATA(i + 3); }
82#if 1
83 LOAD_data32_x4 (0 * 4)
84 LOAD_data32_x4 (1 * 4)
85 LOAD_data32_x4 (2 * 4)
86 LOAD_data32_x4 (3 * 4)
87#else
88 unsigned i;
89 for (i = 0; i < MD5_NUM_BLOCK_WORDS; i += 4)
90 {
91 LOAD_data32_x4(i)
92 }
93#endif
94 }
95#endif
96
97 R16 (F1, 0, 1, 7,12,17,22, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
98 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
99 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
100 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821)
101 R16 (F2, 1, 5, 5, 9,14,20, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
102 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
103 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
104 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a)
105 R16 (F3, 5, 3, 4,11,16,23, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
106 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
107 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
108 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665)
109 R16 (F4, 0, 7, 6,10,15,21, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
110 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
111 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
112 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391)
113
114 a += state[0];
115 b += state[1];
116 c += state[2];
117 d += state[3];
118
119 state[0] = a;
120 state[1] = b;
121 state[2] = c;
122 state[3] = d;
123
124 data += MD5_BLOCK_SIZE;
125 }
126 while (--numBlocks);
127}
128
129
130#define Md5_UpdateBlock(p) MD5_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
131
132void Md5_Update(CMd5 *p, const Byte *data, size_t size)
133{
134 if (size == 0)
135 return;
136 {
137 const unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1);
138 const unsigned num = MD5_BLOCK_SIZE - pos;
139 p->count += size;
140 if (num > size)
141 {
142 memcpy(p->buffer + pos, data, size);
143 return;
144 }
145 if (pos != 0)
146 {
147 size -= num;
148 memcpy(p->buffer + pos, data, num);
149 data += num;
150 Md5_UpdateBlock(p);
151 }
152 }
153 {
154 const size_t numBlocks = size >> 6;
155 if (numBlocks)
156 MD5_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
157 size &= MD5_BLOCK_SIZE - 1;
158 if (size == 0)
159 return;
160 data += (numBlocks << 6);
161 memcpy(p->buffer, data, size);
162 }
163}
164
165
166void Md5_Final(CMd5 *p, Byte *digest)
167{
168 unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1);
169 p->buffer[pos++] = 0x80;
170 if (pos > (MD5_BLOCK_SIZE - 4 * 2))
171 {
172 while (pos != MD5_BLOCK_SIZE) { p->buffer[pos++] = 0; }
173 // memset(&p->buf.buffer[pos], 0, MD5_BLOCK_SIZE - pos);
174 Md5_UpdateBlock(p);
175 pos = 0;
176 }
177 memset(&p->buffer[pos], 0, (MD5_BLOCK_SIZE - 4 * 2) - pos);
178 {
179 const UInt64 numBits = p->count << 3;
180#if defined(MY_CPU_LE_UNALIGN)
181 SetUi64 (p->buffer + MD5_BLOCK_SIZE - 4 * 2, numBits)
182#else
183 SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 2, (UInt32)(numBits))
184 SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 1, (UInt32)(numBits >> 32))
185#endif
186 }
187 Md5_UpdateBlock(p);
188
189 SetUi32(digest, p->state[0])
190 SetUi32(digest + 4, p->state[1])
191 SetUi32(digest + 8, p->state[2])
192 SetUi32(digest + 12, p->state[3])
193
194 Md5_Init(p);
195}
196
197#undef R1
198#undef R4
199#undef R16
200#undef D
201#undef LOAD_DATA
202#undef LOAD_data32_x4
203#undef F1
204#undef F2
205#undef F3
206#undef F4
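The F1 form used above is the branch-free rewrite of the textbook MD5 selection function; the identity checks out bit by bit (a verification note, not patch code):

  /* classic round-1 function:  (x & y) | (~x & z)
     form used in Md5.c:        z ^ (x & (y ^ z))
     per-bit check:
       x = 1 :  z ^ (y ^ z) == y   and  (y | 0) == y
       x = 0 :  z ^ 0       == z   and  (0 | z) == z
     F2 then reuses F1 with rotated arguments, as the macros above show. */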
diff --git a/C/Md5.h b/C/Md5.h
new file mode 100644
index 0000000..49c0741
--- /dev/null
+++ b/C/Md5.h
@@ -0,0 +1,34 @@
1/* Md5.h -- MD5 Hash
2: Igor Pavlov : Public domain */
3
4#ifndef ZIP7_INC_MD5_H
5#define ZIP7_INC_MD5_H
6
7#include "7zTypes.h"
8
9EXTERN_C_BEGIN
10
11#define MD5_NUM_BLOCK_WORDS 16
12#define MD5_NUM_DIGEST_WORDS 4
13
14#define MD5_BLOCK_SIZE (MD5_NUM_BLOCK_WORDS * 4)
15#define MD5_DIGEST_SIZE (MD5_NUM_DIGEST_WORDS * 4)
16
17typedef struct
18{
19 UInt64 count;
20 UInt64 _pad_1;
21 // we want 16-byte alignment here
22 UInt32 state[MD5_NUM_DIGEST_WORDS];
23 UInt64 _pad_2[4];
24 // we want 64-byte alignment here
25 Byte buffer[MD5_BLOCK_SIZE];
26} CMd5;
27
28void Md5_Init(CMd5 *p);
29void Md5_Update(CMd5 *p, const Byte *data, size_t size);
30void Md5_Final(CMd5 *p, Byte *digest);
31
32EXTERN_C_END
33
34#endif
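A minimal usage sketch for the new MD5 API (assumes the 7-Zip C headers and Md5.c are on the include/link path; the expected digest is the RFC 1321 test vector for "abc"):

  #include <stdio.h>
  #include "Md5.h"

  int main(void)
  {
    CMd5 md5;
    Byte digest[MD5_DIGEST_SIZE];
    unsigned i;
    Md5_Init(&md5);
    Md5_Update(&md5, (const Byte *)"abc", 3);
    Md5_Final(&md5, digest);  /* Md5_Final() also re-inits the context */
    for (i = 0; i < MD5_DIGEST_SIZE; i++)
      printf("%02x", digest[i]);
    printf("\n");  /* expected: 900150983cd24fb0d6963f7d28e17f72 */
    return 0;
  }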
diff --git a/C/MtCoder.c b/C/MtCoder.c
index 03959b6..923b19a 100644
--- a/C/MtCoder.c
+++ b/C/MtCoder.c
@@ -1,5 +1,5 @@
1/* MtCoder.c -- Multi-thread Coder 1/* MtCoder.c -- Multi-thread Coder
22023-09-07 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -39,14 +39,28 @@ void MtProgressThunk_CreateVTable(CMtProgressThunk *p)
39static THREAD_FUNC_DECL ThreadFunc(void *pp); 39static THREAD_FUNC_DECL ThreadFunc(void *pp);
40 40
41 41
42static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) 42static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t
43#ifdef _WIN32
44 , CMtCoder * const mtc
45#endif
46 )
43{ 47{
44 WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent); 48 WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent);
49 // printf("\n====== MtCoderThread_CreateAndStart : \n");
45 if (wres == 0) 50 if (wres == 0)
46 { 51 {
47 t->stop = False; 52 t->stop = False;
48 if (!Thread_WasCreated(&t->thread)) 53 if (!Thread_WasCreated(&t->thread))
49 wres = Thread_Create(&t->thread, ThreadFunc, t); 54 {
55#ifdef _WIN32
56 if (mtc->numThreadGroups)
57 wres = Thread_Create_With_Group(&t->thread, ThreadFunc, t,
58 ThreadNextGroup_GetNext(&mtc->nextGroup), // group
59 0); // affinityMask
60 else
61#endif
62 wres = Thread_Create(&t->thread, ThreadFunc, t);
63 }
50 if (wres == 0) 64 if (wres == 0)
51 wres = Event_Set(&t->startEvent); 65 wres = Event_Set(&t->startEvent);
52 } 66 }
@@ -56,6 +70,7 @@ static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t)
56} 70}
57 71
58 72
73Z7_FORCE_INLINE
59static void MtCoderThread_Destruct(CMtCoderThread *t) 74static void MtCoderThread_Destruct(CMtCoderThread *t)
60{ 75{
61 if (Thread_WasCreated(&t->thread)) 76 if (Thread_WasCreated(&t->thread))
@@ -85,7 +100,7 @@ static void MtCoderThread_Destruct(CMtCoderThread *t)
85 100
86static SRes ThreadFunc2(CMtCoderThread *t) 101static SRes ThreadFunc2(CMtCoderThread *t)
87{ 102{
88 CMtCoder *mtc = t->mtCoder; 103 CMtCoder * const mtc = t->mtCoder;
89 104
90 for (;;) 105 for (;;)
91 { 106 {
@@ -185,7 +200,11 @@ static SRes ThreadFunc2(CMtCoderThread *t)
185 if (mtc->numStartedThreads < mtc->numStartedThreadsLimit 200 if (mtc->numStartedThreads < mtc->numStartedThreadsLimit
186 && mtc->expectedDataSize != readProcessed) 201 && mtc->expectedDataSize != readProcessed)
187 { 202 {
188 res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]); 203 res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]
204#ifdef _WIN32
205 , mtc
206#endif
207 );
189 if (res == SZ_OK) 208 if (res == SZ_OK)
190 mtc->numStartedThreads++; 209 mtc->numStartedThreads++;
191 else 210 else
@@ -221,7 +240,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
221 } 240 }
222 241
223 { 242 {
224 CMtCoderBlock *block = &mtc->blocks[bi]; 243 CMtCoderBlock * const block = &mtc->blocks[bi];
225 block->res = res; 244 block->res = res;
226 block->bufIndex = bufIndex; 245 block->bufIndex = bufIndex;
227 block->finished = finished; 246 block->finished = finished;
@@ -311,7 +330,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
311 330
312static THREAD_FUNC_DECL ThreadFunc(void *pp) 331static THREAD_FUNC_DECL ThreadFunc(void *pp)
313{ 332{
314 CMtCoderThread *t = (CMtCoderThread *)pp; 333 CMtCoderThread * const t = (CMtCoderThread *)pp;
315 for (;;) 334 for (;;)
316 { 335 {
317 if (Event_Wait(&t->startEvent) != 0) 336 if (Event_Wait(&t->startEvent) != 0)
@@ -319,7 +338,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp)
319 if (t->stop) 338 if (t->stop)
320 return 0; 339 return 0;
321 { 340 {
322 SRes res = ThreadFunc2(t); 341 const SRes res = ThreadFunc2(t);
323 CMtCoder *mtc = t->mtCoder; 342 CMtCoder *mtc = t->mtCoder;
324 if (res != SZ_OK) 343 if (res != SZ_OK)
325 { 344 {
@@ -328,7 +347,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp)
328 347
329 #ifndef MTCODER_USE_WRITE_THREAD 348 #ifndef MTCODER_USE_WRITE_THREAD
330 { 349 {
331 unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); 350 const unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads);
332 if (numFinished == mtc->numStartedThreads) 351 if (numFinished == mtc->numStartedThreads)
333 if (Event_Set(&mtc->finishedEvent) != 0) 352 if (Event_Set(&mtc->finishedEvent) != 0)
334 return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; 353 return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD;
@@ -346,6 +365,7 @@ void MtCoder_Construct(CMtCoder *p)
346 365
347 p->blockSize = 0; 366 p->blockSize = 0;
348 p->numThreadsMax = 0; 367 p->numThreadsMax = 0;
368 p->numThreadGroups = 0;
349 p->expectedDataSize = (UInt64)(Int64)-1; 369 p->expectedDataSize = (UInt64)(Int64)-1;
350 370
351 p->inStream = NULL; 371 p->inStream = NULL;
@@ -429,6 +449,8 @@ SRes MtCoder_Code(CMtCoder *p)
429 unsigned i; 449 unsigned i;
430 SRes res = SZ_OK; 450 SRes res = SZ_OK;
431 451
452 // printf("\n====== MtCoder_Code : \n");
453
432 if (numThreads > MTCODER_THREADS_MAX) 454 if (numThreads > MTCODER_THREADS_MAX)
433 numThreads = MTCODER_THREADS_MAX; 455 numThreads = MTCODER_THREADS_MAX;
434 numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); 456 numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads);
@@ -492,11 +514,22 @@ SRes MtCoder_Code(CMtCoder *p)
492 514
493 p->numStartedThreadsLimit = numThreads; 515 p->numStartedThreadsLimit = numThreads;
494 p->numStartedThreads = 0; 516 p->numStartedThreads = 0;
517 ThreadNextGroup_Init(&p->nextGroup, p->numThreadGroups, 0); // startGroup
495 518
496 // for (i = 0; i < numThreads; i++) 519 // for (i = 0; i < numThreads; i++)
497 { 520 {
521 // here we create a new thread for the first block.
522 // Each new thread then creates another new thread after reading its block,
523 // until numStartedThreadsLimit is reached.
498 CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++]; 524 CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++];
499 RINOK(MtCoderThread_CreateAndStart(nextThread)) 525 {
526 const SRes res2 = MtCoderThread_CreateAndStart(nextThread
527#ifdef _WIN32
528 , p
529#endif
530 );
531 RINOK(res2)
532 }
500 } 533 }
501 534
502 RINOK_THREAD(Event_Set(&p->readEvent)) 535 RINOK_THREAD(Event_Set(&p->readEvent))
@@ -513,9 +546,9 @@ SRes MtCoder_Code(CMtCoder *p)
513 RINOK_THREAD(Event_Wait(&p->writeEvents[bi])) 546 RINOK_THREAD(Event_Wait(&p->writeEvents[bi]))
514 547
515 { 548 {
516 const CMtCoderBlock *block = &p->blocks[bi]; 549 const CMtCoderBlock * const block = &p->blocks[bi];
517 unsigned bufIndex = block->bufIndex; 550 const unsigned bufIndex = block->bufIndex;
518 BoolInt finished = block->finished; 551 const BoolInt finished = block->finished;
519 if (res == SZ_OK && block->res != SZ_OK) 552 if (res == SZ_OK && block->res != SZ_OK)
520 res = block->res; 553 res = block->res;
521 554
@@ -545,7 +578,7 @@ SRes MtCoder_Code(CMtCoder *p)
545 } 578 }
546 #else 579 #else
547 { 580 {
548 WRes wres = Event_Wait(&p->finishedEvent); 581 const WRes wres = Event_Wait(&p->finishedEvent);
549 res = MY_SRes_HRESULT_FROM_WRes(wres); 582 res = MY_SRes_HRESULT_FROM_WRes(wres);
550 } 583 }
551 #endif 584 #endif
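CThreadNextGroup and ThreadNextGroup_GetNext() live in Threads.h/Threads.c, outside this diff; a plausible reading (an assumption, not the shipped code) is plain round-robin, so consecutive worker threads land on different Windows processor groups:

  typedef struct { unsigned NumGroups; unsigned NextGroup; } CThreadNextGroup_sketch;

  static unsigned ThreadNextGroup_GetNext_sketch(CThreadNextGroup_sketch *p)
  {
    const unsigned group = p->NextGroup;
    p->NextGroup = (group + 1 == p->NumGroups) ? 0 : group + 1;
    return group;
  }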
diff --git a/C/MtCoder.h b/C/MtCoder.h
index 1231d3c..8166cca 100644
--- a/C/MtCoder.h
+++ b/C/MtCoder.h
@@ -1,5 +1,5 @@
1/* MtCoder.h -- Multi-thread Coder 1/* MtCoder.h -- Multi-thread Coder
22023-04-13 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_MT_CODER_H 4#ifndef ZIP7_INC_MT_CODER_H
5#define ZIP7_INC_MT_CODER_H 5#define ZIP7_INC_MT_CODER_H
@@ -16,7 +16,7 @@ EXTERN_C_BEGIN
16 16
17#ifndef Z7_ST 17#ifndef Z7_ST
18 #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1) 18 #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1)
19 #define MTCODER_THREADS_MAX 64 19 #define MTCODER_THREADS_MAX 256
20 #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3) 20 #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3)
21#else 21#else
22 #define MTCODER_THREADS_MAX 1 22 #define MTCODER_THREADS_MAX 1
@@ -77,6 +77,7 @@ typedef struct CMtCoder_
77 77
78 size_t blockSize; /* size of input block */ 78 size_t blockSize; /* size of input block */
79 unsigned numThreadsMax; 79 unsigned numThreadsMax;
80 unsigned numThreadGroups;
80 UInt64 expectedDataSize; 81 UInt64 expectedDataSize;
81 82
82 ISeqInStreamPtr inStream; 83 ISeqInStreamPtr inStream;
@@ -125,6 +126,8 @@ typedef struct CMtCoder_
125 CMtProgress mtProgress; 126 CMtProgress mtProgress;
126 CMtCoderBlock blocks[MTCODER_BLOCKS_MAX]; 127 CMtCoderBlock blocks[MTCODER_BLOCKS_MAX];
127 CMtCoderThread threads[MTCODER_THREADS_MAX]; 128 CMtCoderThread threads[MTCODER_THREADS_MAX];
129
130 CThreadNextGroup nextGroup;
128} CMtCoder; 131} CMtCoder;
129 132
130 133
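The bump from 64 to 256 threads also grows the static arrays at the end of CMtCoder; the block count follows directly from the macro above:

  /* MTCODER_GET_NUM_BLOCKS_FROM_THREADS(256) = 256 + 256/8 + 1 = 289
     MTCODER_BLOCKS_MAX                       = 289 + 3        = 292
     so blocks[] and threads[] are sized for 292 and 256 entries
     in non-Z7_ST builds. */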
diff --git a/C/Sha1.c b/C/Sha1.c
index 4c92892..4ca21d7 100644
--- a/C/Sha1.c
+++ b/C/Sha1.c
@@ -1,18 +1,14 @@
1/* Sha1.c -- SHA-1 Hash 1/* Sha1.c -- SHA-1 Hash
22024-03-01 : Igor Pavlov : Public domain 2: Igor Pavlov : Public domain
3This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */ 3This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */
4 4
5#include "Precomp.h" 5#include "Precomp.h"
6 6
7#include <string.h> 7#include <string.h>
8 8
9#include "CpuArch.h"
10#include "RotateDefs.h"
11#include "Sha1.h" 9#include "Sha1.h"
12 10#include "RotateDefs.h"
13#if defined(_MSC_VER) && (_MSC_VER < 1900) 11#include "CpuArch.h"
14// #define USE_MY_MM
15#endif
16 12
17#ifdef MY_CPU_X86_OR_AMD64 13#ifdef MY_CPU_X86_OR_AMD64
18 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ 14 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
@@ -56,7 +52,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num
56 static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS = Sha1_UpdateBlocks; 52 static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS = Sha1_UpdateBlocks;
57 static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS_HW; 53 static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS_HW;
58 54
59 #define SHA1_UPDATE_BLOCKS(p) p->func_UpdateBlocks 55 #define SHA1_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
60#else 56#else
61 #define SHA1_UPDATE_BLOCKS(p) Sha1_UpdateBlocks 57 #define SHA1_UPDATE_BLOCKS(p) Sha1_UpdateBlocks
62#endif 58#endif
@@ -85,7 +81,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
85 return False; 81 return False;
86 #endif 82 #endif
87 83
88 p->func_UpdateBlocks = func; 84 p->v.vars.func_UpdateBlocks = func;
89 return True; 85 return True;
90} 86}
91 87
@@ -225,7 +221,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
225 221
226void Sha1_InitState(CSha1 *p) 222void Sha1_InitState(CSha1 *p)
227{ 223{
228 p->count = 0; 224 p->v.vars.count = 0;
229 p->state[0] = 0x67452301; 225 p->state[0] = 0x67452301;
230 p->state[1] = 0xEFCDAB89; 226 p->state[1] = 0xEFCDAB89;
231 p->state[2] = 0x98BADCFE; 227 p->state[2] = 0x98BADCFE;
@@ -235,7 +231,7 @@ void Sha1_InitState(CSha1 *p)
235 231
236void Sha1_Init(CSha1 *p) 232void Sha1_Init(CSha1 *p)
237{ 233{
238 p->func_UpdateBlocks = 234 p->v.vars.func_UpdateBlocks =
239 #ifdef Z7_COMPILER_SHA1_SUPPORTED 235 #ifdef Z7_COMPILER_SHA1_SUPPORTED
240 g_SHA1_FUNC_UPDATE_BLOCKS; 236 g_SHA1_FUNC_UPDATE_BLOCKS;
241 #else 237 #else
@@ -250,7 +246,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num
250{ 246{
251 UInt32 a, b, c, d, e; 247 UInt32 a, b, c, d, e;
252 UInt32 W[kNumW]; 248 UInt32 W[kNumW];
253 // if (numBlocks != 0x1264378347) return; 249
254 if (numBlocks == 0) 250 if (numBlocks == 0)
255 return; 251 return;
256 252
@@ -283,7 +279,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num
283 state[3] = d; 279 state[3] = d;
284 state[4] = e; 280 state[4] = e;
285 281
286 data += 64; 282 data += SHA1_BLOCK_SIZE;
287 } 283 }
288 while (--numBlocks); 284 while (--numBlocks);
289} 285}
@@ -295,20 +291,15 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size)
295{ 291{
296 if (size == 0) 292 if (size == 0)
297 return; 293 return;
298
299 { 294 {
300 unsigned pos = (unsigned)p->count & 0x3F; 295 const unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1);
301 unsigned num; 296 const unsigned num = SHA1_BLOCK_SIZE - pos;
302 297 p->v.vars.count += size;
303 p->count += size;
304
305 num = 64 - pos;
306 if (num > size) 298 if (num > size)
307 { 299 {
308 memcpy(p->buffer + pos, data, size); 300 memcpy(p->buffer + pos, data, size);
309 return; 301 return;
310 } 302 }
311
312 if (pos != 0) 303 if (pos != 0)
313 { 304 {
314 size -= num; 305 size -= num;
@@ -318,9 +309,10 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size)
318 } 309 }
319 } 310 }
320 { 311 {
321 size_t numBlocks = size >> 6; 312 const size_t numBlocks = size >> 6;
313 // if (numBlocks)
322 SHA1_UPDATE_BLOCKS(p)(p->state, data, numBlocks); 314 SHA1_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
323 size &= 0x3F; 315 size &= SHA1_BLOCK_SIZE - 1;
324 if (size == 0) 316 if (size == 0)
325 return; 317 return;
326 data += (numBlocks << 6); 318 data += (numBlocks << 6);
@@ -331,42 +323,21 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size)
331 323
332void Sha1_Final(CSha1 *p, Byte *digest) 324void Sha1_Final(CSha1 *p, Byte *digest)
333{ 325{
334 unsigned pos = (unsigned)p->count & 0x3F; 326 unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1);
335
336
337 p->buffer[pos++] = 0x80; 327 p->buffer[pos++] = 0x80;
338 328 if (pos > (SHA1_BLOCK_SIZE - 4 * 2))
339 if (pos > (64 - 8))
340 { 329 {
341 while (pos != 64) { p->buffer[pos++] = 0; } 330 while (pos != SHA1_BLOCK_SIZE) { p->buffer[pos++] = 0; }
342 // memset(&p->buf.buffer[pos], 0, 64 - pos); 331 // memset(&p->buf.buffer[pos], 0, SHA1_BLOCK_SIZE - pos);
343 Sha1_UpdateBlock(p); 332 Sha1_UpdateBlock(p);
344 pos = 0; 333 pos = 0;
345 } 334 }
346 335 memset(&p->buffer[pos], 0, (SHA1_BLOCK_SIZE - 4 * 2) - pos);
347 /*
348 if (pos & 3)
349 {
350 p->buffer[pos] = 0;
351 p->buffer[pos + 1] = 0;
352 p->buffer[pos + 2] = 0;
353 pos += 3;
354 pos &= ~3;
355 }
356 {
357 for (; pos < 64 - 8; pos += 4)
358 *(UInt32 *)(&p->buffer[pos]) = 0;
359 }
360 */
361
362 memset(&p->buffer[pos], 0, (64 - 8) - pos);
363
364 { 336 {
365 const UInt64 numBits = (p->count << 3); 337 const UInt64 numBits = p->v.vars.count << 3;
366 SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32)) 338 SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32))
367 SetBe32(p->buffer + 64 - 4, (UInt32)(numBits)) 339 SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 1, (UInt32)(numBits))
368 } 340 }
369
370 Sha1_UpdateBlock(p); 341 Sha1_UpdateBlock(p);
371 342
372 SetBe32(digest, p->state[0]) 343 SetBe32(digest, p->state[0])
@@ -375,16 +346,13 @@ void Sha1_Final(CSha1 *p, Byte *digest)
375 SetBe32(digest + 12, p->state[3]) 346 SetBe32(digest + 12, p->state[3])
376 SetBe32(digest + 16, p->state[4]) 347 SetBe32(digest + 16, p->state[4])
377 348
378
379
380
381 Sha1_InitState(p); 349 Sha1_InitState(p);
382} 350}
383 351
384 352
385void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size) 353void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size)
386{ 354{
387 const UInt64 numBits = (p->count + size) << 3; 355 const UInt64 numBits = (p->v.vars.count + size) << 3;
388 SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 2], (UInt32)(numBits >> 32)) 356 SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 2], (UInt32)(numBits >> 32))
389 SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 1], (UInt32)(numBits)) 357 SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 1], (UInt32)(numBits))
390 // SetBe32((UInt32 *)(block + size), 0x80000000); 358 // SetBe32((UInt32 *)(block + size), 0x80000000);
@@ -420,57 +388,32 @@ void Sha1_GetBlockDigest(const CSha1 *p, const Byte *data, Byte *destDigest)
420 388
421void Sha1Prepare(void) 389void Sha1Prepare(void)
422{ 390{
423 #ifdef Z7_COMPILER_SHA1_SUPPORTED 391#ifdef Z7_COMPILER_SHA1_SUPPORTED
424 SHA1_FUNC_UPDATE_BLOCKS f, f_hw; 392 SHA1_FUNC_UPDATE_BLOCKS f, f_hw;
425 f = Sha1_UpdateBlocks; 393 f = Sha1_UpdateBlocks;
426 f_hw = NULL; 394 f_hw = NULL;
427 #ifdef MY_CPU_X86_OR_AMD64 395#ifdef MY_CPU_X86_OR_AMD64
428 #ifndef USE_MY_MM
429 if (CPU_IsSupported_SHA() 396 if (CPU_IsSupported_SHA()
430 && CPU_IsSupported_SSSE3() 397 && CPU_IsSupported_SSSE3()
431 // && CPU_IsSupported_SSE41()
432 ) 398 )
433 #endif 399#else
434 #else
435 if (CPU_IsSupported_SHA1()) 400 if (CPU_IsSupported_SHA1())
436 #endif 401#endif
437 { 402 {
438 // printf("\n========== HW SHA1 ======== \n"); 403 // printf("\n========== HW SHA1 ======== \n");
439 #if 0 && defined(MY_CPU_ARM_OR_ARM64) && defined(_MSC_VER) 404#if 1 && defined(MY_CPU_ARM_OR_ARM64) && defined(Z7_MSC_VER_ORIGINAL) && (_MSC_FULL_VER < 192930037)
440 /* there was a bug in the MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037). 405 /* there was a bug in the MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037).
441 It generated incorrect SHA-1 code. 406 It generated incorrect SHA-1 code. */
442 21.03 : we test sha1-hardware code at runtime initialization */ 407 #pragma message("== SHA1 code can work incorrectly with this compiler")
443 408 #error Stop_Compiling_MSC_Compiler_BUG_SHA1
444 #pragma message("== SHA1 code: MSC compiler : failure-check code was inserted") 409#endif
445
446 UInt32 state[5] = { 0, 1, 2, 3, 4 } ;
447 Byte data[64];
448 unsigned i;
449 for (i = 0; i < sizeof(data); i += 2)
450 {
451 data[i ] = (Byte)(i);
452 data[i + 1] = (Byte)(i + 1);
453 }
454
455 Sha1_UpdateBlocks_HW(state, data, sizeof(data) / 64);
456
457 if ( state[0] != 0x9acd7297
458 || state[1] != 0x4624d898
459 || state[2] != 0x0bf079f0
460 || state[3] != 0x031e61b3
461 || state[4] != 0x8323fe20)
462 {
463 // printf("\n========== SHA-1 hardware version failure ======== \n");
464 }
465 else
466 #endif
467 { 410 {
468 f = f_hw = Sha1_UpdateBlocks_HW; 411 f = f_hw = Sha1_UpdateBlocks_HW;
469 } 412 }
470 } 413 }
471 g_SHA1_FUNC_UPDATE_BLOCKS = f; 414 g_SHA1_FUNC_UPDATE_BLOCKS = f;
472 g_SHA1_FUNC_UPDATE_BLOCKS_HW = f_hw; 415 g_SHA1_FUNC_UPDATE_BLOCKS_HW = f_hw;
473 #endif 416#endif
474} 417}
475 418
476#undef kNumW 419#undef kNumW
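
Editorial note: the rewritten Sha1_Final keeps the classic Merkle-Damgard padding: append 0x80, zero-fill up to the last 8 bytes of a block, then store the message bit count big-endian. A minimal sketch (illustrative helper, not part of the diff) of the resulting block-count arithmetic:

    #include <stddef.h>
    /* number of 64-byte SHA-1 blocks consumed for an n-byte message:
       +1 for the 0x80 marker, +8 for the bit-count field, rounded up */
    static size_t Sha1_NumPaddedBlocks(size_t n)
    {
      return (n + 1 + 8 + 63) >> 6;
    }

For example, a 55-byte message still fits in one block (55 + 9 = 64), while 56 bytes needs two.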
diff --git a/C/Sha1.h b/C/Sha1.h
index fecd9d3..529be4d 100644
--- a/C/Sha1.h
+++ b/C/Sha1.h
@@ -1,5 +1,5 @@
1/* Sha1.h -- SHA-1 Hash 1/* Sha1.h -- SHA-1 Hash
22023-04-02 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_SHA1_H 4#ifndef ZIP7_INC_SHA1_H
5#define ZIP7_INC_SHA1_H 5#define ZIP7_INC_SHA1_H
@@ -14,6 +14,9 @@ EXTERN_C_BEGIN
14#define SHA1_BLOCK_SIZE (SHA1_NUM_BLOCK_WORDS * 4) 14#define SHA1_BLOCK_SIZE (SHA1_NUM_BLOCK_WORDS * 4)
15#define SHA1_DIGEST_SIZE (SHA1_NUM_DIGEST_WORDS * 4) 15#define SHA1_DIGEST_SIZE (SHA1_NUM_DIGEST_WORDS * 4)
16 16
17
18
19
17typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte *data, size_t numBlocks); 20typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte *data, size_t numBlocks);
18 21
19/* 22/*
@@ -32,9 +35,16 @@ typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte
32 35
33typedef struct 36typedef struct
34{ 37{
35 SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks; 38 union
36 UInt64 count; 39 {
37 UInt64 _pad_2[2]; 40 struct
41 {
42 SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
43 UInt64 count;
44 } vars;
45 UInt64 _pad_64bit[4];
46 void *_pad_align_ptr[2];
47 } v;
38 UInt32 state[SHA1_NUM_DIGEST_WORDS]; 48 UInt32 state[SHA1_NUM_DIGEST_WORDS];
39 UInt32 _pad_3[3]; 49 UInt32 _pad_3[3];
40 Byte buffer[SHA1_BLOCK_SIZE]; 50 Byte buffer[SHA1_BLOCK_SIZE];
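
Editorial note: the union that replaces the old func_UpdateBlocks/count/_pad_2 fields reserves the same 32 bytes whether pointers are 4 or 8 bytes wide, so state and buffer stay at fixed offsets across targets. A hedged compile-time check (not from the diff; assumes the usual ILP32/LP64 layouts):

    #include <stddef.h>
    #include "Sha1.h"
    /* the union size is pinned by _pad_64bit[4] = 32 bytes */
    typedef char assert_v_size[sizeof(((CSha1 *)0)->v) == 32 ? 1 : -1];
    /* 32 (v) + 20 (state) + 12 (_pad_3) puts buffer at offset 64 */
    typedef char assert_buf_off[offsetof(CSha1, buffer) == 64 ? 1 : -1];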
diff --git a/C/Sha1Opt.c b/C/Sha1Opt.c
index 4e835f1..8738b94 100644
--- a/C/Sha1Opt.c
+++ b/C/Sha1Opt.c
@@ -1,18 +1,11 @@
1/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions 1/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions
22024-03-01 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5#include "Compiler.h" 5#include "Compiler.h"
6#include "CpuArch.h" 6#include "CpuArch.h"
7 7
8#if defined(_MSC_VER)
9#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
10// #define USE_MY_MM
11#endif
12#endif
13
14// #define Z7_USE_HW_SHA_STUB // for debug 8// #define Z7_USE_HW_SHA_STUB // for debug
15
16#ifdef MY_CPU_X86_OR_AMD64 9#ifdef MY_CPU_X86_OR_AMD64
17 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check 10 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
18 #define USE_HW_SHA 11 #define USE_HW_SHA
@@ -20,19 +13,14 @@
20 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ 13 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
21 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) 14 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
22 #define USE_HW_SHA 15 #define USE_HW_SHA
23 #if !defined(_INTEL_COMPILER) 16 #if !defined(__INTEL_COMPILER)
24 // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) 17 // icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
25 #if !defined(__SHA__) || !defined(__SSSE3__) 18 #if !defined(__SHA__) || !defined(__SSSE3__)
26 #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) 19 #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
27 #endif 20 #endif
28 #endif 21 #endif
29 #elif defined(_MSC_VER) 22 #elif defined(_MSC_VER)
30 #ifdef USE_MY_MM 23 #if (_MSC_VER >= 1900)
31 #define USE_VER_MIN 1300
32 #else
33 #define USE_VER_MIN 1900
34 #endif
35 #if (_MSC_VER >= USE_VER_MIN)
36 #define USE_HW_SHA 24 #define USE_HW_SHA
37 #else 25 #else
38 #define Z7_USE_HW_SHA_STUB 26 #define Z7_USE_HW_SHA_STUB
@@ -47,23 +35,20 @@
47 35
48// #pragma message("Sha1 HW") 36// #pragma message("Sha1 HW")
49 37
38
39
40
50// sse/sse2/ssse3: 41// sse/sse2/ssse3:
51#include <tmmintrin.h> 42#include <tmmintrin.h>
52// sha*: 43// sha*:
53#include <immintrin.h> 44#include <immintrin.h>
54 45
55#if defined (__clang__) && defined(_MSC_VER) 46#if defined (__clang__) && defined(_MSC_VER)
56 // #if !defined(__SSSE3__)
57 // #endif
58 #if !defined(__SHA__) 47 #if !defined(__SHA__)
59 #include <shaintrin.h> 48 #include <shaintrin.h>
60 #endif 49 #endif
61#else 50#else
62 51
63#ifdef USE_MY_MM
64#include "My_mm.h"
65#endif
66
67#endif 52#endif
68 53
69/* 54/*
@@ -84,7 +69,6 @@ SHA:
84 _mm_sha1* 69 _mm_sha1*
85*/ 70*/
86 71
87
88#define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src); 72#define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src);
89#define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask); 73#define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask);
90#define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask); 74#define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask);
@@ -99,11 +83,12 @@ SHA:
99#define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src); 83#define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src);
100#define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src); 84#define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src);
101 85
102
103#define LOAD_SHUFFLE(m, k) \ 86#define LOAD_SHUFFLE(m, k) \
104 m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ 87 m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
105 SHUFFLE_EPI8(m, mask) \ 88 SHUFFLE_EPI8(m, mask) \
106 89
90#define NNN(m0, m1, m2, m3)
91
107#define SM1(m0, m1, m2, m3) \ 92#define SM1(m0, m1, m2, m3) \
108 SHA1_MSG1(m0, m1) \ 93 SHA1_MSG1(m0, m1) \
109 94
@@ -116,35 +101,19 @@ SHA:
116 SM1(m0, m1, m2, m3) \ 101 SM1(m0, m1, m2, m3) \
117 SHA1_MSG2(m3, m2) \ 102 SHA1_MSG2(m3, m2) \
118 103
119#define NNN(m0, m1, m2, m3) 104#define R4(k, m0, m1, m2, m3, e0, e1, OP) \
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137#define R4(k, e0, e1, m0, m1, m2, m3, OP) \
138 e1 = abcd; \ 105 e1 = abcd; \
139 SHA1_RND4(abcd, e0, (k) / 5) \ 106 SHA1_RND4(abcd, e0, (k) / 5) \
140 SHA1_NEXTE(e1, m1) \ 107 SHA1_NEXTE(e1, m1) \
141 OP(m0, m1, m2, m3) \ 108 OP(m0, m1, m2, m3) \
142 109
110
111
143#define R16(k, mx, OP0, OP1, OP2, OP3) \ 112#define R16(k, mx, OP0, OP1, OP2, OP3) \
144 R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \ 113 R4 ( (k)*4+0, m0,m1,m2,m3, e0,e1, OP0 ) \
145 R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \ 114 R4 ( (k)*4+1, m1,m2,m3,m0, e1,e0, OP1 ) \
146 R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \ 115 R4 ( (k)*4+2, m2,m3,m0,m1, e0,e1, OP2 ) \
147 R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 ) \ 116 R4 ( (k)*4+3, m3,mx,m1,m2, e1,e0, OP3 ) \
148 117
149#define PREPARE_STATE \ 118#define PREPARE_STATE \
150 SHUFFLE_EPI32 (abcd, 0x1B) \ 119 SHUFFLE_EPI32 (abcd, 0x1B) \
@@ -162,8 +131,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
162{ 131{
163 const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); 132 const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
164 133
165 __m128i abcd, e0;
166 134
135 __m128i abcd, e0;
136
167 if (numBlocks == 0) 137 if (numBlocks == 0)
168 return; 138 return;
169 139
@@ -204,7 +174,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
204 PREPARE_STATE 174 PREPARE_STATE
205 175
206 _mm_storeu_si128((__m128i *) (void *) state, abcd); 176 _mm_storeu_si128((__m128i *) (void *) state, abcd);
207 *(state+4) = (UInt32)_mm_cvtsi128_si32(e0); 177 *(state + 4) = (UInt32)_mm_cvtsi128_si32(e0);
208} 178}
209 179
210#endif // USE_HW_SHA 180#endif // USE_HW_SHA
@@ -262,22 +232,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
262 #define _ARM_USE_NEW_NEON_INTRINSICS 232 #define _ARM_USE_NEW_NEON_INTRINSICS
263#endif 233#endif
264 234
265
266
267
268
269#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) 235#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
270#include <arm64_neon.h> 236#include <arm64_neon.h>
271#else 237#else
272 238
273
274
275
276
277
278
279
280
281#if defined(__clang__) && __clang_major__ < 16 239#if defined(__clang__) && __clang_major__ < 16
282#if !defined(__ARM_FEATURE_SHA2) && \ 240#if !defined(__ARM_FEATURE_SHA2) && \
283 !defined(__ARM_FEATURE_CRYPTO) 241 !defined(__ARM_FEATURE_CRYPTO)
@@ -329,26 +287,37 @@ typedef uint32x4_t v128;
329#endif 287#endif
330 288
331#ifdef MY_CPU_BE 289#ifdef MY_CPU_BE
332 #define MY_rev32_for_LE(x) 290 #define MY_rev32_for_LE(x) x
333#else 291#else
334 #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) 292 #define MY_rev32_for_LE(x) vrev32q_u8(x)
335#endif 293#endif
336 294
337#define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) 295#define LOAD_128_32(_p) vld1q_u32(_p)
338#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) 296#define LOAD_128_8(_p) vld1q_u8 (_p)
297#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
339 298
340#define LOAD_SHUFFLE(m, k) \ 299#define LOAD_SHUFFLE(m, k) \
341 m = LOAD_128((data + (k) * 16)); \ 300 m = vreinterpretq_u32_u8( \
342 MY_rev32_for_LE(m); \ 301 MY_rev32_for_LE( \
343 302 LOAD_128_8(data + (k) * 16))); \
344#define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3) 303
345#define SU1(dest, src) dest = vsha1su1q_u32(dest, src) 304#define N0(dest, src2, src3)
305#define N1(dest, src)
306#define U0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3);
307#define U1(dest, src) dest = vsha1su1q_u32(dest, src);
346#define C(e) abcd = vsha1cq_u32(abcd, e, t) 308#define C(e) abcd = vsha1cq_u32(abcd, e, t)
347#define P(e) abcd = vsha1pq_u32(abcd, e, t) 309#define P(e) abcd = vsha1pq_u32(abcd, e, t)
348#define M(e) abcd = vsha1mq_u32(abcd, e, t) 310#define M(e) abcd = vsha1mq_u32(abcd, e, t)
349#define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) 311#define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0))
350#define T(m, c) t = vaddq_u32(m, c) 312#define T(m, c) t = vaddq_u32(m, c)
351 313
314#define R16(d0,d1,d2,d3, f0,z0, f1,z1, f2,z2, f3,z3, w0,w1,w2,w3) \
315 T(m0, d0); f0(m3, m0, m1) z0(m2, m1) H(e1); w0(e0); \
316 T(m1, d1); f1(m0, m1, m2) z1(m3, m2) H(e0); w1(e1); \
317 T(m2, d2); f2(m1, m2, m3) z2(m0, m3) H(e1); w2(e0); \
318 T(m3, d3); f3(m2, m3, m0) z3(m1, m0) H(e0); w3(e1); \
319
320
352void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); 321void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
353#ifdef ATTRIB_SHA 322#ifdef ATTRIB_SHA
354ATTRIB_SHA 323ATTRIB_SHA
@@ -367,7 +336,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
367 c2 = vdupq_n_u32(0x8f1bbcdc); 336 c2 = vdupq_n_u32(0x8f1bbcdc);
368 c3 = vdupq_n_u32(0xca62c1d6); 337 c3 = vdupq_n_u32(0xca62c1d6);
369 338
370 abcd = LOAD_128(&state[0]); 339 abcd = LOAD_128_32(&state[0]);
371 e0 = state[4]; 340 e0 = state[4];
372 341
373 do 342 do
@@ -385,26 +354,11 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
385 LOAD_SHUFFLE (m2, 2) 354 LOAD_SHUFFLE (m2, 2)
386 LOAD_SHUFFLE (m3, 3) 355 LOAD_SHUFFLE (m3, 3)
387 356
388 T(m0, c0); H(e1); C(e0); 357 R16 ( c0,c0,c0,c0, N0,N1, U0,N1, U0,U1, U0,U1, C,C,C,C )
389 T(m1, c0); SU0(m0, m1, m2); H(e0); C(e1); 358 R16 ( c0,c1,c1,c1, U0,U1, U0,U1, U0,U1, U0,U1, C,P,P,P )
390 T(m2, c0); SU0(m1, m2, m3); SU1(m0, m3); H(e1); C(e0); 359 R16 ( c1,c1,c2,c2, U0,U1, U0,U1, U0,U1, U0,U1, P,P,M,M )
391 T(m3, c0); SU0(m2, m3, m0); SU1(m1, m0); H(e0); C(e1); 360 R16 ( c2,c2,c2,c3, U0,U1, U0,U1, U0,U1, U0,U1, M,M,M,P )
392 T(m0, c0); SU0(m3, m0, m1); SU1(m2, m1); H(e1); C(e0); 361 R16 ( c3,c3,c3,c3, U0,U1, N0,U1, N0,N1, N0,N1, P,P,P,P )
393 T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1);
394 T(m2, c1); SU0(m1, m2, m3); SU1(m0, m3); H(e1); P(e0);
395 T(m3, c1); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1);
396 T(m0, c1); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0);
397 T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1);
398 T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0);
399 T(m3, c2); SU0(m2, m3, m0); SU1(m1, m0); H(e0); M(e1);
400 T(m0, c2); SU0(m3, m0, m1); SU1(m2, m1); H(e1); M(e0);
401 T(m1, c2); SU0(m0, m1, m2); SU1(m3, m2); H(e0); M(e1);
402 T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0);
403 T(m3, c3); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1);
404 T(m0, c3); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0);
405 T(m1, c3); SU1(m3, m2); H(e0); P(e1);
406 T(m2, c3); H(e1); P(e0);
407 T(m3, c3); H(e0); P(e1);
408 362
409 abcd = vaddq_u32(abcd, abcd_save); 363 abcd = vaddq_u32(abcd, abcd_save);
410 e0 += e0_save; 364 e0 += e0_save;
@@ -413,7 +367,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
413 } 367 }
414 while (--numBlocks); 368 while (--numBlocks);
415 369
416 STORE_128(&state[0], abcd); 370 STORE_128_32(&state[0], abcd);
417 state[4] = e0; 371 state[4] = e0;
418} 372}
419 373
@@ -421,13 +375,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
421 375
422#endif // MY_CPU_ARM_OR_ARM64 376#endif // MY_CPU_ARM_OR_ARM64
423 377
424
425#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) 378#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
426// #error Stop_Compiling_UNSUPPORTED_SHA 379// #error Stop_Compiling_UNSUPPORTED_SHA
427// #include <stdlib.h> 380// #include <stdlib.h>
428
429
430
431// #include "Sha1.h" 381// #include "Sha1.h"
432// #if defined(_MSC_VER) 382// #if defined(_MSC_VER)
433#pragma message("Sha1 HW-SW stub was used") 383#pragma message("Sha1 HW-SW stub was used")
@@ -447,8 +397,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
447} 397}
448#endif 398#endif
449 399
450#undef SU0 400#undef U0
451#undef SU1 401#undef U1
402#undef N0
403#undef N1
452#undef C 404#undef C
453#undef P 405#undef P
454#undef M 406#undef M
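
Editorial note: the five R16 invocations in the NEON path are a re-rolled form of the 20 explicit round groups deleted on the left. Expanding the first invocation by hand with the macro definitions above (N0/N1 are no-ops) reproduces the removed code exactly:

    /* R16 ( c0,c0,c0,c0, N0,N1, U0,N1, U0,U1, U0,U1, C,C,C,C ) expands to: */
    T(m0, c0);                            H(e1); C(e0);
    T(m1, c0); U0(m0, m1, m2)             H(e0); C(e1);
    T(m2, c0); U0(m1, m2, m3) U1(m0, m3)  H(e1); C(e0);
    T(m3, c0); U0(m2, m3, m0) U1(m1, m0)  H(e0); C(e1);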
diff --git a/C/Sha256.c b/C/Sha256.c
index 14d3be9..ea7ed8e 100644
--- a/C/Sha256.c
+++ b/C/Sha256.c
@@ -1,18 +1,14 @@
1/* Sha256.c -- SHA-256 Hash 1/* Sha256.c -- SHA-256 Hash
22024-03-01 : Igor Pavlov : Public domain 2: Igor Pavlov : Public domain
3This code is based on public domain code from Wei Dai's Crypto++ library. */ 3This code is based on public domain code from Wei Dai's Crypto++ library. */
4 4
5#include "Precomp.h" 5#include "Precomp.h"
6 6
7#include <string.h> 7#include <string.h>
8 8
9#include "CpuArch.h"
10#include "RotateDefs.h"
11#include "Sha256.h" 9#include "Sha256.h"
12 10#include "RotateDefs.h"
13#if defined(_MSC_VER) && (_MSC_VER < 1900) 11#include "CpuArch.h"
14// #define USE_MY_MM
15#endif
16 12
17#ifdef MY_CPU_X86_OR_AMD64 13#ifdef MY_CPU_X86_OR_AMD64
18 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ 14 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
@@ -56,7 +52,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
56 static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks; 52 static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks;
57 static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW; 53 static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW;
58 54
59 #define SHA256_UPDATE_BLOCKS(p) p->func_UpdateBlocks 55 #define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
60#else 56#else
61 #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks 57 #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks
62#endif 58#endif
@@ -85,7 +81,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
85 return False; 81 return False;
86 #endif 82 #endif
87 83
88 p->func_UpdateBlocks = func; 84 p->v.vars.func_UpdateBlocks = func;
89 return True; 85 return True;
90} 86}
91 87
@@ -111,7 +107,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
111 107
112void Sha256_InitState(CSha256 *p) 108void Sha256_InitState(CSha256 *p)
113{ 109{
114 p->count = 0; 110 p->v.vars.count = 0;
115 p->state[0] = 0x6a09e667; 111 p->state[0] = 0x6a09e667;
116 p->state[1] = 0xbb67ae85; 112 p->state[1] = 0xbb67ae85;
117 p->state[2] = 0x3c6ef372; 113 p->state[2] = 0x3c6ef372;
@@ -122,9 +118,16 @@ void Sha256_InitState(CSha256 *p)
122 p->state[7] = 0x5be0cd19; 118 p->state[7] = 0x5be0cd19;
123} 119}
124 120
121
122
123
124
125
126
127
125void Sha256_Init(CSha256 *p) 128void Sha256_Init(CSha256 *p)
126{ 129{
127 p->func_UpdateBlocks = 130 p->v.vars.func_UpdateBlocks =
128 #ifdef Z7_COMPILER_SHA256_SUPPORTED 131 #ifdef Z7_COMPILER_SHA256_SUPPORTED
129 g_SHA256_FUNC_UPDATE_BLOCKS; 132 g_SHA256_FUNC_UPDATE_BLOCKS;
130 #else 133 #else
@@ -133,10 +136,10 @@ void Sha256_Init(CSha256 *p)
133 Sha256_InitState(p); 136 Sha256_InitState(p);
134} 137}
135 138
136#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) 139#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x,22))
137#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25)) 140#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x,25))
138#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) 141#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3))
139#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) 142#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >>10))
140 143
141#define Ch(x,y,z) (z^(x&(y^z))) 144#define Ch(x,y,z) (z^(x&(y^z)))
142#define Maj(x,y,z) ((x&y)|(z&(x|y))) 145#define Maj(x,y,z) ((x&y)|(z&(x|y)))
@@ -224,12 +227,10 @@ void Sha256_Init(CSha256 *p)
224 227
225#endif 228#endif
226 229
227// static
228extern MY_ALIGN(64)
229const UInt32 SHA256_K_ARRAY[64];
230 230
231MY_ALIGN(64) 231extern
232const UInt32 SHA256_K_ARRAY[64] = { 232MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64];
233MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = {
233 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 234 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
234 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 235 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
235 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 236 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@@ -248,27 +249,29 @@ const UInt32 SHA256_K_ARRAY[64] = {
248 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 249 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
249}; 250};
250 251
251#define K SHA256_K_ARRAY
252 252
253 253
254
255
256#define K SHA256_K_ARRAY
257
254Z7_NO_INLINE 258Z7_NO_INLINE
255void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks) 259void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks)
256{ 260{
257 UInt32 W 261 UInt32 W
258 #ifdef Z7_SHA256_BIG_W 262#ifdef Z7_SHA256_BIG_W
259 [64]; 263 [64];
260 #else 264#else
261 [16]; 265 [16];
262 #endif 266#endif
263
264 unsigned j; 267 unsigned j;
265
266 UInt32 a,b,c,d,e,f,g,h; 268 UInt32 a,b,c,d,e,f,g,h;
267 269#if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
268 #if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
269 UInt32 tmp; 270 UInt32 tmp;
270 #endif 271#endif
271 272
273 if (numBlocks == 0) return;
274
272 a = state[0]; 275 a = state[0];
273 b = state[1]; 276 b = state[1];
274 c = state[2]; 277 c = state[2];
@@ -278,7 +281,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
278 g = state[6]; 281 g = state[6];
279 h = state[7]; 282 h = state[7];
280 283
281 while (numBlocks) 284 do
282 { 285 {
283 286
284 for (j = 0; j < 16; j += STEP_PRE) 287 for (j = 0; j < 16; j += STEP_PRE)
@@ -352,19 +355,11 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
352 g += state[6]; state[6] = g; 355 g += state[6]; state[6] = g;
353 h += state[7]; state[7] = h; 356 h += state[7]; state[7] = h;
354 357
355 data += 64; 358 data += SHA256_BLOCK_SIZE;
356 numBlocks--;
357 } 359 }
358 360 while (--numBlocks);
359 /* Wipe variables */
360 /* memset(W, 0, sizeof(W)); */
361} 361}
362 362
363#undef S0
364#undef S1
365#undef s0
366#undef s1
367#undef K
368 363
369#define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) 364#define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
370 365
@@ -372,20 +367,15 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
372{ 367{
373 if (size == 0) 368 if (size == 0)
374 return; 369 return;
375
376 { 370 {
377 unsigned pos = (unsigned)p->count & 0x3F; 371 const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
378 unsigned num; 372 const unsigned num = SHA256_BLOCK_SIZE - pos;
379 373 p->v.vars.count += size;
380 p->count += size;
381
382 num = 64 - pos;
383 if (num > size) 374 if (num > size)
384 { 375 {
385 memcpy(p->buffer + pos, data, size); 376 memcpy(p->buffer + pos, data, size);
386 return; 377 return;
387 } 378 }
388
389 if (pos != 0) 379 if (pos != 0)
390 { 380 {
391 size -= num; 381 size -= num;
@@ -395,9 +385,10 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
395 } 385 }
396 } 386 }
397 { 387 {
398 size_t numBlocks = size >> 6; 388 const size_t numBlocks = size >> 6;
389 // if (numBlocks)
399 SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks); 390 SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
400 size &= 0x3F; 391 size &= SHA256_BLOCK_SIZE - 1;
401 if (size == 0) 392 if (size == 0)
402 return; 393 return;
403 data += (numBlocks << 6); 394 data += (numBlocks << 6);
@@ -408,82 +399,69 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
408 399
409void Sha256_Final(CSha256 *p, Byte *digest) 400void Sha256_Final(CSha256 *p, Byte *digest)
410{ 401{
411 unsigned pos = (unsigned)p->count & 0x3F; 402 unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
412 unsigned i;
413
414 p->buffer[pos++] = 0x80; 403 p->buffer[pos++] = 0x80;
415 404 if (pos > (SHA256_BLOCK_SIZE - 4 * 2))
416 if (pos > (64 - 8))
417 { 405 {
418 while (pos != 64) { p->buffer[pos++] = 0; } 406 while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; }
419 // memset(&p->buf.buffer[pos], 0, 64 - pos); 407 // memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos);
420 Sha256_UpdateBlock(p); 408 Sha256_UpdateBlock(p);
421 pos = 0; 409 pos = 0;
422 } 410 }
423 411 memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos);
424 /*
425 if (pos & 3)
426 { 412 {
427 p->buffer[pos] = 0; 413 const UInt64 numBits = p->v.vars.count << 3;
428 p->buffer[pos + 1] = 0; 414 SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32))
429 p->buffer[pos + 2] = 0; 415 SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits))
430 pos += 3;
431 pos &= ~3;
432 } 416 }
417 Sha256_UpdateBlock(p);
418#if 1 && defined(MY_CPU_BE)
419 memcpy(digest, p->state, SHA256_DIGEST_SIZE);
420#else
433 { 421 {
434 for (; pos < 64 - 8; pos += 4) 422 unsigned i;
435 *(UInt32 *)(&p->buffer[pos]) = 0; 423 for (i = 0; i < 8; i += 2)
424 {
425 const UInt32 v0 = p->state[i];
426 const UInt32 v1 = p->state[(size_t)i + 1];
427 SetBe32(digest , v0)
428 SetBe32(digest + 4, v1)
429 digest += 4 * 2;
430 }
436 } 431 }
437 */
438 432
439 memset(&p->buffer[pos], 0, (64 - 8) - pos);
440 433
441 {
442 UInt64 numBits = (p->count << 3);
443 SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32))
444 SetBe32(p->buffer + 64 - 4, (UInt32)(numBits))
445 }
446
447 Sha256_UpdateBlock(p);
448 434
449 for (i = 0; i < 8; i += 2) 435
450 { 436#endif
451 UInt32 v0 = p->state[i];
452 UInt32 v1 = p->state[(size_t)i + 1];
453 SetBe32(digest , v0)
454 SetBe32(digest + 4, v1)
455 digest += 8;
456 }
457
458 Sha256_InitState(p); 437 Sha256_InitState(p);
459} 438}
460 439
461 440
462void Sha256Prepare(void) 441void Sha256Prepare(void)
463{ 442{
464 #ifdef Z7_COMPILER_SHA256_SUPPORTED 443#ifdef Z7_COMPILER_SHA256_SUPPORTED
465 SHA256_FUNC_UPDATE_BLOCKS f, f_hw; 444 SHA256_FUNC_UPDATE_BLOCKS f, f_hw;
466 f = Sha256_UpdateBlocks; 445 f = Sha256_UpdateBlocks;
467 f_hw = NULL; 446 f_hw = NULL;
468 #ifdef MY_CPU_X86_OR_AMD64 447#ifdef MY_CPU_X86_OR_AMD64
469 #ifndef USE_MY_MM
470 if (CPU_IsSupported_SHA() 448 if (CPU_IsSupported_SHA()
471 && CPU_IsSupported_SSSE3() 449 && CPU_IsSupported_SSSE3()
472 // && CPU_IsSupported_SSE41()
473 ) 450 )
474 #endif 451#else
475 #else
476 if (CPU_IsSupported_SHA2()) 452 if (CPU_IsSupported_SHA2())
477 #endif 453#endif
478 { 454 {
479 // printf("\n========== HW SHA256 ======== \n"); 455 // printf("\n========== HW SHA256 ======== \n");
480 f = f_hw = Sha256_UpdateBlocks_HW; 456 f = f_hw = Sha256_UpdateBlocks_HW;
481 } 457 }
482 g_SHA256_FUNC_UPDATE_BLOCKS = f; 458 g_SHA256_FUNC_UPDATE_BLOCKS = f;
483 g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw; 459 g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw;
484 #endif 460#endif
485} 461}
486 462
463#undef U64C
464#undef K
487#undef S0 465#undef S0
488#undef S1 466#undef S1
489#undef s0 467#undef s0
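
Editorial note: the public API is unchanged by the union rework, so existing callers compile as before. A minimal usage sketch against this API ("abc" is the standard FIPS 180 test vector):

    #include <stdio.h>
    #include "Sha256.h"
    int main(void)
    {
      CSha256 ctx;
      Byte digest[SHA256_DIGEST_SIZE];
      unsigned i;
      Sha256Prepare();   /* selects the HW update function if available */
      Sha256_Init(&ctx);
      Sha256_Update(&ctx, (const Byte *)"abc", 3);
      Sha256_Final(&ctx, digest);
      for (i = 0; i < SHA256_DIGEST_SIZE; i++)
        printf("%02x", digest[i]);
      printf("\n");      /* expect ba7816bf...f20015ad */
      return 0;
    }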
diff --git a/C/Sha256.h b/C/Sha256.h
index 9e04223..75329cd 100644
--- a/C/Sha256.h
+++ b/C/Sha256.h
@@ -1,5 +1,5 @@
1/* Sha256.h -- SHA-256 Hash 1/* Sha256.h -- SHA-256 Hash
22023-04-02 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_SHA256_H 4#ifndef ZIP7_INC_SHA256_H
5#define ZIP7_INC_SHA256_H 5#define ZIP7_INC_SHA256_H
@@ -14,6 +14,9 @@ EXTERN_C_BEGIN
14#define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4) 14#define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4)
15#define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4) 15#define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4)
16 16
17
18
19
17typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks); 20typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks);
18 21
19/* 22/*
@@ -32,9 +35,16 @@ typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byt
32 35
33typedef struct 36typedef struct
34{ 37{
35 SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; 38 union
36 UInt64 count; 39 {
37 UInt64 _pad_2[2]; 40 struct
41 {
42 SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
43 UInt64 count;
44 } vars;
45 UInt64 _pad_64bit[4];
46 void *_pad_align_ptr[2];
47 } v;
38 UInt32 state[SHA256_NUM_DIGEST_WORDS]; 48 UInt32 state[SHA256_NUM_DIGEST_WORDS];
39 49
40 Byte buffer[SHA256_BLOCK_SIZE]; 50 Byte buffer[SHA256_BLOCK_SIZE];
diff --git a/C/Sha256Opt.c b/C/Sha256Opt.c
index eb38166..1c6b50f 100644
--- a/C/Sha256Opt.c
+++ b/C/Sha256Opt.c
@@ -1,18 +1,11 @@
1/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions 1/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
22024-03-01 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5#include "Compiler.h" 5#include "Compiler.h"
6#include "CpuArch.h" 6#include "CpuArch.h"
7 7
8#if defined(_MSC_VER)
9#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
10// #define USE_MY_MM
11#endif
12#endif
13
14// #define Z7_USE_HW_SHA_STUB // for debug 8// #define Z7_USE_HW_SHA_STUB // for debug
15
16#ifdef MY_CPU_X86_OR_AMD64 9#ifdef MY_CPU_X86_OR_AMD64
17 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check 10 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
18 #define USE_HW_SHA 11 #define USE_HW_SHA
@@ -20,19 +13,14 @@
20 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ 13 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
21 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) 14 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
22 #define USE_HW_SHA 15 #define USE_HW_SHA
23 #if !defined(_INTEL_COMPILER) 16 #if !defined(__INTEL_COMPILER)
24 // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) 17 // icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
25 #if !defined(__SHA__) || !defined(__SSSE3__) 18 #if !defined(__SHA__) || !defined(__SSSE3__)
26 #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) 19 #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
27 #endif 20 #endif
28 #endif 21 #endif
29 #elif defined(_MSC_VER) 22 #elif defined(_MSC_VER)
30 #ifdef USE_MY_MM 23 #if (_MSC_VER >= 1900)
31 #define USE_VER_MIN 1300
32 #else
33 #define USE_VER_MIN 1900
34 #endif
35 #if (_MSC_VER >= USE_VER_MIN)
36 #define USE_HW_SHA 24 #define USE_HW_SHA
37 #else 25 #else
38 #define Z7_USE_HW_SHA_STUB 26 #define Z7_USE_HW_SHA_STUB
@@ -47,23 +35,20 @@
47 35
48// #pragma message("Sha256 HW") 36// #pragma message("Sha256 HW")
49 37
38
39
40
50// sse/sse2/ssse3: 41// sse/sse2/ssse3:
51#include <tmmintrin.h> 42#include <tmmintrin.h>
52// sha*: 43// sha*:
53#include <immintrin.h> 44#include <immintrin.h>
54 45
55#if defined (__clang__) && defined(_MSC_VER) 46#if defined (__clang__) && defined(_MSC_VER)
56 // #if !defined(__SSSE3__)
57 // #endif
58 #if !defined(__SHA__) 47 #if !defined(__SHA__)
59 #include <shaintrin.h> 48 #include <shaintrin.h>
60 #endif 49 #endif
61#else 50#else
62 51
63#ifdef USE_MY_MM
64#include "My_mm.h"
65#endif
66
67#endif 52#endif
68 53
69/* 54/*
@@ -91,60 +76,44 @@ SHA:
91extern 76extern
92MY_ALIGN(64) 77MY_ALIGN(64)
93const UInt32 SHA256_K_ARRAY[64]; 78const UInt32 SHA256_K_ARRAY[64];
94
95#define K SHA256_K_ARRAY 79#define K SHA256_K_ARRAY
96 80
97 81
98#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); 82#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
99#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); 83#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src);
100#define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); 84#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);
101
102 85
103#define LOAD_SHUFFLE(m, k) \ 86#define LOAD_SHUFFLE(m, k) \
104 m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ 87 m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
105 m = _mm_shuffle_epi8(m, mask); \ 88 m = _mm_shuffle_epi8(m, mask); \
106 89
107#define SM1(g0, g1, g2, g3) \ 90#define NNN(m0, m1, m2, m3)
108 SHA256_MSG1(g3, g0); \
109 91
110#define SM2(g0, g1, g2, g3) \ 92#define SM1(m1, m2, m3, m0) \
111 tmp = _mm_alignr_epi8(g1, g0, 4); \ 93 SHA256_MSG1(m0, m1); \
112 ADD_EPI32(g2, tmp) \
113 SHA25G_MSG2(g2, g1); \
114
115// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
116// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)
117
118
119#define NNN(g0, g1, g2, g3)
120 94
95#define SM2(m2, m3, m0, m1) \
96 ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \
97 SHA256_MSG2(m0, m3); \
121 98
122#define RND2(t0, t1) \ 99#define RND2(t0, t1) \
123 t0 = _mm_sha256rnds2_epu32(t0, t1, msg); 100 t0 = _mm_sha256rnds2_epu32(t0, t1, msg);
124 101
125#define RND2_0(m, k) \
126 msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
127 RND2(state0, state1); \
128 msg = _mm_shuffle_epi32(msg, 0x0E); \
129 102
130 103
131#define RND2_1 \ 104#define R4(k, m0, m1, m2, m3, OP0, OP1) \
105 msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \
106 RND2(state0, state1); \
107 msg = _mm_shuffle_epi32(msg, 0x0E); \
108 OP0(m0, m1, m2, m3) \
132 RND2(state1, state0); \ 109 RND2(state1, state0); \
133 110 OP1(m0, m1, m2, m3) \
134
135// We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2
136
137#define R4(k, g0, g1, g2, g3, OP0, OP1) \
138 RND2_0(g0, k) \
139 OP0(g0, g1, g2, g3) \
140 RND2_1 \
141 OP1(g0, g1, g2, g3) \
142 111
143#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ 112#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
144 R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ 113 R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
145 R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ 114 R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
146 R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ 115 R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
147 R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ 116 R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \
148 117
149#define PREPARE_STATE \ 118#define PREPARE_STATE \
150 tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ 119 tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
@@ -161,8 +130,9 @@ ATTRIB_SHA
161void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) 130void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
162{ 131{
163 const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); 132 const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
164 __m128i tmp; 133
165 __m128i state0, state1; 134
135 __m128i tmp, state0, state1;
166 136
167 if (numBlocks == 0) 137 if (numBlocks == 0)
168 return; 138 return;
@@ -262,22 +232,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
262 #define _ARM_USE_NEW_NEON_INTRINSICS 232 #define _ARM_USE_NEW_NEON_INTRINSICS
263#endif 233#endif
264 234
265
266
267
268
269#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) 235#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
270#include <arm64_neon.h> 236#include <arm64_neon.h>
271#else 237#else
272 238
273
274
275
276
277
278
279
280
281#if defined(__clang__) && __clang_major__ < 16 239#if defined(__clang__) && __clang_major__ < 16
282#if !defined(__ARM_FEATURE_SHA2) && \ 240#if !defined(__ARM_FEATURE_SHA2) && \
283 !defined(__ARM_FEATURE_CRYPTO) 241 !defined(__ARM_FEATURE_CRYPTO)
@@ -324,41 +282,70 @@ typedef uint32x4_t v128;
324// typedef __n128 v128; // MSVC 282// typedef __n128 v128; // MSVC
325 283
326#ifdef MY_CPU_BE 284#ifdef MY_CPU_BE
327 #define MY_rev32_for_LE(x) 285 #define MY_rev32_for_LE(x) x
328#else 286#else
329 #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) 287 #define MY_rev32_for_LE(x) vrev32q_u8(x)
330#endif 288#endif
331 289
332#define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) 290#if 1 // 0 for debug
333#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) 291// for arm32: for some reason it works slower than the direct code
292/*
293for arm32 it generates:
294MSVC-2022, GCC-9:
295 vld1.32 {d18,d19}, [r10]
296 vst1.32 {d4,d5}, [r3]
297 vld1.8 {d20-d21}, [r4]
298there is no align hint (like [r10:128]), so the instruction allows unaligned access
299*/
300#define LOAD_128_32(_p) vld1q_u32(_p)
301#define LOAD_128_8(_p) vld1q_u8 (_p)
302#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
303#else
304/*
305for arm32:
306MSVC-2022:
307 vldm r10,{d18,d19}
308 vstm r3,{d4,d5}
309 does it require strict alignment?
310GCC-9:
311 vld1.64 {d30-d31}, [r0:64]
312 vldr d28, [r0, #16]
313 vldr d29, [r0, #24]
314 vst1.64 {d30-d31}, [r0:64]
315 vstr d28, [r0, #16]
316 vstr d29, [r0, #24]
317there is an align hint [r0:64], so it seems to require 64-bit alignment.
318*/
319#define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p))
320#define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p))
321#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v)
322#endif
334 323
335#define LOAD_SHUFFLE(m, k) \ 324#define LOAD_SHUFFLE(m, k) \
336 m = LOAD_128((data + (k) * 16)); \ 325 m = vreinterpretq_u32_u8( \
337 MY_rev32_for_LE(m); \ 326 MY_rev32_for_LE( \
327 LOAD_128_8(data + (k) * 16))); \
338 328
339// K array must be aligned to at least 16 bytes. 329// K array must be aligned to at least 16 bytes.
340extern 330extern
341MY_ALIGN(64) 331MY_ALIGN(64)
342const UInt32 SHA256_K_ARRAY[64]; 332const UInt32 SHA256_K_ARRAY[64];
343
344#define K SHA256_K_ARRAY 333#define K SHA256_K_ARRAY
345 334
346
347#define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src); 335#define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src);
348#define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); 336#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
349 337
350#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0) 338#define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0)
351#define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1) 339#define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1)
352#define NNN(g0, g1, g2, g3) 340#define NNN(m0, m1, m2, m3)
353 341
354 342#define R4(k, m0, m1, m2, m3, OP0, OP1) \
355#define R4(k, g0, g1, g2, g3, OP0, OP1) \ 343 msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \
356 msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
357 tmp = state0; \ 344 tmp = state0; \
358 state0 = vsha256hq_u32( state0, state1, msg ); \ 345 state0 = vsha256hq_u32( state0, state1, msg ); \
359 state1 = vsha256h2q_u32( state1, tmp, msg ); \ 346 state1 = vsha256h2q_u32( state1, tmp, msg ); \
360 OP0(g0, g1, g2, g3); \ 347 OP0(m0, m1, m2, m3); \
361 OP1(g0, g1, g2, g3); \ 348 OP1(m0, m1, m2, m3); \
362 349
363 350
364#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ 351#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
@@ -379,8 +366,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
379 if (numBlocks == 0) 366 if (numBlocks == 0)
380 return; 367 return;
381 368
382 state0 = LOAD_128(&state[0]); 369 state0 = LOAD_128_32(&state[0]);
383 state1 = LOAD_128(&state[4]); 370 state1 = LOAD_128_32(&state[4]);
384 371
385 do 372 do
386 { 373 {
@@ -408,8 +395,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
408 } 395 }
409 while (--numBlocks); 396 while (--numBlocks);
410 397
411 STORE_128(&state[0], state0); 398 STORE_128_32(&state[0], state0);
412 STORE_128(&state[4], state1); 399 STORE_128_32(&state[4], state1);
413} 400}
414 401
415#endif // USE_HW_SHA 402#endif // USE_HW_SHA
@@ -443,13 +430,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
443#endif 430#endif
444 431
445 432
446
447#undef K 433#undef K
448#undef RND2 434#undef RND2
449#undef RND2_0
450#undef RND2_1
451
452#undef MY_rev32_for_LE 435#undef MY_rev32_for_LE
436
453#undef NNN 437#undef NNN
454#undef LOAD_128 438#undef LOAD_128
455#undef STORE_128 439#undef STORE_128
@@ -457,7 +441,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
457#undef SM1 441#undef SM1
458#undef SM2 442#undef SM2
459 443
460#undef NNN 444
461#undef R4 445#undef R4
462#undef R16 446#undef R16
463#undef PREPARE_STATE 447#undef PREPARE_STATE
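
Editorial note: the switch from direct v128 dereferences to vld1q/vst1q matters because dereferencing a uint32x4_t pointer lets the compiler assume 16-byte alignment, while the data pointer here is only byte-aligned. A standalone sketch of the unaligned-safe load pattern (illustrative name, not from the diff):

    #include <arm_neon.h>
    /* vld1q_u8 carries no alignment hint, so any p is safe here;
       (*(const uint32x4_t *)p) would permit aligned-only codegen */
    static uint32x4_t load_block_words(const uint8_t *p)
    {
      return vreinterpretq_u32_u8(vld1q_u8(p));
    }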
diff --git a/C/Sha3.c b/C/Sha3.c
new file mode 100644
index 0000000..be972d6
--- /dev/null
+++ b/C/Sha3.c
@@ -0,0 +1,359 @@
1/* Sha3.c -- SHA-3 Hash
2: Igor Pavlov : Public domain
3This code is based on public domain code from Wei Dai's Crypto++ library. */
4
5#include "Precomp.h"
6
7#include <string.h>
8
9#include "Sha3.h"
10#include "RotateDefs.h"
11#include "CpuArch.h"
12
13#define U64C(x) UINT64_CONST(x)
14
15static
16MY_ALIGN(64)
17const UInt64 SHA3_K_ARRAY[24] =
18{
19 U64C(0x0000000000000001), U64C(0x0000000000008082),
20 U64C(0x800000000000808a), U64C(0x8000000080008000),
21 U64C(0x000000000000808b), U64C(0x0000000080000001),
22 U64C(0x8000000080008081), U64C(0x8000000000008009),
23 U64C(0x000000000000008a), U64C(0x0000000000000088),
24 U64C(0x0000000080008009), U64C(0x000000008000000a),
25 U64C(0x000000008000808b), U64C(0x800000000000008b),
26 U64C(0x8000000000008089), U64C(0x8000000000008003),
27 U64C(0x8000000000008002), U64C(0x8000000000000080),
28 U64C(0x000000000000800a), U64C(0x800000008000000a),
29 U64C(0x8000000080008081), U64C(0x8000000000008080),
30 U64C(0x0000000080000001), U64C(0x8000000080008008)
31};
32
33void Sha3_Init(CSha3 *p)
34{
35 p->count = 0;
36 memset(p->state, 0, sizeof(p->state));
37}
38
39#define GET_state(i, a) UInt64 a = state[i];
40#define SET_state(i, a) state[i] = a;
41
42#define LS_5(M, i, a0,a1,a2,a3,a4) \
43 M ((i) * 5 , a0) \
44 M ((i) * 5 + 1, a1) \
45 M ((i) * 5 + 2, a2) \
46 M ((i) * 5 + 3, a3) \
47 M ((i) * 5 + 4, a4) \
48
49#define LS_25(M) \
50 LS_5 (M, 0, a50, a51, a52, a53, a54) \
51 LS_5 (M, 1, a60, a61, a62, a63, a64) \
52 LS_5 (M, 2, a70, a71, a72, a73, a74) \
53 LS_5 (M, 3, a80, a81, a82, a83, a84) \
54 LS_5 (M, 4, a90, a91, a92, a93, a94) \
55
56
57#define XOR_1(i, a0) \
58 a0 ^= GetUi64(data + (i) * 8); \
59
60#define XOR_4(i, a0,a1,a2,a3) \
61 XOR_1 ((i) , a0); \
62 XOR_1 ((i) + 1, a1); \
63 XOR_1 ((i) + 2, a2); \
64 XOR_1 ((i) + 3, a3); \
65
66#define D(d,b1,b2) \
67 d = b1 ^ Z7_ROTL64(b2, 1);
68
69#define D5 \
70 D (d0, c4, c1) \
71 D (d1, c0, c2) \
72 D (d2, c1, c3) \
73 D (d3, c2, c4) \
74 D (d4, c3, c0) \
75
76#define C0(c,a,d) \
77 c = a ^ d; \
78
79#define C(c,a,d,k) \
80 c = a ^ d; \
81 c = Z7_ROTL64(c, k); \
82
83#define E4(e1,e2,e3,e4) \
84 e1 = c1 ^ (~c2 & c3); \
85 e2 = c2 ^ (~c3 & c4); \
86 e3 = c3 ^ (~c4 & c0); \
87 e4 = c4 ^ (~c0 & c1); \
88
89#define CK( v0,w0, \
90 v1,w1,k1, \
91 v2,w2,k2, \
92 v3,w3,k3, \
93 v4,w4,k4, e0,e1,e2,e3,e4, keccak_c) \
94 C0(c0,v0,w0) \
95 C (c1,v1,w1,k1) \
96 C (c2,v2,w2,k2) \
97 C (c3,v3,w3,k3) \
98 C (c4,v4,w4,k4) \
99 e0 = c0 ^ (~c1 & c2) ^ keccak_c; \
100 E4(e1,e2,e3,e4) \
101
102#define CE( v0,w0,k0, \
103 v1,w1,k1, \
104 v2,w2,k2, \
105 v3,w3,k3, \
106 v4,w4,k4, e0,e1,e2,e3,e4) \
107 C (c0,v0,w0,k0) \
108 C (c1,v1,w1,k1) \
109 C (c2,v2,w2,k2) \
110 C (c3,v3,w3,k3) \
111 C (c4,v4,w4,k4) \
112 e0 = c0 ^ (~c1 & c2); \
113 E4(e1,e2,e3,e4) \
114
115// numBlocks != 0
116static
117Z7_NO_INLINE
118void Z7_FASTCALL Sha3_UpdateBlocks(UInt64 state[SHA3_NUM_STATE_WORDS],
119 const Byte *data, size_t numBlocks, size_t blockSize)
120{
121 LS_25 (GET_state)
122
123 do
124 {
125 unsigned round;
126 XOR_4 ( 0, a50, a51, a52, a53)
127 XOR_4 ( 4, a54, a60, a61, a62)
128 XOR_1 ( 8, a63)
129 if (blockSize > 8 * 9) { XOR_4 ( 9, a64, a70, a71, a72) // sha3-384
130 if (blockSize > 8 * 13) { XOR_4 (13, a73, a74, a80, a81) // sha3-256
131 if (blockSize > 8 * 17) { XOR_1 (17, a82) // sha3-224
132 if (blockSize > 8 * 18) { XOR_1 (18, a83) // shake128
133 XOR_1 (19, a84)
134 XOR_1 (20, a90) }}}}
135 data += blockSize;
136
137 for (round = 0; round < 24; round += 2)
138 {
139 UInt64 c0, c1, c2, c3, c4;
140 UInt64 d0, d1, d2, d3, d4;
141 UInt64 e50, e51, e52, e53, e54;
142 UInt64 e60, e61, e62, e63, e64;
143 UInt64 e70, e71, e72, e73, e74;
144 UInt64 e80, e81, e82, e83, e84;
145 UInt64 e90, e91, e92, e93, e94;
146
147 c0 = a50^a60^a70^a80^a90;
148 c1 = a51^a61^a71^a81^a91;
149 c2 = a52^a62^a72^a82^a92;
150 c3 = a53^a63^a73^a83^a93;
151 c4 = a54^a64^a74^a84^a94;
152 D5
153 CK( a50, d0,
154 a61, d1, 44,
155 a72, d2, 43,
156 a83, d3, 21,
157 a94, d4, 14, e50, e51, e52, e53, e54, SHA3_K_ARRAY[round])
158 CE( a53, d3, 28,
159 a64, d4, 20,
160 a70, d0, 3,
161 a81, d1, 45,
162 a92, d2, 61, e60, e61, e62, e63, e64)
163 CE( a51, d1, 1,
164 a62, d2, 6,
165 a73, d3, 25,
166 a84, d4, 8,
167 a90, d0, 18, e70, e71, e72, e73, e74)
168 CE( a54, d4, 27,
169 a60, d0, 36,
170 a71, d1, 10,
171 a82, d2, 15,
172 a93, d3, 56, e80, e81, e82, e83, e84)
173 CE( a52, d2, 62,
174 a63, d3, 55,
175 a74, d4, 39,
176 a80, d0, 41,
177 a91, d1, 2, e90, e91, e92, e93, e94)
178
179 // ---------- ROUND + 1 ----------
180
181 c0 = e50^e60^e70^e80^e90;
182 c1 = e51^e61^e71^e81^e91;
183 c2 = e52^e62^e72^e82^e92;
184 c3 = e53^e63^e73^e83^e93;
185 c4 = e54^e64^e74^e84^e94;
186 D5
187 CK( e50, d0,
188 e61, d1, 44,
189 e72, d2, 43,
190 e83, d3, 21,
191 e94, d4, 14, a50, a51, a52, a53, a54, SHA3_K_ARRAY[(size_t)round + 1])
192 CE( e53, d3, 28,
193 e64, d4, 20,
194 e70, d0, 3,
195 e81, d1, 45,
196 e92, d2, 61, a60, a61, a62, a63, a64)
197 CE( e51, d1, 1,
198 e62, d2, 6,
199 e73, d3, 25,
200 e84, d4, 8,
201 e90, d0, 18, a70, a71, a72, a73, a74)
202 CE (e54, d4, 27,
203 e60, d0, 36,
204 e71, d1, 10,
205 e82, d2, 15,
206 e93, d3, 56, a80, a81, a82, a83, a84)
207 CE (e52, d2, 62,
208 e63, d3, 55,
209 e74, d4, 39,
210 e80, d0, 41,
211 e91, d1, 2, a90, a91, a92, a93, a94)
212 }
213 }
214 while (--numBlocks);
215
216 LS_25 (SET_state)
217}
218
219
220#define Sha3_UpdateBlock(p) \
221 Sha3_UpdateBlocks(p->state, p->buffer, 1, p->blockSize)
222
223void Sha3_Update(CSha3 *p, const Byte *data, size_t size)
224{
225/*
226 for (;;)
227 {
228 if (size == 0)
229 return;
230 unsigned cur = p->blockSize - p->count;
231 if (cur > size)
232 cur = (unsigned)size;
233 size -= cur;
234 unsigned pos = p->count;
235 p->count = pos + cur;
236 while (pos & 7)
237 {
238 if (cur == 0)
239 return;
240 Byte *pb = &(((Byte *)p->state)[pos]);
241 *pb = (Byte)(*pb ^ *data++);
242 cur--;
243 pos++;
244 }
245 if (cur >= 8)
246 {
247 do
248 {
249 *(UInt64 *)(void *)&(((Byte *)p->state)[pos]) ^= GetUi64(data);
250 data += 8;
251 pos += 8;
252 cur -= 8;
253 }
254 while (cur >= 8);
255 }
256 if (pos != p->blockSize)
257 {
258 if (cur)
259 {
260 Byte *pb = &(((Byte *)p->state)[pos]);
261 do
262 {
263 *pb = (Byte)(*pb ^ *data++);
264 pb++;
265 }
266 while (--cur);
267 }
268 return;
269 }
270 Sha3_UpdateBlock(p->state);
271 p->count = 0;
272 }
273*/
274 if (size == 0)
275 return;
276 {
277 const unsigned pos = p->count;
278 const unsigned num = p->blockSize - pos;
279 if (num > size)
280 {
281 p->count = pos + (unsigned)size;
282 memcpy(p->buffer + pos, data, size);
283 return;
284 }
285 if (pos != 0)
286 {
287 size -= num;
288 memcpy(p->buffer + pos, data, num);
289 data += num;
290 Sha3_UpdateBlock(p);
291 }
292 }
293 if (size >= p->blockSize)
294 {
295 const size_t numBlocks = size / p->blockSize;
296 const Byte *dataOld = data;
297 data += numBlocks * p->blockSize;
298 size = (size_t)(dataOld + size - data);
299 Sha3_UpdateBlocks(p->state, dataOld, numBlocks, p->blockSize);
300 }
301 p->count = (unsigned)size;
302 if (size)
303 memcpy(p->buffer, data, size);
304}
305
306
307// we support only (digestSize % 4 == 0) cases
308void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake)
309{
310 memset(p->buffer + p->count, 0, p->blockSize - p->count);
311  // we write the bit markers from low to high within the current byte:
312  // - for sha-3 : 2 bits : 0,1
313  // - for shake : 4 bits : 1111
314  // then we write a 1 bit to the same byte (the first 1 of the pad10*1 padding).
315  // And we write a 1 bit into the highest bit of the last byte of the block.
316 p->buffer[p->count] = (Byte)(shake ? 0x1f : 0x06);
317  // the xor operation (^= 0x80) is needed here because the 0x80 bit must go
318  // into the same byte as (0x1f : 0x06) if (p->count == p->blockSize - 1) !!!
319 p->buffer[p->blockSize - 1] ^= 0x80;
320/*
321 ((Byte *)p->state)[p->count] ^= (Byte)(shake ? 0x1f : 0x06);
322 ((Byte *)p->state)[p->blockSize - 1] ^= 0x80;
323*/
324 Sha3_UpdateBlock(p);
325#if 1 && defined(MY_CPU_LE)
326 memcpy(digest, p->state, digestSize);
327#else
328 {
329 const unsigned numWords = digestSize >> 3;
330 unsigned i;
331 for (i = 0; i < numWords; i++)
332 {
333 const UInt64 v = p->state[i];
334 SetUi64(digest, v)
335 digest += 8;
336 }
337 if (digestSize & 4) // for SHA3-224
338 {
339 const UInt32 v = (UInt32)p->state[numWords];
340 SetUi32(digest, v)
341 }
342 }
343#endif
344 Sha3_Init(p);
345}
346
347#undef GET_state
348#undef SET_state
349#undef LS_5
350#undef LS_25
351#undef XOR_1
352#undef XOR_4
353#undef D
354#undef D5
355#undef C0
356#undef C
357#undef E4
358#undef CK
359#undef CE
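
Editorial note: the padding comments in Sha3_Final compress the FIPS 202 rules: the SHA-3 domain suffix 01 (or the SHAKE suffix 1111) is written LSB-first, immediately followed by the first 1 of pad10*1, giving 0x06 (or 0x1f); the closing 1 of the padding is the 0x80 bit of the last rate byte. A self-contained sketch of the same logic (hypothetical helper name):

    #include <string.h>
    static void sha3_pad(unsigned char *buf, unsigned count,
                         unsigned blockSize, int shake)
    {
      memset(buf + count, 0, blockSize - count);
      buf[count] = (unsigned char)(shake ? 0x1f : 0x06);
      buf[blockSize - 1] ^= 0x80;  /* xor, not assignment: when
          count == blockSize - 1 both markers share one byte,
          giving 0x86 for SHA-3 and 0x9f for SHAKE */
    }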
diff --git a/C/Sha3.h b/C/Sha3.h
new file mode 100644
index 0000000..c5909c9
--- /dev/null
+++ b/C/Sha3.h
@@ -0,0 +1,36 @@
1/* Sha3.h -- SHA-3 Hash
2: Igor Pavlov : Public domain */
3
4#ifndef ZIP7_INC_SHA3_H
5#define ZIP7_INC_SHA3_H
6
7#include "7zTypes.h"
8
9EXTERN_C_BEGIN
10
11#define SHA3_NUM_STATE_WORDS 25
12
13#define SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(digestSize) \
14 (SHA3_NUM_STATE_WORDS * 8 - (digestSize) * 2)
15
16typedef struct
17{
18 UInt32 count; // < blockSize
19 UInt32 blockSize; // <= SHA3_NUM_STATE_WORDS * 8
20 UInt64 _pad1[3];
21 // we want 32-byte alignment here
22 UInt64 state[SHA3_NUM_STATE_WORDS];
23 UInt64 _pad2[3];
24 // we want 64-byte alignment here
25 Byte buffer[SHA3_NUM_STATE_WORDS * 8]; // last bytes will be unused with predefined blockSize values
26} CSha3;
27
28#define Sha3_SET_blockSize(p, blockSize) { (p)->blockSize = (blockSize); }
29
30void Sha3_Init(CSha3 *p);
31void Sha3_Update(CSha3 *p, const Byte *data, size_t size);
32void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake);
33
34EXTERN_C_END
35
36#endif
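
Editorial note: SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE computes the sponge rate: 200 bytes of Keccak state minus twice the digest size (136 for SHA3-256, 104 for SHA3-384, 72 for SHA3-512). A minimal usage sketch of this API (hypothetical wrapper, SHA3-256):

    #include "Sha3.h"
    static void Sha3_256(Byte *digest, const Byte *data, size_t size)
    {
      CSha3 ctx;
      Sha3_SET_blockSize(&ctx, SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(32))  /* 136 */
      Sha3_Init(&ctx);
      Sha3_Update(&ctx, data, size);
      Sha3_Final(&ctx, digest, 32, 0);  /* shake = 0: SHA-3 domain bits */
    }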
diff --git a/C/Sha512.c b/C/Sha512.c
new file mode 100644
index 0000000..f0787fd
--- /dev/null
+++ b/C/Sha512.c
@@ -0,0 +1,711 @@
1/* Sha512.c -- SHA-512 Hash
2: Igor Pavlov : Public domain
3This code is based on public domain code from Wei Dai's Crypto++ library. */
4
5#include "Precomp.h"
6
7#include <string.h>
8
9#include "Sha512.h"
10#include "RotateDefs.h"
11#include "CpuArch.h"
12
13#ifdef MY_CPU_X86_OR_AMD64
14 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \
15 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \
16 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000) \
17 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) \
18 || defined(_MSC_VER) && (_MSC_VER >= 1940)
19 #define Z7_COMPILER_SHA512_SUPPORTED
20 #endif
21#elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE)
22 #if defined(__ARM_FEATURE_SHA512)
23 #define Z7_COMPILER_SHA512_SUPPORTED
24 #else
25 #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \
26 || defined(__GNUC__) && (__GNUC__ >= 9) \
27 ) \
28 || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it
29 #define Z7_COMPILER_SHA512_SUPPORTED
30 #endif
31 #endif
32#endif
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks);
48
49#ifdef Z7_COMPILER_SHA512_SUPPORTED
50 void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
51
52 static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS = Sha512_UpdateBlocks;
53 static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS_HW;
54
55 #define SHA512_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
56#else
57 #define SHA512_UPDATE_BLOCKS(p) Sha512_UpdateBlocks
58#endif
59
60
61BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo)
62{
63 SHA512_FUNC_UPDATE_BLOCKS func = Sha512_UpdateBlocks;
64
65 #ifdef Z7_COMPILER_SHA512_SUPPORTED
66 if (algo != SHA512_ALGO_SW)
67 {
68 if (algo == SHA512_ALGO_DEFAULT)
69 func = g_SHA512_FUNC_UPDATE_BLOCKS;
70 else
71 {
72 if (algo != SHA512_ALGO_HW)
73 return False;
74 func = g_SHA512_FUNC_UPDATE_BLOCKS_HW;
75 if (!func)
76 return False;
77 }
78 }
79 #else
80 if (algo > 1)
81 return False;
82 #endif
83
84 p->v.vars.func_UpdateBlocks = func;
85 return True;
86}
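
Editorial note: Sha512_SetFunction lets a caller pin the software or hardware path for one context instead of relying on the global default chosen at startup. A hedged usage sketch (64 = full SHA-512 digest size; the SHA512_ALGO_* constants come from Sha512.h):

    CSha512 h;
    Sha512_Init(&h, 64);
    if (!Sha512_SetFunction(&h, SHA512_ALGO_HW))
    {
      /* no HW path compiled in, or the CPU lacks SHA-512 instructions;
         the software function installed by Sha512_Init stays active */
    }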
87
88
89/* define it for speed optimization */
90
91#if 0 // 1 for size optimization
92 #define STEP_PRE 1
93 #define STEP_MAIN 1
94#else
95 #define STEP_PRE 2
96 #define STEP_MAIN 4
97 // #define Z7_SHA512_UNROLL
98#endif
99
100#undef Z7_SHA512_BIG_W
101#if STEP_MAIN != 16
102 #define Z7_SHA512_BIG_W
103#endif
104
105
106#define U64C(x) UINT64_CONST(x)
107
108static MY_ALIGN(64) const UInt64 SHA512_INIT_ARRAYS[4][8] = {
109{ U64C(0x8c3d37c819544da2), U64C(0x73e1996689dcd4d6), U64C(0x1dfab7ae32ff9c82), U64C(0x679dd514582f9fcf),
110 U64C(0x0f6d2b697bd44da8), U64C(0x77e36f7304c48942), U64C(0x3f9d85a86a1d36c8), U64C(0x1112e6ad91d692a1)
111},
112{ U64C(0x22312194fc2bf72c), U64C(0x9f555fa3c84c64c2), U64C(0x2393b86b6f53b151), U64C(0x963877195940eabd),
113 U64C(0x96283ee2a88effe3), U64C(0xbe5e1e2553863992), U64C(0x2b0199fc2c85b8aa), U64C(0x0eb72ddc81c52ca2)
114},
115{ U64C(0xcbbb9d5dc1059ed8), U64C(0x629a292a367cd507), U64C(0x9159015a3070dd17), U64C(0x152fecd8f70e5939),
116 U64C(0x67332667ffc00b31), U64C(0x8eb44a8768581511), U64C(0xdb0c2e0d64f98fa7), U64C(0x47b5481dbefa4fa4)
117},
118{ U64C(0x6a09e667f3bcc908), U64C(0xbb67ae8584caa73b), U64C(0x3c6ef372fe94f82b), U64C(0xa54ff53a5f1d36f1),
119 U64C(0x510e527fade682d1), U64C(0x9b05688c2b3e6c1f), U64C(0x1f83d9abfb41bd6b), U64C(0x5be0cd19137e2179)
120}};
121
122void Sha512_InitState(CSha512 *p, unsigned digestSize)
123{
124 p->v.vars.count = 0;
125 memcpy(p->state, SHA512_INIT_ARRAYS[(size_t)(digestSize >> 4) - 1], sizeof(p->state));
126}
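
The row selection works because the four supported digest sizes shift down to consecutive values: digestSize >> 4 yields 1, 2, 3, 4 for 28, 32, 48 and 64 bytes, so subtracting 1 indexes the IV rows above:

/* digestSize    (digestSize >> 4) - 1   selected IV row
   28 (224/8)             0              SHA-512/224
   32 (256/8)             1              SHA-512/256
   48 (384/8)             2              SHA-384
   64 (512/8)             3              SHA-512      */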
127
128void Sha512_Init(CSha512 *p, unsigned digestSize)
129{
130 p->v.vars.func_UpdateBlocks =
131 #ifdef Z7_COMPILER_SHA512_SUPPORTED
132 g_SHA512_FUNC_UPDATE_BLOCKS;
133 #else
134 NULL;
135 #endif
136 Sha512_InitState(p, digestSize);
137}
138
139#define S0(x) (Z7_ROTR64(x,28) ^ Z7_ROTR64(x,34) ^ Z7_ROTR64(x,39))
140#define S1(x) (Z7_ROTR64(x,14) ^ Z7_ROTR64(x,18) ^ Z7_ROTR64(x,41))
141#define s0(x) (Z7_ROTR64(x, 1) ^ Z7_ROTR64(x, 8) ^ (x >> 7))
142#define s1(x) (Z7_ROTR64(x,19) ^ Z7_ROTR64(x,61) ^ (x >> 6))
143
144#define Ch(x,y,z) (z^(x&(y^z)))
145#define Maj(x,y,z) ((x&y)|(z&(x|y)))
146
147
148#define W_PRE(i) (W[(i) + (size_t)(j)] = GetBe64(data + ((size_t)(j) + i) * 8))
149
150#define blk2_main(j, i) s1(w(j, (i)-2)) + w(j, (i)-7) + s0(w(j, (i)-15))
151
152#ifdef Z7_SHA512_BIG_W
153 // we use +i instead of +(i) to change the evaluation order and avoid a CLANG signed/unsigned warning.
154 #define w(j, i) W[(size_t)(j) + i]
155 #define blk2(j, i) (w(j, i) = w(j, (i)-16) + blk2_main(j, i))
156#else
157 #if STEP_MAIN == 16
158 #define w(j, i) W[(i) & 15]
159 #else
160 #define w(j, i) W[((size_t)(j) + (i)) & 15]
161 #endif
162 #define blk2(j, i) (w(j, i) += blk2_main(j, i))
163#endif
164
165#define W_MAIN(i) blk2(j, i)
166
167
168#define T1(wx, i) \
169 tmp = h + S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
170 h = g; \
171 g = f; \
172 f = e; \
173 e = d + tmp; \
174 tmp += S0(a) + Maj(a, b, c); \
175 d = c; \
176 c = b; \
177 b = a; \
178 a = tmp; \
179
180#define R1_PRE(i) T1( W_PRE, i)
181#define R1_MAIN(i) T1( W_MAIN, i)
182
183#if (!defined(Z7_SHA512_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4)
184#define R2_MAIN(i) \
185 R1_MAIN(i) \
186 R1_MAIN(i + 1) \
187
188#endif
189
190
191
192#if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8
193
194#define T4( a,b,c,d,e,f,g,h, wx, i) \
195 h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
196 tmp = h; \
197 h += d; \
198 d = tmp + S0(a) + Maj(a, b, c); \
199
200#define R4( wx, i) \
201 T4 ( a,b,c,d,e,f,g,h, wx, (i )); \
202 T4 ( d,a,b,c,h,e,f,g, wx, (i+1)); \
203 T4 ( c,d,a,b,g,h,e,f, wx, (i+2)); \
204 T4 ( b,c,d,a,f,g,h,e, wx, (i+3)); \
205
206#define R4_PRE(i) R4( W_PRE, i)
207#define R4_MAIN(i) R4( W_MAIN, i)
208
209
210#define T8( a,b,c,d,e,f,g,h, wx, i) \
211 h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
212 d += h; \
213 h += S0(a) + Maj(a, b, c); \
214
215#define R8( wx, i) \
216 T8 ( a,b,c,d,e,f,g,h, wx, i ); \
217 T8 ( h,a,b,c,d,e,f,g, wx, i+1); \
218 T8 ( g,h,a,b,c,d,e,f, wx, i+2); \
219 T8 ( f,g,h,a,b,c,d,e, wx, i+3); \
220 T8 ( e,f,g,h,a,b,c,d, wx, i+4); \
221 T8 ( d,e,f,g,h,a,b,c, wx, i+5); \
222 T8 ( c,d,e,f,g,h,a,b, wx, i+6); \
223 T8 ( b,c,d,e,f,g,h,a, wx, i+7); \
224
225#define R8_PRE(i) R8( W_PRE, i)
226#define R8_MAIN(i) R8( W_MAIN, i)
227
228#endif
229
230
231extern
232MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80];
233MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80] = {
234 U64C(0x428a2f98d728ae22), U64C(0x7137449123ef65cd), U64C(0xb5c0fbcfec4d3b2f), U64C(0xe9b5dba58189dbbc),
235 U64C(0x3956c25bf348b538), U64C(0x59f111f1b605d019), U64C(0x923f82a4af194f9b), U64C(0xab1c5ed5da6d8118),
236 U64C(0xd807aa98a3030242), U64C(0x12835b0145706fbe), U64C(0x243185be4ee4b28c), U64C(0x550c7dc3d5ffb4e2),
237 U64C(0x72be5d74f27b896f), U64C(0x80deb1fe3b1696b1), U64C(0x9bdc06a725c71235), U64C(0xc19bf174cf692694),
238 U64C(0xe49b69c19ef14ad2), U64C(0xefbe4786384f25e3), U64C(0x0fc19dc68b8cd5b5), U64C(0x240ca1cc77ac9c65),
239 U64C(0x2de92c6f592b0275), U64C(0x4a7484aa6ea6e483), U64C(0x5cb0a9dcbd41fbd4), U64C(0x76f988da831153b5),
240 U64C(0x983e5152ee66dfab), U64C(0xa831c66d2db43210), U64C(0xb00327c898fb213f), U64C(0xbf597fc7beef0ee4),
241 U64C(0xc6e00bf33da88fc2), U64C(0xd5a79147930aa725), U64C(0x06ca6351e003826f), U64C(0x142929670a0e6e70),
242 U64C(0x27b70a8546d22ffc), U64C(0x2e1b21385c26c926), U64C(0x4d2c6dfc5ac42aed), U64C(0x53380d139d95b3df),
243 U64C(0x650a73548baf63de), U64C(0x766a0abb3c77b2a8), U64C(0x81c2c92e47edaee6), U64C(0x92722c851482353b),
244 U64C(0xa2bfe8a14cf10364), U64C(0xa81a664bbc423001), U64C(0xc24b8b70d0f89791), U64C(0xc76c51a30654be30),
245 U64C(0xd192e819d6ef5218), U64C(0xd69906245565a910), U64C(0xf40e35855771202a), U64C(0x106aa07032bbd1b8),
246 U64C(0x19a4c116b8d2d0c8), U64C(0x1e376c085141ab53), U64C(0x2748774cdf8eeb99), U64C(0x34b0bcb5e19b48a8),
247 U64C(0x391c0cb3c5c95a63), U64C(0x4ed8aa4ae3418acb), U64C(0x5b9cca4f7763e373), U64C(0x682e6ff3d6b2b8a3),
248 U64C(0x748f82ee5defb2fc), U64C(0x78a5636f43172f60), U64C(0x84c87814a1f0ab72), U64C(0x8cc702081a6439ec),
249 U64C(0x90befffa23631e28), U64C(0xa4506cebde82bde9), U64C(0xbef9a3f7b2c67915), U64C(0xc67178f2e372532b),
250 U64C(0xca273eceea26619c), U64C(0xd186b8c721c0c207), U64C(0xeada7dd6cde0eb1e), U64C(0xf57d4f7fee6ed178),
251 U64C(0x06f067aa72176fba), U64C(0x0a637dc5a2c898a6), U64C(0x113f9804bef90dae), U64C(0x1b710b35131c471b),
252 U64C(0x28db77f523047d84), U64C(0x32caab7b40c72493), U64C(0x3c9ebe0a15c9bebc), U64C(0x431d67c49c100d4c),
253 U64C(0x4cc5d4becb3e42b6), U64C(0x597f299cfc657e2a), U64C(0x5fcb6fab3ad6faec), U64C(0x6c44198c4a475817)
254};
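
For reference (FIPS 180-4): each K[i] holds the first 64 fractional bits of the cube root of the i-th prime, which gives an easy spot check of the table:

/* cbrt(2) = 1.2599210498948731...
   frac * 2^64 = 0x428a2f98d728ae22 == SHA512_K_ARRAY[0] */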
255
256#define K SHA512_K_ARRAY
257
258Z7_NO_INLINE
259void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks)
260{
261 UInt64 W
262#ifdef Z7_SHA512_BIG_W
263 [80];
264#else
265 [16];
266#endif
267 unsigned j;
268 UInt64 a,b,c,d,e,f,g,h;
269#if !defined(Z7_SHA512_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
270 UInt64 tmp;
271#endif
272
273 if (numBlocks == 0) return;
274
275 a = state[0];
276 b = state[1];
277 c = state[2];
278 d = state[3];
279 e = state[4];
280 f = state[5];
281 g = state[6];
282 h = state[7];
283
284 do
285 {
286
287 for (j = 0; j < 16; j += STEP_PRE)
288 {
289 #if STEP_PRE > 4
290
291 #if STEP_PRE < 8
292 R4_PRE(0);
293 #else
294 R8_PRE(0);
295 #if STEP_PRE == 16
296 R8_PRE(8);
297 #endif
298 #endif
299
300 #else
301
302 R1_PRE(0)
303 #if STEP_PRE >= 2
304 R1_PRE(1)
305 #if STEP_PRE >= 4
306 R1_PRE(2)
307 R1_PRE(3)
308 #endif
309 #endif
310
311 #endif
312 }
313
314 for (j = 16; j < 80; j += STEP_MAIN)
315 {
316 #if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8
317
318 #if STEP_MAIN < 8
319 R4_MAIN(0)
320 #else
321 R8_MAIN(0)
322 #if STEP_MAIN == 16
323 R8_MAIN(8)
324 #endif
325 #endif
326
327 #else
328
329 R1_MAIN(0)
330 #if STEP_MAIN >= 2
331 R1_MAIN(1)
332 #if STEP_MAIN >= 4
333 R2_MAIN(2)
334 #if STEP_MAIN >= 8
335 R2_MAIN(4)
336 R2_MAIN(6)
337 #if STEP_MAIN >= 16
338 R2_MAIN(8)
339 R2_MAIN(10)
340 R2_MAIN(12)
341 R2_MAIN(14)
342 #endif
343 #endif
344 #endif
345 #endif
346 #endif
347 }
348
349 a += state[0]; state[0] = a;
350 b += state[1]; state[1] = b;
351 c += state[2]; state[2] = c;
352 d += state[3]; state[3] = d;
353 e += state[4]; state[4] = e;
354 f += state[5]; state[5] = f;
355 g += state[6]; state[6] = g;
356 h += state[7]; state[7] = h;
357
358 data += SHA512_BLOCK_SIZE;
359 }
360 while (--numBlocks);
361}
362
363
364#define Sha512_UpdateBlock(p) SHA512_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
365
366void Sha512_Update(CSha512 *p, const Byte *data, size_t size)
367{
368 if (size == 0)
369 return;
370 {
371 const unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1);
372 const unsigned num = SHA512_BLOCK_SIZE - pos;
373 p->v.vars.count += size;
374 if (num > size)
375 {
376 memcpy(p->buffer + pos, data, size);
377 return;
378 }
379 if (pos != 0)
380 {
381 size -= num;
382 memcpy(p->buffer + pos, data, num);
383 data += num;
384 Sha512_UpdateBlock(p);
385 }
386 }
387 {
388 const size_t numBlocks = size >> 7;
389 // if (numBlocks)
390 SHA512_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
391 size &= SHA512_BLOCK_SIZE - 1;
392 if (size == 0)
393 return;
394 data += (numBlocks << 7);
395 memcpy(p->buffer, data, size);
396 }
397}
398
399
400void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize)
401{
402 unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1);
403 p->buffer[pos++] = 0x80;
404 if (pos > (SHA512_BLOCK_SIZE - 8 * 2))
405 {
406 while (pos != SHA512_BLOCK_SIZE) { p->buffer[pos++] = 0; }
407 // memset(&p->buf.buffer[pos], 0, SHA512_BLOCK_SIZE - pos);
408 Sha512_UpdateBlock(p);
409 pos = 0;
410 }
411 memset(&p->buffer[pos], 0, (SHA512_BLOCK_SIZE - 8 * 2) - pos);
412 {
413 const UInt64 numBits = p->v.vars.count << 3;
414 SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 2, 0) // = (p->v.vars.count >> (64 - 3)); (high 64-bits)
415 SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 1, numBits)
416 }
417 Sha512_UpdateBlock(p);
418#if 1 && defined(MY_CPU_BE)
419 memcpy(digest, p->state, digestSize);
420#else
421 {
422 const unsigned numWords = digestSize >> 3;
423 unsigned i;
424 for (i = 0; i < numWords; i++)
425 {
426 const UInt64 v = p->state[i];
427 SetBe64(digest, v)
428 digest += 8;
429 }
430 if (digestSize & 4) // digestSize == SHA512_224_DIGEST_SIZE
431 {
432 const UInt32 v = (UInt32)((p->state[numWords]) >> 32);
433 SetBe32(digest, v)
434 }
435 }
436#endif
437 Sha512_InitState(p, digestSize);
438}
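
A worked example of the padding above, for the 3-byte message "abc" (so pos == 3 and count == 3):

/* buffer[3]        = 0x80             terminator bit
   buffer[4..111]   = 0                zero fill
   buffer[112..119] = 0                high 64 bits of the 128-bit bit length
                                       (count >> 61 == 0 for any realistic size)
   buffer[120..127] = 24 (big-endian)  low 64 bits: 3 bytes << 3 */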
439
440
441
442// #define Z7_SHA512_PROBE_DEBUG // for debug
443
444#if defined(Z7_SHA512_PROBE_DEBUG) || defined(Z7_COMPILER_SHA512_SUPPORTED)
445
446#if defined(Z7_SHA512_PROBE_DEBUG) \
447 || defined(_WIN32) && defined(MY_CPU_ARM64)
448#ifndef Z7_SHA512_USE_PROBE
449#define Z7_SHA512_USE_PROBE
450#endif
451#endif
452
453#ifdef Z7_SHA512_USE_PROBE
454
455#ifdef Z7_SHA512_PROBE_DEBUG
456#include <stdio.h>
457#define PRF(x) x
458#else
459#define PRF(x)
460#endif
461
462#if 0 || !defined(_MSC_VER) // 1 || : for debug LONGJMP mode
463// MINGW doesn't support __try. So we use signal() / longjmp().
464// Note: signal() / longjmp() is probably not thread-safe.
465// So we must call Sha512Prepare() from the main thread at program start.
466#ifndef Z7_SHA512_USE_LONGJMP
467#define Z7_SHA512_USE_LONGJMP
468#endif
469#endif
470
471#ifdef Z7_SHA512_USE_LONGJMP
472#include <signal.h>
473#include <setjmp.h>
474static jmp_buf g_Sha512_jmp_buf;
475// static int g_Sha512_Unsupported;
476
477#if defined(__GNUC__) && (__GNUC__ >= 8) \
478 || defined(__clang__) && (__clang_major__ >= 3)
479 __attribute__((noreturn))
480#endif
481static void Z7_CDECL Sha512_signal_Handler(int v)
482{
483 PRF(printf("======== Sha512_signal_Handler = %x\n", (unsigned)v);)
484 // g_Sha512_Unsupported = 1;
485 longjmp(g_Sha512_jmp_buf, 1);
486}
487#endif // Z7_SHA512_USE_LONGJMP
488
489
490#if defined(_WIN32)
491#include "7zWindows.h"
492#endif
493
494#if defined(MY_CPU_ARM64)
495// #define Z7_SHA512_USE_SIMPLIFIED_PROBE // for debug
496#endif
497
498#ifdef Z7_SHA512_USE_SIMPLIFIED_PROBE
499#include <arm_neon.h>
500#if defined(__clang__)
501 __attribute__((__target__("sha3")))
502#elif !defined(_MSC_VER)
503 __attribute__((__target__("arch=armv8.2-a+sha3")))
504#endif
505#endif
506static BoolInt CPU_IsSupported_SHA512_Probe(void)
507{
508 PRF(printf("\n== CPU_IsSupported_SHA512_Probe\n");)
509#if defined(_WIN32) && defined(MY_CPU_ARM64)
510 // there is still no SHA512 flag for IsProcessorFeaturePresent().
511 if (!CPU_IsSupported_CRYPTO())
512 return False;
513 PRF(printf("==== Registry check\n");)
514 {
515 // we can't read the ID_AA64ISAR0_EL1 register from an application,
516 // but the ID_AA64ISAR0_EL1 register is mapped to the "CP 4030" registry value.
517 HKEY key = NULL;
518 LONG res = RegOpenKeyEx(HKEY_LOCAL_MACHINE,
519 TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
520 0, KEY_READ, &key);
521 if (res != ERROR_SUCCESS)
522 return False;
523 {
524 DWORD type = 0;
525 DWORD count = sizeof(UInt64);
526 UInt64 val = 0;
527 res = RegQueryValueEx(key, TEXT("CP 4030"), NULL,
528 &type, (LPBYTE)&val, &count);
529 RegCloseKey(key);
530 if (res != ERROR_SUCCESS
531 || type != REG_QWORD
532 || count != sizeof(UInt64)
533 || ((unsigned)(val >> 12) & 0xf) != 2)
534 return False;
535 // we parse SHA2 field of ID_AA64ISAR0_EL1 register:
536 // 0 : No SHA2 instructions implemented
537 // 1 : SHA256 implemented
538 // 2 : SHA256 and SHA512 implemented
539 }
540 }
541#endif // defined(_WIN32) && defined(MY_CPU_ARM64)
542
543
544#if 1 // 0 for debug to disable SHA512 PROBE code
545
546/*
547----- SHA512 PROBE -----
548
549We expect that reading the "CP 4030" registry value is enough.
550But we still use the additional SHA512 PROBE code, because
551here we can catch an exception, while we don't catch exceptions
552when the Sha512 functions are called from the main code.
553
554NOTE: the arm64 PROBE code doesn't work if we call it via Wine in linux-arm64:
555the program just stops.
556The x64 version of the PROBE code also doesn't work if we run it via the Intel SDE emulator
557without SHA512 support (-skl switch):
558the program stops, and we get a message from SDE:
559 TID 0 SDE-ERROR: Executed instruction not valid for specified chip (SKYLAKE): vsha512msg1
560But we still want to catch that exception instead of stopping the process.
561Does this PROBE code work in native Windows-arm64 (with/without sha512 hw instructions)?
562Are there any ways to fix the problems with arm64-wine and x64-SDE cases?
563*/
564
565 PRF(printf("==== CPU_IsSupported_SHA512 PROBE\n");)
566 {
567 BoolInt isSupported = False;
568#ifdef Z7_SHA512_USE_LONGJMP
569 void (Z7_CDECL *signal_prev)(int);
570 /*
571 if (g_Sha512_Unsupported)
572 {
573 PRF(printf("==== g_Sha512_Unsupported\n");)
574 return False;
575 }
576 */
577 PRF(printf("====== signal(SIGILL)\n");)
578 signal_prev = signal(SIGILL, Sha512_signal_Handler);
579 if (signal_prev == SIG_ERR)
580 {
581 PRF(printf("====== signal fail\n");)
582 return False;
583 }
584 // PRF(printf("==== signal_prev = %p\n", (void *)signal_prev);)
585 // docs: Before the specified function is executed,
586 // the value of func is set to SIG_DFL.
587 // So we can exit if (setjmp(g_Sha512_jmp_buf) != 0).
588 PRF(printf("====== setjmp\n");)
589 if (!setjmp(g_Sha512_jmp_buf))
590#else // Z7_SHA512_USE_LONGJMP
591
592#ifdef _MSC_VER
593#ifdef __clang_major__
594 #pragma GCC diagnostic ignored "-Wlanguage-extension-token"
595#endif
596 __try
597#endif
598#endif // Z7_SHA512_USE_LONGJMP
599
600 {
601#if defined(Z7_COMPILER_SHA512_SUPPORTED)
602#ifdef Z7_SHA512_USE_SIMPLIFIED_PROBE
603 // simplified sha512 check for arm64:
604 const uint64x2_t a = vdupq_n_u64(1);
605 const uint64x2_t b = vsha512hq_u64(a, a, a);
606 PRF(printf("======== vsha512hq_u64 probe\n");)
607 if ((UInt32)vgetq_lane_u64(b, 0) == 0x11800002)
608#else
609 MY_ALIGN(16)
610 UInt64 temp[SHA512_NUM_DIGEST_WORDS + SHA512_NUM_BLOCK_WORDS];
611 memset(temp, 0x5a, sizeof(temp));
612 PRF(printf("======== Sha512_UpdateBlocks_HW\n");)
613 Sha512_UpdateBlocks_HW(temp,
614 (const Byte *)(const void *)(temp + SHA512_NUM_DIGEST_WORDS), 1);
615 // PRF(printf("======== t = %x\n", (UInt32)temp[0]);)
616 if ((UInt32)temp[0] == 0xa33cfdf7)
617#endif
618 {
619 PRF(printf("======== PROBE SHA512: SHA512 is supported\n");)
620 isSupported = True;
621 }
622#else // Z7_COMPILER_SHA512_SUPPORTED
623 // for debug : we generate a bad instruction or raise an exception.
624 // __except() doesn't catch raise() calls.
625#ifdef Z7_SHA512_USE_LONGJMP
626 PRF(printf("====== raise(SIGILL)\n");)
627 raise(SIGILL);
628#else
629#if defined(_MSC_VER) && defined(MY_CPU_X86)
630 __asm ud2
631#endif
632#endif // Z7_SHA512_USE_LONGJMP
633#endif // Z7_COMPILER_SHA512_SUPPORTED
634 }
635
636#ifdef Z7_SHA512_USE_LONGJMP
637 PRF(printf("====== restore signal SIGILL\n");)
638 signal(SIGILL, signal_prev);
639#elif _MSC_VER
640 __except (EXCEPTION_EXECUTE_HANDLER)
641 {
642 PRF(printf("==== CPU_IsSupported_SHA512 __except(EXCEPTION_EXECUTE_HANDLER)\n");)
643 }
644#endif
645 PRF(printf("== return (sha512 supported) = %d\n", isSupported);)
646 return isSupported;
647 }
648#else
649 // without SHA512 PROBE code
650 return True;
651#endif
652}
653
654#endif // Z7_SHA512_USE_PROBE
655#endif // defined(Z7_SHA512_PROBE_DEBUG) || defined(Z7_COMPILER_SHA512_SUPPORTED)
656
657
658void Sha512Prepare(void)
659{
660#ifdef Z7_COMPILER_SHA512_SUPPORTED
661 SHA512_FUNC_UPDATE_BLOCKS f, f_hw;
662 f = Sha512_UpdateBlocks;
663 f_hw = NULL;
664#ifdef Z7_SHA512_USE_PROBE
665 if (CPU_IsSupported_SHA512_Probe())
666#elif defined(MY_CPU_X86_OR_AMD64)
667 if (CPU_IsSupported_SHA512() && CPU_IsSupported_AVX2())
668#else
669 if (CPU_IsSupported_SHA512())
670#endif
671 {
672 // printf("\n========== HW SHA512 ======== \n");
673 f = f_hw = Sha512_UpdateBlocks_HW;
674 }
675 g_SHA512_FUNC_UPDATE_BLOCKS = f;
676 g_SHA512_FUNC_UPDATE_BLOCKS_HW = f_hw;
677#elif defined(Z7_SHA512_PROBE_DEBUG)
678 CPU_IsSupported_SHA512_Probe(); // for debug
679#endif
680}
681
682
683#undef K
684#undef S0
685#undef S1
686#undef s0
687#undef s1
688#undef Ch
689#undef Maj
690#undef W_MAIN
691#undef W_PRE
692#undef w
693#undef blk2_main
694#undef blk2
695#undef T1
696#undef T4
697#undef T8
698#undef R1_PRE
699#undef R1_MAIN
700#undef R2_MAIN
701#undef R4
702#undef R4_PRE
703#undef R4_MAIN
704#undef R8
705#undef R8_PRE
706#undef R8_MAIN
707#undef STEP_PRE
708#undef STEP_MAIN
709#undef Z7_SHA512_BIG_W
710#undef Z7_SHA512_UNROLL
711#undef Z7_COMPILER_SHA512_SUPPORTED
diff --git a/C/Sha512.h b/C/Sha512.h
new file mode 100644
index 0000000..1f3a4d1
--- /dev/null
+++ b/C/Sha512.h
@@ -0,0 +1,86 @@
1/* Sha512.h -- SHA-512 Hash
2: Igor Pavlov : Public domain */
3
4#ifndef ZIP7_INC_SHA512_H
5#define ZIP7_INC_SHA512_H
6
7#include "7zTypes.h"
8
9EXTERN_C_BEGIN
10
11#define SHA512_NUM_BLOCK_WORDS 16
12#define SHA512_NUM_DIGEST_WORDS 8
13
14#define SHA512_BLOCK_SIZE (SHA512_NUM_BLOCK_WORDS * 8)
15#define SHA512_DIGEST_SIZE (SHA512_NUM_DIGEST_WORDS * 8)
16#define SHA512_224_DIGEST_SIZE (224 / 8)
17#define SHA512_256_DIGEST_SIZE (256 / 8)
18#define SHA512_384_DIGEST_SIZE (384 / 8)
19
20typedef void (Z7_FASTCALL *SHA512_FUNC_UPDATE_BLOCKS)(UInt64 state[8], const Byte *data, size_t numBlocks);
21
22/*
23 if (the system supports different SHA512 code implementations)
24 {
25 (CSha512::func_UpdateBlocks) will be used
26 (CSha512::func_UpdateBlocks) can be set by
27 Sha512_Init() - to default (fastest)
28 Sha512_SetFunction() - to any algo
29 }
30 else
31 {
32 (CSha512::func_UpdateBlocks) is ignored.
33 }
34*/
35
36typedef struct
37{
38 union
39 {
40 struct
41 {
42 SHA512_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
43 UInt64 count;
44 } vars;
45 UInt64 _pad_64bit[8];
46 void *_pad_align_ptr[2];
47 } v;
48 UInt64 state[SHA512_NUM_DIGEST_WORDS];
49
50 Byte buffer[SHA512_BLOCK_SIZE];
51} CSha512;
52
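
A note on the union in CSha512 above: _pad_64bit[8] forces the v block to 64 bytes regardless of pointer size, so on common ABIs (an assumption, not guaranteed by the header) the whole context is 64 + 64 + 128 = 256 bytes and state/buffer land on cache-friendly 64-byte offsets:

/* hypothetical compile-time check under that assumption: */
typedef char z7_sha512_ctx_size_check[(sizeof(CSha512) == 256) ? 1 : -1];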
53
54#define SHA512_ALGO_DEFAULT 0
55#define SHA512_ALGO_SW 1
56#define SHA512_ALGO_HW 2
57
58/*
59Sha512_SetFunction()
60return:
61 0 - (algo) value is not supported, and func_UpdateBlocks was not changed
62 1 - func_UpdateBlocks was set according (algo) value.
63*/
64
65BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo);
66// we support only these (digestSize) values: 224/8, 256/8, 384/8, 512/8
67void Sha512_InitState(CSha512 *p, unsigned digestSize);
68void Sha512_Init(CSha512 *p, unsigned digestSize);
69void Sha512_Update(CSha512 *p, const Byte *data, size_t size);
70void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize);
71
72
73
74
75// void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks);
76
77/*
78call Sha512Prepare() once at program start.
79It prepares all supported implementations, and detects the fastest implementation.
80*/
81
82void Sha512Prepare(void);
83
84EXTERN_C_END
85
86#endif
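
A minimal end-to-end sketch of this API (hedged: no error handling, and the data source is a placeholder):

#include "Sha512.h"

static void Hash_Sha512(const Byte *data, size_t size, Byte digest[SHA512_DIGEST_SIZE])
{
  CSha512 ctx;
  Sha512_Init(&ctx, SHA512_DIGEST_SIZE);   /* 64; 28/32/48 select the truncated variants */
  Sha512_Update(&ctx, data, size);         /* can be called repeatedly for streaming input */
  Sha512_Final(&ctx, digest, SHA512_DIGEST_SIZE);
}
/* call Sha512Prepare() once at program start so that HW detection has run */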
diff --git a/C/Sha512Opt.c b/C/Sha512Opt.c
new file mode 100644
index 0000000..3a13868
--- /dev/null
+++ b/C/Sha512Opt.c
@@ -0,0 +1,395 @@
1/* Sha512Opt.c -- SHA-512 optimized code for SHA-512 hardware instructions
2: Igor Pavlov : Public domain */
3
4#include "Precomp.h"
5#include "Compiler.h"
6#include "CpuArch.h"
7
8// #define Z7_USE_HW_SHA_STUB // for debug
9#ifdef MY_CPU_X86_OR_AMD64
10 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) // fix it
11 #define USE_HW_SHA
12 #elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \
13 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \
14 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000)
15 #define USE_HW_SHA
16 #if !defined(__INTEL_COMPILER)
17 // icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
18 #if !defined(__SHA512__) || !defined(__AVX2__)
19 #define ATTRIB_SHA512 __attribute__((__target__("sha512,avx2")))
20 #endif
21 #endif
22 #elif defined(Z7_MSC_VER_ORIGINAL)
23 #if (_MSC_VER >= 1940)
24 #define USE_HW_SHA
25 #else
26 // #define Z7_USE_HW_SHA_STUB
27 #endif
28 #endif
29// #endif // MY_CPU_X86_OR_AMD64
30#ifndef USE_HW_SHA
31 // #define Z7_USE_HW_SHA_STUB // for debug
32#endif
33
34#ifdef USE_HW_SHA
35
36// #pragma message("Sha512 HW")
37
38#include <immintrin.h>
39
40#if defined (__clang__) && defined(_MSC_VER)
41 #if !defined(__AVX__)
42 #include <avxintrin.h>
43 #endif
44 #if !defined(__AVX2__)
45 #include <avx2intrin.h>
46 #endif
47 #if !defined(__SHA512__)
48 #include <sha512intrin.h>
49 #endif
50#else
51
52#endif
53
54/*
55SHA512 uses:
56AVX:
57 _mm256_loadu_si256 (vmovdqu)
58 _mm256_storeu_si256
59 _mm256_set_epi32 (unused)
60AVX2:
61 _mm256_add_epi64 : vpaddq
62 _mm256_shuffle_epi8 : vpshufb
63 _mm256_shuffle_epi32 : pshufd
64 _mm256_blend_epi32 : vpblendd
65 _mm256_permute4x64_epi64 : vpermq : 3c
66 _mm256_permute2x128_si256: vperm2i128 : 3c
67 _mm256_extracti128_si256 : vextracti128 : 3c
68SHA512:
69 _mm256_sha512*
70*/
71
72// The K array must be aligned to at least 32 bytes.
73// The compiler can look at the align attribute and select
74// vmovdqu - for code without align attribute
75// vmovdqa - for code with align attribute
76extern
77MY_ALIGN(64)
78const UInt64 SHA512_K_ARRAY[80];
79#define K SHA512_K_ARRAY
80
81
82#define ADD_EPI64(dest, src) dest = _mm256_add_epi64(dest, src);
83#define SHA512_MSG1(dest, src) dest = _mm256_sha512msg1_epi64(dest, _mm256_extracti128_si256(src, 0));
84#define SHA512_MSG2(dest, src) dest = _mm256_sha512msg2_epi64(dest, src);
85
86#define LOAD_SHUFFLE(m, k) \
87 m = _mm256_loadu_si256((const __m256i *)(const void *)(data + (k) * 32)); \
88 m = _mm256_shuffle_epi8(m, mask); \
89
90#define NNN(m0, m1, m2, m3)
91
92#define SM1(m1, m2, m3, m0) \
93 SHA512_MSG1(m0, m1); \
94
95#define SM2(m2, m3, m0, m1) \
96 ADD_EPI64(m0, _mm256_permute4x64_epi64(_mm256_blend_epi32(m2, m3, 3), 0x39)); \
97 SHA512_MSG2(m0, m3); \
98
99#define RND2(t0, t1, lane) \
100 t0 = _mm256_sha512rnds2_epi64(t0, t1, _mm256_extracti128_si256(msg, lane));
101
102
103
104#define R4(k, m0, m1, m2, m3, OP0, OP1) \
105 msg = _mm256_add_epi64(m0, *(const __m256i *) (const void *) &K[(k) * 4]); \
106 RND2(state0, state1, 0); OP0(m0, m1, m2, m3) \
107 RND2(state1, state0, 1); OP1(m0, m1, m2, m3) \
108
109
110
111
112#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
113 R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
114 R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
115 R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
116 R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \
117
118#define PREPARE_STATE \
119 state0 = _mm256_shuffle_epi32(state0, 0x4e); /* cdab */ \
120 state1 = _mm256_shuffle_epi32(state1, 0x4e); /* ghef */ \
121 tmp = state0; \
122 state0 = _mm256_permute2x128_si256(state0, state1, 0x13); /* cdgh */ \
123 state1 = _mm256_permute2x128_si256(tmp, state1, 2); /* abef */ \
124
125
126void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
127#ifdef ATTRIB_SHA512
128ATTRIB_SHA512
129#endif
130void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
131{
132 const __m256i mask = _mm256_set_epi32(
133 0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607,
134 0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607);
135 __m256i tmp, state0, state1;
136
137 if (numBlocks == 0)
138 return;
139
140 state0 = _mm256_loadu_si256((const __m256i *) (const void *) &state[0]);
141 state1 = _mm256_loadu_si256((const __m256i *) (const void *) &state[4]);
142
143 PREPARE_STATE
144
145 do
146 {
147 __m256i state0_save, state1_save;
148 __m256i m0, m1, m2, m3;
149 __m256i msg;
150 // #define msg tmp
151
152 state0_save = state0;
153 state1_save = state1;
154
155 LOAD_SHUFFLE (m0, 0)
156 LOAD_SHUFFLE (m1, 1)
157 LOAD_SHUFFLE (m2, 2)
158 LOAD_SHUFFLE (m3, 3)
159
160
161
162 R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
163 R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
164 R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
165 R16 ( 3, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
166 R16 ( 4, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )
167 ADD_EPI64(state0, state0_save)
168 ADD_EPI64(state1, state1_save)
169
170 data += 128;
171 }
172 while (--numBlocks);
173
174 PREPARE_STATE
175
176 _mm256_storeu_si256((__m256i *) (void *) &state[0], state0);
177 _mm256_storeu_si256((__m256i *) (void *) &state[4], state1);
178}
179
180#endif // USE_HW_SHA
181
182// gcc 8.5 also supports sha512, but we also need support in the assembler that is invoked by gcc
183#elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE)
184
185 #if defined(__ARM_FEATURE_SHA512)
186 #define USE_HW_SHA
187 #else
188 #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \
189 || defined(__GNUC__) && (__GNUC__ >= 9) \
190 ) \
191 || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it
192 #define USE_HW_SHA
193 #endif
194 #endif
195
196#ifdef USE_HW_SHA
197
198// #pragma message("=== Sha512 HW === ")
199
200
201#if defined(__clang__) || defined(__GNUC__)
202#if !defined(__ARM_FEATURE_SHA512)
203// #pragma message("=== we define SHA3 ATTRIB_SHA512 === ")
204#if defined(__clang__)
205 #define ATTRIB_SHA512 __attribute__((__target__("sha3"))) // "armv8.2-a,sha3"
206#else
207 #define ATTRIB_SHA512 __attribute__((__target__("arch=armv8.2-a+sha3")))
208#endif
209#endif
210#endif
211
212
213#if defined(Z7_MSC_VER_ORIGINAL)
214#include <arm64_neon.h>
215#else
216
217#if defined(__clang__) && __clang_major__ < 16
218#if !defined(__ARM_FEATURE_SHA512)
219// #pragma message("=== we set __ARM_FEATURE_SHA512 1 === ")
220 Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
221 #define Z7_ARM_FEATURE_SHA512_WAS_SET 1
222 #define __ARM_FEATURE_SHA512 1
223 Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
224#endif
225#endif // clang
226
227#include <arm_neon.h>
228
229#if defined(Z7_ARM_FEATURE_SHA512_WAS_SET) && \
230 defined(__ARM_FEATURE_SHA512)
231 Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
232 #undef __ARM_FEATURE_SHA512
233 #undef Z7_ARM_FEATURE_SHA512_WAS_SET
234 Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
235// #pragma message("=== we undefine __ARM_FEATURE_SHA512 === ")
236#endif
237
238#endif // Z7_MSC_VER_ORIGINAL
239
240typedef uint64x2_t v128_64;
241// typedef __n128 v128_64; // MSVC
242
243#ifdef MY_CPU_BE
244 #define MY_rev64_for_LE(x) x
245#else
246 #define MY_rev64_for_LE(x) vrev64q_u8(x)
247#endif
248
249#define LOAD_128_64(_p) vld1q_u64(_p)
250#define LOAD_128_8(_p) vld1q_u8 (_p)
251#define STORE_128_64(_p, _v) vst1q_u64(_p, _v)
252
253#define LOAD_SHUFFLE(m, k) \
254 m = vreinterpretq_u64_u8( \
255 MY_rev64_for_LE( \
256 LOAD_128_8(data + (k) * 16))); \
257
258// The K array must be aligned to at least 16 bytes.
259extern
260MY_ALIGN(64)
261const UInt64 SHA512_K_ARRAY[80];
262#define K SHA512_K_ARRAY
263
264#define NN(m0, m1, m4, m5, m7)
265#define SM(m0, m1, m4, m5, m7) \
266 m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1));
267
268#define R2(k, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP) \
269 OP(m0, m1, m4, m5, m7) \
270 t = vaddq_u64(m0, vld1q_u64(k)); \
271 t = vaddq_u64(vextq_u64(t, t, 1), a3); \
272 t = vsha512hq_u64(t, vextq_u64(a2, a3, 1), vextq_u64(a1, a2, 1)); \
273 a3 = vsha512h2q_u64(t, a1, a0); \
274 a1 = vaddq_u64(a1, t); \
275
276#define R8(k, m0,m1,m2,m3,m4,m5,m6,m7, OP) \
277 R2 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP ) \
278 R2 ( (k)+1*2, m1,m2,m3,m4,m5,m6,m7,m0, a3,a0,a1,a2, OP ) \
279 R2 ( (k)+2*2, m2,m3,m4,m5,m6,m7,m0,m1, a2,a3,a0,a1, OP ) \
280 R2 ( (k)+3*2, m3,m4,m5,m6,m7,m0,m1,m2, a1,a2,a3,a0, OP ) \
281
282#define R16(k, OP) \
283 R8 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, OP ) \
284 R8 ( (k)+4*2, m4,m5,m6,m7,m0,m1,m2,m3, OP ) \
285
286
287void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
288#ifdef ATTRIB_SHA512
289ATTRIB_SHA512
290#endif
291void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
292{
293 v128_64 a0, a1, a2, a3;
294
295 if (numBlocks == 0)
296 return;
297 a0 = LOAD_128_64(&state[0]);
298 a1 = LOAD_128_64(&state[2]);
299 a2 = LOAD_128_64(&state[4]);
300 a3 = LOAD_128_64(&state[6]);
301 do
302 {
303 v128_64 a0_save, a1_save, a2_save, a3_save;
304 v128_64 m0, m1, m2, m3, m4, m5, m6, m7;
305 v128_64 t;
306 unsigned i;
307 const UInt64 *k_ptr;
308
309 LOAD_SHUFFLE (m0, 0)
310 LOAD_SHUFFLE (m1, 1)
311 LOAD_SHUFFLE (m2, 2)
312 LOAD_SHUFFLE (m3, 3)
313 LOAD_SHUFFLE (m4, 4)
314 LOAD_SHUFFLE (m5, 5)
315 LOAD_SHUFFLE (m6, 6)
316 LOAD_SHUFFLE (m7, 7)
317
318 a0_save = a0;
319 a1_save = a1;
320 a2_save = a2;
321 a3_save = a3;
322
323 R16 ( K, NN )
324 k_ptr = K + 16;
325 for (i = 0; i < 4; i++)
326 {
327 R16 ( k_ptr, SM )
328 k_ptr += 16;
329 }
330
331 a0 = vaddq_u64(a0, a0_save);
332 a1 = vaddq_u64(a1, a1_save);
333 a2 = vaddq_u64(a2, a2_save);
334 a3 = vaddq_u64(a3, a3_save);
335
336 data += 128;
337 }
338 while (--numBlocks);
339
340 STORE_128_64(&state[0], a0);
341 STORE_128_64(&state[2], a1);
342 STORE_128_64(&state[4], a2);
343 STORE_128_64(&state[6], a3);
344}
345
346#endif // USE_HW_SHA
347
348#endif // MY_CPU_ARM_OR_ARM64
349
350
351#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
352// #error Stop_Compiling_UNSUPPORTED_SHA
353// #include <stdlib.h>
354// We can compile this file with another C compiler,
355// or we can compile an asm version.
356// So we can generate real code instead of this stub function.
357// #include "Sha512.h"
358// #if defined(_MSC_VER)
359#pragma message("Sha512 HW-SW stub was used")
360// #endif
361void Z7_FASTCALL Sha512_UpdateBlocks (UInt64 state[8], const Byte *data, size_t numBlocks);
362void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
363void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
364{
365 Sha512_UpdateBlocks(state, data, numBlocks);
366 /*
367 UNUSED_VAR(state);
368 UNUSED_VAR(data);
369 UNUSED_VAR(numBlocks);
370 exit(1);
371 return;
372 */
373}
374#endif
375
376
377#undef K
378#undef RND2
379#undef MY_rev64_for_LE
380#undef NN
381#undef NNN
382#undef LOAD_128
383#undef STORE_128
384#undef LOAD_SHUFFLE
385#undef SM1
386#undef SM2
387#undef SM
388#undef R2
389#undef R4
390#undef R16
391#undef PREPARE_STATE
392#undef USE_HW_SHA
393#undef ATTRIB_SHA512
394#undef USE_VER_MIN
395#undef Z7_USE_HW_SHA_STUB
diff --git a/C/Sort.c b/C/Sort.c
index e1097e3..20e3e69 100644
--- a/C/Sort.c
+++ b/C/Sort.c
@@ -1,141 +1,268 @@
1/* Sort.c -- Sort functions 1/* Sort.c -- Sort functions
22014-04-05 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
6#include "Sort.h" 6#include "Sort.h"
7#include "CpuArch.h"
7 8
8#define HeapSortDown(p, k, size, temp) \ 9#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
9 { for (;;) { \ 10 || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \
10 size_t s = (k << 1); \ 11 )
11 if (s > size) break; \ 12// the code with prefetch is slow for small arrays on x86.
12 if (s < size && p[s + 1] > p[s]) s++; \ 13// So we disable prefetch for x86.
13 if (temp >= p[s]) break; \ 14#ifndef MY_CPU_X86
14 p[k] = p[s]; k = s; \ 15 // #pragma message("Z7_PREFETCH : __builtin_prefetch")
15 } p[k] = temp; } 16 #define Z7_PREFETCH(a) __builtin_prefetch((a))
17#endif
16 18
17void HeapSort(UInt32 *p, size_t size) 19#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200)
18{ 20
19 if (size <= 1) 21#include "7zWindows.h"
20 return; 22
21 p--; 23// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1.
22 { 24// For example, clang-cl can generate "prefetcht2" instruction for
23 size_t i = size / 2; 25// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call.
24 do 26// But we want to generate "prefetcht0" instruction.
25 { 27// So for CLANG/GCC we must use __builtin_prefetch() in code branch above
26 UInt32 temp = p[i]; 28// instead of PreFetchCacheLine() / _mm_prefetch().
27 size_t k = i; 29
28 HeapSortDown(p, k, size, temp) 30// New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call.
29 } 31// But old x86 cpus don't support "prefetcht0".
30 while (--i != 0); 32// So we will use PreFetchCacheLine(), only if we are sure that
31 } 33// generated instruction is supported by all cpus of that isa.
32 /* 34#if defined(MY_CPU_AMD64) \
33 do 35 || defined(MY_CPU_ARM64) \
34 { 36 || defined(MY_CPU_IA64)
35 size_t k = 1; 37// we need to use additional braces for (a) in PreFetchCacheLine call, because
36 UInt32 temp = p[size]; 38// PreFetchCacheLine macro doesn't use braces:
37 p[size--] = p[1]; 39// #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l)
38 HeapSortDown(p, k, size, temp) 40 // #pragma message("Z7_PREFETCH : PreFetchCacheLine")
39 } 41 #define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a))
40 while (size > 1); 42#endif
41 */ 43
42 while (size > 3) 44#endif // _WIN32
43 { 45
44 UInt32 temp = p[size]; 46
45 size_t k = (p[3] > p[2]) ? 3 : 2; 47#define PREFETCH_NO(p,k,s,size)
46 p[size--] = p[1]; 48
47 p[1] = p[k]; 49#ifndef Z7_PREFETCH
48 HeapSortDown(p, k, size, temp) 50 #define SORT_PREFETCH(p,k,s,size)
49 } 51#else
50 { 52
51 UInt32 temp = p[size]; 53// #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes
52 p[size] = p[1]; 54#define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch)
53 if (size > 2 && p[2] < temp) 55// #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch)
54 { 56
55 p[1] = p[2]; 57#if PREFETCH_LEVEL == 0
56 p[2] = temp; 58
57 } 59 #define SORT_PREFETCH(p,k,s,size)
58 else 60
59 p[1] = temp; 61#else // PREFETCH_LEVEL != 0
60 } 62
63/*
64if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
65 we prefetch one value per cache line.
66 Use it if the array is aligned to the cache line size (64 bytes)
67 or if the array is small (less than the L1 cache size).
68
69if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
70 we prefetch all cache lines that can be required.
71 it can be faster for big unaligned arrays.
72*/
73 #define USE_PREFETCH_FOR_ALIGNED_ARRAY
74
75// s == k * 2
76#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64)
77 // x86 supports (lea r1*8+offset)
78 #define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL)
79#else
80 #define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1))
81#endif
82
83#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
84 #define PREFETCH_ADD_OFFSET 0
85#else
86 // last offset that can be required in a PREFETCH_LEVEL step:
87 #define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1)
88 #define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2
89#endif
90
91#if PREFETCH_LEVEL <= 3
92
93#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
94 #define SORT_PREFETCH(p,k,s,size) \
95 { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \
96 if (s2 <= size) { \
97 Z7_PREFETCH((p + s2)); \
98 }}
99#else /* for unaligned array */
100 #define SORT_PREFETCH(p,k,s,size) \
101 { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
102 if (s2 <= size) { \
103 Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
104 Z7_PREFETCH((p + s2)); \
105 }}
106#endif
107
108#else // PREFETCH_LEVEL > 3
109
110#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
111 #define SORT_PREFETCH(p,k,s,size) \
112 { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \
113 if (s2 <= size) { \
114 Z7_PREFETCH((p + s2 - 16)); \
115 Z7_PREFETCH((p + s2)); \
116 }}
117#else /* for unaligned array */
118 #define SORT_PREFETCH(p,k,s,size) \
119 { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
120 if (s2 <= size) { \
121 Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
122 Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \
123 Z7_PREFETCH((p + s2)); \
124 }}
125#endif
126
127#endif // PREFETCH_LEVEL > 3
128#endif // PREFETCH_LEVEL != 0
129#endif // Z7_PREFETCH
130
131
132#if defined(MY_CPU_ARM64) \
133 /* || defined(MY_CPU_AMD64) */ \
134 /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */
135 // we want to use cmov, if cmov is very fast:
136 // - this cmov version is slower for clang-x64.
137 // - this cmov version is faster for gcc-arm64 for some fast arm64 cpus.
138 #define Z7_FAST_CMOV_SUPPORTED
139#endif
140
141#ifdef Z7_FAST_CMOV_SUPPORTED
142 // we want to use cmov here, if cmov is fast: new arm64 cpus.
143 // we want the compiler to use conditional move for this branch
144 #define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1;
145#else
146 // use this branch, if cpu doesn't support fast conditional move.
147 // it uses slow array access reading:
148 #define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow;
149#endif
150
151#define HeapSortDown(p, k, size, temp, macro_prefetch) \
152{ \
153 for (;;) { \
154 UInt32 n0, n1; \
155 size_t s = k * 2; \
156 if (s >= size) { \
157 if (s == size) { \
158 n0 = p[s]; \
159 p[k] = n0; \
160 if (temp < n0) k = s; \
161 } \
162 break; \
163 } \
164 n0 = p[k * 2]; \
165 n1 = p[k * 2 + 1]; \
166 s += n0 < n1; \
167 GET_MAX_VAL(n0, n1, p[s]) \
168 if (temp >= n0) break; \
169 macro_prefetch(p, k, s, size) \
170 p[k] = n0; \
171 k = s; \
172 } \
173 p[k] = temp; \
61} 174}
62 175
63void HeapSort64(UInt64 *p, size_t size) 176
177/*
178stage-1 : O(n) :
179 we generate an intermediate partially sorted binary tree:
180 p[0] : it's an additional item for better alignment of the tree structure in memory.
181 p[1]
182 p[2] p[3]
183 p[4] p[5] p[6] p[7]
184 ...
185 p[x] >= p[x * 2]
186 p[x] >= p[x * 2 + 1]
187
188stage-2 : O(n)*log2(n):
189 we move the largest item p[0] from the head of the tree to the end of the array
190 and insert the last item into the sorted binary tree.
191*/
192
193// (p) must be aligned to the cache line size (64 bytes) for best performance
194
195void Z7_FASTCALL HeapSort(UInt32 *p, size_t size)
64{ 196{
65 if (size <= 1) 197 if (size < 2)
66 return; 198 return;
67 p--; 199 if (size == 2)
68 {
69 size_t i = size / 2;
70 do
71 {
72 UInt64 temp = p[i];
73 size_t k = i;
74 HeapSortDown(p, k, size, temp)
75 }
76 while (--i != 0);
77 }
78 /*
79 do
80 { 200 {
81 size_t k = 1; 201 const UInt32 a0 = p[0];
82 UInt64 temp = p[size]; 202 const UInt32 a1 = p[1];
83 p[size--] = p[1]; 203 const unsigned k = a1 < a0;
84 HeapSortDown(p, k, size, temp) 204 p[k] = a0;
85 } 205 p[k ^ 1] = a1;
86 while (size > 1); 206 return;
87 */
88 while (size > 3)
89 {
90 UInt64 temp = p[size];
91 size_t k = (p[3] > p[2]) ? 3 : 2;
92 p[size--] = p[1];
93 p[1] = p[k];
94 HeapSortDown(p, k, size, temp)
95 } 207 }
96 { 208 {
97 UInt64 temp = p[size]; 209 // stage-1 : O(n)
98 p[size] = p[1]; 210 // we transform array to partially sorted binary tree.
99 if (size > 2 && p[2] < temp) 211 size_t i = --size / 2;
212 // (size) is now the index of the last item in the tree.
213 // if (i)
100 { 214 {
101 p[1] = p[2]; 215 do
102 p[2] = temp; 216 {
217 const UInt32 temp = p[i];
218 size_t k = i;
219 HeapSortDown(p, k, size, temp, PREFETCH_NO)
220 }
221 while (--i);
222 }
223 {
224 const UInt32 temp = p[0];
225 const UInt32 a1 = p[1];
226 if (temp < a1)
227 {
228 size_t k = 1;
229 p[0] = a1;
230 HeapSortDown(p, k, size, temp, PREFETCH_NO)
231 }
103 } 232 }
104 else
105 p[1] = temp;
106 } 233 }
107}
108 234
109/* 235 if (size < 3)
110#define HeapSortRefDown(p, vals, n, size, temp) \ 236 {
111 { size_t k = n; UInt32 val = vals[temp]; for (;;) { \ 237 // size == 2
112 size_t s = (k << 1); \ 238 const UInt32 a0 = p[0];
113 if (s > size) break; \ 239 p[0] = p[2];
114 if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \ 240 p[2] = a0;
115 if (val >= vals[p[s]]) break; \
116 p[k] = p[s]; k = s; \
117 } p[k] = temp; }
118
119void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size)
120{
121 if (size <= 1)
122 return; 241 return;
123 p--; 242 }
243 if (size != 3)
124 { 244 {
125 size_t i = size / 2; 245 // stage-2 : O(size) * log2(size):
246 // we move the largest item p[0] from the head to the end of the array,
247 // and insert the last item into the sorted binary tree.
126 do 248 do
127 { 249 {
128 UInt32 temp = p[i]; 250 const UInt32 temp = p[size];
129 HeapSortRefDown(p, vals, i, size, temp); 251 size_t k = p[2] < p[3] ? 3 : 2;
252 p[size--] = p[0];
253 p[0] = p[1];
254 p[1] = p[k];
255 HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO
130 } 256 }
131 while (--i != 0); 257 while (size != 3);
132 } 258 }
133 do
134 { 259 {
135 UInt32 temp = p[size]; 260 const UInt32 a2 = p[2];
136 p[size--] = p[1]; 261 const UInt32 a3 = p[3];
137 HeapSortRefDown(p, vals, 1, size, temp); 262 const size_t k = a2 < a3;
263 p[2] = p[1];
264 p[3] = p[0];
265 p[k] = a3;
266 p[k ^ 1] = a2;
138 } 267 }
139 while (size > 1);
140} 268}
141*/
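
One reading of the prefetch distance above (an interpretation of the macros, not a measured claim): with the default PREFETCH_LEVEL 3 and aligned-array mode, SORT_PREFETCH touches p[k << 4], a descendant four levels below node k, so the cache line is requested several HeapSortDown iterations before it is compared:

/* PREFETCH_LEVEL == 3, PREFETCH_ADD_OFFSET == 0:
   for k == 100 the macro prefetches p[1600] while the loop still compares
   p[200] / p[201]; 16 UInt32 items == one 64-byte cache line per step. */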
diff --git a/C/Sort.h b/C/Sort.h
index 1817b65..de5a4e8 100644
--- a/C/Sort.h
+++ b/C/Sort.h
@@ -1,5 +1,5 @@
1/* Sort.h -- Sort functions 1/* Sort.h -- Sort functions
22023-03-05 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_SORT_H 4#ifndef ZIP7_INC_SORT_H
5#define ZIP7_INC_SORT_H 5#define ZIP7_INC_SORT_H
@@ -8,10 +8,7 @@
8 8
9EXTERN_C_BEGIN 9EXTERN_C_BEGIN
10 10
11void HeapSort(UInt32 *p, size_t size); 11void Z7_FASTCALL HeapSort(UInt32 *p, size_t size);
12void HeapSort64(UInt64 *p, size_t size);
13
14/* void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size); */
15 12
16EXTERN_C_END 13EXTERN_C_END
17 14
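
A small usage sketch for the remaining entry point (the UInt64 variant was removed above):

/* sort 8 values in place, ascending */
UInt32 a[8] = { 5, 1, 9, 3, 7, 2, 8, 6 };
HeapSort(a, 8);   /* a becomes { 1, 2, 3, 5, 6, 7, 8, 9 } */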
diff --git a/C/Threads.c b/C/Threads.c
index 464efec..177d1d9 100644
--- a/C/Threads.c
+++ b/C/Threads.c
@@ -1,5 +1,5 @@
1/* Threads.c -- multithreading library 1/* Threads.c -- multithreading library
22024-03-28 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -59,6 +59,100 @@ WRes Thread_Wait_Close(CThread *p)
59 return (res != 0 ? res : res2); 59 return (res != 0 ? res : res2);
60} 60}
61 61
62typedef struct MY_PROCESSOR_NUMBER {
63 WORD Group;
64 BYTE Number;
65 BYTE Reserved;
66} MY_PROCESSOR_NUMBER, *MY_PPROCESSOR_NUMBER;
67
68typedef struct MY_GROUP_AFFINITY {
69#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 100000)
70 // KAFFINITY is not defined in old mingw
71 ULONG_PTR
72#else
73 KAFFINITY
74#endif
75 Mask;
76 WORD Group;
77 WORD Reserved[3];
78} MY_GROUP_AFFINITY, *MY_PGROUP_AFFINITY;
79
80typedef BOOL (WINAPI *Func_SetThreadGroupAffinity)(
81 HANDLE hThread,
82 CONST MY_GROUP_AFFINITY *GroupAffinity,
83 MY_PGROUP_AFFINITY PreviousGroupAffinity);
84
85typedef BOOL (WINAPI *Func_GetThreadGroupAffinity)(
86 HANDLE hThread,
87 MY_PGROUP_AFFINITY GroupAffinity);
88
89typedef BOOL (WINAPI *Func_GetProcessGroupAffinity)(
90 HANDLE hProcess,
91 PUSHORT GroupCount,
92 PUSHORT GroupArray);
93
94Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
95
96#if 0
97#include <stdio.h>
98#define PRF(x) x
99/*
100--
101 before the call of SetThreadGroupAffinity(),
102 GetProcessGroupAffinity() returns one group.
103 after the call of SetThreadGroupAffinity(),
104 GetProcessGroupAffinity() returns more than one group,
105 if SetThreadGroupAffinity() was directed to another group.
106--
107 GetProcessAffinityMask MS DOCs:
108 {
109 If the calling process contains threads in multiple groups,
110 the function returns zero for both affinity masks.
111 }
112 but tests in win10 with 2 groups (less than 64 cores total):
113 GetProcessAffinityMask() still returns non-zero affinity masks
114 even after SetThreadGroupAffinity() calls.
115*/
116static void PrintProcess_Info(void)
117{
118 {
119 const
120 Func_GetProcessGroupAffinity fn_GetProcessGroupAffinity =
121 (Func_GetProcessGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
122 "GetProcessGroupAffinity");
123 if (fn_GetProcessGroupAffinity)
124 {
125 unsigned i;
126 USHORT GroupCounts[64];
127 USHORT GroupCount = Z7_ARRAY_SIZE(GroupCounts);
128 BOOL boolRes = fn_GetProcessGroupAffinity(GetCurrentProcess(),
129 &GroupCount, GroupCounts);
130 printf("\n====== GetProcessGroupAffinity : "
131 "boolRes=%u GroupCounts = %u :",
132 boolRes, (unsigned)GroupCount);
133 for (i = 0; i < GroupCount; i++)
134 printf(" %u", GroupCounts[i]);
135 printf("\n");
136 }
137 }
138 {
139 DWORD_PTR processAffinityMask, systemAffinityMask;
140 if (GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask))
141 {
142 PRF(printf("\n====== GetProcessAffinityMask : "
143 ": processAffinityMask=%x, systemAffinityMask=%x\n",
144 (UInt32)processAffinityMask, (UInt32)systemAffinityMask);)
145 }
146 else
147 printf("\n==GetProcessAffinityMask FAIL");
148 }
149}
150#else
151#ifndef USE_THREADS_CreateThread
152// #define PRF(x)
153#endif
154#endif
155
62WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) 156WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
63{ 157{
64 /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ 158 /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */
@@ -72,7 +166,43 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
72 166
73 unsigned threadId; 167 unsigned threadId;
74 *p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId)); 168 *p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId));
75 169
170#if 0 // 1 : for debug
171 {
172 DWORD_PTR prevMask;
173 DWORD_PTR affinity = 1 << 0;
174 prevMask = SetThreadAffinityMask(*p, (DWORD_PTR)affinity);
175 prevMask = prevMask;
176 }
177#endif
178#if 0 // 1 : for debug
179 {
180 /* win10: a new thread will be created in the same group that is assigned to the parent thread,
181 but its affinity mask will contain all allowed threads of that group,
182 even if the affinity mask of the parent group is not full.
183 win11: in what group will the thread be created, if we have set
184 the affinity of the parent thread with ThreadGroupAffinity?
185 */
186 const
187 Func_GetThreadGroupAffinity fn =
188 (Func_GetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
189 "GetThreadGroupAffinity");
190 if (fn)
191 {
192 // BOOL wres2;
193 MY_GROUP_AFFINITY groupAffinity;
194 memset(&groupAffinity, 0, sizeof(groupAffinity));
195 /* wres2 = */ fn(*p, &groupAffinity);
196 PRF(printf("\n==Thread_Create cur = %6u GetThreadGroupAffinity(): "
197 "wres2_BOOL = %u, group=%u mask=%x\n",
198 GetCurrentThreadId(),
199 wres2,
200 groupAffinity.Group,
201 (UInt32)groupAffinity.Mask);)
202 }
203 }
204#endif
205
76 #endif 206 #endif
77 207
78 /* maybe we must use errno here, but probably GetLastError() is also OK. */ 208 /* maybe we must use errno here, but probably GetLastError() is also OK. */
@@ -110,7 +240,84 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param
110 */ 240 */
111 } 241 }
112 { 242 {
113 DWORD prevSuspendCount = ResumeThread(h); 243 const DWORD prevSuspendCount = ResumeThread(h);
244 /* ResumeThread() returns:
245 0 : was_not_suspended
246 1 : was_resumed
247 -1 : error
248 */
249 if (prevSuspendCount == (DWORD)-1)
250 wres = GetError();
251 }
252 }
253
254 /* maybe we must use errno here, but probably GetLastError() is also OK. */
255 return wres;
256
257 #endif
258}
259
260
261WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask)
262{
263#ifdef USE_THREADS_CreateThread
264
265 UNUSED_VAR(group)
266 UNUSED_VAR(affinityMask)
267 return Thread_Create(p, func, param);
268
269#else
270
271 /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */
272 HANDLE h;
273 WRes wres;
274 unsigned threadId;
275 h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId));
276 *p = h;
277 wres = HandleToWRes(h);
278 if (h)
279 {
280 // PrintProcess_Info();
281 {
282 const
283 Func_SetThreadGroupAffinity fn =
284 (Func_SetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
285 "SetThreadGroupAffinity");
286 if (fn)
287 {
288 // WRes wres2;
289 MY_GROUP_AFFINITY groupAffinity, prev_groupAffinity;
290 memset(&groupAffinity, 0, sizeof(groupAffinity));
291 // groupAffinity.Mask must use only bits that are supported by the current group
292 // (groupAffinity.Mask = 0) means all allowed bits
293 groupAffinity.Mask = affinityMask;
294 groupAffinity.Group = (WORD)group;
295 // wres2 =
296 fn(h, &groupAffinity, &prev_groupAffinity);
297 /*
298 if (groupAffinity.Group == prev_groupAffinity.Group)
299 wres2 = wres2;
300 else
301 wres2 = wres2;
302 if (wres2 == 0)
303 {
304 wres2 = GetError();
305 PRF(printf("\n==SetThreadGroupAffinity error: %u\n", wres2);)
306 }
307 else
308 {
309 PRF(printf("\n==Thread_Create_With_Group::SetThreadGroupAffinity()"
310 " threadId = %6u"
311 " group=%u mask=%x\n",
312 threadId,
313 prev_groupAffinity.Group,
314 (UInt32)prev_groupAffinity.Mask);)
315 }
316 */
317 }
318 }
319 {
320 const DWORD prevSuspendCount = ResumeThread(h);
114 /* ResumeThread() returns: 321 /* ResumeThread() returns:
115 0 : was_not_suspended 322 0 : was_not_suspended
116 1 : was_resumed 323 1 : was_resumed
@@ -297,6 +504,13 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
297 return Thread_Create_With_CpuSet(p, func, param, NULL); 504 return Thread_Create_With_CpuSet(p, func, param, NULL);
298} 505}
299 506
507/*
508WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinity)
509{
510 UNUSED_VAR(group)
511 return Thread_Create_With_Affinity(p, func, param, affinity);
512}
513*/
300 514
301WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) 515WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity)
302{ 516{
@@ -577,5 +791,22 @@ WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p)
577 return AutoResetEvent_CreateNotSignaled(p); 791 return AutoResetEvent_CreateNotSignaled(p);
578} 792}
579 793
794void ThreadNextGroup_Init(CThreadNextGroup *p, UInt32 numGroups, UInt32 startGroup)
795{
796 // printf("\n====== ThreadNextGroup_Init numGroups = %x: startGroup=%x\n", numGroups, startGroup);
797 if (numGroups == 0)
798 numGroups = 1;
799 p->NumGroups = numGroups;
800 p->NextGroup = startGroup % numGroups;
801}
802
803
804UInt32 ThreadNextGroup_GetNext(CThreadNextGroup *p)
805{
806 const UInt32 next = p->NextGroup;
807 p->NextGroup = (next + 1) % p->NumGroups;
808 return next;
809}
810
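The two helpers above implement a plain round-robin over processor groups; a sketch of the intended call pattern (the thread handle, func and param in the final comment are placeholders):

CThreadNextGroup ng;
UInt32 group;
ThreadNextGroup_Init(&ng, 2, 0);        /* 2 groups, start at group 0 */
group = ThreadNextGroup_GetNext(&ng);   /* returns 0, then 1, then 0, ... */
/* pass (group) to Thread_Create_With_Group(); mask 0 means all allowed bits */
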
580#undef PRF 811#undef PRF
581#undef Print 812#undef Print
diff --git a/C/Threads.h b/C/Threads.h
index c1484a2..be12e6e 100644
--- a/C/Threads.h
+++ b/C/Threads.h
@@ -1,5 +1,5 @@
1/* Threads.h -- multithreading library 1/* Threads.h -- multithreading library
22024-03-28 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_THREADS_H 4#ifndef ZIP7_INC_THREADS_H
5#define ZIP7_INC_THREADS_H 5#define ZIP7_INC_THREADS_H
@@ -140,12 +140,22 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param
140WRes Thread_Wait_Close(CThread *p); 140WRes Thread_Wait_Close(CThread *p);
141 141
142#ifdef _WIN32 142#ifdef _WIN32
143WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask);
143#define Thread_Create_With_CpuSet(p, func, param, cs) \ 144#define Thread_Create_With_CpuSet(p, func, param, cs) \
144 Thread_Create_With_Affinity(p, func, param, *cs) 145 Thread_Create_With_Affinity(p, func, param, *cs)
145#else 146#else
146WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet); 147WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet);
147#endif 148#endif
148 149
150typedef struct
151{
152 unsigned NumGroups;
153 unsigned NextGroup;
154} CThreadNextGroup;
155
156void ThreadNextGroup_Init(CThreadNextGroup *p, unsigned numGroups, unsigned startGroup);
157unsigned ThreadNextGroup_GetNext(CThreadNextGroup *p);
158
149 159
150#ifdef _WIN32 160#ifdef _WIN32
151 161
diff --git a/C/Util/Lzma/LzmaUtil.dsp b/C/Util/Lzma/LzmaUtil.dsp
index e2e7d42..71de950 100644
--- a/C/Util/Lzma/LzmaUtil.dsp
+++ b/C/Util/Lzma/LzmaUtil.dsp
@@ -122,6 +122,10 @@ SOURCE=..\..\Compiler.h
122# End Source File 122# End Source File
123# Begin Source File 123# Begin Source File
124 124
125SOURCE=..\..\CpuArch.c
126# End Source File
127# Begin Source File
128
125SOURCE=..\..\CpuArch.h 129SOURCE=..\..\CpuArch.h
126# End Source File 130# End Source File
127# Begin Source File 131# Begin Source File
diff --git a/C/Util/LzmaLib/LzmaLib.dsp b/C/Util/LzmaLib/LzmaLib.dsp
index bacd967..f413137 100644
--- a/C/Util/LzmaLib/LzmaLib.dsp
+++ b/C/Util/LzmaLib/LzmaLib.dsp
@@ -43,7 +43,7 @@ RSC=rc.exe
43# PROP Ignore_Export_Lib 0 43# PROP Ignore_Export_Lib 0
44# PROP Target_Dir "" 44# PROP Target_Dir ""
45# ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /c 45# ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /c
46# ADD CPP /nologo /Gr /MT /W3 /O2 /D "NDEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /FD /c 46# ADD CPP /nologo /Gr /MT /W4 /WX /O2 /D "NDEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /FD /c
47# SUBTRACT CPP /YX 47# SUBTRACT CPP /YX
48# ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32 48# ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32
49# ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32 49# ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32
@@ -71,7 +71,7 @@ LINK32=link.exe
71# PROP Ignore_Export_Lib 0 71# PROP Ignore_Export_Lib 0
72# PROP Target_Dir "" 72# PROP Target_Dir ""
73# ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /GZ /c 73# ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /GZ /c
74# ADD CPP /nologo /MTd /W3 /Gm /ZI /Od /D "_DEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /D "COMPRESS_MF_MT" /FD /GZ /c 74# ADD CPP /nologo /MTd /W4 /WX /Gm /ZI /Od /D "_DEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /D "COMPRESS_MF_MT" /FD /GZ /c
75# SUBTRACT CPP /YX 75# SUBTRACT CPP /YX
76# ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32 76# ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32
77# ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32 77# ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32
@@ -128,6 +128,10 @@ SOURCE=..\..\Compiler.h
128# End Source File 128# End Source File
129# Begin Source File 129# Begin Source File
130 130
131SOURCE=..\..\CpuArch.c
132# End Source File
133# Begin Source File
134
131SOURCE=..\..\CpuArch.h 135SOURCE=..\..\CpuArch.h
132# End Source File 136# End Source File
133# Begin Source File 137# Begin Source File
diff --git a/C/Xz.h b/C/Xz.h
index 42bc685..ad63b48 100644
--- a/C/Xz.h
+++ b/C/Xz.h
@@ -1,5 +1,5 @@
1/* Xz.h - Xz interface 1/* Xz.h - Xz interface
22024-01-26 : Igor Pavlov : Public domain */ 2Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_XZ_H 4#ifndef ZIP7_INC_XZ_H
5#define ZIP7_INC_XZ_H 5#define ZIP7_INC_XZ_H
@@ -121,6 +121,7 @@ typedef struct
121 UInt64 startOffset; 121 UInt64 startOffset;
122} CXzStream; 122} CXzStream;
123 123
124#define Xz_CONSTRUCT(p) { (p)->numBlocks = 0; (p)->blocks = NULL; (p)->flags = 0; }
124void Xz_Construct(CXzStream *p); 125void Xz_Construct(CXzStream *p);
125void Xz_Free(CXzStream *p, ISzAllocPtr alloc); 126void Xz_Free(CXzStream *p, ISzAllocPtr alloc);
126 127
@@ -136,8 +137,13 @@ typedef struct
136 CXzStream *streams; 137 CXzStream *streams;
137} CXzs; 138} CXzs;
138 139
140#define Xzs_CONSTRUCT(p) { (p)->num = 0; (p)->numAllocated = 0; (p)->streams = NULL; }
139void Xzs_Construct(CXzs *p); 141void Xzs_Construct(CXzs *p);
140void Xzs_Free(CXzs *p, ISzAllocPtr alloc); 142void Xzs_Free(CXzs *p, ISzAllocPtr alloc);
143/*
144Xzs_ReadBackward() must be called for an empty CXzs object.
145Xzs_ReadBackward() can return a non-empty object with (p->num != 0) even in case of error.
146*/
141SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc); 147SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc);
142 148
143UInt64 Xzs_GetNumBlocks(const CXzs *p); 149UInt64 Xzs_GetNumBlocks(const CXzs *p);
@@ -268,8 +274,8 @@ typedef struct
268 size_t outBufSize; 274 size_t outBufSize;
269 size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked 275 size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked
270 276
271 Byte shaDigest[SHA256_DIGEST_SIZE]; 277 UInt32 shaDigest32[SHA256_DIGEST_SIZE / 4];
272 Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; 278 Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; // it must be aligned to 4 bytes
273} CXzUnpacker; 279} CXzUnpacker;
274 280
275/* alloc : aligned for cache line allocation is better */ 281/* alloc : aligned for cache line allocation is better */
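The new header comment above pins down the Xzs_ReadBackward() contract: the CXzs object must be empty on entry, and it can come back non-empty even when the call fails. A minimal caller sketch under that contract; PrintNumBlocks and (look) are invented names, while the Xzs_* calls and types are the ones declared in this header:

#include <stdio.h>
#include "Xz.h"

/* hypothetical helper: (look) is an already-initialized ILookInStreamPtr */
static SRes PrintNumBlocks(ILookInStreamPtr look, ISzAllocPtr alloc)
{
  CXzs xzs;
  Int64 startOffset = 0;
  SRes res;
  Xzs_CONSTRUCT(&xzs)   /* Xzs_ReadBackward() must see an empty object */
  res = Xzs_ReadBackward(&xzs, look, &startOffset, NULL, alloc);
  if (res == SZ_OK)
    printf("blocks: %llu\n", (unsigned long long)Xzs_GetNumBlocks(&xzs));
  Xzs_Free(&xzs, alloc);  /* free on every path: object can be non-empty on error */
  return res;
}

Freeing unconditionally is the point of the new comment: skipping Xzs_Free() on the error path would leak whatever streams were already read.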
diff --git a/C/XzCrc64Opt.c b/C/XzCrc64Opt.c
index 0c1fc2f..6eea4a3 100644
--- a/C/XzCrc64Opt.c
+++ b/C/XzCrc64Opt.c
@@ -1,5 +1,5 @@
1/* XzCrc64Opt.c -- CRC64 calculation (optimized functions) 1/* XzCrc64Opt.c -- CRC64 calculation (optimized functions)
22023-12-08 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -235,7 +235,7 @@ CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE)
235 v = Q32BE(1, w1) ^ Q32BE(0, w0); 235 v = Q32BE(1, w1) ^ Q32BE(0, w0);
236 v ^= Q32BE(3, d1) ^ Q32BE(2, d0); 236 v ^= Q32BE(3, d1) ^ Q32BE(2, d0);
237 #endif 237 #endif
238#elif 238#else
239#error Stop_Compiling_Bad_CRC64_NUM_TABLES 239#error Stop_Compiling_Bad_CRC64_NUM_TABLES
240#endif 240#endif
241 p += Z7_CRC64_NUM_TABLES_USE; 241 p += Z7_CRC64_NUM_TABLES_USE;
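The one-line change above (#elif to #else) fixes a preprocessor bug: the grammar of #elif requires a constant expression, so a bare #elif is ill-formed, and when no earlier branch of the chain matched it cannot select the #error line either. A condensed illustration with invented condition values:

#define NUM_TABLES 4        /* invented value for the illustration */
#if NUM_TABLES == 1
  /* scalar variant */
#elif NUM_TABLES == 4
  /* 4-table variant */
#else                       /* a bare "#elif" here would be ill-formed */
#error Stop_Compiling_Bad_NUM_TABLES
#endif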
diff --git a/C/XzDec.c b/C/XzDec.c
index 3d1c98e..2dac324 100644
--- a/C/XzDec.c
+++ b/C/XzDec.c
@@ -1,5 +1,5 @@
1/* XzDec.c -- Xz Decode 1/* XzDec.c -- Xz Decode
22024-03-01 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -59,7 +59,7 @@ unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value)
59 59
60 for (i = 0; i < limit;) 60 for (i = 0; i < limit;)
61 { 61 {
62 Byte b = p[i]; 62 const unsigned b = p[i];
63 *value |= (UInt64)(b & 0x7F) << (7 * i++); 63 *value |= (UInt64)(b & 0x7F) << (7 * i++);
64 if ((b & 0x80) == 0) 64 if ((b & 0x80) == 0)
65 return (b == 0 && i != 1) ? 0 : i; 65 return (b == 0 && i != 1) ? 0 : i;
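For context, the loop above implements the xz variable-length integer encoding: seven payload bits per byte, least-significant group first, with bit 0x80 set on every byte except the last, and a trailing zero byte rejected as a non-minimal encoding. A self-contained sketch of the same logic; DecodeVarInt is an invented name, and the 9-byte cap is an assumption matching the format's 63-bit limit:

#include <stddef.h>

/* mirrors the contract of Xz_ReadVarInt: returns bytes consumed, or 0 on error */
static unsigned DecodeVarInt(const unsigned char *p, size_t maxSize,
    unsigned long long *value)
{
  const size_t limit = maxSize < 9 ? maxSize : 9;  /* assumed 9-byte cap (63 bits) */
  size_t i;
  *value = 0;
  for (i = 0; i < limit;)
  {
    const unsigned b = p[i];
    *value |= (unsigned long long)(b & 0x7F) << (7 * i++);
    if ((b & 0x80) == 0)                            /* bit 0x80 clear: last byte */
      return (b == 0 && i != 1) ? 0 : (unsigned)i;  /* reject non-minimal form */
  }
  return 0;  /* unterminated within the limit */
}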
@@ -796,11 +796,10 @@ SRes Xz_ParseHeader(CXzStreamFlags *p, const Byte *buf)
796 796
797static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf) 797static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf)
798{ 798{
799 return indexSize == (((UInt64)GetUi32(buf + 4) + 1) << 2) 799 return indexSize == (((UInt64)GetUi32a(buf + 4) + 1) << 2)
800 && GetUi32(buf) == CrcCalc(buf + 4, 6) 800 && GetUi32a(buf) == CrcCalc(buf + 4, 6)
801 && flags == GetBe16(buf + 8) 801 && flags == GetBe16a(buf + 8)
802 && buf[10] == XZ_FOOTER_SIG_0 802 && GetUi16a(buf + 10) == (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8));
803 && buf[11] == XZ_FOOTER_SIG_1;
804} 803}
805 804
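The rewritten Xz_CheckFooter() above folds the two signature-byte tests into one aligned 16-bit read. For reference, the 12-byte footer layout it validates is: bytes 0..3 hold a CRC32 of bytes 4..9, bytes 4..7 the backward size (stored as real_size / 4 - 1), bytes 8..9 the stream flags, and bytes 10..11 the footer signature. A plain-Byte restatement of the same test, equivalent to the replaced code; CheckFooter12 is an invented name, and the getters, CrcCalc, and constants are the ones this codebase already provides:

static BoolInt CheckFooter12(const Byte *buf, UInt64 indexSize, CXzStreamFlags flags)
{
  return GetUi32(buf) == CrcCalc(buf + 4, 6)                 /* CRC32 of bytes 4..9 */
      && indexSize == (((UInt64)GetUi32(buf + 4) + 1) << 2)  /* backward size: (stored + 1) * 4 */
      && flags == GetBe16(buf + 8)                           /* stream flags */
      && buf[10] == XZ_FOOTER_SIG_0                          /* footer magic */
      && buf[11] == XZ_FOOTER_SIG_1;
}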
806#define READ_VARINT_AND_CHECK(buf, pos, size, res) \ 805#define READ_VARINT_AND_CHECK(buf, pos, size, res) \
@@ -1166,7 +1165,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1166 p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks); 1165 p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks);
1167 p->indexPos = p->indexPreSize; 1166 p->indexPos = p->indexPreSize;
1168 p->indexSize += p->indexPreSize; 1167 p->indexSize += p->indexPreSize;
1169 Sha256_Final(&p->sha, p->shaDigest); 1168 Sha256_Final(&p->sha, (Byte *)(void *)p->shaDigest32);
1170 Sha256_Init(&p->sha); 1169 Sha256_Init(&p->sha);
1171 p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize); 1170 p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize);
1172 p->state = XZ_STATE_STREAM_INDEX; 1171 p->state = XZ_STATE_STREAM_INDEX;
@@ -1241,10 +1240,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1241 break; 1240 break;
1242 } 1241 }
1243 { 1242 {
1244 Byte digest[XZ_CHECK_SIZE_MAX]; 1243 UInt32 digest32[XZ_CHECK_SIZE_MAX / 4];
1245 p->state = XZ_STATE_BLOCK_HEADER; 1244 p->state = XZ_STATE_BLOCK_HEADER;
1246 p->pos = 0; 1245 p->pos = 0;
1247 if (XzCheck_Final(&p->check, digest) && memcmp(digest, p->buf, checkSize) != 0) 1246 if (XzCheck_Final(&p->check, (void *)digest32) && memcmp(digest32, p->buf, checkSize) != 0)
1248 return SZ_ERROR_CRC; 1247 return SZ_ERROR_CRC;
1249 if (p->decodeOnlyOneBlock) 1248 if (p->decodeOnlyOneBlock)
1250 { 1249 {
@@ -1289,12 +1288,12 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1289 } 1288 }
1290 else 1289 else
1291 { 1290 {
1292 Byte digest[SHA256_DIGEST_SIZE]; 1291 UInt32 digest32[SHA256_DIGEST_SIZE / 4];
1293 p->state = XZ_STATE_STREAM_INDEX_CRC; 1292 p->state = XZ_STATE_STREAM_INDEX_CRC;
1294 p->indexSize += 4; 1293 p->indexSize += 4;
1295 p->pos = 0; 1294 p->pos = 0;
1296 Sha256_Final(&p->sha, digest); 1295 Sha256_Final(&p->sha, (void *)digest32);
1297 if (memcmp(digest, p->shaDigest, SHA256_DIGEST_SIZE) != 0) 1296 if (memcmp(digest32, p->shaDigest32, SHA256_DIGEST_SIZE) != 0)
1298 return SZ_ERROR_CRC; 1297 return SZ_ERROR_CRC;
1299 } 1298 }
1300 } 1299 }
@@ -1313,7 +1312,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1313 const Byte *ptr = p->buf; 1312 const Byte *ptr = p->buf;
1314 p->state = XZ_STATE_STREAM_FOOTER; 1313 p->state = XZ_STATE_STREAM_FOOTER;
1315 p->pos = 0; 1314 p->pos = 0;
1316 if (CRC_GET_DIGEST(p->crc) != GetUi32(ptr)) 1315 if (CRC_GET_DIGEST(p->crc) != GetUi32a(ptr))
1317 return SZ_ERROR_CRC; 1316 return SZ_ERROR_CRC;
1318 } 1317 }
1319 break; 1318 break;
@@ -1343,7 +1342,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1343 { 1342 {
1344 if (*src != 0) 1343 if (*src != 0)
1345 { 1344 {
1346 if (((UInt32)p->padSize & 3) != 0) 1345 if ((unsigned)p->padSize & 3)
1347 return SZ_ERROR_NO_ARCHIVE; 1346 return SZ_ERROR_NO_ARCHIVE;
1348 p->pos = 0; 1347 p->pos = 0;
1349 p->state = XZ_STATE_STREAM_HEADER; 1348 p->state = XZ_STATE_STREAM_HEADER;
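A pattern repeated through this file is replacing Byte digest[N] with UInt32 digest32[N / 4]: the UInt32 element type guarantees 4-byte alignment of the storage, which the aligned accessors (GetUi32a, GetBe16a) and word-wide compares rely on, while byte-oriented producers such as Sha256_Final() still receive a casted pointer. A minimal standalone sketch of the idiom with invented names:

#include <string.h>

typedef unsigned char Byte;
typedef unsigned int UInt32;
#define DIGEST_SIZE 32

/* stand-in for a byte-oriented producer such as Sha256_Final() */
static void FillDigest(Byte *dest) { memset(dest, 0xAB, DIGEST_SIZE); }

static int DigestsMatch(void)
{
  UInt32 a32[DIGEST_SIZE / 4];      /* 4-byte aligned by element type */
  UInt32 b32[DIGEST_SIZE / 4];
  FillDigest((Byte *)(void *)a32);  /* byte view of the aligned storage */
  FillDigest((Byte *)(void *)b32);
  return memcmp(a32, b32, DIGEST_SIZE) == 0;  /* compare via the aligned objects */
}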
diff --git a/C/XzEnc.c b/C/XzEnc.c
index c1affad..e40f0c8 100644
--- a/C/XzEnc.c
+++ b/C/XzEnc.c
@@ -1,5 +1,5 @@
1/* XzEnc.c -- Xz Encode 1/* XzEnc.c -- Xz Encode
22024-03-01 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -411,6 +411,7 @@ static SRes SeqInFilter_Read(ISeqInStreamPtr pp, void *data, size_t *size)
411 } 411 }
412} 412}
413 413
414Z7_FORCE_INLINE
414static void SeqInFilter_Construct(CSeqInFilter *p) 415static void SeqInFilter_Construct(CSeqInFilter *p)
415{ 416{
416 p->buf = NULL; 417 p->buf = NULL;
@@ -418,6 +419,7 @@ static void SeqInFilter_Construct(CSeqInFilter *p)
418 p->vt.Read = SeqInFilter_Read; 419 p->vt.Read = SeqInFilter_Read;
419} 420}
420 421
422Z7_FORCE_INLINE
421static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc) 423static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc)
422{ 424{
423 if (p->StateCoder.p) 425 if (p->StateCoder.p)
@@ -507,6 +509,7 @@ void XzFilterProps_Init(CXzFilterProps *p)
507void XzProps_Init(CXzProps *p) 509void XzProps_Init(CXzProps *p)
508{ 510{
509 p->checkId = XZ_CHECK_CRC32; 511 p->checkId = XZ_CHECK_CRC32;
512 p->numThreadGroups = 0;
510 p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO; 513 p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO;
511 p->numBlockThreads_Reduced = -1; 514 p->numBlockThreads_Reduced = -1;
512 p->numBlockThreads_Max = -1; 515 p->numBlockThreads_Max = -1;
@@ -689,6 +692,7 @@ typedef struct
689} CLzma2WithFilters; 692} CLzma2WithFilters;
690 693
691 694
695Z7_FORCE_INLINE
692static void Lzma2WithFilters_Construct(CLzma2WithFilters *p) 696static void Lzma2WithFilters_Construct(CLzma2WithFilters *p)
693{ 697{
694 p->lzma2 = NULL; 698 p->lzma2 = NULL;
@@ -712,6 +716,7 @@ static SRes Lzma2WithFilters_Create(CLzma2WithFilters *p, ISzAllocPtr alloc, ISz
712} 716}
713 717
714 718
719Z7_FORCE_INLINE
715static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc) 720static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc)
716{ 721{
717 #ifdef USE_SUBBLOCK 722 #ifdef USE_SUBBLOCK
@@ -1236,6 +1241,7 @@ SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr in
1236 } 1241 }
1237 1242
1238 p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max; 1243 p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max;
1244 p->mtCoder.numThreadGroups = props->numThreadGroups;
1239 p->mtCoder.expectedDataSize = p->expectedDataSize; 1245 p->mtCoder.expectedDataSize = p->expectedDataSize;
1240 1246
1241 RINOK(MtCoder_Code(&p->mtCoder)) 1247 RINOK(MtCoder_Code(&p->mtCoder))
diff --git a/C/XzEnc.h b/C/XzEnc.h
index 77b78c0..ac6bbf7 100644
--- a/C/XzEnc.h
+++ b/C/XzEnc.h
@@ -1,5 +1,5 @@
1/* XzEnc.h -- Xz Encode 1/* XzEnc.h -- Xz Encode
22023-04-13 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_XZ_ENC_H 4#ifndef ZIP7_INC_XZ_ENC_H
5#define ZIP7_INC_XZ_ENC_H 5#define ZIP7_INC_XZ_ENC_H
@@ -31,6 +31,7 @@ typedef struct
31 CLzma2EncProps lzma2Props; 31 CLzma2EncProps lzma2Props;
32 CXzFilterProps filterProps; 32 CXzFilterProps filterProps;
33 unsigned checkId; 33 unsigned checkId;
34 unsigned numThreadGroups; // 0 : no groups
34 UInt64 blockSize; 35 UInt64 blockSize;
35 int numBlockThreads_Reduced; 36 int numBlockThreads_Reduced;
36 int numBlockThreads_Max; 37 int numBlockThreads_Max;
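The new numThreadGroups field (0 : no groups) travels from CXzProps into p->mtCoder.numThreadGroups in XzEnc_Encode(), as the XzEnc.c hunk above shows. A minimal configuration sketch; SetupProps and the specific values are invented for illustration, while XzProps_Init() and the field names come from this header:

#include "XzEnc.h"

static void SetupProps(CXzProps *props)
{
  XzProps_Init(props);             /* sets numThreadGroups = 0 (no groups) */
  props->numBlockThreads_Max = 8;  /* invented value */
  props->numThreadGroups = 2;      /* spread block threads over 2 groups */
}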
diff --git a/C/XzIn.c b/C/XzIn.c
index b68af96..ba31636 100644
--- a/C/XzIn.c
+++ b/C/XzIn.c
@@ -1,38 +1,39 @@
1/* XzIn.c - Xz input 1/* XzIn.c - Xz input
22023-09-07 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
6#include <string.h> 6#include <string.h>
7 7
8#include "7zCrc.h" 8#include "7zCrc.h"
9#include "CpuArch.h"
10#include "Xz.h" 9#include "Xz.h"
10#include "CpuArch.h"
11 11
12/* 12#define XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(p) \
13#define XZ_FOOTER_SIG_CHECK(p) (memcmp((p), XZ_FOOTER_SIG, XZ_FOOTER_SIG_SIZE) == 0) 13 (GetUi16a((const Byte *)(const void *)(p) + 10) == \
14*/ 14 (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8)))
15#define XZ_FOOTER_SIG_CHECK(p) ((p)[0] == XZ_FOOTER_SIG_0 && (p)[1] == XZ_FOOTER_SIG_1)
16
17 15
18SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream) 16SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream)
19{ 17{
20 Byte sig[XZ_STREAM_HEADER_SIZE]; 18 UInt32 data32[XZ_STREAM_HEADER_SIZE / 4];
21 size_t processedSize = XZ_STREAM_HEADER_SIZE; 19 size_t processedSize = XZ_STREAM_HEADER_SIZE;
22 RINOK(SeqInStream_ReadMax(inStream, sig, &processedSize)) 20 RINOK(SeqInStream_ReadMax(inStream, data32, &processedSize))
23 if (processedSize != XZ_STREAM_HEADER_SIZE 21 if (processedSize != XZ_STREAM_HEADER_SIZE
24 || memcmp(sig, XZ_SIG, XZ_SIG_SIZE) != 0) 22 || memcmp(data32, XZ_SIG, XZ_SIG_SIZE) != 0)
25 return SZ_ERROR_NO_ARCHIVE; 23 return SZ_ERROR_NO_ARCHIVE;
26 return Xz_ParseHeader(p, sig); 24 return Xz_ParseHeader(p, (const Byte *)(const void *)data32);
27} 25}
28 26
29#define READ_VARINT_AND_CHECK(buf, pos, size, res) \ 27#define READ_VARINT_AND_CHECK(buf, size, res) \
30 { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ 28{ const unsigned s = Xz_ReadVarInt(buf, size, res); \
31 if (s == 0) return SZ_ERROR_ARCHIVE; \ 29 if (s == 0) return SZ_ERROR_ARCHIVE; \
32 pos += s; } 30 size -= s; \
31 buf += s; \
32}
33 33
34SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes) 34SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes)
35{ 35{
36 MY_ALIGN(4)
36 Byte header[XZ_BLOCK_HEADER_SIZE_MAX]; 37 Byte header[XZ_BLOCK_HEADER_SIZE_MAX];
37 unsigned headerSize; 38 unsigned headerSize;
38 *headerSizeRes = 0; 39 *headerSizeRes = 0;
@@ -57,8 +58,12 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex,
57 return XzBlock_Parse(p, header); 58 return XzBlock_Parse(p, header);
58} 59}
59 60
61
60#define ADD_SIZE_CHECK(size, val) \ 62#define ADD_SIZE_CHECK(size, val) \
61 { const UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } 63{ const UInt64 newSize = size + (val); \
64 if (newSize < size) return XZ_SIZE_OVERFLOW; \
65 size = newSize; \
66}
62 67
63UInt64 Xz_GetUnpackSize(const CXzStream *p) 68UInt64 Xz_GetUnpackSize(const CXzStream *p)
64{ 69{
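ADD_SIZE_CHECK above is only reflowed, but the guard it encodes is worth spelling out: unsigned addition wraps modulo 2^64, so overflow of size + (val) is detected by the sum coming out smaller than the original size. The same check as a standalone function with invented names:

#include <stdint.h>

/* returns 0 on overflow, 1 on success */
static int AddSizeChecked(uint64_t *size, uint64_t val)
{
  const uint64_t newSize = *size + val;  /* unsigned add wraps modulo 2^64 */
  if (newSize < *size)
    return 0;                            /* wrapped, so the sum overflowed */
  *size = newSize;                       /* committed only on success */
  return 1;
}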
@@ -82,76 +87,85 @@ UInt64 Xz_GetPackSize(const CXzStream *p)
82 return size; 87 return size;
83} 88}
84 89
85/*
86SRes XzBlock_ReadFooter(CXzBlock *p, CXzStreamFlags f, ISeqInStreamPtr inStream)
87{
88 return SeqInStream_Read(inStream, p->check, XzFlags_GetCheckSize(f));
89}
90*/
91 90
92static SRes Xz_ReadIndex2(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc) 91// input:
92// CXzStream (p) is an empty object.
93// size != 0
94// (size & 3) == 0
95// (buf) is aligned for at least 4 bytes.
96// output:
97// p->numBlocks is the number of allocated items in p->blocks
98// p->blocks[*] values must be ignored if the function returns an error.
99static SRes Xz_ParseIndex(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc)
93{ 100{
94 size_t numBlocks, pos = 1; 101 size_t numBlocks;
95 UInt32 crc;
96
97 if (size < 5 || buf[0] != 0) 102 if (size < 5 || buf[0] != 0)
98 return SZ_ERROR_ARCHIVE; 103 return SZ_ERROR_ARCHIVE;
99
100 size -= 4; 104 size -= 4;
101 crc = CrcCalc(buf, size); 105 {
102 if (crc != GetUi32(buf + size)) 106 const UInt32 crc = CrcCalc(buf, size);
103 return SZ_ERROR_ARCHIVE; 107 if (crc != GetUi32a(buf + size))
104 108 return SZ_ERROR_ARCHIVE;
109 }
110 buf++;
111 size--;
105 { 112 {
106 UInt64 numBlocks64; 113 UInt64 numBlocks64;
107 READ_VARINT_AND_CHECK(buf, pos, size, &numBlocks64) 114 READ_VARINT_AND_CHECK(buf, size, &numBlocks64)
108 numBlocks = (size_t)numBlocks64; 115 // (numBlocks64) is a 63-bit value, so we can calculate (numBlocks64 * 2):
109 if (numBlocks != numBlocks64 || numBlocks * 2 > size) 116 if (numBlocks64 * 2 > size)
110 return SZ_ERROR_ARCHIVE; 117 return SZ_ERROR_ARCHIVE;
118 if (numBlocks64 >= ((size_t)1 << (sizeof(size_t) * 8 - 1)) / sizeof(CXzBlockSizes))
119 return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
120 numBlocks = (size_t)numBlocks64;
111 } 121 }
112 122 // Xz_Free(p, alloc); // it's optional, because (p) is empty already
113 Xz_Free(p, alloc); 123 if (numBlocks)
114 if (numBlocks != 0)
115 { 124 {
116 size_t i; 125 CXzBlockSizes *blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks);
117 p->numBlocks = numBlocks; 126 if (!blocks)
118 p->blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks);
119 if (!p->blocks)
120 return SZ_ERROR_MEM; 127 return SZ_ERROR_MEM;
121 for (i = 0; i < numBlocks; i++) 128 p->blocks = blocks;
129 p->numBlocks = numBlocks;
130 // the caller will call Xz_Free() in case of error
131 do
122 { 132 {
123 CXzBlockSizes *block = &p->blocks[i]; 133 READ_VARINT_AND_CHECK(buf, size, &blocks->totalSize)
124 READ_VARINT_AND_CHECK(buf, pos, size, &block->totalSize) 134 READ_VARINT_AND_CHECK(buf, size, &blocks->unpackSize)
125 READ_VARINT_AND_CHECK(buf, pos, size, &block->unpackSize) 135 if (blocks->totalSize == 0)
126 if (block->totalSize == 0)
127 return SZ_ERROR_ARCHIVE; 136 return SZ_ERROR_ARCHIVE;
137 blocks++;
128 } 138 }
139 while (--numBlocks);
129 } 140 }
130 while ((pos & 3) != 0) 141 if (size >= 4)
131 if (buf[pos++] != 0) 142 return SZ_ERROR_ARCHIVE;
143 while (size)
144 if (buf[--size])
132 return SZ_ERROR_ARCHIVE; 145 return SZ_ERROR_ARCHIVE;
133 return (pos == size) ? SZ_OK : SZ_ERROR_ARCHIVE; 146 return SZ_OK;
134} 147}
135 148
149
150/*
136static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc) 151static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc)
137{ 152{
138 SRes res; 153 SRes res;
139 size_t size; 154 size_t size;
140 Byte *buf; 155 Byte *buf;
141 if (indexSize > ((UInt32)1 << 31)) 156 if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1)))
142 return SZ_ERROR_UNSUPPORTED; 157 return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
143 size = (size_t)indexSize; 158 size = (size_t)indexSize;
144 if (size != indexSize)
145 return SZ_ERROR_UNSUPPORTED;
146 buf = (Byte *)ISzAlloc_Alloc(alloc, size); 159 buf = (Byte *)ISzAlloc_Alloc(alloc, size);
147 if (!buf) 160 if (!buf)
148 return SZ_ERROR_MEM; 161 return SZ_ERROR_MEM;
149 res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); 162 res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED);
150 if (res == SZ_OK) 163 if (res == SZ_OK)
151 res = Xz_ReadIndex2(p, buf, size, alloc); 164 res = Xz_ParseIndex(p, buf, size, alloc);
152 ISzAlloc_Free(alloc, buf); 165 ISzAlloc_Free(alloc, buf);
153 return res; 166 return res;
154} 167}
168*/
155 169
156static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size) 170static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size)
157{ 171{
@@ -160,84 +174,102 @@ static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset,
160 /* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */ 174 /* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */
161} 175}
162 176
177
178/*
179in:
180 (*startOffset) is position in (stream) where xz_stream must be finished.
181out:
182 if returns SZ_OK, then (*startOffset) is position in stream that shows start of xz_stream.
183*/
163static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc) 184static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc)
164{ 185{
165 UInt64 indexSize; 186 #define TEMP_BUF_SIZE (1 << 10)
166 Byte buf[XZ_STREAM_FOOTER_SIZE]; 187 UInt32 buf32[TEMP_BUF_SIZE / 4];
167 UInt64 pos = (UInt64)*startOffset; 188 UInt64 pos = (UInt64)*startOffset;
168 189
169 if ((pos & 3) != 0 || pos < XZ_STREAM_FOOTER_SIZE) 190 if ((pos & 3) || pos < XZ_STREAM_FOOTER_SIZE)
170 return SZ_ERROR_NO_ARCHIVE; 191 return SZ_ERROR_NO_ARCHIVE;
171
172 pos -= XZ_STREAM_FOOTER_SIZE; 192 pos -= XZ_STREAM_FOOTER_SIZE;
173 RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)) 193 RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE))
174 194
175 if (!XZ_FOOTER_SIG_CHECK(buf + 10)) 195 if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32))
176 { 196 {
177 UInt32 total = 0;
178 pos += XZ_STREAM_FOOTER_SIZE; 197 pos += XZ_STREAM_FOOTER_SIZE;
179
180 for (;;) 198 for (;;)
181 { 199 {
182 size_t i; 200 // pos != 0
183 #define TEMP_BUF_SIZE (1 << 10) 201 // (pos & 3) == 0
184 Byte temp[TEMP_BUF_SIZE]; 202 size_t i = pos >= TEMP_BUF_SIZE ? TEMP_BUF_SIZE : (size_t)pos;
185
186 i = (pos > TEMP_BUF_SIZE) ? TEMP_BUF_SIZE : (size_t)pos;
187 pos -= i; 203 pos -= i;
188 RINOK(LookInStream_SeekRead_ForArc(stream, pos, temp, i)) 204 RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, i))
189 total += (UInt32)i; 205 i /= 4;
190 for (; i != 0; i--) 206 do
191 if (temp[i - 1] != 0) 207 if (buf32[i - 1] != 0)
192 break; 208 break;
193 if (i != 0) 209 while (--i);
194 { 210
195 if ((i & 3) != 0) 211 pos += i * 4;
196 return SZ_ERROR_NO_ARCHIVE; 212 #define XZ_STREAM_BACKWARD_READING_PAD_MAX (1 << 16)
197 pos += i; 213 // here we don't support the rare case of big padding for an xz stream.
198 break; 214 // so we have a padding limit for backward reading.
199 } 215 if ((UInt64)*startOffset - pos > XZ_STREAM_BACKWARD_READING_PAD_MAX)
200 if (pos < XZ_STREAM_FOOTER_SIZE || total > (1 << 16))
201 return SZ_ERROR_NO_ARCHIVE; 216 return SZ_ERROR_NO_ARCHIVE;
217 if (i)
218 break;
202 } 219 }
203 220 // we try to open xz stream after skipping zero padding.
221 // ((UInt64)*startOffset == pos) is possible here!
204 if (pos < XZ_STREAM_FOOTER_SIZE) 222 if (pos < XZ_STREAM_FOOTER_SIZE)
205 return SZ_ERROR_NO_ARCHIVE; 223 return SZ_ERROR_NO_ARCHIVE;
206 pos -= XZ_STREAM_FOOTER_SIZE; 224 pos -= XZ_STREAM_FOOTER_SIZE;
207 RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)) 225 RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE))
208 if (!XZ_FOOTER_SIG_CHECK(buf + 10)) 226 if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32))
209 return SZ_ERROR_NO_ARCHIVE; 227 return SZ_ERROR_NO_ARCHIVE;
210 } 228 }
211 229
212 p->flags = (CXzStreamFlags)GetBe16(buf + 8); 230 p->flags = (CXzStreamFlags)GetBe16a(buf32 + 2);
213
214 if (!XzFlags_IsSupported(p->flags)) 231 if (!XzFlags_IsSupported(p->flags))
215 return SZ_ERROR_UNSUPPORTED; 232 return SZ_ERROR_UNSUPPORTED;
216
217 { 233 {
218 /* to eliminate GCC 6.3 warning: 234 /* to eliminate GCC 6.3 warning:
219 dereferencing type-punned pointer will break strict-aliasing rules */ 235 dereferencing type-punned pointer will break strict-aliasing rules */
220 const Byte *buf_ptr = buf; 236 const UInt32 *buf_ptr = buf32;
221 if (GetUi32(buf_ptr) != CrcCalc(buf + 4, 6)) 237 if (GetUi32a(buf_ptr) != CrcCalc(buf32 + 1, 6))
222 return SZ_ERROR_ARCHIVE; 238 return SZ_ERROR_ARCHIVE;
223 } 239 }
224
225 indexSize = ((UInt64)GetUi32(buf + 4) + 1) << 2;
226
227 if (pos < indexSize)
228 return SZ_ERROR_ARCHIVE;
229
230 pos -= indexSize;
231 RINOK(LookInStream_SeekTo(stream, pos))
232 RINOK(Xz_ReadIndex(p, stream, indexSize, alloc))
233
234 { 240 {
235 UInt64 totalSize = Xz_GetPackSize(p); 241 const UInt64 indexSize = ((UInt64)GetUi32a(buf32 + 1) + 1) << 2;
236 if (totalSize == XZ_SIZE_OVERFLOW 242 if (pos < indexSize)
237 || totalSize >= ((UInt64)1 << 63)
238 || pos < totalSize + XZ_STREAM_HEADER_SIZE)
239 return SZ_ERROR_ARCHIVE; 243 return SZ_ERROR_ARCHIVE;
240 pos -= (totalSize + XZ_STREAM_HEADER_SIZE); 244 pos -= indexSize;
245 // v25.00: relaxed indexSize check. We allow big index table.
246 // if (indexSize > ((UInt32)1 << 31))
247 if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1)))
248 return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
249 RINOK(LookInStream_SeekTo(stream, pos))
250 // RINOK(Xz_ReadIndex(p, stream, indexSize, alloc))
251 {
252 SRes res;
253 const size_t size = (size_t)indexSize;
254 // if (size != indexSize) return SZ_ERROR_UNSUPPORTED;
255 Byte *buf = (Byte *)ISzAlloc_Alloc(alloc, size);
256 if (!buf)
257 return SZ_ERROR_MEM;
258 res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED);
259 if (res == SZ_OK)
260 res = Xz_ParseIndex(p, buf, size, alloc);
261 ISzAlloc_Free(alloc, buf);
262 RINOK(res)
263 }
264 }
265 {
266 UInt64 total = Xz_GetPackSize(p);
267 if (total == XZ_SIZE_OVERFLOW || total >= ((UInt64)1 << 63))
268 return SZ_ERROR_ARCHIVE;
269 total += XZ_STREAM_HEADER_SIZE;
270 if (pos < total)
271 return SZ_ERROR_ARCHIVE;
272 pos -= total;
241 RINOK(LookInStream_SeekTo(stream, pos)) 273 RINOK(LookInStream_SeekTo(stream, pos))
242 *startOffset = (Int64)pos; 274 *startOffset = (Int64)pos;
243 } 275 }
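The rewritten backward search above reads the tail in 1 KiB chunks into a UInt32 buffer, so the stream padding (zero bytes, total length a multiple of four) can be skipped a word at a time, and it gives up once more than XZ_STREAM_BACKWARD_READING_PAD_MAX (64 KiB) of padding has been crossed. A simplified in-memory version of the word-wise backward skip; the helper name and flat-buffer setting are invented:

#include <stddef.h>

typedef unsigned int UInt32;

/* (posBytes & 3) == 0 is assumed, as the xz format guarantees for padding;
   returns the byte offset just past the last non-zero 32-bit word */
static size_t SkipZeroPaddingBack(const UInt32 *data, size_t posBytes)
{
  size_t i = posBytes / 4;
  while (i != 0 && data[i - 1] == 0)  /* zero words are stream padding */
    i--;
  return i * 4;
}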
@@ -246,7 +278,6 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO
246 CSecToRead secToRead; 278 CSecToRead secToRead;
247 SecToRead_CreateVTable(&secToRead); 279 SecToRead_CreateVTable(&secToRead);
248 secToRead.realStream = stream; 280 secToRead.realStream = stream;
249
250 RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt)) 281 RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt))
251 return (p->flags == headerFlags) ? SZ_OK : SZ_ERROR_ARCHIVE; 282 return (p->flags == headerFlags) ? SZ_OK : SZ_ERROR_ARCHIVE;
252 } 283 }
@@ -257,8 +288,7 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO
257 288
258void Xzs_Construct(CXzs *p) 289void Xzs_Construct(CXzs *p)
259{ 290{
260 p->num = p->numAllocated = 0; 291 Xzs_CONSTRUCT(p)
261 p->streams = 0;
262} 292}
263 293
264void Xzs_Free(CXzs *p, ISzAllocPtr alloc) 294void Xzs_Free(CXzs *p, ISzAllocPtr alloc)
@@ -268,7 +298,7 @@ void Xzs_Free(CXzs *p, ISzAllocPtr alloc)
268 Xz_Free(&p->streams[i], alloc); 298 Xz_Free(&p->streams[i], alloc);
269 ISzAlloc_Free(alloc, p->streams); 299 ISzAlloc_Free(alloc, p->streams);
270 p->num = p->numAllocated = 0; 300 p->num = p->numAllocated = 0;
271 p->streams = 0; 301 p->streams = NULL;
272} 302}
273 303
274UInt64 Xzs_GetNumBlocks(const CXzs *p) 304UInt64 Xzs_GetNumBlocks(const CXzs *p)
@@ -307,34 +337,49 @@ UInt64 Xzs_GetPackSize(const CXzs *p)
307SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc) 337SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc)
308{ 338{
309 Int64 endOffset = 0; 339 Int64 endOffset = 0;
340 // it is assumed that the CXzs object is empty here.
341 // if the CXzs object is not empty, the function will add new streams to that non-empty object.
342 // Xzs_Free(p, alloc); // optional call to empty the CXzs object.
310 RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END)) 343 RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END))
311 *startOffset = endOffset; 344 *startOffset = endOffset;
312 for (;;) 345 for (;;)
313 { 346 {
314 CXzStream st; 347 CXzStream st;
315 SRes res; 348 SRes res;
316 Xz_Construct(&st); 349 Xz_CONSTRUCT(&st)
317 res = Xz_ReadBackward(&st, stream, startOffset, alloc); 350 res = Xz_ReadBackward(&st, stream, startOffset, alloc);
351 // if (res == SZ_OK), then (*startOffset) is the start offset of the new stream
352 // if (res != SZ_OK), then (*startOffset) is unchanged, or it is the expected start offset of the stream with the error
318 st.startOffset = (UInt64)*startOffset; 353 st.startOffset = (UInt64)*startOffset;
319 RINOK(res) 354 // we must either store the (st) object to the array, or free the local (st) object.
355 if (res != SZ_OK)
356 {
357 Xz_Free(&st, alloc);
358 return res;
359 }
320 if (p->num == p->numAllocated) 360 if (p->num == p->numAllocated)
321 { 361 {
322 const size_t newNum = p->num + p->num / 4 + 1; 362 const size_t newNum = p->num + p->num / 4 + 1;
323 void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); 363 void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream));
324 if (!data) 364 if (!data)
365 {
366 Xz_Free(&st, alloc);
325 return SZ_ERROR_MEM; 367 return SZ_ERROR_MEM;
368 }
326 p->numAllocated = newNum; 369 p->numAllocated = newNum;
327 if (p->num != 0) 370 if (p->num != 0)
328 memcpy(data, p->streams, p->num * sizeof(CXzStream)); 371 memcpy(data, p->streams, p->num * sizeof(CXzStream));
329 ISzAlloc_Free(alloc, p->streams); 372 ISzAlloc_Free(alloc, p->streams);
330 p->streams = (CXzStream *)data; 373 p->streams = (CXzStream *)data;
331 } 374 }
375 // we copy the raw data of the local variable (st) directly into the object in the array,
376 // so we don't need to call Xz_Free(&st, alloc) after the copy and after p->num++
332 p->streams[p->num++] = st; 377 p->streams[p->num++] = st;
333 if (*startOffset == 0) 378 if (*startOffset == 0)
334 break; 379 return SZ_OK;
335 RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset)) 380 // seek operation is optional:
381 // RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset))
336 if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK) 382 if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK)
337 return SZ_ERROR_PROGRESS; 383 return SZ_ERROR_PROGRESS;
338 } 384 }
339 return SZ_OK;
340} 385}
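The reallocation in the loop above grows the array by roughly a factor of 1.25 (newNum = num + num/4 + 1), which keeps pushes amortized O(1) while over-allocating less than doubling would. The same push pattern as a generic sketch; Vec and Vec_Push are invented names, malloc stands in for ISzAlloc, and size-overflow checks are omitted for brevity:

#include <stdlib.h>
#include <string.h>

typedef struct { void *items; size_t num, numAllocated, itemSize; } Vec;

static int Vec_Push(Vec *v, const void *item)  /* returns 0 on allocation failure */
{
  if (v->num == v->numAllocated)
  {
    const size_t newNum = v->num + v->num / 4 + 1;  /* ~1.25x growth, as above */
    void *data = malloc(newNum * v->itemSize);
    if (!data)
      return 0;
    if (v->num != 0)
      memcpy(data, v->items, v->num * v->itemSize);  /* move the old contents */
    free(v->items);
    v->items = data;
    v->numAllocated = newNum;
  }
  memcpy((char *)v->items + v->num * v->itemSize, item, v->itemSize);
  v->num++;
  return 1;
}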