author    Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2025-07-05 00:00:00 +0000
committer Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2025-07-05 19:27:33 +0500
commit    395149956d696e6e3099d8b76d797437f94a6942 (patch)
tree      6ed5013a637078ae2dfdc4acf1ad93bf29cea356 /C
parent    e5431fa6f5505e385c6f9367260717e9c47dc2ee (diff)
download  7zip-25.00.tar.gz, 7zip-25.00.tar.bz2, 7zip-25.00.zip
tag       25.00
Diffstat (limited to 'C')
-rw-r--r--  C/7zVersion.h               |  10
-rw-r--r--  C/BwtSort.c                 | 468
-rw-r--r--  C/BwtSort.h                 |   7
-rw-r--r--  C/Compiler.h                |  12
-rw-r--r--  C/CpuArch.h                 |   8
-rw-r--r--  C/HuffEnc.c                 | 384
-rw-r--r--  C/HuffEnc.h                 |   8
-rw-r--r--  C/LzFind.c                  |  24
-rw-r--r--  C/LzFindMt.c                |  10
-rw-r--r--  C/LzFindMt.h                |   6
-rw-r--r--  C/Lzma2Enc.c                |   4
-rw-r--r--  C/Lzma2Enc.h                |   1
-rw-r--r--  C/LzmaEnc.c                 |   6
-rw-r--r--  C/LzmaEnc.h                 |   4
-rw-r--r--  C/MtCoder.c                 |  61
-rw-r--r--  C/MtCoder.h                 |   7
-rw-r--r--  C/Sha512.c                  | 167
-rw-r--r--  C/Sort.c                    | 355
-rw-r--r--  C/Sort.h                    |   7
-rw-r--r--  C/Threads.c                 | 237
-rw-r--r--  C/Threads.h                 |  12
-rw-r--r--  C/Util/Lzma/LzmaUtil.dsp    |   4
-rw-r--r--  C/Util/LzmaLib/LzmaLib.dsp  |   8
-rw-r--r--  C/Xz.h                      |  12
-rw-r--r--  C/XzCrc64Opt.c              |   4
-rw-r--r--  C/XzDec.c                   |  29
-rw-r--r--  C/XzEnc.c                   |   8
-rw-r--r--  C/XzEnc.h                   |   3
-rw-r--r--  C/XzIn.c                    | 265
29 files changed, 1524 insertions, 607 deletions
diff --git a/C/7zVersion.h b/C/7zVersion.h
index e82ba0b..72733f7 100644
--- a/C/7zVersion.h
+++ b/C/7zVersion.h
@@ -1,7 +1,7 @@
-#define MY_VER_MAJOR 24
-#define MY_VER_MINOR 9
+#define MY_VER_MAJOR 25
+#define MY_VER_MINOR 0
 #define MY_VER_BUILD 0
-#define MY_VERSION_NUMBERS "24.09"
+#define MY_VERSION_NUMBERS "25.00"
 #define MY_VERSION MY_VERSION_NUMBERS
 
 #ifdef MY_CPU_NAME
@@ -10,12 +10,12 @@
   #define MY_VERSION_CPU MY_VERSION
 #endif
 
-#define MY_DATE "2024-11-29"
+#define MY_DATE "2025-07-05"
 #undef MY_COPYRIGHT
 #undef MY_VERSION_COPYRIGHT_DATE
 #define MY_AUTHOR_NAME "Igor Pavlov"
 #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain"
-#define MY_COPYRIGHT_CR "Copyright (c) 1999-2024 Igor Pavlov"
+#define MY_COPYRIGHT_CR "Copyright (c) 1999-2025 Igor Pavlov"
 
 #ifdef USE_COPYRIGHT_CR
   #define MY_COPYRIGHT MY_COPYRIGHT_CR
diff --git a/C/BwtSort.c b/C/BwtSort.c
index 05ad6de..8f64f9d 100644
--- a/C/BwtSort.c
+++ b/C/BwtSort.c
@@ -1,5 +1,5 @@
 /* BwtSort.c -- BWT block sorting
-2023-04-02 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -7,6 +7,44 @@
 #include "Sort.h"
 
 /* #define BLOCK_SORT_USE_HEAP_SORT */
+// #define BLOCK_SORT_USE_HEAP_SORT
+
+#ifdef BLOCK_SORT_USE_HEAP_SORT
+
+#define HeapSortRefDown(p, vals, n, size, temp) \
+  { size_t k = n; UInt32 val = vals[temp]; for (;;) { \
+    size_t s = k << 1; \
+    if (s > size) break; \
+    if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \
+    if (val >= vals[p[s]]) break; \
+    p[k] = p[s]; k = s; \
+  } p[k] = temp; }
+
+void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size)
+{
+  if (size <= 1)
+    return;
+  p--;
+  {
+    size_t i = size / 2;
+    do
+    {
+      UInt32 temp = p[i];
+      HeapSortRefDown(p, vals, i, size, temp);
+    }
+    while (--i != 0);
+  }
+  do
+  {
+    UInt32 temp = p[size];
+    p[size--] = p[1];
+    HeapSortRefDown(p, vals, 1, size, temp);
+  }
+  while (size > 1);
+}
+
+#endif // BLOCK_SORT_USE_HEAP_SORT
+
 
 /* Don't change it !!! */
 #define kNumHashBytes 2
@@ -27,26 +65,27 @@
 
 #else
 
 #define kNumBitsMax 20
-#define kIndexMask ((1 << kNumBitsMax) - 1)
+#define kIndexMask (((UInt32)1 << kNumBitsMax) - 1)
 #define kNumExtraBits (32 - kNumBitsMax)
 #define kNumExtra0Bits (kNumExtraBits - 2)
 #define kNumExtra0Mask ((1 << kNumExtra0Bits) - 1)
 
 #define SetFinishedGroupSize(p, size) \
-  { *(p) |= ((((size) - 1) & kNumExtra0Mask) << kNumBitsMax); \
+  { *(p) |= ((((UInt32)(size) - 1) & kNumExtra0Mask) << kNumBitsMax); \
     if ((size) > (1 << kNumExtra0Bits)) { \
-    *(p) |= 0x40000000; *((p) + 1) |= ((((size) - 1)>> kNumExtra0Bits) << kNumBitsMax); } } \
+      *(p) |= 0x40000000; \
+      *((p) + 1) |= (((UInt32)(size) - 1) >> kNumExtra0Bits) << kNumBitsMax; } } \
 
-static void SetGroupSize(UInt32 *p, UInt32 size)
+static void SetGroupSize(UInt32 *p, size_t size)
 {
   if (--size == 0)
     return;
-  *p |= 0x80000000 | ((size & kNumExtra0Mask) << kNumBitsMax);
+  *p |= 0x80000000 | (((UInt32)size & kNumExtra0Mask) << kNumBitsMax);
   if (size >= (1 << kNumExtra0Bits))
   {
     *p |= 0x40000000;
-    p[1] |= ((size >> kNumExtra0Bits) << kNumBitsMax);
+    p[1] |= (((UInt32)size >> kNumExtra0Bits) << kNumBitsMax);
   }
 }
 
@@ -59,12 +98,14 @@ returns: 1 - if there are groups, 0 - no more groups
 */
 
 static
-UInt32
+unsigned
 Z7_FASTCALL
-SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 groupSize, int NumRefBits, UInt32 *Indices
-  #ifndef BLOCK_SORT_USE_HEAP_SORT
-  , UInt32 left, UInt32 range
-  #endif
+SortGroup(size_t BlockSize, size_t NumSortedBytes,
+    size_t groupOffset, size_t groupSize,
+    unsigned NumRefBits, UInt32 *Indices
+#ifndef BLOCK_SORT_USE_HEAP_SORT
+  , size_t left, size_t range
+#endif
   )
 {
   UInt32 *ind2 = Indices + groupOffset;
@@ -79,90 +120,93 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr
     return 0;
   }
   Groups = Indices + BlockSize + BS_TEMP_SIZE;
-  if (groupSize <= ((UInt32)1 << NumRefBits)
-  #ifndef BLOCK_SORT_USE_HEAP_SORT
+  if (groupSize <= ((size_t)1 << NumRefBits)
+#ifndef BLOCK_SORT_USE_HEAP_SORT
       && groupSize <= range
-  #endif
+#endif
       )
   {
     UInt32 *temp = Indices + BlockSize;
-    UInt32 j;
-    UInt32 mask, thereAreGroups, group, cg;
+    size_t j, group;
+    UInt32 mask, cg;
+    unsigned thereAreGroups;
     {
       UInt32 gPrev;
       UInt32 gRes = 0;
       {
-        UInt32 sp = ind2[0] + NumSortedBytes;
-        if (sp >= BlockSize) sp -= BlockSize;
+        size_t sp = ind2[0] + NumSortedBytes;
+        if (sp >= BlockSize)
+          sp -= BlockSize;
         gPrev = Groups[sp];
-        temp[0] = (gPrev << NumRefBits);
+        temp[0] = gPrev << NumRefBits;
       }
 
       for (j = 1; j < groupSize; j++)
       {
-        UInt32 sp = ind2[j] + NumSortedBytes;
+        size_t sp = ind2[j] + NumSortedBytes;
         UInt32 g;
-        if (sp >= BlockSize) sp -= BlockSize;
+        if (sp >= BlockSize)
+          sp -= BlockSize;
         g = Groups[sp];
-        temp[j] = (g << NumRefBits) | j;
+        temp[j] = (g << NumRefBits) | (UInt32)j;
         gRes |= (gPrev ^ g);
       }
       if (gRes == 0)
       {
-        #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
         SetGroupSize(ind2, groupSize);
-        #endif
+#endif
         return 1;
       }
     }
 
     HeapSort(temp, groupSize);
-    mask = (((UInt32)1 << NumRefBits) - 1);
+    mask = ((UInt32)1 << NumRefBits) - 1;
     thereAreGroups = 0;
 
     group = groupOffset;
-    cg = (temp[0] >> NumRefBits);
+    cg = temp[0] >> NumRefBits;
     temp[0] = ind2[temp[0] & mask];
 
     {
-      #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
       UInt32 *Flags = Groups + BlockSize;
-      #else
-      UInt32 prevGroupStart = 0;
-      #endif
+#else
+      size_t prevGroupStart = 0;
+#endif
 
       for (j = 1; j < groupSize; j++)
       {
-        UInt32 val = temp[j];
-        UInt32 cgCur = (val >> NumRefBits);
+        const UInt32 val = temp[j];
+        const UInt32 cgCur = val >> NumRefBits;
 
         if (cgCur != cg)
         {
           cg = cgCur;
           group = groupOffset + j;
 
-          #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
           {
-            UInt32 t = group - 1;
-            Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
+            const size_t t = group - 1;
+            Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask));
           }
-          #else
+#else
           SetGroupSize(temp + prevGroupStart, j - prevGroupStart);
           prevGroupStart = j;
-          #endif
+#endif
         }
         else
           thereAreGroups = 1;
         {
-          UInt32 ind = ind2[val & mask];
+          const UInt32 ind = ind2[val & mask];
           temp[j] = ind;
-          Groups[ind] = group;
+          Groups[ind] = (UInt32)group;
         }
       }
 
-      #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
       SetGroupSize(temp + prevGroupStart, j - prevGroupStart);
-      #endif
+#endif
     }
 
     for (j = 0; j < groupSize; j++)
@@ -172,37 +216,42 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr
 
   /* Check that all strings are in one group (cannot sort) */
   {
-    UInt32 group, j;
-    UInt32 sp = ind2[0] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
+    UInt32 group;
+    size_t j;
+    size_t sp = ind2[0] + NumSortedBytes;
+    if (sp >= BlockSize)
+      sp -= BlockSize;
     group = Groups[sp];
     for (j = 1; j < groupSize; j++)
     {
-      sp = ind2[j] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
+      sp = ind2[j] + NumSortedBytes;
+      if (sp >= BlockSize)
+        sp -= BlockSize;
       if (Groups[sp] != group)
         break;
     }
     if (j == groupSize)
     {
-      #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
       SetGroupSize(ind2, groupSize);
-      #endif
+#endif
       return 1;
     }
   }
 
-  #ifndef BLOCK_SORT_USE_HEAP_SORT
+#ifndef BLOCK_SORT_USE_HEAP_SORT
   {
     /* ---------- Range Sort ---------- */
-    UInt32 i;
-    UInt32 mid;
+    size_t i;
+    size_t mid;
     for (;;)
     {
-      UInt32 j;
+      size_t j;
       if (range <= 1)
       {
-        #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
         SetGroupSize(ind2, groupSize);
-        #endif
+#endif
         return 1;
       }
       mid = left + ((range + 1) >> 1);
@@ -210,7 +259,7 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr
       i = 0;
       do
       {
-        UInt32 sp = ind2[i] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
+        size_t sp = ind2[i] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
         if (Groups[sp] >= mid)
         {
           for (j--; j > i; j--)
@@ -238,51 +287,53 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr
         break;
       }
 
-      #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
       {
-        UInt32 t = (groupOffset + i - 1);
+        const size_t t = groupOffset + i - 1;
         UInt32 *Flags = Groups + BlockSize;
-        Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
+        Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask));
       }
-      #endif
+#endif
 
       {
-        UInt32 j;
+        size_t j;
         for (j = i; j < groupSize; j++)
-          Groups[ind2[j]] = groupOffset + i;
+          Groups[ind2[j]] = (UInt32)(groupOffset + i);
       }
 
       {
-        UInt32 res = SortGroup(BlockSize, NumSortedBytes, groupOffset, i, NumRefBits, Indices, left, mid - left);
+        unsigned res = SortGroup(BlockSize, NumSortedBytes, groupOffset, i, NumRefBits, Indices, left, mid - left);
         return res | SortGroup(BlockSize, NumSortedBytes, groupOffset + i, groupSize - i, NumRefBits, Indices, mid, range - (mid - left));
       }
 
   }
 
-  #else
+#else // BLOCK_SORT_USE_HEAP_SORT
 
   /* ---------- Heap Sort ---------- */
 
   {
-    UInt32 j;
+    size_t j;
     for (j = 0; j < groupSize; j++)
     {
-      UInt32 sp = ind2[j] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
-      ind2[j] = sp;
+      size_t sp = ind2[j] + NumSortedBytes;
+      if (sp >= BlockSize)
+        sp -= BlockSize;
+      ind2[j] = (UInt32)sp;
     }
 
     HeapSortRef(ind2, Groups, groupSize);
 
     /* Write Flags */
     {
-      UInt32 sp = ind2[0];
+      size_t sp = ind2[0];
       UInt32 group = Groups[sp];
 
-      #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
       UInt32 *Flags = Groups + BlockSize;
-      #else
-      UInt32 prevGroupStart = 0;
-      #endif
+#else
+      size_t prevGroupStart = 0;
+#endif
 
       for (j = 1; j < groupSize; j++)
       {
@@ -290,149 +341,210 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr
         if (Groups[sp] != group)
         {
           group = Groups[sp];
-          #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
           {
-            UInt32 t = groupOffset + j - 1;
+            const size_t t = groupOffset + j - 1;
             Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
           }
-          #else
+#else
           SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart);
           prevGroupStart = j;
-          #endif
+#endif
         }
       }
 
-      #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
       SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart);
-      #endif
+#endif
     }
     {
       /* Write new Groups values and Check that there are groups */
-      UInt32 thereAreGroups = 0;
+      unsigned thereAreGroups = 0;
       for (j = 0; j < groupSize; j++)
       {
-        UInt32 group = groupOffset + j;
-        #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+        size_t group = groupOffset + j;
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
         UInt32 subGroupSize = ((ind2[j] & ~0xC0000000) >> kNumBitsMax);
-        if ((ind2[j] & 0x40000000) != 0)
+        if (ind2[j] & 0x40000000)
           subGroupSize += ((ind2[(size_t)j + 1] >> kNumBitsMax) << kNumExtra0Bits);
         subGroupSize++;
         for (;;)
         {
-          UInt32 original = ind2[j];
-          UInt32 sp = original & kIndexMask;
-          if (sp < NumSortedBytes) sp += BlockSize; sp -= NumSortedBytes;
-          ind2[j] = sp | (original & ~kIndexMask);
-          Groups[sp] = group;
+          const UInt32 original = ind2[j];
+          size_t sp = original & kIndexMask;
+          if (sp < NumSortedBytes)
+            sp += BlockSize;
+          sp -= NumSortedBytes;
+          ind2[j] = (UInt32)sp | (original & ~kIndexMask);
+          Groups[sp] = (UInt32)group;
           if (--subGroupSize == 0)
             break;
           j++;
           thereAreGroups = 1;
         }
-        #else
+#else
         UInt32 *Flags = Groups + BlockSize;
         for (;;)
         {
-          UInt32 sp = ind2[j]; if (sp < NumSortedBytes) sp += BlockSize; sp -= NumSortedBytes;
-          ind2[j] = sp;
-          Groups[sp] = group;
+          size_t sp = ind2[j];
+          if (sp < NumSortedBytes)
+            sp += BlockSize;
+          sp -= NumSortedBytes;
+          ind2[j] = (UInt32)sp;
+          Groups[sp] = (UInt32)group;
           if ((Flags[(groupOffset + j) >> kNumFlagsBits] & (1 << ((groupOffset + j) & kFlagsMask))) == 0)
             break;
           j++;
           thereAreGroups = 1;
         }
-        #endif
+#endif
       }
       return thereAreGroups;
     }
   }
-  #endif
+#endif // BLOCK_SORT_USE_HEAP_SORT
 }
 
+
 /* conditions: blockSize > 0 */
-UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize)
+UInt32 BlockSort(UInt32 *Indices, const Byte *data, size_t blockSize)
 {
   UInt32 *counters = Indices + blockSize;
-  UInt32 i;
+  size_t i;
   UInt32 *Groups;
-  #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
   UInt32 *Flags;
-  #endif
+#endif
 
-  /* Radix-Sort for 2 bytes */
+/* Radix-Sort for 2 bytes */
+// { UInt32 yyy; for (yyy = 0; yyy < 100; yyy++) {
   for (i = 0; i < kNumHashValues; i++)
     counters[i] = 0;
-  for (i = 0; i < blockSize - 1; i++)
-    counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]++;
-  counters[((UInt32)data[i] << 8) | data[0]]++;
+  {
+    const Byte *data2 = data;
+    size_t a = data[(size_t)blockSize - 1];
+    const Byte *data_lim = data + blockSize;
+    if (blockSize >= 4)
+    {
+      data_lim -= 3;
+      do
+      {
+        size_t b;
+        b = data2[0]; counters[(a << 8) | b]++;
+        a = data2[1]; counters[(b << 8) | a]++;
+        b = data2[2]; counters[(a << 8) | b]++;
+        a = data2[3]; counters[(b << 8) | a]++;
+        data2 += 4;
+      }
+      while (data2 < data_lim);
+      data_lim += 3;
+    }
+    while (data2 != data_lim)
+    {
+      size_t b = *data2++;
+      counters[(a << 8) | b]++;
+      a = b;
+    }
+  }
+// }}
 
   Groups = counters + BS_TEMP_SIZE;
-  #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
   Flags = Groups + blockSize;
   {
-    UInt32 numWords = (blockSize + kFlagsMask) >> kNumFlagsBits;
+    const size_t numWords = (blockSize + kFlagsMask) >> kNumFlagsBits;
     for (i = 0; i < numWords; i++)
       Flags[i] = kAllFlags;
   }
-  #endif
+#endif
 
   {
     UInt32 sum = 0;
     for (i = 0; i < kNumHashValues; i++)
     {
-      UInt32 groupSize = counters[i];
-      if (groupSize > 0)
+      const UInt32 groupSize = counters[i];
+      counters[i] = sum;
+      sum += groupSize;
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
+      if (groupSize)
       {
-        #ifdef BLOCK_SORT_EXTERNAL_FLAGS
-        UInt32 t = sum + groupSize - 1;
-        Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
-        #endif
-        sum += groupSize;
+        const UInt32 t = sum - 1;
+        Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask));
       }
-      counters[i] = sum - groupSize;
+#endif
     }
+  }
 
   for (i = 0; i < blockSize - 1; i++)
-    Groups[i] = counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]];
-  Groups[i] = counters[((UInt32)data[i] << 8) | data[0]];
+    Groups[i] = counters[((unsigned)data[i] << 8) | data[(size_t)i + 1]];
+  Groups[i] = counters[((unsigned)data[i] << 8) | data[0]];
+
+  {
+#define SET_Indices(a, b, i) \
+  { UInt32 c; \
+    a = (a << 8) | (b); \
+    c = counters[a]; \
+    Indices[c] = (UInt32)i++; \
+    counters[a] = c + 1; \
+  }
 
-  for (i = 0; i < blockSize - 1; i++)
-    Indices[counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]++] = i;
-  Indices[counters[((UInt32)data[i] << 8) | data[0]]++] = i;
-
-  #ifndef BLOCK_SORT_EXTERNAL_FLAGS
-  {
+    size_t a = data[0];
+    const Byte *data_ptr = data + 1;
+    i = 0;
+    if (blockSize >= 3)
+    {
+      blockSize -= 2;
+      do
+      {
+        size_t b;
+        b = data_ptr[0]; SET_Indices(a, b, i)
+        a = data_ptr[1]; SET_Indices(b, a, i)
+        data_ptr += 2;
+      }
+      while (i < blockSize);
+      blockSize += 2;
+    }
+    if (i < blockSize - 1)
+    {
+      SET_Indices(a, data[(size_t)i + 1], i)
+      a = (Byte)a;
+    }
+    SET_Indices(a, data[0], i)
+  }
+
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
+  {
     UInt32 prev = 0;
     for (i = 0; i < kNumHashValues; i++)
     {
-      UInt32 prevGroupSize = counters[i] - prev;
+      const UInt32 prevGroupSize = counters[i] - prev;
       if (prevGroupSize == 0)
         continue;
       SetGroupSize(Indices + prev, prevGroupSize);
       prev = counters[i];
     }
-  }
-  #endif
   }
+#endif
 
   {
-    int NumRefBits;
-    UInt32 NumSortedBytes;
-    for (NumRefBits = 0; ((blockSize - 1) >> NumRefBits) != 0; NumRefBits++);
+    unsigned NumRefBits;
+    size_t NumSortedBytes;
+    for (NumRefBits = 0; ((blockSize - 1) >> NumRefBits) != 0; NumRefBits++)
+    {}
     NumRefBits = 32 - NumRefBits;
     if (NumRefBits > kNumRefBitsMax)
       NumRefBits = kNumRefBitsMax;
 
     for (NumSortedBytes = kNumHashBytes; ; NumSortedBytes <<= 1)
     {
-      #ifndef BLOCK_SORT_EXTERNAL_FLAGS
-      UInt32 finishedGroupSize = 0;
-      #endif
-      UInt32 newLimit = 0;
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
+      size_t finishedGroupSize = 0;
+#endif
+      size_t newLimit = 0;
       for (i = 0; i < blockSize;)
       {
-        UInt32 groupSize;
-        #ifdef BLOCK_SORT_EXTERNAL_FLAGS
+        size_t groupSize;
+#ifdef BLOCK_SORT_EXTERNAL_FLAGS
 
         if ((Flags[i >> kNumFlagsBits] & (1 << (i & kFlagsMask))) == 0)
         {
@@ -441,56 +553,56 @@ UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize)
         }
         for (groupSize = 1;
             (Flags[(i + groupSize) >> kNumFlagsBits] & (1 << ((i + groupSize) & kFlagsMask))) != 0;
-            groupSize++);
-
+            groupSize++)
+        {}
         groupSize++;
 
-        #else
+#else
 
-        groupSize = ((Indices[i] & ~0xC0000000) >> kNumBitsMax);
+        groupSize = (Indices[i] & ~0xC0000000) >> kNumBitsMax;
         {
-          BoolInt finishedGroup = ((Indices[i] & 0x80000000) == 0);
-          if ((Indices[i] & 0x40000000) != 0)
-          {
-            groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits);
-            Indices[(size_t)i + 1] &= kIndexMask;
-          }
-          Indices[i] &= kIndexMask;
-          groupSize++;
-          if (finishedGroup || groupSize == 1)
-          {
-            Indices[i - finishedGroupSize] &= kIndexMask;
-            if (finishedGroupSize > 1)
-              Indices[(size_t)(i - finishedGroupSize) + 1] &= kIndexMask;
-            {
-              UInt32 newGroupSize = groupSize + finishedGroupSize;
-              SetFinishedGroupSize(Indices + i - finishedGroupSize, newGroupSize)
-              finishedGroupSize = newGroupSize;
-            }
-            i += groupSize;
-            continue;
-          }
-          finishedGroupSize = 0;
+          const BoolInt finishedGroup = ((Indices[i] & 0x80000000) == 0);
+          if (Indices[i] & 0x40000000)
+          {
+            groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits);
+            Indices[(size_t)i + 1] &= kIndexMask;
+          }
+          Indices[i] &= kIndexMask;
+          groupSize++;
+          if (finishedGroup || groupSize == 1)
+          {
+            Indices[i - finishedGroupSize] &= kIndexMask;
+            if (finishedGroupSize > 1)
+              Indices[(size_t)(i - finishedGroupSize) + 1] &= kIndexMask;
+            {
+              const size_t newGroupSize = groupSize + finishedGroupSize;
+              SetFinishedGroupSize(Indices + i - finishedGroupSize, newGroupSize)
+              finishedGroupSize = newGroupSize;
+            }
+            i += groupSize;
+            continue;
+          }
+          finishedGroupSize = 0;
         }
 
-        #endif
+#endif
 
         if (NumSortedBytes >= blockSize)
         {
-          UInt32 j;
+          size_t j;
           for (j = 0; j < groupSize; j++)
           {
-            UInt32 t = (i + j);
+            size_t t = i + j;
             /* Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); */
-            Groups[Indices[t]] = t;
+            Groups[Indices[t]] = (UInt32)t;
           }
         }
         else
         if (SortGroup(blockSize, NumSortedBytes, i, groupSize, NumRefBits, Indices
           #ifndef BLOCK_SORT_USE_HEAP_SORT
           , 0, blockSize
           #endif
-          ) != 0)
+          ))
           newLimit = i + groupSize;
         i += groupSize;
       }
@@ -498,19 +610,19 @@ UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize)
         break;
     }
   }
-  #ifndef BLOCK_SORT_EXTERNAL_FLAGS
+#ifndef BLOCK_SORT_EXTERNAL_FLAGS
   for (i = 0; i < blockSize;)
   {
-    UInt32 groupSize = ((Indices[i] & ~0xC0000000) >> kNumBitsMax);
-    if ((Indices[i] & 0x40000000) != 0)
+    size_t groupSize = (Indices[i] & ~0xC0000000) >> kNumBitsMax;
+    if (Indices[i] & 0x40000000)
     {
-      groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits);
+      groupSize += (Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits;
       Indices[(size_t)i + 1] &= kIndexMask;
     }
     Indices[i] &= kIndexMask;
     groupSize++;
     i += groupSize;
   }
-  #endif
+#endif
   return Groups[0];
 }
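Note on the rewritten radix-count stage: instead of re-indexing data[i] and data[i + 1] on every iteration, the new loop carries a rolling byte pair through a 4-way unrolled walk. It still counts every cyclic 2-byte pair of the block, including the wrap-around pair (data[blockSize - 1], data[0]). A minimal unoptimized reference for the same counting step (a sketch, not part of the source; counters must hold 1 << 16 zeroed entries):

    /* Reference version of the cyclic 2-byte pair count done in BlockSort() */
    static void CountPairs_Ref(const unsigned char *data, size_t blockSize,
        unsigned counters[1 << 16])
    {
      size_t i;
      for (i = 0; i < blockSize; i++)
      {
        const unsigned a = data[i];
        const unsigned b = data[i + 1 == blockSize ? 0 : i + 1]; /* cyclic next byte */
        counters[(a << 8) | b]++;
      }
    }
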
diff --git a/C/BwtSort.h b/C/BwtSort.h
index a34b243..1bd2316 100644
--- a/C/BwtSort.h
+++ b/C/BwtSort.h
@@ -1,5 +1,5 @@
 /* BwtSort.h -- BWT block sorting
-2023-03-03 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_BWT_SORT_H
 #define ZIP7_INC_BWT_SORT_H
@@ -10,16 +10,17 @@ EXTERN_C_BEGIN
 
 /* use BLOCK_SORT_EXTERNAL_FLAGS if blockSize can be > 1M */
 /* #define BLOCK_SORT_EXTERNAL_FLAGS */
+// #define BLOCK_SORT_EXTERNAL_FLAGS
 
 #ifdef BLOCK_SORT_EXTERNAL_FLAGS
-#define BLOCK_SORT_EXTERNAL_SIZE(blockSize) ((((blockSize) + 31) >> 5))
+#define BLOCK_SORT_EXTERNAL_SIZE(blockSize) (((blockSize) + 31) >> 5)
 #else
 #define BLOCK_SORT_EXTERNAL_SIZE(blockSize) 0
 #endif
 
 #define BLOCK_SORT_BUF_SIZE(blockSize) ((blockSize) * 2 + BLOCK_SORT_EXTERNAL_SIZE(blockSize) + (1 << 16))
 
-UInt32 BlockSort(UInt32 *indices, const Byte *data, UInt32 blockSize);
+UInt32 BlockSort(UInt32 *indices, const Byte *data, size_t blockSize);
 
 EXTERN_C_END
 
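BLOCK_SORT_BUF_SIZE is counted in UInt32 units: 2 * blockSize words for the index and group arrays, the optional external flag words, and a 1 << 16 word counter table. A hypothetical caller (the wrapper name and error handling are illustrative, not part of this API):

    #include <stdlib.h>
    #include "BwtSort.h"

    /* Sorts one block; returns the BWT origin index, or (UInt32)-1 on failure. */
    static UInt32 DoBlockSort(const Byte *data, size_t blockSize)
    {
      UInt32 origin = (UInt32)(Int32)-1;
      UInt32 *buf = (UInt32 *)malloc(BLOCK_SORT_BUF_SIZE(blockSize) * sizeof(UInt32));
      if (buf)
      {
        origin = BlockSort(buf, data, blockSize); /* buf[0..blockSize) = sorted indices */
        free(buf);
      }
      return origin;
    }
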
diff --git a/C/Compiler.h b/C/Compiler.h
index 2a9c2b7..b266b27 100644
--- a/C/Compiler.h
+++ b/C/Compiler.h
@@ -1,5 +1,5 @@
 /* Compiler.h : Compiler specific defines and pragmas
-2024-01-22 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_COMPILER_H
 #define ZIP7_INC_COMPILER_H
@@ -183,6 +183,16 @@ typedef void (*Z7_void_Function)(void);
   #define Z7_ATTRIB_NO_VECTORIZE
 #endif
 
+#if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920)
+  #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE _Pragma("optimize ( \"s\", on )")
+  #define Z7_PRAGMA_OPTIMIZE_DEFAULT _Pragma("optimize ( \"\", on )")
+#else
+  #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE
+  #define Z7_PRAGMA_OPTIMIZE_DEFAULT
+#endif
+
+
+
 #if defined(MY_CPU_X86_OR_AMD64) && ( \
     defined(__clang__) && (__clang_major__ >= 4) \
  || defined(__GNUC__) && (__GNUC__ >= 5))
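The new macro pair wraps MSVC's #pragma optimize, which must appear at file scope and applies to every function that follows it; on other compilers both macros expand to nothing, so the code stays portable. A sketch of the intended usage pattern (the function is illustrative):

    #include "Compiler.h"

    Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE   /* compile the following function(s) for size */
    static void Rarely_Used_Setup(void)
    {
      /* cold path: favor small code over speed here */
    }
    Z7_PRAGMA_OPTIMIZE_DEFAULT         /* restore the command-line optimization level */
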
diff --git a/C/CpuArch.h b/C/CpuArch.h
index a6297ea..1690a5b 100644
--- a/C/CpuArch.h
+++ b/C/CpuArch.h
@@ -47,6 +47,12 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
   #define MY_CPU_SIZEOF_POINTER 4
 #endif
 
+#if defined(__SSE2__) \
+    || defined(MY_CPU_AMD64) \
+    || defined(_M_IX86_FP) && (_M_IX86_FP >= 2)
+#define MY_CPU_SSE2
+#endif
+
 
 #if defined(_M_ARM64) \
     || defined(_M_ARM64EC) \
@@ -571,10 +577,12 @@ problem-4 : performace:
 #define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v)
 #define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v)
 #define Z7_CONV_NATIVE_TO_BE_32(v) (v)
+// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b1) | ((b0) << 8))
 #elif defined(MY_CPU_LE)
 #define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v)
 #define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v)
 #define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v)
+// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b0) | ((b1) << 8))
 #else
 #error Stop_Compiling_Unknown_Endian_CONV
 #endif
diff --git a/C/HuffEnc.c b/C/HuffEnc.c
index 996da30..cbf8c22 100644
--- a/C/HuffEnc.c
+++ b/C/HuffEnc.c
@@ -1,60 +1,125 @@
 /* HuffEnc.c -- functions for Huffman encoding
-2023-09-07 : Igor Pavlov : Public domain */
+Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
+#include <string.h>
+
 #include "HuffEnc.h"
 #include "Sort.h"
+#include "CpuArch.h"
 
-#define kMaxLen 16
+#define kMaxLen Z7_HUFFMAN_LEN_MAX
 #define NUM_BITS 10
 #define MASK ((1u << NUM_BITS) - 1)
-
-#define NUM_COUNTERS 64
+#define FREQ_MASK (~(UInt32)MASK)
+#define NUM_COUNTERS (48 * 2)
 
-#define HUFFMAN_SPEED_OPT
+#if 1 && (defined(MY_CPU_LE) || defined(MY_CPU_BE))
+#if defined(MY_CPU_LE)
+  #define HI_HALF_OFFSET 1
+#else
+  #define HI_HALF_OFFSET 0
+#endif
+#define LOAD_PARENT(p) ((unsigned)*((const UInt16 *)(p) + HI_HALF_OFFSET))
+#define STORE_PARENT(p, fb, val) *((UInt16 *)(p) + HI_HALF_OFFSET) = (UInt16)(val);
+#define STORE_PARENT_DIRECT(p, fb, hi) STORE_PARENT(p, fb, hi)
+#define UPDATE_E(eHi) eHi++;
+#else
+#define LOAD_PARENT(p) ((unsigned)(*(p) >> NUM_BITS))
+#define STORE_PARENT_DIRECT(p, fb, hi) *(p) = ((fb) & MASK) | (hi); // set parent field
+#define STORE_PARENT(p, fb, val) STORE_PARENT_DIRECT(p, fb, ((UInt32)(val) << NUM_BITS))
+#define UPDATE_E(eHi) eHi += 1 << NUM_BITS;
+#endif
 
-void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymbols, UInt32 maxLen)
+void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, unsigned numSymbols, unsigned maxLen)
 {
-  UInt32 num = 0;
-  /* if (maxLen > 10) maxLen = 10; */
+#if NUM_COUNTERS > 2
+  unsigned counters[NUM_COUNTERS];
+#endif
+#if 1 && NUM_COUNTERS > (kMaxLen + 4) * 2
+  #define lenCounters (counters)
+  #define codes (counters + kMaxLen + 4)
+#else
+  unsigned lenCounters[kMaxLen + 1];
+  UInt32 codes[kMaxLen + 1];
+#endif
+
+  unsigned num;
   {
-    UInt32 i;
-
-    #ifdef HUFFMAN_SPEED_OPT
-
-    UInt32 counters[NUM_COUNTERS];
+    unsigned i;
+    // UInt32 sum = 0;
+
+#if NUM_COUNTERS > 2
+
+#define CTR_ITEM_FOR_FREQ(freq) \
+    counters[(freq) >= NUM_COUNTERS - 1 ? NUM_COUNTERS - 1 : (unsigned)(freq)]
+
     for (i = 0; i < NUM_COUNTERS; i++)
       counters[i] = 0;
-    for (i = 0; i < numSymbols; i++)
+    memset(lens, 0, numSymbols);
     {
-      UInt32 freq = freqs[i];
-      counters[(freq < NUM_COUNTERS - 1) ? freq : NUM_COUNTERS - 1]++;
+      const UInt32 *fp = freqs + numSymbols;
+#define NUM_UNROLLS 1
+#if NUM_UNROLLS > 1 // use 1 if odd (numSymbols) is possisble
+      if (numSymbols & 1)
+      {
+        UInt32 f;
+        f = *--fp; CTR_ITEM_FOR_FREQ(f)++;
+        // sum += f;
+      }
+#endif
+      do
+      {
+        UInt32 f;
+        fp -= NUM_UNROLLS;
+        f = fp[0]; CTR_ITEM_FOR_FREQ(f)++;
+        // sum += f;
+#if NUM_UNROLLS > 1
+        f = fp[1]; CTR_ITEM_FOR_FREQ(f)++;
+        // sum += f;
+#endif
+      }
+      while (fp != freqs);
     }
-
-    for (i = 1; i < NUM_COUNTERS; i++)
+#if 0
+    printf("\nsum=%8u numSymbols =%3u ctrs:", sum, numSymbols);
     {
-      UInt32 temp = counters[i];
-      counters[i] = num;
-      num += temp;
+      unsigned k = 0;
+      for (k = 0; k < NUM_COUNTERS; k++)
+        printf(" %u", counters[k]);
     }
-
-    for (i = 0; i < numSymbols; i++)
+#endif
+
+    num = counters[1];
+    counters[1] = 0;
+    for (i = 2; i != NUM_COUNTERS; i += 2)
     {
-      UInt32 freq = freqs[i];
-      if (freq == 0)
-        lens[i] = 0;
-      else
-        p[counters[((freq < NUM_COUNTERS - 1) ? freq : NUM_COUNTERS - 1)]++] = i | (freq << NUM_BITS);
+      unsigned c;
+      c = (counters    )[i]; (counters    )[i] = num; num += c;
+      c = (counters + 1)[i]; (counters + 1)[i] = num; num += c;
+    }
+    counters[0] = num; // we want to write (freq==0) symbols to the end of (p) array
+    {
+      i = 0;
+      do
+      {
+        const UInt32 f = freqs[i];
+#if 0
+        if (f == 0) lens[i] = 0; else
+#endif
+        p[CTR_ITEM_FOR_FREQ(f)++] = i | (f << NUM_BITS);
+      }
+      while (++i != numSymbols);
     }
-    counters[0] = 0;
     HeapSort(p + counters[NUM_COUNTERS - 2], counters[NUM_COUNTERS - 1] - counters[NUM_COUNTERS - 2]);
 
-    #else
+#else // NUM_COUNTERS <= 2
 
+    num = 0;
     for (i = 0; i < numSymbols; i++)
     {
-      UInt32 freq = freqs[i];
+      const UInt32 freq = freqs[i];
       if (freq == 0)
         lens[i] = 0;
       else
@@ -62,17 +127,27 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb
     }
     HeapSort(p, num);
 
-    #endif
+#endif
   }
 
-  if (num < 2)
+  if (num <= 2)
   {
     unsigned minCode = 0;
     unsigned maxCode = 1;
-    if (num == 1)
+    if (num)
     {
-      maxCode = (unsigned)p[0] & MASK;
-      if (maxCode == 0)
+      maxCode = (unsigned)p[(size_t)num - 1] & MASK;
+      if (num == 2)
+      {
+        minCode = (unsigned)p[0] & MASK;
+        if (minCode > maxCode)
+        {
+          const unsigned temp = minCode;
+          minCode = maxCode;
+          maxCode = temp;
+        }
+      }
+      else if (maxCode == 0)
         maxCode++;
     }
     p[minCode] = 0;
@@ -80,69 +155,191 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb
     lens[minCode] = lens[maxCode] = 1;
     return;
   }
-
   {
-    UInt32 b, e, i;
-
-    i = b = e = 0;
-    do
+    unsigned i;
+    for (i = 0; i <= kMaxLen; i++)
+      lenCounters[i] = 0;
+    lenCounters[1] = 2; // by default root node has 2 child leaves at level 1.
+  }
+  // if (num != 2)
+  {
+    // num > 2
+    // the binary tree will contain (num - 1) internal nodes.
+    // p[num - 2] will be root node of binary tree.
+    UInt32 *b;
+    UInt32 *n;
+    // first node will have two leaf childs: p[0] and p[1]:
+    // p[0] += p[1] & FREQ_MASK; // set frequency sum of child leafs
+    // if (pi == n) exit(0);
+    // if (pi != n)
     {
-      UInt32 n, m, freq;
-      n = (i != num && (b == e || (p[i] >> NUM_BITS) <= (p[b] >> NUM_BITS))) ? i++ : b++;
-      freq = (p[n] & ~MASK);
-      p[n] = (p[n] & MASK) | (e << NUM_BITS);
-      m = (i != num && (b == e || (p[i] >> NUM_BITS) <= (p[b] >> NUM_BITS))) ? i++ : b++;
-      freq += (p[m] & ~MASK);
-      p[m] = (p[m] & MASK) | (e << NUM_BITS);
-      p[e] = (p[e] & MASK) | freq;
-      e++;
+      UInt32 fb = (p[1] & FREQ_MASK) + p[0];
+      UInt32 f = p[2] & FREQ_MASK;
+      const UInt32 *pi = p + 2;
+      UInt32 *e = p;
+      UInt32 eHi = 0;
+      n = p + num;
+      b = p;
+      // p[0] = fb;
+      for (;;)
+      {
+        // (b <= e)
+        UInt32 sum;
+        e++;
+        UPDATE_E(eHi)
+
+        // (b < e)
+
+        // p range : high bits
+        // [0, b)  : parent : processed nodes that have parent and childs
+        // [b, e)  : FREQ : non-processed nodes that have no parent but have childs
+        // [e, pi) : FREQ : processed leaves for which parent node was created
+        // [pi, n) : FREQ : non-processed leaves for which parent node was not created
+
+        // first child
+        // note : (*b < f) is same result as ((*b & FREQ_MASK) < f)
+        if (fb < f)
+        {
+          // node freq is smaller
+          sum = fb & FREQ_MASK;
+          STORE_PARENT_DIRECT (b, fb, eHi)
+          b++;
+          fb = *b;
+          if (b == e)
+          {
+            if (++pi == n)
+              break;
+            sum += f;
+            fb &= MASK;
+            fb |= sum;
+            *e = fb;
+            f = *pi & FREQ_MASK;
+            continue;
+          }
+        }
+        else if (++pi == n)
+        {
+          STORE_PARENT_DIRECT (b, fb, eHi)
+          b++;
+          break;
+        }
+        else
+        {
+          sum = f;
+          f = *pi & FREQ_MASK;
+        }
+
+        // (b < e)
+
+        // second child
+        if (fb < f)
+        {
+          sum += fb;
+          sum &= FREQ_MASK;
+          STORE_PARENT_DIRECT (b, fb, eHi)
+          b++;
+          *e = (*e & MASK) | sum; // set frequency sum
+          // (b <= e) is possible here
+          fb = *b;
+        }
+        else if (++pi == n)
+          break;
+        else
+        {
+          sum += f;
+          f = *pi & FREQ_MASK;
+          *e = (*e & MASK) | sum; // set frequency sum
+        }
+      }
     }
-    while (num - e > 1);
 
+    // printf("\nnum-e=%3u, numSymbols=%3u, num=%3u, b=%3u", n - e, numSymbols, n - p, b - p);
     {
-      UInt32 lenCounters[kMaxLen + 1];
-      for (i = 0; i <= kMaxLen; i++)
-        lenCounters[i] = 0;
-
-      p[--e] &= MASK;
-      lenCounters[1] = 2;
-      while (e != 0)
-      {
-        UInt32 len = (p[p[--e] >> NUM_BITS] >> NUM_BITS) + 1;
-        p[e] = (p[e] & MASK) | (len << NUM_BITS);
-        if (len >= maxLen)
-          for (len = maxLen - 1; lenCounters[len] == 0; len--);
-        lenCounters[len]--;
-        lenCounters[(size_t)len + 1] += 2;
-      }
-
+      n -= 2;
+      *n &= MASK; // root node : we clear high bits (zero bits mean level == 0)
+      if (n != b)
       {
-        UInt32 len;
-        i = 0;
-        for (len = maxLen; len != 0; len--)
+        // We go here, if we have some number of non-created nodes up to root.
+        // We process them in simplified code:
+        // position of parent for each pair of nodes is known.
+        // n[-2], n[-1] : current pair of child nodes
+        // (p1) : parent node for current pair.
+        UInt32 *p1 = n;
+        do
         {
-          UInt32 k;
-          for (k = lenCounters[len]; k != 0; k--)
-            lens[p[i++] & MASK] = (Byte)len;
+          const unsigned len = LOAD_PARENT(p1) + 1;
+          p1--;
+          (lenCounters    )[len] -= 2;     // we remove 2 leaves from level (len)
+          (lenCounters + 1)[len] += 2 * 2; // we add 4 leaves at level (len + 1)
+          n -= 2;
+          STORE_PARENT (n    , n[0], len)
+          STORE_PARENT (n + 1, n[1], len)
         }
+        while (n != b);
       }
-
+    }
+
+    if (b != p)
+    {
+      // we detect level of each node (realtive to root),
+      // and update lenCounters[].
+      // We process only intermediate nodes and we don't process leaves.
+      do
       {
-        UInt32 nextCodes[kMaxLen + 1];
-        {
-          UInt32 code = 0;
-          UInt32 len;
-          for (len = 1; len <= kMaxLen; len++)
-            nextCodes[len] = code = (code + lenCounters[(size_t)len - 1]) << 1;
-        }
-        /* if (code + lenCounters[kMaxLen] - 1 != (1 << kMaxLen) - 1) throw 1; */
-
-        {
-          UInt32 k;
-          for (k = 0; k < numSymbols; k++)
-            p[k] = nextCodes[lens[k]]++;
-        }
+        // if (ii < b)  : parent_bits_of (p[ii]) == index of parent node : ii < (p[ii])
+        // if (ii >= b) : parent_bits_of (p[ii]) == level of this (ii) node in tree
+        unsigned len;
+        b--;
+        len = (unsigned)LOAD_PARENT(p + LOAD_PARENT(b)) + 1;
+        STORE_PARENT (b, *b, len)
+        if (len >= maxLen)
+        {
+          // We are not allowed to create node at level (maxLen) and higher,
+          // because all leaves must be placed to level (maxLen) or lower.
+          // We find nearest allowed leaf and place current node to level of that leaf:
+          for (len = maxLen - 1; lenCounters[len] == 0; len--) {}
+        }
+        lenCounters[len]--;          // we remove 1 leaf from level (len)
+        (lenCounters + 1)[len] += 2; // we add 2 leaves at level (len + 1)
       }
+      while (b != p);
+    }
+  }
+  {
+    {
+      unsigned len = maxLen;
+      const UInt32 *p2 = p;
+      do
+      {
+        unsigned k = lenCounters[len];
+        if (k)
+          do
+            lens[(unsigned)*p2++ & MASK] = (Byte)len;
+          while (--k);
+      }
+      while (--len);
+    }
+    codes[0] = 0; // we don't want garbage values to be written to p[] array.
+    // codes[1] = 0;
+    {
+      UInt32 code = 0;
+      unsigned len;
+      for (len = 0; len < kMaxLen; len++)
+        (codes + 1)[len] = code = (code + lenCounters[len]) << 1;
+    }
+    /* if (code + lenCounters[kMaxLen] - 1 != (1 << kMaxLen) - 1) throw 1; */
+    {
+      const Byte * const limit = lens + numSymbols;
+      do
+      {
+        unsigned len;
+        UInt32 c;
+        len = lens[0]; c = codes[len]; p[0] = c; codes[len] = c + 1;
+        // len = lens[1]; c = codes[len]; p[1] = c; codes[len] = c + 1;
+        p += 1;
+        lens += 1;
+      }
+      while (lens != limit);
     }
   }
 }
@@ -150,5 +347,14 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb
 #undef kMaxLen
 #undef NUM_BITS
 #undef MASK
+#undef FREQ_MASK
 #undef NUM_COUNTERS
-#undef HUFFMAN_SPEED_OPT
+#undef CTR_ITEM_FOR_FREQ
+#undef LOAD_PARENT
+#undef STORE_PARENT
+#undef STORE_PARENT_DIRECT
+#undef UPDATE_E
+#undef HI_HALF_OFFSET
+#undef NUM_UNROLLS
+#undef lenCounters
+#undef codes
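The closing loops of the rewritten Huffman_Generate are the standard canonical-code assignment: lenCounters[len] records how many symbols received bit length len, a prefix scan derives the first code of each length, and codes are then handed out in symbol order. A self-contained sketch of that final stage (same convention, simplified types; not the source routine):

    /* lens[i] in 1..maxLen (0 = unused symbol); writes canonical codes[].
       Codes of equal length come out as consecutive integers. */
    static void AssignCanonicalCodes(const unsigned char *lens, unsigned numSymbols,
        unsigned maxLen, unsigned *codes)
    {
      unsigned lenCounters[17] = { 0 };
      unsigned nextCode[17];
      unsigned i, len, code = 0;
      for (i = 0; i < numSymbols; i++)
        lenCounters[lens[i]]++;
      lenCounters[0] = 0;                /* unused symbols get no real code */
      nextCode[0] = 0;
      for (len = 1; len <= maxLen; len++)
        nextCode[len] = code = (code + lenCounters[len - 1]) << 1;
      for (i = 0; i < numSymbols; i++)
        codes[i] = nextCode[lens[i]]++;
    }
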
diff --git a/C/HuffEnc.h b/C/HuffEnc.h
index cbc5d11..2217f55 100644
--- a/C/HuffEnc.h
+++ b/C/HuffEnc.h
@@ -1,5 +1,5 @@
 /* HuffEnc.h -- Huffman encoding
-2023-03-05 : Igor Pavlov : Public domain */
+Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_HUFF_ENC_H
 #define ZIP7_INC_HUFF_ENC_H
@@ -8,14 +8,14 @@
 
 EXTERN_C_BEGIN
 
+#define Z7_HUFFMAN_LEN_MAX 16
 /*
 Conditions:
-  num <= 1024 = 2 ^ NUM_BITS
+  2 <= num <= 1024 = 2 ^ NUM_BITS
   Sum(freqs) < 4M = 2 ^ (32 - NUM_BITS)
-  maxLen <= 16 = kMaxLen
+  1 <= maxLen <= 16 = Z7_HUFFMAN_LEN_MAX
   Num_Items(p) >= HUFFMAN_TEMP_SIZE(num)
 */
-
 void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 num, UInt32 maxLen);
 
 EXTERN_C_END
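A minimal usage sketch of this interface. The work array p receives the canonical codes on return; sizing it to num elements assumes (not shown in this hunk) that HUFFMAN_TEMP_SIZE(num) is num:

    #include "HuffEnc.h"

    #define NUM_SYMS 286  /* e.g. the Deflate literal/length alphabet */

    static void BuildTable(const UInt32 freqs[NUM_SYMS], Byte lens[NUM_SYMS],
        UInt32 codes[NUM_SYMS])
    {
      /* on return: lens[i] = bit length (0 for freq 0), codes[i] = canonical code */
      Huffman_Generate(freqs, codes, lens, NUM_SYMS, 15); /* maxLen = 15, as in Deflate */
    }
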
diff --git a/C/LzFind.c b/C/LzFind.c
index 1ce4046..6aba919 100644
--- a/C/LzFind.c
+++ b/C/LzFind.c
@@ -1,5 +1,5 @@
 /* LzFind.c -- Match finder for LZ algorithms
-2024-03-01 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -404,7 +404,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
     const unsigned nbMax =
       (p->numHashBytes == 2 ? 16 :
       (p->numHashBytes == 3 ? 24 : 32));
-    if (numBits > nbMax)
+    if (numBits >= nbMax)
       numBits = nbMax;
     if (numBits >= 32)
       hs = (UInt32)0 - 1;
@@ -416,14 +416,14 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
       hs |= (256 << kLzHash_CrcShift_2) - 1;
       {
         const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
-        if (hs > hs2)
+        if (hs >= hs2)
           hs = hs2;
       }
       hsCur = hs;
       if (p->expectedDataSize < historySize)
       {
         const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
-        if (hsCur > hs2)
+        if (hsCur >= hs2)
           hsCur = hs2;
       }
     }
@@ -434,7 +434,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
       if (p->expectedDataSize < historySize)
       {
         hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
-        if (hsCur > hs) // is it possible?
+        if (hsCur >= hs) // is it possible?
           hsCur = hs;
       }
     }
@@ -890,7 +890,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos,
       return d;
     {
       const Byte *pb = cur - delta;
-      curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
+      curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)];
       if (pb[maxLen] == cur[maxLen] && *pb == *cur)
       {
         UInt32 len = 0;
@@ -925,7 +925,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos,
         break;
       {
         ptrdiff_t diff;
-        curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
+        curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)];
         diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
         if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
         {
@@ -972,7 +972,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt
   // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }
 
   cmCheck = (UInt32)(pos - _cyclicBufferSize);
-  if ((UInt32)pos <= _cyclicBufferSize)
+  if ((UInt32)pos < _cyclicBufferSize)
     cmCheck = 0;
 
   if (cmCheck < curMatch)
@@ -980,7 +980,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt
   {
     const UInt32 delta = pos - curMatch;
     {
-      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
+      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1);
       const Byte *pb = cur - delta;
       unsigned len = (len0 < len1 ? len0 : len1);
       const UInt32 pair0 = pair[0];
@@ -1039,7 +1039,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const
   UInt32 cmCheck;
 
   cmCheck = (UInt32)(pos - _cyclicBufferSize);
-  if ((UInt32)pos <= _cyclicBufferSize)
+  if ((UInt32)pos < _cyclicBufferSize)
     cmCheck = 0;
 
   if (// curMatch >= pos || // failure
@@ -1048,7 +1048,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const
   {
     const UInt32 delta = pos - curMatch;
     {
-      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
+      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1);
       const Byte *pb = cur - delta;
       unsigned len = (len0 < len1 ? len0 : len1);
       if (pb[len] == cur[len])
@@ -1595,7 +1595,7 @@ static void Bt5_MatchFinder_Skip(void *_p, UInt32 num)
   UInt32 pos = p->pos; \
   UInt32 num2 = num; \
   /* (p->pos == p->posLimit) is not allowed here !!! */ \
-  { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \
+  { const UInt32 rem = p->posLimit - pos; if (num2 >= rem) num2 = rem; } \
   num -= num2; \
   { const UInt32 cycPos = p->cyclicBufferPos; \
     son = p->son + cycPos; \
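Every son[] access changed above computes where a match delta positions behind the current cyclic-buffer position lives; the rewritten test (_cyclicBufferPos < delta ? ...) is the same condition as the old (delta > _cyclicBufferPos ? ...), only with the operands reordered. The index math in isolation (a sketch; requires 0 < delta <= cbSize):

    /* Map "delta positions behind pos" into a cyclic buffer of cbSize slots.
       In unsigned arithmetic pos - delta wraps when pos < delta, and adding
       cbSize then lands on the correct slot. */
    static UInt32 CyclicBack(UInt32 pos, UInt32 delta, UInt32 cbSize)
    {
      return pos - delta + (pos < delta ? cbSize : 0);
    }
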
diff --git a/C/LzFindMt.c b/C/LzFindMt.c
index ac9d59d..25fcc46 100644
--- a/C/LzFindMt.c
+++ b/C/LzFindMt.c
@@ -1,5 +1,5 @@
 /* LzFindMt.c -- multithreaded Match finder for LZ algorithms
-2024-01-22 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -82,6 +82,8 @@ extern UInt64 g_NumIters_Bytes;
 Z7_NO_INLINE
 static void MtSync_Construct(CMtSync *p)
 {
+  p->affinityGroup = -1;
+  p->affinityInGroup = 0;
   p->affinity = 0;
   p->wasCreated = False;
   p->csWasInitialized = False;
@@ -259,6 +261,12 @@ static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *
   // return ERROR_TOO_MANY_POSTS; // for debug
   // return EINVAL; // for debug
 
+#ifdef _WIN32
+  if (p->affinityGroup >= 0)
+    wres = Thread_Create_With_Group(&p->thread, startAddress, obj,
+        (unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup);
+  else
+#endif
   if (p->affinity != 0)
     wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity);
   else
diff --git a/C/LzFindMt.h b/C/LzFindMt.h
index fcb479d..89984f5 100644
--- a/C/LzFindMt.h
+++ b/C/LzFindMt.h
@@ -1,5 +1,5 @@
 /* LzFindMt.h -- multithreaded Match finder for LZ algorithms
-2024-01-22 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_LZ_FIND_MT_H
 #define ZIP7_INC_LZ_FIND_MT_H
@@ -12,8 +12,10 @@ EXTERN_C_BEGIN
 typedef struct
 {
   UInt32 numProcessedBlocks;
-  CThread thread;
+  Int32 affinityGroup;
+  UInt64 affinityInGroup;
   UInt64 affinity;
+  CThread thread;
 
   BoolInt wasCreated;
   BoolInt needStart;
diff --git a/C/Lzma2Enc.c b/C/Lzma2Enc.c
index 703e146..72aec69 100644
--- a/C/Lzma2Enc.c
+++ b/C/Lzma2Enc.c
@@ -1,5 +1,5 @@
 /* Lzma2Enc.c -- LZMA2 Encoder
-2023-04-13 : Igor Pavlov : Public domain */
+: Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -235,6 +235,7 @@ void Lzma2EncProps_Init(CLzma2EncProps *p)
   p->numBlockThreads_Reduced = -1;
   p->numBlockThreads_Max = -1;
   p->numTotalThreads = -1;
+  p->numThreadGroups = 0;
 }
 
 void Lzma2EncProps_Normalize(CLzma2EncProps *p)
@@ -781,6 +782,7 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle p,
   }
 
   p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max;
+  p->mtCoder.numThreadGroups = p->props.numThreadGroups;
   p->mtCoder.expectedDataSize = p->expectedDataSize;
 
   {
diff --git a/C/Lzma2Enc.h b/C/Lzma2Enc.h
index cb25275..1e6b50c 100644
--- a/C/Lzma2Enc.h
+++ b/C/Lzma2Enc.h
@@ -18,6 +18,7 @@ typedef struct
   int numBlockThreads_Reduced;
   int numBlockThreads_Max;
   int numTotalThreads;
+  unsigned numThreadGroups; // 0 : no groups
 } CLzma2EncProps;
 
 void Lzma2EncProps_Init(CLzma2EncProps *p);
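numThreadGroups feeds MtCoder (see the MtCoder.c hunk below), which distributes block-encoder threads across Windows processor groups round-robin; 0 keeps the old single-group behavior. A configuration sketch (the values are illustrative):

    #include "Lzma2Enc.h"

    static void SetupLzma2Props(CLzma2EncProps *props)
    {
      Lzma2EncProps_Init(props);      /* sets numThreadGroups = 0 (no groups) */
      props->numThreadGroups = 2;     /* e.g. a machine with 2 processor groups */
      props->numBlockThreads_Max = 8;
    }
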
diff --git a/C/LzmaEnc.c b/C/LzmaEnc.c
index 088b78f..84a29a5 100644
--- a/C/LzmaEnc.c
+++ b/C/LzmaEnc.c
@@ -62,7 +62,9 @@ void LzmaEncProps_Init(CLzmaEncProps *p)
   p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1;
   p->numHashOutBits = 0;
   p->writeEndMark = 0;
+  p->affinityGroup = -1;
   p->affinity = 0;
+  p->affinityInGroup = 0;
 }
 
 void LzmaEncProps_Normalize(CLzmaEncProps *p)
@@ -598,6 +600,10 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props2)
   p->multiThread = (props.numThreads > 1);
   p->matchFinderMt.btSync.affinity =
   p->matchFinderMt.hashSync.affinity = props.affinity;
+  p->matchFinderMt.btSync.affinityGroup =
+  p->matchFinderMt.hashSync.affinityGroup = props.affinityGroup;
+  p->matchFinderMt.btSync.affinityInGroup =
+  p->matchFinderMt.hashSync.affinityInGroup = props.affinityInGroup;
   #endif
 
   return SZ_OK;
diff --git a/C/LzmaEnc.h b/C/LzmaEnc.h
index 9f8039a..3feb5b4 100644
--- a/C/LzmaEnc.h
+++ b/C/LzmaEnc.h
@@ -1,5 +1,5 @@
1/* LzmaEnc.h -- LZMA Encoder 1/* LzmaEnc.h -- LZMA Encoder
22023-04-13 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_LZMA_ENC_H 4#ifndef ZIP7_INC_LZMA_ENC_H
5#define ZIP7_INC_LZMA_ENC_H 5#define ZIP7_INC_LZMA_ENC_H
@@ -29,11 +29,13 @@ typedef struct
29 int numThreads; /* 1 or 2, default = 2 */ 29 int numThreads; /* 1 or 2, default = 2 */
30 30
31 // int _pad; 31 // int _pad;
32 Int32 affinityGroup;
32 33
33 UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1. 34 UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1.
34 Encoder uses this value to reduce dictionary size */ 35 Encoder uses this value to reduce dictionary size */
35 36
36 UInt64 affinity; 37 UInt64 affinity;
38 UInt64 affinityInGroup;
37} CLzmaEncProps; 39} CLzmaEncProps;
38 40
39void LzmaEncProps_Init(CLzmaEncProps *p); 41void LzmaEncProps_Init(CLzmaEncProps *p);
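Taken together the defaults declare three independent pinning controls: affinity (the legacy process-wide mask, 0 = unused), affinityGroup (-1 = no group), and affinityInGroup (mask inside the chosen group, 0 = whole group). A hedged example of filling them (the values are illustrative):

    CLzmaEncProps props;
    LzmaEncProps_Init(&props);
    /* pin both match-finder threads to CPUs 0..3 of processor group 1: */
    props.affinityGroup = 1;
    props.affinityInGroup = 0x0F;
    /* LzmaEnc_SetProps() then copies these into btSync/hashSync, per the hunk above */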
diff --git a/C/MtCoder.c b/C/MtCoder.c
index 03959b6..923b19a 100644
--- a/C/MtCoder.c
+++ b/C/MtCoder.c
@@ -1,5 +1,5 @@
1/* MtCoder.c -- Multi-thread Coder 1/* MtCoder.c -- Multi-thread Coder
22023-09-07 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -39,14 +39,28 @@ void MtProgressThunk_CreateVTable(CMtProgressThunk *p)
39static THREAD_FUNC_DECL ThreadFunc(void *pp); 39static THREAD_FUNC_DECL ThreadFunc(void *pp);
40 40
41 41
42static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) 42static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t
43#ifdef _WIN32
44 , CMtCoder * const mtc
45#endif
46 )
43{ 47{
44 WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent); 48 WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent);
49 // printf("\n====== MtCoderThread_CreateAndStart : \n");
45 if (wres == 0) 50 if (wres == 0)
46 { 51 {
47 t->stop = False; 52 t->stop = False;
48 if (!Thread_WasCreated(&t->thread)) 53 if (!Thread_WasCreated(&t->thread))
49 wres = Thread_Create(&t->thread, ThreadFunc, t); 54 {
55#ifdef _WIN32
56 if (mtc->numThreadGroups)
57 wres = Thread_Create_With_Group(&t->thread, ThreadFunc, t,
58 ThreadNextGroup_GetNext(&mtc->nextGroup), // group
59 0); // affinityMask
60 else
61#endif
62 wres = Thread_Create(&t->thread, ThreadFunc, t);
63 }
50 if (wres == 0) 64 if (wres == 0)
51 wres = Event_Set(&t->startEvent); 65 wres = Event_Set(&t->startEvent);
52 } 66 }
@@ -56,6 +70,7 @@ static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t)
56} 70}
57 71
58 72
73Z7_FORCE_INLINE
59static void MtCoderThread_Destruct(CMtCoderThread *t) 74static void MtCoderThread_Destruct(CMtCoderThread *t)
60{ 75{
61 if (Thread_WasCreated(&t->thread)) 76 if (Thread_WasCreated(&t->thread))
@@ -85,7 +100,7 @@ static void MtCoderThread_Destruct(CMtCoderThread *t)
85 100
86static SRes ThreadFunc2(CMtCoderThread *t) 101static SRes ThreadFunc2(CMtCoderThread *t)
87{ 102{
88 CMtCoder *mtc = t->mtCoder; 103 CMtCoder * const mtc = t->mtCoder;
89 104
90 for (;;) 105 for (;;)
91 { 106 {
@@ -185,7 +200,11 @@ static SRes ThreadFunc2(CMtCoderThread *t)
185 if (mtc->numStartedThreads < mtc->numStartedThreadsLimit 200 if (mtc->numStartedThreads < mtc->numStartedThreadsLimit
186 && mtc->expectedDataSize != readProcessed) 201 && mtc->expectedDataSize != readProcessed)
187 { 202 {
188 res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]); 203 res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]
204#ifdef _WIN32
205 , mtc
206#endif
207 );
189 if (res == SZ_OK) 208 if (res == SZ_OK)
190 mtc->numStartedThreads++; 209 mtc->numStartedThreads++;
191 else 210 else
@@ -221,7 +240,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
221 } 240 }
222 241
223 { 242 {
224 CMtCoderBlock *block = &mtc->blocks[bi]; 243 CMtCoderBlock * const block = &mtc->blocks[bi];
225 block->res = res; 244 block->res = res;
226 block->bufIndex = bufIndex; 245 block->bufIndex = bufIndex;
227 block->finished = finished; 246 block->finished = finished;
@@ -311,7 +330,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
311 330
312static THREAD_FUNC_DECL ThreadFunc(void *pp) 331static THREAD_FUNC_DECL ThreadFunc(void *pp)
313{ 332{
314 CMtCoderThread *t = (CMtCoderThread *)pp; 333 CMtCoderThread * const t = (CMtCoderThread *)pp;
315 for (;;) 334 for (;;)
316 { 335 {
317 if (Event_Wait(&t->startEvent) != 0) 336 if (Event_Wait(&t->startEvent) != 0)
@@ -319,7 +338,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp)
319 if (t->stop) 338 if (t->stop)
320 return 0; 339 return 0;
321 { 340 {
322 SRes res = ThreadFunc2(t); 341 const SRes res = ThreadFunc2(t);
323 CMtCoder *mtc = t->mtCoder; 342 CMtCoder *mtc = t->mtCoder;
324 if (res != SZ_OK) 343 if (res != SZ_OK)
325 { 344 {
@@ -328,7 +347,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp)
328 347
329 #ifndef MTCODER_USE_WRITE_THREAD 348 #ifndef MTCODER_USE_WRITE_THREAD
330 { 349 {
331 unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); 350 const unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads);
332 if (numFinished == mtc->numStartedThreads) 351 if (numFinished == mtc->numStartedThreads)
333 if (Event_Set(&mtc->finishedEvent) != 0) 352 if (Event_Set(&mtc->finishedEvent) != 0)
334 return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; 353 return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD;
@@ -346,6 +365,7 @@ void MtCoder_Construct(CMtCoder *p)
346 365
347 p->blockSize = 0; 366 p->blockSize = 0;
348 p->numThreadsMax = 0; 367 p->numThreadsMax = 0;
368 p->numThreadGroups = 0;
349 p->expectedDataSize = (UInt64)(Int64)-1; 369 p->expectedDataSize = (UInt64)(Int64)-1;
350 370
351 p->inStream = NULL; 371 p->inStream = NULL;
@@ -429,6 +449,8 @@ SRes MtCoder_Code(CMtCoder *p)
429 unsigned i; 449 unsigned i;
430 SRes res = SZ_OK; 450 SRes res = SZ_OK;
431 451
452 // printf("\n====== MtCoder_Code : \n");
453
432 if (numThreads > MTCODER_THREADS_MAX) 454 if (numThreads > MTCODER_THREADS_MAX)
433 numThreads = MTCODER_THREADS_MAX; 455 numThreads = MTCODER_THREADS_MAX;
434 numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); 456 numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads);
@@ -492,11 +514,22 @@ SRes MtCoder_Code(CMtCoder *p)
492 514
493 p->numStartedThreadsLimit = numThreads; 515 p->numStartedThreadsLimit = numThreads;
494 p->numStartedThreads = 0; 516 p->numStartedThreads = 0;
517 ThreadNextGroup_Init(&p->nextGroup, p->numThreadGroups, 0); // startGroup
495 518
496 // for (i = 0; i < numThreads; i++) 519 // for (i = 0; i < numThreads; i++)
497 { 520 {
 521 // here we create a new thread for the first block.
 522 // And each new thread will create another new thread after block reading,
 523 // until numStartedThreadsLimit is reached.
498 CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++]; 524 CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++];
499 RINOK(MtCoderThread_CreateAndStart(nextThread)) 525 {
526 const SRes res2 = MtCoderThread_CreateAndStart(nextThread
527#ifdef _WIN32
528 , p
529#endif
530 );
531 RINOK(res2)
532 }
500 } 533 }
501 534
502 RINOK_THREAD(Event_Set(&p->readEvent)) 535 RINOK_THREAD(Event_Set(&p->readEvent))
@@ -513,9 +546,9 @@ SRes MtCoder_Code(CMtCoder *p)
513 RINOK_THREAD(Event_Wait(&p->writeEvents[bi])) 546 RINOK_THREAD(Event_Wait(&p->writeEvents[bi]))
514 547
515 { 548 {
516 const CMtCoderBlock *block = &p->blocks[bi]; 549 const CMtCoderBlock * const block = &p->blocks[bi];
517 unsigned bufIndex = block->bufIndex; 550 const unsigned bufIndex = block->bufIndex;
518 BoolInt finished = block->finished; 551 const BoolInt finished = block->finished;
519 if (res == SZ_OK && block->res != SZ_OK) 552 if (res == SZ_OK && block->res != SZ_OK)
520 res = block->res; 553 res = block->res;
521 554
@@ -545,7 +578,7 @@ SRes MtCoder_Code(CMtCoder *p)
545 } 578 }
546 #else 579 #else
547 { 580 {
548 WRes wres = Event_Wait(&p->finishedEvent); 581 const WRes wres = Event_Wait(&p->finishedEvent);
549 res = MY_SRes_HRESULT_FROM_WRes(wres); 582 res = MY_SRes_HRESULT_FROM_WRes(wres);
550 } 583 }
551 #endif 584 #endif
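The creation flow is now: MtCoder_Code() starts one thread for the first block, each running thread starts the next one after reading its block, and on Windows ThreadNextGroup_GetNext() deals processor groups out round-robin. A condensed, hypothetical version of that assignment (the loop and its variables are placeholders; the real code interleaves this with the event machinery above):

    CThreadNextGroup ng;
    ThreadNextGroup_Init(&ng, numThreadGroups, 0); /* 0 : startGroup */
    for (unsigned i = 0; i < numThreadsMax; i++)
    {
        const unsigned group = ThreadNextGroup_GetNext(&ng); /* 0, 1, ..., n-1, 0, ... */
        /* affinityMask 0 keeps the whole group available, as in
           MtCoderThread_CreateAndStart() above: */
        Thread_Create_With_Group(&threads[i], ThreadFunc, &ctx[i], group, 0);
    }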
diff --git a/C/MtCoder.h b/C/MtCoder.h
index 1231d3c..8166cca 100644
--- a/C/MtCoder.h
+++ b/C/MtCoder.h
@@ -1,5 +1,5 @@
1/* MtCoder.h -- Multi-thread Coder 1/* MtCoder.h -- Multi-thread Coder
22023-04-13 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_MT_CODER_H 4#ifndef ZIP7_INC_MT_CODER_H
5#define ZIP7_INC_MT_CODER_H 5#define ZIP7_INC_MT_CODER_H
@@ -16,7 +16,7 @@ EXTERN_C_BEGIN
16 16
17#ifndef Z7_ST 17#ifndef Z7_ST
18 #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1) 18 #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1)
19 #define MTCODER_THREADS_MAX 64 19 #define MTCODER_THREADS_MAX 256
20 #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3) 20 #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3)
21#else 21#else
22 #define MTCODER_THREADS_MAX 1 22 #define MTCODER_THREADS_MAX 1
@@ -77,6 +77,7 @@ typedef struct CMtCoder_
77 77
78 size_t blockSize; /* size of input block */ 78 size_t blockSize; /* size of input block */
79 unsigned numThreadsMax; 79 unsigned numThreadsMax;
80 unsigned numThreadGroups;
80 UInt64 expectedDataSize; 81 UInt64 expectedDataSize;
81 82
82 ISeqInStreamPtr inStream; 83 ISeqInStreamPtr inStream;
@@ -125,6 +126,8 @@ typedef struct CMtCoder_
125 CMtProgress mtProgress; 126 CMtProgress mtProgress;
126 CMtCoderBlock blocks[MTCODER_BLOCKS_MAX]; 127 CMtCoderBlock blocks[MTCODER_BLOCKS_MAX];
127 CMtCoderThread threads[MTCODER_THREADS_MAX]; 128 CMtCoderThread threads[MTCODER_THREADS_MAX];
129
130 CThreadNextGroup nextGroup;
128} CMtCoder; 131} CMtCoder;
129 132
130 133
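Raising MTCODER_THREADS_MAX from 64 to 256 also scales the static block pool: MTCODER_GET_NUM_BLOCKS_FROM_THREADS(256) = 256 + 256/8 + 1 = 289, so MTCODER_BLOCKS_MAX grows from 64 + 8 + 1 + 3 = 76 to 289 + 3 = 292. Since the blocks[] and threads[] arrays are embedded in CMtCoder by value, the struct itself grows accordingly.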
diff --git a/C/Sha512.c b/C/Sha512.c
index 04827d6..f0787fd 100644
--- a/C/Sha512.c
+++ b/C/Sha512.c
@@ -439,26 +439,78 @@ void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize)
439 439
440 440
441 441
442// #define Z7_SHA512_PROBE_DEBUG // for debug
442 443
443#if defined(_WIN32) && defined(Z7_COMPILER_SHA512_SUPPORTED) \ 444#if defined(Z7_SHA512_PROBE_DEBUG) || defined(Z7_COMPILER_SHA512_SUPPORTED)
444 && defined(MY_CPU_ARM64) // we can disable this check to debug in x64
445 445
446#if 1 // 0 for debug 446#if defined(Z7_SHA512_PROBE_DEBUG) \
447 || defined(_WIN32) && defined(MY_CPU_ARM64)
448#ifndef Z7_SHA512_USE_PROBE
449#define Z7_SHA512_USE_PROBE
450#endif
451#endif
447 452
448#include "7zWindows.h" 453#ifdef Z7_SHA512_USE_PROBE
449// #include <stdio.h> 454
450#if 0 && defined(MY_CPU_X86_OR_AMD64) 455#ifdef Z7_SHA512_PROBE_DEBUG
451#include <intrin.h> // for debug : for __ud2() 456#include <stdio.h>
457#define PRF(x) x
458#else
459#define PRF(x)
452#endif 460#endif
453 461
454BoolInt CPU_IsSupported_SHA512(void) 462#if 0 || !defined(_MSC_VER) // 1 || : for debug LONGJMP mode
463// MINGW doesn't support __try. So we use signal() / longjmp().
464// Note: signal() / longjmp() is probably not thread-safe.
465// So we must call Sha512Prepare() from the main thread at program start.
466#ifndef Z7_SHA512_USE_LONGJMP
467#define Z7_SHA512_USE_LONGJMP
468#endif
469#endif
470
471#ifdef Z7_SHA512_USE_LONGJMP
472#include <signal.h>
473#include <setjmp.h>
474static jmp_buf g_Sha512_jmp_buf;
475// static int g_Sha512_Unsupported;
476
477#if defined(__GNUC__) && (__GNUC__ >= 8) \
478 || defined(__clang__) && (__clang_major__ >= 3)
479 __attribute__((noreturn))
480#endif
481static void Z7_CDECL Sha512_signal_Handler(int v)
455{ 482{
483 PRF(printf("======== Sha512_signal_Handler = %x\n", (unsigned)v);)
484 // g_Sha512_Unsupported = 1;
485 longjmp(g_Sha512_jmp_buf, 1);
486}
487#endif // Z7_SHA512_USE_LONGJMP
488
489
490#if defined(_WIN32)
491#include "7zWindows.h"
492#endif
493
456#if defined(MY_CPU_ARM64) 494#if defined(MY_CPU_ARM64)
495// #define Z7_SHA512_USE_SIMPLIFIED_PROBE // for debug
496#endif
497
498#ifdef Z7_SHA512_USE_SIMPLIFIED_PROBE
499#include <arm_neon.h>
500#if defined(__clang__)
501 __attribute__((__target__("sha3")))
502#elif !defined(_MSC_VER)
503 __attribute__((__target__("arch=armv8.2-a+sha3")))
504#endif
505#endif
506static BoolInt CPU_IsSupported_SHA512_Probe(void)
507{
508 PRF(printf("\n== CPU_IsSupported_SHA512_Probe\n");)
509#if defined(_WIN32) && defined(MY_CPU_ARM64)
457 // we have no SHA512 flag for IsProcessorFeaturePresent() still. 510 // we have no SHA512 flag for IsProcessorFeaturePresent() still.
458 if (!CPU_IsSupported_CRYPTO()) 511 if (!CPU_IsSupported_CRYPTO())
459 return False; 512 return False;
460#endif 513 PRF(printf("==== Registry check\n");)
461 // printf("\nCPU_IsSupported_SHA512\n");
462 { 514 {
463 // we can't read ID_AA64ISAR0_EL1 register from application. 515 // we can't read ID_AA64ISAR0_EL1 register from application.
464 // but ID_AA64ISAR0_EL1 register is mapped to "CP 4030" registry value. 516 // but ID_AA64ISAR0_EL1 register is mapped to "CP 4030" registry value.
@@ -486,6 +538,7 @@ BoolInt CPU_IsSupported_SHA512(void)
486 // 2 : SHA256 and SHA512 implemented 538 // 2 : SHA256 and SHA512 implemented
487 } 539 }
488 } 540 }
541#endif // defined(_WIN32) && defined(MY_CPU_ARM64)
489 542
490 543
491#if 1 // 0 for debug to disable SHA512 PROBE code 544#if 1 // 0 for debug to disable SHA512 PROBE code
@@ -509,59 +562,97 @@ Does this PROBE code work in native Windows-arm64 (with/without sha512 hw instru
509Are there any ways to fix the problems with arm64-wine and x64-SDE cases? 562Are there any ways to fix the problems with arm64-wine and x64-SDE cases?
510*/ 563*/
511 564
512 // printf("\n========== CPU_IsSupported_SHA512 PROBE ========\n"); 565 PRF(printf("==== CPU_IsSupported_SHA512 PROBE\n");)
513 { 566 {
567 BoolInt isSupported = False;
568#ifdef Z7_SHA512_USE_LONGJMP
569 void (Z7_CDECL *signal_prev)(int);
570 /*
571 if (g_Sha512_Unsupported)
572 {
573 PRF(printf("==== g_Sha512_Unsupported\n");)
574 return False;
575 }
576 */
 577 PRF(printf("====== signal(SIGILL)\n");)
578 signal_prev = signal(SIGILL, Sha512_signal_Handler);
579 if (signal_prev == SIG_ERR)
580 {
581 PRF(printf("====== signal fail\n");)
582 return False;
583 }
584 // PRF(printf("==== signal_prev = %p\n", (void *)signal_prev);)
585 // docs: Before the specified function is executed,
586 // the value of func is set to SIG_DFL.
587 // So we can exit if (setjmp(g_Sha512_jmp_buf) != 0).
588 PRF(printf("====== setjmp\n");)
589 if (!setjmp(g_Sha512_jmp_buf))
590#else // Z7_SHA512_USE_LONGJMP
591
592#ifdef _MSC_VER
514#ifdef __clang_major__ 593#ifdef __clang_major__
515 #pragma GCC diagnostic ignored "-Wlanguage-extension-token" 594 #pragma GCC diagnostic ignored "-Wlanguage-extension-token"
516#endif 595#endif
517 __try 596 __try
597#endif
598#endif // Z7_SHA512_USE_LONGJMP
599
518 { 600 {
519#if 0 // 1 : for debug (reduced version to detect sha512) 601#if defined(Z7_COMPILER_SHA512_SUPPORTED)
602#ifdef Z7_SHA512_USE_SIMPLIFIED_PROBE
603 // simplified sha512 check for arm64:
520 const uint64x2_t a = vdupq_n_u64(1); 604 const uint64x2_t a = vdupq_n_u64(1);
521 const uint64x2_t b = vsha512hq_u64(a, a, a); 605 const uint64x2_t b = vsha512hq_u64(a, a, a);
606 PRF(printf("======== vsha512hq_u64 probe\n");)
522 if ((UInt32)vgetq_lane_u64(b, 0) == 0x11800002) 607 if ((UInt32)vgetq_lane_u64(b, 0) == 0x11800002)
523 return True;
524#else 608#else
525 MY_ALIGN(16) 609 MY_ALIGN(16)
526 UInt64 temp[SHA512_NUM_DIGEST_WORDS + SHA512_NUM_BLOCK_WORDS]; 610 UInt64 temp[SHA512_NUM_DIGEST_WORDS + SHA512_NUM_BLOCK_WORDS];
527 memset(temp, 0x5a, sizeof(temp)); 611 memset(temp, 0x5a, sizeof(temp));
528#if 0 && defined(MY_CPU_X86_OR_AMD64) 612 PRF(printf("======== Sha512_UpdateBlocks_HW\n");)
529 __ud2(); // for debug : that exception is not problem for SDE
530#endif
531#if 1
532 Sha512_UpdateBlocks_HW(temp, 613 Sha512_UpdateBlocks_HW(temp,
533 (const Byte *)(const void *)(temp + SHA512_NUM_DIGEST_WORDS), 1); 614 (const Byte *)(const void *)(temp + SHA512_NUM_DIGEST_WORDS), 1);
534 // printf("\n==== t = %x\n", (UInt32)temp[0]); 615 // PRF(printf("======== t = %x\n", (UInt32)temp[0]);)
535 if ((UInt32)temp[0] == 0xa33cfdf7) 616 if ((UInt32)temp[0] == 0xa33cfdf7)
617#endif
536 { 618 {
537 // printf("\n=== PROBE SHA512: SHA512 supported\n"); 619 PRF(printf("======== PROBE SHA512: SHA512 is supported\n");)
538 return True; 620 isSupported = True;
539 } 621 }
622#else // Z7_COMPILER_SHA512_SUPPORTED
 623 // for debug : we generate a bad instruction or raise an exception.
624 // __except() doesn't catch raise() calls.
625#ifdef Z7_SHA512_USE_LONGJMP
626 PRF(printf("====== raise(SIGILL)\n");)
627 raise(SIGILL);
628#else
629#if defined(_MSC_VER) && defined(MY_CPU_X86)
630 __asm ud2
540#endif 631#endif
541#endif 632#endif // Z7_SHA512_USE_LONGJMP
633#endif // Z7_COMPILER_SHA512_SUPPORTED
542 } 634 }
635
636#ifdef Z7_SHA512_USE_LONGJMP
637 PRF(printf("====== restore signal SIGILL\n");)
638 signal(SIGILL, signal_prev);
639#elif _MSC_VER
543 __except (EXCEPTION_EXECUTE_HANDLER) 640 __except (EXCEPTION_EXECUTE_HANDLER)
544 { 641 {
545 // printf("\n==== CPU_IsSupported_SHA512 EXCEPTION_EXECUTE_HANDLER\n"); 642 PRF(printf("==== CPU_IsSupported_SHA512 __except(EXCEPTION_EXECUTE_HANDLER)\n");)
546 } 643 }
644#endif
645 PRF(printf("== return (sha512 supported) = %d\n", isSupported);)
646 return isSupported;
547 } 647 }
548 return False;
549#else 648#else
550 // without SHA512 PROBE code 649 // without SHA512 PROBE code
551 return True; 650 return True;
552#endif 651#endif
553
554} 652}
555 653
556#else 654#endif // Z7_SHA512_USE_PROBE
557 655#endif // defined(Z7_SHA512_PROBE_DEBUG) || defined(Z7_COMPILER_SHA512_SUPPORTED)
558BoolInt CPU_IsSupported_SHA512(void)
559{
560 return False;
561}
562
563#endif
564#endif // WIN32 arm64
565 656
566 657
567void Sha512Prepare(void) 658void Sha512Prepare(void)
@@ -570,10 +661,10 @@ void Sha512Prepare(void)
570 SHA512_FUNC_UPDATE_BLOCKS f, f_hw; 661 SHA512_FUNC_UPDATE_BLOCKS f, f_hw;
571 f = Sha512_UpdateBlocks; 662 f = Sha512_UpdateBlocks;
572 f_hw = NULL; 663 f_hw = NULL;
573#ifdef MY_CPU_X86_OR_AMD64 664#ifdef Z7_SHA512_USE_PROBE
574 if (CPU_IsSupported_SHA512() 665 if (CPU_IsSupported_SHA512_Probe())
575 && CPU_IsSupported_AVX2() 666#elif defined(MY_CPU_X86_OR_AMD64)
576 ) 667 if (CPU_IsSupported_SHA512() && CPU_IsSupported_AVX2())
577#else 668#else
578 if (CPU_IsSupported_SHA512()) 669 if (CPU_IsSupported_SHA512())
579#endif 670#endif
@@ -583,6 +674,8 @@ void Sha512Prepare(void)
583 } 674 }
584 g_SHA512_FUNC_UPDATE_BLOCKS = f; 675 g_SHA512_FUNC_UPDATE_BLOCKS = f;
585 g_SHA512_FUNC_UPDATE_BLOCKS_HW = f_hw; 676 g_SHA512_FUNC_UPDATE_BLOCKS_HW = f_hw;
677#elif defined(Z7_SHA512_PROBE_DEBUG)
678 CPU_IsSupported_SHA512_Probe(); // for debug
586#endif 679#endif
587} 680}
588 681
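Stripped of the 7-Zip specifics, the probe pattern above is: install a SIGILL handler, setjmp, execute one candidate instruction, longjmp back if the CPU faults, then restore the handler. A minimal self-contained sketch of the same technique (not the 7-Zip code; the probe body is whatever single instruction is being tested):

    #include <setjmp.h>
    #include <signal.h>

    static jmp_buf g_jmp;

    static void handler(int sig)
    {
        (void)sig;
        longjmp(g_jmp, 1); /* jump out of the faulting instruction */
    }

    /* returns 1 if probe() ran without raising SIGILL */
    static int cpu_feature_probe(void (*probe)(void))
    {
        int ok = 0;
        void (*prev)(int) = signal(SIGILL, handler);
        if (prev == SIG_ERR)
            return 0;
        if (!setjmp(g_jmp))
        {
            probe();  /* executes one instruction of the optional ISA extension */
            ok = 1;
        }
        signal(SIGILL, prev); /* restore the previous handler */
        return ok;
    }

As the source comments warn, signal()/longjmp() state is process-global and not thread-safe, which is why Sha512Prepare() is expected to run once on the main thread at startup.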
diff --git a/C/Sort.c b/C/Sort.c
index e1097e3..20e3e69 100644
--- a/C/Sort.c
+++ b/C/Sort.c
@@ -1,141 +1,268 @@
1/* Sort.c -- Sort functions 1/* Sort.c -- Sort functions
22014-04-05 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
6#include "Sort.h" 6#include "Sort.h"
7#include "CpuArch.h"
7 8
8#define HeapSortDown(p, k, size, temp) \ 9#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
9 { for (;;) { \ 10 || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \
10 size_t s = (k << 1); \ 11 )
11 if (s > size) break; \ 12// the code with prefetch is slow for small arrays on x86.
12 if (s < size && p[s + 1] > p[s]) s++; \ 13// So we disable prefetch for x86.
13 if (temp >= p[s]) break; \ 14#ifndef MY_CPU_X86
14 p[k] = p[s]; k = s; \ 15 // #pragma message("Z7_PREFETCH : __builtin_prefetch")
15 } p[k] = temp; } 16 #define Z7_PREFETCH(a) __builtin_prefetch((a))
17#endif
16 18
17void HeapSort(UInt32 *p, size_t size) 19#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200)
18{ 20
19 if (size <= 1) 21#include "7zWindows.h"
20 return; 22
21 p--; 23// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1.
22 { 24// For example, clang-cl can generate "prefetcht2" instruction for
23 size_t i = size / 2; 25// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call.
24 do 26// But we want to generate "prefetcht0" instruction.
25 { 27// So for CLANG/GCC we must use __builtin_prefetch() in code branch above
26 UInt32 temp = p[i]; 28// instead of PreFetchCacheLine() / _mm_prefetch().
27 size_t k = i; 29
28 HeapSortDown(p, k, size, temp) 30// New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call.
29 } 31// But old x86 cpus don't support "prefetcht0".
30 while (--i != 0); 32// So we will use PreFetchCacheLine(), only if we are sure that
31 } 33// generated instruction is supported by all cpus of that isa.
32 /* 34#if defined(MY_CPU_AMD64) \
33 do 35 || defined(MY_CPU_ARM64) \
34 { 36 || defined(MY_CPU_IA64)
35 size_t k = 1; 37// we need to use additional braces for (a) in PreFetchCacheLine call, because
36 UInt32 temp = p[size]; 38// PreFetchCacheLine macro doesn't use braces:
37 p[size--] = p[1]; 39// #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l)
38 HeapSortDown(p, k, size, temp) 40 // #pragma message("Z7_PREFETCH : PreFetchCacheLine")
39 } 41 #define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a))
40 while (size > 1); 42#endif
41 */ 43
42 while (size > 3) 44#endif // _WIN32
43 { 45
44 UInt32 temp = p[size]; 46
45 size_t k = (p[3] > p[2]) ? 3 : 2; 47#define PREFETCH_NO(p,k,s,size)
46 p[size--] = p[1]; 48
47 p[1] = p[k]; 49#ifndef Z7_PREFETCH
48 HeapSortDown(p, k, size, temp) 50 #define SORT_PREFETCH(p,k,s,size)
49 } 51#else
50 { 52
51 UInt32 temp = p[size]; 53// #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes
52 p[size] = p[1]; 54#define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch)
53 if (size > 2 && p[2] < temp) 55// #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch)
54 { 56
55 p[1] = p[2]; 57#if PREFETCH_LEVEL == 0
56 p[2] = temp; 58
57 } 59 #define SORT_PREFETCH(p,k,s,size)
58 else 60
59 p[1] = temp; 61#else // PREFETCH_LEVEL != 0
60 } 62
63/*
64if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
65 we prefetch one value per cache line.
66 Use it if array is aligned for cache line size (64 bytes)
67 or if array is small (less than L1 cache size).
68
69if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
 70 we prefetch all cache lines that can be required.
71 it can be faster for big unaligned arrays.
72*/
73 #define USE_PREFETCH_FOR_ALIGNED_ARRAY
74
75// s == k * 2
76#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64)
77 // x86 supports (lea r1*8+offset)
78 #define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL)
79#else
80 #define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1))
81#endif
82
83#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
84 #define PREFETCH_ADD_OFFSET 0
85#else
 86 // last offset that can be required in PREFETCH_LEVEL step:
87 #define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1)
88 #define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2
89#endif
90
91#if PREFETCH_LEVEL <= 3
92
93#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
94 #define SORT_PREFETCH(p,k,s,size) \
95 { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \
96 if (s2 <= size) { \
97 Z7_PREFETCH((p + s2)); \
98 }}
99#else /* for unaligned array */
100 #define SORT_PREFETCH(p,k,s,size) \
101 { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
102 if (s2 <= size) { \
103 Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
104 Z7_PREFETCH((p + s2)); \
105 }}
106#endif
107
108#else // PREFETCH_LEVEL > 3
109
110#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
111 #define SORT_PREFETCH(p,k,s,size) \
112 { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \
113 if (s2 <= size) { \
114 Z7_PREFETCH((p + s2 - 16)); \
115 Z7_PREFETCH((p + s2)); \
116 }}
117#else /* for unaligned array */
118 #define SORT_PREFETCH(p,k,s,size) \
119 { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
120 if (s2 <= size) { \
121 Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
122 Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \
123 Z7_PREFETCH((p + s2)); \
124 }}
125#endif
126
127#endif // PREFETCH_LEVEL > 3
128#endif // PREFETCH_LEVEL != 0
129#endif // Z7_PREFETCH
130
131
132#if defined(MY_CPU_ARM64) \
133 /* || defined(MY_CPU_AMD64) */ \
134 /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */
135 // we want to use cmov, if cmov is very fast:
136 // - this cmov version is slower for clang-x64.
137 // - this cmov version is faster for gcc-arm64 for some fast arm64 cpus.
138 #define Z7_FAST_CMOV_SUPPORTED
139#endif
140
141#ifdef Z7_FAST_CMOV_SUPPORTED
142 // we want to use cmov here, if cmov is fast: new arm64 cpus.
143 // we want the compiler to use conditional move for this branch
144 #define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1;
145#else
146 // use this branch, if cpu doesn't support fast conditional move.
147 // it uses slow array access reading:
148 #define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow;
149#endif
150
151#define HeapSortDown(p, k, size, temp, macro_prefetch) \
152{ \
153 for (;;) { \
154 UInt32 n0, n1; \
155 size_t s = k * 2; \
156 if (s >= size) { \
157 if (s == size) { \
158 n0 = p[s]; \
159 p[k] = n0; \
160 if (temp < n0) k = s; \
161 } \
162 break; \
163 } \
164 n0 = p[k * 2]; \
165 n1 = p[k * 2 + 1]; \
166 s += n0 < n1; \
167 GET_MAX_VAL(n0, n1, p[s]) \
168 if (temp >= n0) break; \
169 macro_prefetch(p, k, s, size) \
170 p[k] = n0; \
171 k = s; \
172 } \
173 p[k] = temp; \
61} 174}
62 175
63void HeapSort64(UInt64 *p, size_t size) 176
177/*
178stage-1 : O(n) :
179 we generate intermediate partially sorted binary tree:
180 p[0] : it's additional item for better alignment of tree structure in memory.
181 p[1]
182 p[2] p[3]
183 p[4] p[5] p[6] p[7]
184 ...
185 p[x] >= p[x * 2]
186 p[x] >= p[x * 2 + 1]
187
188stage-2 : O(n * log2(n)) :
189 we move the largest item p[0] from the head of the tree to the end of the array
190 and insert the last item into the sorted binary tree.
191*/
192
193// (p) must be aligned for cache line size (64-bytes) for best performance
194
195void Z7_FASTCALL HeapSort(UInt32 *p, size_t size)
64{ 196{
65 if (size <= 1) 197 if (size < 2)
66 return; 198 return;
67 p--; 199 if (size == 2)
68 {
69 size_t i = size / 2;
70 do
71 {
72 UInt64 temp = p[i];
73 size_t k = i;
74 HeapSortDown(p, k, size, temp)
75 }
76 while (--i != 0);
77 }
78 /*
79 do
80 { 200 {
81 size_t k = 1; 201 const UInt32 a0 = p[0];
82 UInt64 temp = p[size]; 202 const UInt32 a1 = p[1];
83 p[size--] = p[1]; 203 const unsigned k = a1 < a0;
84 HeapSortDown(p, k, size, temp) 204 p[k] = a0;
85 } 205 p[k ^ 1] = a1;
86 while (size > 1); 206 return;
87 */
88 while (size > 3)
89 {
90 UInt64 temp = p[size];
91 size_t k = (p[3] > p[2]) ? 3 : 2;
92 p[size--] = p[1];
93 p[1] = p[k];
94 HeapSortDown(p, k, size, temp)
95 } 207 }
96 { 208 {
97 UInt64 temp = p[size]; 209 // stage-1 : O(n)
98 p[size] = p[1]; 210 // we transform array to partially sorted binary tree.
99 if (size > 2 && p[2] < temp) 211 size_t i = --size / 2;
 212 // (size) is now the index of the last item in the tree.
213 // if (i)
100 { 214 {
101 p[1] = p[2]; 215 do
102 p[2] = temp; 216 {
217 const UInt32 temp = p[i];
218 size_t k = i;
219 HeapSortDown(p, k, size, temp, PREFETCH_NO)
220 }
221 while (--i);
222 }
223 {
224 const UInt32 temp = p[0];
225 const UInt32 a1 = p[1];
226 if (temp < a1)
227 {
228 size_t k = 1;
229 p[0] = a1;
230 HeapSortDown(p, k, size, temp, PREFETCH_NO)
231 }
103 } 232 }
104 else
105 p[1] = temp;
106 } 233 }
107}
108 234
109/* 235 if (size < 3)
110#define HeapSortRefDown(p, vals, n, size, temp) \ 236 {
111 { size_t k = n; UInt32 val = vals[temp]; for (;;) { \ 237 // size == 2
112 size_t s = (k << 1); \ 238 const UInt32 a0 = p[0];
113 if (s > size) break; \ 239 p[0] = p[2];
114 if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \ 240 p[2] = a0;
115 if (val >= vals[p[s]]) break; \
116 p[k] = p[s]; k = s; \
117 } p[k] = temp; }
118
119void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size)
120{
121 if (size <= 1)
122 return; 241 return;
123 p--; 242 }
243 if (size != 3)
124 { 244 {
125 size_t i = size / 2; 245 // stage-2 : O(size) * log2(size):
246 // we move largest item p[0] from head to the end of array,
247 // and insert last item to sorted binary tree.
126 do 248 do
127 { 249 {
128 UInt32 temp = p[i]; 250 const UInt32 temp = p[size];
129 HeapSortRefDown(p, vals, i, size, temp); 251 size_t k = p[2] < p[3] ? 3 : 2;
252 p[size--] = p[0];
253 p[0] = p[1];
254 p[1] = p[k];
255 HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO
130 } 256 }
131 while (--i != 0); 257 while (size != 3);
132 } 258 }
133 do
134 { 259 {
135 UInt32 temp = p[size]; 260 const UInt32 a2 = p[2];
136 p[size--] = p[1]; 261 const UInt32 a3 = p[3];
137 HeapSortRefDown(p, vals, 1, size, temp); 262 const size_t k = a2 < a3;
263 p[2] = p[1];
264 p[3] = p[0];
265 p[k] = a3;
266 p[k ^ 1] = a2;
138 } 267 }
139 while (size > 1);
140} 268}
141*/
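The prefetch arithmetic above can be checked by hand for the default PREFETCH_LEVEL == 3 and a 64-byte cache line: with s == 2*k, PREFETCH_OFFSET(k,s) == k << 4 == 16*k, the first of the 16 descendants four levels below node k. Sixteen UInt32 items are exactly one cache line, so on a line-aligned array a single Z7_PREFETCH covers every index the sift-down can reach four iterations later; the unaligned variant adds PREFETCH_RANGE == (2 << 3) - 1 == 15 and issues two prefetches to cover the same span across a line boundary.

Since HeapSort() now carries the Z7_FASTCALL convention and is the only remaining entry point, a throwaway harness might look like this (a hedged sketch, assuming the UInt32 and MY_ALIGN definitions pulled in through Sort.h):

    #include <stdio.h>
    #include "Sort.h"

    int main(void)
    {
        /* 64-byte alignment matches the "(p) must be aligned for cache line size"
           recommendation in the comment above */
        MY_ALIGN(64) UInt32 a[8] = { 5, 1, 9, 3, 7, 2, 8, 4 };
        size_t i;
        HeapSort(a, 8);
        for (i = 0; i < 8; i++)
            printf("%u ", (unsigned)a[i]);
        printf("\n"); /* prints: 1 2 3 4 5 7 8 9 */
        return 0;
    }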
diff --git a/C/Sort.h b/C/Sort.h
index 1817b65..de5a4e8 100644
--- a/C/Sort.h
+++ b/C/Sort.h
@@ -1,5 +1,5 @@
1/* Sort.h -- Sort functions 1/* Sort.h -- Sort functions
22023-03-05 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_SORT_H 4#ifndef ZIP7_INC_SORT_H
5#define ZIP7_INC_SORT_H 5#define ZIP7_INC_SORT_H
@@ -8,10 +8,7 @@
8 8
9EXTERN_C_BEGIN 9EXTERN_C_BEGIN
10 10
11void HeapSort(UInt32 *p, size_t size); 11void Z7_FASTCALL HeapSort(UInt32 *p, size_t size);
12void HeapSort64(UInt64 *p, size_t size);
13
14/* void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size); */
15 12
16EXTERN_C_END 13EXTERN_C_END
17 14
diff --git a/C/Threads.c b/C/Threads.c
index 464efec..177d1d9 100644
--- a/C/Threads.c
+++ b/C/Threads.c
@@ -1,5 +1,5 @@
1/* Threads.c -- multithreading library 1/* Threads.c -- multithreading library
22024-03-28 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -59,6 +59,100 @@ WRes Thread_Wait_Close(CThread *p)
59 return (res != 0 ? res : res2); 59 return (res != 0 ? res : res2);
60} 60}
61 61
62typedef struct MY_PROCESSOR_NUMBER {
63 WORD Group;
64 BYTE Number;
65 BYTE Reserved;
66} MY_PROCESSOR_NUMBER, *MY_PPROCESSOR_NUMBER;
67
68typedef struct MY_GROUP_AFFINITY {
69#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 100000)
70 // KAFFINITY is not defined in old mingw
71 ULONG_PTR
72#else
73 KAFFINITY
74#endif
75 Mask;
76 WORD Group;
77 WORD Reserved[3];
78} MY_GROUP_AFFINITY, *MY_PGROUP_AFFINITY;
79
80typedef BOOL (WINAPI *Func_SetThreadGroupAffinity)(
81 HANDLE hThread,
82 CONST MY_GROUP_AFFINITY *GroupAffinity,
83 MY_PGROUP_AFFINITY PreviousGroupAffinity);
84
85typedef BOOL (WINAPI *Func_GetThreadGroupAffinity)(
86 HANDLE hThread,
87 MY_PGROUP_AFFINITY GroupAffinity);
88
89typedef BOOL (WINAPI *Func_GetProcessGroupAffinity)(
90 HANDLE hProcess,
91 PUSHORT GroupCount,
92 PUSHORT GroupArray);
93
94Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
95
96#if 0
97#include <stdio.h>
98#define PRF(x) x
99/*
100--
101 before the call of SetThreadGroupAffinity(),
102 GetProcessGroupAffinity returns one group.
103 after the call of SetThreadGroupAffinity():
104 GetProcessGroupAffinity returns more than one group,
105 if SetThreadGroupAffinity() was called for another group.
106--
107 GetProcessAffinityMask MS DOCs:
108 {
109 If the calling process contains threads in multiple groups,
110 the function returns zero for both affinity masks.
111 }
112 but tests in win10 with 2 groups (less than 64 cores total):
113 GetProcessAffinityMask() still returns non-zero affinity masks
114 even after SetThreadGroupAffinity() calls.
115*/
116static void PrintProcess_Info()
117{
118 {
119 const
120 Func_GetProcessGroupAffinity fn_GetProcessGroupAffinity =
121 (Func_GetProcessGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
122 "GetProcessGroupAffinity");
123 if (fn_GetProcessGroupAffinity)
124 {
125 unsigned i;
126 USHORT GroupCounts[64];
127 USHORT GroupCount = Z7_ARRAY_SIZE(GroupCounts);
128 BOOL boolRes = fn_GetProcessGroupAffinity(GetCurrentProcess(),
129 &GroupCount, GroupCounts);
130 printf("\n====== GetProcessGroupAffinity : "
131 "boolRes=%u GroupCounts = %u :",
132 boolRes, (unsigned)GroupCount);
133 for (i = 0; i < GroupCount; i++)
134 printf(" %u", GroupCounts[i]);
135 printf("\n");
136 }
137 }
138 {
139 DWORD_PTR processAffinityMask, systemAffinityMask;
140 if (GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask))
141 {
142 PRF(printf("\n====== GetProcessAffinityMask : "
143 ": processAffinityMask=%x, systemAffinityMask=%x\n",
144 (UInt32)processAffinityMask, (UInt32)systemAffinityMask);)
145 }
146 else
147 printf("\n==GetProcessAffinityMask FAIL");
148 }
149}
150#else
151#ifndef USE_THREADS_CreateThread
152// #define PRF(x)
153#endif
154#endif
155
62WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) 156WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
63{ 157{
64 /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ 158 /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */
@@ -72,7 +166,43 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
72 166
73 unsigned threadId; 167 unsigned threadId;
74 *p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId)); 168 *p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId));
75 169
170#if 0 // 1 : for debug
171 {
172 DWORD_PTR prevMask;
173 DWORD_PTR affinity = 1 << 0;
174 prevMask = SetThreadAffinityMask(*p, (DWORD_PTR)affinity);
175 prevMask = prevMask;
176 }
177#endif
178#if 0 // 1 : for debug
179 {
 180 /* win10: a new thread will be created in the same group that is assigned to the parent thread,
 181 but the affinity mask will contain all allowed threads of that group,
 182 even if the affinity mask of the parent group is not full.
 183 win11: in what group will it be created, if we have set the
 184 affinity of the parent thread with ThreadGroupAffinity?
185 */
186 const
187 Func_GetThreadGroupAffinity fn =
188 (Func_GetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
189 "GetThreadGroupAffinity");
190 if (fn)
191 {
192 // BOOL wres2;
193 MY_GROUP_AFFINITY groupAffinity;
194 memset(&groupAffinity, 0, sizeof(groupAffinity));
195 /* wres2 = */ fn(*p, &groupAffinity);
196 PRF(printf("\n==Thread_Create cur = %6u GetThreadGroupAffinity(): "
197 "wres2_BOOL = %u, group=%u mask=%x\n",
198 GetCurrentThreadId(),
199 wres2,
200 groupAffinity.Group,
201 (UInt32)groupAffinity.Mask);)
202 }
203 }
204#endif
205
76 #endif 206 #endif
77 207
78 /* maybe we must use errno here, but probably GetLastError() is also OK. */ 208 /* maybe we must use errno here, but probably GetLastError() is also OK. */
@@ -110,7 +240,84 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param
110 */ 240 */
111 } 241 }
112 { 242 {
113 DWORD prevSuspendCount = ResumeThread(h); 243 const DWORD prevSuspendCount = ResumeThread(h);
244 /* ResumeThread() returns:
245 0 : was_not_suspended
246 1 : was_resumed
247 -1 : error
248 */
249 if (prevSuspendCount == (DWORD)-1)
250 wres = GetError();
251 }
252 }
253
254 /* maybe we must use errno here, but probably GetLastError() is also OK. */
255 return wres;
256
257 #endif
258}
259
260
261WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask)
262{
263#ifdef USE_THREADS_CreateThread
264
265 UNUSED_VAR(group)
266 UNUSED_VAR(affinityMask)
267 return Thread_Create(p, func, param);
268
269#else
270
271 /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */
272 HANDLE h;
273 WRes wres;
274 unsigned threadId;
275 h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId));
276 *p = h;
277 wres = HandleToWRes(h);
278 if (h)
279 {
280 // PrintProcess_Info();
281 {
282 const
283 Func_SetThreadGroupAffinity fn =
284 (Func_SetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
285 "SetThreadGroupAffinity");
286 if (fn)
287 {
288 // WRes wres2;
289 MY_GROUP_AFFINITY groupAffinity, prev_groupAffinity;
290 memset(&groupAffinity, 0, sizeof(groupAffinity));
 291 // groupAffinity.Mask must use only bits that are supported by the current group
292 // (groupAffinity.Mask = 0) means all allowed bits
293 groupAffinity.Mask = affinityMask;
294 groupAffinity.Group = (WORD)group;
295 // wres2 =
296 fn(h, &groupAffinity, &prev_groupAffinity);
297 /*
298 if (groupAffinity.Group == prev_groupAffinity.Group)
299 wres2 = wres2;
300 else
301 wres2 = wres2;
302 if (wres2 == 0)
303 {
304 wres2 = GetError();
305 PRF(printf("\n==SetThreadGroupAffinity error: %u\n", wres2);)
306 }
307 else
308 {
309 PRF(printf("\n==Thread_Create_With_Group::SetThreadGroupAffinity()"
310 " threadId = %6u"
311 " group=%u mask=%x\n",
312 threadId,
313 prev_groupAffinity.Group,
314 (UInt32)prev_groupAffinity.Mask);)
315 }
316 */
317 }
318 }
319 {
320 const DWORD prevSuspendCount = ResumeThread(h);
114 /* ResumeThread() returns: 321 /* ResumeThread() returns:
115 0 : was_not_suspended 322 0 : was_not_suspended
116 1 : was_resumed 323 1 : was_resumed
@@ -297,6 +504,13 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param)
297 return Thread_Create_With_CpuSet(p, func, param, NULL); 504 return Thread_Create_With_CpuSet(p, func, param, NULL);
298} 505}
299 506
507/*
508WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinity)
509{
510 UNUSED_VAR(group)
511 return Thread_Create_With_Affinity(p, func, param, affinity);
512}
513*/
300 514
301WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) 515WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity)
302{ 516{
@@ -577,5 +791,22 @@ WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p)
577 return AutoResetEvent_CreateNotSignaled(p); 791 return AutoResetEvent_CreateNotSignaled(p);
578} 792}
579 793
794void ThreadNextGroup_Init(CThreadNextGroup *p, UInt32 numGroups, UInt32 startGroup)
795{
796 // printf("\n====== ThreadNextGroup_Init numGroups = %x: startGroup=%x\n", numGroups, startGroup);
797 if (numGroups == 0)
798 numGroups = 1;
799 p->NumGroups = numGroups;
800 p->NextGroup = startGroup % numGroups;
801}
802
803
804UInt32 ThreadNextGroup_GetNext(CThreadNextGroup *p)
805{
806 const UInt32 next = p->NextGroup;
807 p->NextGroup = (next + 1) % p->NumGroups;
808 return next;
809}
810
580#undef PRF 811#undef PRF
581#undef Print 812#undef Print
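ThreadNextGroup_GetNext() is a plain modular counter, so its behavior is easy to pin down in isolation (a hypothetical check against the two functions above):

    CThreadNextGroup ng;
    ThreadNextGroup_Init(&ng, 3, 5); /* startGroup is reduced: 5 % 3 == 2 */
    /* successive ThreadNextGroup_GetNext(&ng) calls now return 2, 0, 1, 2, 0, ... */
    ThreadNextGroup_Init(&ng, 0, 0); /* numGroups == 0 is clamped to 1 */
    /* ...and every call returns group 0 */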
diff --git a/C/Threads.h b/C/Threads.h
index c1484a2..be12e6e 100644
--- a/C/Threads.h
+++ b/C/Threads.h
@@ -1,5 +1,5 @@
1/* Threads.h -- multithreading library 1/* Threads.h -- multithreading library
22024-03-28 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_THREADS_H 4#ifndef ZIP7_INC_THREADS_H
5#define ZIP7_INC_THREADS_H 5#define ZIP7_INC_THREADS_H
@@ -140,12 +140,22 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param
140WRes Thread_Wait_Close(CThread *p); 140WRes Thread_Wait_Close(CThread *p);
141 141
142#ifdef _WIN32 142#ifdef _WIN32
143WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask);
143#define Thread_Create_With_CpuSet(p, func, param, cs) \ 144#define Thread_Create_With_CpuSet(p, func, param, cs) \
144 Thread_Create_With_Affinity(p, func, param, *cs) 145 Thread_Create_With_Affinity(p, func, param, *cs)
145#else 146#else
146WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet); 147WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet);
147#endif 148#endif
148 149
150typedef struct
151{
152 unsigned NumGroups;
153 unsigned NextGroup;
154} CThreadNextGroup;
155
156void ThreadNextGroup_Init(CThreadNextGroup *p, unsigned numGroups, unsigned startGroup);
157unsigned ThreadNextGroup_GetNext(CThreadNextGroup *p);
158
149 159
150#ifdef _WIN32 160#ifdef _WIN32
151 161
diff --git a/C/Util/Lzma/LzmaUtil.dsp b/C/Util/Lzma/LzmaUtil.dsp
index e2e7d42..71de950 100644
--- a/C/Util/Lzma/LzmaUtil.dsp
+++ b/C/Util/Lzma/LzmaUtil.dsp
@@ -122,6 +122,10 @@ SOURCE=..\..\Compiler.h
122# End Source File 122# End Source File
123# Begin Source File 123# Begin Source File
124 124
125SOURCE=..\..\CpuArch.c
126# End Source File
127# Begin Source File
128
125SOURCE=..\..\CpuArch.h 129SOURCE=..\..\CpuArch.h
126# End Source File 130# End Source File
127# Begin Source File 131# Begin Source File
diff --git a/C/Util/LzmaLib/LzmaLib.dsp b/C/Util/LzmaLib/LzmaLib.dsp
index bacd967..f413137 100644
--- a/C/Util/LzmaLib/LzmaLib.dsp
+++ b/C/Util/LzmaLib/LzmaLib.dsp
@@ -43,7 +43,7 @@ RSC=rc.exe
43# PROP Ignore_Export_Lib 0 43# PROP Ignore_Export_Lib 0
44# PROP Target_Dir "" 44# PROP Target_Dir ""
45# ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /c 45# ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /c
46# ADD CPP /nologo /Gr /MT /W3 /O2 /D "NDEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /FD /c 46# ADD CPP /nologo /Gr /MT /W4 /WX /O2 /D "NDEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /FD /c
47# SUBTRACT CPP /YX 47# SUBTRACT CPP /YX
48# ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32 48# ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32
49# ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32 49# ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32
@@ -71,7 +71,7 @@ LINK32=link.exe
71# PROP Ignore_Export_Lib 0 71# PROP Ignore_Export_Lib 0
72# PROP Target_Dir "" 72# PROP Target_Dir ""
73# ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /GZ /c 73# ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /GZ /c
74# ADD CPP /nologo /MTd /W3 /Gm /ZI /Od /D "_DEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /D "COMPRESS_MF_MT" /FD /GZ /c 74# ADD CPP /nologo /MTd /W4 /WX /Gm /ZI /Od /D "_DEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /D "COMPRESS_MF_MT" /FD /GZ /c
75# SUBTRACT CPP /YX 75# SUBTRACT CPP /YX
76# ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32 76# ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32
77# ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32 77# ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32
@@ -128,6 +128,10 @@ SOURCE=..\..\Compiler.h
128# End Source File 128# End Source File
129# Begin Source File 129# Begin Source File
130 130
131SOURCE=..\..\CpuArch.c
132# End Source File
133# Begin Source File
134
131SOURCE=..\..\CpuArch.h 135SOURCE=..\..\CpuArch.h
132# End Source File 136# End Source File
133# Begin Source File 137# Begin Source File
diff --git a/C/Xz.h b/C/Xz.h
index 42bc685..ad63b48 100644
--- a/C/Xz.h
+++ b/C/Xz.h
@@ -1,5 +1,5 @@
1/* Xz.h - Xz interface 1/* Xz.h - Xz interface
22024-01-26 : Igor Pavlov : Public domain */ 2Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_XZ_H 4#ifndef ZIP7_INC_XZ_H
5#define ZIP7_INC_XZ_H 5#define ZIP7_INC_XZ_H
@@ -121,6 +121,7 @@ typedef struct
121 UInt64 startOffset; 121 UInt64 startOffset;
122} CXzStream; 122} CXzStream;
123 123
124#define Xz_CONSTRUCT(p) { (p)->numBlocks = 0; (p)->blocks = NULL; (p)->flags = 0; }
124void Xz_Construct(CXzStream *p); 125void Xz_Construct(CXzStream *p);
125void Xz_Free(CXzStream *p, ISzAllocPtr alloc); 126void Xz_Free(CXzStream *p, ISzAllocPtr alloc);
126 127
@@ -136,8 +137,13 @@ typedef struct
136 CXzStream *streams; 137 CXzStream *streams;
137} CXzs; 138} CXzs;
138 139
140#define Xzs_CONSTRUCT(p) { (p)->num = 0; (p)->numAllocated = 0; (p)->streams = NULL; }
139void Xzs_Construct(CXzs *p); 141void Xzs_Construct(CXzs *p);
140void Xzs_Free(CXzs *p, ISzAllocPtr alloc); 142void Xzs_Free(CXzs *p, ISzAllocPtr alloc);
143/*
144Xzs_ReadBackward() must be called for an empty CXzs object.
145Xzs_ReadBackward() can return a non-empty object with (p->num != 0) even in case of error.
146*/
141SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc); 147SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc);
142 148
143UInt64 Xzs_GetNumBlocks(const CXzs *p); 149UInt64 Xzs_GetNumBlocks(const CXzs *p);
@@ -268,8 +274,8 @@ typedef struct
268 size_t outBufSize; 274 size_t outBufSize;
269 size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked 275 size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked
270 276
271 Byte shaDigest[SHA256_DIGEST_SIZE]; 277 UInt32 shaDigest32[SHA256_DIGEST_SIZE / 4];
272 Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; 278 Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; // it must be aligned to 4 bytes
273} CXzUnpacker; 279} CXzUnpacker;
274 280
275/* alloc : aligned for cache line allocation is better */ 281/* alloc : aligned for cache line allocation is better */
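Because Xzs_ReadBackward() may leave allocated entries behind even when it fails, its contract pairs naturally with an unconditional Xzs_Free(). A hedged call sequence (lookStream, fileSize, and alloc are assumed to be set up by the caller):

    CXzs xzs;
    Int64 startOffset = (Int64)fileSize; /* parse backward from the end of the file */
    Xzs_Construct(&xzs);                 /* the call requires an empty object */
    {
        const SRes res = Xzs_ReadBackward(&xzs, lookStream, &startOffset, NULL, alloc);
        if (res == SZ_OK)
            printf("blocks: %llu\n", (unsigned long long)Xzs_GetNumBlocks(&xzs));
        Xzs_Free(&xzs, alloc);           /* required even if res != SZ_OK */
    }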
diff --git a/C/XzCrc64Opt.c b/C/XzCrc64Opt.c
index 0c1fc2f..6eea4a3 100644
--- a/C/XzCrc64Opt.c
+++ b/C/XzCrc64Opt.c
@@ -1,5 +1,5 @@
1/* XzCrc64Opt.c -- CRC64 calculation (optimized functions) 1/* XzCrc64Opt.c -- CRC64 calculation (optimized functions)
22023-12-08 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -235,7 +235,7 @@ CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE)
235 v = Q32BE(1, w1) ^ Q32BE(0, w0); 235 v = Q32BE(1, w1) ^ Q32BE(0, w0);
236 v ^= Q32BE(3, d1) ^ Q32BE(2, d0); 236 v ^= Q32BE(3, d1) ^ Q32BE(2, d0);
237 #endif 237 #endif
238#elif 238#else
239#error Stop_Compiling_Bad_CRC64_NUM_TABLES 239#error Stop_Compiling_Bad_CRC64_NUM_TABLES
240#endif 240#endif
241 p += Z7_CRC64_NUM_TABLES_USE; 241 p += Z7_CRC64_NUM_TABLES_USE;
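The one-character change above fixes a real defect: #elif requires a constant expression, so a bare #elif in front of the #error is ill-formed and only went unnoticed because a supported Z7_CRC64_NUM_TABLES_USE value always matched an earlier branch. #else makes the compile-time guard well-formed while remaining unreachable for valid configurations.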
diff --git a/C/XzDec.c b/C/XzDec.c
index 3d1c98e..2dac324 100644
--- a/C/XzDec.c
+++ b/C/XzDec.c
@@ -1,5 +1,5 @@
1/* XzDec.c -- Xz Decode 1/* XzDec.c -- Xz Decode
22024-03-01 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -59,7 +59,7 @@ unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value)
59 59
60 for (i = 0; i < limit;) 60 for (i = 0; i < limit;)
61 { 61 {
62 Byte b = p[i]; 62 const unsigned b = p[i];
63 *value |= (UInt64)(b & 0x7F) << (7 * i++); 63 *value |= (UInt64)(b & 0x7F) << (7 * i++);
64 if ((b & 0x80) == 0) 64 if ((b & 0x80) == 0)
65 return (b == 0 && i != 1) ? 0 : i; 65 return (b == 0 && i != 1) ? 0 : i;
@@ -796,11 +796,10 @@ SRes Xz_ParseHeader(CXzStreamFlags *p, const Byte *buf)
796 796
797static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf) 797static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf)
798{ 798{
799 return indexSize == (((UInt64)GetUi32(buf + 4) + 1) << 2) 799 return indexSize == (((UInt64)GetUi32a(buf + 4) + 1) << 2)
800 && GetUi32(buf) == CrcCalc(buf + 4, 6) 800 && GetUi32a(buf) == CrcCalc(buf + 4, 6)
801 && flags == GetBe16(buf + 8) 801 && flags == GetBe16a(buf + 8)
802 && buf[10] == XZ_FOOTER_SIG_0 802 && GetUi16a(buf + 10) == (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8));
803 && buf[11] == XZ_FOOTER_SIG_1;
804} 803}
805 804
806#define READ_VARINT_AND_CHECK(buf, pos, size, res) \ 805#define READ_VARINT_AND_CHECK(buf, pos, size, res) \
@@ -1166,7 +1165,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1166 p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks); 1165 p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks);
1167 p->indexPos = p->indexPreSize; 1166 p->indexPos = p->indexPreSize;
1168 p->indexSize += p->indexPreSize; 1167 p->indexSize += p->indexPreSize;
1169 Sha256_Final(&p->sha, p->shaDigest); 1168 Sha256_Final(&p->sha, (Byte *)(void *)p->shaDigest32);
1170 Sha256_Init(&p->sha); 1169 Sha256_Init(&p->sha);
1171 p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize); 1170 p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize);
1172 p->state = XZ_STATE_STREAM_INDEX; 1171 p->state = XZ_STATE_STREAM_INDEX;
@@ -1241,10 +1240,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1241 break; 1240 break;
1242 } 1241 }
1243 { 1242 {
1244 Byte digest[XZ_CHECK_SIZE_MAX]; 1243 UInt32 digest32[XZ_CHECK_SIZE_MAX / 4];
1245 p->state = XZ_STATE_BLOCK_HEADER; 1244 p->state = XZ_STATE_BLOCK_HEADER;
1246 p->pos = 0; 1245 p->pos = 0;
1247 if (XzCheck_Final(&p->check, digest) && memcmp(digest, p->buf, checkSize) != 0) 1246 if (XzCheck_Final(&p->check, (void *)digest32) && memcmp(digest32, p->buf, checkSize) != 0)
1248 return SZ_ERROR_CRC; 1247 return SZ_ERROR_CRC;
1249 if (p->decodeOnlyOneBlock) 1248 if (p->decodeOnlyOneBlock)
1250 { 1249 {
@@ -1289,12 +1288,12 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1289 } 1288 }
1290 else 1289 else
1291 { 1290 {
1292 Byte digest[SHA256_DIGEST_SIZE]; 1291 UInt32 digest32[SHA256_DIGEST_SIZE / 4];
1293 p->state = XZ_STATE_STREAM_INDEX_CRC; 1292 p->state = XZ_STATE_STREAM_INDEX_CRC;
1294 p->indexSize += 4; 1293 p->indexSize += 4;
1295 p->pos = 0; 1294 p->pos = 0;
1296 Sha256_Final(&p->sha, digest); 1295 Sha256_Final(&p->sha, (void *)digest32);
1297 if (memcmp(digest, p->shaDigest, SHA256_DIGEST_SIZE) != 0) 1296 if (memcmp(digest32, p->shaDigest32, SHA256_DIGEST_SIZE) != 0)
1298 return SZ_ERROR_CRC; 1297 return SZ_ERROR_CRC;
1299 } 1298 }
1300 } 1299 }
@@ -1313,7 +1312,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1313 const Byte *ptr = p->buf; 1312 const Byte *ptr = p->buf;
1314 p->state = XZ_STATE_STREAM_FOOTER; 1313 p->state = XZ_STATE_STREAM_FOOTER;
1315 p->pos = 0; 1314 p->pos = 0;
1316 if (CRC_GET_DIGEST(p->crc) != GetUi32(ptr)) 1315 if (CRC_GET_DIGEST(p->crc) != GetUi32a(ptr))
1317 return SZ_ERROR_CRC; 1316 return SZ_ERROR_CRC;
1318 } 1317 }
1319 break; 1318 break;
@@ -1343,7 +1342,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1343 { 1342 {
1344 if (*src != 0) 1343 if (*src != 0)
1345 { 1344 {
1346 if (((UInt32)p->padSize & 3) != 0) 1345 if ((unsigned)p->padSize & 3)
1347 return SZ_ERROR_NO_ARCHIVE; 1346 return SZ_ERROR_NO_ARCHIVE;
1348 p->pos = 0; 1347 p->pos = 0;
1349 p->state = XZ_STATE_STREAM_HEADER; 1348 p->state = XZ_STATE_STREAM_HEADER;
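The rewritten footer check folds the two byte comparisons into one aligned 16-bit load: with XZ_FOOTER_SIG_0 = 'Y' (0x59) and XZ_FOOTER_SIG_1 = 'Z' (0x5A), the little-endian constant is 0x59 | (0x5A << 8) = 0x5A59, so GetUi16a(buf + 10) tests both signature bytes at once. The switch to the aligned GetUi32a/GetBe16a/GetUi16a accessors is valid because the buffer is now kept 4-byte aligned, per the CXzUnpacker comment in Xz.h above.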
diff --git a/C/XzEnc.c b/C/XzEnc.c
index c1affad..e40f0c8 100644
--- a/C/XzEnc.c
+++ b/C/XzEnc.c
@@ -1,5 +1,5 @@
1/* XzEnc.c -- Xz Encode 1/* XzEnc.c -- Xz Encode
22024-03-01 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -411,6 +411,7 @@ static SRes SeqInFilter_Read(ISeqInStreamPtr pp, void *data, size_t *size)
411 } 411 }
412} 412}
413 413
414Z7_FORCE_INLINE
414static void SeqInFilter_Construct(CSeqInFilter *p) 415static void SeqInFilter_Construct(CSeqInFilter *p)
415{ 416{
416 p->buf = NULL; 417 p->buf = NULL;
@@ -418,6 +419,7 @@ static void SeqInFilter_Construct(CSeqInFilter *p)
418 p->vt.Read = SeqInFilter_Read; 419 p->vt.Read = SeqInFilter_Read;
419} 420}
420 421
422Z7_FORCE_INLINE
421static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc) 423static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc)
422{ 424{
423 if (p->StateCoder.p) 425 if (p->StateCoder.p)
@@ -507,6 +509,7 @@ void XzFilterProps_Init(CXzFilterProps *p)
507void XzProps_Init(CXzProps *p) 509void XzProps_Init(CXzProps *p)
508{ 510{
509 p->checkId = XZ_CHECK_CRC32; 511 p->checkId = XZ_CHECK_CRC32;
512 p->numThreadGroups = 0;
510 p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO; 513 p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO;
511 p->numBlockThreads_Reduced = -1; 514 p->numBlockThreads_Reduced = -1;
512 p->numBlockThreads_Max = -1; 515 p->numBlockThreads_Max = -1;
@@ -689,6 +692,7 @@ typedef struct
689} CLzma2WithFilters; 692} CLzma2WithFilters;
690 693
691 694
695Z7_FORCE_INLINE
692static void Lzma2WithFilters_Construct(CLzma2WithFilters *p) 696static void Lzma2WithFilters_Construct(CLzma2WithFilters *p)
693{ 697{
694 p->lzma2 = NULL; 698 p->lzma2 = NULL;
@@ -712,6 +716,7 @@ static SRes Lzma2WithFilters_Create(CLzma2WithFilters *p, ISzAllocPtr alloc, ISz
712} 716}
713 717
714 718
719Z7_FORCE_INLINE
715static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc) 720static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc)
716{ 721{
717 #ifdef USE_SUBBLOCK 722 #ifdef USE_SUBBLOCK
@@ -1236,6 +1241,7 @@ SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr in
1236 } 1241 }
1237 1242
1238 p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max; 1243 p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max;
1244 p->mtCoder.numThreadGroups = props->numThreadGroups;
1239 p->mtCoder.expectedDataSize = p->expectedDataSize; 1245 p->mtCoder.expectedDataSize = p->expectedDataSize;
1240 1246
1241 RINOK(MtCoder_Code(&p->mtCoder)) 1247 RINOK(MtCoder_Code(&p->mtCoder))
diff --git a/C/XzEnc.h b/C/XzEnc.h
index 77b78c0..ac6bbf7 100644
--- a/C/XzEnc.h
+++ b/C/XzEnc.h
@@ -1,5 +1,5 @@
1/* XzEnc.h -- Xz Encode 1/* XzEnc.h -- Xz Encode
22023-04-13 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_XZ_ENC_H 4#ifndef ZIP7_INC_XZ_ENC_H
5#define ZIP7_INC_XZ_ENC_H 5#define ZIP7_INC_XZ_ENC_H
@@ -31,6 +31,7 @@ typedef struct
31 CLzma2EncProps lzma2Props; 31 CLzma2EncProps lzma2Props;
32 CXzFilterProps filterProps; 32 CXzFilterProps filterProps;
33 unsigned checkId; 33 unsigned checkId;
34 unsigned numThreadGroups; // 0 : no groups
34 UInt64 blockSize; 35 UInt64 blockSize;
35 int numBlockThreads_Reduced; 36 int numBlockThreads_Reduced;
36 int numBlockThreads_Max; 37 int numBlockThreads_Max;
diff --git a/C/XzIn.c b/C/XzIn.c
index b68af96..ba31636 100644
--- a/C/XzIn.c
+++ b/C/XzIn.c
@@ -1,38 +1,39 @@
1/* XzIn.c - Xz input 1/* XzIn.c - Xz input
22023-09-07 : Igor Pavlov : Public domain */ 2: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
6#include <string.h> 6#include <string.h>
7 7
8#include "7zCrc.h" 8#include "7zCrc.h"
9#include "CpuArch.h"
10#include "Xz.h" 9#include "Xz.h"
10#include "CpuArch.h"
11 11
12/* 12#define XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(p) \
13#define XZ_FOOTER_SIG_CHECK(p) (memcmp((p), XZ_FOOTER_SIG, XZ_FOOTER_SIG_SIZE) == 0) 13 (GetUi16a((const Byte *)(const void *)(p) + 10) == \
14*/ 14 (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8)))
15#define XZ_FOOTER_SIG_CHECK(p) ((p)[0] == XZ_FOOTER_SIG_0 && (p)[1] == XZ_FOOTER_SIG_1)
16
17 15
18SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream) 16SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream)
19{ 17{
20 Byte sig[XZ_STREAM_HEADER_SIZE]; 18 UInt32 data32[XZ_STREAM_HEADER_SIZE / 4];
21 size_t processedSize = XZ_STREAM_HEADER_SIZE; 19 size_t processedSize = XZ_STREAM_HEADER_SIZE;
22 RINOK(SeqInStream_ReadMax(inStream, sig, &processedSize)) 20 RINOK(SeqInStream_ReadMax(inStream, data32, &processedSize))
23 if (processedSize != XZ_STREAM_HEADER_SIZE 21 if (processedSize != XZ_STREAM_HEADER_SIZE
24 || memcmp(sig, XZ_SIG, XZ_SIG_SIZE) != 0) 22 || memcmp(data32, XZ_SIG, XZ_SIG_SIZE) != 0)
25 return SZ_ERROR_NO_ARCHIVE; 23 return SZ_ERROR_NO_ARCHIVE;
26 return Xz_ParseHeader(p, sig); 24 return Xz_ParseHeader(p, (const Byte *)(const void *)data32);
27} 25}
28 26
29#define READ_VARINT_AND_CHECK(buf, pos, size, res) \ 27#define READ_VARINT_AND_CHECK(buf, size, res) \
30 { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ 28{ const unsigned s = Xz_ReadVarInt(buf, size, res); \
31 if (s == 0) return SZ_ERROR_ARCHIVE; \ 29 if (s == 0) return SZ_ERROR_ARCHIVE; \
32 pos += s; } 30 size -= s; \
31 buf += s; \
32}
33 33
34SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes) 34SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes)
35{ 35{
36 MY_ALIGN(4)
36 Byte header[XZ_BLOCK_HEADER_SIZE_MAX]; 37 Byte header[XZ_BLOCK_HEADER_SIZE_MAX];
37 unsigned headerSize; 38 unsigned headerSize;
38 *headerSizeRes = 0; 39 *headerSizeRes = 0;
@@ -57,8 +58,12 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex,
57 return XzBlock_Parse(p, header); 58 return XzBlock_Parse(p, header);
58} 59}
59 60
61
60#define ADD_SIZE_CHECK(size, val) \ 62#define ADD_SIZE_CHECK(size, val) \
61 { const UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } 63{ const UInt64 newSize = size + (val); \
64 if (newSize < size) return XZ_SIZE_OVERFLOW; \
65 size = newSize; \
66}
62 67
63UInt64 Xz_GetUnpackSize(const CXzStream *p) 68UInt64 Xz_GetUnpackSize(const CXzStream *p)
64{ 69{
@@ -82,76 +87,85 @@ UInt64 Xz_GetPackSize(const CXzStream *p)
82 return size; 87 return size;
83} 88}
84 89
85/*
86SRes XzBlock_ReadFooter(CXzBlock *p, CXzStreamFlags f, ISeqInStreamPtr inStream)
87{
88 return SeqInStream_Read(inStream, p->check, XzFlags_GetCheckSize(f));
89}
90*/
91 90
92static SRes Xz_ReadIndex2(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc) 91// input:
92// CXzStream (p) is an empty object.
93// size != 0
94// (size & 3) == 0
95// (buf) is aligned for at least 4 bytes.
96// output:
97// p->numBlocks is the number of allocated items in p->blocks
98// p->blocks[*] values must be ignored, if the function returns an error.
99static SRes Xz_ParseIndex(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc)
93{ 100{
94 size_t numBlocks, pos = 1; 101 size_t numBlocks;
95 UInt32 crc;
96
97 if (size < 5 || buf[0] != 0) 102 if (size < 5 || buf[0] != 0)
98 return SZ_ERROR_ARCHIVE; 103 return SZ_ERROR_ARCHIVE;
99
100 size -= 4; 104 size -= 4;
101 crc = CrcCalc(buf, size); 105 {
102 if (crc != GetUi32(buf + size)) 106 const UInt32 crc = CrcCalc(buf, size);
103 return SZ_ERROR_ARCHIVE; 107 if (crc != GetUi32a(buf + size))
104 108 return SZ_ERROR_ARCHIVE;
109 }
110 buf++;
111 size--;
105 { 112 {
106 UInt64 numBlocks64; 113 UInt64 numBlocks64;
107 READ_VARINT_AND_CHECK(buf, pos, size, &numBlocks64) 114 READ_VARINT_AND_CHECK(buf, size, &numBlocks64)
108 numBlocks = (size_t)numBlocks64; 115 // (numBlocks64) is a 63-bit value, so (numBlocks64 * 2) cannot overflow:
109 if (numBlocks != numBlocks64 || numBlocks * 2 > size) 116 if (numBlocks64 * 2 > size)
110 return SZ_ERROR_ARCHIVE; 117 return SZ_ERROR_ARCHIVE;
118 if (numBlocks64 >= ((size_t)1 << (sizeof(size_t) * 8 - 1)) / sizeof(CXzBlockSizes))
119 return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
120 numBlocks = (size_t)numBlocks64;
111 } 121 }
112 122 // Xz_Free(p, alloc); // it's optional, because (p) is empty already
113 Xz_Free(p, alloc); 123 if (numBlocks)
114 if (numBlocks != 0)
115 { 124 {
116 size_t i; 125 CXzBlockSizes *blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks);
117 p->numBlocks = numBlocks; 126 if (!blocks)
118 p->blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks);
119 if (!p->blocks)
120 return SZ_ERROR_MEM; 127 return SZ_ERROR_MEM;
121 for (i = 0; i < numBlocks; i++) 128 p->blocks = blocks;
129 p->numBlocks = numBlocks;
130 // the caller will call Xz_Free() in case of error
131 do
122 { 132 {
123 CXzBlockSizes *block = &p->blocks[i]; 133 READ_VARINT_AND_CHECK(buf, size, &blocks->totalSize)
124 READ_VARINT_AND_CHECK(buf, pos, size, &block->totalSize) 134 READ_VARINT_AND_CHECK(buf, size, &blocks->unpackSize)
125 READ_VARINT_AND_CHECK(buf, pos, size, &block->unpackSize) 135 if (blocks->totalSize == 0)
126 if (block->totalSize == 0)
127 return SZ_ERROR_ARCHIVE; 136 return SZ_ERROR_ARCHIVE;
137 blocks++;
128 } 138 }
139 while (--numBlocks);
129 } 140 }
130 while ((pos & 3) != 0) 141 if (size >= 4)
131 if (buf[pos++] != 0) 142 return SZ_ERROR_ARCHIVE;
143 while (size)
144 if (buf[--size])
132 return SZ_ERROR_ARCHIVE; 145 return SZ_ERROR_ARCHIVE;
133 return (pos == size) ? SZ_OK : SZ_ERROR_ARCHIVE; 146 return SZ_OK;
134} 147}
135 148
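
Xz_ParseIndex above follows the Index layout from the xz format specification; the field widths in this schematic come from that specification, not from the diff itself:

/*
  xz Index field (always a multiple of 4 bytes):

    1 byte        Index Indicator: 0x00
    varint        Number of Records
    per record:   varint Unpadded Size       (kept here in blocks->totalSize)
                  varint Uncompressed Size   (kept in blocks->unpackSize)
    0..3 bytes    zero padding to a 4-byte boundary
    4 bytes       little-endian CRC32 over all preceding Index bytes
*/
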
149
150/*
136static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc) 151static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc)
137{ 152{
138 SRes res; 153 SRes res;
139 size_t size; 154 size_t size;
140 Byte *buf; 155 Byte *buf;
141 if (indexSize > ((UInt32)1 << 31)) 156 if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1)))
142 return SZ_ERROR_UNSUPPORTED; 157 return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
143 size = (size_t)indexSize; 158 size = (size_t)indexSize;
144 if (size != indexSize)
145 return SZ_ERROR_UNSUPPORTED;
146 buf = (Byte *)ISzAlloc_Alloc(alloc, size); 159 buf = (Byte *)ISzAlloc_Alloc(alloc, size);
147 if (!buf) 160 if (!buf)
148 return SZ_ERROR_MEM; 161 return SZ_ERROR_MEM;
149 res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); 162 res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED);
150 if (res == SZ_OK) 163 if (res == SZ_OK)
151 res = Xz_ReadIndex2(p, buf, size, alloc); 164 res = Xz_ParseIndex(p, buf, size, alloc);
152 ISzAlloc_Free(alloc, buf); 165 ISzAlloc_Free(alloc, buf);
153 return res; 166 return res;
154} 167}
168*/
155 169
156static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size) 170static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size)
157{ 171{
@@ -160,84 +174,102 @@ static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset,
160 /* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */ 174 /* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */
161} 175}
162 176
177
178/*
179in:
180 (*startOffset) is position in (stream) where xz_stream must be finished.
181out:
182 if returns SZ_OK, then (*startOffset) is position in stream that shows start of xz_stream.
183*/
163static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc) 184static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc)
164{ 185{
165 UInt64 indexSize; 186 #define TEMP_BUF_SIZE (1 << 10)
166 Byte buf[XZ_STREAM_FOOTER_SIZE]; 187 UInt32 buf32[TEMP_BUF_SIZE / 4];
167 UInt64 pos = (UInt64)*startOffset; 188 UInt64 pos = (UInt64)*startOffset;
168 189
169 if ((pos & 3) != 0 || pos < XZ_STREAM_FOOTER_SIZE) 190 if ((pos & 3) || pos < XZ_STREAM_FOOTER_SIZE)
170 return SZ_ERROR_NO_ARCHIVE; 191 return SZ_ERROR_NO_ARCHIVE;
171
172 pos -= XZ_STREAM_FOOTER_SIZE; 192 pos -= XZ_STREAM_FOOTER_SIZE;
173 RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)) 193 RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE))
174 194
175 if (!XZ_FOOTER_SIG_CHECK(buf + 10)) 195 if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32))
176 { 196 {
177 UInt32 total = 0;
178 pos += XZ_STREAM_FOOTER_SIZE; 197 pos += XZ_STREAM_FOOTER_SIZE;
179
180 for (;;) 198 for (;;)
181 { 199 {
182 size_t i; 200 // pos != 0
183 #define TEMP_BUF_SIZE (1 << 10) 201 // (pos & 3) == 0
184 Byte temp[TEMP_BUF_SIZE]; 202 size_t i = pos >= TEMP_BUF_SIZE ? TEMP_BUF_SIZE : (size_t)pos;
185
186 i = (pos > TEMP_BUF_SIZE) ? TEMP_BUF_SIZE : (size_t)pos;
187 pos -= i; 203 pos -= i;
188 RINOK(LookInStream_SeekRead_ForArc(stream, pos, temp, i)) 204 RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, i))
189 total += (UInt32)i; 205 i /= 4;
190 for (; i != 0; i--) 206 do
191 if (temp[i - 1] != 0) 207 if (buf32[i - 1] != 0)
192 break; 208 break;
193 if (i != 0) 209 while (--i);
194 { 210
195 if ((i & 3) != 0) 211 pos += i * 4;
196 return SZ_ERROR_NO_ARCHIVE; 212 #define XZ_STREAM_BACKWARD_READING_PAD_MAX (1 << 16)
197 pos += i; 213 // here we don't support the rare case of big padding in an xz stream,
198 break; 214 // so we use a padding limit for backward reading.
199 } 215 if ((UInt64)*startOffset - pos > XZ_STREAM_BACKWARD_READING_PAD_MAX)
200 if (pos < XZ_STREAM_FOOTER_SIZE || total > (1 << 16))
201 return SZ_ERROR_NO_ARCHIVE; 216 return SZ_ERROR_NO_ARCHIVE;
217 if (i)
218 break;
202 } 219 }
203 220 // we try to open xz stream after skipping zero padding.
221 // ((UInt64)*startOffset == pos) is possible here!
204 if (pos < XZ_STREAM_FOOTER_SIZE) 222 if (pos < XZ_STREAM_FOOTER_SIZE)
205 return SZ_ERROR_NO_ARCHIVE; 223 return SZ_ERROR_NO_ARCHIVE;
206 pos -= XZ_STREAM_FOOTER_SIZE; 224 pos -= XZ_STREAM_FOOTER_SIZE;
207 RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)) 225 RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE))
208 if (!XZ_FOOTER_SIG_CHECK(buf + 10)) 226 if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32))
209 return SZ_ERROR_NO_ARCHIVE; 227 return SZ_ERROR_NO_ARCHIVE;
210 } 228 }
211 229
212 p->flags = (CXzStreamFlags)GetBe16(buf + 8); 230 p->flags = (CXzStreamFlags)GetBe16a(buf32 + 2);
213
214 if (!XzFlags_IsSupported(p->flags)) 231 if (!XzFlags_IsSupported(p->flags))
215 return SZ_ERROR_UNSUPPORTED; 232 return SZ_ERROR_UNSUPPORTED;
216
217 { 233 {
218 /* to eliminate GCC 6.3 warning: 234 /* to eliminate GCC 6.3 warning:
219 dereferencing type-punned pointer will break strict-aliasing rules */ 235 dereferencing type-punned pointer will break strict-aliasing rules */
220 const Byte *buf_ptr = buf; 236 const UInt32 *buf_ptr = buf32;
221 if (GetUi32(buf_ptr) != CrcCalc(buf + 4, 6)) 237 if (GetUi32a(buf_ptr) != CrcCalc(buf32 + 1, 6))
222 return SZ_ERROR_ARCHIVE; 238 return SZ_ERROR_ARCHIVE;
223 } 239 }
224
225 indexSize = ((UInt64)GetUi32(buf + 4) + 1) << 2;
226
227 if (pos < indexSize)
228 return SZ_ERROR_ARCHIVE;
229
230 pos -= indexSize;
231 RINOK(LookInStream_SeekTo(stream, pos))
232 RINOK(Xz_ReadIndex(p, stream, indexSize, alloc))
233
234 { 240 {
235 UInt64 totalSize = Xz_GetPackSize(p); 241 const UInt64 indexSize = ((UInt64)GetUi32a(buf32 + 1) + 1) << 2;
236 if (totalSize == XZ_SIZE_OVERFLOW 242 if (pos < indexSize)
237 || totalSize >= ((UInt64)1 << 63)
238 || pos < totalSize + XZ_STREAM_HEADER_SIZE)
239 return SZ_ERROR_ARCHIVE; 243 return SZ_ERROR_ARCHIVE;
240 pos -= (totalSize + XZ_STREAM_HEADER_SIZE); 244 pos -= indexSize;
245 // v25.00: relaxed indexSize check. We allow big index table.
246 // if (indexSize > ((UInt32)1 << 31))
247 if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1)))
248 return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE
249 RINOK(LookInStream_SeekTo(stream, pos))
250 // RINOK(Xz_ReadIndex(p, stream, indexSize, alloc))
251 {
252 SRes res;
253 const size_t size = (size_t)indexSize;
254 // if (size != indexSize) return SZ_ERROR_UNSUPPORTED;
255 Byte *buf = (Byte *)ISzAlloc_Alloc(alloc, size);
256 if (!buf)
257 return SZ_ERROR_MEM;
258 res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED);
259 if (res == SZ_OK)
260 res = Xz_ParseIndex(p, buf, size, alloc);
261 ISzAlloc_Free(alloc, buf);
262 RINOK(res)
263 }
264 }
265 {
266 UInt64 total = Xz_GetPackSize(p);
267 if (total == XZ_SIZE_OVERFLOW || total >= ((UInt64)1 << 63))
268 return SZ_ERROR_ARCHIVE;
269 total += XZ_STREAM_HEADER_SIZE;
270 if (pos < total)
271 return SZ_ERROR_ARCHIVE;
272 pos -= total;
241 RINOK(LookInStream_SeekTo(stream, pos)) 273 RINOK(LookInStream_SeekTo(stream, pos))
242 *startOffset = (Int64)pos; 274 *startOffset = (Int64)pos;
243 } 275 }
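
Xz_ReadBackward anchors on the 12-byte stream footer, now read into a 4-byte-aligned UInt32 buffer so that the aligned accessors (GetUi16a/GetUi32a/GetBe16a) are valid. The offsets used by the code correspond to the footer layout from the xz specification:

/*
  xz stream footer (12 bytes, 4-byte aligned in the file):

    offset  0:  UInt32   CRC32 of bytes 4..9 (little-endian)
    offset  4:  UInt32   Backward Size, stored as (realIndexSize >> 2) - 1
    offset  8:  2 bytes  Stream Flags (must match the stream header)
    offset 10:  2 bytes  footer signature "YZ"

  hence the index size is recovered as:
    indexSize = ((UInt64)GetUi32a(buf32 + 1) + 1) << 2;
*/
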
@@ -246,7 +278,6 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO
246 CSecToRead secToRead; 278 CSecToRead secToRead;
247 SecToRead_CreateVTable(&secToRead); 279 SecToRead_CreateVTable(&secToRead);
248 secToRead.realStream = stream; 280 secToRead.realStream = stream;
249
250 RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt)) 281 RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt))
251 return (p->flags == headerFlags) ? SZ_OK : SZ_ERROR_ARCHIVE; 282 return (p->flags == headerFlags) ? SZ_OK : SZ_ERROR_ARCHIVE;
252 } 283 }
@@ -257,8 +288,7 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO
257 288
258void Xzs_Construct(CXzs *p) 289void Xzs_Construct(CXzs *p)
259{ 290{
260 p->num = p->numAllocated = 0; 291 Xzs_CONSTRUCT(p)
261 p->streams = 0;
262} 292}
263 293
264void Xzs_Free(CXzs *p, ISzAllocPtr alloc) 294void Xzs_Free(CXzs *p, ISzAllocPtr alloc)
@@ -268,7 +298,7 @@ void Xzs_Free(CXzs *p, ISzAllocPtr alloc)
268 Xz_Free(&p->streams[i], alloc); 298 Xz_Free(&p->streams[i], alloc);
269 ISzAlloc_Free(alloc, p->streams); 299 ISzAlloc_Free(alloc, p->streams);
270 p->num = p->numAllocated = 0; 300 p->num = p->numAllocated = 0;
271 p->streams = 0; 301 p->streams = NULL;
272} 302}
273 303
274UInt64 Xzs_GetNumBlocks(const CXzs *p) 304UInt64 Xzs_GetNumBlocks(const CXzs *p)
@@ -307,34 +337,49 @@ UInt64 Xzs_GetPackSize(const CXzs *p)
307SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc) 337SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc)
308{ 338{
309 Int64 endOffset = 0; 339 Int64 endOffset = 0;
340 // it is assumed that the CXzs object is empty here.
341 // if the CXzs object is not empty, new streams will be appended to that non-empty object.
342 // Xzs_Free(p, alloc); // optional call to empty the CXzs object.
310 RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END)) 343 RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END))
311 *startOffset = endOffset; 344 *startOffset = endOffset;
312 for (;;) 345 for (;;)
313 { 346 {
314 CXzStream st; 347 CXzStream st;
315 SRes res; 348 SRes res;
316 Xz_Construct(&st); 349 Xz_CONSTRUCT(&st)
317 res = Xz_ReadBackward(&st, stream, startOffset, alloc); 350 res = Xz_ReadBackward(&st, stream, startOffset, alloc);
351 // if (res == SZ_OK), then (*startOffset) is the start offset of the new stream.
352 // if (res != SZ_OK), then (*startOffset) is unchanged, or it's the expected start offset of the stream with the error.
318 st.startOffset = (UInt64)*startOffset; 353 st.startOffset = (UInt64)*startOffset;
319 RINOK(res) 354 // we must either store the (st) object to the array or free the local (st) object.
355 if (res != SZ_OK)
356 {
357 Xz_Free(&st, alloc);
358 return res;
359 }
320 if (p->num == p->numAllocated) 360 if (p->num == p->numAllocated)
321 { 361 {
322 const size_t newNum = p->num + p->num / 4 + 1; 362 const size_t newNum = p->num + p->num / 4 + 1;
323 void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); 363 void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream));
324 if (!data) 364 if (!data)
365 {
366 Xz_Free(&st, alloc);
325 return SZ_ERROR_MEM; 367 return SZ_ERROR_MEM;
368 }
326 p->numAllocated = newNum; 369 p->numAllocated = newNum;
327 if (p->num != 0) 370 if (p->num != 0)
328 memcpy(data, p->streams, p->num * sizeof(CXzStream)); 371 memcpy(data, p->streams, p->num * sizeof(CXzStream));
329 ISzAlloc_Free(alloc, p->streams); 372 ISzAlloc_Free(alloc, p->streams);
330 p->streams = (CXzStream *)data; 373 p->streams = (CXzStream *)data;
331 } 374 }
375 // we copy the raw data of the local variable (st) directly to the object in the array,
376 // so we don't need to call Xz_Free(&st, alloc) after the copy and after p->num++
332 p->streams[p->num++] = st; 377 p->streams[p->num++] = st;
333 if (*startOffset == 0) 378 if (*startOffset == 0)
334 break; 379 return SZ_OK;
335 RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset)) 380 // seek operation is optional:
381 // RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset))
336 if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK) 382 if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK)
337 return SZ_ERROR_PROGRESS; 383 return SZ_ERROR_PROGRESS;
338 } 384 }
339 return SZ_OK;
340} 385}
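
A minimal caller sketch for the multi-stream reader, assuming an ILookInStream implementation is already available; the function names are the ones visible in this diff (Xzs_Construct, Xzs_ReadBackward, Xzs_GetNumBlocks, Xzs_Free):

static SRes OpenAllXzStreams(ILookInStreamPtr lookStream, ISzAllocPtr alloc)
{
  CXzs xzs;
  Int64 startOffset;  // on success: start offset of the first stream in the file
  SRes res;
  Xzs_Construct(&xzs);
  res = Xzs_ReadBackward(&xzs, lookStream, &startOffset, NULL, alloc);
  if (res == SZ_OK)
  {
    // xzs.num streams were parsed, last-in-file first
    const UInt64 numBlocks = Xzs_GetNumBlocks(&xzs);
    (void)numBlocks;  // e.g. use it to size decoding structures
  }
  Xzs_Free(&xzs, alloc);  // safe also after a partial (failed) read
  return res;
}
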