| author | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2025-07-05 00:00:00 +0000 |
|---|---|---|
| committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2025-07-05 19:27:33 +0500 |
| commit | 395149956d696e6e3099d8b76d797437f94a6942 (patch) | |
| tree | 6ed5013a637078ae2dfdc4acf1ad93bf29cea356 /C | |
| parent | e5431fa6f5505e385c6f9367260717e9c47dc2ee (diff) | |
| download | 7zip-25.00.tar.gz 7zip-25.00.tar.bz2 7zip-25.00.zip | |
25.00
Diffstat (limited to 'C')
| -rw-r--r-- | C/7zVersion.h | 10 | ||||
| -rw-r--r-- | C/BwtSort.c | 468 | ||||
| -rw-r--r-- | C/BwtSort.h | 7 | ||||
| -rw-r--r-- | C/Compiler.h | 12 | ||||
| -rw-r--r-- | C/CpuArch.h | 8 | ||||
| -rw-r--r-- | C/HuffEnc.c | 384 | ||||
| -rw-r--r-- | C/HuffEnc.h | 8 | ||||
| -rw-r--r-- | C/LzFind.c | 24 | ||||
| -rw-r--r-- | C/LzFindMt.c | 10 | ||||
| -rw-r--r-- | C/LzFindMt.h | 6 | ||||
| -rw-r--r-- | C/Lzma2Enc.c | 4 | ||||
| -rw-r--r-- | C/Lzma2Enc.h | 1 | ||||
| -rw-r--r-- | C/LzmaEnc.c | 6 | ||||
| -rw-r--r-- | C/LzmaEnc.h | 4 | ||||
| -rw-r--r-- | C/MtCoder.c | 61 | ||||
| -rw-r--r-- | C/MtCoder.h | 7 | ||||
| -rw-r--r-- | C/Sha512.c | 167 | ||||
| -rw-r--r-- | C/Sort.c | 355 | ||||
| -rw-r--r-- | C/Sort.h | 7 | ||||
| -rw-r--r-- | C/Threads.c | 237 | ||||
| -rw-r--r-- | C/Threads.h | 12 | ||||
| -rw-r--r-- | C/Util/Lzma/LzmaUtil.dsp | 4 | ||||
| -rw-r--r-- | C/Util/LzmaLib/LzmaLib.dsp | 8 | ||||
| -rw-r--r-- | C/Xz.h | 12 | ||||
| -rw-r--r-- | C/XzCrc64Opt.c | 4 | ||||
| -rw-r--r-- | C/XzDec.c | 29 | ||||
| -rw-r--r-- | C/XzEnc.c | 8 | ||||
| -rw-r--r-- | C/XzEnc.h | 3 | ||||
| -rw-r--r-- | C/XzIn.c | 265 |
29 files changed, 1524 insertions, 607 deletions
diff --git a/C/7zVersion.h b/C/7zVersion.h index e82ba0b..72733f7 100644 --- a/C/7zVersion.h +++ b/C/7zVersion.h | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | #define MY_VER_MAJOR 24 | 1 | #define MY_VER_MAJOR 25 |
| 2 | #define MY_VER_MINOR 9 | 2 | #define MY_VER_MINOR 0 |
| 3 | #define MY_VER_BUILD 0 | 3 | #define MY_VER_BUILD 0 |
| 4 | #define MY_VERSION_NUMBERS "24.09" | 4 | #define MY_VERSION_NUMBERS "25.00" |
| 5 | #define MY_VERSION MY_VERSION_NUMBERS | 5 | #define MY_VERSION MY_VERSION_NUMBERS |
| 6 | 6 | ||
| 7 | #ifdef MY_CPU_NAME | 7 | #ifdef MY_CPU_NAME |
| @@ -10,12 +10,12 @@ | |||
| 10 | #define MY_VERSION_CPU MY_VERSION | 10 | #define MY_VERSION_CPU MY_VERSION |
| 11 | #endif | 11 | #endif |
| 12 | 12 | ||
| 13 | #define MY_DATE "2024-11-29" | 13 | #define MY_DATE "2025-07-05" |
| 14 | #undef MY_COPYRIGHT | 14 | #undef MY_COPYRIGHT |
| 15 | #undef MY_VERSION_COPYRIGHT_DATE | 15 | #undef MY_VERSION_COPYRIGHT_DATE |
| 16 | #define MY_AUTHOR_NAME "Igor Pavlov" | 16 | #define MY_AUTHOR_NAME "Igor Pavlov" |
| 17 | #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" | 17 | #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" |
| 18 | #define MY_COPYRIGHT_CR "Copyright (c) 1999-2024 Igor Pavlov" | 18 | #define MY_COPYRIGHT_CR "Copyright (c) 1999-2025 Igor Pavlov" |
| 19 | 19 | ||
| 20 | #ifdef USE_COPYRIGHT_CR | 20 | #ifdef USE_COPYRIGHT_CR |
| 21 | #define MY_COPYRIGHT MY_COPYRIGHT_CR | 21 | #define MY_COPYRIGHT MY_COPYRIGHT_CR |
diff --git a/C/BwtSort.c b/C/BwtSort.c index 05ad6de..8f64f9d 100644 --- a/C/BwtSort.c +++ b/C/BwtSort.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* BwtSort.c -- BWT block sorting | 1 | /* BwtSort.c -- BWT block sorting |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -7,6 +7,44 @@ | |||
| 7 | #include "Sort.h" | 7 | #include "Sort.h" |
| 8 | 8 | ||
| 9 | /* #define BLOCK_SORT_USE_HEAP_SORT */ | 9 | /* #define BLOCK_SORT_USE_HEAP_SORT */ |
| 10 | // #define BLOCK_SORT_USE_HEAP_SORT | ||
| 11 | |||
| 12 | #ifdef BLOCK_SORT_USE_HEAP_SORT | ||
| 13 | |||
| 14 | #define HeapSortRefDown(p, vals, n, size, temp) \ | ||
| 15 | { size_t k = n; UInt32 val = vals[temp]; for (;;) { \ | ||
| 16 | size_t s = k << 1; \ | ||
| 17 | if (s > size) break; \ | ||
| 18 | if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \ | ||
| 19 | if (val >= vals[p[s]]) break; \ | ||
| 20 | p[k] = p[s]; k = s; \ | ||
| 21 | } p[k] = temp; } | ||
| 22 | |||
| 23 | void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size) | ||
| 24 | { | ||
| 25 | if (size <= 1) | ||
| 26 | return; | ||
| 27 | p--; | ||
| 28 | { | ||
| 29 | size_t i = size / 2; | ||
| 30 | do | ||
| 31 | { | ||
| 32 | UInt32 temp = p[i]; | ||
| 33 | HeapSortRefDown(p, vals, i, size, temp); | ||
| 34 | } | ||
| 35 | while (--i != 0); | ||
| 36 | } | ||
| 37 | do | ||
| 38 | { | ||
| 39 | UInt32 temp = p[size]; | ||
| 40 | p[size--] = p[1]; | ||
| 41 | HeapSortRefDown(p, vals, 1, size, temp); | ||
| 42 | } | ||
| 43 | while (size > 1); | ||
| 44 | } | ||
| 45 | |||
| 46 | #endif // BLOCK_SORT_USE_HEAP_SORT | ||
| 47 | |||
| 10 | 48 | ||
| 11 | /* Don't change it !!! */ | 49 | /* Don't change it !!! */ |
| 12 | #define kNumHashBytes 2 | 50 | #define kNumHashBytes 2 |
| @@ -27,26 +65,27 @@ | |||
| 27 | 65 | ||
| 28 | #else | 66 | #else |
| 29 | 67 | ||
| 30 | #define kNumBitsMax 20 | 68 | #define kNumBitsMax 20 |
| 31 | #define kIndexMask ((1 << kNumBitsMax) - 1) | 69 | #define kIndexMask (((UInt32)1 << kNumBitsMax) - 1) |
| 32 | #define kNumExtraBits (32 - kNumBitsMax) | 70 | #define kNumExtraBits (32 - kNumBitsMax) |
| 33 | #define kNumExtra0Bits (kNumExtraBits - 2) | 71 | #define kNumExtra0Bits (kNumExtraBits - 2) |
| 34 | #define kNumExtra0Mask ((1 << kNumExtra0Bits) - 1) | 72 | #define kNumExtra0Mask ((1 << kNumExtra0Bits) - 1) |
| 35 | 73 | ||
| 36 | #define SetFinishedGroupSize(p, size) \ | 74 | #define SetFinishedGroupSize(p, size) \ |
| 37 | { *(p) |= ((((size) - 1) & kNumExtra0Mask) << kNumBitsMax); \ | 75 | { *(p) |= ((((UInt32)(size) - 1) & kNumExtra0Mask) << kNumBitsMax); \ |
| 38 | if ((size) > (1 << kNumExtra0Bits)) { \ | 76 | if ((size) > (1 << kNumExtra0Bits)) { \ |
| 39 | *(p) |= 0x40000000; *((p) + 1) |= ((((size) - 1)>> kNumExtra0Bits) << kNumBitsMax); } } \ | 77 | *(p) |= 0x40000000; \ |
| 78 | *((p) + 1) |= (((UInt32)(size) - 1) >> kNumExtra0Bits) << kNumBitsMax; } } \ | ||
| 40 | 79 | ||
| 41 | static void SetGroupSize(UInt32 *p, UInt32 size) | 80 | static void SetGroupSize(UInt32 *p, size_t size) |
| 42 | { | 81 | { |
| 43 | if (--size == 0) | 82 | if (--size == 0) |
| 44 | return; | 83 | return; |
| 45 | *p |= 0x80000000 | ((size & kNumExtra0Mask) << kNumBitsMax); | 84 | *p |= 0x80000000 | (((UInt32)size & kNumExtra0Mask) << kNumBitsMax); |
| 46 | if (size >= (1 << kNumExtra0Bits)) | 85 | if (size >= (1 << kNumExtra0Bits)) |
| 47 | { | 86 | { |
| 48 | *p |= 0x40000000; | 87 | *p |= 0x40000000; |
| 49 | p[1] |= ((size >> kNumExtra0Bits) << kNumBitsMax); | 88 | p[1] |= (((UInt32)size >> kNumExtra0Bits) << kNumBitsMax); |
| 50 | } | 89 | } |
| 51 | } | 90 | } |
| 52 | 91 | ||
| @@ -59,12 +98,14 @@ returns: 1 - if there are groups, 0 - no more groups | |||
| 59 | */ | 98 | */ |
| 60 | 99 | ||
| 61 | static | 100 | static |
| 62 | UInt32 | 101 | unsigned |
| 63 | Z7_FASTCALL | 102 | Z7_FASTCALL |
| 64 | SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 groupSize, int NumRefBits, UInt32 *Indices | 103 | SortGroup(size_t BlockSize, size_t NumSortedBytes, |
| 65 | #ifndef BLOCK_SORT_USE_HEAP_SORT | 104 | size_t groupOffset, size_t groupSize, |
| 66 | , UInt32 left, UInt32 range | 105 | unsigned NumRefBits, UInt32 *Indices |
| 67 | #endif | 106 | #ifndef BLOCK_SORT_USE_HEAP_SORT |
| 107 | , size_t left, size_t range | ||
| 108 | #endif | ||
| 68 | ) | 109 | ) |
| 69 | { | 110 | { |
| 70 | UInt32 *ind2 = Indices + groupOffset; | 111 | UInt32 *ind2 = Indices + groupOffset; |
| @@ -79,90 +120,93 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr | |||
| 79 | return 0; | 120 | return 0; |
| 80 | } | 121 | } |
| 81 | Groups = Indices + BlockSize + BS_TEMP_SIZE; | 122 | Groups = Indices + BlockSize + BS_TEMP_SIZE; |
| 82 | if (groupSize <= ((UInt32)1 << NumRefBits) | 123 | if (groupSize <= ((size_t)1 << NumRefBits) |
| 83 | #ifndef BLOCK_SORT_USE_HEAP_SORT | 124 | #ifndef BLOCK_SORT_USE_HEAP_SORT |
| 84 | && groupSize <= range | 125 | && groupSize <= range |
| 85 | #endif | 126 | #endif |
| 86 | ) | 127 | ) |
| 87 | { | 128 | { |
| 88 | UInt32 *temp = Indices + BlockSize; | 129 | UInt32 *temp = Indices + BlockSize; |
| 89 | UInt32 j; | 130 | size_t j, group; |
| 90 | UInt32 mask, thereAreGroups, group, cg; | 131 | UInt32 mask, cg; |
| 132 | unsigned thereAreGroups; | ||
| 91 | { | 133 | { |
| 92 | UInt32 gPrev; | 134 | UInt32 gPrev; |
| 93 | UInt32 gRes = 0; | 135 | UInt32 gRes = 0; |
| 94 | { | 136 | { |
| 95 | UInt32 sp = ind2[0] + NumSortedBytes; | 137 | size_t sp = ind2[0] + NumSortedBytes; |
| 96 | if (sp >= BlockSize) sp -= BlockSize; | 138 | if (sp >= BlockSize) |
| 139 | sp -= BlockSize; | ||
| 97 | gPrev = Groups[sp]; | 140 | gPrev = Groups[sp]; |
| 98 | temp[0] = (gPrev << NumRefBits); | 141 | temp[0] = gPrev << NumRefBits; |
| 99 | } | 142 | } |
| 100 | 143 | ||
| 101 | for (j = 1; j < groupSize; j++) | 144 | for (j = 1; j < groupSize; j++) |
| 102 | { | 145 | { |
| 103 | UInt32 sp = ind2[j] + NumSortedBytes; | 146 | size_t sp = ind2[j] + NumSortedBytes; |
| 104 | UInt32 g; | 147 | UInt32 g; |
| 105 | if (sp >= BlockSize) sp -= BlockSize; | 148 | if (sp >= BlockSize) |
| 149 | sp -= BlockSize; | ||
| 106 | g = Groups[sp]; | 150 | g = Groups[sp]; |
| 107 | temp[j] = (g << NumRefBits) | j; | 151 | temp[j] = (g << NumRefBits) | (UInt32)j; |
| 108 | gRes |= (gPrev ^ g); | 152 | gRes |= (gPrev ^ g); |
| 109 | } | 153 | } |
| 110 | if (gRes == 0) | 154 | if (gRes == 0) |
| 111 | { | 155 | { |
| 112 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 156 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
| 113 | SetGroupSize(ind2, groupSize); | 157 | SetGroupSize(ind2, groupSize); |
| 114 | #endif | 158 | #endif |
| 115 | return 1; | 159 | return 1; |
| 116 | } | 160 | } |
| 117 | } | 161 | } |
| 118 | 162 | ||
| 119 | HeapSort(temp, groupSize); | 163 | HeapSort(temp, groupSize); |
| 120 | mask = (((UInt32)1 << NumRefBits) - 1); | 164 | mask = ((UInt32)1 << NumRefBits) - 1; |
| 121 | thereAreGroups = 0; | 165 | thereAreGroups = 0; |
| 122 | 166 | ||
| 123 | group = groupOffset; | 167 | group = groupOffset; |
| 124 | cg = (temp[0] >> NumRefBits); | 168 | cg = temp[0] >> NumRefBits; |
| 125 | temp[0] = ind2[temp[0] & mask]; | 169 | temp[0] = ind2[temp[0] & mask]; |
| 126 | 170 | ||
| 127 | { | 171 | { |
| 128 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 172 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
| 129 | UInt32 *Flags = Groups + BlockSize; | 173 | UInt32 *Flags = Groups + BlockSize; |
| 130 | #else | 174 | #else |
| 131 | UInt32 prevGroupStart = 0; | 175 | size_t prevGroupStart = 0; |
| 132 | #endif | 176 | #endif |
| 133 | 177 | ||
| 134 | for (j = 1; j < groupSize; j++) | 178 | for (j = 1; j < groupSize; j++) |
| 135 | { | 179 | { |
| 136 | UInt32 val = temp[j]; | 180 | const UInt32 val = temp[j]; |
| 137 | UInt32 cgCur = (val >> NumRefBits); | 181 | const UInt32 cgCur = val >> NumRefBits; |
| 138 | 182 | ||
| 139 | if (cgCur != cg) | 183 | if (cgCur != cg) |
| 140 | { | 184 | { |
| 141 | cg = cgCur; | 185 | cg = cgCur; |
| 142 | group = groupOffset + j; | 186 | group = groupOffset + j; |
| 143 | 187 | ||
| 144 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 188 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
| 145 | { | 189 | { |
| 146 | UInt32 t = group - 1; | 190 | const size_t t = group - 1; |
| 147 | Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); | 191 | Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask)); |
| 148 | } | 192 | } |
| 149 | #else | 193 | #else |
| 150 | SetGroupSize(temp + prevGroupStart, j - prevGroupStart); | 194 | SetGroupSize(temp + prevGroupStart, j - prevGroupStart); |
| 151 | prevGroupStart = j; | 195 | prevGroupStart = j; |
| 152 | #endif | 196 | #endif |
| 153 | } | 197 | } |
| 154 | else | 198 | else |
| 155 | thereAreGroups = 1; | 199 | thereAreGroups = 1; |
| 156 | { | 200 | { |
| 157 | UInt32 ind = ind2[val & mask]; | 201 | const UInt32 ind = ind2[val & mask]; |
| 158 | temp[j] = ind; | 202 | temp[j] = ind; |
| 159 | Groups[ind] = group; | 203 | Groups[ind] = (UInt32)group; |
| 160 | } | 204 | } |
| 161 | } | 205 | } |
| 162 | 206 | ||
| 163 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 207 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
| 164 | SetGroupSize(temp + prevGroupStart, j - prevGroupStart); | 208 | SetGroupSize(temp + prevGroupStart, j - prevGroupStart); |
| 165 | #endif | 209 | #endif |
| 166 | } | 210 | } |
| 167 | 211 | ||
| 168 | for (j = 0; j < groupSize; j++) | 212 | for (j = 0; j < groupSize; j++) |
| @@ -172,37 +216,42 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr | |||
| 172 | 216 | ||
| 173 | /* Check that all strings are in one group (cannot sort) */ | 217 | /* Check that all strings are in one group (cannot sort) */ |
| 174 | { | 218 | { |
| 175 | UInt32 group, j; | 219 | UInt32 group; |
| 176 | UInt32 sp = ind2[0] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; | 220 | size_t j; |
| 221 | size_t sp = ind2[0] + NumSortedBytes; | ||
| 222 | if (sp >= BlockSize) | ||
| 223 | sp -= BlockSize; | ||
| 177 | group = Groups[sp]; | 224 | group = Groups[sp]; |
| 178 | for (j = 1; j < groupSize; j++) | 225 | for (j = 1; j < groupSize; j++) |
| 179 | { | 226 | { |
| 180 | sp = ind2[j] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; | 227 | sp = ind2[j] + NumSortedBytes; |
| 228 | if (sp >= BlockSize) | ||
| 229 | sp -= BlockSize; | ||
| 181 | if (Groups[sp] != group) | 230 | if (Groups[sp] != group) |
| 182 | break; | 231 | break; |
| 183 | } | 232 | } |
| 184 | if (j == groupSize) | 233 | if (j == groupSize) |
| 185 | { | 234 | { |
| 186 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 235 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
| 187 | SetGroupSize(ind2, groupSize); | 236 | SetGroupSize(ind2, groupSize); |
| 188 | #endif | 237 | #endif |
| 189 | return 1; | 238 | return 1; |
| 190 | } | 239 | } |
| 191 | } | 240 | } |
| 192 | 241 | ||
| 193 | #ifndef BLOCK_SORT_USE_HEAP_SORT | 242 | #ifndef BLOCK_SORT_USE_HEAP_SORT |
| 194 | { | 243 | { |
| 195 | /* ---------- Range Sort ---------- */ | 244 | /* ---------- Range Sort ---------- */ |
| 196 | UInt32 i; | 245 | size_t i; |
| 197 | UInt32 mid; | 246 | size_t mid; |
| 198 | for (;;) | 247 | for (;;) |
| 199 | { | 248 | { |
| 200 | UInt32 j; | 249 | size_t j; |
| 201 | if (range <= 1) | 250 | if (range <= 1) |
| 202 | { | 251 | { |
| 203 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 252 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
| 204 | SetGroupSize(ind2, groupSize); | 253 | SetGroupSize(ind2, groupSize); |
| 205 | #endif | 254 | #endif |
| 206 | return 1; | 255 | return 1; |
| 207 | } | 256 | } |
| 208 | mid = left + ((range + 1) >> 1); | 257 | mid = left + ((range + 1) >> 1); |
| @@ -210,7 +259,7 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr | |||
| 210 | i = 0; | 259 | i = 0; |
| 211 | do | 260 | do |
| 212 | { | 261 | { |
| 213 | UInt32 sp = ind2[i] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; | 262 | size_t sp = ind2[i] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; |
| 214 | if (Groups[sp] >= mid) | 263 | if (Groups[sp] >= mid) |
| 215 | { | 264 | { |
| 216 | for (j--; j > i; j--) | 265 | for (j--; j > i; j--) |
| @@ -238,51 +287,53 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr | |||
| 238 | break; | 287 | break; |
| 239 | } | 288 | } |
| 240 | 289 | ||
| 241 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 290 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
| 242 | { | 291 | { |
| 243 | UInt32 t = (groupOffset + i - 1); | 292 | const size_t t = groupOffset + i - 1; |
| 244 | UInt32 *Flags = Groups + BlockSize; | 293 | UInt32 *Flags = Groups + BlockSize; |
| 245 | Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); | 294 | Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask)); |
| 246 | } | 295 | } |
| 247 | #endif | 296 | #endif |
| 248 | 297 | ||
| 249 | { | 298 | { |
| 250 | UInt32 j; | 299 | size_t j; |
| 251 | for (j = i; j < groupSize; j++) | 300 | for (j = i; j < groupSize; j++) |
| 252 | Groups[ind2[j]] = groupOffset + i; | 301 | Groups[ind2[j]] = (UInt32)(groupOffset + i); |
| 253 | } | 302 | } |
| 254 | 303 | ||
| 255 | { | 304 | { |
| 256 | UInt32 res = SortGroup(BlockSize, NumSortedBytes, groupOffset, i, NumRefBits, Indices, left, mid - left); | 305 | unsigned res = SortGroup(BlockSize, NumSortedBytes, groupOffset, i, NumRefBits, Indices, left, mid - left); |
| 257 | return res | SortGroup(BlockSize, NumSortedBytes, groupOffset + i, groupSize - i, NumRefBits, Indices, mid, range - (mid - left)); | 306 | return res | SortGroup(BlockSize, NumSortedBytes, groupOffset + i, groupSize - i, NumRefBits, Indices, mid, range - (mid - left)); |
| 258 | } | 307 | } |
| 259 | 308 | ||
| 260 | } | 309 | } |
| 261 | 310 | ||
| 262 | #else | 311 | #else // BLOCK_SORT_USE_HEAP_SORT |
| 263 | 312 | ||
| 264 | /* ---------- Heap Sort ---------- */ | 313 | /* ---------- Heap Sort ---------- */ |
| 265 | 314 | ||
| 266 | { | 315 | { |
| 267 | UInt32 j; | 316 | size_t j; |
| 268 | for (j = 0; j < groupSize; j++) | 317 | for (j = 0; j < groupSize; j++) |
| 269 | { | 318 | { |
| 270 | UInt32 sp = ind2[j] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; | 319 | size_t sp = ind2[j] + NumSortedBytes; |
| 271 | ind2[j] = sp; | 320 | if (sp >= BlockSize) |
| 321 | sp -= BlockSize; | ||
| 322 | ind2[j] = (UInt32)sp; | ||
| 272 | } | 323 | } |
| 273 | 324 | ||
| 274 | HeapSortRef(ind2, Groups, groupSize); | 325 | HeapSortRef(ind2, Groups, groupSize); |
| 275 | 326 | ||
| 276 | /* Write Flags */ | 327 | /* Write Flags */ |
| 277 | { | 328 | { |
| 278 | UInt32 sp = ind2[0]; | 329 | size_t sp = ind2[0]; |
| 279 | UInt32 group = Groups[sp]; | 330 | UInt32 group = Groups[sp]; |
| 280 | 331 | ||
| 281 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 332 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
| 282 | UInt32 *Flags = Groups + BlockSize; | 333 | UInt32 *Flags = Groups + BlockSize; |
| 283 | #else | 334 | #else |
| 284 | UInt32 prevGroupStart = 0; | 335 | size_t prevGroupStart = 0; |
| 285 | #endif | 336 | #endif |
| 286 | 337 | ||
| 287 | for (j = 1; j < groupSize; j++) | 338 | for (j = 1; j < groupSize; j++) |
| 288 | { | 339 | { |
| @@ -290,149 +341,210 @@ SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 gr | |||
| 290 | if (Groups[sp] != group) | 341 | if (Groups[sp] != group) |
| 291 | { | 342 | { |
| 292 | group = Groups[sp]; | 343 | group = Groups[sp]; |
| 293 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 344 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
| 294 | { | 345 | { |
| 295 | UInt32 t = groupOffset + j - 1; | 346 | const size_t t = groupOffset + j - 1; |
| 296 | Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); | 347 | Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); |
| 297 | } | 348 | } |
| 298 | #else | 349 | #else |
| 299 | SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart); | 350 | SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart); |
| 300 | prevGroupStart = j; | 351 | prevGroupStart = j; |
| 301 | #endif | 352 | #endif |
| 302 | } | 353 | } |
| 303 | } | 354 | } |
| 304 | 355 | ||
| 305 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 356 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
| 306 | SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart); | 357 | SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart); |
| 307 | #endif | 358 | #endif |
| 308 | } | 359 | } |
| 309 | { | 360 | { |
| 310 | /* Write new Groups values and Check that there are groups */ | 361 | /* Write new Groups values and Check that there are groups */ |
| 311 | UInt32 thereAreGroups = 0; | 362 | unsigned thereAreGroups = 0; |
| 312 | for (j = 0; j < groupSize; j++) | 363 | for (j = 0; j < groupSize; j++) |
| 313 | { | 364 | { |
| 314 | UInt32 group = groupOffset + j; | 365 | size_t group = groupOffset + j; |
| 315 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 366 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
| 316 | UInt32 subGroupSize = ((ind2[j] & ~0xC0000000) >> kNumBitsMax); | 367 | UInt32 subGroupSize = ((ind2[j] & ~0xC0000000) >> kNumBitsMax); |
| 317 | if ((ind2[j] & 0x40000000) != 0) | 368 | if (ind2[j] & 0x40000000) |
| 318 | subGroupSize += ((ind2[(size_t)j + 1] >> kNumBitsMax) << kNumExtra0Bits); | 369 | subGroupSize += ((ind2[(size_t)j + 1] >> kNumBitsMax) << kNumExtra0Bits); |
| 319 | subGroupSize++; | 370 | subGroupSize++; |
| 320 | for (;;) | 371 | for (;;) |
| 321 | { | 372 | { |
| 322 | UInt32 original = ind2[j]; | 373 | const UInt32 original = ind2[j]; |
| 323 | UInt32 sp = original & kIndexMask; | 374 | size_t sp = original & kIndexMask; |
| 324 | if (sp < NumSortedBytes) sp += BlockSize; sp -= NumSortedBytes; | 375 | if (sp < NumSortedBytes) |
| 325 | ind2[j] = sp | (original & ~kIndexMask); | 376 | sp += BlockSize; |
| 326 | Groups[sp] = group; | 377 | sp -= NumSortedBytes; |
| 378 | ind2[j] = (UInt32)sp | (original & ~kIndexMask); | ||
| 379 | Groups[sp] = (UInt32)group; | ||
| 327 | if (--subGroupSize == 0) | 380 | if (--subGroupSize == 0) |
| 328 | break; | 381 | break; |
| 329 | j++; | 382 | j++; |
| 330 | thereAreGroups = 1; | 383 | thereAreGroups = 1; |
| 331 | } | 384 | } |
| 332 | #else | 385 | #else |
| 333 | UInt32 *Flags = Groups + BlockSize; | 386 | UInt32 *Flags = Groups + BlockSize; |
| 334 | for (;;) | 387 | for (;;) |
| 335 | { | 388 | { |
| 336 | UInt32 sp = ind2[j]; if (sp < NumSortedBytes) sp += BlockSize; sp -= NumSortedBytes; | 389 | size_t sp = ind2[j]; |
| 337 | ind2[j] = sp; | 390 | if (sp < NumSortedBytes) |
| 338 | Groups[sp] = group; | 391 | sp += BlockSize; |
| 392 | sp -= NumSortedBytes; | ||
| 393 | ind2[j] = (UInt32)sp; | ||
| 394 | Groups[sp] = (UInt32)group; | ||
| 339 | if ((Flags[(groupOffset + j) >> kNumFlagsBits] & (1 << ((groupOffset + j) & kFlagsMask))) == 0) | 395 | if ((Flags[(groupOffset + j) >> kNumFlagsBits] & (1 << ((groupOffset + j) & kFlagsMask))) == 0) |
| 340 | break; | 396 | break; |
| 341 | j++; | 397 | j++; |
| 342 | thereAreGroups = 1; | 398 | thereAreGroups = 1; |
| 343 | } | 399 | } |
| 344 | #endif | 400 | #endif |
| 345 | } | 401 | } |
| 346 | return thereAreGroups; | 402 | return thereAreGroups; |
| 347 | } | 403 | } |
| 348 | } | 404 | } |
| 349 | #endif | 405 | #endif // BLOCK_SORT_USE_HEAP_SORT |
| 350 | } | 406 | } |
| 351 | 407 | ||
| 408 | |||
| 352 | /* conditions: blockSize > 0 */ | 409 | /* conditions: blockSize > 0 */ |
| 353 | UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize) | 410 | UInt32 BlockSort(UInt32 *Indices, const Byte *data, size_t blockSize) |
| 354 | { | 411 | { |
| 355 | UInt32 *counters = Indices + blockSize; | 412 | UInt32 *counters = Indices + blockSize; |
| 356 | UInt32 i; | 413 | size_t i; |
| 357 | UInt32 *Groups; | 414 | UInt32 *Groups; |
| 358 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 415 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
| 359 | UInt32 *Flags; | 416 | UInt32 *Flags; |
| 360 | #endif | 417 | #endif |
| 361 | 418 | ||
| 362 | /* Radix-Sort for 2 bytes */ | 419 | /* Radix-Sort for 2 bytes */ |
| 420 | // { UInt32 yyy; for (yyy = 0; yyy < 100; yyy++) { | ||
| 363 | for (i = 0; i < kNumHashValues; i++) | 421 | for (i = 0; i < kNumHashValues; i++) |
| 364 | counters[i] = 0; | 422 | counters[i] = 0; |
| 365 | for (i = 0; i < blockSize - 1; i++) | 423 | { |
| 366 | counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]++; | 424 | const Byte *data2 = data; |
| 367 | counters[((UInt32)data[i] << 8) | data[0]]++; | 425 | size_t a = data[(size_t)blockSize - 1]; |
| 426 | const Byte *data_lim = data + blockSize; | ||
| 427 | if (blockSize >= 4) | ||
| 428 | { | ||
| 429 | data_lim -= 3; | ||
| 430 | do | ||
| 431 | { | ||
| 432 | size_t b; | ||
| 433 | b = data2[0]; counters[(a << 8) | b]++; | ||
| 434 | a = data2[1]; counters[(b << 8) | a]++; | ||
| 435 | b = data2[2]; counters[(a << 8) | b]++; | ||
| 436 | a = data2[3]; counters[(b << 8) | a]++; | ||
| 437 | data2 += 4; | ||
| 438 | } | ||
| 439 | while (data2 < data_lim); | ||
| 440 | data_lim += 3; | ||
| 441 | } | ||
| 442 | while (data2 != data_lim) | ||
| 443 | { | ||
| 444 | size_t b = *data2++; | ||
| 445 | counters[(a << 8) | b]++; | ||
| 446 | a = b; | ||
| 447 | } | ||
| 448 | } | ||
| 449 | // }} | ||
| 368 | 450 | ||
| 369 | Groups = counters + BS_TEMP_SIZE; | 451 | Groups = counters + BS_TEMP_SIZE; |
| 370 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 452 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
| 371 | Flags = Groups + blockSize; | 453 | Flags = Groups + blockSize; |
| 372 | { | 454 | { |
| 373 | UInt32 numWords = (blockSize + kFlagsMask) >> kNumFlagsBits; | 455 | const size_t numWords = (blockSize + kFlagsMask) >> kNumFlagsBits; |
| 374 | for (i = 0; i < numWords; i++) | 456 | for (i = 0; i < numWords; i++) |
| 375 | Flags[i] = kAllFlags; | 457 | Flags[i] = kAllFlags; |
| 376 | } | 458 | } |
| 377 | #endif | 459 | #endif |
| 378 | 460 | ||
| 379 | { | 461 | { |
| 380 | UInt32 sum = 0; | 462 | UInt32 sum = 0; |
| 381 | for (i = 0; i < kNumHashValues; i++) | 463 | for (i = 0; i < kNumHashValues; i++) |
| 382 | { | 464 | { |
| 383 | UInt32 groupSize = counters[i]; | 465 | const UInt32 groupSize = counters[i]; |
| 384 | if (groupSize > 0) | 466 | counters[i] = sum; |
| 467 | sum += groupSize; | ||
| 468 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | ||
| 469 | if (groupSize) | ||
| 385 | { | 470 | { |
| 386 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 471 | const UInt32 t = sum - 1; |
| 387 | UInt32 t = sum + groupSize - 1; | 472 | Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask)); |
| 388 | Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); | ||
| 389 | #endif | ||
| 390 | sum += groupSize; | ||
| 391 | } | 473 | } |
| 392 | counters[i] = sum - groupSize; | 474 | #endif |
| 393 | } | 475 | } |
| 476 | } | ||
| 394 | 477 | ||
| 395 | for (i = 0; i < blockSize - 1; i++) | 478 | for (i = 0; i < blockSize - 1; i++) |
| 396 | Groups[i] = counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]; | 479 | Groups[i] = counters[((unsigned)data[i] << 8) | data[(size_t)i + 1]]; |
| 397 | Groups[i] = counters[((UInt32)data[i] << 8) | data[0]]; | 480 | Groups[i] = counters[((unsigned)data[i] << 8) | data[0]]; |
| 481 | |||
| 482 | { | ||
| 483 | #define SET_Indices(a, b, i) \ | ||
| 484 | { UInt32 c; \ | ||
| 485 | a = (a << 8) | (b); \ | ||
| 486 | c = counters[a]; \ | ||
| 487 | Indices[c] = (UInt32)i++; \ | ||
| 488 | counters[a] = c + 1; \ | ||
| 489 | } | ||
| 398 | 490 | ||
| 399 | for (i = 0; i < blockSize - 1; i++) | 491 | size_t a = data[0]; |
| 400 | Indices[counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]++] = i; | 492 | const Byte *data_ptr = data + 1; |
| 401 | Indices[counters[((UInt32)data[i] << 8) | data[0]]++] = i; | 493 | i = 0; |
| 402 | 494 | if (blockSize >= 3) | |
| 403 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 495 | { |
| 496 | blockSize -= 2; | ||
| 497 | do | ||
| 498 | { | ||
| 499 | size_t b; | ||
| 500 | b = data_ptr[0]; SET_Indices(a, b, i) | ||
| 501 | a = data_ptr[1]; SET_Indices(b, a, i) | ||
| 502 | data_ptr += 2; | ||
| 503 | } | ||
| 504 | while (i < blockSize); | ||
| 505 | blockSize += 2; | ||
| 506 | } | ||
| 507 | if (i < blockSize - 1) | ||
| 404 | { | 508 | { |
| 509 | SET_Indices(a, data[(size_t)i + 1], i) | ||
| 510 | a = (Byte)a; | ||
| 511 | } | ||
| 512 | SET_Indices(a, data[0], i) | ||
| 513 | } | ||
| 514 | |||
| 515 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | ||
| 516 | { | ||
| 405 | UInt32 prev = 0; | 517 | UInt32 prev = 0; |
| 406 | for (i = 0; i < kNumHashValues; i++) | 518 | for (i = 0; i < kNumHashValues; i++) |
| 407 | { | 519 | { |
| 408 | UInt32 prevGroupSize = counters[i] - prev; | 520 | const UInt32 prevGroupSize = counters[i] - prev; |
| 409 | if (prevGroupSize == 0) | 521 | if (prevGroupSize == 0) |
| 410 | continue; | 522 | continue; |
| 411 | SetGroupSize(Indices + prev, prevGroupSize); | 523 | SetGroupSize(Indices + prev, prevGroupSize); |
| 412 | prev = counters[i]; | 524 | prev = counters[i]; |
| 413 | } | 525 | } |
| 414 | } | ||
| 415 | #endif | ||
| 416 | } | 526 | } |
| 527 | #endif | ||
| 417 | 528 | ||
| 418 | { | 529 | { |
| 419 | int NumRefBits; | 530 | unsigned NumRefBits; |
| 420 | UInt32 NumSortedBytes; | 531 | size_t NumSortedBytes; |
| 421 | for (NumRefBits = 0; ((blockSize - 1) >> NumRefBits) != 0; NumRefBits++); | 532 | for (NumRefBits = 0; ((blockSize - 1) >> NumRefBits) != 0; NumRefBits++) |
| 533 | {} | ||
| 422 | NumRefBits = 32 - NumRefBits; | 534 | NumRefBits = 32 - NumRefBits; |
| 423 | if (NumRefBits > kNumRefBitsMax) | 535 | if (NumRefBits > kNumRefBitsMax) |
| 424 | NumRefBits = kNumRefBitsMax; | 536 | NumRefBits = kNumRefBitsMax; |
| 425 | 537 | ||
| 426 | for (NumSortedBytes = kNumHashBytes; ; NumSortedBytes <<= 1) | 538 | for (NumSortedBytes = kNumHashBytes; ; NumSortedBytes <<= 1) |
| 427 | { | 539 | { |
| 428 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 540 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
| 429 | UInt32 finishedGroupSize = 0; | 541 | size_t finishedGroupSize = 0; |
| 430 | #endif | 542 | #endif |
| 431 | UInt32 newLimit = 0; | 543 | size_t newLimit = 0; |
| 432 | for (i = 0; i < blockSize;) | 544 | for (i = 0; i < blockSize;) |
| 433 | { | 545 | { |
| 434 | UInt32 groupSize; | 546 | size_t groupSize; |
| 435 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 547 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
| 436 | 548 | ||
| 437 | if ((Flags[i >> kNumFlagsBits] & (1 << (i & kFlagsMask))) == 0) | 549 | if ((Flags[i >> kNumFlagsBits] & (1 << (i & kFlagsMask))) == 0) |
| 438 | { | 550 | { |
| @@ -441,56 +553,56 @@ UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize) | |||
| 441 | } | 553 | } |
| 442 | for (groupSize = 1; | 554 | for (groupSize = 1; |
| 443 | (Flags[(i + groupSize) >> kNumFlagsBits] & (1 << ((i + groupSize) & kFlagsMask))) != 0; | 555 | (Flags[(i + groupSize) >> kNumFlagsBits] & (1 << ((i + groupSize) & kFlagsMask))) != 0; |
| 444 | groupSize++); | 556 | groupSize++) |
| 445 | 557 | {} | |
| 446 | groupSize++; | 558 | groupSize++; |
| 447 | 559 | ||
| 448 | #else | 560 | #else |
| 449 | 561 | ||
| 450 | groupSize = ((Indices[i] & ~0xC0000000) >> kNumBitsMax); | 562 | groupSize = (Indices[i] & ~0xC0000000) >> kNumBitsMax; |
| 451 | { | 563 | { |
| 452 | BoolInt finishedGroup = ((Indices[i] & 0x80000000) == 0); | 564 | const BoolInt finishedGroup = ((Indices[i] & 0x80000000) == 0); |
| 453 | if ((Indices[i] & 0x40000000) != 0) | 565 | if (Indices[i] & 0x40000000) |
| 454 | { | ||
| 455 | groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits); | ||
| 456 | Indices[(size_t)i + 1] &= kIndexMask; | ||
| 457 | } | ||
| 458 | Indices[i] &= kIndexMask; | ||
| 459 | groupSize++; | ||
| 460 | if (finishedGroup || groupSize == 1) | ||
| 461 | { | ||
| 462 | Indices[i - finishedGroupSize] &= kIndexMask; | ||
| 463 | if (finishedGroupSize > 1) | ||
| 464 | Indices[(size_t)(i - finishedGroupSize) + 1] &= kIndexMask; | ||
| 465 | { | 566 | { |
| 466 | UInt32 newGroupSize = groupSize + finishedGroupSize; | 567 | groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits); |
| 467 | SetFinishedGroupSize(Indices + i - finishedGroupSize, newGroupSize) | 568 | Indices[(size_t)i + 1] &= kIndexMask; |
| 468 | finishedGroupSize = newGroupSize; | ||
| 469 | } | 569 | } |
| 470 | i += groupSize; | 570 | Indices[i] &= kIndexMask; |
| 471 | continue; | 571 | groupSize++; |
| 472 | } | 572 | if (finishedGroup || groupSize == 1) |
| 473 | finishedGroupSize = 0; | 573 | { |
| 574 | Indices[i - finishedGroupSize] &= kIndexMask; | ||
| 575 | if (finishedGroupSize > 1) | ||
| 576 | Indices[(size_t)(i - finishedGroupSize) + 1] &= kIndexMask; | ||
| 577 | { | ||
| 578 | const size_t newGroupSize = groupSize + finishedGroupSize; | ||
| 579 | SetFinishedGroupSize(Indices + i - finishedGroupSize, newGroupSize) | ||
| 580 | finishedGroupSize = newGroupSize; | ||
| 581 | } | ||
| 582 | i += groupSize; | ||
| 583 | continue; | ||
| 584 | } | ||
| 585 | finishedGroupSize = 0; | ||
| 474 | } | 586 | } |
| 475 | 587 | ||
| 476 | #endif | 588 | #endif |
| 477 | 589 | ||
| 478 | if (NumSortedBytes >= blockSize) | 590 | if (NumSortedBytes >= blockSize) |
| 479 | { | 591 | { |
| 480 | UInt32 j; | 592 | size_t j; |
| 481 | for (j = 0; j < groupSize; j++) | 593 | for (j = 0; j < groupSize; j++) |
| 482 | { | 594 | { |
| 483 | UInt32 t = (i + j); | 595 | size_t t = i + j; |
| 484 | /* Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); */ | 596 | /* Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); */ |
| 485 | Groups[Indices[t]] = t; | 597 | Groups[Indices[t]] = (UInt32)t; |
| 486 | } | 598 | } |
| 487 | } | 599 | } |
| 488 | else | 600 | else |
| 489 | if (SortGroup(blockSize, NumSortedBytes, i, groupSize, NumRefBits, Indices | 601 | if (SortGroup(blockSize, NumSortedBytes, i, groupSize, NumRefBits, Indices |
| 490 | #ifndef BLOCK_SORT_USE_HEAP_SORT | 602 | #ifndef BLOCK_SORT_USE_HEAP_SORT |
| 491 | , 0, blockSize | 603 | , 0, blockSize |
| 492 | #endif | 604 | #endif |
| 493 | ) != 0) | 605 | )) |
| 494 | newLimit = i + groupSize; | 606 | newLimit = i + groupSize; |
| 495 | i += groupSize; | 607 | i += groupSize; |
| 496 | } | 608 | } |
| @@ -498,19 +610,19 @@ UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize) | |||
| 498 | break; | 610 | break; |
| 499 | } | 611 | } |
| 500 | } | 612 | } |
| 501 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS | 613 | #ifndef BLOCK_SORT_EXTERNAL_FLAGS |
| 502 | for (i = 0; i < blockSize;) | 614 | for (i = 0; i < blockSize;) |
| 503 | { | 615 | { |
| 504 | UInt32 groupSize = ((Indices[i] & ~0xC0000000) >> kNumBitsMax); | 616 | size_t groupSize = (Indices[i] & ~0xC0000000) >> kNumBitsMax; |
| 505 | if ((Indices[i] & 0x40000000) != 0) | 617 | if (Indices[i] & 0x40000000) |
| 506 | { | 618 | { |
| 507 | groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits); | 619 | groupSize += (Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits; |
| 508 | Indices[(size_t)i + 1] &= kIndexMask; | 620 | Indices[(size_t)i + 1] &= kIndexMask; |
| 509 | } | 621 | } |
| 510 | Indices[i] &= kIndexMask; | 622 | Indices[i] &= kIndexMask; |
| 511 | groupSize++; | 623 | groupSize++; |
| 512 | i += groupSize; | 624 | i += groupSize; |
| 513 | } | 625 | } |
| 514 | #endif | 626 | #endif |
| 515 | return Groups[0]; | 627 | return Groups[0]; |
| 516 | } | 628 | } |
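
The rewritten radix-count loop above is an unrolled form of a simple circular 2-byte count: for every position `i`, the pair `(data[i], data[(i + 1) % blockSize])` is tallied, including the wrap-around pair at the block end. A standalone illustration of what that loop computes (demo data and names are illustrative, not from the commit):

```c
#include <stdio.h>

int main(void)
{
  const unsigned char data[] = "abab";
  const size_t blockSize = 4;
  /* one counter per 16-bit hash value; static, so zero-initialized */
  static unsigned counters[1 << 16];
  size_t i;
  /* count every circular 2-byte pair, wrapping at the block end */
  for (i = 0; i < blockSize; i++)
    counters[((unsigned)data[i] << 8) | data[(i + 1) % blockSize]]++;
  printf("'ab' pairs: %u\n", counters[('a' << 8) | 'b']); /* prints 2 */
  printf("'ba' pairs: %u\n", counters[('b' << 8) | 'a']); /* prints 2 */
  return 0;
}
```
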
diff --git a/C/BwtSort.h b/C/BwtSort.h index a34b243..1bd2316 100644 --- a/C/BwtSort.h +++ b/C/BwtSort.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* BwtSort.h -- BWT block sorting | 1 | /* BwtSort.h -- BWT block sorting |
| 2 | 2023-03-03 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_BWT_SORT_H | 4 | #ifndef ZIP7_INC_BWT_SORT_H |
| 5 | #define ZIP7_INC_BWT_SORT_H | 5 | #define ZIP7_INC_BWT_SORT_H |
| @@ -10,16 +10,17 @@ EXTERN_C_BEGIN | |||
| 10 | 10 | ||
| 11 | /* use BLOCK_SORT_EXTERNAL_FLAGS if blockSize can be > 1M */ | 11 | /* use BLOCK_SORT_EXTERNAL_FLAGS if blockSize can be > 1M */ |
| 12 | /* #define BLOCK_SORT_EXTERNAL_FLAGS */ | 12 | /* #define BLOCK_SORT_EXTERNAL_FLAGS */ |
| 13 | // #define BLOCK_SORT_EXTERNAL_FLAGS | ||
| 13 | 14 | ||
| 14 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS | 15 | #ifdef BLOCK_SORT_EXTERNAL_FLAGS |
| 15 | #define BLOCK_SORT_EXTERNAL_SIZE(blockSize) ((((blockSize) + 31) >> 5)) | 16 | #define BLOCK_SORT_EXTERNAL_SIZE(blockSize) (((blockSize) + 31) >> 5) |
| 16 | #else | 17 | #else |
| 17 | #define BLOCK_SORT_EXTERNAL_SIZE(blockSize) 0 | 18 | #define BLOCK_SORT_EXTERNAL_SIZE(blockSize) 0 |
| 18 | #endif | 19 | #endif |
| 19 | 20 | ||
| 20 | #define BLOCK_SORT_BUF_SIZE(blockSize) ((blockSize) * 2 + BLOCK_SORT_EXTERNAL_SIZE(blockSize) + (1 << 16)) | 21 | #define BLOCK_SORT_BUF_SIZE(blockSize) ((blockSize) * 2 + BLOCK_SORT_EXTERNAL_SIZE(blockSize) + (1 << 16)) |
| 21 | 22 | ||
| 22 | UInt32 BlockSort(UInt32 *indices, const Byte *data, UInt32 blockSize); | 23 | UInt32 BlockSort(UInt32 *indices, const Byte *data, size_t blockSize); |
| 23 | 24 | ||
| 24 | EXTERN_C_END | 25 | EXTERN_C_END |
| 25 | 26 | ||
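
The header above fixes the caller contract for `BlockSort`: the work buffer must hold `BLOCK_SORT_BUF_SIZE(blockSize)` UInt32 items (indices plus groups, the optional external-flags area, and a `1 << 16` counter/temp area), and `blockSize` must be greater than 0. A minimal caller sketch under that contract (the demo data and variable names are illustrative, not part of the commit):

```c
#include <stdio.h>
#include <stdlib.h>
#include "BwtSort.h"

int main(void)
{
  const Byte data[] = "banana";
  const size_t blockSize = 6;
  /* per BwtSort.h: BLOCK_SORT_BUF_SIZE(blockSize) UInt32 items */
  UInt32 *indices = (UInt32 *)malloc(BLOCK_SORT_BUF_SIZE(blockSize) * sizeof(UInt32));
  if (!indices)
    return 1;
  {
    /* indices[0..blockSize-1] receive the start positions of the sorted
       rotations; the return value is Groups[0], i.e. the rank of the
       rotation starting at offset 0 (needed to invert the BWT) */
    const UInt32 origIndex = BlockSort(indices, data, blockSize);
    size_t i;
    for (i = 0; i < blockSize; i++)
      printf("%u ", (unsigned)indices[i]);
    printf("\norigIndex = %u\n", (unsigned)origIndex);
  }
  free(indices);
  return 0;
}
```
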
diff --git a/C/Compiler.h b/C/Compiler.h index 2a9c2b7..b266b27 100644 --- a/C/Compiler.h +++ b/C/Compiler.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Compiler.h : Compiler specific defines and pragmas | 1 | /* Compiler.h : Compiler specific defines and pragmas |
| 2 | 2024-01-22 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_COMPILER_H | 4 | #ifndef ZIP7_INC_COMPILER_H |
| 5 | #define ZIP7_INC_COMPILER_H | 5 | #define ZIP7_INC_COMPILER_H |
| @@ -183,6 +183,16 @@ typedef void (*Z7_void_Function)(void); | |||
| 183 | #define Z7_ATTRIB_NO_VECTORIZE | 183 | #define Z7_ATTRIB_NO_VECTORIZE |
| 184 | #endif | 184 | #endif |
| 185 | 185 | ||
| 186 | #if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920) | ||
| 187 | #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE _Pragma("optimize ( \"s\", on )") | ||
| 188 | #define Z7_PRAGMA_OPTIMIZE_DEFAULT _Pragma("optimize ( \"\", on )") | ||
| 189 | #else | ||
| 190 | #define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE | ||
| 191 | #define Z7_PRAGMA_OPTIMIZE_DEFAULT | ||
| 192 | #endif | ||
| 193 | |||
| 194 | |||
| 195 | |||
| 186 | #if defined(MY_CPU_X86_OR_AMD64) && ( \ | 196 | #if defined(MY_CPU_X86_OR_AMD64) && ( \ |
| 187 | defined(__clang__) && (__clang_major__ >= 4) \ | 197 | defined(__clang__) && (__clang_major__ >= 4) \ |
| 188 | || defined(__GNUC__) && (__GNUC__ >= 5)) | 198 | || defined(__GNUC__) && (__GNUC__ >= 5)) |
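
The new `Z7_PRAGMA_OPTIMIZE_*` pair wraps MSVC's `#pragma optimize`, which applies to all functions defined after it; the version gate (`>= 1920`, i.e. VS2019) skips older compilers, and on non-MSVC compilers both macros expand to nothing. A usage sketch (the function here is a made-up example, not from this commit):

```c
#include "Compiler.h"

/* favor code size for a rarely-executed path; a no-op outside MSVC >= VS2019 */
Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE
static void ReportError_Cold(int code)
{
  /* cold error path: size matters more than speed here */
  (void)code;
}
/* restore the optimization settings given on the command line */
Z7_PRAGMA_OPTIMIZE_DEFAULT
```
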
diff --git a/C/CpuArch.h b/C/CpuArch.h index a6297ea..1690a5b 100644 --- a/C/CpuArch.h +++ b/C/CpuArch.h | |||
| @@ -47,6 +47,12 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
| 47 | #define MY_CPU_SIZEOF_POINTER 4 | 47 | #define MY_CPU_SIZEOF_POINTER 4 |
| 48 | #endif | 48 | #endif |
| 49 | 49 | ||
| 50 | #if defined(__SSE2__) \ | ||
| 51 | || defined(MY_CPU_AMD64) \ | ||
| 52 | || defined(_M_IX86_FP) && (_M_IX86_FP >= 2) | ||
| 53 | #define MY_CPU_SSE2 | ||
| 54 | #endif | ||
| 55 | |||
| 50 | 56 | ||
| 51 | #if defined(_M_ARM64) \ | 57 | #if defined(_M_ARM64) \ |
| 52 | || defined(_M_ARM64EC) \ | 58 | || defined(_M_ARM64EC) \ |
| @@ -571,10 +577,12 @@ problem-4 : performance: | |||
| 571 | #define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v) | 577 | #define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v) |
| 572 | #define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) | 578 | #define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) |
| 573 | #define Z7_CONV_NATIVE_TO_BE_32(v) (v) | 579 | #define Z7_CONV_NATIVE_TO_BE_32(v) (v) |
| 580 | // #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b1) | ((b0) << 8)) | ||
| 574 | #elif defined(MY_CPU_LE) | 581 | #elif defined(MY_CPU_LE) |
| 575 | #define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) | 582 | #define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v) |
| 576 | #define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v) | 583 | #define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v) |
| 577 | #define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v) | 584 | #define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v) |
| 585 | // #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b0) | ((b1) << 8)) | ||
| 578 | #else | 586 | #else |
| 579 | #error Stop_Compiling_Unknown_Endian_CONV | 587 | #error Stop_Compiling_Unknown_Endian_CONV |
| 580 | #endif | 588 | #endif |
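
The new `MY_CPU_SSE2` define marks builds where SSE2 instructions may be emitted unconditionally: GCC/Clang with `__SSE2__`, any x64 target (`MY_CPU_AMD64`), or 32-bit MSVC with `/arch:SSE2` (`_M_IX86_FP >= 2`). A sketch of the intended compile-time gate with a scalar fallback (the function itself is illustrative, not from this commit):

```c
#include "CpuArch.h"

#ifdef MY_CPU_SSE2
#include <emmintrin.h>
#endif

/* zero (numBlocks * 16) bytes; illustrative use of the MY_CPU_SSE2 gate */
static void ZeroBlocks16(Byte *p, size_t numBlocks)
{
#ifdef MY_CPU_SSE2
  const __m128i zero = _mm_setzero_si128();
  for (; numBlocks != 0; numBlocks--, p += 16)
    _mm_storeu_si128((__m128i *)(void *)p, zero);
#else
  size_t i;
  for (i = 0; i < numBlocks * 16; i++)
    p[i] = 0;
#endif
}
```
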
diff --git a/C/HuffEnc.c b/C/HuffEnc.c index 996da30..cbf8c22 100644 --- a/C/HuffEnc.c +++ b/C/HuffEnc.c | |||
| @@ -1,60 +1,125 @@ | |||
| 1 | /* HuffEnc.c -- functions for Huffman encoding | 1 | /* HuffEnc.c -- functions for Huffman encoding |
| 2 | 2023-09-07 : Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| 6 | #include <string.h> | ||
| 7 | |||
| 6 | #include "HuffEnc.h" | 8 | #include "HuffEnc.h" |
| 7 | #include "Sort.h" | 9 | #include "Sort.h" |
| 10 | #include "CpuArch.h" | ||
| 8 | 11 | ||
| 9 | #define kMaxLen 16 | 12 | #define kMaxLen Z7_HUFFMAN_LEN_MAX |
| 10 | #define NUM_BITS 10 | 13 | #define NUM_BITS 10 |
| 11 | #define MASK ((1u << NUM_BITS) - 1) | 14 | #define MASK ((1u << NUM_BITS) - 1) |
| 12 | 15 | #define FREQ_MASK (~(UInt32)MASK) | |
| 13 | #define NUM_COUNTERS 64 | 16 | #define NUM_COUNTERS (48 * 2) |
| 14 | 17 | ||
| 15 | #define HUFFMAN_SPEED_OPT | 18 | #if 1 && (defined(MY_CPU_LE) || defined(MY_CPU_BE)) |
| 19 | #if defined(MY_CPU_LE) | ||
| 20 | #define HI_HALF_OFFSET 1 | ||
| 21 | #else | ||
| 22 | #define HI_HALF_OFFSET 0 | ||
| 23 | #endif | ||
| 24 | #define LOAD_PARENT(p) ((unsigned)*((const UInt16 *)(p) + HI_HALF_OFFSET)) | ||
| 25 | #define STORE_PARENT(p, fb, val) *((UInt16 *)(p) + HI_HALF_OFFSET) = (UInt16)(val); | ||
| 26 | #define STORE_PARENT_DIRECT(p, fb, hi) STORE_PARENT(p, fb, hi) | ||
| 27 | #define UPDATE_E(eHi) eHi++; | ||
| 28 | #else | ||
| 29 | #define LOAD_PARENT(p) ((unsigned)(*(p) >> NUM_BITS)) | ||
| 30 | #define STORE_PARENT_DIRECT(p, fb, hi) *(p) = ((fb) & MASK) | (hi); // set parent field | ||
| 31 | #define STORE_PARENT(p, fb, val) STORE_PARENT_DIRECT(p, fb, ((UInt32)(val) << NUM_BITS)) | ||
| 32 | #define UPDATE_E(eHi) eHi += 1 << NUM_BITS; | ||
| 33 | #endif | ||
| 16 | 34 | ||
| 17 | void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymbols, UInt32 maxLen) | 35 | void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, unsigned numSymbols, unsigned maxLen) |
| 18 | { | 36 | { |
| 19 | UInt32 num = 0; | 37 | #if NUM_COUNTERS > 2 |
| 20 | /* if (maxLen > 10) maxLen = 10; */ | 38 | unsigned counters[NUM_COUNTERS]; |
| 39 | #endif | ||
| 40 | #if 1 && NUM_COUNTERS > (kMaxLen + 4) * 2 | ||
| 41 | #define lenCounters (counters) | ||
| 42 | #define codes (counters + kMaxLen + 4) | ||
| 43 | #else | ||
| 44 | unsigned lenCounters[kMaxLen + 1]; | ||
| 45 | UInt32 codes[kMaxLen + 1]; | ||
| 46 | #endif | ||
| 47 | |||
| 48 | unsigned num; | ||
| 21 | { | 49 | { |
| 22 | UInt32 i; | 50 | unsigned i; |
| 23 | 51 | // UInt32 sum = 0; | |
| 24 | #ifdef HUFFMAN_SPEED_OPT | 52 | |
| 53 | #if NUM_COUNTERS > 2 | ||
| 25 | 54 | ||
| 26 | UInt32 counters[NUM_COUNTERS]; | 55 | #define CTR_ITEM_FOR_FREQ(freq) \ |
| 56 | counters[(freq) >= NUM_COUNTERS - 1 ? NUM_COUNTERS - 1 : (unsigned)(freq)] | ||
| 57 | |||
| 27 | for (i = 0; i < NUM_COUNTERS; i++) | 58 | for (i = 0; i < NUM_COUNTERS; i++) |
| 28 | counters[i] = 0; | 59 | counters[i] = 0; |
| 29 | for (i = 0; i < numSymbols; i++) | 60 | memset(lens, 0, numSymbols); |
| 30 | { | 61 | { |
| 31 | UInt32 freq = freqs[i]; | 62 | const UInt32 *fp = freqs + numSymbols; |
| 32 | counters[(freq < NUM_COUNTERS - 1) ? freq : NUM_COUNTERS - 1]++; | 63 | #define NUM_UNROLLS 1 |
| 64 | #if NUM_UNROLLS > 1 // use 1 if odd (numSymbols) is possisble | ||
| 65 | if (numSymbols & 1) | ||
| 66 | { | ||
| 67 | UInt32 f; | ||
| 68 | f = *--fp; CTR_ITEM_FOR_FREQ(f)++; | ||
| 69 | // sum += f; | ||
| 70 | } | ||
| 71 | #endif | ||
| 72 | do | ||
| 73 | { | ||
| 74 | UInt32 f; | ||
| 75 | fp -= NUM_UNROLLS; | ||
| 76 | f = fp[0]; CTR_ITEM_FOR_FREQ(f)++; | ||
| 77 | // sum += f; | ||
| 78 | #if NUM_UNROLLS > 1 | ||
| 79 | f = fp[1]; CTR_ITEM_FOR_FREQ(f)++; | ||
| 80 | // sum += f; | ||
| 81 | #endif | ||
| 82 | } | ||
| 83 | while (fp != freqs); | ||
| 33 | } | 84 | } |
| 34 | 85 | #if 0 | |
| 35 | for (i = 1; i < NUM_COUNTERS; i++) | 86 | printf("\nsum=%8u numSymbols =%3u ctrs:", sum, numSymbols); |
| 36 | { | 87 | { |
| 37 | UInt32 temp = counters[i]; | 88 | unsigned k = 0; |
| 38 | counters[i] = num; | 89 | for (k = 0; k < NUM_COUNTERS; k++) |
| 39 | num += temp; | 90 | printf(" %u", counters[k]); |
| 40 | } | 91 | } |
| 41 | 92 | #endif | |
| 42 | for (i = 0; i < numSymbols; i++) | 93 | |
| 94 | num = counters[1]; | ||
| 95 | counters[1] = 0; | ||
| 96 | for (i = 2; i != NUM_COUNTERS; i += 2) | ||
| 43 | { | 97 | { |
| 44 | UInt32 freq = freqs[i]; | 98 | unsigned c; |
| 45 | if (freq == 0) | 99 | c = (counters )[i]; (counters )[i] = num; num += c; |
| 46 | lens[i] = 0; | 100 | c = (counters + 1)[i]; (counters + 1)[i] = num; num += c; |
| 47 | else | 101 | } |
| 48 | p[counters[((freq < NUM_COUNTERS - 1) ? freq : NUM_COUNTERS - 1)]++] = i | (freq << NUM_BITS); | 102 | counters[0] = num; // we want to write (freq==0) symbols to the end of (p) array |
| 103 | { | ||
| 104 | i = 0; | ||
| 105 | do | ||
| 106 | { | ||
| 107 | const UInt32 f = freqs[i]; | ||
| 108 | #if 0 | ||
| 109 | if (f == 0) lens[i] = 0; else | ||
| 110 | #endif | ||
| 111 | p[CTR_ITEM_FOR_FREQ(f)++] = i | (f << NUM_BITS); | ||
| 112 | } | ||
| 113 | while (++i != numSymbols); | ||
| 49 | } | 114 | } |
| 50 | counters[0] = 0; | ||
| 51 | HeapSort(p + counters[NUM_COUNTERS - 2], counters[NUM_COUNTERS - 1] - counters[NUM_COUNTERS - 2]); | 115 | HeapSort(p + counters[NUM_COUNTERS - 2], counters[NUM_COUNTERS - 1] - counters[NUM_COUNTERS - 2]); |
| 52 | 116 | ||
| 53 | #else | 117 | #else // NUM_COUNTERS <= 2 |
| 54 | 118 | ||
| 119 | num = 0; | ||
| 55 | for (i = 0; i < numSymbols; i++) | 120 | for (i = 0; i < numSymbols; i++) |
| 56 | { | 121 | { |
| 57 | UInt32 freq = freqs[i]; | 122 | const UInt32 freq = freqs[i]; |
| 58 | if (freq == 0) | 123 | if (freq == 0) |
| 59 | lens[i] = 0; | 124 | lens[i] = 0; |
| 60 | else | 125 | else |
| @@ -62,17 +127,27 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb | |||
| 62 | } | 127 | } |
| 63 | HeapSort(p, num); | 128 | HeapSort(p, num); |
| 64 | 129 | ||
| 65 | #endif | 130 | #endif |
| 66 | } | 131 | } |
| 67 | 132 | ||
| 68 | if (num < 2) | 133 | if (num <= 2) |
| 69 | { | 134 | { |
| 70 | unsigned minCode = 0; | 135 | unsigned minCode = 0; |
| 71 | unsigned maxCode = 1; | 136 | unsigned maxCode = 1; |
| 72 | if (num == 1) | 137 | if (num) |
| 73 | { | 138 | { |
| 74 | maxCode = (unsigned)p[0] & MASK; | 139 | maxCode = (unsigned)p[(size_t)num - 1] & MASK; |
| 75 | if (maxCode == 0) | 140 | if (num == 2) |
| 141 | { | ||
| 142 | minCode = (unsigned)p[0] & MASK; | ||
| 143 | if (minCode > maxCode) | ||
| 144 | { | ||
| 145 | const unsigned temp = minCode; | ||
| 146 | minCode = maxCode; | ||
| 147 | maxCode = temp; | ||
| 148 | } | ||
| 149 | } | ||
| 150 | else if (maxCode == 0) | ||
| 76 | maxCode++; | 151 | maxCode++; |
| 77 | } | 152 | } |
| 78 | p[minCode] = 0; | 153 | p[minCode] = 0; |
| @@ -80,69 +155,191 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb | |||
| 80 | lens[minCode] = lens[maxCode] = 1; | 155 | lens[minCode] = lens[maxCode] = 1; |
| 81 | return; | 156 | return; |
| 82 | } | 157 | } |
| 83 | |||
| 84 | { | 158 | { |
| 85 | UInt32 b, e, i; | 159 | unsigned i; |
| 86 | 160 | for (i = 0; i <= kMaxLen; i++) | |
| 87 | i = b = e = 0; | 161 | lenCounters[i] = 0; |
| 88 | do | 162 | lenCounters[1] = 2; // by default root node has 2 child leaves at level 1. |
| 163 | } | ||
| 164 | // if (num != 2) | ||
| 165 | { | ||
| 166 | // num > 2 | ||
| 167 | // the binary tree will contain (num - 1) internal nodes. | ||
| 168 | // p[num - 2] will be root node of binary tree. | ||
| 169 | UInt32 *b; | ||
| 170 | UInt32 *n; | ||
| 171 | // first node will have two leaf children: p[0] and p[1]: | ||
| 172 | // p[0] += p[1] & FREQ_MASK; // set frequency sum of child leaves | ||
| 173 | // if (pi == n) exit(0); | ||
| 174 | // if (pi != n) | ||
| 89 | { | 175 | { |
| 90 | UInt32 n, m, freq; | 176 | UInt32 fb = (p[1] & FREQ_MASK) + p[0]; |
| 91 | n = (i != num && (b == e || (p[i] >> NUM_BITS) <= (p[b] >> NUM_BITS))) ? i++ : b++; | 177 | UInt32 f = p[2] & FREQ_MASK; |
| 92 | freq = (p[n] & ~MASK); | 178 | const UInt32 *pi = p + 2; |
| 93 | p[n] = (p[n] & MASK) | (e << NUM_BITS); | 179 | UInt32 *e = p; |
| 94 | m = (i != num && (b == e || (p[i] >> NUM_BITS) <= (p[b] >> NUM_BITS))) ? i++ : b++; | 180 | UInt32 eHi = 0; |
| 95 | freq += (p[m] & ~MASK); | 181 | n = p + num; |
| 96 | p[m] = (p[m] & MASK) | (e << NUM_BITS); | 182 | b = p; |
| 97 | p[e] = (p[e] & MASK) | freq; | 183 | // p[0] = fb; |
| 98 | e++; | 184 | for (;;) |
| 185 | { | ||
| 186 | // (b <= e) | ||
| 187 | UInt32 sum; | ||
| 188 | e++; | ||
| 189 | UPDATE_E(eHi) | ||
| 190 | |||
| 191 | // (b < e) | ||
| 192 | |||
| 193 | // p range : high bits | ||
| 194 | // [0, b) : parent : processed nodes that have parent and children | ||
| 195 | // [b, e) : FREQ : non-processed nodes that have no parent but have children | ||
| 196 | // [e, pi) : FREQ : processed leaves for which parent node was created | ||
| 197 | // [pi, n) : FREQ : non-processed leaves for which parent node was not created | ||
| 198 | |||
| 199 | // first child | ||
| 200 | // note : (*b < f) is same result as ((*b & FREQ_MASK) < f) | ||
| 201 | if (fb < f) | ||
| 202 | { | ||
| 203 | // node freq is smaller | ||
| 204 | sum = fb & FREQ_MASK; | ||
| 205 | STORE_PARENT_DIRECT (b, fb, eHi) | ||
| 206 | b++; | ||
| 207 | fb = *b; | ||
| 208 | if (b == e) | ||
| 209 | { | ||
| 210 | if (++pi == n) | ||
| 211 | break; | ||
| 212 | sum += f; | ||
| 213 | fb &= MASK; | ||
| 214 | fb |= sum; | ||
| 215 | *e = fb; | ||
| 216 | f = *pi & FREQ_MASK; | ||
| 217 | continue; | ||
| 218 | } | ||
| 219 | } | ||
| 220 | else if (++pi == n) | ||
| 221 | { | ||
| 222 | STORE_PARENT_DIRECT (b, fb, eHi) | ||
| 223 | b++; | ||
| 224 | break; | ||
| 225 | } | ||
| 226 | else | ||
| 227 | { | ||
| 228 | sum = f; | ||
| 229 | f = *pi & FREQ_MASK; | ||
| 230 | } | ||
| 231 | |||
| 232 | // (b < e) | ||
| 233 | |||
| 234 | // second child | ||
| 235 | if (fb < f) | ||
| 236 | { | ||
| 237 | sum += fb; | ||
| 238 | sum &= FREQ_MASK; | ||
| 239 | STORE_PARENT_DIRECT (b, fb, eHi) | ||
| 240 | b++; | ||
| 241 | *e = (*e & MASK) | sum; // set frequency sum | ||
| 242 | // (b <= e) is possible here | ||
| 243 | fb = *b; | ||
| 244 | } | ||
| 245 | else if (++pi == n) | ||
| 246 | break; | ||
| 247 | else | ||
| 248 | { | ||
| 249 | sum += f; | ||
| 250 | f = *pi & FREQ_MASK; | ||
| 251 | *e = (*e & MASK) | sum; // set frequency sum | ||
| 252 | } | ||
| 253 | } | ||
| 99 | } | 254 | } |
| 100 | while (num - e > 1); | ||
| 101 | 255 | ||
| 256 | // printf("\nnum-e=%3u, numSymbols=%3u, num=%3u, b=%3u", n - e, numSymbols, n - p, b - p); | ||
| 102 | { | 257 | { |
| 103 | UInt32 lenCounters[kMaxLen + 1]; | 258 | n -= 2; |
| 104 | for (i = 0; i <= kMaxLen; i++) | 259 | *n &= MASK; // root node : we clear high bits (zero bits mean level == 0) |
| 105 | lenCounters[i] = 0; | 260 | if (n != b) |
| 106 | |||
| 107 | p[--e] &= MASK; | ||
| 108 | lenCounters[1] = 2; | ||
| 109 | while (e != 0) | ||
| 110 | { | ||
| 111 | UInt32 len = (p[p[--e] >> NUM_BITS] >> NUM_BITS) + 1; | ||
| 112 | p[e] = (p[e] & MASK) | (len << NUM_BITS); | ||
| 113 | if (len >= maxLen) | ||
| 114 | for (len = maxLen - 1; lenCounters[len] == 0; len--); | ||
| 115 | lenCounters[len]--; | ||
| 116 | lenCounters[(size_t)len + 1] += 2; | ||
| 117 | } | ||
| 118 | |||
| 119 | { | 261 | { |
| 120 | UInt32 len; | 262 | // We go here, if we have some number of non-created nodes up to root. |
| 121 | i = 0; | 263 | // We process them in simplified code: |
| 122 | for (len = maxLen; len != 0; len--) | 264 | // position of parent for each pair of nodes is known. |
| 265 | // n[-2], n[-1] : current pair of child nodes | ||
| 266 | // (p1) : parent node for current pair. | ||
| 267 | UInt32 *p1 = n; | ||
| 268 | do | ||
| 123 | { | 269 | { |
| 124 | UInt32 k; | 270 | const unsigned len = LOAD_PARENT(p1) + 1; |
| 125 | for (k = lenCounters[len]; k != 0; k--) | 271 | p1--; |
| 126 | lens[p[i++] & MASK] = (Byte)len; | 272 | (lenCounters )[len] -= 2; // we remove 2 leaves from level (len) |
| 273 | (lenCounters + 1)[len] += 2 * 2; // we add 4 leaves at level (len + 1) | ||
| 274 | n -= 2; | ||
| 275 | STORE_PARENT (n , n[0], len) | ||
| 276 | STORE_PARENT (n + 1, n[1], len) | ||
| 127 | } | 277 | } |
| 278 | while (n != b); | ||
| 128 | } | 279 | } |
| 129 | 280 | } | |
| 281 | |||
| 282 | if (b != p) | ||
| 283 | { | ||
| 284 | // we detect level of each node (relative to root), | ||
| 285 | // and update lenCounters[]. | ||
| 286 | // We process only intermediate nodes and we don't process leaves. | ||
| 287 | do | ||
| 130 | { | 288 | { |
| 131 | UInt32 nextCodes[kMaxLen + 1]; | 289 | // if (ii < b) : parent_bits_of (p[ii]) == index of parent node : ii < (p[ii]) |
| 132 | { | 290 | // if (ii >= b) : parent_bits_of (p[ii]) == level of this (ii) node in tree |
| 133 | UInt32 code = 0; | 291 | unsigned len; |
| 134 | UInt32 len; | 292 | b--; |
| 135 | for (len = 1; len <= kMaxLen; len++) | 293 | len = (unsigned)LOAD_PARENT(p + LOAD_PARENT(b)) + 1; |
| 136 | nextCodes[len] = code = (code + lenCounters[(size_t)len - 1]) << 1; | 294 | STORE_PARENT (b, *b, len) |
| 137 | } | 295 | if (len >= maxLen) |
| 138 | /* if (code + lenCounters[kMaxLen] - 1 != (1 << kMaxLen) - 1) throw 1; */ | ||
| 139 | |||
| 140 | { | 296 | { |
| 141 | UInt32 k; | 297 | // We are not allowed to create node at level (maxLen) and higher, |
| 142 | for (k = 0; k < numSymbols; k++) | 298 | // because all leaves must be placed to level (maxLen) or lower. |
| 143 | p[k] = nextCodes[lens[k]]++; | 299 | // We find nearest allowed leaf and place current node to level of that leaf: |
| 300 | for (len = maxLen - 1; lenCounters[len] == 0; len--) {} | ||
| 144 | } | 301 | } |
| 302 | lenCounters[len]--; // we remove 1 leaf from level (len) | ||
| 303 | (lenCounters + 1)[len] += 2; // we add 2 leaves at level (len + 1) | ||
| 304 | } | ||
| 305 | while (b != p); | ||
| 306 | } | ||
| 307 | } | ||
| 308 | { | ||
| 309 | { | ||
| 310 | unsigned len = maxLen; | ||
| 311 | const UInt32 *p2 = p; | ||
| 312 | do | ||
| 313 | { | ||
| 314 | unsigned k = lenCounters[len]; | ||
| 315 | if (k) | ||
| 316 | do | ||
| 317 | lens[(unsigned)*p2++ & MASK] = (Byte)len; | ||
| 318 | while (--k); | ||
| 319 | } | ||
| 320 | while (--len); | ||
| 321 | } | ||
| 322 | codes[0] = 0; // we don't want garbage values to be written to p[] array. | ||
| 323 | // codes[1] = 0; | ||
| 324 | { | ||
| 325 | UInt32 code = 0; | ||
| 326 | unsigned len; | ||
| 327 | for (len = 0; len < kMaxLen; len++) | ||
| 328 | (codes + 1)[len] = code = (code + lenCounters[len]) << 1; | ||
| 329 | } | ||
| 330 | /* if (code + lenCounters[kMaxLen] - 1 != (1 << kMaxLen) - 1) throw 1; */ | ||
| 331 | { | ||
| 332 | const Byte * const limit = lens + numSymbols; | ||
| 333 | do | ||
| 334 | { | ||
| 335 | unsigned len; | ||
| 336 | UInt32 c; | ||
| 337 | len = lens[0]; c = codes[len]; p[0] = c; codes[len] = c + 1; | ||
| 338 | // len = lens[1]; c = codes[len]; p[1] = c; codes[len] = c + 1; | ||
| 339 | p += 1; | ||
| 340 | lens += 1; | ||
| 145 | } | 341 | } |
| 342 | while (lens != limit); | ||
| 146 | } | 343 | } |
| 147 | } | 344 | } |
| 148 | } | 345 | } |
| @@ -150,5 +347,14 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb | |||
| 150 | #undef kMaxLen | 347 | #undef kMaxLen |
| 151 | #undef NUM_BITS | 348 | #undef NUM_BITS |
| 152 | #undef MASK | 349 | #undef MASK |
| 350 | #undef FREQ_MASK | ||
| 153 | #undef NUM_COUNTERS | 351 | #undef NUM_COUNTERS |
| 154 | #undef HUFFMAN_SPEED_OPT | 352 | #undef CTR_ITEM_FOR_FREQ |
| 353 | #undef LOAD_PARENT | ||
| 354 | #undef STORE_PARENT | ||
| 355 | #undef STORE_PARENT_DIRECT | ||
| 356 | #undef UPDATE_E | ||
| 357 | #undef HI_HALF_OFFSET | ||
| 358 | #undef NUM_UNROLLS | ||
| 359 | #undef lenCounters | ||
| 360 | #undef codes | ||
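
The final stage of the rewritten `Huffman_Generate` above assigns canonical codes: `(codes + 1)[len] = code = (code + lenCounters[len]) << 1` computes the first code of each bit length, and each symbol (already grouped by length) then takes the next free code of its length. A standalone illustration of that recurrence, using made-up length counts that satisfy the Kraft equality (1/2 + 1/4 + 2/8 = 1):

```c
#include <stdio.h>

int main(void)
{
  enum { kMaxLen = 16 };
  /* lenCounters[len] = number of symbols with code length len:
     one 1-bit, one 2-bit and two 3-bit codes (a complete prefix code) */
  unsigned lenCounters[kMaxLen + 1] = { 0, 1, 1, 2 };
  unsigned codes[kMaxLen + 1];
  unsigned code = 0, len;
  codes[0] = 0;
  for (len = 0; len < kMaxLen; len++)
    codes[len + 1] = code = (code + lenCounters[len]) << 1;
  /* prints: len 1 -> 0 (0b0), len 2 -> 2 (0b10), len 3 -> 6 (0b110) */
  for (len = 1; len <= 3; len++)
    printf("len %u: first code = %u\n", len, codes[len]);
  return 0;
}
```
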
diff --git a/C/HuffEnc.h b/C/HuffEnc.h index cbc5d11..2217f55 100644 --- a/C/HuffEnc.h +++ b/C/HuffEnc.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* HuffEnc.h -- Huffman encoding | 1 | /* HuffEnc.h -- Huffman encoding |
| 2 | 2023-03-05 : Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_HUFF_ENC_H | 4 | #ifndef ZIP7_INC_HUFF_ENC_H |
| 5 | #define ZIP7_INC_HUFF_ENC_H | 5 | #define ZIP7_INC_HUFF_ENC_H |
| @@ -8,14 +8,14 @@ | |||
| 8 | 8 | ||
| 9 | EXTERN_C_BEGIN | 9 | EXTERN_C_BEGIN |
| 10 | 10 | ||
| 11 | #define Z7_HUFFMAN_LEN_MAX 16 | ||
| 11 | /* | 12 | /* |
| 12 | Conditions: | 13 | Conditions: |
| 13 | num <= 1024 = 2 ^ NUM_BITS | 14 | 2 <= num <= 1024 = 2 ^ NUM_BITS |
| 14 | Sum(freqs) < 4M = 2 ^ (32 - NUM_BITS) | 15 | Sum(freqs) < 4M = 2 ^ (32 - NUM_BITS) |
| 15 | maxLen <= 16 = kMaxLen | 16 | 1 <= maxLen <= 16 = Z7_HUFFMAN_LEN_MAX |
| 16 | Num_Items(p) >= HUFFMAN_TEMP_SIZE(num) | 17 | Num_Items(p) >= HUFFMAN_TEMP_SIZE(num) |
| 17 | */ | 18 | */ |
| 18 | |||
| 19 | void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 num, UInt32 maxLen); | 19 | void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 num, UInt32 maxLen); |
| 20 | 20 | ||
| 21 | EXTERN_C_END | 21 | EXTERN_C_END |
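
A calling sketch for the contract documented above (2 <= num <= 1024, Sum(freqs) < 2^22, 1 <= maxLen <= 16; `HUFFMAN_TEMP_SIZE` also comes from HuffEnc.h, per the conditions comment). The frequency values are demo data; after the call, `lens[i]` holds the code length of symbol i (0 for unused symbols) and `p[i]` its canonical code:

```c
#include <stdio.h>
#include "HuffEnc.h"

int main(void)
{
  enum { kNum = 8 };
  const UInt32 freqs[kNum] = { 40, 0, 10, 30, 4, 8, 1, 7 };
  UInt32 p[HUFFMAN_TEMP_SIZE(kNum)];  /* work area; receives the codes */
  Byte lens[kNum];
  unsigned i;
  Huffman_Generate(freqs, p, lens, kNum, 15);
  for (i = 0; i < kNum; i++)
    printf("sym %u: freq %2u  len %u  code 0x%X\n",
        i, (unsigned)freqs[i], lens[i], (unsigned)p[i]);
  return 0;
}
```
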
diff --git a/C/LzFind.c b/C/LzFind.c --- a/C/LzFind.c +++ b/C/LzFind.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* LzFind.c -- Match finder for LZ algorithms | 1 | /* LzFind.c -- Match finder for LZ algorithms |
| 2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -404,7 +404,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, | |||
| 404 | const unsigned nbMax = | 404 | const unsigned nbMax = |
| 405 | (p->numHashBytes == 2 ? 16 : | 405 | (p->numHashBytes == 2 ? 16 : |
| 406 | (p->numHashBytes == 3 ? 24 : 32)); | 406 | (p->numHashBytes == 3 ? 24 : 32)); |
| 407 | if (numBits > nbMax) | 407 | if (numBits >= nbMax) |
| 408 | numBits = nbMax; | 408 | numBits = nbMax; |
| 409 | if (numBits >= 32) | 409 | if (numBits >= 32) |
| 410 | hs = (UInt32)0 - 1; | 410 | hs = (UInt32)0 - 1; |
| @@ -416,14 +416,14 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, | |||
| 416 | hs |= (256 << kLzHash_CrcShift_2) - 1; | 416 | hs |= (256 << kLzHash_CrcShift_2) - 1; |
| 417 | { | 417 | { |
| 418 | const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize); | 418 | const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize); |
| 419 | if (hs > hs2) | 419 | if (hs >= hs2) |
| 420 | hs = hs2; | 420 | hs = hs2; |
| 421 | } | 421 | } |
| 422 | hsCur = hs; | 422 | hsCur = hs; |
| 423 | if (p->expectedDataSize < historySize) | 423 | if (p->expectedDataSize < historySize) |
| 424 | { | 424 | { |
| 425 | const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize); | 425 | const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize); |
| 426 | if (hsCur > hs2) | 426 | if (hsCur >= hs2) |
| 427 | hsCur = hs2; | 427 | hsCur = hs2; |
| 428 | } | 428 | } |
| 429 | } | 429 | } |
| @@ -434,7 +434,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, | |||
| 434 | if (p->expectedDataSize < historySize) | 434 | if (p->expectedDataSize < historySize) |
| 435 | { | 435 | { |
| 436 | hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize); | 436 | hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize); |
| 437 | if (hsCur > hs) // is it possible? | 437 | if (hsCur >= hs) // is it possible? |
| 438 | hsCur = hs; | 438 | hsCur = hs; |
| 439 | } | 439 | } |
| 440 | } | 440 | } |
| @@ -890,7 +890,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, | |||
| 890 | return d; | 890 | return d; |
| 891 | { | 891 | { |
| 892 | const Byte *pb = cur - delta; | 892 | const Byte *pb = cur - delta; |
| 893 | curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; | 893 | curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)]; |
| 894 | if (pb[maxLen] == cur[maxLen] && *pb == *cur) | 894 | if (pb[maxLen] == cur[maxLen] && *pb == *cur) |
| 895 | { | 895 | { |
| 896 | UInt32 len = 0; | 896 | UInt32 len = 0; |
| @@ -925,7 +925,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, | |||
| 925 | break; | 925 | break; |
| 926 | { | 926 | { |
| 927 | ptrdiff_t diff; | 927 | ptrdiff_t diff; |
| 928 | curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; | 928 | curMatch = son[_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)]; |
| 929 | diff = (ptrdiff_t)0 - (ptrdiff_t)delta; | 929 | diff = (ptrdiff_t)0 - (ptrdiff_t)delta; |
| 930 | if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff]) | 930 | if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff]) |
| 931 | { | 931 | { |
| @@ -972,7 +972,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt | |||
| 972 | // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } | 972 | // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; } |
| 973 | 973 | ||
| 974 | cmCheck = (UInt32)(pos - _cyclicBufferSize); | 974 | cmCheck = (UInt32)(pos - _cyclicBufferSize); |
| 975 | if ((UInt32)pos <= _cyclicBufferSize) | 975 | if ((UInt32)pos < _cyclicBufferSize) |
| 976 | cmCheck = 0; | 976 | cmCheck = 0; |
| 977 | 977 | ||
| 978 | if (cmCheck < curMatch) | 978 | if (cmCheck < curMatch) |
| @@ -980,7 +980,7 @@ UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byt | |||
| 980 | { | 980 | { |
| 981 | const UInt32 delta = pos - curMatch; | 981 | const UInt32 delta = pos - curMatch; |
| 982 | { | 982 | { |
| 983 | CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); | 983 | CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1); |
| 984 | const Byte *pb = cur - delta; | 984 | const Byte *pb = cur - delta; |
| 985 | unsigned len = (len0 < len1 ? len0 : len1); | 985 | unsigned len = (len0 < len1 ? len0 : len1); |
| 986 | const UInt32 pair0 = pair[0]; | 986 | const UInt32 pair0 = pair[0]; |
| @@ -1039,7 +1039,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const | |||
| 1039 | UInt32 cmCheck; | 1039 | UInt32 cmCheck; |
| 1040 | 1040 | ||
| 1041 | cmCheck = (UInt32)(pos - _cyclicBufferSize); | 1041 | cmCheck = (UInt32)(pos - _cyclicBufferSize); |
| 1042 | if ((UInt32)pos <= _cyclicBufferSize) | 1042 | if ((UInt32)pos < _cyclicBufferSize) |
| 1043 | cmCheck = 0; | 1043 | cmCheck = 0; |
| 1044 | 1044 | ||
| 1045 | if (// curMatch >= pos || // failure | 1045 | if (// curMatch >= pos || // failure |
| @@ -1048,7 +1048,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const | |||
| 1048 | { | 1048 | { |
| 1049 | const UInt32 delta = pos - curMatch; | 1049 | const UInt32 delta = pos - curMatch; |
| 1050 | { | 1050 | { |
| 1051 | CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); | 1051 | CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + (_cyclicBufferPos < delta ? _cyclicBufferSize : 0)) << 1); |
| 1052 | const Byte *pb = cur - delta; | 1052 | const Byte *pb = cur - delta; |
| 1053 | unsigned len = (len0 < len1 ? len0 : len1); | 1053 | unsigned len = (len0 < len1 ? len0 : len1); |
| 1054 | if (pb[len] == cur[len]) | 1054 | if (pb[len] == cur[len]) |
| @@ -1595,7 +1595,7 @@ static void Bt5_MatchFinder_Skip(void *_p, UInt32 num) | |||
| 1595 | UInt32 pos = p->pos; \ | 1595 | UInt32 pos = p->pos; \ |
| 1596 | UInt32 num2 = num; \ | 1596 | UInt32 num2 = num; \ |
| 1597 | /* (p->pos == p->posLimit) is not allowed here !!! */ \ | 1597 | /* (p->pos == p->posLimit) is not allowed here !!! */ \ |
| 1598 | { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \ | 1598 | { const UInt32 rem = p->posLimit - pos; if (num2 >= rem) num2 = rem; } \ |
| 1599 | num -= num2; \ | 1599 | num -= num2; \ |
| 1600 | { const UInt32 cycPos = p->cyclicBufferPos; \ | 1600 | { const UInt32 cycPos = p->cyclicBufferPos; \ |
| 1601 | son = p->son + cycPos; \ | 1601 | son = p->son + cycPos; \ |
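Two patterns repeat through these LzFind.c hunks. The comparison flips (`>` to `>=`, `<=` to `<`) are value-preserving: in the boundary case the assignment writes the value the variable already holds (e.g. at pos == cyclicBufferSize, cmCheck is 0 either way), so they read as codegen tweaks. Likewise, `(delta > _cyclicBufferPos)` became `(_cyclicBufferPos < delta)`, the same predicate with operands swapped. The recurring expression itself steps backward through the cyclic buffer, as this standalone sketch shows (names are local to the example):

```c
#include <assert.h>
#include <stddef.h>

/* walk back (delta) positions from (pos), wrapping at (bufSize);
   preconditions: pos < bufSize and 0 < delta <= bufSize */
static size_t cyclic_back(size_t pos, size_t delta, size_t bufSize)
{
  /* unsigned wraparound in (pos - delta) is corrected by adding bufSize */
  return pos - delta + (pos < delta ? bufSize : 0);
}

int main(void)
{
  assert(cyclic_back(10, 3, 16) == 7);   /* no wrap */
  assert(cyclic_back(2, 5, 16) == 13);   /* wraps: 2 - 5 + 16 */
  return 0;
}
```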
diff --git a/C/LzFindMt.c b/C/LzFindMt.c index ac9d59d..25fcc46 100644 --- a/C/LzFindMt.c +++ b/C/LzFindMt.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* LzFindMt.c -- multithreaded Match finder for LZ algorithms | 1 | /* LzFindMt.c -- multithreaded Match finder for LZ algorithms |
| 2 | 2024-01-22 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -82,6 +82,8 @@ extern UInt64 g_NumIters_Bytes; | |||
| 82 | Z7_NO_INLINE | 82 | Z7_NO_INLINE |
| 83 | static void MtSync_Construct(CMtSync *p) | 83 | static void MtSync_Construct(CMtSync *p) |
| 84 | { | 84 | { |
| 85 | p->affinityGroup = -1; | ||
| 86 | p->affinityInGroup = 0; | ||
| 85 | p->affinity = 0; | 87 | p->affinity = 0; |
| 86 | p->wasCreated = False; | 88 | p->wasCreated = False; |
| 87 | p->csWasInitialized = False; | 89 | p->csWasInitialized = False; |
| @@ -259,6 +261,12 @@ static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void * | |||
| 259 | // return ERROR_TOO_MANY_POSTS; // for debug | 261 | // return ERROR_TOO_MANY_POSTS; // for debug |
| 260 | // return EINVAL; // for debug | 262 | // return EINVAL; // for debug |
| 261 | 263 | ||
| 264 | #ifdef _WIN32 | ||
| 265 | if (p->affinityGroup >= 0) | ||
| 266 | wres = Thread_Create_With_Group(&p->thread, startAddress, obj, | ||
| 267 | (unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup); | ||
| 268 | else | ||
| 269 | #endif | ||
| 262 | if (p->affinity != 0) | 270 | if (p->affinity != 0) |
| 263 | wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity); | 271 | wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity); |
| 264 | else | 272 | else |
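Condensed, thread creation in MtSync_Create_WRes() now has three tiers, with the group request taking precedence on Windows. A restatement with an illustrative wrapper name (the field meanings follow the hunk: affinityGroup < 0 means no group requested):

```c
#include "LzFindMt.h"

static WRes create_sync_thread(CMtSync *p, THREAD_FUNC_TYPE func, void *obj)
{
#ifdef _WIN32
  if (p->affinityGroup >= 0)   /* group affinity wins over the plain mask */
    return Thread_Create_With_Group(&p->thread, func, obj,
        (unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup);
#endif
  if (p->affinity != 0)
    return Thread_Create_With_Affinity(&p->thread, func, obj, (CAffinityMask)p->affinity);
  return Thread_Create(&p->thread, func, obj);
}
```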
diff --git a/C/LzFindMt.h b/C/LzFindMt.h index fcb479d..89984f5 100644 --- a/C/LzFindMt.h +++ b/C/LzFindMt.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* LzFindMt.h -- multithreaded Match finder for LZ algorithms | 1 | /* LzFindMt.h -- multithreaded Match finder for LZ algorithms |
| 2 | 2024-01-22 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_LZ_FIND_MT_H | 4 | #ifndef ZIP7_INC_LZ_FIND_MT_H |
| 5 | #define ZIP7_INC_LZ_FIND_MT_H | 5 | #define ZIP7_INC_LZ_FIND_MT_H |
| @@ -12,8 +12,10 @@ EXTERN_C_BEGIN | |||
| 12 | typedef struct | 12 | typedef struct |
| 13 | { | 13 | { |
| 14 | UInt32 numProcessedBlocks; | 14 | UInt32 numProcessedBlocks; |
| 15 | CThread thread; | 15 | Int32 affinityGroup; |
| 16 | UInt64 affinityInGroup; | ||
| 16 | UInt64 affinity; | 17 | UInt64 affinity; |
| 18 | CThread thread; | ||
| 17 | 19 | ||
| 18 | BoolInt wasCreated; | 20 | BoolInt wasCreated; |
| 19 | BoolInt needStart; | 21 | BoolInt needStart; |
diff --git a/C/Lzma2Enc.c b/C/Lzma2Enc.c index 703e146..72aec69 100644 --- a/C/Lzma2Enc.c +++ b/C/Lzma2Enc.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Lzma2Enc.c -- LZMA2 Encoder | 1 | /* Lzma2Enc.c -- LZMA2 Encoder |
| 2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -235,6 +235,7 @@ void Lzma2EncProps_Init(CLzma2EncProps *p) | |||
| 235 | p->numBlockThreads_Reduced = -1; | 235 | p->numBlockThreads_Reduced = -1; |
| 236 | p->numBlockThreads_Max = -1; | 236 | p->numBlockThreads_Max = -1; |
| 237 | p->numTotalThreads = -1; | 237 | p->numTotalThreads = -1; |
| 238 | p->numThreadGroups = 0; | ||
| 238 | } | 239 | } |
| 239 | 240 | ||
| 240 | void Lzma2EncProps_Normalize(CLzma2EncProps *p) | 241 | void Lzma2EncProps_Normalize(CLzma2EncProps *p) |
| @@ -781,6 +782,7 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle p, | |||
| 781 | } | 782 | } |
| 782 | 783 | ||
| 783 | p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max; | 784 | p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max; |
| 785 | p->mtCoder.numThreadGroups = p->props.numThreadGroups; | ||
| 784 | p->mtCoder.expectedDataSize = p->expectedDataSize; | 786 | p->mtCoder.expectedDataSize = p->expectedDataSize; |
| 785 | 787 | ||
| 786 | { | 788 | { |
diff --git a/C/Lzma2Enc.h b/C/Lzma2Enc.h index cb25275..1e6b50c 100644 --- a/C/Lzma2Enc.h +++ b/C/Lzma2Enc.h | |||
| @@ -18,6 +18,7 @@ typedef struct | |||
| 18 | int numBlockThreads_Reduced; | 18 | int numBlockThreads_Reduced; |
| 19 | int numBlockThreads_Max; | 19 | int numBlockThreads_Max; |
| 20 | int numTotalThreads; | 20 | int numTotalThreads; |
| 21 | unsigned numThreadGroups; // 0 : no groups | ||
| 21 | } CLzma2EncProps; | 22 | } CLzma2EncProps; |
| 22 | 23 | ||
| 23 | void Lzma2EncProps_Init(CLzma2EncProps *p); | 24 | void Lzma2EncProps_Init(CLzma2EncProps *p); |
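A minimal sketch of opting in to the new field (the thread counts are illustrative; per the header comment, 0 keeps the previous single-group behavior):

```c
#include "Lzma2Enc.h"

static void init_props_with_groups(CLzma2EncProps *props)
{
  Lzma2EncProps_Init(props);
  props->numTotalThreads = 32;
  props->numThreadGroups = 2;  /* 0 : no groups (previous behavior) */
}
```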
diff --git a/C/LzmaEnc.c b/C/LzmaEnc.c index 088b78f..84a29a5 100644 --- a/C/LzmaEnc.c +++ b/C/LzmaEnc.c | |||
| @@ -62,7 +62,9 @@ void LzmaEncProps_Init(CLzmaEncProps *p) | |||
| 62 | p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1; | 62 | p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1; |
| 63 | p->numHashOutBits = 0; | 63 | p->numHashOutBits = 0; |
| 64 | p->writeEndMark = 0; | 64 | p->writeEndMark = 0; |
| 65 | p->affinityGroup = -1; | ||
| 65 | p->affinity = 0; | 66 | p->affinity = 0; |
| 67 | p->affinityInGroup = 0; | ||
| 66 | } | 68 | } |
| 67 | 69 | ||
| 68 | void LzmaEncProps_Normalize(CLzmaEncProps *p) | 70 | void LzmaEncProps_Normalize(CLzmaEncProps *p) |
| @@ -598,6 +600,10 @@ SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props2) | |||
| 598 | p->multiThread = (props.numThreads > 1); | 600 | p->multiThread = (props.numThreads > 1); |
| 599 | p->matchFinderMt.btSync.affinity = | 601 | p->matchFinderMt.btSync.affinity = |
| 600 | p->matchFinderMt.hashSync.affinity = props.affinity; | 602 | p->matchFinderMt.hashSync.affinity = props.affinity; |
| 603 | p->matchFinderMt.btSync.affinityGroup = | ||
| 604 | p->matchFinderMt.hashSync.affinityGroup = props.affinityGroup; | ||
| 605 | p->matchFinderMt.btSync.affinityInGroup = | ||
| 606 | p->matchFinderMt.hashSync.affinityInGroup = props.affinityInGroup; | ||
| 601 | #endif | 607 | #endif |
| 602 | 608 | ||
| 603 | return SZ_OK; | 609 | return SZ_OK; |
diff --git a/C/LzmaEnc.h b/C/LzmaEnc.h index 9f8039a..3feb5b4 100644 --- a/C/LzmaEnc.h +++ b/C/LzmaEnc.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* LzmaEnc.h -- LZMA Encoder | 1 | /* LzmaEnc.h -- LZMA Encoder |
| 2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_LZMA_ENC_H | 4 | #ifndef ZIP7_INC_LZMA_ENC_H |
| 5 | #define ZIP7_INC_LZMA_ENC_H | 5 | #define ZIP7_INC_LZMA_ENC_H |
| @@ -29,11 +29,13 @@ typedef struct | |||
| 29 | int numThreads; /* 1 or 2, default = 2 */ | 29 | int numThreads; /* 1 or 2, default = 2 */ |
| 30 | 30 | ||
| 31 | // int _pad; | 31 | // int _pad; |
| 32 | Int32 affinityGroup; | ||
| 32 | 33 | ||
| 33 | UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1. | 34 | UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1. |
| 34 | Encoder uses this value to reduce dictionary size */ | 35 | Encoder uses this value to reduce dictionary size */ |
| 35 | 36 | ||
| 36 | UInt64 affinity; | 37 | UInt64 affinity; |
| 38 | UInt64 affinityInGroup; | ||
| 37 | } CLzmaEncProps; | 39 | } CLzmaEncProps; |
| 38 | 40 | ||
| 39 | void LzmaEncProps_Init(CLzmaEncProps *p); | 41 | void LzmaEncProps_Init(CLzmaEncProps *p); |
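And the matching sketch for the LZMA-level properties (the values are illustrative; -1 and 0 are the "not used" defaults set by LzmaEncProps_Init() in the hunk above):

```c
#include "LzmaEnc.h"

static void pin_match_finder_threads(CLzmaEncProps *props)
{
  LzmaEncProps_Init(props);
  props->numThreads = 2;         /* enable the multithreaded match finder */
  props->affinityGroup = 1;      /* processor group index; -1 : not used */
  props->affinityInGroup = 0xF;  /* mask within that group; 0 : all allowed bits */
}
```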
diff --git a/C/MtCoder.c b/C/MtCoder.c index 03959b6..923b19a 100644 --- a/C/MtCoder.c +++ b/C/MtCoder.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* MtCoder.c -- Multi-thread Coder | 1 | /* MtCoder.c -- Multi-thread Coder |
| 2 | 2023-09-07 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -39,14 +39,28 @@ void MtProgressThunk_CreateVTable(CMtProgressThunk *p) | |||
| 39 | static THREAD_FUNC_DECL ThreadFunc(void *pp); | 39 | static THREAD_FUNC_DECL ThreadFunc(void *pp); |
| 40 | 40 | ||
| 41 | 41 | ||
| 42 | static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) | 42 | static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t |
| 43 | #ifdef _WIN32 | ||
| 44 | , CMtCoder * const mtc | ||
| 45 | #endif | ||
| 46 | ) | ||
| 43 | { | 47 | { |
| 44 | WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent); | 48 | WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent); |
| 49 | // printf("\n====== MtCoderThread_CreateAndStart : \n"); | ||
| 45 | if (wres == 0) | 50 | if (wres == 0) |
| 46 | { | 51 | { |
| 47 | t->stop = False; | 52 | t->stop = False; |
| 48 | if (!Thread_WasCreated(&t->thread)) | 53 | if (!Thread_WasCreated(&t->thread)) |
| 49 | wres = Thread_Create(&t->thread, ThreadFunc, t); | 54 | { |
| 55 | #ifdef _WIN32 | ||
| 56 | if (mtc->numThreadGroups) | ||
| 57 | wres = Thread_Create_With_Group(&t->thread, ThreadFunc, t, | ||
| 58 | ThreadNextGroup_GetNext(&mtc->nextGroup), // group | ||
| 59 | 0); // affinityMask | ||
| 60 | else | ||
| 61 | #endif | ||
| 62 | wres = Thread_Create(&t->thread, ThreadFunc, t); | ||
| 63 | } | ||
| 50 | if (wres == 0) | 64 | if (wres == 0) |
| 51 | wres = Event_Set(&t->startEvent); | 65 | wres = Event_Set(&t->startEvent); |
| 52 | } | 66 | } |
| @@ -56,6 +70,7 @@ static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) | |||
| 56 | } | 70 | } |
| 57 | 71 | ||
| 58 | 72 | ||
| 73 | Z7_FORCE_INLINE | ||
| 59 | static void MtCoderThread_Destruct(CMtCoderThread *t) | 74 | static void MtCoderThread_Destruct(CMtCoderThread *t) |
| 60 | { | 75 | { |
| 61 | if (Thread_WasCreated(&t->thread)) | 76 | if (Thread_WasCreated(&t->thread)) |
| @@ -85,7 +100,7 @@ static void MtCoderThread_Destruct(CMtCoderThread *t) | |||
| 85 | 100 | ||
| 86 | static SRes ThreadFunc2(CMtCoderThread *t) | 101 | static SRes ThreadFunc2(CMtCoderThread *t) |
| 87 | { | 102 | { |
| 88 | CMtCoder *mtc = t->mtCoder; | 103 | CMtCoder * const mtc = t->mtCoder; |
| 89 | 104 | ||
| 90 | for (;;) | 105 | for (;;) |
| 91 | { | 106 | { |
| @@ -185,7 +200,11 @@ static SRes ThreadFunc2(CMtCoderThread *t) | |||
| 185 | if (mtc->numStartedThreads < mtc->numStartedThreadsLimit | 200 | if (mtc->numStartedThreads < mtc->numStartedThreadsLimit |
| 186 | && mtc->expectedDataSize != readProcessed) | 201 | && mtc->expectedDataSize != readProcessed) |
| 187 | { | 202 | { |
| 188 | res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]); | 203 | res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads] |
| 204 | #ifdef _WIN32 | ||
| 205 | , mtc | ||
| 206 | #endif | ||
| 207 | ); | ||
| 189 | if (res == SZ_OK) | 208 | if (res == SZ_OK) |
| 190 | mtc->numStartedThreads++; | 209 | mtc->numStartedThreads++; |
| 191 | else | 210 | else |
| @@ -221,7 +240,7 @@ static SRes ThreadFunc2(CMtCoderThread *t) | |||
| 221 | } | 240 | } |
| 222 | 241 | ||
| 223 | { | 242 | { |
| 224 | CMtCoderBlock *block = &mtc->blocks[bi]; | 243 | CMtCoderBlock * const block = &mtc->blocks[bi]; |
| 225 | block->res = res; | 244 | block->res = res; |
| 226 | block->bufIndex = bufIndex; | 245 | block->bufIndex = bufIndex; |
| 227 | block->finished = finished; | 246 | block->finished = finished; |
| @@ -311,7 +330,7 @@ static SRes ThreadFunc2(CMtCoderThread *t) | |||
| 311 | 330 | ||
| 312 | static THREAD_FUNC_DECL ThreadFunc(void *pp) | 331 | static THREAD_FUNC_DECL ThreadFunc(void *pp) |
| 313 | { | 332 | { |
| 314 | CMtCoderThread *t = (CMtCoderThread *)pp; | 333 | CMtCoderThread * const t = (CMtCoderThread *)pp; |
| 315 | for (;;) | 334 | for (;;) |
| 316 | { | 335 | { |
| 317 | if (Event_Wait(&t->startEvent) != 0) | 336 | if (Event_Wait(&t->startEvent) != 0) |
| @@ -319,7 +338,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp) | |||
| 319 | if (t->stop) | 338 | if (t->stop) |
| 320 | return 0; | 339 | return 0; |
| 321 | { | 340 | { |
| 322 | SRes res = ThreadFunc2(t); | 341 | const SRes res = ThreadFunc2(t); |
| 323 | CMtCoder *mtc = t->mtCoder; | 342 | CMtCoder *mtc = t->mtCoder; |
| 324 | if (res != SZ_OK) | 343 | if (res != SZ_OK) |
| 325 | { | 344 | { |
| @@ -328,7 +347,7 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp) | |||
| 328 | 347 | ||
| 329 | #ifndef MTCODER_USE_WRITE_THREAD | 348 | #ifndef MTCODER_USE_WRITE_THREAD |
| 330 | { | 349 | { |
| 331 | unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); | 350 | const unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); |
| 332 | if (numFinished == mtc->numStartedThreads) | 351 | if (numFinished == mtc->numStartedThreads) |
| 333 | if (Event_Set(&mtc->finishedEvent) != 0) | 352 | if (Event_Set(&mtc->finishedEvent) != 0) |
| 334 | return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; | 353 | return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; |
| @@ -346,6 +365,7 @@ void MtCoder_Construct(CMtCoder *p) | |||
| 346 | 365 | ||
| 347 | p->blockSize = 0; | 366 | p->blockSize = 0; |
| 348 | p->numThreadsMax = 0; | 367 | p->numThreadsMax = 0; |
| 368 | p->numThreadGroups = 0; | ||
| 349 | p->expectedDataSize = (UInt64)(Int64)-1; | 369 | p->expectedDataSize = (UInt64)(Int64)-1; |
| 350 | 370 | ||
| 351 | p->inStream = NULL; | 371 | p->inStream = NULL; |
| @@ -429,6 +449,8 @@ SRes MtCoder_Code(CMtCoder *p) | |||
| 429 | unsigned i; | 449 | unsigned i; |
| 430 | SRes res = SZ_OK; | 450 | SRes res = SZ_OK; |
| 431 | 451 | ||
| 452 | // printf("\n====== MtCoder_Code : \n"); | ||
| 453 | |||
| 432 | if (numThreads > MTCODER_THREADS_MAX) | 454 | if (numThreads > MTCODER_THREADS_MAX) |
| 433 | numThreads = MTCODER_THREADS_MAX; | 455 | numThreads = MTCODER_THREADS_MAX; |
| 434 | numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); | 456 | numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); |
| @@ -492,11 +514,22 @@ SRes MtCoder_Code(CMtCoder *p) | |||
| 492 | 514 | ||
| 493 | p->numStartedThreadsLimit = numThreads; | 515 | p->numStartedThreadsLimit = numThreads; |
| 494 | p->numStartedThreads = 0; | 516 | p->numStartedThreads = 0; |
| 517 | ThreadNextGroup_Init(&p->nextGroup, p->numThreadGroups, 0); // startGroup | ||
| 495 | 518 | ||
| 496 | // for (i = 0; i < numThreads; i++) | 519 | // for (i = 0; i < numThreads; i++) |
| 497 | { | 520 | { |
| 521 | // here we create a new thread for the first block. | ||
| 522 | // And each new thread will create another new thread after block reading, | ||
| 523 | // until numStartedThreadsLimit is reached. | ||
| 498 | CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++]; | 524 | CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++]; |
| 499 | RINOK(MtCoderThread_CreateAndStart(nextThread)) | 525 | { |
| 526 | const SRes res2 = MtCoderThread_CreateAndStart(nextThread | ||
| 527 | #ifdef _WIN32 | ||
| 528 | , p | ||
| 529 | #endif | ||
| 530 | ); | ||
| 531 | RINOK(res2) | ||
| 532 | } | ||
| 500 | } | 533 | } |
| 501 | 534 | ||
| 502 | RINOK_THREAD(Event_Set(&p->readEvent)) | 535 | RINOK_THREAD(Event_Set(&p->readEvent)) |
| @@ -513,9 +546,9 @@ SRes MtCoder_Code(CMtCoder *p) | |||
| 513 | RINOK_THREAD(Event_Wait(&p->writeEvents[bi])) | 546 | RINOK_THREAD(Event_Wait(&p->writeEvents[bi])) |
| 514 | 547 | ||
| 515 | { | 548 | { |
| 516 | const CMtCoderBlock *block = &p->blocks[bi]; | 549 | const CMtCoderBlock * const block = &p->blocks[bi]; |
| 517 | unsigned bufIndex = block->bufIndex; | 550 | const unsigned bufIndex = block->bufIndex; |
| 518 | BoolInt finished = block->finished; | 551 | const BoolInt finished = block->finished; |
| 519 | if (res == SZ_OK && block->res != SZ_OK) | 552 | if (res == SZ_OK && block->res != SZ_OK) |
| 520 | res = block->res; | 553 | res = block->res; |
| 521 | 554 | ||
| @@ -545,7 +578,7 @@ SRes MtCoder_Code(CMtCoder *p) | |||
| 545 | } | 578 | } |
| 546 | #else | 579 | #else |
| 547 | { | 580 | { |
| 548 | WRes wres = Event_Wait(&p->finishedEvent); | 581 | const WRes wres = Event_Wait(&p->finishedEvent); |
| 549 | res = MY_SRes_HRESULT_FROM_WRes(wres); | 582 | res = MY_SRes_HRESULT_FROM_WRes(wres); |
| 550 | } | 583 | } |
| 551 | #endif | 584 | #endif |
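ThreadNextGroup_Init() / ThreadNextGroup_GetNext() are not part of this section's hunks (CThreadNextGroup comes from the threading layer), so the following is only a plausible round-robin reading of how MtCoder_Code() appears to use them: seeded with (numThreadGroups, startGroup = 0), each newly started worker consumes the next group index.

```c
/* Illustrative sketch only; not the actual CThreadNextGroup implementation. */
typedef struct
{
  unsigned numGroups;
  unsigned next;
} SketchNextGroup;

static void SketchNextGroup_Init(SketchNextGroup *p, unsigned numGroups, unsigned startGroup)
{
  p->numGroups = numGroups;
  p->next = startGroup;
}

static unsigned SketchNextGroup_GetNext(SketchNextGroup *p)
{
  /* MtCoder only calls this when numThreadGroups != 0 */
  const unsigned group = p->next % p->numGroups;
  p->next = group + 1;
  return group;
}
```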
diff --git a/C/MtCoder.h b/C/MtCoder.h index 1231d3c..8166cca 100644 --- a/C/MtCoder.h +++ b/C/MtCoder.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* MtCoder.h -- Multi-thread Coder | 1 | /* MtCoder.h -- Multi-thread Coder |
| 2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_MT_CODER_H | 4 | #ifndef ZIP7_INC_MT_CODER_H |
| 5 | #define ZIP7_INC_MT_CODER_H | 5 | #define ZIP7_INC_MT_CODER_H |
| @@ -16,7 +16,7 @@ EXTERN_C_BEGIN | |||
| 16 | 16 | ||
| 17 | #ifndef Z7_ST | 17 | #ifndef Z7_ST |
| 18 | #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1) | 18 | #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1) |
| 19 | #define MTCODER_THREADS_MAX 64 | 19 | #define MTCODER_THREADS_MAX 256 |
| 20 | #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3) | 20 | #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3) |
| 21 | #else | 21 | #else |
| 22 | #define MTCODER_THREADS_MAX 1 | 22 | #define MTCODER_THREADS_MAX 1 |
| @@ -77,6 +77,7 @@ typedef struct CMtCoder_ | |||
| 77 | 77 | ||
| 78 | size_t blockSize; /* size of input block */ | 78 | size_t blockSize; /* size of input block */ |
| 79 | unsigned numThreadsMax; | 79 | unsigned numThreadsMax; |
| 80 | unsigned numThreadGroups; | ||
| 80 | UInt64 expectedDataSize; | 81 | UInt64 expectedDataSize; |
| 81 | 82 | ||
| 82 | ISeqInStreamPtr inStream; | 83 | ISeqInStreamPtr inStream; |
| @@ -125,6 +126,8 @@ typedef struct CMtCoder_ | |||
| 125 | CMtProgress mtProgress; | 126 | CMtProgress mtProgress; |
| 126 | CMtCoderBlock blocks[MTCODER_BLOCKS_MAX]; | 127 | CMtCoderBlock blocks[MTCODER_BLOCKS_MAX]; |
| 127 | CMtCoderThread threads[MTCODER_THREADS_MAX]; | 128 | CMtCoderThread threads[MTCODER_THREADS_MAX]; |
| 129 | |||
| 130 | CThreadNextGroup nextGroup; | ||
| 128 | } CMtCoder; | 131 | } CMtCoder; |
| 129 | 132 | ||
| 130 | 133 | ||
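Worked numbers for the raised limit: MTCODER_GET_NUM_BLOCKS_FROM_THREADS(256) = 256 + 256/8 + 1 = 289, so MTCODER_BLOCKS_MAX grows to 289 + 3 = 292; under the old 64-thread cap it was 64 + 8 + 1 + 3 = 76.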
diff --git a/C/Sha512.c b/C/Sha512.c --- a/C/Sha512.c +++ b/C/Sha512.c | |||
| @@ -439,26 +439,78 @@ void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize) | |||
| 439 | 439 | ||
| 440 | 440 | ||
| 441 | 441 | ||
| 442 | // #define Z7_SHA512_PROBE_DEBUG // for debug | ||
| 442 | 443 | ||
| 443 | #if defined(_WIN32) && defined(Z7_COMPILER_SHA512_SUPPORTED) \ | 444 | #if defined(Z7_SHA512_PROBE_DEBUG) || defined(Z7_COMPILER_SHA512_SUPPORTED) |
| 444 | && defined(MY_CPU_ARM64) // we can disable this check to debug in x64 | ||
| 445 | 445 | ||
| 446 | #if 1 // 0 for debug | 446 | #if defined(Z7_SHA512_PROBE_DEBUG) \ |
| 447 | || defined(_WIN32) && defined(MY_CPU_ARM64) | ||
| 448 | #ifndef Z7_SHA512_USE_PROBE | ||
| 449 | #define Z7_SHA512_USE_PROBE | ||
| 450 | #endif | ||
| 451 | #endif | ||
| 447 | 452 | ||
| 448 | #include "7zWindows.h" | 453 | #ifdef Z7_SHA512_USE_PROBE |
| 449 | // #include <stdio.h> | 454 | |
| 450 | #if 0 && defined(MY_CPU_X86_OR_AMD64) | 455 | #ifdef Z7_SHA512_PROBE_DEBUG |
| 451 | #include <intrin.h> // for debug : for __ud2() | 456 | #include <stdio.h> |
| 457 | #define PRF(x) x | ||
| 458 | #else | ||
| 459 | #define PRF(x) | ||
| 452 | #endif | 460 | #endif |
| 453 | 461 | ||
| 454 | BoolInt CPU_IsSupported_SHA512(void) | 462 | #if 0 || !defined(_MSC_VER) // 1 || : for debug LONGJMP mode |
| 463 | // MINGW doesn't support __try. So we use signal() / longjmp(). | ||
| 464 | // Note: signal() / longjmp() probably is not thread-safe. | ||
| 465 | // So we must call Sha512Prepare() from main thread at program start. | ||
| 466 | #ifndef Z7_SHA512_USE_LONGJMP | ||
| 467 | #define Z7_SHA512_USE_LONGJMP | ||
| 468 | #endif | ||
| 469 | #endif | ||
| 470 | |||
| 471 | #ifdef Z7_SHA512_USE_LONGJMP | ||
| 472 | #include <signal.h> | ||
| 473 | #include <setjmp.h> | ||
| 474 | static jmp_buf g_Sha512_jmp_buf; | ||
| 475 | // static int g_Sha512_Unsupported; | ||
| 476 | |||
| 477 | #if defined(__GNUC__) && (__GNUC__ >= 8) \ | ||
| 478 | || defined(__clang__) && (__clang_major__ >= 3) | ||
| 479 | __attribute__((noreturn)) | ||
| 480 | #endif | ||
| 481 | static void Z7_CDECL Sha512_signal_Handler(int v) | ||
| 455 | { | 482 | { |
| 483 | PRF(printf("======== Sha512_signal_Handler = %x\n", (unsigned)v);) | ||
| 484 | // g_Sha512_Unsupported = 1; | ||
| 485 | longjmp(g_Sha512_jmp_buf, 1); | ||
| 486 | } | ||
| 487 | #endif // Z7_SHA512_USE_LONGJMP | ||
| 488 | |||
| 489 | |||
| 490 | #if defined(_WIN32) | ||
| 491 | #include "7zWindows.h" | ||
| 492 | #endif | ||
| 493 | |||
| 456 | #if defined(MY_CPU_ARM64) | 494 | #if defined(MY_CPU_ARM64) |
| 495 | // #define Z7_SHA512_USE_SIMPLIFIED_PROBE // for debug | ||
| 496 | #endif | ||
| 497 | |||
| 498 | #ifdef Z7_SHA512_USE_SIMPLIFIED_PROBE | ||
| 499 | #include <arm_neon.h> | ||
| 500 | #if defined(__clang__) | ||
| 501 | __attribute__((__target__("sha3"))) | ||
| 502 | #elif !defined(_MSC_VER) | ||
| 503 | __attribute__((__target__("arch=armv8.2-a+sha3"))) | ||
| 504 | #endif | ||
| 505 | #endif | ||
| 506 | static BoolInt CPU_IsSupported_SHA512_Probe(void) | ||
| 507 | { | ||
| 508 | PRF(printf("\n== CPU_IsSupported_SHA512_Probe\n");) | ||
| 509 | #if defined(_WIN32) && defined(MY_CPU_ARM64) | ||
| 457 | // we have no SHA512 flag for IsProcessorFeaturePresent() still. | 510 | // we have no SHA512 flag for IsProcessorFeaturePresent() still. |
| 458 | if (!CPU_IsSupported_CRYPTO()) | 511 | if (!CPU_IsSupported_CRYPTO()) |
| 459 | return False; | 512 | return False; |
| 460 | #endif | 513 | PRF(printf("==== Registry check\n");) |
| 461 | // printf("\nCPU_IsSupported_SHA512\n"); | ||
| 462 | { | 514 | { |
| 463 | // we can't read ID_AA64ISAR0_EL1 register from application. | 515 | // we can't read ID_AA64ISAR0_EL1 register from application. |
| 464 | // but ID_AA64ISAR0_EL1 register is mapped to "CP 4030" registry value. | 516 | // but ID_AA64ISAR0_EL1 register is mapped to "CP 4030" registry value. |
| @@ -486,6 +538,7 @@ BoolInt CPU_IsSupported_SHA512(void) | |||
| 486 | // 2 : SHA256 and SHA512 implemented | 538 | // 2 : SHA256 and SHA512 implemented |
| 487 | } | 539 | } |
| 488 | } | 540 | } |
| 541 | #endif // defined(_WIN32) && defined(MY_CPU_ARM64) | ||
| 489 | 542 | ||
| 490 | 543 | ||
| 491 | #if 1 // 0 for debug to disable SHA512 PROBE code | 544 | #if 1 // 0 for debug to disable SHA512 PROBE code |
| @@ -509,59 +562,97 @@ Does this PROBE code work in native Windows-arm64 (with/without sha512 hw instru | |||
| 509 | Are there any ways to fix the problems with arm64-wine and x64-SDE cases? | 562 | Are there any ways to fix the problems with arm64-wine and x64-SDE cases? |
| 510 | */ | 563 | */ |
| 511 | 564 | ||
| 512 | // printf("\n========== CPU_IsSupported_SHA512 PROBE ========\n"); | 565 | PRF(printf("==== CPU_IsSupported_SHA512 PROBE\n");) |
| 513 | { | 566 | { |
| 567 | BoolInt isSupported = False; | ||
| 568 | #ifdef Z7_SHA512_USE_LONGJMP | ||
| 569 | void (Z7_CDECL *signal_prev)(int); | ||
| 570 | /* | ||
| 571 | if (g_Sha512_Unsupported) | ||
| 572 | { | ||
| 573 | PRF(printf("==== g_Sha512_Unsupported\n");) | ||
| 574 | return False; | ||
| 575 | } | ||
| 576 | */ | ||
| 577 | PRF(printf("====== signal(SIGILL)\n");) | ||
| 578 | signal_prev = signal(SIGILL, Sha512_signal_Handler); | ||
| 579 | if (signal_prev == SIG_ERR) | ||
| 580 | { | ||
| 581 | PRF(printf("====== signal fail\n");) | ||
| 582 | return False; | ||
| 583 | } | ||
| 584 | // PRF(printf("==== signal_prev = %p\n", (void *)signal_prev);) | ||
| 585 | // docs: Before the specified function is executed, | ||
| 586 | // the value of func is set to SIG_DFL. | ||
| 587 | // So we can exit if (setjmp(g_Sha512_jmp_buf) != 0). | ||
| 588 | PRF(printf("====== setjmp\n");) | ||
| 589 | if (!setjmp(g_Sha512_jmp_buf)) | ||
| 590 | #else // Z7_SHA512_USE_LONGJMP | ||
| 591 | |||
| 592 | #ifdef _MSC_VER | ||
| 514 | #ifdef __clang_major__ | 593 | #ifdef __clang_major__ |
| 515 | #pragma GCC diagnostic ignored "-Wlanguage-extension-token" | 594 | #pragma GCC diagnostic ignored "-Wlanguage-extension-token" |
| 516 | #endif | 595 | #endif |
| 517 | __try | 596 | __try |
| 597 | #endif | ||
| 598 | #endif // Z7_SHA512_USE_LONGJMP | ||
| 599 | |||
| 518 | { | 600 | { |
| 519 | #if 0 // 1 : for debug (reduced version to detect sha512) | 601 | #if defined(Z7_COMPILER_SHA512_SUPPORTED) |
| 602 | #ifdef Z7_SHA512_USE_SIMPLIFIED_PROBE | ||
| 603 | // simplified sha512 check for arm64: | ||
| 520 | const uint64x2_t a = vdupq_n_u64(1); | 604 | const uint64x2_t a = vdupq_n_u64(1); |
| 521 | const uint64x2_t b = vsha512hq_u64(a, a, a); | 605 | const uint64x2_t b = vsha512hq_u64(a, a, a); |
| 606 | PRF(printf("======== vsha512hq_u64 probe\n");) | ||
| 522 | if ((UInt32)vgetq_lane_u64(b, 0) == 0x11800002) | 607 | if ((UInt32)vgetq_lane_u64(b, 0) == 0x11800002) |
| 523 | return True; | ||
| 524 | #else | 608 | #else |
| 525 | MY_ALIGN(16) | 609 | MY_ALIGN(16) |
| 526 | UInt64 temp[SHA512_NUM_DIGEST_WORDS + SHA512_NUM_BLOCK_WORDS]; | 610 | UInt64 temp[SHA512_NUM_DIGEST_WORDS + SHA512_NUM_BLOCK_WORDS]; |
| 527 | memset(temp, 0x5a, sizeof(temp)); | 611 | memset(temp, 0x5a, sizeof(temp)); |
| 528 | #if 0 && defined(MY_CPU_X86_OR_AMD64) | 612 | PRF(printf("======== Sha512_UpdateBlocks_HW\n");) |
| 529 | __ud2(); // for debug : that exception is not problem for SDE | ||
| 530 | #endif | ||
| 531 | #if 1 | ||
| 532 | Sha512_UpdateBlocks_HW(temp, | 613 | Sha512_UpdateBlocks_HW(temp, |
| 533 | (const Byte *)(const void *)(temp + SHA512_NUM_DIGEST_WORDS), 1); | 614 | (const Byte *)(const void *)(temp + SHA512_NUM_DIGEST_WORDS), 1); |
| 534 | // printf("\n==== t = %x\n", (UInt32)temp[0]); | 615 | // PRF(printf("======== t = %x\n", (UInt32)temp[0]);) |
| 535 | if ((UInt32)temp[0] == 0xa33cfdf7) | 616 | if ((UInt32)temp[0] == 0xa33cfdf7) |
| 617 | #endif | ||
| 536 | { | 618 | { |
| 537 | // printf("\n=== PROBE SHA512: SHA512 supported\n"); | 619 | PRF(printf("======== PROBE SHA512: SHA512 is supported\n");) |
| 538 | return True; | 620 | isSupported = True; |
| 539 | } | 621 | } |
| 622 | #else // Z7_COMPILER_SHA512_SUPPORTED | ||
| 623 | // for debug : we generate a bad instruction or raise an exception. | ||
| 624 | // __except() doesn't catch raise() calls. | ||
| 625 | #ifdef Z7_SHA512_USE_LONGJMP | ||
| 626 | PRF(printf("====== raise(SIGILL)\n");) | ||
| 627 | raise(SIGILL); | ||
| 628 | #else | ||
| 629 | #if defined(_MSC_VER) && defined(MY_CPU_X86) | ||
| 630 | __asm ud2 | ||
| 540 | #endif | 631 | #endif |
| 541 | #endif | 632 | #endif // Z7_SHA512_USE_LONGJMP |
| 633 | #endif // Z7_COMPILER_SHA512_SUPPORTED | ||
| 542 | } | 634 | } |
| 635 | |||
| 636 | #ifdef Z7_SHA512_USE_LONGJMP | ||
| 637 | PRF(printf("====== restore signal SIGILL\n");) | ||
| 638 | signal(SIGILL, signal_prev); | ||
| 639 | #elif _MSC_VER | ||
| 543 | __except (EXCEPTION_EXECUTE_HANDLER) | 640 | __except (EXCEPTION_EXECUTE_HANDLER) |
| 544 | { | 641 | { |
| 545 | // printf("\n==== CPU_IsSupported_SHA512 EXCEPTION_EXECUTE_HANDLER\n"); | 642 | PRF(printf("==== CPU_IsSupported_SHA512 __except(EXCEPTION_EXECUTE_HANDLER)\n");) |
| 546 | } | 643 | } |
| 644 | #endif | ||
| 645 | PRF(printf("== return (sha512 supported) = %d\n", isSupported);) | ||
| 646 | return isSupported; | ||
| 547 | } | 647 | } |
| 548 | return False; | ||
| 549 | #else | 648 | #else |
| 550 | // without SHA512 PROBE code | 649 | // without SHA512 PROBE code |
| 551 | return True; | 650 | return True; |
| 552 | #endif | 651 | #endif |
| 553 | |||
| 554 | } | 652 | } |
| 555 | 653 | ||
| 556 | #else | 654 | #endif // Z7_SHA512_USE_PROBE |
| 557 | 655 | #endif // defined(Z7_SHA512_PROBE_DEBUG) || defined(Z7_COMPILER_SHA512_SUPPORTED) | |
| 558 | BoolInt CPU_IsSupported_SHA512(void) | ||
| 559 | { | ||
| 560 | return False; | ||
| 561 | } | ||
| 562 | |||
| 563 | #endif | ||
| 564 | #endif // WIN32 arm64 | ||
| 565 | 656 | ||
| 566 | 657 | ||
| 567 | void Sha512Prepare(void) | 658 | void Sha512Prepare(void) |
| @@ -570,10 +661,10 @@ void Sha512Prepare(void) | |||
| 570 | SHA512_FUNC_UPDATE_BLOCKS f, f_hw; | 661 | SHA512_FUNC_UPDATE_BLOCKS f, f_hw; |
| 571 | f = Sha512_UpdateBlocks; | 662 | f = Sha512_UpdateBlocks; |
| 572 | f_hw = NULL; | 663 | f_hw = NULL; |
| 573 | #ifdef MY_CPU_X86_OR_AMD64 | 664 | #ifdef Z7_SHA512_USE_PROBE |
| 574 | if (CPU_IsSupported_SHA512() | 665 | if (CPU_IsSupported_SHA512_Probe()) |
| 575 | && CPU_IsSupported_AVX2() | 666 | #elif defined(MY_CPU_X86_OR_AMD64) |
| 576 | ) | 667 | if (CPU_IsSupported_SHA512() && CPU_IsSupported_AVX2()) |
| 577 | #else | 668 | #else |
| 578 | if (CPU_IsSupported_SHA512()) | 669 | if (CPU_IsSupported_SHA512()) |
| 579 | #endif | 670 | #endif |
| @@ -583,6 +674,8 @@ void Sha512Prepare(void) | |||
| 583 | } | 674 | } |
| 584 | g_SHA512_FUNC_UPDATE_BLOCKS = f; | 675 | g_SHA512_FUNC_UPDATE_BLOCKS = f; |
| 585 | g_SHA512_FUNC_UPDATE_BLOCKS_HW = f_hw; | 676 | g_SHA512_FUNC_UPDATE_BLOCKS_HW = f_hw; |
| 677 | #elif defined(Z7_SHA512_PROBE_DEBUG) | ||
| 678 | CPU_IsSupported_SHA512_Probe(); // for debug | ||
| 586 | #endif | 679 | #endif |
| 587 | } | 680 | } |
| 588 | 681 | ||
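Stripped of the 7-Zip plumbing, the longjmp branch of the probe reduces to the following pattern: install a SIGILL handler, setjmp, execute the instruction under test, and longjmp back if it faults. This sketch probes nothing real (the stand-in function is a no-op), and, as the comments above warn, signal()/longjmp() is not thread-safe, so such a probe belongs in single-threaded startup code:

```c
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static jmp_buf g_jmp;

static void on_sigill(int v)
{
  (void)v;
  longjmp(g_jmp, 1);  /* unwind out of the faulting instruction */
}

static int probe_insn(void (*try_insn)(void))
{
  int ok = 0;
  void (*prev)(int) = signal(SIGILL, on_sigill);
  if (prev == SIG_ERR)
    return 0;
  if (!setjmp(g_jmp))
  {
    try_insn();  /* would raise SIGILL on cpus without the instruction */
    ok = 1;
  }
  signal(SIGILL, prev);  /* restore the previous handler */
  return ok;
}

static void no_op(void) { /* stand-in for the hw sha512 block update */ }

int main(void)
{
  printf("supported = %d\n", probe_insn(no_op));
  return 0;
}
```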
diff --git a/C/Sort.c b/C/Sort.c --- a/C/Sort.c +++ b/C/Sort.c | |||
| @@ -1,141 +1,268 @@ | |||
| 1 | /* Sort.c -- Sort functions | 1 | /* Sort.c -- Sort functions |
| 2 | 2014-04-05 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| 6 | #include "Sort.h" | 6 | #include "Sort.h" |
| 7 | #include "CpuArch.h" | ||
| 7 | 8 | ||
| 8 | #define HeapSortDown(p, k, size, temp) \ | 9 | #if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ |
| 9 | { for (;;) { \ | 10 | || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \ |
| 10 | size_t s = (k << 1); \ | 11 | ) |
| 11 | if (s > size) break; \ | 12 | // the code with prefetch is slow for small arrays on x86. |
| 12 | if (s < size && p[s + 1] > p[s]) s++; \ | 13 | // So we disable prefetch for x86. |
| 13 | if (temp >= p[s]) break; \ | 14 | #ifndef MY_CPU_X86 |
| 14 | p[k] = p[s]; k = s; \ | 15 | // #pragma message("Z7_PREFETCH : __builtin_prefetch") |
| 15 | } p[k] = temp; } | 16 | #define Z7_PREFETCH(a) __builtin_prefetch((a)) |
| 17 | #endif | ||
| 16 | 18 | ||
| 17 | void HeapSort(UInt32 *p, size_t size) | 19 | #elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200) |
| 18 | { | 20 | |
| 19 | if (size <= 1) | 21 | #include "7zWindows.h" |
| 20 | return; | 22 | |
| 21 | p--; | 23 | // NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1. |
| 22 | { | 24 | // For example, clang-cl can generate "prefetcht2" instruction for |
| 23 | size_t i = size / 2; | 25 | // PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call. |
| 24 | do | 26 | // But we want to generate "prefetcht0" instruction. |
| 25 | { | 27 | // So for CLANG/GCC we must use __builtin_prefetch() in code branch above |
| 26 | UInt32 temp = p[i]; | 28 | // instead of PreFetchCacheLine() / _mm_prefetch(). |
| 27 | size_t k = i; | 29 | |
| 28 | HeapSortDown(p, k, size, temp) | 30 | // New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call. |
| 29 | } | 31 | // But old x86 cpus don't support "prefetcht0". |
| 30 | while (--i != 0); | 32 | // So we will use PreFetchCacheLine(), only if we are sure that |
| 31 | } | 33 | // generated instruction is supported by all cpus of that isa. |
| 32 | /* | 34 | #if defined(MY_CPU_AMD64) \ |
| 33 | do | 35 | || defined(MY_CPU_ARM64) \ |
| 34 | { | 36 | || defined(MY_CPU_IA64) |
| 35 | size_t k = 1; | 37 | // we need to use additional braces for (a) in PreFetchCacheLine call, because |
| 36 | UInt32 temp = p[size]; | 38 | // PreFetchCacheLine macro doesn't use braces: |
| 37 | p[size--] = p[1]; | 39 | // #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l) |
| 38 | HeapSortDown(p, k, size, temp) | 40 | // #pragma message("Z7_PREFETCH : PreFetchCacheLine") |
| 39 | } | 41 | #define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a)) |
| 40 | while (size > 1); | 42 | #endif |
| 41 | */ | 43 | |
| 42 | while (size > 3) | 44 | #endif // _WIN32 |
| 43 | { | 45 | |
| 44 | UInt32 temp = p[size]; | 46 | |
| 45 | size_t k = (p[3] > p[2]) ? 3 : 2; | 47 | #define PREFETCH_NO(p,k,s,size) |
| 46 | p[size--] = p[1]; | 48 | |
| 47 | p[1] = p[k]; | 49 | #ifndef Z7_PREFETCH |
| 48 | HeapSortDown(p, k, size, temp) | 50 | #define SORT_PREFETCH(p,k,s,size) |
| 49 | } | 51 | #else |
| 50 | { | 52 | |
| 51 | UInt32 temp = p[size]; | 53 | // #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes |
| 52 | p[size] = p[1]; | 54 | #define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch) |
| 53 | if (size > 2 && p[2] < temp) | 55 | // #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch) |
| 54 | { | 56 | |
| 55 | p[1] = p[2]; | 57 | #if PREFETCH_LEVEL == 0 |
| 56 | p[2] = temp; | 58 | |
| 57 | } | 59 | #define SORT_PREFETCH(p,k,s,size) |
| 58 | else | 60 | |
| 59 | p[1] = temp; | 61 | #else // PREFETCH_LEVEL != 0 |
| 60 | } | 62 | |
| 63 | /* | ||
| 64 | if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
| 65 | we prefetch one value per cache line. | ||
| 66 | Use it if array is aligned for cache line size (64 bytes) | ||
| 67 | or if array is small (less than L1 cache size). | ||
| 68 | |||
| 69 | if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
| 70 | we prefetch all cache lines that can be required. | ||
| 71 | it can be faster for big unaligned arrays. | ||
| 72 | */ | ||
| 73 | #define USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
| 74 | |||
| 75 | // s == k * 2 | ||
| 76 | #if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64) | ||
| 77 | // x86 supports (lea r1*8+offset) | ||
| 78 | #define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL) | ||
| 79 | #else | ||
| 80 | #define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1)) | ||
| 81 | #endif | ||
| 82 | |||
| 83 | #if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY) | ||
| 84 | #define PREFETCH_ADD_OFFSET 0 | ||
| 85 | #else | ||
| 86 | // last offset that can be required in PREFETCH_LEVEL step: | ||
| 87 | #define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1) | ||
| 88 | #define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2 | ||
| 89 | #endif | ||
| 90 | |||
| 91 | #if PREFETCH_LEVEL <= 3 | ||
| 92 | |||
| 93 | #ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
| 94 | #define SORT_PREFETCH(p,k,s,size) \ | ||
| 95 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \ | ||
| 96 | if (s2 <= size) { \ | ||
| 97 | Z7_PREFETCH((p + s2)); \ | ||
| 98 | }} | ||
| 99 | #else /* for unaligned array */ | ||
| 100 | #define SORT_PREFETCH(p,k,s,size) \ | ||
| 101 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ | ||
| 102 | if (s2 <= size) { \ | ||
| 103 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ | ||
| 104 | Z7_PREFETCH((p + s2)); \ | ||
| 105 | }} | ||
| 106 | #endif | ||
| 107 | |||
| 108 | #else // PREFETCH_LEVEL > 3 | ||
| 109 | |||
| 110 | #ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY | ||
| 111 | #define SORT_PREFETCH(p,k,s,size) \ | ||
| 112 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \ | ||
| 113 | if (s2 <= size) { \ | ||
| 114 | Z7_PREFETCH((p + s2 - 16)); \ | ||
| 115 | Z7_PREFETCH((p + s2)); \ | ||
| 116 | }} | ||
| 117 | #else /* for unaligned array */ | ||
| 118 | #define SORT_PREFETCH(p,k,s,size) \ | ||
| 119 | { const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \ | ||
| 120 | if (s2 <= size) { \ | ||
| 121 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \ | ||
| 122 | Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \ | ||
| 123 | Z7_PREFETCH((p + s2)); \ | ||
| 124 | }} | ||
| 125 | #endif | ||
| 126 | |||
| 127 | #endif // PREFETCH_LEVEL > 3 | ||
| 128 | #endif // PREFETCH_LEVEL != 0 | ||
| 129 | #endif // Z7_PREFETCH | ||
| 130 | |||
| 131 | |||
| 132 | #if defined(MY_CPU_ARM64) \ | ||
| 133 | /* || defined(MY_CPU_AMD64) */ \ | ||
| 134 | /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */ | ||
| 135 | // we want to use cmov, if cmov is very fast: | ||
| 136 | // - this cmov version is slower for clang-x64. | ||
| 137 | // - this cmov version is faster for gcc-arm64 for some fast arm64 cpus. | ||
| 138 | #define Z7_FAST_CMOV_SUPPORTED | ||
| 139 | #endif | ||
| 140 | |||
| 141 | #ifdef Z7_FAST_CMOV_SUPPORTED | ||
| 142 | // we want to use cmov here, if cmov is fast: new arm64 cpus. | ||
| 143 | // we want the compiler to use conditional move for this branch | ||
| 144 | #define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1; | ||
| 145 | #else | ||
| 146 | // use this branch, if cpu doesn't support fast conditional move. | ||
| 147 | // it uses slow array access reading: | ||
| 148 | #define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow; | ||
| 149 | #endif | ||
| 150 | |||
| 151 | #define HeapSortDown(p, k, size, temp, macro_prefetch) \ | ||
| 152 | { \ | ||
| 153 | for (;;) { \ | ||
| 154 | UInt32 n0, n1; \ | ||
| 155 | size_t s = k * 2; \ | ||
| 156 | if (s >= size) { \ | ||
| 157 | if (s == size) { \ | ||
| 158 | n0 = p[s]; \ | ||
| 159 | p[k] = n0; \ | ||
| 160 | if (temp < n0) k = s; \ | ||
| 161 | } \ | ||
| 162 | break; \ | ||
| 163 | } \ | ||
| 164 | n0 = p[k * 2]; \ | ||
| 165 | n1 = p[k * 2 + 1]; \ | ||
| 166 | s += n0 < n1; \ | ||
| 167 | GET_MAX_VAL(n0, n1, p[s]) \ | ||
| 168 | if (temp >= n0) break; \ | ||
| 169 | macro_prefetch(p, k, s, size) \ | ||
| 170 | p[k] = n0; \ | ||
| 171 | k = s; \ | ||
| 172 | } \ | ||
| 173 | p[k] = temp; \ | ||
| 61 | } | 174 | } |
| 62 | 175 | ||
| 63 | void HeapSort64(UInt64 *p, size_t size) | 176 | |
| 177 | /* | ||
| 178 | stage-1 : O(n) : | ||
| 179 | we generate intermediate partially sorted binary tree: | ||
| 180 | p[0] : it's an additional item for better alignment of the tree structure in memory. | ||
| 181 | p[1] | ||
| 182 | p[2] p[3] | ||
| 183 | p[4] p[5] p[6] p[7] | ||
| 184 | ... | ||
| 185 | p[x] >= p[x * 2] | ||
| 186 | p[x] >= p[x * 2 + 1] | ||
| 187 | |||
| 188 | stage-2 : O(n * log2(n)) : | ||
| 189 | we move the largest item p[0] from the head of the tree to the end of the array, | ||
| 190 | and insert the last item into the sorted binary tree. | ||
| 191 | */ | ||
| 192 | |||
| 193 | // (p) must be aligned for cache line size (64-bytes) for best performance | ||
| 194 | |||
| 195 | void Z7_FASTCALL HeapSort(UInt32 *p, size_t size) | ||
| 64 | { | 196 | { |
| 65 | if (size <= 1) | 197 | if (size < 2) |
| 66 | return; | 198 | return; |
| 67 | p--; | 199 | if (size == 2) |
| 68 | { | ||
| 69 | size_t i = size / 2; | ||
| 70 | do | ||
| 71 | { | ||
| 72 | UInt64 temp = p[i]; | ||
| 73 | size_t k = i; | ||
| 74 | HeapSortDown(p, k, size, temp) | ||
| 75 | } | ||
| 76 | while (--i != 0); | ||
| 77 | } | ||
| 78 | /* | ||
| 79 | do | ||
| 80 | { | 200 | { |
| 81 | size_t k = 1; | 201 | const UInt32 a0 = p[0]; |
| 82 | UInt64 temp = p[size]; | 202 | const UInt32 a1 = p[1]; |
| 83 | p[size--] = p[1]; | 203 | const unsigned k = a1 < a0; |
| 84 | HeapSortDown(p, k, size, temp) | 204 | p[k] = a0; |
| 85 | } | 205 | p[k ^ 1] = a1; |
| 86 | while (size > 1); | 206 | return; |
| 87 | */ | ||
| 88 | while (size > 3) | ||
| 89 | { | ||
| 90 | UInt64 temp = p[size]; | ||
| 91 | size_t k = (p[3] > p[2]) ? 3 : 2; | ||
| 92 | p[size--] = p[1]; | ||
| 93 | p[1] = p[k]; | ||
| 94 | HeapSortDown(p, k, size, temp) | ||
| 95 | } | 207 | } |
| 96 | { | 208 | { |
| 97 | UInt64 temp = p[size]; | 209 | // stage-1 : O(n) |
| 98 | p[size] = p[1]; | 210 | // we transform array to partially sorted binary tree. |
| 99 | if (size > 2 && p[2] < temp) | 211 | size_t i = --size / 2; |
| 212 | // (size) now is the index of the last item in the tree. | ||
| 213 | // if (i) | ||
| 100 | { | 214 | { |
| 101 | p[1] = p[2]; | 215 | do |
| 102 | p[2] = temp; | 216 | { |
| 217 | const UInt32 temp = p[i]; | ||
| 218 | size_t k = i; | ||
| 219 | HeapSortDown(p, k, size, temp, PREFETCH_NO) | ||
| 220 | } | ||
| 221 | while (--i); | ||
| 222 | } | ||
| 223 | { | ||
| 224 | const UInt32 temp = p[0]; | ||
| 225 | const UInt32 a1 = p[1]; | ||
| 226 | if (temp < a1) | ||
| 227 | { | ||
| 228 | size_t k = 1; | ||
| 229 | p[0] = a1; | ||
| 230 | HeapSortDown(p, k, size, temp, PREFETCH_NO) | ||
| 231 | } | ||
| 103 | } | 232 | } |
| 104 | else | ||
| 105 | p[1] = temp; | ||
| 106 | } | 233 | } |
| 107 | } | ||
| 108 | 234 | ||
| 109 | /* | 235 | if (size < 3) |
| 110 | #define HeapSortRefDown(p, vals, n, size, temp) \ | 236 | { |
| 111 | { size_t k = n; UInt32 val = vals[temp]; for (;;) { \ | 237 | // size == 2 |
| 112 | size_t s = (k << 1); \ | 238 | const UInt32 a0 = p[0]; |
| 113 | if (s > size) break; \ | 239 | p[0] = p[2]; |
| 114 | if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \ | 240 | p[2] = a0; |
| 115 | if (val >= vals[p[s]]) break; \ | ||
| 116 | p[k] = p[s]; k = s; \ | ||
| 117 | } p[k] = temp; } | ||
| 118 | |||
| 119 | void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size) | ||
| 120 | { | ||
| 121 | if (size <= 1) | ||
| 122 | return; | 241 | return; |
| 123 | p--; | 242 | } |
| 243 | if (size != 3) | ||
| 124 | { | 244 | { |
| 125 | size_t i = size / 2; | 245 | // stage-2 : O(size * log2(size)) :
| 246 | // we move largest item p[0] from head to the end of array, | ||
| 247 | // and insert last item to sorted binary tree. | ||
| 126 | do | 248 | do |
| 127 | { | 249 | { |
| 128 | UInt32 temp = p[i]; | 250 | const UInt32 temp = p[size]; |
| 129 | HeapSortRefDown(p, vals, i, size, temp); | 251 | size_t k = p[2] < p[3] ? 3 : 2; |
| 252 | p[size--] = p[0]; | ||
| 253 | p[0] = p[1]; | ||
| 254 | p[1] = p[k]; | ||
| 255 | HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO | ||
| 130 | } | 256 | } |
| 131 | while (--i != 0); | 257 | while (size != 3); |
| 132 | } | 258 | } |
| 133 | do | ||
| 134 | { | 259 | { |
| 135 | UInt32 temp = p[size]; | 260 | const UInt32 a2 = p[2]; |
| 136 | p[size--] = p[1]; | 261 | const UInt32 a3 = p[3]; |
| 137 | HeapSortRefDown(p, vals, 1, size, temp); | 262 | const size_t k = a2 < a3; |
| 263 | p[2] = p[1]; | ||
| 264 | p[3] = p[0]; | ||
| 265 | p[k] = a3; | ||
| 266 | p[k ^ 1] = a2; | ||
| 138 | } | 267 | } |
| 139 | while (size > 1); | ||
| 140 | } | 268 | } |
| 141 | */ | ||
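The public behavior of HeapSort() is unchanged by the rewrite: an in-place ascending sort of UInt32 values (with the new note that 64-byte alignment of p helps the prefetch path). A minimal check follows; note the Sort.h hunk below also drops HeapSort64(), so 64-bit callers need another path:

```c
#include <stdio.h>
#include "Sort.h"

int main(void)
{
  UInt32 a[8] = { 5, 1, 4, 1, 5, 9, 2, 6 };
  size_t i;
  HeapSort(a, 8);
  for (i = 0; i < 8; i++)
    printf("%u ", (unsigned)a[i]);
  printf("\n");  /* 1 1 2 4 5 5 6 9 */
  return 0;
}
```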
diff --git a/C/Sort.h b/C/Sort.h --- a/C/Sort.h +++ b/C/Sort.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Sort.h -- Sort functions | 1 | /* Sort.h -- Sort functions |
| 2 | 2023-03-05 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_SORT_H | 4 | #ifndef ZIP7_INC_SORT_H |
| 5 | #define ZIP7_INC_SORT_H | 5 | #define ZIP7_INC_SORT_H |
| @@ -8,10 +8,7 @@ | |||
| 8 | 8 | ||
| 9 | EXTERN_C_BEGIN | 9 | EXTERN_C_BEGIN |
| 10 | 10 | ||
| 11 | void HeapSort(UInt32 *p, size_t size); | 11 | void Z7_FASTCALL HeapSort(UInt32 *p, size_t size); |
| 12 | void HeapSort64(UInt64 *p, size_t size); | ||
| 13 | |||
| 14 | /* void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size); */ | ||
| 15 | 12 | ||
| 16 | EXTERN_C_END | 13 | EXTERN_C_END |
| 17 | 14 | ||
diff --git a/C/Threads.c b/C/Threads.c index 464efec..177d1d9 100644 --- a/C/Threads.c +++ b/C/Threads.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Threads.c -- multithreading library | 1 | /* Threads.c -- multithreading library |
| 2 | 2024-03-28 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -59,6 +59,100 @@ WRes Thread_Wait_Close(CThread *p) | |||
| 59 | return (res != 0 ? res : res2); | 59 | return (res != 0 ? res : res2); |
| 60 | } | 60 | } |
| 61 | 61 | ||
| 62 | typedef struct MY_PROCESSOR_NUMBER { | ||
| 63 | WORD Group; | ||
| 64 | BYTE Number; | ||
| 65 | BYTE Reserved; | ||
| 66 | } MY_PROCESSOR_NUMBER, *MY_PPROCESSOR_NUMBER; | ||
| 67 | |||
| 68 | typedef struct MY_GROUP_AFFINITY { | ||
| 69 | #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 100000) | ||
| 70 | // KAFFINITY is not defined in old mingw | ||
| 71 | ULONG_PTR | ||
| 72 | #else | ||
| 73 | KAFFINITY | ||
| 74 | #endif | ||
| 75 | Mask; | ||
| 76 | WORD Group; | ||
| 77 | WORD Reserved[3]; | ||
| 78 | } MY_GROUP_AFFINITY, *MY_PGROUP_AFFINITY; | ||
| 79 | |||
| 80 | typedef BOOL (WINAPI *Func_SetThreadGroupAffinity)( | ||
| 81 | HANDLE hThread, | ||
| 82 | CONST MY_GROUP_AFFINITY *GroupAffinity, | ||
| 83 | MY_PGROUP_AFFINITY PreviousGroupAffinity); | ||
| 84 | |||
| 85 | typedef BOOL (WINAPI *Func_GetThreadGroupAffinity)( | ||
| 86 | HANDLE hThread, | ||
| 87 | MY_PGROUP_AFFINITY GroupAffinity); | ||
| 88 | |||
| 89 | typedef BOOL (WINAPI *Func_GetProcessGroupAffinity)( | ||
| 90 | HANDLE hProcess, | ||
| 91 | PUSHORT GroupCount, | ||
| 92 | PUSHORT GroupArray); | ||
| 93 | |||
| 94 | Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION | ||
| 95 | |||
| 96 | #if 0 | ||
| 97 | #include <stdio.h> | ||
| 98 | #define PRF(x) x | ||
| 99 | /* | ||
| 100 | -- | ||
| 101 | before the call of SetThreadGroupAffinity(), | ||
| 102 | GetProcessGroupAffinity returns one group. | ||
| 103 | after the call of SetThreadGroupAffinity(): | ||
| 104 | GetProcessGroupAffinity returns more than one group, | ||
| 105 | if SetThreadGroupAffinity() was called for another group. | ||
| 106 | -- | ||
| 107 | GetProcessAffinityMask MS DOCs: | ||
| 108 | { | ||
| 109 | If the calling process contains threads in multiple groups, | ||
| 110 | the function returns zero for both affinity masks. | ||
| 111 | } | ||
| 112 | but tests in win10 with 2 groups (less than 64 cores total): | ||
| 113 | GetProcessAffinityMask() still returns non-zero affinity masks | ||
| 114 | even after SetThreadGroupAffinity() calls. | ||
| 115 | */ | ||
| 116 | static void PrintProcess_Info(void) | ||
| 117 | { | ||
| 118 | { | ||
| 119 | const | ||
| 120 | Func_GetProcessGroupAffinity fn_GetProcessGroupAffinity = | ||
| 121 | (Func_GetProcessGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), | ||
| 122 | "GetProcessGroupAffinity"); | ||
| 123 | if (fn_GetProcessGroupAffinity) | ||
| 124 | { | ||
| 125 | unsigned i; | ||
| 126 | USHORT GroupCounts[64]; | ||
| 127 | USHORT GroupCount = Z7_ARRAY_SIZE(GroupCounts); | ||
| 128 | BOOL boolRes = fn_GetProcessGroupAffinity(GetCurrentProcess(), | ||
| 129 | &GroupCount, GroupCounts); | ||
| 130 | printf("\n====== GetProcessGroupAffinity : " | ||
| 131 | "boolRes=%u GroupCounts = %u :", | ||
| 132 | boolRes, (unsigned)GroupCount); | ||
| 133 | for (i = 0; i < GroupCount; i++) | ||
| 134 | printf(" %u", GroupCounts[i]); | ||
| 135 | printf("\n"); | ||
| 136 | } | ||
| 137 | } | ||
| 138 | { | ||
| 139 | DWORD_PTR processAffinityMask, systemAffinityMask; | ||
| 140 | if (GetProcessAffinityMask(GetCurrentProcess(), &processAffinityMask, &systemAffinityMask)) | ||
| 141 | { | ||
| 142 | PRF(printf("\n====== GetProcessAffinityMask : " | ||
| 143 | ": processAffinityMask=%x, systemAffinityMask=%x\n", | ||
| 144 | (UInt32)processAffinityMask, (UInt32)systemAffinityMask);) | ||
| 145 | } | ||
| 146 | else | ||
| 147 | printf("\n==GetProcessAffinityMask FAIL"); | ||
| 148 | } | ||
| 149 | } | ||
| 150 | #else | ||
| 151 | #ifndef USE_THREADS_CreateThread | ||
| 152 | // #define PRF(x) | ||
| 153 | #endif | ||
| 154 | #endif | ||
| 155 | |||
| 62 | WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) | 156 | WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) |
| 63 | { | 157 | { |
| 64 | /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ | 158 | /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ |
| @@ -72,7 +166,43 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) | |||
| 72 | 166 | ||
| 73 | unsigned threadId; | 167 | unsigned threadId; |
| 74 | *p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId)); | 168 | *p = (HANDLE)(_beginthreadex(NULL, 0, func, param, 0, &threadId)); |
| 75 | 169 | ||
| 170 | #if 0 // 1 : for debug | ||
| 171 | { | ||
| 172 | DWORD_PTR prevMask; | ||
| 173 | DWORD_PTR affinity = 1 << 0; | ||
| 174 | prevMask = SetThreadAffinityMask(*p, (DWORD_PTR)affinity); | ||
| 175 | prevMask = prevMask; | ||
| 176 | } | ||
| 177 | #endif | ||
| 178 | #if 0 // 1 : for debug | ||
| 179 | { | ||
| 180 | /* win10: a new thread will be created in the same group that is assigned to the parent thread, | ||
| 181 | but the affinity mask will contain all allowed threads of that group, | ||
| 182 | even if the affinity mask of the parent group is not full. | ||
| 183 | win11: in what group will it be created, if we have set the | ||
| 184 | affinity of the parent thread with ThreadGroupAffinity? | ||
| 185 | */ | ||
| 186 | const | ||
| 187 | Func_GetThreadGroupAffinity fn = | ||
| 188 | (Func_GetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), | ||
| 189 | "GetThreadGroupAffinity"); | ||
| 190 | if (fn) | ||
| 191 | { | ||
| 192 | // BOOL wres2; | ||
| 193 | MY_GROUP_AFFINITY groupAffinity; | ||
| 194 | memset(&groupAffinity, 0, sizeof(groupAffinity)); | ||
| 195 | /* wres2 = */ fn(*p, &groupAffinity); | ||
| 196 | PRF(printf("\n==Thread_Create cur = %6u GetThreadGroupAffinity(): " | ||
| 197 | "wres2_BOOL = %u, group=%u mask=%x\n", | ||
| 198 | GetCurrentThreadId(), | ||
| 199 | wres2, | ||
| 200 | groupAffinity.Group, | ||
| 201 | (UInt32)groupAffinity.Mask);) | ||
| 202 | } | ||
| 203 | } | ||
| 204 | #endif | ||
| 205 | |||
| 76 | #endif | 206 | #endif |
| 77 | 207 | ||
| 78 | /* maybe we must use errno here, but probably GetLastError() is also OK. */ | 208 | /* maybe we must use errno here, but probably GetLastError() is also OK. */ |
| @@ -110,7 +240,84 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param | |||
| 110 | */ | 240 | */ |
| 111 | } | 241 | } |
| 112 | { | 242 | { |
| 113 | DWORD prevSuspendCount = ResumeThread(h); | 243 | const DWORD prevSuspendCount = ResumeThread(h); |
| 244 | /* ResumeThread() returns: | ||
| 245 | 0 : was_not_suspended | ||
| 246 | 1 : was_resumed | ||
| 247 | -1 : error | ||
| 248 | */ | ||
| 249 | if (prevSuspendCount == (DWORD)-1) | ||
| 250 | wres = GetError(); | ||
| 251 | } | ||
| 252 | } | ||
| 253 | |||
| 254 | /* maybe we must use errno here, but probably GetLastError() is also OK. */ | ||
| 255 | return wres; | ||
| 256 | |||
| 257 | #endif | ||
| 258 | } | ||
| 259 | |||
| 260 | |||
| 261 | WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask) | ||
| 262 | { | ||
| 263 | #ifdef USE_THREADS_CreateThread | ||
| 264 | |||
| 265 | UNUSED_VAR(group) | ||
| 266 | UNUSED_VAR(affinityMask) | ||
| 267 | return Thread_Create(p, func, param); | ||
| 268 | |||
| 269 | #else | ||
| 270 | |||
| 271 | /* Windows Me/98/95: threadId parameter may not be NULL in _beginthreadex/CreateThread functions */ | ||
| 272 | HANDLE h; | ||
| 273 | WRes wres; | ||
| 274 | unsigned threadId; | ||
| 275 | h = (HANDLE)(_beginthreadex(NULL, 0, func, param, CREATE_SUSPENDED, &threadId)); | ||
| 276 | *p = h; | ||
| 277 | wres = HandleToWRes(h); | ||
| 278 | if (h) | ||
| 279 | { | ||
| 280 | // PrintProcess_Info(); | ||
| 281 | { | ||
| 282 | const | ||
| 283 | Func_SetThreadGroupAffinity fn = | ||
| 284 | (Func_SetThreadGroupAffinity) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), | ||
| 285 | "SetThreadGroupAffinity"); | ||
| 286 | if (fn) | ||
| 287 | { | ||
| 288 | // WRes wres2; | ||
| 289 | MY_GROUP_AFFINITY groupAffinity, prev_groupAffinity; | ||
| 290 | memset(&groupAffinity, 0, sizeof(groupAffinity)); | ||
| 291 | // groupAffinity.Mask must use only bits that are supported by the current group | ||
| 292 | // (groupAffinity.Mask = 0) means all allowed bits | ||
| 293 | groupAffinity.Mask = affinityMask; | ||
| 294 | groupAffinity.Group = (WORD)group; | ||
| 295 | // wres2 = | ||
| 296 | fn(h, &groupAffinity, &prev_groupAffinity); | ||
| 297 | /* | ||
| 298 | if (groupAffinity.Group == prev_groupAffinity.Group) | ||
| 299 | wres2 = wres2; | ||
| 300 | else | ||
| 301 | wres2 = wres2; | ||
| 302 | if (wres2 == 0) | ||
| 303 | { | ||
| 304 | wres2 = GetError(); | ||
| 305 | PRF(printf("\n==SetThreadGroupAffinity error: %u\n", wres2);) | ||
| 306 | } | ||
| 307 | else | ||
| 308 | { | ||
| 309 | PRF(printf("\n==Thread_Create_With_Group::SetThreadGroupAffinity()" | ||
| 310 | " threadId = %6u" | ||
| 311 | " group=%u mask=%x\n", | ||
| 312 | threadId, | ||
| 313 | prev_groupAffinity.Group, | ||
| 314 | (UInt32)prev_groupAffinity.Mask);) | ||
| 315 | } | ||
| 316 | */ | ||
| 317 | } | ||
| 318 | } | ||
| 319 | { | ||
| 320 | const DWORD prevSuspendCount = ResumeThread(h); | ||
| 114 | /* ResumeThread() returns: | 321 | /* ResumeThread() returns: |
| 115 | 0 : was_not_suspended | 322 | 0 : was_not_suspended |
| 116 | 1 : was_resumed | 323 | 1 : was_resumed |
| @@ -297,6 +504,13 @@ WRes Thread_Create(CThread *p, THREAD_FUNC_TYPE func, LPVOID param) | |||
| 297 | return Thread_Create_With_CpuSet(p, func, param, NULL); | 504 | return Thread_Create_With_CpuSet(p, func, param, NULL); |
| 298 | } | 505 | } |
| 299 | 506 | ||
| 507 | /* | ||
| 508 | WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinity) | ||
| 509 | { | ||
| 510 | UNUSED_VAR(group) | ||
| 511 | return Thread_Create_With_Affinity(p, func, param, affinity); | ||
| 512 | } | ||
| 513 | */ | ||
| 300 | 514 | ||
| 301 | WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) | 515 | WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, CAffinityMask affinity) |
| 302 | { | 516 | { |
| @@ -577,5 +791,22 @@ WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p) | |||
| 577 | return AutoResetEvent_CreateNotSignaled(p); | 791 | return AutoResetEvent_CreateNotSignaled(p); |
| 578 | } | 792 | } |
| 579 | 793 | ||
| 794 | void ThreadNextGroup_Init(CThreadNextGroup *p, UInt32 numGroups, UInt32 startGroup) | ||
| 795 | { | ||
| 796 | // printf("\n====== ThreadNextGroup_Init numGroups = %x: startGroup=%x\n", numGroups, startGroup); | ||
| 797 | if (numGroups == 0) | ||
| 798 | numGroups = 1; | ||
| 799 | p->NumGroups = numGroups; | ||
| 800 | p->NextGroup = startGroup % numGroups; | ||
| 801 | } | ||
| 802 | |||
| 803 | |||
| 804 | UInt32 ThreadNextGroup_GetNext(CThreadNextGroup *p) | ||
| 805 | { | ||
| 806 | const UInt32 next = p->NextGroup; | ||
| 807 | p->NextGroup = (next + 1) % p->NumGroups; | ||
| 808 | return next; | ||
| 809 | } | ||
| 810 | |||
| 580 | #undef PRF | 811 | #undef PRF |
| 581 | #undef Print | 812 | #undef Print |
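The two helpers above implement a round-robin dispenser over processor groups: each ThreadNextGroup_GetNext() call returns the current group and advances modulo NumGroups, so successive worker threads land on successive groups. A minimal sketch of the intended call pattern, assuming a Windows 7+ target (for GetActiveProcessorGroupCount); the StartWorkers/Worker names are hypothetical, not from this commit:

    #include "Threads.h"

    static THREAD_FUNC_DECL Worker(void *param)
    {
      UNUSED_VAR(param)
      return 0;
    }

    // hypothetical helper: spread (numThreads) workers across all active processor groups
    static WRes StartWorkers(CThread *threads, unsigned numThreads)
    {
      CThreadNextGroup ng;
      unsigned i;
      ThreadNextGroup_Init(&ng, GetActiveProcessorGroupCount(), 0);
      for (i = 0; i < numThreads; i++)
      {
        const unsigned group = ThreadNextGroup_GetNext(&ng);
        // affinityMask == 0 means "all allowed bits of that group"
        const WRes wres = Thread_Create_With_Group(&threads[i], Worker, NULL, group, 0);
        if (wres != 0)
          return wres;
      }
      return 0;
    }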
diff --git a/C/Threads.h b/C/Threads.h index c1484a2..be12e6e 100644 --- a/C/Threads.h +++ b/C/Threads.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Threads.h -- multithreading library | 1 | /* Threads.h -- multithreading library |
| 2 | 2024-03-28 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_THREADS_H | 4 | #ifndef ZIP7_INC_THREADS_H |
| 5 | #define ZIP7_INC_THREADS_H | 5 | #define ZIP7_INC_THREADS_H |
| @@ -140,12 +140,22 @@ WRes Thread_Create_With_Affinity(CThread *p, THREAD_FUNC_TYPE func, LPVOID param | |||
| 140 | WRes Thread_Wait_Close(CThread *p); | 140 | WRes Thread_Wait_Close(CThread *p); |
| 141 | 141 | ||
| 142 | #ifdef _WIN32 | 142 | #ifdef _WIN32 |
| 143 | WRes Thread_Create_With_Group(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, unsigned group, CAffinityMask affinityMask); | ||
| 143 | #define Thread_Create_With_CpuSet(p, func, param, cs) \ | 144 | #define Thread_Create_With_CpuSet(p, func, param, cs) \ |
| 144 | Thread_Create_With_Affinity(p, func, param, *cs) | 145 | Thread_Create_With_Affinity(p, func, param, *cs) |
| 145 | #else | 146 | #else |
| 146 | WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet); | 147 | WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, const CCpuSet *cpuSet); |
| 147 | #endif | 148 | #endif |
| 148 | 149 | ||
| 150 | typedef struct | ||
| 151 | { | ||
| 152 | unsigned NumGroups; | ||
| 153 | unsigned NextGroup; | ||
| 154 | } CThreadNextGroup; | ||
| 155 | |||
| 156 | void ThreadNextGroup_Init(CThreadNextGroup *p, unsigned numGroups, unsigned startGroup); | ||
| 157 | unsigned ThreadNextGroup_GetNext(CThreadNextGroup *p); | ||
| 158 | |||
| 149 | 159 | ||
| 150 | #ifdef _WIN32 | 160 | #ifdef _WIN32 |
| 151 | 161 | ||
diff --git a/C/Util/Lzma/LzmaUtil.dsp b/C/Util/Lzma/LzmaUtil.dsp index e2e7d42..71de950 100644 --- a/C/Util/Lzma/LzmaUtil.dsp +++ b/C/Util/Lzma/LzmaUtil.dsp | |||
| @@ -122,6 +122,10 @@ SOURCE=..\..\Compiler.h | |||
| 122 | # End Source File | 122 | # End Source File |
| 123 | # Begin Source File | 123 | # Begin Source File |
| 124 | 124 | ||
| 125 | SOURCE=..\..\CpuArch.c | ||
| 126 | # End Source File | ||
| 127 | # Begin Source File | ||
| 128 | |||
| 125 | SOURCE=..\..\CpuArch.h | 129 | SOURCE=..\..\CpuArch.h |
| 126 | # End Source File | 130 | # End Source File |
| 127 | # Begin Source File | 131 | # Begin Source File |
diff --git a/C/Util/LzmaLib/LzmaLib.dsp b/C/Util/LzmaLib/LzmaLib.dsp index bacd967..f413137 100644 --- a/C/Util/LzmaLib/LzmaLib.dsp +++ b/C/Util/LzmaLib/LzmaLib.dsp | |||
| @@ -43,7 +43,7 @@ RSC=rc.exe | |||
| 43 | # PROP Ignore_Export_Lib 0 | 43 | # PROP Ignore_Export_Lib 0 |
| 44 | # PROP Target_Dir "" | 44 | # PROP Target_Dir "" |
| 45 | # ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /c | 45 | # ADD BASE CPP /nologo /MT /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /c |
| 46 | # ADD CPP /nologo /Gr /MT /W3 /O2 /D "NDEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /FD /c | 46 | # ADD CPP /nologo /Gr /MT /W4 /WX /O2 /D "NDEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /FD /c |
| 47 | # SUBTRACT CPP /YX | 47 | # SUBTRACT CPP /YX |
| 48 | # ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32 | 48 | # ADD BASE MTL /nologo /D "NDEBUG" /mktyplib203 /win32 |
| 49 | # ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32 | 49 | # ADD MTL /nologo /D "NDEBUG" /mktyplib203 /win32 |
| @@ -71,7 +71,7 @@ LINK32=link.exe | |||
| 71 | # PROP Ignore_Export_Lib 0 | 71 | # PROP Ignore_Export_Lib 0 |
| 72 | # PROP Target_Dir "" | 72 | # PROP Target_Dir "" |
| 73 | # ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /GZ /c | 73 | # ADD BASE CPP /nologo /MTd /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /YX /FD /GZ /c |
| 74 | # ADD CPP /nologo /MTd /W3 /Gm /ZI /Od /D "_DEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /D "COMPRESS_MF_MT" /FD /GZ /c | 74 | # ADD CPP /nologo /MTd /W4 /WX /Gm /ZI /Od /D "_DEBUG" /D "WIN32" /D "_WINDOWS" /D "_MBCS" /D "_USRDLL" /D "LZMALIB_EXPORTS" /D "COMPRESS_MF_MT" /FD /GZ /c |
| 75 | # SUBTRACT CPP /YX | 75 | # SUBTRACT CPP /YX |
| 76 | # ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32 | 76 | # ADD BASE MTL /nologo /D "_DEBUG" /mktyplib203 /win32 |
| 77 | # ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32 | 77 | # ADD MTL /nologo /D "_DEBUG" /mktyplib203 /win32 |
| @@ -128,6 +128,10 @@ SOURCE=..\..\Compiler.h | |||
| 128 | # End Source File | 128 | # End Source File |
| 129 | # Begin Source File | 129 | # Begin Source File |
| 130 | 130 | ||
| 131 | SOURCE=..\..\CpuArch.c | ||
| 132 | # End Source File | ||
| 133 | # Begin Source File | ||
| 134 | |||
| 131 | SOURCE=..\..\CpuArch.h | 135 | SOURCE=..\..\CpuArch.h |
| 132 | # End Source File | 136 | # End Source File |
| 133 | # Begin Source File | 137 | # Begin Source File |
diff --git a/C/Xz.h b/C/Xz.h --- a/C/Xz.h +++ b/C/Xz.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Xz.h - Xz interface | 1 | /* Xz.h - Xz interface |
| 2 | 2024-01-26 : Igor Pavlov : Public domain */ | 2 | Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_XZ_H | 4 | #ifndef ZIP7_INC_XZ_H |
| 5 | #define ZIP7_INC_XZ_H | 5 | #define ZIP7_INC_XZ_H |
| @@ -121,6 +121,7 @@ typedef struct | |||
| 121 | UInt64 startOffset; | 121 | UInt64 startOffset; |
| 122 | } CXzStream; | 122 | } CXzStream; |
| 123 | 123 | ||
| 124 | #define Xz_CONSTRUCT(p) { (p)->numBlocks = 0; (p)->blocks = NULL; (p)->flags = 0; } | ||
| 124 | void Xz_Construct(CXzStream *p); | 125 | void Xz_Construct(CXzStream *p); |
| 125 | void Xz_Free(CXzStream *p, ISzAllocPtr alloc); | 126 | void Xz_Free(CXzStream *p, ISzAllocPtr alloc); |
| 126 | 127 | ||
| @@ -136,8 +137,13 @@ typedef struct | |||
| 136 | CXzStream *streams; | 137 | CXzStream *streams; |
| 137 | } CXzs; | 138 | } CXzs; |
| 138 | 139 | ||
| 140 | #define Xzs_CONSTRUCT(p) { (p)->num = 0; (p)->numAllocated = 0; (p)->streams = NULL; } | ||
| 139 | void Xzs_Construct(CXzs *p); | 141 | void Xzs_Construct(CXzs *p); |
| 140 | void Xzs_Free(CXzs *p, ISzAllocPtr alloc); | 142 | void Xzs_Free(CXzs *p, ISzAllocPtr alloc); |
| 143 | /* | ||
| 144 | Xzs_ReadBackward() must be called on an empty CXzs object. | ||
| 145 | Xzs_ReadBackward() can return a non-empty object with (p->num != 0) even in case of error. | ||
| 146 | */ | ||
| 141 | SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc); | 147 | SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr inStream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc); |
| 142 | 148 | ||
| 143 | UInt64 Xzs_GetNumBlocks(const CXzs *p); | 149 | UInt64 Xzs_GetNumBlocks(const CXzs *p); |
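Per the comment just added, Xzs_ReadBackward() must start from an empty object and can leave allocated streams behind even on failure, so the caller should free unconditionally. A minimal call-pattern sketch; the PrintXzInfo helper is a hypothetical name, and the look-stream and allocator are assumed to be set up by the caller:

    #include <stdio.h>
    #include "Xz.h"

    static SRes PrintXzInfo(ILookInStreamPtr stream, ISzAllocPtr alloc)
    {
      CXzs xzs;
      Int64 startOffset = 0;
      SRes res;
      Xzs_Construct(&xzs);
      res = Xzs_ReadBackward(&xzs, stream, &startOffset, NULL, alloc);
      if (res == SZ_OK)
        printf("streams: %u, blocks: %u\n",
            (unsigned)xzs.num, (unsigned)Xzs_GetNumBlocks(&xzs));
      Xzs_Free(&xzs, alloc);  // free unconditionally: (p->num != 0) is possible even on error
      return res;
    }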
| @@ -268,8 +274,8 @@ typedef struct | |||
| 268 | size_t outBufSize; | 274 | size_t outBufSize; |
| 269 | size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked | 275 | size_t outDataWritten; // the size of data in (outBuf) that were fully unpacked |
| 270 | 276 | ||
| 271 | Byte shaDigest[SHA256_DIGEST_SIZE]; | 277 | UInt32 shaDigest32[SHA256_DIGEST_SIZE / 4]; |
| 272 | Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; | 278 | Byte buf[XZ_BLOCK_HEADER_SIZE_MAX]; // it must be aligned to 4 bytes |
| 273 | } CXzUnpacker; | 279 | } CXzUnpacker; |
| 274 | 280 | ||
| 275 | /* alloc : aligned for cache line allocation is better */ | 281 | /* alloc : aligned for cache line allocation is better */ |
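Several of the changes below (XzDec.c, XzIn.c) rely on the same idiom as shaDigest32 above: declaring digest or header storage as a UInt32 array guarantees 4-byte alignment, so the aligned accessors (GetUi32a, GetUi16a, GetBe16a) and word-wise memcmp stay safe on strict-alignment targets. A minimal sketch of the idiom, with (sha) and (expected32) as assumed locals:

    UInt32 digest32[SHA256_DIGEST_SIZE / 4];        // 4-byte aligned by construction
    Sha256_Final(&sha, (Byte *)(void *)digest32);   // byte view where the API needs one
    if (memcmp(digest32, expected32, SHA256_DIGEST_SIZE) != 0)
      return SZ_ERROR_CRC;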
diff --git a/C/XzCrc64Opt.c b/C/XzCrc64Opt.c index 0c1fc2f..6eea4a3 100644 --- a/C/XzCrc64Opt.c +++ b/C/XzCrc64Opt.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* XzCrc64Opt.c -- CRC64 calculation (optimized functions) | 1 | /* XzCrc64Opt.c -- CRC64 calculation (optimized functions) |
| 2 | 2023-12-08 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -235,7 +235,7 @@ CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE) | |||
| 235 | v = Q32BE(1, w1) ^ Q32BE(0, w0); | 235 | v = Q32BE(1, w1) ^ Q32BE(0, w0); |
| 236 | v ^= Q32BE(3, d1) ^ Q32BE(2, d0); | 236 | v ^= Q32BE(3, d1) ^ Q32BE(2, d0); |
| 237 | #endif | 237 | #endif |
| 238 | #elif | 238 | #else |
| 239 | #error Stop_Compiling_Bad_CRC64_NUM_TABLES | 239 | #error Stop_Compiling_Bad_CRC64_NUM_TABLES |
| 240 | #endif | 240 | #endif |
| 241 | p += Z7_CRC64_NUM_TABLES_USE; | 241 | p += Z7_CRC64_NUM_TABLES_USE; |
diff --git a/C/XzDec.c b/C/XzDec.c --- a/C/XzDec.c +++ b/C/XzDec.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* XzDec.c -- Xz Decode | 1 | /* XzDec.c -- Xz Decode |
| 2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -59,7 +59,7 @@ unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value) | |||
| 59 | 59 | ||
| 60 | for (i = 0; i < limit;) | 60 | for (i = 0; i < limit;) |
| 61 | { | 61 | { |
| 62 | Byte b = p[i]; | 62 | const unsigned b = p[i]; |
| 63 | *value |= (UInt64)(b & 0x7F) << (7 * i++); | 63 | *value |= (UInt64)(b & 0x7F) << (7 * i++); |
| 64 | if ((b & 0x80) == 0) | 64 | if ((b & 0x80) == 0) |
| 65 | return (b == 0 && i != 1) ? 0 : i; | 65 | return (b == 0 && i != 1) ? 0 : i; |
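Xz_ReadVarInt() decodes the standard xz variable-length integer: 7 data bits per byte, least-significant group first, with bit 7 as the continuation flag; a trailing zero byte (b == 0 with i != 1) is rejected as non-canonical. For example, 300 (0x12C) encodes as 0xAC 0x02. A sketch:

    const Byte enc[2] = { 0xAC, 0x02 };   // 300 = 0x2C | (0x02 << 7)
    UInt64 value;
    const unsigned len = Xz_ReadVarInt(enc, sizeof(enc), &value);
    // here: len == 2 and value == 300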
| @@ -796,11 +796,10 @@ SRes Xz_ParseHeader(CXzStreamFlags *p, const Byte *buf) | |||
| 796 | 796 | ||
| 797 | static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf) | 797 | static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte *buf) |
| 798 | { | 798 | { |
| 799 | return indexSize == (((UInt64)GetUi32(buf + 4) + 1) << 2) | 799 | return indexSize == (((UInt64)GetUi32a(buf + 4) + 1) << 2) |
| 800 | && GetUi32(buf) == CrcCalc(buf + 4, 6) | 800 | && GetUi32a(buf) == CrcCalc(buf + 4, 6) |
| 801 | && flags == GetBe16(buf + 8) | 801 | && flags == GetBe16a(buf + 8) |
| 802 | && buf[10] == XZ_FOOTER_SIG_0 | 802 | && GetUi16a(buf + 10) == (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8)); |
| 803 | && buf[11] == XZ_FOOTER_SIG_1; | ||
| 804 | } | 803 | } |
| 805 | 804 | ||
| 806 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ | 805 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ |
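The rewritten footer check reads the whole 12-byte footer with aligned accessors; in particular, the 2-byte signature compare collapses into one 16-bit little-endian load. The equivalence, written out ('Y' is 0x59 and 'Z' is 0x5A in the xz format):

    // byte-wise, as before:
    //   buf[10] == XZ_FOOTER_SIG_0 && buf[11] == XZ_FOOTER_SIG_1    // 'Y', 'Z'
    // one aligned 16-bit little-endian load, as now:
    //   GetUi16a(buf + 10) == (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8))   // 0x5A59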
| @@ -1166,7 +1165,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
| 1166 | p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks); | 1165 | p->indexPreSize = 1 + Xz_WriteVarInt(p->buf + 1, p->numBlocks); |
| 1167 | p->indexPos = p->indexPreSize; | 1166 | p->indexPos = p->indexPreSize; |
| 1168 | p->indexSize += p->indexPreSize; | 1167 | p->indexSize += p->indexPreSize; |
| 1169 | Sha256_Final(&p->sha, p->shaDigest); | 1168 | Sha256_Final(&p->sha, (Byte *)(void *)p->shaDigest32); |
| 1170 | Sha256_Init(&p->sha); | 1169 | Sha256_Init(&p->sha); |
| 1171 | p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize); | 1170 | p->crc = CrcUpdate(CRC_INIT_VAL, p->buf, p->indexPreSize); |
| 1172 | p->state = XZ_STATE_STREAM_INDEX; | 1171 | p->state = XZ_STATE_STREAM_INDEX; |
| @@ -1241,10 +1240,10 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
| 1241 | break; | 1240 | break; |
| 1242 | } | 1241 | } |
| 1243 | { | 1242 | { |
| 1244 | Byte digest[XZ_CHECK_SIZE_MAX]; | 1243 | UInt32 digest32[XZ_CHECK_SIZE_MAX / 4]; |
| 1245 | p->state = XZ_STATE_BLOCK_HEADER; | 1244 | p->state = XZ_STATE_BLOCK_HEADER; |
| 1246 | p->pos = 0; | 1245 | p->pos = 0; |
| 1247 | if (XzCheck_Final(&p->check, digest) && memcmp(digest, p->buf, checkSize) != 0) | 1246 | if (XzCheck_Final(&p->check, (void *)digest32) && memcmp(digest32, p->buf, checkSize) != 0) |
| 1248 | return SZ_ERROR_CRC; | 1247 | return SZ_ERROR_CRC; |
| 1249 | if (p->decodeOnlyOneBlock) | 1248 | if (p->decodeOnlyOneBlock) |
| 1250 | { | 1249 | { |
| @@ -1289,12 +1288,12 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
| 1289 | } | 1288 | } |
| 1290 | else | 1289 | else |
| 1291 | { | 1290 | { |
| 1292 | Byte digest[SHA256_DIGEST_SIZE]; | 1291 | UInt32 digest32[SHA256_DIGEST_SIZE / 4]; |
| 1293 | p->state = XZ_STATE_STREAM_INDEX_CRC; | 1292 | p->state = XZ_STATE_STREAM_INDEX_CRC; |
| 1294 | p->indexSize += 4; | 1293 | p->indexSize += 4; |
| 1295 | p->pos = 0; | 1294 | p->pos = 0; |
| 1296 | Sha256_Final(&p->sha, digest); | 1295 | Sha256_Final(&p->sha, (void *)digest32); |
| 1297 | if (memcmp(digest, p->shaDigest, SHA256_DIGEST_SIZE) != 0) | 1296 | if (memcmp(digest32, p->shaDigest32, SHA256_DIGEST_SIZE) != 0) |
| 1298 | return SZ_ERROR_CRC; | 1297 | return SZ_ERROR_CRC; |
| 1299 | } | 1298 | } |
| 1300 | } | 1299 | } |
| @@ -1313,7 +1312,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
| 1313 | const Byte *ptr = p->buf; | 1312 | const Byte *ptr = p->buf; |
| 1314 | p->state = XZ_STATE_STREAM_FOOTER; | 1313 | p->state = XZ_STATE_STREAM_FOOTER; |
| 1315 | p->pos = 0; | 1314 | p->pos = 0; |
| 1316 | if (CRC_GET_DIGEST(p->crc) != GetUi32(ptr)) | 1315 | if (CRC_GET_DIGEST(p->crc) != GetUi32a(ptr)) |
| 1317 | return SZ_ERROR_CRC; | 1316 | return SZ_ERROR_CRC; |
| 1318 | } | 1317 | } |
| 1319 | break; | 1318 | break; |
| @@ -1343,7 +1342,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
| 1343 | { | 1342 | { |
| 1344 | if (*src != 0) | 1343 | if (*src != 0) |
| 1345 | { | 1344 | { |
| 1346 | if (((UInt32)p->padSize & 3) != 0) | 1345 | if ((unsigned)p->padSize & 3) |
| 1347 | return SZ_ERROR_NO_ARCHIVE; | 1346 | return SZ_ERROR_NO_ARCHIVE; |
| 1348 | p->pos = 0; | 1347 | p->pos = 0; |
| 1349 | p->state = XZ_STATE_STREAM_HEADER; | 1348 | p->state = XZ_STATE_STREAM_HEADER; |
diff --git a/C/XzEnc.c b/C/XzEnc.c --- a/C/XzEnc.c +++ b/C/XzEnc.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* XzEnc.c -- Xz Encode | 1 | /* XzEnc.c -- Xz Encode |
| 2 | 2024-03-01 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -411,6 +411,7 @@ static SRes SeqInFilter_Read(ISeqInStreamPtr pp, void *data, size_t *size) | |||
| 411 | } | 411 | } |
| 412 | } | 412 | } |
| 413 | 413 | ||
| 414 | Z7_FORCE_INLINE | ||
| 414 | static void SeqInFilter_Construct(CSeqInFilter *p) | 415 | static void SeqInFilter_Construct(CSeqInFilter *p) |
| 415 | { | 416 | { |
| 416 | p->buf = NULL; | 417 | p->buf = NULL; |
| @@ -418,6 +419,7 @@ static void SeqInFilter_Construct(CSeqInFilter *p) | |||
| 418 | p->vt.Read = SeqInFilter_Read; | 419 | p->vt.Read = SeqInFilter_Read; |
| 419 | } | 420 | } |
| 420 | 421 | ||
| 422 | Z7_FORCE_INLINE | ||
| 421 | static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc) | 423 | static void SeqInFilter_Free(CSeqInFilter *p, ISzAllocPtr alloc) |
| 422 | { | 424 | { |
| 423 | if (p->StateCoder.p) | 425 | if (p->StateCoder.p) |
| @@ -507,6 +509,7 @@ void XzFilterProps_Init(CXzFilterProps *p) | |||
| 507 | void XzProps_Init(CXzProps *p) | 509 | void XzProps_Init(CXzProps *p) |
| 508 | { | 510 | { |
| 509 | p->checkId = XZ_CHECK_CRC32; | 511 | p->checkId = XZ_CHECK_CRC32; |
| 512 | p->numThreadGroups = 0; | ||
| 510 | p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO; | 513 | p->blockSize = XZ_PROPS_BLOCK_SIZE_AUTO; |
| 511 | p->numBlockThreads_Reduced = -1; | 514 | p->numBlockThreads_Reduced = -1; |
| 512 | p->numBlockThreads_Max = -1; | 515 | p->numBlockThreads_Max = -1; |
| @@ -689,6 +692,7 @@ typedef struct | |||
| 689 | } CLzma2WithFilters; | 692 | } CLzma2WithFilters; |
| 690 | 693 | ||
| 691 | 694 | ||
| 695 | Z7_FORCE_INLINE | ||
| 692 | static void Lzma2WithFilters_Construct(CLzma2WithFilters *p) | 696 | static void Lzma2WithFilters_Construct(CLzma2WithFilters *p) |
| 693 | { | 697 | { |
| 694 | p->lzma2 = NULL; | 698 | p->lzma2 = NULL; |
| @@ -712,6 +716,7 @@ static SRes Lzma2WithFilters_Create(CLzma2WithFilters *p, ISzAllocPtr alloc, ISz | |||
| 712 | } | 716 | } |
| 713 | 717 | ||
| 714 | 718 | ||
| 719 | Z7_FORCE_INLINE | ||
| 715 | static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc) | 720 | static void Lzma2WithFilters_Free(CLzma2WithFilters *p, ISzAllocPtr alloc) |
| 716 | { | 721 | { |
| 717 | #ifdef USE_SUBBLOCK | 722 | #ifdef USE_SUBBLOCK |
| @@ -1236,6 +1241,7 @@ SRes XzEnc_Encode(CXzEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr in | |||
| 1236 | } | 1241 | } |
| 1237 | 1242 | ||
| 1238 | p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max; | 1243 | p->mtCoder.numThreadsMax = (unsigned)props->numBlockThreads_Max; |
| 1244 | p->mtCoder.numThreadGroups = props->numThreadGroups; | ||
| 1239 | p->mtCoder.expectedDataSize = p->expectedDataSize; | 1245 | p->mtCoder.expectedDataSize = p->expectedDataSize; |
| 1240 | 1246 | ||
| 1241 | RINOK(MtCoder_Code(&p->mtCoder)) | 1247 | RINOK(MtCoder_Code(&p->mtCoder)) |
diff --git a/C/XzEnc.h b/C/XzEnc.h --- a/C/XzEnc.h +++ b/C/XzEnc.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* XzEnc.h -- Xz Encode | 1 | /* XzEnc.h -- Xz Encode |
| 2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_XZ_ENC_H | 4 | #ifndef ZIP7_INC_XZ_ENC_H |
| 5 | #define ZIP7_INC_XZ_ENC_H | 5 | #define ZIP7_INC_XZ_ENC_H |
| @@ -31,6 +31,7 @@ typedef struct | |||
| 31 | CLzma2EncProps lzma2Props; | 31 | CLzma2EncProps lzma2Props; |
| 32 | CXzFilterProps filterProps; | 32 | CXzFilterProps filterProps; |
| 33 | unsigned checkId; | 33 | unsigned checkId; |
| 34 | unsigned numThreadGroups; // 0 : no groups | ||
| 34 | UInt64 blockSize; | 35 | UInt64 blockSize; |
| 35 | int numBlockThreads_Reduced; | 36 | int numBlockThreads_Reduced; |
| 36 | int numBlockThreads_Max; | 37 | int numBlockThreads_Max; |
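The new numThreadGroups field is forwarded to the MtCoder instance in XzEnc_Encode() (see the XzEnc.c hunk above); 0 preserves the old single-group behavior. A hypothetical setup sketch, with illustrative values rather than defaults:

    CXzProps props;
    XzProps_Init(&props);            // sets numThreadGroups = 0 (no groups)
    props.numBlockThreads_Max = 16;  // illustrative
    props.numThreadGroups = 2;       // spread block threads across 2 processor groups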
diff --git a/C/XzIn.c b/C/XzIn.c --- a/C/XzIn.c +++ b/C/XzIn.c | |||
| @@ -1,38 +1,39 @@ | |||
| 1 | /* XzIn.c - Xz input | 1 | /* XzIn.c - Xz input |
| 2 | 2023-09-07 : Igor Pavlov : Public domain */ | 2 | : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| 6 | #include <string.h> | 6 | #include <string.h> |
| 7 | 7 | ||
| 8 | #include "7zCrc.h" | 8 | #include "7zCrc.h" |
| 9 | #include "CpuArch.h" | ||
| 10 | #include "Xz.h" | 9 | #include "Xz.h" |
| 10 | #include "CpuArch.h" | ||
| 11 | 11 | ||
| 12 | /* | 12 | #define XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(p) \ |
| 13 | #define XZ_FOOTER_SIG_CHECK(p) (memcmp((p), XZ_FOOTER_SIG, XZ_FOOTER_SIG_SIZE) == 0) | 13 | (GetUi16a((const Byte *)(const void *)(p) + 10) == \ |
| 14 | */ | 14 | (XZ_FOOTER_SIG_0 | (XZ_FOOTER_SIG_1 << 8))) |
| 15 | #define XZ_FOOTER_SIG_CHECK(p) ((p)[0] == XZ_FOOTER_SIG_0 && (p)[1] == XZ_FOOTER_SIG_1) | ||
| 16 | |||
| 17 | 15 | ||
| 18 | SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream) | 16 | SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream) |
| 19 | { | 17 | { |
| 20 | Byte sig[XZ_STREAM_HEADER_SIZE]; | 18 | UInt32 data32[XZ_STREAM_HEADER_SIZE / 4]; |
| 21 | size_t processedSize = XZ_STREAM_HEADER_SIZE; | 19 | size_t processedSize = XZ_STREAM_HEADER_SIZE; |
| 22 | RINOK(SeqInStream_ReadMax(inStream, sig, &processedSize)) | 20 | RINOK(SeqInStream_ReadMax(inStream, data32, &processedSize)) |
| 23 | if (processedSize != XZ_STREAM_HEADER_SIZE | 21 | if (processedSize != XZ_STREAM_HEADER_SIZE |
| 24 | || memcmp(sig, XZ_SIG, XZ_SIG_SIZE) != 0) | 22 | || memcmp(data32, XZ_SIG, XZ_SIG_SIZE) != 0) |
| 25 | return SZ_ERROR_NO_ARCHIVE; | 23 | return SZ_ERROR_NO_ARCHIVE; |
| 26 | return Xz_ParseHeader(p, sig); | 24 | return Xz_ParseHeader(p, (const Byte *)(const void *)data32); |
| 27 | } | 25 | } |
| 28 | 26 | ||
| 29 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ | 27 | #define READ_VARINT_AND_CHECK(buf, size, res) \ |
| 30 | { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ | 28 | { const unsigned s = Xz_ReadVarInt(buf, size, res); \ |
| 31 | if (s == 0) return SZ_ERROR_ARCHIVE; \ | 29 | if (s == 0) return SZ_ERROR_ARCHIVE; \ |
| 32 | pos += s; } | 30 | size -= s; \ |
| 31 | buf += s; \ | ||
| 32 | } | ||
| 33 | 33 | ||
| 34 | SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes) | 34 | SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, UInt32 *headerSizeRes) |
| 35 | { | 35 | { |
| 36 | MY_ALIGN(4) | ||
| 36 | Byte header[XZ_BLOCK_HEADER_SIZE_MAX]; | 37 | Byte header[XZ_BLOCK_HEADER_SIZE_MAX]; |
| 37 | unsigned headerSize; | 38 | unsigned headerSize; |
| 38 | *headerSizeRes = 0; | 39 | *headerSizeRes = 0; |
| @@ -57,8 +58,12 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, | |||
| 57 | return XzBlock_Parse(p, header); | 58 | return XzBlock_Parse(p, header); |
| 58 | } | 59 | } |
| 59 | 60 | ||
| 61 | |||
| 60 | #define ADD_SIZE_CHECK(size, val) \ | 62 | #define ADD_SIZE_CHECK(size, val) \ |
| 61 | { const UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } | 63 | { const UInt64 newSize = size + (val); \ |
| 64 | if (newSize < size) return XZ_SIZE_OVERFLOW; \ | ||
| 65 | size = newSize; \ | ||
| 66 | } | ||
| 62 | 67 | ||
| 63 | UInt64 Xz_GetUnpackSize(const CXzStream *p) | 68 | UInt64 Xz_GetUnpackSize(const CXzStream *p) |
| 64 | { | 69 | { |
| @@ -82,76 +87,85 @@ UInt64 Xz_GetPackSize(const CXzStream *p) | |||
| 82 | return size; | 87 | return size; |
| 83 | } | 88 | } |
| 84 | 89 | ||
| 85 | /* | ||
| 86 | SRes XzBlock_ReadFooter(CXzBlock *p, CXzStreamFlags f, ISeqInStreamPtr inStream) | ||
| 87 | { | ||
| 88 | return SeqInStream_Read(inStream, p->check, XzFlags_GetCheckSize(f)); | ||
| 89 | } | ||
| 90 | */ | ||
| 91 | 90 | ||
| 92 | static SRes Xz_ReadIndex2(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc) | 91 | // input: |
| 92 | // CXzStream (p) is an empty object. |
| 93 | // size != 0 | ||
| 94 | // (size & 3) == 0 | ||
| 95 | // (buf) is aligned to at least 4 bytes. |
| 96 | // output: | ||
| 97 | // p->numBlocks is the number of allocated items in p->blocks |
| 98 | // p->blocks[*] values must be ignored if the function returns an error. |
| 99 | static SRes Xz_ParseIndex(CXzStream *p, const Byte *buf, size_t size, ISzAllocPtr alloc) | ||
| 93 | { | 100 | { |
| 94 | size_t numBlocks, pos = 1; | 101 | size_t numBlocks; |
| 95 | UInt32 crc; | ||
| 96 | |||
| 97 | if (size < 5 || buf[0] != 0) | 102 | if (size < 5 || buf[0] != 0) |
| 98 | return SZ_ERROR_ARCHIVE; | 103 | return SZ_ERROR_ARCHIVE; |
| 99 | |||
| 100 | size -= 4; | 104 | size -= 4; |
| 101 | crc = CrcCalc(buf, size); | 105 | { |
| 102 | if (crc != GetUi32(buf + size)) | 106 | const UInt32 crc = CrcCalc(buf, size); |
| 103 | return SZ_ERROR_ARCHIVE; | 107 | if (crc != GetUi32a(buf + size)) |
| 104 | 108 | return SZ_ERROR_ARCHIVE; | |
| 109 | } | ||
| 110 | buf++; | ||
| 111 | size--; | ||
| 105 | { | 112 | { |
| 106 | UInt64 numBlocks64; | 113 | UInt64 numBlocks64; |
| 107 | READ_VARINT_AND_CHECK(buf, pos, size, &numBlocks64) | 114 | READ_VARINT_AND_CHECK(buf, size, &numBlocks64) |
| 108 | numBlocks = (size_t)numBlocks64; | 115 | // (numBlocks64) is a 63-bit value, so we can calculate (numBlocks64 * 2) without overflow: |
| 109 | if (numBlocks != numBlocks64 || numBlocks * 2 > size) | 116 | if (numBlocks64 * 2 > size) |
| 110 | return SZ_ERROR_ARCHIVE; | 117 | return SZ_ERROR_ARCHIVE; |
| 118 | if (numBlocks64 >= ((size_t)1 << (sizeof(size_t) * 8 - 1)) / sizeof(CXzBlockSizes)) | ||
| 119 | return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE | ||
| 120 | numBlocks = (size_t)numBlocks64; | ||
| 111 | } | 121 | } |
| 112 | 122 | // Xz_Free(p, alloc); // it's optional, because (p) is empty already | |
| 113 | Xz_Free(p, alloc); | 123 | if (numBlocks) |
| 114 | if (numBlocks != 0) | ||
| 115 | { | 124 | { |
| 116 | size_t i; | 125 | CXzBlockSizes *blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks); |
| 117 | p->numBlocks = numBlocks; | 126 | if (!blocks) |
| 118 | p->blocks = (CXzBlockSizes *)ISzAlloc_Alloc(alloc, sizeof(CXzBlockSizes) * numBlocks); | ||
| 119 | if (!p->blocks) | ||
| 120 | return SZ_ERROR_MEM; | 127 | return SZ_ERROR_MEM; |
| 121 | for (i = 0; i < numBlocks; i++) | 128 | p->blocks = blocks; |
| 129 | p->numBlocks = numBlocks; | ||
| 130 | // the caller will call Xz_Free() in case of error | ||
| 131 | do | ||
| 122 | { | 132 | { |
| 123 | CXzBlockSizes *block = &p->blocks[i]; | 133 | READ_VARINT_AND_CHECK(buf, size, &blocks->totalSize) |
| 124 | READ_VARINT_AND_CHECK(buf, pos, size, &block->totalSize) | 134 | READ_VARINT_AND_CHECK(buf, size, &blocks->unpackSize) |
| 125 | READ_VARINT_AND_CHECK(buf, pos, size, &block->unpackSize) | 135 | if (blocks->totalSize == 0) |
| 126 | if (block->totalSize == 0) | ||
| 127 | return SZ_ERROR_ARCHIVE; | 136 | return SZ_ERROR_ARCHIVE; |
| 137 | blocks++; | ||
| 128 | } | 138 | } |
| 139 | while (--numBlocks); | ||
| 129 | } | 140 | } |
| 130 | while ((pos & 3) != 0) | 141 | if (size >= 4) |
| 131 | if (buf[pos++] != 0) | 142 | return SZ_ERROR_ARCHIVE; |
| 143 | while (size) | ||
| 144 | if (buf[--size]) | ||
| 132 | return SZ_ERROR_ARCHIVE; | 145 | return SZ_ERROR_ARCHIVE; |
| 133 | return (pos == size) ? SZ_OK : SZ_ERROR_ARCHIVE; | 146 | return SZ_OK; |
| 134 | } | 147 | } |
| 135 | 148 | ||
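For reference, the buffer that Xz_ParseIndex() consumes is the whole Index field of an xz stream: one 0x00 indicator byte, the record-count varint, one (totalSize, unpackSize) varint pair per block, zero padding up to a 4-byte boundary, and a little-endian CRC32 of everything before it. A worked single-block example (the CRC bytes are placeholders, not computed here):

    // index for one block with totalSize = 0x18, unpackSize = 0x0A:
    //   0x00                  indicator byte
    //   0x01                  number of records (varint)
    //   0x18 0x0A             totalSize, unpackSize (varints)
    //                         (already on a 4-byte boundary: no padding bytes needed)
    //   0x?? 0x?? 0x?? 0x??   CRC32 of the previous 4 bytes, little-endian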
| 149 | |||
| 150 | /* | ||
| 136 | static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc) | 151 | static SRes Xz_ReadIndex(CXzStream *p, ILookInStreamPtr stream, UInt64 indexSize, ISzAllocPtr alloc) |
| 137 | { | 152 | { |
| 138 | SRes res; | 153 | SRes res; |
| 139 | size_t size; | 154 | size_t size; |
| 140 | Byte *buf; | 155 | Byte *buf; |
| 141 | if (indexSize > ((UInt32)1 << 31)) | 156 | if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1))) |
| 142 | return SZ_ERROR_UNSUPPORTED; | 157 | return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE |
| 143 | size = (size_t)indexSize; | 158 | size = (size_t)indexSize; |
| 144 | if (size != indexSize) | ||
| 145 | return SZ_ERROR_UNSUPPORTED; | ||
| 146 | buf = (Byte *)ISzAlloc_Alloc(alloc, size); | 159 | buf = (Byte *)ISzAlloc_Alloc(alloc, size); |
| 147 | if (!buf) | 160 | if (!buf) |
| 148 | return SZ_ERROR_MEM; | 161 | return SZ_ERROR_MEM; |
| 149 | res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); | 162 | res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); |
| 150 | if (res == SZ_OK) | 163 | if (res == SZ_OK) |
| 151 | res = Xz_ReadIndex2(p, buf, size, alloc); | 164 | res = Xz_ParseIndex(p, buf, size, alloc); |
| 152 | ISzAlloc_Free(alloc, buf); | 165 | ISzAlloc_Free(alloc, buf); |
| 153 | return res; | 166 | return res; |
| 154 | } | 167 | } |
| 168 | */ | ||
| 155 | 169 | ||
| 156 | static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size) | 170 | static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, void *buf, size_t size) |
| 157 | { | 171 | { |
| @@ -160,84 +174,102 @@ static SRes LookInStream_SeekRead_ForArc(ILookInStreamPtr stream, UInt64 offset, | |||
| 160 | /* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */ | 174 | /* return LookInStream_Read2(stream, buf, size, SZ_ERROR_NO_ARCHIVE); */ |
| 161 | } | 175 | } |
| 162 | 176 | ||
| 177 | |||
| 178 | /* | ||
| 179 | in: | ||
| 180 | (*startOffset) is position in (stream) where xz_stream must be finished. | ||
| 181 | out: | ||
| 182 | if returns SZ_OK, then (*startOffset) is position in stream that shows start of xz_stream. | ||
| 183 | */ | ||
| 163 | static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc) | 184 | static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startOffset, ISzAllocPtr alloc) |
| 164 | { | 185 | { |
| 165 | UInt64 indexSize; | 186 | #define TEMP_BUF_SIZE (1 << 10) |
| 166 | Byte buf[XZ_STREAM_FOOTER_SIZE]; | 187 | UInt32 buf32[TEMP_BUF_SIZE / 4]; |
| 167 | UInt64 pos = (UInt64)*startOffset; | 188 | UInt64 pos = (UInt64)*startOffset; |
| 168 | 189 | ||
| 169 | if ((pos & 3) != 0 || pos < XZ_STREAM_FOOTER_SIZE) | 190 | if ((pos & 3) || pos < XZ_STREAM_FOOTER_SIZE) |
| 170 | return SZ_ERROR_NO_ARCHIVE; | 191 | return SZ_ERROR_NO_ARCHIVE; |
| 171 | |||
| 172 | pos -= XZ_STREAM_FOOTER_SIZE; | 192 | pos -= XZ_STREAM_FOOTER_SIZE; |
| 173 | RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)) | 193 | RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE)) |
| 174 | 194 | ||
| 175 | if (!XZ_FOOTER_SIG_CHECK(buf + 10)) | 195 | if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32)) |
| 176 | { | 196 | { |
| 177 | UInt32 total = 0; | ||
| 178 | pos += XZ_STREAM_FOOTER_SIZE; | 197 | pos += XZ_STREAM_FOOTER_SIZE; |
| 179 | |||
| 180 | for (;;) | 198 | for (;;) |
| 181 | { | 199 | { |
| 182 | size_t i; | 200 | // pos != 0 |
| 183 | #define TEMP_BUF_SIZE (1 << 10) | 201 | // (pos & 3) == 0 |
| 184 | Byte temp[TEMP_BUF_SIZE]; | 202 | size_t i = pos >= TEMP_BUF_SIZE ? TEMP_BUF_SIZE : (size_t)pos; |
| 185 | |||
| 186 | i = (pos > TEMP_BUF_SIZE) ? TEMP_BUF_SIZE : (size_t)pos; | ||
| 187 | pos -= i; | 203 | pos -= i; |
| 188 | RINOK(LookInStream_SeekRead_ForArc(stream, pos, temp, i)) | 204 | RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, i)) |
| 189 | total += (UInt32)i; | 205 | i /= 4; |
| 190 | for (; i != 0; i--) | 206 | do |
| 191 | if (temp[i - 1] != 0) | 207 | if (buf32[i - 1] != 0) |
| 192 | break; | 208 | break; |
| 193 | if (i != 0) | 209 | while (--i); |
| 194 | { | 210 | |
| 195 | if ((i & 3) != 0) | 211 | pos += i * 4; |
| 196 | return SZ_ERROR_NO_ARCHIVE; | 212 | #define XZ_STREAM_BACKWARD_READING_PAD_MAX (1 << 16) |
| 197 | pos += i; | 213 | // here we don't support rare case with big padding for xz stream. |
| 198 | break; | 214 | // so we have padding limit for backward reading. |
| 199 | } | 215 | if ((UInt64)*startOffset - pos > XZ_STREAM_BACKWARD_READING_PAD_MAX) |
| 200 | if (pos < XZ_STREAM_FOOTER_SIZE || total > (1 << 16)) | ||
| 201 | return SZ_ERROR_NO_ARCHIVE; | 216 | return SZ_ERROR_NO_ARCHIVE; |
| 217 | if (i) | ||
| 218 | break; | ||
| 202 | } | 219 | } |
| 203 | 220 | // we try to open xz stream after skipping zero padding. | |
| 221 | // ((UInt64)*startOffset == pos) is possible here! | ||
| 204 | if (pos < XZ_STREAM_FOOTER_SIZE) | 222 | if (pos < XZ_STREAM_FOOTER_SIZE) |
| 205 | return SZ_ERROR_NO_ARCHIVE; | 223 | return SZ_ERROR_NO_ARCHIVE; |
| 206 | pos -= XZ_STREAM_FOOTER_SIZE; | 224 | pos -= XZ_STREAM_FOOTER_SIZE; |
| 207 | RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf, XZ_STREAM_FOOTER_SIZE)) | 225 | RINOK(LookInStream_SeekRead_ForArc(stream, pos, buf32, XZ_STREAM_FOOTER_SIZE)) |
| 208 | if (!XZ_FOOTER_SIG_CHECK(buf + 10)) | 226 | if (!XZ_FOOTER_12B_ALIGNED16_SIG_CHECK(buf32)) |
| 209 | return SZ_ERROR_NO_ARCHIVE; | 227 | return SZ_ERROR_NO_ARCHIVE; |
| 210 | } | 228 | } |
| 211 | 229 | ||
| 212 | p->flags = (CXzStreamFlags)GetBe16(buf + 8); | 230 | p->flags = (CXzStreamFlags)GetBe16a(buf32 + 2); |
| 213 | |||
| 214 | if (!XzFlags_IsSupported(p->flags)) | 231 | if (!XzFlags_IsSupported(p->flags)) |
| 215 | return SZ_ERROR_UNSUPPORTED; | 232 | return SZ_ERROR_UNSUPPORTED; |
| 216 | |||
| 217 | { | 233 | { |
| 218 | /* to eliminate GCC 6.3 warning: | 234 | /* to eliminate GCC 6.3 warning: |
| 219 | dereferencing type-punned pointer will break strict-aliasing rules */ | 235 | dereferencing type-punned pointer will break strict-aliasing rules */ |
| 220 | const Byte *buf_ptr = buf; | 236 | const UInt32 *buf_ptr = buf32; |
| 221 | if (GetUi32(buf_ptr) != CrcCalc(buf + 4, 6)) | 237 | if (GetUi32a(buf_ptr) != CrcCalc(buf32 + 1, 6)) |
| 222 | return SZ_ERROR_ARCHIVE; | 238 | return SZ_ERROR_ARCHIVE; |
| 223 | } | 239 | } |
| 224 | |||
| 225 | indexSize = ((UInt64)GetUi32(buf + 4) + 1) << 2; | ||
| 226 | |||
| 227 | if (pos < indexSize) | ||
| 228 | return SZ_ERROR_ARCHIVE; | ||
| 229 | |||
| 230 | pos -= indexSize; | ||
| 231 | RINOK(LookInStream_SeekTo(stream, pos)) | ||
| 232 | RINOK(Xz_ReadIndex(p, stream, indexSize, alloc)) | ||
| 233 | |||
| 234 | { | 240 | { |
| 235 | UInt64 totalSize = Xz_GetPackSize(p); | 241 | const UInt64 indexSize = ((UInt64)GetUi32a(buf32 + 1) + 1) << 2; |
| 236 | if (totalSize == XZ_SIZE_OVERFLOW | 242 | if (pos < indexSize) |
| 237 | || totalSize >= ((UInt64)1 << 63) | ||
| 238 | || pos < totalSize + XZ_STREAM_HEADER_SIZE) | ||
| 239 | return SZ_ERROR_ARCHIVE; | 243 | return SZ_ERROR_ARCHIVE; |
| 240 | pos -= (totalSize + XZ_STREAM_HEADER_SIZE); | 244 | pos -= indexSize; |
| 245 | // v25.00: relaxed indexSize check. We allow big index table. | ||
| 246 | // if (indexSize > ((UInt32)1 << 31)) | ||
| 247 | if (indexSize >= ((size_t)1 << (sizeof(size_t) * 8 - 1))) | ||
| 248 | return SZ_ERROR_MEM; // SZ_ERROR_ARCHIVE | ||
| 249 | RINOK(LookInStream_SeekTo(stream, pos)) | ||
| 250 | // RINOK(Xz_ReadIndex(p, stream, indexSize, alloc)) | ||
| 251 | { | ||
| 252 | SRes res; | ||
| 253 | const size_t size = (size_t)indexSize; | ||
| 254 | // if (size != indexSize) return SZ_ERROR_UNSUPPORTED; | ||
| 255 | Byte *buf = (Byte *)ISzAlloc_Alloc(alloc, size); | ||
| 256 | if (!buf) | ||
| 257 | return SZ_ERROR_MEM; | ||
| 258 | res = LookInStream_Read2(stream, buf, size, SZ_ERROR_UNSUPPORTED); | ||
| 259 | if (res == SZ_OK) | ||
| 260 | res = Xz_ParseIndex(p, buf, size, alloc); | ||
| 261 | ISzAlloc_Free(alloc, buf); | ||
| 262 | RINOK(res) | ||
| 263 | } | ||
| 264 | } | ||
| 265 | { | ||
| 266 | UInt64 total = Xz_GetPackSize(p); | ||
| 267 | if (total == XZ_SIZE_OVERFLOW || total >= ((UInt64)1 << 63)) | ||
| 268 | return SZ_ERROR_ARCHIVE; | ||
| 269 | total += XZ_STREAM_HEADER_SIZE; | ||
| 270 | if (pos < total) | ||
| 271 | return SZ_ERROR_ARCHIVE; | ||
| 272 | pos -= total; | ||
| 241 | RINOK(LookInStream_SeekTo(stream, pos)) | 273 | RINOK(LookInStream_SeekTo(stream, pos)) |
| 242 | *startOffset = (Int64)pos; | 274 | *startOffset = (Int64)pos; |
| 243 | } | 275 | } |
| @@ -246,7 +278,6 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO | |||
| 246 | CSecToRead secToRead; | 278 | CSecToRead secToRead; |
| 247 | SecToRead_CreateVTable(&secToRead); | 279 | SecToRead_CreateVTable(&secToRead); |
| 248 | secToRead.realStream = stream; | 280 | secToRead.realStream = stream; |
| 249 | |||
| 250 | RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt)) | 281 | RINOK(Xz_ReadHeader(&headerFlags, &secToRead.vt)) |
| 251 | return (p->flags == headerFlags) ? SZ_OK : SZ_ERROR_ARCHIVE; | 282 | return (p->flags == headerFlags) ? SZ_OK : SZ_ERROR_ARCHIVE; |
| 252 | } | 283 | } |
| @@ -257,8 +288,7 @@ static SRes Xz_ReadBackward(CXzStream *p, ILookInStreamPtr stream, Int64 *startO | |||
| 257 | 288 | ||
| 258 | void Xzs_Construct(CXzs *p) | 289 | void Xzs_Construct(CXzs *p) |
| 259 | { | 290 | { |
| 260 | p->num = p->numAllocated = 0; | 291 | Xzs_CONSTRUCT(p) |
| 261 | p->streams = 0; | ||
| 262 | } | 292 | } |
| 263 | 293 | ||
| 264 | void Xzs_Free(CXzs *p, ISzAllocPtr alloc) | 294 | void Xzs_Free(CXzs *p, ISzAllocPtr alloc) |
| @@ -268,7 +298,7 @@ void Xzs_Free(CXzs *p, ISzAllocPtr alloc) | |||
| 268 | Xz_Free(&p->streams[i], alloc); | 298 | Xz_Free(&p->streams[i], alloc); |
| 269 | ISzAlloc_Free(alloc, p->streams); | 299 | ISzAlloc_Free(alloc, p->streams); |
| 270 | p->num = p->numAllocated = 0; | 300 | p->num = p->numAllocated = 0; |
| 271 | p->streams = 0; | 301 | p->streams = NULL; |
| 272 | } | 302 | } |
| 273 | 303 | ||
| 274 | UInt64 Xzs_GetNumBlocks(const CXzs *p) | 304 | UInt64 Xzs_GetNumBlocks(const CXzs *p) |
| @@ -307,34 +337,49 @@ UInt64 Xzs_GetPackSize(const CXzs *p) | |||
| 307 | SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc) | 337 | SRes Xzs_ReadBackward(CXzs *p, ILookInStreamPtr stream, Int64 *startOffset, ICompressProgressPtr progress, ISzAllocPtr alloc) |
| 308 | { | 338 | { |
| 309 | Int64 endOffset = 0; | 339 | Int64 endOffset = 0; |
| 340 | // it is assumed that the CXzs object is empty here. | ||
| 341 | // if the CXzs object is not empty, new streams will be added to that non-empty object. | ||
| 342 | // Xzs_Free(p, alloc); // optional call to make the CXzs object empty. | ||
| 310 | RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END)) | 343 | RINOK(ILookInStream_Seek(stream, &endOffset, SZ_SEEK_END)) |
| 311 | *startOffset = endOffset; | 344 | *startOffset = endOffset; |
| 312 | for (;;) | 345 | for (;;) |
| 313 | { | 346 | { |
| 314 | CXzStream st; | 347 | CXzStream st; |
| 315 | SRes res; | 348 | SRes res; |
| 316 | Xz_Construct(&st); | 349 | Xz_CONSTRUCT(&st) |
| 317 | res = Xz_ReadBackward(&st, stream, startOffset, alloc); | 350 | res = Xz_ReadBackward(&st, stream, startOffset, alloc); |
| 351 | // if (res == SZ_OK), then (*startOffset) is the start offset of the new stream | ||
| 352 | // if (res != SZ_OK), then (*startOffset) is unchanged, or it is the expected start offset of the stream with the error | ||
| 318 | st.startOffset = (UInt64)*startOffset; | 353 | st.startOffset = (UInt64)*startOffset; |
| 319 | RINOK(res) | 354 | // we must store the (st) object in the array, or we must free the local (st) object. |
| 355 | if (res != SZ_OK) | ||
| 356 | { | ||
| 357 | Xz_Free(&st, alloc); | ||
| 358 | return res; | ||
| 359 | } | ||
| 320 | if (p->num == p->numAllocated) | 360 | if (p->num == p->numAllocated) |
| 321 | { | 361 | { |
| 322 | const size_t newNum = p->num + p->num / 4 + 1; | 362 | const size_t newNum = p->num + p->num / 4 + 1; |
| 323 | void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); | 363 | void *data = ISzAlloc_Alloc(alloc, newNum * sizeof(CXzStream)); |
| 324 | if (!data) | 364 | if (!data) |
| 365 | { | ||
| 366 | Xz_Free(&st, alloc); | ||
| 325 | return SZ_ERROR_MEM; | 367 | return SZ_ERROR_MEM; |
| 368 | } | ||
| 326 | p->numAllocated = newNum; | 369 | p->numAllocated = newNum; |
| 327 | if (p->num != 0) | 370 | if (p->num != 0) |
| 328 | memcpy(data, p->streams, p->num * sizeof(CXzStream)); | 371 | memcpy(data, p->streams, p->num * sizeof(CXzStream)); |
| 329 | ISzAlloc_Free(alloc, p->streams); | 372 | ISzAlloc_Free(alloc, p->streams); |
| 330 | p->streams = (CXzStream *)data; | 373 | p->streams = (CXzStream *)data; |
| 331 | } | 374 | } |
| 375 | // we copy the raw data of the local variable (st) directly into the object in the array, | ||
| 376 | // so we don't need to call Xz_Free(&st, alloc) after the copy and after p->num++ | ||
| 332 | p->streams[p->num++] = st; | 377 | p->streams[p->num++] = st; |
| 333 | if (*startOffset == 0) | 378 | if (*startOffset == 0) |
| 334 | break; | 379 | return SZ_OK; |
| 335 | RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset)) | 380 | // seek operation is optional: |
| 381 | // RINOK(LookInStream_SeekTo(stream, (UInt64)*startOffset)) | ||
| 336 | if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK) | 382 | if (progress && ICompressProgress_Progress(progress, (UInt64)(endOffset - *startOffset), (UInt64)(Int64)-1) != SZ_OK) |
| 337 | return SZ_ERROR_PROGRESS; | 383 | return SZ_ERROR_PROGRESS; |
| 338 | } | 384 | } |
| 339 | return SZ_OK; | ||
| 340 | } | 385 | } |
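The reallocation step above grows the stream array geometrically: newNum = num + num/4 + 1 produces the capacity sequence 1, 2, 3, 4, 6, 8, 11, 14, 18, ... (roughly a 1.25x factor), so appending N streams costs O(N) amortized element copies.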
