diff options
author | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2023-06-21 00:00:00 +0000 |
---|---|---|
committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2023-12-17 14:59:19 +0500 |
commit | 5b39dc76f1bc82f941d5c800ab9f34407a06b53a (patch) | |
tree | fe5e17420300b715021a76328444088d32047963 /C/LzFind.c | |
parent | 93be7d4abfd4233228f58ee1fbbcd76d91be66a4 (diff) | |
download | 7zip-5b39dc76f1bc82f941d5c800ab9f34407a06b53a.tar.gz 7zip-5b39dc76f1bc82f941d5c800ab9f34407a06b53a.tar.bz2 7zip-5b39dc76f1bc82f941d5c800ab9f34407a06b53a.zip |
23.0123.01
Diffstat (limited to 'C/LzFind.c')
-rw-r--r-- | C/LzFind.c | 519 |
1 files changed, 304 insertions, 215 deletions
@@ -1,5 +1,5 @@ | |||
1 | /* LzFind.c -- Match finder for LZ algorithms | 1 | /* LzFind.c -- Match finder for LZ algorithms |
2 | 2021-11-29 : Igor Pavlov : Public domain */ | 2 | 2023-03-14 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -17,7 +17,7 @@ | |||
17 | #define kEmptyHashValue 0 | 17 | #define kEmptyHashValue 0 |
18 | 18 | ||
19 | #define kMaxValForNormalize ((UInt32)0) | 19 | #define kMaxValForNormalize ((UInt32)0) |
20 | // #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xFFF) // for debug | 20 | // #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug |
21 | 21 | ||
22 | // #define kNormalizeAlign (1 << 7) // alignment for speculated accesses | 22 | // #define kNormalizeAlign (1 << 7) // alignment for speculated accesses |
23 | 23 | ||
@@ -67,10 +67,10 @@ | |||
67 | 67 | ||
68 | static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc) | 68 | static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc) |
69 | { | 69 | { |
70 | if (!p->directInput) | 70 | // if (!p->directInput) |
71 | { | 71 | { |
72 | ISzAlloc_Free(alloc, p->bufferBase); | 72 | ISzAlloc_Free(alloc, p->bufBase); |
73 | p->bufferBase = NULL; | 73 | p->bufBase = NULL; |
74 | } | 74 | } |
75 | } | 75 | } |
76 | 76 | ||
@@ -79,7 +79,7 @@ static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr all | |||
79 | { | 79 | { |
80 | if (blockSize == 0) | 80 | if (blockSize == 0) |
81 | return 0; | 81 | return 0; |
82 | if (!p->bufferBase || p->blockSize != blockSize) | 82 | if (!p->bufBase || p->blockSize != blockSize) |
83 | { | 83 | { |
84 | // size_t blockSizeT; | 84 | // size_t blockSizeT; |
85 | LzInWindow_Free(p, alloc); | 85 | LzInWindow_Free(p, alloc); |
@@ -101,11 +101,11 @@ static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr all | |||
101 | #endif | 101 | #endif |
102 | */ | 102 | */ |
103 | 103 | ||
104 | p->bufferBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize); | 104 | p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize); |
105 | // printf("\nbufferBase = %p\n", p->bufferBase); | 105 | // printf("\nbufferBase = %p\n", p->bufBase); |
106 | // return 0; // for debug | 106 | // return 0; // for debug |
107 | } | 107 | } |
108 | return (p->bufferBase != NULL); | 108 | return (p->bufBase != NULL); |
109 | } | 109 | } |
110 | 110 | ||
111 | static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } | 111 | static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } |
@@ -113,7 +113,7 @@ static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return | |||
113 | static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); } | 113 | static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); } |
114 | 114 | ||
115 | 115 | ||
116 | MY_NO_INLINE | 116 | Z7_NO_INLINE |
117 | static void MatchFinder_ReadBlock(CMatchFinder *p) | 117 | static void MatchFinder_ReadBlock(CMatchFinder *p) |
118 | { | 118 | { |
119 | if (p->streamEndWasReached || p->result != SZ_OK) | 119 | if (p->streamEndWasReached || p->result != SZ_OK) |
@@ -127,8 +127,8 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) | |||
127 | UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p); | 127 | UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p); |
128 | if (curSize > p->directInputRem) | 128 | if (curSize > p->directInputRem) |
129 | curSize = (UInt32)p->directInputRem; | 129 | curSize = (UInt32)p->directInputRem; |
130 | p->directInputRem -= curSize; | ||
131 | p->streamPos += curSize; | 130 | p->streamPos += curSize; |
131 | p->directInputRem -= curSize; | ||
132 | if (p->directInputRem == 0) | 132 | if (p->directInputRem == 0) |
133 | p->streamEndWasReached = 1; | 133 | p->streamEndWasReached = 1; |
134 | return; | 134 | return; |
@@ -136,8 +136,8 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) | |||
136 | 136 | ||
137 | for (;;) | 137 | for (;;) |
138 | { | 138 | { |
139 | Byte *dest = p->buffer + GET_AVAIL_BYTES(p); | 139 | const Byte *dest = p->buffer + GET_AVAIL_BYTES(p); |
140 | size_t size = (size_t)(p->bufferBase + p->blockSize - dest); | 140 | size_t size = (size_t)(p->bufBase + p->blockSize - dest); |
141 | if (size == 0) | 141 | if (size == 0) |
142 | { | 142 | { |
143 | /* we call ReadBlock() after NeedMove() and MoveBlock(). | 143 | /* we call ReadBlock() after NeedMove() and MoveBlock(). |
@@ -153,7 +153,14 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) | |||
153 | // #define kRead 3 | 153 | // #define kRead 3 |
154 | // if (size > kRead) size = kRead; // for debug | 154 | // if (size > kRead) size = kRead; // for debug |
155 | 155 | ||
156 | p->result = ISeqInStream_Read(p->stream, dest, &size); | 156 | /* |
157 | // we need cast (Byte *)dest. | ||
158 | #ifdef __clang__ | ||
159 | #pragma GCC diagnostic ignored "-Wcast-qual" | ||
160 | #endif | ||
161 | */ | ||
162 | p->result = ISeqInStream_Read(p->stream, | ||
163 | p->bufBase + (dest - p->bufBase), &size); | ||
157 | if (p->result != SZ_OK) | 164 | if (p->result != SZ_OK) |
158 | return; | 165 | return; |
159 | if (size == 0) | 166 | if (size == 0) |
@@ -173,14 +180,14 @@ static void MatchFinder_ReadBlock(CMatchFinder *p) | |||
173 | 180 | ||
174 | 181 | ||
175 | 182 | ||
176 | MY_NO_INLINE | 183 | Z7_NO_INLINE |
177 | void MatchFinder_MoveBlock(CMatchFinder *p) | 184 | void MatchFinder_MoveBlock(CMatchFinder *p) |
178 | { | 185 | { |
179 | const size_t offset = (size_t)(p->buffer - p->bufferBase) - p->keepSizeBefore; | 186 | const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore; |
180 | const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore; | 187 | const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore; |
181 | p->buffer = p->bufferBase + keepBefore; | 188 | p->buffer = p->bufBase + keepBefore; |
182 | memmove(p->bufferBase, | 189 | memmove(p->bufBase, |
183 | p->bufferBase + (offset & ~((size_t)kBlockMoveAlign - 1)), | 190 | p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)), |
184 | keepBefore + (size_t)GET_AVAIL_BYTES(p)); | 191 | keepBefore + (size_t)GET_AVAIL_BYTES(p)); |
185 | } | 192 | } |
186 | 193 | ||
@@ -198,7 +205,7 @@ int MatchFinder_NeedMove(CMatchFinder *p) | |||
198 | return 0; | 205 | return 0; |
199 | if (p->streamEndWasReached || p->result != SZ_OK) | 206 | if (p->streamEndWasReached || p->result != SZ_OK) |
200 | return 0; | 207 | return 0; |
201 | return ((size_t)(p->bufferBase + p->blockSize - p->buffer) <= p->keepSizeAfter); | 208 | return ((size_t)(p->bufBase + p->blockSize - p->buffer) <= p->keepSizeAfter); |
202 | } | 209 | } |
203 | 210 | ||
204 | void MatchFinder_ReadIfRequired(CMatchFinder *p) | 211 | void MatchFinder_ReadIfRequired(CMatchFinder *p) |
@@ -214,6 +221,8 @@ static void MatchFinder_SetDefaultSettings(CMatchFinder *p) | |||
214 | p->cutValue = 32; | 221 | p->cutValue = 32; |
215 | p->btMode = 1; | 222 | p->btMode = 1; |
216 | p->numHashBytes = 4; | 223 | p->numHashBytes = 4; |
224 | p->numHashBytes_Min = 2; | ||
225 | p->numHashOutBits = 0; | ||
217 | p->bigHash = 0; | 226 | p->bigHash = 0; |
218 | } | 227 | } |
219 | 228 | ||
@@ -222,8 +231,10 @@ static void MatchFinder_SetDefaultSettings(CMatchFinder *p) | |||
222 | void MatchFinder_Construct(CMatchFinder *p) | 231 | void MatchFinder_Construct(CMatchFinder *p) |
223 | { | 232 | { |
224 | unsigned i; | 233 | unsigned i; |
225 | p->bufferBase = NULL; | 234 | p->buffer = NULL; |
235 | p->bufBase = NULL; | ||
226 | p->directInput = 0; | 236 | p->directInput = 0; |
237 | p->stream = NULL; | ||
227 | p->hash = NULL; | 238 | p->hash = NULL; |
228 | p->expectedDataSize = (UInt64)(Int64)-1; | 239 | p->expectedDataSize = (UInt64)(Int64)-1; |
229 | MatchFinder_SetDefaultSettings(p); | 240 | MatchFinder_SetDefaultSettings(p); |
@@ -238,6 +249,8 @@ void MatchFinder_Construct(CMatchFinder *p) | |||
238 | } | 249 | } |
239 | } | 250 | } |
240 | 251 | ||
252 | #undef kCrcPoly | ||
253 | |||
241 | static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc) | 254 | static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc) |
242 | { | 255 | { |
243 | ISzAlloc_Free(alloc, p->hash); | 256 | ISzAlloc_Free(alloc, p->hash); |
@@ -252,7 +265,7 @@ void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc) | |||
252 | 265 | ||
253 | static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc) | 266 | static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc) |
254 | { | 267 | { |
255 | size_t sizeInBytes = (size_t)num * sizeof(CLzRef); | 268 | const size_t sizeInBytes = (size_t)num * sizeof(CLzRef); |
256 | if (sizeInBytes / sizeof(CLzRef) != num) | 269 | if (sizeInBytes / sizeof(CLzRef) != num) |
257 | return NULL; | 270 | return NULL; |
258 | return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes); | 271 | return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes); |
@@ -298,6 +311,62 @@ static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize) | |||
298 | } | 311 | } |
299 | 312 | ||
300 | 313 | ||
314 | // input is historySize | ||
315 | static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs) | ||
316 | { | ||
317 | if (p->numHashBytes == 2) | ||
318 | return (1 << 16) - 1; | ||
319 | if (hs != 0) | ||
320 | hs--; | ||
321 | hs |= (hs >> 1); | ||
322 | hs |= (hs >> 2); | ||
323 | hs |= (hs >> 4); | ||
324 | hs |= (hs >> 8); | ||
325 | // we propagated 16 bits in (hs). Low 16 bits must be set later | ||
326 | if (hs >= (1 << 24)) | ||
327 | { | ||
328 | if (p->numHashBytes == 3) | ||
329 | hs = (1 << 24) - 1; | ||
330 | /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ | ||
331 | } | ||
332 | // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) | ||
333 | hs |= (1 << 16) - 1; /* don't change it! */ | ||
334 | // bt5: we adjust the size with recommended minimum size | ||
335 | if (p->numHashBytes >= 5) | ||
336 | hs |= (256 << kLzHash_CrcShift_2) - 1; | ||
337 | return hs; | ||
338 | } | ||
339 | |||
340 | // input is historySize | ||
341 | static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs) | ||
342 | { | ||
343 | if (p->numHashBytes == 2) | ||
344 | return (1 << 16) - 1; | ||
345 | if (hs != 0) | ||
346 | hs--; | ||
347 | hs |= (hs >> 1); | ||
348 | hs |= (hs >> 2); | ||
349 | hs |= (hs >> 4); | ||
350 | hs |= (hs >> 8); | ||
351 | // we propagated 16 bits in (hs). Low 16 bits must be set later | ||
352 | hs >>= 1; | ||
353 | if (hs >= (1 << 24)) | ||
354 | { | ||
355 | if (p->numHashBytes == 3) | ||
356 | hs = (1 << 24) - 1; | ||
357 | else | ||
358 | hs >>= 1; | ||
359 | /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ | ||
360 | } | ||
361 | // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) | ||
362 | hs |= (1 << 16) - 1; /* don't change it! */ | ||
363 | // bt5: we adjust the size with recommended minimum size | ||
364 | if (p->numHashBytes >= 5) | ||
365 | hs |= (256 << kLzHash_CrcShift_2) - 1; | ||
366 | return hs; | ||
367 | } | ||
368 | |||
369 | |||
301 | int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, | 370 | int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, |
302 | UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, | 371 | UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, |
303 | ISzAllocPtr alloc) | 372 | ISzAllocPtr alloc) |
@@ -318,78 +387,91 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, | |||
318 | p->blockSize = 0; | 387 | p->blockSize = 0; |
319 | if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc)) | 388 | if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc)) |
320 | { | 389 | { |
321 | const UInt32 newCyclicBufferSize = historySize + 1; // do not change it | 390 | size_t hashSizeSum; |
322 | UInt32 hs; | ||
323 | p->matchMaxLen = matchMaxLen; | ||
324 | { | 391 | { |
325 | // UInt32 hs4; | 392 | UInt32 hs; |
326 | p->fixedHashSize = 0; | 393 | UInt32 hsCur; |
327 | hs = (1 << 16) - 1; | 394 | |
328 | if (p->numHashBytes != 2) | 395 | if (p->numHashOutBits != 0) |
329 | { | 396 | { |
330 | hs = historySize; | 397 | unsigned numBits = p->numHashOutBits; |
331 | if (hs > p->expectedDataSize) | 398 | const unsigned nbMax = |
332 | hs = (UInt32)p->expectedDataSize; | 399 | (p->numHashBytes == 2 ? 16 : |
333 | if (hs != 0) | 400 | (p->numHashBytes == 3 ? 24 : 32)); |
334 | hs--; | 401 | if (numBits > nbMax) |
335 | hs |= (hs >> 1); | 402 | numBits = nbMax; |
336 | hs |= (hs >> 2); | 403 | if (numBits >= 32) |
337 | hs |= (hs >> 4); | 404 | hs = (UInt32)0 - 1; |
338 | hs |= (hs >> 8); | 405 | else |
339 | // we propagated 16 bits in (hs). Low 16 bits must be set later | 406 | hs = ((UInt32)1 << numBits) - 1; |
340 | hs >>= 1; | ||
341 | if (hs >= (1 << 24)) | ||
342 | { | ||
343 | if (p->numHashBytes == 3) | ||
344 | hs = (1 << 24) - 1; | ||
345 | else | ||
346 | hs >>= 1; | ||
347 | /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */ | ||
348 | } | ||
349 | |||
350 | // hs = ((UInt32)1 << 25) - 1; // for test | ||
351 | |||
352 | // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) | 407 | // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2) |
353 | hs |= (1 << 16) - 1; /* don't change it! */ | 408 | hs |= (1 << 16) - 1; /* don't change it! */ |
354 | |||
355 | // bt5: we adjust the size with recommended minimum size | ||
356 | if (p->numHashBytes >= 5) | 409 | if (p->numHashBytes >= 5) |
357 | hs |= (256 << kLzHash_CrcShift_2) - 1; | 410 | hs |= (256 << kLzHash_CrcShift_2) - 1; |
411 | { | ||
412 | const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize); | ||
413 | if (hs > hs2) | ||
414 | hs = hs2; | ||
415 | } | ||
416 | hsCur = hs; | ||
417 | if (p->expectedDataSize < historySize) | ||
418 | { | ||
419 | const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize); | ||
420 | if (hsCur > hs2) | ||
421 | hsCur = hs2; | ||
422 | } | ||
358 | } | 423 | } |
359 | p->hashMask = hs; | 424 | else |
360 | hs++; | 425 | { |
361 | 426 | hs = MatchFinder_GetHashMask(p, historySize); | |
362 | /* | 427 | hsCur = hs; |
363 | hs4 = (1 << 20); | 428 | if (p->expectedDataSize < historySize) |
364 | if (hs4 > hs) | 429 | { |
365 | hs4 = hs; | 430 | hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize); |
366 | // hs4 = (1 << 16); // for test | 431 | if (hsCur > hs) // is it possible? |
367 | p->hash4Mask = hs4 - 1; | 432 | hsCur = hs; |
368 | */ | 433 | } |
434 | } | ||
435 | |||
436 | p->hashMask = hsCur; | ||
369 | 437 | ||
370 | if (p->numHashBytes > 2) p->fixedHashSize += kHash2Size; | 438 | hashSizeSum = hs; |
371 | if (p->numHashBytes > 3) p->fixedHashSize += kHash3Size; | 439 | hashSizeSum++; |
372 | // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size; | 440 | if (hashSizeSum < hs) |
373 | hs += p->fixedHashSize; | 441 | return 0; |
442 | { | ||
443 | UInt32 fixedHashSize = 0; | ||
444 | if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size; | ||
445 | if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size; | ||
446 | // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size; | ||
447 | hashSizeSum += fixedHashSize; | ||
448 | p->fixedHashSize = fixedHashSize; | ||
449 | } | ||
374 | } | 450 | } |
375 | 451 | ||
452 | p->matchMaxLen = matchMaxLen; | ||
453 | |||
376 | { | 454 | { |
377 | size_t newSize; | 455 | size_t newSize; |
378 | size_t numSons; | 456 | size_t numSons; |
457 | const UInt32 newCyclicBufferSize = historySize + 1; // do not change it | ||
379 | p->historySize = historySize; | 458 | p->historySize = historySize; |
380 | p->hashSizeSum = hs; | ||
381 | p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1) | 459 | p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1) |
382 | 460 | ||
383 | numSons = newCyclicBufferSize; | 461 | numSons = newCyclicBufferSize; |
384 | if (p->btMode) | 462 | if (p->btMode) |
385 | numSons <<= 1; | 463 | numSons <<= 1; |
386 | newSize = hs + numSons; | 464 | newSize = hashSizeSum + numSons; |
465 | |||
466 | if (numSons < newCyclicBufferSize || newSize < numSons) | ||
467 | return 0; | ||
387 | 468 | ||
388 | // aligned size is not required here, but it can be better for some loops | 469 | // aligned size is not required here, but it can be better for some loops |
389 | #define NUM_REFS_ALIGN_MASK 0xF | 470 | #define NUM_REFS_ALIGN_MASK 0xF |
390 | newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK; | 471 | newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK; |
391 | 472 | ||
392 | if (p->hash && p->numRefs == newSize) | 473 | // 22.02: we don't reallocate buffer, if old size is enough |
474 | if (p->hash && p->numRefs >= newSize) | ||
393 | return 1; | 475 | return 1; |
394 | 476 | ||
395 | MatchFinder_FreeThisClassMemory(p, alloc); | 477 | MatchFinder_FreeThisClassMemory(p, alloc); |
@@ -398,7 +480,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, | |||
398 | 480 | ||
399 | if (p->hash) | 481 | if (p->hash) |
400 | { | 482 | { |
401 | p->son = p->hash + p->hashSizeSum; | 483 | p->son = p->hash + hashSizeSum; |
402 | return 1; | 484 | return 1; |
403 | } | 485 | } |
404 | } | 486 | } |
@@ -470,7 +552,8 @@ void MatchFinder_Init_HighHash(CMatchFinder *p) | |||
470 | 552 | ||
471 | void MatchFinder_Init_4(CMatchFinder *p) | 553 | void MatchFinder_Init_4(CMatchFinder *p) |
472 | { | 554 | { |
473 | p->buffer = p->bufferBase; | 555 | if (!p->directInput) |
556 | p->buffer = p->bufBase; | ||
474 | { | 557 | { |
475 | /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker. | 558 | /* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker. |
476 | the code in CMatchFinderMt expects (pos = 1) */ | 559 | the code in CMatchFinderMt expects (pos = 1) */ |
@@ -507,20 +590,20 @@ void MatchFinder_Init(CMatchFinder *p) | |||
507 | 590 | ||
508 | 591 | ||
509 | #ifdef MY_CPU_X86_OR_AMD64 | 592 | #ifdef MY_CPU_X86_OR_AMD64 |
510 | #if defined(__clang__) && (__clang_major__ >= 8) \ | 593 | #if defined(__clang__) && (__clang_major__ >= 4) \ |
511 | || defined(__GNUC__) && (__GNUC__ >= 8) \ | 594 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701) |
512 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) | 595 | // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900) |
513 | #define USE_SATUR_SUB_128 | 596 | |
514 | #define USE_AVX2 | 597 | #define USE_LZFIND_SATUR_SUB_128 |
515 | #define ATTRIB_SSE41 __attribute__((__target__("sse4.1"))) | 598 | #define USE_LZFIND_SATUR_SUB_256 |
516 | #define ATTRIB_AVX2 __attribute__((__target__("avx2"))) | 599 | #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1"))) |
600 | #define LZFIND_ATTRIB_AVX2 __attribute__((__target__("avx2"))) | ||
517 | #elif defined(_MSC_VER) | 601 | #elif defined(_MSC_VER) |
518 | #if (_MSC_VER >= 1600) | 602 | #if (_MSC_VER >= 1600) |
519 | #define USE_SATUR_SUB_128 | 603 | #define USE_LZFIND_SATUR_SUB_128 |
520 | #if (_MSC_VER >= 1900) | 604 | #endif |
521 | #define USE_AVX2 | 605 | #if (_MSC_VER >= 1900) |
522 | #include <immintrin.h> // avx | 606 | #define USE_LZFIND_SATUR_SUB_256 |
523 | #endif | ||
524 | #endif | 607 | #endif |
525 | #endif | 608 | #endif |
526 | 609 | ||
@@ -529,16 +612,16 @@ void MatchFinder_Init(CMatchFinder *p) | |||
529 | 612 | ||
530 | #if defined(__clang__) && (__clang_major__ >= 8) \ | 613 | #if defined(__clang__) && (__clang_major__ >= 8) \ |
531 | || defined(__GNUC__) && (__GNUC__ >= 8) | 614 | || defined(__GNUC__) && (__GNUC__ >= 8) |
532 | #define USE_SATUR_SUB_128 | 615 | #define USE_LZFIND_SATUR_SUB_128 |
533 | #ifdef MY_CPU_ARM64 | 616 | #ifdef MY_CPU_ARM64 |
534 | // #define ATTRIB_SSE41 __attribute__((__target__(""))) | 617 | // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__(""))) |
535 | #else | 618 | #else |
536 | // #define ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | 619 | // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) |
537 | #endif | 620 | #endif |
538 | 621 | ||
539 | #elif defined(_MSC_VER) | 622 | #elif defined(_MSC_VER) |
540 | #if (_MSC_VER >= 1910) | 623 | #if (_MSC_VER >= 1910) |
541 | #define USE_SATUR_SUB_128 | 624 | #define USE_LZFIND_SATUR_SUB_128 |
542 | #endif | 625 | #endif |
543 | #endif | 626 | #endif |
544 | 627 | ||
@@ -550,121 +633,130 @@ void MatchFinder_Init(CMatchFinder *p) | |||
550 | 633 | ||
551 | #endif | 634 | #endif |
552 | 635 | ||
553 | /* | ||
554 | #ifndef ATTRIB_SSE41 | ||
555 | #define ATTRIB_SSE41 | ||
556 | #endif | ||
557 | #ifndef ATTRIB_AVX2 | ||
558 | #define ATTRIB_AVX2 | ||
559 | #endif | ||
560 | */ | ||
561 | 636 | ||
562 | #ifdef USE_SATUR_SUB_128 | 637 | #ifdef USE_LZFIND_SATUR_SUB_128 |
563 | 638 | ||
564 | // #define _SHOW_HW_STATUS | 639 | // #define Z7_SHOW_HW_STATUS |
565 | 640 | ||
566 | #ifdef _SHOW_HW_STATUS | 641 | #ifdef Z7_SHOW_HW_STATUS |
567 | #include <stdio.h> | 642 | #include <stdio.h> |
568 | #define _PRF(x) x | 643 | #define PRF(x) x |
569 | _PRF(;) | 644 | PRF(;) |
570 | #else | 645 | #else |
571 | #define _PRF(x) | 646 | #define PRF(x) |
572 | #endif | 647 | #endif |
573 | 648 | ||
649 | |||
574 | #ifdef MY_CPU_ARM_OR_ARM64 | 650 | #ifdef MY_CPU_ARM_OR_ARM64 |
575 | 651 | ||
576 | #ifdef MY_CPU_ARM64 | 652 | #ifdef MY_CPU_ARM64 |
577 | // #define FORCE_SATUR_SUB_128 | 653 | // #define FORCE_LZFIND_SATUR_SUB_128 |
578 | #endif | 654 | #endif |
655 | typedef uint32x4_t LzFind_v128; | ||
656 | #define SASUB_128_V(v, s) \ | ||
657 | vsubq_u32(vmaxq_u32(v, s), s) | ||
579 | 658 | ||
580 | typedef uint32x4_t v128; | 659 | #else // MY_CPU_ARM_OR_ARM64 |
581 | #define SASUB_128(i) \ | ||
582 | *(v128 *)(void *)(items + (i) * 4) = \ | ||
583 | vsubq_u32(vmaxq_u32(*(const v128 *)(const void *)(items + (i) * 4), sub2), sub2); | ||
584 | |||
585 | #else | ||
586 | 660 | ||
587 | #include <smmintrin.h> // sse4.1 | 661 | #include <smmintrin.h> // sse4.1 |
588 | 662 | ||
589 | typedef __m128i v128; | 663 | typedef __m128i LzFind_v128; |
590 | #define SASUB_128(i) \ | 664 | // SSE 4.1 |
591 | *(v128 *)(void *)(items + (i) * 4) = \ | 665 | #define SASUB_128_V(v, s) \ |
592 | _mm_sub_epi32(_mm_max_epu32(*(const v128 *)(const void *)(items + (i) * 4), sub2), sub2); // SSE 4.1 | 666 | _mm_sub_epi32(_mm_max_epu32(v, s), s) |
593 | 667 | ||
594 | #endif | 668 | #endif // MY_CPU_ARM_OR_ARM64 |
595 | 669 | ||
596 | 670 | ||
671 | #define SASUB_128(i) \ | ||
672 | *( LzFind_v128 *)( void *)(items + (i) * 4) = SASUB_128_V( \ | ||
673 | *(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2); | ||
674 | |||
597 | 675 | ||
598 | MY_NO_INLINE | 676 | Z7_NO_INLINE |
599 | static | 677 | static |
600 | #ifdef ATTRIB_SSE41 | 678 | #ifdef LZFIND_ATTRIB_SSE41 |
601 | ATTRIB_SSE41 | 679 | LZFIND_ATTRIB_SSE41 |
602 | #endif | 680 | #endif |
603 | void | 681 | void |
604 | MY_FAST_CALL | 682 | Z7_FASTCALL |
605 | LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim) | 683 | LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim) |
606 | { | 684 | { |
607 | v128 sub2 = | 685 | const LzFind_v128 sub2 = |
608 | #ifdef MY_CPU_ARM_OR_ARM64 | 686 | #ifdef MY_CPU_ARM_OR_ARM64 |
609 | vdupq_n_u32(subValue); | 687 | vdupq_n_u32(subValue); |
610 | #else | 688 | #else |
611 | _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); | 689 | _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); |
612 | #endif | 690 | #endif |
691 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
613 | do | 692 | do |
614 | { | 693 | { |
615 | SASUB_128(0) | 694 | SASUB_128(0) SASUB_128(1) items += 2 * 4; |
616 | SASUB_128(1) | 695 | SASUB_128(0) SASUB_128(1) items += 2 * 4; |
617 | SASUB_128(2) | ||
618 | SASUB_128(3) | ||
619 | items += 4 * 4; | ||
620 | } | 696 | } |
621 | while (items != lim); | 697 | while (items != lim); |
622 | } | 698 | } |
623 | 699 | ||
624 | 700 | ||
625 | 701 | ||
626 | #ifdef USE_AVX2 | 702 | #ifdef USE_LZFIND_SATUR_SUB_256 |
627 | 703 | ||
628 | #include <immintrin.h> // avx | 704 | #include <immintrin.h> // avx |
705 | /* | ||
706 | clang :immintrin.h uses | ||
707 | #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ | ||
708 | defined(__AVX2__) | ||
709 | #include <avx2intrin.h> | ||
710 | #endif | ||
711 | so we need <avxintrin.h> for clang-cl */ | ||
629 | 712 | ||
630 | #define SASUB_256(i) *(__m256i *)(void *)(items + (i) * 8) = _mm256_sub_epi32(_mm256_max_epu32(*(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2); // AVX2 | 713 | #if defined(__clang__) |
714 | #include <avxintrin.h> | ||
715 | #include <avx2intrin.h> | ||
716 | #endif | ||
631 | 717 | ||
632 | MY_NO_INLINE | 718 | // AVX2: |
719 | #define SASUB_256(i) \ | ||
720 | *( __m256i *)( void *)(items + (i) * 8) = \ | ||
721 | _mm256_sub_epi32(_mm256_max_epu32( \ | ||
722 | *(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2); | ||
723 | |||
724 | Z7_NO_INLINE | ||
633 | static | 725 | static |
634 | #ifdef ATTRIB_AVX2 | 726 | #ifdef LZFIND_ATTRIB_AVX2 |
635 | ATTRIB_AVX2 | 727 | LZFIND_ATTRIB_AVX2 |
636 | #endif | 728 | #endif |
637 | void | 729 | void |
638 | MY_FAST_CALL | 730 | Z7_FASTCALL |
639 | LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim) | 731 | LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim) |
640 | { | 732 | { |
641 | __m256i sub2 = _mm256_set_epi32( | 733 | const __m256i sub2 = _mm256_set_epi32( |
642 | (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue, | 734 | (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue, |
643 | (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); | 735 | (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue); |
736 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
644 | do | 737 | do |
645 | { | 738 | { |
646 | SASUB_256(0) | 739 | SASUB_256(0) SASUB_256(1) items += 2 * 8; |
647 | SASUB_256(1) | 740 | SASUB_256(0) SASUB_256(1) items += 2 * 8; |
648 | items += 2 * 8; | ||
649 | } | 741 | } |
650 | while (items != lim); | 742 | while (items != lim); |
651 | } | 743 | } |
652 | #endif // USE_AVX2 | 744 | #endif // USE_LZFIND_SATUR_SUB_256 |
653 | 745 | ||
654 | #ifndef FORCE_SATUR_SUB_128 | 746 | #ifndef FORCE_LZFIND_SATUR_SUB_128 |
655 | typedef void (MY_FAST_CALL *LZFIND_SATUR_SUB_CODE_FUNC)( | 747 | typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)( |
656 | UInt32 subValue, CLzRef *items, const CLzRef *lim); | 748 | UInt32 subValue, CLzRef *items, const CLzRef *lim); |
657 | static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub; | 749 | static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub; |
658 | #endif // FORCE_SATUR_SUB_128 | 750 | #endif // FORCE_LZFIND_SATUR_SUB_128 |
659 | 751 | ||
660 | #endif // USE_SATUR_SUB_128 | 752 | #endif // USE_LZFIND_SATUR_SUB_128 |
661 | 753 | ||
662 | 754 | ||
663 | // kEmptyHashValue must be zero | 755 | // kEmptyHashValue must be zero |
664 | // #define SASUB_32(i) v = items[i]; m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; | 756 | // #define SASUB_32(i) { UInt32 v = items[i]; UInt32 m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; } |
665 | #define SASUB_32(i) v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; | 757 | #define SASUB_32(i) { UInt32 v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; } |
666 | 758 | ||
667 | #ifdef FORCE_SATUR_SUB_128 | 759 | #ifdef FORCE_LZFIND_SATUR_SUB_128 |
668 | 760 | ||
669 | #define DEFAULT_SaturSub LzFind_SaturSub_128 | 761 | #define DEFAULT_SaturSub LzFind_SaturSub_128 |
670 | 762 | ||
@@ -672,24 +764,19 @@ static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub; | |||
672 | 764 | ||
673 | #define DEFAULT_SaturSub LzFind_SaturSub_32 | 765 | #define DEFAULT_SaturSub LzFind_SaturSub_32 |
674 | 766 | ||
675 | MY_NO_INLINE | 767 | Z7_NO_INLINE |
676 | static | 768 | static |
677 | void | 769 | void |
678 | MY_FAST_CALL | 770 | Z7_FASTCALL |
679 | LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim) | 771 | LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim) |
680 | { | 772 | { |
773 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
681 | do | 774 | do |
682 | { | 775 | { |
683 | UInt32 v; | 776 | SASUB_32(0) SASUB_32(1) items += 2; |
684 | SASUB_32(0) | 777 | SASUB_32(0) SASUB_32(1) items += 2; |
685 | SASUB_32(1) | 778 | SASUB_32(0) SASUB_32(1) items += 2; |
686 | SASUB_32(2) | 779 | SASUB_32(0) SASUB_32(1) items += 2; |
687 | SASUB_32(3) | ||
688 | SASUB_32(4) | ||
689 | SASUB_32(5) | ||
690 | SASUB_32(6) | ||
691 | SASUB_32(7) | ||
692 | items += 8; | ||
693 | } | 780 | } |
694 | while (items != lim); | 781 | while (items != lim); |
695 | } | 782 | } |
@@ -697,27 +784,23 @@ LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim) | |||
697 | #endif | 784 | #endif |
698 | 785 | ||
699 | 786 | ||
700 | MY_NO_INLINE | 787 | Z7_NO_INLINE |
701 | void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) | 788 | void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) |
702 | { | 789 | { |
703 | #define K_NORM_ALIGN_BLOCK_SIZE (1 << 6) | 790 | #define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7) |
704 | 791 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | |
705 | CLzRef *lim; | 792 | for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--) |
706 | |||
707 | for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (K_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--) | ||
708 | { | 793 | { |
709 | UInt32 v; | 794 | SASUB_32(0) |
710 | SASUB_32(0); | ||
711 | items++; | 795 | items++; |
712 | } | 796 | } |
713 | |||
714 | { | 797 | { |
715 | #define K_NORM_ALIGN_MASK (K_NORM_ALIGN_BLOCK_SIZE / 4 - 1) | 798 | const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1); |
716 | lim = items + (numItems & ~(size_t)K_NORM_ALIGN_MASK); | 799 | CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask); |
717 | numItems &= K_NORM_ALIGN_MASK; | 800 | numItems &= k_Align_Mask; |
718 | if (items != lim) | 801 | if (items != lim) |
719 | { | 802 | { |
720 | #if defined(USE_SATUR_SUB_128) && !defined(FORCE_SATUR_SUB_128) | 803 | #if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128) |
721 | if (g_LzFind_SaturSub) | 804 | if (g_LzFind_SaturSub) |
722 | g_LzFind_SaturSub(subValue, items, lim); | 805 | g_LzFind_SaturSub(subValue, items, lim); |
723 | else | 806 | else |
@@ -726,12 +809,10 @@ void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) | |||
726 | } | 809 | } |
727 | items = lim; | 810 | items = lim; |
728 | } | 811 | } |
729 | 812 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | |
730 | |||
731 | for (; numItems != 0; numItems--) | 813 | for (; numItems != 0; numItems--) |
732 | { | 814 | { |
733 | UInt32 v; | 815 | SASUB_32(0) |
734 | SASUB_32(0); | ||
735 | items++; | 816 | items++; |
736 | } | 817 | } |
737 | } | 818 | } |
@@ -740,7 +821,7 @@ void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems) | |||
740 | 821 | ||
741 | // call MatchFinder_CheckLimits() only after (p->pos++) update | 822 | // call MatchFinder_CheckLimits() only after (p->pos++) update |
742 | 823 | ||
743 | MY_NO_INLINE | 824 | Z7_NO_INLINE |
744 | static void MatchFinder_CheckLimits(CMatchFinder *p) | 825 | static void MatchFinder_CheckLimits(CMatchFinder *p) |
745 | { | 826 | { |
746 | if (// !p->streamEndWasReached && p->result == SZ_OK && | 827 | if (// !p->streamEndWasReached && p->result == SZ_OK && |
@@ -768,11 +849,14 @@ static void MatchFinder_CheckLimits(CMatchFinder *p) | |||
768 | const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */; | 849 | const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */; |
769 | // const UInt32 subValue = (1 << 15); // for debug | 850 | // const UInt32 subValue = (1 << 15); // for debug |
770 | // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue); | 851 | // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue); |
771 | size_t numSonRefs = p->cyclicBufferSize; | 852 | MatchFinder_REDUCE_OFFSETS(p, subValue) |
772 | if (p->btMode) | 853 | MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize); |
773 | numSonRefs <<= 1; | 854 | { |
774 | Inline_MatchFinder_ReduceOffsets(p, subValue); | 855 | size_t numSonRefs = p->cyclicBufferSize; |
775 | MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashSizeSum + numSonRefs); | 856 | if (p->btMode) |
857 | numSonRefs <<= 1; | ||
858 | MatchFinder_Normalize3(subValue, p->son, numSonRefs); | ||
859 | } | ||
776 | } | 860 | } |
777 | 861 | ||
778 | if (p->cyclicBufferPos == p->cyclicBufferSize) | 862 | if (p->cyclicBufferPos == p->cyclicBufferSize) |
@@ -785,7 +869,7 @@ static void MatchFinder_CheckLimits(CMatchFinder *p) | |||
785 | /* | 869 | /* |
786 | (lenLimit > maxLen) | 870 | (lenLimit > maxLen) |
787 | */ | 871 | */ |
788 | MY_FORCE_INLINE | 872 | Z7_FORCE_INLINE |
789 | static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, | 873 | static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, |
790 | size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, | 874 | size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, |
791 | UInt32 *d, unsigned maxLen) | 875 | UInt32 *d, unsigned maxLen) |
@@ -867,7 +951,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, | |||
867 | } | 951 | } |
868 | 952 | ||
869 | 953 | ||
870 | MY_FORCE_INLINE | 954 | Z7_FORCE_INLINE |
871 | UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, | 955 | UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son, |
872 | size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, | 956 | size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue, |
873 | UInt32 *d, UInt32 maxLen) | 957 | UInt32 *d, UInt32 maxLen) |
@@ -1004,7 +1088,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const | |||
1004 | 1088 | ||
1005 | #define MOVE_POS_RET MOVE_POS return distances; | 1089 | #define MOVE_POS_RET MOVE_POS return distances; |
1006 | 1090 | ||
1007 | MY_NO_INLINE | 1091 | Z7_NO_INLINE |
1008 | static void MatchFinder_MovePos(CMatchFinder *p) | 1092 | static void MatchFinder_MovePos(CMatchFinder *p) |
1009 | { | 1093 | { |
1010 | /* we go here at the end of stream data, when (avail < num_hash_bytes) | 1094 | /* we go here at the end of stream data, when (avail < num_hash_bytes) |
@@ -1015,11 +1099,11 @@ static void MatchFinder_MovePos(CMatchFinder *p) | |||
1015 | if (p->btMode) | 1099 | if (p->btMode) |
1016 | p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0; // kEmptyHashValue | 1100 | p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0; // kEmptyHashValue |
1017 | */ | 1101 | */ |
1018 | MOVE_POS; | 1102 | MOVE_POS |
1019 | } | 1103 | } |
1020 | 1104 | ||
1021 | #define GET_MATCHES_HEADER2(minLen, ret_op) \ | 1105 | #define GET_MATCHES_HEADER2(minLen, ret_op) \ |
1022 | unsigned lenLimit; UInt32 hv; Byte *cur; UInt32 curMatch; \ | 1106 | unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \ |
1023 | lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ | 1107 | lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ |
1024 | cur = p->buffer; | 1108 | cur = p->buffer; |
1025 | 1109 | ||
@@ -1028,11 +1112,11 @@ static void MatchFinder_MovePos(CMatchFinder *p) | |||
1028 | 1112 | ||
1029 | #define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue | 1113 | #define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue |
1030 | 1114 | ||
1031 | #define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS; } while (--num); | 1115 | #define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS } while (--num); |
1032 | 1116 | ||
1033 | #define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ | 1117 | #define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ |
1034 | distances = func(MF_PARAMS(p), \ | 1118 | distances = func(MF_PARAMS(p), \ |
1035 | distances, (UInt32)_maxLen_); MOVE_POS_RET; | 1119 | distances, (UInt32)_maxLen_); MOVE_POS_RET |
1036 | 1120 | ||
1037 | #define GET_MATCHES_FOOTER_BT(_maxLen_) \ | 1121 | #define GET_MATCHES_FOOTER_BT(_maxLen_) \ |
1038 | GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) | 1122 | GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) |
@@ -1052,7 +1136,7 @@ static void MatchFinder_MovePos(CMatchFinder *p) | |||
1052 | static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1136 | static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) |
1053 | { | 1137 | { |
1054 | GET_MATCHES_HEADER(2) | 1138 | GET_MATCHES_HEADER(2) |
1055 | HASH2_CALC; | 1139 | HASH2_CALC |
1056 | curMatch = p->hash[hv]; | 1140 | curMatch = p->hash[hv]; |
1057 | p->hash[hv] = p->pos; | 1141 | p->hash[hv] = p->pos; |
1058 | GET_MATCHES_FOOTER_BT(1) | 1142 | GET_MATCHES_FOOTER_BT(1) |
@@ -1061,7 +1145,7 @@ static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1061 | UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1145 | UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) |
1062 | { | 1146 | { |
1063 | GET_MATCHES_HEADER(3) | 1147 | GET_MATCHES_HEADER(3) |
1064 | HASH_ZIP_CALC; | 1148 | HASH_ZIP_CALC |
1065 | curMatch = p->hash[hv]; | 1149 | curMatch = p->hash[hv]; |
1066 | p->hash[hv] = p->pos; | 1150 | p->hash[hv] = p->pos; |
1067 | GET_MATCHES_FOOTER_BT(2) | 1151 | GET_MATCHES_FOOTER_BT(2) |
@@ -1082,7 +1166,7 @@ static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1082 | UInt32 *hash; | 1166 | UInt32 *hash; |
1083 | GET_MATCHES_HEADER(3) | 1167 | GET_MATCHES_HEADER(3) |
1084 | 1168 | ||
1085 | HASH3_CALC; | 1169 | HASH3_CALC |
1086 | 1170 | ||
1087 | hash = p->hash; | 1171 | hash = p->hash; |
1088 | pos = p->pos; | 1172 | pos = p->pos; |
@@ -1107,7 +1191,7 @@ static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1107 | if (maxLen == lenLimit) | 1191 | if (maxLen == lenLimit) |
1108 | { | 1192 | { |
1109 | SkipMatchesSpec(MF_PARAMS(p)); | 1193 | SkipMatchesSpec(MF_PARAMS(p)); |
1110 | MOVE_POS_RET; | 1194 | MOVE_POS_RET |
1111 | } | 1195 | } |
1112 | } | 1196 | } |
1113 | 1197 | ||
@@ -1123,7 +1207,7 @@ static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1123 | UInt32 *hash; | 1207 | UInt32 *hash; |
1124 | GET_MATCHES_HEADER(4) | 1208 | GET_MATCHES_HEADER(4) |
1125 | 1209 | ||
1126 | HASH4_CALC; | 1210 | HASH4_CALC |
1127 | 1211 | ||
1128 | hash = p->hash; | 1212 | hash = p->hash; |
1129 | pos = p->pos; | 1213 | pos = p->pos; |
@@ -1190,7 +1274,7 @@ static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1190 | UInt32 *hash; | 1274 | UInt32 *hash; |
1191 | GET_MATCHES_HEADER(5) | 1275 | GET_MATCHES_HEADER(5) |
1192 | 1276 | ||
1193 | HASH5_CALC; | 1277 | HASH5_CALC |
1194 | 1278 | ||
1195 | hash = p->hash; | 1279 | hash = p->hash; |
1196 | pos = p->pos; | 1280 | pos = p->pos; |
@@ -1246,7 +1330,7 @@ static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1246 | if (maxLen == lenLimit) | 1330 | if (maxLen == lenLimit) |
1247 | { | 1331 | { |
1248 | SkipMatchesSpec(MF_PARAMS(p)); | 1332 | SkipMatchesSpec(MF_PARAMS(p)); |
1249 | MOVE_POS_RET; | 1333 | MOVE_POS_RET |
1250 | } | 1334 | } |
1251 | break; | 1335 | break; |
1252 | } | 1336 | } |
@@ -1263,7 +1347,7 @@ static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1263 | UInt32 *hash; | 1347 | UInt32 *hash; |
1264 | GET_MATCHES_HEADER(4) | 1348 | GET_MATCHES_HEADER(4) |
1265 | 1349 | ||
1266 | HASH4_CALC; | 1350 | HASH4_CALC |
1267 | 1351 | ||
1268 | hash = p->hash; | 1352 | hash = p->hash; |
1269 | pos = p->pos; | 1353 | pos = p->pos; |
@@ -1314,12 +1398,12 @@ static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1314 | if (maxLen == lenLimit) | 1398 | if (maxLen == lenLimit) |
1315 | { | 1399 | { |
1316 | p->son[p->cyclicBufferPos] = curMatch; | 1400 | p->son[p->cyclicBufferPos] = curMatch; |
1317 | MOVE_POS_RET; | 1401 | MOVE_POS_RET |
1318 | } | 1402 | } |
1319 | break; | 1403 | break; |
1320 | } | 1404 | } |
1321 | 1405 | ||
1322 | GET_MATCHES_FOOTER_HC(maxLen); | 1406 | GET_MATCHES_FOOTER_HC(maxLen) |
1323 | } | 1407 | } |
1324 | 1408 | ||
1325 | 1409 | ||
@@ -1330,7 +1414,7 @@ static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1330 | UInt32 *hash; | 1414 | UInt32 *hash; |
1331 | GET_MATCHES_HEADER(5) | 1415 | GET_MATCHES_HEADER(5) |
1332 | 1416 | ||
1333 | HASH5_CALC; | 1417 | HASH5_CALC |
1334 | 1418 | ||
1335 | hash = p->hash; | 1419 | hash = p->hash; |
1336 | pos = p->pos; | 1420 | pos = p->pos; |
@@ -1386,19 +1470,19 @@ static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1386 | if (maxLen == lenLimit) | 1470 | if (maxLen == lenLimit) |
1387 | { | 1471 | { |
1388 | p->son[p->cyclicBufferPos] = curMatch; | 1472 | p->son[p->cyclicBufferPos] = curMatch; |
1389 | MOVE_POS_RET; | 1473 | MOVE_POS_RET |
1390 | } | 1474 | } |
1391 | break; | 1475 | break; |
1392 | } | 1476 | } |
1393 | 1477 | ||
1394 | GET_MATCHES_FOOTER_HC(maxLen); | 1478 | GET_MATCHES_FOOTER_HC(maxLen) |
1395 | } | 1479 | } |
1396 | 1480 | ||
1397 | 1481 | ||
1398 | UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1482 | UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) |
1399 | { | 1483 | { |
1400 | GET_MATCHES_HEADER(3) | 1484 | GET_MATCHES_HEADER(3) |
1401 | HASH_ZIP_CALC; | 1485 | HASH_ZIP_CALC |
1402 | curMatch = p->hash[hv]; | 1486 | curMatch = p->hash[hv]; |
1403 | p->hash[hv] = p->pos; | 1487 | p->hash[hv] = p->pos; |
1404 | GET_MATCHES_FOOTER_HC(2) | 1488 | GET_MATCHES_FOOTER_HC(2) |
@@ -1409,7 +1493,7 @@ static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1409 | { | 1493 | { |
1410 | SKIP_HEADER(2) | 1494 | SKIP_HEADER(2) |
1411 | { | 1495 | { |
1412 | HASH2_CALC; | 1496 | HASH2_CALC |
1413 | curMatch = p->hash[hv]; | 1497 | curMatch = p->hash[hv]; |
1414 | p->hash[hv] = p->pos; | 1498 | p->hash[hv] = p->pos; |
1415 | } | 1499 | } |
@@ -1420,7 +1504,7 @@ void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1420 | { | 1504 | { |
1421 | SKIP_HEADER(3) | 1505 | SKIP_HEADER(3) |
1422 | { | 1506 | { |
1423 | HASH_ZIP_CALC; | 1507 | HASH_ZIP_CALC |
1424 | curMatch = p->hash[hv]; | 1508 | curMatch = p->hash[hv]; |
1425 | p->hash[hv] = p->pos; | 1509 | p->hash[hv] = p->pos; |
1426 | } | 1510 | } |
@@ -1433,7 +1517,7 @@ static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1433 | { | 1517 | { |
1434 | UInt32 h2; | 1518 | UInt32 h2; |
1435 | UInt32 *hash; | 1519 | UInt32 *hash; |
1436 | HASH3_CALC; | 1520 | HASH3_CALC |
1437 | hash = p->hash; | 1521 | hash = p->hash; |
1438 | curMatch = (hash + kFix3HashSize)[hv]; | 1522 | curMatch = (hash + kFix3HashSize)[hv]; |
1439 | hash[h2] = | 1523 | hash[h2] = |
@@ -1448,7 +1532,7 @@ static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1448 | { | 1532 | { |
1449 | UInt32 h2, h3; | 1533 | UInt32 h2, h3; |
1450 | UInt32 *hash; | 1534 | UInt32 *hash; |
1451 | HASH4_CALC; | 1535 | HASH4_CALC |
1452 | hash = p->hash; | 1536 | hash = p->hash; |
1453 | curMatch = (hash + kFix4HashSize)[hv]; | 1537 | curMatch = (hash + kFix4HashSize)[hv]; |
1454 | hash [h2] = | 1538 | hash [h2] = |
@@ -1464,7 +1548,7 @@ static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1464 | { | 1548 | { |
1465 | UInt32 h2, h3; | 1549 | UInt32 h2, h3; |
1466 | UInt32 *hash; | 1550 | UInt32 *hash; |
1467 | HASH5_CALC; | 1551 | HASH5_CALC |
1468 | hash = p->hash; | 1552 | hash = p->hash; |
1469 | curMatch = (hash + kFix5HashSize)[hv]; | 1553 | curMatch = (hash + kFix5HashSize)[hv]; |
1470 | hash [h2] = | 1554 | hash [h2] = |
@@ -1478,7 +1562,7 @@ static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1478 | 1562 | ||
1479 | #define HC_SKIP_HEADER(minLen) \ | 1563 | #define HC_SKIP_HEADER(minLen) \ |
1480 | do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \ | 1564 | do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \ |
1481 | Byte *cur; \ | 1565 | const Byte *cur; \ |
1482 | UInt32 *hash; \ | 1566 | UInt32 *hash; \ |
1483 | UInt32 *son; \ | 1567 | UInt32 *son; \ |
1484 | UInt32 pos = p->pos; \ | 1568 | UInt32 pos = p->pos; \ |
@@ -1510,7 +1594,7 @@ static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1510 | HC_SKIP_HEADER(4) | 1594 | HC_SKIP_HEADER(4) |
1511 | 1595 | ||
1512 | UInt32 h2, h3; | 1596 | UInt32 h2, h3; |
1513 | HASH4_CALC; | 1597 | HASH4_CALC |
1514 | curMatch = (hash + kFix4HashSize)[hv]; | 1598 | curMatch = (hash + kFix4HashSize)[hv]; |
1515 | hash [h2] = | 1599 | hash [h2] = |
1516 | (hash + kFix3HashSize)[h3] = | 1600 | (hash + kFix3HashSize)[h3] = |
@@ -1540,7 +1624,7 @@ void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1540 | { | 1624 | { |
1541 | HC_SKIP_HEADER(3) | 1625 | HC_SKIP_HEADER(3) |
1542 | 1626 | ||
1543 | HASH_ZIP_CALC; | 1627 | HASH_ZIP_CALC |
1544 | curMatch = hash[hv]; | 1628 | curMatch = hash[hv]; |
1545 | hash[hv] = pos; | 1629 | hash[hv] = pos; |
1546 | 1630 | ||
@@ -1590,17 +1674,17 @@ void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable) | |||
1590 | 1674 | ||
1591 | 1675 | ||
1592 | 1676 | ||
1593 | void LzFindPrepare() | 1677 | void LzFindPrepare(void) |
1594 | { | 1678 | { |
1595 | #ifndef FORCE_SATUR_SUB_128 | 1679 | #ifndef FORCE_LZFIND_SATUR_SUB_128 |
1596 | #ifdef USE_SATUR_SUB_128 | 1680 | #ifdef USE_LZFIND_SATUR_SUB_128 |
1597 | LZFIND_SATUR_SUB_CODE_FUNC f = NULL; | 1681 | LZFIND_SATUR_SUB_CODE_FUNC f = NULL; |
1598 | #ifdef MY_CPU_ARM_OR_ARM64 | 1682 | #ifdef MY_CPU_ARM_OR_ARM64 |
1599 | { | 1683 | { |
1600 | if (CPU_IsSupported_NEON()) | 1684 | if (CPU_IsSupported_NEON()) |
1601 | { | 1685 | { |
1602 | // #pragma message ("=== LzFind NEON") | 1686 | // #pragma message ("=== LzFind NEON") |
1603 | _PRF(printf("\n=== LzFind NEON\n")); | 1687 | PRF(printf("\n=== LzFind NEON\n")); |
1604 | f = LzFind_SaturSub_128; | 1688 | f = LzFind_SaturSub_128; |
1605 | } | 1689 | } |
1606 | // f = 0; // for debug | 1690 | // f = 0; // for debug |
@@ -1609,20 +1693,25 @@ void LzFindPrepare() | |||
1609 | if (CPU_IsSupported_SSE41()) | 1693 | if (CPU_IsSupported_SSE41()) |
1610 | { | 1694 | { |
1611 | // #pragma message ("=== LzFind SSE41") | 1695 | // #pragma message ("=== LzFind SSE41") |
1612 | _PRF(printf("\n=== LzFind SSE41\n")); | 1696 | PRF(printf("\n=== LzFind SSE41\n")); |
1613 | f = LzFind_SaturSub_128; | 1697 | f = LzFind_SaturSub_128; |
1614 | 1698 | ||
1615 | #ifdef USE_AVX2 | 1699 | #ifdef USE_LZFIND_SATUR_SUB_256 |
1616 | if (CPU_IsSupported_AVX2()) | 1700 | if (CPU_IsSupported_AVX2()) |
1617 | { | 1701 | { |
1618 | // #pragma message ("=== LzFind AVX2") | 1702 | // #pragma message ("=== LzFind AVX2") |
1619 | _PRF(printf("\n=== LzFind AVX2\n")); | 1703 | PRF(printf("\n=== LzFind AVX2\n")); |
1620 | f = LzFind_SaturSub_256; | 1704 | f = LzFind_SaturSub_256; |
1621 | } | 1705 | } |
1622 | #endif | 1706 | #endif |
1623 | } | 1707 | } |
1624 | #endif // MY_CPU_ARM_OR_ARM64 | 1708 | #endif // MY_CPU_ARM_OR_ARM64 |
1625 | g_LzFind_SaturSub = f; | 1709 | g_LzFind_SaturSub = f; |
1626 | #endif // USE_SATUR_SUB_128 | 1710 | #endif // USE_LZFIND_SATUR_SUB_128 |
1627 | #endif // FORCE_SATUR_SUB_128 | 1711 | #endif // FORCE_LZFIND_SATUR_SUB_128 |
1628 | } | 1712 | } |
1713 | |||
1714 | |||
1715 | #undef MOVE_POS | ||
1716 | #undef MOVE_POS_RET | ||
1717 | #undef PRF | ||