path: root/C
author     Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2024-05-14 00:00:00 +0000
committer  Igor Pavlov <87184205+ip7z@users.noreply.github.com>  2024-05-15 23:55:04 +0500
commit     fc662341e6f85da78ada0e443f6116b978f79f22 (patch)
tree       1be1cc402a7a9cbc18d4eeea6b141354c2d559e3 /C
parent     5b39dc76f1bc82f941d5c800ab9f34407a06b53a (diff)
download   7zip-fc662341e6f85da78ada0e443f6116b978f79f22.tar.gz
           7zip-fc662341e6f85da78ada0e443f6116b978f79f22.tar.bz2
           7zip-fc662341e6f85da78ada0e443f6116b978f79f22.zip
24.05
Diffstat (limited to 'C')
-rw-r--r--  C/7zArcIn.c | 6
-rw-r--r--  C/7zCrc.c | 508
-rw-r--r--  C/7zCrc.h | 5
-rw-r--r--  C/7zCrcOpt.c | 244
-rw-r--r--  C/7zDec.c | 46
-rw-r--r--  C/7zTypes.h | 14
-rw-r--r--  C/7zVersion.h | 10
-rw-r--r--  C/7zip_gcc_c.mak | 4
-rw-r--r--  C/Aes.c | 56
-rw-r--r--  C/AesOpt.c | 225
-rw-r--r--  C/Alloc.c | 174
-rw-r--r--  C/Alloc.h | 15
-rw-r--r--  C/Asm_c.mak | 12
-rw-r--r--  C/Blake2.h | 111
-rw-r--r--  C/Blake2s.c | 2693
-rw-r--r--  C/Bra.c | 325
-rw-r--r--  C/Bra.h | 36
-rw-r--r--  C/Compiler.h | 91
-rw-r--r--  C/CpuArch.c | 94
-rw-r--r--  C/CpuArch.h | 144
-rw-r--r--  C/DllSecur.c | 18
-rw-r--r--  C/HuffEnc.c | 4
-rw-r--r--  C/LzFind.c | 127
-rw-r--r--  C/LzFind.h | 5
-rw-r--r--  C/LzFindMt.c | 58
-rw-r--r--  C/LzFindMt.h | 9
-rw-r--r--  C/Lzma2Dec.c | 6
-rw-r--r--  C/LzmaEnc.c | 26
-rw-r--r--  C/MtCoder.c | 8
-rw-r--r--  C/MtDec.c | 18
-rw-r--r--  C/Ppmd7.c | 33
-rw-r--r--  C/Ppmd7Dec.c | 24
-rw-r--r--  C/Ppmd7Enc.c | 17
-rw-r--r--  C/Ppmd7aDec.c | 24
-rw-r--r--  C/Ppmd8.c | 33
-rw-r--r--  C/Ppmd8Dec.c | 22
-rw-r--r--  C/Ppmd8Enc.c | 17
-rw-r--r--  C/Precomp.h | 123
-rw-r--r--  C/Sha1.c | 52
-rw-r--r--  C/Sha1Opt.c | 132
-rw-r--r--  C/Sha256.c | 52
-rw-r--r--  C/Sha256Opt.c | 127
-rw-r--r--  C/SwapBytes.c | 63
-rw-r--r--  C/Threads.c | 53
-rw-r--r--  C/Threads.h | 20
-rw-r--r--  C/Util/7z/7z.dsp | 8
-rw-r--r--  C/Util/7z/7zMain.c | 45
-rw-r--r--  C/Util/7z/Precomp.h | 13
-rw-r--r--  C/Util/7z/makefile | 8
-rw-r--r--  C/Util/7zipInstall/7zipInstall.c | 32
-rw-r--r--  C/Util/7zipInstall/Precomp.h | 13
-rw-r--r--  C/Util/7zipInstall/makefile | 9
-rw-r--r--  C/Util/7zipInstall/resource.rc | 5
-rw-r--r--  C/Util/7zipUninstall/7zipUninstall.c | 75
-rw-r--r--  C/Util/7zipUninstall/Precomp.h | 13
-rw-r--r--  C/Util/7zipUninstall/resource.rc | 5
-rw-r--r--  C/Util/Lzma/Precomp.h | 13
-rw-r--r--  C/Util/LzmaLib/Precomp.h | 13
-rw-r--r--  C/Util/LzmaLib/makefile | 7
-rw-r--r--  C/Util/SfxSetup/Precomp.h | 13
-rw-r--r--  C/Util/SfxSetup/SfxSetup.c | 17
-rw-r--r--  C/Util/SfxSetup/makefile | 8
-rw-r--r--  C/Xxh64.c | 327
-rw-r--r--  C/Xxh64.h | 50
-rw-r--r--  C/Xz.c | 4
-rw-r--r--  C/Xz.h | 7
-rw-r--r--  C/XzCrc64.c | 122
-rw-r--r--  C/XzCrc64.h | 8
-rw-r--r--  C/XzCrc64Opt.c | 254
-rw-r--r--  C/XzDec.c | 84
-rw-r--r--  C/XzEnc.c | 60
-rw-r--r--  C/XzIn.c | 10
-rw-r--r--  C/ZstdDec.c | 4064
-rw-r--r--  C/ZstdDec.h | 173
-rw-r--r--  C/var_clang_arm64.mak | 1
75 files changed, 10047 insertions, 1298 deletions
diff --git a/C/7zArcIn.c b/C/7zArcIn.c
index 43fa7c2..23f2949 100644
--- a/C/7zArcIn.c
+++ b/C/7zArcIn.c
@@ -1,5 +1,5 @@
 /* 7zArcIn.c -- 7z Input functions
-2023-05-11 : Igor Pavlov : Public domain */
+2023-09-07 : Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -301,7 +301,7 @@ static SRes RememberBitVector(CSzData *sd, UInt32 numItems, const Byte **v)
 
 static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems)
 {
-  Byte b = 0;
+  unsigned b = 0;
   unsigned m = 0;
   UInt32 sum = 0;
   for (; numItems != 0; numItems--)
@@ -312,7 +312,7 @@ static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems)
       m = 8;
     }
     m--;
-    sum += ((b >> m) & 1);
+    sum += (UInt32)((b >> m) & 1);
   }
   return sum;
 }
diff --git a/C/7zCrc.c b/C/7zCrc.c
index c995a8b..6e2db9e 100644
--- a/C/7zCrc.c
+++ b/C/7zCrc.c
@@ -1,93 +1,96 @@
 /* 7zCrc.c -- CRC32 calculation and init
-2023-04-02 : Igor Pavlov : Public domain */
+2024-03-01 : Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
 #include "7zCrc.h"
 #include "CpuArch.h"
 
-#define kCrcPoly 0xEDB88320
+// for debug:
+// #define __ARM_FEATURE_CRC32 1
 
-#ifdef MY_CPU_LE
-  #define CRC_NUM_TABLES 8
-#else
-  #define CRC_NUM_TABLES 9
+#ifdef __ARM_FEATURE_CRC32
+// #pragma message("__ARM_FEATURE_CRC32")
+#define Z7_CRC_HW_FORCE
+#endif
 
-  UInt32 Z7_FASTCALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table);
-  UInt32 Z7_FASTCALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table);
+// #define Z7_CRC_DEBUG_BE
+#ifdef Z7_CRC_DEBUG_BE
+#undef MY_CPU_LE
+#define MY_CPU_BE
 #endif
 
-#ifndef MY_CPU_BE
-  UInt32 Z7_FASTCALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table);
-  UInt32 Z7_FASTCALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table);
+#ifdef Z7_CRC_HW_FORCE
+  #define Z7_CRC_NUM_TABLES_USE 1
+#else
+#ifdef Z7_CRC_NUM_TABLES
+  #define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES
+#else
+  #define Z7_CRC_NUM_TABLES_USE 12
+#endif
 #endif
 
-/*
-extern
-CRC_FUNC g_CrcUpdateT4;
-CRC_FUNC g_CrcUpdateT4;
-*/
-extern
-CRC_FUNC g_CrcUpdateT8;
-CRC_FUNC g_CrcUpdateT8;
-extern
-CRC_FUNC g_CrcUpdateT0_32;
-CRC_FUNC g_CrcUpdateT0_32;
-extern
-CRC_FUNC g_CrcUpdateT0_64;
-CRC_FUNC g_CrcUpdateT0_64;
-extern
-CRC_FUNC g_CrcUpdate;
-CRC_FUNC g_CrcUpdate;
-
-UInt32 g_CrcTable[256 * CRC_NUM_TABLES];
-
-UInt32 Z7_FASTCALL CrcUpdate(UInt32 v, const void *data, size_t size)
-{
-  return g_CrcUpdate(v, data, size, g_CrcTable);
-}
+#if Z7_CRC_NUM_TABLES_USE < 1
+  #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES
+#endif
 
-UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size)
-{
-  return g_CrcUpdate(CRC_INIT_VAL, data, size, g_CrcTable) ^ CRC_INIT_VAL;
-}
+#if defined(MY_CPU_LE) || (Z7_CRC_NUM_TABLES_USE == 1)
+  #define Z7_CRC_NUM_TABLES_TOTAL Z7_CRC_NUM_TABLES_USE
+#else
+  #define Z7_CRC_NUM_TABLES_TOTAL (Z7_CRC_NUM_TABLES_USE + 1)
+#endif
 
-#if CRC_NUM_TABLES < 4 \
-   || (CRC_NUM_TABLES == 4 && defined(MY_CPU_BE)) \
+#ifndef Z7_CRC_HW_FORCE
+
+#if Z7_CRC_NUM_TABLES_USE == 1 \
    || (!defined(MY_CPU_LE) && !defined(MY_CPU_BE))
 #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
-UInt32 Z7_FASTCALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table);
-UInt32 Z7_FASTCALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table)
+#define Z7_CRC_UPDATE_T1_FUNC_NAME CrcUpdateGT1
+static UInt32 Z7_FASTCALL Z7_CRC_UPDATE_T1_FUNC_NAME(UInt32 v, const void *data, size_t size)
 {
+  const UInt32 *table = g_CrcTable;
   const Byte *p = (const Byte *)data;
-  const Byte *pEnd = p + size;
-  for (; p != pEnd; p++)
+  const Byte *lim = p + size;
+  for (; p != lim; p++)
     v = CRC_UPDATE_BYTE_2(v, *p);
   return v;
 }
 #endif
 
+
+#if Z7_CRC_NUM_TABLES_USE != 1
+#ifndef MY_CPU_BE
+  #define FUNC_NAME_LE_2(s)   CrcUpdateT ## s
+  #define FUNC_NAME_LE_1(s)   FUNC_NAME_LE_2(s)
+  #define FUNC_NAME_LE        FUNC_NAME_LE_1(Z7_CRC_NUM_TABLES_USE)
+  UInt32 Z7_FASTCALL FUNC_NAME_LE (UInt32 v, const void *data, size_t size, const UInt32 *table);
+#endif
+#ifndef MY_CPU_LE
+  #define FUNC_NAME_BE_2(s)   CrcUpdateT1_BeT ## s
+  #define FUNC_NAME_BE_1(s)   FUNC_NAME_BE_2(s)
+  #define FUNC_NAME_BE        FUNC_NAME_BE_1(Z7_CRC_NUM_TABLES_USE)
+  UInt32 Z7_FASTCALL FUNC_NAME_BE (UInt32 v, const void *data, size_t size, const UInt32 *table);
+#endif
+#endif
+
+#endif // Z7_CRC_HW_FORCE
+
 /* ---------- hardware CRC ---------- */
 
 #ifdef MY_CPU_LE
 
 #if defined(MY_CPU_ARM_OR_ARM64)
-
 // #pragma message("ARM*")
 
-  #if defined(_MSC_VER)
-    #if defined(MY_CPU_ARM64)
-      #if (_MSC_VER >= 1910)
-        #ifndef __clang__
-          #define USE_ARM64_CRC
-          #include <intrin.h>
-        #endif
-      #endif
-    #endif
-  #elif (defined(__clang__) && (__clang_major__ >= 3)) \
-     || (defined(__GNUC__) && (__GNUC__ > 4))
+  #if (defined(__clang__) && (__clang_major__ >= 3)) \
+      || defined(__GNUC__) && (__GNUC__ >= 6) && defined(MY_CPU_ARM64) \
+      || defined(__GNUC__) && (__GNUC__ >= 8)
    #if !defined(__ARM_FEATURE_CRC32)
+// #pragma message("!defined(__ARM_FEATURE_CRC32)")
+Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
      #define __ARM_FEATURE_CRC32 1
+Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
+      #define Z7_ARM_FEATURE_CRC32_WAS_SET
      #if defined(__clang__)
        #if defined(MY_CPU_ARM64)
          #define ATTRIB_CRC __attribute__((__target__("crc")))
@@ -96,100 +99,120 @@ UInt32 Z7_FASTCALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UI
        #endif
      #else
        #if defined(MY_CPU_ARM64)
+#if !defined(Z7_GCC_VERSION) || (Z7_GCC_VERSION >= 60000)
          #define ATTRIB_CRC __attribute__((__target__("+crc")))
+#endif
        #else
+#if !defined(Z7_GCC_VERSION) || (__GNUC__ >= 8)
+#if defined(__ARM_FP) && __GNUC__ >= 8
+// for -mfloat-abi=hard: similar to <arm_acle.h>
+          #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc+simd")))
+#else
          #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc")))
+#endif
+#endif
        #endif
      #endif
    #endif
    #if defined(__ARM_FEATURE_CRC32)
-      #define USE_ARM64_CRC
+    // #pragma message("<arm_acle.h>")
+/*
+arm_acle.h (GGC):
+    before Nov 17, 2017:
+#ifdef __ARM_FEATURE_CRC32
+
+    Nov 17, 2017: gcc10.0 (gcc 9.2.0) checked"
+#if __ARM_ARCH >= 8
+#pragma GCC target ("arch=armv8-a+crc")
+
+    Aug 22, 2019: GCC 8.4?, 9.2.1, 10.1:
+#ifdef __ARM_FEATURE_CRC32
+#ifdef __ARM_FP
+#pragma GCC target ("arch=armv8-a+crc+simd")
+#else
+#pragma GCC target ("arch=armv8-a+crc")
+#endif
+*/
+#if defined(__ARM_ARCH) && __ARM_ARCH < 8
+#if defined(Z7_GCC_VERSION) && (__GNUC__ == 8) && (Z7_GCC_VERSION < 80400) \
+ || defined(Z7_GCC_VERSION) && (__GNUC__ == 9) && (Z7_GCC_VERSION < 90201) \
+ || defined(Z7_GCC_VERSION) && (__GNUC__ == 10) && (Z7_GCC_VERSION < 100100)
+Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
+// #pragma message("#define __ARM_ARCH 8")
+#undef __ARM_ARCH
+#define __ARM_ARCH 8
+Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
+#endif
+#endif
+      #define Z7_CRC_HW_USE
      #include <arm_acle.h>
    #endif
+  #elif defined(_MSC_VER)
+    #if defined(MY_CPU_ARM64)
+      #if (_MSC_VER >= 1910)
+        #ifdef __clang__
+          // #define Z7_CRC_HW_USE
+          // #include <arm_acle.h>
+        #else
+          #define Z7_CRC_HW_USE
+          #include <intrin.h>
+        #endif
+      #endif
+    #endif
  #endif
 
-#else
-
-// no hardware CRC
-
-// #define USE_CRC_EMU
-
-#ifdef USE_CRC_EMU
-
-#pragma message("ARM64 CRC emulation")
-
-Z7_FORCE_INLINE
-UInt32 __crc32b(UInt32 v, UInt32 data)
-{
-  const UInt32 *table = g_CrcTable;
-  v = CRC_UPDATE_BYTE_2(v, (Byte)data);
-  return v;
-}
+#else // non-ARM*
 
-Z7_FORCE_INLINE
-UInt32 __crc32w(UInt32 v, UInt32 data)
-{
-  const UInt32 *table = g_CrcTable;
-  v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
-  v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
-  v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
-  v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
-  return v;
-}
+// #define Z7_CRC_HW_USE // for debug : we can test HW-branch of code
+#ifdef Z7_CRC_HW_USE
+#include "7zCrcEmu.h"
+#endif
 
-Z7_FORCE_INLINE
-UInt32 __crc32d(UInt32 v, UInt64 data)
-{
-  const UInt32 *table = g_CrcTable;
-  v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
-  v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
-  v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
-  v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
-  v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
-  v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
-  v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
-  v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
-  return v;
-}
+#endif // non-ARM*
 
-#endif // USE_CRC_EMU
-
-#endif // defined(MY_CPU_ARM64) && defined(MY_CPU_LE)
+
 
+#if defined(Z7_CRC_HW_USE)
 
+// #pragma message("USE ARM HW CRC")
 
-#if defined(USE_ARM64_CRC) || defined(USE_CRC_EMU)
+#ifdef MY_CPU_64BIT
+  #define CRC_HW_WORD_TYPE  UInt64
+  #define CRC_HW_WORD_FUNC  __crc32d
+#else
+  #define CRC_HW_WORD_TYPE  UInt32
+  #define CRC_HW_WORD_FUNC  __crc32w
+#endif
 
-#define T0_32_UNROLL_BYTES (4 * 4)
-#define T0_64_UNROLL_BYTES (4 * 8)
+#define CRC_HW_UNROLL_BYTES (sizeof(CRC_HW_WORD_TYPE) * 4)
 
-#ifndef ATTRIB_CRC
-#define ATTRIB_CRC
+#ifdef ATTRIB_CRC
+  ATTRIB_CRC
 #endif
-// #pragma message("USE ARM HW CRC")
-
-ATTRIB_CRC
-UInt32 Z7_FASTCALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table);
-ATTRIB_CRC
-UInt32 Z7_FASTCALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table)
+Z7_NO_INLINE
+#ifdef Z7_CRC_HW_FORCE
+  UInt32 Z7_FASTCALL CrcUpdate
+#else
+  static UInt32 Z7_FASTCALL CrcUpdate_HW
+#endif
+    (UInt32 v, const void *data, size_t size)
 {
   const Byte *p = (const Byte *)data;
-  UNUSED_VAR(table);
-
-  for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_32_UNROLL_BYTES - 1)) != 0; size--)
+  for (; size != 0 && ((unsigned)(ptrdiff_t)p & (CRC_HW_UNROLL_BYTES - 1)) != 0; size--)
     v = __crc32b(v, *p++);
-
-  if (size >= T0_32_UNROLL_BYTES)
+  if (size >= CRC_HW_UNROLL_BYTES)
   {
     const Byte *lim = p + size;
-    size &= (T0_32_UNROLL_BYTES - 1);
+    size &= CRC_HW_UNROLL_BYTES - 1;
    lim -= size;
    do
    {
-      v = __crc32w(v, *(const UInt32 *)(const void *)(p));
-      v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4;
-      v = __crc32w(v, *(const UInt32 *)(const void *)(p));
-      v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4;
+      v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p));
+      v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE)));
+      p += 2 * sizeof(CRC_HW_WORD_TYPE);
+      v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p));
+      v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE)));
+      p += 2 * sizeof(CRC_HW_WORD_TYPE);
    }
    while (p != lim);
  }
@@ -200,46 +223,86 @@ UInt32 Z7_FASTCALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const
   return v;
 }
 
-ATTRIB_CRC
-UInt32 Z7_FASTCALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table);
-ATTRIB_CRC
-UInt32 Z7_FASTCALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table)
+#ifdef Z7_ARM_FEATURE_CRC32_WAS_SET
+Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
+#undef __ARM_FEATURE_CRC32
+Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
+#undef Z7_ARM_FEATURE_CRC32_WAS_SET
+#endif
+
+#endif // defined(Z7_CRC_HW_USE)
+#endif // MY_CPU_LE
+
+
+
+#ifndef Z7_CRC_HW_FORCE
+
+#if defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME)
+/*
+typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_WITH_TABLE_FUNC)
+    (UInt32 v, const void *data, size_t size, const UInt32 *table);
+Z7_CRC_UPDATE_WITH_TABLE_FUNC g_CrcUpdate;
+*/
+static unsigned g_Crc_Algo;
+#if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE))
+static unsigned g_Crc_Be;
+#endif
+#endif // defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME)
+
+
+
+Z7_NO_INLINE
+#ifdef Z7_CRC_HW_USE
+  static UInt32 Z7_FASTCALL CrcUpdate_Base
+#else
+  UInt32 Z7_FASTCALL CrcUpdate
+#endif
+    (UInt32 crc, const void *data, size_t size)
 {
-  const Byte *p = (const Byte *)data;
-  UNUSED_VAR(table);
+#if Z7_CRC_NUM_TABLES_USE == 1
+  return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size);
+#else // Z7_CRC_NUM_TABLES_USE != 1
+#ifdef Z7_CRC_UPDATE_T1_FUNC_NAME
+  if (g_Crc_Algo == 1)
+    return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size);
+#endif
 
-  for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_64_UNROLL_BYTES - 1)) != 0; size--)
-    v = __crc32b(v, *p++);
+#ifdef MY_CPU_LE
+  return FUNC_NAME_LE(crc, data, size, g_CrcTable);
+#elif defined(MY_CPU_BE)
+  return FUNC_NAME_BE(crc, data, size, g_CrcTable);
+#else
+  if (g_Crc_Be)
+    return FUNC_NAME_BE(crc, data, size, g_CrcTable);
+  else
+    return FUNC_NAME_LE(crc, data, size, g_CrcTable);
+#endif
+#endif // Z7_CRC_NUM_TABLES_USE != 1
+}
 
-  if (size >= T0_64_UNROLL_BYTES)
-  {
-    const Byte *lim = p + size;
-    size &= (T0_64_UNROLL_BYTES - 1);
-    lim -= size;
-    do
-    {
-      v = __crc32d(v, *(const UInt64 *)(const void *)(p));
-      v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8;
-      v = __crc32d(v, *(const UInt64 *)(const void *)(p));
-      v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8;
-    }
-    while (p != lim);
-  }
-
-  for (; size != 0; size--)
-    v = __crc32b(v, *p++);
 
-  return v;
+#ifdef Z7_CRC_HW_USE
+Z7_NO_INLINE
+UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size)
+{
+  if (g_Crc_Algo == 0)
+    return CrcUpdate_HW(crc, data, size);
+  return CrcUpdate_Base(crc, data, size);
 }
+#endif
 
-#undef T0_32_UNROLL_BYTES
-#undef T0_64_UNROLL_BYTES
+#endif // !defined(Z7_CRC_HW_FORCE)
 
-#endif // defined(USE_ARM64_CRC) || defined(USE_CRC_EMU)
 
-#endif // MY_CPU_LE
+
+UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size)
+{
+  return CrcUpdate(CRC_INIT_VAL, data, size) ^ CRC_INIT_VAL;
+}
 
 
+MY_ALIGN(64)
+UInt32 g_CrcTable[256 * Z7_CRC_NUM_TABLES_TOTAL];
 
 
 void Z7_FASTCALL CrcGenerateTable(void)
@@ -247,94 +310,111 @@ void Z7_FASTCALL CrcGenerateTable(void)
   UInt32 i;
   for (i = 0; i < 256; i++)
   {
+#if defined(Z7_CRC_HW_FORCE)
+    g_CrcTable[i] = __crc32b(i, 0);
+#else
+    #define kCrcPoly 0xEDB88320
    UInt32 r = i;
    unsigned j;
    for (j = 0; j < 8; j++)
      r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1)));
    g_CrcTable[i] = r;
+#endif
  }
-  for (i = 256; i < 256 * CRC_NUM_TABLES; i++)
+  for (i = 256; i < 256 * Z7_CRC_NUM_TABLES_USE; i++)
  {
    const UInt32 r = g_CrcTable[(size_t)i - 256];
    g_CrcTable[i] = g_CrcTable[r & 0xFF] ^ (r >> 8);
  }
 
-  #if CRC_NUM_TABLES < 4
-    g_CrcUpdate = CrcUpdateT1;
-  #elif defined(MY_CPU_LE)
-    // g_CrcUpdateT4 = CrcUpdateT4;
-    #if CRC_NUM_TABLES < 8
-      g_CrcUpdate = CrcUpdateT4;
-    #else // CRC_NUM_TABLES >= 8
-      g_CrcUpdateT8 = CrcUpdateT8;
-      /*
-      #ifdef MY_CPU_X86_OR_AMD64
-      if (!CPU_Is_InOrder())
-      #endif
-      */
-      g_CrcUpdate = CrcUpdateT8;
-    #endif
-  #else
+#if !defined(Z7_CRC_HW_FORCE) && \
+    (defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) || defined(MY_CPU_BE))
+
+#if Z7_CRC_NUM_TABLES_USE <= 1
+  g_Crc_Algo = 1;
+#else // Z7_CRC_NUM_TABLES_USE <= 1
+
+#if defined(MY_CPU_LE)
+  g_Crc_Algo = Z7_CRC_NUM_TABLES_USE;
+#else // !defined(MY_CPU_LE)
  {
-    #ifndef MY_CPU_BE
+#ifndef MY_CPU_BE
    UInt32 k = 0x01020304;
    const Byte *p = (const Byte *)&k;
    if (p[0] == 4 && p[1] == 3)
-    {
-      #if CRC_NUM_TABLES < 8
-      // g_CrcUpdateT4 = CrcUpdateT4;
-      g_CrcUpdate = CrcUpdateT4;
-      #else // CRC_NUM_TABLES >= 8
-      g_CrcUpdateT8 = CrcUpdateT8;
-      g_CrcUpdate = CrcUpdateT8;
-      #endif
-    }
+      g_Crc_Algo = Z7_CRC_NUM_TABLES_USE;
    else if (p[0] != 1 || p[1] != 2)
-      g_CrcUpdate = CrcUpdateT1;
+      g_Crc_Algo = 1;
    else
-    #endif // MY_CPU_BE
+#endif // MY_CPU_BE
    {
-      for (i = 256 * CRC_NUM_TABLES - 1; i >= 256; i--)
+      for (i = 256 * Z7_CRC_NUM_TABLES_TOTAL - 1; i >= 256; i--)
      {
        const UInt32 x = g_CrcTable[(size_t)i - 256];
        g_CrcTable[i] = Z7_BSWAP32(x);
      }
-      #if CRC_NUM_TABLES <= 4
-      g_CrcUpdate = CrcUpdateT1;
-      #elif CRC_NUM_TABLES <= 8
-      // g_CrcUpdateT4 = CrcUpdateT1_BeT4;
-      g_CrcUpdate = CrcUpdateT1_BeT4;
-      #else // CRC_NUM_TABLES > 8
-      g_CrcUpdateT8 = CrcUpdateT1_BeT8;
-      g_CrcUpdate = CrcUpdateT1_BeT8;
-      #endif
+#if defined(Z7_CRC_UPDATE_T1_FUNC_NAME)
+      g_Crc_Algo = Z7_CRC_NUM_TABLES_USE;
+#endif
+#if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE))
+      g_Crc_Be = 1;
+#endif
    }
  }
-  #endif // CRC_NUM_TABLES < 4
+#endif // !defined(MY_CPU_LE)
 
-  #ifdef MY_CPU_LE
-  #ifdef USE_ARM64_CRC
-    if (CPU_IsSupported_CRC32())
-    {
-      g_CrcUpdateT0_32 = CrcUpdateT0_32;
-      g_CrcUpdateT0_64 = CrcUpdateT0_64;
-      g_CrcUpdate =
-        #if defined(MY_CPU_ARM)
-          CrcUpdateT0_32;
-        #else
-          CrcUpdateT0_64;
-        #endif
-    }
-    #endif
-
-    #ifdef USE_CRC_EMU
-    g_CrcUpdateT0_32 = CrcUpdateT0_32;
-    g_CrcUpdateT0_64 = CrcUpdateT0_64;
-    g_CrcUpdate = CrcUpdateT0_64;
-    #endif
-  #endif
+#ifdef MY_CPU_LE
+#ifdef Z7_CRC_HW_USE
+  if (CPU_IsSupported_CRC32())
+    g_Crc_Algo = 0;
+#endif // Z7_CRC_HW_USE
+#endif // MY_CPU_LE
+
+#endif // Z7_CRC_NUM_TABLES_USE <= 1
+#endif // g_Crc_Algo was declared
+}
+
+Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo)
+{
+  if (algo == 0)
+    return &CrcUpdate;
+
+#if defined(Z7_CRC_HW_USE)
+  if (algo == sizeof(CRC_HW_WORD_TYPE) * 8)
+  {
+#ifdef Z7_CRC_HW_FORCE
+    return &CrcUpdate;
+#else
+    if (g_Crc_Algo == 0)
+      return &CrcUpdate_HW;
+#endif
+  }
+#endif
+
+#ifndef Z7_CRC_HW_FORCE
+  if (algo == Z7_CRC_NUM_TABLES_USE)
+    return
+  #ifdef Z7_CRC_HW_USE
+      &CrcUpdate_Base;
+  #else
+      &CrcUpdate;
  #endif
+#endif
+
+  return NULL;
 }
 
 #undef kCrcPoly
-#undef CRC64_NUM_TABLES
+#undef Z7_CRC_NUM_TABLES_USE
+#undef Z7_CRC_NUM_TABLES_TOTAL
 #undef CRC_UPDATE_BYTE_2
+#undef FUNC_NAME_LE_2
+#undef FUNC_NAME_LE_1
+#undef FUNC_NAME_LE
+#undef FUNC_NAME_BE_2
+#undef FUNC_NAME_BE_1
+#undef FUNC_NAME_BE
+
+#undef CRC_HW_UNROLL_BYTES
+#undef CRC_HW_WORD_FUNC
+#undef CRC_HW_WORD_TYPE
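Not part of the commit — a minimal usage sketch of the public CRC API touched above (CrcGenerateTable, CrcCalc, and the new z7_GetFunc_CrcUpdate), assuming the usual 7zCrc.h definitions such as CRC_INIT_VAL, Byte, and UInt32:

#include <stdio.h>
#include "7zCrc.h"

int main(void)
{
  static const Byte buf[9] = { '1','2','3','4','5','6','7','8','9' };
  UInt32 crc;
  CrcGenerateTable();  /* builds tables and selects an implementation; call once at startup */
  crc = CrcCalc(buf, sizeof(buf));  /* CRC-32 of "123456789" is the standard check value 0xCBF43926 */
  printf("crc = %08X\n", (unsigned)crc);
  /* the new z7_GetFunc_CrcUpdate() lets callers (e.g. benchmarks) request a
     specific variant; algo == 0 returns the default dispatcher, other values
     may return NULL when that variant is unavailable */
  {
    const Z7_CRC_UPDATE_FUNC f = z7_GetFunc_CrcUpdate(0);
    if (f)
      crc = f(CRC_INIT_VAL, buf, sizeof(buf)) ^ CRC_INIT_VAL;  /* CRC_INIT_VAL assumed from 7zCrc.h */
  }
  return 0;
}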
diff --git a/C/7zCrc.h b/C/7zCrc.h
index 4afaeae..3e6d408 100644
--- a/C/7zCrc.h
+++ b/C/7zCrc.h
@@ -1,5 +1,5 @@
 /* 7zCrc.h -- CRC32 calculation
-2023-04-02 : Igor Pavlov : Public domain */
+2024-01-22 : Igor Pavlov : Public domain */
 
 #ifndef ZIP7_INC_7Z_CRC_H
 #define ZIP7_INC_7Z_CRC_H
@@ -20,7 +20,8 @@ void Z7_FASTCALL CrcGenerateTable(void);
 UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size);
 UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size);
 
-typedef UInt32 (Z7_FASTCALL *CRC_FUNC)(UInt32 v, const void *data, size_t size, const UInt32 *table);
+typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_FUNC)(UInt32 v, const void *data, size_t size);
+Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo);
 
 EXTERN_C_END
 
diff --git a/C/7zCrcOpt.c b/C/7zCrcOpt.c
index 9c64929..9408017 100644
--- a/C/7zCrcOpt.c
+++ b/C/7zCrcOpt.c
@@ -1,117 +1,199 @@
-/* 7zCrcOpt.c -- CRC32 calculation
-2023-04-02 : Igor Pavlov : Public domain */
+/* 7zCrcOpt.c -- CRC32 calculation (optimized functions)
+2023-12-07 : Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
 #include "CpuArch.h"
 
+#if !defined(Z7_CRC_NUM_TABLES) || Z7_CRC_NUM_TABLES > 1
+
+// for debug only : define Z7_CRC_DEBUG_BE to test big-endian code in little-endian cpu
+// #define Z7_CRC_DEBUG_BE
+#ifdef Z7_CRC_DEBUG_BE
+#undef MY_CPU_LE
+#define MY_CPU_BE
+#endif
+
+// the value Z7_CRC_NUM_TABLES_USE must be defined to same value as in 7zCrc.c
+#ifdef Z7_CRC_NUM_TABLES
+#define Z7_CRC_NUM_TABLES_USE  Z7_CRC_NUM_TABLES
+#else
+#define Z7_CRC_NUM_TABLES_USE  12
+#endif
+
+#if Z7_CRC_NUM_TABLES_USE % 4 || \
+    Z7_CRC_NUM_TABLES_USE < 4 * 1 || \
+    Z7_CRC_NUM_TABLES_USE > 4 * 6
+  #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES
+#endif
+
+
 #ifndef MY_CPU_BE
 
 #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
 
-UInt32 Z7_FASTCALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table);
-UInt32 Z7_FASTCALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table)
-{
-  const Byte *p = (const Byte *)data;
-  for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++)
-    v = CRC_UPDATE_BYTE_2(v, *p);
-  for (; size >= 4; size -= 4, p += 4)
-  {
-    v ^= *(const UInt32 *)(const void *)p;
-    v =
-      (table + 0x300)[((v      ) & 0xFF)]
-    ^ (table + 0x200)[((v >>  8) & 0xFF)]
-    ^ (table + 0x100)[((v >> 16) & 0xFF)]
-    ^ (table + 0x000)[((v >> 24))];
-  }
-  for (; size > 0; size--, p++)
-    v = CRC_UPDATE_BYTE_2(v, *p);
-  return v;
-}
+#define Q(n, d) \
+  ( (table + ((n) * 4 + 3) * 0x100)[(Byte)(d)] \
+  ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \
+  ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \
+  ^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] )
+
+#define R(a)  *((const UInt32 *)(const void *)p + (a))
+
+#define CRC_FUNC_PRE_LE2(step) \
+UInt32 Z7_FASTCALL CrcUpdateT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table)
 
-UInt32 Z7_FASTCALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table);
-UInt32 Z7_FASTCALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table)
+#define CRC_FUNC_PRE_LE(step) \
+        CRC_FUNC_PRE_LE2(step); \
+        CRC_FUNC_PRE_LE2(step)
+
+CRC_FUNC_PRE_LE(Z7_CRC_NUM_TABLES_USE)
 {
   const Byte *p = (const Byte *)data;
-  for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++)
+  const Byte *lim;
+  for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++)
    v = CRC_UPDATE_BYTE_2(v, *p);
-  for (; size >= 8; size -= 8, p += 8)
+  lim = p + size;
+  if (size >= Z7_CRC_NUM_TABLES_USE)
  {
-    UInt32 d;
-    v ^= *(const UInt32 *)(const void *)p;
-    v =
-      (table + 0x700)[((v      ) & 0xFF)]
-    ^ (table + 0x600)[((v >>  8) & 0xFF)]
-    ^ (table + 0x500)[((v >> 16) & 0xFF)]
-    ^ (table + 0x400)[((v >> 24))];
-    d = *((const UInt32 *)(const void *)p + 1);
-    v ^=
-      (table + 0x300)[((d      ) & 0xFF)]
-    ^ (table + 0x200)[((d >>  8) & 0xFF)]
-    ^ (table + 0x100)[((d >> 16) & 0xFF)]
-    ^ (table + 0x000)[((d >> 24))];
+    lim -= Z7_CRC_NUM_TABLES_USE;
+    do
+    {
+      v ^= R(0);
+      {
+#if Z7_CRC_NUM_TABLES_USE == 1 * 4
+        v = Q(0, v);
+#else
+#define U2(r, op) \
+    { d = R(r);  x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); }
+        UInt32 d, x;
+        U2(1, =)
+#if Z7_CRC_NUM_TABLES_USE >= 3 * 4
+#define U(r)  U2(r, ^=)
+        U(2)
+#if Z7_CRC_NUM_TABLES_USE >= 4 * 4
+        U(3)
+#if Z7_CRC_NUM_TABLES_USE >= 5 * 4
+        U(4)
+#if Z7_CRC_NUM_TABLES_USE >= 6 * 4
+        U(5)
+#if Z7_CRC_NUM_TABLES_USE >= 7 * 4
+#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES
+#endif
+#endif
+#endif
+#endif
+#endif
+#undef U
+#undef U2
+        v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v);
+#endif
+      }
+      p += Z7_CRC_NUM_TABLES_USE;
+    }
+    while (p <= lim);
+    lim += Z7_CRC_NUM_TABLES_USE;
  }
-  for (; size > 0; size--, p++)
+  for (; p < lim; p++)
    v = CRC_UPDATE_BYTE_2(v, *p);
  return v;
 }
 
+#undef CRC_UPDATE_BYTE_2
+#undef R
+#undef Q
+#undef CRC_FUNC_PRE_LE
+#undef CRC_FUNC_PRE_LE2
+
 #endif
 
 
+
+
 #ifndef MY_CPU_LE
 
-#define CRC_UINT32_SWAP(v) Z7_BSWAP32(v)
+#define CRC_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 24) ^ (b)] ^ ((crc) << 8))
 
-#define CRC_UPDATE_BYTE_2_BE(crc, b) (table[(((crc) >> 24) ^ (b))] ^ ((crc) << 8))
+#define Q(n, d) \
+  ( (table + ((n) * 4 + 0) * 0x100)[((d))          & 0xFF] \
+  ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \
+  ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \
+  ^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] )
 
-UInt32 Z7_FASTCALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table)
-{
-  const Byte *p = (const Byte *)data;
-  table += 0x100;
-  v = CRC_UINT32_SWAP(v);
-  for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++)
-    v = CRC_UPDATE_BYTE_2_BE(v, *p);
-  for (; size >= 4; size -= 4, p += 4)
-  {
-    v ^= *(const UInt32 *)(const void *)p;
-    v =
-      (table + 0x000)[((v      ) & 0xFF)]
-    ^ (table + 0x100)[((v >>  8) & 0xFF)]
-    ^ (table + 0x200)[((v >> 16) & 0xFF)]
-    ^ (table + 0x300)[((v >> 24))];
-  }
-  for (; size > 0; size--, p++)
-    v = CRC_UPDATE_BYTE_2_BE(v, *p);
-  return CRC_UINT32_SWAP(v);
-}
+#ifdef Z7_CRC_DEBUG_BE
+  #define R(a)  GetBe32a((const UInt32 *)(const void *)p + (a))
+#else
+  #define R(a)  *((const UInt32 *)(const void *)p + (a))
+#endif
+
+
+#define CRC_FUNC_PRE_BE2(step) \
+UInt32 Z7_FASTCALL CrcUpdateT1_BeT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table)
 
-UInt32 Z7_FASTCALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table)
+#define CRC_FUNC_PRE_BE(step) \
+        CRC_FUNC_PRE_BE2(step); \
+        CRC_FUNC_PRE_BE2(step)
+
+CRC_FUNC_PRE_BE(Z7_CRC_NUM_TABLES_USE)
 {
   const Byte *p = (const Byte *)data;
+  const Byte *lim;
   table += 0x100;
-  v = CRC_UINT32_SWAP(v);
-  for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++)
+  v = Z7_BSWAP32(v);
+  for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++)
    v = CRC_UPDATE_BYTE_2_BE(v, *p);
-  for (; size >= 8; size -= 8, p += 8)
+  lim = p + size;
+  if (size >= Z7_CRC_NUM_TABLES_USE)
  {
-    UInt32 d;
-    v ^= *(const UInt32 *)(const void *)p;
-    v =
-      (table + 0x400)[((v      ) & 0xFF)]
-    ^ (table + 0x500)[((v >>  8) & 0xFF)]
-    ^ (table + 0x600)[((v >> 16) & 0xFF)]
-    ^ (table + 0x700)[((v >> 24))];
-    d = *((const UInt32 *)(const void *)p + 1);
-    v ^=
-      (table + 0x000)[((d      ) & 0xFF)]
-    ^ (table + 0x100)[((d >>  8) & 0xFF)]
-    ^ (table + 0x200)[((d >> 16) & 0xFF)]
-    ^ (table + 0x300)[((d >> 24))];
+    lim -= Z7_CRC_NUM_TABLES_USE;
+    do
+    {
+      v ^= R(0);
+      {
+#if Z7_CRC_NUM_TABLES_USE == 1 * 4
+        v = Q(0, v);
+#else
+#define U2(r, op) \
+    { d = R(r);  x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); }
+        UInt32 d, x;
+        U2(1, =)
+#if Z7_CRC_NUM_TABLES_USE >= 3 * 4
+#define U(r)  U2(r, ^=)
+        U(2)
+#if Z7_CRC_NUM_TABLES_USE >= 4 * 4
+        U(3)
+#if Z7_CRC_NUM_TABLES_USE >= 5 * 4
+        U(4)
+#if Z7_CRC_NUM_TABLES_USE >= 6 * 4
+        U(5)
+#if Z7_CRC_NUM_TABLES_USE >= 7 * 4
+#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES
+#endif
+#endif
+#endif
+#endif
+#endif
+#undef U
+#undef U2
+        v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v);
+#endif
+      }
+      p += Z7_CRC_NUM_TABLES_USE;
+    }
+    while (p <= lim);
+    lim += Z7_CRC_NUM_TABLES_USE;
  }
-  for (; size > 0; size--, p++)
+  for (; p < lim; p++)
    v = CRC_UPDATE_BYTE_2_BE(v, *p);
-  return CRC_UINT32_SWAP(v);
+  return Z7_BSWAP32(v);
 }
 
+#undef CRC_UPDATE_BYTE_2_BE
+#undef R
+#undef Q
+#undef CRC_FUNC_PRE_BE
+#undef CRC_FUNC_PRE_BE2
+
+#endif
+#undef Z7_CRC_NUM_TABLES_USE
 #endif
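The Q()/R()/U() macros above generalize the old fixed CrcUpdateT4/CrcUpdateT8 into slicing-by-N (N = Z7_CRC_NUM_TABLES_USE bytes per step, 4 to 24). For reference, here is the same idea written out without macros — a plain slicing-by-4 step (illustration only, not code from the commit; little-endian, with the table laid out as CrcGenerateTable() builds it):

#include "7zTypes.h"

static UInt32 Crc_SliceBy4(UInt32 v, const Byte *p, size_t size, const UInt32 *table)
{
  /* consume 4 input bytes per iteration, one lookup table per byte lane */
  for (; size >= 4; size -= 4, p += 4)
  {
    v ^= (UInt32)p[0]
       | ((UInt32)p[1] << 8)
       | ((UInt32)p[2] << 16)
       | ((UInt32)p[3] << 24);
    v = (table + 0x300)[ v        & 0xFF]
      ^ (table + 0x200)[(v >>  8) & 0xFF]
      ^ (table + 0x100)[(v >> 16) & 0xFF]
      ^ (table + 0x000)[ v >> 24        ];
  }
  for (; size != 0; size--, p++)  /* byte-at-a-time tail */
    v = table[(v ^ *p) & 0xFF] ^ (v >> 8);
  return v;
}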
diff --git a/C/7zDec.c b/C/7zDec.c
index 96c6035..c9b4064 100644
--- a/C/7zDec.c
+++ b/C/7zDec.c
@@ -1,5 +1,5 @@
 /* 7zDec.c -- Decoding from 7z folder
-2023-04-02 : Igor Pavlov : Public domain */
+2024-03-01 : Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -51,6 +51,7 @@
 
 #ifndef Z7_NO_METHODS_FILTERS
 #define k_Delta 3
+#define k_RISCV 0xb
 #define k_BCJ   0x3030103
 #define k_PPC   0x3030205
 #define k_IA64  0x3030401
@@ -362,6 +363,7 @@ static SRes CheckSupportedFolder(const CSzFolder *f)
      case k_IA64:
      case k_SPARC:
      case k_ARM:
+      case k_RISCV:
  #endif
  #ifdef Z7_USE_FILTER_ARM64
      case k_ARM64:
@@ -535,10 +537,10 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
        }
      }
    }
-    #if defined(Z7_USE_BRANCH_FILTER)
+#if defined(Z7_USE_BRANCH_FILTER)
    else if (ci == 1)
    {
-      #if !defined(Z7_NO_METHODS_FILTERS)
+#if !defined(Z7_NO_METHODS_FILTERS)
      if (coder->MethodID == k_Delta)
      {
        if (coder->PropsSize != 1)
@@ -550,22 +552,43 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
        }
        continue;
      }
-      #endif
+#endif
 
-      #ifdef Z7_USE_FILTER_ARM64
+#ifdef Z7_USE_FILTER_ARM64
      if (coder->MethodID == k_ARM64)
      {
        UInt32 pc = 0;
        if (coder->PropsSize == 4)
+        {
          pc = GetUi32(propsData + coder->PropsOffset);
+          if (pc & 3)
+            return SZ_ERROR_UNSUPPORTED;
+        }
        else if (coder->PropsSize != 0)
          return SZ_ERROR_UNSUPPORTED;
        z7_BranchConv_ARM64_Dec(outBuffer, outSize, pc);
        continue;
      }
-      #endif
+#endif
 
-      #if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT)
+#if !defined(Z7_NO_METHODS_FILTERS)
+      if (coder->MethodID == k_RISCV)
+      {
+        UInt32 pc = 0;
+        if (coder->PropsSize == 4)
+        {
+          pc = GetUi32(propsData + coder->PropsOffset);
+          if (pc & 1)
+            return SZ_ERROR_UNSUPPORTED;
+        }
+        else if (coder->PropsSize != 0)
+          return SZ_ERROR_UNSUPPORTED;
+        z7_BranchConv_RISCV_Dec(outBuffer, outSize, pc);
+        continue;
+      }
+#endif
+
+#if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT)
      {
        if (coder->PropsSize != 0)
          return SZ_ERROR_UNSUPPORTED;
@@ -579,7 +602,8 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
          z7_BranchConvSt_X86_Dec(outBuffer, outSize, 0, &state); // pc = 0
          break;
        }
-        CASE_BRA_CONV(PPC)
+        case k_PPC: Z7_BRANCH_CONV_DEC_2(BranchConv_PPC)(outBuffer, outSize, 0); break; // pc = 0;
+        // CASE_BRA_CONV(PPC)
        CASE_BRA_CONV(IA64)
        CASE_BRA_CONV(SPARC)
        CASE_BRA_CONV(ARM)
@@ -592,9 +616,9 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
        }
        continue;
      }
-      #endif
+#endif
    } // (c == 1)
-    #endif
+#endif // Z7_USE_BRANCH_FILTER
    else
      return SZ_ERROR_UNSUPPORTED;
  }
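The new k_RISCV branch filter follows the same property convention as the ARM64 filter: an optional 4-byte start address (pc), now validated for instruction alignment — 4-byte for ARM64, 2-byte for RISC-V, whose instruction parcels are 16-bit. A condensed sketch of that rule (illustration only; CheckFilterPc is a hypothetical helper, not a function in the source):

#include "7zTypes.h"

static SRes CheckFilterPc(UInt32 methodId, UInt32 pc)
{
  /* k_ARM64 (0xa) instructions are 4-byte aligned; k_RISCV (0xb) parcels are 2-byte aligned */
  const UInt32 alignMask = (methodId == 0xb /* k_RISCV */) ? 1 : 3;
  return (pc & alignMask) ? SZ_ERROR_UNSUPPORTED : SZ_OK;
}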
diff --git a/C/7zTypes.h b/C/7zTypes.h
index 1fcb247..5b77420 100644
--- a/C/7zTypes.h
+++ b/C/7zTypes.h
@@ -1,5 +1,5 @@
 /* 7zTypes.h -- Basic types
-2023-04-02 : Igor Pavlov : Public domain */
+2024-01-24 : Igor Pavlov : Public domain */
 
 #ifndef ZIP7_7Z_TYPES_H
 #define ZIP7_7Z_TYPES_H
@@ -530,20 +530,20 @@ struct ISzAlloc
 #define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL(ptr, type, m)
 */
 #if defined (__clang__) || defined(__GNUC__)
-#define Z7_DIAGNOSCTIC_IGNORE_BEGIN_CAST_QUAL \
+#define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \
   _Pragma("GCC diagnostic push") \
   _Pragma("GCC diagnostic ignored \"-Wcast-qual\"")
-#define Z7_DIAGNOSCTIC_IGNORE_END_CAST_QUAL \
+#define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL \
   _Pragma("GCC diagnostic pop")
 #else
-#define Z7_DIAGNOSCTIC_IGNORE_BEGIN_CAST_QUAL
-#define Z7_DIAGNOSCTIC_IGNORE_END_CAST_QUAL
+#define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL
+#define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL
 #endif
 
 #define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(ptr, type, m, p) \
-  Z7_DIAGNOSCTIC_IGNORE_BEGIN_CAST_QUAL \
+  Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \
   type *p = Z7_CONTAINER_FROM_VTBL(ptr, type, m); \
-  Z7_DIAGNOSCTIC_IGNORE_END_CAST_QUAL
+  Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL
 
 #define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(type) \
   Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(pp, type, vt, p)
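This rename fixes the "DIAGNOSCTIC" misspelling in the macro names. A brief usage sketch of the corrected pair (any -Wcast-qual warning between the two macros is suppressed on clang/GCC; on other compilers they expand to nothing):

#include "7zTypes.h"

/* illustration only: strip const without tripping -Wcast-qual on clang/GCC */
static char *UnconstCast(const char *s)
{
  char *p;
  Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL
  p = (char *)s;  /* would warn with -Wcast-qual if not wrapped */
  Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL
  return p;
}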
diff --git a/C/7zVersion.h b/C/7zVersion.h
index 7549239..72b915a 100644
--- a/C/7zVersion.h
+++ b/C/7zVersion.h
@@ -1,7 +1,7 @@
-#define MY_VER_MAJOR 23
-#define MY_VER_MINOR 01
+#define MY_VER_MAJOR 24
+#define MY_VER_MINOR 05
 #define MY_VER_BUILD 0
-#define MY_VERSION_NUMBERS "23.01"
+#define MY_VERSION_NUMBERS "24.05"
 #define MY_VERSION MY_VERSION_NUMBERS
 
 #ifdef MY_CPU_NAME
@@ -10,12 +10,12 @@
   #define MY_VERSION_CPU MY_VERSION
 #endif
 
-#define MY_DATE "2023-06-20"
+#define MY_DATE "2024-05-14"
 #undef MY_COPYRIGHT
 #undef MY_VERSION_COPYRIGHT_DATE
 #define MY_AUTHOR_NAME "Igor Pavlov"
 #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain"
-#define MY_COPYRIGHT_CR "Copyright (c) 1999-2023 Igor Pavlov"
+#define MY_COPYRIGHT_CR "Copyright (c) 1999-2024 Igor Pavlov"
 
 #ifdef USE_COPYRIGHT_CR
   #define MY_COPYRIGHT MY_COPYRIGHT_CR
diff --git a/C/7zip_gcc_c.mak b/C/7zip_gcc_c.mak
index f19a99b..195d23d 100644
--- a/C/7zip_gcc_c.mak
+++ b/C/7zip_gcc_c.mak
@@ -22,8 +22,8 @@ CFLAGS_BASE_LIST = -c
 # for ASM file
 # CFLAGS_BASE_LIST = -S
 
-FLAGS_FLTO =
 FLAGS_FLTO = -flto
+FLAGS_FLTO =
 
 CFLAGS_BASE = $(MY_ARCH_2) -O2 $(CFLAGS_BASE_LIST) $(CFLAGS_WARN_WALL) $(CFLAGS_WARN) \
   -DNDEBUG -D_REENTRANT -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE
@@ -329,7 +329,7 @@ endif
 
 ifdef IS_ARM64
 $O/LzmaDecOpt.o: ../../../Asm/arm64/LzmaDecOpt.S ../../../Asm/arm64/7zAsm.S
-	$(CC) $(CFLAGS) $<
+	$(CC) $(CFLAGS) $(ASM_FLAGS) $<
 endif
 
 $O/LzmaDec.o: ../../LzmaDec.c
diff --git a/C/Aes.c b/C/Aes.c
index bcaafab..abc5d24 100644
--- a/C/Aes.c
+++ b/C/Aes.c
@@ -1,5 +1,5 @@
 /* Aes.c -- AES encryption / decryption
-2023-04-02 : Igor Pavlov : Public domain */
+2024-03-01 : Igor Pavlov : Public domain */
 
 #include "Precomp.h"
 
@@ -13,7 +13,9 @@ AES_CODE_FUNC g_AesCtr_Code;
 UInt32 g_Aes_SupportedFunctions_Flags;
 #endif
 
+MY_ALIGN(64)
 static UInt32 T[256 * 4];
+MY_ALIGN(64)
 static const Byte Sbox[256] = {
   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
@@ -33,7 +35,9 @@ static const Byte Sbox[256] = {
   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};
 
 
+MY_ALIGN(64)
 static UInt32 D[256 * 4];
+MY_ALIGN(64)
 static Byte InvS[256];
 
 #define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF)
@@ -54,24 +58,54 @@ static Byte InvS[256];
 // #define Z7_SHOW_AES_STATUS
 
 #ifdef MY_CPU_X86_OR_AMD64
-  #define USE_HW_AES
-#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
-  #if defined(__clang__)
-    #if (__clang_major__ >= 8) // fix that check
-      #define USE_HW_AES
-    #endif
-  #elif defined(__GNUC__)
-    #if (__GNUC__ >= 6) // fix that check
+
+  #if defined(__INTEL_COMPILER)
+    #if (__INTEL_COMPILER >= 1110)
      #define USE_HW_AES
+      #if (__INTEL_COMPILER >= 1900)
+        #define USE_HW_VAES
+      #endif
    #endif
+  #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
+     || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400)
+    #define USE_HW_AES
+    #if defined(__clang__) && (__clang_major__ >= 8) \
+       || defined(__GNUC__) && (__GNUC__ >= 8)
+      #define USE_HW_VAES
+    #endif
  #elif defined(_MSC_VER)
-    #if _MSC_VER >= 1910
+    #define USE_HW_AES
+    #define USE_HW_VAES
+  #endif
+
+#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
+
+  #if defined(__ARM_FEATURE_AES) \
+   || defined(__ARM_FEATURE_CRYPTO)
+    #define USE_HW_AES
+  #else
+    #if defined(MY_CPU_ARM64) \
+     || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
+     || defined(Z7_MSC_VER_ORIGINAL)
+    #if defined(__ARM_FP) && \
+        ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
+       || defined(__GNUC__) && (__GNUC__ >= 6) \
+        ) \
+     || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
+    #if defined(MY_CPU_ARM64) \
+     || !defined(Z7_CLANG_VERSION) \
+     || defined(__ARM_NEON) && \
+          (Z7_CLANG_VERSION < 170000 || \
+           Z7_CLANG_VERSION > 170001)
      #define USE_HW_AES
    #endif
+    #endif
+    #endif
  #endif
 #endif
 
 #ifdef USE_HW_AES
+// #pragma message("=== Aes.c USE_HW_AES === ")
 #ifdef Z7_SHOW_AES_STATUS
 #include <stdio.h>
 #define PRF(x) x
@@ -136,6 +170,7 @@ void AesGenTables(void)
   #endif
 
   #ifdef MY_CPU_X86_OR_AMD64
+  #ifdef USE_HW_VAES
   if (CPU_IsSupported_VAES_AVX2())
   {
     PRF(printf("\n===vaes avx2\n"));
@@ -146,6 +181,7 @@ void AesGenTables(void)
    #endif
  }
  #endif
+  #endif
  }
  #endif
 
diff --git a/C/AesOpt.c b/C/AesOpt.c
index cfa6413..58769ea 100644
--- a/C/AesOpt.c
+++ b/C/AesOpt.c
@@ -1,5 +1,5 @@
1/* AesOpt.c -- AES optimized code for x86 AES hardware instructions 1/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
22023-04-02 : Igor Pavlov : Public domain */ 22024-03-01 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -15,8 +15,8 @@
15 #define USE_INTEL_VAES 15 #define USE_INTEL_VAES
16 #endif 16 #endif
17 #endif 17 #endif
18 #elif defined(__clang__) && (__clang_major__ > 3 || __clang_major__ == 3 && __clang_minor__ >= 8) \ 18 #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
19 || defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4) 19 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400)
20 #define USE_INTEL_AES 20 #define USE_INTEL_AES
21 #if !defined(__AES__) 21 #if !defined(__AES__)
22 #define ATTRIB_AES __attribute__((__target__("aes"))) 22 #define ATTRIB_AES __attribute__((__target__("aes")))
@@ -35,27 +35,37 @@
35 #define USE_INTEL_VAES 35 #define USE_INTEL_VAES
36 #endif 36 #endif
37 #endif 37 #endif
38 #ifndef USE_INTEL_AES
39 #define Z7_USE_AES_HW_STUB
40 #endif
41 #ifndef USE_INTEL_VAES
42 #define Z7_USE_VAES_HW_STUB
43 #endif
38 #endif 44 #endif
39 45
40#ifndef ATTRIB_AES 46 #ifndef USE_INTEL_AES
41 #define ATTRIB_AES 47 // #define Z7_USE_AES_HW_STUB // for debug
42#endif 48 #endif
43#ifndef ATTRIB_VAES 49 #ifndef USE_INTEL_VAES
44 #define ATTRIB_VAES 50 // #define Z7_USE_VAES_HW_STUB // for debug
45#endif 51 #endif
46 52
47 53
48#ifdef USE_INTEL_AES 54#ifdef USE_INTEL_AES
49 55
50#include <wmmintrin.h> 56#include <wmmintrin.h>
51 57
52#ifndef USE_INTEL_VAES 58#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
53#define AES_TYPE_keys UInt32 59#define AES_TYPE_keys UInt32
54#define AES_TYPE_data Byte 60#define AES_TYPE_data Byte
55// #define AES_TYPE_keys __m128i 61// #define AES_TYPE_keys __m128i
56// #define AES_TYPE_data __m128i 62// #define AES_TYPE_data __m128i
57#endif 63#endif
58 64
65#ifndef ATTRIB_AES
66 #define ATTRIB_AES
67#endif
68
59#define AES_FUNC_START(name) \ 69#define AES_FUNC_START(name) \
60 void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) 70 void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
61 // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks) 71 // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks)
@@ -69,8 +79,6 @@ AES_FUNC_START (name)
69#define MM_OP_m(op, src) MM_OP(op, m, src) 79#define MM_OP_m(op, src) MM_OP(op, m, src)
70 80
71#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) 81#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src)
72#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src)
73
74 82
75AES_FUNC_START2 (AesCbc_Encode_HW) 83AES_FUNC_START2 (AesCbc_Encode_HW)
76{ 84{
@@ -139,11 +147,6 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
139#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) 147#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1])
140#endif 148#endif
141 149
142#define AVX_DECLARE_VAR(reg, ii) __m256i reg;
143#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii];
144#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg;
145#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))
146
147#define MM_OP_key(op, reg) MM_OP(op, reg, key); 150#define MM_OP_key(op, reg) MM_OP(op, reg, key);
148 151
149#define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg) 152#define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg)
@@ -152,27 +155,13 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
152#define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg) 155#define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg)
153#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) 156#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg)
154 157
155
156#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
157#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
158#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
159#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
160#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)
161
162#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; 158#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr;
163#define CTR_END( reg, ii) MM_XOR (data[ii], reg) 159#define CTR_END( reg, ii) MM_XOR (data[ii], reg)
164 160
165#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key);
166#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg)
167
168#define WOP_KEY(op, n) { \ 161#define WOP_KEY(op, n) { \
169 const __m128i key = w[n]; \ 162 const __m128i key = w[n]; \
170 WOP(op); } 163 WOP(op); }
171 164
172#define AVX_WOP_KEY(op, n) { \
173 const __m256i key = w[n]; \
174 WOP(op); }
175
176 165
177#define WIDE_LOOP_START \ 166#define WIDE_LOOP_START \
178 dataEnd = data + numBlocks; \ 167 dataEnd = data + numBlocks; \
@@ -190,6 +179,40 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
190 for (; data < dataEnd; data++) 179 for (; data < dataEnd; data++)
191 180
192 181
182
183#ifdef USE_INTEL_VAES
184
185#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src)
186#define AVX_DECLARE_VAR(reg, ii) __m256i reg;
187#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii];
188#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg;
189/*
190AVX_XOR_data_M1() needs unaligned memory load
191if (we don't use _mm256_loadu_si256() here)
192{
193 Most compilers with enabled optimizations generate fused AVX (LOAD + OP)
194 instruction that can load unaligned data.
195 But GCC and CLANG without -O2 or -O1 optimizations can generate separated
196 LOAD-ALIGNED (vmovdqa) instruction that will fail on execution.
197}
198Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here.
199v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler.
200*/
201#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii])))
202// for debug only: the following code will fail on execution, if compiled by some compilers:
203// #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))
204
205#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
206#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
207#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
208#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
209#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)
210#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key);
211#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg)
212#define AVX_WOP_KEY(op, n) { \
213 const __m256i key = w[n]; \
214 WOP(op); }
215
193#define NUM_AES_KEYS_MAX 15 216#define NUM_AES_KEYS_MAX 15
194 217
195#define WIDE_LOOP_START_AVX(OP) \ 218#define WIDE_LOOP_START_AVX(OP) \
@@ -214,6 +237,9 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
214/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, 237/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
215 MSVC still can insert vzeroupper instruction. */ 238 MSVC still can insert vzeroupper instruction. */
216 239
240#endif
241
242
217 243
218AES_FUNC_START2 (AesCbc_Decode_HW) 244AES_FUNC_START2 (AesCbc_Decode_HW)
219{ 245{
@@ -380,6 +406,9 @@ required that <immintrin.h> must be included before <avxintrin.h>.
380 #endif 406 #endif
381#endif // __clang__ && _MSC_VER 407#endif // __clang__ && _MSC_VER
382 408
409#ifndef ATTRIB_VAES
410 #define ATTRIB_VAES
411#endif
383 412
384#define VAES_FUNC_START2(name) \ 413#define VAES_FUNC_START2(name) \
385AES_FUNC_START (name); \ 414AES_FUNC_START (name); \
@@ -519,10 +548,18 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
519 548
520/* no USE_INTEL_AES */ 549/* no USE_INTEL_AES */
521 550
551#if defined(Z7_USE_AES_HW_STUB)
552// We can compile this file with another C compiler,
553// or we can compile asm version.
554// So we can generate real code instead of this stub function.
555// #if defined(_MSC_VER)
522#pragma message("AES HW_SW stub was used") 556#pragma message("AES HW_SW stub was used")
557// #endif
523 558
559#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
524#define AES_TYPE_keys UInt32 560#define AES_TYPE_keys UInt32
525#define AES_TYPE_data Byte 561#define AES_TYPE_data Byte
562#endif
526 563
527#define AES_FUNC_START(name) \ 564#define AES_FUNC_START(name) \
528 void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \ 565 void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \
@@ -535,13 +572,16 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
535AES_COMPAT_STUB (AesCbc_Encode) 572AES_COMPAT_STUB (AesCbc_Encode)
536AES_COMPAT_STUB (AesCbc_Decode) 573AES_COMPAT_STUB (AesCbc_Decode)
537AES_COMPAT_STUB (AesCtr_Code) 574AES_COMPAT_STUB (AesCtr_Code)
575#endif // Z7_USE_AES_HW_STUB
538 576
539#endif // USE_INTEL_AES 577#endif // USE_INTEL_AES
540 578
541 579
542#ifndef USE_INTEL_VAES 580#ifndef USE_INTEL_VAES
543 581#if defined(Z7_USE_VAES_HW_STUB)
582// #if defined(_MSC_VER)
544#pragma message("VAES HW_SW stub was used") 583#pragma message("VAES HW_SW stub was used")
584// #endif
545 585
546#define VAES_COMPAT_STUB(name) \ 586#define VAES_COMPAT_STUB(name) \
547 void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ 587 void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
@@ -550,36 +590,59 @@ AES_COMPAT_STUB (AesCtr_Code)
550 590
551VAES_COMPAT_STUB (AesCbc_Decode_HW) 591VAES_COMPAT_STUB (AesCbc_Decode_HW)
552VAES_COMPAT_STUB (AesCtr_Code_HW) 592VAES_COMPAT_STUB (AesCtr_Code_HW)
553 593#endif
554#endif // ! USE_INTEL_VAES 594#endif // ! USE_INTEL_VAES
555 595
556 596
597
598
557#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) 599#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
558 600
559 #if defined(__clang__) 601 #if defined(__ARM_FEATURE_AES) \
560 #if (__clang_major__ >= 8) // fix that check 602 || defined(__ARM_FEATURE_CRYPTO)
603 #define USE_HW_AES
604 #else
605 #if defined(MY_CPU_ARM64) \
606 || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
607 || defined(Z7_MSC_VER_ORIGINAL)
608 #if defined(__ARM_FP) && \
609 ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
610 || defined(__GNUC__) && (__GNUC__ >= 6) \
611 ) \
612 || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
613 #if defined(MY_CPU_ARM64) \
614 || !defined(Z7_CLANG_VERSION) \
615 || defined(__ARM_NEON) && \
616 (Z7_CLANG_VERSION < 170000 || \
617 Z7_CLANG_VERSION > 170001)
561 #define USE_HW_AES 618 #define USE_HW_AES
562 #endif 619 #endif
563 #elif defined(__GNUC__)
564 #if (__GNUC__ >= 6) // fix that check
565 #define USE_HW_AES
566 #endif 620 #endif
567 #elif defined(_MSC_VER)
568 #if _MSC_VER >= 1910
569 #define USE_HW_AES
570 #endif 621 #endif
571 #endif 622 #endif
572 623
573#ifdef USE_HW_AES 624#ifdef USE_HW_AES
574 625
575// #pragma message("=== AES HW === ") 626// #pragma message("=== AES HW === ")
627// the __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer-grained feature macro __ARM_FEATURE_AES
576 628
577#if defined(__clang__) || defined(__GNUC__) 629#if defined(__clang__) || defined(__GNUC__)
630#if !defined(__ARM_FEATURE_AES) && \
631 !defined(__ARM_FEATURE_CRYPTO)
578 #ifdef MY_CPU_ARM64 632 #ifdef MY_CPU_ARM64
633#if defined(__clang__)
634 #define ATTRIB_AES __attribute__((__target__("crypto")))
635#else
579 #define ATTRIB_AES __attribute__((__target__("+crypto"))) 636 #define ATTRIB_AES __attribute__((__target__("+crypto")))
637#endif
580 #else 638 #else
639#if defined(__clang__)
640 #define ATTRIB_AES __attribute__((__target__("armv8-a,aes")))
641#else
581 #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) 642 #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
643#endif
582 #endif 644 #endif
645#endif
583#else 646#else
584 // _MSC_VER 647 // _MSC_VER
585 // for arm32 648 // for arm32
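The ATTRIB_AES definitions above enable the crypto extension per function rather than for the whole translation unit, so the rest of the file can still be built for a baseline CPU. A minimal sketch of the pattern, assuming a GCC or clang build where <arm_neon.h> provides the AES intrinsics (the "+crypto" spelling shown is the arm64/GCC form; the clang and arm32 spellings differ, as the hunk above shows):

/* illustrative sketch only, not part of the patch */
#include <arm_neon.h>

__attribute__((__target__("+crypto")))
static uint8x16_t OneAesEncRound(uint8x16_t block, uint8x16_t roundKey)
{
  /* AESE = AddRoundKey + SubBytes + ShiftRows; AESMC = MixColumns */
  return vaesmcq_u8(vaeseq_u8(block, roundKey));
}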
@@ -590,12 +653,60 @@ VAES_COMPAT_STUB (AesCtr_Code_HW)
590 #define ATTRIB_AES 653 #define ATTRIB_AES
591#endif 654#endif
592 655
593#if defined(_MSC_VER) && defined(MY_CPU_ARM64) 656#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
594#include <arm64_neon.h> 657#include <arm64_neon.h>
595#else 658#else
659/*
660 clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese
661 clang
662 3.8.1 : __ARM_NEON : defined(__ARM_FEATURE_CRYPTO)
663 7.0.1 : __ARM_NEON : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
664 11.?.0 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
665 13.0.1 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES)
666 16 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8
667*/
668#if defined(__clang__) && __clang_major__ < 16
669#if !defined(__ARM_FEATURE_AES) && \
670 !defined(__ARM_FEATURE_CRYPTO)
671// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
672 Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
673 #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
674// #if defined(__clang__) && __clang_major__ < 13
675 #define __ARM_FEATURE_CRYPTO 1
676// #else
677 #define __ARM_FEATURE_AES 1
678// #endif
679 Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
680#endif
681#endif // clang
682
683#if defined(__clang__)
684
685#if defined(__ARM_ARCH) && __ARM_ARCH < 8
686 Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
687// #pragma message("#define __ARM_ARCH 8")
688 #undef __ARM_ARCH
689 #define __ARM_ARCH 8
690 Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
691#endif
692
693#endif // clang
694
596#include <arm_neon.h> 695#include <arm_neon.h>
696
697#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
698 defined(__ARM_FEATURE_CRYPTO) && \
699 defined(__ARM_FEATURE_AES)
700Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
701 #undef __ARM_FEATURE_CRYPTO
702 #undef __ARM_FEATURE_AES
703 #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
704Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
705// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
597#endif 706#endif
598 707
708#endif // Z7_MSC_VER_ORIGINAL
709
599typedef uint8x16_t v128; 710typedef uint8x16_t v128;
600 711
601#define AES_FUNC_START(name) \ 712#define AES_FUNC_START(name) \
@@ -620,7 +731,7 @@ AES_FUNC_START (name)
620 731
621AES_FUNC_START2 (AesCbc_Encode_HW) 732AES_FUNC_START2 (AesCbc_Encode_HW)
622{ 733{
623 v128 *p = (v128*)(void*)ivAes; 734 v128 * const p = (v128*)(void*)ivAes;
624 v128 *data = (v128*)(void*)data8; 735 v128 *data = (v128*)(void*)data8;
625 v128 m = *p; 736 v128 m = *p;
626 const v128 k0 = p[2]; 737 const v128 k0 = p[2];
@@ -639,7 +750,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
639 const v128 k_z0 = w[2]; 750 const v128 k_z0 = w[2];
640 for (; numBlocks != 0; numBlocks--, data++) 751 for (; numBlocks != 0; numBlocks--, data++)
641 { 752 {
642 MM_XOR_m (*data); 753 MM_XOR_m (*data)
643 AES_E_MC_m (k0) 754 AES_E_MC_m (k0)
644 AES_E_MC_m (k1) 755 AES_E_MC_m (k1)
645 AES_E_MC_m (k2) 756 AES_E_MC_m (k2)
@@ -660,7 +771,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
660 } 771 }
661 } 772 }
662 AES_E_m (k_z1) 773 AES_E_m (k_z1)
663 MM_XOR_m (k_z0); 774 MM_XOR_m (k_z0)
664 *data = m; 775 *data = m;
665 } 776 }
666 *p = m; 777 *p = m;
@@ -745,7 +856,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
745 while (w != p); 856 while (w != p);
746 WOP_KEY (AES_D, 1) 857 WOP_KEY (AES_D, 1)
747 WOP_KEY (AES_XOR, 0) 858 WOP_KEY (AES_XOR, 0)
748 MM_XOR (m0, iv); 859 MM_XOR (m0, iv)
749 WOP_M1 (XOR_data_M1) 860 WOP_M1 (XOR_data_M1)
750 iv = data[NUM_WAYS - 1]; 861 iv = data[NUM_WAYS - 1];
751 WOP (STORE_data) 862 WOP (STORE_data)
@@ -759,14 +870,14 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
759 AES_D_IMC_m (w[2]) 870 AES_D_IMC_m (w[2])
760 do 871 do
761 { 872 {
762 AES_D_IMC_m (w[1]); 873 AES_D_IMC_m (w[1])
763 AES_D_IMC_m (w[0]); 874 AES_D_IMC_m (w[0])
764 w -= 2; 875 w -= 2;
765 } 876 }
766 while (w != p); 877 while (w != p);
767 AES_D_m (w[1]); 878 AES_D_m (w[1])
768 MM_XOR_m (w[0]); 879 MM_XOR_m (w[0])
769 MM_XOR_m (iv); 880 MM_XOR_m (iv)
770 iv = *data; 881 iv = *data;
771 *data = m; 882 *data = m;
772 } 883 }
@@ -783,6 +894,12 @@ AES_FUNC_START2 (AesCtr_Code_HW)
783 const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; 894 const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
784 const v128 *dataEnd; 895 const v128 *dataEnd;
785 uint64x2_t one = vdupq_n_u64(0); 896 uint64x2_t one = vdupq_n_u64(0);
897
898// a bug in clang:
899// __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
900#if defined(__clang__) && (__clang_major__ <= 9)
901#pragma GCC diagnostic ignored "-Wvector-conversion"
902#endif
786 one = vsetq_lane_u64(1, one, 0); 903 one = vsetq_lane_u64(1, one, 0);
787 p += 2; 904 p += 2;
788 905
@@ -809,11 +926,11 @@ AES_FUNC_START2 (AesCtr_Code_HW)
809 { 926 {
810 const v128 *w = p; 927 const v128 *w = p;
811 v128 m; 928 v128 m;
812 CTR_START (m, 0); 929 CTR_START (m, 0)
813 do 930 do
814 { 931 {
815 AES_E_MC_m (w[0]); 932 AES_E_MC_m (w[0])
816 AES_E_MC_m (w[1]); 933 AES_E_MC_m (w[1])
817 w += 2; 934 w += 2;
818 } 935 }
819 while (w != wEnd); 936 while (w != wEnd);
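For orientation, the AesCtr_Code_HW hunks above implement standard CTR mode: for each 16-byte block the counter is advanced, encrypted, and the resulting keystream is XORed into the data. A hedged scalar sketch of that flow, where AesEncryptBlock() is a hypothetical single-block primitive and not a function of this codebase:

/* scalar CTR sketch; AesEncryptBlock() is hypothetical */
#include <stddef.h>
extern void AesEncryptBlock(const unsigned char in[16], unsigned char out[16]);

static void CtrCodeSketch(unsigned char counter[16], unsigned char *data, size_t numBlocks)
{
  for (; numBlocks != 0; numBlocks--, data += 16)
  {
    unsigned char keystream[16];
    unsigned i;
    /* increment the low 64-bit half of the counter (little-endian) */
    for (i = 0; i < 8; i++)
      if (++counter[i] != 0)
        break;
    AesEncryptBlock(counter, keystream);  /* hypothetical primitive */
    for (i = 0; i < 16; i++)
      data[i] ^= keystream[i];
  }
}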
diff --git a/C/Alloc.c b/C/Alloc.c
index d841bf2..63e1a12 100644
--- a/C/Alloc.c
+++ b/C/Alloc.c
@@ -1,5 +1,5 @@
1/* Alloc.c -- Memory allocation functions 1/* Alloc.c -- Memory allocation functions
22023-04-02 : Igor Pavlov : Public domain */ 22024-02-18 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -10,19 +10,18 @@
10 10
11#include "Alloc.h" 11#include "Alloc.h"
12 12
13#ifdef _WIN32 13#if defined(Z7_LARGE_PAGES) && defined(_WIN32) && \
14#ifdef Z7_LARGE_PAGES 14 (!defined(Z7_WIN32_WINNT_MIN) || Z7_WIN32_WINNT_MIN < 0x0502) // < Win2003 (xp-64)
15#if defined(__clang__) || defined(__GNUC__) 15 #define Z7_USE_DYN_GetLargePageMinimum
16typedef void (*Z7_voidFunction)(void); 16#endif
17#define MY_CAST_FUNC (Z7_voidFunction) 17
18#elif defined(_MSC_VER) && _MSC_VER > 1920 18// for debug:
19#define MY_CAST_FUNC (void *) 19#if 0
20// #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' 20#if defined(__CHERI__) && defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16)
21#else 21// #pragma message("=== Z7_ALLOC_NO_OFFSET_ALLOCATOR === ")
22#define MY_CAST_FUNC 22#define Z7_ALLOC_NO_OFFSET_ALLOCATOR
23#endif
23#endif 24#endif
24#endif // Z7_LARGE_PAGES
25#endif // _WIN32
26 25
27// #define SZ_ALLOC_DEBUG 26// #define SZ_ALLOC_DEBUG
28/* #define SZ_ALLOC_DEBUG */ 27/* #define SZ_ALLOC_DEBUG */
@@ -146,7 +145,9 @@ static void PrintAddr(void *p)
146#define PRINT_FREE(name, cnt, ptr) 145#define PRINT_FREE(name, cnt, ptr)
147#define Print(s) 146#define Print(s)
148#define PrintLn() 147#define PrintLn()
148#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR
149#define PrintHex(v, align) 149#define PrintHex(v, align)
150#endif
150#define PrintAddr(p) 151#define PrintAddr(p)
151 152
152#endif 153#endif
@@ -246,9 +247,9 @@ void MidFree(void *address)
246#ifdef Z7_LARGE_PAGES 247#ifdef Z7_LARGE_PAGES
247 248
248#ifdef MEM_LARGE_PAGES 249#ifdef MEM_LARGE_PAGES
249 #define MY__MEM_LARGE_PAGES MEM_LARGE_PAGES 250 #define MY_MEM_LARGE_PAGES MEM_LARGE_PAGES
250#else 251#else
251 #define MY__MEM_LARGE_PAGES 0x20000000 252 #define MY_MEM_LARGE_PAGES 0x20000000
252#endif 253#endif
253 254
254extern 255extern
@@ -258,19 +259,23 @@ typedef SIZE_T (WINAPI *Func_GetLargePageMinimum)(VOID);
258 259
259void SetLargePageSize(void) 260void SetLargePageSize(void)
260{ 261{
261 #ifdef Z7_LARGE_PAGES
262 SIZE_T size; 262 SIZE_T size;
263#ifdef Z7_USE_DYN_GetLargePageMinimum
264Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
265
263 const 266 const
264 Func_GetLargePageMinimum fn = 267 Func_GetLargePageMinimum fn =
265 (Func_GetLargePageMinimum) MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), 268 (Func_GetLargePageMinimum) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
266 "GetLargePageMinimum"); 269 "GetLargePageMinimum");
267 if (!fn) 270 if (!fn)
268 return; 271 return;
269 size = fn(); 272 size = fn();
273#else
274 size = GetLargePageMinimum();
275#endif
270 if (size == 0 || (size & (size - 1)) != 0) 276 if (size == 0 || (size & (size - 1)) != 0)
271 return; 277 return;
272 g_LargePageSize = size; 278 g_LargePageSize = size;
273 #endif
274} 279}
275 280
276#endif // Z7_LARGE_PAGES 281#endif // Z7_LARGE_PAGES
@@ -292,7 +297,7 @@ void *BigAlloc(size_t size)
292 size2 = (size + ps) & ~ps; 297 size2 = (size + ps) & ~ps;
293 if (size2 >= size) 298 if (size2 >= size)
294 { 299 {
295 void *p = VirtualAlloc(NULL, size2, MEM_COMMIT | MY__MEM_LARGE_PAGES, PAGE_READWRITE); 300 void *p = VirtualAlloc(NULL, size2, MEM_COMMIT | MY_MEM_LARGE_PAGES, PAGE_READWRITE);
296 if (p) 301 if (p)
297 { 302 {
298 PRINT_ALLOC("Alloc-BM ", g_allocCountMid, size2, p) 303 PRINT_ALLOC("Alloc-BM ", g_allocCountMid, size2, p)
@@ -328,20 +333,7 @@ const ISzAlloc g_MidAlloc = { SzMidAlloc, SzMidFree };
328const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; 333const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree };
329#endif 334#endif
330 335
331/* 336#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR
332 uintptr_t : <stdint.h> C99 (optional)
333 : unsupported in VS6
334*/
335
336#ifdef _WIN32
337 typedef UINT_PTR UIntPtr;
338#else
339 /*
340 typedef uintptr_t UIntPtr;
341 */
342 typedef ptrdiff_t UIntPtr;
343#endif
344
345 337
346#define ADJUST_ALLOC_SIZE 0 338#define ADJUST_ALLOC_SIZE 0
347/* 339/*
@@ -352,14 +344,36 @@ const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree };
352 MyAlloc() can return address that is NOT multiple of sizeof(void *). 344 MyAlloc() can return address that is NOT multiple of sizeof(void *).
353*/ 345*/
354 346
355
356/* 347/*
357#define MY_ALIGN_PTR_DOWN(p, align) ((void *)((char *)(p) - ((size_t)(UIntPtr)(p) & ((align) - 1)))) 348 uintptr_t : <stdint.h> C99 (optional)
349 : unsupported in VS6
358*/ 350*/
359#define MY_ALIGN_PTR_DOWN(p, align) ((void *)((((UIntPtr)(p)) & ~((UIntPtr)(align) - 1)))) 351typedef
352 #ifdef _WIN32
353 UINT_PTR
354 #elif 1
355 uintptr_t
356 #else
357 ptrdiff_t
358 #endif
359 MY_uintptr_t;
360
361#if 0 \
362 || (defined(__CHERI__) \
363 || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ > 8))
364// for 128-bit pointers (cheri):
365#define MY_ALIGN_PTR_DOWN(p, align) \
366 ((void *)((char *)(p) - ((size_t)(MY_uintptr_t)(p) & ((align) - 1))))
367#else
368#define MY_ALIGN_PTR_DOWN(p, align) \
369 ((void *)((((MY_uintptr_t)(p)) & ~((MY_uintptr_t)(align) - 1))))
370#endif
360 371
372#endif
361 373
362#if !defined(_WIN32) && defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L) 374#if !defined(_WIN32) \
375 && (defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) \
376 || defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L))
363 #define USE_posix_memalign 377 #define USE_posix_memalign
364#endif 378#endif
365 379
@@ -399,14 +413,13 @@ static int posix_memalign(void **ptr, size_t align, size_t size)
399 413
400#define ALLOC_ALIGN_SIZE ((size_t)1 << 7) 414#define ALLOC_ALIGN_SIZE ((size_t)1 << 7)
401 415
402static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) 416void *z7_AlignedAlloc(size_t size)
403{ 417{
404 #ifndef USE_posix_memalign 418#ifndef USE_posix_memalign
405 419
406 void *p; 420 void *p;
407 void *pAligned; 421 void *pAligned;
408 size_t newSize; 422 size_t newSize;
409 UNUSED_VAR(pp)
410 423
411 /* also we can allocate additional dummy ALLOC_ALIGN_SIZE bytes after aligned 424 /* also we can allocate additional dummy ALLOC_ALIGN_SIZE bytes after aligned
412 block to prevent cache line sharing with another allocated blocks */ 425 block to prevent cache line sharing with another allocated blocks */
@@ -431,10 +444,9 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size)
431 444
432 return pAligned; 445 return pAligned;
433 446
434 #else 447#else
435 448
436 void *p; 449 void *p;
437 UNUSED_VAR(pp)
438 if (posix_memalign(&p, ALLOC_ALIGN_SIZE, size)) 450 if (posix_memalign(&p, ALLOC_ALIGN_SIZE, size))
439 return NULL; 451 return NULL;
440 452
@@ -443,19 +455,37 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size)
443 455
444 return p; 456 return p;
445 457
446 #endif 458#endif
459}
460
461
462void z7_AlignedFree(void *address)
463{
464#ifndef USE_posix_memalign
465 if (address)
466 MyFree(((void **)address)[-1]);
467#else
468 free(address);
469#endif
470}
471
472
473static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size)
474{
475 UNUSED_VAR(pp)
476 return z7_AlignedAlloc(size);
447} 477}
448 478
449 479
450static void SzAlignedFree(ISzAllocPtr pp, void *address) 480static void SzAlignedFree(ISzAllocPtr pp, void *address)
451{ 481{
452 UNUSED_VAR(pp) 482 UNUSED_VAR(pp)
453 #ifndef USE_posix_memalign 483#ifndef USE_posix_memalign
454 if (address) 484 if (address)
455 MyFree(((void **)address)[-1]); 485 MyFree(((void **)address)[-1]);
456 #else 486#else
457 free(address); 487 free(address);
458 #endif 488#endif
459} 489}
460 490
461 491
@@ -463,16 +493,44 @@ const ISzAlloc g_AlignedAlloc = { SzAlignedAlloc, SzAlignedFree };
463 493
464 494
465 495
466#define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *))
467
468/* we align ptr to support cases where CAlignOffsetAlloc::offset is not a multiple of sizeof(void *) */ 496/* we align ptr to support cases where CAlignOffsetAlloc::offset is not a multiple of sizeof(void *) */
469#define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1] 497#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR
470/* 498#if 1
471#define REAL_BLOCK_PTR_VAR(p) ((void **)(p))[-1] 499 #define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *))
472*/ 500 #define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1]
501#else
502 // we can use this simplified code,
503 // if (CAlignOffsetAlloc::offset == k * sizeof(void *))
504 #define REAL_BLOCK_PTR_VAR(p) (((void **)(p))[-1])
505#endif
506#endif
507
508
509#if 0
510#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR
511#include <stdio.h>
512static void PrintPtr(const char *s, const void *p)
513{
514 const Byte *p2 = (const Byte *)&p;
515 unsigned i;
516 printf("%s %p ", s, p);
517 for (i = sizeof(p); i != 0;)
518 {
519 i--;
520 printf("%02x", p2[i]);
521 }
522 printf("\n");
523}
524#endif
525#endif
526
473 527
474static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) 528static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size)
475{ 529{
530#if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR)
531 UNUSED_VAR(pp)
532 return z7_AlignedAlloc(size);
533#else
476 const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); 534 const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt);
477 void *adr; 535 void *adr;
478 void *pAligned; 536 void *pAligned;
@@ -501,6 +559,12 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size)
501 pAligned = (char *)MY_ALIGN_PTR_DOWN((char *)adr + 559 pAligned = (char *)MY_ALIGN_PTR_DOWN((char *)adr +
502 alignSize - p->offset + extra + ADJUST_ALLOC_SIZE, alignSize) + p->offset; 560 alignSize - p->offset + extra + ADJUST_ALLOC_SIZE, alignSize) + p->offset;
503 561
562#if 0
563 printf("\nalignSize = %6x, offset=%6x, size=%8x \n", (unsigned)alignSize, (unsigned)p->offset, (unsigned)size);
564 PrintPtr("base", adr);
565 PrintPtr("alig", pAligned);
566#endif
567
504 PrintLn(); 568 PrintLn();
505 Print("- Aligned: "); 569 Print("- Aligned: ");
506 Print(" size="); PrintHex(size, 8); 570 Print(" size="); PrintHex(size, 8);
@@ -512,11 +576,16 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size)
512 REAL_BLOCK_PTR_VAR(pAligned) = adr; 576 REAL_BLOCK_PTR_VAR(pAligned) = adr;
513 577
514 return pAligned; 578 return pAligned;
579#endif
515} 580}
516 581
517 582
518static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address) 583static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address)
519{ 584{
585#if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR)
586 UNUSED_VAR(pp)
587 z7_AlignedFree(address);
588#else
520 if (address) 589 if (address)
521 { 590 {
522 const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); 591 const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt);
@@ -525,6 +594,7 @@ static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address)
525 PrintLn(); 594 PrintLn();
526 ISzAlloc_Free(p->baseAlloc, REAL_BLOCK_PTR_VAR(address)); 595 ISzAlloc_Free(p->baseAlloc, REAL_BLOCK_PTR_VAR(address));
527 } 596 }
597#endif
528} 598}
529 599
530 600
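When posix_memalign is not available, z7_AlignedAlloc() above over-allocates and stores the real base pointer just below the aligned block, so that z7_AlignedFree() can recover it. A condensed sketch of that scheme (simplified: the ADJUST_ALLOC_SIZE and trailing cache-line padding details are omitted):

#include <stdint.h>
#include <stdlib.h>

#define ALIGN 128  /* ALLOC_ALIGN_SIZE in the code above */

static void *AlignedAllocSketch(size_t size)
{
  /* room for the payload, the alignment slack, and one saved pointer */
  char *base = (char *)malloc(size + ALIGN + sizeof(void *));
  char *aligned;
  if (!base)
    return NULL;
  aligned = base + sizeof(void *);
  aligned += ALIGN - ((size_t)(uintptr_t)aligned & (ALIGN - 1));
  ((void **)(void *)aligned)[-1] = base;  /* remember the real block */
  return aligned;
}

static void AlignedFreeSketch(void *p)
{
  if (p)
    free(((void **)p)[-1]);
}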
diff --git a/C/Alloc.h b/C/Alloc.h
index fac5b62..01bf6b7 100644
--- a/C/Alloc.h
+++ b/C/Alloc.h
@@ -1,5 +1,5 @@
1/* Alloc.h -- Memory allocation functions 1/* Alloc.h -- Memory allocation functions
22023-03-04 : Igor Pavlov : Public domain */ 22024-01-22 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_ALLOC_H 4#ifndef ZIP7_INC_ALLOC_H
5#define ZIP7_INC_ALLOC_H 5#define ZIP7_INC_ALLOC_H
@@ -22,6 +22,9 @@ void *MyAlloc(size_t size);
22void MyFree(void *address); 22void MyFree(void *address);
23void *MyRealloc(void *address, size_t size); 23void *MyRealloc(void *address, size_t size);
24 24
25void *z7_AlignedAlloc(size_t size);
26void z7_AlignedFree(void *p);
27
25#ifdef _WIN32 28#ifdef _WIN32
26 29
27#ifdef Z7_LARGE_PAGES 30#ifdef Z7_LARGE_PAGES
@@ -33,12 +36,14 @@ void MidFree(void *address);
33void *BigAlloc(size_t size); 36void *BigAlloc(size_t size);
34void BigFree(void *address); 37void BigFree(void *address);
35 38
39/* #define Z7_BIG_ALLOC_IS_ZERO_FILLED */
40
36#else 41#else
37 42
38#define MidAlloc(size) MyAlloc(size) 43#define MidAlloc(size) z7_AlignedAlloc(size)
39#define MidFree(address) MyFree(address) 44#define MidFree(address) z7_AlignedFree(address)
40#define BigAlloc(size) MyAlloc(size) 45#define BigAlloc(size) z7_AlignedAlloc(size)
41#define BigFree(address) MyFree(address) 46#define BigFree(address) z7_AlignedFree(address)
42 47
43#endif 48#endif
44 49
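With this header change, MidAlloc()/BigAlloc() on non-Windows targets return blocks aligned to ALLOC_ALIGN_SIZE instead of plain MyAlloc() memory. Callers normally go through the ISzAlloc vtables rather than the raw functions; a short usage sketch, assuming the ISzAlloc_Alloc/ISzAlloc_Free helper macros from 7zTypes.h:

#include "Alloc.h"

static int UseAlignedAllocatorSketch(void)
{
  const ISzAlloc *alloc = &g_AlignedAlloc;
  void *buf = ISzAlloc_Alloc(alloc, (size_t)1 << 20);
  if (!buf)
    return 1;
  /* buf is aligned to ALLOC_ALIGN_SIZE ((size_t)1 << 7 = 128 bytes) */
  ISzAlloc_Free(alloc, buf);
  return 0;
}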
diff --git a/C/Asm_c.mak b/C/Asm_c.mak
new file mode 100644
index 0000000..9431816
--- /dev/null
+++ b/C/Asm_c.mak
@@ -0,0 +1,12 @@
1!IFDEF ASM_OBJS
2!IF "$(PLATFORM)" == "arm64"
3$(ASM_OBJS): ../../../Asm/arm64/$(*B).S
4 $(COMPL_ASM_CLANG)
5!ELSEIF "$(PLATFORM)" == "arm"
6$(ASM_OBJS): ../../../Asm/arm/$(*B).asm
7 $(COMPL_ASM)
8!ELSEIF "$(PLATFORM)" != "ia64" && "$(PLATFORM)" != "mips"
9$(ASM_OBJS): ../../../Asm/x86/$(*B).asm
10 $(COMPL_ASM)
11!ENDIF
12!ENDIF
diff --git a/C/Blake2.h b/C/Blake2.h
index 7235235..801ea7a 100644
--- a/C/Blake2.h
+++ b/C/Blake2.h
@@ -1,47 +1,104 @@
1/* Blake2.h -- BLAKE2 Hash 1/* Blake2.h -- BLAKE2sp Hash
22023-03-04 : Igor Pavlov : Public domain 22024-01-17 : Igor Pavlov : Public domain */
32015 : Samuel Neves : Public domain */
4 3
5#ifndef ZIP7_INC_BLAKE2_H 4#ifndef ZIP7_INC_BLAKE2_H
6#define ZIP7_INC_BLAKE2_H 5#define ZIP7_INC_BLAKE2_H
7 6
8#include "7zTypes.h" 7#include "7zTypes.h"
9 8
10EXTERN_C_BEGIN 9#if 0
10#include "Compiler.h"
11#include "CpuArch.h"
12#if defined(MY_CPU_X86_OR_AMD64)
13#if defined(__SSE2__) \
14 || defined(_MSC_VER) && _MSC_VER > 1200 \
15 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
16 || defined(__clang__) \
17 || defined(__INTEL_COMPILER)
18#include <emmintrin.h> // SSE2
19#endif
11 20
12#define BLAKE2S_BLOCK_SIZE 64 21#if defined(__AVX2__) \
13#define BLAKE2S_DIGEST_SIZE 32 22 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
14#define BLAKE2SP_PARALLEL_DEGREE 8 23 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
24 || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \
25 || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
26 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
27#include <immintrin.h>
28#if defined(__clang__)
29#include <avxintrin.h>
30#include <avx2intrin.h>
31#endif
32#endif // avx2
33#endif // MY_CPU_X86_OR_AMD64
34#endif // 0
15 35
16typedef struct 36EXTERN_C_BEGIN
17{
18 UInt32 h[8];
19 UInt32 t[2];
20 UInt32 f[2];
21 Byte buf[BLAKE2S_BLOCK_SIZE];
22 UInt32 bufPos;
23 UInt32 lastNode_f1;
24 UInt32 dummy[2]; /* for sizeof(CBlake2s) alignment */
25} CBlake2s;
26
27/* You need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0() */
28/*
29void Blake2s_Init0(CBlake2s *p);
30void Blake2s_Update(CBlake2s *p, const Byte *data, size_t size);
31void Blake2s_Final(CBlake2s *p, Byte *digest);
32*/
33 37
38#define Z7_BLAKE2S_BLOCK_SIZE 64
39#define Z7_BLAKE2S_DIGEST_SIZE 32
40#define Z7_BLAKE2SP_PARALLEL_DEGREE 8
41#define Z7_BLAKE2SP_NUM_STRUCT_WORDS 16
34 42
43#if 1 || defined(Z7_BLAKE2SP_USE_FUNCTIONS)
44typedef void (Z7_FASTCALL *Z7_BLAKE2SP_FUNC_COMPRESS)(UInt32 *states, const Byte *data, const Byte *end);
45typedef void (Z7_FASTCALL *Z7_BLAKE2SP_FUNC_INIT)(UInt32 *states);
46#endif
47
48// it's required that CBlake2sp is 32-byte aligned,
49// because the code can use unaligned access with sse and avx256.
50// but 64-byte alignment can be better.
51MY_ALIGN(64)
35typedef struct 52typedef struct
36{ 53{
37 CBlake2s S[BLAKE2SP_PARALLEL_DEGREE]; 54 union
38 unsigned bufPos; 55 {
39} CBlake2sp; 56#if 0
57#if defined(MY_CPU_X86_OR_AMD64)
58#if defined(__SSE2__) \
59 || defined(_MSC_VER) && _MSC_VER > 1200 \
60 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
61 || defined(__clang__) \
62 || defined(__INTEL_COMPILER)
63 __m128i _pad_align_128bit[4];
64#endif // sse2
65#if defined(__AVX2__) \
66 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
67 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
68 || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \
69 || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
70 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
71 __m256i _pad_align_256bit[2];
72#endif // avx2
73#endif // x86
74#endif // 0
40 75
76 void * _pad_align_ptr[8];
77 UInt32 _pad_align_32bit[16];
78 struct
79 {
80 unsigned cycPos;
81 unsigned _pad_unused;
82#if 1 || defined(Z7_BLAKE2SP_USE_FUNCTIONS)
83 Z7_BLAKE2SP_FUNC_COMPRESS func_Compress_Fast;
84 Z7_BLAKE2SP_FUNC_COMPRESS func_Compress_Single;
85 Z7_BLAKE2SP_FUNC_INIT func_Init;
86 Z7_BLAKE2SP_FUNC_INIT func_Final;
87#endif
88 } header;
89 } u;
90 // MY_ALIGN(64)
91 UInt32 states[Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2SP_NUM_STRUCT_WORDS];
92 // MY_ALIGN(64)
93 UInt32 buf32[Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2SP_NUM_STRUCT_WORDS * 2];
94} CBlake2sp;
41 95
96BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo);
42void Blake2sp_Init(CBlake2sp *p); 97void Blake2sp_Init(CBlake2sp *p);
98void Blake2sp_InitState(CBlake2sp *p);
43void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size); 99void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size);
44void Blake2sp_Final(CBlake2sp *p, Byte *digest); 100void Blake2sp_Final(CBlake2sp *p, Byte *digest);
101void z7_Black2sp_Prepare(void);
45 102
46EXTERN_C_END 103EXTERN_C_END
47 104
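The reworked header routes BLAKE2sp through function pointers stored inside CBlake2sp, so a vector compress routine can be selected once per CPU instead of being branched on per block. A hedged sketch of the intended call pattern; algo 0 stands for the default selection, and the ordering of Blake2sp_SetFunction() before Blake2sp_Init() is an assumption here, not a documented contract:

#include "Blake2.h"

static void Blake2spHashSketch(const Byte *data, size_t size, Byte digest[Z7_BLAKE2S_DIGEST_SIZE])
{
  MY_ALIGN(64) CBlake2sp p;          /* keep the 32/64-byte alignment */
  if (!Blake2sp_SetFunction(&p, 0))  /* 0 = default algo */
    return;                          /* requested algo unsupported */
  Blake2sp_Init(&p);
  Blake2sp_Update(&p, data, size);
  Blake2sp_Final(&p, digest);
}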
diff --git a/C/Blake2s.c b/C/Blake2s.c
index 2a84b57..459e76b 100644
--- a/C/Blake2s.c
+++ b/C/Blake2s.c
@@ -1,250 +1,2645 @@
1/* Blake2s.c -- BLAKE2s and BLAKE2sp Hash 1/* Blake2s.c -- BLAKE2sp Hash
22023-03-04 : Igor Pavlov : Public domain 22024-01-29 : Igor Pavlov : Public domain
32015 : Samuel Neves : Public domain */ 32015-2019 : Samuel Neves : original code : CC0 1.0 Universal (CC0 1.0). */
4 4
5#include "Precomp.h" 5#include "Precomp.h"
6 6
7// #include <stdio.h>
7#include <string.h> 8#include <string.h>
8 9
9#include "Blake2.h" 10#include "Blake2.h"
10#include "CpuArch.h"
11#include "RotateDefs.h" 11#include "RotateDefs.h"
12#include "Compiler.h"
13#include "CpuArch.h"
14
15#if defined(__SSE2__)
16 #define Z7_BLAKE2S_USE_VECTORS
17#elif defined(MY_CPU_X86_OR_AMD64)
18 #if defined(_MSC_VER) && _MSC_VER > 1200 \
19 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
20 || defined(__clang__) \
21 || defined(__INTEL_COMPILER)
22 #define Z7_BLAKE2S_USE_VECTORS
23 #endif
24#endif
25
26#ifdef Z7_BLAKE2S_USE_VECTORS
27
28#define Z7_BLAKE2SP_USE_FUNCTIONS
29
30// define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED, if CBlake2sp may not be 32-byte aligned.
31// #define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
32
33// SSSE3 : for _mm_shuffle_epi8 (pshufb) that improves the performance by 5-15%.
34#if defined(__SSSE3__)
35 #define Z7_BLAKE2S_USE_SSSE3
36#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
37 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
38 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
39 || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
40 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
41 #define Z7_BLAKE2S_USE_SSSE3
42#endif
43
44#ifdef Z7_BLAKE2S_USE_SSSE3
45/* SSE41 : for _mm_insert_epi32 (pinsrd)
46 it can slightly reduce code size and improve the performance in some cases.
47 it's used only for the last 512-1024 bytes, if FAST versions (2 or 3) of vector algos are used.
48 it can be used for all blocks in other algos (4+).
49*/
50#if defined(__SSE4_1__)
51 #define Z7_BLAKE2S_USE_SSE41
52#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
53 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
54 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
55 || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
56 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
57 #define Z7_BLAKE2S_USE_SSE41
58#endif
59#endif // SSSE3
60
61#if defined(__GNUC__) || defined(__clang__)
62 #if defined(Z7_BLAKE2S_USE_SSE41)
63 #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse4.1")))
64 #elif defined(Z7_BLAKE2S_USE_SSSE3)
65 #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("ssse3")))
66 #else
67 #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse2")))
68 #endif
69#endif
70
71
72#if defined(__AVX2__)
73 #define Z7_BLAKE2S_USE_AVX2
74#else
75 #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
76 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
77 || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100)
78 #define Z7_BLAKE2S_USE_AVX2
79 #ifdef Z7_BLAKE2S_USE_AVX2
80 #define BLAKE2S_ATTRIB_AVX2 __attribute__((__target__("avx2")))
81 #endif
82 #elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
83 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
84 #if (Z7_MSC_VER_ORIGINAL == 1900)
85 #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
86 #endif
87 #define Z7_BLAKE2S_USE_AVX2
88 #endif
89#endif
90
91#ifdef Z7_BLAKE2S_USE_SSE41
92#include <smmintrin.h> // SSE4.1
93#elif defined(Z7_BLAKE2S_USE_SSSE3)
94#include <tmmintrin.h> // SSSE3
95#else
96#include <emmintrin.h> // SSE2
97#endif
98
99#ifdef Z7_BLAKE2S_USE_AVX2
100#include <immintrin.h>
101#if defined(__clang__)
102#include <avxintrin.h>
103#include <avx2intrin.h>
104#endif
105#endif // avx2
106
107
108#if defined(__AVX512F__) && defined(__AVX512VL__)
109 // && defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL > 1930)
110 #define Z7_BLAKE2S_USE_AVX512_ALWAYS
111 // #pragma message ("=== Blake2s AVX512")
112#endif
12 113
13#define rotr32 rotrFixed
14 114
15#define BLAKE2S_NUM_ROUNDS 10 115#define Z7_BLAKE2S_USE_V128_FAST
16#define BLAKE2S_FINAL_FLAG (~(UInt32)0) 116// for speed optimization for small messages:
117// #define Z7_BLAKE2S_USE_V128_WAY2
17 118
119#ifdef Z7_BLAKE2S_USE_AVX2
120
121// for debug:
122// gather is slow
123// #define Z7_BLAKE2S_USE_GATHER
124
125 #define Z7_BLAKE2S_USE_AVX2_FAST
126// for speed optimization for small messages:
127// #define Z7_BLAKE2S_USE_AVX2_WAY2
128// #define Z7_BLAKE2S_USE_AVX2_WAY4
129#if defined(Z7_BLAKE2S_USE_AVX2_WAY2) || \
130 defined(Z7_BLAKE2S_USE_AVX2_WAY4)
131 #define Z7_BLAKE2S_USE_AVX2_WAY_SLOW
132#endif
133#endif
134
135 #define Z7_BLAKE2SP_ALGO_DEFAULT 0
136 #define Z7_BLAKE2SP_ALGO_SCALAR 1
137#ifdef Z7_BLAKE2S_USE_V128_FAST
138 #define Z7_BLAKE2SP_ALGO_V128_FAST 2
139#endif
140#ifdef Z7_BLAKE2S_USE_AVX2_FAST
141 #define Z7_BLAKE2SP_ALGO_V256_FAST 3
142#endif
143 #define Z7_BLAKE2SP_ALGO_V128_WAY1 4
144#ifdef Z7_BLAKE2S_USE_V128_WAY2
145 #define Z7_BLAKE2SP_ALGO_V128_WAY2 5
146#endif
147#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
148 #define Z7_BLAKE2SP_ALGO_V256_WAY2 6
149#endif
150#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
151 #define Z7_BLAKE2SP_ALGO_V256_WAY4 7
152#endif
153
154#endif // Z7_BLAKE2S_USE_VECTORS
155
156
157
158
159#define BLAKE2S_FINAL_FLAG (~(UInt32)0)
160#define NSW Z7_BLAKE2SP_NUM_STRUCT_WORDS
161#define SUPER_BLOCK_SIZE (Z7_BLAKE2S_BLOCK_SIZE * Z7_BLAKE2SP_PARALLEL_DEGREE)
162#define SUPER_BLOCK_MASK (SUPER_BLOCK_SIZE - 1)
163
164#define V_INDEX_0_0 0
165#define V_INDEX_1_0 1
166#define V_INDEX_2_0 2
167#define V_INDEX_3_0 3
168#define V_INDEX_0_1 4
169#define V_INDEX_1_1 5
170#define V_INDEX_2_1 6
171#define V_INDEX_3_1 7
172#define V_INDEX_0_2 8
173#define V_INDEX_1_2 9
174#define V_INDEX_2_2 10
175#define V_INDEX_3_2 11
176#define V_INDEX_0_3 12
177#define V_INDEX_1_3 13
178#define V_INDEX_2_3 14
179#define V_INDEX_3_3 15
180#define V_INDEX_4_0 0
181#define V_INDEX_5_0 1
182#define V_INDEX_6_0 2
183#define V_INDEX_7_0 3
184#define V_INDEX_7_1 4
185#define V_INDEX_4_1 5
186#define V_INDEX_5_1 6
187#define V_INDEX_6_1 7
188#define V_INDEX_6_2 8
189#define V_INDEX_7_2 9
190#define V_INDEX_4_2 10
191#define V_INDEX_5_2 11
192#define V_INDEX_5_3 12
193#define V_INDEX_6_3 13
194#define V_INDEX_7_3 14
195#define V_INDEX_4_3 15
196
197#define V(row, col) v[V_INDEX_ ## row ## _ ## col]
198
199#define k_Blake2s_IV_0 0x6A09E667UL
200#define k_Blake2s_IV_1 0xBB67AE85UL
201#define k_Blake2s_IV_2 0x3C6EF372UL
202#define k_Blake2s_IV_3 0xA54FF53AUL
203#define k_Blake2s_IV_4 0x510E527FUL
204#define k_Blake2s_IV_5 0x9B05688CUL
205#define k_Blake2s_IV_6 0x1F83D9ABUL
206#define k_Blake2s_IV_7 0x5BE0CD19UL
207
208#define KIV(n) (k_Blake2s_IV_## n)
209
210#ifdef Z7_BLAKE2S_USE_VECTORS
211MY_ALIGN(16)
18static const UInt32 k_Blake2s_IV[8] = 212static const UInt32 k_Blake2s_IV[8] =
19{ 213{
20 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, 214 KIV(0), KIV(1), KIV(2), KIV(3), KIV(4), KIV(5), KIV(6), KIV(7)
21 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
22}; 215};
216#endif
23 217
24static const Byte k_Blake2s_Sigma[BLAKE2S_NUM_ROUNDS][16] = 218#define STATE_T(s) ((s) + 8)
25{ 219#define STATE_F(s) ((s) + 10)
26 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , 220
27 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , 221#ifdef Z7_BLAKE2S_USE_VECTORS
28 { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
29 { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
30 { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
31 { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
32 { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
33 { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
34 { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
35 { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
36};
37 222
223#define LOAD_128(p) _mm_load_si128 ((const __m128i *)(const void *)(p))
224#define LOADU_128(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
225#ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
226 // here we use unaligned loads and stores
227 // use this branch if CBlake2sp may not be 16-byte aligned
228 #define STOREU_128(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r)
229 #define LOAD_128_FROM_STRUCT(p) LOADU_128(p)
230 #define STORE_128_TO_STRUCT(p, r) STOREU_128(p, r)
231#else
232 // here we use aligned loads and stores
233 // use this branch if CBlake2sp is 16-byte aligned
234 #define STORE_128(p, r) _mm_store_si128((__m128i *)(void *)(p), r)
235 #define LOAD_128_FROM_STRUCT(p) LOAD_128(p)
236 #define STORE_128_TO_STRUCT(p, r) STORE_128(p, r)
237#endif
38 238
39static void Blake2s_Init0(CBlake2s *p) 239#endif // Z7_BLAKE2S_USE_VECTORS
240
241
242#if 0
243static void PrintState(const UInt32 *s, unsigned num)
244{
245 unsigned i;
246 printf("\n");
247 for (i = 0; i < num; i++)
248 printf(" %08x", (unsigned)s[i]);
249}
250static void PrintStates2(const UInt32 *s, unsigned x, unsigned y)
40{ 251{
41 unsigned i; 252 unsigned i;
42 for (i = 0; i < 8; i++) 253 for (i = 0; i < y; i++)
43 p->h[i] = k_Blake2s_IV[i]; 254 PrintState(s + i * x, x);
44 p->t[0] = 0; 255 printf("\n");
45 p->t[1] = 0;
46 p->f[0] = 0;
47 p->f[1] = 0;
48 p->bufPos = 0;
49 p->lastNode_f1 = 0;
50} 256}
257#endif
258
259
260#define REP8_MACRO(m) { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) }
261
262#define BLAKE2S_NUM_ROUNDS 10
263
264#if defined(Z7_BLAKE2S_USE_VECTORS)
265#define ROUNDS_LOOP(mac) \
266 { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) mac(r) }
267#endif
268/*
269#define ROUNDS_LOOP_2(mac) \
270 { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r += 2) { mac(r) mac(r + 1) } }
271*/
272#if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
273#define ROUNDS_LOOP_UNROLLED(m) \
274 { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) }
275#endif
276
277#define SIGMA_TABLE(M) \
278 M( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ), \
279 M( 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 ), \
280 M( 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 ), \
281 M( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 ), \
282 M( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 ), \
283 M( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 ), \
284 M( 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 ), \
285 M( 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 ), \
286 M( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 ), \
287 M( 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 )
288
289#define SIGMA_TABLE_MULT(m, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
290 { a0*m,a1*m,a2*m,a3*m,a4*m,a5*m,a6*m,a7*m,a8*m,a9*m,a10*m,a11*m,a12*m,a13*m,a14*m,a15*m }
291#define SIGMA_TABLE_MULT_4( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
292 SIGMA_TABLE_MULT(4, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
293
294// MY_ALIGN(32)
295MY_ALIGN(16)
296static const Byte k_Blake2s_Sigma_4[BLAKE2S_NUM_ROUNDS][16] =
297 { SIGMA_TABLE(SIGMA_TABLE_MULT_4) };
298
299#define GET_SIGMA_PTR(p, index) \
300 ((const void *)((const Byte *)(const void *)(p) + (index)))
51 301
302#define GET_STATE_TABLE_PTR_FROM_BYTE_POS(s, pos) \
303 ((UInt32 *)(void *)((Byte *)(void *)(s) + (pos)))
52 304
53static void Blake2s_Compress(CBlake2s *p) 305
306#ifdef Z7_BLAKE2S_USE_VECTORS
307
308
309#if 0
310 // loading constants from memory
311 // is faster for some compilers.
312 #define KK4(n) KIV(n), KIV(n), KIV(n), KIV(n)
313MY_ALIGN(64)
314static const UInt32 k_Blake2s_IV_WAY4[]=
54{ 315{
55 UInt32 m[16]; 316 KK4(0), KK4(1), KK4(2), KK4(3), KK4(4), KK4(5), KK4(6), KK4(7)
56 UInt32 v[16]; 317};
57 318 #define GET_128_IV_WAY4(i) LOAD_128(k_Blake2s_IV_WAY4 + 4 * (i))
319#else
320 // use constant generation:
321 #define GET_128_IV_WAY4(i) _mm_set1_epi32((Int32)KIV(i))
322#endif
323
324
325#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
326#define GET_CONST_128_FROM_ARRAY32(k) \
327 _mm_set_epi32((Int32)(k)[3], (Int32)(k)[2], (Int32)(k)[1], (Int32)(k)[0])
328#endif
329
330
331#if 0
332#define k_r8 _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
333#define k_r16 _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
334#define k_inc _mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)
335#define k_iv0_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0)
336#define k_iv4_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4)
337#else
338#if defined(Z7_BLAKE2S_USE_SSSE3) && \
339 !defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
340MY_ALIGN(16) static const Byte k_r8_arr [16] = { 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12 };
341MY_ALIGN(16) static const Byte k_r16_arr[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 };
342#define k_r8 LOAD_128(k_r8_arr)
343#define k_r16 LOAD_128(k_r16_arr)
344#endif
345MY_ALIGN(16) static const UInt32 k_inc_arr[4] = { Z7_BLAKE2S_BLOCK_SIZE, 0, 0, 0 };
346#define k_inc LOAD_128(k_inc_arr)
347#define k_iv0_128 LOAD_128(k_Blake2s_IV + 0)
348#define k_iv4_128 LOAD_128(k_Blake2s_IV + 4)
349#endif
350
351
352#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
353
354#ifdef Z7_BLAKE2S_USE_AVX2
355#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
356 #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
357#else
358 #define MY_mm256_set_m128i _mm256_set_m128i
359#endif
360
361#define SET_FROM_128(a) MY_mm256_set_m128i(a, a)
362
363#ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS
364MY_ALIGN(32) static const Byte k_r8_arr_256 [32] =
365{
366 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12,
367 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12
368};
369MY_ALIGN(32) static const Byte k_r16_arr_256[32] =
370{
371 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13,
372 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
373};
374#define k_r8_256 LOAD_256(k_r8_arr_256)
375#define k_r16_256 LOAD_256(k_r16_arr_256)
376#endif
377
378// #define k_r8_256 SET_FROM_128(_mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1))
379// #define k_r16_256 SET_FROM_128(_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2))
380// #define k_inc_256 SET_FROM_128(_mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE))
381// #define k_iv0_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0))
382#define k_iv4_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4))
383#endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
384#endif
385
386
387/*
388IPC(TP) ports:
3891 p__5 : skl- : SSE : shufps : _mm_shuffle_ps
3902 p_15 : icl+
3911 p__5 : nhm-bdw : SSE : xorps : _mm_xor_ps
3923 p015 : skl+
393
3943 p015 : SSE2 : pxor : _mm_xor_si128
3952 p_15: snb-bdw : SSE2 : padd : _mm_add_epi32
3962 p0_5: mrm-wsm :
3973 p015 : skl+
398
3992 p_15 : ivb-,icl+ : SSE2 : punpcklqdq, punpckhqdq, punpckldq, punpckhdq
4002 p_15 : : SSE2 : pshufd : _mm_shuffle_epi32
4012 p_15 : : SSE2 : pshuflw : _mm_shufflelo_epi16
4022 p_15 : : SSE2 : psrldq :
4032 p_15 : : SSE3 : pshufb : _mm_shuffle_epi8
4042 p_15 : : SSE4 : pblendw : _mm_blend_epi16
4051 p__5 : hsw-skl : *
406
4071 p0 : SSE2 : pslld (i8) : _mm_slli_si128
4082 p01 : skl+ :
409
4102 p_15 : ivb- : SSE3 : palignr
4111 p__5 : hsw+
412
4132 p_15 + p23 : ivb-, icl+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, m32, i8)
4141 p__5 + p23 : hsw-skl
4151 p_15 + p5 : ivb-, ice+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, r32, i8)
4160.5 2*p5 : hsw-skl
417
4182 p23 : SSE2 : movd (m32)
4193 p23A : adl :
4201 p5: : SSE2 : movd (r32)
421*/
422
423#if 0 && defined(__XOP__)
424// we must debug and test __XOP__ instruction
425#include <x86intrin.h>
426#include <ammintrin.h>
427 #define LOAD_ROTATE_CONSTS
428 #define MM_ROR_EPI32(r, c) _mm_roti_epi32(r, -(c))
429 #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
430#elif 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
431 #define LOAD_ROTATE_CONSTS
432 #define MM_ROR_EPI32(r, c) _mm_ror_epi32(r, c)
433 #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
434#else
435
436// MSVC_1937+ uses the "orps" instruction for _mm_or_si128().
437// But "orps" has low throughput: TP=1 for bdw-nhm.
438// So it can be better to use _mm_add_epi32()/"paddd" (TP=2 for bdw-nhm) instead of "orps".
439// But "orps" is fast for modern cpus (skl+).
440// So we default to the "or" version:
441#if 0 || 0 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1937
442 // minor optimization for some old cpus, if "orps" is slow.
443 #define MM128_EPI32_OR_or_ADD _mm_add_epi32
444#else
445 #define MM128_EPI32_OR_or_ADD _mm_or_si128
446#endif
447
448 #define MM_ROR_EPI32_VIA_SHIFT(r, c)( \
449 MM128_EPI32_OR_or_ADD( \
450 _mm_srli_epi32((r), (c)), \
451 _mm_slli_epi32((r), 32-(c))))
452 #if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
453 #define LOAD_ROTATE_CONSTS \
454 const __m128i r8 = k_r8; \
455 const __m128i r16 = k_r16;
456 #define MM_ROR_EPI32(r, c) ( \
457 ( 8==(c)) ? _mm_shuffle_epi8(r,r8) \
458 : (16==(c)) ? _mm_shuffle_epi8(r,r16) \
459 : MM_ROR_EPI32_VIA_SHIFT(r, c))
460 #else
461 #define LOAD_ROTATE_CONSTS
462 #define MM_ROR_EPI32(r, c) ( \
463 (16==(c)) ? _mm_shufflehi_epi16(_mm_shufflelo_epi16(r, 0xb1), 0xb1) \
464 : MM_ROR_EPI32_VIA_SHIFT(r, c))
465 #endif
466#endif
467
468/*
469we have 3 main ways to load 4 32-bit integers to __m128i:
470 1) SSE2: _mm_set_epi32()
471 2) SSE2: _mm_unpacklo_epi64() / _mm_unpacklo_epi32 / _mm_cvtsi32_si128()
472 3) SSE41: _mm_insert_epi32() and _mm_cvtsi32_si128()
473a good compiler for _mm_set_epi32() generates these instructions:
474{
475 movd xmm, [m32]; vpunpckldq; vpunpckldq; vpunpcklqdq;
476}
477a good new compiler generates one instruction
478{
479 for _mm_insert_epi32() : { pinsrd xmm, [m32], i }
480 for _mm_cvtsi32_si128() : { movd xmm, [m32] }
481}
482but vc2010 generates a slow pair of instructions:
483{
484 for _mm_insert_epi32() : { mov r32, [m32]; pinsrd xmm, r32, i }
485 for _mm_cvtsi32_si128() : { mov r32, [m32]; movd xmm, r32 }
486}
487_mm_insert_epi32() (pinsrd) code reduces xmm register pressure
488in comparison with _mm_set_epi32() (movd + vpunpckld) code.
489Note that the variant with "movd xmm, r32" can be slower,
490but register pressure can be more important.
491So we can force "pinsrd" always.
492*/
493// #if !defined(Z7_MSC_VER_ORIGINAL) || Z7_MSC_VER_ORIGINAL > 1600 || defined(MY_CPU_X86)
494#ifdef Z7_BLAKE2S_USE_SSE41
495 /* _mm_set_epi32() can be more effective for GCC and CLANG
496 _mm_insert_epi32() is more effective for MSVC */
497 #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
498 #define Z7_BLAKE2S_USE_INSERT_INSTRUCTION
499 #endif
500#endif // USE_SSE41
501// #endif
502
503#ifdef Z7_BLAKE2S_USE_INSERT_INSTRUCTION
504 // for SSE4.1
505#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
506 _mm_insert_epi32( \
507 _mm_insert_epi32( \
508 _mm_insert_epi32( \
509 _mm_cvtsi32_si128( \
510 *(const Int32 *)p0), \
511 *(const Int32 *)p1, 1), \
512 *(const Int32 *)p2, 2), \
513 *(const Int32 *)p3, 3)
514#elif 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
515/* MSVC 1400 implements _mm_set_epi32() via slow memory write/read.
516 Also _mm_unpacklo_epi32 is more effective for other MSVC compilers.
517 But _mm_set_epi32() is more effective for GCC and CLANG.
518 So we use _mm_unpacklo_epi32 for MSVC only */
519#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
520 _mm_unpacklo_epi64( \
521 _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p0), \
522 _mm_cvtsi32_si128(*(const Int32 *)p1)), \
523 _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p2), \
524 _mm_cvtsi32_si128(*(const Int32 *)p3)))
525#else
526#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
527 _mm_set_epi32( \
528 *(const Int32 *)p3, \
529 *(const Int32 *)p2, \
530 *(const Int32 *)p1, \
531 *(const Int32 *)p0)
532#endif
533
534#define SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3) \
535 MM_LOAD_EPI32_FROM_4_POINTERS( \
536 GET_SIGMA_PTR(input, i0), \
537 GET_SIGMA_PTR(input, i1), \
538 GET_SIGMA_PTR(input, i2), \
539 GET_SIGMA_PTR(input, i3))
540
541#define SET_ROW_FROM_SIGMA(input, sigma_index) \
542 SET_ROW_FROM_SIGMA_BASE(input, \
543 sigma[(sigma_index) ], \
544 sigma[(sigma_index) + 2 * 1], \
545 sigma[(sigma_index) + 2 * 2], \
546 sigma[(sigma_index) + 2 * 3]) \
547
548
549#define ADD_128(a, b) _mm_add_epi32(a, b)
550#define XOR_128(a, b) _mm_xor_si128(a, b)
551
552#define D_ADD_128(dest, src) dest = ADD_128(dest, src)
553#define D_XOR_128(dest, src) dest = XOR_128(dest, src)
554#define D_ROR_128(dest, shift) dest = MM_ROR_EPI32(dest, shift)
555#define D_ADD_EPI64_128(dest, src) dest = _mm_add_epi64(dest, src)
556
557
558#define AXR(a, b, d, shift) \
559 D_ADD_128(a, b); \
560 D_XOR_128(d, a); \
561 D_ROR_128(d, shift);
562
563#define AXR2(a, b, c, d, input, sigma_index, shift1, shift2) \
564 a = _mm_add_epi32 (a, SET_ROW_FROM_SIGMA(input, sigma_index)); \
565 AXR(a, b, d, shift1) \
566 AXR(c, d, b, shift2)
567
568#define ROTATE_WORDS_TO_RIGHT(a, n) \
569 a = _mm_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
570
571#define AXR4(a, b, c, d, input, sigma_index) \
572 AXR2(a, b, c, d, input, sigma_index, 16, 12) \
573 AXR2(a, b, c, d, input, sigma_index + 1, 8, 7) \
574
575#define RR2(a, b, c, d, input) \
576 { \
577 AXR4(a, b, c, d, input, 0) \
578 ROTATE_WORDS_TO_RIGHT(b, 1) \
579 ROTATE_WORDS_TO_RIGHT(c, 2) \
580 ROTATE_WORDS_TO_RIGHT(d, 3) \
581 AXR4(a, b, c, d, input, 8) \
582 ROTATE_WORDS_TO_RIGHT(b, 3) \
583 ROTATE_WORDS_TO_RIGHT(c, 2) \
584 ROTATE_WORDS_TO_RIGHT(d, 1) \
585 }
586
587
588/*
589Way1:
590per 64-byte block:
59110 rounds * 4 iters * (7 + 2) = 360 cycles, if pslld TP=1
592 * (7 + 1) = 320 cycles, if pslld TP=2 (skl+)
593additional operations per 7_op_iter :
5944 movzx byte mem
5951 movd mem
5963 pinsrd mem
5971.5 pshufd
598*/
599
600static
601#if 0 || 0 && (defined(Z7_BLAKE2S_USE_V128_WAY2) || \
602 defined(Z7_BLAKE2S_USE_V256_WAY2))
603 Z7_NO_INLINE
604#else
605 Z7_FORCE_INLINE
606#endif
607#ifdef BLAKE2S_ATTRIB_128BIT
608 BLAKE2S_ATTRIB_128BIT
609#endif
610void
611Z7_FASTCALL
612Blake2s_Compress_V128_Way1(UInt32 * const s, const Byte * const input)
613{
614 __m128i a, b, c, d;
615 __m128i f0, f1;
616
617 LOAD_ROTATE_CONSTS
618 d = LOAD_128_FROM_STRUCT(STATE_T(s));
619 c = k_iv0_128;
620 a = f0 = LOAD_128_FROM_STRUCT(s);
621 b = f1 = LOAD_128_FROM_STRUCT(s + 4);
622 D_ADD_EPI64_128(d, k_inc);
623 STORE_128_TO_STRUCT (STATE_T(s), d);
624 D_XOR_128(d, k_iv4_128);
625
626#define RR(r) { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
627 RR2(a, b, c, d, input) }
628
629 ROUNDS_LOOP(RR)
630#undef RR
631
632 STORE_128_TO_STRUCT(s , XOR_128(f0, XOR_128(a, c)));
633 STORE_128_TO_STRUCT(s + 4, XOR_128(f1, XOR_128(b, d)));
634}
635
636
637static
638Z7_NO_INLINE
639#ifdef BLAKE2S_ATTRIB_128BIT
640 BLAKE2S_ATTRIB_128BIT
641#endif
642void
643Z7_FASTCALL
644Blake2sp_Compress2_V128_Way1(UInt32 *s_items, const Byte *data, const Byte *end)
645{
646 size_t pos = 0;
647 do
58 { 648 {
59 unsigned i; 649 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
650 Blake2s_Compress_V128_Way1(s, data);
651 data += Z7_BLAKE2S_BLOCK_SIZE;
652 pos += Z7_BLAKE2S_BLOCK_SIZE;
653 pos &= SUPER_BLOCK_MASK;
654 }
655 while (data != end);
656}
657
658
659#if defined(Z7_BLAKE2S_USE_V128_WAY2) || \
660 defined(Z7_BLAKE2S_USE_AVX2_WAY2)
661#if 1
662 #define Z7_BLAKE2S_CompressSingleBlock(s, data) \
663 Blake2sp_Compress2_V128_Way1(s, data, \
664 (const Byte *)(const void *)(data) + Z7_BLAKE2S_BLOCK_SIZE)
665#else
666 #define Z7_BLAKE2S_CompressSingleBlock Blake2s_Compress_V128_Way1
667#endif
668#endif
669
670
671#if (defined(Z7_BLAKE2S_USE_AVX2_WAY_SLOW) || \
672 defined(Z7_BLAKE2S_USE_V128_WAY2)) && \
673 !defined(Z7_BLAKE2S_USE_GATHER)
674#define AXR2_LOAD_INDEXES(sigma_index) \
675 const unsigned i0 = sigma[(sigma_index)]; \
676 const unsigned i1 = sigma[(sigma_index) + 2 * 1]; \
677 const unsigned i2 = sigma[(sigma_index) + 2 * 2]; \
678 const unsigned i3 = sigma[(sigma_index) + 2 * 3]; \
679
680#define SET_ROW_FROM_SIGMA_W(input) \
681 SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3)
682#endif
683
684
685#ifdef Z7_BLAKE2S_USE_V128_WAY2
686
687#if 1 || !defined(Z7_BLAKE2S_USE_SSE41)
688/* we use SET_ROW_FROM_SIGMA_BASE, that uses
689 (SSE4) _mm_insert_epi32(), if Z7_BLAKE2S_USE_INSERT_INSTRUCTION is defined
690 (SSE2) _mm_set_epi32()
691 MSVC can be faster for this branch:
692*/
693#define AXR2_W(sigma_index, shift1, shift2) \
694 { \
695 AXR2_LOAD_INDEXES(sigma_index) \
696 a0 = _mm_add_epi32(a0, SET_ROW_FROM_SIGMA_W(data)); \
697 a1 = _mm_add_epi32(a1, SET_ROW_FROM_SIGMA_W(data + Z7_BLAKE2S_BLOCK_SIZE)); \
698 AXR(a0, b0, d0, shift1) \
699 AXR(a1, b1, d1, shift1) \
700 AXR(c0, d0, b0, shift2) \
701 AXR(c1, d1, b1, shift2) \
702 }
703#else
704/* we use interleaved _mm_insert_epi32():
705 GCC can be faster for this branch:
706*/
707#define AXR2_W_PRE_INSERT(sigma_index, i) \
708 { const unsigned ii = sigma[(sigma_index) + i * 2]; \
709 t0 = _mm_insert_epi32(t0, *(const Int32 *)GET_SIGMA_PTR(data, ii), i); \
710 t1 = _mm_insert_epi32(t1, *(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii), i); \
711 }
712#define AXR2_W(sigma_index, shift1, shift2) \
713 { __m128i t0, t1; \
714 { const unsigned ii = sigma[sigma_index]; \
715 t0 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, ii)); \
716 t1 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii)); \
717 } \
718 AXR2_W_PRE_INSERT(sigma_index, 1) \
719 AXR2_W_PRE_INSERT(sigma_index, 2) \
720 AXR2_W_PRE_INSERT(sigma_index, 3) \
721 a0 = _mm_add_epi32(a0, t0); \
722 a1 = _mm_add_epi32(a1, t1); \
723 AXR(a0, b0, d0, shift1) \
724 AXR(a1, b1, d1, shift1) \
725 AXR(c0, d0, b0, shift2) \
726 AXR(c1, d1, b1, shift2) \
727 }
728#endif
729
730
731#define AXR4_W(sigma_index) \
732 AXR2_W(sigma_index, 16, 12) \
733 AXR2_W(sigma_index + 1, 8, 7) \
734
735#define WW(r) \
736 { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
737 AXR4_W(0) \
738 ROTATE_WORDS_TO_RIGHT(b0, 1) \
739 ROTATE_WORDS_TO_RIGHT(b1, 1) \
740 ROTATE_WORDS_TO_RIGHT(c0, 2) \
741 ROTATE_WORDS_TO_RIGHT(c1, 2) \
742 ROTATE_WORDS_TO_RIGHT(d0, 3) \
743 ROTATE_WORDS_TO_RIGHT(d1, 3) \
744 AXR4_W(8) \
745 ROTATE_WORDS_TO_RIGHT(b0, 3) \
746 ROTATE_WORDS_TO_RIGHT(b1, 3) \
747 ROTATE_WORDS_TO_RIGHT(c0, 2) \
748 ROTATE_WORDS_TO_RIGHT(c1, 2) \
749 ROTATE_WORDS_TO_RIGHT(d0, 1) \
750 ROTATE_WORDS_TO_RIGHT(d1, 1) \
751 }
752
753
754static
755Z7_NO_INLINE
756#ifdef BLAKE2S_ATTRIB_128BIT
757 BLAKE2S_ATTRIB_128BIT
758#endif
759void
760Z7_FASTCALL
761Blake2sp_Compress2_V128_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
762{
763 size_t pos = 0;
764 end -= Z7_BLAKE2S_BLOCK_SIZE;
765
766 if (data != end)
767 {
768 LOAD_ROTATE_CONSTS
769 do
770 {
771 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
772 __m128i a0, b0, c0, d0;
773 __m128i a1, b1, c1, d1;
774 {
775 const __m128i inc = k_inc;
776 const __m128i temp = k_iv4_128;
777 d0 = LOAD_128_FROM_STRUCT (STATE_T(s));
778 d1 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
779 D_ADD_EPI64_128(d0, inc);
780 D_ADD_EPI64_128(d1, inc);
781 STORE_128_TO_STRUCT (STATE_T(s ), d0);
782 STORE_128_TO_STRUCT (STATE_T(s + NSW), d1);
783 D_XOR_128(d0, temp);
784 D_XOR_128(d1, temp);
785 }
786 c1 = c0 = k_iv0_128;
787 a0 = LOAD_128_FROM_STRUCT(s);
788 b0 = LOAD_128_FROM_STRUCT(s + 4);
789 a1 = LOAD_128_FROM_STRUCT(s + NSW);
790 b1 = LOAD_128_FROM_STRUCT(s + NSW + 4);
791
792 ROUNDS_LOOP (WW)
793
794#undef WW
795
796 D_XOR_128(a0, c0);
797 D_XOR_128(b0, d0);
798 D_XOR_128(a1, c1);
799 D_XOR_128(b1, d1);
800
801 D_XOR_128(a0, LOAD_128_FROM_STRUCT(s));
802 D_XOR_128(b0, LOAD_128_FROM_STRUCT(s + 4));
803 D_XOR_128(a1, LOAD_128_FROM_STRUCT(s + NSW));
804 D_XOR_128(b1, LOAD_128_FROM_STRUCT(s + NSW + 4));
805
806 STORE_128_TO_STRUCT(s, a0);
807 STORE_128_TO_STRUCT(s + 4, b0);
808 STORE_128_TO_STRUCT(s + NSW, a1);
809 STORE_128_TO_STRUCT(s + NSW + 4, b1);
810
811 data += Z7_BLAKE2S_BLOCK_SIZE * 2;
812 pos += Z7_BLAKE2S_BLOCK_SIZE * 2;
813 pos &= SUPER_BLOCK_MASK;
814 }
815 while (data < end);
816 if (data != end)
817 return;
818 }
819 {
820 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
821 Z7_BLAKE2S_CompressSingleBlock(s, data);
822 }
823}
824#endif // Z7_BLAKE2S_USE_V128_WAY2
825
826
827#ifdef Z7_BLAKE2S_USE_V128_WAY2
828 #define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way2
829#else
830 #define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way1
831#endif
832
833
834
835#ifdef Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
836 #define ROT_128_8(x) MM_ROR_EPI32(x, 8)
837 #define ROT_128_16(x) MM_ROR_EPI32(x, 16)
838 #define ROT_128_7(x) MM_ROR_EPI32(x, 7)
839 #define ROT_128_12(x) MM_ROR_EPI32(x, 12)
840#else
841#if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
842 #define ROT_128_8(x) _mm_shuffle_epi8(x, r8) // k_r8
843 #define ROT_128_16(x) _mm_shuffle_epi8(x, r16) // k_r16
844#else
845 #define ROT_128_8(x) MM_ROR_EPI32_VIA_SHIFT(x, 8)
846 #define ROT_128_16(x) MM_ROR_EPI32_VIA_SHIFT(x, 16)
847#endif
848 #define ROT_128_7(x) MM_ROR_EPI32_VIA_SHIFT(x, 7)
849 #define ROT_128_12(x) MM_ROR_EPI32_VIA_SHIFT(x, 12)
850#endif
851
852
853#if 1
854// this branch can provide similar speed on x86* in most cases,
855// because [base + index*4] provides the same speed as [base + index].
856// but some compilers can generate different code with this branch that is sometimes faster.
857// this branch uses an additional table of 10*16=160 bytes.
858#define SIGMA_TABLE_MULT_16( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
859 SIGMA_TABLE_MULT(16, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
860MY_ALIGN(16)
861static const Byte k_Blake2s_Sigma_16[BLAKE2S_NUM_ROUNDS][16] =
862 { SIGMA_TABLE(SIGMA_TABLE_MULT_16) };
863#define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_16[r];
864#define GET_SIGMA_VAL_128(n) (sigma[n])
865#else
866#define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_4[r];
867#define GET_SIGMA_VAL_128(n) (4 * (size_t)sigma[n])
868#endif
869
870
871#ifdef Z7_BLAKE2S_USE_AVX2_FAST
872#if 1
873#define SIGMA_TABLE_MULT_32( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
874 SIGMA_TABLE_MULT(32, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
875MY_ALIGN(64)
876static const UInt16 k_Blake2s_Sigma_32[BLAKE2S_NUM_ROUNDS][16] =
877 { SIGMA_TABLE(SIGMA_TABLE_MULT_32) };
878#define GET_SIGMA_PTR_256(r) const UInt16 * const sigma = k_Blake2s_Sigma_32[r];
879#define GET_SIGMA_VAL_256(n) (sigma[n])
880#else
881#define GET_SIGMA_PTR_256(r) const Byte * const sigma = k_Blake2s_Sigma_4[r];
882#define GET_SIGMA_VAL_256(n) (8 * (size_t)sigma[n])
883#endif
884#endif // Z7_BLAKE2S_USE_AVX2_FAST
885
886
887#define D_ROT_128_7(dest) dest = ROT_128_7(dest)
888#define D_ROT_128_8(dest) dest = ROT_128_8(dest)
889#define D_ROT_128_12(dest) dest = ROT_128_12(dest)
890#define D_ROT_128_16(dest) dest = ROT_128_16(dest)
891
892#define OP_L(a, i) D_ADD_128 (V(a, 0), \
893 LOAD_128((const Byte *)(w) + GET_SIGMA_VAL_128(2*(a)+(i))));
894
895#define OP_0(a) OP_L(a, 0)
896#define OP_7(a) OP_L(a, 1)
897
898#define OP_1(a) D_ADD_128 (V(a, 0), V(a, 1));
899#define OP_2(a) D_XOR_128 (V(a, 3), V(a, 0));
900#define OP_4(a) D_ADD_128 (V(a, 2), V(a, 3));
901#define OP_5(a) D_XOR_128 (V(a, 1), V(a, 2));
902
903#define OP_3(a) D_ROT_128_16 (V(a, 3));
904#define OP_6(a) D_ROT_128_12 (V(a, 1));
905#define OP_8(a) D_ROT_128_8 (V(a, 3));
906#define OP_9(a) D_ROT_128_7 (V(a, 1));
907
908
909// for 32-bit x86 : interleave mode runs slower because of register pressure.
910
911#if 0 || 1 && (defined(MY_CPU_X86) \
912 || defined(__GNUC__) && !defined(__clang__))
913// non-interleaved version:
914// is fast for x86 32-bit.
915// is fast for GCC x86-64.
916
917#define V4G(a) \
918 OP_0 (a) \
919 OP_1 (a) \
920 OP_2 (a) \
921 OP_3 (a) \
922 OP_4 (a) \
923 OP_5 (a) \
924 OP_6 (a) \
925 OP_7 (a) \
926 OP_1 (a) \
927 OP_2 (a) \
928 OP_8 (a) \
929 OP_4 (a) \
930 OP_5 (a) \
931 OP_9 (a) \
932
933#define V4R \
934{ \
935 V4G (0) \
936 V4G (1) \
937 V4G (2) \
938 V4G (3) \
939 V4G (4) \
940 V4G (5) \
941 V4G (6) \
942 V4G (7) \
943}
944
945#elif 0 || 1 && defined(MY_CPU_X86)
946
947#define OP_INTER_2(op, a,b) \
948 op (a) \
949 op (b) \
950
951#define V4G(a,b) \
952 OP_INTER_2 (OP_0, a,b) \
953 OP_INTER_2 (OP_1, a,b) \
954 OP_INTER_2 (OP_2, a,b) \
955 OP_INTER_2 (OP_3, a,b) \
956 OP_INTER_2 (OP_4, a,b) \
957 OP_INTER_2 (OP_5, a,b) \
958 OP_INTER_2 (OP_6, a,b) \
959 OP_INTER_2 (OP_7, a,b) \
960 OP_INTER_2 (OP_1, a,b) \
961 OP_INTER_2 (OP_2, a,b) \
962 OP_INTER_2 (OP_8, a,b) \
963 OP_INTER_2 (OP_4, a,b) \
964 OP_INTER_2 (OP_5, a,b) \
965 OP_INTER_2 (OP_9, a,b) \
966
967#define V4R \
968{ \
969 V4G (0, 1) \
970 V4G (2, 3) \
971 V4G (4, 5) \
972 V4G (6, 7) \
973}
974
975#else
976// interleave-4 version is fast for x64 (MSVC/CLANG)
977
978#define OP_INTER_4(op, a,b,c,d) \
979 op (a) \
980 op (b) \
981 op (c) \
982 op (d) \
983
984#define V4G(a,b,c,d) \
985 OP_INTER_4 (OP_0, a,b,c,d) \
986 OP_INTER_4 (OP_1, a,b,c,d) \
987 OP_INTER_4 (OP_2, a,b,c,d) \
988 OP_INTER_4 (OP_3, a,b,c,d) \
989 OP_INTER_4 (OP_4, a,b,c,d) \
990 OP_INTER_4 (OP_5, a,b,c,d) \
991 OP_INTER_4 (OP_6, a,b,c,d) \
992 OP_INTER_4 (OP_7, a,b,c,d) \
993 OP_INTER_4 (OP_1, a,b,c,d) \
994 OP_INTER_4 (OP_2, a,b,c,d) \
995 OP_INTER_4 (OP_8, a,b,c,d) \
996 OP_INTER_4 (OP_4, a,b,c,d) \
997 OP_INTER_4 (OP_5, a,b,c,d) \
998 OP_INTER_4 (OP_9, a,b,c,d) \
999
1000#define V4R \
1001{ \
1002 V4G (0, 1, 2, 3) \
1003 V4G (4, 5, 6, 7) \
1004}
1005
1006#endif
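For reference, the OP_0..OP_9 macros above are the standard BLAKE2s mixing function G split into independent steps so several G applications can be interleaved. One scalar G call, as a sketch (not the code used here):

#include <stdint.h>

static uint32_t rotr32(uint32_t x, unsigned n)
{
  return (x >> n) | (x << (32 - n));
}

/* one BLAKE2s G(a, b, c, d) with message words x and y */
static void blake2s_g(uint32_t v[16], int a, int b, int c, int d,
                      uint32_t x, uint32_t y)
{
  v[a] += v[b] + x;  v[d] = rotr32(v[d] ^ v[a], 16);
  v[c] += v[d];      v[b] = rotr32(v[b] ^ v[c], 12);
  v[a] += v[b] + y;  v[d] = rotr32(v[d] ^ v[a],  8);
  v[c] += v[d];      v[b] = rotr32(v[b] ^ v[c],  7);
}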
1007
1008#define V4_ROUND(r) { GET_SIGMA_PTR_128(r); V4R }
1009
1010
1011#define V4_LOAD_MSG_1(w, m, i) \
1012{ \
1013 __m128i m0, m1, m2, m3; \
1014 __m128i t0, t1, t2, t3; \
1015 m0 = LOADU_128((m) + ((i) + 0 * 4) * 16); \
1016 m1 = LOADU_128((m) + ((i) + 1 * 4) * 16); \
1017 m2 = LOADU_128((m) + ((i) + 2 * 4) * 16); \
1018 m3 = LOADU_128((m) + ((i) + 3 * 4) * 16); \
1019 t0 = _mm_unpacklo_epi32(m0, m1); \
1020 t1 = _mm_unpackhi_epi32(m0, m1); \
1021 t2 = _mm_unpacklo_epi32(m2, m3); \
1022 t3 = _mm_unpackhi_epi32(m2, m3); \
1023 w[(i) * 4 + 0] = _mm_unpacklo_epi64(t0, t2); \
1024 w[(i) * 4 + 1] = _mm_unpackhi_epi64(t0, t2); \
1025 w[(i) * 4 + 2] = _mm_unpacklo_epi64(t1, t3); \
1026 w[(i) * 4 + 3] = _mm_unpackhi_epi64(t1, t3); \
1027}
1028
1029#define V4_LOAD_MSG(w, m) \
1030{ \
1031 V4_LOAD_MSG_1 (w, m, 0) \
1032 V4_LOAD_MSG_1 (w, m, 1) \
1033 V4_LOAD_MSG_1 (w, m, 2) \
1034 V4_LOAD_MSG_1 (w, m, 3) \
1035}
1036
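V4_LOAD_MSG_1 above is a 4x4 transpose of 32-bit words: it turns four consecutive message blocks into lane-parallel form with four 32-bit unpacks followed by four 64-bit unpacks. The same pattern in isolation (SSE2 sketch):

#include <emmintrin.h>  /* SSE2 */

/* transpose four rows of four 32-bit words (rows a, b, c, d) */
static void transpose4x4_epi32(__m128i r[4])
{
  const __m128i t0 = _mm_unpacklo_epi32(r[0], r[1]); /* a0 b0 a1 b1 */
  const __m128i t1 = _mm_unpackhi_epi32(r[0], r[1]); /* a2 b2 a3 b3 */
  const __m128i t2 = _mm_unpacklo_epi32(r[2], r[3]); /* c0 d0 c1 d1 */
  const __m128i t3 = _mm_unpackhi_epi32(r[2], r[3]); /* c2 d2 c3 d3 */
  r[0] = _mm_unpacklo_epi64(t0, t2);                 /* a0 b0 c0 d0 */
  r[1] = _mm_unpackhi_epi64(t0, t2);                 /* a1 b1 c1 d1 */
  r[2] = _mm_unpacklo_epi64(t1, t3);                 /* a2 b2 c2 d2 */
  r[3] = _mm_unpackhi_epi64(t1, t3);                 /* a3 b3 c3 d3 */
}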
1037#define V4_LOAD_UNPACK_PAIR_128(src32, i, d0, d1) \
1038{ \
1039 const __m128i v0 = LOAD_128_FROM_STRUCT((src32) + (i ) * 4); \
1040 const __m128i v1 = LOAD_128_FROM_STRUCT((src32) + (i + 1) * 4); \
1041 d0 = _mm_unpacklo_epi32(v0, v1); \
1042 d1 = _mm_unpackhi_epi32(v0, v1); \
1043}
1044
1045#define V4_UNPACK_PAIR_128(dest32, i, s0, s1) \
1046{ \
1047 STORE_128_TO_STRUCT((dest32) + i * 4 , _mm_unpacklo_epi64(s0, s1)); \
1048 STORE_128_TO_STRUCT((dest32) + i * 4 + 16, _mm_unpackhi_epi64(s0, s1)); \
1049}
1050
1051#define V4_UNPACK_STATE(dest32, src32) \
1052{ \
1053 __m128i t0, t1, t2, t3, t4, t5, t6, t7; \
1054 V4_LOAD_UNPACK_PAIR_128(src32, 0, t0, t1) \
1055 V4_LOAD_UNPACK_PAIR_128(src32, 2, t2, t3) \
1056 V4_LOAD_UNPACK_PAIR_128(src32, 4, t4, t5) \
1057 V4_LOAD_UNPACK_PAIR_128(src32, 6, t6, t7) \
1058 V4_UNPACK_PAIR_128(dest32, 0, t0, t2) \
1059 V4_UNPACK_PAIR_128(dest32, 8, t1, t3) \
1060 V4_UNPACK_PAIR_128(dest32, 1, t4, t6) \
1061 V4_UNPACK_PAIR_128(dest32, 9, t5, t7) \
1062}
1063
1064
1065static
1066Z7_NO_INLINE
1067#ifdef BLAKE2S_ATTRIB_128BIT
1068 BLAKE2S_ATTRIB_128BIT
1069#endif
1070void
1071Z7_FASTCALL
1072Blake2sp_Compress2_V128_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
1073{
1074 // PrintStates2(s_items, 8, 16);
1075 size_t pos = 0;
1076 pos /= 2;
1077 do
1078 {
1079#if defined(Z7_BLAKE2S_USE_SSSE3) && \
1080 !defined(Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED)
1081 const __m128i r8 = k_r8;
1082 const __m128i r16 = k_r16;
1083#endif
1084 __m128i w[16];
1085 __m128i v[16];
1086 UInt32 *s;
1087 V4_LOAD_MSG(w, data)
1088 s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1089 {
1090 __m128i ctr = LOAD_128_FROM_STRUCT(s + 64);
1091 D_ADD_EPI64_128 (ctr, k_inc);
1092 STORE_128_TO_STRUCT(s + 64, ctr);
1093 v[12] = XOR_128 (GET_128_IV_WAY4(4), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
1094 v[13] = XOR_128 (GET_128_IV_WAY4(5), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
1095 }
1096 v[ 8] = GET_128_IV_WAY4(0);
1097 v[ 9] = GET_128_IV_WAY4(1);
1098 v[10] = GET_128_IV_WAY4(2);
1099 v[11] = GET_128_IV_WAY4(3);
1100 v[14] = GET_128_IV_WAY4(6);
1101 v[15] = GET_128_IV_WAY4(7);
60 1102
61 for (i = 0; i < 16; i++) 1103#define LOAD_STATE_128_FROM_STRUCT(i) \
62 m[i] = GetUi32(p->buf + i * sizeof(m[i])); 1104 v[i] = LOAD_128_FROM_STRUCT(s + (i) * 4);
1105
1106#define UPDATE_STATE_128_IN_STRUCT(i) \
1107 STORE_128_TO_STRUCT(s + (i) * 4, XOR_128( \
1108 XOR_128(v[i], v[(i) + 8]), \
1109 LOAD_128_FROM_STRUCT(s + (i) * 4)));
63 1110
64 for (i = 0; i < 8; i++) 1111 REP8_MACRO (LOAD_STATE_128_FROM_STRUCT)
65 v[i] = p->h[i]; 1112 ROUNDS_LOOP (V4_ROUND)
1113 REP8_MACRO (UPDATE_STATE_128_IN_STRUCT)
1114
1115 data += Z7_BLAKE2S_BLOCK_SIZE * 4;
1116 pos += Z7_BLAKE2S_BLOCK_SIZE * 4 / 2;
1117 pos &= SUPER_BLOCK_SIZE / 2 - 1;
66 } 1118 }
1119 while (data != end);
1120}
67 1121
68 v[ 8] = k_Blake2s_IV[0];
69 v[ 9] = k_Blake2s_IV[1];
70 v[10] = k_Blake2s_IV[2];
71 v[11] = k_Blake2s_IV[3];
72
73 v[12] = p->t[0] ^ k_Blake2s_IV[4];
74 v[13] = p->t[1] ^ k_Blake2s_IV[5];
75 v[14] = p->f[0] ^ k_Blake2s_IV[6];
76 v[15] = p->f[1] ^ k_Blake2s_IV[7];
77 1122
78 #define G(r,i,a,b,c,d) \ 1123static
79 a += b + m[sigma[2*i+0]]; d ^= a; d = rotr32(d, 16); c += d; b ^= c; b = rotr32(b, 12); \ 1124Z7_NO_INLINE
80 a += b + m[sigma[2*i+1]]; d ^= a; d = rotr32(d, 8); c += d; b ^= c; b = rotr32(b, 7); \ 1125#ifdef BLAKE2S_ATTRIB_128BIT
1126 BLAKE2S_ATTRIB_128BIT
1127#endif
1128void
1129Z7_FASTCALL
1130Blake2sp_Final_V128_Fast(UInt32 *states)
1131{
1132 const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
1133 // printf("\nBlake2sp_Compress2_V128_Fast_Final4\n");
1134 // PrintStates2(states, 8, 16);
1135 {
1136 ptrdiff_t pos = 8 * 4;
1137 do
1138 {
1139 UInt32 *src32 = states + (size_t)(pos * 1);
1140 UInt32 *dest32 = states + (size_t)(pos * 2);
1141 V4_UNPACK_STATE(dest32, src32)
1142 pos -= 8 * 4;
1143 }
1144 while (pos >= 0);
1145 }
1146 {
1147 unsigned k;
1148 for (k = 0; k < 8; k++)
1149 {
1150 UInt32 *s = states + (size_t)k * 16;
1151 STORE_128_TO_STRUCT (STATE_T(s), ctr);
1152 }
1153 }
1154 // PrintStates2(states, 8, 16);
1155}
1156
1157
1158
1159#ifdef Z7_BLAKE2S_USE_AVX2
1160
1161#define ADD_256(a, b) _mm256_add_epi32(a, b)
1162#define XOR_256(a, b) _mm256_xor_si256(a, b)
1163
1164#if 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
1165 #define MM256_ROR_EPI32 _mm256_ror_epi32
1166 #define Z7_MM256_ROR_EPI32_IS_SUPPORTED
1167 #define LOAD_ROTATE_CONSTS_256
1168#else
1169#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1170#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
1171 #define LOAD_ROTATE_CONSTS_256 \
1172 const __m256i r8 = k_r8_256; \
1173 const __m256i r16 = k_r16_256;
1174#endif // AVX2_WAY2
1175
1176 #define MM256_ROR_EPI32(r, c) ( \
1177 ( 8==(c)) ? _mm256_shuffle_epi8(r,r8) \
1178 : (16==(c)) ? _mm256_shuffle_epi8(r,r16) \
1179 : _mm256_or_si256( \
1180 _mm256_srli_epi32((r), (c)), \
1181 _mm256_slli_epi32((r), 32-(c))))
1182#endif // WAY_SLOW
1183#endif
1184
1185
1186#define D_ADD_256(dest, src) dest = ADD_256(dest, src)
1187#define D_XOR_256(dest, src) dest = XOR_256(dest, src)
1188
1189#define LOADU_256(p) _mm256_loadu_si256((const __m256i *)(const void *)(p))
1190
1191#ifdef Z7_BLAKE2S_USE_AVX2_FAST
1192
1193#ifdef Z7_MM256_ROR_EPI32_IS_SUPPORTED
1194#define ROT_256_16(x) MM256_ROR_EPI32((x), 16)
1195#define ROT_256_12(x) MM256_ROR_EPI32((x), 12)
1196#define ROT_256_8(x) MM256_ROR_EPI32((x), 8)
1197#define ROT_256_7(x) MM256_ROR_EPI32((x), 7)
1198#else
1199#define ROTATE8 _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, \
1200 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
1201#define ROTATE16 _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, \
1202 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
1203#define ROT_256_16(x) _mm256_shuffle_epi8((x), ROTATE16)
1204#define ROT_256_12(x) _mm256_or_si256(_mm256_srli_epi32((x), 12), _mm256_slli_epi32((x), 20))
1205#define ROT_256_8(x) _mm256_shuffle_epi8((x), ROTATE8)
1206#define ROT_256_7(x) _mm256_or_si256(_mm256_srli_epi32((x), 7), _mm256_slli_epi32((x), 25))
1207#endif
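Without AVX-512's _mm256_ror_epi32, the rotations by 8 and 16 collapse to a single byte shuffle (the ROTATE8/ROTATE16 constants above), while 7 and 12 need the generic two-shifts-and-an-or. The 128-bit equivalents, as a sketch (SSSE3 for the shuffle path):

#include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 */

/* generic rotate-right of four 32-bit lanes by a constant c (0 < c < 32) */
#define ROR32X4(x, c) \
  _mm_or_si128(_mm_srli_epi32(x, c), _mm_slli_epi32(x, 32 - (c)))

/* rotate right by 8: one byte shuffle with a fixed permutation */
static __m128i ror32x4_by8(__m128i x)
{
  const __m128i k = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9,
                                 4, 7, 6, 5, 0, 3, 2, 1);
  return _mm_shuffle_epi8(x, k);
}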
1208
1209#define D_ROT_256_7(dest) dest = ROT_256_7(dest)
1210#define D_ROT_256_8(dest) dest = ROT_256_8(dest)
1211#define D_ROT_256_12(dest) dest = ROT_256_12(dest)
1212#define D_ROT_256_16(dest) dest = ROT_256_16(dest)
1213
1214#define LOAD_256(p) _mm256_load_si256((const __m256i *)(const void *)(p))
1215#ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
1216 #define STOREU_256(p, r) _mm256_storeu_si256((__m256i *)(void *)(p), r)
1217 #define LOAD_256_FROM_STRUCT(p) LOADU_256(p)
1218 #define STORE_256_TO_STRUCT(p, r) STOREU_256(p, r)
1219#else
1220 // if the struct is 32-byte aligned
1221 #define STORE_256(p, r) _mm256_store_si256((__m256i *)(void *)(p), r)
1222 #define LOAD_256_FROM_STRUCT(p) LOAD_256(p)
1223 #define STORE_256_TO_STRUCT(p, r) STORE_256(p, r)
1224#endif
1225
1226#endif // Z7_BLAKE2S_USE_AVX2_FAST
1227
1228
1229
1230#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1231
1232#if 0
1233 #define DIAG_PERM2(s) \
1234 { \
1235 const __m256i a = LOAD_256_FROM_STRUCT((s) ); \
1236 const __m256i b = LOAD_256_FROM_STRUCT((s) + NSW); \
1237 STORE_256_TO_STRUCT((s ), _mm256_permute2x128_si256(a, b, 0x20)); \
1238 STORE_256_TO_STRUCT((s + NSW), _mm256_permute2x128_si256(a, b, 0x31)); \
1239 }
1240#else
1241 #define DIAG_PERM2(s) \
1242 { \
1243 const __m128i a = LOAD_128_FROM_STRUCT((s) + 4); \
1244 const __m128i b = LOAD_128_FROM_STRUCT((s) + NSW); \
1245 STORE_128_TO_STRUCT((s) + NSW, a); \
1246 STORE_128_TO_STRUCT((s) + 4 , b); \
1247 }
1248#endif
1249 #define DIAG_PERM8(s_items) \
1250 { \
1251 DIAG_PERM2(s_items) \
1252 DIAG_PERM2(s_items + NSW * 2) \
1253 DIAG_PERM2(s_items + NSW * 4) \
1254 DIAG_PERM2(s_items + NSW * 6) \
1255 }
1256
1257
1258#define AXR256(a, b, d, shift) \
1259 D_ADD_256(a, b); \
1260 D_XOR_256(d, a); \
1261 d = MM256_ROR_EPI32(d, shift); \
1262
1263
1264
1265#ifdef Z7_BLAKE2S_USE_GATHER
1266
1267 #define TABLE_GATHER_256_4(a0,a1,a2,a3) \
1268 a0,a1,a2,a3, a0+16,a1+16,a2+16,a3+16
1269 #define TABLE_GATHER_256( \
1270 a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
1271 { TABLE_GATHER_256_4(a0,a2,a4,a6), \
1272 TABLE_GATHER_256_4(a1,a3,a5,a7), \
1273 TABLE_GATHER_256_4(a8,a10,a12,a14), \
1274 TABLE_GATHER_256_4(a9,a11,a13,a15) }
1275MY_ALIGN(64)
1276static const UInt32 k_Blake2s_Sigma_gather256[BLAKE2S_NUM_ROUNDS][16 * 2] =
1277 { SIGMA_TABLE(TABLE_GATHER_256) };
1278 #define GET_SIGMA(r) \
1279 const UInt32 * const sigma = k_Blake2s_Sigma_gather256[r];
1280 #define AXR2_LOAD_INDEXES_AVX(sigma_index) \
1281 const __m256i i01234567 = LOAD_256(sigma + (sigma_index));
1282 #define SET_ROW_FROM_SIGMA_AVX(in) \
1283 _mm256_i32gather_epi32((const void *)(in), i01234567, 4)
1284 #define SIGMA_INTERLEAVE 8
1285 #define SIGMA_HALF_ROW_SIZE 16
1286
1287#else // !Z7_BLAKE2S_USE_GATHER
1288
1289 #define GET_SIGMA(r) \
1290 const Byte * const sigma = k_Blake2s_Sigma_4[r];
1291 #define AXR2_LOAD_INDEXES_AVX(sigma_index) \
1292 AXR2_LOAD_INDEXES(sigma_index)
1293 #define SET_ROW_FROM_SIGMA_AVX(in) \
1294 MY_mm256_set_m128i( \
1295 SET_ROW_FROM_SIGMA_W((in) + Z7_BLAKE2S_BLOCK_SIZE), \
1296 SET_ROW_FROM_SIGMA_W(in))
1297 #define SIGMA_INTERLEAVE 1
1298 #define SIGMA_HALF_ROW_SIZE 8
1299#endif // !Z7_BLAKE2S_USE_GATHER
1300
81 1301
82 #define R(r) \ 1302#define ROTATE_WORDS_TO_RIGHT_256(a, n) \
83 G(r,0,v[ 0],v[ 4],v[ 8],v[12]) \ 1303 a = _mm256_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
84 G(r,1,v[ 1],v[ 5],v[ 9],v[13]) \
85 G(r,2,v[ 2],v[ 6],v[10],v[14]) \
86 G(r,3,v[ 3],v[ 7],v[11],v[15]) \
87 G(r,4,v[ 0],v[ 5],v[10],v[15]) \
88 G(r,5,v[ 1],v[ 6],v[11],v[12]) \
89 G(r,6,v[ 2],v[ 7],v[ 8],v[13]) \
90 G(r,7,v[ 3],v[ 4],v[ 9],v[14]) \
91 1304
1305
1306
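ROTATE_WORDS_TO_RIGHT_256 above performs the BLAKE2 diagonalization: after the column half-round, rows b, c, d are rotated by 1, 2, 3 lane positions so the diagonal G calls line up column-wise, then rotated back by 3, 2, 1. One such lane rotation, as a 128-bit sketch:

#include <emmintrin.h>  /* SSE2 */

/* rotate the four 32-bit lanes right by one position:
   dst lane 0 <- src lane 1, ..., dst lane 3 <- src lane 0 */
static __m128i rotate_lanes_right_1(__m128i a)
{
  return _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 3, 2, 1));
}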
1307#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
1308
1309#define AXR2_A(sigma_index, shift1, shift2) \
1310 AXR2_LOAD_INDEXES_AVX(sigma_index) \
1311 D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
1312 AXR256(a0, b0, d0, shift1) \
1313 AXR256(c0, d0, b0, shift2) \
1314
1315#define AXR4_A(sigma_index) \
1316 { AXR2_A(sigma_index, 16, 12) } \
1317 { AXR2_A(sigma_index + SIGMA_INTERLEAVE, 8, 7) }
1318
1319#define EE1(r) \
1320 { GET_SIGMA(r) \
1321 AXR4_A(0) \
1322 ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
1323 ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1324 ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
1325 AXR4_A(SIGMA_HALF_ROW_SIZE) \
1326 ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
1327 ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1328 ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
1329 }
1330
1331static
1332Z7_NO_INLINE
1333#ifdef BLAKE2S_ATTRIB_AVX2
1334 BLAKE2S_ATTRIB_AVX2
1335#endif
1336void
1337Z7_FASTCALL
1338Blake2sp_Compress2_AVX2_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
1339{
1340 size_t pos = 0;
1341 end -= Z7_BLAKE2S_BLOCK_SIZE;
1342
1343 if (data != end)
92 { 1344 {
93 unsigned r; 1345 LOAD_ROTATE_CONSTS_256
94 for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) 1346 DIAG_PERM8(s_items)
1347 do
95 { 1348 {
96 const Byte *sigma = k_Blake2s_Sigma[r]; 1349 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
97 R(r) 1350 __m256i a0, b0, c0, d0;
1351 {
1352 const __m128i inc = k_inc;
1353 __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
1354 __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
1355 D_ADD_EPI64_128(d0_128, inc);
1356 D_ADD_EPI64_128(d1_128, inc);
1357 STORE_128_TO_STRUCT (STATE_T(s ), d0_128);
1358 STORE_128_TO_STRUCT (STATE_T(s + NSW), d1_128);
1359 d0 = MY_mm256_set_m128i(d1_128, d0_128);
1360 D_XOR_256(d0, k_iv4_256);
1361 }
1362 c0 = SET_FROM_128(k_iv0_128);
1363 a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
1364 b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);
1365
1366 ROUNDS_LOOP (EE1)
1367
1368 D_XOR_256(a0, c0);
1369 D_XOR_256(b0, d0);
1370
1371 D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
1372 D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
1373
1374 STORE_256_TO_STRUCT(s + NSW * 0, a0);
1375 STORE_256_TO_STRUCT(s + NSW * 1, b0);
1376
1377 data += Z7_BLAKE2S_BLOCK_SIZE * 2;
1378 pos += Z7_BLAKE2S_BLOCK_SIZE * 2;
1379 pos &= SUPER_BLOCK_MASK;
98 } 1380 }
99 /* R(0); R(1); R(2); R(3); R(4); R(5); R(6); R(7); R(8); R(9); */ 1381 while (data < end);
1382 DIAG_PERM8(s_items)
1383 if (data != end)
1384 return;
1385 }
1386 {
1387 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1388 Z7_BLAKE2S_CompressSingleBlock(s, data);
100 } 1389 }
1390}
1391
1392#endif // Z7_BLAKE2S_USE_AVX2_WAY2
101 1393
102 #undef G
103 #undef R
104 1394
1395
1396#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
1397
1398#define AXR2_X(sigma_index, shift1, shift2) \
1399 AXR2_LOAD_INDEXES_AVX(sigma_index) \
1400 D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
1401 D_ADD_256( a1, SET_ROW_FROM_SIGMA_AVX((data) + Z7_BLAKE2S_BLOCK_SIZE * 2)); \
1402 AXR256(a0, b0, d0, shift1) \
1403 AXR256(a1, b1, d1, shift1) \
1404 AXR256(c0, d0, b0, shift2) \
1405 AXR256(c1, d1, b1, shift2) \
1406
1407#define AXR4_X(sigma_index) \
1408 { AXR2_X(sigma_index, 16, 12) } \
1409 { AXR2_X(sigma_index + SIGMA_INTERLEAVE, 8, 7) }
1410
1411#define EE2(r) \
1412 { GET_SIGMA(r) \
1413 AXR4_X(0) \
1414 ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
1415 ROTATE_WORDS_TO_RIGHT_256(b1, 1) \
1416 ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1417 ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
1418 ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
1419 ROTATE_WORDS_TO_RIGHT_256(d1, 3) \
1420 AXR4_X(SIGMA_HALF_ROW_SIZE) \
1421 ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
1422 ROTATE_WORDS_TO_RIGHT_256(b1, 3) \
1423 ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1424 ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
1425 ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
1426 ROTATE_WORDS_TO_RIGHT_256(d1, 1) \
1427 }
1428
1429static
1430Z7_NO_INLINE
1431#ifdef BLAKE2S_ATTRIB_AVX2
1432 BLAKE2S_ATTRIB_AVX2
1433#endif
1434void
1435Z7_FASTCALL
1436Blake2sp_Compress2_AVX2_Way4(UInt32 *s_items, const Byte *data, const Byte *end)
1437{
1438 size_t pos = 0;
1439
1440 if ((size_t)(end - data) >= Z7_BLAKE2S_BLOCK_SIZE * 4)
105 { 1441 {
106 unsigned i; 1442#ifndef Z7_MM256_ROR_EPI32_IS_SUPPORTED
107 for (i = 0; i < 8; i++) 1443 const __m256i r8 = k_r8_256;
108 p->h[i] ^= v[i] ^ v[i + 8]; 1444 const __m256i r16 = k_r16_256;
1445#endif
1446 end -= Z7_BLAKE2S_BLOCK_SIZE * 3;
1447 DIAG_PERM8(s_items)
1448 do
1449 {
1450 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1451 __m256i a0, b0, c0, d0;
1452 __m256i a1, b1, c1, d1;
1453 {
1454 const __m128i inc = k_inc;
1455 __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
1456 __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
1457 __m128i d2_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 2));
1458 __m128i d3_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 3));
1459 D_ADD_EPI64_128(d0_128, inc);
1460 D_ADD_EPI64_128(d1_128, inc);
1461 D_ADD_EPI64_128(d2_128, inc);
1462 D_ADD_EPI64_128(d3_128, inc);
1463 STORE_128_TO_STRUCT (STATE_T(s ), d0_128);
1464 STORE_128_TO_STRUCT (STATE_T(s + NSW * 1), d1_128);
1465 STORE_128_TO_STRUCT (STATE_T(s + NSW * 2), d2_128);
1466 STORE_128_TO_STRUCT (STATE_T(s + NSW * 3), d3_128);
1467 d0 = MY_mm256_set_m128i(d1_128, d0_128);
1468 d1 = MY_mm256_set_m128i(d3_128, d2_128);
1469 D_XOR_256(d0, k_iv4_256);
1470 D_XOR_256(d1, k_iv4_256);
1471 }
1472 c1 = c0 = SET_FROM_128(k_iv0_128);
1473 a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
1474 b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);
1475 a1 = LOAD_256_FROM_STRUCT(s + NSW * 2);
1476 b1 = LOAD_256_FROM_STRUCT(s + NSW * 3);
1477
1478 ROUNDS_LOOP (EE2)
1479
1480 D_XOR_256(a0, c0);
1481 D_XOR_256(b0, d0);
1482 D_XOR_256(a1, c1);
1483 D_XOR_256(b1, d1);
1484
1485 D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
1486 D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
1487 D_XOR_256(a1, LOAD_256_FROM_STRUCT(s + NSW * 2));
1488 D_XOR_256(b1, LOAD_256_FROM_STRUCT(s + NSW * 3));
1489
1490 STORE_256_TO_STRUCT(s + NSW * 0, a0);
1491 STORE_256_TO_STRUCT(s + NSW * 1, b0);
1492 STORE_256_TO_STRUCT(s + NSW * 2, a1);
1493 STORE_256_TO_STRUCT(s + NSW * 3, b1);
1494
1495 data += Z7_BLAKE2S_BLOCK_SIZE * 4;
1496 pos += Z7_BLAKE2S_BLOCK_SIZE * 4;
1497 pos &= SUPER_BLOCK_MASK;
1498 }
1499 while (data < end);
1500 DIAG_PERM8(s_items)
1501 end += Z7_BLAKE2S_BLOCK_SIZE * 3;
109 } 1502 }
1503 if (data == end)
1504 return;
1505 // Z7_BLAKE2S_Compress2_V128(s_items, data, end, pos);
1506 do
1507 {
1508 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1509 Z7_BLAKE2S_CompressSingleBlock(s, data);
1510 data += Z7_BLAKE2S_BLOCK_SIZE;
1511 pos += Z7_BLAKE2S_BLOCK_SIZE;
1512 pos &= SUPER_BLOCK_MASK;
1513 }
1514 while (data != end);
1515}
1516
1517#endif // Z7_BLAKE2S_USE_AVX2_WAY4
1518#endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1519
1520
1521// ---------------------------------------------------------
1522
1523#ifdef Z7_BLAKE2S_USE_AVX2_FAST
1524
1525#define OP256_L(a, i) D_ADD_256 (V(a, 0), \
1526 LOAD_256((const Byte *)(w) + GET_SIGMA_VAL_256(2*(a)+(i))));
1527
1528#define OP256_0(a) OP256_L(a, 0)
1529#define OP256_7(a) OP256_L(a, 1)
1530
1531#define OP256_1(a) D_ADD_256 (V(a, 0), V(a, 1));
1532#define OP256_2(a) D_XOR_256 (V(a, 3), V(a, 0));
1533#define OP256_4(a) D_ADD_256 (V(a, 2), V(a, 3));
1534#define OP256_5(a) D_XOR_256 (V(a, 1), V(a, 2));
1535
1536#define OP256_3(a) D_ROT_256_16 (V(a, 3));
1537#define OP256_6(a) D_ROT_256_12 (V(a, 1));
1538#define OP256_8(a) D_ROT_256_8 (V(a, 3));
1539#define OP256_9(a) D_ROT_256_7 (V(a, 1));
1540
1541
1542#if 0 || 1 && defined(MY_CPU_X86)
1543
1544#define V8_G(a) \
1545 OP256_0 (a) \
1546 OP256_1 (a) \
1547 OP256_2 (a) \
1548 OP256_3 (a) \
1549 OP256_4 (a) \
1550 OP256_5 (a) \
1551 OP256_6 (a) \
1552 OP256_7 (a) \
1553 OP256_1 (a) \
1554 OP256_2 (a) \
1555 OP256_8 (a) \
1556 OP256_4 (a) \
1557 OP256_5 (a) \
1558 OP256_9 (a) \
1559
1560#define V8R { \
1561 V8_G (0); \
1562 V8_G (1); \
1563 V8_G (2); \
1564 V8_G (3); \
1565 V8_G (4); \
1566 V8_G (5); \
1567 V8_G (6); \
1568 V8_G (7); \
1569}
1570
1571#else
1572
1573#define OP256_INTER_4(op, a,b,c,d) \
1574 op (a) \
1575 op (b) \
1576 op (c) \
1577 op (d) \
1578
1579#define V8_G(a,b,c,d) \
1580 OP256_INTER_4 (OP256_0, a,b,c,d) \
1581 OP256_INTER_4 (OP256_1, a,b,c,d) \
1582 OP256_INTER_4 (OP256_2, a,b,c,d) \
1583 OP256_INTER_4 (OP256_3, a,b,c,d) \
1584 OP256_INTER_4 (OP256_4, a,b,c,d) \
1585 OP256_INTER_4 (OP256_5, a,b,c,d) \
1586 OP256_INTER_4 (OP256_6, a,b,c,d) \
1587 OP256_INTER_4 (OP256_7, a,b,c,d) \
1588 OP256_INTER_4 (OP256_1, a,b,c,d) \
1589 OP256_INTER_4 (OP256_2, a,b,c,d) \
1590 OP256_INTER_4 (OP256_8, a,b,c,d) \
1591 OP256_INTER_4 (OP256_4, a,b,c,d) \
1592 OP256_INTER_4 (OP256_5, a,b,c,d) \
1593 OP256_INTER_4 (OP256_9, a,b,c,d) \
1594
1595#define V8R { \
1596 V8_G (0, 1, 2, 3) \
1597 V8_G (4, 5, 6, 7) \
1598}
1599#endif
1600
1601#define V8_ROUND(r) { GET_SIGMA_PTR_256(r); V8R }
1602
1603
1604// for debug:
1605// #define Z7_BLAKE2S_PERMUTE_WITH_GATHER
1606#if defined(Z7_BLAKE2S_PERMUTE_WITH_GATHER)
1607// the gather instruction is slow.
1608#define V8_LOAD_MSG(w, m) \
1609{ \
1610 unsigned i; \
1611 for (i = 0; i < 16; ++i) { \
1612 w[i] = _mm256_i32gather_epi32( \
1613 (const void *)((m) + i * sizeof(UInt32)),\
1614 _mm256_set_epi32(0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00), \
1615 sizeof(UInt32)); \
1616 } \
1617}
1618#else // !Z7_BLAKE2S_PERMUTE_WITH_GATHER
1619
1620#define V8_LOAD_MSG_2(w, a0, a1) \
1621{ \
1622 (w)[0] = _mm256_permute2x128_si256(a0, a1, 0x20); \
1623 (w)[4] = _mm256_permute2x128_si256(a0, a1, 0x31); \
1624}
1625
1626#define V8_LOAD_MSG_4(w, z0, z1, z2, z3) \
1627{ \
1628 __m256i s0, s1, s2, s3; \
1629 s0 = _mm256_unpacklo_epi64(z0, z1); \
1630 s1 = _mm256_unpackhi_epi64(z0, z1); \
1631 s2 = _mm256_unpacklo_epi64(z2, z3); \
1632 s3 = _mm256_unpackhi_epi64(z2, z3); \
1633 V8_LOAD_MSG_2((w) + 0, s0, s2) \
1634 V8_LOAD_MSG_2((w) + 1, s1, s3) \
1635}
1636
1637#define V8_LOAD_MSG_0(t0, t1, m) \
1638{ \
1639 __m256i m0, m1; \
1640 m0 = LOADU_256(m); \
1641 m1 = LOADU_256((m) + 2 * 32); \
1642 t0 = _mm256_unpacklo_epi32(m0, m1); \
1643 t1 = _mm256_unpackhi_epi32(m0, m1); \
1644}
1645
1646#define V8_LOAD_MSG_8(w, m) \
1647{ \
1648 __m256i t0, t1, t2, t3, t4, t5, t6, t7; \
1649 V8_LOAD_MSG_0(t0, t4, (m) + 0 * 4 * 32) \
1650 V8_LOAD_MSG_0(t1, t5, (m) + 1 * 4 * 32) \
1651 V8_LOAD_MSG_0(t2, t6, (m) + 2 * 4 * 32) \
1652 V8_LOAD_MSG_0(t3, t7, (m) + 3 * 4 * 32) \
1653 V8_LOAD_MSG_4((w) , t0, t1, t2, t3) \
1654 V8_LOAD_MSG_4((w) + 2, t4, t5, t6, t7) \
1655}
1656
1657#define V8_LOAD_MSG(w, m) \
1658{ \
1659 V8_LOAD_MSG_8(w, m) \
1660 V8_LOAD_MSG_8((w) + 8, (m) + 32) \
1661}
1662
1663#endif // !Z7_BLAKE2S_PERMUTE_WITH_GATHER
1664
1665
1666#define V8_PERM_PAIR_STORE(u, a0, a2) \
1667{ \
1668 STORE_256_TO_STRUCT((u), _mm256_permute2x128_si256(a0, a2, 0x20)); \
1669 STORE_256_TO_STRUCT((u) + 8, _mm256_permute2x128_si256(a0, a2, 0x31)); \
1670}
1671
1672#define V8_UNPACK_STORE_4(u, z0, z1, z2, z3) \
1673{ \
1674 __m256i s0, s1, s2, s3; \
1675 s0 = _mm256_unpacklo_epi64(z0, z1); \
1676 s1 = _mm256_unpackhi_epi64(z0, z1); \
1677 s2 = _mm256_unpacklo_epi64(z2, z3); \
1678 s3 = _mm256_unpackhi_epi64(z2, z3); \
1679 V8_PERM_PAIR_STORE(u + 0, s0, s2) \
1680 V8_PERM_PAIR_STORE(u + 2, s1, s3) \
1681}
1682
1683#define V8_UNPACK_STORE_0(src32, d0, d1) \
1684{ \
1685 const __m256i v0 = LOAD_256_FROM_STRUCT ((src32) ); \
1686 const __m256i v1 = LOAD_256_FROM_STRUCT ((src32) + 8); \
1687 d0 = _mm256_unpacklo_epi32(v0, v1); \
1688 d1 = _mm256_unpackhi_epi32(v0, v1); \
1689}
1690
1691#define V8_UNPACK_STATE(dest32, src32) \
1692{ \
1693 __m256i t0, t1, t2, t3, t4, t5, t6, t7; \
1694 V8_UNPACK_STORE_0 ((src32) + 16 * 0, t0, t4) \
1695 V8_UNPACK_STORE_0 ((src32) + 16 * 1, t1, t5) \
1696 V8_UNPACK_STORE_0 ((src32) + 16 * 2, t2, t6) \
1697 V8_UNPACK_STORE_0 ((src32) + 16 * 3, t3, t7) \
1698 V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) , t0, t1, t2, t3) \
1699 V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) + 4, t4, t5, t6, t7) \
110} 1700}
111 1701
112 1702
113#define Blake2s_Increment_Counter(S, inc) \
114 { p->t[0] += (inc); p->t[1] += (p->t[0] < (inc)); }
115 1703
116#define Blake2s_Set_LastBlock(p) \ 1704#define V8_LOAD_STATE_256_FROM_STRUCT(i) \
117 { p->f[0] = BLAKE2S_FINAL_FLAG; p->f[1] = p->lastNode_f1; } 1705 v[i] = LOAD_256_FROM_STRUCT(s_items + (i) * 8);
1706
1707#if 0 || 0 && defined(MY_CPU_X86)
1708#define Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1709#endif
1710
1711#ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1712// this branch doesn't use the (iv) array,
1713// so register pressure can be lower.
1714// it can sometimes be faster.
1715#define V8_LOAD_STATE_256(i) V8_LOAD_STATE_256_FROM_STRUCT(i)
1716#define V8_UPDATE_STATE_256(i) \
1717{ \
1718 STORE_256_TO_STRUCT(s_items + (i) * 8, XOR_256( \
1719 XOR_256(v[i], v[(i) + 8]), \
1720 LOAD_256_FROM_STRUCT(s_items + (i) * 8))); \
1721}
1722#else
1723// it uses more variables (the iv registers),
1724// which is better for gcc.
1725// maybe that branch is better if register pressure is lower (avx512)
1726#define V8_LOAD_STATE_256(i) { iv[i] = v[i]; }
1727#define V8_UPDATE_STATE_256(i) { v[i] = XOR_256(XOR_256(v[i], v[i + 8]), iv[i]); }
1728#define V8_STORE_STATE_256(i) { STORE_256_TO_STRUCT(s_items + (i) * 8, v[i]); }
1729#endif
118 1730
119 1731
120static void Blake2s_Update(CBlake2s *p, const Byte *data, size_t size) 1732#if 0
1733 // use loading constants from memory
1734 #define KK8(n) KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n)
1735MY_ALIGN(64)
1736static const UInt32 k_Blake2s_IV_WAY8[]=
121{ 1737{
122 while (size != 0) 1738 KK8(0), KK8(1), KK8(2), KK8(3), KK8(4), KK8(5), KK8(6), KK8(7)
123 { 1739};
124 unsigned pos = (unsigned)p->bufPos; 1740 #define GET_256_IV_WAY8(i) LOAD_256(k_Blake2s_IV_WAY8 + 8 * (i))
125 unsigned rem = BLAKE2S_BLOCK_SIZE - pos; 1741#else
1742 // use constant generation:
1743 #define GET_256_IV_WAY8(i) _mm256_set1_epi32((Int32)KIV(i))
1744#endif
126 1745
127 if (size <= rem) 1746
1747static
1748Z7_NO_INLINE
1749#ifdef BLAKE2S_ATTRIB_AVX2
1750 BLAKE2S_ATTRIB_AVX2
1751#endif
1752void
1753Z7_FASTCALL
1754Blake2sp_Compress2_AVX2_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
1755{
1756#ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1757 __m256i v[16];
1758#endif
1759
1760 // PrintStates2(s_items, 8, 16);
1761
1762#ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1763 REP8_MACRO (V8_LOAD_STATE_256_FROM_STRUCT)
1764#endif
1765
1766 do
1767 {
1768 __m256i w[16];
1769#ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1770 __m256i v[16];
1771#else
1772 __m256i iv[8];
1773#endif
1774 V8_LOAD_MSG(w, data)
128 { 1775 {
129 memcpy(p->buf + pos, data, size); 1776 // we use load/store ctr inside loop to reduce register pressure:
130 p->bufPos += (UInt32)size; 1777#if 1 || 1 && defined(MY_CPU_X86)
131 return; 1778 const __m256i ctr = _mm256_add_epi64(
1779 LOAD_256_FROM_STRUCT(s_items + 64),
1780 _mm256_set_epi32(
1781 0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE,
1782 0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE));
1783 STORE_256_TO_STRUCT(s_items + 64, ctr);
1784#else
1785 const UInt64 ctr64 = *(const UInt64 *)(const void *)(s_items + 64)
1786 + Z7_BLAKE2S_BLOCK_SIZE;
1787 const __m256i ctr = _mm256_set_epi64x(0, (Int64)ctr64, 0, (Int64)ctr64);
1788 *(UInt64 *)(void *)(s_items + 64) = ctr64;
1789#endif
1790 v[12] = XOR_256 (GET_256_IV_WAY8(4), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
1791 v[13] = XOR_256 (GET_256_IV_WAY8(5), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
132 } 1792 }
1793 v[ 8] = GET_256_IV_WAY8(0);
1794 v[ 9] = GET_256_IV_WAY8(1);
1795 v[10] = GET_256_IV_WAY8(2);
1796 v[11] = GET_256_IV_WAY8(3);
1797 v[14] = GET_256_IV_WAY8(6);
1798 v[15] = GET_256_IV_WAY8(7);
133 1799
134 memcpy(p->buf + pos, data, rem); 1800 REP8_MACRO (V8_LOAD_STATE_256)
135 Blake2s_Increment_Counter(S, BLAKE2S_BLOCK_SIZE) 1801 ROUNDS_LOOP (V8_ROUND)
136 Blake2s_Compress(p); 1802 REP8_MACRO (V8_UPDATE_STATE_256)
137 p->bufPos = 0; 1803 data += SUPER_BLOCK_SIZE;
138 data += rem;
139 size -= rem;
140 } 1804 }
1805 while (data != end);
1806
1807#ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1808 REP8_MACRO (V8_STORE_STATE_256)
1809#endif
141} 1810}
142 1811
143 1812
144static void Blake2s_Final(CBlake2s *p, Byte *digest) 1813static
1814Z7_NO_INLINE
1815#ifdef BLAKE2S_ATTRIB_AVX2
1816 BLAKE2S_ATTRIB_AVX2
1817#endif
1818void
1819Z7_FASTCALL
1820Blake2sp_Final_AVX2_Fast(UInt32 *states)
145{ 1821{
146 unsigned i; 1822 const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
1823 // PrintStates2(states, 8, 16);
1824 V8_UNPACK_STATE(states, states)
1825 // PrintStates2(states, 8, 16);
1826 {
1827 unsigned k;
1828 for (k = 0; k < 8; k++)
1829 {
1830 UInt32 *s = states + (size_t)k * 16;
1831 STORE_128_TO_STRUCT (STATE_T(s), ctr);
1832 }
1833 }
1834 // PrintStates2(states, 8, 16);
1835 // printf("\nafter V8_UNPACK_STATE \n");
1836}
1837
1838#endif // Z7_BLAKE2S_USE_AVX2_FAST
1839#endif // avx2
1840#endif // vector
1841
1842
1843/*
1844#define Blake2s_Increment_Counter(s, inc) \
1845 { STATE_T(s)[0] += (inc); STATE_T(s)[1] += (STATE_T(s)[0] < (inc)); }
1846#define Blake2s_Increment_Counter_Small(s, inc) \
1847 { STATE_T(s)[0] += (inc); }
1848*/
1849
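The commented-out macro above shows how BLAKE2s carries its 64-bit byte counter in two 32-bit words: the carry out of the low word is exactly the unsigned-wraparound test. As a standalone sketch:

#include <stdint.h>

static void blake2s_increment_counter(uint32_t t[2], uint32_t inc)
{
  t[0] += inc;
  t[1] += (t[0] < inc); /* carry: the low word wrapped iff it is now < inc */
}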
1850#define Blake2s_Set_LastBlock(s) \
1851 { STATE_F(s)[0] = BLAKE2S_FINAL_FLAG; /* STATE_F(s)[1] = p->u.header.lastNode_f1; */ }
1852
1853
1854#if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL >= 1600
1855 // good for vs2022
1856 #define LOOP_8(mac) { unsigned kkk; for (kkk = 0; kkk < 8; kkk++) mac(kkk) }
1857#else
1858 // good with Z7_BLAKE2S_UNROLL for GCC9 (arm*/x86*) and MSC_VER_1400-x64.
1859 #define LOOP_8(mac) { REP8_MACRO(mac) }
1860#endif
1861
1862
1863static
1864Z7_FORCE_INLINE
1865// Z7_NO_INLINE
1866void
1867Z7_FASTCALL
1868Blake2s_Compress(UInt32 *s, const Byte *input)
1869{
1870 UInt32 m[16];
1871 UInt32 v[16];
1872 {
1873 unsigned i;
1874 for (i = 0; i < 16; i++)
1875 m[i] = GetUi32(input + i * 4);
1876 }
1877
1878#define INIT_v_FROM_s(i) v[i] = s[i];
1879
1880 LOOP_8(INIT_v_FROM_s)
1881
1882 // Blake2s_Increment_Counter(s, Z7_BLAKE2S_BLOCK_SIZE)
1883 {
1884 const UInt32 t0 = STATE_T(s)[0] + Z7_BLAKE2S_BLOCK_SIZE;
1885 const UInt32 t1 = STATE_T(s)[1] + (t0 < Z7_BLAKE2S_BLOCK_SIZE);
1886 STATE_T(s)[0] = t0;
1887 STATE_T(s)[1] = t1;
1888 v[12] = t0 ^ KIV(4);
1889 v[13] = t1 ^ KIV(5);
1890 }
1891 // v[12] = STATE_T(s)[0] ^ KIV(4);
1892 // v[13] = STATE_T(s)[1] ^ KIV(5);
1893 v[14] = STATE_F(s)[0] ^ KIV(6);
1894 v[15] = STATE_F(s)[1] ^ KIV(7);
1895
1896 v[ 8] = KIV(0);
1897 v[ 9] = KIV(1);
1898 v[10] = KIV(2);
1899 v[11] = KIV(3);
1900 // PrintStates2((const UInt32 *)v, 1, 16);
1901
1902 #define ADD_SIGMA(a, index) V(a, 0) += *(const UInt32 *)GET_SIGMA_PTR(m, sigma[index]);
1903 #define ADD32M(dest, src, a) V(a, dest) += V(a, src);
1904 #define XOR32M(dest, src, a) V(a, dest) ^= V(a, src);
1905 #define RTR32M(dest, shift, a) V(a, dest) = rotrFixed(V(a, dest), shift);
1906
1907// big interleaving can provide a big performance gain, if scheduler queues are small.
1908#if 0 || 1 && defined(MY_CPU_X86)
1909 // interleave-1: for small register number (x86-32bit)
1910 #define G2(index, a, x, y) \
1911 ADD_SIGMA (a, (index) + 2 * 0) \
1912 ADD32M (0, 1, a) \
1913 XOR32M (3, 0, a) \
1914 RTR32M (3, x, a) \
1915 ADD32M (2, 3, a) \
1916 XOR32M (1, 2, a) \
1917 RTR32M (1, y, a) \
1918
1919 #define G(a) \
1920 G2(a * 2 , a, 16, 12) \
1921 G2(a * 2 + 1, a, 8, 7) \
1922
1923 #define R2 \
1924 G(0) \
1925 G(1) \
1926 G(2) \
1927 G(3) \
1928 G(4) \
1929 G(5) \
1930 G(6) \
1931 G(7) \
1932
1933#elif 0 || 1 && defined(MY_CPU_X86_OR_AMD64)
1934 // interleave-2: good if the number of registers is not big (x86-64).
1935
1936 #define REP2(mac, dest, src, a, b) \
1937 mac(dest, src, a) \
1938 mac(dest, src, b)
1939
1940 #define G2(index, a, b, x, y) \
1941 ADD_SIGMA (a, (index) + 2 * 0) \
1942 ADD_SIGMA (b, (index) + 2 * 1) \
1943 REP2 (ADD32M, 0, 1, a, b) \
1944 REP2 (XOR32M, 3, 0, a, b) \
1945 REP2 (RTR32M, 3, x, a, b) \
1946 REP2 (ADD32M, 2, 3, a, b) \
1947 REP2 (XOR32M, 1, 2, a, b) \
1948 REP2 (RTR32M, 1, y, a, b) \
1949
1950 #define G(a, b) \
1951 G2(a * 2 , a, b, 16, 12) \
1952 G2(a * 2 + 1, a, b, 8, 7) \
1953
1954 #define R2 \
1955 G(0, 1) \
1956 G(2, 3) \
1957 G(4, 5) \
1958 G(6, 7) \
147 1959
148 Blake2s_Increment_Counter(S, (UInt32)p->bufPos) 1960#else
149 Blake2s_Set_LastBlock(p) 1961 // interleave-4:
150 memset(p->buf + p->bufPos, 0, BLAKE2S_BLOCK_SIZE - p->bufPos); 1962 // it has big register pressure for x86/x64.
151 Blake2s_Compress(p); 1963 // and MSVC compilers for x86/x64 are slow for this branch.
1964 // but if we have big number of registers, this branch can be faster.
152 1965
153 for (i = 0; i < 8; i++) 1966 #define REP4(mac, dest, src, a, b, c, d) \
1967 mac(dest, src, a) \
1968 mac(dest, src, b) \
1969 mac(dest, src, c) \
1970 mac(dest, src, d)
1971
1972 #define G2(index, a, b, c, d, x, y) \
1973 ADD_SIGMA (a, (index) + 2 * 0) \
1974 ADD_SIGMA (b, (index) + 2 * 1) \
1975 ADD_SIGMA (c, (index) + 2 * 2) \
1976 ADD_SIGMA (d, (index) + 2 * 3) \
1977 REP4 (ADD32M, 0, 1, a, b, c, d) \
1978 REP4 (XOR32M, 3, 0, a, b, c, d) \
1979 REP4 (RTR32M, 3, x, a, b, c, d) \
1980 REP4 (ADD32M, 2, 3, a, b, c, d) \
1981 REP4 (XOR32M, 1, 2, a, b, c, d) \
1982 REP4 (RTR32M, 1, y, a, b, c, d) \
1983
1984 #define G(a, b, c, d) \
1985 G2(a * 2 , a, b, c, d, 16, 12) \
1986 G2(a * 2 + 1, a, b, c, d, 8, 7) \
1987
1988 #define R2 \
1989 G(0, 1, 2, 3) \
1990 G(4, 5, 6, 7) \
1991
1992#endif
1993
1994 #define R(r) { const Byte *sigma = k_Blake2s_Sigma_4[r]; R2 }
1995
1996 // Z7_BLAKE2S_UNROLL gives 5-6 KB larger code, but faster:
1997 // 20-40% faster for (x86/x64) VC2010+/GCC/CLANG.
1998 // 30-60% faster for (arm64-arm32) GCC.
1999 // 5-11% faster for (arm64) CLANG-MAC.
2000 // so Z7_BLAKE2S_UNROLL is a good optimization, if there is no vector branch.
2001 // But if there is a vector branch (for x86*), this scalar code will be mostly unused.
2002 // So we want smaller code (without unrolling) in that case (x86*).
2003#if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
2004 #define Z7_BLAKE2S_UNROLL
2005#endif
2006
2007#ifdef Z7_BLAKE2S_UNROLL
2008 ROUNDS_LOOP_UNROLLED (R)
2009#else
2010 ROUNDS_LOOP (R)
2011#endif
2012
2013 #undef G
2014 #undef G2
2015 #undef R
2016 #undef R2
2017
2018 // printf("\n v after: \n");
2019 // PrintStates2((const UInt32 *)v, 1, 16);
2020#define XOR_s_PAIR_v(i) s[i] ^= v[i] ^ v[i + 8];
2021
2022 LOOP_8(XOR_s_PAIR_v)
2023 // printf("\n s after:\n");
2024 // PrintStates2((const UInt32 *)s, 1, 16);
2025}
2026
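The XOR_s_PAIR_v step that ends Blake2s_Compress is the BLAKE2 feed-forward: both halves of the working vector are folded back into the chaining value. In isolation:

#include <stdint.h>

static void blake2s_feed_forward(uint32_t h[8], const uint32_t v[16])
{
  unsigned i;
  for (i = 0; i < 8; i++)
    h[i] ^= v[i] ^ v[i + 8];
}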
2027
2028static
2029Z7_NO_INLINE
2030void
2031Z7_FASTCALL
2032Blake2sp_Compress2(UInt32 *s_items, const Byte *data, const Byte *end)
2033{
2034 size_t pos = 0;
2035 // PrintStates2(s_items, 8, 16);
2036 do
154 { 2037 {
155 SetUi32(digest + sizeof(p->h[i]) * i, p->h[i]) 2038 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
2039 Blake2s_Compress(s, data);
2040 data += Z7_BLAKE2S_BLOCK_SIZE;
2041 pos += Z7_BLAKE2S_BLOCK_SIZE;
2042 pos &= SUPER_BLOCK_MASK;
156 } 2043 }
2044 while (data != end);
157} 2045}
158 2046
159 2047
160/* ---------- BLAKE2s ---------- */ 2048#ifdef Z7_BLAKE2S_USE_VECTORS
2049
2050static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = Blake2sp_Compress2;
2051static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = Blake2sp_Compress2;
2052static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Init;
2053static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Final;
2054static unsigned g_z7_Blake2sp_SupportedFlags;
2055
2056 #define Z7_BLAKE2SP_Compress_Fast(p) (p)->u.header.func_Compress_Fast
2057 #define Z7_BLAKE2SP_Compress_Single(p) (p)->u.header.func_Compress_Single
2058#else
2059 #define Z7_BLAKE2SP_Compress_Fast(p) Blake2sp_Compress2
2060 #define Z7_BLAKE2SP_Compress_Single(p) Blake2sp_Compress2
2061#endif // Z7_BLAKE2S_USE_VECTORS
2062
161 2063
162/* we need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0() */ 2064#if 1 && defined(MY_CPU_LE)
2065 #define GET_DIGEST(_s, _digest) \
2066 { memcpy(_digest, _s, Z7_BLAKE2S_DIGEST_SIZE); }
2067#else
2068 #define GET_DIGEST(_s, _digest) \
2069 { unsigned _i; for (_i = 0; _i < 8; _i++) \
2070 { SetUi32((_digest) + 4 * _i, (_s)[_i]) } \
2071 }
2072#endif
2073
2074
2075/* ---------- BLAKE2s ---------- */
163/* 2076/*
2077// we need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0()
164typedef struct 2078typedef struct
165{ 2079{
166 Byte digest_length; 2080 Byte digest_length;
167 Byte key_length; 2081 Byte key_length;
168 Byte fanout; 2082 Byte fanout; // = 1 : in sequential mode
169 Byte depth; 2083 Byte depth; // = 1 : in sequential mode
170 UInt32 leaf_length; 2084 UInt32 leaf_length;
171 Byte node_offset[6]; 2085 Byte node_offset[6]; // 0 for the first, leftmost, leaf, or in sequential mode
172 Byte node_depth; 2086 Byte node_depth; // 0 for the leaves, or in sequential mode
173 Byte inner_length; 2087 Byte inner_length; // [0, 32], 0 in sequential mode
174 Byte salt[BLAKE2S_SALTBYTES]; 2088 Byte salt[BLAKE2S_SALTBYTES];
175 Byte personal[BLAKE2S_PERSONALBYTES]; 2089 Byte personal[BLAKE2S_PERSONALBYTES];
176} CBlake2sParam; 2090} CBlake2sParam;
177*/ 2091*/
178 2092
2093#define k_Blake2sp_IV_0 \
2094 (KIV(0) ^ (Z7_BLAKE2S_DIGEST_SIZE | ((UInt32)Z7_BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24)))
2095#define k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth) \
2096 (KIV(3) ^ ((UInt32)(node_depth) << 16) ^ ((UInt32)Z7_BLAKE2S_DIGEST_SIZE << 24))
179 2097
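These constants fold the BLAKE2s parameter block (see the CBlake2sParam layout above) into the IV: word 0 packs digest_length, key_length, fanout and depth; word 3 packs the high node_offset bytes, node_depth and inner_length. A sketch of the packing with BLAKE2sp's fixed values (digest 32, no key, fanout 8, depth 2, inner_length 32):

#include <stdint.h>

static uint32_t blake2sp_param_word0(void)
{
  return 32u           /* digest_length */
       | (0u  <<  8)   /* key_length    */
       | (8u  << 16)   /* fanout        */
       | (2u  << 24);  /* depth: == 0x02080020 total */
}

static uint32_t blake2sp_param_word3(unsigned node_depth)
{
  return ((uint32_t)node_depth << 16)
       | (32u << 24);  /* inner_length */
}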
180static void Blake2sp_Init_Spec(CBlake2s *p, unsigned node_offset, unsigned node_depth) 2098Z7_FORCE_INLINE
2099static void Blake2sp_Init_Spec(UInt32 *s, unsigned node_offset, unsigned node_depth)
181{ 2100{
182 Blake2s_Init0(p); 2101 s[0] = k_Blake2sp_IV_0;
183 2102 s[1] = KIV(1);
184 p->h[0] ^= (BLAKE2S_DIGEST_SIZE | ((UInt32)BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24)); 2103 s[2] = KIV(2) ^ (UInt32)node_offset;
185 p->h[2] ^= ((UInt32)node_offset); 2104 s[3] = k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth);
186 p->h[3] ^= ((UInt32)node_depth << 16) | ((UInt32)BLAKE2S_DIGEST_SIZE << 24); 2105 s[4] = KIV(4);
187 /* 2106 s[5] = KIV(5);
188 P->digest_length = BLAKE2S_DIGEST_SIZE; 2107 s[6] = KIV(6);
189 P->key_length = 0; 2108 s[7] = KIV(7);
190 P->fanout = BLAKE2SP_PARALLEL_DEGREE; 2109
191 P->depth = 2; 2110 STATE_T(s)[0] = 0;
192 P->leaf_length = 0; 2111 STATE_T(s)[1] = 0;
193 store48(P->node_offset, node_offset); 2112 STATE_F(s)[0] = 0;
194 P->node_depth = node_depth; 2113 STATE_F(s)[1] = 0;
195 P->inner_length = BLAKE2S_DIGEST_SIZE;
196 */
197} 2114}
198 2115
199 2116
2117#ifdef Z7_BLAKE2S_USE_V128_FAST
2118
2119static
2120Z7_NO_INLINE
2121#ifdef BLAKE2S_ATTRIB_128BIT
2122 BLAKE2S_ATTRIB_128BIT
2123#endif
2124void
2125Z7_FASTCALL
2126Blake2sp_InitState_V128_Fast(UInt32 *states)
2127{
2128#define STORE_128_PAIR_INIT_STATES_2(i, t0, t1) \
2129 { STORE_128_TO_STRUCT(states + 0 + 4 * (i), (t0)); \
2130 STORE_128_TO_STRUCT(states + 32 + 4 * (i), (t1)); \
2131 }
2132#define STORE_128_PAIR_INIT_STATES_1(i, mac) \
2133 { const __m128i t = mac; \
2134 STORE_128_PAIR_INIT_STATES_2(i, t, t) \
2135 }
2136#define STORE_128_PAIR_INIT_STATES_IV(i) \
2137 STORE_128_PAIR_INIT_STATES_1(i, GET_128_IV_WAY4(i))
2138
2139 STORE_128_PAIR_INIT_STATES_1 (0, _mm_set1_epi32((Int32)k_Blake2sp_IV_0))
2140 STORE_128_PAIR_INIT_STATES_IV (1)
2141 {
2142 const __m128i t = GET_128_IV_WAY4(2);
2143 STORE_128_PAIR_INIT_STATES_2 (2,
2144 XOR_128(t, _mm_set_epi32(3, 2, 1, 0)),
2145 XOR_128(t, _mm_set_epi32(7, 6, 5, 4)))
2146 }
2147 STORE_128_PAIR_INIT_STATES_1 (3, _mm_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
2148 STORE_128_PAIR_INIT_STATES_IV (4)
2149 STORE_128_PAIR_INIT_STATES_IV (5)
2150 STORE_128_PAIR_INIT_STATES_IV (6)
2151 STORE_128_PAIR_INIT_STATES_IV (7)
2152 STORE_128_PAIR_INIT_STATES_1 (16, _mm_set_epi32(0, 0, 0, 0))
2153 // printf("\n== exit Blake2sp_InitState_V128_Fast ctr=%d\n", states[64]);
2154}
2155
2156#endif // Z7_BLAKE2S_USE_V128_FAST
2157
2158
2159#ifdef Z7_BLAKE2S_USE_AVX2_FAST
2160
2161static
2162Z7_NO_INLINE
2163#ifdef BLAKE2S_ATTRIB_AVX2
2164 BLAKE2S_ATTRIB_AVX2
2165#endif
2166void
2167Z7_FASTCALL
2168Blake2sp_InitState_AVX2_Fast(UInt32 *states)
2169{
2170#define STORE_256_INIT_STATES(i, t) \
2171 STORE_256_TO_STRUCT(states + 8 * (i), t);
2172#define STORE_256_INIT_STATES_IV(i) \
2173 STORE_256_INIT_STATES(i, GET_256_IV_WAY8(i))
2174
2175 STORE_256_INIT_STATES (0, _mm256_set1_epi32((Int32)k_Blake2sp_IV_0))
2176 STORE_256_INIT_STATES_IV (1)
2177 STORE_256_INIT_STATES (2, XOR_256( GET_256_IV_WAY8(2),
2178 _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)))
2179 STORE_256_INIT_STATES (3, _mm256_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
2180 STORE_256_INIT_STATES_IV (4)
2181 STORE_256_INIT_STATES_IV (5)
2182 STORE_256_INIT_STATES_IV (6)
2183 STORE_256_INIT_STATES_IV (7)
2184 STORE_256_INIT_STATES (8, _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0))
2185 // printf("\n== exit Blake2sp_InitState_AVX2_Fast\n");
2186}
2187
2188#endif // Z7_BLAKE2S_USE_AVX2_FAST
2189
2190
2191
2192Z7_NO_INLINE
2193void Blake2sp_InitState(CBlake2sp *p)
2194{
2195 size_t i;
2196 // memset(p->states, 0, sizeof(p->states)); // for debug
2197 p->u.header.cycPos = 0;
2198#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2199 if (p->u.header.func_Init)
2200 {
2201 p->u.header.func_Init(p->states);
2202 return;
2203 }
2204#endif
2205 for (i = 0; i < Z7_BLAKE2SP_PARALLEL_DEGREE; i++)
2206 Blake2sp_Init_Spec(p->states + i * NSW, (unsigned)i, 0);
2207}
2208
200void Blake2sp_Init(CBlake2sp *p) 2209void Blake2sp_Init(CBlake2sp *p)
201{ 2210{
202 unsigned i; 2211#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
203 2212 p->u.header.func_Compress_Fast =
204 p->bufPos = 0; 2213#ifdef Z7_BLAKE2S_USE_VECTORS
2214 g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
2215#else
2216 NULL;
2217#endif
2218
2219 p->u.header.func_Compress_Single =
2220#ifdef Z7_BLAKE2S_USE_VECTORS
2221 g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
2222#else
2223 NULL;
2224#endif
2225
2226 p->u.header.func_Init =
2227#ifdef Z7_BLAKE2S_USE_VECTORS
2228 g_Z7_BLAKE2SP_FUNC_INIT_Init;
2229#else
2230 NULL;
2231#endif
205 2232
206 for (i = 0; i < BLAKE2SP_PARALLEL_DEGREE; i++) 2233 p->u.header.func_Final =
207 Blake2sp_Init_Spec(&p->S[i], i, 0); 2234#ifdef Z7_BLAKE2S_USE_VECTORS
2235 g_Z7_BLAKE2SP_FUNC_INIT_Final;
2236#else
2237 NULL;
2238#endif
2239#endif
208 2240
209 p->S[BLAKE2SP_PARALLEL_DEGREE - 1].lastNode_f1 = BLAKE2S_FINAL_FLAG; 2241 Blake2sp_InitState(p);
210} 2242}
211 2243
212 2244
213void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size) 2245void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size)
214{ 2246{
215 unsigned pos = p->bufPos; 2247 size_t pos;
216 while (size != 0) 2248 // printf("\nsize = 0x%6x, cycPos = %5u data = %p\n", (unsigned)size, (unsigned)p->u.header.cycPos, data);
2249 if (size == 0)
2250 return;
2251 pos = p->u.header.cycPos;
2252 // pos < SUPER_BLOCK_SIZE * 2 : is expected
2253 // pos == SUPER_BLOCK_SIZE * 2 : is not expected, but is also supported
2254 {
2255 const size_t pos2 = pos & SUPER_BLOCK_MASK;
2256 if (pos2)
2257 {
2258 const size_t rem = SUPER_BLOCK_SIZE - pos2;
2259 if (rem > size)
2260 {
2261 p->u.header.cycPos = (unsigned)(pos + size);
2262 // cycPos < SUPER_BLOCK_SIZE * 2
2263 memcpy((Byte *)(void *)p->buf32 + pos, data, size);
2264 /* to simplify the code here we don't try to process the first superblock,
2265 if (cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) */
2266 return;
2267 }
2268 // (rem <= size)
2269 memcpy((Byte *)(void *)p->buf32 + pos, data, rem);
2270 pos += rem;
2271 data += rem;
2272 size -= rem;
2273 }
2274 }
2275
2276 // pos <= SUPER_BLOCK_SIZE * 2
2277 // pos % SUPER_BLOCK_SIZE == 0
2278 if (pos)
2279 {
2280 /* pos == SUPER_BLOCK_SIZE ||
2281 pos == SUPER_BLOCK_SIZE * 2 */
2282 size_t end = pos;
2283 if (size > SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE
2284 || (end -= SUPER_BLOCK_SIZE))
2285 {
2286 Z7_BLAKE2SP_Compress_Fast(p)(p->states,
2287 (const Byte *)(const void *)p->buf32,
2288 (const Byte *)(const void *)p->buf32 + end);
2289 if (pos -= end)
2290 memcpy(p->buf32, (const Byte *)(const void *)p->buf32
2291 + SUPER_BLOCK_SIZE, SUPER_BLOCK_SIZE);
2292 }
2293 }
2294
2295 // pos == 0 || (pos == SUPER_BLOCK_SIZE && size <= SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE)
2296 if (size > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
2297 {
2298 // pos == 0
2299 const Byte *end;
2300 const size_t size2 = (size - (SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE + 1))
2301 & ~(size_t)SUPER_BLOCK_MASK;
2302 size -= size2;
2303 // size < SUPER_BLOCK_SIZE * 2
2304 end = data + size2;
2305 Z7_BLAKE2SP_Compress_Fast(p)(p->states, data, end);
2306 data = end;
2307 }
2308
2309 if (size != 0)
217 { 2310 {
218 unsigned index = pos / BLAKE2S_BLOCK_SIZE; 2311 memcpy((Byte *)(void *)p->buf32 + pos, data, size);
219 unsigned rem = BLAKE2S_BLOCK_SIZE - (pos & (BLAKE2S_BLOCK_SIZE - 1)); 2312 pos += size;
220 if (rem > size)
221 rem = (unsigned)size;
222 Blake2s_Update(&p->S[index], data, rem);
223 size -= rem;
224 data += rem;
225 pos += rem;
226 pos &= (BLAKE2S_BLOCK_SIZE * BLAKE2SP_PARALLEL_DEGREE - 1);
227 } 2313 }
228 p->bufPos = pos; 2314 p->u.header.cycPos = (unsigned)pos;
2315 // cycPos < SUPER_BLOCK_SIZE * 2
229} 2316}
230 2317
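Blake2sp_Update above streams through a two-super-block cyclic buffer so whole super-blocks can be compressed in place while at least one final block always stays buffered for Blake2sp_Final. The single-lane shape of that discipline, as a simplified sketch with hypothetical names:

#include <stddef.h>
#include <string.h>

typedef struct { unsigned char buf[64]; size_t pos; } ctx_t;

static void compress_block(ctx_t *c, const unsigned char *p)
{
  (void)c; (void)p; /* placeholder for the real per-block compression */
}

static void update(ctx_t *c, const unsigned char *data, size_t size)
{
  if (size == 0)
    return;
  if (c->pos) /* top up a partial block first */
  {
    const size_t rem = 64 - c->pos;
    if (rem >= size)
    {
      memcpy(c->buf + c->pos, data, size);
      c->pos += size;
      return;
    }
    memcpy(c->buf + c->pos, data, rem);
    compress_block(c, c->buf);
    c->pos = 0; data += rem; size -= rem;
  }
  while (size > 64) /* strictly '>': the last block stays buffered */
  {
    compress_block(c, data);
    data += 64; size -= 64;
  }
  memcpy(c->buf, data, size);
  c->pos = size;
}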
231 2318
232void Blake2sp_Final(CBlake2sp *p, Byte *digest) 2319void Blake2sp_Final(CBlake2sp *p, Byte *digest)
233{ 2320{
234 CBlake2s R; 2321 // UInt32 * const R_states = p->states;
235 unsigned i; 2322 // printf("\nBlake2sp_Final \n");
2323#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2324 if (p->u.header.func_Final)
2325 p->u.header.func_Final(p->states);
2326#endif
2327 // printf("\n=====\nBlake2sp_Final \n");
2328 // PrintStates(p->states, 32);
2329
2330 // (p->u.header.cycPos == SUPER_BLOCK_SIZE) can be processed in any branch:
2331 if (p->u.header.cycPos <= SUPER_BLOCK_SIZE)
2332 {
2333 unsigned pos;
2334 memset((Byte *)(void *)p->buf32 + p->u.header.cycPos,
2335 0, SUPER_BLOCK_SIZE - p->u.header.cycPos);
2336 STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2337 for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2338 {
2339 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
2340 Blake2s_Set_LastBlock(s)
2341 if (pos + Z7_BLAKE2S_BLOCK_SIZE > p->u.header.cycPos)
2342 {
2343 UInt32 delta = Z7_BLAKE2S_BLOCK_SIZE;
2344 if (pos < p->u.header.cycPos)
2345 delta -= p->u.header.cycPos & (Z7_BLAKE2S_BLOCK_SIZE - 1);
2346 // 0 < delta <= Z7_BLAKE2S_BLOCK_SIZE
2347 {
2348 const UInt32 v = STATE_T(s)[0];
2349 STATE_T(s)[1] -= v < delta; // (v < delta) is the same condition here as (v == 0)
2350 STATE_T(s)[0] = v - delta;
2351 }
2352 }
2353 }
2354 // PrintStates(p->states, 16);
2355 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2356 (Byte *)(void *)p->buf32,
2357 (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);
2358 // PrintStates(p->states, 16);
2359 }
2360 else
2361 {
2362 // (p->u.header.cycPos > SUPER_BLOCK_SIZE)
2363 unsigned pos;
2364 for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2365 {
2366 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
2367 if (pos + SUPER_BLOCK_SIZE >= p->u.header.cycPos)
2368 Blake2s_Set_LastBlock(s)
2369 }
2370 if (p->u.header.cycPos <= SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
2371 STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2372
2373 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2374 (Byte *)(void *)p->buf32,
2375 (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);
236 2376
237 Blake2sp_Init_Spec(&R, 0, 1); 2377 // if (p->u.header.cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
238 R.lastNode_f1 = BLAKE2S_FINAL_FLAG; 2378 STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2379
2380 // if (p->u.header.cycPos != SUPER_BLOCK_SIZE)
2381 {
2382 pos = SUPER_BLOCK_SIZE;
2383 for (;;)
2384 {
2385 UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos & SUPER_BLOCK_MASK);
2386 Blake2s_Set_LastBlock(s)
2387 pos += Z7_BLAKE2S_BLOCK_SIZE;
2388 if (pos >= p->u.header.cycPos)
2389 {
2390 if (pos != p->u.header.cycPos)
2391 {
2392 const UInt32 delta = pos - p->u.header.cycPos;
2393 const UInt32 v = STATE_T(s)[0];
2394 STATE_T(s)[1] -= v < delta;
2395 STATE_T(s)[0] = v - delta;
2396 memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, 0, delta);
2397 }
2398 break;
2399 }
2400 }
2401 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2402 (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE,
2403 (Byte *)(void *)p->buf32 + pos);
2404 }
2405 }
239 2406
240 for (i = 0; i < BLAKE2SP_PARALLEL_DEGREE; i++)
241 { 2407 {
242 Byte hash[BLAKE2S_DIGEST_SIZE]; 2408 size_t pos;
243 Blake2s_Final(&p->S[i], hash); 2409 for (pos = 0; pos < SUPER_BLOCK_SIZE / 2; pos += Z7_BLAKE2S_BLOCK_SIZE / 2)
244 Blake2s_Update(&R, hash, BLAKE2S_DIGEST_SIZE); 2410 {
2411 const UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, (pos * 2));
2412 Byte *dest = (Byte *)(void *)p->buf32 + pos;
2413 GET_DIGEST(s, dest)
2414 }
245 } 2415 }
2416 Blake2sp_Init_Spec(p->states, 0, 1);
2417 {
2418 size_t pos;
2419 for (pos = 0; pos < (Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2S_DIGEST_SIZE)
2420 - Z7_BLAKE2S_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2421 {
2422 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2423 (const Byte *)(const void *)p->buf32 + pos,
2424 (const Byte *)(const void *)p->buf32 + pos + Z7_BLAKE2S_BLOCK_SIZE);
2425 }
2426 }
2427 // Blake2s_Final(p->states, 0, digest, p, (Byte *)(void *)p->buf32 + i);
2428 Blake2s_Set_LastBlock(p->states)
2429 STATE_F(p->states)[1] = BLAKE2S_FINAL_FLAG;
2430 {
2431 Z7_BLAKE2SP_Compress_Single(p)(p->states,
2432 (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE,
2433 (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE);
2434 }
2435 GET_DIGEST(p->states, digest)
2436 // printf("\n Blake2sp_Final 555 numDataInBufs = %5u\n", (unsigned)p->u.header.numDataInBufs);
2437}
2438
246 2439
247 Blake2s_Final(&R, digest); 2440BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo)
2441{
2442 // printf("\n========== setfunction = %d ======== \n", algo);
2443#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2444 Z7_BLAKE2SP_FUNC_COMPRESS func = NULL;
2445 Z7_BLAKE2SP_FUNC_COMPRESS func_Single = NULL;
2446 Z7_BLAKE2SP_FUNC_INIT func_Final = NULL;
2447 Z7_BLAKE2SP_FUNC_INIT func_Init = NULL;
2448#else
2449 UNUSED_VAR(p)
2450#endif
2451
2452#ifdef Z7_BLAKE2S_USE_VECTORS
2453
2454 func = func_Single = Blake2sp_Compress2;
2455
2456 if (algo != Z7_BLAKE2SP_ALGO_SCALAR)
2457 {
2458 // printf("\n========== setfunction NON-SCALER ======== \n");
2459 if (algo == Z7_BLAKE2SP_ALGO_DEFAULT)
2460 {
2461 func = g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
2462 func_Single = g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
2463 func_Init = g_Z7_BLAKE2SP_FUNC_INIT_Init;
2464 func_Final = g_Z7_BLAKE2SP_FUNC_INIT_Final;
2465 }
2466 else
2467 {
2468 if ((g_z7_Blake2sp_SupportedFlags & (1u << algo)) == 0)
2469 return False;
2470
2471#ifdef Z7_BLAKE2S_USE_AVX2
2472
2473 func_Single =
2474#if defined(Z7_BLAKE2S_USE_AVX2_WAY2)
2475 Blake2sp_Compress2_AVX2_Way2;
2476#else
2477 Z7_BLAKE2S_Compress2_V128;
2478#endif
2479
2480#ifdef Z7_BLAKE2S_USE_AVX2_FAST
2481 if (algo == Z7_BLAKE2SP_ALGO_V256_FAST)
2482 {
2483 func = Blake2sp_Compress2_AVX2_Fast;
2484 func_Final = Blake2sp_Final_AVX2_Fast;
2485 func_Init = Blake2sp_InitState_AVX2_Fast;
2486 }
2487 else
2488#endif
2489#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
2490 if (algo == Z7_BLAKE2SP_ALGO_V256_WAY2)
2491 func = Blake2sp_Compress2_AVX2_Way2;
2492 else
2493#endif
2494#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
2495 if (algo == Z7_BLAKE2SP_ALGO_V256_WAY4)
2496 {
2497 func_Single = func = Blake2sp_Compress2_AVX2_Way4;
2498 }
2499 else
2500#endif
2501#endif // avx2
2502 {
2503 if (algo == Z7_BLAKE2SP_ALGO_V128_FAST)
2504 {
2505 func = Blake2sp_Compress2_V128_Fast;
2506 func_Final = Blake2sp_Final_V128_Fast;
2507 func_Init = Blake2sp_InitState_V128_Fast;
2508 func_Single = Z7_BLAKE2S_Compress2_V128;
2509 }
2510 else
2511#ifdef Z7_BLAKE2S_USE_V128_WAY2
2512 if (algo == Z7_BLAKE2SP_ALGO_V128_WAY2)
2513 func = func_Single = Blake2sp_Compress2_V128_Way2;
2514 else
2515#endif
2516 {
2517 if (algo != Z7_BLAKE2SP_ALGO_V128_WAY1)
2518 return False;
2519 func = func_Single = Blake2sp_Compress2_V128_Way1;
2520 }
2521 }
2522 }
2523 }
2524#else // !VECTORS
2525 if (algo > 1) // Z7_BLAKE2SP_ALGO_SCALAR
2526 return False;
2527#endif // !VECTORS
2528
2529#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2530 p->u.header.func_Compress_Fast = func;
2531 p->u.header.func_Compress_Single = func_Single;
2532 p->u.header.func_Final = func_Final;
2533 p->u.header.func_Init = func_Init;
2534#endif
2535 // printf("\n p->u.header.func_Compress = %p", p->u.header.func_Compress);
2536 return True;
2537}
2538
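When a specific algorithm id is requested, Blake2sp_SetFunction validates it against the bitmask that z7_Black2sp_Prepare fills at startup (one bit per Z7_BLAKE2SP_ALGO_* value). The check, reduced to a sketch:

static unsigned g_supported_flags; /* bit i set => algorithm id i is usable */

static int algo_is_supported(unsigned algo)
{
  return (g_supported_flags >> algo) & 1u;
}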
2539
2540void z7_Black2sp_Prepare(void)
2541{
2542#ifdef Z7_BLAKE2S_USE_VECTORS
2543 unsigned flags = 0; // (1u << Z7_BLAKE2SP_ALGO_V128_SCALAR);
2544
2545 Z7_BLAKE2SP_FUNC_COMPRESS func_Fast = Blake2sp_Compress2;
2546 Z7_BLAKE2SP_FUNC_COMPRESS func_Single = Blake2sp_Compress2;
2547 Z7_BLAKE2SP_FUNC_INIT func_Init = NULL;
2548 Z7_BLAKE2SP_FUNC_INIT func_Final = NULL;
2549
2550#if defined(MY_CPU_X86_OR_AMD64)
2551 #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
2552 if (CPU_IsSupported_AVX512F_AVX512VL())
2553 #endif
2554 #if defined(Z7_BLAKE2S_USE_SSE41)
2555 if (CPU_IsSupported_SSE41())
2556 #elif defined(Z7_BLAKE2S_USE_SSSE3)
2557 if (CPU_IsSupported_SSSE3())
2558 #elif !defined(MY_CPU_AMD64)
2559 if (CPU_IsSupported_SSE2())
2560 #endif
2561#endif
2562 {
2563 #if defined(Z7_BLAKE2S_USE_SSE41)
2564 // printf("\n========== Blake2s SSE41 128-bit\n");
2565 #elif defined(Z7_BLAKE2S_USE_SSSE3)
2566 // printf("\n========== Blake2s SSSE3 128-bit\n");
2567 #else
2568 // printf("\n========== Blake2s SSE2 128-bit\n");
2569 #endif
2570 // func_Fast = f_vector = Blake2sp_Compress2_V128_Way2;
2571 // printf("\n========== Blake2sp_Compress2_V128_Way2\n");
2572 func_Fast =
2573 func_Single = Z7_BLAKE2S_Compress2_V128;
2574 flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY1);
2575#ifdef Z7_BLAKE2S_USE_V128_WAY2
2576 flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY2);
2577#endif
2578#ifdef Z7_BLAKE2S_USE_V128_FAST
2579 flags |= (1u << Z7_BLAKE2SP_ALGO_V128_FAST);
2580 func_Fast = Blake2sp_Compress2_V128_Fast;
2581 func_Init = Blake2sp_InitState_V128_Fast;
2582 func_Final = Blake2sp_Final_V128_Fast;
2583#endif
2584
2585#ifdef Z7_BLAKE2S_USE_AVX2
2586#if defined(MY_CPU_X86_OR_AMD64)
2587 if (
2588 #if 0 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
2589 CPU_IsSupported_AVX512F_AVX512VL() &&
2590 #endif
2591 CPU_IsSupported_AVX2()
2592 )
2593#endif
2594 {
2595 // #pragma message ("=== Blake2s AVX2")
2596 // printf("\n========== Blake2s AVX2\n");
2597
2598#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
2599 func_Single = Blake2sp_Compress2_AVX2_Way2;
2600 flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY2);
2601#endif
2602#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
2603 flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY4);
2604#endif
2605
2606#ifdef Z7_BLAKE2S_USE_AVX2_FAST
2607 flags |= (1u << Z7_BLAKE2SP_ALGO_V256_FAST);
2608 func_Fast = Blake2sp_Compress2_AVX2_Fast;
2609 func_Init = Blake2sp_InitState_AVX2_Fast;
2610 func_Final = Blake2sp_Final_AVX2_Fast;
2611#elif defined(Z7_BLAKE2S_USE_AVX2_WAY4)
2612 func_Fast = Blake2sp_Compress2_AVX2_Way4;
2613#elif defined(Z7_BLAKE2S_USE_AVX2_WAY2)
2614 func_Fast = Blake2sp_Compress2_AVX2_Way2;
2615#endif
2616 } // avx2
2617#endif // avx2
2618 } // sse*
2619 g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = func_Fast;
2620 g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = func_Single;
2621 g_Z7_BLAKE2SP_FUNC_INIT_Init = func_Init;
2622 g_Z7_BLAKE2SP_FUNC_INIT_Final = func_Final;
2623 g_z7_Blake2sp_SupportedFlags = flags;
2624 // printf("\nflags=%x\n", flags);
2625#endif // vectors
248} 2626}
249 2627
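z7_Black2sp_Prepare is the usual one-time dispatch pattern: probe CPU features once, then publish function pointers that default to the portable scalar routine. A minimal sketch of the pattern (hypothetical names; the probe stands in for the CPU_IsSupported_* calls):

#include <stddef.h>
#include <stdint.h>

typedef void (*compress_fn)(uint32_t *state, const unsigned char *data, size_t size);

/* stand-ins: in the real file these are Blake2sp_Compress2 and the
   SSE2/SSSE3/AVX2 kernels */
static void compress_generic(uint32_t *s, const unsigned char *d, size_t n)
{ (void)s; (void)d; (void)n; }
static void compress_vector(uint32_t *s, const unsigned char *d, size_t n)
{ (void)s; (void)d; (void)n; }

static compress_fn g_compress = compress_generic; /* safe default */

static int cpu_has_vector_unit(void) { return 0; } /* stand-in for CPUID probing */

static void prepare_once(void)
{
  if (cpu_has_vector_unit())
    g_compress = compress_vector; /* callers always go through g_compress */
}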
250#undef rotr32 2628/*
2629#ifdef Z7_BLAKE2S_USE_VECTORS
2630void align_test2(CBlake2sp *sp);
2631void align_test2(CBlake2sp *sp)
2632{
2633 __m128i a = LOAD_128(sp->states);
2634 D_XOR_128(a, LOAD_128(sp->states + 4));
2635 STORE_128(sp->states, a);
2636}
2637void align_test2(void);
2638void align_test2(void)
2639{
2640 CBlake2sp sp;
2641 Blake2sp_Init(&sp);
2642 Blake2sp_Update(&sp, NULL, 0);
2643}
2644#endif
2645*/
diff --git a/C/Bra.c b/C/Bra.c
index 22e0e47..e61edf8 100644
--- a/C/Bra.c
+++ b/C/Bra.c
@@ -1,11 +1,11 @@
1/* Bra.c -- Branch converters for RISC code 1/* Bra.c -- Branch converters for RISC code
22023-04-02 : Igor Pavlov : Public domain */ 22024-01-20 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
6#include "Bra.h" 6#include "Bra.h"
7#include "CpuArch.h"
8#include "RotateDefs.h" 7#include "RotateDefs.h"
8#include "CpuArch.h"
9 9
10#if defined(MY_CPU_SIZEOF_POINTER) \ 10#if defined(MY_CPU_SIZEOF_POINTER) \
11 && ( MY_CPU_SIZEOF_POINTER == 4 \ 11 && ( MY_CPU_SIZEOF_POINTER == 4 \
@@ -26,7 +26,7 @@
26#define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c; 26#define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c;
27// #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c; 27// #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c;
28 28
29#define Z7_BRANCH_CONV(name) z7_BranchConv_ ## name 29#define Z7_BRANCH_CONV(name) z7_ ## name
30 30
31#define Z7_BRANCH_FUNC_MAIN(name) \ 31#define Z7_BRANCH_FUNC_MAIN(name) \
32static \ 32static \
@@ -42,11 +42,11 @@ Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \
42 42
43#ifdef Z7_EXTRACT_ONLY 43#ifdef Z7_EXTRACT_ONLY
44#define Z7_BRANCH_FUNCS_IMP(name) \ 44#define Z7_BRANCH_FUNCS_IMP(name) \
45 Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC, 0) 45 Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0)
46#else 46#else
47#define Z7_BRANCH_FUNCS_IMP(name) \ 47#define Z7_BRANCH_FUNCS_IMP(name) \
48 Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC, 0) \ 48 Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) \
49 Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC, 1) 49 Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC_2, 1)
50#endif 50#endif
51 51
52#if defined(__clang__) 52#if defined(__clang__)
@@ -72,7 +72,7 @@ Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \
72#endif 72#endif
73 73
74 74
75Z7_BRANCH_FUNC_MAIN(ARM64) 75Z7_BRANCH_FUNC_MAIN(BranchConv_ARM64)
76{ 76{
77 // Byte *p = data; 77 // Byte *p = data;
78 const Byte *lim; 78 const Byte *lim;
@@ -121,10 +121,10 @@ Z7_BRANCH_FUNC_MAIN(ARM64)
121 } 121 }
122 } 122 }
123} 123}
124Z7_BRANCH_FUNCS_IMP(ARM64) 124Z7_BRANCH_FUNCS_IMP(BranchConv_ARM64)
125 125
126 126
127Z7_BRANCH_FUNC_MAIN(ARM) 127Z7_BRANCH_FUNC_MAIN(BranchConv_ARM)
128{ 128{
129 // Byte *p = data; 129 // Byte *p = data;
130 const Byte *lim; 130 const Byte *lim;
@@ -152,10 +152,10 @@ Z7_BRANCH_FUNC_MAIN(ARM)
152 } 152 }
153 } 153 }
154} 154}
155Z7_BRANCH_FUNCS_IMP(ARM) 155Z7_BRANCH_FUNCS_IMP(BranchConv_ARM)
156 156
157 157
158Z7_BRANCH_FUNC_MAIN(PPC) 158Z7_BRANCH_FUNC_MAIN(BranchConv_PPC)
159{ 159{
160 // Byte *p = data; 160 // Byte *p = data;
161 const Byte *lim; 161 const Byte *lim;
@@ -192,14 +192,14 @@ Z7_BRANCH_FUNC_MAIN(PPC)
192 } 192 }
193 } 193 }
194} 194}
195Z7_BRANCH_FUNCS_IMP(PPC) 195Z7_BRANCH_FUNCS_IMP(BranchConv_PPC)
196 196
197 197
198#ifdef Z7_CPU_FAST_ROTATE_SUPPORTED 198#ifdef Z7_CPU_FAST_ROTATE_SUPPORTED
199#define BR_SPARC_USE_ROTATE 199#define BR_SPARC_USE_ROTATE
200#endif 200#endif
201 201
202Z7_BRANCH_FUNC_MAIN(SPARC) 202Z7_BRANCH_FUNC_MAIN(BranchConv_SPARC)
203{ 203{
204 // Byte *p = data; 204 // Byte *p = data;
205 const Byte *lim; 205 const Byte *lim;
@@ -254,10 +254,10 @@ Z7_BRANCH_FUNC_MAIN(SPARC)
254 } 254 }
255 } 255 }
256} 256}
257Z7_BRANCH_FUNCS_IMP(SPARC) 257Z7_BRANCH_FUNCS_IMP(BranchConv_SPARC)
258 258
259 259
260Z7_BRANCH_FUNC_MAIN(ARMT) 260Z7_BRANCH_FUNC_MAIN(BranchConv_ARMT)
261{ 261{
262 // Byte *p = data; 262 // Byte *p = data;
263 Byte *lim; 263 Byte *lim;
@@ -335,12 +335,12 @@ Z7_BRANCH_FUNC_MAIN(ARMT)
335 // return (Byte *)(lim + (((lim[1] ^ ~0xfu) & ~7u) == 0 ? 0 : 2)); 335 // return (Byte *)(lim + (((lim[1] ^ ~0xfu) & ~7u) == 0 ? 0 : 2));
336 // return (Byte *)(lim + 2 - (((((unsigned)lim[1] ^ 8) + 8) >> 7) & 2)); 336 // return (Byte *)(lim + 2 - (((((unsigned)lim[1] ^ 8) + 8) >> 7) & 2));
337} 337}
338Z7_BRANCH_FUNCS_IMP(ARMT) 338Z7_BRANCH_FUNCS_IMP(BranchConv_ARMT)
339 339
340 340
341// #define BR_IA64_NO_INLINE 341// #define BR_IA64_NO_INLINE
342 342
343Z7_BRANCH_FUNC_MAIN(IA64) 343Z7_BRANCH_FUNC_MAIN(BranchConv_IA64)
344{ 344{
345 // Byte *p = data; 345 // Byte *p = data;
346 const Byte *lim; 346 const Byte *lim;
@@ -417,4 +417,293 @@ Z7_BRANCH_FUNC_MAIN(IA64)
417 } 417 }
418 } 418 }
419} 419}
420Z7_BRANCH_FUNCS_IMP(IA64) 420Z7_BRANCH_FUNCS_IMP(BranchConv_IA64)
421
422
423#define BR_CONVERT_VAL_ENC(v) v += BR_PC_GET;
424#define BR_CONVERT_VAL_DEC(v) v -= BR_PC_GET;
425
426#if 1 && defined(MY_CPU_LE_UNALIGN)
427 #define RISCV_USE_UNALIGNED_LOAD
428#endif
429
430#ifdef RISCV_USE_UNALIGNED_LOAD
431 #define RISCV_GET_UI32(p) GetUi32(p)
432 #define RISCV_SET_UI32(p, v) { SetUi32(p, v) }
433#else
434 #define RISCV_GET_UI32(p) \
435 ((UInt32)GetUi16a(p) + \
436 ((UInt32)GetUi16a((p) + 2) << 16))
437 #define RISCV_SET_UI32(p, v) { \
438 SetUi16a(p, (UInt16)(v)) \
439 SetUi16a((p) + 2, (UInt16)(v >> 16)) }
440#endif
441
442#if 1 && defined(MY_CPU_LE)
443 #define RISCV_USE_16BIT_LOAD
444#endif
445
446#ifdef RISCV_USE_16BIT_LOAD
447 #define RISCV_LOAD_VAL(p) GetUi16a(p)
448#else
449 #define RISCV_LOAD_VAL(p) (*(p))
450#endif
451
452#define RISCV_INSTR_SIZE 2
453#define RISCV_STEP_1 (4 + RISCV_INSTR_SIZE)
454#define RISCV_STEP_2 4
455#define RISCV_REG_VAL (2 << 7)
456#define RISCV_CMD_VAL 3
457#if 1
458 // for code size optimization:
459 #define RISCV_DELTA_7F 0x7f
460#else
461 #define RISCV_DELTA_7F 0
462#endif
463
464#define RISCV_CHECK_1(v, b) \
465 (((((b) - RISCV_CMD_VAL) ^ ((v) << 8)) & (0xf8000 + RISCV_CMD_VAL)) == 0)
466
467#if 1
468 #define RISCV_CHECK_2(v, r) \
469 ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL | 8)) \
470 << 18) \
471 < ((r) & 0x1d))
472#else
473 // this branch gives larger code, because
474 // compilers generate larger code for big constants.
475 #define RISCV_CHECK_2(v, r) \
476 ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \
477 & ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \
478 < ((r) & 0x1d))
479#endif
480
481
482#define RISCV_SCAN_LOOP \
483 Byte *lim; \
484 size &= ~(SizeT)(RISCV_INSTR_SIZE - 1); \
485 if (size <= 6) return p; \
486 size -= 6; \
487 lim = p + size; \
488 BR_PC_INIT \
489 for (;;) \
490 { \
491 UInt32 a, v; \
492 /* Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE */ \
493 for (;;) \
494 { \
495 if Z7_UNLIKELY(p >= lim) { return p; } \
496 a = (RISCV_LOAD_VAL(p) ^ 0x10u) + 1; \
497 if ((a & 0x77) == 0) break; \
498 a = (RISCV_LOAD_VAL(p + RISCV_INSTR_SIZE) ^ 0x10u) + 1; \
499 p += RISCV_INSTR_SIZE * 2; \
500 if ((a & 0x77) == 0) \
501 { \
502 p -= RISCV_INSTR_SIZE; \
503 if Z7_UNLIKELY(p >= lim) { return p; } \
504 break; \
505 } \
506 }
507// (xx6f ^ 10) + 1 = xx7f + 1 = xx80 : JAL
508// (xxef ^ 10) + 1 = xxff + 1 = xx00 + 100 : JAL
509// (xx17 ^ 10) + 1 = xx07 + 1 = xx08 : AUIPC
510// (xx97 ^ 10) + 1 = xx87 + 1 = xx88 : AUIPC
511
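The filter above passes exactly the four opcode bytes of JAL (0x6f, 0xef) and AUIPC (0x17, 0x97). A standalone check (my sketch, not part of Bra.c) that enumerates the byte values it accepts:

#include <stdio.h>

int main(void)
{
  unsigned b;
  for (b = 0; b < 256; b++)
    if ((((b ^ 0x10u) + 1) & 0x77) == 0)
      printf("0x%02x\n", b);   /* prints exactly 0x17, 0x6f, 0x97, 0xef */
  return 0;
}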
512Byte * Z7_BRANCH_CONV_ENC(RISCV)(Byte *p, SizeT size, UInt32 pc)
513{
514 RISCV_SCAN_LOOP
515 v = a;
516 a = RISCV_GET_UI32(p);
517#ifndef RISCV_USE_16BIT_LOAD
518 v += (UInt32)p[1] << 8;
519#endif
520
521 if ((v & 8) == 0) // JAL
522 {
523 if ((v - (0x100 /* - RISCV_DELTA_7F */)) & 0xd80)
524 {
525 p += RISCV_INSTR_SIZE;
526 continue;
527 }
528 {
529 v = ((a & 1u << 31) >> 11)
530 | ((a & 0x3ff << 21) >> 20)
531 | ((a & 1 << 20) >> 9)
532 | (a & 0xff << 12);
533 BR_CONVERT_VAL_ENC(v)
534 // ((v & 1) == 0)
535 // v: bits [1 : 20] contain offset bits
536#if 0 && defined(RISCV_USE_UNALIGNED_LOAD)
537 a &= 0xfff;
538 a |= ((UInt32)(v << 23))
539 | ((UInt32)(v << 7) & ((UInt32)0xff << 16))
540 | ((UInt32)(v >> 5) & ((UInt32)0xf0 << 8));
541 RISCV_SET_UI32(p, a)
542#else // aligned
543#if 0
544 SetUi16a(p, (UInt16)(((v >> 5) & 0xf000) | (a & 0xfff)))
545#else
546 p[1] = (Byte)(((v >> 13) & 0xf0) | ((a >> 8) & 0xf));
547#endif
548
549#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
550 v <<= 15;
551 v = Z7_BSWAP32(v);
552 SetUi16a(p + 2, (UInt16)v)
553#else
554 p[2] = (Byte)(v >> 9);
555 p[3] = (Byte)(v >> 1);
556#endif
557#endif // aligned
558 }
559 p += 4;
560 continue;
561 } // JAL
562
563 {
564 // AUIPC
565 if (v & 0xe80) // (not x0) and (not x2)
566 {
567 const UInt32 b = RISCV_GET_UI32(p + 4);
568 if (RISCV_CHECK_1(v, b))
569 {
570 {
571 const UInt32 temp = (b << 12) | (0x17 + RISCV_REG_VAL);
572 RISCV_SET_UI32(p, temp)
573 }
574 a &= 0xfffff000;
575 {
576#if 1
577 const int t = -1 >> 1;
578 if (t != -1)
579 a += (b >> 20) - ((b >> 19) & 0x1000); // arithmetic right shift emulation
580 else
581#endif
582 a += (UInt32)((Int32)b >> 20); // arithmetic right shift (sign-extension).
583 }
584 BR_CONVERT_VAL_ENC(a)
585#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
586 a = Z7_BSWAP32(a);
587 RISCV_SET_UI32(p + 4, a)
588#else
589 SetBe32(p + 4, a)
590#endif
591 p += 8;
592 }
593 else
594 p += RISCV_STEP_1;
595 }
596 else
597 {
598 UInt32 r = a >> 27;
599 if (RISCV_CHECK_2(v, r))
600 {
601 v = RISCV_GET_UI32(p + 4);
602 r = (r << 7) + 0x17 + (v & 0xfffff000);
603 a = (a >> 12) | (v << 20);
604 RISCV_SET_UI32(p, r)
605 RISCV_SET_UI32(p + 4, a)
606 p += 8;
607 }
608 else
609 p += RISCV_STEP_2;
610 }
611 }
612 } // for
613}
614
615
616Byte * Z7_BRANCH_CONV_DEC(RISCV)(Byte *p, SizeT size, UInt32 pc)
617{
618 RISCV_SCAN_LOOP
619#ifdef RISCV_USE_16BIT_LOAD
620 if ((a & 8) == 0)
621 {
622#else
623 v = a;
624 a += (UInt32)p[1] << 8;
625 if ((v & 8) == 0)
626 {
627#endif
628 // JAL
629 a -= 0x100 - RISCV_DELTA_7F;
630 if (a & 0xd80)
631 {
632 p += RISCV_INSTR_SIZE;
633 continue;
634 }
635 {
636 const UInt32 a_old = (a + (0xef - RISCV_DELTA_7F)) & 0xfff;
637#if 0 // unaligned
638 a = GetUi32(p);
639 v = (UInt32)(a >> 23) & ((UInt32)0xff << 1)
640 | (UInt32)(a >> 7) & ((UInt32)0xff << 9)
641#elif 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
642 v = GetUi16a(p + 2);
643 v = Z7_BSWAP32(v) >> 15
644#else
645 v = (UInt32)p[3] << 1
646 | (UInt32)p[2] << 9
647#endif
648 | (UInt32)((a & 0xf000) << 5);
649 BR_CONVERT_VAL_DEC(v)
650 a = a_old
651 | (v << 11 & 1u << 31)
652 | (v << 20 & 0x3ff << 21)
653 | (v << 9 & 1 << 20)
654 | (v & 0xff << 12);
655 RISCV_SET_UI32(p, a)
656 }
657 p += 4;
658 continue;
659 } // JAL
660
661 {
662 // AUIPC
663 v = a;
664#if 1 && defined(RISCV_USE_UNALIGNED_LOAD)
665 a = GetUi32(p);
666#else
667 a |= (UInt32)GetUi16a(p + 2) << 16;
668#endif
669 if ((v & 0xe80) == 0) // x0/x2
670 {
671 const UInt32 r = a >> 27;
672 if (RISCV_CHECK_2(v, r))
673 {
674 UInt32 b;
675#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
676 b = RISCV_GET_UI32(p + 4);
677 b = Z7_BSWAP32(b);
678#else
679 b = GetBe32(p + 4);
680#endif
681 v = a >> 12;
682 BR_CONVERT_VAL_DEC(b)
683 a = (r << 7) + 0x17;
684 a += (b + 0x800) & 0xfffff000;
685 v |= b << 20;
686 RISCV_SET_UI32(p, a)
687 RISCV_SET_UI32(p + 4, v)
688 p += 8;
689 }
690 else
691 p += RISCV_STEP_2;
692 }
693 else
694 {
695 const UInt32 b = RISCV_GET_UI32(p + 4);
696 if (!RISCV_CHECK_1(v, b))
697 p += RISCV_STEP_1;
698 else
699 {
700 v = (a & 0xfffff000) | (b >> 20);
701 a = (b << 12) | (0x17 + RISCV_REG_VAL);
702 RISCV_SET_UI32(p, a)
703 RISCV_SET_UI32(p + 4, v)
704 p += 8;
705 }
706 }
707 }
708 } // for
709}
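The AUIPC encoder above probes "const int t = -1 >> 1" because right-shifting a negative value is implementation-defined in C. A self-contained sketch of the same portable arithmetic-shift idiom used there:

#include <stdint.h>

/* Returns b >> 20 with sign extension from bit 31, without relying on
   implementation-defined signed right shift. The probe is a compile-time
   constant, so the branch folds away. */
static uint32_t Asr20(uint32_t b)
{
  const int t = -1 >> 1;                      /* arithmetic or logical? */
  if (t != -1)                                /* logical: rebuild the sign bits */
    return (b >> 20) - ((b >> 19) & 0x1000);
  return (uint32_t)((int32_t)b >> 20);        /* arithmetic: use it directly */
}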
diff --git a/C/Bra.h b/C/Bra.h
index a4ee568..b47112c 100644
--- a/C/Bra.h
+++ b/C/Bra.h
@@ -1,5 +1,5 @@
1/* Bra.h -- Branch converters for executables 1/* Bra.h -- Branch converters for executables
22023-04-02 : Igor Pavlov : Public domain */ 22024-01-20 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_BRA_H 4#ifndef ZIP7_INC_BRA_H
5#define ZIP7_INC_BRA_H 5#define ZIP7_INC_BRA_H
@@ -8,8 +8,12 @@
8 8
9EXTERN_C_BEGIN 9EXTERN_C_BEGIN
10 10
11#define Z7_BRANCH_CONV_DEC(name) z7_BranchConv_ ## name ## _Dec 11/* #define PPC BAD_PPC_11 // for debug */
12#define Z7_BRANCH_CONV_ENC(name) z7_BranchConv_ ## name ## _Enc 12
13#define Z7_BRANCH_CONV_DEC_2(name) z7_ ## name ## _Dec
14#define Z7_BRANCH_CONV_ENC_2(name) z7_ ## name ## _Enc
15#define Z7_BRANCH_CONV_DEC(name) Z7_BRANCH_CONV_DEC_2(BranchConv_ ## name)
16#define Z7_BRANCH_CONV_ENC(name) Z7_BRANCH_CONV_ENC_2(BranchConv_ ## name)
13#define Z7_BRANCH_CONV_ST_DEC(name) z7_BranchConvSt_ ## name ## _Dec 17#define Z7_BRANCH_CONV_ST_DEC(name) z7_BranchConvSt_ ## name ## _Dec
14#define Z7_BRANCH_CONV_ST_ENC(name) z7_BranchConvSt_ ## name ## _Enc 18#define Z7_BRANCH_CONV_ST_ENC(name) z7_BranchConvSt_ ## name ## _Enc
15 19
@@ -20,19 +24,20 @@ typedef Z7_BRANCH_CONV_DECL( (*z7_Func_BranchConv));
20typedef Z7_BRANCH_CONV_ST_DECL((*z7_Func_BranchConvSt)); 24typedef Z7_BRANCH_CONV_ST_DECL((*z7_Func_BranchConvSt));
21 25
22#define Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL 0 26#define Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL 0
23Z7_BRANCH_CONV_ST_DECL(Z7_BRANCH_CONV_ST_DEC(X86)); 27Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_DEC(X86));
24Z7_BRANCH_CONV_ST_DECL(Z7_BRANCH_CONV_ST_ENC(X86)); 28Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_ENC(X86));
25 29
26#define Z7_BRANCH_FUNCS_DECL(name) \ 30#define Z7_BRANCH_FUNCS_DECL(name) \
27Z7_BRANCH_CONV_DECL(Z7_BRANCH_CONV_DEC(name)); \ 31Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_DEC_2(name)); \
28Z7_BRANCH_CONV_DECL(Z7_BRANCH_CONV_ENC(name)); 32Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_ENC_2(name));
29 33
30Z7_BRANCH_FUNCS_DECL(ARM64) 34Z7_BRANCH_FUNCS_DECL (BranchConv_ARM64)
31Z7_BRANCH_FUNCS_DECL(ARM) 35Z7_BRANCH_FUNCS_DECL (BranchConv_ARM)
32Z7_BRANCH_FUNCS_DECL(ARMT) 36Z7_BRANCH_FUNCS_DECL (BranchConv_ARMT)
33Z7_BRANCH_FUNCS_DECL(PPC) 37Z7_BRANCH_FUNCS_DECL (BranchConv_PPC)
34Z7_BRANCH_FUNCS_DECL(SPARC) 38Z7_BRANCH_FUNCS_DECL (BranchConv_SPARC)
35Z7_BRANCH_FUNCS_DECL(IA64) 39Z7_BRANCH_FUNCS_DECL (BranchConv_IA64)
40Z7_BRANCH_FUNCS_DECL (BranchConv_RISCV)
36 41
37/* 42/*
38These functions convert data that contain CPU instructions. 43These functions convert data that contain CPU instructions.
@@ -49,14 +54,14 @@ and one for decoding (_Enc/_Dec postfixes in function name).
49In params: 54In params:
50 data : data buffer 55 data : data buffer
51 size : size of data 56 size : size of data
52 pc : current virtual Program Counter (Instruction Pinter) value 57 pc : current virtual Program Counter (Instruction Pointer) value
53In/Out param: 58In/Out param:
54 state : pointer to state variable (for X86 converter only) 59 state : pointer to state variable (for X86 converter only)
55 60
56Return: 61Return:
57 The pointer to position in (data) buffer after last byte that was processed. 62 The pointer to position in (data) buffer after last byte that was processed.
58 If the caller calls converter again, it must call it starting with that position. 63 If the caller calls converter again, it must call it starting with that position.
59 But the caller is allowed to move data in buffer. so pointer to 64 But the caller is allowed to move data in buffer. So pointer to
60 current processed position also will be changed for next call. 65 current processed position also will be changed for next call.
61 Also the caller must increase internal (pc) value for next call. 66 Also the caller must increase internal (pc) value for next call.
62 67
@@ -65,6 +70,7 @@ Each converter has some characteristics: Endian, Alignment, LookAhead.
65 70
66 X86 little 1 4 71 X86 little 1 4
67 ARMT little 2 2 72 ARMT little 2 2
73 RISCV little 2 6
68 ARM little 4 0 74 ARM little 4 0
69 ARM64 little 4 0 75 ARM64 little 4 0
70 PPC big 4 0 76 PPC big 4 0
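A hedged usage sketch of the converter contract documented above; the function name follows the Z7_BRANCH_CONV_ENC naming from this header, while the chunking shape is mine:

#include "Bra.h"

/* Convert one chunk; the caller keeps the unprocessed tail and the
   advanced pc for the next call, as the contract requires. */
static SizeT ConvertChunk(Byte *data, SizeT size, UInt32 *pc)
{
  const Byte *end = Z7_BRANCH_CONV_ENC(ARM64)(data, size, *pc);
  const SizeT processed = (SizeT)(end - data);
  *pc += (UInt32)processed;
  return processed;   /* bytes [processed, size) must be resubmitted */
}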
diff --git a/C/Compiler.h b/C/Compiler.h
index 185a52d..2a9c2b7 100644
--- a/C/Compiler.h
+++ b/C/Compiler.h
@@ -1,5 +1,5 @@
1/* Compiler.h : Compiler specific defines and pragmas 1/* Compiler.h : Compiler specific defines and pragmas
22023-04-02 : Igor Pavlov : Public domain */ 22024-01-22 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_COMPILER_H 4#ifndef ZIP7_INC_COMPILER_H
5#define ZIP7_INC_COMPILER_H 5#define ZIP7_INC_COMPILER_H
@@ -25,11 +25,79 @@
25#define Z7_MINGW 25#define Z7_MINGW
26#endif 26#endif
27 27
28#if defined(__LCC__) && (defined(__MCST__) || defined(__e2k__))
29#define Z7_MCST_LCC
30#define Z7_MCST_LCC_VERSION (__LCC__ * 100 + __LCC_MINOR__)
31#endif
32
33/*
34#if defined(__AVX2__) \
35 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
36 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
37 || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \
38 || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
39 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
40 #define Z7_COMPILER_AVX2_SUPPORTED
41 #endif
42#endif
43*/
44
28// #pragma GCC diagnostic ignored "-Wunknown-pragmas" 45// #pragma GCC diagnostic ignored "-Wunknown-pragmas"
29 46
30#ifdef __clang__ 47#ifdef __clang__
31// padding size of '' with 4 bytes to alignment boundary 48// padding size of '' with 4 bytes to alignment boundary
32#pragma GCC diagnostic ignored "-Wpadded" 49#pragma GCC diagnostic ignored "-Wpadded"
50
51#if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13) \
52 && defined(__FreeBSD__)
53// freebsd:
54#pragma GCC diagnostic ignored "-Wexcess-padding"
55#endif
56
57#if __clang_major__ >= 16
58#pragma GCC diagnostic ignored "-Wunsafe-buffer-usage"
59#endif
60
61#if __clang_major__ == 13
62#if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16)
63// cheri
64#pragma GCC diagnostic ignored "-Wcapability-to-integer-cast"
65#endif
66#endif
67
68#if __clang_major__ == 13
69 // for <arm_neon.h>
70 #pragma GCC diagnostic ignored "-Wreserved-identifier"
71#endif
72
73#endif // __clang__
74
75#if defined(_WIN32) && defined(__clang__) && __clang_major__ >= 16
76// #pragma GCC diagnostic ignored "-Wcast-function-type-strict"
77#define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION \
78 _Pragma("GCC diagnostic ignored \"-Wcast-function-type-strict\"")
79#else
80#define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
81#endif
82
83typedef void (*Z7_void_Function)(void);
84#if defined(__clang__) || defined(__GNUC__)
85#define Z7_CAST_FUNC_C (Z7_void_Function)
86#elif defined(_MSC_VER) && _MSC_VER > 1920
87#define Z7_CAST_FUNC_C (void *)
88// #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()'
89#else
90#define Z7_CAST_FUNC_C
91#endif
92/*
93#if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__)
94 // #pragma GCC diagnostic ignored "-Wcast-function-type"
95#endif
96*/
97#ifdef __GNUC__
98#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40000) && (Z7_GCC_VERSION < 70000)
99#pragma GCC diagnostic ignored "-Wstrict-aliasing"
100#endif
33#endif 101#endif
34 102
35 103
@@ -101,7 +169,8 @@
101 _Pragma("clang loop unroll(disable)") \ 169 _Pragma("clang loop unroll(disable)") \
102 _Pragma("clang loop vectorize(disable)") 170 _Pragma("clang loop vectorize(disable)")
103 #define Z7_ATTRIB_NO_VECTORIZE 171 #define Z7_ATTRIB_NO_VECTORIZE
104#elif defined(__GNUC__) && (__GNUC__ >= 5) 172#elif defined(__GNUC__) && (__GNUC__ >= 5) \
173 && (!defined(Z7_MCST_LCC_VERSION) || (Z7_MCST_LCC_VERSION >= 12610))
105 #define Z7_ATTRIB_NO_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) 174 #define Z7_ATTRIB_NO_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
106 // __attribute__((optimize("no-unroll-loops"))); 175 // __attribute__((optimize("no-unroll-loops")));
107 #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE 176 #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
@@ -142,15 +211,23 @@
142#endif 211#endif
143 212
144 213
145#if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 36000)) 214#if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30600))
146#define Z7_DIAGNOSCTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ 215
216#if (Z7_CLANG_VERSION < 130000)
217#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \
218 _Pragma("GCC diagnostic push") \
219 _Pragma("GCC diagnostic ignored \"-Wreserved-id-macro\"")
220#else
221#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \
147 _Pragma("GCC diagnostic push") \ 222 _Pragma("GCC diagnostic push") \
148 _Pragma("GCC diagnostic ignored \"-Wreserved-macro-identifier\"") 223 _Pragma("GCC diagnostic ignored \"-Wreserved-macro-identifier\"")
149#define Z7_DIAGNOSCTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER \ 224#endif
225
226#define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER \
150 _Pragma("GCC diagnostic pop") 227 _Pragma("GCC diagnostic pop")
151#else 228#else
152#define Z7_DIAGNOSCTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER 229#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
153#define Z7_DIAGNOSCTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER 230#define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
154#endif 231#endif
155 232
156#define UNUSED_VAR(x) (void)x; 233#define UNUSED_VAR(x) (void)x;
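A minimal sketch (illustrative names) of the double-cast idea behind Z7_CAST_FUNC_C: converting a function pointer through a generic function-pointer type and back is well-defined; only calling through the wrong type would be undefined behavior:

typedef void (*GenericFunc)(void);   /* mirrors Z7_void_Function */
typedef int (*AddFunc)(int, int);

static int Add(int a, int b) { return a + b; }

static int CallViaGeneric(void)
{
  GenericFunc g = (GenericFunc)Add;  /* park it in a generic slot */
  AddFunc f = (AddFunc)g;            /* cast back to the real type */
  return f(2, 3);                    /* call through the correct type */
}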
diff --git a/C/CpuArch.c b/C/CpuArch.c
index 33f8a3a..d51b38a 100644
--- a/C/CpuArch.c
+++ b/C/CpuArch.c
@@ -1,5 +1,5 @@
1/* CpuArch.c -- CPU specific code 1/* CpuArch.c -- CPU specific code
22023-05-18 : Igor Pavlov : Public domain */ 22024-03-02 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -226,7 +226,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
226DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!! 226DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!!
227*/ 227*/
228static 228static
229Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(UInt32 subFunction, UInt32 func, int *CPUInfo) 229Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int32 *CPUInfo)
230{ 230{
231 UNUSED_VAR(subFunction) 231 UNUSED_VAR(subFunction)
232 __cpuid(CPUInfo, func); 232 __cpuid(CPUInfo, func);
@@ -242,13 +242,13 @@ Z7_NO_INLINE
242#endif 242#endif
243void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) 243void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func)
244{ 244{
245 MY_cpuidex((int *)p, (int)func, 0); 245 MY_cpuidex((Int32 *)p, (Int32)func, 0);
246} 246}
247 247
248Z7_NO_INLINE 248Z7_NO_INLINE
249UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) 249UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void)
250{ 250{
251 int a[4]; 251 Int32 a[4];
252 MY_cpuidex(a, 0, 0); 252 MY_cpuidex(a, 0, 0);
253 return a[0]; 253 return a[0];
254} 254}
@@ -384,7 +384,7 @@ BoolInt CPU_IsSupported_CMOV(void)
384 UInt32 a[4]; 384 UInt32 a[4];
385 if (!x86cpuid_Func_1(&a[0])) 385 if (!x86cpuid_Func_1(&a[0]))
386 return 0; 386 return 0;
387 return (a[3] >> 15) & 1; 387 return (BoolInt)(a[3] >> 15) & 1;
388} 388}
389 389
390BoolInt CPU_IsSupported_SSE(void) 390BoolInt CPU_IsSupported_SSE(void)
@@ -393,7 +393,7 @@ BoolInt CPU_IsSupported_SSE(void)
393 CHECK_SYS_SSE_SUPPORT 393 CHECK_SYS_SSE_SUPPORT
394 if (!x86cpuid_Func_1(&a[0])) 394 if (!x86cpuid_Func_1(&a[0]))
395 return 0; 395 return 0;
396 return (a[3] >> 25) & 1; 396 return (BoolInt)(a[3] >> 25) & 1;
397} 397}
398 398
399BoolInt CPU_IsSupported_SSE2(void) 399BoolInt CPU_IsSupported_SSE2(void)
@@ -402,7 +402,7 @@ BoolInt CPU_IsSupported_SSE2(void)
402 CHECK_SYS_SSE_SUPPORT 402 CHECK_SYS_SSE_SUPPORT
403 if (!x86cpuid_Func_1(&a[0])) 403 if (!x86cpuid_Func_1(&a[0]))
404 return 0; 404 return 0;
405 return (a[3] >> 26) & 1; 405 return (BoolInt)(a[3] >> 26) & 1;
406} 406}
407 407
408#endif 408#endif
@@ -419,17 +419,17 @@ static UInt32 x86cpuid_Func_1_ECX(void)
419 419
420BoolInt CPU_IsSupported_AES(void) 420BoolInt CPU_IsSupported_AES(void)
421{ 421{
422 return (x86cpuid_Func_1_ECX() >> 25) & 1; 422 return (BoolInt)(x86cpuid_Func_1_ECX() >> 25) & 1;
423} 423}
424 424
425BoolInt CPU_IsSupported_SSSE3(void) 425BoolInt CPU_IsSupported_SSSE3(void)
426{ 426{
427 return (x86cpuid_Func_1_ECX() >> 9) & 1; 427 return (BoolInt)(x86cpuid_Func_1_ECX() >> 9) & 1;
428} 428}
429 429
430BoolInt CPU_IsSupported_SSE41(void) 430BoolInt CPU_IsSupported_SSE41(void)
431{ 431{
432 return (x86cpuid_Func_1_ECX() >> 19) & 1; 432 return (BoolInt)(x86cpuid_Func_1_ECX() >> 19) & 1;
433} 433}
434 434
435BoolInt CPU_IsSupported_SHA(void) 435BoolInt CPU_IsSupported_SHA(void)
@@ -441,7 +441,7 @@ BoolInt CPU_IsSupported_SHA(void)
441 { 441 {
442 UInt32 d[4]; 442 UInt32 d[4];
443 z7_x86_cpuid(d, 7); 443 z7_x86_cpuid(d, 7);
444 return (d[1] >> 29) & 1; 444 return (BoolInt)(d[1] >> 29) & 1;
445 } 445 }
446} 446}
447 447
@@ -640,8 +640,8 @@ BoolInt CPU_IsSupported_AVX(void)
640 const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK); 640 const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK);
641 // printf("\n=== XGetBV=%d\n", bm); 641 // printf("\n=== XGetBV=%d\n", bm);
642 return 1 642 return 1
643 & (bm >> 1) // SSE state is supported (set by OS) for storing/restoring 643 & (BoolInt)(bm >> 1) // SSE state is supported (set by OS) for storing/restoring
644 & (bm >> 2); // AVX state is supported (set by OS) for storing/restoring 644 & (BoolInt)(bm >> 2); // AVX state is supported (set by OS) for storing/restoring
645 } 645 }
646 // since Win7SP1: we can use GetEnabledXStateFeatures(); 646 // since Win7SP1: we can use GetEnabledXStateFeatures();
647} 647}
@@ -658,10 +658,29 @@ BoolInt CPU_IsSupported_AVX2(void)
658 z7_x86_cpuid(d, 7); 658 z7_x86_cpuid(d, 7);
659 // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); 659 // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
660 return 1 660 return 1
661 & (d[1] >> 5); // avx2 661 & (BoolInt)(d[1] >> 5); // avx2
662 } 662 }
663} 663}
664 664
665/*
666// fix it:
667BoolInt CPU_IsSupported_AVX512F_AVX512VL(void)
668{
669 if (!CPU_IsSupported_AVX())
670 return False;
671 if (z7_x86_cpuid_GetMaxFunc() < 7)
672 return False;
673 {
674 UInt32 d[4];
675 z7_x86_cpuid(d, 7);
676 // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
677 return 1
678 & (BoolInt)(d[1] >> 16) // avx512-f
 679 & (BoolInt)(d[1] >> 31); // avx512-vl
680 }
681}
682*/
683
665BoolInt CPU_IsSupported_VAES_AVX2(void) 684BoolInt CPU_IsSupported_VAES_AVX2(void)
666{ 685{
667 if (!CPU_IsSupported_AVX()) 686 if (!CPU_IsSupported_AVX())
@@ -673,9 +692,9 @@ BoolInt CPU_IsSupported_VAES_AVX2(void)
673 z7_x86_cpuid(d, 7); 692 z7_x86_cpuid(d, 7);
674 // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); 693 // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]);
675 return 1 694 return 1
676 & (d[1] >> 5) // avx2 695 & (BoolInt)(d[1] >> 5) // avx2
677 // & (d[1] >> 31) // avx512vl 696 // & (d[1] >> 31) // avx512vl
678 & (d[2] >> 9); // vaes // VEX-256/EVEX 697 & (BoolInt)(d[2] >> 9); // vaes // VEX-256/EVEX
679 } 698 }
680} 699}
681 700
@@ -688,7 +707,7 @@ BoolInt CPU_IsSupported_PageGB(void)
688 if (d[0] < 0x80000001) 707 if (d[0] < 0x80000001)
689 return False; 708 return False;
690 z7_x86_cpuid(d, 0x80000001); 709 z7_x86_cpuid(d, 0x80000001);
691 return (d[3] >> 26) & 1; 710 return (BoolInt)(d[3] >> 26) & 1;
692 } 711 }
693} 712}
694 713
@@ -760,32 +779,65 @@ BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; }
760 779
761#else // __APPLE__ 780#else // __APPLE__
762 781
763#include <sys/auxv.h> 782#if defined(__GLIBC__) && (__GLIBC__ * 100 + __GLIBC_MINOR__ >= 216)
783 #define Z7_GETAUXV_AVAILABLE
784#else
785// #pragma message("=== is not NEW GLIBC === ")
786 #if defined __has_include
787 #if __has_include (<sys/auxv.h>)
788// #pragma message("=== sys/auxv.h is avail=== ")
789 #define Z7_GETAUXV_AVAILABLE
790 #endif
791 #endif
792#endif
764 793
794#ifdef Z7_GETAUXV_AVAILABLE
795// #pragma message("=== Z7_GETAUXV_AVAILABLE === ")
796#include <sys/auxv.h>
765#define USE_HWCAP 797#define USE_HWCAP
798#endif
766 799
767#ifdef USE_HWCAP 800#ifdef USE_HWCAP
768 801
802#if defined(__FreeBSD__)
803static unsigned long MY_getauxval(int aux)
804{
805 unsigned long val;
806 if (elf_aux_info(aux, &val, sizeof(val)))
807 return 0;
808 return val;
809}
810#else
811#define MY_getauxval getauxval
812 #if defined __has_include
813 #if __has_include (<asm/hwcap.h>)
769#include <asm/hwcap.h> 814#include <asm/hwcap.h>
815 #endif
816 #endif
817#endif
770 818
771 #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \ 819 #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \
772 BoolInt CPU_IsSupported_ ## name1() { return (getauxval(AT_HWCAP) & (HWCAP_ ## name2)) ? 1 : 0; } 820 BoolInt CPU_IsSupported_ ## name1(void) { return (MY_getauxval(AT_HWCAP) & (HWCAP_ ## name2)); }
773 821
774#ifdef MY_CPU_ARM64 822#ifdef MY_CPU_ARM64
775 #define MY_HWCAP_CHECK_FUNC(name) \ 823 #define MY_HWCAP_CHECK_FUNC(name) \
776 MY_HWCAP_CHECK_FUNC_2(name, name) 824 MY_HWCAP_CHECK_FUNC_2(name, name)
825#if 1 || defined(__ARM_NEON)
826 BoolInt CPU_IsSupported_NEON(void) { return True; }
827#else
777 MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD) 828 MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD)
829#endif
778// MY_HWCAP_CHECK_FUNC (ASIMD) 830// MY_HWCAP_CHECK_FUNC (ASIMD)
779#elif defined(MY_CPU_ARM) 831#elif defined(MY_CPU_ARM)
780 #define MY_HWCAP_CHECK_FUNC(name) \ 832 #define MY_HWCAP_CHECK_FUNC(name) \
781 BoolInt CPU_IsSupported_ ## name() { return (getauxval(AT_HWCAP2) & (HWCAP2_ ## name)) ? 1 : 0; } 833 BoolInt CPU_IsSupported_ ## name(void) { return (MY_getauxval(AT_HWCAP2) & (HWCAP2_ ## name)); }
782 MY_HWCAP_CHECK_FUNC_2(NEON, NEON) 834 MY_HWCAP_CHECK_FUNC_2(NEON, NEON)
783#endif 835#endif
784 836
785#else // USE_HWCAP 837#else // USE_HWCAP
786 838
787 #define MY_HWCAP_CHECK_FUNC(name) \ 839 #define MY_HWCAP_CHECK_FUNC(name) \
788 BoolInt CPU_IsSupported_ ## name() { return 0; } 840 BoolInt CPU_IsSupported_ ## name(void) { return 0; }
789 MY_HWCAP_CHECK_FUNC(NEON) 841 MY_HWCAP_CHECK_FUNC(NEON)
790 842
791#endif // USE_HWCAP 843#endif // USE_HWCAP
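A standalone sketch of the probe this diff wraps, assuming Linux with glibc 2.16+ and an aarch64 target that defines HWCAP_AES:

#include <stdio.h>
#include <sys/auxv.h>        /* getauxval, AT_HWCAP */
#if defined __has_include
#if __has_include (<asm/hwcap.h>)
#include <asm/hwcap.h>       /* HWCAP_* feature bits */
#endif
#endif

int main(void)
{
#if defined(__aarch64__) && defined(HWCAP_AES)
  printf("AES instructions: %s\n",
      (getauxval(AT_HWCAP) & HWCAP_AES) ? "yes" : "no");
#else
  printf("HWCAP probe not available on this target\n");
#endif
  return 0;
}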
diff --git a/C/CpuArch.h b/C/CpuArch.h
index 8e5d8a5..dfc68f1 100644
--- a/C/CpuArch.h
+++ b/C/CpuArch.h
@@ -1,5 +1,5 @@
1/* CpuArch.h -- CPU specific code 1/* CpuArch.h -- CPU specific code
22023-04-02 : Igor Pavlov : Public domain */ 22024-05-13 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_CPU_ARCH_H 4#ifndef ZIP7_INC_CPU_ARCH_H
5#define ZIP7_INC_CPU_ARCH_H 5#define ZIP7_INC_CPU_ARCH_H
@@ -20,6 +20,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
20 MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8) 20 MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8)
21*/ 21*/
22 22
23#if !defined(_M_ARM64EC)
23#if defined(_M_X64) \ 24#if defined(_M_X64) \
24 || defined(_M_AMD64) \ 25 || defined(_M_AMD64) \
25 || defined(__x86_64__) \ 26 || defined(__x86_64__) \
@@ -35,6 +36,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
35 #endif 36 #endif
36 #define MY_CPU_64BIT 37 #define MY_CPU_64BIT
37#endif 38#endif
39#endif
38 40
39 41
40#if defined(_M_IX86) \ 42#if defined(_M_IX86) \
@@ -47,17 +49,26 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
47 49
48 50
49#if defined(_M_ARM64) \ 51#if defined(_M_ARM64) \
52 || defined(_M_ARM64EC) \
50 || defined(__AARCH64EL__) \ 53 || defined(__AARCH64EL__) \
51 || defined(__AARCH64EB__) \ 54 || defined(__AARCH64EB__) \
52 || defined(__aarch64__) 55 || defined(__aarch64__)
53 #define MY_CPU_ARM64 56 #define MY_CPU_ARM64
54 #ifdef __ILP32__ 57#if defined(__ILP32__) \
58 || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
55 #define MY_CPU_NAME "arm64-32" 59 #define MY_CPU_NAME "arm64-32"
56 #define MY_CPU_SIZEOF_POINTER 4 60 #define MY_CPU_SIZEOF_POINTER 4
57 #else 61#elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16)
62 #define MY_CPU_NAME "arm64-128"
63 #define MY_CPU_SIZEOF_POINTER 16
64#else
65#if defined(_M_ARM64EC)
66 #define MY_CPU_NAME "arm64ec"
67#else
58 #define MY_CPU_NAME "arm64" 68 #define MY_CPU_NAME "arm64"
69#endif
59 #define MY_CPU_SIZEOF_POINTER 8 70 #define MY_CPU_SIZEOF_POINTER 8
60 #endif 71#endif
61 #define MY_CPU_64BIT 72 #define MY_CPU_64BIT
62#endif 73#endif
63 74
@@ -133,8 +144,36 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
133#endif 144#endif
134 145
135 146
147#if defined(__sparc__) \
148 || defined(__sparc)
149 #define MY_CPU_SPARC
150 #if defined(__LP64__) \
151 || defined(_LP64) \
152 || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
153 #define MY_CPU_NAME "sparcv9"
154 #define MY_CPU_SIZEOF_POINTER 8
155 #define MY_CPU_64BIT
156 #elif defined(__sparc_v9__) \
157 || defined(__sparcv9)
158 #define MY_CPU_64BIT
159 #if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
160 #define MY_CPU_NAME "sparcv9-32"
161 #else
162 #define MY_CPU_NAME "sparcv9m"
163 #endif
164 #elif defined(__sparc_v8__) \
165 || defined(__sparcv8)
166 #define MY_CPU_NAME "sparcv8"
167 #define MY_CPU_SIZEOF_POINTER 4
168 #else
169 #define MY_CPU_NAME "sparc"
170 #endif
171#endif
172
173
136#if defined(__riscv) \ 174#if defined(__riscv) \
137 || defined(__riscv__) 175 || defined(__riscv__)
176 #define MY_CPU_RISCV
138 #if __riscv_xlen == 32 177 #if __riscv_xlen == 32
139 #define MY_CPU_NAME "riscv32" 178 #define MY_CPU_NAME "riscv32"
140 #elif __riscv_xlen == 64 179 #elif __riscv_xlen == 64
@@ -145,6 +184,39 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
145#endif 184#endif
146 185
147 186
187#if defined(__loongarch__)
188 #define MY_CPU_LOONGARCH
189 #if defined(__loongarch64) || defined(__loongarch_grlen) && (__loongarch_grlen == 64)
190 #define MY_CPU_64BIT
191 #endif
192 #if defined(__loongarch64)
193 #define MY_CPU_NAME "loongarch64"
194 #define MY_CPU_LOONGARCH64
195 #else
196 #define MY_CPU_NAME "loongarch"
197 #endif
198#endif
199
200
201// #undef MY_CPU_NAME
202// #undef MY_CPU_SIZEOF_POINTER
203// #define __e2k__
204// #define __SIZEOF_POINTER__ 4
205#if defined(__e2k__)
206 #define MY_CPU_E2K
207 #if defined(__ILP32__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
208 #define MY_CPU_NAME "e2k-32"
209 #define MY_CPU_SIZEOF_POINTER 4
210 #else
211 #define MY_CPU_NAME "e2k"
212 #if defined(__LP64__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
213 #define MY_CPU_SIZEOF_POINTER 8
214 #endif
215 #endif
216 #define MY_CPU_64BIT
217#endif
218
219
148#if defined(MY_CPU_X86) || defined(MY_CPU_AMD64) 220#if defined(MY_CPU_X86) || defined(MY_CPU_AMD64)
149#define MY_CPU_X86_OR_AMD64 221#define MY_CPU_X86_OR_AMD64
150#endif 222#endif
@@ -175,6 +247,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
175 || defined(MY_CPU_ARM_LE) \ 247 || defined(MY_CPU_ARM_LE) \
176 || defined(MY_CPU_ARM64_LE) \ 248 || defined(MY_CPU_ARM64_LE) \
177 || defined(MY_CPU_IA64_LE) \ 249 || defined(MY_CPU_IA64_LE) \
250 || defined(_LITTLE_ENDIAN) \
178 || defined(__LITTLE_ENDIAN__) \ 251 || defined(__LITTLE_ENDIAN__) \
179 || defined(__ARMEL__) \ 252 || defined(__ARMEL__) \
180 || defined(__THUMBEL__) \ 253 || defined(__THUMBEL__) \
@@ -251,6 +324,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
251 324
252 325
253#ifndef MY_CPU_NAME 326#ifndef MY_CPU_NAME
327 // #define MY_CPU_IS_UNKNOWN
254 #ifdef MY_CPU_LE 328 #ifdef MY_CPU_LE
255 #define MY_CPU_NAME "LE" 329 #define MY_CPU_NAME "LE"
256 #elif defined(MY_CPU_BE) 330 #elif defined(MY_CPU_BE)
@@ -295,9 +369,19 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
295#define Z7_BSWAP64(v) _byteswap_uint64(v) 369#define Z7_BSWAP64(v) _byteswap_uint64(v)
296#define Z7_CPU_FAST_BSWAP_SUPPORTED 370#define Z7_CPU_FAST_BSWAP_SUPPORTED
297 371
298#elif (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ 372/* GCC can generate slow code that calls a function for __builtin_bswap32() for:
299 || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) 373 - GCC for RISCV, if Zbb extension is not used.
300 374 - GCC for SPARC.
 375 The code from CLANG for SPARC is also not the fastest.
376 So we don't define Z7_CPU_FAST_BSWAP_SUPPORTED in some cases.
377*/
378#elif (!defined(MY_CPU_RISCV) || defined (__riscv_zbb)) \
379 && !defined(MY_CPU_SPARC) \
380 && ( \
381 (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \
382 || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) \
383 )
384
301#define Z7_BSWAP16(v) __builtin_bswap16(v) 385#define Z7_BSWAP16(v) __builtin_bswap16(v)
302#define Z7_BSWAP32(v) __builtin_bswap32(v) 386#define Z7_BSWAP32(v) __builtin_bswap32(v)
303#define Z7_BSWAP64(v) __builtin_bswap64(v) 387#define Z7_BSWAP64(v) __builtin_bswap64(v)
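Where the builtin path above is not taken, a shift-and-mask routine is what a portable byte swap looks like; a sketch (UInt32 as in 7zTypes.h), not the macro actually used elsewhere in this header:

static UInt32 BSwap32_sketch(UInt32 v)
{
  return (v << 24)
      | ((v & 0xFF00u) << 8)
      | ((v >> 8) & 0xFF00u)
      | (v >> 24);
}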
@@ -329,13 +413,48 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
329 413
330#ifdef MY_CPU_LE 414#ifdef MY_CPU_LE
331 #if defined(MY_CPU_X86_OR_AMD64) \ 415 #if defined(MY_CPU_X86_OR_AMD64) \
332 || defined(MY_CPU_ARM64) 416 || defined(MY_CPU_ARM64) \
417 || defined(MY_CPU_RISCV) && defined(__riscv_misaligned_fast) \
418 || defined(MY_CPU_E2K) && defined(__iset__) && (__iset__ >= 6)
333 #define MY_CPU_LE_UNALIGN 419 #define MY_CPU_LE_UNALIGN
334 #define MY_CPU_LE_UNALIGN_64 420 #define MY_CPU_LE_UNALIGN_64
335 #elif defined(__ARM_FEATURE_UNALIGNED) 421 #elif defined(__ARM_FEATURE_UNALIGNED)
336 /* gcc9 for 32-bit arm can use LDRD instruction that requires 32-bit alignment. 422/* === ALIGNMENT on 32-bit arm and LDRD/STRD/LDM/STM instructions.
337 So we can't use unaligned 64-bit operations. */ 423 Description of problems:
338 #define MY_CPU_LE_UNALIGN 424problem-1 : 32-bit ARM architecture:
425 multi-access (pair of 32-bit accesses) instructions (LDRD/STRD/LDM/STM)
426 require 32-bit (WORD) alignment (by 32-bit ARM architecture).
 427 So there is an "Alignment fault exception" if data is not aligned for 32-bit.
428
429problem-2 : 32-bit kernels and arm64 kernels:
 430 32-bit linux kernels provide a fixup for the "Alignment fault exception" raised by these "paired" instructions.
 431 So unaligned paired-access instructions work via the kernel's exception handler on 32-bit linux.
432
433 But some arm64 kernels do not handle these faults in 32-bit programs.
 434 So we get an unhandled exception for such instructions.
 435 Probably some newer arm64 kernels have fixed it, so unaligned
 436 paired-access instructions may work there.
437
438problem-3 : compiler for 32-bit arm:
439 Compilers use LDRD/STRD/LDM/STM for UInt64 accesses
 440 and for other cases where two 32-bit accesses are fused
441 to one multi-access instruction.
442 So UInt64 variables must be aligned for 32-bit, and each
443 32-bit access must be aligned for 32-bit, if we want to
444 avoid "Alignment fault" exception (handled or unhandled).
445
446problem-4 : performance:
447 Even if unaligned access is handled by kernel, it will be slow.
448 So if we allow unaligned access, we can get fast unaligned
449 single-access, and slow unaligned paired-access.
450
 451 We don't allow unaligned access on 32-bit arm, because the compiler
 452 generates paired-access instructions that require 32-bit alignment,
 453 and some arm64 kernels have no handler for these instructions.
 454 Also, unaligned paired-access instructions will be slow even if the kernel handles them.
455*/
456 // it must be disabled:
457 // #define MY_CPU_LE_UNALIGN
339 #endif 458 #endif
340#endif 459#endif
341 460
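Since MY_CPU_LE_UNALIGN stays disabled on 32-bit arm, a memcpy-based load is the portable escape hatch; a sketch, letting the compiler pick a legal access sequence for the target:

#include <string.h>

static UInt32 ReadUi32_any(const Byte *p)
{
  UInt32 v;
  memcpy(&v, p, sizeof(v));   /* no alignment assumption reaches codegen */
  return v;                   /* on little-endian targets this matches GetUi32 */
}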
@@ -439,6 +558,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
439 558
440#if defined(MY_CPU_BE) 559#if defined(MY_CPU_BE)
441 560
561#define GetBe64a(p) (*(const UInt64 *)(const void *)(p))
442#define GetBe32a(p) (*(const UInt32 *)(const void *)(p)) 562#define GetBe32a(p) (*(const UInt32 *)(const void *)(p))
443#define GetBe16a(p) (*(const UInt16 *)(const void *)(p)) 563#define GetBe16a(p) (*(const UInt16 *)(const void *)(p))
444#define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); } 564#define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); }
@@ -456,6 +576,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
456#define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); } 576#define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); }
457#define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); } 577#define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); }
458 578
579#define GetBe64a(p) GetBe64(p)
459#define GetBe32a(p) GetBe32(p) 580#define GetBe32a(p) GetBe32(p)
460#define GetBe16a(p) GetBe16(p) 581#define GetBe16a(p) GetBe16(p)
461#define SetBe32a(p, v) SetBe32(p, v) 582#define SetBe32a(p, v) SetBe32(p, v)
@@ -486,6 +607,7 @@ UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void);
486BoolInt CPU_IsSupported_AES(void); 607BoolInt CPU_IsSupported_AES(void);
487BoolInt CPU_IsSupported_AVX(void); 608BoolInt CPU_IsSupported_AVX(void);
488BoolInt CPU_IsSupported_AVX2(void); 609BoolInt CPU_IsSupported_AVX2(void);
610// BoolInt CPU_IsSupported_AVX512F_AVX512VL(void);
489BoolInt CPU_IsSupported_VAES_AVX2(void); 611BoolInt CPU_IsSupported_VAES_AVX2(void);
490BoolInt CPU_IsSupported_CMOV(void); 612BoolInt CPU_IsSupported_CMOV(void);
491BoolInt CPU_IsSupported_SSE(void); 613BoolInt CPU_IsSupported_SSE(void);
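A hedged sketch of how callers typically consume these probes: select an implementation once at startup and cache the pointer. The hash function names are illustrative stubs, not 7-Zip APIs:

#include "CpuArch.h"

typedef void (*HashBlockFunc)(UInt32 *state, const Byte *data);

static void HashBlock_C (UInt32 *state, const Byte *data) { (void)state; (void)data; }
static void HashBlock_HW(UInt32 *state, const Byte *data) { (void)state; (void)data; }

static HashBlockFunc SelectHashBlock(void)
{
#ifdef MY_CPU_X86_OR_AMD64
  if (CPU_IsSupported_SHA() && CPU_IsSupported_SSE41())
    return HashBlock_HW;    /* hardware path only when the CPU confirms it */
#endif
  return HashBlock_C;
}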
diff --git a/C/DllSecur.c b/C/DllSecur.c
index 02a0f97..bbbfc0a 100644
--- a/C/DllSecur.c
+++ b/C/DllSecur.c
@@ -1,5 +1,5 @@
1/* DllSecur.c -- DLL loading security 1/* DllSecur.c -- DLL loading security
22023-04-02 : Igor Pavlov : Public domain */ 22023-12-03 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -11,19 +11,7 @@
11 11
12#ifndef UNDER_CE 12#ifndef UNDER_CE
13 13
14#if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__) 14Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
15 // #pragma GCC diagnostic ignored "-Wcast-function-type"
16#endif
17
18#if defined(__clang__) || defined(__GNUC__)
19typedef void (*Z7_voidFunction)(void);
20#define MY_CAST_FUNC (Z7_voidFunction)
21#elif defined(_MSC_VER) && _MSC_VER > 1920
22#define MY_CAST_FUNC (void *)
23// #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()'
24#else
25#define MY_CAST_FUNC
26#endif
27 15
28typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD DirectoryFlags); 16typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD DirectoryFlags);
29 17
@@ -61,7 +49,7 @@ static const char * const g_Dlls =
61 if ((UInt16)GetVersion() != 6) { \ 49 if ((UInt16)GetVersion() != 6) { \
62 const \ 50 const \
63 Func_SetDefaultDllDirectories setDllDirs = \ 51 Func_SetDefaultDllDirectories setDllDirs = \
64 (Func_SetDefaultDllDirectories) MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), \ 52 (Func_SetDefaultDllDirectories) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), \
65 "SetDefaultDllDirectories"); \ 53 "SetDefaultDllDirectories"); \
66 if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; } 54 if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; }
67 55
diff --git a/C/HuffEnc.c b/C/HuffEnc.c
index 3dc1e39..996da30 100644
--- a/C/HuffEnc.c
+++ b/C/HuffEnc.c
@@ -1,5 +1,5 @@
1/* HuffEnc.c -- functions for Huffman encoding 1/* HuffEnc.c -- functions for Huffman encoding
22023-03-04 : Igor Pavlov : Public domain */ 22023-09-07 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -8,7 +8,7 @@
8 8
9#define kMaxLen 16 9#define kMaxLen 16
10#define NUM_BITS 10 10#define NUM_BITS 10
11#define MASK (((unsigned)1 << NUM_BITS) - 1) 11#define MASK ((1u << NUM_BITS) - 1)
12 12
13#define NUM_COUNTERS 64 13#define NUM_COUNTERS 64
14 14
diff --git a/C/LzFind.c b/C/LzFind.c
index 0fbd5aa..1ce4046 100644
--- a/C/LzFind.c
+++ b/C/LzFind.c
@@ -1,5 +1,5 @@
1/* LzFind.c -- Match finder for LZ algorithms 1/* LzFind.c -- Match finder for LZ algorithms
22023-03-14 : Igor Pavlov : Public domain */ 22024-03-01 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -108,9 +108,15 @@ static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr all
108 return (p->bufBase != NULL); 108 return (p->bufBase != NULL);
109} 109}
110 110
111static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } 111static const Byte *MatchFinder_GetPointerToCurrentPos(void *p)
112{
113 return ((CMatchFinder *)p)->buffer;
114}
112 115
113static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); } 116static UInt32 MatchFinder_GetNumAvailableBytes(void *p)
117{
118 return GET_AVAIL_BYTES((CMatchFinder *)p);
119}
114 120
115 121
116Z7_NO_INLINE 122Z7_NO_INLINE
@@ -571,8 +577,9 @@ void MatchFinder_Init_4(CMatchFinder *p)
571#define CYC_TO_POS_OFFSET 0 577#define CYC_TO_POS_OFFSET 0
572// #define CYC_TO_POS_OFFSET 1 // for debug 578// #define CYC_TO_POS_OFFSET 1 // for debug
573 579
574void MatchFinder_Init(CMatchFinder *p) 580void MatchFinder_Init(void *_p)
575{ 581{
582 CMatchFinder *p = (CMatchFinder *)_p;
576 MatchFinder_Init_HighHash(p); 583 MatchFinder_Init_HighHash(p);
577 MatchFinder_Init_LowHash(p); 584 MatchFinder_Init_LowHash(p);
578 MatchFinder_Init_4(p); 585 MatchFinder_Init_4(p);
@@ -607,16 +614,16 @@ void MatchFinder_Init(CMatchFinder *p)
607 #endif 614 #endif
608 #endif 615 #endif
609 616
610// #elif defined(MY_CPU_ARM_OR_ARM64) 617#elif defined(MY_CPU_ARM64) \
611#elif defined(MY_CPU_ARM64) 618 /* || (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) */
612 619
613 #if defined(__clang__) && (__clang_major__ >= 8) \ 620 #if defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
614 || defined(__GNUC__) && (__GNUC__ >= 8) 621 || defined(__GNUC__) && (__GNUC__ >= 6)
615 #define USE_LZFIND_SATUR_SUB_128 622 #define USE_LZFIND_SATUR_SUB_128
616 #ifdef MY_CPU_ARM64 623 #ifdef MY_CPU_ARM64
617 // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__(""))) 624 // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("")))
618 #else 625 #else
619 // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) 626 #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=neon")))
620 #endif 627 #endif
621 628
622 #elif defined(_MSC_VER) 629 #elif defined(_MSC_VER)
@@ -625,7 +632,7 @@ void MatchFinder_Init(CMatchFinder *p)
625 #endif 632 #endif
626 #endif 633 #endif
627 634
628 #if defined(_MSC_VER) && defined(MY_CPU_ARM64) 635 #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
629 #include <arm64_neon.h> 636 #include <arm64_neon.h>
630 #else 637 #else
631 #include <arm_neon.h> 638 #include <arm_neon.h>
@@ -1082,9 +1089,11 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const
1082 1089
1083 1090
1084#define MOVE_POS \ 1091#define MOVE_POS \
1085 ++p->cyclicBufferPos; \ 1092 p->cyclicBufferPos++; \
1086 p->buffer++; \ 1093 p->buffer++; \
1087 { const UInt32 pos1 = p->pos + 1; p->pos = pos1; if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); } 1094 { const UInt32 pos1 = p->pos + 1; \
1095 p->pos = pos1; \
1096 if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); }
1088 1097
1089#define MOVE_POS_RET MOVE_POS return distances; 1098#define MOVE_POS_RET MOVE_POS return distances;
1090 1099
@@ -1103,20 +1112,26 @@ static void MatchFinder_MovePos(CMatchFinder *p)
1103} 1112}
1104 1113
1105#define GET_MATCHES_HEADER2(minLen, ret_op) \ 1114#define GET_MATCHES_HEADER2(minLen, ret_op) \
1106 unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \ 1115 UInt32 hv; const Byte *cur; UInt32 curMatch; \
1107 lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ 1116 UInt32 lenLimit = p->lenLimit; \
1117 if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; } \
1108 cur = p->buffer; 1118 cur = p->buffer;
1109 1119
1110#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances) 1120#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances)
1111#define SKIP_HEADER(minLen) do { GET_MATCHES_HEADER2(minLen, continue) 1121#define SKIP_HEADER(minLen) \
1122 do { GET_MATCHES_HEADER2(minLen, continue)
1112 1123
1113#define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue 1124#define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, \
1125 p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue
1114 1126
1115#define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS } while (--num); 1127#define SKIP_FOOTER \
1128 SkipMatchesSpec(MF_PARAMS(p)); \
1129 MOVE_POS \
1130 } while (--num);
1116 1131
1117#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ 1132#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \
1118 distances = func(MF_PARAMS(p), \ 1133 distances = func(MF_PARAMS(p), distances, (UInt32)_maxLen_); \
1119 distances, (UInt32)_maxLen_); MOVE_POS_RET 1134 MOVE_POS_RET
1120 1135
1121#define GET_MATCHES_FOOTER_BT(_maxLen_) \ 1136#define GET_MATCHES_FOOTER_BT(_maxLen_) \
1122 GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) 1137 GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1)
@@ -1133,8 +1148,9 @@ static void MatchFinder_MovePos(CMatchFinder *p)
1133 for (; c != lim; c++) if (*(c + diff) != *c) break; \ 1148 for (; c != lim; c++) if (*(c + diff) != *c) break; \
1134 maxLen = (unsigned)(c - cur); } 1149 maxLen = (unsigned)(c - cur); }
1135 1150
1136static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) 1151static UInt32* Bt2_MatchFinder_GetMatches(void *_p, UInt32 *distances)
1137{ 1152{
1153 CMatchFinder *p = (CMatchFinder *)_p;
1138 GET_MATCHES_HEADER(2) 1154 GET_MATCHES_HEADER(2)
1139 HASH2_CALC 1155 HASH2_CALC
1140 curMatch = p->hash[hv]; 1156 curMatch = p->hash[hv];
@@ -1158,8 +1174,9 @@ UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1158 mmm = pos; 1174 mmm = pos;
1159 1175
1160 1176
1161static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) 1177static UInt32* Bt3_MatchFinder_GetMatches(void *_p, UInt32 *distances)
1162{ 1178{
1179 CMatchFinder *p = (CMatchFinder *)_p;
1163 UInt32 mmm; 1180 UInt32 mmm;
1164 UInt32 h2, d2, pos; 1181 UInt32 h2, d2, pos;
1165 unsigned maxLen; 1182 unsigned maxLen;
@@ -1199,8 +1216,9 @@ static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1199} 1216}
1200 1217
1201 1218
1202static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) 1219static UInt32* Bt4_MatchFinder_GetMatches(void *_p, UInt32 *distances)
1203{ 1220{
1221 CMatchFinder *p = (CMatchFinder *)_p;
1204 UInt32 mmm; 1222 UInt32 mmm;
1205 UInt32 h2, h3, d2, d3, pos; 1223 UInt32 h2, h3, d2, d3, pos;
1206 unsigned maxLen; 1224 unsigned maxLen;
@@ -1267,10 +1285,12 @@ static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1267} 1285}
1268 1286
1269 1287
1270static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) 1288static UInt32* Bt5_MatchFinder_GetMatches(void *_p, UInt32 *distances)
1271{ 1289{
1290 CMatchFinder *p = (CMatchFinder *)_p;
1272 UInt32 mmm; 1291 UInt32 mmm;
1273 UInt32 h2, h3, d2, d3, maxLen, pos; 1292 UInt32 h2, h3, d2, d3, pos;
1293 unsigned maxLen;
1274 UInt32 *hash; 1294 UInt32 *hash;
1275 GET_MATCHES_HEADER(5) 1295 GET_MATCHES_HEADER(5)
1276 1296
@@ -1339,8 +1359,9 @@ static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1339} 1359}
1340 1360
1341 1361
1342static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) 1362static UInt32* Hc4_MatchFinder_GetMatches(void *_p, UInt32 *distances)
1343{ 1363{
1364 CMatchFinder *p = (CMatchFinder *)_p;
1344 UInt32 mmm; 1365 UInt32 mmm;
1345 UInt32 h2, h3, d2, d3, pos; 1366 UInt32 h2, h3, d2, d3, pos;
1346 unsigned maxLen; 1367 unsigned maxLen;
@@ -1407,10 +1428,12 @@ static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1407} 1428}
1408 1429
1409 1430
1410static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) 1431static UInt32 * Hc5_MatchFinder_GetMatches(void *_p, UInt32 *distances)
1411{ 1432{
1433 CMatchFinder *p = (CMatchFinder *)_p;
1412 UInt32 mmm; 1434 UInt32 mmm;
1413 UInt32 h2, h3, d2, d3, maxLen, pos; 1435 UInt32 h2, h3, d2, d3, pos;
1436 unsigned maxLen;
1414 UInt32 *hash; 1437 UInt32 *hash;
1415 GET_MATCHES_HEADER(5) 1438 GET_MATCHES_HEADER(5)
1416 1439
@@ -1466,7 +1489,7 @@ static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1466 if (*(cur - d2 + 3) != cur[3]) 1489 if (*(cur - d2 + 3) != cur[3])
1467 break; 1490 break;
1468 UPDATE_maxLen 1491 UPDATE_maxLen
1469 distances[-2] = maxLen; 1492 distances[-2] = (UInt32)maxLen;
1470 if (maxLen == lenLimit) 1493 if (maxLen == lenLimit)
1471 { 1494 {
1472 p->son[p->cyclicBufferPos] = curMatch; 1495 p->son[p->cyclicBufferPos] = curMatch;
@@ -1489,8 +1512,9 @@ UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
1489} 1512}
1490 1513
1491 1514
1492static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num) 1515static void Bt2_MatchFinder_Skip(void *_p, UInt32 num)
1493{ 1516{
1517 CMatchFinder *p = (CMatchFinder *)_p;
1494 SKIP_HEADER(2) 1518 SKIP_HEADER(2)
1495 { 1519 {
1496 HASH2_CALC 1520 HASH2_CALC
@@ -1511,8 +1535,9 @@ void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1511 SKIP_FOOTER 1535 SKIP_FOOTER
1512} 1536}
1513 1537
1514static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) 1538static void Bt3_MatchFinder_Skip(void *_p, UInt32 num)
1515{ 1539{
1540 CMatchFinder *p = (CMatchFinder *)_p;
1516 SKIP_HEADER(3) 1541 SKIP_HEADER(3)
1517 { 1542 {
1518 UInt32 h2; 1543 UInt32 h2;
@@ -1526,8 +1551,9 @@ static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1526 SKIP_FOOTER 1551 SKIP_FOOTER
1527} 1552}
1528 1553
1529static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) 1554static void Bt4_MatchFinder_Skip(void *_p, UInt32 num)
1530{ 1555{
1556 CMatchFinder *p = (CMatchFinder *)_p;
1531 SKIP_HEADER(4) 1557 SKIP_HEADER(4)
1532 { 1558 {
1533 UInt32 h2, h3; 1559 UInt32 h2, h3;
@@ -1542,8 +1568,9 @@ static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1542 SKIP_FOOTER 1568 SKIP_FOOTER
1543} 1569}
1544 1570
1545static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) 1571static void Bt5_MatchFinder_Skip(void *_p, UInt32 num)
1546{ 1572{
1573 CMatchFinder *p = (CMatchFinder *)_p;
1547 SKIP_HEADER(5) 1574 SKIP_HEADER(5)
1548 { 1575 {
1549 UInt32 h2, h3; 1576 UInt32 h2, h3;
@@ -1589,8 +1616,9 @@ static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1589 }} while(num); \ 1616 }} while(num); \
1590 1617
1591 1618
1592static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) 1619static void Hc4_MatchFinder_Skip(void *_p, UInt32 num)
1593{ 1620{
1621 CMatchFinder *p = (CMatchFinder *)_p;
1594 HC_SKIP_HEADER(4) 1622 HC_SKIP_HEADER(4)
1595 1623
1596 UInt32 h2, h3; 1624 UInt32 h2, h3;
@@ -1604,8 +1632,9 @@ static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1604} 1632}
1605 1633
1606 1634
1607static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) 1635static void Hc5_MatchFinder_Skip(void *_p, UInt32 num)
1608{ 1636{
1637 CMatchFinder *p = (CMatchFinder *)_p;
1609 HC_SKIP_HEADER(5) 1638 HC_SKIP_HEADER(5)
1610 1639
1611 UInt32 h2, h3; 1640 UInt32 h2, h3;
@@ -1634,41 +1663,41 @@ void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
1634 1663
1635void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable) 1664void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable)
1636{ 1665{
1637 vTable->Init = (Mf_Init_Func)MatchFinder_Init; 1666 vTable->Init = MatchFinder_Init;
1638 vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes; 1667 vTable->GetNumAvailableBytes = MatchFinder_GetNumAvailableBytes;
1639 vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos; 1668 vTable->GetPointerToCurrentPos = MatchFinder_GetPointerToCurrentPos;
1640 if (!p->btMode) 1669 if (!p->btMode)
1641 { 1670 {
1642 if (p->numHashBytes <= 4) 1671 if (p->numHashBytes <= 4)
1643 { 1672 {
1644 vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches; 1673 vTable->GetMatches = Hc4_MatchFinder_GetMatches;
1645 vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip; 1674 vTable->Skip = Hc4_MatchFinder_Skip;
1646 } 1675 }
1647 else 1676 else
1648 { 1677 {
1649 vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches; 1678 vTable->GetMatches = Hc5_MatchFinder_GetMatches;
1650 vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip; 1679 vTable->Skip = Hc5_MatchFinder_Skip;
1651 } 1680 }
1652 } 1681 }
1653 else if (p->numHashBytes == 2) 1682 else if (p->numHashBytes == 2)
1654 { 1683 {
1655 vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches; 1684 vTable->GetMatches = Bt2_MatchFinder_GetMatches;
1656 vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip; 1685 vTable->Skip = Bt2_MatchFinder_Skip;
1657 } 1686 }
1658 else if (p->numHashBytes == 3) 1687 else if (p->numHashBytes == 3)
1659 { 1688 {
1660 vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches; 1689 vTable->GetMatches = Bt3_MatchFinder_GetMatches;
1661 vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip; 1690 vTable->Skip = Bt3_MatchFinder_Skip;
1662 } 1691 }
1663 else if (p->numHashBytes == 4) 1692 else if (p->numHashBytes == 4)
1664 { 1693 {
1665 vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches; 1694 vTable->GetMatches = Bt4_MatchFinder_GetMatches;
1666 vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip; 1695 vTable->Skip = Bt4_MatchFinder_Skip;
1667 } 1696 }
1668 else 1697 else
1669 { 1698 {
1670 vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches; 1699 vTable->GetMatches = Bt5_MatchFinder_GetMatches;
1671 vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip; 1700 vTable->Skip = Bt5_MatchFinder_Skip;
1672 } 1701 }
1673} 1702}
1674 1703
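The vtable edits above drop casts like (Mf_Init_Func) by making every callback take void* and cast inside. A minimal sketch of why that matters: calling through a function pointer of a different type is undefined behavior in ISO C, while a void* parameter keeps every call type-correct:

typedef struct
{
  void (*Init)(void *p);
  UInt32 (*GetNumAvailableBytes)(void *p);
} IFinderSketch;

typedef struct { UInt32 avail; } CFinderSketch;

static void FinderSketch_Init(void *_p)
{
  CFinderSketch *p = (CFinderSketch *)_p;  /* cast inside the callback */
  p->avail = 0;
}

static UInt32 FinderSketch_Avail(void *_p)
{
  return ((CFinderSketch *)_p)->avail;
}

static void FinderSketch_CreateVTable(IFinderSketch *vt)
{
  vt->Init = FinderSketch_Init;            /* exact type match: no cast */
  vt->GetNumAvailableBytes = FinderSketch_Avail;
}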
diff --git a/C/LzFind.h b/C/LzFind.h
index a3f72c9..67e8a6e 100644
--- a/C/LzFind.h
+++ b/C/LzFind.h
@@ -1,5 +1,5 @@
1/* LzFind.h -- Match finder for LZ algorithms 1/* LzFind.h -- Match finder for LZ algorithms
22023-03-04 : Igor Pavlov : Public domain */ 22024-01-22 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_LZ_FIND_H 4#ifndef ZIP7_INC_LZ_FIND_H
5#define ZIP7_INC_LZ_FIND_H 5#define ZIP7_INC_LZ_FIND_H
@@ -144,7 +144,8 @@ void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable);
144void MatchFinder_Init_LowHash(CMatchFinder *p); 144void MatchFinder_Init_LowHash(CMatchFinder *p);
145void MatchFinder_Init_HighHash(CMatchFinder *p); 145void MatchFinder_Init_HighHash(CMatchFinder *p);
146void MatchFinder_Init_4(CMatchFinder *p); 146void MatchFinder_Init_4(CMatchFinder *p);
147void MatchFinder_Init(CMatchFinder *p); 147// void MatchFinder_Init(CMatchFinder *p);
148void MatchFinder_Init(void *p);
148 149
149UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); 150UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances);
150UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); 151UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances);
diff --git a/C/LzFindMt.c b/C/LzFindMt.c
index 5253e6e..ac9d59d 100644
--- a/C/LzFindMt.c
+++ b/C/LzFindMt.c
@@ -1,5 +1,5 @@
1/* LzFindMt.c -- multithreaded Match finder for LZ algorithms 1/* LzFindMt.c -- multithreaded Match finder for LZ algorithms
22023-04-02 : Igor Pavlov : Public domain */ 22024-01-22 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -94,7 +94,7 @@ static void MtSync_Construct(CMtSync *p)
94} 94}
95 95
96 96
97#define DEBUG_BUFFER_LOCK // define it to debug lock state 97// #define DEBUG_BUFFER_LOCK // define it to debug lock state
98 98
99#ifdef DEBUG_BUFFER_LOCK 99#ifdef DEBUG_BUFFER_LOCK
100#include <stdlib.h> 100#include <stdlib.h>
@@ -877,8 +877,9 @@ SRes MatchFinderMt_InitMt(CMatchFinderMt *p)
877} 877}
878 878
879 879
880static void MatchFinderMt_Init(CMatchFinderMt *p) 880static void MatchFinderMt_Init(void *_p)
881{ 881{
882 CMatchFinderMt *p = (CMatchFinderMt *)_p;
882 CMatchFinder *mf = MF(p); 883 CMatchFinder *mf = MF(p);
883 884
884 p->btBufPos = 885 p->btBufPos =
@@ -981,8 +982,9 @@ static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p)
981 982
982 983
983 984
984static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p) 985static const Byte * MatchFinderMt_GetPointerToCurrentPos(void *_p)
985{ 986{
987 CMatchFinderMt *p = (CMatchFinderMt *)_p;
986 return p->pointerToCurPos; 988 return p->pointerToCurPos;
987} 989}
988 990
@@ -990,8 +992,9 @@ static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p)
990#define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p); 992#define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p);
991 993
992 994
993static UInt32 MatchFinderMt_GetNumAvailableBytes(CMatchFinderMt *p) 995static UInt32 MatchFinderMt_GetNumAvailableBytes(void *_p)
994{ 996{
997 CMatchFinderMt *p = (CMatchFinderMt *)_p;
995 if (p->btBufPos != p->btBufPosLimit) 998 if (p->btBufPos != p->btBufPosLimit)
996 return p->btNumAvailBytes; 999 return p->btNumAvailBytes;
997 return MatchFinderMt_GetNextBlock_Bt(p); 1000 return MatchFinderMt_GetNextBlock_Bt(p);
@@ -1243,8 +1246,9 @@ static UInt32 * MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
1243} 1246}
1244 1247
1245 1248
1246static UInt32 * MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d) 1249static UInt32 * MatchFinderMt2_GetMatches(void *_p, UInt32 *d)
1247{ 1250{
1251 CMatchFinderMt *p = (CMatchFinderMt *)_p;
1248 const UInt32 *bt = p->btBufPos; 1252 const UInt32 *bt = p->btBufPos;
1249 const UInt32 len = *bt++; 1253 const UInt32 len = *bt++;
1250 const UInt32 *btLim = bt + len; 1254 const UInt32 *btLim = bt + len;
@@ -1267,8 +1271,9 @@ static UInt32 * MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d)
1267 1271
1268 1272
1269 1273
1270static UInt32 * MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d) 1274static UInt32 * MatchFinderMt_GetMatches(void *_p, UInt32 *d)
1271{ 1275{
1276 CMatchFinderMt *p = (CMatchFinderMt *)_p;
1272 const UInt32 *bt = p->btBufPos; 1277 const UInt32 *bt = p->btBufPos;
1273 UInt32 len = *bt++; 1278 UInt32 len = *bt++;
1274 const UInt32 avail = p->btNumAvailBytes - 1; 1279 const UInt32 avail = p->btNumAvailBytes - 1;
@@ -1315,14 +1320,16 @@ static UInt32 * MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d)
1315#define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash; 1320#define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash;
1316#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0); 1321#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0);
1317 1322
1318static void MatchFinderMt0_Skip(CMatchFinderMt *p, UInt32 num) 1323static void MatchFinderMt0_Skip(void *_p, UInt32 num)
1319{ 1324{
1325 CMatchFinderMt *p = (CMatchFinderMt *)_p;
1320 SKIP_HEADER2_MT { p->btNumAvailBytes--; 1326 SKIP_HEADER2_MT { p->btNumAvailBytes--;
1321 SKIP_FOOTER_MT 1327 SKIP_FOOTER_MT
1322} 1328}
1323 1329
1324static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num) 1330static void MatchFinderMt2_Skip(void *_p, UInt32 num)
1325{ 1331{
1332 CMatchFinderMt *p = (CMatchFinderMt *)_p;
1326 SKIP_HEADER_MT(2) 1333 SKIP_HEADER_MT(2)
1327 UInt32 h2; 1334 UInt32 h2;
1328 MT_HASH2_CALC 1335 MT_HASH2_CALC
@@ -1330,8 +1337,9 @@ static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num)
1330 SKIP_FOOTER_MT 1337 SKIP_FOOTER_MT
1331} 1338}
1332 1339
1333static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num) 1340static void MatchFinderMt3_Skip(void *_p, UInt32 num)
1334{ 1341{
1342 CMatchFinderMt *p = (CMatchFinderMt *)_p;
1335 SKIP_HEADER_MT(3) 1343 SKIP_HEADER_MT(3)
1336 UInt32 h2, h3; 1344 UInt32 h2, h3;
1337 MT_HASH3_CALC 1345 MT_HASH3_CALC
@@ -1361,39 +1369,39 @@ static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num)
1361 1369
1362void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable) 1370void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable)
1363{ 1371{
1364 vTable->Init = (Mf_Init_Func)MatchFinderMt_Init; 1372 vTable->Init = MatchFinderMt_Init;
1365 vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinderMt_GetNumAvailableBytes; 1373 vTable->GetNumAvailableBytes = MatchFinderMt_GetNumAvailableBytes;
1366 vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinderMt_GetPointerToCurrentPos; 1374 vTable->GetPointerToCurrentPos = MatchFinderMt_GetPointerToCurrentPos;
1367 vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches; 1375 vTable->GetMatches = MatchFinderMt_GetMatches;
1368 1376
1369 switch (MF(p)->numHashBytes) 1377 switch (MF(p)->numHashBytes)
1370 { 1378 {
1371 case 2: 1379 case 2:
1372 p->GetHeadsFunc = GetHeads2; 1380 p->GetHeadsFunc = GetHeads2;
1373 p->MixMatchesFunc = (Mf_Mix_Matches)NULL; 1381 p->MixMatchesFunc = NULL;
1374 vTable->Skip = (Mf_Skip_Func)MatchFinderMt0_Skip; 1382 vTable->Skip = MatchFinderMt0_Skip;
1375 vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt2_GetMatches; 1383 vTable->GetMatches = MatchFinderMt2_GetMatches;
1376 break; 1384 break;
1377 case 3: 1385 case 3:
1378 p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3; 1386 p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3;
1379 p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches2; 1387 p->MixMatchesFunc = MixMatches2;
1380 vTable->Skip = (Mf_Skip_Func)MatchFinderMt2_Skip; 1388 vTable->Skip = MatchFinderMt2_Skip;
1381 break; 1389 break;
1382 case 4: 1390 case 4:
1383 p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4; 1391 p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4;
1384 1392
1385 // it's a fast inline version of GetMatches() 1393 // it's a fast inline version of GetMatches()
1386 // vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches_Bt4; 1394 // vTable->GetMatches = MatchFinderMt_GetMatches_Bt4;
1387 1395
1388 p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches3; 1396 p->MixMatchesFunc = MixMatches3;
1389 vTable->Skip = (Mf_Skip_Func)MatchFinderMt3_Skip; 1397 vTable->Skip = MatchFinderMt3_Skip;
1390 break; 1398 break;
1391 default: 1399 default:
1392 p->GetHeadsFunc = MF(p)->bigHash ? GetHeads5b : GetHeads5; 1400 p->GetHeadsFunc = MF(p)->bigHash ? GetHeads5b : GetHeads5;
1393 p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches4; 1401 p->MixMatchesFunc = MixMatches4;
1394 vTable->Skip = 1402 vTable->Skip =
1395 (Mf_Skip_Func)MatchFinderMt3_Skip; 1403 MatchFinderMt3_Skip;
1396 // (Mf_Skip_Func)MatchFinderMt4_Skip; 1404 // MatchFinderMt4_Skip;
1397 break; 1405 break;
1398 } 1406 }
1399} 1407}
diff --git a/C/LzFindMt.h b/C/LzFindMt.h
index db5923e..fcb479d 100644
--- a/C/LzFindMt.h
+++ b/C/LzFindMt.h
@@ -1,5 +1,5 @@
1/* LzFindMt.h -- multithreaded Match finder for LZ algorithms 1/* LzFindMt.h -- multithreaded Match finder for LZ algorithms
22023-03-05 : Igor Pavlov : Public domain */ 22024-01-22 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_LZ_FIND_MT_H 4#ifndef ZIP7_INC_LZ_FIND_MT_H
5#define ZIP7_INC_LZ_FIND_MT_H 5#define ZIP7_INC_LZ_FIND_MT_H
@@ -31,7 +31,10 @@ typedef struct
31 // UInt32 numBlocks_Sent; 31 // UInt32 numBlocks_Sent;
32} CMtSync; 32} CMtSync;
33 33
34typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distances); 34
35struct CMatchFinderMt_;
36
37typedef UInt32 * (*Mf_Mix_Matches)(struct CMatchFinderMt_ *p, UInt32 matchMinPos, UInt32 *distances);
35 38
36/* kMtCacheLineDummy must be >= size_of_CPU_cache_line */ 39/* kMtCacheLineDummy must be >= size_of_CPU_cache_line */
37#define kMtCacheLineDummy 128 40#define kMtCacheLineDummy 128
@@ -39,7 +42,7 @@ typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distance
39typedef void (*Mf_GetHeads)(const Byte *buffer, UInt32 pos, 42typedef void (*Mf_GetHeads)(const Byte *buffer, UInt32 pos,
40 UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc); 43 UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc);
41 44
42typedef struct 45typedef struct CMatchFinderMt_
43{ 46{
44 /* LZ */ 47 /* LZ */
45 const Byte *pointerToCurPos; 48 const Byte *pointerToCurPos;
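The Mf_Mix_Matches typedef above regains a typed parameter by forward-declaring the struct tag CMatchFinderMt_ before the typedef that references it; the struct body then repeats the tag. A compilable sketch of that forward-declaration technique (names hypothetical):

struct Widget_;                                    /* tag declared before use */
typedef int (*Widget_Handler)(struct Widget_ *w);  /* callback is fully typed */

typedef struct Widget_   /* body arrives later, matching the declared tag */
{
  int value;
  Widget_Handler handler;
} Widget;

static int Widget_GetValue(struct Widget_ *w) { return w->value; }

int main(void)
{
  Widget w = { 7, Widget_GetValue };
  return w.handler(&w) == 7 ? 0 : 1;
}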
diff --git a/C/Lzma2Dec.c b/C/Lzma2Dec.c
index 388cbc7..8bf54e4 100644
--- a/C/Lzma2Dec.c
+++ b/C/Lzma2Dec.c
@@ -1,5 +1,5 @@
1/* Lzma2Dec.c -- LZMA2 Decoder 1/* Lzma2Dec.c -- LZMA2 Decoder
22023-03-03 : Igor Pavlov : Public domain */ 22024-03-01 : Igor Pavlov : Public domain */
3 3
4/* #define SHOW_DEBUG_INFO */ 4/* #define SHOW_DEBUG_INFO */
5 5
@@ -157,8 +157,10 @@ static unsigned Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b)
157 p->decoder.prop.lp = (Byte)lp; 157 p->decoder.prop.lp = (Byte)lp;
158 return LZMA2_STATE_DATA; 158 return LZMA2_STATE_DATA;
159 } 159 }
160
161 default:
162 return LZMA2_STATE_ERROR;
160 } 163 }
161 return LZMA2_STATE_ERROR;
162} 164}
163 165
164static void LzmaDec_UpdateWithUncompressed(CLzmaDec *p, const Byte *src, SizeT size) 166static void LzmaDec_UpdateWithUncompressed(CLzmaDec *p, const Byte *src, SizeT size)
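The Lzma2Dec_UpdateState hunk folds the trailing return LZMA2_STATE_ERROR; into a default: label, so every path returns from inside the switch and no statement is left after it. The same shape, reduced to a sketch with illustrative states:

enum { STATE_PROP, STATE_DATA, STATE_ERROR };

static int UpdateState_Sketch(unsigned b)
{
  switch (b)
  {
    case 0: return STATE_PROP;
    case 1: return STATE_DATA;
    default:                 /* previously: fell through to a return below the switch */
      return STATE_ERROR;
  }
}

int main(void)
{
  return UpdateState_Sketch(2) == STATE_ERROR ? 0 : 1;
}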
diff --git a/C/LzmaEnc.c b/C/LzmaEnc.c
index 6d13cac..37b2787 100644
--- a/C/LzmaEnc.c
+++ b/C/LzmaEnc.c
@@ -1,5 +1,5 @@
1/* LzmaEnc.c -- LZMA Encoder 1/* LzmaEnc.c -- LZMA Encoder
22023-04-13: Igor Pavlov : Public domain */ 22024-01-24: Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -195,11 +195,11 @@ unsigned GetPosSlot1(UInt32 pos);
195unsigned GetPosSlot1(UInt32 pos) 195unsigned GetPosSlot1(UInt32 pos)
196{ 196{
197 unsigned res; 197 unsigned res;
198 BSR2_RET(pos, res); 198 BSR2_RET(pos, res)
199 return res; 199 return res;
200} 200}
201#define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } 201#define GetPosSlot2(pos, res) { BSR2_RET(pos, res) }
202#define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res); } 202#define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res) }
203 203
204 204
205#else // ! LZMA_LOG_BSR 205#else // ! LZMA_LOG_BSR
@@ -512,7 +512,7 @@ struct CLzmaEnc
512 COPY_ARR(d, s, posEncoders) \ 512 COPY_ARR(d, s, posEncoders) \
513 (d)->lenProbs = (s)->lenProbs; \ 513 (d)->lenProbs = (s)->lenProbs; \
514 (d)->repLenProbs = (s)->repLenProbs; \ 514 (d)->repLenProbs = (s)->repLenProbs; \
515 memcpy((d)->litProbs, (s)->litProbs, ((UInt32)0x300 << (p)->lclp) * sizeof(CLzmaProb)); 515 memcpy((d)->litProbs, (s)->litProbs, ((size_t)0x300 * sizeof(CLzmaProb)) << (p)->lclp);
516 516
517void LzmaEnc_SaveState(CLzmaEncHandle p) 517void LzmaEnc_SaveState(CLzmaEncHandle p)
518{ 518{
@@ -1040,14 +1040,14 @@ Z7_NO_INLINE static void Z7_FASTCALL LenPriceEnc_UpdateTables(
1040 UInt32 price = b; 1040 UInt32 price = b;
1041 do 1041 do
1042 { 1042 {
1043 unsigned bit = sym & 1; 1043 const unsigned bit = sym & 1;
1044 sym >>= 1; 1044 sym >>= 1;
1045 price += GET_PRICEa(probs[sym], bit); 1045 price += GET_PRICEa(probs[sym], bit);
1046 } 1046 }
1047 while (sym >= 2); 1047 while (sym >= 2);
1048 1048
1049 { 1049 {
1050 unsigned prob = probs[(size_t)i + (1 << (kLenNumHighBits - 1))]; 1050 const unsigned prob = probs[(size_t)i + (1 << (kLenNumHighBits - 1))];
1051 prices[(size_t)i * 2 ] = price + GET_PRICEa_0(prob); 1051 prices[(size_t)i * 2 ] = price + GET_PRICEa_0(prob);
1052 prices[(size_t)i * 2 + 1] = price + GET_PRICEa_1(prob); 1052 prices[(size_t)i * 2 + 1] = price + GET_PRICEa_1(prob);
1053 } 1053 }
@@ -1056,7 +1056,7 @@ Z7_NO_INLINE static void Z7_FASTCALL LenPriceEnc_UpdateTables(
1056 1056
1057 { 1057 {
1058 unsigned posState; 1058 unsigned posState;
1059 size_t num = (p->tableSize - kLenNumLowSymbols * 2) * sizeof(p->prices[0][0]); 1059 const size_t num = (p->tableSize - kLenNumLowSymbols * 2) * sizeof(p->prices[0][0]);
1060 for (posState = 1; posState < numPosStates; posState++) 1060 for (posState = 1; posState < numPosStates; posState++)
1061 memcpy(p->prices[posState] + kLenNumLowSymbols * 2, p->prices[0] + kLenNumLowSymbols * 2, num); 1061 memcpy(p->prices[posState] + kLenNumLowSymbols * 2, p->prices[0] + kLenNumLowSymbols * 2, num);
1062 } 1062 }
@@ -2696,12 +2696,12 @@ static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc,
2696 #endif 2696 #endif
2697 2697
2698 { 2698 {
2699 unsigned lclp = p->lc + p->lp; 2699 const unsigned lclp = p->lc + p->lp;
2700 if (!p->litProbs || !p->saveState.litProbs || p->lclp != lclp) 2700 if (!p->litProbs || !p->saveState.litProbs || p->lclp != lclp)
2701 { 2701 {
2702 LzmaEnc_FreeLits(p, alloc); 2702 LzmaEnc_FreeLits(p, alloc);
2703 p->litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((UInt32)0x300 << lclp) * sizeof(CLzmaProb)); 2703 p->litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((size_t)0x300 * sizeof(CLzmaProb)) << lclp);
2704 p->saveState.litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((UInt32)0x300 << lclp) * sizeof(CLzmaProb)); 2704 p->saveState.litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((size_t)0x300 * sizeof(CLzmaProb)) << lclp);
2705 if (!p->litProbs || !p->saveState.litProbs) 2705 if (!p->litProbs || !p->saveState.litProbs)
2706 { 2706 {
2707 LzmaEnc_FreeLits(p, alloc); 2707 LzmaEnc_FreeLits(p, alloc);
@@ -2802,8 +2802,8 @@ static void LzmaEnc_Init(CLzmaEnc *p)
2802 } 2802 }
2803 2803
2804 { 2804 {
2805 UInt32 num = (UInt32)0x300 << (p->lp + p->lc); 2805 const size_t num = (size_t)0x300 << (p->lp + p->lc);
2806 UInt32 k; 2806 size_t k;
2807 CLzmaProb *probs = p->litProbs; 2807 CLzmaProb *probs = p->litProbs;
2808 for (k = 0; k < num; k++) 2808 for (k = 0; k < num; k++)
2809 probs[k] = kProbInitValue; 2809 probs[k] = kProbInitValue;
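The litProbs size expressions above switch from ((UInt32)0x300 << lclp) * sizeof(CLzmaProb) to ((size_t)0x300 * sizeof(CLzmaProb)) << lclp: the arithmetic starts in size_t, so the shifted value never lives in a 32-bit intermediate before reaching the allocator. A small demonstration of the two shapes (values illustrative):

#include <stddef.h>
#include <stdio.h>

int main(void)
{
  const unsigned lclp = 4;  /* lc + lp */
  /* old shape: shift performed in 32-bit, then widened by the multiply */
  const size_t a = ((unsigned)0x300 << lclp) * sizeof(unsigned short);
  /* new shape: size_t from the first operand, so the shift is full-width */
  const size_t b = ((size_t)0x300 * sizeof(unsigned short)) << lclp;
  printf("%zu %zu\n", a, b);  /* equal for small lclp; only b stays safe as lclp grows */
  return 0;
}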
diff --git a/C/MtCoder.c b/C/MtCoder.c
index 6f58abb..03959b6 100644
--- a/C/MtCoder.c
+++ b/C/MtCoder.c
@@ -1,5 +1,5 @@
1/* MtCoder.c -- Multi-thread Coder 1/* MtCoder.c -- Multi-thread Coder
22023-04-13 : Igor Pavlov : Public domain */ 22023-09-07 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -430,7 +430,7 @@ SRes MtCoder_Code(CMtCoder *p)
430 SRes res = SZ_OK; 430 SRes res = SZ_OK;
431 431
432 if (numThreads > MTCODER_THREADS_MAX) 432 if (numThreads > MTCODER_THREADS_MAX)
433 numThreads = MTCODER_THREADS_MAX; 433 numThreads = MTCODER_THREADS_MAX;
434 numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); 434 numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads);
435 435
436 if (p->blockSize < ((UInt32)1 << 26)) numBlocksMax++; 436 if (p->blockSize < ((UInt32)1 << 26)) numBlocksMax++;
@@ -438,7 +438,7 @@ SRes MtCoder_Code(CMtCoder *p)
438 if (p->blockSize < ((UInt32)1 << 22)) numBlocksMax++; 438 if (p->blockSize < ((UInt32)1 << 22)) numBlocksMax++;
439 439
440 if (numBlocksMax > MTCODER_BLOCKS_MAX) 440 if (numBlocksMax > MTCODER_BLOCKS_MAX)
441 numBlocksMax = MTCODER_BLOCKS_MAX; 441 numBlocksMax = MTCODER_BLOCKS_MAX;
442 442
443 if (p->blockSize != p->allocatedBufsSize) 443 if (p->blockSize != p->allocatedBufsSize)
444 { 444 {
@@ -469,7 +469,7 @@ SRes MtCoder_Code(CMtCoder *p)
469 469
470 { 470 {
471 RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->readEvent)) 471 RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->readEvent))
472 RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, numBlocksMax, numBlocksMax)) 472 RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, (UInt32)numBlocksMax, (UInt32)numBlocksMax))
473 } 473 }
474 474
475 for (i = 0; i < MTCODER_BLOCKS_MAX - 1; i++) 475 for (i = 0; i < MTCODER_BLOCKS_MAX - 1; i++)
diff --git a/C/MtDec.c b/C/MtDec.c
index 7820699..96274b6 100644
--- a/C/MtDec.c
+++ b/C/MtDec.c
@@ -1,5 +1,5 @@
1/* MtDec.c -- Multi-thread Decoder 1/* MtDec.c -- Multi-thread Decoder
22023-04-02 : Igor Pavlov : Public domain */ 22024-02-20 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -809,6 +809,16 @@ static WRes MtDec_ThreadFunc2(CMtDecThread *t)
809#endif 809#endif
810 810
811 811
812typedef
813 #ifdef _WIN32
814 UINT_PTR
815 #elif 1
816 uintptr_t
817 #else
818 ptrdiff_t
819 #endif
820 MY_uintptr_t;
821
812static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) 822static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp)
813{ 823{
814 WRes res; 824 WRes res;
@@ -821,7 +831,7 @@ static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp)
821 res = MtDec_ThreadFunc2(t); 831 res = MtDec_ThreadFunc2(t);
822 p = t->mtDec; 832 p = t->mtDec;
823 if (res == 0) 833 if (res == 0)
824 return (THREAD_FUNC_RET_TYPE)(UINT_PTR)p->exitThreadWRes; 834 return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)p->exitThreadWRes;
825 { 835 {
826 // it's an unexpected situation for some threading function error 836 // it's an unexpected situation for some threading function error
827 if (p->exitThreadWRes == 0) 837 if (p->exitThreadWRes == 0)
@@ -832,7 +842,7 @@ static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp)
832 Event_Set(&p->threads[0].canWrite); 842 Event_Set(&p->threads[0].canWrite);
833 MtProgress_SetError(&p->mtProgress, MY_SRes_HRESULT_FROM_WRes(res)); 843 MtProgress_SetError(&p->mtProgress, MY_SRes_HRESULT_FROM_WRes(res));
834 } 844 }
835 return (THREAD_FUNC_RET_TYPE)(UINT_PTR)res; 845 return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)res;
836} 846}
837 847
838static Z7_NO_INLINE THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp) 848static Z7_NO_INLINE THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp)
@@ -1072,7 +1082,7 @@ SRes MtDec_Code(CMtDec *p)
1072 if (wres == 0) { wres = Event_Set(&nextThread->canWrite); 1082 if (wres == 0) { wres = Event_Set(&nextThread->canWrite);
1073 if (wres == 0) { wres = Event_Set(&nextThread->canRead); 1083 if (wres == 0) { wres = Event_Set(&nextThread->canRead);
1074 if (wres == 0) { THREAD_FUNC_RET_TYPE res = MtDec_ThreadFunc(nextThread); 1084 if (wres == 0) { THREAD_FUNC_RET_TYPE res = MtDec_ThreadFunc(nextThread);
1075 wres = (WRes)(UINT_PTR)res; 1085 wres = (WRes)(MY_uintptr_t)res;
1076 if (wres != 0) 1086 if (wres != 0)
1077 { 1087 {
1078 p->needContinue = False; 1088 p->needContinue = False;
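The MY_uintptr_t typedef added above picks a pointer-sized integer per platform (UINT_PTR on Windows, uintptr_t elsewhere) so a small WRes code can round-trip through the thread routine's pointer-sized return type without truncation warnings. A self-contained sketch of that round-trip (names illustrative):

#include <stdint.h>
#include <stdio.h>

typedef void *thread_ret_t;  /* stand-in for THREAD_FUNC_RET_TYPE */

static thread_ret_t ThreadFunc_Sketch(void)
{
  int wres = 5;                          /* e.g. a WRes error code */
  return (thread_ret_t)(uintptr_t)wres;  /* widen first, then convert */
}

int main(void)
{
  const int wres = (int)(uintptr_t)ThreadFunc_Sketch();  /* recover the code */
  printf("%d\n", wres);
  return 0;
}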
diff --git a/C/Ppmd7.c b/C/Ppmd7.c
index 6e1307e..efcc5d8 100644
--- a/C/Ppmd7.c
+++ b/C/Ppmd7.c
@@ -1,5 +1,5 @@
1/* Ppmd7.c -- PPMdH codec 1/* Ppmd7.c -- PPMdH codec
22023-04-02 : Igor Pavlov : Public domain 22023-09-07 : Igor Pavlov : Public domain
3This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ 3This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */
4 4
5#include "Precomp.h" 5#include "Precomp.h"
@@ -302,8 +302,17 @@ static void *Ppmd7_AllocUnits(CPpmd7 *p, unsigned indx)
302 302
303 303
304#define MEM_12_CPY(dest, src, num) \ 304#define MEM_12_CPY(dest, src, num) \
305 { UInt32 *d = (UInt32 *)dest; const UInt32 *z = (const UInt32 *)src; UInt32 n = num; \ 305 { UInt32 *d = (UInt32 *)(dest); \
306 do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); } 306 const UInt32 *z = (const UInt32 *)(src); \
307 unsigned n = (num); \
308 do { \
309 d[0] = z[0]; \
310 d[1] = z[1]; \
311 d[2] = z[2]; \
312 z += 3; \
313 d += 3; \
314 } while (--n); \
315 }
307 316
308 317
309/* 318/*
@@ -711,8 +720,8 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
711 if ((ns1 & 1) == 0) 720 if ((ns1 & 1) == 0)
712 { 721 {
713 /* Expand for one UNIT */ 722 /* Expand for one UNIT */
714 unsigned oldNU = ns1 >> 1; 723 const unsigned oldNU = ns1 >> 1;
715 unsigned i = U2I(oldNU); 724 const unsigned i = U2I(oldNU);
716 if (i != U2I((size_t)oldNU + 1)) 725 if (i != U2I((size_t)oldNU + 1))
717 { 726 {
718 void *ptr = Ppmd7_AllocUnits(p, i + 1); 727 void *ptr = Ppmd7_AllocUnits(p, i + 1);
@@ -731,7 +740,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
731 sum = c->Union2.SummFreq; 740 sum = c->Union2.SummFreq;
732 /* max increase of Escape_Freq is 3 here. 741 /* max increase of Escape_Freq is 3 here.
733 total increase of Union2.SummFreq for all symbols is less than 256 here */ 742 total increase of Union2.SummFreq for all symbols is less than 256 here */
734 sum += (UInt32)(2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1)); 743 sum += (UInt32)(unsigned)((2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1)));
735 /* original PPMdH uses 16-bit variable for (sum) here. 744 /* original PPMdH uses 16-bit variable for (sum) here.
736 But (sum < 0x9000). So we don't truncate (sum) to 16-bit */ 745 But (sum < 0x9000). So we don't truncate (sum) to 16-bit */
737 // sum = (UInt16)sum; 746 // sum = (UInt16)sum;
@@ -761,7 +770,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
761 // (max(s->freq) == 120), when we convert from 1-symbol into 2-symbol context 770 // (max(s->freq) == 120), when we convert from 1-symbol into 2-symbol context
762 s->Freq = (Byte)freq; 771 s->Freq = (Byte)freq;
763 // max(InitEsc = PPMD7_kExpEscape[*]) is 25. So the max(escapeFreq) is 26 here 772 // max(InitEsc = PPMD7_kExpEscape[*]) is 25. So the max(escapeFreq) is 26 here
764 sum = freq + p->InitEsc + (ns > 3); 773 sum = (UInt32)(freq + p->InitEsc + (ns > 3));
765 } 774 }
766 } 775 }
767 776
@@ -933,10 +942,10 @@ CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq)
933 p->HiBitsFlag; 942 p->HiBitsFlag;
934 { 943 {
935 // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ 944 // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ
936 unsigned summ = (UInt16)see->Summ; // & 0xFFFF 945 const unsigned summ = (UInt16)see->Summ; // & 0xFFFF
937 unsigned r = (summ >> see->Shift); 946 const unsigned r = (summ >> see->Shift);
938 see->Summ = (UInt16)(summ - r); 947 see->Summ = (UInt16)(summ - r);
939 *escFreq = r + (r == 0); 948 *escFreq = (UInt32)(r + (r == 0));
940 } 949 }
941 } 950 }
942 else 951 else
@@ -981,9 +990,9 @@ void Ppmd7_Update1_0(CPpmd7 *p)
981 CPpmd_State *s = p->FoundState; 990 CPpmd_State *s = p->FoundState;
982 CPpmd7_Context *mc = p->MinContext; 991 CPpmd7_Context *mc = p->MinContext;
983 unsigned freq = s->Freq; 992 unsigned freq = s->Freq;
984 unsigned summFreq = mc->Union2.SummFreq; 993 const unsigned summFreq = mc->Union2.SummFreq;
985 p->PrevSuccess = (2 * freq > summFreq); 994 p->PrevSuccess = (2 * freq > summFreq);
986 p->RunLength += (int)p->PrevSuccess; 995 p->RunLength += (Int32)p->PrevSuccess;
987 mc->Union2.SummFreq = (UInt16)(summFreq + 4); 996 mc->Union2.SummFreq = (UInt16)(summFreq + 4);
988 freq += 4; 997 freq += 4;
989 s->Freq = (Byte)freq; 998 s->Freq = (Byte)freq;
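Besides reflowing the body, the MEM_12_CPY rewrite above parenthesizes each macro parameter at its use site ((dest), (src), (num)), the standard guard against operator-precedence surprises when an argument is an expression. The classic illustration of why:

#include <stdio.h>

#define SQUARE_BAD(x)  (x * x)       /* SQUARE_BAD(1 + 2)  expands to 1 + 2*1 + 2 == 5 */
#define SQUARE_GOOD(x) ((x) * (x))   /* SQUARE_GOOD(1 + 2) expands to (1+2)*(1+2) == 9 */

int main(void)
{
  printf("%d %d\n", SQUARE_BAD(1 + 2), SQUARE_GOOD(1 + 2));
  return 0;
}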
diff --git a/C/Ppmd7Dec.c b/C/Ppmd7Dec.c
index 8323828..081ab89 100644
--- a/C/Ppmd7Dec.c
+++ b/C/Ppmd7Dec.c
@@ -1,5 +1,5 @@
1/* Ppmd7Dec.c -- Ppmd7z (PPMdH with 7z Range Coder) Decoder 1/* Ppmd7Dec.c -- Ppmd7z (PPMdH with 7z Range Coder) Decoder
22023-04-02 : Igor Pavlov : Public domain 22023-09-07 : Igor Pavlov : Public domain
3This code is based on: 3This code is based on:
4 PPMd var.H (2001): Dmitry Shkarin : Public domain */ 4 PPMd var.H (2001): Dmitry Shkarin : Public domain */
5 5
@@ -58,7 +58,7 @@ static void Ppmd7z_RD_Decode(CPpmd7 *p, UInt32 start, UInt32 size)
58#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) 58#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
59void Ppmd7_UpdateModel(CPpmd7 *p); 59void Ppmd7_UpdateModel(CPpmd7 *p);
60 60
61#define MASK(sym) ((unsigned char *)charMask)[sym] 61#define MASK(sym) ((Byte *)charMask)[sym]
62// Z7_FORCE_INLINE 62// Z7_FORCE_INLINE
63// static 63// static
64int Ppmd7z_DecodeSymbol(CPpmd7 *p) 64int Ppmd7z_DecodeSymbol(CPpmd7 *p)
@@ -120,8 +120,8 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
120 MASK(s->Symbol) = 0; 120 MASK(s->Symbol) = 0;
121 do 121 do
122 { 122 {
123 unsigned sym0 = s2[0].Symbol; 123 const unsigned sym0 = s2[0].Symbol;
124 unsigned sym1 = s2[1].Symbol; 124 const unsigned sym1 = s2[1].Symbol;
125 s2 += 2; 125 s2 += 2;
126 MASK(sym0) = 0; 126 MASK(sym0) = 0;
127 MASK(sym1) = 0; 127 MASK(sym1) = 0;
@@ -209,17 +209,17 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
209 unsigned num2 = num / 2; 209 unsigned num2 = num / 2;
210 210
211 num &= 1; 211 num &= 1;
212 hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); 212 hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num);
213 s += num; 213 s += num;
214 p->MinContext = mc; 214 p->MinContext = mc;
215 215
216 do 216 do
217 { 217 {
218 unsigned sym0 = s[0].Symbol; 218 const unsigned sym0 = s[0].Symbol;
219 unsigned sym1 = s[1].Symbol; 219 const unsigned sym1 = s[1].Symbol;
220 s += 2; 220 s += 2;
221 hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); 221 hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0)));
222 hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); 222 hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1)));
223 } 223 }
224 while (--num2); 224 while (--num2);
225 } 225 }
@@ -238,13 +238,13 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
238 238
239 s = Ppmd7_GetStats(p, p->MinContext); 239 s = Ppmd7_GetStats(p, p->MinContext);
240 hiCnt = count; 240 hiCnt = count;
241 // count -= s->Freq & (unsigned)(MASK(s->Symbol)); 241 // count -= s->Freq & (UInt32)(MASK(s->Symbol));
242 // if ((Int32)count >= 0) 242 // if ((Int32)count >= 0)
243 { 243 {
244 for (;;) 244 for (;;)
245 { 245 {
246 count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; 246 count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
247 // count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; 247 // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
248 } 248 }
249 } 249 }
250 s--; 250 s--;
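The MASK edits above are type cleanups ((Byte *) for the table, (UInt32) for the AND), but the underlying charMask idea deserves a sketch: a 256-byte table holding 0xFF for still-possible symbols and 0x00 for excluded ones turns "add this frequency unless the symbol was already seen" into a branch-free AND. PPMd frequencies fit in a byte, so ANDing with 0xFF preserves them (values below are illustrative):

#include <stdio.h>
#include <string.h>

int main(void)
{
  unsigned char mask[256];
  const unsigned freq[3] = { 10, 20, 30 };
  const unsigned char sym[3] = { 'a', 'b', 'a' };
  unsigned hiCnt = 0;
  unsigned i;
  memset(mask, 0xFF, sizeof(mask));
  mask['b'] = 0;  /* 'b' was already excluded in an earlier pass */
  for (i = 0; i < 3; i++)
    hiCnt += freq[i] & (unsigned)mask[sym[i]];  /* adds 10 and 30, skips 20 */
  printf("%u\n", hiCnt);  /* 40 */
  return 0;
}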
diff --git a/C/Ppmd7Enc.c b/C/Ppmd7Enc.c
index 41106ba..49cbbe6 100644
--- a/C/Ppmd7Enc.c
+++ b/C/Ppmd7Enc.c
@@ -1,5 +1,5 @@
1/* Ppmd7Enc.c -- Ppmd7z (PPMdH with 7z Range Coder) Encoder 1/* Ppmd7Enc.c -- Ppmd7z (PPMdH with 7z Range Coder) Encoder
22023-04-02 : Igor Pavlov : Public domain 22023-09-07 : Igor Pavlov : Public domain
3This code is based on: 3This code is based on:
4 PPMd var.H (2001): Dmitry Shkarin : Public domain */ 4 PPMd var.H (2001): Dmitry Shkarin : Public domain */
5 5
@@ -82,7 +82,7 @@ void Ppmd7z_Flush_RangeEnc(CPpmd7 *p)
82 82
83void Ppmd7_UpdateModel(CPpmd7 *p); 83void Ppmd7_UpdateModel(CPpmd7 *p);
84 84
85#define MASK(sym) ((unsigned char *)charMask)[sym] 85#define MASK(sym) ((Byte *)charMask)[sym]
86 86
87Z7_FORCE_INLINE 87Z7_FORCE_INLINE
88static 88static
@@ -139,8 +139,8 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
139 MASK(s->Symbol) = 0; 139 MASK(s->Symbol) = 0;
140 do 140 do
141 { 141 {
142 unsigned sym0 = s2[0].Symbol; 142 const unsigned sym0 = s2[0].Symbol;
143 unsigned sym1 = s2[1].Symbol; 143 const unsigned sym1 = s2[1].Symbol;
144 s2 += 2; 144 s2 += 2;
145 MASK(sym0) = 0; 145 MASK(sym0) = 0;
146 MASK(sym1) = 0; 146 MASK(sym1) = 0;
@@ -265,16 +265,15 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
265 if (num2 != 0) 265 if (num2 != 0)
266 { 266 {
267 s += i; 267 s += i;
268 for (;;) 268 do
269 { 269 {
270 unsigned sym0 = s[0].Symbol; 270 const unsigned sym0 = s[0].Symbol;
271 unsigned sym1 = s[1].Symbol; 271 const unsigned sym1 = s[1].Symbol;
272 s += 2; 272 s += 2;
273 sum += (s[-2].Freq & (unsigned)(MASK(sym0))); 273 sum += (s[-2].Freq & (unsigned)(MASK(sym0)));
274 sum += (s[-1].Freq & (unsigned)(MASK(sym1))); 274 sum += (s[-1].Freq & (unsigned)(MASK(sym1)));
275 if (--num2 == 0)
276 break;
277 } 275 }
276 while (--num2);
278 } 277 }
279 278
280 279
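The encoder loop above becomes a do/while so the entry precondition (num2 != 0, already guaranteed by the enclosing if) is expressed by the loop form itself rather than by a counted break. The same shape in isolation:

/* caller must guarantee num2 != 0, as the surrounding `if (num2 != 0)` does */
static unsigned SumPairs_Sketch(const unsigned *v, unsigned num2)
{
  unsigned sum = 0;
  do
  {
    sum += v[0] + v[1];
    v += 2;
  }
  while (--num2);
  return sum;
}

int main(void)
{
  const unsigned v[4] = { 1, 2, 3, 4 };
  return SumPairs_Sketch(v, 2) == 10 ? 0 : 1;
}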
diff --git a/C/Ppmd7aDec.c b/C/Ppmd7aDec.c
index 55e164e..ef86dde 100644
--- a/C/Ppmd7aDec.c
+++ b/C/Ppmd7aDec.c
@@ -1,5 +1,5 @@
1/* Ppmd7aDec.c -- PPMd7a (PPMdH) Decoder 1/* Ppmd7aDec.c -- PPMd7a (PPMdH) Decoder
22023-04-02 : Igor Pavlov : Public domain 22023-09-07 : Igor Pavlov : Public domain
3This code is based on: 3This code is based on:
4 PPMd var.H (2001): Dmitry Shkarin : Public domain 4 PPMd var.H (2001): Dmitry Shkarin : Public domain
5 Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ 5 Carryless rangecoder (1999): Dmitry Subbotin : Public domain */
@@ -58,7 +58,7 @@ typedef CPpmd7_Context * CTX_PTR;
58#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) 58#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
59void Ppmd7_UpdateModel(CPpmd7 *p); 59void Ppmd7_UpdateModel(CPpmd7 *p);
60 60
61#define MASK(sym) ((unsigned char *)charMask)[sym] 61#define MASK(sym) ((Byte *)charMask)[sym]
62 62
63 63
64int Ppmd7a_DecodeSymbol(CPpmd7 *p) 64int Ppmd7a_DecodeSymbol(CPpmd7 *p)
@@ -120,8 +120,8 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
120 MASK(s->Symbol) = 0; 120 MASK(s->Symbol) = 0;
121 do 121 do
122 { 122 {
123 unsigned sym0 = s2[0].Symbol; 123 const unsigned sym0 = s2[0].Symbol;
124 unsigned sym1 = s2[1].Symbol; 124 const unsigned sym1 = s2[1].Symbol;
125 s2 += 2; 125 s2 += 2;
126 MASK(sym0) = 0; 126 MASK(sym0) = 0;
127 MASK(sym1) = 0; 127 MASK(sym1) = 0;
@@ -209,17 +209,17 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
209 unsigned num2 = num / 2; 209 unsigned num2 = num / 2;
210 210
211 num &= 1; 211 num &= 1;
212 hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); 212 hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num);
213 s += num; 213 s += num;
214 p->MinContext = mc; 214 p->MinContext = mc;
215 215
216 do 216 do
217 { 217 {
218 unsigned sym0 = s[0].Symbol; 218 const unsigned sym0 = s[0].Symbol;
219 unsigned sym1 = s[1].Symbol; 219 const unsigned sym1 = s[1].Symbol;
220 s += 2; 220 s += 2;
221 hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); 221 hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0)));
222 hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); 222 hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1)));
223 } 223 }
224 while (--num2); 224 while (--num2);
225 } 225 }
@@ -238,13 +238,13 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
238 238
239 s = Ppmd7_GetStats(p, p->MinContext); 239 s = Ppmd7_GetStats(p, p->MinContext);
240 hiCnt = count; 240 hiCnt = count;
241 // count -= s->Freq & (unsigned)(MASK(s->Symbol)); 241 // count -= s->Freq & (UInt32)(MASK(s->Symbol));
242 // if ((Int32)count >= 0) 242 // if ((Int32)count >= 0)
243 { 243 {
244 for (;;) 244 for (;;)
245 { 245 {
246 count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; 246 count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
247 // count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; 247 // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
248 } 248 }
249 } 249 }
250 s--; 250 s--;
diff --git a/C/Ppmd8.c b/C/Ppmd8.c
index 28abf27..c6bdd86 100644
--- a/C/Ppmd8.c
+++ b/C/Ppmd8.c
@@ -1,5 +1,5 @@
1/* Ppmd8.c -- PPMdI codec 1/* Ppmd8.c -- PPMdI codec
22023-04-02 : Igor Pavlov : Public domain 22023-09-07 : Igor Pavlov : Public domain
3This code is based on PPMd var.I (2002): Dmitry Shkarin : Public domain */ 3This code is based on PPMd var.I (2002): Dmitry Shkarin : Public domain */
4 4
5#include "Precomp.h" 5#include "Precomp.h"
@@ -302,8 +302,17 @@ static void *Ppmd8_AllocUnits(CPpmd8 *p, unsigned indx)
302 302
303 303
304#define MEM_12_CPY(dest, src, num) \ 304#define MEM_12_CPY(dest, src, num) \
305 { UInt32 *d = (UInt32 *)dest; const UInt32 *z = (const UInt32 *)src; UInt32 n = num; \ 305 { UInt32 *d = (UInt32 *)(dest); \
306 do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); } 306 const UInt32 *z = (const UInt32 *)(src); \
307 unsigned n = (num); \
308 do { \
309 d[0] = z[0]; \
310 d[1] = z[1]; \
311 d[2] = z[2]; \
312 z += 3; \
313 d += 3; \
314 } while (--n); \
315 }
307 316
308 317
309 318
@@ -1215,8 +1224,8 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
1215 if ((ns1 & 1) != 0) 1224 if ((ns1 & 1) != 0)
1216 { 1225 {
1217 /* Expand for one UNIT */ 1226 /* Expand for one UNIT */
1218 unsigned oldNU = (ns1 + 1) >> 1; 1227 const unsigned oldNU = (ns1 + 1) >> 1;
1219 unsigned i = U2I(oldNU); 1228 const unsigned i = U2I(oldNU);
1220 if (i != U2I((size_t)oldNU + 1)) 1229 if (i != U2I((size_t)oldNU + 1))
1221 { 1230 {
1222 void *ptr = Ppmd8_AllocUnits(p, i + 1); 1231 void *ptr = Ppmd8_AllocUnits(p, i + 1);
@@ -1235,7 +1244,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
1235 sum = c->Union2.SummFreq; 1244 sum = c->Union2.SummFreq;
1236 /* max increase of Escape_Freq is 1 here. 1245 /* max increase of Escape_Freq is 1 here.
1237 an average increase is 1/3 per symbol */ 1246 an average increase is 1/3 per symbol */
1238 sum += (3 * ns1 + 1 < ns); 1247 sum += (UInt32)(unsigned)(3 * ns1 + 1 < ns);
1239 /* original PPMdH uses 16-bit variable for (sum) here. 1248 /* original PPMdH uses 16-bit variable for (sum) here.
1240 But (sum < ???). Do we need to truncate (sum) to 16-bit */ 1249 But (sum < ???). Do we need to truncate (sum) to 16-bit */
1241 // sum = (UInt16)sum; 1250 // sum = (UInt16)sum;
@@ -1265,7 +1274,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
1265 1274
1266 s->Freq = (Byte)freq; 1275 s->Freq = (Byte)freq;
1267 1276
1268 sum = freq + p->InitEsc + (ns > 2); // Ppmd8 (> 2) 1277 sum = (UInt32)(freq + p->InitEsc + (ns > 2)); // Ppmd8 (> 2)
1269 } 1278 }
1270 } 1279 }
1271 1280
@@ -1437,10 +1446,10 @@ CPpmd_See *Ppmd8_MakeEscFreq(CPpmd8 *p, unsigned numMasked1, UInt32 *escFreq)
1437 1446
1438 { 1447 {
1439 // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ 1448 // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ
1440 unsigned summ = (UInt16)see->Summ; // & 0xFFFF 1449 const unsigned summ = (UInt16)see->Summ; // & 0xFFFF
1441 unsigned r = (summ >> see->Shift); 1450 const unsigned r = (summ >> see->Shift);
1442 see->Summ = (UInt16)(summ - r); 1451 see->Summ = (UInt16)(summ - r);
1443 *escFreq = r + (r == 0); 1452 *escFreq = (UInt32)(r + (r == 0));
1444 } 1453 }
1445 } 1454 }
1446 else 1455 else
@@ -1485,9 +1494,9 @@ void Ppmd8_Update1_0(CPpmd8 *p)
1485 CPpmd_State *s = p->FoundState; 1494 CPpmd_State *s = p->FoundState;
1486 CPpmd8_Context *mc = p->MinContext; 1495 CPpmd8_Context *mc = p->MinContext;
1487 unsigned freq = s->Freq; 1496 unsigned freq = s->Freq;
1488 unsigned summFreq = mc->Union2.SummFreq; 1497 const unsigned summFreq = mc->Union2.SummFreq;
1489 p->PrevSuccess = (2 * freq >= summFreq); // Ppmd8 (>=) 1498 p->PrevSuccess = (2 * freq >= summFreq); // Ppmd8 (>=)
1490 p->RunLength += (int)p->PrevSuccess; 1499 p->RunLength += (Int32)p->PrevSuccess;
1491 mc->Union2.SummFreq = (UInt16)(summFreq + 4); 1500 mc->Union2.SummFreq = (UInt16)(summFreq + 4);
1492 freq += 4; 1501 freq += 4;
1493 s->Freq = (Byte)freq; 1502 s->Freq = (Byte)freq;
diff --git a/C/Ppmd8Dec.c b/C/Ppmd8Dec.c
index 72d3626..ff91167 100644
--- a/C/Ppmd8Dec.c
+++ b/C/Ppmd8Dec.c
@@ -1,5 +1,5 @@
1/* Ppmd8Dec.c -- Ppmd8 (PPMdI) Decoder 1/* Ppmd8Dec.c -- Ppmd8 (PPMdI) Decoder
22023-04-02 : Igor Pavlov : Public domain 22023-09-07 : Igor Pavlov : Public domain
3This code is based on: 3This code is based on:
4 PPMd var.I (2002): Dmitry Shkarin : Public domain 4 PPMd var.I (2002): Dmitry Shkarin : Public domain
5 Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ 5 Carryless rangecoder (1999): Dmitry Subbotin : Public domain */
@@ -58,7 +58,7 @@ static void Ppmd8_RD_Decode(CPpmd8 *p, UInt32 start, UInt32 size)
58#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) 58#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
59void Ppmd8_UpdateModel(CPpmd8 *p); 59void Ppmd8_UpdateModel(CPpmd8 *p);
60 60
61#define MASK(sym) ((unsigned char *)charMask)[sym] 61#define MASK(sym) ((Byte *)charMask)[sym]
62 62
63 63
64int Ppmd8_DecodeSymbol(CPpmd8 *p) 64int Ppmd8_DecodeSymbol(CPpmd8 *p)
@@ -120,8 +120,8 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
120 MASK(s->Symbol) = 0; 120 MASK(s->Symbol) = 0;
121 do 121 do
122 { 122 {
123 unsigned sym0 = s2[0].Symbol; 123 const unsigned sym0 = s2[0].Symbol;
124 unsigned sym1 = s2[1].Symbol; 124 const unsigned sym1 = s2[1].Symbol;
125 s2 += 2; 125 s2 += 2;
126 MASK(sym0) = 0; 126 MASK(sym0) = 0;
127 MASK(sym1) = 0; 127 MASK(sym1) = 0;
@@ -209,17 +209,17 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
209 unsigned num2 = num / 2; 209 unsigned num2 = num / 2;
210 210
211 num &= 1; 211 num &= 1;
212 hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); 212 hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num);
213 s += num; 213 s += num;
214 p->MinContext = mc; 214 p->MinContext = mc;
215 215
216 do 216 do
217 { 217 {
218 unsigned sym0 = s[0].Symbol; 218 const unsigned sym0 = s[0].Symbol;
219 unsigned sym1 = s[1].Symbol; 219 const unsigned sym1 = s[1].Symbol;
220 s += 2; 220 s += 2;
221 hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); 221 hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0)));
222 hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); 222 hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1)));
223 } 223 }
224 while (--num2); 224 while (--num2);
225 } 225 }
@@ -243,8 +243,8 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
243 { 243 {
244 for (;;) 244 for (;;)
245 { 245 {
246 count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; 246 count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
247 // count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; 247 // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
248 } 248 }
249 } 249 }
250 s--; 250 s--;
diff --git a/C/Ppmd8Enc.c b/C/Ppmd8Enc.c
index 9e29ef7..b0e34c4 100644
--- a/C/Ppmd8Enc.c
+++ b/C/Ppmd8Enc.c
@@ -1,5 +1,5 @@
1/* Ppmd8Enc.c -- Ppmd8 (PPMdI) Encoder 1/* Ppmd8Enc.c -- Ppmd8 (PPMdI) Encoder
22023-04-02 : Igor Pavlov : Public domain 22023-09-07 : Igor Pavlov : Public domain
3This code is based on: 3This code is based on:
4 PPMd var.I (2002): Dmitry Shkarin : Public domain 4 PPMd var.I (2002): Dmitry Shkarin : Public domain
5 Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ 5 Carryless rangecoder (1999): Dmitry Subbotin : Public domain */
@@ -82,7 +82,7 @@ static void Ppmd8_RangeEnc_Encode(CPpmd8 *p, UInt32 start, UInt32 size, UInt32 t
82 82
83void Ppmd8_UpdateModel(CPpmd8 *p); 83void Ppmd8_UpdateModel(CPpmd8 *p);
84 84
85#define MASK(sym) ((unsigned char *)charMask)[sym] 85#define MASK(sym) ((Byte *)charMask)[sym]
86 86
87// Z7_FORCE_INLINE 87// Z7_FORCE_INLINE
88// static 88// static
@@ -139,8 +139,8 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
139 MASK(s->Symbol) = 0; 139 MASK(s->Symbol) = 0;
140 do 140 do
141 { 141 {
142 unsigned sym0 = s2[0].Symbol; 142 const unsigned sym0 = s2[0].Symbol;
143 unsigned sym1 = s2[1].Symbol; 143 const unsigned sym1 = s2[1].Symbol;
144 s2 += 2; 144 s2 += 2;
145 MASK(sym0) = 0; 145 MASK(sym0) = 0;
146 MASK(sym1) = 0; 146 MASK(sym1) = 0;
@@ -265,16 +265,15 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
265 if (num2 != 0) 265 if (num2 != 0)
266 { 266 {
267 s += i; 267 s += i;
268 for (;;) 268 do
269 { 269 {
270 unsigned sym0 = s[0].Symbol; 270 const unsigned sym0 = s[0].Symbol;
271 unsigned sym1 = s[1].Symbol; 271 const unsigned sym1 = s[1].Symbol;
272 s += 2; 272 s += 2;
273 sum += (s[-2].Freq & (unsigned)(MASK(sym0))); 273 sum += (s[-2].Freq & (unsigned)(MASK(sym0)));
274 sum += (s[-1].Freq & (unsigned)(MASK(sym1))); 274 sum += (s[-1].Freq & (unsigned)(MASK(sym1)));
275 if (--num2 == 0)
276 break;
277 } 275 }
276 while (--num2);
278 } 277 }
279 278
280 PPMD8_CORRECT_SUM_RANGE(p, sum) 279 PPMD8_CORRECT_SUM_RANGE(p, sum)
diff --git a/C/Precomp.h b/C/Precomp.h
index 69afb2f..7747fdd 100644
--- a/C/Precomp.h
+++ b/C/Precomp.h
@@ -1,10 +1,127 @@
1/* Precomp.h -- StdAfx 1/* Precomp.h -- precompilation file
22023-04-02 : Igor Pavlov : Public domain */ 22024-01-25 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_PRECOMP_H 4#ifndef ZIP7_INC_PRECOMP_H
5#define ZIP7_INC_PRECOMP_H 5#define ZIP7_INC_PRECOMP_H
6 6
7/*
8 this file must be included before another *.h files and before <windows.h>.
9 this file is included from the following files:
10 C\*.c
11 C\Util\*\Precomp.h <- C\Util\*\*.c
12 CPP\Common\Common.h <- *\StdAfx.h <- *\*.cpp
13
14 this file can set the following macros:
15 Z7_LARGE_PAGES 1
16 Z7_LONG_PATH 1
17 Z7_WIN32_WINNT_MIN 0x0500 (or higher) : we require at least win2000+ for 7-Zip
18 _WIN32_WINNT 0x0500 (or higher)
19 WINVER _WIN32_WINNT
20 UNICODE 1
21 _UNICODE 1
22*/
23
7#include "Compiler.h" 24#include "Compiler.h"
8/* #include "7zTypes.h" */ 25
26#ifdef _MSC_VER
27// #pragma warning(disable : 4206) // nonstandard extension used : translation unit is empty
28#if _MSC_VER >= 1912
29// #pragma warning(disable : 5039) // pointer or reference to potentially throwing function passed to 'extern "C"' function under - EHc.Undefined behavior may occur if this function throws an exception.
30#endif
31#endif
32
33/*
34// for debug:
35#define UNICODE 1
36#define _UNICODE 1
37#define _WIN32_WINNT 0x0500 // win2000
38#ifndef WINVER
39 #define WINVER _WIN32_WINNT
40#endif
41*/
42
43#ifdef _WIN32
44/*
45 this "Precomp.h" file must be included before <windows.h>,
46 if we want to define _WIN32_WINNT before <windows.h>.
47*/
48
49#ifndef Z7_LARGE_PAGES
50#ifndef Z7_NO_LARGE_PAGES
51#define Z7_LARGE_PAGES 1
52#endif
53#endif
54
55#ifndef Z7_LONG_PATH
56#ifndef Z7_NO_LONG_PATH
57#define Z7_LONG_PATH 1
58#endif
59#endif
60
61#ifndef Z7_DEVICE_FILE
62#ifndef Z7_NO_DEVICE_FILE
63// #define Z7_DEVICE_FILE 1
64#endif
65#endif
66
67// we don't change macros if included after <windows.h>
68#ifndef _WINDOWS_
69
70#ifndef Z7_WIN32_WINNT_MIN
71 #if defined(_M_ARM64) || defined(__aarch64__)
72 // #define Z7_WIN32_WINNT_MIN 0x0a00 // win10
73 #define Z7_WIN32_WINNT_MIN 0x0600 // vista
74 #elif defined(_M_ARM) && defined(_M_ARMT) && defined(_M_ARM_NT)
75 // #define Z7_WIN32_WINNT_MIN 0x0602 // win8
76 #define Z7_WIN32_WINNT_MIN 0x0600 // vista
77 #elif defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) || defined(_M_IA64)
78 #define Z7_WIN32_WINNT_MIN 0x0503 // win2003
79 // #elif defined(_M_IX86) || defined(__i386__)
80 // #define Z7_WIN32_WINNT_MIN 0x0500 // win2000
81 #else // x86 and another(old) systems
82 #define Z7_WIN32_WINNT_MIN 0x0500 // win2000
83 // #define Z7_WIN32_WINNT_MIN 0x0502 // win2003 // for debug
84 #endif
85#endif // Z7_WIN32_WINNT_MIN
86
87
88#ifndef Z7_DO_NOT_DEFINE_WIN32_WINNT
89#ifdef _WIN32_WINNT
90 // #error Stop_Compiling_Bad_WIN32_WINNT
91#else
92 #ifndef Z7_NO_DEFINE_WIN32_WINNT
93Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
94 #define _WIN32_WINNT Z7_WIN32_WINNT_MIN
95Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
96 #endif
97#endif // _WIN32_WINNT
98
99#ifndef WINVER
100 #define WINVER _WIN32_WINNT
101#endif
102#endif // Z7_DO_NOT_DEFINE_WIN32_WINNT
103
104
105#ifndef _MBCS
106#ifndef Z7_NO_UNICODE
107// UNICODE and _UNICODE are used by <windows.h> and by 7-zip code.
108
109#ifndef UNICODE
110#define UNICODE 1
111#endif
112
113#ifndef _UNICODE
114Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
115#define _UNICODE 1
116Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
117#endif
118
119#endif // Z7_NO_UNICODE
120#endif // _MBCS
121#endif // _WINDOWS_
122
123// #include "7zWindows.h"
124
125#endif // _WIN32
9 126
10#endif 127#endif
diff --git a/C/Sha1.c b/C/Sha1.c
index fd6c018..4c92892 100644
--- a/C/Sha1.c
+++ b/C/Sha1.c
@@ -1,5 +1,5 @@
1/* Sha1.c -- SHA-1 Hash 1/* Sha1.c -- SHA-1 Hash
22023-04-02 : Igor Pavlov : Public domain 22024-03-01 : Igor Pavlov : Public domain
3This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */ 3This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */
4 4
5#include "Precomp.h" 5#include "Precomp.h"
@@ -15,35 +15,35 @@ This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ l
15#endif 15#endif
16 16
17#ifdef MY_CPU_X86_OR_AMD64 17#ifdef MY_CPU_X86_OR_AMD64
18 #ifdef _MSC_VER 18 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
19 #if _MSC_VER >= 1200 19 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
20 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
21 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) \
22 || defined(_MSC_VER) && (_MSC_VER >= 1200)
20 #define Z7_COMPILER_SHA1_SUPPORTED 23 #define Z7_COMPILER_SHA1_SUPPORTED
21 #endif
22 #elif defined(__clang__)
23 #if (__clang_major__ >= 8) // fix that check
24 #define Z7_COMPILER_SHA1_SUPPORTED
25 #endif
26 #elif defined(__GNUC__)
27 #if (__GNUC__ >= 8) // fix that check
28 #define Z7_COMPILER_SHA1_SUPPORTED
29 #endif
30 #elif defined(__INTEL_COMPILER)
31 #if (__INTEL_COMPILER >= 1800) // fix that check
32 #define Z7_COMPILER_SHA1_SUPPORTED
33 #endif
34 #endif 24 #endif
35#elif defined(MY_CPU_ARM_OR_ARM64) 25#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) \
36 #ifdef _MSC_VER 26 && (!defined(Z7_MSC_VER_ORIGINAL) || (_MSC_VER >= 1929) && (_MSC_FULL_VER >= 192930037))
37 #if _MSC_VER >= 1910 && _MSC_VER >= 1929 && _MSC_FULL_VER >= 192930037 27 #if defined(__ARM_FEATURE_SHA2) \
28 || defined(__ARM_FEATURE_CRYPTO)
29 #define Z7_COMPILER_SHA1_SUPPORTED
30 #else
31 #if defined(MY_CPU_ARM64) \
32 || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
33 || defined(Z7_MSC_VER_ORIGINAL)
34 #if defined(__ARM_FP) && \
35 ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
36 || defined(__GNUC__) && (__GNUC__ >= 6) \
37 ) \
38 || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
39 #if defined(MY_CPU_ARM64) \
40 || !defined(Z7_CLANG_VERSION) \
41 || defined(__ARM_NEON) && \
42 (Z7_CLANG_VERSION < 170000 || \
43 Z7_CLANG_VERSION > 170001)
38 #define Z7_COMPILER_SHA1_SUPPORTED 44 #define Z7_COMPILER_SHA1_SUPPORTED
39 #endif 45 #endif
40 #elif defined(__clang__)
41 #if (__clang_major__ >= 8) // fix that check
42 #define Z7_COMPILER_SHA1_SUPPORTED
43 #endif 46 #endif
44 #elif defined(__GNUC__)
45 #if (__GNUC__ >= 6) // fix that check
46 #define Z7_COMPILER_SHA1_SUPPORTED
47 #endif 47 #endif
48 #endif 48 #endif
49#endif 49#endif
@@ -436,7 +436,7 @@ void Sha1Prepare(void)
436 #endif 436 #endif
437 { 437 {
438 // printf("\n========== HW SHA1 ======== \n"); 438 // printf("\n========== HW SHA1 ======== \n");
439 #if defined(MY_CPU_ARM_OR_ARM64) && defined(_MSC_VER) 439 #if 0 && defined(MY_CPU_ARM_OR_ARM64) && defined(_MSC_VER)
440 /* there was bug in MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037). 440 /* there was bug in MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037).
441 It generated incorrect SHA-1 code. 441 It generated incorrect SHA-1 code.
442 21.03 : we test sha1-hardware code at runtime initialization */ 442 21.03 : we test sha1-hardware code at runtime initialization */
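Sha1.c disables the old MSVC/ARM64 workaround with #if 0 because, as the comment notes, the hardware SHA-1 code is validated at runtime since 21.03; Sha1Prepare() binds a block-update function pointer once, based on a CPU probe. A self-contained sketch of that one-time dispatch (probe stubbed out, names illustrative):

#include <stddef.h>
#include <stdio.h>

typedef void (*UpdateBlocks_Func)(unsigned state[5], const unsigned char *data, size_t numBlocks);

static void UpdateBlocks_SW(unsigned state[5], const unsigned char *data, size_t numBlocks)
{ (void)data; (void)numBlocks; state[0] ^= 1; }  /* portable path marker */

static void UpdateBlocks_HW(unsigned state[5], const unsigned char *data, size_t numBlocks)
{ (void)data; (void)numBlocks; state[0] ^= 2; }  /* hardware path marker */

static UpdateBlocks_Func g_UpdateBlocks = UpdateBlocks_SW;

static int Cpu_Has_SHA1(void) { return 0; }  /* placeholder for the real CPUID/HWCAP probe */

static void Prepare_Sketch(void)
{
  g_UpdateBlocks = Cpu_Has_SHA1() ? UpdateBlocks_HW : UpdateBlocks_SW;
}

int main(void)
{
  unsigned state[5] = { 0 };
  Prepare_Sketch();
  g_UpdateBlocks(state, NULL, 0);
  printf("%u\n", state[0]);  /* 1: the stub probe selects the software path */
  return 0;
}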
diff --git a/C/Sha1Opt.c b/C/Sha1Opt.c
index 27796aa..4e835f1 100644
--- a/C/Sha1Opt.c
+++ b/C/Sha1Opt.c
@@ -1,5 +1,5 @@
1/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions 1/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions
22023-04-02 : Igor Pavlov : Public domain */ 22024-03-01 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5#include "Compiler.h" 5#include "Compiler.h"
@@ -11,6 +11,8 @@
11#endif 11#endif
12#endif 12#endif
13 13
14// #define Z7_USE_HW_SHA_STUB // for debug
15
14#ifdef MY_CPU_X86_OR_AMD64 16#ifdef MY_CPU_X86_OR_AMD64
15 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check 17 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
16 #define USE_HW_SHA 18 #define USE_HW_SHA
@@ -32,9 +34,14 @@
32 #endif 34 #endif
33 #if (_MSC_VER >= USE_VER_MIN) 35 #if (_MSC_VER >= USE_VER_MIN)
34 #define USE_HW_SHA 36 #define USE_HW_SHA
37 #else
38 #define Z7_USE_HW_SHA_STUB
35 #endif 39 #endif
36 #endif 40 #endif
37// #endif // MY_CPU_X86_OR_AMD64 41// #endif // MY_CPU_X86_OR_AMD64
42#ifndef USE_HW_SHA
43 // #define Z7_USE_HW_SHA_STUB // for debug
44#endif
38 45
39#ifdef USE_HW_SHA 46#ifdef USE_HW_SHA
40 47
@@ -202,46 +209,124 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
202 209
203#endif // USE_HW_SHA 210#endif // USE_HW_SHA
204 211
205#elif defined(MY_CPU_ARM_OR_ARM64) 212#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) \
206 213 && (!defined(Z7_MSC_VER_ORIGINAL) || (_MSC_VER >= 1929) && (_MSC_FULL_VER >= 192930037))
207 #if defined(__clang__) 214 #if defined(__ARM_FEATURE_SHA2) \
208 #if (__clang_major__ >= 8) // fix that check 215 || defined(__ARM_FEATURE_CRYPTO)
216 #define USE_HW_SHA
217 #else
218 #if defined(MY_CPU_ARM64) \
219 || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
220 || defined(Z7_MSC_VER_ORIGINAL)
221 #if defined(__ARM_FP) && \
222 ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
223 || defined(__GNUC__) && (__GNUC__ >= 6) \
224 ) \
225 || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
226 #if defined(MY_CPU_ARM64) \
227 || !defined(Z7_CLANG_VERSION) \
228 || defined(__ARM_NEON) && \
229 (Z7_CLANG_VERSION < 170000 || \
230 Z7_CLANG_VERSION > 170001)
209 #define USE_HW_SHA 231 #define USE_HW_SHA
210 #endif 232 #endif
211 #elif defined(__GNUC__)
212 #if (__GNUC__ >= 6) // fix that check
213 #define USE_HW_SHA
214 #endif 233 #endif
215 #elif defined(_MSC_VER)
216 #if _MSC_VER >= 1910
217 #define USE_HW_SHA
218 #endif 234 #endif
219 #endif 235 #endif
220 236
221#ifdef USE_HW_SHA 237#ifdef USE_HW_SHA
222 238
223// #pragma message("=== Sha1 HW === ") 239// #pragma message("=== Sha1 HW === ")
240// __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_SHA2
224 241
225#if defined(__clang__) || defined(__GNUC__) 242#if defined(__clang__) || defined(__GNUC__)
243#if !defined(__ARM_FEATURE_SHA2) && \
244 !defined(__ARM_FEATURE_CRYPTO)
226 #ifdef MY_CPU_ARM64 245 #ifdef MY_CPU_ARM64
246#if defined(__clang__)
247 #define ATTRIB_SHA __attribute__((__target__("crypto")))
248#else
227 #define ATTRIB_SHA __attribute__((__target__("+crypto"))) 249 #define ATTRIB_SHA __attribute__((__target__("+crypto")))
250#endif
228 #else 251 #else
252#if defined(__clang__) && (__clang_major__ >= 1)
253 #define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2")))
254#else
229 #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) 255 #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
256#endif
230 #endif 257 #endif
258#endif
231#else 259#else
232 // _MSC_VER 260 // _MSC_VER
233 // for arm32 261 // for arm32
234 #define _ARM_USE_NEW_NEON_INTRINSICS 262 #define _ARM_USE_NEW_NEON_INTRINSICS
235#endif 263#endif
236 264
237#if defined(_MSC_VER) && defined(MY_CPU_ARM64) 265
266
267
268
269#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
238#include <arm64_neon.h> 270#include <arm64_neon.h>
239#else 271#else
272
273
274
275
276
277
278
279
280
281#if defined(__clang__) && __clang_major__ < 16
282#if !defined(__ARM_FEATURE_SHA2) && \
283 !defined(__ARM_FEATURE_CRYPTO)
284// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
285 Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
286 #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
287// #if defined(__clang__) && __clang_major__ < 13
288 #define __ARM_FEATURE_CRYPTO 1
289// #else
290 #define __ARM_FEATURE_SHA2 1
291// #endif
292 Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
293#endif
294#endif // clang
295
296#if defined(__clang__)
297
298#if defined(__ARM_ARCH) && __ARM_ARCH < 8
299 Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
300// #pragma message("#define __ARM_ARCH 8")
301 #undef __ARM_ARCH
302 #define __ARM_ARCH 8
303 Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
304#endif
305
306#endif // clang
307
240#include <arm_neon.h> 308#include <arm_neon.h>
309
310#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
311 defined(__ARM_FEATURE_CRYPTO) && \
312 defined(__ARM_FEATURE_SHA2)
313Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
314 #undef __ARM_FEATURE_CRYPTO
315 #undef __ARM_FEATURE_SHA2
316 #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
317Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
318// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
241#endif 319#endif
242 320
321#endif // Z7_MSC_VER_ORIGINAL
322
243typedef uint32x4_t v128; 323typedef uint32x4_t v128;
244// typedef __n128 v128; // MSVC 324// typedef __n128 v128; // MSVC
325// the bug in clang 3.8.1:
326// __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1);
327#if defined(__clang__) && (__clang_major__ <= 9)
328#pragma GCC diagnostic ignored "-Wvector-conversion"
329#endif
245 330
246#ifdef MY_CPU_BE 331#ifdef MY_CPU_BE
247 #define MY_rev32_for_LE(x) 332 #define MY_rev32_for_LE(x)
@@ -256,11 +341,11 @@ typedef uint32x4_t v128;
256 m = LOAD_128((data + (k) * 16)); \ 341 m = LOAD_128((data + (k) * 16)); \
257 MY_rev32_for_LE(m); \ 342 MY_rev32_for_LE(m); \
258 343
259#define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3); 344#define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3)
260#define SU1(dest, src) dest = vsha1su1q_u32(dest, src); 345#define SU1(dest, src) dest = vsha1su1q_u32(dest, src)
261#define C(e) abcd = vsha1cq_u32(abcd, e, t); 346#define C(e) abcd = vsha1cq_u32(abcd, e, t)
262#define P(e) abcd = vsha1pq_u32(abcd, e, t); 347#define P(e) abcd = vsha1pq_u32(abcd, e, t)
263#define M(e) abcd = vsha1mq_u32(abcd, e, t); 348#define M(e) abcd = vsha1mq_u32(abcd, e, t)
264#define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) 349#define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0))
265#define T(m, c) t = vaddq_u32(m, c) 350#define T(m, c) t = vaddq_u32(m, c)
266 351
@@ -337,16 +422,17 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
337#endif // MY_CPU_ARM_OR_ARM64 422#endif // MY_CPU_ARM_OR_ARM64
338 423
339 424
340#ifndef USE_HW_SHA 425#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
341
342// #error Stop_Compiling_UNSUPPORTED_SHA 426// #error Stop_Compiling_UNSUPPORTED_SHA
343// #include <stdlib.h> 427// #include <stdlib.h>
344 428
345// #include "Sha1.h"
346void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks);
347 429
348#pragma message("Sha1 HW-SW stub was used")
349 430
431// #include "Sha1.h"
432// #if defined(_MSC_VER)
433#pragma message("Sha1 HW-SW stub was used")
434// #endif
435void Z7_FASTCALL Sha1_UpdateBlocks (UInt32 state[5], const Byte *data, size_t numBlocks);
350void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks); 436void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
351void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks) 437void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
352{ 438{
@@ -359,7 +445,6 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
359 return; 445 return;
360 */ 446 */
361} 447}
362
363#endif 448#endif
364 449
365#undef SU0 450#undef SU0
@@ -384,3 +469,4 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
384#undef USE_HW_SHA 469#undef USE_HW_SHA
385#undef ATTRIB_SHA 470#undef ATTRIB_SHA
386#undef USE_VER_MIN 471#undef USE_VER_MIN
472#undef Z7_USE_HW_SHA_STUB
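The stub section above is now gated by Z7_USE_HW_SHA_STUB: when no hardware path compiles, the _HW entry point still exists and (per the diff's surrounding code) forwards to the portable routine, so callers and the dispatch table need no extra #ifdefs. A minimal sketch of that forwarding stub, with illustrative names:

#include <stddef.h>

/* portable implementation (body trivialized for the sketch) */
static void UpdateBlocks_Sketch(unsigned state[5], const unsigned char *data, size_t numBlocks)
{
  (void)state; (void)data; (void)numBlocks;
}

/* the _HW symbol exists unconditionally and simply forwards */
void UpdateBlocks_HW_Sketch(unsigned state[5], const unsigned char *data, size_t numBlocks);
void UpdateBlocks_HW_Sketch(unsigned state[5], const unsigned char *data, size_t numBlocks)
{
  UpdateBlocks_Sketch(state, data, numBlocks);
}

int main(void)
{
  unsigned state[5] = { 0 };
  UpdateBlocks_HW_Sketch(state, NULL, 0);
  return 0;
}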
diff --git a/C/Sha256.c b/C/Sha256.c
index 018cf6f..14d3be9 100644
--- a/C/Sha256.c
+++ b/C/Sha256.c
@@ -1,5 +1,5 @@
1/* Sha256.c -- SHA-256 Hash 1/* Sha256.c -- SHA-256 Hash
22023-04-02 : Igor Pavlov : Public domain 22024-03-01 : Igor Pavlov : Public domain
3This code is based on public domain code from Wei Dai's Crypto++ library. */ 3This code is based on public domain code from Wei Dai's Crypto++ library. */
4 4
5#include "Precomp.h" 5#include "Precomp.h"
@@ -15,35 +15,35 @@ This code is based on public domain code from Wei Dai's Crypto++ library. */
15#endif 15#endif
16 16
17#ifdef MY_CPU_X86_OR_AMD64 17#ifdef MY_CPU_X86_OR_AMD64
18 #ifdef _MSC_VER 18 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
19 #if _MSC_VER >= 1200 19 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
20 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
21 || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) \
22 || defined(_MSC_VER) && (_MSC_VER >= 1200)
20 #define Z7_COMPILER_SHA256_SUPPORTED 23 #define Z7_COMPILER_SHA256_SUPPORTED
21 #endif
22 #elif defined(__clang__)
23 #if (__clang_major__ >= 8) // fix that check
24 #define Z7_COMPILER_SHA256_SUPPORTED
25 #endif
26 #elif defined(__GNUC__)
27 #if (__GNUC__ >= 8) // fix that check
28 #define Z7_COMPILER_SHA256_SUPPORTED
29 #endif
30 #elif defined(__INTEL_COMPILER)
31 #if (__INTEL_COMPILER >= 1800) // fix that check
32 #define Z7_COMPILER_SHA256_SUPPORTED
33 #endif
34 #endif 24 #endif
35#elif defined(MY_CPU_ARM_OR_ARM64) 25#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
36 #ifdef _MSC_VER 26
37 #if _MSC_VER >= 1910 27 #if defined(__ARM_FEATURE_SHA2) \
28 || defined(__ARM_FEATURE_CRYPTO)
29 #define Z7_COMPILER_SHA256_SUPPORTED
30 #else
31 #if defined(MY_CPU_ARM64) \
32 || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
33 || defined(Z7_MSC_VER_ORIGINAL)
34 #if defined(__ARM_FP) && \
35 ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
36 || defined(__GNUC__) && (__GNUC__ >= 6) \
37 ) \
38 || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
39 #if defined(MY_CPU_ARM64) \
40 || !defined(Z7_CLANG_VERSION) \
41 || defined(__ARM_NEON) && \
42 (Z7_CLANG_VERSION < 170000 || \
43 Z7_CLANG_VERSION > 170001)
38 #define Z7_COMPILER_SHA256_SUPPORTED 44 #define Z7_COMPILER_SHA256_SUPPORTED
39 #endif 45 #endif
40 #elif defined(__clang__)
41 #if (__clang_major__ >= 8) // fix that check
42 #define Z7_COMPILER_SHA256_SUPPORTED
43 #endif 46 #endif
44 #elif defined(__GNUC__)
45 #if (__GNUC__ >= 6) // fix that check
46 #define Z7_COMPILER_SHA256_SUPPORTED
47 #endif 47 #endif
48 #endif 48 #endif
49#endif 49#endif
@@ -224,8 +224,6 @@ void Sha256_Init(CSha256 *p)
224 224
225#endif 225#endif
226 226
227void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
228
229// static 227// static
230extern MY_ALIGN(64) 228extern MY_ALIGN(64)
231const UInt32 SHA256_K_ARRAY[64]; 229const UInt32 SHA256_K_ARRAY[64];
diff --git a/C/Sha256Opt.c b/C/Sha256Opt.c
index e4465e3..eb38166 100644
--- a/C/Sha256Opt.c
+++ b/C/Sha256Opt.c
@@ -1,5 +1,5 @@
1/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions 1/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
22023-04-02 : Igor Pavlov : Public domain */ 22024-03-01 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5#include "Compiler.h" 5#include "Compiler.h"
@@ -11,6 +11,8 @@
11#endif 11#endif
12#endif 12#endif
13 13
14// #define Z7_USE_HW_SHA_STUB // for debug
15
14#ifdef MY_CPU_X86_OR_AMD64 16#ifdef MY_CPU_X86_OR_AMD64
15 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check 17 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
16 #define USE_HW_SHA 18 #define USE_HW_SHA
@@ -32,9 +34,14 @@
32 #endif 34 #endif
33 #if (_MSC_VER >= USE_VER_MIN) 35 #if (_MSC_VER >= USE_VER_MIN)
34 #define USE_HW_SHA 36 #define USE_HW_SHA
37 #else
38 #define Z7_USE_HW_SHA_STUB
35 #endif 39 #endif
36 #endif 40 #endif
37// #endif // MY_CPU_X86_OR_AMD64 41// #endif // MY_CPU_X86_OR_AMD64
42#ifndef USE_HW_SHA
43 // #define Z7_USE_HW_SHA_STUB // for debug
44#endif
38 45
39#ifdef USE_HW_SHA 46#ifdef USE_HW_SHA
40 47
@@ -202,19 +209,28 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
202 209
203#endif // USE_HW_SHA 210#endif // USE_HW_SHA
204 211
205#elif defined(MY_CPU_ARM_OR_ARM64) 212#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
206 213
207 #if defined(__clang__) 214 #if defined(__ARM_FEATURE_SHA2) \
208 #if (__clang_major__ >= 8) // fix that check 215 || defined(__ARM_FEATURE_CRYPTO)
216 #define USE_HW_SHA
217 #else
218 #if defined(MY_CPU_ARM64) \
219 || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
220 || defined(Z7_MSC_VER_ORIGINAL)
221 #if defined(__ARM_FP) && \
222 ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
223 || defined(__GNUC__) && (__GNUC__ >= 6) \
224 ) \
225 || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
226 #if defined(MY_CPU_ARM64) \
227 || !defined(Z7_CLANG_VERSION) \
228 || defined(__ARM_NEON) && \
229 (Z7_CLANG_VERSION < 170000 || \
230 Z7_CLANG_VERSION > 170001)
209 #define USE_HW_SHA 231 #define USE_HW_SHA
210 #endif 232 #endif
211 #elif defined(__GNUC__)
212 #if (__GNUC__ >= 6) // fix that check
213 #define USE_HW_SHA
214 #endif 233 #endif
215 #elif defined(_MSC_VER)
216 #if _MSC_VER >= 1910
217 #define USE_HW_SHA
218 #endif 234 #endif
219 #endif 235 #endif
220 236
@@ -222,24 +238,88 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
222 238
223// #pragma message("=== Sha256 HW === ") 239// #pragma message("=== Sha256 HW === ")
224 240
241
225#if defined(__clang__) || defined(__GNUC__) 242#if defined(__clang__) || defined(__GNUC__)
243#if !defined(__ARM_FEATURE_SHA2) && \
244 !defined(__ARM_FEATURE_CRYPTO)
226 #ifdef MY_CPU_ARM64 245 #ifdef MY_CPU_ARM64
246#if defined(__clang__)
247 #define ATTRIB_SHA __attribute__((__target__("crypto")))
248#else
227 #define ATTRIB_SHA __attribute__((__target__("+crypto"))) 249 #define ATTRIB_SHA __attribute__((__target__("+crypto")))
250#endif
228 #else 251 #else
252#if defined(__clang__) && (__clang_major__ >= 1)
253 #define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2")))
254#else
229 #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) 255 #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
256#endif
230 #endif 257 #endif
258#endif
231#else 259#else
232 // _MSC_VER 260 // _MSC_VER
233 // for arm32 261 // for arm32
234 #define _ARM_USE_NEW_NEON_INTRINSICS 262 #define _ARM_USE_NEW_NEON_INTRINSICS
235#endif 263#endif
236 264
237#if defined(_MSC_VER) && defined(MY_CPU_ARM64) 265
266
267
268
269#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
238#include <arm64_neon.h> 270#include <arm64_neon.h>
239#else 271#else
272
273
274
275
276
277
278
279
280
281#if defined(__clang__) && __clang_major__ < 16
282#if !defined(__ARM_FEATURE_SHA2) && \
283 !defined(__ARM_FEATURE_CRYPTO)
284// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
285 Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
286 #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
287// #if defined(__clang__) && __clang_major__ < 13
288 #define __ARM_FEATURE_CRYPTO 1
289// #else
290 #define __ARM_FEATURE_SHA2 1
291// #endif
292 Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
293#endif
294#endif // clang
295
296#if defined(__clang__)
297
298#if defined(__ARM_ARCH) && __ARM_ARCH < 8
299 Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
300// #pragma message("#define __ARM_ARCH 8")
301 #undef __ARM_ARCH
302 #define __ARM_ARCH 8
303 Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
304#endif
305
306#endif // clang
307
240#include <arm_neon.h> 308#include <arm_neon.h>
309
310#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
311 defined(__ARM_FEATURE_CRYPTO) && \
312 defined(__ARM_FEATURE_SHA2)
313Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
314 #undef __ARM_FEATURE_CRYPTO
315 #undef __ARM_FEATURE_SHA2
316 #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
317Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
318// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
241#endif 319#endif
242 320
321#endif // Z7_MSC_VER_ORIGINAL
322
243typedef uint32x4_t v128; 323typedef uint32x4_t v128;
244// typedef __n128 v128; // MSVC 324// typedef __n128 v128; // MSVC
245 325
@@ -316,10 +396,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
316 LOAD_SHUFFLE (m2, 2) 396 LOAD_SHUFFLE (m2, 2)
317 LOAD_SHUFFLE (m3, 3) 397 LOAD_SHUFFLE (m3, 3)
318 398
319 R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ); 399 R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
320 R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); 400 R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
321 R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); 401 R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
322 R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ); 402 R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )
323 403
324 state0 = vaddq_u32(state0, state0_save); 404 state0 = vaddq_u32(state0, state0_save);
325 state1 = vaddq_u32(state1, state1_save); 405 state1 = vaddq_u32(state1, state1_save);
@@ -337,16 +417,17 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
337#endif // MY_CPU_ARM_OR_ARM64 417#endif // MY_CPU_ARM_OR_ARM64
338 418
339 419
340#ifndef USE_HW_SHA 420#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
341
342// #error Stop_Compiling_UNSUPPORTED_SHA 421// #error Stop_Compiling_UNSUPPORTED_SHA
343// #include <stdlib.h> 422// #include <stdlib.h>
344 423// We can compile this file with another C compiler,
424// or we can compile an asm version.
425// So we can generate real code instead of this stub function.
345// #include "Sha256.h" 426// #include "Sha256.h"
346void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); 427// #if defined(_MSC_VER)
347
348#pragma message("Sha256 HW-SW stub was used") 428#pragma message("Sha256 HW-SW stub was used")
349 429// #endif
430void Z7_FASTCALL Sha256_UpdateBlocks (UInt32 state[8], const Byte *data, size_t numBlocks);
350void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); 431void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
351void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) 432void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
352{ 433{
@@ -359,7 +440,6 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
359 return; 440 return;
360 */ 441 */
361} 442}
362
363#endif 443#endif
364 444
365 445
@@ -384,3 +464,4 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
384#undef USE_HW_SHA 464#undef USE_HW_SHA
385#undef ATTRIB_SHA 465#undef ATTRIB_SHA
386#undef USE_VER_MIN 466#undef USE_VER_MIN
467#undef Z7_USE_HW_SHA_STUB
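
The reworked Z7_COMPILER_SHA256_SUPPORTED checks above only decide what the compiler can emit; picking a routine at run time is a separate step. A minimal dispatch sketch, where CPU_IsSupported_SHA() is an assumed runtime probe (CpuArch.h offers one like it) and the prototypes are repeated for self-containment:

/* Sketch only: select the HW routine when the CPU reports SHA support,
   otherwise use the portable implementation. */
#include <stddef.h>
typedef unsigned int UInt32;
typedef unsigned char Byte;

void Sha256_UpdateBlocks   (UInt32 state[8], const Byte *data, size_t numBlocks);
void Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
int CPU_IsSupported_SHA(void); /* assumption: CPUID/HWCAP probe */

static void Sha256_UpdateBlocks_Auto(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  if (CPU_IsSupported_SHA())
    Sha256_UpdateBlocks_HW(state, data, numBlocks); /* SHA-NI / ARMv8 SHA2 path */
  else
    Sha256_UpdateBlocks(state, data, numBlocks);    /* portable C path */
}
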
diff --git a/C/SwapBytes.c b/C/SwapBytes.c
index 7901bba..9290592 100644
--- a/C/SwapBytes.c
+++ b/C/SwapBytes.c
@@ -1,5 +1,5 @@
1/* SwapBytes.c -- Byte Swap conversion filter 1/* SwapBytes.c -- Byte Swap conversion filter
22023-04-07 : Igor Pavlov : Public domain */ 22024-03-01 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -305,11 +305,12 @@ ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr)
305 msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) to vbroadcastf128(m) as we want 305 msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) to vbroadcastf128(m) as we want
306 */ 306 */
307 // _mm256_broadcastsi128_si256(*mask128_ptr); 307 // _mm256_broadcastsi128_si256(*mask128_ptr);
308 /* 308#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
309 #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1) 309 #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
310 MY_mm256_set_m128i 310#else
311 */ 311 #define MY_mm256_set_m128i _mm256_set_m128i
312 _mm256_set_m128i( 312#endif
313 MY_mm256_set_m128i(
313 *(const __m128i *)mask128_ptr, 314 *(const __m128i *)mask128_ptr,
314 *(const __m128i *)mask128_ptr); 315 *(const __m128i *)mask128_ptr);
315 #endif 316 #endif
@@ -330,32 +331,59 @@ ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr)
330 331
331 332
332// compiler message: "NEON intrinsics not available with the soft-float ABI" 333// compiler message: "NEON intrinsics not available with the soft-float ABI"
333#elif defined(MY_CPU_ARM_OR_ARM64) || \ 334#elif defined(MY_CPU_ARM_OR_ARM64) \
334 (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) 335 && defined(MY_CPU_LE) \
335// #elif defined(MY_CPU_ARM64) 336 && !defined(Z7_DISABLE_ARM_NEON)
336 337
337 #if defined(__clang__) && (__clang_major__ >= 8) \ 338 #if defined(__clang__) && (__clang_major__ >= 8) \
338 || defined(__GNUC__) && (__GNUC__ >= 8) 339 || defined(__GNUC__) && (__GNUC__ >= 6)
339 #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) \ 340 #if defined(__ARM_FP)
341 #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 4)) \
340 || defined(MY_CPU_ARM64) 342 || defined(MY_CPU_ARM64)
343 #if defined(MY_CPU_ARM64) \
344 || !defined(Z7_CLANG_VERSION) \
345 || defined(__ARM_NEON)
341 #define USE_SWAP_128 346 #define USE_SWAP_128
342 #endif
343 #ifdef MY_CPU_ARM64 347 #ifdef MY_CPU_ARM64
344 // #define SWAP_ATTRIB_NEON __attribute__((__target__(""))) 348 // #define SWAP_ATTRIB_NEON __attribute__((__target__("")))
345 #else 349 #else
346 // #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) 350#if defined(Z7_CLANG_VERSION)
347 #endif 351 // #define SWAP_ATTRIB_NEON __attribute__((__target__("neon")))
352#else
353 // #pragma message("SWAP_ATTRIB_NEON __attribute__((__target__(fpu=neon))")
354 #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=neon")))
355#endif
356 #endif // MY_CPU_ARM64
357 #endif // __ARM_NEON
358 #endif // __ARM_ARCH
359 #endif // __ARM_FP
360
348 #elif defined(_MSC_VER) 361 #elif defined(_MSC_VER)
349 #if (_MSC_VER >= 1910) 362 #if (_MSC_VER >= 1910)
350 #define USE_SWAP_128 363 #define USE_SWAP_128
351 #endif 364 #endif
352 #endif 365 #endif
353 366
354 #if defined(_MSC_VER) && defined(MY_CPU_ARM64) 367 #ifdef USE_SWAP_128
368 #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
355 #include <arm64_neon.h> 369 #include <arm64_neon.h>
356 #else 370 #else
371
372/*
373#if !defined(__ARM_NEON)
374#if defined(Z7_GCC_VERSION) && (__GNUC__ < 5) \
375 || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 90201) \
376 || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 100100)
377Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
378#pragma message("#define __ARM_NEON 1")
379// #define __ARM_NEON 1
380Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
381#endif
382#endif
383*/
357 #include <arm_neon.h> 384 #include <arm_neon.h>
358 #endif 385 #endif
386 #endif
359 387
360#ifndef USE_SWAP_128 388#ifndef USE_SWAP_128
361 #define FORCE_SWAP_MODE 389 #define FORCE_SWAP_MODE
@@ -464,6 +492,13 @@ Z7_ATTRIB_NO_VECTOR \
464void Z7_FASTCALL 492void Z7_FASTCALL
465 493
466 494
495#if defined(MY_CPU_ARM_OR_ARM64)
496#if defined(__clang__)
497#pragma GCC diagnostic ignored "-Wlanguage-extension-token"
498#endif
499#endif
500
501
467#ifdef MY_CPU_64BIT 502#ifdef MY_CPU_64BIT
468 503
469#if defined(MY_CPU_ARM64) \ 504#if defined(MY_CPU_ARM64) \
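
All the SSE/AVX and NEON paths in this file compute the same transform; a scalar reference (a sketch, not the file's actual fallback loop) states the contract for the 2-byte case:

/* Reference semantics for the 2-byte swap filter: reverse the byte
   order of every 16-bit item in place. */
#include <stddef.h>
typedef unsigned short UInt16;

static void SwapBytes2_Ref(UInt16 *items, size_t numItems)
{
  size_t i;
  for (i = 0; i < numItems; i++)
  {
    const UInt16 v = items[i];
    items[i] = (UInt16)((UInt16)(v << 8) | (v >> 8));
  }
}
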
diff --git a/C/Threads.c b/C/Threads.c
index cf52bd3..464efec 100644
--- a/C/Threads.c
+++ b/C/Threads.c
@@ -1,5 +1,5 @@
1/* Threads.c -- multithreading library 1/* Threads.c -- multithreading library
22023-03-04 : Igor Pavlov : Public domain */ 22024-03-28 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -195,20 +195,19 @@ WRes CriticalSection_Init(CCriticalSection *p)
195 195
196// ---------- POSIX ---------- 196// ---------- POSIX ----------
197 197
198#ifndef __APPLE__ 198#if defined(__linux__) && !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__)
199#ifndef Z7_AFFINITY_DISABLE 199#ifndef Z7_AFFINITY_DISABLE
200// _GNU_SOURCE can be required for pthread_setaffinity_np() / CPU_ZERO / CPU_SET 200// _GNU_SOURCE can be required for pthread_setaffinity_np() / CPU_ZERO / CPU_SET
201// clang < 3.6 : unknown warning group '-Wreserved-id-macro' 201// clang < 3.6 : unknown warning group '-Wreserved-id-macro'
202// clang 3.6 - 12.01 : gives warning "macro name is a reserved identifier" 202// clang 3.6 - 12.01 : gives warning "macro name is a reserved identifier"
203// clang >= 13 : does not give a warning 203// clang >= 13 : does not give a warning
204#if !defined(_GNU_SOURCE) 204#if !defined(_GNU_SOURCE)
205 #if defined(__clang__) && (__clang_major__ >= 4) && (__clang_major__ <= 12) 205Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
206 #pragma GCC diagnostic ignored "-Wreserved-id-macro" 206// #define _GNU_SOURCE
207 #endif 207Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
208#define _GNU_SOURCE
209#endif // !defined(_GNU_SOURCE) 208#endif // !defined(_GNU_SOURCE)
210#endif // Z7_AFFINITY_DISABLE 209#endif // Z7_AFFINITY_DISABLE
211#endif // __APPLE__ 210#endif // __linux__
212 211
213#include "Threads.h" 212#include "Threads.h"
214 213
@@ -244,8 +243,9 @@ WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param,
244 { 243 {
245 if (cpuSet) 244 if (cpuSet)
246 { 245 {
247 #ifdef Z7_AFFINITY_SUPPORTED 246 // pthread_attr_setaffinity_np() is not supported when compiling against musl,
248 247 // so we check for __GLIBC__ here.
248#if defined(Z7_AFFINITY_SUPPORTED) && defined(__GLIBC__)
249 /* 249 /*
250 printf("\n affinity :"); 250 printf("\n affinity :");
251 unsigned i; 251 unsigned i;
@@ -267,7 +267,7 @@ WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param,
267 // ret2 = 267 // ret2 =
268 pthread_attr_setaffinity_np(&attr, sizeof(*cpuSet), cpuSet); 268 pthread_attr_setaffinity_np(&attr, sizeof(*cpuSet), cpuSet);
269 // if (ret2) ret = ret2; 269 // if (ret2) ret = ret2;
270 #endif 270#endif
271 } 271 }
272 272
273 ret = pthread_create(&p->_tid, &attr, func, param); 273 ret = pthread_create(&p->_tid, &attr, func, param);
@@ -369,13 +369,20 @@ WRes AutoResetEvent_CreateNotSignaled(CAutoResetEvent *p)
369 { return AutoResetEvent_Create(p, 0); } 369 { return AutoResetEvent_Create(p, 0); }
370 370
371 371
372#if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13)
373// freebsd:
374#pragma GCC diagnostic ignored "-Wthread-safety-analysis"
375#endif
376
372WRes Event_Set(CEvent *p) 377WRes Event_Set(CEvent *p)
373{ 378{
374 RINOK(pthread_mutex_lock(&p->_mutex)) 379 RINOK(pthread_mutex_lock(&p->_mutex))
375 p->_state = True; 380 p->_state = True;
376 int res1 = pthread_cond_broadcast(&p->_cond); 381 {
377 int res2 = pthread_mutex_unlock(&p->_mutex); 382 const int res1 = pthread_cond_broadcast(&p->_cond);
378 return (res2 ? res2 : res1); 383 const int res2 = pthread_mutex_unlock(&p->_mutex);
384 return (res2 ? res2 : res1);
385 }
379} 386}
380 387
381WRes Event_Reset(CEvent *p) 388WRes Event_Reset(CEvent *p)
@@ -408,8 +415,8 @@ WRes Event_Close(CEvent *p)
408 return 0; 415 return 0;
409 p->_created = 0; 416 p->_created = 0;
410 { 417 {
411 int res1 = pthread_mutex_destroy(&p->_mutex); 418 const int res1 = pthread_mutex_destroy(&p->_mutex);
412 int res2 = pthread_cond_destroy(&p->_cond); 419 const int res2 = pthread_cond_destroy(&p->_cond);
413 return (res1 ? res1 : res2); 420 return (res1 ? res1 : res2);
414 } 421 }
415} 422}
@@ -487,8 +494,8 @@ WRes Semaphore_Close(CSemaphore *p)
487 return 0; 494 return 0;
488 p->_created = 0; 495 p->_created = 0;
489 { 496 {
490 int res1 = pthread_mutex_destroy(&p->_mutex); 497 const int res1 = pthread_mutex_destroy(&p->_mutex);
491 int res2 = pthread_cond_destroy(&p->_cond); 498 const int res2 = pthread_cond_destroy(&p->_cond);
492 return (res1 ? res1 : res2); 499 return (res1 ? res1 : res2);
493 } 500 }
494} 501}
@@ -549,6 +556,18 @@ LONG InterlockedIncrement(LONG volatile *addend)
549 #endif 556 #endif
550} 557}
551 558
559LONG InterlockedDecrement(LONG volatile *addend)
560{
561 // Print("InterlockedDecrement")
562 #ifdef USE_HACK_UNSAFE_ATOMIC
563 LONG val = *addend - 1;
564 *addend = val;
565 return val;
566 #else
567 return __sync_sub_and_fetch(addend, 1);
568 #endif
569}
570
552#endif // _WIN32 571#endif // _WIN32
553 572
554WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p) 573WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p)
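
The new InterlockedDecrement mirrors the existing InterlockedIncrement, giving Windows-style reference counting on POSIX through the GCC __sync builtins. A small usage sketch, where LONG is taken to be the library's 32-bit integer typedef (an assumption) and a GCC/clang toolchain is assumed:

/* Sketch: atomic reference counting with the POSIX shims above. */
#include <stdio.h>
typedef int LONG; /* assumption: matches the library's POSIX typedef */

static LONG MyInterlockedIncrement(LONG volatile *addend)
{ return __sync_add_and_fetch(addend, 1); }
static LONG MyInterlockedDecrement(LONG volatile *addend)
{ return __sync_sub_and_fetch(addend, 1); }

int main(void)
{
  LONG volatile refs = 1;
  MyInterlockedIncrement(&refs);           /* second owner: refs == 2 */
  if (MyInterlockedDecrement(&refs) == 0)  /* last owner would free here */
    printf("released\n");
  else
    printf("still referenced: %d\n", (int)refs); /* prints 1 */
  return 0;
}
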
diff --git a/C/Threads.h b/C/Threads.h
index 4028464..c1484a2 100644
--- a/C/Threads.h
+++ b/C/Threads.h
@@ -1,5 +1,5 @@
1/* Threads.h -- multithreading library 1/* Threads.h -- multithreading library
22023-04-02 : Igor Pavlov : Public domain */ 22024-03-28 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_THREADS_H 4#ifndef ZIP7_INC_THREADS_H
5#define ZIP7_INC_THREADS_H 5#define ZIP7_INC_THREADS_H
@@ -9,12 +9,21 @@
9 9
10#else 10#else
11 11
12#include "Compiler.h"
13
14// #define Z7_AFFINITY_DISABLE
12#if defined(__linux__) 15#if defined(__linux__)
13#if !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__) 16#if !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__)
14#ifndef Z7_AFFINITY_DISABLE 17#ifndef Z7_AFFINITY_DISABLE
15#define Z7_AFFINITY_SUPPORTED 18#define Z7_AFFINITY_SUPPORTED
16// #pragma message(" ==== Z7_AFFINITY_SUPPORTED") 19// #pragma message(" ==== Z7_AFFINITY_SUPPORTED")
17// #define _GNU_SOURCE 20#if !defined(_GNU_SOURCE)
21// #pragma message(" ==== _GNU_SOURCE set")
22// we need _GNU_SOURCE for cpu_set_t when we compile against musl
23Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
24#define _GNU_SOURCE
25Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
26#endif
18#endif 27#endif
19#endif 28#endif
20#endif 29#endif
@@ -173,7 +182,7 @@ WRes CriticalSection_Init(CCriticalSection *p);
173 182
174#else // _WIN32 183#else // _WIN32
175 184
176typedef struct _CEvent 185typedef struct
177{ 186{
178 int _created; 187 int _created;
179 int _manual_reset; 188 int _manual_reset;
@@ -199,7 +208,7 @@ WRes Event_Wait(CEvent *p);
199WRes Event_Close(CEvent *p); 208WRes Event_Close(CEvent *p);
200 209
201 210
202typedef struct _CSemaphore 211typedef struct
203{ 212{
204 int _created; 213 int _created;
205 UInt32 _count; 214 UInt32 _count;
@@ -219,7 +228,7 @@ WRes Semaphore_Wait(CSemaphore *p);
219WRes Semaphore_Close(CSemaphore *p); 228WRes Semaphore_Close(CSemaphore *p);
220 229
221 230
222typedef struct _CCriticalSection 231typedef struct
223{ 232{
224 pthread_mutex_t _mutex; 233 pthread_mutex_t _mutex;
225} CCriticalSection; 234} CCriticalSection;
@@ -230,6 +239,7 @@ void CriticalSection_Enter(CCriticalSection *cs);
230void CriticalSection_Leave(CCriticalSection *cs); 239void CriticalSection_Leave(CCriticalSection *cs);
231 240
232LONG InterlockedIncrement(LONG volatile *addend); 241LONG InterlockedIncrement(LONG volatile *addend);
242LONG InterlockedDecrement(LONG volatile *addend);
233 243
234#endif // _WIN32 244#endif // _WIN32
235 245
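
Defining _GNU_SOURCE before the first glibc header, as the hunk above now does, is what makes cpu_set_t and pthread_attr_setaffinity_np() visible. A minimal sketch of the affinity setup this enables (glibc-specific; the helper name is illustrative):

/* Sketch: pin threads created with this attribute to CPU 0.
   pthread_attr_setaffinity_np() is a GNU extension and requires
   _GNU_SOURCE before <pthread.h>/<sched.h>. */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

static int Attr_PinToCpu0(pthread_attr_t *attr)
{
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(0, &set);
  return pthread_attr_setaffinity_np(attr, sizeof(set), &set);
}
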
diff --git a/C/Util/7z/7z.dsp b/C/Util/7z/7z.dsp
index 11e1b03..474c660 100644
--- a/C/Util/7z/7z.dsp
+++ b/C/Util/7z/7z.dsp
@@ -42,7 +42,7 @@ RSC=rc.exe
42# PROP Ignore_Export_Lib 0 42# PROP Ignore_Export_Lib 0
43# PROP Target_Dir "" 43# PROP Target_Dir ""
44# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c 44# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
45# ADD CPP /nologo /MD /W4 /WX /GX /O2 /D "NDEBUG" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /FAcs /Yu"Precomp.h" /FD /c 45# ADD CPP /nologo /MD /W4 /WX /GX /O2 /D "NDEBUG" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /D "Z7_EXTRACT_ONLY" /FAcs /Yu"Precomp.h" /FD /c
46# ADD BASE RSC /l 0x419 /d "NDEBUG" 46# ADD BASE RSC /l 0x419 /d "NDEBUG"
47# ADD RSC /l 0x419 /d "NDEBUG" 47# ADD RSC /l 0x419 /d "NDEBUG"
48BSC32=bscmake.exe 48BSC32=bscmake.exe
@@ -67,7 +67,7 @@ LINK32=link.exe
67# PROP Ignore_Export_Lib 0 67# PROP Ignore_Export_Lib 0
68# PROP Target_Dir "" 68# PROP Target_Dir ""
69# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c 69# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
70# ADD CPP /nologo /W4 /WX /Gm /GX /ZI /Od /D "_DEBUG" /D "_SZ_ALLOC_DEBUG2" /D "_SZ_NO_INT_64_A" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /Yu"Precomp.h" /FD /GZ /c 70# ADD CPP /nologo /W4 /WX /Gm /GX /ZI /Od /D "_DEBUG" /D "_SZ_ALLOC_DEBUG2" /D "_SZ_NO_INT_64_A" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /D "Z7_EXTRACT_ONLY" /Yu"Precomp.h" /FD /GZ /c
71# ADD BASE RSC /l 0x419 /d "_DEBUG" 71# ADD BASE RSC /l 0x419 /d "_DEBUG"
72# ADD RSC /l 0x419 /d "_DEBUG" 72# ADD RSC /l 0x419 /d "_DEBUG"
73BSC32=bscmake.exe 73BSC32=bscmake.exe
@@ -234,6 +234,10 @@ SOURCE=.\Precomp.c
234# End Source File 234# End Source File
235# Begin Source File 235# Begin Source File
236 236
237SOURCE=..\..\Precomp.h
238# End Source File
239# Begin Source File
240
237SOURCE=.\Precomp.h 241SOURCE=.\Precomp.h
238# End Source File 242# End Source File
239# End Group 243# End Group
diff --git a/C/Util/7z/7zMain.c b/C/Util/7z/7zMain.c
index 547920a..6baf979 100644
--- a/C/Util/7z/7zMain.c
+++ b/C/Util/7z/7zMain.c
@@ -1,20 +1,11 @@
1/* 7zMain.c - Test application for 7z Decoder 1/* 7zMain.c - Test application for 7z Decoder
22023-04-04 : Igor Pavlov : Public domain */ 22024-02-28 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
6#include <stdio.h> 6#include <stdio.h>
7#include <string.h> 7#include <string.h>
8 8
9#include "../../CpuArch.h"
10
11#include "../../7z.h"
12#include "../../7zAlloc.h"
13#include "../../7zBuf.h"
14#include "../../7zCrc.h"
15#include "../../7zFile.h"
16#include "../../7zVersion.h"
17
18#ifndef USE_WINDOWS_FILE 9#ifndef USE_WINDOWS_FILE
19/* for mkdir */ 10/* for mkdir */
20#ifdef _WIN32 11#ifdef _WIN32
@@ -32,6 +23,15 @@
32#endif 23#endif
33#endif 24#endif
34 25
26#include "../../7zFile.h"
27#include "../../7z.h"
28#include "../../7zAlloc.h"
29#include "../../7zBuf.h"
30#include "../../7zCrc.h"
31#include "../../7zVersion.h"
32
33#include "../../CpuArch.h"
34
35#define kInputBufSize ((size_t)1 << 18) 35#define kInputBufSize ((size_t)1 << 18)
36 36
37static const ISzAlloc g_Alloc = { SzAlloc, SzFree }; 37static const ISzAlloc g_Alloc = { SzAlloc, SzFree };
@@ -168,12 +168,12 @@ static SRes Utf16_To_Char(CBuf *buf, const UInt16 *s
168 #endif 168 #endif
169 ) 169 )
170{ 170{
171 unsigned len = 0; 171 size_t len = 0;
172 for (len = 0; s[len] != 0; len++) {} 172 for (len = 0; s[len] != 0; len++) {}
173 173
174 #ifndef MY_USE_UTF8 174 #ifndef MY_USE_UTF8
175 { 175 {
176 const unsigned size = len * 3 + 100; 176 const size_t size = len * 3 + 100;
177 if (!Buf_EnsureSize(buf, size)) 177 if (!Buf_EnsureSize(buf, size))
178 return SZ_ERROR_MEM; 178 return SZ_ERROR_MEM;
179 { 179 {
@@ -320,21 +320,20 @@ static void UIntToStr_2(char *s, unsigned value)
320// typedef long BOOL; 320// typedef long BOOL;
321typedef int BOOL; 321typedef int BOOL;
322 322
323typedef struct _FILETIME 323typedef struct
324{ 324{
325 DWORD dwLowDateTime; 325 DWORD dwLowDateTime;
326 DWORD dwHighDateTime; 326 DWORD dwHighDateTime;
327} FILETIME; 327} FILETIME;
328 328
329static LONG TIME_GetBias() 329static LONG TIME_GetBias(void)
330{ 330{
331 const time_t utc = time(NULL); 331 const time_t utc = time(NULL);
332 struct tm *ptm = localtime(&utc); 332 struct tm *ptm = localtime(&utc);
333 const int localdaylight = ptm->tm_isdst; /* daylight for local timezone */ 333 const int localdaylight = ptm->tm_isdst; /* daylight for local timezone */
334 ptm = gmtime(&utc); 334 ptm = gmtime(&utc);
335 ptm->tm_isdst = localdaylight; /* use local daylight, not that of Greenwich */ 335 ptm->tm_isdst = localdaylight; /* use local daylight, not that of Greenwich */
336 const LONG bias = (int)(mktime(ptm) - utc); 336 return (int)(mktime(ptm) - utc);
337 return bias;
338} 337}
339 338
340#define TICKS_PER_SEC 10000000 339#define TICKS_PER_SEC 10000000
@@ -359,11 +358,11 @@ static BOOL WINAPI FileTimeToLocalFileTime(const FILETIME *fileTime, FILETIME *l
359static const UInt32 kNumTimeQuantumsInSecond = 10000000; 358static const UInt32 kNumTimeQuantumsInSecond = 10000000;
360static const UInt32 kFileTimeStartYear = 1601; 359static const UInt32 kFileTimeStartYear = 1601;
361static const UInt32 kUnixTimeStartYear = 1970; 360static const UInt32 kUnixTimeStartYear = 1970;
362static const UInt64 kUnixTimeOffset =
363 (UInt64)60 * 60 * 24 * (89 + 365 * (kUnixTimeStartYear - kFileTimeStartYear));
364 361
365static Int64 Time_FileTimeToUnixTime64(const FILETIME *ft) 362static Int64 Time_FileTimeToUnixTime64(const FILETIME *ft)
366{ 363{
364 const UInt64 kUnixTimeOffset =
365 (UInt64)60 * 60 * 24 * (89 + 365 * (kUnixTimeStartYear - kFileTimeStartYear));
367 const UInt64 winTime = GET_TIME_64(ft); 366 const UInt64 winTime = GET_TIME_64(ft);
368 return (Int64)(winTime / kNumTimeQuantumsInSecond) - (Int64)kUnixTimeOffset; 367 return (Int64)(winTime / kNumTimeQuantumsInSecond) - (Int64)kUnixTimeOffset;
369} 368}
@@ -384,8 +383,10 @@ static void FILETIME_To_timespec(const FILETIME *ft, struct MY_ST_TIMESPEC *ts)
384 if (sec2 == sec) 383 if (sec2 == sec)
385 { 384 {
386 ts->tv_sec = sec2; 385 ts->tv_sec = sec2;
387 const UInt64 winTime = GET_TIME_64(ft); 386 {
388 ts->tv_nsec = (long)((winTime % 10000000) * 100); 387 const UInt64 winTime = GET_TIME_64(ft);
388 ts->tv_nsec = (long)((winTime % 10000000) * 100);
389 }
389 return; 390 return;
390 } 391 }
391 } 392 }
@@ -429,7 +430,7 @@ static void ConvertFileTimeToString(const CNtfsFileTime *nTime, char *s)
429{ 430{
430 unsigned year, mon, hour, min, sec; 431 unsigned year, mon, hour, min, sec;
431 Byte ms[] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }; 432 Byte ms[] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 };
432 unsigned t; 433 UInt32 t;
433 UInt32 v; 434 UInt32 v;
434 // UInt64 v64 = nt->Low | ((UInt64)nt->High << 32); 435 // UInt64 v64 = nt->Low | ((UInt64)nt->High << 32);
435 UInt64 v64; 436 UInt64 v64;
@@ -461,7 +462,7 @@ static void ConvertFileTimeToString(const CNtfsFileTime *nTime, char *s)
461 ms[1] = 29; 462 ms[1] = 29;
462 for (mon = 0;; mon++) 463 for (mon = 0;; mon++)
463 { 464 {
464 const unsigned d = ms[mon]; 465 const UInt32 d = ms[mon];
465 if (v < d) 466 if (v < d)
466 break; 467 break;
467 v -= d; 468 v -= d;
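
The kUnixTimeOffset constant moved inside the function, and its value is easy to verify: the 369 years from 1601 to 1970 contain 89 leap days, so the offset is 86400 * (89 + 365 * 369) = 11644473600 seconds. A quick self-check (a sketch, independent of the file above):

/* Sketch: confirm the FILETIME -> Unix epoch offset and that the Unix
   epoch itself converts to 0. */
#include <stdio.h>
typedef unsigned long long UInt64;
typedef long long Int64;

int main(void)
{
  const UInt64 kUnixTimeOffset =
      (UInt64)60 * 60 * 24 * (89 + 365 * (1970 - 1601));
  /* the Unix epoch expressed as a FILETIME tick count (100 ns units) */
  const UInt64 winTime = kUnixTimeOffset * 10000000;
  const Int64 unixTime = (Int64)(winTime / 10000000) - (Int64)kUnixTimeOffset;
  printf("offset = %llu sec, epoch -> %lld\n",
      (unsigned long long)kUnixTimeOffset, (long long)unixTime);
  /* expected: offset = 11644473600 sec, epoch -> 0 */
  return 0;
}
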
diff --git a/C/Util/7z/Precomp.h b/C/Util/7z/Precomp.h
index bc8fa21..13a41ef 100644
--- a/C/Util/7z/Precomp.h
+++ b/C/Util/7z/Precomp.h
@@ -1,14 +1,13 @@
1/* Precomp.h -- StdAfx 1/* Precomp.h -- Precomp
22023-03-04 : Igor Pavlov : Public domain */ 22024-01-23 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_PRECOMP_H 4// #ifndef ZIP7_INC_PRECOMP_LOC_H
5#define ZIP7_INC_PRECOMP_H 5// #define ZIP7_INC_PRECOMP_LOC_H
6 6
7#if defined(_MSC_VER) && _MSC_VER >= 1800 7#if defined(_MSC_VER) && _MSC_VER >= 1800
8#pragma warning(disable : 4464) // relative include path contains '..' 8#pragma warning(disable : 4464) // relative include path contains '..'
9#endif 9#endif
10 10
11#include "../../Compiler.h" 11#include "../../Precomp.h"
12#include "../../7zTypes.h"
13 12
14#endif 13// #endif
diff --git a/C/Util/7z/makefile b/C/Util/7z/makefile
index dfc560e..987f065 100644
--- a/C/Util/7z/makefile
+++ b/C/Util/7z/makefile
@@ -5,8 +5,6 @@ PROG = 7zDec.exe
5C_OBJS = \ 5C_OBJS = \
6 $O\7zAlloc.obj \ 6 $O\7zAlloc.obj \
7 $O\7zBuf.obj \ 7 $O\7zBuf.obj \
8 $O\7zCrc.obj \
9 $O\7zCrcOpt.obj \
10 $O\7zFile.obj \ 8 $O\7zFile.obj \
11 $O\7zDec.obj \ 9 $O\7zDec.obj \
12 $O\7zArcIn.obj \ 10 $O\7zArcIn.obj \
@@ -25,10 +23,14 @@ C_OBJS = \
257Z_OBJS = \ 237Z_OBJS = \
26 $O\7zMain.obj \ 24 $O\7zMain.obj \
27 25
26!include "../../../CPP/7zip/Crc.mak"
27!include "../../../CPP/7zip/LzmaDec.mak"
28
28OBJS = \ 29OBJS = \
29 $O\Precomp.obj \ 30 $O\Precomp.obj \
30 $(7Z_OBJS) \ 31 $(7Z_OBJS) \
31 $(C_OBJS) \ 32 $(C_OBJS) \
33 $(ASM_OBJS) \
32 34
33!include "../../../CPP/Build.mak" 35!include "../../../CPP/Build.mak"
34 36
@@ -38,3 +40,5 @@ $(C_OBJS): ../../$(*B).c
38 $(CCOMPL_USE) 40 $(CCOMPL_USE)
39$O\Precomp.obj: Precomp.c 41$O\Precomp.obj: Precomp.c
40 $(CCOMPL_PCH) 42 $(CCOMPL_PCH)
43
44!include "../../Asm_c.mak"
diff --git a/C/Util/7zipInstall/7zipInstall.c b/C/Util/7zipInstall/7zipInstall.c
index 7f5fd19..7d8e8c4 100644
--- a/C/Util/7zipInstall/7zipInstall.c
+++ b/C/Util/7zipInstall/7zipInstall.c
@@ -1,5 +1,5 @@
1/* 7zipInstall.c - 7-Zip Installer 1/* 7zipInstall.c - 7-Zip Installer
22023-04-04 : Igor Pavlov : Public domain */ 22024-04-05 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -11,6 +11,8 @@
11#pragma warning(disable : 4201) // nonstandard extension used : nameless struct/union 11#pragma warning(disable : 4201) // nonstandard extension used : nameless struct/union
12#endif 12#endif
13 13
14Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
15
14#ifdef Z7_OLD_WIN_SDK 16#ifdef Z7_OLD_WIN_SDK
15struct IShellView; 17struct IShellView;
16#define SHFOLDERAPI EXTERN_C DECLSPEC_IMPORT HRESULT STDAPICALLTYPE 18#define SHFOLDERAPI EXTERN_C DECLSPEC_IMPORT HRESULT STDAPICALLTYPE
@@ -41,16 +43,6 @@ typedef enum {
41 // #pragma GCC diagnostic ignored "-Wcast-function-type" 43 // #pragma GCC diagnostic ignored "-Wcast-function-type"
42#endif 44#endif
43 45
44#if defined(__clang__) || defined(__GNUC__)
45typedef void (*Z7_voidFunction)(void);
46#define MY_CAST_FUNC (Z7_voidFunction)
47#elif defined(_MSC_VER) && _MSC_VER > 1920
48#define MY_CAST_FUNC (void *)
49// #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()'
50#else
51#define MY_CAST_FUNC
52#endif
53
54#define LLL_(quote) L##quote 46#define LLL_(quote) L##quote
55#define LLL(quote) LLL_(quote) 47#define LLL(quote) LLL_(quote)
56 48
@@ -118,11 +110,13 @@ static LPCWSTR const k_Reg_Path32 = L"Path"
118 #define k_Reg_WOW_Flag 0 110 #define k_Reg_WOW_Flag 0
119#endif 111#endif
120 112
113#ifdef USE_7ZIP_32_DLL
121#ifdef _WIN64 114#ifdef _WIN64
122 #define k_Reg_WOW_Flag_32 KEY_WOW64_32KEY 115 #define k_Reg_WOW_Flag_32 KEY_WOW64_32KEY
123#else 116#else
124 #define k_Reg_WOW_Flag_32 0 117 #define k_Reg_WOW_Flag_32 0
125#endif 118#endif
119#endif
126 120
127#define k_7zip_CLSID L"{23170F69-40C1-278A-1000-000100020000}" 121#define k_7zip_CLSID L"{23170F69-40C1-278A-1000-000100020000}"
128 122
@@ -219,11 +213,11 @@ static DWORD GetFileVersion(LPCWSTR s)
219 return 0; 213 return 0;
220 } 214 }
221 215
222 my_GetFileVersionInfoSizeW = (Func_GetFileVersionInfoSizeW) MY_CAST_FUNC GetProcAddress(g_version_dll_hModule, 216 my_GetFileVersionInfoSizeW = (Func_GetFileVersionInfoSizeW) Z7_CAST_FUNC_C GetProcAddress(g_version_dll_hModule,
223 "GetFileVersionInfoSizeW"); 217 "GetFileVersionInfoSizeW");
224 my_GetFileVersionInfoW = (Func_GetFileVersionInfoW) MY_CAST_FUNC GetProcAddress(g_version_dll_hModule, 218 my_GetFileVersionInfoW = (Func_GetFileVersionInfoW) Z7_CAST_FUNC_C GetProcAddress(g_version_dll_hModule,
225 "GetFileVersionInfoW"); 219 "GetFileVersionInfoW");
226 my_VerQueryValueW = (Func_VerQueryValueW) MY_CAST_FUNC GetProcAddress(g_version_dll_hModule, 220 my_VerQueryValueW = (Func_VerQueryValueW) Z7_CAST_FUNC_C GetProcAddress(g_version_dll_hModule,
227 "VerQueryValueW"); 221 "VerQueryValueW");
228 222
229 if (!my_GetFileVersionInfoSizeW 223 if (!my_GetFileVersionInfoSizeW
@@ -1102,7 +1096,7 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance,
1102 { 1096 {
1103 BOOL isWow64 = FALSE; 1097 BOOL isWow64 = FALSE;
1104 const Func_IsWow64Process func_IsWow64Process = (Func_IsWow64Process) 1098 const Func_IsWow64Process func_IsWow64Process = (Func_IsWow64Process)
1105 MY_CAST_FUNC GetProcAddress(GetModuleHandleW(L"kernel32.dll"), 1099 Z7_CAST_FUNC_C GetProcAddress(GetModuleHandleW(L"kernel32.dll"),
1106 "IsWow64Process"); 1100 "IsWow64Process");
1107 1101
1108 if (func_IsWow64Process) 1102 if (func_IsWow64Process)
@@ -1111,7 +1105,13 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance,
1111 if (!isWow64) 1105 if (!isWow64)
1112 { 1106 {
1113 if (!g_SilentMode) 1107 if (!g_SilentMode)
1114 PrintErrorMessage("This installation requires Windows " MY_CPU_NAME, NULL); 1108 PrintErrorMessage("This installation requires Windows "
1109 #ifdef MY_CPU_X86_OR_AMD64
1110 "x64"
1111 #else
1112 "64-bit"
1113 #endif
1114 , NULL);
1115 return 1; 1115 return 1;
1116 } 1116 }
1117 } 1117 }
diff --git a/C/Util/7zipInstall/Precomp.h b/C/Util/7zipInstall/Precomp.h
index bc8fa21..13a41ef 100644
--- a/C/Util/7zipInstall/Precomp.h
+++ b/C/Util/7zipInstall/Precomp.h
@@ -1,14 +1,13 @@
1/* Precomp.h -- StdAfx 1/* Precomp.h -- Precomp
22023-03-04 : Igor Pavlov : Public domain */ 22024-01-23 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_PRECOMP_H 4// #ifndef ZIP7_INC_PRECOMP_LOC_H
5#define ZIP7_INC_PRECOMP_H 5// #define ZIP7_INC_PRECOMP_LOC_H
6 6
7#if defined(_MSC_VER) && _MSC_VER >= 1800 7#if defined(_MSC_VER) && _MSC_VER >= 1800
8#pragma warning(disable : 4464) // relative include path contains '..' 8#pragma warning(disable : 4464) // relative include path contains '..'
9#endif 9#endif
10 10
11#include "../../Compiler.h" 11#include "../../Precomp.h"
12#include "../../7zTypes.h"
13 12
14#endif 13// #endif
diff --git a/C/Util/7zipInstall/makefile b/C/Util/7zipInstall/makefile
index 18e2783..424bd6c 100644
--- a/C/Util/7zipInstall/makefile
+++ b/C/Util/7zipInstall/makefile
@@ -19,9 +19,6 @@ C_OBJS = \
19 $O\7zAlloc.obj \ 19 $O\7zAlloc.obj \
20 $O\7zArcIn.obj \ 20 $O\7zArcIn.obj \
21 $O\7zBuf.obj \ 21 $O\7zBuf.obj \
22 $O\7zBuf2.obj \
23 $O\7zCrc.obj \
24 $O\7zCrcOpt.obj \
25 $O\7zFile.obj \ 22 $O\7zFile.obj \
26 $O\7zDec.obj \ 23 $O\7zDec.obj \
27 $O\7zStream.obj \ 24 $O\7zStream.obj \
@@ -34,11 +31,17 @@ C_OBJS = \
34OBJS = \ 31OBJS = \
35 $(MAIN_OBJS) \ 32 $(MAIN_OBJS) \
36 $(C_OBJS) \ 33 $(C_OBJS) \
34 $(ASM_OBJS) \
37 $O\resource.res 35 $O\resource.res
38 36
37!include "../../../CPP/7zip/Crc.mak"
38# !include "../../../CPP/7zip/LzmaDec.mak"
39
39!include "../../../CPP/Build.mak" 40!include "../../../CPP/Build.mak"
40 41
41$(MAIN_OBJS): $(*B).c 42$(MAIN_OBJS): $(*B).c
42 $(COMPL_O1) 43 $(COMPL_O1)
43$(C_OBJS): ../../$(*B).c 44$(C_OBJS): ../../$(*B).c
44 $(COMPL_O1) 45 $(COMPL_O1)
46
47!include "../../Asm_c.mak"
diff --git a/C/Util/7zipInstall/resource.rc b/C/Util/7zipInstall/resource.rc
index df6474e..40ed580 100644
--- a/C/Util/7zipInstall/resource.rc
+++ b/C/Util/7zipInstall/resource.rc
@@ -1,5 +1,6 @@
1#include <winnt.h> 1#include <windows.h>
2#include <WinUser.h> 2// #include <winnt.h>
3// #include <WinUser.h>
3#include <CommCtrl.h> 4#include <CommCtrl.h>
4 5
5#define USE_COPYRIGHT_CR 6#define USE_COPYRIGHT_CR
diff --git a/C/Util/7zipUninstall/7zipUninstall.c b/C/Util/7zipUninstall/7zipUninstall.c
index 8bc18b3..e7051e2 100644
--- a/C/Util/7zipUninstall/7zipUninstall.c
+++ b/C/Util/7zipUninstall/7zipUninstall.c
@@ -1,10 +1,11 @@
1/* 7zipUninstall.c - 7-Zip Uninstaller 1/* 7zipUninstall.c - 7-Zip Uninstaller
22022-07-15 : Igor Pavlov : Public domain */ 22024-03-21 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
6// #define SZ_ERROR_ABORT 100 6// #define SZ_ERROR_ABORT 100
7 7
8#include "../../7zTypes.h"
8#include "../../7zWindows.h" 9#include "../../7zWindows.h"
9 10
10#if defined(_MSC_VER) && _MSC_VER < 1600 11#if defined(_MSC_VER) && _MSC_VER < 1600
@@ -31,16 +32,7 @@ typedef enum {
31 32
32#include "resource.h" 33#include "resource.h"
33 34
34#if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__)
35 // #pragma GCC diagnostic ignored "-Wcast-function-type"
36#endif
37 35
38#if defined(_MSC_VER) && _MSC_VER > 1920
39#define MY_CAST_FUNC (void *)
40// #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()'
41#else
42#define MY_CAST_FUNC
43#endif
44 36
45 37
46#define LLL_(quote) L##quote 38#define LLL_(quote) L##quote
@@ -101,11 +93,13 @@ static LPCWSTR const k_Reg_Path32 = L"Path"
101 #define k_Reg_WOW_Flag 0 93 #define k_Reg_WOW_Flag 0
102#endif 94#endif
103 95
96#ifdef USE_7ZIP_32_DLL
104#ifdef _WIN64 97#ifdef _WIN64
105 #define k_Reg_WOW_Flag_32 KEY_WOW64_32KEY 98 #define k_Reg_WOW_Flag_32 KEY_WOW64_32KEY
106#else 99#else
107 #define k_Reg_WOW_Flag_32 0 100 #define k_Reg_WOW_Flag_32 0
108#endif 101#endif
102#endif
109 103
110#define k_7zip_CLSID L"{23170F69-40C1-278A-1000-000100020000}" 104#define k_7zip_CLSID L"{23170F69-40C1-278A-1000-000100020000}"
111 105
@@ -124,9 +118,19 @@ static HWND g_Path_HWND;
124static HWND g_InfoLine_HWND; 118static HWND g_InfoLine_HWND;
125static HWND g_Progress_HWND; 119static HWND g_Progress_HWND;
126 120
127// WINADVAPI 121// RegDeleteKeyExW is supported starting from win2003sp1/xp-pro-x64
122// Z7_WIN32_WINNT_MIN < 0x0600 // Vista
123#if !defined(Z7_WIN32_WINNT_MIN) \
124 || Z7_WIN32_WINNT_MIN < 0x0502 /* < win2003 */ \
125 || Z7_WIN32_WINNT_MIN == 0x0502 && !defined(_M_AMD64)
126#define Z7_USE_DYN_RegDeleteKeyExW
127#endif
128
129#ifdef Z7_USE_DYN_RegDeleteKeyExW
130Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
128typedef LONG (APIENTRY *Func_RegDeleteKeyExW)(HKEY hKey, LPCWSTR lpSubKey, REGSAM samDesired, DWORD Reserved); 131typedef LONG (APIENTRY *Func_RegDeleteKeyExW)(HKEY hKey, LPCWSTR lpSubKey, REGSAM samDesired, DWORD Reserved);
129static Func_RegDeleteKeyExW func_RegDeleteKeyExW; 132static Func_RegDeleteKeyExW func_RegDeleteKeyExW;
133#endif
130 134
131static WCHAR cmd[MAX_PATH + 4]; 135static WCHAR cmd[MAX_PATH + 4];
132static WCHAR cmdError[MAX_PATH + 4]; 136static WCHAR cmdError[MAX_PATH + 4];
@@ -247,13 +251,18 @@ static LONG MyRegistry_OpenKey_ReadWrite(HKEY parentKey, LPCWSTR name, HKEY *des
247 251
248static LONG MyRegistry_DeleteKey(HKEY parentKey, LPCWSTR name) 252static LONG MyRegistry_DeleteKey(HKEY parentKey, LPCWSTR name)
249{ 253{
250 #if k_Reg_WOW_Flag != 0 254#if k_Reg_WOW_Flag != 0
251 if (func_RegDeleteKeyExW) 255#ifdef Z7_USE_DYN_RegDeleteKeyExW
252 return func_RegDeleteKeyExW(parentKey, name, k_Reg_WOW_Flag, 0); 256 if (!func_RegDeleteKeyExW)
253 return E_FAIL; 257 return E_FAIL;
254 #else 258 return func_RegDeleteKeyExW
259#else
260 return RegDeleteKeyExW
261#endif
262 (parentKey, name, k_Reg_WOW_Flag, 0);
263#else
255 return RegDeleteKeyW(parentKey, name); 264 return RegDeleteKeyW(parentKey, name);
256 #endif 265#endif
257} 266}
258 267
259#ifdef USE_7ZIP_32_DLL 268#ifdef USE_7ZIP_32_DLL
@@ -278,13 +287,18 @@ static LONG MyRegistry_OpenKey_ReadWrite_32(HKEY parentKey, LPCWSTR name, HKEY *
278 287
279static LONG MyRegistry_DeleteKey_32(HKEY parentKey, LPCWSTR name) 288static LONG MyRegistry_DeleteKey_32(HKEY parentKey, LPCWSTR name)
280{ 289{
281 #if k_Reg_WOW_Flag_32 != 0 290#if k_Reg_WOW_Flag_32 != 0
282 if (func_RegDeleteKeyExW) 291#ifdef Z7_USE_DYN_RegDeleteKeyExW
283 return func_RegDeleteKeyExW(parentKey, name, k_Reg_WOW_Flag_32, 0); 292 if (!func_RegDeleteKeyExW)
284 return E_FAIL; 293 return E_FAIL;
285 #else 294 return func_RegDeleteKeyExW
295#else
296 return RegDeleteKeyExW
297#endif
298 (parentKey, name, k_Reg_WOW_Flag_32, 0);
299#else
286 return RegDeleteKeyW(parentKey, name); 300 return RegDeleteKeyW(parentKey, name);
287 #endif 301#endif
288} 302}
289 303
290#endif 304#endif
@@ -930,14 +944,17 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance,
930 UNUSED_VAR(lpCmdLine) 944 UNUSED_VAR(lpCmdLine)
931 UNUSED_VAR(nCmdShow) 945 UNUSED_VAR(nCmdShow)
932 946
933 #ifndef UNDER_CE 947#ifndef UNDER_CE
934 CoInitialize(NULL); 948 CoInitialize(NULL);
935 #endif 949#endif
936 950
937 #ifndef UNDER_CE 951#ifndef UNDER_CE
938 func_RegDeleteKeyExW = (Func_RegDeleteKeyExW) MY_CAST_FUNC 952#ifdef Z7_USE_DYN_RegDeleteKeyExW
939 GetProcAddress(GetModuleHandleW(L"advapi32.dll"), "RegDeleteKeyExW"); 953 func_RegDeleteKeyExW =
940 #endif 954 (Func_RegDeleteKeyExW) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandleW(L"advapi32.dll"),
955 "RegDeleteKeyExW");
956#endif
957#endif
941 958
942 { 959 {
943 const wchar_t *s = GetCommandLineW(); 960 const wchar_t *s = GetCommandLineW();
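
The Z7_USE_DYN_RegDeleteKeyExW gate above keeps the uninstaller loadable on systems whose advapi32.dll lacks the RegDeleteKeyExW export. A condensed sketch of that dynamic-binding pattern (Windows-only; the wrapper name is illustrative):

/* Sketch: resolve RegDeleteKeyExW at run time and fall back to
   RegDeleteKeyW when the export is absent (pre-win2003sp1 systems). */
#include <windows.h>

typedef LONG (APIENTRY *Func_RegDeleteKeyExW)(HKEY, LPCWSTR, REGSAM, DWORD);

static LONG MyDeleteKey(HKEY parentKey, LPCWSTR name, REGSAM wowFlag)
{
  const Func_RegDeleteKeyExW f = (Func_RegDeleteKeyExW)(void *)
      GetProcAddress(GetModuleHandleW(L"advapi32.dll"), "RegDeleteKeyExW");
  if (f)
    return f(parentKey, name, wowFlag, 0);
  return RegDeleteKeyW(parentKey, name); /* ignores the WOW64 view flag */
}
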
diff --git a/C/Util/7zipUninstall/Precomp.h b/C/Util/7zipUninstall/Precomp.h
index bc8fa21..13a41ef 100644
--- a/C/Util/7zipUninstall/Precomp.h
+++ b/C/Util/7zipUninstall/Precomp.h
@@ -1,14 +1,13 @@
1/* Precomp.h -- StdAfx 1/* Precomp.h -- Precomp
22023-03-04 : Igor Pavlov : Public domain */ 22024-01-23 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_PRECOMP_H 4// #ifndef ZIP7_INC_PRECOMP_LOC_H
5#define ZIP7_INC_PRECOMP_H 5// #define ZIP7_INC_PRECOMP_LOC_H
6 6
7#if defined(_MSC_VER) && _MSC_VER >= 1800 7#if defined(_MSC_VER) && _MSC_VER >= 1800
8#pragma warning(disable : 4464) // relative include path contains '..' 8#pragma warning(disable : 4464) // relative include path contains '..'
9#endif 9#endif
10 10
11#include "../../Compiler.h" 11#include "../../Precomp.h"
12#include "../../7zTypes.h"
13 12
14#endif 13// #endif
diff --git a/C/Util/7zipUninstall/resource.rc b/C/Util/7zipUninstall/resource.rc
index 00bdcc0..79400c6 100644
--- a/C/Util/7zipUninstall/resource.rc
+++ b/C/Util/7zipUninstall/resource.rc
@@ -1,5 +1,6 @@
1#include <winnt.h> 1#include <windows.h>
2#include <WinUser.h> 2// #include <winnt.h>
3// #include <WinUser.h>
3#include <CommCtrl.h> 4#include <CommCtrl.h>
4 5
5#define USE_COPYRIGHT_CR 6#define USE_COPYRIGHT_CR
diff --git a/C/Util/Lzma/Precomp.h b/C/Util/Lzma/Precomp.h
index bc8fa21..13a41ef 100644
--- a/C/Util/Lzma/Precomp.h
+++ b/C/Util/Lzma/Precomp.h
@@ -1,14 +1,13 @@
1/* Precomp.h -- StdAfx 1/* Precomp.h -- Precomp
22023-03-04 : Igor Pavlov : Public domain */ 22024-01-23 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_PRECOMP_H 4// #ifndef ZIP7_INC_PRECOMP_LOC_H
5#define ZIP7_INC_PRECOMP_H 5// #define ZIP7_INC_PRECOMP_LOC_H
6 6
7#if defined(_MSC_VER) && _MSC_VER >= 1800 7#if defined(_MSC_VER) && _MSC_VER >= 1800
8#pragma warning(disable : 4464) // relative include path contains '..' 8#pragma warning(disable : 4464) // relative include path contains '..'
9#endif 9#endif
10 10
11#include "../../Compiler.h" 11#include "../../Precomp.h"
12#include "../../7zTypes.h"
13 12
14#endif 13// #endif
diff --git a/C/Util/LzmaLib/Precomp.h b/C/Util/LzmaLib/Precomp.h
index bc8fa21..13a41ef 100644
--- a/C/Util/LzmaLib/Precomp.h
+++ b/C/Util/LzmaLib/Precomp.h
@@ -1,14 +1,13 @@
1/* Precomp.h -- StdAfx 1/* Precomp.h -- Precomp
22023-03-04 : Igor Pavlov : Public domain */ 22024-01-23 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_PRECOMP_H 4// #ifndef ZIP7_INC_PRECOMP_LOC_H
5#define ZIP7_INC_PRECOMP_H 5// #define ZIP7_INC_PRECOMP_LOC_H
6 6
7#if defined(_MSC_VER) && _MSC_VER >= 1800 7#if defined(_MSC_VER) && _MSC_VER >= 1800
8#pragma warning(disable : 4464) // relative include path contains '..' 8#pragma warning(disable : 4464) // relative include path contains '..'
9#endif 9#endif
10 10
11#include "../../Compiler.h" 11#include "../../Precomp.h"
12#include "../../7zTypes.h"
13 12
14#endif 13// #endif
diff --git a/C/Util/LzmaLib/makefile b/C/Util/LzmaLib/makefile
index b8e054e..9ed0aa4 100644
--- a/C/Util/LzmaLib/makefile
+++ b/C/Util/LzmaLib/makefile
@@ -14,16 +14,19 @@ C_OBJS = \
14 $O\CpuArch.obj \ 14 $O\CpuArch.obj \
15 $O\LzFind.obj \ 15 $O\LzFind.obj \
16 $O\LzFindMt.obj \ 16 $O\LzFindMt.obj \
17 $O\LzFindOpt.obj \
18 $O\LzmaDec.obj \ 17 $O\LzmaDec.obj \
19 $O\LzmaEnc.obj \ 18 $O\LzmaEnc.obj \
20 $O\LzmaLib.obj \ 19 $O\LzmaLib.obj \
21 $O\Threads.obj \ 20 $O\Threads.obj \
22 21
22!include "../../../CPP/7zip/LzFindOpt.mak"
23!include "../../../CPP/7zip/LzmaDec.mak"
24
23OBJS = \ 25OBJS = \
24 $O\Precomp.obj \ 26 $O\Precomp.obj \
25 $(LIB_OBJS) \ 27 $(LIB_OBJS) \
26 $(C_OBJS) \ 28 $(C_OBJS) \
29 $(ASM_OBJS) \
27 $O\resource.res 30 $O\resource.res
28 31
29!include "../../../CPP/Build.mak" 32!include "../../../CPP/Build.mak"
@@ -52,3 +55,5 @@ $(C_OBJS): ../../$(*B).c
52 $(CCOMPLB_USE) 55 $(CCOMPLB_USE)
53 56
54!ENDIF 57!ENDIF
58
59!include "../../Asm_c.mak"
diff --git a/C/Util/SfxSetup/Precomp.h b/C/Util/SfxSetup/Precomp.h
index bc8fa21..13a41ef 100644
--- a/C/Util/SfxSetup/Precomp.h
+++ b/C/Util/SfxSetup/Precomp.h
@@ -1,14 +1,13 @@
1/* Precomp.h -- StdAfx 1/* Precomp.h -- Precomp
22023-03-04 : Igor Pavlov : Public domain */ 22024-01-23 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_PRECOMP_H 4// #ifndef ZIP7_INC_PRECOMP_LOC_H
5#define ZIP7_INC_PRECOMP_H 5// #define ZIP7_INC_PRECOMP_LOC_H
6 6
7#if defined(_MSC_VER) && _MSC_VER >= 1800 7#if defined(_MSC_VER) && _MSC_VER >= 1800
8#pragma warning(disable : 4464) // relative include path contains '..' 8#pragma warning(disable : 4464) // relative include path contains '..'
9#endif 9#endif
10 10
11#include "../../Compiler.h" 11#include "../../Precomp.h"
12#include "../../7zTypes.h"
13 12
14#endif 13// #endif
diff --git a/C/Util/SfxSetup/SfxSetup.c b/C/Util/SfxSetup/SfxSetup.c
index 7304a0b..9b5c1f9 100644
--- a/C/Util/SfxSetup/SfxSetup.c
+++ b/C/Util/SfxSetup/SfxSetup.c
@@ -1,5 +1,5 @@
1/* SfxSetup.c - 7z SFX Setup 1/* SfxSetup.c - 7z SFX Setup
22019-02-02 : Igor Pavlov : Public domain */ 22024-01-24 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -278,10 +278,10 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance,
278 #ifdef _CONSOLE 278 #ifdef _CONSOLE
279 SetConsoleCtrlHandler(HandlerRoutine, TRUE); 279 SetConsoleCtrlHandler(HandlerRoutine, TRUE);
280 #else 280 #else
281 UNUSED_VAR(hInstance); 281 UNUSED_VAR(hInstance)
282 UNUSED_VAR(hPrevInstance); 282 UNUSED_VAR(hPrevInstance)
283 UNUSED_VAR(lpCmdLine); 283 UNUSED_VAR(lpCmdLine)
284 UNUSED_VAR(nCmdShow); 284 UNUSED_VAR(nCmdShow)
285 #endif 285 #endif
286 286
287 CrcGenerateTable(); 287 CrcGenerateTable();
@@ -516,12 +516,13 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance,
516 #endif 516 #endif
517 517
518 { 518 {
519 const SRes res2 = File_Close(&outFile); 519 const WRes res2 = File_Close(&outFile);
520 if (res != SZ_OK) 520 if (res != SZ_OK)
521 break; 521 break;
522 if (res2 != SZ_OK) 522 if (res2 != 0)
523 { 523 {
524 res = res2; 524 errorMessage = "Can't close output file";
525 res = SZ_ERROR_FAIL;
525 break; 526 break;
526 } 527 }
527 } 528 }
diff --git a/C/Util/SfxSetup/makefile b/C/Util/SfxSetup/makefile
index bc0cf8b..b3f25a2 100644
--- a/C/Util/SfxSetup/makefile
+++ b/C/Util/SfxSetup/makefile
@@ -9,8 +9,6 @@ C_OBJS = \
9 $O\7zArcIn.obj \ 9 $O\7zArcIn.obj \
10 $O\7zBuf.obj \ 10 $O\7zBuf.obj \
11 $O\7zBuf2.obj \ 11 $O\7zBuf2.obj \
12 $O\7zCrc.obj \
13 $O\7zCrcOpt.obj \
14 $O\7zFile.obj \ 12 $O\7zFile.obj \
15 $O\7zDec.obj \ 13 $O\7zDec.obj \
16 $O\7zStream.obj \ 14 $O\7zStream.obj \
@@ -27,9 +25,13 @@ C_OBJS = \
277Z_OBJS = \ 257Z_OBJS = \
28 $O\SfxSetup.obj \ 26 $O\SfxSetup.obj \
29 27
28!include "../../../CPP/7zip/Crc.mak"
29# !include "../../../CPP/7zip/LzmaDec.mak"
30
30OBJS = \ 31OBJS = \
31 $(7Z_OBJS) \ 32 $(7Z_OBJS) \
32 $(C_OBJS) \ 33 $(C_OBJS) \
34 $(ASM_OBJS) \
33 $O\resource.res 35 $O\resource.res
34 36
35!include "../../../CPP/Build.mak" 37!include "../../../CPP/Build.mak"
@@ -38,3 +40,5 @@ $(7Z_OBJS): $(*B).c
38 $(COMPL_O1) 40 $(COMPL_O1)
39$(C_OBJS): ../../$(*B).c 41$(C_OBJS): ../../$(*B).c
40 $(COMPL_O1) 42 $(COMPL_O1)
43
44!include "../../Asm_c.mak"
diff --git a/C/Xxh64.c b/C/Xxh64.c
new file mode 100644
index 0000000..dc02a02
--- /dev/null
+++ b/C/Xxh64.c
@@ -0,0 +1,327 @@
1/* Xxh64.c -- XXH64 hash calculation
2original code: Copyright (c) Yann Collet.
32023-08-18 : modified by Igor Pavlov.
4This source code is licensed under BSD 2-Clause License.
5*/
6
7#include "Precomp.h"
8
9#include "CpuArch.h"
10#include "RotateDefs.h"
11#include "Xxh64.h"
12
13#define Z7_XXH_PRIME64_1 UINT64_CONST(0x9E3779B185EBCA87)
14#define Z7_XXH_PRIME64_2 UINT64_CONST(0xC2B2AE3D27D4EB4F)
15#define Z7_XXH_PRIME64_3 UINT64_CONST(0x165667B19E3779F9)
16#define Z7_XXH_PRIME64_4 UINT64_CONST(0x85EBCA77C2B2AE63)
17#define Z7_XXH_PRIME64_5 UINT64_CONST(0x27D4EB2F165667C5)
18
19void Xxh64State_Init(CXxh64State *p)
20{
21 const UInt64 seed = 0;
22 p->v[0] = seed + Z7_XXH_PRIME64_1 + Z7_XXH_PRIME64_2;
23 p->v[1] = seed + Z7_XXH_PRIME64_2;
24 p->v[2] = seed;
25 p->v[3] = seed - Z7_XXH_PRIME64_1;
26}
27
28#if !defined(MY_CPU_64BIT) && defined(MY_CPU_X86) && defined(_MSC_VER)
29 #define Z7_XXH64_USE_ASM
30#endif
31
32#if !defined(MY_CPU_64BIT) && defined(MY_CPU_X86) \
33 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1200
34/* we try to avoid the __allmul calls that MSVC emits for 64-bit multiplies.
35 But MSVC6 still uses __allmul for our code,
36 so for MSVC6 we use the default 64-bit multiply without our optimization.
37*/
38#define LOW32(b) ((UInt32)((b) & 0xffffffff))
39/* the MSVC compiler (MSVC > 1200) can use the "mul" instruction
40 without __allmul for our MY_emulu macro.
41 MY_emulu is similar to the __emulu(a, b) macro */
42#define MY_emulu(a, b) ((UInt64)(a) * (b))
43#define MY_SET_HIGH32(a) ((UInt64)(a) << 32)
44#define MY_MUL32_SET_HIGH32(a, b) MY_SET_HIGH32((UInt32)(a) * (UInt32)(b))
45// /*
46#define MY_MUL64(a, b) \
47 ( MY_emulu((UInt32)(a), LOW32(b)) + \
48 MY_SET_HIGH32( \
49 (UInt32)((a) >> 32) * LOW32(b) + \
50 (UInt32)(a) * (UInt32)((b) >> 32) \
51 ))
52// */
53/*
54#define MY_MUL64(a, b) \
55 ( MY_emulu((UInt32)(a), LOW32(b)) \
56 + MY_MUL32_SET_HIGH32((a) >> 32, LOW32(b)) + \
57 + MY_MUL32_SET_HIGH32(a, (b) >> 32) \
58 )
59*/
60
61#define MY_MUL_32_64(a32, b) \
62 ( MY_emulu((UInt32)(a32), LOW32(b)) \
63 + MY_MUL32_SET_HIGH32(a32, (b) >> 32) \
64 )
65
66#else
67#define MY_MUL64(a, b) ((a) * (b))
68#define MY_MUL_32_64(a32, b) ((a32) * (UInt64)(b))
69#endif
70
71
72static
73Z7_FORCE_INLINE
74UInt64 Xxh64_Round(UInt64 acc, UInt64 input)
75{
76 acc += MY_MUL64(input, Z7_XXH_PRIME64_2);
77 acc = Z7_ROTL64(acc, 31);
78 return MY_MUL64(acc, Z7_XXH_PRIME64_1);
79}
80
81static UInt64 Xxh64_Merge(UInt64 acc, UInt64 val)
82{
83 acc ^= Xxh64_Round(0, val);
84 return MY_MUL64(acc, Z7_XXH_PRIME64_1) + Z7_XXH_PRIME64_4;
85}
86
87
88#ifdef Z7_XXH64_USE_ASM
89
90#define Z7_XXH_PRIME64_1_HIGH 0x9E3779B1
91#define Z7_XXH_PRIME64_1_LOW 0x85EBCA87
92#define Z7_XXH_PRIME64_2_HIGH 0xC2B2AE3D
93#define Z7_XXH_PRIME64_2_LOW 0x27D4EB4F
94
95void
96Z7_NO_INLINE
97__declspec(naked)
98Z7_FASTCALL
99Xxh64State_UpdateBlocks(CXxh64State *p, const void *data, const void *end)
100{
101 #if !defined(__clang__)
102 UNUSED_VAR(p)
103 UNUSED_VAR(data)
104 UNUSED_VAR(end)
105 #endif
106 __asm push ebx
107 __asm push ebp
108 __asm push esi
109 __asm push edi
110
111 #define STACK_OFFSET 4 * 8
112 __asm sub esp, STACK_OFFSET
113
114#define COPY_1(n) \
115 __asm mov eax, [ecx + n * 4] \
116 __asm mov [esp + n * 4], eax \
117
118#define COPY_2(n) \
119 __asm mov eax, [esp + n * 4] \
120 __asm mov [ecx + n * 4], eax \
121
122 COPY_1(0)
123 __asm mov edi, [ecx + 1 * 4] \
124 COPY_1(2)
125 COPY_1(3)
126 COPY_1(4)
127 COPY_1(5)
128 COPY_1(6)
129 COPY_1(7)
130
131 __asm mov esi, edx \
132 __asm mov [esp + 0 * 8 + 4], ecx
133 __asm mov ecx, Z7_XXH_PRIME64_2_LOW \
134 __asm mov ebp, Z7_XXH_PRIME64_1_LOW \
135
136#define R(n, state1, state1_reg) \
137 __asm mov eax, [esi + n * 8] \
138 __asm imul ebx, eax, Z7_XXH_PRIME64_2_HIGH \
139 __asm add ebx, state1 \
140 __asm mul ecx \
141 __asm add edx, ebx \
142 __asm mov ebx, [esi + n * 8 + 4] \
143 __asm imul ebx, ecx \
144 __asm add eax, [esp + n * 8] \
145 __asm adc edx, ebx \
146 __asm mov ebx, eax \
147 __asm shld eax, edx, 31 \
148 __asm shld edx, ebx, 31 \
149 __asm imul state1_reg, eax, Z7_XXH_PRIME64_1_HIGH \
150 __asm imul edx, ebp \
151 __asm add state1_reg, edx \
152 __asm mul ebp \
153 __asm add state1_reg, edx \
154 __asm mov [esp + n * 8], eax \
155
156#define R2(n) \
157 R(n, [esp + n * 8 + 4], ebx) \
158 __asm mov [esp + n * 8 + 4], ebx \
159
160 __asm align 16
161 __asm main_loop:
162 R(0, edi, edi)
163 R2(1)
164 R2(2)
165 R2(3)
166 __asm add esi, 32
167 __asm cmp esi, [esp + STACK_OFFSET + 4 * 4 + 4]
168 __asm jne main_loop
169
170 __asm mov ecx, [esp + 0 * 8 + 4]
171
172 COPY_2(0)
173 __asm mov [ecx + 1 * 4], edi
174 COPY_2(2)
175 COPY_2(3)
176 COPY_2(4)
177 COPY_2(5)
178 COPY_2(6)
179 COPY_2(7)
180
181 __asm add esp, STACK_OFFSET
182 __asm pop edi
183 __asm pop esi
184 __asm pop ebp
185 __asm pop ebx
186 __asm ret 4
187}
188
189#else
190
191void
192Z7_NO_INLINE
193Z7_FASTCALL
194Xxh64State_UpdateBlocks(CXxh64State *p, const void *_data, const void *end)
195{
196 const Byte *data = (const Byte *)_data;
197 UInt64 v[4];
198 v[0] = p->v[0];
199 v[1] = p->v[1];
200 v[2] = p->v[2];
201 v[3] = p->v[3];
202 do
203 {
204 v[0] = Xxh64_Round(v[0], GetUi64(data)); data += 8;
205 v[1] = Xxh64_Round(v[1], GetUi64(data)); data += 8;
206 v[2] = Xxh64_Round(v[2], GetUi64(data)); data += 8;
207 v[3] = Xxh64_Round(v[3], GetUi64(data)); data += 8;
208 }
209 while (data != end);
210 p->v[0] = v[0];
211 p->v[1] = v[1];
212 p->v[2] = v[2];
213 p->v[3] = v[3];
214}
215
216#endif
217
218UInt64 Xxh64State_Digest(const CXxh64State *p, const void *_data, UInt64 count)
219{
220 UInt64 h = p->v[2];
221
222 if (count >= 32)
223 {
224 h = Z7_ROTL64(p->v[0], 1) +
225 Z7_ROTL64(p->v[1], 7) +
226 Z7_ROTL64(h, 12) +
227 Z7_ROTL64(p->v[3], 18);
228 h = Xxh64_Merge(h, p->v[0]);
229 h = Xxh64_Merge(h, p->v[1]);
230 h = Xxh64_Merge(h, p->v[2]);
231 h = Xxh64_Merge(h, p->v[3]);
232 }
233 else
234 h += Z7_XXH_PRIME64_5;
235
236 h += count;
237
238 // XXH64_finalize():
239 {
240 unsigned cnt = (unsigned)count & 31;
241 const Byte *data = (const Byte *)_data;
242 while (cnt >= 8)
243 {
244 h ^= Xxh64_Round(0, GetUi64(data));
245 data += 8;
246 h = Z7_ROTL64(h, 27);
247 h = MY_MUL64(h, Z7_XXH_PRIME64_1) + Z7_XXH_PRIME64_4;
248 cnt -= 8;
249 }
250 if (cnt >= 4)
251 {
252 const UInt32 v = GetUi32(data);
253 data += 4;
254 h ^= MY_MUL_32_64(v, Z7_XXH_PRIME64_1);
255 h = Z7_ROTL64(h, 23);
256 h = MY_MUL64(h, Z7_XXH_PRIME64_2) + Z7_XXH_PRIME64_3;
257 cnt -= 4;
258 }
259 while (cnt)
260 {
261 const UInt32 v = *data++;
262 h ^= MY_MUL_32_64(v, Z7_XXH_PRIME64_5);
263 h = Z7_ROTL64(h, 11);
264 h = MY_MUL64(h, Z7_XXH_PRIME64_1);
265 cnt--;
266 }
267 // XXH64_avalanche(h):
268 h ^= h >> 33; h = MY_MUL64(h, Z7_XXH_PRIME64_2);
269 h ^= h >> 29; h = MY_MUL64(h, Z7_XXH_PRIME64_3);
270 h ^= h >> 32;
271 return h;
272 }
273}
274
275
276void Xxh64_Init(CXxh64 *p)
277{
278 Xxh64State_Init(&p->state);
279 p->count = 0;
280 p->buf64[0] = 0;
281 p->buf64[1] = 0;
282 p->buf64[2] = 0;
283 p->buf64[3] = 0;
284}
285
286void Xxh64_Update(CXxh64 *p, const void *_data, size_t size)
287{
288 const Byte *data = (const Byte *)_data;
289 unsigned cnt;
290 if (size == 0)
291 return;
292 cnt = (unsigned)p->count;
293 p->count += size;
294
295 if (cnt &= 31)
296 {
297 unsigned rem = 32 - cnt;
298 Byte *dest = (Byte *)p->buf64 + cnt;
299 if (rem > size)
300 rem = (unsigned)size;
301 size -= rem;
302 cnt += rem;
303 // memcpy((Byte *)p->buf64 + cnt, data, rem);
304 do
305 *dest++ = *data++;
306 while (--rem);
307 if (cnt != 32)
308 return;
309 Xxh64State_UpdateBlocks(&p->state, p->buf64, &p->buf64[4]);
310 }
311
312 if (size &= ~(size_t)31)
313 {
314 Xxh64State_UpdateBlocks(&p->state, data, data + size);
315 data += size;
316 }
317
318 cnt = (unsigned)p->count & 31;
319 if (cnt)
320 {
321 // memcpy(p->buf64, data, cnt);
322 Byte *dest = (Byte *)p->buf64;
323 do
324 *dest++ = *data++;
325 while (--cnt);
326 }
327}
diff --git a/C/Xxh64.h b/C/Xxh64.h
new file mode 100644
index 0000000..efef65e
--- /dev/null
+++ b/C/Xxh64.h
@@ -0,0 +1,50 @@
1/* Xxh64.h -- XXH64 hash calculation interfaces
22023-08-18 : Igor Pavlov : Public domain */
3
4#ifndef ZIP7_INC_XXH64_H
5#define ZIP7_INC_XXH64_H
6
7#include "7zTypes.h"
8
9EXTERN_C_BEGIN
10
11#define Z7_XXH64_BLOCK_SIZE (4 * 8)
12
13typedef struct
14{
15 UInt64 v[4];
16} CXxh64State;
17
18void Xxh64State_Init(CXxh64State *p);
19
20// end != data && end == data + Z7_XXH64_BLOCK_SIZE * numBlocks
21void Z7_FASTCALL Xxh64State_UpdateBlocks(CXxh64State *p, const void *data, const void *end);
22
23/*
24Xxh64State_Digest():
25data:
26 the function processes only
27 (totalCount & (Z7_XXH64_BLOCK_SIZE - 1)) bytes in (data): (smaller than 32 bytes).
28totalCount: total size of hashed stream:
29 it includes total size of data processed by previous Xxh64State_UpdateBlocks() calls,
30 and it also includes current processed size in (data).
31*/
32UInt64 Xxh64State_Digest(const CXxh64State *p, const void *data, UInt64 totalCount);
33
34
35typedef struct
36{
37 CXxh64State state;
38 UInt64 count;
39 UInt64 buf64[4];
40} CXxh64;
41
42void Xxh64_Init(CXxh64 *p);
43void Xxh64_Update(CXxh64 *p, const void *data, size_t size);
44
45#define Xxh64_Digest(p) \
46 Xxh64State_Digest(&(p)->state, (p)->buf64, (p)->count)
47
48EXTERN_C_END
49
50#endif
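
A minimal usage sketch of the streaming interface declared above (the function name Hash_Buf and its buffer arguments are illustrative; only Xxh64_Init / Xxh64_Update / Xxh64_Digest from this header are assumed):

#include "Xxh64.h"

static UInt64 Hash_Buf(const Byte *buf, size_t size)
{
  CXxh64 h;
  Xxh64_Init(&h);
  Xxh64_Update(&h, buf, size);  // may be called repeatedly for streamed input
  return Xxh64_Digest(&h);      // finalizes over the buffered tail and total count
}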
diff --git a/C/Xz.c b/C/Xz.c
index 4ad0710..d07550d 100644
--- a/C/Xz.c
+++ b/C/Xz.c
@@ -1,5 +1,5 @@
1/* Xz.c - Xz 1/* Xz.c - Xz
22023-04-02 : Igor Pavlov : Public domain */ 22024-03-01 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -52,6 +52,7 @@ void XzCheck_Init(CXzCheck *p, unsigned mode)
52 case XZ_CHECK_CRC32: p->crc = CRC_INIT_VAL; break; 52 case XZ_CHECK_CRC32: p->crc = CRC_INIT_VAL; break;
53 case XZ_CHECK_CRC64: p->crc64 = CRC64_INIT_VAL; break; 53 case XZ_CHECK_CRC64: p->crc64 = CRC64_INIT_VAL; break;
54 case XZ_CHECK_SHA256: Sha256_Init(&p->sha); break; 54 case XZ_CHECK_SHA256: Sha256_Init(&p->sha); break;
55 default: break;
55 } 56 }
56} 57}
57 58
@@ -62,6 +63,7 @@ void XzCheck_Update(CXzCheck *p, const void *data, size_t size)
62 case XZ_CHECK_CRC32: p->crc = CrcUpdate(p->crc, data, size); break; 63 case XZ_CHECK_CRC32: p->crc = CrcUpdate(p->crc, data, size); break;
63 case XZ_CHECK_CRC64: p->crc64 = Crc64Update(p->crc64, data, size); break; 64 case XZ_CHECK_CRC64: p->crc64 = Crc64Update(p->crc64, data, size); break;
64 case XZ_CHECK_SHA256: Sha256_Update(&p->sha, (const Byte *)data, size); break; 65 case XZ_CHECK_SHA256: Sha256_Update(&p->sha, (const Byte *)data, size); break;
66 default: break;
65 } 67 }
66} 68}
67 69
diff --git a/C/Xz.h b/C/Xz.h
index d5001f6..42bc685 100644
--- a/C/Xz.h
+++ b/C/Xz.h
@@ -1,5 +1,5 @@
1/* Xz.h - Xz interface 1/* Xz.h - Xz interface
22023-04-13 : Igor Pavlov : Public domain */ 22024-01-26 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_XZ_H 4#ifndef ZIP7_INC_XZ_H
5#define ZIP7_INC_XZ_H 5#define ZIP7_INC_XZ_H
@@ -18,6 +18,7 @@ EXTERN_C_BEGIN
18#define XZ_ID_ARMT 8 18#define XZ_ID_ARMT 8
19#define XZ_ID_SPARC 9 19#define XZ_ID_SPARC 9
20#define XZ_ID_ARM64 0xa 20#define XZ_ID_ARM64 0xa
21#define XZ_ID_RISCV 0xb
21#define XZ_ID_LZMA2 0x21 22#define XZ_ID_LZMA2 0x21
22 23
23unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value); 24unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value);
@@ -233,13 +234,13 @@ typedef enum
233typedef struct 234typedef struct
234{ 235{
235 EXzState state; 236 EXzState state;
236 UInt32 pos; 237 unsigned pos;
237 unsigned alignPos; 238 unsigned alignPos;
238 unsigned indexPreSize; 239 unsigned indexPreSize;
239 240
240 CXzStreamFlags streamFlags; 241 CXzStreamFlags streamFlags;
241 242
242 UInt32 blockHeaderSize; 243 unsigned blockHeaderSize;
243 UInt64 packSize; 244 UInt64 packSize;
244 UInt64 unpackSize; 245 UInt64 unpackSize;
245 246
diff --git a/C/XzCrc64.c b/C/XzCrc64.c
index c2fad6c..94fc1af 100644
--- a/C/XzCrc64.c
+++ b/C/XzCrc64.c
@@ -1,5 +1,5 @@
1/* XzCrc64.c -- CRC64 calculation 1/* XzCrc64.c -- CRC64 calculation
22023-04-02 : Igor Pavlov : Public domain */ 22023-12-08 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -8,36 +8,76 @@
8 8
9#define kCrc64Poly UINT64_CONST(0xC96C5795D7870F42) 9#define kCrc64Poly UINT64_CONST(0xC96C5795D7870F42)
10 10
11#ifdef MY_CPU_LE 11// for debug only : define Z7_CRC64_DEBUG_BE to test big-endian code on a little-endian cpu
12 #define CRC64_NUM_TABLES 4 12// #define Z7_CRC64_DEBUG_BE
13#ifdef Z7_CRC64_DEBUG_BE
14#undef MY_CPU_LE
15#define MY_CPU_BE
16#endif
17
18#ifdef Z7_CRC64_NUM_TABLES
19 #define Z7_CRC64_NUM_TABLES_USE Z7_CRC64_NUM_TABLES
13#else 20#else
14 #define CRC64_NUM_TABLES 5 21 #define Z7_CRC64_NUM_TABLES_USE 12
22#endif
15 23
16 UInt64 Z7_FASTCALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table); 24#if Z7_CRC64_NUM_TABLES_USE < 1
25 #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES
17#endif 26#endif
18 27
28
29#if Z7_CRC64_NUM_TABLES_USE != 1
30
19#ifndef MY_CPU_BE 31#ifndef MY_CPU_BE
20 UInt64 Z7_FASTCALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table); 32 #define FUNC_NAME_LE_2(s) XzCrc64UpdateT ## s
33 #define FUNC_NAME_LE_1(s) FUNC_NAME_LE_2(s)
34 #define FUNC_NAME_LE FUNC_NAME_LE_1(Z7_CRC64_NUM_TABLES_USE)
35 UInt64 Z7_FASTCALL FUNC_NAME_LE (UInt64 v, const void *data, size_t size, const UInt64 *table);
36#endif
37#ifndef MY_CPU_LE
38 #define FUNC_NAME_BE_2(s) XzCrc64UpdateBeT ## s
39 #define FUNC_NAME_BE_1(s) FUNC_NAME_BE_2(s)
40 #define FUNC_NAME_BE FUNC_NAME_BE_1(Z7_CRC64_NUM_TABLES_USE)
41 UInt64 Z7_FASTCALL FUNC_NAME_BE (UInt64 v, const void *data, size_t size, const UInt64 *table);
21#endif 42#endif
22 43
23typedef UInt64 (Z7_FASTCALL *CRC64_FUNC)(UInt64 v, const void *data, size_t size, const UInt64 *table); 44#if defined(MY_CPU_LE)
45 #define FUNC_REF FUNC_NAME_LE
46#elif defined(MY_CPU_BE)
47 #define FUNC_REF FUNC_NAME_BE
48#else
49 #define FUNC_REF g_Crc64Update
50 static UInt64 (Z7_FASTCALL *FUNC_REF)(UInt64 v, const void *data, size_t size, const UInt64 *table);
51#endif
52
53#endif
54
55
56MY_ALIGN(64)
57static UInt64 g_Crc64Table[256 * Z7_CRC64_NUM_TABLES_USE];
24 58
25static CRC64_FUNC g_Crc64Update;
26UInt64 g_Crc64Table[256 * CRC64_NUM_TABLES];
27 59
28UInt64 Z7_FASTCALL Crc64Update(UInt64 v, const void *data, size_t size) 60UInt64 Z7_FASTCALL Crc64Update(UInt64 v, const void *data, size_t size)
29{ 61{
30 return g_Crc64Update(v, data, size, g_Crc64Table); 62#if Z7_CRC64_NUM_TABLES_USE == 1
63 #define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
64 const UInt64 *table = g_Crc64Table;
65 const Byte *p = (const Byte *)data;
66 const Byte *lim = p + size;
67 for (; p != lim; p++)
68 v = CRC64_UPDATE_BYTE_2(v, *p);
69 return v;
70 #undef CRC64_UPDATE_BYTE_2
71#else
72 return FUNC_REF (v, data, size, g_Crc64Table);
73#endif
31} 74}
32 75
33UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size)
34{
35 return g_Crc64Update(CRC64_INIT_VAL, data, size, g_Crc64Table) ^ CRC64_INIT_VAL;
36}
37 76
77Z7_NO_INLINE
38void Z7_FASTCALL Crc64GenerateTable(void) 78void Z7_FASTCALL Crc64GenerateTable(void)
39{ 79{
40 UInt32 i; 80 unsigned i;
41 for (i = 0; i < 256; i++) 81 for (i = 0; i < 256; i++)
42 { 82 {
43 UInt64 r = i; 83 UInt64 r = i;
@@ -46,35 +86,55 @@ void Z7_FASTCALL Crc64GenerateTable(void)
46 r = (r >> 1) ^ (kCrc64Poly & ((UInt64)0 - (r & 1))); 86 r = (r >> 1) ^ (kCrc64Poly & ((UInt64)0 - (r & 1)));
47 g_Crc64Table[i] = r; 87 g_Crc64Table[i] = r;
48 } 88 }
49 for (i = 256; i < 256 * CRC64_NUM_TABLES; i++) 89
90#if Z7_CRC64_NUM_TABLES_USE != 1
91#if 1 || 1 && defined(MY_CPU_X86) // low register count
92 for (i = 0; i < 256 * (Z7_CRC64_NUM_TABLES_USE - 1); i++)
50 { 93 {
51 const UInt64 r = g_Crc64Table[(size_t)i - 256]; 94 const UInt64 r0 = g_Crc64Table[(size_t)i];
52 g_Crc64Table[i] = g_Crc64Table[r & 0xFF] ^ (r >> 8); 95 g_Crc64Table[(size_t)i + 256] = g_Crc64Table[(Byte)r0] ^ (r0 >> 8);
53 } 96 }
54 97#else
55 #ifdef MY_CPU_LE 98 for (i = 0; i < 256 * (Z7_CRC64_NUM_TABLES_USE - 1); i += 2)
56 99 {
57 g_Crc64Update = XzCrc64UpdateT4; 100 UInt64 r0 = g_Crc64Table[(size_t)(i) ];
101 UInt64 r1 = g_Crc64Table[(size_t)(i) + 1];
102 r0 = g_Crc64Table[(Byte)r0] ^ (r0 >> 8);
103 r1 = g_Crc64Table[(Byte)r1] ^ (r1 >> 8);
104 g_Crc64Table[(size_t)i + 256 ] = r0;
105 g_Crc64Table[(size_t)i + 256 + 1] = r1;
106 }
107#endif
58 108
59 #else 109#ifndef MY_CPU_LE
60 { 110 {
61 #ifndef MY_CPU_BE 111#ifndef MY_CPU_BE
62 UInt32 k = 1; 112 UInt32 k = 1;
63 if (*(const Byte *)&k == 1) 113 if (*(const Byte *)&k == 1)
64 g_Crc64Update = XzCrc64UpdateT4; 114 FUNC_REF = FUNC_NAME_LE;
65 else 115 else
66 #endif 116#endif
67 { 117 {
68 for (i = 256 * CRC64_NUM_TABLES - 1; i >= 256; i--) 118#ifndef MY_CPU_BE
119 FUNC_REF = FUNC_NAME_BE;
120#endif
121 for (i = 0; i < 256 * Z7_CRC64_NUM_TABLES_USE; i++)
69 { 122 {
70 const UInt64 x = g_Crc64Table[(size_t)i - 256]; 123 const UInt64 x = g_Crc64Table[i];
71 g_Crc64Table[i] = Z7_BSWAP64(x); 124 g_Crc64Table[i] = Z7_BSWAP64(x);
72 } 125 }
73 g_Crc64Update = XzCrc64UpdateT1_BeT4;
74 } 126 }
75 } 127 }
76 #endif 128#endif // ndef MY_CPU_LE
129#endif // Z7_CRC64_NUM_TABLES_USE != 1
77} 130}
78 131
79#undef kCrc64Poly 132#undef kCrc64Poly
80#undef CRC64_NUM_TABLES 133#undef Z7_CRC64_NUM_TABLES_USE
134#undef FUNC_REF
135#undef FUNC_NAME_LE_2
136#undef FUNC_NAME_LE_1
137#undef FUNC_NAME_LE
138#undef FUNC_NAME_BE_2
139#undef FUNC_NAME_BE_1
140#undef FUNC_NAME_BE
diff --git a/C/XzCrc64.h b/C/XzCrc64.h
index ca46869..04f8153 100644
--- a/C/XzCrc64.h
+++ b/C/XzCrc64.h
@@ -1,5 +1,5 @@
1/* XzCrc64.h -- CRC64 calculation 1/* XzCrc64.h -- CRC64 calculation
22023-04-02 : Igor Pavlov : Public domain */ 22023-12-08 : Igor Pavlov : Public domain */
3 3
4#ifndef ZIP7_INC_XZ_CRC64_H 4#ifndef ZIP7_INC_XZ_CRC64_H
5#define ZIP7_INC_XZ_CRC64_H 5#define ZIP7_INC_XZ_CRC64_H
@@ -10,16 +10,16 @@
10 10
11EXTERN_C_BEGIN 11EXTERN_C_BEGIN
12 12
13extern UInt64 g_Crc64Table[]; 13// extern UInt64 g_Crc64Table[];
14 14
15void Z7_FASTCALL Crc64GenerateTable(void); 15void Z7_FASTCALL Crc64GenerateTable(void);
16 16
17#define CRC64_INIT_VAL UINT64_CONST(0xFFFFFFFFFFFFFFFF) 17#define CRC64_INIT_VAL UINT64_CONST(0xFFFFFFFFFFFFFFFF)
18#define CRC64_GET_DIGEST(crc) ((crc) ^ CRC64_INIT_VAL) 18#define CRC64_GET_DIGEST(crc) ((crc) ^ CRC64_INIT_VAL)
19#define CRC64_UPDATE_BYTE(crc, b) (g_Crc64Table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) 19// #define CRC64_UPDATE_BYTE(crc, b) (g_Crc64Table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
20 20
21UInt64 Z7_FASTCALL Crc64Update(UInt64 crc, const void *data, size_t size); 21UInt64 Z7_FASTCALL Crc64Update(UInt64 crc, const void *data, size_t size);
22UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size); 22// UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size);
23 23
24EXTERN_C_END 24EXTERN_C_END
25 25
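
Since Crc64Calc() is commented out by this change, a whole-buffer digest is computed through the remaining interface. A sketch using only the macros and functions declared above (Crc64GenerateTable() must have been called once; the function name is illustrative):

static UInt64 Crc64_Calc_Sketch(const void *data, size_t size)
{
  UInt64 crc = CRC64_INIT_VAL;         // 0xFFFFFFFFFFFFFFFF
  crc = Crc64Update(crc, data, size);  // may be split across multiple calls
  return CRC64_GET_DIGEST(crc);        // final xor with CRC64_INIT_VAL
}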
diff --git a/C/XzCrc64Opt.c b/C/XzCrc64Opt.c
index d03374c..0c1fc2f 100644
--- a/C/XzCrc64Opt.c
+++ b/C/XzCrc64Opt.c
@@ -1,61 +1,261 @@
1/* XzCrc64Opt.c -- CRC64 calculation 1/* XzCrc64Opt.c -- CRC64 calculation (optimized functions)
22023-04-02 : Igor Pavlov : Public domain */ 22023-12-08 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
6#include "CpuArch.h" 6#include "CpuArch.h"
7 7
8#if !defined(Z7_CRC64_NUM_TABLES) || Z7_CRC64_NUM_TABLES > 1
9
10// for debug only : define Z7_CRC64_DEBUG_BE to test big-endian code on a little-endian cpu
11// #define Z7_CRC64_DEBUG_BE
12#ifdef Z7_CRC64_DEBUG_BE
13#undef MY_CPU_LE
14#define MY_CPU_BE
15#endif
16
17#if defined(MY_CPU_64BIT)
18#define Z7_CRC64_USE_64BIT
19#endif
20
21// the value Z7_CRC64_NUM_TABLES_USE must be defined to the same value as in XzCrc64.c
22#ifdef Z7_CRC64_NUM_TABLES
23#define Z7_CRC64_NUM_TABLES_USE Z7_CRC64_NUM_TABLES
24#else
25#define Z7_CRC64_NUM_TABLES_USE 12
26#endif
27
28#if Z7_CRC64_NUM_TABLES_USE % 4 || \
29 Z7_CRC64_NUM_TABLES_USE < 4 || \
30 Z7_CRC64_NUM_TABLES_USE > 4 * 4
31 #error Stop_Compiling_Bad_CRC64_NUM_TABLES
32#endif
33
34
8#ifndef MY_CPU_BE 35#ifndef MY_CPU_BE
9 36
10#define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) 37#define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
38
39#if defined(Z7_CRC64_USE_64BIT) && (Z7_CRC64_NUM_TABLES_USE % 8 == 0)
11 40
12UInt64 Z7_FASTCALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table); 41#define Q64LE(n, d) \
13UInt64 Z7_FASTCALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table) 42 ( (table + ((n) * 8 + 7) * 0x100)[((d) ) & 0xFF] \
43 ^ (table + ((n) * 8 + 6) * 0x100)[((d) >> 1 * 8) & 0xFF] \
44 ^ (table + ((n) * 8 + 5) * 0x100)[((d) >> 2 * 8) & 0xFF] \
45 ^ (table + ((n) * 8 + 4) * 0x100)[((d) >> 3 * 8) & 0xFF] \
46 ^ (table + ((n) * 8 + 3) * 0x100)[((d) >> 4 * 8) & 0xFF] \
47 ^ (table + ((n) * 8 + 2) * 0x100)[((d) >> 5 * 8) & 0xFF] \
48 ^ (table + ((n) * 8 + 1) * 0x100)[((d) >> 6 * 8) & 0xFF] \
49 ^ (table + ((n) * 8 + 0) * 0x100)[((d) >> 7 * 8)] )
50
51#define R64(a) *((const UInt64 *)(const void *)p + (a))
52
53#else
54
55#define Q32LE(n, d) \
56 ( (table + ((n) * 4 + 3) * 0x100)[((d) ) & 0xFF] \
57 ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \
58 ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \
59 ^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] )
60
61#define R32(a) *((const UInt32 *)(const void *)p + (a))
62
63#endif
64
65
66#define CRC64_FUNC_PRE_LE2(step) \
67UInt64 Z7_FASTCALL XzCrc64UpdateT ## step (UInt64 v, const void *data, size_t size, const UInt64 *table)
68
69#define CRC64_FUNC_PRE_LE(step) \
70 CRC64_FUNC_PRE_LE2(step); \
71 CRC64_FUNC_PRE_LE2(step)
72
73CRC64_FUNC_PRE_LE(Z7_CRC64_NUM_TABLES_USE)
14{ 74{
15 const Byte *p = (const Byte *)data; 75 const Byte *p = (const Byte *)data;
16 for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) 76 const Byte *lim;
77 for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC64_NUM_TABLES_USE & 4))) != 0; size--, p++)
17 v = CRC64_UPDATE_BYTE_2(v, *p); 78 v = CRC64_UPDATE_BYTE_2(v, *p);
18 for (; size >= 4; size -= 4, p += 4) 79 lim = p + size;
80 if (size >= Z7_CRC64_NUM_TABLES_USE)
19 { 81 {
20 const UInt32 d = (UInt32)v ^ *(const UInt32 *)(const void *)p; 82 lim -= Z7_CRC64_NUM_TABLES_USE;
21 v = (v >> 32) 83 do
22 ^ (table + 0x300)[((d ) & 0xFF)] 84 {
23 ^ (table + 0x200)[((d >> 8) & 0xFF)] 85#if Z7_CRC64_NUM_TABLES_USE == 4
24 ^ (table + 0x100)[((d >> 16) & 0xFF)] 86 const UInt32 d = (UInt32)v ^ R32(0);
25 ^ (table + 0x000)[((d >> 24))]; 87 v = (v >> 32) ^ Q32LE(0, d);
88#elif Z7_CRC64_NUM_TABLES_USE == 8
89#ifdef Z7_CRC64_USE_64BIT
90 v ^= R64(0);
91 v = Q64LE(0, v);
92#else
93 UInt32 v0, v1;
94 v0 = (UInt32)v ^ R32(0);
95 v1 = (UInt32)(v >> 32) ^ R32(1);
96 v = Q32LE(1, v0) ^ Q32LE(0, v1);
97#endif
98#elif Z7_CRC64_NUM_TABLES_USE == 12
99 UInt32 w;
100 UInt32 v0, v1;
101 v0 = (UInt32)v ^ R32(0);
102 v1 = (UInt32)(v >> 32) ^ R32(1);
103 w = R32(2);
104 v = Q32LE(0, w);
105 v ^= Q32LE(2, v0) ^ Q32LE(1, v1);
106#elif Z7_CRC64_NUM_TABLES_USE == 16
107#ifdef Z7_CRC64_USE_64BIT
108 UInt64 w;
109 UInt64 x;
110 w = R64(1); x = Q64LE(0, w);
111 v ^= R64(0); v = x ^ Q64LE(1, v);
112#else
113 UInt32 v0, v1;
114 UInt32 r0, r1;
115 v0 = (UInt32)v ^ R32(0);
116 v1 = (UInt32)(v >> 32) ^ R32(1);
117 r0 = R32(2);
118 r1 = R32(3);
119 v = Q32LE(1, r0) ^ Q32LE(0, r1);
120 v ^= Q32LE(3, v0) ^ Q32LE(2, v1);
121#endif
122#else
123#error Stop_Compiling_Bad_CRC64_NUM_TABLES
124#endif
125 p += Z7_CRC64_NUM_TABLES_USE;
126 }
127 while (p <= lim);
128 lim += Z7_CRC64_NUM_TABLES_USE;
26 } 129 }
27 for (; size > 0; size--, p++) 130 for (; p < lim; p++)
28 v = CRC64_UPDATE_BYTE_2(v, *p); 131 v = CRC64_UPDATE_BYTE_2(v, *p);
29 return v; 132 return v;
30} 133}
31 134
135#undef CRC64_UPDATE_BYTE_2
136#undef R32
137#undef R64
138#undef Q32LE
139#undef Q64LE
140#undef CRC64_FUNC_PRE_LE
141#undef CRC64_FUNC_PRE_LE2
142
32#endif 143#endif
33 144
34 145
146
147
35#ifndef MY_CPU_LE 148#ifndef MY_CPU_LE
36 149
37#define CRC64_UPDATE_BYTE_2_BE(crc, b) (table[(Byte)((crc) >> 56) ^ (b)] ^ ((crc) << 8)) 150#define CRC64_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 56) ^ (b)] ^ ((crc) << 8))
151
152#if defined(Z7_CRC64_USE_64BIT) && (Z7_CRC64_NUM_TABLES_USE % 8 == 0)
153
154#define Q64BE(n, d) \
155 ( (table + ((n) * 8 + 0) * 0x100)[(Byte)(d)] \
156 ^ (table + ((n) * 8 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \
157 ^ (table + ((n) * 8 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \
158 ^ (table + ((n) * 8 + 3) * 0x100)[((d) >> 3 * 8) & 0xFF] \
159 ^ (table + ((n) * 8 + 4) * 0x100)[((d) >> 4 * 8) & 0xFF] \
160 ^ (table + ((n) * 8 + 5) * 0x100)[((d) >> 5 * 8) & 0xFF] \
161 ^ (table + ((n) * 8 + 6) * 0x100)[((d) >> 6 * 8) & 0xFF] \
162 ^ (table + ((n) * 8 + 7) * 0x100)[((d) >> 7 * 8)] )
163
164#ifdef Z7_CRC64_DEBUG_BE
165 #define R64BE(a) GetBe64a((const UInt64 *)(const void *)p + (a))
166#else
167 #define R64BE(a) *((const UInt64 *)(const void *)p + (a))
168#endif
169
170#else
171
172#define Q32BE(n, d) \
173 ( (table + ((n) * 4 + 0) * 0x100)[(Byte)(d)] \
174 ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \
175 ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \
176 ^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] )
38 177
39UInt64 Z7_FASTCALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table); 178#ifdef Z7_CRC64_DEBUG_BE
40UInt64 Z7_FASTCALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table) 179 #define R32BE(a) GetBe32a((const UInt32 *)(const void *)p + (a))
180#else
181 #define R32BE(a) *((const UInt32 *)(const void *)p + (a))
182#endif
183
184#endif
185
186#define CRC64_FUNC_PRE_BE2(step) \
187UInt64 Z7_FASTCALL XzCrc64UpdateBeT ## step (UInt64 v, const void *data, size_t size, const UInt64 *table)
188
189#define CRC64_FUNC_PRE_BE(step) \
190 CRC64_FUNC_PRE_BE2(step); \
191 CRC64_FUNC_PRE_BE2(step)
192
193CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE)
41{ 194{
42 const Byte *p = (const Byte *)data; 195 const Byte *p = (const Byte *)data;
43 table += 0x100; 196 const Byte *lim;
44 v = Z7_BSWAP64(v); 197 v = Z7_BSWAP64(v);
45 for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) 198 for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC64_NUM_TABLES_USE & 4))) != 0; size--, p++)
46 v = CRC64_UPDATE_BYTE_2_BE(v, *p); 199 v = CRC64_UPDATE_BYTE_2_BE(v, *p);
47 for (; size >= 4; size -= 4, p += 4) 200 lim = p + size;
201 if (size >= Z7_CRC64_NUM_TABLES_USE)
48 { 202 {
49 const UInt32 d = (UInt32)(v >> 32) ^ *(const UInt32 *)(const void *)p; 203 lim -= Z7_CRC64_NUM_TABLES_USE;
50 v = (v << 32) 204 do
51 ^ (table + 0x000)[((d ) & 0xFF)] 205 {
52 ^ (table + 0x100)[((d >> 8) & 0xFF)] 206#if Z7_CRC64_NUM_TABLES_USE == 4
53 ^ (table + 0x200)[((d >> 16) & 0xFF)] 207 const UInt32 d = (UInt32)(v >> 32) ^ R32BE(0);
54 ^ (table + 0x300)[((d >> 24))]; 208 v = (v << 32) ^ Q32BE(0, d);
209#elif Z7_CRC64_NUM_TABLES_USE == 12
210 const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0);
211 const UInt32 d0 = (UInt32)(v ) ^ R32BE(1);
212 const UInt32 w = R32BE(2);
213 v = Q32BE(0, w);
214 v ^= Q32BE(2, d1) ^ Q32BE(1, d0);
215
216#elif Z7_CRC64_NUM_TABLES_USE == 8
217 #ifdef Z7_CRC64_USE_64BIT
218 v ^= R64BE(0);
219 v = Q64BE(0, v);
220 #else
221 const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0);
222 const UInt32 d0 = (UInt32)(v ) ^ R32BE(1);
223 v = Q32BE(1, d1) ^ Q32BE(0, d0);
224 #endif
225#elif Z7_CRC64_NUM_TABLES_USE == 16
226 #ifdef Z7_CRC64_USE_64BIT
227 const UInt64 w = R64BE(1);
228 v ^= R64BE(0);
229 v = Q64BE(0, w) ^ Q64BE(1, v);
230 #else
231 const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0);
232 const UInt32 d0 = (UInt32)(v ) ^ R32BE(1);
233 const UInt32 w1 = R32BE(2);
234 const UInt32 w0 = R32BE(3);
235 v = Q32BE(1, w1) ^ Q32BE(0, w0);
236 v ^= Q32BE(3, d1) ^ Q32BE(2, d0);
237 #endif
238#else
239#error Stop_Compiling_Bad_CRC64_NUM_TABLES
240#endif
241 p += Z7_CRC64_NUM_TABLES_USE;
242 }
243 while (p <= lim);
244 lim += Z7_CRC64_NUM_TABLES_USE;
55 } 245 }
56 for (; size > 0; size--, p++) 246 for (; p < lim; p++)
57 v = CRC64_UPDATE_BYTE_2_BE(v, *p); 247 v = CRC64_UPDATE_BYTE_2_BE(v, *p);
58 return Z7_BSWAP64(v); 248 return Z7_BSWAP64(v);
59} 249}
60 250
251#undef CRC64_UPDATE_BYTE_2_BE
252#undef R32BE
253#undef R64BE
254#undef Q32BE
255#undef Q64BE
256#undef CRC64_FUNC_PRE_BE
257#undef CRC64_FUNC_PRE_BE2
258
259#endif
260#undef Z7_CRC64_NUM_TABLES_USE
61#endif 261#endif
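
The byte-at-a-time pre-loops in both functions align the pointer using the mask (7 - (Z7_CRC64_NUM_TABLES_USE & 4)); since the earlier #error check restricts the slice width to 4, 8, 12 or 16, the mask works out as follows:

/*
  Z7_CRC64_NUM_TABLES_USE = 4  : 7 - 4 = 3 -> align p to 4 bytes (32-bit reads)
  Z7_CRC64_NUM_TABLES_USE = 8  : 7 - 0 = 7 -> align p to 8 bytes (64-bit reads)
  Z7_CRC64_NUM_TABLES_USE = 12 : 7 - 4 = 3 -> align p to 4 bytes
  Z7_CRC64_NUM_TABLES_USE = 16 : 7 - 0 = 7 -> align p to 8 bytes
*/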
diff --git a/C/XzDec.c b/C/XzDec.c
index a5f7039..3d1c98e 100644
--- a/C/XzDec.c
+++ b/C/XzDec.c
@@ -1,5 +1,5 @@
1/* XzDec.c -- Xz Decode 1/* XzDec.c -- Xz Decode
22023-04-13 : Igor Pavlov : Public domain */ 22024-03-01 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -105,30 +105,32 @@ static SRes XzBcFilterState_SetProps(void *pp, const Byte *props, size_t propSiz
105 { 105 {
106 if (propSize != 1) 106 if (propSize != 1)
107 return SZ_ERROR_UNSUPPORTED; 107 return SZ_ERROR_UNSUPPORTED;
108 p->delta = (unsigned)props[0] + 1; 108 p->delta = (UInt32)props[0] + 1;
109 } 109 }
110 else 110 else
111 { 111 {
112 if (propSize == 4) 112 if (propSize == 4)
113 { 113 {
114 UInt32 v = GetUi32(props); 114 const UInt32 v = GetUi32(props);
115 switch (p->methodId) 115 switch (p->methodId)
116 { 116 {
117 case XZ_ID_PPC: 117 case XZ_ID_PPC:
118 case XZ_ID_ARM: 118 case XZ_ID_ARM:
119 case XZ_ID_SPARC: 119 case XZ_ID_SPARC:
120 case XZ_ID_ARM64: 120 case XZ_ID_ARM64:
121 if ((v & 3) != 0) 121 if (v & 3)
122 return SZ_ERROR_UNSUPPORTED; 122 return SZ_ERROR_UNSUPPORTED;
123 break; 123 break;
124 case XZ_ID_ARMT: 124 case XZ_ID_ARMT:
125 if ((v & 1) != 0) 125 case XZ_ID_RISCV:
126 if (v & 1)
126 return SZ_ERROR_UNSUPPORTED; 127 return SZ_ERROR_UNSUPPORTED;
127 break; 128 break;
128 case XZ_ID_IA64: 129 case XZ_ID_IA64:
129 if ((v & 0xF) != 0) 130 if (v & 0xf)
130 return SZ_ERROR_UNSUPPORTED; 131 return SZ_ERROR_UNSUPPORTED;
131 break; 132 break;
133 default: break;
132 } 134 }
133 p->ip = v; 135 p->ip = v;
134 } 136 }
@@ -151,12 +153,13 @@ static void XzBcFilterState_Init(void *pp)
151 153
152static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Dec[] = 154static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Dec[] =
153{ 155{
154 Z7_BRANCH_CONV_DEC(PPC), 156 Z7_BRANCH_CONV_DEC_2 (BranchConv_PPC),
155 Z7_BRANCH_CONV_DEC(IA64), 157 Z7_BRANCH_CONV_DEC_2 (BranchConv_IA64),
156 Z7_BRANCH_CONV_DEC(ARM), 158 Z7_BRANCH_CONV_DEC_2 (BranchConv_ARM),
157 Z7_BRANCH_CONV_DEC(ARMT), 159 Z7_BRANCH_CONV_DEC_2 (BranchConv_ARMT),
158 Z7_BRANCH_CONV_DEC(SPARC), 160 Z7_BRANCH_CONV_DEC_2 (BranchConv_SPARC),
159 Z7_BRANCH_CONV_DEC(ARM64) 161 Z7_BRANCH_CONV_DEC_2 (BranchConv_ARM64),
162 Z7_BRANCH_CONV_DEC_2 (BranchConv_RISCV)
160}; 163};
161 164
162static SizeT XzBcFilterStateBase_Filter_Dec(CXzBcFilterStateBase *p, Byte *data, SizeT size) 165static SizeT XzBcFilterStateBase_Filter_Dec(CXzBcFilterStateBase *p, Byte *data, SizeT size)
@@ -262,7 +265,7 @@ static SRes XzBcFilterState_Code2(void *pp,
262 265
263 266
264#define XZ_IS_SUPPORTED_FILTER_ID(id) \ 267#define XZ_IS_SUPPORTED_FILTER_ID(id) \
265 ((id) >= XZ_ID_Delta && (id) <= XZ_ID_ARM64) 268 ((id) >= XZ_ID_Delta && (id) <= XZ_ID_RISCV)
266 269
267SRes Xz_StateCoder_Bc_SetFromMethod_Func(IStateCoder *p, UInt64 id, 270SRes Xz_StateCoder_Bc_SetFromMethod_Func(IStateCoder *p, UInt64 id,
268 Xz_Func_BcFilterStateBase_Filter func, ISzAllocPtr alloc) 271 Xz_Func_BcFilterStateBase_Filter func, ISzAllocPtr alloc)
@@ -541,13 +544,12 @@ static SRes MixCoder_SetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 met
541{ 544{
542 IStateCoder *sc = &p->coders[coderIndex]; 545 IStateCoder *sc = &p->coders[coderIndex];
543 p->ids[coderIndex] = methodId; 546 p->ids[coderIndex] = methodId;
544 switch (methodId) 547 if (methodId == XZ_ID_LZMA2)
545 { 548 return Lzma2State_SetFromMethod(sc, outBuf, outBufSize, p->alloc);
546 case XZ_ID_LZMA2: return Lzma2State_SetFromMethod(sc, outBuf, outBufSize, p->alloc); 549#ifdef USE_SUBBLOCK
547 #ifdef USE_SUBBLOCK 550 if (methodId == XZ_ID_Subblock)
548 case XZ_ID_Subblock: return SbState_SetFromMethod(sc, p->alloc); 551 return SbState_SetFromMethod(sc, p->alloc);
549 #endif 552#endif
550 }
551 if (coderIndex == 0) 553 if (coderIndex == 0)
552 return SZ_ERROR_UNSUPPORTED; 554 return SZ_ERROR_UNSUPPORTED;
553 return Xz_StateCoder_Bc_SetFromMethod_Func(sc, methodId, 555 return Xz_StateCoder_Bc_SetFromMethod_Func(sc, methodId,
@@ -558,10 +560,8 @@ static SRes MixCoder_SetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 met
558static SRes MixCoder_ResetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 methodId, Byte *outBuf, size_t outBufSize) 560static SRes MixCoder_ResetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 methodId, Byte *outBuf, size_t outBufSize)
559{ 561{
560 IStateCoder *sc = &p->coders[coderIndex]; 562 IStateCoder *sc = &p->coders[coderIndex];
561 switch (methodId) 563 if (methodId == XZ_ID_LZMA2)
562 { 564 return Lzma2State_ResetOutBuf(sc, outBuf, outBufSize);
563 case XZ_ID_LZMA2: return Lzma2State_ResetOutBuf(sc, outBuf, outBufSize);
564 }
565 return SZ_ERROR_UNSUPPORTED; 565 return SZ_ERROR_UNSUPPORTED;
566} 566}
567 567
@@ -804,7 +804,7 @@ static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte
804} 804}
805 805
806#define READ_VARINT_AND_CHECK(buf, pos, size, res) \ 806#define READ_VARINT_AND_CHECK(buf, pos, size, res) \
807 { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ 807 { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \
808 if (s == 0) return SZ_ERROR_ARCHIVE; \ 808 if (s == 0) return SZ_ERROR_ARCHIVE; \
809 pos += s; } 809 pos += s; }
810 810
@@ -1034,7 +1034,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1034 SRes res; 1034 SRes res;
1035 1035
1036 ECoderFinishMode finishMode2 = finishMode; 1036 ECoderFinishMode finishMode2 = finishMode;
1037 BoolInt srcFinished2 = srcFinished; 1037 BoolInt srcFinished2 = (BoolInt)srcFinished;
1038 BoolInt destFinish = False; 1038 BoolInt destFinish = False;
1039 1039
1040 if (p->block.packSize != (UInt64)(Int64)-1) 1040 if (p->block.packSize != (UInt64)(Int64)-1)
@@ -1127,7 +1127,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1127 return SZ_OK; 1127 return SZ_OK;
1128 } 1128 }
1129 1129
1130 switch (p->state) 1130 switch ((int)p->state)
1131 { 1131 {
1132 case XZ_STATE_STREAM_HEADER: 1132 case XZ_STATE_STREAM_HEADER:
1133 { 1133 {
@@ -1172,15 +1172,15 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1172 p->state = XZ_STATE_STREAM_INDEX; 1172 p->state = XZ_STATE_STREAM_INDEX;
1173 break; 1173 break;
1174 } 1174 }
1175 p->blockHeaderSize = ((UInt32)p->buf[0] << 2) + 4; 1175 p->blockHeaderSize = ((unsigned)p->buf[0] << 2) + 4;
1176 break; 1176 break;
1177 } 1177 }
1178 1178
1179 if (p->pos != p->blockHeaderSize) 1179 if (p->pos != p->blockHeaderSize)
1180 { 1180 {
1181 UInt32 cur = p->blockHeaderSize - p->pos; 1181 unsigned cur = p->blockHeaderSize - p->pos;
1182 if (cur > srcRem) 1182 if (cur > srcRem)
1183 cur = (UInt32)srcRem; 1183 cur = (unsigned)srcRem;
1184 memcpy(p->buf + p->pos, src, cur); 1184 memcpy(p->buf + p->pos, src, cur);
1185 p->pos += cur; 1185 p->pos += cur;
1186 (*srcLen) += cur; 1186 (*srcLen) += cur;
@@ -1222,8 +1222,8 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1222 } 1222 }
1223 else 1223 else
1224 { 1224 {
1225 UInt32 checkSize = XzFlags_GetCheckSize(p->streamFlags); 1225 const unsigned checkSize = XzFlags_GetCheckSize(p->streamFlags);
1226 UInt32 cur = checkSize - p->pos; 1226 unsigned cur = checkSize - p->pos;
1227 if (cur != 0) 1227 if (cur != 0)
1228 { 1228 {
1229 if (srcRem == 0) 1229 if (srcRem == 0)
@@ -1232,7 +1232,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1232 return SZ_OK; 1232 return SZ_OK;
1233 } 1233 }
1234 if (cur > srcRem) 1234 if (cur > srcRem)
1235 cur = (UInt32)srcRem; 1235 cur = (unsigned)srcRem;
1236 memcpy(p->buf + p->pos, src, cur); 1236 memcpy(p->buf + p->pos, src, cur);
1237 p->pos += cur; 1237 p->pos += cur;
1238 (*srcLen) += cur; 1238 (*srcLen) += cur;
@@ -1321,9 +1321,9 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1321 1321
1322 case XZ_STATE_STREAM_FOOTER: 1322 case XZ_STATE_STREAM_FOOTER:
1323 { 1323 {
1324 UInt32 cur = XZ_STREAM_FOOTER_SIZE - p->pos; 1324 unsigned cur = XZ_STREAM_FOOTER_SIZE - p->pos;
1325 if (cur > srcRem) 1325 if (cur > srcRem)
1326 cur = (UInt32)srcRem; 1326 cur = (unsigned)srcRem;
1327 memcpy(p->buf + p->pos, src, cur); 1327 memcpy(p->buf + p->pos, src, cur);
1328 p->pos += cur; 1328 p->pos += cur;
1329 (*srcLen) += cur; 1329 (*srcLen) += cur;
@@ -1358,6 +1358,8 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen,
1358 } 1358 }
1359 1359
1360 case XZ_STATE_BLOCK: break; /* to disable GCC warning */ 1360 case XZ_STATE_BLOCK: break; /* to disable GCC warning */
1361
1362 default: return SZ_ERROR_FAIL;
1361 } 1363 }
1362 } 1364 }
1363 /* 1365 /*
@@ -1773,10 +1775,10 @@ static void XzDecMt_Callback_Parse(void *obj, unsigned coderIndex, CMtDecCallbac
1773 } 1775 }
1774 } 1776 }
1775 { 1777 {
1776 UInt64 packSize = block->packSize; 1778 const UInt64 packSize = block->packSize;
1777 UInt64 packSizeAligned = packSize + ((0 - (unsigned)packSize) & 3); 1779 const UInt64 packSizeAligned = packSize + ((0 - (unsigned)packSize) & 3);
1778 UInt32 checkSize = XzFlags_GetCheckSize(coder->dec.streamFlags); 1780 const unsigned checkSize = XzFlags_GetCheckSize(coder->dec.streamFlags);
1779 UInt64 blockPackSum = coder->inPreSize + packSizeAligned + checkSize; 1781 const UInt64 blockPackSum = coder->inPreSize + packSizeAligned + checkSize;
1780 // if (blockPackSum <= me->props.inBlockMax) 1782 // if (blockPackSum <= me->props.inBlockMax)
1781 // unpackBlockMaxSize 1783 // unpackBlockMaxSize
1782 { 1784 {
@@ -2381,7 +2383,7 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p
2381 if (tMode) 2383 if (tMode)
2382 { 2384 {
2383 XzDecMt_FreeOutBufs(p); 2385 XzDecMt_FreeOutBufs(p);
2384 tMode = MtDec_PrepareRead(&p->mtc); 2386 tMode = (BoolInt)MtDec_PrepareRead(&p->mtc);
2385 } 2387 }
2386 #endif 2388 #endif
2387 2389
@@ -2644,7 +2646,7 @@ SRes XzDecMt_Decode(CXzDecMtHandle p,
2644 p->outSize = *outDataSize; 2646 p->outSize = *outDataSize;
2645 } 2647 }
2646 2648
2647 p->finishMode = finishMode; 2649 p->finishMode = (BoolInt)finishMode;
2648 2650
2649 // p->outSize = 457; p->outSize_Defined = True; p->finishMode = False; // for test 2651 // p->outSize = 457; p->outSize_Defined = True; p->finishMode = False; // for test
2650 2652
diff --git a/C/XzEnc.c b/C/XzEnc.c
index 22408e2..c1affad 100644
--- a/C/XzEnc.c
+++ b/C/XzEnc.c
@@ -1,5 +1,5 @@
1/* XzEnc.c -- Xz Encode 1/* XzEnc.c -- Xz Encode
22023-04-13 : Igor Pavlov : Public domain */ 22024-03-01 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -29,8 +29,9 @@
29 29
30#define XZ_GET_PAD_SIZE(dataSize) ((4 - ((unsigned)(dataSize) & 3)) & 3) 30#define XZ_GET_PAD_SIZE(dataSize) ((4 - ((unsigned)(dataSize) & 3)) & 3)
31 31
32/* max pack size for LZMA2 block + check-64bytrs: */ 32#define XZ_CHECK_SIZE_MAX 64
33#define XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize) ((unpackSize) + ((unpackSize) >> 10) + 16 + 64) 33/* max pack size for LZMA2 block + pad4 + check_size: */
34#define XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize) ((unpackSize) + ((unpackSize) >> 10) + 16 + XZ_CHECK_SIZE_MAX)
34 35
35#define XZ_GET_ESTIMATED_BLOCK_TOTAL_PACK_SIZE(unpackSize) (XZ_BLOCK_HEADER_SIZE_MAX + XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize)) 36#define XZ_GET_ESTIMATED_BLOCK_TOTAL_PACK_SIZE(unpackSize) (XZ_BLOCK_HEADER_SIZE_MAX + XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize))
36 37
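
A quick worked evaluation of the bound defined above, for one example input size:

/* unpackSize = 1 MiB:
   1048576 + (1048576 >> 10) + 16 + XZ_CHECK_SIZE_MAX
 = 1048576 + 1024 + 16 + 64 = 1049680 bytes. */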
@@ -325,12 +326,13 @@ typedef struct
325 326
326static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Enc[] = 327static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Enc[] =
327{ 328{
328 Z7_BRANCH_CONV_ENC(PPC), 329 Z7_BRANCH_CONV_ENC_2 (BranchConv_PPC),
329 Z7_BRANCH_CONV_ENC(IA64), 330 Z7_BRANCH_CONV_ENC_2 (BranchConv_IA64),
330 Z7_BRANCH_CONV_ENC(ARM), 331 Z7_BRANCH_CONV_ENC_2 (BranchConv_ARM),
331 Z7_BRANCH_CONV_ENC(ARMT), 332 Z7_BRANCH_CONV_ENC_2 (BranchConv_ARMT),
332 Z7_BRANCH_CONV_ENC(SPARC), 333 Z7_BRANCH_CONV_ENC_2 (BranchConv_SPARC),
333 Z7_BRANCH_CONV_ENC(ARM64) 334 Z7_BRANCH_CONV_ENC_2 (BranchConv_ARM64),
335 Z7_BRANCH_CONV_ENC_2 (BranchConv_RISCV)
334}; 336};
335 337
336static SizeT XzBcFilterStateBase_Filter_Enc(CXzBcFilterStateBase *p, Byte *data, SizeT size) 338static SizeT XzBcFilterStateBase_Filter_Enc(CXzBcFilterStateBase *p, Byte *data, SizeT size)
@@ -888,9 +890,9 @@ static SRes Xz_CompressBlock(
888 blockSizes->unpackSize = checkInStream.processed; 890 blockSizes->unpackSize = checkInStream.processed;
889 } 891 }
890 { 892 {
891 Byte buf[4 + 64]; 893 Byte buf[4 + XZ_CHECK_SIZE_MAX];
892 unsigned padSize = XZ_GET_PAD_SIZE(seqSizeOutStream.processed); 894 const unsigned padSize = XZ_GET_PAD_SIZE(seqSizeOutStream.processed);
893 UInt64 packSize = seqSizeOutStream.processed; 895 const UInt64 packSize = seqSizeOutStream.processed;
894 896
895 buf[0] = 0; 897 buf[0] = 0;
896 buf[1] = 0; 898 buf[1] = 0;
@@ -898,7 +900,8 @@ static SRes Xz_CompressBlock(
898 buf[3] = 0; 900 buf[3] = 0;
899 901
900 SeqCheckInStream_GetDigest(&checkInStream, buf + 4); 902 SeqCheckInStream_GetDigest(&checkInStream, buf + 4);
901 RINOK(WriteBytes(&seqSizeOutStream.vt, buf + (4 - padSize), padSize + XzFlags_GetCheckSize((CXzStreamFlags)props->checkId))) 903 RINOK(WriteBytes(&seqSizeOutStream.vt, buf + (4 - padSize),
904 padSize + XzFlags_GetCheckSize((CXzStreamFlags)props->checkId)))
902 905
903 blockSizes->totalSize = seqSizeOutStream.processed - padSize; 906 blockSizes->totalSize = seqSizeOutStream.processed - padSize;
904 907
@@ -1083,18 +1086,19 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf
1083 CXzEnc *me = (CXzEnc *)pp; 1086 CXzEnc *me = (CXzEnc *)pp;
1084 SRes res; 1087 SRes res;
1085 CMtProgressThunk progressThunk; 1088 CMtProgressThunk progressThunk;
1086 1089 Byte *dest;
1087 Byte *dest = me->outBufs[outBufIndex];
1088
1089 UNUSED_VAR(finished) 1090 UNUSED_VAR(finished)
1090
1091 { 1091 {
1092 CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; 1092 CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex];
1093 bInfo->totalSize = 0; 1093 bInfo->totalSize = 0;
1094 bInfo->unpackSize = 0; 1094 bInfo->unpackSize = 0;
1095 bInfo->headerSize = 0; 1095 bInfo->headerSize = 0;
1096 // v23.02: we don't compress empty blocks
1097 // also we must ignore that empty block in XzEnc_MtCallback_Write()
1098 if (srcSize == 0)
1099 return SZ_OK;
1096 } 1100 }
1097 1101 dest = me->outBufs[outBufIndex];
1098 if (!dest) 1102 if (!dest)
1099 { 1103 {
1100 dest = (Byte *)ISzAlloc_Alloc(me->alloc, me->outBufSize); 1104 dest = (Byte *)ISzAlloc_Alloc(me->alloc, me->outBufSize);
@@ -1140,18 +1144,20 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf
1140static SRes XzEnc_MtCallback_Write(void *pp, unsigned outBufIndex) 1144static SRes XzEnc_MtCallback_Write(void *pp, unsigned outBufIndex)
1141{ 1145{
1142 CXzEnc *me = (CXzEnc *)pp; 1146 CXzEnc *me = (CXzEnc *)pp;
1143
1144 const CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; 1147 const CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex];
1145 const Byte *data = me->outBufs[outBufIndex]; 1148 // v23.02: we don't write empty blocks
1146 1149 // note: if (bInfo->unpackSize == 0) then there is no compressed data of block
1147 RINOK(WriteBytes(me->outStream, data, bInfo->headerSize)) 1150 if (bInfo->unpackSize == 0)
1148 1151 return SZ_OK;
1149 { 1152 {
1150 UInt64 totalPackFull = bInfo->totalSize + XZ_GET_PAD_SIZE(bInfo->totalSize); 1153 const Byte *data = me->outBufs[outBufIndex];
1151 RINOK(WriteBytes(me->outStream, data + XZ_BLOCK_HEADER_SIZE_MAX, (size_t)totalPackFull - bInfo->headerSize)) 1154 RINOK(WriteBytes(me->outStream, data, bInfo->headerSize))
1155 {
1156 const UInt64 totalPackFull = bInfo->totalSize + XZ_GET_PAD_SIZE(bInfo->totalSize);
1157 RINOK(WriteBytes(me->outStream, data + XZ_BLOCK_HEADER_SIZE_MAX, (size_t)totalPackFull - bInfo->headerSize))
1158 }
1159 return XzEncIndex_AddIndexRecord(&me->xzIndex, bInfo->unpackSize, bInfo->totalSize, me->alloc);
1152 } 1160 }
1153
1154 return XzEncIndex_AddIndexRecord(&me->xzIndex, bInfo->unpackSize, bInfo->totalSize, me->alloc);
1155} 1161}
1156 1162
1157#endif 1163#endif
diff --git a/C/XzIn.c b/C/XzIn.c
index d0fc763..b68af96 100644
--- a/C/XzIn.c
+++ b/C/XzIn.c
@@ -1,5 +1,5 @@
1/* XzIn.c - Xz input 1/* XzIn.c - Xz input
22023-04-02 : Igor Pavlov : Public domain */ 22023-09-07 : Igor Pavlov : Public domain */
3 3
4#include "Precomp.h" 4#include "Precomp.h"
5 5
@@ -27,7 +27,7 @@ SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream)
27} 27}
28 28
29#define READ_VARINT_AND_CHECK(buf, pos, size, res) \ 29#define READ_VARINT_AND_CHECK(buf, pos, size, res) \
30 { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ 30 { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \
31 if (s == 0) return SZ_ERROR_ARCHIVE; \ 31 if (s == 0) return SZ_ERROR_ARCHIVE; \
32 pos += s; } 32 pos += s; }
33 33
@@ -37,7 +37,7 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex,
37 unsigned headerSize; 37 unsigned headerSize;
38 *headerSizeRes = 0; 38 *headerSizeRes = 0;
39 RINOK(SeqInStream_ReadByte(inStream, &header[0])) 39 RINOK(SeqInStream_ReadByte(inStream, &header[0]))
40 headerSize = (unsigned)header[0]; 40 headerSize = header[0];
41 if (headerSize == 0) 41 if (headerSize == 0)
42 { 42 {
43 *headerSizeRes = 1; 43 *headerSizeRes = 1;
@@ -47,7 +47,7 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex,
47 47
48 *isIndex = False; 48 *isIndex = False;
49 headerSize = (headerSize << 2) + 4; 49 headerSize = (headerSize << 2) + 4;
50 *headerSizeRes = headerSize; 50 *headerSizeRes = (UInt32)headerSize;
51 { 51 {
52 size_t processedSize = headerSize - 1; 52 size_t processedSize = headerSize - 1;
53 RINOK(SeqInStream_ReadMax(inStream, header + 1, &processedSize)) 53 RINOK(SeqInStream_ReadMax(inStream, header + 1, &processedSize))
@@ -58,7 +58,7 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex,
58} 58}
59 59
60#define ADD_SIZE_CHECK(size, val) \ 60#define ADD_SIZE_CHECK(size, val) \
61 { UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } 61 { const UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; }
62 62
63UInt64 Xz_GetUnpackSize(const CXzStream *p) 63UInt64 Xz_GetUnpackSize(const CXzStream *p)
64{ 64{
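
The ADD_SIZE_CHECK macro above detects UInt64 wraparound through the (newSize < size) comparison; one boundary example:

/* size = 0xFFFFFFFFFFFFFFFE, val = 3:
   newSize = 1 (mod 2^64), and (1 < 0xFFFFFFFFFFFFFFFE),
   so XZ_SIZE_OVERFLOW is returned. */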
diff --git a/C/ZstdDec.c b/C/ZstdDec.c
new file mode 100644
index 0000000..ecf6d22
--- /dev/null
+++ b/C/ZstdDec.c
@@ -0,0 +1,4064 @@
1/* ZstdDec.c -- Zstd Decoder
22024-01-21 : the code was developed by Igor Pavlov, using Zstandard format
3 specification and original zstd decoder code as reference code.
4original zstd decoder code: Copyright (c) Facebook, Inc. All rights reserved.
5This source code is licensed under BSD 3-Clause License.
6*/
7
8#include "Precomp.h"
9
10#include <string.h>
11#include <stdlib.h>
12// #include <stdio.h>
13
14#include "Alloc.h"
15#include "Xxh64.h"
16#include "ZstdDec.h"
17#include "CpuArch.h"
18
19#if defined(MY_CPU_ARM64)
20#include <arm_neon.h>
21#endif
22
23/* original-zstd still doesn't support a window larger than 2 GiB.
24 So we also limit our decoder to a 2 GiB window: */
25#if defined(MY_CPU_64BIT) && 0 == 1
26 #define MAX_WINDOW_SIZE_LOG 41
27#else
28 #define MAX_WINDOW_SIZE_LOG 31
29#endif
30
31typedef
32 #if MAX_WINDOW_SIZE_LOG < 32
33 UInt32
34 #else
35 size_t
36 #endif
37 CZstdDecOffset;
38
39// for debug: simpler and smaller code but slow:
40// #define Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
41
42// #define SHOW_STAT
43#ifdef SHOW_STAT
44#include <stdio.h>
45static unsigned g_Num_Blocks_Compressed = 0;
46static unsigned g_Num_Blocks_memcpy = 0;
47static unsigned g_Num_Wrap_memmove_Num = 0;
48static unsigned g_Num_Wrap_memmove_Bytes = 0;
49static unsigned g_NumSeqs_total = 0;
50// static unsigned g_NumCopy = 0;
51static unsigned g_NumOver = 0;
52static unsigned g_NumOver2 = 0;
53static unsigned g_Num_Match = 0;
54static unsigned g_Num_Lits = 0;
55static unsigned g_Num_LitsBig = 0;
56static unsigned g_Num_Lit0 = 0;
57static unsigned g_Num_Rep0 = 0;
58static unsigned g_Num_Rep1 = 0;
59static unsigned g_Num_Rep2 = 0;
60static unsigned g_Num_Rep3 = 0;
61static unsigned g_Num_Threshold_0 = 0;
62static unsigned g_Num_Threshold_1 = 0;
63static unsigned g_Num_Threshold_0sum = 0;
64static unsigned g_Num_Threshold_1sum = 0;
65#define STAT_UPDATE(v) v
66#else
67#define STAT_UPDATE(v)
68#endif
69#define STAT_INC(v) STAT_UPDATE(v++;)
70
71
72typedef struct
73{
74 const Byte *ptr;
75 size_t len;
76}
77CInBufPair;
78
79
80#if defined(MY_CPU_ARM_OR_ARM64) || defined(MY_CPU_X86_OR_AMD64)
81 #if (defined(__clang__) && (__clang_major__ >= 6)) \
82 || (defined(__GNUC__) && (__GNUC__ >= 6))
83 // disable for debug:
84 #define Z7_ZSTD_DEC_USE_BSR
85 #elif defined(_MSC_VER) && (_MSC_VER >= 1300)
86 // #if defined(MY_CPU_ARM_OR_ARM64)
87 #if (_MSC_VER >= 1600)
88 #include <intrin.h>
89 #endif
90 // disable for debug:
91 #define Z7_ZSTD_DEC_USE_BSR
92 #endif
93#endif
94
95#ifdef Z7_ZSTD_DEC_USE_BSR
96 #if defined(__clang__) || defined(__GNUC__)
97 #define MY_clz(x) ((unsigned)__builtin_clz((UInt32)x))
98 #else // #if defined(_MSC_VER)
99 #ifdef MY_CPU_ARM_OR_ARM64
100 #define MY_clz _CountLeadingZeros
101 #endif // MY_CPU_ARM_OR_ARM64
102 #endif // _MSC_VER
103#elif !defined(Z7_ZSTD_DEC_USE_LOG_TABLE)
104 #define Z7_ZSTD_DEC_USE_LOG_TABLE
105#endif
106
107
108static
109Z7_FORCE_INLINE
110unsigned GetHighestSetBit_32_nonzero_big(UInt32 num)
111{
112 // (num != 0)
113 #ifdef MY_clz
114 return 31 - MY_clz(num);
115 #elif defined(Z7_ZSTD_DEC_USE_BSR)
116 {
117 unsigned long zz;
118 _BitScanReverse(&zz, num);
119 return zz;
120 }
121 #else
122 {
123 int i = -1;
124 for (;;)
125 {
126 i++;
127 num >>= 1;
128 if (num == 0)
129 return (unsigned)i;
130 }
131 }
132 #endif
133}
134
135#ifdef Z7_ZSTD_DEC_USE_LOG_TABLE
136
137#define R1(a) a, a
138#define R2(a) R1(a), R1(a)
139#define R3(a) R2(a), R2(a)
140#define R4(a) R3(a), R3(a)
141#define R5(a) R4(a), R4(a)
142#define R6(a) R5(a), R5(a)
143#define R7(a) R6(a), R6(a)
144#define R8(a) R7(a), R7(a)
145#define R9(a) R8(a), R8(a)
146
147#define Z7_ZSTD_FSE_MAX_ACCURACY 9
148// states[] values in FSE_Generate() can use (Z7_ZSTD_FSE_MAX_ACCURACY + 1) bits.
149static const Byte k_zstd_LogTable[2 << Z7_ZSTD_FSE_MAX_ACCURACY] =
150{
151 R1(0), R1(1), R2(2), R3(3), R4(4), R5(5), R6(6), R7(7), R8(8), R9(9)
152};
153
154#define GetHighestSetBit_32_nonzero_small(num) (k_zstd_LogTable[num])
155#else
156#define GetHighestSetBit_32_nonzero_small GetHighestSetBit_32_nonzero_big
157#endif
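
/* Note: when Z7_ZSTD_DEC_USE_LOG_TABLE is defined, the R1..R9 expansion above
   fills k_zstd_LogTable[n] with floor(log2(n)):
   [1] == 0, [2..3] == 1, [4..7] == 2, ..., [512..1023] == 9,
   valid for 1 <= n < (2 << Z7_ZSTD_FSE_MAX_ACCURACY). */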
158
159
160#ifdef MY_clz
161 #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \
162 bitOffset -= (CBitCtr)(MY_clz(b) - 23);
163#elif defined(Z7_ZSTD_DEC_USE_BSR)
164 #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \
165 { unsigned long zz; _BitScanReverse(&zz, b); bitOffset -= 8; bitOffset += zz; }
166#else
167 #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \
168 for (;;) { bitOffset--; if (b & 0x80) { break; } b <<= 1; }
169#endif
170
171#define SET_bitOffset_TO_PAD(bitOffset, src, srcLen) \
172{ \
173 unsigned lastByte = (src)[(size_t)(srcLen) - 1]; \
174 if (lastByte == 0) return SZ_ERROR_DATA; \
175 bitOffset = (CBitCtr)((srcLen) * 8); \
176 UPDATE_BIT_OFFSET_FOR_PADDING(lastByte, bitOffset) \
177}
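
/* Both UPDATE_BIT_OFFSET_FOR_PADDING variants locate the mandatory '1' sentinel
   bit at the top of the last byte; two boundary cases:
     lastByte == 0x80: sentinel is bit 7, bitOffset decreases by 1 (clz == 24, 24 - 23 == 1);
     lastByte == 0x01: sentinel is bit 0, bitOffset decreases by 8 (clz == 31, 31 - 23 == 8). */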
178
179#ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
180
181#define SET_bitOffset_TO_PAD_and_SET_BIT_SIZE(bitOffset, src, srcLen_res) \
182{ \
183 unsigned lastByte = (src)[(size_t)(srcLen_res) - 1]; \
184 if (lastByte == 0) return SZ_ERROR_DATA; \
185 srcLen_res *= 8; \
186 bitOffset = (CBitCtr)srcLen_res; \
187 UPDATE_BIT_OFFSET_FOR_PADDING(lastByte, bitOffset) \
188}
189
190#endif
191
192/*
193typedef Int32 CBitCtr_signed;
194typedef Int32 CBitCtr;
195*/
196// /*
197typedef ptrdiff_t CBitCtr_signed;
198typedef ptrdiff_t CBitCtr;
199// */
200
201
202#define MATCH_LEN_MIN 3
203#define kBlockSizeMax (1u << 17)
204
205// #define Z7_ZSTD_DEC_PRINT_TABLE
206
207#ifdef Z7_ZSTD_DEC_PRINT_TABLE
208#define NUM_OFFSET_SYMBOLS_PREDEF 29
209#endif
210#define NUM_OFFSET_SYMBOLS_MAX (MAX_WINDOW_SIZE_LOG + 1) // 32
211#define NUM_LL_SYMBOLS 36
212#define NUM_ML_SYMBOLS 53
213#define FSE_NUM_SYMBOLS_MAX 53 // NUM_ML_SYMBOLS
214
215// /*
216#if !defined(MY_CPU_X86) || defined(__PIC__) || defined(MY_CPU_64BIT)
217#define Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
218#endif
219// */
220// for debug:
221// #define Z7_ZSTD_DEC_USE_BASES_LOCAL
222// #define Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
223
224#define GLOBAL_TABLE(n) k_ ## n
225
226#if defined(Z7_ZSTD_DEC_USE_BASES_LOCAL)
227 #define BASES_TABLE(n) a_ ## n
228#elif defined(Z7_ZSTD_DEC_USE_BASES_IN_OBJECT)
229 #define BASES_TABLE(n) p->m_ ## n
230#else
231 #define BASES_TABLE(n) GLOBAL_TABLE(n)
232#endif
233
234#define Z7_ZSTD_DEC_USE_ML_PLUS3
235
236#if defined(Z7_ZSTD_DEC_USE_BASES_LOCAL) || \
237 defined(Z7_ZSTD_DEC_USE_BASES_IN_OBJECT)
238
239#define SEQ_EXTRA_TABLES(n) \
240 Byte n ## SEQ_LL_EXTRA [NUM_LL_SYMBOLS]; \
241 Byte n ## SEQ_ML_EXTRA [NUM_ML_SYMBOLS]; \
242 UInt32 n ## SEQ_LL_BASES [NUM_LL_SYMBOLS]; \
243 UInt32 n ## SEQ_ML_BASES [NUM_ML_SYMBOLS]; \
244
245#define Z7_ZSTD_DEC_USE_BASES_CALC
246
247#ifdef Z7_ZSTD_DEC_USE_BASES_CALC
248
249 #define FILL_LOC_BASES(n, startSum) \
250 { unsigned i; UInt32 sum = startSum; \
251 for (i = 0; i != Z7_ARRAY_SIZE(GLOBAL_TABLE(n ## _EXTRA)); i++) \
252 { const unsigned a = GLOBAL_TABLE(n ## _EXTRA)[i]; \
253 BASES_TABLE(n ## _BASES)[i] = sum; \
254 /* if (sum != GLOBAL_TABLE(n ## _BASES)[i]) exit(1); */ \
255 sum += (UInt32)1 << a; \
256 BASES_TABLE(n ## _EXTRA)[i] = (Byte)a; }}
257
258 #define FILL_LOC_BASES_ALL \
259 FILL_LOC_BASES (SEQ_LL, 0) \
260 FILL_LOC_BASES (SEQ_ML, MATCH_LEN_MIN) \
261
262#else
263 #define COPY_GLOBAL_ARR(n) \
264 memcpy(BASES_TABLE(n), GLOBAL_TABLE(n), sizeof(GLOBAL_TABLE(n)));
265 #define FILL_LOC_BASES_ALL \
266 COPY_GLOBAL_ARR (SEQ_LL_EXTRA) \
267 COPY_GLOBAL_ARR (SEQ_ML_EXTRA) \
268 COPY_GLOBAL_ARR (SEQ_LL_BASES) \
269 COPY_GLOBAL_ARR (SEQ_ML_BASES) \
270
271#endif
272
273#endif
274
275
276
277/// The sequence decoding baseline and number of additional bits to read/add
278#if !defined(Z7_ZSTD_DEC_USE_BASES_CALC)
279static const UInt32 GLOBAL_TABLE(SEQ_LL_BASES) [NUM_LL_SYMBOLS] =
280{
281 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
282 16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
283 0x2000, 0x4000, 0x8000, 0x10000
284};
285#endif
286
287static const Byte GLOBAL_TABLE(SEQ_LL_EXTRA) [NUM_LL_SYMBOLS] =
288{
289 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
290 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12,
291 13, 14, 15, 16
292};
293
294#if !defined(Z7_ZSTD_DEC_USE_BASES_CALC)
295static const UInt32 GLOBAL_TABLE(SEQ_ML_BASES) [NUM_ML_SYMBOLS] =
296{
297 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
298 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
299 35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
300 0x1003, 0x2003, 0x4003, 0x8003, 0x10003
301};
302#endif
303
304static const Byte GLOBAL_TABLE(SEQ_ML_EXTRA) [NUM_ML_SYMBOLS] =
305{
306 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
307 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
308 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11,
309 12, 13, 14, 15, 16
310};
311
312
313#ifdef Z7_ZSTD_DEC_PRINT_TABLE
314
315static const Int16 SEQ_LL_PREDEF_DIST [NUM_LL_SYMBOLS] =
316{
317 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
318 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
319 -1,-1,-1,-1
320};
321static const Int16 SEQ_OFFSET_PREDEF_DIST [NUM_OFFSET_SYMBOLS_PREDEF] =
322{
323 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
324 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1
325};
326static const Int16 SEQ_ML_PREDEF_DIST [NUM_ML_SYMBOLS] =
327{
328 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
329 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
330 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,
331 -1,-1,-1,-1,-1
332};
333
334#endif
335
336// typedef int FastInt;
337// typedef Int32 FastInt32;
338typedef unsigned FastInt;
339typedef UInt32 FastInt32;
340typedef FastInt32 CFseRecord;
341
342
343#define FSE_REC_LEN_OFFSET 8
344#define FSE_REC_STATE_OFFSET 16
345#define GET_FSE_REC_SYM(st) ((Byte)(st))
346#define GET_FSE_REC_LEN(st) ((Byte)((st) >> FSE_REC_LEN_OFFSET))
347#define GET_FSE_REC_STATE(st) ((st) >> FSE_REC_STATE_OFFSET)
348
349// #define FSE_REC_SYM_MASK (0xff)
350// #define GET_FSE_REC_SYM(st) (st & FSE_REC_SYM_MASK)
351
352#define W_BASE(state, len, sym) \
353 (((UInt32)state << (4 + FSE_REC_STATE_OFFSET)) + \
354 (len << FSE_REC_LEN_OFFSET) + (sym))
355#define W(state, len, sym) W_BASE(state, len, sym)
356static const CFseRecord k_PredefRecords_LL[1 << 6] = {
357W(0,4, 0),W(1,4, 0),W(2,5, 1),W(0,5, 3),W(0,5, 4),W(0,5, 6),W(0,5, 7),W(0,5, 9),
358W(0,5,10),W(0,5,12),W(0,6,14),W(0,5,16),W(0,5,18),W(0,5,19),W(0,5,21),W(0,5,22),
359W(0,5,24),W(2,5,25),W(0,5,26),W(0,6,27),W(0,6,29),W(0,6,31),W(2,4, 0),W(0,4, 1),
360W(0,5, 2),W(2,5, 4),W(0,5, 5),W(2,5, 7),W(0,5, 8),W(2,5,10),W(0,5,11),W(0,6,13),
361W(2,5,16),W(0,5,17),W(2,5,19),W(0,5,20),W(2,5,22),W(0,5,23),W(0,4,25),W(1,4,25),
362W(2,5,26),W(0,6,28),W(0,6,30),W(3,4, 0),W(1,4, 1),W(2,5, 2),W(2,5, 3),W(2,5, 5),
363W(2,5, 6),W(2,5, 8),W(2,5, 9),W(2,5,11),W(2,5,12),W(0,6,15),W(2,5,17),W(2,5,18),
364W(2,5,20),W(2,5,21),W(2,5,23),W(2,5,24),W(0,6,35),W(0,6,34),W(0,6,33),W(0,6,32)
365};
366static const CFseRecord k_PredefRecords_OF[1 << 5] = {
367W(0,5, 0),W(0,4, 6),W(0,5, 9),W(0,5,15),W(0,5,21),W(0,5, 3),W(0,4, 7),W(0,5,12),
368W(0,5,18),W(0,5,23),W(0,5, 5),W(0,4, 8),W(0,5,14),W(0,5,20),W(0,5, 2),W(1,4, 7),
369W(0,5,11),W(0,5,17),W(0,5,22),W(0,5, 4),W(1,4, 8),W(0,5,13),W(0,5,19),W(0,5, 1),
370W(1,4, 6),W(0,5,10),W(0,5,16),W(0,5,28),W(0,5,27),W(0,5,26),W(0,5,25),W(0,5,24)
371};
372#if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
373#undef W
374#define W(state, len, sym) W_BASE(state, len, (sym + MATCH_LEN_MIN))
375#endif
376static const CFseRecord k_PredefRecords_ML[1 << 6] = {
377W(0,6, 0),W(0,4, 1),W(2,5, 2),W(0,5, 3),W(0,5, 5),W(0,5, 6),W(0,5, 8),W(0,6,10),
378W(0,6,13),W(0,6,16),W(0,6,19),W(0,6,22),W(0,6,25),W(0,6,28),W(0,6,31),W(0,6,33),
379W(0,6,35),W(0,6,37),W(0,6,39),W(0,6,41),W(0,6,43),W(0,6,45),W(1,4, 1),W(0,4, 2),
380W(2,5, 3),W(0,5, 4),W(2,5, 6),W(0,5, 7),W(0,6, 9),W(0,6,12),W(0,6,15),W(0,6,18),
381W(0,6,21),W(0,6,24),W(0,6,27),W(0,6,30),W(0,6,32),W(0,6,34),W(0,6,36),W(0,6,38),
382W(0,6,40),W(0,6,42),W(0,6,44),W(2,4, 1),W(3,4, 1),W(1,4, 2),W(2,5, 4),W(2,5, 5),
383W(2,5, 7),W(2,5, 8),W(0,6,11),W(0,6,14),W(0,6,17),W(0,6,20),W(0,6,23),W(0,6,26),
384W(0,6,29),W(0,6,52),W(0,6,51),W(0,6,50),W(0,6,49),W(0,6,48),W(0,6,47),W(0,6,46)
385};
386
387
388// sum of freqs[] must be correct
389// (numSyms != 0)
390// (accuracy >= 5)
391static
392Z7_NO_INLINE
393// Z7_FORCE_INLINE
394void FSE_Generate(CFseRecord *table,
395 const Int16 *const freqs, const size_t numSyms,
396 const unsigned accuracy, UInt32 delta)
397{
398 size_t size = (size_t)1 << accuracy;
399 // max value in states[x] is ((1 << accuracy) * 2)
400 UInt16 states[FSE_NUM_SYMBOLS_MAX];
401 {
402 /* Symbols with "less than 1" probability get a single cell,
403 starting from the end of the table.
404 These symbols define a full state reset, reading (accuracy) bits. */
405 size_t threshold = size;
406 {
407 size_t s = 0;
408 do
409 if (freqs[s] == -1)
410 {
411 table[--threshold] = (CFseRecord)s;
412 states[s] = 1;
413 }
414 while (++s != numSyms);
415 }
416
417 #ifdef SHOW_STAT
418 if (threshold == size)
419 {
420 STAT_INC(g_Num_Threshold_0)
421 STAT_UPDATE(g_Num_Threshold_0sum += (unsigned)size;)
422 }
423 else
424 {
425 STAT_INC(g_Num_Threshold_1)
426 STAT_UPDATE(g_Num_Threshold_1sum += (unsigned)size;)
427 }
428 #endif
429
430 // { unsigned uuu; for (uuu = 0; uuu < 400; uuu++)
431 {
432 // Each (symbol) gets freqs[symbol] cells.
433 // Cell allocation is spread, not linear.
434 const size_t step = (size >> 1) + (size >> 3) + 3;
435 size_t pos = 0;
436 // const unsigned mask = size - 1;
437 /*
438 if (threshold == size)
439 {
440 size_t s = 0;
441 size--;
442 do
443 {
444 int freq = freqs[s];
445 if (freq <= 0)
446 continue;
447 states[s] = (UInt16)freq;
448 do
449 {
450 table[pos] = (CFseRecord)s;
451 pos = (pos + step) & size; // & mask;
452 }
453 while (--freq);
454 }
455 while (++s != numSyms);
456 }
457 else
458 */
459 {
460 size_t s = 0;
461 size--;
462 do
463 {
464 int freq = freqs[s];
465 if (freq <= 0)
466 continue;
467 states[s] = (UInt16)freq;
468 do
469 {
470 table[pos] = (CFseRecord)s;
471 // we skip a position if it's already occupied by a "less than 1" probability symbol.
472 // (step) is coprime to table size, so the cycle will visit each position exactly once
473 do
474 pos = (pos + step) & size; // & mask;
475 while (pos >= threshold);
476 }
477 while (--freq);
478 }
479 while (++s != numSyms);
480 }
481 size++;
482 // (pos != 0) is an unexpected case that means the freqs[] are not correct,
483 // i.e. some failure in the code (for example, an incorrect predefined freq[] table)
484 // if (pos != 0) return SZ_ERROR_FAIL;
485 }
486 // }
487 }
488 {
489 const CFseRecord * const limit = table + size;
490 delta = ((UInt32)size << FSE_REC_STATE_OFFSET) - delta;
491 /* State increases by symbol over time, decreasing number of bits.
492 Baseline increases until the bit threshold is passed, at which point it resets to 0 */
493 do
494 {
495 #define TABLE_ITER(a) \
496 { \
497 const FastInt sym = (FastInt)table[a]; \
498 const unsigned nextState = states[sym]; \
499 unsigned nb; \
500 states[sym] = (UInt16)(nextState + 1); \
501 nb = accuracy - GetHighestSetBit_32_nonzero_small(nextState); \
502 table[a] = (CFseRecord)(sym - delta \
503 + ((UInt32)nb << FSE_REC_LEN_OFFSET) \
504 + ((UInt32)nextState << FSE_REC_STATE_OFFSET << nb)); \
505 }
506 TABLE_ITER(0)
507 TABLE_ITER(1)
508 table += 2;
509 }
510 while (table != limit);
511 }
512}
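
/* A sketch of the packed record layout, as implied by the FSE_REC_* macros and
   the TABLE_ITER arithmetic above:
     bits  0.. 7 : symbol (plus the (delta) symbol offset, i.e. +MATCH_LEN_MIN for the ML table),
     bits  8..15 : nb = accuracy - GetHighestSetBit(nextState), bits to read for the transition,
     bits 16..31 : (nextState << nb) - tableSize, so that the decoder's next state is
                   GET_FSE_REC_STATE(rec) + (nb bits read from the stream). */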
513
514
515#ifdef Z7_ZSTD_DEC_PRINT_TABLE
516
517static void Print_Predef(unsigned predefAccuracy,
518 const unsigned numSymsPredef,
519 const Int16 * const predefFreqs,
520 const CFseRecord *checkTable)
521{
522 CFseRecord table[1 << 6];
523 unsigned i;
524 FSE_Generate(table, predefFreqs, numSymsPredef, predefAccuracy,
525 #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
526 numSymsPredef == NUM_ML_SYMBOLS ? MATCH_LEN_MIN :
527 #endif
528 0
529 );
530 if (memcmp(table, checkTable, sizeof(UInt32) << predefAccuracy) != 0)
531 exit(1);
532 for (i = 0; i < (1u << predefAccuracy); i++)
533 {
534 const UInt32 v = table[i];
535 const unsigned state = (unsigned)(GET_FSE_REC_STATE(v));
536 if (state & 0xf)
537 exit(1);
538 if (i != 0)
539 {
540 printf(",");
541 if (i % 8 == 0)
542 printf("\n");
543 }
544 printf("W(%d,%d,%2d)",
545 (unsigned)(state >> 4),
546 (unsigned)((v >> FSE_REC_LEN_OFFSET) & 0xff),
547 (unsigned)GET_FSE_REC_SYM(v));
548 }
549 printf("\n\n");
550}
551
552#endif
553
554
555#define GET16(dest, p) { const Byte *ptr = p; dest = GetUi16(ptr); }
556#define GET32(dest, p) { const Byte *ptr = p; dest = GetUi32(ptr); }
557
558// (1 <= numBits <= 9)
559#define FORWARD_READ_BITS(destVal, numBits, mask) \
560 { const CBitCtr_signed bos3 = (bitOffset) >> 3; \
561 if (bos3 >= 0) return SZ_ERROR_DATA; \
562 GET16(destVal, src + bos3) \
563 destVal >>= (bitOffset) & 7; \
564 bitOffset += (CBitCtr_signed)(numBits); \
565 mask = (1u << (numBits)) - 1; \
566 destVal &= mask; \
567 }
568
569#define FORWARD_READ_1BIT(destVal) \
570 { const CBitCtr_signed bos3 = (bitOffset) >> 3; \
571 if (bos3 >= 0) return SZ_ERROR_DATA; \
572 destVal = *(src + bos3); \
573 destVal >>= (bitOffset) & 7; \
574 (bitOffset)++; \
575 destVal &= 1; \
576 }
577
578
579// in: (accuracyMax <= 9)
580// at least 2 bytes will be processed from (in) stream.
581// at return: (in->len > 0)
582static
583Z7_NO_INLINE
584SRes FSE_DecodeHeader(CFseRecord *const table,
585 CInBufPair *const in,
586 const unsigned accuracyMax,
587 Byte *const accuracyRes,
588 unsigned numSymbolsMax)
589{
590 unsigned accuracy;
591 unsigned remain1;
592 unsigned syms;
593 Int16 freqs[FSE_NUM_SYMBOLS_MAX + 3]; // +3 for overwrite (repeat)
594 const Byte *src = in->ptr;
595 CBitCtr_signed bitOffset = (CBitCtr_signed)in->len - 1;
596 if (bitOffset <= 0)
597 return SZ_ERROR_DATA;
598 accuracy = *src & 0xf;
599 accuracy += 5;
600 if (accuracy > accuracyMax)
601 return SZ_ERROR_DATA;
602 *accuracyRes = (Byte)accuracy;
603 remain1 = (1u << accuracy) + 1; // (it's remain_freqs_sum + 1)
604 syms = 0;
605 src += bitOffset; // src points to last byte
606 bitOffset = 4 - (bitOffset << 3);
607
608 for (;;)
609 {
610 // (2 <= remain1)
611 const unsigned bits = GetHighestSetBit_32_nonzero_small((unsigned)remain1);
612 // (1 <= bits <= accuracy)
613 unsigned val; // it must be unsigned or int
614 unsigned mask;
615 FORWARD_READ_BITS(val, bits, mask)
616 {
617 const unsigned val2 = remain1 + val - mask;
618 if (val2 > mask)
619 {
620 unsigned bit;
621 FORWARD_READ_1BIT(bit)
622 if (bit)
623 val = val2;
624 }
625 }
626 {
627 // (remain1 >= 2)
628 // (0 <= (int)val <= remain1)
629 val = (unsigned)((int)val - 1);
630 // val now is "probability" of symbol
631 // (probability == -1) means "less than 1" frequency.
632 // (-1 <= (int)val <= remain1 - 1)
633 freqs[syms++] = (Int16)(int)val;
634 if (val != 0)
635 {
636 remain1 -= (int)val < 0 ? 1u : (unsigned)val;
637 // remain1 -= val;
638 // val >>= (sizeof(val) * 8 - 2);
639 // remain1 -= val & 2;
640 // freqs[syms++] = (Int16)(int)val;
641 // syms++;
642 if (remain1 == 1)
643 break;
644 if (syms >= FSE_NUM_SYMBOLS_MAX)
645 return SZ_ERROR_DATA;
646 }
647 else // if (val == 0)
648 {
649 // freqs[syms++] = 0;
650 // syms++;
651 for (;;)
652 {
653 unsigned repeat;
654 FORWARD_READ_BITS(repeat, 2, mask)
655 freqs[syms ] = 0;
656 freqs[syms + 1] = 0;
657 freqs[syms + 2] = 0;
658 syms += repeat;
659 if (syms >= FSE_NUM_SYMBOLS_MAX)
660 return SZ_ERROR_DATA;
661 if (repeat != 3)
662 break;
663 }
664 }
665 }
666 }
667
668 if (syms > numSymbolsMax)
669 return SZ_ERROR_DATA;
670 bitOffset += 7;
671 bitOffset >>= 3;
672 if (bitOffset > 0)
673 return SZ_ERROR_DATA;
674 in->ptr = src + bitOffset;
675 in->len = (size_t)(1 - bitOffset);
676 {
677 // unsigned uuu; for (uuu = 0; uuu < 50; uuu++)
678 FSE_Generate(table, freqs, syms, accuracy,
679 #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
680 numSymbolsMax == NUM_ML_SYMBOLS ? MATCH_LEN_MIN :
681 #endif
682 0
683 );
684 }
685 return SZ_OK;
686}
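/* Editor's worked example for the variable-width field above
   (per the zstd FSE table description coding), with accuracy == 5:
     remain1 = (1 << 5) + 1 = 33, bits = 5, mask = 31.
   After reading val (5 bits): val2 = remain1 + val - mask = val + 2:
     - val <= 29 : val2 <= mask, no extra bit is read;
     - val >= 30 : one extra bit selects between the low alias (val)
       and the high value (val2).
   So every value in [0, 33] is representable, most in 5 bits and the
   largest ones in 6 bits; (val - 1) then gives the probability,
   where -1 means a "less than 1" frequency. */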
687
688
689// ---------- HUFFMAN ----------
690
691#define HUF_MAX_BITS 12
692#define HUF_MAX_SYMBS 256
693 #define HUF_DUMMY_SIZE (128 + 8 * 2) // it must be a multiple of 8
694// #define HUF_DUMMY_SIZE 0
695#define HUF_TABLE_SIZE ((2 << HUF_MAX_BITS) + HUF_DUMMY_SIZE)
696#define HUF_GET_SYMBOLS(table) ((table) + (1 << HUF_MAX_BITS) + HUF_DUMMY_SIZE)
697// #define HUF_GET_LENS(table) (table)
698
699typedef struct
700{
701 // Byte table[HUF_TABLE_SIZE];
702 UInt64 table64[HUF_TABLE_SIZE / sizeof(UInt64)];
703}
704CZstdDecHufTable;
705
706/*
707Input:
708 numSyms != 0
709 (bits) array size must be aligned to 2
710 if (numSyms & 1), then bits[numSyms] == 0.
711 The Huffman tree must be correct before the Huf_Build() call:
712 (sum (1/2^bits[i]) == 1)
713 && (bits[i] <= HUF_MAX_BITS)
714*/
715static
716Z7_FORCE_INLINE
717void Huf_Build(Byte * const table,
718 const Byte *bits, const unsigned numSyms)
719{
720 unsigned counts0[HUF_MAX_BITS + 1];
721 unsigned counts1[HUF_MAX_BITS + 1];
722 const Byte * const bitsEnd = bits + numSyms;
723 // /*
724 {
725 unsigned t;
726 for (t = 0; t < Z7_ARRAY_SIZE(counts0); t++) counts0[t] = 0;
727 for (t = 0; t < Z7_ARRAY_SIZE(counts1); t++) counts1[t] = 0;
728 }
729 // */
730 // memset(counts0, 0, sizeof(counts0));
731 // memset(counts1, 0, sizeof(counts1));
732 {
733 const Byte *bits2 = bits;
734 // we access additional bits[symbol] if (numSyms & 1)
735 do
736 {
737 counts0[bits2[0]]++;
738 counts1[bits2[1]]++;
739 }
740 while ((bits2 += 2) < bitsEnd);
741 }
742 {
743 unsigned r = 0;
744 unsigned i = HUF_MAX_BITS;
745 // Byte *lens = HUF_GET_LENS(symbols);
746 do
747 {
748 const unsigned num = (counts0[i] + counts1[i]) << (HUF_MAX_BITS - i);
749 counts0[i] = r;
750 if (num)
751 {
752 Byte *lens = &table[r];
753 r += num;
754 memset(lens, (int)i, num);
755 }
756 }
757 while (--i);
758 counts0[0] = 0; // for speculative loads
759 // no need for check:
760 // if (r != (UInt32)1 << HUF_MAX_BITS) exit(0);
761 }
762 {
763 #ifdef MY_CPU_64BIT
764 UInt64
765 #else
766 UInt32
767 #endif
768 v = 0;
769 Byte *symbols = HUF_GET_SYMBOLS(table);
770 do
771 {
772 const unsigned nb = *bits++;
773 if (nb)
774 {
775 const unsigned code = counts0[nb];
776 const unsigned num = (1u << HUF_MAX_BITS) >> nb;
777 counts0[nb] = code + num;
778 // memset(&symbols[code], i, num);
779 // /*
780 {
781 Byte *s2 = &symbols[code];
782 if (num <= 2)
783 {
784 s2[0] = (Byte)v;
785 s2[(size_t)num - 1] = (Byte)v;
786 }
787 else if (num <= 8)
788 {
789 *(UInt32 *)(void *)s2 = (UInt32)v;
790 *(UInt32 *)(void *)(s2 + (size_t)num - 4) = (UInt32)v;
791 }
792 else
793 {
794 #ifdef MY_CPU_64BIT
795 UInt64 *s = (UInt64 *)(void *)s2;
796 const UInt64 *lim = (UInt64 *)(void *)(s2 + num);
797 do
798 {
799 s[0] = v; s[1] = v; s += 2;
800 }
801 while (s != lim);
802 #else
803 UInt32 *s = (UInt32 *)(void *)s2;
804 const UInt32 *lim = (const UInt32 *)(const void *)(s2 + num);
805 do
806 {
807 s[0] = v; s[1] = v; s += 2;
808 s[0] = v; s[1] = v; s += 2;
809 }
810 while (s != lim);
811 #endif
812 }
813 }
814 // */
815 }
816 v +=
817 #ifdef MY_CPU_64BIT
818 0x0101010101010101;
819 #else
820 0x01010101;
821 #endif
822 }
823 while (bits != bitsEnd);
824 }
825}
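/* Editor's worked example (a trace of Huf_Build() above, assuming
   HUF_MAX_BITS == 12 and bits[] = {1, 2, 2} for symbols 0, 1, 2):
   length pass (longer codes land at lower states):
     table[0 .. 2047]    = 2   table[2048 .. 4095] = 1
   symbol pass (symbols in order, each taking ((1 << 12) >> nb) slots):
     symbols[2048..4095] = 0   symbols[0..1023] = 1   symbols[1024..2047] = 2
   so any 12-bit state v decodes in O(1):
     sym = symbols[v]; consumed_bits = table[v]; */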
826
827
828
829 // how many bytes (src) is moved back from its original value.
830// we need (HUF_SRC_OFFSET == 3) for optimized 32-bit memory access
831#define HUF_SRC_OFFSET 3
832
833// v <<= 8 - (bitOffset & 7) + numBits;
834// v >>= 32 - HUF_MAX_BITS;
835#define HUF_GET_STATE(v, bitOffset, numBits) \
836 GET32(v, src + (HUF_SRC_OFFSET - 3) + ((CBitCtr_signed)bitOffset >> 3)) \
837 v >>= 32 - HUF_MAX_BITS - 8 + ((unsigned)bitOffset & 7) - numBits; \
838 v &= (1u << HUF_MAX_BITS) - 1; \
839
840
841#ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
842#if defined(MY_CPU_AMD64) && defined(_MSC_VER) && _MSC_VER == 1400 \
843 || !defined(MY_CPU_X86_OR_AMD64) \
844 // || 1 == 1 /* for debug : to force STREAM4_PRELOAD mode */
845 // we need a big number (>= 16) of registers for PRELOAD4
846 #define Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4
847 // #define Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2 // for debug
848#endif
849#endif
850
851// for debug: simpler and smaller code but slow:
852// #define Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE
853
854#if defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE) || \
855 !defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS)
856
857#define HUF_DECODE(bitOffset, dest) \
858{ \
859 UInt32 v; \
860 HUF_GET_STATE(v, bitOffset, 0) \
861 bitOffset -= table[v]; \
862 *(dest) = symbols[v]; \
863 if ((CBitCtr_signed)bitOffset < 0) return SZ_ERROR_DATA; \
864}
865
866#endif
867
868#if !defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE) || \
869 defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4) || \
870 defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2) \
871
872#define HUF_DECODE_2_INIT(v, bitOffset) \
873 HUF_GET_STATE(v, bitOffset, 0)
874
875#define HUF_DECODE_2(v, bitOffset, dest) \
876{ \
877 unsigned numBits; \
878 numBits = table[v]; \
879 *(dest) = symbols[v]; \
880 HUF_GET_STATE(v, bitOffset, numBits) \
881 bitOffset -= (CBitCtr)numBits; \
882 if ((CBitCtr_signed)bitOffset < 0) return SZ_ERROR_DATA; \
883}
884
885#endif
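/* Editor's note: HUF_DECODE_2 pipelines the loop: the next 12-bit state (v)
   is fetched with HUF_GET_STATE before the bounds check on bitOffset, so the
   table load for step (i + 1) overlaps the symbol store of step (i). This
   speculative read is why the callers below require 3 readable bytes before
   the start of the input buffer (HUF_SRC_OFFSET). */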
886
887
888// src == ptr - HUF_SRC_OFFSET
889// we are allowed to access 3 bytes before start of input buffer
890static
891Z7_NO_INLINE
892SRes Huf_Decompress_1stream(const Byte * const table,
893 const Byte *src, const size_t srcLen,
894 Byte *dest, const size_t destLen)
895{
896 CBitCtr bitOffset;
897 if (srcLen == 0)
898 return SZ_ERROR_DATA;
899 SET_bitOffset_TO_PAD (bitOffset, src + HUF_SRC_OFFSET, srcLen)
900 if (destLen)
901 {
902 const Byte *symbols = HUF_GET_SYMBOLS(table);
903 const Byte *destLim = dest + destLen;
904 #ifdef Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE
905 {
906 do
907 {
908 HUF_DECODE (bitOffset, dest)
909 }
910 while (++dest != destLim);
911 }
912 #else
913 {
914 UInt32 v;
915 HUF_DECODE_2_INIT (v, bitOffset)
916 do
917 {
918 HUF_DECODE_2 (v, bitOffset, dest)
919 }
920 while (++dest != destLim);
921 }
922 #endif
923 }
924 return bitOffset == 0 ? SZ_OK : SZ_ERROR_DATA;
925}
926
927
928 // for debug : it reduces register pressure, but the array copy can be slow:
929// #define Z7_ZSTD_DEC_USE_HUF_LOCAL
930
931// src == ptr + (6 - HUF_SRC_OFFSET)
932// srcLen >= 10
933// we are allowed to access 3 bytes before start of input buffer
934static
935Z7_NO_INLINE
936SRes Huf_Decompress_4stream(const Byte * const
937 #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
938 table2,
939 #else
940 table,
941 #endif
942 const Byte *src, size_t srcLen,
943 Byte *dest, size_t destLen)
944{
945 #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
946 Byte table[HUF_TABLE_SIZE];
947 #endif
948 UInt32 sizes[3];
949 const size_t delta = (destLen + 3) / 4;
950 if ((sizes[0] = GetUi16(src + (0 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
951 if ((sizes[1] = GetUi16(src + (2 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
952 sizes[1] += sizes[0];
953 if ((sizes[2] = GetUi16(src + (4 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
954 sizes[2] += sizes[1];
955 srcLen -= 6;
956 if (srcLen <= sizes[2])
957 return SZ_ERROR_DATA;
958
959 #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
960 {
961 // unsigned i = 0; for(; i < 1000; i++)
962 memcpy(table, table2, HUF_TABLE_SIZE);
963 }
964 #endif
965
966 #ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
967 {
968 CBitCtr bitOffset_0,
969 bitOffset_1,
970 bitOffset_2,
971 bitOffset_3;
972 {
973 SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_0, src + HUF_SRC_OFFSET, sizes[0])
974 SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_1, src + HUF_SRC_OFFSET, sizes[1])
975 SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_2, src + HUF_SRC_OFFSET, sizes[2])
976 SET_bitOffset_TO_PAD (bitOffset_3, src + HUF_SRC_OFFSET, srcLen)
977 }
978 {
979 const Byte * const symbols = HUF_GET_SYMBOLS(table);
980 Byte *destLim = dest + destLen - delta * 3;
981
982 if (dest != destLim)
983 #ifdef Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4
984 {
985 UInt32 v_0, v_1, v_2, v_3;
986 HUF_DECODE_2_INIT (v_0, bitOffset_0)
987 HUF_DECODE_2_INIT (v_1, bitOffset_1)
988 HUF_DECODE_2_INIT (v_2, bitOffset_2)
989 HUF_DECODE_2_INIT (v_3, bitOffset_3)
990 // #define HUF_DELTA (1 << 17) / 4
991 do
992 {
993 HUF_DECODE_2 (v_3, bitOffset_3, dest + delta * 3)
994 HUF_DECODE_2 (v_2, bitOffset_2, dest + delta * 2)
995 HUF_DECODE_2 (v_1, bitOffset_1, dest + delta)
996 HUF_DECODE_2 (v_0, bitOffset_0, dest)
997 }
998 while (++dest != destLim);
999 /*
1000 {// unsigned y = 0; for (;y < 1; y++)
1001 {
1002 const size_t num = destLen - delta * 3;
1003 Byte *orig = dest - num;
1004 memmove (orig + delta , orig + HUF_DELTA, num);
1005 memmove (orig + delta * 2, orig + HUF_DELTA * 2, num);
1006 memmove (orig + delta * 3, orig + HUF_DELTA * 3, num);
1007 }}
1008 */
1009 }
1010 #elif defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2)
1011 {
1012 UInt32 v_0, v_1, v_2, v_3;
1013 HUF_DECODE_2_INIT (v_0, bitOffset_0)
1014 HUF_DECODE_2_INIT (v_1, bitOffset_1)
1015 do
1016 {
1017 HUF_DECODE_2 (v_0, bitOffset_0, dest)
1018 HUF_DECODE_2 (v_1, bitOffset_1, dest + delta)
1019 }
1020 while (++dest != destLim);
1021 dest = destLim - (destLen - delta * 3);
1022 dest += delta * 2;
1023 destLim += delta * 2;
1024 HUF_DECODE_2_INIT (v_2, bitOffset_2)
1025 HUF_DECODE_2_INIT (v_3, bitOffset_3)
1026 do
1027 {
1028 HUF_DECODE_2 (v_2, bitOffset_2, dest)
1029 HUF_DECODE_2 (v_3, bitOffset_3, dest + delta)
1030 }
1031 while (++dest != destLim);
1032 dest -= delta * 2;
1033 destLim -= delta * 2;
1034 }
1035 #else
1036 {
1037 do
1038 {
1039 HUF_DECODE (bitOffset_3, dest + delta * 3)
1040 HUF_DECODE (bitOffset_2, dest + delta * 2)
1041 HUF_DECODE (bitOffset_1, dest + delta)
1042 HUF_DECODE (bitOffset_0, dest)
1043 }
1044 while (++dest != destLim);
1045 }
1046 #endif
1047
1048 if (bitOffset_3 != (CBitCtr)sizes[2])
1049 return SZ_ERROR_DATA;
1050 if (destLen &= 3)
1051 {
1052 destLim = dest + 4 - destLen;
1053 do
1054 {
1055 HUF_DECODE (bitOffset_2, dest + delta * 2)
1056 HUF_DECODE (bitOffset_1, dest + delta)
1057 HUF_DECODE (bitOffset_0, dest)
1058 }
1059 while (++dest != destLim);
1060 }
1061 if ( bitOffset_0 != 0
1062 || bitOffset_1 != (CBitCtr)sizes[0]
1063 || bitOffset_2 != (CBitCtr)sizes[1])
1064 return SZ_ERROR_DATA;
1065 }
1066 }
1067 #else // Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
1068 {
1069 unsigned i;
1070 for (i = 0; i < 4; i++)
1071 {
1072 size_t d = destLen;
1073 size_t size = srcLen;
1074 if (i != 3)
1075 {
1076 d = delta;
1077 size = sizes[i];
1078 }
1079 if (i != 0)
1080 size -= sizes[i - 1];
1081 destLen -= d;
1082 RINOK(Huf_Decompress_1stream(table, src, size, dest, d))
1083 dest += d;
1084 src += size;
1085 }
1086 }
1087 #endif
1088
1089 return SZ_OK;
1090}
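/* Editor's sketch of the 4-stream layout handled above (zstd format):
   a 6-byte jump table of three little-endian UInt16 values gives the
   compressed sizes of streams 1..3 (accumulated into prefix sums in
   sizes[]); stream 4 takes the remaining bytes:
     | size1 | size2 | size3 | stream1 | stream2 | stream3 | stream4 |
        2B      2B      2B     size1     size2     size3     rest
   each of streams 1..3 decodes delta = (destLen + 3) / 4 output bytes,
   and stream 4 decodes the remainder (destLen - 3 * delta). */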
1091
1092
1093
1094// (in->len != 0)
1095// we are allowed to access in->ptr[-3]
1096// at least 2 bytes in (in->ptr) will be processed
1097static SRes Huf_DecodeTable(CZstdDecHufTable *const p, CInBufPair *const in)
1098{
1099 Byte weights[HUF_MAX_SYMBS + 1]; // +1 for extra write for loop unroll
1100 unsigned numSyms;
1101 const unsigned header = *(in->ptr)++;
1102 in->len--;
1103 // memset(weights, 0, sizeof(weights));
1104 if (header >= 128)
1105 {
1106 // direct representation: 4-bit field (0-15) per weight
1107 numSyms = header - 127;
1108 // numSyms != 0
1109 {
1110 const size_t numBytes = (numSyms + 1) / 2;
1111 const Byte *const ws = in->ptr;
1112 size_t i = 0;
1113 if (in->len < numBytes)
1114 return SZ_ERROR_DATA;
1115 in->ptr += numBytes;
1116 in->len -= numBytes;
1117 do
1118 {
1119 const unsigned b = ws[i];
1120 weights[i * 2 ] = (Byte)(b >> 4);
1121 weights[i * 2 + 1] = (Byte)(b & 0xf);
1122 }
1123 while (++i != numBytes);
1124 /* 7ZIP: we can restore correct zero value for weights[numSyms],
1125 if we want to use zero values starting from numSyms in code below. */
1126 // weights[numSyms] = 0;
1127 }
1128 }
1129 else
1130 {
1131 #define MAX_ACCURACY_LOG_FOR_WEIGHTS 6
1132 CFseRecord table[1 << MAX_ACCURACY_LOG_FOR_WEIGHTS];
1133
1134 Byte accuracy;
1135 const Byte *src;
1136 size_t srcLen;
1137 if (in->len < header)
1138 return SZ_ERROR_DATA;
1139 {
1140 CInBufPair fse_stream;
1141 fse_stream.len = header;
1142 fse_stream.ptr = in->ptr;
1143 in->ptr += header;
1144 in->len -= header;
1145 RINOK(FSE_DecodeHeader(table, &fse_stream,
1146 MAX_ACCURACY_LOG_FOR_WEIGHTS,
1147 &accuracy,
1148 16 // num weight symbols max (max-symbol is 15)
1149 ))
1150 // at least 2 bytes were processed in fse_stream.
1151 // (srcLen > 0) after FSE_DecodeHeader()
1152 // if (srcLen == 0) return SZ_ERROR_DATA;
1153 src = fse_stream.ptr;
1154 srcLen = fse_stream.len;
1155 }
1156 // we are allowed to access src[-5]
1157 {
1158 // unsigned yyy = 200; do {
1159 CBitCtr bitOffset;
1160 FastInt32 state1, state2;
1161 SET_bitOffset_TO_PAD (bitOffset, src, srcLen)
1162 state1 = accuracy;
1163 src -= state1 >> 2; // src -= 1; // for GET16() optimization
1164 state1 <<= FSE_REC_LEN_OFFSET;
1165 state2 = state1;
1166 numSyms = 0;
1167 for (;;)
1168 {
1169 #define FSE_WEIGHT_DECODE(st) \
1170 { \
1171 const unsigned bits = GET_FSE_REC_LEN(st); \
1172 FastInt r; \
1173 GET16(r, src + (bitOffset >> 3)) \
1174 r >>= (unsigned)bitOffset & 7; \
1175 if ((CBitCtr_signed)(bitOffset -= (CBitCtr)bits) < 0) \
1176 { if (bitOffset + (CBitCtr)bits != 0) \
1177 return SZ_ERROR_DATA; \
1178 break; } \
1179 r &= 0xff; \
1180 r >>= 8 - bits; \
1181 st = table[GET_FSE_REC_STATE(st) + r]; \
1182 weights[numSyms++] = (Byte)GET_FSE_REC_SYM(st); \
1183 }
1184 FSE_WEIGHT_DECODE (state1)
1185 FSE_WEIGHT_DECODE (state2)
1186 if (numSyms == HUF_MAX_SYMBS)
1187 return SZ_ERROR_DATA;
1188 }
1189 // src += (unsigned)accuracy >> 2; } while (--yyy);
1190 }
1191 }
1192
1193 // Build using weights:
1194 {
1195 UInt32 sum = 0;
1196 {
1197 // numSyms >= 1
1198 unsigned i = 0;
1199 weights[numSyms] = 0;
1200 do
1201 {
1202 sum += ((UInt32)1 << weights[i ]) & ~(UInt32)1;
1203 sum += ((UInt32)1 << weights[i + 1]) & ~(UInt32)1;
1204 i += 2;
1205 }
1206 while (i < numSyms);
1207 if (sum == 0)
1208 return SZ_ERROR_DATA;
1209 }
1210 {
1211 const unsigned maxBits = GetHighestSetBit_32_nonzero_big(sum) + 1;
1212 {
1213 const UInt32 left = ((UInt32)1 << maxBits) - sum;
1214 // (left != 0)
1215 // (left) must be power of 2 in correct stream
1216 if (left & (left - 1))
1217 return SZ_ERROR_DATA;
1218 weights[numSyms++] = (Byte)GetHighestSetBit_32_nonzero_big(left);
1219 }
1220 // if (numSyms & 1)
1221 weights[numSyms] = 0; // for loop unroll
1222 // numSyms >= 2
1223 {
1224 unsigned i = 0;
1225 do
1226 {
1227 /*
1228 #define WEIGHT_ITER(a) \
1229 { unsigned w = weights[i + (a)]; \
1230 const unsigned t = maxBits - w; \
1231 w = w ? t: w; \
1232 if (w > HUF_MAX_BITS) return SZ_ERROR_DATA; \
1233 weights[i + (a)] = (Byte)w; }
1234 */
1235 // /*
1236 #define WEIGHT_ITER(a) \
1237 { unsigned w = weights[i + (a)]; \
1238 if (w) { \
1239 w = maxBits - w; \
1240 if (w > HUF_MAX_BITS) return SZ_ERROR_DATA; \
1241 weights[i + (a)] = (Byte)w; }}
1242 // */
1243 WEIGHT_ITER(0)
1244 // WEIGHT_ITER(1)
1245 // i += 2;
1246 }
1247 while (++i != numSyms);
1248 }
1249 }
1250 }
1251 {
1252 // unsigned yyy; for (yyy = 0; yyy < 100; yyy++)
1253 Huf_Build((Byte *)(void *)p->table64, weights, numSyms);
1254 }
1255 return SZ_OK;
1256}
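/* Editor's worked example for the weight handling above: explicit weights
   {2, 2, 2, 1} give sum = 4 + 4 + 4 + 2 = 14, so
   maxBits = GetHighestSetBit(14) + 1 = 4, left = 16 - 14 = 2 (a power of 2,
   as required), and the implied last weight is log2(2) = 1. Code lengths are
   (maxBits - weight): {2, 2, 2, 3, 3}, a Kraft sum of 3/4 + 2/8 = 1.
   A standalone model of that logic (editor's sketch, hypothetical helper): */
#if 0
#include <stdio.h>
static int WeightsToLengths_Model(unsigned *w, unsigned numExplicit)
{
  unsigned sum = 0, maxBits = 0, left, lastW = 0, i, n = numExplicit;
  for (i = 0; i < n; i++)
    if (w[i])
      sum += 1u << w[i];             // ((UInt32)1 << w) & ~(UInt32)1 above
  if (sum == 0) return -1;
  while ((2u << maxBits) <= sum)
    maxBits++;                       // maxBits = GetHighestSetBit(sum)
  maxBits++;
  left = (1u << maxBits) - sum;
  if (left & (left - 1)) return -1;  // (left) must be a power of 2
  while ((1u << lastW) < left)
    lastW++;                         // implied weight of the last symbol
  w[n++] = lastW;
  for (i = 0; i < n; i++)
    if (w[i])
      w[i] = maxBits - w[i];         // weight -> code length
  return (int)n;
}
int main(void)
{
  unsigned w[8] = { 2, 2, 2, 1 };
  const int n = WeightsToLengths_Model(w, 4); // -> 5 lengths: 2 2 2 3 3
  int i;
  for (i = 0; i < n; i++) printf("%u ", w[i]);
  printf("\n");
  return 0;
}
#endif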
1257
1258
1259typedef enum
1260{
1261 k_SeqMode_Predef = 0,
1262 k_SeqMode_RLE = 1,
1263 k_SeqMode_FSE = 2,
1264 k_SeqMode_Repeat = 3
1265}
1266z7_zstd_enum_SeqMode;
1267
1268// predefAccuracy == 5 for OFFSET symbols
1269// predefAccuracy == 6 for MATCH/LIT LEN symbols
1270static
1271SRes
1272Z7_NO_INLINE
1273// Z7_FORCE_INLINE
1274FSE_Decode_SeqTable(CFseRecord * const table,
1275 CInBufPair * const in,
1276 unsigned predefAccuracy,
1277 Byte * const accuracyRes,
1278 unsigned numSymbolsMax,
1279 const CFseRecord * const predefs,
1280 const unsigned seqMode)
1281{
1282 // UNUSED_VAR(numSymsPredef)
1283 // UNUSED_VAR(predefFreqs)
1284 if (seqMode == k_SeqMode_FSE)
1285 {
1286 // unsigned y = 50; CInBufPair in2 = *in; do { *in = in2; RINOK(
1287 return
1288 FSE_DecodeHeader(table, in,
1289 predefAccuracy + 3, // accuracyMax
1290 accuracyRes,
1291 numSymbolsMax)
1292 ;
1293 // )} while (--y); return SZ_OK;
1294 }
1295 // numSymsMax = numSymsPredef + ((predefAccuracy & 1) * (32 - 29))); // numSymsMax
1296 // numSymsMax == 32 for offsets
1297
1298 if (seqMode == k_SeqMode_Predef)
1299 {
1300 *accuracyRes = (Byte)predefAccuracy;
1301 memcpy(table, predefs, sizeof(UInt32) << predefAccuracy);
1302 return SZ_OK;
1303 }
1304
1305 // (seqMode == k_SeqMode_RLE)
1306 if (in->len == 0)
1307 return SZ_ERROR_DATA;
1308 in->len--;
1309 {
1310 const Byte *ptr = in->ptr;
1311 const Byte sym = ptr[0];
1312 in->ptr = ptr + 1;
1313 table[0] = (FastInt32)sym
1314 #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
1315 + (numSymbolsMax == NUM_ML_SYMBOLS ? MATCH_LEN_MIN : 0)
1316 #endif
1317 ;
1318 *accuracyRes = 0;
1319 }
1320 return SZ_OK;
1321}
1322
1323
1324typedef struct
1325{
1326 CFseRecord of[1 << 8];
1327 CFseRecord ll[1 << 9];
1328 CFseRecord ml[1 << 9];
1329}
1330CZstdDecFseTables;
1331
1332
1333typedef struct
1334{
1335 Byte *win;
1336 SizeT cycSize;
1337 /*
1338 if (outBuf_fromCaller) : cycSize = outBufSize_fromCaller
1339 else {
1340 if ( isCyclicMode) : cycSize = cyclic_buffer_size = (winSize + extra_space)
1341 if (!isCyclicMode) : cycSize = ContentSize,
1342 (isCyclicMode == true) if (ContentSize >= winSize) or ContentSize is unknown
1343 }
1344 */
1345 SizeT winPos;
1346
1347 CZstdDecOffset reps[3];
1348
1349 Byte ll_accuracy;
1350 Byte of_accuracy;
1351 Byte ml_accuracy;
1352 // Byte seqTables_wereSet;
1353 Byte litHuf_wasSet;
1354
1355 Byte *literalsBase;
1356
1357 size_t winSize; // from header
1358 size_t totalOutCheck; // totalOutCheck <= winSize
1359
1360 #ifdef Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
1361 SEQ_EXTRA_TABLES(m_)
1362 #endif
1363 // UInt64 _pad_Alignment; // is not required now
1364 CZstdDecFseTables fse;
1365 CZstdDecHufTable huf;
1366}
1367CZstdDec1;
1368
1369#define ZstdDec1_GET_BLOCK_SIZE_LIMIT(p) \
1370 ((p)->winSize < kBlockSizeMax ? (UInt32)(p)->winSize : kBlockSizeMax)
1371
1372#define SEQ_TABLES_WERE_NOT_SET_ml_accuracy 1 // accuracy=1 is not used by zstd
1373#define IS_SEQ_TABLES_WERE_SET(p) (((p)->ml_accuracy != SEQ_TABLES_WERE_NOT_SET_ml_accuracy))
1374// #define IS_SEQ_TABLES_WERE_SET(p) ((p)->seqTables_wereSet)
1375
1376
1377static void ZstdDec1_Construct(CZstdDec1 *p)
1378{
1379 #ifdef Z7_ZSTD_DEC_PRINT_TABLE
1380 Print_Predef(6, NUM_LL_SYMBOLS, SEQ_LL_PREDEF_DIST, k_PredefRecords_LL);
1381 Print_Predef(5, NUM_OFFSET_SYMBOLS_PREDEF, SEQ_OFFSET_PREDEF_DIST, k_PredefRecords_OF);
1382 Print_Predef(6, NUM_ML_SYMBOLS, SEQ_ML_PREDEF_DIST, k_PredefRecords_ML);
1383 #endif
1384
1385 p->win = NULL;
1386 p->cycSize = 0;
1387 p->literalsBase = NULL;
1388 #ifdef Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
1389 FILL_LOC_BASES_ALL
1390 #endif
1391}
1392
1393
1394static void ZstdDec1_Init(CZstdDec1 *p)
1395{
1396 p->reps[0] = 1;
1397 p->reps[1] = 4;
1398 p->reps[2] = 8;
1399 // p->seqTables_wereSet = False;
1400 p->ml_accuracy = SEQ_TABLES_WERE_NOT_SET_ml_accuracy;
1401 p->litHuf_wasSet = False;
1402 p->totalOutCheck = 0;
1403}
1404
1405
1406
1407#ifdef MY_CPU_LE_UNALIGN
1408 #define Z7_ZSTD_DEC_USE_UNALIGNED_COPY
1409#endif
1410
1411#ifdef Z7_ZSTD_DEC_USE_UNALIGNED_COPY
1412
1413 #define COPY_CHUNK_SIZE 16
1414
1415 #define COPY_CHUNK_4_2(dest, src) \
1416 { \
1417 ((UInt32 *)(void *)dest)[0] = ((const UInt32 *)(const void *)src)[0]; \
1418 ((UInt32 *)(void *)dest)[1] = ((const UInt32 *)(const void *)src)[1]; \
1419 src += 4 * 2; \
1420 dest += 4 * 2; \
1421 }
1422
1423 /* sse2 doesn't help here with GCC and CLANG,
1424 so we disable sse2 here */
1425 /*
1426 #if defined(MY_CPU_AMD64)
1427 #define Z7_ZSTD_DEC_USE_SSE2
1428 #elif defined(MY_CPU_X86)
1429 #if defined(_MSC_VER) && _MSC_VER >= 1300 && defined(_M_IX86_FP) && (_M_IX86_FP >= 2) \
1430 || defined(__SSE2__) \
1431 // || 1 == 1 // for debug only
1432 #define Z7_ZSTD_DEC_USE_SSE2
1433 #endif
1434 #endif
1435 */
1436
1437 #if defined(MY_CPU_ARM64)
1438 #define COPY_OFFSET_MIN 16
1439 #define COPY_CHUNK1(dest, src) \
1440 { \
1441 vst1q_u8((uint8_t *)(void *)dest, \
1442 vld1q_u8((const uint8_t *)(const void *)src)); \
1443 src += 16; \
1444 dest += 16; \
1445 }
1446
1447 #define COPY_CHUNK(dest, src) \
1448 { \
1449 COPY_CHUNK1(dest, src) \
1450 if ((len -= COPY_CHUNK_SIZE) == 0) break; \
1451 COPY_CHUNK1(dest, src) \
1452 }
1453
1454 #elif defined(Z7_ZSTD_DEC_USE_SSE2)
1455 #include <emmintrin.h> // sse2
1456 #define COPY_OFFSET_MIN 16
1457
1458 #define COPY_CHUNK1(dest, src) \
1459 { \
1460 _mm_storeu_si128((__m128i *)(void *)dest, \
1461 _mm_loadu_si128((const __m128i *)(const void *)src)); \
1462 src += 16; \
1463 dest += 16; \
1464 }
1465
1466 #define COPY_CHUNK(dest, src) \
1467 { \
1468 COPY_CHUNK1(dest, src) \
1469 if ((len -= COPY_CHUNK_SIZE) == 0) break; \
1470 COPY_CHUNK1(dest, src) \
1471 }
1472
1473 #elif defined(MY_CPU_64BIT)
1474 #define COPY_OFFSET_MIN 8
1475
1476 #define COPY_CHUNK(dest, src) \
1477 { \
1478 ((UInt64 *)(void *)dest)[0] = ((const UInt64 *)(const void *)src)[0]; \
1479 ((UInt64 *)(void *)dest)[1] = ((const UInt64 *)(const void *)src)[1]; \
1480 src += 8 * 2; \
1481 dest += 8 * 2; \
1482 }
1483
1484 #else
1485 #define COPY_OFFSET_MIN 4
1486
1487 #define COPY_CHUNK(dest, src) \
1488 { \
1489 COPY_CHUNK_4_2(dest, src); \
1490 COPY_CHUNK_4_2(dest, src); \
1491 }
1492
1493 #endif
1494#endif
1495
1496
1497#ifndef COPY_CHUNK_SIZE
1498 #define COPY_OFFSET_MIN 4
1499 #define COPY_CHUNK_SIZE 8
1500 #define COPY_CHUNK_2(dest, src) \
1501 { \
1502 const Byte a0 = src[0]; \
1503 const Byte a1 = src[1]; \
1504 dest[0] = a0; \
1505 dest[1] = a1; \
1506 src += 2; \
1507 dest += 2; \
1508 }
1509 #define COPY_CHUNK(dest, src) \
1510 { \
1511 COPY_CHUNK_2(dest, src) \
1512 COPY_CHUNK_2(dest, src) \
1513 COPY_CHUNK_2(dest, src) \
1514 COPY_CHUNK_2(dest, src) \
1515 }
1516#endif
1517
1518
1519#define COPY_PREPARE \
1520 len += (COPY_CHUNK_SIZE - 1); \
1521 len &= ~(size_t)(COPY_CHUNK_SIZE - 1); \
1522 { if (len > rem) \
1523 { len = rem; \
1524 rem &= (COPY_CHUNK_SIZE - 1); \
1525 if (rem) { \
1526 len -= rem; \
1527 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \
1528 do *dest++ = *src++; while (--rem); \
1529 if (len == 0) return; }}}
1530
1531#define COPY_CHUNKS \
1532{ \
1533 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \
1534 do { COPY_CHUNK(dest, src) } \
1535 while (len -= COPY_CHUNK_SIZE); \
1536}
1537
1538// (len != 0)
1539// (len <= rem)
1540static
1541Z7_FORCE_INLINE
1542// Z7_ATTRIB_NO_VECTOR
1543void CopyLiterals(Byte *dest, Byte const *src, size_t len, size_t rem)
1544{
1545 COPY_PREPARE
1546 COPY_CHUNKS
1547}
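/* Editor's worked example for COPY_PREPARE above (COPY_CHUNK_SIZE == 16,
   caller guarantees len <= rem):
     len = 5,  rem = 23 : len is rounded up to 16 (<= rem), so one
       16-byte chunk is copied; the over-copy past the requested 5 bytes
       is safe because it stays within (rem).
     len = 20, rem = 23 : len rounds up to 32 > rem, so len = 23;
       the unaligned tail (rem & 15) == 7 is copied byte-by-byte first,
       and the remaining len = 16 runs the chunk loop. */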
1548
1549
1550 /* we can define Z7_STD_DEC_USE_AFTER_CYC_BUF, if we want to use additional
1551 space after cycSize to reduce the code in CopyMatch(): */
1552// for debug:
1553// #define Z7_STD_DEC_USE_AFTER_CYC_BUF
1554
1555/*
1556CopyMatch()
1557if wrap (offset > winPos)
1558{
1559 then we have at least (COPY_CHUNK_SIZE) bytes available in (dest) before we overwrite (src):
1560 (cycSize >= offset + COPY_CHUNK_SIZE)
1561 if defined(Z7_STD_DEC_USE_AFTER_CYC_BUF)
1562 we are allowed to read win[cycSize + COPY_CHUNK_SIZE - 1],
1563}
1564(len != 0)
1565*/
1566static
1567Z7_FORCE_INLINE
1568// Z7_ATTRIB_NO_VECTOR
1569void CopyMatch(size_t offset, size_t len,
1570 Byte *win, size_t winPos, size_t rem, const size_t cycSize)
1571{
1572 Byte *dest = win + winPos;
1573 const Byte *src;
1574 // STAT_INC(g_NumCopy)
1575
1576 if (offset > winPos)
1577 {
1578 size_t back = offset - winPos;
1579 // src = win + cycSize - back;
1580 // cycSize -= offset;
1581 STAT_INC(g_NumOver)
1582 src = dest + (cycSize - offset);
1583 // (src >= dest) here
1584 #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF
1585 if (back < len)
1586 {
1587 #else
1588 if (back < len + (COPY_CHUNK_SIZE - 1))
1589 {
1590 if (back >= len)
1591 {
1592 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
1593 do
1594 *dest++ = *src++;
1595 while (--len);
1596 return;
1597 }
1598 #endif
1599 // back < len
1600 STAT_INC(g_NumOver2)
1601 len -= back;
1602 rem -= back;
1603 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
1604 do
1605 *dest++ = *src++;
1606 while (--back);
1607 src = dest - offset;
1608 // src = win;
1609 // we go to MAIN-COPY
1610 }
1611 }
1612 else
1613 src = dest - offset;
1614
1615 // len != 0
1616 // do *dest++ = *src++; while (--len); return;
1617
1618 // --- MAIN COPY ---
1619 // if (src >= dest), then ((size_t)(src - dest) >= COPY_CHUNK_SIZE)
1620 // so we have at least COPY_CHUNK_SIZE space before overlap for writing.
1621 COPY_PREPARE
1622
1623 /* now (len == COPY_CHUNK_SIZE * x)
1624 so we can unroll for aligned copy */
1625 {
1626 // const unsigned b0 = src[0];
1627 // (COPY_OFFSET_MIN >= 4)
1628
1629 if (offset >= COPY_OFFSET_MIN)
1630 {
1631 COPY_CHUNKS
1632 // return;
1633 }
1634 else
1635 #if (COPY_OFFSET_MIN > 4)
1636 #if COPY_CHUNK_SIZE < 8
1637 #error Stop_Compiling_Bad_COPY_CHUNK_SIZE
1638 #endif
1639 if (offset >= 4)
1640 {
1641 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
1642 do
1643 {
1644 COPY_CHUNK_4_2(dest, src)
1645 #if COPY_CHUNK_SIZE != 16
1646 if (len == 8) break;
1647 #endif
1648 COPY_CHUNK_4_2(dest, src)
1649 }
1650 while (len -= 16);
1651 // return;
1652 }
1653 else
1654 #endif
1655 {
1656 // (offset < 4)
1657 const unsigned b0 = src[0];
1658 if (offset < 2)
1659 {
1660 #if defined(Z7_ZSTD_DEC_USE_UNALIGNED_COPY) && (COPY_CHUNK_SIZE == 16)
1661 #if defined(MY_CPU_64BIT)
1662 {
1663 const UInt64 v64 = (UInt64)b0 * 0x0101010101010101;
1664 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
1665 do
1666 {
1667 ((UInt64 *)(void *)dest)[0] = v64;
1668 ((UInt64 *)(void *)dest)[1] = v64;
1669 dest += 16;
1670 }
1671 while (len -= 16);
1672 }
1673 #else
1674 {
1675 UInt32 v = b0;
1676 v |= v << 8;
1677 v |= v << 16;
1678 do
1679 {
1680 ((UInt32 *)(void *)dest)[0] = v;
1681 ((UInt32 *)(void *)dest)[1] = v;
1682 dest += 8;
1683 ((UInt32 *)(void *)dest)[0] = v;
1684 ((UInt32 *)(void *)dest)[1] = v;
1685 dest += 8;
1686 }
1687 while (len -= 16);
1688 }
1689 #endif
1690 #else
1691 do
1692 {
1693 dest[0] = (Byte)b0;
1694 dest[1] = (Byte)b0;
1695 dest += 2;
1696 dest[0] = (Byte)b0;
1697 dest[1] = (Byte)b0;
1698 dest += 2;
1699 }
1700 while (len -= 4);
1701 #endif
1702 }
1703 else if (offset == 2)
1704 {
1705 const Byte b1 = src[1];
1706 {
1707 do
1708 {
1709 dest[0] = (Byte)b0;
1710 dest[1] = b1;
1711 dest += 2;
1712 }
1713 while (len -= 2);
1714 }
1715 }
1716 else // (offset == 3)
1717 {
1718 const Byte *lim = dest + len - 2;
1719 const Byte b1 = src[1];
1720 const Byte b2 = src[2];
1721 do
1722 {
1723 dest[0] = (Byte)b0;
1724 dest[1] = b1;
1725 dest[2] = b2;
1726 dest += 3;
1727 }
1728 while (dest < lim);
1729 lim++; // points to last byte that must be written
1730 if (dest <= lim)
1731 {
1732 *dest = (Byte)b0;
1733 if (dest != lim)
1734 dest[1] = b1;
1735 }
1736 }
1737 }
1738 }
1739}
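/* Editor's note on the small-offset paths of CopyMatch() above:
     offset == 1 : a run fill; byte b0 is broadcast with the
       0x0101...01 multiply trick on unaligned-capable targets;
     offset == 2 : the pair (b0, b1) is stored repeatedly;
     offset == 3 : for len == 8 the pattern "abc" is written as
       "abcabcab": 3 bytes per iteration up to (len - 2), then the
       1..2 byte tail is patched after the loop. */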
1740
1741
1742
1743#define UPDATE_TOTAL_OUT(p, size) \
1744{ \
1745 size_t _toc = (p)->totalOutCheck + (size); \
1746 const size_t _ws = (p)->winSize; \
1747 if (_toc >= _ws) _toc = _ws; \
1748 (p)->totalOutCheck = _toc; \
1749}
1750
1751
1752#if defined(MY_CPU_64BIT) && defined(MY_CPU_LE_UNALIGN)
1753// we can disable it for debug:
1754#define Z7_ZSTD_DEC_USE_64BIT_LOADS
1755#endif
1756// #define Z7_ZSTD_DEC_USE_64BIT_LOADS // for debug : slow in 32-bit
1757
1758 // SEQ_SRC_OFFSET: how many bytes (src) (seqSrc) is moved back from its original value.
1759// we need (SEQ_SRC_OFFSET != 0) for optimized memory access
1760#ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
1761 #define SEQ_SRC_OFFSET 7
1762#else
1763 #define SEQ_SRC_OFFSET 3
1764#endif
1765#define SRC_PLUS_FOR_4BYTES(bitOffset) (SEQ_SRC_OFFSET - 3) + ((CBitCtr_signed)(bitOffset) >> 3)
1766#define BIT_OFFSET_7BITS(bitOffset) ((unsigned)(bitOffset) & 7)
1767/*
1768 if (BIT_OFFSET_DELTA_BITS == 0) : bitOffset == number_of_unprocessed_bits
1769 if (BIT_OFFSET_DELTA_BITS == 1) : bitOffset == number_of_unprocessed_bits - 1
1770 and we can read 1 bit more in that mode : (8 * n + 1).
1771*/
1772// #define BIT_OFFSET_DELTA_BITS 0
1773#define BIT_OFFSET_DELTA_BITS 1
1774#if BIT_OFFSET_DELTA_BITS == 1
1775 #define GET_SHIFT_FROM_BOFFS7(boff7) (7 ^ (boff7))
1776#else
1777 #define GET_SHIFT_FROM_BOFFS7(boff7) (8 - BIT_OFFSET_DELTA_BITS - (boff7))
1778#endif
1779
1780#define UPDATE_BIT_OFFSET(bitOffset, numBits) \
1781 (bitOffset) -= (CBitCtr)(numBits);
1782
1783#define GET_SHIFT(bitOffset) GET_SHIFT_FROM_BOFFS7(BIT_OFFSET_7BITS(bitOffset))
1784
1785
1786#if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS)
1787 #if (NUM_OFFSET_SYMBOLS_MAX - BIT_OFFSET_DELTA_BITS < 32)
1788 /* if (NUM_OFFSET_SYMBOLS_MAX == 32 && BIT_OFFSET_DELTA_BITS == 1),
1789 we have a depth of 31 + 9 + 9 + 8 = 57 bits that can be read with a single read. */
1790 #define Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF
1791 #endif
1792 #ifndef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF
1793 #if (BIT_OFFSET_DELTA_BITS == 1)
1794 /* if (winLimit - winPos <= (kBlockSizeMax = (1 << 17)))
1795 {
1796 the case (16 bits of literal extra + 16 bits of match extra) is not possible
1797 in a correct stream. So an error will be detected for the (16 + 16) case.
1798 And the longest correct sequence after offset reading is (31 + 9 + 9 + 8 = 57 bits).
1799 So we can use just one 64-bit load here in that case.
1800 }
1801 */
1802 #define Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML
1803 #endif
1804 #endif
1805#endif
1806
1807
1808#if !defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) || \
1809 (!defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) && \
1810 !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML))
1811// in : (0 < bits <= (24 or 25)):
1812#define STREAM_READ_BITS(dest, bits) \
1813{ \
1814 GET32(dest, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \
1815 dest <<= GET_SHIFT(bitOffset); \
1816 UPDATE_BIT_OFFSET(bitOffset, bits) \
1817 dest >>= 32 - bits; \
1818}
1819#endif
1820
1821
1822#define FSE_Peek_1(table, state) table[state]
1823
1824#define STATE_VAR(name) state_ ## name
1825
1826// in : (0 <= accuracy <= (24 or 25))
1827#define FSE_INIT_STATE(name, cond) \
1828{ \
1829 UInt32 r; \
1830 const unsigned bits = p->name ## _accuracy; \
1831 GET32(r, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \
1832 r <<= GET_SHIFT(bitOffset); \
1833 r >>= 1; \
1834 r >>= 31 ^ bits; \
1835 UPDATE_BIT_OFFSET(bitOffset, bits) \
1836 cond \
1837 STATE_VAR(name) = FSE_Peek_1(FSE_TABLE(name), r); \
1838 /* STATE_VAR(name) = dest << 16; */ \
1839}
1840
1841
1842#define FSE_Peek_Plus(name, r) \
1843 STATE_VAR(name) = FSE_Peek_1(FSE_TABLE(name), \
1844 GET_FSE_REC_STATE(STATE_VAR(name)) + r);
1845
1846#define LZ_LOOP_ERROR_EXIT { return SZ_ERROR_DATA; }
1847
1848#define BO_OVERFLOW_CHECK \
1849 { if ((CBitCtr_signed)bitOffset < 0) LZ_LOOP_ERROR_EXIT }
1850
1851
1852#ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
1853
1854#define GET64(dest, p) { const Byte *ptr = p; dest = GetUi64(ptr); }
1855
1856#define FSE_PRELOAD \
1857{ \
1858 GET64(v, src - 4 + SRC_PLUS_FOR_4BYTES(bitOffset)) \
1859 v <<= GET_SHIFT(bitOffset); \
1860}
1861
1862#define FSE_UPDATE_STATE_2(name, cond) \
1863{ \
1864 const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
1865 UInt64 r = v; \
1866 v <<= bits; \
1867 r >>= 1; \
1868 UPDATE_BIT_OFFSET(bitOffset, bits) \
1869 cond \
1870 r >>= 63 ^ bits; \
1871 FSE_Peek_Plus(name, r); \
1872}
1873
1874#define FSE_UPDATE_STATES \
1875 FSE_UPDATE_STATE_2 (ll, {} ) \
1876 FSE_UPDATE_STATE_2 (ml, {} ) \
1877 FSE_UPDATE_STATE_2 (of, BO_OVERFLOW_CHECK) \
1878
1879#else // Z7_ZSTD_DEC_USE_64BIT_LOADS
1880
1881 // it supports 8-bit accuracy for any code
1882 // it supports 9-bit accuracy, if (BIT_OFFSET_DELTA_BITS == 1)
1883#define FSE_UPDATE_STATE_0(name, cond) \
1884{ \
1885 UInt32 r; \
1886 const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
1887 GET16(r, src + 2 + SRC_PLUS_FOR_4BYTES(bitOffset)) \
1888 r >>= (bitOffset & 7); \
1889 r &= (1 << (8 + BIT_OFFSET_DELTA_BITS)) - 1; \
1890 UPDATE_BIT_OFFSET(bitOffset, bits) \
1891 cond \
1892 r >>= (8 + BIT_OFFSET_DELTA_BITS) - bits; \
1893 FSE_Peek_Plus(name, r); \
1894}
1895
1896// for debug (slow):
1897// #define Z7_ZSTD_DEC_USE_FSE_FUSION_FORCE
1898#if BIT_OFFSET_DELTA_BITS == 0 || defined(Z7_ZSTD_DEC_USE_FSE_FUSION_FORCE)
1899 #define Z7_ZSTD_DEC_USE_FSE_FUSION
1900#endif
1901
1902#ifdef Z7_ZSTD_DEC_USE_FSE_FUSION
1903#define FSE_UPDATE_STATE_1(name) \
1904{ UInt32 rest2; \
1905{ \
1906 UInt32 r; \
1907 unsigned bits; \
1908 GET32(r, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \
1909 bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
1910 r <<= GET_SHIFT(bitOffset); \
1911 rest2 = r << bits; \
1912 r >>= 1; \
1913 UPDATE_BIT_OFFSET(bitOffset, bits) \
1914 r >>= 31 ^ bits; \
1915 FSE_Peek_Plus(name, r); \
1916}
1917
1918#define FSE_UPDATE_STATE_3(name) \
1919{ \
1920 const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
1921 rest2 >>= 1; \
1922 UPDATE_BIT_OFFSET(bitOffset, bits) \
1923 rest2 >>= 31 ^ bits; \
1924 FSE_Peek_Plus(name, rest2); \
1925}}
1926
1927#define FSE_UPDATE_STATES \
1928 FSE_UPDATE_STATE_1 (ll) \
1929 FSE_UPDATE_STATE_3 (ml) \
1930 FSE_UPDATE_STATE_0 (of, BO_OVERFLOW_CHECK) \
1931
1932#else // Z7_ZSTD_DEC_USE_64BIT_LOADS
1933
1934#define FSE_UPDATE_STATES \
1935 FSE_UPDATE_STATE_0 (ll, {} ) \
1936 FSE_UPDATE_STATE_0 (ml, {} ) \
1937 FSE_UPDATE_STATE_0 (of, BO_OVERFLOW_CHECK) \
1938
1939#endif // Z7_ZSTD_DEC_USE_FSE_FUSION
1940#endif // Z7_ZSTD_DEC_USE_64BIT_LOADS
1941
1942
1943
1944typedef struct
1945{
1946 UInt32 numSeqs;
1947 UInt32 literalsLen;
1948 const Byte *literals;
1949}
1950CZstdDec1_Vars;
1951
1952
1953// if (BIT_OFFSET_DELTA_BITS != 0), we need (BIT_OFFSET_DELTA_BYTES > 0)
1954#define BIT_OFFSET_DELTA_BYTES BIT_OFFSET_DELTA_BITS
1955
1956/* if (NUM_OFFSET_SYMBOLS_MAX == 32)
1957 max_seq_bit_length = (31) + 16 + 16 + 9 + 8 + 9 = 89 bits
1958 if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF), we have the longest backward
1959 lookahead offset, and we read a UInt64 after reading literal_len.
1960 if (BIT_OFFSET_DELTA_BITS == 1 && NUM_OFFSET_SYMBOLS_MAX == 32)
1961 MAX_BACKWARD_DEPTH = 16 bytes
1962*/
1963#define MAX_BACKWARD_DEPTH \
1964 ((NUM_OFFSET_SYMBOLS_MAX - 1 + 16 + 16 + 7) / 8 + 7 + BIT_OFFSET_DELTA_BYTES)
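/* Editor's arithmetic check (assuming NUM_OFFSET_SYMBOLS_MAX == 32 and
   BIT_OFFSET_DELTA_BYTES == 1, per the comment above):
   (32 - 1 + 16 + 16 + 7) / 8 + 7 + 1 = 70 / 8 + 8 = 8 + 8 = 16 bytes,
   matching the "MAX_BACKWARD_DEPTH = 16 bytes" note. */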
1965
1966/* srcLen != 0
1967 src == real_data_ptr - SEQ_SRC_OFFSET - BIT_OFFSET_DELTA_BYTES
1968 if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) then
1969 (winLimit - p->winPos <= (1 << 17)) is required
1970*/
1971static
1972Z7_NO_INLINE
1973// Z7_ATTRIB_NO_VECTOR
1974SRes Decompress_Sequences(CZstdDec1 * const p,
1975 const Byte *src, const size_t srcLen,
1976 const size_t winLimit,
1977 const CZstdDec1_Vars * const vars)
1978{
1979#ifdef Z7_ZSTD_DEC_USE_BASES_LOCAL
1980 SEQ_EXTRA_TABLES(a_)
1981#endif
1982
1983 // for debug:
1984 // #define Z7_ZSTD_DEC_USE_LOCAL_FSE_TABLES
1985#ifdef Z7_ZSTD_DEC_USE_LOCAL_FSE_TABLES
1986 #define FSE_TABLE(n) fse. n
1987 const CZstdDecFseTables fse = p->fse;
1988 /*
1989 CZstdDecFseTables fse;
1990 #define COPY_FSE_TABLE(n) \
1991 memcpy(fse. n, p->fse. n, (size_t)4 << p-> n ## _accuracy);
1992 COPY_FSE_TABLE(of)
1993 COPY_FSE_TABLE(ll)
1994 COPY_FSE_TABLE(ml)
1995 */
1996#else
1997 #define FSE_TABLE(n) (p->fse. n)
1998#endif
1999
2000#ifdef Z7_ZSTD_DEC_USE_BASES_LOCAL
2001 FILL_LOC_BASES_ALL
2002#endif
2003
2004 {
2005 unsigned numSeqs = vars->numSeqs;
2006 const Byte *literals = vars->literals;
2007 ptrdiff_t literalsLen = (ptrdiff_t)vars->literalsLen;
2008 Byte * const win = p->win;
2009 size_t winPos = p->winPos;
2010 const size_t cycSize = p->cycSize;
2011 size_t totalOutCheck = p->totalOutCheck;
2012 const size_t winSize = p->winSize;
2013 size_t reps_0 = p->reps[0];
2014 size_t reps_1 = p->reps[1];
2015 size_t reps_2 = p->reps[2];
2016 UInt32 STATE_VAR(ll), STATE_VAR(of), STATE_VAR(ml);
2017 CBitCtr bitOffset;
2018
2019 SET_bitOffset_TO_PAD (bitOffset, src + SEQ_SRC_OFFSET, srcLen + BIT_OFFSET_DELTA_BYTES)
2020
2021 bitOffset -= BIT_OFFSET_DELTA_BITS;
2022
2023 FSE_INIT_STATE(ll, {} )
2024 FSE_INIT_STATE(of, {} )
2025 FSE_INIT_STATE(ml, BO_OVERFLOW_CHECK)
2026
2027 for (;;)
2028 {
2029 size_t matchLen;
2030 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2031 UInt64 v;
2032 #endif
2033
2034 #ifdef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF
2035 FSE_PRELOAD
2036 #endif
2037
2038 // if (of_code == 0)
2039 if ((Byte)STATE_VAR(of) == 0)
2040 {
2041 if (GET_FSE_REC_SYM(STATE_VAR(ll)) == 0)
2042 {
2043 const size_t offset = reps_1;
2044 reps_1 = reps_0;
2045 reps_0 = offset;
2046 STAT_INC(g_Num_Rep1)
2047 }
2048 STAT_UPDATE(else g_Num_Rep0++;)
2049 }
2050 else
2051 {
2052 const unsigned of_code = (Byte)STATE_VAR(of);
2053
2054 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2055 #if !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)
2056 FSE_PRELOAD
2057 #endif
2058 #else
2059 UInt32 v;
2060 {
2061 const Byte *src4 = src + SRC_PLUS_FOR_4BYTES(bitOffset);
2062 const unsigned skip = GET_SHIFT(bitOffset);
2063 GET32(v, src4)
2064 v <<= skip;
2065 v |= (UInt32)src4[-1] >> (8 - skip);
2066 }
2067 #endif
2068
2069 UPDATE_BIT_OFFSET(bitOffset, of_code)
2070
2071 if (of_code == 1)
2072 {
2073 // read 1 bit
2074 #if defined(Z7_MSC_VER_ORIGINAL) || defined(MY_CPU_X86_OR_AMD64)
2075 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2076 #define CHECK_HIGH_BIT_64(a) ((Int64)(UInt64)(a) < 0)
2077 #else
2078 #define CHECK_HIGH_BIT_32(a) ((Int32)(UInt32)(a) < 0)
2079 #endif
2080 #else
2081 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2082 #define CHECK_HIGH_BIT_64(a) ((UInt64)(a) & ((UInt64)1 << 63))
2083 #else
2084 #define CHECK_HIGH_BIT_32(a) ((UInt32)(a) & ((UInt32)1 << 31))
2085 #endif
2086 #endif
2087
2088 if
2089 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2090 CHECK_HIGH_BIT_64 (((UInt64)GET_FSE_REC_SYM(STATE_VAR(ll)) - 1) ^ v)
2091 #else
2092 CHECK_HIGH_BIT_32 (((UInt32)GET_FSE_REC_SYM(STATE_VAR(ll)) - 1) ^ v)
2093 #endif
2094 {
2095 v <<= 1;
2096 {
2097 const size_t offset = reps_2;
2098 reps_2 = reps_1;
2099 reps_1 = reps_0;
2100 reps_0 = offset;
2101 STAT_INC(g_Num_Rep2)
2102 }
2103 }
2104 else
2105 {
2106 if (GET_FSE_REC_SYM(STATE_VAR(ll)) == 0)
2107 {
2108 // litLen == 0 && bit == 1
2109 STAT_INC(g_Num_Rep3)
2110 v <<= 1;
2111 reps_2 = reps_1;
2112 reps_1 = reps_0;
2113 if (--reps_0 == 0)
2114 {
2115 // LZ_LOOP_ERROR_EXIT
2116 // original-zstd decoder : input is corrupted; force offset to 1
2117 // reps_0 = 1;
2118 reps_0++;
2119 }
2120 }
2121 else
2122 {
2123 // litLen != 0 && bit == 0
2124 v <<= 1;
2125 {
2126 const size_t offset = reps_1;
2127 reps_1 = reps_0;
2128 reps_0 = offset;
2129 STAT_INC(g_Num_Rep1)
2130 }
2131 }
2132 }
2133 }
2134 else
2135 {
2136 // (2 <= of_code)
2137 // if (of_code >= 32) LZ_LOOP_ERROR_EXIT // optional check
2138 // we don't allow (of_code >= 32) cases in another code
2139 reps_2 = reps_1;
2140 reps_1 = reps_0;
2141 reps_0 = ((size_t)1 << of_code) - 3 + (size_t)
2142 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2143 (v >> (64 - of_code));
2144 v <<= of_code;
2145 #else
2146 (v >> (32 - of_code));
2147 #endif
2148 }
2149 }
2150
2151 #ifdef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML
2152 FSE_PRELOAD
2153 #endif
2154
2155 matchLen = (size_t)GET_FSE_REC_SYM(STATE_VAR(ml))
2156 #ifndef Z7_ZSTD_DEC_USE_ML_PLUS3
2157 + MATCH_LEN_MIN
2158 #endif
2159 ;
2160 {
2161 {
2162 if (matchLen >= 32 + MATCH_LEN_MIN) // if (state_ml & 0x20)
2163 {
2164 const unsigned extra = BASES_TABLE(SEQ_ML_EXTRA) [(size_t)matchLen - MATCH_LEN_MIN];
2165 matchLen = BASES_TABLE(SEQ_ML_BASES) [(size_t)matchLen - MATCH_LEN_MIN];
2166 #if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) && \
2167 (defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) || \
2168 defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF))
2169 {
2170 UPDATE_BIT_OFFSET(bitOffset, extra)
2171 matchLen += (size_t)(v >> (64 - extra));
2172 #if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)
2173 FSE_PRELOAD
2174 #else
2175 v <<= extra;
2176 #endif
2177 }
2178 #else
2179 {
2180 UInt32 v32;
2181 STREAM_READ_BITS(v32, extra)
2182 matchLen += v32;
2183 }
2184 #endif
2185 STAT_INC(g_Num_Match)
2186 }
2187 }
2188 }
2189
2190 #if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) && \
2191 !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) && \
2192 !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML)
2193 FSE_PRELOAD
2194 #endif
2195
2196 {
2197 size_t litLen = GET_FSE_REC_SYM(STATE_VAR(ll));
2198 if (litLen)
2199 {
2200 // if (STATE_VAR(ll) & 0x70)
2201 if (litLen >= 16)
2202 {
2203 const unsigned extra = BASES_TABLE(SEQ_LL_EXTRA) [litLen];
2204 litLen = BASES_TABLE(SEQ_LL_BASES) [litLen];
2205 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2206 {
2207 UPDATE_BIT_OFFSET(bitOffset, extra)
2208 litLen += (size_t)(v >> (64 - extra));
2209 #if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)
2210 FSE_PRELOAD
2211 #else
2212 v <<= extra;
2213 #endif
2214 }
2215 #else
2216 {
2217 UInt32 v32;
2218 STREAM_READ_BITS(v32, extra)
2219 litLen += v32;
2220 }
2221 #endif
2222 STAT_INC(g_Num_LitsBig)
2223 }
2224
2225 if ((literalsLen -= (ptrdiff_t)litLen) < 0)
2226 LZ_LOOP_ERROR_EXIT
2227 totalOutCheck += litLen;
2228 {
2229 const size_t rem = winLimit - winPos;
2230 if (litLen > rem)
2231 LZ_LOOP_ERROR_EXIT
2232 {
2233 const Byte *literals_temp = literals;
2234 Byte *d = win + winPos;
2235 literals += litLen;
2236 winPos += litLen;
2237 CopyLiterals(d, literals_temp, litLen, rem);
2238 }
2239 }
2240 }
2241 STAT_UPDATE(else g_Num_Lit0++;)
2242 }
2243
2244 #define COPY_MATCH \
2245 { if (reps_0 > winSize || reps_0 > totalOutCheck) LZ_LOOP_ERROR_EXIT \
2246 totalOutCheck += matchLen; \
2247 { const size_t rem = winLimit - winPos; \
2248 if (matchLen > rem) LZ_LOOP_ERROR_EXIT \
2249 { const size_t winPos_temp = winPos; \
2250 winPos += matchLen; \
2251 CopyMatch(reps_0, matchLen, win, winPos_temp, rem, cycSize); }}}
2252
2253 if (--numSeqs == 0)
2254 {
2255 COPY_MATCH
2256 break;
2257 }
2258 FSE_UPDATE_STATES
2259 COPY_MATCH
2260 } // for
2261
2262 if ((CBitCtr_signed)bitOffset != BIT_OFFSET_DELTA_BYTES * 8 - BIT_OFFSET_DELTA_BITS)
2263 return SZ_ERROR_DATA;
2264
2265 if (literalsLen)
2266 {
2267 const size_t rem = winLimit - winPos;
2268 if ((size_t)literalsLen > rem)
2269 return SZ_ERROR_DATA;
2270 {
2271 Byte *d = win + winPos;
2272 winPos += (size_t)literalsLen;
2273 totalOutCheck += (size_t)literalsLen;
2274 CopyLiterals
2275 // memcpy
2276 (d, literals, (size_t)literalsLen, rem);
2277 }
2278 }
2279 if (totalOutCheck >= winSize)
2280 totalOutCheck = winSize;
2281 p->totalOutCheck = totalOutCheck;
2282 p->winPos = winPos;
2283 p->reps[0] = (CZstdDecOffset)reps_0;
2284 p->reps[1] = (CZstdDecOffset)reps_1;
2285 p->reps[2] = (CZstdDecOffset)reps_2;
2286 }
2287 return SZ_OK;
2288}
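/* Editor's summary of the offset decoding in the loop above (hedged;
   per the zstd sequence format):
     of_code == 0 : repeat offset: reps[0], or reps[1] when the literal
       length symbol is 0;
     of_code == 1 : one extra bit together with (litLen == 0) selects
       reps[1], reps[2], or (reps[0] - 1); the underflow of (reps[0] - 1)
       to 0 is forced to 1, as in the original zstd decoder;
     of_code >= 2 : a new offset: (1 << of_code) - 3 + of_code extra bits,
       and the repeat history shifts down. */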
2289
2290
2291// for debug: define to check that ZstdDec1_NeedTempBufferForInput() works correctly:
2292// #define Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP // define it for debug only
2293#ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
2294static unsigned g_numSeqs;
2295#endif
2296
2297
2298#define k_LitBlockType_Flag_RLE_or_Treeless 1
2299#define k_LitBlockType_Flag_Compressed 2
2300
2301// outLimit : is strong limit
2302// outLimit <= ZstdDec1_GET_BLOCK_SIZE_LIMIT(p)
2303// inSize != 0
2304static
2305Z7_NO_INLINE
2306SRes ZstdDec1_DecodeBlock(CZstdDec1 *p,
2307 const Byte *src, SizeT inSize, SizeT afterAvail,
2308 const size_t outLimit)
2309{
2310 CZstdDec1_Vars vars;
2311 vars.literals = p->literalsBase;
2312 {
2313 const unsigned b0 = *src++;
2314 UInt32 numLits, compressedSize;
2315 const Byte *litStream;
2316 Byte *literalsDest;
2317 inSize--;
2318
2319 if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
2320 {
2321 // we need at least one additional byte for (numSeqs).
2322 // so we check for that additional byte in conditions.
2323 numLits = b0 >> 3;
2324 if (b0 & 4)
2325 {
2326 UInt32 v;
2327 if (inSize < 1 + 1) // we need at least 1 byte here and 1 byte for (numSeqs).
2328 return SZ_ERROR_DATA;
2329 numLits >>= 1;
2330 v = GetUi16(src);
2331 src += 2;
2332 inSize -= 2;
2333 if ((b0 & 8) == 0)
2334 {
2335 src--;
2336 inSize++;
2337 v = (Byte)v;
2338 }
2339 numLits += v << 4;
2340 }
2341 compressedSize = 1;
2342 if ((b0 & k_LitBlockType_Flag_RLE_or_Treeless) == 0)
2343 compressedSize = numLits;
2344 }
2345 else if (inSize < 4)
2346 return SZ_ERROR_DATA;
2347 else
2348 {
2349 const unsigned mode4Streams = b0 & 0xc;
2350 const unsigned numBytes = (3 * mode4Streams + 32) >> 4;
2351 const unsigned numBits = 4 * numBytes - 2;
2352 const UInt32 mask = ((UInt32)16 << numBits) - 1;
2353 compressedSize = GetUi32(src);
2354 numLits = ((
2355 #ifdef MY_CPU_LE_UNALIGN
2356 GetUi32(src - 1)
2357 #else
2358 ((compressedSize << 8) + b0)
2359 #endif
2360 ) >> 4) & mask;
2361 src += numBytes;
2362 inSize -= numBytes;
2363 compressedSize >>= numBits;
2364 compressedSize &= mask;
2365 /*
2366 if (numLits != 0) printf("inSize = %7u num_lits=%7u compressed=%7u ratio = %u ratio2 = %u\n",
2367 i1, numLits, (unsigned)compressedSize * 1, (unsigned)compressedSize * 100 / numLits,
2368 (unsigned)numLits * 100 / (unsigned)inSize);
2369 }
2370 */
2371 if (compressedSize == 0)
2372 return SZ_ERROR_DATA; // (compressedSize == 0) is not allowed
2373 }
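 /* Editor's worked example for the field extraction above (zstd
 compressed-literals header): (3 * mode4Streams + 32) >> 4 gives
 mode4Streams = 0 or 4 : numBytes = 2, numBits = 6 -> 10-bit fields
 mode4Streams = 8 : numBytes = 3, numBits = 10 -> 14-bit fields
 mode4Streams = 12 : numBytes = 4, numBits = 14 -> 18-bit fields
 numLits and compressedSize are two equal-width fields packed after
 the 4 low header bits of b0. */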
2374
2375 STAT_UPDATE(g_Num_Lits += numLits;)
2376
2377 vars.literalsLen = numLits;
2378
2379 if (compressedSize >= inSize)
2380 return SZ_ERROR_DATA;
2381 litStream = src;
2382 src += compressedSize;
2383 inSize -= compressedSize;
2384 // inSize != 0
2385 {
2386 UInt32 numSeqs = *src++;
2387 inSize--;
2388 if (numSeqs > 127)
2389 {
2390 UInt32 b1;
2391 if (inSize == 0)
2392 return SZ_ERROR_DATA;
2393 numSeqs -= 128;
2394 b1 = *src++;
2395 inSize--;
2396 if (numSeqs == 127)
2397 {
2398 if (inSize == 0)
2399 return SZ_ERROR_DATA;
2400 numSeqs = (UInt32)(*src++) + 127;
2401 inSize--;
2402 }
2403 numSeqs = (numSeqs << 8) + b1;
2404 }
2405 if (numSeqs * MATCH_LEN_MIN + numLits > outLimit)
2406 return SZ_ERROR_DATA;
2407 vars.numSeqs = numSeqs;
2408
2409 STAT_UPDATE(g_NumSeqs_total += numSeqs;)
2410 /*
2411 #ifdef SHOW_STAT
2412 printf("\n %5u : %8u, %8u : %5u", (int)g_Num_Blocks_Compressed, (int)numSeqs, (int)g_NumSeqs_total,
2413 (int)g_NumSeqs_total / g_Num_Blocks_Compressed);
2414 #endif
2415 // printf("\nnumSeqs2 = %d", numSeqs);
2416 */
2417 #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
2418 if (numSeqs != g_numSeqs) return SZ_ERROR_DATA; // for debug
2419 #endif
2420 if (numSeqs == 0)
2421 {
2422 if (inSize != 0)
2423 return SZ_ERROR_DATA;
2424 literalsDest = p->win + p->winPos;
2425 }
2426 else
2427 literalsDest = p->literalsBase;
2428 }
2429
2430 if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
2431 {
2432 if (b0 & k_LitBlockType_Flag_RLE_or_Treeless)
2433 {
2434 memset(literalsDest, litStream[0], numLits);
2435 if (vars.numSeqs)
2436 {
2437 // literalsDest == p->literalsBase == vars.literals
2438 #if COPY_CHUNK_SIZE > 1
2439 memset(p->literalsBase + numLits, 0, COPY_CHUNK_SIZE);
2440 #endif
2441 }
2442 }
2443 else
2444 {
2445 // unsigned y;
2446 // for (y = 0; y < 10000; y++)
2447 memcpy(literalsDest, litStream, numLits);
2448 if (vars.numSeqs)
2449 {
2450 /* we need up to (15 == COPY_CHUNK_SIZE - 1) bytes of space for optimized CopyLiterals().
2451 If we have additional space in the input stream after the literals stream,
2452 we use a direct copy of the raw literals in the input stream */
2453 if ((size_t)(src + inSize - litStream) - numLits + afterAvail >= (COPY_CHUNK_SIZE - 1))
2454 vars.literals = litStream;
2455 else
2456 {
2457 // literalsDest == p->literalsBase == vars.literals
2458 #if COPY_CHUNK_SIZE > 1
2459 /* CopyLiterals():
2460 1) we don't want reading non-initialized data
2461 2) we will copy only zero byte after literals buffer */
2462 memset(p->literalsBase + numLits, 0, COPY_CHUNK_SIZE);
2463 #endif
2464 }
2465 }
2466 }
2467 }
2468 else
2469 {
2470 CInBufPair hufStream;
2471 hufStream.ptr = litStream;
2472 hufStream.len = compressedSize;
2473
2474 if ((b0 & k_LitBlockType_Flag_RLE_or_Treeless) == 0)
2475 {
2476 // unsigned y = 100; CInBufPair hs2 = hufStream; do { hufStream = hs2;
2477 RINOK(Huf_DecodeTable(&p->huf, &hufStream))
2478 p->litHuf_wasSet = True;
2479 // } while (--y);
2480 }
2481 else if (!p->litHuf_wasSet)
2482 return SZ_ERROR_DATA;
2483
2484 {
2485 // int yyy; for (yyy = 0; yyy < 34; yyy++) {
2486 SRes sres;
2487 if ((b0 & 0xc) == 0) // mode4Streams
2488 sres = Huf_Decompress_1stream((const Byte *)(const void *)p->huf.table64,
2489 hufStream.ptr - HUF_SRC_OFFSET, hufStream.len, literalsDest, numLits);
2490 else
2491 {
2492 // 6 bytes for the jump table + 4 * 1 bytes of end-padding
2493 if (hufStream.len < 6 + 4)
2494 return SZ_ERROR_DATA;
2495 // the condition from original-zstd decoder:
2496 #define Z7_ZSTD_MIN_LITERALS_FOR_4_STREAMS 6
2497 if (numLits < Z7_ZSTD_MIN_LITERALS_FOR_4_STREAMS)
2498 return SZ_ERROR_DATA;
2499 sres = Huf_Decompress_4stream((const Byte *)(const void *)p->huf.table64,
2500 hufStream.ptr + (6 - HUF_SRC_OFFSET), hufStream.len, literalsDest, numLits);
2501 }
2502 RINOK(sres)
2503 // }
2504 }
2505 }
2506
2507 if (vars.numSeqs == 0)
2508 {
2509 p->winPos += numLits;
2510 return SZ_OK;
2511 }
2512 }
2513 {
2514 CInBufPair in;
2515 unsigned mode;
2516 unsigned seqMode;
2517
2518 in.ptr = src;
2519 in.len = inSize;
2520 if (in.len == 0)
2521 return SZ_ERROR_DATA;
2522 in.len--;
2523 mode = *in.ptr++;
2524 if (mode & 3) // Reserved bits
2525 return SZ_ERROR_DATA;
2526
2527 seqMode = (mode >> 6);
2528 if (seqMode == k_SeqMode_Repeat)
2529 { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
2530 else RINOK(FSE_Decode_SeqTable(
2531 p->fse.ll,
2532 &in,
2533 6, // predefAccuracy
2534 &p->ll_accuracy,
2535 NUM_LL_SYMBOLS,
2536 k_PredefRecords_LL,
2537 seqMode))
2538
2539 seqMode = (mode >> 4) & 3;
2540 if (seqMode == k_SeqMode_Repeat)
2541 { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
2542 else RINOK(FSE_Decode_SeqTable(
2543 p->fse.of,
2544 &in,
2545 5, // predefAccuracy
2546 &p->of_accuracy,
2547 NUM_OFFSET_SYMBOLS_MAX,
2548 k_PredefRecords_OF,
2549 seqMode))
2550
2551 seqMode = (mode >> 2) & 3;
2552 if (seqMode == k_SeqMode_Repeat)
2553 { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
2554 else
2555 {
2556 RINOK(FSE_Decode_SeqTable(
2557 p->fse.ml,
2558 &in,
2559 6, // predefAccuracy
2560 &p->ml_accuracy,
2561 NUM_ML_SYMBOLS,
2562 k_PredefRecords_ML,
2563 seqMode))
2564 /*
2565 #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
2566 // { unsigned y = 1 << 10; do
2567 {
2568 const unsigned accuracy = p->ml_accuracy;
2569 if (accuracy == 0)
2570 p->fse.ml[0] += 3;
2571 else
2572 #ifdef MY_CPU_64BIT
2573 {
2574 // alignment (UInt64 _pad_Alignment) in fse.ml is required for that code
2575 UInt64 *table = (UInt64 *)(void *)p->fse.ml;
2576 const UInt64 *end = (const UInt64 *)(const void *)
2577 ((const Byte *)(const void *)table + ((size_t)sizeof(CFseRecord) << accuracy));
2578 do
2579 {
2580 table[0] += ((UInt64)MATCH_LEN_MIN << 32) + MATCH_LEN_MIN;
2581 table[1] += ((UInt64)MATCH_LEN_MIN << 32) + MATCH_LEN_MIN;
2582 table += 2;
2583 }
2584 while (table != end);
2585 }
2586 #else
2587 {
2588 UInt32 *table = p->fse.ml;
2589 const UInt32 *end = (const UInt32 *)(const void *)
2590 ((const Byte *)(const void *)table + ((size_t)sizeof(CFseRecord) << accuracy));
2591 do
2592 {
2593 table[0] += MATCH_LEN_MIN;
2594 table[1] += MATCH_LEN_MIN;
2595 table += 2;
2596 table[0] += MATCH_LEN_MIN;
2597 table[1] += MATCH_LEN_MIN;
2598 table += 2;
2599 }
2600 while (table != end);
2601 }
2602 #endif
2603 }
2604 // while (--y); }
2605 #endif
2606 */
2607 }
2608
2609 // p->seqTables_wereSet = True;
2610 if (in.len == 0)
2611 return SZ_ERROR_DATA;
2612 return Decompress_Sequences(p,
2613 in.ptr - SEQ_SRC_OFFSET - BIT_OFFSET_DELTA_BYTES, in.len,
2614 p->winPos + outLimit, &vars);
2615 }
2616}
2617
2618
2619
2620
2621// inSize != 0
2622 // it must behave similarly to ZstdDec1_DecodeBlock()
2623static size_t ZstdDec1_NeedTempBufferForInput(
2624 const SizeT beforeSize, const Byte * const src, const SizeT inSize)
2625{
2626 unsigned b0;
2627 UInt32 pos;
2628
2629 #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
2630 g_numSeqs = 1 << 24;
2631 #else
2632 // we have at least 3 bytes before seq data: litBlockType, numSeqs, seqMode
2633 #define MIN_BLOCK_LZ_HEADERS_SIZE 3
2634 if (beforeSize >= MAX_BACKWARD_DEPTH - MIN_BLOCK_LZ_HEADERS_SIZE)
2635 return 0;
2636 #endif
2637
2638 b0 = src[0];
2639
2640 if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
2641 {
2642 UInt32 numLits = b0 >> 3;
2643 pos = 1;
2644 if (b0 & 4)
2645 {
2646 UInt32 v;
2647 if (inSize < 3)
2648 return 0;
2649 numLits >>= 1;
2650 v = GetUi16(src + 1);
2651 pos = 3;
2652 if ((b0 & 8) == 0)
2653 {
2654 pos = 2;
2655 v = (Byte)v;
2656 }
2657 numLits += v << 4;
2658 }
2659 if (b0 & k_LitBlockType_Flag_RLE_or_Treeless)
2660 numLits = 1;
2661 pos += numLits;
2662 }
2663 else if (inSize < 5)
2664 return 0;
2665 else
2666 {
2667 const unsigned mode4Streams = b0 & 0xc;
2668 const unsigned numBytes = (3 * mode4Streams + 48) >> 4;
2669 const unsigned numBits = 4 * numBytes - 6;
2670 UInt32 cs = GetUi32(src + 1);
2671 cs >>= numBits;
2672 cs &= ((UInt32)16 << numBits) - 1;
2673 if (cs == 0)
2674 return 0;
2675 pos = numBytes + cs;
2676 }
2677
2678 if (pos >= inSize)
2679 return 0;
2680 {
2681 UInt32 numSeqs = src[pos++];
2682 if (numSeqs > 127)
2683 {
2684 UInt32 b1;
2685 if (pos >= inSize)
2686 return 0;
2687 numSeqs -= 128;
2688 b1 = src[pos++];
2689 if (numSeqs == 127)
2690 {
2691 if (pos >= inSize)
2692 return 0;
2693 numSeqs = (UInt32)(src[pos++]) + 127;
2694 }
2695 numSeqs = (numSeqs << 8) + b1;
2696 }
2697 #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
2698 g_numSeqs = numSeqs; // for debug
2699 #endif
2700 if (numSeqs == 0)
2701 return 0;
2702 }
2703 /*
2704 if (pos >= inSize)
2705 return 0;
2706 pos++;
2707 */
2708 // we will have one additional byte for seqMode:
2709 if (beforeSize + pos >= MAX_BACKWARD_DEPTH - 1)
2710 return 0;
2711 return 1;
2712}
2713
2714
2715
2716// ---------- ZSTD FRAME ----------
2717
2718#define kBlockType_Raw 0
2719#define kBlockType_RLE 1
2720#define kBlockType_Compressed 2
2721#define kBlockType_Reserved 3
2722
2723typedef enum
2724{
2725 // begin: states that require 4 bytes:
2726 ZSTD2_STATE_SIGNATURE,
2727 ZSTD2_STATE_HASH,
2728 ZSTD2_STATE_SKIP_HEADER,
2729 // end of states that require 4 bytes
2730
2731 ZSTD2_STATE_SKIP_DATA,
2732 ZSTD2_STATE_FRAME_HEADER,
2733 ZSTD2_STATE_AFTER_HEADER,
2734 ZSTD2_STATE_BLOCK,
2735 ZSTD2_STATE_DATA,
2736 ZSTD2_STATE_FINISHED
2737} EZstd2State;
2738
2739
2740struct CZstdDec
2741{
2742 EZstd2State frameState;
2743 unsigned tempSize;
2744
2745 Byte temp[14]; // 14 is required
2746
2747 Byte descriptor;
2748 Byte windowDescriptor;
2749 Byte isLastBlock;
2750 Byte blockType;
2751 Byte isErrorState;
2752 Byte hashError;
2753 Byte disableHash;
2754 Byte isCyclicMode;
2755
2756 UInt32 blockSize;
2757 UInt32 dictionaryId;
2758 UInt32 curBlockUnpackRem; // for compressed blocks only
2759 UInt32 inTempPos;
2760
2761 UInt64 contentSize;
2762 UInt64 contentProcessed;
2763 CXxh64State xxh64;
2764
2765 Byte *inTemp;
2766 SizeT winBufSize_Allocated;
2767 Byte *win_Base;
2768
2769 ISzAllocPtr alloc_Small;
2770 ISzAllocPtr alloc_Big;
2771
2772 CZstdDec1 decoder;
2773};
2774
2775#define ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p) \
2776 ((unsigned)(p)->contentProcessed & (Z7_XXH64_BLOCK_SIZE - 1))
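// xxh64 is fed in whole 32-byte (Z7_XXH64_BLOCK_SIZE) blocks only, so the
// tail of (contentProcessed mod 32) bytes stays unprocessed in the window
// until more data arrives or the final digest is computed.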
2777
2778#define ZSTD_DEC_IS_LAST_BLOCK(p) ((p)->isLastBlock)
2779
2780
2781static void ZstdDec_FreeWindow(CZstdDec * const p)
2782{
2783 if (p->win_Base)
2784 {
2785 ISzAlloc_Free(p->alloc_Big, p->win_Base);
2786 p->win_Base = NULL;
2787 // p->decoder.win = NULL;
2788 p->winBufSize_Allocated = 0;
2789 }
2790}
2791
2792
2793CZstdDecHandle ZstdDec_Create(ISzAllocPtr alloc_Small, ISzAllocPtr alloc_Big)
2794{
2795 CZstdDec *p = (CZstdDec *)ISzAlloc_Alloc(alloc_Small, sizeof(CZstdDec));
2796 if (!p)
2797 return NULL;
2798 p->alloc_Small = alloc_Small;
2799 p->alloc_Big = alloc_Big;
2800 // ZstdDec_CONSTRUCT(p)
2801 p->inTemp = NULL;
2802 p->win_Base = NULL;
2803 p->winBufSize_Allocated = 0;
2804 p->disableHash = False;
2805 ZstdDec1_Construct(&p->decoder);
2806 return p;
2807}
2808
2809void ZstdDec_Destroy(CZstdDecHandle p)
2810{
2811 #ifdef SHOW_STAT
2812 #define PRINT_STAT1(name, v) \
2813 printf("\n%25s = %9u", name, v);
2814 PRINT_STAT1("g_Num_Blocks_Compressed", g_Num_Blocks_Compressed)
2815 PRINT_STAT1("g_Num_Blocks_memcpy", g_Num_Blocks_memcpy)
2816 PRINT_STAT1("g_Num_Wrap_memmove_Num", g_Num_Wrap_memmove_Num)
2817 PRINT_STAT1("g_Num_Wrap_memmove_Bytes", g_Num_Wrap_memmove_Bytes)
2818 if (g_Num_Blocks_Compressed)
2819 {
2820 #define PRINT_STAT(name, v) \
2821 printf("\n%17s = %9u, per_block = %8u", name, v, v / g_Num_Blocks_Compressed);
2822 PRINT_STAT("g_NumSeqs", g_NumSeqs_total)
2823 // PRINT_STAT("g_NumCopy", g_NumCopy)
2824 PRINT_STAT("g_NumOver", g_NumOver)
2825 PRINT_STAT("g_NumOver2", g_NumOver2)
2826 PRINT_STAT("g_Num_Match", g_Num_Match)
2827 PRINT_STAT("g_Num_Lits", g_Num_Lits)
2828 PRINT_STAT("g_Num_LitsBig", g_Num_LitsBig)
2829 PRINT_STAT("g_Num_Lit0", g_Num_Lit0)
2830 PRINT_STAT("g_Num_Rep_0", g_Num_Rep0)
2831 PRINT_STAT("g_Num_Rep_1", g_Num_Rep1)
2832 PRINT_STAT("g_Num_Rep_2", g_Num_Rep2)
2833 PRINT_STAT("g_Num_Rep_3", g_Num_Rep3)
2834 PRINT_STAT("g_Num_Threshold_0", g_Num_Threshold_0)
2835 PRINT_STAT("g_Num_Threshold_1", g_Num_Threshold_1)
2836 PRINT_STAT("g_Num_Threshold_0sum", g_Num_Threshold_0sum)
2837 PRINT_STAT("g_Num_Threshold_1sum", g_Num_Threshold_1sum)
2838 }
2839 printf("\n");
2840 #endif
2841
2842 ISzAlloc_Free(p->alloc_Small, p->decoder.literalsBase);
2843 // p->->decoder.literalsBase = NULL;
2844 ISzAlloc_Free(p->alloc_Small, p->inTemp);
2845 // p->inTemp = NULL;
2846 ZstdDec_FreeWindow(p);
2847 ISzAlloc_Free(p->alloc_Small, p);
2848}
2849
2850
2851
2852#define kTempBuffer_PreSize (1u << 6)
2853#if kTempBuffer_PreSize < MAX_BACKWARD_DEPTH
2854 #error Stop_Compiling_Bad_kTempBuffer_PreSize
2855#endif
2856
2857static SRes ZstdDec_AllocateMisc(CZstdDec *p)
2858{
2859 #define k_Lit_AfterAvail (1u << 6)
2860 #if k_Lit_AfterAvail < (COPY_CHUNK_SIZE - 1)
2861 #error Stop_Compiling_Bad_k_Lit_AfterAvail
2862 #endif
2863 // return ZstdDec1_Allocate(&p->decoder, p->alloc_Small);
2864 if (!p->decoder.literalsBase)
2865 {
2866 p->decoder.literalsBase = (Byte *)ISzAlloc_Alloc(p->alloc_Small,
2867 kBlockSizeMax + k_Lit_AfterAvail);
2868 if (!p->decoder.literalsBase)
2869 return SZ_ERROR_MEM;
2870 }
2871 if (!p->inTemp)
2872 {
 2873 // we need k_Lit_AfterAvail here for overread from the raw literals stream
2874 p->inTemp = (Byte *)ISzAlloc_Alloc(p->alloc_Small,
2875 kBlockSizeMax + kTempBuffer_PreSize + k_Lit_AfterAvail);
2876 if (!p->inTemp)
2877 return SZ_ERROR_MEM;
2878 }
2879 return SZ_OK;
2880}
2881
2882
2883static void ZstdDec_Init_ForNewFrame(CZstdDec *p)
2884{
2885 p->frameState = ZSTD2_STATE_SIGNATURE;
2886 p->tempSize = 0;
2887
2888 p->isErrorState = False;
2889 p->hashError = False;
2890 p->isCyclicMode = False;
2891 p->contentProcessed = 0;
2892 Xxh64State_Init(&p->xxh64);
2893 ZstdDec1_Init(&p->decoder);
2894}
2895
2896
2897void ZstdDec_Init(CZstdDec *p)
2898{
2899 ZstdDec_Init_ForNewFrame(p);
2900 p->decoder.winPos = 0;
2901 memset(p->temp, 0, sizeof(p->temp));
2902}
2903
2904
2905#define DESCRIPTOR_Get_DictionaryId_Flag(d) ((d) & 3)
2906#define DESCRIPTOR_FLAG_CHECKSUM (1 << 2)
2907#define DESCRIPTOR_FLAG_RESERVED (1 << 3)
2908// #define DESCRIPTOR_FLAG_UNUSED (1 << 4)
2909#define DESCRIPTOR_FLAG_SINGLE (1 << 5)
2910#define DESCRIPTOR_Get_ContentSize_Flag3(d) ((d) >> 5)
2911#define DESCRIPTOR_Is_ContentSize_Defined(d) (((d) & 0xe0) != 0)
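// Frame_Header_Descriptor bit layout (per RFC 8878), matching the macros above:
//   bits 0-1 : Dictionary_ID_Flag (dictID field size: 0/1/2/4 bytes)
//   bit 2    : Content_Checksum_Flag
//   bit 3    : Reserved (must be 0)
//   bit 4    : Unused
//   bit 5    : Single_Segment_Flag (if set, there is no Window_Descriptor byte)
//   bits 6-7 : Frame_Content_Size_Flag (FCS field size: 0-or-1/2/4/8 bytes)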
2912
2913
2914static EZstd2State ZstdDec_UpdateState(CZstdDec * const p, const Byte b, CZstdDecInfo * const info)
2915{
2916 unsigned tempSize = p->tempSize;
2917 p->temp[tempSize++] = b;
2918 p->tempSize = tempSize;
2919
2920 if (p->frameState == ZSTD2_STATE_BLOCK)
2921 {
2922 if (tempSize < 3)
2923 return ZSTD2_STATE_BLOCK;
2924 {
2925 UInt32 b0 = GetUi32(p->temp);
2926 const unsigned type = ((unsigned)b0 >> 1) & 3;
2927 if (type == kBlockType_RLE && tempSize == 3)
2928 return ZSTD2_STATE_BLOCK;
2929 // info->num_Blocks_forType[type]++;
2930 info->num_Blocks++;
2931 if (type == kBlockType_Reserved)
2932 {
2933 p->isErrorState = True; // SZ_ERROR_UNSUPPORTED
2934 return ZSTD2_STATE_BLOCK;
2935 }
2936 p->blockType = (Byte)type;
2937 p->isLastBlock = (Byte)(b0 & 1);
2938 p->inTempPos = 0;
2939 p->tempSize = 0;
2940 b0 >>= 3;
2941 b0 &= 0x1fffff;
2942 // info->num_BlockBytes_forType[type] += b0;
2943 if (b0 == 0)
2944 {
2945 // empty RAW/RLE blocks are allowed in original-zstd decoder
2946 if (type == kBlockType_Compressed)
2947 {
2948 p->isErrorState = True;
2949 return ZSTD2_STATE_BLOCK;
2950 }
2951 if (!ZSTD_DEC_IS_LAST_BLOCK(p))
2952 return ZSTD2_STATE_BLOCK;
2953 if (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM)
2954 return ZSTD2_STATE_HASH;
2955 return ZSTD2_STATE_FINISHED;
2956 }
2957 p->blockSize = b0;
2958 {
2959 UInt32 blockLim = ZstdDec1_GET_BLOCK_SIZE_LIMIT(&p->decoder);
2960 // compressed and uncompressed block sizes cannot be larger than min(kBlockSizeMax, window_size)
2961 if (b0 > blockLim)
2962 {
2963 p->isErrorState = True; // SZ_ERROR_UNSUPPORTED;
2964 return ZSTD2_STATE_BLOCK;
2965 }
2966 if (DESCRIPTOR_Is_ContentSize_Defined(p->descriptor))
2967 {
2968 const UInt64 rem = p->contentSize - p->contentProcessed;
2969 if (blockLim > rem)
2970 blockLim = (UInt32)rem;
2971 }
2972 p->curBlockUnpackRem = blockLim;
 2973 // uncompressed block size cannot be larger than the remaining data size:
2974 if (type != kBlockType_Compressed)
2975 {
2976 if (b0 > blockLim)
2977 {
2978 p->isErrorState = True; // SZ_ERROR_UNSUPPORTED;
2979 return ZSTD2_STATE_BLOCK;
2980 }
2981 }
2982 }
2983 }
2984 return ZSTD2_STATE_DATA;
2985 }
2986
2987 if ((unsigned)p->frameState < ZSTD2_STATE_SKIP_DATA)
2988 {
2989 UInt32 v;
2990 if (tempSize != 4)
2991 return p->frameState;
2992 v = GetUi32(p->temp);
2993 if ((unsigned)p->frameState < ZSTD2_STATE_HASH) // == ZSTD2_STATE_SIGNATURE
2994 {
2995 if (v == 0xfd2fb528)
2996 {
2997 p->tempSize = 0;
2998 info->num_DataFrames++;
2999 return ZSTD2_STATE_FRAME_HEADER;
3000 }
3001 if ((v & 0xfffffff0) == 0x184d2a50)
3002 {
3003 p->tempSize = 0;
3004 info->num_SkipFrames++;
3005 return ZSTD2_STATE_SKIP_HEADER;
3006 }
3007 p->isErrorState = True;
3008 return ZSTD2_STATE_SIGNATURE;
3009 // return ZSTD2_STATE_ERROR; // is not ZSTD stream
3010 }
3011 if (p->frameState == ZSTD2_STATE_HASH)
3012 {
3013 info->checksum_Defined = True;
3014 info->checksum = v;
3015 // #ifndef DISABLE_XXH_CHECK
3016 if (!p->disableHash)
3017 {
3018 if (p->decoder.winPos < ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p))
3019 {
3020 // unexpected code failure
3021 p->isErrorState = True;
3022 // SZ_ERROR_FAIL;
3023 }
3024 else
3025 if ((UInt32)Xxh64State_Digest(&p->xxh64,
3026 p->decoder.win + (p->decoder.winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p)),
3027 p->contentProcessed) != v)
3028 {
3029 p->hashError = True;
3030 // return ZSTD2_STATE_ERROR; // hash error
3031 }
3032 }
3033 // #endif
3034 return ZSTD2_STATE_FINISHED;
3035 }
3036 // (p->frameState == ZSTD2_STATE_SKIP_HEADER)
3037 {
3038 p->blockSize = v;
3039 info->skipFrames_Size += v;
3040 p->tempSize = 0;
 3041 /* we want the caller to be able to know that a frame
 3042 was finished. So we allow the case where
 3043 we have ZSTD2_STATE_SKIP_DATA state with (blockSize == 0).
3044 */
3045 // if (v == 0) return ZSTD2_STATE_SIGNATURE;
3046 return ZSTD2_STATE_SKIP_DATA;
3047 }
3048 }
3049
3050 // if (p->frameState == ZSTD2_STATE_FRAME_HEADER)
3051 {
3052 unsigned descriptor;
3053 const Byte *h;
3054 descriptor = p->temp[0];
3055 p->descriptor = (Byte)descriptor;
3056 if (descriptor & DESCRIPTOR_FLAG_RESERVED) // reserved bit
3057 {
3058 p->isErrorState = True;
3059 return ZSTD2_STATE_FRAME_HEADER;
3060 // return ZSTD2_STATE_ERROR;
3061 }
3062 {
3063 const unsigned n = DESCRIPTOR_Get_ContentSize_Flag3(descriptor);
3064 // tempSize -= 1 + ((1u << (n >> 1)) | ((n + 1) & 1));
3065 tempSize -= (0x9a563422u >> (n * 4)) & 0xf;
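      // The nibble table above encodes, for n = (FCS_flag * 2 + single_segment),
      // the total size of the descriptor + window-descriptor + FCS bytes:
      // {2,2,4,3,6,5,10,9}; the tempSize that remains must match the dictID size.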
3066 }
3067 if (tempSize != (4u >> (3 - DESCRIPTOR_Get_DictionaryId_Flag(descriptor))))
3068 return ZSTD2_STATE_FRAME_HEADER;
3069
3070 info->descriptor_OR = (Byte)(info->descriptor_OR | descriptor);
3071 info->descriptor_NOT_OR = (Byte)(info->descriptor_NOT_OR | ~descriptor);
3072
3073 h = &p->temp[1];
3074 {
3075 Byte w = 0;
3076 if ((descriptor & DESCRIPTOR_FLAG_SINGLE) == 0)
3077 {
3078 w = *h++;
3079 if (info->windowDescriptor_MAX < w)
3080 info->windowDescriptor_MAX = w;
3081 // info->are_WindowDescriptors = True;
3082 // info->num_WindowDescriptors++;
3083 }
3084 else
3085 {
3086 // info->are_SingleSegments = True;
3087 // info->num_SingleSegments++;
3088 }
3089 p->windowDescriptor = w;
3090 }
3091 {
3092 unsigned n = DESCRIPTOR_Get_DictionaryId_Flag(descriptor);
3093 UInt32 d = 0;
3094 if (n)
3095 {
3096 n = 1u << (n - 1);
3097 d = GetUi32(h) & ((UInt32)(Int32)-1 >> (32 - 8u * n));
3098 h += n;
3099 }
3100 p->dictionaryId = d;
3101 // info->dictionaryId_Cur = d;
3102 if (d != 0)
3103 {
3104 if (info->dictionaryId == 0)
3105 info->dictionaryId = d;
3106 else if (info->dictionaryId != d)
3107 info->are_DictionaryId_Different = True;
3108 }
3109 }
3110 {
3111 unsigned n = DESCRIPTOR_Get_ContentSize_Flag3(descriptor);
3112 UInt64 v = 0;
3113 if (n)
3114 {
3115 n >>= 1;
3116 if (n == 1)
3117 v = 256;
3118 v += GetUi64(h) & ((UInt64)(Int64)-1 >> (64 - (8u << n)));
3119 // info->are_ContentSize_Known = True;
3120 // info->num_Frames_with_ContentSize++;
3121 if (info->contentSize_MAX < v)
3122 info->contentSize_MAX = v;
3123 info->contentSize_Total += v;
3124 }
3125 else
3126 {
3127 info->are_ContentSize_Unknown = True;
3128 // info->num_Frames_without_ContentSize++;
3129 }
3130 p->contentSize = v;
3131 }
3132 // if ((size_t)(h - p->temp) != headerSize) return ZSTD2_STATE_ERROR; // it's unexpected internal code failure
3133 p->tempSize = 0;
3134
3135 info->checksum_Defined = False;
3136 /*
3137 if (descriptor & DESCRIPTOR_FLAG_CHECKSUM)
3138 info->are_Checksums = True;
3139 else
3140 info->are_Non_Checksums = True;
3141 */
3142
3143 return ZSTD2_STATE_AFTER_HEADER; // ZSTD2_STATE_BLOCK;
3144 }
3145}
3146
3147
3148static void ZstdDec_Update_XXH(CZstdDec * const p, size_t xxh64_winPos)
3149{
3150 /*
3151 #ifdef DISABLE_XXH_CHECK
3152 UNUSED_VAR(data)
3153 #else
3154 */
3155 if (!p->disableHash && (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM))
3156 {
3157 // const size_t pos = p->xxh64_winPos;
3158 const size_t size = (p->decoder.winPos - xxh64_winPos) & ~(size_t)31;
3159 if (size)
3160 {
3161 // p->xxh64_winPos = pos + size;
3162 Xxh64State_UpdateBlocks(&p->xxh64,
3163 p->decoder.win + xxh64_winPos,
3164 p->decoder.win + xxh64_winPos + size);
3165 }
3166 }
3167}
3168
3169
3170/*
3171in:
3172 (winLimit) : is a relaxed limit, where this function is allowed to stop writing of decoded data (if possible).
3173 - this function uses (winLimit) for RAW/RLE blocks only,
3174 because this function can decode a single RAW/RLE block in several different calls.
3175 - this function DOESN'T use (winLimit) for Compressed blocks,
3176 because this function decodes a full compressed block in a single call.
3177 (CZstdDec1::winPos <= winLimit)
3178 (winLimit <= CZstdDec1::cycSize).
3179 Note: if (ds->outBuf_fromCaller) mode is used, then
3180 {
3181 (strong_limit) is stored in CZstdDec1::cycSize.
3182 So (winLimit) is stronger than (strong_limit).
3183 }
3184
3185exit:
3186 Note: (CZstdDecState::winPos) will be set by the caller after exit from this function.
3187
3188 This function can exit for any of these conditions:
3189 - (frameState == ZSTD2_STATE_AFTER_HEADER)
3190 - (frameState == ZSTD2_STATE_FINISHED) : frame was finished : (status == ZSTD_STATUS_FINISHED_FRAME) is set
3191 - finished non-empty non-last block. So (CZstdDec1::winPos_atExit != winPos_atFuncStart).
3192 - ZSTD_STATUS_NEEDS_MORE_INPUT in src
3193 - (CZstdDec1::winPos) has reached (winLimit) in a non-finished RAW/RLE block
3194
3195 This function decodes no more than one non-empty block.
3196 So it fulfills the condition at exit:
3197 (CZstdDec1::winPos_atExit - winPos_atFuncStart <= block_size_max)
3198 Note: (winPos_atExit > winLimit) is possible in some cases after compressed block decoding.
3199
3200 if (ds->outBuf_fromCaller) mode (useAdditionalWinLimit mode) is used
3201 {
3202 then this function uses an additional strong limit from (CZstdDec1::cycSize).
3203 So this function will not write any data after (CZstdDec1::cycSize),
3204 and it fulfills the condition at exit:
3205 (CZstdDec1::winPos_atExit <= CZstdDec1::cycSize)
3206 }
3207*/
3208static SRes ZstdDec_DecodeBlock(CZstdDec * const p, CZstdDecState * const ds,
3209 SizeT winLimitAdd)
3210{
3211 const Byte *src = ds->inBuf;
3212 SizeT * const srcLen = &ds->inPos;
3213 const SizeT inSize = ds->inLim;
3214 // const int useAdditionalWinLimit = ds->outBuf_fromCaller ? 1 : 0;
3215 enum_ZstdStatus * const status = &ds->status;
3216 CZstdDecInfo * const info = &ds->info;
3217 SizeT winLimit;
3218
3219 const SizeT winPos_atFuncStart = p->decoder.winPos;
3220 src += *srcLen;
3221 *status = ZSTD_STATUS_NOT_SPECIFIED;
3222
3223 // finishMode = ZSTD_FINISH_ANY;
3224 if (ds->outSize_Defined)
3225 {
3226 if (ds->outSize < ds->outProcessed)
3227 {
3228 // p->isAfterSizeMode = 2; // we have extra bytes already
3229 *status = ZSTD_STATUS_OUT_REACHED;
3230 return SZ_OK;
3231 // size = 0;
3232 }
3233 else
3234 {
3235 // p->outSize >= p->outProcessed
3236 const UInt64 rem = ds->outSize - ds->outProcessed;
3237 /*
3238 if (rem == 0)
3239 p->isAfterSizeMode = 1; // we have reached exact required size
3240 */
3241 if (winLimitAdd >= rem)
3242 {
3243 winLimitAdd = (SizeT)rem;
3244 // if (p->finishMode) finishMode = ZSTD_FINISH_END;
3245 }
3246 }
3247 }
3248
3249 winLimit = p->decoder.winPos + winLimitAdd;
3250 // (p->decoder.winPos <= winLimit)
3251
3252 // while (p->frameState != ZSTD2_STATE_ERROR)
3253 while (!p->isErrorState)
3254 {
3255 SizeT inCur = inSize - *srcLen;
3256
3257 if (p->frameState == ZSTD2_STATE_DATA)
3258 {
 3259 /* (p->decoder.winPos == winPos_atFuncStart) is expected,
 3260 because this function doesn't start a new block,
 3261 if it has finished some non-empty block in this call. */
3262 if (p->decoder.winPos != winPos_atFuncStart)
3263 return SZ_ERROR_FAIL; // it's unexpected
3264
3265 /*
3266 if (p->decoder.winPos > winLimit)
3267 {
3268 // we can be here, if in this function call
3269 // - we have extracted non-empty compressed block, and (winPos > winLimit) after that.
3270 // - we have started new block decoding after that.
3271 // It's unexpected case, because we exit after non-empty non-last block.
3272 *status = (inSize == *srcLen) ?
3273 ZSTD_STATUS_NEEDS_MORE_INPUT :
3274 ZSTD_STATUS_NOT_FINISHED;
3275 return SZ_OK;
3276 }
3277 */
3278 // p->decoder.winPos <= winLimit
3279
3280 if (p->blockType != kBlockType_Compressed)
3281 {
3282 // it's RLE or RAW block.
 3283 // p->blockSize != 0
3284 // winLimit <= p->decoder.cycSize
 3285 /* So here we use the stronger (winLimit), even for
 3286 (ds->outBuf_fromCaller) mode. */
3287 SizeT outCur = winLimit - p->decoder.winPos;
3288 {
3289 const UInt32 rem = p->blockSize;
3290 if (outCur > rem)
3291 outCur = rem;
3292 }
3293 if (p->blockType == kBlockType_Raw)
3294 {
3295 if (outCur > inCur)
3296 outCur = inCur;
 3297 /* the output buffer is better aligned for the XXH code,
 3298 so we hash the output buffer data */
3299 // ZstdDec_Update_XXH(p, src, outCur); // for debug:
3300 memcpy(p->decoder.win + p->decoder.winPos, src, outCur);
3301 src += outCur;
3302 *srcLen += outCur;
3303 }
3304 else // kBlockType_RLE
3305 {
3306 #define RLE_BYTE_INDEX_IN_temp 3
3307 memset(p->decoder.win + p->decoder.winPos,
3308 p->temp[RLE_BYTE_INDEX_IN_temp], outCur);
3309 }
3310 {
3311 const SizeT xxh64_winPos = p->decoder.winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p);
3312 p->decoder.winPos += outCur;
3313 p->contentProcessed += outCur;
3314 ZstdDec_Update_XXH(p, xxh64_winPos);
3315 }
3316 // ds->winPos = p->decoder.winPos; // the caller does it instead. for debug:
3317 UPDATE_TOTAL_OUT(&p->decoder, outCur)
3318 ds->outProcessed += outCur;
3319 if (p->blockSize -= (UInt32)outCur)
3320 {
3321 /*
3322 if (ds->outSize_Defined)
3323 {
3324 if (ds->outSize <= ds->outProcessed) ds->isAfterSizeMode = (enum_ZstdStatus)
3325 (ds->outSize == ds->outProcessed ? 1u: 2u);
3326 }
3327 */
3328 *status = (enum_ZstdStatus)
3329 (ds->outSize_Defined && ds->outSize <= ds->outProcessed ?
3330 ZSTD_STATUS_OUT_REACHED : (p->blockType == kBlockType_Raw && inSize == *srcLen) ?
3331 ZSTD_STATUS_NEEDS_MORE_INPUT :
3332 ZSTD_STATUS_NOT_FINISHED);
3333 return SZ_OK;
3334 }
3335 }
3336 else // kBlockType_Compressed
3337 {
3338 // p->blockSize != 0
3339 // (uncompressed_size_of_block == 0) is allowed
3340 // (p->curBlockUnpackRem == 0) is allowed
3341 /*
3342 if (p->decoder.winPos >= winLimit)
3343 {
3344 if (p->decoder.winPos != winPos_atFuncStart)
3345 {
3346 // it's unexpected case
3347 // We already have some data in finished blocks in this function call.
3348 // So we don't decompress new block after (>=winLimit),
3349 // even if it's empty block.
3350 *status = (inSize == *srcLen) ?
3351 ZSTD_STATUS_NEEDS_MORE_INPUT :
3352 ZSTD_STATUS_NOT_FINISHED;
3353 return SZ_OK;
3354 }
3355 // (p->decoder.winPos == winLimit == winPos_atFuncStart)
3356 // we will decode current block, because that current
3357 // block can be empty block and we want to make some visible
3358 // change of (src) stream after function start.
3359 }
3360 */
3361 /*
3362 if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
3363 {
3364 // we don't want to start new block, if we have more extra decoded bytes already
3365 *status = ZSTD_STATUS_OUT_REACHED;
3366 return SZ_OK;
3367 }
3368 */
3369 {
3370 const Byte *comprStream;
3371 size_t afterAvail;
3372 UInt32 inTempPos = p->inTempPos;
3373 const UInt32 rem = p->blockSize - inTempPos;
3374 // rem != 0
3375 if (inTempPos != 0 // (inTemp) buffer already contains some input data
3376 || inCur < rem // available input data size is smaller than compressed block size
3377 || ZstdDec1_NeedTempBufferForInput(*srcLen, src, rem))
3378 {
3379 if (inCur > rem)
3380 inCur = rem;
3381 if (inCur)
3382 {
3383 STAT_INC(g_Num_Blocks_memcpy)
3384 // we clear data for backward lookahead reading
3385 if (inTempPos == 0)
3386 memset(p->inTemp + kTempBuffer_PreSize - MAX_BACKWARD_DEPTH, 0, MAX_BACKWARD_DEPTH);
3387 // { unsigned y = 0; for(;y < 1000; y++)
3388 memcpy(p->inTemp + inTempPos + kTempBuffer_PreSize, src, inCur);
3389 // }
3390 src += inCur;
3391 *srcLen += inCur;
3392 inTempPos += (UInt32)inCur;
3393 p->inTempPos = inTempPos;
3394 }
3395 if (inTempPos != p->blockSize)
3396 {
3397 *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
3398 return SZ_OK;
3399 }
3400 #if COPY_CHUNK_SIZE > 1
3401 memset(p->inTemp + kTempBuffer_PreSize + inTempPos, 0, COPY_CHUNK_SIZE);
3402 #endif
3403 comprStream = p->inTemp + kTempBuffer_PreSize;
3404 afterAvail = k_Lit_AfterAvail;
3405 // we don't want to read non-initialized data or junk in CopyMatch():
3406 }
3407 else
3408 {
3409 // inCur >= rem
3410 // we use direct decoding from (src) buffer:
3411 afterAvail = inCur - rem;
3412 comprStream = src;
3413 src += rem;
3414 *srcLen += rem;
3415 }
3416
3417 #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
3418 ZstdDec1_NeedTempBufferForInput(*srcLen, comprStream, p->blockSize);
3419 #endif
3420 // printf("\nblockSize=%u", p->blockSize);
3421 // printf("%x\n", (unsigned)p->contentProcessed);
3422 STAT_INC(g_Num_Blocks_Compressed)
3423 {
3424 SRes sres;
3425 const size_t winPos = p->decoder.winPos;
3426 /*
 3427 if (useAdditionalWinLimit), we use a strong unpack limit: the smallest of
 3428 - limit from stream : (curBlockUnpackRem)
 3429 - limit from caller : (cycSize - winPos)
 3430 if (!useAdditionalWinLimit), we use only the relaxed limit:
3431 - limit from stream : (curBlockUnpackRem)
3432 */
3433 SizeT outLimit = p->curBlockUnpackRem;
3434 if (ds->outBuf_fromCaller)
3435 // if (useAdditionalWinLimit)
3436 {
3437 const size_t limit = p->decoder.cycSize - winPos;
3438 if (outLimit > limit)
3439 outLimit = limit;
3440 }
3441 sres = ZstdDec1_DecodeBlock(&p->decoder,
3442 comprStream, p->blockSize, afterAvail, outLimit);
3443 // ds->winPos = p->decoder.winPos; // the caller does it instead. for debug:
3444 if (sres)
3445 {
3446 p->isErrorState = True;
3447 return sres;
3448 }
3449 {
3450 const SizeT xxh64_winPos = winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p);
3451 const size_t num = p->decoder.winPos - winPos;
3452 ds->outProcessed += num;
3453 p->contentProcessed += num;
3454 ZstdDec_Update_XXH(p, xxh64_winPos);
3455 }
3456 }
3457 // printf("\nwinPos=%x", (int)(unsigned)p->decoder.winPos);
3458 }
3459 }
3460
3461 /*
3462 if (ds->outSize_Defined)
3463 {
3464 if (ds->outSize <= ds->outProcessed) ds->isAfterSizeMode = (enum_ZstdStatus)
3465 (ds->outSize == ds->outProcessed ? 1u: 2u);
3466 }
3467 */
3468
3469 if (!ZSTD_DEC_IS_LAST_BLOCK(p))
3470 {
3471 p->frameState = ZSTD2_STATE_BLOCK;
3472 if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
3473 {
3474 *status = ZSTD_STATUS_OUT_REACHED;
3475 return SZ_OK;
3476 }
3477 // we exit only if (winPos) was changed in this function call:
3478 if (p->decoder.winPos != winPos_atFuncStart)
3479 {
3480 // decoded block was not empty. So we exit:
3481 *status = (enum_ZstdStatus)(
3482 (inSize == *srcLen) ?
3483 ZSTD_STATUS_NEEDS_MORE_INPUT :
3484 ZSTD_STATUS_NOT_FINISHED);
3485 return SZ_OK;
3486 }
3487 // (p->decoder.winPos == winPos_atFuncStart)
3488 // so current decoded block was empty.
3489 // we will try to decode more blocks in this function.
3490 continue;
3491 }
3492
3493 // decoded block was last in frame
3494 if (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM)
3495 {
3496 p->frameState = ZSTD2_STATE_HASH;
3497 if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
3498 {
3499 *status = ZSTD_STATUS_OUT_REACHED;
3500 return SZ_OK; // disable if want to
3501 /* We want to get same return codes for any input buffer sizes.
3502 We want to get faster ZSTD_STATUS_OUT_REACHED status.
3503 So we exit with ZSTD_STATUS_OUT_REACHED here,
 3504 instead of ZSTD2_STATE_HASH and ZSTD2_STATE_FINISHED processing,
 3505 which depends on the input buffer size and can set
 3506 ZSTD_STATUS_NEEDS_MORE_INPUT or return SZ_ERROR_DATA or SZ_ERROR_CRC.
3507 */
3508 }
3509 }
3510 else
3511 {
 3512 /* ZSTD2_STATE_FINISHED processing doesn't depend on the input buffer */
3513 p->frameState = ZSTD2_STATE_FINISHED;
3514 }
3515 /*
3516 p->frameState = (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM) ?
3517 ZSTD2_STATE_HASH :
3518 ZSTD2_STATE_FINISHED;
3519 */
3520 /* it's required to process ZSTD2_STATE_FINISHED state in this function call,
3521 because we must check contentSize and hashError in ZSTD2_STATE_FINISHED code,
 3522 while the caller can reinit the full state for ZSTD2_STATE_FINISHED.
 3523 So we can't exit from the function here. */
3524 continue;
3525 }
3526
3527 if (p->frameState == ZSTD2_STATE_FINISHED)
3528 {
3529 *status = ZSTD_STATUS_FINISHED_FRAME;
3530 if (DESCRIPTOR_Is_ContentSize_Defined(p->descriptor)
3531 && p->contentSize != p->contentProcessed)
3532 return SZ_ERROR_DATA;
3533 if (p->hashError) // for debug
3534 return SZ_ERROR_CRC;
3535 return SZ_OK;
3536 // p->frameState = ZSTD2_STATE_SIGNATURE;
3537 // continue;
3538 }
3539
3540 if (p->frameState == ZSTD2_STATE_AFTER_HEADER)
3541 return SZ_OK; // we need memory allocation for that state
3542
3543 if (p->frameState == ZSTD2_STATE_SKIP_DATA)
3544 {
3545 UInt32 blockSize = p->blockSize;
3546 // (blockSize == 0) is possible
3547 if (inCur > blockSize)
3548 inCur = blockSize;
3549 src += inCur;
3550 *srcLen += inCur;
3551 blockSize -= (UInt32)inCur;
3552 p->blockSize = blockSize;
3553 if (blockSize == 0)
3554 {
3555 p->frameState = ZSTD2_STATE_SIGNATURE;
3556 // continue; // for debug: we can continue without return to caller.
3557 // we notify the caller that skip frame was finished:
3558 *status = ZSTD_STATUS_FINISHED_FRAME;
3559 return SZ_OK;
3560 }
3561 // blockSize != 0
3562 // (inCur) was smaller than previous value of p->blockSize.
3563 // (inSize == *srcLen) now
3564 *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
3565 return SZ_OK;
3566 }
3567
3568 if (inCur == 0)
3569 {
3570 *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
3571 return SZ_OK;
3572 }
3573
3574 {
3575 (*srcLen)++;
3576 p->frameState = ZstdDec_UpdateState(p, *src++, info);
3577 }
3578 }
3579
3580 *status = ZSTD_STATUS_NOT_SPECIFIED;
3581 p->isErrorState = True;
3582 // p->frameState = ZSTD2_STATE_ERROR;
3583 // if (p->frameState = ZSTD2_STATE_SIGNATURE) return SZ_ERROR_NO_ARCHIVE
3584 return SZ_ERROR_DATA;
3585}
3586
3587
3588
3589
3590SRes ZstdDec_Decode(CZstdDecHandle dec, CZstdDecState *p)
3591{
3592 p->needWrite_Size = 0;
3593 p->status = ZSTD_STATUS_NOT_SPECIFIED;
3594 dec->disableHash = p->disableHash;
3595
3596 if (p->outBuf_fromCaller)
3597 {
3598 dec->decoder.win = p->outBuf_fromCaller;
3599 dec->decoder.cycSize = p->outBufSize_fromCaller;
3600 }
3601
3602 // p->winPos = dec->decoder.winPos;
3603
3604 for (;;)
3605 {
3606 SizeT winPos, size;
3607 // SizeT outProcessed;
3608 SRes res;
3609
3610 if (p->wrPos > dec->decoder.winPos)
3611 return SZ_ERROR_FAIL;
3612
3613 if (dec->frameState == ZSTD2_STATE_FINISHED)
3614 {
3615 if (!p->outBuf_fromCaller)
3616 {
3617 // we need to set positions to zero for new frame.
3618 if (p->wrPos != dec->decoder.winPos)
3619 {
3620 /* We have already asked the caller to flush all data
3621 with (p->needWrite_Size) and (ZSTD_STATUS_FINISHED_FRAME) status.
 3622 So it's an unexpected case */
3623 // p->winPos = dec->decoder.winPos;
3624 // p->needWrite_Size = dec->decoder.winPos - p->wrPos; // flush size asking
3625 // return SZ_OK; // ask to flush again
3626 return SZ_ERROR_FAIL;
3627 }
3628 // (p->wrPos == dec->decoder.winPos), and we wrap to zero:
3629 dec->decoder.winPos = 0;
3630 p->winPos = 0;
3631 p->wrPos = 0;
3632 }
3633 ZstdDec_Init_ForNewFrame(dec);
3634 // continue;
3635 }
3636
3637 winPos = dec->decoder.winPos;
3638 {
3639 SizeT next = dec->decoder.cycSize;
 3640 /* cycSize == 0, if no buffer has been allocated yet,
 3641 or if (outBuf_fromCaller) mode and (outBufSize_fromCaller == 0) */
3642 if (!p->outBuf_fromCaller
3643 && next
3644 && next <= winPos
3645 && dec->isCyclicMode)
3646 {
3647 // (0 < decoder.cycSize <= winPos) in isCyclicMode.
3648 // so we need to wrap (winPos) and (wrPos) over (cycSize).
3649 const size_t delta = next;
3650 // (delta) is how many bytes we remove from buffer.
3651 /*
3652 // we don't need data older than last (cycSize) bytes.
3653 size_t delta = winPos - next; // num bytes after (cycSize)
3654 if (delta <= next) // it's expected case
3655 delta = next;
3656 // delta == Max(cycSize, winPos - cycSize)
3657 */
3658 if (p->wrPos < delta)
3659 {
3660 // (wrPos < decoder.cycSize)
 3661 // We have already asked the caller to flush the required data
3662 // p->status = ZSTD_STATUS_NOT_SPECIFIED;
3663 // p->winPos = winPos;
3664 // p->needWrite_Size = delta - p->wrPos; // flush size asking
3665 // return SZ_OK; // ask to flush again
3666 return SZ_ERROR_FAIL;
3667 }
3668 // p->wrPos >= decoder.cycSize
3669 // we move extra data after (decoder.cycSize) to start of cyclic buffer:
3670 winPos -= delta;
3671 if (winPos)
3672 {
3673 if (winPos >= delta)
3674 return SZ_ERROR_FAIL;
3675 memmove(dec->decoder.win, dec->decoder.win + delta, winPos);
3676 // printf("\nmemmove processed=%8x winPos=%8x\n", (unsigned)p->outProcessed, (unsigned)dec->decoder.winPos);
3677 STAT_INC(g_Num_Wrap_memmove_Num)
3678 STAT_UPDATE(g_Num_Wrap_memmove_Bytes += (unsigned)winPos;)
3679 }
3680 dec->decoder.winPos = winPos;
3681 p->winPos = winPos;
3682 p->wrPos -= delta;
3683 // dec->xxh64_winPos -= delta;
3684
3685 // (winPos < delta)
3686 #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF
 3687 /* we zero the data after (cycSize), because
3688 we don't want to read non-initialized data or junk in CopyMatch(). */
3689 memset(dec->decoder.win + next, 0, COPY_CHUNK_SIZE);
3690 #endif
3691
3692 /*
3693 if (winPos == next)
3694 {
3695 if (winPos != p->wrPos)
3696 {
3697 // we already requested before to flush full data for that case.
3698 // but we give the caller a second chance to flush data:
3699 p->needWrite_Size = winPos - p->wrPos;
3700 return SZ_OK;
3701 }
3702 // (decoder.cycSize == winPos == p->wrPos)
3703 // so we do second wrapping to zero:
3704 winPos = 0;
3705 dec->decoder.winPos = 0;
3706 p->winPos = 0;
3707 p->wrPos = 0;
3708 }
3709 */
3710 // (winPos < next)
3711 }
3712
3713 if (winPos > next)
3714 return SZ_ERROR_FAIL; // it's unexpected case
3715 /*
3716 if (!outBuf_fromCaller && isCyclicMode && cycSize != 0)
3717 then (winPos < cycSize)
3718 else (winPos <= cycSize)
3719 */
3720 if (!p->outBuf_fromCaller)
3721 {
3722 // that code is optional. We try to optimize write chunk sizes.
 3723 /* (next2) is the expected next write position in the caller,
 3724 if the caller writes in kBlockSizeMax chunks.
3725 */
3726 /*
3727 const size_t next2 = (winPos + kBlockSizeMax) & (kBlockSizeMax - 1);
3728 if (winPos < next2 && next2 < next)
3729 next = next2;
3730 */
3731 }
3732 size = next - winPos;
3733 }
3734
3735 // note: ZstdDec_DecodeBlock() uses (winLimit = winPos + size) only for RLE and RAW blocks
3736 res = ZstdDec_DecodeBlock(dec, p, size);
3737 /*
3738 after one block decoding:
3739 if (!outBuf_fromCaller && isCyclicMode && cycSize != 0)
3740 then (winPos < cycSize + max_block_size)
3741 else (winPos <= cycSize)
3742 */
3743
3744 if (!p->outBuf_fromCaller)
3745 p->win = dec->decoder.win;
3746 p->winPos = dec->decoder.winPos;
3747
3748 // outProcessed = dec->decoder.winPos - winPos;
3749 // p->outProcessed += outProcessed;
3750
3751 if (res != SZ_OK)
3752 return res;
3753
3754 if (dec->frameState != ZSTD2_STATE_AFTER_HEADER)
3755 {
3756 if (p->outBuf_fromCaller)
3757 return SZ_OK;
3758 {
3759 // !p->outBuf_fromCaller
3760 /*
3761 if (ZSTD_STATUS_FINISHED_FRAME), we request full flushing here because
 3762 1) it's simpler to work with allocation and extraction of the next frame,
 3763 2) it's better to start writing the next new frame with aligned memory
3764 for faster xxh 64-bit reads.
3765 */
3766 size_t end = dec->decoder.winPos; // end pos for all data flushing
3767 if (p->status != ZSTD_STATUS_FINISHED_FRAME)
3768 {
 3769 // we will request a flush here only for cases where a wrap in the cyclic buffer can be required in the next call.
3770 if (!dec->isCyclicMode)
3771 return SZ_OK;
3772 // isCyclicMode
3773 {
3774 const size_t delta = dec->decoder.cycSize;
3775 if (end < delta)
3776 return SZ_OK; // (winPos < cycSize). no need for flush
3777 // cycSize <= winPos
 3778 // So we ask the caller to flush (cycSize - wrPos) bytes,
 3779 // and then we will wrap the cyclic buffer in the next call
3780 end = delta;
3781 }
3782 }
3783 p->needWrite_Size = end - p->wrPos;
3784 }
3785 return SZ_OK;
3786 }
3787
3788 // ZSTD2_STATE_AFTER_HEADER
3789 {
3790 BoolInt useCyclic = False;
3791 size_t cycSize;
3792
3793 // p->status = ZSTD_STATUS_NOT_FINISHED;
3794 if (dec->dictionaryId != 0)
3795 {
3796 /* actually we can try to decode some data,
 3797 because it's possible that some data doesn't use the dictionary */
3798 // p->status = ZSTD_STATUS_NOT_SPECIFIED;
3799 return SZ_ERROR_UNSUPPORTED;
3800 }
3801
3802 {
3803 UInt64 winSize = dec->contentSize;
3804 UInt64 winSize_Allocate = winSize;
3805 const unsigned descriptor = dec->descriptor;
3806
3807 if ((descriptor & DESCRIPTOR_FLAG_SINGLE) == 0)
3808 {
3809 const Byte wd = dec->windowDescriptor;
3810 winSize = (UInt64)(8 + (wd & 7)) << ((wd >> 3) + 10 - 3);
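 // Window_Descriptor (per RFC 8878): Exponent = wd >> 3, Mantissa = wd & 7,
 // windowSize = 2^(10 + Exponent) + Mantissa * 2^(10 + Exponent - 3),
 // which is the (8 + Mantissa) << (Exponent + 7) computed above.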
3811 if (!DESCRIPTOR_Is_ContentSize_Defined(descriptor)
3812 || winSize_Allocate > winSize)
3813 {
3814 winSize_Allocate = winSize;
3815 useCyclic = True;
3816 }
3817 }
3818 /*
3819 else
3820 {
3821 if (p->info.singleSegment_ContentSize_MAX < winSize)
3822 p->info.singleSegment_ContentSize_MAX = winSize;
3823 // p->info.num_SingleSegments++;
3824 }
3825 */
3826 if (p->info.windowSize_MAX < winSize)
3827 p->info.windowSize_MAX = winSize;
3828 if (p->info.windowSize_Allocate_MAX < winSize_Allocate)
3829 p->info.windowSize_Allocate_MAX = winSize_Allocate;
3830 /*
3831 winSize_Allocate is MIN(content_size, window_size_from_descriptor).
 3832 Even if (content_size < window_size_from_descriptor),
 3833 original-zstd still uses (window_size_from_descriptor) to check that decoding is allowed.
 3834 We try to follow original-zstd, and here we check (winSize) instead of (winSize_Allocate).
3835 */
3836 if (
3837 // winSize_Allocate // it's relaxed check
3838 winSize // it's more strict check to be compatible with original-zstd
3839 > ((UInt64)1 << MAX_WINDOW_SIZE_LOG))
3840 return SZ_ERROR_UNSUPPORTED; // SZ_ERROR_MEM
3841 cycSize = (size_t)winSize_Allocate;
3842 if (cycSize != winSize_Allocate)
3843 return SZ_ERROR_MEM;
3844 // cycSize <= winSize
3845 /* later we will use (CZstdDec1::winSize) to check match offsets and check block sizes.
3846 if (there is window descriptor)
3847 {
3848 We will check block size with (window_size_from_descriptor) instead of (winSize_Allocate).
3849 Does original-zstd do it that way also?
3850 }
3851 Here we must reduce full real 64-bit (winSize) to size_t for (CZstdDec1::winSize).
3852 Also we don't want too big values for (CZstdDec1::winSize).
3853 our (CZstdDec1::winSize) will meet the condition:
3854 (CZstdDec1::winSize < kBlockSizeMax || CZstdDec1::winSize <= cycSize).
3855 */
3856 dec->decoder.winSize = (winSize < kBlockSizeMax) ? (size_t)winSize: cycSize;
3857 // note: (CZstdDec1::winSize > cycSize) is possible, if (!useCyclic)
3858 }
3859
3860 RINOK(ZstdDec_AllocateMisc(dec))
3861
3862 if (p->outBuf_fromCaller)
3863 dec->isCyclicMode = False;
3864 else
3865 {
3866 size_t d = cycSize;
3867
3868 if (dec->decoder.winPos != p->wrPos)
3869 return SZ_ERROR_FAIL;
3870
3871 dec->decoder.winPos = 0;
3872 p->wrPos = 0;
3873 p->winPos = dec->decoder.winPos;
3874
3875 /*
3876 const size_t needWrite = dec->decoder.winPos - p->wrPos;
3877 if (!needWrite)
3878 {
3879 dec->decoder.winPos = 0;
3880 p->wrPos = 0;
3881 p->winPos = dec->decoder.winPos;
3882 }
3883 */
 3884 /* if (!useCyclic) we allocate only cycSize = ContentSize.
 3885 But if we want to support the case where a new frame starts with (winPos != 0),
 3886 then we will wrap over zero, and we still need
 3887 to set (useCyclic) and allocate additional buffer space.
 3888 Now we don't allow a new frame to start with (winPos != 0),
 3889 so (dec->decoder.winPos == 0),
 3890 and we can use (!useCyclic) with reduced buffer sizes.
3891 */
3892 /*
 3893 if (dec->decoder.winPos != 0)
3894 useCyclic = True;
3895 */
3896
3897 if (useCyclic)
3898 {
 3899 /* the cyclic buffer size must be at least (COPY_CHUNK_SIZE - 1) bytes
 3900 larger than the window size, because CopyMatch() can write additional
 3901 (COPY_CHUNK_SIZE - 1) bytes and overwrite the oldest data in the cyclic buffer.
 3902 But for performance reasons we align (cycSize) to (kBlockSizeMax).
 3903 Also we must provide (cycSize >= max_decoded_data_after_cycSize),
 3904 because after the data move that wraps over zero we must provide (winPos < cycSize).
3905 */
3906 const size_t alignSize = kBlockSizeMax;
3907 /* here we add (1 << 7) instead of (COPY_CHUNK_SIZE - 1), because
 3908 we want to get the same (cycSize) for different COPY_CHUNK_SIZE values. */
3909 // cycSize += (COPY_CHUNK_SIZE - 1) + (alignSize - 1); // for debug : we can get smallest (cycSize)
3910 cycSize += (1 << 7) + alignSize;
3911 cycSize &= ~(size_t)(alignSize - 1);
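 // worked example (assuming kBlockSizeMax == (1 << 17)): for a 1 MiB window,
 // 0x100000 + 0x80 + 0x20000 = 0x120080 rounds down to cycSize = 0x120000,
 // and below d = cycSize + kBlockSizeMax = 0x140000.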
 3912 // cycSize must be aligned to 32, because xxh requires 32-byte blocks.
3913 // cycSize += 12345; // for debug
3914 // cycSize += 1 << 10; // for debug
3915 // cycSize += 32; // for debug
3916 // cycSize += kBlockSizeMax; // for debug
3917 if (cycSize < d)
3918 return SZ_ERROR_MEM;
3919 /*
 3920 in cyclic buffer mode we allow decoding of one additional block
3921 that exceeds (cycSize).
3922 So we must allocate additional (kBlockSizeMax) bytes after (cycSize).
3923 if defined(Z7_STD_DEC_USE_AFTER_CYC_BUF)
3924 {
3925 we can read (COPY_CHUNK_SIZE - 1) bytes after (cycSize)
 3926 but we already allocate additional kBlockSizeMax that
3927 is larger than COPY_CHUNK_SIZE.
3928 So we don't need additional space of COPY_CHUNK_SIZE after (cycSize).
3929 }
3930 */
3931 /*
3932 #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF
3933 d = cycSize + (1 << 7); // we must add at least (COPY_CHUNK_SIZE - 1)
3934 #endif
3935 */
3936 d = cycSize + kBlockSizeMax;
3937 if (d < cycSize)
3938 return SZ_ERROR_MEM;
3939 }
3940
3941 {
3942 const size_t kMinWinAllocSize = 1 << 12;
3943 if (d < kMinWinAllocSize)
3944 d = kMinWinAllocSize;
3945 }
3946
3947 if (d > dec->winBufSize_Allocated)
3948 {
3949 /*
3950 if (needWrite)
3951 {
3952 p->needWrite_Size = needWrite;
3953 return SZ_OK;
3954 // return SZ_ERROR_FAIL;
3955 }
3956 */
3957
3958 if (dec->winBufSize_Allocated != 0)
3959 {
3960 const size_t k_extra = (useCyclic || d >= (1u << 20)) ?
3961 2 * kBlockSizeMax : 0;
3962 unsigned i = useCyclic ? 17 : 12;
3963 for (; i < sizeof(size_t) * 8; i++)
3964 {
3965 const size_t d2 = ((size_t)1 << i) + k_extra;
3966 if (d2 >= d)
3967 {
3968 d = d2;
3969 break;
3970 }
3971 }
3972 }
3973 // RINOK(ZstdDec_AllocateWindow(dec, d))
3974 ZstdDec_FreeWindow(dec);
3975 dec->win_Base = (Byte *)ISzAlloc_Alloc(dec->alloc_Big, d);
3976 if (!dec->win_Base)
3977 return SZ_ERROR_MEM;
3978 dec->decoder.win = dec->win_Base;
3979 dec->winBufSize_Allocated = d;
3980 }
3981 /*
3982 else
3983 {
 3984 // for non-cyclic mode we want to flush data and set winPos = 0
3985 if (needWrite)
3986 {
3987 if (!useCyclic || dec->decoder.winPos >= cycSize)
3988 {
3989 p->needWrite_Size = needWrite;
3990 return SZ_OK;
3991 // return SZ_ERROR_FAIL;
3992 }
3993 }
3994 }
3995 */
3996
3997 dec->decoder.cycSize = cycSize;
3998 p->win = dec->decoder.win;
3999 // p->cycSize = dec->decoder.cycSize;
4000 dec->isCyclicMode = (Byte)useCyclic;
4001 } // (!p->outBuf_fromCaller) end
4002
4003 // p->winPos = dec->decoder.winPos;
4004 dec->frameState = ZSTD2_STATE_BLOCK;
4005 // continue;
4006 } // ZSTD2_STATE_AFTER_HEADER end
4007 }
4008}
4009
4010
4011void ZstdDec_GetResInfo(const CZstdDec *dec,
4012 const CZstdDecState *p,
4013 SRes res,
4014 CZstdDecResInfo *stat)
4015{
4016 // ZstdDecInfo_CLEAR(stat);
4017 stat->extraSize = 0;
4018 stat->is_NonFinishedFrame = False;
4019 if (dec->frameState != ZSTD2_STATE_FINISHED)
4020 {
4021 if (dec->frameState == ZSTD2_STATE_SIGNATURE)
4022 {
4023 stat->extraSize = (Byte)dec->tempSize;
4024 if (ZstdDecInfo_GET_NUM_FRAMES(&p->info) == 0)
4025 res = SZ_ERROR_NO_ARCHIVE;
4026 }
4027 else
4028 {
4029 stat->is_NonFinishedFrame = True;
4030 if (res == SZ_OK && p->status == ZSTD_STATUS_NEEDS_MORE_INPUT)
4031 res = SZ_ERROR_INPUT_EOF;
4032 }
4033 }
4034 stat->decode_SRes = res;
4035}
4036
4037
4038size_t ZstdDec_ReadUnusedFromInBuf(
4039 CZstdDecHandle dec,
4040 size_t afterDecoding_tempPos,
4041 void *data, size_t size)
4042{
4043 size_t processed = 0;
4044 if (dec->frameState == ZSTD2_STATE_SIGNATURE)
4045 {
4046 Byte *dest = (Byte *)data;
4047 const size_t tempSize = dec->tempSize;
4048 while (afterDecoding_tempPos < tempSize)
4049 {
4050 if (size == 0)
4051 break;
4052 size--;
4053 *dest++ = dec->temp[afterDecoding_tempPos++];
4054 processed++;
4055 }
4056 }
4057 return processed;
4058}
4059
4060
4061void ZstdDecState_Clear(CZstdDecState *p)
4062{
4063 memset(p, 0 , sizeof(*p));
4064}
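
/* A minimal usage sketch (added for illustration; it is not part of the
   original file). It drives the decoder-owned-window mode and follows the
   flush protocol of ZstdDec_Decode() above: after each call the caller
   writes (needWrite_Size) bytes from (win + wrPos) and advances (wrPos).
   The readFunc/writeFunc callbacks are assumptions, not 7-Zip APIs. */
static SRes Example_DecodeZstdStream(ISzAllocPtr allocSmall, ISzAllocPtr allocBig,
    size_t (*readFunc)(void *buf, size_t size),      // hypothetical input callback; returns 0 at end of input
    int (*writeFunc)(const void *buf, size_t size))  // hypothetical output callback; returns 0 on failure
{
  Byte inBuf[1 << 16];
  CZstdDecState state;
  SRes res = SZ_OK;
  CZstdDecHandle dec = ZstdDec_Create(allocSmall, allocBig);
  if (!dec)
    return SZ_ERROR_MEM;
  ZstdDecState_Clear(&state);
  ZstdDec_Init(dec);
  state.inBuf = inBuf;
  for (;;)
  {
    if (state.inPos == state.inLim)
    {
      state.inPos = 0;
      state.inLim = readFunc(inBuf, sizeof(inBuf));
      if (state.inLim == 0)
      {
        // end of input: report an error if we stopped inside a frame
        if (state.status != ZSTD_STATUS_FINISHED_FRAME)
          res = SZ_ERROR_INPUT_EOF;
        break;
      }
    }
    res = ZstdDec_Decode(dec, &state);
    if (state.needWrite_Size)
    {
      // flush the bytes that the decoder asks us to write:
      if (!writeFunc(state.win + state.wrPos, state.needWrite_Size))
      {
        res = SZ_ERROR_WRITE;
        break;
      }
      state.wrPos += state.needWrite_Size;
    }
    if (res != SZ_OK)
      break;
  }
  // final flush of data that is still buffered in the window:
  if (res == SZ_OK && state.winPos != state.wrPos
      && !writeFunc(state.win + state.wrPos, state.winPos - state.wrPos))
    res = SZ_ERROR_WRITE;
  ZstdDec_Destroy(dec);
  return res;
}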
diff --git a/C/ZstdDec.h b/C/ZstdDec.h
new file mode 100644
index 0000000..cd26131
--- /dev/null
+++ b/C/ZstdDec.h
@@ -0,0 +1,173 @@
1/* ZstdDec.h -- Zstd Decoder interfaces
22024-01-21 : Igor Pavlov : Public domain */
3
4#ifndef ZIP7_INC_ZSTD_DEC_H
5#define ZIP7_INC_ZSTD_DEC_H
6
7EXTERN_C_BEGIN
8
9typedef struct CZstdDec CZstdDec;
10typedef CZstdDec * CZstdDecHandle;
11
12CZstdDecHandle ZstdDec_Create(ISzAllocPtr alloc_Small, ISzAllocPtr alloc_Big);
13void ZstdDec_Destroy(CZstdDecHandle p);
14
15typedef enum
16{
17 ZSTD_STATUS_NOT_SPECIFIED, /* use main error code instead */
18 ZSTD_STATUS_FINISHED_FRAME, /* data frame or skip frame was finished */
19 ZSTD_STATUS_NOT_FINISHED, /* just finished non-empty block or unfinished RAW/RLE block */
20 ZSTD_STATUS_NEEDS_MORE_INPUT, /* the callee needs more input bytes. It has higher priority than ZSTD_STATUS_NOT_FINISHED */
21 ZSTD_STATUS_OUT_REACHED /* frame is not finished and ((outProcessed > outSize) || (outProcessed == outSize && unfinished RAW/RLE block)) */
22} enum_ZstdStatus_Dummy;
23
24#define ZstdDecState_DOES_NEED_MORE_INPUT_OR_FINISHED_FRAME(p) \
25 ((p)->status & ZSTD_STATUS_FINISHED_FRAME)
26/*
27 ((p)->status == ZSTD_STATUS_NEEDS_MORE_INPUT || \
28 (p)->status == ZSTD_STATUS_FINISHED_FRAME)
29*/
30
31typedef Byte enum_ZstdStatus;
32
33
34void ZstdDec_Init(CZstdDecHandle p);
35
36typedef struct
37{
38 UInt64 num_Blocks;
39 Byte descriptor_OR;
40 Byte descriptor_NOT_OR;
41 Byte are_ContentSize_Unknown;
42 Byte windowDescriptor_MAX;
43
44 // Byte are_ContentSize_Known;
45 // Byte are_SingleSegments;
46 // Byte are_WindowDescriptors;
47 Byte checksum_Defined;
48 // Byte are_Checksums;
49 // Byte are_Non_Checksums;
50
51 // Byte are_DictionaryId;
52 Byte are_DictionaryId_Different;
53
54 // Byte reserved[3];
55
56 UInt32 checksum; // checksum of last data frame
57 /// UInt32 dictionaryId_Cur;
58 UInt32 dictionaryId; // if there are non-zero dictionary IDs, then it's first dictionaryId
59
60 UInt64 num_DataFrames;
61 UInt64 num_SkipFrames;
62 UInt64 skipFrames_Size;
63 UInt64 contentSize_Total;
64 UInt64 contentSize_MAX;
65 // UInt64 num_Checksums;
66 // UInt64 num_Non_Checksums; // frames without checksum
67 // UInt64 num_WindowDescriptors;
68 // UInt64 num_SingleSegments;
69 // UInt64 num_Frames_with_ContentSize;
70 // UInt64 num_Frames_without_ContentSize;
71 UInt64 windowSize_MAX;
72 UInt64 windowSize_Allocate_MAX;
73 // UInt64 num_DictionaryIds;
74 // UInt64 num_Blocks_forType[4];
75 // UInt64 num_BlockBytes_forType[4];
76 // UInt64 num_SingleSegments;
77 // UInt64 singleSegment_ContentSize_MAX;
78} CZstdDecInfo;
79
80#define ZstdDecInfo_CLEAR(p) { memset(p, 0, sizeof(*(p))); }
81
82#define ZstdDecInfo_GET_NUM_FRAMES(p) ((p)->num_DataFrames + (p)->num_SkipFrames)
83
84
85typedef struct CZstdDecState
86{
87 enum_ZstdStatus status; // out
88 Byte disableHash;
89 // Byte mustBeFinished;
90 Byte outSize_Defined;
91 // Byte isAfterSizeMode;
92 // UInt64 inProcessed;
93 // SRes codeRes;
94 // Byte needWrite_IsStrong;
95
96 const Byte *inBuf;
97 size_t inPos; // in/out
98 size_t inLim;
99
100 const Byte *win; // out
101 size_t winPos; // out
102 size_t wrPos; // in/out
103 // size_t cycSize; // out : if (!outBuf_fromCaller)
104 size_t needWrite_Size; // out
105
106 Byte *outBuf_fromCaller;
107 size_t outBufSize_fromCaller;
108 /* (outBufSize_fromCaller >= full_uncompressed_size_of_all_frames) is required
109 for successful decoding.
110 If (outBufSize_fromCaller < full_uncompressed_size_of_all_frames),
111 decoding can report an error, because we decode on a per-block basis.
112 */
113
114 // size_t outStep;
115 UInt64 outSize; // total in all frames
116 UInt64 outProcessed; // out decoded in all frames (it can be >= outSize)
117
118 CZstdDecInfo info;
119} CZstdDecState;
120
121void ZstdDecState_Clear(CZstdDecState *p);
122
123/*
124ZstdDec_Decode()
125return:
126 SZ_OK - no error
127 SZ_ERROR_DATA - Data Error
128 SZ_ERROR_MEM - Memory allocation error
129 SZ_ERROR_UNSUPPORTED - Unsupported method or method properties
130 SZ_ERROR_CRC - XXH hash Error
131 // SZ_ERROR_ARCHIVE - Headers error (not used now)
132*/
133SRes ZstdDec_Decode(CZstdDecHandle dec, CZstdDecState *p);
134
135/*
136ZstdDec_ReadUnusedFromInBuf():
137returns: the number of bytes that were read from InBuf
138(afterDecoding_tempPos) must be set to zero before the first call of ZstdDec_ReadUnusedFromInBuf()
139*/
140size_t ZstdDec_ReadUnusedFromInBuf(
141 CZstdDecHandle dec,
142 size_t afterDecoding_tempPos, // in
143 void *data, size_t size);
144
145typedef struct
146{
147 SRes decode_SRes; // error code of data decoding
148 Byte is_NonFinishedFrame; // there is unfinished decoding for data frame or skip frame
149 Byte extraSize;
150} CZstdDecResInfo;
151
152/*
153#define ZstdDecResInfo_CLEAR(p) \
154{ (p)->decode_SRes = 0; \
155 (p)->is_NonFinishedFrame = 0; \
156 (p)->extraSize = 0; \
157}
158// memset(p, 0, sizeof(*p));
159*/
160
161/*
162additional error codes for CZstdDecResInfo::decode_SRes:
163 SZ_ERROR_NO_ARCHIVE - is not zstd stream (no frames)
164 SZ_ERROR_INPUT_EOF - need more data in input stream
165*/
166void ZstdDec_GetResInfo(const CZstdDec *dec,
167 const CZstdDecState *p,
168 SRes res, // it's result from ZstdDec_Decode()
169 CZstdDecResInfo *info);
170
171EXTERN_C_END
172
173#endif
diff --git a/C/var_clang_arm64.mak b/C/var_clang_arm64.mak
index 4b35409..971101a 100644
--- a/C/var_clang_arm64.mak
+++ b/C/var_clang_arm64.mak
@@ -6,6 +6,7 @@ IS_ARM64=1
6CROSS_COMPILE= 6CROSS_COMPILE=
7MY_ARCH= 7MY_ARCH=
8USE_ASM=1 8USE_ASM=1
9ASM_FLAGS=-Wno-unused-macros
9CC=$(CROSS_COMPILE)clang 10CC=$(CROSS_COMPILE)clang
10CXX=$(CROSS_COMPILE)clang++ 11CXX=$(CROSS_COMPILE)clang++
11USE_CLANG=1 12USE_CLANG=1