author | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2024-05-14 00:00:00 +0000 |
---|---|---|
committer | Igor Pavlov <87184205+ip7z@users.noreply.github.com> | 2024-05-15 23:55:04 +0500 |
commit | fc662341e6f85da78ada0e443f6116b978f79f22 (patch) | |
tree | 1be1cc402a7a9cbc18d4eeea6b141354c2d559e3 /C | |
parent | 5b39dc76f1bc82f941d5c800ab9f34407a06b53a (diff) | |
tag: 24.05
Diffstat (limited to 'C')
75 files changed, 10047 insertions, 1298 deletions
diff --git a/C/7zArcIn.c b/C/7zArcIn.c
index 43fa7c2..23f2949 100644
--- a/C/7zArcIn.c
+++ b/C/7zArcIn.c
@@ -1,5 +1,5 @@ | |||
1 | /* 7zArcIn.c -- 7z Input functions | 1 | /* 7zArcIn.c -- 7z Input functions |
2 | 2023-05-11 : Igor Pavlov : Public domain */ | 2 | 2023-09-07 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -301,7 +301,7 @@ static SRes RememberBitVector(CSzData *sd, UInt32 numItems, const Byte **v) | |||
301 | 301 | ||
302 | static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems) | 302 | static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems) |
303 | { | 303 | { |
304 | Byte b = 0; | 304 | unsigned b = 0; |
305 | unsigned m = 0; | 305 | unsigned m = 0; |
306 | UInt32 sum = 0; | 306 | UInt32 sum = 0; |
307 | for (; numItems != 0; numItems--) | 307 | for (; numItems != 0; numItems--) |
@@ -312,7 +312,7 @@ static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems) | |||
312 | m = 8; | 312 | m = 8; |
313 | } | 313 | } |
314 | m--; | 314 | m--; |
315 | sum += ((b >> m) & 1); | 315 | sum += (UInt32)((b >> m) & 1); |
316 | } | 316 | } |
317 | return sum; | 317 | return sum; |
318 | } | 318 | } |
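The 7zArcIn.c change above is warning hygiene: widening `b` from `Byte` to `unsigned` keeps the shift arithmetic in a native-width type, and the explicit `(UInt32)` cast documents the widening of the 0/1 result. A self-contained sketch of the routine, under the assumption that the byte fetch elided between the two hunks is `b = *bits++`, as in the full source:

```c
#include <stdint.h>

/* Count the 1-bits ("defined" flags) in an MSB-first bit vector
   holding numItems entries. */
static uint32_t CountDefinedBitsDemo(const uint8_t *bits, uint32_t numItems)
{
  unsigned b = 0;   /* current byte, widened to unsigned */
  unsigned m = 0;   /* bits still unread in b */
  uint32_t sum = 0;
  for (; numItems != 0; numItems--)
  {
    if (m == 0)
    {
      b = *bits++;  /* assumed fetch; not shown in the hunk above */
      m = 8;
    }
    m--;
    sum += (uint32_t)((b >> m) & 1);
  }
  return sum;
}
```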
diff --git a/C/7zCrc.c b/C/7zCrc.c
@@ -1,93 +1,96 @@ | |||
1 | /* 7zCrc.c -- CRC32 calculation and init | 1 | /* 7zCrc.c -- CRC32 calculation and init |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
6 | #include "7zCrc.h" | 6 | #include "7zCrc.h" |
7 | #include "CpuArch.h" | 7 | #include "CpuArch.h" |
8 | 8 | ||
9 | #define kCrcPoly 0xEDB88320 | 9 | // for debug: |
10 | // #define __ARM_FEATURE_CRC32 1 | ||
10 | 11 | ||
11 | #ifdef MY_CPU_LE | 12 | #ifdef __ARM_FEATURE_CRC32 |
12 | #define CRC_NUM_TABLES 8 | 13 | // #pragma message("__ARM_FEATURE_CRC32") |
13 | #else | 14 | #define Z7_CRC_HW_FORCE |
14 | #define CRC_NUM_TABLES 9 | 15 | #endif |
15 | 16 | ||
16 | UInt32 Z7_FASTCALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table); | 17 | // #define Z7_CRC_DEBUG_BE |
17 | UInt32 Z7_FASTCALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table); | 18 | #ifdef Z7_CRC_DEBUG_BE |
19 | #undef MY_CPU_LE | ||
20 | #define MY_CPU_BE | ||
18 | #endif | 21 | #endif |
19 | 22 | ||
20 | #ifndef MY_CPU_BE | 23 | #ifdef Z7_CRC_HW_FORCE |
21 | UInt32 Z7_FASTCALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table); | 24 | #define Z7_CRC_NUM_TABLES_USE 1 |
22 | UInt32 Z7_FASTCALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table); | 25 | #else |
26 | #ifdef Z7_CRC_NUM_TABLES | ||
27 | #define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES | ||
28 | #else | ||
29 | #define Z7_CRC_NUM_TABLES_USE 12 | ||
30 | #endif | ||
23 | #endif | 31 | #endif |
24 | 32 | ||
25 | /* | 33 | #if Z7_CRC_NUM_TABLES_USE < 1 |
26 | extern | 34 | #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES |
27 | CRC_FUNC g_CrcUpdateT4; | 35 | #endif |
28 | CRC_FUNC g_CrcUpdateT4; | ||
29 | */ | ||
30 | extern | ||
31 | CRC_FUNC g_CrcUpdateT8; | ||
32 | CRC_FUNC g_CrcUpdateT8; | ||
33 | extern | ||
34 | CRC_FUNC g_CrcUpdateT0_32; | ||
35 | CRC_FUNC g_CrcUpdateT0_32; | ||
36 | extern | ||
37 | CRC_FUNC g_CrcUpdateT0_64; | ||
38 | CRC_FUNC g_CrcUpdateT0_64; | ||
39 | extern | ||
40 | CRC_FUNC g_CrcUpdate; | ||
41 | CRC_FUNC g_CrcUpdate; | ||
42 | |||
43 | UInt32 g_CrcTable[256 * CRC_NUM_TABLES]; | ||
44 | |||
45 | UInt32 Z7_FASTCALL CrcUpdate(UInt32 v, const void *data, size_t size) | ||
46 | { | ||
47 | return g_CrcUpdate(v, data, size, g_CrcTable); | ||
48 | } | ||
49 | 36 | ||
50 | UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size) | 37 | #if defined(MY_CPU_LE) || (Z7_CRC_NUM_TABLES_USE == 1) |
51 | { | 38 | #define Z7_CRC_NUM_TABLES_TOTAL Z7_CRC_NUM_TABLES_USE |
52 | return g_CrcUpdate(CRC_INIT_VAL, data, size, g_CrcTable) ^ CRC_INIT_VAL; | 39 | #else |
53 | } | 40 | #define Z7_CRC_NUM_TABLES_TOTAL (Z7_CRC_NUM_TABLES_USE + 1) |
41 | #endif | ||
54 | 42 | ||
55 | #if CRC_NUM_TABLES < 4 \ | 43 | #ifndef Z7_CRC_HW_FORCE |
56 | || (CRC_NUM_TABLES == 4 && defined(MY_CPU_BE)) \ | 44 | |
45 | #if Z7_CRC_NUM_TABLES_USE == 1 \ | ||
57 | || (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) | 46 | || (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) |
58 | #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) | 47 | #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) |
59 | UInt32 Z7_FASTCALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table); | 48 | #define Z7_CRC_UPDATE_T1_FUNC_NAME CrcUpdateGT1 |
60 | UInt32 Z7_FASTCALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table) | 49 | static UInt32 Z7_FASTCALL Z7_CRC_UPDATE_T1_FUNC_NAME(UInt32 v, const void *data, size_t size) |
61 | { | 50 | { |
51 | const UInt32 *table = g_CrcTable; | ||
62 | const Byte *p = (const Byte *)data; | 52 | const Byte *p = (const Byte *)data; |
63 | const Byte *pEnd = p + size; | 53 | const Byte *lim = p + size; |
64 | for (; p != pEnd; p++) | 54 | for (; p != lim; p++) |
65 | v = CRC_UPDATE_BYTE_2(v, *p); | 55 | v = CRC_UPDATE_BYTE_2(v, *p); |
66 | return v; | 56 | return v; |
67 | } | 57 | } |
68 | #endif | 58 | #endif |
69 | 59 | ||
60 | |||
61 | #if Z7_CRC_NUM_TABLES_USE != 1 | ||
62 | #ifndef MY_CPU_BE | ||
63 | #define FUNC_NAME_LE_2(s) CrcUpdateT ## s | ||
64 | #define FUNC_NAME_LE_1(s) FUNC_NAME_LE_2(s) | ||
65 | #define FUNC_NAME_LE FUNC_NAME_LE_1(Z7_CRC_NUM_TABLES_USE) | ||
66 | UInt32 Z7_FASTCALL FUNC_NAME_LE (UInt32 v, const void *data, size_t size, const UInt32 *table); | ||
67 | #endif | ||
68 | #ifndef MY_CPU_LE | ||
69 | #define FUNC_NAME_BE_2(s) CrcUpdateT1_BeT ## s | ||
70 | #define FUNC_NAME_BE_1(s) FUNC_NAME_BE_2(s) | ||
71 | #define FUNC_NAME_BE FUNC_NAME_BE_1(Z7_CRC_NUM_TABLES_USE) | ||
72 | UInt32 Z7_FASTCALL FUNC_NAME_BE (UInt32 v, const void *data, size_t size, const UInt32 *table); | ||
73 | #endif | ||
74 | #endif | ||
75 | |||
76 | #endif // Z7_CRC_HW_FORCE | ||
77 | |||
70 | /* ---------- hardware CRC ---------- */ | 78 | /* ---------- hardware CRC ---------- */ |
71 | 79 | ||
72 | #ifdef MY_CPU_LE | 80 | #ifdef MY_CPU_LE |
73 | 81 | ||
74 | #if defined(MY_CPU_ARM_OR_ARM64) | 82 | #if defined(MY_CPU_ARM_OR_ARM64) |
75 | |||
76 | // #pragma message("ARM*") | 83 | // #pragma message("ARM*") |
77 | 84 | ||
78 | #if defined(_MSC_VER) | 85 | #if (defined(__clang__) && (__clang_major__ >= 3)) \ |
79 | #if defined(MY_CPU_ARM64) | 86 | || defined(__GNUC__) && (__GNUC__ >= 6) && defined(MY_CPU_ARM64) \ |
80 | #if (_MSC_VER >= 1910) | 87 | || defined(__GNUC__) && (__GNUC__ >= 8) |
81 | #ifndef __clang__ | ||
82 | #define USE_ARM64_CRC | ||
83 | #include <intrin.h> | ||
84 | #endif | ||
85 | #endif | ||
86 | #endif | ||
87 | #elif (defined(__clang__) && (__clang_major__ >= 3)) \ | ||
88 | || (defined(__GNUC__) && (__GNUC__ > 4)) | ||
89 | #if !defined(__ARM_FEATURE_CRC32) | 88 | #if !defined(__ARM_FEATURE_CRC32) |
89 | // #pragma message("!defined(__ARM_FEATURE_CRC32)") | ||
90 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
90 | #define __ARM_FEATURE_CRC32 1 | 91 | #define __ARM_FEATURE_CRC32 1 |
92 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
93 | #define Z7_ARM_FEATURE_CRC32_WAS_SET | ||
91 | #if defined(__clang__) | 94 | #if defined(__clang__) |
92 | #if defined(MY_CPU_ARM64) | 95 | #if defined(MY_CPU_ARM64) |
93 | #define ATTRIB_CRC __attribute__((__target__("crc"))) | 96 | #define ATTRIB_CRC __attribute__((__target__("crc"))) |
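For reference, here is what the generic fallback defined in this hunk (`CrcUpdateGT1` built on `CRC_UPDATE_BYTE_2`) computes, as a free-standing sketch: one reflected CRC-32 step per input byte against a single 256-entry table, built exactly as in `CrcGenerateTable` further down. Names here are illustrative, not 7-Zip API:

```c
#include <stddef.h>
#include <stdint.h>

static uint32_t crc_table[256];

static void crc_table_init(void)
{
  for (uint32_t i = 0; i < 256; i++)
  {
    uint32_t r = i;
    for (int j = 0; j < 8; j++)              /* reflected polynomial */
      r = (r >> 1) ^ (0xEDB88320u & (0u - (r & 1)));
    crc_table[i] = r;
  }
}

/* equivalent of CRC_UPDATE_BYTE_2 applied over a buffer */
static uint32_t crc_update_t1(uint32_t v, const void *data, size_t size)
{
  const uint8_t *p = (const uint8_t *)data;
  for (; size != 0; size--, p++)
    v = crc_table[(v ^ *p) & 0xFF] ^ (v >> 8);
  return v;
}

/* CrcCalc convention: initial and final XOR with 0xFFFFFFFF */
static uint32_t crc_calc(const void *data, size_t size)
{
  return crc_update_t1(0xFFFFFFFFu, data, size) ^ 0xFFFFFFFFu;
}
```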
@@ -96,100 +99,120 @@ UInt32 Z7_FASTCALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UI | |||
96 | #endif | 99 | #endif |
97 | #else | 100 | #else |
98 | #if defined(MY_CPU_ARM64) | 101 | #if defined(MY_CPU_ARM64) |
102 | #if !defined(Z7_GCC_VERSION) || (Z7_GCC_VERSION >= 60000) | ||
99 | #define ATTRIB_CRC __attribute__((__target__("+crc"))) | 103 | #define ATTRIB_CRC __attribute__((__target__("+crc"))) |
104 | #endif | ||
100 | #else | 105 | #else |
106 | #if !defined(Z7_GCC_VERSION) || (__GNUC__ >= 8) | ||
107 | #if defined(__ARM_FP) && __GNUC__ >= 8 | ||
108 | // for -mfloat-abi=hard: similar to <arm_acle.h> | ||
109 | #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc+simd"))) | ||
110 | #else | ||
101 | #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc"))) | 111 | #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc"))) |
112 | #endif | ||
113 | #endif | ||
102 | #endif | 114 | #endif |
103 | #endif | 115 | #endif |
104 | #endif | 116 | #endif |
105 | #if defined(__ARM_FEATURE_CRC32) | 117 | #if defined(__ARM_FEATURE_CRC32) |
106 | #define USE_ARM64_CRC | 118 | // #pragma message("<arm_acle.h>") |
119 | /* | ||
120 | arm_acle.h (GCC): | ||
121 | before Nov 17, 2017: | ||
122 | #ifdef __ARM_FEATURE_CRC32 | ||
123 | |||
124 | Nov 17, 2017: gcc10.0 (gcc 9.2.0) checked: | ||
125 | #if __ARM_ARCH >= 8 | ||
126 | #pragma GCC target ("arch=armv8-a+crc") | ||
127 | |||
128 | Aug 22, 2019: GCC 8.4?, 9.2.1, 10.1: | ||
129 | #ifdef __ARM_FEATURE_CRC32 | ||
130 | #ifdef __ARM_FP | ||
131 | #pragma GCC target ("arch=armv8-a+crc+simd") | ||
132 | #else | ||
133 | #pragma GCC target ("arch=armv8-a+crc") | ||
134 | #endif | ||
135 | */ | ||
136 | #if defined(__ARM_ARCH) && __ARM_ARCH < 8 | ||
137 | #if defined(Z7_GCC_VERSION) && (__GNUC__ == 8) && (Z7_GCC_VERSION < 80400) \ | ||
138 | || defined(Z7_GCC_VERSION) && (__GNUC__ == 9) && (Z7_GCC_VERSION < 90201) \ | ||
139 | || defined(Z7_GCC_VERSION) && (__GNUC__ == 10) && (Z7_GCC_VERSION < 100100) | ||
140 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
141 | // #pragma message("#define __ARM_ARCH 8") | ||
142 | #undef __ARM_ARCH | ||
143 | #define __ARM_ARCH 8 | ||
144 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
145 | #endif | ||
146 | #endif | ||
147 | #define Z7_CRC_HW_USE | ||
107 | #include <arm_acle.h> | 148 | #include <arm_acle.h> |
108 | #endif | 149 | #endif |
150 | #elif defined(_MSC_VER) | ||
151 | #if defined(MY_CPU_ARM64) | ||
152 | #if (_MSC_VER >= 1910) | ||
153 | #ifdef __clang__ | ||
154 | // #define Z7_CRC_HW_USE | ||
155 | // #include <arm_acle.h> | ||
156 | #else | ||
157 | #define Z7_CRC_HW_USE | ||
158 | #include <intrin.h> | ||
159 | #endif | ||
160 | #endif | ||
161 | #endif | ||
109 | #endif | 162 | #endif |
110 | 163 | ||
111 | #else | 164 | #else // non-ARM* |
112 | |||
113 | // no hardware CRC | ||
114 | |||
115 | // #define USE_CRC_EMU | ||
116 | |||
117 | #ifdef USE_CRC_EMU | ||
118 | |||
119 | #pragma message("ARM64 CRC emulation") | ||
120 | |||
121 | Z7_FORCE_INLINE | ||
122 | UInt32 __crc32b(UInt32 v, UInt32 data) | ||
123 | { | ||
124 | const UInt32 *table = g_CrcTable; | ||
125 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); | ||
126 | return v; | ||
127 | } | ||
128 | 165 | ||
129 | Z7_FORCE_INLINE | 166 | // #define Z7_CRC_HW_USE // for debug : we can test HW-branch of code |
130 | UInt32 __crc32w(UInt32 v, UInt32 data) | 167 | #ifdef Z7_CRC_HW_USE |
131 | { | 168 | #include "7zCrcEmu.h" |
132 | const UInt32 *table = g_CrcTable; | 169 | #endif |
133 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
134 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
135 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
136 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
137 | return v; | ||
138 | } | ||
139 | 170 | ||
140 | Z7_FORCE_INLINE | 171 | #endif // non-ARM* |
141 | UInt32 __crc32d(UInt32 v, UInt64 data) | ||
142 | { | ||
143 | const UInt32 *table = g_CrcTable; | ||
144 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
145 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
146 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
147 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
148 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
149 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
150 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
151 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
152 | return v; | ||
153 | } | ||
154 | 172 | ||
155 | #endif // USE_CRC_EMU | ||
156 | 173 | ||
157 | #endif // defined(MY_CPU_ARM64) && defined(MY_CPU_LE) | ||
158 | 174 | ||
175 | #if defined(Z7_CRC_HW_USE) | ||
159 | 176 | ||
177 | // #pragma message("USE ARM HW CRC") | ||
160 | 178 | ||
161 | #if defined(USE_ARM64_CRC) || defined(USE_CRC_EMU) | 179 | #ifdef MY_CPU_64BIT |
180 | #define CRC_HW_WORD_TYPE UInt64 | ||
181 | #define CRC_HW_WORD_FUNC __crc32d | ||
182 | #else | ||
183 | #define CRC_HW_WORD_TYPE UInt32 | ||
184 | #define CRC_HW_WORD_FUNC __crc32w | ||
185 | #endif | ||
162 | 186 | ||
163 | #define T0_32_UNROLL_BYTES (4 * 4) | 187 | #define CRC_HW_UNROLL_BYTES (sizeof(CRC_HW_WORD_TYPE) * 4) |
164 | #define T0_64_UNROLL_BYTES (4 * 8) | ||
165 | 188 | ||
166 | #ifndef ATTRIB_CRC | 189 | #ifdef ATTRIB_CRC |
167 | #define ATTRIB_CRC | 190 | ATTRIB_CRC |
168 | #endif | 191 | #endif |
169 | // #pragma message("USE ARM HW CRC") | 192 | Z7_NO_INLINE |
170 | 193 | #ifdef Z7_CRC_HW_FORCE | |
171 | ATTRIB_CRC | 194 | UInt32 Z7_FASTCALL CrcUpdate |
172 | UInt32 Z7_FASTCALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table); | 195 | #else |
173 | ATTRIB_CRC | 196 | static UInt32 Z7_FASTCALL CrcUpdate_HW |
174 | UInt32 Z7_FASTCALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table) | 197 | #endif |
198 | (UInt32 v, const void *data, size_t size) | ||
175 | { | 199 | { |
176 | const Byte *p = (const Byte *)data; | 200 | const Byte *p = (const Byte *)data; |
177 | UNUSED_VAR(table); | 201 | for (; size != 0 && ((unsigned)(ptrdiff_t)p & (CRC_HW_UNROLL_BYTES - 1)) != 0; size--) |
178 | |||
179 | for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_32_UNROLL_BYTES - 1)) != 0; size--) | ||
180 | v = __crc32b(v, *p++); | 202 | v = __crc32b(v, *p++); |
181 | 203 | if (size >= CRC_HW_UNROLL_BYTES) | |
182 | if (size >= T0_32_UNROLL_BYTES) | ||
183 | { | 204 | { |
184 | const Byte *lim = p + size; | 205 | const Byte *lim = p + size; |
185 | size &= (T0_32_UNROLL_BYTES - 1); | 206 | size &= CRC_HW_UNROLL_BYTES - 1; |
186 | lim -= size; | 207 | lim -= size; |
187 | do | 208 | do |
188 | { | 209 | { |
189 | v = __crc32w(v, *(const UInt32 *)(const void *)(p)); | 210 | v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p)); |
190 | v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4; | 211 | v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE))); |
191 | v = __crc32w(v, *(const UInt32 *)(const void *)(p)); | 212 | p += 2 * sizeof(CRC_HW_WORD_TYPE); |
192 | v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4; | 213 | v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p)); |
214 | v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE))); | ||
215 | p += 2 * sizeof(CRC_HW_WORD_TYPE); | ||
193 | } | 216 | } |
194 | while (p != lim); | 217 | while (p != lim); |
195 | } | 218 | } |
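The hardware CRC path spanning this hunk and the next reduces to ARM's CRC32 instructions from `<arm_acle.h>`: advance bytewise until the pointer is aligned, then feed one word per instruction. A minimal compile-gated sketch, without the 4x unrolling that `CrcUpdate_HW` adds:

```c
#if defined(__ARM_FEATURE_CRC32)
#include <arm_acle.h>
#include <stddef.h>
#include <stdint.h>

static uint32_t crc32_hw(uint32_t v, const uint8_t *p, size_t size)
{
  /* head: single bytes until 8-byte alignment */
  for (; size != 0 && ((uintptr_t)p & 7) != 0; size--)
    v = __crc32b(v, *p++);
  /* body: one 64-bit CRC instruction per 8 bytes */
  for (; size >= 8; size -= 8, p += 8)
    v = __crc32d(v, *(const uint64_t *)(const void *)p);
  /* tail */
  for (; size != 0; size--)
    v = __crc32b(v, *p++);
  return v;
}
#endif
```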
@@ -200,46 +223,86 @@ UInt32 Z7_FASTCALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const | |||
200 | return v; | 223 | return v; |
201 | } | 224 | } |
202 | 225 | ||
203 | ATTRIB_CRC | 226 | #ifdef Z7_ARM_FEATURE_CRC32_WAS_SET |
204 | UInt32 Z7_FASTCALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table); | 227 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER |
205 | ATTRIB_CRC | 228 | #undef __ARM_FEATURE_CRC32 |
206 | UInt32 Z7_FASTCALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table) | 229 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER |
230 | #undef Z7_ARM_FEATURE_CRC32_WAS_SET | ||
231 | #endif | ||
232 | |||
233 | #endif // defined(Z7_CRC_HW_USE) | ||
234 | #endif // MY_CPU_LE | ||
235 | |||
236 | |||
237 | |||
238 | #ifndef Z7_CRC_HW_FORCE | ||
239 | |||
240 | #if defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) | ||
241 | /* | ||
242 | typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_WITH_TABLE_FUNC) | ||
243 | (UInt32 v, const void *data, size_t size, const UInt32 *table); | ||
244 | Z7_CRC_UPDATE_WITH_TABLE_FUNC g_CrcUpdate; | ||
245 | */ | ||
246 | static unsigned g_Crc_Algo; | ||
247 | #if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) | ||
248 | static unsigned g_Crc_Be; | ||
249 | #endif | ||
250 | #endif // defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) | ||
251 | |||
252 | |||
253 | |||
254 | Z7_NO_INLINE | ||
255 | #ifdef Z7_CRC_HW_USE | ||
256 | static UInt32 Z7_FASTCALL CrcUpdate_Base | ||
257 | #else | ||
258 | UInt32 Z7_FASTCALL CrcUpdate | ||
259 | #endif | ||
260 | (UInt32 crc, const void *data, size_t size) | ||
207 | { | 261 | { |
208 | const Byte *p = (const Byte *)data; | 262 | #if Z7_CRC_NUM_TABLES_USE == 1 |
209 | UNUSED_VAR(table); | 263 | return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size); |
264 | #else // Z7_CRC_NUM_TABLES_USE != 1 | ||
265 | #ifdef Z7_CRC_UPDATE_T1_FUNC_NAME | ||
266 | if (g_Crc_Algo == 1) | ||
267 | return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size); | ||
268 | #endif | ||
210 | 269 | ||
211 | for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_64_UNROLL_BYTES - 1)) != 0; size--) | 270 | #ifdef MY_CPU_LE |
212 | v = __crc32b(v, *p++); | 271 | return FUNC_NAME_LE(crc, data, size, g_CrcTable); |
272 | #elif defined(MY_CPU_BE) | ||
273 | return FUNC_NAME_BE(crc, data, size, g_CrcTable); | ||
274 | #else | ||
275 | if (g_Crc_Be) | ||
276 | return FUNC_NAME_BE(crc, data, size, g_CrcTable); | ||
277 | else | ||
278 | return FUNC_NAME_LE(crc, data, size, g_CrcTable); | ||
279 | #endif | ||
280 | #endif // Z7_CRC_NUM_TABLES_USE != 1 | ||
281 | } | ||
213 | 282 | ||
214 | if (size >= T0_64_UNROLL_BYTES) | ||
215 | { | ||
216 | const Byte *lim = p + size; | ||
217 | size &= (T0_64_UNROLL_BYTES - 1); | ||
218 | lim -= size; | ||
219 | do | ||
220 | { | ||
221 | v = __crc32d(v, *(const UInt64 *)(const void *)(p)); | ||
222 | v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8; | ||
223 | v = __crc32d(v, *(const UInt64 *)(const void *)(p)); | ||
224 | v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8; | ||
225 | } | ||
226 | while (p != lim); | ||
227 | } | ||
228 | |||
229 | for (; size != 0; size--) | ||
230 | v = __crc32b(v, *p++); | ||
231 | 283 | ||
232 | return v; | 284 | #ifdef Z7_CRC_HW_USE |
285 | Z7_NO_INLINE | ||
286 | UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size) | ||
287 | { | ||
288 | if (g_Crc_Algo == 0) | ||
289 | return CrcUpdate_HW(crc, data, size); | ||
290 | return CrcUpdate_Base(crc, data, size); | ||
233 | } | 291 | } |
292 | #endif | ||
234 | 293 | ||
235 | #undef T0_32_UNROLL_BYTES | 294 | #endif // !defined(Z7_CRC_HW_FORCE) |
236 | #undef T0_64_UNROLL_BYTES | ||
237 | 295 | ||
238 | #endif // defined(USE_ARM64_CRC) || defined(USE_CRC_EMU) | ||
239 | 296 | ||
240 | #endif // MY_CPU_LE | 297 | |
298 | UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size) | ||
299 | { | ||
300 | return CrcUpdate(CRC_INIT_VAL, data, size) ^ CRC_INIT_VAL; | ||
301 | } | ||
241 | 302 | ||
242 | 303 | ||
304 | MY_ALIGN(64) | ||
305 | UInt32 g_CrcTable[256 * Z7_CRC_NUM_TABLES_TOTAL]; | ||
243 | 306 | ||
244 | 307 | ||
245 | void Z7_FASTCALL CrcGenerateTable(void) | 308 | void Z7_FASTCALL CrcGenerateTable(void) |
@@ -247,94 +310,111 @@ void Z7_FASTCALL CrcGenerateTable(void) | |||
247 | UInt32 i; | 310 | UInt32 i; |
248 | for (i = 0; i < 256; i++) | 311 | for (i = 0; i < 256; i++) |
249 | { | 312 | { |
313 | #if defined(Z7_CRC_HW_FORCE) | ||
314 | g_CrcTable[i] = __crc32b(i, 0); | ||
315 | #else | ||
316 | #define kCrcPoly 0xEDB88320 | ||
250 | UInt32 r = i; | 317 | UInt32 r = i; |
251 | unsigned j; | 318 | unsigned j; |
252 | for (j = 0; j < 8; j++) | 319 | for (j = 0; j < 8; j++) |
253 | r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1))); | 320 | r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1))); |
254 | g_CrcTable[i] = r; | 321 | g_CrcTable[i] = r; |
322 | #endif | ||
255 | } | 323 | } |
256 | for (i = 256; i < 256 * CRC_NUM_TABLES; i++) | 324 | for (i = 256; i < 256 * Z7_CRC_NUM_TABLES_USE; i++) |
257 | { | 325 | { |
258 | const UInt32 r = g_CrcTable[(size_t)i - 256]; | 326 | const UInt32 r = g_CrcTable[(size_t)i - 256]; |
259 | g_CrcTable[i] = g_CrcTable[r & 0xFF] ^ (r >> 8); | 327 | g_CrcTable[i] = g_CrcTable[r & 0xFF] ^ (r >> 8); |
260 | } | 328 | } |
261 | 329 | ||
262 | #if CRC_NUM_TABLES < 4 | 330 | #if !defined(Z7_CRC_HW_FORCE) && \ |
263 | g_CrcUpdate = CrcUpdateT1; | 331 | (defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) || defined(MY_CPU_BE)) |
264 | #elif defined(MY_CPU_LE) | 332 | |
265 | // g_CrcUpdateT4 = CrcUpdateT4; | 333 | #if Z7_CRC_NUM_TABLES_USE <= 1 |
266 | #if CRC_NUM_TABLES < 8 | 334 | g_Crc_Algo = 1; |
267 | g_CrcUpdate = CrcUpdateT4; | 335 | #else // Z7_CRC_NUM_TABLES_USE <= 1 |
268 | #else // CRC_NUM_TABLES >= 8 | 336 | |
269 | g_CrcUpdateT8 = CrcUpdateT8; | 337 | #if defined(MY_CPU_LE) |
270 | /* | 338 | g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; |
271 | #ifdef MY_CPU_X86_OR_AMD64 | 339 | #else // !defined(MY_CPU_LE) |
272 | if (!CPU_Is_InOrder()) | ||
273 | #endif | ||
274 | */ | ||
275 | g_CrcUpdate = CrcUpdateT8; | ||
276 | #endif | ||
277 | #else | ||
278 | { | 340 | { |
279 | #ifndef MY_CPU_BE | 341 | #ifndef MY_CPU_BE |
280 | UInt32 k = 0x01020304; | 342 | UInt32 k = 0x01020304; |
281 | const Byte *p = (const Byte *)&k; | 343 | const Byte *p = (const Byte *)&k; |
282 | if (p[0] == 4 && p[1] == 3) | 344 | if (p[0] == 4 && p[1] == 3) |
283 | { | 345 | g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; |
284 | #if CRC_NUM_TABLES < 8 | ||
285 | // g_CrcUpdateT4 = CrcUpdateT4; | ||
286 | g_CrcUpdate = CrcUpdateT4; | ||
287 | #else // CRC_NUM_TABLES >= 8 | ||
288 | g_CrcUpdateT8 = CrcUpdateT8; | ||
289 | g_CrcUpdate = CrcUpdateT8; | ||
290 | #endif | ||
291 | } | ||
292 | else if (p[0] != 1 || p[1] != 2) | 346 | else if (p[0] != 1 || p[1] != 2) |
293 | g_CrcUpdate = CrcUpdateT1; | 347 | g_Crc_Algo = 1; |
294 | else | 348 | else |
295 | #endif // MY_CPU_BE | 349 | #endif // MY_CPU_BE |
296 | { | 350 | { |
297 | for (i = 256 * CRC_NUM_TABLES - 1; i >= 256; i--) | 351 | for (i = 256 * Z7_CRC_NUM_TABLES_TOTAL - 1; i >= 256; i--) |
298 | { | 352 | { |
299 | const UInt32 x = g_CrcTable[(size_t)i - 256]; | 353 | const UInt32 x = g_CrcTable[(size_t)i - 256]; |
300 | g_CrcTable[i] = Z7_BSWAP32(x); | 354 | g_CrcTable[i] = Z7_BSWAP32(x); |
301 | } | 355 | } |
302 | #if CRC_NUM_TABLES <= 4 | 356 | #if defined(Z7_CRC_UPDATE_T1_FUNC_NAME) |
303 | g_CrcUpdate = CrcUpdateT1; | 357 | g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; |
304 | #elif CRC_NUM_TABLES <= 8 | 358 | #endif |
305 | // g_CrcUpdateT4 = CrcUpdateT1_BeT4; | 359 | #if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) |
306 | g_CrcUpdate = CrcUpdateT1_BeT4; | 360 | g_Crc_Be = 1; |
307 | #else // CRC_NUM_TABLES > 8 | 361 | #endif |
308 | g_CrcUpdateT8 = CrcUpdateT1_BeT8; | ||
309 | g_CrcUpdate = CrcUpdateT1_BeT8; | ||
310 | #endif | ||
311 | } | 362 | } |
312 | } | 363 | } |
313 | #endif // CRC_NUM_TABLES < 4 | 364 | #endif // !defined(MY_CPU_LE) |
314 | 365 | ||
315 | #ifdef MY_CPU_LE | 366 | #ifdef MY_CPU_LE |
316 | #ifdef USE_ARM64_CRC | 367 | #ifdef Z7_CRC_HW_USE |
317 | if (CPU_IsSupported_CRC32()) | 368 | if (CPU_IsSupported_CRC32()) |
318 | { | 369 | g_Crc_Algo = 0; |
319 | g_CrcUpdateT0_32 = CrcUpdateT0_32; | 370 | #endif // Z7_CRC_HW_USE |
320 | g_CrcUpdateT0_64 = CrcUpdateT0_64; | 371 | #endif // MY_CPU_LE |
321 | g_CrcUpdate = | 372 | |
322 | #if defined(MY_CPU_ARM) | 373 | #endif // Z7_CRC_NUM_TABLES_USE <= 1 |
323 | CrcUpdateT0_32; | 374 | #endif // g_Crc_Algo was declared |
324 | #else | 375 | } |
325 | CrcUpdateT0_64; | 376 | |
326 | #endif | 377 | Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo) |
327 | } | 378 | { |
328 | #endif | 379 | if (algo == 0) |
329 | 380 | return &CrcUpdate; | |
330 | #ifdef USE_CRC_EMU | 381 | |
331 | g_CrcUpdateT0_32 = CrcUpdateT0_32; | 382 | #if defined(Z7_CRC_HW_USE) |
332 | g_CrcUpdateT0_64 = CrcUpdateT0_64; | 383 | if (algo == sizeof(CRC_HW_WORD_TYPE) * 8) |
333 | g_CrcUpdate = CrcUpdateT0_64; | 384 | { |
334 | #endif | 385 | #ifdef Z7_CRC_HW_FORCE |
386 | return &CrcUpdate; | ||
387 | #else | ||
388 | if (g_Crc_Algo == 0) | ||
389 | return &CrcUpdate_HW; | ||
390 | #endif | ||
391 | } | ||
392 | #endif | ||
393 | |||
394 | #ifndef Z7_CRC_HW_FORCE | ||
395 | if (algo == Z7_CRC_NUM_TABLES_USE) | ||
396 | return | ||
397 | #ifdef Z7_CRC_HW_USE | ||
398 | &CrcUpdate_Base; | ||
399 | #else | ||
400 | &CrcUpdate; | ||
335 | #endif | 401 | #endif |
402 | #endif | ||
403 | |||
404 | return NULL; | ||
336 | } | 405 | } |
337 | 406 | ||
338 | #undef kCrcPoly | 407 | #undef kCrcPoly |
339 | #undef CRC64_NUM_TABLES | 408 | #undef Z7_CRC_NUM_TABLES_USE |
409 | #undef Z7_CRC_NUM_TABLES_TOTAL | ||
340 | #undef CRC_UPDATE_BYTE_2 | 410 | #undef CRC_UPDATE_BYTE_2 |
411 | #undef FUNC_NAME_LE_2 | ||
412 | #undef FUNC_NAME_LE_1 | ||
413 | #undef FUNC_NAME_LE | ||
414 | #undef FUNC_NAME_BE_2 | ||
415 | #undef FUNC_NAME_BE_1 | ||
416 | #undef FUNC_NAME_BE | ||
417 | |||
418 | #undef CRC_HW_UNROLL_BYTES | ||
419 | #undef CRC_HW_WORD_FUNC | ||
420 | #undef CRC_HW_WORD_TYPE | ||
diff --git a/C/7zCrc.h b/C/7zCrc.h
@@ -1,5 +1,5 @@ | |||
1 | /* 7zCrc.h -- CRC32 calculation | 1 | /* 7zCrc.h -- CRC32 calculation |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-22 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_7Z_CRC_H | 4 | #ifndef ZIP7_INC_7Z_CRC_H |
5 | #define ZIP7_INC_7Z_CRC_H | 5 | #define ZIP7_INC_7Z_CRC_H |
@@ -20,7 +20,8 @@ void Z7_FASTCALL CrcGenerateTable(void); | |||
20 | UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size); | 20 | UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size); |
21 | UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size); | 21 | UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size); |
22 | 22 | ||
23 | typedef UInt32 (Z7_FASTCALL *CRC_FUNC)(UInt32 v, const void *data, size_t size, const UInt32 *table); | 23 | typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_FUNC)(UInt32 v, const void *data, size_t size); |
24 | Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo); | ||
24 | 25 | ||
25 | EXTERN_C_END | 26 | EXTERN_C_END |
26 | 27 | ||
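Sketch of using the reworked interface: `CrcUpdate` has lost its explicit table parameter, and the new `z7_GetFunc_CrcUpdate(algo)` exposes individual implementations (algo 0 always returns the default dispatcher; other values return NULL when that variant is unavailable). This assumes `CRC_INIT_VAL` is 0xFFFFFFFF, as defined elsewhere in 7zCrc.h:

```c
#include <stdio.h>
#include "7zCrc.h"

void crc_demo(const void *buf, size_t len)
{
  CrcGenerateTable();  /* must run once before any CRC call */
  {
    const Z7_CRC_UPDATE_FUNC f = z7_GetFunc_CrcUpdate(0);  /* default */
    const UInt32 crc = f(CRC_INIT_VAL, buf, len) ^ CRC_INIT_VAL;
    printf("crc32 = %08x\n", (unsigned)crc);
  }
}
```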
diff --git a/C/7zCrcOpt.c b/C/7zCrcOpt.c
index 9c64929..9408017 100644
--- a/C/7zCrcOpt.c
+++ b/C/7zCrcOpt.c
@@ -1,117 +1,199 @@ | |||
1 | /* 7zCrcOpt.c -- CRC32 calculation | 1 | /* 7zCrcOpt.c -- CRC32 calculation (optimized functions) |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2023-12-07 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
6 | #include "CpuArch.h" | 6 | #include "CpuArch.h" |
7 | 7 | ||
8 | #if !defined(Z7_CRC_NUM_TABLES) || Z7_CRC_NUM_TABLES > 1 | ||
9 | |||
10 | // for debug only : define Z7_CRC_DEBUG_BE to test big-endian code in little-endian cpu | ||
11 | // #define Z7_CRC_DEBUG_BE | ||
12 | #ifdef Z7_CRC_DEBUG_BE | ||
13 | #undef MY_CPU_LE | ||
14 | #define MY_CPU_BE | ||
15 | #endif | ||
16 | |||
17 | // the value Z7_CRC_NUM_TABLES_USE must be defined to the same value as in 7zCrc.c | ||
18 | #ifdef Z7_CRC_NUM_TABLES | ||
19 | #define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES | ||
20 | #else | ||
21 | #define Z7_CRC_NUM_TABLES_USE 12 | ||
22 | #endif | ||
23 | |||
24 | #if Z7_CRC_NUM_TABLES_USE % 4 || \ | ||
25 | Z7_CRC_NUM_TABLES_USE < 4 * 1 || \ | ||
26 | Z7_CRC_NUM_TABLES_USE > 4 * 6 | ||
27 | #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES | ||
28 | #endif | ||
29 | |||
30 | |||
8 | #ifndef MY_CPU_BE | 31 | #ifndef MY_CPU_BE |
9 | 32 | ||
10 | #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) | 33 | #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) |
11 | 34 | ||
12 | UInt32 Z7_FASTCALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table); | 35 | #define Q(n, d) \ |
13 | UInt32 Z7_FASTCALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table) | 36 | ( (table + ((n) * 4 + 3) * 0x100)[(Byte)(d)] \ |
14 | { | 37 | ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \ |
15 | const Byte *p = (const Byte *)data; | 38 | ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \ |
16 | for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) | 39 | ^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] ) |
17 | v = CRC_UPDATE_BYTE_2(v, *p); | 40 | |
18 | for (; size >= 4; size -= 4, p += 4) | 41 | #define R(a) *((const UInt32 *)(const void *)p + (a)) |
19 | { | 42 | |
20 | v ^= *(const UInt32 *)(const void *)p; | 43 | #define CRC_FUNC_PRE_LE2(step) \ |
21 | v = | 44 | UInt32 Z7_FASTCALL CrcUpdateT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table) |
22 | (table + 0x300)[((v ) & 0xFF)] | ||
23 | ^ (table + 0x200)[((v >> 8) & 0xFF)] | ||
24 | ^ (table + 0x100)[((v >> 16) & 0xFF)] | ||
25 | ^ (table + 0x000)[((v >> 24))]; | ||
26 | } | ||
27 | for (; size > 0; size--, p++) | ||
28 | v = CRC_UPDATE_BYTE_2(v, *p); | ||
29 | return v; | ||
30 | } | ||
31 | 45 | ||
32 | UInt32 Z7_FASTCALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table); | 46 | #define CRC_FUNC_PRE_LE(step) \ |
33 | UInt32 Z7_FASTCALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table) | 47 | CRC_FUNC_PRE_LE2(step); \ |
48 | CRC_FUNC_PRE_LE2(step) | ||
49 | |||
50 | CRC_FUNC_PRE_LE(Z7_CRC_NUM_TABLES_USE) | ||
34 | { | 51 | { |
35 | const Byte *p = (const Byte *)data; | 52 | const Byte *p = (const Byte *)data; |
36 | for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++) | 53 | const Byte *lim; |
54 | for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++) | ||
37 | v = CRC_UPDATE_BYTE_2(v, *p); | 55 | v = CRC_UPDATE_BYTE_2(v, *p); |
38 | for (; size >= 8; size -= 8, p += 8) | 56 | lim = p + size; |
57 | if (size >= Z7_CRC_NUM_TABLES_USE) | ||
39 | { | 58 | { |
40 | UInt32 d; | 59 | lim -= Z7_CRC_NUM_TABLES_USE; |
41 | v ^= *(const UInt32 *)(const void *)p; | 60 | do |
42 | v = | 61 | { |
43 | (table + 0x700)[((v ) & 0xFF)] | 62 | v ^= R(0); |
44 | ^ (table + 0x600)[((v >> 8) & 0xFF)] | 63 | { |
45 | ^ (table + 0x500)[((v >> 16) & 0xFF)] | 64 | #if Z7_CRC_NUM_TABLES_USE == 1 * 4 |
46 | ^ (table + 0x400)[((v >> 24))]; | 65 | v = Q(0, v); |
47 | d = *((const UInt32 *)(const void *)p + 1); | 66 | #else |
48 | v ^= | 67 | #define U2(r, op) \ |
49 | (table + 0x300)[((d ) & 0xFF)] | 68 | { d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); } |
50 | ^ (table + 0x200)[((d >> 8) & 0xFF)] | 69 | UInt32 d, x; |
51 | ^ (table + 0x100)[((d >> 16) & 0xFF)] | 70 | U2(1, =) |
52 | ^ (table + 0x000)[((d >> 24))]; | 71 | #if Z7_CRC_NUM_TABLES_USE >= 3 * 4 |
72 | #define U(r) U2(r, ^=) | ||
73 | U(2) | ||
74 | #if Z7_CRC_NUM_TABLES_USE >= 4 * 4 | ||
75 | U(3) | ||
76 | #if Z7_CRC_NUM_TABLES_USE >= 5 * 4 | ||
77 | U(4) | ||
78 | #if Z7_CRC_NUM_TABLES_USE >= 6 * 4 | ||
79 | U(5) | ||
80 | #if Z7_CRC_NUM_TABLES_USE >= 7 * 4 | ||
81 | #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES | ||
82 | #endif | ||
83 | #endif | ||
84 | #endif | ||
85 | #endif | ||
86 | #endif | ||
87 | #undef U | ||
88 | #undef U2 | ||
89 | v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v); | ||
90 | #endif | ||
91 | } | ||
92 | p += Z7_CRC_NUM_TABLES_USE; | ||
93 | } | ||
94 | while (p <= lim); | ||
95 | lim += Z7_CRC_NUM_TABLES_USE; | ||
53 | } | 96 | } |
54 | for (; size > 0; size--, p++) | 97 | for (; p < lim; p++) |
55 | v = CRC_UPDATE_BYTE_2(v, *p); | 98 | v = CRC_UPDATE_BYTE_2(v, *p); |
56 | return v; | 99 | return v; |
57 | } | 100 | } |
58 | 101 | ||
102 | #undef CRC_UPDATE_BYTE_2 | ||
103 | #undef R | ||
104 | #undef Q | ||
105 | #undef CRC_FUNC_PRE_LE | ||
106 | #undef CRC_FUNC_PRE_LE2 | ||
107 | |||
59 | #endif | 108 | #endif |
60 | 109 | ||
61 | 110 | ||
111 | |||
112 | |||
62 | #ifndef MY_CPU_LE | 113 | #ifndef MY_CPU_LE |
63 | 114 | ||
64 | #define CRC_UINT32_SWAP(v) Z7_BSWAP32(v) | 115 | #define CRC_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 24) ^ (b)] ^ ((crc) << 8)) |
65 | 116 | ||
66 | #define CRC_UPDATE_BYTE_2_BE(crc, b) (table[(((crc) >> 24) ^ (b))] ^ ((crc) << 8)) | 117 | #define Q(n, d) \ |
118 | ( (table + ((n) * 4 + 0) * 0x100)[((d)) & 0xFF] \ | ||
119 | ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ | ||
120 | ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ | ||
121 | ^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] ) | ||
67 | 122 | ||
68 | UInt32 Z7_FASTCALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table) | 123 | #ifdef Z7_CRC_DEBUG_BE |
69 | { | 124 | #define R(a) GetBe32a((const UInt32 *)(const void *)p + (a)) |
70 | const Byte *p = (const Byte *)data; | 125 | #else |
71 | table += 0x100; | 126 | #define R(a) *((const UInt32 *)(const void *)p + (a)) |
72 | v = CRC_UINT32_SWAP(v); | 127 | #endif |
73 | for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) | 128 | |
74 | v = CRC_UPDATE_BYTE_2_BE(v, *p); | 129 | |
75 | for (; size >= 4; size -= 4, p += 4) | 130 | #define CRC_FUNC_PRE_BE2(step) \ |
76 | { | 131 | UInt32 Z7_FASTCALL CrcUpdateT1_BeT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table) |
77 | v ^= *(const UInt32 *)(const void *)p; | ||
78 | v = | ||
79 | (table + 0x000)[((v ) & 0xFF)] | ||
80 | ^ (table + 0x100)[((v >> 8) & 0xFF)] | ||
81 | ^ (table + 0x200)[((v >> 16) & 0xFF)] | ||
82 | ^ (table + 0x300)[((v >> 24))]; | ||
83 | } | ||
84 | for (; size > 0; size--, p++) | ||
85 | v = CRC_UPDATE_BYTE_2_BE(v, *p); | ||
86 | return CRC_UINT32_SWAP(v); | ||
87 | } | ||
88 | 132 | ||
89 | UInt32 Z7_FASTCALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table) | 133 | #define CRC_FUNC_PRE_BE(step) \ |
134 | CRC_FUNC_PRE_BE2(step); \ | ||
135 | CRC_FUNC_PRE_BE2(step) | ||
136 | |||
137 | CRC_FUNC_PRE_BE(Z7_CRC_NUM_TABLES_USE) | ||
90 | { | 138 | { |
91 | const Byte *p = (const Byte *)data; | 139 | const Byte *p = (const Byte *)data; |
140 | const Byte *lim; | ||
92 | table += 0x100; | 141 | table += 0x100; |
93 | v = CRC_UINT32_SWAP(v); | 142 | v = Z7_BSWAP32(v); |
94 | for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++) | 143 | for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++) |
95 | v = CRC_UPDATE_BYTE_2_BE(v, *p); | 144 | v = CRC_UPDATE_BYTE_2_BE(v, *p); |
96 | for (; size >= 8; size -= 8, p += 8) | 145 | lim = p + size; |
146 | if (size >= Z7_CRC_NUM_TABLES_USE) | ||
97 | { | 147 | { |
98 | UInt32 d; | 148 | lim -= Z7_CRC_NUM_TABLES_USE; |
99 | v ^= *(const UInt32 *)(const void *)p; | 149 | do |
100 | v = | 150 | { |
101 | (table + 0x400)[((v ) & 0xFF)] | 151 | v ^= R(0); |
102 | ^ (table + 0x500)[((v >> 8) & 0xFF)] | 152 | { |
103 | ^ (table + 0x600)[((v >> 16) & 0xFF)] | 153 | #if Z7_CRC_NUM_TABLES_USE == 1 * 4 |
104 | ^ (table + 0x700)[((v >> 24))]; | 154 | v = Q(0, v); |
105 | d = *((const UInt32 *)(const void *)p + 1); | 155 | #else |
106 | v ^= | 156 | #define U2(r, op) \ |
107 | (table + 0x000)[((d ) & 0xFF)] | 157 | { d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); } |
108 | ^ (table + 0x100)[((d >> 8) & 0xFF)] | 158 | UInt32 d, x; |
109 | ^ (table + 0x200)[((d >> 16) & 0xFF)] | 159 | U2(1, =) |
110 | ^ (table + 0x300)[((d >> 24))]; | 160 | #if Z7_CRC_NUM_TABLES_USE >= 3 * 4 |
161 | #define U(r) U2(r, ^=) | ||
162 | U(2) | ||
163 | #if Z7_CRC_NUM_TABLES_USE >= 4 * 4 | ||
164 | U(3) | ||
165 | #if Z7_CRC_NUM_TABLES_USE >= 5 * 4 | ||
166 | U(4) | ||
167 | #if Z7_CRC_NUM_TABLES_USE >= 6 * 4 | ||
168 | U(5) | ||
169 | #if Z7_CRC_NUM_TABLES_USE >= 7 * 4 | ||
170 | #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES | ||
171 | #endif | ||
172 | #endif | ||
173 | #endif | ||
174 | #endif | ||
175 | #endif | ||
176 | #undef U | ||
177 | #undef U2 | ||
178 | v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v); | ||
179 | #endif | ||
180 | } | ||
181 | p += Z7_CRC_NUM_TABLES_USE; | ||
182 | } | ||
183 | while (p <= lim); | ||
184 | lim += Z7_CRC_NUM_TABLES_USE; | ||
111 | } | 185 | } |
112 | for (; size > 0; size--, p++) | 186 | for (; p < lim; p++) |
113 | v = CRC_UPDATE_BYTE_2_BE(v, *p); | 187 | v = CRC_UPDATE_BYTE_2_BE(v, *p); |
114 | return CRC_UINT32_SWAP(v); | 188 | return Z7_BSWAP32(v); |
115 | } | 189 | } |
116 | 190 | ||
191 | #undef CRC_UPDATE_BYTE_2_BE | ||
192 | #undef R | ||
193 | #undef Q | ||
194 | #undef CRC_FUNC_PRE_BE | ||
195 | #undef CRC_FUNC_PRE_BE2 | ||
196 | |||
197 | #endif | ||
198 | #undef Z7_CRC_NUM_TABLES_USE | ||
117 | #endif | 199 | #endif |
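The `Q()`/`R()` macro machinery above generalizes the classic slicing technique from the old fixed `CrcUpdateT4`/`CrcUpdateT8` pair to any multiple-of-4 width up to 24 bytes per iteration. Stripped of macros, slicing-by-4 on a little-endian CPU looks like this sketch (equivalent to the removed `CrcUpdateT4`):

```c
#include <stddef.h>
#include <stdint.h>

static uint32_t crc_slice4(uint32_t v, const uint8_t *p, size_t size,
                           const uint32_t *table /* 4 * 256 entries */)
{
  /* head: bytewise until p is 4-byte aligned */
  for (; size != 0 && ((uintptr_t)p & 3) != 0; size--, p++)
    v = table[(v ^ *p) & 0xFF] ^ (v >> 8);
  /* body: 4 bytes per step, one 256-entry table block per byte lane */
  for (; size >= 4; size -= 4, p += 4)
  {
    v ^= *(const uint32_t *)(const void *)p;   /* aligned LE load */
    v = (table + 3 * 0x100)[v & 0xFF]
      ^ (table + 2 * 0x100)[(v >> 8) & 0xFF]
      ^ (table + 1 * 0x100)[(v >> 16) & 0xFF]
      ^ (table + 0 * 0x100)[v >> 24];
  }
  /* tail */
  for (; size != 0; size--, p++)
    v = table[(v ^ *p) & 0xFF] ^ (v >> 8);
  return v;
}
```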
diff --git a/C/7zDec.c b/C/7zDec.c
@@ -1,5 +1,5 @@ | |||
1 | /* 7zDec.c -- Decoding from 7z folder | 1 | /* 7zDec.c -- Decoding from 7z folder |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -51,6 +51,7 @@ | |||
51 | 51 | ||
52 | #ifndef Z7_NO_METHODS_FILTERS | 52 | #ifndef Z7_NO_METHODS_FILTERS |
53 | #define k_Delta 3 | 53 | #define k_Delta 3 |
54 | #define k_RISCV 0xb | ||
54 | #define k_BCJ 0x3030103 | 55 | #define k_BCJ 0x3030103 |
55 | #define k_PPC 0x3030205 | 56 | #define k_PPC 0x3030205 |
56 | #define k_IA64 0x3030401 | 57 | #define k_IA64 0x3030401 |
@@ -362,6 +363,7 @@ static SRes CheckSupportedFolder(const CSzFolder *f) | |||
362 | case k_IA64: | 363 | case k_IA64: |
363 | case k_SPARC: | 364 | case k_SPARC: |
364 | case k_ARM: | 365 | case k_ARM: |
366 | case k_RISCV: | ||
365 | #endif | 367 | #endif |
366 | #ifdef Z7_USE_FILTER_ARM64 | 368 | #ifdef Z7_USE_FILTER_ARM64 |
367 | case k_ARM64: | 369 | case k_ARM64: |
@@ -535,10 +537,10 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, | |||
535 | } | 537 | } |
536 | } | 538 | } |
537 | } | 539 | } |
538 | #if defined(Z7_USE_BRANCH_FILTER) | 540 | #if defined(Z7_USE_BRANCH_FILTER) |
539 | else if (ci == 1) | 541 | else if (ci == 1) |
540 | { | 542 | { |
541 | #if !defined(Z7_NO_METHODS_FILTERS) | 543 | #if !defined(Z7_NO_METHODS_FILTERS) |
542 | if (coder->MethodID == k_Delta) | 544 | if (coder->MethodID == k_Delta) |
543 | { | 545 | { |
544 | if (coder->PropsSize != 1) | 546 | if (coder->PropsSize != 1) |
@@ -550,22 +552,43 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, | |||
550 | } | 552 | } |
551 | continue; | 553 | continue; |
552 | } | 554 | } |
553 | #endif | 555 | #endif |
554 | 556 | ||
555 | #ifdef Z7_USE_FILTER_ARM64 | 557 | #ifdef Z7_USE_FILTER_ARM64 |
556 | if (coder->MethodID == k_ARM64) | 558 | if (coder->MethodID == k_ARM64) |
557 | { | 559 | { |
558 | UInt32 pc = 0; | 560 | UInt32 pc = 0; |
559 | if (coder->PropsSize == 4) | 561 | if (coder->PropsSize == 4) |
562 | { | ||
560 | pc = GetUi32(propsData + coder->PropsOffset); | 563 | pc = GetUi32(propsData + coder->PropsOffset); |
564 | if (pc & 3) | ||
565 | return SZ_ERROR_UNSUPPORTED; | ||
566 | } | ||
561 | else if (coder->PropsSize != 0) | 567 | else if (coder->PropsSize != 0) |
562 | return SZ_ERROR_UNSUPPORTED; | 568 | return SZ_ERROR_UNSUPPORTED; |
563 | z7_BranchConv_ARM64_Dec(outBuffer, outSize, pc); | 569 | z7_BranchConv_ARM64_Dec(outBuffer, outSize, pc); |
564 | continue; | 570 | continue; |
565 | } | 571 | } |
566 | #endif | 572 | #endif |
567 | 573 | ||
568 | #if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT) | 574 | #if !defined(Z7_NO_METHODS_FILTERS) |
575 | if (coder->MethodID == k_RISCV) | ||
576 | { | ||
577 | UInt32 pc = 0; | ||
578 | if (coder->PropsSize == 4) | ||
579 | { | ||
580 | pc = GetUi32(propsData + coder->PropsOffset); | ||
581 | if (pc & 1) | ||
582 | return SZ_ERROR_UNSUPPORTED; | ||
583 | } | ||
584 | else if (coder->PropsSize != 0) | ||
585 | return SZ_ERROR_UNSUPPORTED; | ||
586 | z7_BranchConv_RISCV_Dec(outBuffer, outSize, pc); | ||
587 | continue; | ||
588 | } | ||
589 | #endif | ||
590 | |||
591 | #if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT) | ||
569 | { | 592 | { |
570 | if (coder->PropsSize != 0) | 593 | if (coder->PropsSize != 0) |
571 | return SZ_ERROR_UNSUPPORTED; | 594 | return SZ_ERROR_UNSUPPORTED; |
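Both new filter branches follow one validation pattern: the coder may carry an optional 4-byte little-endian start address (`pc`) in its props, and that address must be aligned to the target's instruction size, 4 bytes for ARM64 (`pc & 3`) and 2 bytes for RISC-V (`pc & 1`). A hypothetical helper capturing the pattern (error values match 7zTypes.h):

```c
#include <stdint.h>

#define SZ_OK 0
#define SZ_ERROR_UNSUPPORTED 4

static int ReadFilterPc(const uint8_t *props, unsigned propsSize,
                        uint32_t alignMask, uint32_t *pc)
{
  *pc = 0;
  if (propsSize == 4)
  {
    /* little-endian decode, as GetUi32() does */
    *pc = (uint32_t)props[0]
        | ((uint32_t)props[1] << 8)
        | ((uint32_t)props[2] << 16)
        | ((uint32_t)props[3] << 24);
    if (*pc & alignMask)            /* 3 for ARM64, 1 for RISC-V */
      return SZ_ERROR_UNSUPPORTED;
  }
  else if (propsSize != 0)
    return SZ_ERROR_UNSUPPORTED;
  return SZ_OK;
}
```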
@@ -579,7 +602,8 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, | |||
579 | z7_BranchConvSt_X86_Dec(outBuffer, outSize, 0, &state); // pc = 0 | 602 | z7_BranchConvSt_X86_Dec(outBuffer, outSize, 0, &state); // pc = 0 |
580 | break; | 603 | break; |
581 | } | 604 | } |
582 | CASE_BRA_CONV(PPC) | 605 | case k_PPC: Z7_BRANCH_CONV_DEC_2(BranchConv_PPC)(outBuffer, outSize, 0); break; // pc = 0; |
606 | // CASE_BRA_CONV(PPC) | ||
583 | CASE_BRA_CONV(IA64) | 607 | CASE_BRA_CONV(IA64) |
584 | CASE_BRA_CONV(SPARC) | 608 | CASE_BRA_CONV(SPARC) |
585 | CASE_BRA_CONV(ARM) | 609 | CASE_BRA_CONV(ARM) |
@@ -592,9 +616,9 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, | |||
592 | } | 616 | } |
593 | continue; | 617 | continue; |
594 | } | 618 | } |
595 | #endif | 619 | #endif |
596 | } // (c == 1) | 620 | } // (c == 1) |
597 | #endif | 621 | #endif // Z7_USE_BRANCH_FILTER |
598 | else | 622 | else |
599 | return SZ_ERROR_UNSUPPORTED; | 623 | return SZ_ERROR_UNSUPPORTED; |
600 | } | 624 | } |
diff --git a/C/7zTypes.h b/C/7zTypes.h
index 1fcb247..5b77420 100644
--- a/C/7zTypes.h
+++ b/C/7zTypes.h
@@ -1,5 +1,5 @@ | |||
1 | /* 7zTypes.h -- Basic types | 1 | /* 7zTypes.h -- Basic types |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-24 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_7Z_TYPES_H | 4 | #ifndef ZIP7_7Z_TYPES_H |
5 | #define ZIP7_7Z_TYPES_H | 5 | #define ZIP7_7Z_TYPES_H |
@@ -530,20 +530,20 @@ struct ISzAlloc | |||
530 | #define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL(ptr, type, m) | 530 | #define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL(ptr, type, m) |
531 | */ | 531 | */ |
532 | #if defined (__clang__) || defined(__GNUC__) | 532 | #if defined (__clang__) || defined(__GNUC__) |
533 | #define Z7_DIAGNOSCTIC_IGNORE_BEGIN_CAST_QUAL \ | 533 | #define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \ |
534 | _Pragma("GCC diagnostic push") \ | 534 | _Pragma("GCC diagnostic push") \ |
535 | _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") | 535 | _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") |
536 | #define Z7_DIAGNOSCTIC_IGNORE_END_CAST_QUAL \ | 536 | #define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL \ |
537 | _Pragma("GCC diagnostic pop") | 537 | _Pragma("GCC diagnostic pop") |
538 | #else | 538 | #else |
539 | #define Z7_DIAGNOSCTIC_IGNORE_BEGIN_CAST_QUAL | 539 | #define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL |
540 | #define Z7_DIAGNOSCTIC_IGNORE_END_CAST_QUAL | 540 | #define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL |
541 | #endif | 541 | #endif |
542 | 542 | ||
543 | #define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(ptr, type, m, p) \ | 543 | #define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(ptr, type, m, p) \ |
544 | Z7_DIAGNOSCTIC_IGNORE_BEGIN_CAST_QUAL \ | 544 | Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \ |
545 | type *p = Z7_CONTAINER_FROM_VTBL(ptr, type, m); \ | 545 | type *p = Z7_CONTAINER_FROM_VTBL(ptr, type, m); \ |
546 | Z7_DIAGNOSCTIC_IGNORE_END_CAST_QUAL | 546 | Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL |
547 | 547 | ||
548 | #define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(type) \ | 548 | #define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(type) \ |
549 | Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(pp, type, vt, p) | 549 | Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(pp, type, vt, p) |
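The 7zTypes.h change is purely a spelling fix (DIAGNOSCTIC to DIAGNOSTIC) with no behavioral effect. For context, the GCC/clang branch of these macros brackets a const-dropping cast with a push/ignore/pop of `-Wcast-qual`; a minimal sketch with illustrative macro names:

```c
#if defined(__clang__) || defined(__GNUC__)
#define IGNORE_CAST_QUAL_BEGIN \
  _Pragma("GCC diagnostic push") \
  _Pragma("GCC diagnostic ignored \"-Wcast-qual\"")
#define IGNORE_CAST_QUAL_END \
  _Pragma("GCC diagnostic pop")
#else
#define IGNORE_CAST_QUAL_BEGIN
#define IGNORE_CAST_QUAL_END
#endif

/* deliberately cast away const without tripping -Wcast-qual */
static char *unconst(const char *s)
{
  IGNORE_CAST_QUAL_BEGIN
  char *m = (char *)s;
  IGNORE_CAST_QUAL_END
  return m;
}
```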
diff --git a/C/7zVersion.h b/C/7zVersion.h
index 7549239..72b915a 100644
--- a/C/7zVersion.h
+++ b/C/7zVersion.h
@@ -1,7 +1,7 @@ | |||
1 | #define MY_VER_MAJOR 23 | 1 | #define MY_VER_MAJOR 24 |
2 | #define MY_VER_MINOR 01 | 2 | #define MY_VER_MINOR 05 |
3 | #define MY_VER_BUILD 0 | 3 | #define MY_VER_BUILD 0 |
4 | #define MY_VERSION_NUMBERS "23.01" | 4 | #define MY_VERSION_NUMBERS "24.05" |
5 | #define MY_VERSION MY_VERSION_NUMBERS | 5 | #define MY_VERSION MY_VERSION_NUMBERS |
6 | 6 | ||
7 | #ifdef MY_CPU_NAME | 7 | #ifdef MY_CPU_NAME |
@@ -10,12 +10,12 @@ | |||
10 | #define MY_VERSION_CPU MY_VERSION | 10 | #define MY_VERSION_CPU MY_VERSION |
11 | #endif | 11 | #endif |
12 | 12 | ||
13 | #define MY_DATE "2023-06-20" | 13 | #define MY_DATE "2024-05-14" |
14 | #undef MY_COPYRIGHT | 14 | #undef MY_COPYRIGHT |
15 | #undef MY_VERSION_COPYRIGHT_DATE | 15 | #undef MY_VERSION_COPYRIGHT_DATE |
16 | #define MY_AUTHOR_NAME "Igor Pavlov" | 16 | #define MY_AUTHOR_NAME "Igor Pavlov" |
17 | #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" | 17 | #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" |
18 | #define MY_COPYRIGHT_CR "Copyright (c) 1999-2023 Igor Pavlov" | 18 | #define MY_COPYRIGHT_CR "Copyright (c) 1999-2024 Igor Pavlov" |
19 | 19 | ||
20 | #ifdef USE_COPYRIGHT_CR | 20 | #ifdef USE_COPYRIGHT_CR |
21 | #define MY_COPYRIGHT MY_COPYRIGHT_CR | 21 | #define MY_COPYRIGHT MY_COPYRIGHT_CR |
diff --git a/C/7zip_gcc_c.mak b/C/7zip_gcc_c.mak
index f19a99b..195d23d 100644
--- a/C/7zip_gcc_c.mak
+++ b/C/7zip_gcc_c.mak
@@ -22,8 +22,8 @@ CFLAGS_BASE_LIST = -c | |||
22 | # for ASM file | 22 | # for ASM file |
23 | # CFLAGS_BASE_LIST = -S | 23 | # CFLAGS_BASE_LIST = -S |
24 | 24 | ||
25 | FLAGS_FLTO = | ||
26 | FLAGS_FLTO = -flto | 25 | FLAGS_FLTO = -flto |
26 | FLAGS_FLTO = | ||
27 | 27 | ||
28 | CFLAGS_BASE = $(MY_ARCH_2) -O2 $(CFLAGS_BASE_LIST) $(CFLAGS_WARN_WALL) $(CFLAGS_WARN) \ | 28 | CFLAGS_BASE = $(MY_ARCH_2) -O2 $(CFLAGS_BASE_LIST) $(CFLAGS_WARN_WALL) $(CFLAGS_WARN) \ |
29 | -DNDEBUG -D_REENTRANT -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE | 29 | -DNDEBUG -D_REENTRANT -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE |
@@ -329,7 +329,7 @@ endif | |||
329 | 329 | ||
330 | ifdef IS_ARM64 | 330 | ifdef IS_ARM64 |
331 | $O/LzmaDecOpt.o: ../../../Asm/arm64/LzmaDecOpt.S ../../../Asm/arm64/7zAsm.S | 331 | $O/LzmaDecOpt.o: ../../../Asm/arm64/LzmaDecOpt.S ../../../Asm/arm64/7zAsm.S |
332 | $(CC) $(CFLAGS) $< | 332 | $(CC) $(CFLAGS) $(ASM_FLAGS) $< |
333 | endif | 333 | endif |
334 | 334 | ||
335 | $O/LzmaDec.o: ../../LzmaDec.c | 335 | $O/LzmaDec.o: ../../LzmaDec.c |
diff --git a/C/Aes.c b/C/Aes.c
@@ -1,5 +1,5 @@ | |||
1 | /* Aes.c -- AES encryption / decryption | 1 | /* Aes.c -- AES encryption / decryption |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -13,7 +13,9 @@ AES_CODE_FUNC g_AesCtr_Code; | |||
13 | UInt32 g_Aes_SupportedFunctions_Flags; | 13 | UInt32 g_Aes_SupportedFunctions_Flags; |
14 | #endif | 14 | #endif |
15 | 15 | ||
16 | MY_ALIGN(64) | ||
16 | static UInt32 T[256 * 4]; | 17 | static UInt32 T[256 * 4]; |
18 | MY_ALIGN(64) | ||
17 | static const Byte Sbox[256] = { | 19 | static const Byte Sbox[256] = { |
18 | 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, | 20 | 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, |
19 | 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, | 21 | 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, |
@@ -33,7 +35,9 @@ static const Byte Sbox[256] = { | |||
33 | 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; | 35 | 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; |
34 | 36 | ||
35 | 37 | ||
38 | MY_ALIGN(64) | ||
36 | static UInt32 D[256 * 4]; | 39 | static UInt32 D[256 * 4]; |
40 | MY_ALIGN(64) | ||
37 | static Byte InvS[256]; | 41 | static Byte InvS[256]; |
38 | 42 | ||
39 | #define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF) | 43 | #define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF) |
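The `MY_ALIGN(64)` additions pin each AES lookup table to a 64-byte (cache line) boundary, so hot table loads never straddle two lines. `MY_ALIGN` is 7-Zip's compiler-portability wrapper; the C11 equivalent of the declarations above:

```c
#include <stdalign.h>
#include <stdint.h>

alignas(64) static uint32_t T_demo[256 * 4];  /* encryption tables */
alignas(64) static uint8_t  Sbox_demo[256];   /* forward S-box */
alignas(64) static uint32_t D_demo[256 * 4];  /* decryption tables */
alignas(64) static uint8_t  InvS_demo[256];   /* inverse S-box */
```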
@@ -54,24 +58,54 @@ static Byte InvS[256]; | |||
54 | // #define Z7_SHOW_AES_STATUS | 58 | // #define Z7_SHOW_AES_STATUS |
55 | 59 | ||
56 | #ifdef MY_CPU_X86_OR_AMD64 | 60 | #ifdef MY_CPU_X86_OR_AMD64 |
57 | #define USE_HW_AES | 61 | |
58 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) | 62 | #if defined(__INTEL_COMPILER) |
59 | #if defined(__clang__) | 63 | #if (__INTEL_COMPILER >= 1110) |
60 | #if (__clang_major__ >= 8) // fix that check | ||
61 | #define USE_HW_AES | ||
62 | #endif | ||
63 | #elif defined(__GNUC__) | ||
64 | #if (__GNUC__ >= 6) // fix that check | ||
65 | #define USE_HW_AES | 64 | #define USE_HW_AES |
65 | #if (__INTEL_COMPILER >= 1900) | ||
66 | #define USE_HW_VAES | ||
67 | #endif | ||
66 | #endif | 68 | #endif |
69 | #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
70 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400) | ||
71 | #define USE_HW_AES | ||
72 | #if defined(__clang__) && (__clang_major__ >= 8) \ | ||
73 | || defined(__GNUC__) && (__GNUC__ >= 8) | ||
74 | #define USE_HW_VAES | ||
75 | #endif | ||
67 | #elif defined(_MSC_VER) | 76 | #elif defined(_MSC_VER) |
68 | #if _MSC_VER >= 1910 | 77 | #define USE_HW_AES |
78 | #define USE_HW_VAES | ||
79 | #endif | ||
80 | |||
81 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) | ||
82 | |||
83 | #if defined(__ARM_FEATURE_AES) \ | ||
84 | || defined(__ARM_FEATURE_CRYPTO) | ||
85 | #define USE_HW_AES | ||
86 | #else | ||
87 | #if defined(MY_CPU_ARM64) \ | ||
88 | || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ | ||
89 | || defined(Z7_MSC_VER_ORIGINAL) | ||
90 | #if defined(__ARM_FP) && \ | ||
91 | ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
92 | || defined(__GNUC__) && (__GNUC__ >= 6) \ | ||
93 | ) \ | ||
94 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) | ||
95 | #if defined(MY_CPU_ARM64) \ | ||
96 | || !defined(Z7_CLANG_VERSION) \ | ||
97 | || defined(__ARM_NEON) && \ | ||
98 | (Z7_CLANG_VERSION < 170000 || \ | ||
99 | Z7_CLANG_VERSION > 170001) | ||
69 | #define USE_HW_AES | 100 | #define USE_HW_AES |
70 | #endif | 101 | #endif |
102 | #endif | ||
103 | #endif | ||
71 | #endif | 104 | #endif |
72 | #endif | 105 | #endif |
73 | 106 | ||
74 | #ifdef USE_HW_AES | 107 | #ifdef USE_HW_AES |
108 | // #pragma message("=== Aes.c USE_HW_AES === ") | ||
75 | #ifdef Z7_SHOW_AES_STATUS | 109 | #ifdef Z7_SHOW_AES_STATUS |
76 | #include <stdio.h> | 110 | #include <stdio.h> |
77 | #define PRF(x) x | 111 | #define PRF(x) x |
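Note the division of labor in the rewritten detection ladder: the `#if` chain only decides whether the hardware-AES paths can be compiled at all; whether they actually run is settled at startup by the CpuArch.h runtime checks. Roughly, the shape of that check inside `AesGenTables` (a sketch; the pointer and function names follow the ones used elsewhere in Aes.c and AesOpt.c):

```c
#ifdef USE_HW_AES
  if (CPU_IsSupported_AES())
  {
    /* swap in the hardware implementations from AesOpt.c */
    g_AesCbc_Encode = AesCbc_Encode_HW;
    g_AesCbc_Decode = AesCbc_Decode_HW;
    g_AesCtr_Code   = AesCtr_Code_HW;
  }
#endif
```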
@@ -136,6 +170,7 @@ void AesGenTables(void) | |||
136 | #endif | 170 | #endif |
137 | 171 | ||
138 | #ifdef MY_CPU_X86_OR_AMD64 | 172 | #ifdef MY_CPU_X86_OR_AMD64 |
173 | #ifdef USE_HW_VAES | ||
139 | if (CPU_IsSupported_VAES_AVX2()) | 174 | if (CPU_IsSupported_VAES_AVX2()) |
140 | { | 175 | { |
141 | PRF(printf("\n===vaes avx2\n")); | 176 | PRF(printf("\n===vaes avx2\n")); |
@@ -146,6 +181,7 @@ void AesGenTables(void) | |||
146 | #endif | 181 | #endif |
147 | } | 182 | } |
148 | #endif | 183 | #endif |
184 | #endif | ||
149 | } | 185 | } |
150 | #endif | 186 | #endif |
151 | 187 | ||
diff --git a/C/AesOpt.c b/C/AesOpt.c
@@ -1,5 +1,5 @@ | |||
1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions | 1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -15,8 +15,8 @@ | |||
15 | #define USE_INTEL_VAES | 15 | #define USE_INTEL_VAES |
16 | #endif | 16 | #endif |
17 | #endif | 17 | #endif |
18 | #elif defined(__clang__) && (__clang_major__ > 3 || __clang_major__ == 3 && __clang_minor__ >= 8) \ | 18 | #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ |
19 | || defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4) | 19 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400) |
20 | #define USE_INTEL_AES | 20 | #define USE_INTEL_AES |
21 | #if !defined(__AES__) | 21 | #if !defined(__AES__) |
22 | #define ATTRIB_AES __attribute__((__target__("aes"))) | 22 | #define ATTRIB_AES __attribute__((__target__("aes"))) |
@@ -35,27 +35,37 @@ | |||
35 | #define USE_INTEL_VAES | 35 | #define USE_INTEL_VAES |
36 | #endif | 36 | #endif |
37 | #endif | 37 | #endif |
38 | #ifndef USE_INTEL_AES | ||
39 | #define Z7_USE_AES_HW_STUB | ||
40 | #endif | ||
41 | #ifndef USE_INTEL_VAES | ||
42 | #define Z7_USE_VAES_HW_STUB | ||
43 | #endif | ||
38 | #endif | 44 | #endif |
39 | 45 | ||
40 | #ifndef ATTRIB_AES | 46 | #ifndef USE_INTEL_AES |
41 | #define ATTRIB_AES | 47 | // #define Z7_USE_AES_HW_STUB // for debug |
42 | #endif | 48 | #endif |
43 | #ifndef ATTRIB_VAES | 49 | #ifndef USE_INTEL_VAES |
44 | #define ATTRIB_VAES | 50 | // #define Z7_USE_VAES_HW_STUB // for debug |
45 | #endif | 51 | #endif |
46 | 52 | ||
47 | 53 | ||
48 | #ifdef USE_INTEL_AES | 54 | #ifdef USE_INTEL_AES |
49 | 55 | ||
50 | #include <wmmintrin.h> | 56 | #include <wmmintrin.h> |
51 | 57 | ||
52 | #ifndef USE_INTEL_VAES | 58 | #if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB) |
53 | #define AES_TYPE_keys UInt32 | 59 | #define AES_TYPE_keys UInt32 |
54 | #define AES_TYPE_data Byte | 60 | #define AES_TYPE_data Byte |
55 | // #define AES_TYPE_keys __m128i | 61 | // #define AES_TYPE_keys __m128i |
56 | // #define AES_TYPE_data __m128i | 62 | // #define AES_TYPE_data __m128i |
57 | #endif | 63 | #endif |
58 | 64 | ||
65 | #ifndef ATTRIB_AES | ||
66 | #define ATTRIB_AES | ||
67 | #endif | ||
68 | |||
59 | #define AES_FUNC_START(name) \ | 69 | #define AES_FUNC_START(name) \ |
60 | void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) | 70 | void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) |
61 | // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks) | 71 | // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks) |
@@ -69,8 +79,6 @@ AES_FUNC_START (name) | |||
69 | #define MM_OP_m(op, src) MM_OP(op, m, src) | 79 | #define MM_OP_m(op, src) MM_OP(op, m, src) |
70 | 80 | ||
71 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) | 81 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) |
72 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) | ||
73 | |||
74 | 82 | ||
75 | AES_FUNC_START2 (AesCbc_Encode_HW) | 83 | AES_FUNC_START2 (AesCbc_Encode_HW) |
76 | { | 84 | { |
@@ -139,11 +147,6 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
139 | #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) | 147 | #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) |
140 | #endif | 148 | #endif |
141 | 149 | ||
142 | #define AVX_DECLARE_VAR(reg, ii) __m256i reg; | ||
143 | #define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; | ||
144 | #define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; | ||
145 | #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) | ||
146 | |||
147 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); | 150 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); |
148 | 151 | ||
149 | #define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg) | 152 | #define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg) |
@@ -152,27 +155,13 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
152 | #define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg) | 155 | #define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg) |
153 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) | 156 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) |
154 | 157 | ||
155 | |||
156 | #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) | ||
157 | #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) | ||
158 | #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) | ||
159 | #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) | ||
160 | #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) | ||
161 | |||
162 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; | 158 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; |
163 | #define CTR_END( reg, ii) MM_XOR (data[ii], reg) | 159 | #define CTR_END( reg, ii) MM_XOR (data[ii], reg) |
164 | 160 | ||
165 | #define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); | ||
166 | #define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) | ||
167 | |||
168 | #define WOP_KEY(op, n) { \ | 161 | #define WOP_KEY(op, n) { \ |
169 | const __m128i key = w[n]; \ | 162 | const __m128i key = w[n]; \ |
170 | WOP(op); } | 163 | WOP(op); } |
171 | 164 | ||
172 | #define AVX_WOP_KEY(op, n) { \ | ||
173 | const __m256i key = w[n]; \ | ||
174 | WOP(op); } | ||
175 | |||
176 | 165 | ||
177 | #define WIDE_LOOP_START \ | 166 | #define WIDE_LOOP_START \ |
178 | dataEnd = data + numBlocks; \ | 167 | dataEnd = data + numBlocks; \ |
@@ -190,6 +179,40 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
190 | for (; data < dataEnd; data++) | 179 | for (; data < dataEnd; data++) |
191 | 180 | ||
192 | 181 | ||
182 | |||
183 | #ifdef USE_INTEL_VAES | ||
184 | |||
185 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) | ||
186 | #define AVX_DECLARE_VAR(reg, ii) __m256i reg; | ||
187 | #define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; | ||
188 | #define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; | ||
189 | /* | ||
190 | AVX_XOR_data_M1() needs unaligned memory load | ||
191 | if (we don't use _mm256_loadu_si256() here) | ||
192 | { | ||
193 | Most compilers with optimizations enabled generate a fused AVX (LOAD + OP) | ||
194 | instruction that can load unaligned data. | ||
195 | But GCC and CLANG without -O1/-O2 optimizations can generate a separate | ||
196 | LOAD-ALIGNED (vmovdqa) instruction that will fail on execution. | ||
197 | } | ||
198 | Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here. | ||
199 | v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler. | ||
200 | */ | ||
201 | #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii]))) | ||
202 | // for debug only: the following code will fail on execution, if compiled by some compilers: | ||
203 | // #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) | ||
204 | |||
205 | #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) | ||
206 | #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) | ||
207 | #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) | ||
208 | #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) | ||
209 | #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) | ||
210 | #define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); | ||
211 | #define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) | ||
212 | #define AVX_WOP_KEY(op, n) { \ | ||
213 | const __m256i key = w[n]; \ | ||
214 | WOP(op); } | ||
215 | |||
193 | #define NUM_AES_KEYS_MAX 15 | 216 | #define NUM_AES_KEYS_MAX 15 |
194 | 217 | ||
195 | #define WIDE_LOOP_START_AVX(OP) \ | 218 | #define WIDE_LOOP_START_AVX(OP) \ |
@@ -214,6 +237,9 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
214 | /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, | 237 | /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, |
215 | MSVC still can insert vzeroupper instruction. */ | 238 | MSVC still can insert vzeroupper instruction. */ |
216 | 239 | ||
240 | #endif | ||
241 | |||
242 | |||
217 | 243 | ||
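The AVX_XOR_data_M1() comment above deserves a standalone illustration. A minimal sketch, assuming an AVX2 target built with -mavx2; the helper name is illustrative and not part of this commit:

#include <immintrin.h>
#include <stddef.h>

// 'data' points to 16-byte-aligned 128-bit blocks, so '(data - 1)' viewed as a
// __m256i array is only 16-byte aligned: an aligned 256-bit load (vmovdqa) can
// fault here, while _mm256_loadu_si256() always expresses an unaligned load.
static __m256i xor_prev_unaligned(__m256i reg, const __m128i *data, size_t ii)
{
  const __m256i prev =
      _mm256_loadu_si256(((const __m256i *)(const void *)(data - 1)) + ii);
  return _mm256_xor_si256(reg, prev);  // the operation AVX_XOR_data_M1 performs
}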
218 | AES_FUNC_START2 (AesCbc_Decode_HW) | 244 | AES_FUNC_START2 (AesCbc_Decode_HW) |
219 | { | 245 | { |
@@ -380,6 +406,9 @@ required that <immintrin.h> must be included before <avxintrin.h>. | |||
380 | #endif | 406 | #endif |
381 | #endif // __clang__ && _MSC_VER | 407 | #endif // __clang__ && _MSC_VER |
382 | 408 | ||
409 | #ifndef ATTRIB_VAES | ||
410 | #define ATTRIB_VAES | ||
411 | #endif | ||
383 | 412 | ||
384 | #define VAES_FUNC_START2(name) \ | 413 | #define VAES_FUNC_START2(name) \ |
385 | AES_FUNC_START (name); \ | 414 | AES_FUNC_START (name); \ |
@@ -519,10 +548,18 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
519 | 548 | ||
520 | /* no USE_INTEL_AES */ | 549 | /* no USE_INTEL_AES */ |
521 | 550 | ||
551 | #if defined(Z7_USE_AES_HW_STUB) | ||
552 | // This file can be compiled with another C compiler, | ||
553 | // or an asm version can be compiled instead, | ||
554 | // so real code can be generated in place of this stub function. | ||
555 | // #if defined(_MSC_VER) | ||
522 | #pragma message("AES HW_SW stub was used") | 556 | #pragma message("AES HW_SW stub was used") |
557 | // #endif | ||
523 | 558 | ||
559 | #if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB) | ||
524 | #define AES_TYPE_keys UInt32 | 560 | #define AES_TYPE_keys UInt32 |
525 | #define AES_TYPE_data Byte | 561 | #define AES_TYPE_data Byte |
562 | #endif | ||
526 | 563 | ||
527 | #define AES_FUNC_START(name) \ | 564 | #define AES_FUNC_START(name) \ |
528 | void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \ | 565 | void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \ |
@@ -535,13 +572,16 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
535 | AES_COMPAT_STUB (AesCbc_Encode) | 572 | AES_COMPAT_STUB (AesCbc_Encode) |
536 | AES_COMPAT_STUB (AesCbc_Decode) | 573 | AES_COMPAT_STUB (AesCbc_Decode) |
537 | AES_COMPAT_STUB (AesCtr_Code) | 574 | AES_COMPAT_STUB (AesCtr_Code) |
575 | #endif // Z7_USE_AES_HW_STUB | ||
538 | 576 | ||
539 | #endif // USE_INTEL_AES | 577 | #endif // USE_INTEL_AES |
540 | 578 | ||
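The AES_COMPAT_STUB body itself is elided from this hunk, so the following is only an assumption-laden sketch of the stub pattern, not the commit's code: the _HW entry point is emitted as a thin wrapper over the portable software routine, so callers can always link against the _HW name.

// hypothetical forward declaration of the portable C implementation
void Z7_FASTCALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks);

void Z7_FASTCALL AesCbc_Encode_HW(UInt32 *p, Byte *data, size_t numBlocks)
{
  // no hardware AES at build time: fall back to the software code path
  AesCbc_Encode(p, data, numBlocks);
}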
541 | 579 | ||
542 | #ifndef USE_INTEL_VAES | 580 | #ifndef USE_INTEL_VAES |
543 | 581 | #if defined(Z7_USE_VAES_HW_STUB) | |
582 | // #if defined(_MSC_VER) | ||
544 | #pragma message("VAES HW_SW stub was used") | 583 | #pragma message("VAES HW_SW stub was used") |
584 | // #endif | ||
545 | 585 | ||
546 | #define VAES_COMPAT_STUB(name) \ | 586 | #define VAES_COMPAT_STUB(name) \ |
547 | void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ | 587 | void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ |
@@ -550,36 +590,59 @@ AES_COMPAT_STUB (AesCtr_Code) | |||
550 | 590 | ||
551 | VAES_COMPAT_STUB (AesCbc_Decode_HW) | 591 | VAES_COMPAT_STUB (AesCbc_Decode_HW) |
552 | VAES_COMPAT_STUB (AesCtr_Code_HW) | 592 | VAES_COMPAT_STUB (AesCtr_Code_HW) |
553 | 593 | #endif | |
554 | #endif // ! USE_INTEL_VAES | 594 | #endif // ! USE_INTEL_VAES |
555 | 595 | ||
556 | 596 | ||
597 | |||
598 | |||
557 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) | 599 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) |
558 | 600 | ||
559 | #if defined(__clang__) | 601 | #if defined(__ARM_FEATURE_AES) \ |
560 | #if (__clang_major__ >= 8) // fix that check | 602 | || defined(__ARM_FEATURE_CRYPTO) |
603 | #define USE_HW_AES | ||
604 | #else | ||
605 | #if defined(MY_CPU_ARM64) \ | ||
606 | || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ | ||
607 | || defined(Z7_MSC_VER_ORIGINAL) | ||
608 | #if defined(__ARM_FP) && \ | ||
609 | ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
610 | || defined(__GNUC__) && (__GNUC__ >= 6) \ | ||
611 | ) \ | ||
612 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) | ||
613 | #if defined(MY_CPU_ARM64) \ | ||
614 | || !defined(Z7_CLANG_VERSION) \ | ||
615 | || defined(__ARM_NEON) && \ | ||
616 | (Z7_CLANG_VERSION < 170000 || \ | ||
617 | Z7_CLANG_VERSION > 170001) | ||
561 | #define USE_HW_AES | 618 | #define USE_HW_AES |
562 | #endif | 619 | #endif |
563 | #elif defined(__GNUC__) | ||
564 | #if (__GNUC__ >= 6) // fix that check | ||
565 | #define USE_HW_AES | ||
566 | #endif | 620 | #endif |
567 | #elif defined(_MSC_VER) | ||
568 | #if _MSC_VER >= 1910 | ||
569 | #define USE_HW_AES | ||
570 | #endif | 621 | #endif |
571 | #endif | 622 | #endif |
572 | 623 | ||
573 | #ifdef USE_HW_AES | 624 | #ifdef USE_HW_AES |
574 | 625 | ||
575 | // #pragma message("=== AES HW === ") | 626 | // #pragma message("=== AES HW === ") |
627 | // __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_AES | ||
576 | 628 | ||
577 | #if defined(__clang__) || defined(__GNUC__) | 629 | #if defined(__clang__) || defined(__GNUC__) |
630 | #if !defined(__ARM_FEATURE_AES) && \ | ||
631 | !defined(__ARM_FEATURE_CRYPTO) | ||
578 | #ifdef MY_CPU_ARM64 | 632 | #ifdef MY_CPU_ARM64 |
633 | #if defined(__clang__) | ||
634 | #define ATTRIB_AES __attribute__((__target__("crypto"))) | ||
635 | #else | ||
579 | #define ATTRIB_AES __attribute__((__target__("+crypto"))) | 636 | #define ATTRIB_AES __attribute__((__target__("+crypto"))) |
637 | #endif | ||
580 | #else | 638 | #else |
639 | #if defined(__clang__) | ||
640 | #define ATTRIB_AES __attribute__((__target__("armv8-a,aes"))) | ||
641 | #else | ||
581 | #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | 642 | #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) |
643 | #endif | ||
582 | #endif | 644 | #endif |
645 | #endif | ||
583 | #else | 646 | #else |
584 | // _MSC_VER | 647 | // _MSC_VER |
585 | // for arm32 | 648 | // for arm32 |
@@ -590,12 +653,60 @@ VAES_COMPAT_STUB (AesCtr_Code_HW) | |||
590 | #define ATTRIB_AES | 653 | #define ATTRIB_AES |
591 | #endif | 654 | #endif |
592 | 655 | ||
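For readers unfamiliar with function-level target attributes, a minimal sketch of what ATTRIB_AES enables, assuming an AArch64 GCC toolchain; the helper name is illustrative: the attribute turns on the AES instructions for a single function, so the rest of the file can still be compiled for a baseline CPU.

#include <arm_neon.h>

__attribute__((__target__("+crypto")))  // what ATTRIB_AES expands to for GCC/arm64
static uint8x16_t aes_middle_round(uint8x16_t state, uint8x16_t round_key)
{
  // AESE = AddRoundKey + SubBytes + ShiftRows; AESMC = MixColumns
  return vaesmcq_u8(vaeseq_u8(state, round_key));
}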
593 | #if defined(_MSC_VER) && defined(MY_CPU_ARM64) | 656 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) |
594 | #include <arm64_neon.h> | 657 | #include <arm64_neon.h> |
595 | #else | 658 | #else |
659 | /* | ||
660 | clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese | ||
661 | clang | ||
662 | 3.8.1 : __ARM_NEON : defined(__ARM_FEATURE_CRYPTO) | ||
663 | 7.0.1 : __ARM_NEON : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) | ||
664 | 11.?.0 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) | ||
665 | 13.0.1 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES) | ||
666 | 16 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 | ||
667 | */ | ||
668 | #if defined(__clang__) && __clang_major__ < 16 | ||
669 | #if !defined(__ARM_FEATURE_AES) && \ | ||
670 | !defined(__ARM_FEATURE_CRYPTO) | ||
671 | // #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ") | ||
672 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
673 | #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1 | ||
674 | // #if defined(__clang__) && __clang_major__ < 13 | ||
675 | #define __ARM_FEATURE_CRYPTO 1 | ||
676 | // #else | ||
677 | #define __ARM_FEATURE_AES 1 | ||
678 | // #endif | ||
679 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
680 | #endif | ||
681 | #endif // clang | ||
682 | |||
683 | #if defined(__clang__) | ||
684 | |||
685 | #if defined(__ARM_ARCH) && __ARM_ARCH < 8 | ||
686 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
687 | // #pragma message("#define __ARM_ARCH 8") | ||
688 | #undef __ARM_ARCH | ||
689 | #define __ARM_ARCH 8 | ||
690 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
691 | #endif | ||
692 | |||
693 | #endif // clang | ||
694 | |||
596 | #include <arm_neon.h> | 695 | #include <arm_neon.h> |
696 | |||
697 | #if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \ | ||
698 | defined(__ARM_FEATURE_CRYPTO) && \ | ||
699 | defined(__ARM_FEATURE_AES) | ||
700 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
701 | #undef __ARM_FEATURE_CRYPTO | ||
702 | #undef __ARM_FEATURE_AES | ||
703 | #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET | ||
704 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
705 | // #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") | ||
597 | #endif | 706 | #endif |
598 | 707 | ||
708 | #endif // Z7_MSC_VER_ORIGINAL | ||
709 | |||
599 | typedef uint8x16_t v128; | 710 | typedef uint8x16_t v128; |
600 | 711 | ||
601 | #define AES_FUNC_START(name) \ | 712 | #define AES_FUNC_START(name) \ |
@@ -620,7 +731,7 @@ AES_FUNC_START (name) | |||
620 | 731 | ||
621 | AES_FUNC_START2 (AesCbc_Encode_HW) | 732 | AES_FUNC_START2 (AesCbc_Encode_HW) |
622 | { | 733 | { |
623 | v128 *p = (v128*)(void*)ivAes; | 734 | v128 * const p = (v128*)(void*)ivAes; |
624 | v128 *data = (v128*)(void*)data8; | 735 | v128 *data = (v128*)(void*)data8; |
625 | v128 m = *p; | 736 | v128 m = *p; |
626 | const v128 k0 = p[2]; | 737 | const v128 k0 = p[2]; |
@@ -639,7 +750,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
639 | const v128 k_z0 = w[2]; | 750 | const v128 k_z0 = w[2]; |
640 | for (; numBlocks != 0; numBlocks--, data++) | 751 | for (; numBlocks != 0; numBlocks--, data++) |
641 | { | 752 | { |
642 | MM_XOR_m (*data); | 753 | MM_XOR_m (*data) |
643 | AES_E_MC_m (k0) | 754 | AES_E_MC_m (k0) |
644 | AES_E_MC_m (k1) | 755 | AES_E_MC_m (k1) |
645 | AES_E_MC_m (k2) | 756 | AES_E_MC_m (k2) |
@@ -660,7 +771,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
660 | } | 771 | } |
661 | } | 772 | } |
662 | AES_E_m (k_z1) | 773 | AES_E_m (k_z1) |
663 | MM_XOR_m (k_z0); | 774 | MM_XOR_m (k_z0) |
664 | *data = m; | 775 | *data = m; |
665 | } | 776 | } |
666 | *p = m; | 777 | *p = m; |
@@ -745,7 +856,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
745 | while (w != p); | 856 | while (w != p); |
746 | WOP_KEY (AES_D, 1) | 857 | WOP_KEY (AES_D, 1) |
747 | WOP_KEY (AES_XOR, 0) | 858 | WOP_KEY (AES_XOR, 0) |
748 | MM_XOR (m0, iv); | 859 | MM_XOR (m0, iv) |
749 | WOP_M1 (XOR_data_M1) | 860 | WOP_M1 (XOR_data_M1) |
750 | iv = data[NUM_WAYS - 1]; | 861 | iv = data[NUM_WAYS - 1]; |
751 | WOP (STORE_data) | 862 | WOP (STORE_data) |
@@ -759,14 +870,14 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
759 | AES_D_IMC_m (w[2]) | 870 | AES_D_IMC_m (w[2]) |
760 | do | 871 | do |
761 | { | 872 | { |
762 | AES_D_IMC_m (w[1]); | 873 | AES_D_IMC_m (w[1]) |
763 | AES_D_IMC_m (w[0]); | 874 | AES_D_IMC_m (w[0]) |
764 | w -= 2; | 875 | w -= 2; |
765 | } | 876 | } |
766 | while (w != p); | 877 | while (w != p); |
767 | AES_D_m (w[1]); | 878 | AES_D_m (w[1]) |
768 | MM_XOR_m (w[0]); | 879 | MM_XOR_m (w[0]) |
769 | MM_XOR_m (iv); | 880 | MM_XOR_m (iv) |
770 | iv = *data; | 881 | iv = *data; |
771 | *data = m; | 882 | *data = m; |
772 | } | 883 | } |
@@ -783,6 +894,12 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
783 | const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; | 894 | const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; |
784 | const v128 *dataEnd; | 895 | const v128 *dataEnd; |
785 | uint64x2_t one = vdupq_n_u64(0); | 896 | uint64x2_t one = vdupq_n_u64(0); |
897 | |||
898 | // workaround for a bug in clang: | ||
899 | // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); | ||
900 | #if defined(__clang__) && (__clang_major__ <= 9) | ||
901 | #pragma GCC diagnostic ignored "-Wvector-conversion" | ||
902 | #endif | ||
786 | one = vsetq_lane_u64(1, one, 0); | 903 | one = vsetq_lane_u64(1, one, 0); |
787 | p += 2; | 904 | p += 2; |
788 | 905 | ||
@@ -809,11 +926,11 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
809 | { | 926 | { |
810 | const v128 *w = p; | 927 | const v128 *w = p; |
811 | v128 m; | 928 | v128 m; |
812 | CTR_START (m, 0); | 929 | CTR_START (m, 0) |
813 | do | 930 | do |
814 | { | 931 | { |
815 | AES_E_MC_m (w[0]); | 932 | AES_E_MC_m (w[0]) |
816 | AES_E_MC_m (w[1]); | 933 | AES_E_MC_m (w[1]) |
817 | w += 2; | 934 | w += 2; |
818 | } | 935 | } |
819 | while (w != wEnd); | 936 | while (w != wEnd); |
@@ -1,5 +1,5 @@ | |||
1 | /* Alloc.c -- Memory allocation functions | 1 | /* Alloc.c -- Memory allocation functions |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-02-18 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -10,19 +10,18 @@ | |||
10 | 10 | ||
11 | #include "Alloc.h" | 11 | #include "Alloc.h" |
12 | 12 | ||
13 | #ifdef _WIN32 | 13 | #if defined(Z7_LARGE_PAGES) && defined(_WIN32) && \ |
14 | #ifdef Z7_LARGE_PAGES | 14 | (!defined(Z7_WIN32_WINNT_MIN) || Z7_WIN32_WINNT_MIN < 0x0502) // < Win2003 (xp-64) |
15 | #if defined(__clang__) || defined(__GNUC__) | 15 | #define Z7_USE_DYN_GetLargePageMinimum |
16 | typedef void (*Z7_voidFunction)(void); | 16 | #endif |
17 | #define MY_CAST_FUNC (Z7_voidFunction) | 17 | |
18 | #elif defined(_MSC_VER) && _MSC_VER > 1920 | 18 | // for debug: |
19 | #define MY_CAST_FUNC (void *) | 19 | #if 0 |
20 | // #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' | 20 | #if defined(__CHERI__) && defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) |
21 | #else | 21 | // #pragma message("=== Z7_ALLOC_NO_OFFSET_ALLOCATOR === ") |
22 | #define MY_CAST_FUNC | 22 | #define Z7_ALLOC_NO_OFFSET_ALLOCATOR |
23 | #endif | ||
23 | #endif | 24 | #endif |
24 | #endif // Z7_LARGE_PAGES | ||
25 | #endif // _WIN32 | ||
26 | 25 | ||
27 | // #define SZ_ALLOC_DEBUG | 26 | // #define SZ_ALLOC_DEBUG |
28 | /* #define SZ_ALLOC_DEBUG */ | 27 | /* #define SZ_ALLOC_DEBUG */ |
@@ -146,7 +145,9 @@ static void PrintAddr(void *p) | |||
146 | #define PRINT_FREE(name, cnt, ptr) | 145 | #define PRINT_FREE(name, cnt, ptr) |
147 | #define Print(s) | 146 | #define Print(s) |
148 | #define PrintLn() | 147 | #define PrintLn() |
148 | #ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR | ||
149 | #define PrintHex(v, align) | 149 | #define PrintHex(v, align) |
150 | #endif | ||
150 | #define PrintAddr(p) | 151 | #define PrintAddr(p) |
151 | 152 | ||
152 | #endif | 153 | #endif |
@@ -246,9 +247,9 @@ void MidFree(void *address) | |||
246 | #ifdef Z7_LARGE_PAGES | 247 | #ifdef Z7_LARGE_PAGES |
247 | 248 | ||
248 | #ifdef MEM_LARGE_PAGES | 249 | #ifdef MEM_LARGE_PAGES |
249 | #define MY__MEM_LARGE_PAGES MEM_LARGE_PAGES | 250 | #define MY_MEM_LARGE_PAGES MEM_LARGE_PAGES |
250 | #else | 251 | #else |
251 | #define MY__MEM_LARGE_PAGES 0x20000000 | 252 | #define MY_MEM_LARGE_PAGES 0x20000000 |
252 | #endif | 253 | #endif |
253 | 254 | ||
254 | extern | 255 | extern |
@@ -258,19 +259,23 @@ typedef SIZE_T (WINAPI *Func_GetLargePageMinimum)(VOID); | |||
258 | 259 | ||
259 | void SetLargePageSize(void) | 260 | void SetLargePageSize(void) |
260 | { | 261 | { |
261 | #ifdef Z7_LARGE_PAGES | ||
262 | SIZE_T size; | 262 | SIZE_T size; |
263 | #ifdef Z7_USE_DYN_GetLargePageMinimum | ||
264 | Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION | ||
265 | |||
263 | const | 266 | const |
264 | Func_GetLargePageMinimum fn = | 267 | Func_GetLargePageMinimum fn = |
265 | (Func_GetLargePageMinimum) MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), | 268 | (Func_GetLargePageMinimum) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), |
266 | "GetLargePageMinimum"); | 269 | "GetLargePageMinimum"); |
267 | if (!fn) | 270 | if (!fn) |
268 | return; | 271 | return; |
269 | size = fn(); | 272 | size = fn(); |
273 | #else | ||
274 | size = GetLargePageMinimum(); | ||
275 | #endif | ||
270 | if (size == 0 || (size & (size - 1)) != 0) | 276 | if (size == 0 || (size & (size - 1)) != 0) |
271 | return; | 277 | return; |
272 | g_LargePageSize = size; | 278 | g_LargePageSize = size; |
273 | #endif | ||
274 | } | 279 | } |
275 | 280 | ||
276 | #endif // Z7_LARGE_PAGES | 281 | #endif // Z7_LARGE_PAGES |
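A condensed sketch of the Z7_USE_DYN_GetLargePageMinimum branch above, assuming the Win32 headers; the helper name is illustrative: when the build targets systems older than Win2003 (XP-64), GetLargePageMinimum() may be missing from kernel32.dll, so it is resolved at run time instead of being linked directly. The FARPROC cast is exactly the conversion that Z7_CAST_FUNC_C and Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION paper over.

#include <windows.h>

static SIZE_T QueryLargePageSize(void)
{
  typedef SIZE_T (WINAPI *Func_GetLargePageMinimum)(VOID);
  const HMODULE h = GetModuleHandle(TEXT("kernel32.dll"));
  Func_GetLargePageMinimum fn;
  if (!h)
    return 0;
  fn = (Func_GetLargePageMinimum)(void *)
      GetProcAddress(h, "GetLargePageMinimum");
  return fn ? fn() : 0;  // 0 means large pages are unavailable
}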
@@ -292,7 +297,7 @@ void *BigAlloc(size_t size) | |||
292 | size2 = (size + ps) & ~ps; | 297 | size2 = (size + ps) & ~ps; |
293 | if (size2 >= size) | 298 | if (size2 >= size) |
294 | { | 299 | { |
295 | void *p = VirtualAlloc(NULL, size2, MEM_COMMIT | MY__MEM_LARGE_PAGES, PAGE_READWRITE); | 300 | void *p = VirtualAlloc(NULL, size2, MEM_COMMIT | MY_MEM_LARGE_PAGES, PAGE_READWRITE); |
296 | if (p) | 301 | if (p) |
297 | { | 302 | { |
298 | PRINT_ALLOC("Alloc-BM ", g_allocCountMid, size2, p) | 303 | PRINT_ALLOC("Alloc-BM ", g_allocCountMid, size2, p) |
@@ -328,20 +333,7 @@ const ISzAlloc g_MidAlloc = { SzMidAlloc, SzMidFree }; | |||
328 | const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; | 333 | const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; |
329 | #endif | 334 | #endif |
330 | 335 | ||
331 | /* | 336 | #ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR |
332 | uintptr_t : <stdint.h> C99 (optional) | ||
333 | : unsupported in VS6 | ||
334 | */ | ||
335 | |||
336 | #ifdef _WIN32 | ||
337 | typedef UINT_PTR UIntPtr; | ||
338 | #else | ||
339 | /* | ||
340 | typedef uintptr_t UIntPtr; | ||
341 | */ | ||
342 | typedef ptrdiff_t UIntPtr; | ||
343 | #endif | ||
344 | |||
345 | 337 | ||
346 | #define ADJUST_ALLOC_SIZE 0 | 338 | #define ADJUST_ALLOC_SIZE 0 |
347 | /* | 339 | /* |
@@ -352,14 +344,36 @@ const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; | |||
352 | MyAlloc() can return address that is NOT multiple of sizeof(void *). | 344 | MyAlloc() can return address that is NOT multiple of sizeof(void *). |
353 | */ | 345 | */ |
354 | 346 | ||
355 | |||
356 | /* | 347 | /* |
357 | #define MY_ALIGN_PTR_DOWN(p, align) ((void *)((char *)(p) - ((size_t)(UIntPtr)(p) & ((align) - 1)))) | 348 | uintptr_t : <stdint.h> C99 (optional) |
349 | : unsupported in VS6 | ||
358 | */ | 350 | */ |
359 | #define MY_ALIGN_PTR_DOWN(p, align) ((void *)((((UIntPtr)(p)) & ~((UIntPtr)(align) - 1)))) | 351 | typedef |
352 | #ifdef _WIN32 | ||
353 | UINT_PTR | ||
354 | #elif 1 | ||
355 | uintptr_t | ||
356 | #else | ||
357 | ptrdiff_t | ||
358 | #endif | ||
359 | MY_uintptr_t; | ||
360 | |||
361 | #if 0 \ | ||
362 | || (defined(__CHERI__) \ | ||
363 | || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ > 8)) | ||
364 | // for 128-bit pointers (cheri): | ||
365 | #define MY_ALIGN_PTR_DOWN(p, align) \ | ||
366 | ((void *)((char *)(p) - ((size_t)(MY_uintptr_t)(p) & ((align) - 1)))) | ||
367 | #else | ||
368 | #define MY_ALIGN_PTR_DOWN(p, align) \ | ||
369 | ((void *)((((MY_uintptr_t)(p)) & ~((MY_uintptr_t)(align) - 1)))) | ||
370 | #endif | ||
360 | 371 | ||
372 | #endif | ||
361 | 373 | ||
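Both MY_ALIGN_PTR_DOWN forms round a pointer down to a power-of-two boundary. A small self-contained sketch in standard C (not code from this commit): the subtract form adjusts the pointer by a small integer offset instead of round-tripping the whole pointer through an integer type, which is why it is selected for CHERI-style capability pointers wider than 8 bytes.

#include <stddef.h>
#include <stdint.h>

static void *align_down_mask(void *p, size_t align)  // the default branch
{
  return (void *)((uintptr_t)p & ~(uintptr_t)(align - 1));
}

static void *align_down_sub(void *p, size_t align)   // the capability-safe branch
{
  // only the low bits pass through an integer type; the pointer itself is
  // moved with pointer arithmetic, preserving any capability metadata
  return (void *)((char *)p - ((size_t)(uintptr_t)p & (align - 1)));
}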
362 | #if !defined(_WIN32) && defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L) | 374 | #if !defined(_WIN32) \ |
375 | && (defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) \ | ||
376 | || defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L)) | ||
363 | #define USE_posix_memalign | 377 | #define USE_posix_memalign |
364 | #endif | 378 | #endif |
365 | 379 | ||
@@ -399,14 +413,13 @@ static int posix_memalign(void **ptr, size_t align, size_t size) | |||
399 | 413 | ||
400 | #define ALLOC_ALIGN_SIZE ((size_t)1 << 7) | 414 | #define ALLOC_ALIGN_SIZE ((size_t)1 << 7) |
401 | 415 | ||
402 | static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) | 416 | void *z7_AlignedAlloc(size_t size) |
403 | { | 417 | { |
404 | #ifndef USE_posix_memalign | 418 | #ifndef USE_posix_memalign |
405 | 419 | ||
406 | void *p; | 420 | void *p; |
407 | void *pAligned; | 421 | void *pAligned; |
408 | size_t newSize; | 422 | size_t newSize; |
409 | UNUSED_VAR(pp) | ||
410 | 423 | ||
411 | /* also we can allocate additional dummy ALLOC_ALIGN_SIZE bytes after aligned | 424 | /* also we can allocate additional dummy ALLOC_ALIGN_SIZE bytes after aligned |
412 | block to prevent cache line sharing with another allocated blocks */ | 425 | block to prevent cache line sharing with another allocated blocks */ |
@@ -431,10 +444,9 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) | |||
431 | 444 | ||
432 | return pAligned; | 445 | return pAligned; |
433 | 446 | ||
434 | #else | 447 | #else |
435 | 448 | ||
436 | void *p; | 449 | void *p; |
437 | UNUSED_VAR(pp) | ||
438 | if (posix_memalign(&p, ALLOC_ALIGN_SIZE, size)) | 450 | if (posix_memalign(&p, ALLOC_ALIGN_SIZE, size)) |
439 | return NULL; | 451 | return NULL; |
440 | 452 | ||
@@ -443,19 +455,37 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) | |||
443 | 455 | ||
444 | return p; | 456 | return p; |
445 | 457 | ||
446 | #endif | 458 | #endif |
459 | } | ||
460 | |||
461 | |||
462 | void z7_AlignedFree(void *address) | ||
463 | { | ||
464 | #ifndef USE_posix_memalign | ||
465 | if (address) | ||
466 | MyFree(((void **)address)[-1]); | ||
467 | #else | ||
468 | free(address); | ||
469 | #endif | ||
470 | } | ||
471 | |||
472 | |||
473 | static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) | ||
474 | { | ||
475 | UNUSED_VAR(pp) | ||
476 | return z7_AlignedAlloc(size); | ||
447 | } | 477 | } |
448 | 478 | ||
449 | 479 | ||
450 | static void SzAlignedFree(ISzAllocPtr pp, void *address) | 480 | static void SzAlignedFree(ISzAllocPtr pp, void *address) |
451 | { | 481 | { |
452 | UNUSED_VAR(pp) | 482 | UNUSED_VAR(pp) |
453 | #ifndef USE_posix_memalign | 483 | #ifndef USE_posix_memalign |
454 | if (address) | 484 | if (address) |
455 | MyFree(((void **)address)[-1]); | 485 | MyFree(((void **)address)[-1]); |
456 | #else | 486 | #else |
457 | free(address); | 487 | free(address); |
458 | #endif | 488 | #endif |
459 | } | 489 | } |
460 | 490 | ||
461 | 491 | ||
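Usage sketch for the new public pair; the size is arbitrary and the function name is illustrative. z7_AlignedAlloc() over-allocates, rounds the result up to ALLOC_ALIGN_SIZE (128 bytes), and, on the non-posix_memalign path, stores the raw base pointer just below the aligned block so that z7_AlignedFree() can recover it.

#include <string.h>

static int aligned_alloc_demo(void)
{
  void *p = z7_AlignedAlloc(1000);  // 128-byte aligned on success
  if (!p)
    return 1;
  memset(p, 0, 1000);
  z7_AlignedFree(p);                // frees via the stashed base pointer (or free())
  return 0;
}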
@@ -463,16 +493,44 @@ const ISzAlloc g_AlignedAlloc = { SzAlignedAlloc, SzAlignedFree }; | |||
463 | 493 | ||
464 | 494 | ||
465 | 495 | ||
466 | #define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *)) | ||
467 | |||
468 | /* we align ptr to support cases where CAlignOffsetAlloc::offset is not a multiple of sizeof(void *) */ | 496 |
469 | #define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1] | 497 | #ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR |
470 | /* | 498 | #if 1 |
471 | #define REAL_BLOCK_PTR_VAR(p) ((void **)(p))[-1] | 499 | #define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *)) |
472 | */ | 500 | #define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1] |
501 | #else | ||
502 | // we can use this simplified code, | ||
503 | // if (CAlignOffsetAlloc::offset == (k * sizeof(void *)) | ||
504 | #define REAL_BLOCK_PTR_VAR(p) (((void **)(p))[-1]) | ||
505 | #endif | ||
506 | #endif | ||
507 | |||
508 | |||
509 | #if 0 | ||
510 | #ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR | ||
511 | #include <stdio.h> | ||
512 | static void PrintPtr(const char *s, const void *p) | ||
513 | { | ||
514 | const Byte *p2 = (const Byte *)&p; | ||
515 | unsigned i; | ||
516 | printf("%s %p ", s, p); | ||
517 | for (i = sizeof(p); i != 0;) | ||
518 | { | ||
519 | i--; | ||
520 | printf("%02x", p2[i]); | ||
521 | } | ||
522 | printf("\n"); | ||
523 | } | ||
524 | #endif | ||
525 | #endif | ||
526 | |||
473 | 527 | ||
474 | static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) | 528 | static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) |
475 | { | 529 | { |
530 | #if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) | ||
531 | UNUSED_VAR(pp) | ||
532 | return z7_AlignedAlloc(size); | ||
533 | #else | ||
476 | const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); | 534 | const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); |
477 | void *adr; | 535 | void *adr; |
478 | void *pAligned; | 536 | void *pAligned; |
@@ -501,6 +559,12 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) | |||
501 | pAligned = (char *)MY_ALIGN_PTR_DOWN((char *)adr + | 559 | pAligned = (char *)MY_ALIGN_PTR_DOWN((char *)adr + |
502 | alignSize - p->offset + extra + ADJUST_ALLOC_SIZE, alignSize) + p->offset; | 560 | alignSize - p->offset + extra + ADJUST_ALLOC_SIZE, alignSize) + p->offset; |
503 | 561 | ||
562 | #if 0 | ||
563 | printf("\nalignSize = %6x, offset=%6x, size=%8x \n", (unsigned)alignSize, (unsigned)p->offset, (unsigned)size); | ||
564 | PrintPtr("base", adr); | ||
565 | PrintPtr("alig", pAligned); | ||
566 | #endif | ||
567 | |||
504 | PrintLn(); | 568 | PrintLn(); |
505 | Print("- Aligned: "); | 569 | Print("- Aligned: "); |
506 | Print(" size="); PrintHex(size, 8); | 570 | Print(" size="); PrintHex(size, 8); |
@@ -512,11 +576,16 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) | |||
512 | REAL_BLOCK_PTR_VAR(pAligned) = adr; | 576 | REAL_BLOCK_PTR_VAR(pAligned) = adr; |
513 | 577 | ||
514 | return pAligned; | 578 | return pAligned; |
579 | #endif | ||
515 | } | 580 | } |
516 | 581 | ||
517 | 582 | ||
518 | static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address) | 583 | static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address) |
519 | { | 584 | { |
585 | #if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) | ||
586 | UNUSED_VAR(pp) | ||
587 | z7_AlignedFree(address); | ||
588 | #else | ||
520 | if (address) | 589 | if (address) |
521 | { | 590 | { |
522 | const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); | 591 | const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); |
@@ -525,6 +594,7 @@ static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address) | |||
525 | PrintLn(); | 594 | PrintLn(); |
526 | ISzAlloc_Free(p->baseAlloc, REAL_BLOCK_PTR_VAR(address)); | 595 | ISzAlloc_Free(p->baseAlloc, REAL_BLOCK_PTR_VAR(address)); |
527 | } | 596 | } |
597 | #endif | ||
528 | } | 598 | } |
529 | 599 | ||
530 | 600 | ||
@@ -1,5 +1,5 @@ | |||
1 | /* Alloc.h -- Memory allocation functions | 1 | /* Alloc.h -- Memory allocation functions |
2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-22 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_ALLOC_H | 4 | #ifndef ZIP7_INC_ALLOC_H |
5 | #define ZIP7_INC_ALLOC_H | 5 | #define ZIP7_INC_ALLOC_H |
@@ -22,6 +22,9 @@ void *MyAlloc(size_t size); | |||
22 | void MyFree(void *address); | 22 | void MyFree(void *address); |
23 | void *MyRealloc(void *address, size_t size); | 23 | void *MyRealloc(void *address, size_t size); |
24 | 24 | ||
25 | void *z7_AlignedAlloc(size_t size); | ||
26 | void z7_AlignedFree(void *p); | ||
27 | |||
25 | #ifdef _WIN32 | 28 | #ifdef _WIN32 |
26 | 29 | ||
27 | #ifdef Z7_LARGE_PAGES | 30 | #ifdef Z7_LARGE_PAGES |
@@ -33,12 +36,14 @@ void MidFree(void *address); | |||
33 | void *BigAlloc(size_t size); | 36 | void *BigAlloc(size_t size); |
34 | void BigFree(void *address); | 37 | void BigFree(void *address); |
35 | 38 | ||
39 | /* #define Z7_BIG_ALLOC_IS_ZERO_FILLED */ | ||
40 | |||
36 | #else | 41 | #else |
37 | 42 | ||
38 | #define MidAlloc(size) MyAlloc(size) | 43 | #define MidAlloc(size) z7_AlignedAlloc(size) |
39 | #define MidFree(address) MyFree(address) | 44 | #define MidFree(address) z7_AlignedFree(address) |
40 | #define BigAlloc(size) MyAlloc(size) | 45 | #define BigAlloc(size) z7_AlignedAlloc(size) |
41 | #define BigFree(address) MyFree(address) | 46 | #define BigFree(address) z7_AlignedFree(address) |
42 | 47 | ||
43 | #endif | 48 | #endif |
44 | 49 | ||
diff --git a/C/Asm_c.mak b/C/Asm_c.mak new file mode 100644 index 0000000..9431816 --- /dev/null +++ b/C/Asm_c.mak | |||
@@ -0,0 +1,12 @@ | |||
1 | !IFDEF ASM_OBJS | ||
2 | !IF "$(PLATFORM)" == "arm64" | ||
3 | $(ASM_OBJS): ../../../Asm/arm64/$(*B).S | ||
4 | $(COMPL_ASM_CLANG) | ||
5 | !ELSEIF "$(PLATFORM)" == "arm" | ||
6 | $(ASM_OBJS): ../../../Asm/arm/$(*B).asm | ||
7 | $(COMPL_ASM) | ||
8 | !ELSEIF "$(PLATFORM)" != "ia64" && "$(PLATFORM)" != "mips" | ||
9 | $(ASM_OBJS): ../../../Asm/x86/$(*B).asm | ||
10 | $(COMPL_ASM) | ||
11 | !ENDIF | ||
12 | !ENDIF | ||
@@ -1,47 +1,104 @@ | |||
1 | /* Blake2.h -- BLAKE2 Hash | 1 | /* Blake2.h -- BLAKE2sp Hash |
2 | 2023-03-04 : Igor Pavlov : Public domain | 2 | 2024-01-17 : Igor Pavlov : Public domain */ |
3 | 2015 : Samuel Neves : Public domain */ | ||
4 | 3 | ||
5 | #ifndef ZIP7_INC_BLAKE2_H | 4 | #ifndef ZIP7_INC_BLAKE2_H |
6 | #define ZIP7_INC_BLAKE2_H | 5 | #define ZIP7_INC_BLAKE2_H |
7 | 6 | ||
8 | #include "7zTypes.h" | 7 | #include "7zTypes.h" |
9 | 8 | ||
10 | EXTERN_C_BEGIN | 9 | #if 0 |
10 | #include "Compiler.h" | ||
11 | #include "CpuArch.h" | ||
12 | #if defined(MY_CPU_X86_OR_AMD64) | ||
13 | #if defined(__SSE2__) \ | ||
14 | || defined(_MSC_VER) && _MSC_VER > 1200 \ | ||
15 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \ | ||
16 | || defined(__clang__) \ | ||
17 | || defined(__INTEL_COMPILER) | ||
18 | #include <emmintrin.h> // SSE2 | ||
19 | #endif | ||
11 | 20 | ||
12 | #define BLAKE2S_BLOCK_SIZE 64 | 21 | #if defined(__AVX2__) \ |
13 | #define BLAKE2S_DIGEST_SIZE 32 | 22 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ |
14 | #define BLAKE2SP_PARALLEL_DEGREE 8 | 23 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \ |
24 | || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \ | ||
25 | || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \ | ||
26 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400) | ||
27 | #include <immintrin.h> | ||
28 | #if defined(__clang__) | ||
29 | #include <avxintrin.h> | ||
30 | #include <avx2intrin.h> | ||
31 | #endif | ||
32 | #endif // avx2 | ||
33 | #endif // MY_CPU_X86_OR_AMD64 | ||
34 | #endif // 0 | ||
15 | 35 | ||
16 | typedef struct | 36 | EXTERN_C_BEGIN |
17 | { | ||
18 | UInt32 h[8]; | ||
19 | UInt32 t[2]; | ||
20 | UInt32 f[2]; | ||
21 | Byte buf[BLAKE2S_BLOCK_SIZE]; | ||
22 | UInt32 bufPos; | ||
23 | UInt32 lastNode_f1; | ||
24 | UInt32 dummy[2]; /* for sizeof(CBlake2s) alignment */ | ||
25 | } CBlake2s; | ||
26 | |||
27 | /* You need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0() */ | ||
28 | /* | ||
29 | void Blake2s_Init0(CBlake2s *p); | ||
30 | void Blake2s_Update(CBlake2s *p, const Byte *data, size_t size); | ||
31 | void Blake2s_Final(CBlake2s *p, Byte *digest); | ||
32 | */ | ||
33 | 37 | ||
38 | #define Z7_BLAKE2S_BLOCK_SIZE 64 | ||
39 | #define Z7_BLAKE2S_DIGEST_SIZE 32 | ||
40 | #define Z7_BLAKE2SP_PARALLEL_DEGREE 8 | ||
41 | #define Z7_BLAKE2SP_NUM_STRUCT_WORDS 16 | ||
34 | 42 | ||
43 | #if 1 || defined(Z7_BLAKE2SP_USE_FUNCTIONS) | ||
44 | typedef void (Z7_FASTCALL *Z7_BLAKE2SP_FUNC_COMPRESS)(UInt32 *states, const Byte *data, const Byte *end); | ||
45 | typedef void (Z7_FASTCALL *Z7_BLAKE2SP_FUNC_INIT)(UInt32 *states); | ||
46 | #endif | ||
47 | |||
48 | // it's required that CBlake2sp is 32-byte aligned, | ||
49 | // because the sse and avx256 code paths use aligned access. | ||
50 | // but 64-byte alignment can be better. | ||
51 | MY_ALIGN(64) | ||
35 | typedef struct | 52 | typedef struct |
36 | { | 53 | { |
37 | CBlake2s S[BLAKE2SP_PARALLEL_DEGREE]; | 54 | union |
38 | unsigned bufPos; | 55 | { |
39 | } CBlake2sp; | 56 | #if 0 |
57 | #if defined(MY_CPU_X86_OR_AMD64) | ||
58 | #if defined(__SSE2__) \ | ||
59 | || defined(_MSC_VER) && _MSC_VER > 1200 \ | ||
60 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \ | ||
61 | || defined(__clang__) \ | ||
62 | || defined(__INTEL_COMPILER) | ||
63 | __m128i _pad_align_128bit[4]; | ||
64 | #endif // sse2 | ||
65 | #if defined(__AVX2__) \ | ||
66 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ | ||
67 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \ | ||
68 | || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \ | ||
69 | || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \ | ||
70 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400) | ||
71 | __m256i _pad_align_256bit[2]; | ||
72 | #endif // avx2 | ||
73 | #endif // x86 | ||
74 | #endif // 0 | ||
40 | 75 | ||
76 | void * _pad_align_ptr[8]; | ||
77 | UInt32 _pad_align_32bit[16]; | ||
78 | struct | ||
79 | { | ||
80 | unsigned cycPos; | ||
81 | unsigned _pad_unused; | ||
82 | #if 1 || defined(Z7_BLAKE2SP_USE_FUNCTIONS) | ||
83 | Z7_BLAKE2SP_FUNC_COMPRESS func_Compress_Fast; | ||
84 | Z7_BLAKE2SP_FUNC_COMPRESS func_Compress_Single; | ||
85 | Z7_BLAKE2SP_FUNC_INIT func_Init; | ||
86 | Z7_BLAKE2SP_FUNC_INIT func_Final; | ||
87 | #endif | ||
88 | } header; | ||
89 | } u; | ||
90 | // MY_ALIGN(64) | ||
91 | UInt32 states[Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2SP_NUM_STRUCT_WORDS]; | ||
92 | // MY_ALIGN(64) | ||
93 | UInt32 buf32[Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2SP_NUM_STRUCT_WORDS * 2]; | ||
94 | } CBlake2sp; | ||
41 | 95 | ||
96 | BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo); | ||
42 | void Blake2sp_Init(CBlake2sp *p); | 97 | void Blake2sp_Init(CBlake2sp *p); |
98 | void Blake2sp_InitState(CBlake2sp *p); | ||
43 | void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size); | 99 | void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size); |
44 | void Blake2sp_Final(CBlake2sp *p, Byte *digest); | 100 | void Blake2sp_Final(CBlake2sp *p, Byte *digest); |
101 | void z7_Black2sp_Prepare(void); | ||
45 | 102 | ||
46 | EXTERN_C_END | 103 | EXTERN_C_END |
47 | 104 | ||
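A usage sketch of the reworked header; "message" and "digest" are illustrative names. The one-shot sequence is the same as with the old API, but the context should keep the alignment that the struct declaration asks for.

static void hash_buf(const Byte *message, size_t len,
    Byte digest[Z7_BLAKE2S_DIGEST_SIZE])
{
  MY_ALIGN(64)
  CBlake2sp ctx;  // 32/64-byte alignment expected by the SIMD code paths
  Blake2sp_Init(&ctx);
  Blake2sp_Update(&ctx, message, len);
  Blake2sp_Final(&ctx, digest);
}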
diff --git a/C/Blake2s.c b/C/Blake2s.c index 2a84b57..459e76b 100644 --- a/C/Blake2s.c +++ b/C/Blake2s.c | |||
@@ -1,250 +1,2645 @@ | |||
1 | /* Blake2s.c -- BLAKE2s and BLAKE2sp Hash | 1 | /* Blake2s.c -- BLAKE2sp Hash |
2 | 2023-03-04 : Igor Pavlov : Public domain | 2 | 2024-01-29 : Igor Pavlov : Public domain |
3 | 2015 : Samuel Neves : Public domain */ | 3 | 2015-2019 : Samuel Neves : original code : CC0 1.0 Universal (CC0 1.0). */ |
4 | 4 | ||
5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
6 | 6 | ||
7 | // #include <stdio.h> | ||
7 | #include <string.h> | 8 | #include <string.h> |
8 | 9 | ||
9 | #include "Blake2.h" | 10 | #include "Blake2.h" |
10 | #include "CpuArch.h" | ||
11 | #include "RotateDefs.h" | 11 | #include "RotateDefs.h" |
12 | #include "Compiler.h" | ||
13 | #include "CpuArch.h" | ||
14 | |||
15 | #if defined(__SSE2__) | ||
16 | #define Z7_BLAKE2S_USE_VECTORS | ||
17 | #elif defined(MY_CPU_X86_OR_AMD64) | ||
18 | #if defined(_MSC_VER) && _MSC_VER > 1200 \ | ||
19 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \ | ||
20 | || defined(__clang__) \ | ||
21 | || defined(__INTEL_COMPILER) | ||
22 | #define Z7_BLAKE2S_USE_VECTORS | ||
23 | #endif | ||
24 | #endif | ||
25 | |||
26 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
27 | |||
28 | #define Z7_BLAKE2SP_USE_FUNCTIONS | ||
29 | |||
30 | // define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED, if CBlake2sp may not be 32-byte aligned. | ||
31 | // #define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED | ||
32 | |||
33 | // SSSE3 : for _mm_shuffle_epi8 (pshufb), which improves performance by 5-15%. | ||
34 | #if defined(__SSSE3__) | ||
35 | #define Z7_BLAKE2S_USE_SSSE3 | ||
36 | #elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \ | ||
37 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \ | ||
38 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \ | ||
39 | || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \ | ||
40 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000) | ||
41 | #define Z7_BLAKE2S_USE_SSSE3 | ||
42 | #endif | ||
43 | |||
44 | #ifdef Z7_BLAKE2S_USE_SSSE3 | ||
45 | /* SSE41 : for _mm_insert_epi32 (pinsrd) | ||
46 | it can slightly reduce code size and improves the performance in some cases. | ||
47 | it's used only for last 512-1024 bytes, if FAST versions (2 or 3) of vector algos are used. | ||
48 | it can be used for all blocks in another algos (4+). | ||
49 | */ | ||
50 | #if defined(__SSE4_1__) | ||
51 | #define Z7_BLAKE2S_USE_SSE41 | ||
52 | #elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \ | ||
53 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \ | ||
54 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \ | ||
55 | || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \ | ||
56 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000) | ||
57 | #define Z7_BLAKE2S_USE_SSE41 | ||
58 | #endif | ||
59 | #endif // SSSE3 | ||
60 | |||
61 | #if defined(__GNUC__) || defined(__clang__) | ||
62 | #if defined(Z7_BLAKE2S_USE_SSE41) | ||
63 | #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse4.1"))) | ||
64 | #elif defined(Z7_BLAKE2S_USE_SSSE3) | ||
65 | #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("ssse3"))) | ||
66 | #else | ||
67 | #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse2"))) | ||
68 | #endif | ||
69 | #endif | ||
70 | |||
71 | |||
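These BLAKE2S_ATTRIB_* macros let one translation unit carry several compression routines compiled for different ISA levels, with the Z7_BLAKE2SP_FUNC_COMPRESS pointers in CBlake2sp doing the run-time selection. A hedged sketch of the intended pattern; the function name is illustrative, not from this commit:

BLAKE2S_ATTRIB_128BIT  // e.g. __attribute__((__target__("sse4.1"))) on GCC/clang
static void Z7_FASTCALL Blake2sp_Compress_V128_sketch(UInt32 *states,
    const Byte *data, const Byte *end)
{
  // ... SSE body; the file itself can still be built for baseline x86 ...
  (void)states; (void)data; (void)end;
}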
72 | #if defined(__AVX2__) | ||
73 | #define Z7_BLAKE2S_USE_AVX2 | ||
74 | #else | ||
75 | #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ | ||
76 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \ | ||
77 | || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) | ||
78 | #define Z7_BLAKE2S_USE_AVX2 | ||
79 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
80 | #define BLAKE2S_ATTRIB_AVX2 __attribute__((__target__("avx2"))) | ||
81 | #endif | ||
82 | #elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \ | ||
83 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400) | ||
84 | #if (Z7_MSC_VER_ORIGINAL == 1900) | ||
85 | #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX | ||
86 | #endif | ||
87 | #define Z7_BLAKE2S_USE_AVX2 | ||
88 | #endif | ||
89 | #endif | ||
90 | |||
91 | #ifdef Z7_BLAKE2S_USE_SSE41 | ||
92 | #include <smmintrin.h> // SSE4.1 | ||
93 | #elif defined(Z7_BLAKE2S_USE_SSSE3) | ||
94 | #include <tmmintrin.h> // SSSE3 | ||
95 | #else | ||
96 | #include <emmintrin.h> // SSE2 | ||
97 | #endif | ||
98 | |||
99 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
100 | #include <immintrin.h> | ||
101 | #if defined(__clang__) | ||
102 | #include <avxintrin.h> | ||
103 | #include <avx2intrin.h> | ||
104 | #endif | ||
105 | #endif // avx2 | ||
106 | |||
107 | |||
108 | #if defined(__AVX512F__) && defined(__AVX512VL__) | ||
109 | // && defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL > 1930) | ||
110 | #define Z7_BLAKE2S_USE_AVX512_ALWAYS | ||
111 | // #pragma message ("=== Blake2s AVX512") | ||
112 | #endif | ||
12 | 113 | ||
13 | #define rotr32 rotrFixed | ||
14 | 114 | ||
15 | #define BLAKE2S_NUM_ROUNDS 10 | 115 | #define Z7_BLAKE2S_USE_V128_FAST |
16 | #define BLAKE2S_FINAL_FLAG (~(UInt32)0) | 116 | // for speed optimization for small messages: |
117 | // #define Z7_BLAKE2S_USE_V128_WAY2 | ||
17 | 118 | ||
119 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
120 | |||
121 | // for debug: | ||
122 | // gather is slow | ||
123 | // #define Z7_BLAKE2S_USE_GATHER | ||
124 | |||
125 | #define Z7_BLAKE2S_USE_AVX2_FAST | ||
126 | // for speed optimization for small messages: | ||
127 | // #define Z7_BLAKE2S_USE_AVX2_WAY2 | ||
128 | // #define Z7_BLAKE2S_USE_AVX2_WAY4 | ||
129 | #if defined(Z7_BLAKE2S_USE_AVX2_WAY2) || \ | ||
130 | defined(Z7_BLAKE2S_USE_AVX2_WAY4) | ||
131 | #define Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
132 | #endif | ||
133 | #endif | ||
134 | |||
135 | #define Z7_BLAKE2SP_ALGO_DEFAULT 0 | ||
136 | #define Z7_BLAKE2SP_ALGO_SCALAR 1 | ||
137 | #ifdef Z7_BLAKE2S_USE_V128_FAST | ||
138 | #define Z7_BLAKE2SP_ALGO_V128_FAST 2 | ||
139 | #endif | ||
140 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
141 | #define Z7_BLAKE2SP_ALGO_V256_FAST 3 | ||
142 | #endif | ||
143 | #define Z7_BLAKE2SP_ALGO_V128_WAY1 4 | ||
144 | #ifdef Z7_BLAKE2S_USE_V128_WAY2 | ||
145 | #define Z7_BLAKE2SP_ALGO_V128_WAY2 5 | ||
146 | #endif | ||
147 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY2 | ||
148 | #define Z7_BLAKE2SP_ALGO_V256_WAY2 6 | ||
149 | #endif | ||
150 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY4 | ||
151 | #define Z7_BLAKE2SP_ALGO_V256_WAY4 7 | ||
152 | #endif | ||
153 | |||
154 | #endif // Z7_BLAKE2S_USE_VECTORS | ||
155 | |||
156 | |||
157 | |||
158 | |||
159 | #define BLAKE2S_FINAL_FLAG (~(UInt32)0) | ||
160 | #define NSW Z7_BLAKE2SP_NUM_STRUCT_WORDS | ||
161 | #define SUPER_BLOCK_SIZE (Z7_BLAKE2S_BLOCK_SIZE * Z7_BLAKE2SP_PARALLEL_DEGREE) | ||
162 | #define SUPER_BLOCK_MASK (SUPER_BLOCK_SIZE - 1) | ||
163 | |||
164 | #define V_INDEX_0_0 0 | ||
165 | #define V_INDEX_1_0 1 | ||
166 | #define V_INDEX_2_0 2 | ||
167 | #define V_INDEX_3_0 3 | ||
168 | #define V_INDEX_0_1 4 | ||
169 | #define V_INDEX_1_1 5 | ||
170 | #define V_INDEX_2_1 6 | ||
171 | #define V_INDEX_3_1 7 | ||
172 | #define V_INDEX_0_2 8 | ||
173 | #define V_INDEX_1_2 9 | ||
174 | #define V_INDEX_2_2 10 | ||
175 | #define V_INDEX_3_2 11 | ||
176 | #define V_INDEX_0_3 12 | ||
177 | #define V_INDEX_1_3 13 | ||
178 | #define V_INDEX_2_3 14 | ||
179 | #define V_INDEX_3_3 15 | ||
180 | #define V_INDEX_4_0 0 | ||
181 | #define V_INDEX_5_0 1 | ||
182 | #define V_INDEX_6_0 2 | ||
183 | #define V_INDEX_7_0 3 | ||
184 | #define V_INDEX_7_1 4 | ||
185 | #define V_INDEX_4_1 5 | ||
186 | #define V_INDEX_5_1 6 | ||
187 | #define V_INDEX_6_1 7 | ||
188 | #define V_INDEX_6_2 8 | ||
189 | #define V_INDEX_7_2 9 | ||
190 | #define V_INDEX_4_2 10 | ||
191 | #define V_INDEX_5_2 11 | ||
192 | #define V_INDEX_5_3 12 | ||
193 | #define V_INDEX_6_3 13 | ||
194 | #define V_INDEX_7_3 14 | ||
195 | #define V_INDEX_4_3 15 | ||
196 | |||
197 | #define V(row, col) v[V_INDEX_ ## row ## _ ## col] | ||
198 | |||
199 | #define k_Blake2s_IV_0 0x6A09E667UL | ||
200 | #define k_Blake2s_IV_1 0xBB67AE85UL | ||
201 | #define k_Blake2s_IV_2 0x3C6EF372UL | ||
202 | #define k_Blake2s_IV_3 0xA54FF53AUL | ||
203 | #define k_Blake2s_IV_4 0x510E527FUL | ||
204 | #define k_Blake2s_IV_5 0x9B05688CUL | ||
205 | #define k_Blake2s_IV_6 0x1F83D9ABUL | ||
206 | #define k_Blake2s_IV_7 0x5BE0CD19UL | ||
207 | |||
208 | #define KIV(n) (k_Blake2s_IV_## n) | ||
209 | |||
210 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
211 | MY_ALIGN(16) | ||
18 | static const UInt32 k_Blake2s_IV[8] = | 212 | static const UInt32 k_Blake2s_IV[8] = |
19 | { | 213 | { |
20 | 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, | 214 | KIV(0), KIV(1), KIV(2), KIV(3), KIV(4), KIV(5), KIV(6), KIV(7) |
21 | 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL | ||
22 | }; | 215 | }; |
216 | #endif | ||
23 | 217 | ||
24 | static const Byte k_Blake2s_Sigma[BLAKE2S_NUM_ROUNDS][16] = | 218 | #define STATE_T(s) ((s) + 8) |
25 | { | 219 | #define STATE_F(s) ((s) + 10) |
26 | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , | 220 | |
27 | { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , | 221 | #ifdef Z7_BLAKE2S_USE_VECTORS |
28 | { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , | ||
29 | { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , | ||
30 | { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , | ||
31 | { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , | ||
32 | { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , | ||
33 | { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , | ||
34 | { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , | ||
35 | { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , | ||
36 | }; | ||
37 | 222 | ||
223 | #define LOAD_128(p) _mm_load_si128 ((const __m128i *)(const void *)(p)) | ||
224 | #define LOADU_128(p) _mm_loadu_si128((const __m128i *)(const void *)(p)) | ||
225 | #ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED | ||
226 | // here we use unaligned load and stores | ||
227 | // use this branch if CBlake2sp can be unaligned for 16 bytes | ||
228 | #define STOREU_128(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r) | ||
229 | #define LOAD_128_FROM_STRUCT(p) LOADU_128(p) | ||
230 | #define STORE_128_TO_STRUCT(p, r) STOREU_128(p, r) | ||
231 | #else | ||
232 | // here we use aligned load and stores | ||
233 | // use this branch if CBlake2sp is aligned for 16 bytes | ||
234 | #define STORE_128(p, r) _mm_store_si128((__m128i *)(void *)(p), r) | ||
235 | #define LOAD_128_FROM_STRUCT(p) LOAD_128(p) | ||
236 | #define STORE_128_TO_STRUCT(p, r) STORE_128(p, r) | ||
237 | #endif | ||
38 | 238 | ||
39 | static void Blake2s_Init0(CBlake2s *p) | 239 | #endif // Z7_BLAKE2S_USE_VECTORS |
240 | |||
241 | |||
242 | #if 0 | ||
243 | static void PrintState(const UInt32 *s, unsigned num) | ||
244 | { | ||
245 | unsigned i; | ||
246 | printf("\n"); | ||
247 | for (i = 0; i < num; i++) | ||
248 | printf(" %08x", (unsigned)s[i]); | ||
249 | } | ||
250 | static void PrintStates2(const UInt32 *s, unsigned x, unsigned y) | ||
40 | { | 251 | { |
41 | unsigned i; | 252 | unsigned i; |
42 | for (i = 0; i < 8; i++) | 253 | for (i = 0; i < y; i++) |
43 | p->h[i] = k_Blake2s_IV[i]; | 254 | PrintState(s + i * x, x); |
44 | p->t[0] = 0; | 255 | printf("\n"); |
45 | p->t[1] = 0; | ||
46 | p->f[0] = 0; | ||
47 | p->f[1] = 0; | ||
48 | p->bufPos = 0; | ||
49 | p->lastNode_f1 = 0; | ||
50 | } | 256 | } |
257 | #endif | ||
258 | |||
259 | |||
260 | #define REP8_MACRO(m) { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) } | ||
261 | |||
262 | #define BLAKE2S_NUM_ROUNDS 10 | ||
263 | |||
264 | #if defined(Z7_BLAKE2S_USE_VECTORS) | ||
265 | #define ROUNDS_LOOP(mac) \ | ||
266 | { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) mac(r) } | ||
267 | #endif | ||
268 | /* | ||
269 | #define ROUNDS_LOOP_2(mac) \ | ||
270 | { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r += 2) { mac(r) mac(r + 1) } } | ||
271 | */ | ||
272 | #if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS) | ||
273 | #define ROUNDS_LOOP_UNROLLED(m) \ | ||
274 | { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) } | ||
275 | #endif | ||
276 | |||
277 | #define SIGMA_TABLE(M) \ | ||
278 | M( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ), \ | ||
279 | M( 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 ), \ | ||
280 | M( 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 ), \ | ||
281 | M( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 ), \ | ||
282 | M( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 ), \ | ||
283 | M( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 ), \ | ||
284 | M( 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 ), \ | ||
285 | M( 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 ), \ | ||
286 | M( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 ), \ | ||
287 | M( 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 ) | ||
288 | |||
289 | #define SIGMA_TABLE_MULT(m, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \ | ||
290 | { a0*m,a1*m,a2*m,a3*m,a4*m,a5*m,a6*m,a7*m,a8*m,a9*m,a10*m,a11*m,a12*m,a13*m,a14*m,a15*m } | ||
291 | #define SIGMA_TABLE_MULT_4( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \ | ||
292 | SIGMA_TABLE_MULT(4, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) | ||
293 | |||
294 | // MY_ALIGN(32) | ||
295 | MY_ALIGN(16) | ||
296 | static const Byte k_Blake2s_Sigma_4[BLAKE2S_NUM_ROUNDS][16] = | ||
297 | { SIGMA_TABLE(SIGMA_TABLE_MULT_4) }; | ||
298 | |||
299 | #define GET_SIGMA_PTR(p, index) \ | ||
300 | ((const void *)((const Byte *)(const void *)(p) + (index))) | ||
51 | 301 | ||
302 | #define GET_STATE_TABLE_PTR_FROM_BYTE_POS(s, pos) \ | ||
303 | ((UInt32 *)(void *)((Byte *)(void *)(s) + (pos))) | ||
52 | 304 | ||
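The premultiplication above is worth one line of illustration (the helper name is mine): each entry of k_Blake2s_Sigma_4 is the classic sigma index times sizeof(UInt32), so a message word is fetched with a plain byte offset and no per-access shift.

static UInt32 get_msg_word(const Byte *input, const Byte *sigma4, unsigned i)
{
  // equivalent to ((const UInt32 *)input)[sigma[i]] with the classic table
  return *(const UInt32 *)GET_SIGMA_PTR(input, sigma4[i]);
}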
53 | static void Blake2s_Compress(CBlake2s *p) | 305 | |
306 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
307 | |||
308 | |||
309 | #if 0 | ||
310 | // use loading constants from memory | ||
311 | // is faster for some compilers. | ||
312 | #define KK4(n) KIV(n), KIV(n), KIV(n), KIV(n) | ||
313 | MY_ALIGN(64) | ||
314 | static const UInt32 k_Blake2s_IV_WAY4[]= | ||
54 | { | 315 | { |
55 | UInt32 m[16]; | 316 | KK4(0), KK4(1), KK4(2), KK4(3), KK4(4), KK4(5), KK4(6), KK4(7) |
56 | UInt32 v[16]; | 317 | }; |
57 | 318 | #define GET_128_IV_WAY4(i) LOAD_128(k_Blake2s_IV_WAY4 + 4 * (i)) | |
319 | #else | ||
320 | // use constant generation: | ||
321 | #define GET_128_IV_WAY4(i) _mm_set1_epi32((Int32)KIV(i)) | ||
322 | #endif | ||
323 | |||
324 | |||
325 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
326 | #define GET_CONST_128_FROM_ARRAY32(k) \ | ||
327 | _mm_set_epi32((Int32)(k)[3], (Int32)(k)[2], (Int32)(k)[1], (Int32)(k)[0]) | ||
328 | #endif | ||
329 | |||
330 | |||
331 | #if 0 | ||
332 | #define k_r8 _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1) | ||
333 | #define k_r16 _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2) | ||
334 | #define k_inc _mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE) | ||
335 | #define k_iv0_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0) | ||
336 | #define k_iv4_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4) | ||
337 | #else | ||
338 | #if defined(Z7_BLAKE2S_USE_SSSE3) && \ | ||
339 | !defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) | ||
340 | MY_ALIGN(16) static const Byte k_r8_arr [16] = { 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12 }; | ||
341 | MY_ALIGN(16) static const Byte k_r16_arr[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 }; | ||
342 | #define k_r8 LOAD_128(k_r8_arr) | ||
343 | #define k_r16 LOAD_128(k_r16_arr) | ||
344 | #endif | ||
345 | MY_ALIGN(16) static const UInt32 k_inc_arr[4] = { Z7_BLAKE2S_BLOCK_SIZE, 0, 0, 0 }; | ||
346 | #define k_inc LOAD_128(k_inc_arr) | ||
347 | #define k_iv0_128 LOAD_128(k_Blake2s_IV + 0) | ||
348 | #define k_iv4_128 LOAD_128(k_Blake2s_IV + 4) | ||
349 | #endif | ||
350 | |||
351 | |||
352 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
353 | |||
354 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
355 | #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000) | ||
356 | #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1) | ||
357 | #else | ||
358 | #define MY_mm256_set_m128i _mm256_set_m128i | ||
359 | #endif | ||
360 | |||
361 | #define SET_FROM_128(a) MY_mm256_set_m128i(a, a) | ||
362 | |||
363 | #ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS | ||
364 | MY_ALIGN(32) static const Byte k_r8_arr_256 [32] = | ||
365 | { | ||
366 | 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12, | ||
367 | 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12 | ||
368 | }; | ||
369 | MY_ALIGN(32) static const Byte k_r16_arr_256[32] = | ||
370 | { | ||
371 | 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, | ||
372 | 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 | ||
373 | }; | ||
374 | #define k_r8_256 LOAD_256(k_r8_arr_256) | ||
375 | #define k_r16_256 LOAD_256(k_r16_arr_256) | ||
376 | #endif | ||
377 | |||
378 | // #define k_r8_256 SET_FROM_128(_mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)) | ||
379 | // #define k_r16_256 SET_FROM_128(_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)) | ||
380 | // #define k_inc_256 SET_FROM_128(_mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)) | ||
381 | // #define k_iv0_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0)) | ||
382 | #define k_iv4_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4)) | ||
383 | #endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
384 | #endif | ||
385 | |||
386 | |||
387 | /* | ||
388 | IPC(TP) ports: | ||
389 | 1 p__5 : skl- : SSE : shufps : _mm_shuffle_ps | ||
390 | 2 p_15 : icl+ | ||
391 | 1 p__5 : nhm-bdw : SSE : xorps : _mm_xor_ps | ||
392 | 3 p015 : skl+ | ||
393 | |||
394 | 3 p015 : SSE2 : pxor : _mm_xor_si128 | ||
395 | 2 p_15: snb-bdw : SSE2 : padd : _mm_add_epi32 | ||
396 | 2 p0_5: mrm-wsm : | ||
397 | 3 p015 : skl+ | ||
398 | |||
399 | 2 p_15 : ivb-,icl+ : SSE2 : punpcklqdq, punpckhqdq, punpckldq, punpckhdq | ||
400 | 2 p_15 : : SSE2 : pshufd : _mm_shuffle_epi32 | ||
401 | 2 p_15 : : SSE2 : pshuflw : _mm_shufflelo_epi16 | ||
402 | 2 p_15 : : SSE2 : psrldq : | ||
403 | 2 p_15 : : SSE3 : pshufb : _mm_shuffle_epi8 | ||
404 | 2 p_15 : : SSE4 : pblendw : _mm_blend_epi16 | ||
405 | 1 p__5 : hsw-skl : * | ||
406 | |||
407 | 1 p0 : SSE2 : pslld (i8) : _mm_slli_si128 | ||
408 | 2 p01 : skl+ : | ||
409 | |||
410 | 2 p_15 : ivb- : SSE3 : palignr | ||
411 | 1 p__5 : hsw+ | ||
412 | |||
413 | 2 p_15 + p23 : ivb-, icl+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, m32, i8) | ||
414 | 1 p__5 + p23 : hsw-skl | ||
415 | 1 p_15 + p5 : ivb-, ice+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, r32, i8) | ||
416 | 0.5 2*p5 : hsw-skl | ||
417 | |||
418 | 2 p23 : SSE2 : movd (m32) | ||
419 | 3 p23A : adl : | ||
420 | 1 p5: : SSE2 : movd (r32) | ||
421 | */ | ||
422 | |||
423 | #if 0 && defined(__XOP__) | ||
424 | // the __XOP__ path must still be debugged and tested | ||
425 | #include <x86intrin.h> | ||
426 | #include <ammintrin.h> | ||
427 | #define LOAD_ROTATE_CONSTS | ||
428 | #define MM_ROR_EPI32(r, c) _mm_roti_epi32(r, -(c)) | ||
429 | #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED | ||
430 | #elif 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) | ||
431 | #define LOAD_ROTATE_CONSTS | ||
432 | #define MM_ROR_EPI32(r, c) _mm_ror_epi32(r, c) | ||
433 | #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED | ||
434 | #else | ||
435 | |||
436 | // MSVC_1937+ uses "orps" instruction for _mm_or_si128(). | ||
437 | // But "orps" has low throughput: TP=1 for bdw-nhm. | ||
438 | // So it can be better to use _mm_add_epi32()/"paddd" (TP=2 for bdw-nhm) instead of "orps". | ||
439 | // But "orps" is fast for modern cpus (skl+). | ||
440 | // So we default to the "or" version: | ||
441 | #if 0 || 0 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1937 | ||
442 | // minor optimization for some old cpus, if "xorps" is slow. | ||
443 | #define MM128_EPI32_OR_or_ADD _mm_add_epi32 | ||
444 | #else | ||
445 | #define MM128_EPI32_OR_or_ADD _mm_or_si128 | ||
446 | #endif | ||
447 | |||
448 | #define MM_ROR_EPI32_VIA_SHIFT(r, c)( \ | ||
449 | MM128_EPI32_OR_or_ADD( \ | ||
450 | _mm_srli_epi32((r), (c)), \ | ||
451 | _mm_slli_epi32((r), 32-(c)))) | ||
452 | #if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41) | ||
453 | #define LOAD_ROTATE_CONSTS \ | ||
454 | const __m128i r8 = k_r8; \ | ||
455 | const __m128i r16 = k_r16; | ||
456 | #define MM_ROR_EPI32(r, c) ( \ | ||
457 | ( 8==(c)) ? _mm_shuffle_epi8(r,r8) \ | ||
458 | : (16==(c)) ? _mm_shuffle_epi8(r,r16) \ | ||
459 | : MM_ROR_EPI32_VIA_SHIFT(r, c)) | ||
460 | #else | ||
461 | #define LOAD_ROTATE_CONSTS | ||
462 | #define MM_ROR_EPI32(r, c) ( \ | ||
463 | (16==(c)) ? _mm_shufflehi_epi16(_mm_shufflelo_epi16(r, 0xb1), 0xb1) \ | ||
464 | : MM_ROR_EPI32_VIA_SHIFT(r, c)) | ||
465 | #endif | ||
466 | #endif | ||
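Since SSE2 has no vector rotate instruction, MM_ROR_EPI32_VIA_SHIFT builds the rotation from two shifts combined with OR (or ADD: the two shifted halves have disjoint bits, so no carry is possible). A minimal standalone rendering, SSE2 only; the macro name is illustrative:

#include <emmintrin.h>

// rotate each 32-bit lane of x right by the constant C
#define ROR32X4(x, C) \
    _mm_or_si128(_mm_srli_epi32(x, C), _mm_slli_epi32(x, 32 - (C)))

// usage: __m128i y = ROR32X4(x, 12);
// the pshufb variants above handle the special counts 8 and 16 as byte shuffles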
467 | |||
468 | /* | ||
469 | we have 3 main ways to load 4 32-bit integers to __m128i: | ||
470 | 1) SSE2: _mm_set_epi32() | ||
471 | 2) SSE2: _mm_unpacklo_epi64() / _mm_unpacklo_epi32 / _mm_cvtsi32_si128() | ||
472 | 3) SSE41: _mm_insert_epi32() and _mm_cvtsi32_si128() | ||
473 | a good compiler for _mm_set_epi32() generates these instructions: | ||
474 | { | ||
475 | movd xmm, [m32]; vpunpckldq; vpunpckldq; vpunpcklqdq; | ||
476 | } | ||
477 | a good newer compiler generates one instruction: | ||
478 | { | ||
479 | for _mm_insert_epi32() : { pinsrd xmm, [m32], i } | ||
480 | for _mm_cvtsi32_si128() : { movd xmm, [m32] } | ||
481 | } | ||
482 | but vc2010 generates a slow pair of instructions: | ||
483 | { | ||
484 | for _mm_insert_epi32() : { mov r32, [m32]; pinsrd xmm, r32, i } | ||
485 | for _mm_cvtsi32_si128() : { mov r32, [m32]; movd xmm, r32 } | ||
486 | } | ||
487 | _mm_insert_epi32() (pinsrd) code reduces xmm register pressure | ||
488 | in comparison with _mm_set_epi32() (movd + vpunpckld) code. | ||
489 | Note that the variant with "movd xmm, r32" can be slower, | ||
490 | but register pressure can be more important. | ||
491 | So we can force "pinsrd" always. | ||
492 | */ | ||
493 | // #if !defined(Z7_MSC_VER_ORIGINAL) || Z7_MSC_VER_ORIGINAL > 1600 || defined(MY_CPU_X86) | ||
494 | #ifdef Z7_BLAKE2S_USE_SSE41 | ||
495 | /* _mm_set_epi32() can be more effective for GCC and CLANG; | ||
496 | _mm_insert_epi32() is more effective for MSVC */ | ||
497 | #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) | ||
498 | #define Z7_BLAKE2S_USE_INSERT_INSTRUCTION | ||
499 | #endif | ||
500 | #endif // USE_SSE41 | ||
501 | // #endif | ||
502 | |||
503 | #ifdef Z7_BLAKE2S_USE_INSERT_INSTRUCTION | ||
504 | // for SSE4.1 | ||
505 | #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \ | ||
506 | _mm_insert_epi32( \ | ||
507 | _mm_insert_epi32( \ | ||
508 | _mm_insert_epi32( \ | ||
509 | _mm_cvtsi32_si128( \ | ||
510 | *(const Int32 *)p0), \ | ||
511 | *(const Int32 *)p1, 1), \ | ||
512 | *(const Int32 *)p2, 2), \ | ||
513 | *(const Int32 *)p3, 3) | ||
514 | #elif 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) | ||
515 | /* MSVC 1400 implements _mm_set_epi32() via slow memory write/read. | ||
516 | Also _mm_unpacklo_epi32 is more effective for other MSVC compilers. | ||
517 | But _mm_set_epi32() is more effective for GCC and CLANG. | ||
518 | So we use _mm_unpacklo_epi32 for MSVC only */ | ||
519 | #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \ | ||
520 | _mm_unpacklo_epi64( \ | ||
521 | _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p0), \ | ||
522 | _mm_cvtsi32_si128(*(const Int32 *)p1)), \ | ||
523 | _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p2), \ | ||
524 | _mm_cvtsi32_si128(*(const Int32 *)p3))) | ||
525 | #else | ||
526 | #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \ | ||
527 | _mm_set_epi32( \ | ||
528 | *(const Int32 *)p3, \ | ||
529 | *(const Int32 *)p2, \ | ||
530 | *(const Int32 *)p1, \ | ||
531 | *(const Int32 *)p0) | ||
532 | #endif | ||
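/* Editor's note (added): each variant above assembles one __m128i from four
   independent 32-bit loads; SET_ROW_FROM_SIGMA below uses it to gather the
   four message words that the sigma permutation selects for one vector row. */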
533 | |||
534 | #define SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3) \ | ||
535 | MM_LOAD_EPI32_FROM_4_POINTERS( \ | ||
536 | GET_SIGMA_PTR(input, i0), \ | ||
537 | GET_SIGMA_PTR(input, i1), \ | ||
538 | GET_SIGMA_PTR(input, i2), \ | ||
539 | GET_SIGMA_PTR(input, i3)) | ||
540 | |||
541 | #define SET_ROW_FROM_SIGMA(input, sigma_index) \ | ||
542 | SET_ROW_FROM_SIGMA_BASE(input, \ | ||
543 | sigma[(sigma_index) ], \ | ||
544 | sigma[(sigma_index) + 2 * 1], \ | ||
545 | sigma[(sigma_index) + 2 * 2], \ | ||
546 | sigma[(sigma_index) + 2 * 3]) \ | ||
547 | |||
548 | |||
549 | #define ADD_128(a, b) _mm_add_epi32(a, b) | ||
550 | #define XOR_128(a, b) _mm_xor_si128(a, b) | ||
551 | |||
552 | #define D_ADD_128(dest, src) dest = ADD_128(dest, src) | ||
553 | #define D_XOR_128(dest, src) dest = XOR_128(dest, src) | ||
554 | #define D_ROR_128(dest, shift) dest = MM_ROR_EPI32(dest, shift) | ||
555 | #define D_ADD_EPI64_128(dest, src) dest = _mm_add_epi64(dest, src) | ||
556 | |||
557 | |||
558 | #define AXR(a, b, d, shift) \ | ||
559 | D_ADD_128(a, b); \ | ||
560 | D_XOR_128(d, a); \ | ||
561 | D_ROR_128(d, shift); | ||
562 | |||
563 | #define AXR2(a, b, c, d, input, sigma_index, shift1, shift2) \ | ||
564 | a = _mm_add_epi32 (a, SET_ROW_FROM_SIGMA(input, sigma_index)); \ | ||
565 | AXR(a, b, d, shift1) \ | ||
566 | AXR(c, d, b, shift2) | ||
567 | |||
568 | #define ROTATE_WORDS_TO_RIGHT(a, n) \ | ||
569 | a = _mm_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3)); | ||
570 | |||
571 | #define AXR4(a, b, c, d, input, sigma_index) \ | ||
572 | AXR2(a, b, c, d, input, sigma_index, 16, 12) \ | ||
573 | AXR2(a, b, c, d, input, sigma_index + 1, 8, 7) \ | ||
574 | |||
575 | #define RR2(a, b, c, d, input) \ | ||
576 | { \ | ||
577 | AXR4(a, b, c, d, input, 0) \ | ||
578 | ROTATE_WORDS_TO_RIGHT(b, 1) \ | ||
579 | ROTATE_WORDS_TO_RIGHT(c, 2) \ | ||
580 | ROTATE_WORDS_TO_RIGHT(d, 3) \ | ||
581 | AXR4(a, b, c, d, input, 8) \ | ||
582 | ROTATE_WORDS_TO_RIGHT(b, 3) \ | ||
583 | ROTATE_WORDS_TO_RIGHT(c, 2) \ | ||
584 | ROTATE_WORDS_TO_RIGHT(d, 1) \ | ||
585 | } | ||
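Editor's sketch (added, not from the patch): AXR2/RR2 above are the vector form
of the scalar BLAKE2s G function, evaluated for four columns (then four
diagonals, via ROTATE_WORDS_TO_RIGHT) at once; a reference model, assuming the
rotrFixed helper that the scalar code later in this file also uses:

static void Blake2s_G_model(UInt32 v[16],
    unsigned a, unsigned b, unsigned c, unsigned d, UInt32 mx, UInt32 my)
{
  /* one G: two add-xor-rotate half-steps with rotations 16,12 then 8,7,
     matching the shift pairs passed to AXR2 in the macros above */
  v[a] += v[b] + mx;  v[d] = rotrFixed(v[d] ^ v[a], 16);
  v[c] += v[d];       v[b] = rotrFixed(v[b] ^ v[c], 12);
  v[a] += v[b] + my;  v[d] = rotrFixed(v[d] ^ v[a],  8);
  v[c] += v[d];       v[b] = rotrFixed(v[b] ^ v[c],  7);
}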
586 | |||
587 | |||
588 | /* | ||
589 | Way1: | ||
590 | per 64-byte block: | ||
591 | 10 rounds * 4 iters * (7 + 2) = 360 cycles, if pslld TP=1 | ||
592 | * (7 + 1) = 320 cycles, if pslld TP=2 (skl+) | ||
593 | additional operations per 7_op_iter : | ||
594 | 4 movzx byte mem | ||
595 | 1 movd mem | ||
596 | 3 pinsrd mem | ||
597 | 1.5 pshufd | ||
598 | */ | ||
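/* Editor's check (added): 10 rounds x 4 AXR2 iterations x 9 ops = 360; with
   pslld at TP=2 the rotate pair costs one op less per iteration,
   giving 10 x 4 x 8 = 320. */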
599 | |||
600 | static | ||
601 | #if 0 || 0 && (defined(Z7_BLAKE2S_USE_V128_WAY2) || \ | ||
602 | defined(Z7_BLAKE2S_USE_V256_WAY2)) | ||
603 | Z7_NO_INLINE | ||
604 | #else | ||
605 | Z7_FORCE_INLINE | ||
606 | #endif | ||
607 | #ifdef BLAKE2S_ATTRIB_128BIT | ||
608 | BLAKE2S_ATTRIB_128BIT | ||
609 | #endif | ||
610 | void | ||
611 | Z7_FASTCALL | ||
612 | Blake2s_Compress_V128_Way1(UInt32 * const s, const Byte * const input) | ||
613 | { | ||
614 | __m128i a, b, c, d; | ||
615 | __m128i f0, f1; | ||
616 | |||
617 | LOAD_ROTATE_CONSTS | ||
618 | d = LOAD_128_FROM_STRUCT(STATE_T(s)); | ||
619 | c = k_iv0_128; | ||
620 | a = f0 = LOAD_128_FROM_STRUCT(s); | ||
621 | b = f1 = LOAD_128_FROM_STRUCT(s + 4); | ||
622 | D_ADD_EPI64_128(d, k_inc); | ||
623 | STORE_128_TO_STRUCT (STATE_T(s), d); | ||
624 | D_XOR_128(d, k_iv4_128); | ||
625 | |||
626 | #define RR(r) { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \ | ||
627 | RR2(a, b, c, d, input) } | ||
628 | |||
629 | ROUNDS_LOOP(RR) | ||
630 | #undef RR | ||
631 | |||
632 | STORE_128_TO_STRUCT(s , XOR_128(f0, XOR_128(a, c))); | ||
633 | STORE_128_TO_STRUCT(s + 4, XOR_128(f1, XOR_128(b, d))); | ||
634 | } | ||
635 | |||
636 | |||
637 | static | ||
638 | Z7_NO_INLINE | ||
639 | #ifdef BLAKE2S_ATTRIB_128BIT | ||
640 | BLAKE2S_ATTRIB_128BIT | ||
641 | #endif | ||
642 | void | ||
643 | Z7_FASTCALL | ||
644 | Blake2sp_Compress2_V128_Way1(UInt32 *s_items, const Byte *data, const Byte *end) | ||
645 | { | ||
646 | size_t pos = 0; | ||
647 | do | ||
58 | { | 648 | { |
59 | unsigned i; | 649 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); |
650 | Blake2s_Compress_V128_Way1(s, data); | ||
651 | data += Z7_BLAKE2S_BLOCK_SIZE; | ||
652 | pos += Z7_BLAKE2S_BLOCK_SIZE; | ||
653 | pos &= SUPER_BLOCK_MASK; | ||
654 | } | ||
655 | while (data != end); | ||
656 | } | ||
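/* Editor's note (added): blake2sp hashes 8 independent lanes, with consecutive
   64-byte input blocks distributed round-robin; pos steps through the per-lane
   state slots and wraps via SUPER_BLOCK_MASK, so each compressed block advances
   to the next lane's state. */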
657 | |||
658 | |||
659 | #if defined(Z7_BLAKE2S_USE_V128_WAY2) || \ | ||
660 | defined(Z7_BLAKE2S_USE_AVX2_WAY2) | ||
661 | #if 1 | ||
662 | #define Z7_BLAKE2S_CompressSingleBlock(s, data) \ | ||
663 | Blake2sp_Compress2_V128_Way1(s, data, \ | ||
664 | (const Byte *)(const void *)(data) + Z7_BLAKE2S_BLOCK_SIZE) | ||
665 | #else | ||
666 | #define Z7_BLAKE2S_CompressSingleBlock Blake2s_Compress_V128_Way1 | ||
667 | #endif | ||
668 | #endif | ||
669 | |||
670 | |||
671 | #if (defined(Z7_BLAKE2S_USE_AVX2_WAY_SLOW) || \ | ||
672 | defined(Z7_BLAKE2S_USE_V128_WAY2)) && \ | ||
673 | !defined(Z7_BLAKE2S_USE_GATHER) | ||
674 | #define AXR2_LOAD_INDEXES(sigma_index) \ | ||
675 | const unsigned i0 = sigma[(sigma_index)]; \ | ||
676 | const unsigned i1 = sigma[(sigma_index) + 2 * 1]; \ | ||
677 | const unsigned i2 = sigma[(sigma_index) + 2 * 2]; \ | ||
678 | const unsigned i3 = sigma[(sigma_index) + 2 * 3]; \ | ||
679 | |||
680 | #define SET_ROW_FROM_SIGMA_W(input) \ | ||
681 | SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3) | ||
682 | #endif | ||
683 | |||
684 | |||
685 | #ifdef Z7_BLAKE2S_USE_V128_WAY2 | ||
686 | |||
687 | #if 1 || !defined(Z7_BLAKE2S_USE_SSE41) | ||
688 | /* we use SET_ROW_FROM_SIGMA_BASE, which uses | ||
689 | (SSE4) _mm_insert_epi32(), if Z7_BLAKE2S_USE_INSERT_INSTRUCTION is defined | ||
690 | (SSE2) _mm_set_epi32() | ||
691 | MSVC can be faster for this branch: | ||
692 | */ | ||
693 | #define AXR2_W(sigma_index, shift1, shift2) \ | ||
694 | { \ | ||
695 | AXR2_LOAD_INDEXES(sigma_index) \ | ||
696 | a0 = _mm_add_epi32(a0, SET_ROW_FROM_SIGMA_W(data)); \ | ||
697 | a1 = _mm_add_epi32(a1, SET_ROW_FROM_SIGMA_W(data + Z7_BLAKE2S_BLOCK_SIZE)); \ | ||
698 | AXR(a0, b0, d0, shift1) \ | ||
699 | AXR(a1, b1, d1, shift1) \ | ||
700 | AXR(c0, d0, b0, shift2) \ | ||
701 | AXR(c1, d1, b1, shift2) \ | ||
702 | } | ||
703 | #else | ||
704 | /* we use interleaved _mm_insert_epi32(): | ||
705 | GCC can be faster for this branch: | ||
706 | */ | ||
707 | #define AXR2_W_PRE_INSERT(sigma_index, i) \ | ||
708 | { const unsigned ii = sigma[(sigma_index) + i * 2]; \ | ||
709 | t0 = _mm_insert_epi32(t0, *(const Int32 *)GET_SIGMA_PTR(data, ii), i); \ | ||
710 | t1 = _mm_insert_epi32(t1, *(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii), i); \ | ||
711 | } | ||
712 | #define AXR2_W(sigma_index, shift1, shift2) \ | ||
713 | { __m128i t0, t1; \ | ||
714 | { const unsigned ii = sigma[sigma_index]; \ | ||
715 | t0 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, ii)); \ | ||
716 | t1 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii)); \ | ||
717 | } \ | ||
718 | AXR2_W_PRE_INSERT(sigma_index, 1) \ | ||
719 | AXR2_W_PRE_INSERT(sigma_index, 2) \ | ||
720 | AXR2_W_PRE_INSERT(sigma_index, 3) \ | ||
721 | a0 = _mm_add_epi32(a0, t0); \ | ||
722 | a1 = _mm_add_epi32(a1, t1); \ | ||
723 | AXR(a0, b0, d0, shift1) \ | ||
724 | AXR(a1, b1, d1, shift1) \ | ||
725 | AXR(c0, d0, b0, shift2) \ | ||
726 | AXR(c1, d1, b1, shift2) \ | ||
727 | } | ||
728 | #endif | ||
729 | |||
730 | |||
731 | #define AXR4_W(sigma_index) \ | ||
732 | AXR2_W(sigma_index, 16, 12) \ | ||
733 | AXR2_W(sigma_index + 1, 8, 7) \ | ||
734 | |||
735 | #define WW(r) \ | ||
736 | { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \ | ||
737 | AXR4_W(0) \ | ||
738 | ROTATE_WORDS_TO_RIGHT(b0, 1) \ | ||
739 | ROTATE_WORDS_TO_RIGHT(b1, 1) \ | ||
740 | ROTATE_WORDS_TO_RIGHT(c0, 2) \ | ||
741 | ROTATE_WORDS_TO_RIGHT(c1, 2) \ | ||
742 | ROTATE_WORDS_TO_RIGHT(d0, 3) \ | ||
743 | ROTATE_WORDS_TO_RIGHT(d1, 3) \ | ||
744 | AXR4_W(8) \ | ||
745 | ROTATE_WORDS_TO_RIGHT(b0, 3) \ | ||
746 | ROTATE_WORDS_TO_RIGHT(b1, 3) \ | ||
747 | ROTATE_WORDS_TO_RIGHT(c0, 2) \ | ||
748 | ROTATE_WORDS_TO_RIGHT(c1, 2) \ | ||
749 | ROTATE_WORDS_TO_RIGHT(d0, 1) \ | ||
750 | ROTATE_WORDS_TO_RIGHT(d1, 1) \ | ||
751 | } | ||
752 | |||
753 | |||
754 | static | ||
755 | Z7_NO_INLINE | ||
756 | #ifdef BLAKE2S_ATTRIB_128BIT | ||
757 | BLAKE2S_ATTRIB_128BIT | ||
758 | #endif | ||
759 | void | ||
760 | Z7_FASTCALL | ||
761 | Blake2sp_Compress2_V128_Way2(UInt32 *s_items, const Byte *data, const Byte *end) | ||
762 | { | ||
763 | size_t pos = 0; | ||
764 | end -= Z7_BLAKE2S_BLOCK_SIZE; | ||
765 | |||
766 | if (data != end) | ||
767 | { | ||
768 | LOAD_ROTATE_CONSTS | ||
769 | do | ||
770 | { | ||
771 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
772 | __m128i a0, b0, c0, d0; | ||
773 | __m128i a1, b1, c1, d1; | ||
774 | { | ||
775 | const __m128i inc = k_inc; | ||
776 | const __m128i temp = k_iv4_128; | ||
777 | d0 = LOAD_128_FROM_STRUCT (STATE_T(s)); | ||
778 | d1 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW)); | ||
779 | D_ADD_EPI64_128(d0, inc); | ||
780 | D_ADD_EPI64_128(d1, inc); | ||
781 | STORE_128_TO_STRUCT (STATE_T(s ), d0); | ||
782 | STORE_128_TO_STRUCT (STATE_T(s + NSW), d1); | ||
783 | D_XOR_128(d0, temp); | ||
784 | D_XOR_128(d1, temp); | ||
785 | } | ||
786 | c1 = c0 = k_iv0_128; | ||
787 | a0 = LOAD_128_FROM_STRUCT(s); | ||
788 | b0 = LOAD_128_FROM_STRUCT(s + 4); | ||
789 | a1 = LOAD_128_FROM_STRUCT(s + NSW); | ||
790 | b1 = LOAD_128_FROM_STRUCT(s + NSW + 4); | ||
791 | |||
792 | ROUNDS_LOOP (WW) | ||
793 | |||
794 | #undef WW | ||
795 | |||
796 | D_XOR_128(a0, c0); | ||
797 | D_XOR_128(b0, d0); | ||
798 | D_XOR_128(a1, c1); | ||
799 | D_XOR_128(b1, d1); | ||
800 | |||
801 | D_XOR_128(a0, LOAD_128_FROM_STRUCT(s)); | ||
802 | D_XOR_128(b0, LOAD_128_FROM_STRUCT(s + 4)); | ||
803 | D_XOR_128(a1, LOAD_128_FROM_STRUCT(s + NSW)); | ||
804 | D_XOR_128(b1, LOAD_128_FROM_STRUCT(s + NSW + 4)); | ||
805 | |||
806 | STORE_128_TO_STRUCT(s, a0); | ||
807 | STORE_128_TO_STRUCT(s + 4, b0); | ||
808 | STORE_128_TO_STRUCT(s + NSW, a1); | ||
809 | STORE_128_TO_STRUCT(s + NSW + 4, b1); | ||
810 | |||
811 | data += Z7_BLAKE2S_BLOCK_SIZE * 2; | ||
812 | pos += Z7_BLAKE2S_BLOCK_SIZE * 2; | ||
813 | pos &= SUPER_BLOCK_MASK; | ||
814 | } | ||
815 | while (data < end); | ||
816 | if (data != end) | ||
817 | return; | ||
818 | } | ||
819 | { | ||
820 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
821 | Z7_BLAKE2S_CompressSingleBlock(s, data); | ||
822 | } | ||
823 | } | ||
824 | #endif // Z7_BLAKE2S_USE_V128_WAY2 | ||
825 | |||
826 | |||
827 | #ifdef Z7_BLAKE2S_USE_V128_WAY2 | ||
828 | #define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way2 | ||
829 | #else | ||
830 | #define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way1 | ||
831 | #endif | ||
832 | |||
833 | |||
834 | |||
835 | #ifdef Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED | ||
836 | #define ROT_128_8(x) MM_ROR_EPI32(x, 8) | ||
837 | #define ROT_128_16(x) MM_ROR_EPI32(x, 16) | ||
838 | #define ROT_128_7(x) MM_ROR_EPI32(x, 7) | ||
839 | #define ROT_128_12(x) MM_ROR_EPI32(x, 12) | ||
840 | #else | ||
841 | #if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41) | ||
842 | #define ROT_128_8(x) _mm_shuffle_epi8(x, r8) // k_r8 | ||
843 | #define ROT_128_16(x) _mm_shuffle_epi8(x, r16) // k_r16 | ||
844 | #else | ||
845 | #define ROT_128_8(x) MM_ROR_EPI32_VIA_SHIFT(x, 8) | ||
846 | #define ROT_128_16(x) MM_ROR_EPI32_VIA_SHIFT(x, 16) | ||
847 | #endif | ||
848 | #define ROT_128_7(x) MM_ROR_EPI32_VIA_SHIFT(x, 7) | ||
849 | #define ROT_128_12(x) MM_ROR_EPI32_VIA_SHIFT(x, 12) | ||
850 | #endif | ||
851 | |||
852 | |||
853 | #if 1 | ||
854 | // this branch can provide similar speed on x86* in most cases, | ||
855 | // because [base + index*4] provides the same speed as [base + index]. | ||
856 | // but some compilers can generate different code with this branch, which can sometimes be faster. | ||
857 | // this branch uses an additional table of 10*16=160 bytes. | ||
858 | #define SIGMA_TABLE_MULT_16( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \ | ||
859 | SIGMA_TABLE_MULT(16, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) | ||
860 | MY_ALIGN(16) | ||
861 | static const Byte k_Blake2s_Sigma_16[BLAKE2S_NUM_ROUNDS][16] = | ||
862 | { SIGMA_TABLE(SIGMA_TABLE_MULT_16) }; | ||
863 | #define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_16[r]; | ||
864 | #define GET_SIGMA_VAL_128(n) (sigma[n]) | ||
865 | #else | ||
866 | #define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_4[r]; | ||
867 | #define GET_SIGMA_VAL_128(n) (4 * (size_t)sigma[n]) | ||
868 | #endif | ||
869 | |||
870 | |||
871 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
872 | #if 1 | ||
873 | #define SIGMA_TABLE_MULT_32( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \ | ||
874 | SIGMA_TABLE_MULT(32, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) | ||
875 | MY_ALIGN(64) | ||
876 | static const UInt16 k_Blake2s_Sigma_32[BLAKE2S_NUM_ROUNDS][16] = | ||
877 | { SIGMA_TABLE(SIGMA_TABLE_MULT_32) }; | ||
878 | #define GET_SIGMA_PTR_256(r) const UInt16 * const sigma = k_Blake2s_Sigma_32[r]; | ||
879 | #define GET_SIGMA_VAL_256(n) (sigma[n]) | ||
880 | #else | ||
881 | #define GET_SIGMA_PTR_256(r) const Byte * const sigma = k_Blake2s_Sigma_4[r]; | ||
882 | #define GET_SIGMA_VAL_256(n) (8 * (size_t)sigma[n]) | ||
883 | #endif | ||
884 | #endif // Z7_BLAKE2S_USE_AVX2_FAST | ||
885 | |||
886 | |||
887 | #define D_ROT_128_7(dest) dest = ROT_128_7(dest) | ||
888 | #define D_ROT_128_8(dest) dest = ROT_128_8(dest) | ||
889 | #define D_ROT_128_12(dest) dest = ROT_128_12(dest) | ||
890 | #define D_ROT_128_16(dest) dest = ROT_128_16(dest) | ||
891 | |||
892 | #define OP_L(a, i) D_ADD_128 (V(a, 0), \ | ||
893 | LOAD_128((const Byte *)(w) + GET_SIGMA_VAL_128(2*(a)+(i)))); | ||
894 | |||
895 | #define OP_0(a) OP_L(a, 0) | ||
896 | #define OP_7(a) OP_L(a, 1) | ||
897 | |||
898 | #define OP_1(a) D_ADD_128 (V(a, 0), V(a, 1)); | ||
899 | #define OP_2(a) D_XOR_128 (V(a, 3), V(a, 0)); | ||
900 | #define OP_4(a) D_ADD_128 (V(a, 2), V(a, 3)); | ||
901 | #define OP_5(a) D_XOR_128 (V(a, 1), V(a, 2)); | ||
902 | |||
903 | #define OP_3(a) D_ROT_128_16 (V(a, 3)); | ||
904 | #define OP_6(a) D_ROT_128_12 (V(a, 1)); | ||
905 | #define OP_8(a) D_ROT_128_8 (V(a, 3)); | ||
906 | #define OP_9(a) D_ROT_128_7 (V(a, 1)); | ||
907 | |||
908 | |||
909 | // for 32-bit x86 : interleave mode is slower, because of register pressure. | ||
910 | |||
911 | #if 0 || 1 && (defined(MY_CPU_X86) \ | ||
912 | || defined(__GNUC__) && !defined(__clang__)) | ||
913 | // non-interleaved version: | ||
914 | // is fast for x86 32-bit. | ||
915 | // is fast for GCC x86-64. | ||
916 | |||
917 | #define V4G(a) \ | ||
918 | OP_0 (a) \ | ||
919 | OP_1 (a) \ | ||
920 | OP_2 (a) \ | ||
921 | OP_3 (a) \ | ||
922 | OP_4 (a) \ | ||
923 | OP_5 (a) \ | ||
924 | OP_6 (a) \ | ||
925 | OP_7 (a) \ | ||
926 | OP_1 (a) \ | ||
927 | OP_2 (a) \ | ||
928 | OP_8 (a) \ | ||
929 | OP_4 (a) \ | ||
930 | OP_5 (a) \ | ||
931 | OP_9 (a) \ | ||
932 | |||
933 | #define V4R \ | ||
934 | { \ | ||
935 | V4G (0) \ | ||
936 | V4G (1) \ | ||
937 | V4G (2) \ | ||
938 | V4G (3) \ | ||
939 | V4G (4) \ | ||
940 | V4G (5) \ | ||
941 | V4G (6) \ | ||
942 | V4G (7) \ | ||
943 | } | ||
944 | |||
945 | #elif 0 || 1 && defined(MY_CPU_X86) | ||
946 | |||
947 | #define OP_INTER_2(op, a,b) \ | ||
948 | op (a) \ | ||
949 | op (b) \ | ||
950 | |||
951 | #define V4G(a,b) \ | ||
952 | OP_INTER_2 (OP_0, a,b) \ | ||
953 | OP_INTER_2 (OP_1, a,b) \ | ||
954 | OP_INTER_2 (OP_2, a,b) \ | ||
955 | OP_INTER_2 (OP_3, a,b) \ | ||
956 | OP_INTER_2 (OP_4, a,b) \ | ||
957 | OP_INTER_2 (OP_5, a,b) \ | ||
958 | OP_INTER_2 (OP_6, a,b) \ | ||
959 | OP_INTER_2 (OP_7, a,b) \ | ||
960 | OP_INTER_2 (OP_1, a,b) \ | ||
961 | OP_INTER_2 (OP_2, a,b) \ | ||
962 | OP_INTER_2 (OP_8, a,b) \ | ||
963 | OP_INTER_2 (OP_4, a,b) \ | ||
964 | OP_INTER_2 (OP_5, a,b) \ | ||
965 | OP_INTER_2 (OP_9, a,b) \ | ||
966 | |||
967 | #define V4R \ | ||
968 | { \ | ||
969 | V4G (0, 1) \ | ||
970 | V4G (2, 3) \ | ||
971 | V4G (4, 5) \ | ||
972 | V4G (6, 7) \ | ||
973 | } | ||
974 | |||
975 | #else | ||
976 | // interleave-4 version is fast for x64 (MSVC/CLANG) | ||
977 | |||
978 | #define OP_INTER_4(op, a,b,c,d) \ | ||
979 | op (a) \ | ||
980 | op (b) \ | ||
981 | op (c) \ | ||
982 | op (d) \ | ||
983 | |||
984 | #define V4G(a,b,c,d) \ | ||
985 | OP_INTER_4 (OP_0, a,b,c,d) \ | ||
986 | OP_INTER_4 (OP_1, a,b,c,d) \ | ||
987 | OP_INTER_4 (OP_2, a,b,c,d) \ | ||
988 | OP_INTER_4 (OP_3, a,b,c,d) \ | ||
989 | OP_INTER_4 (OP_4, a,b,c,d) \ | ||
990 | OP_INTER_4 (OP_5, a,b,c,d) \ | ||
991 | OP_INTER_4 (OP_6, a,b,c,d) \ | ||
992 | OP_INTER_4 (OP_7, a,b,c,d) \ | ||
993 | OP_INTER_4 (OP_1, a,b,c,d) \ | ||
994 | OP_INTER_4 (OP_2, a,b,c,d) \ | ||
995 | OP_INTER_4 (OP_8, a,b,c,d) \ | ||
996 | OP_INTER_4 (OP_4, a,b,c,d) \ | ||
997 | OP_INTER_4 (OP_5, a,b,c,d) \ | ||
998 | OP_INTER_4 (OP_9, a,b,c,d) \ | ||
999 | |||
1000 | #define V4R \ | ||
1001 | { \ | ||
1002 | V4G (0, 1, 2, 3) \ | ||
1003 | V4G (4, 5, 6, 7) \ | ||
1004 | } | ||
1005 | |||
1006 | #endif | ||
1007 | |||
1008 | #define V4_ROUND(r) { GET_SIGMA_PTR_128(r); V4R } | ||
1009 | |||
1010 | |||
1011 | #define V4_LOAD_MSG_1(w, m, i) \ | ||
1012 | { \ | ||
1013 | __m128i m0, m1, m2, m3; \ | ||
1014 | __m128i t0, t1, t2, t3; \ | ||
1015 | m0 = LOADU_128((m) + ((i) + 0 * 4) * 16); \ | ||
1016 | m1 = LOADU_128((m) + ((i) + 1 * 4) * 16); \ | ||
1017 | m2 = LOADU_128((m) + ((i) + 2 * 4) * 16); \ | ||
1018 | m3 = LOADU_128((m) + ((i) + 3 * 4) * 16); \ | ||
1019 | t0 = _mm_unpacklo_epi32(m0, m1); \ | ||
1020 | t1 = _mm_unpackhi_epi32(m0, m1); \ | ||
1021 | t2 = _mm_unpacklo_epi32(m2, m3); \ | ||
1022 | t3 = _mm_unpackhi_epi32(m2, m3); \ | ||
1023 | w[(i) * 4 + 0] = _mm_unpacklo_epi64(t0, t2); \ | ||
1024 | w[(i) * 4 + 1] = _mm_unpackhi_epi64(t0, t2); \ | ||
1025 | w[(i) * 4 + 2] = _mm_unpacklo_epi64(t1, t3); \ | ||
1026 | w[(i) * 4 + 3] = _mm_unpackhi_epi64(t1, t3); \ | ||
1027 | } | ||
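/* Editor's note (added): the unpacklo/unpackhi pairs above perform a standard
   4x4 transpose of 32-bit words: four message blocks loaded row-wise come out
   column-wise, so each w[] register holds the same word index from all four
   blocks. */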
1028 | |||
1029 | #define V4_LOAD_MSG(w, m) \ | ||
1030 | { \ | ||
1031 | V4_LOAD_MSG_1 (w, m, 0) \ | ||
1032 | V4_LOAD_MSG_1 (w, m, 1) \ | ||
1033 | V4_LOAD_MSG_1 (w, m, 2) \ | ||
1034 | V4_LOAD_MSG_1 (w, m, 3) \ | ||
1035 | } | ||
1036 | |||
1037 | #define V4_LOAD_UNPACK_PAIR_128(src32, i, d0, d1) \ | ||
1038 | { \ | ||
1039 | const __m128i v0 = LOAD_128_FROM_STRUCT((src32) + (i ) * 4); \ | ||
1040 | const __m128i v1 = LOAD_128_FROM_STRUCT((src32) + (i + 1) * 4); \ | ||
1041 | d0 = _mm_unpacklo_epi32(v0, v1); \ | ||
1042 | d1 = _mm_unpackhi_epi32(v0, v1); \ | ||
1043 | } | ||
1044 | |||
1045 | #define V4_UNPACK_PAIR_128(dest32, i, s0, s1) \ | ||
1046 | { \ | ||
1047 | STORE_128_TO_STRUCT((dest32) + i * 4 , _mm_unpacklo_epi64(s0, s1)); \ | ||
1048 | STORE_128_TO_STRUCT((dest32) + i * 4 + 16, _mm_unpackhi_epi64(s0, s1)); \ | ||
1049 | } | ||
1050 | |||
1051 | #define V4_UNPACK_STATE(dest32, src32) \ | ||
1052 | { \ | ||
1053 | __m128i t0, t1, t2, t3, t4, t5, t6, t7; \ | ||
1054 | V4_LOAD_UNPACK_PAIR_128(src32, 0, t0, t1) \ | ||
1055 | V4_LOAD_UNPACK_PAIR_128(src32, 2, t2, t3) \ | ||
1056 | V4_LOAD_UNPACK_PAIR_128(src32, 4, t4, t5) \ | ||
1057 | V4_LOAD_UNPACK_PAIR_128(src32, 6, t6, t7) \ | ||
1058 | V4_UNPACK_PAIR_128(dest32, 0, t0, t2) \ | ||
1059 | V4_UNPACK_PAIR_128(dest32, 8, t1, t3) \ | ||
1060 | V4_UNPACK_PAIR_128(dest32, 1, t4, t6) \ | ||
1061 | V4_UNPACK_PAIR_128(dest32, 9, t5, t7) \ | ||
1062 | } | ||
1063 | |||
1064 | |||
1065 | static | ||
1066 | Z7_NO_INLINE | ||
1067 | #ifdef BLAKE2S_ATTRIB_128BIT | ||
1068 | BLAKE2S_ATTRIB_128BIT | ||
1069 | #endif | ||
1070 | void | ||
1071 | Z7_FASTCALL | ||
1072 | Blake2sp_Compress2_V128_Fast(UInt32 *s_items, const Byte *data, const Byte *end) | ||
1073 | { | ||
1074 | // PrintStates2(s_items, 8, 16); | ||
1075 | size_t pos = 0; | ||
1076 | pos /= 2; | ||
1077 | do | ||
1078 | { | ||
1079 | #if defined(Z7_BLAKE2S_USE_SSSE3) && \ | ||
1080 | !defined(Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED) | ||
1081 | const __m128i r8 = k_r8; | ||
1082 | const __m128i r16 = k_r16; | ||
1083 | #endif | ||
1084 | __m128i w[16]; | ||
1085 | __m128i v[16]; | ||
1086 | UInt32 *s; | ||
1087 | V4_LOAD_MSG(w, data) | ||
1088 | s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
1089 | { | ||
1090 | __m128i ctr = LOAD_128_FROM_STRUCT(s + 64); | ||
1091 | D_ADD_EPI64_128 (ctr, k_inc); | ||
1092 | STORE_128_TO_STRUCT(s + 64, ctr); | ||
1093 | v[12] = XOR_128 (GET_128_IV_WAY4(4), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0))); | ||
1094 | v[13] = XOR_128 (GET_128_IV_WAY4(5), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1))); | ||
1095 | } | ||
1096 | v[ 8] = GET_128_IV_WAY4(0); | ||
1097 | v[ 9] = GET_128_IV_WAY4(1); | ||
1098 | v[10] = GET_128_IV_WAY4(2); | ||
1099 | v[11] = GET_128_IV_WAY4(3); | ||
1100 | v[14] = GET_128_IV_WAY4(6); | ||
1101 | v[15] = GET_128_IV_WAY4(7); | ||
60 | 1102 | ||
61 | for (i = 0; i < 16; i++) | 1103 | #define LOAD_STATE_128_FROM_STRUCT(i) \ |
62 | m[i] = GetUi32(p->buf + i * sizeof(m[i])); | 1104 | v[i] = LOAD_128_FROM_STRUCT(s + (i) * 4); |
1105 | |||
1106 | #define UPDATE_STATE_128_IN_STRUCT(i) \ | ||
1107 | STORE_128_TO_STRUCT(s + (i) * 4, XOR_128( \ | ||
1108 | XOR_128(v[i], v[(i) + 8]), \ | ||
1109 | LOAD_128_FROM_STRUCT(s + (i) * 4))); | ||
63 | 1110 | ||
64 | for (i = 0; i < 8; i++) | 1111 | REP8_MACRO (LOAD_STATE_128_FROM_STRUCT) |
65 | v[i] = p->h[i]; | 1112 | ROUNDS_LOOP (V4_ROUND) |
1113 | REP8_MACRO (UPDATE_STATE_128_IN_STRUCT) | ||
1114 | |||
1115 | data += Z7_BLAKE2S_BLOCK_SIZE * 4; | ||
1116 | pos += Z7_BLAKE2S_BLOCK_SIZE * 4 / 2; | ||
1117 | pos &= SUPER_BLOCK_SIZE / 2 - 1; | ||
66 | } | 1118 | } |
1119 | while (data != end); | ||
1120 | } | ||
67 | 1121 | ||
68 | v[ 8] = k_Blake2s_IV[0]; | ||
69 | v[ 9] = k_Blake2s_IV[1]; | ||
70 | v[10] = k_Blake2s_IV[2]; | ||
71 | v[11] = k_Blake2s_IV[3]; | ||
72 | |||
73 | v[12] = p->t[0] ^ k_Blake2s_IV[4]; | ||
74 | v[13] = p->t[1] ^ k_Blake2s_IV[5]; | ||
75 | v[14] = p->f[0] ^ k_Blake2s_IV[6]; | ||
76 | v[15] = p->f[1] ^ k_Blake2s_IV[7]; | ||
77 | 1122 | ||
78 | #define G(r,i,a,b,c,d) \ | 1123 | static |
79 | a += b + m[sigma[2*i+0]]; d ^= a; d = rotr32(d, 16); c += d; b ^= c; b = rotr32(b, 12); \ | 1124 | Z7_NO_INLINE |
80 | a += b + m[sigma[2*i+1]]; d ^= a; d = rotr32(d, 8); c += d; b ^= c; b = rotr32(b, 7); \ | 1125 | #ifdef BLAKE2S_ATTRIB_128BIT |
1126 | BLAKE2S_ATTRIB_128BIT | ||
1127 | #endif | ||
1128 | void | ||
1129 | Z7_FASTCALL | ||
1130 | Blake2sp_Final_V128_Fast(UInt32 *states) | ||
1131 | { | ||
1132 | const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64); | ||
1133 | // printf("\nBlake2sp_Compress2_V128_Fast_Final4\n"); | ||
1134 | // PrintStates2(states, 8, 16); | ||
1135 | { | ||
1136 | ptrdiff_t pos = 8 * 4; | ||
1137 | do | ||
1138 | { | ||
1139 | UInt32 *src32 = states + (size_t)(pos * 1); | ||
1140 | UInt32 *dest32 = states + (size_t)(pos * 2); | ||
1141 | V4_UNPACK_STATE(dest32, src32) | ||
1142 | pos -= 8 * 4; | ||
1143 | } | ||
1144 | while (pos >= 0); | ||
1145 | } | ||
1146 | { | ||
1147 | unsigned k; | ||
1148 | for (k = 0; k < 8; k++) | ||
1149 | { | ||
1150 | UInt32 *s = states + (size_t)k * 16; | ||
1151 | STORE_128_TO_STRUCT (STATE_T(s), ctr); | ||
1152 | } | ||
1153 | } | ||
1154 | // PrintStates2(states, 8, 16); | ||
1155 | } | ||
1156 | |||
1157 | |||
1158 | |||
1159 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
1160 | |||
1161 | #define ADD_256(a, b) _mm256_add_epi32(a, b) | ||
1162 | #define XOR_256(a, b) _mm256_xor_si256(a, b) | ||
1163 | |||
1164 | #if 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) | ||
1165 | #define MM256_ROR_EPI32 _mm256_ror_epi32 | ||
1166 | #define Z7_MM256_ROR_EPI32_IS_SUPPORTED | ||
1167 | #define LOAD_ROTATE_CONSTS_256 | ||
1168 | #else | ||
1169 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
1170 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY2 | ||
1171 | #define LOAD_ROTATE_CONSTS_256 \ | ||
1172 | const __m256i r8 = k_r8_256; \ | ||
1173 | const __m256i r16 = k_r16_256; | ||
1174 | #endif // AVX2_WAY2 | ||
1175 | |||
1176 | #define MM256_ROR_EPI32(r, c) ( \ | ||
1177 | ( 8==(c)) ? _mm256_shuffle_epi8(r,r8) \ | ||
1178 | : (16==(c)) ? _mm256_shuffle_epi8(r,r16) \ | ||
1179 | : _mm256_or_si256( \ | ||
1180 | _mm256_srli_epi32((r), (c)), \ | ||
1181 | _mm256_slli_epi32((r), 32-(c)))) | ||
1182 | #endif // WAY_SLOW | ||
1183 | #endif | ||
1184 | |||
1185 | |||
1186 | #define D_ADD_256(dest, src) dest = ADD_256(dest, src) | ||
1187 | #define D_XOR_256(dest, src) dest = XOR_256(dest, src) | ||
1188 | |||
1189 | #define LOADU_256(p) _mm256_loadu_si256((const __m256i *)(const void *)(p)) | ||
1190 | |||
1191 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
1192 | |||
1193 | #ifdef Z7_MM256_ROR_EPI32_IS_SUPPORTED | ||
1194 | #define ROT_256_16(x) MM256_ROR_EPI32((x), 16) | ||
1195 | #define ROT_256_12(x) MM256_ROR_EPI32((x), 12) | ||
1196 | #define ROT_256_8(x) MM256_ROR_EPI32((x), 8) | ||
1197 | #define ROT_256_7(x) MM256_ROR_EPI32((x), 7) | ||
1198 | #else | ||
1199 | #define ROTATE8 _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, \ | ||
1200 | 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1) | ||
1201 | #define ROTATE16 _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, \ | ||
1202 | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2) | ||
1203 | #define ROT_256_16(x) _mm256_shuffle_epi8((x), ROTATE16) | ||
1204 | #define ROT_256_12(x) _mm256_or_si256(_mm256_srli_epi32((x), 12), _mm256_slli_epi32((x), 20)) | ||
1205 | #define ROT_256_8(x) _mm256_shuffle_epi8((x), ROTATE8) | ||
1206 | #define ROT_256_7(x) _mm256_or_si256(_mm256_srli_epi32((x), 7), _mm256_slli_epi32((x), 25)) | ||
1207 | #endif | ||
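/* Editor's note (added): ROTATE8/ROTATE16 encode the lanewise rotate as a byte
   permutation: for ror 8, destination byte k of every 32-bit lane is source
   byte (k+1) & 3 of that lane, hence the per-lane 0,3,2,1 selector pattern
   (listed high byte first in _mm256_set_epi8). */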
1208 | |||
1209 | #define D_ROT_256_7(dest) dest = ROT_256_7(dest) | ||
1210 | #define D_ROT_256_8(dest) dest = ROT_256_8(dest) | ||
1211 | #define D_ROT_256_12(dest) dest = ROT_256_12(dest) | ||
1212 | #define D_ROT_256_16(dest) dest = ROT_256_16(dest) | ||
1213 | |||
1214 | #define LOAD_256(p) _mm256_load_si256((const __m256i *)(const void *)(p)) | ||
1215 | #ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED | ||
1216 | #define STOREU_256(p, r) _mm256_storeu_si256((__m256i *)(void *)(p), r) | ||
1217 | #define LOAD_256_FROM_STRUCT(p) LOADU_256(p) | ||
1218 | #define STORE_256_TO_STRUCT(p, r) STOREU_256(p, r) | ||
1219 | #else | ||
1220 | // if the struct is 32-byte aligned | ||
1221 | #define STORE_256(p, r) _mm256_store_si256((__m256i *)(void *)(p), r) | ||
1222 | #define LOAD_256_FROM_STRUCT(p) LOAD_256(p) | ||
1223 | #define STORE_256_TO_STRUCT(p, r) STORE_256(p, r) | ||
1224 | #endif | ||
1225 | |||
1226 | #endif // Z7_BLAKE2S_USE_AVX2_FAST | ||
1227 | |||
1228 | |||
1229 | |||
1230 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
1231 | |||
1232 | #if 0 | ||
1233 | #define DIAG_PERM2(s) \ | ||
1234 | { \ | ||
1235 | const __m256i a = LOAD_256_FROM_STRUCT((s) ); \ | ||
1236 | const __m256i b = LOAD_256_FROM_STRUCT((s) + NSW); \ | ||
1237 | STORE_256_TO_STRUCT((s ), _mm256_permute2x128_si256(a, b, 0x20)); \ | ||
1238 | STORE_256_TO_STRUCT((s + NSW), _mm256_permute2x128_si256(a, b, 0x31)); \ | ||
1239 | } | ||
1240 | #else | ||
1241 | #define DIAG_PERM2(s) \ | ||
1242 | { \ | ||
1243 | const __m128i a = LOAD_128_FROM_STRUCT((s) + 4); \ | ||
1244 | const __m128i b = LOAD_128_FROM_STRUCT((s) + NSW); \ | ||
1245 | STORE_128_TO_STRUCT((s) + NSW, a); \ | ||
1246 | STORE_128_TO_STRUCT((s) + 4 , b); \ | ||
1247 | } | ||
1248 | #endif | ||
1249 | #define DIAG_PERM8(s_items) \ | ||
1250 | { \ | ||
1251 | DIAG_PERM2(s_items) \ | ||
1252 | DIAG_PERM2(s_items + NSW * 2) \ | ||
1253 | DIAG_PERM2(s_items + NSW * 4) \ | ||
1254 | DIAG_PERM2(s_items + NSW * 6) \ | ||
1255 | } | ||
1256 | |||
1257 | |||
1258 | #define AXR256(a, b, d, shift) \ | ||
1259 | D_ADD_256(a, b); \ | ||
1260 | D_XOR_256(d, a); \ | ||
1261 | d = MM256_ROR_EPI32(d, shift); \ | ||
1262 | |||
1263 | |||
1264 | |||
1265 | #ifdef Z7_BLAKE2S_USE_GATHER | ||
1266 | |||
1267 | #define TABLE_GATHER_256_4(a0,a1,a2,a3) \ | ||
1268 | a0,a1,a2,a3, a0+16,a1+16,a2+16,a3+16 | ||
1269 | #define TABLE_GATHER_256( \ | ||
1270 | a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \ | ||
1271 | { TABLE_GATHER_256_4(a0,a2,a4,a6), \ | ||
1272 | TABLE_GATHER_256_4(a1,a3,a5,a7), \ | ||
1273 | TABLE_GATHER_256_4(a8,a10,a12,a14), \ | ||
1274 | TABLE_GATHER_256_4(a9,a11,a13,a15) } | ||
1275 | MY_ALIGN(64) | ||
1276 | static const UInt32 k_Blake2s_Sigma_gather256[BLAKE2S_NUM_ROUNDS][16 * 2] = | ||
1277 | { SIGMA_TABLE(TABLE_GATHER_256) }; | ||
1278 | #define GET_SIGMA(r) \ | ||
1279 | const UInt32 * const sigma = k_Blake2s_Sigma_gather256[r]; | ||
1280 | #define AXR2_LOAD_INDEXES_AVX(sigma_index) \ | ||
1281 | const __m256i i01234567 = LOAD_256(sigma + (sigma_index)); | ||
1282 | #define SET_ROW_FROM_SIGMA_AVX(in) \ | ||
1283 | _mm256_i32gather_epi32((const void *)(in), i01234567, 4) | ||
1284 | #define SIGMA_INTERLEAVE 8 | ||
1285 | #define SIGMA_HALF_ROW_SIZE 16 | ||
1286 | |||
1287 | #else // !Z7_BLAKE2S_USE_GATHER | ||
1288 | |||
1289 | #define GET_SIGMA(r) \ | ||
1290 | const Byte * const sigma = k_Blake2s_Sigma_4[r]; | ||
1291 | #define AXR2_LOAD_INDEXES_AVX(sigma_index) \ | ||
1292 | AXR2_LOAD_INDEXES(sigma_index) | ||
1293 | #define SET_ROW_FROM_SIGMA_AVX(in) \ | ||
1294 | MY_mm256_set_m128i( \ | ||
1295 | SET_ROW_FROM_SIGMA_W((in) + Z7_BLAKE2S_BLOCK_SIZE), \ | ||
1296 | SET_ROW_FROM_SIGMA_W(in)) | ||
1297 | #define SIGMA_INTERLEAVE 1 | ||
1298 | #define SIGMA_HALF_ROW_SIZE 8 | ||
1299 | #endif // !Z7_BLAKE2S_USE_GATHER | ||
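/* Editor's note (added): with Z7_BLAKE2S_USE_GATHER one vpgatherdd fetches all
   sigma-selected message words of a half-row at once; without it the row is
   assembled from scalar loads (pinsrd / _mm_set_epi32), which the comments in
   this file note is usually the faster option, since hardware gather is slow
   on most cores. */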
1300 | |||
81 | 1301 | ||
82 | #define R(r) \ | 1302 | #define ROTATE_WORDS_TO_RIGHT_256(a, n) \ |
83 | G(r,0,v[ 0],v[ 4],v[ 8],v[12]) \ | 1303 | a = _mm256_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3)); |
84 | G(r,1,v[ 1],v[ 5],v[ 9],v[13]) \ | ||
85 | G(r,2,v[ 2],v[ 6],v[10],v[14]) \ | ||
86 | G(r,3,v[ 3],v[ 7],v[11],v[15]) \ | ||
87 | G(r,4,v[ 0],v[ 5],v[10],v[15]) \ | ||
88 | G(r,5,v[ 1],v[ 6],v[11],v[12]) \ | ||
89 | G(r,6,v[ 2],v[ 7],v[ 8],v[13]) \ | ||
90 | G(r,7,v[ 3],v[ 4],v[ 9],v[14]) \ | ||
91 | 1304 | ||
1305 | |||
1306 | |||
1307 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY2 | ||
1308 | |||
1309 | #define AXR2_A(sigma_index, shift1, shift2) \ | ||
1310 | AXR2_LOAD_INDEXES_AVX(sigma_index) \ | ||
1311 | D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \ | ||
1312 | AXR256(a0, b0, d0, shift1) \ | ||
1313 | AXR256(c0, d0, b0, shift2) \ | ||
1314 | |||
1315 | #define AXR4_A(sigma_index) \ | ||
1316 | { AXR2_A(sigma_index, 16, 12) } \ | ||
1317 | { AXR2_A(sigma_index + SIGMA_INTERLEAVE, 8, 7) } | ||
1318 | |||
1319 | #define EE1(r) \ | ||
1320 | { GET_SIGMA(r) \ | ||
1321 | AXR4_A(0) \ | ||
1322 | ROTATE_WORDS_TO_RIGHT_256(b0, 1) \ | ||
1323 | ROTATE_WORDS_TO_RIGHT_256(c0, 2) \ | ||
1324 | ROTATE_WORDS_TO_RIGHT_256(d0, 3) \ | ||
1325 | AXR4_A(SIGMA_HALF_ROW_SIZE) \ | ||
1326 | ROTATE_WORDS_TO_RIGHT_256(b0, 3) \ | ||
1327 | ROTATE_WORDS_TO_RIGHT_256(c0, 2) \ | ||
1328 | ROTATE_WORDS_TO_RIGHT_256(d0, 1) \ | ||
1329 | } | ||
1330 | |||
1331 | static | ||
1332 | Z7_NO_INLINE | ||
1333 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
1334 | BLAKE2S_ATTRIB_AVX2 | ||
1335 | #endif | ||
1336 | void | ||
1337 | Z7_FASTCALL | ||
1338 | Blake2sp_Compress2_AVX2_Way2(UInt32 *s_items, const Byte *data, const Byte *end) | ||
1339 | { | ||
1340 | size_t pos = 0; | ||
1341 | end -= Z7_BLAKE2S_BLOCK_SIZE; | ||
1342 | |||
1343 | if (data != end) | ||
92 | { | 1344 | { |
93 | unsigned r; | 1345 | LOAD_ROTATE_CONSTS_256 |
94 | for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) | 1346 | DIAG_PERM8(s_items) |
1347 | do | ||
95 | { | 1348 | { |
96 | const Byte *sigma = k_Blake2s_Sigma[r]; | 1349 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); |
97 | R(r) | 1350 | __m256i a0, b0, c0, d0; |
1351 | { | ||
1352 | const __m128i inc = k_inc; | ||
1353 | __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s)); | ||
1354 | __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW)); | ||
1355 | D_ADD_EPI64_128(d0_128, inc); | ||
1356 | D_ADD_EPI64_128(d1_128, inc); | ||
1357 | STORE_128_TO_STRUCT (STATE_T(s ), d0_128); | ||
1358 | STORE_128_TO_STRUCT (STATE_T(s + NSW), d1_128); | ||
1359 | d0 = MY_mm256_set_m128i(d1_128, d0_128); | ||
1360 | D_XOR_256(d0, k_iv4_256); | ||
1361 | } | ||
1362 | c0 = SET_FROM_128(k_iv0_128); | ||
1363 | a0 = LOAD_256_FROM_STRUCT(s + NSW * 0); | ||
1364 | b0 = LOAD_256_FROM_STRUCT(s + NSW * 1); | ||
1365 | |||
1366 | ROUNDS_LOOP (EE1) | ||
1367 | |||
1368 | D_XOR_256(a0, c0); | ||
1369 | D_XOR_256(b0, d0); | ||
1370 | |||
1371 | D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0)); | ||
1372 | D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1)); | ||
1373 | |||
1374 | STORE_256_TO_STRUCT(s + NSW * 0, a0); | ||
1375 | STORE_256_TO_STRUCT(s + NSW * 1, b0); | ||
1376 | |||
1377 | data += Z7_BLAKE2S_BLOCK_SIZE * 2; | ||
1378 | pos += Z7_BLAKE2S_BLOCK_SIZE * 2; | ||
1379 | pos &= SUPER_BLOCK_MASK; | ||
98 | } | 1380 | } |
99 | /* R(0); R(1); R(2); R(3); R(4); R(5); R(6); R(7); R(8); R(9); */ | 1381 | while (data < end); |
1382 | DIAG_PERM8(s_items) | ||
1383 | if (data != end) | ||
1384 | return; | ||
1385 | } | ||
1386 | { | ||
1387 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
1388 | Z7_BLAKE2S_CompressSingleBlock(s, data); | ||
100 | } | 1389 | } |
1390 | } | ||
1391 | |||
1392 | #endif // Z7_BLAKE2S_USE_AVX2_WAY2 | ||
101 | 1393 | ||
102 | #undef G | ||
103 | #undef R | ||
104 | 1394 | ||
1395 | |||
1396 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY4 | ||
1397 | |||
1398 | #define AXR2_X(sigma_index, shift1, shift2) \ | ||
1399 | AXR2_LOAD_INDEXES_AVX(sigma_index) \ | ||
1400 | D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \ | ||
1401 | D_ADD_256( a1, SET_ROW_FROM_SIGMA_AVX((data) + Z7_BLAKE2S_BLOCK_SIZE * 2)); \ | ||
1402 | AXR256(a0, b0, d0, shift1) \ | ||
1403 | AXR256(a1, b1, d1, shift1) \ | ||
1404 | AXR256(c0, d0, b0, shift2) \ | ||
1405 | AXR256(c1, d1, b1, shift2) \ | ||
1406 | |||
1407 | #define AXR4_X(sigma_index) \ | ||
1408 | { AXR2_X(sigma_index, 16, 12) } \ | ||
1409 | { AXR2_X(sigma_index + SIGMA_INTERLEAVE, 8, 7) } | ||
1410 | |||
1411 | #define EE2(r) \ | ||
1412 | { GET_SIGMA(r) \ | ||
1413 | AXR4_X(0) \ | ||
1414 | ROTATE_WORDS_TO_RIGHT_256(b0, 1) \ | ||
1415 | ROTATE_WORDS_TO_RIGHT_256(b1, 1) \ | ||
1416 | ROTATE_WORDS_TO_RIGHT_256(c0, 2) \ | ||
1417 | ROTATE_WORDS_TO_RIGHT_256(c1, 2) \ | ||
1418 | ROTATE_WORDS_TO_RIGHT_256(d0, 3) \ | ||
1419 | ROTATE_WORDS_TO_RIGHT_256(d1, 3) \ | ||
1420 | AXR4_X(SIGMA_HALF_ROW_SIZE) \ | ||
1421 | ROTATE_WORDS_TO_RIGHT_256(b0, 3) \ | ||
1422 | ROTATE_WORDS_TO_RIGHT_256(b1, 3) \ | ||
1423 | ROTATE_WORDS_TO_RIGHT_256(c0, 2) \ | ||
1424 | ROTATE_WORDS_TO_RIGHT_256(c1, 2) \ | ||
1425 | ROTATE_WORDS_TO_RIGHT_256(d0, 1) \ | ||
1426 | ROTATE_WORDS_TO_RIGHT_256(d1, 1) \ | ||
1427 | } | ||
1428 | |||
1429 | static | ||
1430 | Z7_NO_INLINE | ||
1431 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
1432 | BLAKE2S_ATTRIB_AVX2 | ||
1433 | #endif | ||
1434 | void | ||
1435 | Z7_FASTCALL | ||
1436 | Blake2sp_Compress2_AVX2_Way4(UInt32 *s_items, const Byte *data, const Byte *end) | ||
1437 | { | ||
1438 | size_t pos = 0; | ||
1439 | |||
1440 | if ((size_t)(end - data) >= Z7_BLAKE2S_BLOCK_SIZE * 4) | ||
105 | { | 1441 | { |
106 | unsigned i; | 1442 | #ifndef Z7_MM256_ROR_EPI32_IS_SUPPORTED |
107 | for (i = 0; i < 8; i++) | 1443 | const __m256i r8 = k_r8_256; |
108 | p->h[i] ^= v[i] ^ v[i + 8]; | 1444 | const __m256i r16 = k_r16_256; |
1445 | #endif | ||
1446 | end -= Z7_BLAKE2S_BLOCK_SIZE * 3; | ||
1447 | DIAG_PERM8(s_items) | ||
1448 | do | ||
1449 | { | ||
1450 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
1451 | __m256i a0, b0, c0, d0; | ||
1452 | __m256i a1, b1, c1, d1; | ||
1453 | { | ||
1454 | const __m128i inc = k_inc; | ||
1455 | __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s)); | ||
1456 | __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW)); | ||
1457 | __m128i d2_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 2)); | ||
1458 | __m128i d3_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 3)); | ||
1459 | D_ADD_EPI64_128(d0_128, inc); | ||
1460 | D_ADD_EPI64_128(d1_128, inc); | ||
1461 | D_ADD_EPI64_128(d2_128, inc); | ||
1462 | D_ADD_EPI64_128(d3_128, inc); | ||
1463 | STORE_128_TO_STRUCT (STATE_T(s ), d0_128); | ||
1464 | STORE_128_TO_STRUCT (STATE_T(s + NSW * 1), d1_128); | ||
1465 | STORE_128_TO_STRUCT (STATE_T(s + NSW * 2), d2_128); | ||
1466 | STORE_128_TO_STRUCT (STATE_T(s + NSW * 3), d3_128); | ||
1467 | d0 = MY_mm256_set_m128i(d1_128, d0_128); | ||
1468 | d1 = MY_mm256_set_m128i(d3_128, d2_128); | ||
1469 | D_XOR_256(d0, k_iv4_256); | ||
1470 | D_XOR_256(d1, k_iv4_256); | ||
1471 | } | ||
1472 | c1 = c0 = SET_FROM_128(k_iv0_128); | ||
1473 | a0 = LOAD_256_FROM_STRUCT(s + NSW * 0); | ||
1474 | b0 = LOAD_256_FROM_STRUCT(s + NSW * 1); | ||
1475 | a1 = LOAD_256_FROM_STRUCT(s + NSW * 2); | ||
1476 | b1 = LOAD_256_FROM_STRUCT(s + NSW * 3); | ||
1477 | |||
1478 | ROUNDS_LOOP (EE2) | ||
1479 | |||
1480 | D_XOR_256(a0, c0); | ||
1481 | D_XOR_256(b0, d0); | ||
1482 | D_XOR_256(a1, c1); | ||
1483 | D_XOR_256(b1, d1); | ||
1484 | |||
1485 | D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0)); | ||
1486 | D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1)); | ||
1487 | D_XOR_256(a1, LOAD_256_FROM_STRUCT(s + NSW * 2)); | ||
1488 | D_XOR_256(b1, LOAD_256_FROM_STRUCT(s + NSW * 3)); | ||
1489 | |||
1490 | STORE_256_TO_STRUCT(s + NSW * 0, a0); | ||
1491 | STORE_256_TO_STRUCT(s + NSW * 1, b0); | ||
1492 | STORE_256_TO_STRUCT(s + NSW * 2, a1); | ||
1493 | STORE_256_TO_STRUCT(s + NSW * 3, b1); | ||
1494 | |||
1495 | data += Z7_BLAKE2S_BLOCK_SIZE * 4; | ||
1496 | pos += Z7_BLAKE2S_BLOCK_SIZE * 4; | ||
1497 | pos &= SUPER_BLOCK_MASK; | ||
1498 | } | ||
1499 | while (data < end); | ||
1500 | DIAG_PERM8(s_items) | ||
1501 | end += Z7_BLAKE2S_BLOCK_SIZE * 3; | ||
109 | } | 1502 | } |
1503 | if (data == end) | ||
1504 | return; | ||
1505 | // Z7_BLAKE2S_Compress2_V128(s_items, data, end, pos); | ||
1506 | do | ||
1507 | { | ||
1508 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
1509 | Z7_BLAKE2S_CompressSingleBlock(s, data); | ||
1510 | data += Z7_BLAKE2S_BLOCK_SIZE; | ||
1511 | pos += Z7_BLAKE2S_BLOCK_SIZE; | ||
1512 | pos &= SUPER_BLOCK_MASK; | ||
1513 | } | ||
1514 | while (data != end); | ||
1515 | } | ||
1516 | |||
1517 | #endif // Z7_BLAKE2S_USE_AVX2_WAY4 | ||
1518 | #endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
1519 | |||
1520 | |||
1521 | // --------------------------------------------------------- | ||
1522 | |||
1523 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
1524 | |||
1525 | #define OP256_L(a, i) D_ADD_256 (V(a, 0), \ | ||
1526 | LOAD_256((const Byte *)(w) + GET_SIGMA_VAL_256(2*(a)+(i)))); | ||
1527 | |||
1528 | #define OP256_0(a) OP256_L(a, 0) | ||
1529 | #define OP256_7(a) OP256_L(a, 1) | ||
1530 | |||
1531 | #define OP256_1(a) D_ADD_256 (V(a, 0), V(a, 1)); | ||
1532 | #define OP256_2(a) D_XOR_256 (V(a, 3), V(a, 0)); | ||
1533 | #define OP256_4(a) D_ADD_256 (V(a, 2), V(a, 3)); | ||
1534 | #define OP256_5(a) D_XOR_256 (V(a, 1), V(a, 2)); | ||
1535 | |||
1536 | #define OP256_3(a) D_ROT_256_16 (V(a, 3)); | ||
1537 | #define OP256_6(a) D_ROT_256_12 (V(a, 1)); | ||
1538 | #define OP256_8(a) D_ROT_256_8 (V(a, 3)); | ||
1539 | #define OP256_9(a) D_ROT_256_7 (V(a, 1)); | ||
1540 | |||
1541 | |||
1542 | #if 0 || 1 && defined(MY_CPU_X86) | ||
1543 | |||
1544 | #define V8_G(a) \ | ||
1545 | OP256_0 (a) \ | ||
1546 | OP256_1 (a) \ | ||
1547 | OP256_2 (a) \ | ||
1548 | OP256_3 (a) \ | ||
1549 | OP256_4 (a) \ | ||
1550 | OP256_5 (a) \ | ||
1551 | OP256_6 (a) \ | ||
1552 | OP256_7 (a) \ | ||
1553 | OP256_1 (a) \ | ||
1554 | OP256_2 (a) \ | ||
1555 | OP256_8 (a) \ | ||
1556 | OP256_4 (a) \ | ||
1557 | OP256_5 (a) \ | ||
1558 | OP256_9 (a) \ | ||
1559 | |||
1560 | #define V8R { \ | ||
1561 | V8_G (0); \ | ||
1562 | V8_G (1); \ | ||
1563 | V8_G (2); \ | ||
1564 | V8_G (3); \ | ||
1565 | V8_G (4); \ | ||
1566 | V8_G (5); \ | ||
1567 | V8_G (6); \ | ||
1568 | V8_G (7); \ | ||
1569 | } | ||
1570 | |||
1571 | #else | ||
1572 | |||
1573 | #define OP256_INTER_4(op, a,b,c,d) \ | ||
1574 | op (a) \ | ||
1575 | op (b) \ | ||
1576 | op (c) \ | ||
1577 | op (d) \ | ||
1578 | |||
1579 | #define V8_G(a,b,c,d) \ | ||
1580 | OP256_INTER_4 (OP256_0, a,b,c,d) \ | ||
1581 | OP256_INTER_4 (OP256_1, a,b,c,d) \ | ||
1582 | OP256_INTER_4 (OP256_2, a,b,c,d) \ | ||
1583 | OP256_INTER_4 (OP256_3, a,b,c,d) \ | ||
1584 | OP256_INTER_4 (OP256_4, a,b,c,d) \ | ||
1585 | OP256_INTER_4 (OP256_5, a,b,c,d) \ | ||
1586 | OP256_INTER_4 (OP256_6, a,b,c,d) \ | ||
1587 | OP256_INTER_4 (OP256_7, a,b,c,d) \ | ||
1588 | OP256_INTER_4 (OP256_1, a,b,c,d) \ | ||
1589 | OP256_INTER_4 (OP256_2, a,b,c,d) \ | ||
1590 | OP256_INTER_4 (OP256_8, a,b,c,d) \ | ||
1591 | OP256_INTER_4 (OP256_4, a,b,c,d) \ | ||
1592 | OP256_INTER_4 (OP256_5, a,b,c,d) \ | ||
1593 | OP256_INTER_4 (OP256_9, a,b,c,d) \ | ||
1594 | |||
1595 | #define V8R { \ | ||
1596 | V8_G (0, 1, 2, 3) \ | ||
1597 | V8_G (4, 5, 6, 7) \ | ||
1598 | } | ||
1599 | #endif | ||
1600 | |||
1601 | #define V8_ROUND(r) { GET_SIGMA_PTR_256(r); V8R } | ||
1602 | |||
1603 | |||
1604 | // for debug: | ||
1605 | // #define Z7_BLAKE2S_PERMUTE_WITH_GATHER | ||
1606 | #if defined(Z7_BLAKE2S_PERMUTE_WITH_GATHER) | ||
1607 | // gather instruction is slow. | ||
1608 | #define V8_LOAD_MSG(w, m) \ | ||
1609 | { \ | ||
1610 | unsigned i; \ | ||
1611 | for (i = 0; i < 16; ++i) { \ | ||
1612 | w[i] = _mm256_i32gather_epi32( \ | ||
1613 | (const void *)((m) + i * sizeof(UInt32)),\ | ||
1614 | _mm256_set_epi32(0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00), \ | ||
1615 | sizeof(UInt32)); \ | ||
1616 | } \ | ||
1617 | } | ||
1618 | #else // !Z7_BLAKE2S_PERMUTE_WITH_GATHER | ||
1619 | |||
1620 | #define V8_LOAD_MSG_2(w, a0, a1) \ | ||
1621 | { \ | ||
1622 | (w)[0] = _mm256_permute2x128_si256(a0, a1, 0x20); \ | ||
1623 | (w)[4] = _mm256_permute2x128_si256(a0, a1, 0x31); \ | ||
1624 | } | ||
1625 | |||
1626 | #define V8_LOAD_MSG_4(w, z0, z1, z2, z3) \ | ||
1627 | { \ | ||
1628 | __m256i s0, s1, s2, s3; \ | ||
1629 | s0 = _mm256_unpacklo_epi64(z0, z1); \ | ||
1630 | s1 = _mm256_unpackhi_epi64(z0, z1); \ | ||
1631 | s2 = _mm256_unpacklo_epi64(z2, z3); \ | ||
1632 | s3 = _mm256_unpackhi_epi64(z2, z3); \ | ||
1633 | V8_LOAD_MSG_2((w) + 0, s0, s2) \ | ||
1634 | V8_LOAD_MSG_2((w) + 1, s1, s3) \ | ||
1635 | } | ||
1636 | |||
1637 | #define V8_LOAD_MSG_0(t0, t1, m) \ | ||
1638 | { \ | ||
1639 | __m256i m0, m1; \ | ||
1640 | m0 = LOADU_256(m); \ | ||
1641 | m1 = LOADU_256((m) + 2 * 32); \ | ||
1642 | t0 = _mm256_unpacklo_epi32(m0, m1); \ | ||
1643 | t1 = _mm256_unpackhi_epi32(m0, m1); \ | ||
1644 | } | ||
1645 | |||
1646 | #define V8_LOAD_MSG_8(w, m) \ | ||
1647 | { \ | ||
1648 | __m256i t0, t1, t2, t3, t4, t5, t6, t7; \ | ||
1649 | V8_LOAD_MSG_0(t0, t4, (m) + 0 * 4 * 32) \ | ||
1650 | V8_LOAD_MSG_0(t1, t5, (m) + 1 * 4 * 32) \ | ||
1651 | V8_LOAD_MSG_0(t2, t6, (m) + 2 * 4 * 32) \ | ||
1652 | V8_LOAD_MSG_0(t3, t7, (m) + 3 * 4 * 32) \ | ||
1653 | V8_LOAD_MSG_4((w) , t0, t1, t2, t3) \ | ||
1654 | V8_LOAD_MSG_4((w) + 2, t4, t5, t6, t7) \ | ||
1655 | } | ||
1656 | |||
1657 | #define V8_LOAD_MSG(w, m) \ | ||
1658 | { \ | ||
1659 | V8_LOAD_MSG_8(w, m) \ | ||
1660 | V8_LOAD_MSG_8((w) + 8, (m) + 32) \ | ||
1661 | } | ||
1662 | |||
1663 | #endif // !Z7_BLAKE2S_PERMUTE_WITH_GATHER | ||
1664 | |||
1665 | |||
1666 | #define V8_PERM_PAIR_STORE(u, a0, a2) \ | ||
1667 | { \ | ||
1668 | STORE_256_TO_STRUCT((u), _mm256_permute2x128_si256(a0, a2, 0x20)); \ | ||
1669 | STORE_256_TO_STRUCT((u) + 8, _mm256_permute2x128_si256(a0, a2, 0x31)); \ | ||
1670 | } | ||
1671 | |||
1672 | #define V8_UNPACK_STORE_4(u, z0, z1, z2, z3) \ | ||
1673 | { \ | ||
1674 | __m256i s0, s1, s2, s3; \ | ||
1675 | s0 = _mm256_unpacklo_epi64(z0, z1); \ | ||
1676 | s1 = _mm256_unpackhi_epi64(z0, z1); \ | ||
1677 | s2 = _mm256_unpacklo_epi64(z2, z3); \ | ||
1678 | s3 = _mm256_unpackhi_epi64(z2, z3); \ | ||
1679 | V8_PERM_PAIR_STORE(u + 0, s0, s2) \ | ||
1680 | V8_PERM_PAIR_STORE(u + 2, s1, s3) \ | ||
1681 | } | ||
1682 | |||
1683 | #define V8_UNPACK_STORE_0(src32, d0, d1) \ | ||
1684 | { \ | ||
1685 | const __m256i v0 = LOAD_256_FROM_STRUCT ((src32) ); \ | ||
1686 | const __m256i v1 = LOAD_256_FROM_STRUCT ((src32) + 8); \ | ||
1687 | d0 = _mm256_unpacklo_epi32(v0, v1); \ | ||
1688 | d1 = _mm256_unpackhi_epi32(v0, v1); \ | ||
1689 | } | ||
1690 | |||
1691 | #define V8_UNPACK_STATE(dest32, src32) \ | ||
1692 | { \ | ||
1693 | __m256i t0, t1, t2, t3, t4, t5, t6, t7; \ | ||
1694 | V8_UNPACK_STORE_0 ((src32) + 16 * 0, t0, t4) \ | ||
1695 | V8_UNPACK_STORE_0 ((src32) + 16 * 1, t1, t5) \ | ||
1696 | V8_UNPACK_STORE_0 ((src32) + 16 * 2, t2, t6) \ | ||
1697 | V8_UNPACK_STORE_0 ((src32) + 16 * 3, t3, t7) \ | ||
1698 | V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) , t0, t1, t2, t3) \ | ||
1699 | V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) + 4, t4, t5, t6, t7) \ | ||
110 | } | 1700 | } |
111 | 1701 | ||
112 | 1702 | ||
113 | #define Blake2s_Increment_Counter(S, inc) \ | ||
114 | { p->t[0] += (inc); p->t[1] += (p->t[0] < (inc)); } | ||
115 | 1703 | ||
116 | #define Blake2s_Set_LastBlock(p) \ | 1704 | #define V8_LOAD_STATE_256_FROM_STRUCT(i) \ |
117 | { p->f[0] = BLAKE2S_FINAL_FLAG; p->f[1] = p->lastNode_f1; } | 1705 | v[i] = LOAD_256_FROM_STRUCT(s_items + (i) * 8); |
1706 | |||
1707 | #if 0 || 0 && defined(MY_CPU_X86) | ||
1708 | #define Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
1709 | #endif | ||
1710 | |||
1711 | #ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
1712 | // this branch doesn't use the (iv) array, | ||
1713 | // so register pressure can be lower. | ||
1714 | // it can sometimes be faster. | ||
1715 | #define V8_LOAD_STATE_256(i) V8_LOAD_STATE_256_FROM_STRUCT(i) | ||
1716 | #define V8_UPDATE_STATE_256(i) \ | ||
1717 | { \ | ||
1718 | STORE_256_TO_STRUCT(s_items + (i) * 8, XOR_256( \ | ||
1719 | XOR_256(v[i], v[(i) + 8]), \ | ||
1720 | LOAD_256_FROM_STRUCT(s_items + (i) * 8))); \ | ||
1721 | } | ||
1722 | #else | ||
1723 | // it uses more variables / (iv) registers; | ||
1724 | // it's better for gcc. | ||
1725 | // maybe this branch is better, if register pressure is lower (avx512). | ||
1726 | #define V8_LOAD_STATE_256(i) { iv[i] = v[i]; } | ||
1727 | #define V8_UPDATE_STATE_256(i) { v[i] = XOR_256(XOR_256(v[i], v[i + 8]), iv[i]); } | ||
1728 | #define V8_STORE_STATE_256(i) { STORE_256_TO_STRUCT(s_items + (i) * 8, v[i]); } | ||
1729 | #endif | ||
118 | 1730 | ||
119 | 1731 | ||
120 | static void Blake2s_Update(CBlake2s *p, const Byte *data, size_t size) | 1732 | #if 0 |
1733 | // use loading constants from memory | ||
1734 | #define KK8(n) KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n) | ||
1735 | MY_ALIGN(64) | ||
1736 | static const UInt32 k_Blake2s_IV_WAY8[]= | ||
121 | { | 1737 | { |
122 | while (size != 0) | 1738 | KK8(0), KK8(1), KK8(2), KK8(3), KK8(4), KK8(5), KK8(6), KK8(7) |
123 | { | 1739 | }; |
124 | unsigned pos = (unsigned)p->bufPos; | 1740 | #define GET_256_IV_WAY8(i) LOAD_256(k_Blake2s_IV_WAY8 + 8 * (i)) |
125 | unsigned rem = BLAKE2S_BLOCK_SIZE - pos; | 1741 | #else |
1742 | // use constant generation: | ||
1743 | #define GET_256_IV_WAY8(i) _mm256_set1_epi32((Int32)KIV(i)) | ||
1744 | #endif | ||
126 | 1745 | ||
127 | if (size <= rem) | 1746 | |
1747 | static | ||
1748 | Z7_NO_INLINE | ||
1749 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
1750 | BLAKE2S_ATTRIB_AVX2 | ||
1751 | #endif | ||
1752 | void | ||
1753 | Z7_FASTCALL | ||
1754 | Blake2sp_Compress2_AVX2_Fast(UInt32 *s_items, const Byte *data, const Byte *end) | ||
1755 | { | ||
1756 | #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
1757 | __m256i v[16]; | ||
1758 | #endif | ||
1759 | |||
1760 | // PrintStates2(s_items, 8, 16); | ||
1761 | |||
1762 | #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
1763 | REP8_MACRO (V8_LOAD_STATE_256_FROM_STRUCT) | ||
1764 | #endif | ||
1765 | |||
1766 | do | ||
1767 | { | ||
1768 | __m256i w[16]; | ||
1769 | #ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
1770 | __m256i v[16]; | ||
1771 | #else | ||
1772 | __m256i iv[8]; | ||
1773 | #endif | ||
1774 | V8_LOAD_MSG(w, data) | ||
128 | { | 1775 | { |
129 | memcpy(p->buf + pos, data, size); | 1776 | // we use load/store of ctr inside the loop to reduce register pressure: |
130 | p->bufPos += (UInt32)size; | 1777 | #if 1 || 1 && defined(MY_CPU_X86) |
131 | return; | 1778 | const __m256i ctr = _mm256_add_epi64( |
1779 | LOAD_256_FROM_STRUCT(s_items + 64), | ||
1780 | _mm256_set_epi32( | ||
1781 | 0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE, | ||
1782 | 0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)); | ||
1783 | STORE_256_TO_STRUCT(s_items + 64, ctr); | ||
1784 | #else | ||
1785 | const UInt64 ctr64 = *(const UInt64 *)(const void *)(s_items + 64) | ||
1786 | + Z7_BLAKE2S_BLOCK_SIZE; | ||
1787 | const __m256i ctr = _mm256_set_epi64x(0, (Int64)ctr64, 0, (Int64)ctr64); | ||
1788 | *(UInt64 *)(void *)(s_items + 64) = ctr64; | ||
1789 | #endif | ||
1790 | v[12] = XOR_256 (GET_256_IV_WAY8(4), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0))); | ||
1791 | v[13] = XOR_256 (GET_256_IV_WAY8(5), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1))); | ||
132 | } | 1792 | } |
1793 | v[ 8] = GET_256_IV_WAY8(0); | ||
1794 | v[ 9] = GET_256_IV_WAY8(1); | ||
1795 | v[10] = GET_256_IV_WAY8(2); | ||
1796 | v[11] = GET_256_IV_WAY8(3); | ||
1797 | v[14] = GET_256_IV_WAY8(6); | ||
1798 | v[15] = GET_256_IV_WAY8(7); | ||
133 | 1799 | ||
134 | memcpy(p->buf + pos, data, rem); | 1800 | REP8_MACRO (V8_LOAD_STATE_256) |
135 | Blake2s_Increment_Counter(S, BLAKE2S_BLOCK_SIZE) | 1801 | ROUNDS_LOOP (V8_ROUND) |
136 | Blake2s_Compress(p); | 1802 | REP8_MACRO (V8_UPDATE_STATE_256) |
137 | p->bufPos = 0; | 1803 | data += SUPER_BLOCK_SIZE; |
138 | data += rem; | ||
139 | size -= rem; | ||
140 | } | 1804 | } |
1805 | while (data != end); | ||
1806 | |||
1807 | #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
1808 | REP8_MACRO (V8_STORE_STATE_256) | ||
1809 | #endif | ||
141 | } | 1810 | } |
142 | 1811 | ||
143 | 1812 | ||
144 | static void Blake2s_Final(CBlake2s *p, Byte *digest) | 1813 | static |
1814 | Z7_NO_INLINE | ||
1815 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
1816 | BLAKE2S_ATTRIB_AVX2 | ||
1817 | #endif | ||
1818 | void | ||
1819 | Z7_FASTCALL | ||
1820 | Blake2sp_Final_AVX2_Fast(UInt32 *states) | ||
145 | { | 1821 | { |
146 | unsigned i; | 1822 | const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64); |
1823 | // PrintStates2(states, 8, 16); | ||
1824 | V8_UNPACK_STATE(states, states) | ||
1825 | // PrintStates2(states, 8, 16); | ||
1826 | { | ||
1827 | unsigned k; | ||
1828 | for (k = 0; k < 8; k++) | ||
1829 | { | ||
1830 | UInt32 *s = states + (size_t)k * 16; | ||
1831 | STORE_128_TO_STRUCT (STATE_T(s), ctr); | ||
1832 | } | ||
1833 | } | ||
1834 | // PrintStates2(states, 8, 16); | ||
1835 | // printf("\nafter V8_UNPACK_STATE \n"); | ||
1836 | } | ||
1837 | |||
1838 | #endif // Z7_BLAKE2S_USE_AVX2_FAST | ||
1839 | #endif // avx2 | ||
1840 | #endif // vector | ||
1841 | |||
1842 | |||
1843 | /* | ||
1844 | #define Blake2s_Increment_Counter(s, inc) \ | ||
1845 | { STATE_T(s)[0] += (inc); STATE_T(s)[1] += (STATE_T(s)[0] < (inc)); } | ||
1846 | #define Blake2s_Increment_Counter_Small(s, inc) \ | ||
1847 | { STATE_T(s)[0] += (inc); } | ||
1848 | */ | ||
1849 | |||
1850 | #define Blake2s_Set_LastBlock(s) \ | ||
1851 | { STATE_F(s)[0] = BLAKE2S_FINAL_FLAG; /* STATE_F(s)[1] = p->u.header.lastNode_f1; */ } | ||
1852 | |||
1853 | |||
1854 | #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL >= 1600 | ||
1855 | // good for vs2022 | ||
1856 | #define LOOP_8(mac) { unsigned kkk; for (kkk = 0; kkk < 8; kkk++) mac(kkk) } | ||
1857 | #else | ||
1858 | // good for Z7_BLAKE2S_UNROLL for GCC9 (arm*/x86*) and MSC_VER_1400-x64. | ||
1859 | #define LOOP_8(mac) { REP8_MACRO(mac) } | ||
1860 | #endif | ||
1861 | |||
1862 | |||
1863 | static | ||
1864 | Z7_FORCE_INLINE | ||
1865 | // Z7_NO_INLINE | ||
1866 | void | ||
1867 | Z7_FASTCALL | ||
1868 | Blake2s_Compress(UInt32 *s, const Byte *input) | ||
1869 | { | ||
1870 | UInt32 m[16]; | ||
1871 | UInt32 v[16]; | ||
1872 | { | ||
1873 | unsigned i; | ||
1874 | for (i = 0; i < 16; i++) | ||
1875 | m[i] = GetUi32(input + i * 4); | ||
1876 | } | ||
1877 | |||
1878 | #define INIT_v_FROM_s(i) v[i] = s[i]; | ||
1879 | |||
1880 | LOOP_8(INIT_v_FROM_s) | ||
1881 | |||
1882 | // Blake2s_Increment_Counter(s, Z7_BLAKE2S_BLOCK_SIZE) | ||
1883 | { | ||
1884 | const UInt32 t0 = STATE_T(s)[0] + Z7_BLAKE2S_BLOCK_SIZE; | ||
1885 | const UInt32 t1 = STATE_T(s)[1] + (t0 < Z7_BLAKE2S_BLOCK_SIZE); | ||
1886 | STATE_T(s)[0] = t0; | ||
1887 | STATE_T(s)[1] = t1; | ||
1888 | v[12] = t0 ^ KIV(4); | ||
1889 | v[13] = t1 ^ KIV(5); | ||
1890 | } | ||
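/* editor's note (added): (t0 < Z7_BLAKE2S_BLOCK_SIZE) is 1 exactly when the
   32-bit addition wrapped, so t0/t1 together maintain the 64-bit byte counter
   that BLAKE2s mixes into v[12]/v[13]. */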
1891 | // v[12] = STATE_T(s)[0] ^ KIV(4); | ||
1892 | // v[13] = STATE_T(s)[1] ^ KIV(5); | ||
1893 | v[14] = STATE_F(s)[0] ^ KIV(6); | ||
1894 | v[15] = STATE_F(s)[1] ^ KIV(7); | ||
1895 | |||
1896 | v[ 8] = KIV(0); | ||
1897 | v[ 9] = KIV(1); | ||
1898 | v[10] = KIV(2); | ||
1899 | v[11] = KIV(3); | ||
1900 | // PrintStates2((const UInt32 *)v, 1, 16); | ||
1901 | |||
1902 | #define ADD_SIGMA(a, index) V(a, 0) += *(const UInt32 *)GET_SIGMA_PTR(m, sigma[index]); | ||
1903 | #define ADD32M(dest, src, a) V(a, dest) += V(a, src); | ||
1904 | #define XOR32M(dest, src, a) V(a, dest) ^= V(a, src); | ||
1905 | #define RTR32M(dest, shift, a) V(a, dest) = rotrFixed(V(a, dest), shift); | ||
1906 | |||
1907 | // big interleaving can provide a big performance gain, if scheduler queues are small. | ||
1908 | #if 0 || 1 && defined(MY_CPU_X86) | ||
1909 | // interleave-1: for a small register count (x86 32-bit) | ||
1910 | #define G2(index, a, x, y) \ | ||
1911 | ADD_SIGMA (a, (index) + 2 * 0) \ | ||
1912 | ADD32M (0, 1, a) \ | ||
1913 | XOR32M (3, 0, a) \ | ||
1914 | RTR32M (3, x, a) \ | ||
1915 | ADD32M (2, 3, a) \ | ||
1916 | XOR32M (1, 2, a) \ | ||
1917 | RTR32M (1, y, a) \ | ||
1918 | |||
1919 | #define G(a) \ | ||
1920 | G2(a * 2 , a, 16, 12) \ | ||
1921 | G2(a * 2 + 1, a, 8, 7) \ | ||
1922 | |||
1923 | #define R2 \ | ||
1924 | G(0) \ | ||
1925 | G(1) \ | ||
1926 | G(2) \ | ||
1927 | G(3) \ | ||
1928 | G(4) \ | ||
1929 | G(5) \ | ||
1930 | G(6) \ | ||
1931 | G(7) \ | ||
1932 | |||
1933 | #elif 0 || 1 && defined(MY_CPU_X86_OR_AMD64) | ||
1934 | // interleave-2: good if the number of registers is not large (x86-64). | ||
1935 | |||
1936 | #define REP2(mac, dest, src, a, b) \ | ||
1937 | mac(dest, src, a) \ | ||
1938 | mac(dest, src, b) | ||
1939 | |||
1940 | #define G2(index, a, b, x, y) \ | ||
1941 | ADD_SIGMA (a, (index) + 2 * 0) \ | ||
1942 | ADD_SIGMA (b, (index) + 2 * 1) \ | ||
1943 | REP2 (ADD32M, 0, 1, a, b) \ | ||
1944 | REP2 (XOR32M, 3, 0, a, b) \ | ||
1945 | REP2 (RTR32M, 3, x, a, b) \ | ||
1946 | REP2 (ADD32M, 2, 3, a, b) \ | ||
1947 | REP2 (XOR32M, 1, 2, a, b) \ | ||
1948 | REP2 (RTR32M, 1, y, a, b) \ | ||
1949 | |||
1950 | #define G(a, b) \ | ||
1951 | G2(a * 2 , a, b, 16, 12) \ | ||
1952 | G2(a * 2 + 1, a, b, 8, 7) \ | ||
1953 | |||
1954 | #define R2 \ | ||
1955 | G(0, 1) \ | ||
1956 | G(2, 3) \ | ||
1957 | G(4, 5) \ | ||
1958 | G(6, 7) \ | ||
147 | 1959 | ||
148 | Blake2s_Increment_Counter(S, (UInt32)p->bufPos) | 1960 | #else |
149 | Blake2s_Set_LastBlock(p) | 1961 | // interleave-4: |
150 | memset(p->buf + p->bufPos, 0, BLAKE2S_BLOCK_SIZE - p->bufPos); | 1962 | // it causes high register pressure for x86/x64.
151 | Blake2s_Compress(p); | 1963 | // and MSVC compilers for x86/x64 are slow for this branch. |
1964 | // but if we have a large number of registers, this branch can be faster. | ||
152 | 1965 | ||
153 | for (i = 0; i < 8; i++) | 1966 | #define REP4(mac, dest, src, a, b, c, d) \ |
1967 | mac(dest, src, a) \ | ||
1968 | mac(dest, src, b) \ | ||
1969 | mac(dest, src, c) \ | ||
1970 | mac(dest, src, d) | ||
1971 | |||
1972 | #define G2(index, a, b, c, d, x, y) \ | ||
1973 | ADD_SIGMA (a, (index) + 2 * 0) \ | ||
1974 | ADD_SIGMA (b, (index) + 2 * 1) \ | ||
1975 | ADD_SIGMA (c, (index) + 2 * 2) \ | ||
1976 | ADD_SIGMA (d, (index) + 2 * 3) \ | ||
1977 | REP4 (ADD32M, 0, 1, a, b, c, d) \ | ||
1978 | REP4 (XOR32M, 3, 0, a, b, c, d) \ | ||
1979 | REP4 (RTR32M, 3, x, a, b, c, d) \ | ||
1980 | REP4 (ADD32M, 2, 3, a, b, c, d) \ | ||
1981 | REP4 (XOR32M, 1, 2, a, b, c, d) \ | ||
1982 | REP4 (RTR32M, 1, y, a, b, c, d) \ | ||
1983 | |||
1984 | #define G(a, b, c, d) \ | ||
1985 | G2(a * 2 , a, b, c, d, 16, 12) \ | ||
1986 | G2(a * 2 + 1, a, b, c, d, 8, 7) \ | ||
1987 | |||
1988 | #define R2 \ | ||
1989 | G(0, 1, 2, 3) \ | ||
1990 | G(4, 5, 6, 7) \ | ||
1991 | |||
1992 | #endif | ||
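Whichever interleaving branch is selected, the ADD_SIGMA/ADD32M/XOR32M/RTR32M chain expands to the standard BLAKE2s quarter-round G with rotation constants 16, 12, 8, 7 and two message words per G. A plain, non-interleaved reference form for comparison (our sketch, not part of the source):

#include <stdint.h>

static uint32_t rotr32(uint32_t x, unsigned n)
{
  return (x >> n) | (x << (32 - n));
}

/* one quarter-round on v[a], v[b], v[c], v[d] with message words x, y */
static void G_ref(uint32_t v[16], int a, int b, int c, int d,
                  uint32_t x, uint32_t y)
{
  v[a] += v[b] + x;  v[d] = rotr32(v[d] ^ v[a], 16);
  v[c] += v[d];      v[b] = rotr32(v[b] ^ v[c], 12);
  v[a] += v[b] + y;  v[d] = rotr32(v[d] ^ v[a], 8);
  v[c] += v[d];      v[b] = rotr32(v[b] ^ v[c], 7);
}

int main(void)
{
  uint32_t v[16] = { 0 };
  G_ref(v, 0, 4, 8, 12, 1, 2);  /* one column step of a round */
  return 0;
}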
1993 | |||
1994 | #define R(r) { const Byte *sigma = k_Blake2s_Sigma_4[r]; R2 } | ||
1995 | |||
1996 | // Z7_BLAKE2S_UNROLL gives 5-6 KB larger code, but faster: | ||
1997 | // 20-40% faster for (x86/x64) VC2010+/GCC/CLANG. | ||
1998 | // 30-60% faster for (arm64-arm32) GCC. | ||
1999 | // 5-11% faster for (arm64) CLANG-MAC. | ||
2000 | // so Z7_BLAKE2S_UNROLL is a good optimization if there is no vector branch. | ||
2001 | // But if there is a vector branch (for x86*), this scalar code will mostly be unused. | ||
2002 | // So we want smaller code (without unrolling) in that case (x86*). | ||
2003 | #if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS) | ||
2004 | #define Z7_BLAKE2S_UNROLL | ||
2005 | #endif | ||
2006 | |||
2007 | #ifdef Z7_BLAKE2S_UNROLL | ||
2008 | ROUNDS_LOOP_UNROLLED (R) | ||
2009 | #else | ||
2010 | ROUNDS_LOOP (R) | ||
2011 | #endif | ||
2012 | |||
2013 | #undef G | ||
2014 | #undef G2 | ||
2015 | #undef R | ||
2016 | #undef R2 | ||
2017 | |||
2018 | // printf("\n v after: \n"); | ||
2019 | // PrintStates2((const UInt32 *)v, 1, 16); | ||
2020 | #define XOR_s_PAIR_v(i) s[i] ^= v[i] ^ v[i + 8]; | ||
2021 | |||
2022 | LOOP_8(XOR_s_PAIR_v) | ||
2023 | // printf("\n s after:\n"); | ||
2024 | // PrintStates2((const UInt32 *)s, 1, 16); | ||
2025 | } | ||
2026 | |||
2027 | |||
2028 | static | ||
2029 | Z7_NO_INLINE | ||
2030 | void | ||
2031 | Z7_FASTCALL | ||
2032 | Blake2sp_Compress2(UInt32 *s_items, const Byte *data, const Byte *end) | ||
2033 | { | ||
2034 | size_t pos = 0; | ||
2035 | // PrintStates2(s_items, 8, 16); | ||
2036 | do | ||
154 | { | 2037 | { |
155 | SetUi32(digest + sizeof(p->h[i]) * i, p->h[i]) | 2038 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); |
2039 | Blake2s_Compress(s, data); | ||
2040 | data += Z7_BLAKE2S_BLOCK_SIZE; | ||
2041 | pos += Z7_BLAKE2S_BLOCK_SIZE; | ||
2042 | pos &= SUPER_BLOCK_MASK; | ||
156 | } | 2043 | } |
2044 | while (data != end); | ||
157 | } | 2045 | } |
158 | 2046 | ||
159 | 2047 | ||
160 | /* ---------- BLAKE2s ---------- */ | 2048 | #ifdef Z7_BLAKE2S_USE_VECTORS |
2049 | |||
2050 | static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = Blake2sp_Compress2; | ||
2051 | static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = Blake2sp_Compress2; | ||
2052 | static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Init; | ||
2053 | static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Final; | ||
2054 | static unsigned g_z7_Blake2sp_SupportedFlags; | ||
2055 | |||
2056 | #define Z7_BLAKE2SP_Compress_Fast(p) (p)->u.header.func_Compress_Fast | ||
2057 | #define Z7_BLAKE2SP_Compress_Single(p) (p)->u.header.func_Compress_Single | ||
2058 | #else | ||
2059 | #define Z7_BLAKE2SP_Compress_Fast(p) Blake2sp_Compress2 | ||
2060 | #define Z7_BLAKE2SP_Compress_Single(p) Blake2sp_Compress2 | ||
2061 | #endif // Z7_BLAKE2S_USE_VECTORS | ||
2062 | |||
161 | 2063 | ||
162 | /* we need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0() */ | 2064 | #if 1 && defined(MY_CPU_LE) |
2065 | #define GET_DIGEST(_s, _digest) \ | ||
2066 | { memcpy(_digest, _s, Z7_BLAKE2S_DIGEST_SIZE); } | ||
2067 | #else | ||
2068 | #define GET_DIGEST(_s, _digest) \ | ||
2069 | { unsigned _i; for (_i = 0; _i < 8; _i++) \ | ||
2070 | { SetUi32((_digest) + 4 * _i, (_s)[_i]) } \ | ||
2071 | } | ||
2072 | #endif | ||
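GET_DIGEST's fast path relies on the state words already lying in digest byte order on little-endian targets, while the portable branch stores each UInt32 explicitly. A standalone check of that equivalence:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
  const uint32_t s[2] = { 0x03020100u, 0x07060504u };
  unsigned char a[8], b[8];
  int i, j;
  memcpy(a, s, sizeof(s));                /* raw state bytes */
  for (i = 0; i < 2; i++)                 /* explicit little-endian stores */
    for (j = 0; j < 4; j++)
      b[4 * i + j] = (unsigned char)(s[i] >> (8 * j));
  puts(memcmp(a, b, 8) == 0
      ? "little-endian: memcpy path is equivalent"
      : "big-endian: per-word stores are required");
  return 0;
}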
2073 | |||
2074 | |||
2075 | /* ---------- BLAKE2s ---------- */ | ||
163 | /* | 2076 | /* |
2077 | // we need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0() | ||
164 | typedef struct | 2078 | typedef struct |
165 | { | 2079 | { |
166 | Byte digest_length; | 2080 | Byte digest_length; |
167 | Byte key_length; | 2081 | Byte key_length; |
168 | Byte fanout; | 2082 | Byte fanout; // = 1 : in sequential mode |
169 | Byte depth; | 2083 | Byte depth; // = 1 : in sequential mode |
170 | UInt32 leaf_length; | 2084 | UInt32 leaf_length; |
171 | Byte node_offset[6]; | 2085 | Byte node_offset[6]; // 0 for the first, leftmost, leaf, or in sequential mode |
172 | Byte node_depth; | 2086 | Byte node_depth; // 0 for the leaves, or in sequential mode |
173 | Byte inner_length; | 2087 | Byte inner_length; // [0, 32], 0 in sequential mode |
174 | Byte salt[BLAKE2S_SALTBYTES]; | 2088 | Byte salt[BLAKE2S_SALTBYTES]; |
175 | Byte personal[BLAKE2S_PERSONALBYTES]; | 2089 | Byte personal[BLAKE2S_PERSONALBYTES]; |
176 | } CBlake2sParam; | 2090 | } CBlake2sParam; |
177 | */ | 2091 | */ |
178 | 2092 | ||
2093 | #define k_Blake2sp_IV_0 \ | ||
2094 | (KIV(0) ^ (Z7_BLAKE2S_DIGEST_SIZE | ((UInt32)Z7_BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24))) | ||
2095 | #define k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth) \ | ||
2096 | (KIV(3) ^ ((UInt32)(node_depth) << 16) ^ ((UInt32)Z7_BLAKE2S_DIGEST_SIZE << 24)) | ||
179 | 2097 | ||
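k_Blake2sp_IV_0 folds the first word of the BLAKE2s parameter block directly into IV[0]: digest_length in the low byte, then key_length, fanout, and depth. A small check of that packing against the constant above (the helper name is ours):

#include <stdint.h>
#include <assert.h>

static uint32_t param_word0(unsigned digest_len, unsigned key_len,
                            unsigned fanout, unsigned depth)
{
  return (uint32_t)digest_len | ((uint32_t)key_len << 8)
       | ((uint32_t)fanout << 16) | ((uint32_t)depth << 24);
}

int main(void)
{
  /* BLAKE2sp leaves: 32-byte digest, no key, fanout 8, depth 2 */
  assert(param_word0(32, 0, 8, 2) == (32u | (8u << 16) | (2u << 24)));
  return 0;
}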
180 | static void Blake2sp_Init_Spec(CBlake2s *p, unsigned node_offset, unsigned node_depth) | 2098 | Z7_FORCE_INLINE |
2099 | static void Blake2sp_Init_Spec(UInt32 *s, unsigned node_offset, unsigned node_depth) | ||
181 | { | 2100 | { |
182 | Blake2s_Init0(p); | 2101 | s[0] = k_Blake2sp_IV_0; |
183 | 2102 | s[1] = KIV(1); | |
184 | p->h[0] ^= (BLAKE2S_DIGEST_SIZE | ((UInt32)BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24)); | 2103 | s[2] = KIV(2) ^ (UInt32)node_offset; |
185 | p->h[2] ^= ((UInt32)node_offset); | 2104 | s[3] = k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth); |
186 | p->h[3] ^= ((UInt32)node_depth << 16) | ((UInt32)BLAKE2S_DIGEST_SIZE << 24); | 2105 | s[4] = KIV(4); |
187 | /* | 2106 | s[5] = KIV(5); |
188 | P->digest_length = BLAKE2S_DIGEST_SIZE; | 2107 | s[6] = KIV(6); |
189 | P->key_length = 0; | 2108 | s[7] = KIV(7); |
190 | P->fanout = BLAKE2SP_PARALLEL_DEGREE; | 2109 | |
191 | P->depth = 2; | 2110 | STATE_T(s)[0] = 0; |
192 | P->leaf_length = 0; | 2111 | STATE_T(s)[1] = 0; |
193 | store48(P->node_offset, node_offset); | 2112 | STATE_F(s)[0] = 0; |
194 | P->node_depth = node_depth; | 2113 | STATE_F(s)[1] = 0; |
195 | P->inner_length = BLAKE2S_DIGEST_SIZE; | ||
196 | */ | ||
197 | } | 2114 | } |
198 | 2115 | ||
199 | 2116 | ||
2117 | #ifdef Z7_BLAKE2S_USE_V128_FAST | ||
2118 | |||
2119 | static | ||
2120 | Z7_NO_INLINE | ||
2121 | #ifdef BLAKE2S_ATTRIB_128BIT | ||
2122 | BLAKE2S_ATTRIB_128BIT | ||
2123 | #endif | ||
2124 | void | ||
2125 | Z7_FASTCALL | ||
2126 | Blake2sp_InitState_V128_Fast(UInt32 *states) | ||
2127 | { | ||
2128 | #define STORE_128_PAIR_INIT_STATES_2(i, t0, t1) \ | ||
2129 | { STORE_128_TO_STRUCT(states + 0 + 4 * (i), (t0)); \ | ||
2130 | STORE_128_TO_STRUCT(states + 32 + 4 * (i), (t1)); \ | ||
2131 | } | ||
2132 | #define STORE_128_PAIR_INIT_STATES_1(i, mac) \ | ||
2133 | { const __m128i t = mac; \ | ||
2134 | STORE_128_PAIR_INIT_STATES_2(i, t, t) \ | ||
2135 | } | ||
2136 | #define STORE_128_PAIR_INIT_STATES_IV(i) \ | ||
2137 | STORE_128_PAIR_INIT_STATES_1(i, GET_128_IV_WAY4(i)) | ||
2138 | |||
2139 | STORE_128_PAIR_INIT_STATES_1 (0, _mm_set1_epi32((Int32)k_Blake2sp_IV_0)) | ||
2140 | STORE_128_PAIR_INIT_STATES_IV (1) | ||
2141 | { | ||
2142 | const __m128i t = GET_128_IV_WAY4(2); | ||
2143 | STORE_128_PAIR_INIT_STATES_2 (2, | ||
2144 | XOR_128(t, _mm_set_epi32(3, 2, 1, 0)), | ||
2145 | XOR_128(t, _mm_set_epi32(7, 6, 5, 4))) | ||
2146 | } | ||
2147 | STORE_128_PAIR_INIT_STATES_1 (3, _mm_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0))) | ||
2148 | STORE_128_PAIR_INIT_STATES_IV (4) | ||
2149 | STORE_128_PAIR_INIT_STATES_IV (5) | ||
2150 | STORE_128_PAIR_INIT_STATES_IV (6) | ||
2151 | STORE_128_PAIR_INIT_STATES_IV (7) | ||
2152 | STORE_128_PAIR_INIT_STATES_1 (16, _mm_set_epi32(0, 0, 0, 0)) | ||
2153 | // printf("\n== exit Blake2sp_InitState_V128_Fast ctr=%d\n", states[64]); | ||
2154 | } | ||
2155 | |||
2156 | #endif // Z7_BLAKE2S_USE_V128_FAST | ||
2157 | |||
2158 | |||
2159 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
2160 | |||
2161 | static | ||
2162 | Z7_NO_INLINE | ||
2163 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
2164 | BLAKE2S_ATTRIB_AVX2 | ||
2165 | #endif | ||
2166 | void | ||
2167 | Z7_FASTCALL | ||
2168 | Blake2sp_InitState_AVX2_Fast(UInt32 *states) | ||
2169 | { | ||
2170 | #define STORE_256_INIT_STATES(i, t) \ | ||
2171 | STORE_256_TO_STRUCT(states + 8 * (i), t); | ||
2172 | #define STORE_256_INIT_STATES_IV(i) \ | ||
2173 | STORE_256_INIT_STATES(i, GET_256_IV_WAY8(i)) | ||
2174 | |||
2175 | STORE_256_INIT_STATES (0, _mm256_set1_epi32((Int32)k_Blake2sp_IV_0)) | ||
2176 | STORE_256_INIT_STATES_IV (1) | ||
2177 | STORE_256_INIT_STATES (2, XOR_256( GET_256_IV_WAY8(2), | ||
2178 | _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0))) | ||
2179 | STORE_256_INIT_STATES (3, _mm256_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0))) | ||
2180 | STORE_256_INIT_STATES_IV (4) | ||
2181 | STORE_256_INIT_STATES_IV (5) | ||
2182 | STORE_256_INIT_STATES_IV (6) | ||
2183 | STORE_256_INIT_STATES_IV (7) | ||
2184 | STORE_256_INIT_STATES (8, _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0)) | ||
2185 | // printf("\n== exit Blake2sp_InitState_AVX2_Fast\n"); | ||
2186 | } | ||
2187 | |||
2188 | #endif // Z7_BLAKE2S_USE_AVX2_FAST | ||
2189 | |||
2190 | |||
2191 | |||
2192 | Z7_NO_INLINE | ||
2193 | void Blake2sp_InitState(CBlake2sp *p) | ||
2194 | { | ||
2195 | size_t i; | ||
2196 | // memset(p->states, 0, sizeof(p->states)); // for debug | ||
2197 | p->u.header.cycPos = 0; | ||
2198 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS | ||
2199 | if (p->u.header.func_Init) | ||
2200 | { | ||
2201 | p->u.header.func_Init(p->states); | ||
2202 | return; | ||
2203 | } | ||
2204 | #endif | ||
2205 | for (i = 0; i < Z7_BLAKE2SP_PARALLEL_DEGREE; i++) | ||
2206 | Blake2sp_Init_Spec(p->states + i * NSW, (unsigned)i, 0); | ||
2207 | } | ||
2208 | |||
200 | void Blake2sp_Init(CBlake2sp *p) | 2209 | void Blake2sp_Init(CBlake2sp *p) |
201 | { | 2210 | { |
202 | unsigned i; | 2211 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS |
203 | 2212 | p->u.header.func_Compress_Fast = | |
204 | p->bufPos = 0; | 2213 | #ifdef Z7_BLAKE2S_USE_VECTORS |
2214 | g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast; | ||
2215 | #else | ||
2216 | NULL; | ||
2217 | #endif | ||
2218 | |||
2219 | p->u.header.func_Compress_Single = | ||
2220 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
2221 | g_Z7_BLAKE2SP_FUNC_COMPRESS_Single; | ||
2222 | #else | ||
2223 | NULL; | ||
2224 | #endif | ||
2225 | |||
2226 | p->u.header.func_Init = | ||
2227 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
2228 | g_Z7_BLAKE2SP_FUNC_INIT_Init; | ||
2229 | #else | ||
2230 | NULL; | ||
2231 | #endif | ||
205 | 2232 | ||
206 | for (i = 0; i < BLAKE2SP_PARALLEL_DEGREE; i++) | 2233 | p->u.header.func_Final = |
207 | Blake2sp_Init_Spec(&p->S[i], i, 0); | 2234 | #ifdef Z7_BLAKE2S_USE_VECTORS |
2235 | g_Z7_BLAKE2SP_FUNC_INIT_Final; | ||
2236 | #else | ||
2237 | NULL; | ||
2238 | #endif | ||
2239 | #endif | ||
208 | 2240 | ||
209 | p->S[BLAKE2SP_PARALLEL_DEGREE - 1].lastNode_f1 = BLAKE2S_FINAL_FLAG; | 2241 | Blake2sp_InitState(p); |
210 | } | 2242 | } |
211 | 2243 | ||
212 | 2244 | ||
213 | void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size) | 2245 | void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size) |
214 | { | 2246 | { |
215 | unsigned pos = p->bufPos; | 2247 | size_t pos; |
216 | while (size != 0) | 2248 | // printf("\nsize = 0x%6x, cycPos = %5u data = %p\n", (unsigned)size, (unsigned)p->u.header.cycPos, data); |
2249 | if (size == 0) | ||
2250 | return; | ||
2251 | pos = p->u.header.cycPos; | ||
2252 | // pos < SUPER_BLOCK_SIZE * 2 : is expected | ||
2253 | // pos == SUPER_BLOCK_SIZE * 2 : is not expected, but is also supported | ||
2254 | { | ||
2255 | const size_t pos2 = pos & SUPER_BLOCK_MASK; | ||
2256 | if (pos2) | ||
2257 | { | ||
2258 | const size_t rem = SUPER_BLOCK_SIZE - pos2; | ||
2259 | if (rem > size) | ||
2260 | { | ||
2261 | p->u.header.cycPos = (unsigned)(pos + size); | ||
2262 | // cycPos < SUPER_BLOCK_SIZE * 2 | ||
2263 | memcpy((Byte *)(void *)p->buf32 + pos, data, size); | ||
2264 | /* to simplify the code here we don't try to process the first superblock, | ||
2265 | if (cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) */ | ||
2266 | return; | ||
2267 | } | ||
2268 | // (rem <= size) | ||
2269 | memcpy((Byte *)(void *)p->buf32 + pos, data, rem); | ||
2270 | pos += rem; | ||
2271 | data += rem; | ||
2272 | size -= rem; | ||
2273 | } | ||
2274 | } | ||
2275 | |||
2276 | // pos <= SUPER_BLOCK_SIZE * 2 | ||
2277 | // pos % SUPER_BLOCK_SIZE == 0 | ||
2278 | if (pos) | ||
2279 | { | ||
2280 | /* pos == SUPER_BLOCK_SIZE || | ||
2281 | pos == SUPER_BLOCK_SIZE * 2 */ | ||
2282 | size_t end = pos; | ||
2283 | if (size > SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE | ||
2284 | || (end -= SUPER_BLOCK_SIZE)) | ||
2285 | { | ||
2286 | Z7_BLAKE2SP_Compress_Fast(p)(p->states, | ||
2287 | (const Byte *)(const void *)p->buf32, | ||
2288 | (const Byte *)(const void *)p->buf32 + end); | ||
2289 | if (pos -= end) | ||
2290 | memcpy(p->buf32, (const Byte *)(const void *)p->buf32 | ||
2291 | + SUPER_BLOCK_SIZE, SUPER_BLOCK_SIZE); | ||
2292 | } | ||
2293 | } | ||
2294 | |||
2295 | // pos == 0 || (pos == SUPER_BLOCK_SIZE && size <= SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE) | ||
2296 | if (size > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) | ||
2297 | { | ||
2298 | // pos == 0 | ||
2299 | const Byte *end; | ||
2300 | const size_t size2 = (size - (SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE + 1)) | ||
2301 | & ~(size_t)SUPER_BLOCK_MASK; | ||
2302 | size -= size2; | ||
2303 | // size < SUPER_BLOCK_SIZE * 2 | ||
2304 | end = data + size2; | ||
2305 | Z7_BLAKE2SP_Compress_Fast(p)(p->states, data, end); | ||
2306 | data = end; | ||
2307 | } | ||
2308 | |||
2309 | if (size != 0) | ||
217 | { | 2310 | { |
218 | unsigned index = pos / BLAKE2S_BLOCK_SIZE; | 2311 | memcpy((Byte *)(void *)p->buf32 + pos, data, size); |
219 | unsigned rem = BLAKE2S_BLOCK_SIZE - (pos & (BLAKE2S_BLOCK_SIZE - 1)); | 2312 | pos += size; |
220 | if (rem > size) | ||
221 | rem = (unsigned)size; | ||
222 | Blake2s_Update(&p->S[index], data, rem); | ||
223 | size -= rem; | ||
224 | data += rem; | ||
225 | pos += rem; | ||
226 | pos &= (BLAKE2S_BLOCK_SIZE * BLAKE2SP_PARALLEL_DEGREE - 1); | ||
227 | } | 2313 | } |
228 | p->bufPos = pos; | 2314 | p->u.header.cycPos = (unsigned)pos; |
2315 | // cycPos < SUPER_BLOCK_SIZE * 2 | ||
229 | } | 2316 | } |
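Stripped of the two-superblock ring buffer and of the requirement to keep the final blocks buffered (they still need the last-block flag), Blake2sp_Update follows the usual streaming-hash shape: top up a partial buffer, compress whole blocks straight from the input, buffer the tail. A reduced sketch with a stand-in compress() (not the source's logic):

#include <stddef.h>
#include <string.h>

#define BLOCK 64
typedef struct { unsigned char buf[BLOCK]; size_t pos; unsigned sum; } Ctx;

static void compress(Ctx *c, const unsigned char *blk)  /* stand-in only */
{
  size_t i;
  for (i = 0; i < BLOCK; i++)
    c->sum += blk[i];
}

static void update(Ctx *c, const unsigned char *data, size_t size)
{
  if (c->pos)                           /* top up the partial block */
  {
    size_t rem = BLOCK - c->pos;
    if (rem > size)
      rem = size;
    memcpy(c->buf + c->pos, data, rem);
    c->pos += rem; data += rem; size -= rem;
    if (c->pos == BLOCK) { compress(c, c->buf); c->pos = 0; }
  }
  for (; size >= BLOCK; data += BLOCK, size -= BLOCK)
    compress(c, data);                  /* whole blocks from the input */
  memcpy(c->buf + c->pos, data, size);  /* tail (pos or size is 0 here) */
  c->pos += size;
}

int main(void)
{
  Ctx c = { { 0 }, 0, 0 };
  unsigned char msg[150] = { 1 };
  update(&c, msg, sizeof(msg));         /* 2 blocks compressed, 22 buffered */
  return 0;
}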
230 | 2317 | ||
231 | 2318 | ||
232 | void Blake2sp_Final(CBlake2sp *p, Byte *digest) | 2319 | void Blake2sp_Final(CBlake2sp *p, Byte *digest) |
233 | { | 2320 | { |
234 | CBlake2s R; | 2321 | // UInt32 * const R_states = p->states; |
235 | unsigned i; | 2322 | // printf("\nBlake2sp_Final \n"); |
2323 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS | ||
2324 | if (p->u.header.func_Final) | ||
2325 | p->u.header.func_Final(p->states); | ||
2326 | #endif | ||
2327 | // printf("\n=====\nBlake2sp_Final \n"); | ||
2328 | // PrintStates(p->states, 32); | ||
2329 | |||
2330 | // (p->u.header.cycPos == SUPER_BLOCK_SIZE) can be processed in any branch: | ||
2331 | if (p->u.header.cycPos <= SUPER_BLOCK_SIZE) | ||
2332 | { | ||
2333 | unsigned pos; | ||
2334 | memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, | ||
2335 | 0, SUPER_BLOCK_SIZE - p->u.header.cycPos); | ||
2336 | STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG; | ||
2337 | for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE) | ||
2338 | { | ||
2339 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos); | ||
2340 | Blake2s_Set_LastBlock(s) | ||
2341 | if (pos + Z7_BLAKE2S_BLOCK_SIZE > p->u.header.cycPos) | ||
2342 | { | ||
2343 | UInt32 delta = Z7_BLAKE2S_BLOCK_SIZE; | ||
2344 | if (pos < p->u.header.cycPos) | ||
2345 | delta -= p->u.header.cycPos & (Z7_BLAKE2S_BLOCK_SIZE - 1); | ||
2346 | // 0 < delta <= Z7_BLAKE2S_BLOCK_SIZE | ||
2347 | { | ||
2348 | const UInt32 v = STATE_T(s)[0]; | ||
2349 | STATE_T(s)[1] -= v < delta; // (v < delta) is the same condition here as (v == 0) | ||
2350 | STATE_T(s)[0] = v - delta; | ||
2351 | } | ||
2352 | } | ||
2353 | } | ||
2354 | // PrintStates(p->states, 16); | ||
2355 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
2356 | (Byte *)(void *)p->buf32, | ||
2357 | (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE); | ||
2358 | // PrintStates(p->states, 16); | ||
2359 | } | ||
2360 | else | ||
2361 | { | ||
2362 | // (p->u.header.cycPos > SUPER_BLOCK_SIZE) | ||
2363 | unsigned pos; | ||
2364 | for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE) | ||
2365 | { | ||
2366 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos); | ||
2367 | if (pos + SUPER_BLOCK_SIZE >= p->u.header.cycPos) | ||
2368 | Blake2s_Set_LastBlock(s) | ||
2369 | } | ||
2370 | if (p->u.header.cycPos <= SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) | ||
2371 | STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG; | ||
2372 | |||
2373 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
2374 | (Byte *)(void *)p->buf32, | ||
2375 | (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE); | ||
236 | 2376 | ||
237 | Blake2sp_Init_Spec(&R, 0, 1); | 2377 | // if (p->u.header.cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) |
238 | R.lastNode_f1 = BLAKE2S_FINAL_FLAG; | 2378 | STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG; |
2379 | |||
2380 | // if (p->u.header.cycPos != SUPER_BLOCK_SIZE) | ||
2381 | { | ||
2382 | pos = SUPER_BLOCK_SIZE; | ||
2383 | for (;;) | ||
2384 | { | ||
2385 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos & SUPER_BLOCK_MASK); | ||
2386 | Blake2s_Set_LastBlock(s) | ||
2387 | pos += Z7_BLAKE2S_BLOCK_SIZE; | ||
2388 | if (pos >= p->u.header.cycPos) | ||
2389 | { | ||
2390 | if (pos != p->u.header.cycPos) | ||
2391 | { | ||
2392 | const UInt32 delta = pos - p->u.header.cycPos; | ||
2393 | const UInt32 v = STATE_T(s)[0]; | ||
2394 | STATE_T(s)[1] -= v < delta; | ||
2395 | STATE_T(s)[0] = v - delta; | ||
2396 | memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, 0, delta); | ||
2397 | } | ||
2398 | break; | ||
2399 | } | ||
2400 | } | ||
2401 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
2402 | (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE, | ||
2403 | (Byte *)(void *)p->buf32 + pos); | ||
2404 | } | ||
2405 | } | ||
239 | 2406 | ||
240 | for (i = 0; i < BLAKE2SP_PARALLEL_DEGREE; i++) | ||
241 | { | 2407 | { |
242 | Byte hash[BLAKE2S_DIGEST_SIZE]; | 2408 | size_t pos; |
243 | Blake2s_Final(&p->S[i], hash); | 2409 | for (pos = 0; pos < SUPER_BLOCK_SIZE / 2; pos += Z7_BLAKE2S_BLOCK_SIZE / 2) |
244 | Blake2s_Update(&R, hash, BLAKE2S_DIGEST_SIZE); | 2410 | { |
2411 | const UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, (pos * 2)); | ||
2412 | Byte *dest = (Byte *)(void *)p->buf32 + pos; | ||
2413 | GET_DIGEST(s, dest) | ||
2414 | } | ||
245 | } | 2415 | } |
2416 | Blake2sp_Init_Spec(p->states, 0, 1); | ||
2417 | { | ||
2418 | size_t pos; | ||
2419 | for (pos = 0; pos < (Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2S_DIGEST_SIZE) | ||
2420 | - Z7_BLAKE2S_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE) | ||
2421 | { | ||
2422 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
2423 | (const Byte *)(const void *)p->buf32 + pos, | ||
2424 | (const Byte *)(const void *)p->buf32 + pos + Z7_BLAKE2S_BLOCK_SIZE); | ||
2425 | } | ||
2426 | } | ||
2427 | // Blake2s_Final(p->states, 0, digest, p, (Byte *)(void *)p->buf32 + i); | ||
2428 | Blake2s_Set_LastBlock(p->states) | ||
2429 | STATE_F(p->states)[1] = BLAKE2S_FINAL_FLAG; | ||
2430 | { | ||
2431 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
2432 | (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE, | ||
2433 | (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE); | ||
2434 | } | ||
2435 | GET_DIGEST(p->states, digest) | ||
2436 | // printf("\n Blake2sp_Final 555 numDataInBufs = %5u\n", (unsigned)p->u.header.numDataInBufs); | ||
2437 | } | ||
2438 | |||
246 | 2439 | ||
247 | Blake2s_Final(&R, digest); | 2440 | BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo) |
2441 | { | ||
2442 | // printf("\n========== setfunction = %d ======== \n", algo); | ||
2443 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS | ||
2444 | Z7_BLAKE2SP_FUNC_COMPRESS func = NULL; | ||
2445 | Z7_BLAKE2SP_FUNC_COMPRESS func_Single = NULL; | ||
2446 | Z7_BLAKE2SP_FUNC_INIT func_Final = NULL; | ||
2447 | Z7_BLAKE2SP_FUNC_INIT func_Init = NULL; | ||
2448 | #else | ||
2449 | UNUSED_VAR(p) | ||
2450 | #endif | ||
2451 | |||
2452 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
2453 | |||
2454 | func = func_Single = Blake2sp_Compress2; | ||
2455 | |||
2456 | if (algo != Z7_BLAKE2SP_ALGO_SCALAR) | ||
2457 | { | ||
2458 | // printf("\n========== setfunction NON-SCALER ======== \n"); | ||
2459 | if (algo == Z7_BLAKE2SP_ALGO_DEFAULT) | ||
2460 | { | ||
2461 | func = g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast; | ||
2462 | func_Single = g_Z7_BLAKE2SP_FUNC_COMPRESS_Single; | ||
2463 | func_Init = g_Z7_BLAKE2SP_FUNC_INIT_Init; | ||
2464 | func_Final = g_Z7_BLAKE2SP_FUNC_INIT_Final; | ||
2465 | } | ||
2466 | else | ||
2467 | { | ||
2468 | if ((g_z7_Blake2sp_SupportedFlags & (1u << algo)) == 0) | ||
2469 | return False; | ||
2470 | |||
2471 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
2472 | |||
2473 | func_Single = | ||
2474 | #if defined(Z7_BLAKE2S_USE_AVX2_WAY2) | ||
2475 | Blake2sp_Compress2_AVX2_Way2; | ||
2476 | #else | ||
2477 | Z7_BLAKE2S_Compress2_V128; | ||
2478 | #endif | ||
2479 | |||
2480 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
2481 | if (algo == Z7_BLAKE2SP_ALGO_V256_FAST) | ||
2482 | { | ||
2483 | func = Blake2sp_Compress2_AVX2_Fast; | ||
2484 | func_Final = Blake2sp_Final_AVX2_Fast; | ||
2485 | func_Init = Blake2sp_InitState_AVX2_Fast; | ||
2486 | } | ||
2487 | else | ||
2488 | #endif | ||
2489 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY2 | ||
2490 | if (algo == Z7_BLAKE2SP_ALGO_V256_WAY2) | ||
2491 | func = Blake2sp_Compress2_AVX2_Way2; | ||
2492 | else | ||
2493 | #endif | ||
2494 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY4 | ||
2495 | if (algo == Z7_BLAKE2SP_ALGO_V256_WAY4) | ||
2496 | { | ||
2497 | func_Single = func = Blake2sp_Compress2_AVX2_Way4; | ||
2498 | } | ||
2499 | else | ||
2500 | #endif | ||
2501 | #endif // avx2 | ||
2502 | { | ||
2503 | if (algo == Z7_BLAKE2SP_ALGO_V128_FAST) | ||
2504 | { | ||
2505 | func = Blake2sp_Compress2_V128_Fast; | ||
2506 | func_Final = Blake2sp_Final_V128_Fast; | ||
2507 | func_Init = Blake2sp_InitState_V128_Fast; | ||
2508 | func_Single = Z7_BLAKE2S_Compress2_V128; | ||
2509 | } | ||
2510 | else | ||
2511 | #ifdef Z7_BLAKE2S_USE_V128_WAY2 | ||
2512 | if (algo == Z7_BLAKE2SP_ALGO_V128_WAY2) | ||
2513 | func = func_Single = Blake2sp_Compress2_V128_Way2; | ||
2514 | else | ||
2515 | #endif | ||
2516 | { | ||
2517 | if (algo != Z7_BLAKE2SP_ALGO_V128_WAY1) | ||
2518 | return False; | ||
2519 | func = func_Single = Blake2sp_Compress2_V128_Way1; | ||
2520 | } | ||
2521 | } | ||
2522 | } | ||
2523 | } | ||
2524 | #else // !VECTORS | ||
2525 | if (algo > 1) // Z7_BLAKE2SP_ALGO_SCALAR | ||
2526 | return False; | ||
2527 | #endif // !VECTORS | ||
2528 | |||
2529 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS | ||
2530 | p->u.header.func_Compress_Fast = func; | ||
2531 | p->u.header.func_Compress_Single = func_Single; | ||
2532 | p->u.header.func_Final = func_Final; | ||
2533 | p->u.header.func_Init = func_Init; | ||
2534 | #endif | ||
2535 | // printf("\n p->u.header.func_Compress = %p", p->u.header.func_Compress); | ||
2536 | return True; | ||
2537 | } | ||
2538 | |||
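Blake2sp_SetFunction above and z7_Black2sp_Prepare below are the two halves of a one-time dispatch: probe CPU features once, record the best routines in function pointers, then have every hash object call through them. The skeleton of that pattern as a runnable sketch (all names and the feature probe are stand-ins):

#include <stddef.h>
#include <stdio.h>

typedef void (*CompressFunc)(const unsigned char *data, size_t size);

static void compress_scalar(const unsigned char *d, size_t n)
{ (void)d; (void)n; puts("scalar path"); }

static void compress_vector(const unsigned char *d, size_t n)
{ (void)d; (void)n; puts("vector path"); }

static CompressFunc g_compress = compress_scalar;  /* safe default */

static int cpu_has_vectors(void) { return 0; }     /* stand-in probe */

static void prepare(void)                          /* call once at startup */
{
  if (cpu_has_vectors())
    g_compress = compress_vector;
}

int main(void)
{
  prepare();
  g_compress((const unsigned char *)"", 0);
  return 0;
}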
2539 | |||
2540 | void z7_Black2sp_Prepare(void) | ||
2541 | { | ||
2542 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
2543 | unsigned flags = 0; // (1u << Z7_BLAKE2SP_ALGO_V128_SCALAR); | ||
2544 | |||
2545 | Z7_BLAKE2SP_FUNC_COMPRESS func_Fast = Blake2sp_Compress2; | ||
2546 | Z7_BLAKE2SP_FUNC_COMPRESS func_Single = Blake2sp_Compress2; | ||
2547 | Z7_BLAKE2SP_FUNC_INIT func_Init = NULL; | ||
2548 | Z7_BLAKE2SP_FUNC_INIT func_Final = NULL; | ||
2549 | |||
2550 | #if defined(MY_CPU_X86_OR_AMD64) | ||
2551 | #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) | ||
2552 | if (CPU_IsSupported_AVX512F_AVX512VL()) | ||
2553 | #endif | ||
2554 | #if defined(Z7_BLAKE2S_USE_SSE41) | ||
2555 | if (CPU_IsSupported_SSE41()) | ||
2556 | #elif defined(Z7_BLAKE2S_USE_SSSE3) | ||
2557 | if (CPU_IsSupported_SSSE3()) | ||
2558 | #elif !defined(MY_CPU_AMD64) | ||
2559 | if (CPU_IsSupported_SSE2()) | ||
2560 | #endif | ||
2561 | #endif | ||
2562 | { | ||
2563 | #if defined(Z7_BLAKE2S_USE_SSE41) | ||
2564 | // printf("\n========== Blake2s SSE41 128-bit\n"); | ||
2565 | #elif defined(Z7_BLAKE2S_USE_SSSE3) | ||
2566 | // printf("\n========== Blake2s SSSE3 128-bit\n"); | ||
2567 | #else | ||
2568 | // printf("\n========== Blake2s SSE2 128-bit\n"); | ||
2569 | #endif | ||
2570 | // func_Fast = f_vector = Blake2sp_Compress2_V128_Way2; | ||
2571 | // printf("\n========== Blake2sp_Compress2_V128_Way2\n"); | ||
2572 | func_Fast = | ||
2573 | func_Single = Z7_BLAKE2S_Compress2_V128; | ||
2574 | flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY1); | ||
2575 | #ifdef Z7_BLAKE2S_USE_V128_WAY2 | ||
2576 | flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY2); | ||
2577 | #endif | ||
2578 | #ifdef Z7_BLAKE2S_USE_V128_FAST | ||
2579 | flags |= (1u << Z7_BLAKE2SP_ALGO_V128_FAST); | ||
2580 | func_Fast = Blake2sp_Compress2_V128_Fast; | ||
2581 | func_Init = Blake2sp_InitState_V128_Fast; | ||
2582 | func_Final = Blake2sp_Final_V128_Fast; | ||
2583 | #endif | ||
2584 | |||
2585 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
2586 | #if defined(MY_CPU_X86_OR_AMD64) | ||
2587 | if ( | ||
2588 | #if 0 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) | ||
2589 | CPU_IsSupported_AVX512F_AVX512VL() && | ||
2590 | #endif | ||
2591 | CPU_IsSupported_AVX2() | ||
2592 | ) | ||
2593 | #endif | ||
2594 | { | ||
2595 | // #pragma message ("=== Blake2s AVX2") | ||
2596 | // printf("\n========== Blake2s AVX2\n"); | ||
2597 | |||
2598 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY2 | ||
2599 | func_Single = Blake2sp_Compress2_AVX2_Way2; | ||
2600 | flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY2); | ||
2601 | #endif | ||
2602 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY4 | ||
2603 | flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY4); | ||
2604 | #endif | ||
2605 | |||
2606 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
2607 | flags |= (1u << Z7_BLAKE2SP_ALGO_V256_FAST); | ||
2608 | func_Fast = Blake2sp_Compress2_AVX2_Fast; | ||
2609 | func_Init = Blake2sp_InitState_AVX2_Fast; | ||
2610 | func_Final = Blake2sp_Final_AVX2_Fast; | ||
2611 | #elif defined(Z7_BLAKE2S_USE_AVX2_WAY4) | ||
2612 | func_Fast = Blake2sp_Compress2_AVX2_Way4; | ||
2613 | #elif defined(Z7_BLAKE2S_USE_AVX2_WAY2) | ||
2614 | func_Fast = Blake2sp_Compress2_AVX2_Way2; | ||
2615 | #endif | ||
2616 | } // avx2 | ||
2617 | #endif // avx2 | ||
2618 | } // sse* | ||
2619 | g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = func_Fast; | ||
2620 | g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = func_Single; | ||
2621 | g_Z7_BLAKE2SP_FUNC_INIT_Init = func_Init; | ||
2622 | g_Z7_BLAKE2SP_FUNC_INIT_Final = func_Final; | ||
2623 | g_z7_Blake2sp_SupportedFlags = flags; | ||
2624 | // printf("\nflags=%x\n", flags); | ||
2625 | #endif // vectors | ||
248 | } | 2626 | } |
249 | 2627 | ||
250 | #undef rotr32 | 2628 | /* |
2629 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
2630 | void align_test2(CBlake2sp *sp); | ||
2631 | void align_test2(CBlake2sp *sp) | ||
2632 | { | ||
2633 | __m128i a = LOAD_128(sp->states); | ||
2634 | D_XOR_128(a, LOAD_128(sp->states + 4)); | ||
2635 | STORE_128(sp->states, a); | ||
2636 | } | ||
2637 | void align_test2(void); | ||
2638 | void align_test2(void) | ||
2639 | { | ||
2640 | CBlake2sp sp; | ||
2641 | Blake2sp_Init(&sp); | ||
2642 | Blake2sp_Update(&sp, NULL, 0); | ||
2643 | } | ||
2644 | #endif | ||
2645 | */ | ||
@@ -1,11 +1,11 @@ | |||
1 | /* Bra.c -- Branch converters for RISC code | 1 | /* Bra.c -- Branch converters for RISC code |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-20 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
6 | #include "Bra.h" | 6 | #include "Bra.h" |
7 | #include "CpuArch.h" | ||
8 | #include "RotateDefs.h" | 7 | #include "RotateDefs.h" |
8 | #include "CpuArch.h" | ||
9 | 9 | ||
10 | #if defined(MY_CPU_SIZEOF_POINTER) \ | 10 | #if defined(MY_CPU_SIZEOF_POINTER) \ |
11 | && ( MY_CPU_SIZEOF_POINTER == 4 \ | 11 | && ( MY_CPU_SIZEOF_POINTER == 4 \ |
@@ -26,7 +26,7 @@ | |||
26 | #define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c; | 26 | #define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c; |
27 | // #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c; | 27 | // #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c; |
28 | 28 | ||
29 | #define Z7_BRANCH_CONV(name) z7_BranchConv_ ## name | 29 | #define Z7_BRANCH_CONV(name) z7_ ## name |
30 | 30 | ||
31 | #define Z7_BRANCH_FUNC_MAIN(name) \ | 31 | #define Z7_BRANCH_FUNC_MAIN(name) \ |
32 | static \ | 32 | static \ |
@@ -42,11 +42,11 @@ Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \ | |||
42 | 42 | ||
43 | #ifdef Z7_EXTRACT_ONLY | 43 | #ifdef Z7_EXTRACT_ONLY |
44 | #define Z7_BRANCH_FUNCS_IMP(name) \ | 44 | #define Z7_BRANCH_FUNCS_IMP(name) \ |
45 | Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC, 0) | 45 | Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) |
46 | #else | 46 | #else |
47 | #define Z7_BRANCH_FUNCS_IMP(name) \ | 47 | #define Z7_BRANCH_FUNCS_IMP(name) \ |
48 | Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC, 0) \ | 48 | Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) \ |
49 | Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC, 1) | 49 | Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC_2, 1) |
50 | #endif | 50 | #endif |
51 | 51 | ||
52 | #if defined(__clang__) | 52 | #if defined(__clang__) |
@@ -72,7 +72,7 @@ Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \ | |||
72 | #endif | 72 | #endif |
73 | 73 | ||
74 | 74 | ||
75 | Z7_BRANCH_FUNC_MAIN(ARM64) | 75 | Z7_BRANCH_FUNC_MAIN(BranchConv_ARM64) |
76 | { | 76 | { |
77 | // Byte *p = data; | 77 | // Byte *p = data; |
78 | const Byte *lim; | 78 | const Byte *lim; |
@@ -121,10 +121,10 @@ Z7_BRANCH_FUNC_MAIN(ARM64) | |||
121 | } | 121 | } |
122 | } | 122 | } |
123 | } | 123 | } |
124 | Z7_BRANCH_FUNCS_IMP(ARM64) | 124 | Z7_BRANCH_FUNCS_IMP(BranchConv_ARM64) |
125 | 125 | ||
126 | 126 | ||
127 | Z7_BRANCH_FUNC_MAIN(ARM) | 127 | Z7_BRANCH_FUNC_MAIN(BranchConv_ARM) |
128 | { | 128 | { |
129 | // Byte *p = data; | 129 | // Byte *p = data; |
130 | const Byte *lim; | 130 | const Byte *lim; |
@@ -152,10 +152,10 @@ Z7_BRANCH_FUNC_MAIN(ARM) | |||
152 | } | 152 | } |
153 | } | 153 | } |
154 | } | 154 | } |
155 | Z7_BRANCH_FUNCS_IMP(ARM) | 155 | Z7_BRANCH_FUNCS_IMP(BranchConv_ARM) |
156 | 156 | ||
157 | 157 | ||
158 | Z7_BRANCH_FUNC_MAIN(PPC) | 158 | Z7_BRANCH_FUNC_MAIN(BranchConv_PPC) |
159 | { | 159 | { |
160 | // Byte *p = data; | 160 | // Byte *p = data; |
161 | const Byte *lim; | 161 | const Byte *lim; |
@@ -192,14 +192,14 @@ Z7_BRANCH_FUNC_MAIN(PPC) | |||
192 | } | 192 | } |
193 | } | 193 | } |
194 | } | 194 | } |
195 | Z7_BRANCH_FUNCS_IMP(PPC) | 195 | Z7_BRANCH_FUNCS_IMP(BranchConv_PPC) |
196 | 196 | ||
197 | 197 | ||
198 | #ifdef Z7_CPU_FAST_ROTATE_SUPPORTED | 198 | #ifdef Z7_CPU_FAST_ROTATE_SUPPORTED |
199 | #define BR_SPARC_USE_ROTATE | 199 | #define BR_SPARC_USE_ROTATE |
200 | #endif | 200 | #endif |
201 | 201 | ||
202 | Z7_BRANCH_FUNC_MAIN(SPARC) | 202 | Z7_BRANCH_FUNC_MAIN(BranchConv_SPARC) |
203 | { | 203 | { |
204 | // Byte *p = data; | 204 | // Byte *p = data; |
205 | const Byte *lim; | 205 | const Byte *lim; |
@@ -254,10 +254,10 @@ Z7_BRANCH_FUNC_MAIN(SPARC) | |||
254 | } | 254 | } |
255 | } | 255 | } |
256 | } | 256 | } |
257 | Z7_BRANCH_FUNCS_IMP(SPARC) | 257 | Z7_BRANCH_FUNCS_IMP(BranchConv_SPARC) |
258 | 258 | ||
259 | 259 | ||
260 | Z7_BRANCH_FUNC_MAIN(ARMT) | 260 | Z7_BRANCH_FUNC_MAIN(BranchConv_ARMT) |
261 | { | 261 | { |
262 | // Byte *p = data; | 262 | // Byte *p = data; |
263 | Byte *lim; | 263 | Byte *lim; |
@@ -335,12 +335,12 @@ Z7_BRANCH_FUNC_MAIN(ARMT) | |||
335 | // return (Byte *)(lim + (((lim[1] ^ ~0xfu) & ~7u) == 0 ? 0 : 2)); | 335 | // return (Byte *)(lim + (((lim[1] ^ ~0xfu) & ~7u) == 0 ? 0 : 2)); |
336 | // return (Byte *)(lim + 2 - (((((unsigned)lim[1] ^ 8) + 8) >> 7) & 2)); | 336 | // return (Byte *)(lim + 2 - (((((unsigned)lim[1] ^ 8) + 8) >> 7) & 2)); |
337 | } | 337 | } |
338 | Z7_BRANCH_FUNCS_IMP(ARMT) | 338 | Z7_BRANCH_FUNCS_IMP(BranchConv_ARMT) |
339 | 339 | ||
340 | 340 | ||
341 | // #define BR_IA64_NO_INLINE | 341 | // #define BR_IA64_NO_INLINE |
342 | 342 | ||
343 | Z7_BRANCH_FUNC_MAIN(IA64) | 343 | Z7_BRANCH_FUNC_MAIN(BranchConv_IA64) |
344 | { | 344 | { |
345 | // Byte *p = data; | 345 | // Byte *p = data; |
346 | const Byte *lim; | 346 | const Byte *lim; |
@@ -417,4 +417,293 @@ Z7_BRANCH_FUNC_MAIN(IA64) | |||
417 | } | 417 | } |
418 | } | 418 | } |
419 | } | 419 | } |
420 | Z7_BRANCH_FUNCS_IMP(IA64) | 420 | Z7_BRANCH_FUNCS_IMP(BranchConv_IA64) |
421 | |||
422 | |||
423 | #define BR_CONVERT_VAL_ENC(v) v += BR_PC_GET; | ||
424 | #define BR_CONVERT_VAL_DEC(v) v -= BR_PC_GET; | ||
425 | |||
426 | #if 1 && defined(MY_CPU_LE_UNALIGN) | ||
427 | #define RISCV_USE_UNALIGNED_LOAD | ||
428 | #endif | ||
429 | |||
430 | #ifdef RISCV_USE_UNALIGNED_LOAD | ||
431 | #define RISCV_GET_UI32(p) GetUi32(p) | ||
432 | #define RISCV_SET_UI32(p, v) { SetUi32(p, v) } | ||
433 | #else | ||
434 | #define RISCV_GET_UI32(p) \ | ||
435 | ((UInt32)GetUi16a(p) + \ | ||
436 | ((UInt32)GetUi16a((p) + 2) << 16)) | ||
437 | #define RISCV_SET_UI32(p, v) { \ | ||
438 | SetUi16a(p, (UInt16)(v)) \ | ||
439 | SetUi16a((p) + 2, (UInt16)(v >> 16)) } | ||
440 | #endif | ||
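The aligned fallback above builds the 32-bit little-endian value from two aligned 16-bit little-endian loads, and splits it the same way on store. The composition in isolation:

#include <stdint.h>
#include <assert.h>

int main(void)
{
  const uint16_t lo = 0x0517, hi = 0x0040;          /* two aligned halves */
  const uint32_t v = (uint32_t)lo | ((uint32_t)hi << 16);
  assert(v == 0x00400517u);
  assert((uint16_t)v == lo && (uint16_t)(v >> 16) == hi);
  return 0;
}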
441 | |||
442 | #if 1 && defined(MY_CPU_LE) | ||
443 | #define RISCV_USE_16BIT_LOAD | ||
444 | #endif | ||
445 | |||
446 | #ifdef RISCV_USE_16BIT_LOAD | ||
447 | #define RISCV_LOAD_VAL(p) GetUi16a(p) | ||
448 | #else | ||
449 | #define RISCV_LOAD_VAL(p) (*(p)) | ||
450 | #endif | ||
451 | |||
452 | #define RISCV_INSTR_SIZE 2 | ||
453 | #define RISCV_STEP_1 (4 + RISCV_INSTR_SIZE) | ||
454 | #define RISCV_STEP_2 4 | ||
455 | #define RISCV_REG_VAL (2 << 7) | ||
456 | #define RISCV_CMD_VAL 3 | ||
457 | #if 1 | ||
458 | // for code size optimization: | ||
459 | #define RISCV_DELTA_7F 0x7f | ||
460 | #else | ||
461 | #define RISCV_DELTA_7F 0 | ||
462 | #endif | ||
463 | |||
464 | #define RISCV_CHECK_1(v, b) \ | ||
465 | (((((b) - RISCV_CMD_VAL) ^ ((v) << 8)) & (0xf8000 + RISCV_CMD_VAL)) == 0) | ||
466 | |||
467 | #if 1 | ||
468 | #define RISCV_CHECK_2(v, r) \ | ||
469 | ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL | 8)) \ | ||
470 | << 18) \ | ||
471 | < ((r) & 0x1d)) | ||
472 | #else | ||
473 | // this branch gives larger code, because | ||
474 | // compilers generate larger code for big constants. | ||
475 | #define RISCV_CHECK_2(v, r) \ | ||
476 | ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \ | ||
477 | & ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \ | ||
478 | < ((r) & 0x1d)) | ||
479 | #endif | ||
480 | |||
481 | |||
482 | #define RISCV_SCAN_LOOP \ | ||
483 | Byte *lim; \ | ||
484 | size &= ~(SizeT)(RISCV_INSTR_SIZE - 1); \ | ||
485 | if (size <= 6) return p; \ | ||
486 | size -= 6; \ | ||
487 | lim = p + size; \ | ||
488 | BR_PC_INIT \ | ||
489 | for (;;) \ | ||
490 | { \ | ||
491 | UInt32 a, v; \ | ||
492 | /* Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE */ \ | ||
493 | for (;;) \ | ||
494 | { \ | ||
495 | if Z7_UNLIKELY(p >= lim) { return p; } \ | ||
496 | a = (RISCV_LOAD_VAL(p) ^ 0x10u) + 1; \ | ||
497 | if ((a & 0x77) == 0) break; \ | ||
498 | a = (RISCV_LOAD_VAL(p + RISCV_INSTR_SIZE) ^ 0x10u) + 1; \ | ||
499 | p += RISCV_INSTR_SIZE * 2; \ | ||
500 | if ((a & 0x77) == 0) \ | ||
501 | { \ | ||
502 | p -= RISCV_INSTR_SIZE; \ | ||
503 | if Z7_UNLIKELY(p >= lim) { return p; } \ | ||
504 | break; \ | ||
505 | } \ | ||
506 | } | ||
507 | // (xx6f ^ 10) + 1 = xx7f + 1 = xx80 : JAL | ||
508 | // (xxef ^ 10) + 1 = xxff + 1 = xx00 + 100 : JAL | ||
509 | // (xx17 ^ 10) + 1 = xx07 + 1 = xx08 : AUIPC | ||
510 | // (xx97 ^ 10) + 1 = xx87 + 1 = xx88 : AUIPC | ||
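Those worked examples can be checked exhaustively: among all 256 low bytes, exactly the JAL (0x6f, 0xef) and AUIPC (0x17, 0x97) opcode bytes satisfy (((b ^ 0x10) + 1) & 0x77) == 0; the 16-bit load only adds a possible carry into bit 8, which the 0x77 mask ignores. A standalone verification:

#include <stdio.h>

int main(void)
{
  unsigned b;
  for (b = 0; b < 0x100; b++)
    if ((((b ^ 0x10u) + 1) & 0x77u) == 0)
      printf("0x%02x\n", b);   /* prints 0x17, 0x6f, 0x97, 0xef */
  return 0;
}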
511 | |||
512 | Byte * Z7_BRANCH_CONV_ENC(RISCV)(Byte *p, SizeT size, UInt32 pc) | ||
513 | { | ||
514 | RISCV_SCAN_LOOP | ||
515 | v = a; | ||
516 | a = RISCV_GET_UI32(p); | ||
517 | #ifndef RISCV_USE_16BIT_LOAD | ||
518 | v += (UInt32)p[1] << 8; | ||
519 | #endif | ||
520 | |||
521 | if ((v & 8) == 0) // JAL | ||
522 | { | ||
523 | if ((v - (0x100 /* - RISCV_DELTA_7F */)) & 0xd80) | ||
524 | { | ||
525 | p += RISCV_INSTR_SIZE; | ||
526 | continue; | ||
527 | } | ||
528 | { | ||
529 | v = ((a & 1u << 31) >> 11) | ||
530 | | ((a & 0x3ff << 21) >> 20) | ||
531 | | ((a & 1 << 20) >> 9) | ||
532 | | (a & 0xff << 12); | ||
533 | BR_CONVERT_VAL_ENC(v) | ||
534 | // ((v & 1) == 0) | ||
535 | // v: bits [1 : 20] contain offset bits | ||
536 | #if 0 && defined(RISCV_USE_UNALIGNED_LOAD) | ||
537 | a &= 0xfff; | ||
538 | a |= ((UInt32)(v << 23)) | ||
539 | | ((UInt32)(v << 7) & ((UInt32)0xff << 16)) | ||
540 | | ((UInt32)(v >> 5) & ((UInt32)0xf0 << 8)); | ||
541 | RISCV_SET_UI32(p, a) | ||
542 | #else // aligned | ||
543 | #if 0 | ||
544 | SetUi16a(p, (UInt16)(((v >> 5) & 0xf000) | (a & 0xfff))) | ||
545 | #else | ||
546 | p[1] = (Byte)(((v >> 13) & 0xf0) | ((a >> 8) & 0xf)); | ||
547 | #endif | ||
548 | |||
549 | #if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) | ||
550 | v <<= 15; | ||
551 | v = Z7_BSWAP32(v); | ||
552 | SetUi16a(p + 2, (UInt16)v) | ||
553 | #else | ||
554 | p[2] = (Byte)(v >> 9); | ||
555 | p[3] = (Byte)(v >> 1); | ||
556 | #endif | ||
557 | #endif // aligned | ||
558 | } | ||
559 | p += 4; | ||
560 | continue; | ||
561 | } // JAL | ||
562 | |||
563 | { | ||
564 | // AUIPC | ||
565 | if (v & 0xe80) // (not x0) and (not x2) | ||
566 | { | ||
567 | const UInt32 b = RISCV_GET_UI32(p + 4); | ||
568 | if (RISCV_CHECK_1(v, b)) | ||
569 | { | ||
570 | { | ||
571 | const UInt32 temp = (b << 12) | (0x17 + RISCV_REG_VAL); | ||
572 | RISCV_SET_UI32(p, temp) | ||
573 | } | ||
574 | a &= 0xfffff000; | ||
575 | { | ||
576 | #if 1 | ||
577 | const int t = -1 >> 1; | ||
578 | if (t != -1) | ||
579 | a += (b >> 20) - ((b >> 19) & 0x1000); // arithmetic right shift emulation | ||
580 | else | ||
581 | #endif | ||
582 | a += (UInt32)((Int32)b >> 20); // arithmetic right shift (sign-extension). | ||
583 | } | ||
584 | BR_CONVERT_VAL_ENC(a) | ||
585 | #if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) | ||
586 | a = Z7_BSWAP32(a); | ||
587 | RISCV_SET_UI32(p + 4, a) | ||
588 | #else | ||
589 | SetBe32(p + 4, a) | ||
590 | #endif | ||
591 | p += 8; | ||
592 | } | ||
593 | else | ||
594 | p += RISCV_STEP_1; | ||
595 | } | ||
596 | else | ||
597 | { | ||
598 | UInt32 r = a >> 27; | ||
599 | if (RISCV_CHECK_2(v, r)) | ||
600 | { | ||
601 | v = RISCV_GET_UI32(p + 4); | ||
602 | r = (r << 7) + 0x17 + (v & 0xfffff000); | ||
603 | a = (a >> 12) | (v << 20); | ||
604 | RISCV_SET_UI32(p, r) | ||
605 | RISCV_SET_UI32(p + 4, a) | ||
606 | p += 8; | ||
607 | } | ||
608 | else | ||
609 | p += RISCV_STEP_2; | ||
610 | } | ||
611 | } | ||
612 | } // for | ||
613 | } | ||
614 | |||
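The `const int t = -1 >> 1;` probe in the AUIPC branch above detects whether signed right shift is arithmetic; when it is not, the code sign-extends the 12-bit immediate by hand. The same effect using unsigned arithmetic only, with the common mask-and-subtract idiom (our sketch, not the source's exact form):

#include <stdint.h>
#include <assert.h>

/* sign-extend the 12-bit I-type immediate held in bits 31..20 of insn */
static uint32_t sext_imm12(uint32_t insn)
{
  const uint32_t imm = insn >> 20;     /* logical shift: bits 11..0 */
  return (imm ^ 0x800u) - 0x800u;      /* propagate bit 11 upward */
}

int main(void)
{
  assert(sext_imm12(0x7ff00000u) == 0x7ffu);       /* +2047 */
  assert(sext_imm12(0x80000000u) == 0xfffff800u);  /* -2048 */
  return 0;
}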
615 | |||
616 | Byte * Z7_BRANCH_CONV_DEC(RISCV)(Byte *p, SizeT size, UInt32 pc) | ||
617 | { | ||
618 | RISCV_SCAN_LOOP | ||
619 | #ifdef RISCV_USE_16BIT_LOAD | ||
620 | if ((a & 8) == 0) | ||
621 | { | ||
622 | #else | ||
623 | v = a; | ||
624 | a += (UInt32)p[1] << 8; | ||
625 | if ((v & 8) == 0) | ||
626 | { | ||
627 | #endif | ||
628 | // JAL | ||
629 | a -= 0x100 - RISCV_DELTA_7F; | ||
630 | if (a & 0xd80) | ||
631 | { | ||
632 | p += RISCV_INSTR_SIZE; | ||
633 | continue; | ||
634 | } | ||
635 | { | ||
636 | const UInt32 a_old = (a + (0xef - RISCV_DELTA_7F)) & 0xfff; | ||
637 | #if 0 // unaligned | ||
638 | a = GetUi32(p); | ||
639 | v = (UInt32)(a >> 23) & ((UInt32)0xff << 1) | ||
640 | | (UInt32)(a >> 7) & ((UInt32)0xff << 9) | ||
641 | #elif 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) | ||
642 | v = GetUi16a(p + 2); | ||
643 | v = Z7_BSWAP32(v) >> 15 | ||
644 | #else | ||
645 | v = (UInt32)p[3] << 1 | ||
646 | | (UInt32)p[2] << 9 | ||
647 | #endif | ||
648 | | (UInt32)((a & 0xf000) << 5); | ||
649 | BR_CONVERT_VAL_DEC(v) | ||
650 | a = a_old | ||
651 | | (v << 11 & 1u << 31) | ||
652 | | (v << 20 & 0x3ff << 21) | ||
653 | | (v << 9 & 1 << 20) | ||
654 | | (v & 0xff << 12); | ||
655 | RISCV_SET_UI32(p, a) | ||
656 | } | ||
657 | p += 4; | ||
658 | continue; | ||
659 | } // JAL | ||
660 | |||
661 | { | ||
662 | // AUIPC | ||
663 | v = a; | ||
664 | #if 1 && defined(RISCV_USE_UNALIGNED_LOAD) | ||
665 | a = GetUi32(p); | ||
666 | #else | ||
667 | a |= (UInt32)GetUi16a(p + 2) << 16; | ||
668 | #endif | ||
669 | if ((v & 0xe80) == 0) // x0/x2 | ||
670 | { | ||
671 | const UInt32 r = a >> 27; | ||
672 | if (RISCV_CHECK_2(v, r)) | ||
673 | { | ||
674 | UInt32 b; | ||
675 | #if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) | ||
676 | b = RISCV_GET_UI32(p + 4); | ||
677 | b = Z7_BSWAP32(b); | ||
678 | #else | ||
679 | b = GetBe32(p + 4); | ||
680 | #endif | ||
681 | v = a >> 12; | ||
682 | BR_CONVERT_VAL_DEC(b) | ||
683 | a = (r << 7) + 0x17; | ||
684 | a += (b + 0x800) & 0xfffff000; | ||
685 | v |= b << 20; | ||
686 | RISCV_SET_UI32(p, a) | ||
687 | RISCV_SET_UI32(p + 4, v) | ||
688 | p += 8; | ||
689 | } | ||
690 | else | ||
691 | p += RISCV_STEP_2; | ||
692 | } | ||
693 | else | ||
694 | { | ||
695 | const UInt32 b = RISCV_GET_UI32(p + 4); | ||
696 | if (!RISCV_CHECK_1(v, b)) | ||
697 | p += RISCV_STEP_1; | ||
698 | else | ||
699 | { | ||
700 | v = (a & 0xfffff000) | (b >> 20); | ||
701 | a = (b << 12) | (0x17 + RISCV_REG_VAL); | ||
702 | RISCV_SET_UI32(p, a) | ||
703 | RISCV_SET_UI32(p + 4, v) | ||
704 | p += 8; | ||
705 | } | ||
706 | } | ||
707 | } | ||
708 | } // for | ||
709 | } | ||
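Both converters scatter and gather the JAL offset according to the RISC-V J-type immediate layout: instruction bits [31], [30:21], [20], [19:12] hold imm[20], imm[10:1], imm[11], imm[19:12]. A plain decoder for comparison (ours, not the source's):

#include <stdint.h>
#include <assert.h>

static uint32_t jal_imm(uint32_t insn)
{
  return (((insn >> 31) & 1u)     << 20)
       | (((insn >> 21) & 0x3ffu) << 1)
       | (((insn >> 20) & 1u)     << 11)
       | (((insn >> 12) & 0xffu)  << 12);
}

int main(void)
{
  /* 0x0040006f encodes "jal x0, +4": only imm[2] (bit 22) is set */
  assert(jal_imm(0x0040006fu) == 4);
  return 0;
}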
@@ -1,5 +1,5 @@ | |||
1 | /* Bra.h -- Branch converters for executables | 1 | /* Bra.h -- Branch converters for executables |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-20 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_BRA_H | 4 | #ifndef ZIP7_INC_BRA_H |
5 | #define ZIP7_INC_BRA_H | 5 | #define ZIP7_INC_BRA_H |
@@ -8,8 +8,12 @@ | |||
8 | 8 | ||
9 | EXTERN_C_BEGIN | 9 | EXTERN_C_BEGIN |
10 | 10 | ||
11 | #define Z7_BRANCH_CONV_DEC(name) z7_BranchConv_ ## name ## _Dec | 11 | /* #define PPC BAD_PPC_11 // for debug */ |
12 | #define Z7_BRANCH_CONV_ENC(name) z7_BranchConv_ ## name ## _Enc | 12 | |
13 | #define Z7_BRANCH_CONV_DEC_2(name) z7_ ## name ## _Dec | ||
14 | #define Z7_BRANCH_CONV_ENC_2(name) z7_ ## name ## _Enc | ||
15 | #define Z7_BRANCH_CONV_DEC(name) Z7_BRANCH_CONV_DEC_2(BranchConv_ ## name) | ||
16 | #define Z7_BRANCH_CONV_ENC(name) Z7_BRANCH_CONV_ENC_2(BranchConv_ ## name) | ||
13 | #define Z7_BRANCH_CONV_ST_DEC(name) z7_BranchConvSt_ ## name ## _Dec | 17 | #define Z7_BRANCH_CONV_ST_DEC(name) z7_BranchConvSt_ ## name ## _Dec |
14 | #define Z7_BRANCH_CONV_ST_ENC(name) z7_BranchConvSt_ ## name ## _Enc | 18 | #define Z7_BRANCH_CONV_ST_ENC(name) z7_BranchConvSt_ ## name ## _Enc |
15 | 19 | ||
@@ -20,19 +24,20 @@ typedef Z7_BRANCH_CONV_DECL( (*z7_Func_BranchConv)); | |||
20 | typedef Z7_BRANCH_CONV_ST_DECL((*z7_Func_BranchConvSt)); | 24 | typedef Z7_BRANCH_CONV_ST_DECL((*z7_Func_BranchConvSt)); |
21 | 25 | ||
22 | #define Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL 0 | 26 | #define Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL 0 |
23 | Z7_BRANCH_CONV_ST_DECL(Z7_BRANCH_CONV_ST_DEC(X86)); | 27 | Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_DEC(X86)); |
24 | Z7_BRANCH_CONV_ST_DECL(Z7_BRANCH_CONV_ST_ENC(X86)); | 28 | Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_ENC(X86)); |
25 | 29 | ||
26 | #define Z7_BRANCH_FUNCS_DECL(name) \ | 30 | #define Z7_BRANCH_FUNCS_DECL(name) \ |
27 | Z7_BRANCH_CONV_DECL(Z7_BRANCH_CONV_DEC(name)); \ | 31 | Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_DEC_2(name)); \ |
28 | Z7_BRANCH_CONV_DECL(Z7_BRANCH_CONV_ENC(name)); | 32 | Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_ENC_2(name)); |
29 | 33 | ||
30 | Z7_BRANCH_FUNCS_DECL(ARM64) | 34 | Z7_BRANCH_FUNCS_DECL (BranchConv_ARM64) |
31 | Z7_BRANCH_FUNCS_DECL(ARM) | 35 | Z7_BRANCH_FUNCS_DECL (BranchConv_ARM) |
32 | Z7_BRANCH_FUNCS_DECL(ARMT) | 36 | Z7_BRANCH_FUNCS_DECL (BranchConv_ARMT) |
33 | Z7_BRANCH_FUNCS_DECL(PPC) | 37 | Z7_BRANCH_FUNCS_DECL (BranchConv_PPC) |
34 | Z7_BRANCH_FUNCS_DECL(SPARC) | 38 | Z7_BRANCH_FUNCS_DECL (BranchConv_SPARC) |
35 | Z7_BRANCH_FUNCS_DECL(IA64) | 39 | Z7_BRANCH_FUNCS_DECL (BranchConv_IA64) |
40 | Z7_BRANCH_FUNCS_DECL (BranchConv_RISCV) | ||
36 | 41 | ||
37 | /* | 42 | /* |
38 | These functions convert data that contain CPU instructions. | 43 | These functions convert data that contain CPU instructions. |
@@ -49,14 +54,14 @@ and one for decoding (_Enc/_Dec postfixes in function name). | |||
49 | In params: | 54 | In params: |
50 | data : data buffer | 55 | data : data buffer |
51 | size : size of data | 56 | size : size of data |
52 | pc : current virtual Program Counter (Instruction Pinter) value | 57 | pc : current virtual Program Counter (Instruction Pointer) value |
53 | In/Out param: | 58 | In/Out param: |
54 | state : pointer to state variable (for X86 converter only) | 59 | state : pointer to state variable (for X86 converter only) |
55 | 60 | ||
56 | Return: | 61 | Return: |
57 | The pointer to position in (data) buffer after last byte that was processed. | 62 | The pointer to position in (data) buffer after last byte that was processed. |
58 | If the caller calls converter again, it must call it starting with that position. | 63 | If the caller calls converter again, it must call it starting with that position. |
59 | But the caller is allowed to move data in buffer. so pointer to | 64 | But the caller is allowed to move data in the buffer, so the pointer to |
60 | current processed position also will be changed for next call. | 65 | the current processed position will also change for the next call. |
61 | Also the caller must increase internal (pc) value for next call. | 66 | Also the caller must increase internal (pc) value for next call. |
62 | 67 | ||
@@ -65,6 +70,7 @@ Each converter has some characteristics: Endian, Alignment, LookAhead. | |||
65 | 70 | ||
66 | X86 little 1 4 | 71 | X86 little 1 4 |
67 | ARMT little 2 2 | 72 | ARMT little 2 2 |
73 | RISCV little 2 6 | ||
68 | ARM little 4 0 | 74 | ARM little 4 0 |
69 | ARM64 little 4 0 | 75 | ARM64 little 4 0 |
70 | PPC big 4 0 | 76 | PPC big 4 0 |
diff --git a/C/Compiler.h b/C/Compiler.h index 185a52d..2a9c2b7 100644 --- a/C/Compiler.h +++ b/C/Compiler.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Compiler.h : Compiler specific defines and pragmas | 1 | /* Compiler.h : Compiler specific defines and pragmas |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-22 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_COMPILER_H | 4 | #ifndef ZIP7_INC_COMPILER_H |
5 | #define ZIP7_INC_COMPILER_H | 5 | #define ZIP7_INC_COMPILER_H |
@@ -25,11 +25,79 @@ | |||
25 | #define Z7_MINGW | 25 | #define Z7_MINGW |
26 | #endif | 26 | #endif |
27 | 27 | ||
28 | #if defined(__LCC__) && (defined(__MCST__) || defined(__e2k__)) | ||
29 | #define Z7_MCST_LCC | ||
30 | #define Z7_MCST_LCC_VERSION (__LCC__ * 100 + __LCC_MINOR__) | ||
31 | #endif | ||
32 | |||
33 | /* | ||
34 | #if defined(__AVX2__) \ | ||
35 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ | ||
36 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \ | ||
37 | || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \ | ||
38 | || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \ | ||
39 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400) | ||
40 | #define Z7_COMPILER_AVX2_SUPPORTED | ||
41 | #endif | ||
42 | #endif | ||
43 | */ | ||
44 | |||
28 | // #pragma GCC diagnostic ignored "-Wunknown-pragmas" | 45 | // #pragma GCC diagnostic ignored "-Wunknown-pragmas" |
29 | 46 | ||
30 | #ifdef __clang__ | 47 | #ifdef __clang__ |
31 | // padding size of '' with 4 bytes to alignment boundary | 48 | // padding size of '' with 4 bytes to alignment boundary |
32 | #pragma GCC diagnostic ignored "-Wpadded" | 49 | #pragma GCC diagnostic ignored "-Wpadded" |
50 | |||
51 | #if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13) \ | ||
52 | && defined(__FreeBSD__) | ||
53 | // freebsd: | ||
54 | #pragma GCC diagnostic ignored "-Wexcess-padding" | ||
55 | #endif | ||
56 | |||
57 | #if __clang_major__ >= 16 | ||
58 | #pragma GCC diagnostic ignored "-Wunsafe-buffer-usage" | ||
59 | #endif | ||
60 | |||
61 | #if __clang_major__ == 13 | ||
62 | #if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) | ||
63 | // cheri | ||
64 | #pragma GCC diagnostic ignored "-Wcapability-to-integer-cast" | ||
65 | #endif | ||
66 | #endif | ||
67 | |||
68 | #if __clang_major__ == 13 | ||
69 | // for <arm_neon.h> | ||
70 | #pragma GCC diagnostic ignored "-Wreserved-identifier" | ||
71 | #endif | ||
72 | |||
73 | #endif // __clang__ | ||
74 | |||
75 | #if defined(_WIN32) && defined(__clang__) && __clang_major__ >= 16 | ||
76 | // #pragma GCC diagnostic ignored "-Wcast-function-type-strict" | ||
77 | #define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION \ | ||
78 | _Pragma("GCC diagnostic ignored \"-Wcast-function-type-strict\"") | ||
79 | #else | ||
80 | #define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION | ||
81 | #endif | ||
82 | |||
83 | typedef void (*Z7_void_Function)(void); | ||
84 | #if defined(__clang__) || defined(__GNUC__) | ||
85 | #define Z7_CAST_FUNC_C (Z7_void_Function) | ||
86 | #elif defined(_MSC_VER) && _MSC_VER > 1920 | ||
87 | #define Z7_CAST_FUNC_C (void *) | ||
88 | // #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' | ||
89 | #else | ||
90 | #define Z7_CAST_FUNC_C | ||
91 | #endif | ||
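Judging by the disabled C4191 warning text ("unsafe conversion from FARPROC"), Z7_CAST_FUNC_C appears intended to route GetProcAddress-style casts through a plain function-pointer type so strict cast diagnostics stay quiet. A hedged, Windows-only sketch of such a use (assumes this header is included; the imported function is just an example):

#ifdef _WIN32
#include <windows.h>
typedef BOOL (WINAPI *Func_IsWow64Process)(HANDLE, PBOOL);

static BOOL isWow64(void)
{
  BOOL b = FALSE;
  const Func_IsWow64Process f = (Func_IsWow64Process)Z7_CAST_FUNC_C
      GetProcAddress(GetModuleHandleA("kernel32.dll"), "IsWow64Process");
  if (f)
    f(GetCurrentProcess(), &b);
  return b;
}
#endif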
92 | /* | ||
93 | #if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__) | ||
94 | // #pragma GCC diagnostic ignored "-Wcast-function-type" | ||
95 | #endif | ||
96 | */ | ||
97 | #ifdef __GNUC__ | ||
98 | #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40000) && (Z7_GCC_VERSION < 70000) | ||
99 | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||
100 | #endif | ||
33 | #endif | 101 | #endif |
34 | 102 | ||
35 | 103 | ||
@@ -101,7 +169,8 @@ | |||
101 | _Pragma("clang loop unroll(disable)") \ | 169 | _Pragma("clang loop unroll(disable)") \ |
102 | _Pragma("clang loop vectorize(disable)") | 170 | _Pragma("clang loop vectorize(disable)") |
103 | #define Z7_ATTRIB_NO_VECTORIZE | 171 | #define Z7_ATTRIB_NO_VECTORIZE |
104 | #elif defined(__GNUC__) && (__GNUC__ >= 5) | 172 | #elif defined(__GNUC__) && (__GNUC__ >= 5) \ |
173 | && (!defined(Z7_MCST_LCC_VERSION) || (Z7_MCST_LCC_VERSION >= 12610)) | ||
105 | #define Z7_ATTRIB_NO_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) | 174 | #define Z7_ATTRIB_NO_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) |
106 | // __attribute__((optimize("no-unroll-loops"))); | 175 | // __attribute__((optimize("no-unroll-loops"))); |
107 | #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | 176 | #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE |
@@ -142,15 +211,23 @@ | |||
142 | #endif | 211 | #endif |
143 | 212 | ||
144 | 213 | ||
145 | #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 36000)) | 214 | #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30600)) |
146 | #define Z7_DIAGNOSCTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ | 215 | |
216 | #if (Z7_CLANG_VERSION < 130000) | ||
217 | #define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ | ||
218 | _Pragma("GCC diagnostic push") \ | ||
219 | _Pragma("GCC diagnostic ignored \"-Wreserved-id-macro\"") | ||
220 | #else | ||
221 | #define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ | ||
147 | _Pragma("GCC diagnostic push") \ | 222 | _Pragma("GCC diagnostic push") \ |
148 | _Pragma("GCC diagnostic ignored \"-Wreserved-macro-identifier\"") | 223 | _Pragma("GCC diagnostic ignored \"-Wreserved-macro-identifier\"") |
149 | #define Z7_DIAGNOSCTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER \ | 224 | #endif |
225 | |||
226 | #define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER \ | ||
150 | _Pragma("GCC diagnostic pop") | 227 | _Pragma("GCC diagnostic pop") |
151 | #else | 228 | #else |
152 | #define Z7_DIAGNOSCTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | 229 | #define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER |
153 | #define Z7_DIAGNOSCTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | 230 | #define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER |
154 | #endif | 231 | #endif |
155 | 232 | ||
156 | #define UNUSED_VAR(x) (void)x; | 233 | #define UNUSED_VAR(x) (void)x; |
diff --git a/C/CpuArch.c b/C/CpuArch.c index 33f8a3a..d51b38a 100644 --- a/C/CpuArch.c +++ b/C/CpuArch.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* CpuArch.c -- CPU specific code | 1 | /* CpuArch.c -- CPU specific code |
2 | 2023-05-18 : Igor Pavlov : Public domain */ | 2 | 2024-03-02 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -226,7 +226,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) | |||
226 | DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!! | 226 | DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!! |
227 | */ | 227 | */ |
228 | static | 228 | static |
229 | Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(UInt32 subFunction, UInt32 func, int *CPUInfo) | 229 | Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int32 *CPUInfo) |
230 | { | 230 | { |
231 | UNUSED_VAR(subFunction) | 231 | UNUSED_VAR(subFunction) |
232 | __cpuid(CPUInfo, func); | 232 | __cpuid(CPUInfo, func); |
@@ -242,13 +242,13 @@ Z7_NO_INLINE | |||
242 | #endif | 242 | #endif |
243 | void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) | 243 | void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) |
244 | { | 244 | { |
245 | MY_cpuidex((int *)p, (int)func, 0); | 245 | MY_cpuidex((Int32 *)p, (Int32)func, 0); |
246 | } | 246 | } |
247 | 247 | ||
248 | Z7_NO_INLINE | 248 | Z7_NO_INLINE |
249 | UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) | 249 | UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) |
250 | { | 250 | { |
251 | int a[4]; | 251 | Int32 a[4]; |
252 | MY_cpuidex(a, 0, 0); | 252 | MY_cpuidex(a, 0, 0); |
253 | return a[0]; | 253 | return a[0]; |
254 | } | 254 | } |
@@ -384,7 +384,7 @@ BoolInt CPU_IsSupported_CMOV(void) | |||
384 | UInt32 a[4]; | 384 | UInt32 a[4]; |
385 | if (!x86cpuid_Func_1(&a[0])) | 385 | if (!x86cpuid_Func_1(&a[0])) |
386 | return 0; | 386 | return 0; |
387 | return (a[3] >> 15) & 1; | 387 | return (BoolInt)(a[3] >> 15) & 1; |
388 | } | 388 | } |
389 | 389 | ||
390 | BoolInt CPU_IsSupported_SSE(void) | 390 | BoolInt CPU_IsSupported_SSE(void) |
@@ -393,7 +393,7 @@ BoolInt CPU_IsSupported_SSE(void) | |||
393 | CHECK_SYS_SSE_SUPPORT | 393 | CHECK_SYS_SSE_SUPPORT |
394 | if (!x86cpuid_Func_1(&a[0])) | 394 | if (!x86cpuid_Func_1(&a[0])) |
395 | return 0; | 395 | return 0; |
396 | return (a[3] >> 25) & 1; | 396 | return (BoolInt)(a[3] >> 25) & 1; |
397 | } | 397 | } |
398 | 398 | ||
399 | BoolInt CPU_IsSupported_SSE2(void) | 399 | BoolInt CPU_IsSupported_SSE2(void) |
@@ -402,7 +402,7 @@ BoolInt CPU_IsSupported_SSE2(void) | |||
402 | CHECK_SYS_SSE_SUPPORT | 402 | CHECK_SYS_SSE_SUPPORT |
403 | if (!x86cpuid_Func_1(&a[0])) | 403 | if (!x86cpuid_Func_1(&a[0])) |
404 | return 0; | 404 | return 0; |
405 | return (a[3] >> 26) & 1; | 405 | return (BoolInt)(a[3] >> 26) & 1; |
406 | } | 406 | } |
407 | 407 | ||
408 | #endif | 408 | #endif |
@@ -419,17 +419,17 @@ static UInt32 x86cpuid_Func_1_ECX(void) | |||
419 | 419 | ||
420 | BoolInt CPU_IsSupported_AES(void) | 420 | BoolInt CPU_IsSupported_AES(void) |
421 | { | 421 | { |
422 | return (x86cpuid_Func_1_ECX() >> 25) & 1; | 422 | return (BoolInt)(x86cpuid_Func_1_ECX() >> 25) & 1; |
423 | } | 423 | } |
424 | 424 | ||
425 | BoolInt CPU_IsSupported_SSSE3(void) | 425 | BoolInt CPU_IsSupported_SSSE3(void) |
426 | { | 426 | { |
427 | return (x86cpuid_Func_1_ECX() >> 9) & 1; | 427 | return (BoolInt)(x86cpuid_Func_1_ECX() >> 9) & 1; |
428 | } | 428 | } |
429 | 429 | ||
430 | BoolInt CPU_IsSupported_SSE41(void) | 430 | BoolInt CPU_IsSupported_SSE41(void) |
431 | { | 431 | { |
432 | return (x86cpuid_Func_1_ECX() >> 19) & 1; | 432 | return (BoolInt)(x86cpuid_Func_1_ECX() >> 19) & 1; |
433 | } | 433 | } |
434 | 434 | ||
435 | BoolInt CPU_IsSupported_SHA(void) | 435 | BoolInt CPU_IsSupported_SHA(void) |
@@ -441,7 +441,7 @@ BoolInt CPU_IsSupported_SHA(void) | |||
441 | { | 441 | { |
442 | UInt32 d[4]; | 442 | UInt32 d[4]; |
443 | z7_x86_cpuid(d, 7); | 443 | z7_x86_cpuid(d, 7); |
444 | return (d[1] >> 29) & 1; | 444 | return (BoolInt)(d[1] >> 29) & 1; |
445 | } | 445 | } |
446 | } | 446 | } |
447 | 447 | ||
@@ -640,8 +640,8 @@ BoolInt CPU_IsSupported_AVX(void) | |||
640 | const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK); | 640 | const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK); |
641 | // printf("\n=== XGetBV=%d\n", bm); | 641 | // printf("\n=== XGetBV=%d\n", bm); |
642 | return 1 | 642 | return 1 |
643 | & (bm >> 1) // SSE state is supported (set by OS) for storing/restoring | 643 | & (BoolInt)(bm >> 1) // SSE state is supported (set by OS) for storing/restoring |
644 | & (bm >> 2); // AVX state is supported (set by OS) for storing/restoring | 644 | & (BoolInt)(bm >> 2); // AVX state is supported (set by OS) for storing/restoring |
645 | } | 645 | } |
646 | // since Win7SP1: we can use GetEnabledXStateFeatures(); | 646 | // since Win7SP1: we can use GetEnabledXStateFeatures(); |
647 | } | 647 | } |
@@ -658,10 +658,29 @@ BoolInt CPU_IsSupported_AVX2(void) | |||
658 | z7_x86_cpuid(d, 7); | 658 | z7_x86_cpuid(d, 7); |
659 | // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); | 659 | // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); |
660 | return 1 | 660 | return 1 |
661 | & (d[1] >> 5); // avx2 | 661 | & (BoolInt)(d[1] >> 5); // avx2 |
662 | } | 662 | } |
663 | } | 663 | } |
664 | 664 | ||
665 | /* | ||
666 | // fix it: | ||
667 | BoolInt CPU_IsSupported_AVX512F_AVX512VL(void) | ||
668 | { | ||
669 | if (!CPU_IsSupported_AVX()) | ||
670 | return False; | ||
671 | if (z7_x86_cpuid_GetMaxFunc() < 7) | ||
672 | return False; | ||
673 | { | ||
674 | UInt32 d[4]; | ||
675 | z7_x86_cpuid(d, 7); | ||
676 | // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); | ||
677 | return 1 | ||
678 | & (BoolInt)(d[1] >> 16) // avx512-f | ||
679 | & (BoolInt)(d[1] >> 31); // avx512-Vl | ||
680 | } | ||
681 | } | ||
682 | */ | ||
683 | |||
665 | BoolInt CPU_IsSupported_VAES_AVX2(void) | 684 | BoolInt CPU_IsSupported_VAES_AVX2(void) |
666 | { | 685 | { |
667 | if (!CPU_IsSupported_AVX()) | 686 | if (!CPU_IsSupported_AVX()) |
@@ -673,9 +692,9 @@ BoolInt CPU_IsSupported_VAES_AVX2(void) | |||
673 | z7_x86_cpuid(d, 7); | 692 | z7_x86_cpuid(d, 7); |
674 | // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); | 693 | // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); |
675 | return 1 | 694 | return 1 |
676 | & (d[1] >> 5) // avx2 | 695 | & (BoolInt)(d[1] >> 5) // avx2 |
677 | // & (d[1] >> 31) // avx512vl | 696 | // & (d[1] >> 31) // avx512vl |
678 | & (d[2] >> 9); // vaes // VEX-256/EVEX | 697 | & (BoolInt)(d[2] >> 9); // vaes // VEX-256/EVEX |
679 | } | 698 | } |
680 | } | 699 | } |
681 | 700 | ||
@@ -688,7 +707,7 @@ BoolInt CPU_IsSupported_PageGB(void) | |||
688 | if (d[0] < 0x80000001) | 707 | if (d[0] < 0x80000001) |
689 | return False; | 708 | return False; |
690 | z7_x86_cpuid(d, 0x80000001); | 709 | z7_x86_cpuid(d, 0x80000001); |
691 | return (d[3] >> 26) & 1; | 710 | return (BoolInt)(d[3] >> 26) & 1; |
692 | } | 711 | } |
693 | } | 712 | } |
694 | 713 | ||
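The x86 feature checks in the hunks above all follow one pattern: shift the cpuid register so the feature flag lands in bit 0, narrow to BoolInt, then mask. A minimal standalone sketch of that pattern (the helper name and sample register value are illustrative, not from the source); the cast before the mask makes the conversion explicit, and the final & 1 still pins the result to exactly 0 or 1:

#include <stdio.h>

typedef int BoolInt;  /* matches the 7-Zip typedef */

/* illustrative helper: test one feature bit of a cpuid register */
static BoolInt bit_is_set(unsigned reg, unsigned bit)
{
  return (BoolInt)(reg >> bit) & 1;
}

int main(void)
{
  const unsigned edx = 0x04000000u;  /* sample value: bit 26 (SSE2) set */
  printf("SSE2: %d\n", bit_is_set(edx, 26));  /* 1 */
  printf("SSE:  %d\n", bit_is_set(edx, 25));  /* 0 */
  return 0;
}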
@@ -760,32 +779,65 @@ BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; } | |||
760 | 779 | ||
761 | #else // __APPLE__ | 780 | #else // __APPLE__ |
762 | 781 | ||
763 | #include <sys/auxv.h> | 782 | #if defined(__GLIBC__) && (__GLIBC__ * 100 + __GLIBC_MINOR__ >= 216) |
783 | #define Z7_GETAUXV_AVAILABLE | ||
784 | #else | ||
785 | // #pragma message("=== is not NEW GLIBC === ") | ||
786 | #if defined __has_include | ||
787 | #if __has_include (<sys/auxv.h>) | ||
788 | // #pragma message("=== sys/auxv.h is avail=== ") | ||
789 | #define Z7_GETAUXV_AVAILABLE | ||
790 | #endif | ||
791 | #endif | ||
792 | #endif | ||
764 | 793 | ||
794 | #ifdef Z7_GETAUXV_AVAILABLE | ||
795 | // #pragma message("=== Z7_GETAUXV_AVAILABLE === ") | ||
796 | #include <sys/auxv.h> | ||
765 | #define USE_HWCAP | 797 | #define USE_HWCAP |
798 | #endif | ||
766 | 799 | ||
767 | #ifdef USE_HWCAP | 800 | #ifdef USE_HWCAP |
768 | 801 | ||
802 | #if defined(__FreeBSD__) | ||
803 | static unsigned long MY_getauxval(int aux) | ||
804 | { | ||
805 | unsigned long val; | ||
806 | if (elf_aux_info(aux, &val, sizeof(val))) | ||
807 | return 0; | ||
808 | return val; | ||
809 | } | ||
810 | #else | ||
811 | #define MY_getauxval getauxval | ||
812 | #if defined __has_include | ||
813 | #if __has_include (<asm/hwcap.h>) | ||
769 | #include <asm/hwcap.h> | 814 | #include <asm/hwcap.h> |
815 | #endif | ||
816 | #endif | ||
817 | #endif | ||
770 | 818 | ||
771 | #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \ | 819 | #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \ |
772 | BoolInt CPU_IsSupported_ ## name1() { return (getauxval(AT_HWCAP) & (HWCAP_ ## name2)) ? 1 : 0; } | 820 | BoolInt CPU_IsSupported_ ## name1(void) { return (MY_getauxval(AT_HWCAP) & (HWCAP_ ## name2)); } |
773 | 821 | ||
774 | #ifdef MY_CPU_ARM64 | 822 | #ifdef MY_CPU_ARM64 |
775 | #define MY_HWCAP_CHECK_FUNC(name) \ | 823 | #define MY_HWCAP_CHECK_FUNC(name) \ |
776 | MY_HWCAP_CHECK_FUNC_2(name, name) | 824 | MY_HWCAP_CHECK_FUNC_2(name, name) |
825 | #if 1 || defined(__ARM_NEON) | ||
826 | BoolInt CPU_IsSupported_NEON(void) { return True; } | ||
827 | #else | ||
777 | MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD) | 828 | MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD) |
829 | #endif | ||
778 | // MY_HWCAP_CHECK_FUNC (ASIMD) | 830 | // MY_HWCAP_CHECK_FUNC (ASIMD) |
779 | #elif defined(MY_CPU_ARM) | 831 | #elif defined(MY_CPU_ARM) |
780 | #define MY_HWCAP_CHECK_FUNC(name) \ | 832 | #define MY_HWCAP_CHECK_FUNC(name) \ |
781 | BoolInt CPU_IsSupported_ ## name() { return (getauxval(AT_HWCAP2) & (HWCAP2_ ## name)) ? 1 : 0; } | 833 | BoolInt CPU_IsSupported_ ## name(void) { return (MY_getauxval(AT_HWCAP2) & (HWCAP2_ ## name)); } |
782 | MY_HWCAP_CHECK_FUNC_2(NEON, NEON) | 834 | MY_HWCAP_CHECK_FUNC_2(NEON, NEON) |
783 | #endif | 835 | #endif |
784 | 836 | ||
785 | #else // USE_HWCAP | 837 | #else // USE_HWCAP |
786 | 838 | ||
787 | #define MY_HWCAP_CHECK_FUNC(name) \ | 839 | #define MY_HWCAP_CHECK_FUNC(name) \ |
788 | BoolInt CPU_IsSupported_ ## name() { return 0; } | 840 | BoolInt CPU_IsSupported_ ## name(void) { return 0; } |
789 | MY_HWCAP_CHECK_FUNC(NEON) | 841 | MY_HWCAP_CHECK_FUNC(NEON) |
790 | 842 | ||
791 | #endif // USE_HWCAP | 843 | #endif // USE_HWCAP |
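The block above falls back to elf_aux_info() where getauxval() is missing. A reduced sketch of that probe, assuming a Linux or FreeBSD target where <sys/auxv.h> is available (the wrapper name is illustrative):

#include <stdio.h>
#include <sys/auxv.h>

#if defined(__FreeBSD__)
/* FreeBSD has no getauxval(); elf_aux_info() returns 0 on success */
static unsigned long my_getauxval(int aux)
{
  unsigned long val;
  if (elf_aux_info(aux, &val, sizeof(val)))
    return 0;  /* lookup failed: report no capabilities */
  return val;
}
#else
#define my_getauxval getauxval
#endif

int main(void)
{
  printf("AT_HWCAP = 0x%lx\n", my_getauxval(AT_HWCAP));
  return 0;
}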
diff --git a/C/CpuArch.h b/C/CpuArch.h index 8e5d8a5..dfc68f1 100644 --- a/C/CpuArch.h +++ b/C/CpuArch.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* CpuArch.h -- CPU specific code | 1 | /* CpuArch.h -- CPU specific code |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-05-13 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_CPU_ARCH_H | 4 | #ifndef ZIP7_INC_CPU_ARCH_H |
5 | #define ZIP7_INC_CPU_ARCH_H | 5 | #define ZIP7_INC_CPU_ARCH_H |
@@ -20,6 +20,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
20 | MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8) | 20 | MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8) |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #if !defined(_M_ARM64EC) | ||
23 | #if defined(_M_X64) \ | 24 | #if defined(_M_X64) \ |
24 | || defined(_M_AMD64) \ | 25 | || defined(_M_AMD64) \ |
25 | || defined(__x86_64__) \ | 26 | || defined(__x86_64__) \ |
@@ -35,6 +36,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
35 | #endif | 36 | #endif |
36 | #define MY_CPU_64BIT | 37 | #define MY_CPU_64BIT |
37 | #endif | 38 | #endif |
39 | #endif | ||
38 | 40 | ||
39 | 41 | ||
40 | #if defined(_M_IX86) \ | 42 | #if defined(_M_IX86) \ |
@@ -47,17 +49,26 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
47 | 49 | ||
48 | 50 | ||
49 | #if defined(_M_ARM64) \ | 51 | #if defined(_M_ARM64) \ |
52 | || defined(_M_ARM64EC) \ | ||
50 | || defined(__AARCH64EL__) \ | 53 | || defined(__AARCH64EL__) \ |
51 | || defined(__AARCH64EB__) \ | 54 | || defined(__AARCH64EB__) \ |
52 | || defined(__aarch64__) | 55 | || defined(__aarch64__) |
53 | #define MY_CPU_ARM64 | 56 | #define MY_CPU_ARM64 |
54 | #ifdef __ILP32__ | 57 | #if defined(__ILP32__) \ |
58 | || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) | ||
55 | #define MY_CPU_NAME "arm64-32" | 59 | #define MY_CPU_NAME "arm64-32" |
56 | #define MY_CPU_SIZEOF_POINTER 4 | 60 | #define MY_CPU_SIZEOF_POINTER 4 |
57 | #else | 61 | #elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) |
62 | #define MY_CPU_NAME "arm64-128" | ||
63 | #define MY_CPU_SIZEOF_POINTER 16 | ||
64 | #else | ||
65 | #if defined(_M_ARM64EC) | ||
66 | #define MY_CPU_NAME "arm64ec" | ||
67 | #else | ||
58 | #define MY_CPU_NAME "arm64" | 68 | #define MY_CPU_NAME "arm64" |
69 | #endif | ||
59 | #define MY_CPU_SIZEOF_POINTER 8 | 70 | #define MY_CPU_SIZEOF_POINTER 8 |
60 | #endif | 71 | #endif |
61 | #define MY_CPU_64BIT | 72 | #define MY_CPU_64BIT |
62 | #endif | 73 | #endif |
63 | 74 | ||
@@ -133,8 +144,36 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
133 | #endif | 144 | #endif |
134 | 145 | ||
135 | 146 | ||
147 | #if defined(__sparc__) \ | ||
148 | || defined(__sparc) | ||
149 | #define MY_CPU_SPARC | ||
150 | #if defined(__LP64__) \ | ||
151 | || defined(_LP64) \ | ||
152 | || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8) | ||
153 | #define MY_CPU_NAME "sparcv9" | ||
154 | #define MY_CPU_SIZEOF_POINTER 8 | ||
155 | #define MY_CPU_64BIT | ||
156 | #elif defined(__sparc_v9__) \ | ||
157 | || defined(__sparcv9) | ||
158 | #define MY_CPU_64BIT | ||
159 | #if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) | ||
160 | #define MY_CPU_NAME "sparcv9-32" | ||
161 | #else | ||
162 | #define MY_CPU_NAME "sparcv9m" | ||
163 | #endif | ||
164 | #elif defined(__sparc_v8__) \ | ||
165 | || defined(__sparcv8) | ||
166 | #define MY_CPU_NAME "sparcv8" | ||
167 | #define MY_CPU_SIZEOF_POINTER 4 | ||
168 | #else | ||
169 | #define MY_CPU_NAME "sparc" | ||
170 | #endif | ||
171 | #endif | ||
172 | |||
173 | |||
136 | #if defined(__riscv) \ | 174 | #if defined(__riscv) \ |
137 | || defined(__riscv__) | 175 | || defined(__riscv__) |
176 | #define MY_CPU_RISCV | ||
138 | #if __riscv_xlen == 32 | 177 | #if __riscv_xlen == 32 |
139 | #define MY_CPU_NAME "riscv32" | 178 | #define MY_CPU_NAME "riscv32" |
140 | #elif __riscv_xlen == 64 | 179 | #elif __riscv_xlen == 64 |
@@ -145,6 +184,39 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
145 | #endif | 184 | #endif |
146 | 185 | ||
147 | 186 | ||
187 | #if defined(__loongarch__) | ||
188 | #define MY_CPU_LOONGARCH | ||
189 | #if defined(__loongarch64) || defined(__loongarch_grlen) && (__loongarch_grlen == 64) | ||
190 | #define MY_CPU_64BIT | ||
191 | #endif | ||
192 | #if defined(__loongarch64) | ||
193 | #define MY_CPU_NAME "loongarch64" | ||
194 | #define MY_CPU_LOONGARCH64 | ||
195 | #else | ||
196 | #define MY_CPU_NAME "loongarch" | ||
197 | #endif | ||
198 | #endif | ||
199 | |||
200 | |||
201 | // #undef MY_CPU_NAME | ||
202 | // #undef MY_CPU_SIZEOF_POINTER | ||
203 | // #define __e2k__ | ||
204 | // #define __SIZEOF_POINTER__ 4 | ||
205 | #if defined(__e2k__) | ||
206 | #define MY_CPU_E2K | ||
207 | #if defined(__ILP32__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) | ||
208 | #define MY_CPU_NAME "e2k-32" | ||
209 | #define MY_CPU_SIZEOF_POINTER 4 | ||
210 | #else | ||
211 | #define MY_CPU_NAME "e2k" | ||
212 | #if defined(__LP64__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8) | ||
213 | #define MY_CPU_SIZEOF_POINTER 8 | ||
214 | #endif | ||
215 | #endif | ||
216 | #define MY_CPU_64BIT | ||
217 | #endif | ||
218 | |||
219 | |||
148 | #if defined(MY_CPU_X86) || defined(MY_CPU_AMD64) | 220 | #if defined(MY_CPU_X86) || defined(MY_CPU_AMD64) |
149 | #define MY_CPU_X86_OR_AMD64 | 221 | #define MY_CPU_X86_OR_AMD64 |
150 | #endif | 222 | #endif |
@@ -175,6 +247,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
175 | || defined(MY_CPU_ARM_LE) \ | 247 | || defined(MY_CPU_ARM_LE) \ |
176 | || defined(MY_CPU_ARM64_LE) \ | 248 | || defined(MY_CPU_ARM64_LE) \ |
177 | || defined(MY_CPU_IA64_LE) \ | 249 | || defined(MY_CPU_IA64_LE) \ |
250 | || defined(_LITTLE_ENDIAN) \ | ||
178 | || defined(__LITTLE_ENDIAN__) \ | 251 | || defined(__LITTLE_ENDIAN__) \ |
179 | || defined(__ARMEL__) \ | 252 | || defined(__ARMEL__) \ |
180 | || defined(__THUMBEL__) \ | 253 | || defined(__THUMBEL__) \ |
@@ -251,6 +324,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
251 | 324 | ||
252 | 325 | ||
253 | #ifndef MY_CPU_NAME | 326 | #ifndef MY_CPU_NAME |
327 | // #define MY_CPU_IS_UNKNOWN | ||
254 | #ifdef MY_CPU_LE | 328 | #ifdef MY_CPU_LE |
255 | #define MY_CPU_NAME "LE" | 329 | #define MY_CPU_NAME "LE" |
256 | #elif defined(MY_CPU_BE) | 330 | #elif defined(MY_CPU_BE) |
@@ -295,9 +369,19 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
295 | #define Z7_BSWAP64(v) _byteswap_uint64(v) | 369 | #define Z7_BSWAP64(v) _byteswap_uint64(v) |
296 | #define Z7_CPU_FAST_BSWAP_SUPPORTED | 370 | #define Z7_CPU_FAST_BSWAP_SUPPORTED |
297 | 371 | ||
298 | #elif (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ | 372 | /* GCC can generate slow code that calls function for __builtin_bswap32() for: |
299 | || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) | 373 | - GCC for RISCV, if Zbb extension is not used. |
300 | 374 | - GCC for SPARC. | |
375 | The code from CLANG for SPARC also is not fastest. | ||
376 | So we don't define Z7_CPU_FAST_BSWAP_SUPPORTED in some cases. | ||
377 | */ | ||
378 | #elif (!defined(MY_CPU_RISCV) || defined (__riscv_zbb)) \ | ||
379 | && !defined(MY_CPU_SPARC) \ | ||
380 | && ( \ | ||
381 | (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ | ||
382 | || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) \ | ||
383 | ) | ||
384 | |||
301 | #define Z7_BSWAP16(v) __builtin_bswap16(v) | 385 | #define Z7_BSWAP16(v) __builtin_bswap16(v) |
302 | #define Z7_BSWAP32(v) __builtin_bswap32(v) | 386 | #define Z7_BSWAP32(v) __builtin_bswap32(v) |
303 | #define Z7_BSWAP64(v) __builtin_bswap64(v) | 387 | #define Z7_BSWAP64(v) __builtin_bswap64(v) |
@@ -329,13 +413,48 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
329 | 413 | ||
330 | #ifdef MY_CPU_LE | 414 | #ifdef MY_CPU_LE |
331 | #if defined(MY_CPU_X86_OR_AMD64) \ | 415 | #if defined(MY_CPU_X86_OR_AMD64) \ |
332 | || defined(MY_CPU_ARM64) | 416 | || defined(MY_CPU_ARM64) \ |
417 | || defined(MY_CPU_RISCV) && defined(__riscv_misaligned_fast) \ | ||
418 | || defined(MY_CPU_E2K) && defined(__iset__) && (__iset__ >= 6) | ||
333 | #define MY_CPU_LE_UNALIGN | 419 | #define MY_CPU_LE_UNALIGN |
334 | #define MY_CPU_LE_UNALIGN_64 | 420 | #define MY_CPU_LE_UNALIGN_64 |
335 | #elif defined(__ARM_FEATURE_UNALIGNED) | 421 | #elif defined(__ARM_FEATURE_UNALIGNED) |
336 | /* gcc9 for 32-bit arm can use LDRD instruction that requires 32-bit alignment. | 422 | /* === ALIGNMENT on 32-bit arm and LDRD/STRD/LDM/STM instructions. |
337 | So we can't use unaligned 64-bit operations. */ | 423 | Description of problems: |
338 | #define MY_CPU_LE_UNALIGN | 424 | problem-1 : 32-bit ARM architecture: |
425 | multi-access (pair of 32-bit accesses) instructions (LDRD/STRD/LDM/STM) | ||
426 | require 32-bit (WORD) alignment (by 32-bit ARM architecture). | ||
427 | So there is "Alignment fault exception", if data is not aligned for 32-bit. | ||
428 | |||
429 | problem-2 : 32-bit kernels and arm64 kernels: | ||
430 | 32-bit linux kernels provide fixup for these "paired" instruction "Alignment fault exception". | ||
431 | So unaligned paired-access instructions work via exception handler in kernel in 32-bit linux. | ||
432 | |||
433 | But some arm64 kernels do not handle these faults in 32-bit programs. | ||
434 | So we have unhandled exception for such instructions. | ||
435 | Probably some new arm64 kernels have fixed it, and unaligned | ||
436 | paired-access instructions work in new kernels? | ||
437 | |||
438 | problem-3 : compiler for 32-bit arm: | ||
439 | Compilers use LDRD/STRD/LDM/STM for UInt64 accesses | ||
440 | and for another cases where two 32-bit accesses are fused | ||
441 | to one multi-access instruction. | ||
442 | So UInt64 variables must be aligned for 32-bit, and each | ||
443 | 32-bit access must be aligned for 32-bit, if we want to | ||
444 | avoid "Alignment fault" exception (handled or unhandled). | ||
445 | |||
446 | problem-4 : performance: | ||
447 | Even if unaligned access is handled by kernel, it will be slow. | ||
448 | So if we allow unaligned access, we can get fast unaligned | ||
449 | single-access, and slow unaligned paired-access. | ||
450 | |||
451 | We don't allow unaligned access on 32-bit arm, because compiler | ||
452 | generates paired-access instructions that require 32-bit alignment, | ||
453 | and some arm64 kernels have no handler for these instructions. | ||
454 | Also unaligned paired-access instructions will be slow, if kernel handles them. | ||
455 | */ | ||
456 | // it must be disabled: | ||
457 | // #define MY_CPU_LE_UNALIGN | ||
339 | #endif | 458 | #endif |
340 | #endif | 459 | #endif |
341 | 460 | ||
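When MY_CPU_LE_UNALIGN stays undefined, as the comment above concludes for 32-bit arm, reads go through byte-by-byte composition, which is legal at any address. A self-contained sketch of that style of accessor (the helper name is illustrative):

#include <stdio.h>
#include <stdint.h>

/* illustrative accessor in the style of the byte-wise GetUi32 path */
static uint32_t get_ui32_le(const void *p)
{
  const unsigned char *b = (const unsigned char *)p;
  return (uint32_t)b[0]
      | ((uint32_t)b[1] << 8)
      | ((uint32_t)b[2] << 16)
      | ((uint32_t)b[3] << 24);
}

int main(void)
{
  unsigned char buf[5] = { 0, 0x44, 0x33, 0x22, 0x11 };
  /* buf + 1 is deliberately misaligned; byte reads make this safe */
  printf("0x%08X\n", (unsigned)get_ui32_le(buf + 1));  /* 0x11223344 */
  return 0;
}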
@@ -439,6 +558,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
439 | 558 | ||
440 | #if defined(MY_CPU_BE) | 559 | #if defined(MY_CPU_BE) |
441 | 560 | ||
561 | #define GetBe64a(p) (*(const UInt64 *)(const void *)(p)) | ||
442 | #define GetBe32a(p) (*(const UInt32 *)(const void *)(p)) | 562 | #define GetBe32a(p) (*(const UInt32 *)(const void *)(p)) |
443 | #define GetBe16a(p) (*(const UInt16 *)(const void *)(p)) | 563 | #define GetBe16a(p) (*(const UInt16 *)(const void *)(p)) |
444 | #define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); } | 564 | #define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); } |
@@ -456,6 +576,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
456 | #define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); } | 576 | #define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); } |
457 | #define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); } | 577 | #define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); } |
458 | 578 | ||
579 | #define GetBe64a(p) GetBe64(p) | ||
459 | #define GetBe32a(p) GetBe32(p) | 580 | #define GetBe32a(p) GetBe32(p) |
460 | #define GetBe16a(p) GetBe16(p) | 581 | #define GetBe16a(p) GetBe16(p) |
461 | #define SetBe32a(p, v) SetBe32(p, v) | 582 | #define SetBe32a(p, v) SetBe32(p, v) |
@@ -486,6 +607,7 @@ UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void); | |||
486 | BoolInt CPU_IsSupported_AES(void); | 607 | BoolInt CPU_IsSupported_AES(void); |
487 | BoolInt CPU_IsSupported_AVX(void); | 608 | BoolInt CPU_IsSupported_AVX(void); |
488 | BoolInt CPU_IsSupported_AVX2(void); | 609 | BoolInt CPU_IsSupported_AVX2(void); |
610 | // BoolInt CPU_IsSupported_AVX512F_AVX512VL(void); | ||
489 | BoolInt CPU_IsSupported_VAES_AVX2(void); | 611 | BoolInt CPU_IsSupported_VAES_AVX2(void); |
490 | BoolInt CPU_IsSupported_CMOV(void); | 612 | BoolInt CPU_IsSupported_CMOV(void); |
491 | BoolInt CPU_IsSupported_SSE(void); | 613 | BoolInt CPU_IsSupported_SSE(void); |
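The bswap dispatch above only uses __builtin_bswap32() where it compiles to a fast instruction. A small sketch of the same prefer-builtin-else-portable shape, assuming a 32-bit unsigned int and a GCC/Clang-style compiler (the fallback shown is a generic shift/mask version, not the library's):

#include <stdio.h>

#if defined(__GNUC__) || defined(__clang__)
  #define BSWAP32(v) __builtin_bswap32(v)
#else
/* portable fallback: swap the four bytes with shifts and masks */
static unsigned BSWAP32(unsigned v)
{
  return (v << 24)
       | ((v & 0xFF00u) << 8)
       | ((v >> 8) & 0xFF00u)
       | (v >> 24);
}
#endif

int main(void)
{
  printf("0x%08X\n", BSWAP32(0x11223344u));  /* prints 0x44332211 */
  return 0;
}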
diff --git a/C/DllSecur.c b/C/DllSecur.c index 02a0f97..bbbfc0a 100644 --- a/C/DllSecur.c +++ b/C/DllSecur.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* DllSecur.c -- DLL loading security | 1 | /* DllSecur.c -- DLL loading security |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2023-12-03 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -11,19 +11,7 @@ | |||
11 | 11 | ||
12 | #ifndef UNDER_CE | 12 | #ifndef UNDER_CE |
13 | 13 | ||
14 | #if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__) | 14 | Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION |
15 | // #pragma GCC diagnostic ignored "-Wcast-function-type" | ||
16 | #endif | ||
17 | |||
18 | #if defined(__clang__) || defined(__GNUC__) | ||
19 | typedef void (*Z7_voidFunction)(void); | ||
20 | #define MY_CAST_FUNC (Z7_voidFunction) | ||
21 | #elif defined(_MSC_VER) && _MSC_VER > 1920 | ||
22 | #define MY_CAST_FUNC (void *) | ||
23 | // #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' | ||
24 | #else | ||
25 | #define MY_CAST_FUNC | ||
26 | #endif | ||
27 | 15 | ||
28 | typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD DirectoryFlags); | 16 | typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD DirectoryFlags); |
29 | 17 | ||
@@ -61,7 +49,7 @@ static const char * const g_Dlls = | |||
61 | if ((UInt16)GetVersion() != 6) { \ | 49 | if ((UInt16)GetVersion() != 6) { \ |
62 | const \ | 50 | const \ |
63 | Func_SetDefaultDllDirectories setDllDirs = \ | 51 | Func_SetDefaultDllDirectories setDllDirs = \ |
64 | (Func_SetDefaultDllDirectories) MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), \ | 52 | (Func_SetDefaultDllDirectories) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), \ |
65 | "SetDefaultDllDirectories"); \ | 53 | "SetDefaultDllDirectories"); \ |
66 | if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; } | 54 | if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; } |
67 | 55 | ||
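The DllSecur.c hunk swaps the local MY_CAST_FUNC for the shared Z7_CAST_FUNC_C when casting the FARPROC from GetProcAddress(). A reduced Windows-only sketch of the underlying two-step cast, which keeps -Wcast-function-type quiet (error handling trimmed; not the macro's actual definition):

#include <windows.h>
#include <stdio.h>

typedef void (*GenericFunc)(void);
typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD);

int main(void)
{
  const HMODULE k32 = GetModuleHandle(TEXT("kernel32.dll"));
  if (!k32)
    return 1;
  /* two-step cast: FARPROC -> generic function type -> target type */
  const Func_SetDefaultDllDirectories f = (Func_SetDefaultDllDirectories)
      (GenericFunc)GetProcAddress(k32, "SetDefaultDllDirectories");
  printf("SetDefaultDllDirectories: %s\n", f ? "found" : "not found");
  return 0;
}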
diff --git a/C/HuffEnc.c b/C/HuffEnc.c index 3dc1e39..996da30 100644 --- a/C/HuffEnc.c +++ b/C/HuffEnc.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* HuffEnc.c -- functions for Huffman encoding | 1 | /* HuffEnc.c -- functions for Huffman encoding |
2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2023-09-07 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -8,7 +8,7 @@ | |||
8 | 8 | ||
9 | #define kMaxLen 16 | 9 | #define kMaxLen 16 |
10 | #define NUM_BITS 10 | 10 | #define NUM_BITS 10 |
11 | #define MASK (((unsigned)1 << NUM_BITS) - 1) | 11 | #define MASK ((1u << NUM_BITS) - 1) |
12 | 12 | ||
13 | #define NUM_COUNTERS 64 | 13 | #define NUM_COUNTERS 64 |
14 | 14 | ||
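MASK above now uses the 1u literal directly. The idiom (1u << N) - 1 builds an N-bit all-ones mask; a trivial standalone check:

#include <stdio.h>

#define NUM_BITS 10
#define MASK ((1u << NUM_BITS) - 1)   /* 0x3FF: ten low bits set */

int main(void)
{
  printf("MASK           = 0x%X\n", MASK);            /* 0x3FF */
  printf("0x12345 & MASK = 0x%X\n", 0x12345u & MASK); /* 0x345 */
  return 0;
}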
@@ -1,5 +1,5 @@ | |||
1 | /* LzFind.c -- Match finder for LZ algorithms | 1 | /* LzFind.c -- Match finder for LZ algorithms |
2 | 2023-03-14 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -108,9 +108,15 @@ static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr all | |||
108 | return (p->bufBase != NULL); | 108 | return (p->bufBase != NULL); |
109 | } | 109 | } |
110 | 110 | ||
111 | static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } | 111 | static const Byte *MatchFinder_GetPointerToCurrentPos(void *p) |
112 | { | ||
113 | return ((CMatchFinder *)p)->buffer; | ||
114 | } | ||
112 | 115 | ||
113 | static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); } | 116 | static UInt32 MatchFinder_GetNumAvailableBytes(void *p) |
117 | { | ||
118 | return GET_AVAIL_BYTES((CMatchFinder *)p); | ||
119 | } | ||
114 | 120 | ||
115 | 121 | ||
116 | Z7_NO_INLINE | 122 | Z7_NO_INLINE |
@@ -571,8 +577,9 @@ void MatchFinder_Init_4(CMatchFinder *p) | |||
571 | #define CYC_TO_POS_OFFSET 0 | 577 | #define CYC_TO_POS_OFFSET 0 |
572 | // #define CYC_TO_POS_OFFSET 1 // for debug | 578 | // #define CYC_TO_POS_OFFSET 1 // for debug |
573 | 579 | ||
574 | void MatchFinder_Init(CMatchFinder *p) | 580 | void MatchFinder_Init(void *_p) |
575 | { | 581 | { |
582 | CMatchFinder *p = (CMatchFinder *)_p; | ||
576 | MatchFinder_Init_HighHash(p); | 583 | MatchFinder_Init_HighHash(p); |
577 | MatchFinder_Init_LowHash(p); | 584 | MatchFinder_Init_LowHash(p); |
578 | MatchFinder_Init_4(p); | 585 | MatchFinder_Init_4(p); |
@@ -607,16 +614,16 @@ void MatchFinder_Init(CMatchFinder *p) | |||
607 | #endif | 614 | #endif |
608 | #endif | 615 | #endif |
609 | 616 | ||
610 | // #elif defined(MY_CPU_ARM_OR_ARM64) | 617 | #elif defined(MY_CPU_ARM64) \ |
611 | #elif defined(MY_CPU_ARM64) | 618 | /* || (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) */ |
612 | 619 | ||
613 | #if defined(__clang__) && (__clang_major__ >= 8) \ | 620 | #if defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ |
614 | || defined(__GNUC__) && (__GNUC__ >= 8) | 621 | || defined(__GNUC__) && (__GNUC__ >= 6) |
615 | #define USE_LZFIND_SATUR_SUB_128 | 622 | #define USE_LZFIND_SATUR_SUB_128 |
616 | #ifdef MY_CPU_ARM64 | 623 | #ifdef MY_CPU_ARM64 |
617 | // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__(""))) | 624 | // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__(""))) |
618 | #else | 625 | #else |
619 | // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | 626 | #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=neon"))) |
620 | #endif | 627 | #endif |
621 | 628 | ||
622 | #elif defined(_MSC_VER) | 629 | #elif defined(_MSC_VER) |
@@ -625,7 +632,7 @@ void MatchFinder_Init(CMatchFinder *p) | |||
625 | #endif | 632 | #endif |
626 | #endif | 633 | #endif |
627 | 634 | ||
628 | #if defined(_MSC_VER) && defined(MY_CPU_ARM64) | 635 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) |
629 | #include <arm64_neon.h> | 636 | #include <arm64_neon.h> |
630 | #else | 637 | #else |
631 | #include <arm_neon.h> | 638 | #include <arm_neon.h> |
@@ -1082,9 +1089,11 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const | |||
1082 | 1089 | ||
1083 | 1090 | ||
1084 | #define MOVE_POS \ | 1091 | #define MOVE_POS \ |
1085 | ++p->cyclicBufferPos; \ | 1092 | p->cyclicBufferPos++; \ |
1086 | p->buffer++; \ | 1093 | p->buffer++; \ |
1087 | { const UInt32 pos1 = p->pos + 1; p->pos = pos1; if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); } | 1094 | { const UInt32 pos1 = p->pos + 1; \ |
1095 | p->pos = pos1; \ | ||
1096 | if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); } | ||
1088 | 1097 | ||
1089 | #define MOVE_POS_RET MOVE_POS return distances; | 1098 | #define MOVE_POS_RET MOVE_POS return distances; |
1090 | 1099 | ||
@@ -1103,20 +1112,26 @@ static void MatchFinder_MovePos(CMatchFinder *p) | |||
1103 | } | 1112 | } |
1104 | 1113 | ||
1105 | #define GET_MATCHES_HEADER2(minLen, ret_op) \ | 1114 | #define GET_MATCHES_HEADER2(minLen, ret_op) \ |
1106 | unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \ | 1115 | UInt32 hv; const Byte *cur; UInt32 curMatch; \ |
1107 | lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ | 1116 | UInt32 lenLimit = p->lenLimit; \ |
1117 | if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; } \ | ||
1108 | cur = p->buffer; | 1118 | cur = p->buffer; |
1109 | 1119 | ||
1110 | #define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances) | 1120 | #define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances) |
1111 | #define SKIP_HEADER(minLen) do { GET_MATCHES_HEADER2(minLen, continue) | 1121 | #define SKIP_HEADER(minLen) \ |
1122 | do { GET_MATCHES_HEADER2(minLen, continue) | ||
1112 | 1123 | ||
1113 | #define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue | 1124 | #define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, \ |
1125 | p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue | ||
1114 | 1126 | ||
1115 | #define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS } while (--num); | 1127 | #define SKIP_FOOTER \ |
1128 | SkipMatchesSpec(MF_PARAMS(p)); \ | ||
1129 | MOVE_POS \ | ||
1130 | } while (--num); | ||
1116 | 1131 | ||
1117 | #define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ | 1132 | #define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ |
1118 | distances = func(MF_PARAMS(p), \ | 1133 | distances = func(MF_PARAMS(p), distances, (UInt32)_maxLen_); \ |
1119 | distances, (UInt32)_maxLen_); MOVE_POS_RET | 1134 | MOVE_POS_RET |
1120 | 1135 | ||
1121 | #define GET_MATCHES_FOOTER_BT(_maxLen_) \ | 1136 | #define GET_MATCHES_FOOTER_BT(_maxLen_) \ |
1122 | GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) | 1137 | GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) |
@@ -1133,8 +1148,9 @@ static void MatchFinder_MovePos(CMatchFinder *p) | |||
1133 | for (; c != lim; c++) if (*(c + diff) != *c) break; \ | 1148 | for (; c != lim; c++) if (*(c + diff) != *c) break; \ |
1134 | maxLen = (unsigned)(c - cur); } | 1149 | maxLen = (unsigned)(c - cur); } |
1135 | 1150 | ||
1136 | static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1151 | static UInt32* Bt2_MatchFinder_GetMatches(void *_p, UInt32 *distances) |
1137 | { | 1152 | { |
1153 | CMatchFinder *p = (CMatchFinder *)_p; | ||
1138 | GET_MATCHES_HEADER(2) | 1154 | GET_MATCHES_HEADER(2) |
1139 | HASH2_CALC | 1155 | HASH2_CALC |
1140 | curMatch = p->hash[hv]; | 1156 | curMatch = p->hash[hv]; |
@@ -1158,8 +1174,9 @@ UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1158 | mmm = pos; | 1174 | mmm = pos; |
1159 | 1175 | ||
1160 | 1176 | ||
1161 | static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1177 | static UInt32* Bt3_MatchFinder_GetMatches(void *_p, UInt32 *distances) |
1162 | { | 1178 | { |
1179 | CMatchFinder *p = (CMatchFinder *)_p; | ||
1163 | UInt32 mmm; | 1180 | UInt32 mmm; |
1164 | UInt32 h2, d2, pos; | 1181 | UInt32 h2, d2, pos; |
1165 | unsigned maxLen; | 1182 | unsigned maxLen; |
@@ -1199,8 +1216,9 @@ static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1199 | } | 1216 | } |
1200 | 1217 | ||
1201 | 1218 | ||
1202 | static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1219 | static UInt32* Bt4_MatchFinder_GetMatches(void *_p, UInt32 *distances) |
1203 | { | 1220 | { |
1221 | CMatchFinder *p = (CMatchFinder *)_p; | ||
1204 | UInt32 mmm; | 1222 | UInt32 mmm; |
1205 | UInt32 h2, h3, d2, d3, pos; | 1223 | UInt32 h2, h3, d2, d3, pos; |
1206 | unsigned maxLen; | 1224 | unsigned maxLen; |
@@ -1267,10 +1285,12 @@ static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1267 | } | 1285 | } |
1268 | 1286 | ||
1269 | 1287 | ||
1270 | static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1288 | static UInt32* Bt5_MatchFinder_GetMatches(void *_p, UInt32 *distances) |
1271 | { | 1289 | { |
1290 | CMatchFinder *p = (CMatchFinder *)_p; | ||
1272 | UInt32 mmm; | 1291 | UInt32 mmm; |
1273 | UInt32 h2, h3, d2, d3, maxLen, pos; | 1292 | UInt32 h2, h3, d2, d3, pos; |
1293 | unsigned maxLen; | ||
1274 | UInt32 *hash; | 1294 | UInt32 *hash; |
1275 | GET_MATCHES_HEADER(5) | 1295 | GET_MATCHES_HEADER(5) |
1276 | 1296 | ||
@@ -1339,8 +1359,9 @@ static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1339 | } | 1359 | } |
1340 | 1360 | ||
1341 | 1361 | ||
1342 | static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1362 | static UInt32* Hc4_MatchFinder_GetMatches(void *_p, UInt32 *distances) |
1343 | { | 1363 | { |
1364 | CMatchFinder *p = (CMatchFinder *)_p; | ||
1344 | UInt32 mmm; | 1365 | UInt32 mmm; |
1345 | UInt32 h2, h3, d2, d3, pos; | 1366 | UInt32 h2, h3, d2, d3, pos; |
1346 | unsigned maxLen; | 1367 | unsigned maxLen; |
@@ -1407,10 +1428,12 @@ static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1407 | } | 1428 | } |
1408 | 1429 | ||
1409 | 1430 | ||
1410 | static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1431 | static UInt32 * Hc5_MatchFinder_GetMatches(void *_p, UInt32 *distances) |
1411 | { | 1432 | { |
1433 | CMatchFinder *p = (CMatchFinder *)_p; | ||
1412 | UInt32 mmm; | 1434 | UInt32 mmm; |
1413 | UInt32 h2, h3, d2, d3, maxLen, pos; | 1435 | UInt32 h2, h3, d2, d3, pos; |
1436 | unsigned maxLen; | ||
1414 | UInt32 *hash; | 1437 | UInt32 *hash; |
1415 | GET_MATCHES_HEADER(5) | 1438 | GET_MATCHES_HEADER(5) |
1416 | 1439 | ||
@@ -1466,7 +1489,7 @@ static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1466 | if (*(cur - d2 + 3) != cur[3]) | 1489 | if (*(cur - d2 + 3) != cur[3]) |
1467 | break; | 1490 | break; |
1468 | UPDATE_maxLen | 1491 | UPDATE_maxLen |
1469 | distances[-2] = maxLen; | 1492 | distances[-2] = (UInt32)maxLen; |
1470 | if (maxLen == lenLimit) | 1493 | if (maxLen == lenLimit) |
1471 | { | 1494 | { |
1472 | p->son[p->cyclicBufferPos] = curMatch; | 1495 | p->son[p->cyclicBufferPos] = curMatch; |
@@ -1489,8 +1512,9 @@ UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
1489 | } | 1512 | } |
1490 | 1513 | ||
1491 | 1514 | ||
1492 | static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | 1515 | static void Bt2_MatchFinder_Skip(void *_p, UInt32 num) |
1493 | { | 1516 | { |
1517 | CMatchFinder *p = (CMatchFinder *)_p; | ||
1494 | SKIP_HEADER(2) | 1518 | SKIP_HEADER(2) |
1495 | { | 1519 | { |
1496 | HASH2_CALC | 1520 | HASH2_CALC |
@@ -1511,8 +1535,9 @@ void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1511 | SKIP_FOOTER | 1535 | SKIP_FOOTER |
1512 | } | 1536 | } |
1513 | 1537 | ||
1514 | static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | 1538 | static void Bt3_MatchFinder_Skip(void *_p, UInt32 num) |
1515 | { | 1539 | { |
1540 | CMatchFinder *p = (CMatchFinder *)_p; | ||
1516 | SKIP_HEADER(3) | 1541 | SKIP_HEADER(3) |
1517 | { | 1542 | { |
1518 | UInt32 h2; | 1543 | UInt32 h2; |
@@ -1526,8 +1551,9 @@ static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1526 | SKIP_FOOTER | 1551 | SKIP_FOOTER |
1527 | } | 1552 | } |
1528 | 1553 | ||
1529 | static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | 1554 | static void Bt4_MatchFinder_Skip(void *_p, UInt32 num) |
1530 | { | 1555 | { |
1556 | CMatchFinder *p = (CMatchFinder *)_p; | ||
1531 | SKIP_HEADER(4) | 1557 | SKIP_HEADER(4) |
1532 | { | 1558 | { |
1533 | UInt32 h2, h3; | 1559 | UInt32 h2, h3; |
@@ -1542,8 +1568,9 @@ static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1542 | SKIP_FOOTER | 1568 | SKIP_FOOTER |
1543 | } | 1569 | } |
1544 | 1570 | ||
1545 | static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | 1571 | static void Bt5_MatchFinder_Skip(void *_p, UInt32 num) |
1546 | { | 1572 | { |
1573 | CMatchFinder *p = (CMatchFinder *)_p; | ||
1547 | SKIP_HEADER(5) | 1574 | SKIP_HEADER(5) |
1548 | { | 1575 | { |
1549 | UInt32 h2, h3; | 1576 | UInt32 h2, h3; |
@@ -1589,8 +1616,9 @@ static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1589 | }} while(num); \ | 1616 | }} while(num); \ |
1590 | 1617 | ||
1591 | 1618 | ||
1592 | static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | 1619 | static void Hc4_MatchFinder_Skip(void *_p, UInt32 num) |
1593 | { | 1620 | { |
1621 | CMatchFinder *p = (CMatchFinder *)_p; | ||
1594 | HC_SKIP_HEADER(4) | 1622 | HC_SKIP_HEADER(4) |
1595 | 1623 | ||
1596 | UInt32 h2, h3; | 1624 | UInt32 h2, h3; |
@@ -1604,8 +1632,9 @@ static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1604 | } | 1632 | } |
1605 | 1633 | ||
1606 | 1634 | ||
1607 | static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | 1635 | static void Hc5_MatchFinder_Skip(void *_p, UInt32 num) |
1608 | { | 1636 | { |
1637 | CMatchFinder *p = (CMatchFinder *)_p; | ||
1609 | HC_SKIP_HEADER(5) | 1638 | HC_SKIP_HEADER(5) |
1610 | 1639 | ||
1611 | UInt32 h2, h3; | 1640 | UInt32 h2, h3; |
@@ -1634,41 +1663,41 @@ void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
1634 | 1663 | ||
1635 | void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable) | 1664 | void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable) |
1636 | { | 1665 | { |
1637 | vTable->Init = (Mf_Init_Func)MatchFinder_Init; | 1666 | vTable->Init = MatchFinder_Init; |
1638 | vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes; | 1667 | vTable->GetNumAvailableBytes = MatchFinder_GetNumAvailableBytes; |
1639 | vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos; | 1668 | vTable->GetPointerToCurrentPos = MatchFinder_GetPointerToCurrentPos; |
1640 | if (!p->btMode) | 1669 | if (!p->btMode) |
1641 | { | 1670 | { |
1642 | if (p->numHashBytes <= 4) | 1671 | if (p->numHashBytes <= 4) |
1643 | { | 1672 | { |
1644 | vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches; | 1673 | vTable->GetMatches = Hc4_MatchFinder_GetMatches; |
1645 | vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip; | 1674 | vTable->Skip = Hc4_MatchFinder_Skip; |
1646 | } | 1675 | } |
1647 | else | 1676 | else |
1648 | { | 1677 | { |
1649 | vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches; | 1678 | vTable->GetMatches = Hc5_MatchFinder_GetMatches; |
1650 | vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip; | 1679 | vTable->Skip = Hc5_MatchFinder_Skip; |
1651 | } | 1680 | } |
1652 | } | 1681 | } |
1653 | else if (p->numHashBytes == 2) | 1682 | else if (p->numHashBytes == 2) |
1654 | { | 1683 | { |
1655 | vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches; | 1684 | vTable->GetMatches = Bt2_MatchFinder_GetMatches; |
1656 | vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip; | 1685 | vTable->Skip = Bt2_MatchFinder_Skip; |
1657 | } | 1686 | } |
1658 | else if (p->numHashBytes == 3) | 1687 | else if (p->numHashBytes == 3) |
1659 | { | 1688 | { |
1660 | vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches; | 1689 | vTable->GetMatches = Bt3_MatchFinder_GetMatches; |
1661 | vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip; | 1690 | vTable->Skip = Bt3_MatchFinder_Skip; |
1662 | } | 1691 | } |
1663 | else if (p->numHashBytes == 4) | 1692 | else if (p->numHashBytes == 4) |
1664 | { | 1693 | { |
1665 | vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches; | 1694 | vTable->GetMatches = Bt4_MatchFinder_GetMatches; |
1666 | vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip; | 1695 | vTable->Skip = Bt4_MatchFinder_Skip; |
1667 | } | 1696 | } |
1668 | else | 1697 | else |
1669 | { | 1698 | { |
1670 | vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches; | 1699 | vTable->GetMatches = Bt5_MatchFinder_GetMatches; |
1671 | vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip; | 1700 | vTable->Skip = Bt5_MatchFinder_Skip; |
1672 | } | 1701 | } |
1673 | } | 1702 | } |
1674 | 1703 | ||
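The LzFind.c hunks above change every vtable callback to take void * and cast inside the function, so MatchFinder_CreateVTable() can assign the functions without casts. Calling through a function pointer cast to an incompatible type is undefined behavior in C, while converting a void * parameter inside the callee is well defined. A reduced sketch of the shape (all names illustrative):

#include <stdio.h>

typedef struct { int pos; } CFinder;

typedef void (*Init_Func)(void *p);
typedef struct { Init_Func Init; } IVTable;

static void Finder_Init(void *_p)
{
  CFinder *p = (CFinder *)_p;  /* the cast moved inside the callback */
  p->pos = 0;
}

int main(void)
{
  CFinder f = { 42 };
  IVTable vt;
  vt.Init = Finder_Init;  /* no function-pointer cast needed */
  vt.Init(&f);
  printf("pos = %d\n", f.pos);  /* prints 0 */
  return 0;
}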
@@ -1,5 +1,5 @@ | |||
1 | /* LzFind.h -- Match finder for LZ algorithms | 1 | /* LzFind.h -- Match finder for LZ algorithms |
2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-22 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_LZ_FIND_H | 4 | #ifndef ZIP7_INC_LZ_FIND_H |
5 | #define ZIP7_INC_LZ_FIND_H | 5 | #define ZIP7_INC_LZ_FIND_H |
@@ -144,7 +144,8 @@ void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable); | |||
144 | void MatchFinder_Init_LowHash(CMatchFinder *p); | 144 | void MatchFinder_Init_LowHash(CMatchFinder *p); |
145 | void MatchFinder_Init_HighHash(CMatchFinder *p); | 145 | void MatchFinder_Init_HighHash(CMatchFinder *p); |
146 | void MatchFinder_Init_4(CMatchFinder *p); | 146 | void MatchFinder_Init_4(CMatchFinder *p); |
147 | void MatchFinder_Init(CMatchFinder *p); | 147 | // void MatchFinder_Init(CMatchFinder *p); |
148 | void MatchFinder_Init(void *p); | ||
148 | 149 | ||
149 | UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); | 150 | UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); |
150 | UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); | 151 | UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); |
diff --git a/C/LzFindMt.c b/C/LzFindMt.c index 5253e6e..ac9d59d 100644 --- a/C/LzFindMt.c +++ b/C/LzFindMt.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* LzFindMt.c -- multithreaded Match finder for LZ algorithms | 1 | /* LzFindMt.c -- multithreaded Match finder for LZ algorithms |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-22 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -94,7 +94,7 @@ static void MtSync_Construct(CMtSync *p) | |||
94 | } | 94 | } |
95 | 95 | ||
96 | 96 | ||
97 | #define DEBUG_BUFFER_LOCK // define it to debug lock state | 97 | // #define DEBUG_BUFFER_LOCK // define it to debug lock state |
98 | 98 | ||
99 | #ifdef DEBUG_BUFFER_LOCK | 99 | #ifdef DEBUG_BUFFER_LOCK |
100 | #include <stdlib.h> | 100 | #include <stdlib.h> |
@@ -877,8 +877,9 @@ SRes MatchFinderMt_InitMt(CMatchFinderMt *p) | |||
877 | } | 877 | } |
878 | 878 | ||
879 | 879 | ||
880 | static void MatchFinderMt_Init(CMatchFinderMt *p) | 880 | static void MatchFinderMt_Init(void *_p) |
881 | { | 881 | { |
882 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
882 | CMatchFinder *mf = MF(p); | 883 | CMatchFinder *mf = MF(p); |
883 | 884 | ||
884 | p->btBufPos = | 885 | p->btBufPos = |
@@ -981,8 +982,9 @@ static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p) | |||
981 | 982 | ||
982 | 983 | ||
983 | 984 | ||
984 | static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p) | 985 | static const Byte * MatchFinderMt_GetPointerToCurrentPos(void *_p) |
985 | { | 986 | { |
987 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
986 | return p->pointerToCurPos; | 988 | return p->pointerToCurPos; |
987 | } | 989 | } |
988 | 990 | ||
@@ -990,8 +992,9 @@ static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p) | |||
990 | #define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p); | 992 | #define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p); |
991 | 993 | ||
992 | 994 | ||
993 | static UInt32 MatchFinderMt_GetNumAvailableBytes(CMatchFinderMt *p) | 995 | static UInt32 MatchFinderMt_GetNumAvailableBytes(void *_p) |
994 | { | 996 | { |
997 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
995 | if (p->btBufPos != p->btBufPosLimit) | 998 | if (p->btBufPos != p->btBufPosLimit) |
996 | return p->btNumAvailBytes; | 999 | return p->btNumAvailBytes; |
997 | return MatchFinderMt_GetNextBlock_Bt(p); | 1000 | return MatchFinderMt_GetNextBlock_Bt(p); |
@@ -1243,8 +1246,9 @@ static UInt32 * MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) | |||
1243 | } | 1246 | } |
1244 | 1247 | ||
1245 | 1248 | ||
1246 | static UInt32 * MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d) | 1249 | static UInt32 * MatchFinderMt2_GetMatches(void *_p, UInt32 *d) |
1247 | { | 1250 | { |
1251 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
1248 | const UInt32 *bt = p->btBufPos; | 1252 | const UInt32 *bt = p->btBufPos; |
1249 | const UInt32 len = *bt++; | 1253 | const UInt32 len = *bt++; |
1250 | const UInt32 *btLim = bt + len; | 1254 | const UInt32 *btLim = bt + len; |
@@ -1267,8 +1271,9 @@ static UInt32 * MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d) | |||
1267 | 1271 | ||
1268 | 1272 | ||
1269 | 1273 | ||
1270 | static UInt32 * MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d) | 1274 | static UInt32 * MatchFinderMt_GetMatches(void *_p, UInt32 *d) |
1271 | { | 1275 | { |
1276 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
1272 | const UInt32 *bt = p->btBufPos; | 1277 | const UInt32 *bt = p->btBufPos; |
1273 | UInt32 len = *bt++; | 1278 | UInt32 len = *bt++; |
1274 | const UInt32 avail = p->btNumAvailBytes - 1; | 1279 | const UInt32 avail = p->btNumAvailBytes - 1; |
@@ -1315,14 +1320,16 @@ static UInt32 * MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d) | |||
1315 | #define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash; | 1320 | #define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash; |
1316 | #define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0); | 1321 | #define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0); |
1317 | 1322 | ||
1318 | static void MatchFinderMt0_Skip(CMatchFinderMt *p, UInt32 num) | 1323 | static void MatchFinderMt0_Skip(void *_p, UInt32 num) |
1319 | { | 1324 | { |
1325 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
1320 | SKIP_HEADER2_MT { p->btNumAvailBytes--; | 1326 | SKIP_HEADER2_MT { p->btNumAvailBytes--; |
1321 | SKIP_FOOTER_MT | 1327 | SKIP_FOOTER_MT |
1322 | } | 1328 | } |
1323 | 1329 | ||
1324 | static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num) | 1330 | static void MatchFinderMt2_Skip(void *_p, UInt32 num) |
1325 | { | 1331 | { |
1332 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
1326 | SKIP_HEADER_MT(2) | 1333 | SKIP_HEADER_MT(2) |
1327 | UInt32 h2; | 1334 | UInt32 h2; |
1328 | MT_HASH2_CALC | 1335 | MT_HASH2_CALC |
@@ -1330,8 +1337,9 @@ static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num) | |||
1330 | SKIP_FOOTER_MT | 1337 | SKIP_FOOTER_MT |
1331 | } | 1338 | } |
1332 | 1339 | ||
1333 | static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num) | 1340 | static void MatchFinderMt3_Skip(void *_p, UInt32 num) |
1334 | { | 1341 | { |
1342 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
1335 | SKIP_HEADER_MT(3) | 1343 | SKIP_HEADER_MT(3) |
1336 | UInt32 h2, h3; | 1344 | UInt32 h2, h3; |
1337 | MT_HASH3_CALC | 1345 | MT_HASH3_CALC |
@@ -1361,39 +1369,39 @@ static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num) | |||
1361 | 1369 | ||
1362 | void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable) | 1370 | void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable) |
1363 | { | 1371 | { |
1364 | vTable->Init = (Mf_Init_Func)MatchFinderMt_Init; | 1372 | vTable->Init = MatchFinderMt_Init; |
1365 | vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinderMt_GetNumAvailableBytes; | 1373 | vTable->GetNumAvailableBytes = MatchFinderMt_GetNumAvailableBytes; |
1366 | vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinderMt_GetPointerToCurrentPos; | 1374 | vTable->GetPointerToCurrentPos = MatchFinderMt_GetPointerToCurrentPos; |
1367 | vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches; | 1375 | vTable->GetMatches = MatchFinderMt_GetMatches; |
1368 | 1376 | ||
1369 | switch (MF(p)->numHashBytes) | 1377 | switch (MF(p)->numHashBytes) |
1370 | { | 1378 | { |
1371 | case 2: | 1379 | case 2: |
1372 | p->GetHeadsFunc = GetHeads2; | 1380 | p->GetHeadsFunc = GetHeads2; |
1373 | p->MixMatchesFunc = (Mf_Mix_Matches)NULL; | 1381 | p->MixMatchesFunc = NULL; |
1374 | vTable->Skip = (Mf_Skip_Func)MatchFinderMt0_Skip; | 1382 | vTable->Skip = MatchFinderMt0_Skip; |
1375 | vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt2_GetMatches; | 1383 | vTable->GetMatches = MatchFinderMt2_GetMatches; |
1376 | break; | 1384 | break; |
1377 | case 3: | 1385 | case 3: |
1378 | p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3; | 1386 | p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3; |
1379 | p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches2; | 1387 | p->MixMatchesFunc = MixMatches2; |
1380 | vTable->Skip = (Mf_Skip_Func)MatchFinderMt2_Skip; | 1388 | vTable->Skip = MatchFinderMt2_Skip; |
1381 | break; | 1389 | break; |
1382 | case 4: | 1390 | case 4: |
1383 | p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4; | 1391 | p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4; |
1384 | 1392 | ||
1385 | // it's fast inline version of GetMatches() | 1393 | // it's fast inline version of GetMatches() |
1386 | // vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches_Bt4; | 1394 | // vTable->GetMatches = MatchFinderMt_GetMatches_Bt4; |
1387 | 1395 | ||
1388 | p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches3; | 1396 | p->MixMatchesFunc = MixMatches3; |
1389 | vTable->Skip = (Mf_Skip_Func)MatchFinderMt3_Skip; | 1397 | vTable->Skip = MatchFinderMt3_Skip; |
1390 | break; | 1398 | break; |
1391 | default: | 1399 | default: |
1392 | p->GetHeadsFunc = MF(p)->bigHash ? GetHeads5b : GetHeads5; | 1400 | p->GetHeadsFunc = MF(p)->bigHash ? GetHeads5b : GetHeads5; |
1393 | p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches4; | 1401 | p->MixMatchesFunc = MixMatches4; |
1394 | vTable->Skip = | 1402 | vTable->Skip = |
1395 | (Mf_Skip_Func)MatchFinderMt3_Skip; | 1403 | MatchFinderMt3_Skip; |
1396 | // (Mf_Skip_Func)MatchFinderMt4_Skip; | 1404 | // MatchFinderMt4_Skip; |
1397 | break; | 1405 | break; |
1398 | } | 1406 | } |
1399 | } | 1407 | } |
diff --git a/C/LzFindMt.h b/C/LzFindMt.h index db5923e..fcb479d 100644 --- a/C/LzFindMt.h +++ b/C/LzFindMt.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* LzFindMt.h -- multithreaded Match finder for LZ algorithms | 1 | /* LzFindMt.h -- multithreaded Match finder for LZ algorithms |
2 | 2023-03-05 : Igor Pavlov : Public domain */ | 2 | 2024-01-22 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_LZ_FIND_MT_H | 4 | #ifndef ZIP7_INC_LZ_FIND_MT_H |
5 | #define ZIP7_INC_LZ_FIND_MT_H | 5 | #define ZIP7_INC_LZ_FIND_MT_H |
@@ -31,7 +31,10 @@ typedef struct | |||
31 | // UInt32 numBlocks_Sent; | 31 | // UInt32 numBlocks_Sent; |
32 | } CMtSync; | 32 | } CMtSync; |
33 | 33 | ||
34 | typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distances); | 34 | |
35 | struct CMatchFinderMt_; | ||
36 | |||
37 | typedef UInt32 * (*Mf_Mix_Matches)(struct CMatchFinderMt_ *p, UInt32 matchMinPos, UInt32 *distances); | ||
35 | 38 | ||
36 | /* kMtCacheLineDummy must be >= size_of_CPU_cache_line */ | 39 | /* kMtCacheLineDummy must be >= size_of_CPU_cache_line */ |
37 | #define kMtCacheLineDummy 128 | 40 | #define kMtCacheLineDummy 128 |
@@ -39,7 +42,7 @@ typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distance | |||
39 | typedef void (*Mf_GetHeads)(const Byte *buffer, UInt32 pos, | 42 | typedef void (*Mf_GetHeads)(const Byte *buffer, UInt32 pos, |
40 | UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc); | 43 | UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc); |
41 | 44 | ||
42 | typedef struct | 45 | typedef struct CMatchFinderMt_ |
43 | { | 46 | { |
44 | /* LZ */ | 47 | /* LZ */ |
45 | const Byte *pointerToCurPos; | 48 | const Byte *pointerToCurPos; |
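LzFindMt.h now forward-declares struct CMatchFinderMt_ so Mf_Mix_Matches can name the real struct type instead of void *. A minimal sketch of that forward-declaration pattern (illustrative names):

#include <stdio.h>

struct CCtx_;  /* forward declaration: enough for a pointer typedef */
typedef int (*Mix_Func)(struct CCtx_ *p, int x);

typedef struct CCtx_
{
  int base;
  Mix_Func mix;
} CCtx;

static int MixAdd(struct CCtx_ *p, int x) { return p->base + x; }

int main(void)
{
  CCtx c = { 10, MixAdd };
  printf("%d\n", c.mix(&c, 5));  /* prints 15 */
  return 0;
}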
diff --git a/C/Lzma2Dec.c b/C/Lzma2Dec.c index 388cbc7..8bf54e4 100644 --- a/C/Lzma2Dec.c +++ b/C/Lzma2Dec.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Lzma2Dec.c -- LZMA2 Decoder | 1 | /* Lzma2Dec.c -- LZMA2 Decoder |
2 | 2023-03-03 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | /* #define SHOW_DEBUG_INFO */ | 4 | /* #define SHOW_DEBUG_INFO */ |
5 | 5 | ||
@@ -157,8 +157,10 @@ static unsigned Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b) | |||
157 | p->decoder.prop.lp = (Byte)lp; | 157 | p->decoder.prop.lp = (Byte)lp; |
158 | return LZMA2_STATE_DATA; | 158 | return LZMA2_STATE_DATA; |
159 | } | 159 | } |
160 | |||
161 | default: | ||
162 | return LZMA2_STATE_ERROR; | ||
160 | } | 163 | } |
161 | return LZMA2_STATE_ERROR; | ||
162 | } | 164 | } |
163 | 165 | ||
164 | static void LzmaDec_UpdateWithUncompressed(CLzmaDec *p, const Byte *src, SizeT size) | 166 | static void LzmaDec_UpdateWithUncompressed(CLzmaDec *p, const Byte *src, SizeT size) |
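The Lzma2Dec.c hunk moves the error return into a default: label so every path returns inside the switch. A toy sketch of the same control-flow shape (state names are made up):

#include <stdio.h>

enum { STATE_DATA, STATE_PROP, STATE_ERROR };

/* every path returns inside the switch; no code after it */
static int update_state(unsigned control)
{
  switch (control)
  {
    case 0: return STATE_DATA;
    case 1: return STATE_PROP;
    default: return STATE_ERROR;  /* previously a return after the switch */
  }
}

int main(void)
{
  printf("%d %d %d\n", update_state(0), update_state(1), update_state(7));
  return 0;
}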
diff --git a/C/LzmaEnc.c b/C/LzmaEnc.c index 6d13cac..37b2787 100644 --- a/C/LzmaEnc.c +++ b/C/LzmaEnc.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* LzmaEnc.c -- LZMA Encoder | 1 | /* LzmaEnc.c -- LZMA Encoder |
2 | 2023-04-13: Igor Pavlov : Public domain */ | 2 | 2024-01-24: Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -195,11 +195,11 @@ unsigned GetPosSlot1(UInt32 pos); | |||
195 | unsigned GetPosSlot1(UInt32 pos) | 195 | unsigned GetPosSlot1(UInt32 pos) |
196 | { | 196 | { |
197 | unsigned res; | 197 | unsigned res; |
198 | BSR2_RET(pos, res); | 198 | BSR2_RET(pos, res) |
199 | return res; | 199 | return res; |
200 | } | 200 | } |
201 | #define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } | 201 | #define GetPosSlot2(pos, res) { BSR2_RET(pos, res) } |
202 | #define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res); } | 202 | #define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res) } |
203 | 203 | ||
204 | 204 | ||
205 | #else // ! LZMA_LOG_BSR | 205 | #else // ! LZMA_LOG_BSR |
@@ -512,7 +512,7 @@ struct CLzmaEnc | |||
512 | COPY_ARR(d, s, posEncoders) \ | 512 | COPY_ARR(d, s, posEncoders) \ |
513 | (d)->lenProbs = (s)->lenProbs; \ | 513 | (d)->lenProbs = (s)->lenProbs; \ |
514 | (d)->repLenProbs = (s)->repLenProbs; \ | 514 | (d)->repLenProbs = (s)->repLenProbs; \ |
515 | memcpy((d)->litProbs, (s)->litProbs, ((UInt32)0x300 << (p)->lclp) * sizeof(CLzmaProb)); | 515 | memcpy((d)->litProbs, (s)->litProbs, ((size_t)0x300 * sizeof(CLzmaProb)) << (p)->lclp); |
516 | 516 | ||
517 | void LzmaEnc_SaveState(CLzmaEncHandle p) | 517 | void LzmaEnc_SaveState(CLzmaEncHandle p) |
518 | { | 518 | { |
@@ -1040,14 +1040,14 @@ Z7_NO_INLINE static void Z7_FASTCALL LenPriceEnc_UpdateTables( | |||
1040 | UInt32 price = b; | 1040 | UInt32 price = b; |
1041 | do | 1041 | do |
1042 | { | 1042 | { |
1043 | unsigned bit = sym & 1; | 1043 | const unsigned bit = sym & 1; |
1044 | sym >>= 1; | 1044 | sym >>= 1; |
1045 | price += GET_PRICEa(probs[sym], bit); | 1045 | price += GET_PRICEa(probs[sym], bit); |
1046 | } | 1046 | } |
1047 | while (sym >= 2); | 1047 | while (sym >= 2); |
1048 | 1048 | ||
1049 | { | 1049 | { |
1050 | unsigned prob = probs[(size_t)i + (1 << (kLenNumHighBits - 1))]; | 1050 | const unsigned prob = probs[(size_t)i + (1 << (kLenNumHighBits - 1))]; |
1051 | prices[(size_t)i * 2 ] = price + GET_PRICEa_0(prob); | 1051 | prices[(size_t)i * 2 ] = price + GET_PRICEa_0(prob); |
1052 | prices[(size_t)i * 2 + 1] = price + GET_PRICEa_1(prob); | 1052 | prices[(size_t)i * 2 + 1] = price + GET_PRICEa_1(prob); |
1053 | } | 1053 | } |
@@ -1056,7 +1056,7 @@ Z7_NO_INLINE static void Z7_FASTCALL LenPriceEnc_UpdateTables( | |||
1056 | 1056 | ||
1057 | { | 1057 | { |
1058 | unsigned posState; | 1058 | unsigned posState; |
1059 | size_t num = (p->tableSize - kLenNumLowSymbols * 2) * sizeof(p->prices[0][0]); | 1059 | const size_t num = (p->tableSize - kLenNumLowSymbols * 2) * sizeof(p->prices[0][0]); |
1060 | for (posState = 1; posState < numPosStates; posState++) | 1060 | for (posState = 1; posState < numPosStates; posState++) |
1061 | memcpy(p->prices[posState] + kLenNumLowSymbols * 2, p->prices[0] + kLenNumLowSymbols * 2, num); | 1061 | memcpy(p->prices[posState] + kLenNumLowSymbols * 2, p->prices[0] + kLenNumLowSymbols * 2, num); |
1062 | } | 1062 | } |
@@ -2696,12 +2696,12 @@ static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, | |||
2696 | #endif | 2696 | #endif |
2697 | 2697 | ||
2698 | { | 2698 | { |
2699 | unsigned lclp = p->lc + p->lp; | 2699 | const unsigned lclp = p->lc + p->lp; |
2700 | if (!p->litProbs || !p->saveState.litProbs || p->lclp != lclp) | 2700 | if (!p->litProbs || !p->saveState.litProbs || p->lclp != lclp) |
2701 | { | 2701 | { |
2702 | LzmaEnc_FreeLits(p, alloc); | 2702 | LzmaEnc_FreeLits(p, alloc); |
2703 | p->litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((UInt32)0x300 << lclp) * sizeof(CLzmaProb)); | 2703 | p->litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((size_t)0x300 * sizeof(CLzmaProb)) << lclp); |
2704 | p->saveState.litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((UInt32)0x300 << lclp) * sizeof(CLzmaProb)); | 2704 | p->saveState.litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((size_t)0x300 * sizeof(CLzmaProb)) << lclp); |
2705 | if (!p->litProbs || !p->saveState.litProbs) | 2705 | if (!p->litProbs || !p->saveState.litProbs) |
2706 | { | 2706 | { |
2707 | LzmaEnc_FreeLits(p, alloc); | 2707 | LzmaEnc_FreeLits(p, alloc); |
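Both allocation sites size the literal-probability array as 0x300 << (lc + lp) entries; the rewrite performs that arithmetic in size_t from the start instead of computing it in UInt32 and widening afterwards. A quick check with the LZMA default literal parameters (lc = 3, lp = 0 are assumptions here, not part of this hunk; CLzmaProb is 16-bit in the default, non-Z7_LZMA_PROB32 build):

    /* hedged sketch: literal-probs allocation size for the assumed defaults */
    #include <stdio.h>
    typedef unsigned short CLzmaProb;        /* 16-bit probs (default build) */
    int main(void)
    {
        const unsigned lclp = 3 + 0;         /* lc + lp */
        const size_t numProbs = (size_t)0x300 << lclp;                        /* 6144 */
        const size_t numBytes = ((size_t)0x300 * sizeof(CLzmaProb)) << lclp;  /* 12288 */
        printf("%zu probs, %zu bytes\n", numProbs, numBytes);
        return 0;
    }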
@@ -2802,8 +2802,8 @@ static void LzmaEnc_Init(CLzmaEnc *p) | |||
2802 | } | 2802 | } |
2803 | 2803 | ||
2804 | { | 2804 | { |
2805 | UInt32 num = (UInt32)0x300 << (p->lp + p->lc); | 2805 | const size_t num = (size_t)0x300 << (p->lp + p->lc); |
2806 | UInt32 k; | 2806 | size_t k; |
2807 | CLzmaProb *probs = p->litProbs; | 2807 | CLzmaProb *probs = p->litProbs; |
2808 | for (k = 0; k < num; k++) | 2808 | for (k = 0; k < num; k++) |
2809 | probs[k] = kProbInitValue; | 2809 | probs[k] = kProbInitValue; |
diff --git a/C/MtCoder.c b/C/MtCoder.c index 6f58abb..03959b6 100644 --- a/C/MtCoder.c +++ b/C/MtCoder.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* MtCoder.c -- Multi-thread Coder | 1 | /* MtCoder.c -- Multi-thread Coder |
2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | 2023-09-07 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -430,7 +430,7 @@ SRes MtCoder_Code(CMtCoder *p) | |||
430 | SRes res = SZ_OK; | 430 | SRes res = SZ_OK; |
431 | 431 | ||
432 | if (numThreads > MTCODER_THREADS_MAX) | 432 | if (numThreads > MTCODER_THREADS_MAX) |
433 | numThreads = MTCODER_THREADS_MAX; | 433 | numThreads = MTCODER_THREADS_MAX; |
434 | numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); | 434 | numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); |
435 | 435 | ||
436 | if (p->blockSize < ((UInt32)1 << 26)) numBlocksMax++; | 436 | if (p->blockSize < ((UInt32)1 << 26)) numBlocksMax++; |
@@ -438,7 +438,7 @@ SRes MtCoder_Code(CMtCoder *p) | |||
438 | if (p->blockSize < ((UInt32)1 << 22)) numBlocksMax++; | 438 | if (p->blockSize < ((UInt32)1 << 22)) numBlocksMax++; |
439 | 439 | ||
440 | if (numBlocksMax > MTCODER_BLOCKS_MAX) | 440 | if (numBlocksMax > MTCODER_BLOCKS_MAX) |
441 | numBlocksMax = MTCODER_BLOCKS_MAX; | 441 | numBlocksMax = MTCODER_BLOCKS_MAX; |
442 | 442 | ||
443 | if (p->blockSize != p->allocatedBufsSize) | 443 | if (p->blockSize != p->allocatedBufsSize) |
444 | { | 444 | { |
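The two hunks above clamp the number of in-flight blocks: the thread count drives a base value, small block sizes earn up to three extra blocks, and the result is capped at MTCODER_BLOCKS_MAX. A stand-alone sketch of the same control flow; the macro bodies and the middle (1 << 24) threshold fall outside these hunks, so the values below are assumptions, not quotes:

    /* hedged sketch of the block-count clamp in MtCoder_Code */
    #include <stdio.h>
    #define GET_NUM_BLOCKS_FROM_THREADS(t) ((t) + (t) / 8 + 1)  /* assumed body */
    #define BLOCKS_MAX 64                                       /* assumed cap */
    static unsigned clamp_blocks(unsigned numThreads, unsigned blockSizeLog2)
    {
        unsigned numBlocksMax = GET_NUM_BLOCKS_FROM_THREADS(numThreads);
        if (blockSizeLog2 < 26) numBlocksMax++;   /* smaller blocks: more in flight */
        if (blockSizeLog2 < 24) numBlocksMax++;   /* middle threshold assumed */
        if (blockSizeLog2 < 22) numBlocksMax++;
        return numBlocksMax > BLOCKS_MAX ? BLOCKS_MAX : numBlocksMax;
    }
    int main(void)
    {
        printf("%u\n", clamp_blocks(8, 21));      /* (8 + 8/8 + 1) + 3 = 13 */
        return 0;
    }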
@@ -469,7 +469,7 @@ SRes MtCoder_Code(CMtCoder *p) | |||
469 | 469 | ||
470 | { | 470 | { |
471 | RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->readEvent)) | 471 | RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->readEvent)) |
472 | RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, numBlocksMax, numBlocksMax)) | 472 | RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, (UInt32)numBlocksMax, (UInt32)numBlocksMax)) |
473 | } | 473 | } |
474 | 474 | ||
475 | for (i = 0; i < MTCODER_BLOCKS_MAX - 1; i++) | 475 | for (i = 0; i < MTCODER_BLOCKS_MAX - 1; i++) |
diff --git a/C/MtDec.c b/C/MtDec.c --- a/C/MtDec.c +++ b/C/MtDec.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* MtDec.c -- Multi-thread Decoder | 1 | /* MtDec.c -- Multi-thread Decoder |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-02-20 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -809,6 +809,16 @@ static WRes MtDec_ThreadFunc2(CMtDecThread *t) | |||
809 | #endif | 809 | #endif |
810 | 810 | ||
811 | 811 | ||
812 | typedef | ||
813 | #ifdef _WIN32 | ||
814 | UINT_PTR | ||
815 | #elif 1 | ||
816 | uintptr_t | ||
817 | #else | ||
818 | ptrdiff_t | ||
819 | #endif | ||
820 | MY_uintptr_t; | ||
821 | |||
812 | static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) | 822 | static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) |
813 | { | 823 | { |
814 | WRes res; | 824 | WRes res; |
@@ -821,7 +831,7 @@ static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) | |||
821 | res = MtDec_ThreadFunc2(t); | 831 | res = MtDec_ThreadFunc2(t); |
822 | p = t->mtDec; | 832 | p = t->mtDec; |
823 | if (res == 0) | 833 | if (res == 0) |
824 | return (THREAD_FUNC_RET_TYPE)(UINT_PTR)p->exitThreadWRes; | 834 | return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)p->exitThreadWRes; |
825 | { | 835 | { |
826 | // it's unexpected situation for some threading function error | 836 | // it's unexpected situation for some threading function error |
827 | if (p->exitThreadWRes == 0) | 837 | if (p->exitThreadWRes == 0) |
@@ -832,7 +842,7 @@ static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) | |||
832 | Event_Set(&p->threads[0].canWrite); | 842 | Event_Set(&p->threads[0].canWrite); |
833 | MtProgress_SetError(&p->mtProgress, MY_SRes_HRESULT_FROM_WRes(res)); | 843 | MtProgress_SetError(&p->mtProgress, MY_SRes_HRESULT_FROM_WRes(res)); |
834 | } | 844 | } |
835 | return (THREAD_FUNC_RET_TYPE)(UINT_PTR)res; | 845 | return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)res; |
836 | } | 846 | } |
837 | 847 | ||
838 | static Z7_NO_INLINE THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp) | 848 | static Z7_NO_INLINE THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp) |
@@ -1072,7 +1082,7 @@ SRes MtDec_Code(CMtDec *p) | |||
1072 | if (wres == 0) { wres = Event_Set(&nextThread->canWrite); | 1082 | if (wres == 0) { wres = Event_Set(&nextThread->canWrite); |
1073 | if (wres == 0) { wres = Event_Set(&nextThread->canRead); | 1083 | if (wres == 0) { wres = Event_Set(&nextThread->canRead); |
1074 | if (wres == 0) { THREAD_FUNC_RET_TYPE res = MtDec_ThreadFunc(nextThread); | 1084 | if (wres == 0) { THREAD_FUNC_RET_TYPE res = MtDec_ThreadFunc(nextThread); |
1075 | wres = (WRes)(UINT_PTR)res; | 1085 | wres = (WRes)(MY_uintptr_t)res; |
1076 | if (wres != 0) | 1086 | if (wres != 0) |
1077 | { | 1087 | { |
1078 | p->needContinue = False; | 1088 | p->needContinue = False; |
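The new MY_uintptr_t typedef exists so a WRes status can round-trip through the thread return value without truncation warnings: UINT_PTR is Windows-only, so uintptr_t takes over elsewhere. A minimal sketch of the round trip, assuming a pthread-style void * return type (the names here are stand-ins, not the real MtDec types):

    /* hedged sketch: widening an int status through a pointer-sized return */
    #include <stdint.h>
    #include <stdio.h>
    static void *worker(void *arg)
    {
        const int wres = *(const int *)arg;
        return (void *)(uintptr_t)wres;      /* widen via uintptr_t, no warning */
    }
    int main(void)
    {
        int code = 42;
        void *ret = worker(&code);
        printf("%d\n", (int)(uintptr_t)ret); /* recovers 42 */
        return 0;
    }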
diff --git a/C/Ppmd7.c b/C/Ppmd7.c --- a/C/Ppmd7.c +++ b/C/Ppmd7.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Ppmd7.c -- PPMdH codec | 1 | /* Ppmd7.c -- PPMdH codec |
2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
3 | This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ | 3 | This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ |
4 | 4 | ||
5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
@@ -302,8 +302,17 @@ static void *Ppmd7_AllocUnits(CPpmd7 *p, unsigned indx) | |||
302 | 302 | ||
303 | 303 | ||
304 | #define MEM_12_CPY(dest, src, num) \ | 304 | #define MEM_12_CPY(dest, src, num) \ |
305 | { UInt32 *d = (UInt32 *)dest; const UInt32 *z = (const UInt32 *)src; UInt32 n = num; \ | 305 | { UInt32 *d = (UInt32 *)(dest); \ |
306 | do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); } | 306 | const UInt32 *z = (const UInt32 *)(src); \ |
307 | unsigned n = (num); \ | ||
308 | do { \ | ||
309 | d[0] = z[0]; \ | ||
310 | d[1] = z[1]; \ | ||
311 | d[2] = z[2]; \ | ||
312 | z += 3; \ | ||
313 | d += 3; \ | ||
314 | } while (--n); \ | ||
315 | } | ||
307 | 316 | ||
308 | 317 | ||
309 | /* | 318 | /* |
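MEM_12_CPY copies num 12-byte PPMd units as three 32-bit words per unit; the rewrite only adds parentheses, narrows n to unsigned, and reflows the body, so behavior is unchanged (the identical edit reappears in Ppmd8.c below). Because the test sits at the bottom of the do/while, num must be at least 1. A self-contained check:

    /* hedged sketch: exercising the MEM_12_CPY macro shape shown above */
    #include <stdio.h>
    typedef unsigned int UInt32;
    #define MEM_12_CPY(dest, src, num) \
      { UInt32 *d = (UInt32 *)(dest); \
        const UInt32 *z = (const UInt32 *)(src); \
        unsigned n = (num); \
        do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); }
    int main(void)
    {
        UInt32 src[6] = { 1, 2, 3, 4, 5, 6 };  /* two 12-byte units */
        UInt32 dst[6] = { 0 };
        MEM_12_CPY(dst, src, 2)                /* num >= 1 required */
        printf("%u %u\n", dst[0], dst[5]);     /* 1 6 */
        return 0;
    }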
@@ -711,8 +720,8 @@ void Ppmd7_UpdateModel(CPpmd7 *p) | |||
711 | if ((ns1 & 1) == 0) | 720 | if ((ns1 & 1) == 0) |
712 | { | 721 | { |
713 | /* Expand for one UNIT */ | 722 | /* Expand for one UNIT */ |
714 | unsigned oldNU = ns1 >> 1; | 723 | const unsigned oldNU = ns1 >> 1; |
715 | unsigned i = U2I(oldNU); | 724 | const unsigned i = U2I(oldNU); |
716 | if (i != U2I((size_t)oldNU + 1)) | 725 | if (i != U2I((size_t)oldNU + 1)) |
717 | { | 726 | { |
718 | void *ptr = Ppmd7_AllocUnits(p, i + 1); | 727 | void *ptr = Ppmd7_AllocUnits(p, i + 1); |
@@ -731,7 +740,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p) | |||
731 | sum = c->Union2.SummFreq; | 740 | sum = c->Union2.SummFreq; |
732 | /* max increase of Escape_Freq is 3 here. | 741 | /* max increase of Escape_Freq is 3 here. |
733 | total increase of Union2.SummFreq for all symbols is less than 256 here */ | 742 | total increase of Union2.SummFreq for all symbols is less than 256 here */ |
734 | sum += (UInt32)(2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1)); | 743 | sum += (UInt32)(unsigned)((2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1))); |
735 | /* original PPMdH uses 16-bit variable for (sum) here. | 744 | /* original PPMdH uses 16-bit variable for (sum) here. |
736 | But (sum < 0x9000). So we don't truncate (sum) to 16-bit */ | 745 | But (sum < 0x9000). So we don't truncate (sum) to 16-bit */ |
737 | // sum = (UInt16)sum; | 746 | // sum = (UInt16)sum; |
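The adjusted line keeps the branch-free escape-frequency bump: each comparison yields 0 or 1, so sum grows by at most 1 + 2 = 3, exactly as the comment states; the added casts only make the unsigned-to-UInt32 widening explicit. Worked through with made-up counts:

    /* hedged sketch: the branch-free bump, with hypothetical values */
    #include <stdio.h>
    int main(void)
    {
        const unsigned ns1 = 3, ns = 16, sum = 20;
        const unsigned bump = (2 * ns1 < ns)                        /* 1 */
            + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1));   /* 2 */
        printf("bump = %u\n", bump);                                /* 3 */
        return 0;
    }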
@@ -761,7 +770,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p) | |||
761 | // (max(s->freq) == 120), when we convert from 1-symbol into 2-symbol context | 770 | // (max(s->freq) == 120), when we convert from 1-symbol into 2-symbol context |
762 | s->Freq = (Byte)freq; | 771 | s->Freq = (Byte)freq; |
763 | // max(InitEsc = PPMD7_kExpEscape[*]) is 25. So the max(escapeFreq) is 26 here | 772 | // max(InitEsc = PPMD7_kExpEscape[*]) is 25. So the max(escapeFreq) is 26 here |
764 | sum = freq + p->InitEsc + (ns > 3); | 773 | sum = (UInt32)(freq + p->InitEsc + (ns > 3)); |
765 | } | 774 | } |
766 | } | 775 | } |
767 | 776 | ||
@@ -933,10 +942,10 @@ CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq) | |||
933 | p->HiBitsFlag; | 942 | p->HiBitsFlag; |
934 | { | 943 | { |
935 | // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ | 944 | // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ |
936 | unsigned summ = (UInt16)see->Summ; // & 0xFFFF | 945 | const unsigned summ = (UInt16)see->Summ; // & 0xFFFF |
937 | unsigned r = (summ >> see->Shift); | 946 | const unsigned r = (summ >> see->Shift); |
938 | see->Summ = (UInt16)(summ - r); | 947 | see->Summ = (UInt16)(summ - r); |
939 | *escFreq = r + (r == 0); | 948 | *escFreq = (UInt32)(r + (r == 0)); |
940 | } | 949 | } |
941 | } | 950 | } |
942 | else | 951 | else |
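The SEE update reads off roughly Summ >> Shift as the escape frequency, decays Summ by the same amount, and uses r + (r == 0) to guarantee a nonzero result. One step in isolation (the state values are hypothetical):

    /* hedged sketch: one SEE read-and-decay step as in Ppmd7_MakeEscFreq */
    #include <stdio.h>
    int main(void)
    {
        unsigned summ = 4660, shift = 4;         /* hypothetical SEE state */
        const unsigned r = summ >> shift;        /* estimated escape freq: 291 */
        summ -= r;                               /* decay the accumulator */
        printf("escFreq = %u\n", r + (r == 0));  /* never returns 0 */
        return 0;
    }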
@@ -981,9 +990,9 @@ void Ppmd7_Update1_0(CPpmd7 *p) | |||
981 | CPpmd_State *s = p->FoundState; | 990 | CPpmd_State *s = p->FoundState; |
982 | CPpmd7_Context *mc = p->MinContext; | 991 | CPpmd7_Context *mc = p->MinContext; |
983 | unsigned freq = s->Freq; | 992 | unsigned freq = s->Freq; |
984 | unsigned summFreq = mc->Union2.SummFreq; | 993 | const unsigned summFreq = mc->Union2.SummFreq; |
985 | p->PrevSuccess = (2 * freq > summFreq); | 994 | p->PrevSuccess = (2 * freq > summFreq); |
986 | p->RunLength += (int)p->PrevSuccess; | 995 | p->RunLength += (Int32)p->PrevSuccess; |
987 | mc->Union2.SummFreq = (UInt16)(summFreq + 4); | 996 | mc->Union2.SummFreq = (UInt16)(summFreq + 4); |
988 | freq += 4; | 997 | freq += 4; |
989 | s->Freq = (Byte)freq; | 998 | s->Freq = (Byte)freq; |
diff --git a/C/Ppmd7Dec.c b/C/Ppmd7Dec.c index 8323828..081ab89 100644 --- a/C/Ppmd7Dec.c +++ b/C/Ppmd7Dec.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Ppmd7Dec.c -- Ppmd7z (PPMdH with 7z Range Coder) Decoder | 1 | /* Ppmd7Dec.c -- Ppmd7z (PPMdH with 7z Range Coder) Decoder |
2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
3 | This code is based on: | 3 | This code is based on: |
4 | PPMd var.H (2001): Dmitry Shkarin : Public domain */ | 4 | PPMd var.H (2001): Dmitry Shkarin : Public domain */ |
5 | 5 | ||
@@ -58,7 +58,7 @@ static void Ppmd7z_RD_Decode(CPpmd7 *p, UInt32 start, UInt32 size) | |||
58 | #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) | 58 | #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) |
59 | void Ppmd7_UpdateModel(CPpmd7 *p); | 59 | void Ppmd7_UpdateModel(CPpmd7 *p); |
60 | 60 | ||
61 | #define MASK(sym) ((unsigned char *)charMask)[sym] | 61 | #define MASK(sym) ((Byte *)charMask)[sym] |
62 | // Z7_FORCE_INLINE | 62 | // Z7_FORCE_INLINE |
63 | // static | 63 | // static |
64 | int Ppmd7z_DecodeSymbol(CPpmd7 *p) | 64 | int Ppmd7z_DecodeSymbol(CPpmd7 *p) |
@@ -120,8 +120,8 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p) | |||
120 | MASK(s->Symbol) = 0; | 120 | MASK(s->Symbol) = 0; |
121 | do | 121 | do |
122 | { | 122 | { |
123 | unsigned sym0 = s2[0].Symbol; | 123 | const unsigned sym0 = s2[0].Symbol; |
124 | unsigned sym1 = s2[1].Symbol; | 124 | const unsigned sym1 = s2[1].Symbol; |
125 | s2 += 2; | 125 | s2 += 2; |
126 | MASK(sym0) = 0; | 126 | MASK(sym0) = 0; |
127 | MASK(sym1) = 0; | 127 | MASK(sym1) = 0; |
@@ -209,17 +209,17 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p) | |||
209 | unsigned num2 = num / 2; | 209 | unsigned num2 = num / 2; |
210 | 210 | ||
211 | num &= 1; | 211 | num &= 1; |
212 | hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); | 212 | hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num); |
213 | s += num; | 213 | s += num; |
214 | p->MinContext = mc; | 214 | p->MinContext = mc; |
215 | 215 | ||
216 | do | 216 | do |
217 | { | 217 | { |
218 | unsigned sym0 = s[0].Symbol; | 218 | const unsigned sym0 = s[0].Symbol; |
219 | unsigned sym1 = s[1].Symbol; | 219 | const unsigned sym1 = s[1].Symbol; |
220 | s += 2; | 220 | s += 2; |
221 | hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); | 221 | hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0))); |
222 | hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); | 222 | hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1))); |
223 | } | 223 | } |
224 | while (--num2); | 224 | while (--num2); |
225 | } | 225 | } |
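All the MASK arithmetic above relies on charMask holding 0xFF for still-candidate symbols and 0 for masked ones, so Freq & MASK(sym) adds a frequency only when the symbol has not been ruled out; the diff merely widens the cast from unsigned to UInt32. The idea in isolation:

    /* hedged sketch: branch-free masked frequency sum (PPMd freqs fit a Byte) */
    #include <stdio.h>
    typedef unsigned char Byte;
    int main(void)
    {
        Byte charMask[256];
        unsigned i, hiCnt = 0;
        const unsigned freq[3] = { 10, 20, 30 };
        const unsigned sym[3]  = { 'a', 'b', 'c' };
        for (i = 0; i < 256; i++) charMask[i] = 0xFF;  /* all candidates */
        charMask['b'] = 0;                             /* mask one symbol */
        for (i = 0; i < 3; i++)
            hiCnt += freq[i] & charMask[sym[i]];       /* adds 10 and 30 only */
        printf("%u\n", hiCnt);                         /* 40 */
        return 0;
    }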
@@ -238,13 +238,13 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p) | |||
238 | 238 | ||
239 | s = Ppmd7_GetStats(p, p->MinContext); | 239 | s = Ppmd7_GetStats(p, p->MinContext); |
240 | hiCnt = count; | 240 | hiCnt = count; |
241 | // count -= s->Freq & (unsigned)(MASK(s->Symbol)); | 241 | // count -= s->Freq & (UInt32)(MASK(s->Symbol)); |
242 | // if ((Int32)count >= 0) | 242 | // if ((Int32)count >= 0) |
243 | { | 243 | { |
244 | for (;;) | 244 | for (;;) |
245 | { | 245 | { |
246 | count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; | 246 | count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; |
247 | // count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; | 247 | // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; |
248 | } | 248 | } |
249 | } | 249 | } |
250 | s--; | 250 | s--; |
diff --git a/C/Ppmd7Enc.c b/C/Ppmd7Enc.c index 41106ba..49cbbe6 100644 --- a/C/Ppmd7Enc.c +++ b/C/Ppmd7Enc.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Ppmd7Enc.c -- Ppmd7z (PPMdH with 7z Range Coder) Encoder | 1 | /* Ppmd7Enc.c -- Ppmd7z (PPMdH with 7z Range Coder) Encoder |
2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
3 | This code is based on: | 3 | This code is based on: |
4 | PPMd var.H (2001): Dmitry Shkarin : Public domain */ | 4 | PPMd var.H (2001): Dmitry Shkarin : Public domain */ |
5 | 5 | ||
@@ -82,7 +82,7 @@ void Ppmd7z_Flush_RangeEnc(CPpmd7 *p) | |||
82 | 82 | ||
83 | void Ppmd7_UpdateModel(CPpmd7 *p); | 83 | void Ppmd7_UpdateModel(CPpmd7 *p); |
84 | 84 | ||
85 | #define MASK(sym) ((unsigned char *)charMask)[sym] | 85 | #define MASK(sym) ((Byte *)charMask)[sym] |
86 | 86 | ||
87 | Z7_FORCE_INLINE | 87 | Z7_FORCE_INLINE |
88 | static | 88 | static |
@@ -139,8 +139,8 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol) | |||
139 | MASK(s->Symbol) = 0; | 139 | MASK(s->Symbol) = 0; |
140 | do | 140 | do |
141 | { | 141 | { |
142 | unsigned sym0 = s2[0].Symbol; | 142 | const unsigned sym0 = s2[0].Symbol; |
143 | unsigned sym1 = s2[1].Symbol; | 143 | const unsigned sym1 = s2[1].Symbol; |
144 | s2 += 2; | 144 | s2 += 2; |
145 | MASK(sym0) = 0; | 145 | MASK(sym0) = 0; |
146 | MASK(sym1) = 0; | 146 | MASK(sym1) = 0; |
@@ -265,16 +265,15 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol) | |||
265 | if (num2 != 0) | 265 | if (num2 != 0) |
266 | { | 266 | { |
267 | s += i; | 267 | s += i; |
268 | for (;;) | 268 | do |
269 | { | 269 | { |
270 | unsigned sym0 = s[0].Symbol; | 270 | const unsigned sym0 = s[0].Symbol; |
271 | unsigned sym1 = s[1].Symbol; | 271 | const unsigned sym1 = s[1].Symbol; |
272 | s += 2; | 272 | s += 2; |
273 | sum += (s[-2].Freq & (unsigned)(MASK(sym0))); | 273 | sum += (s[-2].Freq & (unsigned)(MASK(sym0))); |
274 | sum += (s[-1].Freq & (unsigned)(MASK(sym1))); | 274 | sum += (s[-1].Freq & (unsigned)(MASK(sym1))); |
275 | if (--num2 == 0) | ||
276 | break; | ||
277 | } | 275 | } |
276 | while (--num2); | ||
278 | } | 277 | } |
279 | 278 | ||
280 | 279 | ||
diff --git a/C/Ppmd7aDec.c b/C/Ppmd7aDec.c index 55e164e..ef86dde 100644 --- a/C/Ppmd7aDec.c +++ b/C/Ppmd7aDec.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Ppmd7aDec.c -- PPMd7a (PPMdH) Decoder | 1 | /* Ppmd7aDec.c -- PPMd7a (PPMdH) Decoder |
2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
3 | This code is based on: | 3 | This code is based on: |
4 | PPMd var.H (2001): Dmitry Shkarin : Public domain | 4 | PPMd var.H (2001): Dmitry Shkarin : Public domain |
5 | Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ | 5 | Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ |
@@ -58,7 +58,7 @@ typedef CPpmd7_Context * CTX_PTR; | |||
58 | #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) | 58 | #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) |
59 | void Ppmd7_UpdateModel(CPpmd7 *p); | 59 | void Ppmd7_UpdateModel(CPpmd7 *p); |
60 | 60 | ||
61 | #define MASK(sym) ((unsigned char *)charMask)[sym] | 61 | #define MASK(sym) ((Byte *)charMask)[sym] |
62 | 62 | ||
63 | 63 | ||
64 | int Ppmd7a_DecodeSymbol(CPpmd7 *p) | 64 | int Ppmd7a_DecodeSymbol(CPpmd7 *p) |
@@ -120,8 +120,8 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p) | |||
120 | MASK(s->Symbol) = 0; | 120 | MASK(s->Symbol) = 0; |
121 | do | 121 | do |
122 | { | 122 | { |
123 | unsigned sym0 = s2[0].Symbol; | 123 | const unsigned sym0 = s2[0].Symbol; |
124 | unsigned sym1 = s2[1].Symbol; | 124 | const unsigned sym1 = s2[1].Symbol; |
125 | s2 += 2; | 125 | s2 += 2; |
126 | MASK(sym0) = 0; | 126 | MASK(sym0) = 0; |
127 | MASK(sym1) = 0; | 127 | MASK(sym1) = 0; |
@@ -209,17 +209,17 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p) | |||
209 | unsigned num2 = num / 2; | 209 | unsigned num2 = num / 2; |
210 | 210 | ||
211 | num &= 1; | 211 | num &= 1; |
212 | hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); | 212 | hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num); |
213 | s += num; | 213 | s += num; |
214 | p->MinContext = mc; | 214 | p->MinContext = mc; |
215 | 215 | ||
216 | do | 216 | do |
217 | { | 217 | { |
218 | unsigned sym0 = s[0].Symbol; | 218 | const unsigned sym0 = s[0].Symbol; |
219 | unsigned sym1 = s[1].Symbol; | 219 | const unsigned sym1 = s[1].Symbol; |
220 | s += 2; | 220 | s += 2; |
221 | hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); | 221 | hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0))); |
222 | hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); | 222 | hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1))); |
223 | } | 223 | } |
224 | while (--num2); | 224 | while (--num2); |
225 | } | 225 | } |
@@ -238,13 +238,13 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p) | |||
238 | 238 | ||
239 | s = Ppmd7_GetStats(p, p->MinContext); | 239 | s = Ppmd7_GetStats(p, p->MinContext); |
240 | hiCnt = count; | 240 | hiCnt = count; |
241 | // count -= s->Freq & (unsigned)(MASK(s->Symbol)); | 241 | // count -= s->Freq & (UInt32)(MASK(s->Symbol)); |
242 | // if ((Int32)count >= 0) | 242 | // if ((Int32)count >= 0) |
243 | { | 243 | { |
244 | for (;;) | 244 | for (;;) |
245 | { | 245 | { |
246 | count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; | 246 | count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; |
247 | // count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; | 247 | // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; |
248 | } | 248 | } |
249 | } | 249 | } |
250 | s--; | 250 | s--; |
diff --git a/C/Ppmd8.c b/C/Ppmd8.c --- a/C/Ppmd8.c +++ b/C/Ppmd8.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Ppmd8.c -- PPMdI codec | 1 | /* Ppmd8.c -- PPMdI codec |
2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
3 | This code is based on PPMd var.I (2002): Dmitry Shkarin : Public domain */ | 3 | This code is based on PPMd var.I (2002): Dmitry Shkarin : Public domain */ |
4 | 4 | ||
5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
@@ -302,8 +302,17 @@ static void *Ppmd8_AllocUnits(CPpmd8 *p, unsigned indx) | |||
302 | 302 | ||
303 | 303 | ||
304 | #define MEM_12_CPY(dest, src, num) \ | 304 | #define MEM_12_CPY(dest, src, num) \ |
305 | { UInt32 *d = (UInt32 *)dest; const UInt32 *z = (const UInt32 *)src; UInt32 n = num; \ | 305 | { UInt32 *d = (UInt32 *)(dest); \ |
306 | do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); } | 306 | const UInt32 *z = (const UInt32 *)(src); \ |
307 | unsigned n = (num); \ | ||
308 | do { \ | ||
309 | d[0] = z[0]; \ | ||
310 | d[1] = z[1]; \ | ||
311 | d[2] = z[2]; \ | ||
312 | z += 3; \ | ||
313 | d += 3; \ | ||
314 | } while (--n); \ | ||
315 | } | ||
307 | 316 | ||
308 | 317 | ||
309 | 318 | ||
@@ -1215,8 +1224,8 @@ void Ppmd8_UpdateModel(CPpmd8 *p) | |||
1215 | if ((ns1 & 1) != 0) | 1224 | if ((ns1 & 1) != 0) |
1216 | { | 1225 | { |
1217 | /* Expand for one UNIT */ | 1226 | /* Expand for one UNIT */ |
1218 | unsigned oldNU = (ns1 + 1) >> 1; | 1227 | const unsigned oldNU = (ns1 + 1) >> 1; |
1219 | unsigned i = U2I(oldNU); | 1228 | const unsigned i = U2I(oldNU); |
1220 | if (i != U2I((size_t)oldNU + 1)) | 1229 | if (i != U2I((size_t)oldNU + 1)) |
1221 | { | 1230 | { |
1222 | void *ptr = Ppmd8_AllocUnits(p, i + 1); | 1231 | void *ptr = Ppmd8_AllocUnits(p, i + 1); |
@@ -1235,7 +1244,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p) | |||
1235 | sum = c->Union2.SummFreq; | 1244 | sum = c->Union2.SummFreq; |
1236 | /* max increase of Escape_Freq is 1 here. | 1245 | /* max increase of Escape_Freq is 1 here. |
1237 | an average increase is 1/3 per symbol */ | 1246 | an average increase is 1/3 per symbol */ |
1238 | sum += (3 * ns1 + 1 < ns); | 1247 | sum += (UInt32)(unsigned)(3 * ns1 + 1 < ns); |
1239 | /* original PPMdH uses 16-bit variable for (sum) here. | 1248 | /* original PPMdH uses 16-bit variable for (sum) here. |
1240 | But (sum < ???). Do we need to truncate (sum) to 16-bit */ | 1249 | But (sum < ???). Do we need to truncate (sum) to 16-bit */ |
1241 | // sum = (UInt16)sum; | 1250 | // sum = (UInt16)sum; |
@@ -1265,7 +1274,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p) | |||
1265 | 1274 | ||
1266 | s->Freq = (Byte)freq; | 1275 | s->Freq = (Byte)freq; |
1267 | 1276 | ||
1268 | sum = freq + p->InitEsc + (ns > 2); // Ppmd8 (> 2) | 1277 | sum = (UInt32)(freq + p->InitEsc + (ns > 2)); // Ppmd8 (> 2) |
1269 | } | 1278 | } |
1270 | } | 1279 | } |
1271 | 1280 | ||
@@ -1437,10 +1446,10 @@ CPpmd_See *Ppmd8_MakeEscFreq(CPpmd8 *p, unsigned numMasked1, UInt32 *escFreq) | |||
1437 | 1446 | ||
1438 | { | 1447 | { |
1439 | // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ | 1448 | // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ |
1440 | unsigned summ = (UInt16)see->Summ; // & 0xFFFF | 1449 | const unsigned summ = (UInt16)see->Summ; // & 0xFFFF |
1441 | unsigned r = (summ >> see->Shift); | 1450 | const unsigned r = (summ >> see->Shift); |
1442 | see->Summ = (UInt16)(summ - r); | 1451 | see->Summ = (UInt16)(summ - r); |
1443 | *escFreq = r + (r == 0); | 1452 | *escFreq = (UInt32)(r + (r == 0)); |
1444 | } | 1453 | } |
1445 | } | 1454 | } |
1446 | else | 1455 | else |
@@ -1485,9 +1494,9 @@ void Ppmd8_Update1_0(CPpmd8 *p) | |||
1485 | CPpmd_State *s = p->FoundState; | 1494 | CPpmd_State *s = p->FoundState; |
1486 | CPpmd8_Context *mc = p->MinContext; | 1495 | CPpmd8_Context *mc = p->MinContext; |
1487 | unsigned freq = s->Freq; | 1496 | unsigned freq = s->Freq; |
1488 | unsigned summFreq = mc->Union2.SummFreq; | 1497 | const unsigned summFreq = mc->Union2.SummFreq; |
1489 | p->PrevSuccess = (2 * freq >= summFreq); // Ppmd8 (>=) | 1498 | p->PrevSuccess = (2 * freq >= summFreq); // Ppmd8 (>=) |
1490 | p->RunLength += (int)p->PrevSuccess; | 1499 | p->RunLength += (Int32)p->PrevSuccess; |
1491 | mc->Union2.SummFreq = (UInt16)(summFreq + 4); | 1500 | mc->Union2.SummFreq = (UInt16)(summFreq + 4); |
1492 | freq += 4; | 1501 | freq += 4; |
1493 | s->Freq = (Byte)freq; | 1502 | s->Freq = (Byte)freq; |
diff --git a/C/Ppmd8Dec.c b/C/Ppmd8Dec.c index 72d3626..ff91167 100644 --- a/C/Ppmd8Dec.c +++ b/C/Ppmd8Dec.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Ppmd8Dec.c -- Ppmd8 (PPMdI) Decoder | 1 | /* Ppmd8Dec.c -- Ppmd8 (PPMdI) Decoder |
2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
3 | This code is based on: | 3 | This code is based on: |
4 | PPMd var.I (2002): Dmitry Shkarin : Public domain | 4 | PPMd var.I (2002): Dmitry Shkarin : Public domain |
5 | Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ | 5 | Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ |
@@ -58,7 +58,7 @@ static void Ppmd8_RD_Decode(CPpmd8 *p, UInt32 start, UInt32 size) | |||
58 | #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) | 58 | #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) |
59 | void Ppmd8_UpdateModel(CPpmd8 *p); | 59 | void Ppmd8_UpdateModel(CPpmd8 *p); |
60 | 60 | ||
61 | #define MASK(sym) ((unsigned char *)charMask)[sym] | 61 | #define MASK(sym) ((Byte *)charMask)[sym] |
62 | 62 | ||
63 | 63 | ||
64 | int Ppmd8_DecodeSymbol(CPpmd8 *p) | 64 | int Ppmd8_DecodeSymbol(CPpmd8 *p) |
@@ -120,8 +120,8 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p) | |||
120 | MASK(s->Symbol) = 0; | 120 | MASK(s->Symbol) = 0; |
121 | do | 121 | do |
122 | { | 122 | { |
123 | unsigned sym0 = s2[0].Symbol; | 123 | const unsigned sym0 = s2[0].Symbol; |
124 | unsigned sym1 = s2[1].Symbol; | 124 | const unsigned sym1 = s2[1].Symbol; |
125 | s2 += 2; | 125 | s2 += 2; |
126 | MASK(sym0) = 0; | 126 | MASK(sym0) = 0; |
127 | MASK(sym1) = 0; | 127 | MASK(sym1) = 0; |
@@ -209,17 +209,17 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p) | |||
209 | unsigned num2 = num / 2; | 209 | unsigned num2 = num / 2; |
210 | 210 | ||
211 | num &= 1; | 211 | num &= 1; |
212 | hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); | 212 | hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num); |
213 | s += num; | 213 | s += num; |
214 | p->MinContext = mc; | 214 | p->MinContext = mc; |
215 | 215 | ||
216 | do | 216 | do |
217 | { | 217 | { |
218 | unsigned sym0 = s[0].Symbol; | 218 | const unsigned sym0 = s[0].Symbol; |
219 | unsigned sym1 = s[1].Symbol; | 219 | const unsigned sym1 = s[1].Symbol; |
220 | s += 2; | 220 | s += 2; |
221 | hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); | 221 | hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0))); |
222 | hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); | 222 | hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1))); |
223 | } | 223 | } |
224 | while (--num2); | 224 | while (--num2); |
225 | } | 225 | } |
@@ -243,8 +243,8 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p) | |||
243 | { | 243 | { |
244 | for (;;) | 244 | for (;;) |
245 | { | 245 | { |
246 | count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; | 246 | count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; |
247 | // count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; | 247 | // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; |
248 | } | 248 | } |
249 | } | 249 | } |
250 | s--; | 250 | s--; |
diff --git a/C/Ppmd8Enc.c b/C/Ppmd8Enc.c index 9e29ef7..b0e34c4 100644 --- a/C/Ppmd8Enc.c +++ b/C/Ppmd8Enc.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Ppmd8Enc.c -- Ppmd8 (PPMdI) Encoder | 1 | /* Ppmd8Enc.c -- Ppmd8 (PPMdI) Encoder |
2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
3 | This code is based on: | 3 | This code is based on: |
4 | PPMd var.I (2002): Dmitry Shkarin : Public domain | 4 | PPMd var.I (2002): Dmitry Shkarin : Public domain |
5 | Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ | 5 | Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ |
@@ -82,7 +82,7 @@ static void Ppmd8_RangeEnc_Encode(CPpmd8 *p, UInt32 start, UInt32 size, UInt32 t | |||
82 | 82 | ||
83 | void Ppmd8_UpdateModel(CPpmd8 *p); | 83 | void Ppmd8_UpdateModel(CPpmd8 *p); |
84 | 84 | ||
85 | #define MASK(sym) ((unsigned char *)charMask)[sym] | 85 | #define MASK(sym) ((Byte *)charMask)[sym] |
86 | 86 | ||
87 | // Z7_FORCE_INLINE | 87 | // Z7_FORCE_INLINE |
88 | // static | 88 | // static |
@@ -139,8 +139,8 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol) | |||
139 | MASK(s->Symbol) = 0; | 139 | MASK(s->Symbol) = 0; |
140 | do | 140 | do |
141 | { | 141 | { |
142 | unsigned sym0 = s2[0].Symbol; | 142 | const unsigned sym0 = s2[0].Symbol; |
143 | unsigned sym1 = s2[1].Symbol; | 143 | const unsigned sym1 = s2[1].Symbol; |
144 | s2 += 2; | 144 | s2 += 2; |
145 | MASK(sym0) = 0; | 145 | MASK(sym0) = 0; |
146 | MASK(sym1) = 0; | 146 | MASK(sym1) = 0; |
@@ -265,16 +265,15 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol) | |||
265 | if (num2 != 0) | 265 | if (num2 != 0) |
266 | { | 266 | { |
267 | s += i; | 267 | s += i; |
268 | for (;;) | 268 | do |
269 | { | 269 | { |
270 | unsigned sym0 = s[0].Symbol; | 270 | const unsigned sym0 = s[0].Symbol; |
271 | unsigned sym1 = s[1].Symbol; | 271 | const unsigned sym1 = s[1].Symbol; |
272 | s += 2; | 272 | s += 2; |
273 | sum += (s[-2].Freq & (unsigned)(MASK(sym0))); | 273 | sum += (s[-2].Freq & (unsigned)(MASK(sym0))); |
274 | sum += (s[-1].Freq & (unsigned)(MASK(sym1))); | 274 | sum += (s[-1].Freq & (unsigned)(MASK(sym1))); |
275 | if (--num2 == 0) | ||
276 | break; | ||
277 | } | 275 | } |
276 | while (--num2); | ||
278 | } | 277 | } |
279 | 278 | ||
280 | PPMD8_CORRECT_SUM_RANGE(p, sum) | 279 | PPMD8_CORRECT_SUM_RANGE(p, sum) |
diff --git a/C/Precomp.h b/C/Precomp.h index 69afb2f..7747fdd 100644 --- a/C/Precomp.h +++ b/C/Precomp.h | |||
@@ -1,10 +1,127 @@ | |||
1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- precompilation file |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-25 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | #ifndef ZIP7_INC_PRECOMP_H |
5 | #define ZIP7_INC_PRECOMP_H | 5 | #define ZIP7_INC_PRECOMP_H |
6 | 6 | ||
7 | /* | ||
8 | this file must be included before another *.h files and before <windows.h>. | ||
9 | this file is included from the following files: | ||
10 | C\*.c | ||
11 | C\Util\*\Precomp.h <- C\Util\*\*.c | ||
12 | CPP\Common\Common.h <- *\StdAfx.h <- *\*.cpp | ||
13 | |||
14 | this file can set the following macros: | ||
15 | Z7_LARGE_PAGES 1 | ||
16 | Z7_LONG_PATH 1 | ||
17 | Z7_WIN32_WINNT_MIN 0x0500 (or higher) : we require at least win2000+ for 7-Zip | ||
18 | _WIN32_WINNT 0x0500 (or higher) | ||
19 | WINVER _WIN32_WINNT | ||
20 | UNICODE 1 | ||
21 | _UNICODE 1 | ||
22 | */ | ||
23 | |||
7 | #include "Compiler.h" | 24 | #include "Compiler.h" |
8 | /* #include "7zTypes.h" */ | 25 | |
26 | #ifdef _MSC_VER | ||
27 | // #pragma warning(disable : 4206) // nonstandard extension used : translation unit is empty | ||
28 | #if _MSC_VER >= 1912 | ||
29 | // #pragma warning(disable : 5039) // pointer or reference to potentially throwing function passed to 'extern "C"' function under - EHc.Undefined behavior may occur if this function throws an exception. | ||
30 | #endif | ||
31 | #endif | ||
32 | |||
33 | /* | ||
34 | // for debug: | ||
35 | #define UNICODE 1 | ||
36 | #define _UNICODE 1 | ||
37 | #define _WIN32_WINNT 0x0500 // win2000 | ||
38 | #ifndef WINVER | ||
39 | #define WINVER _WIN32_WINNT | ||
40 | #endif | ||
41 | */ | ||
42 | |||
43 | #ifdef _WIN32 | ||
44 | /* | ||
45 | this "Precomp.h" file must be included before <windows.h>, | ||
46 | if we want to define _WIN32_WINNT before <windows.h>. | ||
47 | */ | ||
48 | |||
49 | #ifndef Z7_LARGE_PAGES | ||
50 | #ifndef Z7_NO_LARGE_PAGES | ||
51 | #define Z7_LARGE_PAGES 1 | ||
52 | #endif | ||
53 | #endif | ||
54 | |||
55 | #ifndef Z7_LONG_PATH | ||
56 | #ifndef Z7_NO_LONG_PATH | ||
57 | #define Z7_LONG_PATH 1 | ||
58 | #endif | ||
59 | #endif | ||
60 | |||
61 | #ifndef Z7_DEVICE_FILE | ||
62 | #ifndef Z7_NO_DEVICE_FILE | ||
63 | // #define Z7_DEVICE_FILE 1 | ||
64 | #endif | ||
65 | #endif | ||
66 | |||
67 | // we don't change macros if included after <windows.h> | ||
68 | #ifndef _WINDOWS_ | ||
69 | |||
70 | #ifndef Z7_WIN32_WINNT_MIN | ||
71 | #if defined(_M_ARM64) || defined(__aarch64__) | ||
72 | // #define Z7_WIN32_WINNT_MIN 0x0a00 // win10 | ||
73 | #define Z7_WIN32_WINNT_MIN 0x0600 // vista | ||
74 | #elif defined(_M_ARM) && defined(_M_ARMT) && defined(_M_ARM_NT) | ||
75 | // #define Z7_WIN32_WINNT_MIN 0x0602 // win8 | ||
76 | #define Z7_WIN32_WINNT_MIN 0x0600 // vista | ||
77 | #elif defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) || defined(_M_IA64) | ||
78 | #define Z7_WIN32_WINNT_MIN 0x0503 // win2003 | ||
79 | // #elif defined(_M_IX86) || defined(__i386__) | ||
80 | // #define Z7_WIN32_WINNT_MIN 0x0500 // win2000 | ||
81 | #else // x86 and another(old) systems | ||
82 | #define Z7_WIN32_WINNT_MIN 0x0500 // win2000 | ||
83 | // #define Z7_WIN32_WINNT_MIN 0x0502 // win2003 // for debug | ||
84 | #endif | ||
85 | #endif // Z7_WIN32_WINNT_MIN | ||
86 | |||
87 | |||
88 | #ifndef Z7_DO_NOT_DEFINE_WIN32_WINNT | ||
89 | #ifdef _WIN32_WINNT | ||
90 | // #error Stop_Compiling_Bad_WIN32_WINNT | ||
91 | #else | ||
92 | #ifndef Z7_NO_DEFINE_WIN32_WINNT | ||
93 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
94 | #define _WIN32_WINNT Z7_WIN32_WINNT_MIN | ||
95 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
96 | #endif | ||
97 | #endif // _WIN32_WINNT | ||
98 | |||
99 | #ifndef WINVER | ||
100 | #define WINVER _WIN32_WINNT | ||
101 | #endif | ||
102 | #endif // Z7_DO_NOT_DEFINE_WIN32_WINNT | ||
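Every guard in this chain is an #ifndef, so a build that needs a different floor can pin it before this header is reached. A hypothetical consumer translation unit (the value 0x0601 is illustrative, not taken from the hunk):

    /* hypothetical consumer: pin the minimum Windows target before Precomp.h */
    #define Z7_WIN32_WINNT_MIN 0x0601   /* win7; overrides the per-arch default */
    #include "Precomp.h"
    /* on _WIN32, if <windows.h> was not included first, the header now yields
       _WIN32_WINNT == 0x0601 and WINVER == _WIN32_WINNT */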
103 | |||
104 | |||
105 | #ifndef _MBCS | ||
106 | #ifndef Z7_NO_UNICODE | ||
107 | // UNICODE and _UNICODE are used by <windows.h> and by 7-zip code. | ||
108 | |||
109 | #ifndef UNICODE | ||
110 | #define UNICODE 1 | ||
111 | #endif | ||
112 | |||
113 | #ifndef _UNICODE | ||
114 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
115 | #define _UNICODE 1 | ||
116 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
117 | #endif | ||
118 | |||
119 | #endif // Z7_NO_UNICODE | ||
120 | #endif // _MBCS | ||
121 | #endif // _WINDOWS_ | ||
122 | |||
123 | // #include "7zWindows.h" | ||
124 | |||
125 | #endif // _WIN32 | ||
9 | 126 | ||
10 | #endif | 127 | #endif |
diff --git a/C/Sha1.c b/C/Sha1.c --- a/C/Sha1.c +++ b/C/Sha1.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Sha1.c -- SHA-1 Hash | 1 | /* Sha1.c -- SHA-1 Hash |
2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2024-03-01 : Igor Pavlov : Public domain |
3 | This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */ | 3 | This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */ |
4 | 4 | ||
5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
@@ -15,35 +15,35 @@ This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ l | |||
15 | #endif | 15 | #endif |
16 | 16 | ||
17 | #ifdef MY_CPU_X86_OR_AMD64 | 17 | #ifdef MY_CPU_X86_OR_AMD64 |
18 | #ifdef _MSC_VER | 18 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ |
19 | #if _MSC_VER >= 1200 | 19 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ |
20 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ | ||
21 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) \ | ||
22 | || defined(_MSC_VER) && (_MSC_VER >= 1200) | ||
20 | #define Z7_COMPILER_SHA1_SUPPORTED | 23 | #define Z7_COMPILER_SHA1_SUPPORTED |
21 | #endif | ||
22 | #elif defined(__clang__) | ||
23 | #if (__clang_major__ >= 8) // fix that check | ||
24 | #define Z7_COMPILER_SHA1_SUPPORTED | ||
25 | #endif | ||
26 | #elif defined(__GNUC__) | ||
27 | #if (__GNUC__ >= 8) // fix that check | ||
28 | #define Z7_COMPILER_SHA1_SUPPORTED | ||
29 | #endif | ||
30 | #elif defined(__INTEL_COMPILER) | ||
31 | #if (__INTEL_COMPILER >= 1800) // fix that check | ||
32 | #define Z7_COMPILER_SHA1_SUPPORTED | ||
33 | #endif | ||
34 | #endif | 24 | #endif |
35 | #elif defined(MY_CPU_ARM_OR_ARM64) | 25 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) \ |
36 | #ifdef _MSC_VER | 26 | && (!defined(Z7_MSC_VER_ORIGINAL) || (_MSC_VER >= 1929) && (_MSC_FULL_VER >= 192930037)) |
37 | #if _MSC_VER >= 1910 && _MSC_VER >= 1929 && _MSC_FULL_VER >= 192930037 | 27 | #if defined(__ARM_FEATURE_SHA2) \ |
28 | || defined(__ARM_FEATURE_CRYPTO) | ||
29 | #define Z7_COMPILER_SHA1_SUPPORTED | ||
30 | #else | ||
31 | #if defined(MY_CPU_ARM64) \ | ||
32 | || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ | ||
33 | || defined(Z7_MSC_VER_ORIGINAL) | ||
34 | #if defined(__ARM_FP) && \ | ||
35 | ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
36 | || defined(__GNUC__) && (__GNUC__ >= 6) \ | ||
37 | ) \ | ||
38 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) | ||
39 | #if defined(MY_CPU_ARM64) \ | ||
40 | || !defined(Z7_CLANG_VERSION) \ | ||
41 | || defined(__ARM_NEON) && \ | ||
42 | (Z7_CLANG_VERSION < 170000 || \ | ||
43 | Z7_CLANG_VERSION > 170001) | ||
38 | #define Z7_COMPILER_SHA1_SUPPORTED | 44 | #define Z7_COMPILER_SHA1_SUPPORTED |
39 | #endif | 45 | #endif |
40 | #elif defined(__clang__) | ||
41 | #if (__clang_major__ >= 8) // fix that check | ||
42 | #define Z7_COMPILER_SHA1_SUPPORTED | ||
43 | #endif | 46 | #endif |
44 | #elif defined(__GNUC__) | ||
45 | #if (__GNUC__ >= 6) // fix that check | ||
46 | #define Z7_COMPILER_SHA1_SUPPORTED | ||
47 | #endif | 47 | #endif |
48 | #endif | 48 | #endif |
49 | #endif | 49 | #endif |
@@ -436,7 +436,7 @@ void Sha1Prepare(void) | |||
436 | #endif | 436 | #endif |
437 | { | 437 | { |
438 | // printf("\n========== HW SHA1 ======== \n"); | 438 | // printf("\n========== HW SHA1 ======== \n"); |
439 | #if defined(MY_CPU_ARM_OR_ARM64) && defined(_MSC_VER) | 439 | #if 0 && defined(MY_CPU_ARM_OR_ARM64) && defined(_MSC_VER) |
440 | /* there was bug in MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037). | 440 | /* there was bug in MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037). |
441 | It generated incorrect SHA-1 code. | 441 | It generated incorrect SHA-1 code. |
442 | 21.03 : we test sha1-hardware code at runtime initialization */ | 442 | 21.03 : we test sha1-hardware code at runtime initialization */ |
diff --git a/C/Sha1Opt.c b/C/Sha1Opt.c index 27796aa..4e835f1 100644 --- a/C/Sha1Opt.c +++ b/C/Sha1Opt.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions | 1 | /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | #include "Compiler.h" | 5 | #include "Compiler.h" |
@@ -11,6 +11,8 @@ | |||
11 | #endif | 11 | #endif |
12 | #endif | 12 | #endif |
13 | 13 | ||
14 | // #define Z7_USE_HW_SHA_STUB // for debug | ||
15 | |||
14 | #ifdef MY_CPU_X86_OR_AMD64 | 16 | #ifdef MY_CPU_X86_OR_AMD64 |
15 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check | 17 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check |
16 | #define USE_HW_SHA | 18 | #define USE_HW_SHA |
@@ -32,9 +34,14 @@ | |||
32 | #endif | 34 | #endif |
33 | #if (_MSC_VER >= USE_VER_MIN) | 35 | #if (_MSC_VER >= USE_VER_MIN) |
34 | #define USE_HW_SHA | 36 | #define USE_HW_SHA |
37 | #else | ||
38 | #define Z7_USE_HW_SHA_STUB | ||
35 | #endif | 39 | #endif |
36 | #endif | 40 | #endif |
37 | // #endif // MY_CPU_X86_OR_AMD64 | 41 | // #endif // MY_CPU_X86_OR_AMD64 |
42 | #ifndef USE_HW_SHA | ||
43 | // #define Z7_USE_HW_SHA_STUB // for debug | ||
44 | #endif | ||
38 | 45 | ||
39 | #ifdef USE_HW_SHA | 46 | #ifdef USE_HW_SHA |
40 | 47 | ||
@@ -202,46 +209,124 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
202 | 209 | ||
203 | #endif // USE_HW_SHA | 210 | #endif // USE_HW_SHA |
204 | 211 | ||
205 | #elif defined(MY_CPU_ARM_OR_ARM64) | 212 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) \ |
206 | 213 | && (!defined(Z7_MSC_VER_ORIGINAL) || (_MSC_VER >= 1929) && (_MSC_FULL_VER >= 192930037)) | |
207 | #if defined(__clang__) | 214 | #if defined(__ARM_FEATURE_SHA2) \ |
208 | #if (__clang_major__ >= 8) // fix that check | 215 | || defined(__ARM_FEATURE_CRYPTO) |
216 | #define USE_HW_SHA | ||
217 | #else | ||
218 | #if defined(MY_CPU_ARM64) \ | ||
219 | || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ | ||
220 | || defined(Z7_MSC_VER_ORIGINAL) | ||
221 | #if defined(__ARM_FP) && \ | ||
222 | ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
223 | || defined(__GNUC__) && (__GNUC__ >= 6) \ | ||
224 | ) \ | ||
225 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) | ||
226 | #if defined(MY_CPU_ARM64) \ | ||
227 | || !defined(Z7_CLANG_VERSION) \ | ||
228 | || defined(__ARM_NEON) && \ | ||
229 | (Z7_CLANG_VERSION < 170000 || \ | ||
230 | Z7_CLANG_VERSION > 170001) | ||
209 | #define USE_HW_SHA | 231 | #define USE_HW_SHA |
210 | #endif | 232 | #endif |
211 | #elif defined(__GNUC__) | ||
212 | #if (__GNUC__ >= 6) // fix that check | ||
213 | #define USE_HW_SHA | ||
214 | #endif | 233 | #endif |
215 | #elif defined(_MSC_VER) | ||
216 | #if _MSC_VER >= 1910 | ||
217 | #define USE_HW_SHA | ||
218 | #endif | 234 | #endif |
219 | #endif | 235 | #endif |
220 | 236 | ||
221 | #ifdef USE_HW_SHA | 237 | #ifdef USE_HW_SHA |
222 | 238 | ||
223 | // #pragma message("=== Sha1 HW === ") | 239 | // #pragma message("=== Sha1 HW === ") |
240 | // __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_SHA2 | ||
224 | 241 | ||
225 | #if defined(__clang__) || defined(__GNUC__) | 242 | #if defined(__clang__) || defined(__GNUC__) |
243 | #if !defined(__ARM_FEATURE_SHA2) && \ | ||
244 | !defined(__ARM_FEATURE_CRYPTO) | ||
226 | #ifdef MY_CPU_ARM64 | 245 | #ifdef MY_CPU_ARM64 |
246 | #if defined(__clang__) | ||
247 | #define ATTRIB_SHA __attribute__((__target__("crypto"))) | ||
248 | #else | ||
227 | #define ATTRIB_SHA __attribute__((__target__("+crypto"))) | 249 | #define ATTRIB_SHA __attribute__((__target__("+crypto"))) |
250 | #endif | ||
228 | #else | 251 | #else |
252 | #if defined(__clang__) && (__clang_major__ >= 1) | ||
253 | #define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2"))) | ||
254 | #else | ||
229 | #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | 255 | #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) |
256 | #endif | ||
230 | #endif | 257 | #endif |
258 | #endif | ||
231 | #else | 259 | #else |
232 | // _MSC_VER | 260 | // _MSC_VER |
233 | // for arm32 | 261 | // for arm32 |
234 | #define _ARM_USE_NEW_NEON_INTRINSICS | 262 | #define _ARM_USE_NEW_NEON_INTRINSICS |
235 | #endif | 263 | #endif |
236 | 264 | ||
237 | #if defined(_MSC_VER) && defined(MY_CPU_ARM64) | 265 | |
266 | |||
267 | |||
268 | |||
269 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) | ||
238 | #include <arm64_neon.h> | 270 | #include <arm64_neon.h> |
239 | #else | 271 | #else |
272 | |||
273 | |||
274 | |||
275 | |||
276 | |||
277 | |||
278 | |||
279 | |||
280 | |||
281 | #if defined(__clang__) && __clang_major__ < 16 | ||
282 | #if !defined(__ARM_FEATURE_SHA2) && \ | ||
283 | !defined(__ARM_FEATURE_CRYPTO) | ||
284 | // #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ") | ||
285 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
286 | #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1 | ||
287 | // #if defined(__clang__) && __clang_major__ < 13 | ||
288 | #define __ARM_FEATURE_CRYPTO 1 | ||
289 | // #else | ||
290 | #define __ARM_FEATURE_SHA2 1 | ||
291 | // #endif | ||
292 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
293 | #endif | ||
294 | #endif // clang | ||
295 | |||
296 | #if defined(__clang__) | ||
297 | |||
298 | #if defined(__ARM_ARCH) && __ARM_ARCH < 8 | ||
299 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
300 | // #pragma message("#define __ARM_ARCH 8") | ||
301 | #undef __ARM_ARCH | ||
302 | #define __ARM_ARCH 8 | ||
303 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
304 | #endif | ||
305 | |||
306 | #endif // clang | ||
307 | |||
240 | #include <arm_neon.h> | 308 | #include <arm_neon.h> |
309 | |||
310 | #if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \ | ||
311 | defined(__ARM_FEATURE_CRYPTO) && \ | ||
312 | defined(__ARM_FEATURE_SHA2) | ||
313 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
314 | #undef __ARM_FEATURE_CRYPTO | ||
315 | #undef __ARM_FEATURE_SHA2 | ||
316 | #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET | ||
317 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
318 | // #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") | ||
241 | #endif | 319 | #endif |
242 | 320 | ||
321 | #endif // Z7_MSC_VER_ORIGINAL | ||
322 | |||
243 | typedef uint32x4_t v128; | 323 | typedef uint32x4_t v128; |
244 | // typedef __n128 v128; // MSVC | 324 | // typedef __n128 v128; // MSVC |
325 | // the bug in clang 3.8.1: | ||
326 | // __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1); | ||
327 | #if defined(__clang__) && (__clang_major__ <= 9) | ||
328 | #pragma GCC diagnostic ignored "-Wvector-conversion" | ||
329 | #endif | ||
329 | #endif | ||
245 | 330 | ||
246 | #ifdef MY_CPU_BE | 331 | #ifdef MY_CPU_BE |
247 | #define MY_rev32_for_LE(x) | 332 | #define MY_rev32_for_LE(x) |
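ATTRIB_SHA lets this one translation unit compile the SHA-1 intrinsics for a higher target than the rest of the build; runtime CPU detection (as in Sha1.c's Sha1Prepare) still decides whether the function may be called. Reduced to a hedged stand-alone sketch for AArch64 GCC/clang, mirroring the attribute choice above (file and function names are hypothetical; older clang may additionally need the __ARM_FEATURE_CRYPTO workaround from this hunk):

    /* sha1_round.c - a single hardware SHA-1 round behind a target attribute */
    #include <arm_neon.h>
    #if defined(__clang__)
    #define ATTRIB_SHA __attribute__((__target__("crypto")))
    #else
    #define ATTRIB_SHA __attribute__((__target__("+crypto")))
    #endif
    ATTRIB_SHA
    uint32x4_t one_sha1_c_round(uint32x4_t abcd, uint32_t e, uint32x4_t wk)
    {
        return vsha1cq_u32(abcd, e, wk);   /* SHA1C: the "choose" round */
    }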
@@ -256,11 +341,11 @@ typedef uint32x4_t v128; | |||
256 | m = LOAD_128((data + (k) * 16)); \ | 341 | m = LOAD_128((data + (k) * 16)); \ |
257 | MY_rev32_for_LE(m); \ | 342 | MY_rev32_for_LE(m); \ |
258 | 343 | ||
259 | #define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3); | 344 | #define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3) |
260 | #define SU1(dest, src) dest = vsha1su1q_u32(dest, src); | 345 | #define SU1(dest, src) dest = vsha1su1q_u32(dest, src) |
261 | #define C(e) abcd = vsha1cq_u32(abcd, e, t); | 346 | #define C(e) abcd = vsha1cq_u32(abcd, e, t) |
262 | #define P(e) abcd = vsha1pq_u32(abcd, e, t); | 347 | #define P(e) abcd = vsha1pq_u32(abcd, e, t) |
263 | #define M(e) abcd = vsha1mq_u32(abcd, e, t); | 348 | #define M(e) abcd = vsha1mq_u32(abcd, e, t) |
264 | #define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) | 349 | #define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) |
265 | #define T(m, c) t = vaddq_u32(m, c) | 350 | #define T(m, c) t = vaddq_u32(m, c) |
266 | 351 | ||
@@ -337,16 +422,17 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
337 | #endif // MY_CPU_ARM_OR_ARM64 | 422 | #endif // MY_CPU_ARM_OR_ARM64 |
338 | 423 | ||
339 | 424 | ||
340 | #ifndef USE_HW_SHA | 425 | #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) |
341 | |||
342 | // #error Stop_Compiling_UNSUPPORTED_SHA | 426 | // #error Stop_Compiling_UNSUPPORTED_SHA |
343 | // #include <stdlib.h> | 427 | // #include <stdlib.h> |
344 | 428 | ||
345 | // #include "Sha1.h" | ||
346 | void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks); | ||
347 | 429 | ||
348 | #pragma message("Sha1 HW-SW stub was used") | ||
349 | 430 | ||
431 | // #include "Sha1.h" | ||
432 | // #if defined(_MSC_VER) | ||
433 | #pragma message("Sha1 HW-SW stub was used") | ||
434 | // #endif | ||
435 | void Z7_FASTCALL Sha1_UpdateBlocks (UInt32 state[5], const Byte *data, size_t numBlocks); | ||
350 | void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks); | 436 | void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks); |
351 | void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks) | 437 | void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks) |
352 | { | 438 | { |
@@ -359,7 +445,6 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
359 | return; | 445 | return; |
360 | */ | 446 | */ |
361 | } | 447 | } |
362 | |||
363 | #endif | 448 | #endif |
364 | 449 | ||
365 | #undef SU0 | 450 | #undef SU0 |
@@ -384,3 +469,4 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
384 | #undef USE_HW_SHA | 469 | #undef USE_HW_SHA |
385 | #undef ATTRIB_SHA | 470 | #undef ATTRIB_SHA |
386 | #undef USE_VER_MIN | 471 | #undef USE_VER_MIN |
472 | #undef Z7_USE_HW_SHA_STUB | ||
diff --git a/C/Sha256.c b/C/Sha256.c --- a/C/Sha256.c +++ b/C/Sha256.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Sha256.c -- SHA-256 Hash | 1 | /* Sha256.c -- SHA-256 Hash |
2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2024-03-01 : Igor Pavlov : Public domain |
3 | This code is based on public domain code from Wei Dai's Crypto++ library. */ | 3 | This code is based on public domain code from Wei Dai's Crypto++ library. */ |
4 | 4 | ||
5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
@@ -15,35 +15,35 @@ This code is based on public domain code from Wei Dai's Crypto++ library. */ | |||
15 | #endif | 15 | #endif |
16 | 16 | ||
17 | #ifdef MY_CPU_X86_OR_AMD64 | 17 | #ifdef MY_CPU_X86_OR_AMD64 |
18 | #ifdef _MSC_VER | 18 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ |
19 | #if _MSC_VER >= 1200 | 19 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ |
20 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ | ||
21 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) \ | ||
22 | || defined(_MSC_VER) && (_MSC_VER >= 1200) | ||
20 | #define Z7_COMPILER_SHA256_SUPPORTED | 23 | #define Z7_COMPILER_SHA256_SUPPORTED |
21 | #endif | ||
22 | #elif defined(__clang__) | ||
23 | #if (__clang_major__ >= 8) // fix that check | ||
24 | #define Z7_COMPILER_SHA256_SUPPORTED | ||
25 | #endif | ||
26 | #elif defined(__GNUC__) | ||
27 | #if (__GNUC__ >= 8) // fix that check | ||
28 | #define Z7_COMPILER_SHA256_SUPPORTED | ||
29 | #endif | ||
30 | #elif defined(__INTEL_COMPILER) | ||
31 | #if (__INTEL_COMPILER >= 1800) // fix that check | ||
32 | #define Z7_COMPILER_SHA256_SUPPORTED | ||
33 | #endif | ||
34 | #endif | 24 | #endif |
35 | #elif defined(MY_CPU_ARM_OR_ARM64) | 25 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) |
36 | #ifdef _MSC_VER | 26 | |
37 | #if _MSC_VER >= 1910 | 27 | #if defined(__ARM_FEATURE_SHA2) \ |
28 | || defined(__ARM_FEATURE_CRYPTO) | ||
29 | #define Z7_COMPILER_SHA256_SUPPORTED | ||
30 | #else | ||
31 | #if defined(MY_CPU_ARM64) \ | ||
32 | || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ | ||
33 | || defined(Z7_MSC_VER_ORIGINAL) | ||
34 | #if defined(__ARM_FP) && \ | ||
35 | ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
36 | || defined(__GNUC__) && (__GNUC__ >= 6) \ | ||
37 | ) \ | ||
38 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) | ||
39 | #if defined(MY_CPU_ARM64) \ | ||
40 | || !defined(Z7_CLANG_VERSION) \ | ||
41 | || defined(__ARM_NEON) && \ | ||
42 | (Z7_CLANG_VERSION < 170000 || \ | ||
43 | Z7_CLANG_VERSION > 170001) | ||
38 | #define Z7_COMPILER_SHA256_SUPPORTED | 44 | #define Z7_COMPILER_SHA256_SUPPORTED |
39 | #endif | 45 | #endif |
40 | #elif defined(__clang__) | ||
41 | #if (__clang_major__ >= 8) // fix that check | ||
42 | #define Z7_COMPILER_SHA256_SUPPORTED | ||
43 | #endif | 46 | #endif |
44 | #elif defined(__GNUC__) | ||
45 | #if (__GNUC__ >= 6) // fix that check | ||
46 | #define Z7_COMPILER_SHA256_SUPPORTED | ||
47 | #endif | 47 | #endif |
48 | #endif | 48 | #endif |
49 | #endif | 49 | #endif |
@@ -224,8 +224,6 @@ void Sha256_Init(CSha256 *p) | |||
224 | 224 | ||
225 | #endif | 225 | #endif |
226 | 226 | ||
227 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); | ||
228 | |||
229 | // static | 227 | // static |
230 | extern MY_ALIGN(64) | 228 | extern MY_ALIGN(64) |
231 | const UInt32 SHA256_K_ARRAY[64]; | 229 | const UInt32 SHA256_K_ARRAY[64]; |
diff --git a/C/Sha256Opt.c b/C/Sha256Opt.c index e4465e3..eb38166 100644 --- a/C/Sha256Opt.c +++ b/C/Sha256Opt.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions | 1 | /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | #include "Compiler.h" | 5 | #include "Compiler.h" |
@@ -11,6 +11,8 @@ | |||
11 | #endif | 11 | #endif |
12 | #endif | 12 | #endif |
13 | 13 | ||
14 | // #define Z7_USE_HW_SHA_STUB // for debug | ||
15 | |||
14 | #ifdef MY_CPU_X86_OR_AMD64 | 16 | #ifdef MY_CPU_X86_OR_AMD64 |
15 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check | 17 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check |
16 | #define USE_HW_SHA | 18 | #define USE_HW_SHA |
@@ -32,9 +34,14 @@ | |||
32 | #endif | 34 | #endif |
33 | #if (_MSC_VER >= USE_VER_MIN) | 35 | #if (_MSC_VER >= USE_VER_MIN) |
34 | #define USE_HW_SHA | 36 | #define USE_HW_SHA |
37 | #else | ||
38 | #define Z7_USE_HW_SHA_STUB | ||
35 | #endif | 39 | #endif |
36 | #endif | 40 | #endif |
37 | // #endif // MY_CPU_X86_OR_AMD64 | 41 | // #endif // MY_CPU_X86_OR_AMD64 |
42 | #ifndef USE_HW_SHA | ||
43 | // #define Z7_USE_HW_SHA_STUB // for debug | ||
44 | #endif | ||
38 | 45 | ||
39 | #ifdef USE_HW_SHA | 46 | #ifdef USE_HW_SHA |
40 | 47 | ||
@@ -202,19 +209,28 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
202 | 209 | ||
203 | #endif // USE_HW_SHA | 210 | #endif // USE_HW_SHA |
204 | 211 | ||
205 | #elif defined(MY_CPU_ARM_OR_ARM64) | 212 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) |
206 | 213 | ||
207 | #if defined(__clang__) | 214 | #if defined(__ARM_FEATURE_SHA2) \ |
208 | #if (__clang_major__ >= 8) // fix that check | 215 | || defined(__ARM_FEATURE_CRYPTO) |
216 | #define USE_HW_SHA | ||
217 | #else | ||
218 | #if defined(MY_CPU_ARM64) \ | ||
219 | || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ | ||
220 | || defined(Z7_MSC_VER_ORIGINAL) | ||
221 | #if defined(__ARM_FP) && \ | ||
222 | ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
223 | || defined(__GNUC__) && (__GNUC__ >= 6) \ | ||
224 | ) \ | ||
225 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) | ||
226 | #if defined(MY_CPU_ARM64) \ | ||
227 | || !defined(Z7_CLANG_VERSION) \ | ||
228 | || defined(__ARM_NEON) && \ | ||
229 | (Z7_CLANG_VERSION < 170000 || \ | ||
230 | Z7_CLANG_VERSION > 170001) | ||
209 | #define USE_HW_SHA | 231 | #define USE_HW_SHA |
210 | #endif | 232 | #endif |
211 | #elif defined(__GNUC__) | ||
212 | #if (__GNUC__ >= 6) // fix that check | ||
213 | #define USE_HW_SHA | ||
214 | #endif | 233 | #endif |
215 | #elif defined(_MSC_VER) | ||
216 | #if _MSC_VER >= 1910 | ||
217 | #define USE_HW_SHA | ||
218 | #endif | 234 | #endif |
219 | #endif | 235 | #endif |
220 | 236 | ||
@@ -222,24 +238,88 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
222 | 238 | ||
223 | // #pragma message("=== Sha256 HW === ") | 239 | // #pragma message("=== Sha256 HW === ") |
224 | 240 | ||
241 | |||
225 | #if defined(__clang__) || defined(__GNUC__) | 242 | #if defined(__clang__) || defined(__GNUC__) |
243 | #if !defined(__ARM_FEATURE_SHA2) && \ | ||
244 | !defined(__ARM_FEATURE_CRYPTO) | ||
226 | #ifdef MY_CPU_ARM64 | 245 | #ifdef MY_CPU_ARM64 |
246 | #if defined(__clang__) | ||
247 | #define ATTRIB_SHA __attribute__((__target__("crypto"))) | ||
248 | #else | ||
227 | #define ATTRIB_SHA __attribute__((__target__("+crypto"))) | 249 | #define ATTRIB_SHA __attribute__((__target__("+crypto"))) |
250 | #endif | ||
228 | #else | 251 | #else |
252 | #if defined(__clang__) && (__clang_major__ >= 1) | ||
253 | #define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2"))) | ||
254 | #else | ||
229 | #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | 255 | #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) |
256 | #endif | ||
230 | #endif | 257 | #endif |
258 | #endif | ||
231 | #else | 259 | #else |
232 | // _MSC_VER | 260 | // _MSC_VER |
233 | // for arm32 | 261 | // for arm32 |
234 | #define _ARM_USE_NEW_NEON_INTRINSICS | 262 | #define _ARM_USE_NEW_NEON_INTRINSICS |
235 | #endif | 263 | #endif |
236 | 264 | ||
237 | #if defined(_MSC_VER) && defined(MY_CPU_ARM64) | 265 | |
266 | |||
267 | |||
268 | |||
269 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) | ||
238 | #include <arm64_neon.h> | 270 | #include <arm64_neon.h> |
239 | #else | 271 | #else |
272 | |||
273 | |||
274 | |||
275 | |||
276 | |||
277 | |||
278 | |||
279 | |||
280 | |||
281 | #if defined(__clang__) && __clang_major__ < 16 | ||
282 | #if !defined(__ARM_FEATURE_SHA2) && \ | ||
283 | !defined(__ARM_FEATURE_CRYPTO) | ||
284 | // #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ") | ||
285 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
286 | #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1 | ||
287 | // #if defined(__clang__) && __clang_major__ < 13 | ||
288 | #define __ARM_FEATURE_CRYPTO 1 | ||
289 | // #else | ||
290 | #define __ARM_FEATURE_SHA2 1 | ||
291 | // #endif | ||
292 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
293 | #endif | ||
294 | #endif // clang | ||
295 | |||
296 | #if defined(__clang__) | ||
297 | |||
298 | #if defined(__ARM_ARCH) && __ARM_ARCH < 8 | ||
299 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
300 | // #pragma message("#define __ARM_ARCH 8") | ||
301 | #undef __ARM_ARCH | ||
302 | #define __ARM_ARCH 8 | ||
303 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
304 | #endif | ||
305 | |||
306 | #endif // clang | ||
307 | |||
240 | #include <arm_neon.h> | 308 | #include <arm_neon.h> |
309 | |||
310 | #if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \ | ||
311 | defined(__ARM_FEATURE_CRYPTO) && \ | ||
312 | defined(__ARM_FEATURE_SHA2) | ||
313 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
314 | #undef __ARM_FEATURE_CRYPTO | ||
315 | #undef __ARM_FEATURE_SHA2 | ||
316 | #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET | ||
317 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
318 | // #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") | ||
241 | #endif | 319 | #endif |
242 | 320 | ||
321 | #endif // Z7_MSC_VER_ORIGINAL | ||
322 | |||
243 | typedef uint32x4_t v128; | 323 | typedef uint32x4_t v128; |
244 | // typedef __n128 v128; // MSVC | 324 | // typedef __n128 v128; // MSVC |
245 | 325 | ||
@@ -316,10 +396,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
316 | LOAD_SHUFFLE (m2, 2) | 396 | LOAD_SHUFFLE (m2, 2) |
317 | LOAD_SHUFFLE (m3, 3) | 397 | LOAD_SHUFFLE (m3, 3) |
318 | 398 | ||
319 | R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ); | 399 | R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ) |
320 | R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); | 400 | R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) |
321 | R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); | 401 | R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) |
322 | R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ); | 402 | R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ) |
323 | 403 | ||
324 | state0 = vaddq_u32(state0, state0_save); | 404 | state0 = vaddq_u32(state0, state0_save); |
325 | state1 = vaddq_u32(state1, state1_save); | 405 | state1 = vaddq_u32(state1, state1_save); |
@@ -337,16 +417,17 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
337 | #endif // MY_CPU_ARM_OR_ARM64 | 417 | #endif // MY_CPU_ARM_OR_ARM64 |
338 | 418 | ||
339 | 419 | ||
340 | #ifndef USE_HW_SHA | 420 | #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) |
341 | |||
342 | // #error Stop_Compiling_UNSUPPORTED_SHA | 421 | // #error Stop_Compiling_UNSUPPORTED_SHA |
343 | // #include <stdlib.h> | 422 | // #include <stdlib.h> |
344 | 423 | // We can compile this file with another C compiler, | |
424 | // or we can compile asm version. | ||
425 | // So we can generate real code instead of this stub function. | ||
345 | // #include "Sha256.h" | 426 | // #include "Sha256.h" |
346 | void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); | 427 | // #if defined(_MSC_VER) |
347 | |||
348 | #pragma message("Sha256 HW-SW stub was used") | 428 | #pragma message("Sha256 HW-SW stub was used") |
349 | 429 | // #endif | |
430 | void Z7_FASTCALL Sha256_UpdateBlocks (UInt32 state[8], const Byte *data, size_t numBlocks); | ||
350 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); | 431 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); |
351 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) | 432 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) |
352 | { | 433 | { |
@@ -359,7 +440,6 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
359 | return; | 440 | return; |
360 | */ | 441 | */ |
361 | } | 442 | } |
362 | |||
363 | #endif | 443 | #endif |
364 | 444 | ||
365 | 445 | ||
@@ -384,3 +464,4 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
384 | #undef USE_HW_SHA | 464 | #undef USE_HW_SHA |
385 | #undef ATTRIB_SHA | 465 | #undef ATTRIB_SHA |
386 | #undef USE_VER_MIN | 466 | #undef USE_VER_MIN |
467 | #undef Z7_USE_HW_SHA_STUB | ||
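The Sha256.c hunks above converge on one goal: define USE_HW_SHA only when the compiler can really emit ARM SHA-256 instructions, and pick the right __attribute__((target)) spelling per compiler (clang wants "crypto", GCC wants "+crypto" on arm64). A minimal sketch of the same detection pattern, assuming GCC or Clang on AArch64; the DEMO_ names are illustrative and not from the patch:

#if defined(__aarch64__) \
    && (defined(__ARM_FEATURE_SHA2) || defined(__ARM_FEATURE_CRYPTO))
  // the compiler already targets crypto: no attribute needed
  #define DEMO_USE_HW_SHA
  #define DEMO_ATTRIB_SHA
#elif defined(__aarch64__) && (defined(__clang__) || defined(__GNUC__))
  // baseline build: enable the feature for one function only
  #define DEMO_USE_HW_SHA
  #if defined(__clang__)
    #define DEMO_ATTRIB_SHA __attribute__((__target__("crypto")))
  #else
    #define DEMO_ATTRIB_SHA __attribute__((__target__("+crypto")))
  #endif
#endif

#ifdef DEMO_USE_HW_SHA
#include <arm_neon.h>
// one SHA-256 message-schedule step, to show the intrinsic compiles
DEMO_ATTRIB_SHA
static uint32x4_t Demo_Sha256Su0(uint32x4_t w0, uint32x4_t w1)
{
  return vsha256su0q_u32(w0, w1);
}
#endif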
diff --git a/C/SwapBytes.c b/C/SwapBytes.c index 7901bba..9290592 100644 --- a/C/SwapBytes.c +++ b/C/SwapBytes.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* SwapBytes.c -- Byte Swap conversion filter | 1 | /* SwapBytes.c -- Byte Swap conversion filter |
2 | 2023-04-07 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -305,11 +305,12 @@ ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr) | |||
305 | msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) to vbroadcastf128(m) as we want | 305 | msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) to vbroadcastf128(m) as we want |
306 | */ | 306 | */ |
307 | // _mm256_broadcastsi128_si256(*mask128_ptr); | 307 | // _mm256_broadcastsi128_si256(*mask128_ptr); |
308 | /* | 308 | #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000) |
309 | #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1) | 309 | #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1) |
310 | MY_mm256_set_m128i | 310 | #else |
311 | */ | 311 | #define MY_mm256_set_m128i _mm256_set_m128i |
312 | _mm256_set_m128i( | 312 | #endif |
313 | MY_mm256_set_m128i( | ||
313 | *(const __m128i *)mask128_ptr, | 314 | *(const __m128i *)mask128_ptr, |
314 | *(const __m128i *)mask128_ptr); | 315 | *(const __m128i *)mask128_ptr); |
315 | #endif | 316 | #endif |
@@ -330,32 +331,59 @@ ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr) | |||
330 | 331 | ||
331 | 332 | ||
332 | // compile message "NEON intrinsics not available with the soft-float ABI" | 333 | // compile message "NEON intrinsics not available with the soft-float ABI" |
333 | #elif defined(MY_CPU_ARM_OR_ARM64) || \ | 334 | #elif defined(MY_CPU_ARM_OR_ARM64) \ |
334 | (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) | 335 | && defined(MY_CPU_LE) \ |
335 | // #elif defined(MY_CPU_ARM64) | 336 | && !defined(Z7_DISABLE_ARM_NEON) |
336 | 337 | ||
337 | #if defined(__clang__) && (__clang_major__ >= 8) \ | 338 | #if defined(__clang__) && (__clang_major__ >= 8) \ |
338 | || defined(__GNUC__) && (__GNUC__ >= 8) | 339 | || defined(__GNUC__) && (__GNUC__ >= 6) |
339 | #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) \ | 340 | #if defined(__ARM_FP) |
341 | #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 4)) \ | ||
340 | || defined(MY_CPU_ARM64) | 342 | || defined(MY_CPU_ARM64) |
343 | #if defined(MY_CPU_ARM64) \ | ||
344 | || !defined(Z7_CLANG_VERSION) \ | ||
345 | || defined(__ARM_NEON) | ||
341 | #define USE_SWAP_128 | 346 | #define USE_SWAP_128 |
342 | #endif | ||
343 | #ifdef MY_CPU_ARM64 | 347 | #ifdef MY_CPU_ARM64 |
344 | // #define SWAP_ATTRIB_NEON __attribute__((__target__(""))) | 348 | // #define SWAP_ATTRIB_NEON __attribute__((__target__(""))) |
345 | #else | 349 | #else |
346 | // #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | 350 | #if defined(Z7_CLANG_VERSION) |
347 | #endif | 351 | // #define SWAP_ATTRIB_NEON __attribute__((__target__("neon"))) |
352 | #else | ||
353 | // #pragma message("SWAP_ATTRIB_NEON __attribute__((__target__(fpu=neon))") | ||
354 | #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=neon"))) | ||
355 | #endif | ||
356 | #endif // MY_CPU_ARM64 | ||
357 | #endif // __ARM_NEON | ||
358 | #endif // __ARM_ARCH | ||
359 | #endif // __ARM_FP | ||
360 | |||
348 | #elif defined(_MSC_VER) | 361 | #elif defined(_MSC_VER) |
349 | #if (_MSC_VER >= 1910) | 362 | #if (_MSC_VER >= 1910) |
350 | #define USE_SWAP_128 | 363 | #define USE_SWAP_128 |
351 | #endif | 364 | #endif |
352 | #endif | 365 | #endif |
353 | 366 | ||
354 | #if defined(_MSC_VER) && defined(MY_CPU_ARM64) | 367 | #ifdef USE_SWAP_128 |
368 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) | ||
355 | #include <arm64_neon.h> | 369 | #include <arm64_neon.h> |
356 | #else | 370 | #else |
371 | |||
372 | /* | ||
373 | #if !defined(__ARM_NEON) | ||
374 | #if defined(Z7_GCC_VERSION) && (__GNUC__ < 5) \ | ||
375 | || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 90201) \ | ||
376 | || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 100100) | ||
377 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
378 | #pragma message("#define __ARM_NEON 1") | ||
379 | // #define __ARM_NEON 1 | ||
380 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
381 | #endif | ||
382 | #endif | ||
383 | */ | ||
357 | #include <arm_neon.h> | 384 | #include <arm_neon.h> |
358 | #endif | 385 | #endif |
386 | #endif | ||
359 | 387 | ||
360 | #ifndef USE_SWAP_128 | 388 | #ifndef USE_SWAP_128 |
361 | #define FORCE_SWAP_MODE | 389 | #define FORCE_SWAP_MODE |
@@ -464,6 +492,13 @@ Z7_ATTRIB_NO_VECTOR \ | |||
464 | void Z7_FASTCALL | 492 | void Z7_FASTCALL |
465 | 493 | ||
466 | 494 | ||
495 | #if defined(MY_CPU_ARM_OR_ARM64) | ||
496 | #if defined(__clang__) | ||
497 | #pragma GCC diagnostic ignored "-Wlanguage-extension-token" | ||
498 | #endif | ||
499 | #endif | ||
500 | |||
501 | |||
467 | #ifdef MY_CPU_64BIT | 502 | #ifdef MY_CPU_64BIT |
468 | 503 | ||
469 | #if defined(MY_CPU_ARM64) \ | 504 | #if defined(MY_CPU_ARM64) \ |
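The SwapBytes.c hunk above tightens the conditions for USE_SWAP_128 (the 128-bit NEON path): little-endian ARM only, __ARM_FP present, and an explicit "fpu=neon" target attribute for old GCC on arm32. The filter itself reverses bytes inside each 16- or 32-bit item; a minimal sketch of that kernel with NEON, assuming a buffer whose size is a multiple of 16 bytes (Demo_ names are illustrative):

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

// swap the two bytes of every 16-bit item; vrev16q_u8 handles 16 bytes per step
static void Demo_SwapBytes2_Neon(uint8_t *p, size_t numBytes)
{
  const uint8_t *lim = p + numBytes;   // assumes numBytes % 16 == 0
  for (; p != lim; p += 16)
    vst1q_u8(p, vrev16q_u8(vld1q_u8(p)));
}
// for 32-bit items the same loop would use vrev32q_u8 instead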
diff --git a/C/Threads.c b/C/Threads.c index cf52bd3..464efec 100644 --- a/C/Threads.c +++ b/C/Threads.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Threads.c -- multithreading library | 1 | /* Threads.c -- multithreading library |
2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-03-28 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -195,20 +195,19 @@ WRes CriticalSection_Init(CCriticalSection *p) | |||
195 | 195 | ||
196 | // ---------- POSIX ---------- | 196 | // ---------- POSIX ---------- |
197 | 197 | ||
198 | #ifndef __APPLE__ | 198 | #if defined(__linux__) && !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__) |
199 | #ifndef Z7_AFFINITY_DISABLE | 199 | #ifndef Z7_AFFINITY_DISABLE |
200 | // _GNU_SOURCE can be required for pthread_setaffinity_np() / CPU_ZERO / CPU_SET | 200 | // _GNU_SOURCE can be required for pthread_setaffinity_np() / CPU_ZERO / CPU_SET |
201 | // clang < 3.6 : unknown warning group '-Wreserved-id-macro' | 201 | // clang < 3.6 : unknown warning group '-Wreserved-id-macro' |
202 | // clang 3.6 - 12.01 : gives warning "macro name is a reserved identifier" | 202 | // clang 3.6 - 12.01 : gives warning "macro name is a reserved identifier" |
203 | // clang >= 13 : do not give warning | 203 | // clang >= 13 : do not give warning |
204 | #if !defined(_GNU_SOURCE) | 204 | #if !defined(_GNU_SOURCE) |
205 | #if defined(__clang__) && (__clang_major__ >= 4) && (__clang_major__ <= 12) | 205 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER |
206 | #pragma GCC diagnostic ignored "-Wreserved-id-macro" | 206 | // #define _GNU_SOURCE |
207 | #endif | 207 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER |
208 | #define _GNU_SOURCE | ||
209 | #endif // !defined(_GNU_SOURCE) | 208 | #endif // !defined(_GNU_SOURCE) |
210 | #endif // Z7_AFFINITY_DISABLE | 209 | #endif // Z7_AFFINITY_DISABLE |
211 | #endif // __APPLE__ | 210 | #endif // __linux__ |
212 | 211 | ||
213 | #include "Threads.h" | 212 | #include "Threads.h" |
214 | 213 | ||
@@ -244,8 +243,9 @@ WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, | |||
244 | { | 243 | { |
245 | if (cpuSet) | 244 | if (cpuSet) |
246 | { | 245 | { |
247 | #ifdef Z7_AFFINITY_SUPPORTED | 246 | // pthread_attr_setaffinity_np() is not supported for MUSL compile. |
248 | 247 | // so we check for __GLIBC__ here | |
248 | #if defined(Z7_AFFINITY_SUPPORTED) && defined( __GLIBC__) | ||
249 | /* | 249 | /* |
250 | printf("\n affinity :"); | 250 | printf("\n affinity :"); |
251 | unsigned i; | 251 | unsigned i; |
@@ -267,7 +267,7 @@ WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, | |||
267 | // ret2 = | 267 | // ret2 = |
268 | pthread_attr_setaffinity_np(&attr, sizeof(*cpuSet), cpuSet); | 268 | pthread_attr_setaffinity_np(&attr, sizeof(*cpuSet), cpuSet); |
269 | // if (ret2) ret = ret2; | 269 | // if (ret2) ret = ret2; |
270 | #endif | 270 | #endif |
271 | } | 271 | } |
272 | 272 | ||
273 | ret = pthread_create(&p->_tid, &attr, func, param); | 273 | ret = pthread_create(&p->_tid, &attr, func, param); |
@@ -369,13 +369,20 @@ WRes AutoResetEvent_CreateNotSignaled(CAutoResetEvent *p) | |||
369 | { return AutoResetEvent_Create(p, 0); } | 369 | { return AutoResetEvent_Create(p, 0); } |
370 | 370 | ||
371 | 371 | ||
372 | #if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13) | ||
373 | // freebsd: | ||
374 | #pragma GCC diagnostic ignored "-Wthread-safety-analysis" | ||
375 | #endif | ||
376 | |||
372 | WRes Event_Set(CEvent *p) | 377 | WRes Event_Set(CEvent *p) |
373 | { | 378 | { |
374 | RINOK(pthread_mutex_lock(&p->_mutex)) | 379 | RINOK(pthread_mutex_lock(&p->_mutex)) |
375 | p->_state = True; | 380 | p->_state = True; |
376 | int res1 = pthread_cond_broadcast(&p->_cond); | 381 | { |
377 | int res2 = pthread_mutex_unlock(&p->_mutex); | 382 | const int res1 = pthread_cond_broadcast(&p->_cond); |
378 | return (res2 ? res2 : res1); | 383 | const int res2 = pthread_mutex_unlock(&p->_mutex); |
384 | return (res2 ? res2 : res1); | ||
385 | } | ||
379 | } | 386 | } |
380 | 387 | ||
381 | WRes Event_Reset(CEvent *p) | 388 | WRes Event_Reset(CEvent *p) |
@@ -408,8 +415,8 @@ WRes Event_Close(CEvent *p) | |||
408 | return 0; | 415 | return 0; |
409 | p->_created = 0; | 416 | p->_created = 0; |
410 | { | 417 | { |
411 | int res1 = pthread_mutex_destroy(&p->_mutex); | 418 | const int res1 = pthread_mutex_destroy(&p->_mutex); |
412 | int res2 = pthread_cond_destroy(&p->_cond); | 419 | const int res2 = pthread_cond_destroy(&p->_cond); |
413 | return (res1 ? res1 : res2); | 420 | return (res1 ? res1 : res2); |
414 | } | 421 | } |
415 | } | 422 | } |
@@ -487,8 +494,8 @@ WRes Semaphore_Close(CSemaphore *p) | |||
487 | return 0; | 494 | return 0; |
488 | p->_created = 0; | 495 | p->_created = 0; |
489 | { | 496 | { |
490 | int res1 = pthread_mutex_destroy(&p->_mutex); | 497 | const int res1 = pthread_mutex_destroy(&p->_mutex); |
491 | int res2 = pthread_cond_destroy(&p->_cond); | 498 | const int res2 = pthread_cond_destroy(&p->_cond); |
492 | return (res1 ? res1 : res2); | 499 | return (res1 ? res1 : res2); |
493 | } | 500 | } |
494 | } | 501 | } |
@@ -549,6 +556,18 @@ LONG InterlockedIncrement(LONG volatile *addend) | |||
549 | #endif | 556 | #endif |
550 | } | 557 | } |
551 | 558 | ||
559 | LONG InterlockedDecrement(LONG volatile *addend) | ||
560 | { | ||
561 | // Print("InterlockedDecrement") | ||
562 | #ifdef USE_HACK_UNSAFE_ATOMIC | ||
563 | LONG val = *addend - 1; | ||
564 | *addend = val; | ||
565 | return val; | ||
566 | #else | ||
567 | return __sync_sub_and_fetch(addend, 1); | ||
568 | #endif | ||
569 | } | ||
570 | |||
552 | #endif // _WIN32 | 571 | #endif // _WIN32 |
553 | 572 | ||
554 | WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p) | 573 | WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p) |
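The Threads.c hunks narrow the _GNU_SOURCE/affinity logic to glibc on Linux (pthread_attr_setaffinity_np is absent under MUSL, as the new comment notes), make the pthread result variables const locals, and add an InterlockedDecrement counterpart to InterlockedIncrement for the POSIX branch. A self-contained sketch of that new function's contract (atomic decrement with a full barrier, returning the new value), assuming the GCC/Clang builtin used in the patch:

#include <stdio.h>

typedef long DemoLONG;  // stands in for the LONG type used above

static DemoLONG Demo_InterlockedDecrement(DemoLONG volatile *addend)
{
  // __sync_sub_and_fetch: atomic subtract, returns the decremented value
  return __sync_sub_and_fetch(addend, 1);
}

int main(void)
{
  volatile DemoLONG refs = 2;
  if (Demo_InterlockedDecrement(&refs) == 0) puts("released too early (bug)");
  if (Demo_InterlockedDecrement(&refs) == 0) puts("last reference released");
  return 0;
}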
diff --git a/C/Threads.h b/C/Threads.h index 4028464..c1484a2 100644 --- a/C/Threads.h +++ b/C/Threads.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* Threads.h -- multithreading library | 1 | /* Threads.h -- multithreading library |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-28 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_THREADS_H | 4 | #ifndef ZIP7_INC_THREADS_H |
5 | #define ZIP7_INC_THREADS_H | 5 | #define ZIP7_INC_THREADS_H |
@@ -9,12 +9,21 @@ | |||
9 | 9 | ||
10 | #else | 10 | #else |
11 | 11 | ||
12 | #include "Compiler.h" | ||
13 | |||
14 | // #define Z7_AFFINITY_DISABLE | ||
12 | #if defined(__linux__) | 15 | #if defined(__linux__) |
13 | #if !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__) | 16 | #if !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__) |
14 | #ifndef Z7_AFFINITY_DISABLE | 17 | #ifndef Z7_AFFINITY_DISABLE |
15 | #define Z7_AFFINITY_SUPPORTED | 18 | #define Z7_AFFINITY_SUPPORTED |
16 | // #pragma message(" ==== Z7_AFFINITY_SUPPORTED") | 19 | // #pragma message(" ==== Z7_AFFINITY_SUPPORTED") |
17 | // #define _GNU_SOURCE | 20 | #if !defined(_GNU_SOURCE) |
21 | // #pragma message(" ==== _GNU_SOURCE set") | ||
22 | // we need _GNU_SOURCE for cpu_set_t, if we compile for MUSL | ||
23 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
24 | #define _GNU_SOURCE | ||
25 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
26 | #endif | ||
18 | #endif | 27 | #endif |
19 | #endif | 28 | #endif |
20 | #endif | 29 | #endif |
@@ -173,7 +182,7 @@ WRes CriticalSection_Init(CCriticalSection *p); | |||
173 | 182 | ||
174 | #else // _WIN32 | 183 | #else // _WIN32 |
175 | 184 | ||
176 | typedef struct _CEvent | 185 | typedef struct |
177 | { | 186 | { |
178 | int _created; | 187 | int _created; |
179 | int _manual_reset; | 188 | int _manual_reset; |
@@ -199,7 +208,7 @@ WRes Event_Wait(CEvent *p); | |||
199 | WRes Event_Close(CEvent *p); | 208 | WRes Event_Close(CEvent *p); |
200 | 209 | ||
201 | 210 | ||
202 | typedef struct _CSemaphore | 211 | typedef struct |
203 | { | 212 | { |
204 | int _created; | 213 | int _created; |
205 | UInt32 _count; | 214 | UInt32 _count; |
@@ -219,7 +228,7 @@ WRes Semaphore_Wait(CSemaphore *p); | |||
219 | WRes Semaphore_Close(CSemaphore *p); | 228 | WRes Semaphore_Close(CSemaphore *p); |
220 | 229 | ||
221 | 230 | ||
222 | typedef struct _CCriticalSection | 231 | typedef struct |
223 | { | 232 | { |
224 | pthread_mutex_t _mutex; | 233 | pthread_mutex_t _mutex; |
225 | } CCriticalSection; | 234 | } CCriticalSection; |
@@ -230,6 +239,7 @@ void CriticalSection_Enter(CCriticalSection *cs); | |||
230 | void CriticalSection_Leave(CCriticalSection *cs); | 239 | void CriticalSection_Leave(CCriticalSection *cs); |
231 | 240 | ||
232 | LONG InterlockedIncrement(LONG volatile *addend); | 241 | LONG InterlockedIncrement(LONG volatile *addend); |
242 | LONG InterlockedDecrement(LONG volatile *addend); | ||
233 | 243 | ||
234 | #endif // _WIN32 | 244 | #endif // _WIN32 |
235 | 245 | ||
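Threads.h now defines _GNU_SOURCE itself (inside the reserved-macro diagnostic guards) because cpu_set_t needs it, notably when compiling against MUSL; it also drops the struct tags (_CEvent, _CSemaphore, _CCriticalSection), since identifiers starting with an underscore and a capital letter are reserved in C. A minimal affinity sketch under the same assumption as the patch (glibc on Linux):

#define _GNU_SOURCE          // must precede the first libc header
#include <pthread.h>
#include <sched.h>

// pin the calling thread to CPU 0; returns 0 on success, an errno code otherwise
static int Demo_PinSelfToCpu0(void)
{
  cpu_set_t set;
  CPU_ZERO(&set);
  CPU_SET(0, &set);
  return pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
}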
diff --git a/C/Util/7z/7z.dsp b/C/Util/7z/7z.dsp index 11e1b03..474c660 100644 --- a/C/Util/7z/7z.dsp +++ b/C/Util/7z/7z.dsp | |||
@@ -42,7 +42,7 @@ RSC=rc.exe | |||
42 | # PROP Ignore_Export_Lib 0 | 42 | # PROP Ignore_Export_Lib 0 |
43 | # PROP Target_Dir "" | 43 | # PROP Target_Dir "" |
44 | # ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c | 44 | # ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c |
45 | # ADD CPP /nologo /MD /W4 /WX /GX /O2 /D "NDEBUG" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /FAcs /Yu"Precomp.h" /FD /c | 45 | # ADD CPP /nologo /MD /W4 /WX /GX /O2 /D "NDEBUG" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /D "Z7_EXTRACT_ONLY" /FAcs /Yu"Precomp.h" /FD /c |
46 | # ADD BASE RSC /l 0x419 /d "NDEBUG" | 46 | # ADD BASE RSC /l 0x419 /d "NDEBUG" |
47 | # ADD RSC /l 0x419 /d "NDEBUG" | 47 | # ADD RSC /l 0x419 /d "NDEBUG" |
48 | BSC32=bscmake.exe | 48 | BSC32=bscmake.exe |
@@ -67,7 +67,7 @@ LINK32=link.exe | |||
67 | # PROP Ignore_Export_Lib 0 | 67 | # PROP Ignore_Export_Lib 0 |
68 | # PROP Target_Dir "" | 68 | # PROP Target_Dir "" |
69 | # ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c | 69 | # ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c |
70 | # ADD CPP /nologo /W4 /WX /Gm /GX /ZI /Od /D "_DEBUG" /D "_SZ_ALLOC_DEBUG2" /D "_SZ_NO_INT_64_A" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /Yu"Precomp.h" /FD /GZ /c | 70 | # ADD CPP /nologo /W4 /WX /Gm /GX /ZI /Od /D "_DEBUG" /D "_SZ_ALLOC_DEBUG2" /D "_SZ_NO_INT_64_A" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /D "Z7_EXTRACT_ONLY" /Yu"Precomp.h" /FD /GZ /c |
71 | # ADD BASE RSC /l 0x419 /d "_DEBUG" | 71 | # ADD BASE RSC /l 0x419 /d "_DEBUG" |
72 | # ADD RSC /l 0x419 /d "_DEBUG" | 72 | # ADD RSC /l 0x419 /d "_DEBUG" |
73 | BSC32=bscmake.exe | 73 | BSC32=bscmake.exe |
@@ -234,6 +234,10 @@ SOURCE=.\Precomp.c | |||
234 | # End Source File | 234 | # End Source File |
235 | # Begin Source File | 235 | # Begin Source File |
236 | 236 | ||
237 | SOURCE=..\..\Precomp.h | ||
238 | # End Source File | ||
239 | # Begin Source File | ||
240 | |||
237 | SOURCE=.\Precomp.h | 241 | SOURCE=.\Precomp.h |
238 | # End Source File | 242 | # End Source File |
239 | # End Group | 243 | # End Group |
diff --git a/C/Util/7z/7zMain.c b/C/Util/7z/7zMain.c index 547920a..6baf979 100644 --- a/C/Util/7z/7zMain.c +++ b/C/Util/7z/7zMain.c | |||
@@ -1,20 +1,11 @@ | |||
1 | /* 7zMain.c - Test application for 7z Decoder | 1 | /* 7zMain.c - Test application for 7z Decoder |
2 | 2023-04-04 : Igor Pavlov : Public domain */ | 2 | 2024-02-28 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
6 | #include <stdio.h> | 6 | #include <stdio.h> |
7 | #include <string.h> | 7 | #include <string.h> |
8 | 8 | ||
9 | #include "../../CpuArch.h" | ||
10 | |||
11 | #include "../../7z.h" | ||
12 | #include "../../7zAlloc.h" | ||
13 | #include "../../7zBuf.h" | ||
14 | #include "../../7zCrc.h" | ||
15 | #include "../../7zFile.h" | ||
16 | #include "../../7zVersion.h" | ||
17 | |||
18 | #ifndef USE_WINDOWS_FILE | 9 | #ifndef USE_WINDOWS_FILE |
19 | /* for mkdir */ | 10 | /* for mkdir */ |
20 | #ifdef _WIN32 | 11 | #ifdef _WIN32 |
@@ -32,6 +23,15 @@ | |||
32 | #endif | 23 | #endif |
33 | #endif | 24 | #endif |
34 | 25 | ||
26 | #include "../../7zFile.h" | ||
27 | #include "../../7z.h" | ||
28 | #include "../../7zAlloc.h" | ||
29 | #include "../../7zBuf.h" | ||
30 | #include "../../7zCrc.h" | ||
31 | #include "../../7zVersion.h" | ||
32 | |||
33 | #include "../../CpuArch.h" | ||
34 | |||
35 | #define kInputBufSize ((size_t)1 << 18) | 35 | #define kInputBufSize ((size_t)1 << 18) |
36 | 36 | ||
37 | static const ISzAlloc g_Alloc = { SzAlloc, SzFree }; | 37 | static const ISzAlloc g_Alloc = { SzAlloc, SzFree }; |
@@ -168,12 +168,12 @@ static SRes Utf16_To_Char(CBuf *buf, const UInt16 *s | |||
168 | #endif | 168 | #endif |
169 | ) | 169 | ) |
170 | { | 170 | { |
171 | unsigned len = 0; | 171 | size_t len = 0; |
172 | for (len = 0; s[len] != 0; len++) {} | 172 | for (len = 0; s[len] != 0; len++) {} |
173 | 173 | ||
174 | #ifndef MY_USE_UTF8 | 174 | #ifndef MY_USE_UTF8 |
175 | { | 175 | { |
176 | const unsigned size = len * 3 + 100; | 176 | const size_t size = len * 3 + 100; |
177 | if (!Buf_EnsureSize(buf, size)) | 177 | if (!Buf_EnsureSize(buf, size)) |
178 | return SZ_ERROR_MEM; | 178 | return SZ_ERROR_MEM; |
179 | { | 179 | { |
@@ -320,21 +320,20 @@ static void UIntToStr_2(char *s, unsigned value) | |||
320 | // typedef long BOOL; | 320 | // typedef long BOOL; |
321 | typedef int BOOL; | 321 | typedef int BOOL; |
322 | 322 | ||
323 | typedef struct _FILETIME | 323 | typedef struct |
324 | { | 324 | { |
325 | DWORD dwLowDateTime; | 325 | DWORD dwLowDateTime; |
326 | DWORD dwHighDateTime; | 326 | DWORD dwHighDateTime; |
327 | } FILETIME; | 327 | } FILETIME; |
328 | 328 | ||
329 | static LONG TIME_GetBias() | 329 | static LONG TIME_GetBias(void) |
330 | { | 330 | { |
331 | const time_t utc = time(NULL); | 331 | const time_t utc = time(NULL); |
332 | struct tm *ptm = localtime(&utc); | 332 | struct tm *ptm = localtime(&utc); |
333 | const int localdaylight = ptm->tm_isdst; /* daylight for local timezone */ | 333 | const int localdaylight = ptm->tm_isdst; /* daylight for local timezone */ |
334 | ptm = gmtime(&utc); | 334 | ptm = gmtime(&utc); |
335 | ptm->tm_isdst = localdaylight; /* use local daylight, not that of Greenwich */ | 335 | ptm->tm_isdst = localdaylight; /* use local daylight, not that of Greenwich */ |
336 | const LONG bias = (int)(mktime(ptm) - utc); | 336 | return (int)(mktime(ptm) - utc); |
337 | return bias; | ||
338 | } | 337 | } |
339 | 338 | ||
340 | #define TICKS_PER_SEC 10000000 | 339 | #define TICKS_PER_SEC 10000000 |
@@ -359,11 +358,11 @@ static BOOL WINAPI FileTimeToLocalFileTime(const FILETIME *fileTime, FILETIME *l | |||
359 | static const UInt32 kNumTimeQuantumsInSecond = 10000000; | 358 | static const UInt32 kNumTimeQuantumsInSecond = 10000000; |
360 | static const UInt32 kFileTimeStartYear = 1601; | 359 | static const UInt32 kFileTimeStartYear = 1601; |
361 | static const UInt32 kUnixTimeStartYear = 1970; | 360 | static const UInt32 kUnixTimeStartYear = 1970; |
362 | static const UInt64 kUnixTimeOffset = | ||
363 | (UInt64)60 * 60 * 24 * (89 + 365 * (kUnixTimeStartYear - kFileTimeStartYear)); | ||
364 | 361 | ||
365 | static Int64 Time_FileTimeToUnixTime64(const FILETIME *ft) | 362 | static Int64 Time_FileTimeToUnixTime64(const FILETIME *ft) |
366 | { | 363 | { |
364 | const UInt64 kUnixTimeOffset = | ||
365 | (UInt64)60 * 60 * 24 * (89 + 365 * (kUnixTimeStartYear - kFileTimeStartYear)); | ||
367 | const UInt64 winTime = GET_TIME_64(ft); | 366 | const UInt64 winTime = GET_TIME_64(ft); |
368 | return (Int64)(winTime / kNumTimeQuantumsInSecond) - (Int64)kUnixTimeOffset; | 367 | return (Int64)(winTime / kNumTimeQuantumsInSecond) - (Int64)kUnixTimeOffset; |
369 | } | 368 | } |
@@ -384,8 +383,10 @@ static void FILETIME_To_timespec(const FILETIME *ft, struct MY_ST_TIMESPEC *ts) | |||
384 | if (sec2 == sec) | 383 | if (sec2 == sec) |
385 | { | 384 | { |
386 | ts->tv_sec = sec2; | 385 | ts->tv_sec = sec2; |
387 | const UInt64 winTime = GET_TIME_64(ft); | 386 | { |
388 | ts->tv_nsec = (long)((winTime % 10000000) * 100); | 387 | const UInt64 winTime = GET_TIME_64(ft); |
388 | ts->tv_nsec = (long)((winTime % 10000000) * 100); | ||
389 | } | ||
389 | return; | 390 | return; |
390 | } | 391 | } |
391 | } | 392 | } |
@@ -429,7 +430,7 @@ static void ConvertFileTimeToString(const CNtfsFileTime *nTime, char *s) | |||
429 | { | 430 | { |
430 | unsigned year, mon, hour, min, sec; | 431 | unsigned year, mon, hour, min, sec; |
431 | Byte ms[] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }; | 432 | Byte ms[] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }; |
432 | unsigned t; | 433 | UInt32 t; |
433 | UInt32 v; | 434 | UInt32 v; |
434 | // UInt64 v64 = nt->Low | ((UInt64)nt->High << 32); | 435 | // UInt64 v64 = nt->Low | ((UInt64)nt->High << 32); |
435 | UInt64 v64; | 436 | UInt64 v64; |
@@ -461,7 +462,7 @@ static void ConvertFileTimeToString(const CNtfsFileTime *nTime, char *s) | |||
461 | ms[1] = 29; | 462 | ms[1] = 29; |
462 | for (mon = 0;; mon++) | 463 | for (mon = 0;; mon++) |
463 | { | 464 | { |
464 | const unsigned d = ms[mon]; | 465 | const UInt32 d = ms[mon]; |
465 | if (v < d) | 466 | if (v < d) |
466 | break; | 467 | break; |
467 | v -= d; | 468 | v -= d; |
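In 7zMain.c the UTF-16 length math moves from unsigned to size_t (so len * 3 + 100 cannot wrap for very long names), kUnixTimeOffset becomes a function-local constant, and the remaining hunks are C90-friendly scoping plus widening the date math to UInt32. The epoch constant itself is easy to verify: 1601..1969 spans 369 years, 89 of them leap years. A standalone sketch of the conversion used above:

#include <stdint.h>
#include <stdio.h>

// FILETIME counts 100 ns ticks since 1601-01-01; Unix time counts seconds since 1970-01-01
static int64_t Demo_FileTimeToUnixTime64(uint64_t winTime)
{
  const uint64_t kTicksPerSec = 10000000;
  const uint64_t kUnixTimeOffsetSec =
      (uint64_t)60 * 60 * 24 * (89 + 365 * (1970 - 1601));
  return (int64_t)(winTime / kTicksPerSec) - (int64_t)kUnixTimeOffsetSec;
}

int main(void)
{
  // 1970-01-01 00:00:00 expressed as FILETIME is 116444736000000000 ticks
  printf("%lld\n", (long long)Demo_FileTimeToUnixTime64(116444736000000000ull));
  return 0;  // prints 0
}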
diff --git a/C/Util/7z/Precomp.h b/C/Util/7z/Precomp.h index bc8fa21..13a41ef 100644 --- a/C/Util/7z/Precomp.h +++ b/C/Util/7z/Precomp.h | |||
@@ -1,14 +1,13 @@ | |||
1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- Precomp |
2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-23 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | // #ifndef ZIP7_INC_PRECOMP_LOC_H |
5 | #define ZIP7_INC_PRECOMP_H | 5 | // #define ZIP7_INC_PRECOMP_LOC_H |
6 | 6 | ||
7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 | 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 |
8 | #pragma warning(disable : 4464) // relative include path contains '..' | 8 | #pragma warning(disable : 4464) // relative include path contains '..' |
9 | #endif | 9 | #endif |
10 | 10 | ||
11 | #include "../../Compiler.h" | 11 | #include "../../Precomp.h" |
12 | #include "../../7zTypes.h" | ||
13 | 12 | ||
14 | #endif | 13 | // #endif |
diff --git a/C/Util/7z/makefile b/C/Util/7z/makefile index dfc560e..987f065 100644 --- a/C/Util/7z/makefile +++ b/C/Util/7z/makefile | |||
@@ -5,8 +5,6 @@ PROG = 7zDec.exe | |||
5 | C_OBJS = \ | 5 | C_OBJS = \ |
6 | $O\7zAlloc.obj \ | 6 | $O\7zAlloc.obj \ |
7 | $O\7zBuf.obj \ | 7 | $O\7zBuf.obj \ |
8 | $O\7zCrc.obj \ | ||
9 | $O\7zCrcOpt.obj \ | ||
10 | $O\7zFile.obj \ | 8 | $O\7zFile.obj \ |
11 | $O\7zDec.obj \ | 9 | $O\7zDec.obj \ |
12 | $O\7zArcIn.obj \ | 10 | $O\7zArcIn.obj \ |
@@ -25,10 +23,14 @@ C_OBJS = \ | |||
25 | 7Z_OBJS = \ | 23 | 7Z_OBJS = \ |
26 | $O\7zMain.obj \ | 24 | $O\7zMain.obj \ |
27 | 25 | ||
26 | !include "../../../CPP/7zip/Crc.mak" | ||
27 | !include "../../../CPP/7zip/LzmaDec.mak" | ||
28 | |||
28 | OBJS = \ | 29 | OBJS = \ |
29 | $O\Precomp.obj \ | 30 | $O\Precomp.obj \ |
30 | $(7Z_OBJS) \ | 31 | $(7Z_OBJS) \ |
31 | $(C_OBJS) \ | 32 | $(C_OBJS) \ |
33 | $(ASM_OBJS) \ | ||
32 | 34 | ||
33 | !include "../../../CPP/Build.mak" | 35 | !include "../../../CPP/Build.mak" |
34 | 36 | ||
@@ -38,3 +40,5 @@ $(C_OBJS): ../../$(*B).c | |||
38 | $(CCOMPL_USE) | 40 | $(CCOMPL_USE) |
39 | $O\Precomp.obj: Precomp.c | 41 | $O\Precomp.obj: Precomp.c |
40 | $(CCOMPL_PCH) | 42 | $(CCOMPL_PCH) |
43 | |||
44 | !include "../../Asm_c.mak" | ||
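The makefile hunks replace hard-coded 7zCrc*.obj entries with !include of shared fragments (Crc.mak, LzmaDec.mak, Asm_c.mak), so every utility picks up the C or assembler CRC/LZMA objects from one place and gains $(ASM_OBJS). Those .mak files are not part of this diff; purely as a hypothetical illustration, such a fragment could look like:

# Crc.mak (hypothetical sketch, not the shipped file)
# the including makefile links $(CRC_OBJS), and Asm_c.mak is expected
# to turn matching .asm sources into $(ASM_OBJS)
CRC_OBJS = \
  $O\7zCrc.obj \
  $O\7zCrcOpt.obj \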
diff --git a/C/Util/7zipInstall/7zipInstall.c b/C/Util/7zipInstall/7zipInstall.c index 7f5fd19..7d8e8c4 100644 --- a/C/Util/7zipInstall/7zipInstall.c +++ b/C/Util/7zipInstall/7zipInstall.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* 7zipInstall.c - 7-Zip Installer | 1 | /* 7zipInstall.c - 7-Zip Installer |
2 | 2023-04-04 : Igor Pavlov : Public domain */ | 2 | 2024-04-05 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -11,6 +11,8 @@ | |||
11 | #pragma warning(disable : 4201) // nonstandard extension used : nameless struct/union | 11 | #pragma warning(disable : 4201) // nonstandard extension used : nameless struct/union |
12 | #endif | 12 | #endif |
13 | 13 | ||
14 | Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION | ||
15 | |||
14 | #ifdef Z7_OLD_WIN_SDK | 16 | #ifdef Z7_OLD_WIN_SDK |
15 | struct IShellView; | 17 | struct IShellView; |
16 | #define SHFOLDERAPI EXTERN_C DECLSPEC_IMPORT HRESULT STDAPICALLTYPE | 18 | #define SHFOLDERAPI EXTERN_C DECLSPEC_IMPORT HRESULT STDAPICALLTYPE |
@@ -41,16 +43,6 @@ typedef enum { | |||
41 | // #pragma GCC diagnostic ignored "-Wcast-function-type" | 43 | // #pragma GCC diagnostic ignored "-Wcast-function-type" |
42 | #endif | 44 | #endif |
43 | 45 | ||
44 | #if defined(__clang__) || defined(__GNUC__) | ||
45 | typedef void (*Z7_voidFunction)(void); | ||
46 | #define MY_CAST_FUNC (Z7_voidFunction) | ||
47 | #elif defined(_MSC_VER) && _MSC_VER > 1920 | ||
48 | #define MY_CAST_FUNC (void *) | ||
49 | // #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' | ||
50 | #else | ||
51 | #define MY_CAST_FUNC | ||
52 | #endif | ||
53 | |||
54 | #define LLL_(quote) L##quote | 46 | #define LLL_(quote) L##quote |
55 | #define LLL(quote) LLL_(quote) | 47 | #define LLL(quote) LLL_(quote) |
56 | 48 | ||
@@ -118,11 +110,13 @@ static LPCWSTR const k_Reg_Path32 = L"Path" | |||
118 | #define k_Reg_WOW_Flag 0 | 110 | #define k_Reg_WOW_Flag 0 |
119 | #endif | 111 | #endif |
120 | 112 | ||
113 | #ifdef USE_7ZIP_32_DLL | ||
121 | #ifdef _WIN64 | 114 | #ifdef _WIN64 |
122 | #define k_Reg_WOW_Flag_32 KEY_WOW64_32KEY | 115 | #define k_Reg_WOW_Flag_32 KEY_WOW64_32KEY |
123 | #else | 116 | #else |
124 | #define k_Reg_WOW_Flag_32 0 | 117 | #define k_Reg_WOW_Flag_32 0 |
125 | #endif | 118 | #endif |
119 | #endif | ||
126 | 120 | ||
127 | #define k_7zip_CLSID L"{23170F69-40C1-278A-1000-000100020000}" | 121 | #define k_7zip_CLSID L"{23170F69-40C1-278A-1000-000100020000}" |
128 | 122 | ||
@@ -219,11 +213,11 @@ static DWORD GetFileVersion(LPCWSTR s) | |||
219 | return 0; | 213 | return 0; |
220 | } | 214 | } |
221 | 215 | ||
222 | my_GetFileVersionInfoSizeW = (Func_GetFileVersionInfoSizeW) MY_CAST_FUNC GetProcAddress(g_version_dll_hModule, | 216 | my_GetFileVersionInfoSizeW = (Func_GetFileVersionInfoSizeW) Z7_CAST_FUNC_C GetProcAddress(g_version_dll_hModule, |
223 | "GetFileVersionInfoSizeW"); | 217 | "GetFileVersionInfoSizeW"); |
224 | my_GetFileVersionInfoW = (Func_GetFileVersionInfoW) MY_CAST_FUNC GetProcAddress(g_version_dll_hModule, | 218 | my_GetFileVersionInfoW = (Func_GetFileVersionInfoW) Z7_CAST_FUNC_C GetProcAddress(g_version_dll_hModule, |
225 | "GetFileVersionInfoW"); | 219 | "GetFileVersionInfoW"); |
226 | my_VerQueryValueW = (Func_VerQueryValueW) MY_CAST_FUNC GetProcAddress(g_version_dll_hModule, | 220 | my_VerQueryValueW = (Func_VerQueryValueW) Z7_CAST_FUNC_C GetProcAddress(g_version_dll_hModule, |
227 | "VerQueryValueW"); | 221 | "VerQueryValueW"); |
228 | 222 | ||
229 | if (!my_GetFileVersionInfoSizeW | 223 | if (!my_GetFileVersionInfoSizeW |
@@ -1102,7 +1096,7 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, | |||
1102 | { | 1096 | { |
1103 | BOOL isWow64 = FALSE; | 1097 | BOOL isWow64 = FALSE; |
1104 | const Func_IsWow64Process func_IsWow64Process = (Func_IsWow64Process) | 1098 | const Func_IsWow64Process func_IsWow64Process = (Func_IsWow64Process) |
1105 | MY_CAST_FUNC GetProcAddress(GetModuleHandleW(L"kernel32.dll"), | 1099 | Z7_CAST_FUNC_C GetProcAddress(GetModuleHandleW(L"kernel32.dll"), |
1106 | "IsWow64Process"); | 1100 | "IsWow64Process"); |
1107 | 1101 | ||
1108 | if (func_IsWow64Process) | 1102 | if (func_IsWow64Process) |
@@ -1111,7 +1105,13 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, | |||
1111 | if (!isWow64) | 1105 | if (!isWow64) |
1112 | { | 1106 | { |
1113 | if (!g_SilentMode) | 1107 | if (!g_SilentMode) |
1114 | PrintErrorMessage("This installation requires Windows " MY_CPU_NAME, NULL); | 1108 | PrintErrorMessage("This installation requires Windows " |
1109 | #ifdef MY_CPU_X86_OR_AMD64 | ||
1110 | "x64" | ||
1111 | #else | ||
1112 | "64-bit" | ||
1113 | #endif | ||
1114 | , NULL); | ||
1115 | return 1; | 1115 | return 1; |
1116 | } | 1116 | } |
1117 | } | 1117 | } |
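7zipInstall.c drops its local MY_CAST_FUNC in favor of the shared Z7_CAST_FUNC_C and Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION (defined in the common headers, not shown in this diff), and the WOW64 error text no longer depends on MY_CPU_NAME. The cast macro exists because GCC/Clang warn when the FARPROC from GetProcAddress is converted straight to a specific prototype; routing the cast through a plain void-function type silences -Wcast-function-type. A Win32 sketch of the same idiom (Demo_ names are illustrative):

#include <windows.h>

typedef BOOL (WINAPI *Func_IsWow64Process)(HANDLE, PBOOL);
typedef void (*Demo_VoidFunc)(void);     // intermediate type for the cast
#define DEMO_CAST_FUNC (Demo_VoidFunc)

static BOOL Demo_IsWow64(void)
{
  BOOL isWow64 = FALSE;
  const Func_IsWow64Process f = (Func_IsWow64Process) DEMO_CAST_FUNC
      GetProcAddress(GetModuleHandleW(L"kernel32.dll"), "IsWow64Process");
  if (f)
    f(GetCurrentProcess(), &isWow64);
  return isWow64;
}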
diff --git a/C/Util/7zipInstall/Precomp.h b/C/Util/7zipInstall/Precomp.h index bc8fa21..13a41ef 100644 --- a/C/Util/7zipInstall/Precomp.h +++ b/C/Util/7zipInstall/Precomp.h | |||
@@ -1,14 +1,13 @@ | |||
1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- Precomp |
2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-23 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | // #ifndef ZIP7_INC_PRECOMP_LOC_H |
5 | #define ZIP7_INC_PRECOMP_H | 5 | // #define ZIP7_INC_PRECOMP_LOC_H |
6 | 6 | ||
7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 | 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 |
8 | #pragma warning(disable : 4464) // relative include path contains '..' | 8 | #pragma warning(disable : 4464) // relative include path contains '..' |
9 | #endif | 9 | #endif |
10 | 10 | ||
11 | #include "../../Compiler.h" | 11 | #include "../../Precomp.h" |
12 | #include "../../7zTypes.h" | ||
13 | 12 | ||
14 | #endif | 13 | // #endif |
diff --git a/C/Util/7zipInstall/makefile b/C/Util/7zipInstall/makefile index 18e2783..424bd6c 100644 --- a/C/Util/7zipInstall/makefile +++ b/C/Util/7zipInstall/makefile | |||
@@ -19,9 +19,6 @@ C_OBJS = \ | |||
19 | $O\7zAlloc.obj \ | 19 | $O\7zAlloc.obj \ |
20 | $O\7zArcIn.obj \ | 20 | $O\7zArcIn.obj \ |
21 | $O\7zBuf.obj \ | 21 | $O\7zBuf.obj \ |
22 | $O\7zBuf2.obj \ | ||
23 | $O\7zCrc.obj \ | ||
24 | $O\7zCrcOpt.obj \ | ||
25 | $O\7zFile.obj \ | 22 | $O\7zFile.obj \ |
26 | $O\7zDec.obj \ | 23 | $O\7zDec.obj \ |
27 | $O\7zStream.obj \ | 24 | $O\7zStream.obj \ |
@@ -34,11 +31,17 @@ C_OBJS = \ | |||
34 | OBJS = \ | 31 | OBJS = \ |
35 | $(MAIN_OBJS) \ | 32 | $(MAIN_OBJS) \ |
36 | $(C_OBJS) \ | 33 | $(C_OBJS) \ |
34 | $(ASM_OBJS) \ | ||
37 | $O\resource.res | 35 | $O\resource.res |
38 | 36 | ||
37 | !include "../../../CPP/7zip/Crc.mak" | ||
38 | # !include "../../../CPP/7zip/LzmaDec.mak" | ||
39 | |||
39 | !include "../../../CPP/Build.mak" | 40 | !include "../../../CPP/Build.mak" |
40 | 41 | ||
41 | $(MAIN_OBJS): $(*B).c | 42 | $(MAIN_OBJS): $(*B).c |
42 | $(COMPL_O1) | 43 | $(COMPL_O1) |
43 | $(C_OBJS): ../../$(*B).c | 44 | $(C_OBJS): ../../$(*B).c |
44 | $(COMPL_O1) | 45 | $(COMPL_O1) |
46 | |||
47 | !include "../../Asm_c.mak" | ||
diff --git a/C/Util/7zipInstall/resource.rc b/C/Util/7zipInstall/resource.rc index df6474e..40ed580 100644 --- a/C/Util/7zipInstall/resource.rc +++ b/C/Util/7zipInstall/resource.rc | |||
@@ -1,5 +1,6 @@ | |||
1 | #include <winnt.h> | 1 | #include <windows.h> |
2 | #include <WinUser.h> | 2 | // #include <winnt.h> |
3 | // #include <WinUser.h> | ||
3 | #include <CommCtrl.h> | 4 | #include <CommCtrl.h> |
4 | 5 | ||
5 | #define USE_COPYRIGHT_CR | 6 | #define USE_COPYRIGHT_CR |
diff --git a/C/Util/7zipUninstall/7zipUninstall.c b/C/Util/7zipUninstall/7zipUninstall.c index 8bc18b3..e7051e2 100644 --- a/C/Util/7zipUninstall/7zipUninstall.c +++ b/C/Util/7zipUninstall/7zipUninstall.c | |||
@@ -1,10 +1,11 @@ | |||
1 | /* 7zipUninstall.c - 7-Zip Uninstaller | 1 | /* 7zipUninstall.c - 7-Zip Uninstaller |
2 | 2022-07-15 : Igor Pavlov : Public domain */ | 2 | 2024-03-21 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
6 | // #define SZ_ERROR_ABORT 100 | 6 | // #define SZ_ERROR_ABORT 100 |
7 | 7 | ||
8 | #include "../../7zTypes.h" | ||
8 | #include "../../7zWindows.h" | 9 | #include "../../7zWindows.h" |
9 | 10 | ||
10 | #if defined(_MSC_VER) && _MSC_VER < 1600 | 11 | #if defined(_MSC_VER) && _MSC_VER < 1600 |
@@ -31,16 +32,7 @@ typedef enum { | |||
31 | 32 | ||
32 | #include "resource.h" | 33 | #include "resource.h" |
33 | 34 | ||
34 | #if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__) | ||
35 | // #pragma GCC diagnostic ignored "-Wcast-function-type" | ||
36 | #endif | ||
37 | 35 | ||
38 | #if defined(_MSC_VER) && _MSC_VER > 1920 | ||
39 | #define MY_CAST_FUNC (void *) | ||
40 | // #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' | ||
41 | #else | ||
42 | #define MY_CAST_FUNC | ||
43 | #endif | ||
44 | 36 | ||
45 | 37 | ||
46 | #define LLL_(quote) L##quote | 38 | #define LLL_(quote) L##quote |
@@ -101,11 +93,13 @@ static LPCWSTR const k_Reg_Path32 = L"Path" | |||
101 | #define k_Reg_WOW_Flag 0 | 93 | #define k_Reg_WOW_Flag 0 |
102 | #endif | 94 | #endif |
103 | 95 | ||
96 | #ifdef USE_7ZIP_32_DLL | ||
104 | #ifdef _WIN64 | 97 | #ifdef _WIN64 |
105 | #define k_Reg_WOW_Flag_32 KEY_WOW64_32KEY | 98 | #define k_Reg_WOW_Flag_32 KEY_WOW64_32KEY |
106 | #else | 99 | #else |
107 | #define k_Reg_WOW_Flag_32 0 | 100 | #define k_Reg_WOW_Flag_32 0 |
108 | #endif | 101 | #endif |
102 | #endif | ||
109 | 103 | ||
110 | #define k_7zip_CLSID L"{23170F69-40C1-278A-1000-000100020000}" | 104 | #define k_7zip_CLSID L"{23170F69-40C1-278A-1000-000100020000}" |
111 | 105 | ||
@@ -124,9 +118,19 @@ static HWND g_Path_HWND; | |||
124 | static HWND g_InfoLine_HWND; | 118 | static HWND g_InfoLine_HWND; |
125 | static HWND g_Progress_HWND; | 119 | static HWND g_Progress_HWND; |
126 | 120 | ||
127 | // WINADVAPI | 121 | // RegDeleteKeyExW is supported starting from win2003sp1/xp-pro-x64 |
122 | // Z7_WIN32_WINNT_MIN < 0x0600 // Vista | ||
123 | #if !defined(Z7_WIN32_WINNT_MIN) \ | ||
124 | || Z7_WIN32_WINNT_MIN < 0x0502 /* < win2003 */ \ | ||
125 | || Z7_WIN32_WINNT_MIN == 0x0502 && !defined(_M_AMD64) | ||
126 | #define Z7_USE_DYN_RegDeleteKeyExW | ||
127 | #endif | ||
128 | |||
129 | #ifdef Z7_USE_DYN_RegDeleteKeyExW | ||
130 | Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION | ||
128 | typedef LONG (APIENTRY *Func_RegDeleteKeyExW)(HKEY hKey, LPCWSTR lpSubKey, REGSAM samDesired, DWORD Reserved); | 131 | typedef LONG (APIENTRY *Func_RegDeleteKeyExW)(HKEY hKey, LPCWSTR lpSubKey, REGSAM samDesired, DWORD Reserved); |
129 | static Func_RegDeleteKeyExW func_RegDeleteKeyExW; | 132 | static Func_RegDeleteKeyExW func_RegDeleteKeyExW; |
133 | #endif | ||
130 | 134 | ||
131 | static WCHAR cmd[MAX_PATH + 4]; | 135 | static WCHAR cmd[MAX_PATH + 4]; |
132 | static WCHAR cmdError[MAX_PATH + 4]; | 136 | static WCHAR cmdError[MAX_PATH + 4]; |
@@ -247,13 +251,18 @@ static LONG MyRegistry_OpenKey_ReadWrite(HKEY parentKey, LPCWSTR name, HKEY *des | |||
247 | 251 | ||
248 | static LONG MyRegistry_DeleteKey(HKEY parentKey, LPCWSTR name) | 252 | static LONG MyRegistry_DeleteKey(HKEY parentKey, LPCWSTR name) |
249 | { | 253 | { |
250 | #if k_Reg_WOW_Flag != 0 | 254 | #if k_Reg_WOW_Flag != 0 |
251 | if (func_RegDeleteKeyExW) | 255 | #ifdef Z7_USE_DYN_RegDeleteKeyExW |
252 | return func_RegDeleteKeyExW(parentKey, name, k_Reg_WOW_Flag, 0); | 256 | if (!func_RegDeleteKeyExW) |
253 | return E_FAIL; | 257 | return E_FAIL; |
254 | #else | 258 | return func_RegDeleteKeyExW |
259 | #else | ||
260 | return RegDeleteKeyExW | ||
261 | #endif | ||
262 | (parentKey, name, k_Reg_WOW_Flag, 0); | ||
263 | #else | ||
255 | return RegDeleteKeyW(parentKey, name); | 264 | return RegDeleteKeyW(parentKey, name); |
256 | #endif | 265 | #endif |
257 | } | 266 | } |
258 | 267 | ||
259 | #ifdef USE_7ZIP_32_DLL | 268 | #ifdef USE_7ZIP_32_DLL |
@@ -278,13 +287,18 @@ static LONG MyRegistry_OpenKey_ReadWrite_32(HKEY parentKey, LPCWSTR name, HKEY * | |||
278 | 287 | ||
279 | static LONG MyRegistry_DeleteKey_32(HKEY parentKey, LPCWSTR name) | 288 | static LONG MyRegistry_DeleteKey_32(HKEY parentKey, LPCWSTR name) |
280 | { | 289 | { |
281 | #if k_Reg_WOW_Flag_32 != 0 | 290 | #if k_Reg_WOW_Flag_32 != 0 |
282 | if (func_RegDeleteKeyExW) | 291 | #ifdef Z7_USE_DYN_RegDeleteKeyExW |
283 | return func_RegDeleteKeyExW(parentKey, name, k_Reg_WOW_Flag_32, 0); | 292 | if (!func_RegDeleteKeyExW) |
284 | return E_FAIL; | 293 | return E_FAIL; |
285 | #else | 294 | return func_RegDeleteKeyExW |
295 | #else | ||
296 | return RegDeleteKeyExW | ||
297 | #endif | ||
298 | (parentKey, name, k_Reg_WOW_Flag_32, 0); | ||
299 | #else | ||
286 | return RegDeleteKeyW(parentKey, name); | 300 | return RegDeleteKeyW(parentKey, name); |
287 | #endif | 301 | #endif |
288 | } | 302 | } |
289 | 303 | ||
290 | #endif | 304 | #endif |
@@ -930,14 +944,17 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, | |||
930 | UNUSED_VAR(lpCmdLine) | 944 | UNUSED_VAR(lpCmdLine) |
931 | UNUSED_VAR(nCmdShow) | 945 | UNUSED_VAR(nCmdShow) |
932 | 946 | ||
933 | #ifndef UNDER_CE | 947 | #ifndef UNDER_CE |
934 | CoInitialize(NULL); | 948 | CoInitialize(NULL); |
935 | #endif | 949 | #endif |
936 | 950 | ||
937 | #ifndef UNDER_CE | 951 | #ifndef UNDER_CE |
938 | func_RegDeleteKeyExW = (Func_RegDeleteKeyExW) MY_CAST_FUNC | 952 | #ifdef Z7_USE_DYN_RegDeleteKeyExW |
939 | GetProcAddress(GetModuleHandleW(L"advapi32.dll"), "RegDeleteKeyExW"); | 953 | func_RegDeleteKeyExW = |
940 | #endif | 954 | (Func_RegDeleteKeyExW) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandleW(L"advapi32.dll"), |
955 | "RegDeleteKeyExW"); | ||
956 | #endif | ||
957 | #endif | ||
941 | 958 | ||
942 | { | 959 | { |
943 | const wchar_t *s = GetCommandLineW(); | 960 | const wchar_t *s = GetCommandLineW(); |
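The uninstaller hunks follow the same cast-macro cleanup and add a compile-time gate: when the build targets at least win2003 x64 (per Z7_WIN32_WINNT_MIN), RegDeleteKeyExW is called directly; otherwise it is resolved at run time, because older systems do not export it from advapi32. A simplified sketch of that gate (the DEMO_ macros stand in for the Z7_ ones, and the version test is reduced to the common case):

#include <windows.h>

#if !defined(DEMO_WINNT_MIN) || DEMO_WINNT_MIN < 0x0502
  #define DEMO_DYN_DELETE    // must load RegDeleteKeyExW dynamically
#endif

static LONG Demo_DeleteKey64(HKEY parent, LPCWSTR name)
{
#ifdef DEMO_DYN_DELETE
  typedef LONG (APIENTRY *Func_RegDeleteKeyExW)(HKEY, LPCWSTR, REGSAM, DWORD);
  const Func_RegDeleteKeyExW f = (Func_RegDeleteKeyExW)(void *)
      GetProcAddress(GetModuleHandleW(L"advapi32.dll"), "RegDeleteKeyExW");
  if (!f)
    return E_FAIL;  // as in the patch: fail if the export is absent
  return f(parent, name, KEY_WOW64_64KEY, 0);
#else
  return RegDeleteKeyExW(parent, name, KEY_WOW64_64KEY, 0);
#endif
}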
diff --git a/C/Util/7zipUninstall/Precomp.h b/C/Util/7zipUninstall/Precomp.h index bc8fa21..13a41ef 100644 --- a/C/Util/7zipUninstall/Precomp.h +++ b/C/Util/7zipUninstall/Precomp.h | |||
@@ -1,14 +1,13 @@ | |||
1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- Precomp |
2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-23 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | // #ifndef ZIP7_INC_PRECOMP_LOC_H |
5 | #define ZIP7_INC_PRECOMP_H | 5 | // #define ZIP7_INC_PRECOMP_LOC_H |
6 | 6 | ||
7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 | 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 |
8 | #pragma warning(disable : 4464) // relative include path contains '..' | 8 | #pragma warning(disable : 4464) // relative include path contains '..' |
9 | #endif | 9 | #endif |
10 | 10 | ||
11 | #include "../../Compiler.h" | 11 | #include "../../Precomp.h" |
12 | #include "../../7zTypes.h" | ||
13 | 12 | ||
14 | #endif | 13 | // #endif |
diff --git a/C/Util/7zipUninstall/resource.rc b/C/Util/7zipUninstall/resource.rc index 00bdcc0..79400c6 100644 --- a/C/Util/7zipUninstall/resource.rc +++ b/C/Util/7zipUninstall/resource.rc | |||
@@ -1,5 +1,6 @@ | |||
1 | #include <winnt.h> | 1 | #include <windows.h> |
2 | #include <WinUser.h> | 2 | // #include <winnt.h> |
3 | // #include <WinUser.h> | ||
3 | #include <CommCtrl.h> | 4 | #include <CommCtrl.h> |
4 | 5 | ||
5 | #define USE_COPYRIGHT_CR | 6 | #define USE_COPYRIGHT_CR |
diff --git a/C/Util/Lzma/Precomp.h b/C/Util/Lzma/Precomp.h index bc8fa21..13a41ef 100644 --- a/C/Util/Lzma/Precomp.h +++ b/C/Util/Lzma/Precomp.h | |||
@@ -1,14 +1,13 @@ | |||
1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- Precomp |
2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-23 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | // #ifndef ZIP7_INC_PRECOMP_LOC_H |
5 | #define ZIP7_INC_PRECOMP_H | 5 | // #define ZIP7_INC_PRECOMP_LOC_H |
6 | 6 | ||
7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 | 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 |
8 | #pragma warning(disable : 4464) // relative include path contains '..' | 8 | #pragma warning(disable : 4464) // relative include path contains '..' |
9 | #endif | 9 | #endif |
10 | 10 | ||
11 | #include "../../Compiler.h" | 11 | #include "../../Precomp.h" |
12 | #include "../../7zTypes.h" | ||
13 | 12 | ||
14 | #endif | 13 | // #endif |
diff --git a/C/Util/LzmaLib/Precomp.h b/C/Util/LzmaLib/Precomp.h index bc8fa21..13a41ef 100644 --- a/C/Util/LzmaLib/Precomp.h +++ b/C/Util/LzmaLib/Precomp.h | |||
@@ -1,14 +1,13 @@ | |||
1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- Precomp |
2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-23 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | // #ifndef ZIP7_INC_PRECOMP_LOC_H |
5 | #define ZIP7_INC_PRECOMP_H | 5 | // #define ZIP7_INC_PRECOMP_LOC_H |
6 | 6 | ||
7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 | 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 |
8 | #pragma warning(disable : 4464) // relative include path contains '..' | 8 | #pragma warning(disable : 4464) // relative include path contains '..' |
9 | #endif | 9 | #endif |
10 | 10 | ||
11 | #include "../../Compiler.h" | 11 | #include "../../Precomp.h" |
12 | #include "../../7zTypes.h" | ||
13 | 12 | ||
14 | #endif | 13 | // #endif |
diff --git a/C/Util/LzmaLib/makefile b/C/Util/LzmaLib/makefile index b8e054e..9ed0aa4 100644 --- a/C/Util/LzmaLib/makefile +++ b/C/Util/LzmaLib/makefile | |||
@@ -14,16 +14,19 @@ C_OBJS = \ | |||
14 | $O\CpuArch.obj \ | 14 | $O\CpuArch.obj \ |
15 | $O\LzFind.obj \ | 15 | $O\LzFind.obj \ |
16 | $O\LzFindMt.obj \ | 16 | $O\LzFindMt.obj \ |
17 | $O\LzFindOpt.obj \ | ||
18 | $O\LzmaDec.obj \ | 17 | $O\LzmaDec.obj \ |
19 | $O\LzmaEnc.obj \ | 18 | $O\LzmaEnc.obj \ |
20 | $O\LzmaLib.obj \ | 19 | $O\LzmaLib.obj \ |
21 | $O\Threads.obj \ | 20 | $O\Threads.obj \ |
22 | 21 | ||
22 | !include "../../../CPP/7zip/LzFindOpt.mak" | ||
23 | !include "../../../CPP/7zip/LzmaDec.mak" | ||
24 | |||
23 | OBJS = \ | 25 | OBJS = \ |
24 | $O\Precomp.obj \ | 26 | $O\Precomp.obj \ |
25 | $(LIB_OBJS) \ | 27 | $(LIB_OBJS) \ |
26 | $(C_OBJS) \ | 28 | $(C_OBJS) \ |
29 | $(ASM_OBJS) \ | ||
27 | $O\resource.res | 30 | $O\resource.res |
28 | 31 | ||
29 | !include "../../../CPP/Build.mak" | 32 | !include "../../../CPP/Build.mak" |
@@ -52,3 +55,5 @@ $(C_OBJS): ../../$(*B).c | |||
52 | $(CCOMPLB_USE) | 55 | $(CCOMPLB_USE) |
53 | 56 | ||
54 | !ENDIF | 57 | !ENDIF |
58 | |||
59 | !include "../../Asm_c.mak" | ||
diff --git a/C/Util/SfxSetup/Precomp.h b/C/Util/SfxSetup/Precomp.h index bc8fa21..13a41ef 100644 --- a/C/Util/SfxSetup/Precomp.h +++ b/C/Util/SfxSetup/Precomp.h | |||
@@ -1,14 +1,13 @@ | |||
1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- Precomp |
2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-23 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | // #ifndef ZIP7_INC_PRECOMP_LOC_H |
5 | #define ZIP7_INC_PRECOMP_H | 5 | // #define ZIP7_INC_PRECOMP_LOC_H |
6 | 6 | ||
7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 | 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 |
8 | #pragma warning(disable : 4464) // relative include path contains '..' | 8 | #pragma warning(disable : 4464) // relative include path contains '..' |
9 | #endif | 9 | #endif |
10 | 10 | ||
11 | #include "../../Compiler.h" | 11 | #include "../../Precomp.h" |
12 | #include "../../7zTypes.h" | ||
13 | 12 | ||
14 | #endif | 13 | // #endif |
diff --git a/C/Util/SfxSetup/SfxSetup.c b/C/Util/SfxSetup/SfxSetup.c index 7304a0b..9b5c1f9 100644 --- a/C/Util/SfxSetup/SfxSetup.c +++ b/C/Util/SfxSetup/SfxSetup.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* SfxSetup.c - 7z SFX Setup | 1 | /* SfxSetup.c - 7z SFX Setup |
2 | 2019-02-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-24 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -278,10 +278,10 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, | |||
278 | #ifdef _CONSOLE | 278 | #ifdef _CONSOLE |
279 | SetConsoleCtrlHandler(HandlerRoutine, TRUE); | 279 | SetConsoleCtrlHandler(HandlerRoutine, TRUE); |
280 | #else | 280 | #else |
281 | UNUSED_VAR(hInstance); | 281 | UNUSED_VAR(hInstance) |
282 | UNUSED_VAR(hPrevInstance); | 282 | UNUSED_VAR(hPrevInstance) |
283 | UNUSED_VAR(lpCmdLine); | 283 | UNUSED_VAR(lpCmdLine) |
284 | UNUSED_VAR(nCmdShow); | 284 | UNUSED_VAR(nCmdShow) |
285 | #endif | 285 | #endif |
286 | 286 | ||
287 | CrcGenerateTable(); | 287 | CrcGenerateTable(); |
@@ -516,12 +516,13 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, | |||
516 | #endif | 516 | #endif |
517 | 517 | ||
518 | { | 518 | { |
519 | const SRes res2 = File_Close(&outFile); | 519 | const WRes res2 = File_Close(&outFile); |
520 | if (res != SZ_OK) | 520 | if (res != SZ_OK) |
521 | break; | 521 | break; |
522 | if (res2 != SZ_OK) | 522 | if (res2 != 0) |
523 | { | 523 | { |
524 | res = res2; | 524 | errorMessage = "Can't close output file"; |
525 | res = SZ_ERROR_FAIL; | ||
525 | break; | 526 | break; |
526 | } | 527 | } |
527 | } | 528 | } |
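The SfxSetup.c change fixes a type mix-up: File_Close returns a WRes (a Windows/errno-style code), not an SRes, so its result can no longer be stored into res directly; a failed close now maps to SZ_ERROR_FAIL with its own message, while an earlier extraction error keeps priority. The same pattern in portable C, as a sketch:

#include <stdio.h>

// close f, but let an earlier error (res != 0) win over a close failure
static int Demo_CloseKeepFirstError(FILE *f, int res)
{
  const int res2 = (fclose(f) == 0) ? 0 : -1;
  if (res != 0)
    return res;   // the earlier error has priority
  return res2;    // otherwise report the close result
}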
diff --git a/C/Util/SfxSetup/makefile b/C/Util/SfxSetup/makefile index bc0cf8b..b3f25a2 100644 --- a/C/Util/SfxSetup/makefile +++ b/C/Util/SfxSetup/makefile | |||
@@ -9,8 +9,6 @@ C_OBJS = \ | |||
9 | $O\7zArcIn.obj \ | 9 | $O\7zArcIn.obj \ |
10 | $O\7zBuf.obj \ | 10 | $O\7zBuf.obj \ |
11 | $O\7zBuf2.obj \ | 11 | $O\7zBuf2.obj \ |
12 | $O\7zCrc.obj \ | ||
13 | $O\7zCrcOpt.obj \ | ||
14 | $O\7zFile.obj \ | 12 | $O\7zFile.obj \ |
15 | $O\7zDec.obj \ | 13 | $O\7zDec.obj \ |
16 | $O\7zStream.obj \ | 14 | $O\7zStream.obj \ |
@@ -27,9 +25,13 @@ C_OBJS = \ | |||
27 | 7Z_OBJS = \ | 25 | 7Z_OBJS = \ |
28 | $O\SfxSetup.obj \ | 26 | $O\SfxSetup.obj \ |
29 | 27 | ||
28 | !include "../../../CPP/7zip/Crc.mak" | ||
29 | # !include "../../../CPP/7zip/LzmaDec.mak" | ||
30 | |||
30 | OBJS = \ | 31 | OBJS = \ |
31 | $(7Z_OBJS) \ | 32 | $(7Z_OBJS) \ |
32 | $(C_OBJS) \ | 33 | $(C_OBJS) \ |
34 | $(ASM_OBJS) \ | ||
33 | $O\resource.res | 35 | $O\resource.res |
34 | 36 | ||
35 | !include "../../../CPP/Build.mak" | 37 | !include "../../../CPP/Build.mak" |
@@ -38,3 +40,5 @@ $(7Z_OBJS): $(*B).c | |||
38 | $(COMPL_O1) | 40 | $(COMPL_O1) |
39 | $(C_OBJS): ../../$(*B).c | 41 | $(C_OBJS): ../../$(*B).c |
40 | $(COMPL_O1) | 42 | $(COMPL_O1) |
43 | |||
44 | !include "../../Asm_c.mak" | ||
diff --git a/C/Xxh64.c b/C/Xxh64.c new file mode 100644 index 0000000..dc02a02 --- /dev/null +++ b/C/Xxh64.c | |||
@@ -0,0 +1,327 @@ | |||
1 | /* Xxh64.c -- XXH64 hash calculation | ||
2 | original code: Copyright (c) Yann Collet. | ||
3 | 2023-08-18 : modified by Igor Pavlov. | ||
4 | This source code is licensed under BSD 2-Clause License. | ||
5 | */ | ||
6 | |||
7 | #include "Precomp.h" | ||
8 | |||
9 | #include "CpuArch.h" | ||
10 | #include "RotateDefs.h" | ||
11 | #include "Xxh64.h" | ||
12 | |||
13 | #define Z7_XXH_PRIME64_1 UINT64_CONST(0x9E3779B185EBCA87) | ||
14 | #define Z7_XXH_PRIME64_2 UINT64_CONST(0xC2B2AE3D27D4EB4F) | ||
15 | #define Z7_XXH_PRIME64_3 UINT64_CONST(0x165667B19E3779F9) | ||
16 | #define Z7_XXH_PRIME64_4 UINT64_CONST(0x85EBCA77C2B2AE63) | ||
17 | #define Z7_XXH_PRIME64_5 UINT64_CONST(0x27D4EB2F165667C5) | ||
18 | |||
19 | void Xxh64State_Init(CXxh64State *p) | ||
20 | { | ||
21 | const UInt64 seed = 0; | ||
22 | p->v[0] = seed + Z7_XXH_PRIME64_1 + Z7_XXH_PRIME64_2; | ||
23 | p->v[1] = seed + Z7_XXH_PRIME64_2; | ||
24 | p->v[2] = seed; | ||
25 | p->v[3] = seed - Z7_XXH_PRIME64_1; | ||
26 | } | ||
27 | |||
28 | #if !defined(MY_CPU_64BIT) && defined(MY_CPU_X86) && defined(_MSC_VER) | ||
29 | #define Z7_XXH64_USE_ASM | ||
30 | #endif | ||
31 | |||
32 | #if !defined(MY_CPU_64BIT) && defined(MY_CPU_X86) \ | ||
33 | && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1200 | ||
34 | /* we try to avoid __allmul calls in MSVC for 64-bit multiply. | ||
35 | But MSVC6 still uses __allmul for our code. | ||
36 | So for MSVC6 we use default 64-bit multiply without our optimization. | ||
37 | */ | ||
38 | #define LOW32(b) ((UInt32)(b & 0xffffffff)) | ||
39 | /* MSVC compiler (MSVC > 1200) can use "mul" instruction | ||
40 | without __allmul for our MY_emulu MACRO. | ||
41 | MY_emulu is similar to __emulu(a, b) MACRO */ | ||
42 | #define MY_emulu(a, b) ((UInt64)(a) * (b)) | ||
43 | #define MY_SET_HIGH32(a) ((UInt64)(a) << 32) | ||
44 | #define MY_MUL32_SET_HIGH32(a, b) MY_SET_HIGH32((UInt32)(a) * (UInt32)(b)) | ||
45 | // /* | ||
46 | #define MY_MUL64(a, b) \ | ||
47 | ( MY_emulu((UInt32)(a), LOW32(b)) + \ | ||
48 | MY_SET_HIGH32( \ | ||
49 | (UInt32)((a) >> 32) * LOW32(b) + \ | ||
50 | (UInt32)(a) * (UInt32)((b) >> 32) \ | ||
51 | )) | ||
52 | // */ | ||
53 | /* | ||
54 | #define MY_MUL64(a, b) \ | ||
55 | ( MY_emulu((UInt32)(a), LOW32(b)) \ | ||
56 | + MY_MUL32_SET_HIGH32((a) >> 32, LOW32(b)) + \ | ||
57 | + MY_MUL32_SET_HIGH32(a, (b) >> 32) \ | ||
58 | ) | ||
59 | */ | ||
60 | |||
61 | #define MY_MUL_32_64(a32, b) \ | ||
62 | ( MY_emulu((UInt32)(a32), LOW32(b)) \ | ||
63 | + MY_MUL32_SET_HIGH32(a32, (b) >> 32) \ | ||
64 | ) | ||
65 | |||
66 | #else | ||
67 | #define MY_MUL64(a, b) ((a) * (b)) | ||
68 | #define MY_MUL_32_64(a32, b) ((a32) * (UInt64)(b)) | ||
69 | #endif | ||
70 | |||
71 | |||
72 | static | ||
73 | Z7_FORCE_INLINE | ||
74 | UInt64 Xxh64_Round(UInt64 acc, UInt64 input) | ||
75 | { | ||
76 | acc += MY_MUL64(input, Z7_XXH_PRIME64_2); | ||
77 | acc = Z7_ROTL64(acc, 31); | ||
78 | return MY_MUL64(acc, Z7_XXH_PRIME64_1); | ||
79 | } | ||
80 | |||
81 | static UInt64 Xxh64_Merge(UInt64 acc, UInt64 val) | ||
82 | { | ||
83 | acc ^= Xxh64_Round(0, val); | ||
84 | return MY_MUL64(acc, Z7_XXH_PRIME64_1) + Z7_XXH_PRIME64_4; | ||
85 | } | ||
86 | |||
87 | |||
88 | #ifdef Z7_XXH64_USE_ASM | ||
89 | |||
90 | #define Z7_XXH_PRIME64_1_HIGH 0x9E3779B1 | ||
91 | #define Z7_XXH_PRIME64_1_LOW 0x85EBCA87 | ||
92 | #define Z7_XXH_PRIME64_2_HIGH 0xC2B2AE3D | ||
93 | #define Z7_XXH_PRIME64_2_LOW 0x27D4EB4F | ||
94 | |||
95 | void | ||
96 | Z7_NO_INLINE | ||
97 | __declspec(naked) | ||
98 | Z7_FASTCALL | ||
99 | Xxh64State_UpdateBlocks(CXxh64State *p, const void *data, const void *end) | ||
100 | { | ||
101 | #if !defined(__clang__) | ||
102 | UNUSED_VAR(p) | ||
103 | UNUSED_VAR(data) | ||
104 | UNUSED_VAR(end) | ||
105 | #endif | ||
106 | __asm push ebx | ||
107 | __asm push ebp | ||
108 | __asm push esi | ||
109 | __asm push edi | ||
110 | |||
111 | #define STACK_OFFSET 4 * 8 | ||
112 | __asm sub esp, STACK_OFFSET | ||
113 | |||
114 | #define COPY_1(n) \ | ||
115 | __asm mov eax, [ecx + n * 4] \ | ||
116 | __asm mov [esp + n * 4], eax \ | ||
117 | |||
118 | #define COPY_2(n) \ | ||
119 | __asm mov eax, [esp + n * 4] \ | ||
120 | __asm mov [ecx + n * 4], eax \ | ||
121 | |||
122 | COPY_1(0) | ||
123 | __asm mov edi, [ecx + 1 * 4] \ | ||
124 | COPY_1(2) | ||
125 | COPY_1(3) | ||
126 | COPY_1(4) | ||
127 | COPY_1(5) | ||
128 | COPY_1(6) | ||
129 | COPY_1(7) | ||
130 | |||
131 | __asm mov esi, edx \ | ||
132 | __asm mov [esp + 0 * 8 + 4], ecx | ||
133 | __asm mov ecx, Z7_XXH_PRIME64_2_LOW \ | ||
134 | __asm mov ebp, Z7_XXH_PRIME64_1_LOW \ | ||
135 | |||
136 | #define R(n, state1, state1_reg) \ | ||
137 | __asm mov eax, [esi + n * 8] \ | ||
138 | __asm imul ebx, eax, Z7_XXH_PRIME64_2_HIGH \ | ||
139 | __asm add ebx, state1 \ | ||
140 | __asm mul ecx \ | ||
141 | __asm add edx, ebx \ | ||
142 | __asm mov ebx, [esi + n * 8 + 4] \ | ||
143 | __asm imul ebx, ecx \ | ||
144 | __asm add eax, [esp + n * 8] \ | ||
145 | __asm adc edx, ebx \ | ||
146 | __asm mov ebx, eax \ | ||
147 | __asm shld eax, edx, 31 \ | ||
148 | __asm shld edx, ebx, 31 \ | ||
149 | __asm imul state1_reg, eax, Z7_XXH_PRIME64_1_HIGH \ | ||
150 | __asm imul edx, ebp \ | ||
151 | __asm add state1_reg, edx \ | ||
152 | __asm mul ebp \ | ||
153 | __asm add state1_reg, edx \ | ||
154 | __asm mov [esp + n * 8], eax \ | ||
155 | |||
156 | #define R2(n) \ | ||
157 | R(n, [esp + n * 8 + 4], ebx) \ | ||
158 | __asm mov [esp + n * 8 + 4], ebx \ | ||
159 | |||
160 | __asm align 16 | ||
161 | __asm main_loop: | ||
162 | R(0, edi, edi) | ||
163 | R2(1) | ||
164 | R2(2) | ||
165 | R2(3) | ||
166 | __asm add esi, 32 | ||
167 | __asm cmp esi, [esp + STACK_OFFSET + 4 * 4 + 4] | ||
168 | __asm jne main_loop | ||
169 | |||
170 | __asm mov ecx, [esp + 0 * 8 + 4] | ||
171 | |||
172 | COPY_2(0) | ||
173 | __asm mov [ecx + 1 * 4], edi | ||
174 | COPY_2(2) | ||
175 | COPY_2(3) | ||
176 | COPY_2(4) | ||
177 | COPY_2(5) | ||
178 | COPY_2(6) | ||
179 | COPY_2(7) | ||
180 | |||
181 | __asm add esp, STACK_OFFSET | ||
182 | __asm pop edi | ||
183 | __asm pop esi | ||
184 | __asm pop ebp | ||
185 | __asm pop ebx | ||
186 | __asm ret 4 | ||
187 | } | ||
188 | |||
189 | #else | ||
190 | |||
191 | void | ||
192 | Z7_NO_INLINE | ||
193 | Z7_FASTCALL | ||
194 | Xxh64State_UpdateBlocks(CXxh64State *p, const void *_data, const void *end) | ||
195 | { | ||
196 | const Byte *data = (const Byte *)_data; | ||
197 | UInt64 v[4]; | ||
198 | v[0] = p->v[0]; | ||
199 | v[1] = p->v[1]; | ||
200 | v[2] = p->v[2]; | ||
201 | v[3] = p->v[3]; | ||
202 | do | ||
203 | { | ||
204 | v[0] = Xxh64_Round(v[0], GetUi64(data)); data += 8; | ||
205 | v[1] = Xxh64_Round(v[1], GetUi64(data)); data += 8; | ||
206 | v[2] = Xxh64_Round(v[2], GetUi64(data)); data += 8; | ||
207 | v[3] = Xxh64_Round(v[3], GetUi64(data)); data += 8; | ||
208 | } | ||
209 | while (data != end); | ||
210 | p->v[0] = v[0]; | ||
211 | p->v[1] = v[1]; | ||
212 | p->v[2] = v[2]; | ||
213 | p->v[3] = v[3]; | ||
214 | } | ||
215 | |||
216 | #endif | ||
217 | |||
218 | UInt64 Xxh64State_Digest(const CXxh64State *p, const void *_data, UInt64 count) | ||
219 | { | ||
220 | UInt64 h = p->v[2]; | ||
221 | |||
222 | if (count >= 32) | ||
223 | { | ||
224 | h = Z7_ROTL64(p->v[0], 1) + | ||
225 | Z7_ROTL64(p->v[1], 7) + | ||
226 | Z7_ROTL64(h, 12) + | ||
227 | Z7_ROTL64(p->v[3], 18); | ||
228 | h = Xxh64_Merge(h, p->v[0]); | ||
229 | h = Xxh64_Merge(h, p->v[1]); | ||
230 | h = Xxh64_Merge(h, p->v[2]); | ||
231 | h = Xxh64_Merge(h, p->v[3]); | ||
232 | } | ||
233 | else | ||
234 | h += Z7_XXH_PRIME64_5; | ||
235 | |||
236 | h += count; | ||
237 | |||
238 | // XXH64_finalize(): | ||
239 | { | ||
240 | unsigned cnt = (unsigned)count & 31; | ||
241 | const Byte *data = (const Byte *)_data; | ||
242 | while (cnt >= 8) | ||
243 | { | ||
244 | h ^= Xxh64_Round(0, GetUi64(data)); | ||
245 | data += 8; | ||
246 | h = Z7_ROTL64(h, 27); | ||
247 | h = MY_MUL64(h, Z7_XXH_PRIME64_1) + Z7_XXH_PRIME64_4; | ||
248 | cnt -= 8; | ||
249 | } | ||
250 | if (cnt >= 4) | ||
251 | { | ||
252 | const UInt32 v = GetUi32(data); | ||
253 | data += 4; | ||
254 | h ^= MY_MUL_32_64(v, Z7_XXH_PRIME64_1); | ||
255 | h = Z7_ROTL64(h, 23); | ||
256 | h = MY_MUL64(h, Z7_XXH_PRIME64_2) + Z7_XXH_PRIME64_3; | ||
257 | cnt -= 4; | ||
258 | } | ||
259 | while (cnt) | ||
260 | { | ||
261 | const UInt32 v = *data++; | ||
262 | h ^= MY_MUL_32_64(v, Z7_XXH_PRIME64_5); | ||
263 | h = Z7_ROTL64(h, 11); | ||
264 | h = MY_MUL64(h, Z7_XXH_PRIME64_1); | ||
265 | cnt--; | ||
266 | } | ||
267 | // XXH64_avalanche(h): | ||
268 | h ^= h >> 33; h = MY_MUL64(h, Z7_XXH_PRIME64_2); | ||
269 | h ^= h >> 29; h = MY_MUL64(h, Z7_XXH_PRIME64_3); | ||
270 | h ^= h >> 32; | ||
271 | return h; | ||
272 | } | ||
273 | } | ||
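Note why the digest starts from p->v[2]: with the standard XXH64 lane initialization (assumed here; Xxh64State_Init itself is not in this hunk), v[2] holds the seed, so for inputs shorter than one 32-byte block the else-branch reduces to the spec's h = seed + PRIME64_5. A sketch of that assumed initialization:

    /* assumed lane init, matching the XXH64 specification;
       Z7_XXH_PRIME64_* constants are taken from Xxh64.c */
    void Xxh64State_Init_sketch(UInt64 v[4], UInt64 seed)
    {
      v[0] = seed + Z7_XXH_PRIME64_1 + Z7_XXH_PRIME64_2;
      v[1] = seed + Z7_XXH_PRIME64_2;
      v[2] = seed;                 /* Xxh64State_Digest() reads h from this lane */
      v[3] = seed - Z7_XXH_PRIME64_1;
    }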
274 | |||
275 | |||
276 | void Xxh64_Init(CXxh64 *p) | ||
277 | { | ||
278 | Xxh64State_Init(&p->state); | ||
279 | p->count = 0; | ||
280 | p->buf64[0] = 0; | ||
281 | p->buf64[1] = 0; | ||
282 | p->buf64[2] = 0; | ||
283 | p->buf64[3] = 0; | ||
284 | } | ||
285 | |||
286 | void Xxh64_Update(CXxh64 *p, const void *_data, size_t size) | ||
287 | { | ||
288 | const Byte *data = (const Byte *)_data; | ||
289 | unsigned cnt; | ||
290 | if (size == 0) | ||
291 | return; | ||
292 | cnt = (unsigned)p->count; | ||
293 | p->count += size; | ||
294 | |||
295 | if (cnt &= 31) | ||
296 | { | ||
297 | unsigned rem = 32 - cnt; | ||
298 | Byte *dest = (Byte *)p->buf64 + cnt; | ||
299 | if (rem > size) | ||
300 | rem = (unsigned)size; | ||
301 | size -= rem; | ||
302 | cnt += rem; | ||
303 | // memcpy((Byte *)p->buf64 + cnt, data, rem); | ||
304 | do | ||
305 | *dest++ = *data++; | ||
306 | while (--rem); | ||
307 | if (cnt != 32) | ||
308 | return; | ||
309 | Xxh64State_UpdateBlocks(&p->state, p->buf64, &p->buf64[4]); | ||
310 | } | ||
311 | |||
312 | if (size &= ~(size_t)31) | ||
313 | { | ||
314 | Xxh64State_UpdateBlocks(&p->state, data, data + size); | ||
315 | data += size; | ||
316 | } | ||
317 | |||
318 | cnt = (unsigned)p->count & 31; | ||
319 | if (cnt) | ||
320 | { | ||
321 | // memcpy(p->buf64, data, cnt); | ||
322 | Byte *dest = (Byte *)p->buf64; | ||
323 | do | ||
324 | *dest++ = *data++; | ||
325 | while (--cnt); | ||
326 | } | ||
327 | } | ||
diff --git a/C/Xxh64.h b/C/Xxh64.h new file mode 100644 index 0000000..efef65e --- /dev/null +++ b/C/Xxh64.h | |||
@@ -0,0 +1,50 @@ | |||
1 | /* Xxh64.h -- XXH64 hash calculation interfaces | ||
2 | 2023-08-18 : Igor Pavlov : Public domain */ | ||
3 | |||
4 | #ifndef ZIP7_INC_XXH64_H | ||
5 | #define ZIP7_INC_XXH64_H | ||
6 | |||
7 | #include "7zTypes.h" | ||
8 | |||
9 | EXTERN_C_BEGIN | ||
10 | |||
11 | #define Z7_XXH64_BLOCK_SIZE (4 * 8) | ||
12 | |||
13 | typedef struct | ||
14 | { | ||
15 | UInt64 v[4]; | ||
16 | } CXxh64State; | ||
17 | |||
18 | void Xxh64State_Init(CXxh64State *p); | ||
19 | |||
20 | // end != data && end == data + Z7_XXH64_BLOCK_SIZE * numBlocks | ||
21 | void Z7_FASTCALL Xxh64State_UpdateBlocks(CXxh64State *p, const void *data, const void *end); | ||
22 | |||
23 | /* | ||
24 | Xxh64State_Digest(): | ||
25 | data: | ||
26 | the function processes only | ||
27 | (totalCount & (Z7_XXH64_BLOCK_SIZE - 1)) bytes in (data), i.e. fewer than 32 bytes. | ||
28 | totalCount: total size of the hashed stream: | ||
29 | it includes the data processed by previous Xxh64State_UpdateBlocks() calls, | ||
30 | plus the bytes currently being processed in (data). | ||
31 | */ | ||
32 | UInt64 Xxh64State_Digest(const CXxh64State *p, const void *data, UInt64 totalCount); | ||
33 | |||
34 | |||
35 | typedef struct | ||
36 | { | ||
37 | CXxh64State state; | ||
38 | UInt64 count; | ||
39 | UInt64 buf64[4]; | ||
40 | } CXxh64; | ||
41 | |||
42 | void Xxh64_Init(CXxh64 *p); | ||
43 | void Xxh64_Update(CXxh64 *p, const void *data, size_t size); | ||
44 | |||
45 | #define Xxh64_Digest(p) \ | ||
46 | Xxh64State_Digest(&(p)->state, (p)->buf64, (p)->count) | ||
47 | |||
48 | EXTERN_C_END | ||
49 | |||
50 | #endif | ||
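A usage sketch for the buffered API declared above (hypothetical caller, assuming only the declarations in this header): the streaming digest must match a single-shot update over the same bytes.

    #include "Xxh64.h"

    UInt64 HashInTwoChunks(const Byte *buf, size_t size)
    {
      CXxh64 h;
      const size_t half = size / 2;
      Xxh64_Init(&h);
      Xxh64_Update(&h, buf, half);       /* arbitrary split; any chunking works */
      Xxh64_Update(&h, buf + half, size - half);
      return Xxh64_Digest(&h);           /* equals one Xxh64_Update(buf, size) */
    }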
@@ -1,5 +1,5 @@ | |||
1 | /* Xz.c - Xz | 1 | /* Xz.c - Xz |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -52,6 +52,7 @@ void XzCheck_Init(CXzCheck *p, unsigned mode) | |||
52 | case XZ_CHECK_CRC32: p->crc = CRC_INIT_VAL; break; | 52 | case XZ_CHECK_CRC32: p->crc = CRC_INIT_VAL; break; |
53 | case XZ_CHECK_CRC64: p->crc64 = CRC64_INIT_VAL; break; | 53 | case XZ_CHECK_CRC64: p->crc64 = CRC64_INIT_VAL; break; |
54 | case XZ_CHECK_SHA256: Sha256_Init(&p->sha); break; | 54 | case XZ_CHECK_SHA256: Sha256_Init(&p->sha); break; |
55 | default: break; | ||
55 | } | 56 | } |
56 | } | 57 | } |
57 | 58 | ||
@@ -62,6 +63,7 @@ void XzCheck_Update(CXzCheck *p, const void *data, size_t size) | |||
62 | case XZ_CHECK_CRC32: p->crc = CrcUpdate(p->crc, data, size); break; | 63 | case XZ_CHECK_CRC32: p->crc = CrcUpdate(p->crc, data, size); break; |
63 | case XZ_CHECK_CRC64: p->crc64 = Crc64Update(p->crc64, data, size); break; | 64 | case XZ_CHECK_CRC64: p->crc64 = Crc64Update(p->crc64, data, size); break; |
64 | case XZ_CHECK_SHA256: Sha256_Update(&p->sha, (const Byte *)data, size); break; | 65 | case XZ_CHECK_SHA256: Sha256_Update(&p->sha, (const Byte *)data, size); break; |
66 | default: break; | ||
65 | } | 67 | } |
66 | } | 68 | } |
67 | 69 | ||
@@ -1,5 +1,5 @@ | |||
1 | /* Xz.h - Xz interface | 1 | /* Xz.h - Xz interface |
2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | 2024-01-26 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_XZ_H | 4 | #ifndef ZIP7_INC_XZ_H |
5 | #define ZIP7_INC_XZ_H | 5 | #define ZIP7_INC_XZ_H |
@@ -18,6 +18,7 @@ EXTERN_C_BEGIN | |||
18 | #define XZ_ID_ARMT 8 | 18 | #define XZ_ID_ARMT 8 |
19 | #define XZ_ID_SPARC 9 | 19 | #define XZ_ID_SPARC 9 |
20 | #define XZ_ID_ARM64 0xa | 20 | #define XZ_ID_ARM64 0xa |
21 | #define XZ_ID_RISCV 0xb | ||
21 | #define XZ_ID_LZMA2 0x21 | 22 | #define XZ_ID_LZMA2 0x21 |
22 | 23 | ||
23 | unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value); | 24 | unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value); |
@@ -233,13 +234,13 @@ typedef enum | |||
233 | typedef struct | 234 | typedef struct |
234 | { | 235 | { |
235 | EXzState state; | 236 | EXzState state; |
236 | UInt32 pos; | 237 | unsigned pos; |
237 | unsigned alignPos; | 238 | unsigned alignPos; |
238 | unsigned indexPreSize; | 239 | unsigned indexPreSize; |
239 | 240 | ||
240 | CXzStreamFlags streamFlags; | 241 | CXzStreamFlags streamFlags; |
241 | 242 | ||
242 | UInt32 blockHeaderSize; | 243 | unsigned blockHeaderSize; |
243 | UInt64 packSize; | 244 | UInt64 packSize; |
244 | UInt64 unpackSize; | 245 | UInt64 unpackSize; |
245 | 246 | ||
diff --git a/C/XzCrc64.c b/C/XzCrc64.c index c2fad6c..94fc1af 100644 --- a/C/XzCrc64.c +++ b/C/XzCrc64.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* XzCrc64.c -- CRC64 calculation | 1 | /* XzCrc64.c -- CRC64 calculation |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2023-12-08 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -8,36 +8,76 @@ | |||
8 | 8 | ||
9 | #define kCrc64Poly UINT64_CONST(0xC96C5795D7870F42) | 9 | #define kCrc64Poly UINT64_CONST(0xC96C5795D7870F42) |
10 | 10 | ||
11 | #ifdef MY_CPU_LE | 11 | // for debug only : define Z7_CRC64_DEBUG_BE to test big-endian code in little-endian cpu |
12 | #define CRC64_NUM_TABLES 4 | 12 | // #define Z7_CRC64_DEBUG_BE |
13 | #ifdef Z7_CRC64_DEBUG_BE | ||
14 | #undef MY_CPU_LE | ||
15 | #define MY_CPU_BE | ||
16 | #endif | ||
17 | |||
18 | #ifdef Z7_CRC64_NUM_TABLES | ||
19 | #define Z7_CRC64_NUM_TABLES_USE Z7_CRC64_NUM_TABLES | ||
13 | #else | 20 | #else |
14 | #define CRC64_NUM_TABLES 5 | 21 | #define Z7_CRC64_NUM_TABLES_USE 12 |
22 | #endif | ||
15 | 23 | ||
16 | UInt64 Z7_FASTCALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table); | 24 | #if Z7_CRC64_NUM_TABLES_USE < 1 |
25 | #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES | ||
17 | #endif | 26 | #endif |
18 | 27 | ||
28 | |||
29 | #if Z7_CRC64_NUM_TABLES_USE != 1 | ||
30 | |||
19 | #ifndef MY_CPU_BE | 31 | #ifndef MY_CPU_BE |
20 | UInt64 Z7_FASTCALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table); | 32 | #define FUNC_NAME_LE_2(s) XzCrc64UpdateT ## s |
33 | #define FUNC_NAME_LE_1(s) FUNC_NAME_LE_2(s) | ||
34 | #define FUNC_NAME_LE FUNC_NAME_LE_1(Z7_CRC64_NUM_TABLES_USE) | ||
35 | UInt64 Z7_FASTCALL FUNC_NAME_LE (UInt64 v, const void *data, size_t size, const UInt64 *table); | ||
36 | #endif | ||
37 | #ifndef MY_CPU_LE | ||
38 | #define FUNC_NAME_BE_2(s) XzCrc64UpdateBeT ## s | ||
39 | #define FUNC_NAME_BE_1(s) FUNC_NAME_BE_2(s) | ||
40 | #define FUNC_NAME_BE FUNC_NAME_BE_1(Z7_CRC64_NUM_TABLES_USE) | ||
41 | UInt64 Z7_FASTCALL FUNC_NAME_BE (UInt64 v, const void *data, size_t size, const UInt64 *table); | ||
21 | #endif | 42 | #endif |
22 | 43 | ||
23 | typedef UInt64 (Z7_FASTCALL *CRC64_FUNC)(UInt64 v, const void *data, size_t size, const UInt64 *table); | 44 | #if defined(MY_CPU_LE) |
45 | #define FUNC_REF FUNC_NAME_LE | ||
46 | #elif defined(MY_CPU_BE) | ||
47 | #define FUNC_REF FUNC_NAME_BE | ||
48 | #else | ||
49 | #define FUNC_REF g_Crc64Update | ||
50 | static UInt64 (Z7_FASTCALL *FUNC_REF)(UInt64 v, const void *data, size_t size, const UInt64 *table); | ||
51 | #endif | ||
52 | |||
53 | #endif | ||
54 | |||
55 | |||
56 | MY_ALIGN(64) | ||
57 | static UInt64 g_Crc64Table[256 * Z7_CRC64_NUM_TABLES_USE]; | ||
24 | 58 | ||
25 | static CRC64_FUNC g_Crc64Update; | ||
26 | UInt64 g_Crc64Table[256 * CRC64_NUM_TABLES]; | ||
27 | 59 | ||
28 | UInt64 Z7_FASTCALL Crc64Update(UInt64 v, const void *data, size_t size) | 60 | UInt64 Z7_FASTCALL Crc64Update(UInt64 v, const void *data, size_t size) |
29 | { | 61 | { |
30 | return g_Crc64Update(v, data, size, g_Crc64Table); | 62 | #if Z7_CRC64_NUM_TABLES_USE == 1 |
63 | #define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) | ||
64 | const UInt64 *table = g_Crc64Table; | ||
65 | const Byte *p = (const Byte *)data; | ||
66 | const Byte *lim = p + size; | ||
67 | for (; p != lim; p++) | ||
68 | v = CRC64_UPDATE_BYTE_2(v, *p); | ||
69 | return v; | ||
70 | #undef CRC64_UPDATE_BYTE_2 | ||
71 | #else | ||
72 | return FUNC_REF (v, data, size, g_Crc64Table); | ||
73 | #endif | ||
31 | } | 74 | } |
32 | 75 | ||
33 | UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size) | ||
34 | { | ||
35 | return g_Crc64Update(CRC64_INIT_VAL, data, size, g_Crc64Table) ^ CRC64_INIT_VAL; | ||
36 | } | ||
37 | 76 | ||
77 | Z7_NO_INLINE | ||
38 | void Z7_FASTCALL Crc64GenerateTable(void) | 78 | void Z7_FASTCALL Crc64GenerateTable(void) |
39 | { | 79 | { |
40 | UInt32 i; | 80 | unsigned i; |
41 | for (i = 0; i < 256; i++) | 81 | for (i = 0; i < 256; i++) |
42 | { | 82 | { |
43 | UInt64 r = i; | 83 | UInt64 r = i; |
@@ -46,35 +86,55 @@ void Z7_FASTCALL Crc64GenerateTable(void) | |||
46 | r = (r >> 1) ^ (kCrc64Poly & ((UInt64)0 - (r & 1))); | 86 | r = (r >> 1) ^ (kCrc64Poly & ((UInt64)0 - (r & 1))); |
47 | g_Crc64Table[i] = r; | 87 | g_Crc64Table[i] = r; |
48 | } | 88 | } |
49 | for (i = 256; i < 256 * CRC64_NUM_TABLES; i++) | 89 | |
90 | #if Z7_CRC64_NUM_TABLES_USE != 1 | ||
91 | #if 1 || 1 && defined(MY_CPU_X86) // low register count | ||
92 | for (i = 0; i < 256 * (Z7_CRC64_NUM_TABLES_USE - 1); i++) | ||
50 | { | 93 | { |
51 | const UInt64 r = g_Crc64Table[(size_t)i - 256]; | 94 | const UInt64 r0 = g_Crc64Table[(size_t)i]; |
52 | g_Crc64Table[i] = g_Crc64Table[r & 0xFF] ^ (r >> 8); | 95 | g_Crc64Table[(size_t)i + 256] = g_Crc64Table[(Byte)r0] ^ (r0 >> 8); |
53 | } | 96 | } |
54 | 97 | #else | |
55 | #ifdef MY_CPU_LE | 98 | for (i = 0; i < 256 * (Z7_CRC64_NUM_TABLES_USE - 1); i += 2) |
56 | 99 | { | |
57 | g_Crc64Update = XzCrc64UpdateT4; | 100 | UInt64 r0 = g_Crc64Table[(size_t)(i) ]; |
101 | UInt64 r1 = g_Crc64Table[(size_t)(i) + 1]; | ||
102 | r0 = g_Crc64Table[(Byte)r0] ^ (r0 >> 8); | ||
103 | r1 = g_Crc64Table[(Byte)r1] ^ (r1 >> 8); | ||
104 | g_Crc64Table[(size_t)i + 256 ] = r0; | ||
105 | g_Crc64Table[(size_t)i + 256 + 1] = r1; | ||
106 | } | ||
107 | #endif | ||
58 | 108 | ||
59 | #else | 109 | #ifndef MY_CPU_LE |
60 | { | 110 | { |
61 | #ifndef MY_CPU_BE | 111 | #ifndef MY_CPU_BE |
62 | UInt32 k = 1; | 112 | UInt32 k = 1; |
63 | if (*(const Byte *)&k == 1) | 113 | if (*(const Byte *)&k == 1) |
64 | g_Crc64Update = XzCrc64UpdateT4; | 114 | FUNC_REF = FUNC_NAME_LE; |
65 | else | 115 | else |
66 | #endif | 116 | #endif |
67 | { | 117 | { |
68 | for (i = 256 * CRC64_NUM_TABLES - 1; i >= 256; i--) | 118 | #ifndef MY_CPU_BE |
119 | FUNC_REF = FUNC_NAME_BE; | ||
120 | #endif | ||
121 | for (i = 0; i < 256 * Z7_CRC64_NUM_TABLES_USE; i++) | ||
69 | { | 122 | { |
70 | const UInt64 x = g_Crc64Table[(size_t)i - 256]; | 123 | const UInt64 x = g_Crc64Table[i]; |
71 | g_Crc64Table[i] = Z7_BSWAP64(x); | 124 | g_Crc64Table[i] = Z7_BSWAP64(x); |
72 | } | 125 | } |
73 | g_Crc64Update = XzCrc64UpdateT1_BeT4; | ||
74 | } | 126 | } |
75 | } | 127 | } |
76 | #endif | 128 | #endif // ndef MY_CPU_LE |
129 | #endif // Z7_CRC64_NUM_TABLES_USE != 1 | ||
77 | } | 130 | } |
78 | 131 | ||
79 | #undef kCrc64Poly | 132 | #undef kCrc64Poly |
80 | #undef CRC64_NUM_TABLES | 133 | #undef Z7_CRC64_NUM_TABLES_USE |
134 | #undef FUNC_REF | ||
135 | #undef FUNC_NAME_LE_2 | ||
136 | #undef FUNC_NAME_LE_1 | ||
137 | #undef FUNC_NAME_LE | ||
138 | #undef FUNC_NAME_BE_2 | ||
139 | #undef FUNC_NAME_BE_1 | ||
140 | #undef FUNC_NAME_BE | ||
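For reference, the first 256 entries built by Crc64GenerateTable are the ordinary byte-at-a-time table for the reflected polynomial kCrc64Poly, and each higher table t satisfies T[t][i] = T[0][T[t-1][i] & 0xFF] ^ (T[t-1][i] >> 8), which is exactly what the second loop computes. A bitwise reference sketch equivalent to the Z7_CRC64_NUM_TABLES_USE == 1 path (folding in CRC64_INIT_VAL and the final xor):

    #include <stdint.h>
    #include <stddef.h>

    #define POLY64 0xC96C5795D7870F42ULL  /* kCrc64Poly from XzCrc64.c */

    uint64_t Crc64_Reference(const unsigned char *p, size_t n)
    {
      uint64_t crc = ~0ULL;               /* CRC64_INIT_VAL */
      while (n--)
      {
        unsigned k;
        crc ^= *p++;
        for (k = 0; k < 8; k++)           /* one table lookup = 8 bit steps */
          crc = (crc >> 1) ^ (POLY64 & (0 - (crc & 1)));
      }
      return ~crc;                        /* CRC64_GET_DIGEST */
    }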
diff --git a/C/XzCrc64.h b/C/XzCrc64.h index ca46869..04f8153 100644 --- a/C/XzCrc64.h +++ b/C/XzCrc64.h | |||
@@ -1,5 +1,5 @@ | |||
1 | /* XzCrc64.h -- CRC64 calculation | 1 | /* XzCrc64.h -- CRC64 calculation |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2023-12-08 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #ifndef ZIP7_INC_XZ_CRC64_H | 4 | #ifndef ZIP7_INC_XZ_CRC64_H |
5 | #define ZIP7_INC_XZ_CRC64_H | 5 | #define ZIP7_INC_XZ_CRC64_H |
@@ -10,16 +10,16 @@ | |||
10 | 10 | ||
11 | EXTERN_C_BEGIN | 11 | EXTERN_C_BEGIN |
12 | 12 | ||
13 | extern UInt64 g_Crc64Table[]; | 13 | // extern UInt64 g_Crc64Table[]; |
14 | 14 | ||
15 | void Z7_FASTCALL Crc64GenerateTable(void); | 15 | void Z7_FASTCALL Crc64GenerateTable(void); |
16 | 16 | ||
17 | #define CRC64_INIT_VAL UINT64_CONST(0xFFFFFFFFFFFFFFFF) | 17 | #define CRC64_INIT_VAL UINT64_CONST(0xFFFFFFFFFFFFFFFF) |
18 | #define CRC64_GET_DIGEST(crc) ((crc) ^ CRC64_INIT_VAL) | 18 | #define CRC64_GET_DIGEST(crc) ((crc) ^ CRC64_INIT_VAL) |
19 | #define CRC64_UPDATE_BYTE(crc, b) (g_Crc64Table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) | 19 | // #define CRC64_UPDATE_BYTE(crc, b) (g_Crc64Table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) |
20 | 20 | ||
21 | UInt64 Z7_FASTCALL Crc64Update(UInt64 crc, const void *data, size_t size); | 21 | UInt64 Z7_FASTCALL Crc64Update(UInt64 crc, const void *data, size_t size); |
22 | UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size); | 22 | // UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size); |
23 | 23 | ||
24 | EXTERN_C_END | 24 | EXTERN_C_END |
25 | 25 | ||
diff --git a/C/XzCrc64Opt.c b/C/XzCrc64Opt.c index d03374c..0c1fc2f 100644 --- a/C/XzCrc64Opt.c +++ b/C/XzCrc64Opt.c | |||
@@ -1,61 +1,261 @@ | |||
1 | /* XzCrc64Opt.c -- CRC64 calculation | 1 | /* XzCrc64Opt.c -- CRC64 calculation (optimized functions) |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2023-12-08 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
6 | #include "CpuArch.h" | 6 | #include "CpuArch.h" |
7 | 7 | ||
8 | #if !defined(Z7_CRC64_NUM_TABLES) || Z7_CRC64_NUM_TABLES > 1 | ||
9 | |||
10 | // for debug only : define Z7_CRC64_DEBUG_BE to test big-endian code in little-endian cpu | ||
11 | // #define Z7_CRC64_DEBUG_BE | ||
12 | #ifdef Z7_CRC64_DEBUG_BE | ||
13 | #undef MY_CPU_LE | ||
14 | #define MY_CPU_BE | ||
15 | #endif | ||
16 | |||
17 | #if defined(MY_CPU_64BIT) | ||
18 | #define Z7_CRC64_USE_64BIT | ||
19 | #endif | ||
20 | |||
21 | // Z7_CRC64_NUM_TABLES_USE must be defined to the same value as in XzCrc64.c | ||
22 | #ifdef Z7_CRC64_NUM_TABLES | ||
23 | #define Z7_CRC64_NUM_TABLES_USE Z7_CRC64_NUM_TABLES | ||
24 | #else | ||
25 | #define Z7_CRC64_NUM_TABLES_USE 12 | ||
26 | #endif | ||
27 | |||
28 | #if Z7_CRC64_NUM_TABLES_USE % 4 || \ | ||
29 | Z7_CRC64_NUM_TABLES_USE < 4 || \ | ||
30 | Z7_CRC64_NUM_TABLES_USE > 4 * 4 | ||
31 | #error Stop_Compiling_Bad_CRC64_NUM_TABLES | ||
32 | #endif | ||
33 | |||
34 | |||
8 | #ifndef MY_CPU_BE | 35 | #ifndef MY_CPU_BE |
9 | 36 | ||
10 | #define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) | 37 | #define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) |
38 | |||
39 | #if defined(Z7_CRC64_USE_64BIT) && (Z7_CRC64_NUM_TABLES_USE % 8 == 0) | ||
11 | 40 | ||
12 | UInt64 Z7_FASTCALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table); | 41 | #define Q64LE(n, d) \ |
13 | UInt64 Z7_FASTCALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table) | 42 | ( (table + ((n) * 8 + 7) * 0x100)[((d) ) & 0xFF] \ |
43 | ^ (table + ((n) * 8 + 6) * 0x100)[((d) >> 1 * 8) & 0xFF] \ | ||
44 | ^ (table + ((n) * 8 + 5) * 0x100)[((d) >> 2 * 8) & 0xFF] \ | ||
45 | ^ (table + ((n) * 8 + 4) * 0x100)[((d) >> 3 * 8) & 0xFF] \ | ||
46 | ^ (table + ((n) * 8 + 3) * 0x100)[((d) >> 4 * 8) & 0xFF] \ | ||
47 | ^ (table + ((n) * 8 + 2) * 0x100)[((d) >> 5 * 8) & 0xFF] \ | ||
48 | ^ (table + ((n) * 8 + 1) * 0x100)[((d) >> 6 * 8) & 0xFF] \ | ||
49 | ^ (table + ((n) * 8 + 0) * 0x100)[((d) >> 7 * 8)] ) | ||
50 | |||
51 | #define R64(a) *((const UInt64 *)(const void *)p + (a)) | ||
52 | |||
53 | #else | ||
54 | |||
55 | #define Q32LE(n, d) \ | ||
56 | ( (table + ((n) * 4 + 3) * 0x100)[((d) ) & 0xFF] \ | ||
57 | ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \ | ||
58 | ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \ | ||
59 | ^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] ) | ||
60 | |||
61 | #define R32(a) *((const UInt32 *)(const void *)p + (a)) | ||
62 | |||
63 | #endif | ||
64 | |||
65 | |||
66 | #define CRC64_FUNC_PRE_LE2(step) \ | ||
67 | UInt64 Z7_FASTCALL XzCrc64UpdateT ## step (UInt64 v, const void *data, size_t size, const UInt64 *table) | ||
68 | |||
69 | #define CRC64_FUNC_PRE_LE(step) \ | ||
70 | CRC64_FUNC_PRE_LE2(step); \ | ||
71 | CRC64_FUNC_PRE_LE2(step) | ||
72 | |||
73 | CRC64_FUNC_PRE_LE(Z7_CRC64_NUM_TABLES_USE) | ||
14 | { | 74 | { |
15 | const Byte *p = (const Byte *)data; | 75 | const Byte *p = (const Byte *)data; |
16 | for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) | 76 | const Byte *lim; |
77 | for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC64_NUM_TABLES_USE & 4))) != 0; size--, p++) | ||
17 | v = CRC64_UPDATE_BYTE_2(v, *p); | 78 | v = CRC64_UPDATE_BYTE_2(v, *p); |
18 | for (; size >= 4; size -= 4, p += 4) | 79 | lim = p + size; |
80 | if (size >= Z7_CRC64_NUM_TABLES_USE) | ||
19 | { | 81 | { |
20 | const UInt32 d = (UInt32)v ^ *(const UInt32 *)(const void *)p; | 82 | lim -= Z7_CRC64_NUM_TABLES_USE; |
21 | v = (v >> 32) | 83 | do |
22 | ^ (table + 0x300)[((d ) & 0xFF)] | 84 | { |
23 | ^ (table + 0x200)[((d >> 8) & 0xFF)] | 85 | #if Z7_CRC64_NUM_TABLES_USE == 4 |
24 | ^ (table + 0x100)[((d >> 16) & 0xFF)] | 86 | const UInt32 d = (UInt32)v ^ R32(0); |
25 | ^ (table + 0x000)[((d >> 24))]; | 87 | v = (v >> 32) ^ Q32LE(0, d); |
88 | #elif Z7_CRC64_NUM_TABLES_USE == 8 | ||
89 | #ifdef Z7_CRC64_USE_64BIT | ||
90 | v ^= R64(0); | ||
91 | v = Q64LE(0, v); | ||
92 | #else | ||
93 | UInt32 v0, v1; | ||
94 | v0 = (UInt32)v ^ R32(0); | ||
95 | v1 = (UInt32)(v >> 32) ^ R32(1); | ||
96 | v = Q32LE(1, v0) ^ Q32LE(0, v1); | ||
97 | #endif | ||
98 | #elif Z7_CRC64_NUM_TABLES_USE == 12 | ||
99 | UInt32 w; | ||
100 | UInt32 v0, v1; | ||
101 | v0 = (UInt32)v ^ R32(0); | ||
102 | v1 = (UInt32)(v >> 32) ^ R32(1); | ||
103 | w = R32(2); | ||
104 | v = Q32LE(0, w); | ||
105 | v ^= Q32LE(2, v0) ^ Q32LE(1, v1); | ||
106 | #elif Z7_CRC64_NUM_TABLES_USE == 16 | ||
107 | #ifdef Z7_CRC64_USE_64BIT | ||
108 | UInt64 w; | ||
109 | UInt64 x; | ||
110 | w = R64(1); x = Q64LE(0, w); | ||
111 | v ^= R64(0); v = x ^ Q64LE(1, v); | ||
112 | #else | ||
113 | UInt32 v0, v1; | ||
114 | UInt32 r0, r1; | ||
115 | v0 = (UInt32)v ^ R32(0); | ||
116 | v1 = (UInt32)(v >> 32) ^ R32(1); | ||
117 | r0 = R32(2); | ||
118 | r1 = R32(3); | ||
119 | v = Q32LE(1, r0) ^ Q32LE(0, r1); | ||
120 | v ^= Q32LE(3, v0) ^ Q32LE(2, v1); | ||
121 | #endif | ||
122 | #else | ||
123 | #error Stop_Compiling_Bad_CRC64_NUM_TABLES | ||
124 | #endif | ||
125 | p += Z7_CRC64_NUM_TABLES_USE; | ||
126 | } | ||
127 | while (p <= lim); | ||
128 | lim += Z7_CRC64_NUM_TABLES_USE; | ||
26 | } | 129 | } |
27 | for (; size > 0; size--, p++) | 130 | for (; p < lim; p++) |
28 | v = CRC64_UPDATE_BYTE_2(v, *p); | 131 | v = CRC64_UPDATE_BYTE_2(v, *p); |
29 | return v; | 132 | return v; |
30 | } | 133 | } |
31 | 134 | ||
135 | #undef CRC64_UPDATE_BYTE_2 | ||
136 | #undef R32 | ||
137 | #undef R64 | ||
138 | #undef Q32LE | ||
139 | #undef Q64LE | ||
140 | #undef CRC64_FUNC_PRE_LE | ||
141 | #undef CRC64_FUNC_PRE_LE2 | ||
142 | |||
32 | #endif | 143 | #endif |
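Two details of the little-endian function above are worth spelling out. The alignment pre-loop mask (7 - (Z7_CRC64_NUM_TABLES_USE & 4)) evaluates to 3 when slicing by 4 and to 7 otherwise, so the main loop always reads aligned 32-bit or 64-bit words. And the Q32LE/Q64LE macros are the classic slicing technique: xor 4 (or 8) message bytes into the CRC, then replace the per-byte shift-xor chain with independent table lookups. A plain-C sketch of one slicing-by-4 step, mirroring the Z7_CRC64_NUM_TABLES_USE == 4 branch:

    #include <stdint.h>

    /* one 4-byte step; 'table' holds tables 0..3 as built
       by Crc64GenerateTable() */
    static uint64_t Crc64_Slice4_Step(uint64_t v, const unsigned char *p,
        const uint64_t *table)
    {
      const uint32_t d = (uint32_t)v
          ^ ((uint32_t)p[0] | ((uint32_t)p[1] << 8)
          | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24));
      return (v >> 32)
          ^ table[0x300 + ( d        & 0xFF)]
          ^ table[0x200 + ((d >>  8) & 0xFF)]
          ^ table[0x100 + ((d >> 16) & 0xFF)]
          ^ table[0x000 + ( d >> 24        )];
    }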
33 | 144 | ||
34 | 145 | ||
146 | |||
147 | |||
35 | #ifndef MY_CPU_LE | 148 | #ifndef MY_CPU_LE |
36 | 149 | ||
37 | #define CRC64_UPDATE_BYTE_2_BE(crc, b) (table[(Byte)((crc) >> 56) ^ (b)] ^ ((crc) << 8)) | 150 | #define CRC64_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 56) ^ (b)] ^ ((crc) << 8)) |
151 | |||
152 | #if defined(Z7_CRC64_USE_64BIT) && (Z7_CRC64_NUM_TABLES_USE % 8 == 0) | ||
153 | |||
154 | #define Q64BE(n, d) \ | ||
155 | ( (table + ((n) * 8 + 0) * 0x100)[(Byte)(d)] \ | ||
156 | ^ (table + ((n) * 8 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ | ||
157 | ^ (table + ((n) * 8 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ | ||
158 | ^ (table + ((n) * 8 + 3) * 0x100)[((d) >> 3 * 8) & 0xFF] \ | ||
159 | ^ (table + ((n) * 8 + 4) * 0x100)[((d) >> 4 * 8) & 0xFF] \ | ||
160 | ^ (table + ((n) * 8 + 5) * 0x100)[((d) >> 5 * 8) & 0xFF] \ | ||
161 | ^ (table + ((n) * 8 + 6) * 0x100)[((d) >> 6 * 8) & 0xFF] \ | ||
162 | ^ (table + ((n) * 8 + 7) * 0x100)[((d) >> 7 * 8)] ) | ||
163 | |||
164 | #ifdef Z7_CRC64_DEBUG_BE | ||
165 | #define R64BE(a) GetBe64a((const UInt64 *)(const void *)p + (a)) | ||
166 | #else | ||
167 | #define R64BE(a) *((const UInt64 *)(const void *)p + (a)) | ||
168 | #endif | ||
169 | |||
170 | #else | ||
171 | |||
172 | #define Q32BE(n, d) \ | ||
173 | ( (table + ((n) * 4 + 0) * 0x100)[(Byte)(d)] \ | ||
174 | ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ | ||
175 | ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ | ||
176 | ^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] ) | ||
38 | 177 | ||
39 | UInt64 Z7_FASTCALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table); | 178 | #ifdef Z7_CRC64_DEBUG_BE |
40 | UInt64 Z7_FASTCALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table) | 179 | #define R32BE(a) GetBe32a((const UInt32 *)(const void *)p + (a)) |
180 | #else | ||
181 | #define R32BE(a) *((const UInt32 *)(const void *)p + (a)) | ||
182 | #endif | ||
183 | |||
184 | #endif | ||
185 | |||
186 | #define CRC64_FUNC_PRE_BE2(step) \ | ||
187 | UInt64 Z7_FASTCALL XzCrc64UpdateBeT ## step (UInt64 v, const void *data, size_t size, const UInt64 *table) | ||
188 | |||
189 | #define CRC64_FUNC_PRE_BE(step) \ | ||
190 | CRC64_FUNC_PRE_BE2(step); \ | ||
191 | CRC64_FUNC_PRE_BE2(step) | ||
192 | |||
193 | CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE) | ||
41 | { | 194 | { |
42 | const Byte *p = (const Byte *)data; | 195 | const Byte *p = (const Byte *)data; |
43 | table += 0x100; | 196 | const Byte *lim; |
44 | v = Z7_BSWAP64(v); | 197 | v = Z7_BSWAP64(v); |
45 | for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) | 198 | for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC64_NUM_TABLES_USE & 4))) != 0; size--, p++) |
46 | v = CRC64_UPDATE_BYTE_2_BE(v, *p); | 199 | v = CRC64_UPDATE_BYTE_2_BE(v, *p); |
47 | for (; size >= 4; size -= 4, p += 4) | 200 | lim = p + size; |
201 | if (size >= Z7_CRC64_NUM_TABLES_USE) | ||
48 | { | 202 | { |
49 | const UInt32 d = (UInt32)(v >> 32) ^ *(const UInt32 *)(const void *)p; | 203 | lim -= Z7_CRC64_NUM_TABLES_USE; |
50 | v = (v << 32) | 204 | do |
51 | ^ (table + 0x000)[((d ) & 0xFF)] | 205 | { |
52 | ^ (table + 0x100)[((d >> 8) & 0xFF)] | 206 | #if Z7_CRC64_NUM_TABLES_USE == 4 |
53 | ^ (table + 0x200)[((d >> 16) & 0xFF)] | 207 | const UInt32 d = (UInt32)(v >> 32) ^ R32BE(0); |
54 | ^ (table + 0x300)[((d >> 24))]; | 208 | v = (v << 32) ^ Q32BE(0, d); |
209 | #elif Z7_CRC64_NUM_TABLES_USE == 12 | ||
210 | const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); | ||
211 | const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); | ||
212 | const UInt32 w = R32BE(2); | ||
213 | v = Q32BE(0, w); | ||
214 | v ^= Q32BE(2, d1) ^ Q32BE(1, d0); | ||
215 | |||
216 | #elif Z7_CRC64_NUM_TABLES_USE == 8 | ||
217 | #ifdef Z7_CRC64_USE_64BIT | ||
218 | v ^= R64BE(0); | ||
219 | v = Q64BE(0, v); | ||
220 | #else | ||
221 | const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); | ||
222 | const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); | ||
223 | v = Q32BE(1, d1) ^ Q32BE(0, d0); | ||
224 | #endif | ||
225 | #elif Z7_CRC64_NUM_TABLES_USE == 16 | ||
226 | #ifdef Z7_CRC64_USE_64BIT | ||
227 | const UInt64 w = R64BE(1); | ||
228 | v ^= R64BE(0); | ||
229 | v = Q64BE(0, w) ^ Q64BE(1, v); | ||
230 | #else | ||
231 | const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); | ||
232 | const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); | ||
233 | const UInt32 w1 = R32BE(2); | ||
234 | const UInt32 w0 = R32BE(3); | ||
235 | v = Q32BE(1, w1) ^ Q32BE(0, w0); | ||
236 | v ^= Q32BE(3, d1) ^ Q32BE(2, d0); | ||
237 | #endif | ||
238 | #else | ||
239 | #error Stop_Compiling_Bad_CRC64_NUM_TABLES | ||
240 | #endif | ||
241 | p += Z7_CRC64_NUM_TABLES_USE; | ||
242 | } | ||
243 | while (p <= lim); | ||
244 | lim += Z7_CRC64_NUM_TABLES_USE; | ||
55 | } | 245 | } |
56 | for (; size > 0; size--, p++) | 246 | for (; p < lim; p++) |
57 | v = CRC64_UPDATE_BYTE_2_BE(v, *p); | 247 | v = CRC64_UPDATE_BYTE_2_BE(v, *p); |
58 | return Z7_BSWAP64(v); | 248 | return Z7_BSWAP64(v); |
59 | } | 249 | } |
60 | 250 | ||
251 | #undef CRC64_UPDATE_BYTE_2_BE | ||
252 | #undef R32BE | ||
253 | #undef R64BE | ||
254 | #undef Q32BE | ||
255 | #undef Q64BE | ||
256 | #undef CRC64_FUNC_PRE_BE | ||
257 | #undef CRC64_FUNC_PRE_BE2 | ||
258 | |||
259 | #endif | ||
260 | #undef Z7_CRC64_NUM_TABLES_USE | ||
61 | #endif | 261 | #endif |
@@ -1,5 +1,5 @@ | |||
1 | /* XzDec.c -- Xz Decode | 1 | /* XzDec.c -- Xz Decode |
2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -105,30 +105,32 @@ static SRes XzBcFilterState_SetProps(void *pp, const Byte *props, size_t propSiz | |||
105 | { | 105 | { |
106 | if (propSize != 1) | 106 | if (propSize != 1) |
107 | return SZ_ERROR_UNSUPPORTED; | 107 | return SZ_ERROR_UNSUPPORTED; |
108 | p->delta = (unsigned)props[0] + 1; | 108 | p->delta = (UInt32)props[0] + 1; |
109 | } | 109 | } |
110 | else | 110 | else |
111 | { | 111 | { |
112 | if (propSize == 4) | 112 | if (propSize == 4) |
113 | { | 113 | { |
114 | UInt32 v = GetUi32(props); | 114 | const UInt32 v = GetUi32(props); |
115 | switch (p->methodId) | 115 | switch (p->methodId) |
116 | { | 116 | { |
117 | case XZ_ID_PPC: | 117 | case XZ_ID_PPC: |
118 | case XZ_ID_ARM: | 118 | case XZ_ID_ARM: |
119 | case XZ_ID_SPARC: | 119 | case XZ_ID_SPARC: |
120 | case XZ_ID_ARM64: | 120 | case XZ_ID_ARM64: |
121 | if ((v & 3) != 0) | 121 | if (v & 3) |
122 | return SZ_ERROR_UNSUPPORTED; | 122 | return SZ_ERROR_UNSUPPORTED; |
123 | break; | 123 | break; |
124 | case XZ_ID_ARMT: | 124 | case XZ_ID_ARMT: |
125 | if ((v & 1) != 0) | 125 | case XZ_ID_RISCV: |
126 | if (v & 1) | ||
126 | return SZ_ERROR_UNSUPPORTED; | 127 | return SZ_ERROR_UNSUPPORTED; |
127 | break; | 128 | break; |
128 | case XZ_ID_IA64: | 129 | case XZ_ID_IA64: |
129 | if ((v & 0xF) != 0) | 130 | if (v & 0xf) |
130 | return SZ_ERROR_UNSUPPORTED; | 131 | return SZ_ERROR_UNSUPPORTED; |
131 | break; | 132 | break; |
133 | default: break; | ||
132 | } | 134 | } |
133 | p->ip = v; | 135 | p->ip = v; |
134 | } | 136 | } |
@@ -151,12 +153,13 @@ static void XzBcFilterState_Init(void *pp) | |||
151 | 153 | ||
152 | static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Dec[] = | 154 | static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Dec[] = |
153 | { | 155 | { |
154 | Z7_BRANCH_CONV_DEC(PPC), | 156 | Z7_BRANCH_CONV_DEC_2 (BranchConv_PPC), |
155 | Z7_BRANCH_CONV_DEC(IA64), | 157 | Z7_BRANCH_CONV_DEC_2 (BranchConv_IA64), |
156 | Z7_BRANCH_CONV_DEC(ARM), | 158 | Z7_BRANCH_CONV_DEC_2 (BranchConv_ARM), |
157 | Z7_BRANCH_CONV_DEC(ARMT), | 159 | Z7_BRANCH_CONV_DEC_2 (BranchConv_ARMT), |
158 | Z7_BRANCH_CONV_DEC(SPARC), | 160 | Z7_BRANCH_CONV_DEC_2 (BranchConv_SPARC), |
159 | Z7_BRANCH_CONV_DEC(ARM64) | 161 | Z7_BRANCH_CONV_DEC_2 (BranchConv_ARM64), |
162 | Z7_BRANCH_CONV_DEC_2 (BranchConv_RISCV) | ||
160 | }; | 163 | }; |
161 | 164 | ||
162 | static SizeT XzBcFilterStateBase_Filter_Dec(CXzBcFilterStateBase *p, Byte *data, SizeT size) | 165 | static SizeT XzBcFilterStateBase_Filter_Dec(CXzBcFilterStateBase *p, Byte *data, SizeT size) |
@@ -262,7 +265,7 @@ static SRes XzBcFilterState_Code2(void *pp, | |||
262 | 265 | ||
263 | 266 | ||
264 | #define XZ_IS_SUPPORTED_FILTER_ID(id) \ | 267 | #define XZ_IS_SUPPORTED_FILTER_ID(id) \ |
265 | ((id) >= XZ_ID_Delta && (id) <= XZ_ID_ARM64) | 268 | ((id) >= XZ_ID_Delta && (id) <= XZ_ID_RISCV) |
266 | 269 | ||
267 | SRes Xz_StateCoder_Bc_SetFromMethod_Func(IStateCoder *p, UInt64 id, | 270 | SRes Xz_StateCoder_Bc_SetFromMethod_Func(IStateCoder *p, UInt64 id, |
268 | Xz_Func_BcFilterStateBase_Filter func, ISzAllocPtr alloc) | 271 | Xz_Func_BcFilterStateBase_Filter func, ISzAllocPtr alloc) |
@@ -541,13 +544,12 @@ static SRes MixCoder_SetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 met | |||
541 | { | 544 | { |
542 | IStateCoder *sc = &p->coders[coderIndex]; | 545 | IStateCoder *sc = &p->coders[coderIndex]; |
543 | p->ids[coderIndex] = methodId; | 546 | p->ids[coderIndex] = methodId; |
544 | switch (methodId) | 547 | if (methodId == XZ_ID_LZMA2) |
545 | { | 548 | return Lzma2State_SetFromMethod(sc, outBuf, outBufSize, p->alloc); |
546 | case XZ_ID_LZMA2: return Lzma2State_SetFromMethod(sc, outBuf, outBufSize, p->alloc); | 549 | #ifdef USE_SUBBLOCK |
547 | #ifdef USE_SUBBLOCK | 550 | if (methodId == XZ_ID_Subblock) |
548 | case XZ_ID_Subblock: return SbState_SetFromMethod(sc, p->alloc); | 551 | return SbState_SetFromMethod(sc, p->alloc); |
549 | #endif | 552 | #endif |
550 | } | ||
551 | if (coderIndex == 0) | 553 | if (coderIndex == 0) |
552 | return SZ_ERROR_UNSUPPORTED; | 554 | return SZ_ERROR_UNSUPPORTED; |
553 | return Xz_StateCoder_Bc_SetFromMethod_Func(sc, methodId, | 555 | return Xz_StateCoder_Bc_SetFromMethod_Func(sc, methodId, |
@@ -558,10 +560,8 @@ static SRes MixCoder_SetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 met | |||
558 | static SRes MixCoder_ResetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 methodId, Byte *outBuf, size_t outBufSize) | 560 | static SRes MixCoder_ResetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 methodId, Byte *outBuf, size_t outBufSize) |
559 | { | 561 | { |
560 | IStateCoder *sc = &p->coders[coderIndex]; | 562 | IStateCoder *sc = &p->coders[coderIndex]; |
561 | switch (methodId) | 563 | if (methodId == XZ_ID_LZMA2) |
562 | { | 564 | return Lzma2State_ResetOutBuf(sc, outBuf, outBufSize); |
563 | case XZ_ID_LZMA2: return Lzma2State_ResetOutBuf(sc, outBuf, outBufSize); | ||
564 | } | ||
565 | return SZ_ERROR_UNSUPPORTED; | 565 | return SZ_ERROR_UNSUPPORTED; |
566 | } | 566 | } |
567 | 567 | ||
@@ -804,7 +804,7 @@ static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte | |||
804 | } | 804 | } |
805 | 805 | ||
806 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ | 806 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ |
807 | { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ | 807 | { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ |
808 | if (s == 0) return SZ_ERROR_ARCHIVE; \ | 808 | if (s == 0) return SZ_ERROR_ARCHIVE; \ |
809 | pos += s; } | 809 | pos += s; } |
810 | 810 | ||
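Xz_ReadVarInt decodes the multibyte integers of the .xz format: 7 payload bits per byte, least-significant group first, with the high bit set on every byte except the last. A decoder sketch following those format rules (a hypothetical helper, not the exact 7-Zip function body):

    #include <stdint.h>
    #include <stddef.h>

    /* returns bytes consumed (1..9), or 0 on truncated/invalid input */
    unsigned ReadVarInt_Sketch(const unsigned char *p, size_t maxSize,
        uint64_t *value)
    {
      unsigned i;
      *value = 0;
      for (i = 0; i < maxSize && i < 9; i++)
      {
        const unsigned b = p[i];
        *value |= (uint64_t)(b & 0x7F) << (7 * i);
        if ((b & 0x80) == 0)
          return (b == 0 && i != 0) ? 0 : i + 1; /* reject non-minimal forms */
      }
      return 0;
    }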
@@ -1034,7 +1034,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
1034 | SRes res; | 1034 | SRes res; |
1035 | 1035 | ||
1036 | ECoderFinishMode finishMode2 = finishMode; | 1036 | ECoderFinishMode finishMode2 = finishMode; |
1037 | BoolInt srcFinished2 = srcFinished; | 1037 | BoolInt srcFinished2 = (BoolInt)srcFinished; |
1038 | BoolInt destFinish = False; | 1038 | BoolInt destFinish = False; |
1039 | 1039 | ||
1040 | if (p->block.packSize != (UInt64)(Int64)-1) | 1040 | if (p->block.packSize != (UInt64)(Int64)-1) |
@@ -1127,7 +1127,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
1127 | return SZ_OK; | 1127 | return SZ_OK; |
1128 | } | 1128 | } |
1129 | 1129 | ||
1130 | switch (p->state) | 1130 | switch ((int)p->state) |
1131 | { | 1131 | { |
1132 | case XZ_STATE_STREAM_HEADER: | 1132 | case XZ_STATE_STREAM_HEADER: |
1133 | { | 1133 | { |
@@ -1172,15 +1172,15 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
1172 | p->state = XZ_STATE_STREAM_INDEX; | 1172 | p->state = XZ_STATE_STREAM_INDEX; |
1173 | break; | 1173 | break; |
1174 | } | 1174 | } |
1175 | p->blockHeaderSize = ((UInt32)p->buf[0] << 2) + 4; | 1175 | p->blockHeaderSize = ((unsigned)p->buf[0] << 2) + 4; |
1176 | break; | 1176 | break; |
1177 | } | 1177 | } |
1178 | 1178 | ||
1179 | if (p->pos != p->blockHeaderSize) | 1179 | if (p->pos != p->blockHeaderSize) |
1180 | { | 1180 | { |
1181 | UInt32 cur = p->blockHeaderSize - p->pos; | 1181 | unsigned cur = p->blockHeaderSize - p->pos; |
1182 | if (cur > srcRem) | 1182 | if (cur > srcRem) |
1183 | cur = (UInt32)srcRem; | 1183 | cur = (unsigned)srcRem; |
1184 | memcpy(p->buf + p->pos, src, cur); | 1184 | memcpy(p->buf + p->pos, src, cur); |
1185 | p->pos += cur; | 1185 | p->pos += cur; |
1186 | (*srcLen) += cur; | 1186 | (*srcLen) += cur; |
@@ -1222,8 +1222,8 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
1222 | } | 1222 | } |
1223 | else | 1223 | else |
1224 | { | 1224 | { |
1225 | UInt32 checkSize = XzFlags_GetCheckSize(p->streamFlags); | 1225 | const unsigned checkSize = XzFlags_GetCheckSize(p->streamFlags); |
1226 | UInt32 cur = checkSize - p->pos; | 1226 | unsigned cur = checkSize - p->pos; |
1227 | if (cur != 0) | 1227 | if (cur != 0) |
1228 | { | 1228 | { |
1229 | if (srcRem == 0) | 1229 | if (srcRem == 0) |
@@ -1232,7 +1232,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
1232 | return SZ_OK; | 1232 | return SZ_OK; |
1233 | } | 1233 | } |
1234 | if (cur > srcRem) | 1234 | if (cur > srcRem) |
1235 | cur = (UInt32)srcRem; | 1235 | cur = (unsigned)srcRem; |
1236 | memcpy(p->buf + p->pos, src, cur); | 1236 | memcpy(p->buf + p->pos, src, cur); |
1237 | p->pos += cur; | 1237 | p->pos += cur; |
1238 | (*srcLen) += cur; | 1238 | (*srcLen) += cur; |
@@ -1321,9 +1321,9 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
1321 | 1321 | ||
1322 | case XZ_STATE_STREAM_FOOTER: | 1322 | case XZ_STATE_STREAM_FOOTER: |
1323 | { | 1323 | { |
1324 | UInt32 cur = XZ_STREAM_FOOTER_SIZE - p->pos; | 1324 | unsigned cur = XZ_STREAM_FOOTER_SIZE - p->pos; |
1325 | if (cur > srcRem) | 1325 | if (cur > srcRem) |
1326 | cur = (UInt32)srcRem; | 1326 | cur = (unsigned)srcRem; |
1327 | memcpy(p->buf + p->pos, src, cur); | 1327 | memcpy(p->buf + p->pos, src, cur); |
1328 | p->pos += cur; | 1328 | p->pos += cur; |
1329 | (*srcLen) += cur; | 1329 | (*srcLen) += cur; |
@@ -1358,6 +1358,8 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
1358 | } | 1358 | } |
1359 | 1359 | ||
1360 | case XZ_STATE_BLOCK: break; /* to disable GCC warning */ | 1360 | case XZ_STATE_BLOCK: break; /* to disable GCC warning */ |
1361 | |||
1362 | default: return SZ_ERROR_FAIL; | ||
1361 | } | 1363 | } |
1362 | } | 1364 | } |
1363 | /* | 1365 | /* |
@@ -1773,10 +1775,10 @@ static void XzDecMt_Callback_Parse(void *obj, unsigned coderIndex, CMtDecCallbac | |||
1773 | } | 1775 | } |
1774 | } | 1776 | } |
1775 | { | 1777 | { |
1776 | UInt64 packSize = block->packSize; | 1778 | const UInt64 packSize = block->packSize; |
1777 | UInt64 packSizeAligned = packSize + ((0 - (unsigned)packSize) & 3); | 1779 | const UInt64 packSizeAligned = packSize + ((0 - (unsigned)packSize) & 3); |
1778 | UInt32 checkSize = XzFlags_GetCheckSize(coder->dec.streamFlags); | 1780 | const unsigned checkSize = XzFlags_GetCheckSize(coder->dec.streamFlags); |
1779 | UInt64 blockPackSum = coder->inPreSize + packSizeAligned + checkSize; | 1781 | const UInt64 blockPackSum = coder->inPreSize + packSizeAligned + checkSize; |
1780 | // if (blockPackSum <= me->props.inBlockMax) | 1782 | // if (blockPackSum <= me->props.inBlockMax) |
1781 | // unpackBlockMaxSize | 1783 | // unpackBlockMaxSize |
1782 | { | 1784 | { |
@@ -2381,7 +2383,7 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p | |||
2381 | if (tMode) | 2383 | if (tMode) |
2382 | { | 2384 | { |
2383 | XzDecMt_FreeOutBufs(p); | 2385 | XzDecMt_FreeOutBufs(p); |
2384 | tMode = MtDec_PrepareRead(&p->mtc); | 2386 | tMode = (BoolInt)MtDec_PrepareRead(&p->mtc); |
2385 | } | 2387 | } |
2386 | #endif | 2388 | #endif |
2387 | 2389 | ||
@@ -2644,7 +2646,7 @@ SRes XzDecMt_Decode(CXzDecMtHandle p, | |||
2644 | p->outSize = *outDataSize; | 2646 | p->outSize = *outDataSize; |
2645 | } | 2647 | } |
2646 | 2648 | ||
2647 | p->finishMode = finishMode; | 2649 | p->finishMode = (BoolInt)finishMode; |
2648 | 2650 | ||
2649 | // p->outSize = 457; p->outSize_Defined = True; p->finishMode = False; // for test | 2651 | // p->outSize = 457; p->outSize_Defined = True; p->finishMode = False; // for test |
2650 | 2652 | ||
@@ -1,5 +1,5 @@ | |||
1 | /* XzEnc.c -- Xz Encode | 1 | /* XzEnc.c -- Xz Encode |
2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -29,8 +29,9 @@ | |||
29 | 29 | ||
30 | #define XZ_GET_PAD_SIZE(dataSize) ((4 - ((unsigned)(dataSize) & 3)) & 3) | 30 | #define XZ_GET_PAD_SIZE(dataSize) ((4 - ((unsigned)(dataSize) & 3)) & 3) |
31 | 31 | ||
32 | /* max pack size for LZMA2 block + check-64bytrs: */ | 32 | #define XZ_CHECK_SIZE_MAX 64 |
33 | #define XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize) ((unpackSize) + ((unpackSize) >> 10) + 16 + 64) | 33 | /* max pack size for LZMA2 block + pad4 + check_size: */ |
34 | #define XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize) ((unpackSize) + ((unpackSize) >> 10) + 16 + XZ_CHECK_SIZE_MAX) | ||
34 | 35 | ||
35 | #define XZ_GET_ESTIMATED_BLOCK_TOTAL_PACK_SIZE(unpackSize) (XZ_BLOCK_HEADER_SIZE_MAX + XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize)) | 36 | #define XZ_GET_ESTIMATED_BLOCK_TOTAL_PACK_SIZE(unpackSize) (XZ_BLOCK_HEADER_SIZE_MAX + XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize)) |
36 | 37 | ||
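The padding arithmetic above rounds block data up to a 4-byte boundary: XZ_GET_PAD_SIZE maps sizes 0..5 to 0,3,2,1,0,3. A quick self-check (standalone, duplicating the macro for illustration):

    #include <assert.h>

    #define XZ_GET_PAD_SIZE(dataSize) ((4 - ((unsigned)(dataSize) & 3)) & 3)

    int main(void)
    {
      unsigned s;
      for (s = 0; s < 100; s++)
        assert((s + XZ_GET_PAD_SIZE(s)) % 4 == 0 && XZ_GET_PAD_SIZE(s) < 4);
      return 0;
    }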
@@ -325,12 +326,13 @@ typedef struct | |||
325 | 326 | ||
326 | static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Enc[] = | 327 | static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Enc[] = |
327 | { | 328 | { |
328 | Z7_BRANCH_CONV_ENC(PPC), | 329 | Z7_BRANCH_CONV_ENC_2 (BranchConv_PPC), |
329 | Z7_BRANCH_CONV_ENC(IA64), | 330 | Z7_BRANCH_CONV_ENC_2 (BranchConv_IA64), |
330 | Z7_BRANCH_CONV_ENC(ARM), | 331 | Z7_BRANCH_CONV_ENC_2 (BranchConv_ARM), |
331 | Z7_BRANCH_CONV_ENC(ARMT), | 332 | Z7_BRANCH_CONV_ENC_2 (BranchConv_ARMT), |
332 | Z7_BRANCH_CONV_ENC(SPARC), | 333 | Z7_BRANCH_CONV_ENC_2 (BranchConv_SPARC), |
333 | Z7_BRANCH_CONV_ENC(ARM64) | 334 | Z7_BRANCH_CONV_ENC_2 (BranchConv_ARM64), |
335 | Z7_BRANCH_CONV_ENC_2 (BranchConv_RISCV) | ||
334 | }; | 336 | }; |
335 | 337 | ||
336 | static SizeT XzBcFilterStateBase_Filter_Enc(CXzBcFilterStateBase *p, Byte *data, SizeT size) | 338 | static SizeT XzBcFilterStateBase_Filter_Enc(CXzBcFilterStateBase *p, Byte *data, SizeT size) |
@@ -888,9 +890,9 @@ static SRes Xz_CompressBlock( | |||
888 | blockSizes->unpackSize = checkInStream.processed; | 890 | blockSizes->unpackSize = checkInStream.processed; |
889 | } | 891 | } |
890 | { | 892 | { |
891 | Byte buf[4 + 64]; | 893 | Byte buf[4 + XZ_CHECK_SIZE_MAX]; |
892 | unsigned padSize = XZ_GET_PAD_SIZE(seqSizeOutStream.processed); | 894 | const unsigned padSize = XZ_GET_PAD_SIZE(seqSizeOutStream.processed); |
893 | UInt64 packSize = seqSizeOutStream.processed; | 895 | const UInt64 packSize = seqSizeOutStream.processed; |
894 | 896 | ||
895 | buf[0] = 0; | 897 | buf[0] = 0; |
896 | buf[1] = 0; | 898 | buf[1] = 0; |
@@ -898,7 +900,8 @@ static SRes Xz_CompressBlock( | |||
898 | buf[3] = 0; | 900 | buf[3] = 0; |
899 | 901 | ||
900 | SeqCheckInStream_GetDigest(&checkInStream, buf + 4); | 902 | SeqCheckInStream_GetDigest(&checkInStream, buf + 4); |
901 | RINOK(WriteBytes(&seqSizeOutStream.vt, buf + (4 - padSize), padSize + XzFlags_GetCheckSize((CXzStreamFlags)props->checkId))) | 903 | RINOK(WriteBytes(&seqSizeOutStream.vt, buf + (4 - padSize), |
904 | padSize + XzFlags_GetCheckSize((CXzStreamFlags)props->checkId))) | ||
902 | 905 | ||
903 | blockSizes->totalSize = seqSizeOutStream.processed - padSize; | 906 | blockSizes->totalSize = seqSizeOutStream.processed - padSize; |
904 | 907 | ||
@@ -1083,18 +1086,19 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf | |||
1083 | CXzEnc *me = (CXzEnc *)pp; | 1086 | CXzEnc *me = (CXzEnc *)pp; |
1084 | SRes res; | 1087 | SRes res; |
1085 | CMtProgressThunk progressThunk; | 1088 | CMtProgressThunk progressThunk; |
1086 | 1089 | Byte *dest; | |
1087 | Byte *dest = me->outBufs[outBufIndex]; | ||
1088 | |||
1089 | UNUSED_VAR(finished) | 1090 | UNUSED_VAR(finished) |
1090 | |||
1091 | { | 1091 | { |
1092 | CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; | 1092 | CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; |
1093 | bInfo->totalSize = 0; | 1093 | bInfo->totalSize = 0; |
1094 | bInfo->unpackSize = 0; | 1094 | bInfo->unpackSize = 0; |
1095 | bInfo->headerSize = 0; | 1095 | bInfo->headerSize = 0; |
1096 | // v23.02: we don't compress empty blocks | ||
1097 | // also we must ignore that empty block in XzEnc_MtCallback_Write() | ||
1098 | if (srcSize == 0) | ||
1099 | return SZ_OK; | ||
1096 | } | 1100 | } |
1097 | 1101 | dest = me->outBufs[outBufIndex]; | |
1098 | if (!dest) | 1102 | if (!dest) |
1099 | { | 1103 | { |
1100 | dest = (Byte *)ISzAlloc_Alloc(me->alloc, me->outBufSize); | 1104 | dest = (Byte *)ISzAlloc_Alloc(me->alloc, me->outBufSize); |
@@ -1140,18 +1144,20 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf | |||
1140 | static SRes XzEnc_MtCallback_Write(void *pp, unsigned outBufIndex) | 1144 | static SRes XzEnc_MtCallback_Write(void *pp, unsigned outBufIndex) |
1141 | { | 1145 | { |
1142 | CXzEnc *me = (CXzEnc *)pp; | 1146 | CXzEnc *me = (CXzEnc *)pp; |
1143 | |||
1144 | const CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; | 1147 | const CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; |
1145 | const Byte *data = me->outBufs[outBufIndex]; | 1148 | // v23.02: we don't write empty blocks |
1146 | 1149 | // note: if (bInfo->unpackSize == 0) then there is no compressed data for the block |
1147 | RINOK(WriteBytes(me->outStream, data, bInfo->headerSize)) | 1150 | if (bInfo->unpackSize == 0) |
1148 | 1151 | return SZ_OK; | |
1149 | { | 1152 | { |
1150 | UInt64 totalPackFull = bInfo->totalSize + XZ_GET_PAD_SIZE(bInfo->totalSize); | 1153 | const Byte *data = me->outBufs[outBufIndex]; |
1151 | RINOK(WriteBytes(me->outStream, data + XZ_BLOCK_HEADER_SIZE_MAX, (size_t)totalPackFull - bInfo->headerSize)) | 1154 | RINOK(WriteBytes(me->outStream, data, bInfo->headerSize)) |
1155 | { | ||
1156 | const UInt64 totalPackFull = bInfo->totalSize + XZ_GET_PAD_SIZE(bInfo->totalSize); | ||
1157 | RINOK(WriteBytes(me->outStream, data + XZ_BLOCK_HEADER_SIZE_MAX, (size_t)totalPackFull - bInfo->headerSize)) | ||
1158 | } | ||
1159 | return XzEncIndex_AddIndexRecord(&me->xzIndex, bInfo->unpackSize, bInfo->totalSize, me->alloc); | ||
1152 | } | 1160 | } |
1153 | |||
1154 | return XzEncIndex_AddIndexRecord(&me->xzIndex, bInfo->unpackSize, bInfo->totalSize, me->alloc); | ||
1155 | } | 1161 | } |
1156 | 1162 | ||
1157 | #endif | 1163 | #endif |
@@ -1,5 +1,5 @@ | |||
1 | /* XzIn.c - Xz input | 1 | /* XzIn.c - Xz input |
2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2023-09-07 : Igor Pavlov : Public domain */ |
3 | 3 | ||
4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
5 | 5 | ||
@@ -27,7 +27,7 @@ SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream) | |||
27 | } | 27 | } |
28 | 28 | ||
29 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ | 29 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ |
30 | { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ | 30 | { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ |
31 | if (s == 0) return SZ_ERROR_ARCHIVE; \ | 31 | if (s == 0) return SZ_ERROR_ARCHIVE; \ |
32 | pos += s; } | 32 | pos += s; } |
33 | 33 | ||
@@ -37,7 +37,7 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, | |||
37 | unsigned headerSize; | 37 | unsigned headerSize; |
38 | *headerSizeRes = 0; | 38 | *headerSizeRes = 0; |
39 | RINOK(SeqInStream_ReadByte(inStream, &header[0])) | 39 | RINOK(SeqInStream_ReadByte(inStream, &header[0])) |
40 | headerSize = (unsigned)header[0]; | 40 | headerSize = header[0]; |
41 | if (headerSize == 0) | 41 | if (headerSize == 0) |
42 | { | 42 | { |
43 | *headerSizeRes = 1; | 43 | *headerSizeRes = 1; |
@@ -47,7 +47,7 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, | |||
47 | 47 | ||
48 | *isIndex = False; | 48 | *isIndex = False; |
49 | headerSize = (headerSize << 2) + 4; | 49 | headerSize = (headerSize << 2) + 4; |
50 | *headerSizeRes = headerSize; | 50 | *headerSizeRes = (UInt32)headerSize; |
51 | { | 51 | { |
52 | size_t processedSize = headerSize - 1; | 52 | size_t processedSize = headerSize - 1; |
53 | RINOK(SeqInStream_ReadMax(inStream, header + 1, &processedSize)) | 53 | RINOK(SeqInStream_ReadMax(inStream, header + 1, &processedSize)) |
@@ -58,7 +58,7 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, | |||
58 | } | 58 | } |
59 | 59 | ||
60 | #define ADD_SIZE_CHECK(size, val) \ | 60 | #define ADD_SIZE_CHECK(size, val) \ |
61 | { UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } | 61 | { const UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } |
62 | 62 | ||
63 | UInt64 Xz_GetUnpackSize(const CXzStream *p) | 63 | UInt64 Xz_GetUnpackSize(const CXzStream *p) |
64 | { | 64 | { |
diff --git a/C/ZstdDec.c b/C/ZstdDec.c new file mode 100644 index 0000000..ecf6d22 --- /dev/null +++ b/C/ZstdDec.c | |||
@@ -0,0 +1,4064 @@ | |||
1 | /* ZstdDec.c -- Zstd Decoder | ||
2 | 2024-01-21 : the code was developed by Igor Pavlov, using Zstandard format | ||
3 | specification and original zstd decoder code as reference code. | ||
4 | original zstd decoder code: Copyright (c) Facebook, Inc. All rights reserved. | ||
5 | This source code is licensed under BSD 3-Clause License. | ||
6 | */ | ||
7 | |||
8 | #include "Precomp.h" | ||
9 | |||
10 | #include <string.h> | ||
11 | #include <stdlib.h> | ||
12 | // #include <stdio.h> | ||
13 | |||
14 | #include "Alloc.h" | ||
15 | #include "Xxh64.h" | ||
16 | #include "ZstdDec.h" | ||
17 | #include "CpuArch.h" | ||
18 | |||
19 | #if defined(MY_CPU_ARM64) | ||
20 | #include <arm_neon.h> | ||
21 | #endif | ||
22 | |||
23 | /* original-zstd still doesn't support a window larger than 2 GiB. | ||
24 | So we also limit our decoder to a 2 GiB window: */ | ||
25 | #if defined(MY_CPU_64BIT) && 0 == 1 | ||
26 | #define MAX_WINDOW_SIZE_LOG 41 | ||
27 | #else | ||
28 | #define MAX_WINDOW_SIZE_LOG 31 | ||
29 | #endif | ||
30 | |||
31 | typedef | ||
32 | #if MAX_WINDOW_SIZE_LOG < 32 | ||
33 | UInt32 | ||
34 | #else | ||
35 | size_t | ||
36 | #endif | ||
37 | CZstdDecOffset; | ||
38 | |||
39 | // for debug: simpler and smaller code but slow: | ||
40 | // #define Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS | ||
41 | |||
42 | // #define SHOW_STAT | ||
43 | #ifdef SHOW_STAT | ||
44 | #include <stdio.h> | ||
45 | static unsigned g_Num_Blocks_Compressed = 0; | ||
46 | static unsigned g_Num_Blocks_memcpy = 0; | ||
47 | static unsigned g_Num_Wrap_memmove_Num = 0; | ||
48 | static unsigned g_Num_Wrap_memmove_Bytes = 0; | ||
49 | static unsigned g_NumSeqs_total = 0; | ||
50 | // static unsigned g_NumCopy = 0; | ||
51 | static unsigned g_NumOver = 0; | ||
52 | static unsigned g_NumOver2 = 0; | ||
53 | static unsigned g_Num_Match = 0; | ||
54 | static unsigned g_Num_Lits = 0; | ||
55 | static unsigned g_Num_LitsBig = 0; | ||
56 | static unsigned g_Num_Lit0 = 0; | ||
57 | static unsigned g_Num_Rep0 = 0; | ||
58 | static unsigned g_Num_Rep1 = 0; | ||
59 | static unsigned g_Num_Rep2 = 0; | ||
60 | static unsigned g_Num_Rep3 = 0; | ||
61 | static unsigned g_Num_Threshold_0 = 0; | ||
62 | static unsigned g_Num_Threshold_1 = 0; | ||
63 | static unsigned g_Num_Threshold_0sum = 0; | ||
64 | static unsigned g_Num_Threshold_1sum = 0; | ||
65 | #define STAT_UPDATE(v) v | ||
66 | #else | ||
67 | #define STAT_UPDATE(v) | ||
68 | #endif | ||
69 | #define STAT_INC(v) STAT_UPDATE(v++;) | ||
70 | |||
71 | |||
72 | typedef struct | ||
73 | { | ||
74 | const Byte *ptr; | ||
75 | size_t len; | ||
76 | } | ||
77 | CInBufPair; | ||
78 | |||
79 | |||
80 | #if defined(MY_CPU_ARM_OR_ARM64) || defined(MY_CPU_X86_OR_AMD64) | ||
81 | #if (defined(__clang__) && (__clang_major__ >= 6)) \ | ||
82 | || (defined(__GNUC__) && (__GNUC__ >= 6)) | ||
83 | // disable for debug: | ||
84 | #define Z7_ZSTD_DEC_USE_BSR | ||
85 | #elif defined(_MSC_VER) && (_MSC_VER >= 1300) | ||
86 | // #if defined(MY_CPU_ARM_OR_ARM64) | ||
87 | #if (_MSC_VER >= 1600) | ||
88 | #include <intrin.h> | ||
89 | #endif | ||
90 | // disable for debug: | ||
91 | #define Z7_ZSTD_DEC_USE_BSR | ||
92 | #endif | ||
93 | #endif | ||
94 | |||
95 | #ifdef Z7_ZSTD_DEC_USE_BSR | ||
96 | #if defined(__clang__) || defined(__GNUC__) | ||
97 | #define MY_clz(x) ((unsigned)__builtin_clz((UInt32)x)) | ||
98 | #else // #if defined(_MSC_VER) | ||
99 | #ifdef MY_CPU_ARM_OR_ARM64 | ||
100 | #define MY_clz _CountLeadingZeros | ||
101 | #endif // MY_CPU_ARM_OR_ARM64 | ||
102 | #endif // _MSC_VER | ||
103 | #elif !defined(Z7_ZSTD_DEC_USE_LOG_TABLE) | ||
104 | #define Z7_ZSTD_DEC_USE_LOG_TABLE | ||
105 | #endif | ||
106 | |||
107 | |||
108 | static | ||
109 | Z7_FORCE_INLINE | ||
110 | unsigned GetHighestSetBit_32_nonzero_big(UInt32 num) | ||
111 | { | ||
112 | // (num != 0) | ||
113 | #ifdef MY_clz | ||
114 | return 31 - MY_clz(num); | ||
115 | #elif defined(Z7_ZSTD_DEC_USE_BSR) | ||
116 | { | ||
117 | unsigned long zz; | ||
118 | _BitScanReverse(&zz, num); | ||
119 | return zz; | ||
120 | } | ||
121 | #else | ||
122 | { | ||
123 | int i = -1; | ||
124 | for (;;) | ||
125 | { | ||
126 | i++; | ||
127 | num >>= 1; | ||
128 | if (num == 0) | ||
129 | return (unsigned)i; | ||
130 | } | ||
131 | } | ||
132 | #endif | ||
133 | } | ||
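GetHighestSetBit_32_nonzero_big returns floor(log2(num)) for nonzero input, via __builtin_clz, _BitScanReverse, or the portable shift loop. A small equivalence check for the portable fallback (a hypothetical test):

    #include <stdint.h>
    #include <assert.h>

    static unsigned HighestSetBit_Portable(uint32_t num) /* num != 0 */
    {
      unsigned i = 0;
      while (num >>= 1)
        i++;
      return i;
    }

    int main(void)
    {
      assert(HighestSetBit_Portable(1) == 0);
      assert(HighestSetBit_Portable(0x20000) == 17);
    #if defined(__GNUC__) || defined(__clang__)
      assert(HighestSetBit_Portable(0xDEADBEEFu)
          == 31u - (unsigned)__builtin_clz(0xDEADBEEFu));
    #endif
      return 0;
    }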
134 | |||
135 | #ifdef Z7_ZSTD_DEC_USE_LOG_TABLE | ||
136 | |||
137 | #define R1(a) a, a | ||
138 | #define R2(a) R1(a), R1(a) | ||
139 | #define R3(a) R2(a), R2(a) | ||
140 | #define R4(a) R3(a), R3(a) | ||
141 | #define R5(a) R4(a), R4(a) | ||
142 | #define R6(a) R5(a), R5(a) | ||
143 | #define R7(a) R6(a), R6(a) | ||
144 | #define R8(a) R7(a), R7(a) | ||
145 | #define R9(a) R8(a), R8(a) | ||
146 | |||
147 | #define Z7_ZSTD_FSE_MAX_ACCURACY 9 | ||
148 | // states[] values in FSE_Generate() can use (Z7_ZSTD_FSE_MAX_ACCURACY + 1) bits. | ||
149 | static const Byte k_zstd_LogTable[2 << Z7_ZSTD_FSE_MAX_ACCURACY] = | ||
150 | { | ||
151 | R1(0), R1(1), R2(2), R3(3), R4(4), R5(5), R6(6), R7(7), R8(8), R9(9) | ||
152 | }; | ||
153 | |||
154 | #define GetHighestSetBit_32_nonzero_small(num) (k_zstd_LogTable[num]) | ||
155 | #else | ||
156 | #define GetHighestSetBit_32_nonzero_small GetHighestSetBit_32_nonzero_big | ||
157 | #endif | ||
158 | |||
159 | |||
160 | #ifdef MY_clz | ||
161 | #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \ | ||
162 | bitOffset -= (CBitCtr)(MY_clz(b) - 23); | ||
163 | #elif defined(Z7_ZSTD_DEC_USE_BSR) | ||
164 | #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \ | ||
165 | { unsigned long zz; _BitScanReverse(&zz, b); bitOffset -= 8; bitOffset += zz; } | ||
166 | #else | ||
167 | #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \ | ||
168 | for (;;) { bitOffset--; if (b & 0x80) { break; } b <<= 1; } | ||
169 | #endif | ||
170 | |||
171 | #define SET_bitOffset_TO_PAD(bitOffset, src, srcLen) \ | ||
172 | { \ | ||
173 | unsigned lastByte = (src)[(size_t)(srcLen) - 1]; \ | ||
174 | if (lastByte == 0) return SZ_ERROR_DATA; \ | ||
175 | bitOffset = (CBitCtr)((srcLen) * 8); \ | ||
176 | UPDATE_BIT_OFFSET_FOR_PADDING(lastByte, bitOffset) \ | ||
177 | } | ||
178 | |||
179 | #ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS | ||
180 | |||
181 | #define SET_bitOffset_TO_PAD_and_SET_BIT_SIZE(bitOffset, src, srcLen_res) \ | ||
182 | { \ | ||
183 | unsigned lastByte = (src)[(size_t)(srcLen_res) - 1]; \ | ||
184 | if (lastByte == 0) return SZ_ERROR_DATA; \ | ||
185 | srcLen_res *= 8; \ | ||
186 | bitOffset = (CBitCtr)srcLen_res; \ | ||
187 | UPDATE_BIT_OFFSET_FOR_PADDING(lastByte, bitOffset) \ | ||
188 | } | ||
189 | |||
190 | #endif | ||
191 | |||
192 | /* | ||
193 | typedef Int32 CBitCtr_signed; | ||
194 | typedef Int32 CBitCtr; | ||
195 | */ | ||
196 | // /* | ||
197 | typedef ptrdiff_t CBitCtr_signed; | ||
198 | typedef ptrdiff_t CBitCtr; | ||
199 | // */ | ||
200 | |||
201 | |||
202 | #define MATCH_LEN_MIN 3 | ||
203 | #define kBlockSizeMax (1u << 17) | ||
204 | |||
205 | // #define Z7_ZSTD_DEC_PRINT_TABLE | ||
206 | |||
207 | #ifdef Z7_ZSTD_DEC_PRINT_TABLE | ||
208 | #define NUM_OFFSET_SYMBOLS_PREDEF 29 | ||
209 | #endif | ||
210 | #define NUM_OFFSET_SYMBOLS_MAX (MAX_WINDOW_SIZE_LOG + 1) // 32 | ||
211 | #define NUM_LL_SYMBOLS 36 | ||
212 | #define NUM_ML_SYMBOLS 53 | ||
213 | #define FSE_NUM_SYMBOLS_MAX 53 // NUM_ML_SYMBOLS | ||
214 | |||
215 | // /* | ||
216 | #if !defined(MY_CPU_X86) || defined(__PIC__) || defined(MY_CPU_64BIT) | ||
217 | #define Z7_ZSTD_DEC_USE_BASES_IN_OBJECT | ||
218 | #endif | ||
219 | // */ | ||
220 | // for debug: | ||
221 | // #define Z7_ZSTD_DEC_USE_BASES_LOCAL | ||
222 | // #define Z7_ZSTD_DEC_USE_BASES_IN_OBJECT | ||
223 | |||
224 | #define GLOBAL_TABLE(n) k_ ## n | ||
225 | |||
226 | #if defined(Z7_ZSTD_DEC_USE_BASES_LOCAL) | ||
227 | #define BASES_TABLE(n) a_ ## n | ||
228 | #elif defined(Z7_ZSTD_DEC_USE_BASES_IN_OBJECT) | ||
229 | #define BASES_TABLE(n) p->m_ ## n | ||
230 | #else | ||
231 | #define BASES_TABLE(n) GLOBAL_TABLE(n) | ||
232 | #endif | ||
233 | |||
234 | #define Z7_ZSTD_DEC_USE_ML_PLUS3 | ||
235 | |||
236 | #if defined(Z7_ZSTD_DEC_USE_BASES_LOCAL) || \ | ||
237 | defined(Z7_ZSTD_DEC_USE_BASES_IN_OBJECT) | ||
238 | |||
239 | #define SEQ_EXTRA_TABLES(n) \ | ||
240 | Byte n ## SEQ_LL_EXTRA [NUM_LL_SYMBOLS]; \ | ||
241 | Byte n ## SEQ_ML_EXTRA [NUM_ML_SYMBOLS]; \ | ||
242 | UInt32 n ## SEQ_LL_BASES [NUM_LL_SYMBOLS]; \ | ||
243 | UInt32 n ## SEQ_ML_BASES [NUM_ML_SYMBOLS]; \ | ||
244 | |||
245 | #define Z7_ZSTD_DEC_USE_BASES_CALC | ||
246 | |||
247 | #ifdef Z7_ZSTD_DEC_USE_BASES_CALC | ||
248 | |||
249 | #define FILL_LOC_BASES(n, startSum) \ | ||
250 | { unsigned i; UInt32 sum = startSum; \ | ||
251 | for (i = 0; i != Z7_ARRAY_SIZE(GLOBAL_TABLE(n ## _EXTRA)); i++) \ | ||
252 | { const unsigned a = GLOBAL_TABLE(n ## _EXTRA)[i]; \ | ||
253 | BASES_TABLE(n ## _BASES)[i] = sum; \ | ||
254 | /* if (sum != GLOBAL_TABLE(n ## _BASES)[i]) exit(1); */ \ | ||
255 | sum += (UInt32)1 << a; \ | ||
256 | BASES_TABLE(n ## _EXTRA)[i] = (Byte)a; }} | ||
257 | |||
258 | #define FILL_LOC_BASES_ALL \ | ||
259 | FILL_LOC_BASES (SEQ_LL, 0) \ | ||
260 | FILL_LOC_BASES (SEQ_ML, MATCH_LEN_MIN) \ | ||
261 | |||
262 | #else | ||
263 | #define COPY_GLOBAL_ARR(n) \ | ||
264 | memcpy(BASES_TABLE(n), GLOBAL_TABLE(n), sizeof(GLOBAL_TABLE(n))); | ||
265 | #define FILL_LOC_BASES_ALL \ | ||
266 | COPY_GLOBAL_ARR (SEQ_LL_EXTRA) \ | ||
267 | COPY_GLOBAL_ARR (SEQ_ML_EXTRA) \ | ||
268 | COPY_GLOBAL_ARR (SEQ_LL_BASES) \ | ||
269 | COPY_GLOBAL_ARR (SEQ_ML_BASES) \ | ||
270 | |||
271 | #endif | ||
272 | |||
273 | #endif | ||
274 | |||
275 | |||
276 | |||
277 | /// The sequence decoding baseline and number of additional bits to read/add | ||
278 | #if !defined(Z7_ZSTD_DEC_USE_BASES_CALC) | ||
279 | static const UInt32 GLOBAL_TABLE(SEQ_LL_BASES) [NUM_LL_SYMBOLS] = | ||
280 | { | ||
281 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, | ||
282 | 16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, | ||
283 | 0x2000, 0x4000, 0x8000, 0x10000 | ||
284 | }; | ||
285 | #endif | ||
286 | |||
287 | static const Byte GLOBAL_TABLE(SEQ_LL_EXTRA) [NUM_LL_SYMBOLS] = | ||
288 | { | ||
289 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
290 | 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12, | ||
291 | 13, 14, 15, 16 | ||
292 | }; | ||
293 | |||
294 | #if !defined(Z7_ZSTD_DEC_USE_BASES_CALC) | ||
295 | static const UInt32 GLOBAL_TABLE(SEQ_ML_BASES) [NUM_ML_SYMBOLS] = | ||
296 | { | ||
297 | 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, | ||
298 | 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, | ||
299 | 35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, | ||
300 | 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 | ||
301 | }; | ||
302 | #endif | ||
303 | |||
304 | static const Byte GLOBAL_TABLE(SEQ_ML_EXTRA) [NUM_ML_SYMBOLS] = | ||
305 | { | ||
306 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
307 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
308 | 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, | ||
309 | 12, 13, 14, 15, 16 | ||
310 | }; | ||
311 | |||
312 | |||
313 | #ifdef Z7_ZSTD_DEC_PRINT_TABLE | ||
314 | |||
315 | static const Int16 SEQ_LL_PREDEF_DIST [NUM_LL_SYMBOLS] = | ||
316 | { | ||
317 | 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, | ||
318 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, | ||
319 | -1,-1,-1,-1 | ||
320 | }; | ||
321 | static const Int16 SEQ_OFFSET_PREDEF_DIST [NUM_OFFSET_SYMBOLS_PREDEF] = | ||
322 | { | ||
323 | 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, | ||
324 | 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1 | ||
325 | }; | ||
326 | static const Int16 SEQ_ML_PREDEF_DIST [NUM_ML_SYMBOLS] = | ||
327 | { | ||
328 | 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, | ||
329 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
330 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1, | ||
331 | -1,-1,-1,-1,-1 | ||
332 | }; | ||
333 | |||
334 | #endif | ||
335 | |||
336 | // typedef int FastInt; | ||
337 | // typedef Int32 FastInt32; | ||
338 | typedef unsigned FastInt; | ||
339 | typedef UInt32 FastInt32; | ||
340 | typedef FastInt32 CFseRecord; | ||
341 | |||
342 | |||
343 | #define FSE_REC_LEN_OFFSET 8 | ||
344 | #define FSE_REC_STATE_OFFSET 16 | ||
345 | #define GET_FSE_REC_SYM(st) ((Byte)(st)) | ||
346 | #define GET_FSE_REC_LEN(st) ((Byte)((st) >> FSE_REC_LEN_OFFSET)) | ||
347 | #define GET_FSE_REC_STATE(st) ((st) >> FSE_REC_STATE_OFFSET) | ||
348 | |||
349 | // #define FSE_REC_SYM_MASK (0xff) | ||
350 | // #define GET_FSE_REC_SYM(st) (st & FSE_REC_SYM_MASK) | ||
351 | |||
352 | #define W_BASE(state, len, sym) \ | ||
353 | (((UInt32)state << (4 + FSE_REC_STATE_OFFSET)) + \ | ||
354 | (len << FSE_REC_LEN_OFFSET) + (sym)) | ||
355 | #define W(state, len, sym) W_BASE(state, len, sym) | ||
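| /* Illustration only: how the packed fields of one record are read back. | ||
| For W(2,5,25) the record value is (2 << 20) + (5 << 8) + 25 = 0x200519. | ||
| A minimal sketch, assuming the 7-Zip UInt32/Byte types from the headers above: */ | ||
| #if 0 | ||
| #include <assert.h> | ||
| static void Fse_Record_Example(void) | ||
| { | ||
| const UInt32 rec = W(2, 5, 25); // == 0x200519 | ||
| assert(GET_FSE_REC_SYM(rec) == 25); // decoded symbol | ||
| assert(GET_FSE_REC_LEN(rec) == 5); // bits to read for the next state | ||
| assert(GET_FSE_REC_STATE(rec) == 32); // next-state baseline (decode adds the fresh bits to it) | ||
| } | ||
| #endif | ||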
356 | static const CFseRecord k_PredefRecords_LL[1 << 6] = { | ||
357 | W(0,4, 0),W(1,4, 0),W(2,5, 1),W(0,5, 3),W(0,5, 4),W(0,5, 6),W(0,5, 7),W(0,5, 9), | ||
358 | W(0,5,10),W(0,5,12),W(0,6,14),W(0,5,16),W(0,5,18),W(0,5,19),W(0,5,21),W(0,5,22), | ||
359 | W(0,5,24),W(2,5,25),W(0,5,26),W(0,6,27),W(0,6,29),W(0,6,31),W(2,4, 0),W(0,4, 1), | ||
360 | W(0,5, 2),W(2,5, 4),W(0,5, 5),W(2,5, 7),W(0,5, 8),W(2,5,10),W(0,5,11),W(0,6,13), | ||
361 | W(2,5,16),W(0,5,17),W(2,5,19),W(0,5,20),W(2,5,22),W(0,5,23),W(0,4,25),W(1,4,25), | ||
362 | W(2,5,26),W(0,6,28),W(0,6,30),W(3,4, 0),W(1,4, 1),W(2,5, 2),W(2,5, 3),W(2,5, 5), | ||
363 | W(2,5, 6),W(2,5, 8),W(2,5, 9),W(2,5,11),W(2,5,12),W(0,6,15),W(2,5,17),W(2,5,18), | ||
364 | W(2,5,20),W(2,5,21),W(2,5,23),W(2,5,24),W(0,6,35),W(0,6,34),W(0,6,33),W(0,6,32) | ||
365 | }; | ||
366 | static const CFseRecord k_PredefRecords_OF[1 << 5] = { | ||
367 | W(0,5, 0),W(0,4, 6),W(0,5, 9),W(0,5,15),W(0,5,21),W(0,5, 3),W(0,4, 7),W(0,5,12), | ||
368 | W(0,5,18),W(0,5,23),W(0,5, 5),W(0,4, 8),W(0,5,14),W(0,5,20),W(0,5, 2),W(1,4, 7), | ||
369 | W(0,5,11),W(0,5,17),W(0,5,22),W(0,5, 4),W(1,4, 8),W(0,5,13),W(0,5,19),W(0,5, 1), | ||
370 | W(1,4, 6),W(0,5,10),W(0,5,16),W(0,5,28),W(0,5,27),W(0,5,26),W(0,5,25),W(0,5,24) | ||
371 | }; | ||
372 | #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3) | ||
373 | #undef W | ||
374 | #define W(state, len, sym) W_BASE(state, len, (sym + MATCH_LEN_MIN)) | ||
375 | #endif | ||
376 | static const CFseRecord k_PredefRecords_ML[1 << 6] = { | ||
377 | W(0,6, 0),W(0,4, 1),W(2,5, 2),W(0,5, 3),W(0,5, 5),W(0,5, 6),W(0,5, 8),W(0,6,10), | ||
378 | W(0,6,13),W(0,6,16),W(0,6,19),W(0,6,22),W(0,6,25),W(0,6,28),W(0,6,31),W(0,6,33), | ||
379 | W(0,6,35),W(0,6,37),W(0,6,39),W(0,6,41),W(0,6,43),W(0,6,45),W(1,4, 1),W(0,4, 2), | ||
380 | W(2,5, 3),W(0,5, 4),W(2,5, 6),W(0,5, 7),W(0,6, 9),W(0,6,12),W(0,6,15),W(0,6,18), | ||
381 | W(0,6,21),W(0,6,24),W(0,6,27),W(0,6,30),W(0,6,32),W(0,6,34),W(0,6,36),W(0,6,38), | ||
382 | W(0,6,40),W(0,6,42),W(0,6,44),W(2,4, 1),W(3,4, 1),W(1,4, 2),W(2,5, 4),W(2,5, 5), | ||
383 | W(2,5, 7),W(2,5, 8),W(0,6,11),W(0,6,14),W(0,6,17),W(0,6,20),W(0,6,23),W(0,6,26), | ||
384 | W(0,6,29),W(0,6,52),W(0,6,51),W(0,6,50),W(0,6,49),W(0,6,48),W(0,6,47),W(0,6,46) | ||
385 | }; | ||
386 | |||
387 | |||
388 | // sum of freqs[] must be correct | ||
389 | // (numSyms != 0) | ||
390 | // (accuracy >= 5) | ||
391 | static | ||
392 | Z7_NO_INLINE | ||
393 | // Z7_FORCE_INLINE | ||
394 | void FSE_Generate(CFseRecord *table, | ||
395 | const Int16 *const freqs, const size_t numSyms, | ||
396 | const unsigned accuracy, UInt32 delta) | ||
397 | { | ||
398 | size_t size = (size_t)1 << accuracy; | ||
399 | // max value in states[x] is ((1 << accuracy) * 2) | ||
400 | UInt16 states[FSE_NUM_SYMBOLS_MAX]; | ||
401 | { | ||
402 | /* Symbols with "less than 1" probability get a single cell, | ||
403 | starting from the end of the table. | ||
404 | These symbols define a full state reset, reading (accuracy) bits. */ | ||
405 | size_t threshold = size; | ||
406 | { | ||
407 | size_t s = 0; | ||
408 | do | ||
409 | if (freqs[s] == -1) | ||
410 | { | ||
411 | table[--threshold] = (CFseRecord)s; | ||
412 | states[s] = 1; | ||
413 | } | ||
414 | while (++s != numSyms); | ||
415 | } | ||
416 | |||
417 | #ifdef SHOW_STAT | ||
418 | if (threshold == size) | ||
419 | { | ||
420 | STAT_INC(g_Num_Threshold_0) | ||
421 | STAT_UPDATE(g_Num_Threshold_0sum += (unsigned)size;) | ||
422 | } | ||
423 | else | ||
424 | { | ||
425 | STAT_INC(g_Num_Threshold_1) | ||
426 | STAT_UPDATE(g_Num_Threshold_1sum += (unsigned)size;) | ||
427 | } | ||
428 | #endif | ||
429 | |||
430 | // { unsigned uuu; for (uuu = 0; uuu < 400; uuu++) | ||
431 | { | ||
432 | // Each (symbol) gets freqs[symbol] cells. | ||
433 | // Cell allocation is spread, not linear. | ||
434 | const size_t step = (size >> 1) + (size >> 3) + 3; | ||
435 | size_t pos = 0; | ||
436 | // const unsigned mask = size - 1; | ||
437 | /* | ||
438 | if (threshold == size) | ||
439 | { | ||
440 | size_t s = 0; | ||
441 | size--; | ||
442 | do | ||
443 | { | ||
444 | int freq = freqs[s]; | ||
445 | if (freq <= 0) | ||
446 | continue; | ||
447 | states[s] = (UInt16)freq; | ||
448 | do | ||
449 | { | ||
450 | table[pos] = (CFseRecord)s; | ||
451 | pos = (pos + step) & size; // & mask; | ||
452 | } | ||
453 | while (--freq); | ||
454 | } | ||
455 | while (++s != numSyms); | ||
456 | } | ||
457 | else | ||
458 | */ | ||
459 | { | ||
460 | size_t s = 0; | ||
461 | size--; | ||
462 | do | ||
463 | { | ||
464 | int freq = freqs[s]; | ||
465 | if (freq <= 0) | ||
466 | continue; | ||
467 | states[s] = (UInt16)freq; | ||
468 | do | ||
469 | { | ||
470 | table[pos] = (CFseRecord)s; | ||
471 | // we skip a position if it's already occupied by a "less than 1" probability symbol. | ||
472 | // (step) is coprime to the table size, so the cycle will visit each position exactly once. | ||
473 | do | ||
474 | pos = (pos + step) & size; // & mask; | ||
475 | while (pos >= threshold); | ||
476 | } | ||
477 | while (--freq); | ||
478 | } | ||
479 | while (++s != numSyms); | ||
480 | } | ||
481 | size++; | ||
482 | // (pos != 0) is an unexpected case that means freqs[] are not correct, | ||
483 | // i.e. some failure in the code (for example, an incorrect predefined freq[] table). | ||
484 | // if (pos != 0) return SZ_ERROR_FAIL; | ||
485 | } | ||
486 | // } | ||
487 | } | ||
488 | { | ||
489 | const CFseRecord * const limit = table + size; | ||
490 | delta = ((UInt32)size << FSE_REC_STATE_OFFSET) - delta; | ||
491 | /* State increases by symbol over time, decreasing number of bits. | ||
492 | Baseline increases until the bit threshold is passed, at which point it resets to 0 */ | ||
493 | do | ||
494 | { | ||
495 | #define TABLE_ITER(a) \ | ||
496 | { \ | ||
497 | const FastInt sym = (FastInt)table[a]; \ | ||
498 | const unsigned nextState = states[sym]; \ | ||
499 | unsigned nb; \ | ||
500 | states[sym] = (UInt16)(nextState + 1); \ | ||
501 | nb = accuracy - GetHighestSetBit_32_nonzero_small(nextState); \ | ||
502 | table[a] = (CFseRecord)(sym - delta \ | ||
503 | + ((UInt32)nb << FSE_REC_LEN_OFFSET) \ | ||
504 | + ((UInt32)nextState << FSE_REC_STATE_OFFSET << nb)); \ | ||
505 | } | ||
506 | TABLE_ITER(0) | ||
507 | TABLE_ITER(1) | ||
508 | table += 2; | ||
509 | } | ||
510 | while (table != limit); | ||
511 | } | ||
512 | } | ||
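| /* Illustration only: a standalone check that (step) above is odd, hence | ||
| coprime to the power-of-2 table size, so the spread loop visits every | ||
| cell exactly once for the accuracies (5..9) used by this decoder: */ | ||
| #if 0 | ||
| #include <stdio.h> | ||
| int main(void) | ||
| { | ||
| unsigned acc; | ||
| for (acc = 5; acc <= 9; acc++) | ||
| { | ||
| const unsigned size = 1u << acc; | ||
| const unsigned step = (size >> 1) + (size >> 3) + 3; | ||
| unsigned pos = 0, n = 0; | ||
| do { pos = (pos + step) & (size - 1); n++; } while (pos != 0); | ||
| printf("acc=%u: cycle %u of %u\n", acc, n, size); // n == size for every acc | ||
| } | ||
| return 0; | ||
| } | ||
| #endif | ||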
513 | |||
514 | |||
515 | #ifdef Z7_ZSTD_DEC_PRINT_TABLE | ||
516 | |||
517 | static void Print_Predef(unsigned predefAccuracy, | ||
518 | const unsigned numSymsPredef, | ||
519 | const Int16 * const predefFreqs, | ||
520 | const CFseRecord *checkTable) | ||
521 | { | ||
522 | CFseRecord table[1 << 6]; | ||
523 | unsigned i; | ||
524 | FSE_Generate(table, predefFreqs, numSymsPredef, predefAccuracy, | ||
525 | #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3) | ||
526 | numSymsPredef == NUM_ML_SYMBOLS ? MATCH_LEN_MIN : | ||
527 | #endif | ||
528 | 0 | ||
529 | ); | ||
530 | if (memcmp(table, checkTable, sizeof(UInt32) << predefAccuracy) != 0) | ||
531 | exit(1); | ||
532 | for (i = 0; i < (1u << predefAccuracy); i++) | ||
533 | { | ||
534 | const UInt32 v = table[i]; | ||
535 | const unsigned state = (unsigned)(GET_FSE_REC_STATE(v)); | ||
536 | if (state & 0xf) | ||
537 | exit(1); | ||
538 | if (i != 0) | ||
539 | { | ||
540 | printf(","); | ||
541 | if (i % 8 == 0) | ||
542 | printf("\n"); | ||
543 | } | ||
544 | printf("W(%d,%d,%2d)", | ||
545 | (unsigned)(state >> 4), | ||
546 | (unsigned)((v >> FSE_REC_LEN_OFFSET) & 0xff), | ||
547 | (unsigned)GET_FSE_REC_SYM(v)); | ||
548 | } | ||
549 | printf("\n\n"); | ||
550 | } | ||
551 | |||
552 | #endif | ||
553 | |||
554 | |||
555 | #define GET16(dest, p) { const Byte *ptr = p; dest = GetUi16(ptr); } | ||
556 | #define GET32(dest, p) { const Byte *ptr = p; dest = GetUi32(ptr); } | ||
557 | |||
558 | // (1 <= numBits <= 9) | ||
559 | #define FORWARD_READ_BITS(destVal, numBits, mask) \ | ||
560 | { const CBitCtr_signed bos3 = (bitOffset) >> 3; \ | ||
561 | if (bos3 >= 0) return SZ_ERROR_DATA; \ | ||
562 | GET16(destVal, src + bos3) \ | ||
563 | destVal >>= (bitOffset) & 7; \ | ||
564 | bitOffset += (CBitCtr_signed)(numBits); \ | ||
565 | mask = (1u << (numBits)) - 1; \ | ||
566 | destVal &= mask; \ | ||
567 | } | ||
568 | |||
569 | #define FORWARD_READ_1BIT(destVal) \ | ||
570 | { const CBitCtr_signed bos3 = (bitOffset) >> 3; \ | ||
571 | if (bos3 >= 0) return SZ_ERROR_DATA; \ | ||
572 | destVal = *(src + bos3); \ | ||
573 | destVal >>= (bitOffset) & 7; \ | ||
574 | (bitOffset)++; \ | ||
575 | destVal &= 1; \ | ||
576 | } | ||
577 | |||
578 | |||
579 | // in: (accuracyMax <= 9) | ||
580 | // at least 2 bytes will be processed from (in) stream. | ||
581 | // at return: (in->len > 0) | ||
582 | static | ||
583 | Z7_NO_INLINE | ||
584 | SRes FSE_DecodeHeader(CFseRecord *const table, | ||
585 | CInBufPair *const in, | ||
586 | const unsigned accuracyMax, | ||
587 | Byte *const accuracyRes, | ||
588 | unsigned numSymbolsMax) | ||
589 | { | ||
590 | unsigned accuracy; | ||
591 | unsigned remain1; | ||
592 | unsigned syms; | ||
593 | Int16 freqs[FSE_NUM_SYMBOLS_MAX + 3]; // +3 for overwrite (repeat) | ||
594 | const Byte *src = in->ptr; | ||
595 | CBitCtr_signed bitOffset = (CBitCtr_signed)in->len - 1; | ||
596 | if (bitOffset <= 0) | ||
597 | return SZ_ERROR_DATA; | ||
598 | accuracy = *src & 0xf; | ||
599 | accuracy += 5; | ||
600 | if (accuracy > accuracyMax) | ||
601 | return SZ_ERROR_DATA; | ||
602 | *accuracyRes = (Byte)accuracy; | ||
603 | remain1 = (1u << accuracy) + 1; // (it's remain_freqs_sum + 1) | ||
604 | syms = 0; | ||
605 | src += bitOffset; // src points to last byte | ||
606 | bitOffset = 4 - (bitOffset << 3); | ||
607 | |||
608 | for (;;) | ||
609 | { | ||
610 | // (2 <= remain1) | ||
611 | const unsigned bits = GetHighestSetBit_32_nonzero_small((unsigned)remain1); | ||
612 | // (1 <= bits <= accuracy) | ||
613 | unsigned val; // it must be unsigned or int | ||
614 | unsigned mask; | ||
615 | FORWARD_READ_BITS(val, bits, mask) | ||
616 | { | ||
617 | const unsigned val2 = remain1 + val - mask; | ||
618 | if (val2 > mask) | ||
619 | { | ||
620 | unsigned bit; | ||
621 | FORWARD_READ_1BIT(bit) | ||
622 | if (bit) | ||
623 | val = val2; | ||
624 | } | ||
625 | } | ||
626 | { | ||
627 | // (remain1 >= 2) | ||
628 | // (0 <= (int)val <= remain1) | ||
629 | val = (unsigned)((int)val - 1); | ||
630 | // val now is "probability" of symbol | ||
631 | // (probability == -1) means "less than 1" frequency. | ||
632 | // (-1 <= (int)val <= remain1 - 1) | ||
633 | freqs[syms++] = (Int16)(int)val; | ||
634 | if (val != 0) | ||
635 | { | ||
636 | remain1 -= (int)val < 0 ? 1u : (unsigned)val; | ||
637 | // remain1 -= val; | ||
638 | // val >>= (sizeof(val) * 8 - 2); | ||
639 | // remain1 -= val & 2; | ||
640 | // freqs[syms++] = (Int16)(int)val; | ||
641 | // syms++; | ||
642 | if (remain1 == 1) | ||
643 | break; | ||
644 | if (syms >= FSE_NUM_SYMBOLS_MAX) | ||
645 | return SZ_ERROR_DATA; | ||
646 | } | ||
647 | else // if (val == 0) | ||
648 | { | ||
649 | // freqs[syms++] = 0; | ||
650 | // syms++; | ||
651 | for (;;) | ||
652 | { | ||
653 | unsigned repeat; | ||
654 | FORWARD_READ_BITS(repeat, 2, mask) | ||
655 | freqs[syms ] = 0; | ||
656 | freqs[syms + 1] = 0; | ||
657 | freqs[syms + 2] = 0; | ||
658 | syms += repeat; | ||
659 | if (syms >= FSE_NUM_SYMBOLS_MAX) | ||
660 | return SZ_ERROR_DATA; | ||
661 | if (repeat != 3) | ||
662 | break; | ||
663 | } | ||
664 | } | ||
665 | } | ||
666 | } | ||
667 | |||
668 | if (syms > numSymbolsMax) | ||
669 | return SZ_ERROR_DATA; | ||
670 | bitOffset += 7; | ||
671 | bitOffset >>= 3; | ||
672 | if (bitOffset > 0) | ||
673 | return SZ_ERROR_DATA; | ||
674 | in->ptr = src + bitOffset; | ||
675 | in->len = (size_t)(1 - bitOffset); | ||
676 | { | ||
677 | // unsigned uuu; for (uuu = 0; uuu < 50; uuu++) | ||
678 | FSE_Generate(table, freqs, syms, accuracy, | ||
679 | #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3) | ||
680 | numSymbolsMax == NUM_ML_SYMBOLS ? MATCH_LEN_MIN : | ||
681 | #endif | ||
682 | 0 | ||
683 | ); | ||
684 | } | ||
685 | return SZ_OK; | ||
686 | } | ||
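| /* Illustration only: a worked example of the adaptive-width read above. | ||
| With accuracy == 5, remain1 starts at 33, so bits = 5 and mask = 31. | ||
| A 5-bit val in [0, 29] is taken as-is. For val in {30, 31} we get | ||
| val2 = remain1 + val - mask = val + 2 > mask, so one extra bit is read; | ||
| if it is set, val becomes val2 (32 or 33). That encodes all 34 values | ||
| 0..remain1, spending a 6th bit only on the top four codes. */ | ||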
687 | |||
688 | |||
689 | // ---------- HUFFMAN ---------- | ||
690 | |||
691 | #define HUF_MAX_BITS 12 | ||
692 | #define HUF_MAX_SYMBS 256 | ||
693 | #define HUF_DUMMY_SIZE (128 + 8 * 2) // it must be a multiple of 8 | ||
694 | // #define HUF_DUMMY_SIZE 0 | ||
695 | #define HUF_TABLE_SIZE ((2 << HUF_MAX_BITS) + HUF_DUMMY_SIZE) | ||
696 | #define HUF_GET_SYMBOLS(table) ((table) + (1 << HUF_MAX_BITS) + HUF_DUMMY_SIZE) | ||
697 | // #define HUF_GET_LENS(table) (table) | ||
698 | |||
699 | typedef struct | ||
700 | { | ||
701 | // Byte table[HUF_TABLE_SIZE]; | ||
702 | UInt64 table64[HUF_TABLE_SIZE / sizeof(UInt64)]; | ||
703 | } | ||
704 | CZstdDecHufTable; | ||
705 | |||
706 | /* | ||
707 | Input: | ||
708 | numSyms != 0 | ||
709 | the (bits) array size must be aligned to 2; | ||
710 | if (numSyms & 1), then bits[numSyms] == 0. | ||
711 | The Huffman tree must be correct before the Huf_Build() call: | ||
712 | (sum (1/2^bits[i]) == 1) | ||
713 | && (bits[i] <= HUF_MAX_BITS) | ||
714 | */ | ||
715 | static | ||
716 | Z7_FORCE_INLINE | ||
717 | void Huf_Build(Byte * const table, | ||
718 | const Byte *bits, const unsigned numSyms) | ||
719 | { | ||
720 | unsigned counts0[HUF_MAX_BITS + 1]; | ||
721 | unsigned counts1[HUF_MAX_BITS + 1]; | ||
722 | const Byte * const bitsEnd = bits + numSyms; | ||
723 | // /* | ||
724 | { | ||
725 | unsigned t; | ||
726 | for (t = 0; t < Z7_ARRAY_SIZE(counts0); t++) counts0[t] = 0; | ||
727 | for (t = 0; t < Z7_ARRAY_SIZE(counts1); t++) counts1[t] = 0; | ||
728 | } | ||
729 | // */ | ||
730 | // memset(counts0, 0, sizeof(counts0)); | ||
731 | // memset(counts1, 0, sizeof(counts1)); | ||
732 | { | ||
733 | const Byte *bits2 = bits; | ||
734 | // we read the extra bits[numSyms] entry if (numSyms & 1) | ||
735 | do | ||
736 | { | ||
737 | counts0[bits2[0]]++; | ||
738 | counts1[bits2[1]]++; | ||
739 | } | ||
740 | while ((bits2 += 2) < bitsEnd); | ||
741 | } | ||
742 | { | ||
743 | unsigned r = 0; | ||
744 | unsigned i = HUF_MAX_BITS; | ||
745 | // Byte *lens = HUF_GET_LENS(symbols); | ||
746 | do | ||
747 | { | ||
748 | const unsigned num = (counts0[i] + counts1[i]) << (HUF_MAX_BITS - i); | ||
749 | counts0[i] = r; | ||
750 | if (num) | ||
751 | { | ||
752 | Byte *lens = &table[r]; | ||
753 | r += num; | ||
754 | memset(lens, (int)i, num); | ||
755 | } | ||
756 | } | ||
757 | while (--i); | ||
758 | counts0[0] = 0; // for speculative loads | ||
759 | // no need for check: | ||
760 | // if (r != (UInt32)1 << HUF_MAX_BITS) exit(0); | ||
761 | } | ||
762 | { | ||
763 | #ifdef MY_CPU_64BIT | ||
764 | UInt64 | ||
765 | #else | ||
766 | UInt32 | ||
767 | #endif | ||
768 | v = 0; | ||
769 | Byte *symbols = HUF_GET_SYMBOLS(table); | ||
770 | do | ||
771 | { | ||
772 | const unsigned nb = *bits++; | ||
773 | if (nb) | ||
774 | { | ||
775 | const unsigned code = counts0[nb]; | ||
776 | const unsigned num = (1u << HUF_MAX_BITS) >> nb; | ||
777 | counts0[nb] = code + num; | ||
778 | // memset(&symbols[code], i, num); | ||
779 | // /* | ||
780 | { | ||
781 | Byte *s2 = &symbols[code]; | ||
782 | if (num <= 2) | ||
783 | { | ||
784 | s2[0] = (Byte)v; | ||
785 | s2[(size_t)num - 1] = (Byte)v; | ||
786 | } | ||
787 | else if (num <= 8) | ||
788 | { | ||
789 | *(UInt32 *)(void *)s2 = (UInt32)v; | ||
790 | *(UInt32 *)(void *)(s2 + (size_t)num - 4) = (UInt32)v; | ||
791 | } | ||
792 | else | ||
793 | { | ||
794 | #ifdef MY_CPU_64BIT | ||
795 | UInt64 *s = (UInt64 *)(void *)s2; | ||
796 | const UInt64 *lim = (UInt64 *)(void *)(s2 + num); | ||
797 | do | ||
798 | { | ||
799 | s[0] = v; s[1] = v; s += 2; | ||
800 | } | ||
801 | while (s != lim); | ||
802 | #else | ||
803 | UInt32 *s = (UInt32 *)(void *)s2; | ||
804 | const UInt32 *lim = (const UInt32 *)(const void *)(s2 + num); | ||
805 | do | ||
806 | { | ||
807 | s[0] = v; s[1] = v; s += 2; | ||
808 | s[0] = v; s[1] = v; s += 2; | ||
809 | } | ||
810 | while (s != lim); | ||
811 | #endif | ||
812 | } | ||
813 | } | ||
814 | // */ | ||
815 | } | ||
816 | v += | ||
817 | #ifdef MY_CPU_64BIT | ||
818 | 0x0101010101010101; | ||
819 | #else | ||
820 | 0x01010101; | ||
821 | #endif | ||
822 | } | ||
823 | while (bits != bitsEnd); | ||
824 | } | ||
825 | } | ||
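| /* Illustration only: Huf_Build() fills a flat single-lookup table: | ||
| table[state] holds the code length and HUF_GET_SYMBOLS(table)[state] the | ||
| symbol, where (state) is the next HUF_MAX_BITS (12) bits of the stream. | ||
| A hypothetical helper (not used by the decoder) showing one decode step: */ | ||
| #if 0 | ||
| static unsigned Huf_DecodeOne(const Byte *t, UInt32 state12, unsigned *numBits) | ||
| { | ||
| *numBits = t[state12]; // bits consumed by this symbol | ||
| return HUF_GET_SYMBOLS(t)[state12]; // the decoded symbol | ||
| } | ||
| #endif | ||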
826 | |||
827 | |||
828 | |||
829 | // how many bytes (src) was moved back from its original value. | ||
830 | // we need (HUF_SRC_OFFSET == 3) for optimized 32-bit memory access | ||
831 | #define HUF_SRC_OFFSET 3 | ||
832 | |||
833 | // v <<= 8 - (bitOffset & 7) + numBits; | ||
834 | // v >>= 32 - HUF_MAX_BITS; | ||
835 | #define HUF_GET_STATE(v, bitOffset, numBits) \ | ||
836 | GET32(v, src + (HUF_SRC_OFFSET - 3) + ((CBitCtr_signed)bitOffset >> 3)) \ | ||
837 | v >>= 32 - HUF_MAX_BITS - 8 + ((unsigned)bitOffset & 7) - numBits; \ | ||
838 | v &= (1u << HUF_MAX_BITS) - 1; \ | ||
839 | |||
840 | |||
841 | #ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS | ||
842 | #if defined(MY_CPU_AMD64) && defined(_MSC_VER) && _MSC_VER == 1400 \ | ||
843 | || !defined(MY_CPU_X86_OR_AMD64) \ | ||
844 | // || 1 == 1 /* for debug : to force STREAM4_PRELOAD mode */ | ||
845 | // we need a big number (>= 16) of registers for PRELOAD4 | ||
846 | #define Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4 | ||
847 | // #define Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2 // for debug | ||
848 | #endif | ||
849 | #endif | ||
850 | |||
851 | // for debug: simpler and smaller code but slow: | ||
852 | // #define Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE | ||
853 | |||
854 | #if defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE) || \ | ||
855 | !defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS) | ||
856 | |||
857 | #define HUF_DECODE(bitOffset, dest) \ | ||
858 | { \ | ||
859 | UInt32 v; \ | ||
860 | HUF_GET_STATE(v, bitOffset, 0) \ | ||
861 | bitOffset -= table[v]; \ | ||
862 | *(dest) = symbols[v]; \ | ||
863 | if ((CBitCtr_signed)bitOffset < 0) return SZ_ERROR_DATA; \ | ||
864 | } | ||
865 | |||
866 | #endif | ||
867 | |||
868 | #if !defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE) || \ | ||
869 | defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4) || \ | ||
870 | defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2) \ | ||
871 | |||
872 | #define HUF_DECODE_2_INIT(v, bitOffset) \ | ||
873 | HUF_GET_STATE(v, bitOffset, 0) | ||
874 | |||
875 | #define HUF_DECODE_2(v, bitOffset, dest) \ | ||
876 | { \ | ||
877 | unsigned numBits; \ | ||
878 | numBits = table[v]; \ | ||
879 | *(dest) = symbols[v]; \ | ||
880 | HUF_GET_STATE(v, bitOffset, numBits) \ | ||
881 | bitOffset -= (CBitCtr)numBits; \ | ||
882 | if ((CBitCtr_signed)bitOffset < 0) return SZ_ERROR_DATA; \ | ||
883 | } | ||
884 | |||
885 | #endif | ||
886 | |||
887 | |||
888 | // src == ptr - HUF_SRC_OFFSET | ||
889 | // we are allowed to access 3 bytes before start of input buffer | ||
890 | static | ||
891 | Z7_NO_INLINE | ||
892 | SRes Huf_Decompress_1stream(const Byte * const table, | ||
893 | const Byte *src, const size_t srcLen, | ||
894 | Byte *dest, const size_t destLen) | ||
895 | { | ||
896 | CBitCtr bitOffset; | ||
897 | if (srcLen == 0) | ||
898 | return SZ_ERROR_DATA; | ||
899 | SET_bitOffset_TO_PAD (bitOffset, src + HUF_SRC_OFFSET, srcLen) | ||
900 | if (destLen) | ||
901 | { | ||
902 | const Byte *symbols = HUF_GET_SYMBOLS(table); | ||
903 | const Byte *destLim = dest + destLen; | ||
904 | #ifdef Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE | ||
905 | { | ||
906 | do | ||
907 | { | ||
908 | HUF_DECODE (bitOffset, dest) | ||
909 | } | ||
910 | while (++dest != destLim); | ||
911 | } | ||
912 | #else | ||
913 | { | ||
914 | UInt32 v; | ||
915 | HUF_DECODE_2_INIT (v, bitOffset) | ||
916 | do | ||
917 | { | ||
918 | HUF_DECODE_2 (v, bitOffset, dest) | ||
919 | } | ||
920 | while (++dest != destLim); | ||
921 | } | ||
922 | #endif | ||
923 | } | ||
924 | return bitOffset == 0 ? SZ_OK : SZ_ERROR_DATA; | ||
925 | } | ||
926 | |||
927 | |||
928 | // for debug: it reduces register pressure, but the array copy can be slow: | ||
929 | // #define Z7_ZSTD_DEC_USE_HUF_LOCAL | ||
930 | |||
931 | // src == ptr + (6 - HUF_SRC_OFFSET) | ||
932 | // srcLen >= 10 | ||
933 | // we are allowed to access 3 bytes before start of input buffer | ||
934 | static | ||
935 | Z7_NO_INLINE | ||
936 | SRes Huf_Decompress_4stream(const Byte * const | ||
937 | #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL | ||
938 | table2, | ||
939 | #else | ||
940 | table, | ||
941 | #endif | ||
942 | const Byte *src, size_t srcLen, | ||
943 | Byte *dest, size_t destLen) | ||
944 | { | ||
945 | #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL | ||
946 | Byte table[HUF_TABLE_SIZE]; | ||
947 | #endif | ||
948 | UInt32 sizes[3]; | ||
949 | const size_t delta = (destLen + 3) / 4; | ||
950 | if ((sizes[0] = GetUi16(src + (0 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA; | ||
951 | if ((sizes[1] = GetUi16(src + (2 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA; | ||
952 | sizes[1] += sizes[0]; | ||
953 | if ((sizes[2] = GetUi16(src + (4 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA; | ||
954 | sizes[2] += sizes[1]; | ||
955 | srcLen -= 6; | ||
956 | if (srcLen <= sizes[2]) | ||
957 | return SZ_ERROR_DATA; | ||
958 | |||
959 | #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL | ||
960 | { | ||
961 | // unsigned i = 0; for(; i < 1000; i++) | ||
962 | memcpy(table, table2, HUF_TABLE_SIZE); | ||
963 | } | ||
964 | #endif | ||
965 | |||
966 | #ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS | ||
967 | { | ||
968 | CBitCtr bitOffset_0, | ||
969 | bitOffset_1, | ||
970 | bitOffset_2, | ||
971 | bitOffset_3; | ||
972 | { | ||
973 | SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_0, src + HUF_SRC_OFFSET, sizes[0]) | ||
974 | SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_1, src + HUF_SRC_OFFSET, sizes[1]) | ||
975 | SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_2, src + HUF_SRC_OFFSET, sizes[2]) | ||
976 | SET_bitOffset_TO_PAD (bitOffset_3, src + HUF_SRC_OFFSET, srcLen) | ||
977 | } | ||
978 | { | ||
979 | const Byte * const symbols = HUF_GET_SYMBOLS(table); | ||
980 | Byte *destLim = dest + destLen - delta * 3; | ||
981 | |||
982 | if (dest != destLim) | ||
983 | #ifdef Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4 | ||
984 | { | ||
985 | UInt32 v_0, v_1, v_2, v_3; | ||
986 | HUF_DECODE_2_INIT (v_0, bitOffset_0) | ||
987 | HUF_DECODE_2_INIT (v_1, bitOffset_1) | ||
988 | HUF_DECODE_2_INIT (v_2, bitOffset_2) | ||
989 | HUF_DECODE_2_INIT (v_3, bitOffset_3) | ||
990 | // #define HUF_DELTA (1 << 17) / 4 | ||
991 | do | ||
992 | { | ||
993 | HUF_DECODE_2 (v_3, bitOffset_3, dest + delta * 3) | ||
994 | HUF_DECODE_2 (v_2, bitOffset_2, dest + delta * 2) | ||
995 | HUF_DECODE_2 (v_1, bitOffset_1, dest + delta) | ||
996 | HUF_DECODE_2 (v_0, bitOffset_0, dest) | ||
997 | } | ||
998 | while (++dest != destLim); | ||
999 | /* | ||
1000 | {// unsigned y = 0; for (;y < 1; y++) | ||
1001 | { | ||
1002 | const size_t num = destLen - delta * 3; | ||
1003 | Byte *orig = dest - num; | ||
1004 | memmove (orig + delta , orig + HUF_DELTA, num); | ||
1005 | memmove (orig + delta * 2, orig + HUF_DELTA * 2, num); | ||
1006 | memmove (orig + delta * 3, orig + HUF_DELTA * 3, num); | ||
1007 | }} | ||
1008 | */ | ||
1009 | } | ||
1010 | #elif defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2) | ||
1011 | { | ||
1012 | UInt32 v_0, v_1, v_2, v_3; | ||
1013 | HUF_DECODE_2_INIT (v_0, bitOffset_0) | ||
1014 | HUF_DECODE_2_INIT (v_1, bitOffset_1) | ||
1015 | do | ||
1016 | { | ||
1017 | HUF_DECODE_2 (v_0, bitOffset_0, dest) | ||
1018 | HUF_DECODE_2 (v_1, bitOffset_1, dest + delta) | ||
1019 | } | ||
1020 | while (++dest != destLim); | ||
1021 | dest = destLim - (destLen - delta * 3); | ||
1022 | dest += delta * 2; | ||
1023 | destLim += delta * 2; | ||
1024 | HUF_DECODE_2_INIT (v_2, bitOffset_2) | ||
1025 | HUF_DECODE_2_INIT (v_3, bitOffset_3) | ||
1026 | do | ||
1027 | { | ||
1028 | HUF_DECODE_2 (v_2, bitOffset_2, dest) | ||
1029 | HUF_DECODE_2 (v_3, bitOffset_3, dest + delta) | ||
1030 | } | ||
1031 | while (++dest != destLim); | ||
1032 | dest -= delta * 2; | ||
1033 | destLim -= delta * 2; | ||
1034 | } | ||
1035 | #else | ||
1036 | { | ||
1037 | do | ||
1038 | { | ||
1039 | HUF_DECODE (bitOffset_3, dest + delta * 3) | ||
1040 | HUF_DECODE (bitOffset_2, dest + delta * 2) | ||
1041 | HUF_DECODE (bitOffset_1, dest + delta) | ||
1042 | HUF_DECODE (bitOffset_0, dest) | ||
1043 | } | ||
1044 | while (++dest != destLim); | ||
1045 | } | ||
1046 | #endif | ||
1047 | |||
1048 | if (bitOffset_3 != (CBitCtr)sizes[2]) | ||
1049 | return SZ_ERROR_DATA; | ||
1050 | if (destLen &= 3) | ||
1051 | { | ||
1052 | destLim = dest + 4 - destLen; | ||
1053 | do | ||
1054 | { | ||
1055 | HUF_DECODE (bitOffset_2, dest + delta * 2) | ||
1056 | HUF_DECODE (bitOffset_1, dest + delta) | ||
1057 | HUF_DECODE (bitOffset_0, dest) | ||
1058 | } | ||
1059 | while (++dest != destLim); | ||
1060 | } | ||
1061 | if ( bitOffset_0 != 0 | ||
1062 | || bitOffset_1 != (CBitCtr)sizes[0] | ||
1063 | || bitOffset_2 != (CBitCtr)sizes[1]) | ||
1064 | return SZ_ERROR_DATA; | ||
1065 | } | ||
1066 | } | ||
1067 | #else // Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS | ||
1068 | { | ||
1069 | unsigned i; | ||
1070 | for (i = 0; i < 4; i++) | ||
1071 | { | ||
1072 | size_t d = destLen; | ||
1073 | size_t size = srcLen; | ||
1074 | if (i != 3) | ||
1075 | { | ||
1076 | d = delta; | ||
1077 | size = sizes[i]; | ||
1078 | } | ||
1079 | if (i != 0) | ||
1080 | size -= sizes[i - 1]; | ||
1081 | destLen -= d; | ||
1082 | RINOK(Huf_Decompress_1stream(table, src, size, dest, d)) | ||
1083 | dest += d; | ||
1084 | src += size; | ||
1085 | } | ||
1086 | } | ||
1087 | #endif | ||
1088 | |||
1089 | return SZ_OK; | ||
1090 | } | ||
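| /* Illustration only: the 4-stream layout starts with a 6-byte jump table | ||
| holding the compressed sizes of streams 1..3; the 4th size is implied. | ||
| A minimal standalone sketch (hypothetical SplitLiterals4 helper) of the | ||
| same validation that the code above performs on the jump table: */ | ||
| #if 0 | ||
| #include <stddef.h> | ||
| /* returns 0 on success; sizes_out[0..3] receive the four stream sizes */ | ||
| static int SplitLiterals4(const unsigned char *src, size_t srcLen, size_t sizes_out[4]) | ||
| { | ||
| size_t s0, s1, s2; | ||
| if (srcLen < 10) return 1; // (srcLen >= 10) as required above | ||
| s0 = (size_t)src[0] | ((size_t)src[1] << 8); // GetUi16(), little-endian | ||
| s1 = (size_t)src[2] | ((size_t)src[3] << 8); | ||
| s2 = (size_t)src[4] | ((size_t)src[5] << 8); | ||
| if (s0 == 0 || s1 == 0 || s2 == 0) return 1; | ||
| if (srcLen - 6 <= s0 + s1 + s2) return 1; // the 4th stream must be non-empty | ||
| sizes_out[0] = s0; sizes_out[1] = s1; sizes_out[2] = s2; | ||
| sizes_out[3] = srcLen - 6 - (s0 + s1 + s2); | ||
| return 0; | ||
| } | ||
| #endif | ||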
1091 | |||
1092 | |||
1093 | |||
1094 | // (in->len != 0) | ||
1095 | // we are allowed to access in->ptr[-3] | ||
1096 | // at least 2 bytes in (in->ptr) will be processed | ||
1097 | static SRes Huf_DecodeTable(CZstdDecHufTable *const p, CInBufPair *const in) | ||
1098 | { | ||
1099 | Byte weights[HUF_MAX_SYMBS + 1]; // +1 for the extra write used by loop unrolling | ||
1100 | unsigned numSyms; | ||
1101 | const unsigned header = *(in->ptr)++; | ||
1102 | in->len--; | ||
1103 | // memset(weights, 0, sizeof(weights)); | ||
1104 | if (header >= 128) | ||
1105 | { | ||
1106 | // direct representation: 4-bit field (0-15) per weight | ||
1107 | numSyms = header - 127; | ||
1108 | // numSyms != 0 | ||
1109 | { | ||
1110 | const size_t numBytes = (numSyms + 1) / 2; | ||
1111 | const Byte *const ws = in->ptr; | ||
1112 | size_t i = 0; | ||
1113 | if (in->len < numBytes) | ||
1114 | return SZ_ERROR_DATA; | ||
1115 | in->ptr += numBytes; | ||
1116 | in->len -= numBytes; | ||
1117 | do | ||
1118 | { | ||
1119 | const unsigned b = ws[i]; | ||
1120 | weights[i * 2 ] = (Byte)(b >> 4); | ||
1121 | weights[i * 2 + 1] = (Byte)(b & 0xf); | ||
1122 | } | ||
1123 | while (++i != numBytes); | ||
1124 | /* 7ZIP: we can restore correct zero value for weights[numSyms], | ||
1125 | if we want to use zero values starting from numSyms in code below. */ | ||
1126 | // weights[numSyms] = 0; | ||
1127 | } | ||
1128 | } | ||
1129 | else | ||
1130 | { | ||
1131 | #define MAX_ACCURACY_LOG_FOR_WEIGHTS 6 | ||
1132 | CFseRecord table[1 << MAX_ACCURACY_LOG_FOR_WEIGHTS]; | ||
1133 | |||
1134 | Byte accuracy; | ||
1135 | const Byte *src; | ||
1136 | size_t srcLen; | ||
1137 | if (in->len < header) | ||
1138 | return SZ_ERROR_DATA; | ||
1139 | { | ||
1140 | CInBufPair fse_stream; | ||
1141 | fse_stream.len = header; | ||
1142 | fse_stream.ptr = in->ptr; | ||
1143 | in->ptr += header; | ||
1144 | in->len -= header; | ||
1145 | RINOK(FSE_DecodeHeader(table, &fse_stream, | ||
1146 | MAX_ACCURACY_LOG_FOR_WEIGHTS, | ||
1147 | &accuracy, | ||
1148 | 16 // num weight symbols max (max-symbol is 15) | ||
1149 | )) | ||
1150 | // at least 2 bytes were processed in fse_stream. | ||
1151 | // (srcLen > 0) after FSE_DecodeHeader() | ||
1152 | // if (srcLen == 0) return SZ_ERROR_DATA; | ||
1153 | src = fse_stream.ptr; | ||
1154 | srcLen = fse_stream.len; | ||
1155 | } | ||
1156 | // we are allowed to access src[-5] | ||
1157 | { | ||
1158 | // unsigned yyy = 200; do { | ||
1159 | CBitCtr bitOffset; | ||
1160 | FastInt32 state1, state2; | ||
1161 | SET_bitOffset_TO_PAD (bitOffset, src, srcLen) | ||
1162 | state1 = accuracy; | ||
1163 | src -= state1 >> 2; // src -= 1; // for GET16() optimization | ||
1164 | state1 <<= FSE_REC_LEN_OFFSET; | ||
1165 | state2 = state1; | ||
1166 | numSyms = 0; | ||
1167 | for (;;) | ||
1168 | { | ||
1169 | #define FSE_WEIGHT_DECODE(st) \ | ||
1170 | { \ | ||
1171 | const unsigned bits = GET_FSE_REC_LEN(st); \ | ||
1172 | FastInt r; \ | ||
1173 | GET16(r, src + (bitOffset >> 3)) \ | ||
1174 | r >>= (unsigned)bitOffset & 7; \ | ||
1175 | if ((CBitCtr_signed)(bitOffset -= (CBitCtr)bits) < 0) \ | ||
1176 | { if (bitOffset + (CBitCtr)bits != 0) \ | ||
1177 | return SZ_ERROR_DATA; \ | ||
1178 | break; } \ | ||
1179 | r &= 0xff; \ | ||
1180 | r >>= 8 - bits; \ | ||
1181 | st = table[GET_FSE_REC_STATE(st) + r]; \ | ||
1182 | weights[numSyms++] = (Byte)GET_FSE_REC_SYM(st); \ | ||
1183 | } | ||
1184 | FSE_WEIGHT_DECODE (state1) | ||
1185 | FSE_WEIGHT_DECODE (state2) | ||
1186 | if (numSyms == HUF_MAX_SYMBS) | ||
1187 | return SZ_ERROR_DATA; | ||
1188 | } | ||
1189 | // src += (unsigned)accuracy >> 2; } while (--yyy); | ||
1190 | } | ||
1191 | } | ||
1192 | |||
1193 | // Build using weights: | ||
1194 | { | ||
1195 | UInt32 sum = 0; | ||
1196 | { | ||
1197 | // numSyms >= 1 | ||
1198 | unsigned i = 0; | ||
1199 | weights[numSyms] = 0; | ||
1200 | do | ||
1201 | { | ||
1202 | sum += ((UInt32)1 << weights[i ]) & ~(UInt32)1; | ||
1203 | sum += ((UInt32)1 << weights[i + 1]) & ~(UInt32)1; | ||
1204 | i += 2; | ||
1205 | } | ||
1206 | while (i < numSyms); | ||
1207 | if (sum == 0) | ||
1208 | return SZ_ERROR_DATA; | ||
1209 | } | ||
1210 | { | ||
1211 | const unsigned maxBits = GetHighestSetBit_32_nonzero_big(sum) + 1; | ||
1212 | { | ||
1213 | const UInt32 left = ((UInt32)1 << maxBits) - sum; | ||
1214 | // (left != 0) | ||
1215 | // (left) must be a power of 2 in a correct stream | ||
1216 | if (left & (left - 1)) | ||
1217 | return SZ_ERROR_DATA; | ||
1218 | weights[numSyms++] = (Byte)GetHighestSetBit_32_nonzero_big(left); | ||
1219 | } | ||
1220 | // if (numSyms & 1) | ||
1221 | weights[numSyms] = 0; // for loop unroll | ||
1222 | // numSyms >= 2 | ||
1223 | { | ||
1224 | unsigned i = 0; | ||
1225 | do | ||
1226 | { | ||
1227 | /* | ||
1228 | #define WEIGHT_ITER(a) \ | ||
1229 | { unsigned w = weights[i + (a)]; \ | ||
1230 | const unsigned t = maxBits - w; \ | ||
1231 | w = w ? t: w; \ | ||
1232 | if (w > HUF_MAX_BITS) return SZ_ERROR_DATA; \ | ||
1233 | weights[i + (a)] = (Byte)w; } | ||
1234 | */ | ||
1235 | // /* | ||
1236 | #define WEIGHT_ITER(a) \ | ||
1237 | { unsigned w = weights[i + (a)]; \ | ||
1238 | if (w) { \ | ||
1239 | w = maxBits - w; \ | ||
1240 | if (w > HUF_MAX_BITS) return SZ_ERROR_DATA; \ | ||
1241 | weights[i + (a)] = (Byte)w; }} | ||
1242 | // */ | ||
1243 | WEIGHT_ITER(0) | ||
1244 | // WEIGHT_ITER(1) | ||
1245 | // i += 2; | ||
1246 | } | ||
1247 | while (++i != numSyms); | ||
1248 | } | ||
1249 | } | ||
1250 | } | ||
1251 | { | ||
1252 | // unsigned yyy; for (yyy = 0; yyy < 100; yyy++) | ||
1253 | Huf_Build((Byte *)(void *)p->table64, weights, numSyms); | ||
1254 | } | ||
1255 | return SZ_OK; | ||
1256 | } | ||
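| /* Illustration only: a worked example of the weight -> bit-length logic | ||
| above. Assume the decoded weights are {4, 3, 2, 0, 1}: | ||
| sum = 16 + 8 + 4 + 0 + 2 = 30 | ||
| maxBits = highest_set_bit(30) + 1 = 5 | ||
| left = (1 << 5) - 30 = 2, a power of 2, so the implied last weight | ||
| is log2(2) = 1, giving weights {4, 3, 2, 0, 1, 1}. | ||
| Bit lengths are (maxBits - w) for w != 0: {1, 2, 3, 0, 4, 4}, and the | ||
| Kraft sum 1/2 + 1/4 + 1/8 + 1/16 + 1/16 == 1, as Huf_Build() requires. */ | ||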
1257 | |||
1258 | |||
1259 | typedef enum | ||
1260 | { | ||
1261 | k_SeqMode_Predef = 0, | ||
1262 | k_SeqMode_RLE = 1, | ||
1263 | k_SeqMode_FSE = 2, | ||
1264 | k_SeqMode_Repeat = 3 | ||
1265 | } | ||
1266 | z7_zstd_enum_SeqMode; | ||
1267 | |||
1268 | // predefAccuracy == 5 for OFFSET symbols | ||
1269 | // predefAccuracy == 6 for MATCH/LIT LEN symbols | ||
1270 | static | ||
1271 | SRes | ||
1272 | Z7_NO_INLINE | ||
1273 | // Z7_FORCE_INLINE | ||
1274 | FSE_Decode_SeqTable(CFseRecord * const table, | ||
1275 | CInBufPair * const in, | ||
1276 | unsigned predefAccuracy, | ||
1277 | Byte * const accuracyRes, | ||
1278 | unsigned numSymbolsMax, | ||
1279 | const CFseRecord * const predefs, | ||
1280 | const unsigned seqMode) | ||
1281 | { | ||
1282 | // UNUSED_VAR(numSymsPredef) | ||
1283 | // UNUSED_VAR(predefFreqs) | ||
1284 | if (seqMode == k_SeqMode_FSE) | ||
1285 | { | ||
1286 | // unsigned y = 50; CInBufPair in2 = *in; do { *in = in2; RINOK( | ||
1287 | return | ||
1288 | FSE_DecodeHeader(table, in, | ||
1289 | predefAccuracy + 3, // accuracyMax | ||
1290 | accuracyRes, | ||
1291 | numSymbolsMax) | ||
1292 | ; | ||
1293 | // )} while (--y); return SZ_OK; | ||
1294 | } | ||
1295 | // numSymsMax = numSymsPredef + ((predefAccuracy & 1) * (32 - 29))); // numSymsMax | ||
1296 | // numSymsMax == 32 for offsets | ||
1297 | |||
1298 | if (seqMode == k_SeqMode_Predef) | ||
1299 | { | ||
1300 | *accuracyRes = (Byte)predefAccuracy; | ||
1301 | memcpy(table, predefs, sizeof(UInt32) << predefAccuracy); | ||
1302 | return SZ_OK; | ||
1303 | } | ||
1304 | |||
1305 | // (seqMode == k_SeqMode_RLE) | ||
1306 | if (in->len == 0) | ||
1307 | return SZ_ERROR_DATA; | ||
1308 | in->len--; | ||
1309 | { | ||
1310 | const Byte *ptr = in->ptr; | ||
1311 | const Byte sym = ptr[0]; | ||
1312 | in->ptr = ptr + 1; | ||
1313 | table[0] = (FastInt32)sym | ||
1314 | #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3) | ||
1315 | + (numSymbolsMax == NUM_ML_SYMBOLS ? MATCH_LEN_MIN : 0) | ||
1316 | #endif | ||
1317 | ; | ||
1318 | *accuracyRes = 0; | ||
1319 | } | ||
1320 | return SZ_OK; | ||
1321 | } | ||
1322 | |||
1323 | |||
1324 | typedef struct | ||
1325 | { | ||
1326 | CFseRecord of[1 << 8]; | ||
1327 | CFseRecord ll[1 << 9]; | ||
1328 | CFseRecord ml[1 << 9]; | ||
1329 | } | ||
1330 | CZstdDecFseTables; | ||
1331 | |||
1332 | |||
1333 | typedef struct | ||
1334 | { | ||
1335 | Byte *win; | ||
1336 | SizeT cycSize; | ||
1337 | /* | ||
1338 | if (outBuf_fromCaller) : cycSize = outBufSize_fromCaller | ||
1339 | else { | ||
1340 | if ( isCyclicMode) : cycSize = cyclic_buffer_size = (winSize + extra_space) | ||
1341 | if (!isCyclicMode) : cycSize = ContentSize, | ||
1342 | (isCyclicMode == true) if (ContentSize >= winSize) or ContentSize is unknown | ||
1343 | } | ||
1344 | */ | ||
1345 | SizeT winPos; | ||
1346 | |||
1347 | CZstdDecOffset reps[3]; | ||
1348 | |||
1349 | Byte ll_accuracy; | ||
1350 | Byte of_accuracy; | ||
1351 | Byte ml_accuracy; | ||
1352 | // Byte seqTables_wereSet; | ||
1353 | Byte litHuf_wasSet; | ||
1354 | |||
1355 | Byte *literalsBase; | ||
1356 | |||
1357 | size_t winSize; // from header | ||
1358 | size_t totalOutCheck; // totalOutCheck <= winSize | ||
1359 | |||
1360 | #ifdef Z7_ZSTD_DEC_USE_BASES_IN_OBJECT | ||
1361 | SEQ_EXTRA_TABLES(m_) | ||
1362 | #endif | ||
1363 | // UInt64 _pad_Alignment; // is not required now | ||
1364 | CZstdDecFseTables fse; | ||
1365 | CZstdDecHufTable huf; | ||
1366 | } | ||
1367 | CZstdDec1; | ||
1368 | |||
1369 | #define ZstdDec1_GET_BLOCK_SIZE_LIMIT(p) \ | ||
1370 | ((p)->winSize < kBlockSizeMax ? (UInt32)(p)->winSize : kBlockSizeMax) | ||
1371 | |||
1372 | #define SEQ_TABLES_WERE_NOT_SET_ml_accuracy 1 // accuracy=1 is not used by zstd | ||
1373 | #define IS_SEQ_TABLES_WERE_SET(p) (((p)->ml_accuracy != SEQ_TABLES_WERE_NOT_SET_ml_accuracy)) | ||
1374 | // #define IS_SEQ_TABLES_WERE_SET(p) ((p)->seqTables_wereSet) | ||
1375 | |||
1376 | |||
1377 | static void ZstdDec1_Construct(CZstdDec1 *p) | ||
1378 | { | ||
1379 | #ifdef Z7_ZSTD_DEC_PRINT_TABLE | ||
1380 | Print_Predef(6, NUM_LL_SYMBOLS, SEQ_LL_PREDEF_DIST, k_PredefRecords_LL); | ||
1381 | Print_Predef(5, NUM_OFFSET_SYMBOLS_PREDEF, SEQ_OFFSET_PREDEF_DIST, k_PredefRecords_OF); | ||
1382 | Print_Predef(6, NUM_ML_SYMBOLS, SEQ_ML_PREDEF_DIST, k_PredefRecords_ML); | ||
1383 | #endif | ||
1384 | |||
1385 | p->win = NULL; | ||
1386 | p->cycSize = 0; | ||
1387 | p->literalsBase = NULL; | ||
1388 | #ifdef Z7_ZSTD_DEC_USE_BASES_IN_OBJECT | ||
1389 | FILL_LOC_BASES_ALL | ||
1390 | #endif | ||
1391 | } | ||
1392 | |||
1393 | |||
1394 | static void ZstdDec1_Init(CZstdDec1 *p) | ||
1395 | { | ||
1396 | p->reps[0] = 1; | ||
1397 | p->reps[1] = 4; | ||
1398 | p->reps[2] = 8; | ||
1399 | // p->seqTables_wereSet = False; | ||
1400 | p->ml_accuracy = SEQ_TABLES_WERE_NOT_SET_ml_accuracy; | ||
1401 | p->litHuf_wasSet = False; | ||
1402 | p->totalOutCheck = 0; | ||
1403 | } | ||
1404 | |||
1405 | |||
1406 | |||
1407 | #ifdef MY_CPU_LE_UNALIGN | ||
1408 | #define Z7_ZSTD_DEC_USE_UNALIGNED_COPY | ||
1409 | #endif | ||
1410 | |||
1411 | #ifdef Z7_ZSTD_DEC_USE_UNALIGNED_COPY | ||
1412 | |||
1413 | #define COPY_CHUNK_SIZE 16 | ||
1414 | |||
1415 | #define COPY_CHUNK_4_2(dest, src) \ | ||
1416 | { \ | ||
1417 | ((UInt32 *)(void *)dest)[0] = ((const UInt32 *)(const void *)src)[0]; \ | ||
1418 | ((UInt32 *)(void *)dest)[1] = ((const UInt32 *)(const void *)src)[1]; \ | ||
1419 | src += 4 * 2; \ | ||
1420 | dest += 4 * 2; \ | ||
1421 | } | ||
1422 | |||
1423 | /* SSE2 doesn't help here with GCC and CLANG, | ||
1424 | so we disabled SSE2 here: */ | ||
1425 | /* | ||
1426 | #if defined(MY_CPU_AMD64) | ||
1427 | #define Z7_ZSTD_DEC_USE_SSE2 | ||
1428 | #elif defined(MY_CPU_X86) | ||
1429 | #if defined(_MSC_VER) && _MSC_VER >= 1300 && defined(_M_IX86_FP) && (_M_IX86_FP >= 2) \ | ||
1430 | || defined(__SSE2__) \ | ||
1431 | // || 1 == 1 // for debug only | ||
1432 | #define Z7_ZSTD_DEC_USE_SSE2 | ||
1433 | #endif | ||
1434 | #endif | ||
1435 | */ | ||
1436 | |||
1437 | #if defined(MY_CPU_ARM64) | ||
1438 | #define COPY_OFFSET_MIN 16 | ||
1439 | #define COPY_CHUNK1(dest, src) \ | ||
1440 | { \ | ||
1441 | vst1q_u8((uint8_t *)(void *)dest, \ | ||
1442 | vld1q_u8((const uint8_t *)(const void *)src)); \ | ||
1443 | src += 16; \ | ||
1444 | dest += 16; \ | ||
1445 | } | ||
1446 | |||
1447 | #define COPY_CHUNK(dest, src) \ | ||
1448 | { \ | ||
1449 | COPY_CHUNK1(dest, src) \ | ||
1450 | if ((len -= COPY_CHUNK_SIZE) == 0) break; \ | ||
1451 | COPY_CHUNK1(dest, src) \ | ||
1452 | } | ||
1453 | |||
1454 | #elif defined(Z7_ZSTD_DEC_USE_SSE2) | ||
1455 | #include <emmintrin.h> // sse2 | ||
1456 | #define COPY_OFFSET_MIN 16 | ||
1457 | |||
1458 | #define COPY_CHUNK1(dest, src) \ | ||
1459 | { \ | ||
1460 | _mm_storeu_si128((__m128i *)(void *)dest, \ | ||
1461 | _mm_loadu_si128((const __m128i *)(const void *)src)); \ | ||
1462 | src += 16; \ | ||
1463 | dest += 16; \ | ||
1464 | } | ||
1465 | |||
1466 | #define COPY_CHUNK(dest, src) \ | ||
1467 | { \ | ||
1468 | COPY_CHUNK1(dest, src) \ | ||
1469 | if ((len -= COPY_CHUNK_SIZE) == 0) break; \ | ||
1470 | COPY_CHUNK1(dest, src) \ | ||
1471 | } | ||
1472 | |||
1473 | #elif defined(MY_CPU_64BIT) | ||
1474 | #define COPY_OFFSET_MIN 8 | ||
1475 | |||
1476 | #define COPY_CHUNK(dest, src) \ | ||
1477 | { \ | ||
1478 | ((UInt64 *)(void *)dest)[0] = ((const UInt64 *)(const void *)src)[0]; \ | ||
1479 | ((UInt64 *)(void *)dest)[1] = ((const UInt64 *)(const void *)src)[1]; \ | ||
1480 | src += 8 * 2; \ | ||
1481 | dest += 8 * 2; \ | ||
1482 | } | ||
1483 | |||
1484 | #else | ||
1485 | #define COPY_OFFSET_MIN 4 | ||
1486 | |||
1487 | #define COPY_CHUNK(dest, src) \ | ||
1488 | { \ | ||
1489 | COPY_CHUNK_4_2(dest, src); \ | ||
1490 | COPY_CHUNK_4_2(dest, src); \ | ||
1491 | } | ||
1492 | |||
1493 | #endif | ||
1494 | #endif | ||
1495 | |||
1496 | |||
1497 | #ifndef COPY_CHUNK_SIZE | ||
1498 | #define COPY_OFFSET_MIN 4 | ||
1499 | #define COPY_CHUNK_SIZE 8 | ||
1500 | #define COPY_CHUNK_2(dest, src) \ | ||
1501 | { \ | ||
1502 | const Byte a0 = src[0]; \ | ||
1503 | const Byte a1 = src[1]; \ | ||
1504 | dest[0] = a0; \ | ||
1505 | dest[1] = a1; \ | ||
1506 | src += 2; \ | ||
1507 | dest += 2; \ | ||
1508 | } | ||
1509 | #define COPY_CHUNK(dest, src) \ | ||
1510 | { \ | ||
1511 | COPY_CHUNK_2(dest, src) \ | ||
1512 | COPY_CHUNK_2(dest, src) \ | ||
1513 | COPY_CHUNK_2(dest, src) \ | ||
1514 | COPY_CHUNK_2(dest, src) \ | ||
1515 | } | ||
1516 | #endif | ||
1517 | |||
1518 | |||
1519 | #define COPY_PREPARE \ | ||
1520 | len += (COPY_CHUNK_SIZE - 1); \ | ||
1521 | len &= ~(size_t)(COPY_CHUNK_SIZE - 1); \ | ||
1522 | { if (len > rem) \ | ||
1523 | { len = rem; \ | ||
1524 | rem &= (COPY_CHUNK_SIZE - 1); \ | ||
1525 | if (rem) { \ | ||
1526 | len -= rem; \ | ||
1527 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \ | ||
1528 | do *dest++ = *src++; while (--rem); \ | ||
1529 | if (len == 0) return; }}} | ||
1530 | |||
1531 | #define COPY_CHUNKS \ | ||
1532 | { \ | ||
1533 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \ | ||
1534 | do { COPY_CHUNK(dest, src) } \ | ||
1535 | while (len -= COPY_CHUNK_SIZE); \ | ||
1536 | } | ||
1537 | |||
1538 | // (len != 0) | ||
1539 | // (len <= rem) | ||
1540 | static | ||
1541 | Z7_FORCE_INLINE | ||
1542 | // Z7_ATTRIB_NO_VECTOR | ||
1543 | void CopyLiterals(Byte *dest, Byte const *src, size_t len, size_t rem) | ||
1544 | { | ||
1545 | COPY_PREPARE | ||
1546 | COPY_CHUNKS | ||
1547 | } | ||
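| /* Illustration only: a worked trace of COPY_PREPARE. With len = 5 and | ||
| rem = 10 (COPY_CHUNK_SIZE = 16): len is rounded up to 16; since 16 > rem, | ||
| len becomes 10, the tail of rem & 15 = 10 bytes is copied byte-by-byte, | ||
| and the function returns before any chunked copy. With rem = 40 instead, | ||
| len stays 16 and one whole chunk is copied: over-copy past the requested | ||
| 5 bytes is allowed because (rem) bytes of headroom exist. */ | ||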
1548 | |||
1549 | |||
1550 | /* we can define Z7_STD_DEC_USE_AFTER_CYC_BUF, if we want to use additional | ||
1551 | space after cycSize, to reduce the code in CopyMatch(): */ | ||
1552 | // for debug: | ||
1553 | // #define Z7_STD_DEC_USE_AFTER_CYC_BUF | ||
1554 | |||
1555 | /* | ||
1556 | CopyMatch() | ||
1557 | if wrap (offset > winPos) | ||
1558 | { | ||
1559 | then we have at least (COPY_CHUNK_SIZE) bytes available in (dest) before we will overwrite (src): | ||
1560 | (cycSize >= offset + COPY_CHUNK_SIZE) | ||
1561 | if defined(Z7_STD_DEC_USE_AFTER_CYC_BUF), | ||
1562 | we are allowed to read win[cycSize + COPY_CHUNK_SIZE - 1]. | ||
1563 | } | ||
1564 | (len != 0) | ||
1565 | */ | ||
1566 | static | ||
1567 | Z7_FORCE_INLINE | ||
1568 | // Z7_ATTRIB_NO_VECTOR | ||
1569 | void CopyMatch(size_t offset, size_t len, | ||
1570 | Byte *win, size_t winPos, size_t rem, const size_t cycSize) | ||
1571 | { | ||
1572 | Byte *dest = win + winPos; | ||
1573 | const Byte *src; | ||
1574 | // STAT_INC(g_NumCopy) | ||
1575 | |||
1576 | if (offset > winPos) | ||
1577 | { | ||
1578 | size_t back = offset - winPos; | ||
1579 | // src = win + cycSize - back; | ||
1580 | // cycSize -= offset; | ||
1581 | STAT_INC(g_NumOver) | ||
1582 | src = dest + (cycSize - offset); | ||
1583 | // (src >= dest) here | ||
1584 | #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF | ||
1585 | if (back < len) | ||
1586 | { | ||
1587 | #else | ||
1588 | if (back < len + (COPY_CHUNK_SIZE - 1)) | ||
1589 | { | ||
1590 | if (back >= len) | ||
1591 | { | ||
1592 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
1593 | do | ||
1594 | *dest++ = *src++; | ||
1595 | while (--len); | ||
1596 | return; | ||
1597 | } | ||
1598 | #endif | ||
1599 | // back < len | ||
1600 | STAT_INC(g_NumOver2) | ||
1601 | len -= back; | ||
1602 | rem -= back; | ||
1603 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
1604 | do | ||
1605 | *dest++ = *src++; | ||
1606 | while (--back); | ||
1607 | src = dest - offset; | ||
1608 | // src = win; | ||
1609 | // we go to MAIN-COPY | ||
1610 | } | ||
1611 | } | ||
1612 | else | ||
1613 | src = dest - offset; | ||
1614 | |||
1615 | // len != 0 | ||
1616 | // do *dest++ = *src++; while (--len); return; | ||
1617 | |||
1618 | // --- MAIN COPY --- | ||
1619 | // if (src >= dest), then ((size_t)(src - dest) >= COPY_CHUNK_SIZE) | ||
1620 | // so we have at least COPY_CHUNK_SIZE space before overlap for writing. | ||
1621 | COPY_PREPARE | ||
1622 | |||
1623 | /* now (len == COPY_CHUNK_SIZE * x) | ||
1624 | so we can unroll for aligned copy */ | ||
1625 | { | ||
1626 | // const unsigned b0 = src[0]; | ||
1627 | // (COPY_OFFSET_MIN >= 4) | ||
1628 | |||
1629 | if (offset >= COPY_OFFSET_MIN) | ||
1630 | { | ||
1631 | COPY_CHUNKS | ||
1632 | // return; | ||
1633 | } | ||
1634 | else | ||
1635 | #if (COPY_OFFSET_MIN > 4) | ||
1636 | #if COPY_CHUNK_SIZE < 8 | ||
1637 | #error Stop_Compiling_Bad_COPY_CHUNK_SIZE | ||
1638 | #endif | ||
1639 | if (offset >= 4) | ||
1640 | { | ||
1641 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
1642 | do | ||
1643 | { | ||
1644 | COPY_CHUNK_4_2(dest, src) | ||
1645 | #if COPY_CHUNK_SIZE != 16 | ||
1646 | if (len == 8) break; | ||
1647 | #endif | ||
1648 | COPY_CHUNK_4_2(dest, src) | ||
1649 | } | ||
1650 | while (len -= 16); | ||
1651 | // return; | ||
1652 | } | ||
1653 | else | ||
1654 | #endif | ||
1655 | { | ||
1656 | // (offset < 4) | ||
1657 | const unsigned b0 = src[0]; | ||
1658 | if (offset < 2) | ||
1659 | { | ||
1660 | #if defined(Z7_ZSTD_DEC_USE_UNALIGNED_COPY) && (COPY_CHUNK_SIZE == 16) | ||
1661 | #if defined(MY_CPU_64BIT) | ||
1662 | { | ||
1663 | const UInt64 v64 = (UInt64)b0 * 0x0101010101010101; | ||
1664 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
1665 | do | ||
1666 | { | ||
1667 | ((UInt64 *)(void *)dest)[0] = v64; | ||
1668 | ((UInt64 *)(void *)dest)[1] = v64; | ||
1669 | dest += 16; | ||
1670 | } | ||
1671 | while (len -= 16); | ||
1672 | } | ||
1673 | #else | ||
1674 | { | ||
1675 | UInt32 v = b0; | ||
1676 | v |= v << 8; | ||
1677 | v |= v << 16; | ||
1678 | do | ||
1679 | { | ||
1680 | ((UInt32 *)(void *)dest)[0] = v; | ||
1681 | ((UInt32 *)(void *)dest)[1] = v; | ||
1682 | dest += 8; | ||
1683 | ((UInt32 *)(void *)dest)[0] = v; | ||
1684 | ((UInt32 *)(void *)dest)[1] = v; | ||
1685 | dest += 8; | ||
1686 | } | ||
1687 | while (len -= 16); | ||
1688 | } | ||
1689 | #endif | ||
1690 | #else | ||
1691 | do | ||
1692 | { | ||
1693 | dest[0] = (Byte)b0; | ||
1694 | dest[1] = (Byte)b0; | ||
1695 | dest += 2; | ||
1696 | dest[0] = (Byte)b0; | ||
1697 | dest[1] = (Byte)b0; | ||
1698 | dest += 2; | ||
1699 | } | ||
1700 | while (len -= 4); | ||
1701 | #endif | ||
1702 | } | ||
1703 | else if (offset == 2) | ||
1704 | { | ||
1705 | const Byte b1 = src[1]; | ||
1706 | { | ||
1707 | do | ||
1708 | { | ||
1709 | dest[0] = (Byte)b0; | ||
1710 | dest[1] = b1; | ||
1711 | dest += 2; | ||
1712 | } | ||
1713 | while (len -= 2); | ||
1714 | } | ||
1715 | } | ||
1716 | else // (offset == 3) | ||
1717 | { | ||
1718 | const Byte *lim = dest + len - 2; | ||
1719 | const Byte b1 = src[1]; | ||
1720 | const Byte b2 = src[2]; | ||
1721 | do | ||
1722 | { | ||
1723 | dest[0] = (Byte)b0; | ||
1724 | dest[1] = b1; | ||
1725 | dest[2] = b2; | ||
1726 | dest += 3; | ||
1727 | } | ||
1728 | while (dest < lim); | ||
1729 | lim++; // now it points to the last byte that must be written | ||
1730 | if (dest <= lim) | ||
1731 | { | ||
1732 | *dest = (Byte)b0; | ||
1733 | if (dest != lim) | ||
1734 | dest[1] = b1; | ||
1735 | } | ||
1736 | } | ||
1737 | } | ||
1738 | } | ||
1739 | } | ||
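| /* Illustration only: LZ match copies may overlap (offset < len), which is | ||
| why the small-offset paths above replicate bytes instead of doing a plain | ||
| memcpy(). A minimal reference version, assuming no window wrap: */ | ||
| #if 0 | ||
| #include <stddef.h> | ||
| static void CopyMatch_Ref(unsigned char *win, size_t winPos, size_t offset, size_t len) | ||
| { | ||
| unsigned char *dest = win + winPos; | ||
| const unsigned char *src = dest - offset; | ||
| do { *dest++ = *src++; } while (--len); // byte-by-byte copy handles overlap | ||
| } | ||
| #endif | ||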
1740 | |||
1741 | |||
1742 | |||
1743 | #define UPDATE_TOTAL_OUT(p, size) \ | ||
1744 | { \ | ||
1745 | size_t _toc = (p)->totalOutCheck + (size); \ | ||
1746 | const size_t _ws = (p)->winSize; \ | ||
1747 | if (_toc >= _ws) _toc = _ws; \ | ||
1748 | (p)->totalOutCheck = _toc; \ | ||
1749 | } | ||
1750 | |||
1751 | |||
1752 | #if defined(MY_CPU_64BIT) && defined(MY_CPU_LE_UNALIGN) | ||
1753 | // we can disable it for debug: | ||
1754 | #define Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
1755 | #endif | ||
1756 | // #define Z7_ZSTD_DEC_USE_64BIT_LOADS // for debug : slow in 32-bit | ||
1757 | |||
1758 | // SEQ_SRC_OFFSET: how many bytes (src) (seqSrc) was moved back from its original value. | ||
1759 | // we need (SEQ_SRC_OFFSET != 0) for optimized memory access | ||
1760 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
1761 | #define SEQ_SRC_OFFSET 7 | ||
1762 | #else | ||
1763 | #define SEQ_SRC_OFFSET 3 | ||
1764 | #endif | ||
1765 | #define SRC_PLUS_FOR_4BYTES(bitOffset) (SEQ_SRC_OFFSET - 3) + ((CBitCtr_signed)(bitOffset) >> 3) | ||
1766 | #define BIT_OFFSET_7BITS(bitOffset) ((unsigned)(bitOffset) & 7) | ||
1767 | /* | ||
1768 | if (BIT_OFFSET_DELTA_BITS == 0) : bitOffset == number_of_unprocessed_bits | ||
1769 | if (BIT_OFFSET_DELTA_BITS == 1) : bitOffset == number_of_unprocessed_bits - 1 | ||
1770 | and we can read 1 bit more in that mode : (8 * n + 1). | ||
1771 | */ | ||
1772 | // #define BIT_OFFSET_DELTA_BITS 0 | ||
1773 | #define BIT_OFFSET_DELTA_BITS 1 | ||
1774 | #if BIT_OFFSET_DELTA_BITS == 1 | ||
1775 | #define GET_SHIFT_FROM_BOFFS7(boff7) (7 ^ (boff7)) | ||
1776 | #else | ||
1777 | #define GET_SHIFT_FROM_BOFFS7(boff7) (8 - BIT_OFFSET_DELTA_BITS - (boff7)) | ||
1778 | #endif | ||
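| /* Illustration only: with BIT_OFFSET_DELTA_BITS == 1 the shift is | ||
| (8 - 1 - boff7), and since boff7 is in [0, 7], (7 ^ boff7) computes the | ||
| same value without a subtraction: XOR with 7 just inverts the low 3 bits. */ | ||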
1779 | |||
1780 | #define UPDATE_BIT_OFFSET(bitOffset, numBits) \ | ||
1781 | (bitOffset) -= (CBitCtr)(numBits); | ||
1782 | |||
1783 | #define GET_SHIFT(bitOffset) GET_SHIFT_FROM_BOFFS7(BIT_OFFSET_7BITS(bitOffset)) | ||
1784 | |||
1785 | |||
1786 | #if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) | ||
1787 | #if (NUM_OFFSET_SYMBOLS_MAX - BIT_OFFSET_DELTA_BITS < 32) | ||
1788 | /* if (NUM_OFFSET_SYMBOLS_MAX == 32 && BIT_OFFSET_DELTA_BITS == 1), | ||
1789 | we have depth 31 + 9 + 9 + 8 = 57 bits that can be read with a single read. */ | ||
1790 | #define Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF | ||
1791 | #endif | ||
1792 | #ifndef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF | ||
1793 | #if (BIT_OFFSET_DELTA_BITS == 1) | ||
1794 | /* if (winLimit - winPos <= (kBlockSizeMax = (1 << 17))) | ||
1795 | { | ||
1796 | the case (16 bits of literal extra + 16 bits of match extra) is not possible | ||
1797 | in a correct stream, so an error will be detected for the (16 + 16) case. | ||
1798 | And the longest correct sequence after offset reading is (31 + 9 + 9 + 8 = 57 bits), | ||
1799 | so we can use just one 64-bit load here in that case. | ||
1800 | } | ||
1801 | */ | ||
1802 | #define Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML | ||
1803 | #endif | ||
1804 | #endif | ||
1805 | #endif | ||
1806 | |||
1807 | |||
1808 | #if !defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) || \ | ||
1809 | (!defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) && \ | ||
1810 | !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML)) | ||
1811 | // in : (0 < bits <= (24 or 25)): | ||
1812 | #define STREAM_READ_BITS(dest, bits) \ | ||
1813 | { \ | ||
1814 | GET32(dest, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \ | ||
1815 | dest <<= GET_SHIFT(bitOffset); \ | ||
1816 | UPDATE_BIT_OFFSET(bitOffset, bits) \ | ||
1817 | dest >>= 32 - bits; \ | ||
1818 | } | ||
1819 | #endif | ||
1820 | |||
1821 | |||
1822 | #define FSE_Peek_1(table, state) table[state] | ||
1823 | |||
1824 | #define STATE_VAR(name) state_ ## name | ||
1825 | |||
1826 | // in : (0 <= accuracy <= (24 or 25)) | ||
1827 | #define FSE_INIT_STATE(name, cond) \ | ||
1828 | { \ | ||
1829 | UInt32 r; \ | ||
1830 | const unsigned bits = p->name ## _accuracy; \ | ||
1831 | GET32(r, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \ | ||
1832 | r <<= GET_SHIFT(bitOffset); \ | ||
1833 | r >>= 1; \ | ||
1834 | r >>= 31 ^ bits; \ | ||
1835 | UPDATE_BIT_OFFSET(bitOffset, bits) \ | ||
1836 | cond \ | ||
1837 | STATE_VAR(name) = FSE_Peek_1(FSE_TABLE(name), r); \ | ||
1838 | /* STATE_VAR(name) = dest << 16; */ \ | ||
1839 | } | ||
1840 | |||
1841 | |||
1842 | #define FSE_Peek_Plus(name, r) \ | ||
1843 | STATE_VAR(name) = FSE_Peek_1(FSE_TABLE(name), \ | ||
1844 | GET_FSE_REC_STATE(STATE_VAR(name)) + r); | ||
1845 | |||
1846 | #define LZ_LOOP_ERROR_EXIT { return SZ_ERROR_DATA; } | ||
1847 | |||
1848 | #define BO_OVERFLOW_CHECK \ | ||
1849 | { if ((CBitCtr_signed)bitOffset < 0) LZ_LOOP_ERROR_EXIT } | ||
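/* note: (bitOffset) counts down as bits are consumed, so a negative
   (CBitCtr_signed) value means more bits were requested than the sequence
   stream contains; that happens only for corrupted data, hence SZ_ERROR_DATA. */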
1850 | |||
1851 | |||
1852 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
1853 | |||
1854 | #define GET64(dest, p) { const Byte *ptr = p; dest = GetUi64(ptr); } | ||
1855 | |||
1856 | #define FSE_PRELOAD \ | ||
1857 | { \ | ||
1858 | GET64(v, src - 4 + SRC_PLUS_FOR_4BYTES(bitOffset)) \ | ||
1859 | v <<= GET_SHIFT(bitOffset); \ | ||
1860 | } | ||
1861 | |||
1862 | #define FSE_UPDATE_STATE_2(name, cond) \ | ||
1863 | { \ | ||
1864 | const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \ | ||
1865 | UInt64 r = v; \ | ||
1866 | v <<= bits; \ | ||
1867 | r >>= 1; \ | ||
1868 | UPDATE_BIT_OFFSET(bitOffset, bits) \ | ||
1869 | cond \ | ||
1870 | r >>= 63 ^ bits; \ | ||
1871 | FSE_Peek_Plus(name, r); \ | ||
1872 | } | ||
1873 | |||
1874 | #define FSE_UPDATE_STATES \ | ||
1875 | FSE_UPDATE_STATE_2 (ll, {} ) \ | ||
1876 | FSE_UPDATE_STATE_2 (ml, {} ) \ | ||
1877 | FSE_UPDATE_STATE_2 (of, BO_OVERFLOW_CHECK) \ | ||
1878 | |||
1879 | #else // Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
1880 | |||
1881 | // it supports 8-bit accuracy for any code | ||
1882 | // it supports 9-bit accuracy, if (BIT_OFFSET_DELTA_BITS == 1) | ||
1883 | #define FSE_UPDATE_STATE_0(name, cond) \ | ||
1884 | { \ | ||
1885 | UInt32 r; \ | ||
1886 | const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \ | ||
1887 | GET16(r, src + 2 + SRC_PLUS_FOR_4BYTES(bitOffset)) \ | ||
1888 | r >>= (bitOffset & 7); \ | ||
1889 | r &= (1 << (8 + BIT_OFFSET_DELTA_BITS)) - 1; \ | ||
1890 | UPDATE_BIT_OFFSET(bitOffset, bits) \ | ||
1891 | cond \ | ||
1892 | r >>= (8 + BIT_OFFSET_DELTA_BITS) - bits; \ | ||
1893 | FSE_Peek_Plus(name, r); \ | ||
1894 | } | ||
1895 | |||
1896 | // for debug (slow): | ||
1897 | // #define Z7_ZSTD_DEC_USE_FSE_FUSION_FORCE | ||
1898 | #if BIT_OFFSET_DELTA_BITS == 0 || defined(Z7_ZSTD_DEC_USE_FSE_FUSION_FORCE) | ||
1899 | #define Z7_ZSTD_DEC_USE_FSE_FUSION | ||
1900 | #endif | ||
1901 | |||
1902 | #ifdef Z7_ZSTD_DEC_USE_FSE_FUSION | ||
1903 | #define FSE_UPDATE_STATE_1(name) \ | ||
1904 | { UInt32 rest2; \ | ||
1905 | { \ | ||
1906 | UInt32 r; \ | ||
1907 | unsigned bits; \ | ||
1908 | GET32(r, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \ | ||
1909 | bits = GET_FSE_REC_LEN(STATE_VAR(name)); \ | ||
1910 | r <<= GET_SHIFT(bitOffset); \ | ||
1911 | rest2 = r << bits; \ | ||
1912 | r >>= 1; \ | ||
1913 | UPDATE_BIT_OFFSET(bitOffset, bits) \ | ||
1914 | r >>= 31 ^ bits; \ | ||
1915 | FSE_Peek_Plus(name, r); \ | ||
1916 | } | ||
1917 | |||
1918 | #define FSE_UPDATE_STATE_3(name) \ | ||
1919 | { \ | ||
1920 | const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \ | ||
1921 | rest2 >>= 1; \ | ||
1922 | UPDATE_BIT_OFFSET(bitOffset, bits) \ | ||
1923 | rest2 >>= 31 ^ bits; \ | ||
1924 | FSE_Peek_Plus(name, rest2); \ | ||
1925 | }} | ||
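/* note: FSE_UPDATE_STATE_1 opens a block ("{ UInt32 rest2;") that is closed
   only by the "}}" ending FSE_UPDATE_STATE_3, so the two macros must be used
   as a pair (as in FSE_UPDATE_STATES below): one 32-bit load in STATE_1
   serves both updates, and the leftover high bits (rest2) feed STATE_3. */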
1926 | |||
1927 | #define FSE_UPDATE_STATES \ | ||
1928 | FSE_UPDATE_STATE_1 (ll) \ | ||
1929 | FSE_UPDATE_STATE_3 (ml) \ | ||
1930 | FSE_UPDATE_STATE_0 (of, BO_OVERFLOW_CHECK) \ | ||
1931 | |||
1932 | #else // Z7_ZSTD_DEC_USE_FSE_FUSION | ||
1933 | |||
1934 | #define FSE_UPDATE_STATES \ | ||
1935 | FSE_UPDATE_STATE_0 (ll, {} ) \ | ||
1936 | FSE_UPDATE_STATE_0 (ml, {} ) \ | ||
1937 | FSE_UPDATE_STATE_0 (of, BO_OVERFLOW_CHECK) \ | ||
1938 | |||
1939 | #endif // Z7_ZSTD_DEC_USE_FSE_FUSION | ||
1940 | #endif // Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
1941 | |||
1942 | |||
1943 | |||
1944 | typedef struct | ||
1945 | { | ||
1946 | UInt32 numSeqs; | ||
1947 | UInt32 literalsLen; | ||
1948 | const Byte *literals; | ||
1949 | } | ||
1950 | CZstdDec1_Vars; | ||
1951 | |||
1952 | |||
1953 | // if (BIT_OFFSET_DELTA_BITS != 0), we need (BIT_OFFSET_DELTA_BYTES > 0) | ||
1954 | #define BIT_OFFSET_DELTA_BYTES BIT_OFFSET_DELTA_BITS | ||
1955 | |||
1956 | /* if (NUM_OFFSET_SYMBOLS_MAX == 32) | ||
1957 | max_seq_bit_length = (31) + 16 + 16 + 9 + 8 + 9 = 89 bits | ||
1958 | if Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF is defined, we have the longest backward | ||
1959 | lookahead offset, and we read a UInt64 after the literal_len reading. | ||
1960 | if (BIT_OFFSET_DELTA_BITS == 1 && NUM_OFFSET_SYMBOLS_MAX == 32) | ||
1961 | MAX_BACKWARD_DEPTH = 16 bytes | ||
1962 | */ | ||
1963 | #define MAX_BACKWARD_DEPTH \ | ||
1964 | ((NUM_OFFSET_SYMBOLS_MAX - 1 + 16 + 16 + 7) / 8 + 7 + BIT_OFFSET_DELTA_BYTES) | ||
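/* worked check of the constant above, for the default configuration
   (NUM_OFFSET_SYMBOLS_MAX == 32, BIT_OFFSET_DELTA_BYTES == 1):
     (32 - 1 + 16 + 16 + 7) / 8 + 7 + 1 == 70 / 8 + 8 == 8 + 8 == 16
   which matches the "MAX_BACKWARD_DEPTH = 16 bytes" note in the comment above. */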
1965 | |||
1966 | /* srcLen != 0 | ||
1967 | src == real_data_ptr - SEQ_SRC_OFFSET - BIT_OFFSET_DELTA_BYTES | ||
1968 | if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) then | ||
1969 | (winLimit - p->winPos <= (1 << 17)) is required | ||
1970 | */ | ||
1971 | static | ||
1972 | Z7_NO_INLINE | ||
1973 | // Z7_ATTRIB_NO_VECTOR | ||
1974 | SRes Decompress_Sequences(CZstdDec1 * const p, | ||
1975 | const Byte *src, const size_t srcLen, | ||
1976 | const size_t winLimit, | ||
1977 | const CZstdDec1_Vars * const vars) | ||
1978 | { | ||
1979 | #ifdef Z7_ZSTD_DEC_USE_BASES_LOCAL | ||
1980 | SEQ_EXTRA_TABLES(a_) | ||
1981 | #endif | ||
1982 | |||
1983 | // for debug: | ||
1984 | // #define Z7_ZSTD_DEC_USE_LOCAL_FSE_TABLES | ||
1985 | #ifdef Z7_ZSTD_DEC_USE_LOCAL_FSE_TABLES | ||
1986 | #define FSE_TABLE(n) fse. n | ||
1987 | const CZstdDecFseTables fse = p->fse; | ||
1988 | /* | ||
1989 | CZstdDecFseTables fse; | ||
1990 | #define COPY_FSE_TABLE(n) \ | ||
1991 | memcpy(fse. n, p->fse. n, (size_t)4 << p-> n ## _accuracy); | ||
1992 | COPY_FSE_TABLE(of) | ||
1993 | COPY_FSE_TABLE(ll) | ||
1994 | COPY_FSE_TABLE(ml) | ||
1995 | */ | ||
1996 | #else | ||
1997 | #define FSE_TABLE(n) (p->fse. n) | ||
1998 | #endif | ||
1999 | |||
2000 | #ifdef Z7_ZSTD_DEC_USE_BASES_LOCAL | ||
2001 | FILL_LOC_BASES_ALL | ||
2002 | #endif | ||
2003 | |||
2004 | { | ||
2005 | unsigned numSeqs = vars->numSeqs; | ||
2006 | const Byte *literals = vars->literals; | ||
2007 | ptrdiff_t literalsLen = (ptrdiff_t)vars->literalsLen; | ||
2008 | Byte * const win = p->win; | ||
2009 | size_t winPos = p->winPos; | ||
2010 | const size_t cycSize = p->cycSize; | ||
2011 | size_t totalOutCheck = p->totalOutCheck; | ||
2012 | const size_t winSize = p->winSize; | ||
2013 | size_t reps_0 = p->reps[0]; | ||
2014 | size_t reps_1 = p->reps[1]; | ||
2015 | size_t reps_2 = p->reps[2]; | ||
2016 | UInt32 STATE_VAR(ll), STATE_VAR(of), STATE_VAR(ml); | ||
2017 | CBitCtr bitOffset; | ||
2018 | |||
2019 | SET_bitOffset_TO_PAD (bitOffset, src + SEQ_SRC_OFFSET, srcLen + BIT_OFFSET_DELTA_BYTES) | ||
2020 | |||
2021 | bitOffset -= BIT_OFFSET_DELTA_BITS; | ||
2022 | |||
2023 | FSE_INIT_STATE(ll, {} ) | ||
2024 | FSE_INIT_STATE(of, {} ) | ||
2025 | FSE_INIT_STATE(ml, BO_OVERFLOW_CHECK) | ||
2026 | |||
2027 | for (;;) | ||
2028 | { | ||
2029 | size_t matchLen; | ||
2030 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
2031 | UInt64 v; | ||
2032 | #endif | ||
2033 | |||
2034 | #ifdef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF | ||
2035 | FSE_PRELOAD | ||
2036 | #endif | ||
2037 | |||
2038 | // if (of_code == 0) | ||
2039 | if ((Byte)STATE_VAR(of) == 0) | ||
2040 | { | ||
2041 | if (GET_FSE_REC_SYM(STATE_VAR(ll)) == 0) | ||
2042 | { | ||
2043 | const size_t offset = reps_1; | ||
2044 | reps_1 = reps_0; | ||
2045 | reps_0 = offset; | ||
2046 | STAT_INC(g_Num_Rep1) | ||
2047 | } | ||
2048 | STAT_UPDATE(else g_Num_Rep0++;) | ||
2049 | } | ||
2050 | else | ||
2051 | { | ||
2052 | const unsigned of_code = (Byte)STATE_VAR(of); | ||
2053 | |||
2054 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
2055 | #if !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) | ||
2056 | FSE_PRELOAD | ||
2057 | #endif | ||
2058 | #else | ||
2059 | UInt32 v; | ||
2060 | { | ||
2061 | const Byte *src4 = src + SRC_PLUS_FOR_4BYTES(bitOffset); | ||
2062 | const unsigned skip = GET_SHIFT(bitOffset); | ||
2063 | GET32(v, src4) | ||
2064 | v <<= skip; | ||
2065 | v |= (UInt32)src4[-1] >> (8 - skip); | ||
2066 | } | ||
2067 | #endif | ||
2068 | |||
2069 | UPDATE_BIT_OFFSET(bitOffset, of_code) | ||
2070 | |||
2071 | if (of_code == 1) | ||
2072 | { | ||
2073 | // read 1 bit | ||
2074 | #if defined(Z7_MSC_VER_ORIGINAL) || defined(MY_CPU_X86_OR_AMD64) | ||
2075 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
2076 | #define CHECK_HIGH_BIT_64(a) ((Int64)(UInt64)(a) < 0) | ||
2077 | #else | ||
2078 | #define CHECK_HIGH_BIT_32(a) ((Int32)(UInt32)(a) < 0) | ||
2079 | #endif | ||
2080 | #else | ||
2081 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
2082 | #define CHECK_HIGH_BIT_64(a) ((UInt64)(a) & ((UInt64)1 << 63)) | ||
2083 | #else | ||
2084 | #define CHECK_HIGH_BIT_32(a) ((UInt32)(a) & ((UInt32)1 << 31)) | ||
2085 | #endif | ||
2086 | #endif | ||
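/* note on the sign-bit test below: (v) holds the unread bit stream
   left-aligned, so its top bit is the 1 extra bit of the (of_code == 1) case.
   If the literal-length code (sym) is 0, (sym - 1) is all-ones and the XOR
   inverts that bit; if (sym != 0), the high bit of (sym - 1) is 0 and the bit
   passes through. So a single sign test selects between the two repeat-offset
   permutations required for the (litLen == 0) and (litLen != 0) cases. */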
2087 | |||
2088 | if | ||
2089 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
2090 | CHECK_HIGH_BIT_64 (((UInt64)GET_FSE_REC_SYM(STATE_VAR(ll)) - 1) ^ v) | ||
2091 | #else | ||
2092 | CHECK_HIGH_BIT_32 (((UInt32)GET_FSE_REC_SYM(STATE_VAR(ll)) - 1) ^ v) | ||
2093 | #endif | ||
2094 | { | ||
2095 | v <<= 1; | ||
2096 | { | ||
2097 | const size_t offset = reps_2; | ||
2098 | reps_2 = reps_1; | ||
2099 | reps_1 = reps_0; | ||
2100 | reps_0 = offset; | ||
2101 | STAT_INC(g_Num_Rep2) | ||
2102 | } | ||
2103 | } | ||
2104 | else | ||
2105 | { | ||
2106 | if (GET_FSE_REC_SYM(STATE_VAR(ll)) == 0) | ||
2107 | { | ||
2108 | // litLen == 0 && bit == 1 | ||
2109 | STAT_INC(g_Num_Rep3) | ||
2110 | v <<= 1; | ||
2111 | reps_2 = reps_1; | ||
2112 | reps_1 = reps_0; | ||
2113 | if (--reps_0 == 0) | ||
2114 | { | ||
2115 | // LZ_LOOP_ERROR_EXIT | ||
2116 | // original-zstd decoder : input is corrupted; force offset to 1 | ||
2117 | // reps_0 = 1; | ||
2118 | reps_0++; | ||
2119 | } | ||
2120 | } | ||
2121 | else | ||
2122 | { | ||
2123 | // litLen != 0 && bit == 0 | ||
2124 | v <<= 1; | ||
2125 | { | ||
2126 | const size_t offset = reps_1; | ||
2127 | reps_1 = reps_0; | ||
2128 | reps_0 = offset; | ||
2129 | STAT_INC(g_Num_Rep1) | ||
2130 | } | ||
2131 | } | ||
2132 | } | ||
2133 | } | ||
2134 | else | ||
2135 | { | ||
2136 | // (2 <= of_code) | ||
2137 | // if (of_code >= 32) LZ_LOOP_ERROR_EXIT // optional check | ||
2138 | // (of_code >= 32) cases are already excluded by other code | ||
2139 | reps_2 = reps_1; | ||
2140 | reps_1 = reps_0; | ||
2141 | reps_0 = ((size_t)1 << of_code) - 3 + (size_t) | ||
2142 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
2143 | (v >> (64 - of_code)); | ||
2144 | v <<= of_code; | ||
2145 | #else | ||
2146 | (v >> (32 - of_code)); | ||
2147 | #endif | ||
2148 | } | ||
2149 | } | ||
2150 | |||
2151 | #ifdef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML | ||
2152 | FSE_PRELOAD | ||
2153 | #endif | ||
2154 | |||
2155 | matchLen = (size_t)GET_FSE_REC_SYM(STATE_VAR(ml)) | ||
2156 | #ifndef Z7_ZSTD_DEC_USE_ML_PLUS3 | ||
2157 | + MATCH_LEN_MIN | ||
2158 | #endif | ||
2159 | ; | ||
2160 | { | ||
2161 | { | ||
2162 | if (matchLen >= 32 + MATCH_LEN_MIN) // if (state_ml & 0x20) | ||
2163 | { | ||
2164 | const unsigned extra = BASES_TABLE(SEQ_ML_EXTRA) [(size_t)matchLen - MATCH_LEN_MIN]; | ||
2165 | matchLen = BASES_TABLE(SEQ_ML_BASES) [(size_t)matchLen - MATCH_LEN_MIN]; | ||
2166 | #if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) && \ | ||
2167 | (defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) || \ | ||
2168 | defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)) | ||
2169 | { | ||
2170 | UPDATE_BIT_OFFSET(bitOffset, extra) | ||
2171 | matchLen += (size_t)(v >> (64 - extra)); | ||
2172 | #if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) | ||
2173 | FSE_PRELOAD | ||
2174 | #else | ||
2175 | v <<= extra; | ||
2176 | #endif | ||
2177 | } | ||
2178 | #else | ||
2179 | { | ||
2180 | UInt32 v32; | ||
2181 | STREAM_READ_BITS(v32, extra) | ||
2182 | matchLen += v32; | ||
2183 | } | ||
2184 | #endif | ||
2185 | STAT_INC(g_Num_Match) | ||
2186 | } | ||
2187 | } | ||
2188 | } | ||
2189 | |||
2190 | #if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) && \ | ||
2191 | !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) && \ | ||
2192 | !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) | ||
2193 | FSE_PRELOAD | ||
2194 | #endif | ||
2195 | |||
2196 | { | ||
2197 | size_t litLen = GET_FSE_REC_SYM(STATE_VAR(ll)); | ||
2198 | if (litLen) | ||
2199 | { | ||
2200 | // if (STATE_VAR(ll) & 0x70) | ||
2201 | if (litLen >= 16) | ||
2202 | { | ||
2203 | const unsigned extra = BASES_TABLE(SEQ_LL_EXTRA) [litLen]; | ||
2204 | litLen = BASES_TABLE(SEQ_LL_BASES) [litLen]; | ||
2205 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
2206 | { | ||
2207 | UPDATE_BIT_OFFSET(bitOffset, extra) | ||
2208 | litLen += (size_t)(v >> (64 - extra)); | ||
2209 | #if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) | ||
2210 | FSE_PRELOAD | ||
2211 | #else | ||
2212 | v <<= extra; | ||
2213 | #endif | ||
2214 | } | ||
2215 | #else | ||
2216 | { | ||
2217 | UInt32 v32; | ||
2218 | STREAM_READ_BITS(v32, extra) | ||
2219 | litLen += v32; | ||
2220 | } | ||
2221 | #endif | ||
2222 | STAT_INC(g_Num_LitsBig) | ||
2223 | } | ||
2224 | |||
2225 | if ((literalsLen -= (ptrdiff_t)litLen) < 0) | ||
2226 | LZ_LOOP_ERROR_EXIT | ||
2227 | totalOutCheck += litLen; | ||
2228 | { | ||
2229 | const size_t rem = winLimit - winPos; | ||
2230 | if (litLen > rem) | ||
2231 | LZ_LOOP_ERROR_EXIT | ||
2232 | { | ||
2233 | const Byte *literals_temp = literals; | ||
2234 | Byte *d = win + winPos; | ||
2235 | literals += litLen; | ||
2236 | winPos += litLen; | ||
2237 | CopyLiterals(d, literals_temp, litLen, rem); | ||
2238 | } | ||
2239 | } | ||
2240 | } | ||
2241 | STAT_UPDATE(else g_Num_Lit0++;) | ||
2242 | } | ||
2243 | |||
2244 | #define COPY_MATCH \ | ||
2245 | { if (reps_0 > winSize || reps_0 > totalOutCheck) LZ_LOOP_ERROR_EXIT \ | ||
2246 | totalOutCheck += matchLen; \ | ||
2247 | { const size_t rem = winLimit - winPos; \ | ||
2248 | if (matchLen > rem) LZ_LOOP_ERROR_EXIT \ | ||
2249 | { const size_t winPos_temp = winPos; \ | ||
2250 | winPos += matchLen; \ | ||
2251 | CopyMatch(reps_0, matchLen, win, winPos_temp, rem, cycSize); }}} | ||
2252 | |||
2253 | if (--numSeqs == 0) | ||
2254 | { | ||
2255 | COPY_MATCH | ||
2256 | break; | ||
2257 | } | ||
2258 | FSE_UPDATE_STATES | ||
2259 | COPY_MATCH | ||
2260 | } // for | ||
2261 | |||
2262 | if ((CBitCtr_signed)bitOffset != BIT_OFFSET_DELTA_BYTES * 8 - BIT_OFFSET_DELTA_BITS) | ||
2263 | return SZ_ERROR_DATA; | ||
2264 | |||
2265 | if (literalsLen) | ||
2266 | { | ||
2267 | const size_t rem = winLimit - winPos; | ||
2268 | if ((size_t)literalsLen > rem) | ||
2269 | return SZ_ERROR_DATA; | ||
2270 | { | ||
2271 | Byte *d = win + winPos; | ||
2272 | winPos += (size_t)literalsLen; | ||
2273 | totalOutCheck += (size_t)literalsLen; | ||
2274 | CopyLiterals | ||
2275 | // memcpy | ||
2276 | (d, literals, (size_t)literalsLen, rem); | ||
2277 | } | ||
2278 | } | ||
2279 | if (totalOutCheck >= winSize) | ||
2280 | totalOutCheck = winSize; | ||
2281 | p->totalOutCheck = totalOutCheck; | ||
2282 | p->winPos = winPos; | ||
2283 | p->reps[0] = (CZstdDecOffset)reps_0; | ||
2284 | p->reps[1] = (CZstdDecOffset)reps_1; | ||
2285 | p->reps[2] = (CZstdDecOffset)reps_2; | ||
2286 | } | ||
2287 | return SZ_OK; | ||
2288 | } | ||
2289 | |||
2290 | |||
2291 | // for debug: define to check that ZstdDec1_NeedTempBufferForInput() works correctly: | ||
2292 | // #define Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP // define it for debug only | ||
2293 | #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP | ||
2294 | static unsigned g_numSeqs; | ||
2295 | #endif | ||
2296 | |||
2297 | |||
2298 | #define k_LitBlockType_Flag_RLE_or_Treeless 1 | ||
2299 | #define k_LitBlockType_Flag_Compressed 2 | ||
2300 | |||
2301 | // outLimit : is a strong limit | ||
2302 | // outLimit <= ZstdDec1_GET_BLOCK_SIZE_LIMIT(p) | ||
2303 | // inSize != 0 | ||
2304 | static | ||
2305 | Z7_NO_INLINE | ||
2306 | SRes ZstdDec1_DecodeBlock(CZstdDec1 *p, | ||
2307 | const Byte *src, SizeT inSize, SizeT afterAvail, | ||
2308 | const size_t outLimit) | ||
2309 | { | ||
2310 | CZstdDec1_Vars vars; | ||
2311 | vars.literals = p->literalsBase; | ||
2312 | { | ||
2313 | const unsigned b0 = *src++; | ||
2314 | UInt32 numLits, compressedSize; | ||
2315 | const Byte *litStream; | ||
2316 | Byte *literalsDest; | ||
2317 | inSize--; | ||
2318 | |||
2319 | if ((b0 & k_LitBlockType_Flag_Compressed) == 0) | ||
2320 | { | ||
2321 | // we need at least one additional byte for (numSeqs). | ||
2322 | // So we check for that additional byte in the conditions below. | ||
2323 | numLits = b0 >> 3; | ||
2324 | if (b0 & 4) | ||
2325 | { | ||
2326 | UInt32 v; | ||
2327 | if (inSize < 1 + 1) // we need at least 1 byte here and 1 byte for (numSeqs). | ||
2328 | return SZ_ERROR_DATA; | ||
2329 | numLits >>= 1; | ||
2330 | v = GetUi16(src); | ||
2331 | src += 2; | ||
2332 | inSize -= 2; | ||
2333 | if ((b0 & 8) == 0) | ||
2334 | { | ||
2335 | src--; | ||
2336 | inSize++; | ||
2337 | v = (Byte)v; | ||
2338 | } | ||
2339 | numLits += v << 4; | ||
2340 | } | ||
2341 | compressedSize = 1; | ||
2342 | if ((b0 & k_LitBlockType_Flag_RLE_or_Treeless) == 0) | ||
2343 | compressedSize = numLits; | ||
2344 | } | ||
2345 | else if (inSize < 4) | ||
2346 | return SZ_ERROR_DATA; | ||
2347 | else | ||
2348 | { | ||
2349 | const unsigned mode4Streams = b0 & 0xc; | ||
2350 | const unsigned numBytes = (3 * mode4Streams + 32) >> 4; | ||
2351 | const unsigned numBits = 4 * numBytes - 2; | ||
2352 | const UInt32 mask = ((UInt32)16 << numBits) - 1; | ||
2353 | compressedSize = GetUi32(src); | ||
2354 | numLits = (( | ||
2355 | #ifdef MY_CPU_LE_UNALIGN | ||
2356 | GetUi32(src - 1) | ||
2357 | #else | ||
2358 | ((compressedSize << 8) + b0) | ||
2359 | #endif | ||
2360 | ) >> 4) & mask; | ||
2361 | src += numBytes; | ||
2362 | inSize -= numBytes; | ||
2363 | compressedSize >>= numBits; | ||
2364 | compressedSize &= mask; | ||
2365 | /* | ||
2366 | if (numLits != 0) printf("inSize = %7u num_lits=%7u compressed=%7u ratio = %u ratio2 = %u\n", | ||
2367 | i1, numLits, (unsigned)compressedSize * 1, (unsigned)compressedSize * 100 / numLits, | ||
2368 | (unsigned)numLits * 100 / (unsigned)inSize); | ||
2369 | } | ||
2370 | */ | ||
2371 | if (compressedSize == 0) | ||
2372 | return SZ_ERROR_DATA; // (compressedSize == 0) is not allowed | ||
2373 | } | ||
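/* worked example for the header fields above (matching the zstd
   compressed-literals header sizes):
     mode4Streams = 0x0 or 0x4 : numBytes = 2, numBits =  6, mask = 0x3ff
     mode4Streams = 0x8        : numBytes = 3, numBits = 10, mask = 0x3fff
     mode4Streams = 0xc        : numBytes = 4, numBits = 14, mask = 0x3ffff
   e.g. for 0xc: (3 * 0xc + 32) >> 4 == 68 >> 4 == 4 and 4 * 4 - 2 == 14,
   so numLits and compressedSize are both 18-bit fields in the 5-byte
   header (b0 + numBytes). */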
2374 | |||
2375 | STAT_UPDATE(g_Num_Lits += numLits;) | ||
2376 | |||
2377 | vars.literalsLen = numLits; | ||
2378 | |||
2379 | if (compressedSize >= inSize) | ||
2380 | return SZ_ERROR_DATA; | ||
2381 | litStream = src; | ||
2382 | src += compressedSize; | ||
2383 | inSize -= compressedSize; | ||
2384 | // inSize != 0 | ||
2385 | { | ||
2386 | UInt32 numSeqs = *src++; | ||
2387 | inSize--; | ||
2388 | if (numSeqs > 127) | ||
2389 | { | ||
2390 | UInt32 b1; | ||
2391 | if (inSize == 0) | ||
2392 | return SZ_ERROR_DATA; | ||
2393 | numSeqs -= 128; | ||
2394 | b1 = *src++; | ||
2395 | inSize--; | ||
2396 | if (numSeqs == 127) | ||
2397 | { | ||
2398 | if (inSize == 0) | ||
2399 | return SZ_ERROR_DATA; | ||
2400 | numSeqs = (UInt32)(*src++) + 127; | ||
2401 | inSize--; | ||
2402 | } | ||
2403 | numSeqs = (numSeqs << 8) + b1; | ||
2404 | } | ||
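/* worked examples for the (numSeqs) decoding above (zstd 1/2/3-byte forms):
     {0x40}             -> numSeqs = 0x40 = 64
     {0x85, 0x10}       -> numSeqs = ((0x85 - 0x80) << 8) + 0x10 = 0x510
     {0xff, 0x10, 0x02} -> numSeqs = ((0x02 + 127) << 8) + 0x10 = 0x8110
   see also the standalone sketch after ZstdDec1_NeedTempBufferForInput(). */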
2405 | if (numSeqs * MATCH_LEN_MIN + numLits > outLimit) | ||
2406 | return SZ_ERROR_DATA; | ||
2407 | vars.numSeqs = numSeqs; | ||
2408 | |||
2409 | STAT_UPDATE(g_NumSeqs_total += numSeqs;) | ||
2410 | /* | ||
2411 | #ifdef SHOW_STAT | ||
2412 | printf("\n %5u : %8u, %8u : %5u", (int)g_Num_Blocks_Compressed, (int)numSeqs, (int)g_NumSeqs_total, | ||
2413 | (int)g_NumSeqs_total / g_Num_Blocks_Compressed); | ||
2414 | #endif | ||
2415 | // printf("\nnumSeqs2 = %d", numSeqs); | ||
2416 | */ | ||
2417 | #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP | ||
2418 | if (numSeqs != g_numSeqs) return SZ_ERROR_DATA; // for debug | ||
2419 | #endif | ||
2420 | if (numSeqs == 0) | ||
2421 | { | ||
2422 | if (inSize != 0) | ||
2423 | return SZ_ERROR_DATA; | ||
2424 | literalsDest = p->win + p->winPos; | ||
2425 | } | ||
2426 | else | ||
2427 | literalsDest = p->literalsBase; | ||
2428 | } | ||
2429 | |||
2430 | if ((b0 & k_LitBlockType_Flag_Compressed) == 0) | ||
2431 | { | ||
2432 | if (b0 & k_LitBlockType_Flag_RLE_or_Treeless) | ||
2433 | { | ||
2434 | memset(literalsDest, litStream[0], numLits); | ||
2435 | if (vars.numSeqs) | ||
2436 | { | ||
2437 | // literalsDest == p->literalsBase == vars.literals | ||
2438 | #if COPY_CHUNK_SIZE > 1 | ||
2439 | memset(p->literalsBase + numLits, 0, COPY_CHUNK_SIZE); | ||
2440 | #endif | ||
2441 | } | ||
2442 | } | ||
2443 | else | ||
2444 | { | ||
2445 | // unsigned y; | ||
2446 | // for (y = 0; y < 10000; y++) | ||
2447 | memcpy(literalsDest, litStream, numLits); | ||
2448 | if (vars.numSeqs) | ||
2449 | { | ||
2450 | /* we need up to (15 == COPY_CHUNK_SIZE - 1) space for optimized CopyLiterals(). | ||
2451 | If we have additional space in input stream after literals stream, | ||
2452 | we use a direct copy of the raw literals in the input stream */ | ||
2453 | if ((size_t)(src + inSize - litStream) - numLits + afterAvail >= (COPY_CHUNK_SIZE - 1)) | ||
2454 | vars.literals = litStream; | ||
2455 | else | ||
2456 | { | ||
2457 | // literalsDest == p->literalsBase == vars.literals | ||
2458 | #if COPY_CHUNK_SIZE > 1 | ||
2459 | /* CopyLiterals(): | ||
2460 | 1) we don't want to read non-initialized data, | ||
2461 | 2) we will copy only zero bytes after the literals buffer */ | ||
2462 | memset(p->literalsBase + numLits, 0, COPY_CHUNK_SIZE); | ||
2463 | #endif | ||
2464 | } | ||
2465 | } | ||
2466 | } | ||
2467 | } | ||
2468 | else | ||
2469 | { | ||
2470 | CInBufPair hufStream; | ||
2471 | hufStream.ptr = litStream; | ||
2472 | hufStream.len = compressedSize; | ||
2473 | |||
2474 | if ((b0 & k_LitBlockType_Flag_RLE_or_Treeless) == 0) | ||
2475 | { | ||
2476 | // unsigned y = 100; CInBufPair hs2 = hufStream; do { hufStream = hs2; | ||
2477 | RINOK(Huf_DecodeTable(&p->huf, &hufStream)) | ||
2478 | p->litHuf_wasSet = True; | ||
2479 | // } while (--y); | ||
2480 | } | ||
2481 | else if (!p->litHuf_wasSet) | ||
2482 | return SZ_ERROR_DATA; | ||
2483 | |||
2484 | { | ||
2485 | // int yyy; for (yyy = 0; yyy < 34; yyy++) { | ||
2486 | SRes sres; | ||
2487 | if ((b0 & 0xc) == 0) // mode4Streams | ||
2488 | sres = Huf_Decompress_1stream((const Byte *)(const void *)p->huf.table64, | ||
2489 | hufStream.ptr - HUF_SRC_OFFSET, hufStream.len, literalsDest, numLits); | ||
2490 | else | ||
2491 | { | ||
2492 | // 6 bytes for the jump table + 4 x 1 end-padding bytes | ||
2493 | if (hufStream.len < 6 + 4) | ||
2494 | return SZ_ERROR_DATA; | ||
2495 | // the condition from original-zstd decoder: | ||
2496 | #define Z7_ZSTD_MIN_LITERALS_FOR_4_STREAMS 6 | ||
2497 | if (numLits < Z7_ZSTD_MIN_LITERALS_FOR_4_STREAMS) | ||
2498 | return SZ_ERROR_DATA; | ||
2499 | sres = Huf_Decompress_4stream((const Byte *)(const void *)p->huf.table64, | ||
2500 | hufStream.ptr + (6 - HUF_SRC_OFFSET), hufStream.len, literalsDest, numLits); | ||
2501 | } | ||
2502 | RINOK(sres) | ||
2503 | // } | ||
2504 | } | ||
2505 | } | ||
2506 | |||
2507 | if (vars.numSeqs == 0) | ||
2508 | { | ||
2509 | p->winPos += numLits; | ||
2510 | return SZ_OK; | ||
2511 | } | ||
2512 | } | ||
2513 | { | ||
2514 | CInBufPair in; | ||
2515 | unsigned mode; | ||
2516 | unsigned seqMode; | ||
2517 | |||
2518 | in.ptr = src; | ||
2519 | in.len = inSize; | ||
2520 | if (in.len == 0) | ||
2521 | return SZ_ERROR_DATA; | ||
2522 | in.len--; | ||
2523 | mode = *in.ptr++; | ||
2524 | if (mode & 3) // Reserved bits | ||
2525 | return SZ_ERROR_DATA; | ||
2526 | |||
2527 | seqMode = (mode >> 6); | ||
2528 | if (seqMode == k_SeqMode_Repeat) | ||
2529 | { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; } | ||
2530 | else RINOK(FSE_Decode_SeqTable( | ||
2531 | p->fse.ll, | ||
2532 | &in, | ||
2533 | 6, // predefAccuracy | ||
2534 | &p->ll_accuracy, | ||
2535 | NUM_LL_SYMBOLS, | ||
2536 | k_PredefRecords_LL, | ||
2537 | seqMode)) | ||
2538 | |||
2539 | seqMode = (mode >> 4) & 3; | ||
2540 | if (seqMode == k_SeqMode_Repeat) | ||
2541 | { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; } | ||
2542 | else RINOK(FSE_Decode_SeqTable( | ||
2543 | p->fse.of, | ||
2544 | &in, | ||
2545 | 5, // predefAccuracy | ||
2546 | &p->of_accuracy, | ||
2547 | NUM_OFFSET_SYMBOLS_MAX, | ||
2548 | k_PredefRecords_OF, | ||
2549 | seqMode)) | ||
2550 | |||
2551 | seqMode = (mode >> 2) & 3; | ||
2552 | if (seqMode == k_SeqMode_Repeat) | ||
2553 | { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; } | ||
2554 | else | ||
2555 | { | ||
2556 | RINOK(FSE_Decode_SeqTable( | ||
2557 | p->fse.ml, | ||
2558 | &in, | ||
2559 | 6, // predefAccuracy | ||
2560 | &p->ml_accuracy, | ||
2561 | NUM_ML_SYMBOLS, | ||
2562 | k_PredefRecords_ML, | ||
2563 | seqMode)) | ||
2564 | /* | ||
2565 | #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3) | ||
2566 | // { unsigned y = 1 << 10; do | ||
2567 | { | ||
2568 | const unsigned accuracy = p->ml_accuracy; | ||
2569 | if (accuracy == 0) | ||
2570 | p->fse.ml[0] += 3; | ||
2571 | else | ||
2572 | #ifdef MY_CPU_64BIT | ||
2573 | { | ||
2574 | // alignment (UInt64 _pad_Alignment) in fse.ml is required for that code | ||
2575 | UInt64 *table = (UInt64 *)(void *)p->fse.ml; | ||
2576 | const UInt64 *end = (const UInt64 *)(const void *) | ||
2577 | ((const Byte *)(const void *)table + ((size_t)sizeof(CFseRecord) << accuracy)); | ||
2578 | do | ||
2579 | { | ||
2580 | table[0] += ((UInt64)MATCH_LEN_MIN << 32) + MATCH_LEN_MIN; | ||
2581 | table[1] += ((UInt64)MATCH_LEN_MIN << 32) + MATCH_LEN_MIN; | ||
2582 | table += 2; | ||
2583 | } | ||
2584 | while (table != end); | ||
2585 | } | ||
2586 | #else | ||
2587 | { | ||
2588 | UInt32 *table = p->fse.ml; | ||
2589 | const UInt32 *end = (const UInt32 *)(const void *) | ||
2590 | ((const Byte *)(const void *)table + ((size_t)sizeof(CFseRecord) << accuracy)); | ||
2591 | do | ||
2592 | { | ||
2593 | table[0] += MATCH_LEN_MIN; | ||
2594 | table[1] += MATCH_LEN_MIN; | ||
2595 | table += 2; | ||
2596 | table[0] += MATCH_LEN_MIN; | ||
2597 | table[1] += MATCH_LEN_MIN; | ||
2598 | table += 2; | ||
2599 | } | ||
2600 | while (table != end); | ||
2601 | } | ||
2602 | #endif | ||
2603 | } | ||
2604 | // while (--y); } | ||
2605 | #endif | ||
2606 | */ | ||
2607 | } | ||
2608 | |||
2609 | // p->seqTables_wereSet = True; | ||
2610 | if (in.len == 0) | ||
2611 | return SZ_ERROR_DATA; | ||
2612 | return Decompress_Sequences(p, | ||
2613 | in.ptr - SEQ_SRC_OFFSET - BIT_OFFSET_DELTA_BYTES, in.len, | ||
2614 | p->winPos + outLimit, &vars); | ||
2615 | } | ||
2616 | } | ||
2617 | |||
2618 | |||
2619 | |||
2620 | |||
2621 | // inSize != 0 | ||
2622 | // it must behave similarly to ZstdDec1_DecodeBlock() | ||
2623 | static size_t ZstdDec1_NeedTempBufferForInput( | ||
2624 | const SizeT beforeSize, const Byte * const src, const SizeT inSize) | ||
2625 | { | ||
2626 | unsigned b0; | ||
2627 | UInt32 pos; | ||
2628 | |||
2629 | #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP | ||
2630 | g_numSeqs = 1 << 24; | ||
2631 | #else | ||
2632 | // we have at least 3 bytes before seq data: litBlockType, numSeqs, seqMode | ||
2633 | #define MIN_BLOCK_LZ_HEADERS_SIZE 3 | ||
2634 | if (beforeSize >= MAX_BACKWARD_DEPTH - MIN_BLOCK_LZ_HEADERS_SIZE) | ||
2635 | return 0; | ||
2636 | #endif | ||
2637 | |||
2638 | b0 = src[0]; | ||
2639 | |||
2640 | if ((b0 & k_LitBlockType_Flag_Compressed) == 0) | ||
2641 | { | ||
2642 | UInt32 numLits = b0 >> 3; | ||
2643 | pos = 1; | ||
2644 | if (b0 & 4) | ||
2645 | { | ||
2646 | UInt32 v; | ||
2647 | if (inSize < 3) | ||
2648 | return 0; | ||
2649 | numLits >>= 1; | ||
2650 | v = GetUi16(src + 1); | ||
2651 | pos = 3; | ||
2652 | if ((b0 & 8) == 0) | ||
2653 | { | ||
2654 | pos = 2; | ||
2655 | v = (Byte)v; | ||
2656 | } | ||
2657 | numLits += v << 4; | ||
2658 | } | ||
2659 | if (b0 & k_LitBlockType_Flag_RLE_or_Treeless) | ||
2660 | numLits = 1; | ||
2661 | pos += numLits; | ||
2662 | } | ||
2663 | else if (inSize < 5) | ||
2664 | return 0; | ||
2665 | else | ||
2666 | { | ||
2667 | const unsigned mode4Streams = b0 & 0xc; | ||
2668 | const unsigned numBytes = (3 * mode4Streams + 48) >> 4; | ||
2669 | const unsigned numBits = 4 * numBytes - 6; | ||
2670 | UInt32 cs = GetUi32(src + 1); | ||
2671 | cs >>= numBits; | ||
2672 | cs &= ((UInt32)16 << numBits) - 1; | ||
2673 | if (cs == 0) | ||
2674 | return 0; | ||
2675 | pos = numBytes + cs; | ||
2676 | } | ||
2677 | |||
2678 | if (pos >= inSize) | ||
2679 | return 0; | ||
2680 | { | ||
2681 | UInt32 numSeqs = src[pos++]; | ||
2682 | if (numSeqs > 127) | ||
2683 | { | ||
2684 | UInt32 b1; | ||
2685 | if (pos >= inSize) | ||
2686 | return 0; | ||
2687 | numSeqs -= 128; | ||
2688 | b1 = src[pos++]; | ||
2689 | if (numSeqs == 127) | ||
2690 | { | ||
2691 | if (pos >= inSize) | ||
2692 | return 0; | ||
2693 | numSeqs = (UInt32)(src[pos++]) + 127; | ||
2694 | } | ||
2695 | numSeqs = (numSeqs << 8) + b1; | ||
2696 | } | ||
2697 | #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP | ||
2698 | g_numSeqs = numSeqs; // for debug | ||
2699 | #endif | ||
2700 | if (numSeqs == 0) | ||
2701 | return 0; | ||
2702 | } | ||
2703 | /* | ||
2704 | if (pos >= inSize) | ||
2705 | return 0; | ||
2706 | pos++; | ||
2707 | */ | ||
2708 | // we will have one additional byte for seqMode: | ||
2709 | if (beforeSize + pos >= MAX_BACKWARD_DEPTH - 1) | ||
2710 | return 0; | ||
2711 | return 1; | ||
2712 | } | ||
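/* illustrative sketch (kept out of the build with #if 0): both
   ZstdDec1_DecodeBlock() and ZstdDec1_NeedTempBufferForInput() parse the same
   1/2/3-byte Number_of_Sequences field; extracted standalone it reads as
   follows ((buf) is assumed to hold enough header bytes): */
#if 0
static UInt32 Zstd_ReadNumSeqs(const Byte *buf)
{
  UInt32 numSeqs = buf[0];                 // 1-byte form: 0 ... 127
  if (numSeqs > 127)
  {
    const UInt32 b1 = buf[1];
    numSeqs -= 128;
    if (numSeqs == 127)                    // first byte was 255: 3-byte form
      numSeqs = (UInt32)buf[2] + 127;      // third byte supplies the high bits
    numSeqs = (numSeqs << 8) + b1;         // e.g. {0x85, 0x10} -> 0x510
  }
  return numSeqs;
}
#endif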
2713 | |||
2714 | |||
2715 | |||
2716 | // ---------- ZSTD FRAME ---------- | ||
2717 | |||
2718 | #define kBlockType_Raw 0 | ||
2719 | #define kBlockType_RLE 1 | ||
2720 | #define kBlockType_Compressed 2 | ||
2721 | #define kBlockType_Reserved 3 | ||
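/* note: a zstd block header is 3 bytes, read little-endian:
     bit 0       : Last_Block flag
     bits 1 - 2  : block type (kBlockType_* above)
     bits 3 - 23 : Block_Size (21 bits, <= 0x1fffff)
   e.g. bytes { 0x25, 0x14, 0x00 } -> value 0x001425 : last = 1,
   type = (0x25 >> 1) & 3 == 2 (Compressed), size = 0x001425 >> 3 == 0x284
   (644 bytes); this matches the (b0 >>= 3; b0 &= 0x1fffff;) parsing in
   ZstdDec_UpdateState() below. */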
2722 | |||
2723 | typedef enum | ||
2724 | { | ||
2725 | // begin: states that require 4 bytes: | ||
2726 | ZSTD2_STATE_SIGNATURE, | ||
2727 | ZSTD2_STATE_HASH, | ||
2728 | ZSTD2_STATE_SKIP_HEADER, | ||
2729 | // end of states that require 4 bytes | ||
2730 | |||
2731 | ZSTD2_STATE_SKIP_DATA, | ||
2732 | ZSTD2_STATE_FRAME_HEADER, | ||
2733 | ZSTD2_STATE_AFTER_HEADER, | ||
2734 | ZSTD2_STATE_BLOCK, | ||
2735 | ZSTD2_STATE_DATA, | ||
2736 | ZSTD2_STATE_FINISHED | ||
2737 | } EZstd2State; | ||
2738 | |||
2739 | |||
2740 | struct CZstdDec | ||
2741 | { | ||
2742 | EZstd2State frameState; | ||
2743 | unsigned tempSize; | ||
2744 | |||
2745 | Byte temp[14]; // 14 is required | ||
2746 | |||
2747 | Byte descriptor; | ||
2748 | Byte windowDescriptor; | ||
2749 | Byte isLastBlock; | ||
2750 | Byte blockType; | ||
2751 | Byte isErrorState; | ||
2752 | Byte hashError; | ||
2753 | Byte disableHash; | ||
2754 | Byte isCyclicMode; | ||
2755 | |||
2756 | UInt32 blockSize; | ||
2757 | UInt32 dictionaryId; | ||
2758 | UInt32 curBlockUnpackRem; // for compressed blocks only | ||
2759 | UInt32 inTempPos; | ||
2760 | |||
2761 | UInt64 contentSize; | ||
2762 | UInt64 contentProcessed; | ||
2763 | CXxh64State xxh64; | ||
2764 | |||
2765 | Byte *inTemp; | ||
2766 | SizeT winBufSize_Allocated; | ||
2767 | Byte *win_Base; | ||
2768 | |||
2769 | ISzAllocPtr alloc_Small; | ||
2770 | ISzAllocPtr alloc_Big; | ||
2771 | |||
2772 | CZstdDec1 decoder; | ||
2773 | }; | ||
2774 | |||
2775 | #define ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p) \ | ||
2776 | ((unsigned)(p)->contentProcessed & (Z7_XXH64_BLOCK_SIZE - 1)) | ||
2777 | |||
2778 | #define ZSTD_DEC_IS_LAST_BLOCK(p) ((p)->isLastBlock) | ||
2779 | |||
2780 | |||
2781 | static void ZstdDec_FreeWindow(CZstdDec * const p) | ||
2782 | { | ||
2783 | if (p->win_Base) | ||
2784 | { | ||
2785 | ISzAlloc_Free(p->alloc_Big, p->win_Base); | ||
2786 | p->win_Base = NULL; | ||
2787 | // p->decoder.win = NULL; | ||
2788 | p->winBufSize_Allocated = 0; | ||
2789 | } | ||
2790 | } | ||
2791 | |||
2792 | |||
2793 | CZstdDecHandle ZstdDec_Create(ISzAllocPtr alloc_Small, ISzAllocPtr alloc_Big) | ||
2794 | { | ||
2795 | CZstdDec *p = (CZstdDec *)ISzAlloc_Alloc(alloc_Small, sizeof(CZstdDec)); | ||
2796 | if (!p) | ||
2797 | return NULL; | ||
2798 | p->alloc_Small = alloc_Small; | ||
2799 | p->alloc_Big = alloc_Big; | ||
2800 | // ZstdDec_CONSTRUCT(p) | ||
2801 | p->inTemp = NULL; | ||
2802 | p->win_Base = NULL; | ||
2803 | p->winBufSize_Allocated = 0; | ||
2804 | p->disableHash = False; | ||
2805 | ZstdDec1_Construct(&p->decoder); | ||
2806 | return p; | ||
2807 | } | ||
2808 | |||
2809 | void ZstdDec_Destroy(CZstdDecHandle p) | ||
2810 | { | ||
2811 | #ifdef SHOW_STAT | ||
2812 | #define PRINT_STAT1(name, v) \ | ||
2813 | printf("\n%25s = %9u", name, v); | ||
2814 | PRINT_STAT1("g_Num_Blocks_Compressed", g_Num_Blocks_Compressed) | ||
2815 | PRINT_STAT1("g_Num_Blocks_memcpy", g_Num_Blocks_memcpy) | ||
2816 | PRINT_STAT1("g_Num_Wrap_memmove_Num", g_Num_Wrap_memmove_Num) | ||
2817 | PRINT_STAT1("g_Num_Wrap_memmove_Bytes", g_Num_Wrap_memmove_Bytes) | ||
2818 | if (g_Num_Blocks_Compressed) | ||
2819 | { | ||
2820 | #define PRINT_STAT(name, v) \ | ||
2821 | printf("\n%17s = %9u, per_block = %8u", name, v, v / g_Num_Blocks_Compressed); | ||
2822 | PRINT_STAT("g_NumSeqs", g_NumSeqs_total) | ||
2823 | // PRINT_STAT("g_NumCopy", g_NumCopy) | ||
2824 | PRINT_STAT("g_NumOver", g_NumOver) | ||
2825 | PRINT_STAT("g_NumOver2", g_NumOver2) | ||
2826 | PRINT_STAT("g_Num_Match", g_Num_Match) | ||
2827 | PRINT_STAT("g_Num_Lits", g_Num_Lits) | ||
2828 | PRINT_STAT("g_Num_LitsBig", g_Num_LitsBig) | ||
2829 | PRINT_STAT("g_Num_Lit0", g_Num_Lit0) | ||
2830 | PRINT_STAT("g_Num_Rep_0", g_Num_Rep0) | ||
2831 | PRINT_STAT("g_Num_Rep_1", g_Num_Rep1) | ||
2832 | PRINT_STAT("g_Num_Rep_2", g_Num_Rep2) | ||
2833 | PRINT_STAT("g_Num_Rep_3", g_Num_Rep3) | ||
2834 | PRINT_STAT("g_Num_Threshold_0", g_Num_Threshold_0) | ||
2835 | PRINT_STAT("g_Num_Threshold_1", g_Num_Threshold_1) | ||
2836 | PRINT_STAT("g_Num_Threshold_0sum", g_Num_Threshold_0sum) | ||
2837 | PRINT_STAT("g_Num_Threshold_1sum", g_Num_Threshold_1sum) | ||
2838 | } | ||
2839 | printf("\n"); | ||
2840 | #endif | ||
2841 | |||
2842 | ISzAlloc_Free(p->alloc_Small, p->decoder.literalsBase); | ||
2843 | // p->->decoder.literalsBase = NULL; | ||
2844 | ISzAlloc_Free(p->alloc_Small, p->inTemp); | ||
2845 | // p->inTemp = NULL; | ||
2846 | ZstdDec_FreeWindow(p); | ||
2847 | ISzAlloc_Free(p->alloc_Small, p); | ||
2848 | } | ||
2849 | |||
2850 | |||
2851 | |||
2852 | #define kTempBuffer_PreSize (1u << 6) | ||
2853 | #if kTempBuffer_PreSize < MAX_BACKWARD_DEPTH | ||
2854 | #error Stop_Compiling_Bad_kTempBuffer_PreSize | ||
2855 | #endif | ||
2856 | |||
2857 | static SRes ZstdDec_AllocateMisc(CZstdDec *p) | ||
2858 | { | ||
2859 | #define k_Lit_AfterAvail (1u << 6) | ||
2860 | #if k_Lit_AfterAvail < (COPY_CHUNK_SIZE - 1) | ||
2861 | #error Stop_Compiling_Bad_k_Lit_AfterAvail | ||
2862 | #endif | ||
2863 | // return ZstdDec1_Allocate(&p->decoder, p->alloc_Small); | ||
2864 | if (!p->decoder.literalsBase) | ||
2865 | { | ||
2866 | p->decoder.literalsBase = (Byte *)ISzAlloc_Alloc(p->alloc_Small, | ||
2867 | kBlockSizeMax + k_Lit_AfterAvail); | ||
2868 | if (!p->decoder.literalsBase) | ||
2869 | return SZ_ERROR_MEM; | ||
2870 | } | ||
2871 | if (!p->inTemp) | ||
2872 | { | ||
2873 | // we need k_Lit_AfterAvail here for owerread from raw literals stream | ||
2874 | p->inTemp = (Byte *)ISzAlloc_Alloc(p->alloc_Small, | ||
2875 | kBlockSizeMax + kTempBuffer_PreSize + k_Lit_AfterAvail); | ||
2876 | if (!p->inTemp) | ||
2877 | return SZ_ERROR_MEM; | ||
2878 | } | ||
2879 | return SZ_OK; | ||
2880 | } | ||
2881 | |||
2882 | |||
2883 | static void ZstdDec_Init_ForNewFrame(CZstdDec *p) | ||
2884 | { | ||
2885 | p->frameState = ZSTD2_STATE_SIGNATURE; | ||
2886 | p->tempSize = 0; | ||
2887 | |||
2888 | p->isErrorState = False; | ||
2889 | p->hashError = False; | ||
2890 | p->isCyclicMode = False; | ||
2891 | p->contentProcessed = 0; | ||
2892 | Xxh64State_Init(&p->xxh64); | ||
2893 | ZstdDec1_Init(&p->decoder); | ||
2894 | } | ||
2895 | |||
2896 | |||
2897 | void ZstdDec_Init(CZstdDec *p) | ||
2898 | { | ||
2899 | ZstdDec_Init_ForNewFrame(p); | ||
2900 | p->decoder.winPos = 0; | ||
2901 | memset(p->temp, 0, sizeof(p->temp)); | ||
2902 | } | ||
2903 | |||
2904 | |||
2905 | #define DESCRIPTOR_Get_DictionaryId_Flag(d) ((d) & 3) | ||
2906 | #define DESCRIPTOR_FLAG_CHECKSUM (1 << 2) | ||
2907 | #define DESCRIPTOR_FLAG_RESERVED (1 << 3) | ||
2908 | // #define DESCRIPTOR_FLAG_UNUSED (1 << 4) | ||
2909 | #define DESCRIPTOR_FLAG_SINGLE (1 << 5) | ||
2910 | #define DESCRIPTOR_Get_ContentSize_Flag3(d) ((d) >> 5) | ||
2911 | #define DESCRIPTOR_Is_ContentSize_Defined(d) (((d) & 0xe0) != 0) | ||
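/* note on the frame-header descriptor bits (zstd format):
     bits 0 - 1 : Dictionary_ID field size flag
     bit  2     : Content_Checksum flag
     bit  3     : reserved, must be 0 (checked below)
     bit  4     : unused
     bit  5     : Single_Segment flag (no window-descriptor byte follows)
     bits 6 - 7 : Frame_Content_Size field size flag
   DESCRIPTOR_Get_ContentSize_Flag3() packs bits 5-7 into one 3-bit value. */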
2912 | |||
2913 | |||
2914 | static EZstd2State ZstdDec_UpdateState(CZstdDec * const p, const Byte b, CZstdDecInfo * const info) | ||
2915 | { | ||
2916 | unsigned tempSize = p->tempSize; | ||
2917 | p->temp[tempSize++] = b; | ||
2918 | p->tempSize = tempSize; | ||
2919 | |||
2920 | if (p->frameState == ZSTD2_STATE_BLOCK) | ||
2921 | { | ||
2922 | if (tempSize < 3) | ||
2923 | return ZSTD2_STATE_BLOCK; | ||
2924 | { | ||
2925 | UInt32 b0 = GetUi32(p->temp); | ||
2926 | const unsigned type = ((unsigned)b0 >> 1) & 3; | ||
2927 | if (type == kBlockType_RLE && tempSize == 3) | ||
2928 | return ZSTD2_STATE_BLOCK; | ||
2929 | // info->num_Blocks_forType[type]++; | ||
2930 | info->num_Blocks++; | ||
2931 | if (type == kBlockType_Reserved) | ||
2932 | { | ||
2933 | p->isErrorState = True; // SZ_ERROR_UNSUPPORTED | ||
2934 | return ZSTD2_STATE_BLOCK; | ||
2935 | } | ||
2936 | p->blockType = (Byte)type; | ||
2937 | p->isLastBlock = (Byte)(b0 & 1); | ||
2938 | p->inTempPos = 0; | ||
2939 | p->tempSize = 0; | ||
2940 | b0 >>= 3; | ||
2941 | b0 &= 0x1fffff; | ||
2942 | // info->num_BlockBytes_forType[type] += b0; | ||
2943 | if (b0 == 0) | ||
2944 | { | ||
2945 | // empty RAW/RLE blocks are allowed in original-zstd decoder | ||
2946 | if (type == kBlockType_Compressed) | ||
2947 | { | ||
2948 | p->isErrorState = True; | ||
2949 | return ZSTD2_STATE_BLOCK; | ||
2950 | } | ||
2951 | if (!ZSTD_DEC_IS_LAST_BLOCK(p)) | ||
2952 | return ZSTD2_STATE_BLOCK; | ||
2953 | if (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM) | ||
2954 | return ZSTD2_STATE_HASH; | ||
2955 | return ZSTD2_STATE_FINISHED; | ||
2956 | } | ||
2957 | p->blockSize = b0; | ||
2958 | { | ||
2959 | UInt32 blockLim = ZstdDec1_GET_BLOCK_SIZE_LIMIT(&p->decoder); | ||
2960 | // compressed and uncompressed block sizes cannot be larger than min(kBlockSizeMax, window_size) | ||
2961 | if (b0 > blockLim) | ||
2962 | { | ||
2963 | p->isErrorState = True; // SZ_ERROR_UNSUPPORTED; | ||
2964 | return ZSTD2_STATE_BLOCK; | ||
2965 | } | ||
2966 | if (DESCRIPTOR_Is_ContentSize_Defined(p->descriptor)) | ||
2967 | { | ||
2968 | const UInt64 rem = p->contentSize - p->contentProcessed; | ||
2969 | if (blockLim > rem) | ||
2970 | blockLim = (UInt32)rem; | ||
2971 | } | ||
2972 | p->curBlockUnpackRem = blockLim; | ||
2973 | // uncompressed block size cannot be larger than the remaining data size: | ||
2974 | if (type != kBlockType_Compressed) | ||
2975 | { | ||
2976 | if (b0 > blockLim) | ||
2977 | { | ||
2978 | p->isErrorState = True; // SZ_ERROR_UNSUPPORTED; | ||
2979 | return ZSTD2_STATE_BLOCK; | ||
2980 | } | ||
2981 | } | ||
2982 | } | ||
2983 | } | ||
2984 | return ZSTD2_STATE_DATA; | ||
2985 | } | ||
2986 | |||
2987 | if ((unsigned)p->frameState < ZSTD2_STATE_SKIP_DATA) | ||
2988 | { | ||
2989 | UInt32 v; | ||
2990 | if (tempSize != 4) | ||
2991 | return p->frameState; | ||
2992 | v = GetUi32(p->temp); | ||
2993 | if ((unsigned)p->frameState < ZSTD2_STATE_HASH) // == ZSTD2_STATE_SIGNATURE | ||
2994 | { | ||
2995 | if (v == 0xfd2fb528) | ||
2996 | { | ||
2997 | p->tempSize = 0; | ||
2998 | info->num_DataFrames++; | ||
2999 | return ZSTD2_STATE_FRAME_HEADER; | ||
3000 | } | ||
3001 | if ((v & 0xfffffff0) == 0x184d2a50) | ||
3002 | { | ||
3003 | p->tempSize = 0; | ||
3004 | info->num_SkipFrames++; | ||
3005 | return ZSTD2_STATE_SKIP_HEADER; | ||
3006 | } | ||
3007 | p->isErrorState = True; | ||
3008 | return ZSTD2_STATE_SIGNATURE; | ||
3009 | // return ZSTD2_STATE_ERROR; // is not ZSTD stream | ||
3010 | } | ||
3011 | if (p->frameState == ZSTD2_STATE_HASH) | ||
3012 | { | ||
3013 | info->checksum_Defined = True; | ||
3014 | info->checksum = v; | ||
3015 | // #ifndef DISABLE_XXH_CHECK | ||
3016 | if (!p->disableHash) | ||
3017 | { | ||
3018 | if (p->decoder.winPos < ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p)) | ||
3019 | { | ||
3020 | // unexpected code failure | ||
3021 | p->isErrorState = True; | ||
3022 | // SZ_ERROR_FAIL; | ||
3023 | } | ||
3024 | else | ||
3025 | if ((UInt32)Xxh64State_Digest(&p->xxh64, | ||
3026 | p->decoder.win + (p->decoder.winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p)), | ||
3027 | p->contentProcessed) != v) | ||
3028 | { | ||
3029 | p->hashError = True; | ||
3030 | // return ZSTD2_STATE_ERROR; // hash error | ||
3031 | } | ||
3032 | } | ||
3033 | // #endif | ||
3034 | return ZSTD2_STATE_FINISHED; | ||
3035 | } | ||
3036 | // (p->frameState == ZSTD2_STATE_SKIP_HEADER) | ||
3037 | { | ||
3038 | p->blockSize = v; | ||
3039 | info->skipFrames_Size += v; | ||
3040 | p->tempSize = 0; | ||
3041 | /* we want the caller to be able to know that there was a | ||
3042 | finished frame. So we allow the case where | ||
3043 | we have ZSTD2_STATE_SKIP_DATA state with (blockSize == 0). | ||
3044 | */ | ||
3045 | // if (v == 0) return ZSTD2_STATE_SIGNATURE; | ||
3046 | return ZSTD2_STATE_SKIP_DATA; | ||
3047 | } | ||
3048 | } | ||
3049 | |||
3050 | // if (p->frameState == ZSTD2_STATE_FRAME_HEADER) | ||
3051 | { | ||
3052 | unsigned descriptor; | ||
3053 | const Byte *h; | ||
3054 | descriptor = p->temp[0]; | ||
3055 | p->descriptor = (Byte)descriptor; | ||
3056 | if (descriptor & DESCRIPTOR_FLAG_RESERVED) // reserved bit | ||
3057 | { | ||
3058 | p->isErrorState = True; | ||
3059 | return ZSTD2_STATE_FRAME_HEADER; | ||
3060 | // return ZSTD2_STATE_ERROR; | ||
3061 | } | ||
3062 | { | ||
3063 | const unsigned n = DESCRIPTOR_Get_ContentSize_Flag3(descriptor); | ||
3064 | // tempSize -= 1 + ((1u << (n >> 1)) | ((n + 1) & 1)); | ||
3065 | tempSize -= (0x9a563422u >> (n * 4)) & 0xf; | ||
3066 | } | ||
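/* worked check of the packed table above: nibble (n) of 0x9a563422 is the
   number of header bytes (descriptor + optional window byte + content-size
   field) for each DESCRIPTOR_Get_ContentSize_Flag3() value:
     n    : 0  1  2  3  4  5  6  7
     size : 2  2  4  3  6  5 10  9
   equal to the commented formula 1 + ((1u << (n >> 1)) | ((n + 1) & 1)). */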
3067 | if (tempSize != (4u >> (3 - DESCRIPTOR_Get_DictionaryId_Flag(descriptor)))) | ||
3068 | return ZSTD2_STATE_FRAME_HEADER; | ||
3069 | |||
3070 | info->descriptor_OR = (Byte)(info->descriptor_OR | descriptor); | ||
3071 | info->descriptor_NOT_OR = (Byte)(info->descriptor_NOT_OR | ~descriptor); | ||
3072 | |||
3073 | h = &p->temp[1]; | ||
3074 | { | ||
3075 | Byte w = 0; | ||
3076 | if ((descriptor & DESCRIPTOR_FLAG_SINGLE) == 0) | ||
3077 | { | ||
3078 | w = *h++; | ||
3079 | if (info->windowDescriptor_MAX < w) | ||
3080 | info->windowDescriptor_MAX = w; | ||
3081 | // info->are_WindowDescriptors = True; | ||
3082 | // info->num_WindowDescriptors++; | ||
3083 | } | ||
3084 | else | ||
3085 | { | ||
3086 | // info->are_SingleSegments = True; | ||
3087 | // info->num_SingleSegments++; | ||
3088 | } | ||
3089 | p->windowDescriptor = w; | ||
3090 | } | ||
3091 | { | ||
3092 | unsigned n = DESCRIPTOR_Get_DictionaryId_Flag(descriptor); | ||
3093 | UInt32 d = 0; | ||
3094 | if (n) | ||
3095 | { | ||
3096 | n = 1u << (n - 1); | ||
3097 | d = GetUi32(h) & ((UInt32)(Int32)-1 >> (32 - 8u * n)); | ||
3098 | h += n; | ||
3099 | } | ||
3100 | p->dictionaryId = d; | ||
3101 | // info->dictionaryId_Cur = d; | ||
3102 | if (d != 0) | ||
3103 | { | ||
3104 | if (info->dictionaryId == 0) | ||
3105 | info->dictionaryId = d; | ||
3106 | else if (info->dictionaryId != d) | ||
3107 | info->are_DictionaryId_Different = True; | ||
3108 | } | ||
3109 | } | ||
3110 | { | ||
3111 | unsigned n = DESCRIPTOR_Get_ContentSize_Flag3(descriptor); | ||
3112 | UInt64 v = 0; | ||
3113 | if (n) | ||
3114 | { | ||
3115 | n >>= 1; | ||
3116 | if (n == 1) | ||
3117 | v = 256; | ||
3118 | v += GetUi64(h) & ((UInt64)(Int64)-1 >> (64 - (8u << n))); | ||
3119 | // info->are_ContentSize_Known = True; | ||
3120 | // info->num_Frames_with_ContentSize++; | ||
3121 | if (info->contentSize_MAX < v) | ||
3122 | info->contentSize_MAX = v; | ||
3123 | info->contentSize_Total += v; | ||
3124 | } | ||
3125 | else | ||
3126 | { | ||
3127 | info->are_ContentSize_Unknown = True; | ||
3128 | // info->num_Frames_without_ContentSize++; | ||
3129 | } | ||
3130 | p->contentSize = v; | ||
3131 | } | ||
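/* worked example for the content-size field above: after (n >>= 1), the
   field width is (8 << n) bits, i.e. 1, 2, 4, or 8 bytes (masked out of
   GetUi64()). For n == 1 the 2-byte format is biased by 256, so field bytes
   { 0x34, 0x12 } give v = 256 + 0x1234 == 4916. */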
3132 | // if ((size_t)(h - p->temp) != headerSize) return ZSTD2_STATE_ERROR; // it's unexpected internal code failure | ||
3133 | p->tempSize = 0; | ||
3134 | |||
3135 | info->checksum_Defined = False; | ||
3136 | /* | ||
3137 | if (descriptor & DESCRIPTOR_FLAG_CHECKSUM) | ||
3138 | info->are_Checksums = True; | ||
3139 | else | ||
3140 | info->are_Non_Checksums = True; | ||
3141 | */ | ||
3142 | |||
3143 | return ZSTD2_STATE_AFTER_HEADER; // ZSTD2_STATE_BLOCK; | ||
3144 | } | ||
3145 | } | ||
3146 | |||
3147 | |||
3148 | static void ZstdDec_Update_XXH(CZstdDec * const p, size_t xxh64_winPos) | ||
3149 | { | ||
3150 | /* | ||
3151 | #ifdef DISABLE_XXH_CHECK | ||
3152 | UNUSED_VAR(data) | ||
3153 | #else | ||
3154 | */ | ||
3155 | if (!p->disableHash && (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM)) | ||
3156 | { | ||
3157 | // const size_t pos = p->xxh64_winPos; | ||
3158 | const size_t size = (p->decoder.winPos - xxh64_winPos) & ~(size_t)31; | ||
3159 | if (size) | ||
3160 | { | ||
3161 | // p->xxh64_winPos = pos + size; | ||
3162 | Xxh64State_UpdateBlocks(&p->xxh64, | ||
3163 | p->decoder.win + xxh64_winPos, | ||
3164 | p->decoder.win + xxh64_winPos + size); | ||
3165 | } | ||
3166 | } | ||
3167 | } | ||
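/* note: XXH64 consumes input in 32-byte blocks, which is (assumed here) the
   value of Z7_XXH64_BLOCK_SIZE. So the update above rounds the hashed size
   down with (& ~(size_t)31), and the unhashed tail is tracked by
   ZstdDec_GET_UNPROCESSED_XXH64_SIZE() and flushed later by
   Xxh64State_Digest() in the ZSTD2_STATE_HASH handling. */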
3168 | |||
3169 | |||
3170 | /* | ||
3171 | in: | ||
3172 | (winLimit) : is a relaxed limit, at which this function is allowed to stop writing decoded data (if possible). | ||
3173 | - this function uses (winLimit) for RAW/RLE blocks only, | ||
3174 | because this function can decode a single RAW/RLE block in several different calls. | ||
3175 | - this function DOESN'T use (winLimit) for Compressed blocks, | ||
3176 | because this function decodes a full compressed block in a single call. | ||
3177 | (CZstdDec1::winPos <= winLimit) | ||
3178 | (winLimit <= CZstdDec1::cycSize). | ||
3179 | Note: if (ds->outBuf_fromCaller) mode is used, then | ||
3180 | { | ||
3181 | (strong_limit) is stored in CZstdDec1::cycSize. | ||
3182 | So (winLimit) is more strong than (strong_limit). | ||
3183 | } | ||
3184 | |||
3185 | exit: | ||
3186 | Note: (CZstdDecState::winPos) will be set by caller after exit of this function. | ||
3187 | |||
3188 | This function can exit for any of these conditions: | ||
3189 | - (frameState == ZSTD2_STATE_AFTER_HEADER) | ||
3190 | - (frameState == ZSTD2_STATE_FINISHED) : frame was finished : (status == ZSTD_STATUS_FINISHED_FRAME) is set | ||
3191 | - finished non-empty non-last block. So (CZstdDec1::winPos_atExit != winPos_atFuncStart). | ||
3192 | - ZSTD_STATUS_NEEDS_MORE_INPUT in src | ||
3193 | - (CZstdDec1::winPos) have reached (winLimit) in non-finished RAW/RLE block | ||
3194 | |||
3195 | This function decodes no more than one non-empty block. | ||
3196 | So it fulfills the condition at exit: | ||
3197 | (CZstdDec1::winPos_atExit - winPos_atFuncStart <= block_size_max) | ||
3198 | Note: (winPos_atExit > winLimit) is possible in some cases after compressed block decoding. | ||
3199 | |||
3200 | if (ds->outBuf_fromCaller) mode (useAdditionalWinLimit mode) is used | ||
3201 | { | ||
3202 | then this function uses an additional strong limit from (CZstdDec1::cycSize). | ||
3203 | So this function will not write any data after (CZstdDec1::cycSize). | ||
3204 | And it fulfills the condition at exit: | ||
3205 | (CZstdDec1::winPos_atExit <= CZstdDec1::cycSize) | ||
3206 | } | ||
3207 | */ | ||
3208 | static SRes ZstdDec_DecodeBlock(CZstdDec * const p, CZstdDecState * const ds, | ||
3209 | SizeT winLimitAdd) | ||
3210 | { | ||
3211 | const Byte *src = ds->inBuf; | ||
3212 | SizeT * const srcLen = &ds->inPos; | ||
3213 | const SizeT inSize = ds->inLim; | ||
3214 | // const int useAdditionalWinLimit = ds->outBuf_fromCaller ? 1 : 0; | ||
3215 | enum_ZstdStatus * const status = &ds->status; | ||
3216 | CZstdDecInfo * const info = &ds->info; | ||
3217 | SizeT winLimit; | ||
3218 | |||
3219 | const SizeT winPos_atFuncStart = p->decoder.winPos; | ||
3220 | src += *srcLen; | ||
3221 | *status = ZSTD_STATUS_NOT_SPECIFIED; | ||
3222 | |||
3223 | // finishMode = ZSTD_FINISH_ANY; | ||
3224 | if (ds->outSize_Defined) | ||
3225 | { | ||
3226 | if (ds->outSize < ds->outProcessed) | ||
3227 | { | ||
3228 | // p->isAfterSizeMode = 2; // we have extra bytes already | ||
3229 | *status = ZSTD_STATUS_OUT_REACHED; | ||
3230 | return SZ_OK; | ||
3231 | // size = 0; | ||
3232 | } | ||
3233 | else | ||
3234 | { | ||
3235 | // p->outSize >= p->outProcessed | ||
3236 | const UInt64 rem = ds->outSize - ds->outProcessed; | ||
3237 | /* | ||
3238 | if (rem == 0) | ||
3239 | p->isAfterSizeMode = 1; // we have reached exact required size | ||
3240 | */ | ||
3241 | if (winLimitAdd >= rem) | ||
3242 | { | ||
3243 | winLimitAdd = (SizeT)rem; | ||
3244 | // if (p->finishMode) finishMode = ZSTD_FINISH_END; | ||
3245 | } | ||
3246 | } | ||
3247 | } | ||
3248 | |||
3249 | winLimit = p->decoder.winPos + winLimitAdd; | ||
3250 | // (p->decoder.winPos <= winLimit) | ||
3251 | |||
3252 | // while (p->frameState != ZSTD2_STATE_ERROR) | ||
3253 | while (!p->isErrorState) | ||
3254 | { | ||
3255 | SizeT inCur = inSize - *srcLen; | ||
3256 | |||
3257 | if (p->frameState == ZSTD2_STATE_DATA) | ||
3258 | { | ||
3259 | /* (p->decoder.winPos == winPos_atFuncStart) is expected, | ||
3260 | because this function doesn't start a new block, | ||
3261 | if it has finished some non-empty block in this call. */ | ||
3262 | if (p->decoder.winPos != winPos_atFuncStart) | ||
3263 | return SZ_ERROR_FAIL; // it's unexpected | ||
3264 | |||
3265 | /* | ||
3266 | if (p->decoder.winPos > winLimit) | ||
3267 | { | ||
3268 | // we can be here, if in this function call | ||
3269 | // - we have extracted non-empty compressed block, and (winPos > winLimit) after that. | ||
3270 | // - we have started new block decoding after that. | ||
3271 | // It's unexpected case, because we exit after non-empty non-last block. | ||
3272 | *status = (inSize == *srcLen) ? | ||
3273 | ZSTD_STATUS_NEEDS_MORE_INPUT : | ||
3274 | ZSTD_STATUS_NOT_FINISHED; | ||
3275 | return SZ_OK; | ||
3276 | } | ||
3277 | */ | ||
3278 | // p->decoder.winPos <= winLimit | ||
3279 | |||
3280 | if (p->blockType != kBlockType_Compressed) | ||
3281 | { | ||
3282 | // it's RLE or RAW block. | ||
3283 | // p->blockSize != 0 | ||
3284 | // winLimit <= p->decoder.cycSize | ||
3285 | /* So here we use the stronger (winLimit), even for | ||
3286 | (ds->outBuf_fromCaller) mode. */ | ||
3287 | SizeT outCur = winLimit - p->decoder.winPos; | ||
3288 | { | ||
3289 | const UInt32 rem = p->blockSize; | ||
3290 | if (outCur > rem) | ||
3291 | outCur = rem; | ||
3292 | } | ||
3293 | if (p->blockType == kBlockType_Raw) | ||
3294 | { | ||
3295 | if (outCur > inCur) | ||
3296 | outCur = inCur; | ||
3297 | /* the output buffer is better aligned for the XXH code, | ||
3298 | so we use the hash on the output buffer data */ | ||
3299 | // ZstdDec_Update_XXH(p, src, outCur); // for debug: | ||
3300 | memcpy(p->decoder.win + p->decoder.winPos, src, outCur); | ||
3301 | src += outCur; | ||
3302 | *srcLen += outCur; | ||
3303 | } | ||
3304 | else // kBlockType_RLE | ||
3305 | { | ||
3306 | #define RLE_BYTE_INDEX_IN_temp 3 | ||
3307 | memset(p->decoder.win + p->decoder.winPos, | ||
3308 | p->temp[RLE_BYTE_INDEX_IN_temp], outCur); | ||
3309 | } | ||
3310 | { | ||
3311 | const SizeT xxh64_winPos = p->decoder.winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p); | ||
3312 | p->decoder.winPos += outCur; | ||
3313 | p->contentProcessed += outCur; | ||
3314 | ZstdDec_Update_XXH(p, xxh64_winPos); | ||
3315 | } | ||
3316 | // ds->winPos = p->decoder.winPos; // the caller does it instead. for debug: | ||
3317 | UPDATE_TOTAL_OUT(&p->decoder, outCur) | ||
3318 | ds->outProcessed += outCur; | ||
3319 | if (p->blockSize -= (UInt32)outCur) | ||
3320 | { | ||
3321 | /* | ||
3322 | if (ds->outSize_Defined) | ||
3323 | { | ||
3324 | if (ds->outSize <= ds->outProcessed) ds->isAfterSizeMode = (enum_ZstdStatus) | ||
3325 | (ds->outSize == ds->outProcessed ? 1u: 2u); | ||
3326 | } | ||
3327 | */ | ||
3328 | *status = (enum_ZstdStatus) | ||
3329 | (ds->outSize_Defined && ds->outSize <= ds->outProcessed ? | ||
3330 | ZSTD_STATUS_OUT_REACHED : (p->blockType == kBlockType_Raw && inSize == *srcLen) ? | ||
3331 | ZSTD_STATUS_NEEDS_MORE_INPUT : | ||
3332 | ZSTD_STATUS_NOT_FINISHED); | ||
3333 | return SZ_OK; | ||
3334 | } | ||
3335 | } | ||
3336 | else // kBlockType_Compressed | ||
3337 | { | ||
3338 | // p->blockSize != 0 | ||
3339 | // (uncompressed_size_of_block == 0) is allowed | ||
3340 | // (p->curBlockUnpackRem == 0) is allowed | ||
3341 | /* | ||
3342 | if (p->decoder.winPos >= winLimit) | ||
3343 | { | ||
3344 | if (p->decoder.winPos != winPos_atFuncStart) | ||
3345 | { | ||
3346 | // it's unexpected case | ||
3347 | // We already have some data in finished blocks in this function call. | ||
3348 | // So we don't decompress new block after (>=winLimit), | ||
3349 | // even if it's empty block. | ||
3350 | *status = (inSize == *srcLen) ? | ||
3351 | ZSTD_STATUS_NEEDS_MORE_INPUT : | ||
3352 | ZSTD_STATUS_NOT_FINISHED; | ||
3353 | return SZ_OK; | ||
3354 | } | ||
3355 | // (p->decoder.winPos == winLimit == winPos_atFuncStart) | ||
3356 | // we will decode current block, because that current | ||
3357 | // block can be empty block and we want to make some visible | ||
3358 | // change of (src) stream after function start. | ||
3359 | } | ||
3360 | */ | ||
3361 | /* | ||
3362 | if (ds->outSize_Defined && ds->outSize < ds->outProcessed) | ||
3363 | { | ||
3364 | // we don't want to start new block, if we have more extra decoded bytes already | ||
3365 | *status = ZSTD_STATUS_OUT_REACHED; | ||
3366 | return SZ_OK; | ||
3367 | } | ||
3368 | */ | ||
3369 | { | ||
3370 | const Byte *comprStream; | ||
3371 | size_t afterAvail; | ||
3372 | UInt32 inTempPos = p->inTempPos; | ||
3373 | const UInt32 rem = p->blockSize - inTempPos; | ||
3374 | // rem != 0 | ||
3375 | if (inTempPos != 0 // (inTemp) buffer already contains some input data | ||
3376 | || inCur < rem // available input data size is smaller than compressed block size | ||
3377 | || ZstdDec1_NeedTempBufferForInput(*srcLen, src, rem)) | ||
3378 | { | ||
3379 | if (inCur > rem) | ||
3380 | inCur = rem; | ||
3381 | if (inCur) | ||
3382 | { | ||
3383 | STAT_INC(g_Num_Blocks_memcpy) | ||
3384 | // we clear data for backward lookahead reading | ||
3385 | if (inTempPos == 0) | ||
3386 | memset(p->inTemp + kTempBuffer_PreSize - MAX_BACKWARD_DEPTH, 0, MAX_BACKWARD_DEPTH); | ||
3387 | // { unsigned y = 0; for(;y < 1000; y++) | ||
3388 | memcpy(p->inTemp + inTempPos + kTempBuffer_PreSize, src, inCur); | ||
3389 | // } | ||
3390 | src += inCur; | ||
3391 | *srcLen += inCur; | ||
3392 | inTempPos += (UInt32)inCur; | ||
3393 | p->inTempPos = inTempPos; | ||
3394 | } | ||
3395 | if (inTempPos != p->blockSize) | ||
3396 | { | ||
3397 | *status = ZSTD_STATUS_NEEDS_MORE_INPUT; | ||
3398 | return SZ_OK; | ||
3399 | } | ||
3400 | #if COPY_CHUNK_SIZE > 1 | ||
3401 | memset(p->inTemp + kTempBuffer_PreSize + inTempPos, 0, COPY_CHUNK_SIZE); | ||
3402 | #endif | ||
3403 | comprStream = p->inTemp + kTempBuffer_PreSize; | ||
3404 | afterAvail = k_Lit_AfterAvail; | ||
3405 | // we don't want to read non-initialized data or junk in CopyMatch(): | ||
3406 | } | ||
3407 | else | ||
3408 | { | ||
3409 | // inCur >= rem | ||
3410 | // we use direct decoding from (src) buffer: | ||
3411 | afterAvail = inCur - rem; | ||
3412 | comprStream = src; | ||
3413 | src += rem; | ||
3414 | *srcLen += rem; | ||
3415 | } | ||
3416 | |||
3417 | #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP | ||
3418 | ZstdDec1_NeedTempBufferForInput(*srcLen, comprStream, p->blockSize); | ||
3419 | #endif | ||
3420 | // printf("\nblockSize=%u", p->blockSize); | ||
3421 | // printf("%x\n", (unsigned)p->contentProcessed); | ||
3422 | STAT_INC(g_Num_Blocks_Compressed) | ||
3423 | { | ||
3424 | SRes sres; | ||
3425 | const size_t winPos = p->decoder.winPos; | ||
3426 | /* | ||
3427 | if (useAdditionalWinLimit), we use a strong unpack limit: the smallest of | ||
3428 | - limit from stream : (curBlockUnpackRem) | ||
3429 | - limit from caller : (cycSize - winPos) | ||
3430 | if (!useAdditionalWinLimit), we use only the relaxed limit: | ||
3431 | - limit from stream : (curBlockUnpackRem) | ||
3432 | */ | ||
3433 | SizeT outLimit = p->curBlockUnpackRem; | ||
3434 | if (ds->outBuf_fromCaller) | ||
3435 | // if (useAdditionalWinLimit) | ||
3436 | { | ||
3437 | const size_t limit = p->decoder.cycSize - winPos; | ||
3438 | if (outLimit > limit) | ||
3439 | outLimit = limit; | ||
3440 | } | ||
3441 | sres = ZstdDec1_DecodeBlock(&p->decoder, | ||
3442 | comprStream, p->blockSize, afterAvail, outLimit); | ||
3443 | // ds->winPos = p->decoder.winPos; // the caller does it instead. for debug: | ||
3444 | if (sres) | ||
3445 | { | ||
3446 | p->isErrorState = True; | ||
3447 | return sres; | ||
3448 | } | ||
3449 | { | ||
3450 | const SizeT xxh64_winPos = winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p); | ||
3451 | const size_t num = p->decoder.winPos - winPos; | ||
3452 | ds->outProcessed += num; | ||
3453 | p->contentProcessed += num; | ||
3454 | ZstdDec_Update_XXH(p, xxh64_winPos); | ||
3455 | } | ||
3456 | } | ||
3457 | // printf("\nwinPos=%x", (int)(unsigned)p->decoder.winPos); | ||
3458 | } | ||
3459 | } | ||
3460 | |||
3461 | /* | ||
3462 | if (ds->outSize_Defined) | ||
3463 | { | ||
3464 | if (ds->outSize <= ds->outProcessed) ds->isAfterSizeMode = (enum_ZstdStatus) | ||
3465 | (ds->outSize == ds->outProcessed ? 1u: 2u); | ||
3466 | } | ||
3467 | */ | ||
3468 | |||
3469 | if (!ZSTD_DEC_IS_LAST_BLOCK(p)) | ||
3470 | { | ||
3471 | p->frameState = ZSTD2_STATE_BLOCK; | ||
3472 | if (ds->outSize_Defined && ds->outSize < ds->outProcessed) | ||
3473 | { | ||
3474 | *status = ZSTD_STATUS_OUT_REACHED; | ||
3475 | return SZ_OK; | ||
3476 | } | ||
3477 | // we exit only if (winPos) was changed in this function call: | ||
3478 | if (p->decoder.winPos != winPos_atFuncStart) | ||
3479 | { | ||
3480 | // decoded block was not empty. So we exit: | ||
3481 | *status = (enum_ZstdStatus)( | ||
3482 | (inSize == *srcLen) ? | ||
3483 | ZSTD_STATUS_NEEDS_MORE_INPUT : | ||
3484 | ZSTD_STATUS_NOT_FINISHED); | ||
3485 | return SZ_OK; | ||
3486 | } | ||
3487 | // (p->decoder.winPos == winPos_atFuncStart) | ||
3488 | // so current decoded block was empty. | ||
3489 | // we will try to decode more blocks in this function. | ||
3490 | continue; | ||
3491 | } | ||
3492 | |||
3493 | // decoded block was last in frame | ||
3494 | if (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM) | ||
3495 | { | ||
3496 | p->frameState = ZSTD2_STATE_HASH; | ||
3497 | if (ds->outSize_Defined && ds->outSize < ds->outProcessed) | ||
3498 | { | ||
3499 | *status = ZSTD_STATUS_OUT_REACHED; | ||
3500 | return SZ_OK; // this early return can be disabled: | ||
3501 | /* We want to get the same return codes for any input buffer size. | ||
3502 | We want to report the ZSTD_STATUS_OUT_REACHED status as early as possible. | ||
3503 | So we exit with ZSTD_STATUS_OUT_REACHED here, | ||
3504 | instead of ZSTD2_STATE_HASH and ZSTD2_STATE_FINISHED processing, | ||
3505 | which depends on the input buffer size and can set | ||
3506 | ZSTD_STATUS_NEEDS_MORE_INPUT or return SZ_ERROR_DATA or SZ_ERROR_CRC. | ||
3507 | */ | ||
3508 | } | ||
3509 | } | ||
3510 | else | ||
3511 | { | ||
3512 | /* ZSTD2_STATE_FINISHED processing doesn't depend on the input buffer */ | ||
3513 | p->frameState = ZSTD2_STATE_FINISHED; | ||
3514 | } | ||
3515 | /* | ||
3516 | p->frameState = (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM) ? | ||
3517 | ZSTD2_STATE_HASH : | ||
3518 | ZSTD2_STATE_FINISHED; | ||
3519 | */ | ||
3520 | /* the ZSTD2_STATE_FINISHED state must be processed in this function call, | ||
3521 | because we must check contentSize and hashError in the ZSTD2_STATE_FINISHED code, | ||
3522 | while the caller can reinit the full state after ZSTD2_STATE_FINISHED. | ||
3523 | So we can't exit from the function here. */ | ||
3524 | continue; | ||
3525 | } | ||
3526 | |||
3527 | if (p->frameState == ZSTD2_STATE_FINISHED) | ||
3528 | { | ||
3529 | *status = ZSTD_STATUS_FINISHED_FRAME; | ||
3530 | if (DESCRIPTOR_Is_ContentSize_Defined(p->descriptor) | ||
3531 | && p->contentSize != p->contentProcessed) | ||
3532 | return SZ_ERROR_DATA; | ||
3533 | if (p->hashError) // for debug | ||
3534 | return SZ_ERROR_CRC; | ||
3535 | return SZ_OK; | ||
3536 | // p->frameState = ZSTD2_STATE_SIGNATURE; | ||
3537 | // continue; | ||
3538 | } | ||
3539 | |||
3540 | if (p->frameState == ZSTD2_STATE_AFTER_HEADER) | ||
3541 | return SZ_OK; // we need memory allocation for that state | ||
3542 | |||
3543 | if (p->frameState == ZSTD2_STATE_SKIP_DATA) | ||
3544 | { | ||
3545 | UInt32 blockSize = p->blockSize; | ||
3546 | // (blockSize == 0) is possible | ||
3547 | if (inCur > blockSize) | ||
3548 | inCur = blockSize; | ||
3549 | src += inCur; | ||
3550 | *srcLen += inCur; | ||
3551 | blockSize -= (UInt32)inCur; | ||
3552 | p->blockSize = blockSize; | ||
3553 | if (blockSize == 0) | ||
3554 | { | ||
3555 | p->frameState = ZSTD2_STATE_SIGNATURE; | ||
3556 | // continue; // for debug: we can continue without returning to the caller. | ||
3557 | // we notify the caller that the skip frame was finished: | ||
3558 | *status = ZSTD_STATUS_FINISHED_FRAME; | ||
3559 | return SZ_OK; | ||
3560 | } | ||
3561 | // blockSize != 0 | ||
3562 | // (inCur) was smaller than the previous value of p->blockSize. | ||
3563 | // (inSize == *srcLen) now | ||
3564 | *status = ZSTD_STATUS_NEEDS_MORE_INPUT; | ||
3565 | return SZ_OK; | ||
3566 | } | ||
3567 | |||
3568 | if (inCur == 0) | ||
3569 | { | ||
3570 | *status = ZSTD_STATUS_NEEDS_MORE_INPUT; | ||
3571 | return SZ_OK; | ||
3572 | } | ||
3573 | |||
3574 | { | ||
3575 | (*srcLen)++; | ||
3576 | p->frameState = ZstdDec_UpdateState(p, *src++, info); | ||
3577 | } | ||
3578 | } | ||
3579 | |||
3580 | *status = ZSTD_STATUS_NOT_SPECIFIED; | ||
3581 | p->isErrorState = True; | ||
3582 | // p->frameState = ZSTD2_STATE_ERROR; | ||
3583 | // if (p->frameState == ZSTD2_STATE_SIGNATURE) return SZ_ERROR_NO_ARCHIVE; | ||
3584 | return SZ_ERROR_DATA; | ||
3585 | } | ||
3586 | |||
3587 | |||
3588 | |||
3589 | |||
3590 | SRes ZstdDec_Decode(CZstdDecHandle dec, CZstdDecState *p) | ||
3591 | { | ||
3592 | p->needWrite_Size = 0; | ||
3593 | p->status = ZSTD_STATUS_NOT_SPECIFIED; | ||
3594 | dec->disableHash = p->disableHash; | ||
3595 | |||
3596 | if (p->outBuf_fromCaller) | ||
3597 | { | ||
3598 | dec->decoder.win = p->outBuf_fromCaller; | ||
3599 | dec->decoder.cycSize = p->outBufSize_fromCaller; | ||
3600 | } | ||
3601 | |||
3602 | // p->winPos = dec->decoder.winPos; | ||
3603 | |||
3604 | for (;;) | ||
3605 | { | ||
3606 | SizeT winPos, size; | ||
3607 | // SizeT outProcessed; | ||
3608 | SRes res; | ||
3609 | |||
3610 | if (p->wrPos > dec->decoder.winPos) | ||
3611 | return SZ_ERROR_FAIL; | ||
3612 | |||
3613 | if (dec->frameState == ZSTD2_STATE_FINISHED) | ||
3614 | { | ||
3615 | if (!p->outBuf_fromCaller) | ||
3616 | { | ||
3617 | // we need to set positions to zero for new frame. | ||
3618 | if (p->wrPos != dec->decoder.winPos) | ||
3619 | { | ||
3620 | /* We have already asked the caller to flush all data | ||
3621 | with (p->needWrite_Size) and (ZSTD_STATUS_FINISHED_FRAME) status. | ||
3622 | So this is an unexpected case */ | ||
3623 | // p->winPos = dec->decoder.winPos; | ||
3624 | // p->needWrite_Size = dec->decoder.winPos - p->wrPos; // flush size asking | ||
3625 | // return SZ_OK; // ask to flush again | ||
3626 | return SZ_ERROR_FAIL; | ||
3627 | } | ||
3628 | // (p->wrPos == dec->decoder.winPos), and we wrap to zero: | ||
3629 | dec->decoder.winPos = 0; | ||
3630 | p->winPos = 0; | ||
3631 | p->wrPos = 0; | ||
3632 | } | ||
3633 | ZstdDec_Init_ForNewFrame(dec); | ||
3634 | // continue; | ||
3635 | } | ||
3636 | |||
3637 | winPos = dec->decoder.winPos; | ||
3638 | { | ||
3639 | SizeT next = dec->decoder.cycSize; | ||
3640 | /* cycSize == 0, if no buffer has been allocated yet, | ||
3641 | or if (outBuf_fromCaller) mode and (outBufSize_fromCaller == 0) */ | ||
3642 | if (!p->outBuf_fromCaller | ||
3643 | && next | ||
3644 | && next <= winPos | ||
3645 | && dec->isCyclicMode) | ||
3646 | { | ||
3647 | // (0 < decoder.cycSize <= winPos) in isCyclicMode. | ||
3648 | // so we need to wrap (winPos) and (wrPos) over (cycSize). | ||
3649 | const size_t delta = next; | ||
3650 | // (delta) is how many bytes we remove from buffer. | ||
3651 | /* | ||
3652 | // we don't need data older than last (cycSize) bytes. | ||
3653 | size_t delta = winPos - next; // num bytes after (cycSize) | ||
3654 | if (delta <= next) // it's expected case | ||
3655 | delta = next; | ||
3656 | // delta == Max(cycSize, winPos - cycSize) | ||
3657 | */ | ||
3658 | if (p->wrPos < delta) | ||
3659 | { | ||
3660 | // (wrPos < decoder.cycSize) | ||
3661 | // We have already asked the caller to flush the required data | ||
3662 | // p->status = ZSTD_STATUS_NOT_SPECIFIED; | ||
3663 | // p->winPos = winPos; | ||
3664 | // p->needWrite_Size = delta - p->wrPos; // flush size asking | ||
3665 | // return SZ_OK; // ask to flush again | ||
3666 | return SZ_ERROR_FAIL; | ||
3667 | } | ||
3668 | // p->wrPos >= decoder.cycSize | ||
3669 | // we move the extra data after (decoder.cycSize) to the start of the cyclic buffer: | ||
3670 | winPos -= delta; | ||
3671 | if (winPos) | ||
3672 | { | ||
3673 | if (winPos >= delta) | ||
3674 | return SZ_ERROR_FAIL; | ||
3675 | memmove(dec->decoder.win, dec->decoder.win + delta, winPos); | ||
3676 | // printf("\nmemmove processed=%8x winPos=%8x\n", (unsigned)p->outProcessed, (unsigned)dec->decoder.winPos); | ||
3677 | STAT_INC(g_Num_Wrap_memmove_Num) | ||
3678 | STAT_UPDATE(g_Num_Wrap_memmove_Bytes += (unsigned)winPos;) | ||
3679 | } | ||
3680 | dec->decoder.winPos = winPos; | ||
3681 | p->winPos = winPos; | ||
3682 | p->wrPos -= delta; | ||
3683 | // dec->xxh64_winPos -= delta; | ||
3684 | |||
3685 | // (winPos < delta) | ||
3686 | #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF | ||
3687 | /* we zero the data after cycSize, because | ||
3688 | we don't want to read uninitialized data or junk in CopyMatch(). */ | ||
3689 | memset(dec->decoder.win + next, 0, COPY_CHUNK_SIZE); | ||
3690 | #endif | ||
3691 | |||
3692 | /* | ||
3693 | if (winPos == next) | ||
3694 | { | ||
3695 | if (winPos != p->wrPos) | ||
3696 | { | ||
3697 | // we already requested a full data flush for that case, | ||
3698 | // but we give the caller a second chance to flush data: | ||
3699 | p->needWrite_Size = winPos - p->wrPos; | ||
3700 | return SZ_OK; | ||
3701 | } | ||
3702 | // (decoder.cycSize == winPos == p->wrPos) | ||
3703 | // so we do second wrapping to zero: | ||
3704 | winPos = 0; | ||
3705 | dec->decoder.winPos = 0; | ||
3706 | p->winPos = 0; | ||
3707 | p->wrPos = 0; | ||
3708 | } | ||
3709 | */ | ||
3710 | // (winPos < next) | ||
3711 | } | ||
3712 | |||
3713 | if (winPos > next) | ||
3714 | return SZ_ERROR_FAIL; // it's an unexpected case | ||
3715 | /* | ||
3716 | if (!outBuf_fromCaller && isCyclicMode && cycSize != 0) | ||
3717 | then (winPos < cycSize) | ||
3718 | else (winPos <= cycSize) | ||
3719 | */ | ||
3720 | if (!p->outBuf_fromCaller) | ||
3721 | { | ||
3722 | // this code is optional. We try to optimize write chunk sizes. | ||
3723 | /* (next2) is the expected next write position in the caller, | ||
3724 | if the caller writes in kBlockSizeMax chunks. | ||
3725 | */ | ||
3726 | /* | ||
3727 | const size_t next2 = (winPos + kBlockSizeMax) & (kBlockSizeMax - 1); | ||
3728 | if (winPos < next2 && next2 < next) | ||
3729 | next = next2; | ||
3730 | */ | ||
3731 | } | ||
3732 | size = next - winPos; | ||
3733 | } | ||
3734 | |||
3735 | // note: ZstdDec_DecodeBlock() uses (winLimit = winPos + size) only for RLE and RAW blocks | ||
3736 | res = ZstdDec_DecodeBlock(dec, p, size); | ||
3737 | /* | ||
3738 | after one block decoding: | ||
3739 | if (!outBuf_fromCaller && isCyclicMode && cycSize != 0) | ||
3740 | then (winPos < cycSize + max_block_size) | ||
3741 | else (winPos <= cycSize) | ||
3742 | */ | ||
3743 | |||
3744 | if (!p->outBuf_fromCaller) | ||
3745 | p->win = dec->decoder.win; | ||
3746 | p->winPos = dec->decoder.winPos; | ||
3747 | |||
3748 | // outProcessed = dec->decoder.winPos - winPos; | ||
3749 | // p->outProcessed += outProcessed; | ||
3750 | |||
3751 | if (res != SZ_OK) | ||
3752 | return res; | ||
3753 | |||
3754 | if (dec->frameState != ZSTD2_STATE_AFTER_HEADER) | ||
3755 | { | ||
3756 | if (p->outBuf_fromCaller) | ||
3757 | return SZ_OK; | ||
3758 | { | ||
3759 | // !p->outBuf_fromCaller | ||
3760 | /* | ||
3761 | if (ZSTD_STATUS_FINISHED_FRAME), we request full flushing here because | ||
3762 | 1) it's simpler to work with allocation and extraction of the next frame, | ||
3763 | 2) it's better to start writing the next frame at aligned memory | ||
3764 | for faster 64-bit xxh reads. | ||
3765 | */ | ||
3766 | size_t end = dec->decoder.winPos; // end pos for all data flushing | ||
3767 | if (p->status != ZSTD_STATUS_FINISHED_FRAME) | ||
3768 | { | ||
3769 | // we request a flush here only in cases where a wrap of the cyclic buffer may be required in the next call. | ||
3770 | if (!dec->isCyclicMode) | ||
3771 | return SZ_OK; | ||
3772 | // isCyclicMode | ||
3773 | { | ||
3774 | const size_t delta = dec->decoder.cycSize; | ||
3775 | if (end < delta) | ||
3776 | return SZ_OK; // (winPos < cycSize). no need for flush | ||
3777 | // cycSize <= winPos | ||
3778 | // So we ask the caller to flush (cycSize - wrPos) bytes, | ||
3779 | // and then we will wrap the cyclic buffer in the next call | ||
3780 | end = delta; | ||
3781 | } | ||
3782 | } | ||
3783 | p->needWrite_Size = end - p->wrPos; | ||
3784 | } | ||
3785 | return SZ_OK; | ||
3786 | } | ||
3787 | |||
3788 | // ZSTD2_STATE_AFTER_HEADER | ||
3789 | { | ||
3790 | BoolInt useCyclic = False; | ||
3791 | size_t cycSize; | ||
3792 | |||
3793 | // p->status = ZSTD_STATUS_NOT_FINISHED; | ||
3794 | if (dec->dictionaryId != 0) | ||
3795 | { | ||
3796 | /* actually we can try to decode some data, | ||
3797 | because it's possible that some data doesn't use the dictionary */ | ||
3798 | // p->status = ZSTD_STATUS_NOT_SPECIFIED; | ||
3799 | return SZ_ERROR_UNSUPPORTED; | ||
3800 | } | ||
3801 | |||
3802 | { | ||
3803 | UInt64 winSize = dec->contentSize; | ||
3804 | UInt64 winSize_Allocate = winSize; | ||
3805 | const unsigned descriptor = dec->descriptor; | ||
3806 | |||
3807 | if ((descriptor & DESCRIPTOR_FLAG_SINGLE) == 0) | ||
3808 | { | ||
3809 | const Byte wd = dec->windowDescriptor; | ||
3810 | winSize = (UInt64)(8 + (wd & 7)) << ((wd >> 3) + 10 - 3); | ||
3811 | if (!DESCRIPTOR_Is_ContentSize_Defined(descriptor) | ||
3812 | || winSize_Allocate > winSize) | ||
3813 | { | ||
3814 | winSize_Allocate = winSize; | ||
3815 | useCyclic = True; | ||
3816 | } | ||
3817 | } | ||
3818 | /* | ||
3819 | else | ||
3820 | { | ||
3821 | if (p->info.singleSegment_ContentSize_MAX < winSize) | ||
3822 | p->info.singleSegment_ContentSize_MAX = winSize; | ||
3823 | // p->info.num_SingleSegments++; | ||
3824 | } | ||
3825 | */ | ||
3826 | if (p->info.windowSize_MAX < winSize) | ||
3827 | p->info.windowSize_MAX = winSize; | ||
3828 | if (p->info.windowSize_Allocate_MAX < winSize_Allocate) | ||
3829 | p->info.windowSize_Allocate_MAX = winSize_Allocate; | ||
3830 | /* | ||
3831 | winSize_Allocate is MIN(content_size, window_size_from_descriptor). | ||
3832 | Even if (content_size < window_size_from_descriptor), | ||
3833 | original-zstd still uses (window_size_from_descriptor) to check that decoding is allowed. | ||
3834 | We try to follow original-zstd, and here we check (winSize) instead of (winSize_Allocate). | ||
3835 | */ | ||
3836 | if ( | ||
3837 | // winSize_Allocate // it's relaxed check | ||
3838 | winSize // it's more strict check to be compatible with original-zstd | ||
3839 | > ((UInt64)1 << MAX_WINDOW_SIZE_LOG)) | ||
3840 | return SZ_ERROR_UNSUPPORTED; // SZ_ERROR_MEM | ||
3841 | cycSize = (size_t)winSize_Allocate; | ||
3842 | if (cycSize != winSize_Allocate) | ||
3843 | return SZ_ERROR_MEM; | ||
3844 | // cycSize <= winSize | ||
3845 | /* later we will use (CZstdDec1::winSize) to check match offsets and check block sizes. | ||
3846 | if (there is window descriptor) | ||
3847 | { | ||
3848 | We will check block size with (window_size_from_descriptor) instead of (winSize_Allocate). | ||
3849 | Does original-zstd do it that way also? | ||
3850 | } | ||
3851 | Here we must reduce the full 64-bit (winSize) to size_t for (CZstdDec1::winSize). | ||
3852 | Also we don't want too big values for (CZstdDec1::winSize). | ||
3853 | our (CZstdDec1::winSize) will meet the condition: | ||
3854 | (CZstdDec1::winSize < kBlockSizeMax || CZstdDec1::winSize <= cycSize). | ||
3855 | */ | ||
3856 | dec->decoder.winSize = (winSize < kBlockSizeMax) ? (size_t)winSize: cycSize; | ||
3857 | // note: (CZstdDec1::winSize > cycSize) is possible, if (!useCyclic) | ||
3858 | } | ||
3859 | |||
3860 | RINOK(ZstdDec_AllocateMisc(dec)) | ||
3861 | |||
3862 | if (p->outBuf_fromCaller) | ||
3863 | dec->isCyclicMode = False; | ||
3864 | else | ||
3865 | { | ||
3866 | size_t d = cycSize; | ||
3867 | |||
3868 | if (dec->decoder.winPos != p->wrPos) | ||
3869 | return SZ_ERROR_FAIL; | ||
3870 | |||
3871 | dec->decoder.winPos = 0; | ||
3872 | p->wrPos = 0; | ||
3873 | p->winPos = dec->decoder.winPos; | ||
3874 | |||
3875 | /* | ||
3876 | const size_t needWrite = dec->decoder.winPos - p->wrPos; | ||
3877 | if (!needWrite) | ||
3878 | { | ||
3879 | dec->decoder.winPos = 0; | ||
3880 | p->wrPos = 0; | ||
3881 | p->winPos = dec->decoder.winPos; | ||
3882 | } | ||
3883 | */ | ||
3884 | /* if (!useCyclic) we allocate only cycSize = ContentSize. | ||
3885 | But if we want to support the case where a new frame starts with (winPos != 0), | ||
3886 | then we will wrap over zero, and we still need | ||
3887 | to set (useCyclic) and allocate additional buffer space. | ||
3888 | Now we don't allow a new frame to start with (winPos != 0), | ||
3889 | so (dec->decoder->winPos == 0), and we | ||
3890 | can use (!useCyclic) with reduced buffer sizes. | ||
3891 | */ | ||
3892 | /* | ||
3893 | if (dec->decoder->winPos != 0) | ||
3894 | useCyclic = True; | ||
3895 | */ | ||
3896 | |||
3897 | if (useCyclic) | ||
3898 | { | ||
3899 | /* the cyclic buffer size must be at least (COPY_CHUNK_SIZE - 1) bytes | ||
3900 | larger than the window size, because CopyMatch() can write an additional | ||
3901 | (COPY_CHUNK_SIZE - 1) bytes and overwrite the oldest data in the cyclic buffer. | ||
3902 | But for performance reasons we align (cycSize) to (kBlockSizeMax). | ||
3903 | Also we must ensure (cycSize >= max_decoded_data_after_cycSize), | ||
3904 | because after the data move that wraps over zero we must ensure (winPos < cycSize). | ||
3905 | */ | ||
3906 | const size_t alignSize = kBlockSizeMax; | ||
3907 | /* here we add (1 << 7) instead of (COPY_CHUNK_SIZE - 1), because | ||
3908 | we want to get the same (cycSize) for different COPY_CHUNK_SIZE values. */ | ||
3909 | // cycSize += (COPY_CHUNK_SIZE - 1) + (alignSize - 1); // for debug : we can get smallest (cycSize) | ||
3910 | cycSize += (1 << 7) + alignSize; | ||
3911 | cycSize &= ~(size_t)(alignSize - 1); | ||
3912 | // cycSize must be aligned to 32, because xxh requires 32-byte blocks. | ||
3913 | // cycSize += 12345; // for debug | ||
3914 | // cycSize += 1 << 10; // for debug | ||
3915 | // cycSize += 32; // for debug | ||
3916 | // cycSize += kBlockSizeMax; // for debug | ||
3917 | if (cycSize < d) | ||
3918 | return SZ_ERROR_MEM; | ||
3919 | /* | ||
3920 | in cyclic buffer mode we allow decoding of one additional block | ||
3921 | that exceeds (cycSize). | ||
3922 | So we must allocate an additional (kBlockSizeMax) bytes after (cycSize). | ||
3923 | if defined(Z7_STD_DEC_USE_AFTER_CYC_BUF) | ||
3924 | { | ||
3925 | we can read (COPY_CHUNK_SIZE - 1) bytes after (cycSize), | ||
3926 | but we already allocate an additional kBlockSizeMax, which | ||
3927 | is larger than COPY_CHUNK_SIZE. | ||
3928 | So we don't need additional space of COPY_CHUNK_SIZE after (cycSize). | ||
3929 | } | ||
3930 | */ | ||
3931 | /* | ||
3932 | #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF | ||
3933 | d = cycSize + (1 << 7); // we must add at least (COPY_CHUNK_SIZE - 1) | ||
3934 | #endif | ||
3935 | */ | ||
3936 | d = cycSize + kBlockSizeMax; | ||
3937 | if (d < cycSize) | ||
3938 | return SZ_ERROR_MEM; | ||
3939 | } | ||
3940 | |||
3941 | { | ||
3942 | const size_t kMinWinAllocSize = 1 << 12; | ||
3943 | if (d < kMinWinAllocSize) | ||
3944 | d = kMinWinAllocSize; | ||
3945 | } | ||
3946 | |||
3947 | if (d > dec->winBufSize_Allocated) | ||
3948 | { | ||
3949 | /* | ||
3950 | if (needWrite) | ||
3951 | { | ||
3952 | p->needWrite_Size = needWrite; | ||
3953 | return SZ_OK; | ||
3954 | // return SZ_ERROR_FAIL; | ||
3955 | } | ||
3956 | */ | ||
3957 | |||
3958 | if (dec->winBufSize_Allocated != 0) | ||
3959 | { | ||
3960 | const size_t k_extra = (useCyclic || d >= (1u << 20)) ? | ||
3961 | 2 * kBlockSizeMax : 0; | ||
3962 | unsigned i = useCyclic ? 17 : 12; | ||
3963 | for (; i < sizeof(size_t) * 8; i++) | ||
3964 | { | ||
3965 | const size_t d2 = ((size_t)1 << i) + k_extra; | ||
3966 | if (d2 >= d) | ||
3967 | { | ||
3968 | d = d2; | ||
3969 | break; | ||
3970 | } | ||
3971 | } | ||
3972 | } | ||
3973 | // RINOK(ZstdDec_AllocateWindow(dec, d)) | ||
3974 | ZstdDec_FreeWindow(dec); | ||
3975 | dec->win_Base = (Byte *)ISzAlloc_Alloc(dec->alloc_Big, d); | ||
3976 | if (!dec->win_Base) | ||
3977 | return SZ_ERROR_MEM; | ||
3978 | dec->decoder.win = dec->win_Base; | ||
3979 | dec->winBufSize_Allocated = d; | ||
3980 | } | ||
3981 | /* | ||
3982 | else | ||
3983 | { | ||
3984 | // for non-cyclic mode we want to flush data and set winPos = 0 | ||
3985 | if (needWrite) | ||
3986 | { | ||
3987 | if (!useCyclic || dec->decoder.winPos >= cycSize) | ||
3988 | { | ||
3989 | p->needWrite_Size = needWrite; | ||
3990 | return SZ_OK; | ||
3991 | // return SZ_ERROR_FAIL; | ||
3992 | } | ||
3993 | } | ||
3994 | } | ||
3995 | */ | ||
3996 | |||
3997 | dec->decoder.cycSize = cycSize; | ||
3998 | p->win = dec->decoder.win; | ||
3999 | // p->cycSize = dec->decoder.cycSize; | ||
4000 | dec->isCyclicMode = (Byte)useCyclic; | ||
4001 | } // (!p->outBuf_fromCaller) end | ||
4002 | |||
4003 | // p->winPos = dec->decoder.winPos; | ||
4004 | dec->frameState = ZSTD2_STATE_BLOCK; | ||
4005 | // continue; | ||
4006 | } // ZSTD2_STATE_AFTER_HEADER end | ||
4007 | } | ||
4008 | } | ||
4009 | |||
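A note on the window-descriptor math in the ZSTD2_STATE_AFTER_HEADER branch above: the expression (8 + (wd & 7)) << ((wd >> 3) + 10 - 3) is a one-shift rewrite of the zstd frame-format rule windowSize = windowBase + mantissa * windowBase / 8, with windowBase = 1 << (10 + exponent), exponent = wd >> 3, mantissa = wd & 7. A minimal standalone check of that identity (helper names are illustrative, not from this source):

#include <assert.h>
#include <stdint.h>

/* Pavlov's one-shift form of the zstd window size. */
static uint64_t WinSize_OneShift(unsigned wd)
{
  return (uint64_t)(8 + (wd & 7)) << ((wd >> 3) + 10 - 3);
}

/* The form used in the zstd frame-format description. */
static uint64_t WinSize_SpecForm(unsigned wd)
{
  const uint64_t base = (uint64_t)1 << ((wd >> 3) + 10);
  return base + (wd & 7) * (base >> 3);
}

int main(void)
{
  unsigned wd;
  for (wd = 0; wd < 256; wd++)
    assert(WinSize_OneShift(wd) == WinSize_SpecForm(wd));
  /* wd == 0 gives the 1 KiB minimum: (8 + 0) << 7 == 1024 */
  return 0;
}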
4010 | |||
4011 | void ZstdDec_GetResInfo(const CZstdDec *dec, | ||
4012 | const CZstdDecState *p, | ||
4013 | SRes res, | ||
4014 | CZstdDecResInfo *stat) | ||
4015 | { | ||
4016 | // ZstdDecInfo_CLEAR(stat); | ||
4017 | stat->extraSize = 0; | ||
4018 | stat->is_NonFinishedFrame = False; | ||
4019 | if (dec->frameState != ZSTD2_STATE_FINISHED) | ||
4020 | { | ||
4021 | if (dec->frameState == ZSTD2_STATE_SIGNATURE) | ||
4022 | { | ||
4023 | stat->extraSize = (Byte)dec->tempSize; | ||
4024 | if (ZstdDecInfo_GET_NUM_FRAMES(&p->info) == 0) | ||
4025 | res = SZ_ERROR_NO_ARCHIVE; | ||
4026 | } | ||
4027 | else | ||
4028 | { | ||
4029 | stat->is_NonFinishedFrame = True; | ||
4030 | if (res == SZ_OK && p->status == ZSTD_STATUS_NEEDS_MORE_INPUT) | ||
4031 | res = SZ_ERROR_INPUT_EOF; | ||
4032 | } | ||
4033 | } | ||
4034 | stat->decode_SRes = res; | ||
4035 | } | ||
4036 | |||
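ZstdDec_GetResInfo() above refines the raw ZstdDec_Decode() result once all input has been fed. A hedged usage sketch; (dec) and (st) are assumed to be set up as described in ZstdDec.h, and the classification shown is only one reasonable way to consume CZstdDecResInfo:

/* Sketch: classify the final decoder state after the last Decode() call. */
CZstdDecResInfo ri;
const SRes res = ZstdDec_Decode(dec, &st);
ZstdDec_GetResInfo(dec, &st, res, &ri);
if (ri.decode_SRes == SZ_ERROR_NO_ARCHIVE)
  { /* no zstd frames were found in the input */ }
else if (ri.decode_SRes == SZ_ERROR_INPUT_EOF)
  { /* a frame was truncated: more input was required */ }
else if (ri.extraSize != 0)
  { /* (ri.extraSize) trailing bytes were not part of any frame */ }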
4037 | |||
4038 | size_t ZstdDec_ReadUnusedFromInBuf( | ||
4039 | CZstdDecHandle dec, | ||
4040 | size_t afterDecoding_tempPos, | ||
4041 | void *data, size_t size) | ||
4042 | { | ||
4043 | size_t processed = 0; | ||
4044 | if (dec->frameState == ZSTD2_STATE_SIGNATURE) | ||
4045 | { | ||
4046 | Byte *dest = (Byte *)data; | ||
4047 | const size_t tempSize = dec->tempSize; | ||
4048 | while (afterDecoding_tempPos < tempSize) | ||
4049 | { | ||
4050 | if (size == 0) | ||
4051 | break; | ||
4052 | size--; | ||
4053 | *dest++ = dec->temp[afterDecoding_tempPos++]; | ||
4054 | processed++; | ||
4055 | } | ||
4056 | } | ||
4057 | return processed; | ||
4058 | } | ||
4059 | |||
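ZstdDec_ReadUnusedFromInBuf() above only drains bytes that were buffered in (dec->temp) while scanning for a frame signature; (extraSize) from ZstdDec_GetResInfo() reports how many such bytes exist. A hedged sketch (the buffer name and size are illustrative):

/* Sketch: recover input bytes the decoder buffered but did not use,
   e.g. when non-zstd data follows the last frame in the stream. */
Byte tail[32];
const size_t n = ZstdDec_ReadUnusedFromInBuf(dec,
    0,  /* afterDecoding_tempPos must be zero before the first call */
    tail, sizeof(tail));
/* the (n) recovered bytes belong to the data after the zstd stream */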
4060 | |||
4061 | void ZstdDecState_Clear(CZstdDecState *p) | ||
4062 | { | ||
4063 | memset(p, 0, sizeof(*p)); | ||
4064 | } | ||
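To make the cyclic-buffer sizing in ZstdDec_Decode() concrete, here is the rounding arithmetic worked through for a 1 MiB window. The value of kBlockSizeMax is not shown in this hunk; the sketch assumes it equals the zstd block-size limit (1 << 17), so treat that constant as an assumption:

/* Worked example of the (cycSize) rounding in ZstdDec_Decode(). */
size_t cycSize = (size_t)1 << 20;          /* winSize_Allocate = 1 MiB      */
const size_t alignSize = (size_t)1 << 17;  /* kBlockSizeMax (assumed value) */
cycSize += (1 << 7) + alignSize;           /* headroom + alignment reserve  */
cycSize &= ~(size_t)(alignSize - 1);       /* 0x120080 -> 0x120000          */
/* result: 1 MiB + 128 KiB, a multiple of alignSize,
   and at least (window size + 128 bytes) */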
diff --git a/C/ZstdDec.h b/C/ZstdDec.h new file mode 100644 index 0000000..cd26131 --- /dev/null +++ b/C/ZstdDec.h | |||
@@ -0,0 +1,173 @@ | |||
1 | /* ZstdDec.h -- Zstd Decoder interfaces | ||
2 | 2024-01-21 : Igor Pavlov : Public domain */ | ||
3 | |||
4 | #ifndef ZIP7_INC_ZSTD_DEC_H | ||
5 | #define ZIP7_INC_ZSTD_DEC_H | ||
6 | |||
7 | EXTERN_C_BEGIN | ||
8 | |||
9 | typedef struct CZstdDec CZstdDec; | ||
10 | typedef CZstdDec * CZstdDecHandle; | ||
11 | |||
12 | CZstdDecHandle ZstdDec_Create(ISzAllocPtr alloc_Small, ISzAllocPtr alloc_Big); | ||
13 | void ZstdDec_Destroy(CZstdDecHandle p); | ||
14 | |||
15 | typedef enum | ||
16 | { | ||
17 | ZSTD_STATUS_NOT_SPECIFIED, /* use main error code instead */ | ||
18 | ZSTD_STATUS_FINISHED_FRAME, /* data frame or skip frame was finished */ | ||
19 | ZSTD_STATUS_NOT_FINISHED, /* just finished non-empty block or unfinished RAW/RLE block */ | ||
20 | ZSTD_STATUS_NEEDS_MORE_INPUT, /* the callee needs more input bytes. It has higher priority than ZSTD_STATUS_NOT_FINISHED */ | ||
21 | ZSTD_STATUS_OUT_REACHED /* the frame is not finished and ((outProcessed > outSize) || (outProcessed == outSize && unfinished RAW/RLE block)) */ | ||
22 | } enum_ZstdStatus_Dummy; | ||
23 | |||
24 | #define ZstdDecState_DOES_NEED_MORE_INPUT_OR_FINISHED_FRAME(p) \ | ||
25 | ((p)->status & ZSTD_STATUS_FINISHED_FRAME) | ||
26 | /* | ||
27 | ((p)->status == ZSTD_STATUS_NEEDS_MORE_INPUT || \ | ||
28 | (p)->status == ZSTD_STATUS_FINISHED_FRAME) | ||
29 | */ | ||
30 | |||
31 | typedef Byte enum_ZstdStatus; | ||
32 | |||
33 | |||
34 | void ZstdDec_Init(CZstdDecHandle p); | ||
35 | |||
36 | typedef struct | ||
37 | { | ||
38 | UInt64 num_Blocks; | ||
39 | Byte descriptor_OR; | ||
40 | Byte descriptor_NOT_OR; | ||
41 | Byte are_ContentSize_Unknown; | ||
42 | Byte windowDescriptor_MAX; | ||
43 | |||
44 | // Byte are_ContentSize_Known; | ||
45 | // Byte are_SingleSegments; | ||
46 | // Byte are_WindowDescriptors; | ||
47 | Byte checksum_Defined; | ||
48 | // Byte are_Checksums; | ||
49 | // Byte are_Non_Checksums; | ||
50 | |||
51 | // Byte are_DictionaryId; | ||
52 | Byte are_DictionaryId_Different; | ||
53 | |||
54 | // Byte reserved[3]; | ||
55 | |||
56 | UInt32 checksum; // checksum of last data frame | ||
57 | // UInt32 dictionaryId_Cur; | ||
58 | UInt32 dictionaryId; // if there are non-zero dictionary IDs, then it's the first dictionaryId | ||
59 | |||
60 | UInt64 num_DataFrames; | ||
61 | UInt64 num_SkipFrames; | ||
62 | UInt64 skipFrames_Size; | ||
63 | UInt64 contentSize_Total; | ||
64 | UInt64 contentSize_MAX; | ||
65 | // UInt64 num_Checksums; | ||
66 | // UInt64 num_Non_Checksums; // frames without checksum | ||
67 | // UInt64 num_WindowDescriptors; | ||
68 | // UInt64 num_SingleSegments; | ||
69 | // UInt64 num_Frames_with_ContentSize; | ||
70 | // UInt64 num_Frames_without_ContentSize; | ||
71 | UInt64 windowSize_MAX; | ||
72 | UInt64 windowSize_Allocate_MAX; | ||
73 | // UInt64 num_DictionaryIds; | ||
74 | // UInt64 num_Blocks_forType[4]; | ||
75 | // UInt64 num_BlockBytes_forType[4]; | ||
76 | // UInt64 num_SingleSegments; | ||
77 | // UInt64 singleSegment_ContentSize_MAX; | ||
78 | } CZstdDecInfo; | ||
79 | |||
80 | #define ZstdDecInfo_CLEAR(p) { memset(p, 0, sizeof(*(p))); } | ||
81 | |||
82 | #define ZstdDecInfo_GET_NUM_FRAMES(p) ((p)->num_DataFrames + (p)->num_SkipFrames) | ||
83 | |||
84 | |||
85 | typedef struct CZstdDecState | ||
86 | { | ||
87 | enum_ZstdStatus status; // out | ||
88 | Byte disableHash; | ||
89 | // Byte mustBeFinished; | ||
90 | Byte outSize_Defined; | ||
91 | // Byte isAfterSizeMode; | ||
92 | // UInt64 inProcessed; | ||
93 | // SRes codeRes; | ||
94 | // Byte needWrite_IsStrong; | ||
95 | |||
96 | const Byte *inBuf; | ||
97 | size_t inPos; // in/out | ||
98 | size_t inLim; | ||
99 | |||
100 | const Byte *win; // out | ||
101 | size_t winPos; // out | ||
102 | size_t wrPos; // in/out | ||
103 | // size_t cycSize; // out : if (!outBuf_fromCaller) | ||
104 | size_t needWrite_Size; // out | ||
105 | |||
106 | Byte *outBuf_fromCaller; | ||
107 | size_t outBufSize_fromCaller; | ||
108 | /* (outBufSize_fromCaller >= full_uncompressed_size_of_all_frames) is required | ||
109 | for successful decoding. | ||
110 | If (outBufSize_fromCaller < full_uncompressed_size_of_all_frames), | ||
111 | decoding can report an error, because we decode on a per-block basis. | ||
112 | */ | ||
113 | |||
114 | // size_t outStep; | ||
115 | UInt64 outSize; // total in all frames | ||
116 | UInt64 outProcessed; // out decoded in all frames (it can be >= outSize) | ||
117 | |||
118 | CZstdDecInfo info; | ||
119 | } CZstdDecState; | ||
120 | |||
121 | void ZstdDecState_Clear(CZstdDecState *p); | ||
122 | |||
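A hedged setup sketch for the (outBuf_fromCaller) mode described in the struct comment above, where the caller supplies one buffer large enough for all decoded frames; the variable names and sizes (compressed, compressedSize, outBuf, outBufSize) are illustrative, not part of this API:

/* Sketch: single-buffer decoding into caller-owned memory. */
CZstdDecState st;
ZstdDecState_Clear(&st);
st.inBuf = compressed;              /* whole compressed stream      */
st.inPos = 0;
st.inLim = compressedSize;
st.outBuf_fromCaller = outBuf;      /* must hold all decoded frames */
st.outBufSize_fromCaller = outBufSize;
st.outSize = outBufSize;            /* optional output limit        */
st.outSize_Defined = True;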
123 | /* | ||
124 | ZstdDec_Decode() | ||
125 | return: | ||
126 | SZ_OK - no error | ||
127 | SZ_ERROR_DATA - Data Error | ||
128 | SZ_ERROR_MEM - Memory allocation error | ||
129 | SZ_ERROR_UNSUPPORTED - Unsupported method or method properties | ||
130 | SZ_ERROR_CRC - XXH hash Error | ||
131 | // SZ_ERROR_ARCHIVE - Headers error (not used now) | ||
132 | */ | ||
133 | SRes ZstdDec_Decode(CZstdDecHandle dec, CZstdDecState *p); | ||
134 | |||
135 | /* | ||
136 | ZstdDec_ReadUnusedFromInBuf(): | ||
137 | returns: the number of bytes that were read from InBuf | ||
138 | (afterDecoding_tempPos) must be set to zero before the first call of ZstdDec_ReadUnusedFromInBuf() | ||
139 | */ | ||
140 | size_t ZstdDec_ReadUnusedFromInBuf( | ||
141 | CZstdDecHandle dec, | ||
142 | size_t afterDecoding_tempPos, // in/out | ||
143 | void *data, size_t size); | ||
144 | |||
145 | typedef struct | ||
146 | { | ||
147 | SRes decode_SRes; // error code of data decoding | ||
148 | Byte is_NonFinishedFrame; // there is unfinished decoding for data frame or skip frame | ||
149 | Byte extraSize; | ||
150 | } CZstdDecResInfo; | ||
151 | |||
152 | /* | ||
153 | #define ZstdDecResInfo_CLEAR(p) \ | ||
154 | { (p)->decode_SRes = 0; \ | ||
155 | (p)->is_NonFinishedFrame; \ | ||
156 | (p)->extraSize = 0; \ | ||
157 | } | ||
158 | // memset(p, 0, sizeof(*p)); | ||
159 | */ | ||
160 | |||
161 | /* | ||
162 | additional error codes for CZstdDecResInfo::decode_SRes: | ||
163 | SZ_ERROR_NO_ARCHIVE - the input is not a zstd stream (no frames) | ||
164 | SZ_ERROR_INPUT_EOF - more data is needed in the input stream | ||
165 | */ | ||
166 | void ZstdDec_GetResInfo(const CZstdDec *dec, | ||
167 | const CZstdDecState *p, | ||
168 | SRes res, // it's result from ZstdDec_Decode() | ||
169 | CZstdDecResInfo *info); | ||
170 | |||
171 | EXTERN_C_END | ||
172 | |||
173 | #endif | ||
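Putting the interface above together: a minimal, hedged sketch of the streaming loop for the mode where the decoder owns the window (outBuf_fromCaller == NULL) and asks the caller to flush (needWrite_Size) bytes from (win + wrPos) after each call. It assumes (dec) was created with ZstdDec_Create(); ReadInput()/WriteOutput() are illustrative placeholders, not part of this API, and error handling is elided:

/* Sketch of the streaming decode loop, per the ZstdDec.h comments above. */
CZstdDecState st;
Byte inBuf[1 << 16];  /* input chunk size is illustrative */
ZstdDecState_Clear(&st);
ZstdDec_Init(dec);
for (;;)
{
  SRes res;
  if (st.inPos == st.inLim)
  {
    st.inBuf = inBuf;
    st.inPos = 0;
    st.inLim = ReadInput(inBuf, sizeof(inBuf));  /* placeholder */
  }
  res = ZstdDec_Decode(dec, &st);
  if (res != SZ_OK)
    break;  /* SZ_ERROR_DATA / SZ_ERROR_MEM / SZ_ERROR_UNSUPPORTED / SZ_ERROR_CRC */
  if (st.needWrite_Size)
  {
    WriteOutput(st.win + st.wrPos, st.needWrite_Size);  /* placeholder */
    st.wrPos += st.needWrite_Size;
  }
  if (st.inLim == 0 && ZstdDecState_DOES_NEED_MORE_INPUT_OR_FINISHED_FRAME(&st))
    break;  /* input exhausted and decoder stopped at a frame boundary */
}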
diff --git a/C/var_clang_arm64.mak b/C/var_clang_arm64.mak index 4b35409..971101a 100644 --- a/C/var_clang_arm64.mak +++ b/C/var_clang_arm64.mak | |||
@@ -6,6 +6,7 @@ IS_ARM64=1 | |||
6 | CROSS_COMPILE= | 6 | CROSS_COMPILE= |
7 | MY_ARCH= | 7 | MY_ARCH= |
8 | USE_ASM=1 | 8 | USE_ASM=1 |
9 | ASM_FLAGS=-Wno-unused-macros | ||
9 | CC=$(CROSS_COMPILE)clang | 10 | CC=$(CROSS_COMPILE)clang |
10 | CXX=$(CROSS_COMPILE)clang++ | 11 | CXX=$(CROSS_COMPILE)clang++ |
11 | USE_CLANG=1 | 12 | USE_CLANG=1 |