diff options
Diffstat (limited to 'C')
75 files changed, 10047 insertions, 1298 deletions
diff --git a/C/7zArcIn.c b/C/7zArcIn.c index 43fa7c2..23f2949 100644 --- a/C/7zArcIn.c +++ b/C/7zArcIn.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* 7zArcIn.c -- 7z Input functions | 1 | /* 7zArcIn.c -- 7z Input functions |
| 2 | 2023-05-11 : Igor Pavlov : Public domain */ | 2 | 2023-09-07 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -301,7 +301,7 @@ static SRes RememberBitVector(CSzData *sd, UInt32 numItems, const Byte **v) | |||
| 301 | 301 | ||
| 302 | static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems) | 302 | static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems) |
| 303 | { | 303 | { |
| 304 | Byte b = 0; | 304 | unsigned b = 0; |
| 305 | unsigned m = 0; | 305 | unsigned m = 0; |
| 306 | UInt32 sum = 0; | 306 | UInt32 sum = 0; |
| 307 | for (; numItems != 0; numItems--) | 307 | for (; numItems != 0; numItems--) |
| @@ -312,7 +312,7 @@ static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems) | |||
| 312 | m = 8; | 312 | m = 8; |
| 313 | } | 313 | } |
| 314 | m--; | 314 | m--; |
| 315 | sum += ((b >> m) & 1); | 315 | sum += (UInt32)((b >> m) & 1); |
| 316 | } | 316 | } |
| 317 | return sum; | 317 | return sum; |
| 318 | } | 318 | } |
| @@ -1,93 +1,96 @@ | |||
| 1 | /* 7zCrc.c -- CRC32 calculation and init | 1 | /* 7zCrc.c -- CRC32 calculation and init |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| 6 | #include "7zCrc.h" | 6 | #include "7zCrc.h" |
| 7 | #include "CpuArch.h" | 7 | #include "CpuArch.h" |
| 8 | 8 | ||
| 9 | #define kCrcPoly 0xEDB88320 | 9 | // for debug: |
| 10 | // #define __ARM_FEATURE_CRC32 1 | ||
| 10 | 11 | ||
| 11 | #ifdef MY_CPU_LE | 12 | #ifdef __ARM_FEATURE_CRC32 |
| 12 | #define CRC_NUM_TABLES 8 | 13 | // #pragma message("__ARM_FEATURE_CRC32") |
| 13 | #else | 14 | #define Z7_CRC_HW_FORCE |
| 14 | #define CRC_NUM_TABLES 9 | 15 | #endif |
| 15 | 16 | ||
| 16 | UInt32 Z7_FASTCALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table); | 17 | // #define Z7_CRC_DEBUG_BE |
| 17 | UInt32 Z7_FASTCALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table); | 18 | #ifdef Z7_CRC_DEBUG_BE |
| 19 | #undef MY_CPU_LE | ||
| 20 | #define MY_CPU_BE | ||
| 18 | #endif | 21 | #endif |
| 19 | 22 | ||
| 20 | #ifndef MY_CPU_BE | 23 | #ifdef Z7_CRC_HW_FORCE |
| 21 | UInt32 Z7_FASTCALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table); | 24 | #define Z7_CRC_NUM_TABLES_USE 1 |
| 22 | UInt32 Z7_FASTCALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table); | 25 | #else |
| 26 | #ifdef Z7_CRC_NUM_TABLES | ||
| 27 | #define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES | ||
| 28 | #else | ||
| 29 | #define Z7_CRC_NUM_TABLES_USE 12 | ||
| 30 | #endif | ||
| 23 | #endif | 31 | #endif |
| 24 | 32 | ||
| 25 | /* | 33 | #if Z7_CRC_NUM_TABLES_USE < 1 |
| 26 | extern | 34 | #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES |
| 27 | CRC_FUNC g_CrcUpdateT4; | 35 | #endif |
| 28 | CRC_FUNC g_CrcUpdateT4; | ||
| 29 | */ | ||
| 30 | extern | ||
| 31 | CRC_FUNC g_CrcUpdateT8; | ||
| 32 | CRC_FUNC g_CrcUpdateT8; | ||
| 33 | extern | ||
| 34 | CRC_FUNC g_CrcUpdateT0_32; | ||
| 35 | CRC_FUNC g_CrcUpdateT0_32; | ||
| 36 | extern | ||
| 37 | CRC_FUNC g_CrcUpdateT0_64; | ||
| 38 | CRC_FUNC g_CrcUpdateT0_64; | ||
| 39 | extern | ||
| 40 | CRC_FUNC g_CrcUpdate; | ||
| 41 | CRC_FUNC g_CrcUpdate; | ||
| 42 | |||
| 43 | UInt32 g_CrcTable[256 * CRC_NUM_TABLES]; | ||
| 44 | |||
| 45 | UInt32 Z7_FASTCALL CrcUpdate(UInt32 v, const void *data, size_t size) | ||
| 46 | { | ||
| 47 | return g_CrcUpdate(v, data, size, g_CrcTable); | ||
| 48 | } | ||
| 49 | 36 | ||
| 50 | UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size) | 37 | #if defined(MY_CPU_LE) || (Z7_CRC_NUM_TABLES_USE == 1) |
| 51 | { | 38 | #define Z7_CRC_NUM_TABLES_TOTAL Z7_CRC_NUM_TABLES_USE |
| 52 | return g_CrcUpdate(CRC_INIT_VAL, data, size, g_CrcTable) ^ CRC_INIT_VAL; | 39 | #else |
| 53 | } | 40 | #define Z7_CRC_NUM_TABLES_TOTAL (Z7_CRC_NUM_TABLES_USE + 1) |
| 41 | #endif | ||
| 54 | 42 | ||
| 55 | #if CRC_NUM_TABLES < 4 \ | 43 | #ifndef Z7_CRC_HW_FORCE |
| 56 | || (CRC_NUM_TABLES == 4 && defined(MY_CPU_BE)) \ | 44 | |
| 45 | #if Z7_CRC_NUM_TABLES_USE == 1 \ | ||
| 57 | || (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) | 46 | || (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) |
| 58 | #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) | 47 | #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) |
| 59 | UInt32 Z7_FASTCALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table); | 48 | #define Z7_CRC_UPDATE_T1_FUNC_NAME CrcUpdateGT1 |
| 60 | UInt32 Z7_FASTCALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table) | 49 | static UInt32 Z7_FASTCALL Z7_CRC_UPDATE_T1_FUNC_NAME(UInt32 v, const void *data, size_t size) |
| 61 | { | 50 | { |
| 51 | const UInt32 *table = g_CrcTable; | ||
| 62 | const Byte *p = (const Byte *)data; | 52 | const Byte *p = (const Byte *)data; |
| 63 | const Byte *pEnd = p + size; | 53 | const Byte *lim = p + size; |
| 64 | for (; p != pEnd; p++) | 54 | for (; p != lim; p++) |
| 65 | v = CRC_UPDATE_BYTE_2(v, *p); | 55 | v = CRC_UPDATE_BYTE_2(v, *p); |
| 66 | return v; | 56 | return v; |
| 67 | } | 57 | } |
| 68 | #endif | 58 | #endif |
| 69 | 59 | ||
| 60 | |||
| 61 | #if Z7_CRC_NUM_TABLES_USE != 1 | ||
| 62 | #ifndef MY_CPU_BE | ||
| 63 | #define FUNC_NAME_LE_2(s) CrcUpdateT ## s | ||
| 64 | #define FUNC_NAME_LE_1(s) FUNC_NAME_LE_2(s) | ||
| 65 | #define FUNC_NAME_LE FUNC_NAME_LE_1(Z7_CRC_NUM_TABLES_USE) | ||
| 66 | UInt32 Z7_FASTCALL FUNC_NAME_LE (UInt32 v, const void *data, size_t size, const UInt32 *table); | ||
| 67 | #endif | ||
| 68 | #ifndef MY_CPU_LE | ||
| 69 | #define FUNC_NAME_BE_2(s) CrcUpdateT1_BeT ## s | ||
| 70 | #define FUNC_NAME_BE_1(s) FUNC_NAME_BE_2(s) | ||
| 71 | #define FUNC_NAME_BE FUNC_NAME_BE_1(Z7_CRC_NUM_TABLES_USE) | ||
| 72 | UInt32 Z7_FASTCALL FUNC_NAME_BE (UInt32 v, const void *data, size_t size, const UInt32 *table); | ||
| 73 | #endif | ||
| 74 | #endif | ||
| 75 | |||
| 76 | #endif // Z7_CRC_HW_FORCE | ||
| 77 | |||
| 70 | /* ---------- hardware CRC ---------- */ | 78 | /* ---------- hardware CRC ---------- */ |
| 71 | 79 | ||
| 72 | #ifdef MY_CPU_LE | 80 | #ifdef MY_CPU_LE |
| 73 | 81 | ||
| 74 | #if defined(MY_CPU_ARM_OR_ARM64) | 82 | #if defined(MY_CPU_ARM_OR_ARM64) |
| 75 | |||
| 76 | // #pragma message("ARM*") | 83 | // #pragma message("ARM*") |
| 77 | 84 | ||
| 78 | #if defined(_MSC_VER) | 85 | #if (defined(__clang__) && (__clang_major__ >= 3)) \ |
| 79 | #if defined(MY_CPU_ARM64) | 86 | || defined(__GNUC__) && (__GNUC__ >= 6) && defined(MY_CPU_ARM64) \ |
| 80 | #if (_MSC_VER >= 1910) | 87 | || defined(__GNUC__) && (__GNUC__ >= 8) |
| 81 | #ifndef __clang__ | ||
| 82 | #define USE_ARM64_CRC | ||
| 83 | #include <intrin.h> | ||
| 84 | #endif | ||
| 85 | #endif | ||
| 86 | #endif | ||
| 87 | #elif (defined(__clang__) && (__clang_major__ >= 3)) \ | ||
| 88 | || (defined(__GNUC__) && (__GNUC__ > 4)) | ||
| 89 | #if !defined(__ARM_FEATURE_CRC32) | 88 | #if !defined(__ARM_FEATURE_CRC32) |
| 89 | // #pragma message("!defined(__ARM_FEATURE_CRC32)") | ||
| 90 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 90 | #define __ARM_FEATURE_CRC32 1 | 91 | #define __ARM_FEATURE_CRC32 1 |
| 92 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 93 | #define Z7_ARM_FEATURE_CRC32_WAS_SET | ||
| 91 | #if defined(__clang__) | 94 | #if defined(__clang__) |
| 92 | #if defined(MY_CPU_ARM64) | 95 | #if defined(MY_CPU_ARM64) |
| 93 | #define ATTRIB_CRC __attribute__((__target__("crc"))) | 96 | #define ATTRIB_CRC __attribute__((__target__("crc"))) |
| @@ -96,100 +99,120 @@ UInt32 Z7_FASTCALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UI | |||
| 96 | #endif | 99 | #endif |
| 97 | #else | 100 | #else |
| 98 | #if defined(MY_CPU_ARM64) | 101 | #if defined(MY_CPU_ARM64) |
| 102 | #if !defined(Z7_GCC_VERSION) || (Z7_GCC_VERSION >= 60000) | ||
| 99 | #define ATTRIB_CRC __attribute__((__target__("+crc"))) | 103 | #define ATTRIB_CRC __attribute__((__target__("+crc"))) |
| 104 | #endif | ||
| 100 | #else | 105 | #else |
| 106 | #if !defined(Z7_GCC_VERSION) || (__GNUC__ >= 8) | ||
| 107 | #if defined(__ARM_FP) && __GNUC__ >= 8 | ||
| 108 | // for -mfloat-abi=hard: similar to <arm_acle.h> | ||
| 109 | #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc+simd"))) | ||
| 110 | #else | ||
| 101 | #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc"))) | 111 | #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc"))) |
| 112 | #endif | ||
| 113 | #endif | ||
| 102 | #endif | 114 | #endif |
| 103 | #endif | 115 | #endif |
| 104 | #endif | 116 | #endif |
| 105 | #if defined(__ARM_FEATURE_CRC32) | 117 | #if defined(__ARM_FEATURE_CRC32) |
| 106 | #define USE_ARM64_CRC | 118 | // #pragma message("<arm_acle.h>") |
| 119 | /* | ||
| 120 | arm_acle.h (GGC): | ||
| 121 | before Nov 17, 2017: | ||
| 122 | #ifdef __ARM_FEATURE_CRC32 | ||
| 123 | |||
| 124 | Nov 17, 2017: gcc10.0 (gcc 9.2.0) checked" | ||
| 125 | #if __ARM_ARCH >= 8 | ||
| 126 | #pragma GCC target ("arch=armv8-a+crc") | ||
| 127 | |||
| 128 | Aug 22, 2019: GCC 8.4?, 9.2.1, 10.1: | ||
| 129 | #ifdef __ARM_FEATURE_CRC32 | ||
| 130 | #ifdef __ARM_FP | ||
| 131 | #pragma GCC target ("arch=armv8-a+crc+simd") | ||
| 132 | #else | ||
| 133 | #pragma GCC target ("arch=armv8-a+crc") | ||
| 134 | #endif | ||
| 135 | */ | ||
| 136 | #if defined(__ARM_ARCH) && __ARM_ARCH < 8 | ||
| 137 | #if defined(Z7_GCC_VERSION) && (__GNUC__ == 8) && (Z7_GCC_VERSION < 80400) \ | ||
| 138 | || defined(Z7_GCC_VERSION) && (__GNUC__ == 9) && (Z7_GCC_VERSION < 90201) \ | ||
| 139 | || defined(Z7_GCC_VERSION) && (__GNUC__ == 10) && (Z7_GCC_VERSION < 100100) | ||
| 140 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 141 | // #pragma message("#define __ARM_ARCH 8") | ||
| 142 | #undef __ARM_ARCH | ||
| 143 | #define __ARM_ARCH 8 | ||
| 144 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 145 | #endif | ||
| 146 | #endif | ||
| 147 | #define Z7_CRC_HW_USE | ||
| 107 | #include <arm_acle.h> | 148 | #include <arm_acle.h> |
| 108 | #endif | 149 | #endif |
| 150 | #elif defined(_MSC_VER) | ||
| 151 | #if defined(MY_CPU_ARM64) | ||
| 152 | #if (_MSC_VER >= 1910) | ||
| 153 | #ifdef __clang__ | ||
| 154 | // #define Z7_CRC_HW_USE | ||
| 155 | // #include <arm_acle.h> | ||
| 156 | #else | ||
| 157 | #define Z7_CRC_HW_USE | ||
| 158 | #include <intrin.h> | ||
| 159 | #endif | ||
| 160 | #endif | ||
| 161 | #endif | ||
| 109 | #endif | 162 | #endif |
| 110 | 163 | ||
| 111 | #else | 164 | #else // non-ARM* |
| 112 | |||
| 113 | // no hardware CRC | ||
| 114 | |||
| 115 | // #define USE_CRC_EMU | ||
| 116 | |||
| 117 | #ifdef USE_CRC_EMU | ||
| 118 | |||
| 119 | #pragma message("ARM64 CRC emulation") | ||
| 120 | |||
| 121 | Z7_FORCE_INLINE | ||
| 122 | UInt32 __crc32b(UInt32 v, UInt32 data) | ||
| 123 | { | ||
| 124 | const UInt32 *table = g_CrcTable; | ||
| 125 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); | ||
| 126 | return v; | ||
| 127 | } | ||
| 128 | 165 | ||
| 129 | Z7_FORCE_INLINE | 166 | // #define Z7_CRC_HW_USE // for debug : we can test HW-branch of code |
| 130 | UInt32 __crc32w(UInt32 v, UInt32 data) | 167 | #ifdef Z7_CRC_HW_USE |
| 131 | { | 168 | #include "7zCrcEmu.h" |
| 132 | const UInt32 *table = g_CrcTable; | 169 | #endif |
| 133 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
| 134 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
| 135 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
| 136 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
| 137 | return v; | ||
| 138 | } | ||
| 139 | 170 | ||
| 140 | Z7_FORCE_INLINE | 171 | #endif // non-ARM* |
| 141 | UInt32 __crc32d(UInt32 v, UInt64 data) | ||
| 142 | { | ||
| 143 | const UInt32 *table = g_CrcTable; | ||
| 144 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
| 145 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
| 146 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
| 147 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
| 148 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
| 149 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
| 150 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
| 151 | v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; | ||
| 152 | return v; | ||
| 153 | } | ||
| 154 | 172 | ||
| 155 | #endif // USE_CRC_EMU | ||
| 156 | 173 | ||
| 157 | #endif // defined(MY_CPU_ARM64) && defined(MY_CPU_LE) | ||
| 158 | 174 | ||
| 175 | #if defined(Z7_CRC_HW_USE) | ||
| 159 | 176 | ||
| 177 | // #pragma message("USE ARM HW CRC") | ||
| 160 | 178 | ||
| 161 | #if defined(USE_ARM64_CRC) || defined(USE_CRC_EMU) | 179 | #ifdef MY_CPU_64BIT |
| 180 | #define CRC_HW_WORD_TYPE UInt64 | ||
| 181 | #define CRC_HW_WORD_FUNC __crc32d | ||
| 182 | #else | ||
| 183 | #define CRC_HW_WORD_TYPE UInt32 | ||
| 184 | #define CRC_HW_WORD_FUNC __crc32w | ||
| 185 | #endif | ||
| 162 | 186 | ||
| 163 | #define T0_32_UNROLL_BYTES (4 * 4) | 187 | #define CRC_HW_UNROLL_BYTES (sizeof(CRC_HW_WORD_TYPE) * 4) |
| 164 | #define T0_64_UNROLL_BYTES (4 * 8) | ||
| 165 | 188 | ||
| 166 | #ifndef ATTRIB_CRC | 189 | #ifdef ATTRIB_CRC |
| 167 | #define ATTRIB_CRC | 190 | ATTRIB_CRC |
| 168 | #endif | 191 | #endif |
| 169 | // #pragma message("USE ARM HW CRC") | 192 | Z7_NO_INLINE |
| 170 | 193 | #ifdef Z7_CRC_HW_FORCE | |
| 171 | ATTRIB_CRC | 194 | UInt32 Z7_FASTCALL CrcUpdate |
| 172 | UInt32 Z7_FASTCALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table); | 195 | #else |
| 173 | ATTRIB_CRC | 196 | static UInt32 Z7_FASTCALL CrcUpdate_HW |
| 174 | UInt32 Z7_FASTCALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table) | 197 | #endif |
| 198 | (UInt32 v, const void *data, size_t size) | ||
| 175 | { | 199 | { |
| 176 | const Byte *p = (const Byte *)data; | 200 | const Byte *p = (const Byte *)data; |
| 177 | UNUSED_VAR(table); | 201 | for (; size != 0 && ((unsigned)(ptrdiff_t)p & (CRC_HW_UNROLL_BYTES - 1)) != 0; size--) |
| 178 | |||
| 179 | for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_32_UNROLL_BYTES - 1)) != 0; size--) | ||
| 180 | v = __crc32b(v, *p++); | 202 | v = __crc32b(v, *p++); |
| 181 | 203 | if (size >= CRC_HW_UNROLL_BYTES) | |
| 182 | if (size >= T0_32_UNROLL_BYTES) | ||
| 183 | { | 204 | { |
| 184 | const Byte *lim = p + size; | 205 | const Byte *lim = p + size; |
| 185 | size &= (T0_32_UNROLL_BYTES - 1); | 206 | size &= CRC_HW_UNROLL_BYTES - 1; |
| 186 | lim -= size; | 207 | lim -= size; |
| 187 | do | 208 | do |
| 188 | { | 209 | { |
| 189 | v = __crc32w(v, *(const UInt32 *)(const void *)(p)); | 210 | v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p)); |
| 190 | v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4; | 211 | v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE))); |
| 191 | v = __crc32w(v, *(const UInt32 *)(const void *)(p)); | 212 | p += 2 * sizeof(CRC_HW_WORD_TYPE); |
| 192 | v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4; | 213 | v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p)); |
| 214 | v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE))); | ||
| 215 | p += 2 * sizeof(CRC_HW_WORD_TYPE); | ||
| 193 | } | 216 | } |
| 194 | while (p != lim); | 217 | while (p != lim); |
| 195 | } | 218 | } |
| @@ -200,46 +223,86 @@ UInt32 Z7_FASTCALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const | |||
| 200 | return v; | 223 | return v; |
| 201 | } | 224 | } |
| 202 | 225 | ||
| 203 | ATTRIB_CRC | 226 | #ifdef Z7_ARM_FEATURE_CRC32_WAS_SET |
| 204 | UInt32 Z7_FASTCALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table); | 227 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER |
| 205 | ATTRIB_CRC | 228 | #undef __ARM_FEATURE_CRC32 |
| 206 | UInt32 Z7_FASTCALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table) | 229 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER |
| 230 | #undef Z7_ARM_FEATURE_CRC32_WAS_SET | ||
| 231 | #endif | ||
| 232 | |||
| 233 | #endif // defined(Z7_CRC_HW_USE) | ||
| 234 | #endif // MY_CPU_LE | ||
| 235 | |||
| 236 | |||
| 237 | |||
| 238 | #ifndef Z7_CRC_HW_FORCE | ||
| 239 | |||
| 240 | #if defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) | ||
| 241 | /* | ||
| 242 | typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_WITH_TABLE_FUNC) | ||
| 243 | (UInt32 v, const void *data, size_t size, const UInt32 *table); | ||
| 244 | Z7_CRC_UPDATE_WITH_TABLE_FUNC g_CrcUpdate; | ||
| 245 | */ | ||
| 246 | static unsigned g_Crc_Algo; | ||
| 247 | #if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) | ||
| 248 | static unsigned g_Crc_Be; | ||
| 249 | #endif | ||
| 250 | #endif // defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) | ||
| 251 | |||
| 252 | |||
| 253 | |||
| 254 | Z7_NO_INLINE | ||
| 255 | #ifdef Z7_CRC_HW_USE | ||
| 256 | static UInt32 Z7_FASTCALL CrcUpdate_Base | ||
| 257 | #else | ||
| 258 | UInt32 Z7_FASTCALL CrcUpdate | ||
| 259 | #endif | ||
| 260 | (UInt32 crc, const void *data, size_t size) | ||
| 207 | { | 261 | { |
| 208 | const Byte *p = (const Byte *)data; | 262 | #if Z7_CRC_NUM_TABLES_USE == 1 |
| 209 | UNUSED_VAR(table); | 263 | return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size); |
| 264 | #else // Z7_CRC_NUM_TABLES_USE != 1 | ||
| 265 | #ifdef Z7_CRC_UPDATE_T1_FUNC_NAME | ||
| 266 | if (g_Crc_Algo == 1) | ||
| 267 | return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size); | ||
| 268 | #endif | ||
| 210 | 269 | ||
| 211 | for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_64_UNROLL_BYTES - 1)) != 0; size--) | 270 | #ifdef MY_CPU_LE |
| 212 | v = __crc32b(v, *p++); | 271 | return FUNC_NAME_LE(crc, data, size, g_CrcTable); |
| 272 | #elif defined(MY_CPU_BE) | ||
| 273 | return FUNC_NAME_BE(crc, data, size, g_CrcTable); | ||
| 274 | #else | ||
| 275 | if (g_Crc_Be) | ||
| 276 | return FUNC_NAME_BE(crc, data, size, g_CrcTable); | ||
| 277 | else | ||
| 278 | return FUNC_NAME_LE(crc, data, size, g_CrcTable); | ||
| 279 | #endif | ||
| 280 | #endif // Z7_CRC_NUM_TABLES_USE != 1 | ||
| 281 | } | ||
| 213 | 282 | ||
| 214 | if (size >= T0_64_UNROLL_BYTES) | ||
| 215 | { | ||
| 216 | const Byte *lim = p + size; | ||
| 217 | size &= (T0_64_UNROLL_BYTES - 1); | ||
| 218 | lim -= size; | ||
| 219 | do | ||
| 220 | { | ||
| 221 | v = __crc32d(v, *(const UInt64 *)(const void *)(p)); | ||
| 222 | v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8; | ||
| 223 | v = __crc32d(v, *(const UInt64 *)(const void *)(p)); | ||
| 224 | v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8; | ||
| 225 | } | ||
| 226 | while (p != lim); | ||
| 227 | } | ||
| 228 | |||
| 229 | for (; size != 0; size--) | ||
| 230 | v = __crc32b(v, *p++); | ||
| 231 | 283 | ||
| 232 | return v; | 284 | #ifdef Z7_CRC_HW_USE |
| 285 | Z7_NO_INLINE | ||
| 286 | UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size) | ||
| 287 | { | ||
| 288 | if (g_Crc_Algo == 0) | ||
| 289 | return CrcUpdate_HW(crc, data, size); | ||
| 290 | return CrcUpdate_Base(crc, data, size); | ||
| 233 | } | 291 | } |
| 292 | #endif | ||
| 234 | 293 | ||
| 235 | #undef T0_32_UNROLL_BYTES | 294 | #endif // !defined(Z7_CRC_HW_FORCE) |
| 236 | #undef T0_64_UNROLL_BYTES | ||
| 237 | 295 | ||
| 238 | #endif // defined(USE_ARM64_CRC) || defined(USE_CRC_EMU) | ||
| 239 | 296 | ||
| 240 | #endif // MY_CPU_LE | 297 | |
| 298 | UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size) | ||
| 299 | { | ||
| 300 | return CrcUpdate(CRC_INIT_VAL, data, size) ^ CRC_INIT_VAL; | ||
| 301 | } | ||
| 241 | 302 | ||
| 242 | 303 | ||
| 304 | MY_ALIGN(64) | ||
| 305 | UInt32 g_CrcTable[256 * Z7_CRC_NUM_TABLES_TOTAL]; | ||
| 243 | 306 | ||
| 244 | 307 | ||
| 245 | void Z7_FASTCALL CrcGenerateTable(void) | 308 | void Z7_FASTCALL CrcGenerateTable(void) |
| @@ -247,94 +310,111 @@ void Z7_FASTCALL CrcGenerateTable(void) | |||
| 247 | UInt32 i; | 310 | UInt32 i; |
| 248 | for (i = 0; i < 256; i++) | 311 | for (i = 0; i < 256; i++) |
| 249 | { | 312 | { |
| 313 | #if defined(Z7_CRC_HW_FORCE) | ||
| 314 | g_CrcTable[i] = __crc32b(i, 0); | ||
| 315 | #else | ||
| 316 | #define kCrcPoly 0xEDB88320 | ||
| 250 | UInt32 r = i; | 317 | UInt32 r = i; |
| 251 | unsigned j; | 318 | unsigned j; |
| 252 | for (j = 0; j < 8; j++) | 319 | for (j = 0; j < 8; j++) |
| 253 | r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1))); | 320 | r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1))); |
| 254 | g_CrcTable[i] = r; | 321 | g_CrcTable[i] = r; |
| 322 | #endif | ||
| 255 | } | 323 | } |
| 256 | for (i = 256; i < 256 * CRC_NUM_TABLES; i++) | 324 | for (i = 256; i < 256 * Z7_CRC_NUM_TABLES_USE; i++) |
| 257 | { | 325 | { |
| 258 | const UInt32 r = g_CrcTable[(size_t)i - 256]; | 326 | const UInt32 r = g_CrcTable[(size_t)i - 256]; |
| 259 | g_CrcTable[i] = g_CrcTable[r & 0xFF] ^ (r >> 8); | 327 | g_CrcTable[i] = g_CrcTable[r & 0xFF] ^ (r >> 8); |
| 260 | } | 328 | } |
| 261 | 329 | ||
| 262 | #if CRC_NUM_TABLES < 4 | 330 | #if !defined(Z7_CRC_HW_FORCE) && \ |
| 263 | g_CrcUpdate = CrcUpdateT1; | 331 | (defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) || defined(MY_CPU_BE)) |
| 264 | #elif defined(MY_CPU_LE) | 332 | |
| 265 | // g_CrcUpdateT4 = CrcUpdateT4; | 333 | #if Z7_CRC_NUM_TABLES_USE <= 1 |
| 266 | #if CRC_NUM_TABLES < 8 | 334 | g_Crc_Algo = 1; |
| 267 | g_CrcUpdate = CrcUpdateT4; | 335 | #else // Z7_CRC_NUM_TABLES_USE <= 1 |
| 268 | #else // CRC_NUM_TABLES >= 8 | 336 | |
| 269 | g_CrcUpdateT8 = CrcUpdateT8; | 337 | #if defined(MY_CPU_LE) |
| 270 | /* | 338 | g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; |
| 271 | #ifdef MY_CPU_X86_OR_AMD64 | 339 | #else // !defined(MY_CPU_LE) |
| 272 | if (!CPU_Is_InOrder()) | ||
| 273 | #endif | ||
| 274 | */ | ||
| 275 | g_CrcUpdate = CrcUpdateT8; | ||
| 276 | #endif | ||
| 277 | #else | ||
| 278 | { | 340 | { |
| 279 | #ifndef MY_CPU_BE | 341 | #ifndef MY_CPU_BE |
| 280 | UInt32 k = 0x01020304; | 342 | UInt32 k = 0x01020304; |
| 281 | const Byte *p = (const Byte *)&k; | 343 | const Byte *p = (const Byte *)&k; |
| 282 | if (p[0] == 4 && p[1] == 3) | 344 | if (p[0] == 4 && p[1] == 3) |
| 283 | { | 345 | g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; |
| 284 | #if CRC_NUM_TABLES < 8 | ||
| 285 | // g_CrcUpdateT4 = CrcUpdateT4; | ||
| 286 | g_CrcUpdate = CrcUpdateT4; | ||
| 287 | #else // CRC_NUM_TABLES >= 8 | ||
| 288 | g_CrcUpdateT8 = CrcUpdateT8; | ||
| 289 | g_CrcUpdate = CrcUpdateT8; | ||
| 290 | #endif | ||
| 291 | } | ||
| 292 | else if (p[0] != 1 || p[1] != 2) | 346 | else if (p[0] != 1 || p[1] != 2) |
| 293 | g_CrcUpdate = CrcUpdateT1; | 347 | g_Crc_Algo = 1; |
| 294 | else | 348 | else |
| 295 | #endif // MY_CPU_BE | 349 | #endif // MY_CPU_BE |
| 296 | { | 350 | { |
| 297 | for (i = 256 * CRC_NUM_TABLES - 1; i >= 256; i--) | 351 | for (i = 256 * Z7_CRC_NUM_TABLES_TOTAL - 1; i >= 256; i--) |
| 298 | { | 352 | { |
| 299 | const UInt32 x = g_CrcTable[(size_t)i - 256]; | 353 | const UInt32 x = g_CrcTable[(size_t)i - 256]; |
| 300 | g_CrcTable[i] = Z7_BSWAP32(x); | 354 | g_CrcTable[i] = Z7_BSWAP32(x); |
| 301 | } | 355 | } |
| 302 | #if CRC_NUM_TABLES <= 4 | 356 | #if defined(Z7_CRC_UPDATE_T1_FUNC_NAME) |
| 303 | g_CrcUpdate = CrcUpdateT1; | 357 | g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; |
| 304 | #elif CRC_NUM_TABLES <= 8 | 358 | #endif |
| 305 | // g_CrcUpdateT4 = CrcUpdateT1_BeT4; | 359 | #if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) |
| 306 | g_CrcUpdate = CrcUpdateT1_BeT4; | 360 | g_Crc_Be = 1; |
| 307 | #else // CRC_NUM_TABLES > 8 | 361 | #endif |
| 308 | g_CrcUpdateT8 = CrcUpdateT1_BeT8; | ||
| 309 | g_CrcUpdate = CrcUpdateT1_BeT8; | ||
| 310 | #endif | ||
| 311 | } | 362 | } |
| 312 | } | 363 | } |
| 313 | #endif // CRC_NUM_TABLES < 4 | 364 | #endif // !defined(MY_CPU_LE) |
| 314 | 365 | ||
| 315 | #ifdef MY_CPU_LE | 366 | #ifdef MY_CPU_LE |
| 316 | #ifdef USE_ARM64_CRC | 367 | #ifdef Z7_CRC_HW_USE |
| 317 | if (CPU_IsSupported_CRC32()) | 368 | if (CPU_IsSupported_CRC32()) |
| 318 | { | 369 | g_Crc_Algo = 0; |
| 319 | g_CrcUpdateT0_32 = CrcUpdateT0_32; | 370 | #endif // Z7_CRC_HW_USE |
| 320 | g_CrcUpdateT0_64 = CrcUpdateT0_64; | 371 | #endif // MY_CPU_LE |
| 321 | g_CrcUpdate = | 372 | |
| 322 | #if defined(MY_CPU_ARM) | 373 | #endif // Z7_CRC_NUM_TABLES_USE <= 1 |
| 323 | CrcUpdateT0_32; | 374 | #endif // g_Crc_Algo was declared |
| 324 | #else | 375 | } |
| 325 | CrcUpdateT0_64; | 376 | |
| 326 | #endif | 377 | Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo) |
| 327 | } | 378 | { |
| 328 | #endif | 379 | if (algo == 0) |
| 329 | 380 | return &CrcUpdate; | |
| 330 | #ifdef USE_CRC_EMU | 381 | |
| 331 | g_CrcUpdateT0_32 = CrcUpdateT0_32; | 382 | #if defined(Z7_CRC_HW_USE) |
| 332 | g_CrcUpdateT0_64 = CrcUpdateT0_64; | 383 | if (algo == sizeof(CRC_HW_WORD_TYPE) * 8) |
| 333 | g_CrcUpdate = CrcUpdateT0_64; | 384 | { |
| 334 | #endif | 385 | #ifdef Z7_CRC_HW_FORCE |
| 386 | return &CrcUpdate; | ||
| 387 | #else | ||
| 388 | if (g_Crc_Algo == 0) | ||
| 389 | return &CrcUpdate_HW; | ||
| 390 | #endif | ||
| 391 | } | ||
| 392 | #endif | ||
| 393 | |||
| 394 | #ifndef Z7_CRC_HW_FORCE | ||
| 395 | if (algo == Z7_CRC_NUM_TABLES_USE) | ||
| 396 | return | ||
| 397 | #ifdef Z7_CRC_HW_USE | ||
| 398 | &CrcUpdate_Base; | ||
| 399 | #else | ||
| 400 | &CrcUpdate; | ||
| 335 | #endif | 401 | #endif |
| 402 | #endif | ||
| 403 | |||
| 404 | return NULL; | ||
| 336 | } | 405 | } |
| 337 | 406 | ||
| 338 | #undef kCrcPoly | 407 | #undef kCrcPoly |
| 339 | #undef CRC64_NUM_TABLES | 408 | #undef Z7_CRC_NUM_TABLES_USE |
| 409 | #undef Z7_CRC_NUM_TABLES_TOTAL | ||
| 340 | #undef CRC_UPDATE_BYTE_2 | 410 | #undef CRC_UPDATE_BYTE_2 |
| 411 | #undef FUNC_NAME_LE_2 | ||
| 412 | #undef FUNC_NAME_LE_1 | ||
| 413 | #undef FUNC_NAME_LE | ||
| 414 | #undef FUNC_NAME_BE_2 | ||
| 415 | #undef FUNC_NAME_BE_1 | ||
| 416 | #undef FUNC_NAME_BE | ||
| 417 | |||
| 418 | #undef CRC_HW_UNROLL_BYTES | ||
| 419 | #undef CRC_HW_WORD_FUNC | ||
| 420 | #undef CRC_HW_WORD_TYPE | ||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* 7zCrc.h -- CRC32 calculation | 1 | /* 7zCrc.h -- CRC32 calculation |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-22 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_7Z_CRC_H | 4 | #ifndef ZIP7_INC_7Z_CRC_H |
| 5 | #define ZIP7_INC_7Z_CRC_H | 5 | #define ZIP7_INC_7Z_CRC_H |
| @@ -20,7 +20,8 @@ void Z7_FASTCALL CrcGenerateTable(void); | |||
| 20 | UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size); | 20 | UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size); |
| 21 | UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size); | 21 | UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size); |
| 22 | 22 | ||
| 23 | typedef UInt32 (Z7_FASTCALL *CRC_FUNC)(UInt32 v, const void *data, size_t size, const UInt32 *table); | 23 | typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_FUNC)(UInt32 v, const void *data, size_t size); |
| 24 | Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo); | ||
| 24 | 25 | ||
| 25 | EXTERN_C_END | 26 | EXTERN_C_END |
| 26 | 27 | ||
diff --git a/C/7zCrcOpt.c b/C/7zCrcOpt.c index 9c64929..9408017 100644 --- a/C/7zCrcOpt.c +++ b/C/7zCrcOpt.c | |||
| @@ -1,117 +1,199 @@ | |||
| 1 | /* 7zCrcOpt.c -- CRC32 calculation | 1 | /* 7zCrcOpt.c -- CRC32 calculation (optimized functions) |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2023-12-07 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| 6 | #include "CpuArch.h" | 6 | #include "CpuArch.h" |
| 7 | 7 | ||
| 8 | #if !defined(Z7_CRC_NUM_TABLES) || Z7_CRC_NUM_TABLES > 1 | ||
| 9 | |||
| 10 | // for debug only : define Z7_CRC_DEBUG_BE to test big-endian code in little-endian cpu | ||
| 11 | // #define Z7_CRC_DEBUG_BE | ||
| 12 | #ifdef Z7_CRC_DEBUG_BE | ||
| 13 | #undef MY_CPU_LE | ||
| 14 | #define MY_CPU_BE | ||
| 15 | #endif | ||
| 16 | |||
| 17 | // the value Z7_CRC_NUM_TABLES_USE must be defined to same value as in 7zCrc.c | ||
| 18 | #ifdef Z7_CRC_NUM_TABLES | ||
| 19 | #define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES | ||
| 20 | #else | ||
| 21 | #define Z7_CRC_NUM_TABLES_USE 12 | ||
| 22 | #endif | ||
| 23 | |||
| 24 | #if Z7_CRC_NUM_TABLES_USE % 4 || \ | ||
| 25 | Z7_CRC_NUM_TABLES_USE < 4 * 1 || \ | ||
| 26 | Z7_CRC_NUM_TABLES_USE > 4 * 6 | ||
| 27 | #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES | ||
| 28 | #endif | ||
| 29 | |||
| 30 | |||
| 8 | #ifndef MY_CPU_BE | 31 | #ifndef MY_CPU_BE |
| 9 | 32 | ||
| 10 | #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) | 33 | #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) |
| 11 | 34 | ||
| 12 | UInt32 Z7_FASTCALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table); | 35 | #define Q(n, d) \ |
| 13 | UInt32 Z7_FASTCALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table) | 36 | ( (table + ((n) * 4 + 3) * 0x100)[(Byte)(d)] \ |
| 14 | { | 37 | ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \ |
| 15 | const Byte *p = (const Byte *)data; | 38 | ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \ |
| 16 | for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) | 39 | ^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] ) |
| 17 | v = CRC_UPDATE_BYTE_2(v, *p); | 40 | |
| 18 | for (; size >= 4; size -= 4, p += 4) | 41 | #define R(a) *((const UInt32 *)(const void *)p + (a)) |
| 19 | { | 42 | |
| 20 | v ^= *(const UInt32 *)(const void *)p; | 43 | #define CRC_FUNC_PRE_LE2(step) \ |
| 21 | v = | 44 | UInt32 Z7_FASTCALL CrcUpdateT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table) |
| 22 | (table + 0x300)[((v ) & 0xFF)] | ||
| 23 | ^ (table + 0x200)[((v >> 8) & 0xFF)] | ||
| 24 | ^ (table + 0x100)[((v >> 16) & 0xFF)] | ||
| 25 | ^ (table + 0x000)[((v >> 24))]; | ||
| 26 | } | ||
| 27 | for (; size > 0; size--, p++) | ||
| 28 | v = CRC_UPDATE_BYTE_2(v, *p); | ||
| 29 | return v; | ||
| 30 | } | ||
| 31 | 45 | ||
| 32 | UInt32 Z7_FASTCALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table); | 46 | #define CRC_FUNC_PRE_LE(step) \ |
| 33 | UInt32 Z7_FASTCALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table) | 47 | CRC_FUNC_PRE_LE2(step); \ |
| 48 | CRC_FUNC_PRE_LE2(step) | ||
| 49 | |||
| 50 | CRC_FUNC_PRE_LE(Z7_CRC_NUM_TABLES_USE) | ||
| 34 | { | 51 | { |
| 35 | const Byte *p = (const Byte *)data; | 52 | const Byte *p = (const Byte *)data; |
| 36 | for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++) | 53 | const Byte *lim; |
| 54 | for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++) | ||
| 37 | v = CRC_UPDATE_BYTE_2(v, *p); | 55 | v = CRC_UPDATE_BYTE_2(v, *p); |
| 38 | for (; size >= 8; size -= 8, p += 8) | 56 | lim = p + size; |
| 57 | if (size >= Z7_CRC_NUM_TABLES_USE) | ||
| 39 | { | 58 | { |
| 40 | UInt32 d; | 59 | lim -= Z7_CRC_NUM_TABLES_USE; |
| 41 | v ^= *(const UInt32 *)(const void *)p; | 60 | do |
| 42 | v = | 61 | { |
| 43 | (table + 0x700)[((v ) & 0xFF)] | 62 | v ^= R(0); |
| 44 | ^ (table + 0x600)[((v >> 8) & 0xFF)] | 63 | { |
| 45 | ^ (table + 0x500)[((v >> 16) & 0xFF)] | 64 | #if Z7_CRC_NUM_TABLES_USE == 1 * 4 |
| 46 | ^ (table + 0x400)[((v >> 24))]; | 65 | v = Q(0, v); |
| 47 | d = *((const UInt32 *)(const void *)p + 1); | 66 | #else |
| 48 | v ^= | 67 | #define U2(r, op) \ |
| 49 | (table + 0x300)[((d ) & 0xFF)] | 68 | { d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); } |
| 50 | ^ (table + 0x200)[((d >> 8) & 0xFF)] | 69 | UInt32 d, x; |
| 51 | ^ (table + 0x100)[((d >> 16) & 0xFF)] | 70 | U2(1, =) |
| 52 | ^ (table + 0x000)[((d >> 24))]; | 71 | #if Z7_CRC_NUM_TABLES_USE >= 3 * 4 |
| 72 | #define U(r) U2(r, ^=) | ||
| 73 | U(2) | ||
| 74 | #if Z7_CRC_NUM_TABLES_USE >= 4 * 4 | ||
| 75 | U(3) | ||
| 76 | #if Z7_CRC_NUM_TABLES_USE >= 5 * 4 | ||
| 77 | U(4) | ||
| 78 | #if Z7_CRC_NUM_TABLES_USE >= 6 * 4 | ||
| 79 | U(5) | ||
| 80 | #if Z7_CRC_NUM_TABLES_USE >= 7 * 4 | ||
| 81 | #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES | ||
| 82 | #endif | ||
| 83 | #endif | ||
| 84 | #endif | ||
| 85 | #endif | ||
| 86 | #endif | ||
| 87 | #undef U | ||
| 88 | #undef U2 | ||
| 89 | v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v); | ||
| 90 | #endif | ||
| 91 | } | ||
| 92 | p += Z7_CRC_NUM_TABLES_USE; | ||
| 93 | } | ||
| 94 | while (p <= lim); | ||
| 95 | lim += Z7_CRC_NUM_TABLES_USE; | ||
| 53 | } | 96 | } |
| 54 | for (; size > 0; size--, p++) | 97 | for (; p < lim; p++) |
| 55 | v = CRC_UPDATE_BYTE_2(v, *p); | 98 | v = CRC_UPDATE_BYTE_2(v, *p); |
| 56 | return v; | 99 | return v; |
| 57 | } | 100 | } |
| 58 | 101 | ||
| 102 | #undef CRC_UPDATE_BYTE_2 | ||
| 103 | #undef R | ||
| 104 | #undef Q | ||
| 105 | #undef CRC_FUNC_PRE_LE | ||
| 106 | #undef CRC_FUNC_PRE_LE2 | ||
| 107 | |||
| 59 | #endif | 108 | #endif |
| 60 | 109 | ||
| 61 | 110 | ||
| 111 | |||
| 112 | |||
| 62 | #ifndef MY_CPU_LE | 113 | #ifndef MY_CPU_LE |
| 63 | 114 | ||
| 64 | #define CRC_UINT32_SWAP(v) Z7_BSWAP32(v) | 115 | #define CRC_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 24) ^ (b)] ^ ((crc) << 8)) |
| 65 | 116 | ||
| 66 | #define CRC_UPDATE_BYTE_2_BE(crc, b) (table[(((crc) >> 24) ^ (b))] ^ ((crc) << 8)) | 117 | #define Q(n, d) \ |
| 118 | ( (table + ((n) * 4 + 0) * 0x100)[((d)) & 0xFF] \ | ||
| 119 | ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ | ||
| 120 | ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ | ||
| 121 | ^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] ) | ||
| 67 | 122 | ||
| 68 | UInt32 Z7_FASTCALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table) | 123 | #ifdef Z7_CRC_DEBUG_BE |
| 69 | { | 124 | #define R(a) GetBe32a((const UInt32 *)(const void *)p + (a)) |
| 70 | const Byte *p = (const Byte *)data; | 125 | #else |
| 71 | table += 0x100; | 126 | #define R(a) *((const UInt32 *)(const void *)p + (a)) |
| 72 | v = CRC_UINT32_SWAP(v); | 127 | #endif |
| 73 | for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) | 128 | |
| 74 | v = CRC_UPDATE_BYTE_2_BE(v, *p); | 129 | |
| 75 | for (; size >= 4; size -= 4, p += 4) | 130 | #define CRC_FUNC_PRE_BE2(step) \ |
| 76 | { | 131 | UInt32 Z7_FASTCALL CrcUpdateT1_BeT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table) |
| 77 | v ^= *(const UInt32 *)(const void *)p; | ||
| 78 | v = | ||
| 79 | (table + 0x000)[((v ) & 0xFF)] | ||
| 80 | ^ (table + 0x100)[((v >> 8) & 0xFF)] | ||
| 81 | ^ (table + 0x200)[((v >> 16) & 0xFF)] | ||
| 82 | ^ (table + 0x300)[((v >> 24))]; | ||
| 83 | } | ||
| 84 | for (; size > 0; size--, p++) | ||
| 85 | v = CRC_UPDATE_BYTE_2_BE(v, *p); | ||
| 86 | return CRC_UINT32_SWAP(v); | ||
| 87 | } | ||
| 88 | 132 | ||
| 89 | UInt32 Z7_FASTCALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table) | 133 | #define CRC_FUNC_PRE_BE(step) \ |
| 134 | CRC_FUNC_PRE_BE2(step); \ | ||
| 135 | CRC_FUNC_PRE_BE2(step) | ||
| 136 | |||
| 137 | CRC_FUNC_PRE_BE(Z7_CRC_NUM_TABLES_USE) | ||
| 90 | { | 138 | { |
| 91 | const Byte *p = (const Byte *)data; | 139 | const Byte *p = (const Byte *)data; |
| 140 | const Byte *lim; | ||
| 92 | table += 0x100; | 141 | table += 0x100; |
| 93 | v = CRC_UINT32_SWAP(v); | 142 | v = Z7_BSWAP32(v); |
| 94 | for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++) | 143 | for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++) |
| 95 | v = CRC_UPDATE_BYTE_2_BE(v, *p); | 144 | v = CRC_UPDATE_BYTE_2_BE(v, *p); |
| 96 | for (; size >= 8; size -= 8, p += 8) | 145 | lim = p + size; |
| 146 | if (size >= Z7_CRC_NUM_TABLES_USE) | ||
| 97 | { | 147 | { |
| 98 | UInt32 d; | 148 | lim -= Z7_CRC_NUM_TABLES_USE; |
| 99 | v ^= *(const UInt32 *)(const void *)p; | 149 | do |
| 100 | v = | 150 | { |
| 101 | (table + 0x400)[((v ) & 0xFF)] | 151 | v ^= R(0); |
| 102 | ^ (table + 0x500)[((v >> 8) & 0xFF)] | 152 | { |
| 103 | ^ (table + 0x600)[((v >> 16) & 0xFF)] | 153 | #if Z7_CRC_NUM_TABLES_USE == 1 * 4 |
| 104 | ^ (table + 0x700)[((v >> 24))]; | 154 | v = Q(0, v); |
| 105 | d = *((const UInt32 *)(const void *)p + 1); | 155 | #else |
| 106 | v ^= | 156 | #define U2(r, op) \ |
| 107 | (table + 0x000)[((d ) & 0xFF)] | 157 | { d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); } |
| 108 | ^ (table + 0x100)[((d >> 8) & 0xFF)] | 158 | UInt32 d, x; |
| 109 | ^ (table + 0x200)[((d >> 16) & 0xFF)] | 159 | U2(1, =) |
| 110 | ^ (table + 0x300)[((d >> 24))]; | 160 | #if Z7_CRC_NUM_TABLES_USE >= 3 * 4 |
| 161 | #define U(r) U2(r, ^=) | ||
| 162 | U(2) | ||
| 163 | #if Z7_CRC_NUM_TABLES_USE >= 4 * 4 | ||
| 164 | U(3) | ||
| 165 | #if Z7_CRC_NUM_TABLES_USE >= 5 * 4 | ||
| 166 | U(4) | ||
| 167 | #if Z7_CRC_NUM_TABLES_USE >= 6 * 4 | ||
| 168 | U(5) | ||
| 169 | #if Z7_CRC_NUM_TABLES_USE >= 7 * 4 | ||
| 170 | #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES | ||
| 171 | #endif | ||
| 172 | #endif | ||
| 173 | #endif | ||
| 174 | #endif | ||
| 175 | #endif | ||
| 176 | #undef U | ||
| 177 | #undef U2 | ||
| 178 | v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v); | ||
| 179 | #endif | ||
| 180 | } | ||
| 181 | p += Z7_CRC_NUM_TABLES_USE; | ||
| 182 | } | ||
| 183 | while (p <= lim); | ||
| 184 | lim += Z7_CRC_NUM_TABLES_USE; | ||
| 111 | } | 185 | } |
| 112 | for (; size > 0; size--, p++) | 186 | for (; p < lim; p++) |
| 113 | v = CRC_UPDATE_BYTE_2_BE(v, *p); | 187 | v = CRC_UPDATE_BYTE_2_BE(v, *p); |
| 114 | return CRC_UINT32_SWAP(v); | 188 | return Z7_BSWAP32(v); |
| 115 | } | 189 | } |
| 116 | 190 | ||
| 191 | #undef CRC_UPDATE_BYTE_2_BE | ||
| 192 | #undef R | ||
| 193 | #undef Q | ||
| 194 | #undef CRC_FUNC_PRE_BE | ||
| 195 | #undef CRC_FUNC_PRE_BE2 | ||
| 196 | |||
| 197 | #endif | ||
| 198 | #undef Z7_CRC_NUM_TABLES_USE | ||
| 117 | #endif | 199 | #endif |
| @@ -1,5 +1,5 @@ | |||
| 1 | /* 7zDec.c -- Decoding from 7z folder | 1 | /* 7zDec.c -- Decoding from 7z folder |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -51,6 +51,7 @@ | |||
| 51 | 51 | ||
| 52 | #ifndef Z7_NO_METHODS_FILTERS | 52 | #ifndef Z7_NO_METHODS_FILTERS |
| 53 | #define k_Delta 3 | 53 | #define k_Delta 3 |
| 54 | #define k_RISCV 0xb | ||
| 54 | #define k_BCJ 0x3030103 | 55 | #define k_BCJ 0x3030103 |
| 55 | #define k_PPC 0x3030205 | 56 | #define k_PPC 0x3030205 |
| 56 | #define k_IA64 0x3030401 | 57 | #define k_IA64 0x3030401 |
| @@ -362,6 +363,7 @@ static SRes CheckSupportedFolder(const CSzFolder *f) | |||
| 362 | case k_IA64: | 363 | case k_IA64: |
| 363 | case k_SPARC: | 364 | case k_SPARC: |
| 364 | case k_ARM: | 365 | case k_ARM: |
| 366 | case k_RISCV: | ||
| 365 | #endif | 367 | #endif |
| 366 | #ifdef Z7_USE_FILTER_ARM64 | 368 | #ifdef Z7_USE_FILTER_ARM64 |
| 367 | case k_ARM64: | 369 | case k_ARM64: |
| @@ -535,10 +537,10 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, | |||
| 535 | } | 537 | } |
| 536 | } | 538 | } |
| 537 | } | 539 | } |
| 538 | #if defined(Z7_USE_BRANCH_FILTER) | 540 | #if defined(Z7_USE_BRANCH_FILTER) |
| 539 | else if (ci == 1) | 541 | else if (ci == 1) |
| 540 | { | 542 | { |
| 541 | #if !defined(Z7_NO_METHODS_FILTERS) | 543 | #if !defined(Z7_NO_METHODS_FILTERS) |
| 542 | if (coder->MethodID == k_Delta) | 544 | if (coder->MethodID == k_Delta) |
| 543 | { | 545 | { |
| 544 | if (coder->PropsSize != 1) | 546 | if (coder->PropsSize != 1) |
| @@ -550,22 +552,43 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, | |||
| 550 | } | 552 | } |
| 551 | continue; | 553 | continue; |
| 552 | } | 554 | } |
| 553 | #endif | 555 | #endif |
| 554 | 556 | ||
| 555 | #ifdef Z7_USE_FILTER_ARM64 | 557 | #ifdef Z7_USE_FILTER_ARM64 |
| 556 | if (coder->MethodID == k_ARM64) | 558 | if (coder->MethodID == k_ARM64) |
| 557 | { | 559 | { |
| 558 | UInt32 pc = 0; | 560 | UInt32 pc = 0; |
| 559 | if (coder->PropsSize == 4) | 561 | if (coder->PropsSize == 4) |
| 562 | { | ||
| 560 | pc = GetUi32(propsData + coder->PropsOffset); | 563 | pc = GetUi32(propsData + coder->PropsOffset); |
| 564 | if (pc & 3) | ||
| 565 | return SZ_ERROR_UNSUPPORTED; | ||
| 566 | } | ||
| 561 | else if (coder->PropsSize != 0) | 567 | else if (coder->PropsSize != 0) |
| 562 | return SZ_ERROR_UNSUPPORTED; | 568 | return SZ_ERROR_UNSUPPORTED; |
| 563 | z7_BranchConv_ARM64_Dec(outBuffer, outSize, pc); | 569 | z7_BranchConv_ARM64_Dec(outBuffer, outSize, pc); |
| 564 | continue; | 570 | continue; |
| 565 | } | 571 | } |
| 566 | #endif | 572 | #endif |
| 567 | 573 | ||
| 568 | #if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT) | 574 | #if !defined(Z7_NO_METHODS_FILTERS) |
| 575 | if (coder->MethodID == k_RISCV) | ||
| 576 | { | ||
| 577 | UInt32 pc = 0; | ||
| 578 | if (coder->PropsSize == 4) | ||
| 579 | { | ||
| 580 | pc = GetUi32(propsData + coder->PropsOffset); | ||
| 581 | if (pc & 1) | ||
| 582 | return SZ_ERROR_UNSUPPORTED; | ||
| 583 | } | ||
| 584 | else if (coder->PropsSize != 0) | ||
| 585 | return SZ_ERROR_UNSUPPORTED; | ||
| 586 | z7_BranchConv_RISCV_Dec(outBuffer, outSize, pc); | ||
| 587 | continue; | ||
| 588 | } | ||
| 589 | #endif | ||
| 590 | |||
| 591 | #if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT) | ||
| 569 | { | 592 | { |
| 570 | if (coder->PropsSize != 0) | 593 | if (coder->PropsSize != 0) |
| 571 | return SZ_ERROR_UNSUPPORTED; | 594 | return SZ_ERROR_UNSUPPORTED; |
| @@ -579,7 +602,8 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, | |||
| 579 | z7_BranchConvSt_X86_Dec(outBuffer, outSize, 0, &state); // pc = 0 | 602 | z7_BranchConvSt_X86_Dec(outBuffer, outSize, 0, &state); // pc = 0 |
| 580 | break; | 603 | break; |
| 581 | } | 604 | } |
| 582 | CASE_BRA_CONV(PPC) | 605 | case k_PPC: Z7_BRANCH_CONV_DEC_2(BranchConv_PPC)(outBuffer, outSize, 0); break; // pc = 0; |
| 606 | // CASE_BRA_CONV(PPC) | ||
| 583 | CASE_BRA_CONV(IA64) | 607 | CASE_BRA_CONV(IA64) |
| 584 | CASE_BRA_CONV(SPARC) | 608 | CASE_BRA_CONV(SPARC) |
| 585 | CASE_BRA_CONV(ARM) | 609 | CASE_BRA_CONV(ARM) |
| @@ -592,9 +616,9 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, | |||
| 592 | } | 616 | } |
| 593 | continue; | 617 | continue; |
| 594 | } | 618 | } |
| 595 | #endif | 619 | #endif |
| 596 | } // (c == 1) | 620 | } // (c == 1) |
| 597 | #endif | 621 | #endif // Z7_USE_BRANCH_FILTER |
| 598 | else | 622 | else |
| 599 | return SZ_ERROR_UNSUPPORTED; | 623 | return SZ_ERROR_UNSUPPORTED; |
| 600 | } | 624 | } |
diff --git a/C/7zTypes.h b/C/7zTypes.h index 1fcb247..5b77420 100644 --- a/C/7zTypes.h +++ b/C/7zTypes.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* 7zTypes.h -- Basic types | 1 | /* 7zTypes.h -- Basic types |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-24 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_7Z_TYPES_H | 4 | #ifndef ZIP7_7Z_TYPES_H |
| 5 | #define ZIP7_7Z_TYPES_H | 5 | #define ZIP7_7Z_TYPES_H |
| @@ -530,20 +530,20 @@ struct ISzAlloc | |||
| 530 | #define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL(ptr, type, m) | 530 | #define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL(ptr, type, m) |
| 531 | */ | 531 | */ |
| 532 | #if defined (__clang__) || defined(__GNUC__) | 532 | #if defined (__clang__) || defined(__GNUC__) |
| 533 | #define Z7_DIAGNOSCTIC_IGNORE_BEGIN_CAST_QUAL \ | 533 | #define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \ |
| 534 | _Pragma("GCC diagnostic push") \ | 534 | _Pragma("GCC diagnostic push") \ |
| 535 | _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") | 535 | _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") |
| 536 | #define Z7_DIAGNOSCTIC_IGNORE_END_CAST_QUAL \ | 536 | #define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL \ |
| 537 | _Pragma("GCC diagnostic pop") | 537 | _Pragma("GCC diagnostic pop") |
| 538 | #else | 538 | #else |
| 539 | #define Z7_DIAGNOSCTIC_IGNORE_BEGIN_CAST_QUAL | 539 | #define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL |
| 540 | #define Z7_DIAGNOSCTIC_IGNORE_END_CAST_QUAL | 540 | #define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL |
| 541 | #endif | 541 | #endif |
| 542 | 542 | ||
| 543 | #define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(ptr, type, m, p) \ | 543 | #define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(ptr, type, m, p) \ |
| 544 | Z7_DIAGNOSCTIC_IGNORE_BEGIN_CAST_QUAL \ | 544 | Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \ |
| 545 | type *p = Z7_CONTAINER_FROM_VTBL(ptr, type, m); \ | 545 | type *p = Z7_CONTAINER_FROM_VTBL(ptr, type, m); \ |
| 546 | Z7_DIAGNOSCTIC_IGNORE_END_CAST_QUAL | 546 | Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL |
| 547 | 547 | ||
| 548 | #define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(type) \ | 548 | #define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(type) \ |
| 549 | Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(pp, type, vt, p) | 549 | Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(pp, type, vt, p) |
diff --git a/C/7zVersion.h b/C/7zVersion.h index 7549239..72b915a 100644 --- a/C/7zVersion.h +++ b/C/7zVersion.h | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | #define MY_VER_MAJOR 23 | 1 | #define MY_VER_MAJOR 24 |
| 2 | #define MY_VER_MINOR 01 | 2 | #define MY_VER_MINOR 05 |
| 3 | #define MY_VER_BUILD 0 | 3 | #define MY_VER_BUILD 0 |
| 4 | #define MY_VERSION_NUMBERS "23.01" | 4 | #define MY_VERSION_NUMBERS "24.05" |
| 5 | #define MY_VERSION MY_VERSION_NUMBERS | 5 | #define MY_VERSION MY_VERSION_NUMBERS |
| 6 | 6 | ||
| 7 | #ifdef MY_CPU_NAME | 7 | #ifdef MY_CPU_NAME |
| @@ -10,12 +10,12 @@ | |||
| 10 | #define MY_VERSION_CPU MY_VERSION | 10 | #define MY_VERSION_CPU MY_VERSION |
| 11 | #endif | 11 | #endif |
| 12 | 12 | ||
| 13 | #define MY_DATE "2023-06-20" | 13 | #define MY_DATE "2024-05-14" |
| 14 | #undef MY_COPYRIGHT | 14 | #undef MY_COPYRIGHT |
| 15 | #undef MY_VERSION_COPYRIGHT_DATE | 15 | #undef MY_VERSION_COPYRIGHT_DATE |
| 16 | #define MY_AUTHOR_NAME "Igor Pavlov" | 16 | #define MY_AUTHOR_NAME "Igor Pavlov" |
| 17 | #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" | 17 | #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" |
| 18 | #define MY_COPYRIGHT_CR "Copyright (c) 1999-2023 Igor Pavlov" | 18 | #define MY_COPYRIGHT_CR "Copyright (c) 1999-2024 Igor Pavlov" |
| 19 | 19 | ||
| 20 | #ifdef USE_COPYRIGHT_CR | 20 | #ifdef USE_COPYRIGHT_CR |
| 21 | #define MY_COPYRIGHT MY_COPYRIGHT_CR | 21 | #define MY_COPYRIGHT MY_COPYRIGHT_CR |
diff --git a/C/7zip_gcc_c.mak b/C/7zip_gcc_c.mak index f19a99b..195d23d 100644 --- a/C/7zip_gcc_c.mak +++ b/C/7zip_gcc_c.mak | |||
| @@ -22,8 +22,8 @@ CFLAGS_BASE_LIST = -c | |||
| 22 | # for ASM file | 22 | # for ASM file |
| 23 | # CFLAGS_BASE_LIST = -S | 23 | # CFLAGS_BASE_LIST = -S |
| 24 | 24 | ||
| 25 | FLAGS_FLTO = | ||
| 26 | FLAGS_FLTO = -flto | 25 | FLAGS_FLTO = -flto |
| 26 | FLAGS_FLTO = | ||
| 27 | 27 | ||
| 28 | CFLAGS_BASE = $(MY_ARCH_2) -O2 $(CFLAGS_BASE_LIST) $(CFLAGS_WARN_WALL) $(CFLAGS_WARN) \ | 28 | CFLAGS_BASE = $(MY_ARCH_2) -O2 $(CFLAGS_BASE_LIST) $(CFLAGS_WARN_WALL) $(CFLAGS_WARN) \ |
| 29 | -DNDEBUG -D_REENTRANT -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE | 29 | -DNDEBUG -D_REENTRANT -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE |
| @@ -329,7 +329,7 @@ endif | |||
| 329 | 329 | ||
| 330 | ifdef IS_ARM64 | 330 | ifdef IS_ARM64 |
| 331 | $O/LzmaDecOpt.o: ../../../Asm/arm64/LzmaDecOpt.S ../../../Asm/arm64/7zAsm.S | 331 | $O/LzmaDecOpt.o: ../../../Asm/arm64/LzmaDecOpt.S ../../../Asm/arm64/7zAsm.S |
| 332 | $(CC) $(CFLAGS) $< | 332 | $(CC) $(CFLAGS) $(ASM_FLAGS) $< |
| 333 | endif | 333 | endif |
| 334 | 334 | ||
| 335 | $O/LzmaDec.o: ../../LzmaDec.c | 335 | $O/LzmaDec.o: ../../LzmaDec.c |
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Aes.c -- AES encryption / decryption | 1 | /* Aes.c -- AES encryption / decryption |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -13,7 +13,9 @@ AES_CODE_FUNC g_AesCtr_Code; | |||
| 13 | UInt32 g_Aes_SupportedFunctions_Flags; | 13 | UInt32 g_Aes_SupportedFunctions_Flags; |
| 14 | #endif | 14 | #endif |
| 15 | 15 | ||
| 16 | MY_ALIGN(64) | ||
| 16 | static UInt32 T[256 * 4]; | 17 | static UInt32 T[256 * 4]; |
| 18 | MY_ALIGN(64) | ||
| 17 | static const Byte Sbox[256] = { | 19 | static const Byte Sbox[256] = { |
| 18 | 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, | 20 | 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, |
| 19 | 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, | 21 | 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, |
| @@ -33,7 +35,9 @@ static const Byte Sbox[256] = { | |||
| 33 | 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; | 35 | 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; |
| 34 | 36 | ||
| 35 | 37 | ||
| 38 | MY_ALIGN(64) | ||
| 36 | static UInt32 D[256 * 4]; | 39 | static UInt32 D[256 * 4]; |
| 40 | MY_ALIGN(64) | ||
| 37 | static Byte InvS[256]; | 41 | static Byte InvS[256]; |
| 38 | 42 | ||
| 39 | #define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF) | 43 | #define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF) |
| @@ -54,24 +58,54 @@ static Byte InvS[256]; | |||
| 54 | // #define Z7_SHOW_AES_STATUS | 58 | // #define Z7_SHOW_AES_STATUS |
| 55 | 59 | ||
| 56 | #ifdef MY_CPU_X86_OR_AMD64 | 60 | #ifdef MY_CPU_X86_OR_AMD64 |
| 57 | #define USE_HW_AES | 61 | |
| 58 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) | 62 | #if defined(__INTEL_COMPILER) |
| 59 | #if defined(__clang__) | 63 | #if (__INTEL_COMPILER >= 1110) |
| 60 | #if (__clang_major__ >= 8) // fix that check | ||
| 61 | #define USE_HW_AES | ||
| 62 | #endif | ||
| 63 | #elif defined(__GNUC__) | ||
| 64 | #if (__GNUC__ >= 6) // fix that check | ||
| 65 | #define USE_HW_AES | 64 | #define USE_HW_AES |
| 65 | #if (__INTEL_COMPILER >= 1900) | ||
| 66 | #define USE_HW_VAES | ||
| 67 | #endif | ||
| 66 | #endif | 68 | #endif |
| 69 | #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
| 70 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400) | ||
| 71 | #define USE_HW_AES | ||
| 72 | #if defined(__clang__) && (__clang_major__ >= 8) \ | ||
| 73 | || defined(__GNUC__) && (__GNUC__ >= 8) | ||
| 74 | #define USE_HW_VAES | ||
| 75 | #endif | ||
| 67 | #elif defined(_MSC_VER) | 76 | #elif defined(_MSC_VER) |
| 68 | #if _MSC_VER >= 1910 | 77 | #define USE_HW_AES |
| 78 | #define USE_HW_VAES | ||
| 79 | #endif | ||
| 80 | |||
| 81 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) | ||
| 82 | |||
| 83 | #if defined(__ARM_FEATURE_AES) \ | ||
| 84 | || defined(__ARM_FEATURE_CRYPTO) | ||
| 85 | #define USE_HW_AES | ||
| 86 | #else | ||
| 87 | #if defined(MY_CPU_ARM64) \ | ||
| 88 | || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ | ||
| 89 | || defined(Z7_MSC_VER_ORIGINAL) | ||
| 90 | #if defined(__ARM_FP) && \ | ||
| 91 | ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
| 92 | || defined(__GNUC__) && (__GNUC__ >= 6) \ | ||
| 93 | ) \ | ||
| 94 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) | ||
| 95 | #if defined(MY_CPU_ARM64) \ | ||
| 96 | || !defined(Z7_CLANG_VERSION) \ | ||
| 97 | || defined(__ARM_NEON) && \ | ||
| 98 | (Z7_CLANG_VERSION < 170000 || \ | ||
| 99 | Z7_CLANG_VERSION > 170001) | ||
| 69 | #define USE_HW_AES | 100 | #define USE_HW_AES |
| 70 | #endif | 101 | #endif |
| 102 | #endif | ||
| 103 | #endif | ||
| 71 | #endif | 104 | #endif |
| 72 | #endif | 105 | #endif |
| 73 | 106 | ||
| 74 | #ifdef USE_HW_AES | 107 | #ifdef USE_HW_AES |
| 108 | // #pragma message("=== Aes.c USE_HW_AES === ") | ||
| 75 | #ifdef Z7_SHOW_AES_STATUS | 109 | #ifdef Z7_SHOW_AES_STATUS |
| 76 | #include <stdio.h> | 110 | #include <stdio.h> |
| 77 | #define PRF(x) x | 111 | #define PRF(x) x |
| @@ -136,6 +170,7 @@ void AesGenTables(void) | |||
| 136 | #endif | 170 | #endif |
| 137 | 171 | ||
| 138 | #ifdef MY_CPU_X86_OR_AMD64 | 172 | #ifdef MY_CPU_X86_OR_AMD64 |
| 173 | #ifdef USE_HW_VAES | ||
| 139 | if (CPU_IsSupported_VAES_AVX2()) | 174 | if (CPU_IsSupported_VAES_AVX2()) |
| 140 | { | 175 | { |
| 141 | PRF(printf("\n===vaes avx2\n")); | 176 | PRF(printf("\n===vaes avx2\n")); |
| @@ -146,6 +181,7 @@ void AesGenTables(void) | |||
| 146 | #endif | 181 | #endif |
| 147 | } | 182 | } |
| 148 | #endif | 183 | #endif |
| 184 | #endif | ||
| 149 | } | 185 | } |
| 150 | #endif | 186 | #endif |
| 151 | 187 | ||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions | 1 | /* AesOpt.c -- AES optimized code for x86 AES hardware instructions |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -15,8 +15,8 @@ | |||
| 15 | #define USE_INTEL_VAES | 15 | #define USE_INTEL_VAES |
| 16 | #endif | 16 | #endif |
| 17 | #endif | 17 | #endif |
| 18 | #elif defined(__clang__) && (__clang_major__ > 3 || __clang_major__ == 3 && __clang_minor__ >= 8) \ | 18 | #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ |
| 19 | || defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4) | 19 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400) |
| 20 | #define USE_INTEL_AES | 20 | #define USE_INTEL_AES |
| 21 | #if !defined(__AES__) | 21 | #if !defined(__AES__) |
| 22 | #define ATTRIB_AES __attribute__((__target__("aes"))) | 22 | #define ATTRIB_AES __attribute__((__target__("aes"))) |
| @@ -35,27 +35,37 @@ | |||
| 35 | #define USE_INTEL_VAES | 35 | #define USE_INTEL_VAES |
| 36 | #endif | 36 | #endif |
| 37 | #endif | 37 | #endif |
| 38 | #ifndef USE_INTEL_AES | ||
| 39 | #define Z7_USE_AES_HW_STUB | ||
| 40 | #endif | ||
| 41 | #ifndef USE_INTEL_VAES | ||
| 42 | #define Z7_USE_VAES_HW_STUB | ||
| 43 | #endif | ||
| 38 | #endif | 44 | #endif |
| 39 | 45 | ||
| 40 | #ifndef ATTRIB_AES | 46 | #ifndef USE_INTEL_AES |
| 41 | #define ATTRIB_AES | 47 | // #define Z7_USE_AES_HW_STUB // for debug |
| 42 | #endif | 48 | #endif |
| 43 | #ifndef ATTRIB_VAES | 49 | #ifndef USE_INTEL_VAES |
| 44 | #define ATTRIB_VAES | 50 | // #define Z7_USE_VAES_HW_STUB // for debug |
| 45 | #endif | 51 | #endif |
| 46 | 52 | ||
| 47 | 53 | ||
| 48 | #ifdef USE_INTEL_AES | 54 | #ifdef USE_INTEL_AES |
| 49 | 55 | ||
| 50 | #include <wmmintrin.h> | 56 | #include <wmmintrin.h> |
| 51 | 57 | ||
| 52 | #ifndef USE_INTEL_VAES | 58 | #if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB) |
| 53 | #define AES_TYPE_keys UInt32 | 59 | #define AES_TYPE_keys UInt32 |
| 54 | #define AES_TYPE_data Byte | 60 | #define AES_TYPE_data Byte |
| 55 | // #define AES_TYPE_keys __m128i | 61 | // #define AES_TYPE_keys __m128i |
| 56 | // #define AES_TYPE_data __m128i | 62 | // #define AES_TYPE_data __m128i |
| 57 | #endif | 63 | #endif |
| 58 | 64 | ||
| 65 | #ifndef ATTRIB_AES | ||
| 66 | #define ATTRIB_AES | ||
| 67 | #endif | ||
| 68 | |||
| 59 | #define AES_FUNC_START(name) \ | 69 | #define AES_FUNC_START(name) \ |
| 60 | void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) | 70 | void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) |
| 61 | // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks) | 71 | // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks) |
| @@ -69,8 +79,6 @@ AES_FUNC_START (name) | |||
| 69 | #define MM_OP_m(op, src) MM_OP(op, m, src) | 79 | #define MM_OP_m(op, src) MM_OP(op, m, src) |
| 70 | 80 | ||
| 71 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) | 81 | #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) |
| 72 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) | ||
| 73 | |||
| 74 | 82 | ||
| 75 | AES_FUNC_START2 (AesCbc_Encode_HW) | 83 | AES_FUNC_START2 (AesCbc_Encode_HW) |
| 76 | { | 84 | { |
| @@ -139,11 +147,6 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
| 139 | #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) | 147 | #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) |
| 140 | #endif | 148 | #endif |
| 141 | 149 | ||
| 142 | #define AVX_DECLARE_VAR(reg, ii) __m256i reg; | ||
| 143 | #define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; | ||
| 144 | #define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; | ||
| 145 | #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) | ||
| 146 | |||
| 147 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); | 150 | #define MM_OP_key(op, reg) MM_OP(op, reg, key); |
| 148 | 151 | ||
| 149 | #define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg) | 152 | #define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg) |
| @@ -152,27 +155,13 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
| 152 | #define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg) | 155 | #define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg) |
| 153 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) | 156 | #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) |
| 154 | 157 | ||
| 155 | |||
| 156 | #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) | ||
| 157 | #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) | ||
| 158 | #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) | ||
| 159 | #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) | ||
| 160 | #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) | ||
| 161 | |||
| 162 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; | 158 | #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; |
| 163 | #define CTR_END( reg, ii) MM_XOR (data[ii], reg) | 159 | #define CTR_END( reg, ii) MM_XOR (data[ii], reg) |
| 164 | 160 | ||
| 165 | #define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); | ||
| 166 | #define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) | ||
| 167 | |||
| 168 | #define WOP_KEY(op, n) { \ | 161 | #define WOP_KEY(op, n) { \ |
| 169 | const __m128i key = w[n]; \ | 162 | const __m128i key = w[n]; \ |
| 170 | WOP(op); } | 163 | WOP(op); } |
| 171 | 164 | ||
| 172 | #define AVX_WOP_KEY(op, n) { \ | ||
| 173 | const __m256i key = w[n]; \ | ||
| 174 | WOP(op); } | ||
| 175 | |||
| 176 | 165 | ||
| 177 | #define WIDE_LOOP_START \ | 166 | #define WIDE_LOOP_START \ |
| 178 | dataEnd = data + numBlocks; \ | 167 | dataEnd = data + numBlocks; \ |
| @@ -190,6 +179,40 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
| 190 | for (; data < dataEnd; data++) | 179 | for (; data < dataEnd; data++) |
| 191 | 180 | ||
| 192 | 181 | ||
| 182 | |||
| 183 | #ifdef USE_INTEL_VAES | ||
| 184 | |||
| 185 | #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) | ||
| 186 | #define AVX_DECLARE_VAR(reg, ii) __m256i reg; | ||
| 187 | #define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; | ||
| 188 | #define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; | ||
| 189 | /* | ||
| 190 | AVX_XOR_data_M1() needs unaligned memory load | ||
| 191 | if (we don't use _mm256_loadu_si256() here) | ||
| 192 | { | ||
| 193 | Most compilers with enabled optimizations generate fused AVX (LOAD + OP) | ||
| 194 | instruction that can load unaligned data. | ||
| 195 | But GCC and CLANG without -O2 or -O1 optimizations can generate separated | ||
| 196 | LOAD-ALIGNED (vmovdqa) instruction that will fail on execution. | ||
| 197 | } | ||
| 198 | Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here. | ||
| 199 | v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler. | ||
| 200 | */ | ||
| 201 | #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii]))) | ||
| 202 | // for debug only: the following code will fail on execution, if compiled by some compilers: | ||
| 203 | // #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) | ||
| 204 | |||
| 205 | #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) | ||
| 206 | #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) | ||
| 207 | #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) | ||
| 208 | #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) | ||
| 209 | #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) | ||
| 210 | #define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); | ||
| 211 | #define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) | ||
| 212 | #define AVX_WOP_KEY(op, n) { \ | ||
| 213 | const __m256i key = w[n]; \ | ||
| 214 | WOP(op); } | ||
| 215 | |||
| 193 | #define NUM_AES_KEYS_MAX 15 | 216 | #define NUM_AES_KEYS_MAX 15 |
| 194 | 217 | ||
| 195 | #define WIDE_LOOP_START_AVX(OP) \ | 218 | #define WIDE_LOOP_START_AVX(OP) \ |
| @@ -214,6 +237,9 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
| 214 | /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, | 237 | /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, |
| 215 | MSVC still can insert vzeroupper instruction. */ | 238 | MSVC still can insert vzeroupper instruction. */ |
| 216 | 239 | ||
| 240 | #endif | ||
| 241 | |||
| 242 | |||
| 217 | 243 | ||
| 218 | AES_FUNC_START2 (AesCbc_Decode_HW) | 244 | AES_FUNC_START2 (AesCbc_Decode_HW) |
| 219 | { | 245 | { |
| @@ -380,6 +406,9 @@ required that <immintrin.h> must be included before <avxintrin.h>. | |||
| 380 | #endif | 406 | #endif |
| 381 | #endif // __clang__ && _MSC_VER | 407 | #endif // __clang__ && _MSC_VER |
| 382 | 408 | ||
| 409 | #ifndef ATTRIB_VAES | ||
| 410 | #define ATTRIB_VAES | ||
| 411 | #endif | ||
| 383 | 412 | ||
| 384 | #define VAES_FUNC_START2(name) \ | 413 | #define VAES_FUNC_START2(name) \ |
| 385 | AES_FUNC_START (name); \ | 414 | AES_FUNC_START (name); \ |
| @@ -519,10 +548,18 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
| 519 | 548 | ||
| 520 | /* no USE_INTEL_AES */ | 549 | /* no USE_INTEL_AES */ |
| 521 | 550 | ||
| 551 | #if defined(Z7_USE_AES_HW_STUB) | ||
| 552 | // We can compile this file with another C compiler, | ||
| 553 | // or we can compile asm version. | ||
| 554 | // So we can generate real code instead of this stub function. | ||
| 555 | // #if defined(_MSC_VER) | ||
| 522 | #pragma message("AES HW_SW stub was used") | 556 | #pragma message("AES HW_SW stub was used") |
| 557 | // #endif | ||
| 523 | 558 | ||
| 559 | #if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB) | ||
| 524 | #define AES_TYPE_keys UInt32 | 560 | #define AES_TYPE_keys UInt32 |
| 525 | #define AES_TYPE_data Byte | 561 | #define AES_TYPE_data Byte |
| 562 | #endif | ||
| 526 | 563 | ||
| 527 | #define AES_FUNC_START(name) \ | 564 | #define AES_FUNC_START(name) \ |
| 528 | void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \ | 565 | void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \ |
| @@ -535,13 +572,16 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) | |||
| 535 | AES_COMPAT_STUB (AesCbc_Encode) | 572 | AES_COMPAT_STUB (AesCbc_Encode) |
| 536 | AES_COMPAT_STUB (AesCbc_Decode) | 573 | AES_COMPAT_STUB (AesCbc_Decode) |
| 537 | AES_COMPAT_STUB (AesCtr_Code) | 574 | AES_COMPAT_STUB (AesCtr_Code) |
| 575 | #endif // Z7_USE_AES_HW_STUB | ||
| 538 | 576 | ||
| 539 | #endif // USE_INTEL_AES | 577 | #endif // USE_INTEL_AES |
| 540 | 578 | ||
| 541 | 579 | ||
| 542 | #ifndef USE_INTEL_VAES | 580 | #ifndef USE_INTEL_VAES |
| 543 | 581 | #if defined(Z7_USE_VAES_HW_STUB) | |
| 582 | // #if defined(_MSC_VER) | ||
| 544 | #pragma message("VAES HW_SW stub was used") | 583 | #pragma message("VAES HW_SW stub was used") |
| 584 | // #endif | ||
| 545 | 585 | ||
| 546 | #define VAES_COMPAT_STUB(name) \ | 586 | #define VAES_COMPAT_STUB(name) \ |
| 547 | void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ | 587 | void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ |
| @@ -550,36 +590,59 @@ AES_COMPAT_STUB (AesCtr_Code) | |||
| 550 | 590 | ||
| 551 | VAES_COMPAT_STUB (AesCbc_Decode_HW) | 591 | VAES_COMPAT_STUB (AesCbc_Decode_HW) |
| 552 | VAES_COMPAT_STUB (AesCtr_Code_HW) | 592 | VAES_COMPAT_STUB (AesCtr_Code_HW) |
| 553 | 593 | #endif | |
| 554 | #endif // ! USE_INTEL_VAES | 594 | #endif // ! USE_INTEL_VAES |
| 555 | 595 | ||
| 556 | 596 | ||
| 597 | |||
| 598 | |||
| 557 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) | 599 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) |
| 558 | 600 | ||
| 559 | #if defined(__clang__) | 601 | #if defined(__ARM_FEATURE_AES) \ |
| 560 | #if (__clang_major__ >= 8) // fix that check | 602 | || defined(__ARM_FEATURE_CRYPTO) |
| 603 | #define USE_HW_AES | ||
| 604 | #else | ||
| 605 | #if defined(MY_CPU_ARM64) \ | ||
| 606 | || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ | ||
| 607 | || defined(Z7_MSC_VER_ORIGINAL) | ||
| 608 | #if defined(__ARM_FP) && \ | ||
| 609 | ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
| 610 | || defined(__GNUC__) && (__GNUC__ >= 6) \ | ||
| 611 | ) \ | ||
| 612 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) | ||
| 613 | #if defined(MY_CPU_ARM64) \ | ||
| 614 | || !defined(Z7_CLANG_VERSION) \ | ||
| 615 | || defined(__ARM_NEON) && \ | ||
| 616 | (Z7_CLANG_VERSION < 170000 || \ | ||
| 617 | Z7_CLANG_VERSION > 170001) | ||
| 561 | #define USE_HW_AES | 618 | #define USE_HW_AES |
| 562 | #endif | 619 | #endif |
| 563 | #elif defined(__GNUC__) | ||
| 564 | #if (__GNUC__ >= 6) // fix that check | ||
| 565 | #define USE_HW_AES | ||
| 566 | #endif | 620 | #endif |
| 567 | #elif defined(_MSC_VER) | ||
| 568 | #if _MSC_VER >= 1910 | ||
| 569 | #define USE_HW_AES | ||
| 570 | #endif | 621 | #endif |
| 571 | #endif | 622 | #endif |
| 572 | 623 | ||
| 573 | #ifdef USE_HW_AES | 624 | #ifdef USE_HW_AES |
| 574 | 625 | ||
| 575 | // #pragma message("=== AES HW === ") | 626 | // #pragma message("=== AES HW === ") |
| 627 | // __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_AES | ||
| 576 | 628 | ||
| 577 | #if defined(__clang__) || defined(__GNUC__) | 629 | #if defined(__clang__) || defined(__GNUC__) |
| 630 | #if !defined(__ARM_FEATURE_AES) && \ | ||
| 631 | !defined(__ARM_FEATURE_CRYPTO) | ||
| 578 | #ifdef MY_CPU_ARM64 | 632 | #ifdef MY_CPU_ARM64 |
| 633 | #if defined(__clang__) | ||
| 634 | #define ATTRIB_AES __attribute__((__target__("crypto"))) | ||
| 635 | #else | ||
| 579 | #define ATTRIB_AES __attribute__((__target__("+crypto"))) | 636 | #define ATTRIB_AES __attribute__((__target__("+crypto"))) |
| 637 | #endif | ||
| 580 | #else | 638 | #else |
| 639 | #if defined(__clang__) | ||
| 640 | #define ATTRIB_AES __attribute__((__target__("armv8-a,aes"))) | ||
| 641 | #else | ||
| 581 | #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | 642 | #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) |
| 643 | #endif | ||
| 582 | #endif | 644 | #endif |
| 645 | #endif | ||
| 583 | #else | 646 | #else |
| 584 | // _MSC_VER | 647 | // _MSC_VER |
| 585 | // for arm32 | 648 | // for arm32 |
| @@ -590,12 +653,60 @@ VAES_COMPAT_STUB (AesCtr_Code_HW) | |||
| 590 | #define ATTRIB_AES | 653 | #define ATTRIB_AES |
| 591 | #endif | 654 | #endif |
| 592 | 655 | ||
| 593 | #if defined(_MSC_VER) && defined(MY_CPU_ARM64) | 656 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) |
| 594 | #include <arm64_neon.h> | 657 | #include <arm64_neon.h> |
| 595 | #else | 658 | #else |
| 659 | /* | ||
| 660 | clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese | ||
| 661 | clang | ||
| 662 | 3.8.1 : __ARM_NEON : defined(__ARM_FEATURE_CRYPTO) | ||
| 663 | 7.0.1 : __ARM_NEON : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) | ||
| 664 | 11.?.0 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) | ||
| 665 | 13.0.1 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES) | ||
| 666 | 16 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 | ||
| 667 | */ | ||
| 668 | #if defined(__clang__) && __clang_major__ < 16 | ||
| 669 | #if !defined(__ARM_FEATURE_AES) && \ | ||
| 670 | !defined(__ARM_FEATURE_CRYPTO) | ||
| 671 | // #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ") | ||
| 672 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 673 | #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1 | ||
| 674 | // #if defined(__clang__) && __clang_major__ < 13 | ||
| 675 | #define __ARM_FEATURE_CRYPTO 1 | ||
| 676 | // #else | ||
| 677 | #define __ARM_FEATURE_AES 1 | ||
| 678 | // #endif | ||
| 679 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 680 | #endif | ||
| 681 | #endif // clang | ||
| 682 | |||
| 683 | #if defined(__clang__) | ||
| 684 | |||
| 685 | #if defined(__ARM_ARCH) && __ARM_ARCH < 8 | ||
| 686 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 687 | // #pragma message("#define __ARM_ARCH 8") | ||
| 688 | #undef __ARM_ARCH | ||
| 689 | #define __ARM_ARCH 8 | ||
| 690 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 691 | #endif | ||
| 692 | |||
| 693 | #endif // clang | ||
| 694 | |||
| 596 | #include <arm_neon.h> | 695 | #include <arm_neon.h> |
| 696 | |||
| 697 | #if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \ | ||
| 698 | defined(__ARM_FEATURE_CRYPTO) && \ | ||
| 699 | defined(__ARM_FEATURE_AES) | ||
| 700 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 701 | #undef __ARM_FEATURE_CRYPTO | ||
| 702 | #undef __ARM_FEATURE_AES | ||
| 703 | #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET | ||
| 704 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 705 | // #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") | ||
| 597 | #endif | 706 | #endif |
| 598 | 707 | ||
| 708 | #endif // Z7_MSC_VER_ORIGINAL | ||
| 709 | |||
| 599 | typedef uint8x16_t v128; | 710 | typedef uint8x16_t v128; |
| 600 | 711 | ||
| 601 | #define AES_FUNC_START(name) \ | 712 | #define AES_FUNC_START(name) \ |
| @@ -620,7 +731,7 @@ AES_FUNC_START (name) | |||
| 620 | 731 | ||
| 621 | AES_FUNC_START2 (AesCbc_Encode_HW) | 732 | AES_FUNC_START2 (AesCbc_Encode_HW) |
| 622 | { | 733 | { |
| 623 | v128 *p = (v128*)(void*)ivAes; | 734 | v128 * const p = (v128*)(void*)ivAes; |
| 624 | v128 *data = (v128*)(void*)data8; | 735 | v128 *data = (v128*)(void*)data8; |
| 625 | v128 m = *p; | 736 | v128 m = *p; |
| 626 | const v128 k0 = p[2]; | 737 | const v128 k0 = p[2]; |
| @@ -639,7 +750,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
| 639 | const v128 k_z0 = w[2]; | 750 | const v128 k_z0 = w[2]; |
| 640 | for (; numBlocks != 0; numBlocks--, data++) | 751 | for (; numBlocks != 0; numBlocks--, data++) |
| 641 | { | 752 | { |
| 642 | MM_XOR_m (*data); | 753 | MM_XOR_m (*data) |
| 643 | AES_E_MC_m (k0) | 754 | AES_E_MC_m (k0) |
| 644 | AES_E_MC_m (k1) | 755 | AES_E_MC_m (k1) |
| 645 | AES_E_MC_m (k2) | 756 | AES_E_MC_m (k2) |
| @@ -660,7 +771,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW) | |||
| 660 | } | 771 | } |
| 661 | } | 772 | } |
| 662 | AES_E_m (k_z1) | 773 | AES_E_m (k_z1) |
| 663 | MM_XOR_m (k_z0); | 774 | MM_XOR_m (k_z0) |
| 664 | *data = m; | 775 | *data = m; |
| 665 | } | 776 | } |
| 666 | *p = m; | 777 | *p = m; |
| @@ -745,7 +856,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
| 745 | while (w != p); | 856 | while (w != p); |
| 746 | WOP_KEY (AES_D, 1) | 857 | WOP_KEY (AES_D, 1) |
| 747 | WOP_KEY (AES_XOR, 0) | 858 | WOP_KEY (AES_XOR, 0) |
| 748 | MM_XOR (m0, iv); | 859 | MM_XOR (m0, iv) |
| 749 | WOP_M1 (XOR_data_M1) | 860 | WOP_M1 (XOR_data_M1) |
| 750 | iv = data[NUM_WAYS - 1]; | 861 | iv = data[NUM_WAYS - 1]; |
| 751 | WOP (STORE_data) | 862 | WOP (STORE_data) |
| @@ -759,14 +870,14 @@ AES_FUNC_START2 (AesCbc_Decode_HW) | |||
| 759 | AES_D_IMC_m (w[2]) | 870 | AES_D_IMC_m (w[2]) |
| 760 | do | 871 | do |
| 761 | { | 872 | { |
| 762 | AES_D_IMC_m (w[1]); | 873 | AES_D_IMC_m (w[1]) |
| 763 | AES_D_IMC_m (w[0]); | 874 | AES_D_IMC_m (w[0]) |
| 764 | w -= 2; | 875 | w -= 2; |
| 765 | } | 876 | } |
| 766 | while (w != p); | 877 | while (w != p); |
| 767 | AES_D_m (w[1]); | 878 | AES_D_m (w[1]) |
| 768 | MM_XOR_m (w[0]); | 879 | MM_XOR_m (w[0]) |
| 769 | MM_XOR_m (iv); | 880 | MM_XOR_m (iv) |
| 770 | iv = *data; | 881 | iv = *data; |
| 771 | *data = m; | 882 | *data = m; |
| 772 | } | 883 | } |
| @@ -783,6 +894,12 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
| 783 | const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; | 894 | const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; |
| 784 | const v128 *dataEnd; | 895 | const v128 *dataEnd; |
| 785 | uint64x2_t one = vdupq_n_u64(0); | 896 | uint64x2_t one = vdupq_n_u64(0); |
| 897 | |||
| 898 | // the bug in clang: | ||
| 899 | // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); | ||
| 900 | #if defined(__clang__) && (__clang_major__ <= 9) | ||
| 901 | #pragma GCC diagnostic ignored "-Wvector-conversion" | ||
| 902 | #endif | ||
| 786 | one = vsetq_lane_u64(1, one, 0); | 903 | one = vsetq_lane_u64(1, one, 0); |
| 787 | p += 2; | 904 | p += 2; |
| 788 | 905 | ||
| @@ -809,11 +926,11 @@ AES_FUNC_START2 (AesCtr_Code_HW) | |||
| 809 | { | 926 | { |
| 810 | const v128 *w = p; | 927 | const v128 *w = p; |
| 811 | v128 m; | 928 | v128 m; |
| 812 | CTR_START (m, 0); | 929 | CTR_START (m, 0) |
| 813 | do | 930 | do |
| 814 | { | 931 | { |
| 815 | AES_E_MC_m (w[0]); | 932 | AES_E_MC_m (w[0]) |
| 816 | AES_E_MC_m (w[1]); | 933 | AES_E_MC_m (w[1]) |
| 817 | w += 2; | 934 | w += 2; |
| 818 | } | 935 | } |
| 819 | while (w != wEnd); | 936 | while (w != wEnd); |
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Alloc.c -- Memory allocation functions | 1 | /* Alloc.c -- Memory allocation functions |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-02-18 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -10,19 +10,18 @@ | |||
| 10 | 10 | ||
| 11 | #include "Alloc.h" | 11 | #include "Alloc.h" |
| 12 | 12 | ||
| 13 | #ifdef _WIN32 | 13 | #if defined(Z7_LARGE_PAGES) && defined(_WIN32) && \ |
| 14 | #ifdef Z7_LARGE_PAGES | 14 | (!defined(Z7_WIN32_WINNT_MIN) || Z7_WIN32_WINNT_MIN < 0x0502) // < Win2003 (xp-64) |
| 15 | #if defined(__clang__) || defined(__GNUC__) | 15 | #define Z7_USE_DYN_GetLargePageMinimum |
| 16 | typedef void (*Z7_voidFunction)(void); | 16 | #endif |
| 17 | #define MY_CAST_FUNC (Z7_voidFunction) | 17 | |
| 18 | #elif defined(_MSC_VER) && _MSC_VER > 1920 | 18 | // for debug: |
| 19 | #define MY_CAST_FUNC (void *) | 19 | #if 0 |
| 20 | // #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' | 20 | #if defined(__CHERI__) && defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) |
| 21 | #else | 21 | // #pragma message("=== Z7_ALLOC_NO_OFFSET_ALLOCATOR === ") |
| 22 | #define MY_CAST_FUNC | 22 | #define Z7_ALLOC_NO_OFFSET_ALLOCATOR |
| 23 | #endif | ||
| 23 | #endif | 24 | #endif |
| 24 | #endif // Z7_LARGE_PAGES | ||
| 25 | #endif // _WIN32 | ||
| 26 | 25 | ||
| 27 | // #define SZ_ALLOC_DEBUG | 26 | // #define SZ_ALLOC_DEBUG |
| 28 | /* #define SZ_ALLOC_DEBUG */ | 27 | /* #define SZ_ALLOC_DEBUG */ |
| @@ -146,7 +145,9 @@ static void PrintAddr(void *p) | |||
| 146 | #define PRINT_FREE(name, cnt, ptr) | 145 | #define PRINT_FREE(name, cnt, ptr) |
| 147 | #define Print(s) | 146 | #define Print(s) |
| 148 | #define PrintLn() | 147 | #define PrintLn() |
| 148 | #ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR | ||
| 149 | #define PrintHex(v, align) | 149 | #define PrintHex(v, align) |
| 150 | #endif | ||
| 150 | #define PrintAddr(p) | 151 | #define PrintAddr(p) |
| 151 | 152 | ||
| 152 | #endif | 153 | #endif |
| @@ -246,9 +247,9 @@ void MidFree(void *address) | |||
| 246 | #ifdef Z7_LARGE_PAGES | 247 | #ifdef Z7_LARGE_PAGES |
| 247 | 248 | ||
| 248 | #ifdef MEM_LARGE_PAGES | 249 | #ifdef MEM_LARGE_PAGES |
| 249 | #define MY__MEM_LARGE_PAGES MEM_LARGE_PAGES | 250 | #define MY_MEM_LARGE_PAGES MEM_LARGE_PAGES |
| 250 | #else | 251 | #else |
| 251 | #define MY__MEM_LARGE_PAGES 0x20000000 | 252 | #define MY_MEM_LARGE_PAGES 0x20000000 |
| 252 | #endif | 253 | #endif |
| 253 | 254 | ||
| 254 | extern | 255 | extern |
| @@ -258,19 +259,23 @@ typedef SIZE_T (WINAPI *Func_GetLargePageMinimum)(VOID); | |||
| 258 | 259 | ||
| 259 | void SetLargePageSize(void) | 260 | void SetLargePageSize(void) |
| 260 | { | 261 | { |
| 261 | #ifdef Z7_LARGE_PAGES | ||
| 262 | SIZE_T size; | 262 | SIZE_T size; |
| 263 | #ifdef Z7_USE_DYN_GetLargePageMinimum | ||
| 264 | Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION | ||
| 265 | |||
| 263 | const | 266 | const |
| 264 | Func_GetLargePageMinimum fn = | 267 | Func_GetLargePageMinimum fn = |
| 265 | (Func_GetLargePageMinimum) MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), | 268 | (Func_GetLargePageMinimum) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), |
| 266 | "GetLargePageMinimum"); | 269 | "GetLargePageMinimum"); |
| 267 | if (!fn) | 270 | if (!fn) |
| 268 | return; | 271 | return; |
| 269 | size = fn(); | 272 | size = fn(); |
| 273 | #else | ||
| 274 | size = GetLargePageMinimum(); | ||
| 275 | #endif | ||
| 270 | if (size == 0 || (size & (size - 1)) != 0) | 276 | if (size == 0 || (size & (size - 1)) != 0) |
| 271 | return; | 277 | return; |
| 272 | g_LargePageSize = size; | 278 | g_LargePageSize = size; |
| 273 | #endif | ||
| 274 | } | 279 | } |
| 275 | 280 | ||
| 276 | #endif // Z7_LARGE_PAGES | 281 | #endif // Z7_LARGE_PAGES |
| @@ -292,7 +297,7 @@ void *BigAlloc(size_t size) | |||
| 292 | size2 = (size + ps) & ~ps; | 297 | size2 = (size + ps) & ~ps; |
| 293 | if (size2 >= size) | 298 | if (size2 >= size) |
| 294 | { | 299 | { |
| 295 | void *p = VirtualAlloc(NULL, size2, MEM_COMMIT | MY__MEM_LARGE_PAGES, PAGE_READWRITE); | 300 | void *p = VirtualAlloc(NULL, size2, MEM_COMMIT | MY_MEM_LARGE_PAGES, PAGE_READWRITE); |
| 296 | if (p) | 301 | if (p) |
| 297 | { | 302 | { |
| 298 | PRINT_ALLOC("Alloc-BM ", g_allocCountMid, size2, p) | 303 | PRINT_ALLOC("Alloc-BM ", g_allocCountMid, size2, p) |
| @@ -328,20 +333,7 @@ const ISzAlloc g_MidAlloc = { SzMidAlloc, SzMidFree }; | |||
| 328 | const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; | 333 | const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; |
| 329 | #endif | 334 | #endif |
| 330 | 335 | ||
| 331 | /* | 336 | #ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR |
| 332 | uintptr_t : <stdint.h> C99 (optional) | ||
| 333 | : unsupported in VS6 | ||
| 334 | */ | ||
| 335 | |||
| 336 | #ifdef _WIN32 | ||
| 337 | typedef UINT_PTR UIntPtr; | ||
| 338 | #else | ||
| 339 | /* | ||
| 340 | typedef uintptr_t UIntPtr; | ||
| 341 | */ | ||
| 342 | typedef ptrdiff_t UIntPtr; | ||
| 343 | #endif | ||
| 344 | |||
| 345 | 337 | ||
| 346 | #define ADJUST_ALLOC_SIZE 0 | 338 | #define ADJUST_ALLOC_SIZE 0 |
| 347 | /* | 339 | /* |
| @@ -352,14 +344,36 @@ const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; | |||
| 352 | MyAlloc() can return address that is NOT multiple of sizeof(void *). | 344 | MyAlloc() can return address that is NOT multiple of sizeof(void *). |
| 353 | */ | 345 | */ |
| 354 | 346 | ||
| 355 | |||
| 356 | /* | 347 | /* |
| 357 | #define MY_ALIGN_PTR_DOWN(p, align) ((void *)((char *)(p) - ((size_t)(UIntPtr)(p) & ((align) - 1)))) | 348 | uintptr_t : <stdint.h> C99 (optional) |
| 349 | : unsupported in VS6 | ||
| 358 | */ | 350 | */ |
| 359 | #define MY_ALIGN_PTR_DOWN(p, align) ((void *)((((UIntPtr)(p)) & ~((UIntPtr)(align) - 1)))) | 351 | typedef |
| 352 | #ifdef _WIN32 | ||
| 353 | UINT_PTR | ||
| 354 | #elif 1 | ||
| 355 | uintptr_t | ||
| 356 | #else | ||
| 357 | ptrdiff_t | ||
| 358 | #endif | ||
| 359 | MY_uintptr_t; | ||
| 360 | |||
| 361 | #if 0 \ | ||
| 362 | || (defined(__CHERI__) \ | ||
| 363 | || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ > 8)) | ||
| 364 | // for 128-bit pointers (cheri): | ||
| 365 | #define MY_ALIGN_PTR_DOWN(p, align) \ | ||
| 366 | ((void *)((char *)(p) - ((size_t)(MY_uintptr_t)(p) & ((align) - 1)))) | ||
| 367 | #else | ||
| 368 | #define MY_ALIGN_PTR_DOWN(p, align) \ | ||
| 369 | ((void *)((((MY_uintptr_t)(p)) & ~((MY_uintptr_t)(align) - 1)))) | ||
| 370 | #endif | ||
| 360 | 371 | ||
| 372 | #endif | ||
| 361 | 373 | ||
| 362 | #if !defined(_WIN32) && defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L) | 374 | #if !defined(_WIN32) \ |
| 375 | && (defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) \ | ||
| 376 | || defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L)) | ||
| 363 | #define USE_posix_memalign | 377 | #define USE_posix_memalign |
| 364 | #endif | 378 | #endif |
| 365 | 379 | ||
| @@ -399,14 +413,13 @@ static int posix_memalign(void **ptr, size_t align, size_t size) | |||
| 399 | 413 | ||
| 400 | #define ALLOC_ALIGN_SIZE ((size_t)1 << 7) | 414 | #define ALLOC_ALIGN_SIZE ((size_t)1 << 7) |
| 401 | 415 | ||
| 402 | static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) | 416 | void *z7_AlignedAlloc(size_t size) |
| 403 | { | 417 | { |
| 404 | #ifndef USE_posix_memalign | 418 | #ifndef USE_posix_memalign |
| 405 | 419 | ||
| 406 | void *p; | 420 | void *p; |
| 407 | void *pAligned; | 421 | void *pAligned; |
| 408 | size_t newSize; | 422 | size_t newSize; |
| 409 | UNUSED_VAR(pp) | ||
| 410 | 423 | ||
| 411 | /* also we can allocate additional dummy ALLOC_ALIGN_SIZE bytes after aligned | 424 | /* also we can allocate additional dummy ALLOC_ALIGN_SIZE bytes after aligned |
| 412 | block to prevent cache line sharing with another allocated blocks */ | 425 | block to prevent cache line sharing with another allocated blocks */ |
| @@ -431,10 +444,9 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) | |||
| 431 | 444 | ||
| 432 | return pAligned; | 445 | return pAligned; |
| 433 | 446 | ||
| 434 | #else | 447 | #else |
| 435 | 448 | ||
| 436 | void *p; | 449 | void *p; |
| 437 | UNUSED_VAR(pp) | ||
| 438 | if (posix_memalign(&p, ALLOC_ALIGN_SIZE, size)) | 450 | if (posix_memalign(&p, ALLOC_ALIGN_SIZE, size)) |
| 439 | return NULL; | 451 | return NULL; |
| 440 | 452 | ||
| @@ -443,19 +455,37 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) | |||
| 443 | 455 | ||
| 444 | return p; | 456 | return p; |
| 445 | 457 | ||
| 446 | #endif | 458 | #endif |
| 459 | } | ||
| 460 | |||
| 461 | |||
| 462 | void z7_AlignedFree(void *address) | ||
| 463 | { | ||
| 464 | #ifndef USE_posix_memalign | ||
| 465 | if (address) | ||
| 466 | MyFree(((void **)address)[-1]); | ||
| 467 | #else | ||
| 468 | free(address); | ||
| 469 | #endif | ||
| 470 | } | ||
| 471 | |||
| 472 | |||
| 473 | static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) | ||
| 474 | { | ||
| 475 | UNUSED_VAR(pp) | ||
| 476 | return z7_AlignedAlloc(size); | ||
| 447 | } | 477 | } |
| 448 | 478 | ||
| 449 | 479 | ||
| 450 | static void SzAlignedFree(ISzAllocPtr pp, void *address) | 480 | static void SzAlignedFree(ISzAllocPtr pp, void *address) |
| 451 | { | 481 | { |
| 452 | UNUSED_VAR(pp) | 482 | UNUSED_VAR(pp) |
| 453 | #ifndef USE_posix_memalign | 483 | #ifndef USE_posix_memalign |
| 454 | if (address) | 484 | if (address) |
| 455 | MyFree(((void **)address)[-1]); | 485 | MyFree(((void **)address)[-1]); |
| 456 | #else | 486 | #else |
| 457 | free(address); | 487 | free(address); |
| 458 | #endif | 488 | #endif |
| 459 | } | 489 | } |
| 460 | 490 | ||
| 461 | 491 | ||
| @@ -463,16 +493,44 @@ const ISzAlloc g_AlignedAlloc = { SzAlignedAlloc, SzAlignedFree }; | |||
| 463 | 493 | ||
| 464 | 494 | ||
| 465 | 495 | ||
| 466 | #define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *)) | ||
| 467 | |||
| 468 | /* we align ptr to support cases where CAlignOffsetAlloc::offset is not multiply of sizeof(void *) */ | 496 | /* we align ptr to support cases where CAlignOffsetAlloc::offset is not multiply of sizeof(void *) */ |
| 469 | #define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1] | 497 | #ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR |
| 470 | /* | 498 | #if 1 |
| 471 | #define REAL_BLOCK_PTR_VAR(p) ((void **)(p))[-1] | 499 | #define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *)) |
| 472 | */ | 500 | #define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1] |
| 501 | #else | ||
| 502 | // we can use this simplified code, | ||
| 503 | // if (CAlignOffsetAlloc::offset == (k * sizeof(void *)) | ||
| 504 | #define REAL_BLOCK_PTR_VAR(p) (((void **)(p))[-1]) | ||
| 505 | #endif | ||
| 506 | #endif | ||
| 507 | |||
| 508 | |||
| 509 | #if 0 | ||
| 510 | #ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR | ||
| 511 | #include <stdio.h> | ||
| 512 | static void PrintPtr(const char *s, const void *p) | ||
| 513 | { | ||
| 514 | const Byte *p2 = (const Byte *)&p; | ||
| 515 | unsigned i; | ||
| 516 | printf("%s %p ", s, p); | ||
| 517 | for (i = sizeof(p); i != 0;) | ||
| 518 | { | ||
| 519 | i--; | ||
| 520 | printf("%02x", p2[i]); | ||
| 521 | } | ||
| 522 | printf("\n"); | ||
| 523 | } | ||
| 524 | #endif | ||
| 525 | #endif | ||
| 526 | |||
| 473 | 527 | ||
| 474 | static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) | 528 | static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) |
| 475 | { | 529 | { |
| 530 | #if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) | ||
| 531 | UNUSED_VAR(pp) | ||
| 532 | return z7_AlignedAlloc(size); | ||
| 533 | #else | ||
| 476 | const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); | 534 | const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); |
| 477 | void *adr; | 535 | void *adr; |
| 478 | void *pAligned; | 536 | void *pAligned; |
| @@ -501,6 +559,12 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) | |||
| 501 | pAligned = (char *)MY_ALIGN_PTR_DOWN((char *)adr + | 559 | pAligned = (char *)MY_ALIGN_PTR_DOWN((char *)adr + |
| 502 | alignSize - p->offset + extra + ADJUST_ALLOC_SIZE, alignSize) + p->offset; | 560 | alignSize - p->offset + extra + ADJUST_ALLOC_SIZE, alignSize) + p->offset; |
| 503 | 561 | ||
| 562 | #if 0 | ||
| 563 | printf("\nalignSize = %6x, offset=%6x, size=%8x \n", (unsigned)alignSize, (unsigned)p->offset, (unsigned)size); | ||
| 564 | PrintPtr("base", adr); | ||
| 565 | PrintPtr("alig", pAligned); | ||
| 566 | #endif | ||
| 567 | |||
| 504 | PrintLn(); | 568 | PrintLn(); |
| 505 | Print("- Aligned: "); | 569 | Print("- Aligned: "); |
| 506 | Print(" size="); PrintHex(size, 8); | 570 | Print(" size="); PrintHex(size, 8); |
| @@ -512,11 +576,16 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) | |||
| 512 | REAL_BLOCK_PTR_VAR(pAligned) = adr; | 576 | REAL_BLOCK_PTR_VAR(pAligned) = adr; |
| 513 | 577 | ||
| 514 | return pAligned; | 578 | return pAligned; |
| 579 | #endif | ||
| 515 | } | 580 | } |
| 516 | 581 | ||
| 517 | 582 | ||
| 518 | static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address) | 583 | static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address) |
| 519 | { | 584 | { |
| 585 | #if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) | ||
| 586 | UNUSED_VAR(pp) | ||
| 587 | z7_AlignedFree(address); | ||
| 588 | #else | ||
| 520 | if (address) | 589 | if (address) |
| 521 | { | 590 | { |
| 522 | const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); | 591 | const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); |
| @@ -525,6 +594,7 @@ static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address) | |||
| 525 | PrintLn(); | 594 | PrintLn(); |
| 526 | ISzAlloc_Free(p->baseAlloc, REAL_BLOCK_PTR_VAR(address)); | 595 | ISzAlloc_Free(p->baseAlloc, REAL_BLOCK_PTR_VAR(address)); |
| 527 | } | 596 | } |
| 597 | #endif | ||
| 528 | } | 598 | } |
| 529 | 599 | ||
| 530 | 600 | ||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Alloc.h -- Memory allocation functions | 1 | /* Alloc.h -- Memory allocation functions |
| 2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-22 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_ALLOC_H | 4 | #ifndef ZIP7_INC_ALLOC_H |
| 5 | #define ZIP7_INC_ALLOC_H | 5 | #define ZIP7_INC_ALLOC_H |
| @@ -22,6 +22,9 @@ void *MyAlloc(size_t size); | |||
| 22 | void MyFree(void *address); | 22 | void MyFree(void *address); |
| 23 | void *MyRealloc(void *address, size_t size); | 23 | void *MyRealloc(void *address, size_t size); |
| 24 | 24 | ||
| 25 | void *z7_AlignedAlloc(size_t size); | ||
| 26 | void z7_AlignedFree(void *p); | ||
| 27 | |||
| 25 | #ifdef _WIN32 | 28 | #ifdef _WIN32 |
| 26 | 29 | ||
| 27 | #ifdef Z7_LARGE_PAGES | 30 | #ifdef Z7_LARGE_PAGES |
| @@ -33,12 +36,14 @@ void MidFree(void *address); | |||
| 33 | void *BigAlloc(size_t size); | 36 | void *BigAlloc(size_t size); |
| 34 | void BigFree(void *address); | 37 | void BigFree(void *address); |
| 35 | 38 | ||
| 39 | /* #define Z7_BIG_ALLOC_IS_ZERO_FILLED */ | ||
| 40 | |||
| 36 | #else | 41 | #else |
| 37 | 42 | ||
| 38 | #define MidAlloc(size) MyAlloc(size) | 43 | #define MidAlloc(size) z7_AlignedAlloc(size) |
| 39 | #define MidFree(address) MyFree(address) | 44 | #define MidFree(address) z7_AlignedFree(address) |
| 40 | #define BigAlloc(size) MyAlloc(size) | 45 | #define BigAlloc(size) z7_AlignedAlloc(size) |
| 41 | #define BigFree(address) MyFree(address) | 46 | #define BigFree(address) z7_AlignedFree(address) |
| 42 | 47 | ||
| 43 | #endif | 48 | #endif |
| 44 | 49 | ||
diff --git a/C/Asm_c.mak b/C/Asm_c.mak new file mode 100644 index 0000000..9431816 --- /dev/null +++ b/C/Asm_c.mak | |||
| @@ -0,0 +1,12 @@ | |||
| 1 | !IFDEF ASM_OBJS | ||
| 2 | !IF "$(PLATFORM)" == "arm64" | ||
| 3 | $(ASM_OBJS): ../../../Asm/arm64/$(*B).S | ||
| 4 | $(COMPL_ASM_CLANG) | ||
| 5 | !ELSEIF "$(PLATFORM)" == "arm" | ||
| 6 | $(ASM_OBJS): ../../../Asm/arm/$(*B).asm | ||
| 7 | $(COMPL_ASM) | ||
| 8 | !ELSEIF "$(PLATFORM)" != "ia64" && "$(PLATFORM)" != "mips" | ||
| 9 | $(ASM_OBJS): ../../../Asm/x86/$(*B).asm | ||
| 10 | $(COMPL_ASM) | ||
| 11 | !ENDIF | ||
| 12 | !ENDIF | ||
| @@ -1,47 +1,104 @@ | |||
| 1 | /* Blake2.h -- BLAKE2 Hash | 1 | /* Blake2.h -- BLAKE2sp Hash |
| 2 | 2023-03-04 : Igor Pavlov : Public domain | 2 | 2024-01-17 : Igor Pavlov : Public domain */ |
| 3 | 2015 : Samuel Neves : Public domain */ | ||
| 4 | 3 | ||
| 5 | #ifndef ZIP7_INC_BLAKE2_H | 4 | #ifndef ZIP7_INC_BLAKE2_H |
| 6 | #define ZIP7_INC_BLAKE2_H | 5 | #define ZIP7_INC_BLAKE2_H |
| 7 | 6 | ||
| 8 | #include "7zTypes.h" | 7 | #include "7zTypes.h" |
| 9 | 8 | ||
| 10 | EXTERN_C_BEGIN | 9 | #if 0 |
| 10 | #include "Compiler.h" | ||
| 11 | #include "CpuArch.h" | ||
| 12 | #if defined(MY_CPU_X86_OR_AMD64) | ||
| 13 | #if defined(__SSE2__) \ | ||
| 14 | || defined(_MSC_VER) && _MSC_VER > 1200 \ | ||
| 15 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \ | ||
| 16 | || defined(__clang__) \ | ||
| 17 | || defined(__INTEL_COMPILER) | ||
| 18 | #include <emmintrin.h> // SSE2 | ||
| 19 | #endif | ||
| 11 | 20 | ||
| 12 | #define BLAKE2S_BLOCK_SIZE 64 | 21 | #if defined(__AVX2__) \ |
| 13 | #define BLAKE2S_DIGEST_SIZE 32 | 22 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ |
| 14 | #define BLAKE2SP_PARALLEL_DEGREE 8 | 23 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \ |
| 24 | || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \ | ||
| 25 | || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \ | ||
| 26 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400) | ||
| 27 | #include <immintrin.h> | ||
| 28 | #if defined(__clang__) | ||
| 29 | #include <avxintrin.h> | ||
| 30 | #include <avx2intrin.h> | ||
| 31 | #endif | ||
| 32 | #endif // avx2 | ||
| 33 | #endif // MY_CPU_X86_OR_AMD64 | ||
| 34 | #endif // 0 | ||
| 15 | 35 | ||
| 16 | typedef struct | 36 | EXTERN_C_BEGIN |
| 17 | { | ||
| 18 | UInt32 h[8]; | ||
| 19 | UInt32 t[2]; | ||
| 20 | UInt32 f[2]; | ||
| 21 | Byte buf[BLAKE2S_BLOCK_SIZE]; | ||
| 22 | UInt32 bufPos; | ||
| 23 | UInt32 lastNode_f1; | ||
| 24 | UInt32 dummy[2]; /* for sizeof(CBlake2s) alignment */ | ||
| 25 | } CBlake2s; | ||
| 26 | |||
| 27 | /* You need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0() */ | ||
| 28 | /* | ||
| 29 | void Blake2s_Init0(CBlake2s *p); | ||
| 30 | void Blake2s_Update(CBlake2s *p, const Byte *data, size_t size); | ||
| 31 | void Blake2s_Final(CBlake2s *p, Byte *digest); | ||
| 32 | */ | ||
| 33 | 37 | ||
| 38 | #define Z7_BLAKE2S_BLOCK_SIZE 64 | ||
| 39 | #define Z7_BLAKE2S_DIGEST_SIZE 32 | ||
| 40 | #define Z7_BLAKE2SP_PARALLEL_DEGREE 8 | ||
| 41 | #define Z7_BLAKE2SP_NUM_STRUCT_WORDS 16 | ||
| 34 | 42 | ||
| 43 | #if 1 || defined(Z7_BLAKE2SP_USE_FUNCTIONS) | ||
| 44 | typedef void (Z7_FASTCALL *Z7_BLAKE2SP_FUNC_COMPRESS)(UInt32 *states, const Byte *data, const Byte *end); | ||
| 45 | typedef void (Z7_FASTCALL *Z7_BLAKE2SP_FUNC_INIT)(UInt32 *states); | ||
| 46 | #endif | ||
| 47 | |||
| 48 | // it's required that CBlake2sp is aligned for 32-bytes, | ||
| 49 | // because the code can use unaligned access with sse and avx256. | ||
| 50 | // but 64-bytes alignment can be better. | ||
| 51 | MY_ALIGN(64) | ||
| 35 | typedef struct | 52 | typedef struct |
| 36 | { | 53 | { |
| 37 | CBlake2s S[BLAKE2SP_PARALLEL_DEGREE]; | 54 | union |
| 38 | unsigned bufPos; | 55 | { |
| 39 | } CBlake2sp; | 56 | #if 0 |
| 57 | #if defined(MY_CPU_X86_OR_AMD64) | ||
| 58 | #if defined(__SSE2__) \ | ||
| 59 | || defined(_MSC_VER) && _MSC_VER > 1200 \ | ||
| 60 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \ | ||
| 61 | || defined(__clang__) \ | ||
| 62 | || defined(__INTEL_COMPILER) | ||
| 63 | __m128i _pad_align_128bit[4]; | ||
| 64 | #endif // sse2 | ||
| 65 | #if defined(__AVX2__) \ | ||
| 66 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ | ||
| 67 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \ | ||
| 68 | || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \ | ||
| 69 | || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \ | ||
| 70 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400) | ||
| 71 | __m256i _pad_align_256bit[2]; | ||
| 72 | #endif // avx2 | ||
| 73 | #endif // x86 | ||
| 74 | #endif // 0 | ||
| 40 | 75 | ||
| 76 | void * _pad_align_ptr[8]; | ||
| 77 | UInt32 _pad_align_32bit[16]; | ||
| 78 | struct | ||
| 79 | { | ||
| 80 | unsigned cycPos; | ||
| 81 | unsigned _pad_unused; | ||
| 82 | #if 1 || defined(Z7_BLAKE2SP_USE_FUNCTIONS) | ||
| 83 | Z7_BLAKE2SP_FUNC_COMPRESS func_Compress_Fast; | ||
| 84 | Z7_BLAKE2SP_FUNC_COMPRESS func_Compress_Single; | ||
| 85 | Z7_BLAKE2SP_FUNC_INIT func_Init; | ||
| 86 | Z7_BLAKE2SP_FUNC_INIT func_Final; | ||
| 87 | #endif | ||
| 88 | } header; | ||
| 89 | } u; | ||
| 90 | // MY_ALIGN(64) | ||
| 91 | UInt32 states[Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2SP_NUM_STRUCT_WORDS]; | ||
| 92 | // MY_ALIGN(64) | ||
| 93 | UInt32 buf32[Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2SP_NUM_STRUCT_WORDS * 2]; | ||
| 94 | } CBlake2sp; | ||
| 41 | 95 | ||
| 96 | BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo); | ||
| 42 | void Blake2sp_Init(CBlake2sp *p); | 97 | void Blake2sp_Init(CBlake2sp *p); |
| 98 | void Blake2sp_InitState(CBlake2sp *p); | ||
| 43 | void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size); | 99 | void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size); |
| 44 | void Blake2sp_Final(CBlake2sp *p, Byte *digest); | 100 | void Blake2sp_Final(CBlake2sp *p, Byte *digest); |
| 101 | void z7_Black2sp_Prepare(void); | ||
| 45 | 102 | ||
| 46 | EXTERN_C_END | 103 | EXTERN_C_END |
| 47 | 104 | ||
diff --git a/C/Blake2s.c b/C/Blake2s.c index 2a84b57..459e76b 100644 --- a/C/Blake2s.c +++ b/C/Blake2s.c | |||
| @@ -1,250 +1,2645 @@ | |||
| 1 | /* Blake2s.c -- BLAKE2s and BLAKE2sp Hash | 1 | /* Blake2s.c -- BLAKE2sp Hash |
| 2 | 2023-03-04 : Igor Pavlov : Public domain | 2 | 2024-01-29 : Igor Pavlov : Public domain |
| 3 | 2015 : Samuel Neves : Public domain */ | 3 | 2015-2019 : Samuel Neves : original code : CC0 1.0 Universal (CC0 1.0). */ |
| 4 | 4 | ||
| 5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
| 6 | 6 | ||
| 7 | // #include <stdio.h> | ||
| 7 | #include <string.h> | 8 | #include <string.h> |
| 8 | 9 | ||
| 9 | #include "Blake2.h" | 10 | #include "Blake2.h" |
| 10 | #include "CpuArch.h" | ||
| 11 | #include "RotateDefs.h" | 11 | #include "RotateDefs.h" |
| 12 | #include "Compiler.h" | ||
| 13 | #include "CpuArch.h" | ||
| 14 | |||
| 15 | #if defined(__SSE2__) | ||
| 16 | #define Z7_BLAKE2S_USE_VECTORS | ||
| 17 | #elif defined(MY_CPU_X86_OR_AMD64) | ||
| 18 | #if defined(_MSC_VER) && _MSC_VER > 1200 \ | ||
| 19 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \ | ||
| 20 | || defined(__clang__) \ | ||
| 21 | || defined(__INTEL_COMPILER) | ||
| 22 | #define Z7_BLAKE2S_USE_VECTORS | ||
| 23 | #endif | ||
| 24 | #endif | ||
| 25 | |||
| 26 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
| 27 | |||
| 28 | #define Z7_BLAKE2SP_USE_FUNCTIONS | ||
| 29 | |||
| 30 | // define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED, if CBlake2sp can be non aligned for 32-bytes. | ||
| 31 | // #define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED | ||
| 32 | |||
| 33 | // SSSE3 : for _mm_shuffle_epi8 (pshufb) that improves the performance for 5-15%. | ||
| 34 | #if defined(__SSSE3__) | ||
| 35 | #define Z7_BLAKE2S_USE_SSSE3 | ||
| 36 | #elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \ | ||
| 37 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \ | ||
| 38 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \ | ||
| 39 | || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \ | ||
| 40 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000) | ||
| 41 | #define Z7_BLAKE2S_USE_SSSE3 | ||
| 42 | #endif | ||
| 43 | |||
| 44 | #ifdef Z7_BLAKE2S_USE_SSSE3 | ||
| 45 | /* SSE41 : for _mm_insert_epi32 (pinsrd) | ||
| 46 | it can slightly reduce code size and improves the performance in some cases. | ||
| 47 | it's used only for last 512-1024 bytes, if FAST versions (2 or 3) of vector algos are used. | ||
| 48 | it can be used for all blocks in another algos (4+). | ||
| 49 | */ | ||
| 50 | #if defined(__SSE4_1__) | ||
| 51 | #define Z7_BLAKE2S_USE_SSE41 | ||
| 52 | #elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \ | ||
| 53 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \ | ||
| 54 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \ | ||
| 55 | || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \ | ||
| 56 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000) | ||
| 57 | #define Z7_BLAKE2S_USE_SSE41 | ||
| 58 | #endif | ||
| 59 | #endif // SSSE3 | ||
| 60 | |||
| 61 | #if defined(__GNUC__) || defined(__clang__) | ||
| 62 | #if defined(Z7_BLAKE2S_USE_SSE41) | ||
| 63 | #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse4.1"))) | ||
| 64 | #elif defined(Z7_BLAKE2S_USE_SSSE3) | ||
| 65 | #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("ssse3"))) | ||
| 66 | #else | ||
| 67 | #define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse2"))) | ||
| 68 | #endif | ||
| 69 | #endif | ||
| 70 | |||
| 71 | |||
| 72 | #if defined(__AVX2__) | ||
| 73 | #define Z7_BLAKE2S_USE_AVX2 | ||
| 74 | #else | ||
| 75 | #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ | ||
| 76 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \ | ||
| 77 | || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) | ||
| 78 | #define Z7_BLAKE2S_USE_AVX2 | ||
| 79 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
| 80 | #define BLAKE2S_ATTRIB_AVX2 __attribute__((__target__("avx2"))) | ||
| 81 | #endif | ||
| 82 | #elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \ | ||
| 83 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400) | ||
| 84 | #if (Z7_MSC_VER_ORIGINAL == 1900) | ||
| 85 | #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX | ||
| 86 | #endif | ||
| 87 | #define Z7_BLAKE2S_USE_AVX2 | ||
| 88 | #endif | ||
| 89 | #endif | ||
| 90 | |||
| 91 | #ifdef Z7_BLAKE2S_USE_SSE41 | ||
| 92 | #include <smmintrin.h> // SSE4.1 | ||
| 93 | #elif defined(Z7_BLAKE2S_USE_SSSE3) | ||
| 94 | #include <tmmintrin.h> // SSSE3 | ||
| 95 | #else | ||
| 96 | #include <emmintrin.h> // SSE2 | ||
| 97 | #endif | ||
| 98 | |||
| 99 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
| 100 | #include <immintrin.h> | ||
| 101 | #if defined(__clang__) | ||
| 102 | #include <avxintrin.h> | ||
| 103 | #include <avx2intrin.h> | ||
| 104 | #endif | ||
| 105 | #endif // avx2 | ||
| 106 | |||
| 107 | |||
| 108 | #if defined(__AVX512F__) && defined(__AVX512VL__) | ||
| 109 | // && defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL > 1930) | ||
| 110 | #define Z7_BLAKE2S_USE_AVX512_ALWAYS | ||
| 111 | // #pragma message ("=== Blake2s AVX512") | ||
| 112 | #endif | ||
| 12 | 113 | ||
| 13 | #define rotr32 rotrFixed | ||
| 14 | 114 | ||
| 15 | #define BLAKE2S_NUM_ROUNDS 10 | 115 | #define Z7_BLAKE2S_USE_V128_FAST |
| 16 | #define BLAKE2S_FINAL_FLAG (~(UInt32)0) | 116 | // for speed optimization for small messages: |
| 117 | // #define Z7_BLAKE2S_USE_V128_WAY2 | ||
| 17 | 118 | ||
| 119 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
| 120 | |||
| 121 | // for debug: | ||
| 122 | // gather is slow | ||
| 123 | // #define Z7_BLAKE2S_USE_GATHER | ||
| 124 | |||
| 125 | #define Z7_BLAKE2S_USE_AVX2_FAST | ||
| 126 | // for speed optimization for small messages: | ||
| 127 | // #define Z7_BLAKE2S_USE_AVX2_WAY2 | ||
| 128 | // #define Z7_BLAKE2S_USE_AVX2_WAY4 | ||
| 129 | #if defined(Z7_BLAKE2S_USE_AVX2_WAY2) || \ | ||
| 130 | defined(Z7_BLAKE2S_USE_AVX2_WAY4) | ||
| 131 | #define Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
| 132 | #endif | ||
| 133 | #endif | ||
| 134 | |||
| 135 | #define Z7_BLAKE2SP_ALGO_DEFAULT 0 | ||
| 136 | #define Z7_BLAKE2SP_ALGO_SCALAR 1 | ||
| 137 | #ifdef Z7_BLAKE2S_USE_V128_FAST | ||
| 138 | #define Z7_BLAKE2SP_ALGO_V128_FAST 2 | ||
| 139 | #endif | ||
| 140 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
| 141 | #define Z7_BLAKE2SP_ALGO_V256_FAST 3 | ||
| 142 | #endif | ||
| 143 | #define Z7_BLAKE2SP_ALGO_V128_WAY1 4 | ||
| 144 | #ifdef Z7_BLAKE2S_USE_V128_WAY2 | ||
| 145 | #define Z7_BLAKE2SP_ALGO_V128_WAY2 5 | ||
| 146 | #endif | ||
| 147 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY2 | ||
| 148 | #define Z7_BLAKE2SP_ALGO_V256_WAY2 6 | ||
| 149 | #endif | ||
| 150 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY4 | ||
| 151 | #define Z7_BLAKE2SP_ALGO_V256_WAY4 7 | ||
| 152 | #endif | ||
| 153 | |||
| 154 | #endif // Z7_BLAKE2S_USE_VECTORS | ||
| 155 | |||
| 156 | |||
| 157 | |||
| 158 | |||
| 159 | #define BLAKE2S_FINAL_FLAG (~(UInt32)0) | ||
| 160 | #define NSW Z7_BLAKE2SP_NUM_STRUCT_WORDS | ||
| 161 | #define SUPER_BLOCK_SIZE (Z7_BLAKE2S_BLOCK_SIZE * Z7_BLAKE2SP_PARALLEL_DEGREE) | ||
| 162 | #define SUPER_BLOCK_MASK (SUPER_BLOCK_SIZE - 1) | ||
| 163 | |||
| 164 | #define V_INDEX_0_0 0 | ||
| 165 | #define V_INDEX_1_0 1 | ||
| 166 | #define V_INDEX_2_0 2 | ||
| 167 | #define V_INDEX_3_0 3 | ||
| 168 | #define V_INDEX_0_1 4 | ||
| 169 | #define V_INDEX_1_1 5 | ||
| 170 | #define V_INDEX_2_1 6 | ||
| 171 | #define V_INDEX_3_1 7 | ||
| 172 | #define V_INDEX_0_2 8 | ||
| 173 | #define V_INDEX_1_2 9 | ||
| 174 | #define V_INDEX_2_2 10 | ||
| 175 | #define V_INDEX_3_2 11 | ||
| 176 | #define V_INDEX_0_3 12 | ||
| 177 | #define V_INDEX_1_3 13 | ||
| 178 | #define V_INDEX_2_3 14 | ||
| 179 | #define V_INDEX_3_3 15 | ||
| 180 | #define V_INDEX_4_0 0 | ||
| 181 | #define V_INDEX_5_0 1 | ||
| 182 | #define V_INDEX_6_0 2 | ||
| 183 | #define V_INDEX_7_0 3 | ||
| 184 | #define V_INDEX_7_1 4 | ||
| 185 | #define V_INDEX_4_1 5 | ||
| 186 | #define V_INDEX_5_1 6 | ||
| 187 | #define V_INDEX_6_1 7 | ||
| 188 | #define V_INDEX_6_2 8 | ||
| 189 | #define V_INDEX_7_2 9 | ||
| 190 | #define V_INDEX_4_2 10 | ||
| 191 | #define V_INDEX_5_2 11 | ||
| 192 | #define V_INDEX_5_3 12 | ||
| 193 | #define V_INDEX_6_3 13 | ||
| 194 | #define V_INDEX_7_3 14 | ||
| 195 | #define V_INDEX_4_3 15 | ||
| 196 | |||
| 197 | #define V(row, col) v[V_INDEX_ ## row ## _ ## col] | ||
| 198 | |||
| 199 | #define k_Blake2s_IV_0 0x6A09E667UL | ||
| 200 | #define k_Blake2s_IV_1 0xBB67AE85UL | ||
| 201 | #define k_Blake2s_IV_2 0x3C6EF372UL | ||
| 202 | #define k_Blake2s_IV_3 0xA54FF53AUL | ||
| 203 | #define k_Blake2s_IV_4 0x510E527FUL | ||
| 204 | #define k_Blake2s_IV_5 0x9B05688CUL | ||
| 205 | #define k_Blake2s_IV_6 0x1F83D9ABUL | ||
| 206 | #define k_Blake2s_IV_7 0x5BE0CD19UL | ||
| 207 | |||
| 208 | #define KIV(n) (k_Blake2s_IV_## n) | ||
| 209 | |||
| 210 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
| 211 | MY_ALIGN(16) | ||
| 18 | static const UInt32 k_Blake2s_IV[8] = | 212 | static const UInt32 k_Blake2s_IV[8] = |
| 19 | { | 213 | { |
| 20 | 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, | 214 | KIV(0), KIV(1), KIV(2), KIV(3), KIV(4), KIV(5), KIV(6), KIV(7) |
| 21 | 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL | ||
| 22 | }; | 215 | }; |
| 216 | #endif | ||
| 23 | 217 | ||
| 24 | static const Byte k_Blake2s_Sigma[BLAKE2S_NUM_ROUNDS][16] = | 218 | #define STATE_T(s) ((s) + 8) |
| 25 | { | 219 | #define STATE_F(s) ((s) + 10) |
| 26 | { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } , | 220 | |
| 27 | { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } , | 221 | #ifdef Z7_BLAKE2S_USE_VECTORS |
| 28 | { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } , | ||
| 29 | { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } , | ||
| 30 | { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } , | ||
| 31 | { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } , | ||
| 32 | { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } , | ||
| 33 | { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } , | ||
| 34 | { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } , | ||
| 35 | { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } , | ||
| 36 | }; | ||
| 37 | 222 | ||
| 223 | #define LOAD_128(p) _mm_load_si128 ((const __m128i *)(const void *)(p)) | ||
| 224 | #define LOADU_128(p) _mm_loadu_si128((const __m128i *)(const void *)(p)) | ||
| 225 | #ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED | ||
| 226 | // here we use unaligned load and stores | ||
| 227 | // use this branch if CBlake2sp can be unaligned for 16 bytes | ||
| 228 | #define STOREU_128(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r) | ||
| 229 | #define LOAD_128_FROM_STRUCT(p) LOADU_128(p) | ||
| 230 | #define STORE_128_TO_STRUCT(p, r) STOREU_128(p, r) | ||
| 231 | #else | ||
| 232 | // here we use aligned load and stores | ||
| 233 | // use this branch if CBlake2sp is aligned for 16 bytes | ||
| 234 | #define STORE_128(p, r) _mm_store_si128((__m128i *)(void *)(p), r) | ||
| 235 | #define LOAD_128_FROM_STRUCT(p) LOAD_128(p) | ||
| 236 | #define STORE_128_TO_STRUCT(p, r) STORE_128(p, r) | ||
| 237 | #endif | ||
| 38 | 238 | ||
| 39 | static void Blake2s_Init0(CBlake2s *p) | 239 | #endif // Z7_BLAKE2S_USE_VECTORS |
| 240 | |||
| 241 | |||
| 242 | #if 0 | ||
| 243 | static void PrintState(const UInt32 *s, unsigned num) | ||
| 244 | { | ||
| 245 | unsigned i; | ||
| 246 | printf("\n"); | ||
| 247 | for (i = 0; i < num; i++) | ||
| 248 | printf(" %08x", (unsigned)s[i]); | ||
| 249 | } | ||
| 250 | static void PrintStates2(const UInt32 *s, unsigned x, unsigned y) | ||
| 40 | { | 251 | { |
| 41 | unsigned i; | 252 | unsigned i; |
| 42 | for (i = 0; i < 8; i++) | 253 | for (i = 0; i < y; i++) |
| 43 | p->h[i] = k_Blake2s_IV[i]; | 254 | PrintState(s + i * x, x); |
| 44 | p->t[0] = 0; | 255 | printf("\n"); |
| 45 | p->t[1] = 0; | ||
| 46 | p->f[0] = 0; | ||
| 47 | p->f[1] = 0; | ||
| 48 | p->bufPos = 0; | ||
| 49 | p->lastNode_f1 = 0; | ||
| 50 | } | 256 | } |
| 257 | #endif | ||
| 258 | |||
| 259 | |||
| 260 | #define REP8_MACRO(m) { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) } | ||
| 261 | |||
| 262 | #define BLAKE2S_NUM_ROUNDS 10 | ||
| 263 | |||
| 264 | #if defined(Z7_BLAKE2S_USE_VECTORS) | ||
| 265 | #define ROUNDS_LOOP(mac) \ | ||
| 266 | { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) mac(r) } | ||
| 267 | #endif | ||
| 268 | /* | ||
| 269 | #define ROUNDS_LOOP_2(mac) \ | ||
| 270 | { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r += 2) { mac(r) mac(r + 1) } } | ||
| 271 | */ | ||
| 272 | #if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS) | ||
| 273 | #define ROUNDS_LOOP_UNROLLED(m) \ | ||
| 274 | { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) } | ||
| 275 | #endif | ||
| 276 | |||
| 277 | #define SIGMA_TABLE(M) \ | ||
| 278 | M( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ), \ | ||
| 279 | M( 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 ), \ | ||
| 280 | M( 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 ), \ | ||
| 281 | M( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 ), \ | ||
| 282 | M( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 ), \ | ||
| 283 | M( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 ), \ | ||
| 284 | M( 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 ), \ | ||
| 285 | M( 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 ), \ | ||
| 286 | M( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 ), \ | ||
| 287 | M( 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 ) | ||
| 288 | |||
| 289 | #define SIGMA_TABLE_MULT(m, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \ | ||
| 290 | { a0*m,a1*m,a2*m,a3*m,a4*m,a5*m,a6*m,a7*m,a8*m,a9*m,a10*m,a11*m,a12*m,a13*m,a14*m,a15*m } | ||
| 291 | #define SIGMA_TABLE_MULT_4( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \ | ||
| 292 | SIGMA_TABLE_MULT(4, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) | ||
| 293 | |||
| 294 | // MY_ALIGN(32) | ||
| 295 | MY_ALIGN(16) | ||
| 296 | static const Byte k_Blake2s_Sigma_4[BLAKE2S_NUM_ROUNDS][16] = | ||
| 297 | { SIGMA_TABLE(SIGMA_TABLE_MULT_4) }; | ||
| 298 | |||
| 299 | #define GET_SIGMA_PTR(p, index) \ | ||
| 300 | ((const void *)((const Byte *)(const void *)(p) + (index))) | ||
| 51 | 301 | ||
| 302 | #define GET_STATE_TABLE_PTR_FROM_BYTE_POS(s, pos) \ | ||
| 303 | ((UInt32 *)(void *)((Byte *)(void *)(s) + (pos))) | ||
| 52 | 304 | ||
| 53 | static void Blake2s_Compress(CBlake2s *p) | 305 | |
| 306 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
| 307 | |||
| 308 | |||
| 309 | #if 0 | ||
| 310 | // use loading constants from memory | ||
| 311 | // is faster for some compilers. | ||
| 312 | #define KK4(n) KIV(n), KIV(n), KIV(n), KIV(n) | ||
| 313 | MY_ALIGN(64) | ||
| 314 | static const UInt32 k_Blake2s_IV_WAY4[]= | ||
| 54 | { | 315 | { |
| 55 | UInt32 m[16]; | 316 | KK4(0), KK4(1), KK4(2), KK4(3), KK4(4), KK4(5), KK4(6), KK4(7) |
| 56 | UInt32 v[16]; | 317 | }; |
| 57 | 318 | #define GET_128_IV_WAY4(i) LOAD_128(k_Blake2s_IV_WAY4 + 4 * (i)) | |
| 319 | #else | ||
| 320 | // use constant generation: | ||
| 321 | #define GET_128_IV_WAY4(i) _mm_set1_epi32((Int32)KIV(i)) | ||
| 322 | #endif | ||
| 323 | |||
| 324 | |||
| 325 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
| 326 | #define GET_CONST_128_FROM_ARRAY32(k) \ | ||
| 327 | _mm_set_epi32((Int32)(k)[3], (Int32)(k)[2], (Int32)(k)[1], (Int32)(k)[0]) | ||
| 328 | #endif | ||
| 329 | |||
| 330 | |||
| 331 | #if 0 | ||
| 332 | #define k_r8 _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1) | ||
| 333 | #define k_r16 _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2) | ||
| 334 | #define k_inc _mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE) | ||
| 335 | #define k_iv0_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0) | ||
| 336 | #define k_iv4_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4) | ||
| 337 | #else | ||
| 338 | #if defined(Z7_BLAKE2S_USE_SSSE3) && \ | ||
| 339 | !defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) | ||
| 340 | MY_ALIGN(16) static const Byte k_r8_arr [16] = { 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12 }; | ||
| 341 | MY_ALIGN(16) static const Byte k_r16_arr[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 }; | ||
| 342 | #define k_r8 LOAD_128(k_r8_arr) | ||
| 343 | #define k_r16 LOAD_128(k_r16_arr) | ||
| 344 | #endif | ||
| 345 | MY_ALIGN(16) static const UInt32 k_inc_arr[4] = { Z7_BLAKE2S_BLOCK_SIZE, 0, 0, 0 }; | ||
| 346 | #define k_inc LOAD_128(k_inc_arr) | ||
| 347 | #define k_iv0_128 LOAD_128(k_Blake2s_IV + 0) | ||
| 348 | #define k_iv4_128 LOAD_128(k_Blake2s_IV + 4) | ||
| 349 | #endif | ||
| 350 | |||
| 351 | |||
| 352 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
| 353 | |||
| 354 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
| 355 | #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000) | ||
| 356 | #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1) | ||
| 357 | #else | ||
| 358 | #define MY_mm256_set_m128i _mm256_set_m128i | ||
| 359 | #endif | ||
| 360 | |||
| 361 | #define SET_FROM_128(a) MY_mm256_set_m128i(a, a) | ||
| 362 | |||
| 363 | #ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS | ||
| 364 | MY_ALIGN(32) static const Byte k_r8_arr_256 [32] = | ||
| 365 | { | ||
| 366 | 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12, | ||
| 367 | 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12 | ||
| 368 | }; | ||
| 369 | MY_ALIGN(32) static const Byte k_r16_arr_256[32] = | ||
| 370 | { | ||
| 371 | 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13, | ||
| 372 | 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 | ||
| 373 | }; | ||
| 374 | #define k_r8_256 LOAD_256(k_r8_arr_256) | ||
| 375 | #define k_r16_256 LOAD_256(k_r16_arr_256) | ||
| 376 | #endif | ||
| 377 | |||
| 378 | // #define k_r8_256 SET_FROM_128(_mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)) | ||
| 379 | // #define k_r16_256 SET_FROM_128(_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)) | ||
| 380 | // #define k_inc_256 SET_FROM_128(_mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)) | ||
| 381 | // #define k_iv0_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0)) | ||
| 382 | #define k_iv4_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4)) | ||
| 383 | #endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
| 384 | #endif | ||
| 385 | |||
| 386 | |||
| 387 | /* | ||
| 388 | IPC(TP) ports: | ||
| 389 | 1 p__5 : skl- : SSE : shufps : _mm_shuffle_ps | ||
| 390 | 2 p_15 : icl+ | ||
| 391 | 1 p__5 : nhm-bdw : SSE : xorps : _mm_xor_ps | ||
| 392 | 3 p015 : skl+ | ||
| 393 | |||
| 394 | 3 p015 : SSE2 : pxor : _mm_xor_si128 | ||
| 395 | 2 p_15: snb-bdw : SSE2 : padd : _mm_add_epi32 | ||
| 396 | 2 p0_5: mrm-wsm : | ||
| 397 | 3 p015 : skl+ | ||
| 398 | |||
| 399 | 2 p_15 : ivb-,icl+ : SSE2 : punpcklqdq, punpckhqdq, punpckldq, punpckhdq | ||
| 400 | 2 p_15 : : SSE2 : pshufd : _mm_shuffle_epi32 | ||
| 401 | 2 p_15 : : SSE2 : pshuflw : _mm_shufflelo_epi16 | ||
| 402 | 2 p_15 : : SSE2 : psrldq : | ||
| 403 | 2 p_15 : : SSE3 : pshufb : _mm_shuffle_epi8 | ||
| 404 | 2 p_15 : : SSE4 : pblendw : _mm_blend_epi16 | ||
| 405 | 1 p__5 : hsw-skl : * | ||
| 406 | |||
| 407 | 1 p0 : SSE2 : pslld (i8) : _mm_slli_si128 | ||
| 408 | 2 p01 : skl+ : | ||
| 409 | |||
| 410 | 2 p_15 : ivb- : SSE3 : palignr | ||
| 411 | 1 p__5 : hsw+ | ||
| 412 | |||
| 413 | 2 p_15 + p23 : ivb-, icl+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, m32, i8) | ||
| 414 | 1 p__5 + p23 : hsw-skl | ||
| 415 | 1 p_15 + p5 : ivb-, ice+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, r32, i8) | ||
| 416 | 0.5 2*p5 : hsw-skl | ||
| 417 | |||
| 418 | 2 p23 : SSE2 : movd (m32) | ||
| 419 | 3 p23A : adl : | ||
| 420 | 1 p5: : SSE2 : movd (r32) | ||
| 421 | */ | ||
| 422 | |||
| 423 | #if 0 && defined(__XOP__) | ||
| 424 | // we must debug and test __XOP__ instruction | ||
| 425 | #include <x86intrin.h> | ||
| 426 | #include <ammintrin.h> | ||
| 427 | #define LOAD_ROTATE_CONSTS | ||
| 428 | #define MM_ROR_EPI32(r, c) _mm_roti_epi32(r, -(c)) | ||
| 429 | #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED | ||
| 430 | #elif 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) | ||
| 431 | #define LOAD_ROTATE_CONSTS | ||
| 432 | #define MM_ROR_EPI32(r, c) _mm_ror_epi32(r, c) | ||
| 433 | #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED | ||
| 434 | #else | ||
| 435 | |||
| 436 | // MSVC_1937+ uses "orps" instruction for _mm_or_si128(). | ||
| 437 | // But "orps" has low throughput: TP=1 for bdw-nhm. | ||
| 438 | // So it can be better to use _mm_add_epi32()/"paddd" (TP=2 for bdw-nhm) instead of "xorps". | ||
| 439 | // But "orps" is fast for modern cpus (skl+). | ||
| 440 | // So we are default with "or" version: | ||
| 441 | #if 0 || 0 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1937 | ||
| 442 | // minor optimization for some old cpus, if "xorps" is slow. | ||
| 443 | #define MM128_EPI32_OR_or_ADD _mm_add_epi32 | ||
| 444 | #else | ||
| 445 | #define MM128_EPI32_OR_or_ADD _mm_or_si128 | ||
| 446 | #endif | ||
| 447 | |||
| 448 | #define MM_ROR_EPI32_VIA_SHIFT(r, c)( \ | ||
| 449 | MM128_EPI32_OR_or_ADD( \ | ||
| 450 | _mm_srli_epi32((r), (c)), \ | ||
| 451 | _mm_slli_epi32((r), 32-(c)))) | ||
| 452 | #if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41) | ||
| 453 | #define LOAD_ROTATE_CONSTS \ | ||
| 454 | const __m128i r8 = k_r8; \ | ||
| 455 | const __m128i r16 = k_r16; | ||
| 456 | #define MM_ROR_EPI32(r, c) ( \ | ||
| 457 | ( 8==(c)) ? _mm_shuffle_epi8(r,r8) \ | ||
| 458 | : (16==(c)) ? _mm_shuffle_epi8(r,r16) \ | ||
| 459 | : MM_ROR_EPI32_VIA_SHIFT(r, c)) | ||
| 460 | #else | ||
| 461 | #define LOAD_ROTATE_CONSTS | ||
| 462 | #define MM_ROR_EPI32(r, c) ( \ | ||
| 463 | (16==(c)) ? _mm_shufflehi_epi16(_mm_shufflelo_epi16(r, 0xb1), 0xb1) \ | ||
| 464 | : MM_ROR_EPI32_VIA_SHIFT(r, c)) | ||
| 465 | #endif | ||
| 466 | #endif | ||
| 467 | |||
| 468 | /* | ||
| 469 | we have 3 main ways to load 4 32-bit integers to __m128i: | ||
| 470 | 1) SSE2: _mm_set_epi32() | ||
| 471 | 2) SSE2: _mm_unpacklo_epi64() / _mm_unpacklo_epi32 / _mm_cvtsi32_si128() | ||
| 472 | 3) SSE41: _mm_insert_epi32() and _mm_cvtsi32_si128() | ||
| 473 | good compiler for _mm_set_epi32() generates these instructions: | ||
| 474 | { | ||
| 475 | movd xmm, [m32]; vpunpckldq; vpunpckldq; vpunpcklqdq; | ||
| 476 | } | ||
| 477 | good new compiler generates one instruction | ||
| 478 | { | ||
| 479 | for _mm_insert_epi32() : { pinsrd xmm, [m32], i } | ||
| 480 | for _mm_cvtsi32_si128() : { movd xmm, [m32] } | ||
| 481 | } | ||
| 482 | but vc2010 generates slow pair of instructions: | ||
| 483 | { | ||
| 484 | for _mm_insert_epi32() : { mov r32, [m32]; pinsrd xmm, r32, i } | ||
| 485 | for _mm_cvtsi32_si128() : { mov r32, [m32]; movd xmm, r32 } | ||
| 486 | } | ||
| 487 | _mm_insert_epi32() (pinsrd) code reduces xmm register pressure | ||
| 488 | in comparison with _mm_set_epi32() (movd + vpunpckld) code. | ||
| 489 | Note that variant with "movd xmm, r32" can be more slow, | ||
| 490 | but register pressure can be more important. | ||
| 491 | So we can force to "pinsrd" always. | ||
| 492 | */ | ||
| 493 | // #if !defined(Z7_MSC_VER_ORIGINAL) || Z7_MSC_VER_ORIGINAL > 1600 || defined(MY_CPU_X86) | ||
| 494 | #ifdef Z7_BLAKE2S_USE_SSE41 | ||
| 495 | /* _mm_set_epi32() can be more effective for GCC and CLANG | ||
| 496 | _mm_insert_epi32() is more effective for MSVC */ | ||
| 497 | #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) | ||
| 498 | #define Z7_BLAKE2S_USE_INSERT_INSTRUCTION | ||
| 499 | #endif | ||
| 500 | #endif // USE_SSE41 | ||
| 501 | // #endif | ||
| 502 | |||
| 503 | #ifdef Z7_BLAKE2S_USE_INSERT_INSTRUCTION | ||
| 504 | // for SSE4.1 | ||
| 505 | #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \ | ||
| 506 | _mm_insert_epi32( \ | ||
| 507 | _mm_insert_epi32( \ | ||
| 508 | _mm_insert_epi32( \ | ||
| 509 | _mm_cvtsi32_si128( \ | ||
| 510 | *(const Int32 *)p0), \ | ||
| 511 | *(const Int32 *)p1, 1), \ | ||
| 512 | *(const Int32 *)p2, 2), \ | ||
| 513 | *(const Int32 *)p3, 3) | ||
| 514 | #elif 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) | ||
| 515 | /* MSVC 1400 implements _mm_set_epi32() via slow memory write/read. | ||
| 516 | Also _mm_unpacklo_epi32 is more effective for another MSVC compilers. | ||
| 517 | But _mm_set_epi32() is more effective for GCC and CLANG. | ||
| 518 | So we use _mm_unpacklo_epi32 for MSVC only */ | ||
| 519 | #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \ | ||
| 520 | _mm_unpacklo_epi64( \ | ||
| 521 | _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p0), \ | ||
| 522 | _mm_cvtsi32_si128(*(const Int32 *)p1)), \ | ||
| 523 | _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p2), \ | ||
| 524 | _mm_cvtsi32_si128(*(const Int32 *)p3))) | ||
| 525 | #else | ||
| 526 | #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \ | ||
| 527 | _mm_set_epi32( \ | ||
| 528 | *(const Int32 *)p3, \ | ||
| 529 | *(const Int32 *)p2, \ | ||
| 530 | *(const Int32 *)p1, \ | ||
| 531 | *(const Int32 *)p0) | ||
| 532 | #endif | ||
| 533 | |||
| 534 | #define SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3) \ | ||
| 535 | MM_LOAD_EPI32_FROM_4_POINTERS( \ | ||
| 536 | GET_SIGMA_PTR(input, i0), \ | ||
| 537 | GET_SIGMA_PTR(input, i1), \ | ||
| 538 | GET_SIGMA_PTR(input, i2), \ | ||
| 539 | GET_SIGMA_PTR(input, i3)) | ||
| 540 | |||
| 541 | #define SET_ROW_FROM_SIGMA(input, sigma_index) \ | ||
| 542 | SET_ROW_FROM_SIGMA_BASE(input, \ | ||
| 543 | sigma[(sigma_index) ], \ | ||
| 544 | sigma[(sigma_index) + 2 * 1], \ | ||
| 545 | sigma[(sigma_index) + 2 * 2], \ | ||
| 546 | sigma[(sigma_index) + 2 * 3]) \ | ||
| 547 | |||
| 548 | |||
| 549 | #define ADD_128(a, b) _mm_add_epi32(a, b) | ||
| 550 | #define XOR_128(a, b) _mm_xor_si128(a, b) | ||
| 551 | |||
| 552 | #define D_ADD_128(dest, src) dest = ADD_128(dest, src) | ||
| 553 | #define D_XOR_128(dest, src) dest = XOR_128(dest, src) | ||
| 554 | #define D_ROR_128(dest, shift) dest = MM_ROR_EPI32(dest, shift) | ||
| 555 | #define D_ADD_EPI64_128(dest, src) dest = _mm_add_epi64(dest, src) | ||
| 556 | |||
| 557 | |||
| 558 | #define AXR(a, b, d, shift) \ | ||
| 559 | D_ADD_128(a, b); \ | ||
| 560 | D_XOR_128(d, a); \ | ||
| 561 | D_ROR_128(d, shift); | ||
| 562 | |||
| 563 | #define AXR2(a, b, c, d, input, sigma_index, shift1, shift2) \ | ||
| 564 | a = _mm_add_epi32 (a, SET_ROW_FROM_SIGMA(input, sigma_index)); \ | ||
| 565 | AXR(a, b, d, shift1) \ | ||
| 566 | AXR(c, d, b, shift2) | ||
| 567 | |||
| 568 | #define ROTATE_WORDS_TO_RIGHT(a, n) \ | ||
| 569 | a = _mm_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3)); | ||
| 570 | |||
| 571 | #define AXR4(a, b, c, d, input, sigma_index) \ | ||
| 572 | AXR2(a, b, c, d, input, sigma_index, 16, 12) \ | ||
| 573 | AXR2(a, b, c, d, input, sigma_index + 1, 8, 7) \ | ||
| 574 | |||
| 575 | #define RR2(a, b, c, d, input) \ | ||
| 576 | { \ | ||
| 577 | AXR4(a, b, c, d, input, 0) \ | ||
| 578 | ROTATE_WORDS_TO_RIGHT(b, 1) \ | ||
| 579 | ROTATE_WORDS_TO_RIGHT(c, 2) \ | ||
| 580 | ROTATE_WORDS_TO_RIGHT(d, 3) \ | ||
| 581 | AXR4(a, b, c, d, input, 8) \ | ||
| 582 | ROTATE_WORDS_TO_RIGHT(b, 3) \ | ||
| 583 | ROTATE_WORDS_TO_RIGHT(c, 2) \ | ||
| 584 | ROTATE_WORDS_TO_RIGHT(d, 1) \ | ||
| 585 | } | ||
| 586 | |||
| 587 | |||
| 588 | /* | ||
| 589 | Way1: | ||
| 590 | per 64 bytes block: | ||
| 591 | 10 rounds * 4 iters * (7 + 2) = 360 cycles = if pslld TP=1 | ||
| 592 | * (7 + 1) = 320 cycles = if pslld TP=2 (skl+) | ||
| 593 | additional operations per 7_op_iter : | ||
| 594 | 4 movzx byte mem | ||
| 595 | 1 movd mem | ||
| 596 | 3 pinsrd mem | ||
| 597 | 1.5 pshufd | ||
| 598 | */ | ||
| 599 | |||
| 600 | static | ||
| 601 | #if 0 || 0 && (defined(Z7_BLAKE2S_USE_V128_WAY2) || \ | ||
| 602 | defined(Z7_BLAKE2S_USE_V256_WAY2)) | ||
| 603 | Z7_NO_INLINE | ||
| 604 | #else | ||
| 605 | Z7_FORCE_INLINE | ||
| 606 | #endif | ||
| 607 | #ifdef BLAKE2S_ATTRIB_128BIT | ||
| 608 | BLAKE2S_ATTRIB_128BIT | ||
| 609 | #endif | ||
| 610 | void | ||
| 611 | Z7_FASTCALL | ||
| 612 | Blake2s_Compress_V128_Way1(UInt32 * const s, const Byte * const input) | ||
| 613 | { | ||
| 614 | __m128i a, b, c, d; | ||
| 615 | __m128i f0, f1; | ||
| 616 | |||
| 617 | LOAD_ROTATE_CONSTS | ||
| 618 | d = LOAD_128_FROM_STRUCT(STATE_T(s)); | ||
| 619 | c = k_iv0_128; | ||
| 620 | a = f0 = LOAD_128_FROM_STRUCT(s); | ||
| 621 | b = f1 = LOAD_128_FROM_STRUCT(s + 4); | ||
| 622 | D_ADD_EPI64_128(d, k_inc); | ||
| 623 | STORE_128_TO_STRUCT (STATE_T(s), d); | ||
| 624 | D_XOR_128(d, k_iv4_128); | ||
| 625 | |||
| 626 | #define RR(r) { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \ | ||
| 627 | RR2(a, b, c, d, input) } | ||
| 628 | |||
| 629 | ROUNDS_LOOP(RR) | ||
| 630 | #undef RR | ||
| 631 | |||
| 632 | STORE_128_TO_STRUCT(s , XOR_128(f0, XOR_128(a, c))); | ||
| 633 | STORE_128_TO_STRUCT(s + 4, XOR_128(f1, XOR_128(b, d))); | ||
| 634 | } | ||
| 635 | |||
| 636 | |||
| 637 | static | ||
| 638 | Z7_NO_INLINE | ||
| 639 | #ifdef BLAKE2S_ATTRIB_128BIT | ||
| 640 | BLAKE2S_ATTRIB_128BIT | ||
| 641 | #endif | ||
| 642 | void | ||
| 643 | Z7_FASTCALL | ||
| 644 | Blake2sp_Compress2_V128_Way1(UInt32 *s_items, const Byte *data, const Byte *end) | ||
| 645 | { | ||
| 646 | size_t pos = 0; | ||
| 647 | do | ||
| 58 | { | 648 | { |
| 59 | unsigned i; | 649 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); |
| 650 | Blake2s_Compress_V128_Way1(s, data); | ||
| 651 | data += Z7_BLAKE2S_BLOCK_SIZE; | ||
| 652 | pos += Z7_BLAKE2S_BLOCK_SIZE; | ||
| 653 | pos &= SUPER_BLOCK_MASK; | ||
| 654 | } | ||
| 655 | while (data != end); | ||
| 656 | } | ||
| 657 | |||
| 658 | |||
| 659 | #if defined(Z7_BLAKE2S_USE_V128_WAY2) || \ | ||
| 660 | defined(Z7_BLAKE2S_USE_AVX2_WAY2) | ||
| 661 | #if 1 | ||
| 662 | #define Z7_BLAKE2S_CompressSingleBlock(s, data) \ | ||
| 663 | Blake2sp_Compress2_V128_Way1(s, data, \ | ||
| 664 | (const Byte *)(const void *)(data) + Z7_BLAKE2S_BLOCK_SIZE) | ||
| 665 | #else | ||
| 666 | #define Z7_BLAKE2S_CompressSingleBlock Blake2s_Compress_V128_Way1 | ||
| 667 | #endif | ||
| 668 | #endif | ||
| 669 | |||
| 670 | |||
| 671 | #if (defined(Z7_BLAKE2S_USE_AVX2_WAY_SLOW) || \ | ||
| 672 | defined(Z7_BLAKE2S_USE_V128_WAY2)) && \ | ||
| 673 | !defined(Z7_BLAKE2S_USE_GATHER) | ||
| 674 | #define AXR2_LOAD_INDEXES(sigma_index) \ | ||
| 675 | const unsigned i0 = sigma[(sigma_index)]; \ | ||
| 676 | const unsigned i1 = sigma[(sigma_index) + 2 * 1]; \ | ||
| 677 | const unsigned i2 = sigma[(sigma_index) + 2 * 2]; \ | ||
| 678 | const unsigned i3 = sigma[(sigma_index) + 2 * 3]; \ | ||
| 679 | |||
| 680 | #define SET_ROW_FROM_SIGMA_W(input) \ | ||
| 681 | SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3) | ||
| 682 | #endif | ||
| 683 | |||
| 684 | |||
| 685 | #ifdef Z7_BLAKE2S_USE_V128_WAY2 | ||
| 686 | |||
| 687 | #if 1 || !defined(Z7_BLAKE2S_USE_SSE41) | ||
| 688 | /* we use SET_ROW_FROM_SIGMA_BASE, that uses | ||
| 689 | (SSE4) _mm_insert_epi32(), if Z7_BLAKE2S_USE_INSERT_INSTRUCTION is defined | ||
| 690 | (SSE2) _mm_set_epi32() | ||
| 691 | MSVC can be faster for this branch: | ||
| 692 | */ | ||
| 693 | #define AXR2_W(sigma_index, shift1, shift2) \ | ||
| 694 | { \ | ||
| 695 | AXR2_LOAD_INDEXES(sigma_index) \ | ||
| 696 | a0 = _mm_add_epi32(a0, SET_ROW_FROM_SIGMA_W(data)); \ | ||
| 697 | a1 = _mm_add_epi32(a1, SET_ROW_FROM_SIGMA_W(data + Z7_BLAKE2S_BLOCK_SIZE)); \ | ||
| 698 | AXR(a0, b0, d0, shift1) \ | ||
| 699 | AXR(a1, b1, d1, shift1) \ | ||
| 700 | AXR(c0, d0, b0, shift2) \ | ||
| 701 | AXR(c1, d1, b1, shift2) \ | ||
| 702 | } | ||
| 703 | #else | ||
| 704 | /* we use interleaved _mm_insert_epi32(): | ||
| 705 | GCC can be faster for this branch: | ||
| 706 | */ | ||
| 707 | #define AXR2_W_PRE_INSERT(sigma_index, i) \ | ||
| 708 | { const unsigned ii = sigma[(sigma_index) + i * 2]; \ | ||
| 709 | t0 = _mm_insert_epi32(t0, *(const Int32 *)GET_SIGMA_PTR(data, ii), i); \ | ||
| 710 | t1 = _mm_insert_epi32(t1, *(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii), i); \ | ||
| 711 | } | ||
| 712 | #define AXR2_W(sigma_index, shift1, shift2) \ | ||
| 713 | { __m128i t0, t1; \ | ||
| 714 | { const unsigned ii = sigma[sigma_index]; \ | ||
| 715 | t0 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, ii)); \ | ||
| 716 | t1 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii)); \ | ||
| 717 | } \ | ||
| 718 | AXR2_W_PRE_INSERT(sigma_index, 1) \ | ||
| 719 | AXR2_W_PRE_INSERT(sigma_index, 2) \ | ||
| 720 | AXR2_W_PRE_INSERT(sigma_index, 3) \ | ||
| 721 | a0 = _mm_add_epi32(a0, t0); \ | ||
| 722 | a1 = _mm_add_epi32(a1, t1); \ | ||
| 723 | AXR(a0, b0, d0, shift1) \ | ||
| 724 | AXR(a1, b1, d1, shift1) \ | ||
| 725 | AXR(c0, d0, b0, shift2) \ | ||
| 726 | AXR(c1, d1, b1, shift2) \ | ||
| 727 | } | ||
| 728 | #endif | ||
| 729 | |||
| 730 | |||
| 731 | #define AXR4_W(sigma_index) \ | ||
| 732 | AXR2_W(sigma_index, 16, 12) \ | ||
| 733 | AXR2_W(sigma_index + 1, 8, 7) \ | ||
| 734 | |||
| 735 | #define WW(r) \ | ||
| 736 | { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \ | ||
| 737 | AXR4_W(0) \ | ||
| 738 | ROTATE_WORDS_TO_RIGHT(b0, 1) \ | ||
| 739 | ROTATE_WORDS_TO_RIGHT(b1, 1) \ | ||
| 740 | ROTATE_WORDS_TO_RIGHT(c0, 2) \ | ||
| 741 | ROTATE_WORDS_TO_RIGHT(c1, 2) \ | ||
| 742 | ROTATE_WORDS_TO_RIGHT(d0, 3) \ | ||
| 743 | ROTATE_WORDS_TO_RIGHT(d1, 3) \ | ||
| 744 | AXR4_W(8) \ | ||
| 745 | ROTATE_WORDS_TO_RIGHT(b0, 3) \ | ||
| 746 | ROTATE_WORDS_TO_RIGHT(b1, 3) \ | ||
| 747 | ROTATE_WORDS_TO_RIGHT(c0, 2) \ | ||
| 748 | ROTATE_WORDS_TO_RIGHT(c1, 2) \ | ||
| 749 | ROTATE_WORDS_TO_RIGHT(d0, 1) \ | ||
| 750 | ROTATE_WORDS_TO_RIGHT(d1, 1) \ | ||
| 751 | } | ||
| 752 | |||
| 753 | |||
| 754 | static | ||
| 755 | Z7_NO_INLINE | ||
| 756 | #ifdef BLAKE2S_ATTRIB_128BIT | ||
| 757 | BLAKE2S_ATTRIB_128BIT | ||
| 758 | #endif | ||
| 759 | void | ||
| 760 | Z7_FASTCALL | ||
| 761 | Blake2sp_Compress2_V128_Way2(UInt32 *s_items, const Byte *data, const Byte *end) | ||
| 762 | { | ||
| 763 | size_t pos = 0; | ||
| 764 | end -= Z7_BLAKE2S_BLOCK_SIZE; | ||
| 765 | |||
| 766 | if (data != end) | ||
| 767 | { | ||
| 768 | LOAD_ROTATE_CONSTS | ||
| 769 | do | ||
| 770 | { | ||
| 771 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
| 772 | __m128i a0, b0, c0, d0; | ||
| 773 | __m128i a1, b1, c1, d1; | ||
| 774 | { | ||
| 775 | const __m128i inc = k_inc; | ||
| 776 | const __m128i temp = k_iv4_128; | ||
| 777 | d0 = LOAD_128_FROM_STRUCT (STATE_T(s)); | ||
| 778 | d1 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW)); | ||
| 779 | D_ADD_EPI64_128(d0, inc); | ||
| 780 | D_ADD_EPI64_128(d1, inc); | ||
| 781 | STORE_128_TO_STRUCT (STATE_T(s ), d0); | ||
| 782 | STORE_128_TO_STRUCT (STATE_T(s + NSW), d1); | ||
| 783 | D_XOR_128(d0, temp); | ||
| 784 | D_XOR_128(d1, temp); | ||
| 785 | } | ||
| 786 | c1 = c0 = k_iv0_128; | ||
| 787 | a0 = LOAD_128_FROM_STRUCT(s); | ||
| 788 | b0 = LOAD_128_FROM_STRUCT(s + 4); | ||
| 789 | a1 = LOAD_128_FROM_STRUCT(s + NSW); | ||
| 790 | b1 = LOAD_128_FROM_STRUCT(s + NSW + 4); | ||
| 791 | |||
| 792 | ROUNDS_LOOP (WW) | ||
| 793 | |||
| 794 | #undef WW | ||
| 795 | |||
| 796 | D_XOR_128(a0, c0); | ||
| 797 | D_XOR_128(b0, d0); | ||
| 798 | D_XOR_128(a1, c1); | ||
| 799 | D_XOR_128(b1, d1); | ||
| 800 | |||
| 801 | D_XOR_128(a0, LOAD_128_FROM_STRUCT(s)); | ||
| 802 | D_XOR_128(b0, LOAD_128_FROM_STRUCT(s + 4)); | ||
| 803 | D_XOR_128(a1, LOAD_128_FROM_STRUCT(s + NSW)); | ||
| 804 | D_XOR_128(b1, LOAD_128_FROM_STRUCT(s + NSW + 4)); | ||
| 805 | |||
| 806 | STORE_128_TO_STRUCT(s, a0); | ||
| 807 | STORE_128_TO_STRUCT(s + 4, b0); | ||
| 808 | STORE_128_TO_STRUCT(s + NSW, a1); | ||
| 809 | STORE_128_TO_STRUCT(s + NSW + 4, b1); | ||
| 810 | |||
| 811 | data += Z7_BLAKE2S_BLOCK_SIZE * 2; | ||
| 812 | pos += Z7_BLAKE2S_BLOCK_SIZE * 2; | ||
| 813 | pos &= SUPER_BLOCK_MASK; | ||
| 814 | } | ||
| 815 | while (data < end); | ||
| 816 | if (data != end) | ||
| 817 | return; | ||
| 818 | } | ||
| 819 | { | ||
| 820 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
| 821 | Z7_BLAKE2S_CompressSingleBlock(s, data); | ||
| 822 | } | ||
| 823 | } | ||
| 824 | #endif // Z7_BLAKE2S_USE_V128_WAY2 | ||
| 825 | |||
| 826 | |||
| 827 | #ifdef Z7_BLAKE2S_USE_V128_WAY2 | ||
| 828 | #define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way2 | ||
| 829 | #else | ||
| 830 | #define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way1 | ||
| 831 | #endif | ||
| 832 | |||
| 833 | |||
| 834 | |||
| 835 | #ifdef Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED | ||
| 836 | #define ROT_128_8(x) MM_ROR_EPI32(x, 8) | ||
| 837 | #define ROT_128_16(x) MM_ROR_EPI32(x, 16) | ||
| 838 | #define ROT_128_7(x) MM_ROR_EPI32(x, 7) | ||
| 839 | #define ROT_128_12(x) MM_ROR_EPI32(x, 12) | ||
| 840 | #else | ||
| 841 | #if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41) | ||
| 842 | #define ROT_128_8(x) _mm_shuffle_epi8(x, r8) // k_r8 | ||
| 843 | #define ROT_128_16(x) _mm_shuffle_epi8(x, r16) // k_r16 | ||
| 844 | #else | ||
| 845 | #define ROT_128_8(x) MM_ROR_EPI32_VIA_SHIFT(x, 8) | ||
| 846 | #define ROT_128_16(x) MM_ROR_EPI32_VIA_SHIFT(x, 16) | ||
| 847 | #endif | ||
| 848 | #define ROT_128_7(x) MM_ROR_EPI32_VIA_SHIFT(x, 7) | ||
| 849 | #define ROT_128_12(x) MM_ROR_EPI32_VIA_SHIFT(x, 12) | ||
| 850 | #endif | ||
| 851 | |||
| 852 | |||
| 853 | #if 1 | ||
| 854 | // this branch can provide similar speed on x86* in most cases, | ||
| 855 | // because [base + index*4] provides same speed as [base + index]. | ||
| 856 | // but some compilers can generate different code with this branch, that can be faster sometimes. | ||
| 857 | // this branch uses additional table of 10*16=160 bytes. | ||
| 858 | #define SIGMA_TABLE_MULT_16( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \ | ||
| 859 | SIGMA_TABLE_MULT(16, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) | ||
| 860 | MY_ALIGN(16) | ||
| 861 | static const Byte k_Blake2s_Sigma_16[BLAKE2S_NUM_ROUNDS][16] = | ||
| 862 | { SIGMA_TABLE(SIGMA_TABLE_MULT_16) }; | ||
| 863 | #define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_16[r]; | ||
| 864 | #define GET_SIGMA_VAL_128(n) (sigma[n]) | ||
| 865 | #else | ||
| 866 | #define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_4[r]; | ||
| 867 | #define GET_SIGMA_VAL_128(n) (4 * (size_t)sigma[n]) | ||
| 868 | #endif | ||
| 869 | |||
| 870 | |||
| 871 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
| 872 | #if 1 | ||
| 873 | #define SIGMA_TABLE_MULT_32( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \ | ||
| 874 | SIGMA_TABLE_MULT(32, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) | ||
| 875 | MY_ALIGN(64) | ||
| 876 | static const UInt16 k_Blake2s_Sigma_32[BLAKE2S_NUM_ROUNDS][16] = | ||
| 877 | { SIGMA_TABLE(SIGMA_TABLE_MULT_32) }; | ||
| 878 | #define GET_SIGMA_PTR_256(r) const UInt16 * const sigma = k_Blake2s_Sigma_32[r]; | ||
| 879 | #define GET_SIGMA_VAL_256(n) (sigma[n]) | ||
| 880 | #else | ||
| 881 | #define GET_SIGMA_PTR_256(r) const Byte * const sigma = k_Blake2s_Sigma_4[r]; | ||
| 882 | #define GET_SIGMA_VAL_256(n) (8 * (size_t)sigma[n]) | ||
| 883 | #endif | ||
| 884 | #endif // Z7_BLAKE2S_USE_AVX2_FAST | ||
| 885 | |||
| 886 | |||
| 887 | #define D_ROT_128_7(dest) dest = ROT_128_7(dest) | ||
| 888 | #define D_ROT_128_8(dest) dest = ROT_128_8(dest) | ||
| 889 | #define D_ROT_128_12(dest) dest = ROT_128_12(dest) | ||
| 890 | #define D_ROT_128_16(dest) dest = ROT_128_16(dest) | ||
| 891 | |||
| 892 | #define OP_L(a, i) D_ADD_128 (V(a, 0), \ | ||
| 893 | LOAD_128((const Byte *)(w) + GET_SIGMA_VAL_128(2*(a)+(i)))); | ||
| 894 | |||
| 895 | #define OP_0(a) OP_L(a, 0) | ||
| 896 | #define OP_7(a) OP_L(a, 1) | ||
| 897 | |||
| 898 | #define OP_1(a) D_ADD_128 (V(a, 0), V(a, 1)); | ||
| 899 | #define OP_2(a) D_XOR_128 (V(a, 3), V(a, 0)); | ||
| 900 | #define OP_4(a) D_ADD_128 (V(a, 2), V(a, 3)); | ||
| 901 | #define OP_5(a) D_XOR_128 (V(a, 1), V(a, 2)); | ||
| 902 | |||
| 903 | #define OP_3(a) D_ROT_128_16 (V(a, 3)); | ||
| 904 | #define OP_6(a) D_ROT_128_12 (V(a, 1)); | ||
| 905 | #define OP_8(a) D_ROT_128_8 (V(a, 3)); | ||
| 906 | #define OP_9(a) D_ROT_128_7 (V(a, 1)); | ||
| 907 | |||
| 908 | |||
| 909 | // for 32-bit x86 : interleave mode works slower, because of register pressure. | ||
| 910 | |||
| 911 | #if 0 || 1 && (defined(MY_CPU_X86) \ | ||
| 912 | || defined(__GNUC__) && !defined(__clang__)) | ||
| 913 | // non-inteleaved version: | ||
| 914 | // is fast for x86 32-bit. | ||
| 915 | // is fast for GCC x86-64. | ||
| 916 | |||
| 917 | #define V4G(a) \ | ||
| 918 | OP_0 (a) \ | ||
| 919 | OP_1 (a) \ | ||
| 920 | OP_2 (a) \ | ||
| 921 | OP_3 (a) \ | ||
| 922 | OP_4 (a) \ | ||
| 923 | OP_5 (a) \ | ||
| 924 | OP_6 (a) \ | ||
| 925 | OP_7 (a) \ | ||
| 926 | OP_1 (a) \ | ||
| 927 | OP_2 (a) \ | ||
| 928 | OP_8 (a) \ | ||
| 929 | OP_4 (a) \ | ||
| 930 | OP_5 (a) \ | ||
| 931 | OP_9 (a) \ | ||
| 932 | |||
| 933 | #define V4R \ | ||
| 934 | { \ | ||
| 935 | V4G (0) \ | ||
| 936 | V4G (1) \ | ||
| 937 | V4G (2) \ | ||
| 938 | V4G (3) \ | ||
| 939 | V4G (4) \ | ||
| 940 | V4G (5) \ | ||
| 941 | V4G (6) \ | ||
| 942 | V4G (7) \ | ||
| 943 | } | ||
| 944 | |||
| 945 | #elif 0 || 1 && defined(MY_CPU_X86) | ||
| 946 | |||
| 947 | #define OP_INTER_2(op, a,b) \ | ||
| 948 | op (a) \ | ||
| 949 | op (b) \ | ||
| 950 | |||
| 951 | #define V4G(a,b) \ | ||
| 952 | OP_INTER_2 (OP_0, a,b) \ | ||
| 953 | OP_INTER_2 (OP_1, a,b) \ | ||
| 954 | OP_INTER_2 (OP_2, a,b) \ | ||
| 955 | OP_INTER_2 (OP_3, a,b) \ | ||
| 956 | OP_INTER_2 (OP_4, a,b) \ | ||
| 957 | OP_INTER_2 (OP_5, a,b) \ | ||
| 958 | OP_INTER_2 (OP_6, a,b) \ | ||
| 959 | OP_INTER_2 (OP_7, a,b) \ | ||
| 960 | OP_INTER_2 (OP_1, a,b) \ | ||
| 961 | OP_INTER_2 (OP_2, a,b) \ | ||
| 962 | OP_INTER_2 (OP_8, a,b) \ | ||
| 963 | OP_INTER_2 (OP_4, a,b) \ | ||
| 964 | OP_INTER_2 (OP_5, a,b) \ | ||
| 965 | OP_INTER_2 (OP_9, a,b) \ | ||
| 966 | |||
| 967 | #define V4R \ | ||
| 968 | { \ | ||
| 969 | V4G (0, 1) \ | ||
| 970 | V4G (2, 3) \ | ||
| 971 | V4G (4, 5) \ | ||
| 972 | V4G (6, 7) \ | ||
| 973 | } | ||
| 974 | |||
| 975 | #else | ||
| 976 | // iterleave-4 version is fast for x64 (MSVC/CLANG) | ||
| 977 | |||
| 978 | #define OP_INTER_4(op, a,b,c,d) \ | ||
| 979 | op (a) \ | ||
| 980 | op (b) \ | ||
| 981 | op (c) \ | ||
| 982 | op (d) \ | ||
| 983 | |||
| 984 | #define V4G(a,b,c,d) \ | ||
| 985 | OP_INTER_4 (OP_0, a,b,c,d) \ | ||
| 986 | OP_INTER_4 (OP_1, a,b,c,d) \ | ||
| 987 | OP_INTER_4 (OP_2, a,b,c,d) \ | ||
| 988 | OP_INTER_4 (OP_3, a,b,c,d) \ | ||
| 989 | OP_INTER_4 (OP_4, a,b,c,d) \ | ||
| 990 | OP_INTER_4 (OP_5, a,b,c,d) \ | ||
| 991 | OP_INTER_4 (OP_6, a,b,c,d) \ | ||
| 992 | OP_INTER_4 (OP_7, a,b,c,d) \ | ||
| 993 | OP_INTER_4 (OP_1, a,b,c,d) \ | ||
| 994 | OP_INTER_4 (OP_2, a,b,c,d) \ | ||
| 995 | OP_INTER_4 (OP_8, a,b,c,d) \ | ||
| 996 | OP_INTER_4 (OP_4, a,b,c,d) \ | ||
| 997 | OP_INTER_4 (OP_5, a,b,c,d) \ | ||
| 998 | OP_INTER_4 (OP_9, a,b,c,d) \ | ||
| 999 | |||
| 1000 | #define V4R \ | ||
| 1001 | { \ | ||
| 1002 | V4G (0, 1, 2, 3) \ | ||
| 1003 | V4G (4, 5, 6, 7) \ | ||
| 1004 | } | ||
| 1005 | |||
| 1006 | #endif | ||
| 1007 | |||
| 1008 | #define V4_ROUND(r) { GET_SIGMA_PTR_128(r); V4R } | ||
| 1009 | |||
| 1010 | |||
| 1011 | #define V4_LOAD_MSG_1(w, m, i) \ | ||
| 1012 | { \ | ||
| 1013 | __m128i m0, m1, m2, m3; \ | ||
| 1014 | __m128i t0, t1, t2, t3; \ | ||
| 1015 | m0 = LOADU_128((m) + ((i) + 0 * 4) * 16); \ | ||
| 1016 | m1 = LOADU_128((m) + ((i) + 1 * 4) * 16); \ | ||
| 1017 | m2 = LOADU_128((m) + ((i) + 2 * 4) * 16); \ | ||
| 1018 | m3 = LOADU_128((m) + ((i) + 3 * 4) * 16); \ | ||
| 1019 | t0 = _mm_unpacklo_epi32(m0, m1); \ | ||
| 1020 | t1 = _mm_unpackhi_epi32(m0, m1); \ | ||
| 1021 | t2 = _mm_unpacklo_epi32(m2, m3); \ | ||
| 1022 | t3 = _mm_unpackhi_epi32(m2, m3); \ | ||
| 1023 | w[(i) * 4 + 0] = _mm_unpacklo_epi64(t0, t2); \ | ||
| 1024 | w[(i) * 4 + 1] = _mm_unpackhi_epi64(t0, t2); \ | ||
| 1025 | w[(i) * 4 + 2] = _mm_unpacklo_epi64(t1, t3); \ | ||
| 1026 | w[(i) * 4 + 3] = _mm_unpackhi_epi64(t1, t3); \ | ||
| 1027 | } | ||
| 1028 | |||
| 1029 | #define V4_LOAD_MSG(w, m) \ | ||
| 1030 | { \ | ||
| 1031 | V4_LOAD_MSG_1 (w, m, 0) \ | ||
| 1032 | V4_LOAD_MSG_1 (w, m, 1) \ | ||
| 1033 | V4_LOAD_MSG_1 (w, m, 2) \ | ||
| 1034 | V4_LOAD_MSG_1 (w, m, 3) \ | ||
| 1035 | } | ||
| 1036 | |||
| 1037 | #define V4_LOAD_UNPACK_PAIR_128(src32, i, d0, d1) \ | ||
| 1038 | { \ | ||
| 1039 | const __m128i v0 = LOAD_128_FROM_STRUCT((src32) + (i ) * 4); \ | ||
| 1040 | const __m128i v1 = LOAD_128_FROM_STRUCT((src32) + (i + 1) * 4); \ | ||
| 1041 | d0 = _mm_unpacklo_epi32(v0, v1); \ | ||
| 1042 | d1 = _mm_unpackhi_epi32(v0, v1); \ | ||
| 1043 | } | ||
| 1044 | |||
| 1045 | #define V4_UNPACK_PAIR_128(dest32, i, s0, s1) \ | ||
| 1046 | { \ | ||
| 1047 | STORE_128_TO_STRUCT((dest32) + i * 4 , _mm_unpacklo_epi64(s0, s1)); \ | ||
| 1048 | STORE_128_TO_STRUCT((dest32) + i * 4 + 16, _mm_unpackhi_epi64(s0, s1)); \ | ||
| 1049 | } | ||
| 1050 | |||
| 1051 | #define V4_UNPACK_STATE(dest32, src32) \ | ||
| 1052 | { \ | ||
| 1053 | __m128i t0, t1, t2, t3, t4, t5, t6, t7; \ | ||
| 1054 | V4_LOAD_UNPACK_PAIR_128(src32, 0, t0, t1) \ | ||
| 1055 | V4_LOAD_UNPACK_PAIR_128(src32, 2, t2, t3) \ | ||
| 1056 | V4_LOAD_UNPACK_PAIR_128(src32, 4, t4, t5) \ | ||
| 1057 | V4_LOAD_UNPACK_PAIR_128(src32, 6, t6, t7) \ | ||
| 1058 | V4_UNPACK_PAIR_128(dest32, 0, t0, t2) \ | ||
| 1059 | V4_UNPACK_PAIR_128(dest32, 8, t1, t3) \ | ||
| 1060 | V4_UNPACK_PAIR_128(dest32, 1, t4, t6) \ | ||
| 1061 | V4_UNPACK_PAIR_128(dest32, 9, t5, t7) \ | ||
| 1062 | } | ||
| 1063 | |||
| 1064 | |||
| 1065 | static | ||
| 1066 | Z7_NO_INLINE | ||
| 1067 | #ifdef BLAKE2S_ATTRIB_128BIT | ||
| 1068 | BLAKE2S_ATTRIB_128BIT | ||
| 1069 | #endif | ||
| 1070 | void | ||
| 1071 | Z7_FASTCALL | ||
| 1072 | Blake2sp_Compress2_V128_Fast(UInt32 *s_items, const Byte *data, const Byte *end) | ||
| 1073 | { | ||
| 1074 | // PrintStates2(s_items, 8, 16); | ||
| 1075 | size_t pos = 0; | ||
| 1076 | pos /= 2; | ||
| 1077 | do | ||
| 1078 | { | ||
| 1079 | #if defined(Z7_BLAKE2S_USE_SSSE3) && \ | ||
| 1080 | !defined(Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED) | ||
| 1081 | const __m128i r8 = k_r8; | ||
| 1082 | const __m128i r16 = k_r16; | ||
| 1083 | #endif | ||
| 1084 | __m128i w[16]; | ||
| 1085 | __m128i v[16]; | ||
| 1086 | UInt32 *s; | ||
| 1087 | V4_LOAD_MSG(w, data) | ||
| 1088 | s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
| 1089 | { | ||
| 1090 | __m128i ctr = LOAD_128_FROM_STRUCT(s + 64); | ||
| 1091 | D_ADD_EPI64_128 (ctr, k_inc); | ||
| 1092 | STORE_128_TO_STRUCT(s + 64, ctr); | ||
| 1093 | v[12] = XOR_128 (GET_128_IV_WAY4(4), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0))); | ||
| 1094 | v[13] = XOR_128 (GET_128_IV_WAY4(5), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1))); | ||
| 1095 | } | ||
| 1096 | v[ 8] = GET_128_IV_WAY4(0); | ||
| 1097 | v[ 9] = GET_128_IV_WAY4(1); | ||
| 1098 | v[10] = GET_128_IV_WAY4(2); | ||
| 1099 | v[11] = GET_128_IV_WAY4(3); | ||
| 1100 | v[14] = GET_128_IV_WAY4(6); | ||
| 1101 | v[15] = GET_128_IV_WAY4(7); | ||
| 60 | 1102 | ||
| 61 | for (i = 0; i < 16; i++) | 1103 | #define LOAD_STATE_128_FROM_STRUCT(i) \ |
| 62 | m[i] = GetUi32(p->buf + i * sizeof(m[i])); | 1104 | v[i] = LOAD_128_FROM_STRUCT(s + (i) * 4); |
| 1105 | |||
| 1106 | #define UPDATE_STATE_128_IN_STRUCT(i) \ | ||
| 1107 | STORE_128_TO_STRUCT(s + (i) * 4, XOR_128( \ | ||
| 1108 | XOR_128(v[i], v[(i) + 8]), \ | ||
| 1109 | LOAD_128_FROM_STRUCT(s + (i) * 4))); | ||
| 63 | 1110 | ||
| 64 | for (i = 0; i < 8; i++) | 1111 | REP8_MACRO (LOAD_STATE_128_FROM_STRUCT) |
| 65 | v[i] = p->h[i]; | 1112 | ROUNDS_LOOP (V4_ROUND) |
| 1113 | REP8_MACRO (UPDATE_STATE_128_IN_STRUCT) | ||
| 1114 | |||
| 1115 | data += Z7_BLAKE2S_BLOCK_SIZE * 4; | ||
| 1116 | pos += Z7_BLAKE2S_BLOCK_SIZE * 4 / 2; | ||
| 1117 | pos &= SUPER_BLOCK_SIZE / 2 - 1; | ||
| 66 | } | 1118 | } |
| 1119 | while (data != end); | ||
| 1120 | } | ||
| 67 | 1121 | ||
| 68 | v[ 8] = k_Blake2s_IV[0]; | ||
| 69 | v[ 9] = k_Blake2s_IV[1]; | ||
| 70 | v[10] = k_Blake2s_IV[2]; | ||
| 71 | v[11] = k_Blake2s_IV[3]; | ||
| 72 | |||
| 73 | v[12] = p->t[0] ^ k_Blake2s_IV[4]; | ||
| 74 | v[13] = p->t[1] ^ k_Blake2s_IV[5]; | ||
| 75 | v[14] = p->f[0] ^ k_Blake2s_IV[6]; | ||
| 76 | v[15] = p->f[1] ^ k_Blake2s_IV[7]; | ||
| 77 | 1122 | ||
| 78 | #define G(r,i,a,b,c,d) \ | 1123 | static |
| 79 | a += b + m[sigma[2*i+0]]; d ^= a; d = rotr32(d, 16); c += d; b ^= c; b = rotr32(b, 12); \ | 1124 | Z7_NO_INLINE |
| 80 | a += b + m[sigma[2*i+1]]; d ^= a; d = rotr32(d, 8); c += d; b ^= c; b = rotr32(b, 7); \ | 1125 | #ifdef BLAKE2S_ATTRIB_128BIT |
| 1126 | BLAKE2S_ATTRIB_128BIT | ||
| 1127 | #endif | ||
| 1128 | void | ||
| 1129 | Z7_FASTCALL | ||
| 1130 | Blake2sp_Final_V128_Fast(UInt32 *states) | ||
| 1131 | { | ||
| 1132 | const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64); | ||
| 1133 | // printf("\nBlake2sp_Compress2_V128_Fast_Final4\n"); | ||
| 1134 | // PrintStates2(states, 8, 16); | ||
| 1135 | { | ||
| 1136 | ptrdiff_t pos = 8 * 4; | ||
| 1137 | do | ||
| 1138 | { | ||
| 1139 | UInt32 *src32 = states + (size_t)(pos * 1); | ||
| 1140 | UInt32 *dest32 = states + (size_t)(pos * 2); | ||
| 1141 | V4_UNPACK_STATE(dest32, src32) | ||
| 1142 | pos -= 8 * 4; | ||
| 1143 | } | ||
| 1144 | while (pos >= 0); | ||
| 1145 | } | ||
| 1146 | { | ||
| 1147 | unsigned k; | ||
| 1148 | for (k = 0; k < 8; k++) | ||
| 1149 | { | ||
| 1150 | UInt32 *s = states + (size_t)k * 16; | ||
| 1151 | STORE_128_TO_STRUCT (STATE_T(s), ctr); | ||
| 1152 | } | ||
| 1153 | } | ||
| 1154 | // PrintStates2(states, 8, 16); | ||
| 1155 | } | ||
| 1156 | |||
| 1157 | |||
| 1158 | |||
| 1159 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
| 1160 | |||
| 1161 | #define ADD_256(a, b) _mm256_add_epi32(a, b) | ||
| 1162 | #define XOR_256(a, b) _mm256_xor_si256(a, b) | ||
| 1163 | |||
| 1164 | #if 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) | ||
| 1165 | #define MM256_ROR_EPI32 _mm256_ror_epi32 | ||
| 1166 | #define Z7_MM256_ROR_EPI32_IS_SUPPORTED | ||
| 1167 | #define LOAD_ROTATE_CONSTS_256 | ||
| 1168 | #else | ||
| 1169 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
| 1170 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY2 | ||
| 1171 | #define LOAD_ROTATE_CONSTS_256 \ | ||
| 1172 | const __m256i r8 = k_r8_256; \ | ||
| 1173 | const __m256i r16 = k_r16_256; | ||
| 1174 | #endif // AVX2_WAY2 | ||
| 1175 | |||
| 1176 | #define MM256_ROR_EPI32(r, c) ( \ | ||
| 1177 | ( 8==(c)) ? _mm256_shuffle_epi8(r,r8) \ | ||
| 1178 | : (16==(c)) ? _mm256_shuffle_epi8(r,r16) \ | ||
| 1179 | : _mm256_or_si256( \ | ||
| 1180 | _mm256_srli_epi32((r), (c)), \ | ||
| 1181 | _mm256_slli_epi32((r), 32-(c)))) | ||
| 1182 | #endif // WAY_SLOW | ||
| 1183 | #endif | ||
| 1184 | |||
| 1185 | |||
| 1186 | #define D_ADD_256(dest, src) dest = ADD_256(dest, src) | ||
| 1187 | #define D_XOR_256(dest, src) dest = XOR_256(dest, src) | ||
| 1188 | |||
| 1189 | #define LOADU_256(p) _mm256_loadu_si256((const __m256i *)(const void *)(p)) | ||
| 1190 | |||
| 1191 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
| 1192 | |||
| 1193 | #ifdef Z7_MM256_ROR_EPI32_IS_SUPPORTED | ||
| 1194 | #define ROT_256_16(x) MM256_ROR_EPI32((x), 16) | ||
| 1195 | #define ROT_256_12(x) MM256_ROR_EPI32((x), 12) | ||
| 1196 | #define ROT_256_8(x) MM256_ROR_EPI32((x), 8) | ||
| 1197 | #define ROT_256_7(x) MM256_ROR_EPI32((x), 7) | ||
| 1198 | #else | ||
| 1199 | #define ROTATE8 _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, \ | ||
| 1200 | 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1) | ||
| 1201 | #define ROTATE16 _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, \ | ||
| 1202 | 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2) | ||
| 1203 | #define ROT_256_16(x) _mm256_shuffle_epi8((x), ROTATE16) | ||
| 1204 | #define ROT_256_12(x) _mm256_or_si256(_mm256_srli_epi32((x), 12), _mm256_slli_epi32((x), 20)) | ||
| 1205 | #define ROT_256_8(x) _mm256_shuffle_epi8((x), ROTATE8) | ||
| 1206 | #define ROT_256_7(x) _mm256_or_si256(_mm256_srli_epi32((x), 7), _mm256_slli_epi32((x), 25)) | ||
| 1207 | #endif | ||
| 1208 | |||
| 1209 | #define D_ROT_256_7(dest) dest = ROT_256_7(dest) | ||
| 1210 | #define D_ROT_256_8(dest) dest = ROT_256_8(dest) | ||
| 1211 | #define D_ROT_256_12(dest) dest = ROT_256_12(dest) | ||
| 1212 | #define D_ROT_256_16(dest) dest = ROT_256_16(dest) | ||
| 1213 | |||
| 1214 | #define LOAD_256(p) _mm256_load_si256((const __m256i *)(const void *)(p)) | ||
| 1215 | #ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED | ||
| 1216 | #define STOREU_256(p, r) _mm256_storeu_si256((__m256i *)(void *)(p), r) | ||
| 1217 | #define LOAD_256_FROM_STRUCT(p) LOADU_256(p) | ||
| 1218 | #define STORE_256_TO_STRUCT(p, r) STOREU_256(p, r) | ||
| 1219 | #else | ||
| 1220 | // if struct is aligned for 32-bytes | ||
| 1221 | #define STORE_256(p, r) _mm256_store_si256((__m256i *)(void *)(p), r) | ||
| 1222 | #define LOAD_256_FROM_STRUCT(p) LOAD_256(p) | ||
| 1223 | #define STORE_256_TO_STRUCT(p, r) STORE_256(p, r) | ||
| 1224 | #endif | ||
| 1225 | |||
| 1226 | #endif // Z7_BLAKE2S_USE_AVX2_FAST | ||
| 1227 | |||
| 1228 | |||
| 1229 | |||
| 1230 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
| 1231 | |||
| 1232 | #if 0 | ||
| 1233 | #define DIAG_PERM2(s) \ | ||
| 1234 | { \ | ||
| 1235 | const __m256i a = LOAD_256_FROM_STRUCT((s) ); \ | ||
| 1236 | const __m256i b = LOAD_256_FROM_STRUCT((s) + NSW); \ | ||
| 1237 | STORE_256_TO_STRUCT((s ), _mm256_permute2x128_si256(a, b, 0x20)); \ | ||
| 1238 | STORE_256_TO_STRUCT((s + NSW), _mm256_permute2x128_si256(a, b, 0x31)); \ | ||
| 1239 | } | ||
| 1240 | #else | ||
| 1241 | #define DIAG_PERM2(s) \ | ||
| 1242 | { \ | ||
| 1243 | const __m128i a = LOAD_128_FROM_STRUCT((s) + 4); \ | ||
| 1244 | const __m128i b = LOAD_128_FROM_STRUCT((s) + NSW); \ | ||
| 1245 | STORE_128_TO_STRUCT((s) + NSW, a); \ | ||
| 1246 | STORE_128_TO_STRUCT((s) + 4 , b); \ | ||
| 1247 | } | ||
| 1248 | #endif | ||
| 1249 | #define DIAG_PERM8(s_items) \ | ||
| 1250 | { \ | ||
| 1251 | DIAG_PERM2(s_items) \ | ||
| 1252 | DIAG_PERM2(s_items + NSW * 2) \ | ||
| 1253 | DIAG_PERM2(s_items + NSW * 4) \ | ||
| 1254 | DIAG_PERM2(s_items + NSW * 6) \ | ||
| 1255 | } | ||
| 1256 | |||
| 1257 | |||
| 1258 | #define AXR256(a, b, d, shift) \ | ||
| 1259 | D_ADD_256(a, b); \ | ||
| 1260 | D_XOR_256(d, a); \ | ||
| 1261 | d = MM256_ROR_EPI32(d, shift); \ | ||
| 1262 | |||
| 1263 | |||
| 1264 | |||
| 1265 | #ifdef Z7_BLAKE2S_USE_GATHER | ||
| 1266 | |||
| 1267 | #define TABLE_GATHER_256_4(a0,a1,a2,a3) \ | ||
| 1268 | a0,a1,a2,a3, a0+16,a1+16,a2+16,a3+16 | ||
| 1269 | #define TABLE_GATHER_256( \ | ||
| 1270 | a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \ | ||
| 1271 | { TABLE_GATHER_256_4(a0,a2,a4,a6), \ | ||
| 1272 | TABLE_GATHER_256_4(a1,a3,a5,a7), \ | ||
| 1273 | TABLE_GATHER_256_4(a8,a10,a12,a14), \ | ||
| 1274 | TABLE_GATHER_256_4(a9,a11,a13,a15) } | ||
| 1275 | MY_ALIGN(64) | ||
| 1276 | static const UInt32 k_Blake2s_Sigma_gather256[BLAKE2S_NUM_ROUNDS][16 * 2] = | ||
| 1277 | { SIGMA_TABLE(TABLE_GATHER_256) }; | ||
| 1278 | #define GET_SIGMA(r) \ | ||
| 1279 | const UInt32 * const sigma = k_Blake2s_Sigma_gather256[r]; | ||
| 1280 | #define AXR2_LOAD_INDEXES_AVX(sigma_index) \ | ||
| 1281 | const __m256i i01234567 = LOAD_256(sigma + (sigma_index)); | ||
| 1282 | #define SET_ROW_FROM_SIGMA_AVX(in) \ | ||
| 1283 | _mm256_i32gather_epi32((const void *)(in), i01234567, 4) | ||
| 1284 | #define SIGMA_INTERLEAVE 8 | ||
| 1285 | #define SIGMA_HALF_ROW_SIZE 16 | ||
| 1286 | |||
| 1287 | #else // !Z7_BLAKE2S_USE_GATHER | ||
| 1288 | |||
| 1289 | #define GET_SIGMA(r) \ | ||
| 1290 | const Byte * const sigma = k_Blake2s_Sigma_4[r]; | ||
| 1291 | #define AXR2_LOAD_INDEXES_AVX(sigma_index) \ | ||
| 1292 | AXR2_LOAD_INDEXES(sigma_index) | ||
| 1293 | #define SET_ROW_FROM_SIGMA_AVX(in) \ | ||
| 1294 | MY_mm256_set_m128i( \ | ||
| 1295 | SET_ROW_FROM_SIGMA_W((in) + Z7_BLAKE2S_BLOCK_SIZE), \ | ||
| 1296 | SET_ROW_FROM_SIGMA_W(in)) | ||
| 1297 | #define SIGMA_INTERLEAVE 1 | ||
| 1298 | #define SIGMA_HALF_ROW_SIZE 8 | ||
| 1299 | #endif // !Z7_BLAKE2S_USE_GATHER | ||
| 1300 | |||
| 81 | 1301 | ||
| 82 | #define R(r) \ | 1302 | #define ROTATE_WORDS_TO_RIGHT_256(a, n) \ |
| 83 | G(r,0,v[ 0],v[ 4],v[ 8],v[12]) \ | 1303 | a = _mm256_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3)); |
| 84 | G(r,1,v[ 1],v[ 5],v[ 9],v[13]) \ | ||
| 85 | G(r,2,v[ 2],v[ 6],v[10],v[14]) \ | ||
| 86 | G(r,3,v[ 3],v[ 7],v[11],v[15]) \ | ||
| 87 | G(r,4,v[ 0],v[ 5],v[10],v[15]) \ | ||
| 88 | G(r,5,v[ 1],v[ 6],v[11],v[12]) \ | ||
| 89 | G(r,6,v[ 2],v[ 7],v[ 8],v[13]) \ | ||
| 90 | G(r,7,v[ 3],v[ 4],v[ 9],v[14]) \ | ||
| 91 | 1304 | ||
| 1305 | |||
| 1306 | |||
| 1307 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY2 | ||
| 1308 | |||
| 1309 | #define AXR2_A(sigma_index, shift1, shift2) \ | ||
| 1310 | AXR2_LOAD_INDEXES_AVX(sigma_index) \ | ||
| 1311 | D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \ | ||
| 1312 | AXR256(a0, b0, d0, shift1) \ | ||
| 1313 | AXR256(c0, d0, b0, shift2) \ | ||
| 1314 | |||
| 1315 | #define AXR4_A(sigma_index) \ | ||
| 1316 | { AXR2_A(sigma_index, 16, 12) } \ | ||
| 1317 | { AXR2_A(sigma_index + SIGMA_INTERLEAVE, 8, 7) } | ||
| 1318 | |||
| 1319 | #define EE1(r) \ | ||
| 1320 | { GET_SIGMA(r) \ | ||
| 1321 | AXR4_A(0) \ | ||
| 1322 | ROTATE_WORDS_TO_RIGHT_256(b0, 1) \ | ||
| 1323 | ROTATE_WORDS_TO_RIGHT_256(c0, 2) \ | ||
| 1324 | ROTATE_WORDS_TO_RIGHT_256(d0, 3) \ | ||
| 1325 | AXR4_A(SIGMA_HALF_ROW_SIZE) \ | ||
| 1326 | ROTATE_WORDS_TO_RIGHT_256(b0, 3) \ | ||
| 1327 | ROTATE_WORDS_TO_RIGHT_256(c0, 2) \ | ||
| 1328 | ROTATE_WORDS_TO_RIGHT_256(d0, 1) \ | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | static | ||
| 1332 | Z7_NO_INLINE | ||
| 1333 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
| 1334 | BLAKE2S_ATTRIB_AVX2 | ||
| 1335 | #endif | ||
| 1336 | void | ||
| 1337 | Z7_FASTCALL | ||
| 1338 | Blake2sp_Compress2_AVX2_Way2(UInt32 *s_items, const Byte *data, const Byte *end) | ||
| 1339 | { | ||
| 1340 | size_t pos = 0; | ||
| 1341 | end -= Z7_BLAKE2S_BLOCK_SIZE; | ||
| 1342 | |||
| 1343 | if (data != end) | ||
| 92 | { | 1344 | { |
| 93 | unsigned r; | 1345 | LOAD_ROTATE_CONSTS_256 |
| 94 | for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) | 1346 | DIAG_PERM8(s_items) |
| 1347 | do | ||
| 95 | { | 1348 | { |
| 96 | const Byte *sigma = k_Blake2s_Sigma[r]; | 1349 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); |
| 97 | R(r) | 1350 | __m256i a0, b0, c0, d0; |
| 1351 | { | ||
| 1352 | const __m128i inc = k_inc; | ||
| 1353 | __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s)); | ||
| 1354 | __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW)); | ||
| 1355 | D_ADD_EPI64_128(d0_128, inc); | ||
| 1356 | D_ADD_EPI64_128(d1_128, inc); | ||
| 1357 | STORE_128_TO_STRUCT (STATE_T(s ), d0_128); | ||
| 1358 | STORE_128_TO_STRUCT (STATE_T(s + NSW), d1_128); | ||
| 1359 | d0 = MY_mm256_set_m128i(d1_128, d0_128); | ||
| 1360 | D_XOR_256(d0, k_iv4_256); | ||
| 1361 | } | ||
| 1362 | c0 = SET_FROM_128(k_iv0_128); | ||
| 1363 | a0 = LOAD_256_FROM_STRUCT(s + NSW * 0); | ||
| 1364 | b0 = LOAD_256_FROM_STRUCT(s + NSW * 1); | ||
| 1365 | |||
| 1366 | ROUNDS_LOOP (EE1) | ||
| 1367 | |||
| 1368 | D_XOR_256(a0, c0); | ||
| 1369 | D_XOR_256(b0, d0); | ||
| 1370 | |||
| 1371 | D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0)); | ||
| 1372 | D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1)); | ||
| 1373 | |||
| 1374 | STORE_256_TO_STRUCT(s + NSW * 0, a0); | ||
| 1375 | STORE_256_TO_STRUCT(s + NSW * 1, b0); | ||
| 1376 | |||
| 1377 | data += Z7_BLAKE2S_BLOCK_SIZE * 2; | ||
| 1378 | pos += Z7_BLAKE2S_BLOCK_SIZE * 2; | ||
| 1379 | pos &= SUPER_BLOCK_MASK; | ||
| 98 | } | 1380 | } |
| 99 | /* R(0); R(1); R(2); R(3); R(4); R(5); R(6); R(7); R(8); R(9); */ | 1381 | while (data < end); |
| 1382 | DIAG_PERM8(s_items) | ||
| 1383 | if (data != end) | ||
| 1384 | return; | ||
| 1385 | } | ||
| 1386 | { | ||
| 1387 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
| 1388 | Z7_BLAKE2S_CompressSingleBlock(s, data); | ||
| 100 | } | 1389 | } |
| 1390 | } | ||
| 1391 | |||
| 1392 | #endif // Z7_BLAKE2S_USE_AVX2_WAY2 | ||
| 101 | 1393 | ||
| 102 | #undef G | ||
| 103 | #undef R | ||
| 104 | 1394 | ||
| 1395 | |||
| 1396 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY4 | ||
| 1397 | |||
| 1398 | #define AXR2_X(sigma_index, shift1, shift2) \ | ||
| 1399 | AXR2_LOAD_INDEXES_AVX(sigma_index) \ | ||
| 1400 | D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \ | ||
| 1401 | D_ADD_256( a1, SET_ROW_FROM_SIGMA_AVX((data) + Z7_BLAKE2S_BLOCK_SIZE * 2)); \ | ||
| 1402 | AXR256(a0, b0, d0, shift1) \ | ||
| 1403 | AXR256(a1, b1, d1, shift1) \ | ||
| 1404 | AXR256(c0, d0, b0, shift2) \ | ||
| 1405 | AXR256(c1, d1, b1, shift2) \ | ||
| 1406 | |||
| 1407 | #define AXR4_X(sigma_index) \ | ||
| 1408 | { AXR2_X(sigma_index, 16, 12) } \ | ||
| 1409 | { AXR2_X(sigma_index + SIGMA_INTERLEAVE, 8, 7) } | ||
| 1410 | |||
| 1411 | #define EE2(r) \ | ||
| 1412 | { GET_SIGMA(r) \ | ||
| 1413 | AXR4_X(0) \ | ||
| 1414 | ROTATE_WORDS_TO_RIGHT_256(b0, 1) \ | ||
| 1415 | ROTATE_WORDS_TO_RIGHT_256(b1, 1) \ | ||
| 1416 | ROTATE_WORDS_TO_RIGHT_256(c0, 2) \ | ||
| 1417 | ROTATE_WORDS_TO_RIGHT_256(c1, 2) \ | ||
| 1418 | ROTATE_WORDS_TO_RIGHT_256(d0, 3) \ | ||
| 1419 | ROTATE_WORDS_TO_RIGHT_256(d1, 3) \ | ||
| 1420 | AXR4_X(SIGMA_HALF_ROW_SIZE) \ | ||
| 1421 | ROTATE_WORDS_TO_RIGHT_256(b0, 3) \ | ||
| 1422 | ROTATE_WORDS_TO_RIGHT_256(b1, 3) \ | ||
| 1423 | ROTATE_WORDS_TO_RIGHT_256(c0, 2) \ | ||
| 1424 | ROTATE_WORDS_TO_RIGHT_256(c1, 2) \ | ||
| 1425 | ROTATE_WORDS_TO_RIGHT_256(d0, 1) \ | ||
| 1426 | ROTATE_WORDS_TO_RIGHT_256(d1, 1) \ | ||
| 1427 | } | ||
| 1428 | |||
| 1429 | static | ||
| 1430 | Z7_NO_INLINE | ||
| 1431 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
| 1432 | BLAKE2S_ATTRIB_AVX2 | ||
| 1433 | #endif | ||
| 1434 | void | ||
| 1435 | Z7_FASTCALL | ||
| 1436 | Blake2sp_Compress2_AVX2_Way4(UInt32 *s_items, const Byte *data, const Byte *end) | ||
| 1437 | { | ||
| 1438 | size_t pos = 0; | ||
| 1439 | |||
| 1440 | if ((size_t)(end - data) >= Z7_BLAKE2S_BLOCK_SIZE * 4) | ||
| 105 | { | 1441 | { |
| 106 | unsigned i; | 1442 | #ifndef Z7_MM256_ROR_EPI32_IS_SUPPORTED |
| 107 | for (i = 0; i < 8; i++) | 1443 | const __m256i r8 = k_r8_256; |
| 108 | p->h[i] ^= v[i] ^ v[i + 8]; | 1444 | const __m256i r16 = k_r16_256; |
| 1445 | #endif | ||
| 1446 | end -= Z7_BLAKE2S_BLOCK_SIZE * 3; | ||
| 1447 | DIAG_PERM8(s_items) | ||
| 1448 | do | ||
| 1449 | { | ||
| 1450 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
| 1451 | __m256i a0, b0, c0, d0; | ||
| 1452 | __m256i a1, b1, c1, d1; | ||
| 1453 | { | ||
| 1454 | const __m128i inc = k_inc; | ||
| 1455 | __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s)); | ||
| 1456 | __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW)); | ||
| 1457 | __m128i d2_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 2)); | ||
| 1458 | __m128i d3_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 3)); | ||
| 1459 | D_ADD_EPI64_128(d0_128, inc); | ||
| 1460 | D_ADD_EPI64_128(d1_128, inc); | ||
| 1461 | D_ADD_EPI64_128(d2_128, inc); | ||
| 1462 | D_ADD_EPI64_128(d3_128, inc); | ||
| 1463 | STORE_128_TO_STRUCT (STATE_T(s ), d0_128); | ||
| 1464 | STORE_128_TO_STRUCT (STATE_T(s + NSW * 1), d1_128); | ||
| 1465 | STORE_128_TO_STRUCT (STATE_T(s + NSW * 2), d2_128); | ||
| 1466 | STORE_128_TO_STRUCT (STATE_T(s + NSW * 3), d3_128); | ||
| 1467 | d0 = MY_mm256_set_m128i(d1_128, d0_128); | ||
| 1468 | d1 = MY_mm256_set_m128i(d3_128, d2_128); | ||
| 1469 | D_XOR_256(d0, k_iv4_256); | ||
| 1470 | D_XOR_256(d1, k_iv4_256); | ||
| 1471 | } | ||
| 1472 | c1 = c0 = SET_FROM_128(k_iv0_128); | ||
| 1473 | a0 = LOAD_256_FROM_STRUCT(s + NSW * 0); | ||
| 1474 | b0 = LOAD_256_FROM_STRUCT(s + NSW * 1); | ||
| 1475 | a1 = LOAD_256_FROM_STRUCT(s + NSW * 2); | ||
| 1476 | b1 = LOAD_256_FROM_STRUCT(s + NSW * 3); | ||
| 1477 | |||
| 1478 | ROUNDS_LOOP (EE2) | ||
| 1479 | |||
| 1480 | D_XOR_256(a0, c0); | ||
| 1481 | D_XOR_256(b0, d0); | ||
| 1482 | D_XOR_256(a1, c1); | ||
| 1483 | D_XOR_256(b1, d1); | ||
| 1484 | |||
| 1485 | D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0)); | ||
| 1486 | D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1)); | ||
| 1487 | D_XOR_256(a1, LOAD_256_FROM_STRUCT(s + NSW * 2)); | ||
| 1488 | D_XOR_256(b1, LOAD_256_FROM_STRUCT(s + NSW * 3)); | ||
| 1489 | |||
| 1490 | STORE_256_TO_STRUCT(s + NSW * 0, a0); | ||
| 1491 | STORE_256_TO_STRUCT(s + NSW * 1, b0); | ||
| 1492 | STORE_256_TO_STRUCT(s + NSW * 2, a1); | ||
| 1493 | STORE_256_TO_STRUCT(s + NSW * 3, b1); | ||
| 1494 | |||
| 1495 | data += Z7_BLAKE2S_BLOCK_SIZE * 4; | ||
| 1496 | pos += Z7_BLAKE2S_BLOCK_SIZE * 4; | ||
| 1497 | pos &= SUPER_BLOCK_MASK; | ||
| 1498 | } | ||
| 1499 | while (data < end); | ||
| 1500 | DIAG_PERM8(s_items) | ||
| 1501 | end += Z7_BLAKE2S_BLOCK_SIZE * 3; | ||
| 109 | } | 1502 | } |
| 1503 | if (data == end) | ||
| 1504 | return; | ||
| 1505 | // Z7_BLAKE2S_Compress2_V128(s_items, data, end, pos); | ||
| 1506 | do | ||
| 1507 | { | ||
| 1508 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); | ||
| 1509 | Z7_BLAKE2S_CompressSingleBlock(s, data); | ||
| 1510 | data += Z7_BLAKE2S_BLOCK_SIZE; | ||
| 1511 | pos += Z7_BLAKE2S_BLOCK_SIZE; | ||
| 1512 | pos &= SUPER_BLOCK_MASK; | ||
| 1513 | } | ||
| 1514 | while (data != end); | ||
| 1515 | } | ||
| 1516 | |||
| 1517 | #endif // Z7_BLAKE2S_USE_AVX2_WAY4 | ||
| 1518 | #endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW | ||
| 1519 | |||
| 1520 | |||
| 1521 | // --------------------------------------------------------- | ||
| 1522 | |||
| 1523 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
| 1524 | |||
| 1525 | #define OP256_L(a, i) D_ADD_256 (V(a, 0), \ | ||
| 1526 | LOAD_256((const Byte *)(w) + GET_SIGMA_VAL_256(2*(a)+(i)))); | ||
| 1527 | |||
| 1528 | #define OP256_0(a) OP256_L(a, 0) | ||
| 1529 | #define OP256_7(a) OP256_L(a, 1) | ||
| 1530 | |||
| 1531 | #define OP256_1(a) D_ADD_256 (V(a, 0), V(a, 1)); | ||
| 1532 | #define OP256_2(a) D_XOR_256 (V(a, 3), V(a, 0)); | ||
| 1533 | #define OP256_4(a) D_ADD_256 (V(a, 2), V(a, 3)); | ||
| 1534 | #define OP256_5(a) D_XOR_256 (V(a, 1), V(a, 2)); | ||
| 1535 | |||
| 1536 | #define OP256_3(a) D_ROT_256_16 (V(a, 3)); | ||
| 1537 | #define OP256_6(a) D_ROT_256_12 (V(a, 1)); | ||
| 1538 | #define OP256_8(a) D_ROT_256_8 (V(a, 3)); | ||
| 1539 | #define OP256_9(a) D_ROT_256_7 (V(a, 1)); | ||
| 1540 | |||
| 1541 | |||
| 1542 | #if 0 || 1 && defined(MY_CPU_X86) | ||
| 1543 | |||
| 1544 | #define V8_G(a) \ | ||
| 1545 | OP256_0 (a) \ | ||
| 1546 | OP256_1 (a) \ | ||
| 1547 | OP256_2 (a) \ | ||
| 1548 | OP256_3 (a) \ | ||
| 1549 | OP256_4 (a) \ | ||
| 1550 | OP256_5 (a) \ | ||
| 1551 | OP256_6 (a) \ | ||
| 1552 | OP256_7 (a) \ | ||
| 1553 | OP256_1 (a) \ | ||
| 1554 | OP256_2 (a) \ | ||
| 1555 | OP256_8 (a) \ | ||
| 1556 | OP256_4 (a) \ | ||
| 1557 | OP256_5 (a) \ | ||
| 1558 | OP256_9 (a) \ | ||
| 1559 | |||
| 1560 | #define V8R { \ | ||
| 1561 | V8_G (0); \ | ||
| 1562 | V8_G (1); \ | ||
| 1563 | V8_G (2); \ | ||
| 1564 | V8_G (3); \ | ||
| 1565 | V8_G (4); \ | ||
| 1566 | V8_G (5); \ | ||
| 1567 | V8_G (6); \ | ||
| 1568 | V8_G (7); \ | ||
| 1569 | } | ||
| 1570 | |||
| 1571 | #else | ||
| 1572 | |||
| 1573 | #define OP256_INTER_4(op, a,b,c,d) \ | ||
| 1574 | op (a) \ | ||
| 1575 | op (b) \ | ||
| 1576 | op (c) \ | ||
| 1577 | op (d) \ | ||
| 1578 | |||
| 1579 | #define V8_G(a,b,c,d) \ | ||
| 1580 | OP256_INTER_4 (OP256_0, a,b,c,d) \ | ||
| 1581 | OP256_INTER_4 (OP256_1, a,b,c,d) \ | ||
| 1582 | OP256_INTER_4 (OP256_2, a,b,c,d) \ | ||
| 1583 | OP256_INTER_4 (OP256_3, a,b,c,d) \ | ||
| 1584 | OP256_INTER_4 (OP256_4, a,b,c,d) \ | ||
| 1585 | OP256_INTER_4 (OP256_5, a,b,c,d) \ | ||
| 1586 | OP256_INTER_4 (OP256_6, a,b,c,d) \ | ||
| 1587 | OP256_INTER_4 (OP256_7, a,b,c,d) \ | ||
| 1588 | OP256_INTER_4 (OP256_1, a,b,c,d) \ | ||
| 1589 | OP256_INTER_4 (OP256_2, a,b,c,d) \ | ||
| 1590 | OP256_INTER_4 (OP256_8, a,b,c,d) \ | ||
| 1591 | OP256_INTER_4 (OP256_4, a,b,c,d) \ | ||
| 1592 | OP256_INTER_4 (OP256_5, a,b,c,d) \ | ||
| 1593 | OP256_INTER_4 (OP256_9, a,b,c,d) \ | ||
| 1594 | |||
| 1595 | #define V8R { \ | ||
| 1596 | V8_G (0, 1, 2, 3) \ | ||
| 1597 | V8_G (4, 5, 6, 7) \ | ||
| 1598 | } | ||
| 1599 | #endif | ||
| 1600 | |||
| 1601 | #define V8_ROUND(r) { GET_SIGMA_PTR_256(r); V8R } | ||
| 1602 | |||
| 1603 | |||
| 1604 | // for debug: | ||
| 1605 | // #define Z7_BLAKE2S_PERMUTE_WITH_GATHER | ||
| 1606 | #if defined(Z7_BLAKE2S_PERMUTE_WITH_GATHER) | ||
| 1607 | // gather instruction is slow. | ||
| 1608 | #define V8_LOAD_MSG(w, m) \ | ||
| 1609 | { \ | ||
| 1610 | unsigned i; \ | ||
| 1611 | for (i = 0; i < 16; ++i) { \ | ||
| 1612 | w[i] = _mm256_i32gather_epi32( \ | ||
| 1613 | (const void *)((m) + i * sizeof(UInt32)),\ | ||
| 1614 | _mm256_set_epi32(0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00), \ | ||
| 1615 | sizeof(UInt32)); \ | ||
| 1616 | } \ | ||
| 1617 | } | ||
| 1618 | #else // !Z7_BLAKE2S_PERMUTE_WITH_GATHER | ||
| 1619 | |||
| 1620 | #define V8_LOAD_MSG_2(w, a0, a1) \ | ||
| 1621 | { \ | ||
| 1622 | (w)[0] = _mm256_permute2x128_si256(a0, a1, 0x20); \ | ||
| 1623 | (w)[4] = _mm256_permute2x128_si256(a0, a1, 0x31); \ | ||
| 1624 | } | ||
| 1625 | |||
| 1626 | #define V8_LOAD_MSG_4(w, z0, z1, z2, z3) \ | ||
| 1627 | { \ | ||
| 1628 | __m256i s0, s1, s2, s3; \ | ||
| 1629 | s0 = _mm256_unpacklo_epi64(z0, z1); \ | ||
| 1630 | s1 = _mm256_unpackhi_epi64(z0, z1); \ | ||
| 1631 | s2 = _mm256_unpacklo_epi64(z2, z3); \ | ||
| 1632 | s3 = _mm256_unpackhi_epi64(z2, z3); \ | ||
| 1633 | V8_LOAD_MSG_2((w) + 0, s0, s2) \ | ||
| 1634 | V8_LOAD_MSG_2((w) + 1, s1, s3) \ | ||
| 1635 | } | ||
| 1636 | |||
| 1637 | #define V8_LOAD_MSG_0(t0, t1, m) \ | ||
| 1638 | { \ | ||
| 1639 | __m256i m0, m1; \ | ||
| 1640 | m0 = LOADU_256(m); \ | ||
| 1641 | m1 = LOADU_256((m) + 2 * 32); \ | ||
| 1642 | t0 = _mm256_unpacklo_epi32(m0, m1); \ | ||
| 1643 | t1 = _mm256_unpackhi_epi32(m0, m1); \ | ||
| 1644 | } | ||
| 1645 | |||
| 1646 | #define V8_LOAD_MSG_8(w, m) \ | ||
| 1647 | { \ | ||
| 1648 | __m256i t0, t1, t2, t3, t4, t5, t6, t7; \ | ||
| 1649 | V8_LOAD_MSG_0(t0, t4, (m) + 0 * 4 * 32) \ | ||
| 1650 | V8_LOAD_MSG_0(t1, t5, (m) + 1 * 4 * 32) \ | ||
| 1651 | V8_LOAD_MSG_0(t2, t6, (m) + 2 * 4 * 32) \ | ||
| 1652 | V8_LOAD_MSG_0(t3, t7, (m) + 3 * 4 * 32) \ | ||
| 1653 | V8_LOAD_MSG_4((w) , t0, t1, t2, t3) \ | ||
| 1654 | V8_LOAD_MSG_4((w) + 2, t4, t5, t6, t7) \ | ||
| 1655 | } | ||
| 1656 | |||
| 1657 | #define V8_LOAD_MSG(w, m) \ | ||
| 1658 | { \ | ||
| 1659 | V8_LOAD_MSG_8(w, m) \ | ||
| 1660 | V8_LOAD_MSG_8((w) + 8, (m) + 32) \ | ||
| 1661 | } | ||
| 1662 | |||
| 1663 | #endif // !Z7_BLAKE2S_PERMUTE_WITH_GATHER | ||
| 1664 | |||
| 1665 | |||
| 1666 | #define V8_PERM_PAIR_STORE(u, a0, a2) \ | ||
| 1667 | { \ | ||
| 1668 | STORE_256_TO_STRUCT((u), _mm256_permute2x128_si256(a0, a2, 0x20)); \ | ||
| 1669 | STORE_256_TO_STRUCT((u) + 8, _mm256_permute2x128_si256(a0, a2, 0x31)); \ | ||
| 1670 | } | ||
| 1671 | |||
| 1672 | #define V8_UNPACK_STORE_4(u, z0, z1, z2, z3) \ | ||
| 1673 | { \ | ||
| 1674 | __m256i s0, s1, s2, s3; \ | ||
| 1675 | s0 = _mm256_unpacklo_epi64(z0, z1); \ | ||
| 1676 | s1 = _mm256_unpackhi_epi64(z0, z1); \ | ||
| 1677 | s2 = _mm256_unpacklo_epi64(z2, z3); \ | ||
| 1678 | s3 = _mm256_unpackhi_epi64(z2, z3); \ | ||
| 1679 | V8_PERM_PAIR_STORE(u + 0, s0, s2) \ | ||
| 1680 | V8_PERM_PAIR_STORE(u + 2, s1, s3) \ | ||
| 1681 | } | ||
| 1682 | |||
| 1683 | #define V8_UNPACK_STORE_0(src32, d0, d1) \ | ||
| 1684 | { \ | ||
| 1685 | const __m256i v0 = LOAD_256_FROM_STRUCT ((src32) ); \ | ||
| 1686 | const __m256i v1 = LOAD_256_FROM_STRUCT ((src32) + 8); \ | ||
| 1687 | d0 = _mm256_unpacklo_epi32(v0, v1); \ | ||
| 1688 | d1 = _mm256_unpackhi_epi32(v0, v1); \ | ||
| 1689 | } | ||
| 1690 | |||
| 1691 | #define V8_UNPACK_STATE(dest32, src32) \ | ||
| 1692 | { \ | ||
| 1693 | __m256i t0, t1, t2, t3, t4, t5, t6, t7; \ | ||
| 1694 | V8_UNPACK_STORE_0 ((src32) + 16 * 0, t0, t4) \ | ||
| 1695 | V8_UNPACK_STORE_0 ((src32) + 16 * 1, t1, t5) \ | ||
| 1696 | V8_UNPACK_STORE_0 ((src32) + 16 * 2, t2, t6) \ | ||
| 1697 | V8_UNPACK_STORE_0 ((src32) + 16 * 3, t3, t7) \ | ||
| 1698 | V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) , t0, t1, t2, t3) \ | ||
| 1699 | V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) + 4, t4, t5, t6, t7) \ | ||
| 110 | } | 1700 | } |
| 111 | 1701 | ||
| 112 | 1702 | ||
| 113 | #define Blake2s_Increment_Counter(S, inc) \ | ||
| 114 | { p->t[0] += (inc); p->t[1] += (p->t[0] < (inc)); } | ||
| 115 | 1703 | ||
| 116 | #define Blake2s_Set_LastBlock(p) \ | 1704 | #define V8_LOAD_STATE_256_FROM_STRUCT(i) \ |
| 117 | { p->f[0] = BLAKE2S_FINAL_FLAG; p->f[1] = p->lastNode_f1; } | 1705 | v[i] = LOAD_256_FROM_STRUCT(s_items + (i) * 8); |
| 1706 | |||
| 1707 | #if 0 || 0 && defined(MY_CPU_X86) | ||
| 1708 | #define Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
| 1709 | #endif | ||
| 1710 | |||
| 1711 | #ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
| 1712 | // this branch doesn't use (iv) array | ||
| 1713 | // so register pressure can be lower. | ||
| 1714 | // it can be faster sometimes | ||
| 1715 | #define V8_LOAD_STATE_256(i) V8_LOAD_STATE_256_FROM_STRUCT(i) | ||
| 1716 | #define V8_UPDATE_STATE_256(i) \ | ||
| 1717 | { \ | ||
| 1718 | STORE_256_TO_STRUCT(s_items + (i) * 8, XOR_256( \ | ||
| 1719 | XOR_256(v[i], v[(i) + 8]), \ | ||
| 1720 | LOAD_256_FROM_STRUCT(s_items + (i) * 8))); \ | ||
| 1721 | } | ||
| 1722 | #else | ||
| 1723 | // it uses more variables (iv) registers | ||
| 1724 | // it's better for gcc | ||
| 1725 | // maybe that branch is better, if register pressure will be lower (avx512) | ||
| 1726 | #define V8_LOAD_STATE_256(i) { iv[i] = v[i]; } | ||
| 1727 | #define V8_UPDATE_STATE_256(i) { v[i] = XOR_256(XOR_256(v[i], v[i + 8]), iv[i]); } | ||
| 1728 | #define V8_STORE_STATE_256(i) { STORE_256_TO_STRUCT(s_items + (i) * 8, v[i]); } | ||
| 1729 | #endif | ||
| 118 | 1730 | ||
| 119 | 1731 | ||
| 120 | static void Blake2s_Update(CBlake2s *p, const Byte *data, size_t size) | 1732 | #if 0 |
| 1733 | // use loading constants from memory | ||
| 1734 | #define KK8(n) KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n) | ||
| 1735 | MY_ALIGN(64) | ||
| 1736 | static const UInt32 k_Blake2s_IV_WAY8[]= | ||
| 121 | { | 1737 | { |
| 122 | while (size != 0) | 1738 | KK8(0), KK8(1), KK8(2), KK8(3), KK8(4), KK8(5), KK8(6), KK8(7) |
| 123 | { | 1739 | }; |
| 124 | unsigned pos = (unsigned)p->bufPos; | 1740 | #define GET_256_IV_WAY8(i) LOAD_256(k_Blake2s_IV_WAY8 + 8 * (i)) |
| 125 | unsigned rem = BLAKE2S_BLOCK_SIZE - pos; | 1741 | #else |
| 1742 | // use constant generation: | ||
| 1743 | #define GET_256_IV_WAY8(i) _mm256_set1_epi32((Int32)KIV(i)) | ||
| 1744 | #endif | ||
| 126 | 1745 | ||
| 127 | if (size <= rem) | 1746 | |
| 1747 | static | ||
| 1748 | Z7_NO_INLINE | ||
| 1749 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
| 1750 | BLAKE2S_ATTRIB_AVX2 | ||
| 1751 | #endif | ||
| 1752 | void | ||
| 1753 | Z7_FASTCALL | ||
| 1754 | Blake2sp_Compress2_AVX2_Fast(UInt32 *s_items, const Byte *data, const Byte *end) | ||
| 1755 | { | ||
| 1756 | #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
| 1757 | __m256i v[16]; | ||
| 1758 | #endif | ||
| 1759 | |||
| 1760 | // PrintStates2(s_items, 8, 16); | ||
| 1761 | |||
| 1762 | #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
| 1763 | REP8_MACRO (V8_LOAD_STATE_256_FROM_STRUCT) | ||
| 1764 | #endif | ||
| 1765 | |||
| 1766 | do | ||
| 1767 | { | ||
| 1768 | __m256i w[16]; | ||
| 1769 | #ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
| 1770 | __m256i v[16]; | ||
| 1771 | #else | ||
| 1772 | __m256i iv[8]; | ||
| 1773 | #endif | ||
| 1774 | V8_LOAD_MSG(w, data) | ||
| 128 | { | 1775 | { |
| 129 | memcpy(p->buf + pos, data, size); | 1776 | // we use load/store ctr inside loop to reduce register pressure: |
| 130 | p->bufPos += (UInt32)size; | 1777 | #if 1 || 1 && defined(MY_CPU_X86) |
| 131 | return; | 1778 | const __m256i ctr = _mm256_add_epi64( |
| 1779 | LOAD_256_FROM_STRUCT(s_items + 64), | ||
| 1780 | _mm256_set_epi32( | ||
| 1781 | 0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE, | ||
| 1782 | 0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)); | ||
| 1783 | STORE_256_TO_STRUCT(s_items + 64, ctr); | ||
| 1784 | #else | ||
| 1785 | const UInt64 ctr64 = *(const UInt64 *)(const void *)(s_items + 64) | ||
| 1786 | + Z7_BLAKE2S_BLOCK_SIZE; | ||
| 1787 | const __m256i ctr = _mm256_set_epi64x(0, (Int64)ctr64, 0, (Int64)ctr64); | ||
| 1788 | *(UInt64 *)(void *)(s_items + 64) = ctr64; | ||
| 1789 | #endif | ||
| 1790 | v[12] = XOR_256 (GET_256_IV_WAY8(4), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0))); | ||
| 1791 | v[13] = XOR_256 (GET_256_IV_WAY8(5), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1))); | ||
| 132 | } | 1792 | } |
| 1793 | v[ 8] = GET_256_IV_WAY8(0); | ||
| 1794 | v[ 9] = GET_256_IV_WAY8(1); | ||
| 1795 | v[10] = GET_256_IV_WAY8(2); | ||
| 1796 | v[11] = GET_256_IV_WAY8(3); | ||
| 1797 | v[14] = GET_256_IV_WAY8(6); | ||
| 1798 | v[15] = GET_256_IV_WAY8(7); | ||
| 133 | 1799 | ||
| 134 | memcpy(p->buf + pos, data, rem); | 1800 | REP8_MACRO (V8_LOAD_STATE_256) |
| 135 | Blake2s_Increment_Counter(S, BLAKE2S_BLOCK_SIZE) | 1801 | ROUNDS_LOOP (V8_ROUND) |
| 136 | Blake2s_Compress(p); | 1802 | REP8_MACRO (V8_UPDATE_STATE_256) |
| 137 | p->bufPos = 0; | 1803 | data += SUPER_BLOCK_SIZE; |
| 138 | data += rem; | ||
| 139 | size -= rem; | ||
| 140 | } | 1804 | } |
| 1805 | while (data != end); | ||
| 1806 | |||
| 1807 | #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT | ||
| 1808 | REP8_MACRO (V8_STORE_STATE_256) | ||
| 1809 | #endif | ||
| 141 | } | 1810 | } |
| 142 | 1811 | ||
| 143 | 1812 | ||
| 144 | static void Blake2s_Final(CBlake2s *p, Byte *digest) | 1813 | static |
| 1814 | Z7_NO_INLINE | ||
| 1815 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
| 1816 | BLAKE2S_ATTRIB_AVX2 | ||
| 1817 | #endif | ||
| 1818 | void | ||
| 1819 | Z7_FASTCALL | ||
| 1820 | Blake2sp_Final_AVX2_Fast(UInt32 *states) | ||
| 145 | { | 1821 | { |
| 146 | unsigned i; | 1822 | const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64); |
| 1823 | // PrintStates2(states, 8, 16); | ||
| 1824 | V8_UNPACK_STATE(states, states) | ||
| 1825 | // PrintStates2(states, 8, 16); | ||
| 1826 | { | ||
| 1827 | unsigned k; | ||
| 1828 | for (k = 0; k < 8; k++) | ||
| 1829 | { | ||
| 1830 | UInt32 *s = states + (size_t)k * 16; | ||
| 1831 | STORE_128_TO_STRUCT (STATE_T(s), ctr); | ||
| 1832 | } | ||
| 1833 | } | ||
| 1834 | // PrintStates2(states, 8, 16); | ||
| 1835 | // printf("\nafter V8_UNPACK_STATE \n"); | ||
| 1836 | } | ||
| 1837 | |||
| 1838 | #endif // Z7_BLAKE2S_USE_AVX2_FAST | ||
| 1839 | #endif // avx2 | ||
| 1840 | #endif // vector | ||
| 1841 | |||
| 1842 | |||
| 1843 | /* | ||
| 1844 | #define Blake2s_Increment_Counter(s, inc) \ | ||
| 1845 | { STATE_T(s)[0] += (inc); STATE_T(s)[1] += (STATE_T(s)[0] < (inc)); } | ||
| 1846 | #define Blake2s_Increment_Counter_Small(s, inc) \ | ||
| 1847 | { STATE_T(s)[0] += (inc); } | ||
| 1848 | */ | ||
| 1849 | |||
| 1850 | #define Blake2s_Set_LastBlock(s) \ | ||
| 1851 | { STATE_F(s)[0] = BLAKE2S_FINAL_FLAG; /* STATE_F(s)[1] = p->u.header.lastNode_f1; */ } | ||
| 1852 | |||
| 1853 | |||
| 1854 | #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL >= 1600 | ||
| 1855 | // good for vs2022 | ||
| 1856 | #define LOOP_8(mac) { unsigned kkk; for (kkk = 0; kkk < 8; kkk++) mac(kkk) } | ||
| 1857 | #else | ||
| 1858 | // good for Z7_BLAKE2S_UNROLL for GCC9 (arm*/x86*) and MSC_VER_1400-x64. | ||
| 1859 | #define LOOP_8(mac) { REP8_MACRO(mac) } | ||
| 1860 | #endif | ||
| 1861 | |||
| 1862 | |||
| 1863 | static | ||
| 1864 | Z7_FORCE_INLINE | ||
| 1865 | // Z7_NO_INLINE | ||
| 1866 | void | ||
| 1867 | Z7_FASTCALL | ||
| 1868 | Blake2s_Compress(UInt32 *s, const Byte *input) | ||
| 1869 | { | ||
| 1870 | UInt32 m[16]; | ||
| 1871 | UInt32 v[16]; | ||
| 1872 | { | ||
| 1873 | unsigned i; | ||
| 1874 | for (i = 0; i < 16; i++) | ||
| 1875 | m[i] = GetUi32(input + i * 4); | ||
| 1876 | } | ||
| 1877 | |||
| 1878 | #define INIT_v_FROM_s(i) v[i] = s[i]; | ||
| 1879 | |||
| 1880 | LOOP_8(INIT_v_FROM_s) | ||
| 1881 | |||
| 1882 | // Blake2s_Increment_Counter(s, Z7_BLAKE2S_BLOCK_SIZE) | ||
| 1883 | { | ||
| 1884 | const UInt32 t0 = STATE_T(s)[0] + Z7_BLAKE2S_BLOCK_SIZE; | ||
| 1885 | const UInt32 t1 = STATE_T(s)[1] + (t0 < Z7_BLAKE2S_BLOCK_SIZE); | ||
| 1886 | STATE_T(s)[0] = t0; | ||
| 1887 | STATE_T(s)[1] = t1; | ||
| 1888 | v[12] = t0 ^ KIV(4); | ||
| 1889 | v[13] = t1 ^ KIV(5); | ||
| 1890 | } | ||
| 1891 | // v[12] = STATE_T(s)[0] ^ KIV(4); | ||
| 1892 | // v[13] = STATE_T(s)[1] ^ KIV(5); | ||
| 1893 | v[14] = STATE_F(s)[0] ^ KIV(6); | ||
| 1894 | v[15] = STATE_F(s)[1] ^ KIV(7); | ||
| 1895 | |||
| 1896 | v[ 8] = KIV(0); | ||
| 1897 | v[ 9] = KIV(1); | ||
| 1898 | v[10] = KIV(2); | ||
| 1899 | v[11] = KIV(3); | ||
| 1900 | // PrintStates2((const UInt32 *)v, 1, 16); | ||
| 1901 | |||
| 1902 | #define ADD_SIGMA(a, index) V(a, 0) += *(const UInt32 *)GET_SIGMA_PTR(m, sigma[index]); | ||
| 1903 | #define ADD32M(dest, src, a) V(a, dest) += V(a, src); | ||
| 1904 | #define XOR32M(dest, src, a) V(a, dest) ^= V(a, src); | ||
| 1905 | #define RTR32M(dest, shift, a) V(a, dest) = rotrFixed(V(a, dest), shift); | ||
| 1906 | |||
| 1907 | // big interleaving can provides big performance gain, if scheduler queues are small. | ||
| 1908 | #if 0 || 1 && defined(MY_CPU_X86) | ||
| 1909 | // interleave-1: for small register number (x86-32bit) | ||
| 1910 | #define G2(index, a, x, y) \ | ||
| 1911 | ADD_SIGMA (a, (index) + 2 * 0) \ | ||
| 1912 | ADD32M (0, 1, a) \ | ||
| 1913 | XOR32M (3, 0, a) \ | ||
| 1914 | RTR32M (3, x, a) \ | ||
| 1915 | ADD32M (2, 3, a) \ | ||
| 1916 | XOR32M (1, 2, a) \ | ||
| 1917 | RTR32M (1, y, a) \ | ||
| 1918 | |||
| 1919 | #define G(a) \ | ||
| 1920 | G2(a * 2 , a, 16, 12) \ | ||
| 1921 | G2(a * 2 + 1, a, 8, 7) \ | ||
| 1922 | |||
| 1923 | #define R2 \ | ||
| 1924 | G(0) \ | ||
| 1925 | G(1) \ | ||
| 1926 | G(2) \ | ||
| 1927 | G(3) \ | ||
| 1928 | G(4) \ | ||
| 1929 | G(5) \ | ||
| 1930 | G(6) \ | ||
| 1931 | G(7) \ | ||
| 1932 | |||
| 1933 | #elif 0 || 1 && defined(MY_CPU_X86_OR_AMD64) | ||
| 1934 | // interleave-2: is good if the number of registers is not big (x86-64). | ||
| 1935 | |||
| 1936 | #define REP2(mac, dest, src, a, b) \ | ||
| 1937 | mac(dest, src, a) \ | ||
| 1938 | mac(dest, src, b) | ||
| 1939 | |||
| 1940 | #define G2(index, a, b, x, y) \ | ||
| 1941 | ADD_SIGMA (a, (index) + 2 * 0) \ | ||
| 1942 | ADD_SIGMA (b, (index) + 2 * 1) \ | ||
| 1943 | REP2 (ADD32M, 0, 1, a, b) \ | ||
| 1944 | REP2 (XOR32M, 3, 0, a, b) \ | ||
| 1945 | REP2 (RTR32M, 3, x, a, b) \ | ||
| 1946 | REP2 (ADD32M, 2, 3, a, b) \ | ||
| 1947 | REP2 (XOR32M, 1, 2, a, b) \ | ||
| 1948 | REP2 (RTR32M, 1, y, a, b) \ | ||
| 1949 | |||
| 1950 | #define G(a, b) \ | ||
| 1951 | G2(a * 2 , a, b, 16, 12) \ | ||
| 1952 | G2(a * 2 + 1, a, b, 8, 7) \ | ||
| 1953 | |||
| 1954 | #define R2 \ | ||
| 1955 | G(0, 1) \ | ||
| 1956 | G(2, 3) \ | ||
| 1957 | G(4, 5) \ | ||
| 1958 | G(6, 7) \ | ||
| 147 | 1959 | ||
| 148 | Blake2s_Increment_Counter(S, (UInt32)p->bufPos) | 1960 | #else |
| 149 | Blake2s_Set_LastBlock(p) | 1961 | // interleave-4: |
| 150 | memset(p->buf + p->bufPos, 0, BLAKE2S_BLOCK_SIZE - p->bufPos); | 1962 | // it has big register pressure for x86/x64. |
| 151 | Blake2s_Compress(p); | 1963 | // and MSVC compilers for x86/x64 are slow for this branch. |
| 1964 | // but if we have big number of registers, this branch can be faster. | ||
| 152 | 1965 | ||
| 153 | for (i = 0; i < 8; i++) | 1966 | #define REP4(mac, dest, src, a, b, c, d) \ |
| 1967 | mac(dest, src, a) \ | ||
| 1968 | mac(dest, src, b) \ | ||
| 1969 | mac(dest, src, c) \ | ||
| 1970 | mac(dest, src, d) | ||
| 1971 | |||
| 1972 | #define G2(index, a, b, c, d, x, y) \ | ||
| 1973 | ADD_SIGMA (a, (index) + 2 * 0) \ | ||
| 1974 | ADD_SIGMA (b, (index) + 2 * 1) \ | ||
| 1975 | ADD_SIGMA (c, (index) + 2 * 2) \ | ||
| 1976 | ADD_SIGMA (d, (index) + 2 * 3) \ | ||
| 1977 | REP4 (ADD32M, 0, 1, a, b, c, d) \ | ||
| 1978 | REP4 (XOR32M, 3, 0, a, b, c, d) \ | ||
| 1979 | REP4 (RTR32M, 3, x, a, b, c, d) \ | ||
| 1980 | REP4 (ADD32M, 2, 3, a, b, c, d) \ | ||
| 1981 | REP4 (XOR32M, 1, 2, a, b, c, d) \ | ||
| 1982 | REP4 (RTR32M, 1, y, a, b, c, d) \ | ||
| 1983 | |||
| 1984 | #define G(a, b, c, d) \ | ||
| 1985 | G2(a * 2 , a, b, c, d, 16, 12) \ | ||
| 1986 | G2(a * 2 + 1, a, b, c, d, 8, 7) \ | ||
| 1987 | |||
| 1988 | #define R2 \ | ||
| 1989 | G(0, 1, 2, 3) \ | ||
| 1990 | G(4, 5, 6, 7) \ | ||
| 1991 | |||
| 1992 | #endif | ||
| 1993 | |||
| 1994 | #define R(r) { const Byte *sigma = k_Blake2s_Sigma_4[r]; R2 } | ||
| 1995 | |||
| 1996 | // Z7_BLAKE2S_UNROLL gives 5-6 KB larger code, but faster: | ||
| 1997 | // 20-40% faster for (x86/x64) VC2010+/GCC/CLANG. | ||
| 1998 | // 30-60% faster for (arm64-arm32) GCC. | ||
| 1999 | // 5-11% faster for (arm64) CLANG-MAC. | ||
| 2000 | // so Z7_BLAKE2S_UNROLL is good optimization, if there is no vector branch. | ||
| 2001 | // But if there is vectors branch (for x86*), this scalar code will be unused mostly. | ||
| 2002 | // So we want smaller code (without unrolling) in that case (x86*). | ||
| 2003 | #if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS) | ||
| 2004 | #define Z7_BLAKE2S_UNROLL | ||
| 2005 | #endif | ||
| 2006 | |||
| 2007 | #ifdef Z7_BLAKE2S_UNROLL | ||
| 2008 | ROUNDS_LOOP_UNROLLED (R) | ||
| 2009 | #else | ||
| 2010 | ROUNDS_LOOP (R) | ||
| 2011 | #endif | ||
| 2012 | |||
| 2013 | #undef G | ||
| 2014 | #undef G2 | ||
| 2015 | #undef R | ||
| 2016 | #undef R2 | ||
| 2017 | |||
| 2018 | // printf("\n v after: \n"); | ||
| 2019 | // PrintStates2((const UInt32 *)v, 1, 16); | ||
| 2020 | #define XOR_s_PAIR_v(i) s[i] ^= v[i] ^ v[i + 8]; | ||
| 2021 | |||
| 2022 | LOOP_8(XOR_s_PAIR_v) | ||
| 2023 | // printf("\n s after:\n"); | ||
| 2024 | // PrintStates2((const UInt32 *)s, 1, 16); | ||
| 2025 | } | ||
| 2026 | |||
| 2027 | |||
| 2028 | static | ||
| 2029 | Z7_NO_INLINE | ||
| 2030 | void | ||
| 2031 | Z7_FASTCALL | ||
| 2032 | Blake2sp_Compress2(UInt32 *s_items, const Byte *data, const Byte *end) | ||
| 2033 | { | ||
| 2034 | size_t pos = 0; | ||
| 2035 | // PrintStates2(s_items, 8, 16); | ||
| 2036 | do | ||
| 154 | { | 2037 | { |
| 155 | SetUi32(digest + sizeof(p->h[i]) * i, p->h[i]) | 2038 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos); |
| 2039 | Blake2s_Compress(s, data); | ||
| 2040 | data += Z7_BLAKE2S_BLOCK_SIZE; | ||
| 2041 | pos += Z7_BLAKE2S_BLOCK_SIZE; | ||
| 2042 | pos &= SUPER_BLOCK_MASK; | ||
| 156 | } | 2043 | } |
| 2044 | while (data != end); | ||
| 157 | } | 2045 | } |
| 158 | 2046 | ||
| 159 | 2047 | ||
| 160 | /* ---------- BLAKE2s ---------- */ | 2048 | #ifdef Z7_BLAKE2S_USE_VECTORS |
| 2049 | |||
| 2050 | static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = Blake2sp_Compress2; | ||
| 2051 | static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = Blake2sp_Compress2; | ||
| 2052 | static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Init; | ||
| 2053 | static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Final; | ||
| 2054 | static unsigned g_z7_Blake2sp_SupportedFlags; | ||
| 2055 | |||
| 2056 | #define Z7_BLAKE2SP_Compress_Fast(p) (p)->u.header.func_Compress_Fast | ||
| 2057 | #define Z7_BLAKE2SP_Compress_Single(p) (p)->u.header.func_Compress_Single | ||
| 2058 | #else | ||
| 2059 | #define Z7_BLAKE2SP_Compress_Fast(p) Blake2sp_Compress2 | ||
| 2060 | #define Z7_BLAKE2SP_Compress_Single(p) Blake2sp_Compress2 | ||
| 2061 | #endif // Z7_BLAKE2S_USE_VECTORS | ||
| 2062 | |||
| 161 | 2063 | ||
| 162 | /* we need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0() */ | 2064 | #if 1 && defined(MY_CPU_LE) |
| 2065 | #define GET_DIGEST(_s, _digest) \ | ||
| 2066 | { memcpy(_digest, _s, Z7_BLAKE2S_DIGEST_SIZE); } | ||
| 2067 | #else | ||
| 2068 | #define GET_DIGEST(_s, _digest) \ | ||
| 2069 | { unsigned _i; for (_i = 0; _i < 8; _i++) \ | ||
| 2070 | { SetUi32((_digest) + 4 * _i, (_s)[_i]) } \ | ||
| 2071 | } | ||
| 2072 | #endif | ||
| 2073 | |||
| 2074 | |||
| 2075 | /* ---------- BLAKE2s ---------- */ | ||
| 163 | /* | 2076 | /* |
| 2077 | // we need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0() | ||
| 164 | typedef struct | 2078 | typedef struct |
| 165 | { | 2079 | { |
| 166 | Byte digest_length; | 2080 | Byte digest_length; |
| 167 | Byte key_length; | 2081 | Byte key_length; |
| 168 | Byte fanout; | 2082 | Byte fanout; // = 1 : in sequential mode |
| 169 | Byte depth; | 2083 | Byte depth; // = 1 : in sequential mode |
| 170 | UInt32 leaf_length; | 2084 | UInt32 leaf_length; |
| 171 | Byte node_offset[6]; | 2085 | Byte node_offset[6]; // 0 for the first, leftmost, leaf, or in sequential mode |
| 172 | Byte node_depth; | 2086 | Byte node_depth; // 0 for the leaves, or in sequential mode |
| 173 | Byte inner_length; | 2087 | Byte inner_length; // [0, 32], 0 in sequential mode |
| 174 | Byte salt[BLAKE2S_SALTBYTES]; | 2088 | Byte salt[BLAKE2S_SALTBYTES]; |
| 175 | Byte personal[BLAKE2S_PERSONALBYTES]; | 2089 | Byte personal[BLAKE2S_PERSONALBYTES]; |
| 176 | } CBlake2sParam; | 2090 | } CBlake2sParam; |
| 177 | */ | 2091 | */ |
| 178 | 2092 | ||
| 2093 | #define k_Blake2sp_IV_0 \ | ||
| 2094 | (KIV(0) ^ (Z7_BLAKE2S_DIGEST_SIZE | ((UInt32)Z7_BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24))) | ||
| 2095 | #define k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth) \ | ||
| 2096 | (KIV(3) ^ ((UInt32)(node_depth) << 16) ^ ((UInt32)Z7_BLAKE2S_DIGEST_SIZE << 24)) | ||
| 179 | 2097 | ||
| 180 | static void Blake2sp_Init_Spec(CBlake2s *p, unsigned node_offset, unsigned node_depth) | 2098 | Z7_FORCE_INLINE |
| 2099 | static void Blake2sp_Init_Spec(UInt32 *s, unsigned node_offset, unsigned node_depth) | ||
| 181 | { | 2100 | { |
| 182 | Blake2s_Init0(p); | 2101 | s[0] = k_Blake2sp_IV_0; |
| 183 | 2102 | s[1] = KIV(1); | |
| 184 | p->h[0] ^= (BLAKE2S_DIGEST_SIZE | ((UInt32)BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24)); | 2103 | s[2] = KIV(2) ^ (UInt32)node_offset; |
| 185 | p->h[2] ^= ((UInt32)node_offset); | 2104 | s[3] = k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth); |
| 186 | p->h[3] ^= ((UInt32)node_depth << 16) | ((UInt32)BLAKE2S_DIGEST_SIZE << 24); | 2105 | s[4] = KIV(4); |
| 187 | /* | 2106 | s[5] = KIV(5); |
| 188 | P->digest_length = BLAKE2S_DIGEST_SIZE; | 2107 | s[6] = KIV(6); |
| 189 | P->key_length = 0; | 2108 | s[7] = KIV(7); |
| 190 | P->fanout = BLAKE2SP_PARALLEL_DEGREE; | 2109 | |
| 191 | P->depth = 2; | 2110 | STATE_T(s)[0] = 0; |
| 192 | P->leaf_length = 0; | 2111 | STATE_T(s)[1] = 0; |
| 193 | store48(P->node_offset, node_offset); | 2112 | STATE_F(s)[0] = 0; |
| 194 | P->node_depth = node_depth; | 2113 | STATE_F(s)[1] = 0; |
| 195 | P->inner_length = BLAKE2S_DIGEST_SIZE; | ||
| 196 | */ | ||
| 197 | } | 2114 | } |
| 198 | 2115 | ||
| 199 | 2116 | ||
| 2117 | #ifdef Z7_BLAKE2S_USE_V128_FAST | ||
| 2118 | |||
| 2119 | static | ||
| 2120 | Z7_NO_INLINE | ||
| 2121 | #ifdef BLAKE2S_ATTRIB_128BIT | ||
| 2122 | BLAKE2S_ATTRIB_128BIT | ||
| 2123 | #endif | ||
| 2124 | void | ||
| 2125 | Z7_FASTCALL | ||
| 2126 | Blake2sp_InitState_V128_Fast(UInt32 *states) | ||
| 2127 | { | ||
| 2128 | #define STORE_128_PAIR_INIT_STATES_2(i, t0, t1) \ | ||
| 2129 | { STORE_128_TO_STRUCT(states + 0 + 4 * (i), (t0)); \ | ||
| 2130 | STORE_128_TO_STRUCT(states + 32 + 4 * (i), (t1)); \ | ||
| 2131 | } | ||
| 2132 | #define STORE_128_PAIR_INIT_STATES_1(i, mac) \ | ||
| 2133 | { const __m128i t = mac; \ | ||
| 2134 | STORE_128_PAIR_INIT_STATES_2(i, t, t) \ | ||
| 2135 | } | ||
| 2136 | #define STORE_128_PAIR_INIT_STATES_IV(i) \ | ||
| 2137 | STORE_128_PAIR_INIT_STATES_1(i, GET_128_IV_WAY4(i)) | ||
| 2138 | |||
| 2139 | STORE_128_PAIR_INIT_STATES_1 (0, _mm_set1_epi32((Int32)k_Blake2sp_IV_0)) | ||
| 2140 | STORE_128_PAIR_INIT_STATES_IV (1) | ||
| 2141 | { | ||
| 2142 | const __m128i t = GET_128_IV_WAY4(2); | ||
| 2143 | STORE_128_PAIR_INIT_STATES_2 (2, | ||
| 2144 | XOR_128(t, _mm_set_epi32(3, 2, 1, 0)), | ||
| 2145 | XOR_128(t, _mm_set_epi32(7, 6, 5, 4))) | ||
| 2146 | } | ||
| 2147 | STORE_128_PAIR_INIT_STATES_1 (3, _mm_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0))) | ||
| 2148 | STORE_128_PAIR_INIT_STATES_IV (4) | ||
| 2149 | STORE_128_PAIR_INIT_STATES_IV (5) | ||
| 2150 | STORE_128_PAIR_INIT_STATES_IV (6) | ||
| 2151 | STORE_128_PAIR_INIT_STATES_IV (7) | ||
| 2152 | STORE_128_PAIR_INIT_STATES_1 (16, _mm_set_epi32(0, 0, 0, 0)) | ||
| 2153 | // printf("\n== exit Blake2sp_InitState_V128_Fast ctr=%d\n", states[64]); | ||
| 2154 | } | ||
| 2155 | |||
| 2156 | #endif // Z7_BLAKE2S_USE_V128_FAST | ||
| 2157 | |||
| 2158 | |||
| 2159 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
| 2160 | |||
| 2161 | static | ||
| 2162 | Z7_NO_INLINE | ||
| 2163 | #ifdef BLAKE2S_ATTRIB_AVX2 | ||
| 2164 | BLAKE2S_ATTRIB_AVX2 | ||
| 2165 | #endif | ||
| 2166 | void | ||
| 2167 | Z7_FASTCALL | ||
| 2168 | Blake2sp_InitState_AVX2_Fast(UInt32 *states) | ||
| 2169 | { | ||
| 2170 | #define STORE_256_INIT_STATES(i, t) \ | ||
| 2171 | STORE_256_TO_STRUCT(states + 8 * (i), t); | ||
| 2172 | #define STORE_256_INIT_STATES_IV(i) \ | ||
| 2173 | STORE_256_INIT_STATES(i, GET_256_IV_WAY8(i)) | ||
| 2174 | |||
| 2175 | STORE_256_INIT_STATES (0, _mm256_set1_epi32((Int32)k_Blake2sp_IV_0)) | ||
| 2176 | STORE_256_INIT_STATES_IV (1) | ||
| 2177 | STORE_256_INIT_STATES (2, XOR_256( GET_256_IV_WAY8(2), | ||
| 2178 | _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0))) | ||
| 2179 | STORE_256_INIT_STATES (3, _mm256_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0))) | ||
| 2180 | STORE_256_INIT_STATES_IV (4) | ||
| 2181 | STORE_256_INIT_STATES_IV (5) | ||
| 2182 | STORE_256_INIT_STATES_IV (6) | ||
| 2183 | STORE_256_INIT_STATES_IV (7) | ||
| 2184 | STORE_256_INIT_STATES (8, _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0)) | ||
| 2185 | // printf("\n== exit Blake2sp_InitState_AVX2_Fast\n"); | ||
| 2186 | } | ||
| 2187 | |||
| 2188 | #endif // Z7_BLAKE2S_USE_AVX2_FAST | ||
| 2189 | |||
| 2190 | |||
| 2191 | |||
| 2192 | Z7_NO_INLINE | ||
| 2193 | void Blake2sp_InitState(CBlake2sp *p) | ||
| 2194 | { | ||
| 2195 | size_t i; | ||
| 2196 | // memset(p->states, 0, sizeof(p->states)); // for debug | ||
| 2197 | p->u.header.cycPos = 0; | ||
| 2198 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS | ||
| 2199 | if (p->u.header.func_Init) | ||
| 2200 | { | ||
| 2201 | p->u.header.func_Init(p->states); | ||
| 2202 | return; | ||
| 2203 | } | ||
| 2204 | #endif | ||
| 2205 | for (i = 0; i < Z7_BLAKE2SP_PARALLEL_DEGREE; i++) | ||
| 2206 | Blake2sp_Init_Spec(p->states + i * NSW, (unsigned)i, 0); | ||
| 2207 | } | ||
| 2208 | |||
| 200 | void Blake2sp_Init(CBlake2sp *p) | 2209 | void Blake2sp_Init(CBlake2sp *p) |
| 201 | { | 2210 | { |
| 202 | unsigned i; | 2211 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS |
| 203 | 2212 | p->u.header.func_Compress_Fast = | |
| 204 | p->bufPos = 0; | 2213 | #ifdef Z7_BLAKE2S_USE_VECTORS |
| 2214 | g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast; | ||
| 2215 | #else | ||
| 2216 | NULL; | ||
| 2217 | #endif | ||
| 2218 | |||
| 2219 | p->u.header.func_Compress_Single = | ||
| 2220 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
| 2221 | g_Z7_BLAKE2SP_FUNC_COMPRESS_Single; | ||
| 2222 | #else | ||
| 2223 | NULL; | ||
| 2224 | #endif | ||
| 2225 | |||
| 2226 | p->u.header.func_Init = | ||
| 2227 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
| 2228 | g_Z7_BLAKE2SP_FUNC_INIT_Init; | ||
| 2229 | #else | ||
| 2230 | NULL; | ||
| 2231 | #endif | ||
| 205 | 2232 | ||
| 206 | for (i = 0; i < BLAKE2SP_PARALLEL_DEGREE; i++) | 2233 | p->u.header.func_Final = |
| 207 | Blake2sp_Init_Spec(&p->S[i], i, 0); | 2234 | #ifdef Z7_BLAKE2S_USE_VECTORS |
| 2235 | g_Z7_BLAKE2SP_FUNC_INIT_Final; | ||
| 2236 | #else | ||
| 2237 | NULL; | ||
| 2238 | #endif | ||
| 2239 | #endif | ||
| 208 | 2240 | ||
| 209 | p->S[BLAKE2SP_PARALLEL_DEGREE - 1].lastNode_f1 = BLAKE2S_FINAL_FLAG; | 2241 | Blake2sp_InitState(p); |
| 210 | } | 2242 | } |
| 211 | 2243 | ||
| 212 | 2244 | ||
| 213 | void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size) | 2245 | void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size) |
| 214 | { | 2246 | { |
| 215 | unsigned pos = p->bufPos; | 2247 | size_t pos; |
| 216 | while (size != 0) | 2248 | // printf("\nsize = 0x%6x, cycPos = %5u data = %p\n", (unsigned)size, (unsigned)p->u.header.cycPos, data); |
| 2249 | if (size == 0) | ||
| 2250 | return; | ||
| 2251 | pos = p->u.header.cycPos; | ||
| 2252 | // pos < SUPER_BLOCK_SIZE * 2 : is expected | ||
| 2253 | // pos == SUPER_BLOCK_SIZE * 2 : is not expected, but is supported also | ||
| 2254 | { | ||
| 2255 | const size_t pos2 = pos & SUPER_BLOCK_MASK; | ||
| 2256 | if (pos2) | ||
| 2257 | { | ||
| 2258 | const size_t rem = SUPER_BLOCK_SIZE - pos2; | ||
| 2259 | if (rem > size) | ||
| 2260 | { | ||
| 2261 | p->u.header.cycPos = (unsigned)(pos + size); | ||
| 2262 | // cycPos < SUPER_BLOCK_SIZE * 2 | ||
| 2263 | memcpy((Byte *)(void *)p->buf32 + pos, data, size); | ||
| 2264 | /* to simpilify the code here we don't try to process first superblock, | ||
| 2265 | if (cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) */ | ||
| 2266 | return; | ||
| 2267 | } | ||
| 2268 | // (rem <= size) | ||
| 2269 | memcpy((Byte *)(void *)p->buf32 + pos, data, rem); | ||
| 2270 | pos += rem; | ||
| 2271 | data += rem; | ||
| 2272 | size -= rem; | ||
| 2273 | } | ||
| 2274 | } | ||
| 2275 | |||
| 2276 | // pos <= SUPER_BLOCK_SIZE * 2 | ||
| 2277 | // pos % SUPER_BLOCK_SIZE == 0 | ||
| 2278 | if (pos) | ||
| 2279 | { | ||
| 2280 | /* pos == SUPER_BLOCK_SIZE || | ||
| 2281 | pos == SUPER_BLOCK_SIZE * 2 */ | ||
| 2282 | size_t end = pos; | ||
| 2283 | if (size > SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE | ||
| 2284 | || (end -= SUPER_BLOCK_SIZE)) | ||
| 2285 | { | ||
| 2286 | Z7_BLAKE2SP_Compress_Fast(p)(p->states, | ||
| 2287 | (const Byte *)(const void *)p->buf32, | ||
| 2288 | (const Byte *)(const void *)p->buf32 + end); | ||
| 2289 | if (pos -= end) | ||
| 2290 | memcpy(p->buf32, (const Byte *)(const void *)p->buf32 | ||
| 2291 | + SUPER_BLOCK_SIZE, SUPER_BLOCK_SIZE); | ||
| 2292 | } | ||
| 2293 | } | ||
| 2294 | |||
| 2295 | // pos == 0 || (pos == SUPER_BLOCK_SIZE && size <= SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE) | ||
| 2296 | if (size > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) | ||
| 2297 | { | ||
| 2298 | // pos == 0 | ||
| 2299 | const Byte *end; | ||
| 2300 | const size_t size2 = (size - (SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE + 1)) | ||
| 2301 | & ~(size_t)SUPER_BLOCK_MASK; | ||
| 2302 | size -= size2; | ||
| 2303 | // size < SUPER_BLOCK_SIZE * 2 | ||
| 2304 | end = data + size2; | ||
| 2305 | Z7_BLAKE2SP_Compress_Fast(p)(p->states, data, end); | ||
| 2306 | data = end; | ||
| 2307 | } | ||
| 2308 | |||
| 2309 | if (size != 0) | ||
| 217 | { | 2310 | { |
| 218 | unsigned index = pos / BLAKE2S_BLOCK_SIZE; | 2311 | memcpy((Byte *)(void *)p->buf32 + pos, data, size); |
| 219 | unsigned rem = BLAKE2S_BLOCK_SIZE - (pos & (BLAKE2S_BLOCK_SIZE - 1)); | 2312 | pos += size; |
| 220 | if (rem > size) | ||
| 221 | rem = (unsigned)size; | ||
| 222 | Blake2s_Update(&p->S[index], data, rem); | ||
| 223 | size -= rem; | ||
| 224 | data += rem; | ||
| 225 | pos += rem; | ||
| 226 | pos &= (BLAKE2S_BLOCK_SIZE * BLAKE2SP_PARALLEL_DEGREE - 1); | ||
| 227 | } | 2313 | } |
| 228 | p->bufPos = pos; | 2314 | p->u.header.cycPos = (unsigned)pos; |
| 2315 | // cycPos < SUPER_BLOCK_SIZE * 2 | ||
| 229 | } | 2316 | } |
| 230 | 2317 | ||
| 231 | 2318 | ||
| 232 | void Blake2sp_Final(CBlake2sp *p, Byte *digest) | 2319 | void Blake2sp_Final(CBlake2sp *p, Byte *digest) |
| 233 | { | 2320 | { |
| 234 | CBlake2s R; | 2321 | // UInt32 * const R_states = p->states; |
| 235 | unsigned i; | 2322 | // printf("\nBlake2sp_Final \n"); |
| 2323 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS | ||
| 2324 | if (p->u.header.func_Final) | ||
| 2325 | p->u.header.func_Final(p->states); | ||
| 2326 | #endif | ||
| 2327 | // printf("\n=====\nBlake2sp_Final \n"); | ||
| 2328 | // PrintStates(p->states, 32); | ||
| 2329 | |||
| 2330 | // (p->u.header.cycPos == SUPER_BLOCK_SIZE) can be processed in any branch: | ||
| 2331 | if (p->u.header.cycPos <= SUPER_BLOCK_SIZE) | ||
| 2332 | { | ||
| 2333 | unsigned pos; | ||
| 2334 | memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, | ||
| 2335 | 0, SUPER_BLOCK_SIZE - p->u.header.cycPos); | ||
| 2336 | STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG; | ||
| 2337 | for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE) | ||
| 2338 | { | ||
| 2339 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos); | ||
| 2340 | Blake2s_Set_LastBlock(s) | ||
| 2341 | if (pos + Z7_BLAKE2S_BLOCK_SIZE > p->u.header.cycPos) | ||
| 2342 | { | ||
| 2343 | UInt32 delta = Z7_BLAKE2S_BLOCK_SIZE; | ||
| 2344 | if (pos < p->u.header.cycPos) | ||
| 2345 | delta -= p->u.header.cycPos & (Z7_BLAKE2S_BLOCK_SIZE - 1); | ||
| 2346 | // 0 < delta <= Z7_BLAKE2S_BLOCK_SIZE | ||
| 2347 | { | ||
| 2348 | const UInt32 v = STATE_T(s)[0]; | ||
| 2349 | STATE_T(s)[1] -= v < delta; // (v < delta) is same condition here as (v == 0) | ||
| 2350 | STATE_T(s)[0] = v - delta; | ||
| 2351 | } | ||
| 2352 | } | ||
| 2353 | } | ||
| 2354 | // PrintStates(p->states, 16); | ||
| 2355 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
| 2356 | (Byte *)(void *)p->buf32, | ||
| 2357 | (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE); | ||
| 2358 | // PrintStates(p->states, 16); | ||
| 2359 | } | ||
| 2360 | else | ||
| 2361 | { | ||
| 2362 | // (p->u.header.cycPos > SUPER_BLOCK_SIZE) | ||
| 2363 | unsigned pos; | ||
| 2364 | for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE) | ||
| 2365 | { | ||
| 2366 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos); | ||
| 2367 | if (pos + SUPER_BLOCK_SIZE >= p->u.header.cycPos) | ||
| 2368 | Blake2s_Set_LastBlock(s) | ||
| 2369 | } | ||
| 2370 | if (p->u.header.cycPos <= SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) | ||
| 2371 | STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG; | ||
| 2372 | |||
| 2373 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
| 2374 | (Byte *)(void *)p->buf32, | ||
| 2375 | (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE); | ||
| 236 | 2376 | ||
| 237 | Blake2sp_Init_Spec(&R, 0, 1); | 2377 | // if (p->u.header.cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE; |
| 238 | R.lastNode_f1 = BLAKE2S_FINAL_FLAG; | 2378 | STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG; |
| 2379 | |||
| 2380 | // if (p->u.header.cycPos != SUPER_BLOCK_SIZE) | ||
| 2381 | { | ||
| 2382 | pos = SUPER_BLOCK_SIZE; | ||
| 2383 | for (;;) | ||
| 2384 | { | ||
| 2385 | UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos & SUPER_BLOCK_MASK); | ||
| 2386 | Blake2s_Set_LastBlock(s) | ||
| 2387 | pos += Z7_BLAKE2S_BLOCK_SIZE; | ||
| 2388 | if (pos >= p->u.header.cycPos) | ||
| 2389 | { | ||
| 2390 | if (pos != p->u.header.cycPos) | ||
| 2391 | { | ||
| 2392 | const UInt32 delta = pos - p->u.header.cycPos; | ||
| 2393 | const UInt32 v = STATE_T(s)[0]; | ||
| 2394 | STATE_T(s)[1] -= v < delta; | ||
| 2395 | STATE_T(s)[0] = v - delta; | ||
| 2396 | memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, 0, delta); | ||
| 2397 | } | ||
| 2398 | break; | ||
| 2399 | } | ||
| 2400 | } | ||
| 2401 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
| 2402 | (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE, | ||
| 2403 | (Byte *)(void *)p->buf32 + pos); | ||
| 2404 | } | ||
| 2405 | } | ||
| 239 | 2406 | ||
| 240 | for (i = 0; i < BLAKE2SP_PARALLEL_DEGREE; i++) | ||
| 241 | { | 2407 | { |
| 242 | Byte hash[BLAKE2S_DIGEST_SIZE]; | 2408 | size_t pos; |
| 243 | Blake2s_Final(&p->S[i], hash); | 2409 | for (pos = 0; pos < SUPER_BLOCK_SIZE / 2; pos += Z7_BLAKE2S_BLOCK_SIZE / 2) |
| 244 | Blake2s_Update(&R, hash, BLAKE2S_DIGEST_SIZE); | 2410 | { |
| 2411 | const UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, (pos * 2)); | ||
| 2412 | Byte *dest = (Byte *)(void *)p->buf32 + pos; | ||
| 2413 | GET_DIGEST(s, dest) | ||
| 2414 | } | ||
| 245 | } | 2415 | } |
| 2416 | Blake2sp_Init_Spec(p->states, 0, 1); | ||
| 2417 | { | ||
| 2418 | size_t pos; | ||
| 2419 | for (pos = 0; pos < (Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2S_DIGEST_SIZE) | ||
| 2420 | - Z7_BLAKE2S_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE) | ||
| 2421 | { | ||
| 2422 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
| 2423 | (const Byte *)(const void *)p->buf32 + pos, | ||
| 2424 | (const Byte *)(const void *)p->buf32 + pos + Z7_BLAKE2S_BLOCK_SIZE); | ||
| 2425 | } | ||
| 2426 | } | ||
| 2427 | // Blake2s_Final(p->states, 0, digest, p, (Byte *)(void *)p->buf32 + i); | ||
| 2428 | Blake2s_Set_LastBlock(p->states) | ||
| 2429 | STATE_F(p->states)[1] = BLAKE2S_FINAL_FLAG; | ||
| 2430 | { | ||
| 2431 | Z7_BLAKE2SP_Compress_Single(p)(p->states, | ||
| 2432 | (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE, | ||
| 2433 | (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE); | ||
| 2434 | } | ||
| 2435 | GET_DIGEST(p->states, digest) | ||
| 2436 | // printf("\n Blake2sp_Final 555 numDataInBufs = %5u\n", (unsigned)p->u.header.numDataInBufs); | ||
| 2437 | } | ||
| 2438 | |||
| 246 | 2439 | ||
| 247 | Blake2s_Final(&R, digest); | 2440 | BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo) |
| 2441 | { | ||
| 2442 | // printf("\n========== setfunction = %d ======== \n", algo); | ||
| 2443 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS | ||
| 2444 | Z7_BLAKE2SP_FUNC_COMPRESS func = NULL; | ||
| 2445 | Z7_BLAKE2SP_FUNC_COMPRESS func_Single = NULL; | ||
| 2446 | Z7_BLAKE2SP_FUNC_INIT func_Final = NULL; | ||
| 2447 | Z7_BLAKE2SP_FUNC_INIT func_Init = NULL; | ||
| 2448 | #else | ||
| 2449 | UNUSED_VAR(p) | ||
| 2450 | #endif | ||
| 2451 | |||
| 2452 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
| 2453 | |||
| 2454 | func = func_Single = Blake2sp_Compress2; | ||
| 2455 | |||
| 2456 | if (algo != Z7_BLAKE2SP_ALGO_SCALAR) | ||
| 2457 | { | ||
| 2458 | // printf("\n========== setfunction NON-SCALER ======== \n"); | ||
| 2459 | if (algo == Z7_BLAKE2SP_ALGO_DEFAULT) | ||
| 2460 | { | ||
| 2461 | func = g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast; | ||
| 2462 | func_Single = g_Z7_BLAKE2SP_FUNC_COMPRESS_Single; | ||
| 2463 | func_Init = g_Z7_BLAKE2SP_FUNC_INIT_Init; | ||
| 2464 | func_Final = g_Z7_BLAKE2SP_FUNC_INIT_Final; | ||
| 2465 | } | ||
| 2466 | else | ||
| 2467 | { | ||
| 2468 | if ((g_z7_Blake2sp_SupportedFlags & (1u << algo)) == 0) | ||
| 2469 | return False; | ||
| 2470 | |||
| 2471 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
| 2472 | |||
| 2473 | func_Single = | ||
| 2474 | #if defined(Z7_BLAKE2S_USE_AVX2_WAY2) | ||
| 2475 | Blake2sp_Compress2_AVX2_Way2; | ||
| 2476 | #else | ||
| 2477 | Z7_BLAKE2S_Compress2_V128; | ||
| 2478 | #endif | ||
| 2479 | |||
| 2480 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
| 2481 | if (algo == Z7_BLAKE2SP_ALGO_V256_FAST) | ||
| 2482 | { | ||
| 2483 | func = Blake2sp_Compress2_AVX2_Fast; | ||
| 2484 | func_Final = Blake2sp_Final_AVX2_Fast; | ||
| 2485 | func_Init = Blake2sp_InitState_AVX2_Fast; | ||
| 2486 | } | ||
| 2487 | else | ||
| 2488 | #endif | ||
| 2489 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY2 | ||
| 2490 | if (algo == Z7_BLAKE2SP_ALGO_V256_WAY2) | ||
| 2491 | func = Blake2sp_Compress2_AVX2_Way2; | ||
| 2492 | else | ||
| 2493 | #endif | ||
| 2494 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY4 | ||
| 2495 | if (algo == Z7_BLAKE2SP_ALGO_V256_WAY4) | ||
| 2496 | { | ||
| 2497 | func_Single = func = Blake2sp_Compress2_AVX2_Way4; | ||
| 2498 | } | ||
| 2499 | else | ||
| 2500 | #endif | ||
| 2501 | #endif // avx2 | ||
| 2502 | { | ||
| 2503 | if (algo == Z7_BLAKE2SP_ALGO_V128_FAST) | ||
| 2504 | { | ||
| 2505 | func = Blake2sp_Compress2_V128_Fast; | ||
| 2506 | func_Final = Blake2sp_Final_V128_Fast; | ||
| 2507 | func_Init = Blake2sp_InitState_V128_Fast; | ||
| 2508 | func_Single = Z7_BLAKE2S_Compress2_V128; | ||
| 2509 | } | ||
| 2510 | else | ||
| 2511 | #ifdef Z7_BLAKE2S_USE_V128_WAY2 | ||
| 2512 | if (algo == Z7_BLAKE2SP_ALGO_V128_WAY2) | ||
| 2513 | func = func_Single = Blake2sp_Compress2_V128_Way2; | ||
| 2514 | else | ||
| 2515 | #endif | ||
| 2516 | { | ||
| 2517 | if (algo != Z7_BLAKE2SP_ALGO_V128_WAY1) | ||
| 2518 | return False; | ||
| 2519 | func = func_Single = Blake2sp_Compress2_V128_Way1; | ||
| 2520 | } | ||
| 2521 | } | ||
| 2522 | } | ||
| 2523 | } | ||
| 2524 | #else // !VECTORS | ||
| 2525 | if (algo > 1) // Z7_BLAKE2SP_ALGO_SCALAR | ||
| 2526 | return False; | ||
| 2527 | #endif // !VECTORS | ||
| 2528 | |||
| 2529 | #ifdef Z7_BLAKE2SP_USE_FUNCTIONS | ||
| 2530 | p->u.header.func_Compress_Fast = func; | ||
| 2531 | p->u.header.func_Compress_Single = func_Single; | ||
| 2532 | p->u.header.func_Final = func_Final; | ||
| 2533 | p->u.header.func_Init = func_Init; | ||
| 2534 | #endif | ||
| 2535 | // printf("\n p->u.header.func_Compress = %p", p->u.header.func_Compress); | ||
| 2536 | return True; | ||
| 2537 | } | ||
| 2538 | |||
| 2539 | |||
| 2540 | void z7_Black2sp_Prepare(void) | ||
| 2541 | { | ||
| 2542 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
| 2543 | unsigned flags = 0; // (1u << Z7_BLAKE2SP_ALGO_V128_SCALAR); | ||
| 2544 | |||
| 2545 | Z7_BLAKE2SP_FUNC_COMPRESS func_Fast = Blake2sp_Compress2; | ||
| 2546 | Z7_BLAKE2SP_FUNC_COMPRESS func_Single = Blake2sp_Compress2; | ||
| 2547 | Z7_BLAKE2SP_FUNC_INIT func_Init = NULL; | ||
| 2548 | Z7_BLAKE2SP_FUNC_INIT func_Final = NULL; | ||
| 2549 | |||
| 2550 | #if defined(MY_CPU_X86_OR_AMD64) | ||
| 2551 | #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) | ||
| 2552 | if (CPU_IsSupported_AVX512F_AVX512VL()) | ||
| 2553 | #endif | ||
| 2554 | #if defined(Z7_BLAKE2S_USE_SSE41) | ||
| 2555 | if (CPU_IsSupported_SSE41()) | ||
| 2556 | #elif defined(Z7_BLAKE2S_USE_SSSE3) | ||
| 2557 | if (CPU_IsSupported_SSSE3()) | ||
| 2558 | #elif !defined(MY_CPU_AMD64) | ||
| 2559 | if (CPU_IsSupported_SSE2()) | ||
| 2560 | #endif | ||
| 2561 | #endif | ||
| 2562 | { | ||
| 2563 | #if defined(Z7_BLAKE2S_USE_SSE41) | ||
| 2564 | // printf("\n========== Blake2s SSE41 128-bit\n"); | ||
| 2565 | #elif defined(Z7_BLAKE2S_USE_SSSE3) | ||
| 2566 | // printf("\n========== Blake2s SSSE3 128-bit\n"); | ||
| 2567 | #else | ||
| 2568 | // printf("\n========== Blake2s SSE2 128-bit\n"); | ||
| 2569 | #endif | ||
| 2570 | // func_Fast = f_vector = Blake2sp_Compress2_V128_Way2; | ||
| 2571 | // printf("\n========== Blake2sp_Compress2_V128_Way2\n"); | ||
| 2572 | func_Fast = | ||
| 2573 | func_Single = Z7_BLAKE2S_Compress2_V128; | ||
| 2574 | flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY1); | ||
| 2575 | #ifdef Z7_BLAKE2S_USE_V128_WAY2 | ||
| 2576 | flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY2); | ||
| 2577 | #endif | ||
| 2578 | #ifdef Z7_BLAKE2S_USE_V128_FAST | ||
| 2579 | flags |= (1u << Z7_BLAKE2SP_ALGO_V128_FAST); | ||
| 2580 | func_Fast = Blake2sp_Compress2_V128_Fast; | ||
| 2581 | func_Init = Blake2sp_InitState_V128_Fast; | ||
| 2582 | func_Final = Blake2sp_Final_V128_Fast; | ||
| 2583 | #endif | ||
| 2584 | |||
| 2585 | #ifdef Z7_BLAKE2S_USE_AVX2 | ||
| 2586 | #if defined(MY_CPU_X86_OR_AMD64) | ||
| 2587 | if ( | ||
| 2588 | #if 0 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS) | ||
| 2589 | CPU_IsSupported_AVX512F_AVX512VL() && | ||
| 2590 | #endif | ||
| 2591 | CPU_IsSupported_AVX2() | ||
| 2592 | ) | ||
| 2593 | #endif | ||
| 2594 | { | ||
| 2595 | // #pragma message ("=== Blake2s AVX2") | ||
| 2596 | // printf("\n========== Blake2s AVX2\n"); | ||
| 2597 | |||
| 2598 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY2 | ||
| 2599 | func_Single = Blake2sp_Compress2_AVX2_Way2; | ||
| 2600 | flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY2); | ||
| 2601 | #endif | ||
| 2602 | #ifdef Z7_BLAKE2S_USE_AVX2_WAY4 | ||
| 2603 | flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY4); | ||
| 2604 | #endif | ||
| 2605 | |||
| 2606 | #ifdef Z7_BLAKE2S_USE_AVX2_FAST | ||
| 2607 | flags |= (1u << Z7_BLAKE2SP_ALGO_V256_FAST); | ||
| 2608 | func_Fast = Blake2sp_Compress2_AVX2_Fast; | ||
| 2609 | func_Init = Blake2sp_InitState_AVX2_Fast; | ||
| 2610 | func_Final = Blake2sp_Final_AVX2_Fast; | ||
| 2611 | #elif defined(Z7_BLAKE2S_USE_AVX2_WAY4) | ||
| 2612 | func_Fast = Blake2sp_Compress2_AVX2_Way4; | ||
| 2613 | #elif defined(Z7_BLAKE2S_USE_AVX2_WAY2) | ||
| 2614 | func_Fast = Blake2sp_Compress2_AVX2_Way2; | ||
| 2615 | #endif | ||
| 2616 | } // avx2 | ||
| 2617 | #endif // avx2 | ||
| 2618 | } // sse* | ||
| 2619 | g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = func_Fast; | ||
| 2620 | g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = func_Single; | ||
| 2621 | g_Z7_BLAKE2SP_FUNC_INIT_Init = func_Init; | ||
| 2622 | g_Z7_BLAKE2SP_FUNC_INIT_Final = func_Final; | ||
| 2623 | g_z7_Blake2sp_SupportedFlags = flags; | ||
| 2624 | // printf("\nflags=%x\n", flags); | ||
| 2625 | #endif // vectors | ||
| 248 | } | 2626 | } |
| 249 | 2627 | ||
| 250 | #undef rotr32 | 2628 | /* |
| 2629 | #ifdef Z7_BLAKE2S_USE_VECTORS | ||
| 2630 | void align_test2(CBlake2sp *sp); | ||
| 2631 | void align_test2(CBlake2sp *sp) | ||
| 2632 | { | ||
| 2633 | __m128i a = LOAD_128(sp->states); | ||
| 2634 | D_XOR_128(a, LOAD_128(sp->states + 4)); | ||
| 2635 | STORE_128(sp->states, a); | ||
| 2636 | } | ||
| 2637 | void align_test2(void); | ||
| 2638 | void align_test2(void) | ||
| 2639 | { | ||
| 2640 | CBlake2sp sp; | ||
| 2641 | Blake2sp_Init(&sp); | ||
| 2642 | Blake2sp_Update(&sp, NULL, 0); | ||
| 2643 | } | ||
| 2644 | #endif | ||
| 2645 | */ | ||
| @@ -1,11 +1,11 @@ | |||
| 1 | /* Bra.c -- Branch converters for RISC code | 1 | /* Bra.c -- Branch converters for RISC code |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-20 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| 6 | #include "Bra.h" | 6 | #include "Bra.h" |
| 7 | #include "CpuArch.h" | ||
| 8 | #include "RotateDefs.h" | 7 | #include "RotateDefs.h" |
| 8 | #include "CpuArch.h" | ||
| 9 | 9 | ||
| 10 | #if defined(MY_CPU_SIZEOF_POINTER) \ | 10 | #if defined(MY_CPU_SIZEOF_POINTER) \ |
| 11 | && ( MY_CPU_SIZEOF_POINTER == 4 \ | 11 | && ( MY_CPU_SIZEOF_POINTER == 4 \ |
| @@ -26,7 +26,7 @@ | |||
| 26 | #define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c; | 26 | #define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c; |
| 27 | // #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c; | 27 | // #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c; |
| 28 | 28 | ||
| 29 | #define Z7_BRANCH_CONV(name) z7_BranchConv_ ## name | 29 | #define Z7_BRANCH_CONV(name) z7_ ## name |
| 30 | 30 | ||
| 31 | #define Z7_BRANCH_FUNC_MAIN(name) \ | 31 | #define Z7_BRANCH_FUNC_MAIN(name) \ |
| 32 | static \ | 32 | static \ |
| @@ -42,11 +42,11 @@ Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \ | |||
| 42 | 42 | ||
| 43 | #ifdef Z7_EXTRACT_ONLY | 43 | #ifdef Z7_EXTRACT_ONLY |
| 44 | #define Z7_BRANCH_FUNCS_IMP(name) \ | 44 | #define Z7_BRANCH_FUNCS_IMP(name) \ |
| 45 | Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC, 0) | 45 | Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) |
| 46 | #else | 46 | #else |
| 47 | #define Z7_BRANCH_FUNCS_IMP(name) \ | 47 | #define Z7_BRANCH_FUNCS_IMP(name) \ |
| 48 | Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC, 0) \ | 48 | Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) \ |
| 49 | Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC, 1) | 49 | Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC_2, 1) |
| 50 | #endif | 50 | #endif |
| 51 | 51 | ||
| 52 | #if defined(__clang__) | 52 | #if defined(__clang__) |
| @@ -72,7 +72,7 @@ Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \ | |||
| 72 | #endif | 72 | #endif |
| 73 | 73 | ||
| 74 | 74 | ||
| 75 | Z7_BRANCH_FUNC_MAIN(ARM64) | 75 | Z7_BRANCH_FUNC_MAIN(BranchConv_ARM64) |
| 76 | { | 76 | { |
| 77 | // Byte *p = data; | 77 | // Byte *p = data; |
| 78 | const Byte *lim; | 78 | const Byte *lim; |
| @@ -121,10 +121,10 @@ Z7_BRANCH_FUNC_MAIN(ARM64) | |||
| 121 | } | 121 | } |
| 122 | } | 122 | } |
| 123 | } | 123 | } |
| 124 | Z7_BRANCH_FUNCS_IMP(ARM64) | 124 | Z7_BRANCH_FUNCS_IMP(BranchConv_ARM64) |
| 125 | 125 | ||
| 126 | 126 | ||
| 127 | Z7_BRANCH_FUNC_MAIN(ARM) | 127 | Z7_BRANCH_FUNC_MAIN(BranchConv_ARM) |
| 128 | { | 128 | { |
| 129 | // Byte *p = data; | 129 | // Byte *p = data; |
| 130 | const Byte *lim; | 130 | const Byte *lim; |
| @@ -152,10 +152,10 @@ Z7_BRANCH_FUNC_MAIN(ARM) | |||
| 152 | } | 152 | } |
| 153 | } | 153 | } |
| 154 | } | 154 | } |
| 155 | Z7_BRANCH_FUNCS_IMP(ARM) | 155 | Z7_BRANCH_FUNCS_IMP(BranchConv_ARM) |
| 156 | 156 | ||
| 157 | 157 | ||
| 158 | Z7_BRANCH_FUNC_MAIN(PPC) | 158 | Z7_BRANCH_FUNC_MAIN(BranchConv_PPC) |
| 159 | { | 159 | { |
| 160 | // Byte *p = data; | 160 | // Byte *p = data; |
| 161 | const Byte *lim; | 161 | const Byte *lim; |
| @@ -192,14 +192,14 @@ Z7_BRANCH_FUNC_MAIN(PPC) | |||
| 192 | } | 192 | } |
| 193 | } | 193 | } |
| 194 | } | 194 | } |
| 195 | Z7_BRANCH_FUNCS_IMP(PPC) | 195 | Z7_BRANCH_FUNCS_IMP(BranchConv_PPC) |
| 196 | 196 | ||
| 197 | 197 | ||
| 198 | #ifdef Z7_CPU_FAST_ROTATE_SUPPORTED | 198 | #ifdef Z7_CPU_FAST_ROTATE_SUPPORTED |
| 199 | #define BR_SPARC_USE_ROTATE | 199 | #define BR_SPARC_USE_ROTATE |
| 200 | #endif | 200 | #endif |
| 201 | 201 | ||
| 202 | Z7_BRANCH_FUNC_MAIN(SPARC) | 202 | Z7_BRANCH_FUNC_MAIN(BranchConv_SPARC) |
| 203 | { | 203 | { |
| 204 | // Byte *p = data; | 204 | // Byte *p = data; |
| 205 | const Byte *lim; | 205 | const Byte *lim; |
| @@ -254,10 +254,10 @@ Z7_BRANCH_FUNC_MAIN(SPARC) | |||
| 254 | } | 254 | } |
| 255 | } | 255 | } |
| 256 | } | 256 | } |
| 257 | Z7_BRANCH_FUNCS_IMP(SPARC) | 257 | Z7_BRANCH_FUNCS_IMP(BranchConv_SPARC) |
| 258 | 258 | ||
| 259 | 259 | ||
| 260 | Z7_BRANCH_FUNC_MAIN(ARMT) | 260 | Z7_BRANCH_FUNC_MAIN(BranchConv_ARMT) |
| 261 | { | 261 | { |
| 262 | // Byte *p = data; | 262 | // Byte *p = data; |
| 263 | Byte *lim; | 263 | Byte *lim; |
| @@ -335,12 +335,12 @@ Z7_BRANCH_FUNC_MAIN(ARMT) | |||
| 335 | // return (Byte *)(lim + (((lim[1] ^ ~0xfu) & ~7u) == 0 ? 0 : 2)); | 335 | // return (Byte *)(lim + (((lim[1] ^ ~0xfu) & ~7u) == 0 ? 0 : 2)); |
| 336 | // return (Byte *)(lim + 2 - (((((unsigned)lim[1] ^ 8) + 8) >> 7) & 2)); | 336 | // return (Byte *)(lim + 2 - (((((unsigned)lim[1] ^ 8) + 8) >> 7) & 2)); |
| 337 | } | 337 | } |
| 338 | Z7_BRANCH_FUNCS_IMP(ARMT) | 338 | Z7_BRANCH_FUNCS_IMP(BranchConv_ARMT) |
| 339 | 339 | ||
| 340 | 340 | ||
| 341 | // #define BR_IA64_NO_INLINE | 341 | // #define BR_IA64_NO_INLINE |
| 342 | 342 | ||
| 343 | Z7_BRANCH_FUNC_MAIN(IA64) | 343 | Z7_BRANCH_FUNC_MAIN(BranchConv_IA64) |
| 344 | { | 344 | { |
| 345 | // Byte *p = data; | 345 | // Byte *p = data; |
| 346 | const Byte *lim; | 346 | const Byte *lim; |
| @@ -417,4 +417,293 @@ Z7_BRANCH_FUNC_MAIN(IA64) | |||
| 417 | } | 417 | } |
| 418 | } | 418 | } |
| 419 | } | 419 | } |
| 420 | Z7_BRANCH_FUNCS_IMP(IA64) | 420 | Z7_BRANCH_FUNCS_IMP(BranchConv_IA64) |
| 421 | |||
| 422 | |||
| 423 | #define BR_CONVERT_VAL_ENC(v) v += BR_PC_GET; | ||
| 424 | #define BR_CONVERT_VAL_DEC(v) v -= BR_PC_GET; | ||
| 425 | |||
| 426 | #if 1 && defined(MY_CPU_LE_UNALIGN) | ||
| 427 | #define RISCV_USE_UNALIGNED_LOAD | ||
| 428 | #endif | ||
| 429 | |||
| 430 | #ifdef RISCV_USE_UNALIGNED_LOAD | ||
| 431 | #define RISCV_GET_UI32(p) GetUi32(p) | ||
| 432 | #define RISCV_SET_UI32(p, v) { SetUi32(p, v) } | ||
| 433 | #else | ||
| 434 | #define RISCV_GET_UI32(p) \ | ||
| 435 | ((UInt32)GetUi16a(p) + \ | ||
| 436 | ((UInt32)GetUi16a((p) + 2) << 16)) | ||
| 437 | #define RISCV_SET_UI32(p, v) { \ | ||
| 438 | SetUi16a(p, (UInt16)(v)) \ | ||
| 439 | SetUi16a((p) + 2, (UInt16)(v >> 16)) } | ||
| 440 | #endif | ||
| 441 | |||
| 442 | #if 1 && defined(MY_CPU_LE) | ||
| 443 | #define RISCV_USE_16BIT_LOAD | ||
| 444 | #endif | ||
| 445 | |||
| 446 | #ifdef RISCV_USE_16BIT_LOAD | ||
| 447 | #define RISCV_LOAD_VAL(p) GetUi16a(p) | ||
| 448 | #else | ||
| 449 | #define RISCV_LOAD_VAL(p) (*(p)) | ||
| 450 | #endif | ||
| 451 | |||
| 452 | #define RISCV_INSTR_SIZE 2 | ||
| 453 | #define RISCV_STEP_1 (4 + RISCV_INSTR_SIZE) | ||
| 454 | #define RISCV_STEP_2 4 | ||
| 455 | #define RISCV_REG_VAL (2 << 7) | ||
| 456 | #define RISCV_CMD_VAL 3 | ||
| 457 | #if 1 | ||
| 458 | // for code size optimization: | ||
| 459 | #define RISCV_DELTA_7F 0x7f | ||
| 460 | #else | ||
| 461 | #define RISCV_DELTA_7F 0 | ||
| 462 | #endif | ||
| 463 | |||
| 464 | #define RISCV_CHECK_1(v, b) \ | ||
| 465 | (((((b) - RISCV_CMD_VAL) ^ ((v) << 8)) & (0xf8000 + RISCV_CMD_VAL)) == 0) | ||
| 466 | |||
| 467 | #if 1 | ||
| 468 | #define RISCV_CHECK_2(v, r) \ | ||
| 469 | ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL | 8)) \ | ||
| 470 | << 18) \ | ||
| 471 | < ((r) & 0x1d)) | ||
| 472 | #else | ||
| 473 | // this branch gives larger code, because | ||
| 474 | // compilers generate larger code for big constants. | ||
| 475 | #define RISCV_CHECK_2(v, r) \ | ||
| 476 | ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \ | ||
| 477 | & ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \ | ||
| 478 | < ((r) & 0x1d)) | ||
| 479 | #endif | ||
| 480 | |||
| 481 | |||
| 482 | #define RISCV_SCAN_LOOP \ | ||
| 483 | Byte *lim; \ | ||
| 484 | size &= ~(SizeT)(RISCV_INSTR_SIZE - 1); \ | ||
| 485 | if (size <= 6) return p; \ | ||
| 486 | size -= 6; \ | ||
| 487 | lim = p + size; \ | ||
| 488 | BR_PC_INIT \ | ||
| 489 | for (;;) \ | ||
| 490 | { \ | ||
| 491 | UInt32 a, v; \ | ||
| 492 | /* Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE */ \ | ||
| 493 | for (;;) \ | ||
| 494 | { \ | ||
| 495 | if Z7_UNLIKELY(p >= lim) { return p; } \ | ||
| 496 | a = (RISCV_LOAD_VAL(p) ^ 0x10u) + 1; \ | ||
| 497 | if ((a & 0x77) == 0) break; \ | ||
| 498 | a = (RISCV_LOAD_VAL(p + RISCV_INSTR_SIZE) ^ 0x10u) + 1; \ | ||
| 499 | p += RISCV_INSTR_SIZE * 2; \ | ||
| 500 | if ((a & 0x77) == 0) \ | ||
| 501 | { \ | ||
| 502 | p -= RISCV_INSTR_SIZE; \ | ||
| 503 | if Z7_UNLIKELY(p >= lim) { return p; } \ | ||
| 504 | break; \ | ||
| 505 | } \ | ||
| 506 | } | ||
| 507 | // (xx6f ^ 10) + 1 = xx7f + 1 = xx80 : JAL | ||
| 508 | // (xxef ^ 10) + 1 = xxff + 1 = xx00 + 100 : JAL | ||
| 509 | // (xx17 ^ 10) + 1 = xx07 + 1 = xx08 : AUIPC | ||
| 510 | // (xx97 ^ 10) + 1 = xx87 + 1 = xx88 : AUIPC | ||
| 511 | |||
| 512 | Byte * Z7_BRANCH_CONV_ENC(RISCV)(Byte *p, SizeT size, UInt32 pc) | ||
| 513 | { | ||
| 514 | RISCV_SCAN_LOOP | ||
| 515 | v = a; | ||
| 516 | a = RISCV_GET_UI32(p); | ||
| 517 | #ifndef RISCV_USE_16BIT_LOAD | ||
| 518 | v += (UInt32)p[1] << 8; | ||
| 519 | #endif | ||
| 520 | |||
| 521 | if ((v & 8) == 0) // JAL | ||
| 522 | { | ||
| 523 | if ((v - (0x100 /* - RISCV_DELTA_7F */)) & 0xd80) | ||
| 524 | { | ||
| 525 | p += RISCV_INSTR_SIZE; | ||
| 526 | continue; | ||
| 527 | } | ||
| 528 | { | ||
| 529 | v = ((a & 1u << 31) >> 11) | ||
| 530 | | ((a & 0x3ff << 21) >> 20) | ||
| 531 | | ((a & 1 << 20) >> 9) | ||
| 532 | | (a & 0xff << 12); | ||
| 533 | BR_CONVERT_VAL_ENC(v) | ||
| 534 | // ((v & 1) == 0) | ||
| 535 | // v: bits [1 : 20] contain offset bits | ||
| 536 | #if 0 && defined(RISCV_USE_UNALIGNED_LOAD) | ||
| 537 | a &= 0xfff; | ||
| 538 | a |= ((UInt32)(v << 23)) | ||
| 539 | | ((UInt32)(v << 7) & ((UInt32)0xff << 16)) | ||
| 540 | | ((UInt32)(v >> 5) & ((UInt32)0xf0 << 8)); | ||
| 541 | RISCV_SET_UI32(p, a) | ||
| 542 | #else // aligned | ||
| 543 | #if 0 | ||
| 544 | SetUi16a(p, (UInt16)(((v >> 5) & 0xf000) | (a & 0xfff))) | ||
| 545 | #else | ||
| 546 | p[1] = (Byte)(((v >> 13) & 0xf0) | ((a >> 8) & 0xf)); | ||
| 547 | #endif | ||
| 548 | |||
| 549 | #if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) | ||
| 550 | v <<= 15; | ||
| 551 | v = Z7_BSWAP32(v); | ||
| 552 | SetUi16a(p + 2, (UInt16)v) | ||
| 553 | #else | ||
| 554 | p[2] = (Byte)(v >> 9); | ||
| 555 | p[3] = (Byte)(v >> 1); | ||
| 556 | #endif | ||
| 557 | #endif // aligned | ||
| 558 | } | ||
| 559 | p += 4; | ||
| 560 | continue; | ||
| 561 | } // JAL | ||
| 562 | |||
| 563 | { | ||
| 564 | // AUIPC | ||
| 565 | if (v & 0xe80) // (not x0) and (not x2) | ||
| 566 | { | ||
| 567 | const UInt32 b = RISCV_GET_UI32(p + 4); | ||
| 568 | if (RISCV_CHECK_1(v, b)) | ||
| 569 | { | ||
| 570 | { | ||
| 571 | const UInt32 temp = (b << 12) | (0x17 + RISCV_REG_VAL); | ||
| 572 | RISCV_SET_UI32(p, temp) | ||
| 573 | } | ||
| 574 | a &= 0xfffff000; | ||
| 575 | { | ||
| 576 | #if 1 | ||
| 577 | const int t = -1 >> 1; | ||
| 578 | if (t != -1) | ||
| 579 | a += (b >> 20) - ((b >> 19) & 0x1000); // arithmetic right shift emulation | ||
| 580 | else | ||
| 581 | #endif | ||
| 582 | a += (UInt32)((Int32)b >> 20); // arithmetic right shift (sign-extension). | ||
| 583 | } | ||
| 584 | BR_CONVERT_VAL_ENC(a) | ||
| 585 | #if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) | ||
| 586 | a = Z7_BSWAP32(a); | ||
| 587 | RISCV_SET_UI32(p + 4, a) | ||
| 588 | #else | ||
| 589 | SetBe32(p + 4, a) | ||
| 590 | #endif | ||
| 591 | p += 8; | ||
| 592 | } | ||
| 593 | else | ||
| 594 | p += RISCV_STEP_1; | ||
| 595 | } | ||
| 596 | else | ||
| 597 | { | ||
| 598 | UInt32 r = a >> 27; | ||
| 599 | if (RISCV_CHECK_2(v, r)) | ||
| 600 | { | ||
| 601 | v = RISCV_GET_UI32(p + 4); | ||
| 602 | r = (r << 7) + 0x17 + (v & 0xfffff000); | ||
| 603 | a = (a >> 12) | (v << 20); | ||
| 604 | RISCV_SET_UI32(p, r) | ||
| 605 | RISCV_SET_UI32(p + 4, a) | ||
| 606 | p += 8; | ||
| 607 | } | ||
| 608 | else | ||
| 609 | p += RISCV_STEP_2; | ||
| 610 | } | ||
| 611 | } | ||
| 612 | } // for | ||
| 613 | } | ||
| 614 | |||
| 615 | |||
| 616 | Byte * Z7_BRANCH_CONV_DEC(RISCV)(Byte *p, SizeT size, UInt32 pc) | ||
| 617 | { | ||
| 618 | RISCV_SCAN_LOOP | ||
| 619 | #ifdef RISCV_USE_16BIT_LOAD | ||
| 620 | if ((a & 8) == 0) | ||
| 621 | { | ||
| 622 | #else | ||
| 623 | v = a; | ||
| 624 | a += (UInt32)p[1] << 8; | ||
| 625 | if ((v & 8) == 0) | ||
| 626 | { | ||
| 627 | #endif | ||
| 628 | // JAL | ||
| 629 | a -= 0x100 - RISCV_DELTA_7F; | ||
| 630 | if (a & 0xd80) | ||
| 631 | { | ||
| 632 | p += RISCV_INSTR_SIZE; | ||
| 633 | continue; | ||
| 634 | } | ||
| 635 | { | ||
| 636 | const UInt32 a_old = (a + (0xef - RISCV_DELTA_7F)) & 0xfff; | ||
| 637 | #if 0 // unaligned | ||
| 638 | a = GetUi32(p); | ||
| 639 | v = (UInt32)(a >> 23) & ((UInt32)0xff << 1) | ||
| 640 | | (UInt32)(a >> 7) & ((UInt32)0xff << 9) | ||
| 641 | #elif 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) | ||
| 642 | v = GetUi16a(p + 2); | ||
| 643 | v = Z7_BSWAP32(v) >> 15 | ||
| 644 | #else | ||
| 645 | v = (UInt32)p[3] << 1 | ||
| 646 | | (UInt32)p[2] << 9 | ||
| 647 | #endif | ||
| 648 | | (UInt32)((a & 0xf000) << 5); | ||
| 649 | BR_CONVERT_VAL_DEC(v) | ||
| 650 | a = a_old | ||
| 651 | | (v << 11 & 1u << 31) | ||
| 652 | | (v << 20 & 0x3ff << 21) | ||
| 653 | | (v << 9 & 1 << 20) | ||
| 654 | | (v & 0xff << 12); | ||
| 655 | RISCV_SET_UI32(p, a) | ||
| 656 | } | ||
| 657 | p += 4; | ||
| 658 | continue; | ||
| 659 | } // JAL | ||
| 660 | |||
| 661 | { | ||
| 662 | // AUIPC | ||
| 663 | v = a; | ||
| 664 | #if 1 && defined(RISCV_USE_UNALIGNED_LOAD) | ||
| 665 | a = GetUi32(p); | ||
| 666 | #else | ||
| 667 | a |= (UInt32)GetUi16a(p + 2) << 16; | ||
| 668 | #endif | ||
| 669 | if ((v & 0xe80) == 0) // x0/x2 | ||
| 670 | { | ||
| 671 | const UInt32 r = a >> 27; | ||
| 672 | if (RISCV_CHECK_2(v, r)) | ||
| 673 | { | ||
| 674 | UInt32 b; | ||
| 675 | #if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) | ||
| 676 | b = RISCV_GET_UI32(p + 4); | ||
| 677 | b = Z7_BSWAP32(b); | ||
| 678 | #else | ||
| 679 | b = GetBe32(p + 4); | ||
| 680 | #endif | ||
| 681 | v = a >> 12; | ||
| 682 | BR_CONVERT_VAL_DEC(b) | ||
| 683 | a = (r << 7) + 0x17; | ||
| 684 | a += (b + 0x800) & 0xfffff000; | ||
| 685 | v |= b << 20; | ||
| 686 | RISCV_SET_UI32(p, a) | ||
| 687 | RISCV_SET_UI32(p + 4, v) | ||
| 688 | p += 8; | ||
| 689 | } | ||
| 690 | else | ||
| 691 | p += RISCV_STEP_2; | ||
| 692 | } | ||
| 693 | else | ||
| 694 | { | ||
| 695 | const UInt32 b = RISCV_GET_UI32(p + 4); | ||
| 696 | if (!RISCV_CHECK_1(v, b)) | ||
| 697 | p += RISCV_STEP_1; | ||
| 698 | else | ||
| 699 | { | ||
| 700 | v = (a & 0xfffff000) | (b >> 20); | ||
| 701 | a = (b << 12) | (0x17 + RISCV_REG_VAL); | ||
| 702 | RISCV_SET_UI32(p, a) | ||
| 703 | RISCV_SET_UI32(p + 4, v) | ||
| 704 | p += 8; | ||
| 705 | } | ||
| 706 | } | ||
| 707 | } | ||
| 708 | } // for | ||
| 709 | } | ||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Bra.h -- Branch converters for executables | 1 | /* Bra.h -- Branch converters for executables |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-20 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_BRA_H | 4 | #ifndef ZIP7_INC_BRA_H |
| 5 | #define ZIP7_INC_BRA_H | 5 | #define ZIP7_INC_BRA_H |
| @@ -8,8 +8,12 @@ | |||
| 8 | 8 | ||
| 9 | EXTERN_C_BEGIN | 9 | EXTERN_C_BEGIN |
| 10 | 10 | ||
| 11 | #define Z7_BRANCH_CONV_DEC(name) z7_BranchConv_ ## name ## _Dec | 11 | /* #define PPC BAD_PPC_11 // for debug */ |
| 12 | #define Z7_BRANCH_CONV_ENC(name) z7_BranchConv_ ## name ## _Enc | 12 | |
| 13 | #define Z7_BRANCH_CONV_DEC_2(name) z7_ ## name ## _Dec | ||
| 14 | #define Z7_BRANCH_CONV_ENC_2(name) z7_ ## name ## _Enc | ||
| 15 | #define Z7_BRANCH_CONV_DEC(name) Z7_BRANCH_CONV_DEC_2(BranchConv_ ## name) | ||
| 16 | #define Z7_BRANCH_CONV_ENC(name) Z7_BRANCH_CONV_ENC_2(BranchConv_ ## name) | ||
| 13 | #define Z7_BRANCH_CONV_ST_DEC(name) z7_BranchConvSt_ ## name ## _Dec | 17 | #define Z7_BRANCH_CONV_ST_DEC(name) z7_BranchConvSt_ ## name ## _Dec |
| 14 | #define Z7_BRANCH_CONV_ST_ENC(name) z7_BranchConvSt_ ## name ## _Enc | 18 | #define Z7_BRANCH_CONV_ST_ENC(name) z7_BranchConvSt_ ## name ## _Enc |
| 15 | 19 | ||
| @@ -20,19 +24,20 @@ typedef Z7_BRANCH_CONV_DECL( (*z7_Func_BranchConv)); | |||
| 20 | typedef Z7_BRANCH_CONV_ST_DECL((*z7_Func_BranchConvSt)); | 24 | typedef Z7_BRANCH_CONV_ST_DECL((*z7_Func_BranchConvSt)); |
| 21 | 25 | ||
| 22 | #define Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL 0 | 26 | #define Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL 0 |
| 23 | Z7_BRANCH_CONV_ST_DECL(Z7_BRANCH_CONV_ST_DEC(X86)); | 27 | Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_DEC(X86)); |
| 24 | Z7_BRANCH_CONV_ST_DECL(Z7_BRANCH_CONV_ST_ENC(X86)); | 28 | Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_ENC(X86)); |
| 25 | 29 | ||
| 26 | #define Z7_BRANCH_FUNCS_DECL(name) \ | 30 | #define Z7_BRANCH_FUNCS_DECL(name) \ |
| 27 | Z7_BRANCH_CONV_DECL(Z7_BRANCH_CONV_DEC(name)); \ | 31 | Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_DEC_2(name)); \ |
| 28 | Z7_BRANCH_CONV_DECL(Z7_BRANCH_CONV_ENC(name)); | 32 | Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_ENC_2(name)); |
| 29 | 33 | ||
| 30 | Z7_BRANCH_FUNCS_DECL(ARM64) | 34 | Z7_BRANCH_FUNCS_DECL (BranchConv_ARM64) |
| 31 | Z7_BRANCH_FUNCS_DECL(ARM) | 35 | Z7_BRANCH_FUNCS_DECL (BranchConv_ARM) |
| 32 | Z7_BRANCH_FUNCS_DECL(ARMT) | 36 | Z7_BRANCH_FUNCS_DECL (BranchConv_ARMT) |
| 33 | Z7_BRANCH_FUNCS_DECL(PPC) | 37 | Z7_BRANCH_FUNCS_DECL (BranchConv_PPC) |
| 34 | Z7_BRANCH_FUNCS_DECL(SPARC) | 38 | Z7_BRANCH_FUNCS_DECL (BranchConv_SPARC) |
| 35 | Z7_BRANCH_FUNCS_DECL(IA64) | 39 | Z7_BRANCH_FUNCS_DECL (BranchConv_IA64) |
| 40 | Z7_BRANCH_FUNCS_DECL (BranchConv_RISCV) | ||
| 36 | 41 | ||
| 37 | /* | 42 | /* |
| 38 | These functions convert data that contain CPU instructions. | 43 | These functions convert data that contain CPU instructions. |
| @@ -49,14 +54,14 @@ and one for decoding (_Enc/_Dec postfixes in function name). | |||
| 49 | In params: | 54 | In params: |
| 50 | data : data buffer | 55 | data : data buffer |
| 51 | size : size of data | 56 | size : size of data |
| 52 | pc : current virtual Program Counter (Instruction Pinter) value | 57 | pc : current virtual Program Counter (Instruction Pointer) value |
| 53 | In/Out param: | 58 | In/Out param: |
| 54 | state : pointer to state variable (for X86 converter only) | 59 | state : pointer to state variable (for X86 converter only) |
| 55 | 60 | ||
| 56 | Return: | 61 | Return: |
| 57 | The pointer to position in (data) buffer after last byte that was processed. | 62 | The pointer to position in (data) buffer after last byte that was processed. |
| 58 | If the caller calls converter again, it must call it starting with that position. | 63 | If the caller calls converter again, it must call it starting with that position. |
| 59 | But the caller is allowed to move data in buffer. so pointer to | 64 | But the caller is allowed to move data in buffer. So pointer to |
| 60 | current processed position also will be changed for next call. | 65 | current processed position also will be changed for next call. |
| 61 | Also the caller must increase internal (pc) value for next call. | 66 | Also the caller must increase internal (pc) value for next call. |
| 62 | 67 | ||
| @@ -65,6 +70,7 @@ Each converter has some characteristics: Endian, Alignment, LookAhead. | |||
| 65 | 70 | ||
| 66 | X86 little 1 4 | 71 | X86 little 1 4 |
| 67 | ARMT little 2 2 | 72 | ARMT little 2 2 |
| 73 | RISCV little 2 6 | ||
| 68 | ARM little 4 0 | 74 | ARM little 4 0 |
| 69 | ARM64 little 4 0 | 75 | ARM64 little 4 0 |
| 70 | PPC big 4 0 | 76 | PPC big 4 0 |
diff --git a/C/Compiler.h b/C/Compiler.h index 185a52d..2a9c2b7 100644 --- a/C/Compiler.h +++ b/C/Compiler.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Compiler.h : Compiler specific defines and pragmas | 1 | /* Compiler.h : Compiler specific defines and pragmas |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-22 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_COMPILER_H | 4 | #ifndef ZIP7_INC_COMPILER_H |
| 5 | #define ZIP7_INC_COMPILER_H | 5 | #define ZIP7_INC_COMPILER_H |
| @@ -25,11 +25,79 @@ | |||
| 25 | #define Z7_MINGW | 25 | #define Z7_MINGW |
| 26 | #endif | 26 | #endif |
| 27 | 27 | ||
| 28 | #if defined(__LCC__) && (defined(__MCST__) || defined(__e2k__)) | ||
| 29 | #define Z7_MCST_LCC | ||
| 30 | #define Z7_MCST_LCC_VERSION (__LCC__ * 100 + __LCC_MINOR__) | ||
| 31 | #endif | ||
| 32 | |||
| 33 | /* | ||
| 34 | #if defined(__AVX2__) \ | ||
| 35 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ | ||
| 36 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \ | ||
| 37 | || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \ | ||
| 38 | || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \ | ||
| 39 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400) | ||
| 40 | #define Z7_COMPILER_AVX2_SUPPORTED | ||
| 41 | #endif | ||
| 42 | #endif | ||
| 43 | */ | ||
| 44 | |||
| 28 | // #pragma GCC diagnostic ignored "-Wunknown-pragmas" | 45 | // #pragma GCC diagnostic ignored "-Wunknown-pragmas" |
| 29 | 46 | ||
| 30 | #ifdef __clang__ | 47 | #ifdef __clang__ |
| 31 | // padding size of '' with 4 bytes to alignment boundary | 48 | // padding size of '' with 4 bytes to alignment boundary |
| 32 | #pragma GCC diagnostic ignored "-Wpadded" | 49 | #pragma GCC diagnostic ignored "-Wpadded" |
| 50 | |||
| 51 | #if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13) \ | ||
| 52 | && defined(__FreeBSD__) | ||
| 53 | // freebsd: | ||
| 54 | #pragma GCC diagnostic ignored "-Wexcess-padding" | ||
| 55 | #endif | ||
| 56 | |||
| 57 | #if __clang_major__ >= 16 | ||
| 58 | #pragma GCC diagnostic ignored "-Wunsafe-buffer-usage" | ||
| 59 | #endif | ||
| 60 | |||
| 61 | #if __clang_major__ == 13 | ||
| 62 | #if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) | ||
| 63 | // cheri | ||
| 64 | #pragma GCC diagnostic ignored "-Wcapability-to-integer-cast" | ||
| 65 | #endif | ||
| 66 | #endif | ||
| 67 | |||
| 68 | #if __clang_major__ == 13 | ||
| 69 | // for <arm_neon.h> | ||
| 70 | #pragma GCC diagnostic ignored "-Wreserved-identifier" | ||
| 71 | #endif | ||
| 72 | |||
| 73 | #endif // __clang__ | ||
| 74 | |||
| 75 | #if defined(_WIN32) && defined(__clang__) && __clang_major__ >= 16 | ||
| 76 | // #pragma GCC diagnostic ignored "-Wcast-function-type-strict" | ||
| 77 | #define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION \ | ||
| 78 | _Pragma("GCC diagnostic ignored \"-Wcast-function-type-strict\"") | ||
| 79 | #else | ||
| 80 | #define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION | ||
| 81 | #endif | ||
| 82 | |||
| 83 | typedef void (*Z7_void_Function)(void); | ||
| 84 | #if defined(__clang__) || defined(__GNUC__) | ||
| 85 | #define Z7_CAST_FUNC_C (Z7_void_Function) | ||
| 86 | #elif defined(_MSC_VER) && _MSC_VER > 1920 | ||
| 87 | #define Z7_CAST_FUNC_C (void *) | ||
| 88 | // #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' | ||
| 89 | #else | ||
| 90 | #define Z7_CAST_FUNC_C | ||
| 91 | #endif | ||
| 92 | /* | ||
| 93 | #if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__) | ||
| 94 | // #pragma GCC diagnostic ignored "-Wcast-function-type" | ||
| 95 | #endif | ||
| 96 | */ | ||
| 97 | #ifdef __GNUC__ | ||
| 98 | #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40000) && (Z7_GCC_VERSION < 70000) | ||
| 99 | #pragma GCC diagnostic ignored "-Wstrict-aliasing" | ||
| 100 | #endif | ||
| 33 | #endif | 101 | #endif |
| 34 | 102 | ||
| 35 | 103 | ||
| @@ -101,7 +169,8 @@ | |||
| 101 | _Pragma("clang loop unroll(disable)") \ | 169 | _Pragma("clang loop unroll(disable)") \ |
| 102 | _Pragma("clang loop vectorize(disable)") | 170 | _Pragma("clang loop vectorize(disable)") |
| 103 | #define Z7_ATTRIB_NO_VECTORIZE | 171 | #define Z7_ATTRIB_NO_VECTORIZE |
| 104 | #elif defined(__GNUC__) && (__GNUC__ >= 5) | 172 | #elif defined(__GNUC__) && (__GNUC__ >= 5) \ |
| 173 | && (!defined(Z7_MCST_LCC_VERSION) || (Z7_MCST_LCC_VERSION >= 12610)) | ||
| 105 | #define Z7_ATTRIB_NO_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) | 174 | #define Z7_ATTRIB_NO_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) |
| 106 | // __attribute__((optimize("no-unroll-loops"))); | 175 | // __attribute__((optimize("no-unroll-loops"))); |
| 107 | #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | 176 | #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE |
| @@ -142,15 +211,23 @@ | |||
| 142 | #endif | 211 | #endif |
| 143 | 212 | ||
| 144 | 213 | ||
| 145 | #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 36000)) | 214 | #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30600)) |
| 146 | #define Z7_DIAGNOSCTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ | 215 | |
| 216 | #if (Z7_CLANG_VERSION < 130000) | ||
| 217 | #define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ | ||
| 218 | _Pragma("GCC diagnostic push") \ | ||
| 219 | _Pragma("GCC diagnostic ignored \"-Wreserved-id-macro\"") | ||
| 220 | #else | ||
| 221 | #define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ | ||
| 147 | _Pragma("GCC diagnostic push") \ | 222 | _Pragma("GCC diagnostic push") \ |
| 148 | _Pragma("GCC diagnostic ignored \"-Wreserved-macro-identifier\"") | 223 | _Pragma("GCC diagnostic ignored \"-Wreserved-macro-identifier\"") |
| 149 | #define Z7_DIAGNOSCTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER \ | 224 | #endif |
| 225 | |||
| 226 | #define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER \ | ||
| 150 | _Pragma("GCC diagnostic pop") | 227 | _Pragma("GCC diagnostic pop") |
| 151 | #else | 228 | #else |
| 152 | #define Z7_DIAGNOSCTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | 229 | #define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER |
| 153 | #define Z7_DIAGNOSCTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | 230 | #define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER |
| 154 | #endif | 231 | #endif |
| 155 | 232 | ||
| 156 | #define UNUSED_VAR(x) (void)x; | 233 | #define UNUSED_VAR(x) (void)x; |
diff --git a/C/CpuArch.c b/C/CpuArch.c index 33f8a3a..d51b38a 100644 --- a/C/CpuArch.c +++ b/C/CpuArch.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* CpuArch.c -- CPU specific code | 1 | /* CpuArch.c -- CPU specific code |
| 2 | 2023-05-18 : Igor Pavlov : Public domain */ | 2 | 2024-03-02 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -226,7 +226,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) | |||
| 226 | DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!! | 226 | DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!! |
| 227 | */ | 227 | */ |
| 228 | static | 228 | static |
| 229 | Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(UInt32 subFunction, UInt32 func, int *CPUInfo) | 229 | Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int32 *CPUInfo) |
| 230 | { | 230 | { |
| 231 | UNUSED_VAR(subFunction) | 231 | UNUSED_VAR(subFunction) |
| 232 | __cpuid(CPUInfo, func); | 232 | __cpuid(CPUInfo, func); |
| @@ -242,13 +242,13 @@ Z7_NO_INLINE | |||
| 242 | #endif | 242 | #endif |
| 243 | void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) | 243 | void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) |
| 244 | { | 244 | { |
| 245 | MY_cpuidex((int *)p, (int)func, 0); | 245 | MY_cpuidex((Int32 *)p, (Int32)func, 0); |
| 246 | } | 246 | } |
| 247 | 247 | ||
| 248 | Z7_NO_INLINE | 248 | Z7_NO_INLINE |
| 249 | UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) | 249 | UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) |
| 250 | { | 250 | { |
| 251 | int a[4]; | 251 | Int32 a[4]; |
| 252 | MY_cpuidex(a, 0, 0); | 252 | MY_cpuidex(a, 0, 0); |
| 253 | return a[0]; | 253 | return a[0]; |
| 254 | } | 254 | } |
| @@ -384,7 +384,7 @@ BoolInt CPU_IsSupported_CMOV(void) | |||
| 384 | UInt32 a[4]; | 384 | UInt32 a[4]; |
| 385 | if (!x86cpuid_Func_1(&a[0])) | 385 | if (!x86cpuid_Func_1(&a[0])) |
| 386 | return 0; | 386 | return 0; |
| 387 | return (a[3] >> 15) & 1; | 387 | return (BoolInt)(a[3] >> 15) & 1; |
| 388 | } | 388 | } |
| 389 | 389 | ||
| 390 | BoolInt CPU_IsSupported_SSE(void) | 390 | BoolInt CPU_IsSupported_SSE(void) |
| @@ -393,7 +393,7 @@ BoolInt CPU_IsSupported_SSE(void) | |||
| 393 | CHECK_SYS_SSE_SUPPORT | 393 | CHECK_SYS_SSE_SUPPORT |
| 394 | if (!x86cpuid_Func_1(&a[0])) | 394 | if (!x86cpuid_Func_1(&a[0])) |
| 395 | return 0; | 395 | return 0; |
| 396 | return (a[3] >> 25) & 1; | 396 | return (BoolInt)(a[3] >> 25) & 1; |
| 397 | } | 397 | } |
| 398 | 398 | ||
| 399 | BoolInt CPU_IsSupported_SSE2(void) | 399 | BoolInt CPU_IsSupported_SSE2(void) |
| @@ -402,7 +402,7 @@ BoolInt CPU_IsSupported_SSE2(void) | |||
| 402 | CHECK_SYS_SSE_SUPPORT | 402 | CHECK_SYS_SSE_SUPPORT |
| 403 | if (!x86cpuid_Func_1(&a[0])) | 403 | if (!x86cpuid_Func_1(&a[0])) |
| 404 | return 0; | 404 | return 0; |
| 405 | return (a[3] >> 26) & 1; | 405 | return (BoolInt)(a[3] >> 26) & 1; |
| 406 | } | 406 | } |
| 407 | 407 | ||
| 408 | #endif | 408 | #endif |
| @@ -419,17 +419,17 @@ static UInt32 x86cpuid_Func_1_ECX(void) | |||
| 419 | 419 | ||
| 420 | BoolInt CPU_IsSupported_AES(void) | 420 | BoolInt CPU_IsSupported_AES(void) |
| 421 | { | 421 | { |
| 422 | return (x86cpuid_Func_1_ECX() >> 25) & 1; | 422 | return (BoolInt)(x86cpuid_Func_1_ECX() >> 25) & 1; |
| 423 | } | 423 | } |
| 424 | 424 | ||
| 425 | BoolInt CPU_IsSupported_SSSE3(void) | 425 | BoolInt CPU_IsSupported_SSSE3(void) |
| 426 | { | 426 | { |
| 427 | return (x86cpuid_Func_1_ECX() >> 9) & 1; | 427 | return (BoolInt)(x86cpuid_Func_1_ECX() >> 9) & 1; |
| 428 | } | 428 | } |
| 429 | 429 | ||
| 430 | BoolInt CPU_IsSupported_SSE41(void) | 430 | BoolInt CPU_IsSupported_SSE41(void) |
| 431 | { | 431 | { |
| 432 | return (x86cpuid_Func_1_ECX() >> 19) & 1; | 432 | return (BoolInt)(x86cpuid_Func_1_ECX() >> 19) & 1; |
| 433 | } | 433 | } |
| 434 | 434 | ||
| 435 | BoolInt CPU_IsSupported_SHA(void) | 435 | BoolInt CPU_IsSupported_SHA(void) |
| @@ -441,7 +441,7 @@ BoolInt CPU_IsSupported_SHA(void) | |||
| 441 | { | 441 | { |
| 442 | UInt32 d[4]; | 442 | UInt32 d[4]; |
| 443 | z7_x86_cpuid(d, 7); | 443 | z7_x86_cpuid(d, 7); |
| 444 | return (d[1] >> 29) & 1; | 444 | return (BoolInt)(d[1] >> 29) & 1; |
| 445 | } | 445 | } |
| 446 | } | 446 | } |
| 447 | 447 | ||
| @@ -640,8 +640,8 @@ BoolInt CPU_IsSupported_AVX(void) | |||
| 640 | const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK); | 640 | const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK); |
| 641 | // printf("\n=== XGetBV=%d\n", bm); | 641 | // printf("\n=== XGetBV=%d\n", bm); |
| 642 | return 1 | 642 | return 1 |
| 643 | & (bm >> 1) // SSE state is supported (set by OS) for storing/restoring | 643 | & (BoolInt)(bm >> 1) // SSE state is supported (set by OS) for storing/restoring |
| 644 | & (bm >> 2); // AVX state is supported (set by OS) for storing/restoring | 644 | & (BoolInt)(bm >> 2); // AVX state is supported (set by OS) for storing/restoring |
| 645 | } | 645 | } |
| 646 | // since Win7SP1: we can use GetEnabledXStateFeatures(); | 646 | // since Win7SP1: we can use GetEnabledXStateFeatures(); |
| 647 | } | 647 | } |
| @@ -658,10 +658,29 @@ BoolInt CPU_IsSupported_AVX2(void) | |||
| 658 | z7_x86_cpuid(d, 7); | 658 | z7_x86_cpuid(d, 7); |
| 659 | // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); | 659 | // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); |
| 660 | return 1 | 660 | return 1 |
| 661 | & (d[1] >> 5); // avx2 | 661 | & (BoolInt)(d[1] >> 5); // avx2 |
| 662 | } | 662 | } |
| 663 | } | 663 | } |
| 664 | 664 | ||
| 665 | /* | ||
| 666 | // fix it: | ||
| 667 | BoolInt CPU_IsSupported_AVX512F_AVX512VL(void) | ||
| 668 | { | ||
| 669 | if (!CPU_IsSupported_AVX()) | ||
| 670 | return False; | ||
| 671 | if (z7_x86_cpuid_GetMaxFunc() < 7) | ||
| 672 | return False; | ||
| 673 | { | ||
| 674 | UInt32 d[4]; | ||
| 675 | z7_x86_cpuid(d, 7); | ||
| 676 | // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); | ||
| 677 | return 1 | ||
| 678 | & (BoolInt)(d[1] >> 16) // avx512-f | ||
| 679 | & (BoolInt)(d[1] >> 31); // avx512-Vl | ||
| 680 | } | ||
| 681 | } | ||
| 682 | */ | ||
| 683 | |||
| 665 | BoolInt CPU_IsSupported_VAES_AVX2(void) | 684 | BoolInt CPU_IsSupported_VAES_AVX2(void) |
| 666 | { | 685 | { |
| 667 | if (!CPU_IsSupported_AVX()) | 686 | if (!CPU_IsSupported_AVX()) |
| @@ -673,9 +692,9 @@ BoolInt CPU_IsSupported_VAES_AVX2(void) | |||
| 673 | z7_x86_cpuid(d, 7); | 692 | z7_x86_cpuid(d, 7); |
| 674 | // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); | 693 | // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); |
| 675 | return 1 | 694 | return 1 |
| 676 | & (d[1] >> 5) // avx2 | 695 | & (BoolInt)(d[1] >> 5) // avx2 |
| 677 | // & (d[1] >> 31) // avx512vl | 696 | // & (d[1] >> 31) // avx512vl |
| 678 | & (d[2] >> 9); // vaes // VEX-256/EVEX | 697 | & (BoolInt)(d[2] >> 9); // vaes // VEX-256/EVEX |
| 679 | } | 698 | } |
| 680 | } | 699 | } |
| 681 | 700 | ||
| @@ -688,7 +707,7 @@ BoolInt CPU_IsSupported_PageGB(void) | |||
| 688 | if (d[0] < 0x80000001) | 707 | if (d[0] < 0x80000001) |
| 689 | return False; | 708 | return False; |
| 690 | z7_x86_cpuid(d, 0x80000001); | 709 | z7_x86_cpuid(d, 0x80000001); |
| 691 | return (d[3] >> 26) & 1; | 710 | return (BoolInt)(d[3] >> 26) & 1; |
| 692 | } | 711 | } |
| 693 | } | 712 | } |
| 694 | 713 | ||
| @@ -760,32 +779,65 @@ BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; } | |||
| 760 | 779 | ||
| 761 | #else // __APPLE__ | 780 | #else // __APPLE__ |
| 762 | 781 | ||
| 763 | #include <sys/auxv.h> | 782 | #if defined(__GLIBC__) && (__GLIBC__ * 100 + __GLIBC_MINOR__ >= 216) |
| 783 | #define Z7_GETAUXV_AVAILABLE | ||
| 784 | #else | ||
| 785 | // #pragma message("=== is not NEW GLIBC === ") | ||
| 786 | #if defined __has_include | ||
| 787 | #if __has_include (<sys/auxv.h>) | ||
| 788 | // #pragma message("=== sys/auxv.h is avail=== ") | ||
| 789 | #define Z7_GETAUXV_AVAILABLE | ||
| 790 | #endif | ||
| 791 | #endif | ||
| 792 | #endif | ||
| 764 | 793 | ||
| 794 | #ifdef Z7_GETAUXV_AVAILABLE | ||
| 795 | // #pragma message("=== Z7_GETAUXV_AVAILABLE === ") | ||
| 796 | #include <sys/auxv.h> | ||
| 765 | #define USE_HWCAP | 797 | #define USE_HWCAP |
| 798 | #endif | ||
| 766 | 799 | ||
| 767 | #ifdef USE_HWCAP | 800 | #ifdef USE_HWCAP |
| 768 | 801 | ||
| 802 | #if defined(__FreeBSD__) | ||
| 803 | static unsigned long MY_getauxval(int aux) | ||
| 804 | { | ||
| 805 | unsigned long val; | ||
| 806 | if (elf_aux_info(aux, &val, sizeof(val))) | ||
| 807 | return 0; | ||
| 808 | return val; | ||
| 809 | } | ||
| 810 | #else | ||
| 811 | #define MY_getauxval getauxval | ||
| 812 | #if defined __has_include | ||
| 813 | #if __has_include (<asm/hwcap.h>) | ||
| 769 | #include <asm/hwcap.h> | 814 | #include <asm/hwcap.h> |
| 815 | #endif | ||
| 816 | #endif | ||
| 817 | #endif | ||
| 770 | 818 | ||
| 771 | #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \ | 819 | #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \ |
| 772 | BoolInt CPU_IsSupported_ ## name1() { return (getauxval(AT_HWCAP) & (HWCAP_ ## name2)) ? 1 : 0; } | 820 | BoolInt CPU_IsSupported_ ## name1(void) { return (MY_getauxval(AT_HWCAP) & (HWCAP_ ## name2)); } |
| 773 | 821 | ||
| 774 | #ifdef MY_CPU_ARM64 | 822 | #ifdef MY_CPU_ARM64 |
| 775 | #define MY_HWCAP_CHECK_FUNC(name) \ | 823 | #define MY_HWCAP_CHECK_FUNC(name) \ |
| 776 | MY_HWCAP_CHECK_FUNC_2(name, name) | 824 | MY_HWCAP_CHECK_FUNC_2(name, name) |
| 825 | #if 1 || defined(__ARM_NEON) | ||
| 826 | BoolInt CPU_IsSupported_NEON(void) { return True; } | ||
| 827 | #else | ||
| 777 | MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD) | 828 | MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD) |
| 829 | #endif | ||
| 778 | // MY_HWCAP_CHECK_FUNC (ASIMD) | 830 | // MY_HWCAP_CHECK_FUNC (ASIMD) |
| 779 | #elif defined(MY_CPU_ARM) | 831 | #elif defined(MY_CPU_ARM) |
| 780 | #define MY_HWCAP_CHECK_FUNC(name) \ | 832 | #define MY_HWCAP_CHECK_FUNC(name) \ |
| 781 | BoolInt CPU_IsSupported_ ## name() { return (getauxval(AT_HWCAP2) & (HWCAP2_ ## name)) ? 1 : 0; } | 833 | BoolInt CPU_IsSupported_ ## name(void) { return (MY_getauxval(AT_HWCAP2) & (HWCAP2_ ## name)); } |
| 782 | MY_HWCAP_CHECK_FUNC_2(NEON, NEON) | 834 | MY_HWCAP_CHECK_FUNC_2(NEON, NEON) |
| 783 | #endif | 835 | #endif |
| 784 | 836 | ||
| 785 | #else // USE_HWCAP | 837 | #else // USE_HWCAP |
| 786 | 838 | ||
| 787 | #define MY_HWCAP_CHECK_FUNC(name) \ | 839 | #define MY_HWCAP_CHECK_FUNC(name) \ |
| 788 | BoolInt CPU_IsSupported_ ## name() { return 0; } | 840 | BoolInt CPU_IsSupported_ ## name(void) { return 0; } |
| 789 | MY_HWCAP_CHECK_FUNC(NEON) | 841 | MY_HWCAP_CHECK_FUNC(NEON) |
| 790 | 842 | ||
| 791 | #endif // USE_HWCAP | 843 | #endif // USE_HWCAP |
diff --git a/C/CpuArch.h b/C/CpuArch.h index 8e5d8a5..dfc68f1 100644 --- a/C/CpuArch.h +++ b/C/CpuArch.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* CpuArch.h -- CPU specific code | 1 | /* CpuArch.h -- CPU specific code |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-05-13 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_CPU_ARCH_H | 4 | #ifndef ZIP7_INC_CPU_ARCH_H |
| 5 | #define ZIP7_INC_CPU_ARCH_H | 5 | #define ZIP7_INC_CPU_ARCH_H |
| @@ -20,6 +20,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
| 20 | MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8) | 20 | MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8) |
| 21 | */ | 21 | */ |
| 22 | 22 | ||
| 23 | #if !defined(_M_ARM64EC) | ||
| 23 | #if defined(_M_X64) \ | 24 | #if defined(_M_X64) \ |
| 24 | || defined(_M_AMD64) \ | 25 | || defined(_M_AMD64) \ |
| 25 | || defined(__x86_64__) \ | 26 | || defined(__x86_64__) \ |
| @@ -35,6 +36,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
| 35 | #endif | 36 | #endif |
| 36 | #define MY_CPU_64BIT | 37 | #define MY_CPU_64BIT |
| 37 | #endif | 38 | #endif |
| 39 | #endif | ||
| 38 | 40 | ||
| 39 | 41 | ||
| 40 | #if defined(_M_IX86) \ | 42 | #if defined(_M_IX86) \ |
| @@ -47,17 +49,26 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
| 47 | 49 | ||
| 48 | 50 | ||
| 49 | #if defined(_M_ARM64) \ | 51 | #if defined(_M_ARM64) \ |
| 52 | || defined(_M_ARM64EC) \ | ||
| 50 | || defined(__AARCH64EL__) \ | 53 | || defined(__AARCH64EL__) \ |
| 51 | || defined(__AARCH64EB__) \ | 54 | || defined(__AARCH64EB__) \ |
| 52 | || defined(__aarch64__) | 55 | || defined(__aarch64__) |
| 53 | #define MY_CPU_ARM64 | 56 | #define MY_CPU_ARM64 |
| 54 | #ifdef __ILP32__ | 57 | #if defined(__ILP32__) \ |
| 58 | || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) | ||
| 55 | #define MY_CPU_NAME "arm64-32" | 59 | #define MY_CPU_NAME "arm64-32" |
| 56 | #define MY_CPU_SIZEOF_POINTER 4 | 60 | #define MY_CPU_SIZEOF_POINTER 4 |
| 57 | #else | 61 | #elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) |
| 62 | #define MY_CPU_NAME "arm64-128" | ||
| 63 | #define MY_CPU_SIZEOF_POINTER 16 | ||
| 64 | #else | ||
| 65 | #if defined(_M_ARM64EC) | ||
| 66 | #define MY_CPU_NAME "arm64ec" | ||
| 67 | #else | ||
| 58 | #define MY_CPU_NAME "arm64" | 68 | #define MY_CPU_NAME "arm64" |
| 69 | #endif | ||
| 59 | #define MY_CPU_SIZEOF_POINTER 8 | 70 | #define MY_CPU_SIZEOF_POINTER 8 |
| 60 | #endif | 71 | #endif |
| 61 | #define MY_CPU_64BIT | 72 | #define MY_CPU_64BIT |
| 62 | #endif | 73 | #endif |
| 63 | 74 | ||
| @@ -133,8 +144,36 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
| 133 | #endif | 144 | #endif |
| 134 | 145 | ||
| 135 | 146 | ||
| 147 | #if defined(__sparc__) \ | ||
| 148 | || defined(__sparc) | ||
| 149 | #define MY_CPU_SPARC | ||
| 150 | #if defined(__LP64__) \ | ||
| 151 | || defined(_LP64) \ | ||
| 152 | || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8) | ||
| 153 | #define MY_CPU_NAME "sparcv9" | ||
| 154 | #define MY_CPU_SIZEOF_POINTER 8 | ||
| 155 | #define MY_CPU_64BIT | ||
| 156 | #elif defined(__sparc_v9__) \ | ||
| 157 | || defined(__sparcv9) | ||
| 158 | #define MY_CPU_64BIT | ||
| 159 | #if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) | ||
| 160 | #define MY_CPU_NAME "sparcv9-32" | ||
| 161 | #else | ||
| 162 | #define MY_CPU_NAME "sparcv9m" | ||
| 163 | #endif | ||
| 164 | #elif defined(__sparc_v8__) \ | ||
| 165 | || defined(__sparcv8) | ||
| 166 | #define MY_CPU_NAME "sparcv8" | ||
| 167 | #define MY_CPU_SIZEOF_POINTER 4 | ||
| 168 | #else | ||
| 169 | #define MY_CPU_NAME "sparc" | ||
| 170 | #endif | ||
| 171 | #endif | ||
| 172 | |||
| 173 | |||
| 136 | #if defined(__riscv) \ | 174 | #if defined(__riscv) \ |
| 137 | || defined(__riscv__) | 175 | || defined(__riscv__) |
| 176 | #define MY_CPU_RISCV | ||
| 138 | #if __riscv_xlen == 32 | 177 | #if __riscv_xlen == 32 |
| 139 | #define MY_CPU_NAME "riscv32" | 178 | #define MY_CPU_NAME "riscv32" |
| 140 | #elif __riscv_xlen == 64 | 179 | #elif __riscv_xlen == 64 |
| @@ -145,6 +184,39 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
| 145 | #endif | 184 | #endif |
| 146 | 185 | ||
| 147 | 186 | ||
| 187 | #if defined(__loongarch__) | ||
| 188 | #define MY_CPU_LOONGARCH | ||
| 189 | #if defined(__loongarch64) || defined(__loongarch_grlen) && (__loongarch_grlen == 64) | ||
| 190 | #define MY_CPU_64BIT | ||
| 191 | #endif | ||
| 192 | #if defined(__loongarch64) | ||
| 193 | #define MY_CPU_NAME "loongarch64" | ||
| 194 | #define MY_CPU_LOONGARCH64 | ||
| 195 | #else | ||
| 196 | #define MY_CPU_NAME "loongarch" | ||
| 197 | #endif | ||
| 198 | #endif | ||
| 199 | |||
| 200 | |||
| 201 | // #undef MY_CPU_NAME | ||
| 202 | // #undef MY_CPU_SIZEOF_POINTER | ||
| 203 | // #define __e2k__ | ||
| 204 | // #define __SIZEOF_POINTER__ 4 | ||
| 205 | #if defined(__e2k__) | ||
| 206 | #define MY_CPU_E2K | ||
| 207 | #if defined(__ILP32__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) | ||
| 208 | #define MY_CPU_NAME "e2k-32" | ||
| 209 | #define MY_CPU_SIZEOF_POINTER 4 | ||
| 210 | #else | ||
| 211 | #define MY_CPU_NAME "e2k" | ||
| 212 | #if defined(__LP64__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8) | ||
| 213 | #define MY_CPU_SIZEOF_POINTER 8 | ||
| 214 | #endif | ||
| 215 | #endif | ||
| 216 | #define MY_CPU_64BIT | ||
| 217 | #endif | ||
| 218 | |||
| 219 | |||
| 148 | #if defined(MY_CPU_X86) || defined(MY_CPU_AMD64) | 220 | #if defined(MY_CPU_X86) || defined(MY_CPU_AMD64) |
| 149 | #define MY_CPU_X86_OR_AMD64 | 221 | #define MY_CPU_X86_OR_AMD64 |
| 150 | #endif | 222 | #endif |
| @@ -175,6 +247,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
| 175 | || defined(MY_CPU_ARM_LE) \ | 247 | || defined(MY_CPU_ARM_LE) \ |
| 176 | || defined(MY_CPU_ARM64_LE) \ | 248 | || defined(MY_CPU_ARM64_LE) \ |
| 177 | || defined(MY_CPU_IA64_LE) \ | 249 | || defined(MY_CPU_IA64_LE) \ |
| 250 | || defined(_LITTLE_ENDIAN) \ | ||
| 178 | || defined(__LITTLE_ENDIAN__) \ | 251 | || defined(__LITTLE_ENDIAN__) \ |
| 179 | || defined(__ARMEL__) \ | 252 | || defined(__ARMEL__) \ |
| 180 | || defined(__THUMBEL__) \ | 253 | || defined(__THUMBEL__) \ |
| @@ -251,6 +324,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
| 251 | 324 | ||
| 252 | 325 | ||
| 253 | #ifndef MY_CPU_NAME | 326 | #ifndef MY_CPU_NAME |
| 327 | // #define MY_CPU_IS_UNKNOWN | ||
| 254 | #ifdef MY_CPU_LE | 328 | #ifdef MY_CPU_LE |
| 255 | #define MY_CPU_NAME "LE" | 329 | #define MY_CPU_NAME "LE" |
| 256 | #elif defined(MY_CPU_BE) | 330 | #elif defined(MY_CPU_BE) |
| @@ -295,9 +369,19 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
| 295 | #define Z7_BSWAP64(v) _byteswap_uint64(v) | 369 | #define Z7_BSWAP64(v) _byteswap_uint64(v) |
| 296 | #define Z7_CPU_FAST_BSWAP_SUPPORTED | 370 | #define Z7_CPU_FAST_BSWAP_SUPPORTED |
| 297 | 371 | ||
| 298 | #elif (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ | 372 | /* GCC can generate slow code that calls function for __builtin_bswap32() for: |
| 299 | || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) | 373 | - GCC for RISCV, if Zbb extension is not used. |
| 300 | 374 | - GCC for SPARC. | |
| 375 | The code from CLANG for SPARC also is not fastest. | ||
| 376 | So we don't define Z7_CPU_FAST_BSWAP_SUPPORTED in some cases. | ||
| 377 | */ | ||
| 378 | #elif (!defined(MY_CPU_RISCV) || defined (__riscv_zbb)) \ | ||
| 379 | && !defined(MY_CPU_SPARC) \ | ||
| 380 | && ( \ | ||
| 381 | (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ | ||
| 382 | || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) \ | ||
| 383 | ) | ||
| 384 | |||
| 301 | #define Z7_BSWAP16(v) __builtin_bswap16(v) | 385 | #define Z7_BSWAP16(v) __builtin_bswap16(v) |
| 302 | #define Z7_BSWAP32(v) __builtin_bswap32(v) | 386 | #define Z7_BSWAP32(v) __builtin_bswap32(v) |
| 303 | #define Z7_BSWAP64(v) __builtin_bswap64(v) | 387 | #define Z7_BSWAP64(v) __builtin_bswap64(v) |
| @@ -329,13 +413,48 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
| 329 | 413 | ||
| 330 | #ifdef MY_CPU_LE | 414 | #ifdef MY_CPU_LE |
| 331 | #if defined(MY_CPU_X86_OR_AMD64) \ | 415 | #if defined(MY_CPU_X86_OR_AMD64) \ |
| 332 | || defined(MY_CPU_ARM64) | 416 | || defined(MY_CPU_ARM64) \ |
| 417 | || defined(MY_CPU_RISCV) && defined(__riscv_misaligned_fast) \ | ||
| 418 | || defined(MY_CPU_E2K) && defined(__iset__) && (__iset__ >= 6) | ||
| 333 | #define MY_CPU_LE_UNALIGN | 419 | #define MY_CPU_LE_UNALIGN |
| 334 | #define MY_CPU_LE_UNALIGN_64 | 420 | #define MY_CPU_LE_UNALIGN_64 |
| 335 | #elif defined(__ARM_FEATURE_UNALIGNED) | 421 | #elif defined(__ARM_FEATURE_UNALIGNED) |
| 336 | /* gcc9 for 32-bit arm can use LDRD instruction that requires 32-bit alignment. | 422 | /* === ALIGNMENT on 32-bit arm and LDRD/STRD/LDM/STM instructions. |
| 337 | So we can't use unaligned 64-bit operations. */ | 423 | Description of problems: |
| 338 | #define MY_CPU_LE_UNALIGN | 424 | problem-1 : 32-bit ARM architecture: |
| 425 | multi-access (pair of 32-bit accesses) instructions (LDRD/STRD/LDM/STM) | ||
| 426 | require 32-bit (WORD) alignment (by 32-bit ARM architecture). | ||
| 427 | So there is "Alignment fault exception", if data is not aligned for 32-bit. | ||
| 428 | |||
| 429 | problem-2 : 32-bit kernels and arm64 kernels: | ||
| 430 | 32-bit linux kernels provide fixup for these "paired" instruction "Alignment fault exception". | ||
| 431 | So unaligned paired-access instructions work via exception handler in kernel in 32-bit linux. | ||
| 432 | |||
| 433 | But some arm64 kernels do not handle these faults in 32-bit programs. | ||
| 434 | So we have unhandled exception for such instructions. | ||
| 435 | Probably some new arm64 kernels have fixed it, and unaligned | ||
| 436 | paired-access instructions work in new kernels? | ||
| 437 | |||
| 438 | problem-3 : compiler for 32-bit arm: | ||
| 439 | Compilers use LDRD/STRD/LDM/STM for UInt64 accesses | ||
| 440 | and for another cases where two 32-bit accesses are fused | ||
| 441 | to one multi-access instruction. | ||
| 442 | So UInt64 variables must be aligned for 32-bit, and each | ||
| 443 | 32-bit access must be aligned for 32-bit, if we want to | ||
| 444 | avoid "Alignment fault" exception (handled or unhandled). | ||
| 445 | |||
| 446 | problem-4 : performace: | ||
| 447 | Even if unaligned access is handled by kernel, it will be slow. | ||
| 448 | So if we allow unaligned access, we can get fast unaligned | ||
| 449 | single-access, and slow unaligned paired-access. | ||
| 450 | |||
| 451 | We don't allow unaligned access on 32-bit arm, because compiler | ||
| 452 | genarates paired-access instructions that require 32-bit alignment, | ||
| 453 | and some arm64 kernels have no handler for these instructions. | ||
| 454 | Also unaligned paired-access instructions will be slow, if kernel handles them. | ||
| 455 | */ | ||
| 456 | // it must be disabled: | ||
| 457 | // #define MY_CPU_LE_UNALIGN | ||
| 339 | #endif | 458 | #endif |
| 340 | #endif | 459 | #endif |
| 341 | 460 | ||
| @@ -439,6 +558,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
| 439 | 558 | ||
| 440 | #if defined(MY_CPU_BE) | 559 | #if defined(MY_CPU_BE) |
| 441 | 560 | ||
| 561 | #define GetBe64a(p) (*(const UInt64 *)(const void *)(p)) | ||
| 442 | #define GetBe32a(p) (*(const UInt32 *)(const void *)(p)) | 562 | #define GetBe32a(p) (*(const UInt32 *)(const void *)(p)) |
| 443 | #define GetBe16a(p) (*(const UInt16 *)(const void *)(p)) | 563 | #define GetBe16a(p) (*(const UInt16 *)(const void *)(p)) |
| 444 | #define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); } | 564 | #define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); } |
| @@ -456,6 +576,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. | |||
| 456 | #define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); } | 576 | #define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); } |
| 457 | #define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); } | 577 | #define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); } |
| 458 | 578 | ||
| 579 | #define GetBe64a(p) GetBe64(p) | ||
| 459 | #define GetBe32a(p) GetBe32(p) | 580 | #define GetBe32a(p) GetBe32(p) |
| 460 | #define GetBe16a(p) GetBe16(p) | 581 | #define GetBe16a(p) GetBe16(p) |
| 461 | #define SetBe32a(p, v) SetBe32(p, v) | 582 | #define SetBe32a(p, v) SetBe32(p, v) |
| @@ -486,6 +607,7 @@ UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void); | |||
| 486 | BoolInt CPU_IsSupported_AES(void); | 607 | BoolInt CPU_IsSupported_AES(void); |
| 487 | BoolInt CPU_IsSupported_AVX(void); | 608 | BoolInt CPU_IsSupported_AVX(void); |
| 488 | BoolInt CPU_IsSupported_AVX2(void); | 609 | BoolInt CPU_IsSupported_AVX2(void); |
| 610 | // BoolInt CPU_IsSupported_AVX512F_AVX512VL(void); | ||
| 489 | BoolInt CPU_IsSupported_VAES_AVX2(void); | 611 | BoolInt CPU_IsSupported_VAES_AVX2(void); |
| 490 | BoolInt CPU_IsSupported_CMOV(void); | 612 | BoolInt CPU_IsSupported_CMOV(void); |
| 491 | BoolInt CPU_IsSupported_SSE(void); | 613 | BoolInt CPU_IsSupported_SSE(void); |
diff --git a/C/DllSecur.c b/C/DllSecur.c index 02a0f97..bbbfc0a 100644 --- a/C/DllSecur.c +++ b/C/DllSecur.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* DllSecur.c -- DLL loading security | 1 | /* DllSecur.c -- DLL loading security |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2023-12-03 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -11,19 +11,7 @@ | |||
| 11 | 11 | ||
| 12 | #ifndef UNDER_CE | 12 | #ifndef UNDER_CE |
| 13 | 13 | ||
| 14 | #if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__) | 14 | Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION |
| 15 | // #pragma GCC diagnostic ignored "-Wcast-function-type" | ||
| 16 | #endif | ||
| 17 | |||
| 18 | #if defined(__clang__) || defined(__GNUC__) | ||
| 19 | typedef void (*Z7_voidFunction)(void); | ||
| 20 | #define MY_CAST_FUNC (Z7_voidFunction) | ||
| 21 | #elif defined(_MSC_VER) && _MSC_VER > 1920 | ||
| 22 | #define MY_CAST_FUNC (void *) | ||
| 23 | // #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' | ||
| 24 | #else | ||
| 25 | #define MY_CAST_FUNC | ||
| 26 | #endif | ||
| 27 | 15 | ||
| 28 | typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD DirectoryFlags); | 16 | typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD DirectoryFlags); |
| 29 | 17 | ||
| @@ -61,7 +49,7 @@ static const char * const g_Dlls = | |||
| 61 | if ((UInt16)GetVersion() != 6) { \ | 49 | if ((UInt16)GetVersion() != 6) { \ |
| 62 | const \ | 50 | const \ |
| 63 | Func_SetDefaultDllDirectories setDllDirs = \ | 51 | Func_SetDefaultDllDirectories setDllDirs = \ |
| 64 | (Func_SetDefaultDllDirectories) MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), \ | 52 | (Func_SetDefaultDllDirectories) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), \ |
| 65 | "SetDefaultDllDirectories"); \ | 53 | "SetDefaultDllDirectories"); \ |
| 66 | if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; } | 54 | if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; } |
| 67 | 55 | ||
diff --git a/C/HuffEnc.c b/C/HuffEnc.c index 3dc1e39..996da30 100644 --- a/C/HuffEnc.c +++ b/C/HuffEnc.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* HuffEnc.c -- functions for Huffman encoding | 1 | /* HuffEnc.c -- functions for Huffman encoding |
| 2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2023-09-07 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -8,7 +8,7 @@ | |||
| 8 | 8 | ||
| 9 | #define kMaxLen 16 | 9 | #define kMaxLen 16 |
| 10 | #define NUM_BITS 10 | 10 | #define NUM_BITS 10 |
| 11 | #define MASK (((unsigned)1 << NUM_BITS) - 1) | 11 | #define MASK ((1u << NUM_BITS) - 1) |
| 12 | 12 | ||
| 13 | #define NUM_COUNTERS 64 | 13 | #define NUM_COUNTERS 64 |
| 14 | 14 | ||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* LzFind.c -- Match finder for LZ algorithms | 1 | /* LzFind.c -- Match finder for LZ algorithms |
| 2 | 2023-03-14 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -108,9 +108,15 @@ static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr all | |||
| 108 | return (p->bufBase != NULL); | 108 | return (p->bufBase != NULL); |
| 109 | } | 109 | } |
| 110 | 110 | ||
| 111 | static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } | 111 | static const Byte *MatchFinder_GetPointerToCurrentPos(void *p) |
| 112 | { | ||
| 113 | return ((CMatchFinder *)p)->buffer; | ||
| 114 | } | ||
| 112 | 115 | ||
| 113 | static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); } | 116 | static UInt32 MatchFinder_GetNumAvailableBytes(void *p) |
| 117 | { | ||
| 118 | return GET_AVAIL_BYTES((CMatchFinder *)p); | ||
| 119 | } | ||
| 114 | 120 | ||
| 115 | 121 | ||
| 116 | Z7_NO_INLINE | 122 | Z7_NO_INLINE |
| @@ -571,8 +577,9 @@ void MatchFinder_Init_4(CMatchFinder *p) | |||
| 571 | #define CYC_TO_POS_OFFSET 0 | 577 | #define CYC_TO_POS_OFFSET 0 |
| 572 | // #define CYC_TO_POS_OFFSET 1 // for debug | 578 | // #define CYC_TO_POS_OFFSET 1 // for debug |
| 573 | 579 | ||
| 574 | void MatchFinder_Init(CMatchFinder *p) | 580 | void MatchFinder_Init(void *_p) |
| 575 | { | 581 | { |
| 582 | CMatchFinder *p = (CMatchFinder *)_p; | ||
| 576 | MatchFinder_Init_HighHash(p); | 583 | MatchFinder_Init_HighHash(p); |
| 577 | MatchFinder_Init_LowHash(p); | 584 | MatchFinder_Init_LowHash(p); |
| 578 | MatchFinder_Init_4(p); | 585 | MatchFinder_Init_4(p); |
| @@ -607,16 +614,16 @@ void MatchFinder_Init(CMatchFinder *p) | |||
| 607 | #endif | 614 | #endif |
| 608 | #endif | 615 | #endif |
| 609 | 616 | ||
| 610 | // #elif defined(MY_CPU_ARM_OR_ARM64) | 617 | #elif defined(MY_CPU_ARM64) \ |
| 611 | #elif defined(MY_CPU_ARM64) | 618 | /* || (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) */ |
| 612 | 619 | ||
| 613 | #if defined(__clang__) && (__clang_major__ >= 8) \ | 620 | #if defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ |
| 614 | || defined(__GNUC__) && (__GNUC__ >= 8) | 621 | || defined(__GNUC__) && (__GNUC__ >= 6) |
| 615 | #define USE_LZFIND_SATUR_SUB_128 | 622 | #define USE_LZFIND_SATUR_SUB_128 |
| 616 | #ifdef MY_CPU_ARM64 | 623 | #ifdef MY_CPU_ARM64 |
| 617 | // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__(""))) | 624 | // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__(""))) |
| 618 | #else | 625 | #else |
| 619 | // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | 626 | #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=neon"))) |
| 620 | #endif | 627 | #endif |
| 621 | 628 | ||
| 622 | #elif defined(_MSC_VER) | 629 | #elif defined(_MSC_VER) |
| @@ -625,7 +632,7 @@ void MatchFinder_Init(CMatchFinder *p) | |||
| 625 | #endif | 632 | #endif |
| 626 | #endif | 633 | #endif |
| 627 | 634 | ||
| 628 | #if defined(_MSC_VER) && defined(MY_CPU_ARM64) | 635 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) |
| 629 | #include <arm64_neon.h> | 636 | #include <arm64_neon.h> |
| 630 | #else | 637 | #else |
| 631 | #include <arm_neon.h> | 638 | #include <arm_neon.h> |
| @@ -1082,9 +1089,11 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const | |||
| 1082 | 1089 | ||
| 1083 | 1090 | ||
| 1084 | #define MOVE_POS \ | 1091 | #define MOVE_POS \ |
| 1085 | ++p->cyclicBufferPos; \ | 1092 | p->cyclicBufferPos++; \ |
| 1086 | p->buffer++; \ | 1093 | p->buffer++; \ |
| 1087 | { const UInt32 pos1 = p->pos + 1; p->pos = pos1; if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); } | 1094 | { const UInt32 pos1 = p->pos + 1; \ |
| 1095 | p->pos = pos1; \ | ||
| 1096 | if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); } | ||
| 1088 | 1097 | ||
| 1089 | #define MOVE_POS_RET MOVE_POS return distances; | 1098 | #define MOVE_POS_RET MOVE_POS return distances; |
| 1090 | 1099 | ||
| @@ -1103,20 +1112,26 @@ static void MatchFinder_MovePos(CMatchFinder *p) | |||
| 1103 | } | 1112 | } |
| 1104 | 1113 | ||
| 1105 | #define GET_MATCHES_HEADER2(minLen, ret_op) \ | 1114 | #define GET_MATCHES_HEADER2(minLen, ret_op) \ |
| 1106 | unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \ | 1115 | UInt32 hv; const Byte *cur; UInt32 curMatch; \ |
| 1107 | lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ | 1116 | UInt32 lenLimit = p->lenLimit; \ |
| 1117 | if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; } \ | ||
| 1108 | cur = p->buffer; | 1118 | cur = p->buffer; |
| 1109 | 1119 | ||
| 1110 | #define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances) | 1120 | #define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances) |
| 1111 | #define SKIP_HEADER(minLen) do { GET_MATCHES_HEADER2(minLen, continue) | 1121 | #define SKIP_HEADER(minLen) \ |
| 1122 | do { GET_MATCHES_HEADER2(minLen, continue) | ||
| 1112 | 1123 | ||
| 1113 | #define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue | 1124 | #define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, \ |
| 1125 | p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue | ||
| 1114 | 1126 | ||
| 1115 | #define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS } while (--num); | 1127 | #define SKIP_FOOTER \ |
| 1128 | SkipMatchesSpec(MF_PARAMS(p)); \ | ||
| 1129 | MOVE_POS \ | ||
| 1130 | } while (--num); | ||
| 1116 | 1131 | ||
| 1117 | #define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ | 1132 | #define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ |
| 1118 | distances = func(MF_PARAMS(p), \ | 1133 | distances = func(MF_PARAMS(p), distances, (UInt32)_maxLen_); \ |
| 1119 | distances, (UInt32)_maxLen_); MOVE_POS_RET | 1134 | MOVE_POS_RET |
| 1120 | 1135 | ||
| 1121 | #define GET_MATCHES_FOOTER_BT(_maxLen_) \ | 1136 | #define GET_MATCHES_FOOTER_BT(_maxLen_) \ |
| 1122 | GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) | 1137 | GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) |
| @@ -1133,8 +1148,9 @@ static void MatchFinder_MovePos(CMatchFinder *p) | |||
| 1133 | for (; c != lim; c++) if (*(c + diff) != *c) break; \ | 1148 | for (; c != lim; c++) if (*(c + diff) != *c) break; \ |
| 1134 | maxLen = (unsigned)(c - cur); } | 1149 | maxLen = (unsigned)(c - cur); } |
| 1135 | 1150 | ||
| 1136 | static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1151 | static UInt32* Bt2_MatchFinder_GetMatches(void *_p, UInt32 *distances) |
| 1137 | { | 1152 | { |
| 1153 | CMatchFinder *p = (CMatchFinder *)_p; | ||
| 1138 | GET_MATCHES_HEADER(2) | 1154 | GET_MATCHES_HEADER(2) |
| 1139 | HASH2_CALC | 1155 | HASH2_CALC |
| 1140 | curMatch = p->hash[hv]; | 1156 | curMatch = p->hash[hv]; |
| @@ -1158,8 +1174,9 @@ UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
| 1158 | mmm = pos; | 1174 | mmm = pos; |
| 1159 | 1175 | ||
| 1160 | 1176 | ||
| 1161 | static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1177 | static UInt32* Bt3_MatchFinder_GetMatches(void *_p, UInt32 *distances) |
| 1162 | { | 1178 | { |
| 1179 | CMatchFinder *p = (CMatchFinder *)_p; | ||
| 1163 | UInt32 mmm; | 1180 | UInt32 mmm; |
| 1164 | UInt32 h2, d2, pos; | 1181 | UInt32 h2, d2, pos; |
| 1165 | unsigned maxLen; | 1182 | unsigned maxLen; |
| @@ -1199,8 +1216,9 @@ static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
| 1199 | } | 1216 | } |
| 1200 | 1217 | ||
| 1201 | 1218 | ||
| 1202 | static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1219 | static UInt32* Bt4_MatchFinder_GetMatches(void *_p, UInt32 *distances) |
| 1203 | { | 1220 | { |
| 1221 | CMatchFinder *p = (CMatchFinder *)_p; | ||
| 1204 | UInt32 mmm; | 1222 | UInt32 mmm; |
| 1205 | UInt32 h2, h3, d2, d3, pos; | 1223 | UInt32 h2, h3, d2, d3, pos; |
| 1206 | unsigned maxLen; | 1224 | unsigned maxLen; |
| @@ -1267,10 +1285,12 @@ static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
| 1267 | } | 1285 | } |
| 1268 | 1286 | ||
| 1269 | 1287 | ||
| 1270 | static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1288 | static UInt32* Bt5_MatchFinder_GetMatches(void *_p, UInt32 *distances) |
| 1271 | { | 1289 | { |
| 1290 | CMatchFinder *p = (CMatchFinder *)_p; | ||
| 1272 | UInt32 mmm; | 1291 | UInt32 mmm; |
| 1273 | UInt32 h2, h3, d2, d3, maxLen, pos; | 1292 | UInt32 h2, h3, d2, d3, pos; |
| 1293 | unsigned maxLen; | ||
| 1274 | UInt32 *hash; | 1294 | UInt32 *hash; |
| 1275 | GET_MATCHES_HEADER(5) | 1295 | GET_MATCHES_HEADER(5) |
| 1276 | 1296 | ||
| @@ -1339,8 +1359,9 @@ static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
| 1339 | } | 1359 | } |
| 1340 | 1360 | ||
| 1341 | 1361 | ||
| 1342 | static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1362 | static UInt32* Hc4_MatchFinder_GetMatches(void *_p, UInt32 *distances) |
| 1343 | { | 1363 | { |
| 1364 | CMatchFinder *p = (CMatchFinder *)_p; | ||
| 1344 | UInt32 mmm; | 1365 | UInt32 mmm; |
| 1345 | UInt32 h2, h3, d2, d3, pos; | 1366 | UInt32 h2, h3, d2, d3, pos; |
| 1346 | unsigned maxLen; | 1367 | unsigned maxLen; |
| @@ -1407,10 +1428,12 @@ static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
| 1407 | } | 1428 | } |
| 1408 | 1429 | ||
| 1409 | 1430 | ||
| 1410 | static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | 1431 | static UInt32 * Hc5_MatchFinder_GetMatches(void *_p, UInt32 *distances) |
| 1411 | { | 1432 | { |
| 1433 | CMatchFinder *p = (CMatchFinder *)_p; | ||
| 1412 | UInt32 mmm; | 1434 | UInt32 mmm; |
| 1413 | UInt32 h2, h3, d2, d3, maxLen, pos; | 1435 | UInt32 h2, h3, d2, d3, pos; |
| 1436 | unsigned maxLen; | ||
| 1414 | UInt32 *hash; | 1437 | UInt32 *hash; |
| 1415 | GET_MATCHES_HEADER(5) | 1438 | GET_MATCHES_HEADER(5) |
| 1416 | 1439 | ||
| @@ -1466,7 +1489,7 @@ static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
| 1466 | if (*(cur - d2 + 3) != cur[3]) | 1489 | if (*(cur - d2 + 3) != cur[3]) |
| 1467 | break; | 1490 | break; |
| 1468 | UPDATE_maxLen | 1491 | UPDATE_maxLen |
| 1469 | distances[-2] = maxLen; | 1492 | distances[-2] = (UInt32)maxLen; |
| 1470 | if (maxLen == lenLimit) | 1493 | if (maxLen == lenLimit) |
| 1471 | { | 1494 | { |
| 1472 | p->son[p->cyclicBufferPos] = curMatch; | 1495 | p->son[p->cyclicBufferPos] = curMatch; |
| @@ -1489,8 +1512,9 @@ UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) | |||
| 1489 | } | 1512 | } |
| 1490 | 1513 | ||
| 1491 | 1514 | ||
| 1492 | static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | 1515 | static void Bt2_MatchFinder_Skip(void *_p, UInt32 num) |
| 1493 | { | 1516 | { |
| 1517 | CMatchFinder *p = (CMatchFinder *)_p; | ||
| 1494 | SKIP_HEADER(2) | 1518 | SKIP_HEADER(2) |
| 1495 | { | 1519 | { |
| 1496 | HASH2_CALC | 1520 | HASH2_CALC |
| @@ -1511,8 +1535,9 @@ void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
| 1511 | SKIP_FOOTER | 1535 | SKIP_FOOTER |
| 1512 | } | 1536 | } |
| 1513 | 1537 | ||
| 1514 | static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | 1538 | static void Bt3_MatchFinder_Skip(void *_p, UInt32 num) |
| 1515 | { | 1539 | { |
| 1540 | CMatchFinder *p = (CMatchFinder *)_p; | ||
| 1516 | SKIP_HEADER(3) | 1541 | SKIP_HEADER(3) |
| 1517 | { | 1542 | { |
| 1518 | UInt32 h2; | 1543 | UInt32 h2; |
| @@ -1526,8 +1551,9 @@ static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
| 1526 | SKIP_FOOTER | 1551 | SKIP_FOOTER |
| 1527 | } | 1552 | } |
| 1528 | 1553 | ||
| 1529 | static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | 1554 | static void Bt4_MatchFinder_Skip(void *_p, UInt32 num) |
| 1530 | { | 1555 | { |
| 1556 | CMatchFinder *p = (CMatchFinder *)_p; | ||
| 1531 | SKIP_HEADER(4) | 1557 | SKIP_HEADER(4) |
| 1532 | { | 1558 | { |
| 1533 | UInt32 h2, h3; | 1559 | UInt32 h2, h3; |
| @@ -1542,8 +1568,9 @@ static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
| 1542 | SKIP_FOOTER | 1568 | SKIP_FOOTER |
| 1543 | } | 1569 | } |
| 1544 | 1570 | ||
| 1545 | static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | 1571 | static void Bt5_MatchFinder_Skip(void *_p, UInt32 num) |
| 1546 | { | 1572 | { |
| 1573 | CMatchFinder *p = (CMatchFinder *)_p; | ||
| 1547 | SKIP_HEADER(5) | 1574 | SKIP_HEADER(5) |
| 1548 | { | 1575 | { |
| 1549 | UInt32 h2, h3; | 1576 | UInt32 h2, h3; |
| @@ -1589,8 +1616,9 @@ static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
| 1589 | }} while(num); \ | 1616 | }} while(num); \ |
| 1590 | 1617 | ||
| 1591 | 1618 | ||
| 1592 | static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | 1619 | static void Hc4_MatchFinder_Skip(void *_p, UInt32 num) |
| 1593 | { | 1620 | { |
| 1621 | CMatchFinder *p = (CMatchFinder *)_p; | ||
| 1594 | HC_SKIP_HEADER(4) | 1622 | HC_SKIP_HEADER(4) |
| 1595 | 1623 | ||
| 1596 | UInt32 h2, h3; | 1624 | UInt32 h2, h3; |
| @@ -1604,8 +1632,9 @@ static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
| 1604 | } | 1632 | } |
| 1605 | 1633 | ||
| 1606 | 1634 | ||
| 1607 | static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | 1635 | static void Hc5_MatchFinder_Skip(void *_p, UInt32 num) |
| 1608 | { | 1636 | { |
| 1637 | CMatchFinder *p = (CMatchFinder *)_p; | ||
| 1609 | HC_SKIP_HEADER(5) | 1638 | HC_SKIP_HEADER(5) |
| 1610 | 1639 | ||
| 1611 | UInt32 h2, h3; | 1640 | UInt32 h2, h3; |
| @@ -1634,41 +1663,41 @@ void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) | |||
| 1634 | 1663 | ||
| 1635 | void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable) | 1664 | void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable) |
| 1636 | { | 1665 | { |
| 1637 | vTable->Init = (Mf_Init_Func)MatchFinder_Init; | 1666 | vTable->Init = MatchFinder_Init; |
| 1638 | vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes; | 1667 | vTable->GetNumAvailableBytes = MatchFinder_GetNumAvailableBytes; |
| 1639 | vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos; | 1668 | vTable->GetPointerToCurrentPos = MatchFinder_GetPointerToCurrentPos; |
| 1640 | if (!p->btMode) | 1669 | if (!p->btMode) |
| 1641 | { | 1670 | { |
| 1642 | if (p->numHashBytes <= 4) | 1671 | if (p->numHashBytes <= 4) |
| 1643 | { | 1672 | { |
| 1644 | vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches; | 1673 | vTable->GetMatches = Hc4_MatchFinder_GetMatches; |
| 1645 | vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip; | 1674 | vTable->Skip = Hc4_MatchFinder_Skip; |
| 1646 | } | 1675 | } |
| 1647 | else | 1676 | else |
| 1648 | { | 1677 | { |
| 1649 | vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches; | 1678 | vTable->GetMatches = Hc5_MatchFinder_GetMatches; |
| 1650 | vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip; | 1679 | vTable->Skip = Hc5_MatchFinder_Skip; |
| 1651 | } | 1680 | } |
| 1652 | } | 1681 | } |
| 1653 | else if (p->numHashBytes == 2) | 1682 | else if (p->numHashBytes == 2) |
| 1654 | { | 1683 | { |
| 1655 | vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches; | 1684 | vTable->GetMatches = Bt2_MatchFinder_GetMatches; |
| 1656 | vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip; | 1685 | vTable->Skip = Bt2_MatchFinder_Skip; |
| 1657 | } | 1686 | } |
| 1658 | else if (p->numHashBytes == 3) | 1687 | else if (p->numHashBytes == 3) |
| 1659 | { | 1688 | { |
| 1660 | vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches; | 1689 | vTable->GetMatches = Bt3_MatchFinder_GetMatches; |
| 1661 | vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip; | 1690 | vTable->Skip = Bt3_MatchFinder_Skip; |
| 1662 | } | 1691 | } |
| 1663 | else if (p->numHashBytes == 4) | 1692 | else if (p->numHashBytes == 4) |
| 1664 | { | 1693 | { |
| 1665 | vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches; | 1694 | vTable->GetMatches = Bt4_MatchFinder_GetMatches; |
| 1666 | vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip; | 1695 | vTable->Skip = Bt4_MatchFinder_Skip; |
| 1667 | } | 1696 | } |
| 1668 | else | 1697 | else |
| 1669 | { | 1698 | { |
| 1670 | vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches; | 1699 | vTable->GetMatches = Bt5_MatchFinder_GetMatches; |
| 1671 | vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip; | 1700 | vTable->Skip = Bt5_MatchFinder_Skip; |
| 1672 | } | 1701 | } |
| 1673 | } | 1702 | } |
| 1674 | 1703 | ||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* LzFind.h -- Match finder for LZ algorithms | 1 | /* LzFind.h -- Match finder for LZ algorithms |
| 2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-22 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_LZ_FIND_H | 4 | #ifndef ZIP7_INC_LZ_FIND_H |
| 5 | #define ZIP7_INC_LZ_FIND_H | 5 | #define ZIP7_INC_LZ_FIND_H |
| @@ -144,7 +144,8 @@ void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable); | |||
| 144 | void MatchFinder_Init_LowHash(CMatchFinder *p); | 144 | void MatchFinder_Init_LowHash(CMatchFinder *p); |
| 145 | void MatchFinder_Init_HighHash(CMatchFinder *p); | 145 | void MatchFinder_Init_HighHash(CMatchFinder *p); |
| 146 | void MatchFinder_Init_4(CMatchFinder *p); | 146 | void MatchFinder_Init_4(CMatchFinder *p); |
| 147 | void MatchFinder_Init(CMatchFinder *p); | 147 | // void MatchFinder_Init(CMatchFinder *p); |
| 148 | void MatchFinder_Init(void *p); | ||
| 148 | 149 | ||
| 149 | UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); | 150 | UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); |
| 150 | UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); | 151 | UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); |
diff --git a/C/LzFindMt.c b/C/LzFindMt.c index 5253e6e..ac9d59d 100644 --- a/C/LzFindMt.c +++ b/C/LzFindMt.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* LzFindMt.c -- multithreaded Match finder for LZ algorithms | 1 | /* LzFindMt.c -- multithreaded Match finder for LZ algorithms |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-22 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -94,7 +94,7 @@ static void MtSync_Construct(CMtSync *p) | |||
| 94 | } | 94 | } |
| 95 | 95 | ||
| 96 | 96 | ||
| 97 | #define DEBUG_BUFFER_LOCK // define it to debug lock state | 97 | // #define DEBUG_BUFFER_LOCK // define it to debug lock state |
| 98 | 98 | ||
| 99 | #ifdef DEBUG_BUFFER_LOCK | 99 | #ifdef DEBUG_BUFFER_LOCK |
| 100 | #include <stdlib.h> | 100 | #include <stdlib.h> |
| @@ -877,8 +877,9 @@ SRes MatchFinderMt_InitMt(CMatchFinderMt *p) | |||
| 877 | } | 877 | } |
| 878 | 878 | ||
| 879 | 879 | ||
| 880 | static void MatchFinderMt_Init(CMatchFinderMt *p) | 880 | static void MatchFinderMt_Init(void *_p) |
| 881 | { | 881 | { |
| 882 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
| 882 | CMatchFinder *mf = MF(p); | 883 | CMatchFinder *mf = MF(p); |
| 883 | 884 | ||
| 884 | p->btBufPos = | 885 | p->btBufPos = |
| @@ -981,8 +982,9 @@ static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p) | |||
| 981 | 982 | ||
| 982 | 983 | ||
| 983 | 984 | ||
| 984 | static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p) | 985 | static const Byte * MatchFinderMt_GetPointerToCurrentPos(void *_p) |
| 985 | { | 986 | { |
| 987 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
| 986 | return p->pointerToCurPos; | 988 | return p->pointerToCurPos; |
| 987 | } | 989 | } |
| 988 | 990 | ||
| @@ -990,8 +992,9 @@ static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p) | |||
| 990 | #define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p); | 992 | #define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p); |
| 991 | 993 | ||
| 992 | 994 | ||
| 993 | static UInt32 MatchFinderMt_GetNumAvailableBytes(CMatchFinderMt *p) | 995 | static UInt32 MatchFinderMt_GetNumAvailableBytes(void *_p) |
| 994 | { | 996 | { |
| 997 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
| 995 | if (p->btBufPos != p->btBufPosLimit) | 998 | if (p->btBufPos != p->btBufPosLimit) |
| 996 | return p->btNumAvailBytes; | 999 | return p->btNumAvailBytes; |
| 997 | return MatchFinderMt_GetNextBlock_Bt(p); | 1000 | return MatchFinderMt_GetNextBlock_Bt(p); |
| @@ -1243,8 +1246,9 @@ static UInt32 * MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) | |||
| 1243 | } | 1246 | } |
| 1244 | 1247 | ||
| 1245 | 1248 | ||
| 1246 | static UInt32 * MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d) | 1249 | static UInt32 * MatchFinderMt2_GetMatches(void *_p, UInt32 *d) |
| 1247 | { | 1250 | { |
| 1251 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
| 1248 | const UInt32 *bt = p->btBufPos; | 1252 | const UInt32 *bt = p->btBufPos; |
| 1249 | const UInt32 len = *bt++; | 1253 | const UInt32 len = *bt++; |
| 1250 | const UInt32 *btLim = bt + len; | 1254 | const UInt32 *btLim = bt + len; |
| @@ -1267,8 +1271,9 @@ static UInt32 * MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d) | |||
| 1267 | 1271 | ||
| 1268 | 1272 | ||
| 1269 | 1273 | ||
| 1270 | static UInt32 * MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d) | 1274 | static UInt32 * MatchFinderMt_GetMatches(void *_p, UInt32 *d) |
| 1271 | { | 1275 | { |
| 1276 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
| 1272 | const UInt32 *bt = p->btBufPos; | 1277 | const UInt32 *bt = p->btBufPos; |
| 1273 | UInt32 len = *bt++; | 1278 | UInt32 len = *bt++; |
| 1274 | const UInt32 avail = p->btNumAvailBytes - 1; | 1279 | const UInt32 avail = p->btNumAvailBytes - 1; |
| @@ -1315,14 +1320,16 @@ static UInt32 * MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d) | |||
| 1315 | #define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash; | 1320 | #define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash; |
| 1316 | #define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0); | 1321 | #define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0); |
| 1317 | 1322 | ||
| 1318 | static void MatchFinderMt0_Skip(CMatchFinderMt *p, UInt32 num) | 1323 | static void MatchFinderMt0_Skip(void *_p, UInt32 num) |
| 1319 | { | 1324 | { |
| 1325 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
| 1320 | SKIP_HEADER2_MT { p->btNumAvailBytes--; | 1326 | SKIP_HEADER2_MT { p->btNumAvailBytes--; |
| 1321 | SKIP_FOOTER_MT | 1327 | SKIP_FOOTER_MT |
| 1322 | } | 1328 | } |
| 1323 | 1329 | ||
| 1324 | static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num) | 1330 | static void MatchFinderMt2_Skip(void *_p, UInt32 num) |
| 1325 | { | 1331 | { |
| 1332 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
| 1326 | SKIP_HEADER_MT(2) | 1333 | SKIP_HEADER_MT(2) |
| 1327 | UInt32 h2; | 1334 | UInt32 h2; |
| 1328 | MT_HASH2_CALC | 1335 | MT_HASH2_CALC |
| @@ -1330,8 +1337,9 @@ static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num) | |||
| 1330 | SKIP_FOOTER_MT | 1337 | SKIP_FOOTER_MT |
| 1331 | } | 1338 | } |
| 1332 | 1339 | ||
| 1333 | static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num) | 1340 | static void MatchFinderMt3_Skip(void *_p, UInt32 num) |
| 1334 | { | 1341 | { |
| 1342 | CMatchFinderMt *p = (CMatchFinderMt *)_p; | ||
| 1335 | SKIP_HEADER_MT(3) | 1343 | SKIP_HEADER_MT(3) |
| 1336 | UInt32 h2, h3; | 1344 | UInt32 h2, h3; |
| 1337 | MT_HASH3_CALC | 1345 | MT_HASH3_CALC |
| @@ -1361,39 +1369,39 @@ static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num) | |||
| 1361 | 1369 | ||
| 1362 | void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable) | 1370 | void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable) |
| 1363 | { | 1371 | { |
| 1364 | vTable->Init = (Mf_Init_Func)MatchFinderMt_Init; | 1372 | vTable->Init = MatchFinderMt_Init; |
| 1365 | vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinderMt_GetNumAvailableBytes; | 1373 | vTable->GetNumAvailableBytes = MatchFinderMt_GetNumAvailableBytes; |
| 1366 | vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinderMt_GetPointerToCurrentPos; | 1374 | vTable->GetPointerToCurrentPos = MatchFinderMt_GetPointerToCurrentPos; |
| 1367 | vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches; | 1375 | vTable->GetMatches = MatchFinderMt_GetMatches; |
| 1368 | 1376 | ||
| 1369 | switch (MF(p)->numHashBytes) | 1377 | switch (MF(p)->numHashBytes) |
| 1370 | { | 1378 | { |
| 1371 | case 2: | 1379 | case 2: |
| 1372 | p->GetHeadsFunc = GetHeads2; | 1380 | p->GetHeadsFunc = GetHeads2; |
| 1373 | p->MixMatchesFunc = (Mf_Mix_Matches)NULL; | 1381 | p->MixMatchesFunc = NULL; |
| 1374 | vTable->Skip = (Mf_Skip_Func)MatchFinderMt0_Skip; | 1382 | vTable->Skip = MatchFinderMt0_Skip; |
| 1375 | vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt2_GetMatches; | 1383 | vTable->GetMatches = MatchFinderMt2_GetMatches; |
| 1376 | break; | 1384 | break; |
| 1377 | case 3: | 1385 | case 3: |
| 1378 | p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3; | 1386 | p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3; |
| 1379 | p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches2; | 1387 | p->MixMatchesFunc = MixMatches2; |
| 1380 | vTable->Skip = (Mf_Skip_Func)MatchFinderMt2_Skip; | 1388 | vTable->Skip = MatchFinderMt2_Skip; |
| 1381 | break; | 1389 | break; |
| 1382 | case 4: | 1390 | case 4: |
| 1383 | p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4; | 1391 | p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4; |
| 1384 | 1392 | ||
| 1385 | // it's fast inline version of GetMatches() | 1393 | // it's fast inline version of GetMatches() |
| 1386 | // vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches_Bt4; | 1394 | // vTable->GetMatches = MatchFinderMt_GetMatches_Bt4; |
| 1387 | 1395 | ||
| 1388 | p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches3; | 1396 | p->MixMatchesFunc = MixMatches3; |
| 1389 | vTable->Skip = (Mf_Skip_Func)MatchFinderMt3_Skip; | 1397 | vTable->Skip = MatchFinderMt3_Skip; |
| 1390 | break; | 1398 | break; |
| 1391 | default: | 1399 | default: |
| 1392 | p->GetHeadsFunc = MF(p)->bigHash ? GetHeads5b : GetHeads5; | 1400 | p->GetHeadsFunc = MF(p)->bigHash ? GetHeads5b : GetHeads5; |
| 1393 | p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches4; | 1401 | p->MixMatchesFunc = MixMatches4; |
| 1394 | vTable->Skip = | 1402 | vTable->Skip = |
| 1395 | (Mf_Skip_Func)MatchFinderMt3_Skip; | 1403 | MatchFinderMt3_Skip; |
| 1396 | // (Mf_Skip_Func)MatchFinderMt4_Skip; | 1404 | // MatchFinderMt4_Skip; |
| 1397 | break; | 1405 | break; |
| 1398 | } | 1406 | } |
| 1399 | } | 1407 | } |
diff --git a/C/LzFindMt.h b/C/LzFindMt.h index db5923e..fcb479d 100644 --- a/C/LzFindMt.h +++ b/C/LzFindMt.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* LzFindMt.h -- multithreaded Match finder for LZ algorithms | 1 | /* LzFindMt.h -- multithreaded Match finder for LZ algorithms |
| 2 | 2023-03-05 : Igor Pavlov : Public domain */ | 2 | 2024-01-22 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_LZ_FIND_MT_H | 4 | #ifndef ZIP7_INC_LZ_FIND_MT_H |
| 5 | #define ZIP7_INC_LZ_FIND_MT_H | 5 | #define ZIP7_INC_LZ_FIND_MT_H |
| @@ -31,7 +31,10 @@ typedef struct | |||
| 31 | // UInt32 numBlocks_Sent; | 31 | // UInt32 numBlocks_Sent; |
| 32 | } CMtSync; | 32 | } CMtSync; |
| 33 | 33 | ||
| 34 | typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distances); | 34 | |
| 35 | struct CMatchFinderMt_; | ||
| 36 | |||
| 37 | typedef UInt32 * (*Mf_Mix_Matches)(struct CMatchFinderMt_ *p, UInt32 matchMinPos, UInt32 *distances); | ||
| 35 | 38 | ||
| 36 | /* kMtCacheLineDummy must be >= size_of_CPU_cache_line */ | 39 | /* kMtCacheLineDummy must be >= size_of_CPU_cache_line */ |
| 37 | #define kMtCacheLineDummy 128 | 40 | #define kMtCacheLineDummy 128 |
| @@ -39,7 +42,7 @@ typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distance | |||
| 39 | typedef void (*Mf_GetHeads)(const Byte *buffer, UInt32 pos, | 42 | typedef void (*Mf_GetHeads)(const Byte *buffer, UInt32 pos, |
| 40 | UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc); | 43 | UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc); |
| 41 | 44 | ||
| 42 | typedef struct | 45 | typedef struct CMatchFinderMt_ |
| 43 | { | 46 | { |
| 44 | /* LZ */ | 47 | /* LZ */ |
| 45 | const Byte *pointerToCurPos; | 48 | const Byte *pointerToCurPos; |
diff --git a/C/Lzma2Dec.c b/C/Lzma2Dec.c index 388cbc7..8bf54e4 100644 --- a/C/Lzma2Dec.c +++ b/C/Lzma2Dec.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Lzma2Dec.c -- LZMA2 Decoder | 1 | /* Lzma2Dec.c -- LZMA2 Decoder |
| 2 | 2023-03-03 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | /* #define SHOW_DEBUG_INFO */ | 4 | /* #define SHOW_DEBUG_INFO */ |
| 5 | 5 | ||
| @@ -157,8 +157,10 @@ static unsigned Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b) | |||
| 157 | p->decoder.prop.lp = (Byte)lp; | 157 | p->decoder.prop.lp = (Byte)lp; |
| 158 | return LZMA2_STATE_DATA; | 158 | return LZMA2_STATE_DATA; |
| 159 | } | 159 | } |
| 160 | |||
| 161 | default: | ||
| 162 | return LZMA2_STATE_ERROR; | ||
| 160 | } | 163 | } |
| 161 | return LZMA2_STATE_ERROR; | ||
| 162 | } | 164 | } |
| 163 | 165 | ||
| 164 | static void LzmaDec_UpdateWithUncompressed(CLzmaDec *p, const Byte *src, SizeT size) | 166 | static void LzmaDec_UpdateWithUncompressed(CLzmaDec *p, const Byte *src, SizeT size) |
diff --git a/C/LzmaEnc.c b/C/LzmaEnc.c index 6d13cac..37b2787 100644 --- a/C/LzmaEnc.c +++ b/C/LzmaEnc.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* LzmaEnc.c -- LZMA Encoder | 1 | /* LzmaEnc.c -- LZMA Encoder |
| 2 | 2023-04-13: Igor Pavlov : Public domain */ | 2 | 2024-01-24: Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -195,11 +195,11 @@ unsigned GetPosSlot1(UInt32 pos); | |||
| 195 | unsigned GetPosSlot1(UInt32 pos) | 195 | unsigned GetPosSlot1(UInt32 pos) |
| 196 | { | 196 | { |
| 197 | unsigned res; | 197 | unsigned res; |
| 198 | BSR2_RET(pos, res); | 198 | BSR2_RET(pos, res) |
| 199 | return res; | 199 | return res; |
| 200 | } | 200 | } |
| 201 | #define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } | 201 | #define GetPosSlot2(pos, res) { BSR2_RET(pos, res) } |
| 202 | #define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res); } | 202 | #define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res) } |
| 203 | 203 | ||
| 204 | 204 | ||
| 205 | #else // ! LZMA_LOG_BSR | 205 | #else // ! LZMA_LOG_BSR |
| @@ -512,7 +512,7 @@ struct CLzmaEnc | |||
| 512 | COPY_ARR(d, s, posEncoders) \ | 512 | COPY_ARR(d, s, posEncoders) \ |
| 513 | (d)->lenProbs = (s)->lenProbs; \ | 513 | (d)->lenProbs = (s)->lenProbs; \ |
| 514 | (d)->repLenProbs = (s)->repLenProbs; \ | 514 | (d)->repLenProbs = (s)->repLenProbs; \ |
| 515 | memcpy((d)->litProbs, (s)->litProbs, ((UInt32)0x300 << (p)->lclp) * sizeof(CLzmaProb)); | 515 | memcpy((d)->litProbs, (s)->litProbs, ((size_t)0x300 * sizeof(CLzmaProb)) << (p)->lclp); |
| 516 | 516 | ||
| 517 | void LzmaEnc_SaveState(CLzmaEncHandle p) | 517 | void LzmaEnc_SaveState(CLzmaEncHandle p) |
| 518 | { | 518 | { |
| @@ -1040,14 +1040,14 @@ Z7_NO_INLINE static void Z7_FASTCALL LenPriceEnc_UpdateTables( | |||
| 1040 | UInt32 price = b; | 1040 | UInt32 price = b; |
| 1041 | do | 1041 | do |
| 1042 | { | 1042 | { |
| 1043 | unsigned bit = sym & 1; | 1043 | const unsigned bit = sym & 1; |
| 1044 | sym >>= 1; | 1044 | sym >>= 1; |
| 1045 | price += GET_PRICEa(probs[sym], bit); | 1045 | price += GET_PRICEa(probs[sym], bit); |
| 1046 | } | 1046 | } |
| 1047 | while (sym >= 2); | 1047 | while (sym >= 2); |
| 1048 | 1048 | ||
| 1049 | { | 1049 | { |
| 1050 | unsigned prob = probs[(size_t)i + (1 << (kLenNumHighBits - 1))]; | 1050 | const unsigned prob = probs[(size_t)i + (1 << (kLenNumHighBits - 1))]; |
| 1051 | prices[(size_t)i * 2 ] = price + GET_PRICEa_0(prob); | 1051 | prices[(size_t)i * 2 ] = price + GET_PRICEa_0(prob); |
| 1052 | prices[(size_t)i * 2 + 1] = price + GET_PRICEa_1(prob); | 1052 | prices[(size_t)i * 2 + 1] = price + GET_PRICEa_1(prob); |
| 1053 | } | 1053 | } |
| @@ -1056,7 +1056,7 @@ Z7_NO_INLINE static void Z7_FASTCALL LenPriceEnc_UpdateTables( | |||
| 1056 | 1056 | ||
| 1057 | { | 1057 | { |
| 1058 | unsigned posState; | 1058 | unsigned posState; |
| 1059 | size_t num = (p->tableSize - kLenNumLowSymbols * 2) * sizeof(p->prices[0][0]); | 1059 | const size_t num = (p->tableSize - kLenNumLowSymbols * 2) * sizeof(p->prices[0][0]); |
| 1060 | for (posState = 1; posState < numPosStates; posState++) | 1060 | for (posState = 1; posState < numPosStates; posState++) |
| 1061 | memcpy(p->prices[posState] + kLenNumLowSymbols * 2, p->prices[0] + kLenNumLowSymbols * 2, num); | 1061 | memcpy(p->prices[posState] + kLenNumLowSymbols * 2, p->prices[0] + kLenNumLowSymbols * 2, num); |
| 1062 | } | 1062 | } |
| @@ -2696,12 +2696,12 @@ static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, | |||
| 2696 | #endif | 2696 | #endif |
| 2697 | 2697 | ||
| 2698 | { | 2698 | { |
| 2699 | unsigned lclp = p->lc + p->lp; | 2699 | const unsigned lclp = p->lc + p->lp; |
| 2700 | if (!p->litProbs || !p->saveState.litProbs || p->lclp != lclp) | 2700 | if (!p->litProbs || !p->saveState.litProbs || p->lclp != lclp) |
| 2701 | { | 2701 | { |
| 2702 | LzmaEnc_FreeLits(p, alloc); | 2702 | LzmaEnc_FreeLits(p, alloc); |
| 2703 | p->litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((UInt32)0x300 << lclp) * sizeof(CLzmaProb)); | 2703 | p->litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((size_t)0x300 * sizeof(CLzmaProb)) << lclp); |
| 2704 | p->saveState.litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((UInt32)0x300 << lclp) * sizeof(CLzmaProb)); | 2704 | p->saveState.litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((size_t)0x300 * sizeof(CLzmaProb)) << lclp); |
| 2705 | if (!p->litProbs || !p->saveState.litProbs) | 2705 | if (!p->litProbs || !p->saveState.litProbs) |
| 2706 | { | 2706 | { |
| 2707 | LzmaEnc_FreeLits(p, alloc); | 2707 | LzmaEnc_FreeLits(p, alloc); |
| @@ -2802,8 +2802,8 @@ static void LzmaEnc_Init(CLzmaEnc *p) | |||
| 2802 | } | 2802 | } |
| 2803 | 2803 | ||
| 2804 | { | 2804 | { |
| 2805 | UInt32 num = (UInt32)0x300 << (p->lp + p->lc); | 2805 | const size_t num = (size_t)0x300 << (p->lp + p->lc); |
| 2806 | UInt32 k; | 2806 | size_t k; |
| 2807 | CLzmaProb *probs = p->litProbs; | 2807 | CLzmaProb *probs = p->litProbs; |
| 2808 | for (k = 0; k < num; k++) | 2808 | for (k = 0; k < num; k++) |
| 2809 | probs[k] = kProbInitValue; | 2809 | probs[k] = kProbInitValue; |
diff --git a/C/MtCoder.c b/C/MtCoder.c index 6f58abb..03959b6 100644 --- a/C/MtCoder.c +++ b/C/MtCoder.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* MtCoder.c -- Multi-thread Coder | 1 | /* MtCoder.c -- Multi-thread Coder |
| 2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | 2023-09-07 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -430,7 +430,7 @@ SRes MtCoder_Code(CMtCoder *p) | |||
| 430 | SRes res = SZ_OK; | 430 | SRes res = SZ_OK; |
| 431 | 431 | ||
| 432 | if (numThreads > MTCODER_THREADS_MAX) | 432 | if (numThreads > MTCODER_THREADS_MAX) |
| 433 | numThreads = MTCODER_THREADS_MAX; | 433 | numThreads = MTCODER_THREADS_MAX; |
| 434 | numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); | 434 | numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); |
| 435 | 435 | ||
| 436 | if (p->blockSize < ((UInt32)1 << 26)) numBlocksMax++; | 436 | if (p->blockSize < ((UInt32)1 << 26)) numBlocksMax++; |
| @@ -438,7 +438,7 @@ SRes MtCoder_Code(CMtCoder *p) | |||
| 438 | if (p->blockSize < ((UInt32)1 << 22)) numBlocksMax++; | 438 | if (p->blockSize < ((UInt32)1 << 22)) numBlocksMax++; |
| 439 | 439 | ||
| 440 | if (numBlocksMax > MTCODER_BLOCKS_MAX) | 440 | if (numBlocksMax > MTCODER_BLOCKS_MAX) |
| 441 | numBlocksMax = MTCODER_BLOCKS_MAX; | 441 | numBlocksMax = MTCODER_BLOCKS_MAX; |
| 442 | 442 | ||
| 443 | if (p->blockSize != p->allocatedBufsSize) | 443 | if (p->blockSize != p->allocatedBufsSize) |
| 444 | { | 444 | { |
| @@ -469,7 +469,7 @@ SRes MtCoder_Code(CMtCoder *p) | |||
| 469 | 469 | ||
| 470 | { | 470 | { |
| 471 | RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->readEvent)) | 471 | RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->readEvent)) |
| 472 | RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, numBlocksMax, numBlocksMax)) | 472 | RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, (UInt32)numBlocksMax, (UInt32)numBlocksMax)) |
| 473 | } | 473 | } |
| 474 | 474 | ||
| 475 | for (i = 0; i < MTCODER_BLOCKS_MAX - 1; i++) | 475 | for (i = 0; i < MTCODER_BLOCKS_MAX - 1; i++) |
| @@ -1,5 +1,5 @@ | |||
| 1 | /* MtDec.c -- Multi-thread Decoder | 1 | /* MtDec.c -- Multi-thread Decoder |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-02-20 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -809,6 +809,16 @@ static WRes MtDec_ThreadFunc2(CMtDecThread *t) | |||
| 809 | #endif | 809 | #endif |
| 810 | 810 | ||
| 811 | 811 | ||
| 812 | typedef | ||
| 813 | #ifdef _WIN32 | ||
| 814 | UINT_PTR | ||
| 815 | #elif 1 | ||
| 816 | uintptr_t | ||
| 817 | #else | ||
| 818 | ptrdiff_t | ||
| 819 | #endif | ||
| 820 | MY_uintptr_t; | ||
| 821 | |||
| 812 | static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) | 822 | static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) |
| 813 | { | 823 | { |
| 814 | WRes res; | 824 | WRes res; |
| @@ -821,7 +831,7 @@ static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) | |||
| 821 | res = MtDec_ThreadFunc2(t); | 831 | res = MtDec_ThreadFunc2(t); |
| 822 | p = t->mtDec; | 832 | p = t->mtDec; |
| 823 | if (res == 0) | 833 | if (res == 0) |
| 824 | return (THREAD_FUNC_RET_TYPE)(UINT_PTR)p->exitThreadWRes; | 834 | return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)p->exitThreadWRes; |
| 825 | { | 835 | { |
| 826 | // it's unexpected situation for some threading function error | 836 | // it's unexpected situation for some threading function error |
| 827 | if (p->exitThreadWRes == 0) | 837 | if (p->exitThreadWRes == 0) |
| @@ -832,7 +842,7 @@ static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) | |||
| 832 | Event_Set(&p->threads[0].canWrite); | 842 | Event_Set(&p->threads[0].canWrite); |
| 833 | MtProgress_SetError(&p->mtProgress, MY_SRes_HRESULT_FROM_WRes(res)); | 843 | MtProgress_SetError(&p->mtProgress, MY_SRes_HRESULT_FROM_WRes(res)); |
| 834 | } | 844 | } |
| 835 | return (THREAD_FUNC_RET_TYPE)(UINT_PTR)res; | 845 | return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)res; |
| 836 | } | 846 | } |
| 837 | 847 | ||
| 838 | static Z7_NO_INLINE THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp) | 848 | static Z7_NO_INLINE THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp) |
| @@ -1072,7 +1082,7 @@ SRes MtDec_Code(CMtDec *p) | |||
| 1072 | if (wres == 0) { wres = Event_Set(&nextThread->canWrite); | 1082 | if (wres == 0) { wres = Event_Set(&nextThread->canWrite); |
| 1073 | if (wres == 0) { wres = Event_Set(&nextThread->canRead); | 1083 | if (wres == 0) { wres = Event_Set(&nextThread->canRead); |
| 1074 | if (wres == 0) { THREAD_FUNC_RET_TYPE res = MtDec_ThreadFunc(nextThread); | 1084 | if (wres == 0) { THREAD_FUNC_RET_TYPE res = MtDec_ThreadFunc(nextThread); |
| 1075 | wres = (WRes)(UINT_PTR)res; | 1085 | wres = (WRes)(MY_uintptr_t)res; |
| 1076 | if (wres != 0) | 1086 | if (wres != 0) |
| 1077 | { | 1087 | { |
| 1078 | p->needContinue = False; | 1088 | p->needContinue = False; |
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Ppmd7.c -- PPMdH codec | 1 | /* Ppmd7.c -- PPMdH codec |
| 2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
| 3 | This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ | 3 | This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ |
| 4 | 4 | ||
| 5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
| @@ -302,8 +302,17 @@ static void *Ppmd7_AllocUnits(CPpmd7 *p, unsigned indx) | |||
| 302 | 302 | ||
| 303 | 303 | ||
| 304 | #define MEM_12_CPY(dest, src, num) \ | 304 | #define MEM_12_CPY(dest, src, num) \ |
| 305 | { UInt32 *d = (UInt32 *)dest; const UInt32 *z = (const UInt32 *)src; UInt32 n = num; \ | 305 | { UInt32 *d = (UInt32 *)(dest); \ |
| 306 | do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); } | 306 | const UInt32 *z = (const UInt32 *)(src); \ |
| 307 | unsigned n = (num); \ | ||
| 308 | do { \ | ||
| 309 | d[0] = z[0]; \ | ||
| 310 | d[1] = z[1]; \ | ||
| 311 | d[2] = z[2]; \ | ||
| 312 | z += 3; \ | ||
| 313 | d += 3; \ | ||
| 314 | } while (--n); \ | ||
| 315 | } | ||
| 307 | 316 | ||
| 308 | 317 | ||
| 309 | /* | 318 | /* |
| @@ -711,8 +720,8 @@ void Ppmd7_UpdateModel(CPpmd7 *p) | |||
| 711 | if ((ns1 & 1) == 0) | 720 | if ((ns1 & 1) == 0) |
| 712 | { | 721 | { |
| 713 | /* Expand for one UNIT */ | 722 | /* Expand for one UNIT */ |
| 714 | unsigned oldNU = ns1 >> 1; | 723 | const unsigned oldNU = ns1 >> 1; |
| 715 | unsigned i = U2I(oldNU); | 724 | const unsigned i = U2I(oldNU); |
| 716 | if (i != U2I((size_t)oldNU + 1)) | 725 | if (i != U2I((size_t)oldNU + 1)) |
| 717 | { | 726 | { |
| 718 | void *ptr = Ppmd7_AllocUnits(p, i + 1); | 727 | void *ptr = Ppmd7_AllocUnits(p, i + 1); |
| @@ -731,7 +740,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p) | |||
| 731 | sum = c->Union2.SummFreq; | 740 | sum = c->Union2.SummFreq; |
| 732 | /* max increase of Escape_Freq is 3 here. | 741 | /* max increase of Escape_Freq is 3 here. |
| 733 | total increase of Union2.SummFreq for all symbols is less than 256 here */ | 742 | total increase of Union2.SummFreq for all symbols is less than 256 here */ |
| 734 | sum += (UInt32)(2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1)); | 743 | sum += (UInt32)(unsigned)((2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1))); |
| 735 | /* original PPMdH uses 16-bit variable for (sum) here. | 744 | /* original PPMdH uses 16-bit variable for (sum) here. |
| 736 | But (sum < 0x9000). So we don't truncate (sum) to 16-bit */ | 745 | But (sum < 0x9000). So we don't truncate (sum) to 16-bit */ |
| 737 | // sum = (UInt16)sum; | 746 | // sum = (UInt16)sum; |
| @@ -761,7 +770,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p) | |||
| 761 | // (max(s->freq) == 120), when we convert from 1-symbol into 2-symbol context | 770 | // (max(s->freq) == 120), when we convert from 1-symbol into 2-symbol context |
| 762 | s->Freq = (Byte)freq; | 771 | s->Freq = (Byte)freq; |
| 763 | // max(InitEsc = PPMD7_kExpEscape[*]) is 25. So the max(escapeFreq) is 26 here | 772 | // max(InitEsc = PPMD7_kExpEscape[*]) is 25. So the max(escapeFreq) is 26 here |
| 764 | sum = freq + p->InitEsc + (ns > 3); | 773 | sum = (UInt32)(freq + p->InitEsc + (ns > 3)); |
| 765 | } | 774 | } |
| 766 | } | 775 | } |
| 767 | 776 | ||
| @@ -933,10 +942,10 @@ CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq) | |||
| 933 | p->HiBitsFlag; | 942 | p->HiBitsFlag; |
| 934 | { | 943 | { |
| 935 | // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ | 944 | // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ |
| 936 | unsigned summ = (UInt16)see->Summ; // & 0xFFFF | 945 | const unsigned summ = (UInt16)see->Summ; // & 0xFFFF |
| 937 | unsigned r = (summ >> see->Shift); | 946 | const unsigned r = (summ >> see->Shift); |
| 938 | see->Summ = (UInt16)(summ - r); | 947 | see->Summ = (UInt16)(summ - r); |
| 939 | *escFreq = r + (r == 0); | 948 | *escFreq = (UInt32)(r + (r == 0)); |
| 940 | } | 949 | } |
| 941 | } | 950 | } |
| 942 | else | 951 | else |
| @@ -981,9 +990,9 @@ void Ppmd7_Update1_0(CPpmd7 *p) | |||
| 981 | CPpmd_State *s = p->FoundState; | 990 | CPpmd_State *s = p->FoundState; |
| 982 | CPpmd7_Context *mc = p->MinContext; | 991 | CPpmd7_Context *mc = p->MinContext; |
| 983 | unsigned freq = s->Freq; | 992 | unsigned freq = s->Freq; |
| 984 | unsigned summFreq = mc->Union2.SummFreq; | 993 | const unsigned summFreq = mc->Union2.SummFreq; |
| 985 | p->PrevSuccess = (2 * freq > summFreq); | 994 | p->PrevSuccess = (2 * freq > summFreq); |
| 986 | p->RunLength += (int)p->PrevSuccess; | 995 | p->RunLength += (Int32)p->PrevSuccess; |
| 987 | mc->Union2.SummFreq = (UInt16)(summFreq + 4); | 996 | mc->Union2.SummFreq = (UInt16)(summFreq + 4); |
| 988 | freq += 4; | 997 | freq += 4; |
| 989 | s->Freq = (Byte)freq; | 998 | s->Freq = (Byte)freq; |
diff --git a/C/Ppmd7Dec.c b/C/Ppmd7Dec.c index 8323828..081ab89 100644 --- a/C/Ppmd7Dec.c +++ b/C/Ppmd7Dec.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Ppmd7Dec.c -- Ppmd7z (PPMdH with 7z Range Coder) Decoder | 1 | /* Ppmd7Dec.c -- Ppmd7z (PPMdH with 7z Range Coder) Decoder |
| 2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
| 3 | This code is based on: | 3 | This code is based on: |
| 4 | PPMd var.H (2001): Dmitry Shkarin : Public domain */ | 4 | PPMd var.H (2001): Dmitry Shkarin : Public domain */ |
| 5 | 5 | ||
| @@ -58,7 +58,7 @@ static void Ppmd7z_RD_Decode(CPpmd7 *p, UInt32 start, UInt32 size) | |||
| 58 | #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) | 58 | #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) |
| 59 | void Ppmd7_UpdateModel(CPpmd7 *p); | 59 | void Ppmd7_UpdateModel(CPpmd7 *p); |
| 60 | 60 | ||
| 61 | #define MASK(sym) ((unsigned char *)charMask)[sym] | 61 | #define MASK(sym) ((Byte *)charMask)[sym] |
| 62 | // Z7_FORCE_INLINE | 62 | // Z7_FORCE_INLINE |
| 63 | // static | 63 | // static |
| 64 | int Ppmd7z_DecodeSymbol(CPpmd7 *p) | 64 | int Ppmd7z_DecodeSymbol(CPpmd7 *p) |
| @@ -120,8 +120,8 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p) | |||
| 120 | MASK(s->Symbol) = 0; | 120 | MASK(s->Symbol) = 0; |
| 121 | do | 121 | do |
| 122 | { | 122 | { |
| 123 | unsigned sym0 = s2[0].Symbol; | 123 | const unsigned sym0 = s2[0].Symbol; |
| 124 | unsigned sym1 = s2[1].Symbol; | 124 | const unsigned sym1 = s2[1].Symbol; |
| 125 | s2 += 2; | 125 | s2 += 2; |
| 126 | MASK(sym0) = 0; | 126 | MASK(sym0) = 0; |
| 127 | MASK(sym1) = 0; | 127 | MASK(sym1) = 0; |
| @@ -209,17 +209,17 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p) | |||
| 209 | unsigned num2 = num / 2; | 209 | unsigned num2 = num / 2; |
| 210 | 210 | ||
| 211 | num &= 1; | 211 | num &= 1; |
| 212 | hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); | 212 | hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num); |
| 213 | s += num; | 213 | s += num; |
| 214 | p->MinContext = mc; | 214 | p->MinContext = mc; |
| 215 | 215 | ||
| 216 | do | 216 | do |
| 217 | { | 217 | { |
| 218 | unsigned sym0 = s[0].Symbol; | 218 | const unsigned sym0 = s[0].Symbol; |
| 219 | unsigned sym1 = s[1].Symbol; | 219 | const unsigned sym1 = s[1].Symbol; |
| 220 | s += 2; | 220 | s += 2; |
| 221 | hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); | 221 | hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0))); |
| 222 | hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); | 222 | hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1))); |
| 223 | } | 223 | } |
| 224 | while (--num2); | 224 | while (--num2); |
| 225 | } | 225 | } |
| @@ -238,13 +238,13 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p) | |||
| 238 | 238 | ||
| 239 | s = Ppmd7_GetStats(p, p->MinContext); | 239 | s = Ppmd7_GetStats(p, p->MinContext); |
| 240 | hiCnt = count; | 240 | hiCnt = count; |
| 241 | // count -= s->Freq & (unsigned)(MASK(s->Symbol)); | 241 | // count -= s->Freq & (UInt32)(MASK(s->Symbol)); |
| 242 | // if ((Int32)count >= 0) | 242 | // if ((Int32)count >= 0) |
| 243 | { | 243 | { |
| 244 | for (;;) | 244 | for (;;) |
| 245 | { | 245 | { |
| 246 | count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; | 246 | count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; |
| 247 | // count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; | 247 | // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; |
| 248 | } | 248 | } |
| 249 | } | 249 | } |
| 250 | s--; | 250 | s--; |
diff --git a/C/Ppmd7Enc.c b/C/Ppmd7Enc.c index 41106ba..49cbbe6 100644 --- a/C/Ppmd7Enc.c +++ b/C/Ppmd7Enc.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Ppmd7Enc.c -- Ppmd7z (PPMdH with 7z Range Coder) Encoder | 1 | /* Ppmd7Enc.c -- Ppmd7z (PPMdH with 7z Range Coder) Encoder |
| 2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
| 3 | This code is based on: | 3 | This code is based on: |
| 4 | PPMd var.H (2001): Dmitry Shkarin : Public domain */ | 4 | PPMd var.H (2001): Dmitry Shkarin : Public domain */ |
| 5 | 5 | ||
| @@ -82,7 +82,7 @@ void Ppmd7z_Flush_RangeEnc(CPpmd7 *p) | |||
| 82 | 82 | ||
| 83 | void Ppmd7_UpdateModel(CPpmd7 *p); | 83 | void Ppmd7_UpdateModel(CPpmd7 *p); |
| 84 | 84 | ||
| 85 | #define MASK(sym) ((unsigned char *)charMask)[sym] | 85 | #define MASK(sym) ((Byte *)charMask)[sym] |
| 86 | 86 | ||
| 87 | Z7_FORCE_INLINE | 87 | Z7_FORCE_INLINE |
| 88 | static | 88 | static |
| @@ -139,8 +139,8 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol) | |||
| 139 | MASK(s->Symbol) = 0; | 139 | MASK(s->Symbol) = 0; |
| 140 | do | 140 | do |
| 141 | { | 141 | { |
| 142 | unsigned sym0 = s2[0].Symbol; | 142 | const unsigned sym0 = s2[0].Symbol; |
| 143 | unsigned sym1 = s2[1].Symbol; | 143 | const unsigned sym1 = s2[1].Symbol; |
| 144 | s2 += 2; | 144 | s2 += 2; |
| 145 | MASK(sym0) = 0; | 145 | MASK(sym0) = 0; |
| 146 | MASK(sym1) = 0; | 146 | MASK(sym1) = 0; |
| @@ -265,16 +265,15 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol) | |||
| 265 | if (num2 != 0) | 265 | if (num2 != 0) |
| 266 | { | 266 | { |
| 267 | s += i; | 267 | s += i; |
| 268 | for (;;) | 268 | do |
| 269 | { | 269 | { |
| 270 | unsigned sym0 = s[0].Symbol; | 270 | const unsigned sym0 = s[0].Symbol; |
| 271 | unsigned sym1 = s[1].Symbol; | 271 | const unsigned sym1 = s[1].Symbol; |
| 272 | s += 2; | 272 | s += 2; |
| 273 | sum += (s[-2].Freq & (unsigned)(MASK(sym0))); | 273 | sum += (s[-2].Freq & (unsigned)(MASK(sym0))); |
| 274 | sum += (s[-1].Freq & (unsigned)(MASK(sym1))); | 274 | sum += (s[-1].Freq & (unsigned)(MASK(sym1))); |
| 275 | if (--num2 == 0) | ||
| 276 | break; | ||
| 277 | } | 275 | } |
| 276 | while (--num2); | ||
| 278 | } | 277 | } |
| 279 | 278 | ||
| 280 | 279 | ||
diff --git a/C/Ppmd7aDec.c b/C/Ppmd7aDec.c index 55e164e..ef86dde 100644 --- a/C/Ppmd7aDec.c +++ b/C/Ppmd7aDec.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Ppmd7aDec.c -- PPMd7a (PPMdH) Decoder | 1 | /* Ppmd7aDec.c -- PPMd7a (PPMdH) Decoder |
| 2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
| 3 | This code is based on: | 3 | This code is based on: |
| 4 | PPMd var.H (2001): Dmitry Shkarin : Public domain | 4 | PPMd var.H (2001): Dmitry Shkarin : Public domain |
| 5 | Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ | 5 | Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ |
| @@ -58,7 +58,7 @@ typedef CPpmd7_Context * CTX_PTR; | |||
| 58 | #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) | 58 | #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) |
| 59 | void Ppmd7_UpdateModel(CPpmd7 *p); | 59 | void Ppmd7_UpdateModel(CPpmd7 *p); |
| 60 | 60 | ||
| 61 | #define MASK(sym) ((unsigned char *)charMask)[sym] | 61 | #define MASK(sym) ((Byte *)charMask)[sym] |
| 62 | 62 | ||
| 63 | 63 | ||
| 64 | int Ppmd7a_DecodeSymbol(CPpmd7 *p) | 64 | int Ppmd7a_DecodeSymbol(CPpmd7 *p) |
| @@ -120,8 +120,8 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p) | |||
| 120 | MASK(s->Symbol) = 0; | 120 | MASK(s->Symbol) = 0; |
| 121 | do | 121 | do |
| 122 | { | 122 | { |
| 123 | unsigned sym0 = s2[0].Symbol; | 123 | const unsigned sym0 = s2[0].Symbol; |
| 124 | unsigned sym1 = s2[1].Symbol; | 124 | const unsigned sym1 = s2[1].Symbol; |
| 125 | s2 += 2; | 125 | s2 += 2; |
| 126 | MASK(sym0) = 0; | 126 | MASK(sym0) = 0; |
| 127 | MASK(sym1) = 0; | 127 | MASK(sym1) = 0; |
| @@ -209,17 +209,17 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p) | |||
| 209 | unsigned num2 = num / 2; | 209 | unsigned num2 = num / 2; |
| 210 | 210 | ||
| 211 | num &= 1; | 211 | num &= 1; |
| 212 | hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); | 212 | hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num); |
| 213 | s += num; | 213 | s += num; |
| 214 | p->MinContext = mc; | 214 | p->MinContext = mc; |
| 215 | 215 | ||
| 216 | do | 216 | do |
| 217 | { | 217 | { |
| 218 | unsigned sym0 = s[0].Symbol; | 218 | const unsigned sym0 = s[0].Symbol; |
| 219 | unsigned sym1 = s[1].Symbol; | 219 | const unsigned sym1 = s[1].Symbol; |
| 220 | s += 2; | 220 | s += 2; |
| 221 | hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); | 221 | hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0))); |
| 222 | hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); | 222 | hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1))); |
| 223 | } | 223 | } |
| 224 | while (--num2); | 224 | while (--num2); |
| 225 | } | 225 | } |
| @@ -238,13 +238,13 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p) | |||
| 238 | 238 | ||
| 239 | s = Ppmd7_GetStats(p, p->MinContext); | 239 | s = Ppmd7_GetStats(p, p->MinContext); |
| 240 | hiCnt = count; | 240 | hiCnt = count; |
| 241 | // count -= s->Freq & (unsigned)(MASK(s->Symbol)); | 241 | // count -= s->Freq & (UInt32)(MASK(s->Symbol)); |
| 242 | // if ((Int32)count >= 0) | 242 | // if ((Int32)count >= 0) |
| 243 | { | 243 | { |
| 244 | for (;;) | 244 | for (;;) |
| 245 | { | 245 | { |
| 246 | count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; | 246 | count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; |
| 247 | // count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; | 247 | // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; |
| 248 | } | 248 | } |
| 249 | } | 249 | } |
| 250 | s--; | 250 | s--; |
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Ppmd8.c -- PPMdI codec | 1 | /* Ppmd8.c -- PPMdI codec |
| 2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
| 3 | This code is based on PPMd var.I (2002): Dmitry Shkarin : Public domain */ | 3 | This code is based on PPMd var.I (2002): Dmitry Shkarin : Public domain */ |
| 4 | 4 | ||
| 5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
| @@ -302,8 +302,17 @@ static void *Ppmd8_AllocUnits(CPpmd8 *p, unsigned indx) | |||
| 302 | 302 | ||
| 303 | 303 | ||
| 304 | #define MEM_12_CPY(dest, src, num) \ | 304 | #define MEM_12_CPY(dest, src, num) \ |
| 305 | { UInt32 *d = (UInt32 *)dest; const UInt32 *z = (const UInt32 *)src; UInt32 n = num; \ | 305 | { UInt32 *d = (UInt32 *)(dest); \ |
| 306 | do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); } | 306 | const UInt32 *z = (const UInt32 *)(src); \ |
| 307 | unsigned n = (num); \ | ||
| 308 | do { \ | ||
| 309 | d[0] = z[0]; \ | ||
| 310 | d[1] = z[1]; \ | ||
| 311 | d[2] = z[2]; \ | ||
| 312 | z += 3; \ | ||
| 313 | d += 3; \ | ||
| 314 | } while (--n); \ | ||
| 315 | } | ||
| 307 | 316 | ||
| 308 | 317 | ||
| 309 | 318 | ||
| @@ -1215,8 +1224,8 @@ void Ppmd8_UpdateModel(CPpmd8 *p) | |||
| 1215 | if ((ns1 & 1) != 0) | 1224 | if ((ns1 & 1) != 0) |
| 1216 | { | 1225 | { |
| 1217 | /* Expand for one UNIT */ | 1226 | /* Expand for one UNIT */ |
| 1218 | unsigned oldNU = (ns1 + 1) >> 1; | 1227 | const unsigned oldNU = (ns1 + 1) >> 1; |
| 1219 | unsigned i = U2I(oldNU); | 1228 | const unsigned i = U2I(oldNU); |
| 1220 | if (i != U2I((size_t)oldNU + 1)) | 1229 | if (i != U2I((size_t)oldNU + 1)) |
| 1221 | { | 1230 | { |
| 1222 | void *ptr = Ppmd8_AllocUnits(p, i + 1); | 1231 | void *ptr = Ppmd8_AllocUnits(p, i + 1); |
| @@ -1235,7 +1244,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p) | |||
| 1235 | sum = c->Union2.SummFreq; | 1244 | sum = c->Union2.SummFreq; |
| 1236 | /* max increase of Escape_Freq is 1 here. | 1245 | /* max increase of Escape_Freq is 1 here. |
| 1237 | an average increase is 1/3 per symbol */ | 1246 | an average increase is 1/3 per symbol */ |
| 1238 | sum += (3 * ns1 + 1 < ns); | 1247 | sum += (UInt32)(unsigned)(3 * ns1 + 1 < ns); |
| 1239 | /* original PPMdH uses 16-bit variable for (sum) here. | 1248 | /* original PPMdH uses 16-bit variable for (sum) here. |
| 1240 | But (sum < ???). Do we need to truncate (sum) to 16-bit */ | 1249 | But (sum < ???). Do we need to truncate (sum) to 16-bit */ |
| 1241 | // sum = (UInt16)sum; | 1250 | // sum = (UInt16)sum; |
| @@ -1265,7 +1274,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p) | |||
| 1265 | 1274 | ||
| 1266 | s->Freq = (Byte)freq; | 1275 | s->Freq = (Byte)freq; |
| 1267 | 1276 | ||
| 1268 | sum = freq + p->InitEsc + (ns > 2); // Ppmd8 (> 2) | 1277 | sum = (UInt32)(freq + p->InitEsc + (ns > 2)); // Ppmd8 (> 2) |
| 1269 | } | 1278 | } |
| 1270 | } | 1279 | } |
| 1271 | 1280 | ||
| @@ -1437,10 +1446,10 @@ CPpmd_See *Ppmd8_MakeEscFreq(CPpmd8 *p, unsigned numMasked1, UInt32 *escFreq) | |||
| 1437 | 1446 | ||
| 1438 | { | 1447 | { |
| 1439 | // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ | 1448 | // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ |
| 1440 | unsigned summ = (UInt16)see->Summ; // & 0xFFFF | 1449 | const unsigned summ = (UInt16)see->Summ; // & 0xFFFF |
| 1441 | unsigned r = (summ >> see->Shift); | 1450 | const unsigned r = (summ >> see->Shift); |
| 1442 | see->Summ = (UInt16)(summ - r); | 1451 | see->Summ = (UInt16)(summ - r); |
| 1443 | *escFreq = r + (r == 0); | 1452 | *escFreq = (UInt32)(r + (r == 0)); |
| 1444 | } | 1453 | } |
| 1445 | } | 1454 | } |
| 1446 | else | 1455 | else |
| @@ -1485,9 +1494,9 @@ void Ppmd8_Update1_0(CPpmd8 *p) | |||
| 1485 | CPpmd_State *s = p->FoundState; | 1494 | CPpmd_State *s = p->FoundState; |
| 1486 | CPpmd8_Context *mc = p->MinContext; | 1495 | CPpmd8_Context *mc = p->MinContext; |
| 1487 | unsigned freq = s->Freq; | 1496 | unsigned freq = s->Freq; |
| 1488 | unsigned summFreq = mc->Union2.SummFreq; | 1497 | const unsigned summFreq = mc->Union2.SummFreq; |
| 1489 | p->PrevSuccess = (2 * freq >= summFreq); // Ppmd8 (>=) | 1498 | p->PrevSuccess = (2 * freq >= summFreq); // Ppmd8 (>=) |
| 1490 | p->RunLength += (int)p->PrevSuccess; | 1499 | p->RunLength += (Int32)p->PrevSuccess; |
| 1491 | mc->Union2.SummFreq = (UInt16)(summFreq + 4); | 1500 | mc->Union2.SummFreq = (UInt16)(summFreq + 4); |
| 1492 | freq += 4; | 1501 | freq += 4; |
| 1493 | s->Freq = (Byte)freq; | 1502 | s->Freq = (Byte)freq; |
diff --git a/C/Ppmd8Dec.c b/C/Ppmd8Dec.c index 72d3626..ff91167 100644 --- a/C/Ppmd8Dec.c +++ b/C/Ppmd8Dec.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Ppmd8Dec.c -- Ppmd8 (PPMdI) Decoder | 1 | /* Ppmd8Dec.c -- Ppmd8 (PPMdI) Decoder |
| 2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
| 3 | This code is based on: | 3 | This code is based on: |
| 4 | PPMd var.I (2002): Dmitry Shkarin : Public domain | 4 | PPMd var.I (2002): Dmitry Shkarin : Public domain |
| 5 | Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ | 5 | Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ |
| @@ -58,7 +58,7 @@ static void Ppmd8_RD_Decode(CPpmd8 *p, UInt32 start, UInt32 size) | |||
| 58 | #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) | 58 | #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) |
| 59 | void Ppmd8_UpdateModel(CPpmd8 *p); | 59 | void Ppmd8_UpdateModel(CPpmd8 *p); |
| 60 | 60 | ||
| 61 | #define MASK(sym) ((unsigned char *)charMask)[sym] | 61 | #define MASK(sym) ((Byte *)charMask)[sym] |
| 62 | 62 | ||
| 63 | 63 | ||
| 64 | int Ppmd8_DecodeSymbol(CPpmd8 *p) | 64 | int Ppmd8_DecodeSymbol(CPpmd8 *p) |
| @@ -120,8 +120,8 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p) | |||
| 120 | MASK(s->Symbol) = 0; | 120 | MASK(s->Symbol) = 0; |
| 121 | do | 121 | do |
| 122 | { | 122 | { |
| 123 | unsigned sym0 = s2[0].Symbol; | 123 | const unsigned sym0 = s2[0].Symbol; |
| 124 | unsigned sym1 = s2[1].Symbol; | 124 | const unsigned sym1 = s2[1].Symbol; |
| 125 | s2 += 2; | 125 | s2 += 2; |
| 126 | MASK(sym0) = 0; | 126 | MASK(sym0) = 0; |
| 127 | MASK(sym1) = 0; | 127 | MASK(sym1) = 0; |
| @@ -209,17 +209,17 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p) | |||
| 209 | unsigned num2 = num / 2; | 209 | unsigned num2 = num / 2; |
| 210 | 210 | ||
| 211 | num &= 1; | 211 | num &= 1; |
| 212 | hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); | 212 | hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num); |
| 213 | s += num; | 213 | s += num; |
| 214 | p->MinContext = mc; | 214 | p->MinContext = mc; |
| 215 | 215 | ||
| 216 | do | 216 | do |
| 217 | { | 217 | { |
| 218 | unsigned sym0 = s[0].Symbol; | 218 | const unsigned sym0 = s[0].Symbol; |
| 219 | unsigned sym1 = s[1].Symbol; | 219 | const unsigned sym1 = s[1].Symbol; |
| 220 | s += 2; | 220 | s += 2; |
| 221 | hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); | 221 | hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0))); |
| 222 | hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); | 222 | hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1))); |
| 223 | } | 223 | } |
| 224 | while (--num2); | 224 | while (--num2); |
| 225 | } | 225 | } |
| @@ -243,8 +243,8 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p) | |||
| 243 | { | 243 | { |
| 244 | for (;;) | 244 | for (;;) |
| 245 | { | 245 | { |
| 246 | count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; | 246 | count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; |
| 247 | // count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; | 247 | // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; |
| 248 | } | 248 | } |
| 249 | } | 249 | } |
| 250 | s--; | 250 | s--; |
diff --git a/C/Ppmd8Enc.c b/C/Ppmd8Enc.c index 9e29ef7..b0e34c4 100644 --- a/C/Ppmd8Enc.c +++ b/C/Ppmd8Enc.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Ppmd8Enc.c -- Ppmd8 (PPMdI) Encoder | 1 | /* Ppmd8Enc.c -- Ppmd8 (PPMdI) Encoder |
| 2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2023-09-07 : Igor Pavlov : Public domain |
| 3 | This code is based on: | 3 | This code is based on: |
| 4 | PPMd var.I (2002): Dmitry Shkarin : Public domain | 4 | PPMd var.I (2002): Dmitry Shkarin : Public domain |
| 5 | Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ | 5 | Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ |
| @@ -82,7 +82,7 @@ static void Ppmd8_RangeEnc_Encode(CPpmd8 *p, UInt32 start, UInt32 size, UInt32 t | |||
| 82 | 82 | ||
| 83 | void Ppmd8_UpdateModel(CPpmd8 *p); | 83 | void Ppmd8_UpdateModel(CPpmd8 *p); |
| 84 | 84 | ||
| 85 | #define MASK(sym) ((unsigned char *)charMask)[sym] | 85 | #define MASK(sym) ((Byte *)charMask)[sym] |
| 86 | 86 | ||
| 87 | // Z7_FORCE_INLINE | 87 | // Z7_FORCE_INLINE |
| 88 | // static | 88 | // static |
| @@ -139,8 +139,8 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol) | |||
| 139 | MASK(s->Symbol) = 0; | 139 | MASK(s->Symbol) = 0; |
| 140 | do | 140 | do |
| 141 | { | 141 | { |
| 142 | unsigned sym0 = s2[0].Symbol; | 142 | const unsigned sym0 = s2[0].Symbol; |
| 143 | unsigned sym1 = s2[1].Symbol; | 143 | const unsigned sym1 = s2[1].Symbol; |
| 144 | s2 += 2; | 144 | s2 += 2; |
| 145 | MASK(sym0) = 0; | 145 | MASK(sym0) = 0; |
| 146 | MASK(sym1) = 0; | 146 | MASK(sym1) = 0; |
| @@ -265,16 +265,15 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol) | |||
| 265 | if (num2 != 0) | 265 | if (num2 != 0) |
| 266 | { | 266 | { |
| 267 | s += i; | 267 | s += i; |
| 268 | for (;;) | 268 | do |
| 269 | { | 269 | { |
| 270 | unsigned sym0 = s[0].Symbol; | 270 | const unsigned sym0 = s[0].Symbol; |
| 271 | unsigned sym1 = s[1].Symbol; | 271 | const unsigned sym1 = s[1].Symbol; |
| 272 | s += 2; | 272 | s += 2; |
| 273 | sum += (s[-2].Freq & (unsigned)(MASK(sym0))); | 273 | sum += (s[-2].Freq & (unsigned)(MASK(sym0))); |
| 274 | sum += (s[-1].Freq & (unsigned)(MASK(sym1))); | 274 | sum += (s[-1].Freq & (unsigned)(MASK(sym1))); |
| 275 | if (--num2 == 0) | ||
| 276 | break; | ||
| 277 | } | 275 | } |
| 276 | while (--num2); | ||
| 278 | } | 277 | } |
| 279 | 278 | ||
| 280 | PPMD8_CORRECT_SUM_RANGE(p, sum) | 279 | PPMD8_CORRECT_SUM_RANGE(p, sum) |
diff --git a/C/Precomp.h b/C/Precomp.h index 69afb2f..7747fdd 100644 --- a/C/Precomp.h +++ b/C/Precomp.h | |||
| @@ -1,10 +1,127 @@ | |||
| 1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- precompilation file |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-25 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | #ifndef ZIP7_INC_PRECOMP_H |
| 5 | #define ZIP7_INC_PRECOMP_H | 5 | #define ZIP7_INC_PRECOMP_H |
| 6 | 6 | ||
| 7 | /* | ||
| 8 | this file must be included before another *.h files and before <windows.h>. | ||
| 9 | this file is included from the following files: | ||
| 10 | C\*.c | ||
| 11 | C\Util\*\Precomp.h <- C\Util\*\*.c | ||
| 12 | CPP\Common\Common.h <- *\StdAfx.h <- *\*.cpp | ||
| 13 | |||
| 14 | this file can set the following macros: | ||
| 15 | Z7_LARGE_PAGES 1 | ||
| 16 | Z7_LONG_PATH 1 | ||
| 17 | Z7_WIN32_WINNT_MIN 0x0500 (or higher) : we require at least win2000+ for 7-Zip | ||
| 18 | _WIN32_WINNT 0x0500 (or higher) | ||
| 19 | WINVER _WIN32_WINNT | ||
| 20 | UNICODE 1 | ||
| 21 | _UNICODE 1 | ||
| 22 | */ | ||
| 23 | |||
| 7 | #include "Compiler.h" | 24 | #include "Compiler.h" |
| 8 | /* #include "7zTypes.h" */ | 25 | |
| 26 | #ifdef _MSC_VER | ||
| 27 | // #pragma warning(disable : 4206) // nonstandard extension used : translation unit is empty | ||
| 28 | #if _MSC_VER >= 1912 | ||
| 29 | // #pragma warning(disable : 5039) // pointer or reference to potentially throwing function passed to 'extern "C"' function under - EHc.Undefined behavior may occur if this function throws an exception. | ||
| 30 | #endif | ||
| 31 | #endif | ||
| 32 | |||
| 33 | /* | ||
| 34 | // for debug: | ||
| 35 | #define UNICODE 1 | ||
| 36 | #define _UNICODE 1 | ||
| 37 | #define _WIN32_WINNT 0x0500 // win2000 | ||
| 38 | #ifndef WINVER | ||
| 39 | #define WINVER _WIN32_WINNT | ||
| 40 | #endif | ||
| 41 | */ | ||
| 42 | |||
| 43 | #ifdef _WIN32 | ||
| 44 | /* | ||
| 45 | this "Precomp.h" file must be included before <windows.h>, | ||
| 46 | if we want to define _WIN32_WINNT before <windows.h>. | ||
| 47 | */ | ||
| 48 | |||
| 49 | #ifndef Z7_LARGE_PAGES | ||
| 50 | #ifndef Z7_NO_LARGE_PAGES | ||
| 51 | #define Z7_LARGE_PAGES 1 | ||
| 52 | #endif | ||
| 53 | #endif | ||
| 54 | |||
| 55 | #ifndef Z7_LONG_PATH | ||
| 56 | #ifndef Z7_NO_LONG_PATH | ||
| 57 | #define Z7_LONG_PATH 1 | ||
| 58 | #endif | ||
| 59 | #endif | ||
| 60 | |||
| 61 | #ifndef Z7_DEVICE_FILE | ||
| 62 | #ifndef Z7_NO_DEVICE_FILE | ||
| 63 | // #define Z7_DEVICE_FILE 1 | ||
| 64 | #endif | ||
| 65 | #endif | ||
| 66 | |||
| 67 | // we don't change macros if included after <windows.h> | ||
| 68 | #ifndef _WINDOWS_ | ||
| 69 | |||
| 70 | #ifndef Z7_WIN32_WINNT_MIN | ||
| 71 | #if defined(_M_ARM64) || defined(__aarch64__) | ||
| 72 | // #define Z7_WIN32_WINNT_MIN 0x0a00 // win10 | ||
| 73 | #define Z7_WIN32_WINNT_MIN 0x0600 // vista | ||
| 74 | #elif defined(_M_ARM) && defined(_M_ARMT) && defined(_M_ARM_NT) | ||
| 75 | // #define Z7_WIN32_WINNT_MIN 0x0602 // win8 | ||
| 76 | #define Z7_WIN32_WINNT_MIN 0x0600 // vista | ||
| 77 | #elif defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) || defined(_M_IA64) | ||
| 78 | #define Z7_WIN32_WINNT_MIN 0x0503 // win2003 | ||
| 79 | // #elif defined(_M_IX86) || defined(__i386__) | ||
| 80 | // #define Z7_WIN32_WINNT_MIN 0x0500 // win2000 | ||
| 81 | #else // x86 and another(old) systems | ||
| 82 | #define Z7_WIN32_WINNT_MIN 0x0500 // win2000 | ||
| 83 | // #define Z7_WIN32_WINNT_MIN 0x0502 // win2003 // for debug | ||
| 84 | #endif | ||
| 85 | #endif // Z7_WIN32_WINNT_MIN | ||
| 86 | |||
| 87 | |||
| 88 | #ifndef Z7_DO_NOT_DEFINE_WIN32_WINNT | ||
| 89 | #ifdef _WIN32_WINNT | ||
| 90 | // #error Stop_Compiling_Bad_WIN32_WINNT | ||
| 91 | #else | ||
| 92 | #ifndef Z7_NO_DEFINE_WIN32_WINNT | ||
| 93 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 94 | #define _WIN32_WINNT Z7_WIN32_WINNT_MIN | ||
| 95 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 96 | #endif | ||
| 97 | #endif // _WIN32_WINNT | ||
| 98 | |||
| 99 | #ifndef WINVER | ||
| 100 | #define WINVER _WIN32_WINNT | ||
| 101 | #endif | ||
| 102 | #endif // Z7_DO_NOT_DEFINE_WIN32_WINNT | ||
| 103 | |||
| 104 | |||
| 105 | #ifndef _MBCS | ||
| 106 | #ifndef Z7_NO_UNICODE | ||
| 107 | // UNICODE and _UNICODE are used by <windows.h> and by 7-zip code. | ||
| 108 | |||
| 109 | #ifndef UNICODE | ||
| 110 | #define UNICODE 1 | ||
| 111 | #endif | ||
| 112 | |||
| 113 | #ifndef _UNICODE | ||
| 114 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 115 | #define _UNICODE 1 | ||
| 116 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 117 | #endif | ||
| 118 | |||
| 119 | #endif // Z7_NO_UNICODE | ||
| 120 | #endif // _MBCS | ||
| 121 | #endif // _WINDOWS_ | ||
| 122 | |||
| 123 | // #include "7zWindows.h" | ||
| 124 | |||
| 125 | #endif // _WIN32 | ||
| 9 | 126 | ||
| 10 | #endif | 127 | #endif |
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Sha1.c -- SHA-1 Hash | 1 | /* Sha1.c -- SHA-1 Hash |
| 2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2024-03-01 : Igor Pavlov : Public domain |
| 3 | This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */ | 3 | This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */ |
| 4 | 4 | ||
| 5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
| @@ -15,35 +15,35 @@ This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ l | |||
| 15 | #endif | 15 | #endif |
| 16 | 16 | ||
| 17 | #ifdef MY_CPU_X86_OR_AMD64 | 17 | #ifdef MY_CPU_X86_OR_AMD64 |
| 18 | #ifdef _MSC_VER | 18 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ |
| 19 | #if _MSC_VER >= 1200 | 19 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ |
| 20 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ | ||
| 21 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) \ | ||
| 22 | || defined(_MSC_VER) && (_MSC_VER >= 1200) | ||
| 20 | #define Z7_COMPILER_SHA1_SUPPORTED | 23 | #define Z7_COMPILER_SHA1_SUPPORTED |
| 21 | #endif | ||
| 22 | #elif defined(__clang__) | ||
| 23 | #if (__clang_major__ >= 8) // fix that check | ||
| 24 | #define Z7_COMPILER_SHA1_SUPPORTED | ||
| 25 | #endif | ||
| 26 | #elif defined(__GNUC__) | ||
| 27 | #if (__GNUC__ >= 8) // fix that check | ||
| 28 | #define Z7_COMPILER_SHA1_SUPPORTED | ||
| 29 | #endif | ||
| 30 | #elif defined(__INTEL_COMPILER) | ||
| 31 | #if (__INTEL_COMPILER >= 1800) // fix that check | ||
| 32 | #define Z7_COMPILER_SHA1_SUPPORTED | ||
| 33 | #endif | ||
| 34 | #endif | 24 | #endif |
| 35 | #elif defined(MY_CPU_ARM_OR_ARM64) | 25 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) \ |
| 36 | #ifdef _MSC_VER | 26 | && (!defined(Z7_MSC_VER_ORIGINAL) || (_MSC_VER >= 1929) && (_MSC_FULL_VER >= 192930037)) |
| 37 | #if _MSC_VER >= 1910 && _MSC_VER >= 1929 && _MSC_FULL_VER >= 192930037 | 27 | #if defined(__ARM_FEATURE_SHA2) \ |
| 28 | || defined(__ARM_FEATURE_CRYPTO) | ||
| 29 | #define Z7_COMPILER_SHA1_SUPPORTED | ||
| 30 | #else | ||
| 31 | #if defined(MY_CPU_ARM64) \ | ||
| 32 | || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ | ||
| 33 | || defined(Z7_MSC_VER_ORIGINAL) | ||
| 34 | #if defined(__ARM_FP) && \ | ||
| 35 | ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
| 36 | || defined(__GNUC__) && (__GNUC__ >= 6) \ | ||
| 37 | ) \ | ||
| 38 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) | ||
| 39 | #if defined(MY_CPU_ARM64) \ | ||
| 40 | || !defined(Z7_CLANG_VERSION) \ | ||
| 41 | || defined(__ARM_NEON) && \ | ||
| 42 | (Z7_CLANG_VERSION < 170000 || \ | ||
| 43 | Z7_CLANG_VERSION > 170001) | ||
| 38 | #define Z7_COMPILER_SHA1_SUPPORTED | 44 | #define Z7_COMPILER_SHA1_SUPPORTED |
| 39 | #endif | 45 | #endif |
| 40 | #elif defined(__clang__) | ||
| 41 | #if (__clang_major__ >= 8) // fix that check | ||
| 42 | #define Z7_COMPILER_SHA1_SUPPORTED | ||
| 43 | #endif | 46 | #endif |
| 44 | #elif defined(__GNUC__) | ||
| 45 | #if (__GNUC__ >= 6) // fix that check | ||
| 46 | #define Z7_COMPILER_SHA1_SUPPORTED | ||
| 47 | #endif | 47 | #endif |
| 48 | #endif | 48 | #endif |
| 49 | #endif | 49 | #endif |
| @@ -436,7 +436,7 @@ void Sha1Prepare(void) | |||
| 436 | #endif | 436 | #endif |
| 437 | { | 437 | { |
| 438 | // printf("\n========== HW SHA1 ======== \n"); | 438 | // printf("\n========== HW SHA1 ======== \n"); |
| 439 | #if defined(MY_CPU_ARM_OR_ARM64) && defined(_MSC_VER) | 439 | #if 0 && defined(MY_CPU_ARM_OR_ARM64) && defined(_MSC_VER) |
| 440 | /* there was bug in MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037). | 440 | /* there was bug in MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037). |
| 441 | It generated incorrect SHA-1 code. | 441 | It generated incorrect SHA-1 code. |
| 442 | 21.03 : we test sha1-hardware code at runtime initialization */ | 442 | 21.03 : we test sha1-hardware code at runtime initialization */ |
diff --git a/C/Sha1Opt.c b/C/Sha1Opt.c index 27796aa..4e835f1 100644 --- a/C/Sha1Opt.c +++ b/C/Sha1Opt.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions | 1 | /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | #include "Compiler.h" | 5 | #include "Compiler.h" |
| @@ -11,6 +11,8 @@ | |||
| 11 | #endif | 11 | #endif |
| 12 | #endif | 12 | #endif |
| 13 | 13 | ||
| 14 | // #define Z7_USE_HW_SHA_STUB // for debug | ||
| 15 | |||
| 14 | #ifdef MY_CPU_X86_OR_AMD64 | 16 | #ifdef MY_CPU_X86_OR_AMD64 |
| 15 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check | 17 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check |
| 16 | #define USE_HW_SHA | 18 | #define USE_HW_SHA |
| @@ -32,9 +34,14 @@ | |||
| 32 | #endif | 34 | #endif |
| 33 | #if (_MSC_VER >= USE_VER_MIN) | 35 | #if (_MSC_VER >= USE_VER_MIN) |
| 34 | #define USE_HW_SHA | 36 | #define USE_HW_SHA |
| 37 | #else | ||
| 38 | #define Z7_USE_HW_SHA_STUB | ||
| 35 | #endif | 39 | #endif |
| 36 | #endif | 40 | #endif |
| 37 | // #endif // MY_CPU_X86_OR_AMD64 | 41 | // #endif // MY_CPU_X86_OR_AMD64 |
| 42 | #ifndef USE_HW_SHA | ||
| 43 | // #define Z7_USE_HW_SHA_STUB // for debug | ||
| 44 | #endif | ||
| 38 | 45 | ||
| 39 | #ifdef USE_HW_SHA | 46 | #ifdef USE_HW_SHA |
| 40 | 47 | ||
| @@ -202,46 +209,124 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
| 202 | 209 | ||
| 203 | #endif // USE_HW_SHA | 210 | #endif // USE_HW_SHA |
| 204 | 211 | ||
| 205 | #elif defined(MY_CPU_ARM_OR_ARM64) | 212 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) \ |
| 206 | 213 | && (!defined(Z7_MSC_VER_ORIGINAL) || (_MSC_VER >= 1929) && (_MSC_FULL_VER >= 192930037)) | |
| 207 | #if defined(__clang__) | 214 | #if defined(__ARM_FEATURE_SHA2) \ |
| 208 | #if (__clang_major__ >= 8) // fix that check | 215 | || defined(__ARM_FEATURE_CRYPTO) |
| 216 | #define USE_HW_SHA | ||
| 217 | #else | ||
| 218 | #if defined(MY_CPU_ARM64) \ | ||
| 219 | || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ | ||
| 220 | || defined(Z7_MSC_VER_ORIGINAL) | ||
| 221 | #if defined(__ARM_FP) && \ | ||
| 222 | ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
| 223 | || defined(__GNUC__) && (__GNUC__ >= 6) \ | ||
| 224 | ) \ | ||
| 225 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) | ||
| 226 | #if defined(MY_CPU_ARM64) \ | ||
| 227 | || !defined(Z7_CLANG_VERSION) \ | ||
| 228 | || defined(__ARM_NEON) && \ | ||
| 229 | (Z7_CLANG_VERSION < 170000 || \ | ||
| 230 | Z7_CLANG_VERSION > 170001) | ||
| 209 | #define USE_HW_SHA | 231 | #define USE_HW_SHA |
| 210 | #endif | 232 | #endif |
| 211 | #elif defined(__GNUC__) | ||
| 212 | #if (__GNUC__ >= 6) // fix that check | ||
| 213 | #define USE_HW_SHA | ||
| 214 | #endif | 233 | #endif |
| 215 | #elif defined(_MSC_VER) | ||
| 216 | #if _MSC_VER >= 1910 | ||
| 217 | #define USE_HW_SHA | ||
| 218 | #endif | 234 | #endif |
| 219 | #endif | 235 | #endif |
| 220 | 236 | ||
| 221 | #ifdef USE_HW_SHA | 237 | #ifdef USE_HW_SHA |
| 222 | 238 | ||
| 223 | // #pragma message("=== Sha1 HW === ") | 239 | // #pragma message("=== Sha1 HW === ") |
| 240 | // __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_SHA2 | ||
| 224 | 241 | ||
| 225 | #if defined(__clang__) || defined(__GNUC__) | 242 | #if defined(__clang__) || defined(__GNUC__) |
| 243 | #if !defined(__ARM_FEATURE_SHA2) && \ | ||
| 244 | !defined(__ARM_FEATURE_CRYPTO) | ||
| 226 | #ifdef MY_CPU_ARM64 | 245 | #ifdef MY_CPU_ARM64 |
| 246 | #if defined(__clang__) | ||
| 247 | #define ATTRIB_SHA __attribute__((__target__("crypto"))) | ||
| 248 | #else | ||
| 227 | #define ATTRIB_SHA __attribute__((__target__("+crypto"))) | 249 | #define ATTRIB_SHA __attribute__((__target__("+crypto"))) |
| 250 | #endif | ||
| 228 | #else | 251 | #else |
| 252 | #if defined(__clang__) && (__clang_major__ >= 1) | ||
| 253 | #define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2"))) | ||
| 254 | #else | ||
| 229 | #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | 255 | #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) |
| 256 | #endif | ||
| 230 | #endif | 257 | #endif |
| 258 | #endif | ||
| 231 | #else | 259 | #else |
| 232 | // _MSC_VER | 260 | // _MSC_VER |
| 233 | // for arm32 | 261 | // for arm32 |
| 234 | #define _ARM_USE_NEW_NEON_INTRINSICS | 262 | #define _ARM_USE_NEW_NEON_INTRINSICS |
| 235 | #endif | 263 | #endif |
| 236 | 264 | ||
| 237 | #if defined(_MSC_VER) && defined(MY_CPU_ARM64) | 265 | |
| 266 | |||
| 267 | |||
| 268 | |||
| 269 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) | ||
| 238 | #include <arm64_neon.h> | 270 | #include <arm64_neon.h> |
| 239 | #else | 271 | #else |
| 272 | |||
| 273 | |||
| 274 | |||
| 275 | |||
| 276 | |||
| 277 | |||
| 278 | |||
| 279 | |||
| 280 | |||
| 281 | #if defined(__clang__) && __clang_major__ < 16 | ||
| 282 | #if !defined(__ARM_FEATURE_SHA2) && \ | ||
| 283 | !defined(__ARM_FEATURE_CRYPTO) | ||
| 284 | // #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ") | ||
| 285 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 286 | #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1 | ||
| 287 | // #if defined(__clang__) && __clang_major__ < 13 | ||
| 288 | #define __ARM_FEATURE_CRYPTO 1 | ||
| 289 | // #else | ||
| 290 | #define __ARM_FEATURE_SHA2 1 | ||
| 291 | // #endif | ||
| 292 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 293 | #endif | ||
| 294 | #endif // clang | ||
| 295 | |||
| 296 | #if defined(__clang__) | ||
| 297 | |||
| 298 | #if defined(__ARM_ARCH) && __ARM_ARCH < 8 | ||
| 299 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 300 | // #pragma message("#define __ARM_ARCH 8") | ||
| 301 | #undef __ARM_ARCH | ||
| 302 | #define __ARM_ARCH 8 | ||
| 303 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 304 | #endif | ||
| 305 | |||
| 306 | #endif // clang | ||
| 307 | |||
| 240 | #include <arm_neon.h> | 308 | #include <arm_neon.h> |
| 309 | |||
| 310 | #if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \ | ||
| 311 | defined(__ARM_FEATURE_CRYPTO) && \ | ||
| 312 | defined(__ARM_FEATURE_SHA2) | ||
| 313 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 314 | #undef __ARM_FEATURE_CRYPTO | ||
| 315 | #undef __ARM_FEATURE_SHA2 | ||
| 316 | #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET | ||
| 317 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 318 | // #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") | ||
| 241 | #endif | 319 | #endif |
| 242 | 320 | ||
| 321 | #endif // Z7_MSC_VER_ORIGINAL | ||
| 322 | |||
| 243 | typedef uint32x4_t v128; | 323 | typedef uint32x4_t v128; |
| 244 | // typedef __n128 v128; // MSVC | 324 | // typedef __n128 v128; // MSVC |
| 325 | // the bug in clang 3.8.1: | ||
| 326 | // __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1); | ||
| 327 | #if defined(__clang__) && (__clang_major__ <= 9) | ||
| 328 | #pragma GCC diagnostic ignored "-Wvector-conversion" | ||
| 329 | #endif | ||
| 245 | 330 | ||
| 246 | #ifdef MY_CPU_BE | 331 | #ifdef MY_CPU_BE |
| 247 | #define MY_rev32_for_LE(x) | 332 | #define MY_rev32_for_LE(x) |
| @@ -256,11 +341,11 @@ typedef uint32x4_t v128; | |||
| 256 | m = LOAD_128((data + (k) * 16)); \ | 341 | m = LOAD_128((data + (k) * 16)); \ |
| 257 | MY_rev32_for_LE(m); \ | 342 | MY_rev32_for_LE(m); \ |
| 258 | 343 | ||
| 259 | #define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3); | 344 | #define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3) |
| 260 | #define SU1(dest, src) dest = vsha1su1q_u32(dest, src); | 345 | #define SU1(dest, src) dest = vsha1su1q_u32(dest, src) |
| 261 | #define C(e) abcd = vsha1cq_u32(abcd, e, t); | 346 | #define C(e) abcd = vsha1cq_u32(abcd, e, t) |
| 262 | #define P(e) abcd = vsha1pq_u32(abcd, e, t); | 347 | #define P(e) abcd = vsha1pq_u32(abcd, e, t) |
| 263 | #define M(e) abcd = vsha1mq_u32(abcd, e, t); | 348 | #define M(e) abcd = vsha1mq_u32(abcd, e, t) |
| 264 | #define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) | 349 | #define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) |
| 265 | #define T(m, c) t = vaddq_u32(m, c) | 350 | #define T(m, c) t = vaddq_u32(m, c) |
| 266 | 351 | ||
| @@ -337,16 +422,17 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t | |||
| 337 | #endif // MY_CPU_ARM_OR_ARM64 | 422 | #endif // MY_CPU_ARM_OR_ARM64 |
| 338 | 423 | ||
| 339 | 424 | ||
| 340 | #ifndef USE_HW_SHA | 425 | #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) |
| 341 | |||
| 342 | // #error Stop_Compiling_UNSUPPORTED_SHA | 426 | // #error Stop_Compiling_UNSUPPORTED_SHA |
| 343 | // #include <stdlib.h> | 427 | // #include <stdlib.h> |
| 344 | 428 | ||
| 345 | // #include "Sha1.h" | ||
| 346 | void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks); | ||
| 347 | 429 | ||
| 348 | #pragma message("Sha1 HW-SW stub was used") | ||
| 349 | 430 | ||
| 431 | // #include "Sha1.h" | ||
| 432 | // #if defined(_MSC_VER) | ||
| 433 | #pragma message("Sha1 HW-SW stub was used") | ||
| 434 | // #endif | ||
| 435 | void Z7_FASTCALL Sha1_UpdateBlocks (UInt32 state[5], const Byte *data, size_t numBlocks); | ||
| 350 | void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks); | 436 | void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks); |
| 351 | void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks) | 437 | void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks) |
| 352 | { | 438 | { |
| @@ -359,7 +445,6 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
| 359 | return; | 445 | return; |
| 360 | */ | 446 | */ |
| 361 | } | 447 | } |
| 362 | |||
| 363 | #endif | 448 | #endif |
| 364 | 449 | ||
| 365 | #undef SU0 | 450 | #undef SU0 |
| @@ -384,3 +469,4 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t | |||
| 384 | #undef USE_HW_SHA | 469 | #undef USE_HW_SHA |
| 385 | #undef ATTRIB_SHA | 470 | #undef ATTRIB_SHA |
| 386 | #undef USE_VER_MIN | 471 | #undef USE_VER_MIN |
| 472 | #undef Z7_USE_HW_SHA_STUB | ||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Sha256.c -- SHA-256 Hash | 1 | /* Sha256.c -- SHA-256 Hash |
| 2 | 2023-04-02 : Igor Pavlov : Public domain | 2 | 2024-03-01 : Igor Pavlov : Public domain |
| 3 | This code is based on public domain code from Wei Dai's Crypto++ library. */ | 3 | This code is based on public domain code from Wei Dai's Crypto++ library. */ |
| 4 | 4 | ||
| 5 | #include "Precomp.h" | 5 | #include "Precomp.h" |
| @@ -15,35 +15,35 @@ This code is based on public domain code from Wei Dai's Crypto++ library. */ | |||
| 15 | #endif | 15 | #endif |
| 16 | 16 | ||
| 17 | #ifdef MY_CPU_X86_OR_AMD64 | 17 | #ifdef MY_CPU_X86_OR_AMD64 |
| 18 | #ifdef _MSC_VER | 18 | #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ |
| 19 | #if _MSC_VER >= 1200 | 19 | || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ |
| 20 | || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ | ||
| 21 | || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) \ | ||
| 22 | || defined(_MSC_VER) && (_MSC_VER >= 1200) | ||
| 20 | #define Z7_COMPILER_SHA256_SUPPORTED | 23 | #define Z7_COMPILER_SHA256_SUPPORTED |
| 21 | #endif | ||
| 22 | #elif defined(__clang__) | ||
| 23 | #if (__clang_major__ >= 8) // fix that check | ||
| 24 | #define Z7_COMPILER_SHA256_SUPPORTED | ||
| 25 | #endif | ||
| 26 | #elif defined(__GNUC__) | ||
| 27 | #if (__GNUC__ >= 8) // fix that check | ||
| 28 | #define Z7_COMPILER_SHA256_SUPPORTED | ||
| 29 | #endif | ||
| 30 | #elif defined(__INTEL_COMPILER) | ||
| 31 | #if (__INTEL_COMPILER >= 1800) // fix that check | ||
| 32 | #define Z7_COMPILER_SHA256_SUPPORTED | ||
| 33 | #endif | ||
| 34 | #endif | 24 | #endif |
| 35 | #elif defined(MY_CPU_ARM_OR_ARM64) | 25 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) |
| 36 | #ifdef _MSC_VER | 26 | |
| 37 | #if _MSC_VER >= 1910 | 27 | #if defined(__ARM_FEATURE_SHA2) \ |
| 28 | || defined(__ARM_FEATURE_CRYPTO) | ||
| 29 | #define Z7_COMPILER_SHA256_SUPPORTED | ||
| 30 | #else | ||
| 31 | #if defined(MY_CPU_ARM64) \ | ||
| 32 | || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ | ||
| 33 | || defined(Z7_MSC_VER_ORIGINAL) | ||
| 34 | #if defined(__ARM_FP) && \ | ||
| 35 | ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
| 36 | || defined(__GNUC__) && (__GNUC__ >= 6) \ | ||
| 37 | ) \ | ||
| 38 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) | ||
| 39 | #if defined(MY_CPU_ARM64) \ | ||
| 40 | || !defined(Z7_CLANG_VERSION) \ | ||
| 41 | || defined(__ARM_NEON) && \ | ||
| 42 | (Z7_CLANG_VERSION < 170000 || \ | ||
| 43 | Z7_CLANG_VERSION > 170001) | ||
| 38 | #define Z7_COMPILER_SHA256_SUPPORTED | 44 | #define Z7_COMPILER_SHA256_SUPPORTED |
| 39 | #endif | 45 | #endif |
| 40 | #elif defined(__clang__) | ||
| 41 | #if (__clang_major__ >= 8) // fix that check | ||
| 42 | #define Z7_COMPILER_SHA256_SUPPORTED | ||
| 43 | #endif | 46 | #endif |
| 44 | #elif defined(__GNUC__) | ||
| 45 | #if (__GNUC__ >= 6) // fix that check | ||
| 46 | #define Z7_COMPILER_SHA256_SUPPORTED | ||
| 47 | #endif | 47 | #endif |
| 48 | #endif | 48 | #endif |
| 49 | #endif | 49 | #endif |
| @@ -224,8 +224,6 @@ void Sha256_Init(CSha256 *p) | |||
| 224 | 224 | ||
| 225 | #endif | 225 | #endif |
| 226 | 226 | ||
| 227 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); | ||
| 228 | |||
| 229 | // static | 227 | // static |
| 230 | extern MY_ALIGN(64) | 228 | extern MY_ALIGN(64) |
| 231 | const UInt32 SHA256_K_ARRAY[64]; | 229 | const UInt32 SHA256_K_ARRAY[64]; |
diff --git a/C/Sha256Opt.c b/C/Sha256Opt.c index e4465e3..eb38166 100644 --- a/C/Sha256Opt.c +++ b/C/Sha256Opt.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions | 1 | /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | #include "Compiler.h" | 5 | #include "Compiler.h" |
| @@ -11,6 +11,8 @@ | |||
| 11 | #endif | 11 | #endif |
| 12 | #endif | 12 | #endif |
| 13 | 13 | ||
| 14 | // #define Z7_USE_HW_SHA_STUB // for debug | ||
| 15 | |||
| 14 | #ifdef MY_CPU_X86_OR_AMD64 | 16 | #ifdef MY_CPU_X86_OR_AMD64 |
| 15 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check | 17 | #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check |
| 16 | #define USE_HW_SHA | 18 | #define USE_HW_SHA |
| @@ -32,9 +34,14 @@ | |||
| 32 | #endif | 34 | #endif |
| 33 | #if (_MSC_VER >= USE_VER_MIN) | 35 | #if (_MSC_VER >= USE_VER_MIN) |
| 34 | #define USE_HW_SHA | 36 | #define USE_HW_SHA |
| 37 | #else | ||
| 38 | #define Z7_USE_HW_SHA_STUB | ||
| 35 | #endif | 39 | #endif |
| 36 | #endif | 40 | #endif |
| 37 | // #endif // MY_CPU_X86_OR_AMD64 | 41 | // #endif // MY_CPU_X86_OR_AMD64 |
| 42 | #ifndef USE_HW_SHA | ||
| 43 | // #define Z7_USE_HW_SHA_STUB // for debug | ||
| 44 | #endif | ||
| 38 | 45 | ||
| 39 | #ifdef USE_HW_SHA | 46 | #ifdef USE_HW_SHA |
| 40 | 47 | ||
| @@ -202,19 +209,28 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
| 202 | 209 | ||
| 203 | #endif // USE_HW_SHA | 210 | #endif // USE_HW_SHA |
| 204 | 211 | ||
| 205 | #elif defined(MY_CPU_ARM_OR_ARM64) | 212 | #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) |
| 206 | 213 | ||
| 207 | #if defined(__clang__) | 214 | #if defined(__ARM_FEATURE_SHA2) \ |
| 208 | #if (__clang_major__ >= 8) // fix that check | 215 | || defined(__ARM_FEATURE_CRYPTO) |
| 216 | #define USE_HW_SHA | ||
| 217 | #else | ||
| 218 | #if defined(MY_CPU_ARM64) \ | ||
| 219 | || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ | ||
| 220 | || defined(Z7_MSC_VER_ORIGINAL) | ||
| 221 | #if defined(__ARM_FP) && \ | ||
| 222 | ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ | ||
| 223 | || defined(__GNUC__) && (__GNUC__ >= 6) \ | ||
| 224 | ) \ | ||
| 225 | || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) | ||
| 226 | #if defined(MY_CPU_ARM64) \ | ||
| 227 | || !defined(Z7_CLANG_VERSION) \ | ||
| 228 | || defined(__ARM_NEON) && \ | ||
| 229 | (Z7_CLANG_VERSION < 170000 || \ | ||
| 230 | Z7_CLANG_VERSION > 170001) | ||
| 209 | #define USE_HW_SHA | 231 | #define USE_HW_SHA |
| 210 | #endif | 232 | #endif |
| 211 | #elif defined(__GNUC__) | ||
| 212 | #if (__GNUC__ >= 6) // fix that check | ||
| 213 | #define USE_HW_SHA | ||
| 214 | #endif | 233 | #endif |
| 215 | #elif defined(_MSC_VER) | ||
| 216 | #if _MSC_VER >= 1910 | ||
| 217 | #define USE_HW_SHA | ||
| 218 | #endif | 234 | #endif |
| 219 | #endif | 235 | #endif |
| 220 | 236 | ||
| @@ -222,24 +238,88 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
| 222 | 238 | ||
| 223 | // #pragma message("=== Sha256 HW === ") | 239 | // #pragma message("=== Sha256 HW === ") |
| 224 | 240 | ||
| 241 | |||
| 225 | #if defined(__clang__) || defined(__GNUC__) | 242 | #if defined(__clang__) || defined(__GNUC__) |
| 243 | #if !defined(__ARM_FEATURE_SHA2) && \ | ||
| 244 | !defined(__ARM_FEATURE_CRYPTO) | ||
| 226 | #ifdef MY_CPU_ARM64 | 245 | #ifdef MY_CPU_ARM64 |
| 246 | #if defined(__clang__) | ||
| 247 | #define ATTRIB_SHA __attribute__((__target__("crypto"))) | ||
| 248 | #else | ||
| 227 | #define ATTRIB_SHA __attribute__((__target__("+crypto"))) | 249 | #define ATTRIB_SHA __attribute__((__target__("+crypto"))) |
| 250 | #endif | ||
| 228 | #else | 251 | #else |
| 252 | #if defined(__clang__) && (__clang_major__ >= 1) | ||
| 253 | #define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2"))) | ||
| 254 | #else | ||
| 229 | #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | 255 | #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) |
| 256 | #endif | ||
| 230 | #endif | 257 | #endif |
| 258 | #endif | ||
| 231 | #else | 259 | #else |
| 232 | // _MSC_VER | 260 | // _MSC_VER |
| 233 | // for arm32 | 261 | // for arm32 |
| 234 | #define _ARM_USE_NEW_NEON_INTRINSICS | 262 | #define _ARM_USE_NEW_NEON_INTRINSICS |
| 235 | #endif | 263 | #endif |
| 236 | 264 | ||
| 237 | #if defined(_MSC_VER) && defined(MY_CPU_ARM64) | 265 | |
| 266 | |||
| 267 | |||
| 268 | |||
| 269 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) | ||
| 238 | #include <arm64_neon.h> | 270 | #include <arm64_neon.h> |
| 239 | #else | 271 | #else |
| 272 | |||
| 273 | |||
| 274 | |||
| 275 | |||
| 276 | |||
| 277 | |||
| 278 | |||
| 279 | |||
| 280 | |||
| 281 | #if defined(__clang__) && __clang_major__ < 16 | ||
| 282 | #if !defined(__ARM_FEATURE_SHA2) && \ | ||
| 283 | !defined(__ARM_FEATURE_CRYPTO) | ||
| 284 | // #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ") | ||
| 285 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 286 | #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1 | ||
| 287 | // #if defined(__clang__) && __clang_major__ < 13 | ||
| 288 | #define __ARM_FEATURE_CRYPTO 1 | ||
| 289 | // #else | ||
| 290 | #define __ARM_FEATURE_SHA2 1 | ||
| 291 | // #endif | ||
| 292 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 293 | #endif | ||
| 294 | #endif // clang | ||
| 295 | |||
| 296 | #if defined(__clang__) | ||
| 297 | |||
| 298 | #if defined(__ARM_ARCH) && __ARM_ARCH < 8 | ||
| 299 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 300 | // #pragma message("#define __ARM_ARCH 8") | ||
| 301 | #undef __ARM_ARCH | ||
| 302 | #define __ARM_ARCH 8 | ||
| 303 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 304 | #endif | ||
| 305 | |||
| 306 | #endif // clang | ||
| 307 | |||
| 240 | #include <arm_neon.h> | 308 | #include <arm_neon.h> |
| 309 | |||
| 310 | #if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \ | ||
| 311 | defined(__ARM_FEATURE_CRYPTO) && \ | ||
| 312 | defined(__ARM_FEATURE_SHA2) | ||
| 313 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 314 | #undef __ARM_FEATURE_CRYPTO | ||
| 315 | #undef __ARM_FEATURE_SHA2 | ||
| 316 | #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET | ||
| 317 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 318 | // #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") | ||
| 241 | #endif | 319 | #endif |
| 242 | 320 | ||
| 321 | #endif // Z7_MSC_VER_ORIGINAL | ||
| 322 | |||
| 243 | typedef uint32x4_t v128; | 323 | typedef uint32x4_t v128; |
| 244 | // typedef __n128 v128; // MSVC | 324 | // typedef __n128 v128; // MSVC |
| 245 | 325 | ||
| @@ -316,10 +396,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
| 316 | LOAD_SHUFFLE (m2, 2) | 396 | LOAD_SHUFFLE (m2, 2) |
| 317 | LOAD_SHUFFLE (m3, 3) | 397 | LOAD_SHUFFLE (m3, 3) |
| 318 | 398 | ||
| 319 | R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ); | 399 | R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ) |
| 320 | R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); | 400 | R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) |
| 321 | R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); | 401 | R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) |
| 322 | R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ); | 402 | R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ) |
| 323 | 403 | ||
| 324 | state0 = vaddq_u32(state0, state0_save); | 404 | state0 = vaddq_u32(state0, state0_save); |
| 325 | state1 = vaddq_u32(state1, state1_save); | 405 | state1 = vaddq_u32(state1, state1_save); |
| @@ -337,16 +417,17 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
| 337 | #endif // MY_CPU_ARM_OR_ARM64 | 417 | #endif // MY_CPU_ARM_OR_ARM64 |
| 338 | 418 | ||
| 339 | 419 | ||
| 340 | #ifndef USE_HW_SHA | 420 | #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) |
| 341 | |||
| 342 | // #error Stop_Compiling_UNSUPPORTED_SHA | 421 | // #error Stop_Compiling_UNSUPPORTED_SHA |
| 343 | // #include <stdlib.h> | 422 | // #include <stdlib.h> |
| 344 | 423 | // We can compile this file with another C compiler, | |
| 424 | // or we can compile asm version. | ||
| 425 | // So we can generate real code instead of this stub function. | ||
| 345 | // #include "Sha256.h" | 426 | // #include "Sha256.h" |
| 346 | void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); | 427 | // #if defined(_MSC_VER) |
| 347 | |||
| 348 | #pragma message("Sha256 HW-SW stub was used") | 428 | #pragma message("Sha256 HW-SW stub was used") |
| 349 | 429 | // #endif | |
| 430 | void Z7_FASTCALL Sha256_UpdateBlocks (UInt32 state[8], const Byte *data, size_t numBlocks); | ||
| 350 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); | 431 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); |
| 351 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) | 432 | void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) |
| 352 | { | 433 | { |
| @@ -359,7 +440,6 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
| 359 | return; | 440 | return; |
| 360 | */ | 441 | */ |
| 361 | } | 442 | } |
| 362 | |||
| 363 | #endif | 443 | #endif |
| 364 | 444 | ||
| 365 | 445 | ||
| @@ -384,3 +464,4 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ | |||
| 384 | #undef USE_HW_SHA | 464 | #undef USE_HW_SHA |
| 385 | #undef ATTRIB_SHA | 465 | #undef ATTRIB_SHA |
| 386 | #undef USE_VER_MIN | 466 | #undef USE_VER_MIN |
| 467 | #undef Z7_USE_HW_SHA_STUB | ||
diff --git a/C/SwapBytes.c b/C/SwapBytes.c index 7901bba..9290592 100644 --- a/C/SwapBytes.c +++ b/C/SwapBytes.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* SwapBytes.c -- Byte Swap conversion filter | 1 | /* SwapBytes.c -- Byte Swap conversion filter |
| 2 | 2023-04-07 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -305,11 +305,12 @@ ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr) | |||
| 305 | msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) to vbroadcastf128(m) as we want | 305 | msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) to vbroadcastf128(m) as we want |
| 306 | */ | 306 | */ |
| 307 | // _mm256_broadcastsi128_si256(*mask128_ptr); | 307 | // _mm256_broadcastsi128_si256(*mask128_ptr); |
| 308 | /* | 308 | #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000) |
| 309 | #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1) | 309 | #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1) |
| 310 | MY_mm256_set_m128i | 310 | #else |
| 311 | */ | 311 | #define MY_mm256_set_m128i _mm256_set_m128i |
| 312 | _mm256_set_m128i( | 312 | #endif |
| 313 | MY_mm256_set_m128i( | ||
| 313 | *(const __m128i *)mask128_ptr, | 314 | *(const __m128i *)mask128_ptr, |
| 314 | *(const __m128i *)mask128_ptr); | 315 | *(const __m128i *)mask128_ptr); |
| 315 | #endif | 316 | #endif |
| @@ -330,32 +331,59 @@ ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr) | |||
| 330 | 331 | ||
| 331 | 332 | ||
| 332 | // compile message "NEON intrinsics not available with the soft-float ABI" | 333 | // compile message "NEON intrinsics not available with the soft-float ABI" |
| 333 | #elif defined(MY_CPU_ARM_OR_ARM64) || \ | 334 | #elif defined(MY_CPU_ARM_OR_ARM64) \ |
| 334 | (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) | 335 | && defined(MY_CPU_LE) \ |
| 335 | // #elif defined(MY_CPU_ARM64) | 336 | && !defined(Z7_DISABLE_ARM_NEON) |
| 336 | 337 | ||
| 337 | #if defined(__clang__) && (__clang_major__ >= 8) \ | 338 | #if defined(__clang__) && (__clang_major__ >= 8) \ |
| 338 | || defined(__GNUC__) && (__GNUC__ >= 8) | 339 | || defined(__GNUC__) && (__GNUC__ >= 6) |
| 339 | #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) \ | 340 | #if defined(__ARM_FP) |
| 341 | #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 4)) \ | ||
| 340 | || defined(MY_CPU_ARM64) | 342 | || defined(MY_CPU_ARM64) |
| 343 | #if defined(MY_CPU_ARM64) \ | ||
| 344 | || !defined(Z7_CLANG_VERSION) \ | ||
| 345 | || defined(__ARM_NEON) | ||
| 341 | #define USE_SWAP_128 | 346 | #define USE_SWAP_128 |
| 342 | #endif | ||
| 343 | #ifdef MY_CPU_ARM64 | 347 | #ifdef MY_CPU_ARM64 |
| 344 | // #define SWAP_ATTRIB_NEON __attribute__((__target__(""))) | 348 | // #define SWAP_ATTRIB_NEON __attribute__((__target__(""))) |
| 345 | #else | 349 | #else |
| 346 | // #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) | 350 | #if defined(Z7_CLANG_VERSION) |
| 347 | #endif | 351 | // #define SWAP_ATTRIB_NEON __attribute__((__target__("neon"))) |
| 352 | #else | ||
| 353 | // #pragma message("SWAP_ATTRIB_NEON __attribute__((__target__(fpu=neon))") | ||
| 354 | #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=neon"))) | ||
| 355 | #endif | ||
| 356 | #endif // MY_CPU_ARM64 | ||
| 357 | #endif // __ARM_NEON | ||
| 358 | #endif // __ARM_ARCH | ||
| 359 | #endif // __ARM_FP | ||
| 360 | |||
| 348 | #elif defined(_MSC_VER) | 361 | #elif defined(_MSC_VER) |
| 349 | #if (_MSC_VER >= 1910) | 362 | #if (_MSC_VER >= 1910) |
| 350 | #define USE_SWAP_128 | 363 | #define USE_SWAP_128 |
| 351 | #endif | 364 | #endif |
| 352 | #endif | 365 | #endif |
| 353 | 366 | ||
| 354 | #if defined(_MSC_VER) && defined(MY_CPU_ARM64) | 367 | #ifdef USE_SWAP_128 |
| 368 | #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) | ||
| 355 | #include <arm64_neon.h> | 369 | #include <arm64_neon.h> |
| 356 | #else | 370 | #else |
| 371 | |||
| 372 | /* | ||
| 373 | #if !defined(__ARM_NEON) | ||
| 374 | #if defined(Z7_GCC_VERSION) && (__GNUC__ < 5) \ | ||
| 375 | || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 90201) \ | ||
| 376 | || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 100100) | ||
| 377 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 378 | #pragma message("#define __ARM_NEON 1") | ||
| 379 | // #define __ARM_NEON 1 | ||
| 380 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 381 | #endif | ||
| 382 | #endif | ||
| 383 | */ | ||
| 357 | #include <arm_neon.h> | 384 | #include <arm_neon.h> |
| 358 | #endif | 385 | #endif |
| 386 | #endif | ||
| 359 | 387 | ||
| 360 | #ifndef USE_SWAP_128 | 388 | #ifndef USE_SWAP_128 |
| 361 | #define FORCE_SWAP_MODE | 389 | #define FORCE_SWAP_MODE |
| @@ -464,6 +492,13 @@ Z7_ATTRIB_NO_VECTOR \ | |||
| 464 | void Z7_FASTCALL | 492 | void Z7_FASTCALL |
| 465 | 493 | ||
| 466 | 494 | ||
| 495 | #if defined(MY_CPU_ARM_OR_ARM64) | ||
| 496 | #if defined(__clang__) | ||
| 497 | #pragma GCC diagnostic ignored "-Wlanguage-extension-token" | ||
| 498 | #endif | ||
| 499 | #endif | ||
| 500 | |||
| 501 | |||
| 467 | #ifdef MY_CPU_64BIT | 502 | #ifdef MY_CPU_64BIT |
| 468 | 503 | ||
| 469 | #if defined(MY_CPU_ARM64) \ | 504 | #if defined(MY_CPU_ARM64) \ |
diff --git a/C/Threads.c b/C/Threads.c index cf52bd3..464efec 100644 --- a/C/Threads.c +++ b/C/Threads.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Threads.c -- multithreading library | 1 | /* Threads.c -- multithreading library |
| 2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-03-28 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -195,20 +195,19 @@ WRes CriticalSection_Init(CCriticalSection *p) | |||
| 195 | 195 | ||
| 196 | // ---------- POSIX ---------- | 196 | // ---------- POSIX ---------- |
| 197 | 197 | ||
| 198 | #ifndef __APPLE__ | 198 | #if defined(__linux__) && !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__) |
| 199 | #ifndef Z7_AFFINITY_DISABLE | 199 | #ifndef Z7_AFFINITY_DISABLE |
| 200 | // _GNU_SOURCE can be required for pthread_setaffinity_np() / CPU_ZERO / CPU_SET | 200 | // _GNU_SOURCE can be required for pthread_setaffinity_np() / CPU_ZERO / CPU_SET |
| 201 | // clang < 3.6 : unknown warning group '-Wreserved-id-macro' | 201 | // clang < 3.6 : unknown warning group '-Wreserved-id-macro' |
| 202 | // clang 3.6 - 12.01 : gives warning "macro name is a reserved identifier" | 202 | // clang 3.6 - 12.01 : gives warning "macro name is a reserved identifier" |
| 203 | // clang >= 13 : do not give warning | 203 | // clang >= 13 : do not give warning |
| 204 | #if !defined(_GNU_SOURCE) | 204 | #if !defined(_GNU_SOURCE) |
| 205 | #if defined(__clang__) && (__clang_major__ >= 4) && (__clang_major__ <= 12) | 205 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER |
| 206 | #pragma GCC diagnostic ignored "-Wreserved-id-macro" | 206 | // #define _GNU_SOURCE |
| 207 | #endif | 207 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER |
| 208 | #define _GNU_SOURCE | ||
| 209 | #endif // !defined(_GNU_SOURCE) | 208 | #endif // !defined(_GNU_SOURCE) |
| 210 | #endif // Z7_AFFINITY_DISABLE | 209 | #endif // Z7_AFFINITY_DISABLE |
| 211 | #endif // __APPLE__ | 210 | #endif // __linux__ |
| 212 | 211 | ||
| 213 | #include "Threads.h" | 212 | #include "Threads.h" |
| 214 | 213 | ||
| @@ -244,8 +243,9 @@ WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, | |||
| 244 | { | 243 | { |
| 245 | if (cpuSet) | 244 | if (cpuSet) |
| 246 | { | 245 | { |
| 247 | #ifdef Z7_AFFINITY_SUPPORTED | 246 | // pthread_attr_setaffinity_np() is not supported for MUSL compile. |
| 248 | 247 | // so we check for __GLIBC__ here | |
| 248 | #if defined(Z7_AFFINITY_SUPPORTED) && defined( __GLIBC__) | ||
| 249 | /* | 249 | /* |
| 250 | printf("\n affinity :"); | 250 | printf("\n affinity :"); |
| 251 | unsigned i; | 251 | unsigned i; |
| @@ -267,7 +267,7 @@ WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, | |||
| 267 | // ret2 = | 267 | // ret2 = |
| 268 | pthread_attr_setaffinity_np(&attr, sizeof(*cpuSet), cpuSet); | 268 | pthread_attr_setaffinity_np(&attr, sizeof(*cpuSet), cpuSet); |
| 269 | // if (ret2) ret = ret2; | 269 | // if (ret2) ret = ret2; |
| 270 | #endif | 270 | #endif |
| 271 | } | 271 | } |
| 272 | 272 | ||
| 273 | ret = pthread_create(&p->_tid, &attr, func, param); | 273 | ret = pthread_create(&p->_tid, &attr, func, param); |
| @@ -369,13 +369,20 @@ WRes AutoResetEvent_CreateNotSignaled(CAutoResetEvent *p) | |||
| 369 | { return AutoResetEvent_Create(p, 0); } | 369 | { return AutoResetEvent_Create(p, 0); } |
| 370 | 370 | ||
| 371 | 371 | ||
| 372 | #if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13) | ||
| 373 | // freebsd: | ||
| 374 | #pragma GCC diagnostic ignored "-Wthread-safety-analysis" | ||
| 375 | #endif | ||
| 376 | |||
| 372 | WRes Event_Set(CEvent *p) | 377 | WRes Event_Set(CEvent *p) |
| 373 | { | 378 | { |
| 374 | RINOK(pthread_mutex_lock(&p->_mutex)) | 379 | RINOK(pthread_mutex_lock(&p->_mutex)) |
| 375 | p->_state = True; | 380 | p->_state = True; |
| 376 | int res1 = pthread_cond_broadcast(&p->_cond); | 381 | { |
| 377 | int res2 = pthread_mutex_unlock(&p->_mutex); | 382 | const int res1 = pthread_cond_broadcast(&p->_cond); |
| 378 | return (res2 ? res2 : res1); | 383 | const int res2 = pthread_mutex_unlock(&p->_mutex); |
| 384 | return (res2 ? res2 : res1); | ||
| 385 | } | ||
| 379 | } | 386 | } |
| 380 | 387 | ||
| 381 | WRes Event_Reset(CEvent *p) | 388 | WRes Event_Reset(CEvent *p) |
| @@ -408,8 +415,8 @@ WRes Event_Close(CEvent *p) | |||
| 408 | return 0; | 415 | return 0; |
| 409 | p->_created = 0; | 416 | p->_created = 0; |
| 410 | { | 417 | { |
| 411 | int res1 = pthread_mutex_destroy(&p->_mutex); | 418 | const int res1 = pthread_mutex_destroy(&p->_mutex); |
| 412 | int res2 = pthread_cond_destroy(&p->_cond); | 419 | const int res2 = pthread_cond_destroy(&p->_cond); |
| 413 | return (res1 ? res1 : res2); | 420 | return (res1 ? res1 : res2); |
| 414 | } | 421 | } |
| 415 | } | 422 | } |
| @@ -487,8 +494,8 @@ WRes Semaphore_Close(CSemaphore *p) | |||
| 487 | return 0; | 494 | return 0; |
| 488 | p->_created = 0; | 495 | p->_created = 0; |
| 489 | { | 496 | { |
| 490 | int res1 = pthread_mutex_destroy(&p->_mutex); | 497 | const int res1 = pthread_mutex_destroy(&p->_mutex); |
| 491 | int res2 = pthread_cond_destroy(&p->_cond); | 498 | const int res2 = pthread_cond_destroy(&p->_cond); |
| 492 | return (res1 ? res1 : res2); | 499 | return (res1 ? res1 : res2); |
| 493 | } | 500 | } |
| 494 | } | 501 | } |
| @@ -549,6 +556,18 @@ LONG InterlockedIncrement(LONG volatile *addend) | |||
| 549 | #endif | 556 | #endif |
| 550 | } | 557 | } |
| 551 | 558 | ||
| 559 | LONG InterlockedDecrement(LONG volatile *addend) | ||
| 560 | { | ||
| 561 | // Print("InterlockedDecrement") | ||
| 562 | #ifdef USE_HACK_UNSAFE_ATOMIC | ||
| 563 | LONG val = *addend - 1; | ||
| 564 | *addend = val; | ||
| 565 | return val; | ||
| 566 | #else | ||
| 567 | return __sync_sub_and_fetch(addend, 1); | ||
| 568 | #endif | ||
| 569 | } | ||
| 570 | |||
| 552 | #endif // _WIN32 | 571 | #endif // _WIN32 |
| 553 | 572 | ||
| 554 | WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p) | 573 | WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p) |
diff --git a/C/Threads.h b/C/Threads.h index 4028464..c1484a2 100644 --- a/C/Threads.h +++ b/C/Threads.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Threads.h -- multithreading library | 1 | /* Threads.h -- multithreading library |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-28 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_THREADS_H | 4 | #ifndef ZIP7_INC_THREADS_H |
| 5 | #define ZIP7_INC_THREADS_H | 5 | #define ZIP7_INC_THREADS_H |
| @@ -9,12 +9,21 @@ | |||
| 9 | 9 | ||
| 10 | #else | 10 | #else |
| 11 | 11 | ||
| 12 | #include "Compiler.h" | ||
| 13 | |||
| 14 | // #define Z7_AFFINITY_DISABLE | ||
| 12 | #if defined(__linux__) | 15 | #if defined(__linux__) |
| 13 | #if !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__) | 16 | #if !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__) |
| 14 | #ifndef Z7_AFFINITY_DISABLE | 17 | #ifndef Z7_AFFINITY_DISABLE |
| 15 | #define Z7_AFFINITY_SUPPORTED | 18 | #define Z7_AFFINITY_SUPPORTED |
| 16 | // #pragma message(" ==== Z7_AFFINITY_SUPPORTED") | 19 | // #pragma message(" ==== Z7_AFFINITY_SUPPORTED") |
| 17 | // #define _GNU_SOURCE | 20 | #if !defined(_GNU_SOURCE) |
| 21 | // #pragma message(" ==== _GNU_SOURCE set") | ||
| 22 | // we need _GNU_SOURCE for cpu_set_t, if we compile for MUSL | ||
| 23 | Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER | ||
| 24 | #define _GNU_SOURCE | ||
| 25 | Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER | ||
| 26 | #endif | ||
| 18 | #endif | 27 | #endif |
| 19 | #endif | 28 | #endif |
| 20 | #endif | 29 | #endif |
| @@ -173,7 +182,7 @@ WRes CriticalSection_Init(CCriticalSection *p); | |||
| 173 | 182 | ||
| 174 | #else // _WIN32 | 183 | #else // _WIN32 |
| 175 | 184 | ||
| 176 | typedef struct _CEvent | 185 | typedef struct |
| 177 | { | 186 | { |
| 178 | int _created; | 187 | int _created; |
| 179 | int _manual_reset; | 188 | int _manual_reset; |
| @@ -199,7 +208,7 @@ WRes Event_Wait(CEvent *p); | |||
| 199 | WRes Event_Close(CEvent *p); | 208 | WRes Event_Close(CEvent *p); |
| 200 | 209 | ||
| 201 | 210 | ||
| 202 | typedef struct _CSemaphore | 211 | typedef struct |
| 203 | { | 212 | { |
| 204 | int _created; | 213 | int _created; |
| 205 | UInt32 _count; | 214 | UInt32 _count; |
| @@ -219,7 +228,7 @@ WRes Semaphore_Wait(CSemaphore *p); | |||
| 219 | WRes Semaphore_Close(CSemaphore *p); | 228 | WRes Semaphore_Close(CSemaphore *p); |
| 220 | 229 | ||
| 221 | 230 | ||
| 222 | typedef struct _CCriticalSection | 231 | typedef struct |
| 223 | { | 232 | { |
| 224 | pthread_mutex_t _mutex; | 233 | pthread_mutex_t _mutex; |
| 225 | } CCriticalSection; | 234 | } CCriticalSection; |
| @@ -230,6 +239,7 @@ void CriticalSection_Enter(CCriticalSection *cs); | |||
| 230 | void CriticalSection_Leave(CCriticalSection *cs); | 239 | void CriticalSection_Leave(CCriticalSection *cs); |
| 231 | 240 | ||
| 232 | LONG InterlockedIncrement(LONG volatile *addend); | 241 | LONG InterlockedIncrement(LONG volatile *addend); |
| 242 | LONG InterlockedDecrement(LONG volatile *addend); | ||
| 233 | 243 | ||
| 234 | #endif // _WIN32 | 244 | #endif // _WIN32 |
| 235 | 245 | ||
diff --git a/C/Util/7z/7z.dsp b/C/Util/7z/7z.dsp index 11e1b03..474c660 100644 --- a/C/Util/7z/7z.dsp +++ b/C/Util/7z/7z.dsp | |||
| @@ -42,7 +42,7 @@ RSC=rc.exe | |||
| 42 | # PROP Ignore_Export_Lib 0 | 42 | # PROP Ignore_Export_Lib 0 |
| 43 | # PROP Target_Dir "" | 43 | # PROP Target_Dir "" |
| 44 | # ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c | 44 | # ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c |
| 45 | # ADD CPP /nologo /MD /W4 /WX /GX /O2 /D "NDEBUG" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /FAcs /Yu"Precomp.h" /FD /c | 45 | # ADD CPP /nologo /MD /W4 /WX /GX /O2 /D "NDEBUG" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /D "Z7_EXTRACT_ONLY" /FAcs /Yu"Precomp.h" /FD /c |
| 46 | # ADD BASE RSC /l 0x419 /d "NDEBUG" | 46 | # ADD BASE RSC /l 0x419 /d "NDEBUG" |
| 47 | # ADD RSC /l 0x419 /d "NDEBUG" | 47 | # ADD RSC /l 0x419 /d "NDEBUG" |
| 48 | BSC32=bscmake.exe | 48 | BSC32=bscmake.exe |
| @@ -67,7 +67,7 @@ LINK32=link.exe | |||
| 67 | # PROP Ignore_Export_Lib 0 | 67 | # PROP Ignore_Export_Lib 0 |
| 68 | # PROP Target_Dir "" | 68 | # PROP Target_Dir "" |
| 69 | # ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c | 69 | # ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c |
| 70 | # ADD CPP /nologo /W4 /WX /Gm /GX /ZI /Od /D "_DEBUG" /D "_SZ_ALLOC_DEBUG2" /D "_SZ_NO_INT_64_A" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /Yu"Precomp.h" /FD /GZ /c | 70 | # ADD CPP /nologo /W4 /WX /Gm /GX /ZI /Od /D "_DEBUG" /D "_SZ_ALLOC_DEBUG2" /D "_SZ_NO_INT_64_A" /D "WIN32" /D "_CONSOLE" /D "_UNICODE" /D "UNICODE" /D "Z7_PPMD_SUPPORT" /D "Z7_EXTRACT_ONLY" /Yu"Precomp.h" /FD /GZ /c |
| 71 | # ADD BASE RSC /l 0x419 /d "_DEBUG" | 71 | # ADD BASE RSC /l 0x419 /d "_DEBUG" |
| 72 | # ADD RSC /l 0x419 /d "_DEBUG" | 72 | # ADD RSC /l 0x419 /d "_DEBUG" |
| 73 | BSC32=bscmake.exe | 73 | BSC32=bscmake.exe |
| @@ -234,6 +234,10 @@ SOURCE=.\Precomp.c | |||
| 234 | # End Source File | 234 | # End Source File |
| 235 | # Begin Source File | 235 | # Begin Source File |
| 236 | 236 | ||
| 237 | SOURCE=..\..\Precomp.h | ||
| 238 | # End Source File | ||
| 239 | # Begin Source File | ||
| 240 | |||
| 237 | SOURCE=.\Precomp.h | 241 | SOURCE=.\Precomp.h |
| 238 | # End Source File | 242 | # End Source File |
| 239 | # End Group | 243 | # End Group |
diff --git a/C/Util/7z/7zMain.c b/C/Util/7z/7zMain.c index 547920a..6baf979 100644 --- a/C/Util/7z/7zMain.c +++ b/C/Util/7z/7zMain.c | |||
| @@ -1,20 +1,11 @@ | |||
| 1 | /* 7zMain.c - Test application for 7z Decoder | 1 | /* 7zMain.c - Test application for 7z Decoder |
| 2 | 2023-04-04 : Igor Pavlov : Public domain */ | 2 | 2024-02-28 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| 6 | #include <stdio.h> | 6 | #include <stdio.h> |
| 7 | #include <string.h> | 7 | #include <string.h> |
| 8 | 8 | ||
| 9 | #include "../../CpuArch.h" | ||
| 10 | |||
| 11 | #include "../../7z.h" | ||
| 12 | #include "../../7zAlloc.h" | ||
| 13 | #include "../../7zBuf.h" | ||
| 14 | #include "../../7zCrc.h" | ||
| 15 | #include "../../7zFile.h" | ||
| 16 | #include "../../7zVersion.h" | ||
| 17 | |||
| 18 | #ifndef USE_WINDOWS_FILE | 9 | #ifndef USE_WINDOWS_FILE |
| 19 | /* for mkdir */ | 10 | /* for mkdir */ |
| 20 | #ifdef _WIN32 | 11 | #ifdef _WIN32 |
| @@ -32,6 +23,15 @@ | |||
| 32 | #endif | 23 | #endif |
| 33 | #endif | 24 | #endif |
| 34 | 25 | ||
| 26 | #include "../../7zFile.h" | ||
| 27 | #include "../../7z.h" | ||
| 28 | #include "../../7zAlloc.h" | ||
| 29 | #include "../../7zBuf.h" | ||
| 30 | #include "../../7zCrc.h" | ||
| 31 | #include "../../7zVersion.h" | ||
| 32 | |||
| 33 | #include "../../CpuArch.h" | ||
| 34 | |||
| 35 | #define kInputBufSize ((size_t)1 << 18) | 35 | #define kInputBufSize ((size_t)1 << 18) |
| 36 | 36 | ||
| 37 | static const ISzAlloc g_Alloc = { SzAlloc, SzFree }; | 37 | static const ISzAlloc g_Alloc = { SzAlloc, SzFree }; |
| @@ -168,12 +168,12 @@ static SRes Utf16_To_Char(CBuf *buf, const UInt16 *s | |||
| 168 | #endif | 168 | #endif |
| 169 | ) | 169 | ) |
| 170 | { | 170 | { |
| 171 | unsigned len = 0; | 171 | size_t len = 0; |
| 172 | for (len = 0; s[len] != 0; len++) {} | 172 | for (len = 0; s[len] != 0; len++) {} |
| 173 | 173 | ||
| 174 | #ifndef MY_USE_UTF8 | 174 | #ifndef MY_USE_UTF8 |
| 175 | { | 175 | { |
| 176 | const unsigned size = len * 3 + 100; | 176 | const size_t size = len * 3 + 100; |
| 177 | if (!Buf_EnsureSize(buf, size)) | 177 | if (!Buf_EnsureSize(buf, size)) |
| 178 | return SZ_ERROR_MEM; | 178 | return SZ_ERROR_MEM; |
| 179 | { | 179 | { |
| @@ -320,21 +320,20 @@ static void UIntToStr_2(char *s, unsigned value) | |||
| 320 | // typedef long BOOL; | 320 | // typedef long BOOL; |
| 321 | typedef int BOOL; | 321 | typedef int BOOL; |
| 322 | 322 | ||
| 323 | typedef struct _FILETIME | 323 | typedef struct |
| 324 | { | 324 | { |
| 325 | DWORD dwLowDateTime; | 325 | DWORD dwLowDateTime; |
| 326 | DWORD dwHighDateTime; | 326 | DWORD dwHighDateTime; |
| 327 | } FILETIME; | 327 | } FILETIME; |
| 328 | 328 | ||
| 329 | static LONG TIME_GetBias() | 329 | static LONG TIME_GetBias(void) |
| 330 | { | 330 | { |
| 331 | const time_t utc = time(NULL); | 331 | const time_t utc = time(NULL); |
| 332 | struct tm *ptm = localtime(&utc); | 332 | struct tm *ptm = localtime(&utc); |
| 333 | const int localdaylight = ptm->tm_isdst; /* daylight for local timezone */ | 333 | const int localdaylight = ptm->tm_isdst; /* daylight for local timezone */ |
| 334 | ptm = gmtime(&utc); | 334 | ptm = gmtime(&utc); |
| 335 | ptm->tm_isdst = localdaylight; /* use local daylight, not that of Greenwich */ | 335 | ptm->tm_isdst = localdaylight; /* use local daylight, not that of Greenwich */ |
| 336 | const LONG bias = (int)(mktime(ptm) - utc); | 336 | return (int)(mktime(ptm) - utc); |
| 337 | return bias; | ||
| 338 | } | 337 | } |
| 339 | 338 | ||
| 340 | #define TICKS_PER_SEC 10000000 | 339 | #define TICKS_PER_SEC 10000000 |
| @@ -359,11 +358,11 @@ static BOOL WINAPI FileTimeToLocalFileTime(const FILETIME *fileTime, FILETIME *l | |||
| 359 | static const UInt32 kNumTimeQuantumsInSecond = 10000000; | 358 | static const UInt32 kNumTimeQuantumsInSecond = 10000000; |
| 360 | static const UInt32 kFileTimeStartYear = 1601; | 359 | static const UInt32 kFileTimeStartYear = 1601; |
| 361 | static const UInt32 kUnixTimeStartYear = 1970; | 360 | static const UInt32 kUnixTimeStartYear = 1970; |
| 362 | static const UInt64 kUnixTimeOffset = | ||
| 363 | (UInt64)60 * 60 * 24 * (89 + 365 * (kUnixTimeStartYear - kFileTimeStartYear)); | ||
| 364 | 361 | ||
| 365 | static Int64 Time_FileTimeToUnixTime64(const FILETIME *ft) | 362 | static Int64 Time_FileTimeToUnixTime64(const FILETIME *ft) |
| 366 | { | 363 | { |
| 364 | const UInt64 kUnixTimeOffset = | ||
| 365 | (UInt64)60 * 60 * 24 * (89 + 365 * (kUnixTimeStartYear - kFileTimeStartYear)); | ||
| 367 | const UInt64 winTime = GET_TIME_64(ft); | 366 | const UInt64 winTime = GET_TIME_64(ft); |
| 368 | return (Int64)(winTime / kNumTimeQuantumsInSecond) - (Int64)kUnixTimeOffset; | 367 | return (Int64)(winTime / kNumTimeQuantumsInSecond) - (Int64)kUnixTimeOffset; |
| 369 | } | 368 | } |
| @@ -384,8 +383,10 @@ static void FILETIME_To_timespec(const FILETIME *ft, struct MY_ST_TIMESPEC *ts) | |||
| 384 | if (sec2 == sec) | 383 | if (sec2 == sec) |
| 385 | { | 384 | { |
| 386 | ts->tv_sec = sec2; | 385 | ts->tv_sec = sec2; |
| 387 | const UInt64 winTime = GET_TIME_64(ft); | 386 | { |
| 388 | ts->tv_nsec = (long)((winTime % 10000000) * 100); | 387 | const UInt64 winTime = GET_TIME_64(ft); |
| 388 | ts->tv_nsec = (long)((winTime % 10000000) * 100); | ||
| 389 | } | ||
| 389 | return; | 390 | return; |
| 390 | } | 391 | } |
| 391 | } | 392 | } |
| @@ -429,7 +430,7 @@ static void ConvertFileTimeToString(const CNtfsFileTime *nTime, char *s) | |||
| 429 | { | 430 | { |
| 430 | unsigned year, mon, hour, min, sec; | 431 | unsigned year, mon, hour, min, sec; |
| 431 | Byte ms[] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }; | 432 | Byte ms[] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }; |
| 432 | unsigned t; | 433 | UInt32 t; |
| 433 | UInt32 v; | 434 | UInt32 v; |
| 434 | // UInt64 v64 = nt->Low | ((UInt64)nt->High << 32); | 435 | // UInt64 v64 = nt->Low | ((UInt64)nt->High << 32); |
| 435 | UInt64 v64; | 436 | UInt64 v64; |
| @@ -461,7 +462,7 @@ static void ConvertFileTimeToString(const CNtfsFileTime *nTime, char *s) | |||
| 461 | ms[1] = 29; | 462 | ms[1] = 29; |
| 462 | for (mon = 0;; mon++) | 463 | for (mon = 0;; mon++) |
| 463 | { | 464 | { |
| 464 | const unsigned d = ms[mon]; | 465 | const UInt32 d = ms[mon]; |
| 465 | if (v < d) | 466 | if (v < d) |
| 466 | break; | 467 | break; |
| 467 | v -= d; | 468 | v -= d; |
diff --git a/C/Util/7z/Precomp.h b/C/Util/7z/Precomp.h index bc8fa21..13a41ef 100644 --- a/C/Util/7z/Precomp.h +++ b/C/Util/7z/Precomp.h | |||
| @@ -1,14 +1,13 @@ | |||
| 1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- Precomp |
| 2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-23 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | // #ifndef ZIP7_INC_PRECOMP_LOC_H |
| 5 | #define ZIP7_INC_PRECOMP_H | 5 | // #define ZIP7_INC_PRECOMP_LOC_H |
| 6 | 6 | ||
| 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 | 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 |
| 8 | #pragma warning(disable : 4464) // relative include path contains '..' | 8 | #pragma warning(disable : 4464) // relative include path contains '..' |
| 9 | #endif | 9 | #endif |
| 10 | 10 | ||
| 11 | #include "../../Compiler.h" | 11 | #include "../../Precomp.h" |
| 12 | #include "../../7zTypes.h" | ||
| 13 | 12 | ||
| 14 | #endif | 13 | // #endif |
diff --git a/C/Util/7z/makefile b/C/Util/7z/makefile index dfc560e..987f065 100644 --- a/C/Util/7z/makefile +++ b/C/Util/7z/makefile | |||
| @@ -5,8 +5,6 @@ PROG = 7zDec.exe | |||
| 5 | C_OBJS = \ | 5 | C_OBJS = \ |
| 6 | $O\7zAlloc.obj \ | 6 | $O\7zAlloc.obj \ |
| 7 | $O\7zBuf.obj \ | 7 | $O\7zBuf.obj \ |
| 8 | $O\7zCrc.obj \ | ||
| 9 | $O\7zCrcOpt.obj \ | ||
| 10 | $O\7zFile.obj \ | 8 | $O\7zFile.obj \ |
| 11 | $O\7zDec.obj \ | 9 | $O\7zDec.obj \ |
| 12 | $O\7zArcIn.obj \ | 10 | $O\7zArcIn.obj \ |
| @@ -25,10 +23,14 @@ C_OBJS = \ | |||
| 25 | 7Z_OBJS = \ | 23 | 7Z_OBJS = \ |
| 26 | $O\7zMain.obj \ | 24 | $O\7zMain.obj \ |
| 27 | 25 | ||
| 26 | !include "../../../CPP/7zip/Crc.mak" | ||
| 27 | !include "../../../CPP/7zip/LzmaDec.mak" | ||
| 28 | |||
| 28 | OBJS = \ | 29 | OBJS = \ |
| 29 | $O\Precomp.obj \ | 30 | $O\Precomp.obj \ |
| 30 | $(7Z_OBJS) \ | 31 | $(7Z_OBJS) \ |
| 31 | $(C_OBJS) \ | 32 | $(C_OBJS) \ |
| 33 | $(ASM_OBJS) \ | ||
| 32 | 34 | ||
| 33 | !include "../../../CPP/Build.mak" | 35 | !include "../../../CPP/Build.mak" |
| 34 | 36 | ||
| @@ -38,3 +40,5 @@ $(C_OBJS): ../../$(*B).c | |||
| 38 | $(CCOMPL_USE) | 40 | $(CCOMPL_USE) |
| 39 | $O\Precomp.obj: Precomp.c | 41 | $O\Precomp.obj: Precomp.c |
| 40 | $(CCOMPL_PCH) | 42 | $(CCOMPL_PCH) |
| 43 | |||
| 44 | !include "../../Asm_c.mak" | ||
diff --git a/C/Util/7zipInstall/7zipInstall.c b/C/Util/7zipInstall/7zipInstall.c index 7f5fd19..7d8e8c4 100644 --- a/C/Util/7zipInstall/7zipInstall.c +++ b/C/Util/7zipInstall/7zipInstall.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* 7zipInstall.c - 7-Zip Installer | 1 | /* 7zipInstall.c - 7-Zip Installer |
| 2 | 2023-04-04 : Igor Pavlov : Public domain */ | 2 | 2024-04-05 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -11,6 +11,8 @@ | |||
| 11 | #pragma warning(disable : 4201) // nonstandard extension used : nameless struct/union | 11 | #pragma warning(disable : 4201) // nonstandard extension used : nameless struct/union |
| 12 | #endif | 12 | #endif |
| 13 | 13 | ||
| 14 | Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION | ||
| 15 | |||
| 14 | #ifdef Z7_OLD_WIN_SDK | 16 | #ifdef Z7_OLD_WIN_SDK |
| 15 | struct IShellView; | 17 | struct IShellView; |
| 16 | #define SHFOLDERAPI EXTERN_C DECLSPEC_IMPORT HRESULT STDAPICALLTYPE | 18 | #define SHFOLDERAPI EXTERN_C DECLSPEC_IMPORT HRESULT STDAPICALLTYPE |
| @@ -41,16 +43,6 @@ typedef enum { | |||
| 41 | // #pragma GCC diagnostic ignored "-Wcast-function-type" | 43 | // #pragma GCC diagnostic ignored "-Wcast-function-type" |
| 42 | #endif | 44 | #endif |
| 43 | 45 | ||
| 44 | #if defined(__clang__) || defined(__GNUC__) | ||
| 45 | typedef void (*Z7_voidFunction)(void); | ||
| 46 | #define MY_CAST_FUNC (Z7_voidFunction) | ||
| 47 | #elif defined(_MSC_VER) && _MSC_VER > 1920 | ||
| 48 | #define MY_CAST_FUNC (void *) | ||
| 49 | // #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' | ||
| 50 | #else | ||
| 51 | #define MY_CAST_FUNC | ||
| 52 | #endif | ||
| 53 | |||
| 54 | #define LLL_(quote) L##quote | 46 | #define LLL_(quote) L##quote |
| 55 | #define LLL(quote) LLL_(quote) | 47 | #define LLL(quote) LLL_(quote) |
| 56 | 48 | ||
| @@ -118,11 +110,13 @@ static LPCWSTR const k_Reg_Path32 = L"Path" | |||
| 118 | #define k_Reg_WOW_Flag 0 | 110 | #define k_Reg_WOW_Flag 0 |
| 119 | #endif | 111 | #endif |
| 120 | 112 | ||
| 113 | #ifdef USE_7ZIP_32_DLL | ||
| 121 | #ifdef _WIN64 | 114 | #ifdef _WIN64 |
| 122 | #define k_Reg_WOW_Flag_32 KEY_WOW64_32KEY | 115 | #define k_Reg_WOW_Flag_32 KEY_WOW64_32KEY |
| 123 | #else | 116 | #else |
| 124 | #define k_Reg_WOW_Flag_32 0 | 117 | #define k_Reg_WOW_Flag_32 0 |
| 125 | #endif | 118 | #endif |
| 119 | #endif | ||
| 126 | 120 | ||
| 127 | #define k_7zip_CLSID L"{23170F69-40C1-278A-1000-000100020000}" | 121 | #define k_7zip_CLSID L"{23170F69-40C1-278A-1000-000100020000}" |
| 128 | 122 | ||
| @@ -219,11 +213,11 @@ static DWORD GetFileVersion(LPCWSTR s) | |||
| 219 | return 0; | 213 | return 0; |
| 220 | } | 214 | } |
| 221 | 215 | ||
| 222 | my_GetFileVersionInfoSizeW = (Func_GetFileVersionInfoSizeW) MY_CAST_FUNC GetProcAddress(g_version_dll_hModule, | 216 | my_GetFileVersionInfoSizeW = (Func_GetFileVersionInfoSizeW) Z7_CAST_FUNC_C GetProcAddress(g_version_dll_hModule, |
| 223 | "GetFileVersionInfoSizeW"); | 217 | "GetFileVersionInfoSizeW"); |
| 224 | my_GetFileVersionInfoW = (Func_GetFileVersionInfoW) MY_CAST_FUNC GetProcAddress(g_version_dll_hModule, | 218 | my_GetFileVersionInfoW = (Func_GetFileVersionInfoW) Z7_CAST_FUNC_C GetProcAddress(g_version_dll_hModule, |
| 225 | "GetFileVersionInfoW"); | 219 | "GetFileVersionInfoW"); |
| 226 | my_VerQueryValueW = (Func_VerQueryValueW) MY_CAST_FUNC GetProcAddress(g_version_dll_hModule, | 220 | my_VerQueryValueW = (Func_VerQueryValueW) Z7_CAST_FUNC_C GetProcAddress(g_version_dll_hModule, |
| 227 | "VerQueryValueW"); | 221 | "VerQueryValueW"); |
| 228 | 222 | ||
| 229 | if (!my_GetFileVersionInfoSizeW | 223 | if (!my_GetFileVersionInfoSizeW |
| @@ -1102,7 +1096,7 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, | |||
| 1102 | { | 1096 | { |
| 1103 | BOOL isWow64 = FALSE; | 1097 | BOOL isWow64 = FALSE; |
| 1104 | const Func_IsWow64Process func_IsWow64Process = (Func_IsWow64Process) | 1098 | const Func_IsWow64Process func_IsWow64Process = (Func_IsWow64Process) |
| 1105 | MY_CAST_FUNC GetProcAddress(GetModuleHandleW(L"kernel32.dll"), | 1099 | Z7_CAST_FUNC_C GetProcAddress(GetModuleHandleW(L"kernel32.dll"), |
| 1106 | "IsWow64Process"); | 1100 | "IsWow64Process"); |
| 1107 | 1101 | ||
| 1108 | if (func_IsWow64Process) | 1102 | if (func_IsWow64Process) |
| @@ -1111,7 +1105,13 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, | |||
| 1111 | if (!isWow64) | 1105 | if (!isWow64) |
| 1112 | { | 1106 | { |
| 1113 | if (!g_SilentMode) | 1107 | if (!g_SilentMode) |
| 1114 | PrintErrorMessage("This installation requires Windows " MY_CPU_NAME, NULL); | 1108 | PrintErrorMessage("This installation requires Windows " |
| 1109 | #ifdef MY_CPU_X86_OR_AMD64 | ||
| 1110 | "x64" | ||
| 1111 | #else | ||
| 1112 | "64-bit" | ||
| 1113 | #endif | ||
| 1114 | , NULL); | ||
| 1115 | return 1; | 1115 | return 1; |
| 1116 | } | 1116 | } |
| 1117 | } | 1117 | } |
diff --git a/C/Util/7zipInstall/Precomp.h b/C/Util/7zipInstall/Precomp.h index bc8fa21..13a41ef 100644 --- a/C/Util/7zipInstall/Precomp.h +++ b/C/Util/7zipInstall/Precomp.h | |||
| @@ -1,14 +1,13 @@ | |||
| 1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- Precomp |
| 2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-23 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | // #ifndef ZIP7_INC_PRECOMP_LOC_H |
| 5 | #define ZIP7_INC_PRECOMP_H | 5 | // #define ZIP7_INC_PRECOMP_LOC_H |
| 6 | 6 | ||
| 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 | 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 |
| 8 | #pragma warning(disable : 4464) // relative include path contains '..' | 8 | #pragma warning(disable : 4464) // relative include path contains '..' |
| 9 | #endif | 9 | #endif |
| 10 | 10 | ||
| 11 | #include "../../Compiler.h" | 11 | #include "../../Precomp.h" |
| 12 | #include "../../7zTypes.h" | ||
| 13 | 12 | ||
| 14 | #endif | 13 | // #endif |
diff --git a/C/Util/7zipInstall/makefile b/C/Util/7zipInstall/makefile index 18e2783..424bd6c 100644 --- a/C/Util/7zipInstall/makefile +++ b/C/Util/7zipInstall/makefile | |||
| @@ -19,9 +19,6 @@ C_OBJS = \ | |||
| 19 | $O\7zAlloc.obj \ | 19 | $O\7zAlloc.obj \ |
| 20 | $O\7zArcIn.obj \ | 20 | $O\7zArcIn.obj \ |
| 21 | $O\7zBuf.obj \ | 21 | $O\7zBuf.obj \ |
| 22 | $O\7zBuf2.obj \ | ||
| 23 | $O\7zCrc.obj \ | ||
| 24 | $O\7zCrcOpt.obj \ | ||
| 25 | $O\7zFile.obj \ | 22 | $O\7zFile.obj \ |
| 26 | $O\7zDec.obj \ | 23 | $O\7zDec.obj \ |
| 27 | $O\7zStream.obj \ | 24 | $O\7zStream.obj \ |
| @@ -34,11 +31,17 @@ C_OBJS = \ | |||
| 34 | OBJS = \ | 31 | OBJS = \ |
| 35 | $(MAIN_OBJS) \ | 32 | $(MAIN_OBJS) \ |
| 36 | $(C_OBJS) \ | 33 | $(C_OBJS) \ |
| 34 | $(ASM_OBJS) \ | ||
| 37 | $O\resource.res | 35 | $O\resource.res |
| 38 | 36 | ||
| 37 | !include "../../../CPP/7zip/Crc.mak" | ||
| 38 | # !include "../../../CPP/7zip/LzmaDec.mak" | ||
| 39 | |||
| 39 | !include "../../../CPP/Build.mak" | 40 | !include "../../../CPP/Build.mak" |
| 40 | 41 | ||
| 41 | $(MAIN_OBJS): $(*B).c | 42 | $(MAIN_OBJS): $(*B).c |
| 42 | $(COMPL_O1) | 43 | $(COMPL_O1) |
| 43 | $(C_OBJS): ../../$(*B).c | 44 | $(C_OBJS): ../../$(*B).c |
| 44 | $(COMPL_O1) | 45 | $(COMPL_O1) |
| 46 | |||
| 47 | !include "../../Asm_c.mak" | ||
diff --git a/C/Util/7zipInstall/resource.rc b/C/Util/7zipInstall/resource.rc index df6474e..40ed580 100644 --- a/C/Util/7zipInstall/resource.rc +++ b/C/Util/7zipInstall/resource.rc | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | #include <winnt.h> | 1 | #include <windows.h> |
| 2 | #include <WinUser.h> | 2 | // #include <winnt.h> |
| 3 | // #include <WinUser.h> | ||
| 3 | #include <CommCtrl.h> | 4 | #include <CommCtrl.h> |
| 4 | 5 | ||
| 5 | #define USE_COPYRIGHT_CR | 6 | #define USE_COPYRIGHT_CR |
diff --git a/C/Util/7zipUninstall/7zipUninstall.c b/C/Util/7zipUninstall/7zipUninstall.c index 8bc18b3..e7051e2 100644 --- a/C/Util/7zipUninstall/7zipUninstall.c +++ b/C/Util/7zipUninstall/7zipUninstall.c | |||
| @@ -1,10 +1,11 @@ | |||
| 1 | /* 7zipUninstall.c - 7-Zip Uninstaller | 1 | /* 7zipUninstall.c - 7-Zip Uninstaller |
| 2 | 2022-07-15 : Igor Pavlov : Public domain */ | 2 | 2024-03-21 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| 6 | // #define SZ_ERROR_ABORT 100 | 6 | // #define SZ_ERROR_ABORT 100 |
| 7 | 7 | ||
| 8 | #include "../../7zTypes.h" | ||
| 8 | #include "../../7zWindows.h" | 9 | #include "../../7zWindows.h" |
| 9 | 10 | ||
| 10 | #if defined(_MSC_VER) && _MSC_VER < 1600 | 11 | #if defined(_MSC_VER) && _MSC_VER < 1600 |
| @@ -31,16 +32,7 @@ typedef enum { | |||
| 31 | 32 | ||
| 32 | #include "resource.h" | 33 | #include "resource.h" |
| 33 | 34 | ||
| 34 | #if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__) | ||
| 35 | // #pragma GCC diagnostic ignored "-Wcast-function-type" | ||
| 36 | #endif | ||
| 37 | 35 | ||
| 38 | #if defined(_MSC_VER) && _MSC_VER > 1920 | ||
| 39 | #define MY_CAST_FUNC (void *) | ||
| 40 | // #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' | ||
| 41 | #else | ||
| 42 | #define MY_CAST_FUNC | ||
| 43 | #endif | ||
| 44 | 36 | ||
| 45 | 37 | ||
| 46 | #define LLL_(quote) L##quote | 38 | #define LLL_(quote) L##quote |
| @@ -101,11 +93,13 @@ static LPCWSTR const k_Reg_Path32 = L"Path" | |||
| 101 | #define k_Reg_WOW_Flag 0 | 93 | #define k_Reg_WOW_Flag 0 |
| 102 | #endif | 94 | #endif |
| 103 | 95 | ||
| 96 | #ifdef USE_7ZIP_32_DLL | ||
| 104 | #ifdef _WIN64 | 97 | #ifdef _WIN64 |
| 105 | #define k_Reg_WOW_Flag_32 KEY_WOW64_32KEY | 98 | #define k_Reg_WOW_Flag_32 KEY_WOW64_32KEY |
| 106 | #else | 99 | #else |
| 107 | #define k_Reg_WOW_Flag_32 0 | 100 | #define k_Reg_WOW_Flag_32 0 |
| 108 | #endif | 101 | #endif |
| 102 | #endif | ||
| 109 | 103 | ||
| 110 | #define k_7zip_CLSID L"{23170F69-40C1-278A-1000-000100020000}" | 104 | #define k_7zip_CLSID L"{23170F69-40C1-278A-1000-000100020000}" |
| 111 | 105 | ||
| @@ -124,9 +118,19 @@ static HWND g_Path_HWND; | |||
| 124 | static HWND g_InfoLine_HWND; | 118 | static HWND g_InfoLine_HWND; |
| 125 | static HWND g_Progress_HWND; | 119 | static HWND g_Progress_HWND; |
| 126 | 120 | ||
| 127 | // WINADVAPI | 121 | // RegDeleteKeyExW is supported starting from win2003sp1/xp-pro-x64 |
| 122 | // Z7_WIN32_WINNT_MIN < 0x0600 // Vista | ||
| 123 | #if !defined(Z7_WIN32_WINNT_MIN) \ | ||
| 124 | || Z7_WIN32_WINNT_MIN < 0x0502 /* < win2003 */ \ | ||
| 125 | || Z7_WIN32_WINNT_MIN == 0x0502 && !defined(_M_AMD64) | ||
| 126 | #define Z7_USE_DYN_RegDeleteKeyExW | ||
| 127 | #endif | ||
| 128 | |||
| 129 | #ifdef Z7_USE_DYN_RegDeleteKeyExW | ||
| 130 | Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION | ||
| 128 | typedef LONG (APIENTRY *Func_RegDeleteKeyExW)(HKEY hKey, LPCWSTR lpSubKey, REGSAM samDesired, DWORD Reserved); | 131 | typedef LONG (APIENTRY *Func_RegDeleteKeyExW)(HKEY hKey, LPCWSTR lpSubKey, REGSAM samDesired, DWORD Reserved); |
| 129 | static Func_RegDeleteKeyExW func_RegDeleteKeyExW; | 132 | static Func_RegDeleteKeyExW func_RegDeleteKeyExW; |
| 133 | #endif | ||
| 130 | 134 | ||
| 131 | static WCHAR cmd[MAX_PATH + 4]; | 135 | static WCHAR cmd[MAX_PATH + 4]; |
| 132 | static WCHAR cmdError[MAX_PATH + 4]; | 136 | static WCHAR cmdError[MAX_PATH + 4]; |
| @@ -247,13 +251,18 @@ static LONG MyRegistry_OpenKey_ReadWrite(HKEY parentKey, LPCWSTR name, HKEY *des | |||
| 247 | 251 | ||
| 248 | static LONG MyRegistry_DeleteKey(HKEY parentKey, LPCWSTR name) | 252 | static LONG MyRegistry_DeleteKey(HKEY parentKey, LPCWSTR name) |
| 249 | { | 253 | { |
| 250 | #if k_Reg_WOW_Flag != 0 | 254 | #if k_Reg_WOW_Flag != 0 |
| 251 | if (func_RegDeleteKeyExW) | 255 | #ifdef Z7_USE_DYN_RegDeleteKeyExW |
| 252 | return func_RegDeleteKeyExW(parentKey, name, k_Reg_WOW_Flag, 0); | 256 | if (!func_RegDeleteKeyExW) |
| 253 | return E_FAIL; | 257 | return E_FAIL; |
| 254 | #else | 258 | return func_RegDeleteKeyExW |
| 259 | #else | ||
| 260 | return RegDeleteKeyExW | ||
| 261 | #endif | ||
| 262 | (parentKey, name, k_Reg_WOW_Flag, 0); | ||
| 263 | #else | ||
| 255 | return RegDeleteKeyW(parentKey, name); | 264 | return RegDeleteKeyW(parentKey, name); |
| 256 | #endif | 265 | #endif |
| 257 | } | 266 | } |
| 258 | 267 | ||
| 259 | #ifdef USE_7ZIP_32_DLL | 268 | #ifdef USE_7ZIP_32_DLL |
| @@ -278,13 +287,18 @@ static LONG MyRegistry_OpenKey_ReadWrite_32(HKEY parentKey, LPCWSTR name, HKEY * | |||
| 278 | 287 | ||
| 279 | static LONG MyRegistry_DeleteKey_32(HKEY parentKey, LPCWSTR name) | 288 | static LONG MyRegistry_DeleteKey_32(HKEY parentKey, LPCWSTR name) |
| 280 | { | 289 | { |
| 281 | #if k_Reg_WOW_Flag_32 != 0 | 290 | #if k_Reg_WOW_Flag_32 != 0 |
| 282 | if (func_RegDeleteKeyExW) | 291 | #ifdef Z7_USE_DYN_RegDeleteKeyExW |
| 283 | return func_RegDeleteKeyExW(parentKey, name, k_Reg_WOW_Flag_32, 0); | 292 | if (!func_RegDeleteKeyExW) |
| 284 | return E_FAIL; | 293 | return E_FAIL; |
| 285 | #else | 294 | return func_RegDeleteKeyExW |
| 295 | #else | ||
| 296 | return RegDeleteKeyExW | ||
| 297 | #endif | ||
| 298 | (parentKey, name, k_Reg_WOW_Flag_32, 0); | ||
| 299 | #else | ||
| 286 | return RegDeleteKeyW(parentKey, name); | 300 | return RegDeleteKeyW(parentKey, name); |
| 287 | #endif | 301 | #endif |
| 288 | } | 302 | } |
| 289 | 303 | ||
| 290 | #endif | 304 | #endif |
| @@ -930,14 +944,17 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, | |||
| 930 | UNUSED_VAR(lpCmdLine) | 944 | UNUSED_VAR(lpCmdLine) |
| 931 | UNUSED_VAR(nCmdShow) | 945 | UNUSED_VAR(nCmdShow) |
| 932 | 946 | ||
| 933 | #ifndef UNDER_CE | 947 | #ifndef UNDER_CE |
| 934 | CoInitialize(NULL); | 948 | CoInitialize(NULL); |
| 935 | #endif | 949 | #endif |
| 936 | 950 | ||
| 937 | #ifndef UNDER_CE | 951 | #ifndef UNDER_CE |
| 938 | func_RegDeleteKeyExW = (Func_RegDeleteKeyExW) MY_CAST_FUNC | 952 | #ifdef Z7_USE_DYN_RegDeleteKeyExW |
| 939 | GetProcAddress(GetModuleHandleW(L"advapi32.dll"), "RegDeleteKeyExW"); | 953 | func_RegDeleteKeyExW = |
| 940 | #endif | 954 | (Func_RegDeleteKeyExW) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandleW(L"advapi32.dll"), |
| 955 | "RegDeleteKeyExW"); | ||
| 956 | #endif | ||
| 957 | #endif | ||
| 941 | 958 | ||
| 942 | { | 959 | { |
| 943 | const wchar_t *s = GetCommandLineW(); | 960 | const wchar_t *s = GetCommandLineW(); |
diff --git a/C/Util/7zipUninstall/Precomp.h b/C/Util/7zipUninstall/Precomp.h index bc8fa21..13a41ef 100644 --- a/C/Util/7zipUninstall/Precomp.h +++ b/C/Util/7zipUninstall/Precomp.h | |||
| @@ -1,14 +1,13 @@ | |||
| 1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- Precomp |
| 2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-23 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | // #ifndef ZIP7_INC_PRECOMP_LOC_H |
| 5 | #define ZIP7_INC_PRECOMP_H | 5 | // #define ZIP7_INC_PRECOMP_LOC_H |
| 6 | 6 | ||
| 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 | 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 |
| 8 | #pragma warning(disable : 4464) // relative include path contains '..' | 8 | #pragma warning(disable : 4464) // relative include path contains '..' |
| 9 | #endif | 9 | #endif |
| 10 | 10 | ||
| 11 | #include "../../Compiler.h" | 11 | #include "../../Precomp.h" |
| 12 | #include "../../7zTypes.h" | ||
| 13 | 12 | ||
| 14 | #endif | 13 | // #endif |
diff --git a/C/Util/7zipUninstall/resource.rc b/C/Util/7zipUninstall/resource.rc index 00bdcc0..79400c6 100644 --- a/C/Util/7zipUninstall/resource.rc +++ b/C/Util/7zipUninstall/resource.rc | |||
| @@ -1,5 +1,6 @@ | |||
| 1 | #include <winnt.h> | 1 | #include <windows.h> |
| 2 | #include <WinUser.h> | 2 | // #include <winnt.h> |
| 3 | // #include <WinUser.h> | ||
| 3 | #include <CommCtrl.h> | 4 | #include <CommCtrl.h> |
| 4 | 5 | ||
| 5 | #define USE_COPYRIGHT_CR | 6 | #define USE_COPYRIGHT_CR |
diff --git a/C/Util/Lzma/Precomp.h b/C/Util/Lzma/Precomp.h index bc8fa21..13a41ef 100644 --- a/C/Util/Lzma/Precomp.h +++ b/C/Util/Lzma/Precomp.h | |||
| @@ -1,14 +1,13 @@ | |||
| 1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- Precomp |
| 2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-23 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | // #ifndef ZIP7_INC_PRECOMP_LOC_H |
| 5 | #define ZIP7_INC_PRECOMP_H | 5 | // #define ZIP7_INC_PRECOMP_LOC_H |
| 6 | 6 | ||
| 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 | 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 |
| 8 | #pragma warning(disable : 4464) // relative include path contains '..' | 8 | #pragma warning(disable : 4464) // relative include path contains '..' |
| 9 | #endif | 9 | #endif |
| 10 | 10 | ||
| 11 | #include "../../Compiler.h" | 11 | #include "../../Precomp.h" |
| 12 | #include "../../7zTypes.h" | ||
| 13 | 12 | ||
| 14 | #endif | 13 | // #endif |
diff --git a/C/Util/LzmaLib/Precomp.h b/C/Util/LzmaLib/Precomp.h index bc8fa21..13a41ef 100644 --- a/C/Util/LzmaLib/Precomp.h +++ b/C/Util/LzmaLib/Precomp.h | |||
| @@ -1,14 +1,13 @@ | |||
| 1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- Precomp |
| 2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-23 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | // #ifndef ZIP7_INC_PRECOMP_LOC_H |
| 5 | #define ZIP7_INC_PRECOMP_H | 5 | // #define ZIP7_INC_PRECOMP_LOC_H |
| 6 | 6 | ||
| 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 | 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 |
| 8 | #pragma warning(disable : 4464) // relative include path contains '..' | 8 | #pragma warning(disable : 4464) // relative include path contains '..' |
| 9 | #endif | 9 | #endif |
| 10 | 10 | ||
| 11 | #include "../../Compiler.h" | 11 | #include "../../Precomp.h" |
| 12 | #include "../../7zTypes.h" | ||
| 13 | 12 | ||
| 14 | #endif | 13 | // #endif |
diff --git a/C/Util/LzmaLib/makefile b/C/Util/LzmaLib/makefile index b8e054e..9ed0aa4 100644 --- a/C/Util/LzmaLib/makefile +++ b/C/Util/LzmaLib/makefile | |||
| @@ -14,16 +14,19 @@ C_OBJS = \ | |||
| 14 | $O\CpuArch.obj \ | 14 | $O\CpuArch.obj \ |
| 15 | $O\LzFind.obj \ | 15 | $O\LzFind.obj \ |
| 16 | $O\LzFindMt.obj \ | 16 | $O\LzFindMt.obj \ |
| 17 | $O\LzFindOpt.obj \ | ||
| 18 | $O\LzmaDec.obj \ | 17 | $O\LzmaDec.obj \ |
| 19 | $O\LzmaEnc.obj \ | 18 | $O\LzmaEnc.obj \ |
| 20 | $O\LzmaLib.obj \ | 19 | $O\LzmaLib.obj \ |
| 21 | $O\Threads.obj \ | 20 | $O\Threads.obj \ |
| 22 | 21 | ||
| 22 | !include "../../../CPP/7zip/LzFindOpt.mak" | ||
| 23 | !include "../../../CPP/7zip/LzmaDec.mak" | ||
| 24 | |||
| 23 | OBJS = \ | 25 | OBJS = \ |
| 24 | $O\Precomp.obj \ | 26 | $O\Precomp.obj \ |
| 25 | $(LIB_OBJS) \ | 27 | $(LIB_OBJS) \ |
| 26 | $(C_OBJS) \ | 28 | $(C_OBJS) \ |
| 29 | $(ASM_OBJS) \ | ||
| 27 | $O\resource.res | 30 | $O\resource.res |
| 28 | 31 | ||
| 29 | !include "../../../CPP/Build.mak" | 32 | !include "../../../CPP/Build.mak" |
| @@ -52,3 +55,5 @@ $(C_OBJS): ../../$(*B).c | |||
| 52 | $(CCOMPLB_USE) | 55 | $(CCOMPLB_USE) |
| 53 | 56 | ||
| 54 | !ENDIF | 57 | !ENDIF |
| 58 | |||
| 59 | !include "../../Asm_c.mak" | ||
diff --git a/C/Util/SfxSetup/Precomp.h b/C/Util/SfxSetup/Precomp.h index bc8fa21..13a41ef 100644 --- a/C/Util/SfxSetup/Precomp.h +++ b/C/Util/SfxSetup/Precomp.h | |||
| @@ -1,14 +1,13 @@ | |||
| 1 | /* Precomp.h -- StdAfx | 1 | /* Precomp.h -- Precomp |
| 2 | 2023-03-04 : Igor Pavlov : Public domain */ | 2 | 2024-01-23 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_PRECOMP_H | 4 | // #ifndef ZIP7_INC_PRECOMP_LOC_H |
| 5 | #define ZIP7_INC_PRECOMP_H | 5 | // #define ZIP7_INC_PRECOMP_LOC_H |
| 6 | 6 | ||
| 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 | 7 | #if defined(_MSC_VER) && _MSC_VER >= 1800 |
| 8 | #pragma warning(disable : 4464) // relative include path contains '..' | 8 | #pragma warning(disable : 4464) // relative include path contains '..' |
| 9 | #endif | 9 | #endif |
| 10 | 10 | ||
| 11 | #include "../../Compiler.h" | 11 | #include "../../Precomp.h" |
| 12 | #include "../../7zTypes.h" | ||
| 13 | 12 | ||
| 14 | #endif | 13 | // #endif |
diff --git a/C/Util/SfxSetup/SfxSetup.c b/C/Util/SfxSetup/SfxSetup.c index 7304a0b..9b5c1f9 100644 --- a/C/Util/SfxSetup/SfxSetup.c +++ b/C/Util/SfxSetup/SfxSetup.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* SfxSetup.c - 7z SFX Setup | 1 | /* SfxSetup.c - 7z SFX Setup |
| 2 | 2019-02-02 : Igor Pavlov : Public domain */ | 2 | 2024-01-24 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -278,10 +278,10 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, | |||
| 278 | #ifdef _CONSOLE | 278 | #ifdef _CONSOLE |
| 279 | SetConsoleCtrlHandler(HandlerRoutine, TRUE); | 279 | SetConsoleCtrlHandler(HandlerRoutine, TRUE); |
| 280 | #else | 280 | #else |
| 281 | UNUSED_VAR(hInstance); | 281 | UNUSED_VAR(hInstance) |
| 282 | UNUSED_VAR(hPrevInstance); | 282 | UNUSED_VAR(hPrevInstance) |
| 283 | UNUSED_VAR(lpCmdLine); | 283 | UNUSED_VAR(lpCmdLine) |
| 284 | UNUSED_VAR(nCmdShow); | 284 | UNUSED_VAR(nCmdShow) |
| 285 | #endif | 285 | #endif |
| 286 | 286 | ||
| 287 | CrcGenerateTable(); | 287 | CrcGenerateTable(); |
| @@ -516,12 +516,13 @@ int APIENTRY WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, | |||
| 516 | #endif | 516 | #endif |
| 517 | 517 | ||
| 518 | { | 518 | { |
| 519 | const SRes res2 = File_Close(&outFile); | 519 | const WRes res2 = File_Close(&outFile); |
| 520 | if (res != SZ_OK) | 520 | if (res != SZ_OK) |
| 521 | break; | 521 | break; |
| 522 | if (res2 != SZ_OK) | 522 | if (res2 != 0) |
| 523 | { | 523 | { |
| 524 | res = res2; | 524 | errorMessage = "Can't close output file"; |
| 525 | res = SZ_ERROR_FAIL; | ||
| 525 | break; | 526 | break; |
| 526 | } | 527 | } |
| 527 | } | 528 | } |
diff --git a/C/Util/SfxSetup/makefile b/C/Util/SfxSetup/makefile index bc0cf8b..b3f25a2 100644 --- a/C/Util/SfxSetup/makefile +++ b/C/Util/SfxSetup/makefile | |||
| @@ -9,8 +9,6 @@ C_OBJS = \ | |||
| 9 | $O\7zArcIn.obj \ | 9 | $O\7zArcIn.obj \ |
| 10 | $O\7zBuf.obj \ | 10 | $O\7zBuf.obj \ |
| 11 | $O\7zBuf2.obj \ | 11 | $O\7zBuf2.obj \ |
| 12 | $O\7zCrc.obj \ | ||
| 13 | $O\7zCrcOpt.obj \ | ||
| 14 | $O\7zFile.obj \ | 12 | $O\7zFile.obj \ |
| 15 | $O\7zDec.obj \ | 13 | $O\7zDec.obj \ |
| 16 | $O\7zStream.obj \ | 14 | $O\7zStream.obj \ |
| @@ -27,9 +25,13 @@ C_OBJS = \ | |||
| 27 | 7Z_OBJS = \ | 25 | 7Z_OBJS = \ |
| 28 | $O\SfxSetup.obj \ | 26 | $O\SfxSetup.obj \ |
| 29 | 27 | ||
| 28 | !include "../../../CPP/7zip/Crc.mak" | ||
| 29 | # !include "../../../CPP/7zip/LzmaDec.mak" | ||
| 30 | |||
| 30 | OBJS = \ | 31 | OBJS = \ |
| 31 | $(7Z_OBJS) \ | 32 | $(7Z_OBJS) \ |
| 32 | $(C_OBJS) \ | 33 | $(C_OBJS) \ |
| 34 | $(ASM_OBJS) \ | ||
| 33 | $O\resource.res | 35 | $O\resource.res |
| 34 | 36 | ||
| 35 | !include "../../../CPP/Build.mak" | 37 | !include "../../../CPP/Build.mak" |
| @@ -38,3 +40,5 @@ $(7Z_OBJS): $(*B).c | |||
| 38 | $(COMPL_O1) | 40 | $(COMPL_O1) |
| 39 | $(C_OBJS): ../../$(*B).c | 41 | $(C_OBJS): ../../$(*B).c |
| 40 | $(COMPL_O1) | 42 | $(COMPL_O1) |
| 43 | |||
| 44 | !include "../../Asm_c.mak" | ||
diff --git a/C/Xxh64.c b/C/Xxh64.c new file mode 100644 index 0000000..dc02a02 --- /dev/null +++ b/C/Xxh64.c | |||
| @@ -0,0 +1,327 @@ | |||
| 1 | /* Xxh64.c -- XXH64 hash calculation | ||
| 2 | original code: Copyright (c) Yann Collet. | ||
| 3 | 2023-08-18 : modified by Igor Pavlov. | ||
| 4 | This source code is licensed under BSD 2-Clause License. | ||
| 5 | */ | ||
| 6 | |||
| 7 | #include "Precomp.h" | ||
| 8 | |||
| 9 | #include "CpuArch.h" | ||
| 10 | #include "RotateDefs.h" | ||
| 11 | #include "Xxh64.h" | ||
| 12 | |||
| 13 | #define Z7_XXH_PRIME64_1 UINT64_CONST(0x9E3779B185EBCA87) | ||
| 14 | #define Z7_XXH_PRIME64_2 UINT64_CONST(0xC2B2AE3D27D4EB4F) | ||
| 15 | #define Z7_XXH_PRIME64_3 UINT64_CONST(0x165667B19E3779F9) | ||
| 16 | #define Z7_XXH_PRIME64_4 UINT64_CONST(0x85EBCA77C2B2AE63) | ||
| 17 | #define Z7_XXH_PRIME64_5 UINT64_CONST(0x27D4EB2F165667C5) | ||
| 18 | |||
| 19 | void Xxh64State_Init(CXxh64State *p) | ||
| 20 | { | ||
| 21 | const UInt64 seed = 0; | ||
| 22 | p->v[0] = seed + Z7_XXH_PRIME64_1 + Z7_XXH_PRIME64_2; | ||
| 23 | p->v[1] = seed + Z7_XXH_PRIME64_2; | ||
| 24 | p->v[2] = seed; | ||
| 25 | p->v[3] = seed - Z7_XXH_PRIME64_1; | ||
| 26 | } | ||
| 27 | |||
| 28 | #if !defined(MY_CPU_64BIT) && defined(MY_CPU_X86) && defined(_MSC_VER) | ||
| 29 | #define Z7_XXH64_USE_ASM | ||
| 30 | #endif | ||
| 31 | |||
| 32 | #if !defined(MY_CPU_64BIT) && defined(MY_CPU_X86) \ | ||
| 33 | && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1200 | ||
| 34 | /* we try to avoid __allmul calls in MSVC for 64-bit multiply. | ||
| 35 | But MSVC6 still uses __allmul for our code. | ||
| 36 | So for MSVC6 we use default 64-bit multiply without our optimization. | ||
| 37 | */ | ||
| 38 | #define LOW32(b) ((UInt32)(b & 0xffffffff)) | ||
| 39 | /* MSVC compiler (MSVC > 1200) can use "mul" instruction | ||
| 40 | without __allmul for our MY_emulu MACRO. | ||
| 41 | MY_emulu is similar to __emulu(a, b) MACRO */ | ||
| 42 | #define MY_emulu(a, b) ((UInt64)(a) * (b)) | ||
| 43 | #define MY_SET_HIGH32(a) ((UInt64)(a) << 32) | ||
| 44 | #define MY_MUL32_SET_HIGH32(a, b) MY_SET_HIGH32((UInt32)(a) * (UInt32)(b)) | ||
| 45 | // /* | ||
| 46 | #define MY_MUL64(a, b) \ | ||
| 47 | ( MY_emulu((UInt32)(a), LOW32(b)) + \ | ||
| 48 | MY_SET_HIGH32( \ | ||
| 49 | (UInt32)((a) >> 32) * LOW32(b) + \ | ||
| 50 | (UInt32)(a) * (UInt32)((b) >> 32) \ | ||
| 51 | )) | ||
| 52 | // */ | ||
| 53 | /* | ||
| 54 | #define MY_MUL64(a, b) \ | ||
| 55 | ( MY_emulu((UInt32)(a), LOW32(b)) \ | ||
| 56 | + MY_MUL32_SET_HIGH32((a) >> 32, LOW32(b)) + \ | ||
| 57 | + MY_MUL32_SET_HIGH32(a, (b) >> 32) \ | ||
| 58 | ) | ||
| 59 | */ | ||
| 60 | |||
| 61 | #define MY_MUL_32_64(a32, b) \ | ||
| 62 | ( MY_emulu((UInt32)(a32), LOW32(b)) \ | ||
| 63 | + MY_MUL32_SET_HIGH32(a32, (b) >> 32) \ | ||
| 64 | ) | ||
| 65 | |||
| 66 | #else | ||
| 67 | #define MY_MUL64(a, b) ((a) * (b)) | ||
| 68 | #define MY_MUL_32_64(a32, b) ((a32) * (UInt64)(b)) | ||
| 69 | #endif | ||
| 70 | |||
| 71 | |||
| 72 | static | ||
| 73 | Z7_FORCE_INLINE | ||
| 74 | UInt64 Xxh64_Round(UInt64 acc, UInt64 input) | ||
| 75 | { | ||
| 76 | acc += MY_MUL64(input, Z7_XXH_PRIME64_2); | ||
| 77 | acc = Z7_ROTL64(acc, 31); | ||
| 78 | return MY_MUL64(acc, Z7_XXH_PRIME64_1); | ||
| 79 | } | ||
| 80 | |||
| 81 | static UInt64 Xxh64_Merge(UInt64 acc, UInt64 val) | ||
| 82 | { | ||
| 83 | acc ^= Xxh64_Round(0, val); | ||
| 84 | return MY_MUL64(acc, Z7_XXH_PRIME64_1) + Z7_XXH_PRIME64_4; | ||
| 85 | } | ||
| 86 | |||
| 87 | |||
| 88 | #ifdef Z7_XXH64_USE_ASM | ||
| 89 | |||
| 90 | #define Z7_XXH_PRIME64_1_HIGH 0x9E3779B1 | ||
| 91 | #define Z7_XXH_PRIME64_1_LOW 0x85EBCA87 | ||
| 92 | #define Z7_XXH_PRIME64_2_HIGH 0xC2B2AE3D | ||
| 93 | #define Z7_XXH_PRIME64_2_LOW 0x27D4EB4F | ||
| 94 | |||
| 95 | void | ||
| 96 | Z7_NO_INLINE | ||
| 97 | __declspec(naked) | ||
| 98 | Z7_FASTCALL | ||
| 99 | Xxh64State_UpdateBlocks(CXxh64State *p, const void *data, const void *end) | ||
| 100 | { | ||
| 101 | #if !defined(__clang__) | ||
| 102 | UNUSED_VAR(p) | ||
| 103 | UNUSED_VAR(data) | ||
| 104 | UNUSED_VAR(end) | ||
| 105 | #endif | ||
| 106 | __asm push ebx | ||
| 107 | __asm push ebp | ||
| 108 | __asm push esi | ||
| 109 | __asm push edi | ||
| 110 | |||
| 111 | #define STACK_OFFSET 4 * 8 | ||
| 112 | __asm sub esp, STACK_OFFSET | ||
| 113 | |||
| 114 | #define COPY_1(n) \ | ||
| 115 | __asm mov eax, [ecx + n * 4] \ | ||
| 116 | __asm mov [esp + n * 4], eax \ | ||
| 117 | |||
| 118 | #define COPY_2(n) \ | ||
| 119 | __asm mov eax, [esp + n * 4] \ | ||
| 120 | __asm mov [ecx + n * 4], eax \ | ||
| 121 | |||
| 122 | COPY_1(0) | ||
| 123 | __asm mov edi, [ecx + 1 * 4] \ | ||
| 124 | COPY_1(2) | ||
| 125 | COPY_1(3) | ||
| 126 | COPY_1(4) | ||
| 127 | COPY_1(5) | ||
| 128 | COPY_1(6) | ||
| 129 | COPY_1(7) | ||
| 130 | |||
| 131 | __asm mov esi, edx \ | ||
| 132 | __asm mov [esp + 0 * 8 + 4], ecx | ||
| 133 | __asm mov ecx, Z7_XXH_PRIME64_2_LOW \ | ||
| 134 | __asm mov ebp, Z7_XXH_PRIME64_1_LOW \ | ||
| 135 | |||
| 136 | #define R(n, state1, state1_reg) \ | ||
| 137 | __asm mov eax, [esi + n * 8] \ | ||
| 138 | __asm imul ebx, eax, Z7_XXH_PRIME64_2_HIGH \ | ||
| 139 | __asm add ebx, state1 \ | ||
| 140 | __asm mul ecx \ | ||
| 141 | __asm add edx, ebx \ | ||
| 142 | __asm mov ebx, [esi + n * 8 + 4] \ | ||
| 143 | __asm imul ebx, ecx \ | ||
| 144 | __asm add eax, [esp + n * 8] \ | ||
| 145 | __asm adc edx, ebx \ | ||
| 146 | __asm mov ebx, eax \ | ||
| 147 | __asm shld eax, edx, 31 \ | ||
| 148 | __asm shld edx, ebx, 31 \ | ||
| 149 | __asm imul state1_reg, eax, Z7_XXH_PRIME64_1_HIGH \ | ||
| 150 | __asm imul edx, ebp \ | ||
| 151 | __asm add state1_reg, edx \ | ||
| 152 | __asm mul ebp \ | ||
| 153 | __asm add state1_reg, edx \ | ||
| 154 | __asm mov [esp + n * 8], eax \ | ||
| 155 | |||
| 156 | #define R2(n) \ | ||
| 157 | R(n, [esp + n * 8 + 4], ebx) \ | ||
| 158 | __asm mov [esp + n * 8 + 4], ebx \ | ||
| 159 | |||
| 160 | __asm align 16 | ||
| 161 | __asm main_loop: | ||
| 162 | R(0, edi, edi) | ||
| 163 | R2(1) | ||
| 164 | R2(2) | ||
| 165 | R2(3) | ||
| 166 | __asm add esi, 32 | ||
| 167 | __asm cmp esi, [esp + STACK_OFFSET + 4 * 4 + 4] | ||
| 168 | __asm jne main_loop | ||
| 169 | |||
| 170 | __asm mov ecx, [esp + 0 * 8 + 4] | ||
| 171 | |||
| 172 | COPY_2(0) | ||
| 173 | __asm mov [ecx + 1 * 4], edi | ||
| 174 | COPY_2(2) | ||
| 175 | COPY_2(3) | ||
| 176 | COPY_2(4) | ||
| 177 | COPY_2(5) | ||
| 178 | COPY_2(6) | ||
| 179 | COPY_2(7) | ||
| 180 | |||
| 181 | __asm add esp, STACK_OFFSET | ||
| 182 | __asm pop edi | ||
| 183 | __asm pop esi | ||
| 184 | __asm pop ebp | ||
| 185 | __asm pop ebx | ||
| 186 | __asm ret 4 | ||
| 187 | } | ||
| 188 | |||
| 189 | #else | ||
| 190 | |||
| 191 | void | ||
| 192 | Z7_NO_INLINE | ||
| 193 | Z7_FASTCALL | ||
| 194 | Xxh64State_UpdateBlocks(CXxh64State *p, const void *_data, const void *end) | ||
| 195 | { | ||
| 196 | const Byte *data = (const Byte *)_data; | ||
| 197 | UInt64 v[4]; | ||
| 198 | v[0] = p->v[0]; | ||
| 199 | v[1] = p->v[1]; | ||
| 200 | v[2] = p->v[2]; | ||
| 201 | v[3] = p->v[3]; | ||
| 202 | do | ||
| 203 | { | ||
| 204 | v[0] = Xxh64_Round(v[0], GetUi64(data)); data += 8; | ||
| 205 | v[1] = Xxh64_Round(v[1], GetUi64(data)); data += 8; | ||
| 206 | v[2] = Xxh64_Round(v[2], GetUi64(data)); data += 8; | ||
| 207 | v[3] = Xxh64_Round(v[3], GetUi64(data)); data += 8; | ||
| 208 | } | ||
| 209 | while (data != end); | ||
| 210 | p->v[0] = v[0]; | ||
| 211 | p->v[1] = v[1]; | ||
| 212 | p->v[2] = v[2]; | ||
| 213 | p->v[3] = v[3]; | ||
| 214 | } | ||
| 215 | |||
| 216 | #endif | ||
| 217 | |||
| 218 | UInt64 Xxh64State_Digest(const CXxh64State *p, const void *_data, UInt64 count) | ||
| 219 | { | ||
| 220 | UInt64 h = p->v[2]; | ||
| 221 | |||
| 222 | if (count >= 32) | ||
| 223 | { | ||
| 224 | h = Z7_ROTL64(p->v[0], 1) + | ||
| 225 | Z7_ROTL64(p->v[1], 7) + | ||
| 226 | Z7_ROTL64(h, 12) + | ||
| 227 | Z7_ROTL64(p->v[3], 18); | ||
| 228 | h = Xxh64_Merge(h, p->v[0]); | ||
| 229 | h = Xxh64_Merge(h, p->v[1]); | ||
| 230 | h = Xxh64_Merge(h, p->v[2]); | ||
| 231 | h = Xxh64_Merge(h, p->v[3]); | ||
| 232 | } | ||
| 233 | else | ||
| 234 | h += Z7_XXH_PRIME64_5; | ||
| 235 | |||
| 236 | h += count; | ||
| 237 | |||
| 238 | // XXH64_finalize(): | ||
| 239 | { | ||
| 240 | unsigned cnt = (unsigned)count & 31; | ||
| 241 | const Byte *data = (const Byte *)_data; | ||
| 242 | while (cnt >= 8) | ||
| 243 | { | ||
| 244 | h ^= Xxh64_Round(0, GetUi64(data)); | ||
| 245 | data += 8; | ||
| 246 | h = Z7_ROTL64(h, 27); | ||
| 247 | h = MY_MUL64(h, Z7_XXH_PRIME64_1) + Z7_XXH_PRIME64_4; | ||
| 248 | cnt -= 8; | ||
| 249 | } | ||
| 250 | if (cnt >= 4) | ||
| 251 | { | ||
| 252 | const UInt32 v = GetUi32(data); | ||
| 253 | data += 4; | ||
| 254 | h ^= MY_MUL_32_64(v, Z7_XXH_PRIME64_1); | ||
| 255 | h = Z7_ROTL64(h, 23); | ||
| 256 | h = MY_MUL64(h, Z7_XXH_PRIME64_2) + Z7_XXH_PRIME64_3; | ||
| 257 | cnt -= 4; | ||
| 258 | } | ||
| 259 | while (cnt) | ||
| 260 | { | ||
| 261 | const UInt32 v = *data++; | ||
| 262 | h ^= MY_MUL_32_64(v, Z7_XXH_PRIME64_5); | ||
| 263 | h = Z7_ROTL64(h, 11); | ||
| 264 | h = MY_MUL64(h, Z7_XXH_PRIME64_1); | ||
| 265 | cnt--; | ||
| 266 | } | ||
| 267 | // XXH64_avalanche(h): | ||
| 268 | h ^= h >> 33; h = MY_MUL64(h, Z7_XXH_PRIME64_2); | ||
| 269 | h ^= h >> 29; h = MY_MUL64(h, Z7_XXH_PRIME64_3); | ||
| 270 | h ^= h >> 32; | ||
| 271 | return h; | ||
| 272 | } | ||
| 273 | } | ||
| 274 | |||
| 275 | |||
| 276 | void Xxh64_Init(CXxh64 *p) | ||
| 277 | { | ||
| 278 | Xxh64State_Init(&p->state); | ||
| 279 | p->count = 0; | ||
| 280 | p->buf64[0] = 0; | ||
| 281 | p->buf64[1] = 0; | ||
| 282 | p->buf64[2] = 0; | ||
| 283 | p->buf64[3] = 0; | ||
| 284 | } | ||
| 285 | |||
| 286 | void Xxh64_Update(CXxh64 *p, const void *_data, size_t size) | ||
| 287 | { | ||
| 288 | const Byte *data = (const Byte *)_data; | ||
| 289 | unsigned cnt; | ||
| 290 | if (size == 0) | ||
| 291 | return; | ||
| 292 | cnt = (unsigned)p->count; | ||
| 293 | p->count += size; | ||
| 294 | |||
| 295 | if (cnt &= 31) | ||
| 296 | { | ||
| 297 | unsigned rem = 32 - cnt; | ||
| 298 | Byte *dest = (Byte *)p->buf64 + cnt; | ||
| 299 | if (rem > size) | ||
| 300 | rem = (unsigned)size; | ||
| 301 | size -= rem; | ||
| 302 | cnt += rem; | ||
| 303 | // memcpy((Byte *)p->buf64 + cnt, data, rem); | ||
| 304 | do | ||
| 305 | *dest++ = *data++; | ||
| 306 | while (--rem); | ||
| 307 | if (cnt != 32) | ||
| 308 | return; | ||
| 309 | Xxh64State_UpdateBlocks(&p->state, p->buf64, &p->buf64[4]); | ||
| 310 | } | ||
| 311 | |||
| 312 | if (size &= ~(size_t)31) | ||
| 313 | { | ||
| 314 | Xxh64State_UpdateBlocks(&p->state, data, data + size); | ||
| 315 | data += size; | ||
| 316 | } | ||
| 317 | |||
| 318 | cnt = (unsigned)p->count & 31; | ||
| 319 | if (cnt) | ||
| 320 | { | ||
| 321 | // memcpy(p->buf64, data, cnt); | ||
| 322 | Byte *dest = (Byte *)p->buf64; | ||
| 323 | do | ||
| 324 | *dest++ = *data++; | ||
| 325 | while (--cnt); | ||
| 326 | } | ||
| 327 | } | ||
diff --git a/C/Xxh64.h b/C/Xxh64.h new file mode 100644 index 0000000..efef65e --- /dev/null +++ b/C/Xxh64.h | |||
| @@ -0,0 +1,50 @@ | |||
| 1 | /* Xxh64.h -- XXH64 hash calculation interfaces | ||
| 2 | 2023-08-18 : Igor Pavlov : Public domain */ | ||
| 3 | |||
| 4 | #ifndef ZIP7_INC_XXH64_H | ||
| 5 | #define ZIP7_INC_XXH64_H | ||
| 6 | |||
| 7 | #include "7zTypes.h" | ||
| 8 | |||
| 9 | EXTERN_C_BEGIN | ||
| 10 | |||
| 11 | #define Z7_XXH64_BLOCK_SIZE (4 * 8) | ||
| 12 | |||
| 13 | typedef struct | ||
| 14 | { | ||
| 15 | UInt64 v[4]; | ||
| 16 | } CXxh64State; | ||
| 17 | |||
| 18 | void Xxh64State_Init(CXxh64State *p); | ||
| 19 | |||
| 20 | // end != data && end == data + Z7_XXH64_BLOCK_SIZE * numBlocks | ||
| 21 | void Z7_FASTCALL Xxh64State_UpdateBlocks(CXxh64State *p, const void *data, const void *end); | ||
| 22 | |||
| 23 | /* | ||
| 24 | Xxh64State_Digest(): | ||
| 25 | data: | ||
| 26 | the function processes only | ||
| 27 | (totalCount & (Z7_XXH64_BLOCK_SIZE - 1)) bytes in (data): (smaller than 32 bytes). | ||
| 28 | totalCount: total size of hashed stream: | ||
| 29 | it includes total size of data processed by previous Xxh64State_UpdateBlocks() calls, | ||
| 30 | and it also includes current processed size in (data). | ||
| 31 | */ | ||
| 32 | UInt64 Xxh64State_Digest(const CXxh64State *p, const void *data, UInt64 totalCount); | ||
| 33 | |||
| 34 | |||
| 35 | typedef struct | ||
| 36 | { | ||
| 37 | CXxh64State state; | ||
| 38 | UInt64 count; | ||
| 39 | UInt64 buf64[4]; | ||
| 40 | } CXxh64; | ||
| 41 | |||
| 42 | void Xxh64_Init(CXxh64 *p); | ||
| 43 | void Xxh64_Update(CXxh64 *p, const void *data, size_t size); | ||
| 44 | |||
| 45 | #define Xxh64_Digest(p) \ | ||
| 46 | Xxh64State_Digest(&(p)->state, (p)->buf64, (p)->count) | ||
| 47 | |||
| 48 | EXTERN_C_END | ||
| 49 | |||
| 50 | #endif | ||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Xz.c - Xz | 1 | /* Xz.c - Xz |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -52,6 +52,7 @@ void XzCheck_Init(CXzCheck *p, unsigned mode) | |||
| 52 | case XZ_CHECK_CRC32: p->crc = CRC_INIT_VAL; break; | 52 | case XZ_CHECK_CRC32: p->crc = CRC_INIT_VAL; break; |
| 53 | case XZ_CHECK_CRC64: p->crc64 = CRC64_INIT_VAL; break; | 53 | case XZ_CHECK_CRC64: p->crc64 = CRC64_INIT_VAL; break; |
| 54 | case XZ_CHECK_SHA256: Sha256_Init(&p->sha); break; | 54 | case XZ_CHECK_SHA256: Sha256_Init(&p->sha); break; |
| 55 | default: break; | ||
| 55 | } | 56 | } |
| 56 | } | 57 | } |
| 57 | 58 | ||
| @@ -62,6 +63,7 @@ void XzCheck_Update(CXzCheck *p, const void *data, size_t size) | |||
| 62 | case XZ_CHECK_CRC32: p->crc = CrcUpdate(p->crc, data, size); break; | 63 | case XZ_CHECK_CRC32: p->crc = CrcUpdate(p->crc, data, size); break; |
| 63 | case XZ_CHECK_CRC64: p->crc64 = Crc64Update(p->crc64, data, size); break; | 64 | case XZ_CHECK_CRC64: p->crc64 = Crc64Update(p->crc64, data, size); break; |
| 64 | case XZ_CHECK_SHA256: Sha256_Update(&p->sha, (const Byte *)data, size); break; | 65 | case XZ_CHECK_SHA256: Sha256_Update(&p->sha, (const Byte *)data, size); break; |
| 66 | default: break; | ||
| 65 | } | 67 | } |
| 66 | } | 68 | } |
| 67 | 69 | ||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* Xz.h - Xz interface | 1 | /* Xz.h - Xz interface |
| 2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | 2024-01-26 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_XZ_H | 4 | #ifndef ZIP7_INC_XZ_H |
| 5 | #define ZIP7_INC_XZ_H | 5 | #define ZIP7_INC_XZ_H |
| @@ -18,6 +18,7 @@ EXTERN_C_BEGIN | |||
| 18 | #define XZ_ID_ARMT 8 | 18 | #define XZ_ID_ARMT 8 |
| 19 | #define XZ_ID_SPARC 9 | 19 | #define XZ_ID_SPARC 9 |
| 20 | #define XZ_ID_ARM64 0xa | 20 | #define XZ_ID_ARM64 0xa |
| 21 | #define XZ_ID_RISCV 0xb | ||
| 21 | #define XZ_ID_LZMA2 0x21 | 22 | #define XZ_ID_LZMA2 0x21 |
| 22 | 23 | ||
| 23 | unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value); | 24 | unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value); |
| @@ -233,13 +234,13 @@ typedef enum | |||
| 233 | typedef struct | 234 | typedef struct |
| 234 | { | 235 | { |
| 235 | EXzState state; | 236 | EXzState state; |
| 236 | UInt32 pos; | 237 | unsigned pos; |
| 237 | unsigned alignPos; | 238 | unsigned alignPos; |
| 238 | unsigned indexPreSize; | 239 | unsigned indexPreSize; |
| 239 | 240 | ||
| 240 | CXzStreamFlags streamFlags; | 241 | CXzStreamFlags streamFlags; |
| 241 | 242 | ||
| 242 | UInt32 blockHeaderSize; | 243 | unsigned blockHeaderSize; |
| 243 | UInt64 packSize; | 244 | UInt64 packSize; |
| 244 | UInt64 unpackSize; | 245 | UInt64 unpackSize; |
| 245 | 246 | ||
diff --git a/C/XzCrc64.c b/C/XzCrc64.c index c2fad6c..94fc1af 100644 --- a/C/XzCrc64.c +++ b/C/XzCrc64.c | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* XzCrc64.c -- CRC64 calculation | 1 | /* XzCrc64.c -- CRC64 calculation |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2023-12-08 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -8,36 +8,76 @@ | |||
| 8 | 8 | ||
| 9 | #define kCrc64Poly UINT64_CONST(0xC96C5795D7870F42) | 9 | #define kCrc64Poly UINT64_CONST(0xC96C5795D7870F42) |
| 10 | 10 | ||
| 11 | #ifdef MY_CPU_LE | 11 | // for debug only : define Z7_CRC64_DEBUG_BE to test big-endian code in little-endian cpu |
| 12 | #define CRC64_NUM_TABLES 4 | 12 | // #define Z7_CRC64_DEBUG_BE |
| 13 | #ifdef Z7_CRC64_DEBUG_BE | ||
| 14 | #undef MY_CPU_LE | ||
| 15 | #define MY_CPU_BE | ||
| 16 | #endif | ||
| 17 | |||
| 18 | #ifdef Z7_CRC64_NUM_TABLES | ||
| 19 | #define Z7_CRC64_NUM_TABLES_USE Z7_CRC64_NUM_TABLES | ||
| 13 | #else | 20 | #else |
| 14 | #define CRC64_NUM_TABLES 5 | 21 | #define Z7_CRC64_NUM_TABLES_USE 12 |
| 22 | #endif | ||
| 15 | 23 | ||
| 16 | UInt64 Z7_FASTCALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table); | 24 | #if Z7_CRC64_NUM_TABLES_USE < 1 |
| 25 | #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES | ||
| 17 | #endif | 26 | #endif |
| 18 | 27 | ||
| 28 | |||
| 29 | #if Z7_CRC64_NUM_TABLES_USE != 1 | ||
| 30 | |||
| 19 | #ifndef MY_CPU_BE | 31 | #ifndef MY_CPU_BE |
| 20 | UInt64 Z7_FASTCALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table); | 32 | #define FUNC_NAME_LE_2(s) XzCrc64UpdateT ## s |
| 33 | #define FUNC_NAME_LE_1(s) FUNC_NAME_LE_2(s) | ||
| 34 | #define FUNC_NAME_LE FUNC_NAME_LE_1(Z7_CRC64_NUM_TABLES_USE) | ||
| 35 | UInt64 Z7_FASTCALL FUNC_NAME_LE (UInt64 v, const void *data, size_t size, const UInt64 *table); | ||
| 36 | #endif | ||
| 37 | #ifndef MY_CPU_LE | ||
| 38 | #define FUNC_NAME_BE_2(s) XzCrc64UpdateBeT ## s | ||
| 39 | #define FUNC_NAME_BE_1(s) FUNC_NAME_BE_2(s) | ||
| 40 | #define FUNC_NAME_BE FUNC_NAME_BE_1(Z7_CRC64_NUM_TABLES_USE) | ||
| 41 | UInt64 Z7_FASTCALL FUNC_NAME_BE (UInt64 v, const void *data, size_t size, const UInt64 *table); | ||
| 21 | #endif | 42 | #endif |
| 22 | 43 | ||
| 23 | typedef UInt64 (Z7_FASTCALL *CRC64_FUNC)(UInt64 v, const void *data, size_t size, const UInt64 *table); | 44 | #if defined(MY_CPU_LE) |
| 45 | #define FUNC_REF FUNC_NAME_LE | ||
| 46 | #elif defined(MY_CPU_BE) | ||
| 47 | #define FUNC_REF FUNC_NAME_BE | ||
| 48 | #else | ||
| 49 | #define FUNC_REF g_Crc64Update | ||
| 50 | static UInt64 (Z7_FASTCALL *FUNC_REF)(UInt64 v, const void *data, size_t size, const UInt64 *table); | ||
| 51 | #endif | ||
| 52 | |||
| 53 | #endif | ||
| 54 | |||
| 55 | |||
| 56 | MY_ALIGN(64) | ||
| 57 | static UInt64 g_Crc64Table[256 * Z7_CRC64_NUM_TABLES_USE]; | ||
| 24 | 58 | ||
| 25 | static CRC64_FUNC g_Crc64Update; | ||
| 26 | UInt64 g_Crc64Table[256 * CRC64_NUM_TABLES]; | ||
| 27 | 59 | ||
| 28 | UInt64 Z7_FASTCALL Crc64Update(UInt64 v, const void *data, size_t size) | 60 | UInt64 Z7_FASTCALL Crc64Update(UInt64 v, const void *data, size_t size) |
| 29 | { | 61 | { |
| 30 | return g_Crc64Update(v, data, size, g_Crc64Table); | 62 | #if Z7_CRC64_NUM_TABLES_USE == 1 |
| 63 | #define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) | ||
| 64 | const UInt64 *table = g_Crc64Table; | ||
| 65 | const Byte *p = (const Byte *)data; | ||
| 66 | const Byte *lim = p + size; | ||
| 67 | for (; p != lim; p++) | ||
| 68 | v = CRC64_UPDATE_BYTE_2(v, *p); | ||
| 69 | return v; | ||
| 70 | #undef CRC64_UPDATE_BYTE_2 | ||
| 71 | #else | ||
| 72 | return FUNC_REF (v, data, size, g_Crc64Table); | ||
| 73 | #endif | ||
| 31 | } | 74 | } |
| 32 | 75 | ||
| 33 | UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size) | ||
| 34 | { | ||
| 35 | return g_Crc64Update(CRC64_INIT_VAL, data, size, g_Crc64Table) ^ CRC64_INIT_VAL; | ||
| 36 | } | ||
| 37 | 76 | ||
| 77 | Z7_NO_INLINE | ||
| 38 | void Z7_FASTCALL Crc64GenerateTable(void) | 78 | void Z7_FASTCALL Crc64GenerateTable(void) |
| 39 | { | 79 | { |
| 40 | UInt32 i; | 80 | unsigned i; |
| 41 | for (i = 0; i < 256; i++) | 81 | for (i = 0; i < 256; i++) |
| 42 | { | 82 | { |
| 43 | UInt64 r = i; | 83 | UInt64 r = i; |
| @@ -46,35 +86,55 @@ void Z7_FASTCALL Crc64GenerateTable(void) | |||
| 46 | r = (r >> 1) ^ (kCrc64Poly & ((UInt64)0 - (r & 1))); | 86 | r = (r >> 1) ^ (kCrc64Poly & ((UInt64)0 - (r & 1))); |
| 47 | g_Crc64Table[i] = r; | 87 | g_Crc64Table[i] = r; |
| 48 | } | 88 | } |
| 49 | for (i = 256; i < 256 * CRC64_NUM_TABLES; i++) | 89 | |
| 90 | #if Z7_CRC64_NUM_TABLES_USE != 1 | ||
| 91 | #if 1 || 1 && defined(MY_CPU_X86) // low register count | ||
| 92 | for (i = 0; i < 256 * (Z7_CRC64_NUM_TABLES_USE - 1); i++) | ||
| 50 | { | 93 | { |
| 51 | const UInt64 r = g_Crc64Table[(size_t)i - 256]; | 94 | const UInt64 r0 = g_Crc64Table[(size_t)i]; |
| 52 | g_Crc64Table[i] = g_Crc64Table[r & 0xFF] ^ (r >> 8); | 95 | g_Crc64Table[(size_t)i + 256] = g_Crc64Table[(Byte)r0] ^ (r0 >> 8); |
| 53 | } | 96 | } |
| 54 | 97 | #else | |
| 55 | #ifdef MY_CPU_LE | 98 | for (i = 0; i < 256 * (Z7_CRC64_NUM_TABLES_USE - 1); i += 2) |
| 56 | 99 | { | |
| 57 | g_Crc64Update = XzCrc64UpdateT4; | 100 | UInt64 r0 = g_Crc64Table[(size_t)(i) ]; |
| 101 | UInt64 r1 = g_Crc64Table[(size_t)(i) + 1]; | ||
| 102 | r0 = g_Crc64Table[(Byte)r0] ^ (r0 >> 8); | ||
| 103 | r1 = g_Crc64Table[(Byte)r1] ^ (r1 >> 8); | ||
| 104 | g_Crc64Table[(size_t)i + 256 ] = r0; | ||
| 105 | g_Crc64Table[(size_t)i + 256 + 1] = r1; | ||
| 106 | } | ||
| 107 | #endif | ||
| 58 | 108 | ||
| 59 | #else | 109 | #ifndef MY_CPU_LE |
| 60 | { | 110 | { |
| 61 | #ifndef MY_CPU_BE | 111 | #ifndef MY_CPU_BE |
| 62 | UInt32 k = 1; | 112 | UInt32 k = 1; |
| 63 | if (*(const Byte *)&k == 1) | 113 | if (*(const Byte *)&k == 1) |
| 64 | g_Crc64Update = XzCrc64UpdateT4; | 114 | FUNC_REF = FUNC_NAME_LE; |
| 65 | else | 115 | else |
| 66 | #endif | 116 | #endif |
| 67 | { | 117 | { |
| 68 | for (i = 256 * CRC64_NUM_TABLES - 1; i >= 256; i--) | 118 | #ifndef MY_CPU_BE |
| 119 | FUNC_REF = FUNC_NAME_BE; | ||
| 120 | #endif | ||
| 121 | for (i = 0; i < 256 * Z7_CRC64_NUM_TABLES_USE; i++) | ||
| 69 | { | 122 | { |
| 70 | const UInt64 x = g_Crc64Table[(size_t)i - 256]; | 123 | const UInt64 x = g_Crc64Table[i]; |
| 71 | g_Crc64Table[i] = Z7_BSWAP64(x); | 124 | g_Crc64Table[i] = Z7_BSWAP64(x); |
| 72 | } | 125 | } |
| 73 | g_Crc64Update = XzCrc64UpdateT1_BeT4; | ||
| 74 | } | 126 | } |
| 75 | } | 127 | } |
| 76 | #endif | 128 | #endif // ndef MY_CPU_LE |
| 129 | #endif // Z7_CRC64_NUM_TABLES_USE != 1 | ||
| 77 | } | 130 | } |
| 78 | 131 | ||
| 79 | #undef kCrc64Poly | 132 | #undef kCrc64Poly |
| 80 | #undef CRC64_NUM_TABLES | 133 | #undef Z7_CRC64_NUM_TABLES_USE |
| 134 | #undef FUNC_REF | ||
| 135 | #undef FUNC_NAME_LE_2 | ||
| 136 | #undef FUNC_NAME_LE_1 | ||
| 137 | #undef FUNC_NAME_LE | ||
| 138 | #undef FUNC_NAME_BE_2 | ||
| 139 | #undef FUNC_NAME_BE_1 | ||
| 140 | #undef FUNC_NAME_BE | ||
diff --git a/C/XzCrc64.h b/C/XzCrc64.h index ca46869..04f8153 100644 --- a/C/XzCrc64.h +++ b/C/XzCrc64.h | |||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* XzCrc64.h -- CRC64 calculation | 1 | /* XzCrc64.h -- CRC64 calculation |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2023-12-08 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #ifndef ZIP7_INC_XZ_CRC64_H | 4 | #ifndef ZIP7_INC_XZ_CRC64_H |
| 5 | #define ZIP7_INC_XZ_CRC64_H | 5 | #define ZIP7_INC_XZ_CRC64_H |
| @@ -10,16 +10,16 @@ | |||
| 10 | 10 | ||
| 11 | EXTERN_C_BEGIN | 11 | EXTERN_C_BEGIN |
| 12 | 12 | ||
| 13 | extern UInt64 g_Crc64Table[]; | 13 | // extern UInt64 g_Crc64Table[]; |
| 14 | 14 | ||
| 15 | void Z7_FASTCALL Crc64GenerateTable(void); | 15 | void Z7_FASTCALL Crc64GenerateTable(void); |
| 16 | 16 | ||
| 17 | #define CRC64_INIT_VAL UINT64_CONST(0xFFFFFFFFFFFFFFFF) | 17 | #define CRC64_INIT_VAL UINT64_CONST(0xFFFFFFFFFFFFFFFF) |
| 18 | #define CRC64_GET_DIGEST(crc) ((crc) ^ CRC64_INIT_VAL) | 18 | #define CRC64_GET_DIGEST(crc) ((crc) ^ CRC64_INIT_VAL) |
| 19 | #define CRC64_UPDATE_BYTE(crc, b) (g_Crc64Table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) | 19 | // #define CRC64_UPDATE_BYTE(crc, b) (g_Crc64Table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) |
| 20 | 20 | ||
| 21 | UInt64 Z7_FASTCALL Crc64Update(UInt64 crc, const void *data, size_t size); | 21 | UInt64 Z7_FASTCALL Crc64Update(UInt64 crc, const void *data, size_t size); |
| 22 | UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size); | 22 | // UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size); |
| 23 | 23 | ||
| 24 | EXTERN_C_END | 24 | EXTERN_C_END |
| 25 | 25 | ||
diff --git a/C/XzCrc64Opt.c b/C/XzCrc64Opt.c index d03374c..0c1fc2f 100644 --- a/C/XzCrc64Opt.c +++ b/C/XzCrc64Opt.c | |||
| @@ -1,61 +1,261 @@ | |||
| 1 | /* XzCrc64Opt.c -- CRC64 calculation | 1 | /* XzCrc64Opt.c -- CRC64 calculation (optimized functions) |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2023-12-08 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| 6 | #include "CpuArch.h" | 6 | #include "CpuArch.h" |
| 7 | 7 | ||
| 8 | #if !defined(Z7_CRC64_NUM_TABLES) || Z7_CRC64_NUM_TABLES > 1 | ||
| 9 | |||
| 10 | // for debug only : define Z7_CRC64_DEBUG_BE to test big-endian code in little-endian cpu | ||
| 11 | // #define Z7_CRC64_DEBUG_BE | ||
| 12 | #ifdef Z7_CRC64_DEBUG_BE | ||
| 13 | #undef MY_CPU_LE | ||
| 14 | #define MY_CPU_BE | ||
| 15 | #endif | ||
| 16 | |||
| 17 | #if defined(MY_CPU_64BIT) | ||
| 18 | #define Z7_CRC64_USE_64BIT | ||
| 19 | #endif | ||
| 20 | |||
| 21 | // the value Z7_CRC64_NUM_TABLES_USE must be defined to same value as in XzCrc64.c | ||
| 22 | #ifdef Z7_CRC64_NUM_TABLES | ||
| 23 | #define Z7_CRC64_NUM_TABLES_USE Z7_CRC64_NUM_TABLES | ||
| 24 | #else | ||
| 25 | #define Z7_CRC64_NUM_TABLES_USE 12 | ||
| 26 | #endif | ||
| 27 | |||
| 28 | #if Z7_CRC64_NUM_TABLES_USE % 4 || \ | ||
| 29 | Z7_CRC64_NUM_TABLES_USE < 4 || \ | ||
| 30 | Z7_CRC64_NUM_TABLES_USE > 4 * 4 | ||
| 31 | #error Stop_Compiling_Bad_CRC64_NUM_TABLES | ||
| 32 | #endif | ||
| 33 | |||
| 34 | |||
| 8 | #ifndef MY_CPU_BE | 35 | #ifndef MY_CPU_BE |
| 9 | 36 | ||
| 10 | #define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) | 37 | #define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) |
| 38 | |||
| 39 | #if defined(Z7_CRC64_USE_64BIT) && (Z7_CRC64_NUM_TABLES_USE % 8 == 0) | ||
| 11 | 40 | ||
| 12 | UInt64 Z7_FASTCALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table); | 41 | #define Q64LE(n, d) \ |
| 13 | UInt64 Z7_FASTCALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table) | 42 | ( (table + ((n) * 8 + 7) * 0x100)[((d) ) & 0xFF] \ |
| 43 | ^ (table + ((n) * 8 + 6) * 0x100)[((d) >> 1 * 8) & 0xFF] \ | ||
| 44 | ^ (table + ((n) * 8 + 5) * 0x100)[((d) >> 2 * 8) & 0xFF] \ | ||
| 45 | ^ (table + ((n) * 8 + 4) * 0x100)[((d) >> 3 * 8) & 0xFF] \ | ||
| 46 | ^ (table + ((n) * 8 + 3) * 0x100)[((d) >> 4 * 8) & 0xFF] \ | ||
| 47 | ^ (table + ((n) * 8 + 2) * 0x100)[((d) >> 5 * 8) & 0xFF] \ | ||
| 48 | ^ (table + ((n) * 8 + 1) * 0x100)[((d) >> 6 * 8) & 0xFF] \ | ||
| 49 | ^ (table + ((n) * 8 + 0) * 0x100)[((d) >> 7 * 8)] ) | ||
| 50 | |||
| 51 | #define R64(a) *((const UInt64 *)(const void *)p + (a)) | ||
| 52 | |||
| 53 | #else | ||
| 54 | |||
| 55 | #define Q32LE(n, d) \ | ||
| 56 | ( (table + ((n) * 4 + 3) * 0x100)[((d) ) & 0xFF] \ | ||
| 57 | ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \ | ||
| 58 | ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \ | ||
| 59 | ^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] ) | ||
| 60 | |||
| 61 | #define R32(a) *((const UInt32 *)(const void *)p + (a)) | ||
| 62 | |||
| 63 | #endif | ||
| 64 | |||
| 65 | |||
| 66 | #define CRC64_FUNC_PRE_LE2(step) \ | ||
| 67 | UInt64 Z7_FASTCALL XzCrc64UpdateT ## step (UInt64 v, const void *data, size_t size, const UInt64 *table) | ||
| 68 | |||
| 69 | #define CRC64_FUNC_PRE_LE(step) \ | ||
| 70 | CRC64_FUNC_PRE_LE2(step); \ | ||
| 71 | CRC64_FUNC_PRE_LE2(step) | ||
| 72 | |||
| 73 | CRC64_FUNC_PRE_LE(Z7_CRC64_NUM_TABLES_USE) | ||
| 14 | { | 74 | { |
| 15 | const Byte *p = (const Byte *)data; | 75 | const Byte *p = (const Byte *)data; |
| 16 | for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) | 76 | const Byte *lim; |
| 77 | for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC64_NUM_TABLES_USE & 4))) != 0; size--, p++) | ||
| 17 | v = CRC64_UPDATE_BYTE_2(v, *p); | 78 | v = CRC64_UPDATE_BYTE_2(v, *p); |
| 18 | for (; size >= 4; size -= 4, p += 4) | 79 | lim = p + size; |
| 80 | if (size >= Z7_CRC64_NUM_TABLES_USE) | ||
| 19 | { | 81 | { |
| 20 | const UInt32 d = (UInt32)v ^ *(const UInt32 *)(const void *)p; | 82 | lim -= Z7_CRC64_NUM_TABLES_USE; |
| 21 | v = (v >> 32) | 83 | do |
| 22 | ^ (table + 0x300)[((d ) & 0xFF)] | 84 | { |
| 23 | ^ (table + 0x200)[((d >> 8) & 0xFF)] | 85 | #if Z7_CRC64_NUM_TABLES_USE == 4 |
| 24 | ^ (table + 0x100)[((d >> 16) & 0xFF)] | 86 | const UInt32 d = (UInt32)v ^ R32(0); |
| 25 | ^ (table + 0x000)[((d >> 24))]; | 87 | v = (v >> 32) ^ Q32LE(0, d); |
| 88 | #elif Z7_CRC64_NUM_TABLES_USE == 8 | ||
| 89 | #ifdef Z7_CRC64_USE_64BIT | ||
| 90 | v ^= R64(0); | ||
| 91 | v = Q64LE(0, v); | ||
| 92 | #else | ||
| 93 | UInt32 v0, v1; | ||
| 94 | v0 = (UInt32)v ^ R32(0); | ||
| 95 | v1 = (UInt32)(v >> 32) ^ R32(1); | ||
| 96 | v = Q32LE(1, v0) ^ Q32LE(0, v1); | ||
| 97 | #endif | ||
| 98 | #elif Z7_CRC64_NUM_TABLES_USE == 12 | ||
| 99 | UInt32 w; | ||
| 100 | UInt32 v0, v1; | ||
| 101 | v0 = (UInt32)v ^ R32(0); | ||
| 102 | v1 = (UInt32)(v >> 32) ^ R32(1); | ||
| 103 | w = R32(2); | ||
| 104 | v = Q32LE(0, w); | ||
| 105 | v ^= Q32LE(2, v0) ^ Q32LE(1, v1); | ||
| 106 | #elif Z7_CRC64_NUM_TABLES_USE == 16 | ||
| 107 | #ifdef Z7_CRC64_USE_64BIT | ||
| 108 | UInt64 w; | ||
| 109 | UInt64 x; | ||
| 110 | w = R64(1); x = Q64LE(0, w); | ||
| 111 | v ^= R64(0); v = x ^ Q64LE(1, v); | ||
| 112 | #else | ||
| 113 | UInt32 v0, v1; | ||
| 114 | UInt32 r0, r1; | ||
| 115 | v0 = (UInt32)v ^ R32(0); | ||
| 116 | v1 = (UInt32)(v >> 32) ^ R32(1); | ||
| 117 | r0 = R32(2); | ||
| 118 | r1 = R32(3); | ||
| 119 | v = Q32LE(1, r0) ^ Q32LE(0, r1); | ||
| 120 | v ^= Q32LE(3, v0) ^ Q32LE(2, v1); | ||
| 121 | #endif | ||
| 122 | #else | ||
| 123 | #error Stop_Compiling_Bad_CRC64_NUM_TABLES | ||
| 124 | #endif | ||
| 125 | p += Z7_CRC64_NUM_TABLES_USE; | ||
| 126 | } | ||
| 127 | while (p <= lim); | ||
| 128 | lim += Z7_CRC64_NUM_TABLES_USE; | ||
| 26 | } | 129 | } |
| 27 | for (; size > 0; size--, p++) | 130 | for (; p < lim; p++) |
| 28 | v = CRC64_UPDATE_BYTE_2(v, *p); | 131 | v = CRC64_UPDATE_BYTE_2(v, *p); |
| 29 | return v; | 132 | return v; |
| 30 | } | 133 | } |
| 31 | 134 | ||
| 135 | #undef CRC64_UPDATE_BYTE_2 | ||
| 136 | #undef R32 | ||
| 137 | #undef R64 | ||
| 138 | #undef Q32LE | ||
| 139 | #undef Q64LE | ||
| 140 | #undef CRC64_FUNC_PRE_LE | ||
| 141 | #undef CRC64_FUNC_PRE_LE2 | ||
| 142 | |||
| 32 | #endif | 143 | #endif |
| 33 | 144 | ||
| 34 | 145 | ||
| 146 | |||
| 147 | |||
| 35 | #ifndef MY_CPU_LE | 148 | #ifndef MY_CPU_LE |
| 36 | 149 | ||
| 37 | #define CRC64_UPDATE_BYTE_2_BE(crc, b) (table[(Byte)((crc) >> 56) ^ (b)] ^ ((crc) << 8)) | 150 | #define CRC64_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 56) ^ (b)] ^ ((crc) << 8)) |
| 151 | |||
| 152 | #if defined(Z7_CRC64_USE_64BIT) && (Z7_CRC64_NUM_TABLES_USE % 8 == 0) | ||
| 153 | |||
| 154 | #define Q64BE(n, d) \ | ||
| 155 | ( (table + ((n) * 8 + 0) * 0x100)[(Byte)(d)] \ | ||
| 156 | ^ (table + ((n) * 8 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ | ||
| 157 | ^ (table + ((n) * 8 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ | ||
| 158 | ^ (table + ((n) * 8 + 3) * 0x100)[((d) >> 3 * 8) & 0xFF] \ | ||
| 159 | ^ (table + ((n) * 8 + 4) * 0x100)[((d) >> 4 * 8) & 0xFF] \ | ||
| 160 | ^ (table + ((n) * 8 + 5) * 0x100)[((d) >> 5 * 8) & 0xFF] \ | ||
| 161 | ^ (table + ((n) * 8 + 6) * 0x100)[((d) >> 6 * 8) & 0xFF] \ | ||
| 162 | ^ (table + ((n) * 8 + 7) * 0x100)[((d) >> 7 * 8)] ) | ||
| 163 | |||
| 164 | #ifdef Z7_CRC64_DEBUG_BE | ||
| 165 | #define R64BE(a) GetBe64a((const UInt64 *)(const void *)p + (a)) | ||
| 166 | #else | ||
| 167 | #define R64BE(a) *((const UInt64 *)(const void *)p + (a)) | ||
| 168 | #endif | ||
| 169 | |||
| 170 | #else | ||
| 171 | |||
| 172 | #define Q32BE(n, d) \ | ||
| 173 | ( (table + ((n) * 4 + 0) * 0x100)[(Byte)(d)] \ | ||
| 174 | ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ | ||
| 175 | ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ | ||
| 176 | ^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] ) | ||
| 38 | 177 | ||
| 39 | UInt64 Z7_FASTCALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table); | 178 | #ifdef Z7_CRC64_DEBUG_BE |
| 40 | UInt64 Z7_FASTCALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table) | 179 | #define R32BE(a) GetBe32a((const UInt32 *)(const void *)p + (a)) |
| 180 | #else | ||
| 181 | #define R32BE(a) *((const UInt32 *)(const void *)p + (a)) | ||
| 182 | #endif | ||
| 183 | |||
| 184 | #endif | ||
| 185 | |||
| 186 | #define CRC64_FUNC_PRE_BE2(step) \ | ||
| 187 | UInt64 Z7_FASTCALL XzCrc64UpdateBeT ## step (UInt64 v, const void *data, size_t size, const UInt64 *table) | ||
| 188 | |||
| 189 | #define CRC64_FUNC_PRE_BE(step) \ | ||
| 190 | CRC64_FUNC_PRE_BE2(step); \ | ||
| 191 | CRC64_FUNC_PRE_BE2(step) | ||
| 192 | |||
| 193 | CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE) | ||
| 41 | { | 194 | { |
| 42 | const Byte *p = (const Byte *)data; | 195 | const Byte *p = (const Byte *)data; |
| 43 | table += 0x100; | 196 | const Byte *lim; |
| 44 | v = Z7_BSWAP64(v); | 197 | v = Z7_BSWAP64(v); |
| 45 | for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) | 198 | for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC64_NUM_TABLES_USE & 4))) != 0; size--, p++) |
| 46 | v = CRC64_UPDATE_BYTE_2_BE(v, *p); | 199 | v = CRC64_UPDATE_BYTE_2_BE(v, *p); |
| 47 | for (; size >= 4; size -= 4, p += 4) | 200 | lim = p + size; |
| 201 | if (size >= Z7_CRC64_NUM_TABLES_USE) | ||
| 48 | { | 202 | { |
| 49 | const UInt32 d = (UInt32)(v >> 32) ^ *(const UInt32 *)(const void *)p; | 203 | lim -= Z7_CRC64_NUM_TABLES_USE; |
| 50 | v = (v << 32) | 204 | do |
| 51 | ^ (table + 0x000)[((d ) & 0xFF)] | 205 | { |
| 52 | ^ (table + 0x100)[((d >> 8) & 0xFF)] | 206 | #if Z7_CRC64_NUM_TABLES_USE == 4 |
| 53 | ^ (table + 0x200)[((d >> 16) & 0xFF)] | 207 | const UInt32 d = (UInt32)(v >> 32) ^ R32BE(0); |
| 54 | ^ (table + 0x300)[((d >> 24))]; | 208 | v = (v << 32) ^ Q32BE(0, d); |
| 209 | #elif Z7_CRC64_NUM_TABLES_USE == 12 | ||
| 210 | const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); | ||
| 211 | const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); | ||
| 212 | const UInt32 w = R32BE(2); | ||
| 213 | v = Q32BE(0, w); | ||
| 214 | v ^= Q32BE(2, d1) ^ Q32BE(1, d0); | ||
| 215 | |||
| 216 | #elif Z7_CRC64_NUM_TABLES_USE == 8 | ||
| 217 | #ifdef Z7_CRC64_USE_64BIT | ||
| 218 | v ^= R64BE(0); | ||
| 219 | v = Q64BE(0, v); | ||
| 220 | #else | ||
| 221 | const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); | ||
| 222 | const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); | ||
| 223 | v = Q32BE(1, d1) ^ Q32BE(0, d0); | ||
| 224 | #endif | ||
| 225 | #elif Z7_CRC64_NUM_TABLES_USE == 16 | ||
| 226 | #ifdef Z7_CRC64_USE_64BIT | ||
| 227 | const UInt64 w = R64BE(1); | ||
| 228 | v ^= R64BE(0); | ||
| 229 | v = Q64BE(0, w) ^ Q64BE(1, v); | ||
| 230 | #else | ||
| 231 | const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); | ||
| 232 | const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); | ||
| 233 | const UInt32 w1 = R32BE(2); | ||
| 234 | const UInt32 w0 = R32BE(3); | ||
| 235 | v = Q32BE(1, w1) ^ Q32BE(0, w0); | ||
| 236 | v ^= Q32BE(3, d1) ^ Q32BE(2, d0); | ||
| 237 | #endif | ||
| 238 | #elif | ||
| 239 | #error Stop_Compiling_Bad_CRC64_NUM_TABLES | ||
| 240 | #endif | ||
| 241 | p += Z7_CRC64_NUM_TABLES_USE; | ||
| 242 | } | ||
| 243 | while (p <= lim); | ||
| 244 | lim += Z7_CRC64_NUM_TABLES_USE; | ||
| 55 | } | 245 | } |
| 56 | for (; size > 0; size--, p++) | 246 | for (; p < lim; p++) |
| 57 | v = CRC64_UPDATE_BYTE_2_BE(v, *p); | 247 | v = CRC64_UPDATE_BYTE_2_BE(v, *p); |
| 58 | return Z7_BSWAP64(v); | 248 | return Z7_BSWAP64(v); |
| 59 | } | 249 | } |
| 60 | 250 | ||
| 251 | #undef CRC64_UPDATE_BYTE_2_BE | ||
| 252 | #undef R32BE | ||
| 253 | #undef R64BE | ||
| 254 | #undef Q32BE | ||
| 255 | #undef Q64BE | ||
| 256 | #undef CRC64_FUNC_PRE_BE | ||
| 257 | #undef CRC64_FUNC_PRE_BE2 | ||
| 258 | |||
| 259 | #endif | ||
| 260 | #undef Z7_CRC64_NUM_TABLES_USE | ||
| 61 | #endif | 261 | #endif |
| @@ -1,5 +1,5 @@ | |||
| 1 | /* XzDec.c -- Xz Decode | 1 | /* XzDec.c -- Xz Decode |
| 2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -105,30 +105,32 @@ static SRes XzBcFilterState_SetProps(void *pp, const Byte *props, size_t propSiz | |||
| 105 | { | 105 | { |
| 106 | if (propSize != 1) | 106 | if (propSize != 1) |
| 107 | return SZ_ERROR_UNSUPPORTED; | 107 | return SZ_ERROR_UNSUPPORTED; |
| 108 | p->delta = (unsigned)props[0] + 1; | 108 | p->delta = (UInt32)props[0] + 1; |
| 109 | } | 109 | } |
| 110 | else | 110 | else |
| 111 | { | 111 | { |
| 112 | if (propSize == 4) | 112 | if (propSize == 4) |
| 113 | { | 113 | { |
| 114 | UInt32 v = GetUi32(props); | 114 | const UInt32 v = GetUi32(props); |
| 115 | switch (p->methodId) | 115 | switch (p->methodId) |
| 116 | { | 116 | { |
| 117 | case XZ_ID_PPC: | 117 | case XZ_ID_PPC: |
| 118 | case XZ_ID_ARM: | 118 | case XZ_ID_ARM: |
| 119 | case XZ_ID_SPARC: | 119 | case XZ_ID_SPARC: |
| 120 | case XZ_ID_ARM64: | 120 | case XZ_ID_ARM64: |
| 121 | if ((v & 3) != 0) | 121 | if (v & 3) |
| 122 | return SZ_ERROR_UNSUPPORTED; | 122 | return SZ_ERROR_UNSUPPORTED; |
| 123 | break; | 123 | break; |
| 124 | case XZ_ID_ARMT: | 124 | case XZ_ID_ARMT: |
| 125 | if ((v & 1) != 0) | 125 | case XZ_ID_RISCV: |
| 126 | if (v & 1) | ||
| 126 | return SZ_ERROR_UNSUPPORTED; | 127 | return SZ_ERROR_UNSUPPORTED; |
| 127 | break; | 128 | break; |
| 128 | case XZ_ID_IA64: | 129 | case XZ_ID_IA64: |
| 129 | if ((v & 0xF) != 0) | 130 | if (v & 0xf) |
| 130 | return SZ_ERROR_UNSUPPORTED; | 131 | return SZ_ERROR_UNSUPPORTED; |
| 131 | break; | 132 | break; |
| 133 | default: break; | ||
| 132 | } | 134 | } |
| 133 | p->ip = v; | 135 | p->ip = v; |
| 134 | } | 136 | } |
| @@ -151,12 +153,13 @@ static void XzBcFilterState_Init(void *pp) | |||
| 151 | 153 | ||
| 152 | static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Dec[] = | 154 | static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Dec[] = |
| 153 | { | 155 | { |
| 154 | Z7_BRANCH_CONV_DEC(PPC), | 156 | Z7_BRANCH_CONV_DEC_2 (BranchConv_PPC), |
| 155 | Z7_BRANCH_CONV_DEC(IA64), | 157 | Z7_BRANCH_CONV_DEC_2 (BranchConv_IA64), |
| 156 | Z7_BRANCH_CONV_DEC(ARM), | 158 | Z7_BRANCH_CONV_DEC_2 (BranchConv_ARM), |
| 157 | Z7_BRANCH_CONV_DEC(ARMT), | 159 | Z7_BRANCH_CONV_DEC_2 (BranchConv_ARMT), |
| 158 | Z7_BRANCH_CONV_DEC(SPARC), | 160 | Z7_BRANCH_CONV_DEC_2 (BranchConv_SPARC), |
| 159 | Z7_BRANCH_CONV_DEC(ARM64) | 161 | Z7_BRANCH_CONV_DEC_2 (BranchConv_ARM64), |
| 162 | Z7_BRANCH_CONV_DEC_2 (BranchConv_RISCV) | ||
| 160 | }; | 163 | }; |
| 161 | 164 | ||
| 162 | static SizeT XzBcFilterStateBase_Filter_Dec(CXzBcFilterStateBase *p, Byte *data, SizeT size) | 165 | static SizeT XzBcFilterStateBase_Filter_Dec(CXzBcFilterStateBase *p, Byte *data, SizeT size) |
| @@ -262,7 +265,7 @@ static SRes XzBcFilterState_Code2(void *pp, | |||
| 262 | 265 | ||
| 263 | 266 | ||
| 264 | #define XZ_IS_SUPPORTED_FILTER_ID(id) \ | 267 | #define XZ_IS_SUPPORTED_FILTER_ID(id) \ |
| 265 | ((id) >= XZ_ID_Delta && (id) <= XZ_ID_ARM64) | 268 | ((id) >= XZ_ID_Delta && (id) <= XZ_ID_RISCV) |
| 266 | 269 | ||
| 267 | SRes Xz_StateCoder_Bc_SetFromMethod_Func(IStateCoder *p, UInt64 id, | 270 | SRes Xz_StateCoder_Bc_SetFromMethod_Func(IStateCoder *p, UInt64 id, |
| 268 | Xz_Func_BcFilterStateBase_Filter func, ISzAllocPtr alloc) | 271 | Xz_Func_BcFilterStateBase_Filter func, ISzAllocPtr alloc) |
| @@ -541,13 +544,12 @@ static SRes MixCoder_SetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 met | |||
| 541 | { | 544 | { |
| 542 | IStateCoder *sc = &p->coders[coderIndex]; | 545 | IStateCoder *sc = &p->coders[coderIndex]; |
| 543 | p->ids[coderIndex] = methodId; | 546 | p->ids[coderIndex] = methodId; |
| 544 | switch (methodId) | 547 | if (methodId == XZ_ID_LZMA2) |
| 545 | { | 548 | return Lzma2State_SetFromMethod(sc, outBuf, outBufSize, p->alloc); |
| 546 | case XZ_ID_LZMA2: return Lzma2State_SetFromMethod(sc, outBuf, outBufSize, p->alloc); | 549 | #ifdef USE_SUBBLOCK |
| 547 | #ifdef USE_SUBBLOCK | 550 | if (methodId == XZ_ID_Subblock) |
| 548 | case XZ_ID_Subblock: return SbState_SetFromMethod(sc, p->alloc); | 551 | return SbState_SetFromMethod(sc, p->alloc); |
| 549 | #endif | 552 | #endif |
| 550 | } | ||
| 551 | if (coderIndex == 0) | 553 | if (coderIndex == 0) |
| 552 | return SZ_ERROR_UNSUPPORTED; | 554 | return SZ_ERROR_UNSUPPORTED; |
| 553 | return Xz_StateCoder_Bc_SetFromMethod_Func(sc, methodId, | 555 | return Xz_StateCoder_Bc_SetFromMethod_Func(sc, methodId, |
| @@ -558,10 +560,8 @@ static SRes MixCoder_SetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 met | |||
| 558 | static SRes MixCoder_ResetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 methodId, Byte *outBuf, size_t outBufSize) | 560 | static SRes MixCoder_ResetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 methodId, Byte *outBuf, size_t outBufSize) |
| 559 | { | 561 | { |
| 560 | IStateCoder *sc = &p->coders[coderIndex]; | 562 | IStateCoder *sc = &p->coders[coderIndex]; |
| 561 | switch (methodId) | 563 | if (methodId == XZ_ID_LZMA2) |
| 562 | { | 564 | return Lzma2State_ResetOutBuf(sc, outBuf, outBufSize); |
| 563 | case XZ_ID_LZMA2: return Lzma2State_ResetOutBuf(sc, outBuf, outBufSize); | ||
| 564 | } | ||
| 565 | return SZ_ERROR_UNSUPPORTED; | 565 | return SZ_ERROR_UNSUPPORTED; |
| 566 | } | 566 | } |
| 567 | 567 | ||
| @@ -804,7 +804,7 @@ static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte | |||
| 804 | } | 804 | } |
| 805 | 805 | ||
| 806 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ | 806 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ |
| 807 | { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ | 807 | { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ |
| 808 | if (s == 0) return SZ_ERROR_ARCHIVE; \ | 808 | if (s == 0) return SZ_ERROR_ARCHIVE; \ |
| 809 | pos += s; } | 809 | pos += s; } |
| 810 | 810 | ||
| @@ -1034,7 +1034,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
| 1034 | SRes res; | 1034 | SRes res; |
| 1035 | 1035 | ||
| 1036 | ECoderFinishMode finishMode2 = finishMode; | 1036 | ECoderFinishMode finishMode2 = finishMode; |
| 1037 | BoolInt srcFinished2 = srcFinished; | 1037 | BoolInt srcFinished2 = (BoolInt)srcFinished; |
| 1038 | BoolInt destFinish = False; | 1038 | BoolInt destFinish = False; |
| 1039 | 1039 | ||
| 1040 | if (p->block.packSize != (UInt64)(Int64)-1) | 1040 | if (p->block.packSize != (UInt64)(Int64)-1) |
| @@ -1127,7 +1127,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
| 1127 | return SZ_OK; | 1127 | return SZ_OK; |
| 1128 | } | 1128 | } |
| 1129 | 1129 | ||
| 1130 | switch (p->state) | 1130 | switch ((int)p->state) |
| 1131 | { | 1131 | { |
| 1132 | case XZ_STATE_STREAM_HEADER: | 1132 | case XZ_STATE_STREAM_HEADER: |
| 1133 | { | 1133 | { |
| @@ -1172,15 +1172,15 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
| 1172 | p->state = XZ_STATE_STREAM_INDEX; | 1172 | p->state = XZ_STATE_STREAM_INDEX; |
| 1173 | break; | 1173 | break; |
| 1174 | } | 1174 | } |
| 1175 | p->blockHeaderSize = ((UInt32)p->buf[0] << 2) + 4; | 1175 | p->blockHeaderSize = ((unsigned)p->buf[0] << 2) + 4; |
| 1176 | break; | 1176 | break; |
| 1177 | } | 1177 | } |
| 1178 | 1178 | ||
| 1179 | if (p->pos != p->blockHeaderSize) | 1179 | if (p->pos != p->blockHeaderSize) |
| 1180 | { | 1180 | { |
| 1181 | UInt32 cur = p->blockHeaderSize - p->pos; | 1181 | unsigned cur = p->blockHeaderSize - p->pos; |
| 1182 | if (cur > srcRem) | 1182 | if (cur > srcRem) |
| 1183 | cur = (UInt32)srcRem; | 1183 | cur = (unsigned)srcRem; |
| 1184 | memcpy(p->buf + p->pos, src, cur); | 1184 | memcpy(p->buf + p->pos, src, cur); |
| 1185 | p->pos += cur; | 1185 | p->pos += cur; |
| 1186 | (*srcLen) += cur; | 1186 | (*srcLen) += cur; |
| @@ -1222,8 +1222,8 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
| 1222 | } | 1222 | } |
| 1223 | else | 1223 | else |
| 1224 | { | 1224 | { |
| 1225 | UInt32 checkSize = XzFlags_GetCheckSize(p->streamFlags); | 1225 | const unsigned checkSize = XzFlags_GetCheckSize(p->streamFlags); |
| 1226 | UInt32 cur = checkSize - p->pos; | 1226 | unsigned cur = checkSize - p->pos; |
| 1227 | if (cur != 0) | 1227 | if (cur != 0) |
| 1228 | { | 1228 | { |
| 1229 | if (srcRem == 0) | 1229 | if (srcRem == 0) |
| @@ -1232,7 +1232,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
| 1232 | return SZ_OK; | 1232 | return SZ_OK; |
| 1233 | } | 1233 | } |
| 1234 | if (cur > srcRem) | 1234 | if (cur > srcRem) |
| 1235 | cur = (UInt32)srcRem; | 1235 | cur = (unsigned)srcRem; |
| 1236 | memcpy(p->buf + p->pos, src, cur); | 1236 | memcpy(p->buf + p->pos, src, cur); |
| 1237 | p->pos += cur; | 1237 | p->pos += cur; |
| 1238 | (*srcLen) += cur; | 1238 | (*srcLen) += cur; |
| @@ -1321,9 +1321,9 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
| 1321 | 1321 | ||
| 1322 | case XZ_STATE_STREAM_FOOTER: | 1322 | case XZ_STATE_STREAM_FOOTER: |
| 1323 | { | 1323 | { |
| 1324 | UInt32 cur = XZ_STREAM_FOOTER_SIZE - p->pos; | 1324 | unsigned cur = XZ_STREAM_FOOTER_SIZE - p->pos; |
| 1325 | if (cur > srcRem) | 1325 | if (cur > srcRem) |
| 1326 | cur = (UInt32)srcRem; | 1326 | cur = (unsigned)srcRem; |
| 1327 | memcpy(p->buf + p->pos, src, cur); | 1327 | memcpy(p->buf + p->pos, src, cur); |
| 1328 | p->pos += cur; | 1328 | p->pos += cur; |
| 1329 | (*srcLen) += cur; | 1329 | (*srcLen) += cur; |
| @@ -1358,6 +1358,8 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, | |||
| 1358 | } | 1358 | } |
| 1359 | 1359 | ||
| 1360 | case XZ_STATE_BLOCK: break; /* to disable GCC warning */ | 1360 | case XZ_STATE_BLOCK: break; /* to disable GCC warning */ |
| 1361 | |||
| 1362 | default: return SZ_ERROR_FAIL; | ||
| 1361 | } | 1363 | } |
| 1362 | } | 1364 | } |
| 1363 | /* | 1365 | /* |
| @@ -1773,10 +1775,10 @@ static void XzDecMt_Callback_Parse(void *obj, unsigned coderIndex, CMtDecCallbac | |||
| 1773 | } | 1775 | } |
| 1774 | } | 1776 | } |
| 1775 | { | 1777 | { |
| 1776 | UInt64 packSize = block->packSize; | 1778 | const UInt64 packSize = block->packSize; |
| 1777 | UInt64 packSizeAligned = packSize + ((0 - (unsigned)packSize) & 3); | 1779 | const UInt64 packSizeAligned = packSize + ((0 - (unsigned)packSize) & 3); |
| 1778 | UInt32 checkSize = XzFlags_GetCheckSize(coder->dec.streamFlags); | 1780 | const unsigned checkSize = XzFlags_GetCheckSize(coder->dec.streamFlags); |
| 1779 | UInt64 blockPackSum = coder->inPreSize + packSizeAligned + checkSize; | 1781 | const UInt64 blockPackSum = coder->inPreSize + packSizeAligned + checkSize; |
| 1780 | // if (blockPackSum <= me->props.inBlockMax) | 1782 | // if (blockPackSum <= me->props.inBlockMax) |
| 1781 | // unpackBlockMaxSize | 1783 | // unpackBlockMaxSize |
| 1782 | { | 1784 | { |
| @@ -2381,7 +2383,7 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p | |||
| 2381 | if (tMode) | 2383 | if (tMode) |
| 2382 | { | 2384 | { |
| 2383 | XzDecMt_FreeOutBufs(p); | 2385 | XzDecMt_FreeOutBufs(p); |
| 2384 | tMode = MtDec_PrepareRead(&p->mtc); | 2386 | tMode = (BoolInt)MtDec_PrepareRead(&p->mtc); |
| 2385 | } | 2387 | } |
| 2386 | #endif | 2388 | #endif |
| 2387 | 2389 | ||
| @@ -2644,7 +2646,7 @@ SRes XzDecMt_Decode(CXzDecMtHandle p, | |||
| 2644 | p->outSize = *outDataSize; | 2646 | p->outSize = *outDataSize; |
| 2645 | } | 2647 | } |
| 2646 | 2648 | ||
| 2647 | p->finishMode = finishMode; | 2649 | p->finishMode = (BoolInt)finishMode; |
| 2648 | 2650 | ||
| 2649 | // p->outSize = 457; p->outSize_Defined = True; p->finishMode = False; // for test | 2651 | // p->outSize = 457; p->outSize_Defined = True; p->finishMode = False; // for test |
| 2650 | 2652 | ||
| @@ -1,5 +1,5 @@ | |||
| 1 | /* XzEnc.c -- Xz Encode | 1 | /* XzEnc.c -- Xz Encode |
| 2 | 2023-04-13 : Igor Pavlov : Public domain */ | 2 | 2024-03-01 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -29,8 +29,9 @@ | |||
| 29 | 29 | ||
| 30 | #define XZ_GET_PAD_SIZE(dataSize) ((4 - ((unsigned)(dataSize) & 3)) & 3) | 30 | #define XZ_GET_PAD_SIZE(dataSize) ((4 - ((unsigned)(dataSize) & 3)) & 3) |
| 31 | 31 | ||
| 32 | /* max pack size for LZMA2 block + check-64bytrs: */ | 32 | #define XZ_CHECK_SIZE_MAX 64 |
| 33 | #define XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize) ((unpackSize) + ((unpackSize) >> 10) + 16 + 64) | 33 | /* max pack size for LZMA2 block + pad4 + check_size: */ |
| 34 | #define XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize) ((unpackSize) + ((unpackSize) >> 10) + 16 + XZ_CHECK_SIZE_MAX) | ||
| 34 | 35 | ||
| 35 | #define XZ_GET_ESTIMATED_BLOCK_TOTAL_PACK_SIZE(unpackSize) (XZ_BLOCK_HEADER_SIZE_MAX + XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize)) | 36 | #define XZ_GET_ESTIMATED_BLOCK_TOTAL_PACK_SIZE(unpackSize) (XZ_BLOCK_HEADER_SIZE_MAX + XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize)) |
| 36 | 37 | ||
| @@ -325,12 +326,13 @@ typedef struct | |||
| 325 | 326 | ||
| 326 | static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Enc[] = | 327 | static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Enc[] = |
| 327 | { | 328 | { |
| 328 | Z7_BRANCH_CONV_ENC(PPC), | 329 | Z7_BRANCH_CONV_ENC_2 (BranchConv_PPC), |
| 329 | Z7_BRANCH_CONV_ENC(IA64), | 330 | Z7_BRANCH_CONV_ENC_2 (BranchConv_IA64), |
| 330 | Z7_BRANCH_CONV_ENC(ARM), | 331 | Z7_BRANCH_CONV_ENC_2 (BranchConv_ARM), |
| 331 | Z7_BRANCH_CONV_ENC(ARMT), | 332 | Z7_BRANCH_CONV_ENC_2 (BranchConv_ARMT), |
| 332 | Z7_BRANCH_CONV_ENC(SPARC), | 333 | Z7_BRANCH_CONV_ENC_2 (BranchConv_SPARC), |
| 333 | Z7_BRANCH_CONV_ENC(ARM64) | 334 | Z7_BRANCH_CONV_ENC_2 (BranchConv_ARM64), |
| 335 | Z7_BRANCH_CONV_ENC_2 (BranchConv_RISCV) | ||
| 334 | }; | 336 | }; |
| 335 | 337 | ||
| 336 | static SizeT XzBcFilterStateBase_Filter_Enc(CXzBcFilterStateBase *p, Byte *data, SizeT size) | 338 | static SizeT XzBcFilterStateBase_Filter_Enc(CXzBcFilterStateBase *p, Byte *data, SizeT size) |
| @@ -888,9 +890,9 @@ static SRes Xz_CompressBlock( | |||
| 888 | blockSizes->unpackSize = checkInStream.processed; | 890 | blockSizes->unpackSize = checkInStream.processed; |
| 889 | } | 891 | } |
| 890 | { | 892 | { |
| 891 | Byte buf[4 + 64]; | 893 | Byte buf[4 + XZ_CHECK_SIZE_MAX]; |
| 892 | unsigned padSize = XZ_GET_PAD_SIZE(seqSizeOutStream.processed); | 894 | const unsigned padSize = XZ_GET_PAD_SIZE(seqSizeOutStream.processed); |
| 893 | UInt64 packSize = seqSizeOutStream.processed; | 895 | const UInt64 packSize = seqSizeOutStream.processed; |
| 894 | 896 | ||
| 895 | buf[0] = 0; | 897 | buf[0] = 0; |
| 896 | buf[1] = 0; | 898 | buf[1] = 0; |
| @@ -898,7 +900,8 @@ static SRes Xz_CompressBlock( | |||
| 898 | buf[3] = 0; | 900 | buf[3] = 0; |
| 899 | 901 | ||
| 900 | SeqCheckInStream_GetDigest(&checkInStream, buf + 4); | 902 | SeqCheckInStream_GetDigest(&checkInStream, buf + 4); |
| 901 | RINOK(WriteBytes(&seqSizeOutStream.vt, buf + (4 - padSize), padSize + XzFlags_GetCheckSize((CXzStreamFlags)props->checkId))) | 903 | RINOK(WriteBytes(&seqSizeOutStream.vt, buf + (4 - padSize), |
| 904 | padSize + XzFlags_GetCheckSize((CXzStreamFlags)props->checkId))) | ||
| 902 | 905 | ||
| 903 | blockSizes->totalSize = seqSizeOutStream.processed - padSize; | 906 | blockSizes->totalSize = seqSizeOutStream.processed - padSize; |
| 904 | 907 | ||
| @@ -1083,18 +1086,19 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf | |||
| 1083 | CXzEnc *me = (CXzEnc *)pp; | 1086 | CXzEnc *me = (CXzEnc *)pp; |
| 1084 | SRes res; | 1087 | SRes res; |
| 1085 | CMtProgressThunk progressThunk; | 1088 | CMtProgressThunk progressThunk; |
| 1086 | 1089 | Byte *dest; | |
| 1087 | Byte *dest = me->outBufs[outBufIndex]; | ||
| 1088 | |||
| 1089 | UNUSED_VAR(finished) | 1090 | UNUSED_VAR(finished) |
| 1090 | |||
| 1091 | { | 1091 | { |
| 1092 | CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; | 1092 | CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; |
| 1093 | bInfo->totalSize = 0; | 1093 | bInfo->totalSize = 0; |
| 1094 | bInfo->unpackSize = 0; | 1094 | bInfo->unpackSize = 0; |
| 1095 | bInfo->headerSize = 0; | 1095 | bInfo->headerSize = 0; |
| 1096 | // v23.02: we don't compress empty blocks | ||
| 1097 | // also we must ignore that empty block in XzEnc_MtCallback_Write() | ||
| 1098 | if (srcSize == 0) | ||
| 1099 | return SZ_OK; | ||
| 1096 | } | 1100 | } |
| 1097 | 1101 | dest = me->outBufs[outBufIndex]; | |
| 1098 | if (!dest) | 1102 | if (!dest) |
| 1099 | { | 1103 | { |
| 1100 | dest = (Byte *)ISzAlloc_Alloc(me->alloc, me->outBufSize); | 1104 | dest = (Byte *)ISzAlloc_Alloc(me->alloc, me->outBufSize); |
| @@ -1140,18 +1144,20 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf | |||
| 1140 | static SRes XzEnc_MtCallback_Write(void *pp, unsigned outBufIndex) | 1144 | static SRes XzEnc_MtCallback_Write(void *pp, unsigned outBufIndex) |
| 1141 | { | 1145 | { |
| 1142 | CXzEnc *me = (CXzEnc *)pp; | 1146 | CXzEnc *me = (CXzEnc *)pp; |
| 1143 | |||
| 1144 | const CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; | 1147 | const CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; |
| 1145 | const Byte *data = me->outBufs[outBufIndex]; | 1148 | // v23.02: we don't write empty blocks |
| 1146 | 1149 | // note: if (bInfo->unpackSize == 0) then there is no compressed data of block | |
| 1147 | RINOK(WriteBytes(me->outStream, data, bInfo->headerSize)) | 1150 | if (bInfo->unpackSize == 0) |
| 1148 | 1151 | return SZ_OK; | |
| 1149 | { | 1152 | { |
| 1150 | UInt64 totalPackFull = bInfo->totalSize + XZ_GET_PAD_SIZE(bInfo->totalSize); | 1153 | const Byte *data = me->outBufs[outBufIndex]; |
| 1151 | RINOK(WriteBytes(me->outStream, data + XZ_BLOCK_HEADER_SIZE_MAX, (size_t)totalPackFull - bInfo->headerSize)) | 1154 | RINOK(WriteBytes(me->outStream, data, bInfo->headerSize)) |
| 1155 | { | ||
| 1156 | const UInt64 totalPackFull = bInfo->totalSize + XZ_GET_PAD_SIZE(bInfo->totalSize); | ||
| 1157 | RINOK(WriteBytes(me->outStream, data + XZ_BLOCK_HEADER_SIZE_MAX, (size_t)totalPackFull - bInfo->headerSize)) | ||
| 1158 | } | ||
| 1159 | return XzEncIndex_AddIndexRecord(&me->xzIndex, bInfo->unpackSize, bInfo->totalSize, me->alloc); | ||
| 1152 | } | 1160 | } |
| 1153 | |||
| 1154 | return XzEncIndex_AddIndexRecord(&me->xzIndex, bInfo->unpackSize, bInfo->totalSize, me->alloc); | ||
| 1155 | } | 1161 | } |
| 1156 | 1162 | ||
| 1157 | #endif | 1163 | #endif |
| @@ -1,5 +1,5 @@ | |||
| 1 | /* XzIn.c - Xz input | 1 | /* XzIn.c - Xz input |
| 2 | 2023-04-02 : Igor Pavlov : Public domain */ | 2 | 2023-09-07 : Igor Pavlov : Public domain */ |
| 3 | 3 | ||
| 4 | #include "Precomp.h" | 4 | #include "Precomp.h" |
| 5 | 5 | ||
| @@ -27,7 +27,7 @@ SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream) | |||
| 27 | } | 27 | } |
| 28 | 28 | ||
| 29 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ | 29 | #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ |
| 30 | { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ | 30 | { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ |
| 31 | if (s == 0) return SZ_ERROR_ARCHIVE; \ | 31 | if (s == 0) return SZ_ERROR_ARCHIVE; \ |
| 32 | pos += s; } | 32 | pos += s; } |
| 33 | 33 | ||
| @@ -37,7 +37,7 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, | |||
| 37 | unsigned headerSize; | 37 | unsigned headerSize; |
| 38 | *headerSizeRes = 0; | 38 | *headerSizeRes = 0; |
| 39 | RINOK(SeqInStream_ReadByte(inStream, &header[0])) | 39 | RINOK(SeqInStream_ReadByte(inStream, &header[0])) |
| 40 | headerSize = (unsigned)header[0]; | 40 | headerSize = header[0]; |
| 41 | if (headerSize == 0) | 41 | if (headerSize == 0) |
| 42 | { | 42 | { |
| 43 | *headerSizeRes = 1; | 43 | *headerSizeRes = 1; |
| @@ -47,7 +47,7 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, | |||
| 47 | 47 | ||
| 48 | *isIndex = False; | 48 | *isIndex = False; |
| 49 | headerSize = (headerSize << 2) + 4; | 49 | headerSize = (headerSize << 2) + 4; |
| 50 | *headerSizeRes = headerSize; | 50 | *headerSizeRes = (UInt32)headerSize; |
| 51 | { | 51 | { |
| 52 | size_t processedSize = headerSize - 1; | 52 | size_t processedSize = headerSize - 1; |
| 53 | RINOK(SeqInStream_ReadMax(inStream, header + 1, &processedSize)) | 53 | RINOK(SeqInStream_ReadMax(inStream, header + 1, &processedSize)) |
| @@ -58,7 +58,7 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, | |||
| 58 | } | 58 | } |
| 59 | 59 | ||
| 60 | #define ADD_SIZE_CHECK(size, val) \ | 60 | #define ADD_SIZE_CHECK(size, val) \ |
| 61 | { UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } | 61 | { const UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } |
| 62 | 62 | ||
| 63 | UInt64 Xz_GetUnpackSize(const CXzStream *p) | 63 | UInt64 Xz_GetUnpackSize(const CXzStream *p) |
| 64 | { | 64 | { |
diff --git a/C/ZstdDec.c b/C/ZstdDec.c new file mode 100644 index 0000000..ecf6d22 --- /dev/null +++ b/C/ZstdDec.c | |||
| @@ -0,0 +1,4064 @@ | |||
| 1 | /* ZstdDec.c -- Zstd Decoder | ||
| 2 | 2024-01-21 : the code was developed by Igor Pavlov, using Zstandard format | ||
| 3 | specification and original zstd decoder code as reference code. | ||
| 4 | original zstd decoder code: Copyright (c) Facebook, Inc. All rights reserved. | ||
| 5 | This source code is licensed under BSD 3-Clause License. | ||
| 6 | */ | ||
| 7 | |||
| 8 | #include "Precomp.h" | ||
| 9 | |||
| 10 | #include <string.h> | ||
| 11 | #include <stdlib.h> | ||
| 12 | // #include <stdio.h> | ||
| 13 | |||
| 14 | #include "Alloc.h" | ||
| 15 | #include "Xxh64.h" | ||
| 16 | #include "ZstdDec.h" | ||
| 17 | #include "CpuArch.h" | ||
| 18 | |||
| 19 | #if defined(MY_CPU_ARM64) | ||
| 20 | #include <arm_neon.h> | ||
| 21 | #endif | ||
| 22 | |||
| 23 | /* original-zstd still doesn't support window larger than 2 GiB. | ||
| 24 | So we also limit our decoder for 2 GiB window: */ | ||
| 25 | #if defined(MY_CPU_64BIT) && 0 == 1 | ||
| 26 | #define MAX_WINDOW_SIZE_LOG 41 | ||
| 27 | #else | ||
| 28 | #define MAX_WINDOW_SIZE_LOG 31 | ||
| 29 | #endif | ||
| 30 | |||
| 31 | typedef | ||
| 32 | #if MAX_WINDOW_SIZE_LOG < 32 | ||
| 33 | UInt32 | ||
| 34 | #else | ||
| 35 | size_t | ||
| 36 | #endif | ||
| 37 | CZstdDecOffset; | ||
| 38 | |||
| 39 | // for debug: simpler and smaller code but slow: | ||
| 40 | // #define Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS | ||
| 41 | |||
| 42 | // #define SHOW_STAT | ||
| 43 | #ifdef SHOW_STAT | ||
| 44 | #include <stdio.h> | ||
| 45 | static unsigned g_Num_Blocks_Compressed = 0; | ||
| 46 | static unsigned g_Num_Blocks_memcpy = 0; | ||
| 47 | static unsigned g_Num_Wrap_memmove_Num = 0; | ||
| 48 | static unsigned g_Num_Wrap_memmove_Bytes = 0; | ||
| 49 | static unsigned g_NumSeqs_total = 0; | ||
| 50 | // static unsigned g_NumCopy = 0; | ||
| 51 | static unsigned g_NumOver = 0; | ||
| 52 | static unsigned g_NumOver2 = 0; | ||
| 53 | static unsigned g_Num_Match = 0; | ||
| 54 | static unsigned g_Num_Lits = 0; | ||
| 55 | static unsigned g_Num_LitsBig = 0; | ||
| 56 | static unsigned g_Num_Lit0 = 0; | ||
| 57 | static unsigned g_Num_Rep0 = 0; | ||
| 58 | static unsigned g_Num_Rep1 = 0; | ||
| 59 | static unsigned g_Num_Rep2 = 0; | ||
| 60 | static unsigned g_Num_Rep3 = 0; | ||
| 61 | static unsigned g_Num_Threshold_0 = 0; | ||
| 62 | static unsigned g_Num_Threshold_1 = 0; | ||
| 63 | static unsigned g_Num_Threshold_0sum = 0; | ||
| 64 | static unsigned g_Num_Threshold_1sum = 0; | ||
| 65 | #define STAT_UPDATE(v) v | ||
| 66 | #else | ||
| 67 | #define STAT_UPDATE(v) | ||
| 68 | #endif | ||
| 69 | #define STAT_INC(v) STAT_UPDATE(v++;) | ||
| 70 | |||
| 71 | |||
| 72 | typedef struct | ||
| 73 | { | ||
| 74 | const Byte *ptr; | ||
| 75 | size_t len; | ||
| 76 | } | ||
| 77 | CInBufPair; | ||
| 78 | |||
| 79 | |||
| 80 | #if defined(MY_CPU_ARM_OR_ARM64) || defined(MY_CPU_X86_OR_AMD64) | ||
| 81 | #if (defined(__clang__) && (__clang_major__ >= 6)) \ | ||
| 82 | || (defined(__GNUC__) && (__GNUC__ >= 6)) | ||
| 83 | // disable for debug: | ||
| 84 | #define Z7_ZSTD_DEC_USE_BSR | ||
| 85 | #elif defined(_MSC_VER) && (_MSC_VER >= 1300) | ||
| 86 | // #if defined(MY_CPU_ARM_OR_ARM64) | ||
| 87 | #if (_MSC_VER >= 1600) | ||
| 88 | #include <intrin.h> | ||
| 89 | #endif | ||
| 90 | // disable for debug: | ||
| 91 | #define Z7_ZSTD_DEC_USE_BSR | ||
| 92 | #endif | ||
| 93 | #endif | ||
| 94 | |||
| 95 | #ifdef Z7_ZSTD_DEC_USE_BSR | ||
| 96 | #if defined(__clang__) || defined(__GNUC__) | ||
| 97 | #define MY_clz(x) ((unsigned)__builtin_clz((UInt32)x)) | ||
| 98 | #else // #if defined(_MSC_VER) | ||
| 99 | #ifdef MY_CPU_ARM_OR_ARM64 | ||
| 100 | #define MY_clz _CountLeadingZeros | ||
| 101 | #endif // MY_CPU_X86_OR_AMD64 | ||
| 102 | #endif // _MSC_VER | ||
| 103 | #elif !defined(Z7_ZSTD_DEC_USE_LOG_TABLE) | ||
| 104 | #define Z7_ZSTD_DEC_USE_LOG_TABLE | ||
| 105 | #endif | ||
| 106 | |||
| 107 | |||
| 108 | static | ||
| 109 | Z7_FORCE_INLINE | ||
| 110 | unsigned GetHighestSetBit_32_nonzero_big(UInt32 num) | ||
| 111 | { | ||
| 112 | // (num != 0) | ||
| 113 | #ifdef MY_clz | ||
| 114 | return 31 - MY_clz(num); | ||
| 115 | #elif defined(Z7_ZSTD_DEC_USE_BSR) | ||
| 116 | { | ||
| 117 | unsigned long zz; | ||
| 118 | _BitScanReverse(&zz, num); | ||
| 119 | return zz; | ||
| 120 | } | ||
| 121 | #else | ||
| 122 | { | ||
| 123 | int i = -1; | ||
| 124 | for (;;) | ||
| 125 | { | ||
| 126 | i++; | ||
| 127 | num >>= 1; | ||
| 128 | if (num == 0) | ||
| 129 | return (unsigned)i; | ||
| 130 | } | ||
| 131 | } | ||
| 132 | #endif | ||
| 133 | } | ||
| 134 | |||
| 135 | #ifdef Z7_ZSTD_DEC_USE_LOG_TABLE | ||
| 136 | |||
| 137 | #define R1(a) a, a | ||
| 138 | #define R2(a) R1(a), R1(a) | ||
| 139 | #define R3(a) R2(a), R2(a) | ||
| 140 | #define R4(a) R3(a), R3(a) | ||
| 141 | #define R5(a) R4(a), R4(a) | ||
| 142 | #define R6(a) R5(a), R5(a) | ||
| 143 | #define R7(a) R6(a), R6(a) | ||
| 144 | #define R8(a) R7(a), R7(a) | ||
| 145 | #define R9(a) R8(a), R8(a) | ||
| 146 | |||
| 147 | #define Z7_ZSTD_FSE_MAX_ACCURACY 9 | ||
| 148 | // states[] values in FSE_Generate() can use (Z7_ZSTD_FSE_MAX_ACCURACY + 1) bits. | ||
| 149 | static const Byte k_zstd_LogTable[2 << Z7_ZSTD_FSE_MAX_ACCURACY] = | ||
| 150 | { | ||
| 151 | R1(0), R1(1), R2(2), R3(3), R4(4), R5(5), R6(6), R7(7), R8(8), R9(9) | ||
| 152 | }; | ||
| 153 | |||
| 154 | #define GetHighestSetBit_32_nonzero_small(num) (k_zstd_LogTable[num]) | ||
| 155 | #else | ||
| 156 | #define GetHighestSetBit_32_nonzero_small GetHighestSetBit_32_nonzero_big | ||
| 157 | #endif | ||
| 158 | |||
| 159 | |||
| 160 | #ifdef MY_clz | ||
| 161 | #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \ | ||
| 162 | bitOffset -= (CBitCtr)(MY_clz(b) - 23); | ||
| 163 | #elif defined(Z7_ZSTD_DEC_USE_BSR) | ||
| 164 | #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \ | ||
| 165 | { unsigned long zz; _BitScanReverse(&zz, b); bitOffset -= 8; bitOffset += zz; } | ||
| 166 | #else | ||
| 167 | #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \ | ||
| 168 | for (;;) { bitOffset--; if (b & 0x80) { break; } b <<= 1; } | ||
| 169 | #endif | ||
| 170 | |||
| 171 | #define SET_bitOffset_TO_PAD(bitOffset, src, srcLen) \ | ||
| 172 | { \ | ||
| 173 | unsigned lastByte = (src)[(size_t)(srcLen) - 1]; \ | ||
| 174 | if (lastByte == 0) return SZ_ERROR_DATA; \ | ||
| 175 | bitOffset = (CBitCtr)((srcLen) * 8); \ | ||
| 176 | UPDATE_BIT_OFFSET_FOR_PADDING(lastByte, bitOffset) \ | ||
| 177 | } | ||
| 178 | |||
| 179 | #ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS | ||
| 180 | |||
| 181 | #define SET_bitOffset_TO_PAD_and_SET_BIT_SIZE(bitOffset, src, srcLen_res) \ | ||
| 182 | { \ | ||
| 183 | unsigned lastByte = (src)[(size_t)(srcLen_res) - 1]; \ | ||
| 184 | if (lastByte == 0) return SZ_ERROR_DATA; \ | ||
| 185 | srcLen_res *= 8; \ | ||
| 186 | bitOffset = (CBitCtr)srcLen_res; \ | ||
| 187 | UPDATE_BIT_OFFSET_FOR_PADDING(lastByte, bitOffset) \ | ||
| 188 | } | ||
| 189 | |||
| 190 | #endif | ||
| 191 | |||
| 192 | /* | ||
| 193 | typedef Int32 CBitCtr_signed; | ||
| 194 | typedef Int32 CBitCtr; | ||
| 195 | */ | ||
| 196 | // /* | ||
| 197 | typedef ptrdiff_t CBitCtr_signed; | ||
| 198 | typedef ptrdiff_t CBitCtr; | ||
| 199 | // */ | ||
| 200 | |||
| 201 | |||
| 202 | #define MATCH_LEN_MIN 3 | ||
| 203 | #define kBlockSizeMax (1u << 17) | ||
| 204 | |||
| 205 | // #define Z7_ZSTD_DEC_PRINT_TABLE | ||
| 206 | |||
| 207 | #ifdef Z7_ZSTD_DEC_PRINT_TABLE | ||
| 208 | #define NUM_OFFSET_SYMBOLS_PREDEF 29 | ||
| 209 | #endif | ||
| 210 | #define NUM_OFFSET_SYMBOLS_MAX (MAX_WINDOW_SIZE_LOG + 1) // 32 | ||
| 211 | #define NUM_LL_SYMBOLS 36 | ||
| 212 | #define NUM_ML_SYMBOLS 53 | ||
| 213 | #define FSE_NUM_SYMBOLS_MAX 53 // NUM_ML_SYMBOLS | ||
| 214 | |||
| 215 | // /* | ||
| 216 | #if !defined(MY_CPU_X86) || defined(__PIC__) || defined(MY_CPU_64BIT) | ||
| 217 | #define Z7_ZSTD_DEC_USE_BASES_IN_OBJECT | ||
| 218 | #endif | ||
| 219 | // */ | ||
| 220 | // for debug: | ||
| 221 | // #define Z7_ZSTD_DEC_USE_BASES_LOCAL | ||
| 222 | // #define Z7_ZSTD_DEC_USE_BASES_IN_OBJECT | ||
| 223 | |||
| 224 | #define GLOBAL_TABLE(n) k_ ## n | ||
| 225 | |||
| 226 | #if defined(Z7_ZSTD_DEC_USE_BASES_LOCAL) | ||
| 227 | #define BASES_TABLE(n) a_ ## n | ||
| 228 | #elif defined(Z7_ZSTD_DEC_USE_BASES_IN_OBJECT) | ||
| 229 | #define BASES_TABLE(n) p->m_ ## n | ||
| 230 | #else | ||
| 231 | #define BASES_TABLE(n) GLOBAL_TABLE(n) | ||
| 232 | #endif | ||
| 233 | |||
| 234 | #define Z7_ZSTD_DEC_USE_ML_PLUS3 | ||
| 235 | |||
| 236 | #if defined(Z7_ZSTD_DEC_USE_BASES_LOCAL) || \ | ||
| 237 | defined(Z7_ZSTD_DEC_USE_BASES_IN_OBJECT) | ||
| 238 | |||
| 239 | #define SEQ_EXTRA_TABLES(n) \ | ||
| 240 | Byte n ## SEQ_LL_EXTRA [NUM_LL_SYMBOLS]; \ | ||
| 241 | Byte n ## SEQ_ML_EXTRA [NUM_ML_SYMBOLS]; \ | ||
| 242 | UInt32 n ## SEQ_LL_BASES [NUM_LL_SYMBOLS]; \ | ||
| 243 | UInt32 n ## SEQ_ML_BASES [NUM_ML_SYMBOLS]; \ | ||
| 244 | |||
| 245 | #define Z7_ZSTD_DEC_USE_BASES_CALC | ||
| 246 | |||
| 247 | #ifdef Z7_ZSTD_DEC_USE_BASES_CALC | ||
| 248 | |||
| 249 | #define FILL_LOC_BASES(n, startSum) \ | ||
| 250 | { unsigned i; UInt32 sum = startSum; \ | ||
| 251 | for (i = 0; i != Z7_ARRAY_SIZE(GLOBAL_TABLE(n ## _EXTRA)); i++) \ | ||
| 252 | { const unsigned a = GLOBAL_TABLE(n ## _EXTRA)[i]; \ | ||
| 253 | BASES_TABLE(n ## _BASES)[i] = sum; \ | ||
| 254 | /* if (sum != GLOBAL_TABLE(n ## _BASES)[i]) exit(1); */ \ | ||
| 255 | sum += (UInt32)1 << a; \ | ||
| 256 | BASES_TABLE(n ## _EXTRA)[i] = (Byte)a; }} | ||
| 257 | |||
| 258 | #define FILL_LOC_BASES_ALL \ | ||
| 259 | FILL_LOC_BASES (SEQ_LL, 0) \ | ||
| 260 | FILL_LOC_BASES (SEQ_ML, MATCH_LEN_MIN) \ | ||
| 261 | |||
| 262 | #else | ||
| 263 | #define COPY_GLOBAL_ARR(n) \ | ||
| 264 | memcpy(BASES_TABLE(n), GLOBAL_TABLE(n), sizeof(GLOBAL_TABLE(n))); | ||
| 265 | #define FILL_LOC_BASES_ALL \ | ||
| 266 | COPY_GLOBAL_ARR (SEQ_LL_EXTRA) \ | ||
| 267 | COPY_GLOBAL_ARR (SEQ_ML_EXTRA) \ | ||
| 268 | COPY_GLOBAL_ARR (SEQ_LL_BASES) \ | ||
| 269 | COPY_GLOBAL_ARR (SEQ_ML_BASES) \ | ||
| 270 | |||
| 271 | #endif | ||
| 272 | |||
| 273 | #endif | ||
| 274 | |||
| 275 | |||
| 276 | |||
| 277 | /// The sequence decoding baseline and number of additional bits to read/add | ||
| 278 | #if !defined(Z7_ZSTD_DEC_USE_BASES_CALC) | ||
| 279 | static const UInt32 GLOBAL_TABLE(SEQ_LL_BASES) [NUM_LL_SYMBOLS] = | ||
| 280 | { | ||
| 281 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, | ||
| 282 | 16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, | ||
| 283 | 0x2000, 0x4000, 0x8000, 0x10000 | ||
| 284 | }; | ||
| 285 | #endif | ||
| 286 | |||
| 287 | static const Byte GLOBAL_TABLE(SEQ_LL_EXTRA) [NUM_LL_SYMBOLS] = | ||
| 288 | { | ||
| 289 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 290 | 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12, | ||
| 291 | 13, 14, 15, 16 | ||
| 292 | }; | ||
| 293 | |||
| 294 | #if !defined(Z7_ZSTD_DEC_USE_BASES_CALC) | ||
| 295 | static const UInt32 GLOBAL_TABLE(SEQ_ML_BASES) [NUM_ML_SYMBOLS] = | ||
| 296 | { | ||
| 297 | 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, | ||
| 298 | 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, | ||
| 299 | 35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, | ||
| 300 | 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 | ||
| 301 | }; | ||
| 302 | #endif | ||
| 303 | |||
| 304 | static const Byte GLOBAL_TABLE(SEQ_ML_EXTRA) [NUM_ML_SYMBOLS] = | ||
| 305 | { | ||
| 306 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 307 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
| 308 | 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, | ||
| 309 | 12, 13, 14, 15, 16 | ||
| 310 | }; | ||
| 311 | |||
| 312 | |||
| 313 | #ifdef Z7_ZSTD_DEC_PRINT_TABLE | ||
| 314 | |||
| 315 | static const Int16 SEQ_LL_PREDEF_DIST [NUM_LL_SYMBOLS] = | ||
| 316 | { | ||
| 317 | 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, | ||
| 318 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1, | ||
| 319 | -1,-1,-1,-1 | ||
| 320 | }; | ||
| 321 | static const Int16 SEQ_OFFSET_PREDEF_DIST [NUM_OFFSET_SYMBOLS_PREDEF] = | ||
| 322 | { | ||
| 323 | 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, | ||
| 324 | 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1 | ||
| 325 | }; | ||
| 326 | static const Int16 SEQ_ML_PREDEF_DIST [NUM_ML_SYMBOLS] = | ||
| 327 | { | ||
| 328 | 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, | ||
| 329 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, | ||
| 330 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1, | ||
| 331 | -1,-1,-1,-1,-1 | ||
| 332 | }; | ||
| 333 | |||
| 334 | #endif | ||
| 335 | |||
// typedef int FastInt;
// typedef Int32 FastInt32;
typedef unsigned FastInt;
typedef UInt32 FastInt32;
/* One FSE decode record packs three fields into 32 bits:
     bits  0..7  : symbol                  (GET_FSE_REC_SYM)
     bits  8..15 : number of bits to read  (GET_FSE_REC_LEN)
     bits 16..31 : next-state base         (GET_FSE_REC_STATE) */
typedef FastInt32 CFseRecord;


#define FSE_REC_LEN_OFFSET 8
#define FSE_REC_STATE_OFFSET 16
#define GET_FSE_REC_SYM(st) ((Byte)(st))
#define GET_FSE_REC_LEN(st) ((Byte)((st) >> FSE_REC_LEN_OFFSET))
#define GET_FSE_REC_STATE(st) ((st) >> FSE_REC_STATE_OFFSET)

// #define FSE_REC_SYM_MASK (0xff)
// #define GET_FSE_REC_SYM(st) (st & FSE_REC_SYM_MASK)

/* W_BASE packs (state * 16) into the state field (low 4 bits of the packed
   state stay zero — Print_Predef() checks that invariant). */
#define W_BASE(state, len, sym) \
    (((UInt32)state << (4 + FSE_REC_STATE_OFFSET)) + \
    (len << FSE_REC_LEN_OFFSET) + (sym))
#define W(state, len, sym) W_BASE(state, len, sym)

/* Precomputed FSE decode tables for the predefined distributions
   (literal lengths: accuracy 6, offsets: accuracy 5, match lengths: accuracy 6).
   Generated by FSE_Generate() from the SEQ_*_PREDEF_DIST tables above
   and verified by Print_Predef() in debug builds. */
static const CFseRecord k_PredefRecords_LL[1 << 6] = {
W(0,4, 0),W(1,4, 0),W(2,5, 1),W(0,5, 3),W(0,5, 4),W(0,5, 6),W(0,5, 7),W(0,5, 9),
W(0,5,10),W(0,5,12),W(0,6,14),W(0,5,16),W(0,5,18),W(0,5,19),W(0,5,21),W(0,5,22),
W(0,5,24),W(2,5,25),W(0,5,26),W(0,6,27),W(0,6,29),W(0,6,31),W(2,4, 0),W(0,4, 1),
W(0,5, 2),W(2,5, 4),W(0,5, 5),W(2,5, 7),W(0,5, 8),W(2,5,10),W(0,5,11),W(0,6,13),
W(2,5,16),W(0,5,17),W(2,5,19),W(0,5,20),W(2,5,22),W(0,5,23),W(0,4,25),W(1,4,25),
W(2,5,26),W(0,6,28),W(0,6,30),W(3,4, 0),W(1,4, 1),W(2,5, 2),W(2,5, 3),W(2,5, 5),
W(2,5, 6),W(2,5, 8),W(2,5, 9),W(2,5,11),W(2,5,12),W(0,6,15),W(2,5,17),W(2,5,18),
W(2,5,20),W(2,5,21),W(2,5,23),W(2,5,24),W(0,6,35),W(0,6,34),W(0,6,33),W(0,6,32)
};
static const CFseRecord k_PredefRecords_OF[1 << 5] = {
W(0,5, 0),W(0,4, 6),W(0,5, 9),W(0,5,15),W(0,5,21),W(0,5, 3),W(0,4, 7),W(0,5,12),
W(0,5,18),W(0,5,23),W(0,5, 5),W(0,4, 8),W(0,5,14),W(0,5,20),W(0,5, 2),W(1,4, 7),
W(0,5,11),W(0,5,17),W(0,5,22),W(0,5, 4),W(1,4, 8),W(0,5,13),W(0,5,19),W(0,5, 1),
W(1,4, 6),W(0,5,10),W(0,5,16),W(0,5,28),W(0,5,27),W(0,5,26),W(0,5,25),W(0,5,24)
};
#if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
// ML symbols are stored pre-biased by MATCH_LEN_MIN in this mode
#undef W
#define W(state, len, sym) W_BASE(state, len, (sym + MATCH_LEN_MIN))
#endif
static const CFseRecord k_PredefRecords_ML[1 << 6] = {
W(0,6, 0),W(0,4, 1),W(2,5, 2),W(0,5, 3),W(0,5, 5),W(0,5, 6),W(0,5, 8),W(0,6,10),
W(0,6,13),W(0,6,16),W(0,6,19),W(0,6,22),W(0,6,25),W(0,6,28),W(0,6,31),W(0,6,33),
W(0,6,35),W(0,6,37),W(0,6,39),W(0,6,41),W(0,6,43),W(0,6,45),W(1,4, 1),W(0,4, 2),
W(2,5, 3),W(0,5, 4),W(2,5, 6),W(0,5, 7),W(0,6, 9),W(0,6,12),W(0,6,15),W(0,6,18),
W(0,6,21),W(0,6,24),W(0,6,27),W(0,6,30),W(0,6,32),W(0,6,34),W(0,6,36),W(0,6,38),
W(0,6,40),W(0,6,42),W(0,6,44),W(2,4, 1),W(3,4, 1),W(1,4, 2),W(2,5, 4),W(2,5, 5),
W(2,5, 7),W(2,5, 8),W(0,6,11),W(0,6,14),W(0,6,17),W(0,6,20),W(0,6,23),W(0,6,26),
W(0,6,29),W(0,6,52),W(0,6,51),W(0,6,50),W(0,6,49),W(0,6,48),W(0,6,47),W(0,6,46)
};
| 386 | |||
| 387 | |||
/* Builds the FSE decode table (size == 1 << accuracy) from symbol
   frequencies.
   freqs[s] : frequency of symbol s; -1 means "less than 1" probability.
   delta    : value subtracted from each stored symbol (used to pre-bias
              ML symbols by MATCH_LEN_MIN in Z7_ZSTD_DEC_USE_ML_PLUS3 mode;
              0 otherwise).
   Preconditions (caller guarantees):
     sum of freqs[] must be correct
     (numSyms != 0)
     (accuracy >= 5) */
// sum of freqs[] must be correct
// (numSyms != 0)
// (accuracy >= 5)
static
Z7_NO_INLINE
// Z7_FORCE_INLINE
void FSE_Generate(CFseRecord *table,
    const Int16 *const freqs, const size_t numSyms,
    const unsigned accuracy, UInt32 delta)
{
  size_t size = (size_t)1 << accuracy;
  // max value in states[x] is ((1 << accuracy) * 2)
  UInt16 states[FSE_NUM_SYMBOLS_MAX];
  {
    /* Symbols with "less than 1" probability get a single cell,
       starting from the end of the table.
       These symbols define a full state reset, reading (accuracy) bits. */
    size_t threshold = size;
    {
      size_t s = 0;
      do
        if (freqs[s] == -1)
        {
          table[--threshold] = (CFseRecord)s;
          states[s] = 1;
        }
      while (++s != numSyms);
    }

#ifdef SHOW_STAT
    if (threshold == size)
    {
      STAT_INC(g_Num_Threshold_0)
      STAT_UPDATE(g_Num_Threshold_0sum += (unsigned)size;)
    }
    else
    {
      STAT_INC(g_Num_Threshold_1)
      STAT_UPDATE(g_Num_Threshold_1sum += (unsigned)size;)
    }
#endif

    // { unsigned uuu; for (uuu = 0; uuu < 400; uuu++)
    {
      // Each (symbol) gets freqs[symbol] cells.
      // Cell allocation is spread, not linear.
      const size_t step = (size >> 1) + (size >> 3) + 3;
      size_t pos = 0;
      // const unsigned mask = size - 1;
      /*
      if (threshold == size)
      {
        size_t s = 0;
        size--;
        do
        {
          int freq = freqs[s];
          if (freq <= 0)
            continue;
          states[s] = (UInt16)freq;
          do
          {
            table[pos] (CFseRecord)s;
            pos = (pos + step) & size; // & mask;
          }
          while (--freq);
        }
        while (++s != numSyms);
      }
      else
      */
      {
        size_t s = 0;
        size--;  // (size) now acts as the position mask
        do
        {
          int freq = freqs[s];
          if (freq <= 0)
            continue;
          states[s] = (UInt16)freq;
          do
          {
            table[pos] = (CFseRecord)s;
            // we skip position, if it's already occupied by a "less than 1" probability symbol.
            // (step) is coprime to table size, so the cycle will visit each position exactly once
            do
              pos = (pos + step) & size; // & mask;
            while (pos >= threshold);
          }
          while (--freq);
        }
        while (++s != numSyms);
      }
      size++;
      // (pos != 0) is unexpected case that means that freqs[] are not correct.
      // so it's some failure in code (for example, incorrect predefined freq[] table)
      // if (pos != 0) return SZ_ERROR_FAIL;
    }
    // }
  }
  /* Second pass: rewrite each cell (currently a bare symbol index) into a
     packed CFseRecord {symbol - delta, nb bits, next-state base}. */
  {
    const CFseRecord * const limit = table + size;
    delta = ((UInt32)size << FSE_REC_STATE_OFFSET) - delta;
    /* State increases by symbol over time, decreasing number of bits.
       Baseline increases until the bit threshold is passed, at which point it resets to 0 */
    do
    {
      #define TABLE_ITER(a) \
      { \
        const FastInt sym = (FastInt)table[a]; \
        const unsigned nextState = states[sym]; \
        unsigned nb; \
        states[sym] = (UInt16)(nextState + 1); \
        nb = accuracy - GetHighestSetBit_32_nonzero_small(nextState); \
        table[a] = (CFseRecord)(sym - delta \
            + ((UInt32)nb << FSE_REC_LEN_OFFSET) \
            + ((UInt32)nextState << FSE_REC_STATE_OFFSET << nb)); \
      }
      // unrolled x2: table size is always a power of 2 (>= 32)
      TABLE_ITER(0)
      TABLE_ITER(1)
      table += 2;
    }
    while (table != limit);
  }
}
| 513 | |||
| 514 | |||
#ifdef Z7_ZSTD_DEC_PRINT_TABLE

/* Debug helper: regenerates an FSE table from a predefined frequency list,
   verifies it is identical to the precomputed (checkTable), and prints it
   as W(state,len,sym) initializer rows (8 per line).
   Exits the process on a table mismatch or if any packed state has its
   low 4 bits set (W_BASE stores state * 16, so those bits must be zero). */
static void Print_Predef(unsigned predefAccuracy,
    const unsigned numSymsPredef,
    const Int16 * const predefFreqs,
    const CFseRecord *checkTable)
{
  CFseRecord table[1 << 6];
  const unsigned tableSize = 1u << predefAccuracy;
  unsigned k;
  FSE_Generate(table, predefFreqs, numSymsPredef, predefAccuracy,
      #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
        numSymsPredef == NUM_ML_SYMBOLS ? MATCH_LEN_MIN :
      #endif
        0
      );
  if (memcmp(table, checkTable, sizeof(UInt32) << predefAccuracy) != 0)
    exit(1);
  for (k = 0; k < tableSize; k++)
  {
    const UInt32 rec = table[k];
    const unsigned packedState = (unsigned)(GET_FSE_REC_STATE(rec));
    if (packedState & 0xf)
      exit(1);
    if (k != 0)
    {
      printf(",");
      if (k % 8 == 0)
        printf("\n");
    }
    printf("W(%d,%d,%2d)",
        (unsigned)(packedState >> 4),
        (unsigned)((rec >> FSE_REC_LEN_OFFSET) & 0xff),
        (unsigned)GET_FSE_REC_SYM(rec));
  }
  printf("\n\n");
}

#endif
| 553 | |||
| 554 | |||
// Unaligned little-endian loads through a temporary pointer.
#define GET16(dest, p) { const Byte *ptr = p; dest = GetUi16(ptr); }
#define GET32(dest, p) { const Byte *ptr = p; dest = GetUi32(ptr); }

/* Forward bit readers for the FSE header stream.
   They use the enclosing scope's (src) and (bitOffset); (bitOffset) is
   negative while inside the stream and (src) points past its end, so
   (bitOffset >> 3 >= 0) signals overrun.
   NOTE: these macros execute `return SZ_ERROR_DATA;` on overrun, so they
   may only be used inside functions returning SRes. */
// (1 <= numBits <= 9)
#define FORWARD_READ_BITS(destVal, numBits, mask) \
  { const CBitCtr_signed bos3 = (bitOffset) >> 3; \
    if (bos3 >= 0) return SZ_ERROR_DATA; \
    GET16(destVal, src + bos3) \
    destVal >>= (bitOffset) & 7; \
    bitOffset += (CBitCtr_signed)(numBits); \
    mask = (1u << (numBits)) - 1; \
    destVal &= mask; \
  }

#define FORWARD_READ_1BIT(destVal) \
  { const CBitCtr_signed bos3 = (bitOffset) >> 3; \
    if (bos3 >= 0) return SZ_ERROR_DATA; \
    destVal = *(src + bos3); \
    destVal >>= (bitOffset) & 7; \
    (bitOffset)++; \
    destVal &= 1; \
  }
| 577 | |||
| 578 | |||
/* Parses an FSE table description (accuracy + normalized frequencies) from
   the front of (in), then builds the decode table via FSE_Generate().
   Returns SZ_ERROR_DATA on any malformed header.
   in: (accuracyMax <= 9)
   at least 2 bytes will be processed from (in) stream.
   at return: (in->len > 0) */
// in: (accuracyMax <= 9)
// at least 2 bytes will be processed from (in) stream.
// at return: (in->len > 0)
static
Z7_NO_INLINE
SRes FSE_DecodeHeader(CFseRecord *const table,
    CInBufPair *const in,
    const unsigned accuracyMax,
    Byte *const accuracyRes,
    unsigned numSymbolsMax)
{
  unsigned accuracy;
  unsigned remain1;
  unsigned syms;
  Int16 freqs[FSE_NUM_SYMBOLS_MAX + 3]; // +3 for overwrite (repeat)
  const Byte *src = in->ptr;
  CBitCtr_signed bitOffset = (CBitCtr_signed)in->len - 1;
  if (bitOffset <= 0)
    return SZ_ERROR_DATA;
  // low 4 bits of the first byte encode (accuracy - 5)
  accuracy = *src & 0xf;
  accuracy += 5;
  if (accuracy > accuracyMax)
    return SZ_ERROR_DATA;
  *accuracyRes = (Byte)accuracy;
  remain1 = (1u << accuracy) + 1; // (it's remain_freqs_sum + 1)
  syms = 0;
  src += bitOffset; // src points to last byte
  // start reading 4 bits into the first byte (after the accuracy nibble);
  // bitOffset stays negative while inside the stream (see FORWARD_READ_BITS)
  bitOffset = 4 - (bitOffset << 3);

  for (;;)
  {
    // (2 <= remain1)
    const unsigned bits = GetHighestSetBit_32_nonzero_small((unsigned)remain1);
    // (1 <= bits <= accuracy)
    unsigned val; // it must be unsigned or int
    unsigned mask;
    FORWARD_READ_BITS(val, bits, mask)
    {
      // variable-length code: small values use (bits) bits; larger values
      // need one extra bit to disambiguate
      const unsigned val2 = remain1 + val - mask;
      if (val2 > mask)
      {
        unsigned bit;
        FORWARD_READ_1BIT(bit)
        if (bit)
          val = val2;
      }
    }
    {
      // (remain1 >= 2)
      // (0 <= (int)val <= remain1)
      val = (unsigned)((int)val - 1);
      // val now is "probability" of symbol
      // (probability == -1) means "less than 1" frequency.
      // (-1 <= (int)val <= remain1 - 1)
      freqs[syms++] = (Int16)(int)val;
      if (val != 0)
      {
        // a "less than 1" symbol still consumes 1 from the remaining sum
        remain1 -= (int)val < 0 ? 1u : (unsigned)val;
        // remain1 -= val;
        // val >>= (sizeof(val) * 8 - 2);
        // remain1 -= val & 2;
        // freqs[syms++] = (Int16)(int)val;
        // syms++;
        if (remain1 == 1)
          break;
        if (syms >= FSE_NUM_SYMBOLS_MAX)
          return SZ_ERROR_DATA;
      }
      else // if (val == 0)
      {
        // zero-probability run: each 2-bit field adds 0..3 zero symbols;
        // value 3 means "continue reading repeat fields".
        // freqs[syms++] = 0;
        // syms++;
        for (;;)
        {
          unsigned repeat;
          FORWARD_READ_BITS(repeat, 2, mask)
          // unconditional 3-slot zero fill relies on the +3 slack in freqs[]
          freqs[syms    ] = 0;
          freqs[syms + 1] = 0;
          freqs[syms + 2] = 0;
          syms += repeat;
          if (syms >= FSE_NUM_SYMBOLS_MAX)
            return SZ_ERROR_DATA;
          if (repeat != 3)
            break;
        }
      }
    }
  }

  if (syms > numSymbolsMax)
    return SZ_ERROR_DATA;
  // round the (negative) bit position up to a byte boundary; any positive
  // remainder means the stream claimed more bytes than it used
  bitOffset += 7;
  bitOffset >>= 3;
  if (bitOffset > 0)
    return SZ_ERROR_DATA;
  in->ptr = src + bitOffset;
  in->len = (size_t)(1 - bitOffset);
  {
    // unsigned uuu; for (uuu = 0; uuu < 50; uuu++)
    FSE_Generate(table, freqs, syms, accuracy,
        #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
          numSymbolsMax == NUM_ML_SYMBOLS ? MATCH_LEN_MIN :
        #endif
          0
        );
  }
  return SZ_OK;
}
| 687 | |||
| 688 | |||
// ---------- HUFFMAN ----------

#define HUF_MAX_BITS 12
#define HUF_MAX_SYMBS 256
/* Padding between the lengths half and the symbols half of the table;
   presumably for cache/set-associativity reasons — see table layout below. */
#define HUF_DUMMY_SIZE (128 + 8 * 2) // it must be a multiple of 8
// #define HUF_DUMMY_SIZE 0
#define HUF_TABLE_SIZE ((2 << HUF_MAX_BITS) + HUF_DUMMY_SIZE)
#define HUF_GET_SYMBOLS(table) ((table) + (1 << HUF_MAX_BITS) + HUF_DUMMY_SIZE)
// #define HUF_GET_LENS(table) (table)

/* Huffman decode table:
   first (1 << HUF_MAX_BITS) bytes hold code lengths, then HUF_DUMMY_SIZE
   padding, then (1 << HUF_MAX_BITS) bytes hold decoded symbols.
   Stored as UInt64 to guarantee 8-byte alignment for the wide stores
   in Huf_Build(). */
typedef struct
{
  // Byte table[HUF_TABLE_SIZE];
  UInt64 table64[HUF_TABLE_SIZE / sizeof(UInt64)];
}
CZstdDecHufTable;
| 705 | |||
/*
Builds the flat (lens + symbols) Huffman decode table from per-symbol code
lengths, so a HUF_MAX_BITS-bit window indexes both arrays directly.
Input:
  numSyms != 0
  (bits) array size must be aligned for 2
  if (numSyms & 1), then bits[numSyms] == 0,
  Huffman tree must be correct before Huf_Build() call:
    (sum (1/2^bits[i]) == 1).
    && (bits[i] <= HUF_MAX_BITS)
*/
static
Z7_FORCE_INLINE
void Huf_Build(Byte * const table,
    const Byte *bits, const unsigned numSyms)
{
  // two counter arrays so the counting loop has no store-forwarding
  // dependency between adjacent symbols
  unsigned counts0[HUF_MAX_BITS + 1];
  unsigned counts1[HUF_MAX_BITS + 1];
  const Byte * const bitsEnd = bits + numSyms;
  // /*
  {
    unsigned t;
    for (t = 0; t < Z7_ARRAY_SIZE(counts0); t++) counts0[t] = 0;
    for (t = 0; t < Z7_ARRAY_SIZE(counts1); t++) counts1[t] = 0;
  }
  // */
  // memset(counts0, 0, sizeof(counts0));
  // memset(counts1, 0, sizeof(counts1));
  {
    const Byte *bits2 = bits;
    // we access additional bits[symbol] if (numSyms & 1)
    do
    {
      counts0[bits2[0]]++;
      counts1[bits2[1]]++;
    }
    while ((bits2 += 2) < bitsEnd);
  }
  /* Fill the lens half: longer codes first, each length (i) occupying
     count[i] << (HUF_MAX_BITS - i) consecutive cells.
     counts0[i] becomes the start offset for codes of length i. */
  {
    unsigned r = 0;
    unsigned i = HUF_MAX_BITS;
    // Byte *lens = HUF_GET_LENS(symbols);
    do
    {
      const unsigned num = (counts0[i] + counts1[i]) << (HUF_MAX_BITS - i);
      counts0[i] = r;
      if (num)
      {
        Byte *lens = &table[r];
        r += num;
        memset(lens, (int)i, num);
      }
    }
    while (--i);
    counts0[0] = 0; // for speculated loads
    // no need for check:
    // if (r != (UInt32)1 << HUF_MAX_BITS) exit(0);
  }
  /* Fill the symbols half: symbol index is replicated across its range of
     cells using the widest store available; (v) holds the current symbol
     repeated in every byte. */
  {
#ifdef MY_CPU_64BIT
    UInt64
#else
    UInt32
#endif
      v = 0;
    Byte *symbols = HUF_GET_SYMBOLS(table);
    do
    {
      const unsigned nb = *bits++;
      if (nb)
      {
        const unsigned code = counts0[nb];
        const unsigned num = (1u << HUF_MAX_BITS) >> nb;
        counts0[nb] = code + num;
        // memset(&symbols[code], i, num);
        // /*
        {
          Byte *s2 = &symbols[code];
          // overlapping stores cover any (num) without a length switch
          if (num <= 2)
          {
            s2[0] = (Byte)v;
            s2[(size_t)num - 1] = (Byte)v;
          }
          else if (num <= 8)
          {
            *(UInt32 *)(void *)s2 = (UInt32)v;
            *(UInt32 *)(void *)(s2 + (size_t)num - 4) = (UInt32)v;
          }
          else
          {
            // num >= 16 here (power of 2), so the unrolled loops are exact
#ifdef MY_CPU_64BIT
            UInt64 *s = (UInt64 *)(void *)s2;
            const UInt64 *lim = (UInt64 *)(void *)(s2 + num);
            do
            {
              s[0] = v; s[1] = v; s += 2;
            }
            while (s != lim);
#else
            UInt32 *s = (UInt32 *)(void *)s2;
            const UInt32 *lim = (const UInt32 *)(const void *)(s2 + num);
            do
            {
              s[0] = v; s[1] = v; s += 2;
              s[0] = v; s[1] = v; s += 2;
            }
            while (s != lim);
#endif
          }
        }
        // */
      }
      // advance the replicated symbol value in every byte lane
      v +=
#ifdef MY_CPU_64BIT
        0x0101010101010101;
#else
        0x01010101;
#endif
    }
    while (bits != bitsEnd);
  }
}
| 826 | |||
| 827 | |||
| 828 | |||
| 829 | // how many bytes (src) was moved back from original value. | ||
| 830 | // we need (HUF_SRC_OFFSET == 3) for optimized 32-bit memory access | ||
| 831 | #define HUF_SRC_OFFSET 3 | ||
| 832 | |||
| 833 | // v <<= 8 - (bitOffset & 7) + numBits; | ||
| 834 | // v >>= 32 - HUF_MAX_BITS; | ||
| 835 | #define HUF_GET_STATE(v, bitOffset, numBits) \ | ||
| 836 | GET32(v, src + (HUF_SRC_OFFSET - 3) + ((CBitCtr_signed)bitOffset >> 3)) \ | ||
| 837 | v >>= 32 - HUF_MAX_BITS - 8 + ((unsigned)bitOffset & 7) - numBits; \ | ||
| 838 | v &= (1u << HUF_MAX_BITS) - 1; \ | ||
| 839 | |||
| 840 | |||
| 841 | #ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS | ||
| 842 | #if defined(MY_CPU_AMD64) && defined(_MSC_VER) && _MSC_VER == 1400 \ | ||
| 843 | || !defined(MY_CPU_X86_OR_AMD64) \ | ||
| 844 | // || 1 == 1 /* for debug : to force STREAM4_PRELOAD mode */ | ||
| 845 | // we need big number (>=16) of registers for PRELOAD4 | ||
| 846 | #define Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4 | ||
| 847 | // #define Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2 // for debug | ||
| 848 | #endif | ||
| 849 | #endif | ||
| 850 | |||
| 851 | // for debug: simpler and smaller code but slow: | ||
| 852 | // #define Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE | ||
| 853 | |||
| 854 | #if defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE) || \ | ||
| 855 | !defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS) | ||
| 856 | |||
| 857 | #define HUF_DECODE(bitOffset, dest) \ | ||
| 858 | { \ | ||
| 859 | UInt32 v; \ | ||
| 860 | HUF_GET_STATE(v, bitOffset, 0) \ | ||
| 861 | bitOffset -= table[v]; \ | ||
| 862 | *(dest) = symbols[v]; \ | ||
| 863 | if ((CBitCtr_signed)bitOffset < 0) return SZ_ERROR_DATA; \ | ||
| 864 | } | ||
| 865 | |||
| 866 | #endif | ||
| 867 | |||
| 868 | #if !defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE) || \ | ||
| 869 | defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4) || \ | ||
| 870 | defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2) \ | ||
| 871 | |||
| 872 | #define HUF_DECODE_2_INIT(v, bitOffset) \ | ||
| 873 | HUF_GET_STATE(v, bitOffset, 0) | ||
| 874 | |||
| 875 | #define HUF_DECODE_2(v, bitOffset, dest) \ | ||
| 876 | { \ | ||
| 877 | unsigned numBits; \ | ||
| 878 | numBits = table[v]; \ | ||
| 879 | *(dest) = symbols[v]; \ | ||
| 880 | HUF_GET_STATE(v, bitOffset, numBits) \ | ||
| 881 | bitOffset -= (CBitCtr)numBits; \ | ||
| 882 | if ((CBitCtr_signed)bitOffset < 0) return SZ_ERROR_DATA; \ | ||
| 883 | } | ||
| 884 | |||
| 885 | #endif | ||
| 886 | |||
| 887 | |||
/* Decodes one Huffman-coded stream of exactly (destLen) symbols into (dest).
   Returns SZ_OK only if the stream is consumed exactly (final bitOffset == 0).
   src == ptr - HUF_SRC_OFFSET
   we are allowed to access 3 bytes before start of input buffer */
// src == ptr - HUF_SRC_OFFSET
// we are allowed to access 3 bytes before start of input buffer
static
Z7_NO_INLINE
SRes Huf_Decompress_1stream(const Byte * const table,
    const Byte *src, const size_t srcLen,
    Byte *dest, const size_t destLen)
{
  CBitCtr bitOffset;
  if (srcLen == 0)
    return SZ_ERROR_DATA;
  // position after the stream's padding bit (streams are read backward)
  SET_bitOffset_TO_PAD (bitOffset, src + HUF_SRC_OFFSET, srcLen)
  if (destLen)
  {
    const Byte *symbols = HUF_GET_SYMBOLS(table);
    const Byte *destLim = dest + destLen;
#ifdef Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE
    {
      do
      {
        HUF_DECODE (bitOffset, dest)
      }
      while (++dest != destLim);
    }
#else
    {
      // pipelined variant: state (v) is preloaded one symbol ahead
      UInt32 v;
      HUF_DECODE_2_INIT (v, bitOffset)
      do
      {
        HUF_DECODE_2 (v, bitOffset, dest)
      }
      while (++dest != destLim);
    }
#endif
  }
  return bitOffset == 0 ? SZ_OK : SZ_ERROR_DATA;
}
| 926 | |||
| 927 | |||
// for debug : it reduces register pressure : by array copy can be slow :
// #define Z7_ZSTD_DEC_USE_HUF_LOCAL

/* Decodes the 4-stream Huffman layout: a 6-byte header holds the sizes of
   the first 3 streams; each of the first 3 streams produces
   delta = (destLen + 3) / 4 symbols, the 4th produces the remainder.
   The streams are decoded interleaved (one symbol per stream per loop
   iteration) to hide table-lookup latency.
   src == ptr + (6 - HUF_SRC_OFFSET)
   srcLen >= 10
   we are allowed to access 3 bytes before start of input buffer */
// src == ptr + (6 - HUF_SRC_OFFSET)
// srcLen >= 10
// we are allowed to access 3 bytes before start of input buffer
static
Z7_NO_INLINE
SRes Huf_Decompress_4stream(const Byte * const
#ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
    table2,
#else
    table,
#endif
    const Byte *src, size_t srcLen,
    Byte *dest, size_t destLen)
{
#ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
  Byte table[HUF_TABLE_SIZE];
#endif
  // sizes[i] is the cumulative end offset of stream i within the payload
  UInt32 sizes[3];
  const size_t delta = (destLen + 3) / 4;
  if ((sizes[0] = GetUi16(src + (0 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
  if ((sizes[1] = GetUi16(src + (2 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
  sizes[1] += sizes[0];
  if ((sizes[2] = GetUi16(src + (4 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
  sizes[2] += sizes[1];
  srcLen -= 6;
  // the 4th stream must be non-empty: first 3 streams must not cover everything
  if (srcLen <= sizes[2])
    return SZ_ERROR_DATA;

#ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
  {
    // unsigned i = 0; for(; i < 1000; i++)
    memcpy(table, table2, HUF_TABLE_SIZE);
  }
#endif

#ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
  {
    CBitCtr bitOffset_0,
            bitOffset_1,
            bitOffset_2,
            bitOffset_3;
    {
      // each offset starts at that stream's padding bit; offsets for
      // streams 1..3 are biased by the preceding streams' bit sizes
      SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_0, src + HUF_SRC_OFFSET, sizes[0])
      SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_1, src + HUF_SRC_OFFSET, sizes[1])
      SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_2, src + HUF_SRC_OFFSET, sizes[2])
      SET_bitOffset_TO_PAD (bitOffset_3, src + HUF_SRC_OFFSET, srcLen)
    }
    {
      const Byte * const symbols = HUF_GET_SYMBOLS(table);
      // main loop emits (destLen - delta * 3) symbols per stream;
      // the (destLen & 3) leftovers are handled after it
      Byte *destLim = dest + destLen - delta * 3;

      if (dest != destLim)
#ifdef Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4
      {
        // 4 preloaded states: needs many registers (see PRELOAD4 selection)
        UInt32 v_0, v_1, v_2, v_3;
        HUF_DECODE_2_INIT (v_0, bitOffset_0)
        HUF_DECODE_2_INIT (v_1, bitOffset_1)
        HUF_DECODE_2_INIT (v_2, bitOffset_2)
        HUF_DECODE_2_INIT (v_3, bitOffset_3)
        // #define HUF_DELTA (1 << 17) / 4
        do
        {
          HUF_DECODE_2 (v_3, bitOffset_3, dest + delta * 3)
          HUF_DECODE_2 (v_2, bitOffset_2, dest + delta * 2)
          HUF_DECODE_2 (v_1, bitOffset_1, dest + delta)
          HUF_DECODE_2 (v_0, bitOffset_0, dest)
        }
        while (++dest != destLim);
        /*
        {// unsigned y = 0; for (;y < 1; y++)
        {
          const size_t num = destLen - delta * 3;
          Byte *orig = dest - num;
          memmove (orig + delta    , orig + HUF_DELTA,     num);
          memmove (orig + delta * 2, orig + HUF_DELTA * 2, num);
          memmove (orig + delta * 3, orig + HUF_DELTA * 3, num);
        }}
        */
      }
#elif defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2)
      {
        // 2 preloaded states: streams (0,1) first, then (2,3)
        UInt32 v_0, v_1, v_2, v_3;
        HUF_DECODE_2_INIT (v_0, bitOffset_0)
        HUF_DECODE_2_INIT (v_1, bitOffset_1)
        do
        {
          HUF_DECODE_2 (v_0, bitOffset_0, dest)
          HUF_DECODE_2 (v_1, bitOffset_1, dest + delta)
        }
        while (++dest != destLim);
        dest = destLim - (destLen - delta * 3);
        dest += delta * 2;
        destLim += delta * 2;
        HUF_DECODE_2_INIT (v_2, bitOffset_2)
        HUF_DECODE_2_INIT (v_3, bitOffset_3)
        do
        {
          HUF_DECODE_2 (v_2, bitOffset_2, dest)
          HUF_DECODE_2 (v_3, bitOffset_3, dest + delta)
        }
        while (++dest != destLim);
        dest -= delta * 2;
        destLim -= delta * 2;
      }
#else
      {
        do
        {
          HUF_DECODE (bitOffset_3, dest + delta * 3)
          HUF_DECODE (bitOffset_2, dest + delta * 2)
          HUF_DECODE (bitOffset_1, dest + delta)
          HUF_DECODE (bitOffset_0, dest)
        }
        while (++dest != destLim);
      }
#endif

      // stream 3 must end exactly at the start of its region
      if (bitOffset_3 != (CBitCtr)sizes[2])
        return SZ_ERROR_DATA;
      if (destLen &= 3)
      {
        // (4 - (destLen & 3)) leftover symbols come only from streams 0..2
        destLim = dest + 4 - destLen;
        do
        {
          HUF_DECODE (bitOffset_2, dest + delta * 2)
          HUF_DECODE (bitOffset_1, dest + delta)
          HUF_DECODE (bitOffset_0, dest)
        }
        while (++dest != destLim);
      }
      // each stream must be consumed exactly to its boundary
      if (   bitOffset_0 != 0
          || bitOffset_1 != (CBitCtr)sizes[0]
          || bitOffset_2 != (CBitCtr)sizes[1])
        return SZ_ERROR_DATA;
    }
  }
#else // Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
  {
    // fallback: decode the 4 streams sequentially with the 1-stream decoder
    unsigned i;
    for (i = 0; i < 4; i++)
    {
      size_t d = destLen;
      size_t size = srcLen;
      if (i != 3)
      {
        d = delta;
        size = sizes[i];
      }
      if (i != 0)
        size -= sizes[i - 1];
      destLen -= d;
      RINOK(Huf_Decompress_1stream(table, src, size, dest, d))
      dest += d;
      src += size;
    }
  }
#endif

  return SZ_OK;
}
| 1091 | |||
| 1092 | |||
| 1093 | |||
/* Reads a Huffman table description from (in) and builds the decode table
   in (p). The weights are either stored directly (4 bits each, header >= 128)
   or FSE-compressed (header < 128); the last weight is implicit.
   (in->len != 0)
   we are allowed to access in->ptr[-3]
   at least 2 bytes in (in->ptr) will be processed */
// (in->len != 0)
// we are allowed to access in->ptr[-3]
// at least 2 bytes in (in->ptr) will be processed
static SRes Huf_DecodeTable(CZstdDecHufTable *const p, CInBufPair *const in)
{
  Byte weights[HUF_MAX_SYMBS + 1]; // +1 for extra write for loop unroll
  unsigned numSyms;
  const unsigned header = *(in->ptr)++;
  in->len--;
  // memset(weights, 0, sizeof(weights));
  if (header >= 128)
  {
    // direct representation: 4 bits field (0-15) per weight
    numSyms = header - 127;
    // numSyms != 0
    {
      const size_t numBytes = (numSyms + 1) / 2;
      const Byte *const ws = in->ptr;
      size_t i = 0;
      if (in->len < numBytes)
        return SZ_ERROR_DATA;
      in->ptr += numBytes;
      in->len -= numBytes;
      do
      {
        // high nibble first; if (numSyms & 1), one extra weight is written
        // into the +1 slack slot of weights[]
        const unsigned b = ws[i];
        weights[i * 2    ] = (Byte)(b >> 4);
        weights[i * 2 + 1] = (Byte)(b & 0xf);
      }
      while (++i != numBytes);
      /* 7ZIP: we can restore correct zero value for weights[numSyms],
         if we want to use zero values starting from numSyms in code below. */
      // weights[numSyms] = 0;
    }
  }
  else
  {
    // FSE-compressed weights: (header) is the byte size of the FSE stream
#define MAX_ACCURACY_LOG_FOR_WEIGHTS 6
    CFseRecord table[1 << MAX_ACCURACY_LOG_FOR_WEIGHTS];

    Byte accuracy;
    const Byte *src;
    size_t srcLen;
    if (in->len < header)
      return SZ_ERROR_DATA;
    {
      CInBufPair fse_stream;
      fse_stream.len = header;
      fse_stream.ptr = in->ptr;
      in->ptr += header;
      in->len -= header;
      RINOK(FSE_DecodeHeader(table, &fse_stream,
          MAX_ACCURACY_LOG_FOR_WEIGHTS,
          &accuracy,
          16 // num weight symbols max (max-symbol is 15)
          ))
      // at least 2 bytes were processed in fse_stream.
      // (srcLen > 0) after FSE_DecodeHeader()
      // if (srcLen == 0) return SZ_ERROR_DATA;
      src = fse_stream.ptr;
      srcLen = fse_stream.len;
    }
    // we are allowed to access src[-5]
    {
      // two interleaved FSE states decode alternating weights
      // unsigned yyy = 200; do {
      CBitCtr bitOffset;
      FastInt32 state1, state2;
      SET_bitOffset_TO_PAD (bitOffset, src, srcLen)
      state1 = accuracy;
      src -= state1 >> 2; // src -= 1; // for GET16() optimization
      state1 <<= FSE_REC_LEN_OFFSET;
      state2 = state1;
      numSyms = 0;
      for (;;)
      {
        // read bits for the state transition; the loop exits (break) when
        // the stream is exactly exhausted, errors if it underflows
#define FSE_WEIGHT_DECODE(st) \
        { \
          const unsigned bits = GET_FSE_REC_LEN(st); \
          FastInt r; \
          GET16(r, src + (bitOffset >> 3)) \
          r >>= (unsigned)bitOffset & 7; \
          if ((CBitCtr_signed)(bitOffset -= (CBitCtr)bits) < 0) \
          { if (bitOffset + (CBitCtr)bits != 0) \
              return SZ_ERROR_DATA; \
            break; } \
          r &= 0xff; \
          r >>= 8 - bits; \
          st = table[GET_FSE_REC_STATE(st) + r]; \
          weights[numSyms++] = (Byte)GET_FSE_REC_SYM(st); \
        }
        FSE_WEIGHT_DECODE (state1)
        FSE_WEIGHT_DECODE (state2)
        if (numSyms == HUF_MAX_SYMBS)
          return SZ_ERROR_DATA;
      }
      // src += (unsigned)accuracy >> 2; } while (--yyy);
    }
  }

  // Build using weights:
  {
    // sum of 2^weight over all non-zero weights
    UInt32 sum = 0;
    {
      // numSyms >= 1
      unsigned i = 0;
      weights[numSyms] = 0;
      do
      {
        // (1 << 0) & ~1 == 0, so zero weights contribute nothing
        sum += ((UInt32)1 << weights[i    ]) & ~(UInt32)1;
        sum += ((UInt32)1 << weights[i + 1]) & ~(UInt32)1;
        i += 2;
      }
      while (i < numSyms);
      if (sum == 0)
        return SZ_ERROR_DATA;
    }
    {
      const unsigned maxBits = GetHighestSetBit_32_nonzero_big(sum) + 1;
      {
        // the implicit last weight fills the tree to a full power of 2
        const UInt32 left = ((UInt32)1 << maxBits) - sum;
        // (left != 0)
        // (left) must be power of 2 in correct stream
        if (left & (left - 1))
          return SZ_ERROR_DATA;
        weights[numSyms++] = (Byte)GetHighestSetBit_32_nonzero_big(left);
      }
      // if (numSyms & 1)
      weights[numSyms] = 0; // for loop unroll
      // numSyms >= 2
      {
        // convert weights to code lengths: len = maxBits - weight (0 stays 0)
        unsigned i = 0;
        do
        {
          /*
#define WEIGHT_ITER(a) \
          { unsigned w = weights[i + (a)]; \
            const unsigned t = maxBits - w; \
            w = w ? t: w; \
            if (w > HUF_MAX_BITS) return SZ_ERROR_DATA; \
            weights[i + (a)] = (Byte)w; }
          */
          // /*
#define WEIGHT_ITER(a) \
          { unsigned w = weights[i + (a)]; \
            if (w) { \
              w = maxBits - w; \
              if (w > HUF_MAX_BITS) return SZ_ERROR_DATA; \
              weights[i + (a)] = (Byte)w; }}
          // */
          WEIGHT_ITER(0)
          // WEIGHT_ITER(1)
          // i += 2;
        }
        while (++i != numSyms);
      }
    }
  }
  {
    // unsigned yyy; for (yyy = 0; yyy < 100; yyy++)
    Huf_Build((Byte *)(void *)p->table64, weights, numSyms);
  }
  return SZ_OK;
}
| 1257 | |||
| 1258 | |||
| 1259 | typedef enum | ||
| 1260 | { | ||
| 1261 | k_SeqMode_Predef = 0, | ||
| 1262 | k_SeqMode_RLE = 1, | ||
| 1263 | k_SeqMode_FSE = 2, | ||
| 1264 | k_SeqMode_Repeat = 3 | ||
| 1265 | } | ||
| 1266 | z7_zstd_enum_SeqMode; | ||
| 1267 | |||
| 1268 | // predefAccuracy == 5 for OFFSET symbols | ||
| 1269 | // predefAccuracy == 6 for MATCH/LIT LEN symbols | ||
| 1270 | static | ||
| 1271 | SRes | ||
| 1272 | Z7_NO_INLINE | ||
| 1273 | // Z7_FORCE_INLINE | ||
| 1274 | FSE_Decode_SeqTable(CFseRecord * const table, | ||
| 1275 | CInBufPair * const in, | ||
| 1276 | unsigned predefAccuracy, | ||
| 1277 | Byte * const accuracyRes, | ||
| 1278 | unsigned numSymbolsMax, | ||
| 1279 | const CFseRecord * const predefs, | ||
| 1280 | const unsigned seqMode) | ||
| 1281 | { | ||
| 1282 | // UNUSED_VAR(numSymsPredef) | ||
| 1283 | // UNUSED_VAR(predefFreqs) | ||
| 1284 | if (seqMode == k_SeqMode_FSE) | ||
| 1285 | { | ||
| 1286 | // unsigned y = 50; CInBufPair in2 = *in; do { *in = in2; RINOK( | ||
| 1287 | return | ||
| 1288 | FSE_DecodeHeader(table, in, | ||
| 1289 | predefAccuracy + 3, // accuracyMax | ||
| 1290 | accuracyRes, | ||
| 1291 | numSymbolsMax) | ||
| 1292 | ; | ||
| 1293 | // )} while (--y); return SZ_OK; | ||
| 1294 | } | ||
| 1295 | // numSymsMax = numSymsPredef + ((predefAccuracy & 1) * (32 - 29))); // numSymsMax | ||
| 1296 | // numSymsMax == 32 for offsets | ||
| 1297 | |||
| 1298 | if (seqMode == k_SeqMode_Predef) | ||
| 1299 | { | ||
| 1300 | *accuracyRes = (Byte)predefAccuracy; | ||
| 1301 | memcpy(table, predefs, sizeof(UInt32) << predefAccuracy); | ||
| 1302 | return SZ_OK; | ||
| 1303 | } | ||
| 1304 | |||
| 1305 | // (seqMode == k_SeqMode_RLE) | ||
| 1306 | if (in->len == 0) | ||
| 1307 | return SZ_ERROR_DATA; | ||
| 1308 | in->len--; | ||
| 1309 | { | ||
| 1310 | const Byte *ptr = in->ptr; | ||
| 1311 | const Byte sym = ptr[0]; | ||
| 1312 | in->ptr = ptr + 1; | ||
| 1313 | table[0] = (FastInt32)sym | ||
| 1314 | #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3) | ||
| 1315 | + (numSymbolsMax == NUM_ML_SYMBOLS ? MATCH_LEN_MIN : 0) | ||
| 1316 | #endif | ||
| 1317 | ; | ||
| 1318 | *accuracyRes = 0; | ||
| 1319 | } | ||
| 1320 | return SZ_OK; | ||
| 1321 | } | ||
| 1322 | |||
| 1323 | |||
| 1324 | typedef struct | ||
| 1325 | { | ||
| 1326 | CFseRecord of[1 << 8]; | ||
| 1327 | CFseRecord ll[1 << 9]; | ||
| 1328 | CFseRecord ml[1 << 9]; | ||
| 1329 | } | ||
| 1330 | CZstdDecFseTables; | ||
| 1331 | |||
| 1332 | |||
| 1333 | typedef struct | ||
| 1334 | { | ||
| 1335 | Byte *win; | ||
| 1336 | SizeT cycSize; | ||
| 1337 | /* | ||
| 1338 | if (outBuf_fromCaller) : cycSize = outBufSize_fromCaller | ||
| 1339 | else { | ||
| 1340 | if ( isCyclicMode) : cycSize = cyclic_buffer_size = (winSize + extra_space) | ||
| 1341 | if (!isCyclicMode) : cycSize = ContentSize, | ||
| 1342 | (isCyclicMode == true) if (ContetSize >= winSize) or ContetSize is unknown | ||
| 1343 | } | ||
| 1344 | */ | ||
| 1345 | SizeT winPos; | ||
| 1346 | |||
| 1347 | CZstdDecOffset reps[3]; | ||
| 1348 | |||
| 1349 | Byte ll_accuracy; | ||
| 1350 | Byte of_accuracy; | ||
| 1351 | Byte ml_accuracy; | ||
| 1352 | // Byte seqTables_wereSet; | ||
| 1353 | Byte litHuf_wasSet; | ||
| 1354 | |||
| 1355 | Byte *literalsBase; | ||
| 1356 | |||
| 1357 | size_t winSize; // from header | ||
| 1358 | size_t totalOutCheck; // totalOutCheck <= winSize | ||
| 1359 | |||
| 1360 | #ifdef Z7_ZSTD_DEC_USE_BASES_IN_OBJECT | ||
| 1361 | SEQ_EXTRA_TABLES(m_) | ||
| 1362 | #endif | ||
| 1363 | // UInt64 _pad_Alignment; // is not required now | ||
| 1364 | CZstdDecFseTables fse; | ||
| 1365 | CZstdDecHufTable huf; | ||
| 1366 | } | ||
| 1367 | CZstdDec1; | ||
| 1368 | |||
| 1369 | #define ZstdDec1_GET_BLOCK_SIZE_LIMIT(p) \ | ||
| 1370 | ((p)->winSize < kBlockSizeMax ? (UInt32)(p)->winSize : kBlockSizeMax) | ||
| 1371 | |||
| 1372 | #define SEQ_TABLES_WERE_NOT_SET_ml_accuracy 1 // accuracy=1 is not used by zstd | ||
| 1373 | #define IS_SEQ_TABLES_WERE_SET(p) (((p)->ml_accuracy != SEQ_TABLES_WERE_NOT_SET_ml_accuracy)) | ||
| 1374 | // #define IS_SEQ_TABLES_WERE_SET(p) ((p)->seqTables_wereSet) | ||
| 1375 | |||
| 1376 | |||
| 1377 | static void ZstdDec1_Construct(CZstdDec1 *p) | ||
| 1378 | { | ||
| 1379 | #ifdef Z7_ZSTD_DEC_PRINT_TABLE | ||
| 1380 | Print_Predef(6, NUM_LL_SYMBOLS, SEQ_LL_PREDEF_DIST, k_PredefRecords_LL); | ||
| 1381 | Print_Predef(5, NUM_OFFSET_SYMBOLS_PREDEF, SEQ_OFFSET_PREDEF_DIST, k_PredefRecords_OF); | ||
| 1382 | Print_Predef(6, NUM_ML_SYMBOLS, SEQ_ML_PREDEF_DIST, k_PredefRecords_ML); | ||
| 1383 | #endif | ||
| 1384 | |||
| 1385 | p->win = NULL; | ||
| 1386 | p->cycSize = 0; | ||
| 1387 | p->literalsBase = NULL; | ||
| 1388 | #ifdef Z7_ZSTD_DEC_USE_BASES_IN_OBJECT | ||
| 1389 | FILL_LOC_BASES_ALL | ||
| 1390 | #endif | ||
| 1391 | } | ||
| 1392 | |||
| 1393 | |||
| 1394 | static void ZstdDec1_Init(CZstdDec1 *p) | ||
| 1395 | { | ||
| 1396 | p->reps[0] = 1; | ||
| 1397 | p->reps[1] = 4; | ||
| 1398 | p->reps[2] = 8; | ||
| 1399 | // p->seqTables_wereSet = False; | ||
| 1400 | p->ml_accuracy = SEQ_TABLES_WERE_NOT_SET_ml_accuracy; | ||
| 1401 | p->litHuf_wasSet = False; | ||
| 1402 | p->totalOutCheck = 0; | ||
| 1403 | } | ||
| 1404 | |||
| 1405 | |||
| 1406 | |||
| 1407 | #ifdef MY_CPU_LE_UNALIGN | ||
| 1408 | #define Z7_ZSTD_DEC_USE_UNALIGNED_COPY | ||
| 1409 | #endif | ||
| 1410 | |||
| 1411 | #ifdef Z7_ZSTD_DEC_USE_UNALIGNED_COPY | ||
| 1412 | |||
| 1413 | #define COPY_CHUNK_SIZE 16 | ||
| 1414 | |||
| 1415 | #define COPY_CHUNK_4_2(dest, src) \ | ||
| 1416 | { \ | ||
| 1417 | ((UInt32 *)(void *)dest)[0] = ((const UInt32 *)(const void *)src)[0]; \ | ||
| 1418 | ((UInt32 *)(void *)dest)[1] = ((const UInt32 *)(const void *)src)[1]; \ | ||
| 1419 | src += 4 * 2; \ | ||
| 1420 | dest += 4 * 2; \ | ||
| 1421 | } | ||
| 1422 | |||
| 1423 | /* sse2 doesn't help here in GCC and CLANG. | ||
| 1424 | so we disabled sse2 here */ | ||
| 1425 | /* | ||
| 1426 | #if defined(MY_CPU_AMD64) | ||
| 1427 | #define Z7_ZSTD_DEC_USE_SSE2 | ||
| 1428 | #elif defined(MY_CPU_X86) | ||
| 1429 | #if defined(_MSC_VER) && _MSC_VER >= 1300 && defined(_M_IX86_FP) && (_M_IX86_FP >= 2) \ | ||
| 1430 | || defined(__SSE2__) \ | ||
| 1431 | // || 1 == 1 // for debug only | ||
| 1432 | #define Z7_ZSTD_DEC_USE_SSE2 | ||
| 1433 | #endif | ||
| 1434 | #endif | ||
| 1435 | */ | ||
| 1436 | |||
| 1437 | #if defined(MY_CPU_ARM64) | ||
| 1438 | #define COPY_OFFSET_MIN 16 | ||
| 1439 | #define COPY_CHUNK1(dest, src) \ | ||
| 1440 | { \ | ||
| 1441 | vst1q_u8((uint8_t *)(void *)dest, \ | ||
| 1442 | vld1q_u8((const uint8_t *)(const void *)src)); \ | ||
| 1443 | src += 16; \ | ||
| 1444 | dest += 16; \ | ||
| 1445 | } | ||
| 1446 | |||
| 1447 | #define COPY_CHUNK(dest, src) \ | ||
| 1448 | { \ | ||
| 1449 | COPY_CHUNK1(dest, src) \ | ||
| 1450 | if ((len -= COPY_CHUNK_SIZE) == 0) break; \ | ||
| 1451 | COPY_CHUNK1(dest, src) \ | ||
| 1452 | } | ||
| 1453 | |||
| 1454 | #elif defined(Z7_ZSTD_DEC_USE_SSE2) | ||
| 1455 | #include <emmintrin.h> // sse2 | ||
| 1456 | #define COPY_OFFSET_MIN 16 | ||
| 1457 | |||
| 1458 | #define COPY_CHUNK1(dest, src) \ | ||
| 1459 | { \ | ||
| 1460 | _mm_storeu_si128((__m128i *)(void *)dest, \ | ||
| 1461 | _mm_loadu_si128((const __m128i *)(const void *)src)); \ | ||
| 1462 | src += 16; \ | ||
| 1463 | dest += 16; \ | ||
| 1464 | } | ||
| 1465 | |||
| 1466 | #define COPY_CHUNK(dest, src) \ | ||
| 1467 | { \ | ||
| 1468 | COPY_CHUNK1(dest, src) \ | ||
| 1469 | if ((len -= COPY_CHUNK_SIZE) == 0) break; \ | ||
| 1470 | COPY_CHUNK1(dest, src) \ | ||
| 1471 | } | ||
| 1472 | |||
| 1473 | #elif defined(MY_CPU_64BIT) | ||
| 1474 | #define COPY_OFFSET_MIN 8 | ||
| 1475 | |||
| 1476 | #define COPY_CHUNK(dest, src) \ | ||
| 1477 | { \ | ||
| 1478 | ((UInt64 *)(void *)dest)[0] = ((const UInt64 *)(const void *)src)[0]; \ | ||
| 1479 | ((UInt64 *)(void *)dest)[1] = ((const UInt64 *)(const void *)src)[1]; \ | ||
| 1480 | src += 8 * 2; \ | ||
| 1481 | dest += 8 * 2; \ | ||
| 1482 | } | ||
| 1483 | |||
| 1484 | #else | ||
| 1485 | #define COPY_OFFSET_MIN 4 | ||
| 1486 | |||
| 1487 | #define COPY_CHUNK(dest, src) \ | ||
| 1488 | { \ | ||
| 1489 | COPY_CHUNK_4_2(dest, src); \ | ||
| 1490 | COPY_CHUNK_4_2(dest, src); \ | ||
| 1491 | } | ||
| 1492 | |||
| 1493 | #endif | ||
| 1494 | #endif | ||
| 1495 | |||
| 1496 | |||
| 1497 | #ifndef COPY_CHUNK_SIZE | ||
| 1498 | #define COPY_OFFSET_MIN 4 | ||
| 1499 | #define COPY_CHUNK_SIZE 8 | ||
| 1500 | #define COPY_CHUNK_2(dest, src) \ | ||
| 1501 | { \ | ||
| 1502 | const Byte a0 = src[0]; \ | ||
| 1503 | const Byte a1 = src[1]; \ | ||
| 1504 | dest[0] = a0; \ | ||
| 1505 | dest[1] = a1; \ | ||
| 1506 | src += 2; \ | ||
| 1507 | dest += 2; \ | ||
| 1508 | } | ||
| 1509 | #define COPY_CHUNK(dest, src) \ | ||
| 1510 | { \ | ||
| 1511 | COPY_CHUNK_2(dest, src) \ | ||
| 1512 | COPY_CHUNK_2(dest, src) \ | ||
| 1513 | COPY_CHUNK_2(dest, src) \ | ||
| 1514 | COPY_CHUNK_2(dest, src) \ | ||
| 1515 | } | ||
| 1516 | #endif | ||
| 1517 | |||
| 1518 | |||
| 1519 | #define COPY_PREPARE \ | ||
| 1520 | len += (COPY_CHUNK_SIZE - 1); \ | ||
| 1521 | len &= ~(size_t)(COPY_CHUNK_SIZE - 1); \ | ||
| 1522 | { if (len > rem) \ | ||
| 1523 | { len = rem; \ | ||
| 1524 | rem &= (COPY_CHUNK_SIZE - 1); \ | ||
| 1525 | if (rem) { \ | ||
| 1526 | len -= rem; \ | ||
| 1527 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \ | ||
| 1528 | do *dest++ = *src++; while (--rem); \ | ||
| 1529 | if (len == 0) return; }}} | ||
| 1530 | |||
| 1531 | #define COPY_CHUNKS \ | ||
| 1532 | { \ | ||
| 1533 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \ | ||
| 1534 | do { COPY_CHUNK(dest, src) } \ | ||
| 1535 | while (len -= COPY_CHUNK_SIZE); \ | ||
| 1536 | } | ||
| 1537 | |||
| 1538 | // (len != 0) | ||
| 1539 | // (len <= rem) | ||
| 1540 | static | ||
| 1541 | Z7_FORCE_INLINE | ||
| 1542 | // Z7_ATTRIB_NO_VECTOR | ||
| 1543 | void CopyLiterals(Byte *dest, Byte const *src, size_t len, size_t rem) | ||
| 1544 | { | ||
| 1545 | COPY_PREPARE | ||
| 1546 | COPY_CHUNKS | ||
| 1547 | } | ||
| 1548 | |||
| 1549 | |||
| 1550 | /* we can define Z7_STD_DEC_USE_AFTER_CYC_BUF, if we want to use additional | ||
| 1551 | space after cycSize that can be used to reduce the code in CopyMatch(): */ | ||
| 1552 | // for debug: | ||
| 1553 | // #define Z7_STD_DEC_USE_AFTER_CYC_BUF | ||
| 1554 | |||
| 1555 | /* | ||
| 1556 | CopyMatch() | ||
| 1557 | if wrap (offset > winPos) | ||
| 1558 | { | ||
| 1559 | then we have at least (COPY_CHUNK_SIZE) avail in (dest) before we will overwrite (src): | ||
| 1560 | (cycSize >= offset + COPY_CHUNK_SIZE) | ||
| 1561 | if defined(Z7_STD_DEC_USE_AFTER_CYC_BUF) | ||
| 1562 | we are allowed to read win[cycSize + COPY_CHUNK_SIZE - 1], | ||
| 1563 | } | ||
| 1564 | (len != 0) | ||
| 1565 | */ | ||
| 1566 | static | ||
| 1567 | Z7_FORCE_INLINE | ||
| 1568 | // Z7_ATTRIB_NO_VECTOR | ||
| 1569 | void CopyMatch(size_t offset, size_t len, | ||
| 1570 | Byte *win, size_t winPos, size_t rem, const size_t cycSize) | ||
| 1571 | { | ||
| 1572 | Byte *dest = win + winPos; | ||
| 1573 | const Byte *src; | ||
| 1574 | // STAT_INC(g_NumCopy) | ||
| 1575 | |||
| 1576 | if (offset > winPos) | ||
| 1577 | { | ||
| 1578 | size_t back = offset - winPos; | ||
| 1579 | // src = win + cycSize - back; | ||
| 1580 | // cycSize -= offset; | ||
| 1581 | STAT_INC(g_NumOver) | ||
| 1582 | src = dest + (cycSize - offset); | ||
| 1583 | // (src >= dest) here | ||
| 1584 | #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF | ||
| 1585 | if (back < len) | ||
| 1586 | { | ||
| 1587 | #else | ||
| 1588 | if (back < len + (COPY_CHUNK_SIZE - 1)) | ||
| 1589 | { | ||
| 1590 | if (back >= len) | ||
| 1591 | { | ||
| 1592 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
| 1593 | do | ||
| 1594 | *dest++ = *src++; | ||
| 1595 | while (--len); | ||
| 1596 | return; | ||
| 1597 | } | ||
| 1598 | #endif | ||
| 1599 | // back < len | ||
| 1600 | STAT_INC(g_NumOver2) | ||
| 1601 | len -= back; | ||
| 1602 | rem -= back; | ||
| 1603 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
| 1604 | do | ||
| 1605 | *dest++ = *src++; | ||
| 1606 | while (--back); | ||
| 1607 | src = dest - offset; | ||
| 1608 | // src = win; | ||
| 1609 | // we go to MAIN-COPY | ||
| 1610 | } | ||
| 1611 | } | ||
| 1612 | else | ||
| 1613 | src = dest - offset; | ||
| 1614 | |||
| 1615 | // len != 0 | ||
| 1616 | // do *dest++ = *src++; while (--len); return; | ||
| 1617 | |||
| 1618 | // --- MAIN COPY --- | ||
| 1619 | // if (src >= dest), then ((size_t)(src - dest) >= COPY_CHUNK_SIZE) | ||
| 1620 | // so we have at least COPY_CHUNK_SIZE space before overlap for writing. | ||
| 1621 | COPY_PREPARE | ||
| 1622 | |||
| 1623 | /* now (len == COPY_CHUNK_SIZE * x) | ||
| 1624 | so we can unroll for aligned copy */ | ||
| 1625 | { | ||
| 1626 | // const unsigned b0 = src[0]; | ||
| 1627 | // (COPY_OFFSET_MIN >= 4) | ||
| 1628 | |||
| 1629 | if (offset >= COPY_OFFSET_MIN) | ||
| 1630 | { | ||
| 1631 | COPY_CHUNKS | ||
| 1632 | // return; | ||
| 1633 | } | ||
| 1634 | else | ||
| 1635 | #if (COPY_OFFSET_MIN > 4) | ||
| 1636 | #if COPY_CHUNK_SIZE < 8 | ||
| 1637 | #error Stop_Compiling_Bad_COPY_CHUNK_SIZE | ||
| 1638 | #endif | ||
| 1639 | if (offset >= 4) | ||
| 1640 | { | ||
| 1641 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
| 1642 | do | ||
| 1643 | { | ||
| 1644 | COPY_CHUNK_4_2(dest, src) | ||
| 1645 | #if COPY_CHUNK_SIZE != 16 | ||
| 1646 | if (len == 8) break; | ||
| 1647 | #endif | ||
| 1648 | COPY_CHUNK_4_2(dest, src) | ||
| 1649 | } | ||
| 1650 | while (len -= 16); | ||
| 1651 | // return; | ||
| 1652 | } | ||
| 1653 | else | ||
| 1654 | #endif | ||
| 1655 | { | ||
| 1656 | // (offset < 4) | ||
| 1657 | const unsigned b0 = src[0]; | ||
| 1658 | if (offset < 2) | ||
| 1659 | { | ||
| 1660 | #if defined(Z7_ZSTD_DEC_USE_UNALIGNED_COPY) && (COPY_CHUNK_SIZE == 16) | ||
| 1661 | #if defined(MY_CPU_64BIT) | ||
| 1662 | { | ||
| 1663 | const UInt64 v64 = (UInt64)b0 * 0x0101010101010101; | ||
| 1664 | Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE | ||
| 1665 | do | ||
| 1666 | { | ||
| 1667 | ((UInt64 *)(void *)dest)[0] = v64; | ||
| 1668 | ((UInt64 *)(void *)dest)[1] = v64; | ||
| 1669 | dest += 16; | ||
| 1670 | } | ||
| 1671 | while (len -= 16); | ||
| 1672 | } | ||
| 1673 | #else | ||
| 1674 | { | ||
| 1675 | UInt32 v = b0; | ||
| 1676 | v |= v << 8; | ||
| 1677 | v |= v << 16; | ||
| 1678 | do | ||
| 1679 | { | ||
| 1680 | ((UInt32 *)(void *)dest)[0] = v; | ||
| 1681 | ((UInt32 *)(void *)dest)[1] = v; | ||
| 1682 | dest += 8; | ||
| 1683 | ((UInt32 *)(void *)dest)[0] = v; | ||
| 1684 | ((UInt32 *)(void *)dest)[1] = v; | ||
| 1685 | dest += 8; | ||
| 1686 | } | ||
| 1687 | while (len -= 16); | ||
| 1688 | } | ||
| 1689 | #endif | ||
| 1690 | #else | ||
| 1691 | do | ||
| 1692 | { | ||
| 1693 | dest[0] = (Byte)b0; | ||
| 1694 | dest[1] = (Byte)b0; | ||
| 1695 | dest += 2; | ||
| 1696 | dest[0] = (Byte)b0; | ||
| 1697 | dest[1] = (Byte)b0; | ||
| 1698 | dest += 2; | ||
| 1699 | } | ||
| 1700 | while (len -= 4); | ||
| 1701 | #endif | ||
| 1702 | } | ||
| 1703 | else if (offset == 2) | ||
| 1704 | { | ||
| 1705 | const Byte b1 = src[1]; | ||
| 1706 | { | ||
| 1707 | do | ||
| 1708 | { | ||
| 1709 | dest[0] = (Byte)b0; | ||
| 1710 | dest[1] = b1; | ||
| 1711 | dest += 2; | ||
| 1712 | } | ||
| 1713 | while (len -= 2); | ||
| 1714 | } | ||
| 1715 | } | ||
| 1716 | else // (offset == 3) | ||
| 1717 | { | ||
| 1718 | const Byte *lim = dest + len - 2; | ||
| 1719 | const Byte b1 = src[1]; | ||
| 1720 | const Byte b2 = src[2]; | ||
| 1721 | do | ||
| 1722 | { | ||
| 1723 | dest[0] = (Byte)b0; | ||
| 1724 | dest[1] = b1; | ||
| 1725 | dest[2] = b2; | ||
| 1726 | dest += 3; | ||
| 1727 | } | ||
| 1728 | while (dest < lim); | ||
| 1729 | lim++; // points to last byte that must be written | ||
| 1730 | if (dest <= lim) | ||
| 1731 | { | ||
| 1732 | *dest = (Byte)b0; | ||
| 1733 | if (dest != lim) | ||
| 1734 | dest[1] = b1; | ||
| 1735 | } | ||
| 1736 | } | ||
| 1737 | } | ||
| 1738 | } | ||
| 1739 | } | ||
| 1740 | |||
| 1741 | |||
| 1742 | |||
| 1743 | #define UPDATE_TOTAL_OUT(p, size) \ | ||
| 1744 | { \ | ||
| 1745 | size_t _toc = (p)->totalOutCheck + (size); \ | ||
| 1746 | const size_t _ws = (p)->winSize; \ | ||
| 1747 | if (_toc >= _ws) _toc = _ws; \ | ||
| 1748 | (p)->totalOutCheck = _toc; \ | ||
| 1749 | } | ||
| 1750 | |||
| 1751 | |||
| 1752 | #if defined(MY_CPU_64BIT) && defined(MY_CPU_LE_UNALIGN) | ||
| 1753 | // we can disable it for debug: | ||
| 1754 | #define Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
| 1755 | #endif | ||
| 1756 | // #define Z7_ZSTD_DEC_USE_64BIT_LOADS // for debug : slow in 32-bit | ||
| 1757 | |||
| 1758 | // SEQ_SRC_OFFSET: how many bytes (src) (seqSrc) was moved back from original value. | ||
| 1759 | // we need (SEQ_SRC_OFFSET != 0) for optimized memory access | ||
| 1760 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
| 1761 | #define SEQ_SRC_OFFSET 7 | ||
| 1762 | #else | ||
| 1763 | #define SEQ_SRC_OFFSET 3 | ||
| 1764 | #endif | ||
| 1765 | #define SRC_PLUS_FOR_4BYTES(bitOffset) (SEQ_SRC_OFFSET - 3) + ((CBitCtr_signed)(bitOffset) >> 3) | ||
| 1766 | #define BIT_OFFSET_7BITS(bitOffset) ((unsigned)(bitOffset) & 7) | ||
| 1767 | /* | ||
| 1768 | if (BIT_OFFSET_DELTA_BITS == 0) : bitOffset == number_of_unprocessed_bits | ||
| 1769 | if (BIT_OFFSET_DELTA_BITS == 1) : bitOffset == number_of_unprocessed_bits - 1 | ||
| 1770 | and we can read 1 bit more in that mode : (8 * n + 1). | ||
| 1771 | */ | ||
| 1772 | // #define BIT_OFFSET_DELTA_BITS 0 | ||
| 1773 | #define BIT_OFFSET_DELTA_BITS 1 | ||
| 1774 | #if BIT_OFFSET_DELTA_BITS == 1 | ||
| 1775 | #define GET_SHIFT_FROM_BOFFS7(boff7) (7 ^ (boff7)) | ||
| 1776 | #else | ||
| 1777 | #define GET_SHIFT_FROM_BOFFS7(boff7) (8 - BIT_OFFSET_DELTA_BITS - (boff7)) | ||
| 1778 | #endif | ||
| 1779 | |||
| 1780 | #define UPDATE_BIT_OFFSET(bitOffset, numBits) \ | ||
| 1781 | (bitOffset) -= (CBitCtr)(numBits); | ||
| 1782 | |||
| 1783 | #define GET_SHIFT(bitOffset) GET_SHIFT_FROM_BOFFS7(BIT_OFFSET_7BITS(bitOffset)) | ||
| 1784 | |||
| 1785 | |||
| 1786 | #if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) | ||
| 1787 | #if (NUM_OFFSET_SYMBOLS_MAX - BIT_OFFSET_DELTA_BITS < 32) | ||
| 1788 | /* if (NUM_OFFSET_SYMBOLS_MAX == 32 && BIT_OFFSET_DELTA_BITS == 1), | ||
| 1789 | we have depth 31 + 9 + 9 + 8 = 57 bits that can b read with single read. */ | ||
| 1790 | #define Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF | ||
| 1791 | #endif | ||
| 1792 | #ifndef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF | ||
| 1793 | #if (BIT_OFFSET_DELTA_BITS == 1) | ||
| 1794 | /* if (winLimit - winPos <= (kBlockSizeMax = (1 << 17))) | ||
| 1795 | { | ||
| 1796 | the case (16 bits literal extra + 16 match extra) is not possible | ||
| 1797 | in correct stream. So error will be detected for (16 + 16) case. | ||
| 1798 | And longest correct sequence after offset reading is (31 + 9 + 9 + 8 = 57 bits). | ||
| 1799 | So we can use just one 64-bit load here in that case. | ||
| 1800 | } | ||
| 1801 | */ | ||
| 1802 | #define Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML | ||
| 1803 | #endif | ||
| 1804 | #endif | ||
| 1805 | #endif | ||
| 1806 | |||
| 1807 | |||
| 1808 | #if !defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) || \ | ||
| 1809 | (!defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) && \ | ||
| 1810 | !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML)) | ||
| 1811 | // in : (0 < bits <= (24 or 25)): | ||
| 1812 | #define STREAM_READ_BITS(dest, bits) \ | ||
| 1813 | { \ | ||
| 1814 | GET32(dest, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \ | ||
| 1815 | dest <<= GET_SHIFT(bitOffset); \ | ||
| 1816 | UPDATE_BIT_OFFSET(bitOffset, bits) \ | ||
| 1817 | dest >>= 32 - bits; \ | ||
| 1818 | } | ||
| 1819 | #endif | ||
| 1820 | |||
| 1821 | |||
| 1822 | #define FSE_Peek_1(table, state) table[state] | ||
| 1823 | |||
| 1824 | #define STATE_VAR(name) state_ ## name | ||
| 1825 | |||
| 1826 | // in : (0 <= accuracy <= (24 or 25)) | ||
| 1827 | #define FSE_INIT_STATE(name, cond) \ | ||
| 1828 | { \ | ||
| 1829 | UInt32 r; \ | ||
| 1830 | const unsigned bits = p->name ## _accuracy; \ | ||
| 1831 | GET32(r, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \ | ||
| 1832 | r <<= GET_SHIFT(bitOffset); \ | ||
| 1833 | r >>= 1; \ | ||
| 1834 | r >>= 31 ^ bits; \ | ||
| 1835 | UPDATE_BIT_OFFSET(bitOffset, bits) \ | ||
| 1836 | cond \ | ||
| 1837 | STATE_VAR(name) = FSE_Peek_1(FSE_TABLE(name), r); \ | ||
| 1838 | /* STATE_VAR(name) = dest << 16; */ \ | ||
| 1839 | } | ||
| 1840 | |||
| 1841 | |||
| 1842 | #define FSE_Peek_Plus(name, r) \ | ||
| 1843 | STATE_VAR(name) = FSE_Peek_1(FSE_TABLE(name), \ | ||
| 1844 | GET_FSE_REC_STATE(STATE_VAR(name)) + r); | ||
| 1845 | |||
| 1846 | #define LZ_LOOP_ERROR_EXIT { return SZ_ERROR_DATA; } | ||
| 1847 | |||
| 1848 | #define BO_OVERFLOW_CHECK \ | ||
| 1849 | { if ((CBitCtr_signed)bitOffset < 0) LZ_LOOP_ERROR_EXIT } | ||
| 1850 | |||
| 1851 | |||
| 1852 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
| 1853 | |||
| 1854 | #define GET64(dest, p) { const Byte *ptr = p; dest = GetUi64(ptr); } | ||
| 1855 | |||
| 1856 | #define FSE_PRELOAD \ | ||
| 1857 | { \ | ||
| 1858 | GET64(v, src - 4 + SRC_PLUS_FOR_4BYTES(bitOffset)) \ | ||
| 1859 | v <<= GET_SHIFT(bitOffset); \ | ||
| 1860 | } | ||
| 1861 | |||
| 1862 | #define FSE_UPDATE_STATE_2(name, cond) \ | ||
| 1863 | { \ | ||
| 1864 | const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \ | ||
| 1865 | UInt64 r = v; \ | ||
| 1866 | v <<= bits; \ | ||
| 1867 | r >>= 1; \ | ||
| 1868 | UPDATE_BIT_OFFSET(bitOffset, bits) \ | ||
| 1869 | cond \ | ||
| 1870 | r >>= 63 ^ bits; \ | ||
| 1871 | FSE_Peek_Plus(name, r); \ | ||
| 1872 | } | ||
| 1873 | |||
| 1874 | #define FSE_UPDATE_STATES \ | ||
| 1875 | FSE_UPDATE_STATE_2 (ll, {} ) \ | ||
| 1876 | FSE_UPDATE_STATE_2 (ml, {} ) \ | ||
| 1877 | FSE_UPDATE_STATE_2 (of, BO_OVERFLOW_CHECK) \ | ||
| 1878 | |||
| 1879 | #else // Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
| 1880 | |||
| 1881 | // it supports 8 bits accuracy for any code | ||
| 1882 | // it supports 9 bits accuracy, if (BIT_OFFSET_DELTA_BITS == 1) | ||
| 1883 | #define FSE_UPDATE_STATE_0(name, cond) \ | ||
| 1884 | { \ | ||
| 1885 | UInt32 r; \ | ||
| 1886 | const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \ | ||
| 1887 | GET16(r, src + 2 + SRC_PLUS_FOR_4BYTES(bitOffset)) \ | ||
| 1888 | r >>= (bitOffset & 7); \ | ||
| 1889 | r &= (1 << (8 + BIT_OFFSET_DELTA_BITS)) - 1; \ | ||
| 1890 | UPDATE_BIT_OFFSET(bitOffset, bits) \ | ||
| 1891 | cond \ | ||
| 1892 | r >>= (8 + BIT_OFFSET_DELTA_BITS) - bits; \ | ||
| 1893 | FSE_Peek_Plus(name, r); \ | ||
| 1894 | } | ||
| 1895 | |||
| 1896 | // for debug (slow): | ||
| 1897 | // #define Z7_ZSTD_DEC_USE_FSE_FUSION_FORCE | ||
| 1898 | #if BIT_OFFSET_DELTA_BITS == 0 || defined(Z7_ZSTD_DEC_USE_FSE_FUSION_FORCE) | ||
| 1899 | #define Z7_ZSTD_DEC_USE_FSE_FUSION | ||
| 1900 | #endif | ||
| 1901 | |||
| 1902 | #ifdef Z7_ZSTD_DEC_USE_FSE_FUSION | ||
| 1903 | #define FSE_UPDATE_STATE_1(name) \ | ||
| 1904 | { UInt32 rest2; \ | ||
| 1905 | { \ | ||
| 1906 | UInt32 r; \ | ||
| 1907 | unsigned bits; \ | ||
| 1908 | GET32(r, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \ | ||
| 1909 | bits = GET_FSE_REC_LEN(STATE_VAR(name)); \ | ||
| 1910 | r <<= GET_SHIFT(bitOffset); \ | ||
| 1911 | rest2 = r << bits; \ | ||
| 1912 | r >>= 1; \ | ||
| 1913 | UPDATE_BIT_OFFSET(bitOffset, bits) \ | ||
| 1914 | r >>= 31 ^ bits; \ | ||
| 1915 | FSE_Peek_Plus(name, r); \ | ||
| 1916 | } | ||
| 1917 | |||
| 1918 | #define FSE_UPDATE_STATE_3(name) \ | ||
| 1919 | { \ | ||
| 1920 | const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \ | ||
| 1921 | rest2 >>= 1; \ | ||
| 1922 | UPDATE_BIT_OFFSET(bitOffset, bits) \ | ||
| 1923 | rest2 >>= 31 ^ bits; \ | ||
| 1924 | FSE_Peek_Plus(name, rest2); \ | ||
| 1925 | }} | ||
| 1926 | |||
| 1927 | #define FSE_UPDATE_STATES \ | ||
| 1928 | FSE_UPDATE_STATE_1 (ll) \ | ||
| 1929 | FSE_UPDATE_STATE_3 (ml) \ | ||
| 1930 | FSE_UPDATE_STATE_0 (of, BO_OVERFLOW_CHECK) \ | ||
| 1931 | |||
| 1932 | #else // Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
| 1933 | |||
| 1934 | #define FSE_UPDATE_STATES \ | ||
| 1935 | FSE_UPDATE_STATE_0 (ll, {} ) \ | ||
| 1936 | FSE_UPDATE_STATE_0 (ml, {} ) \ | ||
| 1937 | FSE_UPDATE_STATE_0 (of, BO_OVERFLOW_CHECK) \ | ||
| 1938 | |||
| 1939 | #endif // Z7_ZSTD_DEC_USE_FSE_FUSION | ||
| 1940 | #endif // Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
| 1941 | |||
| 1942 | |||
| 1943 | |||
| 1944 | typedef struct | ||
| 1945 | { | ||
| 1946 | UInt32 numSeqs; | ||
| 1947 | UInt32 literalsLen; | ||
| 1948 | const Byte *literals; | ||
| 1949 | } | ||
| 1950 | CZstdDec1_Vars; | ||
| 1951 | |||
| 1952 | |||
| 1953 | // if (BIT_OFFSET_DELTA_BITS != 0), we need (BIT_OFFSET_DELTA_BYTES > 0) | ||
| 1954 | #define BIT_OFFSET_DELTA_BYTES BIT_OFFSET_DELTA_BITS | ||
| 1955 | |||
| 1956 | /* if (NUM_OFFSET_SYMBOLS_MAX == 32) | ||
| 1957 | max_seq_bit_length = (31) + 16 + 16 + 9 + 8 + 9 = 89 bits | ||
| 1958 | if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) we have longest backward | ||
| 1959 | lookahead offset, and we read UInt64 after literal_len reading. | ||
| 1960 | if (BIT_OFFSET_DELTA_BITS == 1 && NUM_OFFSET_SYMBOLS_MAX == 32) | ||
| 1961 | MAX_BACKWARD_DEPTH = 16 bytes | ||
| 1962 | */ | ||
| 1963 | #define MAX_BACKWARD_DEPTH \ | ||
| 1964 | ((NUM_OFFSET_SYMBOLS_MAX - 1 + 16 + 16 + 7) / 8 + 7 + BIT_OFFSET_DELTA_BYTES) | ||
| 1965 | |||
| 1966 | /* srcLen != 0 | ||
| 1967 | src == real_data_ptr - SEQ_SRC_OFFSET - BIT_OFFSET_DELTA_BYTES | ||
| 1968 | if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) then | ||
| 1969 | (winLimit - p->winPos <= (1 << 17)) is required | ||
| 1970 | */ | ||
| 1971 | static | ||
| 1972 | Z7_NO_INLINE | ||
| 1973 | // Z7_ATTRIB_NO_VECTOR | ||
| 1974 | SRes Decompress_Sequences(CZstdDec1 * const p, | ||
| 1975 | const Byte *src, const size_t srcLen, | ||
| 1976 | const size_t winLimit, | ||
| 1977 | const CZstdDec1_Vars * const vars) | ||
| 1978 | { | ||
| 1979 | #ifdef Z7_ZSTD_DEC_USE_BASES_LOCAL | ||
| 1980 | SEQ_EXTRA_TABLES(a_) | ||
| 1981 | #endif | ||
| 1982 | |||
| 1983 | // for debug: | ||
| 1984 | // #define Z7_ZSTD_DEC_USE_LOCAL_FSE_TABLES | ||
| 1985 | #ifdef Z7_ZSTD_DEC_USE_LOCAL_FSE_TABLES | ||
| 1986 | #define FSE_TABLE(n) fse. n | ||
| 1987 | const CZstdDecFseTables fse = p->fse; | ||
| 1988 | /* | ||
| 1989 | CZstdDecFseTables fse; | ||
| 1990 | #define COPY_FSE_TABLE(n) \ | ||
| 1991 | memcpy(fse. n, p->fse. n, (size_t)4 << p-> n ## _accuracy); | ||
| 1992 | COPY_FSE_TABLE(of) | ||
| 1993 | COPY_FSE_TABLE(ll) | ||
| 1994 | COPY_FSE_TABLE(ml) | ||
| 1995 | */ | ||
| 1996 | #else | ||
| 1997 | #define FSE_TABLE(n) (p->fse. n) | ||
| 1998 | #endif | ||
| 1999 | |||
| 2000 | #ifdef Z7_ZSTD_DEC_USE_BASES_LOCAL | ||
| 2001 | FILL_LOC_BASES_ALL | ||
| 2002 | #endif | ||
| 2003 | |||
| 2004 | { | ||
| 2005 | unsigned numSeqs = vars->numSeqs; | ||
| 2006 | const Byte *literals = vars->literals; | ||
| 2007 | ptrdiff_t literalsLen = (ptrdiff_t)vars->literalsLen; | ||
| 2008 | Byte * const win = p->win; | ||
| 2009 | size_t winPos = p->winPos; | ||
| 2010 | const size_t cycSize = p->cycSize; | ||
| 2011 | size_t totalOutCheck = p->totalOutCheck; | ||
| 2012 | const size_t winSize = p->winSize; | ||
| 2013 | size_t reps_0 = p->reps[0]; | ||
| 2014 | size_t reps_1 = p->reps[1]; | ||
| 2015 | size_t reps_2 = p->reps[2]; | ||
| 2016 | UInt32 STATE_VAR(ll), STATE_VAR(of), STATE_VAR(ml); | ||
| 2017 | CBitCtr bitOffset; | ||
| 2018 | |||
| 2019 | SET_bitOffset_TO_PAD (bitOffset, src + SEQ_SRC_OFFSET, srcLen + BIT_OFFSET_DELTA_BYTES) | ||
| 2020 | |||
| 2021 | bitOffset -= BIT_OFFSET_DELTA_BITS; | ||
| 2022 | |||
| 2023 | FSE_INIT_STATE(ll, {} ) | ||
| 2024 | FSE_INIT_STATE(of, {} ) | ||
| 2025 | FSE_INIT_STATE(ml, BO_OVERFLOW_CHECK) | ||
| 2026 | |||
| 2027 | for (;;) | ||
| 2028 | { | ||
| 2029 | size_t matchLen; | ||
| 2030 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
| 2031 | UInt64 v; | ||
| 2032 | #endif | ||
| 2033 | |||
| 2034 | #ifdef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF | ||
| 2035 | FSE_PRELOAD | ||
| 2036 | #endif | ||
| 2037 | |||
| 2038 | // if (of_code == 0) | ||
| 2039 | if ((Byte)STATE_VAR(of) == 0) | ||
| 2040 | { | ||
| 2041 | if (GET_FSE_REC_SYM(STATE_VAR(ll)) == 0) | ||
| 2042 | { | ||
| 2043 | const size_t offset = reps_1; | ||
| 2044 | reps_1 = reps_0; | ||
| 2045 | reps_0 = offset; | ||
| 2046 | STAT_INC(g_Num_Rep1) | ||
| 2047 | } | ||
| 2048 | STAT_UPDATE(else g_Num_Rep0++;) | ||
| 2049 | } | ||
| 2050 | else | ||
| 2051 | { | ||
| 2052 | const unsigned of_code = (Byte)STATE_VAR(of); | ||
| 2053 | |||
| 2054 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
| 2055 | #if !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) | ||
| 2056 | FSE_PRELOAD | ||
| 2057 | #endif | ||
| 2058 | #else | ||
| 2059 | UInt32 v; | ||
| 2060 | { | ||
| 2061 | const Byte *src4 = src + SRC_PLUS_FOR_4BYTES(bitOffset); | ||
| 2062 | const unsigned skip = GET_SHIFT(bitOffset); | ||
| 2063 | GET32(v, src4) | ||
| 2064 | v <<= skip; | ||
| 2065 | v |= (UInt32)src4[-1] >> (8 - skip); | ||
| 2066 | } | ||
| 2067 | #endif | ||
| 2068 | |||
| 2069 | UPDATE_BIT_OFFSET(bitOffset, of_code) | ||
| 2070 | |||
| 2071 | if (of_code == 1) | ||
| 2072 | { | ||
| 2073 | // read 1 bit | ||
| 2074 | #if defined(Z7_MSC_VER_ORIGINAL) || defined(MY_CPU_X86_OR_AMD64) | ||
| 2075 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
| 2076 | #define CHECK_HIGH_BIT_64(a) ((Int64)(UInt64)(a) < 0) | ||
| 2077 | #else | ||
| 2078 | #define CHECK_HIGH_BIT_32(a) ((Int32)(UInt32)(a) < 0) | ||
| 2079 | #endif | ||
| 2080 | #else | ||
| 2081 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
| 2082 | #define CHECK_HIGH_BIT_64(a) ((UInt64)(a) & ((UInt64)1 << 63)) | ||
| 2083 | #else | ||
| 2084 | #define CHECK_HIGH_BIT_32(a) ((UInt32)(a) & ((UInt32)1 << 31)) | ||
| 2085 | #endif | ||
| 2086 | #endif | ||
| 2087 | |||
| 2088 | if | ||
| 2089 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
| 2090 | CHECK_HIGH_BIT_64 (((UInt64)GET_FSE_REC_SYM(STATE_VAR(ll)) - 1) ^ v) | ||
| 2091 | #else | ||
| 2092 | CHECK_HIGH_BIT_32 (((UInt32)GET_FSE_REC_SYM(STATE_VAR(ll)) - 1) ^ v) | ||
| 2093 | #endif | ||
| 2094 | { | ||
| 2095 | v <<= 1; | ||
| 2096 | { | ||
| 2097 | const size_t offset = reps_2; | ||
| 2098 | reps_2 = reps_1; | ||
| 2099 | reps_1 = reps_0; | ||
| 2100 | reps_0 = offset; | ||
| 2101 | STAT_INC(g_Num_Rep2) | ||
| 2102 | } | ||
| 2103 | } | ||
| 2104 | else | ||
| 2105 | { | ||
| 2106 | if (GET_FSE_REC_SYM(STATE_VAR(ll)) == 0) | ||
| 2107 | { | ||
| 2108 | // litLen == 0 && bit == 1 | ||
| 2109 | STAT_INC(g_Num_Rep3) | ||
| 2110 | v <<= 1; | ||
| 2111 | reps_2 = reps_1; | ||
| 2112 | reps_1 = reps_0; | ||
| 2113 | if (--reps_0 == 0) | ||
| 2114 | { | ||
| 2115 | // LZ_LOOP_ERROR_EXIT | ||
| 2116 | // original-zstd decoder : input is corrupted; force offset to 1 | ||
| 2117 | // reps_0 = 1; | ||
| 2118 | reps_0++; | ||
| 2119 | } | ||
| 2120 | } | ||
| 2121 | else | ||
| 2122 | { | ||
| 2123 | // litLen != 0 && bit == 0 | ||
| 2124 | v <<= 1; | ||
| 2125 | { | ||
| 2126 | const size_t offset = reps_1; | ||
| 2127 | reps_1 = reps_0; | ||
| 2128 | reps_0 = offset; | ||
| 2129 | STAT_INC(g_Num_Rep1) | ||
| 2130 | } | ||
| 2131 | } | ||
| 2132 | } | ||
| 2133 | } | ||
| 2134 | else | ||
| 2135 | { | ||
| 2136 | // (2 <= of_code) | ||
| 2137 | // if (of_code >= 32) LZ_LOOP_ERROR_EXIT // optional check | ||
| 2138 | // we don't allow (of_code >= 32) cases in another code | ||
| 2139 | reps_2 = reps_1; | ||
| 2140 | reps_1 = reps_0; | ||
| 2141 | reps_0 = ((size_t)1 << of_code) - 3 + (size_t) | ||
| 2142 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
| 2143 | (v >> (64 - of_code)); | ||
| 2144 | v <<= of_code; | ||
| 2145 | #else | ||
| 2146 | (v >> (32 - of_code)); | ||
| 2147 | #endif | ||
| 2148 | } | ||
| 2149 | } | ||
| 2150 | |||
| 2151 | #ifdef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML | ||
| 2152 | FSE_PRELOAD | ||
| 2153 | #endif | ||
| 2154 | |||
| 2155 | matchLen = (size_t)GET_FSE_REC_SYM(STATE_VAR(ml)) | ||
| 2156 | #ifndef Z7_ZSTD_DEC_USE_ML_PLUS3 | ||
| 2157 | + MATCH_LEN_MIN | ||
| 2158 | #endif | ||
| 2159 | ; | ||
| 2160 | { | ||
| 2161 | { | ||
| 2162 | if (matchLen >= 32 + MATCH_LEN_MIN) // if (state_ml & 0x20) | ||
| 2163 | { | ||
| 2164 | const unsigned extra = BASES_TABLE(SEQ_ML_EXTRA) [(size_t)matchLen - MATCH_LEN_MIN]; | ||
| 2165 | matchLen = BASES_TABLE(SEQ_ML_BASES) [(size_t)matchLen - MATCH_LEN_MIN]; | ||
| 2166 | #if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) && \ | ||
| 2167 | (defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) || \ | ||
| 2168 | defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)) | ||
| 2169 | { | ||
| 2170 | UPDATE_BIT_OFFSET(bitOffset, extra) | ||
| 2171 | matchLen += (size_t)(v >> (64 - extra)); | ||
| 2172 | #if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) | ||
| 2173 | FSE_PRELOAD | ||
| 2174 | #else | ||
| 2175 | v <<= extra; | ||
| 2176 | #endif | ||
| 2177 | } | ||
| 2178 | #else | ||
| 2179 | { | ||
| 2180 | UInt32 v32; | ||
| 2181 | STREAM_READ_BITS(v32, extra) | ||
| 2182 | matchLen += v32; | ||
| 2183 | } | ||
| 2184 | #endif | ||
| 2185 | STAT_INC(g_Num_Match) | ||
| 2186 | } | ||
| 2187 | } | ||
| 2188 | } | ||
| 2189 | |||
| 2190 | #if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) && \ | ||
| 2191 | !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) && \ | ||
| 2192 | !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) | ||
| 2193 | FSE_PRELOAD | ||
| 2194 | #endif | ||
| 2195 | |||
| 2196 | { | ||
| 2197 | size_t litLen = GET_FSE_REC_SYM(STATE_VAR(ll)); | ||
| 2198 | if (litLen) | ||
| 2199 | { | ||
| 2200 | // if (STATE_VAR(ll) & 0x70) | ||
| 2201 | if (litLen >= 16) | ||
| 2202 | { | ||
| 2203 | const unsigned extra = BASES_TABLE(SEQ_LL_EXTRA) [litLen]; | ||
| 2204 | litLen = BASES_TABLE(SEQ_LL_BASES) [litLen]; | ||
| 2205 | #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS | ||
| 2206 | { | ||
| 2207 | UPDATE_BIT_OFFSET(bitOffset, extra) | ||
| 2208 | litLen += (size_t)(v >> (64 - extra)); | ||
| 2209 | #if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) | ||
| 2210 | FSE_PRELOAD | ||
| 2211 | #else | ||
| 2212 | v <<= extra; | ||
| 2213 | #endif | ||
| 2214 | } | ||
| 2215 | #else | ||
| 2216 | { | ||
| 2217 | UInt32 v32; | ||
| 2218 | STREAM_READ_BITS(v32, extra) | ||
| 2219 | litLen += v32; | ||
| 2220 | } | ||
| 2221 | #endif | ||
| 2222 | STAT_INC(g_Num_LitsBig) | ||
| 2223 | } | ||
| 2224 | |||
| 2225 | if ((literalsLen -= (ptrdiff_t)litLen) < 0) | ||
| 2226 | LZ_LOOP_ERROR_EXIT | ||
| 2227 | totalOutCheck += litLen; | ||
| 2228 | { | ||
| 2229 | const size_t rem = winLimit - winPos; | ||
| 2230 | if (litLen > rem) | ||
| 2231 | LZ_LOOP_ERROR_EXIT | ||
| 2232 | { | ||
| 2233 | const Byte *literals_temp = literals; | ||
| 2234 | Byte *d = win + winPos; | ||
| 2235 | literals += litLen; | ||
| 2236 | winPos += litLen; | ||
| 2237 | CopyLiterals(d, literals_temp, litLen, rem); | ||
| 2238 | } | ||
| 2239 | } | ||
| 2240 | } | ||
| 2241 | STAT_UPDATE(else g_Num_Lit0++;) | ||
| 2242 | } | ||
| 2243 | |||
| 2244 | #define COPY_MATCH \ | ||
| 2245 | { if (reps_0 > winSize || reps_0 > totalOutCheck) LZ_LOOP_ERROR_EXIT \ | ||
| 2246 | totalOutCheck += matchLen; \ | ||
| 2247 | { const size_t rem = winLimit - winPos; \ | ||
| 2248 | if (matchLen > rem) LZ_LOOP_ERROR_EXIT \ | ||
| 2249 | { const size_t winPos_temp = winPos; \ | ||
| 2250 | winPos += matchLen; \ | ||
| 2251 | CopyMatch(reps_0, matchLen, win, winPos_temp, rem, cycSize); }}} | ||
| 2252 | |||
| 2253 | if (--numSeqs == 0) | ||
| 2254 | { | ||
| 2255 | COPY_MATCH | ||
| 2256 | break; | ||
| 2257 | } | ||
| 2258 | FSE_UPDATE_STATES | ||
| 2259 | COPY_MATCH | ||
| 2260 | } // for | ||
| 2261 | |||
| 2262 | if ((CBitCtr_signed)bitOffset != BIT_OFFSET_DELTA_BYTES * 8 - BIT_OFFSET_DELTA_BITS) | ||
| 2263 | return SZ_ERROR_DATA; | ||
| 2264 | |||
| 2265 | if (literalsLen) | ||
| 2266 | { | ||
| 2267 | const size_t rem = winLimit - winPos; | ||
| 2268 | if ((size_t)literalsLen > rem) | ||
| 2269 | return SZ_ERROR_DATA; | ||
| 2270 | { | ||
| 2271 | Byte *d = win + winPos; | ||
| 2272 | winPos += (size_t)literalsLen; | ||
| 2273 | totalOutCheck += (size_t)literalsLen; | ||
| 2274 | CopyLiterals | ||
| 2275 | // memcpy | ||
| 2276 | (d, literals, (size_t)literalsLen, rem); | ||
| 2277 | } | ||
| 2278 | } | ||
| 2279 | if (totalOutCheck >= winSize) | ||
| 2280 | totalOutCheck = winSize; | ||
| 2281 | p->totalOutCheck = totalOutCheck; | ||
| 2282 | p->winPos = winPos; | ||
| 2283 | p->reps[0] = (CZstdDecOffset)reps_0; | ||
| 2284 | p->reps[1] = (CZstdDecOffset)reps_1; | ||
| 2285 | p->reps[2] = (CZstdDecOffset)reps_2; | ||
| 2286 | } | ||
| 2287 | return SZ_OK; | ||
| 2288 | } | ||
| 2289 | |||
| 2290 | |||
// for debug: define to check that ZstdDec1_NeedTempBufferForInput() works correctly:
// #define Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP // define it for debug only
#ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
// number of sequences predicted by ZstdDec1_NeedTempBufferForInput();
// ZstdDec1_DecodeBlock() compares its parsed value against this one.
static unsigned g_numSeqs;
#endif


// flags stored in the low 2 bits of the first byte of the literals section header:
#define k_LitBlockType_Flag_RLE_or_Treeless 1
#define k_LitBlockType_Flag_Compressed 2
| 2300 | |||
// outLimit : is strong limit
// outLimit <= ZstdDec1_GET_BLOCK_SIZE_LIMIT(p)
// inSize != 0
/*
ZstdDec1_DecodeBlock() decodes one compressed zstd block:
  1) parses the literals section header (raw / RLE / Huffman-compressed),
  2) decodes the literals into p->literalsBase, or directly into the window
     when the block contains no sequences,
  3) reads the number of sequences and the sequences-section mode byte,
     (re)building the three FSE tables (ll / of / ml) as required,
  4) calls Decompress_Sequences() for the LZ part.
afterAvail : number of readable bytes that follow (src + inSize); when large
             enough, literals are read directly from the input stream instead
             of being copied to the literals buffer first.
Returns SZ_OK, or SZ_ERROR_DATA / error from callee for malformed data.
*/
static
Z7_NO_INLINE
SRes ZstdDec1_DecodeBlock(CZstdDec1 *p,
    const Byte *src, SizeT inSize, SizeT afterAvail,
    const size_t outLimit)
{
  CZstdDec1_Vars vars;
  vars.literals = p->literalsBase;
  {
    const unsigned b0 = *src++;  // first byte of the literals section header
    UInt32 numLits, compressedSize;
    const Byte *litStream;
    Byte *literalsDest;
    inSize--;

    if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
    {
      // raw or RLE literals
      // we need at least one additional byte for (numSeqs).
      // so we check for that additional byte in conditions.
      numLits = b0 >> 3;
      if (b0 & 4)
      {
        // long size format: 1 or 2 extra header bytes
        UInt32 v;
        if (inSize < 1 + 1) // we need at least 1 byte here and 1 byte for (numSeqs).
          return SZ_ERROR_DATA;
        numLits >>= 1;
        v = GetUi16(src);
        src += 2;
        inSize -= 2;
        if ((b0 & 8) == 0)
        {
          // only one extra byte is actually used; step back
          src--;
          inSize++;
          v = (Byte)v;
        }
        numLits += v << 4;
      }
      compressedSize = 1;  // RLE: the literal stream is a single byte
      if ((b0 & k_LitBlockType_Flag_RLE_or_Treeless) == 0)
        compressedSize = numLits;  // raw: stored uncompressed
    }
    else if (inSize < 4)
      return SZ_ERROR_DATA;
    else
    {
      // Huffman-compressed literals (new table or treeless/repeated table).
      // header after b0 is (numBytes) bytes; regenerated/compressed sizes
      // use (numBits + 4) == 10 / 14 / 18 bits each.
      const unsigned mode4Streams = b0 & 0xc;
      const unsigned numBytes = (3 * mode4Streams + 32) >> 4;
      const unsigned numBits = 4 * numBytes - 2;
      const UInt32 mask = ((UInt32)16 << numBits) - 1;
      compressedSize = GetUi32(src);
      numLits = ((
  #ifdef MY_CPU_LE_UNALIGN
          GetUi32(src - 1)
  #else
          ((compressedSize << 8) + b0)
  #endif
          ) >> 4) & mask;
      src += numBytes;
      inSize -= numBytes;
      compressedSize >>= numBits;
      compressedSize &= mask;
      /*
      if (numLits != 0) printf("inSize = %7u num_lits=%7u compressed=%7u ratio = %u ratio2 = %u\n",
          i1, numLits, (unsigned)compressedSize * 1, (unsigned)compressedSize * 100 / numLits,
          (unsigned)numLits * 100 / (unsigned)inSize);
      }
      */
      if (compressedSize == 0)
        return SZ_ERROR_DATA; // (compressedSize == 0) is not allowed
    }

    STAT_UPDATE(g_Num_Lits += numLits;)

    vars.literalsLen = numLits;

    if (compressedSize >= inSize)
      return SZ_ERROR_DATA;
    litStream = src;
    src += compressedSize;
    inSize -= compressedSize;
    // inSize != 0

    // ---- number of sequences: 1..3 bytes ----
    {
      UInt32 numSeqs = *src++;
      inSize--;
      if (numSeqs > 127)
      {
        UInt32 b1;
        if (inSize == 0)
          return SZ_ERROR_DATA;
        numSeqs -= 128;
        b1 = *src++;
        inSize--;
        if (numSeqs == 127)
        {
          if (inSize == 0)
            return SZ_ERROR_DATA;
          numSeqs = (UInt32)(*src++) + 127;
          inSize--;
        }
        numSeqs = (numSeqs << 8) + b1;
      }
      // each sequence produces at least MATCH_LEN_MIN output bytes:
      if (numSeqs * MATCH_LEN_MIN + numLits > outLimit)
        return SZ_ERROR_DATA;
      vars.numSeqs = numSeqs;

      STAT_UPDATE(g_NumSeqs_total += numSeqs;)
      /*
      #ifdef SHOW_STAT
      printf("\n %5u : %8u, %8u : %5u", (int)g_Num_Blocks_Compressed, (int)numSeqs, (int)g_NumSeqs_total,
          (int)g_NumSeqs_total / g_Num_Blocks_Compressed);
      #endif
      // printf("\nnumSeqs2 = %d", numSeqs);
      */
  #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
      if (numSeqs != g_numSeqs) return SZ_ERROR_DATA; // for debug
  #endif
      if (numSeqs == 0)
      {
        // literals-only block: decode literals straight into the window
        if (inSize != 0)
          return SZ_ERROR_DATA;
        literalsDest = p->win + p->winPos;
      }
      else
        literalsDest = p->literalsBase;
    }

    // ---- literals decoding ----
    if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
    {
      if (b0 & k_LitBlockType_Flag_RLE_or_Treeless)
      {
        // RLE literals: repeat a single byte
        memset(literalsDest, litStream[0], numLits);
        if (vars.numSeqs)
        {
          // literalsDest == p->literalsBase == vars.literals
  #if COPY_CHUNK_SIZE > 1
          memset(p->literalsBase + numLits, 0, COPY_CHUNK_SIZE);
  #endif
        }
      }
      else
      {
        // raw literals
        // unsigned y;
        // for (y = 0; y < 10000; y++)
        memcpy(literalsDest, litStream, numLits);
        if (vars.numSeqs)
        {
          /* we need up to (15 == COPY_CHUNK_SIZE - 1) space for optimized CopyLiterals().
             If we have additional space in input stream after literals stream,
             we use direct copy of raw literals in input stream */
          if ((size_t)(src + inSize - litStream) - numLits + afterAvail >= (COPY_CHUNK_SIZE - 1))
            vars.literals = litStream;
          else
          {
            // literalsDest == p->literalsBase == vars.literals
  #if COPY_CHUNK_SIZE > 1
            /* CopyLiterals():
               1) we don't want reading non-initialized data
               2) we will copy only zero byte after literals buffer */
            memset(p->literalsBase + numLits, 0, COPY_CHUNK_SIZE);
  #endif
          }
        }
      }
    }
    else
    {
      // Huffman-compressed literals
      CInBufPair hufStream;
      hufStream.ptr = litStream;
      hufStream.len = compressedSize;

      if ((b0 & k_LitBlockType_Flag_RLE_or_Treeless) == 0)
      {
        // a new Huffman table is stored in the stream
        // unsigned y = 100; CInBufPair hs2 = hufStream; do { hufStream = hs2;
        RINOK(Huf_DecodeTable(&p->huf, &hufStream))
        p->litHuf_wasSet = True;
        // } while (--y);
      }
      else if (!p->litHuf_wasSet)
        return SZ_ERROR_DATA;  // treeless block, but no table from a previous block

      {
        // int yyy; for (yyy = 0; yyy < 34; yyy++) {
        SRes sres;
        if ((b0 & 0xc) == 0) // mode4Streams
          sres = Huf_Decompress_1stream((const Byte *)(const void *)p->huf.table64,
              hufStream.ptr - HUF_SRC_OFFSET, hufStream.len, literalsDest, numLits);
        else
        {
          // 6 bytes for the jump table + 4x1 bytes of end-padding Bytes)
          if (hufStream.len < 6 + 4)
            return SZ_ERROR_DATA;
          // the condition from original-zstd decoder:
  #define Z7_ZSTD_MIN_LITERALS_FOR_4_STREAMS 6
          if (numLits < Z7_ZSTD_MIN_LITERALS_FOR_4_STREAMS)
            return SZ_ERROR_DATA;
          sres = Huf_Decompress_4stream((const Byte *)(const void *)p->huf.table64,
              hufStream.ptr + (6 - HUF_SRC_OFFSET), hufStream.len, literalsDest, numLits);
        }
        RINOK(sres)
        // }
      }
    }

    if (vars.numSeqs == 0)
    {
      // literals-only block is finished
      p->winPos += numLits;
      return SZ_OK;
    }
  }

  // ---- sequences section ----
  {
    CInBufPair in;
    unsigned mode;
    unsigned seqMode;

    in.ptr = src;
    in.len = inSize;
    if (in.len == 0)
      return SZ_ERROR_DATA;
    in.len--;
    mode = *in.ptr++;
    if (mode & 3) // Reserved bits
      return SZ_ERROR_DATA;

    // literal-lengths table mode (bits 7-6 of the mode byte)
    seqMode = (mode >> 6);
    if (seqMode == k_SeqMode_Repeat)
      { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
    else RINOK(FSE_Decode_SeqTable(
        p->fse.ll,
        &in,
        6, // predefAccuracy
        &p->ll_accuracy,
        NUM_LL_SYMBOLS,
        k_PredefRecords_LL,
        seqMode))

    // offsets table mode (bits 5-4)
    seqMode = (mode >> 4) & 3;
    if (seqMode == k_SeqMode_Repeat)
      { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
    else RINOK(FSE_Decode_SeqTable(
        p->fse.of,
        &in,
        5, // predefAccuracy
        &p->of_accuracy,
        NUM_OFFSET_SYMBOLS_MAX,
        k_PredefRecords_OF,
        seqMode))

    // match-lengths table mode (bits 3-2)
    seqMode = (mode >> 2) & 3;
    if (seqMode == k_SeqMode_Repeat)
      { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
    else
    {
      RINOK(FSE_Decode_SeqTable(
          p->fse.ml,
          &in,
          6, // predefAccuracy
          &p->ml_accuracy,
          NUM_ML_SYMBOLS,
          k_PredefRecords_ML,
          seqMode))
      /*
      #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
      // { unsigned y = 1 << 10; do
      {
        const unsigned accuracy = p->ml_accuracy;
        if (accuracy == 0)
          p->fse.ml[0] += 3;
        else
      #ifdef MY_CPU_64BIT
        {
          // alignment (UInt64 _pad_Alignment) in fse.ml is required for that code
          UInt64 *table = (UInt64 *)(void *)p->fse.ml;
          const UInt64 *end = (const UInt64 *)(const void *)
              ((const Byte *)(const void *)table + ((size_t)sizeof(CFseRecord) << accuracy));
          do
          {
            table[0] += ((UInt64)MATCH_LEN_MIN << 32) + MATCH_LEN_MIN;
            table[1] += ((UInt64)MATCH_LEN_MIN << 32) + MATCH_LEN_MIN;
            table += 2;
          }
          while (table != end);
        }
      #else
        {
          UInt32 *table = p->fse.ml;
          const UInt32 *end = (const UInt32 *)(const void *)
              ((const Byte *)(const void *)table + ((size_t)sizeof(CFseRecord) << accuracy));
          do
          {
            table[0] += MATCH_LEN_MIN;
            table[1] += MATCH_LEN_MIN;
            table += 2;
            table[0] += MATCH_LEN_MIN;
            table[1] += MATCH_LEN_MIN;
            table += 2;
          }
          while (table != end);
        }
      #endif
      }
      // while (--y); }
      #endif
      */
    }

    // p->seqTables_wereSet = True;
    if (in.len == 0)
      return SZ_ERROR_DATA;
    return Decompress_Sequences(p,
        in.ptr - SEQ_SRC_OFFSET - BIT_OFFSET_DELTA_BYTES, in.len,
        p->winPos + outLimit, &vars);
  }
}
| 2617 | |||
| 2618 | |||
| 2619 | |||
| 2620 | |||
// inSize != 0
// it must do similar to ZstdDec1_DecodeBlock()
/*
ZstdDec1_NeedTempBufferForInput() scans the block's headers (without
decoding) to locate where the sequences data starts. It returns nonzero
when the sequences data would start too close to the beginning of the
input buffer (within MAX_BACKWARD_DEPTH of it), meaning the caller should
route the input through the temp buffer; it returns 0 when in-place
decoding is safe, when the headers are truncated, or when there are no
sequences in the block.
*/
static size_t ZstdDec1_NeedTempBufferForInput(
    const SizeT beforeSize, const Byte * const src, const SizeT inSize)
{
  unsigned b0;
  UInt32 pos;

  #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
  g_numSeqs = 1 << 24;  // debug sentinel; overwritten below if numSeqs is parsed
  #else
  // we have at least 3 bytes before seq data: litBlockType, numSeqs, seqMode
  #define MIN_BLOCK_LZ_HEADERS_SIZE 3
  if (beforeSize >= MAX_BACKWARD_DEPTH - MIN_BLOCK_LZ_HEADERS_SIZE)
    return 0;
  #endif

  b0 = src[0];  // first byte of the literals section header

  if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
  {
    // raw or RLE literals: mirror the size parsing of ZstdDec1_DecodeBlock()
    UInt32 numLits = b0 >> 3;
    pos = 1;
    if (b0 & 4)
    {
      UInt32 v;
      if (inSize < 3)
        return 0;
      numLits >>= 1;
      v = GetUi16(src + 1);
      pos = 3;
      if ((b0 & 8) == 0)
      {
        // short form: only one extra header byte is used
        pos = 2;
        v = (Byte)v;
      }
      numLits += v << 4;
    }
    if (b0 & k_LitBlockType_Flag_RLE_or_Treeless)
      numLits = 1;  // RLE: literal stream is one byte
    pos += numLits;
  }
  else if (inSize < 5)
    return 0;
  else
  {
    // Huffman-compressed literals; (numBytes) here includes the b0 byte
    const unsigned mode4Streams = b0 & 0xc;
    const unsigned numBytes = (3 * mode4Streams + 48) >> 4;
    const unsigned numBits = 4 * numBytes - 6;
    UInt32 cs = GetUi32(src + 1);
    cs >>= numBits;
    cs &= ((UInt32)16 << numBits) - 1;
    if (cs == 0)
      return 0;  // (compressedSize == 0) is rejected by the decoder anyway
    pos = numBytes + cs;
  }

  if (pos >= inSize)
    return 0;
  // ---- number of sequences: 1..3 bytes (same encoding as in the decoder) ----
  {
    UInt32 numSeqs = src[pos++];
    if (numSeqs > 127)
    {
      UInt32 b1;
      if (pos >= inSize)
        return 0;
      numSeqs -= 128;
      b1 = src[pos++];
      if (numSeqs == 127)
      {
        if (pos >= inSize)
          return 0;
        numSeqs = (UInt32)(src[pos++]) + 127;
      }
      numSeqs = (numSeqs << 8) + b1;
    }
    #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
    g_numSeqs = numSeqs; // for debug
    #endif
    if (numSeqs == 0)
      return 0;  // literals-only block: no backward bit-stream reads
  }
  /*
  if (pos >= inSize)
    return 0;
  pos++;
  */
  // we will have one additional byte for seqMode:
  if (beforeSize + pos >= MAX_BACKWARD_DEPTH - 1)
    return 0;
  return 1;
}
| 2713 | |||
| 2714 | |||
| 2715 | |||
// ---------- ZSTD FRAME ----------

// values of the 2-bit Block_Type field of a block header:
#define kBlockType_Raw 0
#define kBlockType_RLE 1
#define kBlockType_Compressed 2
#define kBlockType_Reserved 3   // rejected as error by ZstdDec_UpdateState()

// states of the frame-level parser (CZstdDec::frameState):
typedef enum
{
  // begin: states that require 4 bytes:
  ZSTD2_STATE_SIGNATURE,    // reading 4-byte frame magic number
  ZSTD2_STATE_HASH,         // reading 4-byte XXH64-based content checksum
  ZSTD2_STATE_SKIP_HEADER,  // reading 4-byte size field of a skippable frame
  // end of states that require 4 bytes

  ZSTD2_STATE_SKIP_DATA,    // skipping payload of a skippable frame
  ZSTD2_STATE_FRAME_HEADER, // reading descriptor + optional header fields
  ZSTD2_STATE_AFTER_HEADER,
  ZSTD2_STATE_BLOCK,        // reading 3-byte block header
  ZSTD2_STATE_DATA,         // reading block payload
  ZSTD2_STATE_FINISHED      // frame fully processed
} EZstd2State;
| 2738 | |||
| 2739 | |||
struct CZstdDec
{
  EZstd2State frameState;     // current state of the frame-level parser
  unsigned tempSize;          // number of valid bytes in temp[]

  Byte temp[14]; // 14 is required  // staging buffer for frame/block header bytes

  Byte descriptor;            // frame header descriptor byte (see DESCRIPTOR_* macros)
  Byte windowDescriptor;
  Byte isLastBlock;           // Last_Block flag of the current block header
  Byte blockType;             // kBlockType_* of the current block
  Byte isErrorState;          // sticky flag: unrecoverable parse error was seen
  Byte hashError;             // content checksum mismatch was detected
  Byte disableHash;           // if set, checksum verification is skipped
  Byte isCyclicMode;

  UInt32 blockSize;           // payload size of current block / skippable frame
  UInt32 dictionaryId;
  UInt32 curBlockUnpackRem;   // for compressed blocks only
  UInt32 inTempPos;           // write position inside inTemp
  
  UInt64 contentSize;         // content size declared in the frame header (if defined)
  UInt64 contentProcessed;    // number of content bytes produced so far
  CXxh64State xxh64;          // running XXH64 state over produced content

  Byte *inTemp;               // temp input buffer (see ZstdDec_AllocateMisc())
  SizeT winBufSize_Allocated; // currently allocated size of win_Base
  Byte *win_Base;             // window buffer, allocated with alloc_Big

  ISzAllocPtr alloc_Small;    // allocator for the object and small buffers
  ISzAllocPtr alloc_Big;      // allocator for the (large) window buffer

  CZstdDec1 decoder;          // block-level (entropy + LZ) decoder state
};

/* number of produced content bytes that were not yet fed to the block-wise
   XXH64 update (contentProcessed modulo the XXH64 block size); these bytes
   are still available at the end of the window */
#define ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p) \
    ((unsigned)(p)->contentProcessed & (Z7_XXH64_BLOCK_SIZE - 1))

#define ZSTD_DEC_IS_LAST_BLOCK(p) ((p)->isLastBlock)
| 2779 | |||
| 2780 | |||
| 2781 | static void ZstdDec_FreeWindow(CZstdDec * const p) | ||
| 2782 | { | ||
| 2783 | if (p->win_Base) | ||
| 2784 | { | ||
| 2785 | ISzAlloc_Free(p->alloc_Big, p->win_Base); | ||
| 2786 | p->win_Base = NULL; | ||
| 2787 | // p->decoder.win = NULL; | ||
| 2788 | p->winBufSize_Allocated = 0; | ||
| 2789 | } | ||
| 2790 | } | ||
| 2791 | |||
| 2792 | |||
| 2793 | CZstdDecHandle ZstdDec_Create(ISzAllocPtr alloc_Small, ISzAllocPtr alloc_Big) | ||
| 2794 | { | ||
| 2795 | CZstdDec *p = (CZstdDec *)ISzAlloc_Alloc(alloc_Small, sizeof(CZstdDec)); | ||
| 2796 | if (!p) | ||
| 2797 | return NULL; | ||
| 2798 | p->alloc_Small = alloc_Small; | ||
| 2799 | p->alloc_Big = alloc_Big; | ||
| 2800 | // ZstdDec_CONSTRUCT(p) | ||
| 2801 | p->inTemp = NULL; | ||
| 2802 | p->win_Base = NULL; | ||
| 2803 | p->winBufSize_Allocated = 0; | ||
| 2804 | p->disableHash = False; | ||
| 2805 | ZstdDec1_Construct(&p->decoder); | ||
| 2806 | return p; | ||
| 2807 | } | ||
| 2808 | |||
/* Free all buffers owned by the decoder and then the decoder object itself.
   In SHOW_STAT builds, the collected decoding statistics are printed first. */
void ZstdDec_Destroy(CZstdDecHandle p)
{
  #ifdef SHOW_STAT
  #define PRINT_STAT1(name, v) \
      printf("\n%25s = %9u", name, v);
  PRINT_STAT1("g_Num_Blocks_Compressed", g_Num_Blocks_Compressed)
  PRINT_STAT1("g_Num_Blocks_memcpy", g_Num_Blocks_memcpy)
  PRINT_STAT1("g_Num_Wrap_memmove_Num", g_Num_Wrap_memmove_Num)
  PRINT_STAT1("g_Num_Wrap_memmove_Bytes", g_Num_Wrap_memmove_Bytes)
  if (g_Num_Blocks_Compressed)
  {
  #define PRINT_STAT(name, v) \
      printf("\n%17s = %9u, per_block = %8u", name, v, v / g_Num_Blocks_Compressed);
    PRINT_STAT("g_NumSeqs", g_NumSeqs_total)
    // PRINT_STAT("g_NumCopy", g_NumCopy)
    PRINT_STAT("g_NumOver", g_NumOver)
    PRINT_STAT("g_NumOver2", g_NumOver2)
    PRINT_STAT("g_Num_Match", g_Num_Match)
    PRINT_STAT("g_Num_Lits", g_Num_Lits)
    PRINT_STAT("g_Num_LitsBig", g_Num_LitsBig)
    PRINT_STAT("g_Num_Lit0", g_Num_Lit0)
    PRINT_STAT("g_Num_Rep_0", g_Num_Rep0)
    PRINT_STAT("g_Num_Rep_1", g_Num_Rep1)
    PRINT_STAT("g_Num_Rep_2", g_Num_Rep2)
    PRINT_STAT("g_Num_Rep_3", g_Num_Rep3)
    PRINT_STAT("g_Num_Threshold_0", g_Num_Threshold_0)
    PRINT_STAT("g_Num_Threshold_1", g_Num_Threshold_1)
    PRINT_STAT("g_Num_Threshold_0sum", g_Num_Threshold_0sum)
    PRINT_STAT("g_Num_Threshold_1sum", g_Num_Threshold_1sum)
  }
  printf("\n");
  #endif

  ISzAlloc_Free(p->alloc_Small, p->decoder.literalsBase);
  // p->decoder.literalsBase = NULL;
  ISzAlloc_Free(p->alloc_Small, p->inTemp);
  // p->inTemp = NULL;
  ZstdDec_FreeWindow(p);
  ISzAlloc_Free(p->alloc_Small, p);
}
| 2849 | |||
| 2850 | |||
| 2851 | |||
// number of bytes reserved before the block area of inTemp (for backward reads)
#define kTempBuffer_PreSize (1u << 6)
#if kTempBuffer_PreSize < MAX_BACKWARD_DEPTH
  #error Stop_Compiling_Bad_kTempBuffer_PreSize
#endif

/* Lazily allocate the small helper buffers:
   - decoder.literalsBase : decoded-literals buffer of kBlockSizeMax bytes,
     plus k_Lit_AfterAvail spare bytes for the optimized copier's overread;
   - inTemp : temp input buffer with kTempBuffer_PreSize bytes reserved in
     front of the block area and k_Lit_AfterAvail bytes after it.
   Returns SZ_OK or SZ_ERROR_MEM. */
static SRes ZstdDec_AllocateMisc(CZstdDec *p)
{
  #define k_Lit_AfterAvail (1u << 6)
  #if k_Lit_AfterAvail < (COPY_CHUNK_SIZE - 1)
    #error Stop_Compiling_Bad_k_Lit_AfterAvail
  #endif
  // return ZstdDec1_Allocate(&p->decoder, p->alloc_Small);
  if (!p->decoder.literalsBase)
  {
    p->decoder.literalsBase = (Byte *)ISzAlloc_Alloc(p->alloc_Small,
        kBlockSizeMax + k_Lit_AfterAvail);
    if (!p->decoder.literalsBase)
      return SZ_ERROR_MEM;
  }
  if (!p->inTemp)
  {
    // we need k_Lit_AfterAvail here for overread from raw literals stream
    p->inTemp = (Byte *)ISzAlloc_Alloc(p->alloc_Small,
        kBlockSizeMax + kTempBuffer_PreSize + k_Lit_AfterAvail);
    if (!p->inTemp)
      return SZ_ERROR_MEM;
  }
  return SZ_OK;
}
| 2881 | |||
| 2882 | |||
| 2883 | static void ZstdDec_Init_ForNewFrame(CZstdDec *p) | ||
| 2884 | { | ||
| 2885 | p->frameState = ZSTD2_STATE_SIGNATURE; | ||
| 2886 | p->tempSize = 0; | ||
| 2887 | |||
| 2888 | p->isErrorState = False; | ||
| 2889 | p->hashError = False; | ||
| 2890 | p->isCyclicMode = False; | ||
| 2891 | p->contentProcessed = 0; | ||
| 2892 | Xxh64State_Init(&p->xxh64); | ||
| 2893 | ZstdDec1_Init(&p->decoder); | ||
| 2894 | } | ||
| 2895 | |||
| 2896 | |||
| 2897 | void ZstdDec_Init(CZstdDec *p) | ||
| 2898 | { | ||
| 2899 | ZstdDec_Init_ForNewFrame(p); | ||
| 2900 | p->decoder.winPos = 0; | ||
| 2901 | memset(p->temp, 0, sizeof(p->temp)); | ||
| 2902 | } | ||
| 2903 | |||
| 2904 | |||
// frame header descriptor byte accessors:
// low 2 bits: size code of the Dictionary_ID field
#define DESCRIPTOR_Get_DictionaryId_Flag(d) ((d) & 3)
#define DESCRIPTOR_FLAG_CHECKSUM (1 << 2)   // content checksum follows the last block
#define DESCRIPTOR_FLAG_RESERVED (1 << 3)   // reserved bit: must be 0, else error
// #define DESCRIPTOR_FLAG_UNUSED (1 << 4)
#define DESCRIPTOR_FLAG_SINGLE (1 << 5)     // single-segment: no window descriptor byte
// top 3 bits select the size of the Frame_Content_Size field:
#define DESCRIPTOR_Get_ContentSize_Flag3(d) ((d) >> 5)
#define DESCRIPTOR_Is_ContentSize_Defined(d) (((d) & 0xe0) != 0)
| 2912 | |||
| 2913 | |||
| 2914 | static EZstd2State ZstdDec_UpdateState(CZstdDec * const p, const Byte b, CZstdDecInfo * const info) | ||
| 2915 | { | ||
| 2916 | unsigned tempSize = p->tempSize; | ||
| 2917 | p->temp[tempSize++] = b; | ||
| 2918 | p->tempSize = tempSize; | ||
| 2919 | |||
| 2920 | if (p->frameState == ZSTD2_STATE_BLOCK) | ||
| 2921 | { | ||
| 2922 | if (tempSize < 3) | ||
| 2923 | return ZSTD2_STATE_BLOCK; | ||
| 2924 | { | ||
| 2925 | UInt32 b0 = GetUi32(p->temp); | ||
| 2926 | const unsigned type = ((unsigned)b0 >> 1) & 3; | ||
| 2927 | if (type == kBlockType_RLE && tempSize == 3) | ||
| 2928 | return ZSTD2_STATE_BLOCK; | ||
| 2929 | // info->num_Blocks_forType[type]++; | ||
| 2930 | info->num_Blocks++; | ||
| 2931 | if (type == kBlockType_Reserved) | ||
| 2932 | { | ||
| 2933 | p->isErrorState = True; // SZ_ERROR_UNSUPPORTED | ||
| 2934 | return ZSTD2_STATE_BLOCK; | ||
| 2935 | } | ||
| 2936 | p->blockType = (Byte)type; | ||
| 2937 | p->isLastBlock = (Byte)(b0 & 1); | ||
| 2938 | p->inTempPos = 0; | ||
| 2939 | p->tempSize = 0; | ||
| 2940 | b0 >>= 3; | ||
| 2941 | b0 &= 0x1fffff; | ||
| 2942 | // info->num_BlockBytes_forType[type] += b0; | ||
| 2943 | if (b0 == 0) | ||
| 2944 | { | ||
| 2945 | // empty RAW/RLE blocks are allowed in original-zstd decoder | ||
| 2946 | if (type == kBlockType_Compressed) | ||
| 2947 | { | ||
| 2948 | p->isErrorState = True; | ||
| 2949 | return ZSTD2_STATE_BLOCK; | ||
| 2950 | } | ||
| 2951 | if (!ZSTD_DEC_IS_LAST_BLOCK(p)) | ||
| 2952 | return ZSTD2_STATE_BLOCK; | ||
| 2953 | if (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM) | ||
| 2954 | return ZSTD2_STATE_HASH; | ||
| 2955 | return ZSTD2_STATE_FINISHED; | ||
| 2956 | } | ||
| 2957 | p->blockSize = b0; | ||
| 2958 | { | ||
| 2959 | UInt32 blockLim = ZstdDec1_GET_BLOCK_SIZE_LIMIT(&p->decoder); | ||
| 2960 | // compressed and uncompressed block sizes cannot be larger than min(kBlockSizeMax, window_size) | ||
| 2961 | if (b0 > blockLim) | ||
| 2962 | { | ||
| 2963 | p->isErrorState = True; // SZ_ERROR_UNSUPPORTED; | ||
| 2964 | return ZSTD2_STATE_BLOCK; | ||
| 2965 | } | ||
| 2966 | if (DESCRIPTOR_Is_ContentSize_Defined(p->descriptor)) | ||
| 2967 | { | ||
| 2968 | const UInt64 rem = p->contentSize - p->contentProcessed; | ||
| 2969 | if (blockLim > rem) | ||
| 2970 | blockLim = (UInt32)rem; | ||
| 2971 | } | ||
| 2972 | p->curBlockUnpackRem = blockLim; | ||
| 2973 | // uncompressed block size cannot be larger than remain data size: | ||
| 2974 | if (type != kBlockType_Compressed) | ||
| 2975 | { | ||
| 2976 | if (b0 > blockLim) | ||
| 2977 | { | ||
| 2978 | p->isErrorState = True; // SZ_ERROR_UNSUPPORTED; | ||
| 2979 | return ZSTD2_STATE_BLOCK; | ||
| 2980 | } | ||
| 2981 | } | ||
| 2982 | } | ||
| 2983 | } | ||
| 2984 | return ZSTD2_STATE_DATA; | ||
| 2985 | } | ||
| 2986 | |||
| 2987 | if ((unsigned)p->frameState < ZSTD2_STATE_SKIP_DATA) | ||
| 2988 | { | ||
| 2989 | UInt32 v; | ||
| 2990 | if (tempSize != 4) | ||
| 2991 | return p->frameState; | ||
| 2992 | v = GetUi32(p->temp); | ||
| 2993 | if ((unsigned)p->frameState < ZSTD2_STATE_HASH) // == ZSTD2_STATE_SIGNATURE | ||
| 2994 | { | ||
| 2995 | if (v == 0xfd2fb528) | ||
| 2996 | { | ||
| 2997 | p->tempSize = 0; | ||
| 2998 | info->num_DataFrames++; | ||
| 2999 | return ZSTD2_STATE_FRAME_HEADER; | ||
| 3000 | } | ||
| 3001 | if ((v & 0xfffffff0) == 0x184d2a50) | ||
| 3002 | { | ||
| 3003 | p->tempSize = 0; | ||
| 3004 | info->num_SkipFrames++; | ||
| 3005 | return ZSTD2_STATE_SKIP_HEADER; | ||
| 3006 | } | ||
| 3007 | p->isErrorState = True; | ||
| 3008 | return ZSTD2_STATE_SIGNATURE; | ||
| 3009 | // return ZSTD2_STATE_ERROR; // is not ZSTD stream | ||
| 3010 | } | ||
| 3011 | if (p->frameState == ZSTD2_STATE_HASH) | ||
| 3012 | { | ||
| 3013 | info->checksum_Defined = True; | ||
| 3014 | info->checksum = v; | ||
| 3015 | // #ifndef DISABLE_XXH_CHECK | ||
| 3016 | if (!p->disableHash) | ||
| 3017 | { | ||
| 3018 | if (p->decoder.winPos < ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p)) | ||
| 3019 | { | ||
| 3020 | // unexpected code failure | ||
| 3021 | p->isErrorState = True; | ||
| 3022 | // SZ_ERROR_FAIL; | ||
| 3023 | } | ||
| 3024 | else | ||
| 3025 | if ((UInt32)Xxh64State_Digest(&p->xxh64, | ||
| 3026 | p->decoder.win + (p->decoder.winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p)), | ||
| 3027 | p->contentProcessed) != v) | ||
| 3028 | { | ||
| 3029 | p->hashError = True; | ||
| 3030 | // return ZSTD2_STATE_ERROR; // hash error | ||
| 3031 | } | ||
| 3032 | } | ||
| 3033 | // #endif | ||
| 3034 | return ZSTD2_STATE_FINISHED; | ||
| 3035 | } | ||
| 3036 | // (p->frameState == ZSTD2_STATE_SKIP_HEADER) | ||
| 3037 | { | ||
| 3038 | p->blockSize = v; | ||
| 3039 | info->skipFrames_Size += v; | ||
| 3040 | p->tempSize = 0; | ||
/* We want the caller to be able to detect that a frame was
   finished. So we allow the case where
   we have ZSTD2_STATE_SKIP_DATA state with (blockSize == 0).
*/
| 3045 | // if (v == 0) return ZSTD2_STATE_SIGNATURE; | ||
| 3046 | return ZSTD2_STATE_SKIP_DATA; | ||
| 3047 | } | ||
| 3048 | } | ||
| 3049 | |||
| 3050 | // if (p->frameState == ZSTD2_STATE_FRAME_HEADER) | ||
| 3051 | { | ||
| 3052 | unsigned descriptor; | ||
| 3053 | const Byte *h; | ||
| 3054 | descriptor = p->temp[0]; | ||
| 3055 | p->descriptor = (Byte)descriptor; | ||
| 3056 | if (descriptor & DESCRIPTOR_FLAG_RESERVED) // reserved bit | ||
| 3057 | { | ||
| 3058 | p->isErrorState = True; | ||
| 3059 | return ZSTD2_STATE_FRAME_HEADER; | ||
| 3060 | // return ZSTD2_STATE_ERROR; | ||
| 3061 | } | ||
| 3062 | { | ||
| 3063 | const unsigned n = DESCRIPTOR_Get_ContentSize_Flag3(descriptor); | ||
| 3064 | // tempSize -= 1 + ((1u << (n >> 1)) | ((n + 1) & 1)); | ||
| 3065 | tempSize -= (0x9a563422u >> (n * 4)) & 0xf; | ||
| 3066 | } | ||
| 3067 | if (tempSize != (4u >> (3 - DESCRIPTOR_Get_DictionaryId_Flag(descriptor)))) | ||
| 3068 | return ZSTD2_STATE_FRAME_HEADER; | ||
| 3069 | |||
| 3070 | info->descriptor_OR = (Byte)(info->descriptor_OR | descriptor); | ||
| 3071 | info->descriptor_NOT_OR = (Byte)(info->descriptor_NOT_OR | ~descriptor); | ||
| 3072 | |||
| 3073 | h = &p->temp[1]; | ||
| 3074 | { | ||
| 3075 | Byte w = 0; | ||
| 3076 | if ((descriptor & DESCRIPTOR_FLAG_SINGLE) == 0) | ||
| 3077 | { | ||
| 3078 | w = *h++; | ||
| 3079 | if (info->windowDescriptor_MAX < w) | ||
| 3080 | info->windowDescriptor_MAX = w; | ||
| 3081 | // info->are_WindowDescriptors = True; | ||
| 3082 | // info->num_WindowDescriptors++; | ||
| 3083 | } | ||
| 3084 | else | ||
| 3085 | { | ||
| 3086 | // info->are_SingleSegments = True; | ||
| 3087 | // info->num_SingleSegments++; | ||
| 3088 | } | ||
| 3089 | p->windowDescriptor = w; | ||
| 3090 | } | ||
| 3091 | { | ||
| 3092 | unsigned n = DESCRIPTOR_Get_DictionaryId_Flag(descriptor); | ||
| 3093 | UInt32 d = 0; | ||
| 3094 | if (n) | ||
| 3095 | { | ||
| 3096 | n = 1u << (n - 1); | ||
| 3097 | d = GetUi32(h) & ((UInt32)(Int32)-1 >> (32 - 8u * n)); | ||
| 3098 | h += n; | ||
| 3099 | } | ||
| 3100 | p->dictionaryId = d; | ||
| 3101 | // info->dictionaryId_Cur = d; | ||
| 3102 | if (d != 0) | ||
| 3103 | { | ||
| 3104 | if (info->dictionaryId == 0) | ||
| 3105 | info->dictionaryId = d; | ||
| 3106 | else if (info->dictionaryId != d) | ||
| 3107 | info->are_DictionaryId_Different = True; | ||
| 3108 | } | ||
| 3109 | } | ||
| 3110 | { | ||
| 3111 | unsigned n = DESCRIPTOR_Get_ContentSize_Flag3(descriptor); | ||
| 3112 | UInt64 v = 0; | ||
| 3113 | if (n) | ||
| 3114 | { | ||
| 3115 | n >>= 1; | ||
| 3116 | if (n == 1) | ||
| 3117 | v = 256; | ||
| 3118 | v += GetUi64(h) & ((UInt64)(Int64)-1 >> (64 - (8u << n))); | ||
| 3119 | // info->are_ContentSize_Known = True; | ||
| 3120 | // info->num_Frames_with_ContentSize++; | ||
| 3121 | if (info->contentSize_MAX < v) | ||
| 3122 | info->contentSize_MAX = v; | ||
| 3123 | info->contentSize_Total += v; | ||
| 3124 | } | ||
| 3125 | else | ||
| 3126 | { | ||
| 3127 | info->are_ContentSize_Unknown = True; | ||
| 3128 | // info->num_Frames_without_ContentSize++; | ||
| 3129 | } | ||
| 3130 | p->contentSize = v; | ||
| 3131 | } | ||
| 3132 | // if ((size_t)(h - p->temp) != headerSize) return ZSTD2_STATE_ERROR; // it's unexpected internal code failure | ||
| 3133 | p->tempSize = 0; | ||
| 3134 | |||
| 3135 | info->checksum_Defined = False; | ||
| 3136 | /* | ||
| 3137 | if (descriptor & DESCRIPTOR_FLAG_CHECKSUM) | ||
| 3138 | info->are_Checksums = True; | ||
| 3139 | else | ||
| 3140 | info->are_Non_Checksums = True; | ||
| 3141 | */ | ||
| 3142 | |||
| 3143 | return ZSTD2_STATE_AFTER_HEADER; // ZSTD2_STATE_BLOCK; | ||
| 3144 | } | ||
| 3145 | } | ||
| 3146 | |||
| 3147 | |||
| 3148 | static void ZstdDec_Update_XXH(CZstdDec * const p, size_t xxh64_winPos) | ||
| 3149 | { | ||
| 3150 | /* | ||
| 3151 | #ifdef DISABLE_XXH_CHECK | ||
| 3152 | UNUSED_VAR(data) | ||
| 3153 | #else | ||
| 3154 | */ | ||
| 3155 | if (!p->disableHash && (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM)) | ||
| 3156 | { | ||
| 3157 | // const size_t pos = p->xxh64_winPos; | ||
| 3158 | const size_t size = (p->decoder.winPos - xxh64_winPos) & ~(size_t)31; | ||
| 3159 | if (size) | ||
| 3160 | { | ||
| 3161 | // p->xxh64_winPos = pos + size; | ||
| 3162 | Xxh64State_UpdateBlocks(&p->xxh64, | ||
| 3163 | p->decoder.win + xxh64_winPos, | ||
| 3164 | p->decoder.win + xxh64_winPos + size); | ||
| 3165 | } | ||
| 3166 | } | ||
| 3167 | } | ||
| 3168 | |||
| 3169 | |||
| 3170 | /* | ||
| 3171 | in: | ||
| 3172 | (winLimit) : is relaxed limit, where this function is allowed to stop writing of decoded data (if possible). | ||
| 3173 | - this function uses (winLimit) for RAW/RLE blocks only, | ||
| 3174 | because this function can decode single RAW/RLE block in several different calls. | ||
| 3175 | - this function DOESN'T use (winLimit) for Compressed blocks, | ||
| 3176 | because this function decodes full compressed block in single call. | ||
| 3177 | (CZstdDec1::winPos <= winLimit) | ||
| 3178 | (winLimit <= CZstdDec1::cycSize). | ||
| 3179 | Note: if (ds->outBuf_fromCaller) mode is used, then | ||
| 3180 | { | ||
| 3181 | (strong_limit) is stored in CZstdDec1::cycSize. | ||
| 3182 | So (winLimit) is more strong than (strong_limit). | ||
| 3183 | } | ||
| 3184 | |||
| 3185 | exit: | ||
| 3186 | Note: (CZstdDecState::winPos) will be set by caller after exit of this function. | ||
| 3187 | |||
| 3188 | This function can exit for any of these conditions: | ||
| 3189 | - (frameState == ZSTD2_STATE_AFTER_HEADER) | ||
| 3190 | - (frameState == ZSTD2_STATE_FINISHED) : frame was finished : (status == ZSTD_STATUS_FINISHED_FRAME) is set | ||
| 3191 | - finished non-empty non-last block. So (CZstdDec1::winPos_atExit != winPos_atFuncStart). | ||
| 3192 | - ZSTD_STATUS_NEEDS_MORE_INPUT in src | ||
| 3193 | - (CZstdDec1::winPos) have reached (winLimit) in non-finished RAW/RLE block | ||
| 3194 | |||
| 3195 | This function decodes no more than one non-empty block. | ||
| 3196 | So it fulfills the condition at exit: | ||
| 3197 | (CZstdDec1::winPos_atExit - winPos_atFuncStart <= block_size_max) | ||
| 3198 | Note: (winPos_atExit > winLimit) is possible in some cases after compressed block decoding. | ||
| 3199 | |||
if (ds->outBuf_fromCaller) mode (useAdditionalWinLimit mode)
| 3201 | { | ||
| 3202 | then this function uses additional strong limit from (CZstdDec1::cycSize). | ||
| 3203 | So this function will not write any data after (CZstdDec1::cycSize) | ||
| 3204 | And it fulfills the condition at exit: | ||
| 3205 | (CZstdDec1::winPos_atExit <= CZstdDec1::cycSize) | ||
| 3206 | } | ||
| 3207 | */ | ||
static SRes ZstdDec_DecodeBlock(CZstdDec * const p, CZstdDecState * const ds,
    SizeT winLimitAdd)
{
  const Byte *src = ds->inBuf;
  SizeT * const srcLen = &ds->inPos;
  const SizeT inSize = ds->inLim;
  // const int useAdditionalWinLimit = ds->outBuf_fromCaller ? 1 : 0;
  enum_ZstdStatus * const status = &ds->status;
  CZstdDecInfo * const info = &ds->info;
  SizeT winLimit;

  const SizeT winPos_atFuncStart = p->decoder.winPos;
  src += *srcLen;
  *status = ZSTD_STATUS_NOT_SPECIFIED;

  // finishMode = ZSTD_FINISH_ANY;
  if (ds->outSize_Defined)
  {
    if (ds->outSize < ds->outProcessed)
    {
      // p->isAfterSizeMode = 2; // we have extra bytes already
      *status = ZSTD_STATUS_OUT_REACHED;
      return SZ_OK;
      // size = 0;
    }
    else
    {
      // p->outSize >= p->outProcessed
      // clamp the relaxed output limit so we never produce more than
      // the caller-declared remaining output size:
      const UInt64 rem = ds->outSize - ds->outProcessed;
      /*
      if (rem == 0)
        p->isAfterSizeMode = 1; // we have reached exact required size
      */
      if (winLimitAdd >= rem)
      {
        winLimitAdd = (SizeT)rem;
        // if (p->finishMode) finishMode = ZSTD_FINISH_END;
      }
    }
  }

  winLimit = p->decoder.winPos + winLimitAdd;
  // (p->decoder.winPos <= winLimit)

  // Main state-machine loop: each iteration either consumes header/state
  // bytes (one byte at a time via ZstdDec_UpdateState) or decodes
  // (part of) one block in ZSTD2_STATE_DATA.
  // while (p->frameState != ZSTD2_STATE_ERROR)
  while (!p->isErrorState)
  {
    SizeT inCur = inSize - *srcLen;

    if (p->frameState == ZSTD2_STATE_DATA)
    {
      /* (p->decoder.winPos == winPos_atFuncStart) is expected,
         because this function doesn't start new block.
         if it has finished some non-empty block in this call. */
      if (p->decoder.winPos != winPos_atFuncStart)
        return SZ_ERROR_FAIL; // it's unexpected

      /*
      if (p->decoder.winPos > winLimit)
      {
        // we can be here, if in this function call
        // - we have extracted non-empty compressed block, and (winPos > winLimit) after that.
        // - we have started new block decoding after that.
        // It's unexpected case, because we exit after non-empty non-last block.
        *status = (inSize == *srcLen) ?
            ZSTD_STATUS_NEEDS_MORE_INPUT :
            ZSTD_STATUS_NOT_FINISHED;
        return SZ_OK;
      }
      */
      // p->decoder.winPos <= winLimit

      if (p->blockType != kBlockType_Compressed)
      {
        // RAW/RLE path: such a block can be decoded incrementally
        // across several calls, so (winLimit) is honored here.
        // it's RLE or RAW block.
        // p->BlockSize != 0_
        // winLimit <= p->decoder.cycSize
        /* So here we use more strong (winLimit), even for
           (ds->outBuf_fromCaller) mode. */
        SizeT outCur = winLimit - p->decoder.winPos;
        {
          const UInt32 rem = p->blockSize;
          if (outCur > rem)
            outCur = rem;
        }
        if (p->blockType == kBlockType_Raw)
        {
          if (outCur > inCur)
            outCur = inCur;
          /* output buffer is better aligned for XXH code.
             So we use hash for output buffer data */
          // ZstdDec_Update_XXH(p, src, outCur); // for debug:
          memcpy(p->decoder.win + p->decoder.winPos, src, outCur);
          src += outCur;
          *srcLen += outCur;
        }
        else // kBlockType_RLE
        {
          // the RLE fill byte was stored at temp[3] while parsing the block header
          #define RLE_BYTE_INDEX_IN_temp 3
          memset(p->decoder.win + p->decoder.winPos,
              p->temp[RLE_BYTE_INDEX_IN_temp], outCur);
        }
        {
          const SizeT xxh64_winPos = p->decoder.winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p);
          p->decoder.winPos += outCur;
          p->contentProcessed += outCur;
          ZstdDec_Update_XXH(p, xxh64_winPos);
        }
        // ds->winPos = p->decoder.winPos; // the caller does it instead. for debug:
        UPDATE_TOTAL_OUT(&p->decoder, outCur)
        ds->outProcessed += outCur;
        if (p->blockSize -= (UInt32)outCur)
        {
          // block is not finished yet: report why we stopped
          /*
          if (ds->outSize_Defined)
          {
            if (ds->outSize <= ds->outProcessed) ds->isAfterSizeMode = (enum_ZstdStatus)
              (ds->outSize == ds->outProcessed ? 1u: 2u);
          }
          */
          *status = (enum_ZstdStatus)
            (ds->outSize_Defined && ds->outSize <= ds->outProcessed ?
              ZSTD_STATUS_OUT_REACHED : (p->blockType == kBlockType_Raw && inSize == *srcLen) ?
              ZSTD_STATUS_NEEDS_MORE_INPUT :
              ZSTD_STATUS_NOT_FINISHED);
          return SZ_OK;
        }
      }
      else // kBlockType_Compressed
      {
        // Compressed path: the whole compressed block is decoded in a
        // single ZstdDec1_DecodeBlock() call, so full block input must
        // be gathered first (either directly from (src) or via inTemp).
        // p->blockSize != 0
        // (uncompressed_size_of_block == 0) is allowed
        // (p->curBlockUnpackRem == 0) is allowed
        /*
        if (p->decoder.winPos >= winLimit)
        {
          if (p->decoder.winPos != winPos_atFuncStart)
          {
            // it's unexpected case
            // We already have some data in finished blocks in this function call.
            // So we don't decompress new block after (>=winLimit),
            // even if it's empty block.
            *status = (inSize == *srcLen) ?
                ZSTD_STATUS_NEEDS_MORE_INPUT :
                ZSTD_STATUS_NOT_FINISHED;
            return SZ_OK;
          }
          // (p->decoder.winPos == winLimit == winPos_atFuncStart)
          // we will decode current block, because that current
          // block can be empty block and we want to make some visible
          // change of (src) stream after function start.
        }
        */
        /*
        if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
        {
          // we don't want to start new block, if we have more extra decoded bytes already
          *status = ZSTD_STATUS_OUT_REACHED;
          return SZ_OK;
        }
        */
        {
          const Byte *comprStream;
          size_t afterAvail;
          UInt32 inTempPos = p->inTempPos;
          const UInt32 rem = p->blockSize - inTempPos;
          // rem != 0
          if (inTempPos != 0 // (inTemp) buffer already contains some input data
              || inCur < rem // available input data size is smaller than compressed block size
              || ZstdDec1_NeedTempBufferForInput(*srcLen, src, rem))
          {
            // buffered path: accumulate the block in (inTemp)
            if (inCur > rem)
              inCur = rem;
            if (inCur)
            {
              STAT_INC(g_Num_Blocks_memcpy)
              // we clear data for backward lookahead reading
              if (inTempPos == 0)
                memset(p->inTemp + kTempBuffer_PreSize - MAX_BACKWARD_DEPTH, 0, MAX_BACKWARD_DEPTH);
              // { unsigned y = 0; for(;y < 1000; y++)
              memcpy(p->inTemp + inTempPos + kTempBuffer_PreSize, src, inCur);
              // }
              src += inCur;
              *srcLen += inCur;
              inTempPos += (UInt32)inCur;
              p->inTempPos = inTempPos;
            }
            if (inTempPos != p->blockSize)
            {
              *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
              return SZ_OK;
            }
            #if COPY_CHUNK_SIZE > 1
              memset(p->inTemp + kTempBuffer_PreSize + inTempPos, 0, COPY_CHUNK_SIZE);
            #endif
            comprStream = p->inTemp + kTempBuffer_PreSize;
            afterAvail = k_Lit_AfterAvail;
            // we don't want to read non-initialized data or junk in CopyMatch():
          }
          else
          {
            // inCur >= rem
            // we use direct decoding from (src) buffer:
            afterAvail = inCur - rem;
            comprStream = src;
            src += rem;
            *srcLen += rem;
          }

          #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
            ZstdDec1_NeedTempBufferForInput(*srcLen, comprStream, p->blockSize);
          #endif
          // printf("\nblockSize=%u", p->blockSize);
          // printf("%x\n", (unsigned)p->contentProcessed);
          STAT_INC(g_Num_Blocks_Compressed)
          {
            SRes sres;
            const size_t winPos = p->decoder.winPos;
            /*
            if ( useAdditionalWinLimit), we use strong unpack limit: smallest from
               - limit from stream : (curBlockUnpackRem)
               - limit from caller : (cycSize - winPos)
            if (!useAdditionalWinLimit), we use only relaxed limit:
               - limit from stream : (curBlockUnpackRem)
            */
            SizeT outLimit = p->curBlockUnpackRem;
            if (ds->outBuf_fromCaller)
            // if (useAdditionalWinLimit)
            {
              const size_t limit = p->decoder.cycSize - winPos;
              if (outLimit > limit)
                outLimit = limit;
            }
            sres = ZstdDec1_DecodeBlock(&p->decoder,
                comprStream, p->blockSize, afterAvail, outLimit);
            // ds->winPos = p->decoder.winPos; // the caller does it instead. for debug:
            if (sres)
            {
              p->isErrorState = True;
              return sres;
            }
            {
              const SizeT xxh64_winPos = winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p);
              const size_t num = p->decoder.winPos - winPos;
              ds->outProcessed += num;
              p->contentProcessed += num;
              ZstdDec_Update_XXH(p, xxh64_winPos);
            }
          }
          // printf("\nwinPos=%x", (int)(unsigned)p->decoder.winPos);
        }
      }

      /*
      if (ds->outSize_Defined)
      {
        if (ds->outSize <= ds->outProcessed) ds->isAfterSizeMode = (enum_ZstdStatus)
          (ds->outSize == ds->outProcessed ? 1u: 2u);
      }
      */

      if (!ZSTD_DEC_IS_LAST_BLOCK(p))
      {
        p->frameState = ZSTD2_STATE_BLOCK;
        if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
        {
          *status = ZSTD_STATUS_OUT_REACHED;
          return SZ_OK;
        }
        // we exit only if (winPos) was changed in this function call:
        if (p->decoder.winPos != winPos_atFuncStart)
        {
          // decoded block was not empty. So we exit:
          *status = (enum_ZstdStatus)(
            (inSize == *srcLen) ?
              ZSTD_STATUS_NEEDS_MORE_INPUT :
              ZSTD_STATUS_NOT_FINISHED);
          return SZ_OK;
        }
        // (p->decoder.winPos == winPos_atFuncStart)
        // so current decoded block was empty.
        // we will try to decode more blocks in this function.
        continue;
      }

      // decoded block was last in frame
      if (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM)
      {
        p->frameState = ZSTD2_STATE_HASH;
        if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
        {
          *status = ZSTD_STATUS_OUT_REACHED;
          return SZ_OK; // disable if want to
          /* We want to get same return codes for any input buffer sizes.
             We want to get faster ZSTD_STATUS_OUT_REACHED status.
             So we exit with ZSTD_STATUS_OUT_REACHED here,
             instead of ZSTD2_STATE_HASH and ZSTD2_STATE_FINISHED processing.
             that depends from input buffer size and that can set
             ZSTD_STATUS_NEEDS_MORE_INPUT or return SZ_ERROR_DATA or SZ_ERROR_CRC.
          */
        }
      }
      else
      {
        /* ZSTD2_STATE_FINISHED processing doesn't depend from input buffer */
        p->frameState = ZSTD2_STATE_FINISHED;
      }
      /*
      p->frameState = (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM) ?
          ZSTD2_STATE_HASH :
          ZSTD2_STATE_FINISHED;
      */
      /* it's required to process ZSTD2_STATE_FINISHED state in this function call,
         because we must check contentSize and hashError in ZSTD2_STATE_FINISHED code,
         while the caller can reinit full state for ZSTD2_STATE_FINISHED
         So we can't exit from function here. */
      continue;
    }

    if (p->frameState == ZSTD2_STATE_FINISHED)
    {
      *status = ZSTD_STATUS_FINISHED_FRAME;
      if (DESCRIPTOR_Is_ContentSize_Defined(p->descriptor)
          && p->contentSize != p->contentProcessed)
        return SZ_ERROR_DATA;
      if (p->hashError) // for debug
        return SZ_ERROR_CRC;
      return SZ_OK;
      // p->frameState = ZSTD2_STATE_SIGNATURE;
      // continue;
    }

    if (p->frameState == ZSTD2_STATE_AFTER_HEADER)
      return SZ_OK; // we need memory allocation for that state

    if (p->frameState == ZSTD2_STATE_SKIP_DATA)
    {
      // skippable-frame payload: just consume and discard input bytes
      UInt32 blockSize = p->blockSize;
      // (blockSize == 0) is possible
      if (inCur > blockSize)
        inCur = blockSize;
      src += inCur;
      *srcLen += inCur;
      blockSize -= (UInt32)inCur;
      p->blockSize = blockSize;
      if (blockSize == 0)
      {
        p->frameState = ZSTD2_STATE_SIGNATURE;
        // continue; // for debug: we can continue without return to caller.
        // we notify the caller that skip frame was finished:
        *status = ZSTD_STATUS_FINISHED_FRAME;
        return SZ_OK;
      }
      // blockSize != 0
      // (inCur) was smaller than previous value of p->blockSize.
      // (inSize == *srcLen) now
      *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
      return SZ_OK;
    }

    if (inCur == 0)
    {
      *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
      return SZ_OK;
    }

    {
      // header/signature states consume input one byte at a time:
      (*srcLen)++;
      p->frameState = ZstdDec_UpdateState(p, *src++, info);
    }
  }

  *status = ZSTD_STATUS_NOT_SPECIFIED;
  p->isErrorState = True;
  // p->frameState = ZSTD2_STATE_ERROR;
  // if (p->frameState = ZSTD2_STATE_SIGNATURE) return SZ_ERROR_NO_ARCHIVE
  return SZ_ERROR_DATA;
}
| 3586 | |||
| 3587 | |||
| 3588 | |||
| 3589 | |||
| 3590 | SRes ZstdDec_Decode(CZstdDecHandle dec, CZstdDecState *p) | ||
| 3591 | { | ||
| 3592 | p->needWrite_Size = 0; | ||
| 3593 | p->status = ZSTD_STATUS_NOT_SPECIFIED; | ||
| 3594 | dec->disableHash = p->disableHash; | ||
| 3595 | |||
| 3596 | if (p->outBuf_fromCaller) | ||
| 3597 | { | ||
| 3598 | dec->decoder.win = p->outBuf_fromCaller; | ||
| 3599 | dec->decoder.cycSize = p->outBufSize_fromCaller; | ||
| 3600 | } | ||
| 3601 | |||
| 3602 | // p->winPos = dec->decoder.winPos; | ||
| 3603 | |||
| 3604 | for (;;) | ||
| 3605 | { | ||
| 3606 | SizeT winPos, size; | ||
| 3607 | // SizeT outProcessed; | ||
| 3608 | SRes res; | ||
| 3609 | |||
| 3610 | if (p->wrPos > dec->decoder.winPos) | ||
| 3611 | return SZ_ERROR_FAIL; | ||
| 3612 | |||
| 3613 | if (dec->frameState == ZSTD2_STATE_FINISHED) | ||
| 3614 | { | ||
| 3615 | if (!p->outBuf_fromCaller) | ||
| 3616 | { | ||
| 3617 | // we need to set positions to zero for new frame. | ||
| 3618 | if (p->wrPos != dec->decoder.winPos) | ||
| 3619 | { | ||
| 3620 | /* We have already asked the caller to flush all data | ||
| 3621 | with (p->needWrite_Size) and (ZSTD_STATUS_FINISHED_FRAME) status. | ||
| 3622 | So it's unexpected case */ | ||
| 3623 | // p->winPos = dec->decoder.winPos; | ||
| 3624 | // p->needWrite_Size = dec->decoder.winPos - p->wrPos; // flush size asking | ||
| 3625 | // return SZ_OK; // ask to flush again | ||
| 3626 | return SZ_ERROR_FAIL; | ||
| 3627 | } | ||
| 3628 | // (p->wrPos == dec->decoder.winPos), and we wrap to zero: | ||
| 3629 | dec->decoder.winPos = 0; | ||
| 3630 | p->winPos = 0; | ||
| 3631 | p->wrPos = 0; | ||
| 3632 | } | ||
| 3633 | ZstdDec_Init_ForNewFrame(dec); | ||
| 3634 | // continue; | ||
| 3635 | } | ||
| 3636 | |||
| 3637 | winPos = dec->decoder.winPos; | ||
| 3638 | { | ||
| 3639 | SizeT next = dec->decoder.cycSize; | ||
| 3640 | /* cycSize == 0, if no buffer was allocated still, | ||
| 3641 | or, if (outBuf_fromCaller) mode and (outBufSize_fromCaller == 0) */ | ||
| 3642 | if (!p->outBuf_fromCaller | ||
| 3643 | && next | ||
| 3644 | && next <= winPos | ||
| 3645 | && dec->isCyclicMode) | ||
| 3646 | { | ||
| 3647 | // (0 < decoder.cycSize <= winPos) in isCyclicMode. | ||
| 3648 | // so we need to wrap (winPos) and (wrPos) over (cycSize). | ||
| 3649 | const size_t delta = next; | ||
| 3650 | // (delta) is how many bytes we remove from buffer. | ||
| 3651 | /* | ||
| 3652 | // we don't need data older than last (cycSize) bytes. | ||
| 3653 | size_t delta = winPos - next; // num bytes after (cycSize) | ||
| 3654 | if (delta <= next) // it's expected case | ||
| 3655 | delta = next; | ||
| 3656 | // delta == Max(cycSize, winPos - cycSize) | ||
| 3657 | */ | ||
| 3658 | if (p->wrPos < delta) | ||
| 3659 | { | ||
| 3660 | // (wrPos < decoder.cycSize) | ||
| 3661 | // We have asked already the caller to flush required data | ||
| 3662 | // p->status = ZSTD_STATUS_NOT_SPECIFIED; | ||
| 3663 | // p->winPos = winPos; | ||
| 3664 | // p->needWrite_Size = delta - p->wrPos; // flush size asking | ||
| 3665 | // return SZ_OK; // ask to flush again | ||
| 3666 | return SZ_ERROR_FAIL; | ||
| 3667 | } | ||
| 3668 | // p->wrPos >= decoder.cycSize | ||
| 3669 | // we move extra data after (decoder.cycSize) to start of cyclic buffer: | ||
| 3670 | winPos -= delta; | ||
| 3671 | if (winPos) | ||
| 3672 | { | ||
| 3673 | if (winPos >= delta) | ||
| 3674 | return SZ_ERROR_FAIL; | ||
| 3675 | memmove(dec->decoder.win, dec->decoder.win + delta, winPos); | ||
| 3676 | // printf("\nmemmove processed=%8x winPos=%8x\n", (unsigned)p->outProcessed, (unsigned)dec->decoder.winPos); | ||
| 3677 | STAT_INC(g_Num_Wrap_memmove_Num) | ||
| 3678 | STAT_UPDATE(g_Num_Wrap_memmove_Bytes += (unsigned)winPos;) | ||
| 3679 | } | ||
| 3680 | dec->decoder.winPos = winPos; | ||
| 3681 | p->winPos = winPos; | ||
| 3682 | p->wrPos -= delta; | ||
| 3683 | // dec->xxh64_winPos -= delta; | ||
| 3684 | |||
| 3685 | // (winPos < delta) | ||
| 3686 | #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF | ||
| 3687 | /* we set the data after cycSize, because | ||
| 3688 | we don't want to read non-initialized data or junk in CopyMatch(). */ | ||
| 3689 | memset(dec->decoder.win + next, 0, COPY_CHUNK_SIZE); | ||
| 3690 | #endif | ||
| 3691 | |||
| 3692 | /* | ||
| 3693 | if (winPos == next) | ||
| 3694 | { | ||
| 3695 | if (winPos != p->wrPos) | ||
| 3696 | { | ||
| 3697 | // we already requested before to flush full data for that case. | ||
| 3698 | // but we give the caller a second chance to flush data: | ||
| 3699 | p->needWrite_Size = winPos - p->wrPos; | ||
| 3700 | return SZ_OK; | ||
| 3701 | } | ||
| 3702 | // (decoder.cycSize == winPos == p->wrPos) | ||
| 3703 | // so we do second wrapping to zero: | ||
| 3704 | winPos = 0; | ||
| 3705 | dec->decoder.winPos = 0; | ||
| 3706 | p->winPos = 0; | ||
| 3707 | p->wrPos = 0; | ||
| 3708 | } | ||
| 3709 | */ | ||
| 3710 | // (winPos < next) | ||
| 3711 | } | ||
| 3712 | |||
| 3713 | if (winPos > next) | ||
| 3714 | return SZ_ERROR_FAIL; // it's unexpected case | ||
| 3715 | /* | ||
| 3716 | if (!outBuf_fromCaller && isCyclicMode && cycSize != 0) | ||
| 3717 | then (winPos < cycSize) | ||
| 3718 | else (winPos <= cycSize) | ||
| 3719 | */ | ||
| 3720 | if (!p->outBuf_fromCaller) | ||
| 3721 | { | ||
| 3722 | // that code is optional. We try to optimize write chunk sizes. | ||
| 3723 | /* (next2) is expected next write position in the caller, | ||
| 3724 | if the caller writes by kBlockSizeMax chunks. | ||
| 3725 | */ | ||
| 3726 | /* | ||
| 3727 | const size_t next2 = (winPos + kBlockSizeMax) & (kBlockSizeMax - 1); | ||
| 3728 | if (winPos < next2 && next2 < next) | ||
| 3729 | next = next2; | ||
| 3730 | */ | ||
| 3731 | } | ||
| 3732 | size = next - winPos; | ||
| 3733 | } | ||
| 3734 | |||
| 3735 | // note: ZstdDec_DecodeBlock() uses (winLimit = winPos + size) only for RLE and RAW blocks | ||
| 3736 | res = ZstdDec_DecodeBlock(dec, p, size); | ||
| 3737 | /* | ||
| 3738 | after one block decoding: | ||
| 3739 | if (!outBuf_fromCaller && isCyclicMode && cycSize != 0) | ||
| 3740 | then (winPos < cycSize + max_block_size) | ||
| 3741 | else (winPos <= cycSize) | ||
| 3742 | */ | ||
| 3743 | |||
| 3744 | if (!p->outBuf_fromCaller) | ||
| 3745 | p->win = dec->decoder.win; | ||
| 3746 | p->winPos = dec->decoder.winPos; | ||
| 3747 | |||
| 3748 | // outProcessed = dec->decoder.winPos - winPos; | ||
| 3749 | // p->outProcessed += outProcessed; | ||
| 3750 | |||
| 3751 | if (res != SZ_OK) | ||
| 3752 | return res; | ||
| 3753 | |||
| 3754 | if (dec->frameState != ZSTD2_STATE_AFTER_HEADER) | ||
| 3755 | { | ||
| 3756 | if (p->outBuf_fromCaller) | ||
| 3757 | return SZ_OK; | ||
| 3758 | { | ||
| 3759 | // !p->outBuf_fromCaller | ||
| 3760 | /* | ||
| 3761 | if (ZSTD_STATUS_FINISHED_FRAME), we request full flushing here because | ||
| 3762 | 1) it's simpler to work with allocation and extracting of next frame, | ||
| 3763 | 2) it's better to start writing to next new frame with aligned memory | ||
| 3764 | for faster xxh 64-bit reads. | ||
| 3765 | */ | ||
| 3766 | size_t end = dec->decoder.winPos; // end pos for all data flushing | ||
| 3767 | if (p->status != ZSTD_STATUS_FINISHED_FRAME) | ||
| 3768 | { | ||
| 3769 | // we will request flush here only for cases when wrap in cyclic buffer can be required in next call. | ||
| 3770 | if (!dec->isCyclicMode) | ||
| 3771 | return SZ_OK; | ||
| 3772 | // isCyclicMode | ||
| 3773 | { | ||
| 3774 | const size_t delta = dec->decoder.cycSize; | ||
| 3775 | if (end < delta) | ||
| 3776 | return SZ_OK; // (winPos < cycSize). no need for flush | ||
| 3777 | // cycSize <= winPos | ||
| 3778 | // So we ask the caller to flush of (cycSize - wrPos) bytes, | ||
| 3779 | // and then we will wrap cylicBuffer in next call | ||
| 3780 | end = delta; | ||
| 3781 | } | ||
| 3782 | } | ||
| 3783 | p->needWrite_Size = end - p->wrPos; | ||
| 3784 | } | ||
| 3785 | return SZ_OK; | ||
| 3786 | } | ||
| 3787 | |||
| 3788 | // ZSTD2_STATE_AFTER_HEADER | ||
| 3789 | { | ||
| 3790 | BoolInt useCyclic = False; | ||
| 3791 | size_t cycSize; | ||
| 3792 | |||
| 3793 | // p->status = ZSTD_STATUS_NOT_FINISHED; | ||
| 3794 | if (dec->dictionaryId != 0) | ||
| 3795 | { | ||
| 3796 | /* actually we can try to decode some data, | ||
| 3797 | because it's possible that some data doesn't use dictionary */ | ||
| 3798 | // p->status = ZSTD_STATUS_NOT_SPECIFIED; | ||
| 3799 | return SZ_ERROR_UNSUPPORTED; | ||
| 3800 | } | ||
| 3801 | |||
| 3802 | { | ||
| 3803 | UInt64 winSize = dec->contentSize; | ||
| 3804 | UInt64 winSize_Allocate = winSize; | ||
| 3805 | const unsigned descriptor = dec->descriptor; | ||
| 3806 | |||
| 3807 | if ((descriptor & DESCRIPTOR_FLAG_SINGLE) == 0) | ||
| 3808 | { | ||
| 3809 | const Byte wd = dec->windowDescriptor; | ||
| 3810 | winSize = (UInt64)(8 + (wd & 7)) << ((wd >> 3) + 10 - 3); | ||
| 3811 | if (!DESCRIPTOR_Is_ContentSize_Defined(descriptor) | ||
| 3812 | || winSize_Allocate > winSize) | ||
| 3813 | { | ||
| 3814 | winSize_Allocate = winSize; | ||
| 3815 | useCyclic = True; | ||
| 3816 | } | ||
| 3817 | } | ||
| 3818 | /* | ||
| 3819 | else | ||
| 3820 | { | ||
| 3821 | if (p->info.singleSegment_ContentSize_MAX < winSize) | ||
| 3822 | p->info.singleSegment_ContentSize_MAX = winSize; | ||
| 3823 | // p->info.num_SingleSegments++; | ||
| 3824 | } | ||
| 3825 | */ | ||
| 3826 | if (p->info.windowSize_MAX < winSize) | ||
| 3827 | p->info.windowSize_MAX = winSize; | ||
| 3828 | if (p->info.windowSize_Allocate_MAX < winSize_Allocate) | ||
| 3829 | p->info.windowSize_Allocate_MAX = winSize_Allocate; | ||
| 3830 | /* | ||
| 3831 | winSize_Allocate is MIN(content_size, window_size_from_descriptor). | ||
| 3832 | Even if (content_size < (window_size_from_descriptor)) | ||
| 3833 | original-zstd still uses (window_size_from_descriptor) to check that decoding is allowed. | ||
| 3834 | We try to follow original-zstd, and here we check (winSize) instead of (winSize_Allocate)) | ||
| 3835 | */ | ||
| 3836 | if ( | ||
| 3837 | // winSize_Allocate // it's relaxed check | ||
| 3838 | winSize // it's more strict check to be compatible with original-zstd | ||
| 3839 | > ((UInt64)1 << MAX_WINDOW_SIZE_LOG)) | ||
| 3840 | return SZ_ERROR_UNSUPPORTED; // SZ_ERROR_MEM | ||
| 3841 | cycSize = (size_t)winSize_Allocate; | ||
| 3842 | if (cycSize != winSize_Allocate) | ||
| 3843 | return SZ_ERROR_MEM; | ||
| 3844 | // cycSize <= winSize | ||
| 3845 | /* later we will use (CZstdDec1::winSize) to check match offsets and check block sizes. | ||
| 3846 | if (there is window descriptor) | ||
| 3847 | { | ||
| 3848 | We will check block size with (window_size_from_descriptor) instead of (winSize_Allocate). | ||
| 3849 | Does original-zstd do it that way also? | ||
| 3850 | } | ||
| 3851 | Here we must reduce full real 64-bit (winSize) to size_t for (CZstdDec1::winSize). | ||
| 3852 | Also we don't want too big values for (CZstdDec1::winSize). | ||
| 3853 | our (CZstdDec1::winSize) will meet the condition: | ||
| 3854 | (CZstdDec1::winSize < kBlockSizeMax || CZstdDec1::winSize <= cycSize). | ||
| 3855 | */ | ||
| 3856 | dec->decoder.winSize = (winSize < kBlockSizeMax) ? (size_t)winSize: cycSize; | ||
| 3857 | // note: (CZstdDec1::winSize > cycSize) is possible, if (!useCyclic) | ||
| 3858 | } | ||
| 3859 | |||
| 3860 | RINOK(ZstdDec_AllocateMisc(dec)) | ||
| 3861 | |||
| 3862 | if (p->outBuf_fromCaller) | ||
| 3863 | dec->isCyclicMode = False; | ||
| 3864 | else | ||
| 3865 | { | ||
| 3866 | size_t d = cycSize; | ||
| 3867 | |||
| 3868 | if (dec->decoder.winPos != p->wrPos) | ||
| 3869 | return SZ_ERROR_FAIL; | ||
| 3870 | |||
| 3871 | dec->decoder.winPos = 0; | ||
| 3872 | p->wrPos = 0; | ||
| 3873 | p->winPos = dec->decoder.winPos; | ||
| 3874 | |||
| 3875 | /* | ||
| 3876 | const size_t needWrite = dec->decoder.winPos - p->wrPos; | ||
| 3877 | if (!needWrite) | ||
| 3878 | { | ||
| 3879 | dec->decoder.winPos = 0; | ||
| 3880 | p->wrPos = 0; | ||
| 3881 | p->winPos = dec->decoder.winPos; | ||
| 3882 | } | ||
| 3883 | */ | ||
| 3884 | /* if (!useCyclic) we allocate only cycSize = ContentSize. | ||
| 3885 | But if we want to support the case where new frame starts with winPos != 0, | ||
| 3886 | then we will wrap over zero, and we still need | ||
| 3887 | to set (useCyclic) and allocate additional buffer spaces. | ||
| 3888 | Now we don't allow new frame starting with (winPos != 0). | ||
| 3889 | so (dec->decoder->winPos == 0) | ||
| 3890 | can use (!useCyclic) with reduced buffer sizes. | ||
| 3891 | */ | ||
| 3892 | /* | ||
| 3893 | if (dec->decoder->winPos != 0) | ||
| 3894 | useCyclic = True; | ||
| 3895 | */ | ||
| 3896 | |||
| 3897 | if (useCyclic) | ||
| 3898 | { | ||
| 3899 | /* cyclic buffer size must be at least (COPY_CHUNK_SIZE - 1) bytes | ||
| 3900 | larger than window size, because CopyMatch() can write additional | ||
| 3901 | (COPY_CHUNK_SIZE - 1) bytes and overwrite oldest data in cyclic buffer. | ||
| 3902 | But for performance reasons we align (cycSize) for (kBlockSizeMax). | ||
| 3903 | also we must provide (cycSize >= max_decoded_data_after_cycSize), | ||
| 3904 | because after data move wrapping over zero we must provide (winPos < cycSize). | ||
| 3905 | */ | ||
| 3906 | const size_t alignSize = kBlockSizeMax; | ||
| 3907 | /* here we add (1 << 7) instead of (COPY_CHUNK_SIZE - 1), because | ||
| 3908 | we want to get same (cycSize) for different COPY_CHUNK_SIZE values. */ | ||
| 3909 | // cycSize += (COPY_CHUNK_SIZE - 1) + (alignSize - 1); // for debug : we can get smallest (cycSize) | ||
| 3910 | cycSize += (1 << 7) + alignSize; | ||
| 3911 | cycSize &= ~(size_t)(alignSize - 1); | ||
| 3912 | // cycSize must be aligned for 32, because xxh requires 32-bytes blocks. | ||
| 3913 | // cycSize += 12345; // for debug | ||
| 3914 | // cycSize += 1 << 10; // for debug | ||
| 3915 | // cycSize += 32; // for debug | ||
| 3916 | // cycSize += kBlockSizeMax; // for debug | ||
| 3917 | if (cycSize < d) | ||
| 3918 | return SZ_ERROR_MEM; | ||
| 3919 | /* | ||
| 3920 | in cyclic buffer mode we allow to decode one additional block | ||
| 3921 | that exceeds (cycSize). | ||
| 3922 | So we must allocate additional (kBlockSizeMax) bytes after (cycSize). | ||
| 3923 | if defined(Z7_STD_DEC_USE_AFTER_CYC_BUF) | ||
| 3924 | { | ||
| 3925 | we can read (COPY_CHUNK_SIZE - 1) bytes after (cycSize) | ||
| 3926 | but we already allocate additional kBlockSizeMax that | ||
| 3927 | is larger than COPY_CHUNK_SIZE. | ||
| 3928 | So we don't need additional space of COPY_CHUNK_SIZE after (cycSize). | ||
| 3929 | } | ||
| 3930 | */ | ||
| 3931 | /* | ||
| 3932 | #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF | ||
| 3933 | d = cycSize + (1 << 7); // we must add at least (COPY_CHUNK_SIZE - 1) | ||
| 3934 | #endif | ||
| 3935 | */ | ||
| 3936 | d = cycSize + kBlockSizeMax; | ||
| 3937 | if (d < cycSize) | ||
| 3938 | return SZ_ERROR_MEM; | ||
| 3939 | } | ||
| 3940 | |||
| 3941 | { | ||
| 3942 | const size_t kMinWinAllocSize = 1 << 12; | ||
| 3943 | if (d < kMinWinAllocSize) | ||
| 3944 | d = kMinWinAllocSize; | ||
| 3945 | } | ||
| 3946 | |||
| 3947 | if (d > dec->winBufSize_Allocated) | ||
| 3948 | { | ||
| 3949 | /* | ||
| 3950 | if (needWrite) | ||
| 3951 | { | ||
| 3952 | p->needWrite_Size = needWrite; | ||
| 3953 | return SZ_OK; | ||
| 3954 | // return SZ_ERROR_FAIL; | ||
| 3955 | } | ||
| 3956 | */ | ||
| 3957 | |||
| 3958 | if (dec->winBufSize_Allocated != 0) | ||
| 3959 | { | ||
| 3960 | const size_t k_extra = (useCyclic || d >= (1u << 20)) ? | ||
| 3961 | 2 * kBlockSizeMax : 0; | ||
| 3962 | unsigned i = useCyclic ? 17 : 12; | ||
| 3963 | for (; i < sizeof(size_t) * 8; i++) | ||
| 3964 | { | ||
| 3965 | const size_t d2 = ((size_t)1 << i) + k_extra; | ||
| 3966 | if (d2 >= d) | ||
| 3967 | { | ||
| 3968 | d = d2; | ||
| 3969 | break; | ||
| 3970 | } | ||
| 3971 | } | ||
| 3972 | } | ||
| 3973 | // RINOK(ZstdDec_AllocateWindow(dec, d)) | ||
| 3974 | ZstdDec_FreeWindow(dec); | ||
| 3975 | dec->win_Base = (Byte *)ISzAlloc_Alloc(dec->alloc_Big, d); | ||
| 3976 | if (!dec->win_Base) | ||
| 3977 | return SZ_ERROR_MEM; | ||
| 3978 | dec->decoder.win = dec->win_Base; | ||
| 3979 | dec->winBufSize_Allocated = d; | ||
| 3980 | } | ||
| 3981 | /* | ||
| 3982 | else | ||
| 3983 | { | ||
| 3984 | // for non-cyclycMode we want flush data, and set winPos = 0 | ||
| 3985 | if (needWrite) | ||
| 3986 | { | ||
| 3987 | if (!useCyclic || dec->decoder.winPos >= cycSize) | ||
| 3988 | { | ||
| 3989 | p->needWrite_Size = needWrite; | ||
| 3990 | return SZ_OK; | ||
| 3991 | // return SZ_ERROR_FAIL; | ||
| 3992 | } | ||
| 3993 | } | ||
| 3994 | } | ||
| 3995 | */ | ||
| 3996 | |||
| 3997 | dec->decoder.cycSize = cycSize; | ||
| 3998 | p->win = dec->decoder.win; | ||
| 3999 | // p->cycSize = dec->decoder.cycSize; | ||
| 4000 | dec->isCyclicMode = (Byte)useCyclic; | ||
| 4001 | } // (!p->outBuf_fromCaller) end | ||
| 4002 | |||
| 4003 | // p->winPos = dec->decoder.winPos; | ||
| 4004 | dec->frameState = ZSTD2_STATE_BLOCK; | ||
| 4005 | // continue; | ||
| 4006 | } // ZSTD2_STATE_AFTER_HEADER end | ||
| 4007 | } | ||
| 4008 | } | ||
| 4009 | |||
| 4010 | |||
| 4011 | void ZstdDec_GetResInfo(const CZstdDec *dec, | ||
| 4012 | const CZstdDecState *p, | ||
| 4013 | SRes res, | ||
| 4014 | CZstdDecResInfo *stat) | ||
| 4015 | { | ||
| 4016 | // ZstdDecInfo_CLEAR(stat); | ||
| 4017 | stat->extraSize = 0; | ||
| 4018 | stat->is_NonFinishedFrame = False; | ||
| 4019 | if (dec->frameState != ZSTD2_STATE_FINISHED) | ||
| 4020 | { | ||
| 4021 | if (dec->frameState == ZSTD2_STATE_SIGNATURE) | ||
| 4022 | { | ||
| 4023 | stat->extraSize = (Byte)dec->tempSize; | ||
| 4024 | if (ZstdDecInfo_GET_NUM_FRAMES(&p->info) == 0) | ||
| 4025 | res = SZ_ERROR_NO_ARCHIVE; | ||
| 4026 | } | ||
| 4027 | else | ||
| 4028 | { | ||
| 4029 | stat->is_NonFinishedFrame = True; | ||
| 4030 | if (res == SZ_OK && p->status == ZSTD_STATUS_NEEDS_MORE_INPUT) | ||
| 4031 | res = SZ_ERROR_INPUT_EOF; | ||
| 4032 | } | ||
| 4033 | } | ||
| 4034 | stat->decode_SRes = res; | ||
| 4035 | } | ||
| 4036 | |||
| 4037 | |||
| 4038 | size_t ZstdDec_ReadUnusedFromInBuf( | ||
| 4039 | CZstdDecHandle dec, | ||
| 4040 | size_t afterDecoding_tempPos, | ||
| 4041 | void *data, size_t size) | ||
| 4042 | { | ||
| 4043 | size_t processed = 0; | ||
| 4044 | if (dec->frameState == ZSTD2_STATE_SIGNATURE) | ||
| 4045 | { | ||
| 4046 | Byte *dest = (Byte *)data; | ||
| 4047 | const size_t tempSize = dec->tempSize; | ||
| 4048 | while (afterDecoding_tempPos < tempSize) | ||
| 4049 | { | ||
| 4050 | if (size == 0) | ||
| 4051 | break; | ||
| 4052 | size--; | ||
| 4053 | *dest++ = dec->temp[afterDecoding_tempPos++]; | ||
| 4054 | processed++; | ||
| 4055 | } | ||
| 4056 | } | ||
| 4057 | return processed; | ||
| 4058 | } | ||
| 4059 | |||
| 4060 | |||
| 4061 | void ZstdDecState_Clear(CZstdDecState *p) | ||
| 4062 | { | ||
| 4063 | memset(p, 0 , sizeof(*p)); | ||
| 4064 | } | ||
diff --git a/C/ZstdDec.h b/C/ZstdDec.h new file mode 100644 index 0000000..cd26131 --- /dev/null +++ b/C/ZstdDec.h | |||
| @@ -0,0 +1,173 @@ | |||
| 1 | /* ZstdDec.h -- Zstd Decoder interfaces | ||
| 2 | 2024-01-21 : Igor Pavlov : Public domain */ | ||
| 3 | |||
| 4 | #ifndef ZIP7_INC_ZSTD_DEC_H | ||
| 5 | #define ZIP7_INC_ZSTD_DEC_H | ||
| 6 | |||
| 7 | EXTERN_C_BEGIN | ||
| 8 | |||
| 9 | typedef struct CZstdDec CZstdDec; | ||
| 10 | typedef CZstdDec * CZstdDecHandle; | ||
| 11 | |||
| 12 | CZstdDecHandle ZstdDec_Create(ISzAllocPtr alloc_Small, ISzAllocPtr alloc_Big); | ||
| 13 | void ZstdDec_Destroy(CZstdDecHandle p); | ||
| 14 | |||
| 15 | typedef enum | ||
| 16 | { | ||
| 17 | ZSTD_STATUS_NOT_SPECIFIED, /* use main error code instead */ | ||
| 18 | ZSTD_STATUS_FINISHED_FRAME, /* data frame or skip frame was finished */ | ||
| 19 | ZSTD_STATUS_NOT_FINISHED, /* just finished non-empty block or unfinished RAW/RLE block */ | ||
| 20 | ZSTD_STATUS_NEEDS_MORE_INPUT, /* the callee needs more input bytes. It has more priority over ZSTD_STATUS_NOT_FINISHED */ | ||
| 21 | ZSTD_STATUS_OUT_REACHED /* is not finihed frame and ((outProcessed > outSize) || (outProcessed == outSize && unfinished RAW/RLE block) */ | ||
| 22 | } enum_ZstdStatus_Dummy; | ||
| 23 | |||
| 24 | #define ZstdDecState_DOES_NEED_MORE_INPUT_OR_FINISHED_FRAME(p) \ | ||
| 25 | ((p)->status & ZSTD_STATUS_FINISHED_FRAME) | ||
| 26 | /* | ||
| 27 | ((p)->status == ZSTD_STATUS_NEEDS_MORE_INPUT || \ | ||
| 28 | (p)->status == ZSTD_STATUS_FINISHED_FRAME) | ||
| 29 | */ | ||
| 30 | |||
| 31 | typedef Byte enum_ZstdStatus; | ||
| 32 | |||
| 33 | |||
| 34 | void ZstdDec_Init(CZstdDecHandle p); | ||
| 35 | |||
| 36 | typedef struct | ||
| 37 | { | ||
| 38 | UInt64 num_Blocks; | ||
| 39 | Byte descriptor_OR; | ||
| 40 | Byte descriptor_NOT_OR; | ||
| 41 | Byte are_ContentSize_Unknown; | ||
| 42 | Byte windowDescriptor_MAX; | ||
| 43 | |||
| 44 | // Byte are_ContentSize_Known; | ||
| 45 | // Byte are_SingleSegments; | ||
| 46 | // Byte are_WindowDescriptors; | ||
| 47 | Byte checksum_Defined; | ||
| 48 | // Byte are_Checksums; | ||
| 49 | // Byte are_Non_Checksums; | ||
| 50 | |||
| 51 | // Byte are_DictionaryId; | ||
| 52 | Byte are_DictionaryId_Different; | ||
| 53 | |||
| 54 | // Byte reserved[3]; | ||
| 55 | |||
| 56 | UInt32 checksum; // checksum of last data frame | ||
| 57 | /// UInt32 dictionaryId_Cur; | ||
| 58 | UInt32 dictionaryId; // if there are non-zero dictionary IDs, then it's first dictionaryId | ||
| 59 | |||
| 60 | UInt64 num_DataFrames; | ||
| 61 | UInt64 num_SkipFrames; | ||
| 62 | UInt64 skipFrames_Size; | ||
| 63 | UInt64 contentSize_Total; | ||
| 64 | UInt64 contentSize_MAX; | ||
| 65 | // UInt64 num_Checksums; | ||
| 66 | // UInt64 num_Non_Checksums; // frames without checksum | ||
| 67 | // UInt64 num_WindowDescriptors; | ||
| 68 | // UInt64 num_SingleSegments; | ||
| 69 | // UInt64 num_Frames_with_ContentSize; | ||
| 70 | // UInt64 num_Frames_without_ContentSize; | ||
| 71 | UInt64 windowSize_MAX; | ||
| 72 | UInt64 windowSize_Allocate_MAX; | ||
| 73 | // UInt64 num_DictionaryIds; | ||
| 74 | // UInt64 num_Blocks_forType[4]; | ||
| 75 | // UInt64 num_BlockBytes_forType[4]; | ||
| 76 | // UInt64 num_SingleSegments; | ||
| 77 | // UInt64 singleSegment_ContentSize_MAX; | ||
| 78 | } CZstdDecInfo; | ||
| 79 | |||
| 80 | #define ZstdDecInfo_CLEAR(p) { memset(p, 0, sizeof(*(p))); } | ||
| 81 | |||
| 82 | #define ZstdDecInfo_GET_NUM_FRAMES(p) ((p)->num_DataFrames + (p)->num_SkipFrames) | ||
| 83 | |||
| 84 | |||
| 85 | typedef struct CZstdDecState | ||
| 86 | { | ||
| 87 | enum_ZstdStatus status; // out | ||
| 88 | Byte disableHash; | ||
| 89 | // Byte mustBeFinished; | ||
| 90 | Byte outSize_Defined; | ||
| 91 | // Byte isAfterSizeMode; | ||
| 92 | // UInt64 inProcessed; | ||
| 93 | // SRes codeRes; | ||
| 94 | // Byte needWrite_IsStrong; | ||
| 95 | |||
| 96 | const Byte *inBuf; | ||
| 97 | size_t inPos; // in/out | ||
| 98 | size_t inLim; | ||
| 99 | |||
| 100 | const Byte *win; // out | ||
| 101 | size_t winPos; // out | ||
| 102 | size_t wrPos; // in/out | ||
| 103 | // size_t cycSize; // out : if (!outBuf_fromCaller) | ||
| 104 | size_t needWrite_Size; // out | ||
| 105 | |||
| 106 | Byte *outBuf_fromCaller; | ||
| 107 | size_t outBufSize_fromCaller; | ||
| 108 | /* (outBufSize_fromCaller >= full_uncompressed_size_of_all_frames) is required | ||
| 109 | for success decoding. | ||
| 110 | If outBufSize_fromCaller < full_uncompressed_size_of_all_frames), | ||
| 111 | decoding can give error message, because we decode per block basis. | ||
| 112 | */ | ||
| 113 | |||
| 114 | // size_t outStep; | ||
| 115 | UInt64 outSize; // total in all frames | ||
| 116 | UInt64 outProcessed; // out decoded in all frames (it can be >= outSize) | ||
| 117 | |||
| 118 | CZstdDecInfo info; | ||
| 119 | } CZstdDecState; | ||
| 120 | |||
| 121 | void ZstdDecState_Clear(CZstdDecState *p); | ||
| 122 | |||
| 123 | /* | ||
| 124 | ZstdDec_Decode() | ||
| 125 | return: | ||
| 126 | SZ_OK - no error | ||
| 127 | SZ_ERROR_DATA - Data Error | ||
| 128 | SZ_ERROR_MEM - Memory allocation error | ||
| 129 | SZ_ERROR_UNSUPPORTED - Unsupported method or method properties | ||
| 130 | SZ_ERROR_CRC - XXH hash Error | ||
| 131 | // SZ_ERROR_ARCHIVE - Headers error (not used now) | ||
| 132 | */ | ||
| 133 | SRes ZstdDec_Decode(CZstdDecHandle dec, CZstdDecState *p); | ||
| 134 | |||
| 135 | /* | ||
| 136 | ZstdDec_ReadUnusedFromInBuf(): | ||
| 137 | returns: the number of bytes that were read from InBuf | ||
| 138 | (*afterDecoding_tempPos) must be set to zero before first call of ZstdDec_ReadUnusedFromInBuf() | ||
| 139 | */ | ||
| 140 | size_t ZstdDec_ReadUnusedFromInBuf( | ||
| 141 | CZstdDecHandle dec, | ||
| 142 | size_t afterDecoding_tempPos, // in/out | ||
| 143 | void *data, size_t size); | ||
| 144 | |||
| 145 | typedef struct | ||
| 146 | { | ||
| 147 | SRes decode_SRes; // error code of data decoding | ||
| 148 | Byte is_NonFinishedFrame; // there is unfinished decoding for data frame or skip frame | ||
| 149 | Byte extraSize; | ||
| 150 | } CZstdDecResInfo; | ||
| 151 | |||
| 152 | /* | ||
| 153 | #define ZstdDecResInfo_CLEAR(p) \ | ||
| 154 | { (p)->decode_SRes = 0; \ | ||
| 155 | (p)->is_NonFinishedFrame; \ | ||
| 156 | (p)->extraSize = 0; \ | ||
| 157 | } | ||
| 158 | // memset(p, 0, sizeof(*p)); | ||
| 159 | */ | ||
| 160 | |||
| 161 | /* | ||
| 162 | additional error codes for CZstdDecResInfo::decode_SRes: | ||
| 163 | SZ_ERROR_NO_ARCHIVE - is not zstd stream (no frames) | ||
| 164 | SZ_ERROR_INPUT_EOF - need more data in input stream | ||
| 165 | */ | ||
| 166 | void ZstdDec_GetResInfo(const CZstdDec *dec, | ||
| 167 | const CZstdDecState *p, | ||
| 168 | SRes res, // it's result from ZstdDec_Decode() | ||
| 169 | CZstdDecResInfo *info); | ||
| 170 | |||
| 171 | EXTERN_C_END | ||
| 172 | |||
| 173 | #endif | ||
diff --git a/C/var_clang_arm64.mak b/C/var_clang_arm64.mak index 4b35409..971101a 100644 --- a/C/var_clang_arm64.mak +++ b/C/var_clang_arm64.mak | |||
| @@ -6,6 +6,7 @@ IS_ARM64=1 | |||
| 6 | CROSS_COMPILE= | 6 | CROSS_COMPILE= |
| 7 | MY_ARCH= | 7 | MY_ARCH= |
| 8 | USE_ASM=1 | 8 | USE_ASM=1 |
| 9 | ASM_FLAGS=-Wno-unused-macros | ||
| 9 | CC=$(CROSS_COMPILE)clang | 10 | CC=$(CROSS_COMPILE)clang |
| 10 | CXX=$(CROSS_COMPILE)clang++ | 11 | CXX=$(CROSS_COMPILE)clang++ |
| 11 | USE_CLANG=1 | 12 | USE_CLANG=1 |
