From e5431fa6f5505e385c6f9367260717e9c47dc2ee Mon Sep 17 00:00:00 2001 From: Igor Pavlov <87184205+ip7z@users.noreply.github.com> Date: Fri, 29 Nov 2024 00:00:00 +0000 Subject: 24.09 --- C/7zDec.c | 5 +- C/7zVersion.h | 6 +- C/AesOpt.c | 233 +++++++++++++--------- C/CpuArch.c | 109 +++++++++-- C/CpuArch.h | 33 +++- C/LzmaEnc.c | 16 +- C/Md5.c | 206 ++++++++++++++++++++ C/Md5.h | 34 ++++ C/Sha1.c | 125 ++++-------- C/Sha1.h | 18 +- C/Sha1Opt.c | 146 +++++--------- C/Sha256.c | 162 +++++++-------- C/Sha256.h | 18 +- C/Sha256Opt.c | 172 ++++++++-------- C/Sha3.c | 359 ++++++++++++++++++++++++++++++++++ C/Sha3.h | 36 ++++ C/Sha512.c | 618 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ C/Sha512.h | 86 ++++++++ C/Sha512Opt.c | 395 +++++++++++++++++++++++++++++++++++++ 19 files changed, 2273 insertions(+), 504 deletions(-) create mode 100644 C/Md5.c create mode 100644 C/Md5.h create mode 100644 C/Sha3.c create mode 100644 C/Sha3.h create mode 100644 C/Sha512.c create mode 100644 C/Sha512.h create mode 100644 C/Sha512Opt.c (limited to 'C') diff --git a/C/7zDec.c b/C/7zDec.c index c9b4064..520cbfd 100644 --- a/C/7zDec.c +++ b/C/7zDec.c @@ -1,5 +1,5 @@ /* 7zDec.c -- Decoding from 7z folder -2024-03-01 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -312,8 +312,9 @@ static BoolInt IS_MAIN_METHOD(UInt32 m) case k_PPMD: #endif return True; + default: + return False; } - return False; } static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c) diff --git a/C/7zVersion.h b/C/7zVersion.h index 1ddef80..e82ba0b 100644 --- a/C/7zVersion.h +++ b/C/7zVersion.h @@ -1,7 +1,7 @@ #define MY_VER_MAJOR 24 -#define MY_VER_MINOR 8 +#define MY_VER_MINOR 9 #define MY_VER_BUILD 0 -#define MY_VERSION_NUMBERS "24.08" +#define MY_VERSION_NUMBERS "24.09" #define MY_VERSION MY_VERSION_NUMBERS #ifdef MY_CPU_NAME @@ -10,7 +10,7 @@ #define MY_VERSION_CPU MY_VERSION #endif -#define MY_DATE "2024-08-11" +#define MY_DATE "2024-11-29" #undef MY_COPYRIGHT #undef MY_VERSION_COPYRIGHT_DATE #define MY_AUTHOR_NAME "Igor Pavlov" diff --git a/C/AesOpt.c b/C/AesOpt.c index 58769ea..b281807 100644 --- a/C/AesOpt.c +++ b/C/AesOpt.c @@ -1,5 +1,5 @@ /* AesOpt.c -- AES optimized code for x86 AES hardware instructions -2024-03-01 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #include "Precomp.h" @@ -80,19 +80,39 @@ AES_FUNC_START (name) #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) +#if 1 +// use aligned SSE load/store for data. +// It is required for our Aes functions, that data is aligned for 16-bytes. +// So we can use this branch of code. +// and compiler can use fused load-op SSE instructions: +// xorps xmm0, XMMWORD PTR [rdx] +#define LOAD_128(pp) (*(__m128i *)(void *)(pp)) +#define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v +// use aligned SSE load/store for data. 
Alternative code with direct access +// #define LOAD_128(pp) _mm_load_si128(pp) +// #define STORE_128(pp, _v) _mm_store_si128(pp, _v) +#else +// use unaligned load/store for data: movdqu XMMWORD PTR [rdx] +#define LOAD_128(pp) _mm_loadu_si128(pp) +#define STORE_128(pp, _v) _mm_storeu_si128(pp, _v) +#endif + AES_FUNC_START2 (AesCbc_Encode_HW) { + if (numBlocks == 0) + return; + { __m128i *p = (__m128i *)(void *)ivAes; __m128i *data = (__m128i *)(void *)data8; __m128i m = *p; const __m128i k0 = p[2]; const __m128i k1 = p[3]; const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1; - for (; numBlocks != 0; numBlocks--, data++) + do { UInt32 r = numRounds2; const __m128i *w = p + 4; - __m128i temp = *data; + __m128i temp = LOAD_128(data); MM_XOR (temp, k0) MM_XOR (m, temp) MM_OP_m (_mm_aesenc_si128, k1) @@ -104,9 +124,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW) } while (--r); MM_OP_m (_mm_aesenclast_si128, w[0]) - *data = m; + STORE_128(data, m); + data++; } + while (--numBlocks); *p = m; + } } @@ -139,12 +162,12 @@ AES_FUNC_START2 (AesCbc_Encode_HW) #define WOP(op) op (m0, 0) WOP_M1(op) - #define DECLARE_VAR(reg, ii) __m128i reg; -#define LOAD_data( reg, ii) reg = data[ii]; -#define STORE_data( reg, ii) data[ii] = reg; +#define LOAD_data_ii(ii) LOAD_128(data + (ii)) +#define LOAD_data( reg, ii) reg = LOAD_data_ii(ii); +#define STORE_data( reg, ii) STORE_128(data + (ii), reg); #if (NUM_WAYS > 1) -#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) +#define XOR_data_M1(reg, ii) MM_XOR (reg, LOAD_128(data + (ii- 1))) #endif #define MM_OP_key(op, reg) MM_OP(op, reg, key); @@ -156,25 +179,22 @@ AES_FUNC_START2 (AesCbc_Encode_HW) #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; -#define CTR_END( reg, ii) MM_XOR (data[ii], reg) - +#define CTR_END( reg, ii) STORE_128(data + (ii), _mm_xor_si128(reg, \ + LOAD_128 (data + (ii)))); #define WOP_KEY(op, n) { \ const __m128i key = w[n]; \ - WOP(op); } - + WOP(op) } #define WIDE_LOOP_START \ dataEnd = data + numBlocks; \ if (numBlocks >= NUM_WAYS) \ { dataEnd -= NUM_WAYS; do { \ - #define WIDE_LOOP_END \ data += NUM_WAYS; \ } while (data <= dataEnd); \ dataEnd += NUM_WAYS; } \ - #define SINGLE_LOOP \ for (; data < dataEnd; data++) @@ -184,54 +204,73 @@ AES_FUNC_START2 (AesCbc_Encode_HW) #define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) #define AVX_DECLARE_VAR(reg, ii) __m256i reg; -#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; -#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; + +#if 1 +// use unaligned AVX load/store for data. +// It is required for our Aes functions, that data is aligned for 16-bytes. +// But we need 32-bytes reading. +// So we use intrinsics for unaligned AVX load/store. +// notes for _mm256_storeu_si256: +// msvc2022: uses vmovdqu and keeps the order of instruction sequence. +// new gcc11 uses vmovdqu +// old gcc9 could use pair of instructions: +// vmovups %xmm7, -224(%rax) +// vextracti128 $0x1, %ymm7, -208(%rax) +#define AVX_LOAD(p) _mm256_loadu_si256((const __m256i *)(const void *)(p)) +#define AVX_STORE(p, _v) _mm256_storeu_si256((__m256i *)(void *)(p), _v); +#else +// use aligned AVX load/store for data. +// for debug: we can use this branch, if we are sure that data is aligned for 32-bytes. 
+// msvc2022 uses vmovdqu still +// gcc uses vmovdqa (that requires 32-bytes alignment) +#define AVX_LOAD(p) (*(const __m256i *)(const void *)(p)) +#define AVX_STORE(p, _v) (*(__m256i *)(void *)(p)) = _v; +#endif + +#define AVX_LOAD_data( reg, ii) reg = AVX_LOAD((const __m256i *)(const void *)data + (ii)); +#define AVX_STORE_data( reg, ii) AVX_STORE((__m256i *)(void *)data + (ii), reg) /* -AVX_XOR_data_M1() needs unaligned memory load -if (we don't use _mm256_loadu_si256() here) -{ - Most compilers with enabled optimizations generate fused AVX (LOAD + OP) - instruction that can load unaligned data. - But GCC and CLANG without -O2 or -O1 optimizations can generate separated - LOAD-ALIGNED (vmovdqa) instruction that will fail on execution. -} -Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here. -v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler. +AVX_XOR_data_M1() needs unaligned memory load, even if (data) +is aligned for 256-bits, because we read 32-bytes chunk that +crosses (data) position: from (data - 16bytes) to (data + 16bytes). */ -#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii]))) -// for debug only: the following code will fail on execution, if compiled by some compilers: -// #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) +#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii))) #define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) #define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) #define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) #define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) #define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) -#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); -#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) +#define AVX_CTR_START(reg, ii) \ + MM_OP (_mm256_add_epi64, ctr2, two) \ + reg = _mm256_xor_si256(ctr2, key); + +#define AVX_CTR_END(reg, ii) \ + AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \ + AVX_LOAD ((__m256i *)(void *)data + (ii)))); + #define AVX_WOP_KEY(op, n) { \ const __m256i key = w[n]; \ - WOP(op); } + WOP(op) } #define NUM_AES_KEYS_MAX 15 #define WIDE_LOOP_START_AVX(OP) \ dataEnd = data + numBlocks; \ if (numBlocks >= NUM_WAYS * 2) \ - { __m256i keys[NUM_AES_KEYS_MAX]; \ - UInt32 ii; \ - OP \ - for (ii = 0; ii < numRounds; ii++) \ - keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \ - dataEnd -= NUM_WAYS * 2; do { \ - + { __m256i keys[NUM_AES_KEYS_MAX]; \ + OP \ + { UInt32 ii; for (ii = 0; ii < numRounds; ii++) \ + keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \ + dataEnd -= NUM_WAYS * 2; \ + do { \ #define WIDE_LOOP_END_AVX(OP) \ - data += NUM_WAYS * 2; \ - } while (data <= dataEnd); \ - dataEnd += NUM_WAYS * 2; \ - OP \ - _mm256_zeroupper(); \ + data += NUM_WAYS * 2; \ + } while (data <= dataEnd); \ + dataEnd += NUM_WAYS * 2; \ + OP \ + _mm256_zeroupper(); \ } \ /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, @@ -246,21 +285,20 @@ AES_FUNC_START2 (AesCbc_Decode_HW) __m128i *p = (__m128i *)(void *)ivAes; __m128i *data = (__m128i *)(void *)data8; __m128i iv = *p; - const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1; + const __m128i * const wStart = p 
+ (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1; const __m128i *dataEnd; p += 2; WIDE_LOOP_START { const __m128i *w = wStart; - WOP (DECLARE_VAR) WOP (LOAD_data) WOP_KEY (AES_XOR, 1) - do { WOP_KEY (AES_DEC, 0) + w--; } while (w != p); @@ -268,7 +306,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) MM_XOR (m0, iv) WOP_M1 (XOR_data_M1) - iv = data[NUM_WAYS - 1]; + LOAD_data(iv, NUM_WAYS - 1) WOP (STORE_data) } WIDE_LOOP_END @@ -276,7 +314,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) SINGLE_LOOP { const __m128i *w = wStart - 1; - __m128i m = _mm_xor_si128 (w[2], *data); + __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); + do { MM_OP_m (_mm_aesdec_si128, w[1]) @@ -286,10 +325,9 @@ AES_FUNC_START2 (AesCbc_Decode_HW) while (w != p); MM_OP_m (_mm_aesdec_si128, w[1]) MM_OP_m (_mm_aesdeclast_si128, w[0]) - MM_XOR (m, iv) - iv = *data; - *data = m; + LOAD_data(iv, 0) + STORE_data(m, 0) } p[-2] = iv; @@ -301,9 +339,9 @@ AES_FUNC_START2 (AesCtr_Code_HW) __m128i *p = (__m128i *)(void *)ivAes; __m128i *data = (__m128i *)(void *)data8; __m128i ctr = *p; - UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; + const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1; const __m128i *dataEnd; - __m128i one = _mm_cvtsi32_si128(1); + const __m128i one = _mm_cvtsi32_si128(1); p += 2; @@ -322,7 +360,6 @@ AES_FUNC_START2 (AesCtr_Code_HW) } while (--r); WOP_KEY (AES_ENC_LAST, 0) - WOP (CTR_END) } WIDE_LOOP_END @@ -344,7 +381,7 @@ AES_FUNC_START2 (AesCtr_Code_HW) while (--numRounds2); MM_OP_m (_mm_aesenc_si128, w[0]) MM_OP_m (_mm_aesenclast_si128, w[1]) - MM_XOR (*data, m) + CTR_END (m, 0) } p[-2] = ctr; @@ -421,7 +458,7 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) __m128i *data = (__m128i *)(void *)data8; __m128i iv = *p; const __m128i *dataEnd; - UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; + const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; p += 2; WIDE_LOOP_START_AVX(;) @@ -440,17 +477,17 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) while (w != keys); AVX_WOP_KEY (AVX_AES_DEC_LAST, 0) - AVX_XOR (m0, _mm256_setr_m128i(iv, data[0])) + AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0))) WOP_M1 (AVX_XOR_data_M1) - iv = data[NUM_WAYS * 2 - 1]; + LOAD_data (iv, NUM_WAYS * 2 - 1) WOP (AVX_STORE_data) } WIDE_LOOP_END_AVX(;) SINGLE_LOOP { - const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3; - __m128i m = _mm_xor_si128 (w[2], *data); + const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2; + __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0)); do { MM_OP_m (_mm_aesdec_si128, w[1]) @@ -462,8 +499,8 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256) MM_OP_m (_mm_aesdeclast_si128, w[0]) MM_XOR (m, iv) - iv = *data; - *data = m; + LOAD_data(iv, 0) + STORE_data(m, 0) } p[-2] = iv; @@ -493,9 +530,9 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) __m128i *p = (__m128i *)(void *)ivAes; __m128i *data = (__m128i *)(void *)data8; __m128i ctr = *p; - UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; + const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1; const __m128i *dataEnd; - __m128i one = _mm_cvtsi32_si128(1); + const __m128i one = _mm_cvtsi32_si128(1); __m256i ctr2, two; p += 2; @@ -536,7 +573,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) while (--numRounds2); MM_OP_m (_mm_aesenc_si128, w[0]) MM_OP_m (_mm_aesenclast_si128, w[1]) - MM_XOR (*data, m) + CTR_END (m, 0) } p[-2] = ctr; @@ -731,9 +768,14 @@ AES_FUNC_START (name) AES_FUNC_START2 (AesCbc_Encode_HW) { - v128 * const p = (v128*)(void*)ivAes; - v128 *data = (v128*)(void*)data8; + if (numBlocks == 0) + return; + { + v128 * const p = (v128 
*)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; v128 m = *p; + const UInt32 numRounds2 = *(const UInt32 *)(p + 1); + const v128 *w = p + (size_t)numRounds2 * 2; const v128 k0 = p[2]; const v128 k1 = p[3]; const v128 k2 = p[4]; @@ -744,11 +786,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW) const v128 k7 = p[9]; const v128 k8 = p[10]; const v128 k9 = p[11]; - const UInt32 numRounds2 = *(const UInt32 *)(p + 1); - const v128 *w = p + ((size_t)numRounds2 * 2); + const v128 k_z4 = w[-2]; + const v128 k_z3 = w[-1]; + const v128 k_z2 = w[0]; const v128 k_z1 = w[1]; const v128 k_z0 = w[2]; - for (; numBlocks != 0; numBlocks--, data++) + // we don't use optimization veorq_u8(*data, k_z0) that can reduce one cycle, + // because gcc/clang compilers are not good for that optimization. + do { MM_XOR_m (*data) AES_E_MC_m (k0) @@ -757,24 +802,26 @@ AES_FUNC_START2 (AesCbc_Encode_HW) AES_E_MC_m (k3) AES_E_MC_m (k4) AES_E_MC_m (k5) - AES_E_MC_m (k6) - AES_E_MC_m (k7) - AES_E_MC_m (k8) if (numRounds2 >= 6) { - AES_E_MC_m (k9) - AES_E_MC_m (p[12]) + AES_E_MC_m (k6) + AES_E_MC_m (k7) if (numRounds2 != 6) { - AES_E_MC_m (p[13]) - AES_E_MC_m (p[14]) + AES_E_MC_m (k8) + AES_E_MC_m (k9) } } - AES_E_m (k_z1) - MM_XOR_m (k_z0) - *data = m; + AES_E_MC_m (k_z4) + AES_E_MC_m (k_z3) + AES_E_MC_m (k_z2) + AES_E_m (k_z1) + MM_XOR_m (k_z0) + *data++ = m; } + while (--numBlocks); *p = m; + } } @@ -834,10 +881,10 @@ AES_FUNC_START2 (AesCbc_Encode_HW) AES_FUNC_START2 (AesCbc_Decode_HW) { - v128 *p = (v128*)(void*)ivAes; - v128 *data = (v128*)(void*)data8; + v128 *p = (v128 *)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; v128 iv = *p; - const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; + const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2; const v128 *dataEnd; p += 2; @@ -858,7 +905,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) WOP_KEY (AES_XOR, 0) MM_XOR (m0, iv) WOP_M1 (XOR_data_M1) - iv = data[NUM_WAYS - 1]; + LOAD_data(iv, NUM_WAYS - 1) WOP (STORE_data) } WIDE_LOOP_END @@ -866,7 +913,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) SINGLE_LOOP { const v128 *w = wStart; - v128 m = *data; + v128 m; LOAD_data(m, 0) AES_D_IMC_m (w[2]) do { @@ -878,8 +925,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW) AES_D_m (w[1]) MM_XOR_m (w[0]) MM_XOR_m (iv) - iv = *data; - *data = m; + LOAD_data(iv, 0) + STORE_data(m, 0) } p[-2] = iv; @@ -888,19 +935,17 @@ AES_FUNC_START2 (AesCbc_Decode_HW) AES_FUNC_START2 (AesCtr_Code_HW) { - v128 *p = (v128*)(void*)ivAes; - v128 *data = (v128*)(void*)data8; + v128 *p = (v128 *)(void *)ivAes; + v128 *data = (v128 *)(void *)data8; uint64x2_t ctr = vreinterpretq_u64_u8(*p); - const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; + const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2; const v128 *dataEnd; - uint64x2_t one = vdupq_n_u64(0); - // the bug in clang: // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); #if defined(__clang__) && (__clang_major__ <= 9) #pragma GCC diagnostic ignored "-Wvector-conversion" #endif - one = vsetq_lane_u64(1, one, 0); + const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0); p += 2; WIDE_LOOP_START diff --git a/C/CpuArch.c b/C/CpuArch.c index e792f39..6e02551 100644 --- a/C/CpuArch.c +++ b/C/CpuArch.c @@ -1,5 +1,5 @@ /* CpuArch.c -- CPU specific code -2024-07-04 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #include "Precomp.h" @@ -17,7 +17,7 @@ /* cpuid instruction supports (subFunction) parameter in ECX, that is used only with some specific (function) parameter values. 
- But we always use only (subFunction==0). + most functions use only (subFunction==0). */ /* __cpuid(): MSVC and GCC/CLANG use same function/macro name @@ -49,43 +49,49 @@ #if defined(MY_CPU_AMD64) && defined(__PIC__) \ && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) -#define x86_cpuid_MACRO(p, func) { \ + /* "=&r" selects free register. It can select even rbx, if that register is free. + "=&D" for (RDI) also works, but the code can be larger with "=&D" + "2"(subFun) : 2 is (zero-based) index in the output constraint list "=c" (ECX). */ + +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ __asm__ __volatile__ ( \ ASM_LN "mov %%rbx, %q1" \ ASM_LN "cpuid" \ ASM_LN "xchg %%rbx, %q1" \ - : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } - - /* "=&r" selects free register. It can select even rbx, if that register is free. - "=&D" for (RDI) also works, but the code can be larger with "=&D" - "2"(0) means (subFunction = 0), - 2 is (zero-based) index in the output constraint list "=c" (ECX). */ + : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } #elif defined(MY_CPU_X86) && defined(__PIC__) \ && ((defined (__GNUC__) && (__GNUC__ < 5)) || defined(__clang__)) -#define x86_cpuid_MACRO(p, func) { \ +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ __asm__ __volatile__ ( \ ASM_LN "mov %%ebx, %k1" \ ASM_LN "cpuid" \ ASM_LN "xchg %%ebx, %k1" \ - : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } + : "=a" ((p)[0]), "=&r" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } #else -#define x86_cpuid_MACRO(p, func) { \ +#define x86_cpuid_MACRO_2(p, func, subFunc) { \ __asm__ __volatile__ ( \ ASM_LN "cpuid" \ - : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(0)); } + : "=a" ((p)[0]), "=b" ((p)[1]), "=c" ((p)[2]), "=d" ((p)[3]) : "0" (func), "2"(subFunc)); } #endif +#define x86_cpuid_MACRO(p, func) x86_cpuid_MACRO_2(p, func, 0) void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) { x86_cpuid_MACRO(p, func) } +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + x86_cpuid_MACRO_2(p, func, subFunc) +} + Z7_NO_INLINE UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) @@ -205,11 +211,39 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) __asm ret 0 } +static +void __declspec(naked) Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + UNUSED_VAR(p) + UNUSED_VAR(func) + UNUSED_VAR(subFunc) + __asm push ebx + __asm push edi + __asm mov edi, ecx // p + __asm mov eax, edx // func + __asm mov ecx, [esp + 12] // subFunc + __asm cpuid + __asm mov [edi ], eax + __asm mov [edi + 4], ebx + __asm mov [edi + 8], ecx + __asm mov [edi + 12], edx + __asm pop edi + __asm pop ebx + __asm ret 4 +} + #else // MY_CPU_AMD64 #if _MSC_VER >= 1600 #include #define MY_cpuidex __cpuidex + +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + __cpuidex((int *)p, func, subFunc); +} + #else /* __cpuid (func == (0 or 7)) requires subfunction number in ECX. @@ -219,7 +253,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) We still can use __cpuid for low (func) values that don't require ECX, but __cpuid() in old MSVC will be incorrect for some func values: (func == 7). 
So here we use the hack for old MSVC to send (subFunction) in ECX register to cpuid instruction, - where ECX value is first parameter for FASTCALL / NO_INLINE func, + where ECX value is first parameter for FASTCALL / NO_INLINE func. So the caller of MY_cpuidex_HACK() sets ECX as subFunction, and old MSVC for __cpuid() doesn't change ECX and cpuid instruction gets (subFunction) value. @@ -233,6 +267,11 @@ Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int } #define MY_cpuidex(info, func, func2) MY_cpuidex_HACK(func2, func, info) #pragma message("======== MY_cpuidex_HACK WAS USED ========") +static +void Z7_FASTCALL z7_x86_cpuid_subFunc(UInt32 p[4], UInt32 func, UInt32 subFunc) +{ + MY_cpuidex_HACK(subFunc, func, (Int32 *)p); +} #endif // _MSC_VER >= 1600 #if !defined(MY_CPU_AMD64) @@ -445,6 +484,23 @@ BoolInt CPU_IsSupported_SHA(void) } } + +BoolInt CPU_IsSupported_SHA512(void) +{ + if (!CPU_IsSupported_AVX2()) return False; // maybe CPU_IsSupported_AVX() is enough here + + if (z7_x86_cpuid_GetMaxFunc() < 7) + return False; + { + UInt32 d[4]; + z7_x86_cpuid_subFunc(d, 7, 0); + if (d[0] < 1) // d[0] - is max supported subleaf value + return False; + z7_x86_cpuid_subFunc(d, 7, 1); + return (BoolInt)(d[0]) & 1; + } +} + /* MSVC: _xgetbv() intrinsic is available since VS2010SP1. MSVC also defines (_XCR_XFEATURE_ENABLED_MASK) macro in @@ -776,6 +832,18 @@ BoolInt CPU_IsSupported_NEON(void) return z7_sysctlbyname_Get_BoolInt("hw.optional.neon"); } +BoolInt CPU_IsSupported_SHA512(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha512"); +} + +/* +BoolInt CPU_IsSupported_SHA3(void) +{ + return z7_sysctlbyname_Get_BoolInt("hw.optional.armv8_2_sha3"); +} +*/ + #ifdef MY_CPU_ARM64 #define APPLE_CRYPTO_SUPPORT_VAL 1 #else @@ -860,6 +928,19 @@ MY_HWCAP_CHECK_FUNC (CRC32) MY_HWCAP_CHECK_FUNC (SHA1) MY_HWCAP_CHECK_FUNC (SHA2) MY_HWCAP_CHECK_FUNC (AES) +#ifdef MY_CPU_ARM64 +// supports HWCAP_SHA512 and HWCAP_SHA3 since 2017. 
+// we define them here, if they are not defined +#ifndef HWCAP_SHA3 +// #define HWCAP_SHA3 (1 << 17) +#endif +#ifndef HWCAP_SHA512 +// #pragma message("=== HWCAP_SHA512 define === ") +#define HWCAP_SHA512 (1 << 21) +#endif +MY_HWCAP_CHECK_FUNC (SHA512) +// MY_HWCAP_CHECK_FUNC (SHA3) +#endif #endif // __APPLE__ #endif // _WIN32 diff --git a/C/CpuArch.h b/C/CpuArch.h index 683cfaa..a6297ea 100644 --- a/C/CpuArch.h +++ b/C/CpuArch.h @@ -1,5 +1,5 @@ /* CpuArch.h -- CPU specific code -2024-06-17 : Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #ifndef ZIP7_INC_CPU_ARCH_H #define ZIP7_INC_CPU_ARCH_H @@ -509,11 +509,19 @@ problem-4 : performace: #if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) +#if 0 +// Z7_BSWAP16 can be slow for x86-msvc +#define GetBe16_to32(p) (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p))) +#else +#define GetBe16_to32(p) (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16) +#endif + #define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p)) #define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); } #if defined(MY_CPU_LE_UNALIGN_64) #define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p)) +#define SetBe64(p, v) { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); } #endif #else @@ -536,11 +544,27 @@ problem-4 : performace: #define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4)) #endif +#ifndef SetBe64 +#define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \ + _ppp_[0] = (Byte)(_vvv_ >> 56); \ + _ppp_[1] = (Byte)(_vvv_ >> 48); \ + _ppp_[2] = (Byte)(_vvv_ >> 40); \ + _ppp_[3] = (Byte)(_vvv_ >> 32); \ + _ppp_[4] = (Byte)(_vvv_ >> 24); \ + _ppp_[5] = (Byte)(_vvv_ >> 16); \ + _ppp_[6] = (Byte)(_vvv_ >> 8); \ + _ppp_[7] = (Byte)_vvv_; } +#endif + #ifndef GetBe16 +#ifdef GetBe16_to32 +#define GetBe16(p) ( (UInt16) GetBe16_to32(p)) +#else #define GetBe16(p) ( (UInt16) ( \ ((UInt16)((const Byte *)(p))[0] << 8) | \ ((const Byte *)(p))[1] )) #endif +#endif #if defined(MY_CPU_BE) @@ -589,6 +613,11 @@ problem-4 : performace: #endif +#ifndef GetBe16_to32 +#define GetBe16_to32(p) GetBe16(p) +#endif + + #if defined(MY_CPU_X86_OR_AMD64) \ || defined(MY_CPU_ARM_OR_ARM64) \ || defined(MY_CPU_PPC_OR_PPC64) @@ -617,6 +646,7 @@ BoolInt CPU_IsSupported_SSE2(void); BoolInt CPU_IsSupported_SSSE3(void); BoolInt CPU_IsSupported_SSE41(void); BoolInt CPU_IsSupported_SHA(void); +BoolInt CPU_IsSupported_SHA512(void); BoolInt CPU_IsSupported_PageGB(void); #elif defined(MY_CPU_ARM_OR_ARM64) @@ -634,6 +664,7 @@ BoolInt CPU_IsSupported_SHA1(void); BoolInt CPU_IsSupported_SHA2(void); BoolInt CPU_IsSupported_AES(void); #endif +BoolInt CPU_IsSupported_SHA512(void); #endif diff --git a/C/LzmaEnc.c b/C/LzmaEnc.c index 37b2787..088b78f 100644 --- a/C/LzmaEnc.c +++ b/C/LzmaEnc.c @@ -1,5 +1,5 @@ /* LzmaEnc.c -- LZMA Encoder -2024-01-24: Igor Pavlov : Public domain */ +Igor Pavlov : Public domain */ #include "Precomp.h" @@ -72,11 +72,11 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) p->level = level; if (p->dictSize == 0) - p->dictSize = - ( level <= 3 ? ((UInt32)1 << (level * 2 + 16)) : - ( level <= 6 ? ((UInt32)1 << (level + 19)) : - ( level <= 7 ? ((UInt32)1 << 25) : ((UInt32)1 << 26) - ))); + p->dictSize = (unsigned)level <= 4 ? + (UInt32)1 << (level * 2 + 16) : + (unsigned)level <= sizeof(size_t) / 2 + 4 ? 
+ (UInt32)1 << (level + 20) : + (UInt32)1 << (sizeof(size_t) / 2 + 24); if (p->dictSize > p->reduceSize) { @@ -92,8 +92,8 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) if (p->lp < 0) p->lp = 0; if (p->pb < 0) p->pb = 2; - if (p->algo < 0) p->algo = (level < 5 ? 0 : 1); - if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); + if (p->algo < 0) p->algo = (unsigned)level < 5 ? 0 : 1; + if (p->fb < 0) p->fb = (unsigned)level < 7 ? 32 : 64; if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); if (p->numHashBytes < 0) p->numHashBytes = (p->btMode ? 4 : 5); if (p->mc == 0) p->mc = (16 + ((unsigned)p->fb >> 1)) >> (p->btMode ? 0 : 1); diff --git a/C/Md5.c b/C/Md5.c new file mode 100644 index 0000000..1b745d7 --- /dev/null +++ b/C/Md5.c @@ -0,0 +1,206 @@ +/* Md5.c -- MD5 Hash +: Igor Pavlov : Public domain +This code is based on Colin Plumb's public domain md5.c code */ + +#include "Precomp.h" + +#include + +#include "Md5.h" +#include "RotateDefs.h" +#include "CpuArch.h" + +#define MD5_UPDATE_BLOCKS(p) Md5_UpdateBlocks + +Z7_NO_INLINE +void Md5_Init(CMd5 *p) +{ + p->count = 0; + p->state[0] = 0x67452301; + p->state[1] = 0xefcdab89; + p->state[2] = 0x98badcfe; + p->state[3] = 0x10325476; +} + +#if 0 && !defined(MY_CPU_LE_UNALIGN) +// optional optimization for Big-endian processors or processors without unaligned access: +// it is intended to reduce the number of complex LE32 memory reading from 64 to 16. +// But some compilers (sparc, armt) are better without this optimization. +#define Z7_MD5_USE_DATA32_ARRAY +#endif + +#define LOAD_DATA(i) GetUi32((const UInt32 *)(const void *)data + (i)) + +#ifdef Z7_MD5_USE_DATA32_ARRAY +#define D(i) data32[i] +#else +#define D(i) LOAD_DATA(i) +#endif + +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +#define R1(i, f, start, step, w, x, y, z, s, k) \ + w += D((start + step * (i)) % 16) + k; \ + w += f(x, y, z); \ + w = rotlFixed(w, s) + x; \ + +#define R4(i4, f, start, step, s0,s1,s2,s3, k0,k1,k2,k3) \ + R1 (i4*4+0, f, start, step, a,b,c,d, s0, k0) \ + R1 (i4*4+1, f, start, step, d,a,b,c, s1, k1) \ + R1 (i4*4+2, f, start, step, c,d,a,b, s2, k2) \ + R1 (i4*4+3, f, start, step, b,c,d,a, s3, k3) \ + +#define R16(f, start, step, s0,s1,s2,s3, k00,k01,k02,k03, k10,k11,k12,k13, k20,k21,k22,k23, k30,k31,k32,k33) \ + R4 (0, f, start, step, s0,s1,s2,s3, k00,k01,k02,k03) \ + R4 (1, f, start, step, s0,s1,s2,s3, k10,k11,k12,k13) \ + R4 (2, f, start, step, s0,s1,s2,s3, k20,k21,k22,k23) \ + R4 (3, f, start, step, s0,s1,s2,s3, k30,k31,k32,k33) \ + +static +Z7_NO_INLINE +void Z7_FASTCALL Md5_UpdateBlocks(UInt32 state[4], const Byte *data, size_t numBlocks) +{ + UInt32 a, b, c, d; + // if (numBlocks == 0) return; + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + do + { +#ifdef Z7_MD5_USE_DATA32_ARRAY + UInt32 data32[MD5_NUM_BLOCK_WORDS]; + { +#define LOAD_data32_x4(i) { \ + data32[i ] = LOAD_DATA(i ); \ + data32[i + 1] = LOAD_DATA(i + 1); \ + data32[i + 2] = LOAD_DATA(i + 2); \ + data32[i + 3] = LOAD_DATA(i + 3); } +#if 1 + LOAD_data32_x4 (0 * 4) + LOAD_data32_x4 (1 * 4) + LOAD_data32_x4 (2 * 4) + LOAD_data32_x4 (3 * 4) +#else + unsigned i; + for (i = 0; i < MD5_NUM_BLOCK_WORDS; i += 4) + { + LOAD_data32_x4(i) + } +#endif + } +#endif + + R16 (F1, 0, 1, 7,12,17,22, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, + 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, + 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, + 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821) + R16 (F2, 1, 
5, 5, 9,14,20, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, + 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, + 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a) + R16 (F3, 5, 3, 4,11,16,23, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, + 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, + 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, + 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665) + R16 (F4, 0, 7, 6,10,15,21, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, + 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, + 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, + 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391) + + a += state[0]; + b += state[1]; + c += state[2]; + d += state[3]; + + state[0] = a; + state[1] = b; + state[2] = c; + state[3] = d; + + data += MD5_BLOCK_SIZE; + } + while (--numBlocks); +} + + +#define Md5_UpdateBlock(p) MD5_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) + +void Md5_Update(CMd5 *p, const Byte *data, size_t size) +{ + if (size == 0) + return; + { + const unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1); + const unsigned num = MD5_BLOCK_SIZE - pos; + p->count += size; + if (num > size) + { + memcpy(p->buffer + pos, data, size); + return; + } + if (pos != 0) + { + size -= num; + memcpy(p->buffer + pos, data, num); + data += num; + Md5_UpdateBlock(p); + } + } + { + const size_t numBlocks = size >> 6; + if (numBlocks) + MD5_UPDATE_BLOCKS(p)(p->state, data, numBlocks); + size &= MD5_BLOCK_SIZE - 1; + if (size == 0) + return; + data += (numBlocks << 6); + memcpy(p->buffer, data, size); + } +} + + +void Md5_Final(CMd5 *p, Byte *digest) +{ + unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1); + p->buffer[pos++] = 0x80; + if (pos > (MD5_BLOCK_SIZE - 4 * 2)) + { + while (pos != MD5_BLOCK_SIZE) { p->buffer[pos++] = 0; } + // memset(&p->buf.buffer[pos], 0, MD5_BLOCK_SIZE - pos); + Md5_UpdateBlock(p); + pos = 0; + } + memset(&p->buffer[pos], 0, (MD5_BLOCK_SIZE - 4 * 2) - pos); + { + const UInt64 numBits = p->count << 3; +#if defined(MY_CPU_LE_UNALIGN) + SetUi64 (p->buffer + MD5_BLOCK_SIZE - 4 * 2, numBits) +#else + SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 2, (UInt32)(numBits)) + SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 1, (UInt32)(numBits >> 32)) +#endif + } + Md5_UpdateBlock(p); + + SetUi32(digest, p->state[0]) + SetUi32(digest + 4, p->state[1]) + SetUi32(digest + 8, p->state[2]) + SetUi32(digest + 12, p->state[3]) + + Md5_Init(p); +} + +#undef R1 +#undef R4 +#undef R16 +#undef D +#undef LOAD_DATA +#undef LOAD_data32_x4 +#undef F1 +#undef F2 +#undef F3 +#undef F4 diff --git a/C/Md5.h b/C/Md5.h new file mode 100644 index 0000000..49c0741 --- /dev/null +++ b/C/Md5.h @@ -0,0 +1,34 @@ +/* Md5.h -- MD5 Hash +: Igor Pavlov : Public domain */ + +#ifndef ZIP7_INC_MD5_H +#define ZIP7_INC_MD5_H + +#include "7zTypes.h" + +EXTERN_C_BEGIN + +#define MD5_NUM_BLOCK_WORDS 16 +#define MD5_NUM_DIGEST_WORDS 4 + +#define MD5_BLOCK_SIZE (MD5_NUM_BLOCK_WORDS * 4) +#define MD5_DIGEST_SIZE (MD5_NUM_DIGEST_WORDS * 4) + +typedef struct +{ + UInt64 count; + UInt64 _pad_1; + // we want 16-bytes alignment here + UInt32 state[MD5_NUM_DIGEST_WORDS]; + UInt64 _pad_2[4]; + // we want 64-bytes alignment here + Byte buffer[MD5_BLOCK_SIZE]; +} CMd5; + +void Md5_Init(CMd5 *p); +void Md5_Update(CMd5 *p, const Byte *data, size_t size); +void Md5_Final(CMd5 *p, Byte *digest); + +EXTERN_C_END + +#endif diff --git a/C/Sha1.c b/C/Sha1.c index 4c92892..4ca21d7 100644 --- a/C/Sha1.c +++ b/C/Sha1.c @@ -1,18 +1,14 @@ /* Sha1.c -- SHA-1 Hash -2024-03-01 : Igor 
Pavlov : Public domain +: Igor Pavlov : Public domain This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */ #include "Precomp.h" #include -#include "CpuArch.h" -#include "RotateDefs.h" #include "Sha1.h" - -#if defined(_MSC_VER) && (_MSC_VER < 1900) -// #define USE_MY_MM -#endif +#include "RotateDefs.h" +#include "CpuArch.h" #ifdef MY_CPU_X86_OR_AMD64 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ @@ -56,7 +52,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS = Sha1_UpdateBlocks; static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS_HW; - #define SHA1_UPDATE_BLOCKS(p) p->func_UpdateBlocks + #define SHA1_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks #else #define SHA1_UPDATE_BLOCKS(p) Sha1_UpdateBlocks #endif @@ -85,7 +81,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo) return False; #endif - p->func_UpdateBlocks = func; + p->v.vars.func_UpdateBlocks = func; return True; } @@ -225,7 +221,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo) void Sha1_InitState(CSha1 *p) { - p->count = 0; + p->v.vars.count = 0; p->state[0] = 0x67452301; p->state[1] = 0xEFCDAB89; p->state[2] = 0x98BADCFE; @@ -235,7 +231,7 @@ void Sha1_InitState(CSha1 *p) void Sha1_Init(CSha1 *p) { - p->func_UpdateBlocks = + p->v.vars.func_UpdateBlocks = #ifdef Z7_COMPILER_SHA1_SUPPORTED g_SHA1_FUNC_UPDATE_BLOCKS; #else @@ -250,7 +246,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num { UInt32 a, b, c, d, e; UInt32 W[kNumW]; - // if (numBlocks != 0x1264378347) return; + if (numBlocks == 0) return; @@ -283,7 +279,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num state[3] = d; state[4] = e; - data += 64; + data += SHA1_BLOCK_SIZE; } while (--numBlocks); } @@ -295,20 +291,15 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size) { if (size == 0) return; - { - unsigned pos = (unsigned)p->count & 0x3F; - unsigned num; - - p->count += size; - - num = 64 - pos; + const unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1); + const unsigned num = SHA1_BLOCK_SIZE - pos; + p->v.vars.count += size; if (num > size) { memcpy(p->buffer + pos, data, size); return; } - if (pos != 0) { size -= num; @@ -318,9 +309,10 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size) } } { - size_t numBlocks = size >> 6; + const size_t numBlocks = size >> 6; + // if (numBlocks) SHA1_UPDATE_BLOCKS(p)(p->state, data, numBlocks); - size &= 0x3F; + size &= SHA1_BLOCK_SIZE - 1; if (size == 0) return; data += (numBlocks << 6); @@ -331,42 +323,21 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size) void Sha1_Final(CSha1 *p, Byte *digest) { - unsigned pos = (unsigned)p->count & 0x3F; - - + unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1); p->buffer[pos++] = 0x80; - - if (pos > (64 - 8)) + if (pos > (SHA1_BLOCK_SIZE - 4 * 2)) { - while (pos != 64) { p->buffer[pos++] = 0; } - // memset(&p->buf.buffer[pos], 0, 64 - pos); + while (pos != SHA1_BLOCK_SIZE) { p->buffer[pos++] = 0; } + // memset(&p->buf.buffer[pos], 0, SHA1_BLOCK_SIZE - pos); Sha1_UpdateBlock(p); pos = 0; } - - /* - if (pos & 3) - { - p->buffer[pos] = 0; - p->buffer[pos + 1] = 0; - p->buffer[pos + 2] = 0; - pos += 3; - pos &= ~3; - } - { - for (; pos < 64 - 8; pos += 4) - *(UInt32 *)(&p->buffer[pos]) = 0; - } - */ - - memset(&p->buffer[pos], 0, (64 - 8) - pos); - + memset(&p->buffer[pos], 0, (SHA1_BLOCK_SIZE - 4 * 2) - pos); { - 
const UInt64 numBits = (p->count << 3); - SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32)) - SetBe32(p->buffer + 64 - 4, (UInt32)(numBits)) + const UInt64 numBits = p->v.vars.count << 3; + SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32)) + SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 1, (UInt32)(numBits)) } - Sha1_UpdateBlock(p); SetBe32(digest, p->state[0]) @@ -375,16 +346,13 @@ void Sha1_Final(CSha1 *p, Byte *digest) SetBe32(digest + 12, p->state[3]) SetBe32(digest + 16, p->state[4]) - - - Sha1_InitState(p); } void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size) { - const UInt64 numBits = (p->count + size) << 3; + const UInt64 numBits = (p->v.vars.count + size) << 3; SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 2], (UInt32)(numBits >> 32)) SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 1], (UInt32)(numBits)) // SetBe32((UInt32 *)(block + size), 0x80000000); @@ -420,57 +388,32 @@ void Sha1_GetBlockDigest(const CSha1 *p, const Byte *data, Byte *destDigest) void Sha1Prepare(void) { - #ifdef Z7_COMPILER_SHA1_SUPPORTED +#ifdef Z7_COMPILER_SHA1_SUPPORTED SHA1_FUNC_UPDATE_BLOCKS f, f_hw; f = Sha1_UpdateBlocks; f_hw = NULL; - #ifdef MY_CPU_X86_OR_AMD64 - #ifndef USE_MY_MM +#ifdef MY_CPU_X86_OR_AMD64 if (CPU_IsSupported_SHA() && CPU_IsSupported_SSSE3() - // && CPU_IsSupported_SSE41() ) - #endif - #else +#else if (CPU_IsSupported_SHA1()) - #endif +#endif { // printf("\n========== HW SHA1 ======== \n"); - #if 0 && defined(MY_CPU_ARM_OR_ARM64) && defined(_MSC_VER) +#if 1 && defined(MY_CPU_ARM_OR_ARM64) && defined(Z7_MSC_VER_ORIGINAL) && (_MSC_FULL_VER < 192930037) /* there was bug in MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037). - It generated incorrect SHA-1 code. - 21.03 : we test sha1-hardware code at runtime initialization */ - - #pragma message("== SHA1 code: MSC compiler : failure-check code was inserted") - - UInt32 state[5] = { 0, 1, 2, 3, 4 } ; - Byte data[64]; - unsigned i; - for (i = 0; i < sizeof(data); i += 2) - { - data[i ] = (Byte)(i); - data[i + 1] = (Byte)(i + 1); - } - - Sha1_UpdateBlocks_HW(state, data, sizeof(data) / 64); - - if ( state[0] != 0x9acd7297 - || state[1] != 0x4624d898 - || state[2] != 0x0bf079f0 - || state[3] != 0x031e61b3 - || state[4] != 0x8323fe20) - { - // printf("\n========== SHA-1 hardware version failure ======== \n"); - } - else - #endif + It generated incorrect SHA-1 code. 
*/ + #pragma message("== SHA1 code can work incorrectly with this compiler") + #error Stop_Compiling_MSC_Compiler_BUG_SHA1 +#endif { f = f_hw = Sha1_UpdateBlocks_HW; } } g_SHA1_FUNC_UPDATE_BLOCKS = f; g_SHA1_FUNC_UPDATE_BLOCKS_HW = f_hw; - #endif +#endif } #undef kNumW diff --git a/C/Sha1.h b/C/Sha1.h index fecd9d3..529be4d 100644 --- a/C/Sha1.h +++ b/C/Sha1.h @@ -1,5 +1,5 @@ /* Sha1.h -- SHA-1 Hash -2023-04-02 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_SHA1_H #define ZIP7_INC_SHA1_H @@ -14,6 +14,9 @@ EXTERN_C_BEGIN #define SHA1_BLOCK_SIZE (SHA1_NUM_BLOCK_WORDS * 4) #define SHA1_DIGEST_SIZE (SHA1_NUM_DIGEST_WORDS * 4) + + + typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte *data, size_t numBlocks); /* @@ -32,9 +35,16 @@ typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte typedef struct { - SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks; - UInt64 count; - UInt64 _pad_2[2]; + union + { + struct + { + SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks; + UInt64 count; + } vars; + UInt64 _pad_64bit[4]; + void *_pad_align_ptr[2]; + } v; UInt32 state[SHA1_NUM_DIGEST_WORDS]; UInt32 _pad_3[3]; Byte buffer[SHA1_BLOCK_SIZE]; diff --git a/C/Sha1Opt.c b/C/Sha1Opt.c index 4e835f1..8738b94 100644 --- a/C/Sha1Opt.c +++ b/C/Sha1Opt.c @@ -1,18 +1,11 @@ /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions -2024-03-01 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include "Compiler.h" #include "CpuArch.h" -#if defined(_MSC_VER) -#if (_MSC_VER < 1900) && (_MSC_VER >= 1200) -// #define USE_MY_MM -#endif -#endif - // #define Z7_USE_HW_SHA_STUB // for debug - #ifdef MY_CPU_X86_OR_AMD64 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check #define USE_HW_SHA @@ -20,19 +13,14 @@ || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) #define USE_HW_SHA - #if !defined(_INTEL_COMPILER) + #if !defined(__INTEL_COMPILER) // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) #if !defined(__SHA__) || !defined(__SSSE3__) #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) #endif #endif #elif defined(_MSC_VER) - #ifdef USE_MY_MM - #define USE_VER_MIN 1300 - #else - #define USE_VER_MIN 1900 - #endif - #if (_MSC_VER >= USE_VER_MIN) + #if (_MSC_VER >= 1900) #define USE_HW_SHA #else #define Z7_USE_HW_SHA_STUB @@ -47,23 +35,20 @@ // #pragma message("Sha1 HW") + + + // sse/sse2/ssse3: #include // sha*: #include #if defined (__clang__) && defined(_MSC_VER) - // #if !defined(__SSSE3__) - // #endif #if !defined(__SHA__) #include #endif #else -#ifdef USE_MY_MM -#include "My_mm.h" -#endif - #endif /* @@ -84,7 +69,6 @@ SHA: _mm_sha1* */ - #define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src); #define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask); #define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask); @@ -99,11 +83,12 @@ SHA: #define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src); #define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src); - #define LOAD_SHUFFLE(m, k) \ m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ SHUFFLE_EPI8(m, mask) \ +#define NNN(m0, m1, m2, m3) + #define SM1(m0, m1, m2, m3) \ SHA1_MSG1(m0, m1) \ @@ -116,35 +101,19 @@ SHA: SM1(m0, m1, m2, m3) \ SHA1_MSG2(m3, m2) \ -#define NNN(m0, m1, m2, m3) - - - - - - - - - - - - - - - - - -#define R4(k, e0, e1, m0, m1, m2, m3, 
OP) \ +#define R4(k, m0, m1, m2, m3, e0, e1, OP) \ e1 = abcd; \ SHA1_RND4(abcd, e0, (k) / 5) \ SHA1_NEXTE(e1, m1) \ OP(m0, m1, m2, m3) \ + + #define R16(k, mx, OP0, OP1, OP2, OP3) \ - R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \ - R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \ - R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \ - R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 ) \ + R4 ( (k)*4+0, m0,m1,m2,m3, e0,e1, OP0 ) \ + R4 ( (k)*4+1, m1,m2,m3,m0, e1,e0, OP1 ) \ + R4 ( (k)*4+2, m2,m3,m0,m1, e0,e1, OP2 ) \ + R4 ( (k)*4+3, m3,mx,m1,m2, e1,e0, OP3 ) \ #define PREPARE_STATE \ SHUFFLE_EPI32 (abcd, 0x1B) \ @@ -162,8 +131,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t { const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); - __m128i abcd, e0; + __m128i abcd, e0; + if (numBlocks == 0) return; @@ -204,7 +174,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t PREPARE_STATE _mm_storeu_si128((__m128i *) (void *) state, abcd); - *(state+4) = (UInt32)_mm_cvtsi128_si32(e0); + *(state + 4) = (UInt32)_mm_cvtsi128_si32(e0); } #endif // USE_HW_SHA @@ -262,22 +232,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t #define _ARM_USE_NEW_NEON_INTRINSICS #endif - - - - #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) #include #else - - - - - - - - #if defined(__clang__) && __clang_major__ < 16 #if !defined(__ARM_FEATURE_SHA2) && \ !defined(__ARM_FEATURE_CRYPTO) @@ -329,26 +287,37 @@ typedef uint32x4_t v128; #endif #ifdef MY_CPU_BE - #define MY_rev32_for_LE(x) + #define MY_rev32_for_LE(x) x #else - #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) + #define MY_rev32_for_LE(x) vrev32q_u8(x) #endif -#define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) -#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) +#define LOAD_128_32(_p) vld1q_u32(_p) +#define LOAD_128_8(_p) vld1q_u8 (_p) +#define STORE_128_32(_p, _v) vst1q_u32(_p, _v) #define LOAD_SHUFFLE(m, k) \ - m = LOAD_128((data + (k) * 16)); \ - MY_rev32_for_LE(m); \ - -#define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3) -#define SU1(dest, src) dest = vsha1su1q_u32(dest, src) + m = vreinterpretq_u32_u8( \ + MY_rev32_for_LE( \ + LOAD_128_8(data + (k) * 16))); \ + +#define N0(dest, src2, src3) +#define N1(dest, src) +#define U0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3); +#define U1(dest, src) dest = vsha1su1q_u32(dest, src); #define C(e) abcd = vsha1cq_u32(abcd, e, t) #define P(e) abcd = vsha1pq_u32(abcd, e, t) #define M(e) abcd = vsha1mq_u32(abcd, e, t) #define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) #define T(m, c) t = vaddq_u32(m, c) +#define R16(d0,d1,d2,d3, f0,z0, f1,z1, f2,z2, f3,z3, w0,w1,w2,w3) \ + T(m0, d0); f0(m3, m0, m1) z0(m2, m1) H(e1); w0(e0); \ + T(m1, d1); f1(m0, m1, m2) z1(m3, m2) H(e0); w1(e1); \ + T(m2, d2); f2(m1, m2, m3) z2(m0, m3) H(e1); w2(e0); \ + T(m3, d3); f3(m2, m3, m0) z3(m1, m0) H(e0); w3(e1); \ + + void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); #ifdef ATTRIB_SHA ATTRIB_SHA @@ -367,7 +336,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t c2 = vdupq_n_u32(0x8f1bbcdc); c3 = vdupq_n_u32(0xca62c1d6); - abcd = LOAD_128(&state[0]); + abcd = LOAD_128_32(&state[0]); e0 = state[4]; do @@ -385,26 +354,11 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t LOAD_SHUFFLE (m2, 2) LOAD_SHUFFLE (m3, 3) - T(m0, c0); H(e1); C(e0); - T(m1, c0); 
SU0(m0, m1, m2); H(e0); C(e1); - T(m2, c0); SU0(m1, m2, m3); SU1(m0, m3); H(e1); C(e0); - T(m3, c0); SU0(m2, m3, m0); SU1(m1, m0); H(e0); C(e1); - T(m0, c0); SU0(m3, m0, m1); SU1(m2, m1); H(e1); C(e0); - T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1); - T(m2, c1); SU0(m1, m2, m3); SU1(m0, m3); H(e1); P(e0); - T(m3, c1); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1); - T(m0, c1); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0); - T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1); - T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0); - T(m3, c2); SU0(m2, m3, m0); SU1(m1, m0); H(e0); M(e1); - T(m0, c2); SU0(m3, m0, m1); SU1(m2, m1); H(e1); M(e0); - T(m1, c2); SU0(m0, m1, m2); SU1(m3, m2); H(e0); M(e1); - T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0); - T(m3, c3); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1); - T(m0, c3); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0); - T(m1, c3); SU1(m3, m2); H(e0); P(e1); - T(m2, c3); H(e1); P(e0); - T(m3, c3); H(e0); P(e1); + R16 ( c0,c0,c0,c0, N0,N1, U0,N1, U0,U1, U0,U1, C,C,C,C ) + R16 ( c0,c1,c1,c1, U0,U1, U0,U1, U0,U1, U0,U1, C,P,P,P ) + R16 ( c1,c1,c2,c2, U0,U1, U0,U1, U0,U1, U0,U1, P,P,M,M ) + R16 ( c2,c2,c2,c3, U0,U1, U0,U1, U0,U1, U0,U1, M,M,M,P ) + R16 ( c3,c3,c3,c3, U0,U1, N0,U1, N0,N1, N0,N1, P,P,P,P ) abcd = vaddq_u32(abcd, abcd_save); e0 += e0_save; @@ -413,7 +367,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t } while (--numBlocks); - STORE_128(&state[0], abcd); + STORE_128_32(&state[0], abcd); state[4] = e0; } @@ -421,13 +375,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t #endif // MY_CPU_ARM_OR_ARM64 - #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) // #error Stop_Compiling_UNSUPPORTED_SHA // #include - - - // #include "Sha1.h" // #if defined(_MSC_VER) #pragma message("Sha1 HW-SW stub was used") @@ -447,8 +397,10 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t } #endif -#undef SU0 -#undef SU1 +#undef U0 +#undef U1 +#undef N0 +#undef N1 #undef C #undef P #undef M diff --git a/C/Sha256.c b/C/Sha256.c index 14d3be9..ea7ed8e 100644 --- a/C/Sha256.c +++ b/C/Sha256.c @@ -1,18 +1,14 @@ /* Sha256.c -- SHA-256 Hash -2024-03-01 : Igor Pavlov : Public domain +: Igor Pavlov : Public domain This code is based on public domain code from Wei Dai's Crypto++ library. 
*/ #include "Precomp.h" #include -#include "CpuArch.h" -#include "RotateDefs.h" #include "Sha256.h" - -#if defined(_MSC_VER) && (_MSC_VER < 1900) -// #define USE_MY_MM -#endif +#include "RotateDefs.h" +#include "CpuArch.h" #ifdef MY_CPU_X86_OR_AMD64 #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ @@ -56,7 +52,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks; static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW; - #define SHA256_UPDATE_BLOCKS(p) p->func_UpdateBlocks + #define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks #else #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks #endif @@ -85,7 +81,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo) return False; #endif - p->func_UpdateBlocks = func; + p->v.vars.func_UpdateBlocks = func; return True; } @@ -111,7 +107,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo) void Sha256_InitState(CSha256 *p) { - p->count = 0; + p->v.vars.count = 0; p->state[0] = 0x6a09e667; p->state[1] = 0xbb67ae85; p->state[2] = 0x3c6ef372; @@ -122,9 +118,16 @@ void Sha256_InitState(CSha256 *p) p->state[7] = 0x5be0cd19; } + + + + + + + void Sha256_Init(CSha256 *p) { - p->func_UpdateBlocks = + p->v.vars.func_UpdateBlocks = #ifdef Z7_COMPILER_SHA256_SUPPORTED g_SHA256_FUNC_UPDATE_BLOCKS; #else @@ -133,10 +136,10 @@ void Sha256_Init(CSha256 *p) Sha256_InitState(p); } -#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) -#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25)) +#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x,22)) +#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x,25)) #define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) -#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) +#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >>10)) #define Ch(x,y,z) (z^(x&(y^z))) #define Maj(x,y,z) ((x&y)|(z&(x|y))) @@ -224,12 +227,10 @@ void Sha256_Init(CSha256 *p) #endif -// static -extern MY_ALIGN(64) -const UInt32 SHA256_K_ARRAY[64]; -MY_ALIGN(64) -const UInt32 SHA256_K_ARRAY[64] = { +extern +MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; +MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, @@ -248,27 +249,29 @@ const UInt32 SHA256_K_ARRAY[64] = { 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 }; -#define K SHA256_K_ARRAY + + +#define K SHA256_K_ARRAY + Z7_NO_INLINE void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks) { UInt32 W - #ifdef Z7_SHA256_BIG_W +#ifdef Z7_SHA256_BIG_W [64]; - #else +#else [16]; - #endif - +#endif unsigned j; - UInt32 a,b,c,d,e,f,g,h; - - #if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) +#if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) UInt32 tmp; - #endif +#endif + if (numBlocks == 0) return; + a = state[0]; b = state[1]; c = state[2]; @@ -278,7 +281,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n g = state[6]; h = state[7]; - while (numBlocks) + do { for (j = 0; j < 16; j += STEP_PRE) @@ -352,19 +355,11 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n g += state[6]; state[6] = g; h += state[7]; state[7] = h; - data += 64; - numBlocks--; + data += SHA256_BLOCK_SIZE; } - - /* Wipe variables */ - /* 
memset(W, 0, sizeof(W)); */ + while (--numBlocks); } -#undef S0 -#undef S1 -#undef s0 -#undef s1 -#undef K #define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) @@ -372,20 +367,15 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) { if (size == 0) return; - { - unsigned pos = (unsigned)p->count & 0x3F; - unsigned num; - - p->count += size; - - num = 64 - pos; + const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); + const unsigned num = SHA256_BLOCK_SIZE - pos; + p->v.vars.count += size; if (num > size) { memcpy(p->buffer + pos, data, size); return; } - if (pos != 0) { size -= num; @@ -395,9 +385,10 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) } } { - size_t numBlocks = size >> 6; + const size_t numBlocks = size >> 6; + // if (numBlocks) SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks); - size &= 0x3F; + size &= SHA256_BLOCK_SIZE - 1; if (size == 0) return; data += (numBlocks << 6); @@ -408,82 +399,69 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size) void Sha256_Final(CSha256 *p, Byte *digest) { - unsigned pos = (unsigned)p->count & 0x3F; - unsigned i; - + unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1); p->buffer[pos++] = 0x80; - - if (pos > (64 - 8)) + if (pos > (SHA256_BLOCK_SIZE - 4 * 2)) { - while (pos != 64) { p->buffer[pos++] = 0; } - // memset(&p->buf.buffer[pos], 0, 64 - pos); + while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; } + // memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos); Sha256_UpdateBlock(p); pos = 0; } - - /* - if (pos & 3) + memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos); { - p->buffer[pos] = 0; - p->buffer[pos + 1] = 0; - p->buffer[pos + 2] = 0; - pos += 3; - pos &= ~3; + const UInt64 numBits = p->v.vars.count << 3; + SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32)) + SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits)) } + Sha256_UpdateBlock(p); +#if 1 && defined(MY_CPU_BE) + memcpy(digest, p->state, SHA256_DIGEST_SIZE); +#else { - for (; pos < 64 - 8; pos += 4) - *(UInt32 *)(&p->buffer[pos]) = 0; + unsigned i; + for (i = 0; i < 8; i += 2) + { + const UInt32 v0 = p->state[i]; + const UInt32 v1 = p->state[(size_t)i + 1]; + SetBe32(digest , v0) + SetBe32(digest + 4, v1) + digest += 4 * 2; + } } - */ - memset(&p->buffer[pos], 0, (64 - 8) - pos); - { - UInt64 numBits = (p->count << 3); - SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32)) - SetBe32(p->buffer + 64 - 4, (UInt32)(numBits)) - } - - Sha256_UpdateBlock(p); - for (i = 0; i < 8; i += 2) - { - UInt32 v0 = p->state[i]; - UInt32 v1 = p->state[(size_t)i + 1]; - SetBe32(digest , v0) - SetBe32(digest + 4, v1) - digest += 8; - } - + +#endif Sha256_InitState(p); } void Sha256Prepare(void) { - #ifdef Z7_COMPILER_SHA256_SUPPORTED +#ifdef Z7_COMPILER_SHA256_SUPPORTED SHA256_FUNC_UPDATE_BLOCKS f, f_hw; f = Sha256_UpdateBlocks; f_hw = NULL; - #ifdef MY_CPU_X86_OR_AMD64 - #ifndef USE_MY_MM +#ifdef MY_CPU_X86_OR_AMD64 if (CPU_IsSupported_SHA() && CPU_IsSupported_SSSE3() - // && CPU_IsSupported_SSE41() ) - #endif - #else +#else if (CPU_IsSupported_SHA2()) - #endif +#endif { // printf("\n========== HW SHA256 ======== \n"); f = f_hw = Sha256_UpdateBlocks_HW; } g_SHA256_FUNC_UPDATE_BLOCKS = f; g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw; - #endif +#endif } +#undef U64C +#undef K #undef S0 #undef S1 #undef s0 diff --git a/C/Sha256.h b/C/Sha256.h index 9e04223..75329cd 100644 --- a/C/Sha256.h +++ b/C/Sha256.h @@ -1,5 +1,5 @@ /* Sha256.h -- SHA-256 Hash 
-2023-04-02 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #ifndef ZIP7_INC_SHA256_H #define ZIP7_INC_SHA256_H @@ -14,6 +14,9 @@ EXTERN_C_BEGIN #define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4) #define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4) + + + typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks); /* @@ -32,9 +35,16 @@ typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byt typedef struct { - SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; - UInt64 count; - UInt64 _pad_2[2]; + union + { + struct + { + SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; + UInt64 count; + } vars; + UInt64 _pad_64bit[4]; + void *_pad_align_ptr[2]; + } v; UInt32 state[SHA256_NUM_DIGEST_WORDS]; Byte buffer[SHA256_BLOCK_SIZE]; diff --git a/C/Sha256Opt.c b/C/Sha256Opt.c index eb38166..1c6b50f 100644 --- a/C/Sha256Opt.c +++ b/C/Sha256Opt.c @@ -1,18 +1,11 @@ /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions -2024-03-01 : Igor Pavlov : Public domain */ +: Igor Pavlov : Public domain */ #include "Precomp.h" #include "Compiler.h" #include "CpuArch.h" -#if defined(_MSC_VER) -#if (_MSC_VER < 1900) && (_MSC_VER >= 1200) -// #define USE_MY_MM -#endif -#endif - // #define Z7_USE_HW_SHA_STUB // for debug - #ifdef MY_CPU_X86_OR_AMD64 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check #define USE_HW_SHA @@ -20,19 +13,14 @@ || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) #define USE_HW_SHA - #if !defined(_INTEL_COMPILER) + #if !defined(__INTEL_COMPILER) // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) #if !defined(__SHA__) || !defined(__SSSE3__) #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) #endif #endif #elif defined(_MSC_VER) - #ifdef USE_MY_MM - #define USE_VER_MIN 1300 - #else - #define USE_VER_MIN 1900 - #endif - #if (_MSC_VER >= USE_VER_MIN) + #if (_MSC_VER >= 1900) #define USE_HW_SHA #else #define Z7_USE_HW_SHA_STUB @@ -47,23 +35,20 @@ // #pragma message("Sha256 HW") + + + // sse/sse2/ssse3: #include // sha*: #include #if defined (__clang__) && defined(_MSC_VER) - // #if !defined(__SSSE3__) - // #endif #if !defined(__SHA__) #include #endif #else -#ifdef USE_MY_MM -#include "My_mm.h" -#endif - #endif /* @@ -91,60 +76,44 @@ SHA: extern MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; - #define K SHA256_K_ARRAY #define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); #define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); -#define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); - +#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); #define LOAD_SHUFFLE(m, k) \ m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ m = _mm_shuffle_epi8(m, mask); \ -#define SM1(g0, g1, g2, g3) \ - SHA256_MSG1(g3, g0); \ +#define NNN(m0, m1, m2, m3) -#define SM2(g0, g1, g2, g3) \ - tmp = _mm_alignr_epi8(g1, g0, 4); \ - ADD_EPI32(g2, tmp) \ - SHA25G_MSG2(g2, g1); \ - -// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k) -// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1) - - -#define NNN(g0, g1, g2, g3) +#define SM1(m1, m2, m3, m0) \ + SHA256_MSG1(m0, m1); \ +#define SM2(m2, m3, m0, m1) \ + ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \ + SHA256_MSG2(m0, m3); \ #define RND2(t0, t1) \ t0 = _mm_sha256rnds2_epu32(t0, t1, msg); -#define RND2_0(m, k) \ - msg = _mm_add_epi32(m, *(const __m128i *) (const 
void *) &K[(k) * 4]); \ - RND2(state0, state1); \ - msg = _mm_shuffle_epi32(msg, 0x0E); \ -#define RND2_1 \ +#define R4(k, m0, m1, m2, m3, OP0, OP1) \ + msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \ + RND2(state0, state1); \ + msg = _mm_shuffle_epi32(msg, 0x0E); \ + OP0(m0, m1, m2, m3) \ RND2(state1, state0); \ - - -// We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2 - -#define R4(k, g0, g1, g2, g3, OP0, OP1) \ - RND2_0(g0, k) \ - OP0(g0, g1, g2, g3) \ - RND2_1 \ - OP1(g0, g1, g2, g3) \ + OP1(m0, m1, m2, m3) \ #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ - R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ - R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ - R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ - R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ + R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ + R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ + R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ + R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ #define PREPARE_STATE \ tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ @@ -161,8 +130,9 @@ ATTRIB_SHA void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) { const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); - __m128i tmp; - __m128i state0, state1; + + + __m128i tmp, state0, state1; if (numBlocks == 0) return; @@ -262,22 +232,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ #define _ARM_USE_NEW_NEON_INTRINSICS #endif - - - - #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) #include #else - - - - - - - - #if defined(__clang__) && __clang_major__ < 16 #if !defined(__ARM_FEATURE_SHA2) && \ !defined(__ARM_FEATURE_CRYPTO) @@ -324,41 +282,70 @@ typedef uint32x4_t v128; // typedef __n128 v128; // MSVC #ifdef MY_CPU_BE - #define MY_rev32_for_LE(x) + #define MY_rev32_for_LE(x) x #else - #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) + #define MY_rev32_for_LE(x) vrev32q_u8(x) #endif -#define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) -#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) +#if 1 // 0 for debug +// for arm32: it works slower by some reason than direct code +/* +for arm32 it generates: +MSVC-2022, GCC-9: + vld1.32 {d18,d19}, [r10] + vst1.32 {d4,d5}, [r3] + vld1.8 {d20-d21}, [r4] +there is no align hint (like [r10:128]). So instruction allows unaligned access +*/ +#define LOAD_128_32(_p) vld1q_u32(_p) +#define LOAD_128_8(_p) vld1q_u8 (_p) +#define STORE_128_32(_p, _v) vst1q_u32(_p, _v) +#else +/* +for arm32: +MSVC-2022: + vldm r10,{d18,d19} + vstm r3,{d4,d5} + does it require strict alignment? +GCC-9: + vld1.64 {d30-d31}, [r0:64] + vldr d28, [r0, #16] + vldr d29, [r0, #24] + vst1.64 {d30-d31}, [r0:64] + vstr d28, [r0, #16] + vstr d29, [r0, #24] +there is hint [r0:64], so does it requires 64-bit alignment. +*/ +#define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p)) +#define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p)) +#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v) +#endif #define LOAD_SHUFFLE(m, k) \ - m = LOAD_128((data + (k) * 16)); \ - MY_rev32_for_LE(m); \ + m = vreinterpretq_u32_u8( \ + MY_rev32_for_LE( \ + LOAD_128_8(data + (k) * 16))); \ // K array must be aligned for 16-bytes at least. 
extern MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; - #define K SHA256_K_ARRAY - #define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src); -#define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); +#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); -#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0) -#define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1) -#define NNN(g0, g1, g2, g3) +#define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0) +#define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1) +#define NNN(m0, m1, m2, m3) - -#define R4(k, g0, g1, g2, g3, OP0, OP1) \ - msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \ +#define R4(k, m0, m1, m2, m3, OP0, OP1) \ + msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \ tmp = state0; \ state0 = vsha256hq_u32( state0, state1, msg ); \ state1 = vsha256h2q_u32( state1, tmp, msg ); \ - OP0(g0, g1, g2, g3); \ - OP1(g0, g1, g2, g3); \ + OP0(m0, m1, m2, m3); \ + OP1(m0, m1, m2, m3); \ #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ @@ -379,8 +366,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ if (numBlocks == 0) return; - state0 = LOAD_128(&state[0]); - state1 = LOAD_128(&state[4]); + state0 = LOAD_128_32(&state[0]); + state1 = LOAD_128_32(&state[4]); do { @@ -408,8 +395,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ } while (--numBlocks); - STORE_128(&state[0], state0); - STORE_128(&state[4], state1); + STORE_128_32(&state[0], state0); + STORE_128_32(&state[4], state1); } #endif // USE_HW_SHA @@ -443,13 +430,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ #endif - #undef K #undef RND2 -#undef RND2_0 -#undef RND2_1 - #undef MY_rev32_for_LE + #undef NNN #undef LOAD_128 #undef STORE_128 @@ -457,7 +441,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ #undef SM1 #undef SM2 -#undef NNN + #undef R4 #undef R16 #undef PREPARE_STATE diff --git a/C/Sha3.c b/C/Sha3.c new file mode 100644 index 0000000..be972d6 --- /dev/null +++ b/C/Sha3.c @@ -0,0 +1,359 @@ +/* Sha3.c -- SHA-3 Hash +: Igor Pavlov : Public domain +This code is based on public domain code from Wei Dai's Crypto++ library. 
*/ + +#include "Precomp.h" + +#include + +#include "Sha3.h" +#include "RotateDefs.h" +#include "CpuArch.h" + +#define U64C(x) UINT64_CONST(x) + +static +MY_ALIGN(64) +const UInt64 SHA3_K_ARRAY[24] = +{ + U64C(0x0000000000000001), U64C(0x0000000000008082), + U64C(0x800000000000808a), U64C(0x8000000080008000), + U64C(0x000000000000808b), U64C(0x0000000080000001), + U64C(0x8000000080008081), U64C(0x8000000000008009), + U64C(0x000000000000008a), U64C(0x0000000000000088), + U64C(0x0000000080008009), U64C(0x000000008000000a), + U64C(0x000000008000808b), U64C(0x800000000000008b), + U64C(0x8000000000008089), U64C(0x8000000000008003), + U64C(0x8000000000008002), U64C(0x8000000000000080), + U64C(0x000000000000800a), U64C(0x800000008000000a), + U64C(0x8000000080008081), U64C(0x8000000000008080), + U64C(0x0000000080000001), U64C(0x8000000080008008) +}; + +void Sha3_Init(CSha3 *p) +{ + p->count = 0; + memset(p->state, 0, sizeof(p->state)); +} + +#define GET_state(i, a) UInt64 a = state[i]; +#define SET_state(i, a) state[i] = a; + +#define LS_5(M, i, a0,a1,a2,a3,a4) \ + M ((i) * 5 , a0) \ + M ((i) * 5 + 1, a1) \ + M ((i) * 5 + 2, a2) \ + M ((i) * 5 + 3, a3) \ + M ((i) * 5 + 4, a4) \ + +#define LS_25(M) \ + LS_5 (M, 0, a50, a51, a52, a53, a54) \ + LS_5 (M, 1, a60, a61, a62, a63, a64) \ + LS_5 (M, 2, a70, a71, a72, a73, a74) \ + LS_5 (M, 3, a80, a81, a82, a83, a84) \ + LS_5 (M, 4, a90, a91, a92, a93, a94) \ + + +#define XOR_1(i, a0) \ + a0 ^= GetUi64(data + (i) * 8); \ + +#define XOR_4(i, a0,a1,a2,a3) \ + XOR_1 ((i) , a0); \ + XOR_1 ((i) + 1, a1); \ + XOR_1 ((i) + 2, a2); \ + XOR_1 ((i) + 3, a3); \ + +#define D(d,b1,b2) \ + d = b1 ^ Z7_ROTL64(b2, 1); + +#define D5 \ + D (d0, c4, c1) \ + D (d1, c0, c2) \ + D (d2, c1, c3) \ + D (d3, c2, c4) \ + D (d4, c3, c0) \ + +#define C0(c,a,d) \ + c = a ^ d; \ + +#define C(c,a,d,k) \ + c = a ^ d; \ + c = Z7_ROTL64(c, k); \ + +#define E4(e1,e2,e3,e4) \ + e1 = c1 ^ (~c2 & c3); \ + e2 = c2 ^ (~c3 & c4); \ + e3 = c3 ^ (~c4 & c0); \ + e4 = c4 ^ (~c0 & c1); \ + +#define CK( v0,w0, \ + v1,w1,k1, \ + v2,w2,k2, \ + v3,w3,k3, \ + v4,w4,k4, e0,e1,e2,e3,e4, keccak_c) \ + C0(c0,v0,w0) \ + C (c1,v1,w1,k1) \ + C (c2,v2,w2,k2) \ + C (c3,v3,w3,k3) \ + C (c4,v4,w4,k4) \ + e0 = c0 ^ (~c1 & c2) ^ keccak_c; \ + E4(e1,e2,e3,e4) \ + +#define CE( v0,w0,k0, \ + v1,w1,k1, \ + v2,w2,k2, \ + v3,w3,k3, \ + v4,w4,k4, e0,e1,e2,e3,e4) \ + C (c0,v0,w0,k0) \ + C (c1,v1,w1,k1) \ + C (c2,v2,w2,k2) \ + C (c3,v3,w3,k3) \ + C (c4,v4,w4,k4) \ + e0 = c0 ^ (~c1 & c2); \ + E4(e1,e2,e3,e4) \ + +// numBlocks != 0 +static +Z7_NO_INLINE +void Z7_FASTCALL Sha3_UpdateBlocks(UInt64 state[SHA3_NUM_STATE_WORDS], + const Byte *data, size_t numBlocks, size_t blockSize) +{ + LS_25 (GET_state) + + do + { + unsigned round; + XOR_4 ( 0, a50, a51, a52, a53) + XOR_4 ( 4, a54, a60, a61, a62) + XOR_1 ( 8, a63) + if (blockSize > 8 * 9) { XOR_4 ( 9, a64, a70, a71, a72) // sha3-384 + if (blockSize > 8 * 13) { XOR_4 (13, a73, a74, a80, a81) // sha3-256 + if (blockSize > 8 * 17) { XOR_1 (17, a82) // sha3-224 + if (blockSize > 8 * 18) { XOR_1 (18, a83) // shake128 + XOR_1 (19, a84) + XOR_1 (20, a90) }}}} + data += blockSize; + + for (round = 0; round < 24; round += 2) + { + UInt64 c0, c1, c2, c3, c4; + UInt64 d0, d1, d2, d3, d4; + UInt64 e50, e51, e52, e53, e54; + UInt64 e60, e61, e62, e63, e64; + UInt64 e70, e71, e72, e73, e74; + UInt64 e80, e81, e82, e83, e84; + UInt64 e90, e91, e92, e93, e94; + + c0 = a50^a60^a70^a80^a90; + c1 = a51^a61^a71^a81^a91; + c2 = a52^a62^a72^a82^a92; + c3 = a53^a63^a73^a83^a93; + c4 = a54^a64^a74^a84^a94; 
+ D5 + CK( a50, d0, + a61, d1, 44, + a72, d2, 43, + a83, d3, 21, + a94, d4, 14, e50, e51, e52, e53, e54, SHA3_K_ARRAY[round]) + CE( a53, d3, 28, + a64, d4, 20, + a70, d0, 3, + a81, d1, 45, + a92, d2, 61, e60, e61, e62, e63, e64) + CE( a51, d1, 1, + a62, d2, 6, + a73, d3, 25, + a84, d4, 8, + a90, d0, 18, e70, e71, e72, e73, e74) + CE( a54, d4, 27, + a60, d0, 36, + a71, d1, 10, + a82, d2, 15, + a93, d3, 56, e80, e81, e82, e83, e84) + CE( a52, d2, 62, + a63, d3, 55, + a74, d4, 39, + a80, d0, 41, + a91, d1, 2, e90, e91, e92, e93, e94) + + // ---------- ROUND + 1 ---------- + + c0 = e50^e60^e70^e80^e90; + c1 = e51^e61^e71^e81^e91; + c2 = e52^e62^e72^e82^e92; + c3 = e53^e63^e73^e83^e93; + c4 = e54^e64^e74^e84^e94; + D5 + CK( e50, d0, + e61, d1, 44, + e72, d2, 43, + e83, d3, 21, + e94, d4, 14, a50, a51, a52, a53, a54, SHA3_K_ARRAY[(size_t)round + 1]) + CE( e53, d3, 28, + e64, d4, 20, + e70, d0, 3, + e81, d1, 45, + e92, d2, 61, a60, a61, a62, a63, a64) + CE( e51, d1, 1, + e62, d2, 6, + e73, d3, 25, + e84, d4, 8, + e90, d0, 18, a70, a71, a72, a73, a74) + CE (e54, d4, 27, + e60, d0, 36, + e71, d1, 10, + e82, d2, 15, + e93, d3, 56, a80, a81, a82, a83, a84) + CE (e52, d2, 62, + e63, d3, 55, + e74, d4, 39, + e80, d0, 41, + e91, d1, 2, a90, a91, a92, a93, a94) + } + } + while (--numBlocks); + + LS_25 (SET_state) +} + + +#define Sha3_UpdateBlock(p) \ + Sha3_UpdateBlocks(p->state, p->buffer, 1, p->blockSize) + +void Sha3_Update(CSha3 *p, const Byte *data, size_t size) +{ +/* + for (;;) + { + if (size == 0) + return; + unsigned cur = p->blockSize - p->count; + if (cur > size) + cur = (unsigned)size; + size -= cur; + unsigned pos = p->count; + p->count = pos + cur; + while (pos & 7) + { + if (cur == 0) + return; + Byte *pb = &(((Byte *)p->state)[pos]); + *pb = (Byte)(*pb ^ *data++); + cur--; + pos++; + } + if (cur >= 8) + { + do + { + *(UInt64 *)(void *)&(((Byte *)p->state)[pos]) ^= GetUi64(data); + data += 8; + pos += 8; + cur -= 8; + } + while (cur >= 8); + } + if (pos != p->blockSize) + { + if (cur) + { + Byte *pb = &(((Byte *)p->state)[pos]); + do + { + *pb = (Byte)(*pb ^ *data++); + pb++; + } + while (--cur); + } + return; + } + Sha3_UpdateBlock(p->state); + p->count = 0; + } +*/ + if (size == 0) + return; + { + const unsigned pos = p->count; + const unsigned num = p->blockSize - pos; + if (num > size) + { + p->count = pos + (unsigned)size; + memcpy(p->buffer + pos, data, size); + return; + } + if (pos != 0) + { + size -= num; + memcpy(p->buffer + pos, data, num); + data += num; + Sha3_UpdateBlock(p); + } + } + if (size >= p->blockSize) + { + const size_t numBlocks = size / p->blockSize; + const Byte *dataOld = data; + data += numBlocks * p->blockSize; + size = (size_t)(dataOld + size - data); + Sha3_UpdateBlocks(p->state, dataOld, numBlocks, p->blockSize); + } + p->count = (unsigned)size; + if (size) + memcpy(p->buffer, data, size); +} + + +// we support only (digestSize % 4 == 0) cases +void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake) +{ + memset(p->buffer + p->count, 0, p->blockSize - p->count); + // we write bits markers from low to higher in current byte: + // - if sha-3 : 2 bits : 0,1 + // - if shake : 4 bits : 1111 + // then we write bit 1 to same byte. + // And we write bit 1 to highest bit of last byte of block. + p->buffer[p->count] = (Byte)(shake ? 0x1f : 0x06); + // we need xor operation (^= 0x80) here because we must write 0x80 bit + // to same byte as (0x1f : 0x06), if (p->count == p->blockSize - 1) !!! 
+ p->buffer[p->blockSize - 1] ^= 0x80; +/* + ((Byte *)p->state)[p->count] ^= (Byte)(shake ? 0x1f : 0x06); + ((Byte *)p->state)[p->blockSize - 1] ^= 0x80; +*/ + Sha3_UpdateBlock(p); +#if 1 && defined(MY_CPU_LE) + memcpy(digest, p->state, digestSize); +#else + { + const unsigned numWords = digestSize >> 3; + unsigned i; + for (i = 0; i < numWords; i++) + { + const UInt64 v = p->state[i]; + SetUi64(digest, v) + digest += 8; + } + if (digestSize & 4) // for SHA3-224 + { + const UInt32 v = (UInt32)p->state[numWords]; + SetUi32(digest, v) + } + } +#endif + Sha3_Init(p); +} + +#undef GET_state +#undef SET_state +#undef LS_5 +#undef LS_25 +#undef XOR_1 +#undef XOR_4 +#undef D +#undef D5 +#undef C0 +#undef C +#undef E4 +#undef CK +#undef CE diff --git a/C/Sha3.h b/C/Sha3.h new file mode 100644 index 0000000..c5909c9 --- /dev/null +++ b/C/Sha3.h @@ -0,0 +1,36 @@ +/* Sha3.h -- SHA-3 Hash +: Igor Pavlov : Public domain */ + +#ifndef ZIP7_INC_MD5_H +#define ZIP7_INC_MD5_H + +#include "7zTypes.h" + +EXTERN_C_BEGIN + +#define SHA3_NUM_STATE_WORDS 25 + +#define SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(digestSize) \ + (SHA3_NUM_STATE_WORDS * 8 - (digestSize) * 2) + +typedef struct +{ + UInt32 count; // < blockSize + UInt32 blockSize; // <= SHA3_NUM_STATE_WORDS * 8 + UInt64 _pad1[3]; + // we want 32-bytes alignment here + UInt64 state[SHA3_NUM_STATE_WORDS]; + UInt64 _pad2[3]; + // we want 64-bytes alignment here + Byte buffer[SHA3_NUM_STATE_WORDS * 8]; // last bytes will be unused with predefined blockSize values +} CSha3; + +#define Sha3_SET_blockSize(p, blockSize) { (p)->blockSize = (blockSize); } + +void Sha3_Init(CSha3 *p); +void Sha3_Update(CSha3 *p, const Byte *data, size_t size); +void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake); + +EXTERN_C_END + +#endif diff --git a/C/Sha512.c b/C/Sha512.c new file mode 100644 index 0000000..04827d6 --- /dev/null +++ b/C/Sha512.c @@ -0,0 +1,618 @@ +/* Sha512.c -- SHA-512 Hash +: Igor Pavlov : Public domain +This code is based on public domain code from Wei Dai's Crypto++ library. 
*/ + +#include "Precomp.h" + +#include + +#include "Sha512.h" +#include "RotateDefs.h" +#include "CpuArch.h" + +#ifdef MY_CPU_X86_OR_AMD64 + #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \ + || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000) \ + || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) \ + || defined(_MSC_VER) && (_MSC_VER >= 1940) + #define Z7_COMPILER_SHA512_SUPPORTED + #endif +#elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE) + #if defined(__ARM_FEATURE_SHA512) + #define Z7_COMPILER_SHA512_SUPPORTED + #else + #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \ + || defined(__GNUC__) && (__GNUC__ >= 9) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it + #define Z7_COMPILER_SHA512_SUPPORTED + #endif + #endif +#endif + + + + + + + + + + + + + + +void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks); + +#ifdef Z7_COMPILER_SHA512_SUPPORTED + void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks); + + static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS = Sha512_UpdateBlocks; + static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS_HW; + + #define SHA512_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks +#else + #define SHA512_UPDATE_BLOCKS(p) Sha512_UpdateBlocks +#endif + + +BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo) +{ + SHA512_FUNC_UPDATE_BLOCKS func = Sha512_UpdateBlocks; + + #ifdef Z7_COMPILER_SHA512_SUPPORTED + if (algo != SHA512_ALGO_SW) + { + if (algo == SHA512_ALGO_DEFAULT) + func = g_SHA512_FUNC_UPDATE_BLOCKS; + else + { + if (algo != SHA512_ALGO_HW) + return False; + func = g_SHA512_FUNC_UPDATE_BLOCKS_HW; + if (!func) + return False; + } + } + #else + if (algo > 1) + return False; + #endif + + p->v.vars.func_UpdateBlocks = func; + return True; +} + + +/* define it for speed optimization */ + +#if 0 // 1 for size optimization + #define STEP_PRE 1 + #define STEP_MAIN 1 +#else + #define STEP_PRE 2 + #define STEP_MAIN 4 + // #define Z7_SHA512_UNROLL +#endif + +#undef Z7_SHA512_BIG_W +#if STEP_MAIN != 16 + #define Z7_SHA512_BIG_W +#endif + + +#define U64C(x) UINT64_CONST(x) + +static MY_ALIGN(64) const UInt64 SHA512_INIT_ARRAYS[4][8] = { +{ U64C(0x8c3d37c819544da2), U64C(0x73e1996689dcd4d6), U64C(0x1dfab7ae32ff9c82), U64C(0x679dd514582f9fcf), + U64C(0x0f6d2b697bd44da8), U64C(0x77e36f7304c48942), U64C(0x3f9d85a86a1d36c8), U64C(0x1112e6ad91d692a1) +}, +{ U64C(0x22312194fc2bf72c), U64C(0x9f555fa3c84c64c2), U64C(0x2393b86b6f53b151), U64C(0x963877195940eabd), + U64C(0x96283ee2a88effe3), U64C(0xbe5e1e2553863992), U64C(0x2b0199fc2c85b8aa), U64C(0x0eb72ddc81c52ca2) +}, +{ U64C(0xcbbb9d5dc1059ed8), U64C(0x629a292a367cd507), U64C(0x9159015a3070dd17), U64C(0x152fecd8f70e5939), + U64C(0x67332667ffc00b31), U64C(0x8eb44a8768581511), U64C(0xdb0c2e0d64f98fa7), U64C(0x47b5481dbefa4fa4) +}, +{ U64C(0x6a09e667f3bcc908), U64C(0xbb67ae8584caa73b), U64C(0x3c6ef372fe94f82b), U64C(0xa54ff53a5f1d36f1), + U64C(0x510e527fade682d1), U64C(0x9b05688c2b3e6c1f), U64C(0x1f83d9abfb41bd6b), U64C(0x5be0cd19137e2179) +}}; + +void Sha512_InitState(CSha512 *p, unsigned digestSize) +{ + p->v.vars.count = 0; + memcpy(p->state, SHA512_INIT_ARRAYS[(size_t)(digestSize >> 4) - 1], sizeof(p->state)); +} + +void Sha512_Init(CSha512 *p, unsigned digestSize) +{ + p->v.vars.func_UpdateBlocks = + #ifdef Z7_COMPILER_SHA512_SUPPORTED + 
g_SHA512_FUNC_UPDATE_BLOCKS; + #else + NULL; + #endif + Sha512_InitState(p, digestSize); +} + +#define S0(x) (Z7_ROTR64(x,28) ^ Z7_ROTR64(x,34) ^ Z7_ROTR64(x,39)) +#define S1(x) (Z7_ROTR64(x,14) ^ Z7_ROTR64(x,18) ^ Z7_ROTR64(x,41)) +#define s0(x) (Z7_ROTR64(x, 1) ^ Z7_ROTR64(x, 8) ^ (x >> 7)) +#define s1(x) (Z7_ROTR64(x,19) ^ Z7_ROTR64(x,61) ^ (x >> 6)) + +#define Ch(x,y,z) (z^(x&(y^z))) +#define Maj(x,y,z) ((x&y)|(z&(x|y))) + + +#define W_PRE(i) (W[(i) + (size_t)(j)] = GetBe64(data + ((size_t)(j) + i) * 8)) + +#define blk2_main(j, i) s1(w(j, (i)-2)) + w(j, (i)-7) + s0(w(j, (i)-15)) + +#ifdef Z7_SHA512_BIG_W + // we use +i instead of +(i) to change the order to solve CLANG compiler warning for signed/unsigned. + #define w(j, i) W[(size_t)(j) + i] + #define blk2(j, i) (w(j, i) = w(j, (i)-16) + blk2_main(j, i)) +#else + #if STEP_MAIN == 16 + #define w(j, i) W[(i) & 15] + #else + #define w(j, i) W[((size_t)(j) + (i)) & 15] + #endif + #define blk2(j, i) (w(j, i) += blk2_main(j, i)) +#endif + +#define W_MAIN(i) blk2(j, i) + + +#define T1(wx, i) \ + tmp = h + S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ + h = g; \ + g = f; \ + f = e; \ + e = d + tmp; \ + tmp += S0(a) + Maj(a, b, c); \ + d = c; \ + c = b; \ + b = a; \ + a = tmp; \ + +#define R1_PRE(i) T1( W_PRE, i) +#define R1_MAIN(i) T1( W_MAIN, i) + +#if (!defined(Z7_SHA512_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4) +#define R2_MAIN(i) \ + R1_MAIN(i) \ + R1_MAIN(i + 1) \ + +#endif + + + +#if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8 + +#define T4( a,b,c,d,e,f,g,h, wx, i) \ + h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ + tmp = h; \ + h += d; \ + d = tmp + S0(a) + Maj(a, b, c); \ + +#define R4( wx, i) \ + T4 ( a,b,c,d,e,f,g,h, wx, (i )); \ + T4 ( d,a,b,c,h,e,f,g, wx, (i+1)); \ + T4 ( c,d,a,b,g,h,e,f, wx, (i+2)); \ + T4 ( b,c,d,a,f,g,h,e, wx, (i+3)); \ + +#define R4_PRE(i) R4( W_PRE, i) +#define R4_MAIN(i) R4( W_MAIN, i) + + +#define T8( a,b,c,d,e,f,g,h, wx, i) \ + h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ + d += h; \ + h += S0(a) + Maj(a, b, c); \ + +#define R8( wx, i) \ + T8 ( a,b,c,d,e,f,g,h, wx, i ); \ + T8 ( h,a,b,c,d,e,f,g, wx, i+1); \ + T8 ( g,h,a,b,c,d,e,f, wx, i+2); \ + T8 ( f,g,h,a,b,c,d,e, wx, i+3); \ + T8 ( e,f,g,h,a,b,c,d, wx, i+4); \ + T8 ( d,e,f,g,h,a,b,c, wx, i+5); \ + T8 ( c,d,e,f,g,h,a,b, wx, i+6); \ + T8 ( b,c,d,e,f,g,h,a, wx, i+7); \ + +#define R8_PRE(i) R8( W_PRE, i) +#define R8_MAIN(i) R8( W_MAIN, i) + +#endif + + +extern +MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80]; +MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80] = { + U64C(0x428a2f98d728ae22), U64C(0x7137449123ef65cd), U64C(0xb5c0fbcfec4d3b2f), U64C(0xe9b5dba58189dbbc), + U64C(0x3956c25bf348b538), U64C(0x59f111f1b605d019), U64C(0x923f82a4af194f9b), U64C(0xab1c5ed5da6d8118), + U64C(0xd807aa98a3030242), U64C(0x12835b0145706fbe), U64C(0x243185be4ee4b28c), U64C(0x550c7dc3d5ffb4e2), + U64C(0x72be5d74f27b896f), U64C(0x80deb1fe3b1696b1), U64C(0x9bdc06a725c71235), U64C(0xc19bf174cf692694), + U64C(0xe49b69c19ef14ad2), U64C(0xefbe4786384f25e3), U64C(0x0fc19dc68b8cd5b5), U64C(0x240ca1cc77ac9c65), + U64C(0x2de92c6f592b0275), U64C(0x4a7484aa6ea6e483), U64C(0x5cb0a9dcbd41fbd4), U64C(0x76f988da831153b5), + U64C(0x983e5152ee66dfab), U64C(0xa831c66d2db43210), U64C(0xb00327c898fb213f), U64C(0xbf597fc7beef0ee4), + U64C(0xc6e00bf33da88fc2), U64C(0xd5a79147930aa725), U64C(0x06ca6351e003826f), U64C(0x142929670a0e6e70), + U64C(0x27b70a8546d22ffc), U64C(0x2e1b21385c26c926), U64C(0x4d2c6dfc5ac42aed), U64C(0x53380d139d95b3df), + U64C(0x650a73548baf63de), 
U64C(0x766a0abb3c77b2a8), U64C(0x81c2c92e47edaee6), U64C(0x92722c851482353b), + U64C(0xa2bfe8a14cf10364), U64C(0xa81a664bbc423001), U64C(0xc24b8b70d0f89791), U64C(0xc76c51a30654be30), + U64C(0xd192e819d6ef5218), U64C(0xd69906245565a910), U64C(0xf40e35855771202a), U64C(0x106aa07032bbd1b8), + U64C(0x19a4c116b8d2d0c8), U64C(0x1e376c085141ab53), U64C(0x2748774cdf8eeb99), U64C(0x34b0bcb5e19b48a8), + U64C(0x391c0cb3c5c95a63), U64C(0x4ed8aa4ae3418acb), U64C(0x5b9cca4f7763e373), U64C(0x682e6ff3d6b2b8a3), + U64C(0x748f82ee5defb2fc), U64C(0x78a5636f43172f60), U64C(0x84c87814a1f0ab72), U64C(0x8cc702081a6439ec), + U64C(0x90befffa23631e28), U64C(0xa4506cebde82bde9), U64C(0xbef9a3f7b2c67915), U64C(0xc67178f2e372532b), + U64C(0xca273eceea26619c), U64C(0xd186b8c721c0c207), U64C(0xeada7dd6cde0eb1e), U64C(0xf57d4f7fee6ed178), + U64C(0x06f067aa72176fba), U64C(0x0a637dc5a2c898a6), U64C(0x113f9804bef90dae), U64C(0x1b710b35131c471b), + U64C(0x28db77f523047d84), U64C(0x32caab7b40c72493), U64C(0x3c9ebe0a15c9bebc), U64C(0x431d67c49c100d4c), + U64C(0x4cc5d4becb3e42b6), U64C(0x597f299cfc657e2a), U64C(0x5fcb6fab3ad6faec), U64C(0x6c44198c4a475817) +}; + +#define K SHA512_K_ARRAY + +Z7_NO_INLINE +void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks) +{ + UInt64 W +#ifdef Z7_SHA512_BIG_W + [80]; +#else + [16]; +#endif + unsigned j; + UInt64 a,b,c,d,e,f,g,h; +#if !defined(Z7_SHA512_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4) + UInt64 tmp; +#endif + + if (numBlocks == 0) return; + + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + f = state[5]; + g = state[6]; + h = state[7]; + + do + { + + for (j = 0; j < 16; j += STEP_PRE) + { + #if STEP_PRE > 4 + + #if STEP_PRE < 8 + R4_PRE(0); + #else + R8_PRE(0); + #if STEP_PRE == 16 + R8_PRE(8); + #endif + #endif + + #else + + R1_PRE(0) + #if STEP_PRE >= 2 + R1_PRE(1) + #if STEP_PRE >= 4 + R1_PRE(2) + R1_PRE(3) + #endif + #endif + + #endif + } + + for (j = 16; j < 80; j += STEP_MAIN) + { + #if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8 + + #if STEP_MAIN < 8 + R4_MAIN(0) + #else + R8_MAIN(0) + #if STEP_MAIN == 16 + R8_MAIN(8) + #endif + #endif + + #else + + R1_MAIN(0) + #if STEP_MAIN >= 2 + R1_MAIN(1) + #if STEP_MAIN >= 4 + R2_MAIN(2) + #if STEP_MAIN >= 8 + R2_MAIN(4) + R2_MAIN(6) + #if STEP_MAIN >= 16 + R2_MAIN(8) + R2_MAIN(10) + R2_MAIN(12) + R2_MAIN(14) + #endif + #endif + #endif + #endif + #endif + } + + a += state[0]; state[0] = a; + b += state[1]; state[1] = b; + c += state[2]; state[2] = c; + d += state[3]; state[3] = d; + e += state[4]; state[4] = e; + f += state[5]; state[5] = f; + g += state[6]; state[6] = g; + h += state[7]; state[7] = h; + + data += SHA512_BLOCK_SIZE; + } + while (--numBlocks); +} + + +#define Sha512_UpdateBlock(p) SHA512_UPDATE_BLOCKS(p)(p->state, p->buffer, 1) + +void Sha512_Update(CSha512 *p, const Byte *data, size_t size) +{ + if (size == 0) + return; + { + const unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1); + const unsigned num = SHA512_BLOCK_SIZE - pos; + p->v.vars.count += size; + if (num > size) + { + memcpy(p->buffer + pos, data, size); + return; + } + if (pos != 0) + { + size -= num; + memcpy(p->buffer + pos, data, num); + data += num; + Sha512_UpdateBlock(p); + } + } + { + const size_t numBlocks = size >> 7; + // if (numBlocks) + SHA512_UPDATE_BLOCKS(p)(p->state, data, numBlocks); + size &= SHA512_BLOCK_SIZE - 1; + if (size == 0) + return; + data += (numBlocks << 7); + memcpy(p->buffer, data, size); + } +} + + +void Sha512_Final(CSha512 *p, Byte 
*digest, unsigned digestSize) +{ + unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1); + p->buffer[pos++] = 0x80; + if (pos > (SHA512_BLOCK_SIZE - 8 * 2)) + { + while (pos != SHA512_BLOCK_SIZE) { p->buffer[pos++] = 0; } + // memset(&p->buf.buffer[pos], 0, SHA512_BLOCK_SIZE - pos); + Sha512_UpdateBlock(p); + pos = 0; + } + memset(&p->buffer[pos], 0, (SHA512_BLOCK_SIZE - 8 * 2) - pos); + { + const UInt64 numBits = p->v.vars.count << 3; + SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 2, 0) // = (p->v.vars.count >> (64 - 3)); (high 64-bits) + SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 1, numBits) + } + Sha512_UpdateBlock(p); +#if 1 && defined(MY_CPU_BE) + memcpy(digest, p->state, digestSize); +#else + { + const unsigned numWords = digestSize >> 3; + unsigned i; + for (i = 0; i < numWords; i++) + { + const UInt64 v = p->state[i]; + SetBe64(digest, v) + digest += 8; + } + if (digestSize & 4) // digestSize == SHA512_224_DIGEST_SIZE + { + const UInt32 v = (UInt32)((p->state[numWords]) >> 32); + SetBe32(digest, v) + } + } +#endif + Sha512_InitState(p, digestSize); +} + + + + +#if defined(_WIN32) && defined(Z7_COMPILER_SHA512_SUPPORTED) \ + && defined(MY_CPU_ARM64) // we can disable this check to debug in x64 + +#if 1 // 0 for debug + +#include "7zWindows.h" +// #include +#if 0 && defined(MY_CPU_X86_OR_AMD64) +#include // for debug : for __ud2() +#endif + +BoolInt CPU_IsSupported_SHA512(void) +{ +#if defined(MY_CPU_ARM64) + // we have no SHA512 flag for IsProcessorFeaturePresent() still. + if (!CPU_IsSupported_CRYPTO()) + return False; +#endif + // printf("\nCPU_IsSupported_SHA512\n"); + { + // we can't read ID_AA64ISAR0_EL1 register from application. + // but ID_AA64ISAR0_EL1 register is mapped to "CP 4030" registry value. + HKEY key = NULL; + LONG res = RegOpenKeyEx(HKEY_LOCAL_MACHINE, + TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"), + 0, KEY_READ, &key); + if (res != ERROR_SUCCESS) + return False; + { + DWORD type = 0; + DWORD count = sizeof(UInt64); + UInt64 val = 0; + res = RegQueryValueEx(key, TEXT("CP 4030"), NULL, + &type, (LPBYTE)&val, &count); + RegCloseKey(key); + if (res != ERROR_SUCCESS + || type != REG_QWORD + || count != sizeof(UInt64) + || ((unsigned)(val >> 12) & 0xf) != 2) + return False; + // we parse SHA2 field of ID_AA64ISAR0_EL1 register: + // 0 : No SHA2 instructions implemented + // 1 : SHA256 implemented + // 2 : SHA256 and SHA512 implemented + } + } + + +#if 1 // 0 for debug to disable SHA512 PROBE code + +/* +----- SHA512 PROBE ----- + +We suppose that "CP 4030" registry reading is enough. +But we use additional SHA512 PROBE code, because +we can catch exception here, and we don't catch exceptions, +if we call Sha512 functions from main code. + +NOTE: arm64 PROBE code doesn't work, if we call it via Wine in linux-arm64. +The program just stops. +Also x64 version of PROBE code doesn't work, if we run it via Intel SDE emulator +without SHA512 support (-skl switch), +The program stops, and we have message from SDE: + TID 0 SDE-ERROR: Executed instruction not valid for specified chip (SKYLAKE): vsha512msg1 +But we still want to catch that exception instead of process stopping. +Does this PROBE code work in native Windows-arm64 (with/without sha512 hw instructions)? +Are there any ways to fix the problems with arm64-wine and x64-SDE cases? 
+*/ + + // printf("\n========== CPU_IsSupported_SHA512 PROBE ========\n"); + { +#ifdef __clang_major__ + #pragma GCC diagnostic ignored "-Wlanguage-extension-token" +#endif + __try + { +#if 0 // 1 : for debug (reduced version to detect sha512) + const uint64x2_t a = vdupq_n_u64(1); + const uint64x2_t b = vsha512hq_u64(a, a, a); + if ((UInt32)vgetq_lane_u64(b, 0) == 0x11800002) + return True; +#else + MY_ALIGN(16) + UInt64 temp[SHA512_NUM_DIGEST_WORDS + SHA512_NUM_BLOCK_WORDS]; + memset(temp, 0x5a, sizeof(temp)); +#if 0 && defined(MY_CPU_X86_OR_AMD64) + __ud2(); // for debug : that exception is not problem for SDE +#endif +#if 1 + Sha512_UpdateBlocks_HW(temp, + (const Byte *)(const void *)(temp + SHA512_NUM_DIGEST_WORDS), 1); + // printf("\n==== t = %x\n", (UInt32)temp[0]); + if ((UInt32)temp[0] == 0xa33cfdf7) + { + // printf("\n=== PROBE SHA512: SHA512 supported\n"); + return True; + } +#endif +#endif + } + __except (EXCEPTION_EXECUTE_HANDLER) + { + // printf("\n==== CPU_IsSupported_SHA512 EXCEPTION_EXECUTE_HANDLER\n"); + } + } + return False; +#else + // without SHA512 PROBE code + return True; +#endif + +} + +#else + +BoolInt CPU_IsSupported_SHA512(void) +{ + return False; +} + +#endif +#endif // WIN32 arm64 + + +void Sha512Prepare(void) +{ +#ifdef Z7_COMPILER_SHA512_SUPPORTED + SHA512_FUNC_UPDATE_BLOCKS f, f_hw; + f = Sha512_UpdateBlocks; + f_hw = NULL; +#ifdef MY_CPU_X86_OR_AMD64 + if (CPU_IsSupported_SHA512() + && CPU_IsSupported_AVX2() + ) +#else + if (CPU_IsSupported_SHA512()) +#endif + { + // printf("\n========== HW SHA512 ======== \n"); + f = f_hw = Sha512_UpdateBlocks_HW; + } + g_SHA512_FUNC_UPDATE_BLOCKS = f; + g_SHA512_FUNC_UPDATE_BLOCKS_HW = f_hw; +#endif +} + + +#undef K +#undef S0 +#undef S1 +#undef s0 +#undef s1 +#undef Ch +#undef Maj +#undef W_MAIN +#undef W_PRE +#undef w +#undef blk2_main +#undef blk2 +#undef T1 +#undef T4 +#undef T8 +#undef R1_PRE +#undef R1_MAIN +#undef R2_MAIN +#undef R4 +#undef R4_PRE +#undef R4_MAIN +#undef R8 +#undef R8_PRE +#undef R8_MAIN +#undef STEP_PRE +#undef STEP_MAIN +#undef Z7_SHA512_BIG_W +#undef Z7_SHA512_UNROLL +#undef Z7_COMPILER_SHA512_SUPPORTED diff --git a/C/Sha512.h b/C/Sha512.h new file mode 100644 index 0000000..1f3a4d1 --- /dev/null +++ b/C/Sha512.h @@ -0,0 +1,86 @@ +/* Sha512.h -- SHA-512 Hash +: Igor Pavlov : Public domain */ + +#ifndef ZIP7_INC_SHA512_H +#define ZIP7_INC_SHA512_H + +#include "7zTypes.h" + +EXTERN_C_BEGIN + +#define SHA512_NUM_BLOCK_WORDS 16 +#define SHA512_NUM_DIGEST_WORDS 8 + +#define SHA512_BLOCK_SIZE (SHA512_NUM_BLOCK_WORDS * 8) +#define SHA512_DIGEST_SIZE (SHA512_NUM_DIGEST_WORDS * 8) +#define SHA512_224_DIGEST_SIZE (224 / 8) +#define SHA512_256_DIGEST_SIZE (256 / 8) +#define SHA512_384_DIGEST_SIZE (384 / 8) + +typedef void (Z7_FASTCALL *SHA512_FUNC_UPDATE_BLOCKS)(UInt64 state[8], const Byte *data, size_t numBlocks); + +/* + if (the system supports different SHA512 code implementations) + { + (CSha512::func_UpdateBlocks) will be used + (CSha512::func_UpdateBlocks) can be set by + Sha512_Init() - to default (fastest) + Sha512_SetFunction() - to any algo + } + else + { + (CSha512::func_UpdateBlocks) is ignored. 
+ } +*/ + +typedef struct +{ + union + { + struct + { + SHA512_FUNC_UPDATE_BLOCKS func_UpdateBlocks; + UInt64 count; + } vars; + UInt64 _pad_64bit[8]; + void *_pad_align_ptr[2]; + } v; + UInt64 state[SHA512_NUM_DIGEST_WORDS]; + + Byte buffer[SHA512_BLOCK_SIZE]; +} CSha512; + + +#define SHA512_ALGO_DEFAULT 0 +#define SHA512_ALGO_SW 1 +#define SHA512_ALGO_HW 2 + +/* +Sha512_SetFunction() +return: + 0 - (algo) value is not supported, and func_UpdateBlocks was not changed + 1 - func_UpdateBlocks was set according (algo) value. +*/ + +BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo); +// we support only these (digestSize) values: 224/8, 256/8, 384/8, 512/8 +void Sha512_InitState(CSha512 *p, unsigned digestSize); +void Sha512_Init(CSha512 *p, unsigned digestSize); +void Sha512_Update(CSha512 *p, const Byte *data, size_t size); +void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize); + + + + +// void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks); + +/* +call Sha512Prepare() once at program start. +It prepares all supported implementations, and detects the fastest implementation. +*/ + +void Sha512Prepare(void); + +EXTERN_C_END + +#endif diff --git a/C/Sha512Opt.c b/C/Sha512Opt.c new file mode 100644 index 0000000..3a13868 --- /dev/null +++ b/C/Sha512Opt.c @@ -0,0 +1,395 @@ +/* Sha512Opt.c -- SHA-512 optimized code for SHA-512 hardware instructions +: Igor Pavlov : Public domain */ + +#include "Precomp.h" +#include "Compiler.h" +#include "CpuArch.h" + +// #define Z7_USE_HW_SHA_STUB // for debug +#ifdef MY_CPU_X86_OR_AMD64 + #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) // fix it + #define USE_HW_SHA + #elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \ + || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000) + #define USE_HW_SHA + #if !defined(__INTEL_COMPILER) + // icc defines __GNUC__, but icc doesn't support __attribute__(__target__) + #if !defined(__SHA512__) || !defined(__AVX2__) + #define ATTRIB_SHA512 __attribute__((__target__("sha512,avx2"))) + #endif + #endif + #elif defined(Z7_MSC_VER_ORIGINAL) + #if (_MSC_VER >= 1940) + #define USE_HW_SHA + #else + // #define Z7_USE_HW_SHA_STUB + #endif + #endif +// #endif // MY_CPU_X86_OR_AMD64 +#ifndef USE_HW_SHA + // #define Z7_USE_HW_SHA_STUB // for debug +#endif + +#ifdef USE_HW_SHA + +// #pragma message("Sha512 HW") + +#include + +#if defined (__clang__) && defined(_MSC_VER) + #if !defined(__AVX__) + #include + #endif + #if !defined(__AVX2__) + #include + #endif + #if !defined(__SHA512__) + #include + #endif +#else + +#endif + +/* +SHA512 uses: +AVX: + _mm256_loadu_si256 (vmovdqu) + _mm256_storeu_si256 + _mm256_set_epi32 (unused) +AVX2: + _mm256_add_epi64 : vpaddq + _mm256_shuffle_epi8 : vpshufb + _mm256_shuffle_epi32 : pshufd + _mm256_blend_epi32 : vpblendd + _mm256_permute4x64_epi64 : vpermq : 3c + _mm256_permute2x128_si256: vperm2i128 : 3c + _mm256_extracti128_si256 : vextracti128 : 3c +SHA512: + _mm256_sha512* +*/ + +// K array must be aligned for 32-bytes at least. 
+// The compiler can look align attribute and selects +// vmovdqu - for code without align attribute +// vmovdqa - for code with align attribute +extern +MY_ALIGN(64) +const UInt64 SHA512_K_ARRAY[80]; +#define K SHA512_K_ARRAY + + +#define ADD_EPI64(dest, src) dest = _mm256_add_epi64(dest, src); +#define SHA512_MSG1(dest, src) dest = _mm256_sha512msg1_epi64(dest, _mm256_extracti128_si256(src, 0)); +#define SHA512_MSG2(dest, src) dest = _mm256_sha512msg2_epi64(dest, src); + +#define LOAD_SHUFFLE(m, k) \ + m = _mm256_loadu_si256((const __m256i *)(const void *)(data + (k) * 32)); \ + m = _mm256_shuffle_epi8(m, mask); \ + +#define NNN(m0, m1, m2, m3) + +#define SM1(m1, m2, m3, m0) \ + SHA512_MSG1(m0, m1); \ + +#define SM2(m2, m3, m0, m1) \ + ADD_EPI64(m0, _mm256_permute4x64_epi64(_mm256_blend_epi32(m2, m3, 3), 0x39)); \ + SHA512_MSG2(m0, m3); \ + +#define RND2(t0, t1, lane) \ + t0 = _mm256_sha512rnds2_epi64(t0, t1, _mm256_extracti128_si256(msg, lane)); + + + +#define R4(k, m0, m1, m2, m3, OP0, OP1) \ + msg = _mm256_add_epi64(m0, *(const __m256i *) (const void *) &K[(k) * 4]); \ + RND2(state0, state1, 0); OP0(m0, m1, m2, m3) \ + RND2(state1, state0, 1); OP1(m0, m1, m2, m3) \ + + + + +#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ + R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \ + R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \ + R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \ + R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \ + +#define PREPARE_STATE \ + state0 = _mm256_shuffle_epi32(state0, 0x4e); /* cdab */ \ + state1 = _mm256_shuffle_epi32(state1, 0x4e); /* ghef */ \ + tmp = state0; \ + state0 = _mm256_permute2x128_si256(state0, state1, 0x13); /* cdgh */ \ + state1 = _mm256_permute2x128_si256(tmp, state1, 2); /* abef */ \ + + +void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks); +#ifdef ATTRIB_SHA512 +ATTRIB_SHA512 +#endif +void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks) +{ + const __m256i mask = _mm256_set_epi32( + 0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607, + 0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607); + __m256i tmp, state0, state1; + + if (numBlocks == 0) + return; + + state0 = _mm256_loadu_si256((const __m256i *) (const void *) &state[0]); + state1 = _mm256_loadu_si256((const __m256i *) (const void *) &state[4]); + + PREPARE_STATE + + do + { + __m256i state0_save, state1_save; + __m256i m0, m1, m2, m3; + __m256i msg; + // #define msg tmp + + state0_save = state0; + state1_save = state1; + + LOAD_SHUFFLE (m0, 0) + LOAD_SHUFFLE (m1, 1) + LOAD_SHUFFLE (m2, 2) + LOAD_SHUFFLE (m3, 3) + + + + R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ) + R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) + R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) + R16 ( 3, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) + R16 ( 4, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ) + ADD_EPI64(state0, state0_save) + ADD_EPI64(state1, state1_save) + + data += 128; + } + while (--numBlocks); + + PREPARE_STATE + + _mm256_storeu_si256((__m256i *) (void *) &state[0], state0); + _mm256_storeu_si256((__m256i *) (void *) &state[4], state1); +} + +#endif // USE_HW_SHA + +// gcc 8.5 also supports sha512, but we need also support in assembler that is called by gcc +#elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE) + + #if defined(__ARM_FEATURE_SHA512) + #define USE_HW_SHA + #else + #if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \ + || defined(__GNUC__) && (__GNUC__ >= 9) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it + #define 
USE_HW_SHA + #endif + #endif + +#ifdef USE_HW_SHA + +// #pragma message("=== Sha512 HW === ") + + +#if defined(__clang__) || defined(__GNUC__) +#if !defined(__ARM_FEATURE_SHA512) +// #pragma message("=== we define SHA3 ATTRIB_SHA512 === ") +#if defined(__clang__) + #define ATTRIB_SHA512 __attribute__((__target__("sha3"))) // "armv8.2-a,sha3" +#else + #define ATTRIB_SHA512 __attribute__((__target__("arch=armv8.2-a+sha3"))) +#endif +#endif +#endif + + +#if defined(Z7_MSC_VER_ORIGINAL) +#include +#else + +#if defined(__clang__) && __clang_major__ < 16 +#if !defined(__ARM_FEATURE_SHA512) +// #pragma message("=== we set __ARM_FEATURE_SHA512 1 === ") + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #define Z7_ARM_FEATURE_SHA512_WAS_SET 1 + #define __ARM_FEATURE_SHA512 1 + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif +#endif // clang + +#include + +#if defined(Z7_ARM_FEATURE_SHA512_WAS_SET) && \ + defined(__ARM_FEATURE_SHA512) + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #undef __ARM_FEATURE_SHA512 + #undef Z7_ARM_FEATURE_SHA512_WAS_SET + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") +#endif + +#endif // Z7_MSC_VER_ORIGINAL + +typedef uint64x2_t v128_64; +// typedef __n128 v128_64; // MSVC + +#ifdef MY_CPU_BE + #define MY_rev64_for_LE(x) x +#else + #define MY_rev64_for_LE(x) vrev64q_u8(x) +#endif + +#define LOAD_128_64(_p) vld1q_u64(_p) +#define LOAD_128_8(_p) vld1q_u8 (_p) +#define STORE_128_64(_p, _v) vst1q_u64(_p, _v) + +#define LOAD_SHUFFLE(m, k) \ + m = vreinterpretq_u64_u8( \ + MY_rev64_for_LE( \ + LOAD_128_8(data + (k) * 16))); \ + +// K array must be aligned for 16-bytes at least. +extern +MY_ALIGN(64) +const UInt64 SHA512_K_ARRAY[80]; +#define K SHA512_K_ARRAY + +#define NN(m0, m1, m4, m5, m7) +#define SM(m0, m1, m4, m5, m7) \ + m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1)); + +#define R2(k, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP) \ + OP(m0, m1, m4, m5, m7) \ + t = vaddq_u64(m0, vld1q_u64(k)); \ + t = vaddq_u64(vextq_u64(t, t, 1), a3); \ + t = vsha512hq_u64(t, vextq_u64(a2, a3, 1), vextq_u64(a1, a2, 1)); \ + a3 = vsha512h2q_u64(t, a1, a0); \ + a1 = vaddq_u64(a1, t); \ + +#define R8(k, m0,m1,m2,m3,m4,m5,m6,m7, OP) \ + R2 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP ) \ + R2 ( (k)+1*2, m1,m2,m3,m4,m5,m6,m7,m0, a3,a0,a1,a2, OP ) \ + R2 ( (k)+2*2, m2,m3,m4,m5,m6,m7,m0,m1, a2,a3,a0,a1, OP ) \ + R2 ( (k)+3*2, m3,m4,m5,m6,m7,m0,m1,m2, a1,a2,a3,a0, OP ) \ + +#define R16(k, OP) \ + R8 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, OP ) \ + R8 ( (k)+4*2, m4,m5,m6,m7,m0,m1,m2,m3, OP ) \ + + +void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks); +#ifdef ATTRIB_SHA512 +ATTRIB_SHA512 +#endif +void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks) +{ + v128_64 a0, a1, a2, a3; + + if (numBlocks == 0) + return; + a0 = LOAD_128_64(&state[0]); + a1 = LOAD_128_64(&state[2]); + a2 = LOAD_128_64(&state[4]); + a3 = LOAD_128_64(&state[6]); + do + { + v128_64 a0_save, a1_save, a2_save, a3_save; + v128_64 m0, m1, m2, m3, m4, m5, m6, m7; + v128_64 t; + unsigned i; + const UInt64 *k_ptr; + + LOAD_SHUFFLE (m0, 0) + LOAD_SHUFFLE (m1, 1) + LOAD_SHUFFLE (m2, 2) + LOAD_SHUFFLE (m3, 3) + LOAD_SHUFFLE (m4, 4) + LOAD_SHUFFLE (m5, 5) + LOAD_SHUFFLE (m6, 6) + LOAD_SHUFFLE (m7, 7) + + a0_save = a0; + a1_save = a1; + a2_save = a2; + a3_save = a3; + + R16 ( K, NN ) + k_ptr = K + 16; + for (i = 0; i < 4; i++) + { + R16 ( 
k_ptr, SM ) + k_ptr += 16; + } + + a0 = vaddq_u64(a0, a0_save); + a1 = vaddq_u64(a1, a1_save); + a2 = vaddq_u64(a2, a2_save); + a3 = vaddq_u64(a3, a3_save); + + data += 128; + } + while (--numBlocks); + + STORE_128_64(&state[0], a0); + STORE_128_64(&state[2], a1); + STORE_128_64(&state[4], a2); + STORE_128_64(&state[6], a3); +} + +#endif // USE_HW_SHA + +#endif // MY_CPU_ARM_OR_ARM64 + + +#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) +// #error Stop_Compiling_UNSUPPORTED_SHA +// #include +// We can compile this file with another C compiler, +// or we can compile asm version. +// So we can generate real code instead of this stub function. +// #include "Sha512.h" +// #if defined(_MSC_VER) +#pragma message("Sha512 HW-SW stub was used") +// #endif +void Z7_FASTCALL Sha512_UpdateBlocks (UInt64 state[8], const Byte *data, size_t numBlocks); +void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks); +void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks) +{ + Sha512_UpdateBlocks(state, data, numBlocks); + /* + UNUSED_VAR(state); + UNUSED_VAR(data); + UNUSED_VAR(numBlocks); + exit(1); + return; + */ +} +#endif + + +#undef K +#undef RND2 +#undef MY_rev64_for_LE +#undef NN +#undef NNN +#undef LOAD_128 +#undef STORE_128 +#undef LOAD_SHUFFLE +#undef SM1 +#undef SM2 +#undef SM +#undef R2 +#undef R4 +#undef R16 +#undef PREPARE_STATE +#undef USE_HW_SHA +#undef ATTRIB_SHA512 +#undef USE_VER_MIN +#undef Z7_USE_HW_SHA_STUB -- cgit v1.2.3-55-g6feb
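
Usage sketch (illustrative only, not part of the commit above): a minimal caller for the new streaming APIs declared in C/Sha512.h and C/Sha3.h. The small main() driver and the msg buffer below are hypothetical; the function names, digest-size arguments (in bytes), the Sha512Prepare()-once-at-startup rule, and the SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE relation are taken from the headers shown in the diff. It assumes Sha512.c, Sha512Opt.c and Sha3.c are built into the same program and that the C/ directory is on the include path.

#include <stdio.h>

#include "Sha512.h"
#include "Sha3.h"

int main(void)
{
  static const Byte msg[3] = { 'a', 'b', 'c' };
  unsigned i;

  /* SHA-512: Sha512Prepare() detects and selects the fastest UpdateBlocks implementation,
     called once at program start */
  {
    CSha512 ctx;
    Byte digest[SHA512_DIGEST_SIZE];
    Sha512Prepare();
    Sha512_Init(&ctx, SHA512_DIGEST_SIZE);   /* also accepts 224/8, 256/8, 384/8 */
    Sha512_Update(&ctx, msg, sizeof(msg));
    Sha512_Final(&ctx, digest, SHA512_DIGEST_SIZE);
    for (i = 0; i < SHA512_DIGEST_SIZE; i++)
      printf("%02x", digest[i]);
    printf("\n");
  }

  /* SHA3-256: the caller sets the block (rate) size derived from the digest size before hashing */
  {
    CSha3 ctx;
    Byte digest[32];
    Sha3_SET_blockSize(&ctx, SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(32));  /* 25*8 - 2*32 = 136 bytes */
    Sha3_Init(&ctx);
    Sha3_Update(&ctx, msg, sizeof(msg));
    Sha3_Final(&ctx, digest, 32, 0);   /* last arg: 0 = SHA-3 padding, nonzero = SHAKE padding */
    for (i = 0; i < 32; i++)
      printf("%02x", digest[i]);
    printf("\n");
  }
  return 0;
}

Per the comments in Sha512.h, Sha512_SetFunction(&ctx, SHA512_ALGO_SW or SHA512_ALGO_HW) may additionally be called after Sha512_Init() to force a specific implementation; it returns 0 (False) and leaves func_UpdateBlocks unchanged when the requested algo is unavailable.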