From f19f813537c7aea1c20749c914e756b54a9c3cf5 Mon Sep 17 00:00:00 2001 From: Igor Pavlov <87184205+ip7z@users.noreply.github.com> Date: Mon, 27 Dec 2021 00:00:00 +0000 Subject: '21.07' --- C/Sha1Opt.c | 373 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 373 insertions(+) create mode 100644 C/Sha1Opt.c (limited to 'C/Sha1Opt.c') diff --git a/C/Sha1Opt.c b/C/Sha1Opt.c new file mode 100644 index 0000000..63132da --- /dev/null +++ b/C/Sha1Opt.c @@ -0,0 +1,373 @@ +/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions +2021-04-01 : Igor Pavlov : Public domain */ + +#include "Precomp.h" + +#if defined(_MSC_VER) +#if (_MSC_VER < 1900) && (_MSC_VER >= 1200) +// #define USE_MY_MM +#endif +#endif + +#include "CpuArch.h" + +#ifdef MY_CPU_X86_OR_AMD64 + #if defined(__clang__) + #if (__clang_major__ >= 8) // fix that check + #define USE_HW_SHA + #ifndef __SHA__ + #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) + #if defined(_MSC_VER) + // SSSE3: for clang-cl: + #include + #define __SHA__ + #endif + #endif + #pragma clang diagnostic ignored "-Wvector-conversion" + #endif + #elif defined(__GNUC__) + #if (__GNUC__ >= 8) // fix that check + #define USE_HW_SHA + #ifndef __SHA__ + #define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) + // #pragma GCC target("sha,ssse3") + #endif + #endif + #elif defined(__INTEL_COMPILER) + #if (__INTEL_COMPILER >= 1800) // fix that check + #define USE_HW_SHA + #endif + #elif defined(_MSC_VER) + #ifdef USE_MY_MM + #define USE_VER_MIN 1300 + #else + #define USE_VER_MIN 1910 + #endif + #if _MSC_VER >= USE_VER_MIN + #define USE_HW_SHA + #endif + #endif +// #endif // MY_CPU_X86_OR_AMD64 + +#ifdef USE_HW_SHA + +// #pragma message("Sha1 HW") +// #include + +#if !defined(_MSC_VER) || (_MSC_VER >= 1900) +#include +#else +#include + +#if defined(_MSC_VER) && (_MSC_VER >= 1600) +// #include +#endif + +#ifdef USE_MY_MM +#include "My_mm.h" +#endif + +#endif + +/* +SHA1 uses: +SSE2: + _mm_loadu_si128 + _mm_storeu_si128 + _mm_set_epi32 + _mm_add_epi32 + _mm_shuffle_epi32 / pshufd + _mm_xor_si128 + _mm_cvtsi128_si32 + _mm_cvtsi32_si128 +SSSE3: + _mm_shuffle_epi8 / pshufb + +SHA: + _mm_sha1* +*/ + +#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); +#define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src); +#define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask); +#define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask); + +#define SHA1_RND4(abcd, e0, f) abcd = _mm_sha1rnds4_epu32(abcd, e0, f); +#define SHA1_NEXTE(e, m) e = _mm_sha1nexte_epu32(e, m); + + + + + +#define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src); +#define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src); + + +#define LOAD_SHUFFLE(m, k) \ + m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ + SHUFFLE_EPI8(m, mask); \ + +#define SM1(m0, m1, m2, m3) \ + SHA1_MSG1(m0, m1); \ + +#define SM2(m0, m1, m2, m3) \ + XOR_SI128(m3, m1); \ + SHA1_MSG2(m3, m2); \ + +#define SM3(m0, m1, m2, m3) \ + XOR_SI128(m3, m1); \ + SM1(m0, m1, m2, m3) \ + SHA1_MSG2(m3, m2); \ + +#define NNN(m0, m1, m2, m3) + + + + + + + + + + + + + + + + + +#define R4(k, e0, e1, m0, m1, m2, m3, OP) \ + e1 = abcd; \ + SHA1_RND4(abcd, e0, (k) / 5); \ + SHA1_NEXTE(e1, m1); \ + OP(m0, m1, m2, m3); \ + +#define R16(k, mx, OP0, OP1, OP2, OP3) \ + R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \ + R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \ + R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \ + R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 ) \ + +#define PREPARE_STATE \ + SHUFFLE_EPI32 (abcd, 0x1B); \ + SHUFFLE_EPI32 (e0, 0x1B); \ + + + + + +void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks); +#ifdef ATTRIB_SHA +ATTRIB_SHA +#endif +void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks) +{ + const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); + + __m128i abcd, e0; + + if (numBlocks == 0) + return; + + abcd = _mm_loadu_si128((const __m128i *) (const void *) &state[0]); // dbca + e0 = _mm_cvtsi32_si128((int)state[4]); // 000e + + PREPARE_STATE + + do + { + __m128i abcd_save, e2; + __m128i m0, m1, m2, m3; + __m128i e1; + + + abcd_save = abcd; + e2 = e0; + + LOAD_SHUFFLE (m0, 0) + LOAD_SHUFFLE (m1, 1) + LOAD_SHUFFLE (m2, 2) + LOAD_SHUFFLE (m3, 3) + + ADD_EPI32(e0, m0); + + R16 ( 0, m0, SM1, SM3, SM3, SM3 ); + R16 ( 1, m0, SM3, SM3, SM3, SM3 ); + R16 ( 2, m0, SM3, SM3, SM3, SM3 ); + R16 ( 3, m0, SM3, SM3, SM3, SM3 ); + R16 ( 4, e2, SM2, NNN, NNN, NNN ); + + ADD_EPI32(abcd, abcd_save); + + data += 64; + } + while (--numBlocks); + + PREPARE_STATE + + _mm_storeu_si128((__m128i *) (void *) state, abcd); + *(state+4) = (UInt32)_mm_cvtsi128_si32(e0); +} + +#endif // USE_HW_SHA + +#elif defined(MY_CPU_ARM_OR_ARM64) + + #if defined(__clang__) + #if (__clang_major__ >= 8) // fix that check + #define USE_HW_SHA + #endif + #elif defined(__GNUC__) + #if (__GNUC__ >= 6) // fix that check + #define USE_HW_SHA + #endif + #elif defined(_MSC_VER) + #if _MSC_VER >= 1910 + #define USE_HW_SHA + #endif + #endif + +#ifdef USE_HW_SHA + +// #pragma message("=== Sha1 HW === ") + +#if defined(__clang__) || defined(__GNUC__) + #ifdef MY_CPU_ARM64 + #define ATTRIB_SHA __attribute__((__target__("+crypto"))) + #else + #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) + #endif +#else + // _MSC_VER + // for arm32 + #define _ARM_USE_NEW_NEON_INTRINSICS +#endif + +#if defined(_MSC_VER) && defined(MY_CPU_ARM64) +#include +#else +#include +#endif + +typedef uint32x4_t v128; +// typedef __n128 v128; // MSVC + +#ifdef MY_CPU_BE + #define MY_rev32_for_LE(x) +#else + #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) +#endif + +#define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) +#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) + +#define LOAD_SHUFFLE(m, k) \ + m = LOAD_128((data + (k) * 16)); \ + MY_rev32_for_LE(m); \ + +#define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3); +#define SU1(dest, src) dest = vsha1su1q_u32(dest, src); +#define C(e) abcd = vsha1cq_u32(abcd, e, t); +#define P(e) abcd = vsha1pq_u32(abcd, e, t); +#define M(e) abcd = vsha1mq_u32(abcd, e, t); +#define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) +#define T(m, c) t = vaddq_u32(m, c) + +void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); +#ifdef ATTRIB_SHA +ATTRIB_SHA +#endif +void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) +{ + v128 abcd; + v128 c0, c1, c2, c3; + uint32_t e0; + + if (numBlocks == 0) + return; + + c0 = vdupq_n_u32(0x5a827999); + c1 = vdupq_n_u32(0x6ed9eba1); + c2 = vdupq_n_u32(0x8f1bbcdc); + c3 = vdupq_n_u32(0xca62c1d6); + + abcd = LOAD_128(&state[0]); + e0 = state[4]; + + do + { + v128 abcd_save; + v128 m0, m1, m2, m3; + v128 t; + uint32_t e0_save, e1; + + abcd_save = abcd; + e0_save = e0; + + LOAD_SHUFFLE (m0, 0) + LOAD_SHUFFLE (m1, 1) + LOAD_SHUFFLE (m2, 2) + LOAD_SHUFFLE (m3, 3) + + T(m0, c0); H(e1); C(e0); + T(m1, c0); SU0(m0, m1, m2); H(e0); C(e1); + T(m2, c0); SU0(m1, m2, m3); SU1(m0, m3); H(e1); C(e0); + T(m3, c0); SU0(m2, m3, m0); SU1(m1, m0); H(e0); C(e1); + T(m0, c0); SU0(m3, m0, m1); SU1(m2, m1); H(e1); C(e0); + T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1); + T(m2, c1); SU0(m1, m2, m3); SU1(m0, m3); H(e1); P(e0); + T(m3, c1); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1); + T(m0, c1); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0); + T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1); + T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0); + T(m3, c2); SU0(m2, m3, m0); SU1(m1, m0); H(e0); M(e1); + T(m0, c2); SU0(m3, m0, m1); SU1(m2, m1); H(e1); M(e0); + T(m1, c2); SU0(m0, m1, m2); SU1(m3, m2); H(e0); M(e1); + T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0); + T(m3, c3); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1); + T(m0, c3); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0); + T(m1, c3); SU1(m3, m2); H(e0); P(e1); + T(m2, c3); H(e1); P(e0); + T(m3, c3); H(e0); P(e1); + + abcd = vaddq_u32(abcd, abcd_save); + e0 += e0_save; + + data += 64; + } + while (--numBlocks); + + STORE_128(&state[0], abcd); + state[4] = e0; +} + +#endif // USE_HW_SHA + +#endif // MY_CPU_ARM_OR_ARM64 + + +#ifndef USE_HW_SHA + +// #error Stop_Compiling_UNSUPPORTED_SHA +// #include + +// #include "Sha1.h" +void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks); + +#pragma message("Sha1 HW-SW stub was used") + +void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks); +void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks) +{ + Sha1_UpdateBlocks(state, data, numBlocks); + /* + UNUSED_VAR(state); + UNUSED_VAR(data); + UNUSED_VAR(numBlocks); + exit(1); + return; + */ +} + +#endif -- cgit v1.2.3-55-g6feb