/* Blake2s.c -- BLAKE2sp Hash
2024-01-29 : Igor Pavlov : Public domain
2015-2019 : Samuel Neves : original code : CC0 1.0 Universal (CC0 1.0). */

#include "Precomp.h"

// #include <stdio.h>
#include <string.h>

#include "Blake2.h"
#include "RotateDefs.h"
#include "Compiler.h"
#include "CpuArch.h"

#if defined(__SSE2__)
    #define Z7_BLAKE2S_USE_VECTORS
#elif defined(MY_CPU_X86_OR_AMD64)
  #if defined(_MSC_VER) && _MSC_VER > 1200 \
     || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
     || defined(__clang__) \
     || defined(__INTEL_COMPILER)
    #define Z7_BLAKE2S_USE_VECTORS
  #endif
#endif

#ifdef Z7_BLAKE2S_USE_VECTORS
#define Z7_BLAKE2SP_USE_FUNCTIONS
// define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED, if CBlake2sp can be unaligned for 32 bytes.
// #define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED

// SSSE3 : for _mm_shuffle_epi8 (pshufb) that improves performance by 5-15%.
#if defined(__SSSE3__)
    #define Z7_BLAKE2S_USE_SSSE3
#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
   || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
   || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
   || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
   || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
    #define Z7_BLAKE2S_USE_SSSE3
#endif

#ifdef Z7_BLAKE2S_USE_SSSE3
/* SSE41 : for _mm_insert_epi32 (pinsrd)
   it can slightly reduce code size and improve performance in some cases.
   it's used only for last 512-1024 bytes, if FAST versions (2 or 3) of vector algos are used.
   it can be used for all blocks in another algos (4+).
*/
#if defined(__SSE4_1__)
    #define Z7_BLAKE2S_USE_SSE41
#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
   || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
   || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
   || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
   || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
    #define Z7_BLAKE2S_USE_SSE41
#endif
#endif // SSSE3

#if defined(__GNUC__) || defined(__clang__)
  #if defined(Z7_BLAKE2S_USE_SSE41)
    #define BLAKE2S_ATTRIB_128BIT  __attribute__((__target__("sse4.1")))
  #elif defined(Z7_BLAKE2S_USE_SSSE3)
    #define BLAKE2S_ATTRIB_128BIT  __attribute__((__target__("ssse3")))
  #else
    #define BLAKE2S_ATTRIB_128BIT  __attribute__((__target__("sse2")))
  #endif
#endif

#if defined(__AVX2__)
  #define Z7_BLAKE2S_USE_AVX2
#else
  #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
     || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
     || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100)
    #define Z7_BLAKE2S_USE_AVX2
    #ifdef Z7_BLAKE2S_USE_AVX2
      #define BLAKE2S_ATTRIB_AVX2  __attribute__((__target__("avx2")))
    #endif
  #elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
     || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
    #if (Z7_MSC_VER_ORIGINAL == 1900)
      #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
    #endif
    #define Z7_BLAKE2S_USE_AVX2
  #endif
#endif

#ifdef Z7_BLAKE2S_USE_SSE41
#include <smmintrin.h> // SSE4.1
#elif defined(Z7_BLAKE2S_USE_SSSE3)
#include <tmmintrin.h> // SSSE3
#else
#include <emmintrin.h> // SSE2
#endif

#ifdef Z7_BLAKE2S_USE_AVX2
#include <immintrin.h>
#if defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#endif
#endif // avx2

#if defined(__AVX512F__) && defined(__AVX512VL__) // && defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL > 1930)
  #define Z7_BLAKE2S_USE_AVX512_ALWAYS
  // #pragma message ("=== Blake2s AVX512")
#endif

#define Z7_BLAKE2S_USE_V128_FAST
// for speed optimization for small messages:
// #define Z7_BLAKE2S_USE_V128_WAY2
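/* Illustrative addition (not part of the original sources): a quick way to see, at
   build time, which of the vector paths selected above is actually enabled.
   It reuses the "#pragma message" debugging idea already present (commented out)
   in the AVX512 block above; this point is inside the Z7_BLAKE2S_USE_VECTORS region. */
#if 0
  #ifdef Z7_BLAKE2S_USE_AVX2
    #pragma message("Blake2s: AVX2 code path enabled")
  #else
    #pragma message("Blake2s: 128-bit (SSE2/SSSE3/SSE4.1) code path enabled")
  #endif
#endif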
#ifdef Z7_BLAKE2S_USE_AVX2

// for debug:
// gather is slow
// #define Z7_BLAKE2S_USE_GATHER

#define Z7_BLAKE2S_USE_AVX2_FAST
// for speed optimization for small messages:
// #define Z7_BLAKE2S_USE_AVX2_WAY2
// #define Z7_BLAKE2S_USE_AVX2_WAY4
#if defined(Z7_BLAKE2S_USE_AVX2_WAY2) || \
    defined(Z7_BLAKE2S_USE_AVX2_WAY4)
  #define Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#endif
#endif

  #define Z7_BLAKE2SP_ALGO_DEFAULT    0
  #define Z7_BLAKE2SP_ALGO_SCALAR     1
#ifdef Z7_BLAKE2S_USE_V128_FAST
  #define Z7_BLAKE2SP_ALGO_V128_FAST  2
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_FAST
  #define Z7_BLAKE2SP_ALGO_V256_FAST  3
#endif
  #define Z7_BLAKE2SP_ALGO_V128_WAY1  4
#ifdef Z7_BLAKE2S_USE_V128_WAY2
  #define Z7_BLAKE2SP_ALGO_V128_WAY2  5
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
  #define Z7_BLAKE2SP_ALGO_V256_WAY2  6
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
  #define Z7_BLAKE2SP_ALGO_V256_WAY4  7
#endif

#endif // Z7_BLAKE2S_USE_VECTORS


#define BLAKE2S_FINAL_FLAG  (~(UInt32)0)
#define NSW                 Z7_BLAKE2SP_NUM_STRUCT_WORDS
#define SUPER_BLOCK_SIZE    (Z7_BLAKE2S_BLOCK_SIZE * Z7_BLAKE2SP_PARALLEL_DEGREE)
#define SUPER_BLOCK_MASK    (SUPER_BLOCK_SIZE - 1)

#define V_INDEX_0_0   0
#define V_INDEX_1_0   1
#define V_INDEX_2_0   2
#define V_INDEX_3_0   3
#define V_INDEX_0_1   4
#define V_INDEX_1_1   5
#define V_INDEX_2_1   6
#define V_INDEX_3_1   7
#define V_INDEX_0_2   8
#define V_INDEX_1_2   9
#define V_INDEX_2_2  10
#define V_INDEX_3_2  11
#define V_INDEX_0_3  12
#define V_INDEX_1_3  13
#define V_INDEX_2_3  14
#define V_INDEX_3_3  15

#define V_INDEX_4_0   0
#define V_INDEX_5_0   1
#define V_INDEX_6_0   2
#define V_INDEX_7_0   3
#define V_INDEX_7_1   4
#define V_INDEX_4_1   5
#define V_INDEX_5_1   6
#define V_INDEX_6_1   7
#define V_INDEX_6_2   8
#define V_INDEX_7_2   9
#define V_INDEX_4_2  10
#define V_INDEX_5_2  11
#define V_INDEX_5_3  12
#define V_INDEX_6_3  13
#define V_INDEX_7_3  14
#define V_INDEX_4_3  15

#define V(row, col)  v[V_INDEX_ ## row ## _ ## col]

#define k_Blake2s_IV_0  0x6A09E667UL
#define k_Blake2s_IV_1  0xBB67AE85UL
#define k_Blake2s_IV_2  0x3C6EF372UL
#define k_Blake2s_IV_3  0xA54FF53AUL
#define k_Blake2s_IV_4  0x510E527FUL
#define k_Blake2s_IV_5  0x9B05688CUL
#define k_Blake2s_IV_6  0x1F83D9ABUL
#define k_Blake2s_IV_7  0x5BE0CD19UL

#define KIV(n)  (k_Blake2s_IV_## n)

#ifdef Z7_BLAKE2S_USE_VECTORS
MY_ALIGN(16)
static const UInt32 k_Blake2s_IV[8] =
{
  KIV(0), KIV(1), KIV(2), KIV(3), KIV(4), KIV(5), KIV(6), KIV(7)
};
#endif

#define STATE_T(s)  ((s) + 8)
#define STATE_F(s)  ((s) + 10)

#ifdef Z7_BLAKE2S_USE_VECTORS

#define LOAD_128(p)   _mm_load_si128 ((const __m128i *)(const void *)(p))
#define LOADU_128(p)  _mm_loadu_si128((const __m128i *)(const void *)(p))
#ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
  // here we use unaligned load and stores
  // use this branch if CBlake2sp can be unaligned for 16 bytes
  #define STOREU_128(p, r)  _mm_storeu_si128((__m128i *)(void *)(p), r)
  #define LOAD_128_FROM_STRUCT(p)     LOADU_128(p)
  #define STORE_128_TO_STRUCT(p, r)   STOREU_128(p, r)
#else
  // here we use aligned load and stores
  // use this branch if CBlake2sp is aligned for 16 bytes
  #define STORE_128(p, r)  _mm_store_si128((__m128i *)(void *)(p), r)
  #define LOAD_128_FROM_STRUCT(p)     LOAD_128(p)
  #define STORE_128_TO_STRUCT(p, r)   STORE_128(p, r)
#endif

#endif // Z7_BLAKE2S_USE_VECTORS


#if 0
static void PrintState(const UInt32 *s, unsigned num)
{
  unsigned i;
  printf("\n");
  for (i = 0; i < num; i++)
    printf(" %08x", (unsigned)s[i]);
}
static void PrintStates2(const UInt32 *s, unsigned x, unsigned y)
{
  unsigned i;
  for (i = 0; i < y; i++)
    PrintState(s + i * x, x);
  printf("\n");
}
#endif
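/* Illustrative sketch (not part of the original sources): assuming the usual
   Z7_BLAKE2S_BLOCK_SIZE == 64 and Z7_BLAKE2SP_PARALLEL_DEGREE == 8, a "super block"
   is 8 * 64 = 512 bytes and SUPER_BLOCK_MASK == 511, so (pos & SUPER_BLOCK_MASK)
   cycles a byte position over the 8 lane states, and the V_INDEX_* tables above
   resolve V(row, col) to the already-rotated diagonal layout used by the fast
   vector paths. A minimal compile-time check of the power-of-two assumption: */
#if 0
typedef char z7_blake2sp_super_block_size_check[
    (SUPER_BLOCK_SIZE & SUPER_BLOCK_MASK) == 0 ? 1 : -1];
#endif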
#define REP8_MACRO(m)  { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) }

#define BLAKE2S_NUM_ROUNDS 10

#if defined(Z7_BLAKE2S_USE_VECTORS)
#define ROUNDS_LOOP(mac) \
  { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) mac(r) }
#endif
/*
#define ROUNDS_LOOP_2(mac) \
  { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r += 2) { mac(r) mac(r + 1) } }
*/
#if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
#define ROUNDS_LOOP_UNROLLED(m) \
  { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) }
#endif

#define SIGMA_TABLE(M) \
  M(  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 ), \
  M( 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 ), \
  M( 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 ), \
  M(  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 ), \
  M(  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 ), \
  M(  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 ), \
  M( 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 ), \
  M( 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 ), \
  M(  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 ), \
  M( 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 )

#define SIGMA_TABLE_MULT(m, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
  { a0*m,a1*m,a2*m,a3*m,a4*m,a5*m,a6*m,a7*m,a8*m,a9*m,a10*m,a11*m,a12*m,a13*m,a14*m,a15*m }
#define SIGMA_TABLE_MULT_4( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
        SIGMA_TABLE_MULT(4, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)

// MY_ALIGN(32)
MY_ALIGN(16)
static const Byte k_Blake2s_Sigma_4[BLAKE2S_NUM_ROUNDS][16] =
  { SIGMA_TABLE(SIGMA_TABLE_MULT_4) };

#define GET_SIGMA_PTR(p, index) \
    ((const void *)((const Byte *)(const void *)(p) + (index)))

#define GET_STATE_TABLE_PTR_FROM_BYTE_POS(s, pos) \
    ((UInt32 *)(void *)((Byte *)(void *)(s) + (pos)))


#ifdef Z7_BLAKE2S_USE_VECTORS

#if 0
  // use loading constants from memory
  // is faster for some compilers.
  #define KK4(n)  KIV(n), KIV(n), KIV(n), KIV(n)
MY_ALIGN(64)
static const UInt32 k_Blake2s_IV_WAY4[] =
{
  KK4(0), KK4(1), KK4(2), KK4(3), KK4(4), KK4(5), KK4(6), KK4(7)
};
  #define GET_128_IV_WAY4(i)  LOAD_128(k_Blake2s_IV_WAY4 + 4 * (i))
#else
  // use constant generation:
  #define GET_128_IV_WAY4(i)  _mm_set1_epi32((Int32)KIV(i))
#endif


#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#define GET_CONST_128_FROM_ARRAY32(k) \
    _mm_set_epi32((Int32)(k)[3], (Int32)(k)[2], (Int32)(k)[1], (Int32)(k)[0])
#endif


#if 0
#define k_r8      _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
#define k_r16     _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
#define k_inc     _mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)
#define k_iv0_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0)
#define k_iv4_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4)
#else

#if defined(Z7_BLAKE2S_USE_SSSE3) && \
  !defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
MY_ALIGN(16)
static const Byte k_r8_arr [16] = { 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 };
MY_ALIGN(16)
static const Byte k_r16_arr[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 };
#define k_r8    LOAD_128(k_r8_arr)
#define k_r16   LOAD_128(k_r16_arr)
#endif

MY_ALIGN(16)
static const UInt32 k_inc_arr[4] = { Z7_BLAKE2S_BLOCK_SIZE, 0, 0, 0 };
#define k_inc     LOAD_128(k_inc_arr)
#define k_iv0_128 LOAD_128(k_Blake2s_IV + 0)
#define k_iv4_128 LOAD_128(k_Blake2s_IV + 4)
#endif


#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#ifdef Z7_BLAKE2S_USE_AVX2

#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
  #define MY_mm256_set_m128i(hi, lo)  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
#else
  #define MY_mm256_set_m128i  _mm256_set_m128i
#endif

#define SET_FROM_128(a)  MY_mm256_set_m128i(a, a)

#ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS
MY_ALIGN(32)
static const Byte k_r8_arr_256 [32] =
{
  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12,
  1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
};
MY_ALIGN(32)
static const Byte k_r16_arr_256[32] =
{
  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13,
  2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
};
#define k_r8_256   LOAD_256(k_r8_arr_256)
#define k_r16_256  LOAD_256(k_r16_arr_256)
#endif

// #define k_r8_256   SET_FROM_128(_mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1))
// #define k_r16_256  SET_FROM_128(_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2))
// #define k_inc_256  SET_FROM_128(_mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE))
// #define k_iv0_256  SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0))
#define k_iv4_256  SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4))
#endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#endif


/*
IPC(TP) ports:
1 p__5  : skl-     : SSE  : shufps  : _mm_shuffle_ps
2 p_15  : icl+
1 p__5  : nhm-bdw  : SSE  : xorps   : _mm_xor_ps
3 p015  : skl+

3 p015             : SSE2 : pxor : _mm_xor_si128
2 p_15  : snb-bdw  : SSE2 : padd : _mm_add_epi32
2 p0_5  : mrm-wsm  :
3 p015  : skl+

2 p_15  : ivb-,icl+ : SSE2 : punpcklqdq, punpckhqdq, punpckldq, punpckhdq
2 p_15  :           : SSE2 : pshufd  : _mm_shuffle_epi32
2 p_15  :           : SSE2 : pshuflw : _mm_shufflelo_epi16
2 p_15  :           : SSE2 : psrldq  :
2 p_15  :           : SSE3 : pshufb  : _mm_shuffle_epi8
2 p_15  :           : SSE4 : pblendw : _mm_blend_epi16
1 p__5  : hsw-skl   : *

1 p0               : SSE2 : pslld (i8) : _mm_slli_si128
2 p01   : skl+     :

2 p_15  : ivb-     : SSE3 : palignr
1 p__5  : hsw+

2 p_15 + p23  : ivb-, icl+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, m32, i8)
1 p__5 + p23  : hsw-skl

1 p_15 + p5   : ivb-, ice+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, r32, i8)
0.5  2*p5     : hsw-skl

2 p23         : SSE2 : movd (m32)
3 p23A        : adl  :
1 p5          :      : SSE2 : movd (r32)
*/
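/* Illustrative reference (not part of the original sources): the SIMD rotates
   selected below all compute a per-lane rotate right by c bits. The k_r8_arr /
   k_r16_arr byte patterns above do the rotates by 8 and 16 with a single pshufb,
   while the shift/or fallback (MM_ROR_EPI32_VIA_SHIFT) is the vector form of the
   usual scalar identity, shown here for clarity: */
#if 0
static UInt32 Ror32_Reference(UInt32 x, unsigned c)
{
  // same result as rotrFixed() from "RotateDefs.h" for 0 < c < 32
  return (UInt32)((x >> c) | (x << (32 - c)));
}
#endif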
#if 0 && defined(__XOP__)
// we must debug and test __XOP__ instruction
#include <x86intrin.h>
#include <ammintrin.h>
    #define LOAD_ROTATE_CONSTS
    #define MM_ROR_EPI32(r, c)  _mm_roti_epi32(r, -(c))
    #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
#elif 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
    #define LOAD_ROTATE_CONSTS
    #define MM_ROR_EPI32(r, c)  _mm_ror_epi32(r, c)
    #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
#else

// MSVC_1937+ uses "orps" instruction for _mm_or_si128().
// But "orps" has low throughput: TP=1 for bdw-nhm.
// So it can be better to use _mm_add_epi32()/"paddd" (TP=2 for bdw-nhm) instead of "xorps".
// But "orps" is fast for modern cpus (skl+).
// So we are default with "or" version:
#if 0 || 0 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1937
  // minor optimization for some old cpus, if "xorps" is slow.
  #define MM128_EPI32_OR_or_ADD  _mm_add_epi32
#else
  #define MM128_EPI32_OR_or_ADD  _mm_or_si128
#endif

#define MM_ROR_EPI32_VIA_SHIFT(r, c)( \
    MM128_EPI32_OR_or_ADD( \
      _mm_srli_epi32((r), (c)), \
      _mm_slli_epi32((r), 32-(c))))

#if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
  #define LOAD_ROTATE_CONSTS \
      const __m128i  r8 = k_r8; \
      const __m128i r16 = k_r16;
  #define MM_ROR_EPI32(r, c) ( \
      ( 8==(c)) ? _mm_shuffle_epi8(r,r8) \
    : (16==(c)) ? _mm_shuffle_epi8(r,r16) \
    : MM_ROR_EPI32_VIA_SHIFT(r, c))
#else
  #define LOAD_ROTATE_CONSTS
  #define MM_ROR_EPI32(r, c) ( \
      (16==(c)) ? _mm_shufflehi_epi16(_mm_shufflelo_epi16(r, 0xb1), 0xb1) \
    : MM_ROR_EPI32_VIA_SHIFT(r, c))
#endif
#endif


/*
we have 3 main ways to load 4 32-bit integers to __m128i:
  1) SSE2:  _mm_set_epi32()
  2) SSE2:  _mm_unpacklo_epi64() / _mm_unpacklo_epi32 / _mm_cvtsi32_si128()
  3) SSE41: _mm_insert_epi32() and _mm_cvtsi32_si128()
good compiler for _mm_set_epi32() generates these instructions:
{
  movd xmm, [m32]; vpunpckldq; vpunpckldq; vpunpcklqdq;
}
good new compiler generates one instruction
{
  for _mm_insert_epi32()  : { pinsrd xmm, [m32], i }
  for _mm_cvtsi32_si128() : { movd xmm, [m32] }
}
but vc2010 generates slow pair of instructions:
{
  for _mm_insert_epi32()  : { mov r32, [m32];  pinsrd xmm, r32, i }
  for _mm_cvtsi32_si128() : { mov r32, [m32];  movd xmm, r32 }
}
_mm_insert_epi32() (pinsrd) code reduces xmm register pressure
in comparison with _mm_set_epi32() (movd + vpunpckld) code.
Note that variant with "movd xmm, r32" can be slower,
but register pressure can be more important.
So we can force to "pinsrd" always.
*/

// #if !defined(Z7_MSC_VER_ORIGINAL) || Z7_MSC_VER_ORIGINAL > 1600 || defined(MY_CPU_X86)
#ifdef Z7_BLAKE2S_USE_SSE41
  /*
  _mm_set_epi32()    can be more effective for GCC and CLANG
  _mm_insert_epi32() is more effective for MSVC
  */
  #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
    #define Z7_BLAKE2S_USE_INSERT_INSTRUCTION
  #endif
#endif // USE_SSE41
// #endif

#ifdef Z7_BLAKE2S_USE_INSERT_INSTRUCTION
// for SSE4.1
#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
    _mm_insert_epi32( \
    _mm_insert_epi32( \
    _mm_insert_epi32( \
    _mm_cvtsi32_si128( \
        *(const Int32 *)p0), \
        *(const Int32 *)p1, 1), \
        *(const Int32 *)p2, 2), \
        *(const Int32 *)p3, 3)
#elif 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
/* MSVC 1400 implements _mm_set_epi32() via slow memory write/read.
   Also _mm_unpacklo_epi32 is more effective for other MSVC compilers.
   But _mm_set_epi32() is more effective for GCC and CLANG.
   So we use _mm_unpacklo_epi32 for MSVC only */
#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
    _mm_unpacklo_epi64( \
        _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p0), \
                            _mm_cvtsi32_si128(*(const Int32 *)p1)), \
        _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p2), \
                            _mm_cvtsi32_si128(*(const Int32 *)p3)))
#else
#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
    _mm_set_epi32( \
        *(const Int32 *)p3, \
        *(const Int32 *)p2, \
        *(const Int32 *)p1, \
        *(const Int32 *)p0)
#endif

#define SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3) \
    MM_LOAD_EPI32_FROM_4_POINTERS( \
        GET_SIGMA_PTR(input, i0), \
        GET_SIGMA_PTR(input, i1), \
        GET_SIGMA_PTR(input, i2), \
        GET_SIGMA_PTR(input, i3))

#define SET_ROW_FROM_SIGMA(input, sigma_index) \
    SET_ROW_FROM_SIGMA_BASE(input, \
        sigma[(sigma_index)        ], \
        sigma[(sigma_index) + 2 * 1], \
        sigma[(sigma_index) + 2 * 2], \
        sigma[(sigma_index) + 2 * 3]) \


#define ADD_128(a, b)  _mm_add_epi32(a, b)
#define XOR_128(a, b)  _mm_xor_si128(a, b)

#define D_ADD_128(dest, src)        dest = ADD_128(dest, src)
#define D_XOR_128(dest, src)        dest = XOR_128(dest, src)
#define D_ROR_128(dest, shift)      dest = MM_ROR_EPI32(dest, shift)
#define D_ADD_EPI64_128(dest, src)  dest = _mm_add_epi64(dest, src)

#define AXR(a, b, d, shift) \
    D_ADD_128(a, b); \
    D_XOR_128(d, a); \
    D_ROR_128(d, shift);

#define AXR2(a, b, c, d, input, sigma_index, shift1, shift2) \
    a = _mm_add_epi32 (a, SET_ROW_FROM_SIGMA(input, sigma_index)); \
    AXR(a, b, d, shift1) \
    AXR(c, d, b, shift2)

#define ROTATE_WORDS_TO_RIGHT(a, n) \
    a = _mm_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));

#define AXR4(a, b, c, d, input, sigma_index) \
    AXR2(a, b, c, d, input, sigma_index,     16, 12) \
    AXR2(a, b, c, d, input, sigma_index + 1,  8,  7) \

#define RR2(a, b, c, d, input) \
  { \
    AXR4(a, b, c, d, input, 0) \
      ROTATE_WORDS_TO_RIGHT(b, 1) \
      ROTATE_WORDS_TO_RIGHT(c, 2) \
      ROTATE_WORDS_TO_RIGHT(d, 3) \
    AXR4(a, b, c, d, input, 8) \
      ROTATE_WORDS_TO_RIGHT(b, 3) \
      ROTATE_WORDS_TO_RIGHT(c, 2) \
      ROTATE_WORDS_TO_RIGHT(d, 1) \
  }


/*
Way1:
per 64 bytes block:
10 rounds * 4 iters * (7 + 2) = 360 cycles = if pslld TP=1
                    * (7 + 1) = 320 cycles = if pslld TP=2 (skl+)
additional operations per 7_op_iter :
4 movzx byte mem
1 movd   mem
3 pinsrd mem
1.5 pshufd
*/

static
#if 0 || 0 && (defined(Z7_BLAKE2S_USE_V128_WAY2) || \
               defined(Z7_BLAKE2S_USE_V256_WAY2))
  Z7_NO_INLINE
#else
  Z7_FORCE_INLINE
#endif
#ifdef BLAKE2S_ATTRIB_128BIT
  BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2s_Compress_V128_Way1(UInt32 * const s, const Byte * const input)
{
  __m128i a, b, c, d;
  __m128i f0, f1;

  LOAD_ROTATE_CONSTS
  d = LOAD_128_FROM_STRUCT(STATE_T(s));
  c = k_iv0_128;
  a = f0 = LOAD_128_FROM_STRUCT(s);
  b = f1 = LOAD_128_FROM_STRUCT(s + 4);
  D_ADD_EPI64_128(d, k_inc);
  STORE_128_TO_STRUCT (STATE_T(s), d);
  D_XOR_128(d, k_iv4_128);

#define RR(r)  { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
    RR2(a, b, c, d, input) }
  ROUNDS_LOOP(RR)
#undef RR

  STORE_128_TO_STRUCT(s    , XOR_128(f0, XOR_128(a, c)));
  STORE_128_TO_STRUCT(s + 4, XOR_128(f1, XOR_128(b, d)));
}


static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_128BIT
  BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_V128_Way1(UInt32 *s_items, const Byte *data, const Byte *end)
{
  size_t pos = 0;
  do
  {
    UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
    Blake2s_Compress_V128_Way1(s, data);
    data += Z7_BLAKE2S_BLOCK_SIZE;
    pos  += Z7_BLAKE2S_BLOCK_SIZE;
    pos &= SUPER_BLOCK_MASK;
  }
  while (data != end);
}
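/* Illustrative reference (not part of the original sources): the AXR / AXR2 / RR2
   macros above are a 4-lane SSE arrangement of the standard BLAKE2s G mixing
   function; RR2 performs one column step plus one diagonal step, i.e. one full
   round. A plain scalar version of G, usable for cross-checking, could look
   like this (rotrFixed comes from "RotateDefs.h"): */
#if 0
static void Blake2s_G_Reference(UInt32 *v,
    unsigned a, unsigned b, unsigned c, unsigned d, UInt32 x, UInt32 y)
{
  v[a] += v[b] + x;  v[d] = rotrFixed(v[d] ^ v[a], 16);
  v[c] += v[d];      v[b] = rotrFixed(v[b] ^ v[c], 12);
  v[a] += v[b] + y;  v[d] = rotrFixed(v[d] ^ v[a],  8);
  v[c] += v[d];      v[b] = rotrFixed(v[b] ^ v[c],  7);
}
#endif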
#if defined(Z7_BLAKE2S_USE_V128_WAY2) || \
    defined(Z7_BLAKE2S_USE_AVX2_WAY2)
#if 1
#define Z7_BLAKE2S_CompressSingleBlock(s, data) \
    Blake2sp_Compress2_V128_Way1(s, data, \
        (const Byte *)(const void *)(data) + Z7_BLAKE2S_BLOCK_SIZE)
#else
#define Z7_BLAKE2S_CompressSingleBlock  Blake2s_Compress_V128_Way1
#endif
#endif


#if (defined(Z7_BLAKE2S_USE_AVX2_WAY_SLOW) || \
     defined(Z7_BLAKE2S_USE_V128_WAY2)) && \
    !defined(Z7_BLAKE2S_USE_GATHER)
#define AXR2_LOAD_INDEXES(sigma_index) \
    const unsigned i0 = sigma[(sigma_index)        ]; \
    const unsigned i1 = sigma[(sigma_index) + 2 * 1]; \
    const unsigned i2 = sigma[(sigma_index) + 2 * 2]; \
    const unsigned i3 = sigma[(sigma_index) + 2 * 3]; \

#define SET_ROW_FROM_SIGMA_W(input) \
    SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3)
#endif


#ifdef Z7_BLAKE2S_USE_V128_WAY2

#if 1 || !defined(Z7_BLAKE2S_USE_SSE41)
/* we use SET_ROW_FROM_SIGMA_BASE, that uses
     (SSE4) _mm_insert_epi32(), if Z7_BLAKE2S_USE_INSERT_INSTRUCTION is defined
     (SSE2) _mm_set_epi32()
   MSVC can be faster for this branch:
*/
#define AXR2_W(sigma_index, shift1, shift2) \
  { \
    AXR2_LOAD_INDEXES(sigma_index) \
    a0 = _mm_add_epi32(a0, SET_ROW_FROM_SIGMA_W(data)); \
    a1 = _mm_add_epi32(a1, SET_ROW_FROM_SIGMA_W(data + Z7_BLAKE2S_BLOCK_SIZE)); \
    AXR(a0, b0, d0, shift1) \
    AXR(a1, b1, d1, shift1) \
    AXR(c0, d0, b0, shift2) \
    AXR(c1, d1, b1, shift2) \
  }
#else
/* we use interleaved _mm_insert_epi32():
   GCC can be faster for this branch:
*/
#define AXR2_W_PRE_INSERT(sigma_index, i) \
  { const unsigned ii = sigma[(sigma_index) + i * 2]; \
    t0 = _mm_insert_epi32(t0, *(const Int32 *)GET_SIGMA_PTR(data, ii), i); \
    t1 = _mm_insert_epi32(t1, *(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii), i); \
  }
#define AXR2_W(sigma_index, shift1, shift2) \
  { __m128i t0, t1; \
    { const unsigned ii = sigma[sigma_index]; \
      t0 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, ii)); \
      t1 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii)); \
    } \
    AXR2_W_PRE_INSERT(sigma_index, 1) \
    AXR2_W_PRE_INSERT(sigma_index, 2) \
    AXR2_W_PRE_INSERT(sigma_index, 3) \
    a0 = _mm_add_epi32(a0, t0); \
    a1 = _mm_add_epi32(a1, t1); \
    AXR(a0, b0, d0, shift1) \
    AXR(a1, b1, d1, shift1) \
    AXR(c0, d0, b0, shift2) \
    AXR(c1, d1, b1, shift2) \
  }
#endif

#define AXR4_W(sigma_index) \
    AXR2_W(sigma_index,     16, 12) \
    AXR2_W(sigma_index + 1,  8,  7) \

#define WW(r) \
  { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
    AXR4_W(0) \
      ROTATE_WORDS_TO_RIGHT(b0, 1) \
      ROTATE_WORDS_TO_RIGHT(b1, 1) \
      ROTATE_WORDS_TO_RIGHT(c0, 2) \
      ROTATE_WORDS_TO_RIGHT(c1, 2) \
      ROTATE_WORDS_TO_RIGHT(d0, 3) \
      ROTATE_WORDS_TO_RIGHT(d1, 3) \
    AXR4_W(8) \
      ROTATE_WORDS_TO_RIGHT(b0, 3) \
      ROTATE_WORDS_TO_RIGHT(b1, 3) \
      ROTATE_WORDS_TO_RIGHT(c0, 2) \
      ROTATE_WORDS_TO_RIGHT(c1, 2) \
      ROTATE_WORDS_TO_RIGHT(d0, 1) \
      ROTATE_WORDS_TO_RIGHT(d1, 1) \
  }


static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_128BIT
  BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_V128_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
{
  size_t pos = 0;
  end -= Z7_BLAKE2S_BLOCK_SIZE;

  if (data != end)
  {
    LOAD_ROTATE_CONSTS
    do
    {
      UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
      __m128i a0, b0, c0, d0;
      __m128i a1, b1, c1, d1;
      {
        const __m128i inc = k_inc;
        const __m128i temp = k_iv4_128;
        d0 = LOAD_128_FROM_STRUCT (STATE_T(s));
        d1 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
        D_ADD_EPI64_128(d0, inc);
        D_ADD_EPI64_128(d1, inc);
        STORE_128_TO_STRUCT (STATE_T(s      ), d0);
        STORE_128_TO_STRUCT (STATE_T(s + NSW), d1);
        D_XOR_128(d0, temp);
        D_XOR_128(d1, temp);
      }
      c1 = c0 = k_iv0_128;
      a0 = LOAD_128_FROM_STRUCT(s);
      b0 = LOAD_128_FROM_STRUCT(s + 4);
      a1 = LOAD_128_FROM_STRUCT(s + NSW);
      b1 = LOAD_128_FROM_STRUCT(s + NSW + 4);

      ROUNDS_LOOP (WW)
#undef WW

      D_XOR_128(a0, c0);
      D_XOR_128(b0, d0);
      D_XOR_128(a1, c1);
      D_XOR_128(b1, d1);

      D_XOR_128(a0, LOAD_128_FROM_STRUCT(s));
      D_XOR_128(b0, LOAD_128_FROM_STRUCT(s + 4));
      D_XOR_128(a1, LOAD_128_FROM_STRUCT(s + NSW));
      D_XOR_128(b1, LOAD_128_FROM_STRUCT(s + NSW + 4));

      STORE_128_TO_STRUCT(s,           a0);
      STORE_128_TO_STRUCT(s + 4,       b0);
      STORE_128_TO_STRUCT(s + NSW,     a1);
      STORE_128_TO_STRUCT(s + NSW + 4, b1);

      data += Z7_BLAKE2S_BLOCK_SIZE * 2;
      pos  += Z7_BLAKE2S_BLOCK_SIZE * 2;
      pos &= SUPER_BLOCK_MASK;
    }
    while (data < end);
    if (data != end)
      return;
  }
  {
    UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
    Z7_BLAKE2S_CompressSingleBlock(s, data);
  }
}
#endif // Z7_BLAKE2S_USE_V128_WAY2


#ifdef Z7_BLAKE2S_USE_V128_WAY2
  #define Z7_BLAKE2S_Compress2_V128  Blake2sp_Compress2_V128_Way2
#else
  #define Z7_BLAKE2S_Compress2_V128  Blake2sp_Compress2_V128_Way1
#endif


#ifdef Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
  #define ROT_128_8(x)    MM_ROR_EPI32(x, 8)
  #define ROT_128_16(x)   MM_ROR_EPI32(x, 16)
  #define ROT_128_7(x)    MM_ROR_EPI32(x, 7)
  #define ROT_128_12(x)   MM_ROR_EPI32(x, 12)
#else
#if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
  #define ROT_128_8(x)    _mm_shuffle_epi8(x, r8)   // k_r8
  #define ROT_128_16(x)   _mm_shuffle_epi8(x, r16)  // k_r16
#else
  #define ROT_128_8(x)    MM_ROR_EPI32_VIA_SHIFT(x, 8)
  #define ROT_128_16(x)   MM_ROR_EPI32_VIA_SHIFT(x, 16)
#endif
  #define ROT_128_7(x)    MM_ROR_EPI32_VIA_SHIFT(x, 7)
  #define ROT_128_12(x)   MM_ROR_EPI32_VIA_SHIFT(x, 12)
#endif


#if 1
// this branch can provide similar speed on x86* in most cases,
// because [base + index*4] provides same speed as [base + index].
// but some compilers can generate different code with this branch, that can be faster sometimes.
// this branch uses additional table of 10*16=160 bytes.
#define SIGMA_TABLE_MULT_16( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
        SIGMA_TABLE_MULT(16, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
MY_ALIGN(16)
static const Byte k_Blake2s_Sigma_16[BLAKE2S_NUM_ROUNDS][16] =
  { SIGMA_TABLE(SIGMA_TABLE_MULT_16) };
#define GET_SIGMA_PTR_128(r)  const Byte * const sigma = k_Blake2s_Sigma_16[r];
#define GET_SIGMA_VAL_128(n)  (sigma[n])
#else
#define GET_SIGMA_PTR_128(r)  const Byte * const sigma = k_Blake2s_Sigma_4[r];
#define GET_SIGMA_VAL_128(n)  (4 * (size_t)sigma[n])
#endif


#ifdef Z7_BLAKE2S_USE_AVX2_FAST
#if 1
#define SIGMA_TABLE_MULT_32( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
        SIGMA_TABLE_MULT(32, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
MY_ALIGN(64)
static const UInt16 k_Blake2s_Sigma_32[BLAKE2S_NUM_ROUNDS][16] =
  { SIGMA_TABLE(SIGMA_TABLE_MULT_32) };
#define GET_SIGMA_PTR_256(r)  const UInt16 * const sigma = k_Blake2s_Sigma_32[r];
#define GET_SIGMA_VAL_256(n)  (sigma[n])
#else
#define GET_SIGMA_PTR_256(r)  const Byte * const sigma = k_Blake2s_Sigma_4[r];
#define GET_SIGMA_VAL_256(n)  (8 * (size_t)sigma[n])
#endif
#endif // Z7_BLAKE2S_USE_AVX2_FAST


#define D_ROT_128_7(dest)   dest = ROT_128_7(dest)
#define D_ROT_128_8(dest)   dest = ROT_128_8(dest)
#define D_ROT_128_12(dest)  dest = ROT_128_12(dest)
#define D_ROT_128_16(dest)  dest = ROT_128_16(dest)

#define OP_L(a, i)  D_ADD_128 (V(a, 0), \
    LOAD_128((const Byte *)(w) + GET_SIGMA_VAL_128(2*(a)+(i))));

#define OP_0(a)   OP_L(a, 0)
#define OP_7(a)   OP_L(a, 1)

#define OP_1(a)   D_ADD_128 (V(a, 0), V(a, 1));
#define OP_2(a)   D_XOR_128 (V(a, 3), V(a, 0));
#define OP_4(a)   D_ADD_128 (V(a, 2), V(a, 3));
#define OP_5(a)   D_XOR_128 (V(a, 1), V(a, 2));

#define OP_3(a)   D_ROT_128_16 (V(a, 3));
#define OP_6(a)   D_ROT_128_12 (V(a, 1));
#define OP_8(a)   D_ROT_128_8  (V(a, 3));
#define OP_9(a)   D_ROT_128_7  (V(a, 1));


// for 32-bit x86 : interleave mode works slower, because of register pressure.

#if 0 || 1 && (defined(MY_CPU_X86) \
    || defined(__GNUC__) && !defined(__clang__))

// non-interleaved version:
// is fast for x86 32-bit.
// is fast for GCC x86-64.

#define V4G(a) \
  OP_0 (a) \
  OP_1 (a) \
  OP_2 (a) \
  OP_3 (a) \
  OP_4 (a) \
  OP_5 (a) \
  OP_6 (a) \
  OP_7 (a) \
  OP_1 (a) \
  OP_2 (a) \
  OP_8 (a) \
  OP_4 (a) \
  OP_5 (a) \
  OP_9 (a) \

#define V4R \
{ \
  V4G (0) \
  V4G (1) \
  V4G (2) \
  V4G (3) \
  V4G (4) \
  V4G (5) \
  V4G (6) \
  V4G (7) \
}

#elif 0 || 1 && defined(MY_CPU_X86)

#define OP_INTER_2(op, a,b) \
  op (a) \
  op (b) \

#define V4G(a,b) \
  OP_INTER_2 (OP_0, a,b) \
  OP_INTER_2 (OP_1, a,b) \
  OP_INTER_2 (OP_2, a,b) \
  OP_INTER_2 (OP_3, a,b) \
  OP_INTER_2 (OP_4, a,b) \
  OP_INTER_2 (OP_5, a,b) \
  OP_INTER_2 (OP_6, a,b) \
  OP_INTER_2 (OP_7, a,b) \
  OP_INTER_2 (OP_1, a,b) \
  OP_INTER_2 (OP_2, a,b) \
  OP_INTER_2 (OP_8, a,b) \
  OP_INTER_2 (OP_4, a,b) \
  OP_INTER_2 (OP_5, a,b) \
  OP_INTER_2 (OP_9, a,b) \

#define V4R \
{ \
  V4G (0, 1) \
  V4G (2, 3) \
  V4G (4, 5) \
  V4G (6, 7) \
}

#else
// interleave-4 version is fast for x64 (MSVC/CLANG)

#define OP_INTER_4(op, a,b,c,d) \
  op (a) \
  op (b) \
  op (c) \
  op (d) \

#define V4G(a,b,c,d) \
  OP_INTER_4 (OP_0, a,b,c,d) \
  OP_INTER_4 (OP_1, a,b,c,d) \
  OP_INTER_4 (OP_2, a,b,c,d) \
  OP_INTER_4 (OP_3, a,b,c,d) \
  OP_INTER_4 (OP_4, a,b,c,d) \
  OP_INTER_4 (OP_5, a,b,c,d) \
  OP_INTER_4 (OP_6, a,b,c,d) \
  OP_INTER_4 (OP_7, a,b,c,d) \
  OP_INTER_4 (OP_1, a,b,c,d) \
  OP_INTER_4 (OP_2, a,b,c,d) \
  OP_INTER_4 (OP_8, a,b,c,d) \
  OP_INTER_4 (OP_4, a,b,c,d) \
  OP_INTER_4 (OP_5, a,b,c,d) \
  OP_INTER_4 (OP_9, a,b,c,d) \

#define V4R \
{ \
  V4G (0, 1, 2, 3) \
  V4G (4, 5, 6, 7) \
}

#endif

#define V4_ROUND(r)  { GET_SIGMA_PTR_128(r);  V4R }


#define V4_LOAD_MSG_1(w, m, i) \
{ \
  __m128i m0, m1, m2, m3; \
  __m128i t0, t1, t2, t3; \
  m0 = LOADU_128((m) + ((i) + 0 * 4) * 16); \
  m1 = LOADU_128((m) + ((i) + 1 * 4) * 16); \
  m2 = LOADU_128((m) + ((i) + 2 * 4) * 16); \
  m3 = LOADU_128((m) + ((i) + 3 * 4) * 16); \
  t0 = _mm_unpacklo_epi32(m0, m1); \
  t1 = _mm_unpackhi_epi32(m0, m1); \
  t2 = _mm_unpacklo_epi32(m2, m3); \
  t3 = _mm_unpackhi_epi32(m2, m3); \
  w[(i) * 4 + 0] = _mm_unpacklo_epi64(t0, t2); \
  w[(i) * 4 + 1] = _mm_unpackhi_epi64(t0, t2); \
  w[(i) * 4 + 2] = _mm_unpacklo_epi64(t1, t3); \
  w[(i) * 4 + 3] = _mm_unpackhi_epi64(t1, t3); \
}

#define V4_LOAD_MSG(w, m) \
{ \
  V4_LOAD_MSG_1 (w, m, 0) \
  V4_LOAD_MSG_1 (w, m, 1) \
  V4_LOAD_MSG_1 (w, m, 2) \
  V4_LOAD_MSG_1 (w, m, 3) \
}

#define V4_LOAD_UNPACK_PAIR_128(src32, i, d0, d1) \
{ \
  const __m128i v0 = LOAD_128_FROM_STRUCT((src32) + (i    ) * 4); \
  const __m128i v1 = LOAD_128_FROM_STRUCT((src32) + (i + 1) * 4); \
  d0 = _mm_unpacklo_epi32(v0, v1); \
  d1 = _mm_unpackhi_epi32(v0, v1); \
}

#define V4_UNPACK_PAIR_128(dest32, i, s0, s1) \
{ \
  STORE_128_TO_STRUCT((dest32) + i * 4     , _mm_unpacklo_epi64(s0, s1)); \
  STORE_128_TO_STRUCT((dest32) + i * 4 + 16, _mm_unpackhi_epi64(s0, s1)); \
}

#define V4_UNPACK_STATE(dest32, src32) \
{ \
  __m128i t0, t1, t2, t3, t4, t5, t6, t7; \
  V4_LOAD_UNPACK_PAIR_128(src32, 0, t0, t1) \
  V4_LOAD_UNPACK_PAIR_128(src32, 2, t2, t3) \
  V4_LOAD_UNPACK_PAIR_128(src32, 4, t4, t5) \
  V4_LOAD_UNPACK_PAIR_128(src32, 6, t6, t7) \
  V4_UNPACK_PAIR_128(dest32, 0, t0, t2) \
  V4_UNPACK_PAIR_128(dest32, 8, t1, t3) \
  V4_UNPACK_PAIR_128(dest32, 1, t4, t6) \
  V4_UNPACK_PAIR_128(dest32, 9, t5, t7) \
}


static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_128BIT
  BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_V128_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
{
  // PrintStates2(s_items, 8, 16);
  size_t pos = 0;
  pos /= 2;
  do
  {
#if defined(Z7_BLAKE2S_USE_SSSE3) && \
   !defined(Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED)
    const __m128i  r8 = k_r8;
    const __m128i r16 = k_r16;
#endif
    __m128i w[16];
    __m128i v[16];
    UInt32 *s;
    V4_LOAD_MSG(w, data)
    s =
        GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
    {
      __m128i ctr = LOAD_128_FROM_STRUCT(s + 64);
      D_ADD_EPI64_128 (ctr, k_inc);
      STORE_128_TO_STRUCT(s + 64, ctr);
      v[12] = XOR_128 (GET_128_IV_WAY4(4), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
      v[13] = XOR_128 (GET_128_IV_WAY4(5), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
    }
    v[ 8] = GET_128_IV_WAY4(0);
    v[ 9] = GET_128_IV_WAY4(1);
    v[10] = GET_128_IV_WAY4(2);
    v[11] = GET_128_IV_WAY4(3);
    v[14] = GET_128_IV_WAY4(6);
    v[15] = GET_128_IV_WAY4(7);

#define LOAD_STATE_128_FROM_STRUCT(i) \
    v[i] = LOAD_128_FROM_STRUCT(s + (i) * 4);

#define UPDATE_STATE_128_IN_STRUCT(i) \
    STORE_128_TO_STRUCT(s + (i) * 4, XOR_128( \
        XOR_128(v[i], v[(i) + 8]), \
        LOAD_128_FROM_STRUCT(s + (i) * 4)));

    REP8_MACRO (LOAD_STATE_128_FROM_STRUCT)
    ROUNDS_LOOP (V4_ROUND)
    REP8_MACRO (UPDATE_STATE_128_IN_STRUCT)

    data += Z7_BLAKE2S_BLOCK_SIZE * 4;
    pos  += Z7_BLAKE2S_BLOCK_SIZE * 4 / 2;
    pos &= SUPER_BLOCK_SIZE / 2 - 1;
  }
  while (data != end);
}


static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_128BIT
  BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2sp_Final_V128_Fast(UInt32 *states)
{
  const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
  // printf("\nBlake2sp_Compress2_V128_Fast_Final4\n");
  // PrintStates2(states, 8, 16);
  {
    ptrdiff_t pos = 8 * 4;
    do
    {
      UInt32 *src32  = states + (size_t)(pos * 1);
      UInt32 *dest32 = states + (size_t)(pos * 2);
      V4_UNPACK_STATE(dest32, src32)
      pos -= 8 * 4;
    }
    while (pos >= 0);
  }
  {
    unsigned k;
    for (k = 0; k < 8; k++)
    {
      UInt32 *s = states + (size_t)k * 16;
      STORE_128_TO_STRUCT (STATE_T(s), ctr);
    }
  }
  // PrintStates2(states, 8, 16);
}


#ifdef Z7_BLAKE2S_USE_AVX2

#define ADD_256(a, b)  _mm256_add_epi32(a, b)
#define XOR_256(a, b)  _mm256_xor_si256(a, b)

#if 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
  #define MM256_ROR_EPI32  _mm256_ror_epi32
  #define Z7_MM256_ROR_EPI32_IS_SUPPORTED
  #define LOAD_ROTATE_CONSTS_256
#else
#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
  #define LOAD_ROTATE_CONSTS_256 \
      const __m256i  r8 = k_r8_256; \
      const __m256i r16 = k_r16_256;
#endif // AVX2_WAY2
  #define MM256_ROR_EPI32(r, c) ( \
      ( 8==(c)) ? _mm256_shuffle_epi8(r,r8) \
    : (16==(c)) ? _mm256_shuffle_epi8(r,r16) \
    : _mm256_or_si256( \
        _mm256_srli_epi32((r), (c)), \
        _mm256_slli_epi32((r), 32-(c))))
#endif // WAY_SLOW
#endif

#define D_ADD_256(dest, src)  dest = ADD_256(dest, src)
#define D_XOR_256(dest, src)  dest = XOR_256(dest, src)

#define LOADU_256(p)  _mm256_loadu_si256((const __m256i *)(const void *)(p))

#ifdef Z7_BLAKE2S_USE_AVX2_FAST
#ifdef Z7_MM256_ROR_EPI32_IS_SUPPORTED
  #define ROT_256_16(x)  MM256_ROR_EPI32((x), 16)
  #define ROT_256_12(x)  MM256_ROR_EPI32((x), 12)
  #define ROT_256_8(x)   MM256_ROR_EPI32((x), 8)
  #define ROT_256_7(x)   MM256_ROR_EPI32((x), 7)
#else
  #define ROTATE8  _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, \
                                   12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
  #define ROTATE16 _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, \
                                   13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
  #define ROT_256_16(x)  _mm256_shuffle_epi8((x), ROTATE16)
  #define ROT_256_12(x)  _mm256_or_si256(_mm256_srli_epi32((x), 12), _mm256_slli_epi32((x), 20))
  #define ROT_256_8(x)   _mm256_shuffle_epi8((x), ROTATE8)
  #define ROT_256_7(x)   _mm256_or_si256(_mm256_srli_epi32((x), 7), _mm256_slli_epi32((x), 25))
#endif

#define D_ROT_256_7(dest)   dest = ROT_256_7(dest)
#define D_ROT_256_8(dest)   dest = ROT_256_8(dest)
#define D_ROT_256_12(dest)  dest = ROT_256_12(dest)
#define D_ROT_256_16(dest)  dest = ROT_256_16(dest)

#define LOAD_256(p)  _mm256_load_si256((const __m256i *)(const void *)(p))
#ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
  #define STOREU_256(p, r)  _mm256_storeu_si256((__m256i *)(void *)(p), r)
  #define LOAD_256_FROM_STRUCT(p)     LOADU_256(p)
  #define STORE_256_TO_STRUCT(p, r)   STOREU_256(p, r)
#else
  // if struct is aligned for 32-bytes
  #define STORE_256(p, r)  _mm256_store_si256((__m256i *)(void *)(p), r)
  #define LOAD_256_FROM_STRUCT(p)     LOAD_256(p)
  #define STORE_256_TO_STRUCT(p, r)   STORE_256(p, r)
#endif
#endif // Z7_BLAKE2S_USE_AVX2_FAST


#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW

#if 0
#define DIAG_PERM2(s) \
  { \
    const __m256i a = LOAD_256_FROM_STRUCT((s)      ); \
    const __m256i b = LOAD_256_FROM_STRUCT((s) + NSW); \
    STORE_256_TO_STRUCT((s      ), _mm256_permute2x128_si256(a, b, 0x20)); \
    STORE_256_TO_STRUCT((s + NSW), _mm256_permute2x128_si256(a, b, 0x31)); \
  }
#else
#define DIAG_PERM2(s) \
  { \
    const __m128i a = LOAD_128_FROM_STRUCT((s) + 4); \
    const __m128i b = LOAD_128_FROM_STRUCT((s) + NSW); \
    STORE_128_TO_STRUCT((s) + NSW, a); \
    STORE_128_TO_STRUCT((s) + 4  , b); \
  }
#endif

#define DIAG_PERM8(s_items) \
  { \
    DIAG_PERM2(s_items) \
    DIAG_PERM2(s_items + NSW * 2) \
    DIAG_PERM2(s_items + NSW * 4) \
    DIAG_PERM2(s_items + NSW * 6) \
  }

#define AXR256(a, b, d, shift) \
    D_ADD_256(a, b); \
    D_XOR_256(d, a); \
    d = MM256_ROR_EPI32(d, shift); \


#ifdef Z7_BLAKE2S_USE_GATHER

#define TABLE_GATHER_256_4(a0,a1,a2,a3) \
    a0,a1,a2,a3, a0+16,a1+16,a2+16,a3+16
#define TABLE_GATHER_256( \
    a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
  { TABLE_GATHER_256_4(a0,a2,a4,a6), \
    TABLE_GATHER_256_4(a1,a3,a5,a7), \
    TABLE_GATHER_256_4(a8,a10,a12,a14), \
    TABLE_GATHER_256_4(a9,a11,a13,a15) }
MY_ALIGN(64)
static const UInt32 k_Blake2s_Sigma_gather256[BLAKE2S_NUM_ROUNDS][16 * 2] =
  { SIGMA_TABLE(TABLE_GATHER_256) };
#define GET_SIGMA(r) \
    const UInt32 * const sigma = k_Blake2s_Sigma_gather256[r];
#define AXR2_LOAD_INDEXES_AVX(sigma_index) \
    const __m256i i01234567 = LOAD_256(sigma + (sigma_index));
#define SET_ROW_FROM_SIGMA_AVX(in) \
    _mm256_i32gather_epi32((const void *)(in), i01234567, 4)
#define SIGMA_INTERLEAVE     8
#define SIGMA_HALF_ROW_SIZE  16

#else // !Z7_BLAKE2S_USE_GATHER

#define GET_SIGMA(r) \
    const Byte * const sigma = k_Blake2s_Sigma_4[r];
#define AXR2_LOAD_INDEXES_AVX(sigma_index) \
    AXR2_LOAD_INDEXES(sigma_index)
#define SET_ROW_FROM_SIGMA_AVX(in) \
    MY_mm256_set_m128i( \
        SET_ROW_FROM_SIGMA_W((in) + Z7_BLAKE2S_BLOCK_SIZE), \
        SET_ROW_FROM_SIGMA_W(in))
#define SIGMA_INTERLEAVE     1
#define SIGMA_HALF_ROW_SIZE  8
#endif // !Z7_BLAKE2S_USE_GATHER


#define ROTATE_WORDS_TO_RIGHT_256(a, n) \
    a = _mm256_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));


#ifdef Z7_BLAKE2S_USE_AVX2_WAY2

#define AXR2_A(sigma_index, shift1, shift2) \
    AXR2_LOAD_INDEXES_AVX(sigma_index) \
    D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
    AXR256(a0, b0, d0, shift1) \
    AXR256(c0, d0, b0, shift2) \

#define AXR4_A(sigma_index) \
    { AXR2_A(sigma_index, 16, 12) } \
    { AXR2_A(sigma_index + SIGMA_INTERLEAVE, 8, 7) }

#define EE1(r) \
  { GET_SIGMA(r) \
    AXR4_A(0) \
      ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
      ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
      ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
    AXR4_A(SIGMA_HALF_ROW_SIZE) \
      ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
      ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
      ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
  }

static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_AVX2
  BLAKE2S_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_AVX2_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
{
  size_t pos = 0;
  end -= Z7_BLAKE2S_BLOCK_SIZE;

  if (data != end)
  {
    LOAD_ROTATE_CONSTS_256
    DIAG_PERM8(s_items)
    do
    {
      UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
      __m256i a0, b0, c0, d0;
      {
        const __m128i inc = k_inc;
        __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
        __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
        D_ADD_EPI64_128(d0_128, inc);
        D_ADD_EPI64_128(d1_128, inc);
        STORE_128_TO_STRUCT (STATE_T(s      ), d0_128);
        STORE_128_TO_STRUCT (STATE_T(s + NSW), d1_128);
        d0 = MY_mm256_set_m128i(d1_128, d0_128);
        D_XOR_256(d0, k_iv4_256);
      }
      c0 = SET_FROM_128(k_iv0_128);
      a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
      b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);

      ROUNDS_LOOP (EE1)

      D_XOR_256(a0, c0);
      D_XOR_256(b0, d0);
      D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
      D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
      STORE_256_TO_STRUCT(s + NSW * 0, a0);
      STORE_256_TO_STRUCT(s + NSW * 1, b0);

      data += Z7_BLAKE2S_BLOCK_SIZE * 2;
      pos  += Z7_BLAKE2S_BLOCK_SIZE * 2;
      pos &= SUPER_BLOCK_MASK;
    }
    while (data < end);
    DIAG_PERM8(s_items)
    if (data != end)
      return;
  }
  {
    UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
    Z7_BLAKE2S_CompressSingleBlock(s, data);
  }
}
#endif // Z7_BLAKE2S_USE_AVX2_WAY2


#ifdef Z7_BLAKE2S_USE_AVX2_WAY4

#define AXR2_X(sigma_index, shift1, shift2) \
    AXR2_LOAD_INDEXES_AVX(sigma_index) \
    D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
    D_ADD_256( a1, SET_ROW_FROM_SIGMA_AVX((data) + Z7_BLAKE2S_BLOCK_SIZE * 2)); \
    AXR256(a0, b0, d0, shift1) \
    AXR256(a1, b1, d1, shift1) \
    AXR256(c0, d0, b0, shift2) \
    AXR256(c1, d1, b1, shift2) \

#define AXR4_X(sigma_index) \
    { AXR2_X(sigma_index, 16, 12) } \
    { AXR2_X(sigma_index + SIGMA_INTERLEAVE, 8, 7) }

#define EE2(r) \
  { GET_SIGMA(r) \
    AXR4_X(0) \
      ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
      ROTATE_WORDS_TO_RIGHT_256(b1, 1) \
      ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
      ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
      ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
      ROTATE_WORDS_TO_RIGHT_256(d1, 3) \
    AXR4_X(SIGMA_HALF_ROW_SIZE) \
      ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
      ROTATE_WORDS_TO_RIGHT_256(b1, 3) \
      ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
      ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
      ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
      ROTATE_WORDS_TO_RIGHT_256(d1, 1) \
  }

static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_AVX2
  BLAKE2S_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_AVX2_Way4(UInt32 *s_items, const Byte *data, const Byte *end)
{
  size_t pos = 0;

  if ((size_t)(end - data) >= Z7_BLAKE2S_BLOCK_SIZE * 4)
  {
#ifndef Z7_MM256_ROR_EPI32_IS_SUPPORTED
    const __m256i  r8 = k_r8_256;
    const __m256i r16 = k_r16_256;
#endif
    end -= Z7_BLAKE2S_BLOCK_SIZE * 3;
    DIAG_PERM8(s_items)
    do
    {
      UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
      __m256i a0, b0, c0, d0;
      __m256i a1, b1, c1, d1;
      {
        const __m128i inc = k_inc;
        __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
        __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
        __m128i d2_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 2));
        __m128i d3_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 3));
        D_ADD_EPI64_128(d0_128, inc);
        D_ADD_EPI64_128(d1_128, inc);
        D_ADD_EPI64_128(d2_128, inc);
        D_ADD_EPI64_128(d3_128, inc);
        STORE_128_TO_STRUCT (STATE_T(s          ), d0_128);
        STORE_128_TO_STRUCT (STATE_T(s + NSW * 1), d1_128);
        STORE_128_TO_STRUCT (STATE_T(s + NSW * 2), d2_128);
        STORE_128_TO_STRUCT (STATE_T(s + NSW * 3), d3_128);
        d0 = MY_mm256_set_m128i(d1_128, d0_128);
        d1 = MY_mm256_set_m128i(d3_128, d2_128);
        D_XOR_256(d0, k_iv4_256);
        D_XOR_256(d1, k_iv4_256);
      }
      c1 = c0 = SET_FROM_128(k_iv0_128);
      a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
      b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);
      a1 = LOAD_256_FROM_STRUCT(s + NSW * 2);
      b1 = LOAD_256_FROM_STRUCT(s + NSW * 3);

      ROUNDS_LOOP (EE2)

      D_XOR_256(a0, c0);
      D_XOR_256(b0, d0);
      D_XOR_256(a1, c1);
      D_XOR_256(b1, d1);
      D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
      D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
      D_XOR_256(a1, LOAD_256_FROM_STRUCT(s + NSW * 2));
      D_XOR_256(b1, LOAD_256_FROM_STRUCT(s + NSW * 3));
      STORE_256_TO_STRUCT(s + NSW * 0, a0);
      STORE_256_TO_STRUCT(s + NSW * 1, b0);
      STORE_256_TO_STRUCT(s + NSW * 2, a1);
      STORE_256_TO_STRUCT(s + NSW * 3, b1);

      data += Z7_BLAKE2S_BLOCK_SIZE * 4;
      pos  += Z7_BLAKE2S_BLOCK_SIZE * 4;
      pos &= SUPER_BLOCK_MASK;
    }
    while (data < end);
    DIAG_PERM8(s_items)
    end += Z7_BLAKE2S_BLOCK_SIZE * 3;
  }
  if (data == end)
    return;
  // Z7_BLAKE2S_Compress2_V128(s_items, data, end, pos);
  do
  {
    UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
    Z7_BLAKE2S_CompressSingleBlock(s, data);
    data += Z7_BLAKE2S_BLOCK_SIZE;
    pos  += Z7_BLAKE2S_BLOCK_SIZE;
    pos &= SUPER_BLOCK_MASK;
  }
  while (data != end);
}
#endif // Z7_BLAKE2S_USE_AVX2_WAY4
#endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW


// ---------------------------------------------------------

#ifdef Z7_BLAKE2S_USE_AVX2_FAST

#define OP256_L(a, i)  D_ADD_256 (V(a, 0), \
    LOAD_256((const Byte *)(w) + GET_SIGMA_VAL_256(2*(a)+(i))));

#define OP256_0(a)   OP256_L(a, 0)
#define OP256_7(a)   OP256_L(a, 1)

#define OP256_1(a)   D_ADD_256 (V(a, 0), V(a, 1));
#define OP256_2(a)   D_XOR_256 (V(a, 3), V(a, 0));
#define OP256_4(a)   D_ADD_256 (V(a, 2), V(a, 3));
#define OP256_5(a)   D_XOR_256 (V(a, 1), V(a, 2));

#define OP256_3(a)   D_ROT_256_16 (V(a, 3));
#define OP256_6(a)   D_ROT_256_12 (V(a, 1));
#define OP256_8(a)   D_ROT_256_8  (V(a, 3));
#define OP256_9(a)   D_ROT_256_7  (V(a, 1));


#if 0 || 1 && defined(MY_CPU_X86)

#define V8_G(a) \
  OP256_0 (a) \
  OP256_1 (a) \
  OP256_2 (a) \
  OP256_3 (a) \
  OP256_4 (a) \
  OP256_5 (a) \
  OP256_6 (a) \
  OP256_7 (a) \
  OP256_1 (a) \
  OP256_2 (a) \
  OP256_8 (a) \
  OP256_4 (a) \
  OP256_5 (a) \
  OP256_9 (a) \

#define V8R { \
  V8_G (0); \
  V8_G (1); \
  V8_G (2); \
  V8_G (3); \
  V8_G (4); \
  V8_G (5); \
  V8_G (6); \
  V8_G (7); \
}

#else

#define OP256_INTER_4(op, a,b,c,d) \
  op (a) \
  op (b) \
  op (c) \
  op (d) \

#define V8_G(a,b,c,d) \
  OP256_INTER_4 (OP256_0, a,b,c,d) \
  OP256_INTER_4 (OP256_1, a,b,c,d) \
  OP256_INTER_4 (OP256_2, a,b,c,d) \
  OP256_INTER_4 (OP256_3, a,b,c,d) \
  OP256_INTER_4 (OP256_4, a,b,c,d) \
  OP256_INTER_4 (OP256_5, a,b,c,d) \
  OP256_INTER_4 (OP256_6, a,b,c,d) \
  OP256_INTER_4 (OP256_7, a,b,c,d) \
  OP256_INTER_4 (OP256_1, a,b,c,d) \
  OP256_INTER_4 (OP256_2, a,b,c,d) \
  OP256_INTER_4 (OP256_8, a,b,c,d) \
  OP256_INTER_4 (OP256_4, a,b,c,d) \
  OP256_INTER_4 (OP256_5, a,b,c,d) \
  OP256_INTER_4 (OP256_9, a,b,c,d) \

#define V8R { \
  V8_G (0, 1, 2, 3) \
  V8_G (4, 5, 6, 7) \
}
#endif

#define V8_ROUND(r)  { GET_SIGMA_PTR_256(r);  V8R }


// for debug:
// #define Z7_BLAKE2S_PERMUTE_WITH_GATHER
#if defined(Z7_BLAKE2S_PERMUTE_WITH_GATHER)
// gather instruction is slow.
#define V8_LOAD_MSG(w, m) \
  { \
    unsigned i; \
    for (i = 0; i < 16; ++i) { \
      w[i] = _mm256_i32gather_epi32( \
          (const void *)((m) + i * sizeof(UInt32)),\
          _mm256_set_epi32(0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00), \
          sizeof(UInt32)); \
    } \
  }
#else // !Z7_BLAKE2S_PERMUTE_WITH_GATHER

#define V8_LOAD_MSG_2(w, a0, a1) \
  { \
    (w)[0] = _mm256_permute2x128_si256(a0, a1, 0x20); \
    (w)[4] = _mm256_permute2x128_si256(a0, a1, 0x31); \
  }

#define V8_LOAD_MSG_4(w, z0, z1, z2, z3) \
  { \
    __m256i s0, s1, s2, s3; \
    s0 = _mm256_unpacklo_epi64(z0, z1); \
    s1 = _mm256_unpackhi_epi64(z0, z1); \
    s2 = _mm256_unpacklo_epi64(z2, z3); \
    s3 = _mm256_unpackhi_epi64(z2, z3); \
    V8_LOAD_MSG_2((w) + 0, s0, s2) \
    V8_LOAD_MSG_2((w) + 1, s1, s3) \
  }

#define V8_LOAD_MSG_0(t0, t1, m) \
  { \
    __m256i m0, m1; \
    m0 = LOADU_256(m); \
    m1 = LOADU_256((m) + 2 * 32); \
    t0 = _mm256_unpacklo_epi32(m0, m1); \
    t1 = _mm256_unpackhi_epi32(m0, m1); \
  }

#define V8_LOAD_MSG_8(w, m) \
  { \
    __m256i t0, t1, t2, t3, t4, t5, t6, t7; \
    V8_LOAD_MSG_0(t0, t4, (m) + 0 * 4 * 32) \
    V8_LOAD_MSG_0(t1, t5, (m) + 1 * 4 * 32) \
    V8_LOAD_MSG_0(t2, t6, (m) + 2 * 4 * 32) \
    V8_LOAD_MSG_0(t3, t7, (m) + 3 * 4 * 32) \
    V8_LOAD_MSG_4((w)    , t0, t1, t2, t3) \
    V8_LOAD_MSG_4((w) + 2, t4, t5, t6, t7) \
  }

#define V8_LOAD_MSG(w, m) \
  { \
    V8_LOAD_MSG_8(w, m) \
    V8_LOAD_MSG_8((w) + 8, (m) + 32) \
  }

#endif // !Z7_BLAKE2S_PERMUTE_WITH_GATHER


#define V8_PERM_PAIR_STORE(u, a0, a2) \
  { \
    STORE_256_TO_STRUCT((u),     _mm256_permute2x128_si256(a0, a2, 0x20)); \
    STORE_256_TO_STRUCT((u) + 8, _mm256_permute2x128_si256(a0, a2, 0x31)); \
  }

#define V8_UNPACK_STORE_4(u, z0, z1, z2, z3) \
  { \
    __m256i s0, s1, s2, s3; \
    s0 = _mm256_unpacklo_epi64(z0, z1); \
    s1 = _mm256_unpackhi_epi64(z0, z1); \
    s2 = _mm256_unpacklo_epi64(z2, z3); \
    s3 = _mm256_unpackhi_epi64(z2, z3); \
    V8_PERM_PAIR_STORE(u + 0, s0, s2) \
    V8_PERM_PAIR_STORE(u + 2, s1, s3) \
  }

#define V8_UNPACK_STORE_0(src32, d0, d1) \
  { \
    const __m256i v0 = LOAD_256_FROM_STRUCT ((src32)    ); \
    const __m256i v1 = LOAD_256_FROM_STRUCT ((src32) + 8); \
    d0 = _mm256_unpacklo_epi32(v0, v1); \
    d1 = _mm256_unpackhi_epi32(v0, v1); \
  }

#define V8_UNPACK_STATE(dest32, src32) \
  { \
    __m256i t0, t1, t2, t3, t4, t5, t6, t7; \
    V8_UNPACK_STORE_0 ((src32) + 16 * 0, t0, t4) \
    V8_UNPACK_STORE_0 ((src32) + 16 * 1, t1, t5) \
    V8_UNPACK_STORE_0 ((src32) + 16 * 2, t2, t6) \
    V8_UNPACK_STORE_0 ((src32) + 16 * 3, t3, t7) \
    V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32)    , t0, t1, t2, t3) \
    V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) + 4, t4, t5, t6, t7) \
  }


#define V8_LOAD_STATE_256_FROM_STRUCT(i) \
    v[i] = LOAD_256_FROM_STRUCT(s_items + (i) * 8);

#if 0 || 0 && defined(MY_CPU_X86)
  #define Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
#endif

#ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
// this branch doesn't use (iv) array
// so register pressure can be lower.
// it can be faster sometimes
#define V8_LOAD_STATE_256(i)    V8_LOAD_STATE_256_FROM_STRUCT(i)
#define V8_UPDATE_STATE_256(i) \
  { \
    STORE_256_TO_STRUCT(s_items + (i) * 8, XOR_256( \
        XOR_256(v[i], v[(i) + 8]), \
        LOAD_256_FROM_STRUCT(s_items + (i) * 8))); \
  }
#else
// it uses more variables (iv) registers
// it's better for gcc
// maybe that branch is better, if register pressure is lower (avx512)
#define V8_LOAD_STATE_256(i)    { iv[i] = v[i]; }
#define V8_UPDATE_STATE_256(i)  { v[i] = XOR_256(XOR_256(v[i], v[i + 8]), iv[i]); }
#define V8_STORE_STATE_256(i)   { STORE_256_TO_STRUCT(s_items + (i) * 8, v[i]); }
#endif


#if 0
  // use loading constants from memory
  #define KK8(n)  KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n)
MY_ALIGN(64)
static const UInt32 k_Blake2s_IV_WAY8[] =
{
  KK8(0), KK8(1), KK8(2), KK8(3), KK8(4), KK8(5), KK8(6), KK8(7)
};
  #define GET_256_IV_WAY8(i)  LOAD_256(k_Blake2s_IV_WAY8 + 8 * (i))
#else
  // use constant generation:
  #define GET_256_IV_WAY8(i)  _mm256_set1_epi32((Int32)KIV(i))
#endif


static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_AVX2
  BLAKE2S_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_AVX2_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
{
#ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
  __m256i v[16];
#endif

  // PrintStates2(s_items, 8, 16);

#ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
  REP8_MACRO (V8_LOAD_STATE_256_FROM_STRUCT)
#endif

  do
  {
    __m256i w[16];
#ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
    __m256i v[16];
#else
    __m256i iv[8];
#endif
    V8_LOAD_MSG(w, data)
    {
      // we use load/store ctr inside loop to reduce register pressure:
#if 1 || 1 && defined(MY_CPU_X86)
      const __m256i ctr = _mm256_add_epi64(
          LOAD_256_FROM_STRUCT(s_items + 64),
          _mm256_set_epi32(
              0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE,
              0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE));
      STORE_256_TO_STRUCT(s_items + 64, ctr);
#else
      const UInt64 ctr64 = *(const UInt64 *)(const void *)(s_items + 64)
          + Z7_BLAKE2S_BLOCK_SIZE;
      const __m256i ctr = _mm256_set_epi64x(0, (Int64)ctr64, 0, (Int64)ctr64);
      *(UInt64 *)(void *)(s_items + 64) = ctr64;
#endif
      v[12] = XOR_256 (GET_256_IV_WAY8(4), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
      v[13] = XOR_256 (GET_256_IV_WAY8(5), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
    }
    v[ 8] = GET_256_IV_WAY8(0);
    v[ 9] = GET_256_IV_WAY8(1);
    v[10] = GET_256_IV_WAY8(2);
    v[11] = GET_256_IV_WAY8(3);
    v[14] = GET_256_IV_WAY8(6);
    v[15] = GET_256_IV_WAY8(7);

    REP8_MACRO (V8_LOAD_STATE_256)
    ROUNDS_LOOP (V8_ROUND)
    REP8_MACRO (V8_UPDATE_STATE_256)

    data += SUPER_BLOCK_SIZE;
  }
  while (data != end);

#ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
  REP8_MACRO (V8_STORE_STATE_256)
#endif
}


static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_AVX2
  BLAKE2S_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
Blake2sp_Final_AVX2_Fast(UInt32 *states)
{
  const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
  // PrintStates2(states, 8, 16);
  V8_UNPACK_STATE(states, states)
  // PrintStates2(states, 8, 16);
  {
    unsigned k;
    for (k = 0; k < 8; k++)
    {
      UInt32 *s = states + (size_t)k * 16;
      STORE_128_TO_STRUCT (STATE_T(s), ctr);
    }
  }
  // PrintStates2(states, 8, 16);
  // printf("\nafter V8_UNPACK_STATE \n");
}

#endif // Z7_BLAKE2S_USE_AVX2_FAST
#endif // avx2
#endif // vector


/*
#define Blake2s_Increment_Counter(s, inc) \
  { STATE_T(s)[0] += (inc);  STATE_T(s)[1] += (STATE_T(s)[0] < (inc)); }
#define Blake2s_Increment_Counter_Small(s, inc) \
  { STATE_T(s)[0] += (inc); }
*/

#define Blake2s_Set_LastBlock(s) \
  { STATE_F(s)[0] = BLAKE2S_FINAL_FLAG; /* STATE_F(s)[1] = p->u.header.lastNode_f1; */ }
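/* Illustrative reference (not part of the original sources): STATE_T(s)[0..1] form
   a 64-bit byte counter split into two 32-bit words. The commented
   Blake2s_Increment_Counter macro above and the inlined code in Blake2s_Compress
   below both carry into the high word when the low word wraps; for example, if
   STATE_T(s)[0] == 0xFFFFFFC0 and inc == 64, the low word wraps to 0 and the high
   word increases by 1: */
#if 0
static void Blake2s_IncrementCounter_Reference(UInt32 *s, UInt32 inc)
{
  const UInt32 t0 = STATE_T(s)[0] + inc;
  STATE_T(s)[1] += (t0 < inc);   // carry: the low word wrapped around
  STATE_T(s)[0] = t0;
}
#endif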
#if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL >= 1600
  // good for vs2022
  #define LOOP_8(mac)  { unsigned kkk; for (kkk = 0; kkk < 8; kkk++) mac(kkk) }
#else
  // good for Z7_BLAKE2S_UNROLL for GCC9 (arm*/x86*) and MSC_VER_1400-x64.
  #define LOOP_8(mac)  { REP8_MACRO(mac) }
#endif


static
Z7_FORCE_INLINE
// Z7_NO_INLINE
void
Z7_FASTCALL
Blake2s_Compress(UInt32 *s, const Byte *input)
{
  UInt32 m[16];
  UInt32 v[16];
  {
    unsigned i;
    for (i = 0; i < 16; i++)
      m[i] = GetUi32(input + i * 4);
  }

#define INIT_v_FROM_s(i)  v[i] = s[i];
  LOOP_8(INIT_v_FROM_s)

  // Blake2s_Increment_Counter(s, Z7_BLAKE2S_BLOCK_SIZE)
  {
    const UInt32 t0 = STATE_T(s)[0] + Z7_BLAKE2S_BLOCK_SIZE;
    const UInt32 t1 = STATE_T(s)[1] + (t0 < Z7_BLAKE2S_BLOCK_SIZE);
    STATE_T(s)[0] = t0;
    STATE_T(s)[1] = t1;
    v[12] = t0 ^ KIV(4);
    v[13] = t1 ^ KIV(5);
  }
  // v[12] = STATE_T(s)[0] ^ KIV(4);
  // v[13] = STATE_T(s)[1] ^ KIV(5);
  v[14] = STATE_F(s)[0] ^ KIV(6);
  v[15] = STATE_F(s)[1] ^ KIV(7);

  v[ 8] = KIV(0);
  v[ 9] = KIV(1);
  v[10] = KIV(2);
  v[11] = KIV(3);
  // PrintStates2((const UInt32 *)v, 1, 16);

#define ADD_SIGMA(a, index)  V(a, 0) += *(const UInt32 *)GET_SIGMA_PTR(m, sigma[index]);
#define ADD32M(dest, src, a)    V(a, dest) += V(a, src);
#define XOR32M(dest, src, a)    V(a, dest) ^= V(a, src);
#define RTR32M(dest, shift, a)  V(a, dest) = rotrFixed(V(a, dest), shift);

// big interleaving can provide a big performance gain, if scheduler queues are small.
#if 0 || 1 && defined(MY_CPU_X86)
  // interleave-1: for small register number (x86-32bit)

#define G2(index, a, x, y) \
  ADD_SIGMA (a, (index) + 2 * 0) \
  ADD32M (0, 1, a) \
  XOR32M (3, 0, a) \
  RTR32M (3, x, a) \
  ADD32M (2, 3, a) \
  XOR32M (1, 2, a) \
  RTR32M (1, y, a) \

#define G(a) \
  G2(a * 2    , a, 16, 12) \
  G2(a * 2 + 1, a,  8,  7) \

#define R2 \
  G(0) \
  G(1) \
  G(2) \
  G(3) \
  G(4) \
  G(5) \
  G(6) \
  G(7) \


#elif 0 || 1 && defined(MY_CPU_X86_OR_AMD64)
  // interleave-2: is good if the number of registers is not big (x86-64).

#define REP2(mac, dest, src, a, b) \
    mac(dest, src, a) \
    mac(dest, src, b)

#define G2(index, a, b, x, y) \
  ADD_SIGMA (a, (index) + 2 * 0) \
  ADD_SIGMA (b, (index) + 2 * 1) \
  REP2 (ADD32M, 0, 1, a, b) \
  REP2 (XOR32M, 3, 0, a, b) \
  REP2 (RTR32M, 3, x, a, b) \
  REP2 (ADD32M, 2, 3, a, b) \
  REP2 (XOR32M, 1, 2, a, b) \
  REP2 (RTR32M, 1, y, a, b) \

#define G(a, b) \
  G2(a * 2    , a, b, 16, 12) \
  G2(a * 2 + 1, a, b,  8,  7) \

#define R2 \
  G(0, 1) \
  G(2, 3) \
  G(4, 5) \
  G(6, 7) \


#else
  // interleave-4:
  // it has big register pressure for x86/x64.
  // and MSVC compilers for x86/x64 are slow for this branch.
  // but if we have big number of registers, this branch can be faster.

#define REP4(mac, dest, src, a, b, c, d) \
    mac(dest, src, a) \
    mac(dest, src, b) \
    mac(dest, src, c) \
    mac(dest, src, d)

#define G2(index, a, b, c, d, x, y) \
  ADD_SIGMA (a, (index) + 2 * 0) \
  ADD_SIGMA (b, (index) + 2 * 1) \
  ADD_SIGMA (c, (index) + 2 * 2) \
  ADD_SIGMA (d, (index) + 2 * 3) \
  REP4 (ADD32M, 0, 1, a, b, c, d) \
  REP4 (XOR32M, 3, 0, a, b, c, d) \
  REP4 (RTR32M, 3, x, a, b, c, d) \
  REP4 (ADD32M, 2, 3, a, b, c, d) \
  REP4 (XOR32M, 1, 2, a, b, c, d) \
  REP4 (RTR32M, 1, y, a, b, c, d) \

#define G(a, b, c, d) \
  G2(a * 2    , a, b, c, d, 16, 12) \
  G2(a * 2 + 1, a, b, c, d,  8,  7) \

#define R2 \
  G(0, 1, 2, 3) \
  G(4, 5, 6, 7) \


#endif

#define R(r)  { const Byte *sigma = k_Blake2s_Sigma_4[r];  R2 }

  // Z7_BLAKE2S_UNROLL gives 5-6 KB larger code, but faster:
  //   20-40% faster for (x86/x64) VC2010+/GCC/CLANG.
  //   30-60% faster for (arm64-arm32) GCC.
  //    5-11% faster for (arm64) CLANG-MAC.
  // so Z7_BLAKE2S_UNROLL is a good optimization, if there is no vector branch.
  // But if there is a vector branch (for x86*), this scalar code will mostly be unused.
  // So we want smaller code (without unrolling) in that case (x86*).
#if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
  #define Z7_BLAKE2S_UNROLL
#endif

#ifdef Z7_BLAKE2S_UNROLL
  ROUNDS_LOOP_UNROLLED (R)
#else
  ROUNDS_LOOP (R)
#endif

#undef G
#undef G2
#undef R
#undef R2

  // printf("\n v after: \n");
  // PrintStates2((const UInt32 *)v, 1, 16);
#define XOR_s_PAIR_v(i)  s[i] ^= v[i] ^ v[i + 8];
  LOOP_8(XOR_s_PAIR_v)
  // printf("\n s after:\n");
  // PrintStates2((const UInt32 *)s, 1, 16);
}


static
Z7_NO_INLINE
void
Z7_FASTCALL
Blake2sp_Compress2(UInt32 *s_items, const Byte *data, const Byte *end)
{
  size_t pos = 0;
  // PrintStates2(s_items, 8, 16);
  do
  {
    UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
    Blake2s_Compress(s, data);
    data += Z7_BLAKE2S_BLOCK_SIZE;
    pos  += Z7_BLAKE2S_BLOCK_SIZE;
    pos &= SUPER_BLOCK_MASK;
  }
  while (data != end);
}


#ifdef Z7_BLAKE2S_USE_VECTORS

static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast   = Blake2sp_Compress2;
static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = Blake2sp_Compress2;
static Z7_BLAKE2SP_FUNC_INIT     g_Z7_BLAKE2SP_FUNC_INIT_Init;
static Z7_BLAKE2SP_FUNC_INIT     g_Z7_BLAKE2SP_FUNC_INIT_Final;
static unsigned g_z7_Blake2sp_SupportedFlags;

  #define Z7_BLAKE2SP_Compress_Fast(p)    (p)->u.header.func_Compress_Fast
  #define Z7_BLAKE2SP_Compress_Single(p)  (p)->u.header.func_Compress_Single
#else
  #define Z7_BLAKE2SP_Compress_Fast(p)    Blake2sp_Compress2
  #define Z7_BLAKE2SP_Compress_Single(p)  Blake2sp_Compress2
#endif // Z7_BLAKE2S_USE_VECTORS


#if 1 && defined(MY_CPU_LE)
  #define GET_DIGEST(_s, _digest) \
    { memcpy(_digest, _s, Z7_BLAKE2S_DIGEST_SIZE); }
#else
  #define GET_DIGEST(_s, _digest) \
    { unsigned _i; for (_i = 0; _i < 8; _i++) \
      { SetUi32((_digest) + 4 * _i, (_s)[_i]) } \
    }
#endif


/* ---------- BLAKE2s ---------- */

/*
// we need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0()
typedef struct
{
  Byte digest_length;
  Byte key_length;
  Byte fanout;               // = 1 : in sequential mode
  Byte depth;                // = 1 : in sequential mode
  UInt32 leaf_length;
  Byte node_offset[6];       // 0 for the first, leftmost, leaf, or in sequential mode
  Byte node_depth;           // 0 for the leaves, or in sequential mode
  Byte inner_length;         // [0, 32], 0 in sequential mode
  Byte salt[BLAKE2S_SALTBYTES];
  Byte personal[BLAKE2S_PERSONALBYTES];
} CBlake2sParam;
*/

#define k_Blake2sp_IV_0 \
    (KIV(0) ^ (Z7_BLAKE2S_DIGEST_SIZE | ((UInt32)Z7_BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24)))
#define k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth) \
    (KIV(3) ^ ((UInt32)(node_depth) << 16) ^ ((UInt32)Z7_BLAKE2S_DIGEST_SIZE << 24))

Z7_FORCE_INLINE
static void Blake2sp_Init_Spec(UInt32 *s, unsigned node_offset, unsigned node_depth)
{
  s[0] = k_Blake2sp_IV_0;
  s[1] = KIV(1);
  s[2] = KIV(2) ^ (UInt32)node_offset;
  s[3] = k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth);
  s[4] = KIV(4);
  s[5] = KIV(5);
  s[6] = KIV(6);
  s[7] = KIV(7);
  STATE_T(s)[0] = 0;
  STATE_T(s)[1] = 0;
  STATE_F(s)[0] = 0;
  STATE_F(s)[1] = 0;
}
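/* Worked example (illustrative, not from the original sources): assuming
   Z7_BLAKE2S_DIGEST_SIZE == 32 and Z7_BLAKE2SP_PARALLEL_DEGREE == 8, the packed
   parameter word in k_Blake2sp_IV_0 is
       32 | (8 << 16) | (2 << 24) == 0x02080020
   (digest_length = 32, key_length = 0, fanout = 8, depth = 2, matching the
   CBlake2sParam layout shown above), so
       k_Blake2sp_IV_0 == 0x6A09E667 ^ 0x02080020 == 0x6801E647.
   Blake2sp_Init_Spec() also mixes node_offset into s[2], and node_depth plus
   inner_length (== Z7_BLAKE2S_DIGEST_SIZE) into s[3]. */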
#ifdef Z7_BLAKE2S_USE_V128_FAST

static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_128BIT
       BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2sp_InitState_V128_Fast(UInt32 *states)
{
#define STORE_128_PAIR_INIT_STATES_2(i, t0, t1) \
    { STORE_128_TO_STRUCT(states +  0 + 4 * (i), (t0)); \
      STORE_128_TO_STRUCT(states + 32 + 4 * (i), (t1)); \
    }
#define STORE_128_PAIR_INIT_STATES_1(i, mac) \
    { const __m128i t = mac; \
      STORE_128_PAIR_INIT_STATES_2(i, t, t) \
    }
#define STORE_128_PAIR_INIT_STATES_IV(i) \
    STORE_128_PAIR_INIT_STATES_1(i, GET_128_IV_WAY4(i))

  STORE_128_PAIR_INIT_STATES_1  (0, _mm_set1_epi32((Int32)k_Blake2sp_IV_0))
  STORE_128_PAIR_INIT_STATES_IV (1)
  {
    const __m128i t = GET_128_IV_WAY4(2);
    STORE_128_PAIR_INIT_STATES_2 (2,
        XOR_128(t, _mm_set_epi32(3, 2, 1, 0)),
        XOR_128(t, _mm_set_epi32(7, 6, 5, 4)))
  }
  STORE_128_PAIR_INIT_STATES_1  (3, _mm_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
  STORE_128_PAIR_INIT_STATES_IV (4)
  STORE_128_PAIR_INIT_STATES_IV (5)
  STORE_128_PAIR_INIT_STATES_IV (6)
  STORE_128_PAIR_INIT_STATES_IV (7)
  STORE_128_PAIR_INIT_STATES_1  (16, _mm_set_epi32(0, 0, 0, 0))
  // printf("\n== exit Blake2sp_InitState_V128_Fast ctr=%d\n", states[64]);
}

#endif // Z7_BLAKE2S_USE_V128_FAST


#ifdef Z7_BLAKE2S_USE_AVX2_FAST

static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_AVX2
       BLAKE2S_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
Blake2sp_InitState_AVX2_Fast(UInt32 *states)
{
#define STORE_256_INIT_STATES(i, t) \
    STORE_256_TO_STRUCT(states + 8 * (i), t);
#define STORE_256_INIT_STATES_IV(i) \
    STORE_256_INIT_STATES(i, GET_256_IV_WAY8(i))

  STORE_256_INIT_STATES    (0, _mm256_set1_epi32((Int32)k_Blake2sp_IV_0))
  STORE_256_INIT_STATES_IV (1)
  STORE_256_INIT_STATES    (2, XOR_256( GET_256_IV_WAY8(2),
                               _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)))
  STORE_256_INIT_STATES    (3, _mm256_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
  STORE_256_INIT_STATES_IV (4)
  STORE_256_INIT_STATES_IV (5)
  STORE_256_INIT_STATES_IV (6)
  STORE_256_INIT_STATES_IV (7)
  STORE_256_INIT_STATES    (8, _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0))
  // printf("\n== exit Blake2sp_InitState_AVX2_Fast\n");
}

#endif // Z7_BLAKE2S_USE_AVX2_FAST


Z7_NO_INLINE
void Blake2sp_InitState(CBlake2sp *p)
{
  size_t i;
  // memset(p->states, 0, sizeof(p->states)); // for debug
  p->u.header.cycPos = 0;
#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
  if (p->u.header.func_Init)
  {
    p->u.header.func_Init(p->states);
    return;
  }
#endif
  for (i = 0; i < Z7_BLAKE2SP_PARALLEL_DEGREE; i++)
    Blake2sp_Init_Spec(p->states + i * NSW, (unsigned)i, 0);
}

void Blake2sp_Init(CBlake2sp *p)
{
#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
  p->u.header.func_Compress_Fast =
#ifdef Z7_BLAKE2S_USE_VECTORS
      g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
#else
      NULL;
#endif

  p->u.header.func_Compress_Single =
#ifdef Z7_BLAKE2S_USE_VECTORS
      g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
#else
      NULL;
#endif

  p->u.header.func_Init =
#ifdef Z7_BLAKE2S_USE_VECTORS
      g_Z7_BLAKE2SP_FUNC_INIT_Init;
#else
      NULL;
#endif

  p->u.header.func_Final =
#ifdef Z7_BLAKE2S_USE_VECTORS
      g_Z7_BLAKE2SP_FUNC_INIT_Final;
#else
      NULL;
#endif
#endif

  Blake2sp_InitState(p);
}
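/*
  Illustrative note (not part of the build): Blake2sp_Update() below buffers input
  in p->buf32, which holds up to two "super-blocks" (one super-block is one 64-byte
  block for each of the 8 lanes; SUPER_BLOCK_SIZE and SUPER_BLOCK_MASK are defined
  earlier in this file, and the trace below assumes SUPER_BLOCK_SIZE == 512).
  p->u.header.cycPos is the number of buffered bytes, so 0 <= cycPos < 2 * SUPER_BLOCK_SIZE
  between calls.  A buffered super-block is compressed only when enough further
  input is available to guarantee that none of its blocks is the last block of its
  lane, because the last block of each lane needs special flags in Blake2sp_Final().

  Hypothetical trace (sizes chosen for illustration):

    Blake2sp_Update(p, data1, 300);   // cycPos   0 -> 300 : bytes are only buffered
    Blake2sp_Update(p, data2, 300);   // cycPos 300 -> 600 : still only buffered
                                      //   (the filled first super-block is kept,
                                      //    since it could hold some lane's last block)
    Blake2sp_Update(p, data3, 1000);  // both buffered super-blocks are compressed,
                                      //   then cycPos = 576 (the tail stays buffered)
    Blake2sp_Final(p, digest);        // sets the flags and compresses what remains
*/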
void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size)
{
  size_t pos;
  // printf("\nsize = 0x%6x, cycPos = %5u data = %p\n", (unsigned)size, (unsigned)p->u.header.cycPos, data);
  if (size == 0)
    return;
  pos = p->u.header.cycPos;
  // pos <  SUPER_BLOCK_SIZE * 2 : is expected
  // pos == SUPER_BLOCK_SIZE * 2 : is not expected, but it is also supported
  {
    const size_t pos2 = pos & SUPER_BLOCK_MASK;
    if (pos2)
    {
      const size_t rem = SUPER_BLOCK_SIZE - pos2;
      if (rem > size)
      {
        p->u.header.cycPos = (unsigned)(pos + size);
        // cycPos < SUPER_BLOCK_SIZE * 2
        memcpy((Byte *)(void *)p->buf32 + pos, data, size);
        /* to simplify the code here, we don't try to process the first super-block,
           if (cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) */
        return;
      }
      // (rem <= size)
      memcpy((Byte *)(void *)p->buf32 + pos, data, rem);
      pos  += rem;
      data += rem;
      size -= rem;
    }
  }

  // pos <= SUPER_BLOCK_SIZE * 2
  // pos %  SUPER_BLOCK_SIZE == 0
  if (pos)
  {
    /* pos == SUPER_BLOCK_SIZE ||
       pos == SUPER_BLOCK_SIZE * 2 */
    size_t end = pos;
    if (size > SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE
        || (end -= SUPER_BLOCK_SIZE))
    {
      Z7_BLAKE2SP_Compress_Fast(p)(p->states,
          (const Byte *)(const void *)p->buf32,
          (const Byte *)(const void *)p->buf32 + end);
      if (pos -= end)
        memcpy(p->buf32, (const Byte *)(const void *)p->buf32 + SUPER_BLOCK_SIZE, SUPER_BLOCK_SIZE);
    }
  }

  // pos == 0 || (pos == SUPER_BLOCK_SIZE && size <= SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE)
  if (size > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
  {
    // pos == 0
    const Byte *end;
    const size_t size2 = (size - (SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE + 1))
        & ~(size_t)SUPER_BLOCK_MASK;
    size -= size2;
    // size < SUPER_BLOCK_SIZE * 2
    end = data + size2;
    Z7_BLAKE2SP_Compress_Fast(p)(p->states, data, end);
    data = end;
  }

  if (size != 0)
  {
    memcpy((Byte *)(void *)p->buf32 + pos, data, size);
    pos += size;
  }
  p->u.header.cycPos = (unsigned)pos;
  // cycPos < SUPER_BLOCK_SIZE * 2
}


void Blake2sp_Final(CBlake2sp *p, Byte *digest)
{
  // UInt32 * const R_states = p->states;
  // printf("\nBlake2sp_Final \n");
#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
  if (p->u.header.func_Final)
    p->u.header.func_Final(p->states);
#endif
  // printf("\n=====\nBlake2sp_Final \n");
  // PrintStates(p->states, 32);

  // (p->u.header.cycPos == SUPER_BLOCK_SIZE) can be processed in any branch:
  if (p->u.header.cycPos <= SUPER_BLOCK_SIZE)
  {
    unsigned pos;
    memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, 0,
        SUPER_BLOCK_SIZE - p->u.header.cycPos);
    STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
    for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
    {
      UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
      Blake2s_Set_LastBlock(s)
      if (pos + Z7_BLAKE2S_BLOCK_SIZE > p->u.header.cycPos)
      {
        UInt32 delta = Z7_BLAKE2S_BLOCK_SIZE;
        if (pos < p->u.header.cycPos)
          delta -= p->u.header.cycPos & (Z7_BLAKE2S_BLOCK_SIZE - 1);
        // 0 < delta <= Z7_BLAKE2S_BLOCK_SIZE
        {
          const UInt32 v = STATE_T(s)[0];
          STATE_T(s)[1] -= v < delta;  // (v < delta) is the same condition here as (v == 0)
          STATE_T(s)[0] = v - delta;
        }
      }
    }
    // PrintStates(p->states, 16);
    Z7_BLAKE2SP_Compress_Single(p)(p->states,
        (Byte *)(void *)p->buf32,
        (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);
    // PrintStates(p->states, 16);
  }
  else
  {
    // (p->u.header.cycPos > SUPER_BLOCK_SIZE)
    unsigned pos;
    for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
    {
      UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
      if (pos + SUPER_BLOCK_SIZE >= p->u.header.cycPos)
        Blake2s_Set_LastBlock(s)
    }
    if (p->u.header.cycPos <= SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
      STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;

    Z7_BLAKE2SP_Compress_Single(p)(p->states,
        (Byte *)(void *)p->buf32,
        (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);

    // if (p->u.header.cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
      STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;

    // if (p->u.header.cycPos != SUPER_BLOCK_SIZE)
    {
      pos = SUPER_BLOCK_SIZE;
      for (;;)
      {
        UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos & SUPER_BLOCK_MASK);
        Blake2s_Set_LastBlock(s)
        pos += Z7_BLAKE2S_BLOCK_SIZE;
        if (pos >= p->u.header.cycPos)
        {
          if (pos != p->u.header.cycPos)
          {
            const UInt32 delta = pos - p->u.header.cycPos;
            const UInt32 v = STATE_T(s)[0];
            STATE_T(s)[1] -= v < delta;
            STATE_T(s)[0] = v - delta;
            memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, 0, delta);
          }
          break;
        }
      }
      Z7_BLAKE2SP_Compress_Single(p)(p->states,
          (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE,
          (Byte *)(void *)p->buf32 + pos);
    }
  }
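  /*
    Illustrative note (not part of the build): at this point all 8 leaf states are
    finalized.  The remaining steps below implement the BLAKE2sp root node:
      1) GET_DIGEST copies the 8 leaf digests (8 * 32 = 256 bytes) into p->buf32;
      2) Blake2sp_Init_Spec(p->states, 0, 1) re-initializes state 0 as the root
         (node_offset = 0, node_depth = 1);
      3) the concatenated leaf digests are hashed as the root's message, with the
         last-block and final-node flags set before the last 64-byte block, and the
         root digest is copied to the caller's buffer.
  */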
  {
    size_t pos;
    for (pos = 0; pos < SUPER_BLOCK_SIZE / 2; pos += Z7_BLAKE2S_BLOCK_SIZE / 2)
    {
      const UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, (pos * 2));
      Byte *dest = (Byte *)(void *)p->buf32 + pos;
      GET_DIGEST(s, dest)
    }
  }

  Blake2sp_Init_Spec(p->states, 0, 1);

  {
    size_t pos;
    for (pos = 0; pos < (Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2S_DIGEST_SIZE)
        - Z7_BLAKE2S_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
    {
      Z7_BLAKE2SP_Compress_Single(p)(p->states,
          (const Byte *)(const void *)p->buf32 + pos,
          (const Byte *)(const void *)p->buf32 + pos + Z7_BLAKE2S_BLOCK_SIZE);
    }
  }

  // Blake2s_Final(p->states, 0, digest, p, (Byte *)(void *)p->buf32 + i);
  Blake2s_Set_LastBlock(p->states)
  STATE_F(p->states)[1] = BLAKE2S_FINAL_FLAG;
  {
    Z7_BLAKE2SP_Compress_Single(p)(p->states,
        (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE,
        (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE);
  }
  GET_DIGEST(p->states, digest)
  // printf("\n Blake2sp_Final 555 numDataInBufs = %5u\n", (unsigned)p->u.header.numDataInBufs);
}


BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo)
{
  // printf("\n========== setfunction = %d ======== \n", algo);
#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
  Z7_BLAKE2SP_FUNC_COMPRESS func = NULL;
  Z7_BLAKE2SP_FUNC_COMPRESS func_Single = NULL;
  Z7_BLAKE2SP_FUNC_INIT func_Final = NULL;
  Z7_BLAKE2SP_FUNC_INIT func_Init = NULL;
#else
  UNUSED_VAR(p)
#endif

#ifdef Z7_BLAKE2S_USE_VECTORS

  func = func_Single = Blake2sp_Compress2;

  if (algo != Z7_BLAKE2SP_ALGO_SCALAR)
  {
    // printf("\n========== setfunction NON-SCALAR ======== \n");
    if (algo == Z7_BLAKE2SP_ALGO_DEFAULT)
    {
      func        = g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
      func_Single = g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
      func_Init   = g_Z7_BLAKE2SP_FUNC_INIT_Init;
      func_Final  = g_Z7_BLAKE2SP_FUNC_INIT_Final;
    }
    else
    {
      if ((g_z7_Blake2sp_SupportedFlags & (1u << algo)) == 0)
        return False;

#ifdef Z7_BLAKE2S_USE_AVX2

      func_Single =
#if defined(Z7_BLAKE2S_USE_AVX2_WAY2)
          Blake2sp_Compress2_AVX2_Way2;
#else
          Z7_BLAKE2S_Compress2_V128;
#endif

#ifdef Z7_BLAKE2S_USE_AVX2_FAST
      if (algo == Z7_BLAKE2SP_ALGO_V256_FAST)
      {
        func       = Blake2sp_Compress2_AVX2_Fast;
        func_Final = Blake2sp_Final_AVX2_Fast;
        func_Init  = Blake2sp_InitState_AVX2_Fast;
      }
      else
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
      if (algo == Z7_BLAKE2SP_ALGO_V256_WAY2)
        func = Blake2sp_Compress2_AVX2_Way2;
      else
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
      if (algo == Z7_BLAKE2SP_ALGO_V256_WAY4)
      {
        func_Single = func = Blake2sp_Compress2_AVX2_Way4;
      }
      else
#endif
#endif // avx2
      {
        if (algo == Z7_BLAKE2SP_ALGO_V128_FAST)
        {
          func        = Blake2sp_Compress2_V128_Fast;
          func_Final  = Blake2sp_Final_V128_Fast;
          func_Init   = Blake2sp_InitState_V128_Fast;
          func_Single = Z7_BLAKE2S_Compress2_V128;
        }
        else
#ifdef Z7_BLAKE2S_USE_V128_WAY2
        if (algo == Z7_BLAKE2SP_ALGO_V128_WAY2)
          func = func_Single = Blake2sp_Compress2_V128_Way2;
        else
#endif
        {
          if (algo != Z7_BLAKE2SP_ALGO_V128_WAY1)
            return False;
          func = func_Single = Blake2sp_Compress2_V128_Way1;
        }
      }
    }
  }
#else // !VECTORS
  if (algo > 1) // Z7_BLAKE2SP_ALGO_SCALAR
    return False;
#endif // !VECTORS

#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
  p->u.header.func_Compress_Fast = func;
  p->u.header.func_Compress_Single = func_Single;
  p->u.header.func_Final = func_Final;
  p->u.header.func_Init = func_Init;
#endif
  // printf("\n p->u.header.func_Compress = %p", p->u.header.func_Compress);
  return True;
}
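/*
  Illustrative sketch (not part of the build): Blake2sp_SetFunction() above lets
  the caller force a specific code path, mainly for benchmarking.
  Z7_BLAKE2SP_ALGO_DEFAULT selects whatever z7_Black2sp_Prepare() below detected;
  any other non-scalar algo value is accepted only if its bit is set in
  g_z7_Blake2sp_SupportedFlags.  A hypothetical calling pattern (the re-init step
  is an assumption: the *_FAST variants use their own state layout, so the state
  should be re-initialized after the function set changes):

    CBlake2sp ctx;
    Blake2sp_Init(&ctx);                                    // default selection
    if (Blake2sp_SetFunction(&ctx, Z7_BLAKE2SP_ALGO_V128_FAST))
      Blake2sp_InitState(&ctx);                             // re-init with the new func_Init
*/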
void z7_Black2sp_Prepare(void)
{
#ifdef Z7_BLAKE2S_USE_VECTORS
  unsigned flags = 0; // (1u << Z7_BLAKE2SP_ALGO_V128_SCALAR);

  Z7_BLAKE2SP_FUNC_COMPRESS func_Fast   = Blake2sp_Compress2;
  Z7_BLAKE2SP_FUNC_COMPRESS func_Single = Blake2sp_Compress2;
  Z7_BLAKE2SP_FUNC_INIT     func_Init   = NULL;
  Z7_BLAKE2SP_FUNC_INIT     func_Final  = NULL;

#if defined(MY_CPU_X86_OR_AMD64)
  #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
  if (CPU_IsSupported_AVX512F_AVX512VL())
  #endif
  #if defined(Z7_BLAKE2S_USE_SSE41)
  if (CPU_IsSupported_SSE41())
  #elif defined(Z7_BLAKE2S_USE_SSSE3)
  if (CPU_IsSupported_SSSE3())
  #elif !defined(MY_CPU_AMD64)
  if (CPU_IsSupported_SSE2())
  #endif
#endif
  {
  #if defined(Z7_BLAKE2S_USE_SSE41)
    // printf("\n========== Blake2s SSE41 128-bit\n");
  #elif defined(Z7_BLAKE2S_USE_SSSE3)
    // printf("\n========== Blake2s SSSE3 128-bit\n");
  #else
    // printf("\n========== Blake2s SSE2 128-bit\n");
  #endif

    // func_Fast = f_vector = Blake2sp_Compress2_V128_Way2;
    // printf("\n========== Blake2sp_Compress2_V128_Way2\n");
    func_Fast = func_Single = Z7_BLAKE2S_Compress2_V128;
    flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY1);
#ifdef Z7_BLAKE2S_USE_V128_WAY2
    flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY2);
#endif
#ifdef Z7_BLAKE2S_USE_V128_FAST
    flags |= (1u << Z7_BLAKE2SP_ALGO_V128_FAST);
    func_Fast  = Blake2sp_Compress2_V128_Fast;
    func_Init  = Blake2sp_InitState_V128_Fast;
    func_Final = Blake2sp_Final_V128_Fast;
#endif

#ifdef Z7_BLAKE2S_USE_AVX2
#if defined(MY_CPU_X86_OR_AMD64)
    if (
    #if 0 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
        CPU_IsSupported_AVX512F_AVX512VL() &&
    #endif
        CPU_IsSupported_AVX2()
        )
#endif
    {
      // #pragma message ("=== Blake2s AVX2")
      // printf("\n========== Blake2s AVX2\n");

#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
      func_Single = Blake2sp_Compress2_AVX2_Way2;
      flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY2);
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
      flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY4);
#endif

#ifdef Z7_BLAKE2S_USE_AVX2_FAST
      flags |= (1u << Z7_BLAKE2SP_ALGO_V256_FAST);
      func_Fast  = Blake2sp_Compress2_AVX2_Fast;
      func_Init  = Blake2sp_InitState_AVX2_Fast;
      func_Final = Blake2sp_Final_AVX2_Fast;
#elif defined(Z7_BLAKE2S_USE_AVX2_WAY4)
      func_Fast = Blake2sp_Compress2_AVX2_Way4;
#elif defined(Z7_BLAKE2S_USE_AVX2_WAY2)
      func_Fast = Blake2sp_Compress2_AVX2_Way2;
#endif
    } // avx2
#endif // avx2
  } // sse*

  g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast   = func_Fast;
  g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = func_Single;
  g_Z7_BLAKE2SP_FUNC_INIT_Init       = func_Init;
  g_Z7_BLAKE2SP_FUNC_INIT_Final      = func_Final;
  g_z7_Blake2sp_SupportedFlags = flags;
  // printf("\nflags=%x\n", flags);
#endif // vectors
}

/*
#ifdef Z7_BLAKE2S_USE_VECTORS
void align_test2(CBlake2sp *sp);
void align_test2(CBlake2sp *sp)
{
  __m128i a = LOAD_128(sp->states);
  D_XOR_128(a, LOAD_128(sp->states + 4));
  STORE_128(sp->states, a);
}
void align_test2(void);
void align_test2(void)
{
  CBlake2sp sp;
  Blake2sp_Init(&sp);
  Blake2sp_Update(&sp, NULL, 0);
}
#endif
*/
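/*
  Illustrative usage sketch (not part of the build): a minimal one-shot hashing
  sequence with this module, using only the functions defined above and declared
  in Blake2.h (the wrapper function itself is hypothetical):

    static void HashBuffer_Blake2sp(const Byte *data, size_t size,
        Byte digest[Z7_BLAKE2S_DIGEST_SIZE])
    {
      CBlake2sp ctx;
      z7_Black2sp_Prepare();   // detect CPU features and select the vector code;
                               // typically called once at program startup
      Blake2sp_Init(&ctx);
      Blake2sp_Update(&ctx, data, size);   // may be called repeatedly for streamed input
      Blake2sp_Final(&ctx, digest);
    }
*/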