/* Xxh64.c -- XXH64 hash calculation
original code: Copyright (c) Yann Collet.
2023-08-18 : modified by Igor Pavlov.
This source code is licensed under BSD 2-Clause License. */

#include "Precomp.h"

#include "CpuArch.h"
#include "RotateDefs.h"
#include "Xxh64.h"

#define Z7_XXH_PRIME64_1  UINT64_CONST(0x9E3779B185EBCA87)
#define Z7_XXH_PRIME64_2  UINT64_CONST(0xC2B2AE3D27D4EB4F)
#define Z7_XXH_PRIME64_3  UINT64_CONST(0x165667B19E3779F9)
#define Z7_XXH_PRIME64_4  UINT64_CONST(0x85EBCA77C2B2AE63)
#define Z7_XXH_PRIME64_5  UINT64_CONST(0x27D4EB2F165667C5)

void Xxh64State_Init(CXxh64State *p)
{
  const UInt64 seed = 0;
  p->v[0] = seed + Z7_XXH_PRIME64_1 + Z7_XXH_PRIME64_2;
  p->v[1] = seed + Z7_XXH_PRIME64_2;
  p->v[2] = seed;
  p->v[3] = seed - Z7_XXH_PRIME64_1;
}

#if !defined(MY_CPU_64BIT) && defined(MY_CPU_X86) && defined(_MSC_VER)
  #define Z7_XXH64_USE_ASM
#endif

#if !defined(MY_CPU_64BIT) && defined(MY_CPU_X86) \
    && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1200
/* we try to avoid __allmul calls in MSVC for 64-bit multiply.
   But MSVC6 still uses __allmul for our code.
   So for MSVC6 we use default 64-bit multiply without our optimization. */
#define LOW32(b)                  ((UInt32)(b & 0xffffffff))
/* MSVC compiler (MSVC > 1200) can use "mul" instruction
   without __allmul for our MY_emulu MACRO.
   MY_emulu is similar to __emulu(a, b) MACRO */
#define MY_emulu(a, b)            ((UInt64)(a) * (b))
#define MY_SET_HIGH32(a)          ((UInt64)(a) << 32)
#define MY_MUL32_SET_HIGH32(a, b) MY_SET_HIGH32((UInt32)(a) * (UInt32)(b))
// /*
#define MY_MUL64(a, b) \
    ( MY_emulu((UInt32)(a), LOW32(b)) + \
      MY_SET_HIGH32( \
        (UInt32)((a) >> 32) * LOW32(b) + \
        (UInt32)(a) * (UInt32)((b) >> 32) \
      ))
// */
/*
#define MY_MUL64(a, b) \
    ( MY_emulu((UInt32)(a), LOW32(b)) \
    + MY_MUL32_SET_HIGH32((a) >> 32, LOW32(b)) + \
    + MY_MUL32_SET_HIGH32(a, (b) >> 32) \
    )
*/
#define MY_MUL_32_64(a32, b) \
    ( MY_emulu((UInt32)(a32), LOW32(b)) \
    + MY_MUL32_SET_HIGH32(a32, (b) >> 32) \
    )

#else

#define MY_MUL64(a, b)        ((a) * (b))
#define MY_MUL_32_64(a32, b)  ((a32) * (UInt64)(b))

#endif
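
/*
  Background note on the 32-bit MY_MUL64 path above (informal sketch, not part
  of the xxHash reference code): it relies on the identity, with all arithmetic
  taken modulo 2^64,

      a * b == lo32(a) * lo32(b)
             + ((hi32(a) * lo32(b) + lo32(a) * hi32(b)) << 32)

  The hi32(a) * hi32(b) term contributes only to bits 64 and above and is
  dropped, and only the low 32 bits of the middle sum survive the shift, so one
  widening "mul" plus two 32-bit "imul" are enough. A hypothetical self-check
  (Xxh64_MulCheck is illustrative only and not used by this file):

      static int Xxh64_MulCheck(UInt64 a, UInt64 b)
      {
        return MY_MUL64(a, b) == a * b;
      }
*/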

static
Z7_FORCE_INLINE
UInt64 Xxh64_Round(UInt64 acc, UInt64 input)
{
  acc += MY_MUL64(input, Z7_XXH_PRIME64_2);
  acc = Z7_ROTL64(acc, 31);
  return MY_MUL64(acc, Z7_XXH_PRIME64_1);
}

static UInt64 Xxh64_Merge(UInt64 acc, UInt64 val)
{
  acc ^= Xxh64_Round(0, val);
  return MY_MUL64(acc, Z7_XXH_PRIME64_1) + Z7_XXH_PRIME64_4;
}


#ifdef Z7_XXH64_USE_ASM

#define Z7_XXH_PRIME64_1_HIGH  0x9E3779B1
#define Z7_XXH_PRIME64_1_LOW   0x85EBCA87
#define Z7_XXH_PRIME64_2_HIGH  0xC2B2AE3D
#define Z7_XXH_PRIME64_2_LOW   0x27D4EB4F

void
Z7_NO_INLINE
__declspec(naked)
Z7_FASTCALL
Xxh64State_UpdateBlocks(CXxh64State *p, const void *data, const void *end)
{
  #if !defined(__clang__)
  UNUSED_VAR(p)
  UNUSED_VAR(data)
  UNUSED_VAR(end)
  #endif
  __asm   push  ebx
  __asm   push  ebp
  __asm   push  esi
  __asm   push  edi

  #define STACK_OFFSET  4 * 8
  __asm   sub  esp, STACK_OFFSET

#define COPY_1(n) \
  __asm   mov  eax, [ecx + n * 4]  \
  __asm   mov  [esp + n * 4], eax  \

#define COPY_2(n) \
  __asm   mov  eax, [esp + n * 4]  \
  __asm   mov  [ecx + n * 4], eax  \

  COPY_1(0)
  __asm   mov  edi, [ecx + 1 * 4]  \
  COPY_1(2)
  COPY_1(3)
  COPY_1(4)
  COPY_1(5)
  COPY_1(6)
  COPY_1(7)

  __asm   mov  esi, edx  \
  __asm   mov  [esp + 0 * 8 + 4], ecx
  __asm   mov  ecx, Z7_XXH_PRIME64_2_LOW  \
  __asm   mov  ebp, Z7_XXH_PRIME64_1_LOW  \

#define R(n, state1, state1_reg) \
  __asm   mov   eax, [esi + n * 8]  \
  __asm   imul  ebx, eax, Z7_XXH_PRIME64_2_HIGH  \
  __asm   add   ebx, state1  \
  __asm   mul   ecx  \
  __asm   add   edx, ebx  \
  __asm   mov   ebx, [esi + n * 8 + 4]  \
  __asm   imul  ebx, ecx  \
  __asm   add   eax, [esp + n * 8]  \
  __asm   adc   edx, ebx  \
  __asm   mov   ebx, eax  \
  __asm   shld  eax, edx, 31  \
  __asm   shld  edx, ebx, 31  \
  __asm   imul  state1_reg, eax, Z7_XXH_PRIME64_1_HIGH  \
  __asm   imul  edx, ebp  \
  __asm   add   state1_reg, edx  \
  __asm   mul   ebp  \
  __asm   add   state1_reg, edx  \
  __asm   mov   [esp + n * 8], eax  \

#define R2(n) \
          R(n, [esp + n * 8 + 4], ebx)  \
  __asm   mov  [esp + n * 8 + 4], ebx  \

  __asm   align 16
  __asm   main_loop:
  R(0, edi, edi)
  R2(1)
  R2(2)
  R2(3)
  __asm   add  esi, 32
  __asm   cmp  esi, [esp + STACK_OFFSET + 4 * 4 + 4]
  __asm   jne  main_loop

  __asm   mov  ecx, [esp + 0 * 8 + 4]
  COPY_2(0)
  __asm   mov  [ecx + 1 * 4], edi
  COPY_2(2)
  COPY_2(3)
  COPY_2(4)
  COPY_2(5)
  COPY_2(6)
  COPY_2(7)

  __asm   add  esp, STACK_OFFSET
  __asm   pop  edi
  __asm   pop  esi
  __asm   pop  ebp
  __asm   pop  ebx
  __asm   ret  4
}

#else

void
Z7_NO_INLINE
Z7_FASTCALL
Xxh64State_UpdateBlocks(CXxh64State *p, const void *_data, const void *end)
{
  const Byte *data = (const Byte *)_data;
  UInt64 v[4];
  v[0] = p->v[0];
  v[1] = p->v[1];
  v[2] = p->v[2];
  v[3] = p->v[3];
  do
  {
    v[0] = Xxh64_Round(v[0], GetUi64(data));  data += 8;
    v[1] = Xxh64_Round(v[1], GetUi64(data));  data += 8;
    v[2] = Xxh64_Round(v[2], GetUi64(data));  data += 8;
    v[3] = Xxh64_Round(v[3], GetUi64(data));  data += 8;
  }
  while (data != end);
  p->v[0] = v[0];
  p->v[1] = v[1];
  p->v[2] = v[2];
  p->v[3] = v[3];
}

#endif


UInt64 Xxh64State_Digest(const CXxh64State *p, const void *_data, UInt64 count)
{
  UInt64 h = p->v[2];

  if (count >= 32)
  {
    h = Z7_ROTL64(p->v[0], 1) +
        Z7_ROTL64(p->v[1], 7) +
        Z7_ROTL64(h, 12) +
        Z7_ROTL64(p->v[3], 18);
    h = Xxh64_Merge(h, p->v[0]);
    h = Xxh64_Merge(h, p->v[1]);
    h = Xxh64_Merge(h, p->v[2]);
    h = Xxh64_Merge(h, p->v[3]);
  }
  else
    h += Z7_XXH_PRIME64_5;

  h += count;

  // XXH64_finalize():
  {
    unsigned cnt = (unsigned)count & 31;
    const Byte *data = (const Byte *)_data;

    while (cnt >= 8)
    {
      h ^= Xxh64_Round(0, GetUi64(data));
      data += 8;
      h = Z7_ROTL64(h, 27);
      h = MY_MUL64(h, Z7_XXH_PRIME64_1) + Z7_XXH_PRIME64_4;
      cnt -= 8;
    }

    if (cnt >= 4)
    {
      const UInt32 v = GetUi32(data);
      data += 4;
      h ^= MY_MUL_32_64(v, Z7_XXH_PRIME64_1);
      h = Z7_ROTL64(h, 23);
      h = MY_MUL64(h, Z7_XXH_PRIME64_2) + Z7_XXH_PRIME64_3;
      cnt -= 4;
    }

    while (cnt)
    {
      const UInt32 v = *data++;
      h ^= MY_MUL_32_64(v, Z7_XXH_PRIME64_5);
      h = Z7_ROTL64(h, 11);
      h = MY_MUL64(h, Z7_XXH_PRIME64_1);
      cnt--;
    }

    // XXH64_avalanche(h):
    h ^= h >> 33;
    h = MY_MUL64(h, Z7_XXH_PRIME64_2);
    h ^= h >> 29;
    h = MY_MUL64(h, Z7_XXH_PRIME64_3);
    h ^= h >> 32;
    return h;
  }
}


void Xxh64_Init(CXxh64 *p)
{
  Xxh64State_Init(&p->state);
  p->count = 0;
  p->buf64[0] = 0;
  p->buf64[1] = 0;
  p->buf64[2] = 0;
  p->buf64[3] = 0;
}

void Xxh64_Update(CXxh64 *p, const void *_data, size_t size)
{
  const Byte *data = (const Byte *)_data;
  unsigned cnt;

  if (size == 0)
    return;
  cnt = (unsigned)p->count;
  p->count += size;

  if (cnt &= 31)
  {
    unsigned rem = 32 - cnt;
    Byte *dest = (Byte *)p->buf64 + cnt;
    if (rem > size)
      rem = (unsigned)size;
    size -= rem;
    cnt += rem;
    // memcpy((Byte *)p->buf64 + cnt, data, rem);
    do
      *dest++ = *data++;
    while (--rem);
    if (cnt != 32)
      return;
    Xxh64State_UpdateBlocks(&p->state, p->buf64, &p->buf64[4]);
  }

  if (size &= ~(size_t)31)
  {
    Xxh64State_UpdateBlocks(&p->state, data, data + size);
    data += size;
  }

  cnt = (unsigned)p->count & 31;
  if (cnt)
  {
    // memcpy(p->buf64, data, cnt);
    Byte *dest = (Byte *)p->buf64;
    do
      *dest++ = *data++;
    while (--cnt);
  }
}
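
/*
  Usage sketch for the streaming interface above (illustrative only, not part
  of the original file). It assumes Xxh64.h declares CXxh64 / CXxh64State as
  they are used here, and that the fixed seed 0 set by Xxh64State_Init() is
  wanted; Xxh64_HashBuf_Example is a hypothetical name:

      static UInt64 Xxh64_HashBuf_Example(const void *data, size_t size)
      {
        CXxh64 h;
        Xxh64_Init(&h);
        Xxh64_Update(&h, data, size);
        // buf64 holds the unprocessed tail bytes, count is the total length:
        return Xxh64State_Digest(&h.state, h.buf64, h.count);
      }
*/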