From fc662341e6f85da78ada0e443f6116b978f79f22 Mon Sep 17 00:00:00 2001 From: Igor Pavlov <87184205+ip7z@users.noreply.github.com> Date: Tue, 14 May 2024 00:00:00 +0000 Subject: 24.05 --- Asm/x86/7zAsm.asm | 68 +++++- Asm/x86/7zCrcOpt.asm | 352 ++++++++++++++++----------- Asm/x86/XzCrc64Opt.asm | 632 +++++++++++++++++++++++++++++++++++-------------- 3 files changed, 733 insertions(+), 319 deletions(-) (limited to 'Asm') diff --git a/Asm/x86/7zAsm.asm b/Asm/x86/7zAsm.asm index 19c40da..8910d16 100644 --- a/Asm/x86/7zAsm.asm +++ b/Asm/x86/7zAsm.asm @@ -1,5 +1,5 @@ ; 7zAsm.asm -- ASM macros -; 2022-05-16 : Igor Pavlov : Public domain +; 2023-12-08 : Igor Pavlov : Public domain ; UASM can require these changes @@ -43,7 +43,7 @@ else endif endif -OPTION PROLOGUE:NONE +OPTION PROLOGUE:NONE OPTION EPILOGUE:NONE MY_ASM_START macro @@ -121,10 +121,29 @@ endif x2_H equ DH x3_H equ BH +; r0_L equ AL +; r1_L equ CL +; r2_L equ DL +; r3_L equ BL + +; r0_H equ AH +; r1_H equ CH +; r2_H equ DH +; r3_H equ BH + + ifdef x64 x5_L equ BPL x6_L equ SIL x7_L equ DIL + x8_L equ r8b + x9_L equ r9b + x10_L equ r10b + x11_L equ r11b + x12_L equ r12b + x13_L equ r13b + x14_L equ r14b + x15_L equ r15b r0 equ RAX r1 equ RCX @@ -153,6 +172,22 @@ else r7 equ x7 endif + x0_R equ r0 + x1_R equ r1 + x2_R equ r2 + x3_R equ r3 + x4_R equ r4 + x5_R equ r5 + x6_R equ r6 + x7_R equ r7 + x8_R equ r8 + x9_R equ r9 + x10_R equ r10 + x11_R equ r11 + x12_R equ r12 + x13_R equ r13 + x14_R equ r14 + x15_R equ r15 ifdef x64 ifdef ABI_LINUX @@ -200,6 +235,14 @@ REG_ABI_PARAM_0 equ REG_PARAM_0 REG_ABI_PARAM_1_x equ REG_PARAM_1_x REG_ABI_PARAM_1 equ REG_PARAM_1 +MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro + MY_PUSH_4_REGS +endm + +MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro + MY_POP_4_REGS +endm + else ; x64 @@ -261,12 +304,25 @@ endm endif ; IS_LINUX -MY_PUSH_PRESERVED_ABI_REGS macro +MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro if (IS_LINUX gt 0) MY_PUSH_2_REGS else MY_PUSH_4_REGS endif +endm + +MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro + if (IS_LINUX gt 0) + MY_POP_2_REGS + else + MY_POP_4_REGS + endif +endm + + +MY_PUSH_PRESERVED_ABI_REGS macro + MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 push r12 push r13 push r14 @@ -279,11 +335,7 @@ MY_POP_PRESERVED_ABI_REGS macro pop r14 pop r13 pop r12 - if (IS_LINUX gt 0) - MY_POP_2_REGS - else - MY_POP_4_REGS - endif + MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 endm endif ; x64 diff --git a/Asm/x86/7zCrcOpt.asm b/Asm/x86/7zCrcOpt.asm index 0fee206..c5de808 100644 --- a/Asm/x86/7zCrcOpt.asm +++ b/Asm/x86/7zCrcOpt.asm @@ -1,180 +1,258 @@ ; 7zCrcOpt.asm -- CRC32 calculation : optimized version -; 2021-02-07 : Igor Pavlov : Public domain +; 2023-12-08 : Igor Pavlov : Public domain include 7zAsm.asm MY_ASM_START -rD equ r2 -rN equ r7 -rT equ r5 +NUM_WORDS equ 3 +UNROLL_CNT equ 2 -ifdef x64 - num_VAR equ r8 - table_VAR equ r9 -else - if (IS_CDECL gt 0) - crc_OFFS equ (REG_SIZE * 5) - data_OFFS equ (REG_SIZE + crc_OFFS) - size_OFFS equ (REG_SIZE + data_OFFS) - else - size_OFFS equ (REG_SIZE * 5) - endif - table_OFFS equ (REG_SIZE + size_OFFS) - num_VAR equ [r4 + size_OFFS] - table_VAR equ [r4 + table_OFFS] +if (NUM_WORDS lt 1) or (NUM_WORDS gt 64) +.err +endif +if (UNROLL_CNT lt 1) +.err endif -SRCDAT equ rD + rN * 1 + 4 * +rD equ r2 +rD_x equ x2 +rN equ r7 +rT equ r5 + +ifndef x64 + if (IS_CDECL gt 0) + crc_OFFS equ (REG_SIZE * 5) + data_OFFS equ (REG_SIZE + crc_OFFS) + size_OFFS equ (REG_SIZE + data_OFFS) + else + size_OFFS equ (REG_SIZE * 5) + endif + 
table_OFFS equ (REG_SIZE + size_OFFS) +endif + +; rN + rD is same speed as rD, but we reduce one instruction in loop +SRCDAT_1 equ rN + rD * 1 + 1 * +SRCDAT_4 equ rN + rD * 1 + 4 * CRC macro op:req, dest:req, src:req, t:req - op dest, DWORD PTR [rT + src * 4 + 0400h * t] + op dest, dword ptr [rT + @CatStr(src, _R) * 4 + 0400h * (t)] endm CRC_XOR macro dest:req, src:req, t:req - CRC xor, dest, src, t + CRC xor, dest, src, t endm CRC_MOV macro dest:req, src:req, t:req - CRC mov, dest, src, t + CRC mov, dest, src, t +endm + +MOVZXLO macro dest:req, src:req + movzx dest, @CatStr(src, _L) +endm + +MOVZXHI macro dest:req, src:req + movzx dest, @CatStr(src, _H) endm +; movzx x0, x0_L - is slow in some cpus (ivb), if same register for src and dest +; movzx x3, x0_L sometimes is 0 cycles latency (not always) +; movzx x3, x0_L sometimes is 0.5 cycles latency +; movzx x3, x0_H is 2 cycles latency in some cpus + CRC1b macro - movzx x6, BYTE PTR [rD] - inc rD - movzx x3, x0_L - xor x6, x3 - shr x0, 8 - CRC xor, x0, r6, 0 - dec rN + movzx x6, byte ptr [rD] + MOVZXLO x3, x0 + inc rD + shr x0, 8 + xor x6, x3 + CRC_XOR x0, x6, 0 + dec rN +endm + +LOAD_1 macro dest:req, t:req, iter:req, index:req + movzx dest, byte ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)] +endm + +LOAD_2 macro dest:req, t:req, iter:req, index:req + movzx dest, word ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)] +endm + +CRC_QUAD macro nn, t:req, iter:req +ifdef x64 + ; paired memory loads give 1-3% speed gain, but it uses more registers + LOAD_2 x3, t, iter, 0 + LOAD_2 x9, t, iter, 2 + MOVZXLO x6, x3 + shr x3, 8 + CRC_XOR nn, x6, t * 4 + 3 + MOVZXLO x6, x9 + shr x9, 8 + CRC_XOR nn, x3, t * 4 + 2 + CRC_XOR nn, x6, t * 4 + 1 + CRC_XOR nn, x9, t * 4 + 0 +elseif 0 + LOAD_2 x3, t, iter, 0 + MOVZXLO x6, x3 + shr x3, 8 + CRC_XOR nn, x6, t * 4 + 3 + CRC_XOR nn, x3, t * 4 + 2 + LOAD_2 x3, t, iter, 2 + MOVZXLO x6, x3 + shr x3, 8 + CRC_XOR nn, x6, t * 4 + 1 + CRC_XOR nn, x3, t * 4 + 0 +elseif 0 + LOAD_1 x3, t, iter, 0 + LOAD_1 x6, t, iter, 1 + CRC_XOR nn, x3, t * 4 + 3 + CRC_XOR nn, x6, t * 4 + 2 + LOAD_1 x3, t, iter, 2 + LOAD_1 x6, t, iter, 3 + CRC_XOR nn, x3, t * 4 + 1 + CRC_XOR nn, x6, t * 4 + 0 +else + ; 32-bit load is better if there is only one read port (core2) + ; but that code can be slower if there are 2 read ports (snb) + mov x3, dword ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + 0)] + MOVZXLO x6, x3 + CRC_XOR nn, x6, t * 4 + 3 + MOVZXHI x6, x3 + shr x3, 16 + CRC_XOR nn, x6, t * 4 + 2 + MOVZXLO x6, x3 + shr x3, 8 + CRC_XOR nn, x6, t * 4 + 1 + CRC_XOR nn, x3, t * 4 + 0 +endif endm -MY_PROLOG macro crc_end:req +LAST equ (4 * (NUM_WORDS - 1)) + +CRC_ITER macro qq, nn, iter + mov nn, [SRCDAT_4 (NUM_WORDS * (1 + iter))] + + i = 0 + rept NUM_WORDS - 1 + CRC_QUAD nn, i, iter + i = i + 1 + endm + + MOVZXLO x6, qq + mov x3, qq + shr x3, 24 + CRC_XOR nn, x6, LAST + 3 + CRC_XOR nn, x3, LAST + 0 + ror qq, 16 + MOVZXLO x6, qq + shr qq, 24 + CRC_XOR nn, x6, LAST + 1 +if ((UNROLL_CNT and 1) eq 1) and (iter eq (UNROLL_CNT - 1)) + CRC_MOV qq, qq, LAST + 2 + xor qq, nn +else + CRC_XOR nn, qq, LAST + 2 +endif +endm + + +; + 4 for prefetching next 4-bytes after current iteration +NUM_BYTES_LIMIT equ (NUM_WORDS * 4 * UNROLL_CNT + 4) +ALIGN_MASK equ 3 + + +; MY_PROC @CatStr(CrcUpdateT, 12), 4 +MY_PROC @CatStr(CrcUpdateT, %(NUM_WORDS * 4)), 4 + MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 ifdef x64 + mov x0, REG_ABI_PARAM_0_x ; x0 = x1(win) / x7(linux) + mov rT, REG_ABI_PARAM_3 ; r5 = r9(win) / 
x1(linux) + mov rN, REG_ABI_PARAM_2 ; r7 = r8(win) / r2(linux) + ; mov rD, REG_ABI_PARAM_1 ; r2 = r2(win) if (IS_LINUX gt 0) - MY_PUSH_2_REGS - mov x0, REG_ABI_PARAM_0_x ; x0 = x7 - mov rT, REG_ABI_PARAM_3 ; r5 = r1 - mov rN, REG_ABI_PARAM_2 ; r7 = r2 mov rD, REG_ABI_PARAM_1 ; r2 = r6 - else - MY_PUSH_4_REGS - mov x0, REG_ABI_PARAM_0_x ; x0 = x1 - mov rT, REG_ABI_PARAM_3 ; r5 = r9 - mov rN, REG_ABI_PARAM_2 ; r7 = r8 - ; mov rD, REG_ABI_PARAM_1 ; r2 = r2 endif else - MY_PUSH_4_REGS if (IS_CDECL gt 0) mov x0, [r4 + crc_OFFS] mov rD, [r4 + data_OFFS] else mov x0, REG_ABI_PARAM_0_x endif - mov rN, num_VAR - mov rT, table_VAR + mov rN, [r4 + size_OFFS] + mov rT, [r4 + table_OFFS] endif - test rN, rN - jz crc_end - @@: - test rD, 7 - jz @F - CRC1b - jnz @B - @@: - cmp rN, 16 - jb crc_end - add rN, rD - mov num_VAR, rN - sub rN, 8 - and rN, NOT 7 - sub rD, rN - xor x0, [SRCDAT 0] -endm + cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK + jb crc_end +@@: + test rD_x, ALIGN_MASK ; test rD, ALIGN_MASK + jz @F + CRC1b + jmp @B +@@: + xor x0, dword ptr [rD] + lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)] + sub rD, rN -MY_EPILOG macro crc_end:req - xor x0, [SRCDAT 0] - mov rD, rN - mov rN, num_VAR - sub rN, rD - crc_end: - test rN, rN - jz @F - CRC1b - jmp crc_end - @@: - if (IS_X64 gt 0) and (IS_LINUX gt 0) - MY_POP_2_REGS - else - MY_POP_4_REGS - endif +align 16 +@@: +unr_index = 0 +while unr_index lt UNROLL_CNT + if (unr_index and 1) eq 0 + CRC_ITER x0, x1, unr_index + else + CRC_ITER x1, x0, unr_index + endif + unr_index = unr_index + 1 endm -MY_PROC CrcUpdateT8, 4 - MY_PROLOG crc_end_8 - mov x1, [SRCDAT 1] - align 16 - main_loop_8: - mov x6, [SRCDAT 2] - movzx x3, x1_L - CRC_XOR x6, r3, 3 - movzx x3, x1_H - CRC_XOR x6, r3, 2 - shr x1, 16 - movzx x3, x1_L - movzx x1, x1_H - CRC_XOR x6, r3, 1 - movzx x3, x0_L - CRC_XOR x6, r1, 0 - - mov x1, [SRCDAT 3] - CRC_XOR x6, r3, 7 - movzx x3, x0_H - shr x0, 16 - CRC_XOR x6, r3, 6 - movzx x3, x0_L - CRC_XOR x6, r3, 5 - movzx x3, x0_H - CRC_MOV x0, r3, 4 - xor x0, x6 - add rD, 8 - jnz main_loop_8 - - MY_EPILOG crc_end_8 -MY_ENDP + add rD, NUM_WORDS * 4 * UNROLL_CNT + jnc @B + +if 0 + ; byte verson + add rD, rN + xor x0, dword ptr [rD] + add rN, NUM_BYTES_LIMIT - 1 +else + ; 4-byte version + add rN, 4 * NUM_WORDS * UNROLL_CNT + sub rD, 4 * NUM_WORDS * UNROLL_CNT +@@: + MOVZXLO x3, x0 + MOVZXHI x1, x0 + shr x0, 16 + MOVZXLO x6, x0 + shr x0, 8 + CRC_MOV x0, x0, 0 + CRC_XOR x0, x3, 3 + CRC_XOR x0, x1, 2 + CRC_XOR x0, x6, 1 + + add rD, 4 +if (NUM_WORDS * UNROLL_CNT) ne 1 + jc @F + xor x0, [SRCDAT_4 0] + jmp @B +@@: +endif + add rD, rN + add rN, 4 - 1 + +endif + + sub rN, rD +crc_end: + test rN, rN + jz func_end +@@: + CRC1b + jnz @B -MY_PROC CrcUpdateT4, 4 - MY_PROLOG crc_end_4 - align 16 - main_loop_4: - movzx x1, x0_L - movzx x3, x0_H - shr x0, 16 - movzx x6, x0_H - and x0, 0FFh - CRC_MOV x1, r1, 3 - xor x1, [SRCDAT 1] - CRC_XOR x1, r3, 2 - CRC_XOR x1, r6, 0 - CRC_XOR x1, r0, 1 - - movzx x0, x1_L - movzx x3, x1_H - shr x1, 16 - movzx x6, x1_H - and x1, 0FFh - CRC_MOV x0, r0, 3 - xor x0, [SRCDAT 2] - CRC_XOR x0, r3, 2 - CRC_XOR x0, r6, 0 - CRC_XOR x0, r1, 1 - add rD, 8 - jnz main_loop_4 - - MY_EPILOG crc_end_4 +func_end: + MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 MY_ENDP end diff --git a/Asm/x86/XzCrc64Opt.asm b/Asm/x86/XzCrc64Opt.asm index ad22cc2..283424c 100644 --- a/Asm/x86/XzCrc64Opt.asm +++ b/Asm/x86/XzCrc64Opt.asm @@ -1,113 +1,231 @@ ; XzCrc64Opt.asm -- CRC64 calculation : optimized version -; 2021-02-06 : Igor Pavlov : Public domain +; 2023-12-08 : Igor Pavlov : 
Public domain include 7zAsm.asm MY_ASM_START +NUM_WORDS equ 3 + +if (NUM_WORDS lt 1) or (NUM_WORDS gt 64) +.err +endif + +NUM_SKIP_BYTES equ ((NUM_WORDS - 2) * 4) + + +MOVZXLO macro dest:req, src:req + movzx dest, @CatStr(src, _L) +endm + +MOVZXHI macro dest:req, src:req + movzx dest, @CatStr(src, _H) +endm + + ifdef x64 -rD equ r9 +rD equ r11 rN equ r10 -rT equ r5 -num_VAR equ r8 - -SRCDAT4 equ dword ptr [rD + rN * 1] +rT equ r9 + +CRC_OP macro op:req, dest:req, src:req, t:req + op dest, QWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t)] +endm CRC_XOR macro dest:req, src:req, t:req - xor dest, QWORD PTR [rT + src * 8 + 0800h * t] + CRC_OP xor, dest, src, t +endm + +CRC_MOV macro dest:req, src:req, t:req + CRC_OP mov, dest, src, t endm CRC1b macro - movzx x6, BYTE PTR [rD] - inc rD - movzx x3, x0_L - xor x6, x3 - shr r0, 8 - CRC_XOR r0, r6, 0 - dec rN -endm - -MY_PROLOG macro crc_end:req - ifdef ABI_LINUX - MY_PUSH_2_REGS - else - MY_PUSH_4_REGS - endif - mov r0, REG_ABI_PARAM_0 - mov rN, REG_ABI_PARAM_2 - mov rT, REG_ABI_PARAM_3 - mov rD, REG_ABI_PARAM_1 - test rN, rN - jz crc_end - @@: - test rD, 3 - jz @F - CRC1b - jnz @B - @@: - cmp rN, 8 - jb crc_end - add rN, rD - mov num_VAR, rN - sub rN, 4 - and rN, NOT 3 - sub rD, rN - mov x1, SRCDAT4 - xor r0, r1 - add rN, 4 -endm - -MY_EPILOG macro crc_end:req - sub rN, 4 - mov x1, SRCDAT4 - xor r0, r1 - mov rD, rN - mov rN, num_VAR - sub rN, rD - crc_end: - test rN, rN - jz @F - CRC1b - jmp crc_end - @@: - ifdef ABI_LINUX - MY_POP_2_REGS - else - MY_POP_4_REGS - endif + movzx x6, BYTE PTR [rD] + inc rD + MOVZXLO x3, x0 + xor x6, x3 + shr r0, 8 + CRC_XOR r0, x6, 0 + dec rN endm -MY_PROC XzCrc64UpdateT4, 4 - MY_PROLOG crc_end_4 - align 16 - main_loop_4: - mov x1, SRCDAT4 - movzx x2, x0_L - movzx x3, x0_H - shr r0, 16 - movzx x6, x0_L - movzx x7, x0_H - shr r0, 16 - CRC_XOR r1, r2, 3 - CRC_XOR r0, r3, 2 - CRC_XOR r1, r6, 1 - CRC_XOR r0, r7, 0 - xor r0, r1 - - add rD, 4 - jnz main_loop_4 - - MY_EPILOG crc_end_4 + +; ALIGN_MASK is 3 or 7 bytes alignment: +ALIGN_MASK equ (7 - (NUM_WORDS and 1) * 4) + +if NUM_WORDS eq 1 + +src_rN_offset equ 4 +; + 4 for prefetching next 4-bytes after current iteration +NUM_BYTES_LIMIT equ (NUM_WORDS * 4 + 4) +SRCDAT4 equ DWORD PTR [rN + rD * 1] + +XOR_NEXT macro + mov x1, [rD] + xor r0, r1 +endm + +else ; NUM_WORDS > 1 + +src_rN_offset equ 8 +; + 8 for prefetching next 8-bytes after current iteration +NUM_BYTES_LIMIT equ (NUM_WORDS * 4 + 8) + +XOR_NEXT macro + xor r0, QWORD PTR [rD] ; 64-bit read, can be unaligned +endm + +; 32-bit or 64-bit +LOAD_SRC_MULT4 macro dest:req, word_index:req + mov dest, [rN + rD * 1 + 4 * (word_index) - src_rN_offset]; +endm + +endif + + + +MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 4 + MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 + + mov r0, REG_ABI_PARAM_0 ; r0 <- r1 / r7 + mov rD, REG_ABI_PARAM_1 ; r11 <- r2 / r6 + mov rN, REG_ABI_PARAM_2 ; r10 <- r8 / r2 +if (IS_LINUX gt 0) + mov rT, REG_ABI_PARAM_3 ; r9 <- r9 / r1 +endif + + cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK + jb crc_end +@@: + test rD, ALIGN_MASK + jz @F + CRC1b + jmp @B +@@: + XOR_NEXT + lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)] + sub rD, rN + add rN, src_rN_offset + +align 16 +@@: + +if NUM_WORDS eq 1 + + mov x1, x0 + shr x1, 8 + MOVZXLO x3, x1 + MOVZXLO x2, x0 + shr x1, 8 + shr r0, 32 + xor x0, SRCDAT4 + CRC_XOR r0, x2, 3 + CRC_XOR r0, x3, 2 + MOVZXLO x2, x1 + shr x1, 8 + CRC_XOR r0, x2, 1 + CRC_XOR r0, x1, 0 + +else ; NUM_WORDS > 1 + +if NUM_WORDS ne 2 + k = 2 + while k lt NUM_WORDS + + LOAD_SRC_MULT4 x1, k + 
      crc_op1 textequ <xor>
+      if k eq 2
+        if (NUM_WORDS and 1)
+          LOAD_SRC_MULT4  x7, NUM_WORDS      ; aligned 32-bit
+          LOAD_SRC_MULT4  x6, NUM_WORDS + 1  ; aligned 32-bit
+          shl     r6, 32
+        else
+          LOAD_SRC_MULT4  r6, NUM_WORDS      ; aligned 64-bit
+          crc_op1 textequ <mov>
+        endif
+      endif
+      table = 4 * (NUM_WORDS - 1 - k)
+      MOVZXLO x3, x1
+      CRC_OP  crc_op1, r7, x3, 3 + table
+      MOVZXHI x3, x1
+      shr     x1, 16
+      CRC_XOR r6, x3, 2 + table
+      MOVZXLO x3, x1
+      shr     x1, 8
+      CRC_XOR r7, x3, 1 + table
+      CRC_XOR r6, x1, 0 + table
+      k = k + 1
+    endm
+    crc_op2 textequ <xor>
+else ; NUM_WORDS == 2
+    LOAD_SRC_MULT4  r6, NUM_WORDS      ; aligned 64-bit
+    crc_op2 textequ <mov>
+endif ; NUM_WORDS == 2
+
+        MOVZXHI x3, x0
+        MOVZXLO x2, x0
+        mov     r1, r0
+        shr     r1, 32
+        shr     x0, 16
+        CRC_XOR r6, x2, NUM_SKIP_BYTES + 7
+        CRC_OP  crc_op2, r7, x3, NUM_SKIP_BYTES + 6
+        MOVZXLO x2, x0
+        MOVZXHI x5, x1
+        MOVZXLO x3, x1
+        shr     x0, 8
+        shr     x1, 16
+        CRC_XOR r7, x2, NUM_SKIP_BYTES + 5
+        CRC_XOR r6, x3, NUM_SKIP_BYTES + 3
+        CRC_XOR r7, x0, NUM_SKIP_BYTES + 4
+        CRC_XOR r6, x5, NUM_SKIP_BYTES + 2
+        MOVZXLO x2, x1
+        shr     x1, 8
+        CRC_XOR r7, x2, NUM_SKIP_BYTES + 1
+        CRC_MOV r0, x1, NUM_SKIP_BYTES + 0
+        xor     r0, r6
+        xor     r0, r7
+
+endif ; NUM_WORDS > 1
+        add     rD, NUM_WORDS * 4
+        jnc     @B
+
+        sub     rN, src_rN_offset
+        add     rD, rN
+        XOR_NEXT
+        add     rN, NUM_BYTES_LIMIT - 1
+        sub     rN, rD
+
+crc_end:
+        test    rN, rN
+        jz      func_end
+@@:
+        CRC1b
+        jnz     @B
+func_end:
+        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
 MY_ENDP
+
+
 else
+; ==================================================================
 ; x86 (32-bit)
 
-rD      equ  r1
-rN      equ  r7
+rD      equ  r7
+rN      equ  r1
 rT      equ  r5
+xA      equ  x6
+xA_R    equ  r6
+
+ifdef x64
+    num_VAR     equ r8
+else
+    crc_OFFS    equ (REG_SIZE * 5)
 
 if (IS_CDECL gt 0) or (IS_LINUX gt 0)
@@ -133,107 +251,273 @@ else
     table_VAR   equ [r4 + table_OFFS]
     num_VAR     equ table_VAR
 endif
+endif ; x64
+
+SRCDAT4 equ DWORD PTR [rN + rD * 1]
 
-SRCDAT4 equ  dword ptr [rD + rN * 1]
+CRC_1 macro op:req, dest:req, src:req, t:req, word_index:req
+    op      dest, DWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t) + (word_index) * 4]
+endm
 
 CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req
-    op0     dest0, DWORD PTR [rT + src * 8 + 0800h * t]
-    op1     dest1, DWORD PTR [rT + src * 8 + 0800h * t + 4]
+    CRC_1   op0, dest0, src, t, 0
+    CRC_1   op1, dest1, src, t, 1
 endm
 
 CRC_XOR macro dest0:req, dest1:req, src:req, t:req
-    CRC xor, xor, dest0, dest1, src, t
+        CRC     xor, xor, dest0, dest1, src, t
 endm
 
 CRC1b macro
-    movzx   x6, BYTE PTR [rD]
-    inc     rD
-    movzx   x3, x0_L
-    xor     x6, x3
-    shrd    r0, r2, 8
-    shr     r2, 8
-    CRC_XOR r0, r2, r6, 0
-    dec     rN
-endm
-
-MY_PROLOG macro crc_end:req
-    MY_PUSH_4_REGS
-
-    if (IS_CDECL gt 0) or (IS_LINUX gt 0)
-        proc_numParams = proc_numParams + 2 ; for ABI_LINUX
-        mov     rN, [r4 + size_OFFS]
-        mov     rD, [r4 + data_OFFS]
+        movzx   xA, BYTE PTR [rD]
+        inc     rD
+        MOVZXLO x3, x0
+        xor     xA, x3
+        shrd    x0, x2, 8
+        shr     x2, 8
+        CRC_XOR x0, x2, xA, 0
+        dec     rN
+endm
+
+
+MY_PROLOG_BASE macro
+        MY_PUSH_4_REGS
+ifdef x64
+        mov     r0, REG_ABI_PARAM_0   ; r0 <- r1 / r7
+        mov     rT, REG_ABI_PARAM_3   ; r5 <- r9 / r1
+        mov     rN, REG_ABI_PARAM_2   ; r1 <- r8 / r2
+        mov     rD, REG_ABI_PARAM_1   ; r7 <- r2 / r6
+        mov     r2, r0
+        shr     r2, 32
+        mov     x0, x0
+else
+  if (IS_CDECL gt 0) or (IS_LINUX gt 0)
+        proc_numParams = proc_numParams + 2 ; for ABI_LINUX
+        mov     rN, [r4 + size_OFFS]
+        mov     rD, [r4 + data_OFFS]
+  else
+        mov     rD, REG_ABI_PARAM_0   ; r7 <- r1 : (data)
+        mov     rN, REG_ABI_PARAM_1   ; r1 <- r2 : (size)
+  endif
+        mov     x0, [r4 + crc_OFFS]
+        mov     x2, [r4 + crc_OFFS + 4]
+        mov     rT, table_VAR
+endif
+endm
+
+
+MY_EPILOG_BASE macro crc_end:req, func_end:req
+crc_end:
+        test    rN,
rN + jz func_end +@@: + CRC1b + jnz @B +func_end: +ifdef x64 + shl r2, 32 + xor r0, r2 +endif + MY_POP_4_REGS +endm + + +; ALIGN_MASK is 3 or 7 bytes alignment: +ALIGN_MASK equ (7 - (NUM_WORDS and 1) * 4) + +if (NUM_WORDS eq 1) + +NUM_BYTES_LIMIT_T4 equ (NUM_WORDS * 4 + 4) + +MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5 + MY_PROLOG_BASE + + cmp rN, NUM_BYTES_LIMIT_T4 + ALIGN_MASK + jb crc_end_4 +@@: + test rD, ALIGN_MASK + jz @F + CRC1b + jmp @B +@@: + xor x0, [rD] + lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT_T4 - 1)] + sub rD, rN + add rN, 4 + + MOVZXLO xA, x0 +align 16 +@@: + mov x3, SRCDAT4 + xor x3, x2 + shr x0, 8 + CRC xor, mov, x3, x2, xA, 3 + MOVZXLO xA, x0 + shr x0, 8 + ; MOVZXHI xA, x0 + ; shr x0, 16 + CRC_XOR x3, x2, xA, 2 + + MOVZXLO xA, x0 + shr x0, 8 + CRC_XOR x3, x2, xA, 1 + CRC_XOR x3, x2, x0, 0 + MOVZXLO xA, x3 + mov x0, x3 + + add rD, 4 + jnc @B + + sub rN, 4 + add rD, rN + xor x0, [rD] + add rN, NUM_BYTES_LIMIT_T4 - 1 + sub rN, rD + MY_EPILOG_BASE crc_end_4, func_end_4 +MY_ENDP + +else ; NUM_WORDS > 1 + +SHR_X macro x, imm + shr x, imm +endm + + +ITER_1 macro v0, v1, a, off + MOVZXLO xA, a + SHR_X a, 8 + CRC_XOR v0, v1, xA, off +endm + + +ITER_4 macro v0, v1, a, off +if 0 eq 0 + ITER_1 v0, v1, a, off + 3 + ITER_1 v0, v1, a, off + 2 + ITER_1 v0, v1, a, off + 1 + CRC_XOR v0, v1, a, off +elseif 0 eq 0 + MOVZXLO xA, a + CRC_XOR v0, v1, xA, off + 3 + mov xA, a + ror a, 16 ; 32-bit ror + shr xA, 24 + CRC_XOR v0, v1, xA, off + MOVZXLO xA, a + SHR_X a, 24 + CRC_XOR v0, v1, xA, off + 1 + CRC_XOR v0, v1, a, off + 2 +else + ; MOVZXHI provides smaller code, but MOVZX_HI_BYTE is not fast instruction + MOVZXLO xA, a + CRC_XOR v0, v1, xA, off + 3 + MOVZXHI xA, a + SHR_X a, 16 + CRC_XOR v0, v1, xA, off + 2 + MOVZXLO xA, a + SHR_X a, 8 + CRC_XOR v0, v1, xA, off + 1 + CRC_XOR v0, v1, a, off +endif +endm + + + +ITER_1_PAIR macro v0, v1, a0, a1, off + ITER_1 v0, v1, a0, off + 4 + ITER_1 v0, v1, a1, off +endm + +src_rD_offset equ 8 +STEP_SIZE equ (NUM_WORDS * 4) + +ITER_12_NEXT macro op, index, v0, v1 + op v0, DWORD PTR [rD + (index + 1) * STEP_SIZE - src_rD_offset] + op v1, DWORD PTR [rD + (index + 1) * STEP_SIZE + 4 - src_rD_offset] +endm + +ITER_12 macro index, a0, a1, v0, v1 + + if NUM_SKIP_BYTES eq 0 + ITER_12_NEXT mov, index, v0, v1 else - mov rN, r2 + k = 0 + while k lt NUM_SKIP_BYTES + movzx xA, BYTE PTR [rD + (index) * STEP_SIZE + k + 8 - src_rD_offset] + if k eq 0 + CRC mov, mov, v0, v1, xA, NUM_SKIP_BYTES - 1 - k + else + CRC_XOR v0, v1, xA, NUM_SKIP_BYTES - 1 - k + endif + k = k + 1 + endm + ITER_12_NEXT xor, index, v0, v1 endif - mov x0, [r4 + crc_OFFS] - mov x2, [r4 + crc_OFFS + 4] - mov rT, table_VAR - test rN, rN - jz crc_end - @@: - test rD, 3 - jz @F - CRC1b - jnz @B - @@: - cmp rN, 8 - jb crc_end - add rN, rD - - mov num_VAR, rN - - sub rN, 4 - and rN, NOT 3 - sub rD, rN - xor r0, SRCDAT4 - add rN, 4 -endm - -MY_EPILOG macro crc_end:req - sub rN, 4 - xor r0, SRCDAT4 - - mov rD, rN - mov rN, num_VAR - sub rN, rD - crc_end: - test rN, rN - jz @F - CRC1b - jmp crc_end - @@: - MY_POP_4_REGS -endm - -MY_PROC XzCrc64UpdateT4, 5 - MY_PROLOG crc_end_4 - movzx x6, x0_L - align 16 - main_loop_4: - mov r3, SRCDAT4 - xor r3, r2 - - CRC xor, mov, r3, r2, r6, 3 - movzx x6, x0_H - shr r0, 16 - CRC_XOR r3, r2, r6, 2 - - movzx x6, x0_L - movzx x0, x0_H - CRC_XOR r3, r2, r6, 1 - CRC_XOR r3, r2, r0, 0 - movzx x6, x3_L - mov r0, r3 - - add rD, 4 - jnz main_loop_4 - - MY_EPILOG crc_end_4 +if 0 eq 0 + ITER_4 v0, v1, a0, NUM_SKIP_BYTES + 4 + ITER_4 v0, v1, a1, NUM_SKIP_BYTES +else ; 
interleave version is faster/slower for different processors + ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 3 + ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 2 + ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 1 + CRC_XOR v0, v1, a0, NUM_SKIP_BYTES + 4 + CRC_XOR v0, v1, a1, NUM_SKIP_BYTES +endif +endm + +; we use (UNROLL_CNT > 1) to reduce read ports pressure (num_VAR reads) +UNROLL_CNT equ (2 * 1) +NUM_BYTES_LIMIT equ (STEP_SIZE * UNROLL_CNT + 8) + +MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5 + MY_PROLOG_BASE + + cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK + jb crc_end_12 +@@: + test rD, ALIGN_MASK + jz @F + CRC1b + jmp @B +@@: + xor x0, [rD] + xor x2, [rD + 4] + add rD, src_rD_offset + lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)] + mov num_VAR, rN + +align 16 +@@: + i = 0 + rept UNROLL_CNT + if (i and 1) eq 0 + ITER_12 i, x0, x2, x1, x3 + else + ITER_12 i, x1, x3, x0, x2 + endif + i = i + 1 + endm + + if (UNROLL_CNT and 1) + mov x0, x1 + mov x2, x3 + endif + add rD, STEP_SIZE * UNROLL_CNT + cmp rD, num_VAR + jb @B + + mov rN, num_VAR + add rN, NUM_BYTES_LIMIT - 1 + sub rN, rD + sub rD, src_rD_offset + xor x0, [rD] + xor x2, [rD + 4] + + MY_EPILOG_BASE crc_end_12, func_end_12 MY_ENDP +endif ; (NUM_WORDS > 1) endif ; ! x64 - end -- cgit v1.2.3-55-g6feb
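
The routines in this patch are unrolled assembly versions of the usual table-driven ("slicing") CRC update. As a reading aid, here is a minimal C sketch of the 4-bytes-per-iteration case that the CrcUpdateT* code generalizes to NUM_WORDS * 4 bytes per step; the function name, the table[4][256] layout and the little-endian 32-bit load are illustrative assumptions (they correspond to the 0400h-byte per-slice stride addressed by the CRC macro above), not code taken from the patch.

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch only (not part of the patch): classic slicing-by-4 CRC32.
   Assumes a little-endian host and a table laid out as table[k][256], which
   matches the "0400h * t" table stride used by the assembly's CRC macro. */
static uint32_t crc_update_slice4(uint32_t crc, const uint8_t *data, size_t size,
                                  const uint32_t table[4][256])
{
    /* Process single bytes until the pointer is 4-byte aligned. */
    for (; size != 0 && ((uintptr_t)data & 3) != 0; size--, data++)
        crc = table[0][(uint8_t)(crc ^ *data)] ^ (crc >> 8);

    /* Main loop: fold 4 message bytes per iteration with 4 table lookups. */
    for (; size >= 4; size -= 4, data += 4)
    {
        /* data is 4-byte aligned after the first loop, so this load is aligned */
        uint32_t v = crc ^ *(const uint32_t *)data;
        crc = table[3][(uint8_t)(v      )]
            ^ table[2][(uint8_t)(v >>  8)]
            ^ table[1][(uint8_t)(v >> 16)]
            ^ table[0][(uint8_t)(v >> 24)];
    }

    /* Tail bytes. */
    for (; size != 0; size--, data++)
        crc = table[0][(uint8_t)(crc ^ *data)] ^ (crc >> 8);

    return crc;
}

The assembly replaces the table[k] indexing with a single base pointer rT plus 0400h*k offsets (0800h*k with paired 32-bit halves in the CRC64 case), processes NUM_WORDS 32-bit words per iteration, and unrolls UNROLL_CNT iterations to hide load latency.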