From f19f813537c7aea1c20749c914e756b54a9c3cf5 Mon Sep 17 00:00:00 2001 From: Igor Pavlov <87184205+ip7z@users.noreply.github.com> Date: Mon, 27 Dec 2021 00:00:00 +0000 Subject: '21.07' --- Asm/arm/7zCrcOpt.asm | 100 ++++ Asm/arm64/7zAsm.S | 181 ++++++ Asm/arm64/LzmaDecOpt.S | 1487 ++++++++++++++++++++++++++++++++++++++++++++++++ Asm/x86/7zAsm.asm | 284 +++++++++ Asm/x86/7zCrcOpt.asm | 180 ++++++ Asm/x86/AesOpt.asm | 742 ++++++++++++++++++++++++ Asm/x86/LzFindOpt.asm | 513 +++++++++++++++++ Asm/x86/LzmaDecOpt.asm | 1303 ++++++++++++++++++++++++++++++++++++++++++ Asm/x86/Sha1Opt.asm | 263 +++++++++ Asm/x86/Sha256Opt.asm | 263 +++++++++ Asm/x86/XzCrc64Opt.asm | 239 ++++++++ 11 files changed, 5555 insertions(+) create mode 100644 Asm/arm/7zCrcOpt.asm create mode 100644 Asm/arm64/7zAsm.S create mode 100644 Asm/arm64/LzmaDecOpt.S create mode 100644 Asm/x86/7zAsm.asm create mode 100644 Asm/x86/7zCrcOpt.asm create mode 100644 Asm/x86/AesOpt.asm create mode 100644 Asm/x86/LzFindOpt.asm create mode 100644 Asm/x86/LzmaDecOpt.asm create mode 100644 Asm/x86/Sha1Opt.asm create mode 100644 Asm/x86/Sha256Opt.asm create mode 100644 Asm/x86/XzCrc64Opt.asm (limited to 'Asm') diff --git a/Asm/arm/7zCrcOpt.asm b/Asm/arm/7zCrcOpt.asm new file mode 100644 index 0000000..6001d8e --- /dev/null +++ b/Asm/arm/7zCrcOpt.asm @@ -0,0 +1,100 @@ + CODE32 + + EXPORT |CrcUpdateT4@16| + + AREA |.text|, CODE, ARM + + MACRO + CRC32_STEP_1 + + ldrb r4, [r1], #1 + subs r2, r2, #1 + eor r4, r4, r0 + and r4, r4, #0xFF + ldr r4, [r3, +r4, lsl #2] + eor r0, r4, r0, lsr #8 + + MEND + + + MACRO + CRC32_STEP_4 $STREAM_WORD + + eor r7, r7, r8 + eor r7, r7, r9 + eor r0, r0, r7 + eor r0, r0, $STREAM_WORD + ldr $STREAM_WORD, [r1], #4 + + and r7, r0, #0xFF + and r8, r0, #0xFF00 + and r9, r0, #0xFF0000 + and r0, r0, #0xFF000000 + + ldr r7, [r6, +r7, lsl #2] + ldr r8, [r5, +r8, lsr #6] + ldr r9, [r4, +r9, lsr #14] + ldr r0, [r3, +r0, lsr #22] + + MEND + + +|CrcUpdateT4@16| PROC + + stmdb sp!, {r4-r11, lr} + cmp r2, #0 + beq |$fin| + +|$v1| + tst r1, #7 + beq |$v2| + CRC32_STEP_1 + bne |$v1| + +|$v2| + cmp r2, #16 + blo |$v3| + + ldr r10, [r1], #4 + ldr r11, [r1], #4 + + add r4, r3, #0x400 + add r5, r3, #0x800 + add r6, r3, #0xC00 + + mov r7, #0 + mov r8, #0 + mov r9, #0 + + sub r2, r2, #16 + +|$loop| + ; pld [r1, #0x40] + + CRC32_STEP_4 r10 + CRC32_STEP_4 r11 + + subs r2, r2, #8 + bhs |$loop| + + sub r1, r1, #8 + add r2, r2, #16 + + eor r7, r7, r8 + eor r7, r7, r9 + eor r0, r0, r7 + +|$v3| + cmp r2, #0 + beq |$fin| + +|$v4| + CRC32_STEP_1 + bne |$v4| + +|$fin| + ldmia sp!, {r4-r11, pc} + +|CrcUpdateT4@16| ENDP + + END diff --git a/Asm/arm64/7zAsm.S b/Asm/arm64/7zAsm.S new file mode 100644 index 0000000..12e950b --- /dev/null +++ b/Asm/arm64/7zAsm.S @@ -0,0 +1,181 @@ +// 7zAsm.S -- ASM macros for arm64 +// 2021-04-25 : Igor Pavlov : Public domain + +#define r0 x0 +#define r1 x1 +#define r2 x2 +#define r3 x3 +#define r4 x4 +#define r5 x5 +#define r6 x6 +#define r7 x7 +#define r8 x8 +#define r9 x9 +#define r10 x10 +#define r11 x11 +#define r12 x12 +#define r13 x13 +#define r14 x14 +#define r15 x15 +#define r16 x16 +#define r17 x17 +#define r18 x18 +#define r19 x19 +#define r20 x20 +#define r21 x21 +#define r22 x22 +#define r23 x23 +#define r24 x24 +#define r25 x25 +#define r26 x26 +#define r27 x27 +#define r28 x28 +#define r29 x29 +#define r30 x30 + +#define REG_ABI_PARAM_0 r0 +#define REG_ABI_PARAM_1 r1 +#define REG_ABI_PARAM_2 r2 + + +.macro p2_add reg:req, param:req + add \reg, \reg, \param +.endm + +.macro p2_sub reg:req, param:req + sub 
\reg, \reg, \param +.endm + +.macro p2_sub_s reg:req, param:req + subs \reg, \reg, \param +.endm + +.macro p2_and reg:req, param:req + and \reg, \reg, \param +.endm + +.macro xor reg:req, param:req + eor \reg, \reg, \param +.endm + +.macro or reg:req, param:req + orr \reg, \reg, \param +.endm + +.macro shl reg:req, param:req + lsl \reg, \reg, \param +.endm + +.macro shr reg:req, param:req + lsr \reg, \reg, \param +.endm + +.macro sar reg:req, param:req + asr \reg, \reg, \param +.endm + +.macro p1_neg reg:req + neg \reg, \reg +.endm + +.macro dec reg:req + sub \reg, \reg, 1 +.endm + +.macro dec_s reg:req + subs \reg, \reg, 1 +.endm + +.macro inc reg:req + add \reg, \reg, 1 +.endm + +.macro inc_s reg:req + adds \reg, \reg, 1 +.endm + + +.macro imul reg:req, param:req + mul \reg, \reg, \param +.endm + +/* +arm64 and arm use reverted c flag after subs/cmp instructions: + arm64-arm : x86 + b.lo / b.cc : jb / jc + b.hs / b.cs : jae / jnc +*/ + +.macro jmp lab:req + b \lab +.endm + +.macro je lab:req + b.eq \lab +.endm + +.macro jz lab:req + b.eq \lab +.endm + +.macro jnz lab:req + b.ne \lab +.endm + +.macro jne lab:req + b.ne \lab +.endm + +.macro jb lab:req + b.lo \lab +.endm + +.macro jbe lab:req + b.ls \lab +.endm + +.macro ja lab:req + b.hi \lab +.endm + +.macro jae lab:req + b.hs \lab +.endm + + +.macro cmove dest:req, srcTrue:req + csel \dest, \srcTrue, \dest, eq +.endm + +.macro cmovne dest:req, srcTrue:req + csel \dest, \srcTrue, \dest, ne +.endm + +.macro cmovs dest:req, srcTrue:req + csel \dest, \srcTrue, \dest, mi +.endm + +.macro cmovns dest:req, srcTrue:req + csel \dest, \srcTrue, \dest, pl +.endm + +.macro cmovb dest:req, srcTrue:req + csel \dest, \srcTrue, \dest, lo +.endm + +.macro cmovae dest:req, srcTrue:req + csel \dest, \srcTrue, \dest, hs +.endm + + +.macro MY_ALIGN_16 macro + .p2align 4,, (1 << 4) - 1 +.endm + +.macro MY_ALIGN_32 macro + .p2align 5,, (1 << 5) - 1 +.endm + +.macro MY_ALIGN_64 macro + .p2align 6,, (1 << 6) - 1 +.endm diff --git a/Asm/arm64/LzmaDecOpt.S b/Asm/arm64/LzmaDecOpt.S new file mode 100644 index 0000000..10dc473 --- /dev/null +++ b/Asm/arm64/LzmaDecOpt.S @@ -0,0 +1,1487 @@ +// LzmaDecOpt.S -- ARM64-ASM version of LzmaDec_DecodeReal_3() function +// 2021-04-25 : Igor Pavlov : Public domain + +/* +; 3 - is the code compatibility version of LzmaDec_DecodeReal_*() +; function for check at link time. +; That code is tightly coupled with LzmaDec_TryDummy() +; and with another functions in LzmaDec.c file. +; CLzmaDec structure, (probs) array layout, input and output of +; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM). +*/ + + +#include "7zAsm.S" + + // .arch armv8-a + // .file "LzmaDecOpt.c" + .text + .align 2 + .p2align 4,,15 +#ifdef __APPLE__ + .globl _LzmaDec_DecodeReal_3 +#else + .global LzmaDec_DecodeReal_3 +#endif + // .type LzmaDec_DecodeReal_3, %function + +// #define _LZMA_SIZE_OPT 1 + +#define LZMA_USE_4BYTES_FILL 1 +// #define LZMA_USE_2BYTES_COPY 1 +// #define LZMA_USE_CMOV_LZ_WRAP 1 +// #define _LZMA_PROB32 1 + +#define MY_ALIGN_FOR_ENTRY MY_ALIGN_32 +#define MY_ALIGN_FOR_LOOP MY_ALIGN_32 +#define MY_ALIGN_FOR_LOOP_16 MY_ALIGN_16 + +#ifdef _LZMA_PROB32 + .equ PSHIFT , 2 + .macro PLOAD dest:req, mem:req + ldr \dest, [\mem] + .endm + .macro PLOAD_PREINDEXED dest:req, mem:req, offset:req + ldr \dest, [\mem, \offset]! 
+ .endm + .macro PLOAD_2 dest:req, mem1:req, mem2:req + ldr \dest, [\mem1, \mem2] + .endm + .macro PLOAD_LSL dest:req, mem1:req, mem2:req + ldr \dest, [\mem1, \mem2, lsl #PSHIFT] + .endm + .macro PSTORE src:req, mem:req + str \src, [\mem] + .endm + .macro PSTORE_2 src:req, mem1:req, mem2:req + str \src, [\mem1, \mem2] + .endm + .macro PSTORE_LSL src:req, mem1:req, mem2:req + str \src, [\mem1, \mem2, lsl #PSHIFT] + .endm + .macro PSTORE_LSL_M1 src:req, mem1:req, mem2:req, temp_reg:req + // you must check that temp_reg is free register when macro is used + add \temp_reg, \mem1, \mem2 + str \src, [\temp_reg, \mem2] + .endm +#else + // .equ PSHIFT , 1 + #define PSHIFT 1 + .macro PLOAD dest:req, mem:req + ldrh \dest, [\mem] + .endm + .macro PLOAD_PREINDEXED dest:req, mem:req, offset:req + ldrh \dest, [\mem, \offset]! + .endm + .macro PLOAD_2 dest:req, mem1:req, mem2:req + ldrh \dest, [\mem1, \mem2] + .endm + .macro PLOAD_LSL dest:req, mem1:req, mem2:req + ldrh \dest, [\mem1, \mem2, lsl #PSHIFT] + .endm + .macro PSTORE src:req, mem:req + strh \src, [\mem] + .endm + .macro PSTORE_2 src:req, mem1:req, mem2:req + strh \src, [\mem1, \mem2] + .endm + .macro PSTORE_LSL src:req, mem1:req, mem2:req + strh \src, [\mem1, \mem2, lsl #PSHIFT] + .endm + .macro PSTORE_LSL_M1 src:req, mem1:req, mem2:req, temp_reg:req + strh \src, [\mem1, \mem2] + .endm +#endif + +.equ PMULT , (1 << PSHIFT) +.equ PMULT_2 , (2 << PSHIFT) + +.equ kMatchSpecLen_Error_Data , (1 << 9) + +# x7 t0 : NORM_CALC : prob2 (IF_BIT_1) +# x6 t1 : NORM_CALC : probs_state +# x8 t2 : (LITM) temp : (TREE) temp +# x4 t3 : (LITM) bit : (TREE) temp : UPDATE_0/UPDATE_0 temp +# x10 t4 : (LITM) offs : (TREE) probs_PMULT : numBits +# x9 t5 : (LITM) match : sym2 (ShortDist) +# x1 t6 : (LITM) litm_prob : (TREE) prob_reg : pbPos +# x2 t7 : (LITM) prm : probBranch : cnt +# x3 sym : dist +# x12 len +# x0 range +# x5 cod + + +#define range w0 + +// t6 +#define pbPos w1 +#define pbPos_R r1 +#define prob_reg w1 +#define litm_prob prob_reg + +// t7 +#define probBranch w2 +#define cnt w2 +#define cnt_R r2 +#define prm r2 + +#define sym w3 +#define sym_R r3 +#define dist sym + +#define t3 w4 +#define bit w4 +#define bit_R r4 +#define update_temp_reg r4 + +#define cod w5 + +#define t1 w6 +#define t1_R r6 +#define probs_state t1_R + +#define t0 w7 +#define t0_R r7 +#define prob2 t0 + +#define t2 w8 +#define t2_R r8 + +// t5 +#define match w9 +#define sym2 w9 +#define sym2_R r9 + +#define t4 w10 +#define t4_R r10 + +#define offs w10 +#define offs_R r10 + +#define probs r11 + +#define len w12 +#define len_R x12 + +#define state w13 +#define state_R r13 + +#define dicPos r14 +#define buf r15 +#define bufLimit r16 +#define dicBufSize r17 + +#define limit r19 +#define rep0 w20 +#define rep0_R r20 +#define rep1 w21 +#define rep2 w22 +#define rep3 w23 +#define dic r24 +#define probs_IsMatch r25 +#define probs_Spec r26 +#define checkDicSize w27 +#define processedPos w28 +#define pbMask w29 +#define lc2_lpMask w30 + + +.equ kNumBitModelTotalBits , 11 +.equ kBitModelTotal , (1 << kNumBitModelTotalBits) +.equ kNumMoveBits , 5 +.equ kBitModelOffset , (kBitModelTotal - (1 << kNumMoveBits) + 1) + +.macro NORM_2 macro + ldrb t0, [buf], 1 + shl range, 8 + orr cod, t0, cod, lsl 8 + /* + mov t0, cod + ldrb cod, [buf], 1 + shl range, 8 + bfi cod, t0, #8, #24 + */ +.endm + +.macro TEST_HIGH_BYTE_range macro + tst range, 0xFF000000 +.endm + +.macro NORM macro + TEST_HIGH_BYTE_range + jnz 1f + NORM_2 +1: +.endm + + +# ---------- Branch MACROS ---------- + +.macro UPDATE_0__0 + sub prob2, 
probBranch, kBitModelOffset +.endm + +.macro UPDATE_0__1 + sub probBranch, probBranch, prob2, asr #(kNumMoveBits) +.endm + +.macro UPDATE_0__2 probsArray:req, probOffset:req, probDisp:req + .if \probDisp == 0 + PSTORE_2 probBranch, \probsArray, \probOffset + .elseif \probOffset == 0 + PSTORE_2 probBranch, \probsArray, \probDisp * PMULT + .else + .error "unsupported" + // add update_temp_reg, \probsArray, \probOffset + PSTORE_2 probBranch, update_temp_reg, \probDisp * PMULT + .endif +.endm + +.macro UPDATE_0 probsArray:req, probOffset:req, probDisp:req + UPDATE_0__0 + UPDATE_0__1 + UPDATE_0__2 \probsArray, \probOffset, \probDisp +.endm + + +.macro UPDATE_1 probsArray:req, probOffset:req, probDisp:req + // sub cod, cod, prob2 + // sub range, range, prob2 + p2_sub cod, range + sub range, prob2, range + sub prob2, probBranch, probBranch, lsr #(kNumMoveBits) + .if \probDisp == 0 + PSTORE_2 prob2, \probsArray, \probOffset + .elseif \probOffset == 0 + PSTORE_2 prob2, \probsArray, \probDisp * PMULT + .else + .error "unsupported" + // add update_temp_reg, \probsArray, \probOffset + PSTORE_2 prob2, update_temp_reg, \probDisp * PMULT + .endif +.endm + + +.macro CMP_COD_BASE + NORM + // lsr prob2, range, kNumBitModelTotalBits + // imul prob2, probBranch + // cmp cod, prob2 + mov prob2, range + shr range, kNumBitModelTotalBits + imul range, probBranch + cmp cod, range +.endm + +.macro CMP_COD_1 probsArray:req + PLOAD probBranch, \probsArray + CMP_COD_BASE +.endm + +.macro CMP_COD_3 probsArray:req, probOffset:req, probDisp:req + .if \probDisp == 0 + PLOAD_2 probBranch, \probsArray, \probOffset + .elseif \probOffset == 0 + PLOAD_2 probBranch, \probsArray, \probDisp * PMULT + .else + .error "unsupported" + add update_temp_reg, \probsArray, \probOffset + PLOAD_2 probBranch, update_temp_reg, \probDisp * PMULT + .endif + CMP_COD_BASE +.endm + + +.macro IF_BIT_1_NOUP probsArray:req, probOffset:req, probDisp:req, toLabel:req + CMP_COD_3 \probsArray, \probOffset, \probDisp + jae \toLabel +.endm + + +.macro IF_BIT_1 probsArray:req, probOffset:req, probDisp:req, toLabel:req + IF_BIT_1_NOUP \probsArray, \probOffset, \probDisp, \toLabel + UPDATE_0 \probsArray, \probOffset, \probDisp +.endm + + +.macro IF_BIT_0_NOUP probsArray:req, probOffset:req, probDisp:req, toLabel:req + CMP_COD_3 \probsArray, \probOffset, \probDisp + jb \toLabel +.endm + +.macro IF_BIT_0_NOUP_1 probsArray:req, toLabel:req + CMP_COD_1 \probsArray + jb \toLabel +.endm + + +# ---------- CMOV MACROS ---------- + +.macro NORM_LSR + NORM + lsr t0, range, #kNumBitModelTotalBits +.endm + +.macro COD_RANGE_SUB + subs t1, cod, t0 + p2_sub range, t0 +.endm + +.macro RANGE_IMUL prob:req + imul t0, \prob +.endm + +.macro NORM_CALC prob:req + NORM_LSR + RANGE_IMUL \prob + COD_RANGE_SUB +.endm + +.macro CMOV_range + cmovb range, t0 +.endm + +.macro CMOV_code + cmovae cod, t1 +.endm + +.macro CMOV_code_Model_Pre prob:req + sub t0, \prob, kBitModelOffset + CMOV_code + cmovae t0, \prob +.endm + + +.macro PUP_BASE_2 prob:req, dest_reg:req + # only sar works for both 16/32 bit prob modes + sub \dest_reg, \prob, \dest_reg, asr #(kNumMoveBits) +.endm + +.macro PUP prob:req, probPtr:req, mem2:req + PUP_BASE_2 \prob, t0 + PSTORE_2 t0, \probPtr, \mem2 +.endm + + + +#define probs_PMULT t4_R + +.macro BIT_01 + add probs_PMULT, probs, PMULT +.endm + + +.macro BIT_0_R prob:req + PLOAD_2 \prob, probs, 1 * PMULT + NORM_LSR + sub t3, \prob, kBitModelOffset + RANGE_IMUL \prob + PLOAD_2 t2, probs, 1 * PMULT_2 + COD_RANGE_SUB + CMOV_range + cmovae t3, \prob + PLOAD_2 t0, probs, 
1 * PMULT_2 + PMULT + PUP_BASE_2 \prob, t3 + csel \prob, t2, t0, lo + CMOV_code + mov sym, 2 + PSTORE_2 t3, probs, 1 * PMULT + adc sym, sym, wzr + BIT_01 +.endm + +.macro BIT_1_R prob:req + NORM_LSR + p2_add sym, sym + sub t3, \prob, kBitModelOffset + RANGE_IMUL \prob + PLOAD_LSL t2, probs, sym_R + COD_RANGE_SUB + CMOV_range + cmovae t3, \prob + PLOAD_LSL t0, probs_PMULT, sym_R + PUP_BASE_2 \prob, t3 + csel \prob, t2, t0, lo + CMOV_code + PSTORE_LSL_M1 t3, probs, sym_R, t2_R + adc sym, sym, wzr +.endm + + +.macro BIT_2_R prob:req + NORM_LSR + p2_add sym, sym + sub t3, \prob, kBitModelOffset + RANGE_IMUL \prob + COD_RANGE_SUB + CMOV_range + cmovae t3, \prob + CMOV_code + PUP_BASE_2 \prob, t3 + PSTORE_LSL_M1 t3, probs, sym_R, t2_R + adc sym, sym, wzr +.endm + + +# ---------- MATCHED LITERAL ---------- + +.macro LITM_0 macro + shl match, (PSHIFT + 1) + and bit, match, 256 * PMULT + add prm, probs, 256 * PMULT + 1 * PMULT + p2_add match, match + p2_add prm, bit_R + eor offs, bit, 256 * PMULT + PLOAD litm_prob, prm + + NORM_LSR + sub t2, litm_prob, kBitModelOffset + RANGE_IMUL litm_prob + COD_RANGE_SUB + cmovae offs, bit + CMOV_range + and bit, match, offs + cmovae t2, litm_prob + CMOV_code + mov sym, 2 + PUP_BASE_2 litm_prob, t2 + PSTORE t2, prm + add prm, probs, offs_R + adc sym, sym, wzr +.endm + +.macro LITM macro + p2_add prm, bit_R + xor offs, bit + PLOAD_LSL litm_prob, prm, sym_R + + NORM_LSR + p2_add match, match + sub t2, litm_prob, kBitModelOffset + RANGE_IMUL litm_prob + COD_RANGE_SUB + cmovae offs, bit + CMOV_range + and bit, match, offs + cmovae t2, litm_prob + CMOV_code + PUP_BASE_2 litm_prob, t2 + PSTORE_LSL t2, prm, sym_R + add prm, probs, offs_R + adc sym, sym, sym +.endm + + +.macro LITM_2 macro + p2_add prm, bit_R + PLOAD_LSL litm_prob, prm, sym_R + + NORM_LSR + sub t2, litm_prob, kBitModelOffset + RANGE_IMUL litm_prob + COD_RANGE_SUB + CMOV_range + cmovae t2, litm_prob + CMOV_code + PUP_BASE_2 litm_prob, t2 + PSTORE_LSL t2, prm, sym_R + adc sym, sym, sym +.endm + + +# ---------- REVERSE BITS ---------- + +.macro REV_0 prob:req + NORM_CALC \prob + CMOV_range + PLOAD t2, sym2_R + PLOAD_2 t3, probs, 3 * PMULT + CMOV_code_Model_Pre \prob + add t1_R, probs, 3 * PMULT + cmovae sym2_R, t1_R + PUP \prob, probs, 1 * PMULT + csel \prob, t2, t3, lo +.endm + + +.macro REV_1 prob:req, step:req + NORM_LSR + PLOAD_PREINDEXED t2, sym2_R, (\step * PMULT) + RANGE_IMUL \prob + COD_RANGE_SUB + CMOV_range + PLOAD_2 t3, sym2_R, (\step * PMULT) + sub t0, \prob, kBitModelOffset + CMOV_code + add t1_R, sym2_R, \step * PMULT + cmovae t0, \prob + cmovae sym2_R, t1_R + PUP_BASE_2 \prob, t0 + csel \prob, t2, t3, lo + PSTORE_2 t0, t1_R, 0 - \step * PMULT_2 +.endm + + +.macro REV_2 prob:req, step:req + sub t1_R, sym2_R, probs + NORM_LSR + orr sym, sym, t1, lsr #PSHIFT + RANGE_IMUL \prob + COD_RANGE_SUB + sub t2, sym, \step + CMOV_range + cmovb sym, t2 + CMOV_code_Model_Pre \prob + PUP \prob, sym2_R, 0 +.endm + + +.macro REV_1_VAR prob:req + PLOAD \prob, sym_R + mov probs, sym_R + p2_add sym_R, sym2_R + NORM_LSR + add t2_R, sym_R, sym2_R + RANGE_IMUL \prob + COD_RANGE_SUB + cmovae sym_R, t2_R + CMOV_range + CMOV_code_Model_Pre \prob + p2_add sym2, sym2 + PUP \prob, probs, 0 +.endm + + +.macro add_big dest:req, src:req, param:req + .if (\param) < (1 << 12) + add \dest, \src, \param + .else + #ifndef _LZMA_PROB32 + .error "unexpcted add_big expansion" + #endif + add \dest, \src, (\param) / 2 + add \dest, \dest, (\param) - (\param) / 2 + .endif +.endm + +.macro sub_big dest:req, src:req, param:req + .if 
(\param) < (1 << 12) + sub \dest, \src, \param + .else + #ifndef _LZMA_PROB32 + .error "unexpcted sub_big expansion" + #endif + sub \dest, \src, (\param) / 2 + sub \dest, \dest, (\param) - (\param) / 2 + .endif +.endm + + +.macro SET_probs offset:req + // add_big probs, probs_Spec, (\offset) * PMULT + add probs, probs_IsMatch, ((\offset) - IsMatch) * PMULT +.endm + + +.macro LIT_PROBS + add sym, sym, processedPos, lsl 8 + inc processedPos + UPDATE_0__0 + shl sym, lc2_lpMask + SET_probs Literal + p2_and sym, lc2_lpMask + // p2_add probs_state, pbPos_R + p2_add probs, sym_R + UPDATE_0__1 + add probs, probs, sym_R, lsl 1 + UPDATE_0__2 probs_state, pbPos_R, 0 +.endm + + + +.equ kNumPosBitsMax , 4 +.equ kNumPosStatesMax , (1 << kNumPosBitsMax) + +.equ kLenNumLowBits , 3 +.equ kLenNumLowSymbols , (1 << kLenNumLowBits) +.equ kLenNumHighBits , 8 +.equ kLenNumHighSymbols , (1 << kLenNumHighBits) +.equ kNumLenProbs , (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols) + +.equ LenLow , 0 +.equ LenChoice , LenLow +.equ LenChoice2 , (LenLow + kLenNumLowSymbols) +.equ LenHigh , (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax) + +.equ kNumStates , 12 +.equ kNumStates2 , 16 +.equ kNumLitStates , 7 + +.equ kStartPosModelIndex , 4 +.equ kEndPosModelIndex , 14 +.equ kNumFullDistances , (1 << (kEndPosModelIndex >> 1)) + +.equ kNumPosSlotBits , 6 +.equ kNumLenToPosStates , 4 + +.equ kNumAlignBits , 4 +.equ kAlignTableSize , (1 << kNumAlignBits) + +.equ kMatchMinLen , 2 +.equ kMatchSpecLenStart , (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols) + +// .equ kStartOffset , 1408 +.equ kStartOffset , 0 +.equ SpecPos , (-kStartOffset) +.equ IsRep0Long , (SpecPos + kNumFullDistances) +.equ RepLenCoder , (IsRep0Long + (kNumStates2 << kNumPosBitsMax)) +.equ LenCoder , (RepLenCoder + kNumLenProbs) +.equ IsMatch , (LenCoder + kNumLenProbs) +.equ kAlign , (IsMatch + (kNumStates2 << kNumPosBitsMax)) +.equ IsRep , (kAlign + kAlignTableSize) +.equ IsRepG0 , (IsRep + kNumStates) +.equ IsRepG1 , (IsRepG0 + kNumStates) +.equ IsRepG2 , (IsRepG1 + kNumStates) +.equ PosSlot , (IsRepG2 + kNumStates) +.equ Literal , (PosSlot + (kNumLenToPosStates << kNumPosSlotBits)) +.equ NUM_BASE_PROBS , (Literal + kStartOffset) + +.if kStartOffset != 0 // && IsMatch != 0 + .error "Stop_Compiling_Bad_StartOffset" +.endif + +.if NUM_BASE_PROBS != 1984 + .error "Stop_Compiling_Bad_LZMA_PROBS" +.endif + +.equ offset_lc , 0 +.equ offset_lp , 1 +.equ offset_pb , 2 +.equ offset_dicSize , 4 +.equ offset_probs , 4 + offset_dicSize +.equ offset_probs_1664 , 8 + offset_probs +.equ offset_dic , 8 + offset_probs_1664 +.equ offset_dicBufSize , 8 + offset_dic +.equ offset_dicPos , 8 + offset_dicBufSize +.equ offset_buf , 8 + offset_dicPos +.equ offset_range , 8 + offset_buf +.equ offset_code , 4 + offset_range +.equ offset_processedPos , 4 + offset_code +.equ offset_checkDicSize , 4 + offset_processedPos +.equ offset_rep0 , 4 + offset_checkDicSize +.equ offset_rep1 , 4 + offset_rep0 +.equ offset_rep2 , 4 + offset_rep1 +.equ offset_rep3 , 4 + offset_rep2 +.equ offset_state , 4 + offset_rep3 +.equ offset_remainLen , 4 + offset_state +.equ offset_TOTAL_SIZE , 4 + offset_remainLen + +.if offset_TOTAL_SIZE != 96 + .error "Incorrect offset_TOTAL_SIZE" +.endif + + +.macro IsMatchBranch_Pre + # prob = probs + IsMatch + (state << kNumPosBitsMax) + posState; + and pbPos, pbMask, processedPos, lsl #(kLenNumLowBits + 1 + PSHIFT) + add probs_state, probs_IsMatch, state_R +.endm + + +/* +.macro IsMatchBranch + IsMatchBranch_Pre + IF_BIT_1 
probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label +.endm +*/ + +.macro CheckLimits + cmp buf, bufLimit + jae fin_OK + cmp dicPos, limit + jae fin_OK +.endm + +#define CheckLimits_lit CheckLimits +/* +.macro CheckLimits_lit + cmp buf, bufLimit + jae fin_OK_lit + cmp dicPos, limit + jae fin_OK_lit +.endm +*/ + + +#define PARAM_lzma REG_ABI_PARAM_0 +#define PARAM_limit REG_ABI_PARAM_1 +#define PARAM_bufLimit REG_ABI_PARAM_2 + + +.macro LOAD_LZMA_VAR reg:req, struct_offs:req + ldr \reg, [PARAM_lzma, \struct_offs] +.endm + +.macro LOAD_LZMA_BYTE reg:req, struct_offs:req + ldrb \reg, [PARAM_lzma, \struct_offs] +.endm + +.macro LOAD_LZMA_PAIR reg0:req, reg1:req, struct_offs:req + ldp \reg0, \reg1, [PARAM_lzma, \struct_offs] +.endm + + +LzmaDec_DecodeReal_3: +_LzmaDec_DecodeReal_3: +/* +.LFB0: + .cfi_startproc +*/ + + stp x19, x20, [sp, -128]! + stp x21, x22, [sp, 16] + stp x23, x24, [sp, 32] + stp x25, x26, [sp, 48] + stp x27, x28, [sp, 64] + stp x29, x30, [sp, 80] + + str PARAM_lzma, [sp, 120] + + mov bufLimit, PARAM_bufLimit + mov limit, PARAM_limit + + LOAD_LZMA_PAIR dic, dicBufSize, offset_dic + LOAD_LZMA_PAIR dicPos, buf, offset_dicPos + LOAD_LZMA_PAIR rep0, rep1, offset_rep0 + LOAD_LZMA_PAIR rep2, rep3, offset_rep2 + + mov t0, 1 << (kLenNumLowBits + 1 + PSHIFT) + LOAD_LZMA_BYTE pbMask, offset_pb + p2_add limit, dic + mov len, wzr // we can set it in all requiread branches instead + lsl pbMask, t0, pbMask + p2_add dicPos, dic + p2_sub pbMask, t0 + + LOAD_LZMA_BYTE lc2_lpMask, offset_lc + mov t0, 256 << PSHIFT + LOAD_LZMA_BYTE t1, offset_lp + p2_add t1, lc2_lpMask + p2_sub lc2_lpMask, (256 << PSHIFT) - PSHIFT + shl t0, t1 + p2_add lc2_lpMask, t0 + + LOAD_LZMA_VAR probs_Spec, offset_probs + LOAD_LZMA_VAR checkDicSize, offset_checkDicSize + LOAD_LZMA_VAR processedPos, offset_processedPos + LOAD_LZMA_VAR state, offset_state + // range is r0 : this load must be last don't move + LOAD_LZMA_PAIR range, cod, offset_range + mov sym, wzr + shl state, PSHIFT + + add_big probs_IsMatch, probs_Spec, ((IsMatch - SpecPos) << PSHIFT) + + // if (processedPos != 0 || checkDicSize != 0) + orr t0, checkDicSize, processedPos + cbz t0, 1f + add t0_R, dicBufSize, dic + cmp dicPos, dic + cmovne t0_R, dicPos + ldrb sym, [t0_R, -1] +1: + IsMatchBranch_Pre + cmp state, 4 * PMULT + jb lit_end + cmp state, kNumLitStates * PMULT + jb lit_matched_end + jmp lz_end + + + +#define BIT_0 BIT_0_R prob_reg +#define BIT_1 BIT_1_R prob_reg +#define BIT_2 BIT_2_R prob_reg + +# ---------- LITERAL ---------- +MY_ALIGN_64 +lit_start: + mov state, wzr +lit_start_2: + LIT_PROBS + + #ifdef _LZMA_SIZE_OPT + + PLOAD_2 prob_reg, probs, 1 * PMULT + mov sym, 1 + BIT_01 +MY_ALIGN_FOR_LOOP +lit_loop: + BIT_1 + tbz sym, 7, lit_loop + + #else + + BIT_0 + BIT_1 + BIT_1 + BIT_1 + BIT_1 + BIT_1 + BIT_1 + + #endif + + BIT_2 + IsMatchBranch_Pre + strb sym, [dicPos], 1 + p2_and sym, 255 + + CheckLimits_lit +lit_end: + IF_BIT_0_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), lit_start + + # jmp IsMatch_label + + +#define FLAG_STATE_BITS (4 + PSHIFT) + +# ---------- MATCHES ---------- +# MY_ALIGN_FOR_ENTRY +IsMatch_label: + UPDATE_1 probs_state, pbPos_R, (IsMatch - IsMatch) + IF_BIT_1 probs_state, 0, (IsRep - IsMatch), IsRep_label + + SET_probs LenCoder + or state, (1 << FLAG_STATE_BITS) + +# ---------- LEN DECODE ---------- +len_decode: + mov len, 8 - kMatchMinLen + IF_BIT_0_NOUP_1 probs, len_mid_0 + UPDATE_1 probs, 0, 0 + p2_add probs, (1 << (kLenNumLowBits + PSHIFT)) + mov len, 0 - kMatchMinLen + IF_BIT_0_NOUP_1 probs, len_mid_0 + UPDATE_1 
probs, 0, 0 + p2_add probs, LenHigh * PMULT - (1 << (kLenNumLowBits + PSHIFT)) + + #if 0 == 1 + BIT_0 + BIT_1 + BIT_1 + BIT_1 + BIT_1 + BIT_1 + #else + PLOAD_2 prob_reg, probs, 1 * PMULT + mov sym, 1 + BIT_01 +MY_ALIGN_FOR_LOOP +len8_loop: + BIT_1 + tbz sym, 6, len8_loop + #endif + + mov len, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - kMatchMinLen + jmp len_mid_2 + +MY_ALIGN_FOR_ENTRY +len_mid_0: + UPDATE_0 probs, 0, 0 + p2_add probs, pbPos_R + BIT_0 +len_mid_2: + BIT_1 + BIT_2 + sub len, sym, len + tbz state, FLAG_STATE_BITS, copy_match + +# ---------- DECODE DISTANCE ---------- + // probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits); + + mov t0, 3 + kMatchMinLen + cmp len, 3 + kMatchMinLen + cmovb t0, len + SET_probs PosSlot - (kMatchMinLen << (kNumPosSlotBits)) + add probs, probs, t0_R, lsl #(kNumPosSlotBits + PSHIFT) + + #ifdef _LZMA_SIZE_OPT + + PLOAD_2 prob_reg, probs, 1 * PMULT + mov sym, 1 + BIT_01 +MY_ALIGN_FOR_LOOP +slot_loop: + BIT_1 + tbz sym, 5, slot_loop + + #else + + BIT_0 + BIT_1 + BIT_1 + BIT_1 + BIT_1 + + #endif + + #define numBits t4 + mov numBits, sym + BIT_2 + // we need only low bits + p2_and sym, 3 + cmp numBits, 32 + kEndPosModelIndex / 2 + jb short_dist + + SET_probs kAlign + + # unsigned numDirectBits = (unsigned)(((distance >> 1) - 1)); + p2_sub numBits, (32 + 1 + kNumAlignBits) + # distance = (2 | (distance & 1)); + or sym, 2 + PLOAD_2 prob_reg, probs, 1 * PMULT + add sym2_R, probs, 2 * PMULT + +# ---------- DIRECT DISTANCE ---------- + +.macro DIRECT_1 + shr range, 1 + subs t0, cod, range + p2_add sym, sym + // add t1, sym, 1 + csel cod, cod, t0, mi + csinc sym, sym, sym, mi + // csel sym, t1, sym, pl + // adc sym, sym, sym // not 100% compatible for "corruptued-allowed" LZMA streams + dec_s numBits + je direct_end +.endm + + #ifdef _LZMA_SIZE_OPT + + jmp direct_norm +MY_ALIGN_FOR_ENTRY +direct_loop: + DIRECT_1 +direct_norm: + TEST_HIGH_BYTE_range + jnz direct_loop + NORM_2 + jmp direct_loop + + #else + +.macro DIRECT_2 + TEST_HIGH_BYTE_range + jz direct_unroll + DIRECT_1 +.endm + + DIRECT_2 + DIRECT_2 + DIRECT_2 + DIRECT_2 + DIRECT_2 + DIRECT_2 + DIRECT_2 + DIRECT_2 + +direct_unroll: + NORM_2 + DIRECT_1 + DIRECT_1 + DIRECT_1 + DIRECT_1 + DIRECT_1 + DIRECT_1 + DIRECT_1 + DIRECT_1 + jmp direct_unroll + + #endif + +MY_ALIGN_FOR_ENTRY +direct_end: + shl sym, kNumAlignBits + REV_0 prob_reg + REV_1 prob_reg, 2 + REV_1 prob_reg, 4 + REV_2 prob_reg, 8 + +decode_dist_end: + + // if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize)) + + tst checkDicSize, checkDicSize + csel t0, processedPos, checkDicSize, eq + cmp sym, t0 + jae end_of_payload + // jmp end_of_payload # for debug + + mov rep3, rep2 + mov rep2, rep1 + mov rep1, rep0 + add rep0, sym, 1 + +.macro STATE_UPDATE_FOR_MATCH + // state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; + // cmp state, (kNumStates + kNumLitStates) * PMULT + cmp state, kNumLitStates * PMULT + (1 << FLAG_STATE_BITS) + mov state, kNumLitStates * PMULT + mov t0, (kNumLitStates + 3) * PMULT + cmovae state, t0 +.endm + STATE_UPDATE_FOR_MATCH + +# ---------- COPY MATCH ---------- +copy_match: + + // if ((rem = limit - dicPos) == 0) break // return SZ_ERROR_DATA; + subs cnt_R, limit, dicPos + // jz fin_dicPos_LIMIT + jz fin_OK + + // curLen = ((rem < len) ? (unsigned)rem : len); + cmp cnt_R, len_R + cmovae cnt, len + + sub t0_R, dicPos, dic + p2_add dicPos, cnt_R + p2_add processedPos, cnt + p2_sub len, cnt + + // pos = dicPos - rep0 + (dicPos < rep0 ? 
dicBufSize : 0); + p2_sub_s t0_R, rep0_R + jae 1f + + cmn t0_R, cnt_R + p2_add t0_R, dicBufSize + ja copy_match_cross +1: +# ---------- COPY MATCH FAST ---------- + # t0_R : src_pos + p2_add t0_R, dic + ldrb sym, [t0_R] + p2_add t0_R, cnt_R + p1_neg cnt_R + +copy_common: + dec dicPos + + # dicPos : (ptr_to_last_dest_BYTE) + # t0_R : (src_lim) + # cnt_R : (-curLen) + + IsMatchBranch_Pre + + inc_s cnt_R + jz copy_end + + cmp rep0, 1 + je copy_match_0 + + #ifdef LZMA_USE_2BYTES_COPY + strb sym, [dicPos, cnt_R] + dec dicPos + # dicPos : (ptr_to_last_dest_16bitWORD) + p2_and cnt_R, -2 + ldrh sym, [t0_R, cnt_R] + adds cnt_R, cnt_R, 2 + jz 2f +MY_ALIGN_FOR_LOOP +1: + /* + strh sym, [dicPos, cnt_R] + ldrh sym, [t0_R, cnt_R] + adds cnt_R, cnt_R, 2 + jz 2f + */ + + strh sym, [dicPos, cnt_R] + ldrh sym, [t0_R, cnt_R] + adds cnt_R, cnt_R, 2 + jnz 1b +2: + + /* + // for universal little/big endian code, but slow + strh sym, [dicPos] + inc dicPos + ldrb sym, [t0_R, -1] + */ + + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + // we must improve big-endian detection for another compilers + // for big-endian we need to revert bytes + rev16 sym, sym + #endif + + // (sym) must represent as little-endian here: + strb sym, [dicPos], 1 + shr sym, 8 + + #else + +MY_ALIGN_FOR_LOOP +1: + strb sym, [dicPos, cnt_R] + ldrb sym, [t0_R, cnt_R] + inc_s cnt_R + jz copy_end + + strb sym, [dicPos, cnt_R] + ldrb sym, [t0_R, cnt_R] + inc_s cnt_R + jnz 1b + #endif + +copy_end: +lz_end_match: + strb sym, [dicPos], 1 + + # IsMatchBranch_Pre + CheckLimits +lz_end: + IF_BIT_1_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label + + + +# ---------- LITERAL MATCHED ---------- + + LIT_PROBS + + // matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; + + sub t0_R, dicPos, dic + p2_sub_s t0_R, rep0_R + + #ifdef LZMA_USE_CMOV_LZ_WRAP + add t1_R, t0_R, dicBufSize + cmovb t0_R, t1_R + #else + jae 1f + p2_add t0_R, dicBufSize +1: + #endif + + ldrb match, [dic, t0_R] + + // state -= (state < 10) ? 3 : 6; + sub sym, state, 6 * PMULT + cmp state, 10 * PMULT + p2_sub state, 3 * PMULT + cmovae state, sym + + #ifdef _LZMA_SIZE_OPT + + mov offs, 256 * PMULT + shl match, (PSHIFT + 1) + mov sym, 1 + and bit, match, offs + add prm, probs, offs_R + +MY_ALIGN_FOR_LOOP +litm_loop: + LITM + tbz sym, 8, litm_loop + + #else + + LITM_0 + LITM + LITM + LITM + LITM + LITM + LITM + LITM_2 + + #endif + + IsMatchBranch_Pre + strb sym, [dicPos], 1 + p2_and sym, 255 + + // mov len, wzr // LITM uses same regisetr (len / offs). So we clear it + CheckLimits_lit +lit_matched_end: + IF_BIT_1_NOUP probs_state, pbPos_R, (IsMatch - IsMatch), IsMatch_label + # IsMatchBranch + p2_sub state, 3 * PMULT + jmp lit_start_2 + + + +# ---------- REP 0 LITERAL ---------- +MY_ALIGN_FOR_ENTRY +IsRep0Short_label: + UPDATE_0 probs_state, pbPos_R, 0 + + // dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; + sub t0_R, dicPos, dic + + // state = state < kNumLitStates ? 
9 : 11; + or state, 1 * PMULT + + # the caller doesn't allow (dicPos >= limit) case for REP_SHORT + # so we don't need the following (dicPos == limit) check here: + # cmp dicPos, limit + # jae fin_dicPos_LIMIT_REP_SHORT + # // jmp fin_dicPos_LIMIT_REP_SHORT // for testing/debug puposes + + inc processedPos + + IsMatchBranch_Pre + + p2_sub_s t0_R, rep0_R + #ifdef LZMA_USE_CMOV_LZ_WRAP + add sym_R, t0_R, dicBufSize + cmovb t0_R, sym_R + #else + jae 1f + p2_add t0_R, dicBufSize +1: + #endif + + ldrb sym, [dic, t0_R] + // mov len, wzr + jmp lz_end_match + +MY_ALIGN_FOR_ENTRY +IsRep_label: + UPDATE_1 probs_state, 0, (IsRep - IsMatch) + + # The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode. + # So we don't check it here. + + # mov t0, processedPos + # or t0, checkDicSize + # jz fin_ERROR_2 + + // state = state < kNumLitStates ? 8 : 11; + cmp state, kNumLitStates * PMULT + mov state, 8 * PMULT + mov probBranch, 11 * PMULT + cmovae state, probBranch + + SET_probs RepLenCoder + + IF_BIT_1 probs_state, 0, (IsRepG0 - IsMatch), IsRepG0_label + sub_big probs_state, probs_state, (IsMatch - IsRep0Long) << PSHIFT + IF_BIT_0_NOUP probs_state, pbPos_R, 0, IsRep0Short_label + UPDATE_1 probs_state, pbPos_R, 0 + jmp len_decode + +MY_ALIGN_FOR_ENTRY +IsRepG0_label: + UPDATE_1 probs_state, 0, (IsRepG0 - IsMatch) + IF_BIT_1 probs_state, 0, (IsRepG1 - IsMatch), IsRepG1_label + mov dist, rep1 + mov rep1, rep0 + mov rep0, dist + jmp len_decode + +# MY_ALIGN_FOR_ENTRY +IsRepG1_label: + UPDATE_1 probs_state, 0, (IsRepG1 - IsMatch) + IF_BIT_1 probs_state, 0, (IsRepG2 - IsMatch), IsRepG2_label + mov dist, rep2 + mov rep2, rep1 + mov rep1, rep0 + mov rep0, dist + jmp len_decode + +# MY_ALIGN_FOR_ENTRY +IsRepG2_label: + UPDATE_1 probs_state, 0, (IsRepG2 - IsMatch) + mov dist, rep3 + mov rep3, rep2 + mov rep2, rep1 + mov rep1, rep0 + mov rep0, dist + jmp len_decode + + + +# ---------- SPEC SHORT DISTANCE ---------- + +MY_ALIGN_FOR_ENTRY +short_dist: + p2_sub_s numBits, 32 + 1 + jbe decode_dist_end + or sym, 2 + shl sym, numBits + add sym_R, probs_Spec, sym_R, lsl #PSHIFT + p2_add sym_R, SpecPos * PMULT + 1 * PMULT + mov sym2, PMULT // # step +MY_ALIGN_FOR_LOOP +spec_loop: + REV_1_VAR prob_reg + dec_s numBits + jnz spec_loop + + p2_add sym2_R, probs_Spec + .if SpecPos != 0 + p2_add sym2_R, SpecPos * PMULT + .endif + p2_sub sym_R, sym2_R + shr sym, PSHIFT + + jmp decode_dist_end + + + +# ---------- COPY MATCH 0 ---------- +MY_ALIGN_FOR_ENTRY +copy_match_0: + #ifdef LZMA_USE_4BYTES_FILL + strb sym, [dicPos, cnt_R] + inc_s cnt_R + jz copy_end + + strb sym, [dicPos, cnt_R] + inc_s cnt_R + jz copy_end + + strb sym, [dicPos, cnt_R] + inc_s cnt_R + jz copy_end + + orr t3, sym, sym, lsl 8 + p2_and cnt_R, -4 + orr t3, t3, t3, lsl 16 +MY_ALIGN_FOR_LOOP_16 +1: + /* + str t3, [dicPos, cnt_R] + adds cnt_R, cnt_R, 4 + jz 2f + */ + + str t3, [dicPos, cnt_R] + adds cnt_R, cnt_R, 4 + jnz 1b +2: + // p2_and sym, 255 + #else + +MY_ALIGN_FOR_LOOP +1: + strb sym, [dicPos, cnt_R] + inc_s cnt_R + jz copy_end + + strb sym, [dicPos, cnt_R] + inc_s cnt_R + jnz 1b + #endif + + jmp copy_end + + +# ---------- COPY MATCH CROSS ---------- +copy_match_cross: + # t0_R - src pos + # cnt_R - total copy len + + p1_neg cnt_R +1: + ldrb sym, [dic, t0_R] + inc t0_R + strb sym, [dicPos, cnt_R] + inc cnt_R + cmp t0_R, dicBufSize + jne 1b + + ldrb sym, [dic] + sub t0_R, dic, cnt_R + jmp copy_common + + + + +/* +fin_dicPos_LIMIT_REP_SHORT: + mov len, 1 + jmp fin_OK +*/ + +/* +fin_dicPos_LIMIT: + jmp fin_OK + # For 
more strict mode we can stop decoding with error + # mov sym, 1 + # jmp fin +*/ + +fin_ERROR_MATCH_DIST: + # rep0 = distance + 1; + p2_add len, kMatchSpecLen_Error_Data + mov rep3, rep2 + mov rep2, rep1 + mov rep1, rep0 + mov rep0, sym + STATE_UPDATE_FOR_MATCH + # jmp fin_OK + mov sym, 1 + jmp fin + +end_of_payload: + inc_s sym + jnz fin_ERROR_MATCH_DIST + + mov len, kMatchSpecLenStart + xor state, (1 << FLAG_STATE_BITS) + jmp fin_OK + +/* +fin_OK_lit: + mov len, wzr +*/ + +fin_OK: + mov sym, wzr + +fin: + NORM + + #define fin_lzma_reg t0_R + + .macro STORE_LZMA_VAR reg:req, struct_offs:req + str \reg, [fin_lzma_reg, \struct_offs] + .endm + + .macro STORE_LZMA_PAIR reg0:req, reg1:req, struct_offs:req + stp \reg0, \reg1, [fin_lzma_reg, \struct_offs] + .endm + + ldr fin_lzma_reg, [sp, 120] + p2_sub dicPos, dic + shr state, PSHIFT + + STORE_LZMA_PAIR dicPos, buf, offset_dicPos + STORE_LZMA_PAIR range, cod, offset_range + STORE_LZMA_VAR processedPos, offset_processedPos + STORE_LZMA_PAIR rep0, rep1, offset_rep0 + STORE_LZMA_PAIR rep2, rep3, offset_rep2 + STORE_LZMA_PAIR state, len, offset_state + + mov w0, sym + + ldp x29, x30, [sp, 80] + ldp x27, x28, [sp, 64] + ldp x25, x26, [sp, 48] + ldp x23, x24, [sp, 32] + ldp x21, x22, [sp, 16] + ldp x19, x20, [sp], 128 + + ret +/* + .cfi_endproc +.LFE0: + .size LzmaDec_DecodeReal_3, .-LzmaDec_DecodeReal_3 + .ident "TAG_LZMA" + .section .note.GNU-stack,"",@progbits +*/ diff --git a/Asm/x86/7zAsm.asm b/Asm/x86/7zAsm.asm new file mode 100644 index 0000000..6275bb7 --- /dev/null +++ b/Asm/x86/7zAsm.asm @@ -0,0 +1,284 @@ +; 7zAsm.asm -- ASM macros +; 2021-12-25 : Igor Pavlov : Public domain + + +ifdef @wordsize +; @wordsize is defined only in JWASM and ASMC and is not defined in MASM +; @wordsize eq 8 for 64-bit x64 +; @wordsize eq 2 for 32-bit x86 +if @wordsize eq 8 + x64 equ 1 +endif +else +ifdef RAX + x64 equ 1 +endif +endif + + +ifdef x64 + IS_X64 equ 1 +else + IS_X64 equ 0 +endif + +ifdef ABI_LINUX + IS_LINUX equ 1 +else + IS_LINUX equ 0 +endif + +ifndef x64 +; Use ABI_CDECL for x86 (32-bit) only +; if ABI_CDECL is not defined, we use fastcall abi +ifdef ABI_CDECL + IS_CDECL equ 1 +else + IS_CDECL equ 0 +endif +endif + +OPTION PROLOGUE:NONE +OPTION EPILOGUE:NONE + +MY_ASM_START macro + ifdef x64 + .code + else + .386 + .model flat + _TEXT$00 SEGMENT PARA PUBLIC 'CODE' + endif +endm + +MY_PROC macro name:req, numParams:req + align 16 + proc_numParams = numParams + if (IS_X64 gt 0) + proc_name equ name + elseif (IS_LINUX gt 0) + proc_name equ name + elseif (IS_CDECL gt 0) + proc_name equ @CatStr(_,name) + else + proc_name equ @CatStr(@,name,@, %numParams * 4) + endif + proc_name PROC +endm + +MY_ENDP macro + if (IS_X64 gt 0) + ret + elseif (IS_CDECL gt 0) + ret + elseif (proc_numParams LT 3) + ret + else + ret (proc_numParams - 2) * 4 + endif + proc_name ENDP +endm + + +ifdef x64 + REG_SIZE equ 8 + REG_LOGAR_SIZE equ 3 +else + REG_SIZE equ 4 + REG_LOGAR_SIZE equ 2 +endif + + x0 equ EAX + x1 equ ECX + x2 equ EDX + x3 equ EBX + x4 equ ESP + x5 equ EBP + x6 equ ESI + x7 equ EDI + + x0_W equ AX + x1_W equ CX + x2_W equ DX + x3_W equ BX + + x5_W equ BP + x6_W equ SI + x7_W equ DI + + x0_L equ AL + x1_L equ CL + x2_L equ DL + x3_L equ BL + + x0_H equ AH + x1_H equ CH + x2_H equ DH + x3_H equ BH + +ifdef x64 + x5_L equ BPL + x6_L equ SIL + x7_L equ DIL + + r0 equ RAX + r1 equ RCX + r2 equ RDX + r3 equ RBX + r4 equ RSP + r5 equ RBP + r6 equ RSI + r7 equ RDI + x8 equ r8d + x9 equ r9d + x10 equ r10d + x11 equ r11d + x12 equ r12d + x13 equ r13d + x14 equ r14d + x15 
equ r15d +else + r0 equ x0 + r1 equ x1 + r2 equ x2 + r3 equ x3 + r4 equ x4 + r5 equ x5 + r6 equ x6 + r7 equ x7 +endif + + +ifdef x64 +ifdef ABI_LINUX + +MY_PUSH_2_REGS macro + push r3 + push r5 +endm + +MY_POP_2_REGS macro + pop r5 + pop r3 +endm + +endif +endif + + +MY_PUSH_4_REGS macro + push r3 + push r5 + push r6 + push r7 +endm + +MY_POP_4_REGS macro + pop r7 + pop r6 + pop r5 + pop r3 +endm + + +; for fastcall and for WIN-x64 +REG_PARAM_0_x equ x1 +REG_PARAM_0 equ r1 +REG_PARAM_1_x equ x2 +REG_PARAM_1 equ r2 + +ifndef x64 +; for x86-fastcall + +REG_ABI_PARAM_0_x equ REG_PARAM_0_x +REG_ABI_PARAM_0 equ REG_PARAM_0 +REG_ABI_PARAM_1_x equ REG_PARAM_1_x +REG_ABI_PARAM_1 equ REG_PARAM_1 + +else +; x64 + +if (IS_LINUX eq 0) + +; for WIN-x64: +REG_PARAM_2_x equ x8 +REG_PARAM_2 equ r8 +REG_PARAM_3 equ r9 + +REG_ABI_PARAM_0_x equ REG_PARAM_0_x +REG_ABI_PARAM_0 equ REG_PARAM_0 +REG_ABI_PARAM_1_x equ REG_PARAM_1_x +REG_ABI_PARAM_1 equ REG_PARAM_1 +REG_ABI_PARAM_2_x equ REG_PARAM_2_x +REG_ABI_PARAM_2 equ REG_PARAM_2 +REG_ABI_PARAM_3 equ REG_PARAM_3 + +else +; for LINUX-x64: +REG_LINUX_PARAM_0_x equ x7 +REG_LINUX_PARAM_0 equ r7 +REG_LINUX_PARAM_1_x equ x6 +REG_LINUX_PARAM_1 equ r6 +REG_LINUX_PARAM_2 equ r2 +REG_LINUX_PARAM_3 equ r1 +REG_LINUX_PARAM_4_x equ x8 +REG_LINUX_PARAM_4 equ r8 +REG_LINUX_PARAM_5 equ r9 + +REG_ABI_PARAM_0_x equ REG_LINUX_PARAM_0_x +REG_ABI_PARAM_0 equ REG_LINUX_PARAM_0 +REG_ABI_PARAM_1_x equ REG_LINUX_PARAM_1_x +REG_ABI_PARAM_1 equ REG_LINUX_PARAM_1 +REG_ABI_PARAM_2 equ REG_LINUX_PARAM_2 +REG_ABI_PARAM_3 equ REG_LINUX_PARAM_3 +REG_ABI_PARAM_4_x equ REG_LINUX_PARAM_4_x +REG_ABI_PARAM_4 equ REG_LINUX_PARAM_4 +REG_ABI_PARAM_5 equ REG_LINUX_PARAM_5 + +MY_ABI_LINUX_TO_WIN_2 macro + mov r2, r6 + mov r1, r7 +endm + +MY_ABI_LINUX_TO_WIN_3 macro + mov r8, r2 + mov r2, r6 + mov r1, r7 +endm + +MY_ABI_LINUX_TO_WIN_4 macro + mov r9, r1 + mov r8, r2 + mov r2, r6 + mov r1, r7 +endm + +endif ; IS_LINUX + + +MY_PUSH_PRESERVED_ABI_REGS macro + if (IS_LINUX gt 0) + MY_PUSH_2_REGS + else + MY_PUSH_4_REGS + endif + push r12 + push r13 + push r14 + push r15 +endm + + +MY_POP_PRESERVED_ABI_REGS macro + pop r15 + pop r14 + pop r13 + pop r12 + if (IS_LINUX gt 0) + MY_POP_2_REGS + else + MY_POP_4_REGS + endif +endm + +endif ; x64 diff --git a/Asm/x86/7zCrcOpt.asm b/Asm/x86/7zCrcOpt.asm new file mode 100644 index 0000000..0fee206 --- /dev/null +++ b/Asm/x86/7zCrcOpt.asm @@ -0,0 +1,180 @@ +; 7zCrcOpt.asm -- CRC32 calculation : optimized version +; 2021-02-07 : Igor Pavlov : Public domain + +include 7zAsm.asm + +MY_ASM_START + +rD equ r2 +rN equ r7 +rT equ r5 + +ifdef x64 + num_VAR equ r8 + table_VAR equ r9 +else + if (IS_CDECL gt 0) + crc_OFFS equ (REG_SIZE * 5) + data_OFFS equ (REG_SIZE + crc_OFFS) + size_OFFS equ (REG_SIZE + data_OFFS) + else + size_OFFS equ (REG_SIZE * 5) + endif + table_OFFS equ (REG_SIZE + size_OFFS) + num_VAR equ [r4 + size_OFFS] + table_VAR equ [r4 + table_OFFS] +endif + +SRCDAT equ rD + rN * 1 + 4 * + +CRC macro op:req, dest:req, src:req, t:req + op dest, DWORD PTR [rT + src * 4 + 0400h * t] +endm + +CRC_XOR macro dest:req, src:req, t:req + CRC xor, dest, src, t +endm + +CRC_MOV macro dest:req, src:req, t:req + CRC mov, dest, src, t +endm + +CRC1b macro + movzx x6, BYTE PTR [rD] + inc rD + movzx x3, x0_L + xor x6, x3 + shr x0, 8 + CRC xor, x0, r6, 0 + dec rN +endm + +MY_PROLOG macro crc_end:req + + ifdef x64 + if (IS_LINUX gt 0) + MY_PUSH_2_REGS + mov x0, REG_ABI_PARAM_0_x ; x0 = x7 + mov rT, REG_ABI_PARAM_3 ; r5 = r1 + mov rN, REG_ABI_PARAM_2 ; r7 = r2 + mov rD, 
REG_ABI_PARAM_1 ; r2 = r6 + else + MY_PUSH_4_REGS + mov x0, REG_ABI_PARAM_0_x ; x0 = x1 + mov rT, REG_ABI_PARAM_3 ; r5 = r9 + mov rN, REG_ABI_PARAM_2 ; r7 = r8 + ; mov rD, REG_ABI_PARAM_1 ; r2 = r2 + endif + else + MY_PUSH_4_REGS + if (IS_CDECL gt 0) + mov x0, [r4 + crc_OFFS] + mov rD, [r4 + data_OFFS] + else + mov x0, REG_ABI_PARAM_0_x + endif + mov rN, num_VAR + mov rT, table_VAR + endif + + test rN, rN + jz crc_end + @@: + test rD, 7 + jz @F + CRC1b + jnz @B + @@: + cmp rN, 16 + jb crc_end + add rN, rD + mov num_VAR, rN + sub rN, 8 + and rN, NOT 7 + sub rD, rN + xor x0, [SRCDAT 0] +endm + +MY_EPILOG macro crc_end:req + xor x0, [SRCDAT 0] + mov rD, rN + mov rN, num_VAR + sub rN, rD + crc_end: + test rN, rN + jz @F + CRC1b + jmp crc_end + @@: + if (IS_X64 gt 0) and (IS_LINUX gt 0) + MY_POP_2_REGS + else + MY_POP_4_REGS + endif +endm + +MY_PROC CrcUpdateT8, 4 + MY_PROLOG crc_end_8 + mov x1, [SRCDAT 1] + align 16 + main_loop_8: + mov x6, [SRCDAT 2] + movzx x3, x1_L + CRC_XOR x6, r3, 3 + movzx x3, x1_H + CRC_XOR x6, r3, 2 + shr x1, 16 + movzx x3, x1_L + movzx x1, x1_H + CRC_XOR x6, r3, 1 + movzx x3, x0_L + CRC_XOR x6, r1, 0 + + mov x1, [SRCDAT 3] + CRC_XOR x6, r3, 7 + movzx x3, x0_H + shr x0, 16 + CRC_XOR x6, r3, 6 + movzx x3, x0_L + CRC_XOR x6, r3, 5 + movzx x3, x0_H + CRC_MOV x0, r3, 4 + xor x0, x6 + add rD, 8 + jnz main_loop_8 + + MY_EPILOG crc_end_8 +MY_ENDP + +MY_PROC CrcUpdateT4, 4 + MY_PROLOG crc_end_4 + align 16 + main_loop_4: + movzx x1, x0_L + movzx x3, x0_H + shr x0, 16 + movzx x6, x0_H + and x0, 0FFh + CRC_MOV x1, r1, 3 + xor x1, [SRCDAT 1] + CRC_XOR x1, r3, 2 + CRC_XOR x1, r6, 0 + CRC_XOR x1, r0, 1 + + movzx x0, x1_L + movzx x3, x1_H + shr x1, 16 + movzx x6, x1_H + and x1, 0FFh + CRC_MOV x0, r0, 3 + xor x0, [SRCDAT 2] + CRC_XOR x0, r3, 2 + CRC_XOR x0, r6, 0 + CRC_XOR x0, r1, 1 + add rD, 8 + jnz main_loop_4 + + MY_EPILOG crc_end_4 +MY_ENDP + +end diff --git a/Asm/x86/AesOpt.asm b/Asm/x86/AesOpt.asm new file mode 100644 index 0000000..84bf897 --- /dev/null +++ b/Asm/x86/AesOpt.asm @@ -0,0 +1,742 @@ +; AesOpt.asm -- AES optimized code for x86 AES hardware instructions +; 2021-12-25 : Igor Pavlov : Public domain + +include 7zAsm.asm + +ifdef __ASMC__ + use_vaes_256 equ 1 +else +ifdef ymm0 + use_vaes_256 equ 1 +endif +endif + + +ifdef use_vaes_256 + ECHO "++ VAES 256" +else + ECHO "-- NO VAES 256" +endif + +ifdef x64 + ECHO "x86-64" +else + ECHO "x86" +if (IS_CDECL gt 0) + ECHO "ABI : CDECL" +else + ECHO "ABI : no CDECL : FASTCALL" +endif +endif + +if (IS_LINUX gt 0) + ECHO "ABI : LINUX" +else + ECHO "ABI : WINDOWS" +endif + +MY_ASM_START + +ifndef x64 + .686 + .xmm +endif + + +; MY_ALIGN EQU ALIGN(64) +MY_ALIGN EQU + +SEG_ALIGN EQU MY_ALIGN + +MY_SEG_PROC macro name:req, numParams:req + ; seg_name equ @CatStr(_TEXT$, name) + ; seg_name SEGMENT SEG_ALIGN 'CODE' + MY_PROC name, numParams +endm + +MY_SEG_ENDP macro + ; seg_name ENDS +endm + + +NUM_AES_KEYS_MAX equ 15 + +; the number of push operators in function PROLOG +if (IS_LINUX eq 0) or (IS_X64 eq 0) +num_regs_push equ 2 +stack_param_offset equ (REG_SIZE * (1 + num_regs_push)) +endif + +ifdef x64 + num_param equ REG_ABI_PARAM_2 +else + if (IS_CDECL gt 0) + ; size_t size + ; void * data + ; UInt32 * aes + ; ret-ip <- (r4) + aes_OFFS equ (stack_param_offset) + data_OFFS equ (REG_SIZE + aes_OFFS) + size_OFFS equ (REG_SIZE + data_OFFS) + num_param equ [r4 + size_OFFS] + else + num_param equ [r4 + stack_param_offset] + endif +endif + +keys equ REG_PARAM_0 ; r1 +rD equ REG_PARAM_1 ; r2 +rN equ r0 + +koffs_x equ x7 +koffs_r equ r7 + 
+ksize_x equ x6 +ksize_r equ r6 + +keys2 equ r3 + +state equ xmm0 +key equ xmm0 +key_ymm equ ymm0 +key_ymm_n equ 0 + +ifdef x64 + ways = 11 +else + ways = 4 +endif + +ways_start_reg equ 1 + +iv equ @CatStr(xmm, %(ways_start_reg + ways)) +iv_ymm equ @CatStr(ymm, %(ways_start_reg + ways)) + + +WOP macro op, op2 + i = 0 + rept ways + op @CatStr(xmm, %(ways_start_reg + i)), op2 + i = i + 1 + endm +endm + + +ifndef ABI_LINUX +ifdef x64 + +; we use 32 bytes of home space in stack in WIN64-x64 +NUM_HOME_MM_REGS equ (32 / 16) +; we preserve xmm registers starting from xmm6 in WIN64-x64 +MM_START_SAVE_REG equ 6 + +SAVE_XMM macro num_used_mm_regs:req + num_save_mm_regs = num_used_mm_regs - MM_START_SAVE_REG + if num_save_mm_regs GT 0 + num_save_mm_regs2 = num_save_mm_regs - NUM_HOME_MM_REGS + ; RSP is (16*x + 8) after entering the function in WIN64-x64 + stack_offset = 16 * num_save_mm_regs2 + (stack_param_offset mod 16) + + i = 0 + rept num_save_mm_regs + + if i eq NUM_HOME_MM_REGS + sub r4, stack_offset + endif + + if i lt NUM_HOME_MM_REGS + movdqa [r4 + stack_param_offset + i * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i)) + else + movdqa [r4 + (i - NUM_HOME_MM_REGS) * 16], @CatStr(xmm, %(MM_START_SAVE_REG + i)) + endif + + i = i + 1 + endm + endif +endm + +RESTORE_XMM macro num_used_mm_regs:req + if num_save_mm_regs GT 0 + i = 0 + if num_save_mm_regs2 GT 0 + rept num_save_mm_regs2 + movdqa @CatStr(xmm, %(MM_START_SAVE_REG + NUM_HOME_MM_REGS + i)), [r4 + i * 16] + i = i + 1 + endm + add r4, stack_offset + endif + + num_low_regs = num_save_mm_regs - i + i = 0 + rept num_low_regs + movdqa @CatStr(xmm, %(MM_START_SAVE_REG + i)), [r4 + stack_param_offset + i * 16] + i = i + 1 + endm + endif +endm + +endif ; x64 +endif ; ABI_LINUX + + +MY_PROLOG macro num_used_mm_regs:req + ; num_regs_push: must be equal to the number of push operators + ; push r3 + ; push r5 + if (IS_LINUX eq 0) or (IS_X64 eq 0) + push r6 + push r7 + endif + + mov rN, num_param ; don't move it; num_param can use stack pointer (r4) + + if (IS_X64 eq 0) + if (IS_CDECL gt 0) + mov rD, [r4 + data_OFFS] + mov keys, [r4 + aes_OFFS] + endif + elseif (IS_LINUX gt 0) + MY_ABI_LINUX_TO_WIN_2 + endif + + + ifndef ABI_LINUX + ifdef x64 + SAVE_XMM num_used_mm_regs + endif + endif + + mov ksize_x, [keys + 16] + shl ksize_x, 5 +endm + + +MY_EPILOG macro + ifndef ABI_LINUX + ifdef x64 + RESTORE_XMM num_save_mm_regs + endif + endif + + if (IS_LINUX eq 0) or (IS_X64 eq 0) + pop r7 + pop r6 + endif + ; pop r5 + ; pop r3 + MY_ENDP +endm + + +OP_KEY macro op:req, offs:req + op state, [keys + offs] +endm + + +WOP_KEY macro op:req, offs:req + movdqa key, [keys + offs] + WOP op, key +endm + + +; ---------- AES-CBC Decode ---------- + + +XOR_WITH_DATA macro reg, _ppp_ + pxor reg, [rD + i * 16] +endm + +WRITE_TO_DATA macro reg, _ppp_ + movdqa [rD + i * 16], reg +endm + + +; state0 equ @CatStr(xmm, %(ways_start_reg)) + +key0 equ @CatStr(xmm, %(ways_start_reg + ways + 1)) +key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1)) + +key_last equ @CatStr(xmm, %(ways_start_reg + ways + 2)) +key_last_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2)) +key_last_ymm_n equ (ways_start_reg + ways + 2) + +NUM_CBC_REGS equ (ways_start_reg + ways + 3) + + +MY_SEG_PROC AesCbc_Decode_HW, 3 + + AesCbc_Decode_HW_start:: + MY_PROLOG NUM_CBC_REGS + + AesCbc_Decode_HW_start_2:: + movdqa iv, [keys] + add keys, 32 + + movdqa key0, [keys + 1 * ksize_r] + movdqa key_last, [keys] + sub ksize_x, 16 + + jmp check2 + align 16 + nextBlocks2: + WOP movdqa, [rD + i * 16] + mov koffs_x, 
ksize_x + ; WOP_KEY pxor, ksize_r + 16 + WOP pxor, key0 + ; align 16 + @@: + WOP_KEY aesdec, 1 * koffs_r + sub koffs_r, 16 + jnz @B + ; WOP_KEY aesdeclast, 0 + WOP aesdeclast, key_last + + pxor @CatStr(xmm, %(ways_start_reg)), iv + i = 1 + rept ways - 1 + pxor @CatStr(xmm, %(ways_start_reg + i)), [rD + i * 16 - 16] + i = i + 1 + endm + movdqa iv, [rD + ways * 16 - 16] + WOP WRITE_TO_DATA + + add rD, ways * 16 + AesCbc_Decode_HW_start_3:: + check2: + sub rN, ways + jnc nextBlocks2 + add rN, ways + + sub ksize_x, 16 + + jmp check + nextBlock: + movdqa state, [rD] + mov koffs_x, ksize_x + ; OP_KEY pxor, 1 * ksize_r + 32 + pxor state, key0 + ; movdqa state0, [rD] + ; movdqa state, key0 + ; pxor state, state0 + @@: + OP_KEY aesdec, 1 * koffs_r + 16 + OP_KEY aesdec, 1 * koffs_r + sub koffs_r, 32 + jnz @B + OP_KEY aesdec, 16 + ; OP_KEY aesdeclast, 0 + aesdeclast state, key_last + + pxor state, iv + movdqa iv, [rD] + ; movdqa iv, state0 + movdqa [rD], state + + add rD, 16 + check: + sub rN, 1 + jnc nextBlock + + movdqa [keys - 32], iv +MY_EPILOG + + + + +; ---------- AVX ---------- + + +AVX__WOP_n macro op + i = 0 + rept ways + op (ways_start_reg + i) + i = i + 1 + endm +endm + +AVX__WOP macro op + i = 0 + rept ways + op @CatStr(ymm, %(ways_start_reg + i)) + i = i + 1 + endm +endm + + +AVX__WOP_KEY macro op:req, offs:req + vmovdqa key_ymm, ymmword ptr [keys2 + offs] + AVX__WOP_n op +endm + + +AVX__CBC_START macro reg + ; vpxor reg, key_ymm, ymmword ptr [rD + 32 * i] + vpxor reg, key0_ymm, ymmword ptr [rD + 32 * i] +endm + +AVX__CBC_END macro reg + if i eq 0 + vpxor reg, reg, iv_ymm + else + vpxor reg, reg, ymmword ptr [rD + i * 32 - 16] + endif +endm + + +AVX__WRITE_TO_DATA macro reg + vmovdqu ymmword ptr [rD + 32 * i], reg +endm + +AVX__XOR_WITH_DATA macro reg + vpxor reg, reg, ymmword ptr [rD + 32 * i] +endm + +AVX__CTR_START macro reg + vpaddq iv_ymm, iv_ymm, one_ymm + ; vpxor reg, iv_ymm, key_ymm + vpxor reg, iv_ymm, key0_ymm +endm + + +MY_VAES_INSTR_2 macro cmd, dest, a1, a2 + db 0c4H + db 2 + 040H + 020h * (1 - (a2) / 8) + 080h * (1 - (dest) / 8) + db 5 + 8 * ((not (a1)) and 15) + db cmd + db 0c0H + 8 * ((dest) and 7) + ((a2) and 7) +endm + +MY_VAES_INSTR macro cmd, dest, a + MY_VAES_INSTR_2 cmd, dest, dest, a +endm + +MY_vaesenc macro dest, a + MY_VAES_INSTR 0dcH, dest, a +endm +MY_vaesenclast macro dest, a + MY_VAES_INSTR 0ddH, dest, a +endm +MY_vaesdec macro dest, a + MY_VAES_INSTR 0deH, dest, a +endm +MY_vaesdeclast macro dest, a + MY_VAES_INSTR 0dfH, dest, a +endm + + +AVX__VAES_DEC macro reg + MY_vaesdec reg, key_ymm_n +endm + +AVX__VAES_DEC_LAST_key_last macro reg + ; MY_vaesdeclast reg, key_ymm_n + MY_vaesdeclast reg, key_last_ymm_n +endm + +AVX__VAES_ENC macro reg + MY_vaesenc reg, key_ymm_n +endm + +AVX__VAES_ENC_LAST macro reg + MY_vaesenclast reg, key_ymm_n +endm + +AVX__vinserti128_TO_HIGH macro dest, src + vinserti128 dest, dest, src, 1 +endm + + +MY_PROC AesCbc_Decode_HW_256, 3 + ifdef use_vaes_256 + MY_PROLOG NUM_CBC_REGS + + cmp rN, ways * 2 + jb AesCbc_Decode_HW_start_2 + + vmovdqa iv, xmmword ptr [keys] + add keys, 32 + + vbroadcasti128 key0_ymm, xmmword ptr [keys + 1 * ksize_r] + vbroadcasti128 key_last_ymm, xmmword ptr [keys] + sub ksize_x, 16 + mov koffs_x, ksize_x + add ksize_x, ksize_x + + AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 2) * 32) + push keys2 + sub r4, AVX_STACK_SUB + ; sub r4, 32 + ; sub r4, ksize_r + ; lea keys2, [r4 + 32] + mov keys2, r4 + and keys2, -32 + broad: + vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r] + vmovdqa ymmword ptr [keys2 + 
koffs_r * 2], key_ymm + sub koffs_r, 16 + ; jnc broad + jnz broad + + sub rN, ways * 2 + + align 16 + avx_cbcdec_nextBlock2: + mov koffs_x, ksize_x + ; AVX__WOP_KEY AVX__CBC_START, 1 * koffs_r + 32 + AVX__WOP AVX__CBC_START + @@: + AVX__WOP_KEY AVX__VAES_DEC, 1 * koffs_r + sub koffs_r, 32 + jnz @B + ; AVX__WOP_KEY AVX__VAES_DEC_LAST, 0 + AVX__WOP_n AVX__VAES_DEC_LAST_key_last + + AVX__vinserti128_TO_HIGH iv_ymm, xmmword ptr [rD] + AVX__WOP AVX__CBC_END + + vmovdqa iv, xmmword ptr [rD + ways * 32 - 16] + AVX__WOP AVX__WRITE_TO_DATA + + add rD, ways * 32 + sub rN, ways * 2 + jnc avx_cbcdec_nextBlock2 + add rN, ways * 2 + + shr ksize_x, 1 + + ; lea r4, [r4 + 1 * ksize_r + 32] + add r4, AVX_STACK_SUB + pop keys2 + + vzeroupper + jmp AesCbc_Decode_HW_start_3 + else + jmp AesCbc_Decode_HW_start + endif +MY_ENDP +MY_SEG_ENDP + + + + +; ---------- AES-CBC Encode ---------- + +e0 equ xmm1 + +CENC_START_KEY equ 2 +CENC_NUM_REG_KEYS equ (3 * 2) +; last_key equ @CatStr(xmm, %(CENC_START_KEY + CENC_NUM_REG_KEYS)) + +MY_SEG_PROC AesCbc_Encode_HW, 3 + MY_PROLOG (CENC_START_KEY + CENC_NUM_REG_KEYS + 0) + + movdqa state, [keys] + add keys, 32 + + i = 0 + rept CENC_NUM_REG_KEYS + movdqa @CatStr(xmm, %(CENC_START_KEY + i)), [keys + i * 16] + i = i + 1 + endm + + add keys, ksize_r + neg ksize_r + add ksize_r, (16 * CENC_NUM_REG_KEYS) + ; movdqa last_key, [keys] + jmp check_e + + align 16 + nextBlock_e: + movdqa e0, [rD] + mov koffs_r, ksize_r + pxor e0, @CatStr(xmm, %(CENC_START_KEY)) + pxor state, e0 + + i = 1 + rept (CENC_NUM_REG_KEYS - 1) + aesenc state, @CatStr(xmm, %(CENC_START_KEY + i)) + i = i + 1 + endm + + @@: + OP_KEY aesenc, 1 * koffs_r + OP_KEY aesenc, 1 * koffs_r + 16 + add koffs_r, 32 + jnz @B + OP_KEY aesenclast, 0 + ; aesenclast state, last_key + + movdqa [rD], state + add rD, 16 + check_e: + sub rN, 1 + jnc nextBlock_e + + ; movdqa [keys - 32], state + movdqa [keys + 1 * ksize_r - (16 * CENC_NUM_REG_KEYS) - 32], state +MY_EPILOG +MY_SEG_ENDP + + + +; ---------- AES-CTR ---------- + +ifdef x64 + ; ways = 11 +endif + + +one equ @CatStr(xmm, %(ways_start_reg + ways + 1)) +one_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 1)) +key0 equ @CatStr(xmm, %(ways_start_reg + ways + 2)) +key0_ymm equ @CatStr(ymm, %(ways_start_reg + ways + 2)) +NUM_CTR_REGS equ (ways_start_reg + ways + 3) + +INIT_CTR macro reg, _ppp_ + paddq iv, one + movdqa reg, iv +endm + + +MY_SEG_PROC AesCtr_Code_HW, 3 + Ctr_start:: + MY_PROLOG NUM_CTR_REGS + + Ctr_start_2:: + movdqa iv, [keys] + add keys, 32 + movdqa key0, [keys] + + add keys, ksize_r + neg ksize_r + add ksize_r, 16 + + Ctr_start_3:: + mov koffs_x, 1 + movd one, koffs_x + jmp check2_c + + align 16 + nextBlocks2_c: + WOP INIT_CTR, 0 + mov koffs_r, ksize_r + ; WOP_KEY pxor, 1 * koffs_r -16 + WOP pxor, key0 + @@: + WOP_KEY aesenc, 1 * koffs_r + add koffs_r, 16 + jnz @B + WOP_KEY aesenclast, 0 + + WOP XOR_WITH_DATA + WOP WRITE_TO_DATA + add rD, ways * 16 + check2_c: + sub rN, ways + jnc nextBlocks2_c + add rN, ways + + sub keys, 16 + add ksize_r, 16 + + jmp check_c + + ; align 16 + nextBlock_c: + paddq iv, one + ; movdqa state, [keys + 1 * koffs_r - 16] + movdqa state, key0 + mov koffs_r, ksize_r + pxor state, iv + + @@: + OP_KEY aesenc, 1 * koffs_r + OP_KEY aesenc, 1 * koffs_r + 16 + add koffs_r, 32 + jnz @B + OP_KEY aesenc, 0 + OP_KEY aesenclast, 16 + + pxor state, [rD] + movdqa [rD], state + add rD, 16 + check_c: + sub rN, 1 + jnc nextBlock_c + + ; movdqa [keys - 32], iv + movdqa [keys + 1 * ksize_r - 16 - 32], iv +MY_EPILOG + + +MY_PROC AesCtr_Code_HW_256, 3 + ifdef 
use_vaes_256 + MY_PROLOG NUM_CTR_REGS + + cmp rN, ways * 2 + jb Ctr_start_2 + + vbroadcasti128 iv_ymm, xmmword ptr [keys] + add keys, 32 + vbroadcasti128 key0_ymm, xmmword ptr [keys] + mov koffs_x, 1 + vmovd one, koffs_x + vpsubq iv_ymm, iv_ymm, one_ymm + vpaddq one, one, one + AVX__vinserti128_TO_HIGH one_ymm, one + + add keys, ksize_r + sub ksize_x, 16 + neg ksize_r + mov koffs_r, ksize_r + add ksize_r, ksize_r + + AVX_STACK_SUB = ((NUM_AES_KEYS_MAX + 1 - 1) * 32) + push keys2 + lea keys2, [r4 - 32] + sub r4, AVX_STACK_SUB + and keys2, -32 + vbroadcasti128 key_ymm, xmmword ptr [keys] + vmovdqa ymmword ptr [keys2], key_ymm + @@: + vbroadcasti128 key_ymm, xmmword ptr [keys + 1 * koffs_r] + vmovdqa ymmword ptr [keys2 + koffs_r * 2], key_ymm + add koffs_r, 16 + jnz @B + + sub rN, ways * 2 + + align 16 + avx_ctr_nextBlock2: + mov koffs_r, ksize_r + AVX__WOP AVX__CTR_START + ; AVX__WOP_KEY AVX__CTR_START, 1 * koffs_r - 32 + @@: + AVX__WOP_KEY AVX__VAES_ENC, 1 * koffs_r + add koffs_r, 32 + jnz @B + AVX__WOP_KEY AVX__VAES_ENC_LAST, 0 + + AVX__WOP AVX__XOR_WITH_DATA + AVX__WOP AVX__WRITE_TO_DATA + + add rD, ways * 32 + sub rN, ways * 2 + jnc avx_ctr_nextBlock2 + add rN, ways * 2 + + vextracti128 iv, iv_ymm, 1 + sar ksize_r, 1 + + add r4, AVX_STACK_SUB + pop keys2 + + vzeroupper + jmp Ctr_start_3 + else + jmp Ctr_start + endif +MY_ENDP +MY_SEG_ENDP + +end diff --git a/Asm/x86/LzFindOpt.asm b/Asm/x86/LzFindOpt.asm new file mode 100644 index 0000000..42e10bd --- /dev/null +++ b/Asm/x86/LzFindOpt.asm @@ -0,0 +1,513 @@ +; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function +; 2021-07-21: Igor Pavlov : Public domain +; + +ifndef x64 +; x64=1 +; .err +endif + +include 7zAsm.asm + +MY_ASM_START + +_TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE' + +MY_ALIGN macro num:req + align num +endm + +MY_ALIGN_32 macro + MY_ALIGN 32 +endm + +MY_ALIGN_64 macro + MY_ALIGN 64 +endm + + +t0_L equ x0_L +t0_x equ x0 +t0 equ r0 +t1_x equ x3 +t1 equ r3 + +cp_x equ t1_x +cp_r equ t1 +m equ x5 +m_r equ r5 +len_x equ x6 +len equ r6 +diff_x equ x7 +diff equ r7 +len0 equ r10 +len1_x equ x11 +len1 equ r11 +maxLen_x equ x12 +maxLen equ r12 +d equ r13 +ptr0 equ r14 +ptr1 equ r15 + +d_lim equ m_r +cycSize equ len_x +hash_lim equ len0 +delta1_x equ len1_x +delta1_r equ len1 +delta_x equ maxLen_x +delta_r equ maxLen +hash equ ptr0 +src equ ptr1 + + + +if (IS_LINUX gt 0) + +; r1 r2 r8 r9 : win32 +; r7 r6 r2 r1 r8 r9 : linux + +lenLimit equ r8 +lenLimit_x equ x8 +; pos_r equ r2 +pos equ x2 +cur equ r1 +son equ r9 + +else + +lenLimit equ REG_ABI_PARAM_2 +lenLimit_x equ REG_ABI_PARAM_2_x +pos equ REG_ABI_PARAM_1_x +cur equ REG_ABI_PARAM_0 +son equ REG_ABI_PARAM_3 + +endif + + +if (IS_LINUX gt 0) + maxLen_OFFS equ (REG_SIZE * (6 + 1)) +else + cutValue_OFFS equ (REG_SIZE * (8 + 1 + 4)) + d_OFFS equ (REG_SIZE + cutValue_OFFS) + maxLen_OFFS equ (REG_SIZE + d_OFFS) +endif + hash_OFFS equ (REG_SIZE + maxLen_OFFS) + limit_OFFS equ (REG_SIZE + hash_OFFS) + size_OFFS equ (REG_SIZE + limit_OFFS) + cycPos_OFFS equ (REG_SIZE + size_OFFS) + cycSize_OFFS equ (REG_SIZE + cycPos_OFFS) + posRes_OFFS equ (REG_SIZE + cycSize_OFFS) + +if (IS_LINUX gt 0) +else + cutValue_PAR equ [r0 + cutValue_OFFS] + d_PAR equ [r0 + d_OFFS] +endif + maxLen_PAR equ [r0 + maxLen_OFFS] + hash_PAR equ [r0 + hash_OFFS] + limit_PAR equ [r0 + limit_OFFS] + size_PAR equ [r0 + size_OFFS] + cycPos_PAR equ [r0 + cycPos_OFFS] + cycSize_PAR equ [r0 + cycSize_OFFS] + posRes_PAR equ [r0 + posRes_OFFS] + + + cutValue_VAR equ DWORD PTR [r4 + 8 * 0] + cutValueCur_VAR equ DWORD PTR [r4 + 8 
* 0 + 4] + cycPos_VAR equ DWORD PTR [r4 + 8 * 1 + 0] + cycSize_VAR equ DWORD PTR [r4 + 8 * 1 + 4] + hash_VAR equ QWORD PTR [r4 + 8 * 2] + limit_VAR equ QWORD PTR [r4 + 8 * 3] + size_VAR equ QWORD PTR [r4 + 8 * 4] + distances equ QWORD PTR [r4 + 8 * 5] + maxLen_VAR equ QWORD PTR [r4 + 8 * 6] + + Old_RSP equ QWORD PTR [r4 + 8 * 7] + LOCAL_SIZE equ 8 * 8 + +COPY_VAR_32 macro dest_var, src_var + mov x3, src_var + mov dest_var, x3 +endm + +COPY_VAR_64 macro dest_var, src_var + mov r3, src_var + mov dest_var, r3 +endm + + +; MY_ALIGN_64 +MY_PROC GetMatchesSpecN_2, 13 +MY_PUSH_PRESERVED_ABI_REGS + mov r0, RSP + lea r3, [r0 - LOCAL_SIZE] + and r3, -64 + mov RSP, r3 + mov Old_RSP, r0 + +if (IS_LINUX gt 0) + mov d, REG_ABI_PARAM_5 ; r13 = r9 + mov cutValue_VAR, REG_ABI_PARAM_4_x ; = r8 + mov son, REG_ABI_PARAM_3 ; r9 = r1 + mov r8, REG_ABI_PARAM_2 ; r8 = r2 + mov pos, REG_ABI_PARAM_1_x ; r2 = x6 + mov r1, REG_ABI_PARAM_0 ; r1 = r7 +else + COPY_VAR_32 cutValue_VAR, cutValue_PAR + mov d, d_PAR +endif + + COPY_VAR_64 limit_VAR, limit_PAR + + mov hash_lim, size_PAR + mov size_VAR, hash_lim + + mov cp_x, cycPos_PAR + mov hash, hash_PAR + + mov cycSize, cycSize_PAR + mov cycSize_VAR, cycSize + + ; we want cur in (rcx). So we change the cur and lenLimit variables + sub lenLimit, cur + neg lenLimit_x + inc lenLimit_x + + mov t0_x, maxLen_PAR + sub t0, lenLimit + mov maxLen_VAR, t0 + + jmp main_loop + +MY_ALIGN_64 +fill_empty: + ; ptr0 = *ptr1 = kEmptyHashValue; + mov QWORD PTR [ptr1], 0 + inc pos + inc cp_x + mov DWORD PTR [d - 4], 0 + cmp d, limit_VAR + jae fin + cmp hash, hash_lim + je fin + +; MY_ALIGN_64 +main_loop: + ; UInt32 delta = *hash++; + mov diff_x, [hash] ; delta + add hash, 4 + ; mov cycPos_VAR, cp_x + + inc cur + add d, 4 + mov m, pos + sub m, diff_x; ; matchPos + + ; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; + lea ptr1, [son + 8 * cp_r] + ; mov cycSize, cycSize_VAR + cmp pos, cycSize + jb directMode ; if (pos < cycSize_VAR) + + ; CYC MODE + + cmp diff_x, cycSize + jae fill_empty ; if (delta >= cycSize_VAR) + + xor t0_x, t0_x + mov cycPos_VAR, cp_x + sub cp_x, diff_x + ; jae prepare_for_tree_loop + ; add cp_x, cycSize + cmovb t0_x, cycSize + add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0) + jmp prepare_for_tree_loop + + +directMode: + cmp diff_x, pos + je fill_empty ; if (delta == pos) + jae fin_error ; if (delta >= pos) + + mov cycPos_VAR, cp_x + mov cp_x, m + +prepare_for_tree_loop: + mov len0, lenLimit + mov hash_VAR, hash + ; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1; + lea ptr0, [ptr1 + 4] + ; UInt32 *_distances = ++d; + mov distances, d + + neg len0 + mov len1, len0 + + mov t0_x, cutValue_VAR + mov maxLen, maxLen_VAR + mov cutValueCur_VAR, t0_x + +MY_ALIGN_32 +tree_loop: + neg diff + mov len, len0 + cmp len1, len0 + cmovb len, len1 ; len = (len1 < len0 ? 
len1 : len0); + add diff, cur + + mov t0_x, [son + cp_r * 8] ; prefetch + movzx t0_x, BYTE PTR [diff + 1 * len] + lea cp_r, [son + cp_r * 8] + cmp [cur + 1 * len], t0_L + je matched_1 + + jb left_0 + + mov [ptr1], m + mov m, [cp_r + 4] + lea ptr1, [cp_r + 4] + sub diff, cur ; FIX32 + jmp next_node + +MY_ALIGN_32 +left_0: + mov [ptr0], m + mov m, [cp_r] + mov ptr0, cp_r + sub diff, cur ; FIX32 + ; jmp next_node + +; ------------ NEXT NODE ------------ +; MY_ALIGN_32 +next_node: + mov cycSize, cycSize_VAR + dec cutValueCur_VAR + je finish_tree + + add diff_x, pos ; prev_match = pos + diff + cmp m, diff_x + jae fin_error ; if (new_match >= prev_match) + + mov diff_x, pos + sub diff_x, m ; delta = pos - new_match + cmp pos, cycSize + jae cyc_mode_2 ; if (pos >= cycSize) + + mov cp_x, m + test m, m + jne tree_loop ; if (m != 0) + +finish_tree: + ; ptr0 = *ptr1 = kEmptyHashValue; + mov DWORD PTR [ptr0], 0 + mov DWORD PTR [ptr1], 0 + + inc pos + + ; _distances[-1] = (UInt32)(d - _distances); + mov t0, distances + mov t1, d + sub t1, t0 + shr t1_x, 2 + mov [t0 - 4], t1_x + + cmp d, limit_VAR + jae fin ; if (d >= limit) + + mov cp_x, cycPos_VAR + mov hash, hash_VAR + mov hash_lim, size_VAR + inc cp_x + cmp hash, hash_lim + jne main_loop ; if (hash != size) + jmp fin + + +MY_ALIGN_32 +cyc_mode_2: + cmp diff_x, cycSize + jae finish_tree ; if (delta >= cycSize) + + mov cp_x, cycPos_VAR + xor t0_x, t0_x + sub cp_x, diff_x ; cp_x = cycPos - delta + cmovb t0_x, cycSize + add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0) + jmp tree_loop + + +MY_ALIGN_32 +matched_1: + + inc len + ; cmp len_x, lenLimit_x + je short lenLimit_reach + movzx t0_x, BYTE PTR [diff + 1 * len] + cmp [cur + 1 * len], t0_L + jne mismatch + + +MY_ALIGN_32 +match_loop: + ; while (++len != lenLimit) (len[diff] != len[0]) ; + + inc len + ; cmp len_x, lenLimit_x + je short lenLimit_reach + movzx t0_x, BYTE PTR [diff + 1 * len] + cmp BYTE PTR [cur + 1 * len], t0_L + je match_loop + +mismatch: + jb left_2 + + mov [ptr1], m + mov m, [cp_r + 4] + lea ptr1, [cp_r + 4] + mov len1, len + + jmp max_update + +MY_ALIGN_32 +left_2: + mov [ptr0], m + mov m, [cp_r] + mov ptr0, cp_r + mov len0, len + +max_update: + sub diff, cur ; restore diff + + cmp maxLen, len + jae next_node + + mov maxLen, len + add len, lenLimit + mov [d], len_x + mov t0_x, diff_x + not t0_x + mov [d + 4], t0_x + add d, 8 + + jmp next_node + + + +MY_ALIGN_32 +lenLimit_reach: + + mov delta_r, cur + sub delta_r, diff + lea delta1_r, [delta_r - 1] + + mov t0_x, [cp_r] + mov [ptr1], t0_x + mov t0_x, [cp_r + 4] + mov [ptr0], t0_x + + mov [d], lenLimit_x + mov [d + 4], delta1_x + add d, 8 + + ; _distances[-1] = (UInt32)(d - _distances); + mov t0, distances + mov t1, d + sub t1, t0 + shr t1_x, 2 + mov [t0 - 4], t1_x + + mov hash, hash_VAR + mov hash_lim, size_VAR + + inc pos + mov cp_x, cycPos_VAR + inc cp_x + + mov d_lim, limit_VAR + mov cycSize, cycSize_VAR + ; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) + ; break; + cmp hash, hash_lim + je fin + cmp d, d_lim + jae fin + cmp delta_x, [hash] + jne main_loop + movzx t0_x, BYTE PTR [diff] + cmp [cur], t0_L + jne main_loop + + ; jmp main_loop ; bypass for debug + + mov cycPos_VAR, cp_x + shl len, 3 ; cycSize * 8 + sub diff, cur ; restore diff + xor t0_x, t0_x + cmp cp_x, delta_x ; cmp (cycPos_VAR, delta) + lea cp_r, [son + 8 * cp_r] ; dest + lea src, [cp_r + 8 * diff] + cmovb t0, len ; t0 = (cycPos_VAR < delta ? 
cycSize * 8 : 0) + add src, t0 + add len, son ; len = son + cycSize * 8 + + +MY_ALIGN_32 +long_loop: + add hash, 4 + + ; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff]; + + mov t0, [src] + add src, 8 + mov [cp_r], t0 + add cp_r, 8 + cmp src, len + cmove src, son ; if end of (son) buffer is reached, we wrap to begin + + mov DWORD PTR [d], 2 + mov [d + 4], lenLimit_x + mov [d + 8], delta1_x + add d, 12 + + inc cur + + cmp hash, hash_lim + je long_footer + cmp delta_x, [hash] + jne long_footer + movzx t0_x, BYTE PTR [diff + 1 * cur] + cmp [cur], t0_L + jne long_footer + cmp d, d_lim + jb long_loop + +long_footer: + sub cp_r, son + shr cp_r, 3 + add pos, cp_x + sub pos, cycPos_VAR + mov cycSize, cycSize_VAR + + cmp d, d_lim + jae fin + cmp hash, hash_lim + jne main_loop + jmp fin + + + +fin_error: + xor d, d + +fin: + mov RSP, Old_RSP + mov t0, [r4 + posRes_OFFS] + mov [t0], pos + mov r0, d + +MY_POP_PRESERVED_ABI_REGS +MY_ENDP + +_TEXT$LZFINDOPT ENDS + +end diff --git a/Asm/x86/LzmaDecOpt.asm b/Asm/x86/LzmaDecOpt.asm new file mode 100644 index 0000000..f2818e7 --- /dev/null +++ b/Asm/x86/LzmaDecOpt.asm @@ -0,0 +1,1303 @@ +; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function +; 2021-02-23: Igor Pavlov : Public domain +; +; 3 - is the code compatibility version of LzmaDec_DecodeReal_*() +; function for check at link time. +; That code is tightly coupled with LzmaDec_TryDummy() +; and with another functions in LzmaDec.c file. +; CLzmaDec structure, (probs) array layout, input and output of +; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM). + +ifndef x64 +; x64=1 +; .err +endif + +include 7zAsm.asm + +MY_ASM_START + +_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE' + +MY_ALIGN macro num:req + align num +endm + +MY_ALIGN_16 macro + MY_ALIGN 16 +endm + +MY_ALIGN_32 macro + MY_ALIGN 32 +endm + +MY_ALIGN_64 macro + MY_ALIGN 64 +endm + + +; _LZMA_SIZE_OPT equ 1 + +; _LZMA_PROB32 equ 1 + +ifdef _LZMA_PROB32 + PSHIFT equ 2 + PLOAD macro dest, mem + mov dest, dword ptr [mem] + endm + PSTORE macro src, mem + mov dword ptr [mem], src + endm +else + PSHIFT equ 1 + PLOAD macro dest, mem + movzx dest, word ptr [mem] + endm + PSTORE macro src, mem + mov word ptr [mem], @CatStr(src, _W) + endm +endif + +PMULT equ (1 SHL PSHIFT) +PMULT_HALF equ (1 SHL (PSHIFT - 1)) +PMULT_2 equ (1 SHL (PSHIFT + 1)) + +kMatchSpecLen_Error_Data equ (1 SHL 9) + +; x0 range +; x1 pbPos / (prob) TREE +; x2 probBranch / prm (MATCHED) / pbPos / cnt +; x3 sym +;====== r4 === RSP +; x5 cod +; x6 t1 NORM_CALC / probs_state / dist +; x7 t0 NORM_CALC / prob2 IF_BIT_1 +; x8 state +; x9 match (MATCHED) / sym2 / dist2 / lpMask_reg +; x10 kBitModelTotal_reg +; r11 probs +; x12 offs (MATCHED) / dic / len_temp +; x13 processedPos +; x14 bit (MATCHED) / dicPos +; r15 buf + + +cod equ x5 +cod_L equ x5_L +range equ x0 +state equ x8 +state_R equ r8 +buf equ r15 +processedPos equ x13 +kBitModelTotal_reg equ x10 + +probBranch equ x2 +probBranch_R equ r2 +probBranch_W equ x2_W + +pbPos equ x1 +pbPos_R equ r1 + +cnt equ x2 +cnt_R equ r2 + +lpMask_reg equ x9 +dicPos equ r14 + +sym equ x3 +sym_R equ r3 +sym_L equ x3_L + +probs equ r11 +dic equ r12 + +t0 equ x7 +t0_W equ x7_W +t0_R equ r7 + +prob2 equ t0 +prob2_W equ t0_W + +t1 equ x6 +t1_R equ r6 + +probs_state equ t1 +probs_state_R equ t1_R + +prm equ r2 +match equ x9 +match_R equ r9 +offs equ x12 +offs_R equ r12 +bit equ x14 +bit_R equ r14 + +sym2 equ x9 +sym2_R equ r9 + +len_temp equ x12 + +dist equ sym +dist2 equ x9 + + + +kNumBitModelTotalBits equ 11 
+kBitModelTotal equ (1 SHL kNumBitModelTotalBits) +kNumMoveBits equ 5 +kBitModelOffset equ ((1 SHL kNumMoveBits) - 1) +kTopValue equ (1 SHL 24) + +NORM_2 macro + ; movzx t0, BYTE PTR [buf] + shl cod, 8 + mov cod_L, BYTE PTR [buf] + shl range, 8 + ; or cod, t0 + inc buf +endm + + +NORM macro + cmp range, kTopValue + jae SHORT @F + NORM_2 +@@: +endm + + +; ---------- Branch MACROS ---------- + +UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req + mov prob2, kBitModelTotal_reg + sub prob2, probBranch + shr prob2, kNumMoveBits + add probBranch, prob2 + PSTORE probBranch, probOffset * 1 + probsArray + probDisp * PMULT +endm + + +UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req + sub prob2, range + sub cod, range + mov range, prob2 + mov prob2, probBranch + shr probBranch, kNumMoveBits + sub prob2, probBranch + PSTORE prob2, probOffset * 1 + probsArray + probDisp * PMULT +endm + + +CMP_COD macro probsArray:req, probOffset:req, probDisp:req + PLOAD probBranch, probOffset * 1 + probsArray + probDisp * PMULT + NORM + mov prob2, range + shr range, kNumBitModelTotalBits + imul range, probBranch + cmp cod, range +endm + + +IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req + CMP_COD probsArray, probOffset, probDisp + jae toLabel +endm + + +IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req + IF_BIT_1_NOUP probsArray, probOffset, probDisp, toLabel + UPDATE_0 probsArray, probOffset, probDisp +endm + + +IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req + CMP_COD probsArray, probOffset, probDisp + jb toLabel +endm + + +; ---------- CMOV MACROS ---------- + +NORM_CALC macro prob:req + NORM + mov t0, range + shr range, kNumBitModelTotalBits + imul range, prob + sub t0, range + mov t1, cod + sub cod, range +endm + + +PUP macro prob:req, probPtr:req + sub t0, prob + ; only sar works for both 16/32 bit prob modes + sar t0, kNumMoveBits + add t0, prob + PSTORE t0, probPtr +endm + + +PUP_SUB macro prob:req, probPtr:req, symSub:req + sbb sym, symSub + PUP prob, probPtr +endm + + +PUP_COD macro prob:req, probPtr:req, symSub:req + mov t0, kBitModelOffset + cmovb cod, t1 + mov t1, sym + cmovb t0, kBitModelTotal_reg + PUP_SUB prob, probPtr, symSub +endm + + +BIT_0 macro prob:req, probNext:req + PLOAD prob, probs + 1 * PMULT + PLOAD probNext, probs + 1 * PMULT_2 + + NORM_CALC prob + + cmovae range, t0 + PLOAD t0, probs + 1 * PMULT_2 + PMULT + cmovae probNext, t0 + mov t0, kBitModelOffset + cmovb cod, t1 + cmovb t0, kBitModelTotal_reg + mov sym, 2 + PUP_SUB prob, probs + 1 * PMULT, 0 - 1 +endm + + +BIT_1 macro prob:req, probNext:req + PLOAD probNext, probs + sym_R * PMULT_2 + add sym, sym + + NORM_CALC prob + + cmovae range, t0 + PLOAD t0, probs + sym_R * PMULT + PMULT + cmovae probNext, t0 + PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1 +endm + + +BIT_2 macro prob:req, symSub:req + add sym, sym + + NORM_CALC prob + + cmovae range, t0 + PUP_COD prob, probs + t1_R * PMULT_HALF, symSub +endm + + +; ---------- MATCHED LITERAL ---------- + +LITM_0 macro + mov offs, 256 * PMULT + shl match, (PSHIFT + 1) + mov bit, offs + and bit, match + PLOAD x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT + lea prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT] + ; lea prm, [probs + 256 * PMULT + 1 * PMULT] + ; add prm, bit_R + xor offs, bit + add match, match + + NORM_CALC x1 + + cmovae offs, bit + mov bit, match + cmovae range, t0 + mov t0, kBitModelOffset + cmovb cod, t1 + cmovb t0, kBitModelTotal_reg + mov sym, 0 + PUP_SUB x1, prm, -2-1 
+endm + + +LITM macro + and bit, offs + lea prm, [probs + offs_R * 1] + add prm, bit_R + PLOAD x1, prm + sym_R * PMULT + xor offs, bit + add sym, sym + add match, match + + NORM_CALC x1 + + cmovae offs, bit + mov bit, match + cmovae range, t0 + PUP_COD x1, prm + t1_R * PMULT_HALF, - 1 +endm + + +LITM_2 macro + and bit, offs + lea prm, [probs + offs_R * 1] + add prm, bit_R + PLOAD x1, prm + sym_R * PMULT + add sym, sym + + NORM_CALC x1 + + cmovae range, t0 + PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1 +endm + + +; ---------- REVERSE BITS ---------- + +REV_0 macro prob:req, probNext:req + ; PLOAD prob, probs + 1 * PMULT + ; lea sym2_R, [probs + 2 * PMULT] + ; PLOAD probNext, probs + 2 * PMULT + PLOAD probNext, sym2_R + + NORM_CALC prob + + cmovae range, t0 + PLOAD t0, probs + 3 * PMULT + cmovae probNext, t0 + cmovb cod, t1 + mov t0, kBitModelOffset + cmovb t0, kBitModelTotal_reg + lea t1_R, [probs + 3 * PMULT] + cmovae sym2_R, t1_R + PUP prob, probs + 1 * PMULT +endm + + +REV_1 macro prob:req, probNext:req, step:req + add sym2_R, step * PMULT + PLOAD probNext, sym2_R + + NORM_CALC prob + + cmovae range, t0 + PLOAD t0, sym2_R + step * PMULT + cmovae probNext, t0 + cmovb cod, t1 + mov t0, kBitModelOffset + cmovb t0, kBitModelTotal_reg + lea t1_R, [sym2_R + step * PMULT] + cmovae sym2_R, t1_R + PUP prob, t1_R - step * PMULT_2 +endm + + +REV_2 macro prob:req, step:req + sub sym2_R, probs + shr sym2, PSHIFT + or sym, sym2 + + NORM_CALC prob + + cmovae range, t0 + lea t0, [sym - step] + cmovb sym, t0 + cmovb cod, t1 + mov t0, kBitModelOffset + cmovb t0, kBitModelTotal_reg + PUP prob, probs + sym2_R * PMULT +endm + + +REV_1_VAR macro prob:req + PLOAD prob, sym_R + mov probs, sym_R + add sym_R, sym2_R + + NORM_CALC prob + + cmovae range, t0 + lea t0_R, [sym_R + 1 * sym2_R] + cmovae sym_R, t0_R + mov t0, kBitModelOffset + cmovb cod, t1 + ; mov t1, kBitModelTotal + ; cmovb t0, t1 + cmovb t0, kBitModelTotal_reg + add sym2, sym2 + PUP prob, probs +endm + + + + +LIT_PROBS macro lpMaskParam:req + ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? 
dicBufSize : dicPos) - 1]) & lpMask) << lc); + mov t0, processedPos + shl t0, 8 + add sym, t0 + and sym, lpMaskParam + add probs_state_R, pbPos_R + mov x1, LOC lc2 + lea sym, dword ptr[sym_R + 2 * sym_R] + add probs, Literal * PMULT + shl sym, x1_L + add probs, sym_R + UPDATE_0 probs_state_R, 0, IsMatch + inc processedPos +endm + + + +kNumPosBitsMax equ 4 +kNumPosStatesMax equ (1 SHL kNumPosBitsMax) + +kLenNumLowBits equ 3 +kLenNumLowSymbols equ (1 SHL kLenNumLowBits) +kLenNumHighBits equ 8 +kLenNumHighSymbols equ (1 SHL kLenNumHighBits) +kNumLenProbs equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols) + +LenLow equ 0 +LenChoice equ LenLow +LenChoice2 equ (LenLow + kLenNumLowSymbols) +LenHigh equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax) + +kNumStates equ 12 +kNumStates2 equ 16 +kNumLitStates equ 7 + +kStartPosModelIndex equ 4 +kEndPosModelIndex equ 14 +kNumFullDistances equ (1 SHL (kEndPosModelIndex SHR 1)) + +kNumPosSlotBits equ 6 +kNumLenToPosStates equ 4 + +kNumAlignBits equ 4 +kAlignTableSize equ (1 SHL kNumAlignBits) + +kMatchMinLen equ 2 +kMatchSpecLenStart equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols) + +kStartOffset equ 1664 +SpecPos equ (-kStartOffset) +IsRep0Long equ (SpecPos + kNumFullDistances) +RepLenCoder equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax)) +LenCoder equ (RepLenCoder + kNumLenProbs) +IsMatch equ (LenCoder + kNumLenProbs) +kAlign equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax)) +IsRep equ (kAlign + kAlignTableSize) +IsRepG0 equ (IsRep + kNumStates) +IsRepG1 equ (IsRepG0 + kNumStates) +IsRepG2 equ (IsRepG1 + kNumStates) +PosSlot equ (IsRepG2 + kNumStates) +Literal equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits)) +NUM_BASE_PROBS equ (Literal + kStartOffset) + +if kAlign ne 0 + .err +endif + +if NUM_BASE_PROBS ne 1984 + .err +endif + + +PTR_FIELD equ dq ? + +CLzmaDec_Asm struct + lc db ? + lp db ? + pb db ? + _pad_ db ? + dicSize dd ? + + probs_Spec PTR_FIELD + probs_1664 PTR_FIELD + dic_Spec PTR_FIELD + dicBufSize PTR_FIELD + dicPos_Spec PTR_FIELD + buf_Spec PTR_FIELD + + range_Spec dd ? + code_Spec dd ? + processedPos_Spec dd ? + checkDicSize dd ? + rep0 dd ? + rep1 dd ? + rep2 dd ? + rep3 dd ? + state_Spec dd ? + remainLen dd ? +CLzmaDec_Asm ends + + +CLzmaDec_Asm_Loc struct + OLD_RSP PTR_FIELD + lzmaPtr PTR_FIELD + _pad0_ PTR_FIELD + _pad1_ PTR_FIELD + _pad2_ PTR_FIELD + dicBufSize PTR_FIELD + probs_Spec PTR_FIELD + dic_Spec PTR_FIELD + + limit PTR_FIELD + bufLimit PTR_FIELD + lc2 dd ? + lpMask dd ? + pbMask dd ? + checkDicSize dd ? + + _pad_ dd ? + remainLen dd ? + dicPos_Spec PTR_FIELD + rep0 dd ? + rep1 dd ? + rep2 dd ? + rep3 dd ? +CLzmaDec_Asm_Loc ends + + +GLOB_2 equ [sym_R].CLzmaDec_Asm. +GLOB equ [r1].CLzmaDec_Asm. +LOC_0 equ [r0].CLzmaDec_Asm_Loc. +LOC equ [RSP].CLzmaDec_Asm_Loc. 
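The CLzmaDec_Asm and CLzmaDec_Asm_Loc structures above hard-code the field layout that this assembly expects from the C-side CLzmaDec structure, so any change to that structure silently breaks LzmaDec_DecodeReal_3. A minimal compile-time guard that could sit next to the C sources is sketched below; it is an illustration only, and the C field names (prop, probs_1664, dic, dicPos, range, remainLen) and the "LzmaDec.h" include are assumptions based on the asm mirror above, not something defined in this patch.

/* Sketch: compile-time check that the C structure still matches the
 * CLzmaDec_Asm layout assumed here (x64, 8-byte PTR_FIELD members).
 * The field names are assumed to mirror CLzmaDec_Asm; adjust as needed. */
#include <stddef.h>      /* offsetof */
#include "LzmaDec.h"     /* assumed header that defines CLzmaDec */

#define ASM_LAYOUT_CHECK(field, off) \
    typedef char asm_layout_##field[(offsetof(CLzmaDec, field) == (off)) ? 1 : -1]

ASM_LAYOUT_CHECK(prop,        0);   /* lc, lp, pb, _pad_, dicSize      */
ASM_LAYOUT_CHECK(probs_1664, 16);   /* probs_1664                      */
ASM_LAYOUT_CHECK(dic,        24);   /* dic_Spec                        */
ASM_LAYOUT_CHECK(dicPos,     40);   /* dicPos_Spec                     */
ASM_LAYOUT_CHECK(range,      56);   /* start of the 32-bit field block */
ASM_LAYOUT_CHECK(remainLen,  92);   /* last field accessed by the asm  */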
+ + +COPY_VAR macro name + mov t0, GLOB_2 name + mov LOC_0 name, t0 +endm + + +RESTORE_VAR macro name + mov t0, LOC name + mov GLOB name, t0 +endm + + + +IsMatchBranch_Pre macro reg + ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState; + mov pbPos, LOC pbMask + and pbPos, processedPos + shl pbPos, (kLenNumLowBits + 1 + PSHIFT) + lea probs_state_R, [probs + 1 * state_R] +endm + + +IsMatchBranch macro reg + IsMatchBranch_Pre + IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label +endm + + +CheckLimits macro reg + cmp buf, LOC bufLimit + jae fin_OK + cmp dicPos, LOC limit + jae fin_OK +endm + + + +; RSP is (16x + 8) bytes aligned in WIN64-x64 +; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8) + +PARAM_lzma equ REG_ABI_PARAM_0 +PARAM_limit equ REG_ABI_PARAM_1 +PARAM_bufLimit equ REG_ABI_PARAM_2 + +; MY_ALIGN_64 +MY_PROC LzmaDec_DecodeReal_3, 3 +MY_PUSH_PRESERVED_ABI_REGS + + lea r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)] + and r0, -128 + mov r5, RSP + mov RSP, r0 + mov LOC_0 Old_RSP, r5 + mov LOC_0 lzmaPtr, PARAM_lzma + + mov LOC_0 remainLen, 0 ; remainLen must be ZERO + + mov LOC_0 bufLimit, PARAM_bufLimit + mov sym_R, PARAM_lzma ; CLzmaDec_Asm_Loc pointer for GLOB_2 + mov dic, GLOB_2 dic_Spec + add PARAM_limit, dic + mov LOC_0 limit, PARAM_limit + + COPY_VAR(rep0) + COPY_VAR(rep1) + COPY_VAR(rep2) + COPY_VAR(rep3) + + mov dicPos, GLOB_2 dicPos_Spec + add dicPos, dic + mov LOC_0 dicPos_Spec, dicPos + mov LOC_0 dic_Spec, dic + + mov x1_L, GLOB_2 pb + mov t0, 1 + shl t0, x1_L + dec t0 + mov LOC_0 pbMask, t0 + + ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1; + ; unsigned lc = p->prop.lc; + ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc); + + mov x1_L, GLOB_2 lc + mov x2, 100h + mov t0, x2 + shr x2, x1_L + ; inc x1 + add x1_L, PSHIFT + mov LOC_0 lc2, x1 + mov x1_L, GLOB_2 lp + shl t0, x1_L + sub t0, x2 + mov LOC_0 lpMask, t0 + mov lpMask_reg, t0 + + ; mov probs, GLOB_2 probs_Spec + ; add probs, kStartOffset SHL PSHIFT + mov probs, GLOB_2 probs_1664 + mov LOC_0 probs_Spec, probs + + mov t0_R, GLOB_2 dicBufSize + mov LOC_0 dicBufSize, t0_R + + mov x1, GLOB_2 checkDicSize + mov LOC_0 checkDicSize, x1 + + mov processedPos, GLOB_2 processedPos_Spec + + mov state, GLOB_2 state_Spec + shl state, PSHIFT + + mov buf, GLOB_2 buf_Spec + mov range, GLOB_2 range_Spec + mov cod, GLOB_2 code_Spec + mov kBitModelTotal_reg, kBitModelTotal + xor sym, sym + + ; if (processedPos != 0 || checkDicSize != 0) + or x1, processedPos + jz @f + + add t0_R, dic + cmp dicPos, dic + cmovnz t0_R, dicPos + movzx sym, byte ptr[t0_R - 1] + +@@: + IsMatchBranch_Pre + cmp state, 4 * PMULT + jb lit_end + cmp state, kNumLitStates * PMULT + jb lit_matched_end + jmp lz_end + + + + +; ---------- LITERAL ---------- +MY_ALIGN_64 +lit_start: + xor state, state +lit_start_2: + LIT_PROBS lpMask_reg + + ifdef _LZMA_SIZE_OPT + + PLOAD x1, probs + 1 * PMULT + mov sym, 1 +MY_ALIGN_16 +lit_loop: + BIT_1 x1, x2 + mov x1, x2 + cmp sym, 127 + jbe lit_loop + + else + + BIT_0 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + + endif + + BIT_2 x2, 256 - 1 + + ; mov dic, LOC dic_Spec + mov probs, LOC probs_Spec + IsMatchBranch_Pre + mov byte ptr[dicPos], sym_L + inc dicPos + + CheckLimits +lit_end: + IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start + + ; jmp IsMatch_label + +; ---------- MATCHES ---------- +; MY_ALIGN_32 +IsMatch_label: + UPDATE_1 probs_state_R, pbPos_R, IsMatch + IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label + + add 
probs, LenCoder * PMULT + add state, kNumStates * PMULT + +; ---------- LEN DECODE ---------- +len_decode: + mov len_temp, 8 - 1 - kMatchMinLen + IF_BIT_0_NOUP probs, 0, 0, len_mid_0 + UPDATE_1 probs, 0, 0 + add probs, (1 SHL (kLenNumLowBits + PSHIFT)) + mov len_temp, -1 - kMatchMinLen + IF_BIT_0_NOUP probs, 0, 0, len_mid_0 + UPDATE_1 probs, 0, 0 + add probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT)) + mov sym, 1 + PLOAD x1, probs + 1 * PMULT + +MY_ALIGN_32 +len8_loop: + BIT_1 x1, x2 + mov x1, x2 + cmp sym, 64 + jb len8_loop + + mov len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen + jmp short len_mid_2 ; we use short here for MASM that doesn't optimize that code as another assembler programs + +MY_ALIGN_32 +len_mid_0: + UPDATE_0 probs, 0, 0 + add probs, pbPos_R + BIT_0 x2, x1 +len_mid_2: + BIT_1 x1, x2 + BIT_2 x2, len_temp + mov probs, LOC probs_Spec + cmp state, kNumStates * PMULT + jb copy_match + + +; ---------- DECODE DISTANCE ---------- + ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits); + + mov t0, 3 + kMatchMinLen + cmp sym, 3 + kMatchMinLen + cmovb t0, sym + add probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT)) + shl t0, (kNumPosSlotBits + PSHIFT) + add probs, t0_R + + ; sym = Len + ; mov LOC remainLen, sym + mov len_temp, sym + + ifdef _LZMA_SIZE_OPT + + PLOAD x1, probs + 1 * PMULT + mov sym, 1 +MY_ALIGN_16 +slot_loop: + BIT_1 x1, x2 + mov x1, x2 + cmp sym, 32 + jb slot_loop + + else + + BIT_0 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + BIT_1 x2, x1 + BIT_1 x1, x2 + + endif + + mov x1, sym + BIT_2 x2, 64-1 + + and sym, 3 + mov probs, LOC probs_Spec + cmp x1, 32 + kEndPosModelIndex / 2 + jb short_dist + + ; unsigned numDirectBits = (unsigned)(((distance >> 1) - 1)); + sub x1, (32 + 1 + kNumAlignBits) + ; distance = (2 | (distance & 1)); + or sym, 2 + PLOAD x2, probs + 1 * PMULT + shl sym, kNumAlignBits + 1 + lea sym2_R, [probs + 2 * PMULT] + + jmp direct_norm + ; lea t1, [sym_R + (1 SHL kNumAlignBits)] + ; cmp range, kTopValue + ; jb direct_norm + +; ---------- DIRECT DISTANCE ---------- +MY_ALIGN_32 +direct_loop: + shr range, 1 + mov t0, cod + sub cod, range + cmovs cod, t0 + cmovns sym, t1 + + comment ~ + sub cod, range + mov x2, cod + sar x2, 31 + lea sym, dword ptr [r2 + sym_R * 2 + 1] + and x2, range + add cod, x2 + ~ + dec x1 + je direct_end + + add sym, sym +direct_norm: + lea t1, [sym_R + (1 SHL kNumAlignBits)] + cmp range, kTopValue + jae near ptr direct_loop + ; we align for 32 here with "near ptr" command above + NORM_2 + jmp direct_loop + +MY_ALIGN_32 +direct_end: + ; prob = + kAlign; + ; distance <<= kNumAlignBits; + REV_0 x2, x1 + REV_1 x1, x2, 2 + REV_1 x2, x1, 4 + REV_2 x1, 8 + +decode_dist_end: + + ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize)) + + mov t1, LOC rep0 + mov x1, LOC rep1 + mov x2, LOC rep2 + + mov t0, LOC checkDicSize + test t0, t0 + cmove t0, processedPos + cmp sym, t0 + jae end_of_payload + ; jmp end_of_payload ; for debug + + ; rep3 = rep2; + ; rep2 = rep1; + ; rep1 = rep0; + ; rep0 = distance + 1; + + inc sym + mov LOC rep0, sym + ; mov sym, LOC remainLen + mov sym, len_temp + mov LOC rep1, t1 + mov LOC rep2, x1 + mov LOC rep3, x2 + + ; state = (state < kNumStates + kNumLitStates) ? 
kNumLitStates : kNumLitStates + 3; + cmp state, (kNumStates + kNumLitStates) * PMULT + mov state, kNumLitStates * PMULT + mov t0, (kNumLitStates + 3) * PMULT + cmovae state, t0 + + +; ---------- COPY MATCH ---------- +copy_match: + + ; len += kMatchMinLen; + ; add sym, kMatchMinLen + + ; if ((rem = limit - dicPos) == 0) + ; { + ; p->dicPos = dicPos; + ; return SZ_ERROR_DATA; + ; } + mov cnt_R, LOC limit + sub cnt_R, dicPos + jz fin_dicPos_LIMIT + + ; curLen = ((rem < len) ? (unsigned)rem : len); + cmp cnt_R, sym_R + ; cmovae cnt_R, sym_R ; 64-bit + cmovae cnt, sym ; 32-bit + + mov dic, LOC dic_Spec + mov x1, LOC rep0 + + mov t0_R, dicPos + add dicPos, cnt_R + ; processedPos += curLen; + add processedPos, cnt + ; len -= curLen; + sub sym, cnt + mov LOC remainLen, sym + + sub t0_R, dic + + ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0); + sub t0_R, r1 + jae @f + + mov r1, LOC dicBufSize + add t0_R, r1 + sub r1, t0_R + cmp cnt_R, r1 + ja copy_match_cross +@@: + ; if (curLen <= dicBufSize - pos) + +; ---------- COPY MATCH FAST ---------- + ; Byte *dest = dic + dicPos; + ; mov r1, dic + ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos; + ; sub t0_R, dicPos + ; dicPos += curLen; + + ; const Byte *lim = dest + curLen; + add t0_R, dic + movzx sym, byte ptr[t0_R] + add t0_R, cnt_R + neg cnt_R + ; lea r1, [dicPos - 1] +copy_common: + dec dicPos + ; cmp LOC rep0, 1 + ; je rep0Label + + ; t0_R - src_lim + ; r1 - dest_lim - 1 + ; cnt_R - (-cnt) + + IsMatchBranch_Pre + inc cnt_R + jz copy_end +MY_ALIGN_16 +@@: + mov byte ptr[cnt_R * 1 + dicPos], sym_L + movzx sym, byte ptr[cnt_R * 1 + t0_R] + inc cnt_R + jnz @b + +copy_end: +lz_end_match: + mov byte ptr[dicPos], sym_L + inc dicPos + + ; IsMatchBranch_Pre + CheckLimits +lz_end: + IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label + + + +; ---------- LITERAL MATCHED ---------- + + LIT_PROBS LOC lpMask + + ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; + mov x1, LOC rep0 + ; mov dic, LOC dic_Spec + mov LOC dicPos_Spec, dicPos + + ; state -= (state < 10) ? 3 : 6; + lea t0, [state_R - 6 * PMULT] + sub state, 3 * PMULT + cmp state, 7 * PMULT + cmovae state, t0 + + sub dicPos, dic + sub dicPos, r1 + jae @f + add dicPos, LOC dicBufSize +@@: + comment ~ + xor t0, t0 + sub dicPos, r1 + cmovb t0_R, LOC dicBufSize + ~ + + movzx match, byte ptr[dic + dicPos * 1] + + ifdef _LZMA_SIZE_OPT + + mov offs, 256 * PMULT + shl match, (PSHIFT + 1) + mov bit, match + mov sym, 1 +MY_ALIGN_16 +litm_loop: + LITM + cmp sym, 256 + jb litm_loop + sub sym, 256 + + else + + LITM_0 + LITM + LITM + LITM + LITM + LITM + LITM + LITM_2 + + endif + + mov probs, LOC probs_Spec + IsMatchBranch_Pre + ; mov dic, LOC dic_Spec + mov dicPos, LOC dicPos_Spec + mov byte ptr[dicPos], sym_L + inc dicPos + + CheckLimits +lit_matched_end: + IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label + ; IsMatchBranch + mov lpMask_reg, LOC lpMask + sub state, 3 * PMULT + jmp lit_start_2 + + + +; ---------- REP 0 LITERAL ---------- +MY_ALIGN_32 +IsRep0Short_label: + UPDATE_0 probs_state_R, pbPos_R, IsRep0Long + + ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)]; + mov dic, LOC dic_Spec + mov t0_R, dicPos + mov probBranch, LOC rep0 + sub t0_R, dic + + sub probs, RepLenCoder * PMULT + + ; state = state < kNumLitStates ? 
9 : 11; + or state, 1 * PMULT + + ; the caller doesn't allow (dicPos >= limit) case for REP_SHORT + ; so we don't need the following (dicPos == limit) check here: + ; cmp dicPos, LOC limit + ; jae fin_dicPos_LIMIT_REP_SHORT + + inc processedPos + + IsMatchBranch_Pre + +; xor sym, sym +; sub t0_R, probBranch_R +; cmovb sym_R, LOC dicBufSize +; add t0_R, sym_R + sub t0_R, probBranch_R + jae @f + add t0_R, LOC dicBufSize +@@: + movzx sym, byte ptr[dic + t0_R * 1] + jmp lz_end_match + + +MY_ALIGN_32 +IsRep_label: + UPDATE_1 probs_state_R, 0, IsRep + + ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode. + ; So we don't check it here. + + ; mov t0, processedPos + ; or t0, LOC checkDicSize + ; jz fin_ERROR_2 + + ; state = state < kNumLitStates ? 8 : 11; + cmp state, kNumLitStates * PMULT + mov state, 8 * PMULT + mov probBranch, 11 * PMULT + cmovae state, probBranch + + ; prob = probs + RepLenCoder; + add probs, RepLenCoder * PMULT + + IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label + IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label + UPDATE_1 probs_state_R, pbPos_R, IsRep0Long + jmp len_decode + +MY_ALIGN_32 +IsRepG0_label: + UPDATE_1 probs_state_R, 0, IsRepG0 + mov dist2, LOC rep0 + mov dist, LOC rep1 + mov LOC rep1, dist2 + + IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label + mov LOC rep0, dist + jmp len_decode + +; MY_ALIGN_32 +IsRepG1_label: + UPDATE_1 probs_state_R, 0, IsRepG1 + mov dist2, LOC rep2 + mov LOC rep2, dist + + IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label + mov LOC rep0, dist2 + jmp len_decode + +; MY_ALIGN_32 +IsRepG2_label: + UPDATE_1 probs_state_R, 0, IsRepG2 + mov dist, LOC rep3 + mov LOC rep3, dist2 + mov LOC rep0, dist + jmp len_decode + + + +; ---------- SPEC SHORT DISTANCE ---------- + +MY_ALIGN_32 +short_dist: + sub x1, 32 + 1 + jbe decode_dist_end + or sym, 2 + shl sym, x1_L + lea sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT] + mov sym2, PMULT ; step +MY_ALIGN_32 +spec_loop: + REV_1_VAR x2 + dec x1 + jnz spec_loop + + mov probs, LOC probs_Spec + sub sym, sym2 + sub sym, SpecPos * PMULT + sub sym_R, probs + shr sym, PSHIFT + + jmp decode_dist_end + + +; ---------- COPY MATCH CROSS ---------- +copy_match_cross: + ; t0_R - src pos + ; r1 - len to dicBufSize + ; cnt_R - total copy len + + mov t1_R, t0_R ; srcPos + mov t0_R, dic + mov r1, LOC dicBufSize ; + neg cnt_R +@@: + movzx sym, byte ptr[t1_R * 1 + t0_R] + inc t1_R + mov byte ptr[cnt_R * 1 + dicPos], sym_L + inc cnt_R + cmp t1_R, r1 + jne @b + + movzx sym, byte ptr[t0_R] + sub t0_R, cnt_R + jmp copy_common + + + + +; fin_dicPos_LIMIT_REP_SHORT: + ; mov sym, 1 + +fin_dicPos_LIMIT: + mov LOC remainLen, sym + jmp fin_OK + ; For more strict mode we can stop decoding with error + ; mov sym, 1 + ; jmp fin + + +fin_ERROR_MATCH_DIST: + + ; rep3 = rep2; + ; rep2 = rep1; + ; rep1 = rep0; + ; rep0 = distance + 1; + + add len_temp, kMatchSpecLen_Error_Data + mov LOC remainLen, len_temp + + mov LOC rep0, sym + mov LOC rep1, t1 + mov LOC rep2, x1 + mov LOC rep3, x2 + + ; state = (state < kNumStates + kNumLitStates) ? 
kNumLitStates : kNumLitStates + 3; + cmp state, (kNumStates + kNumLitStates) * PMULT + mov state, kNumLitStates * PMULT + mov t0, (kNumLitStates + 3) * PMULT + cmovae state, t0 + + ; jmp fin_OK + mov sym, 1 + jmp fin + +end_of_payload: + inc sym + jnz fin_ERROR_MATCH_DIST + + mov LOC remainLen, kMatchSpecLenStart + sub state, kNumStates * PMULT + +fin_OK: + xor sym, sym + +fin: + NORM + + mov r1, LOC lzmaPtr + + sub dicPos, LOC dic_Spec + mov GLOB dicPos_Spec, dicPos + mov GLOB buf_Spec, buf + mov GLOB range_Spec, range + mov GLOB code_Spec, cod + shr state, PSHIFT + mov GLOB state_Spec, state + mov GLOB processedPos_Spec, processedPos + + RESTORE_VAR(remainLen) + RESTORE_VAR(rep0) + RESTORE_VAR(rep1) + RESTORE_VAR(rep2) + RESTORE_VAR(rep3) + + mov x0, sym + + mov RSP, LOC Old_RSP + +MY_POP_PRESERVED_ABI_REGS +MY_ENDP + +_TEXT$LZMADECOPT ENDS + +end diff --git a/Asm/x86/Sha1Opt.asm b/Asm/x86/Sha1Opt.asm new file mode 100644 index 0000000..3495fd1 --- /dev/null +++ b/Asm/x86/Sha1Opt.asm @@ -0,0 +1,263 @@ +; Sha1Opt.asm -- SHA-1 optimized code for SHA-1 x86 hardware instructions +; 2021-03-10 : Igor Pavlov : Public domain + +include 7zAsm.asm + +MY_ASM_START + + + + + + + + + + + + + + + + +CONST SEGMENT + +align 16 +Reverse_Endian_Mask db 15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0 + + + + + + + + + + + + + + + + + + + + + + +CONST ENDS + +; _TEXT$SHA1OPT SEGMENT 'CODE' + +ifndef x64 + .686 + .xmm +endif + +ifdef x64 + rNum equ REG_ABI_PARAM_2 + if (IS_LINUX eq 0) + LOCAL_SIZE equ (16 * 2) + endif +else + rNum equ r0 + LOCAL_SIZE equ (16 * 1) +endif + +rState equ REG_ABI_PARAM_0 +rData equ REG_ABI_PARAM_1 + + +MY_sha1rnds4 macro a1, a2, imm + db 0fH, 03aH, 0ccH, (0c0H + a1 * 8 + a2), imm +endm + +MY_SHA_INSTR macro cmd, a1, a2 + db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2) +endm + +cmd_sha1nexte equ 0c8H +cmd_sha1msg1 equ 0c9H +cmd_sha1msg2 equ 0caH + +MY_sha1nexte macro a1, a2 + MY_SHA_INSTR cmd_sha1nexte, a1, a2 +endm + +MY_sha1msg1 macro a1, a2 + MY_SHA_INSTR cmd_sha1msg1, a1, a2 +endm + +MY_sha1msg2 macro a1, a2 + MY_SHA_INSTR cmd_sha1msg2, a1, a2 +endm + +MY_PROLOG macro + ifdef x64 + if (IS_LINUX eq 0) + movdqa [r4 + 8], xmm6 + movdqa [r4 + 8 + 16], xmm7 + sub r4, LOCAL_SIZE + 8 + movdqa [r4 ], xmm8 + movdqa [r4 + 16], xmm9 + endif + else ; x86 + if (IS_CDECL gt 0) + mov rState, [r4 + REG_SIZE * 1] + mov rData, [r4 + REG_SIZE * 2] + mov rNum, [r4 + REG_SIZE * 3] + else ; fastcall + mov rNum, [r4 + REG_SIZE * 1] + endif + push r5 + mov r5, r4 + and r4, -16 + sub r4, LOCAL_SIZE + endif +endm + +MY_EPILOG macro + ifdef x64 + if (IS_LINUX eq 0) + movdqa xmm8, [r4] + movdqa xmm9, [r4 + 16] + add r4, LOCAL_SIZE + 8 + movdqa xmm6, [r4 + 8] + movdqa xmm7, [r4 + 8 + 16] + endif + else ; x86 + mov r4, r5 + pop r5 + endif + MY_ENDP +endm + + +e0_N equ 0 +e1_N equ 1 +abcd_N equ 2 +e0_save_N equ 3 +w_regs equ 4 + +e0 equ @CatStr(xmm, %e0_N) +e1 equ @CatStr(xmm, %e1_N) +abcd equ @CatStr(xmm, %abcd_N) +e0_save equ @CatStr(xmm, %e0_save_N) + + +ifdef x64 + abcd_save equ xmm8 + mask2 equ xmm9 +else + abcd_save equ [r4] + mask2 equ e1 +endif + +LOAD_MASK macro + movdqa mask2, XMMWORD PTR Reverse_Endian_Mask +endm + +LOAD_W macro k:req + movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))] + pshufb @CatStr(xmm, %(w_regs + k)), mask2 +endm + + +; pre2 can be 2 or 3 (recommended) +pre2 equ 3 +pre1 equ (pre2 + 1) + +NUM_ROUNDS4 equ 20 + +RND4 macro k + movdqa @CatStr(xmm, %(e0_N + ((k + 1) mod 2))), abcd + MY_sha1rnds4 abcd_N, (e0_N + (k mod 2)), k / 5 + + nextM = (w_regs + ((k + 1) mod 4)) + + if (k EQ 
NUM_ROUNDS4 - 1) + nextM = e0_save_N + endif + + MY_sha1nexte (e0_N + ((k + 1) mod 2)), nextM + + if (k GE (4 - pre2)) AND (k LT (NUM_ROUNDS4 - pre2)) + pxor @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4))) + endif + + if (k GE (4 - pre1)) AND (k LT (NUM_ROUNDS4 - pre1)) + MY_sha1msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4)) + endif + + if (k GE (4 - pre2)) AND (k LT (NUM_ROUNDS4 - pre2)) + MY_sha1msg2 (w_regs + ((k + pre2) mod 4)), (w_regs + ((k + pre2 - 1) mod 4)) + endif +endm + + +REVERSE_STATE macro + ; abcd ; dcba + ; e0 ; 000e + pshufd abcd, abcd, 01bH ; abcd + pshufd e0, e0, 01bH ; e000 +endm + + + + + +MY_PROC Sha1_UpdateBlocks_HW, 3 + MY_PROLOG + + cmp rNum, 0 + je end_c + + movdqu abcd, [rState] ; dcba + movd e0, dword ptr [rState + 16] ; 000e + + REVERSE_STATE + + ifdef x64 + LOAD_MASK + endif + + align 16 + nextBlock: + movdqa abcd_save, abcd + movdqa e0_save, e0 + + ifndef x64 + LOAD_MASK + endif + + LOAD_W 0 + LOAD_W 1 + LOAD_W 2 + LOAD_W 3 + + paddd e0, @CatStr(xmm, %(w_regs)) + k = 0 + rept NUM_ROUNDS4 + RND4 k + k = k + 1 + endm + + paddd abcd, abcd_save + + + add rData, 64 + sub rNum, 1 + jnz nextBlock + + REVERSE_STATE + + movdqu [rState], abcd + movd dword ptr [rState + 16], e0 + + end_c: +MY_EPILOG + +; _TEXT$SHA1OPT ENDS + +end diff --git a/Asm/x86/Sha256Opt.asm b/Asm/x86/Sha256Opt.asm new file mode 100644 index 0000000..5d02c90 --- /dev/null +++ b/Asm/x86/Sha256Opt.asm @@ -0,0 +1,263 @@ +; Sha256Opt.asm -- SHA-256 optimized code for SHA-256 x86 hardware instructions +; 2021-03-10 : Igor Pavlov : Public domain + +include 7zAsm.asm + +MY_ASM_START + +; .data +; public K + +; we can use external SHA256_K_ARRAY defined in Sha256.c +; but we must guarantee that SHA256_K_ARRAY is aligned for 16-bytes + +COMMENT @ +ifdef x64 +K_CONST equ SHA256_K_ARRAY +else +K_CONST equ _SHA256_K_ARRAY +endif +EXTRN K_CONST:xmmword +@ + +CONST SEGMENT + +align 16 +Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 + +; COMMENT @ +align 16 +K_CONST \ +DD 0428a2f98H, 071374491H, 0b5c0fbcfH, 0e9b5dba5H +DD 03956c25bH, 059f111f1H, 0923f82a4H, 0ab1c5ed5H +DD 0d807aa98H, 012835b01H, 0243185beH, 0550c7dc3H +DD 072be5d74H, 080deb1feH, 09bdc06a7H, 0c19bf174H +DD 0e49b69c1H, 0efbe4786H, 00fc19dc6H, 0240ca1ccH +DD 02de92c6fH, 04a7484aaH, 05cb0a9dcH, 076f988daH +DD 0983e5152H, 0a831c66dH, 0b00327c8H, 0bf597fc7H +DD 0c6e00bf3H, 0d5a79147H, 006ca6351H, 014292967H +DD 027b70a85H, 02e1b2138H, 04d2c6dfcH, 053380d13H +DD 0650a7354H, 0766a0abbH, 081c2c92eH, 092722c85H +DD 0a2bfe8a1H, 0a81a664bH, 0c24b8b70H, 0c76c51a3H +DD 0d192e819H, 0d6990624H, 0f40e3585H, 0106aa070H +DD 019a4c116H, 01e376c08H, 02748774cH, 034b0bcb5H +DD 0391c0cb3H, 04ed8aa4aH, 05b9cca4fH, 0682e6ff3H +DD 0748f82eeH, 078a5636fH, 084c87814H, 08cc70208H +DD 090befffaH, 0a4506cebH, 0bef9a3f7H, 0c67178f2H +; @ + +CONST ENDS + +; _TEXT$SHA256OPT SEGMENT 'CODE' + +ifndef x64 + .686 + .xmm +endif + +ifdef x64 + rNum equ REG_ABI_PARAM_2 + if (IS_LINUX eq 0) + LOCAL_SIZE equ (16 * 2) + endif +else + rNum equ r0 + LOCAL_SIZE equ (16 * 1) +endif + +rState equ REG_ABI_PARAM_0 +rData equ REG_ABI_PARAM_1 + + + + + + +MY_SHA_INSTR macro cmd, a1, a2 + db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2) +endm + +cmd_sha256rnds2 equ 0cbH +cmd_sha256msg1 equ 0ccH +cmd_sha256msg2 equ 0cdH + +MY_sha256rnds2 macro a1, a2 + MY_SHA_INSTR cmd_sha256rnds2, a1, a2 +endm + +MY_sha256msg1 macro a1, a2 + MY_SHA_INSTR cmd_sha256msg1, a1, a2 +endm + +MY_sha256msg2 macro a1, a2 + MY_SHA_INSTR 
cmd_sha256msg2, a1, a2 +endm + +MY_PROLOG macro + ifdef x64 + if (IS_LINUX eq 0) + movdqa [r4 + 8], xmm6 + movdqa [r4 + 8 + 16], xmm7 + sub r4, LOCAL_SIZE + 8 + movdqa [r4 ], xmm8 + movdqa [r4 + 16], xmm9 + endif + else ; x86 + if (IS_CDECL gt 0) + mov rState, [r4 + REG_SIZE * 1] + mov rData, [r4 + REG_SIZE * 2] + mov rNum, [r4 + REG_SIZE * 3] + else ; fastcall + mov rNum, [r4 + REG_SIZE * 1] + endif + push r5 + mov r5, r4 + and r4, -16 + sub r4, LOCAL_SIZE + endif +endm + +MY_EPILOG macro + ifdef x64 + if (IS_LINUX eq 0) + movdqa xmm8, [r4] + movdqa xmm9, [r4 + 16] + add r4, LOCAL_SIZE + 8 + movdqa xmm6, [r4 + 8] + movdqa xmm7, [r4 + 8 + 16] + endif + else ; x86 + mov r4, r5 + pop r5 + endif + MY_ENDP +endm + + +msg equ xmm0 +tmp equ xmm0 +state0_N equ 2 +state1_N equ 3 +w_regs equ 4 + + +state1_save equ xmm1 +state0 equ @CatStr(xmm, %state0_N) +state1 equ @CatStr(xmm, %state1_N) + + +ifdef x64 + state0_save equ xmm8 + mask2 equ xmm9 +else + state0_save equ [r4] + mask2 equ xmm0 +endif + +LOAD_MASK macro + movdqa mask2, XMMWORD PTR Reverse_Endian_Mask +endm + +LOAD_W macro k:req + movdqu @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))] + pshufb @CatStr(xmm, %(w_regs + k)), mask2 +endm + + +; pre1 <= 4 && pre2 >= 1 && pre1 > pre2 && (pre1 - pre2) <= 1 +pre1 equ 3 +pre2 equ 2 + + + +RND4 macro k + movdqa msg, xmmword ptr [K_CONST + (k) * 16] + paddd msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4))) + MY_sha256rnds2 state0_N, state1_N + pshufd msg, msg, 0eH + + if (k GE (4 - pre1)) AND (k LT (16 - pre1)) + ; w4[0] = msg1(w4[-4], w4[-3]) + MY_sha256msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4)) + endif + + MY_sha256rnds2 state1_N, state0_N + + if (k GE (4 - pre2)) AND (k LT (16 - pre2)) + movdqa tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 1) mod 4))) + palignr tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4))), 4 + paddd @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), tmp + ; w4[0] = msg2(w4[0], w4[-1]) + MY_sha256msg2 %(w_regs + ((k + pre2) mod 4)), %(w_regs + ((k + pre2 - 1) mod 4)) + endif +endm + + + + + +REVERSE_STATE macro + ; state0 ; dcba + ; state1 ; hgfe + pshufd tmp, state0, 01bH ; abcd + pshufd state0, state1, 01bH ; efgh + movdqa state1, state0 ; efgh + punpcklqdq state0, tmp ; cdgh + punpckhqdq state1, tmp ; abef +endm + + +MY_PROC Sha256_UpdateBlocks_HW, 3 + MY_PROLOG + + cmp rNum, 0 + je end_c + + movdqu state0, [rState] ; dcba + movdqu state1, [rState + 16] ; hgfe + + REVERSE_STATE + + ifdef x64 + LOAD_MASK + endif + + align 16 + nextBlock: + movdqa state0_save, state0 + movdqa state1_save, state1 + + ifndef x64 + LOAD_MASK + endif + + LOAD_W 0 + LOAD_W 1 + LOAD_W 2 + LOAD_W 3 + + + k = 0 + rept 16 + RND4 k + k = k + 1 + endm + + paddd state0, state0_save + paddd state1, state1_save + + add rData, 64 + sub rNum, 1 + jnz nextBlock + + REVERSE_STATE + + movdqu [rState], state0 + movdqu [rState + 16], state1 + + end_c: +MY_EPILOG + +; _TEXT$SHA256OPT ENDS + +end diff --git a/Asm/x86/XzCrc64Opt.asm b/Asm/x86/XzCrc64Opt.asm new file mode 100644 index 0000000..ad22cc2 --- /dev/null +++ b/Asm/x86/XzCrc64Opt.asm @@ -0,0 +1,239 @@ +; XzCrc64Opt.asm -- CRC64 calculation : optimized version +; 2021-02-06 : Igor Pavlov : Public domain + +include 7zAsm.asm + +MY_ASM_START + +ifdef x64 + +rD equ r9 +rN equ r10 +rT equ r5 +num_VAR equ r8 + +SRCDAT4 equ dword ptr [rD + rN * 1] + +CRC_XOR macro dest:req, src:req, t:req + xor dest, QWORD PTR [rT + src * 8 + 0800h * t] +endm + +CRC1b macro + movzx x6, BYTE PTR [rD] + inc rD + movzx x3, x0_L + xor x6, x3 + shr r0, 8 + CRC_XOR 
r0, r6, 0 + dec rN +endm + +MY_PROLOG macro crc_end:req + ifdef ABI_LINUX + MY_PUSH_2_REGS + else + MY_PUSH_4_REGS + endif + mov r0, REG_ABI_PARAM_0 + mov rN, REG_ABI_PARAM_2 + mov rT, REG_ABI_PARAM_3 + mov rD, REG_ABI_PARAM_1 + test rN, rN + jz crc_end + @@: + test rD, 3 + jz @F + CRC1b + jnz @B + @@: + cmp rN, 8 + jb crc_end + add rN, rD + mov num_VAR, rN + sub rN, 4 + and rN, NOT 3 + sub rD, rN + mov x1, SRCDAT4 + xor r0, r1 + add rN, 4 +endm + +MY_EPILOG macro crc_end:req + sub rN, 4 + mov x1, SRCDAT4 + xor r0, r1 + mov rD, rN + mov rN, num_VAR + sub rN, rD + crc_end: + test rN, rN + jz @F + CRC1b + jmp crc_end + @@: + ifdef ABI_LINUX + MY_POP_2_REGS + else + MY_POP_4_REGS + endif +endm + +MY_PROC XzCrc64UpdateT4, 4 + MY_PROLOG crc_end_4 + align 16 + main_loop_4: + mov x1, SRCDAT4 + movzx x2, x0_L + movzx x3, x0_H + shr r0, 16 + movzx x6, x0_L + movzx x7, x0_H + shr r0, 16 + CRC_XOR r1, r2, 3 + CRC_XOR r0, r3, 2 + CRC_XOR r1, r6, 1 + CRC_XOR r0, r7, 0 + xor r0, r1 + + add rD, 4 + jnz main_loop_4 + + MY_EPILOG crc_end_4 +MY_ENDP + +else +; x86 (32-bit) + +rD equ r1 +rN equ r7 +rT equ r5 + +crc_OFFS equ (REG_SIZE * 5) + +if (IS_CDECL gt 0) or (IS_LINUX gt 0) + ; cdecl or (GNU fastcall) stack: + ; (UInt32 *) table + ; size_t size + ; void * data + ; (UInt64) crc + ; ret-ip <-(r4) + data_OFFS equ (8 + crc_OFFS) + size_OFFS equ (REG_SIZE + data_OFFS) + table_OFFS equ (REG_SIZE + size_OFFS) + num_VAR equ [r4 + size_OFFS] + table_VAR equ [r4 + table_OFFS] +else + ; Windows fastcall: + ; r1 = data, r2 = size + ; stack: + ; (UInt32 *) table + ; (UInt64) crc + ; ret-ip <-(r4) + table_OFFS equ (8 + crc_OFFS) + table_VAR equ [r4 + table_OFFS] + num_VAR equ table_VAR +endif + +SRCDAT4 equ dword ptr [rD + rN * 1] + +CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req + op0 dest0, DWORD PTR [rT + src * 8 + 0800h * t] + op1 dest1, DWORD PTR [rT + src * 8 + 0800h * t + 4] +endm + +CRC_XOR macro dest0:req, dest1:req, src:req, t:req + CRC xor, xor, dest0, dest1, src, t +endm + + +CRC1b macro + movzx x6, BYTE PTR [rD] + inc rD + movzx x3, x0_L + xor x6, x3 + shrd r0, r2, 8 + shr r2, 8 + CRC_XOR r0, r2, r6, 0 + dec rN +endm + +MY_PROLOG macro crc_end:req + MY_PUSH_4_REGS + + if (IS_CDECL gt 0) or (IS_LINUX gt 0) + proc_numParams = proc_numParams + 2 ; for ABI_LINUX + mov rN, [r4 + size_OFFS] + mov rD, [r4 + data_OFFS] + else + mov rN, r2 + endif + + mov x0, [r4 + crc_OFFS] + mov x2, [r4 + crc_OFFS + 4] + mov rT, table_VAR + test rN, rN + jz crc_end + @@: + test rD, 3 + jz @F + CRC1b + jnz @B + @@: + cmp rN, 8 + jb crc_end + add rN, rD + + mov num_VAR, rN + + sub rN, 4 + and rN, NOT 3 + sub rD, rN + xor r0, SRCDAT4 + add rN, 4 +endm + +MY_EPILOG macro crc_end:req + sub rN, 4 + xor r0, SRCDAT4 + + mov rD, rN + mov rN, num_VAR + sub rN, rD + crc_end: + test rN, rN + jz @F + CRC1b + jmp crc_end + @@: + MY_POP_4_REGS +endm + +MY_PROC XzCrc64UpdateT4, 5 + MY_PROLOG crc_end_4 + movzx x6, x0_L + align 16 + main_loop_4: + mov r3, SRCDAT4 + xor r3, r2 + + CRC xor, mov, r3, r2, r6, 3 + movzx x6, x0_H + shr r0, 16 + CRC_XOR r3, r2, r6, 2 + + movzx x6, x0_L + movzx x0, x0_H + CRC_XOR r3, r2, r6, 1 + CRC_XOR r3, r2, r0, 0 + movzx x6, x3_L + mov r0, r3 + + add rD, 4 + jnz main_loop_4 + + MY_EPILOG crc_end_4 +MY_ENDP + +endif ; ! x64 + +end -- cgit v1.2.3-55-g6feb
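For reference, XzCrc64UpdateT4 above is the classic "slicing by 4" CRC64 update: the table pointer addresses four interleaved 256-entry UInt64 tables (0800h bytes apart, as the CRC_XOR macro's [rT + src * 8 + 0800h * t] addressing shows), the buffer is first brought to 4-byte alignment with byte-wise CRC1b steps, and the main loop then folds four input bytes per iteration. A plain C sketch of an equivalent update, written against that table layout, is shown below; the function and parameter names are illustrative and are not the names used by the library.

#include <stddef.h>
#include <stdint.h>

/* Equivalent scalar form of the slicing-by-4 loop (little-endian, as on x86).
 * 'table' points to 4 interleaved tables of 256 uint64_t entries each:
 * table + 0x100 * t corresponds to the asm byte offset 0800h * t. */
static uint64_t crc64_update_t4(uint64_t crc, const uint8_t *p, size_t size,
                                const uint64_t *table)
{
    /* byte-wise steps until p is 4-byte aligned (CRC1b in the asm) */
    for (; size != 0 && ((uintptr_t)p & 3) != 0; size--, p++)
        crc = table[(uint8_t)(crc ^ *p)] ^ (crc >> 8);

    /* main loop: fold 4 input bytes per iteration */
    for (; size >= 4; size -= 4, p += 4)
    {
        uint32_t d = (uint32_t)crc ^ *(const uint32_t *)p;
        crc = (crc >> 32)
            ^ table[0x300 + ((d      ) & 0xFF)]
            ^ table[0x200 + ((d >>  8) & 0xFF)]
            ^ table[0x100 + ((d >> 16) & 0xFF)]
            ^ table[0x000 + ( d >> 24        )];
    }

    /* remaining tail bytes */
    for (; size != 0; size--, p++)
        crc = table[(uint8_t)(crc ^ *p)] ^ (crc >> 8);

    return crc;
}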