From f19f813537c7aea1c20749c914e756b54a9c3cf5 Mon Sep 17 00:00:00 2001
From: Igor Pavlov <87184205+ip7z@users.noreply.github.com>
Date: Mon, 27 Dec 2021 00:00:00 +0000
Subject: '21.07'

---
 Asm/x86/Sha256Opt.asm | 263 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 263 insertions(+)
 create mode 100644 Asm/x86/Sha256Opt.asm

diff --git a/Asm/x86/Sha256Opt.asm b/Asm/x86/Sha256Opt.asm
new file mode 100644
index 0000000..5d02c90
--- /dev/null
+++ b/Asm/x86/Sha256Opt.asm
@@ -0,0 +1,263 @@
; Sha256Opt.asm -- SHA-256 optimized code for x86 SHA-256 hardware instructions
; 2021-03-10 : Igor Pavlov : Public domain

include 7zAsm.asm

MY_ASM_START

; .data
; public K

; we can use the external SHA256_K_ARRAY defined in Sha256.c,
; but we must guarantee that SHA256_K_ARRAY is 16-byte aligned

COMMENT @
ifdef x64
K_CONST equ SHA256_K_ARRAY
else
K_CONST equ _SHA256_K_ARRAY
endif
EXTRN K_CONST:xmmword
@

CONST SEGMENT

align 16
Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12

; COMMENT @
align 16
K_CONST \
DD 0428a2f98H, 071374491H, 0b5c0fbcfH, 0e9b5dba5H
DD 03956c25bH, 059f111f1H, 0923f82a4H, 0ab1c5ed5H
DD 0d807aa98H, 012835b01H, 0243185beH, 0550c7dc3H
DD 072be5d74H, 080deb1feH, 09bdc06a7H, 0c19bf174H
DD 0e49b69c1H, 0efbe4786H, 00fc19dc6H, 0240ca1ccH
DD 02de92c6fH, 04a7484aaH, 05cb0a9dcH, 076f988daH
DD 0983e5152H, 0a831c66dH, 0b00327c8H, 0bf597fc7H
DD 0c6e00bf3H, 0d5a79147H, 006ca6351H, 014292967H
DD 027b70a85H, 02e1b2138H, 04d2c6dfcH, 053380d13H
DD 0650a7354H, 0766a0abbH, 081c2c92eH, 092722c85H
DD 0a2bfe8a1H, 0a81a664bH, 0c24b8b70H, 0c76c51a3H
DD 0d192e819H, 0d6990624H, 0f40e3585H, 0106aa070H
DD 019a4c116H, 01e376c08H, 02748774cH, 034b0bcb5H
DD 0391c0cb3H, 04ed8aa4aH, 05b9cca4fH, 0682e6ff3H
DD 0748f82eeH, 078a5636fH, 084c87814H, 08cc70208H
DD 090befffaH, 0a4506cebH, 0bef9a3f7H, 0c67178f2H
; @

CONST ENDS

; _TEXT$SHA256OPT SEGMENT 'CODE'

ifndef x64
    .686
    .xmm
endif

ifdef x64
        rNum    equ REG_ABI_PARAM_2
    if (IS_LINUX eq 0)
        LOCAL_SIZE equ (16 * 2)
    endif
else
        rNum    equ r0
        LOCAL_SIZE equ (16 * 1)
endif

rState equ REG_ABI_PARAM_0
rData  equ REG_ABI_PARAM_1


; the SHA-256 instructions are emitted as raw opcode bytes (0F 38 cmd plus a
; register-to-register ModRM byte: a1 = destination, a2 = source), so the
; file assembles even with assemblers that lack the SHA mnemonics

MY_SHA_INSTR macro cmd, a1, a2
        db 0fH, 038H, cmd, (0c0H + a1 * 8 + a2)
endm

cmd_sha256rnds2 equ 0cbH
cmd_sha256msg1  equ 0ccH
cmd_sha256msg2  equ 0cdH

MY_sha256rnds2 macro a1, a2
        MY_SHA_INSTR cmd_sha256rnds2, a1, a2
endm

MY_sha256msg1 macro a1, a2
        MY_SHA_INSTR cmd_sha256msg1, a1, a2
endm

MY_sha256msg2 macro a1, a2
        MY_SHA_INSTR cmd_sha256msg2, a1, a2
endm

MY_PROLOG macro
    ifdef x64
      if (IS_LINUX eq 0)
        ; Win64 ABI: xmm6-xmm15 are callee-saved
        movdqa  [r4 + 8], xmm6
        movdqa  [r4 + 8 + 16], xmm7
        sub     r4, LOCAL_SIZE + 8
        movdqa  [r4     ], xmm8
        movdqa  [r4 + 16], xmm9
      endif
    else ; x86
      if (IS_CDECL gt 0)
        mov     rState, [r4 + REG_SIZE * 1]
        mov     rData,  [r4 + REG_SIZE * 2]
        mov     rNum,   [r4 + REG_SIZE * 3]
      else ; fastcall
        mov     rNum,   [r4 + REG_SIZE * 1]
      endif
        push    r5
        mov     r5, r4
        and     r4, -16
        sub     r4, LOCAL_SIZE
    endif
endm

MY_EPILOG macro
    ifdef x64
      if (IS_LINUX eq 0)
        movdqa  xmm8, [r4]
        movdqa  xmm9, [r4 + 16]
        add     r4, LOCAL_SIZE + 8
        movdqa  xmm6, [r4 + 8]
        movdqa  xmm7, [r4 + 8 + 16]
      endif
    else ; x86
        mov     r4, r5
        pop     r5
    endif
    MY_ENDP
endm


; msg, tmp (and mask2 in the x86 build) all alias xmm0, which is why the
; x86 path reloads the byte-swap mask inside the block loop
msg equ xmm0
tmp equ xmm0
state0_N equ 2
state1_N equ 3
w_regs equ 4


state1_save equ xmm1
state0 equ @CatStr(xmm, %state0_N)
state1 equ @CatStr(xmm, %state1_N)


ifdef x64
        state0_save equ xmm8
        mask2       equ xmm9
else
        state0_save equ [r4]
        mask2       equ xmm0
endif

LOAD_MASK macro
        movdqa  mask2, XMMWORD PTR Reverse_Endian_Mask
endm

LOAD_W macro k:req
        movdqu  @CatStr(xmm, %(w_regs + k)), [rData + (16 * (k))]
        pshufb  @CatStr(xmm, %(w_regs + k)), mask2
endm


; pre1 and pre2 set how many 4-round groups ahead the msg1 / msg2
; message-schedule steps run inside RND4:
; pre1 <= 4 && pre2 >= 1 && pre1 > pre2 && (pre1 - pre2) <= 1
pre1 equ 3
pre2 equ 2


; 4 rounds of SHA-256: two sha256rnds2 consume the low and then the high
; half of msg = W[k] + K[k], interleaved with schedule updates for later
; W groups
RND4 macro k
        movdqa  msg, xmmword ptr [K_CONST + (k) * 16]
        paddd   msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4)))
        MY_sha256rnds2 state0_N, state1_N
        pshufd  msg, msg, 0eH

    if (k GE (4 - pre1)) AND (k LT (16 - pre1))
        ; w4[0] = msg1(w4[-4], w4[-3])
        MY_sha256msg1 (w_regs + ((k + pre1) mod 4)), (w_regs + ((k + pre1 - 3) mod 4))
    endif

        MY_sha256rnds2 state1_N, state0_N

    if (k GE (4 - pre2)) AND (k LT (16 - pre2))
        movdqa  tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 1) mod 4)))
        palignr tmp, @CatStr(xmm, %(w_regs + ((k + pre2 - 2) mod 4))), 4
        paddd   @CatStr(xmm, %(w_regs + ((k + pre2) mod 4))), tmp
        ; w4[0] = msg2(w4[0], w4[-1])
        MY_sha256msg2 %(w_regs + ((k + pre2) mod 4)), %(w_regs + ((k + pre2 - 1) mod 4))
    endif
endm


; repacks the state between memory order (dcba / hgfe) and the
; (cdgh / abef) lane order that sha256rnds2 expects; the permutation
; is its own inverse, so it is applied again after the loop
REVERSE_STATE macro
        ; state0 ; dcba
        ; state1 ; hgfe
        pshufd  tmp, state0, 01bH       ; abcd
        pshufd  state0, state1, 01bH    ; efgh
        movdqa  state1, state0          ; efgh
        punpcklqdq state0, tmp          ; cdgh
        punpckhqdq state1, tmp          ; abef
endm


; Sha256_UpdateBlocks_HW(state, data, numBlocks)
;   state : 8 x 32-bit SHA-256 state words
;   data  : numBlocks * 64 bytes of message data
MY_PROC Sha256_UpdateBlocks_HW, 3
        MY_PROLOG

        cmp     rNum, 0
        je      end_c

        movdqu  state0, [rState]        ; dcba
        movdqu  state1, [rState + 16]   ; hgfe

        REVERSE_STATE

    ifdef x64
        LOAD_MASK
    endif

align 16
nextBlock:
        movdqa  state0_save, state0
        movdqa  state1_save, state1

    ifndef x64
        LOAD_MASK
    endif

        LOAD_W 0
        LOAD_W 1
        LOAD_W 2
        LOAD_W 3

k = 0
rept 16
        RND4 k
k = k + 1
endm

        paddd   state0, state0_save
        paddd   state1, state1_save

        add     rData, 64
        sub     rNum, 1
        jnz     nextBlock

        REVERSE_STATE

        movdqu  [rState], state0
        movdqu  [rState + 16], state1

end_c:
        MY_EPILOG

; _TEXT$SHA256OPT ENDS

end
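
The control flow above is easier to see without the macro layer. Below is a
minimal C intrinsics sketch of the same block-update algorithm (an editor's
illustration, not part of the patch: the function name Sha256_UpdateBlocks_HW_c,
the local K table, and the build flags are assumptions). It mirrors the asm
one-to-one: the REVERSE_STATE repacking, the LOAD_W byte swap, and 16 RND4
groups with the pre1 = 3 / pre2 = 2 schedule windows. Build with SHA-extension
support, e.g. "gcc -mssse3 -msha".

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* same constants as the K_CONST table above */
static const uint32_t K[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

void Sha256_UpdateBlocks_HW_c(uint32_t state[8], const uint8_t *data, size_t numBlocks)
{
    /* Reverse_Endian_Mask: byte-swap each 32-bit word */
    const __m128i mask = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11,
                                       4,  5,  6,  7, 0, 1,  2,  3);

    /* REVERSE_STATE: memory order (dcba / hgfe) -> (cdgh / abef) */
    __m128i t0 = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&state[0]), 0x1B);
    __m128i t1 = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&state[4]), 0x1B);
    __m128i state0 = _mm_unpacklo_epi64(t1, t0);   /* cdgh */
    __m128i state1 = _mm_unpackhi_epi64(t1, t0);   /* abef */

    while (numBlocks--)
    {
        const __m128i save0 = state0, save1 = state1;
        __m128i w[4], msg, tmp;
        int k;

        /* LOAD_W: 64 message bytes, byte-swapped per 32-bit word */
        for (k = 0; k < 4; k++)
            w[k] = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)(data + 16 * k)), mask);

        /* RND4 x 16: 4 rounds per iteration; the two "if" windows are the
           pre1 = 3 and pre2 = 2 message-schedule pipelining of the asm */
        for (k = 0; k < 16; k++)
        {
            msg = _mm_add_epi32(w[k & 3], _mm_loadu_si128((const __m128i *)&K[4 * k]));
            state0 = _mm_sha256rnds2_epu32(state0, state1, msg);  /* rounds 4k+0,1 */
            if (k >= 4 - 3 && k < 16 - 3)                         /* msg1 window   */
                w[(k + 3) & 3] = _mm_sha256msg1_epu32(w[(k + 3) & 3], w[k & 3]);
            msg = _mm_shuffle_epi32(msg, 0x0E);                   /* high half     */
            state1 = _mm_sha256rnds2_epu32(state1, state0, msg);  /* rounds 4k+2,3 */
            if (k >= 4 - 2 && k < 16 - 2)                         /* msg2 window   */
            {
                tmp = _mm_alignr_epi8(w[(k + 1) & 3], w[k & 3], 4);
                w[(k + 2) & 3] = _mm_sha256msg2_epu32(
                    _mm_add_epi32(w[(k + 2) & 3], tmp), w[(k + 1) & 3]);
            }
        }

        state0 = _mm_add_epi32(state0, save0);
        state1 = _mm_add_epi32(state1, save1);
        data += 64;
    }

    /* REVERSE_STATE is its own inverse: repack and store the result */
    t0 = _mm_shuffle_epi32(state0, 0x1B);
    t1 = _mm_shuffle_epi32(state1, 0x1B);
    _mm_storeu_si128((__m128i *)&state[0], _mm_unpacklo_epi64(t1, t0));
    _mm_storeu_si128((__m128i *)&state[4], _mm_unpackhi_epi64(t1, t0));
}

The x86 fallback paths of the asm (cdecl argument loading, spilling
state0_save to the stack) have no counterpart here because the compiler
handles the ABI and register allocation.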
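
A quick way to sanity-check the sketch (again illustrative, using the FIPS 180
"abc" test vector, which fits in one padded 64-byte block):

#include <stdio.h>

int main(void)
{
    /* SHA-256 initial hash values H(0) */
    uint32_t state[8] = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
    };
    /* "abc", the 0x80 padding byte, zeros, then the bit length (24) */
    uint8_t block[64] = { 'a', 'b', 'c', 0x80 };
    block[63] = 24;

    Sha256_UpdateBlocks_HW_c(state, block, 1);

    /* expected: ba7816bf 8f01cfea 414140de 5dae2223
                 b00361a3 96177a9c b410ff61 f20015ad */
    for (int i = 0; i < 8; i++)
        printf("%08x ", state[i]);
    printf("\n");
    return 0;
}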