From 6a6c1c0ea91edeeb18736190feb5a7278d3d1141 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 9 Feb 2022 11:29:23 +0100 Subject: whitespace fix Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 6 +++--- libbb/hash_md5_sha256_x86-64_shaNI.S | 6 +++--- libbb/hash_md5_sha_x86-32_shaNI.S | 4 ++-- libbb/hash_md5_sha_x86-64_shaNI.S | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index aa68193bd..413e2df9e 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -250,7 +250,7 @@ sha256_process_block64_shaNI: .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI .section .rodata.cst256.K256, "aM", @progbits, 256 - .balign 16 + .balign 16 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 @@ -270,8 +270,8 @@ K256: .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 - .balign 16 + .balign 16 PSHUFFLE_BSWAP32_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 + .octa 0x0c0d0e0f08090a0b0405060700010203 #endif diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index 4663f750a..c246762aa 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -257,7 +257,7 @@ sha256_process_block64_shaNI: .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI .section .rodata.cst256.K256, "aM", @progbits, 256 - .balign 16 + .balign 16 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 @@ -277,8 +277,8 @@ K256: .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 - .balign 16 + .balign 16 PSHUFFLE_BSWAP32_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 + .octa 0x0c0d0e0f08090a0b0405060700010203 #endif diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index a61b3cbed..afca98a62 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -219,8 +219,8 @@ sha1_process_block64_shaNI: .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 - .balign 16 + .balign 16 PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f + .octa 0x000102030405060708090a0b0c0d0e0f #endif diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index b32029360..54d122788 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -217,8 +217,8 @@ sha1_process_block64_shaNI: .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 - .balign 16 + .balign 16 PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f + .octa 0x000102030405060708090a0b0c0d0e0f #endif -- cgit v1.2.3-55-g6feb From 6f56fa17131b3cbb84e887c6c5fb202f2492169e Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 10 Feb 2022 15:38:10 +0100 Subject: libbb/sha: improve comments Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 18 +++++++++--------- libbb/hash_md5_sha256_x86-64_shaNI.S | 19 +++++++++---------- libbb/hash_md5_sha_x86-32_shaNI.S | 2 +- libbb/hash_md5_sha_x86-64_shaNI.S | 2 +- 4 files changed, 20 insertions(+), 21 deletions(-) diff 
--git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 413e2df9e..4b33449d4 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -4,7 +4,7 @@ // We use shorter insns, even though they are for "wrong" // data type (fp, not int). // For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA1 insns). +// (CPUs which do have such penalty do not support SHA insns). // For AMD, the penalty is one extra cycle // (allegedly: I failed to find measurable difference). @@ -39,12 +39,13 @@ .balign 8 # allow decoders to fetch at least 2 first insns sha256_process_block64_shaNI: - movu128 76+0*16(%eax), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ - movu128 76+1*16(%eax), STATE1 /* HGFE */ + movu128 76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */ + movu128 76+1*16(%eax), STATE1 /* EFGH */ /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ mova128 STATE1, STATE0 - shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ - shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ + /* --- -------------- ABCD -- EFGH */ + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ /* XMMTMP holds flip mask from here... */ mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP @@ -232,12 +233,11 @@ sha256_process_block64_shaNI: sha256rnds2 STATE1, STATE0 /* Write hash values back in the correct order */ - /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */ - /* STATE1: CDGH */ mova128 STATE0, XMMTMP /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ - shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ + /* --- -------------- HGDC -- FEBA */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ /* add current hash values to previous ones */ movu128 76+1*16(%eax), STATE1 paddd XMMTMP, STATE1 diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index c246762aa..5ed80c2ef 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -4,7 +4,7 @@ // We use shorter insns, even though they are for "wrong" // data type (fp, not int). // For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA1 insns). +// (CPUs which do have such penalty do not support SHA insns). // For AMD, the penalty is one extra cycle // (allegedly: I failed to find measurable difference). @@ -42,12 +42,13 @@ .balign 8 # allow decoders to fetch at least 2 first insns sha256_process_block64_shaNI: - movu128 80+0*16(%rdi), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ - movu128 80+1*16(%rdi), STATE1 /* HGFE */ + movu128 80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */ + movu128 80+1*16(%rdi), STATE1 /* EFGH */ /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ mova128 STATE1, STATE0 - shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ - shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ + /* --- -------------- ABCD -- EFGH */ + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ /* XMMTMP holds flip mask from here... 
*/ mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP @@ -243,13 +244,11 @@ sha256_process_block64_shaNI: paddd CDGH_SAVE, STATE1 /* Write hash values back in the correct order */ - /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */ - /* STATE1: CDGH */ mova128 STATE0, XMMTMP /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ - shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ - + /* --- -------------- HGDC -- FEBA */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ movu128 STATE0, 80+0*16(%rdi) movu128 XMMTMP, 80+1*16(%rdi) diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index afca98a62..c7fb243ce 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -4,7 +4,7 @@ // We use shorter insns, even though they are for "wrong" // data type (fp, not int). // For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA1 insns). +// (CPUs which do have such penalty do not support SHA insns). // For AMD, the penalty is one extra cycle // (allegedly: I failed to find measurable difference). diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index 54d122788..c13cdec07 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -4,7 +4,7 @@ // We use shorter insns, even though they are for "wrong" // data type (fp, not int). // For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA1 insns). +// (CPUs which do have such penalty do not support SHA insns). // For AMD, the penalty is one extra cycle // (allegedly: I failed to find measurable difference). -- cgit v1.2.3-55-g6feb From 8154146be491bc66ab34d5d5f2a2466ddbdcff52 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 11 Feb 2022 06:08:27 +0100 Subject: libbb/sha1: shrink unrolled x86-64 code function old new delta sha1_process_block64 3481 3384 -97 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-64.S | 129 ++++++++++++++++++++--------------------- libbb/hash_md5_sha_x86-64.S.sh | 111 +++++++++++++++++------------------ 2 files changed, 117 insertions(+), 123 deletions(-) diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 287cfe547..51fde082a 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -27,68 +27,60 @@ sha1_process_block64: # xmm7: all round constants # -64(%rsp): area for passing RCONST + W[] from vector to integer units - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] - movaps sha1const(%rip), %xmm7 + movaps bswap32_mask(%rip), %xmm4 pshufd $0x00, %xmm7, %xmm6 - # Load W[] to xmm registers, byteswapping on the fly. + # Load W[] to xmm0..3, byteswapping on the fly. # - # For iterations 0..15, we pass W[] in rsi,r8..r14 + # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 # for use in RD1As instead of spilling them to stack. - # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it is probably a wash. # (We use rsi instead of rN because this makes two - # LEAs in two first RD1As shorter by one byte). 
- movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r8 - bswapq %rsi - bswapq %r8 - rolq $32, %rsi # rsi = W[1]:W[0] - rolq $32, %r8 # r8 = W[3]:W[2] - movq %rsi, %xmm0 - movq %r8, %xmm4 - punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) -# movaps %xmm0, %xmm4 # add RCONST, spill to stack -# paddd %xmm6, %xmm4 -# movups %xmm4, -64+16*0(%rsp) + # ADDs in two first RD1As shorter by one byte). + movups 16*0(%rdi), %xmm0 + pshufb %xmm4, %xmm0 + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movq %xmm5, %rsi +# pextrq $1, %xmm5, %r8 #SSE4.1 insn +# movhpd %xmm5, %r8 #can only move to mem, not to reg + shufps $0x0e, %xmm5, %xmm5 + movq %xmm5, %r8 + + movups 16*1(%rdi), %xmm1 + pshufb %xmm4, %xmm1 + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movq %xmm5, %r9 + shufps $0x0e, %xmm5, %xmm5 + movq %xmm5, %r10 - movq 4*4(%rdi), %r9 - movq 4*6(%rdi), %r10 - bswapq %r9 - bswapq %r10 - rolq $32, %r9 # r9 = W[5]:W[4] - rolq $32, %r10 # r10 = W[7]:W[6] - movq %r9, %xmm1 - movq %r10, %xmm4 - punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) + movups 16*2(%rdi), %xmm2 + pshufb %xmm4, %xmm2 + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movq %xmm5, %r11 + shufps $0x0e, %xmm5, %xmm5 + movq %xmm5, %r12 - movq 4*8(%rdi), %r11 - movq 4*10(%rdi), %r12 - bswapq %r11 - bswapq %r12 - rolq $32, %r11 # r11 = W[9]:W[8] - rolq $32, %r12 # r12 = W[11]:W[10] - movq %r11, %xmm2 - movq %r12, %xmm4 - punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) + movups 16*3(%rdi), %xmm3 + pshufb %xmm4, %xmm3 + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movq %xmm5, %r13 + shufps $0x0e, %xmm5, %xmm5 + movq %xmm5, %r14 - movq 4*12(%rdi), %r13 - movq 4*14(%rdi), %r14 - bswapq %r13 - bswapq %r14 - rolq $32, %r13 # r13 = W[13]:W[12] - rolq $32, %r14 # r14 = W[15]:W[14] - movq %r13, %xmm3 - movq %r14, %xmm4 - punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) + # MOVQs to GPRs (above) have somewhat high latency. 
+ # Load hash[] while they are completing: + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] # 0 - leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] + addl %esi, %ebp # e += RCONST + W[n] shrq $32, %rsi movl %ecx, %edi # c xorl %edx, %edi # ^d @@ -100,7 +92,7 @@ sha1_process_block64: addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 1 - leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] + addl %esi, %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b @@ -111,7 +103,7 @@ sha1_process_block64: addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 2 - leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] + addl %r8d, %ecx # e += RCONST + W[n] shrq $32, %r8 movl %eax, %edi # c xorl %ebx, %edi # ^d @@ -123,7 +115,7 @@ sha1_process_block64: addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 3 - leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] + addl %r8d, %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b @@ -134,7 +126,7 @@ sha1_process_block64: addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 4 - leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] + addl %r9d, %eax # e += RCONST + W[n] shrq $32, %r9 movl %edx, %edi # c xorl %ebp, %edi # ^d @@ -146,7 +138,7 @@ sha1_process_block64: addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 5 - leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] + addl %r9d, %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b @@ -157,7 +149,7 @@ sha1_process_block64: addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 6 - leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] + addl %r10d, %edx # e += RCONST + W[n] shrq $32, %r10 movl %ebx, %edi # c xorl %ecx, %edi # ^d @@ -169,7 +161,7 @@ sha1_process_block64: addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 7 - leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] + addl %r10d, %ecx # e += RCONST + W[n] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b @@ -210,7 +202,7 @@ sha1_process_block64: paddd %xmm6, %xmm5 movups %xmm5, -64+16*0(%rsp) # 8 - leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] + addl %r11d, %ebx # e += RCONST + W[n] shrq $32, %r11 movl %ebp, %edi # c xorl %eax, %edi # ^d @@ -222,7 +214,7 @@ sha1_process_block64: addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 9 - leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] + addl %r11d, %eax # e += RCONST + W[n] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b @@ -233,7 +225,7 @@ sha1_process_block64: addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 10 - leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] + addl %r12d, %ebp # e += RCONST + W[n] shrq $32, %r12 movl %ecx, %edi # c xorl %edx, %edi # ^d @@ -245,7 +237,7 @@ sha1_process_block64: addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 11 - leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] + addl %r12d, %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b @@ -287,7 +279,7 @@ sha1_process_block64: paddd %xmm6, %xmm5 movups %xmm5, -64+16*1(%rsp) # 12 - leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] + addl %r13d, %ecx # e += RCONST + W[n] shrq $32, %r13 movl %eax, 
%edi # c xorl %ebx, %edi # ^d @@ -299,7 +291,7 @@ sha1_process_block64: addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 13 - leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] + addl %r13d, %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b @@ -310,7 +302,7 @@ sha1_process_block64: addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 14 - leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] + addl %r14d, %eax # e += RCONST + W[n] shrq $32, %r14 movl %edx, %edi # c xorl %ebp, %edi # ^d @@ -322,7 +314,7 @@ sha1_process_block64: addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 15 - leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] + addl %r14d, %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b @@ -1475,6 +1467,11 @@ sha1_process_block64: ret .size sha1_process_block64, .-sha1_process_block64 + .section .rodata.cst16.bswap32_mask, "aM", @progbits, 16 + .balign 16 +bswap32_mask: + .octa 0x0c0d0e0f08090a0b0405060700010203 + .section .rodata.cst16.sha1const, "aM", @progbits, 16 .balign 16 sha1const: diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index a10ac411d..f34e6e6fa 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -129,65 +129,57 @@ sha1_process_block64: # xmm7: all round constants # -64(%rsp): area for passing RCONST + W[] from vector to integer units - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] - movaps sha1const(%rip), $xmmALLRCONST + movaps bswap32_mask(%rip), $xmmT1 pshufd \$0x00, $xmmALLRCONST, $xmmRCONST - # Load W[] to xmm registers, byteswapping on the fly. + # Load W[] to xmm0..3, byteswapping on the fly. # - # For iterations 0..15, we pass W[] in rsi,r8..r14 + # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 # for use in RD1As instead of spilling them to stack. - # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it is probably a wash. # (We use rsi instead of rN because this makes two - # LEAs in two first RD1As shorter by one byte). - movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r8 - bswapq %rsi - bswapq %r8 - rolq \$32, %rsi # rsi = W[1]:W[0] - rolq \$32, %r8 # r8 = W[3]:W[2] - movq %rsi, %xmm0 - movq %r8, $xmmT1 - punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) -# movaps %xmm0, $xmmT1 # add RCONST, spill to stack -# paddd $xmmRCONST, $xmmT1 -# movups $xmmT1, -64+16*0(%rsp) - - movq 4*4(%rdi), %r9 - movq 4*6(%rdi), %r10 - bswapq %r9 - bswapq %r10 - rolq \$32, %r9 # r9 = W[5]:W[4] - rolq \$32, %r10 # r10 = W[7]:W[6] - movq %r9, %xmm1 - movq %r10, $xmmT1 - punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - - movq 4*8(%rdi), %r11 - movq 4*10(%rdi), %r12 - bswapq %r11 - bswapq %r12 - rolq \$32, %r11 # r11 = W[9]:W[8] - rolq \$32, %r12 # r12 = W[11]:W[10] - movq %r11, %xmm2 - movq %r12, $xmmT1 - punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - - movq 4*12(%rdi), %r13 - movq 4*14(%rdi), %r14 - bswapq %r13 - bswapq %r14 - rolq \$32, %r13 # r13 = W[13]:W[12] - rolq \$32, %r14 # r14 = W[15]:W[14] - movq %r13, %xmm3 - movq %r14, $xmmT1 - punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) + # ADDs in two first RD1As shorter by one byte). 
+ movups 16*0(%rdi), %xmm0 + pshufb $xmmT1, %xmm0 + movaps %xmm0, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movq $xmmT2, %rsi +# pextrq \$1, $xmmT2, %r8 #SSE4.1 insn +# movhpd $xmmT2, %r8 #can only move to mem, not to reg + shufps \$0x0e, $xmmT2, $xmmT2 + movq $xmmT2, %r8 + + movups 16*1(%rdi), %xmm1 + pshufb $xmmT1, %xmm1 + movaps %xmm1, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movq $xmmT2, %r9 + shufps \$0x0e, $xmmT2, $xmmT2 + movq $xmmT2, %r10 + + movups 16*2(%rdi), %xmm2 + pshufb $xmmT1, %xmm2 + movaps %xmm2, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movq $xmmT2, %r11 + shufps \$0x0e, $xmmT2, $xmmT2 + movq $xmmT2, %r12 + + movups 16*3(%rdi), %xmm3 + pshufb $xmmT1, %xmm3 + movaps %xmm3, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movq $xmmT2, %r13 + shufps \$0x0e, $xmmT2, $xmmT2 + movq $xmmT2, %r14 + + # MOVQs to GPRs (above) have somewhat high latency. + # Load hash[] while they are completing: + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] " PREP() { @@ -266,15 +258,15 @@ local rN=$((7+n0/2)) echo " # $n ";test $n0 = 0 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] + addl %esi, %e$e # e += RCONST + W[n] shrq \$32, %rsi ";test $n0 = 1 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] + addl %esi, %e$e # e += RCONST + W[n] ";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " - leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] + addl %r${rN}d, %e$e # e += RCONST + W[n] shrq \$32, %r$rN ";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " - leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] + addl %r${rN}d, %e$e # e += RCONST + W[n] ";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d @@ -440,6 +432,11 @@ echo " ret .size sha1_process_block64, .-sha1_process_block64 + .section .rodata.cst16.bswap32_mask, \"aM\", @progbits, 16 + .balign 16 +bswap32_mask: + .octa 0x0c0d0e0f08090a0b0405060700010203 + .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 .balign 16 sha1const: -- cgit v1.2.3-55-g6feb From dda77e83762861b52d62f0f161e2b4bf8092eacf Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 11 Feb 2022 14:53:26 +0100 Subject: libbb/sha1: revert last commit: pshufb is a SSSE3 insn, can't use it Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 4 ++ libbb/hash_md5_sha256_x86-64_shaNI.S | 4 ++ libbb/hash_md5_sha_x86-32_shaNI.S | 5 ++ libbb/hash_md5_sha_x86-64.S | 127 +++++++++++++++++---------------- libbb/hash_md5_sha_x86-64.S.sh | 133 +++++++++++++++++++++-------------- libbb/hash_md5_sha_x86-64_shaNI.S | 5 ++ 6 files changed, 163 insertions(+), 115 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 4b33449d4..c059fb18d 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -15,6 +15,10 @@ //#define shuf128_32 pshufd #define shuf128_32 shufps +// pshufb and palignr are SSSE3 insns. +// We do not check SSSE3 in cpuid, +// all SHA-capable CPUs support it as well. 
+ .section .text.sha256_process_block64_shaNI, "ax", @progbits .globl sha256_process_block64_shaNI .hidden sha256_process_block64_shaNI diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index 5ed80c2ef..9578441f8 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -15,6 +15,10 @@ //#define shuf128_32 pshufd #define shuf128_32 shufps +// pshufb and palignr are SSSE3 insns. +// We do not check SSSE3 in cpuid, +// all SHA-capable CPUs support it as well. + .section .text.sha256_process_block64_shaNI, "ax", @progbits .globl sha256_process_block64_shaNI .hidden sha256_process_block64_shaNI diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index c7fb243ce..2366b046a 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -20,6 +20,11 @@ #define extr128_32 pextrd //#define extr128_32 extractps # not shorter +// pshufb is a SSSE3 insn. +// pinsrd, pextrd, extractps are SSE4.1 insns. +// We do not check SSSE3/SSE4.1 in cpuid, +// all SHA-capable CPUs support them as well. + .section .text.sha1_process_block64_shaNI, "ax", @progbits .globl sha1_process_block64_shaNI .hidden sha1_process_block64_shaNI diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 51fde082a..f0daa30f6 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -27,60 +27,68 @@ sha1_process_block64: # xmm7: all round constants # -64(%rsp): area for passing RCONST + W[] from vector to integer units + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] + movaps sha1const(%rip), %xmm7 - movaps bswap32_mask(%rip), %xmm4 pshufd $0x00, %xmm7, %xmm6 # Load W[] to xmm0..3, byteswapping on the fly. # - # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 + # For iterations 0..15, we pass W[] in rsi,r8..r14 # for use in RD1As instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it is probably a wash. # (We use rsi instead of rN because this makes two - # ADDs in two first RD1As shorter by one byte). - movups 16*0(%rdi), %xmm0 - pshufb %xmm4, %xmm0 - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movq %xmm5, %rsi -# pextrq $1, %xmm5, %r8 #SSE4.1 insn -# movhpd %xmm5, %r8 #can only move to mem, not to reg - shufps $0x0e, %xmm5, %xmm5 - movq %xmm5, %r8 - - movups 16*1(%rdi), %xmm1 - pshufb %xmm4, %xmm1 - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movq %xmm5, %r9 - shufps $0x0e, %xmm5, %xmm5 - movq %xmm5, %r10 + # LEAs in two first RD1As shorter by one byte). 
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r8 + bswapq %rsi + bswapq %r8 + rolq $32, %rsi # rsi = W[1]:W[0] + rolq $32, %r8 # r8 = W[3]:W[2] + movq %rsi, %xmm0 + movq %r8, %xmm4 + punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, %xmm4 # add RCONST, spill to stack +# paddd %xmm6, %xmm4 +# movups %xmm4, -64+16*0(%rsp) - movups 16*2(%rdi), %xmm2 - pshufb %xmm4, %xmm2 - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movq %xmm5, %r11 - shufps $0x0e, %xmm5, %xmm5 - movq %xmm5, %r12 + movq 4*4(%rdi), %r9 + movq 4*6(%rdi), %r10 + bswapq %r9 + bswapq %r10 + rolq $32, %r9 # r9 = W[5]:W[4] + rolq $32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 + movq %r10, %xmm4 + punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - movups 16*3(%rdi), %xmm3 - pshufb %xmm4, %xmm3 - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movq %xmm5, %r13 - shufps $0x0e, %xmm5, %xmm5 - movq %xmm5, %r14 + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq $32, %r11 # r11 = W[9]:W[8] + rolq $32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, %xmm4 + punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - # MOVQs to GPRs (above) have somewhat high latency. - # Load hash[] while they are completing: - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] + movq 4*12(%rdi), %r13 + movq 4*14(%rdi), %r14 + bswapq %r13 + bswapq %r14 + rolq $32, %r13 # r13 = W[13]:W[12] + rolq $32, %r14 # r14 = W[15]:W[14] + movq %r13, %xmm3 + movq %r14, %xmm4 + punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) # 0 - addl %esi, %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] shrq $32, %rsi movl %ecx, %edi # c xorl %edx, %edi # ^d @@ -92,7 +100,7 @@ sha1_process_block64: addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 1 - addl %esi, %edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b @@ -103,7 +111,7 @@ sha1_process_block64: addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 2 - addl %r8d, %ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] shrq $32, %r8 movl %eax, %edi # c xorl %ebx, %edi # ^d @@ -115,7 +123,7 @@ sha1_process_block64: addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 3 - addl %r8d, %ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b @@ -126,7 +134,7 @@ sha1_process_block64: addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 4 - addl %r9d, %eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] shrq $32, %r9 movl %edx, %edi # c xorl %ebp, %edi # ^d @@ -138,7 +146,7 @@ sha1_process_block64: addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 5 - addl %r9d, %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b @@ -149,7 +157,7 @@ sha1_process_block64: addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 6 - addl %r10d, %edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] shrq $32, %r10 movl %ebx, %edi # c xorl %ecx, %edi # ^d @@ -161,7 +169,7 @@ sha1_process_block64: addl %edi, %edx # e += 
rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 7 - addl %r10d, %ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b @@ -202,7 +210,7 @@ sha1_process_block64: paddd %xmm6, %xmm5 movups %xmm5, -64+16*0(%rsp) # 8 - addl %r11d, %ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] shrq $32, %r11 movl %ebp, %edi # c xorl %eax, %edi # ^d @@ -214,7 +222,7 @@ sha1_process_block64: addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 9 - addl %r11d, %eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b @@ -225,7 +233,7 @@ sha1_process_block64: addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 10 - addl %r12d, %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] shrq $32, %r12 movl %ecx, %edi # c xorl %edx, %edi # ^d @@ -237,7 +245,7 @@ sha1_process_block64: addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 11 - addl %r12d, %edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b @@ -279,7 +287,7 @@ sha1_process_block64: paddd %xmm6, %xmm5 movups %xmm5, -64+16*1(%rsp) # 12 - addl %r13d, %ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] shrq $32, %r13 movl %eax, %edi # c xorl %ebx, %edi # ^d @@ -291,7 +299,7 @@ sha1_process_block64: addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 13 - addl %r13d, %ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b @@ -302,7 +310,7 @@ sha1_process_block64: addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 14 - addl %r14d, %eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] shrq $32, %r14 movl %edx, %edi # c xorl %ebp, %edi # ^d @@ -314,7 +322,7 @@ sha1_process_block64: addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 15 - addl %r14d, %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b @@ -1467,11 +1475,6 @@ sha1_process_block64: ret .size sha1_process_block64, .-sha1_process_block64 - .section .rodata.cst16.bswap32_mask, "aM", @progbits, 16 - .balign 16 -bswap32_mask: - .octa 0x0c0d0e0f08090a0b0405060700010203 - .section .rodata.cst16.sha1const, "aM", @progbits, 16 .balign 16 sha1const: diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index f34e6e6fa..57e77b118 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -99,6 +99,30 @@ INTERLEAVE() { ) } +# movaps bswap32_mask(%rip), $xmmT1 +# Load W[] to xmm0..3, byteswapping on the fly. +# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 +# for use in RD1As instead of spilling them to stack. +# (We use rsi instead of rN because this makes two +# ADDs in two first RD1As shorter by one byte). +# movups 16*0(%rdi), %xmm0 +# pshufb $xmmT1, %xmm0 #SSSE3 insn +# movaps %xmm0, $xmmT2 +# paddd $xmmRCONST, $xmmT2 +# movq $xmmT2, %rsi +# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn +# #movhpd $xmmT2, %r8 #can only move to mem, not to reg +# shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence +# movq $xmmT2, %r8 # instead +# ... +# +# ... 
+#- leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] +#+ addl %esi, %e$e # e += RCONST + W[n] +# ^^^^^^^^^^^^^^^^^^^^^^^^ +# The above is -97 bytes of code... +# ...but pshufb is a SSSE3 insn. Can't use it. + echo \ "### Generated by hash_md5_sha_x86-64.S.sh ### @@ -129,57 +153,65 @@ sha1_process_block64: # xmm7: all round constants # -64(%rsp): area for passing RCONST + W[] from vector to integer units + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] + movaps sha1const(%rip), $xmmALLRCONST - movaps bswap32_mask(%rip), $xmmT1 pshufd \$0x00, $xmmALLRCONST, $xmmRCONST # Load W[] to xmm0..3, byteswapping on the fly. # - # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 + # For iterations 0..15, we pass W[] in rsi,r8..r14 # for use in RD1As instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it is probably a wash. # (We use rsi instead of rN because this makes two - # ADDs in two first RD1As shorter by one byte). - movups 16*0(%rdi), %xmm0 - pshufb $xmmT1, %xmm0 - movaps %xmm0, $xmmT2 - paddd $xmmRCONST, $xmmT2 - movq $xmmT2, %rsi -# pextrq \$1, $xmmT2, %r8 #SSE4.1 insn -# movhpd $xmmT2, %r8 #can only move to mem, not to reg - shufps \$0x0e, $xmmT2, $xmmT2 - movq $xmmT2, %r8 - - movups 16*1(%rdi), %xmm1 - pshufb $xmmT1, %xmm1 - movaps %xmm1, $xmmT2 - paddd $xmmRCONST, $xmmT2 - movq $xmmT2, %r9 - shufps \$0x0e, $xmmT2, $xmmT2 - movq $xmmT2, %r10 - - movups 16*2(%rdi), %xmm2 - pshufb $xmmT1, %xmm2 - movaps %xmm2, $xmmT2 - paddd $xmmRCONST, $xmmT2 - movq $xmmT2, %r11 - shufps \$0x0e, $xmmT2, $xmmT2 - movq $xmmT2, %r12 - - movups 16*3(%rdi), %xmm3 - pshufb $xmmT1, %xmm3 - movaps %xmm3, $xmmT2 - paddd $xmmRCONST, $xmmT2 - movq $xmmT2, %r13 - shufps \$0x0e, $xmmT2, $xmmT2 - movq $xmmT2, %r14 - - # MOVQs to GPRs (above) have somewhat high latency. - # Load hash[] while they are completing: - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] + # LEAs in two first RD1As shorter by one byte). 
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r8 + bswapq %rsi + bswapq %r8 + rolq \$32, %rsi # rsi = W[1]:W[0] + rolq \$32, %r8 # r8 = W[3]:W[2] + movq %rsi, %xmm0 + movq %r8, $xmmT1 + punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, $xmmT1 # add RCONST, spill to stack +# paddd $xmmRCONST, $xmmT1 +# movups $xmmT1, -64+16*0(%rsp) + + movq 4*4(%rdi), %r9 + movq 4*6(%rdi), %r10 + bswapq %r9 + bswapq %r10 + rolq \$32, %r9 # r9 = W[5]:W[4] + rolq \$32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 + movq %r10, $xmmT1 + punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) + + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq \$32, %r11 # r11 = W[9]:W[8] + rolq \$32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, $xmmT1 + punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) + + movq 4*12(%rdi), %r13 + movq 4*14(%rdi), %r14 + bswapq %r13 + bswapq %r14 + rolq \$32, %r13 # r13 = W[13]:W[12] + rolq \$32, %r14 # r14 = W[15]:W[14] + movq %r13, %xmm3 + movq %r14, $xmmT1 + punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) " PREP() { @@ -258,15 +290,15 @@ local rN=$((7+n0/2)) echo " # $n ";test $n0 = 0 && echo " - addl %esi, %e$e # e += RCONST + W[n] + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] shrq \$32, %rsi ";test $n0 = 1 && echo " - addl %esi, %e$e # e += RCONST + W[n] + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] ";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " - addl %r${rN}d, %e$e # e += RCONST + W[n] + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] shrq \$32, %r$rN ";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " - addl %r${rN}d, %e$e # e += RCONST + W[n] + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] ";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d @@ -432,11 +464,6 @@ echo " ret .size sha1_process_block64, .-sha1_process_block64 - .section .rodata.cst16.bswap32_mask, \"aM\", @progbits, 16 - .balign 16 -bswap32_mask: - .octa 0x0c0d0e0f08090a0b0405060700010203 - .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 .balign 16 sha1const: diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index c13cdec07..794e97040 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -20,6 +20,11 @@ #define extr128_32 pextrd //#define extr128_32 extractps # not shorter +// pshufb is a SSSE3 insn. +// pinsrd, pextrd, extractps are SSE4.1 insns. +// We do not check SSSE3/SSE4.1 in cpuid, +// all SHA-capable CPUs support them as well. 
+ .section .text.sha1_process_block64_shaNI, "ax", @progbits .globl sha1_process_block64_shaNI .hidden sha1_process_block64_shaNI -- cgit v1.2.3-55-g6feb From 1f272c06d02e7c7f0f3af1f97165722255c8828d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 11 Feb 2022 23:03:27 +0100 Subject: whitespace fixes Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-64.S | 8 ++++---- libbb/hash_md5_sha_x86-64.S.sh | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index f0daa30f6..1d55b91f8 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -71,8 +71,8 @@ sha1_process_block64: movq 4*10(%rdi), %r12 bswapq %r11 bswapq %r12 - rolq $32, %r11 # r11 = W[9]:W[8] - rolq $32, %r12 # r12 = W[11]:W[10] + rolq $32, %r11 # r11 = W[9]:W[8] + rolq $32, %r12 # r12 = W[11]:W[10] movq %r11, %xmm2 movq %r12, %xmm4 punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) @@ -81,8 +81,8 @@ sha1_process_block64: movq 4*14(%rdi), %r14 bswapq %r13 bswapq %r14 - rolq $32, %r13 # r13 = W[13]:W[12] - rolq $32, %r14 # r14 = W[15]:W[14] + rolq $32, %r13 # r13 = W[13]:W[12] + rolq $32, %r14 # r14 = W[15]:W[14] movq %r13, %xmm3 movq %r14, %xmm4 punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 57e77b118..40c979d35 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -99,7 +99,7 @@ INTERLEAVE() { ) } -# movaps bswap32_mask(%rip), $xmmT1 +# movaps bswap32_mask(%rip), $xmmT1 # Load W[] to xmm0..3, byteswapping on the fly. # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 # for use in RD1As instead of spilling them to stack. @@ -110,8 +110,8 @@ INTERLEAVE() { # movaps %xmm0, $xmmT2 # paddd $xmmRCONST, $xmmT2 # movq $xmmT2, %rsi -# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn -# #movhpd $xmmT2, %r8 #can only move to mem, not to reg +# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn +# #movhpd $xmmT2, %r8 #can only move to mem, not to reg # shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence # movq $xmmT2, %r8 # instead # ... @@ -197,8 +197,8 @@ sha1_process_block64: movq 4*10(%rdi), %r12 bswapq %r11 bswapq %r12 - rolq \$32, %r11 # r11 = W[9]:W[8] - rolq \$32, %r12 # r12 = W[11]:W[10] + rolq \$32, %r11 # r11 = W[9]:W[8] + rolq \$32, %r12 # r12 = W[11]:W[10] movq %r11, %xmm2 movq %r12, $xmmT1 punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) @@ -207,8 +207,8 @@ sha1_process_block64: movq 4*14(%rdi), %r14 bswapq %r13 bswapq %r14 - rolq \$32, %r13 # r13 = W[13]:W[12] - rolq \$32, %r14 # r14 = W[15]:W[14] + rolq \$32, %r13 # r13 = W[13]:W[12] + rolq \$32, %r14 # r14 = W[15]:W[14] movq %r13, %xmm3 movq %r14, $xmmT1 punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) -- cgit v1.2.3-55-g6feb From c2e7780e526b0f421c3b43367a53019d1dc5f2d6 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 12 Feb 2022 00:52:12 +0100 Subject: libbb/sha256: explicitly use sha256rnds2's %xmm0 (MSG) argument Else, the code seemingly does not use MSG. 
Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 64 +++++++++++++++--------------- libbb/hash_md5_sha256_x86-64_shaNI.S | 76 ++++++++++++++++++------------------ 2 files changed, 70 insertions(+), 70 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index c059fb18d..3905bad9a 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -60,18 +60,18 @@ sha256_process_block64_shaNI: pshufb XMMTMP, MSG mova128 MSG, MSGTMP0 paddd 0*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Rounds 4-7 */ movu128 1*16(DATA_PTR), MSG pshufb XMMTMP, MSG mova128 MSG, MSGTMP1 paddd 1*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP1, MSGTMP0 /* Rounds 8-11 */ @@ -79,9 +79,9 @@ sha256_process_block64_shaNI: pshufb XMMTMP, MSG mova128 MSG, MSGTMP2 paddd 2*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP2, MSGTMP1 /* Rounds 12-15 */ @@ -90,151 +90,151 @@ sha256_process_block64_shaNI: /* ...to here */ mova128 MSG, MSGTMP3 paddd 3*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP3, XMMTMP palignr $4, MSGTMP2, XMMTMP paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP3, MSGTMP2 /* Rounds 16-19 */ mova128 MSGTMP0, MSG paddd 4*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP0, XMMTMP palignr $4, MSGTMP3, XMMTMP paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP0, MSGTMP3 /* Rounds 20-23 */ mova128 MSGTMP1, MSG paddd 5*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP1, XMMTMP palignr $4, MSGTMP0, XMMTMP paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP1, MSGTMP0 /* Rounds 24-27 */ mova128 MSGTMP2, MSG paddd 6*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP2, XMMTMP palignr $4, MSGTMP1, XMMTMP paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP2, MSGTMP1 /* Rounds 28-31 */ mova128 MSGTMP3, MSG paddd 7*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP3, XMMTMP palignr $4, MSGTMP2, XMMTMP paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP3, MSGTMP2 /* Rounds 32-35 */ mova128 MSGTMP0, MSG paddd 8*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP0, XMMTMP palignr $4, MSGTMP3, XMMTMP paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, 
STATE1, STATE0 sha256msg1 MSGTMP0, MSGTMP3 /* Rounds 36-39 */ mova128 MSGTMP1, MSG paddd 9*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP1, XMMTMP palignr $4, MSGTMP0, XMMTMP paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP1, MSGTMP0 /* Rounds 40-43 */ mova128 MSGTMP2, MSG paddd 10*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP2, XMMTMP palignr $4, MSGTMP1, XMMTMP paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP2, MSGTMP1 /* Rounds 44-47 */ mova128 MSGTMP3, MSG paddd 11*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP3, XMMTMP palignr $4, MSGTMP2, XMMTMP paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP3, MSGTMP2 /* Rounds 48-51 */ mova128 MSGTMP0, MSG paddd 12*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP0, XMMTMP palignr $4, MSGTMP3, XMMTMP paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP0, MSGTMP3 /* Rounds 52-55 */ mova128 MSGTMP1, MSG paddd 13*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP1, XMMTMP palignr $4, MSGTMP0, XMMTMP paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Rounds 56-59 */ mova128 MSGTMP2, MSG paddd 14*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP2, XMMTMP palignr $4, MSGTMP1, XMMTMP paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Rounds 60-63 */ mova128 MSGTMP3, MSG paddd 15*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Write hash values back in the correct order */ mova128 STATE0, XMMTMP diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index 9578441f8..082ceafe4 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -38,8 +38,8 @@ #define XMMTMP %xmm7 -#define ABEF_SAVE %xmm9 -#define CDGH_SAVE %xmm10 +#define SAVE0 %xmm8 +#define SAVE1 %xmm9 #define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) @@ -59,26 +59,26 @@ sha256_process_block64_shaNI: leaq K256+8*16(%rip), SHA256CONSTANTS /* Save hash values for addition after rounds */ - mova128 STATE0, ABEF_SAVE - mova128 STATE1, CDGH_SAVE + mova128 STATE0, SAVE0 + mova128 STATE1, SAVE1 /* Rounds 0-3 */ movu128 0*16(DATA_PTR), MSG pshufb XMMTMP, MSG mova128 MSG, MSGTMP0 paddd 0*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Rounds 4-7 */ movu128 1*16(DATA_PTR), MSG pshufb XMMTMP, MSG mova128 MSG, MSGTMP1 paddd 1*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 
shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP1, MSGTMP0 /* Rounds 8-11 */ @@ -86,9 +86,9 @@ sha256_process_block64_shaNI: pshufb XMMTMP, MSG mova128 MSG, MSGTMP2 paddd 2*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP2, MSGTMP1 /* Rounds 12-15 */ @@ -97,155 +97,155 @@ sha256_process_block64_shaNI: /* ...to here */ mova128 MSG, MSGTMP3 paddd 3*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP3, XMMTMP palignr $4, MSGTMP2, XMMTMP paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP3, MSGTMP2 /* Rounds 16-19 */ mova128 MSGTMP0, MSG paddd 4*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP0, XMMTMP palignr $4, MSGTMP3, XMMTMP paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP0, MSGTMP3 /* Rounds 20-23 */ mova128 MSGTMP1, MSG paddd 5*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP1, XMMTMP palignr $4, MSGTMP0, XMMTMP paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP1, MSGTMP0 /* Rounds 24-27 */ mova128 MSGTMP2, MSG paddd 6*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP2, XMMTMP palignr $4, MSGTMP1, XMMTMP paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP2, MSGTMP1 /* Rounds 28-31 */ mova128 MSGTMP3, MSG paddd 7*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP3, XMMTMP palignr $4, MSGTMP2, XMMTMP paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP3, MSGTMP2 /* Rounds 32-35 */ mova128 MSGTMP0, MSG paddd 8*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP0, XMMTMP palignr $4, MSGTMP3, XMMTMP paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP0, MSGTMP3 /* Rounds 36-39 */ mova128 MSGTMP1, MSG paddd 9*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP1, XMMTMP palignr $4, MSGTMP0, XMMTMP paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP1, MSGTMP0 /* Rounds 40-43 */ mova128 MSGTMP2, MSG paddd 10*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP2, XMMTMP palignr $4, MSGTMP1, XMMTMP paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP2, MSGTMP1 /* Rounds 44-47 */ mova128 MSGTMP3, MSG paddd 11*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 
MSGTMP3, XMMTMP palignr $4, MSGTMP2, XMMTMP paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP3, MSGTMP2 /* Rounds 48-51 */ mova128 MSGTMP0, MSG paddd 12*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP0, XMMTMP palignr $4, MSGTMP3, XMMTMP paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP0, MSGTMP3 /* Rounds 52-55 */ mova128 MSGTMP1, MSG paddd 13*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP1, XMMTMP palignr $4, MSGTMP0, XMMTMP paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Rounds 56-59 */ mova128 MSGTMP2, MSG paddd 14*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP2, XMMTMP palignr $4, MSGTMP1, XMMTMP paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Rounds 60-63 */ mova128 MSGTMP3, MSG paddd 15*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Add current hash values with previously saved */ - paddd ABEF_SAVE, STATE0 - paddd CDGH_SAVE, STATE1 + paddd SAVE0, STATE0 + paddd SAVE1, STATE1 /* Write hash values back in the correct order */ mova128 STATE0, XMMTMP -- cgit v1.2.3-55-g6feb From 1891fdda59092a215d3a407d9108bbbe6ab8df7a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 18 Feb 2022 17:09:51 +0100 Subject: libbb/sha1: update config help text with new performance numbers Signed-off-by: Denys Vlasenko --- libbb/Config.src | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libbb/Config.src b/libbb/Config.src index 0ecd5bd46..66a3ffa23 100644 --- a/libbb/Config.src +++ b/libbb/Config.src @@ -57,11 +57,12 @@ config SHA1_SMALL range 0 3 help Trade binary size versus speed for the sha1 algorithm. + With FEATURE_COPYBUF_KB=64: throughput MB/s size of sha1_process_block64 value 486 x86-64 486 x86-64 - 0 367 375 3657 3502 - 1 224 229 654 732 - 2,3 200 195 358 380 + 0 440 485 3481 3502 + 1 265 265 641 696 + 2,3 220 210 342 364 config SHA1_HWACCEL bool "SHA1: Use hardware accelerated instructions if possible" -- cgit v1.2.3-55-g6feb From fa52ac9781f479de8ab4d8526276244c0a0471f4 Mon Sep 17 00:00:00 2001 From: Sören Tempel Date: Mon, 28 Feb 2022 08:36:50 +0100 Subject: ash: don't read past end of var in subvareval for bash substitutions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this patch, BusyBox handles bash pattern substitutions without a terminating '/' character incorrectly. Consider the following shell script: _bootstrapver=5.0.211-r0 _referencesdir="/usr/${_bootstrapver/-*}/Sources" echo $_referencesdir This should output `/usr/5.0.211/Sources`. However, without this patch it instead outputs `/usr/5.0.211Sources`. This is due to the fact that BusyBox expects the bash pattern substitutions to always be terminated with a '/' (at least in this part of subvareval) and thus reads passed the substitution itself and consumes the '/' character which is part of the literal string. 
If there is no '/' after the substitution then BusyBox might perform an out-of-bounds read under certain circumstances. When replacing the bash pattern substitution with `${_bootstrapver/-*/}`, or with this patch applied, ash outputs the correct value. Signed-off-by: Sören Tempel Signed-off-by: Denys Vlasenko --- shell/ash.c | 4 ++++ shell/ash_test/ash-vars/var_bash_repl_unterminated.right | 1 + shell/ash_test/ash-vars/var_bash_repl_unterminated.tests | 2 ++ shell/hush_test/hush-vars/var_bash_repl_unterminated.right | 1 + shell/hush_test/hush-vars/var_bash_repl_unterminated.tests | 2 ++ 5 files changed, 10 insertions(+) create mode 100644 shell/ash_test/ash-vars/var_bash_repl_unterminated.right create mode 100755 shell/ash_test/ash-vars/var_bash_repl_unterminated.tests create mode 100644 shell/hush_test/hush-vars/var_bash_repl_unterminated.right create mode 100755 shell/hush_test/hush-vars/var_bash_repl_unterminated.tests diff --git a/shell/ash.c b/shell/ash.c index adb0f223a..54335c5dd 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -7081,6 +7081,10 @@ subevalvar(char *start, char *str, int strloc, *repl = '\0'; break; } + if ((unsigned char)*repl == CTLENDVAR) { /* ${v/pattern} (no trailing /, no repl) */ + repl = NULL; + break; + } /* Handle escaped slashes, e.g. "${v/\//_}" (they are CTLESC'ed by this point) */ if ((unsigned char)*repl == CTLESC && repl[1]) repl++; diff --git a/shell/ash_test/ash-vars/var_bash_repl_unterminated.right b/shell/ash_test/ash-vars/var_bash_repl_unterminated.right new file mode 100644 index 000000000..5bff3a6fa --- /dev/null +++ b/shell/ash_test/ash-vars/var_bash_repl_unterminated.right @@ -0,0 +1 @@ +b/d diff --git a/shell/ash_test/ash-vars/var_bash_repl_unterminated.tests b/shell/ash_test/ash-vars/var_bash_repl_unterminated.tests new file mode 100755 index 000000000..c9513343d --- /dev/null +++ b/shell/ash_test/ash-vars/var_bash_repl_unterminated.tests @@ -0,0 +1,2 @@ +a=b-c +echo ${a/-*}/d diff --git a/shell/hush_test/hush-vars/var_bash_repl_unterminated.right b/shell/hush_test/hush-vars/var_bash_repl_unterminated.right new file mode 100644 index 000000000..5bff3a6fa --- /dev/null +++ b/shell/hush_test/hush-vars/var_bash_repl_unterminated.right @@ -0,0 +1 @@ +b/d diff --git a/shell/hush_test/hush-vars/var_bash_repl_unterminated.tests b/shell/hush_test/hush-vars/var_bash_repl_unterminated.tests new file mode 100755 index 000000000..c9513343d --- /dev/null +++ b/shell/hush_test/hush-vars/var_bash_repl_unterminated.tests @@ -0,0 +1,2 @@ +a=b-c +echo ${a/-*}/d -- cgit v1.2.3-55-g6feb From 7750b5a25a8cf9081b7c248687c876d0068e85bb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 1 Mar 2022 09:56:54 +0100 Subject: ash: fix unsafe use of mempcpy function old new delta subevalvar 1549 1557 +8 Signed-off-by: Denys Vlasenko --- shell/ash.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/shell/ash.c b/shell/ash.c index 54335c5dd..44ec2eafd 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -7191,7 +7191,13 @@ subevalvar(char *start, char *str, int strloc, len = orig_len - pos; if (!quotes) { - loc = mempcpy(startp, startp + pos, len); + /* want: loc = mempcpy(startp, startp + pos, len) + * but it does not allow overlapping arguments */ + loc = startp; + while (--len >= 0) { + *loc = loc[pos]; + loc++; + } } else { for (vstr = startp; pos != 0; pos--) { if ((unsigned char)*vstr == CTLESC) -- cgit v1.2.3-55-g6feb From 5fe20cf3212fbada86fb75cf13064caed6a5f3a9 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 1 Mar 2022 10:08:59 +0100 
Subject: ash: do not truncate failed tilde expansion on unknown user names Do not skip over "*p = c;" statement. Testcase: echo ~~nouser/qwe function old new delta argstr 1396 1406 +10 Signed-off-by: Denys Vlasenko --- shell/ash.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/shell/ash.c b/shell/ash.c index 44ec2eafd..ef4a47afe 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -6532,9 +6532,7 @@ exptilde(char *startp, int flag) home = lookupvar("HOME"); } else { pw = getpwnam(name); - if (pw == NULL) - goto lose; - home = pw->pw_dir; + home = pw ? pw->pw_dir : NULL; } *p = c; if (!home) -- cgit v1.2.3-55-g6feb From 55f969a006109703dd056bee1b6c1d11b0602449 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 1 Mar 2022 10:46:49 +0100 Subject: taskset: fix printf format mismatch in !FEATURE_TASKSET_FANCY config. closes 14616 Signed-off-by: Denys Vlasenko --- util-linux/taskset.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/util-linux/taskset.c b/util-linux/taskset.c index d2ef9b98f..8b410f369 100644 --- a/util-linux/taskset.c +++ b/util-linux/taskset.c @@ -55,7 +55,6 @@ * Not yet implemented: * -a/--all-tasks (affect all threads) * needs to get TIDs from /proc/PID/task/ and use _them_ as "pid" in sched_setaffinity(pid) - * -c/--cpu-list (specify CPUs via "1,3,5-7") */ #include @@ -91,7 +90,7 @@ static char *from_mask(const ul *mask, unsigned sz_in_bytes) } #else #define TASKSET_PRINTF_MASK "%lx" -static unsigned long long from_mask(ul *mask, unsigned sz_in_bytes UNUSED_PARAM) +static unsigned long from_mask(ul *mask, unsigned sz_in_bytes UNUSED_PARAM) { return *mask; } -- cgit v1.2.3-55-g6feb From fc7868602ecf0d761a9a877141add4a9b6918d02 Mon Sep 17 00:00:00 2001 From: Ron Yorston Date: Thu, 3 Mar 2022 11:35:46 +0000 Subject: vi: improved handling of backspace in replace mode In replace mode ('R' command) the backspace character should get special treatment: - backspace only goes back to the start of the replacement; - backspacing over replaced characters restores the original text. Prior to this commit BusyBox vi deleted the characters both before and after the cursor in replace mode. 
function old new delta undo_pop - 235 +235 char_insert 858 884 +26 indicate_error 81 84 +3 find_range 654 657 +3 static.text_yank 77 79 +2 do_cmd 4486 4243 -243 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 4/1 up/down: 269/-243) Total: 26 bytes Signed-off-by: Ron Yorston Signed-off-by: Denys Vlasenko --- editors/vi.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/editors/vi.c b/editors/vi.c index d37cd48a3..4257c0fdc 100644 --- a/editors/vi.c +++ b/editors/vi.c @@ -224,6 +224,7 @@ #endif +#define isbackspace(c) ((c) == term_orig.c_cc[VERASE] || (c) == 8 || (c) == 127) enum { MAX_TABSTOP = 32, // sanity limit @@ -342,6 +343,7 @@ struct globals { int last_modified_count; // = -1; int cmdline_filecnt; // how many file names on cmd line int cmdcnt; // repetition count + char *rstart; // start of text in Replace mode unsigned rows, columns; // the terminal screen is this size #if ENABLE_FEATURE_VI_ASK_TERMINAL int get_rowcol_error; @@ -474,6 +476,7 @@ struct globals { #define last_modified_count (G.last_modified_count) #define cmdline_filecnt (G.cmdline_filecnt ) #define cmdcnt (G.cmdcnt ) +#define rstart (G.rstart ) #define rows (G.rows ) #define columns (G.columns ) #define crow (G.crow ) @@ -1212,7 +1215,7 @@ static char *get_input_line(const char *prompt) c = get_one_char(); if (c == '\n' || c == '\r' || c == 27) break; // this is end of input - if (c == term_orig.c_cc[VERASE] || c == 8 || c == 127) { + if (isbackspace(c)) { // user wants to erase prev char write1("\b \b"); // erase char on screen buf[--i] = '\0'; @@ -2174,8 +2177,16 @@ static char *char_insert(char *p, char c, int undo) // insert the char c at 'p' p += 1 + stupid_insert(p, ' '); } #endif - } else if (c == term_orig.c_cc[VERASE] || c == 8 || c == 127) { // Is this a BS - if (p > text) { + } else if (isbackspace(c)) { + if (cmd_mode == 2) { + // special treatment for backspace in Replace mode + if (p > rstart) { + p--; +#if ENABLE_FEATURE_VI_UNDO + undo_pop(); +#endif + } + } else if (p > text) { p--; p = text_hole_delete(p, p, ALLOW_UNDO_QUEUED); // shrink buffer 1 char } @@ -3703,9 +3714,9 @@ static void do_cmd(int c) undo_queue_commit(); } else { if (1 <= c || Isprint(c)) { - if (c != 27) - dot = yank_delete(dot, dot, PARTIAL, YANKDEL, ALLOW_UNDO); // delete char - dot = char_insert(dot, c, ALLOW_UNDO_CHAIN); // insert new char + if (c != 27 && !isbackspace(c)) + dot = yank_delete(dot, dot, PARTIAL, YANKDEL, ALLOW_UNDO); + dot = char_insert(dot, c, ALLOW_UNDO_CHAIN); } goto dc1; } @@ -4264,6 +4275,7 @@ static void do_cmd(int c) dc5: cmd_mode = 2; undo_queue_commit(); + rstart = dot; break; case KEYCODE_DELETE: if (dot < end - 1) -- cgit v1.2.3-55-g6feb
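The commit above describes the intended replace-mode rule only in prose. Below is a minimal standalone C sketch (not busybox code; the buffer, rstart and the undo stack are illustrative assumptions) modelling that rule: in 'R' mode, backspace never moves left of where the replacement started, and each backspace restores the character that was overwritten.

/* Conceptual model of replace-mode backspace, per the patch description.
 * All names here are hypothetical; this is not the busybox implementation. */
#include <stdio.h>

static char line[64] = "hello world";
static char undo_stack[64];   /* characters overwritten in this R session */
static int  undo_top;

int main(void)
{
	int rstart = 6;       /* replacement started at the 'w' of "world" */
	int dot = rstart;

	/* type "Wo" in replace mode: save each old char, then overwrite it */
	for (const char *p = "Wo"; *p; p++) {
		undo_stack[undo_top++] = line[dot];
		line[dot++] = *p;
	}
	printf("after typing   : %s\n", line);   /* "hello World" */

	/* three backspaces: restore overwritten chars, but never cross rstart,
	 * so the third backspace is a no-op */
	for (int i = 0; i < 3; i++) {
		if (dot > rstart)
			line[--dot] = undo_stack[--undo_top];
	}
	printf("after backspace: %s\n", line);   /* "hello world" */
	return 0;
}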