libbb/sha1: shrink unrolled x86-64 code

function                                             old     new   delta
sha1_process_block64                                3481    3384     -97

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
author: Denys Vlasenko <vda.linux@googlemail.com> 2022-02-11 06:08:27 +0100
committer: Denys Vlasenko <vda.linux@googlemail.com> 2022-02-11 06:08:27 +0100
commit: 8154146be491bc66ab34d5d5f2a2466ddbdcff52 (patch)
tree: 4e890a52e047c466ca05c8263230a6a110884e52 /libbb/hash_md5_sha_x86-64.S.sh
parent: 6f56fa17131b3cbb84e887c6c5fb202f2492169e (diff)
download: busybox-w32-8154146be491bc66ab34d5d5f2a2466ddbdcff52.tar.gz
busybox-w32-8154146be491bc66ab34d5d5f2a2466ddbdcff52.tar.bz2
busybox-w32-8154146be491bc66ab34d5d5f2a2466ddbdcff52.zip
1 files changed, 54 insertions, 57 deletions
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index a10ac411d..f34e6e6fa 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -129,65 +129,57 @@ sha1_process_block64:
 # xmm7: all round constants
 # -64(%rsp): area for passing RCONST + W[] from vector to integer units
-        movl    80(%rdi), %eax          # a = ctx->hash[0]
-        movl    84(%rdi), %ebx          # b = ctx->hash[1]
-        movl    88(%rdi), %ecx          # c = ctx->hash[2]
-        movl    92(%rdi), %edx          # d = ctx->hash[3]
-        movl    96(%rdi), %ebp          # e = ctx->hash[4]
        movaps  sha1const(%rip), $xmmALLRCONST
+        movaps  bswap32_mask(%rip), $xmmT1
        pshufd  \$0x00, $xmmALLRCONST, $xmmRCONST
-        # Load W[] to xmm registers, byteswapping on the fly.
+        # Load W[] to xmm0..3, byteswapping on the fly.
        #
-        # For iterations 0..15, we pass W[] in rsi,r8..r14
+        # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
        # for use in RD1As instead of spilling them to stack.
-        # We lose parallelized addition of RCONST, but LEA
-        # can do two additions at once, so it is probably a wash.
        # (We use rsi instead of rN because this makes two
-        # LEAs in two first RD1As shorter by one byte).
+        # ADDs in two first RD1As shorter by one byte).
-        movq    4*0(%rdi), %rsi
+        movups  16*0(%rdi), %xmm0
-        movq    4*2(%rdi), %r8
+        pshufb  $xmmT1, %xmm0
-        bswapq  %rsi
+        movaps  %xmm0, $xmmT2
-        bswapq  %r8
+        paddd   $xmmRCONST, $xmmT2
-        rolq    \$32, %rsi              # rsi = W[1]:W[0]
+        movq    $xmmT2, %rsi
-        rolq    \$32, %r8               # r8  = W[3]:W[2]
+#       pextrq  \$1, $xmmT2, %r8        #SSE4.1 insn
-        movq    %rsi, %xmm0
+#       movhpd  $xmmT2, %r8             #can only move to mem, not to reg
-        movq    %r8, $xmmT1
+        shufps  \$0x0e, $xmmT2, $xmmT2
-        punpcklqdq $xmmT1, %xmm0        # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
+        movq    $xmmT2, %r8
-#       movaps  %xmm0, $xmmT1           # add RCONST, spill to stack
-#       paddd   $xmmRCONST, $xmmT1
+        movups  16*1(%rdi), %xmm1
-#       movups  $xmmT1, -64+16*0(%rsp)
+        pshufb  $xmmT1, %xmm1
+        movaps  %xmm1, $xmmT2
-        movq    4*4(%rdi), %r9
+        paddd   $xmmRCONST, $xmmT2
-        movq    4*6(%rdi), %r10
+        movq    $xmmT2, %r9
-        bswapq  %r9
+        shufps  \$0x0e, $xmmT2, $xmmT2
-        bswapq  %r10
+        movq    $xmmT2, %r10
-        rolq    \$32, %r9               # r9  = W[5]:W[4]
-        rolq    \$32, %r10              # r10 = W[7]:W[6]
+        movups  16*2(%rdi), %xmm2
-        movq    %r9, %xmm1
+        pshufb  $xmmT1, %xmm2
-        movq    %r10, $xmmT1
+        movaps  %xmm2, $xmmT2
-        punpcklqdq $xmmT1, %xmm1        # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
+        paddd   $xmmRCONST, $xmmT2
+        movq    $xmmT2, %r11
-        movq    4*8(%rdi), %r11
+        shufps  \$0x0e, $xmmT2, $xmmT2
-        movq    4*10(%rdi), %r12
+        movq    $xmmT2, %r12
-        bswapq  %r11
-        bswapq  %r12
+        movups  16*3(%rdi), %xmm3
-        rolq    \$32, %r11              # r11  = W[9]:W[8]
+        pshufb  $xmmT1, %xmm3
-        rolq    \$32, %r12              # r12  = W[11]:W[10]
+        movaps  %xmm3, $xmmT2
-        movq    %r11, %xmm2
+        paddd   $xmmRCONST, $xmmT2
-        movq    %r12, $xmmT1
+        movq    $xmmT2, %r13
-        punpcklqdq $xmmT1, %xmm2        # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
+        shufps  \$0x0e, $xmmT2, $xmmT2
+        movq    $xmmT2, %r14
-        movq    4*12(%rdi), %r13
-        movq    4*14(%rdi), %r14
+        # MOVQs to GPRs (above) have somewhat high latency.
-        bswapq  %r13
+        # Load hash[] while they are completing:
-        bswapq  %r14
+        movl    80(%rdi), %eax          # a = ctx->hash[0]
-        rolq    \$32, %r13              # r13  = W[13]:W[12]
+        movl    84(%rdi), %ebx          # b = ctx->hash[1]
-        rolq    \$32, %r14              # r14  = W[15]:W[14]
+        movl    88(%rdi), %ecx          # c = ctx->hash[2]
-        movq    %r13, %xmm3
+        movl    92(%rdi), %edx          # d = ctx->hash[3]
-        movq    %r14, $xmmT1
+        movl    96(%rdi), %ebp          # e = ctx->hash[4]
-        punpcklqdq $xmmT1, %xmm3        # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
 "
 PREP() {
@@ -266,15 +258,15 @@ local rN=$((7+n0/2))
 echo "
 # $n
 ";test $n0 = 0 && echo "
-        leal    $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
+        addl    %esi, %e$e              # e += RCONST + W[n]
        shrq    \$32, %rsi
 ";test $n0 = 1 && echo "
-        leal    $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
+        addl    %esi, %e$e              # e += RCONST + W[n]
 ";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "
-        leal    $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
+        addl    %r${rN}d, %e$e          # e += RCONST + W[n]
        shrq    \$32, %r$rN
 ";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "
-        leal    $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
+        addl    %r${rN}d, %e$e          # e += RCONST + W[n]
 ";echo "
        movl    %e$c, %edi              # c
        xorl    %e$d, %edi              # ^d
@@ -440,6 +432,11 @@ echo "
        ret
        .size   sha1_process_block64, .-sha1_process_block64
+        .section        .rodata.cst16.bswap32_mask, \"aM\", @progbits, 16
+        .balign 16
+bswap32_mask:
+        .octa   0x0c0d0e0f08090a0b0405060700010203
        .section        .rodata.cst16.sha1const, \"aM\", @progbits, 16
        .balign 16
 sha1const:
author	Denys Vlasenko <vda.linux@googlemail.com>	2022-02-11 06:08:27 +0100
committer	Denys Vlasenko <vda.linux@googlemail.com>	2022-02-11 06:08:27 +0100
commit	8154146be491bc66ab34d5d5f2a2466ddbdcff52 (patch)
tree	4e890a52e047c466ca05c8263230a6a110884e52 /libbb/hash_md5_sha_x86-64.S.sh
parent	6f56fa17131b3cbb84e887c6c5fb202f2492169e (diff)
download	busybox-w32-8154146be491bc66ab34d5d5f2a2466ddbdcff52.tar.gz busybox-w32-8154146be491bc66ab34d5d5f2a2466ddbdcff52.tar.bz2 busybox-w32-8154146be491bc66ab34d5d5f2a2466ddbdcff52.zip

diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index a10ac411d..f34e6e6fa 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -129,65 +129,57 @@ sha1_process_block64:
129	# xmm7: all round constants	129	# xmm7: all round constants
130	# -64(%rsp): area for passing RCONST + W[] from vector to integer units	130	# -64(%rsp): area for passing RCONST + W[] from vector to integer units
131		131
132	movl 80(%rdi), %eax # a = ctx->hash[0]
133	movl 84(%rdi), %ebx # b = ctx->hash[1]
134	movl 88(%rdi), %ecx # c = ctx->hash[2]
135	movl 92(%rdi), %edx # d = ctx->hash[3]
136	movl 96(%rdi), %ebp # e = ctx->hash[4]
137
138	movaps sha1const(%rip), $xmmALLRCONST	132	movaps sha1const(%rip), $xmmALLRCONST
		133	movaps bswap32_mask(%rip), $xmmT1
139	pshufd \$0x00, $xmmALLRCONST, $xmmRCONST	134	pshufd \$0x00, $xmmALLRCONST, $xmmRCONST
140		135
141	# Load W[] to xmm registers, byteswapping on the fly.	136	# Load W[] to xmm0..3, byteswapping on the fly.
142	#	137	#
143	# For iterations 0..15, we pass W[] in rsi,r8..r14	138	# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
144	# for use in RD1As instead of spilling them to stack.	139	# for use in RD1As instead of spilling them to stack.
145	# We lose parallelized addition of RCONST, but LEA
146	# can do two additions at once, so it is probably a wash.
147	# (We use rsi instead of rN because this makes two	140	# (We use rsi instead of rN because this makes two
148	# LEAs in two first RD1As shorter by one byte).	141	# ADDs in two first RD1As shorter by one byte).
149	movq 4*0(%rdi), %rsi	142	movups 16*0(%rdi), %xmm0
150	movq 4*2(%rdi), %r8	143	pshufb $xmmT1, %xmm0
151	bswapq %rsi	144	movaps %xmm0, $xmmT2
152	bswapq %r8	145	paddd $xmmRCONST, $xmmT2
153	rolq \$32, %rsi # rsi = W[1]:W[0]	146	movq $xmmT2, %rsi
154	rolq \$32, %r8 # r8 = W[3]:W[2]	147	# pextrq \$1, $xmmT2, %r8 #SSE4.1 insn
155	movq %rsi, %xmm0	148	# movhpd $xmmT2, %r8 #can only move to mem, not to reg
156	movq %r8, $xmmT1	149	shufps \$0x0e, $xmmT2, $xmmT2
157	punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])	150	movq $xmmT2, %r8
158	# movaps %xmm0, $xmmT1 # add RCONST, spill to stack	151
159	# paddd $xmmRCONST, $xmmT1	152	movups 16*1(%rdi), %xmm1
160	# movups $xmmT1, -64+16*0(%rsp)	153	pshufb $xmmT1, %xmm1
161		154	movaps %xmm1, $xmmT2
162	movq 4*4(%rdi), %r9	155	paddd $xmmRCONST, $xmmT2
163	movq 4*6(%rdi), %r10	156	movq $xmmT2, %r9
164	bswapq %r9	157	shufps \$0x0e, $xmmT2, $xmmT2
165	bswapq %r10	158	movq $xmmT2, %r10
166	rolq \$32, %r9 # r9 = W[5]:W[4]	159
167	rolq \$32, %r10 # r10 = W[7]:W[6]	160	movups 16*2(%rdi), %xmm2
168	movq %r9, %xmm1	161	pshufb $xmmT1, %xmm2
169	movq %r10, $xmmT1	162	movaps %xmm2, $xmmT2
170	punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])	163	paddd $xmmRCONST, $xmmT2
171		164	movq $xmmT2, %r11
172	movq 4*8(%rdi), %r11	165	shufps \$0x0e, $xmmT2, $xmmT2
173	movq 4*10(%rdi), %r12	166	movq $xmmT2, %r12
174	bswapq %r11	167
175	bswapq %r12	168	movups 16*3(%rdi), %xmm3
176	rolq \$32, %r11 # r11 = W[9]:W[8]	169	pshufb $xmmT1, %xmm3
177	rolq \$32, %r12 # r12 = W[11]:W[10]	170	movaps %xmm3, $xmmT2
178	movq %r11, %xmm2	171	paddd $xmmRCONST, $xmmT2
179	movq %r12, $xmmT1	172	movq $xmmT2, %r13
180	punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])	173	shufps \$0x0e, $xmmT2, $xmmT2
181		174	movq $xmmT2, %r14
182	movq 4*12(%rdi), %r13	175
183	movq 4*14(%rdi), %r14	176	# MOVQs to GPRs (above) have somewhat high latency.
184	bswapq %r13	177	# Load hash[] while they are completing:
185	bswapq %r14	178	movl 80(%rdi), %eax # a = ctx->hash[0]
186	rolq \$32, %r13 # r13 = W[13]:W[12]	179	movl 84(%rdi), %ebx # b = ctx->hash[1]
187	rolq \$32, %r14 # r14 = W[15]:W[14]	180	movl 88(%rdi), %ecx # c = ctx->hash[2]
188	movq %r13, %xmm3	181	movl 92(%rdi), %edx # d = ctx->hash[3]
189	movq %r14, $xmmT1	182	movl 96(%rdi), %ebp # e = ctx->hash[4]
190	punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
191	"	183	"
192		184
193	PREP() {	185	PREP() {
@@ -266,15 +258,15 @@ local rN=$((7+n0/2))
266	echo "	258	echo "
267	# $n	259	# $n
268	";test $n0 = 0 && echo "	260	";test $n0 = 0 && echo "
269	leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]	261	addl %esi, %e$e # e += RCONST + W[n]
270	shrq \$32, %rsi	262	shrq \$32, %rsi
271	";test $n0 = 1 && echo "	263	";test $n0 = 1 && echo "
272	leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]	264	addl %esi, %e$e # e += RCONST + W[n]
273	";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "	265	";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "
274	leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]	266	addl %r${rN}d, %e$e # e += RCONST + W[n]
275	shrq \$32, %r$rN	267	shrq \$32, %r$rN
276	";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "	268	";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "
277	leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]	269	addl %r${rN}d, %e$e # e += RCONST + W[n]
278	";echo "	270	";echo "
279	movl %e$c, %edi # c	271	movl %e$c, %edi # c
280	xorl %e$d, %edi # ^d	272	xorl %e$d, %edi # ^d
@@ -440,6 +432,11 @@ echo "
440	ret	432	ret
441	.size sha1_process_block64, .-sha1_process_block64	433	.size sha1_process_block64, .-sha1_process_block64
442		434
		435	.section .rodata.cst16.bswap32_mask, \"aM\", @progbits, 16
		436	.balign 16
		437	bswap32_mask:
		438	.octa 0x0c0d0e0f08090a0b0405060700010203
		439
443	.section .rodata.cst16.sha1const, \"aM\", @progbits, 16	440	.section .rodata.cst16.sha1const, \"aM\", @progbits, 16
444	.balign 16	441	.balign 16
445	sha1const:	442	sha1const: