Diffstat (limited to 'libbb/hash_md5_sha_x86-64.S.sh')
 -rwxr-xr-x  libbb/hash_md5_sha_x86-64.S.sh | 133
 1 file changed, 80 insertions(+), 53 deletions(-)
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index f34e6e6fa..57e77b118 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -99,6 +99,30 @@ INTERLEAVE() {
 	)
 }
 
+# movaps bswap32_mask(%rip), $xmmT1
+# Load W[] to xmm0..3, byteswapping on the fly.
+# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
+# for use in RD1As instead of spilling them to stack.
+# (We use rsi instead of rN because this makes two
+# ADDs in two first RD1As shorter by one byte).
+# movups	16*0(%rdi), %xmm0
+# pshufb	$xmmT1, %xmm0		#SSSE3 insn
+# movaps	%xmm0, $xmmT2
+# paddd	$xmmRCONST, $xmmT2
+# movq	$xmmT2, %rsi
+# #pextrq	\$1, $xmmT2, %r8	#SSE4.1 insn
+# #movhpd	$xmmT2, %r8		#can only move to mem, not to reg
+# shufps	\$0x0e, $xmmT2, $xmmT2	# have to use two-insn sequence
+# movq	$xmmT2, %r8		# instead
+# ...
+# <repeat for xmm1,2,3>
+# ...
+#- leal	$RCONST(%r$e,%rsi), %e$e	# e += RCONST + W[n]
+#+ addl	%esi, %e$e			# e += RCONST + W[n]
+# ^^^^^^^^^^^^^^^^^^^^^^^^
+# The above is -97 bytes of code...
+# ...but pshufb is a SSSE3 insn. Can't use it.
+
 echo \
 "### Generated by hash_md5_sha_x86-64.S.sh ###
 
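For readers without the SSSE3 reference handy, here is a minimal C model of the rejected pshufb step documented in the comment block above. The .octa constant 0x0c0d0e0f08090a0b0405060700010203 (deleted in the last hunk of this patch), read as 16 little-endian bytes, is the index vector 3,2,1,0, 7,6,5,4, ... so pshufb byte-reverses each 32-bit lane of the message block in one instruction. The function and array names below are illustrative, not part of the patch:

#include <stdint.h>

/* Sketch: what "pshufb bswap32_mask, %xmm0" would compute. */
static void pshufb_bswap32(uint8_t w[16])
{
	static const uint8_t mask[16] = {
		3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
	};
	uint8_t out[16];
	for (int i = 0; i < 16; i++)
		out[i] = w[mask[i]];	/* pshufb: dst[i] = src[mask[i]] */
	for (int i = 0; i < 16; i++)
		w[i] = out[i];
}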
@@ -129,57 +153,65 @@ sha1_process_block64:
 # xmm7: all round constants
 # -64(%rsp): area for passing RCONST + W[] from vector to integer units
 
+	movl	80(%rdi), %eax		# a = ctx->hash[0]
+	movl	84(%rdi), %ebx		# b = ctx->hash[1]
+	movl	88(%rdi), %ecx		# c = ctx->hash[2]
+	movl	92(%rdi), %edx		# d = ctx->hash[3]
+	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+
 	movaps	sha1const(%rip), $xmmALLRCONST
-	movaps	bswap32_mask(%rip), $xmmT1
 	pshufd	\$0x00, $xmmALLRCONST, $xmmRCONST
 
 # Load W[] to xmm0..3, byteswapping on the fly.
 #
-# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14
+# For iterations 0..15, we pass W[] in rsi,r8..r14
 # for use in RD1As instead of spilling them to stack.
+# We lose parallelized addition of RCONST, but LEA
+# can do two additions at once, so it is probably a wash.
 # (We use rsi instead of rN because this makes two
-# ADDs in two first RD1As shorter by one byte).
-	movups	16*0(%rdi), %xmm0
-	pshufb	$xmmT1, %xmm0
-	movaps	%xmm0, $xmmT2
-	paddd	$xmmRCONST, $xmmT2
-	movq	$xmmT2, %rsi
-#	pextrq	\$1, $xmmT2, %r8	#SSE4.1 insn
-#	movhpd	$xmmT2, %r8		#can only move to mem, not to reg
-	shufps	\$0x0e, $xmmT2, $xmmT2
-	movq	$xmmT2, %r8
-
-	movups	16*1(%rdi), %xmm1
-	pshufb	$xmmT1, %xmm1
-	movaps	%xmm1, $xmmT2
-	paddd	$xmmRCONST, $xmmT2
-	movq	$xmmT2, %r9
-	shufps	\$0x0e, $xmmT2, $xmmT2
-	movq	$xmmT2, %r10
-
-	movups	16*2(%rdi), %xmm2
-	pshufb	$xmmT1, %xmm2
-	movaps	%xmm2, $xmmT2
-	paddd	$xmmRCONST, $xmmT2
-	movq	$xmmT2, %r11
-	shufps	\$0x0e, $xmmT2, $xmmT2
-	movq	$xmmT2, %r12
-
-	movups	16*3(%rdi), %xmm3
-	pshufb	$xmmT1, %xmm3
-	movaps	%xmm3, $xmmT2
-	paddd	$xmmRCONST, $xmmT2
-	movq	$xmmT2, %r13
-	shufps	\$0x0e, $xmmT2, $xmmT2
-	movq	$xmmT2, %r14
-
-# MOVQs to GPRs (above) have somewhat high latency.
-# Load hash[] while they are completing:
-	movl	80(%rdi), %eax		# a = ctx->hash[0]
-	movl	84(%rdi), %ebx		# b = ctx->hash[1]
-	movl	88(%rdi), %ecx		# c = ctx->hash[2]
-	movl	92(%rdi), %edx		# d = ctx->hash[3]
-	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+# LEAs in two first RD1As shorter by one byte).
+	movq	4*0(%rdi), %rsi
+	movq	4*2(%rdi), %r8
+	bswapq	%rsi
+	bswapq	%r8
+	rolq	\$32, %rsi		# rsi = W[1]:W[0]
+	rolq	\$32, %r8		# r8  = W[3]:W[2]
+	movq	%rsi, %xmm0
+	movq	%r8, $xmmT1
+	punpcklqdq $xmmT1, %xmm0	# xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
+#	movaps	%xmm0, $xmmT1		# add RCONST, spill to stack
+#	paddd	$xmmRCONST, $xmmT1
+#	movups	$xmmT1, -64+16*0(%rsp)
+
+	movq	4*4(%rdi), %r9
+	movq	4*6(%rdi), %r10
+	bswapq	%r9
+	bswapq	%r10
+	rolq	\$32, %r9		# r9  = W[5]:W[4]
+	rolq	\$32, %r10		# r10 = W[7]:W[6]
+	movq	%r9, %xmm1
+	movq	%r10, $xmmT1
+	punpcklqdq $xmmT1, %xmm1	# xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
+
+	movq	4*8(%rdi), %r11
+	movq	4*10(%rdi), %r12
+	bswapq	%r11
+	bswapq	%r12
+	rolq	\$32, %r11		# r11 = W[9]:W[8]
+	rolq	\$32, %r12		# r12 = W[11]:W[10]
+	movq	%r11, %xmm2
+	movq	%r12, $xmmT1
+	punpcklqdq $xmmT1, %xmm2	# xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
+
+	movq	4*12(%rdi), %r13
+	movq	4*14(%rdi), %r14
+	bswapq	%r13
+	bswapq	%r14
+	rolq	\$32, %r13		# r13 = W[13]:W[12]
+	rolq	\$32, %r14		# r14 = W[15]:W[14]
+	movq	%r13, %xmm3
+	movq	%r14, $xmmT1
+	punpcklqdq $xmmT1, %xmm3	# xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
 "
 
 PREP() {
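The scalar replacement above rests on a small identity: bswapq reverses all eight bytes of a 64-bit register, which byteswaps each packed big-endian 32-bit word but also exchanges their positions, and rolq $32 exchanges them back. A minimal C model of one such load pair, matching the register comments in the hunk; the function name is illustrative and __builtin_bswap64 assumes a GCC/Clang-style compiler:

#include <stdint.h>
#include <string.h>

/* p points at two consecutive big-endian 32-bit message words.
 * Returns them host-order as W[n+1]:W[n] (high:low). */
static uint64_t load_w_pair(const uint8_t *p)
{
	uint64_t x;
	memcpy(&x, p, 8);		/* movq   4*n(%rdi), %reg         */
	x = __builtin_bswap64(x);	/* bswapq %reg: swaps each word's
					   bytes AND the words' order    */
	return (x << 32) | (x >> 32);	/* rolq $32: swap halves back     */
}

The two movq-to-xmm transfers plus punpcklqdq then concatenate two such pairs, leaving (W[0],W[1],W[2],W[3]) in xmm0 without needing the SSSE3 pshufb.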
@@ -258,15 +290,15 @@ local rN=$((7+n0/2))
 echo "
 # $n
 ";test $n0 = 0 && echo "
-	addl	%esi, %e$e		# e += RCONST + W[n]
+	leal	$RCONST(%r$e,%rsi), %e$e	# e += RCONST + W[n]
 	shrq	\$32, %rsi
 ";test $n0 = 1 && echo "
-	addl	%esi, %e$e		# e += RCONST + W[n]
+	leal	$RCONST(%r$e,%rsi), %e$e	# e += RCONST + W[n]
 ";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "
-	addl	%r${rN}d, %e$e		# e += RCONST + W[n]
+	leal	$RCONST(%r$e,%r$rN), %e$e	# e += RCONST + W[n]
 	shrq	\$32, %r$rN
 ";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "
-	addl	%r${rN}d, %e$e		# e += RCONST + W[n]
+	leal	$RCONST(%r$e,%r$rN), %e$e	# e += RCONST + W[n]
 ";echo "
 	movl	%e$c, %edi		# c
 	xorl	%e$d, %edi		# ^d
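The addl-to-leal change here is the scalar side of the same trade: W[n] now reaches the GPRs without RCONST pre-added, and LEA's disp32 + base + index address arithmetic folds both additions into a single instruction, as the "two additions at once" comment promises. A hedged C sketch of the two shapes (function names are hypothetical; the rest of the round arithmetic is emitted by surrounding script code and is unchanged):

#include <stdint.h>

/* Old shape: paddd pre-added RCONST in the vector unit, so the
 * round needs only a two-operand add: addl %esi, %e$e. */
static uint32_t rd1a_e_old(uint32_t e, uint32_t rconst_plus_w)
{
	return e + rconst_plus_w;
}

/* New shape: leal $RCONST(%r$e,%rsi), %e$e computes
 * disp + base + index, i.e. both additions in one insn. */
static uint32_t rd1a_e_new(uint32_t e, uint32_t w, uint32_t rconst)
{
	return rconst + e + w;
}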
@@ -432,11 +464,6 @@ echo "
 	ret
 	.size	sha1_process_block64, .-sha1_process_block64
 
-.section .rodata.cst16.bswap32_mask, \"aM\", @progbits, 16
-.balign	16
-bswap32_mask:
-.octa	0x0c0d0e0f08090a0b0405060700010203
-
 .section .rodata.cst16.sha1const, \"aM\", @progbits, 16
 .balign	16
 sha1const: