From 6a6c1c0ea91edeeb18736190feb5a7278d3d1141 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 9 Feb 2022 11:29:23 +0100 Subject: whitespace fix Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 6 +++--- libbb/hash_md5_sha256_x86-64_shaNI.S | 6 +++--- libbb/hash_md5_sha_x86-32_shaNI.S | 4 ++-- libbb/hash_md5_sha_x86-64_shaNI.S | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index aa68193bd..413e2df9e 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -250,7 +250,7 @@ sha256_process_block64_shaNI: .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI .section .rodata.cst256.K256, "aM", @progbits, 256 - .balign 16 + .balign 16 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 @@ -270,8 +270,8 @@ K256: .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 - .balign 16 + .balign 16 PSHUFFLE_BSWAP32_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 + .octa 0x0c0d0e0f08090a0b0405060700010203 #endif diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index 4663f750a..c246762aa 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -257,7 +257,7 @@ sha256_process_block64_shaNI: .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI .section .rodata.cst256.K256, "aM", @progbits, 256 - .balign 16 + .balign 16 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 @@ -277,8 +277,8 @@ K256: .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 - .balign 16 + .balign 16 PSHUFFLE_BSWAP32_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 + .octa 0x0c0d0e0f08090a0b0405060700010203 #endif diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index a61b3cbed..afca98a62 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -219,8 +219,8 @@ sha1_process_block64_shaNI: .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 - .balign 16 + .balign 16 PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f + .octa 0x000102030405060708090a0b0c0d0e0f #endif diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index b32029360..54d122788 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -217,8 +217,8 @@ sha1_process_block64_shaNI: .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 - .balign 16 + .balign 16 PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f + .octa 0x000102030405060708090a0b0c0d0e0f #endif -- cgit v1.2.3-55-g6feb From 6f56fa17131b3cbb84e887c6c5fb202f2492169e Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 10 Feb 2022 15:38:10 +0100 Subject: libbb/sha: improve comments Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 18 +++++++++--------- libbb/hash_md5_sha256_x86-64_shaNI.S | 19 +++++++++---------- libbb/hash_md5_sha_x86-32_shaNI.S | 2 +- libbb/hash_md5_sha_x86-64_shaNI.S | 2 +- 4 files changed, 20 insertions(+), 21 deletions(-) diff 
--git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 413e2df9e..4b33449d4 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -4,7 +4,7 @@ // We use shorter insns, even though they are for "wrong" // data type (fp, not int). // For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA1 insns). +// (CPUs which do have such penalty do not support SHA insns). // For AMD, the penalty is one extra cycle // (allegedly: I failed to find measurable difference). @@ -39,12 +39,13 @@ .balign 8 # allow decoders to fetch at least 2 first insns sha256_process_block64_shaNI: - movu128 76+0*16(%eax), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ - movu128 76+1*16(%eax), STATE1 /* HGFE */ + movu128 76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */ + movu128 76+1*16(%eax), STATE1 /* EFGH */ /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ mova128 STATE1, STATE0 - shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ - shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ + /* --- -------------- ABCD -- EFGH */ + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ /* XMMTMP holds flip mask from here... */ mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP @@ -232,12 +233,11 @@ sha256_process_block64_shaNI: sha256rnds2 STATE1, STATE0 /* Write hash values back in the correct order */ - /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */ - /* STATE1: CDGH */ mova128 STATE0, XMMTMP /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ - shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ + /* --- -------------- HGDC -- FEBA */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ /* add current hash values to previous ones */ movu128 76+1*16(%eax), STATE1 paddd XMMTMP, STATE1 diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index c246762aa..5ed80c2ef 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -4,7 +4,7 @@ // We use shorter insns, even though they are for "wrong" // data type (fp, not int). // For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA1 insns). +// (CPUs which do have such penalty do not support SHA insns). // For AMD, the penalty is one extra cycle // (allegedly: I failed to find measurable difference). @@ -42,12 +42,13 @@ .balign 8 # allow decoders to fetch at least 2 first insns sha256_process_block64_shaNI: - movu128 80+0*16(%rdi), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ - movu128 80+1*16(%rdi), STATE1 /* HGFE */ + movu128 80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */ + movu128 80+1*16(%rdi), STATE1 /* EFGH */ /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ mova128 STATE1, STATE0 - shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ - shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ + /* --- -------------- ABCD -- EFGH */ + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ /* XMMTMP holds flip mask from here... 
*/ mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP @@ -243,13 +244,11 @@ sha256_process_block64_shaNI: paddd CDGH_SAVE, STATE1 /* Write hash values back in the correct order */ - /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */ - /* STATE1: CDGH */ mova128 STATE0, XMMTMP /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ - shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ - + /* --- -------------- HGDC -- FEBA */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ movu128 STATE0, 80+0*16(%rdi) movu128 XMMTMP, 80+1*16(%rdi) diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index afca98a62..c7fb243ce 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -4,7 +4,7 @@ // We use shorter insns, even though they are for "wrong" // data type (fp, not int). // For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA1 insns). +// (CPUs which do have such penalty do not support SHA insns). // For AMD, the penalty is one extra cycle // (allegedly: I failed to find measurable difference). diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index 54d122788..c13cdec07 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -4,7 +4,7 @@ // We use shorter insns, even though they are for "wrong" // data type (fp, not int). // For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA1 insns). +// (CPUs which do have such penalty do not support SHA insns). // For AMD, the penalty is one extra cycle // (allegedly: I failed to find measurable difference). -- cgit v1.2.3-55-g6feb From 8154146be491bc66ab34d5d5f2a2466ddbdcff52 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 11 Feb 2022 06:08:27 +0100 Subject: libbb/sha1: shrink unrolled x86-64 code function old new delta sha1_process_block64 3481 3384 -97 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-64.S | 129 ++++++++++++++++++++--------------------- libbb/hash_md5_sha_x86-64.S.sh | 111 +++++++++++++++++------------------ 2 files changed, 117 insertions(+), 123 deletions(-) diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 287cfe547..51fde082a 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -27,68 +27,60 @@ sha1_process_block64: # xmm7: all round constants # -64(%rsp): area for passing RCONST + W[] from vector to integer units - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] - movaps sha1const(%rip), %xmm7 + movaps bswap32_mask(%rip), %xmm4 pshufd $0x00, %xmm7, %xmm6 - # Load W[] to xmm registers, byteswapping on the fly. + # Load W[] to xmm0..3, byteswapping on the fly. # - # For iterations 0..15, we pass W[] in rsi,r8..r14 + # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 # for use in RD1As instead of spilling them to stack. - # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it is probably a wash. # (We use rsi instead of rN because this makes two - # LEAs in two first RD1As shorter by one byte). 
- movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r8 - bswapq %rsi - bswapq %r8 - rolq $32, %rsi # rsi = W[1]:W[0] - rolq $32, %r8 # r8 = W[3]:W[2] - movq %rsi, %xmm0 - movq %r8, %xmm4 - punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) -# movaps %xmm0, %xmm4 # add RCONST, spill to stack -# paddd %xmm6, %xmm4 -# movups %xmm4, -64+16*0(%rsp) + # ADDs in two first RD1As shorter by one byte). + movups 16*0(%rdi), %xmm0 + pshufb %xmm4, %xmm0 + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movq %xmm5, %rsi +# pextrq $1, %xmm5, %r8 #SSE4.1 insn +# movhpd %xmm5, %r8 #can only move to mem, not to reg + shufps $0x0e, %xmm5, %xmm5 + movq %xmm5, %r8 + + movups 16*1(%rdi), %xmm1 + pshufb %xmm4, %xmm1 + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movq %xmm5, %r9 + shufps $0x0e, %xmm5, %xmm5 + movq %xmm5, %r10 - movq 4*4(%rdi), %r9 - movq 4*6(%rdi), %r10 - bswapq %r9 - bswapq %r10 - rolq $32, %r9 # r9 = W[5]:W[4] - rolq $32, %r10 # r10 = W[7]:W[6] - movq %r9, %xmm1 - movq %r10, %xmm4 - punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) + movups 16*2(%rdi), %xmm2 + pshufb %xmm4, %xmm2 + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movq %xmm5, %r11 + shufps $0x0e, %xmm5, %xmm5 + movq %xmm5, %r12 - movq 4*8(%rdi), %r11 - movq 4*10(%rdi), %r12 - bswapq %r11 - bswapq %r12 - rolq $32, %r11 # r11 = W[9]:W[8] - rolq $32, %r12 # r12 = W[11]:W[10] - movq %r11, %xmm2 - movq %r12, %xmm4 - punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) + movups 16*3(%rdi), %xmm3 + pshufb %xmm4, %xmm3 + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movq %xmm5, %r13 + shufps $0x0e, %xmm5, %xmm5 + movq %xmm5, %r14 - movq 4*12(%rdi), %r13 - movq 4*14(%rdi), %r14 - bswapq %r13 - bswapq %r14 - rolq $32, %r13 # r13 = W[13]:W[12] - rolq $32, %r14 # r14 = W[15]:W[14] - movq %r13, %xmm3 - movq %r14, %xmm4 - punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) + # MOVQs to GPRs (above) have somewhat high latency. 
+ # Load hash[] while they are completing: + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] # 0 - leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] + addl %esi, %ebp # e += RCONST + W[n] shrq $32, %rsi movl %ecx, %edi # c xorl %edx, %edi # ^d @@ -100,7 +92,7 @@ sha1_process_block64: addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 1 - leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] + addl %esi, %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b @@ -111,7 +103,7 @@ sha1_process_block64: addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 2 - leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] + addl %r8d, %ecx # e += RCONST + W[n] shrq $32, %r8 movl %eax, %edi # c xorl %ebx, %edi # ^d @@ -123,7 +115,7 @@ sha1_process_block64: addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 3 - leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] + addl %r8d, %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b @@ -134,7 +126,7 @@ sha1_process_block64: addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 4 - leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] + addl %r9d, %eax # e += RCONST + W[n] shrq $32, %r9 movl %edx, %edi # c xorl %ebp, %edi # ^d @@ -146,7 +138,7 @@ sha1_process_block64: addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 5 - leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] + addl %r9d, %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b @@ -157,7 +149,7 @@ sha1_process_block64: addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 6 - leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] + addl %r10d, %edx # e += RCONST + W[n] shrq $32, %r10 movl %ebx, %edi # c xorl %ecx, %edi # ^d @@ -169,7 +161,7 @@ sha1_process_block64: addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 7 - leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] + addl %r10d, %ecx # e += RCONST + W[n] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b @@ -210,7 +202,7 @@ sha1_process_block64: paddd %xmm6, %xmm5 movups %xmm5, -64+16*0(%rsp) # 8 - leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] + addl %r11d, %ebx # e += RCONST + W[n] shrq $32, %r11 movl %ebp, %edi # c xorl %eax, %edi # ^d @@ -222,7 +214,7 @@ sha1_process_block64: addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 9 - leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] + addl %r11d, %eax # e += RCONST + W[n] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b @@ -233,7 +225,7 @@ sha1_process_block64: addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 10 - leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] + addl %r12d, %ebp # e += RCONST + W[n] shrq $32, %r12 movl %ecx, %edi # c xorl %edx, %edi # ^d @@ -245,7 +237,7 @@ sha1_process_block64: addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 11 - leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] + addl %r12d, %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b @@ -287,7 +279,7 @@ sha1_process_block64: paddd %xmm6, %xmm5 movups %xmm5, -64+16*1(%rsp) # 12 - leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] + addl %r13d, %ecx # e += RCONST + W[n] shrq $32, %r13 movl %eax, 
%edi # c xorl %ebx, %edi # ^d @@ -299,7 +291,7 @@ sha1_process_block64: addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 13 - leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] + addl %r13d, %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b @@ -310,7 +302,7 @@ sha1_process_block64: addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 14 - leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] + addl %r14d, %eax # e += RCONST + W[n] shrq $32, %r14 movl %edx, %edi # c xorl %ebp, %edi # ^d @@ -322,7 +314,7 @@ sha1_process_block64: addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 15 - leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] + addl %r14d, %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b @@ -1475,6 +1467,11 @@ sha1_process_block64: ret .size sha1_process_block64, .-sha1_process_block64 + .section .rodata.cst16.bswap32_mask, "aM", @progbits, 16 + .balign 16 +bswap32_mask: + .octa 0x0c0d0e0f08090a0b0405060700010203 + .section .rodata.cst16.sha1const, "aM", @progbits, 16 .balign 16 sha1const: diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index a10ac411d..f34e6e6fa 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -129,65 +129,57 @@ sha1_process_block64: # xmm7: all round constants # -64(%rsp): area for passing RCONST + W[] from vector to integer units - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] - movaps sha1const(%rip), $xmmALLRCONST + movaps bswap32_mask(%rip), $xmmT1 pshufd \$0x00, $xmmALLRCONST, $xmmRCONST - # Load W[] to xmm registers, byteswapping on the fly. + # Load W[] to xmm0..3, byteswapping on the fly. # - # For iterations 0..15, we pass W[] in rsi,r8..r14 + # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 # for use in RD1As instead of spilling them to stack. - # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it is probably a wash. # (We use rsi instead of rN because this makes two - # LEAs in two first RD1As shorter by one byte). - movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r8 - bswapq %rsi - bswapq %r8 - rolq \$32, %rsi # rsi = W[1]:W[0] - rolq \$32, %r8 # r8 = W[3]:W[2] - movq %rsi, %xmm0 - movq %r8, $xmmT1 - punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) -# movaps %xmm0, $xmmT1 # add RCONST, spill to stack -# paddd $xmmRCONST, $xmmT1 -# movups $xmmT1, -64+16*0(%rsp) - - movq 4*4(%rdi), %r9 - movq 4*6(%rdi), %r10 - bswapq %r9 - bswapq %r10 - rolq \$32, %r9 # r9 = W[5]:W[4] - rolq \$32, %r10 # r10 = W[7]:W[6] - movq %r9, %xmm1 - movq %r10, $xmmT1 - punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - - movq 4*8(%rdi), %r11 - movq 4*10(%rdi), %r12 - bswapq %r11 - bswapq %r12 - rolq \$32, %r11 # r11 = W[9]:W[8] - rolq \$32, %r12 # r12 = W[11]:W[10] - movq %r11, %xmm2 - movq %r12, $xmmT1 - punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - - movq 4*12(%rdi), %r13 - movq 4*14(%rdi), %r14 - bswapq %r13 - bswapq %r14 - rolq \$32, %r13 # r13 = W[13]:W[12] - rolq \$32, %r14 # r14 = W[15]:W[14] - movq %r13, %xmm3 - movq %r14, $xmmT1 - punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) + # ADDs in two first RD1As shorter by one byte). 
+ movups 16*0(%rdi), %xmm0 + pshufb $xmmT1, %xmm0 + movaps %xmm0, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movq $xmmT2, %rsi +# pextrq \$1, $xmmT2, %r8 #SSE4.1 insn +# movhpd $xmmT2, %r8 #can only move to mem, not to reg + shufps \$0x0e, $xmmT2, $xmmT2 + movq $xmmT2, %r8 + + movups 16*1(%rdi), %xmm1 + pshufb $xmmT1, %xmm1 + movaps %xmm1, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movq $xmmT2, %r9 + shufps \$0x0e, $xmmT2, $xmmT2 + movq $xmmT2, %r10 + + movups 16*2(%rdi), %xmm2 + pshufb $xmmT1, %xmm2 + movaps %xmm2, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movq $xmmT2, %r11 + shufps \$0x0e, $xmmT2, $xmmT2 + movq $xmmT2, %r12 + + movups 16*3(%rdi), %xmm3 + pshufb $xmmT1, %xmm3 + movaps %xmm3, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movq $xmmT2, %r13 + shufps \$0x0e, $xmmT2, $xmmT2 + movq $xmmT2, %r14 + + # MOVQs to GPRs (above) have somewhat high latency. + # Load hash[] while they are completing: + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] " PREP() { @@ -266,15 +258,15 @@ local rN=$((7+n0/2)) echo " # $n ";test $n0 = 0 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] + addl %esi, %e$e # e += RCONST + W[n] shrq \$32, %rsi ";test $n0 = 1 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] + addl %esi, %e$e # e += RCONST + W[n] ";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " - leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] + addl %r${rN}d, %e$e # e += RCONST + W[n] shrq \$32, %r$rN ";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " - leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] + addl %r${rN}d, %e$e # e += RCONST + W[n] ";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d @@ -440,6 +432,11 @@ echo " ret .size sha1_process_block64, .-sha1_process_block64 + .section .rodata.cst16.bswap32_mask, \"aM\", @progbits, 16 + .balign 16 +bswap32_mask: + .octa 0x0c0d0e0f08090a0b0405060700010203 + .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 .balign 16 sha1const: -- cgit v1.2.3-55-g6feb From dda77e83762861b52d62f0f161e2b4bf8092eacf Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 11 Feb 2022 14:53:26 +0100 Subject: libbb/sha1: revert last commit: pshufb is a SSSE3 insn, can't use it Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 4 ++ libbb/hash_md5_sha256_x86-64_shaNI.S | 4 ++ libbb/hash_md5_sha_x86-32_shaNI.S | 5 ++ libbb/hash_md5_sha_x86-64.S | 127 +++++++++++++++++---------------- libbb/hash_md5_sha_x86-64.S.sh | 133 +++++++++++++++++++++-------------- libbb/hash_md5_sha_x86-64_shaNI.S | 5 ++ 6 files changed, 163 insertions(+), 115 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 4b33449d4..c059fb18d 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -15,6 +15,10 @@ //#define shuf128_32 pshufd #define shuf128_32 shufps +// pshufb and palignr are SSSE3 insns. +// We do not check SSSE3 in cpuid, +// all SHA-capable CPUs support it as well. 
+ .section .text.sha256_process_block64_shaNI, "ax", @progbits .globl sha256_process_block64_shaNI .hidden sha256_process_block64_shaNI diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index 5ed80c2ef..9578441f8 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -15,6 +15,10 @@ //#define shuf128_32 pshufd #define shuf128_32 shufps +// pshufb and palignr are SSSE3 insns. +// We do not check SSSE3 in cpuid, +// all SHA-capable CPUs support it as well. + .section .text.sha256_process_block64_shaNI, "ax", @progbits .globl sha256_process_block64_shaNI .hidden sha256_process_block64_shaNI diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index c7fb243ce..2366b046a 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -20,6 +20,11 @@ #define extr128_32 pextrd //#define extr128_32 extractps # not shorter +// pshufb is a SSSE3 insn. +// pinsrd, pextrd, extractps are SSE4.1 insns. +// We do not check SSSE3/SSE4.1 in cpuid, +// all SHA-capable CPUs support them as well. + .section .text.sha1_process_block64_shaNI, "ax", @progbits .globl sha1_process_block64_shaNI .hidden sha1_process_block64_shaNI diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 51fde082a..f0daa30f6 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -27,60 +27,68 @@ sha1_process_block64: # xmm7: all round constants # -64(%rsp): area for passing RCONST + W[] from vector to integer units + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] + movaps sha1const(%rip), %xmm7 - movaps bswap32_mask(%rip), %xmm4 pshufd $0x00, %xmm7, %xmm6 # Load W[] to xmm0..3, byteswapping on the fly. # - # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 + # For iterations 0..15, we pass W[] in rsi,r8..r14 # for use in RD1As instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it is probably a wash. # (We use rsi instead of rN because this makes two - # ADDs in two first RD1As shorter by one byte). - movups 16*0(%rdi), %xmm0 - pshufb %xmm4, %xmm0 - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movq %xmm5, %rsi -# pextrq $1, %xmm5, %r8 #SSE4.1 insn -# movhpd %xmm5, %r8 #can only move to mem, not to reg - shufps $0x0e, %xmm5, %xmm5 - movq %xmm5, %r8 - - movups 16*1(%rdi), %xmm1 - pshufb %xmm4, %xmm1 - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movq %xmm5, %r9 - shufps $0x0e, %xmm5, %xmm5 - movq %xmm5, %r10 + # LEAs in two first RD1As shorter by one byte). 
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r8 + bswapq %rsi + bswapq %r8 + rolq $32, %rsi # rsi = W[1]:W[0] + rolq $32, %r8 # r8 = W[3]:W[2] + movq %rsi, %xmm0 + movq %r8, %xmm4 + punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, %xmm4 # add RCONST, spill to stack +# paddd %xmm6, %xmm4 +# movups %xmm4, -64+16*0(%rsp) - movups 16*2(%rdi), %xmm2 - pshufb %xmm4, %xmm2 - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movq %xmm5, %r11 - shufps $0x0e, %xmm5, %xmm5 - movq %xmm5, %r12 + movq 4*4(%rdi), %r9 + movq 4*6(%rdi), %r10 + bswapq %r9 + bswapq %r10 + rolq $32, %r9 # r9 = W[5]:W[4] + rolq $32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 + movq %r10, %xmm4 + punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - movups 16*3(%rdi), %xmm3 - pshufb %xmm4, %xmm3 - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movq %xmm5, %r13 - shufps $0x0e, %xmm5, %xmm5 - movq %xmm5, %r14 + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq $32, %r11 # r11 = W[9]:W[8] + rolq $32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, %xmm4 + punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - # MOVQs to GPRs (above) have somewhat high latency. - # Load hash[] while they are completing: - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] + movq 4*12(%rdi), %r13 + movq 4*14(%rdi), %r14 + bswapq %r13 + bswapq %r14 + rolq $32, %r13 # r13 = W[13]:W[12] + rolq $32, %r14 # r14 = W[15]:W[14] + movq %r13, %xmm3 + movq %r14, %xmm4 + punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) # 0 - addl %esi, %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] shrq $32, %rsi movl %ecx, %edi # c xorl %edx, %edi # ^d @@ -92,7 +100,7 @@ sha1_process_block64: addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 1 - addl %esi, %edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b @@ -103,7 +111,7 @@ sha1_process_block64: addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 2 - addl %r8d, %ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] shrq $32, %r8 movl %eax, %edi # c xorl %ebx, %edi # ^d @@ -115,7 +123,7 @@ sha1_process_block64: addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 3 - addl %r8d, %ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b @@ -126,7 +134,7 @@ sha1_process_block64: addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 4 - addl %r9d, %eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] shrq $32, %r9 movl %edx, %edi # c xorl %ebp, %edi # ^d @@ -138,7 +146,7 @@ sha1_process_block64: addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 5 - addl %r9d, %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b @@ -149,7 +157,7 @@ sha1_process_block64: addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 6 - addl %r10d, %edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] shrq $32, %r10 movl %ebx, %edi # c xorl %ecx, %edi # ^d @@ -161,7 +169,7 @@ sha1_process_block64: addl %edi, %edx # e += 
rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 7 - addl %r10d, %ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b @@ -202,7 +210,7 @@ sha1_process_block64: paddd %xmm6, %xmm5 movups %xmm5, -64+16*0(%rsp) # 8 - addl %r11d, %ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] shrq $32, %r11 movl %ebp, %edi # c xorl %eax, %edi # ^d @@ -214,7 +222,7 @@ sha1_process_block64: addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 9 - addl %r11d, %eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b @@ -225,7 +233,7 @@ sha1_process_block64: addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 10 - addl %r12d, %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] shrq $32, %r12 movl %ecx, %edi # c xorl %edx, %edi # ^d @@ -237,7 +245,7 @@ sha1_process_block64: addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 11 - addl %r12d, %edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b @@ -279,7 +287,7 @@ sha1_process_block64: paddd %xmm6, %xmm5 movups %xmm5, -64+16*1(%rsp) # 12 - addl %r13d, %ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] shrq $32, %r13 movl %eax, %edi # c xorl %ebx, %edi # ^d @@ -291,7 +299,7 @@ sha1_process_block64: addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 13 - addl %r13d, %ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b @@ -302,7 +310,7 @@ sha1_process_block64: addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 14 - addl %r14d, %eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] shrq $32, %r14 movl %edx, %edi # c xorl %ebp, %edi # ^d @@ -314,7 +322,7 @@ sha1_process_block64: addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 15 - addl %r14d, %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b @@ -1467,11 +1475,6 @@ sha1_process_block64: ret .size sha1_process_block64, .-sha1_process_block64 - .section .rodata.cst16.bswap32_mask, "aM", @progbits, 16 - .balign 16 -bswap32_mask: - .octa 0x0c0d0e0f08090a0b0405060700010203 - .section .rodata.cst16.sha1const, "aM", @progbits, 16 .balign 16 sha1const: diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index f34e6e6fa..57e77b118 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -99,6 +99,30 @@ INTERLEAVE() { ) } +# movaps bswap32_mask(%rip), $xmmT1 +# Load W[] to xmm0..3, byteswapping on the fly. +# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 +# for use in RD1As instead of spilling them to stack. +# (We use rsi instead of rN because this makes two +# ADDs in two first RD1As shorter by one byte). +# movups 16*0(%rdi), %xmm0 +# pshufb $xmmT1, %xmm0 #SSSE3 insn +# movaps %xmm0, $xmmT2 +# paddd $xmmRCONST, $xmmT2 +# movq $xmmT2, %rsi +# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn +# #movhpd $xmmT2, %r8 #can only move to mem, not to reg +# shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence +# movq $xmmT2, %r8 # instead +# ... +# +# ... 
+#- leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] +#+ addl %esi, %e$e # e += RCONST + W[n] +# ^^^^^^^^^^^^^^^^^^^^^^^^ +# The above is -97 bytes of code... +# ...but pshufb is a SSSE3 insn. Can't use it. + echo \ "### Generated by hash_md5_sha_x86-64.S.sh ### @@ -129,57 +153,65 @@ sha1_process_block64: # xmm7: all round constants # -64(%rsp): area for passing RCONST + W[] from vector to integer units + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] + movaps sha1const(%rip), $xmmALLRCONST - movaps bswap32_mask(%rip), $xmmT1 pshufd \$0x00, $xmmALLRCONST, $xmmRCONST # Load W[] to xmm0..3, byteswapping on the fly. # - # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 + # For iterations 0..15, we pass W[] in rsi,r8..r14 # for use in RD1As instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it is probably a wash. # (We use rsi instead of rN because this makes two - # ADDs in two first RD1As shorter by one byte). - movups 16*0(%rdi), %xmm0 - pshufb $xmmT1, %xmm0 - movaps %xmm0, $xmmT2 - paddd $xmmRCONST, $xmmT2 - movq $xmmT2, %rsi -# pextrq \$1, $xmmT2, %r8 #SSE4.1 insn -# movhpd $xmmT2, %r8 #can only move to mem, not to reg - shufps \$0x0e, $xmmT2, $xmmT2 - movq $xmmT2, %r8 - - movups 16*1(%rdi), %xmm1 - pshufb $xmmT1, %xmm1 - movaps %xmm1, $xmmT2 - paddd $xmmRCONST, $xmmT2 - movq $xmmT2, %r9 - shufps \$0x0e, $xmmT2, $xmmT2 - movq $xmmT2, %r10 - - movups 16*2(%rdi), %xmm2 - pshufb $xmmT1, %xmm2 - movaps %xmm2, $xmmT2 - paddd $xmmRCONST, $xmmT2 - movq $xmmT2, %r11 - shufps \$0x0e, $xmmT2, $xmmT2 - movq $xmmT2, %r12 - - movups 16*3(%rdi), %xmm3 - pshufb $xmmT1, %xmm3 - movaps %xmm3, $xmmT2 - paddd $xmmRCONST, $xmmT2 - movq $xmmT2, %r13 - shufps \$0x0e, $xmmT2, $xmmT2 - movq $xmmT2, %r14 - - # MOVQs to GPRs (above) have somewhat high latency. - # Load hash[] while they are completing: - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] + # LEAs in two first RD1As shorter by one byte). 
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r8 + bswapq %rsi + bswapq %r8 + rolq \$32, %rsi # rsi = W[1]:W[0] + rolq \$32, %r8 # r8 = W[3]:W[2] + movq %rsi, %xmm0 + movq %r8, $xmmT1 + punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, $xmmT1 # add RCONST, spill to stack +# paddd $xmmRCONST, $xmmT1 +# movups $xmmT1, -64+16*0(%rsp) + + movq 4*4(%rdi), %r9 + movq 4*6(%rdi), %r10 + bswapq %r9 + bswapq %r10 + rolq \$32, %r9 # r9 = W[5]:W[4] + rolq \$32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 + movq %r10, $xmmT1 + punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) + + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq \$32, %r11 # r11 = W[9]:W[8] + rolq \$32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, $xmmT1 + punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) + + movq 4*12(%rdi), %r13 + movq 4*14(%rdi), %r14 + bswapq %r13 + bswapq %r14 + rolq \$32, %r13 # r13 = W[13]:W[12] + rolq \$32, %r14 # r14 = W[15]:W[14] + movq %r13, %xmm3 + movq %r14, $xmmT1 + punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) " PREP() { @@ -258,15 +290,15 @@ local rN=$((7+n0/2)) echo " # $n ";test $n0 = 0 && echo " - addl %esi, %e$e # e += RCONST + W[n] + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] shrq \$32, %rsi ";test $n0 = 1 && echo " - addl %esi, %e$e # e += RCONST + W[n] + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] ";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " - addl %r${rN}d, %e$e # e += RCONST + W[n] + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] shrq \$32, %r$rN ";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " - addl %r${rN}d, %e$e # e += RCONST + W[n] + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] ";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d @@ -432,11 +464,6 @@ echo " ret .size sha1_process_block64, .-sha1_process_block64 - .section .rodata.cst16.bswap32_mask, \"aM\", @progbits, 16 - .balign 16 -bswap32_mask: - .octa 0x0c0d0e0f08090a0b0405060700010203 - .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 .balign 16 sha1const: diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index c13cdec07..794e97040 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -20,6 +20,11 @@ #define extr128_32 pextrd //#define extr128_32 extractps # not shorter +// pshufb is a SSSE3 insn. +// pinsrd, pextrd, extractps are SSE4.1 insns. +// We do not check SSSE3/SSE4.1 in cpuid, +// all SHA-capable CPUs support them as well. 
+ .section .text.sha1_process_block64_shaNI, "ax", @progbits .globl sha1_process_block64_shaNI .hidden sha1_process_block64_shaNI -- cgit v1.2.3-55-g6feb From 1f272c06d02e7c7f0f3af1f97165722255c8828d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 11 Feb 2022 23:03:27 +0100 Subject: whitespace fixes Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-64.S | 8 ++++---- libbb/hash_md5_sha_x86-64.S.sh | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index f0daa30f6..1d55b91f8 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -71,8 +71,8 @@ sha1_process_block64: movq 4*10(%rdi), %r12 bswapq %r11 bswapq %r12 - rolq $32, %r11 # r11 = W[9]:W[8] - rolq $32, %r12 # r12 = W[11]:W[10] + rolq $32, %r11 # r11 = W[9]:W[8] + rolq $32, %r12 # r12 = W[11]:W[10] movq %r11, %xmm2 movq %r12, %xmm4 punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) @@ -81,8 +81,8 @@ sha1_process_block64: movq 4*14(%rdi), %r14 bswapq %r13 bswapq %r14 - rolq $32, %r13 # r13 = W[13]:W[12] - rolq $32, %r14 # r14 = W[15]:W[14] + rolq $32, %r13 # r13 = W[13]:W[12] + rolq $32, %r14 # r14 = W[15]:W[14] movq %r13, %xmm3 movq %r14, %xmm4 punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 57e77b118..40c979d35 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -99,7 +99,7 @@ INTERLEAVE() { ) } -# movaps bswap32_mask(%rip), $xmmT1 +# movaps bswap32_mask(%rip), $xmmT1 # Load W[] to xmm0..3, byteswapping on the fly. # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 # for use in RD1As instead of spilling them to stack. @@ -110,8 +110,8 @@ INTERLEAVE() { # movaps %xmm0, $xmmT2 # paddd $xmmRCONST, $xmmT2 # movq $xmmT2, %rsi -# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn -# #movhpd $xmmT2, %r8 #can only move to mem, not to reg +# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn +# #movhpd $xmmT2, %r8 #can only move to mem, not to reg # shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence # movq $xmmT2, %r8 # instead # ... @@ -197,8 +197,8 @@ sha1_process_block64: movq 4*10(%rdi), %r12 bswapq %r11 bswapq %r12 - rolq \$32, %r11 # r11 = W[9]:W[8] - rolq \$32, %r12 # r12 = W[11]:W[10] + rolq \$32, %r11 # r11 = W[9]:W[8] + rolq \$32, %r12 # r12 = W[11]:W[10] movq %r11, %xmm2 movq %r12, $xmmT1 punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) @@ -207,8 +207,8 @@ sha1_process_block64: movq 4*14(%rdi), %r14 bswapq %r13 bswapq %r14 - rolq \$32, %r13 # r13 = W[13]:W[12] - rolq \$32, %r14 # r14 = W[15]:W[14] + rolq \$32, %r13 # r13 = W[13]:W[12] + rolq \$32, %r14 # r14 = W[15]:W[14] movq %r13, %xmm3 movq %r14, $xmmT1 punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) -- cgit v1.2.3-55-g6feb From c2e7780e526b0f421c3b43367a53019d1dc5f2d6 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 12 Feb 2022 00:52:12 +0100 Subject: libbb/sha256: explicitly use sha256rnds2's %xmm0 (MSG) argument Else, the code seemingly does not use MSG. 
Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 64 +++++++++++++++--------------- libbb/hash_md5_sha256_x86-64_shaNI.S | 76 ++++++++++++++++++------------------ 2 files changed, 70 insertions(+), 70 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index c059fb18d..3905bad9a 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -60,18 +60,18 @@ sha256_process_block64_shaNI: pshufb XMMTMP, MSG mova128 MSG, MSGTMP0 paddd 0*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Rounds 4-7 */ movu128 1*16(DATA_PTR), MSG pshufb XMMTMP, MSG mova128 MSG, MSGTMP1 paddd 1*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP1, MSGTMP0 /* Rounds 8-11 */ @@ -79,9 +79,9 @@ sha256_process_block64_shaNI: pshufb XMMTMP, MSG mova128 MSG, MSGTMP2 paddd 2*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP2, MSGTMP1 /* Rounds 12-15 */ @@ -90,151 +90,151 @@ sha256_process_block64_shaNI: /* ...to here */ mova128 MSG, MSGTMP3 paddd 3*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP3, XMMTMP palignr $4, MSGTMP2, XMMTMP paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP3, MSGTMP2 /* Rounds 16-19 */ mova128 MSGTMP0, MSG paddd 4*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP0, XMMTMP palignr $4, MSGTMP3, XMMTMP paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP0, MSGTMP3 /* Rounds 20-23 */ mova128 MSGTMP1, MSG paddd 5*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP1, XMMTMP palignr $4, MSGTMP0, XMMTMP paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP1, MSGTMP0 /* Rounds 24-27 */ mova128 MSGTMP2, MSG paddd 6*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP2, XMMTMP palignr $4, MSGTMP1, XMMTMP paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP2, MSGTMP1 /* Rounds 28-31 */ mova128 MSGTMP3, MSG paddd 7*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP3, XMMTMP palignr $4, MSGTMP2, XMMTMP paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP3, MSGTMP2 /* Rounds 32-35 */ mova128 MSGTMP0, MSG paddd 8*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP0, XMMTMP palignr $4, MSGTMP3, XMMTMP paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, 
STATE1, STATE0 sha256msg1 MSGTMP0, MSGTMP3 /* Rounds 36-39 */ mova128 MSGTMP1, MSG paddd 9*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP1, XMMTMP palignr $4, MSGTMP0, XMMTMP paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP1, MSGTMP0 /* Rounds 40-43 */ mova128 MSGTMP2, MSG paddd 10*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP2, XMMTMP palignr $4, MSGTMP1, XMMTMP paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP2, MSGTMP1 /* Rounds 44-47 */ mova128 MSGTMP3, MSG paddd 11*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP3, XMMTMP palignr $4, MSGTMP2, XMMTMP paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP3, MSGTMP2 /* Rounds 48-51 */ mova128 MSGTMP0, MSG paddd 12*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP0, XMMTMP palignr $4, MSGTMP3, XMMTMP paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP0, MSGTMP3 /* Rounds 52-55 */ mova128 MSGTMP1, MSG paddd 13*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP1, XMMTMP palignr $4, MSGTMP0, XMMTMP paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Rounds 56-59 */ mova128 MSGTMP2, MSG paddd 14*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP2, XMMTMP palignr $4, MSGTMP1, XMMTMP paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Rounds 60-63 */ mova128 MSGTMP3, MSG paddd 15*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Write hash values back in the correct order */ mova128 STATE0, XMMTMP diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index 9578441f8..082ceafe4 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -38,8 +38,8 @@ #define XMMTMP %xmm7 -#define ABEF_SAVE %xmm9 -#define CDGH_SAVE %xmm10 +#define SAVE0 %xmm8 +#define SAVE1 %xmm9 #define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) @@ -59,26 +59,26 @@ sha256_process_block64_shaNI: leaq K256+8*16(%rip), SHA256CONSTANTS /* Save hash values for addition after rounds */ - mova128 STATE0, ABEF_SAVE - mova128 STATE1, CDGH_SAVE + mova128 STATE0, SAVE0 + mova128 STATE1, SAVE1 /* Rounds 0-3 */ movu128 0*16(DATA_PTR), MSG pshufb XMMTMP, MSG mova128 MSG, MSGTMP0 paddd 0*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Rounds 4-7 */ movu128 1*16(DATA_PTR), MSG pshufb XMMTMP, MSG mova128 MSG, MSGTMP1 paddd 1*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 
shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP1, MSGTMP0 /* Rounds 8-11 */ @@ -86,9 +86,9 @@ sha256_process_block64_shaNI: pshufb XMMTMP, MSG mova128 MSG, MSGTMP2 paddd 2*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP2, MSGTMP1 /* Rounds 12-15 */ @@ -97,155 +97,155 @@ sha256_process_block64_shaNI: /* ...to here */ mova128 MSG, MSGTMP3 paddd 3*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP3, XMMTMP palignr $4, MSGTMP2, XMMTMP paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP3, MSGTMP2 /* Rounds 16-19 */ mova128 MSGTMP0, MSG paddd 4*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP0, XMMTMP palignr $4, MSGTMP3, XMMTMP paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP0, MSGTMP3 /* Rounds 20-23 */ mova128 MSGTMP1, MSG paddd 5*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP1, XMMTMP palignr $4, MSGTMP0, XMMTMP paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP1, MSGTMP0 /* Rounds 24-27 */ mova128 MSGTMP2, MSG paddd 6*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP2, XMMTMP palignr $4, MSGTMP1, XMMTMP paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP2, MSGTMP1 /* Rounds 28-31 */ mova128 MSGTMP3, MSG paddd 7*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP3, XMMTMP palignr $4, MSGTMP2, XMMTMP paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP3, MSGTMP2 /* Rounds 32-35 */ mova128 MSGTMP0, MSG paddd 8*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP0, XMMTMP palignr $4, MSGTMP3, XMMTMP paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP0, MSGTMP3 /* Rounds 36-39 */ mova128 MSGTMP1, MSG paddd 9*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP1, XMMTMP palignr $4, MSGTMP0, XMMTMP paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP1, MSGTMP0 /* Rounds 40-43 */ mova128 MSGTMP2, MSG paddd 10*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP2, XMMTMP palignr $4, MSGTMP1, XMMTMP paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP2, MSGTMP1 /* Rounds 44-47 */ mova128 MSGTMP3, MSG paddd 11*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 
MSGTMP3, XMMTMP palignr $4, MSGTMP2, XMMTMP paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP3, MSGTMP2 /* Rounds 48-51 */ mova128 MSGTMP0, MSG paddd 12*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP0, XMMTMP palignr $4, MSGTMP3, XMMTMP paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 sha256msg1 MSGTMP0, MSGTMP3 /* Rounds 52-55 */ mova128 MSGTMP1, MSG paddd 13*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP1, XMMTMP palignr $4, MSGTMP0, XMMTMP paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Rounds 56-59 */ mova128 MSGTMP2, MSG paddd 14*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 mova128 MSGTMP2, XMMTMP palignr $4, MSGTMP1, XMMTMP paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Rounds 60-63 */ mova128 MSGTMP3, MSG paddd 15*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 STATE0, STATE1 + sha256rnds2 MSG, STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG - sha256rnds2 STATE1, STATE0 + sha256rnds2 MSG, STATE1, STATE0 /* Add current hash values with previously saved */ - paddd ABEF_SAVE, STATE0 - paddd CDGH_SAVE, STATE1 + paddd SAVE0, STATE0 + paddd SAVE1, STATE1 /* Write hash values back in the correct order */ mova128 STATE0, XMMTMP -- cgit v1.2.3-55-g6feb From 1891fdda59092a215d3a407d9108bbbe6ab8df7a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 18 Feb 2022 17:09:51 +0100 Subject: libbb/sha1: update config help text with new performance numbers Signed-off-by: Denys Vlasenko --- libbb/Config.src | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libbb/Config.src b/libbb/Config.src index 0ecd5bd46..66a3ffa23 100644 --- a/libbb/Config.src +++ b/libbb/Config.src @@ -57,11 +57,12 @@ config SHA1_SMALL range 0 3 help Trade binary size versus speed for the sha1 algorithm. + With FEATURE_COPYBUF_KB=64: throughput MB/s size of sha1_process_block64 value 486 x86-64 486 x86-64 - 0 367 375 3657 3502 - 1 224 229 654 732 - 2,3 200 195 358 380 + 0 440 485 3481 3502 + 1 265 265 641 696 + 2,3 220 210 342 364 config SHA1_HWACCEL bool "SHA1: Use hardware accelerated instructions if possible" -- cgit v1.2.3-55-g6feb From fa52ac9781f479de8ab4d8526276244c0a0471f4 Mon Sep 17 00:00:00 2001 From: Sören Tempel Date: Mon, 28 Feb 2022 08:36:50 +0100 Subject: ash: don't read past end of var in subvareval for bash substitutions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this patch, BusyBox handles bash pattern substitutions without a terminating '/' character incorrectly. Consider the following shell script: _bootstrapver=5.0.211-r0 _referencesdir="/usr/${_bootstrapver/-*}/Sources" echo $_referencesdir This should output `/usr/5.0.211/Sources`. However, without this patch it instead outputs `/usr/5.0.211Sources`. This is due to the fact that BusyBox expects the bash pattern substitutions to always be terminated with a '/' (at least in this part of subvareval) and thus reads passed the substitution itself and consumes the '/' character which is part of the literal string. 
If there is no '/' after the substitution then BusyBox might perform an out-of-bounds read under certain circumstances. When replacing the bash pattern substitution with `${_bootstrapver/-*/}`, or with this patch applied, ash outputs the correct value. Signed-off-by: Sören Tempel Signed-off-by: Denys Vlasenko --- shell/ash.c | 4 ++++ shell/ash_test/ash-vars/var_bash_repl_unterminated.right | 1 + shell/ash_test/ash-vars/var_bash_repl_unterminated.tests | 2 ++ shell/hush_test/hush-vars/var_bash_repl_unterminated.right | 1 + shell/hush_test/hush-vars/var_bash_repl_unterminated.tests | 2 ++ 5 files changed, 10 insertions(+) create mode 100644 shell/ash_test/ash-vars/var_bash_repl_unterminated.right create mode 100755 shell/ash_test/ash-vars/var_bash_repl_unterminated.tests create mode 100644 shell/hush_test/hush-vars/var_bash_repl_unterminated.right create mode 100755 shell/hush_test/hush-vars/var_bash_repl_unterminated.tests diff --git a/shell/ash.c b/shell/ash.c index adb0f223a..54335c5dd 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -7081,6 +7081,10 @@ subevalvar(char *start, char *str, int strloc, *repl = '\0'; break; } + if ((unsigned char)*repl == CTLENDVAR) { /* ${v/pattern} (no trailing /, no repl) */ + repl = NULL; + break; + } /* Handle escaped slashes, e.g. "${v/\//_}" (they are CTLESC'ed by this point) */ if ((unsigned char)*repl == CTLESC && repl[1]) repl++; diff --git a/shell/ash_test/ash-vars/var_bash_repl_unterminated.right b/shell/ash_test/ash-vars/var_bash_repl_unterminated.right new file mode 100644 index 000000000..5bff3a6fa --- /dev/null +++ b/shell/ash_test/ash-vars/var_bash_repl_unterminated.right @@ -0,0 +1 @@ +b/d diff --git a/shell/ash_test/ash-vars/var_bash_repl_unterminated.tests b/shell/ash_test/ash-vars/var_bash_repl_unterminated.tests new file mode 100755 index 000000000..c9513343d --- /dev/null +++ b/shell/ash_test/ash-vars/var_bash_repl_unterminated.tests @@ -0,0 +1,2 @@ +a=b-c +echo ${a/-*}/d diff --git a/shell/hush_test/hush-vars/var_bash_repl_unterminated.right b/shell/hush_test/hush-vars/var_bash_repl_unterminated.right new file mode 100644 index 000000000..5bff3a6fa --- /dev/null +++ b/shell/hush_test/hush-vars/var_bash_repl_unterminated.right @@ -0,0 +1 @@ +b/d diff --git a/shell/hush_test/hush-vars/var_bash_repl_unterminated.tests b/shell/hush_test/hush-vars/var_bash_repl_unterminated.tests new file mode 100755 index 000000000..c9513343d --- /dev/null +++ b/shell/hush_test/hush-vars/var_bash_repl_unterminated.tests @@ -0,0 +1,2 @@ +a=b-c +echo ${a/-*}/d -- cgit v1.2.3-55-g6feb From 7750b5a25a8cf9081b7c248687c876d0068e85bb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 1 Mar 2022 09:56:54 +0100 Subject: ash: fix unsafe use of mempcpy function old new delta subevalvar 1549 1557 +8 Signed-off-by: Denys Vlasenko --- shell/ash.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/shell/ash.c b/shell/ash.c index 54335c5dd..44ec2eafd 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -7191,7 +7191,13 @@ subevalvar(char *start, char *str, int strloc, len = orig_len - pos; if (!quotes) { - loc = mempcpy(startp, startp + pos, len); + /* want: loc = mempcpy(startp, startp + pos, len) + * but it does not allow overlapping arguments */ + loc = startp; + while (--len >= 0) { + *loc = loc[pos]; + loc++; + } } else { for (vstr = startp; pos != 0; pos--) { if ((unsigned char)*vstr == CTLESC) -- cgit v1.2.3-55-g6feb From 5fe20cf3212fbada86fb75cf13064caed6a5f3a9 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 1 Mar 2022 10:08:59 +0100 
Subject: ash: do not truncate failed tilde expansion on unknown user names Do not skip over "*p = c;" statement. Testcase: echo ~~nouser/qwe function old new delta argstr 1396 1406 +10 Signed-off-by: Denys Vlasenko --- shell/ash.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/shell/ash.c b/shell/ash.c index 44ec2eafd..ef4a47afe 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -6532,9 +6532,7 @@ exptilde(char *startp, int flag) home = lookupvar("HOME"); } else { pw = getpwnam(name); - if (pw == NULL) - goto lose; - home = pw->pw_dir; + home = pw ? pw->pw_dir : NULL; } *p = c; if (!home) -- cgit v1.2.3-55-g6feb From 55f969a006109703dd056bee1b6c1d11b0602449 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 1 Mar 2022 10:46:49 +0100 Subject: taskset: fix printf format mismatch in !FEATURE_TASKSET_FANCY config. closes 14616 Signed-off-by: Denys Vlasenko --- util-linux/taskset.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/util-linux/taskset.c b/util-linux/taskset.c index d2ef9b98f..8b410f369 100644 --- a/util-linux/taskset.c +++ b/util-linux/taskset.c @@ -55,7 +55,6 @@ * Not yet implemented: * -a/--all-tasks (affect all threads) * needs to get TIDs from /proc/PID/task/ and use _them_ as "pid" in sched_setaffinity(pid) - * -c/--cpu-list (specify CPUs via "1,3,5-7") */ #include @@ -91,7 +90,7 @@ static char *from_mask(const ul *mask, unsigned sz_in_bytes) } #else #define TASKSET_PRINTF_MASK "%lx" -static unsigned long long from_mask(ul *mask, unsigned sz_in_bytes UNUSED_PARAM) +static unsigned long from_mask(ul *mask, unsigned sz_in_bytes UNUSED_PARAM) { return *mask; } -- cgit v1.2.3-55-g6feb From fc7868602ecf0d761a9a877141add4a9b6918d02 Mon Sep 17 00:00:00 2001 From: Ron Yorston Date: Thu, 3 Mar 2022 11:35:46 +0000 Subject: vi: improved handling of backspace in replace mode In replace mode ('R' command) the backspace character should get special treatment: - backspace only goes back to the start of the replacement; - backspacing over replaced characters restores the original text. Prior to this commit BusyBox vi deleted the characters both before and after the cursor in replace mode. 
function old new delta undo_pop - 235 +235 char_insert 858 884 +26 indicate_error 81 84 +3 find_range 654 657 +3 static.text_yank 77 79 +2 do_cmd 4486 4243 -243 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 4/1 up/down: 269/-243) Total: 26 bytes Signed-off-by: Ron Yorston Signed-off-by: Denys Vlasenko --- editors/vi.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/editors/vi.c b/editors/vi.c index d37cd48a3..4257c0fdc 100644 --- a/editors/vi.c +++ b/editors/vi.c @@ -224,6 +224,7 @@ #endif +#define isbackspace(c) ((c) == term_orig.c_cc[VERASE] || (c) == 8 || (c) == 127) enum { MAX_TABSTOP = 32, // sanity limit @@ -342,6 +343,7 @@ struct globals { int last_modified_count; // = -1; int cmdline_filecnt; // how many file names on cmd line int cmdcnt; // repetition count + char *rstart; // start of text in Replace mode unsigned rows, columns; // the terminal screen is this size #if ENABLE_FEATURE_VI_ASK_TERMINAL int get_rowcol_error; @@ -474,6 +476,7 @@ struct globals { #define last_modified_count (G.last_modified_count) #define cmdline_filecnt (G.cmdline_filecnt ) #define cmdcnt (G.cmdcnt ) +#define rstart (G.rstart ) #define rows (G.rows ) #define columns (G.columns ) #define crow (G.crow ) @@ -1212,7 +1215,7 @@ static char *get_input_line(const char *prompt) c = get_one_char(); if (c == '\n' || c == '\r' || c == 27) break; // this is end of input - if (c == term_orig.c_cc[VERASE] || c == 8 || c == 127) { + if (isbackspace(c)) { // user wants to erase prev char write1("\b \b"); // erase char on screen buf[--i] = '\0'; @@ -2174,8 +2177,16 @@ static char *char_insert(char *p, char c, int undo) // insert the char c at 'p' p += 1 + stupid_insert(p, ' '); } #endif - } else if (c == term_orig.c_cc[VERASE] || c == 8 || c == 127) { // Is this a BS - if (p > text) { + } else if (isbackspace(c)) { + if (cmd_mode == 2) { + // special treatment for backspace in Replace mode + if (p > rstart) { + p--; +#if ENABLE_FEATURE_VI_UNDO + undo_pop(); +#endif + } + } else if (p > text) { p--; p = text_hole_delete(p, p, ALLOW_UNDO_QUEUED); // shrink buffer 1 char } @@ -3703,9 +3714,9 @@ static void do_cmd(int c) undo_queue_commit(); } else { if (1 <= c || Isprint(c)) { - if (c != 27) - dot = yank_delete(dot, dot, PARTIAL, YANKDEL, ALLOW_UNDO); // delete char - dot = char_insert(dot, c, ALLOW_UNDO_CHAIN); // insert new char + if (c != 27 && !isbackspace(c)) + dot = yank_delete(dot, dot, PARTIAL, YANKDEL, ALLOW_UNDO); + dot = char_insert(dot, c, ALLOW_UNDO_CHAIN); } goto dc1; } @@ -4264,6 +4275,7 @@ static void do_cmd(int c) dc5: cmd_mode = 2; undo_queue_commit(); + rstart = dot; break; case KEYCODE_DELETE: if (dot < end - 1) -- cgit v1.2.3-55-g6feb
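The commit above describes the intended replace-mode rule only in prose. Below is a minimal standalone C sketch (not busybox code; the buffer, rstart and the undo stack are illustrative assumptions) modelling that rule: in 'R' mode, backspace never moves left of where the replacement started, and each backspace restores the character that was overwritten.

/* Conceptual model of replace-mode backspace, per the patch description.
 * All names here are hypothetical; this is not the busybox implementation. */
#include <stdio.h>

static char line[64] = "hello world";
static char undo_stack[64];   /* characters overwritten in this R session */
static int  undo_top;

int main(void)
{
	int rstart = 6;       /* replacement started at the 'w' of "world" */
	int dot = rstart;

	/* type "Wo" in replace mode: save each old char, then overwrite it */
	for (const char *p = "Wo"; *p; p++) {
		undo_stack[undo_top++] = line[dot];
		line[dot++] = *p;
	}
	printf("after typing   : %s\n", line);   /* "hello World" */

	/* three backspaces: restore overwritten chars, but never cross rstart,
	 * so the third backspace is a no-op */
	for (int i = 0; i < 3; i++) {
		if (dot > rstart)
			line[--dot] = undo_stack[--undo_top];
	}
	printf("after backspace: %s\n", line);   /* "hello world" */
	return 0;
}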