From f1d06462e872270f38c497e36f8cd018ee7415bf Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Tue, 28 Dec 2021 09:05:12 +0100
Subject: libbb: code shrink in sha1

function                                             old     new   delta
sha1_process_block64                                 356     342     -14

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/hash_md5_sha.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index e0db8ce67..a468397e3 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -523,9 +523,6 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 				work = (work & b) ^ d;
 				if (j <= 3)
 					goto ge16;
-				/* Used to do SWAP_BE32 here, but this
-				 * requires ctx (see comment above) */
-				work += W[cnt];
 			} else {
 				if (i == 2)
 					work = ((b | c) & d) | (b & c);
@@ -533,14 +530,14 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 					work ^= b;
  ge16:
 				W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
-				work += W[cnt];
 			}
+			work += W[cnt];
 			work += e + rotl32(a, 5) + rconsts[i];
 
 			/* Rotate by one for next time */
 			e = d;
 			d = c;
-			c = /* b = */ rotl32(b, 30);
+			c = rotl32(b, 30);
 			b = a;
 			a = work;
 			cnt = (cnt + 1) & 15;
-- 
cgit v1.2.3-55-g6feb


From 0fcc7f5f738e38766cde59ffd193643458c26cba Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Tue, 28 Dec 2021 21:05:59 +0100
Subject: scripts/echo.c: fix NUL handling in "abc\0 def"

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 scripts/echo.c | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/scripts/echo.c b/scripts/echo.c
index 7474ccdd4..e3a07adf0 100644
--- a/scripts/echo.c
+++ b/scripts/echo.c
@@ -153,25 +153,32 @@ int main(int argc, char **argv)
 		if (!eflag) {
 			/* optimization for very common case */
 			fputs(arg, stdout);
-		} else while ((c = *arg++)) {
-			if (c == eflag) {	/* Check for escape seq. */
+		} else
+		while ((c = *arg++) != '\0') {
+			if (c == eflag) {
+				/* This is an "\x" sequence */
+
 				if (*arg == 'c') {
-					/* '\c' means cancel newline and
+					/* "\c" means cancel newline and
 					 * ignore all subsequent chars. */
 					goto ret;
 				}
-				{
-					/* Since SUSv3 mandates a first digit of 0, 4-digit octals
-					* of the form \0### are accepted. */
-					if (*arg == '0') {
-						/* NB: don't turn "...\0" into "...\" */
-						if (arg[1] && ((unsigned char)(arg[1]) - '0') < 8) {
-							arg++;
-						}
+				/* Since SUSv3 mandates a first digit of 0, 4-digit octals
+				* of the form \0### are accepted. */
+				if (*arg == '0') {
+					if ((unsigned char)(arg[1] - '0') < 8) {
+						/* 2nd char is 0..7: skip leading '0' */
+						arg++;
 					}
-					/* bb_process_escape_sequence handles NUL correctly
-					 * ("...\" case. */
-					c = bb_process_escape_sequence(&arg);
+				}
+				/* bb_process_escape_sequence handles NUL correctly
+				 * ("...\" case). */
+				{
+					/* optimization: don't force arg to be on-stack,
+					 * use another variable for that. ~30 bytes win */
+					const char *z = arg;
+					c = bb_process_escape_sequence(&z);
+					arg = z;
 				}
 			}
 			putchar(c);
-- 
cgit v1.2.3-55-g6feb


From 0e2cb6d1e2553675bba2999829bbc29219aea987 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Wed, 29 Dec 2021 06:41:05 +0100
Subject: echo: add FIXME comment

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 coreutils/echo.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/coreutils/echo.c b/coreutils/echo.c
index 82f0358b6..44b2eb5d0 100644
--- a/coreutils/echo.c
+++ b/coreutils/echo.c
@@ -321,6 +321,8 @@ int echo_main(int argc, char **argv)
 				if (*arg == '0' && (unsigned char)(arg[1] - '0') < 8) {
 					arg++;
 				}
+//FIXME? we also accept non-0 starting sequences (see echo-prints-slash_41 test)
+// echo -ne '-\41-' prints "-!-". bash 5.0.17 does not (prints "-\41-").
 				/* bb_process_escape_sequence can handle nul correctly */
 				c = bb_process_escape_sequence( (void*) &arg);
 			}
-- 
cgit v1.2.3-55-g6feb


From 9173c9cce48dc4c867fb06bb72e8c762740c5c86 Mon Sep 17 00:00:00 2001
From: Sören Tempel <soeren+git@soeren-tempel.net>
Date: Wed, 29 Dec 2021 16:15:50 +0100
Subject: ed: add support for -s command-line option as mandated by POSIX
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Apart from the -p option, POSIX also mandates an -s option which
suppresses the output of byte counts for the e, E, r, and w commands.
Of these commands, Busybox ed presently only implements the r and w
commands. This commit ensures that these two commands do not output any
byte counts when the -s option is passed. The shell escape command,
also affected by the -s option, is not implemented by Busybox at the
moment.

function                                             old     new   delta
packed_usage                                       34096   34115     +19
doCommands                                          1887    1900     +13
readLines                                            388     397      +9
.rodata                                           104196  104200      +4
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 4/0 up/down: 45/0)               Total: 45 bytes

Signed-off-by: Sören Tempel <soeren+git@soeren-tempel.net>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 editors/ed.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/editors/ed.c b/editors/ed.c
index dfe0f1a77..209ce9942 100644
--- a/editors/ed.c
+++ b/editors/ed.c
@@ -18,7 +18,7 @@
 
 //applet:IF_ED(APPLET(ed, BB_DIR_BIN, BB_SUID_DROP))
 
-//usage:#define ed_trivial_usage "[-p PROMPT] [FILE]"
+//usage:#define ed_trivial_usage "[-p PROMPT] [-s] [FILE]"
 //usage:#define ed_full_usage ""
 
 #include "libbb.h"
@@ -71,6 +71,11 @@ struct globals {
 	SET_PTR_TO_GLOBALS(xzalloc(sizeof(G))); \
 } while (0)
 
+#define OPTION_STR "sp:"
+enum {
+	OPT_s = (1 << 0),
+};
+
 static int bad_nums(int num1, int num2, const char *for_what)
 {
 	if ((num1 < 1) || (num2 > lastNum) || (num1 > num2)) {
@@ -458,7 +463,8 @@ static int readLines(const char *file, int num)
 	 * in the following format:
 	 * "%d\n", <number of bytes read>
 	 */
-	printf("%u\n", charCount);
+	if (!(option_mask32 & OPT_s))
+		printf("%u\n", charCount);
 	return TRUE;
 }
 
@@ -510,7 +516,8 @@ static int writeLines(const char *file, int num1, int num2)
 	 * unless the -s option was specified, in the following format:
 	 * "%d\n", <number of bytes written>
 	 */
-	printf("%u\n", charCount);
+	if (!(option_mask32 & OPT_s))
+		printf("%u\n", charCount);
 	return TRUE;
 }
 
@@ -1005,7 +1012,7 @@ int ed_main(int argc UNUSED_PARAM, char **argv)
 	lines.prev = &lines;
 
 	prompt = ""; /* no prompt by default */
-	getopt32(argv, "p:", &prompt);
+	getopt32(argv, OPTION_STR, &prompt);
 	argv += optind;
 
 	if (argv[0]) {
-- 
cgit v1.2.3-55-g6feb


From 25aadc893d21b35f7d34a9d1edc843632e7abd8f Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Thu, 30 Dec 2021 13:07:12 +0100
Subject: libbb/sha1: add config-selectable fully unrolled version, closes
 14391

function                                             old     new   delta
sha1_process_block64                                 364    4167   +3803
static.rconsts                                        16       -     -16
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 1/0 up/down: 3803/-16)         Total: 3787 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/Config.src     | 25 +++++++++++-----
 libbb/hash_md5_sha.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 95 insertions(+), 14 deletions(-)

diff --git a/libbb/Config.src b/libbb/Config.src
index 24b31fad9..13188ef03 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -42,21 +42,32 @@ config MD5_SMALL
 	default 1  # all "fast or small" options default to small
 	range 0 3
 	help
-	Trade binary size versus speed for the md5sum algorithm.
+	Trade binary size versus speed for the md5 algorithm.
 	Approximate values running uClibc and hashing
 	linux-2.4.4.tar.bz2 were:
-	value               user times (sec)  text size (386)
-	0 (fastest)         1.1                6144
-	1                   1.4                5392
-	2                   3.0                5088
-	3 (smallest)        5.1                4912
+	value           user times (sec)  text size (386)
+	0 (fastest)     1.1               6144
+	1               1.4               5392
+	2               3.0               5088
+	3 (smallest)    5.1               4912
+
+config SHA1_SMALL
+	int "SHA1: Trade bytes for speed (0:fast, 3:slow)"
+	default 3  # all "fast or small" options default to small
+	range 0 3
+	help
+	Trade binary size versus speed for the sha1 algorithm.
+	                throughput MB/s   size of sha1_process_block64
+	value           486  x86-64       486   x86-64
+	0               339  374          4149  4167
+	1,2,3           200  195           358   380
 
 config SHA3_SMALL
 	int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
 	default 1  # all "fast or small" options default to small
 	range 0 1
 	help
-	Trade binary size versus speed for the sha3sum algorithm.
+	Trade binary size versus speed for the sha3 algorithm.
 	SHA3_SMALL=0 compared to SHA3_SMALL=1 (approximate):
 	64-bit x86: +270 bytes of code, 45% faster
 	32-bit x86: +450 bytes of code, 75% faster
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index a468397e3..75673e334 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -390,7 +390,6 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
 	OP(FI, D, A, B, C, 11, 10, 0xbd3af235);
 	OP(FI, C, D, A, B, 2, 15, 0x2ad7d2bb);
 	OP(FI, B, C, D, A, 9, 21, 0xeb86d391);
-# undef OP
 # endif
 	/* Add checksum to the starting values */
 	ctx->hash[0] += A;
@@ -399,6 +398,7 @@ static void FAST_FUNC md5_process_block64(md5_ctx_t *ctx)
 	ctx->hash[3] += D;
 #endif
 }
+#undef OP
 #undef FF
 #undef FG
 #undef FH
@@ -490,18 +490,87 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
  * then rebuild and compare "shaNNNsum bigfile" results.
  */
 
+#if CONFIG_SHA1_SMALL == 0
+/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
+ * It seems further speedup can be achieved by handling more than
+ * 64 bytes per one function call (coreutils does that).
+ */
+static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
+{
+	static const uint32_t rconsts[] ALIGN4 = {
+		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
+	};
+	uint32_t W[16];
+	uint32_t a, b, c, d, e;
+
+	a = ctx->hash[0];
+	b = ctx->hash[1];
+	c = ctx->hash[2];
+	d = ctx->hash[3];
+	e = ctx->hash[4];
+
+#undef OP
+#define OP(A,B,C,D,E, n) \
+	do { \
+		uint32_t work = EXPR(B, C, D); \
+		if (n <= 15) \
+			work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
+		if (n >= 16) \
+			work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \
+		E += work + rotl32(A, 5) + rconsts[n / 20]; \
+		B = rotl32(B, 30); \
+	} while (0)
+#define OP20(n) \
+	OP(a,b,c,d,e, (n+ 0)); OP(e,a,b,c,d, (n+ 1)); OP(d,e,a,b,c, (n+ 2)); OP(c,d,e,a,b, (n+ 3)); OP(b,c,d,e,a, (n+ 4)); \
+	OP(a,b,c,d,e, (n+ 5)); OP(e,a,b,c,d, (n+ 6)); OP(d,e,a,b,c, (n+ 7)); OP(c,d,e,a,b, (n+ 8)); OP(b,c,d,e,a, (n+ 9)); \
+	OP(a,b,c,d,e, (n+10)); OP(e,a,b,c,d, (n+11)); OP(d,e,a,b,c, (n+12)); OP(c,d,e,a,b, (n+13)); OP(b,c,d,e,a, (n+14)); \
+	OP(a,b,c,d,e, (n+15)); OP(e,a,b,c,d, (n+16)); OP(d,e,a,b,c, (n+17)); OP(c,d,e,a,b, (n+18)); OP(b,c,d,e,a, (n+19))
+
+	/* 4 rounds of 20 operations each */
+#define EXPR(b,c,d) (((c ^ d) & b) ^ d)
+	OP20(0);
+#undef EXPR
+#define EXPR(b,c,d) (c ^ d ^ b)
+	OP20(20);
+#undef EXPR
+#define EXPR(b,c,d) (((b | c) & d) | (b & c))
+	OP20(40);
+#undef EXPR
+#define EXPR(b,c,d) (c ^ d ^ b)
+	OP20(60);
+
+#undef EXPR
+#undef OP
+#undef OP20
+
+	ctx->hash[0] += a;
+	ctx->hash[1] += b;
+	ctx->hash[2] += c;
+	ctx->hash[3] += d;
+	ctx->hash[4] += e;
+}
+#else
+/* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */
+
+/* Compact version, almost twice as slow as fully unrolled */
 static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 {
 	static const uint32_t rconsts[] ALIGN4 = {
 		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
 	};
 	int i, j;
-	int cnt;
+	int n;
 	uint32_t W[16+16];
 	uint32_t a, b, c, d, e;
 
 	/* On-stack work buffer frees up one register in the main loop
-	 * which otherwise will be needed to hold ctx pointer */
+	 * which otherwise will be needed to hold ctx pointer.
+	 *
+	 * The compiler is not smart enough to realize it, though. :(
+	 * If __attribute__((optimize("2"))) is added to the function,
+	 * only then gcc-9.3.1 spills "ctx" to stack and uses the freed
+	 * register (making code 6 bytes smaller, not just faster).
+	 */
 	for (i = 0; i < 16; i++)
 		W[i] = W[i+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[i]);
 
@@ -512,7 +581,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 	e = ctx->hash[4];
 
 	/* 4 rounds of 20 operations each */
-	cnt = 0;
+	n = 0;
 	for (i = 0; i < 4; i++) {
 		j = 19;
 		do {
@@ -529,9 +598,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 				else /* i = 1 or 3 */
 					work ^= b;
  ge16:
-				W[cnt] = W[cnt+16] = rotl32(W[cnt+13] ^ W[cnt+8] ^ W[cnt+2] ^ W[cnt], 1);
+				W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
 			}
-			work += W[cnt];
+			work += W[n];
 			work += e + rotl32(a, 5) + rconsts[i];
 
 			/* Rotate by one for next time */
@@ -540,7 +609,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 			c = rotl32(b, 30);
 			b = a;
 			a = work;
-			cnt = (cnt + 1) & 15;
+			n = (n + 1) & 15;
 		} while (--j >= 0);
 	}
 
@@ -550,6 +619,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 	ctx->hash[3] += d;
 	ctx->hash[4] += e;
 }
+#endif
 
 /* Constants for SHA512 from FIPS 180-2:4.2.3.
  * SHA256 constants from FIPS 180-2:4.2.2
-- 
cgit v1.2.3-55-g6feb


From 0b62a08777e29c34f947c791a1eded5b97e05699 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Thu, 30 Dec 2021 18:54:02 +0100
Subject: libbb/sha1: add config-selectable partially unrolled version

function                                             old     new   delta
sha1_process_block64                                 364     732    +368
static.rconsts                                        16       -     -16
------------------------------------------------------------------------------
(add/remove: 0/1 grow/shrink: 1/0 up/down: 368/-16)           Total: 352 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/Config.src     |   3 +-
 libbb/hash_md5_sha.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 98 insertions(+), 5 deletions(-)

diff --git a/libbb/Config.src b/libbb/Config.src
index 13188ef03..c793f5939 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -60,7 +60,8 @@ config SHA1_SMALL
 	                throughput MB/s   size of sha1_process_block64
 	value           486  x86-64       486   x86-64
 	0               339  374          4149  4167
-	1,2,3           200  195           358   380
+	1               224  229           654   732
+	2,3             200  195           358   380
 
 config SHA3_SMALL
 	int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 75673e334..053ebe291 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -514,9 +514,9 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 	do { \
 		uint32_t work = EXPR(B, C, D); \
 		if (n <= 15) \
-			work += W[n & 0xf] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
+			work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
 		if (n >= 16) \
-			work += W[n & 0xf] = rotl32(W[(n+13) & 0xf] ^ W[(n+8) & 0xf] ^ W[(n+2) & 0xf] ^ W[n & 0xf], 1); \
+			work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
 		E += work + rotl32(A, 5) + rconsts[n / 20]; \
 		B = rotl32(B, 30); \
 	} while (0)
@@ -549,9 +549,101 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 	ctx->hash[3] += d;
 	ctx->hash[4] += e;
 }
-#else
-/* TODO: for CONFIG_SHA1_SMALL == 1, have a partially unrolled version? */
+#elif CONFIG_SHA1_SMALL == 1
+/* Middle-sized version, +300 bytes of code on x86. */
+static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
+{
+	static const uint32_t rconsts[] ALIGN4 = {
+		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
+	};
+	int j;
+	int n;
+	uint32_t W[16+16];
+	uint32_t a, b, c, d, e;
+
+	a = ctx->hash[0];
+	b = ctx->hash[1];
+	c = ctx->hash[2];
+	d = ctx->hash[3];
+	e = ctx->hash[4];
+
+	/* 1st round of 20 operations */
+	n = 0;
+	do {
+		uint32_t work = ((c ^ d) & b) ^ d;
+		W[n] = W[n+16] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]);
+		work += W[n];
+		work += e + rotl32(a, 5) + rconsts[0];
+		/* Rotate by one for next time */
+		e = d;
+		d = c;
+		c = rotl32(b, 30);
+		b = a;
+		a = work;
+		n = (n + 1) & 15;
+	} while (n != 0);
+	do {
+		uint32_t work = ((c ^ d) & b) ^ d;
+		W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
+		work += W[n];
+		work += e + rotl32(a, 5) + rconsts[0];
+		e = d;
+		d = c;
+		c = rotl32(b, 30);
+		b = a;
+		a = work;
+		n = (n + 1) & 15;
+	} while (n != 4);
+	/* 2nd round of 20 operations */
+	j = 19;
+	do {
+		uint32_t work = c ^ d ^ b;
+		W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
+		work += W[n];
+		work += e + rotl32(a, 5) + rconsts[1];
+		e = d;
+		d = c;
+		c = rotl32(b, 30);
+		b = a;
+		a = work;
+		n = (n + 1) & 15;
+	} while (--j >= 0);
+	/* 3rd round */
+	j = 19;
+	do {
+		uint32_t work = ((b | c) & d) | (b & c);
+		W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
+		work += W[n];
+		work += e + rotl32(a, 5) + rconsts[2];
+		e = d;
+		d = c;
+		c = rotl32(b, 30);
+		b = a;
+		a = work;
+		n = (n + 1) & 15;
+	} while (--j >= 0);
+	/* 4th round */
+	j = 19;
+	do {
+		uint32_t work = c ^ d ^ b;
+		W[n] = W[n+16] = rotl32(W[n+13] ^ W[n+8] ^ W[n+2] ^ W[n], 1);
+		work += W[n];
+		work += e + rotl32(a, 5) + rconsts[3];
+		e = d;
+		d = c;
+		c = rotl32(b, 30);
+		b = a;
+		a = work;
+		n = (n + 1) & 15;
+	} while (--j >= 0);
 
+	ctx->hash[0] += a;
+	ctx->hash[1] += b;
+	ctx->hash[2] += c;
+	ctx->hash[3] += d;
+	ctx->hash[4] += e;
+}
+#else
 /* Compact version, almost twice as slow as fully unrolled */
 static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 {
-- 
cgit v1.2.3-55-g6feb


From f09d088fdf6eeeba902fb5627930145a3058a5f0 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Fri, 31 Dec 2021 17:06:00 +0100
Subject: libbb/sha1: shrink and speed up fully unrolled version

function                                             old     new   delta
sha1_process_block64                                4149    3950    -199

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/Config.src     |  2 +-
 libbb/hash_md5_sha.c | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/libbb/Config.src b/libbb/Config.src
index c793f5939..d2054dc63 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -59,7 +59,7 @@ config SHA1_SMALL
 	Trade binary size versus speed for the sha1 algorithm.
 	                throughput MB/s   size of sha1_process_block64
 	value           486  x86-64       486   x86-64
-	0               339  374          4149  4167
+	0               360  374          3950  4167
 	1               224  229           654   732
 	2,3             200  195           358   380
 
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 053ebe291..faf485df5 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -509,6 +509,27 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 	d = ctx->hash[3];
 	e = ctx->hash[4];
 
+/* From kernel source comments:
+ * """
+ * If you have 32 registers or more, the compiler can (and should)
+ * try to change the array[] accesses into registers. However, on
+ * machines with less than ~25 registers, that won't really work,
+ * and at least gcc will make an unholy mess of it.
+ *
+ * So to avoid that mess which just slows things down, we force
+ * the stores to memory to actually happen (we might be better off
+ * with a 'W(t)=(val);asm("":"+m" (W(t))' there instead, as
+ * suggested by Artur Skawina - that will also make gcc unable to
+ * try to do the silly "optimize away loads" part because it won't
+ * see what the value will be).
+ * """
+ */
+#if defined(__i386__)
+# define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
+#else
+# define DO_NOT_TRY_PROPAGATING(m) ((void)0)
+#endif
+
 #undef OP
 #define OP(A,B,C,D,E, n) \
 	do { \
@@ -517,6 +538,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 			work += W[n & 15] = SWAP_BE32(((uint32_t*)ctx->wbuffer)[n]); \
 		if (n >= 16) \
 			work += W[n & 15] = rotl32(W[(n+13) & 15] ^ W[(n+8) & 15] ^ W[(n+2) & 15] ^ W[n & 15], 1); \
+		DO_NOT_TRY_PROPAGATING(W[n & 15]); \
 		E += work + rotl32(A, 5) + rconsts[n / 20]; \
 		B = rotl32(B, 30); \
 	} while (0)
-- 
cgit v1.2.3-55-g6feb


From 5f6817020467598868b7d1c9ca477d7ccd66b87d Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Sat, 1 Jan 2022 12:21:01 +0100
Subject: libbb/sha1: assembly versions for x86

32 bits:
function                                             old     new   delta
sha1_process_block64                                3950    3657    -293
64 bits:
sha1_process_block64                                4167    3683    -484

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/Config.src     |   2 +-
 libbb/hash_md5_sha.c | 417 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 418 insertions(+), 1 deletion(-)

diff --git a/libbb/Config.src b/libbb/Config.src
index d2054dc63..e027c14a8 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -59,7 +59,7 @@ config SHA1_SMALL
 	Trade binary size versus speed for the sha1 algorithm.
 	                throughput MB/s   size of sha1_process_block64
 	value           486  x86-64       486   x86-64
-	0               360  374          3950  4167
+	0               367  367          3657  3683
 	1               224  229           654   732
 	2,3             200  195           358   380
 
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index faf485df5..9de30dfe6 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -8,6 +8,9 @@
  */
 #include "libbb.h"
 
+#define STR1(s) #s
+#define STR(s) STR1(s)
+
 #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
 
 /* gcc 4.2.1 optimizes rotr64 better with inline than with macro
@@ -491,6 +494,419 @@ unsigned FAST_FUNC md5_end(md5_ctx_t *ctx, void *resbuf)
  */
 
 #if CONFIG_SHA1_SMALL == 0
+# if defined(__GNUC__) && defined(__i386__)
+static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) /* i386 asm variant: byte-swaps the sixteen 32-bit words at offset 0 of *ctx into W[], runs the 80 SHA-1 rounds, then adds a..e into ctx->hash[0..4]. The C code never names ctx (hence UNUSED_PARAM); only the asm touches it. */
+{
+	BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 76); /* the asm below addresses hash[0..4] as 76..92(%reg) */
+	asm( /* basic asm, no operands or clobber list: ctx is taken from %eax on entry (presumably where FAST_FUNC passes the first argument on i386 - TODO confirm) */
+"\n\
+	pushl	%ebp	#                                           \n\
+	pushl	%edi	#                                           \n\
+	pushl	%esi	#                                           \n\
+	pushl	%ebx	#                                           \n\
+	pushl	%eax                                                \n\
+	movl	$15, %edi                                           \n\
+1:                                                                  \n\
+	movl	(%eax,%edi,4), %esi                                 \n\
+	bswap	%esi                                                \n\
+	pushl	%esi                                                \n\
+	decl	%edi                                                \n\
+	jns	1b                                                  \n\
+	movl	80(%eax), %ebx	# b = ctx->hash[1]                  \n\
+	movl	84(%eax), %ecx	# c = ctx->hash[2]                  \n\
+	movl	88(%eax), %edx	# d = ctx->hash[3]                  \n\
+	movl	92(%eax), %ebp	# e = ctx->hash[4]                  \n\
+	movl	76(%eax), %eax	# a = ctx->hash[0]                  \n\
+#Register and stack use:                                            \n\
+# eax..edx: a..d                                                    \n\
+# ebp: e                                                            \n\
+# esi,edi: temps                                                    \n\
+# 4*n(%esp): W[n]                                                   \n\
+"
+#define RD1As(a,b,c,d,e, n, RCONST) \
+"\n\
+	##movl	4*"n"(%esp), %esi	# n=0, W[0] already in %esi \n\
+	movl	"c", %edi		# c                         \n\
+	xorl	"d", %edi		# ^d                        \n\
+	andl	"b", %edi		# &b                        \n\
+	xorl	"d", %edi		# (((c ^ d) & b) ^ d)       \n\
+	leal	"RCONST"("e",%esi), "e"	# e += RCONST + W[n]        \n\
+	addl	%edi, "e"		# e += (((c ^ d) & b) ^ d)  \n\
+	movl	"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, "e"		# e += rotl32(a,5)          \n\
+	rorl	$2, "b"			# b = rotl32(b,30)          \n\
+"
+#define RD1Bs(a,b,c,d,e, n, RCONST) \
+"\n\
+	movl	4*"n"(%esp), %esi	# W[n]                      \n\
+	movl	"c", %edi		# c                         \n\
+	xorl	"d", %edi		# ^d                        \n\
+	andl	"b", %edi		# &b                        \n\
+	xorl	"d", %edi		# (((c ^ d) & b) ^ d)       \n\
+	leal	"RCONST"("e",%esi), "e"	# e += RCONST + W[n]        \n\
+	addl	%edi, "e"		# e += (((c ^ d) & b) ^ d)  \n\
+	movl	"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, "e"		# e += rotl32(a,5)          \n\
+	rorl	$2, "b"			# b = rotl32(b,30)          \n\
+"
+#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl	4*"n13"(%esp), %esi	# W[(n+13) & 15]            \n\
+	xorl	4*"n8"(%esp), %esi	# ^W[(n+8) & 15]            \n\
+	xorl	4*"n2"(%esp), %esi	# ^W[(n+2) & 15]            \n\
+	xorl	4*"n"(%esp), %esi	# ^W[n & 15]                \n\
+	roll	%esi			#                           \n\
+	movl	%esi, 4*"n"(%esp)	# store to W[n & 15]        \n\
+	movl	"c", %edi		# c                         \n\
+	xorl	"d", %edi		# ^d                        \n\
+	andl	"b", %edi		# &b                        \n\
+	xorl	"d", %edi		# (((c ^ d) & b) ^ d)       \n\
+	leal	"RCONST"("e",%esi), "e"	# e += RCONST + mixed_W     \n\
+	addl	%edi, "e"		# e += (((c ^ d) & b) ^ d)  \n\
+	movl	"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, "e"		# e += rotl32(a,5)          \n\
+	rorl	$2, "b"			# b = rotl32(b,30)          \n\
+"
+#define RD1A(a,b,c,d,e, n) RD1As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST)) /* round 0 only: W[0] is still in %esi from the load loop above */
+#define RD1B(a,b,c,d,e, n) RD1Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR((n)), STR(RCONST)) /* rounds 1..15: W[n] is read straight from the stack */
+#define RD1C(a,b,c,d,e, n) RD1Cs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST)) /* rounds 16..19: W[n & 15] is recomputed and stored back before use */
+#undef  RCONST
+#define RCONST 0x5A827999
+	RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4) /* the a..e register roles rotate by one on every round, so no values are moved between rounds */
+	RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9)
+	RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14)
+	RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19)
+#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl	4*"n13"(%esp), %esi	# W[(n+13) & 15]            \n\
+	xorl	4*"n8"(%esp), %esi	# ^W[(n+8) & 15]            \n\
+	xorl	4*"n2"(%esp), %esi	# ^W[(n+2) & 15]            \n\
+	xorl	4*"n"(%esp), %esi	# ^W[n & 15]                \n\
+	roll	%esi			#                           \n\
+	movl	%esi, 4*"n"(%esp)	# store to W[n & 15]        \n\
+	movl	"c", %edi		# c                         \n\
+	xorl	"d", %edi		# ^d                        \n\
+	xorl	"b", %edi		# ^b                        \n\
+	leal	"RCONST"("e",%esi), "e"	# e += RCONST + mixed_W     \n\
+	addl	%edi, "e"		# e += (c ^ d ^ b)          \n\
+	movl	"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, "e"		# e += rotl32(a,5)          \n\
+	rorl	$2, "b"			# b = rotl32(b,30)          \n\
+"
+#define RD2(a,b,c,d,e, n) RD2s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST)) /* rounds 20..39 (n is relative to 20): mixes in c ^ d ^ b */
+#undef  RCONST
+#define RCONST 0x6ED9EBA1
+	RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4)
+	RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9)
+	RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14)
+	RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19)
+
+#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl	"b", %edi		# di: b                     \n\
+	movl	"b", %esi		# si: b                     \n\
+	orl	"c", %edi		# di: b | c                 \n\
+	andl	"c", %esi		# si: b & c                 \n\
+	andl	"d", %edi		# di: (b | c) & d           \n\
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)   \n\
+	movl	4*"n13"(%esp), %esi	# W[(n+13) & 15]            \n\
+	xorl	4*"n8"(%esp), %esi	# ^W[(n+8) & 15]            \n\
+	xorl	4*"n2"(%esp), %esi	# ^W[(n+2) & 15]            \n\
+	xorl	4*"n"(%esp), %esi	# ^W[n & 15]                \n\
+	roll	%esi			#                           \n\
+	movl	%esi, 4*"n"(%esp)	# store to W[n & 15]        \n\
+	addl	%edi, "e"		# += ((b | c) & d) | (b & c)\n\
+	leal	"RCONST"("e",%esi), "e"	# e += RCONST + mixed_W     \n\
+	movl	"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, "e"		# e += rotl32(a,5)          \n\
+	rorl	$2, "b"			# b = rotl32(b,30)          \n\
+"
+#define RD3(a,b,c,d,e, n) RD3s("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST)) /* rounds 40..59 (n is relative to 40): mixes in ((b | c) & d) | (b & c) */
+#undef  RCONST
+#define RCONST 0x8F1BBCDC
+	RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4)
+	RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9)
+	RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14)
+	RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19)
+
+#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl	4*"n13"(%esp), %esi	# W[(n+13) & 15]            \n\
+	xorl	4*"n8"(%esp), %esi	# ^W[(n+8) & 15]            \n\
+	xorl	4*"n2"(%esp), %esi	# ^W[(n+2) & 15]            \n\
+	xorl	4*"n"(%esp), %esi	# ^W[n & 15]                \n\
+	roll	%esi			#                           \n\
+	movl	%esi, 4*"n"(%esp)	# store to W[n & 15]        \n\
+	movl	"c", %edi		# c                         \n\
+	xorl	"d", %edi		# ^d                        \n\
+	xorl	"b", %edi		# ^b                        \n\
+	leal	"RCONST"("e",%esi), "e"	# e += RCONST + mixed_W     \n\
+	addl	%edi, "e"		# e += (c ^ d ^ b)          \n\
+	movl	"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, "e"		# e += rotl32(a,5)          \n\
+	rorl	$2, "b"			# b = rotl32(b,30)          \n\
+"
+#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl	4*"n13"(%esp), %esi	# W[(n+13) & 15]            \n\
+	xorl	4*"n8"(%esp), %esi	# ^W[(n+8) & 15]            \n\
+	xorl	4*"n2"(%esp), %esi	# ^W[(n+2) & 15]            \n\
+	xorl	4*"n"(%esp), %esi	# ^W[n & 15]                \n\
+	roll	%esi			#                           \n\
+	##movl	%esi, 4*"n"(%esp)	# store to W[n & 15] elided \n\
+	movl	"c", %edi		# c                         \n\
+	xorl	"d", %edi		# ^d                        \n\
+	xorl	"b", %edi		# ^b                        \n\
+	leal	"RCONST"("e",%esi), "e"	# e += RCONST + mixed_W     \n\
+	addl	%edi, "e"		# e += (c ^ d ^ b)          \n\
+	movl	"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, "e"		# e += rotl32(a,5)          \n\
+	rorl	$2, "b"			# b = rotl32(b,30)          \n\
+"
+#define RD4A(a,b,c,d,e, n) RD4As("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST)) /* rounds 60..76 (n is relative to 60): same instruction sequence as RD2s */
+#define RD4B(a,b,c,d,e, n) RD4Bs("%e"STR(a),"%e"STR(b),"%e"STR(c),"%e"STR(d),"%e"STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST)) /* rounds 77..79: as RD4A but the W[] store-back is commented out in the asm */
+#undef  RCONST
+#define RCONST 0xCA62C1D6
+	RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4)
+	RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9)
+	RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14)
+	RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)
+
+"\n\
+	movl	4*16(%esp), %esi	#                           \n\
+	addl	$4*(16+1), %esp		#                           \n\
+	addl	%eax, 76(%esi)  	# ctx->hash[0] += a         \n\
+	addl	%ebx, 80(%esi)  	# ctx->hash[1] += b         \n\
+	addl	%ecx, 84(%esi)  	# ctx->hash[2] += c         \n\
+	addl	%edx, 88(%esi)  	# ctx->hash[3] += d         \n\
+	addl	%ebp, 92(%esi)  	# ctx->hash[4] += e         \n\
+	popl	%ebx			#                           \n\
+	popl	%esi			#                           \n\
+	popl	%edi			#                           \n\
+	popl	%ebp			#                           \n\
+"
+	); /* asm */
+#undef RCONST
+}
+# elif defined(__GNUC__) && defined(__x86_64__)
+static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) /* x86-64 asm variant of the i386 routine above, same 80 rounds: ctx is read from %rdi and parked in %r10, %rbp/%rbx are parked in %r8/%r9 and restored at the end, and W[0..15] is kept at -64(%rsp)..-4(%rsp) without adjusting %rsp. The C code never names ctx (hence UNUSED_PARAM). */
+{
+	BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80);
+	asm(
+// TODO: store W[] in r8..r15? (r8..r11 are callee-clobbered, no need to save)
+"\n\
+	##pushq	%r15		#                                   \n\
+	##pushq	%r14		#                                   \n\
+	##pushq	%r13		#                                   \n\
+	##pushq	%r12		#                                   \n\
+	##pushq	%rbp		#                                   \n\
+	##pushq	%rbx		#                                   \n\
+	movq	%rbp, %r8	# callee-saved                      \n\
+	movq	%rbx, %r9	# callee-saved                      \n\
+	movq	%rdi, %r10	# we need ctx at the end            \n\
+	movl	$15, %eax                                           \n\
+1:                                                                  \n\
+	movl	(%rdi,%rax,4), %esi                                 \n\
+	bswap	%esi                                                \n\
+	movl	%esi, -64(%rsp,%rax,4)                              \n\
+	decl	%eax                                                \n\
+	jns	1b                                                  \n\
+	movl	80(%rdi), %eax	# a = ctx->hash[0]                  \n\
+	movl	84(%rdi), %ebx	# b = ctx->hash[1]                  \n\
+	movl	88(%rdi), %ecx	# c = ctx->hash[2]                  \n\
+	movl	92(%rdi), %edx	# d = ctx->hash[3]                  \n\
+	movl	96(%rdi), %ebp	# e = ctx->hash[4]                  \n\
+#Register and stack use:                                            \n\
+# eax..edx: a..d                                                    \n\
+# ebp: e                                                            \n\
+# esi,edi: temps                                                    \n\
+# -64+4*n(%rsp): W[n]                                               \n\
+"
+#define RD1As(a,b,c,d,e, n, RCONST) \
+"\n\
+	##movl	-64+4*"n"(%rsp), %esi	# n=0, W[0] already in %esi \n\
+	movl	%e"c", %edi		# c                         \n\
+	xorl	%e"d", %edi		# ^d                        \n\
+	andl	%e"b", %edi		# &b                        \n\
+	xorl	%e"d", %edi		# (((c ^ d) & b) ^ d)       \n\
+	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n]    \n\
+	addl	%edi, %e"e"		# e += (((c ^ d) & b) ^ d)  \n\
+	movl	%e"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
+	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
+"
+#define RD1Bs(a,b,c,d,e, n, RCONST) \
+"\n\
+	movl	-64+4*"n"(%rsp), %esi	# W[n]                      \n\
+	movl	%e"c", %edi		# c                         \n\
+	xorl	%e"d", %edi		# ^d                        \n\
+	andl	%e"b", %edi		# &b                        \n\
+	xorl	%e"d", %edi		# (((c ^ d) & b) ^ d)       \n\
+	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n]    \n\
+	addl	%edi, %e"e"		# e += (((c ^ d) & b) ^ d)  \n\
+	movl	%e"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
+	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
+"
+#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl	-64+4*"n13"(%rsp), %esi	# W[(n+13) & 15]            \n\
+	xorl	-64+4*"n8"(%rsp), %esi	# ^W[(n+8) & 15]            \n\
+	xorl	-64+4*"n2"(%rsp), %esi	# ^W[(n+2) & 15]            \n\
+	xorl	-64+4*"n"(%rsp), %esi	# ^W[n & 15]                \n\
+	roll	%esi			#                           \n\
+	movl	%esi, -64+4*"n"(%rsp)	# store to W[n & 15]        \n\
+	movl	%e"c", %edi		# c                         \n\
+	xorl	%e"d", %edi		# ^d                        \n\
+	andl	%e"b", %edi		# &b                        \n\
+	xorl	%e"d", %edi		# (((c ^ d) & b) ^ d)       \n\
+	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
+	addl	%edi, %e"e"		# e += (((c ^ d) & b) ^ d)  \n\
+	movl	%e"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
+	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
+"
+#define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST))
+#define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST))
+#define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
+#undef  RCONST
+#define RCONST 0x5A827999
+	RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4)
+	RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9)
+	RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14)
+	RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19)
+#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl	-64+4*"n13"(%rsp), %esi	# W[(n+13) & 15]            \n\
+	xorl	-64+4*"n8"(%rsp), %esi	# ^W[(n+8) & 15]            \n\
+	xorl	-64+4*"n2"(%rsp), %esi	# ^W[(n+2) & 15]            \n\
+	xorl	-64+4*"n"(%rsp), %esi	# ^W[n & 15]                \n\
+	roll	%esi			#                           \n\
+	movl	%esi, -64+4*"n"(%rsp)	# store to W[n & 15]        \n\
+	movl	%e"c", %edi		# c                         \n\
+	xorl	%e"d", %edi		# ^d                        \n\
+	xorl	%e"b", %edi		# ^b                        \n\
+	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
+	addl	%edi, %e"e"		# e += (c ^ d ^ b)          \n\
+	movl	%e"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
+	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
+"
+#define RD2(a,b,c,d,e, n) RD2s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST))
+#undef  RCONST
+#define RCONST 0x6ED9EBA1
+	RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4)
+	RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9)
+	RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14)
+	RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19)
+
+#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl	%e"b", %edi		# di: b                     \n\
+	movl	%e"b", %esi		# si: b                     \n\
+	orl	%e"c", %edi		# di: b | c                 \n\
+	andl	%e"c", %esi		# si: b & c                 \n\
+	andl	%e"d", %edi		# di: (b | c) & d           \n\
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)   \n\
+	movl	-64+4*"n13"(%rsp), %esi	# W[(n+13) & 15]            \n\
+	xorl	-64+4*"n8"(%rsp), %esi	# ^W[(n+8) & 15]            \n\
+	xorl	-64+4*"n2"(%rsp), %esi	# ^W[(n+2) & 15]            \n\
+	xorl	-64+4*"n"(%rsp), %esi	# ^W[n & 15]                \n\
+	roll	%esi			#                           \n\
+	movl	%esi, -64+4*"n"(%rsp)	# store to W[n & 15]        \n\
+	addl	%edi, %e"e"		# += ((b | c) & d) | (b & c)\n\
+	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
+	movl	%e"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
+	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
+"
+#define RD3(a,b,c,d,e, n) RD3s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST))
+#undef  RCONST
+//#define RCONST 0x8F1BBCDC "out of range for signed 32bit displacement"
+#define RCONST  -0x70e44324
+	RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4)
+	RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9)
+	RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14)
+	RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19)
+
+#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl	-64+4*"n13"(%rsp), %esi	# W[(n+13) & 15]            \n\
+	xorl	-64+4*"n8"(%rsp), %esi	# ^W[(n+8) & 15]            \n\
+	xorl	-64+4*"n2"(%rsp), %esi	# ^W[(n+2) & 15]            \n\
+	xorl	-64+4*"n"(%rsp), %esi	# ^W[n & 15]                \n\
+	roll	%esi			#                           \n\
+	movl	%esi, -64+4*"n"(%rsp)	# store to W[n & 15]        \n\
+	movl	%e"c", %edi		# c                         \n\
+	xorl	%e"d", %edi		# ^d                        \n\
+	xorl	%e"b", %edi		# ^b                        \n\
+	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
+	addl	%edi, %e"e"		# e += (c ^ d ^ b)          \n\
+	movl	%e"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
+	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
+"
+#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	movl	-64+4*"n13"(%rsp), %esi	# W[(n+13) & 15]            \n\
+	xorl	-64+4*"n8"(%rsp), %esi	# ^W[(n+8) & 15]            \n\
+	xorl	-64+4*"n2"(%rsp), %esi	# ^W[(n+2) & 15]            \n\
+	xorl	-64+4*"n"(%rsp), %esi	# ^W[n & 15]                \n\
+	roll	%esi			#                           \n\
+	##movl	%esi, -64+4*"n"(%rsp)	# store to W[n & 15] elided \n\
+	movl	%e"c", %edi		# c                         \n\
+	xorl	%e"d", %edi		# ^d                        \n\
+	xorl	%e"b", %edi		# ^b                        \n\
+	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
+	addl	%edi, %e"e"		# e += (c ^ d ^ b)          \n\
+	movl	%e"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
+	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
+"
+#define RD4A(a,b,c,d,e, n) RD4As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
+#define RD4B(a,b,c,d,e, n) RD4Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
+#undef  RCONST
+//#define RCONST 0xCA62C1D6 "out of range for signed 32bit displacement"
+#define RCONST  -0x359d3e2a
+	RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4)
+	RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9)
+	RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14)
+	RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)
+
+"\n\
+	movq	%r10, %rdi	#                                   \n\
+	addl	%eax, 80(%rdi)  # ctx->hash[0] += a                 \n\
+	addl	%ebx, 84(%rdi)  # ctx->hash[1] += b                 \n\
+	addl	%ecx, 88(%rdi)  # ctx->hash[2] += c                 \n\
+	addl	%edx, 92(%rdi)  # ctx->hash[3] += d                 \n\
+	addl	%ebp, 96(%rdi)  # ctx->hash[4] += e                 \n\
+	movq	%r9, %rbx	# callee-saved                      \n\
+	movq	%r8, %rbp	# callee-saved                      \n\
+	##popq	%rbx		#                                   \n\
+	##popq	%rbp		#                                   \n\
+	##popq	%r12		#                                   \n\
+	##popq	%r13		#                                   \n\
+	##popq	%r14		#                                   \n\
+	##popq	%r15		#                                   \n\
+"
+	); /* asm */
+#undef RCONST
+}
+# else
 /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
  * It seems further speedup can be achieved by handling more than
  * 64 bytes per one function call (coreutils does that).
@@ -571,6 +987,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 	ctx->hash[3] += d;
 	ctx->hash[4] += e;
 }
+# endif
 #elif CONFIG_SHA1_SMALL == 1
 /* Middle-sized version, +300 bytes of code on x86. */
 static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
-- 
cgit v1.2.3-55-g6feb


From d643010feeef312c77d7f51c3dd476d4e605c982 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Sat, 1 Jan 2022 15:01:53 +0100
Subject: libbb/sha1: shrink x86_64 version - use r8..15 for W[8..15]

function                                             old     new   delta
sha1_process_block64                                3683    3562    -121

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/Config.src     |   2 +-
 libbb/hash_md5_sha.c | 299 ++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 240 insertions(+), 61 deletions(-)

diff --git a/libbb/Config.src b/libbb/Config.src
index e027c14a8..f66f65f81 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -59,7 +59,7 @@ config SHA1_SMALL
 	Trade binary size versus speed for the sha1 algorithm.
 	                throughput MB/s   size of sha1_process_block64
 	value           486  x86-64       486   x86-64
-	0               367  367          3657  3683
+	0               367  367          3657  3562
 	1               224  229           654   732
 	2,3             200  195           358   380
 
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 9de30dfe6..a4e36066a 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -700,22 +700,194 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
 {
 	BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80);
 	asm(
-// TODO: store W[] in r8..r15? (r8..r11 are callee-clobbered, no need to save)
 "\n\
-	##pushq	%r15		#                                   \n\
-	##pushq	%r14		#                                   \n\
-	##pushq	%r13		#                                   \n\
-	##pushq	%r12		#                                   \n\
-	##pushq	%rbp		#                                   \n\
-	##pushq	%rbx		#                                   \n\
-	movq	%rbp, %r8	# callee-saved                      \n\
-	movq	%rbx, %r9	# callee-saved                      \n\
-	movq	%rdi, %r10	# we need ctx at the end            \n\
-	movl	$15, %eax                                           \n\
+	pushq	%r15		#                                   \n\
+	pushq	%r14		#                                   \n\
+	pushq	%r13		#                                   \n\
+	pushq	%r12		#                                   \n\
+	pushq	%rbp		#                                   \n\
+	pushq	%rbx		#                                   \n\
+	pushq	%rdi		# we need ctx at the end            \n\
+                                                                    \n\
+#Register and stack use:                                            \n\
+# eax..edx: a..d                                                    \n\
+# ebp: e                                                            \n\
+# esi,edi: temps                                                    \n\
+# -32+4*n(%rsp),r8...r15: W[0..7,8..15]                             \n\
+	.macro	loadW n,r                                           \n\
+	.if \\n == 0                                                \n\
+	movl	-32+4*0(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 1                                                \n\
+	movl	-32+4*1(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 2                                                \n\
+	movl	-32+4*2(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 3                                                \n\
+	movl	-32+4*3(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 4                                                \n\
+	movl	-32+4*4(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 5                                                \n\
+	movl	-32+4*5(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 6                                                \n\
+	movl	-32+4*6(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 7                                                \n\
+	movl	-32+4*7(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 8                                                \n\
+	movl	%r8d,\\r                                            \n\
+	.endif                                                      \n\
+	.if \\n == 9                                                \n\
+	movl	%r9d,\\r                                            \n\
+	.endif                                                      \n\
+	.if \\n == 10                                               \n\
+	movl	%r10d,\\r                                           \n\
+	.endif                                                      \n\
+	.if \\n == 11                                               \n\
+	movl	%r11d,\\r                                           \n\
+	.endif                                                      \n\
+	.if \\n == 12                                               \n\
+	movl	%r12d,\\r                                           \n\
+	.endif                                                      \n\
+	.if \\n == 13                                               \n\
+	movl	%r13d,\\r                                           \n\
+	.endif                                                      \n\
+	.if \\n == 14                                               \n\
+	movl	%r14d,\\r                                           \n\
+	.endif                                                      \n\
+	.if \\n == 15                                               \n\
+	movl	%r15d,\\r                                           \n\
+	.endif                                                      \n\
+	.endm                                                       \n\
+                                                                    \n\
+	.macro	storeW r,n                                          \n\
+	.if \\n == 0                                                \n\
+	movl	\\r,-32+4*0(%rsp)                                   \n\
+	.endif                                                      \n\
+	.if \\n == 1                                                \n\
+	movl	\\r,-32+4*1(%rsp)                                   \n\
+	.endif                                                      \n\
+	.if \\n == 2                                                \n\
+	movl	\\r,-32+4*2(%rsp)                                   \n\
+	.endif                                                      \n\
+	.if \\n == 3                                                \n\
+	movl	\\r,-32+4*3(%rsp)                                   \n\
+	.endif                                                      \n\
+	.if \\n == 4                                                \n\
+	movl	\\r,-32+4*4(%rsp)                                   \n\
+	.endif                                                      \n\
+	.if \\n == 5                                                \n\
+	movl	\\r,-32+4*5(%rsp)                                   \n\
+	.endif                                                      \n\
+	.if \\n == 6                                                \n\
+	movl	\\r,-32+4*6(%rsp)                                   \n\
+	.endif                                                      \n\
+	.if \\n == 7                                                \n\
+	movl	\\r,-32+4*7(%rsp)                                   \n\
+	.endif                                                      \n\
+	.if \\n == 8                                                \n\
+	movl	\\r,%r8d                                            \n\
+	.endif                                                      \n\
+	.if \\n == 9                                                \n\
+	movl	\\r,%r9d                                            \n\
+	.endif                                                      \n\
+	.if \\n == 10                                               \n\
+	movl	\\r,%r10d                                           \n\
+	.endif                                                      \n\
+	.if \\n == 11                                               \n\
+	movl	\\r,%r11d                                           \n\
+	.endif                                                      \n\
+	.if \\n == 12                                               \n\
+	movl	\\r,%r12d                                           \n\
+	.endif                                                      \n\
+	.if \\n == 13                                               \n\
+	movl	\\r,%r13d                                           \n\
+	.endif                                                      \n\
+	.if \\n == 14                                               \n\
+	movl	\\r,%r14d                                           \n\
+	.endif                                                      \n\
+	.if \\n == 15                                               \n\
+	movl	\\r,%r15d                                           \n\
+	.endif                                                      \n\
+	.endm                                                       \n\
+                                                                    \n\
+	.macro	xorW n,r                                            \n\
+	.if \\n == 0                                                \n\
+	xorl	-32+4*0(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 1                                                \n\
+	xorl	-32+4*1(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 2                                                \n\
+	xorl	-32+4*2(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 3                                                \n\
+	xorl	-32+4*3(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 4                                                \n\
+	xorl	-32+4*4(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 5                                                \n\
+	xorl	-32+4*5(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 6                                                \n\
+	xorl	-32+4*6(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 7                                                \n\
+	xorl	-32+4*7(%rsp),\\r                                   \n\
+	.endif                                                      \n\
+	.if \\n == 8                                                \n\
+	xorl	%r8d,\\r                                            \n\
+	.endif                                                      \n\
+	.if \\n == 9                                                \n\
+	xorl	%r9d,\\r                                            \n\
+	.endif                                                      \n\
+	.if \\n == 10                                               \n\
+	xorl	%r10d,\\r                                           \n\
+	.endif                                                      \n\
+	.if \\n == 11                                               \n\
+	xorl	%r11d,\\r                                           \n\
+	.endif                                                      \n\
+	.if \\n == 12                                               \n\
+	xorl	%r12d,\\r                                           \n\
+	.endif                                                      \n\
+	.if \\n == 13                                               \n\
+	xorl	%r13d,\\r                                           \n\
+	.endif                                                      \n\
+	.if \\n == 14                                               \n\
+	xorl	%r14d,\\r                                           \n\
+	.endif                                                      \n\
+	.if \\n == 15                                               \n\
+	xorl	%r15d,\\r                                           \n\
+	.endif                                                      \n\
+	.endm                                                       \n\
+                                                                    \n\
+	movl	4*8(%rdi), %r8d                                     \n\
+	bswap	%r8d                                                \n\
+	movl	4*9(%rdi), %r9d                                     \n\
+	bswap	%r9d                                                \n\
+	movl	4*10(%rdi), %r10d                                   \n\
+	bswap	%r10d                                               \n\
+	movl	4*11(%rdi), %r11d                                   \n\
+	bswap	%r11d                                               \n\
+	movl	4*12(%rdi), %r12d                                   \n\
+	bswap	%r12d                                               \n\
+	movl	4*13(%rdi), %r13d                                   \n\
+	bswap	%r13d                                               \n\
+	movl	4*14(%rdi), %r14d                                   \n\
+	bswap	%r14d                                               \n\
+	movl	4*15(%rdi), %r15d                                   \n\
+	bswap	%r15d                                               \n\
+	movl	$7, %eax                                            \n\
 1:                                                                  \n\
 	movl	(%rdi,%rax,4), %esi                                 \n\
 	bswap	%esi                                                \n\
-	movl	%esi, -64(%rsp,%rax,4)                              \n\
+	movl	%esi, -32(%rsp,%rax,4)                              \n\
 	decl	%eax                                                \n\
 	jns	1b                                                  \n\
 	movl	80(%rdi), %eax	# a = ctx->hash[0]                  \n\
@@ -723,15 +895,10 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
 	movl	88(%rdi), %ecx	# c = ctx->hash[2]                  \n\
 	movl	92(%rdi), %edx	# d = ctx->hash[3]                  \n\
 	movl	96(%rdi), %ebp	# e = ctx->hash[4]                  \n\
-#Register and stack use:                                            \n\
-# eax..edx: a..d                                                    \n\
-# ebp: e                                                            \n\
-# esi,edi: temps                                                    \n\
-# -64+4*n(%rsp): W[n]                                               \n\
 "
 #define RD1As(a,b,c,d,e, n, RCONST) \
 "\n\
-	##movl	-64+4*"n"(%rsp), %esi	# n=0, W[0] already in %esi \n\
+	##loadW	"n", %esi		# n=0, W[0] already in %esi \n\
 	movl	%e"c", %edi		# c                         \n\
 	xorl	%e"d", %edi		# ^d                        \n\
 	andl	%e"b", %edi		# &b                        \n\
@@ -745,7 +912,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
 "
 #define RD1Bs(a,b,c,d,e, n, RCONST) \
 "\n\
-	movl	-64+4*"n"(%rsp), %esi	# W[n]                      \n\
+	loadW	"n", %esi		# W[n]                      \n\
 	movl	%e"c", %edi		# c                         \n\
 	xorl	%e"d", %edi		# ^d                        \n\
 	andl	%e"b", %edi		# &b                        \n\
@@ -757,14 +924,27 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
 	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
 	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
 "
-#define RD1Cs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+#define RD1Cs(a,b,c,d,e, n, RCONST) \
 "\n\
-	movl	-64+4*"n13"(%rsp), %esi	# W[(n+13) & 15]            \n\
-	xorl	-64+4*"n8"(%rsp), %esi	# ^W[(n+8) & 15]            \n\
-	xorl	-64+4*"n2"(%rsp), %esi	# ^W[(n+2) & 15]            \n\
-	xorl	-64+4*"n"(%rsp), %esi	# ^W[n & 15]                \n\
+	movl	%e"c", %edi		# c                         \n\
+	xorl	%e"d", %edi		# ^d                        \n\
+	andl	%e"b", %edi		# &b                        \n\
+	xorl	%e"d", %edi		# (((c ^ d) & b) ^ d)       \n\
+	leal	"RCONST"(%r"e",%r"n"), %e"e" # e += RCONST + W[n]   \n\
+	addl	%edi, %e"e"		# e += (((c ^ d) & b) ^ d)  \n\
+	movl	%e"a", %esi		#                           \n\
+	roll	$5, %esi		# rotl32(a,5)               \n\
+	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
+	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
+"
+#define RD1Ds(a,b,c,d,e, n13,n8,n2,n, RCONST) \
+"\n\
+	loadW	"n13", %esi		# W[(n+13) & 15]            \n\
+	xorW	"n8", %esi		# ^W[(n+8) & 15]            \n\
+	xorW	"n2", %esi		# ^W[(n+2) & 15]            \n\
+	xorW	"n", %esi		# ^W[n & 15]                \n\
 	roll	%esi			#                           \n\
-	movl	%esi, -64+4*"n"(%rsp)	# store to W[n & 15]        \n\
+	storeW	%esi, "n"		# store to W[n & 15]        \n\
 	movl	%e"c", %edi		# c                         \n\
 	xorl	%e"d", %edi		# ^d                        \n\
 	andl	%e"b", %edi		# &b                        \n\
@@ -776,23 +956,24 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
 	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
 	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
 "
-#define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST))
-#define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR((n)), STR(RCONST))
-#define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
+#define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST))
+#define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST))
+#define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST))
+#define RD1D(a,b,c,d,e, n) RD1Ds(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
 #undef  RCONST
 #define RCONST 0x5A827999
 	RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4)
-	RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1B(cx,dx,bp,ax,bx, 8) RD1B(bx,cx,dx,bp,ax, 9)
-	RD1B(ax,bx,cx,dx,bp,10) RD1B(bp,ax,bx,cx,dx,11) RD1B(dx,bp,ax,bx,cx,12) RD1B(cx,dx,bp,ax,bx,13) RD1B(bx,cx,dx,bp,ax,14)
-	RD1B(ax,bx,cx,dx,bp,15) RD1C(bp,ax,bx,cx,dx,16) RD1C(dx,bp,ax,bx,cx,17) RD1C(cx,dx,bp,ax,bx,18) RD1C(bx,cx,dx,bp,ax,19)
+	RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1C(cx,dx,bp,ax,bx, 8) RD1C(bx,cx,dx,bp,ax, 9)
+	RD1C(ax,bx,cx,dx,bp,10) RD1C(bp,ax,bx,cx,dx,11) RD1C(dx,bp,ax,bx,cx,12) RD1C(cx,dx,bp,ax,bx,13) RD1C(bx,cx,dx,bp,ax,14)
+	RD1C(ax,bx,cx,dx,bp,15) RD1D(bp,ax,bx,cx,dx,16) RD1D(dx,bp,ax,bx,cx,17) RD1D(cx,dx,bp,ax,bx,18) RD1D(bx,cx,dx,bp,ax,19)
 #define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
 "\n\
-	movl	-64+4*"n13"(%rsp), %esi	# W[(n+13) & 15]            \n\
-	xorl	-64+4*"n8"(%rsp), %esi	# ^W[(n+8) & 15]            \n\
-	xorl	-64+4*"n2"(%rsp), %esi	# ^W[(n+2) & 15]            \n\
-	xorl	-64+4*"n"(%rsp), %esi	# ^W[n & 15]                \n\
+	loadW	"n13", %esi		# W[(n+13) & 15]            \n\
+	xorW	"n8", %esi		# ^W[(n+8) & 15]            \n\
+	xorW	"n2", %esi		# ^W[(n+2) & 15]            \n\
+	xorW	"n", %esi		# ^W[n & 15]                \n\
 	roll	%esi			#                           \n\
-	movl	%esi, -64+4*"n"(%rsp)	# store to W[n & 15]        \n\
+	storeW	%esi, "n"		# store to W[n & 15]        \n\
 	movl	%e"c", %edi		# c                         \n\
 	xorl	%e"d", %edi		# ^d                        \n\
 	xorl	%e"b", %edi		# ^b                        \n\
@@ -819,12 +1000,12 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
 	andl	%e"c", %esi		# si: b & c                 \n\
 	andl	%e"d", %edi		# di: (b | c) & d           \n\
 	orl	%esi, %edi		# ((b | c) & d) | (b & c)   \n\
-	movl	-64+4*"n13"(%rsp), %esi	# W[(n+13) & 15]            \n\
-	xorl	-64+4*"n8"(%rsp), %esi	# ^W[(n+8) & 15]            \n\
-	xorl	-64+4*"n2"(%rsp), %esi	# ^W[(n+2) & 15]            \n\
-	xorl	-64+4*"n"(%rsp), %esi	# ^W[n & 15]                \n\
+	loadW	"n13", %esi		# W[(n+13) & 15]            \n\
+	xorW	"n8", %esi		# ^W[(n+8) & 15]            \n\
+	xorW	"n2", %esi		# ^W[(n+2) & 15]            \n\
+	xorW	"n", %esi		# ^W[n & 15]                \n\
 	roll	%esi			#                           \n\
-	movl	%esi, -64+4*"n"(%rsp)	# store to W[n & 15]        \n\
+	storeW	%esi, "n"		# store to W[n & 15]        \n\
 	addl	%edi, %e"e"		# += ((b | c) & d) | (b & c)\n\
 	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
 	movl	%e"a", %esi		#                           \n\
@@ -843,12 +1024,12 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
 
 #define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \
 "\n\
-	movl	-64+4*"n13"(%rsp), %esi	# W[(n+13) & 15]            \n\
-	xorl	-64+4*"n8"(%rsp), %esi	# ^W[(n+8) & 15]            \n\
-	xorl	-64+4*"n2"(%rsp), %esi	# ^W[(n+2) & 15]            \n\
-	xorl	-64+4*"n"(%rsp), %esi	# ^W[n & 15]                \n\
+	loadW	"n13", %esi		# W[(n+13) & 15]            \n\
+	xorW	"n8", %esi		# ^W[(n+8) & 15]            \n\
+	xorW	"n2", %esi		# ^W[(n+2) & 15]            \n\
+	xorW	"n", %esi		# ^W[n & 15]                \n\
 	roll	%esi			#                           \n\
-	movl	%esi, -64+4*"n"(%rsp)	# store to W[n & 15]        \n\
+	storeW	%esi, "n"		# store to W[n & 15]        \n\
 	movl	%e"c", %edi		# c                         \n\
 	xorl	%e"d", %edi		# ^d                        \n\
 	xorl	%e"b", %edi		# ^b                        \n\
@@ -861,12 +1042,12 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
 "
 #define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
 "\n\
-	movl	-64+4*"n13"(%rsp), %esi	# W[(n+13) & 15]            \n\
-	xorl	-64+4*"n8"(%rsp), %esi	# ^W[(n+8) & 15]            \n\
-	xorl	-64+4*"n2"(%rsp), %esi	# ^W[(n+2) & 15]            \n\
-	xorl	-64+4*"n"(%rsp), %esi	# ^W[n & 15]                \n\
+	loadW	"n13", %esi		# W[(n+13) & 15]            \n\
+	xorW	"n8", %esi		# ^W[(n+8) & 15]            \n\
+	xorW	"n2", %esi		# ^W[(n+2) & 15]            \n\
+	xorW	"n", %esi		# ^W[n & 15]                \n\
 	roll	%esi			#                           \n\
-	##movl	%esi, -64+4*"n"(%rsp)	# store to W[n & 15] elided \n\
+	#storeW	%esi, "n"		# store to W[n & 15] elided \n\
 	movl	%e"c", %edi		# c                         \n\
 	xorl	%e"d", %edi		# ^d                        \n\
 	xorl	%e"b", %edi		# ^b                        \n\
@@ -888,20 +1069,18 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
 	RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)
 
 "\n\
-	movq	%r10, %rdi	#                                   \n\
+	popq	%rdi		#                                   \n\
 	addl	%eax, 80(%rdi)  # ctx->hash[0] += a                 \n\
 	addl	%ebx, 84(%rdi)  # ctx->hash[1] += b                 \n\
 	addl	%ecx, 88(%rdi)  # ctx->hash[2] += c                 \n\
 	addl	%edx, 92(%rdi)  # ctx->hash[3] += d                 \n\
 	addl	%ebp, 96(%rdi)  # ctx->hash[4] += e                 \n\
-	movq	%r9, %rbx	# callee-saved                      \n\
-	movq	%r8, %rbp	# callee-saved                      \n\
-	##popq	%rbx		#                                   \n\
-	##popq	%rbp		#                                   \n\
-	##popq	%r12		#                                   \n\
-	##popq	%r13		#                                   \n\
-	##popq	%r14		#                                   \n\
-	##popq	%r15		#                                   \n\
+	popq	%rbx		#                                   \n\
+	popq	%rbp		#                                   \n\
+	popq	%r12		#                                   \n\
+	popq	%r13		#                                   \n\
+	popq	%r14		#                                   \n\
+	popq	%r15		#                                   \n\
 "
 	); /* asm */
 #undef RCONST
-- 
cgit v1.2.3-55-g6feb


From 4d4f1f2096f06d69a6f205f0d8e33d4398f25677 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Sat, 1 Jan 2022 15:42:15 +0100
Subject: libbb/sha1: x86_64 version: bswap in 64-bit chunks

function                                             old     new   delta
sha1_process_block64                                3562    3570      +8

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/Config.src     |  2 +-
 libbb/hash_md5_sha.c | 42 ++++++++++++++++++++++--------------------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/libbb/Config.src b/libbb/Config.src
index f66f65f81..42a2283aa 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -59,7 +59,7 @@ config SHA1_SMALL
 	Trade binary size versus speed for the sha1 algorithm.
 	                throughput MB/s   size of sha1_process_block64
 	value           486  x86-64       486   x86-64
-	0               367  367          3657  3562
+	0               367  367          3657  3570
 	1               224  229           654   732
 	2,3             200  195           358   380
 
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index a4e36066a..959bfc951 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -867,27 +867,29 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
 	.endif                                                      \n\
 	.endm                                                       \n\
                                                                     \n\
-	movl	4*8(%rdi), %r8d                                     \n\
-	bswap	%r8d                                                \n\
-	movl	4*9(%rdi), %r9d                                     \n\
-	bswap	%r9d                                                \n\
-	movl	4*10(%rdi), %r10d                                   \n\
-	bswap	%r10d                                               \n\
-	movl	4*11(%rdi), %r11d                                   \n\
-	bswap	%r11d                                               \n\
-	movl	4*12(%rdi), %r12d                                   \n\
-	bswap	%r12d                                               \n\
-	movl	4*13(%rdi), %r13d                                   \n\
-	bswap	%r13d                                               \n\
-	movl	4*14(%rdi), %r14d                                   \n\
-	bswap	%r14d                                               \n\
-	movl	4*15(%rdi), %r15d                                   \n\
-	bswap	%r15d                                               \n\
-	movl	$7, %eax                                            \n\
+	movq	4*8(%rdi), %r8                                      \n\
+	bswap	%r8                                                 \n\
+	movl	%r8d, %r9d                                          \n\
+	shrq	$32, %r8                                            \n\
+	movq	4*10(%rdi), %r10                                    \n\
+	bswap	%r10                                                \n\
+	movl	%r10d, %r11d                                        \n\
+	shrq	$32, %r10                                           \n\
+	movq	4*12(%rdi), %r12                                    \n\
+	bswap	%r12                                                \n\
+	movl	%r12d, %r13d                                        \n\
+	shrq	$32, %r12                                           \n\
+	movq	4*14(%rdi), %r14                                    \n\
+	bswap	%r14                                                \n\
+	movl	%r14d, %r15d                                        \n\
+	shrq	$32, %r14                                           \n\
+                                                                    \n\
+	movl	$3, %eax                                            \n\
 1:                                                                  \n\
-	movl	(%rdi,%rax,4), %esi                                 \n\
-	bswap	%esi                                                \n\
-	movl	%esi, -32(%rsp,%rax,4)                              \n\
+	movq	(%rdi,%rax,8), %rsi                                 \n\
+	bswap	%rsi                                                \n\
+	rolq	$32, %rsi                                           \n\
+	movq	%rsi, -32(%rsp,%rax,8)                              \n\
 	decl	%eax                                                \n\
 	jns	1b                                                  \n\
 	movl	80(%rdi), %eax	# a = ctx->hash[0]                  \n\
-- 
cgit v1.2.3-55-g6feb


From 5c0c5582319a5123635c9fd62f8e99ef01cceb3f Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Sun, 2 Jan 2022 01:56:35 +0100
Subject: libbb/sha1: code shrink in medium-speed version

function                                             old     new   delta
sha1_process_block64                                 654     641     -13

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/hash_md5_sha.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 959bfc951..7eca3de4d 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -1121,7 +1121,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
  * see what the value will be).
  * """
  */
-#if defined(__i386__)
+#if defined(__GNUC__) && defined(__i386__)
 # define DO_NOT_TRY_PROPAGATING(m) asm("":"+m"(m))
 #else
 # define DO_NOT_TRY_PROPAGATING(m) ((void)0)
@@ -1212,7 +1212,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx)
 		c = rotl32(b, 30);
 		b = a;
 		a = work;
-		n = (n + 1) & 15;
+		n = (n + 1) /* & 15*/;
 	} while (n != 4);
 	/* 2nd round of 20 operations */
 	j = 19;
-- 
cgit v1.2.3-55-g6feb


From 05fd13ebec869fc5e6f226481a2405a2685e8db1 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Mon, 3 Jan 2022 01:57:29 +0100
Subject: libbb/sha1: x86_64 version: move to a separate .S file, no code
 changes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/Kbuild.src            |    1 +
 libbb/hash_md5_sha.c        |  392 +------------
 libbb/hash_md5_sha_x86-64.S | 1349 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1353 insertions(+), 389 deletions(-)
 create mode 100644 libbb/hash_md5_sha_x86-64.S

diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src
index 2fa239857..19b8aad60 100644
--- a/libbb/Kbuild.src
+++ b/libbb/Kbuild.src
@@ -56,6 +56,7 @@ lib-y += login.o
 lib-y += make_directory.o
 lib-y += makedev.o
 lib-y += hash_md5_sha.o
+lib-y += hash_md5_sha_x86-64.o
 # Alternative (disabled) MD5 implementation
 #lib-y += hash_md5prime.o
 lib-y += messages.o
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index 7eca3de4d..ee19c1cb7 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -696,397 +696,11 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
 #undef RCONST
 }
 # elif defined(__GNUC__) && defined(__x86_64__)
-static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
-{
-	BUILD_BUG_ON(offsetof(sha1_ctx_t, hash) != 80);
-	asm(
-"\n\
-	pushq	%r15		#                                   \n\
-	pushq	%r14		#                                   \n\
-	pushq	%r13		#                                   \n\
-	pushq	%r12		#                                   \n\
-	pushq	%rbp		#                                   \n\
-	pushq	%rbx		#                                   \n\
-	pushq	%rdi		# we need ctx at the end            \n\
-                                                                    \n\
-#Register and stack use:                                            \n\
-# eax..edx: a..d                                                    \n\
-# ebp: e                                                            \n\
-# esi,edi: temps                                                    \n\
-# -32+4*n(%rsp),r8...r15: W[0..7,8..15]                             \n\
-	.macro	loadW n,r                                           \n\
-	.if \\n == 0                                                \n\
-	movl	-32+4*0(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 1                                                \n\
-	movl	-32+4*1(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 2                                                \n\
-	movl	-32+4*2(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 3                                                \n\
-	movl	-32+4*3(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 4                                                \n\
-	movl	-32+4*4(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 5                                                \n\
-	movl	-32+4*5(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 6                                                \n\
-	movl	-32+4*6(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 7                                                \n\
-	movl	-32+4*7(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 8                                                \n\
-	movl	%r8d,\\r                                            \n\
-	.endif                                                      \n\
-	.if \\n == 9                                                \n\
-	movl	%r9d,\\r                                            \n\
-	.endif                                                      \n\
-	.if \\n == 10                                               \n\
-	movl	%r10d,\\r                                           \n\
-	.endif                                                      \n\
-	.if \\n == 11                                               \n\
-	movl	%r11d,\\r                                           \n\
-	.endif                                                      \n\
-	.if \\n == 12                                               \n\
-	movl	%r12d,\\r                                           \n\
-	.endif                                                      \n\
-	.if \\n == 13                                               \n\
-	movl	%r13d,\\r                                           \n\
-	.endif                                                      \n\
-	.if \\n == 14                                               \n\
-	movl	%r14d,\\r                                           \n\
-	.endif                                                      \n\
-	.if \\n == 15                                               \n\
-	movl	%r15d,\\r                                           \n\
-	.endif                                                      \n\
-	.endm                                                       \n\
-                                                                    \n\
-	.macro	storeW r,n                                          \n\
-	.if \\n == 0                                                \n\
-	movl	\\r,-32+4*0(%rsp)                                   \n\
-	.endif                                                      \n\
-	.if \\n == 1                                                \n\
-	movl	\\r,-32+4*1(%rsp)                                   \n\
-	.endif                                                      \n\
-	.if \\n == 2                                                \n\
-	movl	\\r,-32+4*2(%rsp)                                   \n\
-	.endif                                                      \n\
-	.if \\n == 3                                                \n\
-	movl	\\r,-32+4*3(%rsp)                                   \n\
-	.endif                                                      \n\
-	.if \\n == 4                                                \n\
-	movl	\\r,-32+4*4(%rsp)                                   \n\
-	.endif                                                      \n\
-	.if \\n == 5                                                \n\
-	movl	\\r,-32+4*5(%rsp)                                   \n\
-	.endif                                                      \n\
-	.if \\n == 6                                                \n\
-	movl	\\r,-32+4*6(%rsp)                                   \n\
-	.endif                                                      \n\
-	.if \\n == 7                                                \n\
-	movl	\\r,-32+4*7(%rsp)                                   \n\
-	.endif                                                      \n\
-	.if \\n == 8                                                \n\
-	movl	\\r,%r8d                                            \n\
-	.endif                                                      \n\
-	.if \\n == 9                                                \n\
-	movl	\\r,%r9d                                            \n\
-	.endif                                                      \n\
-	.if \\n == 10                                               \n\
-	movl	\\r,%r10d                                           \n\
-	.endif                                                      \n\
-	.if \\n == 11                                               \n\
-	movl	\\r,%r11d                                           \n\
-	.endif                                                      \n\
-	.if \\n == 12                                               \n\
-	movl	\\r,%r12d                                           \n\
-	.endif                                                      \n\
-	.if \\n == 13                                               \n\
-	movl	\\r,%r13d                                           \n\
-	.endif                                                      \n\
-	.if \\n == 14                                               \n\
-	movl	\\r,%r14d                                           \n\
-	.endif                                                      \n\
-	.if \\n == 15                                               \n\
-	movl	\\r,%r15d                                           \n\
-	.endif                                                      \n\
-	.endm                                                       \n\
-                                                                    \n\
-	.macro	xorW n,r                                            \n\
-	.if \\n == 0                                                \n\
-	xorl	-32+4*0(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 1                                                \n\
-	xorl	-32+4*1(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 2                                                \n\
-	xorl	-32+4*2(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 3                                                \n\
-	xorl	-32+4*3(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 4                                                \n\
-	xorl	-32+4*4(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 5                                                \n\
-	xorl	-32+4*5(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 6                                                \n\
-	xorl	-32+4*6(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 7                                                \n\
-	xorl	-32+4*7(%rsp),\\r                                   \n\
-	.endif                                                      \n\
-	.if \\n == 8                                                \n\
-	xorl	%r8d,\\r                                            \n\
-	.endif                                                      \n\
-	.if \\n == 9                                                \n\
-	xorl	%r9d,\\r                                            \n\
-	.endif                                                      \n\
-	.if \\n == 10                                               \n\
-	xorl	%r10d,\\r                                           \n\
-	.endif                                                      \n\
-	.if \\n == 11                                               \n\
-	xorl	%r11d,\\r                                           \n\
-	.endif                                                      \n\
-	.if \\n == 12                                               \n\
-	xorl	%r12d,\\r                                           \n\
-	.endif                                                      \n\
-	.if \\n == 13                                               \n\
-	xorl	%r13d,\\r                                           \n\
-	.endif                                                      \n\
-	.if \\n == 14                                               \n\
-	xorl	%r14d,\\r                                           \n\
-	.endif                                                      \n\
-	.if \\n == 15                                               \n\
-	xorl	%r15d,\\r                                           \n\
-	.endif                                                      \n\
-	.endm                                                       \n\
-                                                                    \n\
-	movq	4*8(%rdi), %r8                                      \n\
-	bswap	%r8                                                 \n\
-	movl	%r8d, %r9d                                          \n\
-	shrq	$32, %r8                                            \n\
-	movq	4*10(%rdi), %r10                                    \n\
-	bswap	%r10                                                \n\
-	movl	%r10d, %r11d                                        \n\
-	shrq	$32, %r10                                           \n\
-	movq	4*12(%rdi), %r12                                    \n\
-	bswap	%r12                                                \n\
-	movl	%r12d, %r13d                                        \n\
-	shrq	$32, %r12                                           \n\
-	movq	4*14(%rdi), %r14                                    \n\
-	bswap	%r14                                                \n\
-	movl	%r14d, %r15d                                        \n\
-	shrq	$32, %r14                                           \n\
-                                                                    \n\
-	movl	$3, %eax                                            \n\
-1:                                                                  \n\
-	movq	(%rdi,%rax,8), %rsi                                 \n\
-	bswap	%rsi                                                \n\
-	rolq	$32, %rsi                                           \n\
-	movq	%rsi, -32(%rsp,%rax,8)                              \n\
-	decl	%eax                                                \n\
-	jns	1b                                                  \n\
-	movl	80(%rdi), %eax	# a = ctx->hash[0]                  \n\
-	movl	84(%rdi), %ebx	# b = ctx->hash[1]                  \n\
-	movl	88(%rdi), %ecx	# c = ctx->hash[2]                  \n\
-	movl	92(%rdi), %edx	# d = ctx->hash[3]                  \n\
-	movl	96(%rdi), %ebp	# e = ctx->hash[4]                  \n\
-"
-#define RD1As(a,b,c,d,e, n, RCONST) \
-"\n\
-	##loadW	"n", %esi		# n=0, W[0] already in %esi \n\
-	movl	%e"c", %edi		# c                         \n\
-	xorl	%e"d", %edi		# ^d                        \n\
-	andl	%e"b", %edi		# &b                        \n\
-	xorl	%e"d", %edi		# (((c ^ d) & b) ^ d)       \n\
-	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n]    \n\
-	addl	%edi, %e"e"		# e += (((c ^ d) & b) ^ d)  \n\
-	movl	%e"a", %esi		#                           \n\
-	roll	$5, %esi		# rotl32(a,5)               \n\
-	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
-	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
-"
-#define RD1Bs(a,b,c,d,e, n, RCONST) \
-"\n\
-	loadW	"n", %esi		# W[n]                      \n\
-	movl	%e"c", %edi		# c                         \n\
-	xorl	%e"d", %edi		# ^d                        \n\
-	andl	%e"b", %edi		# &b                        \n\
-	xorl	%e"d", %edi		# (((c ^ d) & b) ^ d)       \n\
-	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + W[n]    \n\
-	addl	%edi, %e"e"		# e += (((c ^ d) & b) ^ d)  \n\
-	movl	%e"a", %esi		#                           \n\
-	roll	$5, %esi		# rotl32(a,5)               \n\
-	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
-	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
-"
-#define RD1Cs(a,b,c,d,e, n, RCONST) \
-"\n\
-	movl	%e"c", %edi		# c                         \n\
-	xorl	%e"d", %edi		# ^d                        \n\
-	andl	%e"b", %edi		# &b                        \n\
-	xorl	%e"d", %edi		# (((c ^ d) & b) ^ d)       \n\
-	leal	"RCONST"(%r"e",%r"n"), %e"e" # e += RCONST + W[n]   \n\
-	addl	%edi, %e"e"		# e += (((c ^ d) & b) ^ d)  \n\
-	movl	%e"a", %esi		#                           \n\
-	roll	$5, %esi		# rotl32(a,5)               \n\
-	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
-	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
-"
-#define RD1Ds(a,b,c,d,e, n13,n8,n2,n, RCONST) \
-"\n\
-	loadW	"n13", %esi		# W[(n+13) & 15]            \n\
-	xorW	"n8", %esi		# ^W[(n+8) & 15]            \n\
-	xorW	"n2", %esi		# ^W[(n+2) & 15]            \n\
-	xorW	"n", %esi		# ^W[n & 15]                \n\
-	roll	%esi			#                           \n\
-	storeW	%esi, "n"		# store to W[n & 15]        \n\
-	movl	%e"c", %edi		# c                         \n\
-	xorl	%e"d", %edi		# ^d                        \n\
-	andl	%e"b", %edi		# &b                        \n\
-	xorl	%e"d", %edi		# (((c ^ d) & b) ^ d)       \n\
-	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
-	addl	%edi, %e"e"		# e += (((c ^ d) & b) ^ d)  \n\
-	movl	%e"a", %esi		#                           \n\
-	roll	$5, %esi		# rotl32(a,5)               \n\
-	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
-	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
-"
-#define RD1A(a,b,c,d,e, n) RD1As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST))
-#define RD1B(a,b,c,d,e, n) RD1Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST))
-#define RD1C(a,b,c,d,e, n) RD1Cs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(n), STR(RCONST))
-#define RD1D(a,b,c,d,e, n) RD1Ds(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((n+13)&15)), STR(((n+8)&15)), STR(((n+2)&15)), STR(((n)&15)), STR(RCONST))
-#undef  RCONST
-#define RCONST 0x5A827999
-	RD1A(ax,bx,cx,dx,bp, 0) RD1B(bp,ax,bx,cx,dx, 1) RD1B(dx,bp,ax,bx,cx, 2) RD1B(cx,dx,bp,ax,bx, 3) RD1B(bx,cx,dx,bp,ax, 4)
-	RD1B(ax,bx,cx,dx,bp, 5) RD1B(bp,ax,bx,cx,dx, 6) RD1B(dx,bp,ax,bx,cx, 7) RD1C(cx,dx,bp,ax,bx, 8) RD1C(bx,cx,dx,bp,ax, 9)
-	RD1C(ax,bx,cx,dx,bp,10) RD1C(bp,ax,bx,cx,dx,11) RD1C(dx,bp,ax,bx,cx,12) RD1C(cx,dx,bp,ax,bx,13) RD1C(bx,cx,dx,bp,ax,14)
-	RD1C(ax,bx,cx,dx,bp,15) RD1D(bp,ax,bx,cx,dx,16) RD1D(dx,bp,ax,bx,cx,17) RD1D(cx,dx,bp,ax,bx,18) RD1D(bx,cx,dx,bp,ax,19)
-#define RD2s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
-"\n\
-	loadW	"n13", %esi		# W[(n+13) & 15]            \n\
-	xorW	"n8", %esi		# ^W[(n+8) & 15]            \n\
-	xorW	"n2", %esi		# ^W[(n+2) & 15]            \n\
-	xorW	"n", %esi		# ^W[n & 15]                \n\
-	roll	%esi			#                           \n\
-	storeW	%esi, "n"		# store to W[n & 15]        \n\
-	movl	%e"c", %edi		# c                         \n\
-	xorl	%e"d", %edi		# ^d                        \n\
-	xorl	%e"b", %edi		# ^b                        \n\
-	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
-	addl	%edi, %e"e"		# e += (c ^ d ^ b)          \n\
-	movl	%e"a", %esi		#                           \n\
-	roll	$5, %esi		# rotl32(a,5)               \n\
-	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
-	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
-"
-#define RD2(a,b,c,d,e, n) RD2s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((20+n+13)&15)), STR(((20+n+8)&15)), STR(((20+n+2)&15)), STR(((20+n)&15)), STR(RCONST))
-#undef  RCONST
-#define RCONST 0x6ED9EBA1
-	RD2(ax,bx,cx,dx,bp, 0) RD2(bp,ax,bx,cx,dx, 1) RD2(dx,bp,ax,bx,cx, 2) RD2(cx,dx,bp,ax,bx, 3) RD2(bx,cx,dx,bp,ax, 4)
-	RD2(ax,bx,cx,dx,bp, 5) RD2(bp,ax,bx,cx,dx, 6) RD2(dx,bp,ax,bx,cx, 7) RD2(cx,dx,bp,ax,bx, 8) RD2(bx,cx,dx,bp,ax, 9)
-	RD2(ax,bx,cx,dx,bp,10) RD2(bp,ax,bx,cx,dx,11) RD2(dx,bp,ax,bx,cx,12) RD2(cx,dx,bp,ax,bx,13) RD2(bx,cx,dx,bp,ax,14)
-	RD2(ax,bx,cx,dx,bp,15) RD2(bp,ax,bx,cx,dx,16) RD2(dx,bp,ax,bx,cx,17) RD2(cx,dx,bp,ax,bx,18) RD2(bx,cx,dx,bp,ax,19)
-
-#define RD3s(a,b,c,d,e, n13,n8,n2,n, RCONST) \
-"\n\
-	movl	%e"b", %edi		# di: b                     \n\
-	movl	%e"b", %esi		# si: b                     \n\
-	orl	%e"c", %edi		# di: b | c                 \n\
-	andl	%e"c", %esi		# si: b & c                 \n\
-	andl	%e"d", %edi		# di: (b | c) & d           \n\
-	orl	%esi, %edi		# ((b | c) & d) | (b & c)   \n\
-	loadW	"n13", %esi		# W[(n+13) & 15]            \n\
-	xorW	"n8", %esi		# ^W[(n+8) & 15]            \n\
-	xorW	"n2", %esi		# ^W[(n+2) & 15]            \n\
-	xorW	"n", %esi		# ^W[n & 15]                \n\
-	roll	%esi			#                           \n\
-	storeW	%esi, "n"		# store to W[n & 15]        \n\
-	addl	%edi, %e"e"		# += ((b | c) & d) | (b & c)\n\
-	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
-	movl	%e"a", %esi		#                           \n\
-	roll	$5, %esi		# rotl32(a,5)               \n\
-	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
-	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
-"
-#define RD3(a,b,c,d,e, n) RD3s(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((40+n+13)&15)), STR(((40+n+8)&15)), STR(((40+n+2)&15)), STR(((40+n)&15)), STR(RCONST))
-#undef  RCONST
-//#define RCONST 0x8F1BBCDC "out of range for signed 32bit displacement"
-#define RCONST  -0x70e44324
-	RD3(ax,bx,cx,dx,bp, 0) RD3(bp,ax,bx,cx,dx, 1) RD3(dx,bp,ax,bx,cx, 2) RD3(cx,dx,bp,ax,bx, 3) RD3(bx,cx,dx,bp,ax, 4)
-	RD3(ax,bx,cx,dx,bp, 5) RD3(bp,ax,bx,cx,dx, 6) RD3(dx,bp,ax,bx,cx, 7) RD3(cx,dx,bp,ax,bx, 8) RD3(bx,cx,dx,bp,ax, 9)
-	RD3(ax,bx,cx,dx,bp,10) RD3(bp,ax,bx,cx,dx,11) RD3(dx,bp,ax,bx,cx,12) RD3(cx,dx,bp,ax,bx,13) RD3(bx,cx,dx,bp,ax,14)
-	RD3(ax,bx,cx,dx,bp,15) RD3(bp,ax,bx,cx,dx,16) RD3(dx,bp,ax,bx,cx,17) RD3(cx,dx,bp,ax,bx,18) RD3(bx,cx,dx,bp,ax,19)
 
-#define RD4As(a,b,c,d,e, n13,n8,n2,n, RCONST) \
-"\n\
-	loadW	"n13", %esi		# W[(n+13) & 15]            \n\
-	xorW	"n8", %esi		# ^W[(n+8) & 15]            \n\
-	xorW	"n2", %esi		# ^W[(n+2) & 15]            \n\
-	xorW	"n", %esi		# ^W[n & 15]                \n\
-	roll	%esi			#                           \n\
-	storeW	%esi, "n"		# store to W[n & 15]        \n\
-	movl	%e"c", %edi		# c                         \n\
-	xorl	%e"d", %edi		# ^d                        \n\
-	xorl	%e"b", %edi		# ^b                        \n\
-	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
-	addl	%edi, %e"e"		# e += (c ^ d ^ b)          \n\
-	movl	%e"a", %esi		#                           \n\
-	roll	$5, %esi		# rotl32(a,5)               \n\
-	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
-	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
-"
-#define RD4Bs(a,b,c,d,e, n13,n8,n2,n, RCONST) \
-"\n\
-	loadW	"n13", %esi		# W[(n+13) & 15]            \n\
-	xorW	"n8", %esi		# ^W[(n+8) & 15]            \n\
-	xorW	"n2", %esi		# ^W[(n+2) & 15]            \n\
-	xorW	"n", %esi		# ^W[n & 15]                \n\
-	roll	%esi			#                           \n\
-	#storeW	%esi, "n"		# store to W[n & 15] elided \n\
-	movl	%e"c", %edi		# c                         \n\
-	xorl	%e"d", %edi		# ^d                        \n\
-	xorl	%e"b", %edi		# ^b                        \n\
-	leal	"RCONST"(%r"e",%rsi), %e"e" # e += RCONST + mixed_W \n\
-	addl	%edi, %e"e"		# e += (c ^ d ^ b)          \n\
-	movl	%e"a", %esi		#                           \n\
-	roll	$5, %esi		# rotl32(a,5)               \n\
-	addl	%esi, %e"e"		# e += rotl32(a,5)          \n\
-	rorl	$2, %e"b"		# b = rotl32(b,30)          \n\
-"
-#define RD4A(a,b,c,d,e, n) RD4As(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
-#define RD4B(a,b,c,d,e, n) RD4Bs(STR(a),STR(b),STR(c),STR(d),STR(e), STR(((60+n+13)&15)), STR(((60+n+8)&15)), STR(((60+n+2)&15)), STR(((60+n)&15)), STR(RCONST))
-#undef  RCONST
-//#define RCONST 0xCA62C1D6 "out of range for signed 32bit displacement"
-#define RCONST  -0x359d3e2a
-	RD4A(ax,bx,cx,dx,bp, 0) RD4A(bp,ax,bx,cx,dx, 1) RD4A(dx,bp,ax,bx,cx, 2) RD4A(cx,dx,bp,ax,bx, 3) RD4A(bx,cx,dx,bp,ax, 4)
-	RD4A(ax,bx,cx,dx,bp, 5) RD4A(bp,ax,bx,cx,dx, 6) RD4A(dx,bp,ax,bx,cx, 7) RD4A(cx,dx,bp,ax,bx, 8) RD4A(bx,cx,dx,bp,ax, 9)
-	RD4A(ax,bx,cx,dx,bp,10) RD4A(bp,ax,bx,cx,dx,11) RD4A(dx,bp,ax,bx,cx,12) RD4A(cx,dx,bp,ax,bx,13) RD4A(bx,cx,dx,bp,ax,14)
-	RD4A(ax,bx,cx,dx,bp,15) RD4A(bp,ax,bx,cx,dx,16) RD4B(dx,bp,ax,bx,cx,17) RD4B(cx,dx,bp,ax,bx,18) RD4B(bx,cx,dx,bp,ax,19)
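+/* Implemented in hash_md5_sha_x86-64.S, which hard-codes offsetof(sha1_ctx_t, hash) == 80; the struct below has a negative array size (compile error) otherwise */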
+struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
+void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM);
 
-"\n\
-	popq	%rdi		#                                   \n\
-	addl	%eax, 80(%rdi)  # ctx->hash[0] += a                 \n\
-	addl	%ebx, 84(%rdi)  # ctx->hash[1] += b                 \n\
-	addl	%ecx, 88(%rdi)  # ctx->hash[2] += c                 \n\
-	addl	%edx, 92(%rdi)  # ctx->hash[3] += d                 \n\
-	addl	%ebp, 96(%rdi)  # ctx->hash[4] += e                 \n\
-	popq	%rbx		#                                   \n\
-	popq	%rbp		#                                   \n\
-	popq	%r12		#                                   \n\
-	popq	%r13		#                                   \n\
-	popq	%r14		#                                   \n\
-	popq	%r15		#                                   \n\
-"
-	); /* asm */
-#undef RCONST
-}
 # else
 /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
  * It seems further speedup can be achieved by handling more than
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
new file mode 100644
index 000000000..466cd9ae9
--- /dev/null
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -0,0 +1,1349 @@
+### Generated by hash_md5_sha_x86-64.S.sh ###
+#if defined(__GNUC__) && defined(__x86_64__)
+	.section	.text.sha1_process_block64,"ax",@progbits
+        .globl  sha1_process_block64
+        .hidden sha1_process_block64
+	.type	sha1_process_block64, @function
+sha1_process_block64:
+	pushq	%r15		#
+	pushq	%r14		#
+	pushq	%r13		#
+	pushq	%r12		#
+	pushq	%rbp		#
+	pushq	%rbx		#
+	pushq	%rdi		# we need ctx at the end
+
+#Register and stack use:
+# eax..edx: a..d
+# ebp: e
+# esi,edi: temps
+# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
+
+	movq	4*8(%rdi), %r8
+	bswapq	%r8
+	movl	%r8d, %r9d
+	shrq	$32, %r8
+	movq	4*10(%rdi), %r10
+	bswapq	%r10
+	movl	%r10d, %r11d
+	shrq	$32, %r10
+	movq	4*12(%rdi), %r12
+	bswapq	%r12
+	movl	%r12d, %r13d
+	shrq	$32, %r12
+	movq	4*14(%rdi), %r14
+	bswapq	%r14
+	movl	%r14d, %r15d
+	shrq	$32, %r14
+
+	movl	$3, %eax
+1:
+	movq	(%rdi,%rax,8), %rsi
+	bswapq	%rsi
+	rolq	$32, %rsi
+	movq	%rsi, -32(%rsp,%rax,8)
+	decl	%eax
+	jns	1b
+	movl	80(%rdi), %eax		# a = ctx->hash[0]
+	movl	84(%rdi), %ebx		# b = ctx->hash[1]
+	movl	88(%rdi), %ecx		# c = ctx->hash[2]
+	movl	92(%rdi), %edx		# d = ctx->hash[3]
+	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+
+# 0
+	# W[0], already in %esi
+	movl	%ecx, %edi		# c
+	xorl	%edx, %edi		# ^d
+	andl	%ebx, %edi		# &b
+	xorl	%edx, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rbp,%rsi),%ebp # e += RCONST + W[n]
+	addl	%edi, %ebp		# e += (((c ^ d) & b) ^ d)
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 1
+	movl	-32+4*1(%rsp), %esi		# W[n]
+	movl	%ebx, %edi		# c
+	xorl	%ecx, %edi		# ^d
+	andl	%eax, %edi		# &b
+	xorl	%ecx, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rdx,%rsi),%edx # e += RCONST + W[n]
+	addl	%edi, %edx		# e += (((c ^ d) & b) ^ d)
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 2
+	movl	-32+4*2(%rsp), %esi		# W[n]
+	movl	%eax, %edi		# c
+	xorl	%ebx, %edi		# ^d
+	andl	%ebp, %edi		# &b
+	xorl	%ebx, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rcx,%rsi),%ecx # e += RCONST + W[n]
+	addl	%edi, %ecx		# e += (((c ^ d) & b) ^ d)
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 3
+	movl	-32+4*3(%rsp), %esi		# W[n]
+	movl	%ebp, %edi		# c
+	xorl	%eax, %edi		# ^d
+	andl	%edx, %edi		# &b
+	xorl	%eax, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rbx,%rsi),%ebx # e += RCONST + W[n]
+	addl	%edi, %ebx		# e += (((c ^ d) & b) ^ d)
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 4
+	movl	-32+4*4(%rsp), %esi		# W[n]
+	movl	%edx, %edi		# c
+	xorl	%ebp, %edi		# ^d
+	andl	%ecx, %edi		# &b
+	xorl	%ebp, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rax,%rsi),%eax # e += RCONST + W[n]
+	addl	%edi, %eax		# e += (((c ^ d) & b) ^ d)
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 5
+	movl	-32+4*5(%rsp), %esi		# W[n]
+	movl	%ecx, %edi		# c
+	xorl	%edx, %edi		# ^d
+	andl	%ebx, %edi		# &b
+	xorl	%edx, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rbp,%rsi),%ebp # e += RCONST + W[n]
+	addl	%edi, %ebp		# e += (((c ^ d) & b) ^ d)
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 6
+	movl	-32+4*6(%rsp), %esi		# W[n]
+	movl	%ebx, %edi		# c
+	xorl	%ecx, %edi		# ^d
+	andl	%eax, %edi		# &b
+	xorl	%ecx, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rdx,%rsi),%edx # e += RCONST + W[n]
+	addl	%edi, %edx		# e += (((c ^ d) & b) ^ d)
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 7
+	movl	-32+4*7(%rsp), %esi		# W[n]
+	movl	%eax, %edi		# c
+	xorl	%ebx, %edi		# ^d
+	andl	%ebp, %edi		# &b
+	xorl	%ebx, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rcx,%rsi),%ecx # e += RCONST + W[n]
+	addl	%edi, %ecx		# e += (((c ^ d) & b) ^ d)
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 8
+	# W[n], in %r8
+	movl	%ebp, %edi		# c
+	xorl	%eax, %edi		# ^d
+	andl	%edx, %edi		# &b
+	xorl	%eax, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rbx,%r8),%ebx # e += RCONST + W[n]
+	addl	%edi, %ebx		# e += (((c ^ d) & b) ^ d)
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 9
+	# W[n], in %r9
+	movl	%edx, %edi		# c
+	xorl	%ebp, %edi		# ^d
+	andl	%ecx, %edi		# &b
+	xorl	%ebp, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rax,%r9),%eax # e += RCONST + W[n]
+	addl	%edi, %eax		# e += (((c ^ d) & b) ^ d)
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 10
+	# W[n], in %r10
+	movl	%ecx, %edi		# c
+	xorl	%edx, %edi		# ^d
+	andl	%ebx, %edi		# &b
+	xorl	%edx, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rbp,%r10),%ebp # e += RCONST + W[n]
+	addl	%edi, %ebp		# e += (((c ^ d) & b) ^ d)
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 11
+	# W[n], in %r11
+	movl	%ebx, %edi		# c
+	xorl	%ecx, %edi		# ^d
+	andl	%eax, %edi		# &b
+	xorl	%ecx, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rdx,%r11),%edx # e += RCONST + W[n]
+	addl	%edi, %edx		# e += (((c ^ d) & b) ^ d)
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 12
+	# W[n], in %r12
+	movl	%eax, %edi		# c
+	xorl	%ebx, %edi		# ^d
+	andl	%ebp, %edi		# &b
+	xorl	%ebx, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rcx,%r12),%ecx # e += RCONST + W[n]
+	addl	%edi, %ecx		# e += (((c ^ d) & b) ^ d)
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 13
+	# W[n], in %r13
+	movl	%ebp, %edi		# c
+	xorl	%eax, %edi		# ^d
+	andl	%edx, %edi		# &b
+	xorl	%eax, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rbx,%r13),%ebx # e += RCONST + W[n]
+	addl	%edi, %ebx		# e += (((c ^ d) & b) ^ d)
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 14
+	# W[n], in %r14
+	movl	%edx, %edi		# c
+	xorl	%ebp, %edi		# ^d
+	andl	%ecx, %edi		# &b
+	xorl	%ebp, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rax,%r14),%eax # e += RCONST + W[n]
+	addl	%edi, %eax		# e += (((c ^ d) & b) ^ d)
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 15
+	# W[n], in %r15
+	movl	%ecx, %edi		# c
+	xorl	%edx, %edi		# ^d
+	andl	%ebx, %edi		# &b
+	xorl	%edx, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rbp,%r15),%ebp # e += RCONST + W[n]
+	addl	%edi, %ebp		# e += (((c ^ d) & b) ^ d)
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 16
+	movl	%r13d, %esi	# W[(n+13) & 15]
+	xorl	%r8d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*2(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*0(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*0(%rsp)		# store to W[n & 15]
+	movl	%ebx, %edi		# c
+	xorl	%ecx, %edi		# ^d
+	andl	%eax, %edi		# &b
+	xorl	%ecx, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rdx,%rsi),%edx # e += RCONST + W[n]
+	addl	%edi, %edx		# e += (((c ^ d) & b) ^ d)
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 17
+	movl	%r14d, %esi	# W[(n+13) & 15]
+	xorl	%r9d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*3(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*1(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*1(%rsp)		# store to W[n & 15]
+	movl	%eax, %edi		# c
+	xorl	%ebx, %edi		# ^d
+	andl	%ebp, %edi		# &b
+	xorl	%ebx, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rcx,%rsi),%ecx # e += RCONST + W[n]
+	addl	%edi, %ecx		# e += (((c ^ d) & b) ^ d)
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 18
+	movl	%r15d, %esi	# W[(n+13) & 15]
+	xorl	%r10d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*4(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*2(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*2(%rsp)		# store to W[n & 15]
+	movl	%ebp, %edi		# c
+	xorl	%eax, %edi		# ^d
+	andl	%edx, %edi		# &b
+	xorl	%eax, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rbx,%rsi),%ebx # e += RCONST + W[n]
+	addl	%edi, %ebx		# e += (((c ^ d) & b) ^ d)
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 19
+	movl	-32+4*0(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r11d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*5(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*3(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*3(%rsp)		# store to W[n & 15]
+	movl	%edx, %edi		# c
+	xorl	%ebp, %edi		# ^d
+	andl	%ecx, %edi		# &b
+	xorl	%ebp, %edi		# (((c ^ d) & b) ^ d)
+	leal	0x5A827999(%rax,%rsi),%eax # e += RCONST + W[n]
+	addl	%edi, %eax		# e += (((c ^ d) & b) ^ d)
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 20
+	movl	-32+4*1(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r12d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*6(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*4(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*4(%rsp)		# store to W[n & 15]
+	movl	%ecx, %edi		# c
+	xorl	%edx, %edi		# ^d
+	xorl	%ebx, %edi		# ^b
+	leal	0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	addl	%edi, %ebp		# e += (c ^ d ^ b)
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 21
+	movl	-32+4*2(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r13d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*7(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*5(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*5(%rsp)		# store to W[n & 15]
+	movl	%ebx, %edi		# c
+	xorl	%ecx, %edi		# ^d
+	xorl	%eax, %edi		# ^b
+	leal	0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	addl	%edi, %edx		# e += (c ^ d ^ b)
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 22
+	movl	-32+4*3(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r14d, %esi		# ^W[(n+8) & 15]
+	xorl	%r8d, %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*6(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*6(%rsp)		# store to W[n & 15]
+	movl	%eax, %edi		# c
+	xorl	%ebx, %edi		# ^d
+	xorl	%ebp, %edi		# ^b
+	leal	0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	addl	%edi, %ecx		# e += (c ^ d ^ b)
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 23
+	movl	-32+4*4(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r15d, %esi		# ^W[(n+8) & 15]
+	xorl	%r9d, %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*7(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*7(%rsp)		# store to W[n & 15]
+	movl	%ebp, %edi		# c
+	xorl	%eax, %edi		# ^d
+	xorl	%edx, %edi		# ^b
+	leal	0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	addl	%edi, %ebx		# e += (c ^ d ^ b)
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 24
+	movl	-32+4*5(%rsp), %esi	# W[(n+13) & 15]
+	xorl	-32+4*0(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r10d, %esi		# ^W[(n+2) & 15]
+	xorl	%r8d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r8d		# store to W[n & 15]
+	movl	%edx, %edi		# c
+	xorl	%ebp, %edi		# ^d
+	xorl	%ecx, %edi		# ^b
+	leal	0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W
+	addl	%edi, %eax		# e += (c ^ d ^ b)
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 25
+	movl	-32+4*6(%rsp), %esi	# W[(n+13) & 15]
+	xorl	-32+4*1(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r11d, %esi		# ^W[(n+2) & 15]
+	xorl	%r9d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r9d		# store to W[n & 15]
+	movl	%ecx, %edi		# c
+	xorl	%edx, %edi		# ^d
+	xorl	%ebx, %edi		# ^b
+	leal	0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	addl	%edi, %ebp		# e += (c ^ d ^ b)
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 26
+	movl	-32+4*7(%rsp), %esi	# W[(n+13) & 15]
+	xorl	-32+4*2(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r12d, %esi		# ^W[(n+2) & 15]
+	xorl	%r10d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r10d		# store to W[n & 15]
+	movl	%ebx, %edi		# c
+	xorl	%ecx, %edi		# ^d
+	xorl	%eax, %edi		# ^b
+	leal	0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	addl	%edi, %edx		# e += (c ^ d ^ b)
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 27
+	movl	%r8d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*3(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r13d, %esi		# ^W[(n+2) & 15]
+	xorl	%r11d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r11d		# store to W[n & 15]
+	movl	%eax, %edi		# c
+	xorl	%ebx, %edi		# ^d
+	xorl	%ebp, %edi		# ^b
+	leal	0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	addl	%edi, %ecx		# e += (c ^ d ^ b)
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 28
+	movl	%r9d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*4(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r14d, %esi		# ^W[(n+2) & 15]
+	xorl	%r12d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r12d		# store to W[n & 15]
+	movl	%ebp, %edi		# c
+	xorl	%eax, %edi		# ^d
+	xorl	%edx, %edi		# ^b
+	leal	0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	addl	%edi, %ebx		# e += (c ^ d ^ b)
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 29
+	movl	%r10d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*5(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r15d, %esi		# ^W[(n+2) & 15]
+	xorl	%r13d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r13d		# store to W[n & 15]
+	movl	%edx, %edi		# c
+	xorl	%ebp, %edi		# ^d
+	xorl	%ecx, %edi		# ^b
+	leal	0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W
+	addl	%edi, %eax		# e += (c ^ d ^ b)
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 30
+	movl	%r11d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*6(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*0(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	%r14d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r14d		# store to W[n & 15]
+	movl	%ecx, %edi		# c
+	xorl	%edx, %edi		# ^d
+	xorl	%ebx, %edi		# ^b
+	leal	0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	addl	%edi, %ebp		# e += (c ^ d ^ b)
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 31
+	movl	%r12d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*7(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*1(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	%r15d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r15d		# store to W[n & 15]
+	movl	%ebx, %edi		# c
+	xorl	%ecx, %edi		# ^d
+	xorl	%eax, %edi		# ^b
+	leal	0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	addl	%edi, %edx		# e += (c ^ d ^ b)
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 32
+	movl	%r13d, %esi	# W[(n+13) & 15]
+	xorl	%r8d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*2(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*0(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*0(%rsp)		# store to W[n & 15]
+	movl	%eax, %edi		# c
+	xorl	%ebx, %edi		# ^d
+	xorl	%ebp, %edi		# ^b
+	leal	0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	addl	%edi, %ecx		# e += (c ^ d ^ b)
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 33
+	movl	%r14d, %esi	# W[(n+13) & 15]
+	xorl	%r9d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*3(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*1(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*1(%rsp)		# store to W[n & 15]
+	movl	%ebp, %edi		# c
+	xorl	%eax, %edi		# ^d
+	xorl	%edx, %edi		# ^b
+	leal	0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	addl	%edi, %ebx		# e += (c ^ d ^ b)
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 34
+	movl	%r15d, %esi	# W[(n+13) & 15]
+	xorl	%r10d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*4(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*2(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*2(%rsp)		# store to W[n & 15]
+	movl	%edx, %edi		# c
+	xorl	%ebp, %edi		# ^d
+	xorl	%ecx, %edi		# ^b
+	leal	0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W
+	addl	%edi, %eax		# e += (c ^ d ^ b)
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 35
+	movl	-32+4*0(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r11d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*5(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*3(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*3(%rsp)		# store to W[n & 15]
+	movl	%ecx, %edi		# c
+	xorl	%edx, %edi		# ^d
+	xorl	%ebx, %edi		# ^b
+	leal	0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	addl	%edi, %ebp		# e += (c ^ d ^ b)
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 36
+	movl	-32+4*1(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r12d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*6(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*4(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*4(%rsp)		# store to W[n & 15]
+	movl	%ebx, %edi		# c
+	xorl	%ecx, %edi		# ^d
+	xorl	%eax, %edi		# ^b
+	leal	0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	addl	%edi, %edx		# e += (c ^ d ^ b)
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 37
+	movl	-32+4*2(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r13d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*7(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*5(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*5(%rsp)		# store to W[n & 15]
+	movl	%eax, %edi		# c
+	xorl	%ebx, %edi		# ^d
+	xorl	%ebp, %edi		# ^b
+	leal	0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	addl	%edi, %ecx		# e += (c ^ d ^ b)
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 38
+	movl	-32+4*3(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r14d, %esi		# ^W[(n+8) & 15]
+	xorl	%r8d, %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*6(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*6(%rsp)		# store to W[n & 15]
+	movl	%ebp, %edi		# c
+	xorl	%eax, %edi		# ^d
+	xorl	%edx, %edi		# ^b
+	leal	0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	addl	%edi, %ebx		# e += (c ^ d ^ b)
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 39
+	movl	-32+4*4(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r15d, %esi		# ^W[(n+8) & 15]
+	xorl	%r9d, %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*7(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*7(%rsp)		# store to W[n & 15]
+	movl	%edx, %edi		# c
+	xorl	%ebp, %edi		# ^d
+	xorl	%ecx, %edi		# ^b
+	leal	0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W
+	addl	%edi, %eax		# e += (c ^ d ^ b)
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 40
+	movl	%ebx, %edi		# di: b
+	movl	%ebx, %esi		# si: b
+	orl	%ecx, %edi		# di: b | c
+	andl	%ecx, %esi		# si: b & c
+	andl	%edx, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	-32+4*5(%rsp), %esi	# W[(n+13) & 15]
+	xorl	-32+4*0(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r10d, %esi		# ^W[(n+2) & 15]
+	xorl	%r8d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r8d		# store to W[n & 15]
+	addl	%edi, %ebp		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 41
+	movl	%eax, %edi		# di: b
+	movl	%eax, %esi		# si: b
+	orl	%ebx, %edi		# di: b | c
+	andl	%ebx, %esi		# si: b & c
+	andl	%ecx, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	-32+4*6(%rsp), %esi	# W[(n+13) & 15]
+	xorl	-32+4*1(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r11d, %esi		# ^W[(n+2) & 15]
+	xorl	%r9d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r9d		# store to W[n & 15]
+	addl	%edi, %edx		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 42
+	movl	%ebp, %edi		# di: b
+	movl	%ebp, %esi		# si: b
+	orl	%eax, %edi		# di: b | c
+	andl	%eax, %esi		# si: b & c
+	andl	%ebx, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	-32+4*7(%rsp), %esi	# W[(n+13) & 15]
+	xorl	-32+4*2(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r12d, %esi		# ^W[(n+2) & 15]
+	xorl	%r10d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r10d		# store to W[n & 15]
+	addl	%edi, %ecx		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 43
+	movl	%edx, %edi		# di: b
+	movl	%edx, %esi		# si: b
+	orl	%ebp, %edi		# di: b | c
+	andl	%ebp, %esi		# si: b & c
+	andl	%eax, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	%r8d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*3(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r13d, %esi		# ^W[(n+2) & 15]
+	xorl	%r11d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r11d		# store to W[n & 15]
+	addl	%edi, %ebx		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 44
+	movl	%ecx, %edi		# di: b
+	movl	%ecx, %esi		# si: b
+	orl	%edx, %edi		# di: b | c
+	andl	%edx, %esi		# si: b & c
+	andl	%ebp, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	%r9d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*4(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r14d, %esi		# ^W[(n+2) & 15]
+	xorl	%r12d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r12d		# store to W[n & 15]
+	addl	%edi, %eax		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 45
+	movl	%ebx, %edi		# di: b
+	movl	%ebx, %esi		# si: b
+	orl	%ecx, %edi		# di: b | c
+	andl	%ecx, %esi		# si: b & c
+	andl	%edx, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	%r10d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*5(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r15d, %esi		# ^W[(n+2) & 15]
+	xorl	%r13d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r13d		# store to W[n & 15]
+	addl	%edi, %ebp		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 46
+	movl	%eax, %edi		# di: b
+	movl	%eax, %esi		# si: b
+	orl	%ebx, %edi		# di: b | c
+	andl	%ebx, %esi		# si: b & c
+	andl	%ecx, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	%r11d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*6(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*0(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	%r14d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r14d		# store to W[n & 15]
+	addl	%edi, %edx		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 47
+	movl	%ebp, %edi		# di: b
+	movl	%ebp, %esi		# si: b
+	orl	%eax, %edi		# di: b | c
+	andl	%eax, %esi		# si: b & c
+	andl	%ebx, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	%r12d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*7(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*1(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	%r15d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r15d		# store to W[n & 15]
+	addl	%edi, %ecx		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 48
+	movl	%edx, %edi		# di: b
+	movl	%edx, %esi		# si: b
+	orl	%ebp, %edi		# di: b | c
+	andl	%ebp, %esi		# si: b & c
+	andl	%eax, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	%r13d, %esi	# W[(n+13) & 15]
+	xorl	%r8d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*2(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*0(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*0(%rsp)		# store to W[n & 15]
+	addl	%edi, %ebx		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 49
+	movl	%ecx, %edi		# di: b
+	movl	%ecx, %esi		# si: b
+	orl	%edx, %edi		# di: b | c
+	andl	%edx, %esi		# si: b & c
+	andl	%ebp, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	%r14d, %esi	# W[(n+13) & 15]
+	xorl	%r9d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*3(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*1(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*1(%rsp)		# store to W[n & 15]
+	addl	%edi, %eax		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 50
+	movl	%ebx, %edi		# di: b
+	movl	%ebx, %esi		# si: b
+	orl	%ecx, %edi		# di: b | c
+	andl	%ecx, %esi		# si: b & c
+	andl	%edx, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	%r15d, %esi	# W[(n+13) & 15]
+	xorl	%r10d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*4(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*2(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*2(%rsp)		# store to W[n & 15]
+	addl	%edi, %ebp		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 51
+	movl	%eax, %edi		# di: b
+	movl	%eax, %esi		# si: b
+	orl	%ebx, %edi		# di: b | c
+	andl	%ebx, %esi		# si: b & c
+	andl	%ecx, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	-32+4*0(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r11d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*5(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*3(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*3(%rsp)		# store to W[n & 15]
+	addl	%edi, %edx		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 52
+	movl	%ebp, %edi		# di: b
+	movl	%ebp, %esi		# si: b
+	orl	%eax, %edi		# di: b | c
+	andl	%eax, %esi		# si: b & c
+	andl	%ebx, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	-32+4*1(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r12d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*6(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*4(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*4(%rsp)		# store to W[n & 15]
+	addl	%edi, %ecx		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 53
+	movl	%edx, %edi		# di: b
+	movl	%edx, %esi		# si: b
+	orl	%ebp, %edi		# di: b | c
+	andl	%ebp, %esi		# si: b & c
+	andl	%eax, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	-32+4*2(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r13d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*7(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*5(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*5(%rsp)		# store to W[n & 15]
+	addl	%edi, %ebx		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 54
+	movl	%ecx, %edi		# di: b
+	movl	%ecx, %esi		# si: b
+	orl	%edx, %edi		# di: b | c
+	andl	%edx, %esi		# si: b & c
+	andl	%ebp, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	-32+4*3(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r14d, %esi		# ^W[(n+8) & 15]
+	xorl	%r8d, %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*6(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*6(%rsp)		# store to W[n & 15]
+	addl	%edi, %eax		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 55
+	movl	%ebx, %edi		# di: b
+	movl	%ebx, %esi		# si: b
+	orl	%ecx, %edi		# di: b | c
+	andl	%ecx, %esi		# si: b & c
+	andl	%edx, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	-32+4*4(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r15d, %esi		# ^W[(n+8) & 15]
+	xorl	%r9d, %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*7(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*7(%rsp)		# store to W[n & 15]
+	addl	%edi, %ebp		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 56
+	movl	%eax, %edi		# di: b
+	movl	%eax, %esi		# si: b
+	orl	%ebx, %edi		# di: b | c
+	andl	%ebx, %esi		# si: b & c
+	andl	%ecx, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	-32+4*5(%rsp), %esi	# W[(n+13) & 15]
+	xorl	-32+4*0(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r10d, %esi		# ^W[(n+2) & 15]
+	xorl	%r8d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r8d		# store to W[n & 15]
+	addl	%edi, %edx		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 57
+	movl	%ebp, %edi		# di: b
+	movl	%ebp, %esi		# si: b
+	orl	%eax, %edi		# di: b | c
+	andl	%eax, %esi		# si: b & c
+	andl	%ebx, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	-32+4*6(%rsp), %esi	# W[(n+13) & 15]
+	xorl	-32+4*1(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r11d, %esi		# ^W[(n+2) & 15]
+	xorl	%r9d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r9d		# store to W[n & 15]
+	addl	%edi, %ecx		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 58
+	movl	%edx, %edi		# di: b
+	movl	%edx, %esi		# si: b
+	orl	%ebp, %edi		# di: b | c
+	andl	%ebp, %esi		# si: b & c
+	andl	%eax, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	-32+4*7(%rsp), %esi	# W[(n+13) & 15]
+	xorl	-32+4*2(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r12d, %esi		# ^W[(n+2) & 15]
+	xorl	%r10d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r10d		# store to W[n & 15]
+	addl	%edi, %ebx		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 59
+	movl	%ecx, %edi		# di: b
+	movl	%ecx, %esi		# si: b
+	orl	%edx, %edi		# di: b | c
+	andl	%edx, %esi		# si: b & c
+	andl	%ebp, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+	movl	%r8d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*3(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r13d, %esi		# ^W[(n+2) & 15]
+	xorl	%r11d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r11d		# store to W[n & 15]
+	addl	%edi, %eax		# += ((b | c) & d) | (b & c)
+	leal	-0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 60
+	movl	%r9d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*4(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r14d, %esi		# ^W[(n+2) & 15]
+	xorl	%r12d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r12d		# store to W[n & 15]
+	movl	%ecx, %edi		# c
+	xorl	%edx, %edi		# ^d
+	xorl	%ebx, %edi		# ^b
+	leal	-0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	addl	%edi, %ebp		# e += (c ^ d ^ b)
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 61
+	movl	%r10d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*5(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r15d, %esi		# ^W[(n+2) & 15]
+	xorl	%r13d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r13d		# store to W[n & 15]
+	movl	%ebx, %edi		# c
+	xorl	%ecx, %edi		# ^d
+	xorl	%eax, %edi		# ^b
+	leal	-0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	addl	%edi, %edx		# e += (c ^ d ^ b)
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 62
+	movl	%r11d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*6(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*0(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	%r14d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r14d		# store to W[n & 15]
+	movl	%eax, %edi		# c
+	xorl	%ebx, %edi		# ^d
+	xorl	%ebp, %edi		# ^b
+	leal	-0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	addl	%edi, %ecx		# e += (c ^ d ^ b)
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 63
+	movl	%r12d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*7(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*1(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	%r15d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r15d		# store to W[n & 15]
+	movl	%ebp, %edi		# c
+	xorl	%eax, %edi		# ^d
+	xorl	%edx, %edi		# ^b
+	leal	-0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	addl	%edi, %ebx		# e += (c ^ d ^ b)
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 64
+	movl	%r13d, %esi	# W[(n+13) & 15]
+	xorl	%r8d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*2(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*0(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*0(%rsp)		# store to W[n & 15]
+	movl	%edx, %edi		# c
+	xorl	%ebp, %edi		# ^d
+	xorl	%ecx, %edi		# ^b
+	leal	-0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W
+	addl	%edi, %eax		# e += (c ^ d ^ b)
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 65
+	movl	%r14d, %esi	# W[(n+13) & 15]
+	xorl	%r9d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*3(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*1(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*1(%rsp)		# store to W[n & 15]
+	movl	%ecx, %edi		# c
+	xorl	%edx, %edi		# ^d
+	xorl	%ebx, %edi		# ^b
+	leal	-0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	addl	%edi, %ebp		# e += (c ^ d ^ b)
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 66
+	movl	%r15d, %esi	# W[(n+13) & 15]
+	xorl	%r10d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*4(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*2(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*2(%rsp)		# store to W[n & 15]
+	movl	%ebx, %edi		# c
+	xorl	%ecx, %edi		# ^d
+	xorl	%eax, %edi		# ^b
+	leal	-0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	addl	%edi, %edx		# e += (c ^ d ^ b)
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 67
+	movl	-32+4*0(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r11d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*5(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*3(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*3(%rsp)		# store to W[n & 15]
+	movl	%eax, %edi		# c
+	xorl	%ebx, %edi		# ^d
+	xorl	%ebp, %edi		# ^b
+	leal	-0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	addl	%edi, %ecx		# e += (c ^ d ^ b)
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 68
+	movl	-32+4*1(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r12d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*6(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*4(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*4(%rsp)		# store to W[n & 15]
+	movl	%ebp, %edi		# c
+	xorl	%eax, %edi		# ^d
+	xorl	%edx, %edi		# ^b
+	leal	-0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	addl	%edi, %ebx		# e += (c ^ d ^ b)
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 69
+	movl	-32+4*2(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r13d, %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*7(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*5(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*5(%rsp)		# store to W[n & 15]
+	movl	%edx, %edi		# c
+	xorl	%ebp, %edi		# ^d
+	xorl	%ecx, %edi		# ^b
+	leal	-0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W
+	addl	%edi, %eax		# e += (c ^ d ^ b)
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 70
+	movl	-32+4*3(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r14d, %esi		# ^W[(n+8) & 15]
+	xorl	%r8d, %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*6(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*6(%rsp)		# store to W[n & 15]
+	movl	%ecx, %edi		# c
+	xorl	%edx, %edi		# ^d
+	xorl	%ebx, %edi		# ^b
+	leal	-0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	addl	%edi, %ebp		# e += (c ^ d ^ b)
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 71
+	movl	-32+4*4(%rsp), %esi	# W[(n+13) & 15]
+	xorl	%r15d, %esi		# ^W[(n+8) & 15]
+	xorl	%r9d, %esi		# ^W[(n+2) & 15]
+	xorl	-32+4*7(%rsp), %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, -32+4*7(%rsp)		# store to W[n & 15]
+	movl	%ebx, %edi		# c
+	xorl	%ecx, %edi		# ^d
+	xorl	%eax, %edi		# ^b
+	leal	-0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	addl	%edi, %edx		# e += (c ^ d ^ b)
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 72
+	movl	-32+4*5(%rsp), %esi	# W[(n+13) & 15]
+	xorl	-32+4*0(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r10d, %esi		# ^W[(n+2) & 15]
+	xorl	%r8d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r8d		# store to W[n & 15]
+	movl	%eax, %edi		# c
+	xorl	%ebx, %edi		# ^d
+	xorl	%ebp, %edi		# ^b
+	leal	-0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	addl	%edi, %ecx		# e += (c ^ d ^ b)
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 73
+	movl	-32+4*6(%rsp), %esi	# W[(n+13) & 15]
+	xorl	-32+4*1(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r11d, %esi		# ^W[(n+2) & 15]
+	xorl	%r9d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r9d		# store to W[n & 15]
+	movl	%ebp, %edi		# c
+	xorl	%eax, %edi		# ^d
+	xorl	%edx, %edi		# ^b
+	leal	-0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	addl	%edi, %ebx		# e += (c ^ d ^ b)
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 74
+	movl	-32+4*7(%rsp), %esi	# W[(n+13) & 15]
+	xorl	-32+4*2(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r12d, %esi		# ^W[(n+2) & 15]
+	xorl	%r10d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r10d		# store to W[n & 15]
+	movl	%edx, %edi		# c
+	xorl	%ebp, %edi		# ^d
+	xorl	%ecx, %edi		# ^b
+	leal	-0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W
+	addl	%edi, %eax		# e += (c ^ d ^ b)
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+# 75
+	movl	%r8d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*3(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r13d, %esi		# ^W[(n+2) & 15]
+	xorl	%r11d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r11d		# store to W[n & 15]
+	movl	%ecx, %edi		# c
+	xorl	%edx, %edi		# ^d
+	xorl	%ebx, %edi		# ^b
+	leal	-0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	addl	%edi, %ebp		# e += (c ^ d ^ b)
+	movl	%eax, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebp		# e += rotl32(a,5)
+	rorl	$2, %ebx		# b = rotl32(b,30)
+# 76
+	movl	%r9d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*4(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r14d, %esi		# ^W[(n+2) & 15]
+	xorl	%r12d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, %r12d		# store to W[n & 15]
+	movl	%ebx, %edi		# c
+	xorl	%ecx, %edi		# ^d
+	xorl	%eax, %edi		# ^b
+	leal	-0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	addl	%edi, %edx		# e += (c ^ d ^ b)
+	movl	%ebp, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %edx		# e += rotl32(a,5)
+	rorl	$2, %eax		# b = rotl32(b,30)
+# 77
+	movl	%r10d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*5(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	%r15d, %esi		# ^W[(n+2) & 15]
+	xorl	%r13d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	# store to W[n & 15] - unused, not done
+	movl	%eax, %edi		# c
+	xorl	%ebx, %edi		# ^d
+	xorl	%ebp, %edi		# ^b
+	leal	-0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	addl	%edi, %ecx		# e += (c ^ d ^ b)
+	movl	%edx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ecx		# e += rotl32(a,5)
+	rorl	$2, %ebp		# b = rotl32(b,30)
+# 78
+	movl	%r11d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*6(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*0(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	%r14d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	# store to W[n & 15] - unused, not done
+	movl	%ebp, %edi		# c
+	xorl	%eax, %edi		# ^d
+	xorl	%edx, %edi		# ^b
+	leal	-0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	addl	%edi, %ebx		# e += (c ^ d ^ b)
+	movl	%ecx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %ebx		# e += rotl32(a,5)
+	rorl	$2, %edx		# b = rotl32(b,30)
+# 79
+	movl	%r12d, %esi	# W[(n+13) & 15]
+	xorl	-32+4*7(%rsp), %esi		# ^W[(n+8) & 15]
+	xorl	-32+4*1(%rsp), %esi		# ^W[(n+2) & 15]
+	xorl	%r15d, %esi		# ^W[n & 15]
+	roll	%esi			#
+	# store to W[n & 15] - unused, not done
+	movl	%edx, %edi		# c
+	xorl	%ebp, %edi		# ^d
+	xorl	%ecx, %edi		# ^b
+	leal	-0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W
+	addl	%edi, %eax		# e += (c ^ d ^ b)
+	movl	%ebx, %esi		#
+	roll	$5, %esi		# rotl32(a,5)
+	addl	%esi, %eax		# e += rotl32(a,5)
+	rorl	$2, %ecx		# b = rotl32(b,30)
+
+	popq	%rdi		#
+	addl	%eax, 80(%rdi)  # ctx->hash[0] += a
+	addl	%ebx, 84(%rdi)  # ctx->hash[1] += b
+	addl	%ecx, 88(%rdi)  # ctx->hash[2] += c
+	addl	%edx, 92(%rdi)  # ctx->hash[3] += d
+	addl	%ebp, 96(%rdi)  # ctx->hash[4] += e
+	popq	%rbx		#
+	popq	%rbp		#
+	popq	%r12		#
+	popq	%r13		#
+	popq	%r14		#
+	popq	%r15		#
+
+	ret
+	.size	sha1_process_block64, .-sha1_process_block64
+#endif
-- 
cgit v1.2.3-55-g6feb


From 947bef0deaba7b2ce432d515379091dcd4cf747f Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Mon, 3 Jan 2022 13:00:07 +0100
Subject: libbb/sha1: x86_64 version: generate from a script, optimize a bit

function                                             old     new   delta
sha1_process_block64                                3569    3502     -67

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/Config.src               |   2 +-
 libbb/hash_md5_sha_x86-64.S    | 472 ++++++++++++++++++-----------------------
 libbb/hash_md5_sha_x86-64.S.sh | 267 +++++++++++++++++++++++
 3 files changed, 474 insertions(+), 267 deletions(-)
 create mode 100755 libbb/hash_md5_sha_x86-64.S.sh

diff --git a/libbb/Config.src b/libbb/Config.src
index 42a2283aa..c80bee286 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -59,7 +59,7 @@ config SHA1_SMALL
 	Trade binary size versus speed for the sha1 algorithm.
 	                throughput MB/s   size of sha1_process_block64
 	value           486  x86-64       486   x86-64
-	0               367  367          3657  3570
+	0               367  375          3657  3502
 	1               224  229           654   732
 	2,3             200  195           358   380
 
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 466cd9ae9..3e1c4b455 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -1,23 +1,27 @@
 ### Generated by hash_md5_sha_x86-64.S.sh ###
-#if defined(__GNUC__) && defined(__x86_64__)
+
+#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
 	.section	.text.sha1_process_block64,"ax",@progbits
-        .globl  sha1_process_block64
-        .hidden sha1_process_block64
+	.globl  sha1_process_block64
+	.hidden sha1_process_block64
 	.type	sha1_process_block64, @function
+
+	.balign	8	# allow decoders to fetch at least the first 4 insns
 sha1_process_block64:
-	pushq	%r15		#
-	pushq	%r14		#
-	pushq	%r13		#
-	pushq	%r12		#
-	pushq	%rbp		#
-	pushq	%rbx		#
-	pushq	%rdi		# we need ctx at the end
+	pushq	%r15	#
+	pushq	%r14	#
+	pushq	%r13	#
+	pushq	%r12	#
+	pushq	%rbp	#
+	pushq	%rbx	#
+	pushq	%rdi	# we need ctx at the end
 
 #Register and stack use:
 # eax..edx: a..d
 # ebp: e
 # esi,edi: temps
 # -32+4*n(%rsp),r8...r15: W[0..7,8..15]
+# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
 
 	movq	4*8(%rdi), %r8
 	bswapq	%r8
@@ -253,7 +257,7 @@ sha1_process_block64:
 	xorl	%ecx, %edi		# ^d
 	andl	%eax, %edi		# &b
 	xorl	%ecx, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rdx,%rsi),%edx # e += RCONST + W[n]
+	leal	0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n & 15]
 	addl	%edi, %edx		# e += (((c ^ d) & b) ^ d)
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -270,7 +274,7 @@ sha1_process_block64:
 	xorl	%ebx, %edi		# ^d
 	andl	%ebp, %edi		# &b
 	xorl	%ebx, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rcx,%rsi),%ecx # e += RCONST + W[n]
+	leal	0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n & 15]
 	addl	%edi, %ecx		# e += (((c ^ d) & b) ^ d)
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -287,7 +291,7 @@ sha1_process_block64:
 	xorl	%eax, %edi		# ^d
 	andl	%edx, %edi		# &b
 	xorl	%eax, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rbx,%rsi),%ebx # e += RCONST + W[n]
+	leal	0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
 	addl	%edi, %ebx		# e += (((c ^ d) & b) ^ d)
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -304,7 +308,7 @@ sha1_process_block64:
 	xorl	%ebp, %edi		# ^d
 	andl	%ecx, %edi		# &b
 	xorl	%ebp, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rax,%rsi),%eax # e += RCONST + W[n]
+	leal	0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n & 15]
 	addl	%edi, %eax		# e += (((c ^ d) & b) ^ d)
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -320,7 +324,7 @@ sha1_process_block64:
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	xorl	%ebx, %edi		# ^b
-	leal	0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15]
 	addl	%edi, %ebp		# e += (c ^ d ^ b)
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -336,7 +340,7 @@ sha1_process_block64:
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	xorl	%eax, %edi		# ^b
-	leal	0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15]
 	addl	%edi, %edx		# e += (c ^ d ^ b)
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -352,7 +356,7 @@ sha1_process_block64:
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
 	xorl	%ebp, %edi		# ^b
-	leal	0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15]
 	addl	%edi, %ecx		# e += (c ^ d ^ b)
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -368,135 +372,119 @@ sha1_process_block64:
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	xorl	%edx, %edi		# ^b
-	leal	0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
 	addl	%edi, %ebx		# e += (c ^ d ^ b)
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx		# b = rotl32(b,30)
 # 24
-	movl	-32+4*5(%rsp), %esi	# W[(n+13) & 15]
-	xorl	-32+4*0(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r10d, %esi		# ^W[(n+2) & 15]
-	xorl	%r8d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r8d		# store to W[n & 15]
+	xorl	-32+4*5(%rsp), %r8d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*0(%rsp), %r8d	# ^W[(n+8) & 15]
+	xorl	%r10d, %r8d	# ^W[(n+2) & 15]
+	roll	%r8d		#
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
 	xorl	%ecx, %edi		# ^b
-	leal	0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rax,%r8), %eax # e += RCONST + W[n & 15]
 	addl	%edi, %eax		# e += (c ^ d ^ b)
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx		# b = rotl32(b,30)
 # 25
-	movl	-32+4*6(%rsp), %esi	# W[(n+13) & 15]
-	xorl	-32+4*1(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r11d, %esi		# ^W[(n+2) & 15]
-	xorl	%r9d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r9d		# store to W[n & 15]
+	xorl	-32+4*6(%rsp), %r9d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*1(%rsp), %r9d	# ^W[(n+8) & 15]
+	xorl	%r11d, %r9d	# ^W[(n+2) & 15]
+	roll	%r9d		#
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	xorl	%ebx, %edi		# ^b
-	leal	0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rbp,%r9), %ebp # e += RCONST + W[n & 15]
 	addl	%edi, %ebp		# e += (c ^ d ^ b)
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx		# b = rotl32(b,30)
 # 26
-	movl	-32+4*7(%rsp), %esi	# W[(n+13) & 15]
-	xorl	-32+4*2(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r12d, %esi		# ^W[(n+2) & 15]
-	xorl	%r10d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r10d		# store to W[n & 15]
+	xorl	-32+4*7(%rsp), %r10d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*2(%rsp), %r10d	# ^W[(n+8) & 15]
+	xorl	%r12d, %r10d	# ^W[(n+2) & 15]
+	roll	%r10d		#
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	xorl	%eax, %edi		# ^b
-	leal	0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rdx,%r10), %edx # e += RCONST + W[n & 15]
 	addl	%edi, %edx		# e += (c ^ d ^ b)
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %edx		# e += rotl32(a,5)
 	rorl	$2, %eax		# b = rotl32(b,30)
 # 27
-	movl	%r8d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*3(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r13d, %esi		# ^W[(n+2) & 15]
-	xorl	%r11d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r11d		# store to W[n & 15]
+	xorl	%r8d, %r11d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*3(%rsp), %r11d	# ^W[(n+8) & 15]
+	xorl	%r13d, %r11d	# ^W[(n+2) & 15]
+	roll	%r11d		#
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
 	xorl	%ebp, %edi		# ^b
-	leal	0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rcx,%r11), %ecx # e += RCONST + W[n & 15]
 	addl	%edi, %ecx		# e += (c ^ d ^ b)
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ecx		# e += rotl32(a,5)
 	rorl	$2, %ebp		# b = rotl32(b,30)
 # 28
-	movl	%r9d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*4(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r14d, %esi		# ^W[(n+2) & 15]
-	xorl	%r12d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r12d		# store to W[n & 15]
+	xorl	%r9d, %r12d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*4(%rsp), %r12d	# ^W[(n+8) & 15]
+	xorl	%r14d, %r12d	# ^W[(n+2) & 15]
+	roll	%r12d		#
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	xorl	%edx, %edi		# ^b
-	leal	0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rbx,%r12), %ebx # e += RCONST + W[n & 15]
 	addl	%edi, %ebx		# e += (c ^ d ^ b)
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx		# b = rotl32(b,30)
 # 29
-	movl	%r10d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*5(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r15d, %esi		# ^W[(n+2) & 15]
-	xorl	%r13d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r13d		# store to W[n & 15]
+	xorl	%r10d, %r13d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*5(%rsp), %r13d	# ^W[(n+8) & 15]
+	xorl	%r15d, %r13d	# ^W[(n+2) & 15]
+	roll	%r13d		#
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
 	xorl	%ecx, %edi		# ^b
-	leal	0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rax,%r13), %eax # e += RCONST + W[n & 15]
 	addl	%edi, %eax		# e += (c ^ d ^ b)
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx		# b = rotl32(b,30)
 # 30
-	movl	%r11d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*6(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	-32+4*0(%rsp), %esi		# ^W[(n+2) & 15]
-	xorl	%r14d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r14d		# store to W[n & 15]
+	xorl	%r11d, %r14d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*6(%rsp), %r14d	# ^W[(n+8) & 15]
+	xorl	-32+4*0(%rsp), %r14d	# ^W[(n+2) & 15]
+	roll	%r14d		#
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	xorl	%ebx, %edi		# ^b
-	leal	0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rbp,%r14), %ebp # e += RCONST + W[n & 15]
 	addl	%edi, %ebp		# e += (c ^ d ^ b)
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx		# b = rotl32(b,30)
 # 31
-	movl	%r12d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*7(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	-32+4*1(%rsp), %esi		# ^W[(n+2) & 15]
-	xorl	%r15d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r15d		# store to W[n & 15]
+	xorl	%r12d, %r15d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*7(%rsp), %r15d	# ^W[(n+8) & 15]
+	xorl	-32+4*1(%rsp), %r15d	# ^W[(n+2) & 15]
+	roll	%r15d		#
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	xorl	%eax, %edi		# ^b
-	leal	0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rdx,%r15), %edx # e += RCONST + W[n & 15]
 	addl	%edi, %edx		# e += (c ^ d ^ b)
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -512,7 +500,7 @@ sha1_process_block64:
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
 	xorl	%ebp, %edi		# ^b
-	leal	0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15]
 	addl	%edi, %ecx		# e += (c ^ d ^ b)
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -528,7 +516,7 @@ sha1_process_block64:
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	xorl	%edx, %edi		# ^b
-	leal	0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
 	addl	%edi, %ebx		# e += (c ^ d ^ b)
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -544,7 +532,7 @@ sha1_process_block64:
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
 	xorl	%ecx, %edi		# ^b
-	leal	0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15]
 	addl	%edi, %eax		# e += (c ^ d ^ b)
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -560,7 +548,7 @@ sha1_process_block64:
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	xorl	%ebx, %edi		# ^b
-	leal	0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15]
 	addl	%edi, %ebp		# e += (c ^ d ^ b)
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -576,7 +564,7 @@ sha1_process_block64:
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	xorl	%eax, %edi		# ^b
-	leal	0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15]
 	addl	%edi, %edx		# e += (c ^ d ^ b)
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -592,7 +580,7 @@ sha1_process_block64:
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
 	xorl	%ebp, %edi		# ^b
-	leal	0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15]
 	addl	%edi, %ecx		# e += (c ^ d ^ b)
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -608,7 +596,7 @@ sha1_process_block64:
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	xorl	%edx, %edi		# ^b
-	leal	0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
 	addl	%edi, %ebx		# e += (c ^ d ^ b)
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -624,7 +612,7 @@ sha1_process_block64:
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
 	xorl	%ecx, %edi		# ^b
-	leal	0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + mixed_W
+	leal	0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15]
 	addl	%edi, %eax		# e += (c ^ d ^ b)
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -637,14 +625,12 @@ sha1_process_block64:
 	andl	%ecx, %esi		# si: b & c
 	andl	%edx, %edi		# di: (b | c) & d
 	orl	%esi, %edi		# ((b | c) & d) | (b & c)
-	movl	-32+4*5(%rsp), %esi	# W[(n+13) & 15]
-	xorl	-32+4*0(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r10d, %esi		# ^W[(n+2) & 15]
-	xorl	%r8d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r8d		# store to W[n & 15]
+	xorl	-32+4*5(%rsp), %r8d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*0(%rsp), %r8d	# ^W[(n+8) & 15]
+	xorl	%r10d, %r8d	# ^W[(n+2) & 15]
+	roll	%r8d		#
 	addl	%edi, %ebp		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	leal	-0x70E44324(%rbp,%r8), %ebp # e += RCONST + W[n & 15]
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebp		# e += rotl32(a,5)
@@ -656,14 +642,12 @@ sha1_process_block64:
 	andl	%ebx, %esi		# si: b & c
 	andl	%ecx, %edi		# di: (b | c) & d
 	orl	%esi, %edi		# ((b | c) & d) | (b & c)
-	movl	-32+4*6(%rsp), %esi	# W[(n+13) & 15]
-	xorl	-32+4*1(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r11d, %esi		# ^W[(n+2) & 15]
-	xorl	%r9d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r9d		# store to W[n & 15]
+	xorl	-32+4*6(%rsp), %r9d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*1(%rsp), %r9d	# ^W[(n+8) & 15]
+	xorl	%r11d, %r9d	# ^W[(n+2) & 15]
+	roll	%r9d		#
 	addl	%edi, %edx		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	leal	-0x70E44324(%rdx,%r9), %edx # e += RCONST + W[n & 15]
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %edx		# e += rotl32(a,5)
@@ -675,14 +659,12 @@ sha1_process_block64:
 	andl	%eax, %esi		# si: b & c
 	andl	%ebx, %edi		# di: (b | c) & d
 	orl	%esi, %edi		# ((b | c) & d) | (b & c)
-	movl	-32+4*7(%rsp), %esi	# W[(n+13) & 15]
-	xorl	-32+4*2(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r12d, %esi		# ^W[(n+2) & 15]
-	xorl	%r10d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r10d		# store to W[n & 15]
+	xorl	-32+4*7(%rsp), %r10d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*2(%rsp), %r10d	# ^W[(n+8) & 15]
+	xorl	%r12d, %r10d	# ^W[(n+2) & 15]
+	roll	%r10d		#
 	addl	%edi, %ecx		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	leal	-0x70E44324(%rcx,%r10), %ecx # e += RCONST + W[n & 15]
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ecx		# e += rotl32(a,5)
@@ -694,14 +676,12 @@ sha1_process_block64:
 	andl	%ebp, %esi		# si: b & c
 	andl	%eax, %edi		# di: (b | c) & d
 	orl	%esi, %edi		# ((b | c) & d) | (b & c)
-	movl	%r8d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*3(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r13d, %esi		# ^W[(n+2) & 15]
-	xorl	%r11d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r11d		# store to W[n & 15]
+	xorl	%r8d, %r11d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*3(%rsp), %r11d	# ^W[(n+8) & 15]
+	xorl	%r13d, %r11d	# ^W[(n+2) & 15]
+	roll	%r11d		#
 	addl	%edi, %ebx		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	leal	-0x70E44324(%rbx,%r11), %ebx # e += RCONST + W[n & 15]
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebx		# e += rotl32(a,5)
@@ -713,14 +693,12 @@ sha1_process_block64:
 	andl	%edx, %esi		# si: b & c
 	andl	%ebp, %edi		# di: (b | c) & d
 	orl	%esi, %edi		# ((b | c) & d) | (b & c)
-	movl	%r9d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*4(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r14d, %esi		# ^W[(n+2) & 15]
-	xorl	%r12d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r12d		# store to W[n & 15]
+	xorl	%r9d, %r12d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*4(%rsp), %r12d	# ^W[(n+8) & 15]
+	xorl	%r14d, %r12d	# ^W[(n+2) & 15]
+	roll	%r12d		#
 	addl	%edi, %eax		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W
+	leal	-0x70E44324(%rax,%r12), %eax # e += RCONST + W[n & 15]
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %eax		# e += rotl32(a,5)
@@ -732,14 +710,12 @@ sha1_process_block64:
 	andl	%ecx, %esi		# si: b & c
 	andl	%edx, %edi		# di: (b | c) & d
 	orl	%esi, %edi		# ((b | c) & d) | (b & c)
-	movl	%r10d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*5(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r15d, %esi		# ^W[(n+2) & 15]
-	xorl	%r13d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r13d		# store to W[n & 15]
+	xorl	%r10d, %r13d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*5(%rsp), %r13d	# ^W[(n+8) & 15]
+	xorl	%r15d, %r13d	# ^W[(n+2) & 15]
+	roll	%r13d		#
 	addl	%edi, %ebp		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	leal	-0x70E44324(%rbp,%r13), %ebp # e += RCONST + W[n & 15]
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebp		# e += rotl32(a,5)
@@ -751,14 +727,12 @@ sha1_process_block64:
 	andl	%ebx, %esi		# si: b & c
 	andl	%ecx, %edi		# di: (b | c) & d
 	orl	%esi, %edi		# ((b | c) & d) | (b & c)
-	movl	%r11d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*6(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	-32+4*0(%rsp), %esi		# ^W[(n+2) & 15]
-	xorl	%r14d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r14d		# store to W[n & 15]
+	xorl	%r11d, %r14d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*6(%rsp), %r14d	# ^W[(n+8) & 15]
+	xorl	-32+4*0(%rsp), %r14d	# ^W[(n+2) & 15]
+	roll	%r14d		#
 	addl	%edi, %edx		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	leal	-0x70E44324(%rdx,%r14), %edx # e += RCONST + W[n & 15]
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %edx		# e += rotl32(a,5)
@@ -770,14 +744,12 @@ sha1_process_block64:
 	andl	%eax, %esi		# si: b & c
 	andl	%ebx, %edi		# di: (b | c) & d
 	orl	%esi, %edi		# ((b | c) & d) | (b & c)
-	movl	%r12d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*7(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	-32+4*1(%rsp), %esi		# ^W[(n+2) & 15]
-	xorl	%r15d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r15d		# store to W[n & 15]
+	xorl	%r12d, %r15d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*7(%rsp), %r15d	# ^W[(n+8) & 15]
+	xorl	-32+4*1(%rsp), %r15d	# ^W[(n+2) & 15]
+	roll	%r15d		#
 	addl	%edi, %ecx		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	leal	-0x70E44324(%rcx,%r15), %ecx # e += RCONST + W[n & 15]
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ecx		# e += rotl32(a,5)
@@ -796,7 +768,7 @@ sha1_process_block64:
 	roll	%esi			#
 	movl	%esi, -32+4*0(%rsp)		# store to W[n & 15]
 	addl	%edi, %ebx		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	leal	-0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebx		# e += rotl32(a,5)
@@ -815,7 +787,7 @@ sha1_process_block64:
 	roll	%esi			#
 	movl	%esi, -32+4*1(%rsp)		# store to W[n & 15]
 	addl	%edi, %eax		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W
+	leal	-0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15]
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %eax		# e += rotl32(a,5)
@@ -834,7 +806,7 @@ sha1_process_block64:
 	roll	%esi			#
 	movl	%esi, -32+4*2(%rsp)		# store to W[n & 15]
 	addl	%edi, %ebp		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	leal	-0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15]
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebp		# e += rotl32(a,5)
@@ -853,7 +825,7 @@ sha1_process_block64:
 	roll	%esi			#
 	movl	%esi, -32+4*3(%rsp)		# store to W[n & 15]
 	addl	%edi, %edx		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	leal	-0x70E44324(%rdx,%rsi), %edx # e += RCONST + W[n & 15]
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %edx		# e += rotl32(a,5)
@@ -872,7 +844,7 @@ sha1_process_block64:
 	roll	%esi			#
 	movl	%esi, -32+4*4(%rsp)		# store to W[n & 15]
 	addl	%edi, %ecx		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	leal	-0x70E44324(%rcx,%rsi), %ecx # e += RCONST + W[n & 15]
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ecx		# e += rotl32(a,5)
@@ -891,7 +863,7 @@ sha1_process_block64:
 	roll	%esi			#
 	movl	%esi, -32+4*5(%rsp)		# store to W[n & 15]
 	addl	%edi, %ebx		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	leal	-0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebx		# e += rotl32(a,5)
@@ -910,7 +882,7 @@ sha1_process_block64:
 	roll	%esi			#
 	movl	%esi, -32+4*6(%rsp)		# store to W[n & 15]
 	addl	%edi, %eax		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W
+	leal	-0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15]
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %eax		# e += rotl32(a,5)
@@ -929,7 +901,7 @@ sha1_process_block64:
 	roll	%esi			#
 	movl	%esi, -32+4*7(%rsp)		# store to W[n & 15]
 	addl	%edi, %ebp		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	leal	-0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15]
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebp		# e += rotl32(a,5)
@@ -941,14 +913,12 @@ sha1_process_block64:
 	andl	%ebx, %esi		# si: b & c
 	andl	%ecx, %edi		# di: (b | c) & d
 	orl	%esi, %edi		# ((b | c) & d) | (b & c)
-	movl	-32+4*5(%rsp), %esi	# W[(n+13) & 15]
-	xorl	-32+4*0(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r10d, %esi		# ^W[(n+2) & 15]
-	xorl	%r8d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r8d		# store to W[n & 15]
+	xorl	-32+4*5(%rsp), %r8d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*0(%rsp), %r8d	# ^W[(n+8) & 15]
+	xorl	%r10d, %r8d	# ^W[(n+2) & 15]
+	roll	%r8d		#
 	addl	%edi, %edx		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	leal	-0x70E44324(%rdx,%r8), %edx # e += RCONST + W[n & 15]
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %edx		# e += rotl32(a,5)
@@ -960,14 +930,12 @@ sha1_process_block64:
 	andl	%eax, %esi		# si: b & c
 	andl	%ebx, %edi		# di: (b | c) & d
 	orl	%esi, %edi		# ((b | c) & d) | (b & c)
-	movl	-32+4*6(%rsp), %esi	# W[(n+13) & 15]
-	xorl	-32+4*1(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r11d, %esi		# ^W[(n+2) & 15]
-	xorl	%r9d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r9d		# store to W[n & 15]
+	xorl	-32+4*6(%rsp), %r9d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*1(%rsp), %r9d	# ^W[(n+8) & 15]
+	xorl	%r11d, %r9d	# ^W[(n+2) & 15]
+	roll	%r9d		#
 	addl	%edi, %ecx		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	leal	-0x70E44324(%rcx,%r9), %ecx # e += RCONST + W[n & 15]
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ecx		# e += rotl32(a,5)
@@ -979,14 +947,12 @@ sha1_process_block64:
 	andl	%ebp, %esi		# si: b & c
 	andl	%eax, %edi		# di: (b | c) & d
 	orl	%esi, %edi		# ((b | c) & d) | (b & c)
-	movl	-32+4*7(%rsp), %esi	# W[(n+13) & 15]
-	xorl	-32+4*2(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r12d, %esi		# ^W[(n+2) & 15]
-	xorl	%r10d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r10d		# store to W[n & 15]
+	xorl	-32+4*7(%rsp), %r10d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*2(%rsp), %r10d	# ^W[(n+8) & 15]
+	xorl	%r12d, %r10d	# ^W[(n+2) & 15]
+	roll	%r10d		#
 	addl	%edi, %ebx		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	leal	-0x70E44324(%rbx,%r10), %ebx # e += RCONST + W[n & 15]
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebx		# e += rotl32(a,5)
@@ -998,77 +964,67 @@ sha1_process_block64:
 	andl	%edx, %esi		# si: b & c
 	andl	%ebp, %edi		# di: (b | c) & d
 	orl	%esi, %edi		# ((b | c) & d) | (b & c)
-	movl	%r8d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*3(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r13d, %esi		# ^W[(n+2) & 15]
-	xorl	%r11d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r11d		# store to W[n & 15]
+	xorl	%r8d, %r11d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*3(%rsp), %r11d	# ^W[(n+8) & 15]
+	xorl	%r13d, %r11d	# ^W[(n+2) & 15]
+	roll	%r11d		#
 	addl	%edi, %eax		# += ((b | c) & d) | (b & c)
-	leal	-0x70e44324(%rax,%rsi), %eax # e += RCONST + mixed_W
+	leal	-0x70E44324(%rax,%r11), %eax # e += RCONST + W[n & 15]
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx		# b = rotl32(b,30)
 # 60
-	movl	%r9d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*4(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r14d, %esi		# ^W[(n+2) & 15]
-	xorl	%r12d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r12d		# store to W[n & 15]
+	xorl	%r9d, %r12d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*4(%rsp), %r12d	# ^W[(n+8) & 15]
+	xorl	%r14d, %r12d	# ^W[(n+2) & 15]
+	roll	%r12d		#
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	xorl	%ebx, %edi		# ^b
-	leal	-0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rbp,%r12), %ebp # e += RCONST + W[n & 15]
 	addl	%edi, %ebp		# e += (c ^ d ^ b)
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx		# b = rotl32(b,30)
 # 61
-	movl	%r10d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*5(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r15d, %esi		# ^W[(n+2) & 15]
-	xorl	%r13d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r13d		# store to W[n & 15]
+	xorl	%r10d, %r13d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*5(%rsp), %r13d	# ^W[(n+8) & 15]
+	xorl	%r15d, %r13d	# ^W[(n+2) & 15]
+	roll	%r13d		#
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	xorl	%eax, %edi		# ^b
-	leal	-0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rdx,%r13), %edx # e += RCONST + W[n & 15]
 	addl	%edi, %edx		# e += (c ^ d ^ b)
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %edx		# e += rotl32(a,5)
 	rorl	$2, %eax		# b = rotl32(b,30)
 # 62
-	movl	%r11d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*6(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	-32+4*0(%rsp), %esi		# ^W[(n+2) & 15]
-	xorl	%r14d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r14d		# store to W[n & 15]
+	xorl	%r11d, %r14d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*6(%rsp), %r14d	# ^W[(n+8) & 15]
+	xorl	-32+4*0(%rsp), %r14d	# ^W[(n+2) & 15]
+	roll	%r14d		#
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
 	xorl	%ebp, %edi		# ^b
-	leal	-0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rcx,%r14), %ecx # e += RCONST + W[n & 15]
 	addl	%edi, %ecx		# e += (c ^ d ^ b)
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ecx		# e += rotl32(a,5)
 	rorl	$2, %ebp		# b = rotl32(b,30)
 # 63
-	movl	%r12d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*7(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	-32+4*1(%rsp), %esi		# ^W[(n+2) & 15]
-	xorl	%r15d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r15d		# store to W[n & 15]
+	xorl	%r12d, %r15d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*7(%rsp), %r15d	# ^W[(n+8) & 15]
+	xorl	-32+4*1(%rsp), %r15d	# ^W[(n+2) & 15]
+	roll	%r15d		#
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	xorl	%edx, %edi		# ^b
-	leal	-0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rbx,%r15), %ebx # e += RCONST + W[n & 15]
 	addl	%edi, %ebx		# e += (c ^ d ^ b)
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -1084,7 +1040,7 @@ sha1_process_block64:
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
 	xorl	%ecx, %edi		# ^b
-	leal	-0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15]
 	addl	%edi, %eax		# e += (c ^ d ^ b)
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -1100,7 +1056,7 @@ sha1_process_block64:
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	xorl	%ebx, %edi		# ^b
-	leal	-0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15]
 	addl	%edi, %ebp		# e += (c ^ d ^ b)
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -1116,7 +1072,7 @@ sha1_process_block64:
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	xorl	%eax, %edi		# ^b
-	leal	-0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15]
 	addl	%edi, %edx		# e += (c ^ d ^ b)
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -1132,7 +1088,7 @@ sha1_process_block64:
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
 	xorl	%ebp, %edi		# ^b
-	leal	-0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rcx,%rsi), %ecx # e += RCONST + W[n & 15]
 	addl	%edi, %ecx		# e += (c ^ d ^ b)
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -1148,7 +1104,7 @@ sha1_process_block64:
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	xorl	%edx, %edi		# ^b
-	leal	-0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
 	addl	%edi, %ebx		# e += (c ^ d ^ b)
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -1164,7 +1120,7 @@ sha1_process_block64:
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
 	xorl	%ecx, %edi		# ^b
-	leal	-0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15]
 	addl	%edi, %eax		# e += (c ^ d ^ b)
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -1180,7 +1136,7 @@ sha1_process_block64:
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	xorl	%ebx, %edi		# ^b
-	leal	-0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15]
 	addl	%edi, %ebp		# e += (c ^ d ^ b)
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -1196,135 +1152,119 @@ sha1_process_block64:
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	xorl	%eax, %edi		# ^b
-	leal	-0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15]
 	addl	%edi, %edx		# e += (c ^ d ^ b)
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %edx		# e += rotl32(a,5)
 	rorl	$2, %eax		# b = rotl32(b,30)
 # 72
-	movl	-32+4*5(%rsp), %esi	# W[(n+13) & 15]
-	xorl	-32+4*0(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r10d, %esi		# ^W[(n+2) & 15]
-	xorl	%r8d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r8d		# store to W[n & 15]
+	xorl	-32+4*5(%rsp), %r8d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*0(%rsp), %r8d	# ^W[(n+8) & 15]
+	xorl	%r10d, %r8d	# ^W[(n+2) & 15]
+	roll	%r8d		#
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
 	xorl	%ebp, %edi		# ^b
-	leal	-0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rcx,%r8), %ecx # e += RCONST + W[n & 15]
 	addl	%edi, %ecx		# e += (c ^ d ^ b)
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ecx		# e += rotl32(a,5)
 	rorl	$2, %ebp		# b = rotl32(b,30)
 # 73
-	movl	-32+4*6(%rsp), %esi	# W[(n+13) & 15]
-	xorl	-32+4*1(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r11d, %esi		# ^W[(n+2) & 15]
-	xorl	%r9d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r9d		# store to W[n & 15]
+	xorl	-32+4*6(%rsp), %r9d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*1(%rsp), %r9d	# ^W[(n+8) & 15]
+	xorl	%r11d, %r9d	# ^W[(n+2) & 15]
+	roll	%r9d		#
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	xorl	%edx, %edi		# ^b
-	leal	-0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rbx,%r9), %ebx # e += RCONST + W[n & 15]
 	addl	%edi, %ebx		# e += (c ^ d ^ b)
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx		# b = rotl32(b,30)
 # 74
-	movl	-32+4*7(%rsp), %esi	# W[(n+13) & 15]
-	xorl	-32+4*2(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r12d, %esi		# ^W[(n+2) & 15]
-	xorl	%r10d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r10d		# store to W[n & 15]
+	xorl	-32+4*7(%rsp), %r10d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*2(%rsp), %r10d	# ^W[(n+8) & 15]
+	xorl	%r12d, %r10d	# ^W[(n+2) & 15]
+	roll	%r10d		#
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
 	xorl	%ecx, %edi		# ^b
-	leal	-0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rax,%r10), %eax # e += RCONST + W[n & 15]
 	addl	%edi, %eax		# e += (c ^ d ^ b)
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %eax		# e += rotl32(a,5)
 	rorl	$2, %ecx		# b = rotl32(b,30)
 # 75
-	movl	%r8d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*3(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r13d, %esi		# ^W[(n+2) & 15]
-	xorl	%r11d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r11d		# store to W[n & 15]
+	xorl	%r8d, %r11d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*3(%rsp), %r11d	# ^W[(n+8) & 15]
+	xorl	%r13d, %r11d	# ^W[(n+2) & 15]
+	roll	%r11d		#
 	movl	%ecx, %edi		# c
 	xorl	%edx, %edi		# ^d
 	xorl	%ebx, %edi		# ^b
-	leal	-0x359d3e2a(%rbp,%rsi), %ebp # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rbp,%r11), %ebp # e += RCONST + W[n & 15]
 	addl	%edi, %ebp		# e += (c ^ d ^ b)
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebp		# e += rotl32(a,5)
 	rorl	$2, %ebx		# b = rotl32(b,30)
 # 76
-	movl	%r9d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*4(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r14d, %esi		# ^W[(n+2) & 15]
-	xorl	%r12d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	movl	%esi, %r12d		# store to W[n & 15]
+	xorl	%r9d, %r12d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*4(%rsp), %r12d	# ^W[(n+8) & 15]
+	xorl	%r14d, %r12d	# ^W[(n+2) & 15]
+	roll	%r12d		#
 	movl	%ebx, %edi		# c
 	xorl	%ecx, %edi		# ^d
 	xorl	%eax, %edi		# ^b
-	leal	-0x359d3e2a(%rdx,%rsi), %edx # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rdx,%r12), %edx # e += RCONST + W[n & 15]
 	addl	%edi, %edx		# e += (c ^ d ^ b)
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %edx		# e += rotl32(a,5)
 	rorl	$2, %eax		# b = rotl32(b,30)
 # 77
-	movl	%r10d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*5(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	%r15d, %esi		# ^W[(n+2) & 15]
-	xorl	%r13d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	# store to W[n & 15] - unused, not done
+	xorl	%r10d, %r13d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*5(%rsp), %r13d	# ^W[(n+8) & 15]
+	xorl	%r15d, %r13d	# ^W[(n+2) & 15]
+	roll	%r13d		#
 	movl	%eax, %edi		# c
 	xorl	%ebx, %edi		# ^d
 	xorl	%ebp, %edi		# ^b
-	leal	-0x359d3e2a(%rcx,%rsi), %ecx # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rcx,%r13), %ecx # e += RCONST + W[n & 15]
 	addl	%edi, %ecx		# e += (c ^ d ^ b)
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ecx		# e += rotl32(a,5)
 	rorl	$2, %ebp		# b = rotl32(b,30)
 # 78
-	movl	%r11d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*6(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	-32+4*0(%rsp), %esi		# ^W[(n+2) & 15]
-	xorl	%r14d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	# store to W[n & 15] - unused, not done
+	xorl	%r11d, %r14d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*6(%rsp), %r14d	# ^W[(n+8) & 15]
+	xorl	-32+4*0(%rsp), %r14d	# ^W[(n+2) & 15]
+	roll	%r14d		#
 	movl	%ebp, %edi		# c
 	xorl	%eax, %edi		# ^d
 	xorl	%edx, %edi		# ^b
-	leal	-0x359d3e2a(%rbx,%rsi), %ebx # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rbx,%r14), %ebx # e += RCONST + W[n & 15]
 	addl	%edi, %ebx		# e += (c ^ d ^ b)
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
 	addl	%esi, %ebx		# e += rotl32(a,5)
 	rorl	$2, %edx		# b = rotl32(b,30)
 # 79
-	movl	%r12d, %esi	# W[(n+13) & 15]
-	xorl	-32+4*7(%rsp), %esi		# ^W[(n+8) & 15]
-	xorl	-32+4*1(%rsp), %esi		# ^W[(n+2) & 15]
-	xorl	%r15d, %esi		# ^W[n & 15]
-	roll	%esi			#
-	# store to W[n & 15] - unused, not done
+	xorl	%r12d, %r15d	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	-32+4*7(%rsp), %r15d	# ^W[(n+8) & 15]
+	xorl	-32+4*1(%rsp), %r15d	# ^W[(n+2) & 15]
+	roll	%r15d		#
 	movl	%edx, %edi		# c
 	xorl	%ebp, %edi		# ^d
 	xorl	%ecx, %edi		# ^b
-	leal	-0x359d3e2a(%rax,%rsi), %eax # e += RCONST + mixed_W
+	leal	-0x359D3E2A(%rax,%r15), %eax # e += RCONST + W[n & 15]
 	addl	%edi, %eax		# e += (c ^ d ^ b)
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
new file mode 100755
index 000000000..931c0f0fd
--- /dev/null
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -0,0 +1,267 @@
+#!/bin/sh
+
+# We don't regenerate it on every "make" invocation - only by hand.
+# The reason is that the changes to generated code are difficult
+# to visualize by looking only at this script, it helps when the commit
+# also contains the diff of the generated file.
+exec >hash_md5_sha_x86-64.S
+
+echo \
+'### Generated by hash_md5_sha_x86-64.S.sh ###
+
+#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
+	.section	.text.sha1_process_block64,"ax",@progbits
+	.globl  sha1_process_block64
+	.hidden sha1_process_block64
+	.type	sha1_process_block64, @function
+
+	.balign	8	# allow decoders to fetch at least 4 first insns
+sha1_process_block64:
+	pushq	%r15	#
+	pushq	%r14	#
+	pushq	%r13	#
+	pushq	%r12	#
+	pushq	%rbp	#
+	pushq	%rbx	#
+	pushq	%rdi	# we need ctx at the end
+
+#Register and stack use:
+# eax..edx: a..d
+# ebp: e
+# esi,edi: temps
+# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
+# (TODO: actually W[0..7] are used a bit more often, put _thme_ into r8..r15?)
+
+	movq	4*8(%rdi), %r8
+	bswapq	%r8
+	movl	%r8d, %r9d
+	shrq	$32, %r8
+	movq	4*10(%rdi), %r10
+	bswapq	%r10
+	movl	%r10d, %r11d
+	shrq	$32, %r10
+	movq	4*12(%rdi), %r12
+	bswapq	%r12
+	movl	%r12d, %r13d
+	shrq	$32, %r12
+	movq	4*14(%rdi), %r14
+	bswapq	%r14
+	movl	%r14d, %r15d
+	shrq	$32, %r14
+
+	movl	$3, %eax
+1:
+	movq	(%rdi,%rax,8), %rsi
+	bswapq	%rsi
+	rolq	$32, %rsi
+	movq	%rsi, -32(%rsp,%rax,8)
+	decl	%eax
+	jns	1b
+	movl	80(%rdi), %eax		# a = ctx->hash[0]
+	movl	84(%rdi), %ebx		# b = ctx->hash[1]
+	movl	88(%rdi), %ecx		# c = ctx->hash[2]
+	movl	92(%rdi), %edx		# d = ctx->hash[3]
+	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+'
+W32() {
+test "$1" || exit 1
+test "$1" -lt 0 && exit 1
+test "$1" -gt 15 && exit 1
+test "$1" -lt 8 && echo "-32+4*$1(%rsp)"
+test "$1" -ge 8 && echo "%r${1}d"
+}
+
+RD1A() {
+local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
+local n=$(($6))
+echo "# $n"
+test $n = 0 && echo "
+	# W[0], already in %esi
+";test $n != 0 && test $n -lt 8 && echo "
+	movl	`W32 $n`, %esi		# W[n]
+";test $n -ge 8 && echo "
+	# W[n], in %r$n
+";echo "
+	movl	%e$c, %edi		# c
+	xorl	%e$d, %edi		# ^d
+	andl	%e$b, %edi		# &b
+	xorl	%e$d, %edi		# (((c ^ d) & b) ^ d)
+";test $n -lt 8 && echo "
+	leal	$RCONST(%r$e,%rsi),%e$e # e += RCONST + W[n]
+";test $n -ge 8 && echo "
+	leal	$RCONST(%r$e,%r$n),%e$e # e += RCONST + W[n]
+";echo "
+	addl	%edi, %e$e		# e += (((c ^ d) & b) ^ d)
+	movl	%e$a, %esi		#
+	roll	\$5, %esi		# rotl32(a,5)
+	addl	%esi, %e$e		# e += rotl32(a,5)
+	rorl	\$2, %e$b		# b = rotl32(b,30)
+"
+}
+RD1B() {
+local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
+local n=$(($6))
+local n13=$(((n+13) & 15))
+local n8=$(((n+8) & 15))
+local n2=$(((n+2) & 15))
+local n0=$(((n+0) & 15))
+echo "
+# $n
+";test $n0 -lt 8 && echo "
+	movl	`W32 $n13`, %esi	# W[(n+13) & 15]
+	xorl	`W32 $n8`, %esi		# ^W[(n+8) & 15]
+	xorl	`W32 $n2`, %esi		# ^W[(n+2) & 15]
+	xorl	`W32 $n0`, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, `W32 $n0`		# store to W[n & 15]
+";test $n0 -ge 8 && echo "
+	xorl	`W32 $n13`, `W32 $n0`	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	`W32 $n8`, `W32 $n0`	# ^W[(n+8) & 15]
+	xorl	`W32 $n2`, `W32 $n0`	# ^W[(n+2) & 15]
+	roll	`W32 $n0`		#
+"; echo "
+	movl	%e$c, %edi		# c
+	xorl	%e$d, %edi		# ^d
+	andl	%e$b, %edi		# &b
+	xorl	%e$d, %edi		# (((c ^ d) & b) ^ d)
+";test $n0 -lt 8 && echo "
+	leal	$RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
+";test $n0 -ge 8 && echo "
+	leal	$RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
+";echo "
+	addl	%edi, %e$e		# e += (((c ^ d) & b) ^ d)
+	movl	%e$a, %esi		#
+	roll	\$5, %esi		# rotl32(a,5)
+	addl	%esi, %e$e		# e += rotl32(a,5)
+	rorl	\$2, %e$b		# b = rotl32(b,30)
+"
+}
+{
+RCONST=0x5A827999
+RD1A ax bx cx dx bp  0; RD1A bp ax bx cx dx  1; RD1A dx bp ax bx cx  2; RD1A cx dx bp ax bx  3; RD1A bx cx dx bp ax  4
+RD1A ax bx cx dx bp  5; RD1A bp ax bx cx dx  6; RD1A dx bp ax bx cx  7; RD1A cx dx bp ax bx  8; RD1A bx cx dx bp ax  9
+RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14
+RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19
+} | grep -v '^$'
+
+RD2() {
+local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
+local n=$(($6))
+local n13=$(((n+13) & 15))
+local n8=$(((n+8) & 15))
+local n2=$(((n+2) & 15))
+local n0=$(((n+0) & 15))
+echo "
+# $n
+";test $n0 -lt 8 && echo "
+	movl	`W32 $n13`, %esi	# W[(n+13) & 15]
+	xorl	`W32 $n8`, %esi		# ^W[(n+8) & 15]
+	xorl	`W32 $n2`, %esi		# ^W[(n+2) & 15]
+	xorl	`W32 $n0`, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, `W32 $n0`		# store to W[n & 15]
+";test $n0 -ge 8 && echo "
+	xorl	`W32 $n13`, `W32 $n0`	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	`W32 $n8`, `W32 $n0`	# ^W[(n+8) & 15]
+	xorl	`W32 $n2`, `W32 $n0`	# ^W[(n+2) & 15]
+	roll	`W32 $n0`		#
+"; echo "
+	movl	%e$c, %edi		# c
+	xorl	%e$d, %edi		# ^d
+	xorl	%e$b, %edi		# ^b
+";test $n0 -lt 8 && echo "
+	leal	$RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
+";test $n0 -ge 8 && echo "
+	leal	$RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
+";echo "
+	addl	%edi, %e$e		# e += (c ^ d ^ b)
+	movl	%e$a, %esi		#
+	roll	\$5, %esi		# rotl32(a,5)
+	addl	%esi, %e$e		# e += rotl32(a,5)
+	rorl	\$2, %e$b		# b = rotl32(b,30)
+"
+}
+{
+RCONST=0x6ED9EBA1
+RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24
+RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29
+RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34
+RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39
+} | grep -v '^$'
+
+RD3() {
+local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
+local n=$(($6))
+local n13=$(((n+13) & 15))
+local n8=$(((n+8) & 15))
+local n2=$(((n+2) & 15))
+local n0=$(((n+0) & 15))
+echo "
+# $n
+	movl	%e$b, %edi		# di: b
+	movl	%e$b, %esi		# si: b
+	orl	%e$c, %edi		# di: b | c
+	andl	%e$c, %esi		# si: b & c
+	andl	%e$d, %edi		# di: (b | c) & d
+	orl	%esi, %edi		# ((b | c) & d) | (b & c)
+";test $n0 -lt 8 && echo "
+	movl	`W32 $n13`, %esi	# W[(n+13) & 15]
+	xorl	`W32 $n8`, %esi		# ^W[(n+8) & 15]
+	xorl	`W32 $n2`, %esi		# ^W[(n+2) & 15]
+	xorl	`W32 $n0`, %esi		# ^W[n & 15]
+	roll	%esi			#
+	movl	%esi, `W32 $n0`		# store to W[n & 15]
+";test $n0 -ge 8 && echo "
+	xorl	`W32 $n13`, `W32 $n0`	# W[n & 15] ^= W[(n+13) & 15]
+	xorl	`W32 $n8`, `W32 $n0`	# ^W[(n+8) & 15]
+	xorl	`W32 $n2`, `W32 $n0`	# ^W[(n+2) & 15]
+	roll	`W32 $n0`		#
+"; echo "
+	addl	%edi, %e$e		# += ((b | c) & d) | (b & c)
+";test $n0 -lt 8 && echo "
+	leal	$RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
+";test $n0 -ge 8 && echo "
+	leal	$RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
+";echo "
+	movl	%e$a, %esi		#
+	roll	\$5, %esi		# rotl32(a,5)
+	addl	%esi, %e$e		# e += rotl32(a,5)
+	rorl	\$2, %e$b		# b = rotl32(b,30)
+"
+}
+{
+#RCONST=0x8F1BBCDC "out of range for signed 32bit displacement"
+RCONST=-0x70E44324
+RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44
+RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49
+RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54
+RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59
+} | grep -v '^$'
+
+# Round 4 has the same logic as round 2, only n and RCONST are different
+{
+#RCONST=0xCA62C1D6 "out of range for signed 32bit displacement"
+RCONST=-0x359D3E2A
+RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64
+RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69
+RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74
+RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79
+} | grep -v '^$'
+
+echo "
+	popq	%rdi		#
+	addl	%eax, 80(%rdi)  # ctx->hash[0] += a
+	addl	%ebx, 84(%rdi)  # ctx->hash[1] += b
+	addl	%ecx, 88(%rdi)  # ctx->hash[2] += c
+	addl	%edx, 92(%rdi)  # ctx->hash[3] += d
+	addl	%ebp, 96(%rdi)  # ctx->hash[4] += e
+	popq	%rbx		#
+	popq	%rbp		#
+	popq	%r12		#
+	popq	%r13		#
+	popq	%r14		#
+	popq	%r15		#
+
+	ret
+	.size	sha1_process_block64, .-sha1_process_block64
+#endif"
-- 
cgit v1.2.3-55-g6feb


From 4387077f8e69c26ce5ce4a8119c225cc1c461f88 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Mon, 3 Jan 2022 13:14:09 +0100
Subject: typo fix

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/hash_md5_sha_x86-64.S    | 2 +-
 libbb/hash_md5_sha_x86-64.S.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 3e1c4b455..ec4e63765 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -21,7 +21,7 @@ sha1_process_block64:
 # ebp: e
 # esi,edi: temps
 # -32+4*n(%rsp),r8...r15: W[0..7,8..15]
-# (TODO: actually W[0..7] are used a bit more often, put _thme_ into r8..r15?)
+# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
 
 	movq	4*8(%rdi), %r8
 	bswapq	%r8
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 931c0f0fd..5f09546b2 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -30,7 +30,7 @@ sha1_process_block64:
 # ebp: e
 # esi,edi: temps
 # -32+4*n(%rsp),r8...r15: W[0..7,8..15]
-# (TODO: actually W[0..7] are used a bit more often, put _thme_ into r8..r15?)
+# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
 
 	movq	4*8(%rdi), %r8
 	bswapq	%r8
-- 
cgit v1.2.3-55-g6feb


From 7abb2bb96e0cd584f44dd8b219ad16d0232a6485 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Mon, 3 Jan 2022 17:02:48 +0100
Subject: libbb/sha1: x86_64 version: tidying up, no code changes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/hash_md5_sha_x86-64.S    | 32 ++++++++++++++++----------------
 libbb/hash_md5_sha_x86-64.S.sh | 33 ++++++++++++++++++++-------------
 2 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index ec4e63765..95b85d80a 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -60,7 +60,7 @@ sha1_process_block64:
 	xorl	%edx, %edi		# ^d
 	andl	%ebx, %edi		# &b
 	xorl	%edx, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rbp,%rsi),%ebp # e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
 	addl	%edi, %ebp		# e += (((c ^ d) & b) ^ d)
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -72,7 +72,7 @@ sha1_process_block64:
 	xorl	%ecx, %edi		# ^d
 	andl	%eax, %edi		# &b
 	xorl	%ecx, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rdx,%rsi),%edx # e += RCONST + W[n]
+	leal	0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
 	addl	%edi, %edx		# e += (((c ^ d) & b) ^ d)
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -84,7 +84,7 @@ sha1_process_block64:
 	xorl	%ebx, %edi		# ^d
 	andl	%ebp, %edi		# &b
 	xorl	%ebx, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rcx,%rsi),%ecx # e += RCONST + W[n]
+	leal	0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n]
 	addl	%edi, %ecx		# e += (((c ^ d) & b) ^ d)
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -96,7 +96,7 @@ sha1_process_block64:
 	xorl	%eax, %edi		# ^d
 	andl	%edx, %edi		# &b
 	xorl	%eax, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rbx,%rsi),%ebx # e += RCONST + W[n]
+	leal	0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n]
 	addl	%edi, %ebx		# e += (((c ^ d) & b) ^ d)
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -108,7 +108,7 @@ sha1_process_block64:
 	xorl	%ebp, %edi		# ^d
 	andl	%ecx, %edi		# &b
 	xorl	%ebp, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rax,%rsi),%eax # e += RCONST + W[n]
+	leal	0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n]
 	addl	%edi, %eax		# e += (((c ^ d) & b) ^ d)
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -120,7 +120,7 @@ sha1_process_block64:
 	xorl	%edx, %edi		# ^d
 	andl	%ebx, %edi		# &b
 	xorl	%edx, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rbp,%rsi),%ebp # e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
 	addl	%edi, %ebp		# e += (((c ^ d) & b) ^ d)
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -132,7 +132,7 @@ sha1_process_block64:
 	xorl	%ecx, %edi		# ^d
 	andl	%eax, %edi		# &b
 	xorl	%ecx, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rdx,%rsi),%edx # e += RCONST + W[n]
+	leal	0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
 	addl	%edi, %edx		# e += (((c ^ d) & b) ^ d)
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -144,7 +144,7 @@ sha1_process_block64:
 	xorl	%ebx, %edi		# ^d
 	andl	%ebp, %edi		# &b
 	xorl	%ebx, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rcx,%rsi),%ecx # e += RCONST + W[n]
+	leal	0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n]
 	addl	%edi, %ecx		# e += (((c ^ d) & b) ^ d)
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -156,7 +156,7 @@ sha1_process_block64:
 	xorl	%eax, %edi		# ^d
 	andl	%edx, %edi		# &b
 	xorl	%eax, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rbx,%r8),%ebx # e += RCONST + W[n]
+	leal	0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
 	addl	%edi, %ebx		# e += (((c ^ d) & b) ^ d)
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -168,7 +168,7 @@ sha1_process_block64:
 	xorl	%ebp, %edi		# ^d
 	andl	%ecx, %edi		# &b
 	xorl	%ebp, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rax,%r9),%eax # e += RCONST + W[n]
+	leal	0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
 	addl	%edi, %eax		# e += (((c ^ d) & b) ^ d)
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -180,7 +180,7 @@ sha1_process_block64:
 	xorl	%edx, %edi		# ^d
 	andl	%ebx, %edi		# &b
 	xorl	%edx, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rbp,%r10),%ebp # e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n]
 	addl	%edi, %ebp		# e += (((c ^ d) & b) ^ d)
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -192,7 +192,7 @@ sha1_process_block64:
 	xorl	%ecx, %edi		# ^d
 	andl	%eax, %edi		# &b
 	xorl	%ecx, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rdx,%r11),%edx # e += RCONST + W[n]
+	leal	0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n]
 	addl	%edi, %edx		# e += (((c ^ d) & b) ^ d)
 	movl	%ebp, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -204,7 +204,7 @@ sha1_process_block64:
 	xorl	%ebx, %edi		# ^d
 	andl	%ebp, %edi		# &b
 	xorl	%ebx, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rcx,%r12),%ecx # e += RCONST + W[n]
+	leal	0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n]
 	addl	%edi, %ecx		# e += (((c ^ d) & b) ^ d)
 	movl	%edx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -216,7 +216,7 @@ sha1_process_block64:
 	xorl	%eax, %edi		# ^d
 	andl	%edx, %edi		# &b
 	xorl	%eax, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rbx,%r13),%ebx # e += RCONST + W[n]
+	leal	0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
 	addl	%edi, %ebx		# e += (((c ^ d) & b) ^ d)
 	movl	%ecx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -228,7 +228,7 @@ sha1_process_block64:
 	xorl	%ebp, %edi		# ^d
 	andl	%ecx, %edi		# &b
 	xorl	%ebp, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rax,%r14),%eax # e += RCONST + W[n]
+	leal	0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
 	addl	%edi, %eax		# e += (((c ^ d) & b) ^ d)
 	movl	%ebx, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
@@ -240,7 +240,7 @@ sha1_process_block64:
 	xorl	%edx, %edi		# ^d
 	andl	%ebx, %edi		# &b
 	xorl	%edx, %edi		# (((c ^ d) & b) ^ d)
-	leal	0x5A827999(%rbp,%r15),%ebp # e += RCONST + W[n]
+	leal	0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n]
 	addl	%edi, %ebp		# e += (((c ^ d) & b) ^ d)
 	movl	%eax, %esi		#
 	roll	$5, %esi		# rotl32(a,5)
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 5f09546b2..c5f0ef504 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -74,22 +74,24 @@ test "$1" -ge 8 && echo "%r${1}d"
 RD1A() {
 local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
 local n=$(($6))
-echo "# $n"
-test $n = 0 && echo "
+local n0=$(((n+0) & 15))
+echo "
+# $n
+";test $n0 = 0 && echo "
 	# W[0], already in %esi
-";test $n != 0 && test $n -lt 8 && echo "
-	movl	`W32 $n`, %esi		# W[n]
-";test $n -ge 8 && echo "
-	# W[n], in %r$n
+";test $n0 != 0 && test $n0 -lt 8 && echo "
+	movl	`W32 $n0`, %esi		# W[n]
+";test $n0 -ge 8 && echo "
+	# W[n], in %r$n0
 ";echo "
 	movl	%e$c, %edi		# c
 	xorl	%e$d, %edi		# ^d
 	andl	%e$b, %edi		# &b
 	xorl	%e$d, %edi		# (((c ^ d) & b) ^ d)
-";test $n -lt 8 && echo "
-	leal	$RCONST(%r$e,%rsi),%e$e # e += RCONST + W[n]
-";test $n -ge 8 && echo "
-	leal	$RCONST(%r$e,%r$n),%e$e # e += RCONST + W[n]
+";test $n0 -lt 8 && echo "
+	leal	$RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
+";test $n0 -ge 8 && echo "
+	leal	$RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n]
 ";echo "
 	addl	%edi, %e$e		# e += (((c ^ d) & b) ^ d)
 	movl	%e$a, %esi		#
@@ -119,7 +121,7 @@ echo "
 	xorl	`W32 $n8`, `W32 $n0`	# ^W[(n+8) & 15]
 	xorl	`W32 $n2`, `W32 $n0`	# ^W[(n+2) & 15]
 	roll	`W32 $n0`		#
-"; echo "
+";echo "
 	movl	%e$c, %edi		# c
 	xorl	%e$d, %edi		# ^d
 	andl	%e$b, %edi		# &b
@@ -165,7 +167,7 @@ echo "
 	xorl	`W32 $n8`, `W32 $n0`	# ^W[(n+8) & 15]
 	xorl	`W32 $n2`, `W32 $n0`	# ^W[(n+2) & 15]
 	roll	`W32 $n0`		#
-"; echo "
+";echo "
 	movl	%e$c, %edi		# c
 	xorl	%e$d, %edi		# ^d
 	xorl	%e$b, %edi		# ^b
@@ -216,7 +218,7 @@ echo "
 	xorl	`W32 $n8`, `W32 $n0`	# ^W[(n+8) & 15]
 	xorl	`W32 $n2`, `W32 $n0`	# ^W[(n+2) & 15]
 	roll	`W32 $n0`		#
-"; echo "
+";echo "
 	addl	%edi, %e$e		# += ((b | c) & d) | (b & c)
 ";test $n0 -lt 8 && echo "
 	leal	$RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
@@ -246,6 +248,11 @@ RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx b
 RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69
 RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74
 RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79
+# Note: new W[n&15] values generated in last 3 iterations
+# (W[13,14,15]) are unused after each of these iterations.
+# Since we use r8..r15 for W[8..15], this does not matter.
+# If we switch to e.g. using r8..r15 for W[0..7], then saving of W[13,14,15]
+# (the "movl %esi, `W32 $n0`" insn) is a dead store and can be removed.
 } | grep -v '^$'
 
 echo "
-- 
cgit v1.2.3-55-g6feb


From 1fc520ed286f815cae1da1e9f8014cb18a256744 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Tue, 4 Jan 2022 00:50:32 +0100
Subject: md5/shaXsum: use FEATURE_COPYBUF_KB to size the buffer instead of
 fixed 4k

function                                             old     new   delta
md5_sha1_sum_main                                    536     565     +29
hash_file                                            419     401     -18
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 1/1 up/down: 29/-18)             Total: 11 bytes

In my test, for unrolled sha1, COPYBUF_KB=64 increases throughput
from 367 MB/s to 457 MB/s.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 coreutils/md5_sha1_sum.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/coreutils/md5_sha1_sum.c b/coreutils/md5_sha1_sum.c
index 4efa23061..3b389cb6b 100644
--- a/coreutils/md5_sha1_sum.c
+++ b/coreutils/md5_sha1_sum.c
@@ -151,10 +151,12 @@ static unsigned char *hash_bin_to_hex(unsigned char *hash_value,
 	return (unsigned char *)hex_value;
 }
 
+#define BUFSZ (CONFIG_FEATURE_COPYBUF_KB < 4 ? 4096 : CONFIG_FEATURE_COPYBUF_KB * 1024)
+
 #if !ENABLE_SHA3SUM
-# define hash_file(f,w) hash_file(f)
+# define hash_file(b,f,w) hash_file(b,f)
 #endif
-static uint8_t *hash_file(const char *filename, unsigned sha3_width)
+static uint8_t *hash_file(unsigned char *in_buf, const char *filename, unsigned sha3_width)
 {
 	int src_fd, hash_len, count;
 	union _ctx_ {
@@ -227,8 +229,7 @@ static uint8_t *hash_file(const char *filename, unsigned sha3_width)
 	}
 
 	{
-		RESERVE_CONFIG_UBUFFER(in_buf, 4096);
-		while ((count = safe_read(src_fd, in_buf, 4096)) > 0) {
+		while ((count = safe_read(src_fd, in_buf, BUFSZ)) > 0) {
 			update(&context, in_buf, count);
 		}
 		hash_value = NULL;
@@ -238,7 +239,6 @@ static uint8_t *hash_file(const char *filename, unsigned sha3_width)
 			final(&context, in_buf);
 			hash_value = hash_bin_to_hex(in_buf, hash_len);
 		}
-		RELEASE_CONFIG_BUFFER(in_buf);
 	}
 
 	if (src_fd != STDIN_FILENO) {
@@ -251,6 +251,7 @@ static uint8_t *hash_file(const char *filename, unsigned sha3_width)
 int md5_sha1_sum_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
 int md5_sha1_sum_main(int argc UNUSED_PARAM, char **argv)
 {
+	unsigned char *in_buf;
 	int return_value = EXIT_SUCCESS;
 	unsigned flags;
 #if ENABLE_SHA3SUM
@@ -279,6 +280,12 @@ int md5_sha1_sum_main(int argc UNUSED_PARAM, char **argv)
 	if (!*argv)
 		*--argv = (char*)"-";
 
+	/* The buffer is not alloc/freed for each input file:
+	 * for big values of COPYBUF_KB, this helps to keep its pages
+	 * pre-faulted and possibly even fully cached on local CPU.
+	 */
+	in_buf = xmalloc(BUFSZ);
+
 	do {
 		if (ENABLE_FEATURE_MD5_SHA1_SUM_CHECK && (flags & FLAG_CHECK)) {
 			FILE *pre_computed_stream;
@@ -310,7 +317,7 @@ int md5_sha1_sum_main(int argc UNUSED_PARAM, char **argv)
 				*filename_ptr = '\0';
 				filename_ptr += 2;
 
-				hash_value = hash_file(filename_ptr, sha3_width);
+				hash_value = hash_file(in_buf, filename_ptr, sha3_width);
 
 				if (hash_value && (strcmp((char*)hash_value, line) == 0)) {
 					if (!(flags & FLAG_SILENT))
@@ -339,7 +346,7 @@ int md5_sha1_sum_main(int argc UNUSED_PARAM, char **argv)
 			}
 			fclose_if_not_stdin(pre_computed_stream);
 		} else {
-			uint8_t *hash_value = hash_file(*argv, sha3_width);
+			uint8_t *hash_value = hash_file(in_buf, *argv, sha3_width);
 			if (hash_value == NULL) {
 				return_value = EXIT_FAILURE;
 			} else {
-- 
cgit v1.2.3-55-g6feb


From c3cfcc92422f6e525073226cdbfdcb00ab1e7dc7 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Tue, 4 Jan 2022 01:45:13 +0100
Subject: libbb/sha1: x86_64 version: reorder prologue/epilogue insns

It is not clear exactly why, but this increases hashing speed
on Skylake from 454 MB/s to 464 MB/s.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/hash_md5_sha_x86-64.S    | 60 ++++++++++++++++++-------------------
 libbb/hash_md5_sha_x86-64.S.sh | 67 +++++++++++++++++++++++-------------------
 2 files changed, 67 insertions(+), 60 deletions(-)

diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 95b85d80a..ff78fc049 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -6,14 +6,14 @@
 	.hidden sha1_process_block64
 	.type	sha1_process_block64, @function
 
-	.balign	8	# allow decoders to fetch at least 4 first insns
+	.balign	8	# allow decoders to fetch at least 5 first insns
 sha1_process_block64:
-	pushq	%r15	#
-	pushq	%r14	#
-	pushq	%r13	#
-	pushq	%r12	#
-	pushq	%rbp	#
-	pushq	%rbx	#
+	pushq	%rbp	# 1 byte insn
+	pushq	%rbx	# 1 byte insn
+	pushq	%r15	# 2 byte insn
+	pushq	%r14	# 2 byte insn
+	pushq	%r13	# 2 byte insn
+	pushq	%r12	# 2 byte insn
 	pushq	%rdi	# we need ctx at the end
 
 #Register and stack use:
@@ -22,24 +22,6 @@ sha1_process_block64:
 # esi,edi: temps
 # -32+4*n(%rsp),r8...r15: W[0..7,8..15]
 # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
-
-	movq	4*8(%rdi), %r8
-	bswapq	%r8
-	movl	%r8d, %r9d
-	shrq	$32, %r8
-	movq	4*10(%rdi), %r10
-	bswapq	%r10
-	movl	%r10d, %r11d
-	shrq	$32, %r10
-	movq	4*12(%rdi), %r12
-	bswapq	%r12
-	movl	%r12d, %r13d
-	shrq	$32, %r12
-	movq	4*14(%rdi), %r14
-	bswapq	%r14
-	movl	%r14d, %r15d
-	shrq	$32, %r14
-
 	movl	$3, %eax
 1:
 	movq	(%rdi,%rax,8), %rsi
@@ -48,12 +30,30 @@ sha1_process_block64:
 	movq	%rsi, -32(%rsp,%rax,8)
 	decl	%eax
 	jns	1b
+
 	movl	80(%rdi), %eax		# a = ctx->hash[0]
 	movl	84(%rdi), %ebx		# b = ctx->hash[1]
 	movl	88(%rdi), %ecx		# c = ctx->hash[2]
 	movl	92(%rdi), %edx		# d = ctx->hash[3]
 	movl	96(%rdi), %ebp		# e = ctx->hash[4]
 
+	movq	4*8(%rdi), %r8
+	movq	4*10(%rdi), %r10
+	bswapq	%r8
+	bswapq	%r10
+	movq	4*12(%rdi), %r12
+	movq	4*14(%rdi), %r14
+	bswapq	%r12
+	bswapq	%r14
+	movl	%r8d, %r9d
+	shrq	$32, %r8
+	movl	%r10d, %r11d
+	shrq	$32, %r10
+	movl	%r12d, %r13d
+	shrq	$32, %r12
+	movl	%r14d, %r15d
+	shrq	$32, %r14
+
 # 0
 	# W[0], already in %esi
 	movl	%ecx, %edi		# c
@@ -1272,17 +1272,17 @@ sha1_process_block64:
 	rorl	$2, %ecx		# b = rotl32(b,30)
 
 	popq	%rdi		#
+	popq	%r12		#
 	addl	%eax, 80(%rdi)  # ctx->hash[0] += a
+	popq	%r13		#
 	addl	%ebx, 84(%rdi)  # ctx->hash[1] += b
+	popq	%r14		#
 	addl	%ecx, 88(%rdi)  # ctx->hash[2] += c
+	popq	%r15		#
 	addl	%edx, 92(%rdi)  # ctx->hash[3] += d
-	addl	%ebp, 96(%rdi)  # ctx->hash[4] += e
 	popq	%rbx		#
+	addl	%ebp, 96(%rdi)  # ctx->hash[4] += e
 	popq	%rbp		#
-	popq	%r12		#
-	popq	%r13		#
-	popq	%r14		#
-	popq	%r15		#
 
 	ret
 	.size	sha1_process_block64, .-sha1_process_block64
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index c5f0ef504..7e50b64fb 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -15,14 +15,14 @@ echo \
 	.hidden sha1_process_block64
 	.type	sha1_process_block64, @function
 
-	.balign	8	# allow decoders to fetch at least 4 first insns
+	.balign	8	# allow decoders to fetch at least 5 first insns
 sha1_process_block64:
-	pushq	%r15	#
-	pushq	%r14	#
-	pushq	%r13	#
-	pushq	%r12	#
-	pushq	%rbp	#
-	pushq	%rbx	#
+	pushq	%rbp	# 1 byte insn
+	pushq	%rbx	# 1 byte insn
+	pushq	%r15	# 2 byte insn
+	pushq	%r14	# 2 byte insn
+	pushq	%r13	# 2 byte insn
+	pushq	%r12	# 2 byte insn
 	pushq	%rdi	# we need ctx at the end
 
 #Register and stack use:
@@ -31,24 +31,6 @@ sha1_process_block64:
 # esi,edi: temps
 # -32+4*n(%rsp),r8...r15: W[0..7,8..15]
 # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
-
-	movq	4*8(%rdi), %r8
-	bswapq	%r8
-	movl	%r8d, %r9d
-	shrq	$32, %r8
-	movq	4*10(%rdi), %r10
-	bswapq	%r10
-	movl	%r10d, %r11d
-	shrq	$32, %r10
-	movq	4*12(%rdi), %r12
-	bswapq	%r12
-	movl	%r12d, %r13d
-	shrq	$32, %r12
-	movq	4*14(%rdi), %r14
-	bswapq	%r14
-	movl	%r14d, %r15d
-	shrq	$32, %r14
-
 	movl	$3, %eax
 1:
 	movq	(%rdi,%rax,8), %rsi
@@ -57,11 +39,29 @@ sha1_process_block64:
 	movq	%rsi, -32(%rsp,%rax,8)
 	decl	%eax
 	jns	1b
+
 	movl	80(%rdi), %eax		# a = ctx->hash[0]
 	movl	84(%rdi), %ebx		# b = ctx->hash[1]
 	movl	88(%rdi), %ecx		# c = ctx->hash[2]
 	movl	92(%rdi), %edx		# d = ctx->hash[3]
 	movl	96(%rdi), %ebp		# e = ctx->hash[4]
+
+	movq	4*8(%rdi), %r8
+	movq	4*10(%rdi), %r10
+	bswapq	%r8
+	bswapq	%r10
+	movq	4*12(%rdi), %r12
+	movq	4*14(%rdi), %r14
+	bswapq	%r12
+	bswapq	%r14
+	movl	%r8d, %r9d
+	shrq	$32, %r8
+	movl	%r10d, %r11d
+	shrq	$32, %r10
+	movl	%r12d, %r13d
+	shrq	$32, %r12
+	movl	%r14d, %r15d
+	shrq	$32, %r14
 '
 W32() {
 test "$1" || exit 1
@@ -71,6 +71,13 @@ test "$1" -lt 8 && echo "-32+4*$1(%rsp)"
 test "$1" -ge 8 && echo "%r${1}d"
 }
 
+# It's possible to interleave insns in rounds to mostly eliminate
+# dependency chains, but this is likely to only help old Pentium-based
+# CPUs (ones without OOO, which can only simultaneously execute a pair
+# of _adjacent_ insns).
+# Testing on an old-ish Silvermont CPU (which has an OOO window of only
+# about 8 insns) shows a very small (~1%) speedup.
+
 RD1A() {
 local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
 local n=$(($6))
@@ -257,17 +264,17 @@ RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx b
 
 echo "
 	popq	%rdi		#
+	popq	%r12		#
 	addl	%eax, 80(%rdi)  # ctx->hash[0] += a
+	popq	%r13		#
 	addl	%ebx, 84(%rdi)  # ctx->hash[1] += b
+	popq	%r14		#
 	addl	%ecx, 88(%rdi)  # ctx->hash[2] += c
+	popq	%r15		#
 	addl	%edx, 92(%rdi)  # ctx->hash[3] += d
-	addl	%ebp, 96(%rdi)  # ctx->hash[4] += e
 	popq	%rbx		#
+	addl	%ebp, 96(%rdi)  # ctx->hash[4] += e
 	popq	%rbp		#
-	popq	%r12		#
-	popq	%r13		#
-	popq	%r14		#
-	popq	%r15		#
 
 	ret
 	.size	sha1_process_block64, .-sha1_process_block64
-- 
cgit v1.2.3-55-g6feb


From ed2af2e82dbcfccb7392e9fbc3f837de1594c103 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Tue, 4 Jan 2022 14:32:41 +0100
Subject: build system: detect if build host has no bzip2

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 scripts/bb_release       |  6 +++---
 scripts/embedded_scripts |  6 ++++++
 scripts/mkconfigs        | 11 +++++++++++
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/scripts/bb_release b/scripts/bb_release
index 545440d3a..180ad8f2e 100755
--- a/scripts/bb_release
+++ b/scripts/bb_release
@@ -17,7 +17,7 @@ VERSION=`ls busybox-*.tar.gz | sed 's/busybox-\(.*\)\.tar\.gz/\1/'`
 zcat busybox-$VERSION.tar.gz | bzip2 > busybox-$VERSION.tar.bz2
 
 for releasefile in busybox-$VERSION.tar.gz busybox-$VERSION.tar.bz2; do
-    test -f $releasefile || { echo "no $releasefile"; exit 1; }
-    gpg --detach-sign $releasefile
-    sha256sum $releasefile > $releasefile.sha256
+	test -f $releasefile || { echo "no $releasefile"; exit 1; }
+	gpg --detach-sign $releasefile
+	sha256sum $releasefile > $releasefile.sha256
 done
diff --git a/scripts/embedded_scripts b/scripts/embedded_scripts
index aa7bf3e8a..205ac591a 100755
--- a/scripts/embedded_scripts
+++ b/scripts/embedded_scripts
@@ -23,6 +23,12 @@ if test $? != 0; then
 	exit 1
 fi
 
+bzip2 </dev/null >/dev/null
+if test $? != 0; then
+	echo 'bzip2 is not installed'
+	exit 1
+fi
+
 custom_scripts=""
 if [ -d "$custom_loc" ]
 then
diff --git a/scripts/mkconfigs b/scripts/mkconfigs
index 6a26fe1dd..1bbf10c3a 100755
--- a/scripts/mkconfigs
+++ b/scripts/mkconfigs
@@ -28,6 +28,17 @@
 
 config=.config
 
+od -v -b </dev/null >/dev/null
+if test $? != 0; then
+	echo 'od tool is not installed or cannot accept "-v -b" options'
+	exit 1
+fi
+bzip2 </dev/null >/dev/null
+if test $? != 0; then
+	echo 'bzip2 is not installed'
+	exit 1
+fi
+
 {
 echo "\
 #ifndef _BBCONFIGOPTS_H
-- 
cgit v1.2.3-55-g6feb


From 286b33721d5f6afd615f752ea83bbd72658c6bb9 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Tue, 4 Jan 2022 19:42:36 +0100
Subject: sed: correctly handle 'w FILE' commands writing to the same file

function                                             old     new   delta
sed_xfopen_w                                           -      84     +84

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 editors/sed.c       | 31 +++++++++++++++++++++++++++++--
 testsuite/sed.tests |  9 +++++++++
 2 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/editors/sed.c b/editors/sed.c
index e8c82ac63..48b0dbf67 100644
--- a/editors/sed.c
+++ b/editors/sed.c
@@ -97,6 +97,12 @@ enum {
 	OPT_in_place = 1 << 0,
 };
 
+struct sed_FILE {
+	struct sed_FILE *next; /* Next (linked list, NULL terminated) */
+	const char *fname;
+	FILE *fp;
+};
+
 /* Each sed command turns into one of these structures. */
 typedef struct sed_cmd_s {
 	/* Ordered by alignment requirements: currently 36 bytes on x86 */
@@ -151,6 +157,11 @@ struct globals {
 	/* linked list of append lines */
 	llist_t *append_head;
 
+	/* linked list of FILEs opened for 'w' and 's///w'.
+	 * Needed to handle duplicate fnames: sed '/a/w F;/b/w F'
+	 */
+	struct sed_FILE *FILE_head;
+
 	char *add_cmd_line;
 
 	struct pipeline {
@@ -211,6 +222,22 @@ static void sed_free_and_close_stuff(void)
 void sed_free_and_close_stuff(void);
 #endif
 
+static FILE *sed_xfopen_w(const char *fname)
+{
+	struct sed_FILE **pp = &G.FILE_head;
+	struct sed_FILE *cur;
+	while ((cur = *pp) != NULL) {
+		if (strcmp(cur->fname, fname) == 0)
+			return cur->fp;
+		pp = &cur->next;
+	}
+	*pp = cur = xzalloc(sizeof(*cur));
+	/*cur->next = NULL; - already is */
+	cur->fname = xstrdup(fname);
+	cur->fp = xfopen_for_write(fname);
+	return cur->fp;
+}
+
 /* If something bad happens during -i operation, delete temp file */
 
 static void cleanup_outname(void)
@@ -446,7 +473,7 @@ static int parse_subst_cmd(sed_cmd_t *sed_cmd, const char *substr)
 		{
 			char *fname;
 			idx += parse_file_cmd(/*sed_cmd,*/ substr+idx+1, &fname);
-			sed_cmd->sw_file = xfopen_for_write(fname);
+			sed_cmd->sw_file = sed_xfopen_w(fname);
 			sed_cmd->sw_last_char = '\n';
 			free(fname);
 			break;
@@ -561,7 +588,7 @@ static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr)
 		}
 		cmdstr += parse_file_cmd(/*sed_cmd,*/ cmdstr, &sed_cmd->string);
 		if (sed_cmd->cmd == 'w') {
-			sed_cmd->sw_file = xfopen_for_write(sed_cmd->string);
+			sed_cmd->sw_file = sed_xfopen_w(sed_cmd->string);
 			sed_cmd->sw_last_char = '\n';
 		}
 	}
diff --git a/testsuite/sed.tests b/testsuite/sed.tests
index 2b78c9b12..e62b839f7 100755
--- a/testsuite/sed.tests
+++ b/testsuite/sed.tests
@@ -405,6 +405,15 @@ testing "sed ^ OR not^" \
 	"" \
 	"abca\n"
 
+# This only works if file name is exactly the same.
+# For example, w FILE; w ./FILE won't work.
+testing "sed understands duplicate file name" \
+	"sed -n -e '/a/w sed.output' -e '/c/w sed.output' 2>&1 && cat sed.output && rm sed.output" \
+	"a\nc\n" \
+	"" \
+	"a\nb\nc\n"
+
+
 # testing "description" "commands" "result" "infile" "stdin"
 
 exit $FAILCOUNT
-- 
cgit v1.2.3-55-g6feb


From 31f45c1b369bee73843f7d791313423997618448 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Tue, 4 Jan 2022 23:31:58 +0100
Subject: libbb: factor out fflush_stdout_and_exit(EXIT_SUCCESS)

function                                             old     new   delta
fflush_stdout_and_exit_SUCCESS                         -       7      +7
xxd_main                                             890     888      -2
vlock_main                                           353     351      -2
uuencode_main                                        318     316      -2
uniq_main                                            427     425      -2
uname_main                                           250     248      -2
sort_main                                            853     851      -2
shuf_main                                            500     498      -2
route_main                                           238     236      -2
readlink_main                                        113     111      -2
nice_main                                            156     154      -2
last_main                                            957     955      -2
ipcs_main                                            960     958      -2
env_main                                             209     207      -2
chrt_main                                            464     462      -2
cal_main                                             921     919      -2
baseNUM_main                                         650     648      -2
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 0/16 up/down: 7/-32)            Total: -25 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 coreutils/env.c                | 2 +-
 coreutils/nice.c               | 2 +-
 coreutils/readlink.c           | 2 +-
 coreutils/shuf.c               | 2 +-
 coreutils/sort.c               | 2 +-
 coreutils/uname.c              | 2 +-
 coreutils/uniq.c               | 2 +-
 coreutils/uudecode.c           | 2 +-
 coreutils/uuencode.c           | 2 +-
 include/libbb.h                | 1 +
 libbb/fflush_stdout_and_exit.c | 5 +++++
 loginutils/vlock.c             | 2 +-
 networking/route.c             | 2 +-
 sysklogd/logread.c             | 2 +-
 util-linux/cal.c               | 2 +-
 util-linux/chrt.c              | 2 +-
 util-linux/hexdump_xxd.c       | 2 +-
 util-linux/ipcs.c              | 8 ++++----
 util-linux/last.c              | 2 +-
 util-linux/last_fancy.c        | 2 +-
 20 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/coreutils/env.c b/coreutils/env.c
index a0ea4dd27..6eafd06ef 100644
--- a/coreutils/env.c
+++ b/coreutils/env.c
@@ -100,7 +100,7 @@ int env_main(int argc UNUSED_PARAM, char **argv)
 		}
 	}
 
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
 
 /*
diff --git a/coreutils/nice.c b/coreutils/nice.c
index 28591ac61..e70da5d2b 100644
--- a/coreutils/nice.c
+++ b/coreutils/nice.c
@@ -33,7 +33,7 @@ int nice_main(int argc UNUSED_PARAM, char **argv)
 
 	if (!*++argv) { /* No args, so (GNU) output current nice value. */
 		printf("%d\n", old_priority);
-		fflush_stdout_and_exit(EXIT_SUCCESS);
+		fflush_stdout_and_exit_SUCCESS();
 	}
 
 	adjustment = 10;  /* Set default adjustment. */
diff --git a/coreutils/readlink.c b/coreutils/readlink.c
index 09d69df2b..b2e867883 100644
--- a/coreutils/readlink.c
+++ b/coreutils/readlink.c
@@ -96,5 +96,5 @@ int readlink_main(int argc UNUSED_PARAM, char **argv)
 	printf((opt & 2) ? "%s" : "%s\n", buf);
 	free(buf);
 
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
diff --git a/coreutils/shuf.c b/coreutils/shuf.c
index 3def3d80f..337366b45 100644
--- a/coreutils/shuf.c
+++ b/coreutils/shuf.c
@@ -171,5 +171,5 @@ int shuf_main(int argc, char **argv)
 			printf("%s%c", lines[i], eol);
 	}
 
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
diff --git a/coreutils/sort.c b/coreutils/sort.c
index 32a06e40a..0cbb6f597 100644
--- a/coreutils/sort.c
+++ b/coreutils/sort.c
@@ -644,5 +644,5 @@ int sort_main(int argc UNUSED_PARAM, char **argv)
 			printf("%s%c", lines[i], ch);
 	}
 
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
diff --git a/coreutils/uname.c b/coreutils/uname.c
index da785ab4c..6c0bdf096 100644
--- a/coreutils/uname.c
+++ b/coreutils/uname.c
@@ -209,5 +209,5 @@ int uname_main(int argc UNUSED_PARAM, char **argv UNUSED_PARAM)
 #endif
 	}
 
-	fflush_stdout_and_exit(EXIT_SUCCESS); /* coreutils-6.9 compat */
+	fflush_stdout_and_exit_SUCCESS(); /* coreutils-6.9 compat */
 }
diff --git a/coreutils/uniq.c b/coreutils/uniq.c
index a3058ac07..06c57f750 100644
--- a/coreutils/uniq.c
+++ b/coreutils/uniq.c
@@ -139,5 +139,5 @@ int uniq_main(int argc UNUSED_PARAM, char **argv)
 
 	die_if_ferror(stdin, input_filename);
 
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
diff --git a/coreutils/uudecode.c b/coreutils/uudecode.c
index e90902f52..63a8d4d48 100644
--- a/coreutils/uudecode.c
+++ b/coreutils/uudecode.c
@@ -352,7 +352,7 @@ int baseNUM_main(int argc UNUSED_PARAM, char **argv)
 #undef src_buf
 	}
 
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
 #endif
 
diff --git a/coreutils/uuencode.c b/coreutils/uuencode.c
index db49ec80a..f096e3122 100644
--- a/coreutils/uuencode.c
+++ b/coreutils/uuencode.c
@@ -78,5 +78,5 @@ int uuencode_main(int argc UNUSED_PARAM, char **argv)
 	}
 	printf(tbl == bb_uuenc_tbl_std ? "\n`\nend\n" : "\n====\n");
 
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
diff --git a/include/libbb.h b/include/libbb.h
index a48782832..8308d6259 100644
--- a/include/libbb.h
+++ b/include/libbb.h
@@ -1054,6 +1054,7 @@ void die_if_ferror(FILE *file, const char *msg) FAST_FUNC;
 void die_if_ferror_stdout(void) FAST_FUNC;
 int fflush_all(void) FAST_FUNC;
 void fflush_stdout_and_exit(int retval) NORETURN FAST_FUNC;
+void fflush_stdout_and_exit_SUCCESS(void) NORETURN FAST_FUNC;
 int fclose_if_not_stdin(FILE *file) FAST_FUNC;
 FILE* xfopen(const char *filename, const char *mode) FAST_FUNC;
 /* Prints warning to stderr and returns NULL on failure: */
diff --git a/libbb/fflush_stdout_and_exit.c b/libbb/fflush_stdout_and_exit.c
index 5df74170e..5a13ebcf8 100644
--- a/libbb/fflush_stdout_and_exit.c
+++ b/libbb/fflush_stdout_and_exit.c
@@ -20,3 +20,8 @@ void FAST_FUNC fflush_stdout_and_exit(int retval)
 	 * but use xfunc_die() */
 	xfunc_die();
 }
+
+void FAST_FUNC fflush_stdout_and_exit_SUCCESS(void)
+{
+	fflush_stdout_and_exit(EXIT_SUCCESS);
+}
diff --git a/loginutils/vlock.c b/loginutils/vlock.c
index 334b7d2ad..720835c4b 100644
--- a/loginutils/vlock.c
+++ b/loginutils/vlock.c
@@ -128,5 +128,5 @@ int vlock_main(int argc UNUSED_PARAM, char **argv)
 	ioctl(STDIN_FILENO, VT_SETMODE, &ovtm);
 #endif
 	tcsetattr_stdin_TCSANOW(&oterm);
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
diff --git a/networking/route.c b/networking/route.c
index ff5daa8a7..26146f8e9 100644
--- a/networking/route.c
+++ b/networking/route.c
@@ -702,7 +702,7 @@ int route_main(int argc UNUSED_PARAM, char **argv)
 #endif
 			bb_displayroutes(noresolve, opt & ROUTE_OPT_e);
 
-		fflush_stdout_and_exit(EXIT_SUCCESS);
+		fflush_stdout_and_exit_SUCCESS();
 	}
 
 	/* Check verb.  At the moment, must be add, del, or delete. */
diff --git a/sysklogd/logread.c b/sysklogd/logread.c
index d5f8ca0a2..e6cfcf4a7 100644
--- a/sysklogd/logread.c
+++ b/sysklogd/logread.c
@@ -226,5 +226,5 @@ int logread_main(int argc UNUSED_PARAM, char **argv)
 
 	/* shmdt(shbuf); - on Linux, shmdt is not mandatory on exit */
 
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
diff --git a/util-linux/cal.c b/util-linux/cal.c
index 6ba6ebf98..522ab3476 100644
--- a/util-linux/cal.c
+++ b/util-linux/cal.c
@@ -233,7 +233,7 @@ int cal_main(int argc UNUSED_PARAM, char **argv)
 		}
 	}
 
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
 
 /*
diff --git a/util-linux/chrt.c b/util-linux/chrt.c
index 6799abb2d..be96fa426 100644
--- a/util-linux/chrt.c
+++ b/util-linux/chrt.c
@@ -110,7 +110,7 @@ int chrt_main(int argc UNUSED_PARAM, char **argv)
 		show_min_max(SCHED_RR);
 		show_min_max(SCHED_BATCH);
 		show_min_max(SCHED_IDLE);
-		fflush_stdout_and_exit(EXIT_SUCCESS);
+		fflush_stdout_and_exit_SUCCESS();
 	}
 	//if (opt & OPT_r)
 	//	policy = SCHED_RR; - default, already set
diff --git a/util-linux/hexdump_xxd.c b/util-linux/hexdump_xxd.c
index 76dada983..4372ac770 100644
--- a/util-linux/hexdump_xxd.c
+++ b/util-linux/hexdump_xxd.c
@@ -150,7 +150,7 @@ static void reverse(unsigned opt, const char *filename)
 		free(buf);
 	}
 	//fclose(fp);
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
 
 static void print_C_style(const char *p, const char *hdr)
diff --git a/util-linux/ipcs.c b/util-linux/ipcs.c
index ef2529c05..5973cbf57 100644
--- a/util-linux/ipcs.c
+++ b/util-linux/ipcs.c
@@ -600,15 +600,15 @@ int ipcs_main(int argc UNUSED_PARAM, char **argv)
 		id = xatoi(opt_i);
 		if (opt & flag_shm) {
 			print_shm(id);
-			fflush_stdout_and_exit(EXIT_SUCCESS);
+			fflush_stdout_and_exit_SUCCESS();
 		}
 		if (opt & flag_sem) {
 			print_sem(id);
-			fflush_stdout_and_exit(EXIT_SUCCESS);
+			fflush_stdout_and_exit_SUCCESS();
 		}
 		if (opt & flag_msg) {
 			print_msg(id);
-			fflush_stdout_and_exit(EXIT_SUCCESS);
+			fflush_stdout_and_exit_SUCCESS();
 		}
 		bb_show_usage();
 	}
@@ -633,5 +633,5 @@ int ipcs_main(int argc UNUSED_PARAM, char **argv)
 		do_sem(format);
 		bb_putchar('\n');
 	}
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
diff --git a/util-linux/last.c b/util-linux/last.c
index 24ce7a8d8..63751ca45 100644
--- a/util-linux/last.c
+++ b/util-linux/last.c
@@ -162,5 +162,5 @@ int last_main(int argc UNUSED_PARAM, char **argv UNUSED_PARAM)
 		xlseek(file, pos, SEEK_SET);
 	}
 
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
diff --git a/util-linux/last_fancy.c b/util-linux/last_fancy.c
index e56e0ba85..648236229 100644
--- a/util-linux/last_fancy.c
+++ b/util-linux/last_fancy.c
@@ -296,5 +296,5 @@ int last_main(int argc UNUSED_PARAM, char **argv)
 
 	if (ENABLE_FEATURE_CLEAN_UP)
 		close(file);
-	fflush_stdout_and_exit(EXIT_SUCCESS);
+	fflush_stdout_and_exit_SUCCESS();
 }
-- 
cgit v1.2.3-55-g6feb


From dfd8aafcf59c88662516a534a4334b3f08f58c88 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Tue, 4 Jan 2022 23:36:16 +0100
Subject: libbb: fflush_stdout_and_exit(0) still exits with _error_ (not 0!) if
 fflush fails

function                                             old     new   delta
fflush_stdout_and_exit                                36      40      +4

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 libbb/fflush_stdout_and_exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libbb/fflush_stdout_and_exit.c b/libbb/fflush_stdout_and_exit.c
index 5a13ebcf8..33e28ae34 100644
--- a/libbb/fflush_stdout_and_exit.c
+++ b/libbb/fflush_stdout_and_exit.c
@@ -13,9 +13,9 @@
  */
 void FAST_FUNC fflush_stdout_and_exit(int retval)
 {
-	xfunc_error_retval = retval;
 	if (fflush(stdout))
 		bb_simple_perror_msg_and_die(bb_msg_standard_output);
+	xfunc_error_retval = retval;
 	/* In case we are in NOFORK applet. Do not exit() directly,
 	 * but use xfunc_die() */
 	xfunc_die();
-- 
cgit v1.2.3-55-g6feb


From cc7d2e21780c28608b00a4faf0fed297527bcbf4 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Tue, 4 Jan 2022 23:53:21 +0100
Subject: sort: fix -s -r interaction: 'stable' order is not affected by -r

function                                             old     new   delta
compare_keys                                         818     820      +2

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 coreutils/sort.c     |  4 +++-
 testsuite/sort.tests | 13 +++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/coreutils/sort.c b/coreutils/sort.c
index 0cbb6f597..9ff777851 100644
--- a/coreutils/sort.c
+++ b/coreutils/sort.c
@@ -380,7 +380,9 @@ static int compare_keys(const void *xarg, const void *yarg)
 
 			/* If x > y, 1, else -1 */
 			retval = (x32 > y32) * 2 - 1;
-		} else
+			/* Here, -r has no effect! */
+			return retval;
+		}
 		if (!(option_mask32 & FLAG_no_tie_break)) {
 			/* fallback sort */
 			flags = option_mask32;
diff --git a/testsuite/sort.tests b/testsuite/sort.tests
index c51a8e475..5375f93de 100755
--- a/testsuite/sort.tests
+++ b/testsuite/sort.tests
@@ -175,6 +175,19 @@ testing "sort file in place" \
 111
 " ""
 
+testing "sort -sr (stable and reverse) does NOT reverse 'stable' ordering" \
+"sort -k2 -r -s input" "\
+b 2
+d 2
+a 1
+c 1
+" "\
+a 1
+b 2
+c 1
+d 2
+" ""
+
 # testing "description" "command(s)" "result" "infile" "stdin"
 
 exit $FAILCOUNT
-- 
cgit v1.2.3-55-g6feb


From 34e0bb3931b595e7a48061255692ec4ff29499c5 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Wed, 5 Jan 2022 12:05:55 +0100
Subject: sort: support -h

function                                             old     new   delta
static.scale_suffix                                    -      62     +62
.rodata                                           104304  104336     +32
compare_keys                                         820     848     +28
packed_usage                                       34159   34184     +25
static.suffix                                          -       9      +9
sort_opt_str                                          37      38      +1
------------------------------------------------------------------------------
(add/remove: 2/0 grow/shrink: 4/0 up/down: 157/0)             Total: 157 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 coreutils/sort.c     | 95 +++++++++++++++++++++++++++++++++++-----------------
 testsuite/sort.tests | 29 ++++++++++++++++
 2 files changed, 93 insertions(+), 31 deletions(-)

diff --git a/coreutils/sort.c b/coreutils/sort.c
index 9ff777851..9aac656fe 100644
--- a/coreutils/sort.c
+++ b/coreutils/sort.c
@@ -18,7 +18,7 @@
 //config:	sort is used to sort lines of text in specified files.
 //config:
 //config:config FEATURE_SORT_BIG
-//config:	bool "Full SuSv3 compliant sort (support -ktcbdfiogM)"
+//config:	bool "Full SuSv3 compliant sort (support -ktcbdfioghM)"
 //config:	default y
 //config:	depends on SORT
 //config:	help
@@ -43,7 +43,7 @@
 
 //usage:#define sort_trivial_usage
 //usage:       "[-nru"
-//usage:	IF_FEATURE_SORT_BIG("gMcszbdfiokt] [-o FILE] [-k START[.OFS][OPTS][,END[.OFS][OPTS]] [-t CHAR")
+//usage:	IF_FEATURE_SORT_BIG("ghMcszbdfiokt] [-o FILE] [-k START[.OFS][OPTS][,END[.OFS][OPTS]] [-t CHAR")
 //usage:       "] [FILE]..."
 //usage:#define sort_full_usage "\n\n"
 //usage:       "Sort lines of text\n"
@@ -59,6 +59,7 @@
 //usage:     "\n	-n	Sort numbers"
 //usage:	IF_FEATURE_SORT_BIG(
 //usage:     "\n	-g	General numerical sort"
+//usage:     "\n	-h	Sort human readable numbers (2K 1G)"
 //usage:     "\n	-M	Sort month"
 //usage:     "\n	-V	Sort version"
 //usage:     "\n	-t CHAR	Field separator"
@@ -94,31 +95,32 @@
 enum {
 	FLAG_n  = 1 << 0,       /* Numeric sort */
 	FLAG_g  = 1 << 1,       /* Sort using strtod() */
-	FLAG_M  = 1 << 2,       /* Sort date */
-	FLAG_V  = 1 << 3,       /* Sort version */
+	FLAG_h  = 1 << 2,       /* Sort using strtod(), plus KMGT suffixes */
+	FLAG_M  = 1 << 3,       /* Sort date */
+	FLAG_V  = 1 << 4,       /* Sort version */
 /* ucsz apply to root level only, not keys.  b at root level implies bb */
-	FLAG_u  = 1 << 4,       /* Unique */
-	FLAG_c  = 1 << 5,       /* Check: no output, exit(!ordered) */
-	FLAG_s  = 1 << 6,       /* Stable sort, no ascii fallback at end */
-	FLAG_z  = 1 << 7,       /* Input and output is NUL terminated, not \n */
+	FLAG_u  = 1 << 5,       /* Unique */
+	FLAG_c  = 1 << 6,       /* Check: no output, exit(!ordered) */
+	FLAG_s  = 1 << 7,       /* Stable sort, no ascii fallback at end */
+	FLAG_z  = 1 << 8,       /* Input and output is NUL terminated, not \n */
 /* These can be applied to search keys, the previous four can't */
-	FLAG_b  = 1 << 8,       /* Ignore leading blanks */
-	FLAG_r  = 1 << 9,       /* Reverse */
-	FLAG_d  = 1 << 10,      /* Ignore !(isalnum()|isspace()) */
-	FLAG_f  = 1 << 11,      /* Force uppercase */
-	FLAG_i  = 1 << 12,      /* Ignore !isprint() */
-	FLAG_m  = 1 << 13,      /* ignored: merge already sorted files; do not sort */
-	FLAG_S  = 1 << 14,      /* ignored: -S, --buffer-size=SIZE */
-	FLAG_T  = 1 << 15,      /* ignored: -T, --temporary-directory=DIR */
-	FLAG_o  = 1 << 16,
-	FLAG_k  = 1 << 17,
-	FLAG_t  = 1 << 18,
+	FLAG_b  = 1 << 9,       /* Ignore leading blanks */
+	FLAG_r  = 1 << 10,      /* Reverse */
+	FLAG_d  = 1 << 11,      /* Ignore !(isalnum()|isspace()) */
+	FLAG_f  = 1 << 12,      /* Force uppercase */
+	FLAG_i  = 1 << 13,      /* Ignore !isprint() */
+	FLAG_m  = 1 << 14,      /* ignored: merge already sorted files; do not sort */
+	FLAG_S  = 1 << 15,      /* ignored: -S, --buffer-size=SIZE */
+	FLAG_T  = 1 << 16,      /* ignored: -T, --temporary-directory=DIR */
+	FLAG_o  = 1 << 17,
+	FLAG_k  = 1 << 18,
+	FLAG_t  = 1 << 19,
 	FLAG_bb = 0x80000000,   /* Ignore trailing blanks  */
 	FLAG_no_tie_break = 0x40000000,
 };
 
 static const char sort_opt_str[] ALIGN1 = "^"
-			"ngMVucszbrdfimS:T:o:k:*t:"
+			"nghMVucszbrdfimS:T:o:k:*t:"
 			"\0" "o--o:t--t"/*-t, -o: at most one of each*/;
 /*
  * OPT_STR must not be string literal, needs to have stable address:
@@ -253,6 +255,25 @@ static struct sort_key *add_key(void)
 #define GET_LINE(fp) xmalloc_fgetline(fp)
 #endif
 
+#if ENABLE_FEATURE_SORT_BIG
+static int scale_suffix(const char *tail)
+{
+	static const char suffix[] ALIGN1 = "kmgtpezy";
+	const char *s;
+	int n;
+
+	if (!tail[0])
+		return -1;
+	s = strchr(suffix, tail[0] | 0x20);
+	if (!s)
+		return -1;
+	n = s - suffix;
+	if (n != 0 && tail[0] >= 'a')
+		return -1; /* mg... not accepted, only MG... */
+	return n;
+}
+#endif
+
 /* Iterate through keys list and perform comparisons */
 static int compare_keys(const void *xarg, const void *yarg)
 {
@@ -275,7 +296,7 @@ static int compare_keys(const void *xarg, const void *yarg)
 		y = *(char **)yarg;
 #endif
 		/* Perform actual comparison */
-		switch (flags & (FLAG_n | FLAG_g | FLAG_M | FLAG_V)) {
+		switch (flags & (FLAG_n | FLAG_g | FLAG_h | FLAG_M | FLAG_V)) {
 		default:
 			bb_simple_error_msg_and_die("unknown sort type");
 			break;
@@ -293,7 +314,8 @@ static int compare_keys(const void *xarg, const void *yarg)
 #endif
 			break;
 #if ENABLE_FEATURE_SORT_BIG
-		case FLAG_g: {
+		case FLAG_g:
+		case FLAG_h: {
 			char *xx, *yy;
 //TODO: needs setlocale(LC_NUMERIC, "C")?
 			double dx = strtod(x, &xx);
@@ -308,16 +330,26 @@ static int compare_keys(const void *xarg, const void *yarg)
 				retval = (dy != dy) ? 0 : -1;
 			else if (dy != dy)
 				retval = 1;
-			/* Check for infinity.  Could underflow, but it avoids libm. */
-			else if (1.0 / dx == 0.0) {
-				if (dx < 0)
-					retval = (1.0 / dy == 0.0 && dy < 0) ? 0 : -1;
+			else {
+				if (flags & FLAG_h) {
+					int xs = scale_suffix(xx);
+					int ys = scale_suffix(yy);
+					if (xs != ys) {
+						retval = xs - ys;
+						break;
+					}
+				}
+				/* Check for infinity.  Could underflow, but it avoids libm. */
+				if (1.0 / dx == 0.0) {
+					if (dx < 0)
+						retval = (1.0 / dy == 0.0 && dy < 0) ? 0 : -1;
+					else
+						retval = (1.0 / dy == 0.0 && dy > 0) ? 0 : 1;
+				} else if (1.0 / dy == 0.0)
+					retval = (dy < 0) ? 1 : -1;
 				else
-					retval = (1.0 / dy == 0.0 && dy > 0) ? 0 : 1;
-			} else if (1.0 / dy == 0.0)
-				retval = (dy < 0) ? 1 : -1;
-			else
-				retval = (dx > dy) ? 1 : ((dx < dy) ? -1 : 0);
+					retval = (dx > dy) ? 1 : ((dx < dy) ? -1 : 0);
+			}
 			break;
 		}
 		case FLAG_M: {
@@ -476,6 +508,7 @@ int sort_main(int argc UNUSED_PARAM, char **argv)
 			FLAG_allowed_for_k =
 				FLAG_n | /* Numeric sort */
 				FLAG_g | /* Sort using strtod() */
+				FLAG_h | /* Sort using strtod(), plus KMGT suffixes */
 				FLAG_M | /* Sort date */
 				FLAG_b | /* Ignore leading blanks */
 				FLAG_r | /* Reverse */
diff --git a/testsuite/sort.tests b/testsuite/sort.tests
index 5375f93de..ff33e21b4 100755
--- a/testsuite/sort.tests
+++ b/testsuite/sort.tests
@@ -188,6 +188,35 @@ c 1
 d 2
 " ""
 
+testing "sort -h" \
+"sort -h input" "\
+3e
+4m
+5y
+1023
+1024
+1025
+3000
+2K
+3k
+1M
+2E
+1Y
+" "\
+1Y
+5y
+1M
+2E
+3k
+3e
+2K
+4m
+1023
+1025
+3000
+1024
+" ""
+
 # testing "description" "command(s)" "result" "infile" "stdin"
 
 exit $FAILCOUNT
-- 
cgit v1.2.3-55-g6feb


From 076f5e064fa7b6cc2c03b030abcf2cbd60514180 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Wed, 5 Jan 2022 22:04:21 +0100
Subject: less: code shrink

function                                             old     new   delta
restore_tty                                            -      29     +29
less_main                                           2107    2105      -2
getch_nowait                                         253     251      -2
buffer_print                                         614     612      -2
less_exit                                             51      12     -39
------------------------------------------------------------------------------
(add/remove: 1/0 grow/shrink: 0/4 up/down: 29/-45)            Total: -16 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 miscutils/less.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/miscutils/less.c b/miscutils/less.c
index 26983f40d..6825e5577 100644
--- a/miscutils/less.c
+++ b/miscutils/less.c
@@ -325,15 +325,18 @@ static void print_statusline(const char *str)
 }
 
 /* Exit the program gracefully */
-static void less_exit(int code)
+static void restore_tty(void)
 {
 	set_tty_cooked();
 	if (!(G.kbd_fd_orig_flags & O_NONBLOCK))
 		ndelay_off(kbd_fd);
 	clear_line();
-	if (code < 0)
-		kill_myself_with_sig(- code); /* does not return */
-	exit(code);
+}
+
+static void less_exit(void)
+{
+	restore_tty();
+	exit(EXIT_SUCCESS);
 }
 
 #if (ENABLE_FEATURE_LESS_DASHCMD && ENABLE_FEATURE_LESS_LINENUMS) \
@@ -913,7 +916,7 @@ static void buffer_print(void)
 	) {
 		i = option_mask32 & FLAG_F ? 0 : cur_fline;
 		if (max_fline - i <= max_displayed_line)
-			less_exit(EXIT_SUCCESS);
+			less_exit();
 	}
 	status_print();
 }
@@ -1146,7 +1149,7 @@ static int64_t getch_nowait(void)
 			goto again;
 		}
 		/* EOF/error (ssh session got killed etc) */
-		less_exit(EXIT_SUCCESS);
+		less_exit();
 	}
 	set_tty_cooked();
 	return key64;
@@ -1297,7 +1300,7 @@ static void colon_process(void)
 		change_file(-1);
 		break;
 	case 'q':
-		less_exit(EXIT_SUCCESS);
+		less_exit();
 		break;
 	case 'x':
 		change_file(0);
@@ -1715,7 +1718,7 @@ static void keypress_process(int keypress)
 		buffer_line(cur_fline);
 		break;
 	case 'q': case 'Q':
-		less_exit(EXIT_SUCCESS);
+		less_exit();
 		break;
 #if ENABLE_FEATURE_LESS_MARKS
 	case 'm':
@@ -1793,7 +1796,8 @@ static void keypress_process(int keypress)
 
 static void sig_catcher(int sig)
 {
-	less_exit(- sig);
+	restore_tty();
+	kill_myself_with_sig(sig); /* does not return */
 }
 
 #if ENABLE_FEATURE_LESS_WINCH
-- 
cgit v1.2.3-55-g6feb


From db5546ca101846f18294a43b39883bc4ff53613a Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Wed, 5 Jan 2022 22:16:06 +0100
Subject: libbb: code shrink: introduce and use [_]exit_SUCCESS()

function                                             old     new   delta
exit_SUCCESS                                           -       7      +7
_exit_SUCCESS                                          -       7      +7
run_pipe                                            1562    1567      +5
pseudo_exec_argv                                     399     400      +1
finish                                                86      87      +1
start_stop_daemon_main                              1109    1107      -2
shutdown_on_signal                                    38      36      -2
runsv_main                                          1662    1660      -2
redirect                                            1070    1068      -2
read_line                                             79      77      -2
pause_and_low_level_reboot                            54      52      -2
list_i2c_busses_and_exit                             483     481      -2
less_exit                                             12      10      -2
identify                                            4123    4121      -2
grep_file                                           1161    1159      -2
getty_main                                          1519    1517      -2
fsck_minix_main                                     2681    2679      -2
free_session                                         132     130      -2
fdisk_main                                          4739    4737      -2
clean_up_and_exit                                     53      51      -2
bsd_select                                          1566    1564      -2
bb_daemonize_or_rexec                                198     196      -2
------------------------------------------------------------------------------
(add/remove: 2/0 grow/shrink: 3/17 up/down: 21/-34)           Total: -13 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 debianutils/start_stop_daemon.c |  4 ++--
 findutils/grep.c                |  2 +-
 include/libbb.h                 |  2 ++
 init/init.c                     |  6 +++---
 libbb/vfork_daemon_rexec.c      |  4 ++--
 libbb/xfuncs.c                  | 10 ++++++++++
 loginutils/getty.c              |  6 +++---
 loginutils/login.c              |  2 +-
 miscutils/devfsd.c              |  4 ++--
 miscutils/hdparm.c              |  2 +-
 miscutils/i2c_tools.c           |  2 +-
 miscutils/less.c                |  4 ++--
 miscutils/watchdog.c            |  2 +-
 modutils/modprobe-small.c       |  2 +-
 networking/arping.c             |  2 +-
 networking/inetd.c              |  2 +-
 networking/nc.c                 |  2 +-
 networking/telnetd.c            |  2 +-
 runit/runsv.c                   |  2 +-
 shell/ash.c                     |  2 +-
 shell/hush.c                    |  4 ++--
 util-linux/fdisk.c              |  4 ++--
 util-linux/fdisk_osf.c          |  4 ++--
 util-linux/fsck_minix.c         |  2 +-
 24 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/debianutils/start_stop_daemon.c b/debianutils/start_stop_daemon.c
index 68df44ae9..3e5dd9faa 100644
--- a/debianutils/start_stop_daemon.c
+++ b/debianutils/start_stop_daemon.c
@@ -519,7 +519,7 @@ int start_stop_daemon_main(int argc UNUSED_PARAM, char **argv)
 			/* why _exit? the child may have changed the stack,
 			 * so "return 0" may do bad things
 			 */
-			_exit(EXIT_SUCCESS);
+			_exit_SUCCESS();
 		}
 		/* Child */
 		setsid(); /* detach from controlling tty */
@@ -531,7 +531,7 @@ int start_stop_daemon_main(int argc UNUSED_PARAM, char **argv)
 		 */
 		pid = xvfork();
 		if (pid != 0)
-			_exit(EXIT_SUCCESS); /* Parent */
+			_exit_SUCCESS(); /* Parent */
 	}
 	if (opt & OPT_MAKEPID) {
 		/* User wants _us_ to make the pidfile */
diff --git a/findutils/grep.c b/findutils/grep.c
index 8600d72fa..0b72812f1 100644
--- a/findutils/grep.c
+++ b/findutils/grep.c
@@ -470,7 +470,7 @@ static int grep_file(FILE *file)
 					 * "exit immediately with zero status
 					 * if any match is found,
 					 * even if errors were detected" */
-					exit(EXIT_SUCCESS);
+					exit_SUCCESS();
 				}
 				/* -l "print filenames with matches": stop after the first match */
 				if (option_mask32 & OPT_l) {
diff --git a/include/libbb.h b/include/libbb.h
index 8308d6259..c93058f6d 100644
--- a/include/libbb.h
+++ b/include/libbb.h
@@ -1278,6 +1278,8 @@ void set_task_comm(const char *comm) FAST_FUNC;
 # define re_execed_comm() 0
 # define set_task_comm(name) ((void)0)
 #endif
+void exit_SUCCESS(void) NORETURN FAST_FUNC;
+void _exit_SUCCESS(void) NORETURN FAST_FUNC;
 
 /* Helpers for daemonization.
  *
diff --git a/init/init.c b/init/init.c
index efab5dcb4..785a3b460 100644
--- a/init/init.c
+++ b/init/init.c
@@ -744,7 +744,7 @@ static void pause_and_low_level_reboot(unsigned magic)
 	pid = vfork();
 	if (pid == 0) { /* child */
 		reboot(magic);
-		_exit(EXIT_SUCCESS);
+		_exit_SUCCESS();
 	}
 	/* Used to have "while (1) sleep(1)" here.
 	 * However, in containers reboot() call is ignored, and with that loop
@@ -752,7 +752,7 @@ static void pause_and_low_level_reboot(unsigned magic)
 	 */
 	waitpid(pid, NULL, 0);
 	sleep1(); /* paranoia */
-	_exit(EXIT_SUCCESS);
+	_exit_SUCCESS();
 }
 
 static void run_shutdown_and_kill_processes(void)
@@ -942,7 +942,7 @@ static void reload_inittab(void)
 			for (a = G.init_action_list; a; a = a->next)
 				if (a->action_type == 0 && a->pid != 0)
 					kill(a->pid, SIGKILL);
-			_exit(EXIT_SUCCESS);
+			_exit_SUCCESS();
 		}
 	}
 #endif
diff --git a/libbb/vfork_daemon_rexec.c b/libbb/vfork_daemon_rexec.c
index 31e97051f..79141936a 100644
--- a/libbb/vfork_daemon_rexec.c
+++ b/libbb/vfork_daemon_rexec.c
@@ -308,7 +308,7 @@ void FAST_FUNC bb_daemonize_or_rexec(int flags, char **argv)
 		/* fflush_all(); - add it in fork_or_rexec() if necessary */
 
 		if (fork_or_rexec(argv))
-			_exit(EXIT_SUCCESS); /* parent */
+			_exit_SUCCESS(); /* parent */
 		/* if daemonizing, detach from stdio & ctty */
 		setsid();
 		dup2(fd, 0);
@@ -320,7 +320,7 @@ void FAST_FUNC bb_daemonize_or_rexec(int flags, char **argv)
 //			 * Prevent this: stop being a session leader.
 //			 */
 //			if (fork_or_rexec(argv))
-//				_exit(EXIT_SUCCESS); /* parent */
+//				_exit_SUCCESS(); /* parent */
 //		}
 	}
 	while (fd > 2) {
diff --git a/libbb/xfuncs.c b/libbb/xfuncs.c
index c40dcb706..465e5366c 100644
--- a/libbb/xfuncs.c
+++ b/libbb/xfuncs.c
@@ -423,3 +423,13 @@ int FAST_FUNC wait4pid(pid_t pid)
 		return WTERMSIG(status) + 0x180;
 	return 0;
 }
+
+void FAST_FUNC exit_SUCCESS(void)
+{
+	exit(EXIT_SUCCESS);
+}
+
+void FAST_FUNC _exit_SUCCESS(void)
+{
+	_exit(EXIT_SUCCESS);
+}
diff --git a/loginutils/getty.c b/loginutils/getty.c
index 6c6d409f4..cd6378d80 100644
--- a/loginutils/getty.c
+++ b/loginutils/getty.c
@@ -484,7 +484,7 @@ static char *get_logname(void)
 			if (read(STDIN_FILENO, &c, 1) < 1) {
 				finalize_tty_attrs();
 				if (errno == EINTR || errno == EIO)
-					exit(EXIT_SUCCESS);
+					exit_SUCCESS();
 				bb_simple_perror_msg_and_die(bb_msg_read_error);
 			}
 
@@ -511,7 +511,7 @@ static char *get_logname(void)
 			case CTL('C'):
 			case CTL('D'):
 				finalize_tty_attrs();
-				exit(EXIT_SUCCESS);
+				exit_SUCCESS();
 			case '\0':
 				/* BREAK. If we have speeds to try,
 				 * return NULL (will switch speeds and return here) */
@@ -538,7 +538,7 @@ static char *get_logname(void)
 static void alarm_handler(int sig UNUSED_PARAM)
 {
 	finalize_tty_attrs();
-	_exit(EXIT_SUCCESS);
+	_exit_SUCCESS();
 }
 
 static void sleep10(void)
diff --git a/loginutils/login.c b/loginutils/login.c
index ce87e318a..569053c12 100644
--- a/loginutils/login.c
+++ b/loginutils/login.c
@@ -312,7 +312,7 @@ static void alarm_handler(int sig UNUSED_PARAM)
 	/* unix API is brain damaged regarding O_NONBLOCK,
 	 * we should undo it, or else we can affect other processes */
 	ndelay_off(STDOUT_FILENO);
-	_exit(EXIT_SUCCESS);
+	_exit_SUCCESS();
 }
 
 int login_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
diff --git a/miscutils/devfsd.c b/miscutils/devfsd.c
index e5bb8a2d8..839d00fd0 100644
--- a/miscutils/devfsd.c
+++ b/miscutils/devfsd.c
@@ -453,7 +453,7 @@ int devfsd_main(int argc, char **argv)
 				DEVFSD_PROTOCOL_REVISION_DAEMON, bb_msg_proto_rev, proto_rev);
 		if (DEVFSD_PROTOCOL_REVISION_DAEMON != proto_rev)
 			bb_error_msg_and_die("%s mismatch!", bb_msg_proto_rev);
-		exit(EXIT_SUCCESS); /* -v */
+		exit_SUCCESS(); /* -v */
 	}
 	/*  Tell kernel we are special(i.e. we get to see hidden entries)  */
 	xioctl(fd, DEVFSDIOC_SET_EVENT_MASK, 0);
@@ -474,7 +474,7 @@ int devfsd_main(int argc, char **argv)
 	dir_operation(SERVICE, mount_point, 0, NULL);
 
 	if (ENABLE_DEVFSD_FG_NP && no_polling)
-		exit(EXIT_SUCCESS);
+		exit_SUCCESS();
 
 	if (ENABLE_DEVFSD_VERBOSE || ENABLE_DEBUG)
 		logmode = LOGMODE_BOTH;
diff --git a/miscutils/hdparm.c b/miscutils/hdparm.c
index 01b4e8e2e..d8d8f6166 100644
--- a/miscutils/hdparm.c
+++ b/miscutils/hdparm.c
@@ -1271,7 +1271,7 @@ static void identify(uint16_t *val)
 		}
 	}
 
-	exit(EXIT_SUCCESS);
+	exit_SUCCESS();
 }
 #endif
 
diff --git a/miscutils/i2c_tools.c b/miscutils/i2c_tools.c
index b25d49792..e3741eeba 100644
--- a/miscutils/i2c_tools.c
+++ b/miscutils/i2c_tools.c
@@ -1212,7 +1212,7 @@ static void NORETURN list_i2c_busses_and_exit(void)
 		}
 	}
 
-	exit(EXIT_SUCCESS);
+	exit_SUCCESS();
 }
 
 static void NORETURN no_support(const char *cmd)
diff --git a/miscutils/less.c b/miscutils/less.c
index 6825e5577..82c4b21f0 100644
--- a/miscutils/less.c
+++ b/miscutils/less.c
@@ -333,10 +333,10 @@ static void restore_tty(void)
 	clear_line();
 }
 
-static void less_exit(void)
+static NOINLINE void less_exit(void)
 {
 	restore_tty();
-	exit(EXIT_SUCCESS);
+	exit_SUCCESS();
 }
 
 #if (ENABLE_FEATURE_LESS_DASHCMD && ENABLE_FEATURE_LESS_LINENUMS) \
diff --git a/miscutils/watchdog.c b/miscutils/watchdog.c
index d8e9c78f5..9f5a4b849 100644
--- a/miscutils/watchdog.c
+++ b/miscutils/watchdog.c
@@ -76,7 +76,7 @@ static void shutdown_on_signal(int sig UNUSED_PARAM)
 {
 	remove_pidfile_std_path_and_ext("watchdog");
 	shutdown_watchdog();
-	_exit(EXIT_SUCCESS);
+	_exit_SUCCESS();
 }
 
 static void watchdog_open(const char* device)
diff --git a/modutils/modprobe-small.c b/modutils/modprobe-small.c
index db44a2ed0..b61651621 100644
--- a/modutils/modprobe-small.c
+++ b/modutils/modprobe-small.c
@@ -415,7 +415,7 @@ static FAST_FUNC int fileAction(struct recursive_state *state,
 			/* Load was successful, there is nothing else to do.
 			 * This can happen ONLY for "top-level" module load,
 			 * not a dep, because deps don't do dirscan. */
-			exit(EXIT_SUCCESS);
+			exit_SUCCESS();
 		}
 	}
 
diff --git a/networking/arping.c b/networking/arping.c
index d44d7d697..86f0221ed 100644
--- a/networking/arping.c
+++ b/networking/arping.c
@@ -159,7 +159,7 @@ static void finish(void)
 	if (option_mask32 & DAD)
 		exit(!!received);
 	if (option_mask32 & UNSOLICITED)
-		exit(EXIT_SUCCESS);
+		exit_SUCCESS();
 	exit(!received);
 }
 
diff --git a/networking/inetd.c b/networking/inetd.c
index e5352a555..e71be51c3 100644
--- a/networking/inetd.c
+++ b/networking/inetd.c
@@ -1208,7 +1208,7 @@ static void clean_up_and_exit(int sig UNUSED_PARAM)
 			close(sep->se_fd);
 	}
 	remove_pidfile_std_path_and_ext("inetd");
-	exit(EXIT_SUCCESS);
+	exit_SUCCESS();
 }
 
 int inetd_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
diff --git a/networking/nc.c b/networking/nc.c
index d351bf72a..ab1316339 100644
--- a/networking/nc.c
+++ b/networking/nc.c
@@ -268,7 +268,7 @@ int nc_main(int argc, char **argv)
 				nread = safe_read(pfds[fdidx].fd, iobuf, COMMON_BUFSIZE);
 				if (fdidx != 0) {
 					if (nread < 1)
-						exit(EXIT_SUCCESS);
+						exit_SUCCESS();
 					ofd = STDOUT_FILENO;
 				} else {
 					if (nread < 1) {
diff --git a/networking/telnetd.c b/networking/telnetd.c
index 581da1924..0805e464f 100644
--- a/networking/telnetd.c
+++ b/networking/telnetd.c
@@ -582,7 +582,7 @@ free_session(struct tsession *ts)
 	struct tsession *t;
 
 	if (option_mask32 & OPT_INETD)
-		exit(EXIT_SUCCESS);
+		exit_SUCCESS();
 
 	/* Unlink this telnet session from the session list */
 	t = G.sessions;
diff --git a/runit/runsv.c b/runit/runsv.c
index a4b8af494..6ad6bf46e 100644
--- a/runit/runsv.c
+++ b/runit/runsv.c
@@ -700,7 +700,7 @@ int runsv_main(int argc UNUSED_PARAM, char **argv)
 
 		if (svd[0].sd_want == W_EXIT && svd[0].state == S_DOWN) {
 			if (svd[1].pid == 0)
-				_exit(EXIT_SUCCESS);
+				_exit_SUCCESS();
 			if (svd[1].sd_want != W_EXIT) {
 				svd[1].sd_want = W_EXIT;
 				/* stopservice(&svd[1]); */
diff --git a/shell/ash.c b/shell/ash.c
index 827643808..4a8ec0c03 100644
--- a/shell/ash.c
+++ b/shell/ash.c
@@ -5505,7 +5505,7 @@ openhere(union node *redir)
 		ignoresig(SIGTSTP); //signal(SIGTSTP, SIG_IGN);
 		signal(SIGPIPE, SIG_DFL);
 		xwrite(pip[1], p, len);
-		_exit(EXIT_SUCCESS);
+		_exit_SUCCESS();
 	}
  out:
 	close(pip[1]);
diff --git a/shell/hush.c b/shell/hush.c
index 6a27b1634..982fc356a 100644
--- a/shell/hush.c
+++ b/shell/hush.c
@@ -8587,7 +8587,7 @@ static NOINLINE void pseudo_exec_argv(nommu_save_t *nommu_save,
 		 * expand_assignments(): think about ... | var=`sleep 1` | ...
 		 */
 		free_strings(new_env);
-		_exit(EXIT_SUCCESS);
+		_exit_SUCCESS();
 	}
 
 	sv_shadowed = G.shadowed_vars_pp;
@@ -8768,7 +8768,7 @@ static void pseudo_exec(nommu_save_t *nommu_save,
 
 	/* Case when we are here: ... | >file */
 	debug_printf_exec("pseudo_exec'ed null command\n");
-	_exit(EXIT_SUCCESS);
+	_exit_SUCCESS();
 }
 
 #if ENABLE_HUSH_JOB
diff --git a/util-linux/fdisk.c b/util-linux/fdisk.c
index 1c2a7d683..9c393b8fc 100644
--- a/util-linux/fdisk.c
+++ b/util-linux/fdisk.c
@@ -665,7 +665,7 @@ read_line(const char *prompt)
 
 	sz = read_line_input(NULL, prompt, line_buffer, sizeof(line_buffer));
 	if (sz <= 0)
-		exit(EXIT_SUCCESS); /* Ctrl-D or Ctrl-C */
+		exit_SUCCESS(); /* Ctrl-D or Ctrl-C */
 
 	if (line_buffer[sz-1] == '\n')
 		line_buffer[--sz] = '\0';
@@ -2855,7 +2855,7 @@ xselect(void)
 			if (ENABLE_FEATURE_CLEAN_UP)
 				close_dev_fd();
 			bb_putchar('\n');
-			exit(EXIT_SUCCESS);
+			exit_SUCCESS();
 		case 'r':
 			return;
 		case 's':
diff --git a/util-linux/fdisk_osf.c b/util-linux/fdisk_osf.c
index 765740ff1..6c66c130d 100644
--- a/util-linux/fdisk_osf.c
+++ b/util-linux/fdisk_osf.c
@@ -383,7 +383,7 @@ bsd_select(void)
 
 	if (xbsd_readlabel(NULL) == 0)
 		if (xbsd_create_disklabel() == 0)
-			exit(EXIT_SUCCESS);
+			exit_SUCCESS();
 
 #endif
 
@@ -411,7 +411,7 @@ bsd_select(void)
 		case 'q':
 			if (ENABLE_FEATURE_CLEAN_UP)
 				close_dev_fd();
-			exit(EXIT_SUCCESS);
+			exit_SUCCESS();
 		case 'r':
 			return;
 		case 's':
diff --git a/util-linux/fsck_minix.c b/util-linux/fsck_minix.c
index 40b86d01b..dd2265c32 100644
--- a/util-linux/fsck_minix.c
+++ b/util-linux/fsck_minix.c
@@ -423,7 +423,7 @@ static void check_mount(void)
 			cont = ask("Do you really want to continue", 0);
 		if (!cont) {
 			puts("Check aborted");
-			exit(EXIT_SUCCESS);
+			exit_SUCCESS();
 		}
 	}
 }
-- 
cgit v1.2.3-55-g6feb


From 6062c0d19bc201cbeb61b8875598cdd7a14a5ae0 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Wed, 5 Jan 2022 23:02:13 +0100
Subject: libbb: change xstrndup, xmemdup to take size_t as size parameter

Also, entirely remove the usually-disabled paranoia check (it was also
using the wrong config option to enable itself).

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
---
 include/libbb.h       | 4 ++--
 libbb/xfuncs_printf.c | 7 ++-----
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/include/libbb.h b/include/libbb.h
index c93058f6d..daa310776 100644
--- a/include/libbb.h
+++ b/include/libbb.h
@@ -429,8 +429,8 @@ void *xrealloc(void *old, size_t size) FAST_FUNC;
 	xrealloc_vector_helper((vector), (sizeof((vector)[0]) << 8) + (shift), (idx))
 void* xrealloc_vector_helper(void *vector, unsigned sizeof_and_shift, int idx) FAST_FUNC;
 char *xstrdup(const char *s) FAST_FUNC RETURNS_MALLOC;
-char *xstrndup(const char *s, int n) FAST_FUNC RETURNS_MALLOC;
-void *xmemdup(const void *s, int n) FAST_FUNC RETURNS_MALLOC;
+char *xstrndup(const char *s, size_t n) FAST_FUNC RETURNS_MALLOC;
+void *xmemdup(const void *s, size_t n) FAST_FUNC RETURNS_MALLOC;
 void *mmap_read(int fd, size_t size) FAST_FUNC;
 void *mmap_anon(size_t size) FAST_FUNC;
 void *xmmap_anon(size_t size) FAST_FUNC;
diff --git a/libbb/xfuncs_printf.c b/libbb/xfuncs_printf.c
index d29acebcd..fc630d176 100644
--- a/libbb/xfuncs_printf.c
+++ b/libbb/xfuncs_printf.c
@@ -91,13 +91,10 @@ char* FAST_FUNC xstrdup(const char *s)
 
 // Die if we can't allocate n+1 bytes (space for the null terminator) and copy
 // the (possibly truncated to length n) string into it.
-char* FAST_FUNC xstrndup(const char *s, int n)
+char* FAST_FUNC xstrndup(const char *s, size_t n)
 {
 	char *t;
 
-	if (ENABLE_DEBUG && s == NULL)
-		bb_simple_error_msg_and_die("xstrndup bug");
-
 	t = strndup(s, n);
 
 	if (t == NULL)
@@ -106,7 +103,7 @@ char* FAST_FUNC xstrndup(const char *s, int n)
 	return t;
 }
 
-void* FAST_FUNC xmemdup(const void *s, int n)
+void* FAST_FUNC xmemdup(const void *s, size_t n)
 {
 	return memcpy(xmalloc(n), s, n);
 }
-- 
cgit v1.2.3-55-g6feb