From ab755f492599cf595d532f0f240a14c6e5caa435 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 14 Jul 2023 16:37:24 +0200 Subject: hwclock: force LONG_OPTS, stop accepting non-compatible -t function old new delta hwclock_main 576 579 +3 .rodata 105404 105349 -55 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 1/1 up/down: 3/-55) Total: -52 bytes Signed-off-by: Denys Vlasenko --- util-linux/hwclock.c | 83 ++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/util-linux/hwclock.c b/util-linux/hwclock.c index d78bfe374..e6f0043d0 100644 --- a/util-linux/hwclock.c +++ b/util-linux/hwclock.c @@ -9,6 +9,7 @@ //config:config HWCLOCK //config: bool "hwclock (5.9 kb)" //config: default y +//config: select LONG_OPTS //config: help //config: The hwclock utility is used to read and set the hardware clock //config: on a system. This is primarily used to set the current time on @@ -409,89 +410,89 @@ static void set_rtc_param(const char **pp_rtcname, char *rtc_param) // -v, --verbose display more details //usage:#define hwclock_trivial_usage -//usage: IF_LONG_OPTS( -//usage: "[-swul] [--systz] [--param-get PARAM] [--param-set PARAM=VAL] [-f DEV]" -//usage: ) -//usage: IF_NOT_LONG_OPTS( -//usage: "[-swult] [-g PARAM] [-p PARAM=VAL] [-f DEV]" -//usage: ) +//usage: "[-ul] [-f DEV] [-s|-w|--systz|--param-get PARAM|--param-set PARAM=VAL]" //usage:#define hwclock_full_usage "\n\n" //usage: "Show or set hardware clock (RTC)\n" +//usage: "\n -f DEV Use this device (e.g. /dev/rtc2)" +//usage: "\n -u Assume RTC is kept in UTC" +//usage: "\n -l Assume RTC is kept in local time" +//usage: "\n (if neither is given, read from "ADJTIME_PATH")" ///////: "\n -r Show RTC time" ///////-r is default, don't bother showing it in help //usage: "\n -s Set system time from RTC" //usage: "\n -w Set RTC from system time" -//usage: IF_LONG_OPTS( //usage: "\n --systz Set in-kernel timezone, correct system time" //usage: "\n if RTC is kept in local time" //usage: "\n --param-get PARAM Get RTC parameter" //usage: "\n --param-set PARAM=VAL Set RTC parameter" -//usage: ) -//usage: "\n -f DEV Use specified device (e.g. /dev/rtc2)" -//usage: "\n -u Assume RTC is kept in UTC" -//usage: "\n -l Assume RTC is kept in local time" -//usage: "\n (if neither is given, read from "ADJTIME_PATH")" - -//TODO: get rid of incompatible -t alias to --systz? 
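A note on the exclusivity enforcement added below (illustration only, not part of the patch): the hunk replaces the long pairwise-exclusion string in the getopt32 spec ("r--wstgp:w--rstgp:...") with one arithmetic test. The shared -l/-u/-f bits are shifted away and the remaining mode bits are checked for "more than one bit set". A minimal standalone sketch of that bit trick, with a hypothetical helper name:

	/* Sketch only: x & (x - 1) clears the lowest set bit, so the
	 * result is zero exactly when x is 0 or a power of two, i.e.
	 * when at most one option bit is set. */
	static int more_than_one_bit_set(unsigned x)	/* hypothetical */
	{
		return (x & (x - 1)) != 0;
	}

hwclock applies this to opt >> 3, so -l, -u and -f (the three lowest bits) remain combinable with any single mode option.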
- -#define HWCLOCK_OPT_LOCALTIME 0x01 -#define HWCLOCK_OPT_UTC 0x02 -#define HWCLOCK_OPT_SHOW 0x04 -#define HWCLOCK_OPT_HCTOSYS 0x08 -#define HWCLOCK_OPT_SYSTOHC 0x10 -#define HWCLOCK_OPT_SYSTZ 0x20 -#define HWCLOCK_OPT_RTCFILE 0x40 -#define HWCLOCK_OPT_PARAM_GET 0x80 -#define HWCLOCK_OPT_PARAM_SET 0x100 int hwclock_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int hwclock_main(int argc UNUSED_PARAM, char **argv) { const char *rtcname = NULL; char *param; - unsigned opt; + unsigned opt, exclusive; int utc; -#if ENABLE_LONG_OPTS +#define OPT_LOCALTIME (1 << 0) +#define OPT_UTC (1 << 1) +#define OPT_RTCFILE (1 << 2) +#define OPT_SHOW (1 << 3) +#define OPT_HCTOSYS (1 << 4) +#define OPT_SYSTOHC (1 << 5) +#define OPT_PARAM_GET (1 << 6) +#define OPT_PARAM_SET (1 << 7) +//#define OPT_VERBOSE (1 << 8) UNUSED +#define OPT_SYSTZ (1 << 9) static const char hwclock_longopts[] ALIGN1 = "localtime\0" No_argument "l" "utc\0" No_argument "u" + "rtc\0" Required_argument "f" "show\0" No_argument "r" "hctosys\0" No_argument "s" "systohc\0" No_argument "w" - "systz\0" No_argument "t" /* short opt is non-standard */ - "rtc\0" Required_argument "f" - "param-get\0" Required_argument "g" /* short opt is non-standard */ - "param-set\0" Required_argument "p" /* short opt is non-standard */ + "param-get\0" Required_argument "\xfd" /* no short equivalent */ + "param-set\0" Required_argument "\xfe" /* no short equivalent */ + "systz\0" No_argument "\xff" /* no short equivalent */ ; -#endif opt = getopt32long(argv, - "^""lurswtf:g:p:v" /* -v is accepted and ignored */ + "^""luf:rsw\xfd:\xfe:v" /* -v is accepted and ignored */ "\0" - "r--wstgp:w--rstgp:s--wrtgp:t--rswgp:g--rswtp:p--rswtg:l--u:u--l", + "l--u:u--l", hwclock_longopts, &rtcname, ¶m, ¶m ); +#if 0 //DEBUG + bb_error_msg("opt:0x%x", opt); + if (opt & OPT_PARAM_GET) bb_error_msg("OPT_PARAM_GET %s", param); + if (opt & OPT_PARAM_SET) bb_error_msg("OPT_PARAM_SET %s", param); + if (opt & OPT_SYSTZ ) bb_error_msg("OPT_SYSTZ"); + return 0; +#endif + /* All options apart from -luf are exclusive, enforce */ + exclusive = opt >> 3; + if ((exclusive - 1) & exclusive) /* more than one bit set? 
*/ + bb_show_usage(); /* If -u or -l wasn't given, check if we are using utc */ - if (opt & (HWCLOCK_OPT_UTC | HWCLOCK_OPT_LOCALTIME)) - utc = (opt & HWCLOCK_OPT_UTC); + if (opt & (OPT_UTC | OPT_LOCALTIME)) + utc = (opt & OPT_UTC); else utc = rtc_adjtime_is_utc(); - if (opt & HWCLOCK_OPT_HCTOSYS) + if (opt & OPT_HCTOSYS) to_sys_clock(&rtcname, utc); - else if (opt & HWCLOCK_OPT_SYSTOHC) + else if (opt & OPT_SYSTOHC) from_sys_clock(&rtcname, utc); - else if (opt & HWCLOCK_OPT_SYSTZ) + else if (opt & OPT_SYSTZ) set_kernel_timezone_and_clock(utc, NULL); - else if (opt & HWCLOCK_OPT_PARAM_GET) + else if (opt & OPT_PARAM_GET) get_rtc_param(&rtcname, param); - else if (opt & HWCLOCK_OPT_PARAM_SET) + else if (opt & OPT_PARAM_SET) set_rtc_param(&rtcname, param); else - /* default HWCLOCK_OPT_SHOW */ + /* default OPT_SHOW */ show_clock(&rtcname, utc); return 0; -- cgit v1.2.3-55-g6feb From cf809e2f2dbf699035e4841e45070b947374a989 Mon Sep 17 00:00:00 2001 From: YU Jincheng Date: Fri, 7 Jul 2023 16:44:24 +0800 Subject: getfattr: new applet function old new delta getfattr_main - 309 +309 print_attr - 115 +115 packed_usage 34576 34631 +55 .rodata 105349 105395 +46 lgetxattr - 41 +41 getxattr - 41 +41 llistxattr - 35 +35 listxattr - 35 +35 applet_names 2806 2815 +9 applet_main 1620 1624 +4 ------------------------------------------------------------------------------ (add/remove: 7/0 grow/shrink: 4/0 up/down: 690/0) Total: 690 bytes Signed-off-by: YU Jincheng Signed-off-by: Denys Vlasenko --- miscutils/getfattr.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 miscutils/getfattr.c diff --git a/miscutils/getfattr.c b/miscutils/getfattr.c new file mode 100644 index 000000000..59b6f6bca --- /dev/null +++ b/miscutils/getfattr.c @@ -0,0 +1,131 @@ +/* + * getfattr - get extended attributes of filesystem objects. + * + * Copyright (C) 2023 by LoveSy + * + * Licensed under GPLv2, see file LICENSE in this source tree. + */ +//config:config GETFATTR +//config: bool "getfattr (12.3 kb)" +//config: default y +//config: help +//config: Get extended attributes on files + +//applet:IF_GETFATTR(APPLET_NOEXEC(getfattr, getfattr, BB_DIR_USR_BIN, BB_SUID_DROP, getfattr)) + +//kbuild:lib-$(CONFIG_GETFATTR) += getfattr.o + +#include +#include +#include "libbb.h" + +//usage:#define getfattr_trivial_usage +//usage: "[-h] {-d|-n ATTR} FILE...\n" +//usage:#define getfattr_full_usage "\n\n" +//usage: "Get extended attributes" +//usage: "\n" +//usage: "\n -h Do not follow symlinks" +//usage: "\n -d Dump all attributes" +//usage: "\n -n ATTR Get attribute ATTR" + +enum { + OPT_h = (1 << 0), + OPT_d = (1 << 1), +}; + +static int print_attr(const char *file, const char *name, char **buf, size_t *bufsize) +{ + ssize_t len; + + if (*bufsize == 0) + goto grow; + again: + len = ((option_mask32 & OPT_h) ? lgetxattr: getxattr)(file, name, *buf, *bufsize); + if (len < 0) { + if (errno != ERANGE) + return len; + grow: + *bufsize = (*bufsize * 2) + 1024; + *buf = xrealloc(*buf, *bufsize); + goto again; + } + printf("%s=\"%.*s\"\n", name, len, *buf); + return 0; +} + +static ssize_t list_attr(const char *file, char **list, size_t *listsize) +{ + ssize_t len; + + if (*listsize == 0) + goto grow; + again: + len = ((option_mask32 & OPT_h) ? 
llistxattr : listxattr)(file, *list, *listsize); + if (len < 0) { + if (errno != ERANGE) + return len; + grow: + *listsize = (*listsize * 2) + 1024; + *list = xrealloc(*list, *listsize); + goto again; + } + return len; +} + +int getfattr_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; +int getfattr_main(int argc UNUSED_PARAM, char **argv) +{ + const char *name; + int status; + int opt; + char *buf = NULL; + size_t bufsize = 0; + char *list = NULL; + size_t listsize = 0; + + opt = getopt32(argv, "^" + "hdn:" + /* Min one arg; exactly one of -n or -d is required. */ + "\0" "-1:d:n:n--d:d--n" + , &name + ); + argv += optind; + status = EXIT_SUCCESS; + + do { + int r; + if (opt & OPT_d) { + ssize_t len = list_attr(*argv, &list, &listsize); + if (len > 0) { + char *key; + printf("# file: %s\n", *argv); + key = list; + while (len > 0) { + ssize_t keylen; + r = print_attr(*argv, key, &buf, &bufsize); + if (r) + goto err; + keylen = strlen(key) + 1; + key += keylen; + len -= keylen; + } + bb_putchar('\n'); + } + } else { + printf("# file: %s\n", *argv); + r = print_attr(*argv, name, &buf, &bufsize); + if (r) { + err: + bb_simple_perror_msg(*argv); + status = EXIT_FAILURE; + // continue; maybe? + } + bb_putchar('\n'); + } + } while (*++argv); + + if (ENABLE_FEATURE_CLEAN_UP) + free(buf); + + fflush_stdout_and_exit(status); +} -- cgit v1.2.3-55-g6feb From a6a102ec4c8d96fcfb968c88fbdae80f6142c7bf Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 17 Jul 2023 09:36:17 +0200 Subject: getfattr: fix "getfattr NOTEXIST" - now prints error msg function old new delta getfattr_main 309 307 -2 .rodata 105395 105391 -4 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-6) Total: -6 bytes Signed-off-by: Denys Vlasenko --- miscutils/getfattr.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/miscutils/getfattr.c b/miscutils/getfattr.c index 59b6f6bca..905aec65f 100644 --- a/miscutils/getfattr.c +++ b/miscutils/getfattr.c @@ -31,6 +31,7 @@ enum { OPT_h = (1 << 0), OPT_d = (1 << 1), + OPT_n = (1 << 2), }; static int print_attr(const char *file, const char *name, char **buf, size_t *bufsize) @@ -85,8 +86,9 @@ int getfattr_main(int argc UNUSED_PARAM, char **argv) opt = getopt32(argv, "^" "hdn:" - /* Min one arg; exactly one of -n or -d is required. */ - "\0" "-1:d:n:n--d:d--n" + /* Min one arg; -d and -n are exclusive */ + "\0" "-1:n--d:d--n" + //getfattr 2.5.1 does not enforce this: ":d:n" /* exactly one of -n or -d is required */ , &name ); argv += optind; @@ -94,8 +96,11 @@ int getfattr_main(int argc UNUSED_PARAM, char **argv) do { int r; - if (opt & OPT_d) { +//getfattr 2.5.1 with no -n/-d defaults to -d + if (!(opt & OPT_n)) { ssize_t len = list_attr(*argv, &list, &listsize); + if (len < 0) + goto err; if (len > 0) { char *key; printf("# file: %s\n", *argv); @@ -118,7 +123,7 @@ int getfattr_main(int argc UNUSED_PARAM, char **argv) err: bb_simple_perror_msg(*argv); status = EXIT_FAILURE; - // continue; maybe? 
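A note on the helpers above (illustration only, not part of the patch): print_attr() and list_attr() use the usual size-probe idiom for the xattr syscalls: call with the current buffer and, if the kernel reports ERANGE, grow the buffer and retry. A minimal standalone sketch of the same pattern against plain getxattr(2), with a hypothetical helper name:

	#include <sys/xattr.h>
	#include <errno.h>
	#include <stdlib.h>

	/* Sketch only: fetch one attribute value, growing the buffer
	 * until it fits. Returns a malloc'ed buffer (caller frees) and
	 * stores the value length, or returns NULL on error. */
	static char *fetch_xattr(const char *file, const char *name, ssize_t *lenp)
	{
		size_t size = 1024;
		char *buf = NULL;
		for (;;) {
			ssize_t len;
			char *nbuf = realloc(buf, size);
			if (!nbuf) {
				free(buf);
				return NULL;
			}
			buf = nbuf;
			len = getxattr(file, name, buf, size);
			if (len >= 0) {
				*lenp = len;
				return buf;
			}
			if (errno != ERANGE) {
				free(buf);
				return NULL;
			}
			size *= 2;	/* buffer too small: grow and retry */
		}
	}

An alternative is to ask for the required size first with getxattr(file, name, NULL, 0); the retry loop avoids a second race if the value grows between the two calls, which is why the applet keeps the buffer and enlarges it on ERANGE.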
+ continue; } bb_putchar('\n'); } -- cgit v1.2.3-55-g6feb From c484846c4459affa769b84cbd0b586f2bbaec828 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 17 Jul 2023 17:29:36 +0200 Subject: introduce and use exitcode_t function old new delta strings_main 422 420 -2 setfattr_main 175 173 -2 brctl_main 1548 1546 -2 makedevs_main 979 975 -4 rev_main 337 332 -5 getfattr_main 307 302 -5 cut_main 1201 1196 -5 cksum_main 398 393 -5 umount_main 573 565 -8 ln_main 516 508 -8 expand_main 660 652 -8 df_main 1068 1060 -8 renice_main 346 332 -14 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 0/13 up/down: 0/-76) Total: -76 bytes Signed-off-by: Denys Vlasenko --- coreutils/cksum.c | 2 +- coreutils/cut.c | 2 +- coreutils/dd.c | 2 +- coreutils/df.c | 2 +- coreutils/expand.c | 2 +- coreutils/fold.c | 2 +- coreutils/ln.c | 2 +- coreutils/touch.c | 2 +- include/libbb.h | 7 +++++++ miscutils/getfattr.c | 2 +- miscutils/makedevs.c | 2 +- miscutils/setfattr.c | 2 +- miscutils/strings.c | 3 ++- networking/brctl.c | 2 +- networking/tc.c | 5 ++--- util-linux/renice.c | 2 +- util-linux/rev.c | 2 +- util-linux/umount.c | 2 +- 18 files changed, 26 insertions(+), 19 deletions(-) diff --git a/coreutils/cksum.c b/coreutils/cksum.c index badc63a6a..1fb6ef2d0 100644 --- a/coreutils/cksum.c +++ b/coreutils/cksum.c @@ -39,7 +39,7 @@ int cksum_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int cksum_main(int argc UNUSED_PARAM, char **argv) { uint32_t *crc32_table = crc32_filltable(NULL, IS_CKSUM); - int exit_code = EXIT_SUCCESS; + exitcode_t exit_code = EXIT_SUCCESS; #if ENABLE_DESKTOP getopt32(argv, ""); /* cksum coreutils 6.9 compat */ diff --git a/coreutils/cut.c b/coreutils/cut.c index 25b16d1a8..d129f9b9d 100644 --- a/coreutils/cut.c +++ b/coreutils/cut.c @@ -311,7 +311,7 @@ int cut_main(int argc UNUSED_PARAM, char **argv) } { - int retval = EXIT_SUCCESS; + exitcode_t retval = EXIT_SUCCESS; if (!*argv) *--argv = (char *)"-"; diff --git a/coreutils/dd.c b/coreutils/dd.c index c032ebe1b..8bb782781 100644 --- a/coreutils/dd.c +++ b/coreutils/dd.c @@ -375,7 +375,7 @@ int dd_main(int argc UNUSED_PARAM, char **argv) OP_oflag_direct, #endif }; - smallint exitcode = EXIT_FAILURE; + exitcode_t exitcode = EXIT_FAILURE; int i; size_t ibs = 512; char *ibuf; diff --git a/coreutils/df.c b/coreutils/df.c index 76e9cefbf..03aa78148 100644 --- a/coreutils/df.c +++ b/coreutils/df.c @@ -113,7 +113,7 @@ int df_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int df_main(int argc UNUSED_PARAM, char **argv) { unsigned long df_disp_hr = 1024; - int status = EXIT_SUCCESS; + exitcode_t status = EXIT_SUCCESS; unsigned opt; FILE *mount_table; struct mntent *mount_entry; diff --git a/coreutils/expand.c b/coreutils/expand.c index 47693e144..c4db26055 100644 --- a/coreutils/expand.c +++ b/coreutils/expand.c @@ -192,7 +192,7 @@ int expand_main(int argc UNUSED_PARAM, char **argv) FILE *file; unsigned tab_size; unsigned opt; - int exit_status = EXIT_SUCCESS; + exitcode_t exit_status = EXIT_SUCCESS; init_unicode(); diff --git a/coreutils/fold.c b/coreutils/fold.c index 2839c8c68..8112fe911 100644 --- a/coreutils/fold.c +++ b/coreutils/fold.c @@ -77,7 +77,7 @@ int fold_main(int argc UNUSED_PARAM, char **argv) char *line_out = NULL; const char *w_opt = "80"; unsigned width; - smallint exitcode = EXIT_SUCCESS; + exitcode_t exitcode = EXIT_SUCCESS; init_unicode(); diff --git a/coreutils/ln.c b/coreutils/ln.c index 34eec398a..080ba142e 100644 --- a/coreutils/ln.c +++ 
b/coreutils/ln.c @@ -52,7 +52,7 @@ int ln_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int ln_main(int argc, char **argv) { - int status = EXIT_SUCCESS; + exitcode_t status = EXIT_SUCCESS; int opts; char *last; char *src_name; diff --git a/coreutils/touch.c b/coreutils/touch.c index 8fde70e12..ced596c89 100644 --- a/coreutils/touch.c +++ b/coreutils/touch.c @@ -77,7 +77,7 @@ int touch_main(int argc UNUSED_PARAM, char **argv) { int fd; int opts; - smalluint status = EXIT_SUCCESS; + exitcode_t status = EXIT_SUCCESS; #if ENABLE_FEATURE_TOUCH_SUSV3 char *reference_file; char *date_str; diff --git a/include/libbb.h b/include/libbb.h index 640fa3988..eb97a9880 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -1444,6 +1444,13 @@ void bb_verror_msg(const char *s, va_list p, const char *strerr) FAST_FUNC; void bb_die_memory_exhausted(void) NORETURN FAST_FUNC; void bb_logenv_override(void) FAST_FUNC; +/* x86 benefits from narrow exit code variables + * (because it has no widening MOV imm8,word32 insn, has to use MOV imm32,w + * for "exitcode = EXIT_FAILURE" and similar. The downside is that sometimes +* gcc widens the variable to int in various ugly suboptimal ways). + */ +typedef smalluint exitcode_t; + #if ENABLE_FEATURE_SYSLOG_INFO void bb_info_msg(const char *s, ...) __attribute__ ((format (printf, 1, 2))) FAST_FUNC; void bb_simple_info_msg(const char *s) FAST_FUNC; diff --git a/miscutils/getfattr.c b/miscutils/getfattr.c index 905aec65f..cb42fdac0 100644 --- a/miscutils/getfattr.c +++ b/miscutils/getfattr.c @@ -77,7 +77,7 @@ int getfattr_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int getfattr_main(int argc UNUSED_PARAM, char **argv) { const char *name; - int status; + exitcode_t status; int opt; char *buf = NULL; size_t bufsize = 0; diff --git a/miscutils/makedevs.c b/miscutils/makedevs.c index 48be91875..999a3b976 100644 --- a/miscutils/makedevs.c +++ b/miscutils/makedevs.c @@ -181,7 +181,7 @@ int makedevs_main(int argc UNUSED_PARAM, char **argv) { parser_t *parser; char *line = (char *)"-"; - int ret = EXIT_SUCCESS; + exitcode_t ret = EXIT_SUCCESS; getopt32(argv, "^" "d:" "\0" "=1", &line); argv += optind; diff --git a/miscutils/setfattr.c b/miscutils/setfattr.c index 10d1840c9..b68bc9452 100644 --- a/miscutils/setfattr.c +++ b/miscutils/setfattr.c @@ -32,7 +32,7 @@ int setfattr_main(int argc UNUSED_PARAM, char **argv) { const char *name; const char *value = ""; - int status; + exitcode_t status; int opt; enum { OPT_h = (1 << 0), diff --git a/miscutils/strings.c b/miscutils/strings.c index 036df5c5d..bd1850cbb 100644 --- a/miscutils/strings.c +++ b/miscutils/strings.c @@ -40,7 +40,8 @@ int strings_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int strings_main(int argc UNUSED_PARAM, char **argv) { - int n, c, status = EXIT_SUCCESS; + int n, c; + exitcode_t status = EXIT_SUCCESS; unsigned count; off_t offset; FILE *file; diff --git a/networking/brctl.c b/networking/brctl.c index 7b0270b51..0f8dc2f7a 100644 --- a/networking/brctl.c +++ b/networking/brctl.c @@ -538,7 +538,7 @@ int brctl_main(int argc UNUSED_PARAM, char **argv) DIR *net; struct dirent *ent; int need_hdr = 1; - int exitcode = EXIT_SUCCESS; + exitcode_t exitcode = EXIT_SUCCESS; if (*argv) { /* "show BR1 BR2 BR3" */ diff --git a/networking/tc.c b/networking/tc.c index 43187f7ee..3a79fd2d9 100644 --- a/networking/tc.c +++ b/networking/tc.c @@ -502,7 +502,7 @@ int tc_main(int argc UNUSED_PARAM, char **argv) }; struct rtnl_handle rth; struct tcmsg msg; - int ret, obj, cmd, arg; + int obj, cmd, arg; char 
*dev = NULL; INIT_G(); @@ -510,7 +510,6 @@ int tc_main(int argc UNUSED_PARAM, char **argv) if (!*++argv) bb_show_usage(); xrtnl_open(&rth); - ret = EXIT_SUCCESS; obj = index_in_substrings(objects, *argv++); if (obj < 0) @@ -625,5 +624,5 @@ int tc_main(int argc UNUSED_PARAM, char **argv) if (ENABLE_FEATURE_CLEAN_UP) { rtnl_close(&rth); } - return ret; + return EXIT_SUCCESS; } diff --git a/util-linux/renice.c b/util-linux/renice.c index 53f197cce..f2737f29b 100644 --- a/util-linux/renice.c +++ b/util-linux/renice.c @@ -45,7 +45,7 @@ int renice_main(int argc UNUSED_PARAM, char **argv) { static const char Xetpriority_msg[] ALIGN1 = "%cetpriority"; - int retval = EXIT_SUCCESS; + exitcode_t retval = EXIT_SUCCESS; int which = PRIO_PROCESS; /* Default 'which' value. */ int use_relative = 0; int adjustment, new_priority; diff --git a/util-linux/rev.c b/util-linux/rev.c index 12df2b9ff..aad53722d 100644 --- a/util-linux/rev.c +++ b/util-linux/rev.c @@ -51,7 +51,7 @@ static void strrev(CHAR_T *s, int len) int rev_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int rev_main(int argc UNUSED_PARAM, char **argv) { - int retval; + exitcode_t retval; size_t bufsize; char *buf; diff --git a/util-linux/umount.c b/util-linux/umount.c index 23da32868..f5c97a034 100644 --- a/util-linux/umount.c +++ b/util-linux/umount.c @@ -97,7 +97,7 @@ int umount_main(int argc UNUSED_PARAM, char **argv) struct mntent me; FILE *fp; char *fstype = NULL; - int status = EXIT_SUCCESS; + exitcode_t status = EXIT_SUCCESS; unsigned opt; struct mtab_list { char *dir; -- cgit v1.2.3-55-g6feb From 8f0845cad7bfc46939132b33f9cd0753b261b953 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 18 Jul 2023 16:41:12 +0200 Subject: libbb: rename source files, no code changes Signed-off-by: Denys Vlasenko --- libbb/Kbuild.src | 10 +- libbb/hash_md5_sha256_x86-32_shaNI.S | 284 ------- libbb/hash_md5_sha256_x86-64_shaNI.S | 290 ------- libbb/hash_md5_sha_x86-32_shaNI.S | 234 ------ libbb/hash_md5_sha_x86-64.S | 1489 ---------------------------------- libbb/hash_md5_sha_x86-64.S.sh | 478 ----------- libbb/hash_md5_sha_x86-64_shaNI.S | 232 ------ libbb/hash_sha1_hwaccel_x86-32.S | 234 ++++++ libbb/hash_sha1_hwaccel_x86-64.S | 232 ++++++ libbb/hash_sha1_x86-64.S | 1489 ++++++++++++++++++++++++++++++++++ libbb/hash_sha1_x86-64.S.sh | 478 +++++++++++ libbb/hash_sha256_hwaccel_x86-32.S | 284 +++++++ libbb/hash_sha256_hwaccel_x86-64.S | 290 +++++++ 13 files changed, 3012 insertions(+), 3012 deletions(-) delete mode 100644 libbb/hash_md5_sha256_x86-32_shaNI.S delete mode 100644 libbb/hash_md5_sha256_x86-64_shaNI.S delete mode 100644 libbb/hash_md5_sha_x86-32_shaNI.S delete mode 100644 libbb/hash_md5_sha_x86-64.S delete mode 100755 libbb/hash_md5_sha_x86-64.S.sh delete mode 100644 libbb/hash_md5_sha_x86-64_shaNI.S create mode 100644 libbb/hash_sha1_hwaccel_x86-32.S create mode 100644 libbb/hash_sha1_hwaccel_x86-64.S create mode 100644 libbb/hash_sha1_x86-64.S create mode 100755 libbb/hash_sha1_x86-64.S.sh create mode 100644 libbb/hash_sha256_hwaccel_x86-32.S create mode 100644 libbb/hash_sha256_hwaccel_x86-64.S diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index 653025e56..c3b30003f 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src @@ -56,11 +56,11 @@ lib-y += login.o lib-y += make_directory.o lib-y += makedev.o lib-y += hash_md5_sha.o -lib-y += hash_md5_sha_x86-64.o -lib-y += hash_md5_sha_x86-64_shaNI.o -lib-y += hash_md5_sha_x86-32_shaNI.o -lib-y += hash_md5_sha256_x86-64_shaNI.o -lib-y += hash_md5_sha256_x86-32_shaNI.o +lib-y += 
hash_sha1_x86-64.o +lib-y += hash_sha1_hwaccel_x86-64.o +lib-y += hash_sha1_hwaccel_x86-32.o +lib-y += hash_sha256_hwaccel_x86-64.o +lib-y += hash_sha256_hwaccel_x86-32.o # Alternative (disabled) MD5 implementation #lib-y += hash_md5prime.o lib-y += messages.o diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S deleted file mode 100644 index a0e4a571a..000000000 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ /dev/null @@ -1,284 +0,0 @@ -#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). -// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -// pshufb and palignr are SSSE3 insns. -// We do not check SSSE3 in cpuid, -// all SHA-capable CPUs support it as well. - -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha256_process_block64_shaNI, "ax", @progbits - .globl sha256_process_block64_shaNI - .hidden sha256_process_block64_shaNI - .type sha256_process_block64_shaNI, @function - -#define DATA_PTR %eax - -#define SHA256CONSTANTS %ecx - -#define MSG %xmm0 -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSGTMP0 %xmm3 -#define MSGTMP1 %xmm4 -#define MSGTMP2 %xmm5 -#define MSGTMP3 %xmm6 - -#define XMMTMP %xmm7 - -#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) - - .balign 8 # allow decoders to fetch at least 2 first insns -sha256_process_block64_shaNI: - - movu128 76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */ - movu128 76+1*16(%eax), STATE1 /* EFGH */ -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - mova128 STATE1, STATE0 - /* --- -------------- ABCD -- EFGH */ - shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ - shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ - -/* XMMTMP holds flip mask from here... 
*/ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP - movl $K256+8*16, SHA256CONSTANTS - - /* Rounds 0-3 */ - movu128 0*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP0 - paddd 0*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 4-7 */ - movu128 1*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP1 - paddd 1*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movu128 2*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP2 - paddd 2*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movu128 3*16(DATA_PTR), MSG - pshufb XMMTMP, MSG -/* ...to here */ - mova128 MSG, MSGTMP3 - paddd 3*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - mova128 MSGTMP0, MSG - paddd 4*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - mova128 MSGTMP1, MSG - paddd 5*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - mova128 MSGTMP2, MSG - paddd 6*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - mova128 MSGTMP3, MSG - paddd 7*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - mova128 MSGTMP0, MSG - paddd 8*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ - mova128 MSGTMP1, MSG - paddd 9*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - mova128 MSGTMP2, MSG - paddd 10*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - mova128 MSGTMP3, MSG - paddd 
11*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - mova128 MSGTMP0, MSG - paddd 12*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - mova128 MSGTMP1, MSG - paddd 13*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 56-59 */ - mova128 MSGTMP2, MSG - paddd 14*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 60-63 */ - mova128 MSGTMP3, MSG - paddd 15*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Write hash values back in the correct order */ - mova128 STATE0, XMMTMP -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - /* --- -------------- HGDC -- FEBA */ - shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ - shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ - /* add current hash values to previous ones */ - movu128 76+1*16(%eax), STATE1 - paddd XMMTMP, STATE1 - movu128 STATE1, 76+1*16(%eax) - movu128 76+0*16(%eax), XMMTMP - paddd XMMTMP, STATE0 - movu128 STATE0, 76+0*16(%eax) - - ret - .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI - - .section .rodata.cst256.K256, "aM", @progbits, 256 - .balign 16 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - - .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BSWAP32_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -#endif diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S deleted file mode 100644 index 172c2eae2..000000000 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ /dev/null @@ -1,290 +0,0 @@ -#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). 
-// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -// pshufb and palignr are SSSE3 insns. -// We do not check SSSE3 in cpuid, -// all SHA-capable CPUs support it as well. - -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha256_process_block64_shaNI, "ax", @progbits - .globl sha256_process_block64_shaNI - .hidden sha256_process_block64_shaNI - .type sha256_process_block64_shaNI, @function - -#define DATA_PTR %rdi - -#define SHA256CONSTANTS %rax - -#define MSG %xmm0 -#define STATE0 %xmm1 -#define STATE1 %xmm2 -#define MSGTMP0 %xmm3 -#define MSGTMP1 %xmm4 -#define MSGTMP2 %xmm5 -#define MSGTMP3 %xmm6 - -#define XMMTMP %xmm7 - -#define SAVE0 %xmm8 -#define SAVE1 %xmm9 - -#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) - - .balign 8 # allow decoders to fetch at least 2 first insns -sha256_process_block64_shaNI: - - movu128 80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */ - movu128 80+1*16(%rdi), STATE1 /* EFGH */ -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - mova128 STATE1, STATE0 - /* --- -------------- ABCD -- EFGH */ - shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ - shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ - -/* XMMTMP holds flip mask from here... */ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP - leaq K256+8*16(%rip), SHA256CONSTANTS - - /* Save hash values for addition after rounds */ - mova128 STATE0, SAVE0 - mova128 STATE1, SAVE1 - - /* Rounds 0-3 */ - movu128 0*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP0 - paddd 0*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 4-7 */ - movu128 1*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP1 - paddd 1*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 8-11 */ - movu128 2*16(DATA_PTR), MSG - pshufb XMMTMP, MSG - mova128 MSG, MSGTMP2 - paddd 2*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 12-15 */ - movu128 3*16(DATA_PTR), MSG - pshufb XMMTMP, MSG -/* ...to here */ - mova128 MSG, MSGTMP3 - paddd 3*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 16-19 */ - mova128 MSGTMP0, MSG - paddd 4*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 20-23 */ - mova128 MSGTMP1, MSG - paddd 5*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - 
sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 24-27 */ - mova128 MSGTMP2, MSG - paddd 6*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 28-31 */ - mova128 MSGTMP3, MSG - paddd 7*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 32-35 */ - mova128 MSGTMP0, MSG - paddd 8*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 36-39 */ - mova128 MSGTMP1, MSG - paddd 9*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP1, MSGTMP0 - - /* Rounds 40-43 */ - mova128 MSGTMP2, MSG - paddd 10*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP2, MSGTMP1 - - /* Rounds 44-47 */ - mova128 MSGTMP3, MSG - paddd 11*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP3, XMMTMP - palignr $4, MSGTMP2, XMMTMP - paddd XMMTMP, MSGTMP0 - sha256msg2 MSGTMP3, MSGTMP0 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP3, MSGTMP2 - - /* Rounds 48-51 */ - mova128 MSGTMP0, MSG - paddd 12*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP0, XMMTMP - palignr $4, MSGTMP3, XMMTMP - paddd XMMTMP, MSGTMP1 - sha256msg2 MSGTMP0, MSGTMP1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - sha256msg1 MSGTMP0, MSGTMP3 - - /* Rounds 52-55 */ - mova128 MSGTMP1, MSG - paddd 13*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP1, XMMTMP - palignr $4, MSGTMP0, XMMTMP - paddd XMMTMP, MSGTMP2 - sha256msg2 MSGTMP1, MSGTMP2 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 56-59 */ - mova128 MSGTMP2, MSG - paddd 14*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - mova128 MSGTMP2, XMMTMP - palignr $4, MSGTMP1, XMMTMP - paddd XMMTMP, MSGTMP3 - sha256msg2 MSGTMP2, MSGTMP3 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Rounds 60-63 */ - mova128 MSGTMP3, MSG - paddd 15*16-8*16(SHA256CONSTANTS), MSG - sha256rnds2 MSG, STATE0, STATE1 - shuf128_32 $0x0E, MSG, MSG - sha256rnds2 MSG, STATE1, STATE0 - - /* Add current hash values with previously saved */ - paddd SAVE0, STATE0 - paddd SAVE1, STATE1 - - /* Write hash values back in the correct order */ - mova128 STATE0, XMMTMP -/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - /* --- -------------- HGDC -- FEBA */ - shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ - shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ - movu128 STATE0, 80+0*16(%rdi) - movu128 XMMTMP, 80+1*16(%rdi) - - ret - .size 
sha256_process_block64_shaNI, .-sha256_process_block64_shaNI - - .section .rodata.cst256.K256, "aM", @progbits, 256 - .balign 16 -K256: - .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - - .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BSWAP32_FLIP_MASK: - .octa 0x0c0d0e0f08090a0b0405060700010203 - -#endif diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S deleted file mode 100644 index 7455a29f0..000000000 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ /dev/null @@ -1,234 +0,0 @@ -#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). -// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define xor128 pxor -#define xor128 xorps -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -#define extr128_32 pextrd -//#define extr128_32 extractps # not shorter - -// pshufb is a SSSE3 insn. -// pinsrd, pextrd, extractps are SSE4.1 insns. -// We do not check SSSE3/SSE4.1 in cpuid, -// all SHA-capable CPUs support them as well. 
- -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha1_process_block64_shaNI, "ax", @progbits - .globl sha1_process_block64_shaNI - .hidden sha1_process_block64_shaNI - .type sha1_process_block64_shaNI, @function - -#define ABCD %xmm0 -#define E0 %xmm1 /* Need two E's b/c they ping pong */ -#define E1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 - - .balign 8 # allow decoders to fetch at least 2 first insns -sha1_process_block64_shaNI: - /* load initial hash values */ - movu128 76(%eax), ABCD - xor128 E0, E0 - pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word - shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD - - mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7 - - movu128 0*16(%eax), MSG0 - pshufb %xmm7, MSG0 - movu128 1*16(%eax), MSG1 - pshufb %xmm7, MSG1 - movu128 2*16(%eax), MSG2 - pshufb %xmm7, MSG2 - movu128 3*16(%eax), MSG3 - pshufb %xmm7, MSG3 - - /* Save hash values for addition after rounds */ - mova128 E0, %xmm7 - /*mova128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ - - /* Rounds 0-3 */ - paddd MSG0, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - - /* Rounds 4-7 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG1, MSG0 - - /* Rounds 8-11 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 12-15 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 16-19 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 20-23 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 24-27 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 28-31 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 32-35 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 36-39 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 40-43 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 44-47 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 48-51 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 52-55 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 56-59 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 60-63 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $3, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 64-67 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $3, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 68-71 */ - sha1nexte MSG1, E1 
- mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $3, E1, ABCD - xor128 MSG1, MSG3 - - /* Rounds 72-75 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $3, E0, ABCD - - /* Rounds 76-79 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1rnds4 $3, E1, ABCD - - /* Add current hash values with previously saved */ - sha1nexte %xmm7, E0 - /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */ - movu128 76(%eax), %xmm7 # get original ABCD (not shuffled)... - - /* Write hash values back in the correct order */ - shuf128_32 $0x1B, ABCD, ABCD - paddd %xmm7, ABCD # ...add it to final ABCD - movu128 ABCD, 76(%eax) - extr128_32 $3, E0, 76+4*4(%eax) - - ret - .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI - - .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f - -#endif diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S deleted file mode 100644 index 2cdd22015..000000000 --- a/libbb/hash_md5_sha_x86-64.S +++ /dev/null @@ -1,1489 +0,0 @@ -### Generated by hash_md5_sha_x86-64.S.sh ### - -#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha1_process_block64, "ax", @progbits - .globl sha1_process_block64 - .hidden sha1_process_block64 - .type sha1_process_block64, @function - - .balign 8 # allow decoders to fetch at least 5 first insns -sha1_process_block64: - pushq %rbp # 1 byte insn - pushq %rbx # 1 byte insn -# pushq %r15 # 2 byte insn - pushq %r14 # 2 byte insn - pushq %r13 # 2 byte insn - pushq %r12 # 2 byte insn - pushq %rdi # we need ctx at the end - -#Register and stack use: -# eax..edx: a..d -# ebp: e -# esi,edi,r8..r14: temps -# r15: unused -# xmm0..xmm3: W[] -# xmm4,xmm5: temps -# xmm6: current round constant -# xmm7: all round constants -# -64(%rsp): area for passing RCONST + W[] from vector to integer units - - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] - - movaps sha1const(%rip), %xmm7 - pshufd $0x00, %xmm7, %xmm6 - - # Load W[] to xmm0..3, byteswapping on the fly. - # - # For iterations 0..15, we pass W[] in rsi,r8..r14 - # for use in RD1As instead of spilling them to stack. - # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it is probably a wash. - # (We use rsi instead of rN because this makes two - # LEAs in two first RD1As shorter by one byte). 
- movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r8 - bswapq %rsi - bswapq %r8 - rolq $32, %rsi # rsi = W[1]:W[0] - rolq $32, %r8 # r8 = W[3]:W[2] - movq %rsi, %xmm0 - movq %r8, %xmm4 - punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) -# movaps %xmm0, %xmm4 # add RCONST, spill to stack -# paddd %xmm6, %xmm4 -# movups %xmm4, -64+16*0(%rsp) - - movq 4*4(%rdi), %r9 - movq 4*6(%rdi), %r10 - bswapq %r9 - bswapq %r10 - rolq $32, %r9 # r9 = W[5]:W[4] - rolq $32, %r10 # r10 = W[7]:W[6] - movq %r9, %xmm1 - movq %r10, %xmm4 - punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - - movq 4*8(%rdi), %r11 - movq 4*10(%rdi), %r12 - bswapq %r11 - bswapq %r12 - rolq $32, %r11 # r11 = W[9]:W[8] - rolq $32, %r12 # r12 = W[11]:W[10] - movq %r11, %xmm2 - movq %r12, %xmm4 - punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - - movq 4*12(%rdi), %r13 - movq 4*14(%rdi), %r14 - bswapq %r13 - bswapq %r14 - rolq $32, %r13 # r13 = W[13]:W[12] - rolq $32, %r14 # r14 = W[15]:W[14] - movq %r13, %xmm3 - movq %r14, %xmm4 - punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) - -# 0 - leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] - shrq $32, %rsi - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 1 - leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 2 - leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] - shrq $32, %r8 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 3 - leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 4 - leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] - shrq $32, %r9 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 5 - leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 6 - leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] - shrq $32, %r10 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 7 - leal 
0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! - movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 8 - leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] - shrq $32, %r11 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 9 - leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 10 - leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] - shrq $32, %r12 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 11 - leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0x55, %xmm7, %xmm6 -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: 
-# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! - movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 12 - leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] - shrq $32, %r13 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 13 - leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 14 - leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] - shrq $32, %r14 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 15 - leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] - movl %ecx, %edi # c - xorl %edx, %edi # ^d - andl %ebx, %edi # &b - xorl %edx, %edi # (((c ^ d) & b) ^ d) - addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %edi # - roll $5, %edi # rotl32(a,5) - addl %edi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 16 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - andl %eax, %edi # &b - xorl %ecx, %edi # (((c ^ d) & b) ^ d) - addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 17 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - andl %ebp, %edi # &b - xorl %ebx, %edi # (((c ^ d) & b) ^ d) - addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 18 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - andl %edx, %edi # &b - xorl %eax, %edi # (((c ^ d) & b) ^ d) - addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 19 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - andl %ecx, %edi # &b - xorl %ebp, %edi # (((c ^ d) & b) ^ d) - addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 20 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 21 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 22 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 23 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 24 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 25 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 26 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 27 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 28 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 29 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 30 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 31 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0xaa, %xmm7, %xmm6 -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 32 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 33 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 34 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 35 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 36 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 37 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 38 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 39 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 40 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 41 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 42 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 43 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
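The integer rounds interleaved between these PREP blocks all share one update, spelled out by their comments: e += RCONST + W[n] + F(b,c,d) + rotl32(a,5), then b = rotl32(b,30); only F changes across the four 20-round groups, and the generator permutes which register plays a..e instead of moving values around. A scalar C restatement of the three boolean functions seen in the comments (a reference sketch, not code from this patch):

    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, unsigned n) { return (x << n) | (x >> (32 - n)); }

    /* rounds  0..19: "(((c ^ d) & b) ^ d)" -- b chooses between c and d */
    uint32_t f_ch(uint32_t b, uint32_t c, uint32_t d)     { return ((c ^ d) & b) ^ d; }
    /* rounds 20..39 and 60..79: "(c ^ d ^ b)" */
    uint32_t f_parity(uint32_t b, uint32_t c, uint32_t d) { return c ^ d ^ b; }
    /* rounds 40..59: "((b | c) & d) | (b & c)" -- majority of b, c, d */
    uint32_t f_maj(uint32_t b, uint32_t c, uint32_t d)    { return ((b | c) & d) | (b & c); }

    /* One round, as the per-round comments describe it */
    uint32_t sha1_round(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                        uint32_t e, uint32_t f, uint32_t rconst, uint32_t w)
    {
        e += rconst + w + f + rotl32(a, 5);
        *b = rotl32(*b, 30);
        return e;
    }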
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 44 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 45 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 46 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 47 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 48 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 49 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 50 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 51 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) - pshufd $0xff, %xmm7, %xmm6 -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 52 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 53 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 54 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 55 - movl %ebx, %edi # di: b - movl %ebx, %esi # si: b - orl %ecx, %edi # di: b | c - andl %ecx, %esi # si: b & c - andl %edx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebp # += ((b | c) & d) | (b & c) - addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) - movaps %xmm3, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm0, %xmm5 - shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm0, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm0, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*0(%rsp) -# 56 - movl %eax, %edi # di: b - movl %eax, %esi # si: b - orl %ebx, %edi # di: b | c - andl %ebx, %esi # si: b & c - andl %ecx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %edx # += ((b | c) & d) | (b & c) - addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 57 - movl %ebp, %edi # di: b - movl %ebp, %esi # si: b - orl %eax, %edi # di: b | c - andl %eax, %esi # si: b & c - andl %ebx, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ecx # += ((b | c) & d) | (b & c) - addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 58 - movl %edx, %edi # di: b - movl %edx, %esi # si: b - orl %ebp, %edi # di: b | c - andl %ebp, %esi # si: b & c - andl %eax, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %ebx # += ((b | c) & d) | (b & c) - addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 59 - movl %ecx, %edi # di: b - movl %ecx, %esi # si: b - orl %edx, %edi # di: b | c - andl %edx, %esi # si: b & c - andl %ebp, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %eax # += ((b | c) & d) | (b & c) - addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) - movaps %xmm0, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm1, %xmm5 - shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm1 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm1, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm1, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*1(%rsp) -# 60 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 61 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 62 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 63 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) - movaps %xmm1, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm2, %xmm5 - shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm2 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm2, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm2, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*2(%rsp) -# 64 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 65 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 66 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 67 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) - movaps %xmm2, %xmm4 - psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) -# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
- movaps %xmm3, %xmm5 - shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps %xmm5, %xmm3 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps %xmm3, %xmm5 - xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps %xmm5, %xmm4 - pslld $2, %xmm5 - psrld $30, %xmm4 -# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) - xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 - xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) - movaps %xmm3, %xmm5 - paddd %xmm6, %xmm5 - movups %xmm5, -64+16*3(%rsp) -# 68 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 69 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 70 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 71 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl $2, %eax # b = rotl32(b,30) -# 72 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 73 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 74 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) -# 75 - movl %ecx, %edi # c - xorl %edx, %edi # ^d - xorl %ebx, %edi # ^b - addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] - addl %edi, %ebp # e += (c ^ d ^ b) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) - rorl $2, %ebx # b = rotl32(b,30) -# 76 - movl %ebx, %edi # c - xorl %ecx, %edi # ^d - xorl %eax, %edi # ^b - addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] - addl %edi, %edx # e += (c ^ d ^ b) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) - rorl 
$2, %eax # b = rotl32(b,30) -# 77 - movl %eax, %edi # c - xorl %ebx, %edi # ^d - xorl %ebp, %edi # ^b - addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] - addl %edi, %ecx # e += (c ^ d ^ b) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) - rorl $2, %ebp # b = rotl32(b,30) -# 78 - movl %ebp, %edi # c - xorl %eax, %edi # ^d - xorl %edx, %edi # ^b - addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] - addl %edi, %ebx # e += (c ^ d ^ b) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) - rorl $2, %edx # b = rotl32(b,30) -# 79 - movl %edx, %edi # c - xorl %ebp, %edi # ^d - xorl %ecx, %edi # ^b - addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] - addl %edi, %eax # e += (c ^ d ^ b) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) - rorl $2, %ecx # b = rotl32(b,30) - - popq %rdi # - popq %r12 # - addl %eax, 80(%rdi) # ctx->hash[0] += a - popq %r13 # - addl %ebx, 84(%rdi) # ctx->hash[1] += b - popq %r14 # - addl %ecx, 88(%rdi) # ctx->hash[2] += c -# popq %r15 # - addl %edx, 92(%rdi) # ctx->hash[3] += d - popq %rbx # - addl %ebp, 96(%rdi) # ctx->hash[4] += e - popq %rbp # - - ret - .size sha1_process_block64, .-sha1_process_block64 - - .section .rodata.cst16.sha1const, "aM", @progbits, 16 - .balign 16 -sha1const: - .long 0x5A827999 - .long 0x6ED9EBA1 - .long 0x8F1BBCDC - .long 0xCA62C1D6 - -#endif diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh deleted file mode 100755 index 653fe4989..000000000 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ /dev/null @@ -1,478 +0,0 @@ -#!/bin/sh - -# We don't regenerate it on every "make" invocation - only by hand. -# The reason is that the changes to generated code are difficult -# to visualize by looking only at this script, it helps when the commit -# also contains the diff of the generated file. -exec >hash_md5_sha_x86-64.S - -# Based on http://arctic.org/~dean/crypto/sha1.html. -# ("This SHA1 implementation is public domain.") -# -# x86-64 has at least SSE2 vector insns always available. -# We can use them without any CPUID checks (and without a need -# for a fallback code if needed insns are not available). -# This code uses them to calculate W[] ahead of time. -# -# Unfortunately, results are passed from vector unit to -# integer ALUs on the stack. MOVD/Q insns to move them directly -# from vector to integer registers are slower than store-to-load -# forwarding in LSU (on Skylake at least). -# -# The win against a purely integer code is small on Skylake, -# only about 7-8%. We offload about 1/3 of our operations to the vector unit. -# It can do 4 ops at once in one 128-bit register, -# but we have to use x2 of them because of W[0] complication, -# SSE2 has no "rotate each word by N bits" insns, -# moving data to/from vector unit is clunky, and Skylake -# has four integer ALUs unified with three vector ALUs, -# which makes pure integer code rather fast, and makes -# vector ops compete with integer ones. -# -# Zen3, with its separate vector ALUs, wins more, about 12%. - -xmmT1="%xmm4" -xmmT2="%xmm5" -xmmRCONST="%xmm6" -xmmALLRCONST="%xmm7" -T=`printf '\t'` - -# SSE instructions are longer than 4 bytes on average. -# Intel CPUs (up to Tiger Lake at least) can't decode -# more than 16 bytes of code in one cycle. -# By interleaving SSE code and integer code -# we mostly achieve a situation where 16-byte decode fetch window -# contains 4 (or more) insns. -# -# However. 
On Skylake, there was no observed difference, -# but on Zen3, non-interleaved code is ~3% faster -# (822 Mb/s versus 795 Mb/s hashing speed). -# Off for now: -interleave=false - -INTERLEAVE() { - $interleave || \ - { - # Generate non-interleaved code - # (it should work correctly too) - echo "$1" - echo "$2" - return - } - ( - echo "$1" | grep -v '^$' >"$0.temp1" - echo "$2" | grep -v '^$' >"$0.temp2" - exec 3<"$0.temp1" - exec 4<"$0.temp2" - IFS='' - while :; do - line1='' - line2='' - while :; do - read -r line1 <&3 - if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then - break - fi - echo "$line1" - done - while :; do - read -r line2 <&4 - if test "${line2:0:4}" = "${T}lea"; then - # We use 7-8 byte long forms of LEA. - # Do not interleave them with SSE insns - # which are also long. - echo "$line2" - read -r line2 <&4 - echo "$line2" - continue - fi - if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then - break - fi - echo "$line2" - done - test "$line1$line2" || break - echo "$line1" - echo "$line2" - done - rm "$0.temp1" "$0.temp2" - ) -} - -# movaps bswap32_mask(%rip), $xmmT1 -# Load W[] to xmm0..3, byteswapping on the fly. -# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 -# for use in RD1As instead of spilling them to stack. -# (We use rsi instead of rN because this makes two -# ADDs in two first RD1As shorter by one byte). -# movups 16*0(%rdi), %xmm0 -# pshufb $xmmT1, %xmm0 #SSSE3 insn -# movaps %xmm0, $xmmT2 -# paddd $xmmRCONST, $xmmT2 -# movq $xmmT2, %rsi -# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn -# #movhpd $xmmT2, %r8 #can only move to mem, not to reg -# shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence -# movq $xmmT2, %r8 # instead -# ... -# -# ... -#- leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] -#+ addl %esi, %e$e # e += RCONST + W[n] -# ^^^^^^^^^^^^^^^^^^^^^^^^ -# The above is -97 bytes of code... -# ...but pshufb is a SSSE3 insn. Can't use it. - -echo \ -"### Generated by hash_md5_sha_x86-64.S.sh ### - -#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) -#ifdef __linux__ - .section .note.GNU-stack, \"\", @progbits -#endif - .section .text.sha1_process_block64, \"ax\", @progbits - .globl sha1_process_block64 - .hidden sha1_process_block64 - .type sha1_process_block64, @function - - .balign 8 # allow decoders to fetch at least 5 first insns -sha1_process_block64: - pushq %rbp # 1 byte insn - pushq %rbx # 1 byte insn -# pushq %r15 # 2 byte insn - pushq %r14 # 2 byte insn - pushq %r13 # 2 byte insn - pushq %r12 # 2 byte insn - pushq %rdi # we need ctx at the end - -#Register and stack use: -# eax..edx: a..d -# ebp: e -# esi,edi,r8..r14: temps -# r15: unused -# xmm0..xmm3: W[] -# xmm4,xmm5: temps -# xmm6: current round constant -# xmm7: all round constants -# -64(%rsp): area for passing RCONST + W[] from vector to integer units - - movl 80(%rdi), %eax # a = ctx->hash[0] - movl 84(%rdi), %ebx # b = ctx->hash[1] - movl 88(%rdi), %ecx # c = ctx->hash[2] - movl 92(%rdi), %edx # d = ctx->hash[3] - movl 96(%rdi), %ebp # e = ctx->hash[4] - - movaps sha1const(%rip), $xmmALLRCONST - pshufd \$0x00, $xmmALLRCONST, $xmmRCONST - - # Load W[] to xmm0..3, byteswapping on the fly. - # - # For iterations 0..15, we pass W[] in rsi,r8..r14 - # for use in RD1As instead of spilling them to stack. - # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it is probably a wash. - # (We use rsi instead of rN because this makes two - # LEAs in two first RD1As shorter by one byte). 
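The load sequence that follows pulls the sixteen big-endian message words in 64-bit chunks, using movq / bswapq / rolq $32 before packing pairs into xmm registers with punpcklqdq. On a little-endian machine one such 64-bit step boils down to the scalar sketch below (illustrative only; block and the GCC/Clang builtin __builtin_bswap64 are stand-ins of mine, not names from this script):

    #include <stdint.h>
    #include <string.h>

    /* Returns W[i] in the low 32 bits and W[i+1] in the high 32 bits,
     * mirroring "movq; bswapq; rolq $32" in the generated assembly. */
    uint64_t load_w_pair(const unsigned char *block, unsigned i)
    {
        uint64_t v;
        memcpy(&v, block + 4 * i, 8);   /* movq  4*i(%rdi), %reg               */
        v = __builtin_bswap64(v);       /* bswapq: W[i] lands in the high half */
        v = (v << 32) | (v >> 32);      /* rolq $32: register = W[i+1]:W[i]    */
        return v;
    }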
- movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r8 - bswapq %rsi - bswapq %r8 - rolq \$32, %rsi # rsi = W[1]:W[0] - rolq \$32, %r8 # r8 = W[3]:W[2] - movq %rsi, %xmm0 - movq %r8, $xmmT1 - punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) -# movaps %xmm0, $xmmT1 # add RCONST, spill to stack -# paddd $xmmRCONST, $xmmT1 -# movups $xmmT1, -64+16*0(%rsp) - - movq 4*4(%rdi), %r9 - movq 4*6(%rdi), %r10 - bswapq %r9 - bswapq %r10 - rolq \$32, %r9 # r9 = W[5]:W[4] - rolq \$32, %r10 # r10 = W[7]:W[6] - movq %r9, %xmm1 - movq %r10, $xmmT1 - punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - - movq 4*8(%rdi), %r11 - movq 4*10(%rdi), %r12 - bswapq %r11 - bswapq %r12 - rolq \$32, %r11 # r11 = W[9]:W[8] - rolq \$32, %r12 # r12 = W[11]:W[10] - movq %r11, %xmm2 - movq %r12, $xmmT1 - punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - - movq 4*12(%rdi), %r13 - movq 4*14(%rdi), %r14 - bswapq %r13 - bswapq %r14 - rolq \$32, %r13 # r13 = W[13]:W[12] - rolq \$32, %r14 # r14 = W[15]:W[14] - movq %r13, %xmm3 - movq %r14, $xmmT1 - punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) -" - -PREP() { -local xmmW0=$1 -local xmmW4=$2 -local xmmW8=$3 -local xmmW12=$4 -# the above must be %xmm0..3 in some permutation -local dstmem=$5 -#W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); -#W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); -#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); -#W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); -#W[3] ^= rol(W[0], 1); -echo "# PREP $@ - movaps $xmmW12, $xmmT1 - psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - -# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) -# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) -# same result as above, but shorter and faster: -# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, -# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! - movaps $xmmW0, $xmmT2 - shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) - - xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) - xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) - xorps $xmmT2, $xmmW0 # ^ - # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup - movaps $xmmW0, $xmmT2 - - xorps $xmmT1, $xmmT1 # rol(W0,1): - pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) - paddd $xmmW0, $xmmW0 # shift left by 1 - psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 - # W0 = rotated (W[0]..W[3]), still needs W[3] fixup - - pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) - movaps $xmmT2, $xmmT1 - pslld \$2, $xmmT2 - psrld \$30, $xmmT1 -# xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) - xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 - - xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) -" -# movq $xmmW0, %r8 # high latency (~6 cycles) -# movaps $xmmW0, $xmmT1 -# psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower -# movq $xmmT1, %r10 # high latency -# movq %r8, %r9 -# movq %r10, %r11 -# shrq \$32, %r9 -# shrq \$32, %r11 -# ^^^ slower than passing the results on stack (!!!) 
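SSE2 has no per-lane rotate, so PREP builds rol(W,1) out of pcmpgtd / paddd / psubd: the compare against zero yields an all-ones mask exactly in the lanes whose top bit is about to be rotated out, the add doubles each lane (a left shift by one), and the subtract feeds that bit back in at the bottom. A hedged C intrinsics sketch of just this trick (function and variable names are mine, not from the script):

    #include <emmintrin.h>   /* SSE2 */
    #include <stdint.h>
    #include <stdio.h>

    static __m128i rol1_epi32(__m128i x)
    {
        __m128i msb = _mm_cmpgt_epi32(_mm_setzero_si128(), x); /* pcmpgtd: -1 where lane < 0  */
        x = _mm_add_epi32(x, x);                               /* paddd:   lane <<= 1         */
        return _mm_sub_epi32(x, msb);                          /* psubd:   +1 where msb was set */
    }

    int main(void)
    {
        uint32_t in[4] = { 0x80000000u, 1u, 0xdeadbeefu, 0x7fffffffu }, out[4];
        _mm_storeu_si128((__m128i *)out, rol1_epi32(_mm_loadu_si128((const __m128i *)in)));
        for (int i = 0; i < 4; i++)   /* check against the plain scalar rotate */
            printf("%08x %08x\n", out[i], (in[i] << 1) | (in[i] >> 31));
        return 0;
    }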
-echo " - movaps $xmmW0, $xmmT2 - paddd $xmmRCONST, $xmmT2 - movups $xmmT2, $dstmem -" -} - -# It's possible to interleave integer insns in rounds to mostly eliminate -# dependency chains, but this likely to only help old Pentium-based -# CPUs (ones without OOO, which can only simultaneously execute a pair -# of _adjacent_ insns). -# Testing on old-ish Silvermont CPU (which has OOO window of only -# about ~8 insns) shows very small (~1%) speedup. - -RD1A() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n0=$(((n+0) & 15)) -local rN=$((7+n0/2)) -echo " -# $n -";test $n0 = 0 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] - shrq \$32, %rsi -";test $n0 = 1 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] -";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " - leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] - shrq \$32, %r$rN -";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " - leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] -";echo " - movl %e$c, %edi # c - xorl %e$d, %edi # ^d - andl %e$b, %edi # &b - xorl %e$d, %edi # (((c ^ d) & b) ^ d) - addl %edi, %e$e # e += (((c ^ d) & b) ^ d) - movl %e$a, %edi # - roll \$5, %edi # rotl32(a,5) - addl %edi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} -RD1B() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n13=$(((n+13) & 15)) -local n8=$(((n+8) & 15)) -local n2=$(((n+2) & 15)) -local n0=$(((n+0) & 15)) -echo " -# $n - movl %e$c, %edi # c - xorl %e$d, %edi # ^d - andl %e$b, %edi # &b - xorl %e$d, %edi # (((c ^ d) & b) ^ d) - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] - addl %edi, %e$e # e += (((c ^ d) & b) ^ d) - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} - -RD2() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n13=$(((n+13) & 15)) -local n8=$(((n+8) & 15)) -local n2=$(((n+2) & 15)) -local n0=$(((n+0) & 15)) -echo " -# $n - movl %e$c, %edi # c - xorl %e$d, %edi # ^d - xorl %e$b, %edi # ^b - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] - addl %edi, %e$e # e += (c ^ d ^ b) - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} - -RD3() { -local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 -local n=$(($6)) -local n13=$(((n+13) & 15)) -local n8=$(((n+8) & 15)) -local n2=$(((n+2) & 15)) -local n0=$(((n+0) & 15)) -echo " -# $n - movl %e$b, %edi # di: b - movl %e$b, %esi # si: b - orl %e$c, %edi # di: b | c - andl %e$c, %esi # si: b & c - andl %e$d, %edi # di: (b | c) & d - orl %esi, %edi # ((b | c) & d) | (b & c) - addl %edi, %e$e # += ((b | c) & d) | (b & c) - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) - rorl \$2, %e$b # b = rotl32(b,30) -" -} - -{ -# Round 1 -RCONST=0x5A827999 -RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; -RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` -INTERLEAVE "$a" "$b" -a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST" - PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 
15;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` -INTERLEAVE "$a" "$b" - -# Round 2 -RCONST=0x6ED9EBA1 -a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` -INTERLEAVE "$a" "$b" -a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST" - PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` -INTERLEAVE "$a" "$b" - -# Round 3 -RCONST=0x8F1BBCDC -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` -INTERLEAVE "$a" "$b" -a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST" - PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` -b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` -INTERLEAVE "$a" "$b" - -# Round 4 has the same logic as round 2, only n and RCONST are different -RCONST=0xCA62C1D6 -a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` -b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` -b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` -INTERLEAVE "$a" "$b" -a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` -b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` -INTERLEAVE "$a" "$b" -RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; -RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; -} | grep -v '^$' - -echo " - popq %rdi # - popq %r12 # - addl %eax, 80(%rdi) # ctx->hash[0] += a - popq %r13 # - addl %ebx, 84(%rdi) # ctx->hash[1] += b - popq %r14 # - addl %ecx, 88(%rdi) # ctx->hash[2] += c -# popq %r15 # - addl %edx, 92(%rdi) # ctx->hash[3] += d - popq %rbx # - addl %ebp, 96(%rdi) # ctx->hash[4] += e - popq %rbp # - - ret - .size sha1_process_block64, .-sha1_process_block64 - - .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 - .balign 16 -sha1const: - .long 0x5A827999 - .long 0x6ED9EBA1 - .long 0x8F1BBCDC - .long 0xCA62C1D6 - -#endif" diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S deleted file mode 100644 index 
2f03e1ce4..000000000 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ /dev/null @@ -1,232 +0,0 @@ -#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__) -/* The code is adapted from Linux kernel's source */ - -// We use shorter insns, even though they are for "wrong" -// data type (fp, not int). -// For Intel, there is no penalty for doing it at all -// (CPUs which do have such penalty do not support SHA insns). -// For AMD, the penalty is one extra cycle -// (allegedly: I failed to find measurable difference). - -//#define mova128 movdqa -#define mova128 movaps -//#define movu128 movdqu -#define movu128 movups -//#define xor128 pxor -#define xor128 xorps -//#define shuf128_32 pshufd -#define shuf128_32 shufps - -#define extr128_32 pextrd -//#define extr128_32 extractps # not shorter - -// pshufb is a SSSE3 insn. -// pinsrd, pextrd, extractps are SSE4.1 insns. -// We do not check SSSE3/SSE4.1 in cpuid, -// all SHA-capable CPUs support them as well. - -#ifdef __linux__ - .section .note.GNU-stack, "", @progbits -#endif - .section .text.sha1_process_block64_shaNI, "ax", @progbits - .globl sha1_process_block64_shaNI - .hidden sha1_process_block64_shaNI - .type sha1_process_block64_shaNI, @function - -#define ABCD %xmm0 -#define E0 %xmm1 /* Need two E's b/c they ping pong */ -#define E1 %xmm2 -#define MSG0 %xmm3 -#define MSG1 %xmm4 -#define MSG2 %xmm5 -#define MSG3 %xmm6 - - .balign 8 # allow decoders to fetch at least 2 first insns -sha1_process_block64_shaNI: - /* load initial hash values */ - movu128 80(%rdi), ABCD - xor128 E0, E0 - pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word - shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD - - mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7 - - movu128 0*16(%rdi), MSG0 - pshufb %xmm7, MSG0 - movu128 1*16(%rdi), MSG1 - pshufb %xmm7, MSG1 - movu128 2*16(%rdi), MSG2 - pshufb %xmm7, MSG2 - movu128 3*16(%rdi), MSG3 - pshufb %xmm7, MSG3 - - /* Save hash values for addition after rounds */ - mova128 E0, %xmm7 - mova128 ABCD, %xmm8 - - /* Rounds 0-3 */ - paddd MSG0, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - - /* Rounds 4-7 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG1, MSG0 - - /* Rounds 8-11 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 12-15 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $0, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 16-19 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $0, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 20-23 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 24-27 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 28-31 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 32-35 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $1, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 36-39 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $1, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 40-43 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* 
Rounds 44-47 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 48-51 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 52-55 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $2, E1, ABCD - sha1msg1 MSG1, MSG0 - xor128 MSG1, MSG3 - - /* Rounds 56-59 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $2, E0, ABCD - sha1msg1 MSG2, MSG1 - xor128 MSG2, MSG0 - - /* Rounds 60-63 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1msg2 MSG3, MSG0 - sha1rnds4 $3, E1, ABCD - sha1msg1 MSG3, MSG2 - xor128 MSG3, MSG1 - - /* Rounds 64-67 */ - sha1nexte MSG0, E0 - mova128 ABCD, E1 - sha1msg2 MSG0, MSG1 - sha1rnds4 $3, E0, ABCD - sha1msg1 MSG0, MSG3 - xor128 MSG0, MSG2 - - /* Rounds 68-71 */ - sha1nexte MSG1, E1 - mova128 ABCD, E0 - sha1msg2 MSG1, MSG2 - sha1rnds4 $3, E1, ABCD - xor128 MSG1, MSG3 - - /* Rounds 72-75 */ - sha1nexte MSG2, E0 - mova128 ABCD, E1 - sha1msg2 MSG2, MSG3 - sha1rnds4 $3, E0, ABCD - - /* Rounds 76-79 */ - sha1nexte MSG3, E1 - mova128 ABCD, E0 - sha1rnds4 $3, E1, ABCD - - /* Add current hash values with previously saved */ - sha1nexte %xmm7, E0 - paddd %xmm8, ABCD - - /* Write hash values back in the correct order */ - shuf128_32 $0x1B, ABCD, ABCD - movu128 ABCD, 80(%rdi) - extr128_32 $3, E0, 80+4*4(%rdi) - - ret - .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI - - .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 - .balign 16 -PSHUFFLE_BYTE_FLIP_MASK: - .octa 0x000102030405060708090a0b0c0d0e0f - -#endif diff --git a/libbb/hash_sha1_hwaccel_x86-32.S b/libbb/hash_sha1_hwaccel_x86-32.S new file mode 100644 index 000000000..7455a29f0 --- /dev/null +++ b/libbb/hash_sha1_hwaccel_x86-32.S @@ -0,0 +1,234 @@ +#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define xor128 pxor +#define xor128 xorps +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +#define extr128_32 pextrd +//#define extr128_32 extractps # not shorter + +// pshufb is a SSSE3 insn. +// pinsrd, pextrd, extractps are SSE4.1 insns. +// We do not check SSSE3/SSE4.1 in cpuid, +// all SHA-capable CPUs support them as well. 
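For reference, the sha1msg1/sha1msg2 steps below, together with the explicit xor128, compute the standard SHA-1 message expansion four words at a time. A minimal scalar sketch of that recurrence, plain C for illustration only (the helper name sha1_expand is made up here, it is not part of this file):

	/* Illustrative scalar sketch, not part of this file: the SHA-1
	 * message expansion that the sha1msg1/sha1msg2/xor128 sequence
	 * below computes four words at a time. */
	#include <stdint.h>

	static uint32_t rol32(uint32_t x, unsigned n)
	{
		return (x << n) | (x >> (32 - n));
	}

	static void sha1_expand(uint32_t W[80]) /* W[0..15] already filled */
	{
		for (int t = 16; t < 80; t++)
			W[t] = rol32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
	}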
+ +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha1_process_block64_shaNI, "ax", @progbits + .globl sha1_process_block64_shaNI + .hidden sha1_process_block64_shaNI + .type sha1_process_block64_shaNI, @function + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 + + .balign 8 # allow decoders to fetch at least 2 first insns +sha1_process_block64_shaNI: + /* load initial hash values */ + movu128 76(%eax), ABCD + xor128 E0, E0 + pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word + shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD + + mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7 + + movu128 0*16(%eax), MSG0 + pshufb %xmm7, MSG0 + movu128 1*16(%eax), MSG1 + pshufb %xmm7, MSG1 + movu128 2*16(%eax), MSG2 + pshufb %xmm7, MSG2 + movu128 3*16(%eax), MSG3 + pshufb %xmm7, MSG3 + + /* Save hash values for addition after rounds */ + mova128 E0, %xmm7 + /*mova128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ + + /* Rounds 0-3 */ + paddd MSG0, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 12-15 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 
+ mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + xor128 MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte %xmm7, E0 + /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */ + movu128 76(%eax), %xmm7 # get original ABCD (not shuffled)... + + /* Write hash values back in the correct order */ + shuf128_32 $0x1B, ABCD, ABCD + paddd %xmm7, ABCD # ...add it to final ABCD + movu128 ABCD, 76(%eax) + extr128_32 $3, E0, 76+4*4(%eax) + + ret + .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI + + .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f + +#endif diff --git a/libbb/hash_sha1_hwaccel_x86-64.S b/libbb/hash_sha1_hwaccel_x86-64.S new file mode 100644 index 000000000..2f03e1ce4 --- /dev/null +++ b/libbb/hash_sha1_hwaccel_x86-64.S @@ -0,0 +1,232 @@ +#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define xor128 pxor +#define xor128 xorps +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +#define extr128_32 pextrd +//#define extr128_32 extractps # not shorter + +// pshufb is a SSSE3 insn. +// pinsrd, pextrd, extractps are SSE4.1 insns. +// We do not check SSSE3/SSE4.1 in cpuid, +// all SHA-capable CPUs support them as well. 
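The immediate on sha1rnds4 ($0..$3 below) selects which 20-round group is being processed, i.e. which selection function and round constant apply. A scalar sketch of that mapping, using the textbook SHA-1 definitions (illustrative only; sha1_f is a made-up helper, not something this file defines):

	/* Illustrative only: the function and constant selected by the
	 * sha1rnds4 immediate for each 20-round group. */
	#include <stdint.h>

	static uint32_t sha1_f(int group, uint32_t b, uint32_t c, uint32_t d)
	{
		switch (group) {
		case 0:  return (b & c) | (~b & d);          /* $0: rounds  0..19, K = 0x5A827999 */
		case 1:  return b ^ c ^ d;                   /* $1: rounds 20..39, K = 0x6ED9EBA1 */
		case 2:  return (b & c) | (b & d) | (c & d); /* $2: rounds 40..59, K = 0x8F1BBCDC */
		default: return b ^ c ^ d;                   /* $3: rounds 60..79, K = 0xCA62C1D6 */
		}
	}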
+ +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha1_process_block64_shaNI, "ax", @progbits + .globl sha1_process_block64_shaNI + .hidden sha1_process_block64_shaNI + .type sha1_process_block64_shaNI, @function + +#define ABCD %xmm0 +#define E0 %xmm1 /* Need two E's b/c they ping pong */ +#define E1 %xmm2 +#define MSG0 %xmm3 +#define MSG1 %xmm4 +#define MSG2 %xmm5 +#define MSG3 %xmm6 + + .balign 8 # allow decoders to fetch at least 2 first insns +sha1_process_block64_shaNI: + /* load initial hash values */ + movu128 80(%rdi), ABCD + xor128 E0, E0 + pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word + shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD + + mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7 + + movu128 0*16(%rdi), MSG0 + pshufb %xmm7, MSG0 + movu128 1*16(%rdi), MSG1 + pshufb %xmm7, MSG1 + movu128 2*16(%rdi), MSG2 + pshufb %xmm7, MSG2 + movu128 3*16(%rdi), MSG3 + pshufb %xmm7, MSG3 + + /* Save hash values for addition after rounds */ + mova128 E0, %xmm7 + mova128 ABCD, %xmm8 + + /* Rounds 0-3 */ + paddd MSG0, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + + /* Rounds 4-7 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG1, MSG0 + + /* Rounds 8-11 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 12-15 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $0, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 16-19 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $0, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 20-23 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 24-27 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 28-31 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 32-35 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $1, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 36-39 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $1, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 40-43 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 44-47 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 48-51 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 52-55 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + sha1msg2 MSG1, MSG2 + sha1rnds4 $2, E1, ABCD + sha1msg1 MSG1, MSG0 + xor128 MSG1, MSG3 + + /* Rounds 56-59 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $2, E0, ABCD + sha1msg1 MSG2, MSG1 + xor128 MSG2, MSG0 + + /* Rounds 60-63 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1msg2 MSG3, MSG0 + sha1rnds4 $3, E1, ABCD + sha1msg1 MSG3, MSG2 + xor128 MSG3, MSG1 + + /* Rounds 64-67 */ + sha1nexte MSG0, E0 + mova128 ABCD, E1 + sha1msg2 MSG0, MSG1 + sha1rnds4 $3, E0, ABCD + sha1msg1 MSG0, MSG3 + xor128 MSG0, MSG2 + + /* Rounds 68-71 */ + sha1nexte MSG1, E1 + mova128 ABCD, E0 + 
sha1msg2 MSG1, MSG2 + sha1rnds4 $3, E1, ABCD + xor128 MSG1, MSG3 + + /* Rounds 72-75 */ + sha1nexte MSG2, E0 + mova128 ABCD, E1 + sha1msg2 MSG2, MSG3 + sha1rnds4 $3, E0, ABCD + + /* Rounds 76-79 */ + sha1nexte MSG3, E1 + mova128 ABCD, E0 + sha1rnds4 $3, E1, ABCD + + /* Add current hash values with previously saved */ + sha1nexte %xmm7, E0 + paddd %xmm8, ABCD + + /* Write hash values back in the correct order */ + shuf128_32 $0x1B, ABCD, ABCD + movu128 ABCD, 80(%rdi) + extr128_32 $3, E0, 80+4*4(%rdi) + + ret + .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI + + .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BYTE_FLIP_MASK: + .octa 0x000102030405060708090a0b0c0d0e0f + +#endif diff --git a/libbb/hash_sha1_x86-64.S b/libbb/hash_sha1_x86-64.S new file mode 100644 index 000000000..b1968fff6 --- /dev/null +++ b/libbb/hash_sha1_x86-64.S @@ -0,0 +1,1489 @@ +### Generated by hash_sha1_x86-64.S.sh ### + +#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha1_process_block64, "ax", @progbits + .globl sha1_process_block64 + .hidden sha1_process_block64 + .type sha1_process_block64, @function + + .balign 8 # allow decoders to fetch at least 5 first insns +sha1_process_block64: + pushq %rbp # 1 byte insn + pushq %rbx # 1 byte insn +# pushq %r15 # 2 byte insn + pushq %r14 # 2 byte insn + pushq %r13 # 2 byte insn + pushq %r12 # 2 byte insn + pushq %rdi # we need ctx at the end + +#Register and stack use: +# eax..edx: a..d +# ebp: e +# esi,edi,r8..r14: temps +# r15: unused +# xmm0..xmm3: W[] +# xmm4,xmm5: temps +# xmm6: current round constant +# xmm7: all round constants +# -64(%rsp): area for passing RCONST + W[] from vector to integer units + + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] + + movaps sha1const(%rip), %xmm7 + pshufd $0x00, %xmm7, %xmm6 + + # Load W[] to xmm0..3, byteswapping on the fly. + # + # For iterations 0..15, we pass W[] in rsi,r8..r14 + # for use in RD1As instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it is probably a wash. + # (We use rsi instead of rN because this makes two + # LEAs in two first RD1As shorter by one byte). 
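The loads below turn one 64-bit read into two big-endian message words: bswapq byte-reverses the whole quadword and rolq $32 then swaps the halves, so W[n] ends up in the low 32 bits and W[n+1] in the high 32 bits. A C sketch of the same trick (load_two_words is a hypothetical helper, used only for illustration):

	/* Hypothetical helper, for illustration only: mirrors the
	 * bswapq + rolq $32 trick used below. */
	#include <stdint.h>
	#include <string.h>

	static void load_two_words(const uint8_t *p, uint32_t *w0, uint32_t *w1)
	{
		uint64_t v;
		memcpy(&v, p, 8);          /* one 64-bit little-endian load */
		v = __builtin_bswap64(v);  /* bswapq: W[n] now in the high half */
		v = (v << 32) | (v >> 32); /* rolq $32: W[n+1]:W[n], W[n] in low half */
		*w0 = (uint32_t)v;         /* consumed first, in the leal */
		*w1 = (uint32_t)(v >> 32); /* consumed after shrq $32 */
	}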
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r8 + bswapq %rsi + bswapq %r8 + rolq $32, %rsi # rsi = W[1]:W[0] + rolq $32, %r8 # r8 = W[3]:W[2] + movq %rsi, %xmm0 + movq %r8, %xmm4 + punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, %xmm4 # add RCONST, spill to stack +# paddd %xmm6, %xmm4 +# movups %xmm4, -64+16*0(%rsp) + + movq 4*4(%rdi), %r9 + movq 4*6(%rdi), %r10 + bswapq %r9 + bswapq %r10 + rolq $32, %r9 # r9 = W[5]:W[4] + rolq $32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 + movq %r10, %xmm4 + punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) + + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq $32, %r11 # r11 = W[9]:W[8] + rolq $32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, %xmm4 + punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) + + movq 4*12(%rdi), %r13 + movq 4*14(%rdi), %r14 + bswapq %r13 + bswapq %r14 + rolq $32, %r13 # r13 = W[13]:W[12] + rolq $32, %r14 # r14 = W[15]:W[14] + movq %r13, %xmm3 + movq %r14, %xmm4 + punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) + +# 0 + leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] + shrq $32, %rsi + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 1 + leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 2 + leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] + shrq $32, %r8 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 3 + leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 4 + leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] + shrq $32, %r9 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 5 + leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 6 + leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] + shrq $32, %r10 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 7 + leal 
0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 8 + leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] + shrq $32, %r11 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 9 + leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 10 + leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] + shrq $32, %r12 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 11 + leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) + pshufd $0x55, %xmm7, %xmm6 +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: 
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 12 + leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] + shrq $32, %r13 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 13 + leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 14 + leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] + shrq $32, %r14 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 15 + leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] + movl %ecx, %edi # c + xorl %edx, %edi # ^d + andl %ebx, %edi # &b + xorl %edx, %edi # (((c ^ d) & b) ^ d) + addl %edi, %ebp # e += (((c ^ d) & b) ^ d) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 16 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + andl %eax, %edi # &b + xorl %ecx, %edi # (((c ^ d) & b) ^ d) + addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (((c ^ d) & b) ^ d) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 17 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + andl %ebp, %edi # &b + xorl %ebx, %edi # (((c ^ d) & b) ^ d) + addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (((c ^ d) & b) ^ d) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 18 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + andl %edx, %edi # &b + xorl %eax, %edi # (((c ^ d) & b) ^ d) + addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (((c ^ d) & b) ^ d) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 19 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + andl %ecx, %edi # &b + xorl %ebp, %edi # (((c ^ d) & b) ^ d) + addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (((c ^ d) & b) ^ d) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
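Each PREP block, like the one continuing below, advances the 16-word schedule by four words in one shot. The fourth new word depends on the first word of the same batch, which is why the vector code computes that lane with a zero placeholder (the psrldq result) and patches it afterwards by xoring in rol(unrotW[0],2). A scalar sketch of the same step over a 16-word circular buffer (prep4 is an illustrative helper, not the generator's code):

	/* Illustrative helper, not the generator's code: one PREP step in
	 * scalar form over the 16-word circular schedule W[]; t is the index
	 * of the oldest word (0, 4, 8 or 12).  The i == 3 iteration reads the
	 * word produced at i == 0, which is the dependency the vector code
	 * resolves with the rol(unrotW[0],2) fixup. */
	#include <stdint.h>

	static uint32_t rol32(uint32_t x, unsigned n)
	{
		return (x << n) | (x >> (32 - n));
	}

	static void prep4(uint32_t W[16], unsigned t)
	{
		for (unsigned i = 0; i < 4; i++) {
			uint32_t x = W[(t + i + 13) & 15] ^ W[(t + i + 8) & 15]
			           ^ W[(t + i + 2) & 15] ^ W[(t + i) & 15];
			W[(t + i) & 15] = rol32(x, 1);
		}
	}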
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 20 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 21 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 22 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 23 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 24 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 25 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 26 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 27 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 28 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 29 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 30 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 31 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) + pshufd $0xaa, %xmm7, %xmm6 +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 32 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 33 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 34 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 35 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 36 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 37 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 38 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 39 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 40 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 41 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 42 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 43 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
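The round bodies use two bit tricks for the SHA-1 selection functions: rounds 0..19 compute Ch(b,c,d) as (((c ^ d) & b) ^ d) and rounds 40..59 compute Maj(b,c,d) as ((b | c) & d) | (b & c), both needing fewer operations than the textbook forms. A small C check of the identities (illustrative only; ch_trick, maj_trick and check are made-up names):

	/* Illustrative only: the identities behind the two round tricks. */
	#include <stdint.h>
	#include <assert.h>

	static uint32_t ch_trick(uint32_t b, uint32_t c, uint32_t d)
	{
		return ((c ^ d) & b) ^ d;        /* as in rounds 0..19 */
	}

	static uint32_t maj_trick(uint32_t b, uint32_t c, uint32_t d)
	{
		return ((b | c) & d) | (b & c);  /* as in rounds 40..59 */
	}

	static void check(uint32_t b, uint32_t c, uint32_t d)
	{
		assert(ch_trick(b, c, d)  == ((b & c) | (~b & d)));          /* Ch  */
		assert(maj_trick(b, c, d) == ((b & c) | (b & d) | (c & d))); /* Maj */
	}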
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 44 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 45 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 46 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 47 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 48 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 49 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 50 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 51 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) + pshufd $0xff, %xmm7, %xmm6 +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 52 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 53 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 54 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 55 + movl %ebx, %edi # di: b + movl %ebx, %esi # si: b + orl %ecx, %edi # di: b | c + andl %ecx, %esi # si: b & c + andl %edx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebp # += ((b | c) & d) | (b & c) + addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) +# 56 + movl %eax, %edi # di: b + movl %eax, %esi # si: b + orl %ebx, %edi # di: b | c + andl %ebx, %esi # si: b & c + andl %ecx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %edx # += ((b | c) & d) | (b & c) + addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 57 + movl %ebp, %edi # di: b + movl %ebp, %esi # si: b + orl %eax, %edi # di: b | c + andl %eax, %esi # si: b & c + andl %ebx, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ecx # += ((b | c) & d) | (b & c) + addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 58 + movl %edx, %edi # di: b + movl %edx, %esi # si: b + orl %ebp, %edi # di: b | c + andl %ebp, %esi # si: b & c + andl %eax, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %ebx # += ((b | c) & d) | (b & c) + addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 59 + movl %ecx, %edi # di: b + movl %ecx, %esi # si: b + orl %edx, %edi # di: b | c + andl %edx, %esi # si: b & c + andl %ebp, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %eax # += ((b | c) & d) | (b & c) + addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) +# 60 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 61 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 62 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 63 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) +# 64 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 65 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 66 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 67 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) +# 68 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 69 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 70 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 71 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl $2, %eax # b = rotl32(b,30) +# 72 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 73 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 74 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) +# 75 + movl %ecx, %edi # c + xorl %edx, %edi # ^d + xorl %ebx, %edi # ^b + addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] + addl %edi, %ebp # e += (c ^ d ^ b) + movl %eax, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebp # e += rotl32(a,5) + rorl $2, %ebx # b = rotl32(b,30) +# 76 + movl %ebx, %edi # c + xorl %ecx, %edi # ^d + xorl %eax, %edi # ^b + addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] + addl %edi, %edx # e += (c ^ d ^ b) + movl %ebp, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %edx # e += rotl32(a,5) + rorl 
$2, %eax # b = rotl32(b,30) +# 77 + movl %eax, %edi # c + xorl %ebx, %edi # ^d + xorl %ebp, %edi # ^b + addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] + addl %edi, %ecx # e += (c ^ d ^ b) + movl %edx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ecx # e += rotl32(a,5) + rorl $2, %ebp # b = rotl32(b,30) +# 78 + movl %ebp, %edi # c + xorl %eax, %edi # ^d + xorl %edx, %edi # ^b + addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] + addl %edi, %ebx # e += (c ^ d ^ b) + movl %ecx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %ebx # e += rotl32(a,5) + rorl $2, %edx # b = rotl32(b,30) +# 79 + movl %edx, %edi # c + xorl %ebp, %edi # ^d + xorl %ecx, %edi # ^b + addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] + addl %edi, %eax # e += (c ^ d ^ b) + movl %ebx, %esi # + roll $5, %esi # rotl32(a,5) + addl %esi, %eax # e += rotl32(a,5) + rorl $2, %ecx # b = rotl32(b,30) + + popq %rdi # + popq %r12 # + addl %eax, 80(%rdi) # ctx->hash[0] += a + popq %r13 # + addl %ebx, 84(%rdi) # ctx->hash[1] += b + popq %r14 # + addl %ecx, 88(%rdi) # ctx->hash[2] += c +# popq %r15 # + addl %edx, 92(%rdi) # ctx->hash[3] += d + popq %rbx # + addl %ebp, 96(%rdi) # ctx->hash[4] += e + popq %rbp # + + ret + .size sha1_process_block64, .-sha1_process_block64 + + .section .rodata.cst16.sha1const, "aM", @progbits, 16 + .balign 16 +sha1const: + .long 0x5A827999 + .long 0x6ED9EBA1 + .long 0x8F1BBCDC + .long 0xCA62C1D6 + +#endif diff --git a/libbb/hash_sha1_x86-64.S.sh b/libbb/hash_sha1_x86-64.S.sh new file mode 100755 index 000000000..3fc125d51 --- /dev/null +++ b/libbb/hash_sha1_x86-64.S.sh @@ -0,0 +1,478 @@ +#!/bin/sh + +# We don't regenerate it on every "make" invocation - only by hand. +# The reason is that the changes to generated code are difficult +# to visualize by looking only at this script, it helps when the commit +# also contains the diff of the generated file. +exec >hash_sha1_x86-64.S + +# Based on http://arctic.org/~dean/crypto/sha1.html. +# ("This SHA1 implementation is public domain.") +# +# x86-64 has at least SSE2 vector insns always available. +# We can use them without any CPUID checks (and without a need +# for a fallback code if needed insns are not available). +# This code uses them to calculate W[] ahead of time. +# +# Unfortunately, results are passed from vector unit to +# integer ALUs on the stack. MOVD/Q insns to move them directly +# from vector to integer registers are slower than store-to-load +# forwarding in LSU (on Skylake at least). +# +# The win against a purely integer code is small on Skylake, +# only about 7-8%. We offload about 1/3 of our operations to the vector unit. +# It can do 4 ops at once in one 128-bit register, +# but we have to use x2 of them because of W[0] complication, +# SSE2 has no "rotate each word by N bits" insns, +# moving data to/from vector unit is clunky, and Skylake +# has four integer ALUs unified with three vector ALUs, +# which makes pure integer code rather fast, and makes +# vector ops compete with integer ones. +# +# Zen3, with its separate vector ALUs, wins more, about 12%. + +xmmT1="%xmm4" +xmmT2="%xmm5" +xmmRCONST="%xmm6" +xmmALLRCONST="%xmm7" +T=`printf '\t'` + +# SSE instructions are longer than 4 bytes on average. +# Intel CPUs (up to Tiger Lake at least) can't decode +# more than 16 bytes of code in one cycle. +# By interleaving SSE code and integer code +# we mostly achieve a situation where 16-byte decode fetch window +# contains 4 (or more) insns. +# +# However. 
On Skylake, there was no observed difference, +# but on Zen3, non-interleaved code is ~3% faster +# (822 Mb/s versus 795 Mb/s hashing speed). +# Off for now: +interleave=false + +INTERLEAVE() { + $interleave || \ + { + # Generate non-interleaved code + # (it should work correctly too) + echo "$1" + echo "$2" + return + } + ( + echo "$1" | grep -v '^$' >"$0.temp1" + echo "$2" | grep -v '^$' >"$0.temp2" + exec 3<"$0.temp1" + exec 4<"$0.temp2" + IFS='' + while :; do + line1='' + line2='' + while :; do + read -r line1 <&3 + if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then + break + fi + echo "$line1" + done + while :; do + read -r line2 <&4 + if test "${line2:0:4}" = "${T}lea"; then + # We use 7-8 byte long forms of LEA. + # Do not interleave them with SSE insns + # which are also long. + echo "$line2" + read -r line2 <&4 + echo "$line2" + continue + fi + if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then + break + fi + echo "$line2" + done + test "$line1$line2" || break + echo "$line1" + echo "$line2" + done + rm "$0.temp1" "$0.temp2" + ) +} + +# movaps bswap32_mask(%rip), $xmmT1 +# Load W[] to xmm0..3, byteswapping on the fly. +# For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 +# for use in RD1As instead of spilling them to stack. +# (We use rsi instead of rN because this makes two +# ADDs in two first RD1As shorter by one byte). +# movups 16*0(%rdi), %xmm0 +# pshufb $xmmT1, %xmm0 #SSSE3 insn +# movaps %xmm0, $xmmT2 +# paddd $xmmRCONST, $xmmT2 +# movq $xmmT2, %rsi +# #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn +# #movhpd $xmmT2, %r8 #can only move to mem, not to reg +# shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence +# movq $xmmT2, %r8 # instead +# ... +# +# ... +#- leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] +#+ addl %esi, %e$e # e += RCONST + W[n] +# ^^^^^^^^^^^^^^^^^^^^^^^^ +# The above is -97 bytes of code... +# ...but pshufb is a SSSE3 insn. Can't use it. + +echo \ +"### Generated by hash_sha1_x86-64.S.sh ### + +#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) +#ifdef __linux__ + .section .note.GNU-stack, \"\", @progbits +#endif + .section .text.sha1_process_block64, \"ax\", @progbits + .globl sha1_process_block64 + .hidden sha1_process_block64 + .type sha1_process_block64, @function + + .balign 8 # allow decoders to fetch at least 5 first insns +sha1_process_block64: + pushq %rbp # 1 byte insn + pushq %rbx # 1 byte insn +# pushq %r15 # 2 byte insn + pushq %r14 # 2 byte insn + pushq %r13 # 2 byte insn + pushq %r12 # 2 byte insn + pushq %rdi # we need ctx at the end + +#Register and stack use: +# eax..edx: a..d +# ebp: e +# esi,edi,r8..r14: temps +# r15: unused +# xmm0..xmm3: W[] +# xmm4,xmm5: temps +# xmm6: current round constant +# xmm7: all round constants +# -64(%rsp): area for passing RCONST + W[] from vector to integer units + + movl 80(%rdi), %eax # a = ctx->hash[0] + movl 84(%rdi), %ebx # b = ctx->hash[1] + movl 88(%rdi), %ecx # c = ctx->hash[2] + movl 92(%rdi), %edx # d = ctx->hash[3] + movl 96(%rdi), %ebp # e = ctx->hash[4] + + movaps sha1const(%rip), $xmmALLRCONST + pshufd \$0x00, $xmmALLRCONST, $xmmRCONST + + # Load W[] to xmm0..3, byteswapping on the fly. + # + # For iterations 0..15, we pass W[] in rsi,r8..r14 + # for use in RD1As instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it is probably a wash. + # (We use rsi instead of rN because this makes two + # LEAs in two first RD1As shorter by one byte). 
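	# (for the first pair below: after bswapq, %rsi holds W[0] in its high
	# dword and W[1] in its low dword; rolq \$32 swaps the halves so the
	# following movq puts W[0] in xmm0 lane 0 and W[1] in lane 1)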
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r8 + bswapq %rsi + bswapq %r8 + rolq \$32, %rsi # rsi = W[1]:W[0] + rolq \$32, %r8 # r8 = W[3]:W[2] + movq %rsi, %xmm0 + movq %r8, $xmmT1 + punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, $xmmT1 # add RCONST, spill to stack +# paddd $xmmRCONST, $xmmT1 +# movups $xmmT1, -64+16*0(%rsp) + + movq 4*4(%rdi), %r9 + movq 4*6(%rdi), %r10 + bswapq %r9 + bswapq %r10 + rolq \$32, %r9 # r9 = W[5]:W[4] + rolq \$32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 + movq %r10, $xmmT1 + punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) + + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq \$32, %r11 # r11 = W[9]:W[8] + rolq \$32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, $xmmT1 + punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) + + movq 4*12(%rdi), %r13 + movq 4*14(%rdi), %r14 + bswapq %r13 + bswapq %r14 + rolq \$32, %r13 # r13 = W[13]:W[12] + rolq \$32, %r14 # r14 = W[15]:W[14] + movq %r13, %xmm3 + movq %r14, $xmmT1 + punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) +" + +PREP() { +local xmmW0=$1 +local xmmW4=$2 +local xmmW8=$3 +local xmmW12=$4 +# the above must be %xmm0..3 in some permutation +local dstmem=$5 +#W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); +#W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); +#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); +#W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); +#W[3] ^= rol(W[0], 1); +echo "# PREP $@ + movaps $xmmW12, $xmmT1 + psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + +# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps $xmmW0, $xmmT2 + shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) + + xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps $xmmT2, $xmmW0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps $xmmW0, $xmmT2 + + xorps $xmmT1, $xmmT1 # rol(W0,1): + pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) + paddd $xmmW0, $xmmW0 # shift left by 1 + psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + + pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps $xmmT2, $xmmT1 + pslld \$2, $xmmT2 + psrld \$30, $xmmT1 +# xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) + xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 + + xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) +" +# movq $xmmW0, %r8 # high latency (~6 cycles) +# movaps $xmmW0, $xmmT1 +# psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower +# movq $xmmT1, %r10 # high latency +# movq %r8, %r9 +# movq %r10, %r11 +# shrq \$32, %r9 +# shrq \$32, %r11 +# ^^^ slower than passing the results on stack (!!!) 
+echo " + movaps $xmmW0, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movups $xmmT2, $dstmem +" +} + +# It's possible to interleave integer insns in rounds to mostly eliminate +# dependency chains, but this likely to only help old Pentium-based +# CPUs (ones without OOO, which can only simultaneously execute a pair +# of _adjacent_ insns). +# Testing on old-ish Silvermont CPU (which has OOO window of only +# about ~8 insns) shows very small (~1%) speedup. + +RD1A() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n0=$(((n+0) & 15)) +local rN=$((7+n0/2)) +echo " +# $n +";test $n0 = 0 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] + shrq \$32, %rsi +";test $n0 = 1 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] +";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] + shrq \$32, %r$rN +";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] +";echo " + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + andl %e$b, %edi # &b + xorl %e$d, %edi # (((c ^ d) & b) ^ d) + addl %edi, %e$e # e += (((c ^ d) & b) ^ d) + movl %e$a, %edi # + roll \$5, %edi # rotl32(a,5) + addl %edi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} +RD1B() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + andl %e$b, %edi # &b + xorl %e$d, %edi # (((c ^ d) & b) ^ d) + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] + addl %edi, %e$e # e += (((c ^ d) & b) ^ d) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} + +RD2() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$c, %edi # c + xorl %e$d, %edi # ^d + xorl %e$b, %edi # ^b + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] + addl %edi, %e$e # e += (c ^ d ^ b) + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} + +RD3() { +local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 +local n=$(($6)) +local n13=$(((n+13) & 15)) +local n8=$(((n+8) & 15)) +local n2=$(((n+2) & 15)) +local n0=$(((n+0) & 15)) +echo " +# $n + movl %e$b, %edi # di: b + movl %e$b, %esi # si: b + orl %e$c, %edi # di: b | c + andl %e$c, %esi # si: b & c + andl %e$d, %edi # di: (b | c) & d + orl %esi, %edi # ((b | c) & d) | (b & c) + addl %edi, %e$e # += ((b | c) & d) | (b & c) + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] + movl %e$a, %esi # + roll \$5, %esi # rotl32(a,5) + addl %esi, %e$e # e += rotl32(a,5) + rorl \$2, %e$b # b = rotl32(b,30) +" +} + +{ +# Round 1 +RCONST=0x5A827999 +RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; +RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` +INTERLEAVE "$a" "$b" +a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST" + PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 
15;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` +INTERLEAVE "$a" "$b" + +# Round 2 +RCONST=0x6ED9EBA1 +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` +INTERLEAVE "$a" "$b" +a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST" + PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` +INTERLEAVE "$a" "$b" + +# Round 3 +RCONST=0x8F1BBCDC +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` +INTERLEAVE "$a" "$b" +a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST" + PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` +INTERLEAVE "$a" "$b" + +# Round 4 has the same logic as round 2, only n and RCONST are different +RCONST=0xCA62C1D6 +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` +INTERLEAVE "$a" "$b" +RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; +RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; +} | grep -v '^$' + +echo " + popq %rdi # + popq %r12 # + addl %eax, 80(%rdi) # ctx->hash[0] += a + popq %r13 # + addl %ebx, 84(%rdi) # ctx->hash[1] += b + popq %r14 # + addl %ecx, 88(%rdi) # ctx->hash[2] += c +# popq %r15 # + addl %edx, 92(%rdi) # ctx->hash[3] += d + popq %rbx # + addl %ebp, 96(%rdi) # ctx->hash[4] += e + popq %rbp # + + ret + .size sha1_process_block64, .-sha1_process_block64 + + .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 + .balign 16 +sha1const: + .long 0x5A827999 + .long 0x6ED9EBA1 + .long 0x8F1BBCDC + .long 0xCA62C1D6 + +#endif" diff --git a/libbb/hash_sha256_hwaccel_x86-32.S b/libbb/hash_sha256_hwaccel_x86-32.S new file mode 100644 index 
000000000..a0e4a571a --- /dev/null +++ b/libbb/hash_sha256_hwaccel_x86-32.S @@ -0,0 +1,284 @@ +#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +// pshufb and palignr are SSSE3 insns. +// We do not check SSSE3 in cpuid, +// all SHA-capable CPUs support it as well. + +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha256_process_block64_shaNI, "ax", @progbits + .globl sha256_process_block64_shaNI + .hidden sha256_process_block64_shaNI + .type sha256_process_block64_shaNI, @function + +#define DATA_PTR %eax + +#define SHA256CONSTANTS %ecx + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 + +#define XMMTMP %xmm7 + +#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) + + .balign 8 # allow decoders to fetch at least 2 first insns +sha256_process_block64_shaNI: + + movu128 76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */ + movu128 76+1*16(%eax), STATE1 /* EFGH */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + mova128 STATE1, STATE0 + /* --- -------------- ABCD -- EFGH */ + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ + +/* XMMTMP holds flip mask from here... 
*/ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP + movl $K256+8*16, SHA256CONSTANTS + + /* Rounds 0-3 */ + movu128 0*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP0 + paddd 0*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 4-7 */ + movu128 1*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP1 + paddd 1*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movu128 2*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP2 + paddd 2*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movu128 3*16(DATA_PTR), MSG + pshufb XMMTMP, MSG +/* ...to here */ + mova128 MSG, MSGTMP3 + paddd 3*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + mova128 MSGTMP0, MSG + paddd 4*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + mova128 MSGTMP1, MSG + paddd 5*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + mova128 MSGTMP2, MSG + paddd 6*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + mova128 MSGTMP3, MSG + paddd 7*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + mova128 MSGTMP0, MSG + paddd 8*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + mova128 MSGTMP1, MSG + paddd 9*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + mova128 MSGTMP2, MSG + paddd 10*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + mova128 MSGTMP3, MSG + paddd 
11*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + mova128 MSGTMP0, MSG + paddd 12*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + mova128 MSGTMP1, MSG + paddd 13*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 56-59 */ + mova128 MSGTMP2, MSG + paddd 14*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 60-63 */ + mova128 MSGTMP3, MSG + paddd 15*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Write hash values back in the correct order */ + mova128 STATE0, XMMTMP +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + /* --- -------------- HGDC -- FEBA */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ + /* add current hash values to previous ones */ + movu128 76+1*16(%eax), STATE1 + paddd XMMTMP, STATE1 + movu128 STATE1, 76+1*16(%eax) + movu128 76+0*16(%eax), XMMTMP + paddd XMMTMP, STATE0 + movu128 STATE0, 76+0*16(%eax) + + ret + .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI + + .section .rodata.cst256.K256, "aM", @progbits, 256 + .balign 16 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BSWAP32_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +#endif diff --git a/libbb/hash_sha256_hwaccel_x86-64.S b/libbb/hash_sha256_hwaccel_x86-64.S new file mode 100644 index 000000000..172c2eae2 --- /dev/null +++ b/libbb/hash_sha256_hwaccel_x86-64.S @@ -0,0 +1,290 @@ +#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). 
+// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define shuf128_32 pshufd +#define shuf128_32 shufps + +// pshufb and palignr are SSSE3 insns. +// We do not check SSSE3 in cpuid, +// all SHA-capable CPUs support it as well. + +#ifdef __linux__ + .section .note.GNU-stack, "", @progbits +#endif + .section .text.sha256_process_block64_shaNI, "ax", @progbits + .globl sha256_process_block64_shaNI + .hidden sha256_process_block64_shaNI + .type sha256_process_block64_shaNI, @function + +#define DATA_PTR %rdi + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 + +#define XMMTMP %xmm7 + +#define SAVE0 %xmm8 +#define SAVE1 %xmm9 + +#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) + + .balign 8 # allow decoders to fetch at least 2 first insns +sha256_process_block64_shaNI: + + movu128 80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */ + movu128 80+1*16(%rdi), STATE1 /* EFGH */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + mova128 STATE1, STATE0 + /* --- -------------- ABCD -- EFGH */ + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ + +/* XMMTMP holds flip mask from here... */ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP + leaq K256+8*16(%rip), SHA256CONSTANTS + + /* Save hash values for addition after rounds */ + mova128 STATE0, SAVE0 + mova128 STATE1, SAVE1 + + /* Rounds 0-3 */ + movu128 0*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP0 + paddd 0*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 4-7 */ + movu128 1*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP1 + paddd 1*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movu128 2*16(DATA_PTR), MSG + pshufb XMMTMP, MSG + mova128 MSG, MSGTMP2 + paddd 2*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movu128 3*16(DATA_PTR), MSG + pshufb XMMTMP, MSG +/* ...to here */ + mova128 MSG, MSGTMP3 + paddd 3*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + mova128 MSGTMP0, MSG + paddd 4*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + mova128 MSGTMP1, MSG + paddd 5*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + 
sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + mova128 MSGTMP2, MSG + paddd 6*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + mova128 MSGTMP3, MSG + paddd 7*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + mova128 MSGTMP0, MSG + paddd 8*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + mova128 MSGTMP1, MSG + paddd 9*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + mova128 MSGTMP2, MSG + paddd 10*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + mova128 MSGTMP3, MSG + paddd 11*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + mova128 MSGTMP0, MSG + paddd 12*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + mova128 MSGTMP1, MSG + paddd 13*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 56-59 */ + mova128 MSGTMP2, MSG + paddd 14*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Rounds 60-63 */ + mova128 MSGTMP3, MSG + paddd 15*16-8*16(SHA256CONSTANTS), MSG + sha256rnds2 MSG, STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 MSG, STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd SAVE0, STATE0 + paddd SAVE1, STATE1 + + /* Write hash values back in the correct order */ + mova128 STATE0, XMMTMP +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + /* --- -------------- HGDC -- FEBA */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ + movu128 STATE0, 80+0*16(%rdi) + movu128 XMMTMP, 80+1*16(%rdi) + + ret + .size 
sha256_process_block64_shaNI, .-sha256_process_block64_shaNI + + .section .rodata.cst256.K256, "aM", @progbits, 256 + .balign 16 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 + .balign 16 +PSHUFFLE_BSWAP32_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +#endif -- cgit v1.2.3-55-g6feb
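As a reading aid for the generated SHA-1 rounds above: the PREP blocks compute
the W[] recurrence four lanes at a time, RD1A/RD1B emit rounds 0..19, RD2 emits
rounds 20..39 and 60..79, and RD3 emits rounds 40..59. The minimal scalar C
sketch below shows the same computation in one place; it is illustration only
(function and variable names are not busybox's), and it assumes the 16 message
words were already byteswapped to host order, as the assembly does on load.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* Illustrative scalar equivalent of sha1_process_block64:
 * 'hash' is the five-word state, 'W_in' the 16 message words
 * already converted to host byte order.
 */
static void sha1_block_sketch(uint32_t hash[5], const uint32_t W_in[16])
{
	static const uint32_t K[4] = {
		0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
	};
	uint32_t W[16];
	uint32_t a = hash[0], b = hash[1], c = hash[2], d = hash[3], e = hash[4];
	int n;

	for (n = 0; n < 16; n++)
		W[n] = W_in[n];

	for (n = 0; n < 80; n++) {
		uint32_t f, t;

		if (n >= 16) {
			/* the recurrence that PREP vectorizes, four W[] lanes at once */
			t = W[(n + 13) & 15] ^ W[(n + 8) & 15]
			  ^ W[(n + 2) & 15] ^ W[n & 15];
			W[n & 15] = rotl32(t, 1);
		}
		if (n < 20)
			f = ((c ^ d) & b) ^ d;          /* RD1A / RD1B */
		else if (n < 40 || n >= 60)
			f = c ^ d ^ b;                  /* RD2 (rounds 2 and 4) */
		else
			f = ((b | c) & d) | (b & c);    /* RD3 */
		t = rotl32(a, 5) + f + e + K[n / 20] + W[n & 15];
		e = d;
		d = c;
		c = rotl32(b, 30);  /* the "rorl $2" in the generated code */
		b = a;
		a = t;
	}
	hash[0] += a;
	hash[1] += b;
	hash[2] += c;
	hash[3] += d;
	hash[4] += e;
}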