From 931c55f9e2b41473132683488820c6fb7c47506b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 13 Jan 2022 12:50:48 +0100 Subject: libbb: invert the meaning of SETUP_ENV_NO_CHDIR -> SETUP_ENV_CHDIR Double negatives are hard to grok. function old new delta login_main 986 988 +2 su_main 474 470 -4 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 1/1 up/down: 2/-4) Total: -2 bytes Signed-off-by: Denys Vlasenko --- include/libbb.h | 6 +++--- libbb/setup_environment.c | 5 +++-- loginutils/login.c | 4 +++- loginutils/su.c | 7 +++---- loginutils/sulogin.c | 9 ++++++--- miscutils/crontab.c | 4 ++-- shell/ash.c | 2 +- shell/hush.c | 2 +- 8 files changed, 22 insertions(+), 17 deletions(-) diff --git a/include/libbb.h b/include/libbb.h index a0ffbef62..780e9ae7d 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -1726,7 +1726,7 @@ extern void selinux_or_die(void) FAST_FUNC; /* setup_environment: - * if !SETUP_ENV_NO_CHDIR: + * if SETUP_ENV_CHDIR: * if cd(pw->pw_dir): ok: else if SETUP_ENV_TO_TMP: cd(/tmp) else: cd(/) or die * if SETUP_ENV_CLEARENV: cd(pw->pw_dir), clear environment, then set * TERM=(old value) @@ -1734,7 +1734,7 @@ extern void selinux_or_die(void) FAST_FUNC; * PATH=bb_default_[root_]path * HOME=pw->pw_dir * SHELL=shell - * else if SETUP_ENV_CHANGEENV: + * else if SETUP_ENV_CHANGEENV | SETUP_ENV_CHANGEENV_LOGNAME: * if not root (if pw->pw_uid != 0) or if SETUP_ENV_CHANGEENV_LOGNAME: * USER=pw->pw_name, LOGNAME=pw->pw_name * HOME=pw->pw_dir @@ -1748,7 +1748,7 @@ extern void selinux_or_die(void) FAST_FUNC; #define SETUP_ENV_CHANGEENV_LOGNAME (1 << 1) #define SETUP_ENV_CLEARENV (1 << 2) #define SETUP_ENV_TO_TMP (1 << 3) -#define SETUP_ENV_NO_CHDIR (1 << 4) +#define SETUP_ENV_CHDIR (1 << 4) void setup_environment(const char *shell, int flags, const struct passwd *pw) FAST_FUNC; void nuke_str(char *str) FAST_FUNC; #if ENABLE_FEATURE_SECURETTY && !ENABLE_PAM diff --git a/libbb/setup_environment.c b/libbb/setup_environment.c index df2983958..37777204e 100644 --- a/libbb/setup_environment.c +++ b/libbb/setup_environment.c @@ -36,7 +36,7 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass /* Change the current working directory to be the home directory * of the user */ - if (!(flags & SETUP_ENV_NO_CHDIR)) { + if (flags & SETUP_ENV_CHDIR) { if (chdir(pw->pw_dir) != 0) { bb_error_msg("can't change directory to '%s'", pw->pw_dir); xchdir((flags & SETUP_ENV_TO_TMP) ? "/tmp" : "/"); @@ -59,7 +59,8 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass //xsetenv("LOGNAME", pw->pw_name); //xsetenv("HOME", pw->pw_dir); //xsetenv("SHELL", shell); - } else if (flags & SETUP_ENV_CHANGEENV) { + } else + if (flags & (SETUP_ENV_CHANGEENV|SETUP_ENV_CHANGEENV_LOGNAME)) { /* Set HOME, SHELL, and if not becoming a super-user * or if SETUP_ENV_CHANGEENV_LOGNAME, USER and LOGNAME. 
*/ if ((flags & SETUP_ENV_CHANGEENV_LOGNAME) || pw->pw_uid != 0) { diff --git a/loginutils/login.c b/loginutils/login.c index cac4349b2..332238181 100644 --- a/loginutils/login.c +++ b/loginutils/login.c @@ -564,7 +564,9 @@ int login_main(int argc UNUSED_PARAM, char **argv) change_identity(pw); setup_environment(pw->pw_shell, - (!(opt & LOGIN_OPT_p) * SETUP_ENV_CLEARENV) + SETUP_ENV_CHANGEENV, + (!(opt & LOGIN_OPT_p) * SETUP_ENV_CLEARENV) + + SETUP_ENV_CHANGEENV + + SETUP_ENV_CHDIR, pw); #if ENABLE_PAM diff --git a/loginutils/su.c b/loginutils/su.c index e1db7590f..6efe1981a 100644 --- a/loginutils/su.c +++ b/loginutils/su.c @@ -176,10 +176,9 @@ int su_main(int argc UNUSED_PARAM, char **argv) change_identity(pw); setup_environment(opt_shell, - ((flags & SU_OPT_l) / SU_OPT_l * SETUP_ENV_CLEARENV) - + (!(flags & SU_OPT_mp) * SETUP_ENV_CHANGEENV) - + (!(flags & SU_OPT_l) * SETUP_ENV_NO_CHDIR), - pw); + ((flags & SU_OPT_l) ? (SETUP_ENV_CLEARENV + SETUP_ENV_CHDIR) : 0) + + (!(flags & SU_OPT_mp) * SETUP_ENV_CHANGEENV), + pw); IF_SELINUX(set_current_security_context(NULL);) if (opt_command) { diff --git a/loginutils/sulogin.c b/loginutils/sulogin.c index c9817960c..681022acb 100644 --- a/loginutils/sulogin.c +++ b/loginutils/sulogin.c @@ -94,10 +94,13 @@ int sulogin_main(int argc UNUSED_PARAM, char **argv) shell = pwd->pw_shell; /* util-linux 2.36.1 compat: cd to root's HOME, set a few envvars */ - setup_environment(shell, SETUP_ENV_CHANGEENV | SETUP_ENV_CHANGEENV_LOGNAME, pwd); + setup_environment(shell, 0 + + SETUP_ENV_CHANGEENV_LOGNAME + + SETUP_ENV_CHDIR + , pwd); // no SETUP_ENV_CLEARENV - // SETUP_ENV_CHANGEENV[+LOGNAME] - set HOME, SHELL, USER,and LOGNAME - // no SETUP_ENV_NO_CHDIR - IOW: cd to $HOME + // SETUP_ENV_CHANGEENV_LOGNAME - set HOME, SHELL, USER,and LOGNAME + // SETUP_ENV_CHDIR - cd to $HOME /* util-linux 2.36.1 compat: steal ctty if we don't have it yet * (yes, util-linux uses force=1) */ diff --git a/miscutils/crontab.c b/miscutils/crontab.c index 411a18a50..1111f4d54 100644 --- a/miscutils/crontab.c +++ b/miscutils/crontab.c @@ -55,8 +55,8 @@ static void edit_file(const struct passwd *pas, const char *file) /* initgroups, setgid, setuid */ change_identity(pas); setup_environment(pas->pw_shell, - SETUP_ENV_CHANGEENV | SETUP_ENV_TO_TMP, - pas); + SETUP_ENV_CHANGEENV | SETUP_ENV_TO_TMP | SETUP_ENV_CHDIR, + pas); ptr = getenv("VISUAL"); if (!ptr) { ptr = getenv("EDITOR"); diff --git a/shell/ash.c b/shell/ash.c index 12b2db3a9..ca5c755b6 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -10791,7 +10791,7 @@ preadfd(void) write(STDOUT_FILENO, "^C", 2); raise(SIGINT); /* raise(SIGINT) did not work! (e.g. 
if SIGINT - * is SIG_INGed on startup, it stays SIG_IGNed) + * is SIG_IGNed on startup, it stays SIG_IGNed) */ if (trap[SIGINT]) { buf[0] = '\n'; diff --git a/shell/hush.c b/shell/hush.c index 982fc356a..7d0dc67e4 100644 --- a/shell/hush.c +++ b/shell/hush.c @@ -10361,7 +10361,7 @@ int hush_main(int argc, char **argv) //it ignores TERM: // bash -i -c 'kill $$; echo ALIVE' // ALIVE -//it resets SIG_INGed HUP to SIG_DFL: +//it resets SIG_IGNed HUP to SIG_DFL: // trap '' hup; bash -i -c 'kill -hup $$; echo ALIVE' // Hangup [the message is not printed by bash, it's the shell which started it] //is talkative about jobs and exiting: -- cgit v1.2.3-55-g6feb From c2788f88f430da8ae5fb5f293b13fc2b167ea2fe Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 13 Jan 2022 12:56:10 +0100 Subject: libbb: introduce and use chdir_or_warn() function old new delta chdir_or_warn - 37 +37 send_cgi_and_exit 720 711 -9 xchdir 27 15 -12 setup_environment 233 217 -16 fork_job 449 433 -16 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 0/4 up/down: 37/-53) Total: -16 bytes Signed-off-by: Denys Vlasenko --- include/libbb.h | 1 + libbb/setup_environment.c | 3 +-- libbb/xfuncs_printf.c | 11 +++++++++-- miscutils/crond.c | 3 +-- networking/httpd.c | 3 +-- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/include/libbb.h b/include/libbb.h index 780e9ae7d..91b456915 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -645,6 +645,7 @@ void xsetgid(gid_t gid) FAST_FUNC; void xsetuid(uid_t uid) FAST_FUNC; void xsetegid(gid_t egid) FAST_FUNC; void xseteuid(uid_t euid) FAST_FUNC; +int chdir_or_warn(const char *path) FAST_FUNC; void xchdir(const char *path) FAST_FUNC; void xfchdir(int fd) FAST_FUNC; void xchroot(const char *path) FAST_FUNC; diff --git a/libbb/setup_environment.c b/libbb/setup_environment.c index 37777204e..3549e2099 100644 --- a/libbb/setup_environment.c +++ b/libbb/setup_environment.c @@ -37,8 +37,7 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass /* Change the current working directory to be the home directory * of the user */ if (flags & SETUP_ENV_CHDIR) { - if (chdir(pw->pw_dir) != 0) { - bb_error_msg("can't change directory to '%s'", pw->pw_dir); + if (chdir_or_warn(pw->pw_dir) != 0) { xchdir((flags & SETUP_ENV_TO_TMP) ? "/tmp" : "/"); } } diff --git a/libbb/xfuncs_printf.c b/libbb/xfuncs_printf.c index fc630d176..842d10cd2 100644 --- a/libbb/xfuncs_printf.c +++ b/libbb/xfuncs_printf.c @@ -415,11 +415,18 @@ void FAST_FUNC xseteuid(uid_t euid) if (seteuid(euid)) bb_simple_perror_msg_and_die("seteuid"); } +int FAST_FUNC chdir_or_warn(const char *path) +{ + int r = chdir(path); + if (r != 0) + bb_perror_msg("can't change directory to '%s'", path); + return r; +} // Die if we can't chdir to a new path. void FAST_FUNC xchdir(const char *path) { - if (chdir(path)) - bb_perror_msg_and_die("can't change directory to '%s'", path); + if (chdir_or_warn(path) != 0) + xfunc_die(); } void FAST_FUNC xfchdir(int fd) diff --git a/miscutils/crond.c b/miscutils/crond.c index b74427351..1965af656 100644 --- a/miscutils/crond.c +++ b/miscutils/crond.c @@ -675,8 +675,7 @@ static void change_user(struct passwd *pas) { /* careful: we're after vfork! 
*/ change_identity(pas); /* - initgroups, setgid, setuid */ - if (chdir(pas->pw_dir) < 0) { - bb_error_msg("can't change directory to '%s'", pas->pw_dir); + if (chdir_or_warn(pas->pw_dir) != 0) { xchdir(CRON_DIR); } } diff --git a/networking/httpd.c b/networking/httpd.c index 33045163f..ffc58e10b 100644 --- a/networking/httpd.c +++ b/networking/httpd.c @@ -1667,8 +1667,7 @@ static void send_cgi_and_exit( script = last_slash; if (script != url) { /* paranoia */ *script = '\0'; - if (chdir(url + 1) != 0) { - bb_perror_msg("can't change directory to '%s'", url + 1); + if (chdir_or_warn(url + 1) != 0) { goto error_execing_cgi; } // not needed: *script = '/'; -- cgit v1.2.3-55-g6feb From a277506a64404e6c4472ff89c944c4f353db1c33 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 16 Jan 2022 23:54:46 +0100 Subject: shell: add comments about SIGINT-related problems Signed-off-by: Denys Vlasenko --- shell/ash.c | 13 ++++++++----- shell/shell_common.c | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/shell/ash.c b/shell/ash.c index ca5c755b6..086773dd7 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -664,7 +664,7 @@ raise_exception(int e) /* * Called when a SIGINT is received. (If the user specifies * that SIGINT is to be trapped or ignored using the trap builtin, then - * this routine is not called.) Suppressint is nonzero when interrupts + * this routine is not called.) suppress_int is nonzero when interrupts * are held using the INT_OFF macro. (The test for iflag is just * defensive programming.) */ @@ -695,13 +695,12 @@ raise_interrupt(void) } while (0) #endif -static IF_ASH_OPTIMIZE_FOR_SIZE(inline) void +static IF_NOT_ASH_OPTIMIZE_FOR_SIZE(inline) void int_on(void) { barrier(); - if (--suppress_int == 0 && pending_int) { + if (--suppress_int == 0 && pending_int) raise_interrupt(); - } } #if DEBUG_INTONOFF # define INT_ON do { \ @@ -711,7 +710,7 @@ int_on(void) #else # define INT_ON int_on() #endif -static IF_ASH_OPTIMIZE_FOR_SIZE(inline) void +static IF_NOT_ASH_OPTIMIZE_FOR_SIZE(inline) void force_int_on(void) { barrier(); @@ -10785,6 +10784,10 @@ preadfd(void) # endif reinit_unicode_for_ash(); again: +//BUG: not in INT_OFF/INT_ON section - SIGINT et al would longjmp out of read_line_input()! +//This would cause a memory leak in interactive shell +//(repeated internal allocations in read_line_input): +// (while kill -INT $$; do :; done) & nr = read_line_input(line_input_state, cmdedit_prompt, buf, IBUFSIZ); if (nr == 0) { /* ^C pressed, "convert" to SIGINT */ diff --git a/shell/shell_common.c b/shell/shell_common.c index 2e36d9208..13163acdf 100644 --- a/shell/shell_common.c +++ b/shell/shell_common.c @@ -196,6 +196,7 @@ shell_builtin_read(struct builtin_read_params *params) */ errno = 0; pfd[0].events = POLLIN; +//TODO race with a signal arriving just before the poll! 
if (poll(pfd, 1, timeout) <= 0) { /* timed out, or EINTR */ err = errno; -- cgit v1.2.3-55-g6feb From 12566e7f9b5e5c5d445bc4d36991d134b431dc6c Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 17 Jan 2022 03:02:40 +0100 Subject: ash,hush: fix handling of SIGINT while waiting for interactive input function old new delta lineedit_read_key 160 237 +77 __pgetc 522 589 +67 fgetc_interactive 244 309 +65 safe_read_key - 39 +39 read_key 588 607 +19 record_pending_signo 23 32 +9 signal_handler 75 81 +6 .rodata 104312 104309 -3 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 6/1 up/down: 282/-3) Total: 279 bytes Signed-off-by: Denys Vlasenko --- editors/vi.c | 4 ++-- include/libbb.h | 5 +++- libbb/lineedit.c | 24 ++++++++++++++++--- libbb/read_key.c | 16 +++++++++++-- miscutils/hexedit.c | 2 +- miscutils/less.c | 4 ++-- procps/top.c | 2 +- shell/ash.c | 39 ++++++++++++++++++++++++------- shell/hush.c | 67 +++++++++++++++++++++++++++++++++++++---------------- 9 files changed, 122 insertions(+), 41 deletions(-) diff --git a/editors/vi.c b/editors/vi.c index 3dbe5b471..d37cd48a3 100644 --- a/editors/vi.c +++ b/editors/vi.c @@ -1122,7 +1122,7 @@ static int readit(void) // read (maybe cursor) key from stdin // on nonblocking stdin. // Note: read_key sets errno to 0 on success. again: - c = read_key(STDIN_FILENO, readbuffer, /*timeout:*/ -1); + c = safe_read_key(STDIN_FILENO, readbuffer, /*timeout:*/ -1); if (c == -1) { // EOF/error if (errno == EAGAIN) // paranoia goto again; @@ -4770,7 +4770,7 @@ static void edit_file(char *fn) uint64_t k; write1(ESC"[999;999H" ESC"[6n"); fflush_all(); - k = read_key(STDIN_FILENO, readbuffer, /*timeout_ms:*/ 100); + k = safe_read_key(STDIN_FILENO, readbuffer, /*timeout_ms:*/ 100); if ((int32_t)k == KEYCODE_CURSOR_POS) { uint32_t rc = (k >> 32); columns = (rc & 0x7fff); diff --git a/include/libbb.h b/include/libbb.h index 91b456915..b45ce91c5 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -1908,6 +1908,8 @@ enum { * >=0: poll() for TIMEOUT milliseconds, return -1/EAGAIN on timeout */ int64_t read_key(int fd, char *buffer, int timeout) FAST_FUNC; +/* This version loops on EINTR: */ +int64_t safe_read_key(int fd, char *buffer, int timeout) FAST_FUNC; void read_key_ungets(char *buffer, const char *str, unsigned len) FAST_FUNC; @@ -1961,7 +1963,8 @@ enum { USERNAME_COMPLETION = 4 * ENABLE_FEATURE_USERNAME_COMPLETION, VI_MODE = 8 * ENABLE_FEATURE_EDITING_VI, WITH_PATH_LOOKUP = 0x10, - FOR_SHELL = DO_HISTORY | TAB_COMPLETION | USERNAME_COMPLETION, + LI_INTERRUPTIBLE = 0x20, + FOR_SHELL = DO_HISTORY | TAB_COMPLETION | USERNAME_COMPLETION | LI_INTERRUPTIBLE, }; line_input_t *new_line_input_t(int flags) FAST_FUNC; #if ENABLE_FEATURE_EDITING_SAVEHISTORY diff --git a/libbb/lineedit.c b/libbb/lineedit.c index e14c78707..f76afd37d 100644 --- a/libbb/lineedit.c +++ b/libbb/lineedit.c @@ -2161,12 +2161,30 @@ static int lineedit_read_key(char *read_key_buffer, int timeout) * insist on full MB_CUR_MAX buffer to declare input like * "\xff\n",pause,"ls\n" invalid and thus won't lose "ls". * + * If LI_INTERRUPTIBLE, return -1 if got EINTR in poll() + * inside read_key, or if bb_got_signal != 0 (IOW: if signal + * arrived before poll() is reached). + * * Note: read_key sets errno to 0 on success. 
*/ - IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;) - ic = read_key(STDIN_FILENO, read_key_buffer, timeout); - IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;) + do { + if ((state->flags & LI_INTERRUPTIBLE) && bb_got_signal) { + errno = EINTR; + return -1; + } +//FIXME: still races here with signals, but small window to poll() inside read_key + IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;) + ic = read_key(STDIN_FILENO, read_key_buffer, timeout); + IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;) + } while (!(state->flags & LI_INTERRUPTIBLE) && errno == EINTR); + if (errno) { + /* LI_INTERRUPTIBLE can bail out with EINTR here, + * but nothing really guarantees that bb_got_signal + * is nonzero. Follow the least surprise principle: + */ + if (errno == EINTR && bb_got_signal == 0) + bb_got_signal = 255; /* something nonzero */ #if ENABLE_UNICODE_SUPPORT if (errno == EAGAIN && unicode_idx != 0) goto pushback; diff --git a/libbb/read_key.c b/libbb/read_key.c index 03b7da656..829ae215c 100644 --- a/libbb/read_key.c +++ b/libbb/read_key.c @@ -126,7 +126,10 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout) * if fd can be in non-blocking mode. */ if (timeout >= -1) { - if (safe_poll(&pfd, 1, timeout) == 0) { + n = poll(&pfd, 1, timeout); + if (n < 0 && errno == EINTR) + return n; + if (n == 0) { /* Timed out */ errno = EAGAIN; return -1; @@ -138,7 +141,7 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout) * When we were reading 3 bytes here, we were eating * "li" too, and cat was getting wrong input. */ - n = safe_read(fd, buffer, 1); + n = read(fd, buffer, 1); if (n <= 0) return -1; } @@ -284,6 +287,15 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout) goto start_over; } +int64_t FAST_FUNC safe_read_key(int fd, char *buffer, int timeout) +{ + int64_t r; + do { + r = read_key(fd, buffer, timeout); + } while (errno == EINTR); + return r; +} + void FAST_FUNC read_key_ungets(char *buffer, const char *str, unsigned len) { unsigned cur_len = (unsigned char)buffer[0]; diff --git a/miscutils/hexedit.c b/miscutils/hexedit.c index f8ff9b62b..15ad78377 100644 --- a/miscutils/hexedit.c +++ b/miscutils/hexedit.c @@ -292,7 +292,7 @@ int hexedit_main(int argc UNUSED_PARAM, char **argv) fflush_all(); G.in_read_key = 1; if (!bb_got_signal) - key = read_key(STDIN_FILENO, G.read_key_buffer, -1); + key = safe_read_key(STDIN_FILENO, G.read_key_buffer, -1); G.in_read_key = 0; if (bb_got_signal) key = CTRL('X'); diff --git a/miscutils/less.c b/miscutils/less.c index 82c4b21f0..8a0525cb7 100644 --- a/miscutils/less.c +++ b/miscutils/less.c @@ -1137,9 +1137,9 @@ static int64_t getch_nowait(void) #endif } - /* We have kbd_fd in O_NONBLOCK mode, read inside read_key() + /* We have kbd_fd in O_NONBLOCK mode, read inside safe_read_key() * would not block even if there is no input available */ - key64 = read_key(kbd_fd, kbd_input, /*timeout off:*/ -2); + key64 = safe_read_key(kbd_fd, kbd_input, /*timeout off:*/ -2); if ((int)key64 == -1) { if (errno == EAGAIN) { /* No keyboard input available. 
Since poll() did return, diff --git a/procps/top.c b/procps/top.c index 4cd545c69..804d6f258 100644 --- a/procps/top.c +++ b/procps/top.c @@ -913,7 +913,7 @@ static unsigned handle_input(unsigned scan_mask, duration_t interval) while (1) { int32_t c; - c = read_key(STDIN_FILENO, G.kbd_input, interval * 1000); + c = safe_read_key(STDIN_FILENO, G.kbd_input, interval * 1000); if (c == -1 && errno != EAGAIN) { /* error/EOF */ option_mask32 |= OPT_EOF; diff --git a/shell/ash.c b/shell/ash.c index 086773dd7..55df54bd0 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -3679,7 +3679,9 @@ signal_handler(int signo) if (!trap[SIGCHLD]) return; } - +#if ENABLE_FEATURE_EDITING + bb_got_signal = signo; /* for read_line_input: "we got a signal" */ +#endif gotsig[signo - 1] = 1; pending_sig = signo; @@ -10784,33 +10786,52 @@ preadfd(void) # endif reinit_unicode_for_ash(); again: -//BUG: not in INT_OFF/INT_ON section - SIGINT et al would longjmp out of read_line_input()! -//This would cause a memory leak in interactive shell -//(repeated internal allocations in read_line_input): -// (while kill -INT $$; do :; done) & + /* For shell, LI_INTERRUPTIBLE is set: + * read_line_input will abort on either + * getting EINTR in poll(), or if it sees bb_got_signal != 0 + * (IOW: if signal arrives before poll() is reached). + * Interactive testcases: + * (while kill -INT $$; do sleep 1; done) & + * #^^^ prints ^C, prints prompt, repeats + * trap 'echo I' int; (while kill -INT $$; do sleep 1; done) & + * #^^^ prints ^C, prints "I", prints prompt, repeats + * trap 'echo T' term; (while kill $$; do sleep 1; done) & + * #^^^ prints "T", prints prompt, repeats + * #(bash 5.0.17 exits after first "T", looks like a bug) + */ + bb_got_signal = 0; + INT_OFF; /* no longjmp'ing out of read_line_input please */ nr = read_line_input(line_input_state, cmdedit_prompt, buf, IBUFSIZ); + if (bb_got_signal == SIGINT) + write(STDOUT_FILENO, "^C\n", 3); + INT_ON; /* here non-blocked SIGINT will longjmp */ if (nr == 0) { /* ^C pressed, "convert" to SIGINT */ - write(STDOUT_FILENO, "^C", 2); - raise(SIGINT); + write(STDOUT_FILENO, "^C\n", 3); + raise(SIGINT); /* here non-blocked SIGINT will longjmp */ /* raise(SIGINT) did not work! (e.g. if SIGINT * is SIG_IGNed on startup, it stays SIG_IGNed) */ if (trap[SIGINT]) { + empty_line_input: buf[0] = '\n'; buf[1] = '\0'; return 1; } exitstatus = 128 + SIGINT; /* bash behavior on ^C + ignored SIGINT: */ - write(STDOUT_FILENO, "\n", 1); goto again; } if (nr < 0) { if (errno == 0) { - /* Ctrl+D pressed */ + /* ^D pressed */ nr = 0; } + else if (errno == EINTR) { /* got signal? 
*/ + if (bb_got_signal != SIGINT) + write(STDOUT_FILENO, "\n", 1); + goto empty_line_input; + } # if ENABLE_ASH_IDLE_TIMEOUT else if (errno == EAGAIN && timeout > 0) { puts("\007timed out waiting for input: auto-logout"); diff --git a/shell/hush.c b/shell/hush.c index 7d0dc67e4..6dc2ecaac 100644 --- a/shell/hush.c +++ b/shell/hush.c @@ -918,6 +918,7 @@ struct globals { #if ENABLE_HUSH_INTERACTIVE smallint promptmode; /* 0: PS1, 1: PS2 */ #endif + /* set by signal handler if SIGINT is received _and_ its trap is not set */ smallint flag_SIGINT; #if ENABLE_HUSH_LOOPS smallint flag_break_continue; @@ -1944,6 +1945,9 @@ enum { static void record_pending_signo(int sig) { sigaddset(&G.pending_set, sig); +#if ENABLE_FEATURE_EDITING + bb_got_signal = sig; /* for read_line_input: "we got a signal" */ +#endif #if ENABLE_HUSH_FAST if (sig == SIGCHLD) { G.count_SIGCHLD++; @@ -2652,30 +2656,53 @@ static int get_user_input(struct in_str *i) for (;;) { reinit_unicode_for_hush(); G.flag_SIGINT = 0; - /* buglet: SIGINT will not make new prompt to appear _at once_, - * only after . (^C works immediately) */ - r = read_line_input(G.line_input_state, prompt_str, + + bb_got_signal = 0; + if (!sigisemptyset(&G.pending_set)) { + /* Whoops, already got a signal, do not call read_line_input */ + bb_got_signal = r = -1; + } else { + /* For shell, LI_INTERRUPTIBLE is set: + * read_line_input will abort on either + * getting EINTR in poll(), or if it sees bb_got_signal != 0 + * (IOW: if signal arrives before poll() is reached). + * Interactive testcases: + * (while kill -INT $$; do sleep 1; done) & + * #^^^ prints ^C, prints prompt, repeats + * trap 'echo I' int; (while kill -INT $$; do sleep 1; done) & + * #^^^ prints ^C, prints "I", prints prompt, repeats + * trap 'echo T' term; (while kill $$; do sleep 1; done) & + * #^^^ prints "T", prints prompt, repeats + * #(bash 5.0.17 exits after first "T", looks like a bug) + */ + r = read_line_input(G.line_input_state, prompt_str, G.user_input_buf, CONFIG_FEATURE_EDITING_MAX_LEN-1 - ); - /* read_line_input intercepts ^C, "convert" it to SIGINT */ - if (r == 0) { - raise(SIGINT); + ); + /* read_line_input intercepts ^C, "convert" it to SIGINT */ + if (r == 0) + raise(SIGINT); + } + /* bash prints ^C (before running a trap, if any) + * both on keyboard ^C and on real SIGINT (non-kbd generated). + */ + if (sigismember(&G.pending_set, SIGINT)) { + write(STDOUT_FILENO, "^C\n", 3); + G.last_exitcode = 128 | SIGINT; } check_and_run_traps(); - if (r != 0 && !G.flag_SIGINT) + if (r == 0) /* keyboard ^C? */ + continue; /* go back, read another input line */ + if (r > 0) /* normal input? 
(no ^C, no ^D, no signals) */ break; - /* ^C or SIGINT: repeat */ - /* bash prints ^C even on real SIGINT (non-kbd generated) */ - write(STDOUT_FILENO, "^C\n", 3); - G.last_exitcode = 128 | SIGINT; - } - if (r < 0) { - /* EOF/error detected */ - /* ^D on interactive input goes to next line before exiting: */ - write(STDOUT_FILENO, "\n", 1); - i->p = NULL; - i->peek_buf[0] = r = EOF; - return r; + if (!bb_got_signal) { + /* r < 0: ^D/EOF/error detected (but not signal) */ + /* ^D on interactive input goes to next line before exiting: */ + write(STDOUT_FILENO, "\n", 1); + i->p = NULL; + i->peek_buf[0] = r = EOF; + return r; + } + /* it was a signal: go back, read another input line */ } i->p = G.user_input_buf; return (unsigned char)*i->p++; -- cgit v1.2.3-55-g6feb From 8ad2acf352d790d0bdd792b8e126d58a088451f3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 17 Jan 2022 23:59:46 +0100 Subject: fix "defined but not used" warnings Signed-off-by: Denys Vlasenko --- archival/libarchive/get_header_tar.c | 2 ++ miscutils/i2c_tools.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/archival/libarchive/get_header_tar.c b/archival/libarchive/get_header_tar.c index d26868bf8..cc6f3f0ad 100644 --- a/archival/libarchive/get_header_tar.c +++ b/archival/libarchive/get_header_tar.c @@ -147,11 +147,13 @@ static void process_pax_hdr(archive_handle_t *archive_handle, unsigned sz, int g #endif } +#if ENABLE_FEATURE_TAR_GNU_EXTENSIONS static void die_if_bad_fnamesize(off_t sz) { if ((uoff_t)sz > 0xfff) /* more than 4k?! no funny business please */ bb_simple_error_msg_and_die("bad archive"); } +#endif char FAST_FUNC get_header_tar(archive_handle_t *archive_handle) { diff --git a/miscutils/i2c_tools.c b/miscutils/i2c_tools.c index e3741eeba..da26f5e19 100644 --- a/miscutils/i2c_tools.c +++ b/miscutils/i2c_tools.c @@ -120,6 +120,7 @@ static int32_t i2c_smbus_access(int fd, char read_write, uint8_t cmd, return ioctl(fd, I2C_SMBUS, &args); } +#if ENABLE_I2CGET || ENABLE_I2CSET || ENABLE_I2CDUMP || ENABLE_I2CDETECT static int32_t i2c_smbus_read_byte(int fd) { union i2c_smbus_data data; @@ -131,6 +132,7 @@ static int32_t i2c_smbus_read_byte(int fd) return data.byte; } +#endif #if ENABLE_I2CGET || ENABLE_I2CSET || ENABLE_I2CDUMP static int32_t i2c_smbus_write_byte(int fd, uint8_t val) -- cgit v1.2.3-55-g6feb From 1e825acf8d715fe49af040cb02f9e96c26955832 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 18 Jan 2022 00:31:27 +0100 Subject: libbb: shrink lineedit_read_key() function old new delta lineedit_read_key 237 231 -6 Signed-off-by: Denys Vlasenko --- archival/libarchive/decompress_bunzip2.c | 2 +- coreutils/head.c | 6 +++--- editors/patch.c | 2 +- editors/patch_toybox.c | 2 +- include/libbb.h | 2 ++ libbb/lineedit.c | 26 ++++++++++++++++---------- libbb/read_key.c | 1 + 7 files changed, 25 insertions(+), 16 deletions(-) diff --git a/archival/libarchive/decompress_bunzip2.c b/archival/libarchive/decompress_bunzip2.c index 42e2b4f88..4a2b668aa 100644 --- a/archival/libarchive/decompress_bunzip2.c +++ b/archival/libarchive/decompress_bunzip2.c @@ -654,7 +654,7 @@ static int read_bunzip(bunzip_data *bd, char *outbuf, int len) /* Subtract the 1 copy we'd output anyway to get extras */ --bd->writeCopies; } - } /* for(;;) */ + } /* for (;;) */ /* Decompression of this input block completed successfully */ bd->writeCRC = CRC = ~CRC; diff --git a/coreutils/head.c b/coreutils/head.c index 9586f869f..c7537a20e 100644 --- a/coreutils/head.c +++ b/coreutils/head.c @@ -76,7 +76,7 @@ 
print_except_N_last_bytes(FILE *fp, unsigned count) { unsigned char *circle = xmalloc(++count); unsigned head = 0; - for(;;) { + for (;;) { int c; c = getc(fp); if (c == EOF) @@ -105,7 +105,7 @@ print_except_N_last_lines(FILE *fp, unsigned count) { char **circle = xzalloc((++count) * sizeof(circle[0])); unsigned head = 0; - for(;;) { + for (;;) { char *c; c = xmalloc_fgets(fp); if (!c) @@ -127,7 +127,7 @@ print_except_N_last_lines(FILE *fp, unsigned count) } ret: head = 0; - for(;;) { + for (;;) { free(circle[head++]); if (head == count) break; diff --git a/editors/patch.c b/editors/patch.c index 110176630..aebb5073e 100644 --- a/editors/patch.c +++ b/editors/patch.c @@ -418,7 +418,7 @@ int patch_main(int argc UNUSED_PARAM, char **argv) } // Loop through the lines in the patch - for(;;) { + for (;;) { char *patchline; patchline = xmalloc_fgetline(stdin); diff --git a/editors/patch_toybox.c b/editors/patch_toybox.c index aebab8132..69a508b2e 100644 --- a/editors/patch_toybox.c +++ b/editors/patch_toybox.c @@ -441,7 +441,7 @@ int patch_main(int argc UNUSED_PARAM, char **argv) TT.filein = TT.fileout = -1; // Loop through the lines in the patch - for(;;) { + for (;;) { char *patchline; patchline = get_line(TT.filepatch); diff --git a/include/libbb.h b/include/libbb.h index b45ce91c5..8e3b7ae8e 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -1900,6 +1900,8 @@ enum { * (unless fd is in non-blocking mode), * subsequent reads will time out after a few milliseconds. * Return of -1 means EOF or error (errno == 0 on EOF). + * Nonzero errno is not preserved across the call: + * if there was no error, errno will be cleared to 0. * buffer[0] is used as a counter of buffered chars and must be 0 * on first call. * timeout: diff --git a/libbb/lineedit.c b/libbb/lineedit.c index f76afd37d..82624757e 100644 --- a/libbb/lineedit.c +++ b/libbb/lineedit.c @@ -2155,7 +2155,7 @@ static int lineedit_read_key(char *read_key_buffer, int timeout) #endif fflush_all(); - while (1) { + for (;;) { /* Wait for input. TIMEOUT = -1 makes read_key wait even * on nonblocking stdin, TIMEOUT = 50 makes sure we won't * insist on full MB_CUR_MAX buffer to declare input like @@ -2167,24 +2167,30 @@ static int lineedit_read_key(char *read_key_buffer, int timeout) * * Note: read_key sets errno to 0 on success. */ - do { + for (;;) { if ((state->flags & LI_INTERRUPTIBLE) && bb_got_signal) { errno = EINTR; return -1; } //FIXME: still races here with signals, but small window to poll() inside read_key IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;) + /* errno = 0; - read_key does this itself */ ic = read_key(STDIN_FILENO, read_key_buffer, timeout); IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;) - } while (!(state->flags & LI_INTERRUPTIBLE) && errno == EINTR); + if (errno != EINTR) + break; + if (state->flags & LI_INTERRUPTIBLE) { + /* LI_INTERRUPTIBLE bails out on EINTR, + * but nothing really guarantees that bb_got_signal + * is nonzero. Follow the least surprise principle: + */ + if (bb_got_signal == 0) + bb_got_signal = 255; + goto ret; + } + } if (errno) { - /* LI_INTERRUPTIBLE can bail out with EINTR here, - * but nothing really guarantees that bb_got_signal - * is nonzero. 
Follow the least surprise principle: - */ - if (errno == EINTR && bb_got_signal == 0) - bb_got_signal = 255; /* something nonzero */ #if ENABLE_UNICODE_SUPPORT if (errno == EAGAIN && unicode_idx != 0) goto pushback; @@ -2251,7 +2257,7 @@ static int lineedit_read_key(char *read_key_buffer, int timeout) #endif break; } - + ret: return ic; } diff --git a/libbb/read_key.c b/libbb/read_key.c index 829ae215c..cf8ed411e 100644 --- a/libbb/read_key.c +++ b/libbb/read_key.c @@ -291,6 +291,7 @@ int64_t FAST_FUNC safe_read_key(int fd, char *buffer, int timeout) { int64_t r; do { + /* errno = 0; - read_key does this itself */ r = read_key(fd, buffer, timeout); } while (errno == EINTR); return r; -- cgit v1.2.3-55-g6feb From 39369ff460f3e2dbfec7f6be181b2fb98f3c1867 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 23 Jan 2022 09:27:30 +0100 Subject: libbb/sha1: use SSE2 in unrolled x86-64 code. ~10% faster function old new delta .rodata 108241 108305 +64 sha1_process_block64 3502 3495 -7 ------------------------------------------------------------------------------ (add/remove: 5/0 grow/shrink: 1/1 up/down: 64/-7) Total: 57 bytes Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-64.S | 992 +++++++++++++++++++++++------------------ libbb/hash_md5_sha_x86-64.S.sh | 440 ++++++++++++------ 2 files changed, 854 insertions(+), 578 deletions(-) diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 87fb616a1..069a18719 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -20,16 +20,10 @@ sha1_process_block64: # eax..edx: a..d # ebp: e # esi,edi: temps -# -32+4*n(%rsp),r8...r15: W[0..7,8..15] -# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) - movl $3, %eax -1: - movq (%rdi,%rax,8), %rsi - bswapq %rsi - rolq $32, %rsi - movq %rsi, -32(%rsp,%rax,8) - decl %eax - jns 1b +# xmm0..xmm3: W[] +# xmm4,xmm5: temps +# xmm6: current round constant +# -64(%rsp): area for passing RCONST + W[] from vector to integer units movl 80(%rdi), %eax # a = ctx->hash[0] movl 84(%rdi), %ebx # b = ctx->hash[1] @@ -37,587 +31,709 @@ sha1_process_block64: movl 92(%rdi), %edx # d = ctx->hash[3] movl 96(%rdi), %ebp # e = ctx->hash[4] + movaps rconst0x5A827999(%rip), %xmm6 + + # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 + # instead of spilling them to stack. + # (We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so...) 
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r10 + bswapq %rsi + bswapq %r10 + rolq $32, %rsi # rsi = W[1]:W[0] + rolq $32, %r10 + movq %rsi, %xmm0 + movq %r10, %xmm4 + punpcklqdq %xmm4, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) + movaps %xmm0, %xmm4 + paddd %xmm6, %xmm4 + movups %xmm4, -64+4*0(%rsp) + + movq 4*4(%rdi), %r8 + movq 4*6(%rdi), %r10 + bswapq %r8 + bswapq %r10 + rolq $32, %r8 + rolq $32, %r10 + movq %r8, %xmm1 + movq %r10, %xmm4 + punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) + movaps %xmm1, %xmm4 + paddd %xmm6, %xmm4 + movups %xmm4, -64+4*4(%rsp) + movq 4*8(%rdi), %r8 movq 4*10(%rdi), %r10 bswapq %r8 bswapq %r10 + movl %r8d, %r9d # r9d = W[9] + rolq $32, %r8 # r8 = W[9]:W[8] + movl %r10d, %r11d # r11d = W[11] + rolq $32, %r10 # r10 = W[11]:W[10] + movq %r8, %xmm2 + movq %r10, %xmm4 + punpcklqdq %xmm4, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) + movq 4*12(%rdi), %r12 movq 4*14(%rdi), %r14 bswapq %r12 bswapq %r14 - movl %r8d, %r9d - shrq $32, %r8 - movl %r10d, %r11d - shrq $32, %r10 - movl %r12d, %r13d - shrq $32, %r12 - movl %r14d, %r15d - shrq $32, %r14 + movl %r12d, %r13d # r13d = W[13] + rolq $32, %r12 # r12 = W[13]:W[12] + movl %r14d, %r15d # r15d = W[15] + rolq $32, %r14 # r14 = W[15]:W[14] + movq %r12, %xmm3 + movq %r14, %xmm4 + punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) # 0 - # W[0], already in %esi + leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] addl %edi, %ebp # e += (((c ^ d) & b) ^ d) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 1 - movl -32+4*1(%rsp), %esi # W[n] + addl -64+4*1(%rsp), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] addl %edi, %edx # e += (((c ^ d) & b) ^ d) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 2 - movl -32+4*2(%rsp), %esi # W[n] + addl -64+4*2(%rsp), %ecx # e += RCONST + W[n] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] addl %edi, %ecx # e += (((c ^ d) & b) ^ d) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 3 - movl -32+4*3(%rsp), %esi # W[n] + addl -64+4*3(%rsp), %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n] addl %edi, %ebx # e += (((c ^ d) & b) ^ d) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 4 - movl -32+4*4(%rsp), %esi # W[n] + addl -64+4*4(%rsp), %eax # e += RCONST + W[n] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n] addl %edi, %eax # e += (((c ^ d) & b) ^ d) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 5 - movl -32+4*5(%rsp), %esi # W[n] + addl -64+4*5(%rsp), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) - leal 
0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] addl %edi, %ebp # e += (((c ^ d) & b) ^ d) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 6 - movl -32+4*6(%rsp), %esi # W[n] + addl -64+4*6(%rsp), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] addl %edi, %edx # e += (((c ^ d) & b) ^ d) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 7 - movl -32+4*7(%rsp), %esi # W[n] + addl -64+4*7(%rsp), %ecx # e += RCONST + W[n] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] addl %edi, %ecx # e += (((c ^ d) & b) ^ d) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) # 8 - # W[n], in %r8 + leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] addl %edi, %ebx # e += (((c ^ d) & b) ^ d) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 9 - # W[n], in %r9 + leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] addl %edi, %eax # e += (((c ^ d) & b) ^ d) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 10 - # W[n], in %r10 + leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] addl %edi, %ebp # e += (((c ^ d) & b) ^ d) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 11 - # W[n], in %r11 + leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rdx,%r11), %edx # e += 
RCONST + W[n] addl %edi, %edx # e += (((c ^ d) & b) ^ d) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) + movaps rconst0x6ED9EBA1(%rip), %xmm6 +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) # 12 - # W[n], in %r12 + leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] addl %edi, %ecx # e += (((c ^ d) & b) ^ d) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 13 - # W[n], in %r13 + leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] addl %edi, %ebx # e += (((c ^ d) & b) ^ d) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 14 - # W[n], in %r14 + leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] addl %edi, %eax # e += (((c ^ d) & b) ^ d) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 15 - # W[n], in %r15 + leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] addl %edi, %ebp # e += (((c ^ d) & b) ^ d) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd 
%xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) # 16 - movl %r13d, %esi # W[(n+13) & 15] - xorl %r8d, %esi # ^W[(n+8) & 15] - xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*0(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*0(%rsp) # store to W[n & 15] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n & 15] + addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (((c ^ d) & b) ^ d) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 17 - movl %r14d, %esi # W[(n+13) & 15] - xorl %r9d, %esi # ^W[(n+8) & 15] - xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*1(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*1(%rsp) # store to W[n & 15] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] + addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (((c ^ d) & b) ^ d) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 18 - movl %r15d, %esi # W[(n+13) & 15] - xorl %r10d, %esi # ^W[(n+8) & 15] - xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*2(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*2(%rsp) # store to W[n & 15] movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (((c ^ d) & b) ^ d) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 19 - movl -32+4*0(%rsp), %esi # W[(n+13) & 15] - xorl %r11d, %esi # ^W[(n+8) & 15] - xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*3(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*3(%rsp) # store to W[n & 15] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) - leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (((c ^ d) & b) ^ d) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # 
shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) # 20 - movl -32+4*1(%rsp), %esi # W[(n+13) & 15] - xorl %r12d, %esi # ^W[(n+8) & 15] - xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*4(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*4(%rsp) # store to W[n & 15] movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] + addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 21 - movl -32+4*2(%rsp), %esi # W[(n+13) & 15] - xorl %r13d, %esi # ^W[(n+8) & 15] - xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*5(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*5(%rsp) # store to W[n & 15] movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] + addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 22 - movl -32+4*3(%rsp), %esi # W[(n+13) & 15] - xorl %r14d, %esi # ^W[(n+8) & 15] - xorl %r8d, %esi # ^W[(n+2) & 15] - xorl -32+4*6(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*6(%rsp) # store to W[n & 15] movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] + addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 23 - movl -32+4*4(%rsp), %esi # W[(n+13) & 15] - xorl %r15d, %esi # ^W[(n+8) & 15] - xorl %r9d, %esi # ^W[(n+2) & 15] - xorl -32+4*7(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*7(%rsp) # store to W[n & 15] movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) 
+ movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) # 24 - xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] - xorl %r10d, %r8d # ^W[(n+2) & 15] - roll %r8d # movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal 0x6ED9EBA1(%rax,%r8), %eax # e += RCONST + W[n & 15] + addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 25 - xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] - xorl %r11d, %r9d # ^W[(n+2) & 15] - roll %r9d # movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal 0x6ED9EBA1(%rbp,%r9), %ebp # e += RCONST + W[n & 15] + addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 26 - xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] - xorl %r12d, %r10d # ^W[(n+2) & 15] - roll %r10d # movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal 0x6ED9EBA1(%rdx,%r10), %edx # e += RCONST + W[n & 15] + addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 27 - xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] - xorl %r13d, %r11d # ^W[(n+2) & 15] - roll %r11d # movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal 0x6ED9EBA1(%rcx,%r11), %ecx # e += RCONST + W[n & 15] + addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) # 28 - xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] - xorl %r14d, %r12d # ^W[(n+2) & 15] - 
roll %r12d # movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 0x6ED9EBA1(%rbx,%r12), %ebx # e += RCONST + W[n & 15] + addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 29 - xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] - xorl %r15d, %r13d # ^W[(n+2) & 15] - roll %r13d # movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal 0x6ED9EBA1(%rax,%r13), %eax # e += RCONST + W[n & 15] + addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 30 - xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] - xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] - roll %r14d # movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal 0x6ED9EBA1(%rbp,%r14), %ebp # e += RCONST + W[n & 15] + addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 31 - xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] - xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] - roll %r15d # movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal 0x6ED9EBA1(%rdx,%r15), %edx # e += RCONST + W[n & 15] + addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) + movaps rconst0x8F1BBCDC(%rip), %xmm6 +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) # 32 - movl %r13d, %esi # W[(n+13) & 15] - xorl %r8d, %esi # ^W[(n+8) & 15] - xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*0(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*0(%rsp) # store to W[n & 15] movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] + addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 33 - movl %r14d, 
%esi # W[(n+13) & 15] - xorl %r9d, %esi # ^W[(n+8) & 15] - xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*1(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*1(%rsp) # store to W[n & 15] movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 34 - movl %r15d, %esi # W[(n+13) & 15] - xorl %r10d, %esi # ^W[(n+8) & 15] - xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*2(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*2(%rsp) # store to W[n & 15] movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 35 - movl -32+4*0(%rsp), %esi # W[(n+13) & 15] - xorl %r11d, %esi # ^W[(n+8) & 15] - xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*3(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*3(%rsp) # store to W[n & 15] movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] + addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) # 36 - movl -32+4*1(%rsp), %esi # W[(n+13) & 15] - xorl %r12d, %esi # ^W[(n+8) & 15] - xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*4(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*4(%rsp) # store to W[n & 15] movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] + addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 37 - movl -32+4*2(%rsp), %esi # W[(n+13) & 15] - xorl %r13d, %esi # ^W[(n+8) & 15] - xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*5(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, 
-32+4*5(%rsp) # store to W[n & 15] movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] + addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 38 - movl -32+4*3(%rsp), %esi # W[(n+13) & 15] - xorl %r14d, %esi # ^W[(n+8) & 15] - xorl %r8d, %esi # ^W[(n+2) & 15] - xorl -32+4*6(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*6(%rsp) # store to W[n & 15] movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 39 - movl -32+4*4(%rsp), %esi # W[(n+13) & 15] - xorl %r15d, %esi # ^W[(n+8) & 15] - xorl %r9d, %esi # ^W[(n+2) & 15] - xorl -32+4*7(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*7(%rsp) # store to W[n & 15] movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) # 40 movl %ebx, %edi # di: b movl %ebx, %esi # si: b @@ -625,12 +741,8 @@ sha1_process_block64: andl %ecx, %esi # si: b & c andl %edx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] - xorl %r10d, %r8d # ^W[(n+2) & 15] - roll %r8d # addl %edi, %ebp # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbp,%r8), %ebp # e += RCONST + W[n & 15] + addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) @@ -642,12 +754,8 @@ sha1_process_block64: andl %ebx, %esi # si: b & c andl %ecx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] - xorl %r11d, %r9d # ^W[(n+2) & 15] - roll %r9d # addl %edi, %edx # += ((b | c) & d) | (b & c) - leal 
-0x70E44324(%rdx,%r9), %edx # e += RCONST + W[n & 15] + addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) @@ -659,12 +767,8 @@ sha1_process_block64: andl %eax, %esi # si: b & c andl %ebx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] - xorl %r12d, %r10d # ^W[(n+2) & 15] - roll %r10d # addl %edi, %ecx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rcx,%r10), %ecx # e += RCONST + W[n & 15] + addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) @@ -676,16 +780,37 @@ sha1_process_block64: andl %ebp, %esi # si: b & c andl %eax, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] - xorl %r13d, %r11d # ^W[(n+2) & 15] - roll %r11d # addl %edi, %ebx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbx,%r11), %ebx # e += RCONST + W[n & 15] + addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) # 44 movl %ecx, %edi # di: b movl %ecx, %esi # si: b @@ -693,12 +818,8 @@ sha1_process_block64: andl %edx, %esi # si: b & c andl %ebp, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] - xorl %r14d, %r12d # ^W[(n+2) & 15] - roll %r12d # addl %edi, %eax # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rax,%r12), %eax # e += RCONST + W[n & 15] + addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) @@ -710,12 +831,8 @@ sha1_process_block64: andl %ecx, %esi # si: b & c andl %edx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] - xorl %r15d, %r13d # ^W[(n+2) & 15] - roll %r13d # addl %edi, %ebp # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbp,%r13), %ebp # e += RCONST + W[n & 15] + addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, 
%ebp # e += rotl32(a,5) @@ -727,12 +844,8 @@ sha1_process_block64: andl %ebx, %esi # si: b & c andl %ecx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] - xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] - roll %r14d # addl %edi, %edx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rdx,%r14), %edx # e += RCONST + W[n & 15] + addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) @@ -744,16 +857,37 @@ sha1_process_block64: andl %eax, %esi # si: b & c andl %ebx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] - xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] - roll %r15d # addl %edi, %ecx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rcx,%r15), %ecx # e += RCONST + W[n & 15] + addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) # 48 movl %edx, %edi # di: b movl %edx, %esi # si: b @@ -761,14 +895,8 @@ sha1_process_block64: andl %ebp, %esi # si: b & c andl %eax, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl %r13d, %esi # W[(n+13) & 15] - xorl %r8d, %esi # ^W[(n+8) & 15] - xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*0(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*0(%rsp) # store to W[n & 15] addl %edi, %ebx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) @@ -780,14 +908,8 @@ sha1_process_block64: andl %edx, %esi # si: b & c andl %ebp, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl %r14d, %esi # W[(n+13) & 15] - xorl %r9d, %esi # ^W[(n+8) & 15] - xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*1(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*1(%rsp) # store to W[n & 15] addl %edi, %eax # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += 
rotl32(a,5) @@ -799,14 +921,8 @@ sha1_process_block64: andl %ecx, %esi # si: b & c andl %edx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl %r15d, %esi # W[(n+13) & 15] - xorl %r10d, %esi # ^W[(n+8) & 15] - xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*2(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*2(%rsp) # store to W[n & 15] addl %edi, %ebp # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] + addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) @@ -818,18 +934,38 @@ sha1_process_block64: andl %ebx, %esi # si: b & c andl %ecx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*0(%rsp), %esi # W[(n+13) & 15] - xorl %r11d, %esi # ^W[(n+8) & 15] - xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*3(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*3(%rsp) # store to W[n & 15] addl %edi, %edx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rdx,%rsi), %edx # e += RCONST + W[n & 15] + addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) + movaps rconst0xCA62C1D6(%rip), %xmm6 +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) # 52 movl %ebp, %edi # di: b movl %ebp, %esi # si: b @@ -837,14 +973,8 @@ sha1_process_block64: andl %eax, %esi # si: b & c andl %ebx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*1(%rsp), %esi # W[(n+13) & 15] - xorl %r12d, %esi # ^W[(n+8) & 15] - xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*4(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*4(%rsp) # store to W[n & 15] addl %edi, %ecx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] + addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) @@ -856,14 +986,8 @@ sha1_process_block64: andl %ebp, %esi # si: b & c andl %eax, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*2(%rsp), %esi # W[(n+13) & 15] - xorl %r13d, %esi # ^W[(n+8) & 15] - xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*5(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*5(%rsp) # store to W[n & 15] addl %edi, %ebx # += ((b | c) & d) | (b & c) - leal 
-0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) @@ -875,14 +999,8 @@ sha1_process_block64: andl %edx, %esi # si: b & c andl %ebp, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*3(%rsp), %esi # W[(n+13) & 15] - xorl %r14d, %esi # ^W[(n+8) & 15] - xorl %r8d, %esi # ^W[(n+2) & 15] - xorl -32+4*6(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*6(%rsp) # store to W[n & 15] addl %edi, %eax # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) @@ -894,18 +1012,37 @@ sha1_process_block64: andl %ecx, %esi # si: b & c andl %edx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - movl -32+4*4(%rsp), %esi # W[(n+13) & 15] - xorl %r15d, %esi # ^W[(n+8) & 15] - xorl %r9d, %esi # ^W[(n+2) & 15] - xorl -32+4*7(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*7(%rsp) # store to W[n & 15] addl %edi, %ebp # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] + addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) +# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) + movaps %xmm3, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm0, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm0, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*0(%rsp) # 56 movl %eax, %edi # di: b movl %eax, %esi # si: b @@ -913,12 +1050,8 @@ sha1_process_block64: andl %ebx, %esi # si: b & c andl %ecx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] - xorl %r10d, %r8d # ^W[(n+2) & 15] - roll %r8d # addl %edi, %edx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rdx,%r8), %edx # e += RCONST + W[n & 15] + addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) @@ -930,12 +1063,8 @@ sha1_process_block64: andl %eax, %esi # si: b & c andl %ebx, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] - xorl %r11d, %r9d # ^W[(n+2) & 15] - roll %r9d # addl %edi, %ecx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rcx,%r9), 
%ecx # e += RCONST + W[n & 15] + addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) @@ -947,12 +1076,8 @@ sha1_process_block64: andl %ebp, %esi # si: b & c andl %eax, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] - xorl %r12d, %r10d # ^W[(n+2) & 15] - roll %r10d # addl %edi, %ebx # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rbx,%r10), %ebx # e += RCONST + W[n & 15] + addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) @@ -964,307 +1089,282 @@ sha1_process_block64: andl %edx, %esi # si: b & c andl %ebp, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) - xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] - xorl %r13d, %r11d # ^W[(n+2) & 15] - roll %r11d # addl %edi, %eax # += ((b | c) & d) | (b & c) - leal -0x70E44324(%rax,%r11), %eax # e += RCONST + W[n & 15] + addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) +# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) + movaps %xmm0, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm1 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm1, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm1, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*1(%rsp) # 60 - xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] - xorl %r14d, %r12d # ^W[(n+2) & 15] - roll %r12d # movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal -0x359D3E2A(%rbp,%r12), %ebp # e += RCONST + W[n & 15] + addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 61 - xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] - xorl %r15d, %r13d # ^W[(n+2) & 15] - roll %r13d # movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal -0x359D3E2A(%rdx,%r13), %edx # e += RCONST + W[n & 15] + addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 62 - xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] - xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] - roll %r14d # 
movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal -0x359D3E2A(%rcx,%r14), %ecx # e += RCONST + W[n & 15] + addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 63 - xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] - xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] - roll %r15d # movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal -0x359D3E2A(%rbx,%r15), %ebx # e += RCONST + W[n & 15] + addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) +# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) + movaps %xmm1, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm2 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm2, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm2, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*2(%rsp) # 64 - movl %r13d, %esi # W[(n+13) & 15] - xorl %r8d, %esi # ^W[(n+8) & 15] - xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*0(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*0(%rsp) # store to W[n & 15] movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 65 - movl %r14d, %esi # W[(n+13) & 15] - xorl %r9d, %esi # ^W[(n+8) & 15] - xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*1(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*1(%rsp) # store to W[n & 15] movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] + addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 66 - movl %r15d, %esi # W[(n+13) & 15] - xorl %r10d, %esi # ^W[(n+8) & 15] - xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*2(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*2(%rsp) # store to W[n & 15] movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] + addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl 
%esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 67 - movl -32+4*0(%rsp), %esi # W[(n+13) & 15] - xorl %r11d, %esi # ^W[(n+8) & 15] - xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*3(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*3(%rsp) # store to W[n & 15] movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal -0x359D3E2A(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] + addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) +# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) + movaps %xmm2, %xmm4 + psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps %xmm5, %xmm3 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps %xmm3, %xmm5 + xorps %xmm4, %xmm4 # rol(W0,1): + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps %xmm5, %xmm4 + pslld $2, %xmm5 + psrld $30, %xmm4 +# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) + xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 + xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) + movaps %xmm3, %xmm5 + paddd %xmm6, %xmm5 + movups %xmm5, -64+16*3(%rsp) # 68 - movl -32+4*1(%rsp), %esi # W[(n+13) & 15] - xorl %r12d, %esi # ^W[(n+8) & 15] - xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*4(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*4(%rsp) # store to W[n & 15] movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal -0x359D3E2A(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] + addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 69 - movl -32+4*2(%rsp), %esi # W[(n+13) & 15] - xorl %r13d, %esi # ^W[(n+8) & 15] - xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] - xorl -32+4*5(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*5(%rsp) # store to W[n & 15] movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] + addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 70 - movl -32+4*3(%rsp), %esi # W[(n+13) & 15] - xorl %r14d, %esi # ^W[(n+8) & 15] - xorl %r8d, %esi # ^W[(n+2) & 15] - xorl -32+4*6(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*6(%rsp) # store to W[n & 15] movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] + addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 71 - movl -32+4*4(%rsp), %esi # W[(n+13) & 15] - xorl %r15d, %esi # ^W[(n+8) & 15] - xorl %r9d, %esi # ^W[(n+2) 
& 15] - xorl -32+4*7(%rsp), %esi # ^W[n & 15] - roll %esi # - movl %esi, -32+4*7(%rsp) # store to W[n & 15] movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] + addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 72 - xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] - xorl %r10d, %r8d # ^W[(n+2) & 15] - roll %r8d # movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal -0x359D3E2A(%rcx,%r8), %ecx # e += RCONST + W[n & 15] + addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 73 - xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] - xorl %r11d, %r9d # ^W[(n+2) & 15] - roll %r9d # movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal -0x359D3E2A(%rbx,%r9), %ebx # e += RCONST + W[n & 15] + addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 74 - xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] - xorl %r12d, %r10d # ^W[(n+2) & 15] - roll %r10d # movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal -0x359D3E2A(%rax,%r10), %eax # e += RCONST + W[n & 15] + addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 75 - xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] - xorl %r13d, %r11d # ^W[(n+2) & 15] - roll %r11d # movl %ecx, %edi # c xorl %edx, %edi # ^d xorl %ebx, %edi # ^b - leal -0x359D3E2A(%rbp,%r11), %ebp # e += RCONST + W[n & 15] + addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] addl %edi, %ebp # e += (c ^ d ^ b) movl %eax, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 76 - xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] - xorl %r14d, %r12d # ^W[(n+2) & 15] - roll %r12d # movl %ebx, %edi # c xorl %ecx, %edi # ^d xorl %eax, %edi # ^b - leal -0x359D3E2A(%rdx,%r12), %edx # e += RCONST + W[n & 15] + addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] addl %edi, %edx # e += (c ^ d ^ b) movl %ebp, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 77 - xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] - xorl %r15d, %r13d # ^W[(n+2) & 15] - roll %r13d # movl %eax, %edi # c xorl %ebx, %edi # ^d xorl %ebp, %edi # ^b - leal -0x359D3E2A(%rcx,%r13), %ecx # e += RCONST + W[n & 15] + addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] addl %edi, %ecx # e += (c ^ d ^ b) movl %edx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 78 - xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] - xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] - roll %r14d # movl %ebp, %edi # c xorl %eax, %edi # ^d xorl %edx, %edi # ^b - leal 
-0x359D3E2A(%rbx,%r14), %ebx # e += RCONST + W[n & 15] + addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] addl %edi, %ebx # e += (c ^ d ^ b) movl %ecx, %esi # roll $5, %esi # rotl32(a,5) addl %esi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 79 - xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] - xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] - xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] - roll %r15d # movl %edx, %edi # c xorl %ebp, %edi # ^d xorl %ecx, %edi # ^b - leal -0x359D3E2A(%rax,%r15), %eax # e += RCONST + W[n & 15] + addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] addl %edi, %eax # e += (c ^ d ^ b) movl %ebx, %esi # roll $5, %esi # rotl32(a,5) @@ -1286,4 +1386,28 @@ sha1_process_block64: ret .size sha1_process_block64, .-sha1_process_block64 + + .section .rodata.cst16.sha1const, "aM", @progbits, 16 + .align 16 +rconst0x5A827999: + .long 0x5A827999 + .long 0x5A827999 + .long 0x5A827999 + .long 0x5A827999 +rconst0x6ED9EBA1: + .long 0x6ED9EBA1 + .long 0x6ED9EBA1 + .long 0x6ED9EBA1 + .long 0x6ED9EBA1 +rconst0x8F1BBCDC: + .long 0x8F1BBCDC + .long 0x8F1BBCDC + .long 0x8F1BBCDC + .long 0x8F1BBCDC +rconst0xCA62C1D6: + .long 0xCA62C1D6 + .long 0xCA62C1D6 + .long 0xCA62C1D6 + .long 0xCA62C1D6 + #endif diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 901896e6e..87c2d0800 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -6,33 +6,103 @@ # also contains the diff of the generated file. exec >hash_md5_sha_x86-64.S -# There is a way to use XMM registers (which always exist for x86-64!) for W[] -# For example, if we load W as follows: -# %xmm0: w[0x0] w[0x1] w[0x2] w[0x3] -# %xmm4: w[0x4] w[0x5] w[0x6] w[0x7] -# %xmm8: w[0x8] w[0x9] w[0xa] w[0xb] -# %xmm12: w[0xc] w[0xd] w[0xe] w[0xf] -# then the xor'ing operation to generate next W[0..3] is: -# movaps %xmm0, %xmmT2 -# palignr $0x8, %xmm4, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5]) -# # Right-shifts xmm4:xmmT2 by 8 bytes. Writes shifted result to xmmT2. SSSE3 insn. -# movaps %xmm0, %xmmT13 -# palignr $0x4,%xmm0,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0]) -# xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13 -# xmm0 = rol32(xmm0,1) # no such insn, have to use pslld+psrld+or -# and then results can be extracted for use: -# movd %xmm0, %esi # new W[0] -# pextrd $1, %xmm0, %esi # new W[1] -# # SSE4.1 insn. Can use EXTRACTPS (also SSE4.1) -# pextrd $2, %xmm0, %esi # new W[2] -# pextrd $3, %xmm0, %esi # new W[3] -# ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64. +# Based on http://arctic.org/~dean/crypto/sha1.html. +# ("This SHA1 implementation is public domain.") +# +# x86-64 has at least SSE2 vector insns always available. +# We can use them without any CPUID checks (and without a need +# for a fallback code if needed insns are not available). +# This code uses them to calculate W[] ahead of time. +# +# Unfortunately, results are passed from vector unit to +# integer ALUs on the stack. MOVD/Q insns to move them directly +# from vector to integer registers are slower than store-to-load +# forwarding in LSU (on Skylake at least). +# +# The win against a purely integer code is small on Skylake, +# only about 7-8%. We offload about 1/3 of our operations to the vector unit. 
+# It can do 4 ops at once in one 128-bit register, +# but we have to use x2 of them because of W[0] complication, +# SSE2 has no "rotate each word by N bits" insns, +# moving data to/from vector unit is clunky, and Skylake +# has four integer ALUs unified with three vector ALUs, +# which makes pure integer code rather fast, and makes +# vector ops compete with integer ones. +# +# Zen3, with its separate vector ALUs, wins more, about 12%. + +xmmT1="%xmm4" +xmmT2="%xmm5" +xmmRCONST="%xmm6" +T=`printf '\t'` + +# SSE instructions are longer than 4 bytes on average. +# Intel CPUs (up to Tiger Lake at least) can't decode +# more than 16 bytes of code in one cycle. +# By interleaving SSE code and integer code +# we mostly achieve a situation where 16-byte decode fetch window +# contains 4 (or more) insns. +# +# However. On Skylake, there was no observed difference, +# but on Zen3, non-interleaved code is ~3% faster +# (822 Mb/s versus 795 Mb/s hashing speed). +# Off for now: +interleave=false + +INTERLEAVE() { + $interleave || \ + { + # Generate non-interleaved code + # (it should work correctly too) + echo "$1" + echo "$2" + return + } + ( + echo "$1" | grep -v '^$' >"$0.temp1" + echo "$2" | grep -v '^$' >"$0.temp2" + exec 3<"$0.temp1" + exec 4<"$0.temp2" + IFS='' + while :; do + line1='' + line2='' + while :; do + read -r line1 <&3 + if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then + break + fi + echo "$line1" + done + while :; do + read -r line2 <&4 + if test "${line2:0:4}" = "${T}lea"; then + # We use 7-8 byte long forms of LEA. + # Do not interleave them with SSE insns + # which are also long. + echo "$line2" + read -r line2 <&4 + echo "$line2" + continue + fi + if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then + break + fi + echo "$line2" + done + test "$line1$line2" || break + echo "$line1" + echo "$line2" + done + rm "$0.temp1" "$0.temp2" + ) +} echo \ -'### Generated by hash_md5_sha_x86-64.S.sh ### +"### Generated by hash_md5_sha_x86-64.S.sh ### #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) - .section .text.sha1_process_block64,"ax",@progbits + .section .text.sha1_process_block64,\"ax\",@progbits .globl sha1_process_block64 .hidden sha1_process_block64 .type sha1_process_block64, @function @@ -51,16 +121,10 @@ sha1_process_block64: # eax..edx: a..d # ebp: e # esi,edi: temps -# -32+4*n(%rsp),r8...r15: W[0..7,8..15] -# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) - movl $3, %eax -1: - movq (%rdi,%rax,8), %rsi - bswapq %rsi - rolq $32, %rsi - movq %rsi, -32(%rsp,%rax,8) - decl %eax - jns 1b +# xmm0..xmm3: W[] +# xmm4,xmm5: temps +# xmm6: current round constant +# -64(%rsp): area for passing RCONST + W[] from vector to integer units movl 80(%rdi), %eax # a = ctx->hash[0] movl 84(%rdi), %ebx # b = ctx->hash[1] @@ -68,32 +132,120 @@ sha1_process_block64: movl 92(%rdi), %edx # d = ctx->hash[3] movl 96(%rdi), %ebp # e = ctx->hash[4] + movaps rconst0x5A827999(%rip), $xmmRCONST + + # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 + # instead of spilling them to stack. + # (We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so...) 
+ movq 4*0(%rdi), %rsi + movq 4*2(%rdi), %r10 + bswapq %rsi + bswapq %r10 + rolq \$32, %rsi # rsi = W[1]:W[0] + rolq \$32, %r10 + movq %rsi, %xmm0 + movq %r10, $xmmT1 + punpcklqdq $xmmT1, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) + movaps %xmm0, $xmmT1 + paddd $xmmRCONST, $xmmT1 + movups $xmmT1, -64+4*0(%rsp) + + movq 4*4(%rdi), %r8 + movq 4*6(%rdi), %r10 + bswapq %r8 + bswapq %r10 + rolq \$32, %r8 + rolq \$32, %r10 + movq %r8, %xmm1 + movq %r10, $xmmT1 + punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) + movaps %xmm1, $xmmT1 + paddd $xmmRCONST, $xmmT1 + movups $xmmT1, -64+4*4(%rsp) + movq 4*8(%rdi), %r8 movq 4*10(%rdi), %r10 bswapq %r8 bswapq %r10 + movl %r8d, %r9d # r9d = W[9] + rolq \$32, %r8 # r8 = W[9]:W[8] + movl %r10d, %r11d # r11d = W[11] + rolq \$32, %r10 # r10 = W[11]:W[10] + movq %r8, %xmm2 + movq %r10, $xmmT1 + punpcklqdq $xmmT1, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) + movq 4*12(%rdi), %r12 movq 4*14(%rdi), %r14 bswapq %r12 bswapq %r14 - movl %r8d, %r9d - shrq $32, %r8 - movl %r10d, %r11d - shrq $32, %r10 - movl %r12d, %r13d - shrq $32, %r12 - movl %r14d, %r15d - shrq $32, %r14 -' -W32() { -test "$1" || exit 1 -test "$1" -lt 0 && exit 1 -test "$1" -gt 15 && exit 1 -test "$1" -lt 8 && echo "-32+4*$1(%rsp)" -test "$1" -ge 8 && echo "%r${1}d" + movl %r12d, %r13d # r13d = W[13] + rolq \$32, %r12 # r12 = W[13]:W[12] + movl %r14d, %r15d # r15d = W[15] + rolq \$32, %r14 # r14 = W[15]:W[14] + movq %r12, %xmm3 + movq %r14, $xmmT1 + punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) +" + +PREP() { +local xmmW0=$1 +local xmmW4=$2 +local xmmW8=$3 +local xmmW12=$4 +# the above must be %xmm0..3 in some permutation +local dstmem=$5 +#W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); +#W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); +#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); +#W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); +#W[3] ^= rol(W[0], 1); +echo "# PREP $@ + movaps $xmmW12, $xmmT1 + psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) + + pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) + punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) + + xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) + xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) + xorps $xmmT2, $xmmW0 # ^ + # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup + movaps $xmmW0, $xmmT2 + + xorps $xmmT1, $xmmT1 # rol(W0,1): + pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) + paddd $xmmW0, $xmmW0 # shift left by 1 + psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 + # W0 = rotated (W[0]..W[3]), still needs W[3] fixup + + pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) + movaps $xmmT2, $xmmT1 + pslld \$2, $xmmT2 + psrld \$30, $xmmT1 +# xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) + xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 + + xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) +" +# movq $xmmW0, %r8 # high latency (~6 cycles) +# movaps $xmmW0, $xmmT1 +# psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower +# movq $xmmT1, %r10 # high latency +# movq %r8, %r9 +# movq %r10, %r11 +# shrq \$32, %r9 +# shrq \$32, %r11 +# ^^^ slower than passing the results on stack (!!!) 
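+#	(Illustrative scalar equivalent of the store emitted below, with made-up
+#	 names - not part of the generated code:
+#	     for (i = 0; i < 4; i++)
+#	         stack_slot[i] = W0[i] + RCONST;
+#	 i.e. the round constant is pre-added here, in the vector unit, so each
+#	 integer round later needs only a single "addl -64+4*n(%rsp), %e<reg>".)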
+echo " + movaps $xmmW0, $xmmT2 + paddd $xmmRCONST, $xmmT2 + movups $xmmT2, $dstmem +" } -# It's possible to interleave insns in rounds to mostly eliminate +# It's possible to interleave integer insns in rounds to mostly eliminate # dependency chains, but this likely to only help old Pentium-based # CPUs (ones without OOO, which can only simultaneously execute a pair # of _adjacent_ insns). @@ -107,21 +259,16 @@ local n0=$(((n+0) & 15)) echo " # $n ";test $n0 = 0 && echo " - # W[0], already in %esi + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] ";test $n0 != 0 && test $n0 -lt 8 && echo " - movl `W32 $n0`, %esi # W[n] + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n] ";test $n0 -ge 8 && echo " - # W[n], in %r$n0 + leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] ";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d andl %e$b, %edi # &b xorl %e$d, %edi # (((c ^ d) & b) ^ d) -";test $n0 -lt 8 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] -";test $n0 -ge 8 && echo " - leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] -";echo " addl %edi, %e$e # e += (((c ^ d) & b) ^ d) movl %e$a, %esi # roll \$5, %esi # rotl32(a,5) @@ -138,28 +285,11 @@ local n2=$(((n+2) & 15)) local n0=$(((n+0) & 15)) echo " # $n -";test $n0 -lt 8 && echo " - movl `W32 $n13`, %esi # W[(n+13) & 15] - xorl `W32 $n8`, %esi # ^W[(n+8) & 15] - xorl `W32 $n2`, %esi # ^W[(n+2) & 15] - xorl `W32 $n0`, %esi # ^W[n & 15] - roll %esi # - movl %esi, `W32 $n0` # store to W[n & 15] -";test $n0 -ge 8 && echo " - xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] - xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] - xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] - roll `W32 $n0` # -";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d andl %e$b, %edi # &b xorl %e$d, %edi # (((c ^ d) & b) ^ d) -";test $n0 -lt 8 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] -";test $n0 -ge 8 && echo " - leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] -";echo " + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] addl %edi, %e$e # e += (((c ^ d) & b) ^ d) movl %e$a, %esi # roll \$5, %esi # rotl32(a,5) @@ -167,13 +297,6 @@ echo " rorl \$2, %e$b # b = rotl32(b,30) " } -{ -RCONST=0x5A827999 -RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; RD1A bx cx dx bp ax 4 -RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9 -RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14 -RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19 -} | grep -v '^$' RD2() { local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 @@ -184,27 +307,10 @@ local n2=$(((n+2) & 15)) local n0=$(((n+0) & 15)) echo " # $n -";test $n0 -lt 8 && echo " - movl `W32 $n13`, %esi # W[(n+13) & 15] - xorl `W32 $n8`, %esi # ^W[(n+8) & 15] - xorl `W32 $n2`, %esi # ^W[(n+2) & 15] - xorl `W32 $n0`, %esi # ^W[n & 15] - roll %esi # - movl %esi, `W32 $n0` # store to W[n & 15] -";test $n0 -ge 8 && echo " - xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] - xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] - xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] - roll `W32 $n0` # -";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d xorl %e$b, %edi # ^b -";test $n0 -lt 8 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] -";test $n0 -ge 8 && echo " - leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] -";echo " + addl -64+4*$n0(%rsp), 
%e$e # e += RCONST + W[n & 15] addl %edi, %e$e # e += (c ^ d ^ b) movl %e$a, %esi # roll \$5, %esi # rotl32(a,5) @@ -212,13 +318,6 @@ echo " rorl \$2, %e$b # b = rotl32(b,30) " } -{ -RCONST=0x6ED9EBA1 -RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24 -RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29 -RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34 -RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39 -} | grep -v '^$' RD3() { local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 @@ -235,53 +334,82 @@ echo " andl %e$c, %esi # si: b & c andl %e$d, %edi # di: (b | c) & d orl %esi, %edi # ((b | c) & d) | (b & c) -";test $n0 -lt 8 && echo " - movl `W32 $n13`, %esi # W[(n+13) & 15] - xorl `W32 $n8`, %esi # ^W[(n+8) & 15] - xorl `W32 $n2`, %esi # ^W[(n+2) & 15] - xorl `W32 $n0`, %esi # ^W[n & 15] - roll %esi # - movl %esi, `W32 $n0` # store to W[n & 15] -";test $n0 -ge 8 && echo " - xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] - xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] - xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] - roll `W32 $n0` # -";echo " addl %edi, %e$e # += ((b | c) & d) | (b & c) -";test $n0 -lt 8 && echo " - leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] -";test $n0 -ge 8 && echo " - leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] -";echo " + addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] movl %e$a, %esi # roll \$5, %esi # rotl32(a,5) addl %esi, %e$e # e += rotl32(a,5) rorl \$2, %e$b # b = rotl32(b,30) " } + { -#RCONST=0x8F1BBCDC "out of range for signed 32bit displacement" -RCONST=-0x70E44324 -RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44 -RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49 -RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54 -RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59 -} | grep -v '^$' +# Round 1 +RCONST=0x5A827999 +RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; +RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` +INTERLEAVE "$a" "$b" +a=`echo " movaps rconst0x6ED9EBA1(%rip), $xmmRCONST" + PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` +INTERLEAVE "$a" "$b" + +# Round 2 +RCONST=0x6ED9EBA1 +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; 
RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` +INTERLEAVE "$a" "$b" +a=`echo " movaps rconst0x8F1BBCDC(%rip), $xmmRCONST" + PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` +INTERLEAVE "$a" "$b" + +# Round 3 +RCONST=0x8F1BBCDC +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` +INTERLEAVE "$a" "$b" +a=`echo " movaps rconst0xCA62C1D6(%rip), $xmmRCONST" + PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` +b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` +INTERLEAVE "$a" "$b" # Round 4 has the same logic as round 2, only n and RCONST are different -{ -#RCONST=0xCA62C1D6 "out of range for signed 32bit displacement" -RCONST=-0x359D3E2A -RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64 -RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69 -RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74 -RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79 -# Note: new W[n&15] values generated in last 3 iterations -# (W[13,14,15]) are unused after each of these iterations. -# Since we use r8..r15 for W[8..15], this does not matter. -# If we switch to e.g. using r8..r15 for W[0..7], then saving of W[13,14,15] -# (the "movl %esi, `W32 $n0`" insn) is a dead store and can be removed. 
+RCONST=0xCA62C1D6 +a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` +b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` +b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` +INTERLEAVE "$a" "$b" +a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` +b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` +INTERLEAVE "$a" "$b" +RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; +RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; } | grep -v '^$' echo " @@ -300,4 +428,28 @@ echo " ret .size sha1_process_block64, .-sha1_process_block64 + + .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 + .align 16 +rconst0x5A827999: + .long 0x5A827999 + .long 0x5A827999 + .long 0x5A827999 + .long 0x5A827999 +rconst0x6ED9EBA1: + .long 0x6ED9EBA1 + .long 0x6ED9EBA1 + .long 0x6ED9EBA1 + .long 0x6ED9EBA1 +rconst0x8F1BBCDC: + .long 0x8F1BBCDC + .long 0x8F1BBCDC + .long 0x8F1BBCDC + .long 0x8F1BBCDC +rconst0xCA62C1D6: + .long 0xCA62C1D6 + .long 0xCA62C1D6 + .long 0xCA62C1D6 + .long 0xCA62C1D6 + #endif" -- cgit v1.2.3-55-g6feb From 33a9f34df5c53d3dd074a2168ff40d612a36667a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 23 Jan 2022 15:46:05 +0100 Subject: add busybox_ldscript.README.txt Signed-off-by: Denys Vlasenko --- busybox_ldscript.README.txt | 47 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 busybox_ldscript.README.txt diff --git a/busybox_ldscript.README.txt b/busybox_ldscript.README.txt new file mode 100644 index 000000000..1625a970a --- /dev/null +++ b/busybox_ldscript.README.txt @@ -0,0 +1,47 @@ +/* Add SORT_BY_ALIGNMENT to linker script (found in busybox_unstripped.out): +## .rodata : { *(.rodata SORT_BY_ALIGNMENT(.rodata.*) .gnu.linkonce.r.*) } +## .data : { *(.data SORT_BY_ALIGNMENT(.data.*) .gnu.linkonce.d.*) } +## .bss : { *(.bss SORT_BY_ALIGNMENT(.bss.*) .gnu.linkonce.b.*) } +## This will eliminate most of the padding (~3kb). +## Hmm, "ld --sort-section alignment" should do it too. +## +## There is a ld hack which is meant to decrease disk usage +## at the cost of more RAM usage (??!!) in standard ld script: +## . = ALIGN (0x1000) - ((0x1000 - .) & (0x1000 - 1)); . = DATA_SEGMENT_ALIGN (0x1000, 0x1000); +## Replace it with: +## . = ALIGN (0x1000); . = DATA_SEGMENT_ALIGN (0x1000, 0x1000); +## to unconditionally align .data to the next page boundary, +## instead of "next page, plus current offset in this page" +*/ + +/* To reduce the number of VMAs each bbox process has, +## move *(.bss SORT_BY_ALIGNMENT(.bss.*) ...) +## part from .bss : {...} block to .data : { ... } block. +## (This usually increases .data section by only one page). 
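+## For example (a sketch only - copy the exact input-section lists from the
+## linker script your toolchain actually generates, not from here):
+##   .data : {
+##     *(.data SORT_BY_ALIGNMENT(.data.*) .gnu.linkonce.d.*)
+##     *(.bss SORT_BY_ALIGNMENT(.bss.*) .gnu.linkonce.b.*)
+##   }
+##   .bss : { }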
+## Result: +## +## text data bss dec hex filename +## 1050792 560 7580 1058932 102874 busybox.bss +## 1050792 8149 0 1058941 10287d busybox.nobss +## +## $ exec busybox.bss pmap $$ +## 0000000008048000 1028K r-xp /path/to/busybox.bss +## 0000000008149000 8K rw-p /path/to/busybox.bss +## 000000000814b000 4K rw-p [ anon ] <---- this VMA is eliminated +## 00000000085f5000 4K ---p [heap] +## 00000000085f6000 4K rw-p [heap] +## 00000000f7778000 8K rw-p [ anon ] +## 00000000f777a000 12K r--p [vvar] +## 00000000f777d000 8K r-xp [vdso] +## 00000000ff7e9000 132K rw-p [stack] +## +## $ exec busybox.nobss pmap $$ +## 0000000008048000 1028K r-xp /path/to/busybox.nobss +## 0000000008149000 12K rw-p /path/to/busybox.nobss +## 00000000086f0000 4K ---p [heap] +## 00000000086f1000 4K rw-p [heap] +## 00000000f7783000 8K rw-p [ anon ] +## 00000000f7785000 12K r--p [vvar] +## 00000000f7788000 8K r-xp [vdso] +## 00000000ffac0000 132K rw-p [stack] +*/ -- cgit v1.2.3-55-g6feb From e998c7c032458a05a7afcc13ce0dc980b99ecc6c Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 23 Jan 2022 18:48:49 +0100 Subject: sed: fix handling of escaped delimiters in s/// search pattern, closes 14541 function old new delta copy_parsing_escapes 67 96 +29 parse_regex_delim 109 111 +2 get_address 213 215 +2 add_cmd 1176 1178 +2 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 4/0 up/down: 35/0) Total: 35 bytes Signed-off-by: Denys Vlasenko --- editors/sed.c | 19 +++++++++++-------- testsuite/sed.tests | 10 ++++++++++ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/editors/sed.c b/editors/sed.c index 48b0dbf67..02a527b4a 100644 --- a/editors/sed.c +++ b/editors/sed.c @@ -246,7 +246,6 @@ static void cleanup_outname(void) } /* strcpy, replacing "\from" with 'to'. If to is NUL, replacing "\any" with 'any' */ - static unsigned parse_escapes(char *dest, const char *string, int len, char from, char to) { char *d = dest; @@ -276,7 +275,7 @@ static unsigned parse_escapes(char *dest, const char *string, int len, char from return d - dest; } -static char *copy_parsing_escapes(const char *string, int len) +static char *copy_parsing_escapes(const char *string, int len, char delim) { const char *s; char *dest = xmalloc(len + 1); @@ -287,10 +286,15 @@ static char *copy_parsing_escapes(const char *string, int len) len = parse_escapes(dest, string, len, s[1], s[0]); string = dest; } + if (delim) { + /* we additionally unescape any instances of escaped delimiter. + * For example, in 's+9\++X+' the pattern is "9+", not "9\+". 
+ */ + len = parse_escapes(dest, string, len, delim, delim); + } return dest; } - /* * index_of_next_unescaped_regexp_delim - walks left to right through a string * beginning at a specified index and returns the index of the next regular @@ -347,12 +351,11 @@ static int parse_regex_delim(const char *cmdstr, char **match, char **replace) /* save the match string */ idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr); - *match = copy_parsing_escapes(cmdstr_ptr, idx); - + *match = copy_parsing_escapes(cmdstr_ptr, idx, delimiter); /* save the replacement string */ cmdstr_ptr += idx + 1; idx = index_of_next_unescaped_regexp_delim(- (int)delimiter, cmdstr_ptr); - *replace = copy_parsing_escapes(cmdstr_ptr, idx); + *replace = copy_parsing_escapes(cmdstr_ptr, idx, 0); return ((cmdstr_ptr - cmdstr) + idx); } @@ -380,7 +383,7 @@ static int get_address(const char *my_str, int *linenum, regex_t ** regex) delimiter = *++pos; next = index_of_next_unescaped_regexp_delim(delimiter, ++pos); if (next != 0) { - temp = copy_parsing_escapes(pos, next); + temp = copy_parsing_escapes(pos, next, 0); G.previous_regex_ptr = *regex = xzalloc(sizeof(regex_t)); xregcomp(*regex, temp, G.regex_type); free(temp); @@ -575,7 +578,7 @@ static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr) cmdstr++; } len = strlen(cmdstr); - sed_cmd->string = copy_parsing_escapes(cmdstr, len); + sed_cmd->string = copy_parsing_escapes(cmdstr, len, 0); cmdstr += len; /* "\anychar" -> "anychar" */ parse_escapes(sed_cmd->string, sed_cmd->string, -1, '\0', '\0'); diff --git a/testsuite/sed.tests b/testsuite/sed.tests index e62b839f7..440996a21 100755 --- a/testsuite/sed.tests +++ b/testsuite/sed.tests @@ -324,6 +324,16 @@ testing "sed zero chars match/replace logic must not falsely trigger here 2" \ "sed 's/ *$/_/g'" \ "qwerty_\n" "" "qwerty\n" +# the pattern here is interpreted as "9+", not as "9\+" +testing "sed special char as s/// delimiter, in pattern" \ + "sed 's+9\++X+'" \ + "X8=17\n" "" "9+8=17\n" + +# but in replacement string, "\&" remains "\&", not interpreted as "&" +testing "sed special char as s/// delimiter, in replacement" \ + "sed 's&9&X\&&'" \ + "X&+8=17\n" "" "9+8=17\n" + testing "sed /\$_in_regex/ should not match newlines, only end-of-line" \ "sed ': testcont; /\\\\$/{ =; N; b testcont }'" \ "\ -- cgit v1.2.3-55-g6feb From f12fb1e4092900f26f7f8c71cde44b1cd7d26439 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 23 Jan 2022 19:04:27 +0100 Subject: sed: fix handling of escaped delimiters in s/// replacement function old new delta parse_regex_delim 111 140 +29 Signed-off-by: Denys Vlasenko --- editors/sed.c | 5 ++++- testsuite/sed.tests | 9 +++++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/editors/sed.c b/editors/sed.c index 02a527b4a..32a4b61f6 100644 --- a/editors/sed.c +++ b/editors/sed.c @@ -355,7 +355,10 @@ static int parse_regex_delim(const char *cmdstr, char **match, char **replace) /* save the replacement string */ cmdstr_ptr += idx + 1; idx = index_of_next_unescaped_regexp_delim(- (int)delimiter, cmdstr_ptr); - *replace = copy_parsing_escapes(cmdstr_ptr, idx, 0); +//GNU sed 4.8: +// echo 789 | sed 's&8&\&&' - 7&9 ("\&" remained "\&") +// echo 789 | sed 's1\(8\)1\1\11' - 7119 ("\1\1" become "11") + *replace = copy_parsing_escapes(cmdstr_ptr, idx, delimiter != '&' ? 
delimiter : 0); return ((cmdstr_ptr - cmdstr) + idx); } diff --git a/testsuite/sed.tests b/testsuite/sed.tests index 440996a21..626542e33 100755 --- a/testsuite/sed.tests +++ b/testsuite/sed.tests @@ -329,10 +329,15 @@ testing "sed special char as s/// delimiter, in pattern" \ "sed 's+9\++X+'" \ "X8=17\n" "" "9+8=17\n" -# but in replacement string, "\&" remains "\&", not interpreted as "&" -testing "sed special char as s/// delimiter, in replacement" \ +# Matching GNU sed 4.8: +# in replacement string, "\&" remains "\&", not interpreted as "&" +testing "sed special char as s/// delimiter, in replacement 1" \ "sed 's&9&X\&&'" \ "X&+8=17\n" "" "9+8=17\n" +# in replacement string, "\1" is interpreted as "1" +testing "sed special char as s/// delimiter, in replacement 2" \ + "sed 's1\(9\)1X\11'" \ + "X1+8=17\n" "" "9+8=17\n" testing "sed /\$_in_regex/ should not match newlines, only end-of-line" \ "sed ': testcont; /\\\\$/{ =; N; b testcont }'" \ -- cgit v1.2.3-55-g6feb From 6dd6a6c42d1465d8cca2539476f6bffd5e1353dd Mon Sep 17 00:00:00 2001 From: Walter Lozano Date: Fri, 21 Jan 2022 11:00:27 -0300 Subject: Add support for long options to cmp In order to improve compatibility with GNU cmp add support for long options to busybox cmp. function old new delta static.cmp_longopts - 36 +36 cmp_main 589 594 +5 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 1/0 up/down: 41/0) Total: 41 bytes Signed-off-by: Walter Lozano Signed-off-by: Denys Vlasenko --- editors/cmp.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/editors/cmp.c b/editors/cmp.c index 6d2b0c6c3..b89e519ad 100644 --- a/editors/cmp.c +++ b/editors/cmp.c @@ -54,6 +54,7 @@ int cmp_main(int argc UNUSED_PARAM, char **argv) int retval = 0; int max_count = -1; +#if !ENABLE_LONG_OPTS opt = getopt32(argv, "^" OPT_STR "\0" "-1" @@ -62,6 +63,23 @@ int cmp_main(int argc UNUSED_PARAM, char **argv) ":l--s:s--l", &max_count ); +#else + static const char cmp_longopts[] ALIGN1 = + "bytes\0" Required_argument "n" + "quiet\0" No_argument "s" + "silent\0" No_argument "s" + "verbose\0" No_argument "l" + ; + opt = getopt32long(argv, "^" + OPT_STR + "\0" "-1" + IF_DESKTOP(":?4") + IF_NOT_DESKTOP(":?2") + ":l--s:s--l", + cmp_longopts, + &max_count + ); +#endif argv += optind; filename1 = *argv; -- cgit v1.2.3-55-g6feb From 78fdf4d22d578d5d51cc08c768b35d050a92902a Mon Sep 17 00:00:00 2001 From: Timo Teräs Date: Fri, 21 Jan 2022 13:17:00 +0200 Subject: mkfs.vfat: fix volume label to be padded with space MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The specification requires volume label to be space padded. Latest fsck.vfat will remove the zero padded volume label as invalid. See also: https://github.com/dosfstools/dosfstools/issues/172 Make the default label also "NO NAME" which has the special meaning that label is not set. 
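
[Editorial note, not part of the patch: a minimal, self-contained C sketch of the space-padding behaviour described above. The names mirror those used in the diff below; the example itself is only an illustration, not the applet code.]

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* 11 characters, space padded - the "label is not set" default */
	static const char NO_NAME_11[] = "NO NAME    ";
	char volume_label11[12];

	/* "%-11.11s": left-justify, pad with spaces, truncate to 11 chars */
	sprintf(volume_label11, "%-11.11s", "boot");
	printf("[%s]\n", volume_label11); /* "boot" padded to 11 chars with spaces */

	/* a root-directory label entry is only created for a real label */
	if (strcmp(volume_label11, NO_NAME_11) != 0)
		printf("would create a volume label directory entry\n");
	return 0;
}
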
function old new delta mkfs_vfat_main 1470 1502 +32 static.NO_NAME_11 - 12 +12 .rodata 104309 104318 +9 ------------------------------------------------------------------------------ (add/remove: 1/0 grow/shrink: 2/0 up/down: 53/0) Total: 53 bytes Signed-off-by: Timo Teräs Signed-off-by: Denys Vlasenko --- util-linux/mkfs_vfat.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/util-linux/mkfs_vfat.c b/util-linux/mkfs_vfat.c index 844d965f8..821371953 100644 --- a/util-linux/mkfs_vfat.c +++ b/util-linux/mkfs_vfat.c @@ -218,8 +218,11 @@ static const char boot_code[] ALIGN1 = int mkfs_vfat_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) { + static const char NO_NAME_11[] = "NO NAME "; + struct stat st; - const char *volume_label = ""; + const char *arg_volume_label = NO_NAME_11; //default + char volume_label11[12]; char *buf; char *device_name; uoff_t volume_size_bytes; @@ -257,14 +260,17 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) opts = getopt32(argv, "^" "Ab:cCf:F:h:Ii:l:m:n:r:R:s:S:v" "\0" "-1", //:b+:f+:F+:h+:r+:R+:s+:S+:vv:c--l:l--c - NULL, NULL, NULL, NULL, NULL, - NULL, NULL, &volume_label, NULL, NULL, NULL, NULL); + /*b*/NULL, /*f*/NULL, /*F*/NULL, /*h*/NULL, /*i*/NULL, + /*l*/NULL, /*m*/NULL, /*n*/&arg_volume_label, + /*r*/NULL, /*R*/NULL, /*s*/NULL, /*S*/NULL); argv += optind; // cache device name device_name = argv[0]; // default volume ID = creation time volume_id = time(NULL); + // truncate to exactly 11 chars, pad with spaces + sprintf(volume_label11, "%-11.11s", arg_volume_label); dev = xopen(device_name, O_RDWR); xfstat(dev, &st, device_name); @@ -459,7 +465,7 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) (int)media_byte, volume_size_sect, (int)total_clust, (int)sect_per_clust, sect_per_fat, - (int)volume_id, volume_label + (int)volume_id, volume_label11 ); } @@ -508,7 +514,7 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) STORE_LE(boot_blk->vi.ext_boot_sign, 0x29); STORE_LE(boot_blk->vi.volume_id32, volume_id); memcpy(boot_blk->vi.fs_type, "FAT32 ", sizeof(boot_blk->vi.fs_type)); - strncpy(boot_blk->vi.volume_label, volume_label, sizeof(boot_blk->vi.volume_label)); + memcpy(boot_blk->vi.volume_label, volume_label11, 11); memcpy(boot_blk->boot_code, boot_code, sizeof(boot_code)); STORE_LE(boot_blk->boot_sign, BOOT_SIGN); @@ -545,15 +551,18 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) // root directory // empty directory is just a set of zero bytes memset(buf, 0, sect_per_clust * bytes_per_sect); - if (volume_label[0]) { - // create dir entry for volume_label + // not "NO NAME", "NO NAME " etc? + // (mkfs.fat 4.1 won't create dir entry even with explicit -n 'NO NAME', + // but will create one with e.g. 
-n '', -n ' zZz') + if (strcmp(volume_label11, NO_NAME_11) != 0) { + // create dir entry for volume label struct msdos_dir_entry *de; #if 0 struct tm tm_time; uint16_t t, d; #endif de = (void*)buf; - strncpy(de->name, volume_label, sizeof(de->name)); + memcpy(de->name, volume_label11, 11); STORE_LE(de->attr, ATTR_VOLUME); #if 0 localtime_r(&create_time, &tm_time); -- cgit v1.2.3-55-g6feb From 117a8c9b7a50053964159c342af1f3810cbbd5b8 Mon Sep 17 00:00:00 2001 From: Khem Raj Date: Wed, 12 Jan 2022 10:54:54 -0800 Subject: apply const trick to ptr_to_globals This was missing in the previous attempt to fix it via [1] This helps fix segfaults when compiling with clang ( seen on riscv64 ) [ 452.428349] less[270]: unhandled signal 11 code 0x1 at 0x000000000000000c in busybox.nosuid[2ab7491000+ba000] [ 452.430246] CPU: 3 PID: 270 Comm: less Not tainted 5.15.13-yocto-standard #1 [ 452.431323] Hardware name: riscv-virtio,qemu (DT) [ 452.431925] epc : 0000002ab74a19ee ra : 0000002ab74a19dc sp : 0000003fec6ec980 [ 452.432725] gp : 0000002ab754dcb0 tp : 0000003f88783800 t0 : 0000003f8878d4a0 [ 452.433744] t1 : 0000002ab749b00c t2 : 0000000000000000 s0 : 0000003fec6ecc38 [ 452.434732] s1 : 000000000000004c a0 : 00000000ffffffff a1 : 0000002ab754dde0 [ 452.435861] a2 : 0000000000000000 a3 : 0000000000000100 a4 : 0000002ab754f3a0 [ 452.436787] a5 : 0000002ab754f3a0 a6 : 0000000000000000 a7 : 0000002ab754f2a0 [ 452.437974] s2 : 0000000000000002 s3 : 0000002ab754b6c8 s4 : 0000002ab749b60e [ 452.438781] s5 : 0000000000000000 s6 : 0000002ab754b6c8 s7 : 0000003f88943060 [ 452.439723] s8 : 0000003f88944050 s9 : 0000002ad8502e88 s10: 0000002ad8502de8 [ 452.440538] s11: 0000000000000014 t3 : 0000003f887fceb6 t4 : 0000003f8893af0c [ 452.441438] t5 : 0000000000000000 t6 : 0000003f88923000 [1] https://git.busybox.net/busybox/commit/?id=1f925038a Signed-off-by: Khem Raj Signed-off-by: Denys Vlasenko --- include/libbb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/libbb.h b/include/libbb.h index 8e3b7ae8e..6aeec249d 100644 --- a/include/libbb.h +++ b/include/libbb.h @@ -2292,7 +2292,7 @@ struct globals; /* '*const' ptr makes gcc optimize code much better. * Magic prevents ptr_to_globals from going into rodata. * If you want to assign a value, use SET_PTR_TO_GLOBALS(x) */ -extern struct globals *const ptr_to_globals; +extern struct globals *BB_GLOBAL_CONST ptr_to_globals; #define barrier() asm volatile ("":::"memory") -- cgit v1.2.3-55-g6feb From 99e22d230ded676ab53dfa8ab276c1301c2955a0 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 24 Jan 2022 07:07:17 +0100 Subject: cut: build fix for FEATURE_CUT_REGEX Signed-off-by: Denys Vlasenko --- libbb/Kbuild.src | 1 + 1 file changed, 1 insertion(+) diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index e8bb24f6d..b9d34de8e 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src @@ -200,6 +200,7 @@ lib-$(CONFIG_PGREP) += xregcomp.o lib-$(CONFIG_PKILL) += xregcomp.o lib-$(CONFIG_DEVFSD) += xregcomp.o lib-$(CONFIG_FEATURE_FIND_REGEX) += xregcomp.o +lib-$(CONFIG_FEATURE_CUT_REGEX) += xregcomp.o # Add the experimental logging functionality, only used by zcip lib-$(CONFIG_ZCIP) += logenv.o -- cgit v1.2.3-55-g6feb From 205042c07a3bf6c8e685c434713f2a9e46630cd0 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 25 Jan 2022 17:00:57 +0100 Subject: libbb/sha1: in unrolled x86-64 code, pass initial W[] in registers, not on stack This can be faster on some CPUs. 
On Skylake, evidently load latency from L1 (or store-to-load forwarding in LSU) is fast enough to completely hide memory reference latencies here. function old new delta sha1_process_block64 3495 3514 +19 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-64.S | 310 +++++++++++++++++++++-------------------- libbb/hash_md5_sha_x86-64.S.sh | 109 ++++++++------- 2 files changed, 214 insertions(+), 205 deletions(-) diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 069a18719..743269d98 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -1,7 +1,7 @@ ### Generated by hash_md5_sha_x86-64.S.sh ### #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) - .section .text.sha1_process_block64,"ax",@progbits + .section .text.sha1_process_block64, "ax", @progbits .globl sha1_process_block64 .hidden sha1_process_block64 .type sha1_process_block64, @function @@ -10,7 +10,7 @@ sha1_process_block64: pushq %rbp # 1 byte insn pushq %rbx # 1 byte insn - pushq %r15 # 2 byte insn +# pushq %r15 # 2 byte insn pushq %r14 # 2 byte insn pushq %r13 # 2 byte insn pushq %r12 # 2 byte insn @@ -19,7 +19,8 @@ sha1_process_block64: #Register and stack use: # eax..edx: a..d # ebp: e -# esi,edi: temps +# esi,edi,r8..r14: temps +# r15: unused # xmm0..xmm3: W[] # xmm4,xmm5: temps # xmm6: current round constant @@ -33,147 +34,148 @@ sha1_process_block64: movaps rconst0x5A827999(%rip), %xmm6 - # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 - # instead of spilling them to stack. - # (We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so...) + # Load W[] to xmm registers, byteswapping on the fly. + # + # For iterations 0..15, we pass W[] in rsi,r8..r14 + # for use in RD1A's instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it's probably a wash. + # (We use rsi instead of rN because this makes two + # LEAs in two first RD1A's shorter by one byte). 
movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r10 + movq 4*2(%rdi), %r8 bswapq %rsi - bswapq %r10 + bswapq %r8 rolq $32, %rsi # rsi = W[1]:W[0] - rolq $32, %r10 + rolq $32, %r8 # r8 = W[3]:W[2] movq %rsi, %xmm0 - movq %r10, %xmm4 - punpcklqdq %xmm4, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) - movaps %xmm0, %xmm4 - paddd %xmm6, %xmm4 - movups %xmm4, -64+4*0(%rsp) + movq %r8, %xmm4 + punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, %xmm4 # add RCONST, spill to stack +# paddd %xmm6, %xmm4 +# movups %xmm4, -64+16*0(%rsp) - movq 4*4(%rdi), %r8 + movq 4*4(%rdi), %r9 movq 4*6(%rdi), %r10 - bswapq %r8 + bswapq %r9 bswapq %r10 - rolq $32, %r8 - rolq $32, %r10 - movq %r8, %xmm1 + rolq $32, %r9 # r9 = W[5]:W[4] + rolq $32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 movq %r10, %xmm4 - punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) - movaps %xmm1, %xmm4 - paddd %xmm6, %xmm4 - movups %xmm4, -64+4*4(%rsp) + punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - movq 4*8(%rdi), %r8 - movq 4*10(%rdi), %r10 - bswapq %r8 - bswapq %r10 - movl %r8d, %r9d # r9d = W[9] - rolq $32, %r8 # r8 = W[9]:W[8] - movl %r10d, %r11d # r11d = W[11] - rolq $32, %r10 # r10 = W[11]:W[10] - movq %r8, %xmm2 - movq %r10, %xmm4 - punpcklqdq %xmm4, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq $32, %r11 # r11 = W[9]:W[8] + rolq $32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, %xmm4 + punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - movq 4*12(%rdi), %r12 + movq 4*12(%rdi), %r13 movq 4*14(%rdi), %r14 - bswapq %r12 + bswapq %r13 bswapq %r14 - movl %r12d, %r13d # r13d = W[13] - rolq $32, %r12 # r12 = W[13]:W[12] - movl %r14d, %r15d # r15d = W[15] + rolq $32, %r13 # r13 = W[13]:W[12] rolq $32, %r14 # r14 = W[15]:W[14] - movq %r12, %xmm3 + movq %r13, %xmm3 movq %r14, %xmm4 - punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) + punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) # 0 leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] + shrq $32, %rsi movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 1 - addl -64+4*1(%rsp), %edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 2 - addl -64+4*2(%rsp), %ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] + shrq $32, %r8 movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 3 - addl -64+4*3(%rsp), %ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] movl %ebp, %edi # c 
xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 4 - addl -64+4*4(%rsp), %eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] + shrq $32, %r9 movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 5 - addl -64+4*5(%rsp), %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 6 - addl -64+4*6(%rsp), %edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] + shrq $32, %r10 movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) # 7 - addl -64+4*7(%rsp), %ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) movaps %xmm3, %xmm4 @@ -186,9 +188,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm0, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -201,48 +203,50 @@ sha1_process_block64: paddd %xmm6, %xmm5 movups %xmm5, -64+16*0(%rsp) # 8 - leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] + leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] + shrq $32, %r11 movl %ebp, %edi # c xorl %eax, %edi # ^d andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 9 - leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] + leal 0x5A827999(%rax,%r11), %eax # e += 
RCONST + W[n] movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 10 - leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] + shrq $32, %r12 movl %ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # 11 - leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] + leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] movl %ebx, %edi # c xorl %ecx, %edi # ^d andl %eax, %edi # &b xorl %ecx, %edi # (((c ^ d) & b) ^ d) addl %edi, %edx # e += (((c ^ d) & b) ^ d) - movl %ebp, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %edx # e += rotl32(a,5) + movl %ebp, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) movaps rconst0x6ED9EBA1(%rip), %xmm6 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) @@ -256,9 +260,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm1, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -271,15 +275,16 @@ sha1_process_block64: paddd %xmm6, %xmm5 movups %xmm5, -64+16*1(%rsp) # 12 - leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] + leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] + shrq $32, %r13 movl %eax, %edi # c xorl %ebx, %edi # ^d andl %ebp, %edi # &b xorl %ebx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ecx # e += (((c ^ d) & b) ^ d) - movl %edx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ecx # e += rotl32(a,5) + movl %edx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ecx # e += rotl32(a,5) rorl $2, %ebp # b = rotl32(b,30) # 13 leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] @@ -288,31 +293,32 @@ sha1_process_block64: andl %edx, %edi # &b xorl %eax, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebx # e += (((c ^ d) & b) ^ d) - movl %ecx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebx # e += rotl32(a,5) + movl %ecx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebx # e += rotl32(a,5) rorl $2, %edx # b = rotl32(b,30) # 14 leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] + shrq $32, %r14 movl %edx, %edi # c xorl %ebp, %edi # ^d andl %ecx, %edi # &b xorl %ebp, %edi # (((c ^ d) & b) ^ d) addl %edi, %eax # e += (((c ^ d) & b) ^ d) - movl %ebx, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %eax # e += rotl32(a,5) + movl %ebx, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %eax # e += rotl32(a,5) rorl $2, %ecx # b = rotl32(b,30) # 15 - leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] + leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] movl 
%ecx, %edi # c xorl %edx, %edi # ^d andl %ebx, %edi # &b xorl %edx, %edi # (((c ^ d) & b) ^ d) addl %edi, %ebp # e += (((c ^ d) & b) ^ d) - movl %eax, %esi # - roll $5, %esi # rotl32(a,5) - addl %esi, %ebp # e += rotl32(a,5) + movl %eax, %edi # + roll $5, %edi # rotl32(a,5) + addl %edi, %ebp # e += rotl32(a,5) rorl $2, %ebx # b = rotl32(b,30) # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) movaps %xmm1, %xmm4 @@ -325,9 +331,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm2, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -394,9 +400,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm3, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -459,9 +465,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm0, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -524,9 +530,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm1, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -590,9 +596,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm2, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -655,9 +661,9 @@ sha1_process_block64: # W0 = unrotated 
(W[0]..W[3]), still needs W[3] fixup movaps %xmm3, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -720,9 +726,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm0, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -797,9 +803,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm1, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -874,9 +880,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm2, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -952,9 +958,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm3, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -1029,9 +1035,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm0, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm0, %xmm0 # shift left by 1 - psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 + pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm0, %xmm0 # shift left by 1 + psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs 
W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -1106,9 +1112,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm1, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm1, %xmm1 # shift left by 1 - psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 + pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm1, %xmm1 # shift left by 1 + psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -1171,9 +1177,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm2, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm2, %xmm2 # shift left by 1 - psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 + pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm2, %xmm2 # shift left by 1 + psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -1236,9 +1242,9 @@ sha1_process_block64: # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup movaps %xmm3, %xmm5 xorps %xmm4, %xmm4 # rol(W0,1): - pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) - paddd %xmm3, %xmm3 # shift left by 1 - psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 + pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) + paddd %xmm3, %xmm3 # shift left by 1 + psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) movaps %xmm5, %xmm4 @@ -1378,7 +1384,7 @@ sha1_process_block64: addl %ebx, 84(%rdi) # ctx->hash[1] += b popq %r14 # addl %ecx, 88(%rdi) # ctx->hash[2] += c - popq %r15 # +# popq %r15 # addl %edx, 92(%rdi) # ctx->hash[3] += d popq %rbx # addl %ebp, 96(%rdi) # ctx->hash[4] += e diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 87c2d0800..47c40af0d 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -102,7 +102,7 @@ echo \ "### Generated by hash_md5_sha_x86-64.S.sh ### #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) - .section .text.sha1_process_block64,\"ax\",@progbits + .section .text.sha1_process_block64, \"ax\", @progbits .globl sha1_process_block64 .hidden sha1_process_block64 .type sha1_process_block64, @function @@ -111,7 +111,7 @@ echo \ sha1_process_block64: pushq %rbp # 1 byte insn pushq %rbx # 1 byte insn - pushq %r15 # 2 byte insn +# pushq %r15 # 2 byte insn pushq %r14 # 2 byte insn pushq %r13 # 2 byte insn pushq %r12 # 2 byte insn @@ -120,7 +120,8 @@ sha1_process_block64: #Register and stack use: # eax..edx: a..d # ebp: e -# esi,edi: temps +# esi,edi,r8..r14: temps +# r15: unused # xmm0..xmm3: W[] # xmm4,xmm5: temps # xmm6: current round constant @@ -134,59 +135,56 @@ sha1_process_block64: movaps rconst0x5A827999(%rip), $xmmRCONST - # For round 1, steps 0 and 8..15, we pass W[0,8..15] in esi,r8..r15 - # instead of spilling them to stack. - # (We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so...) + # Load W[] to xmm registers, byteswapping on the fly. 
+ # + # For iterations 0..15, we pass W[] in rsi,r8..r14 + # for use in RD1A's instead of spilling them to stack. + # We lose parallelized addition of RCONST, but LEA + # can do two additions at once, so it's probably a wash. + # (We use rsi instead of rN because this makes two + # LEAs in two first RD1A's shorter by one byte). movq 4*0(%rdi), %rsi - movq 4*2(%rdi), %r10 + movq 4*2(%rdi), %r8 bswapq %rsi - bswapq %r10 + bswapq %r8 rolq \$32, %rsi # rsi = W[1]:W[0] - rolq \$32, %r10 + rolq \$32, %r8 # r8 = W[3]:W[2] movq %rsi, %xmm0 - movq %r10, $xmmT1 - punpcklqdq $xmmT1, %xmm0 # xmm0 = r10:rsi = (W[0],W[1],W[2],W[3]) - movaps %xmm0, $xmmT1 - paddd $xmmRCONST, $xmmT1 - movups $xmmT1, -64+4*0(%rsp) + movq %r8, $xmmT1 + punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) +# movaps %xmm0, $xmmT1 # add RCONST, spill to stack +# paddd $xmmRCONST, $xmmT1 +# movups $xmmT1, -64+16*0(%rsp) - movq 4*4(%rdi), %r8 + movq 4*4(%rdi), %r9 movq 4*6(%rdi), %r10 - bswapq %r8 + bswapq %r9 bswapq %r10 - rolq \$32, %r8 - rolq \$32, %r10 - movq %r8, %xmm1 + rolq \$32, %r9 # r9 = W[5]:W[4] + rolq \$32, %r10 # r10 = W[7]:W[6] + movq %r9, %xmm1 movq %r10, $xmmT1 - punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r8 = (W[4],W[5],W[6],W[7]) - movaps %xmm1, $xmmT1 - paddd $xmmRCONST, $xmmT1 - movups $xmmT1, -64+4*4(%rsp) + punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) - movq 4*8(%rdi), %r8 - movq 4*10(%rdi), %r10 - bswapq %r8 - bswapq %r10 - movl %r8d, %r9d # r9d = W[9] - rolq \$32, %r8 # r8 = W[9]:W[8] - movl %r10d, %r11d # r11d = W[11] - rolq \$32, %r10 # r10 = W[11]:W[10] - movq %r8, %xmm2 - movq %r10, $xmmT1 - punpcklqdq $xmmT1, %xmm2 # xmm2 = r10:r8 = (W[8],W[9],W[10],W[11]) + movq 4*8(%rdi), %r11 + movq 4*10(%rdi), %r12 + bswapq %r11 + bswapq %r12 + rolq \$32, %r11 # r11 = W[9]:W[8] + rolq \$32, %r12 # r12 = W[11]:W[10] + movq %r11, %xmm2 + movq %r12, $xmmT1 + punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) - movq 4*12(%rdi), %r12 + movq 4*12(%rdi), %r13 movq 4*14(%rdi), %r14 - bswapq %r12 + bswapq %r13 bswapq %r14 - movl %r12d, %r13d # r13d = W[13] - rolq \$32, %r12 # r12 = W[13]:W[12] - movl %r14d, %r15d # r15d = W[15] + rolq \$32, %r13 # r13 = W[13]:W[12] rolq \$32, %r14 # r14 = W[15]:W[14] - movq %r12, %xmm3 + movq %r13, %xmm3 movq %r14, $xmmT1 - punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r12 = (W[12],W[13],W[14],W[15]) + punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) " PREP() { @@ -215,9 +213,9 @@ echo "# PREP $@ movaps $xmmW0, $xmmT2 xorps $xmmT1, $xmmT1 # rol(W0,1): - pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) - paddd $xmmW0, $xmmW0 # shift left by 1 - psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 + pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) + paddd $xmmW0, $xmmW0 # shift left by 1 + psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) @@ -256,23 +254,28 @@ RD1A() { local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 local n=$(($6)) local n0=$(((n+0) & 15)) +local rN=$((7+n0/2)) echo " # $n ";test $n0 = 0 && echo " leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] -";test $n0 != 0 && test $n0 -lt 8 && echo " - addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n] -";test $n0 -ge 8 && echo " - leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] + shrq \$32, %rsi +";test $n0 = 1 && echo " + leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] +";test $n0 -ge 2 && test 
$((n0 & 1)) = 0 && echo " + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] + shrq \$32, %r$rN +";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " + leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] ";echo " movl %e$c, %edi # c xorl %e$d, %edi # ^d andl %e$b, %edi # &b xorl %e$d, %edi # (((c ^ d) & b) ^ d) addl %edi, %e$e # e += (((c ^ d) & b) ^ d) - movl %e$a, %esi # - roll \$5, %esi # rotl32(a,5) - addl %esi, %e$e # e += rotl32(a,5) + movl %e$a, %edi # + roll \$5, %edi # rotl32(a,5) + addl %edi, %e$e # e += rotl32(a,5) rorl \$2, %e$b # b = rotl32(b,30) " } @@ -420,7 +423,7 @@ echo " addl %ebx, 84(%rdi) # ctx->hash[1] += b popq %r14 # addl %ecx, 88(%rdi) # ctx->hash[2] += c - popq %r15 # +# popq %r15 # addl %edx, 92(%rdi) # ctx->hash[3] += d popq %rbx # addl %ebp, 96(%rdi) # ctx->hash[4] += e -- cgit v1.2.3-55-g6feb From 6472ac942898437e040171cec991de1c0b962f72 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 3 Feb 2022 14:15:20 +0100 Subject: libbb/sha256: optional x86 hardware accelerated hashing 64 bit: function old new delta sha256_process_block64_shaNI - 730 +730 .rodata 108314 108586 +272 sha256_begin 31 83 +52 ------------------------------------------------------------------------------ (add/remove: 5/1 grow/shrink: 2/0 up/down: 1055/-1) Total: 1054 bytes 32 bit: function old new delta sha256_process_block64_shaNI - 747 +747 .rodata 104318 104590 +272 sha256_begin 29 84 +55 ------------------------------------------------------------------------------ (add/remove: 5/1 grow/shrink: 2/0 up/down: 1075/-1) Total: 1074 bytes Signed-off-by: Denys Vlasenko --- libbb/Config.src | 6 + libbb/Kbuild.src | 2 + libbb/hash_md5_sha.c | 54 ++++--- libbb/hash_md5_sha256_x86-32_shaNI.S | 283 +++++++++++++++++++++++++++++++++++ libbb/hash_md5_sha256_x86-64_shaNI.S | 281 ++++++++++++++++++++++++++++++++++ libbb/hash_md5_sha_x86-32_shaNI.S | 4 +- libbb/hash_md5_sha_x86-64.S | 2 +- libbb/hash_md5_sha_x86-64.S.sh | 2 +- libbb/hash_md5_sha_x86-64_shaNI.S | 4 +- 9 files changed, 612 insertions(+), 26 deletions(-) create mode 100644 libbb/hash_md5_sha256_x86-32_shaNI.S create mode 100644 libbb/hash_md5_sha256_x86-64_shaNI.S diff --git a/libbb/Config.src b/libbb/Config.src index 708d3b0c8..0ecd5bd46 100644 --- a/libbb/Config.src +++ b/libbb/Config.src @@ -70,6 +70,12 @@ config SHA1_HWACCEL On x86, this adds ~590 bytes of code. Throughput is about twice as fast as fully-unrolled generic code. +config SHA256_HWACCEL + bool "SHA256: Use hardware accelerated instructions if possible" + default y + help + On x86, this adds ~1k bytes of code. 
+ config SHA3_SMALL int "SHA3: Trade bytes for speed (0:fast, 1:slow)" default 1 # all "fast or small" options default to small diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index b9d34de8e..653025e56 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src @@ -59,6 +59,8 @@ lib-y += hash_md5_sha.o lib-y += hash_md5_sha_x86-64.o lib-y += hash_md5_sha_x86-64_shaNI.o lib-y += hash_md5_sha_x86-32_shaNI.o +lib-y += hash_md5_sha256_x86-64_shaNI.o +lib-y += hash_md5_sha256_x86-32_shaNI.o # Alternative (disabled) MD5 implementation #lib-y += hash_md5prime.o lib-y += messages.o diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index a23db5152..880ffab01 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c @@ -13,6 +13,27 @@ #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA) +#if ENABLE_SHA1_HWACCEL || ENABLE_SHA256_HWACCEL +# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) +{ + asm ("cpuid" + : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) + : "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx) + ); +} +static smallint shaNI; +void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx); +void FAST_FUNC sha256_process_block64_shaNI(sha256_ctx_t *ctx); +# if defined(__i386__) +struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 76)]; }; +# endif +# if defined(__x86_64__) +struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 80)]; }; +# endif +# endif +#endif + /* gcc 4.2.1 optimizes rotr64 better with inline than with macro * (for rotX32, there is no difference). Why? My guess is that * macro requires clever common subexpression elimination heuristics @@ -1142,25 +1163,6 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx) } #endif /* NEED_SHA512 */ -#if ENABLE_SHA1_HWACCEL -# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) -static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) -{ - asm ("cpuid" - : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) - : "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx) - ); -} -void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx); -# if defined(__i386__) -struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; }; -# endif -# if defined(__x86_64__) -struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; -# endif -# endif -#endif - void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) { ctx->hash[0] = 0x67452301; @@ -1173,7 +1175,6 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) #if ENABLE_SHA1_HWACCEL # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) { - static smallint shaNI; if (!shaNI) { unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx; cpuid(&eax, &ebx, &ecx, &edx); @@ -1225,6 +1226,19 @@ void FAST_FUNC sha256_begin(sha256_ctx_t *ctx) memcpy(&ctx->total64, init256, sizeof(init256)); /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */ ctx->process_block = sha256_process_block64; +#if ENABLE_SHA256_HWACCEL +# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) + { + if (!shaNI) { + unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx; + cpuid(&eax, &ebx, &ecx, &edx); + shaNI = ((ebx >> 29) << 1) - 1; + } + if (shaNI > 0) + ctx->process_block = sha256_process_block64_shaNI; + } +# endif +#endif } #if NEED_SHA512 diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S new file mode 100644 index 000000000..56e37fa38 --- /dev/null +++ 
b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -0,0 +1,283 @@ +#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA1 insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define shuf128_32 pshufd +#define shuf128_32 shufps + + .section .text.sha256_process_block64_shaNI, "ax", @progbits + .globl sha256_process_block64_shaNI + .hidden sha256_process_block64_shaNI + .type sha256_process_block64_shaNI, @function + +#define DATA_PTR %eax + +#define SHA256CONSTANTS %ecx + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 +#define MSGTMP4 %xmm7 + + .balign 8 # allow decoders to fetch at least 3 first insns +sha256_process_block64_shaNI: + pushl %ebp + movl %esp, %ebp + subl $32, %esp + andl $~0xF, %esp # paddd needs aligned memory operand + + movu128 76+0*16(%eax), STATE0 + movu128 76+1*16(%eax), STATE1 + + shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ + shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ + mova128 STATE0, MSGTMP4 + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + +# mova128 PSHUFFLE_BSWAP32_FLIP_MASK, SHUF_MASK + lea K256, SHA256CONSTANTS + + /* Save hash values for addition after rounds */ + mova128 STATE0, 0*16(%esp) + mova128 STATE1, 1*16(%esp) + + /* Rounds 0-3 */ + movu128 0*16(DATA_PTR), MSG + pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + mova128 MSG, MSGTMP0 + paddd 0*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 4-7 */ + movu128 1*16(DATA_PTR), MSG + pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + mova128 MSG, MSGTMP1 + paddd 1*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movu128 2*16(DATA_PTR), MSG + pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + mova128 MSG, MSGTMP2 + paddd 2*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 */ + movu128 3*16(DATA_PTR), MSG + pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + mova128 MSG, MSGTMP3 + paddd 3*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + mova128 MSGTMP0, MSG + paddd 4*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + mova128 MSGTMP1, MSG + paddd 5*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + mova128 MSGTMP2, 
MSG + paddd 6*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + mova128 MSGTMP3, MSG + paddd 7*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + mova128 MSGTMP0, MSG + paddd 8*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + mova128 MSGTMP1, MSG + paddd 9*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + mova128 MSGTMP2, MSG + paddd 10*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + mova128 MSGTMP3, MSG + paddd 11*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + mova128 MSGTMP0, MSG + paddd 12*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + mova128 MSGTMP1, MSG + paddd 13*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 56-59 */ + mova128 MSGTMP2, MSG + paddd 14*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 60-63 */ + mova128 MSGTMP3, MSG + paddd 15*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd 0*16(%esp), STATE0 + paddd 1*16(%esp), STATE1 + + /* Write hash values back in the correct order */ + shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ + shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ + mova128 STATE0, MSGTMP4 + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, MSGTMP4, STATE1 /* HGFE */ + + movu128 STATE0, 76+0*16(%eax) + movu128 STATE1, 76+1*16(%eax) + + movl %ebp, %esp + popl %ebp + ret + .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI + +.section .rodata.cst256.K256, "aM", @progbits, 256 +.balign 16 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 
0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 +.balign 16 +PSHUFFLE_BSWAP32_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +#endif diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S new file mode 100644 index 000000000..1c2b75af3 --- /dev/null +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -0,0 +1,281 @@ +#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__) +/* The code is adapted from Linux kernel's source */ + +// We use shorter insns, even though they are for "wrong" +// data type (fp, not int). +// For Intel, there is no penalty for doing it at all +// (CPUs which do have such penalty do not support SHA1 insns). +// For AMD, the penalty is one extra cycle +// (allegedly: I failed to find measurable difference). + +//#define mova128 movdqa +#define mova128 movaps +//#define movu128 movdqu +#define movu128 movups +//#define shuf128_32 pshufd +#define shuf128_32 shufps + + .section .text.sha256_process_block64_shaNI, "ax", @progbits + .globl sha256_process_block64_shaNI + .hidden sha256_process_block64_shaNI + .type sha256_process_block64_shaNI, @function + +#define DATA_PTR %rdi + +#define SHA256CONSTANTS %rax + +#define MSG %xmm0 +#define STATE0 %xmm1 +#define STATE1 %xmm2 +#define MSGTMP0 %xmm3 +#define MSGTMP1 %xmm4 +#define MSGTMP2 %xmm5 +#define MSGTMP3 %xmm6 +#define MSGTMP4 %xmm7 + +#define SHUF_MASK %xmm8 + +#define ABEF_SAVE %xmm9 +#define CDGH_SAVE %xmm10 + + .balign 8 # allow decoders to fetch at least 2 first insns +sha256_process_block64_shaNI: + movu128 80+0*16(%rdi), STATE0 + movu128 80+1*16(%rdi), STATE1 + + shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ + shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ + mova128 STATE0, MSGTMP4 + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + + mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), SHUF_MASK + lea K256(%rip), SHA256CONSTANTS + + /* Save hash values for addition after rounds */ + mova128 STATE0, ABEF_SAVE + mova128 STATE1, CDGH_SAVE + + /* Rounds 0-3 */ + movu128 0*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + mova128 MSG, MSGTMP0 + paddd 0*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 4-7 */ + movu128 1*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + mova128 MSG, MSGTMP1 + paddd 1*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 8-11 */ + movu128 2*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + mova128 MSG, MSGTMP2 + paddd 2*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 12-15 
*/ + movu128 3*16(DATA_PTR), MSG + pshufb SHUF_MASK, MSG + mova128 MSG, MSGTMP3 + paddd 3*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 16-19 */ + mova128 MSGTMP0, MSG + paddd 4*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 20-23 */ + mova128 MSGTMP1, MSG + paddd 5*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 24-27 */ + mova128 MSGTMP2, MSG + paddd 6*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 28-31 */ + mova128 MSGTMP3, MSG + paddd 7*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 32-35 */ + mova128 MSGTMP0, MSG + paddd 8*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 36-39 */ + mova128 MSGTMP1, MSG + paddd 9*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP1, MSGTMP0 + + /* Rounds 40-43 */ + mova128 MSGTMP2, MSG + paddd 10*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP2, MSGTMP1 + + /* Rounds 44-47 */ + mova128 MSGTMP3, MSG + paddd 11*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP3, MSGTMP4 + palignr $4, MSGTMP2, MSGTMP4 + paddd MSGTMP4, MSGTMP0 + sha256msg2 MSGTMP3, MSGTMP0 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP3, MSGTMP2 + + /* Rounds 48-51 */ + mova128 MSGTMP0, MSG + paddd 12*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP0, MSGTMP4 + palignr $4, MSGTMP3, MSGTMP4 + paddd MSGTMP4, MSGTMP1 + sha256msg2 MSGTMP0, MSGTMP1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + sha256msg1 MSGTMP0, MSGTMP3 + + /* Rounds 52-55 */ + mova128 MSGTMP1, MSG + paddd 13*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 MSGTMP1, MSGTMP4 + palignr $4, MSGTMP0, MSGTMP4 + paddd MSGTMP4, MSGTMP2 + sha256msg2 MSGTMP1, MSGTMP2 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 56-59 */ + mova128 MSGTMP2, MSG + paddd 14*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + mova128 
MSGTMP2, MSGTMP4 + palignr $4, MSGTMP1, MSGTMP4 + paddd MSGTMP4, MSGTMP3 + sha256msg2 MSGTMP2, MSGTMP3 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Rounds 60-63 */ + mova128 MSGTMP3, MSG + paddd 15*16(SHA256CONSTANTS), MSG + sha256rnds2 STATE0, STATE1 + shuf128_32 $0x0E, MSG, MSG + sha256rnds2 STATE1, STATE0 + + /* Add current hash values with previously saved */ + paddd ABEF_SAVE, STATE0 + paddd CDGH_SAVE, STATE1 + + /* Write hash values back in the correct order */ + shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ + shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ + mova128 STATE0, MSGTMP4 + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, MSGTMP4, STATE1 /* HGFE */ + + movu128 STATE0, 80+0*16(%rdi) + movu128 STATE1, 80+1*16(%rdi) + + ret + .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI + +.section .rodata.cst256.K256, "aM", @progbits, 256 +.balign 16 +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 +.balign 16 +PSHUFFLE_BSWAP32_FLIP_MASK: + .octa 0x0c0d0e0f08090a0b0405060700010203 + +#endif diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index 166cfd38a..11b855e26 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -20,7 +20,7 @@ #define extr128_32 pextrd //#define extr128_32 extractps # not shorter - .section .text.sha1_process_block64_shaNI,"ax",@progbits + .section .text.sha1_process_block64_shaNI, "ax", @progbits .globl sha1_process_block64_shaNI .hidden sha1_process_block64_shaNI .type sha1_process_block64_shaNI, @function @@ -224,7 +224,7 @@ sha1_process_block64_shaNI: .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 +.balign 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x000102030405060708090a0b0c0d0e0f diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 743269d98..47ace60de 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -1394,7 +1394,7 @@ sha1_process_block64: .size sha1_process_block64, .-sha1_process_block64 .section .rodata.cst16.sha1const, "aM", @progbits, 16 - .align 16 + .balign 16 rconst0x5A827999: .long 0x5A827999 .long 0x5A827999 diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 47c40af0d..656fb5414 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -433,7 +433,7 @@ echo " .size sha1_process_block64, .-sha1_process_block64 .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 - .align 16 + .balign 16 rconst0x5A827999: .long 0x5A827999 .long 0x5A827999 diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S 
b/libbb/hash_md5_sha_x86-64_shaNI.S index 33cc3bf7f..ba92f09df 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -20,7 +20,7 @@ #define extr128_32 pextrd //#define extr128_32 extractps # not shorter - .section .text.sha1_process_block64_shaNI,"ax",@progbits + .section .text.sha1_process_block64_shaNI, "ax", @progbits .globl sha1_process_block64_shaNI .hidden sha1_process_block64_shaNI .type sha1_process_block64_shaNI, @function @@ -218,7 +218,7 @@ sha1_process_block64_shaNI: .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.align 16 +.balign 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x000102030405060708090a0b0c0d0e0f -- cgit v1.2.3-55-g6feb From de6cb4bed82356db72af81890c7c26d7e85fb50d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 3 Feb 2022 15:11:23 +0100 Subject: libbb/sha256: code shrink in 32-bit x86 function old new delta sha256_process_block64_shaNI 747 722 -25 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 56e37fa38..632dab7e6 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -49,8 +49,7 @@ sha256_process_block64_shaNI: palignr $8, STATE1, STATE0 /* ABEF */ pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ -# mova128 PSHUFFLE_BSWAP32_FLIP_MASK, SHUF_MASK - lea K256, SHA256CONSTANTS + movl $K256+8*16, SHA256CONSTANTS /* Save hash values for addition after rounds */ mova128 STATE0, 0*16(%esp) @@ -60,7 +59,7 @@ sha256_process_block64_shaNI: movu128 0*16(DATA_PTR), MSG pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG mova128 MSG, MSGTMP0 - paddd 0*16(SHA256CONSTANTS), MSG + paddd 0*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -69,7 +68,7 @@ sha256_process_block64_shaNI: movu128 1*16(DATA_PTR), MSG pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG mova128 MSG, MSGTMP1 - paddd 1*16(SHA256CONSTANTS), MSG + paddd 1*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -79,7 +78,7 @@ sha256_process_block64_shaNI: movu128 2*16(DATA_PTR), MSG pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG mova128 MSG, MSGTMP2 - paddd 2*16(SHA256CONSTANTS), MSG + paddd 2*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -89,7 +88,7 @@ sha256_process_block64_shaNI: movu128 3*16(DATA_PTR), MSG pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG mova128 MSG, MSGTMP3 - paddd 3*16(SHA256CONSTANTS), MSG + paddd 3*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP3, MSGTMP4 palignr $4, MSGTMP2, MSGTMP4 @@ -101,7 +100,7 @@ sha256_process_block64_shaNI: /* Rounds 16-19 */ mova128 MSGTMP0, MSG - paddd 4*16(SHA256CONSTANTS), MSG + paddd 4*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP0, MSGTMP4 palignr $4, MSGTMP3, MSGTMP4 @@ -113,7 +112,7 @@ sha256_process_block64_shaNI: /* Rounds 20-23 */ mova128 MSGTMP1, MSG - paddd 5*16(SHA256CONSTANTS), MSG + paddd 5*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP1, MSGTMP4 palignr $4, MSGTMP0, MSGTMP4 @@ -125,7 +124,7 @@ sha256_process_block64_shaNI: /* Rounds 24-27 */ mova128 MSGTMP2, MSG - paddd 6*16(SHA256CONSTANTS), MSG + paddd 6*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 
MSGTMP2, MSGTMP4 palignr $4, MSGTMP1, MSGTMP4 @@ -137,7 +136,7 @@ sha256_process_block64_shaNI: /* Rounds 28-31 */ mova128 MSGTMP3, MSG - paddd 7*16(SHA256CONSTANTS), MSG + paddd 7*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP3, MSGTMP4 palignr $4, MSGTMP2, MSGTMP4 @@ -149,7 +148,7 @@ sha256_process_block64_shaNI: /* Rounds 32-35 */ mova128 MSGTMP0, MSG - paddd 8*16(SHA256CONSTANTS), MSG + paddd 8*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP0, MSGTMP4 palignr $4, MSGTMP3, MSGTMP4 @@ -161,7 +160,7 @@ sha256_process_block64_shaNI: /* Rounds 36-39 */ mova128 MSGTMP1, MSG - paddd 9*16(SHA256CONSTANTS), MSG + paddd 9*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP1, MSGTMP4 palignr $4, MSGTMP0, MSGTMP4 @@ -173,7 +172,7 @@ sha256_process_block64_shaNI: /* Rounds 40-43 */ mova128 MSGTMP2, MSG - paddd 10*16(SHA256CONSTANTS), MSG + paddd 10*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP2, MSGTMP4 palignr $4, MSGTMP1, MSGTMP4 @@ -185,7 +184,7 @@ sha256_process_block64_shaNI: /* Rounds 44-47 */ mova128 MSGTMP3, MSG - paddd 11*16(SHA256CONSTANTS), MSG + paddd 11*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP3, MSGTMP4 palignr $4, MSGTMP2, MSGTMP4 @@ -197,7 +196,7 @@ sha256_process_block64_shaNI: /* Rounds 48-51 */ mova128 MSGTMP0, MSG - paddd 12*16(SHA256CONSTANTS), MSG + paddd 12*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP0, MSGTMP4 palignr $4, MSGTMP3, MSGTMP4 @@ -209,7 +208,7 @@ sha256_process_block64_shaNI: /* Rounds 52-55 */ mova128 MSGTMP1, MSG - paddd 13*16(SHA256CONSTANTS), MSG + paddd 13*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP1, MSGTMP4 palignr $4, MSGTMP0, MSGTMP4 @@ -220,7 +219,7 @@ sha256_process_block64_shaNI: /* Rounds 56-59 */ mova128 MSGTMP2, MSG - paddd 14*16(SHA256CONSTANTS), MSG + paddd 14*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP2, MSGTMP4 palignr $4, MSGTMP1, MSGTMP4 @@ -231,7 +230,7 @@ sha256_process_block64_shaNI: /* Rounds 60-63 */ mova128 MSGTMP3, MSG - paddd 15*16(SHA256CONSTANTS), MSG + paddd 15*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 -- cgit v1.2.3-55-g6feb From a1429fbb8ca373efc01939d599f6f65969b1a366 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 3 Feb 2022 15:17:42 +0100 Subject: libbb/sha256: code shrink in 64-bit x86 function old new delta sha256_process_block64_shaNI 730 706 -24 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-64_shaNI.S | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index 1c2b75af3..f3df541e4 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -50,7 +50,7 @@ sha256_process_block64_shaNI: pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), SHUF_MASK - lea K256(%rip), SHA256CONSTANTS + leaq K256+8*16(%rip), SHA256CONSTANTS /* Save hash values for addition after rounds */ mova128 STATE0, ABEF_SAVE @@ -60,7 +60,7 @@ sha256_process_block64_shaNI: movu128 0*16(DATA_PTR), MSG pshufb SHUF_MASK, MSG mova128 MSG, MSGTMP0 - paddd 0*16(SHA256CONSTANTS), MSG + paddd 0*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -69,7 +69,7 @@ sha256_process_block64_shaNI: movu128 1*16(DATA_PTR), MSG 
pshufb SHUF_MASK, MSG mova128 MSG, MSGTMP1 - paddd 1*16(SHA256CONSTANTS), MSG + paddd 1*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -79,7 +79,7 @@ sha256_process_block64_shaNI: movu128 2*16(DATA_PTR), MSG pshufb SHUF_MASK, MSG mova128 MSG, MSGTMP2 - paddd 2*16(SHA256CONSTANTS), MSG + paddd 2*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -89,7 +89,7 @@ sha256_process_block64_shaNI: movu128 3*16(DATA_PTR), MSG pshufb SHUF_MASK, MSG mova128 MSG, MSGTMP3 - paddd 3*16(SHA256CONSTANTS), MSG + paddd 3*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP3, MSGTMP4 palignr $4, MSGTMP2, MSGTMP4 @@ -101,7 +101,7 @@ sha256_process_block64_shaNI: /* Rounds 16-19 */ mova128 MSGTMP0, MSG - paddd 4*16(SHA256CONSTANTS), MSG + paddd 4*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP0, MSGTMP4 palignr $4, MSGTMP3, MSGTMP4 @@ -113,7 +113,7 @@ sha256_process_block64_shaNI: /* Rounds 20-23 */ mova128 MSGTMP1, MSG - paddd 5*16(SHA256CONSTANTS), MSG + paddd 5*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP1, MSGTMP4 palignr $4, MSGTMP0, MSGTMP4 @@ -125,7 +125,7 @@ sha256_process_block64_shaNI: /* Rounds 24-27 */ mova128 MSGTMP2, MSG - paddd 6*16(SHA256CONSTANTS), MSG + paddd 6*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP2, MSGTMP4 palignr $4, MSGTMP1, MSGTMP4 @@ -137,7 +137,7 @@ sha256_process_block64_shaNI: /* Rounds 28-31 */ mova128 MSGTMP3, MSG - paddd 7*16(SHA256CONSTANTS), MSG + paddd 7*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP3, MSGTMP4 palignr $4, MSGTMP2, MSGTMP4 @@ -149,7 +149,7 @@ sha256_process_block64_shaNI: /* Rounds 32-35 */ mova128 MSGTMP0, MSG - paddd 8*16(SHA256CONSTANTS), MSG + paddd 8*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP0, MSGTMP4 palignr $4, MSGTMP3, MSGTMP4 @@ -161,7 +161,7 @@ sha256_process_block64_shaNI: /* Rounds 36-39 */ mova128 MSGTMP1, MSG - paddd 9*16(SHA256CONSTANTS), MSG + paddd 9*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP1, MSGTMP4 palignr $4, MSGTMP0, MSGTMP4 @@ -173,7 +173,7 @@ sha256_process_block64_shaNI: /* Rounds 40-43 */ mova128 MSGTMP2, MSG - paddd 10*16(SHA256CONSTANTS), MSG + paddd 10*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP2, MSGTMP4 palignr $4, MSGTMP1, MSGTMP4 @@ -185,7 +185,7 @@ sha256_process_block64_shaNI: /* Rounds 44-47 */ mova128 MSGTMP3, MSG - paddd 11*16(SHA256CONSTANTS), MSG + paddd 11*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP3, MSGTMP4 palignr $4, MSGTMP2, MSGTMP4 @@ -197,7 +197,7 @@ sha256_process_block64_shaNI: /* Rounds 48-51 */ mova128 MSGTMP0, MSG - paddd 12*16(SHA256CONSTANTS), MSG + paddd 12*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP0, MSGTMP4 palignr $4, MSGTMP3, MSGTMP4 @@ -209,7 +209,7 @@ sha256_process_block64_shaNI: /* Rounds 52-55 */ mova128 MSGTMP1, MSG - paddd 13*16(SHA256CONSTANTS), MSG + paddd 13*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP1, MSGTMP4 palignr $4, MSGTMP0, MSGTMP4 @@ -220,7 +220,7 @@ sha256_process_block64_shaNI: /* Rounds 56-59 */ mova128 MSGTMP2, MSG - paddd 14*16(SHA256CONSTANTS), MSG + paddd 14*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 mova128 MSGTMP2, MSGTMP4 palignr $4, MSGTMP1, MSGTMP4 @@ -231,7 +231,7 @@ sha256_process_block64_shaNI: /* Rounds 60-63 */ mova128 MSGTMP3, MSG 
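# Note on the N*16-8*16 addressing used throughout these two patches (my reading
# of the encoding rules; the commit messages only give the size deltas):
# SHA256CONSTANTS now points at K256+8*16, so the sixteen per-round constant
# loads use displacements 0*16-8*16 .. 15*16-8*16, i.e. -128..+112. All of these
# fit the one-byte disp8 form of the x86 ModRM encoding, whereas unbiased
# offsets 8*16..15*16 (128..240) would each need a four-byte disp32, which is
# where the reported -25/-24 byte shrinks appear to come from.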
- paddd 15*16(SHA256CONSTANTS), MSG + paddd 15*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 -- cgit v1.2.3-55-g6feb From 31c1c310772fa6c897ee1585ea15fc38f3ab3dff Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Feb 2022 00:30:03 +0100 Subject: libbb/sha256: code shrink in 64-bit x86 function old new delta sha256_process_block64_shaNI 706 701 -5 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-64_shaNI.S | 96 ++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index f3df541e4..dbf391135 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -31,9 +31,7 @@ #define MSGTMP1 %xmm4 #define MSGTMP2 %xmm5 #define MSGTMP3 %xmm6 -#define MSGTMP4 %xmm7 - -#define SHUF_MASK %xmm8 +#define XMMTMP4 %xmm7 #define ABEF_SAVE %xmm9 #define CDGH_SAVE %xmm10 @@ -45,11 +43,12 @@ sha256_process_block64_shaNI: shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ - mova128 STATE0, MSGTMP4 + mova128 STATE0, XMMTMP4 palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + pblendw $0xF0, XMMTMP4, STATE1 /* CDGH */ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), SHUF_MASK +/* XMMTMP4 holds flip mask from here... */ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP4 leaq K256+8*16(%rip), SHA256CONSTANTS /* Save hash values for addition after rounds */ @@ -58,7 +57,7 @@ sha256_process_block64_shaNI: /* Rounds 0-3 */ movu128 0*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG + pshufb XMMTMP4, MSG mova128 MSG, MSGTMP0 paddd 0*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -67,7 +66,7 @@ sha256_process_block64_shaNI: /* Rounds 4-7 */ movu128 1*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG + pshufb XMMTMP4, MSG mova128 MSG, MSGTMP1 paddd 1*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -77,7 +76,7 @@ sha256_process_block64_shaNI: /* Rounds 8-11 */ movu128 2*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG + pshufb XMMTMP4, MSG mova128 MSG, MSGTMP2 paddd 2*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -87,13 +86,14 @@ sha256_process_block64_shaNI: /* Rounds 12-15 */ movu128 3*16(DATA_PTR), MSG - pshufb SHUF_MASK, MSG + pshufb XMMTMP4, MSG +/* ...to here */ mova128 MSG, MSGTMP3 paddd 3*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP4 + palignr $4, MSGTMP2, XMMTMP4 + paddd XMMTMP4, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -103,9 +103,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 4*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP4 + palignr $4, MSGTMP3, XMMTMP4 + paddd XMMTMP4, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -115,9 +115,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 5*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP4 + palignr $4, MSGTMP0, XMMTMP4 + paddd XMMTMP4, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -127,9 +127,9 @@ sha256_process_block64_shaNI: mova128 
MSGTMP2, MSG paddd 6*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP4 + palignr $4, MSGTMP1, XMMTMP4 + paddd XMMTMP4, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -139,9 +139,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 7*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP4 + palignr $4, MSGTMP2, XMMTMP4 + paddd XMMTMP4, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -151,9 +151,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 8*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP4 + palignr $4, MSGTMP3, XMMTMP4 + paddd XMMTMP4, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -163,9 +163,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 9*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP4 + palignr $4, MSGTMP0, XMMTMP4 + paddd XMMTMP4, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -175,9 +175,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 10*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP4 + palignr $4, MSGTMP1, XMMTMP4 + paddd XMMTMP4, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -187,9 +187,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 11*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP4 + palignr $4, MSGTMP2, XMMTMP4 + paddd XMMTMP4, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -199,9 +199,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 12*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP4 + palignr $4, MSGTMP3, XMMTMP4 + paddd XMMTMP4, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -211,9 +211,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 13*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP4 + palignr $4, MSGTMP0, XMMTMP4 + paddd XMMTMP4, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -222,9 +222,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 14*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP4 + palignr $4, MSGTMP1, XMMTMP4 + paddd XMMTMP4, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -243,9 +243,9 @@ sha256_process_block64_shaNI: /* Write hash values back in the correct order */ shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ 
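# Note on the SHUF_MASK -> XMMTMP4 change above (my reading; the commit message
# only reports the -5 byte delta): parking the byte-swap mask in %xmm7 while the
# four input blocks are loaded, then recycling %xmm7 as the schedule scratch
# register, means %xmm8 is no longer referenced at all. Instructions touching
# %xmm8..%xmm15 need an extra REX prefix byte, so dropping the one mova128 load
# and the four pshufb uses of %xmm8 plausibly accounts for the reported delta.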
shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ - mova128 STATE0, MSGTMP4 + mova128 STATE0, XMMTMP4 pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, MSGTMP4, STATE1 /* HGFE */ + palignr $8, XMMTMP4, STATE1 /* HGFE */ movu128 STATE0, 80+0*16(%rdi) movu128 STATE1, 80+1*16(%rdi) -- cgit v1.2.3-55-g6feb From 4f40735c87f8292a87c066b3b7099b0be007cf59 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Feb 2022 00:55:52 +0100 Subject: libbb/sha256: code shrink in 32-bit x86 function old new delta sha256_process_block64_shaNI 722 713 -9 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 93 +++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 45 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 632dab7e6..417da37d8 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -31,7 +31,7 @@ #define MSGTMP1 %xmm4 #define MSGTMP2 %xmm5 #define MSGTMP3 %xmm6 -#define MSGTMP4 %xmm7 +#define XMMTMP4 %xmm7 .balign 8 # allow decoders to fetch at least 3 first insns sha256_process_block64_shaNI: @@ -45,10 +45,12 @@ sha256_process_block64_shaNI: shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ - mova128 STATE0, MSGTMP4 + mova128 STATE0, XMMTMP4 palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, MSGTMP4, STATE1 /* CDGH */ + pblendw $0xF0, XMMTMP4, STATE1 /* CDGH */ +/* XMMTMP4 holds flip mask from here... */ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP4 movl $K256+8*16, SHA256CONSTANTS /* Save hash values for addition after rounds */ @@ -57,7 +59,7 @@ sha256_process_block64_shaNI: /* Rounds 0-3 */ movu128 0*16(DATA_PTR), MSG - pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + pshufb XMMTMP4, MSG mova128 MSG, MSGTMP0 paddd 0*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -66,7 +68,7 @@ sha256_process_block64_shaNI: /* Rounds 4-7 */ movu128 1*16(DATA_PTR), MSG - pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + pshufb XMMTMP4, MSG mova128 MSG, MSGTMP1 paddd 1*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -76,7 +78,7 @@ sha256_process_block64_shaNI: /* Rounds 8-11 */ movu128 2*16(DATA_PTR), MSG - pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + pshufb XMMTMP4, MSG mova128 MSG, MSGTMP2 paddd 2*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -86,13 +88,14 @@ sha256_process_block64_shaNI: /* Rounds 12-15 */ movu128 3*16(DATA_PTR), MSG - pshufb PSHUFFLE_BSWAP32_FLIP_MASK, MSG + pshufb XMMTMP4, MSG +/* ...to here */ mova128 MSG, MSGTMP3 paddd 3*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP4 + palignr $4, MSGTMP2, XMMTMP4 + paddd XMMTMP4, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -102,9 +105,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 4*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP4 + palignr $4, MSGTMP3, XMMTMP4 + paddd XMMTMP4, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -114,9 +117,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 5*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP4 + palignr $4, MSGTMP0, XMMTMP4 + paddd XMMTMP4, MSGTMP2 
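# Note on the 32-bit counterpart above (my own byte accounting, not stated in
# the commit message): caching PSHUFFLE_BSWAP32_FLIP_MASK in XMMTMP4 turns four
# pshufb-from-memory uses, each carrying a 4-byte absolute address, into plain
# register-to-register pshufb at the cost of one extra mova128 load of the mask,
# which is consistent with the reported -9 bytes.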
sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -126,9 +129,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 6*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP4 + palignr $4, MSGTMP1, XMMTMP4 + paddd XMMTMP4, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -138,9 +141,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 7*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP4 + palignr $4, MSGTMP2, XMMTMP4 + paddd XMMTMP4, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -150,9 +153,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 8*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP4 + palignr $4, MSGTMP3, XMMTMP4 + paddd XMMTMP4, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -162,9 +165,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 9*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP4 + palignr $4, MSGTMP0, XMMTMP4 + paddd XMMTMP4, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -174,9 +177,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 10*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP4 + palignr $4, MSGTMP1, XMMTMP4 + paddd XMMTMP4, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -186,9 +189,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 11*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, MSGTMP4 - palignr $4, MSGTMP2, MSGTMP4 - paddd MSGTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP4 + palignr $4, MSGTMP2, XMMTMP4 + paddd XMMTMP4, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -198,9 +201,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 12*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, MSGTMP4 - palignr $4, MSGTMP3, MSGTMP4 - paddd MSGTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP4 + palignr $4, MSGTMP3, XMMTMP4 + paddd XMMTMP4, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -210,9 +213,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 13*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, MSGTMP4 - palignr $4, MSGTMP0, MSGTMP4 - paddd MSGTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP4 + palignr $4, MSGTMP0, XMMTMP4 + paddd XMMTMP4, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -221,9 +224,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 14*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, MSGTMP4 - palignr $4, MSGTMP1, MSGTMP4 - paddd MSGTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP4 + palignr $4, MSGTMP1, XMMTMP4 + paddd XMMTMP4, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -242,9 
+245,9 @@ sha256_process_block64_shaNI: /* Write hash values back in the correct order */ shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ - mova128 STATE0, MSGTMP4 + mova128 STATE0, XMMTMP4 pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, MSGTMP4, STATE1 /* HGFE */ + palignr $8, XMMTMP4, STATE1 /* HGFE */ movu128 STATE0, 76+0*16(%eax) movu128 STATE1, 76+1*16(%eax) -- cgit v1.2.3-55-g6feb From ca466f385ac985a8b3491daa9f326dc480cdee70 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Feb 2022 19:53:10 +0100 Subject: *: slap on a few ALIGN* where appropriate The result of looking at "grep -F -B2 '*fill*' busybox_unstripped.map" function old new delta .rodata 108586 108460 -126 ------------------------------------------------------------------------------ (add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-126) Total: -126 bytes text data bss dec hex filename 970412 4219 1848 976479 ee65f busybox_old 970286 4219 1848 976353 ee5e1 busybox_unstripped Signed-off-by: Denys Vlasenko --- console-tools/reset.c | 2 +- coreutils/od.c | 2 +- include/platform.h | 1 + libbb/appletlib.c | 2 +- libbb/get_console.c | 2 +- miscutils/bc.c | 2 +- miscutils/man.c | 2 +- networking/ifupdown.c | 8 ++++---- networking/interface.c | 6 +++--- networking/libiproute/ipaddress.c | 2 +- networking/udhcp/common.c | 2 +- networking/udhcp/d6_dhcpc.c | 2 +- shell/ash.c | 2 +- util-linux/hexdump.c | 2 +- util-linux/nsenter.c | 2 +- util-linux/unshare.c | 2 +- 16 files changed, 21 insertions(+), 20 deletions(-) diff --git a/console-tools/reset.c b/console-tools/reset.c index b3acf69f8..cc04e4fcc 100644 --- a/console-tools/reset.c +++ b/console-tools/reset.c @@ -36,7 +36,7 @@ int stty_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int reset_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int reset_main(int argc UNUSED_PARAM, char **argv UNUSED_PARAM) { - static const char *const args[] = { + static const char *const args[] ALIGN_PTR = { "stty", "sane", NULL }; diff --git a/coreutils/od.c b/coreutils/od.c index 9a888dd5f..6f22331e0 100644 --- a/coreutils/od.c +++ b/coreutils/od.c @@ -144,7 +144,7 @@ odoffset(dumper_t *dumper, int argc, char ***argvp) } } -static const char *const add_strings[] = { +static const char *const add_strings[] ALIGN_PTR = { "16/1 \"%3_u \" \"\\n\"", /* a */ "8/2 \" %06o \" \"\\n\"", /* B, o */ "16/1 \"%03o \" \"\\n\"", /* b */ diff --git a/include/platform.h b/include/platform.h index ad27bb31a..ea0512f36 100644 --- a/include/platform.h +++ b/include/platform.h @@ -346,6 +346,7 @@ typedef unsigned smalluint; # define ALIGN4 #endif #define ALIGN8 __attribute__((aligned(8))) +#define ALIGN_INT __attribute__((aligned(sizeof(int)))) #define ALIGN_PTR __attribute__((aligned(sizeof(void*)))) /* diff --git a/libbb/appletlib.c b/libbb/appletlib.c index 03389f541..841b3b873 100644 --- a/libbb/appletlib.c +++ b/libbb/appletlib.c @@ -651,7 +651,7 @@ static void check_suid(int applet_no) # if ENABLE_FEATURE_INSTALLER static const char usr_bin [] ALIGN1 = "/usr/bin/"; static const char usr_sbin[] ALIGN1 = "/usr/sbin/"; -static const char *const install_dir[] = { +static const char *const install_dir[] ALIGN_PTR = { &usr_bin [8], /* "/" */ &usr_bin [4], /* "/bin/" */ &usr_sbin[4] /* "/sbin/" */ diff --git a/libbb/get_console.c b/libbb/get_console.c index 7f2c75332..9044efea1 100644 --- a/libbb/get_console.c +++ b/libbb/get_console.c @@ -37,7 +37,7 @@ static int open_a_console(const char *fnam) */ int FAST_FUNC get_console_fd_or_die(void) { - static const 
char *const console_names[] = { + static const char *const console_names[] ALIGN_PTR = { DEV_CONSOLE, CURRENT_VC, CURRENT_TTY }; diff --git a/miscutils/bc.c b/miscutils/bc.c index ae370ff55..ab785bbc8 100644 --- a/miscutils/bc.c +++ b/miscutils/bc.c @@ -6011,7 +6011,7 @@ static BC_STATUS zxc_program_assign(char inst) #endif if (ib || sc || left->t == XC_RESULT_OBASE) { - static const char *const msg[] = { + static const char *const msg[] ALIGN_PTR = { "bad ibase; must be [2,16]", //XC_RESULT_IBASE "bad obase; must be [2,"BC_MAX_OBASE_STR"]", //XC_RESULT_OBASE "bad scale; must be [0,"BC_MAX_SCALE_STR"]", //XC_RESULT_SCALE diff --git a/miscutils/man.c b/miscutils/man.c index d319e8bba..deaf9e5ab 100644 --- a/miscutils/man.c +++ b/miscutils/man.c @@ -303,7 +303,7 @@ int man_main(int argc UNUSED_PARAM, char **argv) config_close(parser); if (!man_path_list) { - static const char *const mpl[] = { "/usr/man", "/usr/share/man", NULL }; + static const char *const mpl[] ALIGN_PTR = { "/usr/man", "/usr/share/man", NULL }; man_path_list = (char**)mpl; /*count_mp = 2; - not used below anyway */ } diff --git a/networking/ifupdown.c b/networking/ifupdown.c index 737113dd4..6c4ae27f2 100644 --- a/networking/ifupdown.c +++ b/networking/ifupdown.c @@ -532,7 +532,7 @@ static int FAST_FUNC v4tunnel_down(struct interface_defn_t * ifd, execfn * exec) } # endif -static const struct method_t methods6[] = { +static const struct method_t methods6[] ALIGN_PTR = { # if ENABLE_FEATURE_IFUPDOWN_IP { "v4tunnel" , v4tunnel_up , v4tunnel_down , }, # endif @@ -627,7 +627,7 @@ struct dhcp_client_t { const char *stopcmd; }; -static const struct dhcp_client_t ext_dhcp_clients[] = { +static const struct dhcp_client_t ext_dhcp_clients[] ALIGN_PTR = { { "dhcpcd", "dhcpcd[[ -h %hostname%]][[ -i %vendor%]][[ -I %client%]][[ -l %leasetime%]] %iface%", "dhcpcd -k %iface%", @@ -774,7 +774,7 @@ static int FAST_FUNC wvdial_down(struct interface_defn_t *ifd, execfn *exec) "-p /var/run/wvdial.%iface% -s 2", ifd, exec); } -static const struct method_t methods[] = { +static const struct method_t methods[] ALIGN_PTR = { { "manual" , manual_up_down, manual_up_down, }, { "wvdial" , wvdial_up , wvdial_down , }, { "ppp" , ppp_up , ppp_down , }, @@ -797,7 +797,7 @@ static int FAST_FUNC link_up_down(struct interface_defn_t *ifd UNUSED_PARAM, exe return 1; } -static const struct method_t link_methods[] = { +static const struct method_t link_methods[] ALIGN_PTR = { { "none", link_up_down, link_up_down } }; diff --git a/networking/interface.c b/networking/interface.c index ea6a2c8a8..6b6c0944a 100644 --- a/networking/interface.c +++ b/networking/interface.c @@ -446,13 +446,13 @@ static char *get_name(char name[IFNAMSIZ], char *p) * %n specifiers (even the size of integers may not match). 
*/ #if INT_MAX == LONG_MAX -static const char *const ss_fmt[] = { +static const char *const ss_fmt[] ALIGN_PTR = { "%n%llu%u%u%u%u%n%n%n%llu%u%u%u%u%u", "%llu%llu%u%u%u%u%n%n%llu%llu%u%u%u%u%u", "%llu%llu%u%u%u%u%u%u%llu%llu%u%u%u%u%u%u" }; #else -static const char *const ss_fmt[] = { +static const char *const ss_fmt[] ALIGN_PTR = { "%n%llu%lu%lu%lu%lu%n%n%n%llu%lu%lu%lu%lu%lu", "%llu%llu%lu%lu%lu%lu%n%n%llu%llu%lu%lu%lu%lu%lu", "%llu%llu%lu%lu%lu%lu%lu%lu%llu%llu%lu%lu%lu%lu%lu%lu" @@ -731,7 +731,7 @@ static const struct hwtype ib_hwtype = { #endif -static const struct hwtype *const hwtypes[] = { +static const struct hwtype *const hwtypes[] ALIGN_PTR = { &loop_hwtype, &ether_hwtype, &ppp_hwtype, diff --git a/networking/libiproute/ipaddress.c b/networking/libiproute/ipaddress.c index 17a838411..ecc3848ff 100644 --- a/networking/libiproute/ipaddress.c +++ b/networking/libiproute/ipaddress.c @@ -58,7 +58,7 @@ typedef struct filter_t filter_t; static void print_link_flags(unsigned flags, unsigned mdown) { - static const int flag_masks[] = { + static const int flag_masks[] ALIGN_INT = { IFF_LOOPBACK, IFF_BROADCAST, IFF_POINTOPOINT, IFF_MULTICAST, IFF_NOARP, IFF_UP, IFF_LOWER_UP }; static const char flag_labels[] ALIGN1 = diff --git a/networking/udhcp/common.c b/networking/udhcp/common.c index 8e9b93655..ae818db05 100644 --- a/networking/udhcp/common.c +++ b/networking/udhcp/common.c @@ -19,7 +19,7 @@ const uint8_t MAC_BCAST_ADDR[6] ALIGN2 = { * See RFC2132 for more options. * OPTION_REQ: these options are requested by udhcpc (unless -o). */ -const struct dhcp_optflag dhcp_optflags[] = { +const struct dhcp_optflag dhcp_optflags[] ALIGN2 = { /* flags code */ { OPTION_IP | OPTION_REQ, 0x01 }, /* DHCP_SUBNET */ { OPTION_S32 , 0x02 }, /* DHCP_TIME_OFFSET */ diff --git a/networking/udhcp/d6_dhcpc.c b/networking/udhcp/d6_dhcpc.c index 9d2a8f5d3..9fc690315 100644 --- a/networking/udhcp/d6_dhcpc.c +++ b/networking/udhcp/d6_dhcpc.c @@ -65,7 +65,7 @@ /* "struct client_data_t client_data" is in bb_common_bufsiz1 */ -static const struct dhcp_optflag d6_optflags[] = { +static const struct dhcp_optflag d6_optflags[] ALIGN2 = { #if ENABLE_FEATURE_UDHCPC6_RFC3646 { OPTION_6RD | OPTION_LIST | OPTION_REQ, D6_OPT_DNS_SERVERS }, { OPTION_DNS_STRING | OPTION_LIST | OPTION_REQ, D6_OPT_DOMAIN_LIST }, diff --git a/shell/ash.c b/shell/ash.c index 55df54bd0..adb0f223a 100644 --- a/shell/ash.c +++ b/shell/ash.c @@ -313,7 +313,7 @@ typedef long arith_t; /* ============ Shell options */ /* If you add/change options hare, update --help text too */ -static const char *const optletters_optnames[] = { +static const char *const optletters_optnames[] ALIGN_PTR = { "e" "errexit", "f" "noglob", /* bash has '-o ignoreeof', but no short synonym -I for it */ diff --git a/util-linux/hexdump.c b/util-linux/hexdump.c index 57e7e8db7..307a84803 100644 --- a/util-linux/hexdump.c +++ b/util-linux/hexdump.c @@ -71,7 +71,7 @@ static void bb_dump_addfile(dumper_t *dumper, char *name) fclose(fp); } -static const char *const add_strings[] = { +static const char *const add_strings[] ALIGN_PTR = { "\"%07.7_ax \"16/1 \"%03o \"\"\n\"", /* b */ "\"%07.7_ax \"16/1 \"%3_c \"\"\n\"", /* c */ "\"%07.7_ax \"8/2 \" %05u \"\"\n\"", /* d */ diff --git a/util-linux/nsenter.c b/util-linux/nsenter.c index e6339da2f..1aa045b35 100644 --- a/util-linux/nsenter.c +++ b/util-linux/nsenter.c @@ -93,7 +93,7 @@ enum { * The user namespace comes first, so that it is entered first. * This gives an unprivileged user the potential to enter other namespaces.
*/ -static const struct namespace_descr ns_list[] = { +static const struct namespace_descr ns_list[] ALIGN_INT = { { CLONE_NEWUSER, "ns/user", }, { CLONE_NEWIPC, "ns/ipc", }, { CLONE_NEWUTS, "ns/uts", }, diff --git a/util-linux/unshare.c b/util-linux/unshare.c index 68ccdd874..06b938074 100644 --- a/util-linux/unshare.c +++ b/util-linux/unshare.c @@ -120,7 +120,7 @@ enum { NS_USR_POS, /* OPT_user, NS_USR_POS, and ns_list[] index must match! */ NS_COUNT, }; -static const struct namespace_descr ns_list[] = { +static const struct namespace_descr ns_list[] ALIGN_INT = { { CLONE_NEWNS, "mnt" }, { CLONE_NEWUTS, "uts" }, { CLONE_NEWIPC, "ipc" }, -- cgit v1.2.3-55-g6feb From 987be932ed3cbea56b68bbe85649191c13b66015 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 6 Feb 2022 20:07:12 +0100 Subject: *: slap on a few ALIGN_PTR where appropriate Signed-off-by: Denys Vlasenko --- coreutils/test.c | 2 +- e2fsprogs/fsck.c | 2 +- libbb/getopt32.c | 2 +- miscutils/devfsd.c | 4 ++-- modutils/modutils-24.c | 4 ++-- networking/inetd.c | 2 +- procps/nmeter.c | 2 +- selinux/setenforce.c | 2 +- shell/hush.c | 10 +++++----- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/coreutils/test.c b/coreutils/test.c index a914c7490..840a0daaf 100644 --- a/coreutils/test.c +++ b/coreutils/test.c @@ -242,7 +242,7 @@ int depth; depth--; \ return __res; \ } while (0) -static const char *const TOKSTR[] = { +static const char *const TOKSTR[] ALIGN_PTR = { "EOI", "FILRD", "FILWR", diff --git a/e2fsprogs/fsck.c b/e2fsprogs/fsck.c index 96c1e51e0..028f8a803 100644 --- a/e2fsprogs/fsck.c +++ b/e2fsprogs/fsck.c @@ -190,7 +190,7 @@ struct globals { * Required for the uber-silly devfs /dev/ide/host1/bus2/target3/lun3 * pathames. */ -static const char *const devfs_hier[] = { +static const char *const devfs_hier[] ALIGN_PTR = { "host", "bus", "target", "lun", NULL }; #endif diff --git a/libbb/getopt32.c b/libbb/getopt32.c index 5ab4d66f1..e861d0567 100644 --- a/libbb/getopt32.c +++ b/libbb/getopt32.c @@ -296,7 +296,7 @@ Special characters: /* Code here assumes that 'unsigned' is at least 32 bits wide */ -const char *const bb_argv_dash[] = { "-", NULL }; +const char *const bb_argv_dash[] ALIGN_PTR = { "-", NULL }; enum { PARAM_STRING, diff --git a/miscutils/devfsd.c b/miscutils/devfsd.c index 839d00fd0..fb9ebcf60 100644 --- a/miscutils/devfsd.c +++ b/miscutils/devfsd.c @@ -928,7 +928,7 @@ static void action_compat(const struct devfsd_notify_struct *info, unsigned int unsigned int i; char rewind_; /* 1 to 5 "scsi/" , 6 to 9 "ide/host" */ - static const char *const fmt[] = { + static const char *const fmt[] ALIGN_PTR = { NULL , "sg/c%db%dt%du%d", /* scsi/generic */ "sd/c%db%dt%du%d", /* scsi/disc */ @@ -1468,7 +1468,7 @@ const char *get_old_name(const char *devname, unsigned int namelen, const char *pty1; const char *pty2; /* 1 to 5 "scsi/" , 6 to 9 "ide/host", 10 sbp/, 11 vcc/, 12 pty/ */ - static const char *const fmt[] = { + static const char *const fmt[] ALIGN_PTR = { NULL , "sg%u", /* scsi/generic */ NULL, /* scsi/disc */ diff --git a/modutils/modutils-24.c b/modutils/modutils-24.c index ac8632481..d0bc2a6ef 100644 --- a/modutils/modutils-24.c +++ b/modutils/modutils-24.c @@ -3458,7 +3458,7 @@ static int obj_load_progbits(char *image, size_t image_size, struct obj_file *f, static void hide_special_symbols(struct obj_file *f) { - static const char *const specials[] = { + static const char *const specials[] ALIGN_PTR = { SPFX "cleanup_module", SPFX "init_module", SPFX "kernel_version", @@ -3484,7 +3484,7 @@ static 
int obj_gpl_license(struct obj_file *f, const char **license) * linux/include/linux/module.h. Checking for leading "GPL" will not * work, somebody will use "GPL sucks, this is proprietary". */ - static const char *const gpl_licenses[] = { + static const char *const gpl_licenses[] ALIGN_PTR = { "GPL", "GPL v2", "GPL and additional rights", diff --git a/networking/inetd.c b/networking/inetd.c index e71be51c3..fb2fbe323 100644 --- a/networking/inetd.c +++ b/networking/inetd.c @@ -1538,7 +1538,7 @@ int inetd_main(int argc UNUSED_PARAM, char **argv) #if ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_ECHO \ || ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_DISCARD # if !BB_MMU -static const char *const cat_args[] = { "cat", NULL }; +static const char *const cat_args[] ALIGN_PTR = { "cat", NULL }; # endif #endif diff --git a/procps/nmeter.c b/procps/nmeter.c index 2310e9844..088d366bf 100644 --- a/procps/nmeter.c +++ b/procps/nmeter.c @@ -70,7 +70,7 @@ typedef struct proc_file { smallint last_gen; } proc_file; -static const char *const proc_name[] = { +static const char *const proc_name[] ALIGN_PTR = { "stat", // Must match the order of proc_file's! "loadavg", "net/dev", diff --git a/selinux/setenforce.c b/selinux/setenforce.c index 996034f8e..2267be451 100644 --- a/selinux/setenforce.c +++ b/selinux/setenforce.c @@ -26,7 +26,7 @@ /* These strings are arranged so that odd ones * result in security_setenforce(1) being done, * the rest will do security_setenforce(0) */ -static const char *const setenforce_cmd[] = { +static const char *const setenforce_cmd[] ALIGN_PTR = { "0", "1", "permissive", diff --git a/shell/hush.c b/shell/hush.c index 6dc2ecaac..ae81f0da5 100644 --- a/shell/hush.c +++ b/shell/hush.c @@ -564,7 +564,7 @@ enum { #define NULL_O_STRING { NULL } #ifndef debug_printf_parse -static const char *const assignment_flag[] = { +static const char *const assignment_flag[] ALIGN_PTR = { "MAYBE_ASSIGNMENT", "DEFINITELY_ASSIGNMENT", "NOT_ASSIGNMENT", @@ -3682,7 +3682,7 @@ static void free_pipe_list(struct pipe *pi) #ifndef debug_print_tree static void debug_print_tree(struct pipe *pi, int lvl) { - static const char *const PIPE[] = { + static const char *const PIPE[] ALIGN_PTR = { [PIPE_SEQ] = "SEQ", [PIPE_AND] = "AND", [PIPE_OR ] = "OR" , @@ -3717,7 +3717,7 @@ static void debug_print_tree(struct pipe *pi, int lvl) [RES_XXXX ] = "XXXX" , [RES_SNTX ] = "SNTX" , }; - static const char *const CMDTYPE[] = { + static const char *const CMDTYPE[] ALIGN_PTR = { "{}", "()", "[noglob]", @@ -7659,7 +7659,7 @@ static int generate_stream_from_string(const char *s, pid_t *pid_p) if (is_prefixed_with(s, "trap") && skip_whitespace(s + 4)[0] == '\0' ) { - static const char *const argv[] = { NULL, NULL }; + static const char *const argv[] ALIGN_PTR = { NULL, NULL }; builtin_trap((char**)argv); fflush_all(); /* important */ _exit(0); @@ -9826,7 +9826,7 @@ static int run_list(struct pipe *pi) static const char encoded_dollar_at[] ALIGN1 = { SPECIAL_VAR_SYMBOL, '@' | 0x80, SPECIAL_VAR_SYMBOL, '\0' }; /* encoded representation of "$@" */ - static const char *const encoded_dollar_at_argv[] = { + static const char *const encoded_dollar_at_argv[] ALIGN_PTR = { encoded_dollar_at, NULL }; /* argv list with one element: "$@" */ char **vals; -- cgit v1.2.3-55-g6feb From c193cbd6dfd095c6b8346bab1ea6ba7106b3e5bb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 7 Feb 2022 02:06:18 +0100 Subject: libbb/sha1: shrink and speed up unrolled x86-64 code function old new delta sha1_process_block64 3514 3482 -32 Signed-off-by: Denys Vlasenko --- 
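The change itself, repeated in every PREP block of the unrolled SHA1 code below, replaces the pair "pshufd $0x4e + punpcklqdq" with "movaps + shufps $0x4e", which computes the same ([2],[3],[4],[5]) word group. The following standalone C sketch is my own illustration, not part of the patch: the variable names and the mapping onto SSE2 intrinsics are mine, and it only checks that the two sequences agree. By my count of the legacy-SSE encodings, the new pair is also two bytes shorter per block, which over the sixteen PREP blocks matches the reported -32.

/* Illustrative check: both PREP sequences yield W[2],W[3],W[4],W[5]. */
#include <stdio.h>
#include <string.h>
#include <emmintrin.h>	/* _mm_shuffle_epi32 (pshufd), _mm_unpacklo_epi64 (punpcklqdq) */
#include <xmmintrin.h>	/* _mm_shuffle_ps (shufps) */

int main(void)
{
	__m128i w0_3 = _mm_setr_epi32(0, 1, 2, 3);	/* W[0..3] */
	__m128i w4_7 = _mm_setr_epi32(4, 5, 6, 7);	/* W[4..7] */

	/* old:  pshufd $0x4e, %xmm0, %xmm5;  punpcklqdq %xmm1, %xmm5 */
	__m128i t_old = _mm_unpacklo_epi64(_mm_shuffle_epi32(w0_3, 0x4e), w4_7);

	/* new:  movaps %xmm0, %xmm5;  shufps $0x4e, %xmm1, %xmm5 */
	__m128 t_new = _mm_shuffle_ps(_mm_castsi128_ps(w0_3), _mm_castsi128_ps(w4_7), 0x4e);

	int a[4], b[4];
	memcpy(a, &t_old, sizeof(a));
	memcpy(b, &t_new, sizeof(b));
	printf("old: %d %d %d %d\n", a[0], a[1], a[2], a[3]);	/* 2 3 4 5 */
	printf("new: %d %d %d %d\n", b[0], b[1], b[2], b[3]);	/* 2 3 4 5 */
	return 0;
}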
libbb/hash_md5_sha256_x86-32_shaNI.S | 8 +- libbb/hash_md5_sha256_x86-64_shaNI.S | 8 +- libbb/hash_md5_sha_x86-32_shaNI.S | 4 +- libbb/hash_md5_sha_x86-64.S | 144 +++++++++++++++++++++++++++-------- libbb/hash_md5_sha_x86-64.S.sh | 9 ++- libbb/hash_md5_sha_x86-64_shaNI.S | 4 +- 6 files changed, 131 insertions(+), 46 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 417da37d8..39e2baf41 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -257,8 +257,8 @@ sha256_process_block64_shaNI: ret .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI -.section .rodata.cst256.K256, "aM", @progbits, 256 -.balign 16 + .section .rodata.cst256.K256, "aM", @progbits, 256 + .balign 16 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 @@ -277,8 +277,8 @@ K256: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 -.balign 16 + .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 + .balign 16 PSHUFFLE_BSWAP32_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index dbf391135..c6c931341 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -253,8 +253,8 @@ sha256_process_block64_shaNI: ret .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI -.section .rodata.cst256.K256, "aM", @progbits, 256 -.balign 16 + .section .rodata.cst256.K256, "aM", @progbits, 256 + .balign 16 K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 @@ -273,8 +273,8 @@ K256: .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 -.balign 16 + .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 + .balign 16 PSHUFFLE_BSWAP32_FLIP_MASK: .octa 0x0c0d0e0f08090a0b0405060700010203 diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index 11b855e26..5d082ebfb 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -223,8 +223,8 @@ sha1_process_block64_shaNI: ret .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.balign 16 + .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 + .balign 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x000102030405060708090a0b0c0d0e0f diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 47ace60de..e26c46f25 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -180,8 +180,13 @@ sha1_process_block64: # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) movaps %xmm3, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and 
dwords 2,3 from 1st one! + movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm0 # ^ @@ -252,8 +257,13 @@ sha1_process_block64: # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) movaps %xmm0, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm1 # ^ @@ -323,8 +333,13 @@ sha1_process_block64: # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) movaps %xmm1, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm2 # ^ @@ -392,8 +407,13 @@ sha1_process_block64: # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) movaps %xmm2, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm3 # ^ @@ -457,8 +477,13 @@ sha1_process_block64: # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) movaps %xmm3, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm0 # ^ @@ -522,8 +547,13 @@ sha1_process_block64: # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) movaps %xmm0, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm1 # ^ @@ -588,8 +618,13 @@ sha1_process_block64: # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) movaps %xmm1, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm2 # ^ @@ -653,8 +688,13 @@ sha1_process_block64: # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) movaps %xmm2, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm3 # ^ @@ -718,8 +758,13 @@ sha1_process_block64: # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) movaps %xmm3, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm0 # ^ @@ -795,8 +840,13 @@ sha1_process_block64: # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) movaps %xmm0, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm1 # ^ @@ -872,8 +922,13 @@ sha1_process_block64: # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) movaps %xmm1, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm2 # ^ @@ -950,8 +1005,13 @@ sha1_process_block64: # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) movaps %xmm2, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm3 # ^ @@ -1027,8 +1087,13 @@ sha1_process_block64: # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) movaps %xmm3, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm0, %xmm5 + shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm0 # ^ @@ -1104,8 +1169,13 @@ sha1_process_block64: # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) movaps %xmm0, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm1, %xmm5 + shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm1 # ^ @@ -1169,8 +1239,13 @@ sha1_process_block64: # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) movaps %xmm1, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps %xmm2, %xmm5 + shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm2 # ^ @@ -1234,8 +1309,13 @@ sha1_process_block64: # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) movaps %xmm2, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! 
+ movaps %xmm3, %xmm5 + shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) xorps %xmm5, %xmm3 # ^ diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 656fb5414..fb1e4b57e 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -203,8 +203,13 @@ echo "# PREP $@ movaps $xmmW12, $xmmT1 psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) - pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) - punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) +# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) +# same result as above, but shorter and faster: +# pshufd/shufps are subtly different: pshufd takes all dwords from source operand, +# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! + movaps $xmmW0, $xmmT2 + shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index ba92f09df..8ddec87ce 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -217,8 +217,8 @@ sha1_process_block64_shaNI: ret .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI -.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 -.balign 16 + .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 + .balign 16 PSHUFFLE_BYTE_FLIP_MASK: .octa 0x000102030405060708090a0b0c0d0e0f -- cgit v1.2.3-55-g6feb From 4923f74e5873b25b8205a4059964cff75ee731a8 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 8 Feb 2022 03:29:16 +0100 Subject: libbb/sha1: shrink unrolled x86-64 code function old new delta sha1_process_block64 3482 3481 -1 .rodata 108460 108412 -48 ------------------------------------------------------------------------------ (add/remove: 1/4 grow/shrink: 0/2 up/down: 0/-49) Total: -49 bytes Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-64.S | 33 ++++++++++----------------------- libbb/hash_md5_sha_x86-64.S.sh | 34 +++++++++++----------------------- 2 files changed, 21 insertions(+), 46 deletions(-) diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index e26c46f25..287cfe547 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S @@ -24,6 +24,7 @@ sha1_process_block64: # xmm0..xmm3: W[] # xmm4,xmm5: temps # xmm6: current round constant +# xmm7: all round constants # -64(%rsp): area for passing RCONST + W[] from vector to integer units movl 80(%rdi), %eax # a = ctx->hash[0] @@ -32,16 +33,17 @@ sha1_process_block64: movl 92(%rdi), %edx # d = ctx->hash[3] movl 96(%rdi), %ebp # e = ctx->hash[4] - movaps rconst0x5A827999(%rip), %xmm6 + movaps sha1const(%rip), %xmm7 + pshufd $0x00, %xmm7, %xmm6 # Load W[] to xmm registers, byteswapping on the fly. # # For iterations 0..15, we pass W[] in rsi,r8..r14 - # for use in RD1A's instead of spilling them to stack. + # for use in RD1As instead of spilling them to stack. # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it's probably a wash. 
+ # can do two additions at once, so it is probably a wash. # (We use rsi instead of rN because this makes two - # LEAs in two first RD1A's shorter by one byte). + # LEAs in two first RD1As shorter by one byte). movq 4*0(%rdi), %rsi movq 4*2(%rdi), %r8 bswapq %rsi @@ -253,7 +255,7 @@ sha1_process_block64: roll $5, %edi # rotl32(a,5) addl %edi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) - movaps rconst0x6ED9EBA1(%rip), %xmm6 + pshufd $0x55, %xmm7, %xmm6 # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) movaps %xmm0, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) @@ -614,7 +616,7 @@ sha1_process_block64: roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) - movaps rconst0x8F1BBCDC(%rip), %xmm6 + pshufd $0xaa, %xmm7, %xmm6 # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) movaps %xmm1, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) @@ -1001,7 +1003,7 @@ sha1_process_block64: roll $5, %esi # rotl32(a,5) addl %esi, %edx # e += rotl32(a,5) rorl $2, %eax # b = rotl32(b,30) - movaps rconst0xCA62C1D6(%rip), %xmm6 + pshufd $0xff, %xmm7, %xmm6 # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) movaps %xmm2, %xmm4 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) @@ -1475,25 +1477,10 @@ sha1_process_block64: .section .rodata.cst16.sha1const, "aM", @progbits, 16 .balign 16 -rconst0x5A827999: +sha1const: .long 0x5A827999 - .long 0x5A827999 - .long 0x5A827999 - .long 0x5A827999 -rconst0x6ED9EBA1: - .long 0x6ED9EBA1 - .long 0x6ED9EBA1 - .long 0x6ED9EBA1 .long 0x6ED9EBA1 -rconst0x8F1BBCDC: .long 0x8F1BBCDC - .long 0x8F1BBCDC - .long 0x8F1BBCDC - .long 0x8F1BBCDC -rconst0xCA62C1D6: - .long 0xCA62C1D6 - .long 0xCA62C1D6 - .long 0xCA62C1D6 .long 0xCA62C1D6 #endif diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index fb1e4b57e..a10ac411d 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh @@ -34,6 +34,7 @@ exec >hash_md5_sha_x86-64.S xmmT1="%xmm4" xmmT2="%xmm5" xmmRCONST="%xmm6" +xmmALLRCONST="%xmm7" T=`printf '\t'` # SSE instructions are longer than 4 bytes on average. @@ -125,6 +126,7 @@ sha1_process_block64: # xmm0..xmm3: W[] # xmm4,xmm5: temps # xmm6: current round constant +# xmm7: all round constants # -64(%rsp): area for passing RCONST + W[] from vector to integer units movl 80(%rdi), %eax # a = ctx->hash[0] @@ -133,16 +135,17 @@ sha1_process_block64: movl 92(%rdi), %edx # d = ctx->hash[3] movl 96(%rdi), %ebp # e = ctx->hash[4] - movaps rconst0x5A827999(%rip), $xmmRCONST + movaps sha1const(%rip), $xmmALLRCONST + pshufd \$0x00, $xmmALLRCONST, $xmmRCONST # Load W[] to xmm registers, byteswapping on the fly. # # For iterations 0..15, we pass W[] in rsi,r8..r14 - # for use in RD1A's instead of spilling them to stack. + # for use in RD1As instead of spilling them to stack. # We lose parallelized addition of RCONST, but LEA - # can do two additions at once, so it's probably a wash. + # can do two additions at once, so it is probably a wash. # (We use rsi instead of rN because this makes two - # LEAs in two first RD1A's shorter by one byte). + # LEAs in two first RD1As shorter by one byte). 
movq 4*0(%rdi), %rsi movq 4*2(%rdi), %r8 bswapq %rsi @@ -359,7 +362,7 @@ RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` INTERLEAVE "$a" "$b" -a=`echo " movaps rconst0x6ED9EBA1(%rip), $xmmRCONST" +a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST" PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;` INTERLEAVE "$a" "$b" @@ -378,7 +381,7 @@ INTERLEAVE "$a" "$b" a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` INTERLEAVE "$a" "$b" -a=`echo " movaps rconst0x8F1BBCDC(%rip), $xmmRCONST" +a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST" PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` INTERLEAVE "$a" "$b" @@ -397,7 +400,7 @@ INTERLEAVE "$a" "$b" a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` INTERLEAVE "$a" "$b" -a=`echo " movaps rconst0xCA62C1D6(%rip), $xmmRCONST" +a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST" PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` INTERLEAVE "$a" "$b" @@ -439,25 +442,10 @@ echo " .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 .balign 16 -rconst0x5A827999: +sha1const: .long 0x5A827999 - .long 0x5A827999 - .long 0x5A827999 - .long 0x5A827999 -rconst0x6ED9EBA1: - .long 0x6ED9EBA1 - .long 0x6ED9EBA1 - .long 0x6ED9EBA1 .long 0x6ED9EBA1 -rconst0x8F1BBCDC: .long 0x8F1BBCDC - .long 0x8F1BBCDC - .long 0x8F1BBCDC - .long 0x8F1BBCDC -rconst0xCA62C1D6: - .long 0xCA62C1D6 - .long 0xCA62C1D6 - .long 0xCA62C1D6 .long 0xCA62C1D6 #endif" -- cgit v1.2.3-55-g6feb From 71a1cccaad679bd102f87283f78c581a8fb0e255 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 8 Feb 2022 08:20:27 +0100 Subject: libbb/sha1: shrink x86 hardware accelerated hashing function old new delta sha1_process_block64_shaNI 32-bit 524 517 -7 sha1_process_block64_shaNI 64-bit 510 508 -2 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-32_shaNI.S | 37 +++++++++++++++++-------------------- libbb/hash_md5_sha_x86-64_shaNI.S | 24 ++++++++++++------------ 2 files changed, 29 insertions(+), 32 deletions(-) diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index 5d082ebfb..0f3fe57ca 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -32,14 +32,10 @@ #define MSG1 %xmm4 #define MSG2 %xmm5 #define MSG3 %xmm6 -#define SHUF_MASK %xmm7 - .balign 8 # allow decoders to fetch at least 3 first insns + .balign 8 # allow decoders to fetch at least 2 first insns sha1_process_block64_shaNI: - pushl %ebp - movl %esp, %ebp - subl $32, %esp - andl $~0xF, %esp # paddd needs aligned memory operand + subl $16, %esp /* load initial hash values */ xor128 E0, E0 @@ -47,30 +43,33 @@ sha1_process_block64_shaNI: pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD - mova128 PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK + mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7 + + movu128 0*16(%eax), MSG0 + pshufb %xmm7, MSG0 + movu128 1*16(%eax), MSG1 + pshufb %xmm7, MSG1 + 
movu128 2*16(%eax), MSG2 + pshufb %xmm7, MSG2 + movu128 3*16(%eax), MSG3 + pshufb %xmm7, MSG3 /* Save hash values for addition after rounds */ - movu128 E0, 16(%esp) + movu128 E0, %xmm7 movu128 ABCD, (%esp) /* Rounds 0-3 */ - movu128 0*16(%eax), MSG0 - pshufb SHUF_MASK, MSG0 paddd MSG0, E0 mova128 ABCD, E1 sha1rnds4 $0, E0, ABCD /* Rounds 4-7 */ - movu128 1*16(%eax), MSG1 - pshufb SHUF_MASK, MSG1 sha1nexte MSG1, E1 mova128 ABCD, E0 sha1rnds4 $0, E1, ABCD sha1msg1 MSG1, MSG0 /* Rounds 8-11 */ - movu128 2*16(%eax), MSG2 - pshufb SHUF_MASK, MSG2 sha1nexte MSG2, E0 mova128 ABCD, E1 sha1rnds4 $0, E0, ABCD @@ -78,8 +77,6 @@ sha1_process_block64_shaNI: xor128 MSG2, MSG0 /* Rounds 12-15 */ - movu128 3*16(%eax), MSG3 - pshufb SHUF_MASK, MSG3 sha1nexte MSG3, E1 mova128 ABCD, E0 sha1msg2 MSG3, MSG0 @@ -210,16 +207,16 @@ sha1_process_block64_shaNI: sha1rnds4 $3, E1, ABCD /* Add current hash values with previously saved */ - sha1nexte 16(%esp), E0 - paddd (%esp), ABCD + sha1nexte %xmm7, E0 + movu128 (%esp), %xmm7 + paddd %xmm7, ABCD /* Write hash values back in the correct order */ shuf128_32 $0x1B, ABCD, ABCD movu128 ABCD, 76(%eax) extr128_32 $3, E0, 76+4*4(%eax) - movl %ebp, %esp - popl %ebp + addl $16, %esp ret .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index 8ddec87ce..fc2ca92e8 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -32,7 +32,6 @@ #define MSG1 %xmm4 #define MSG2 %xmm5 #define MSG3 %xmm6 -#define SHUF_MASK %xmm7 .balign 8 # allow decoders to fetch at least 2 first insns sha1_process_block64_shaNI: @@ -43,30 +42,33 @@ sha1_process_block64_shaNI: pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD - mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK + mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7 + + movu128 0*16(%rdi), MSG0 + pshufb %xmm7, MSG0 + movu128 1*16(%rdi), MSG1 + pshufb %xmm7, MSG1 + movu128 2*16(%rdi), MSG2 + pshufb %xmm7, MSG2 + movu128 3*16(%rdi), MSG3 + pshufb %xmm7, MSG3 /* Save hash values for addition after rounds */ - mova128 E0, %xmm9 + mova128 E0, %xmm7 mova128 ABCD, %xmm8 /* Rounds 0-3 */ - movu128 0*16(%rdi), MSG0 - pshufb SHUF_MASK, MSG0 paddd MSG0, E0 mova128 ABCD, E1 sha1rnds4 $0, E0, ABCD /* Rounds 4-7 */ - movu128 1*16(%rdi), MSG1 - pshufb SHUF_MASK, MSG1 sha1nexte MSG1, E1 mova128 ABCD, E0 sha1rnds4 $0, E1, ABCD sha1msg1 MSG1, MSG0 /* Rounds 8-11 */ - movu128 2*16(%rdi), MSG2 - pshufb SHUF_MASK, MSG2 sha1nexte MSG2, E0 mova128 ABCD, E1 sha1rnds4 $0, E0, ABCD @@ -74,8 +76,6 @@ sha1_process_block64_shaNI: xor128 MSG2, MSG0 /* Rounds 12-15 */ - movu128 3*16(%rdi), MSG3 - pshufb SHUF_MASK, MSG3 sha1nexte MSG3, E1 mova128 ABCD, E0 sha1msg2 MSG3, MSG0 @@ -206,7 +206,7 @@ sha1_process_block64_shaNI: sha1rnds4 $3, E1, ABCD /* Add current hash values with previously saved */ - sha1nexte %xmm9, E0 + sha1nexte %xmm7, E0 paddd %xmm8, ABCD /* Write hash values back in the correct order */ -- cgit v1.2.3-55-g6feb From eb52e7fa522d829fb400461ca4c808ee5c1d6428 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 8 Feb 2022 15:23:26 +0100 Subject: libbb/sha1: shrink x86 hardware accelerated hashing (32-bit) function old new delta sha1_process_block64_shaNI 517 511 -6 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-32_shaNI.S | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index 
0f3fe57ca..ad814a21b 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -35,11 +35,9 @@ .balign 8 # allow decoders to fetch at least 2 first insns sha1_process_block64_shaNI: - subl $16, %esp - /* load initial hash values */ - xor128 E0, E0 movu128 76(%eax), ABCD + xor128 E0, E0 pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD @@ -56,7 +54,7 @@ sha1_process_block64_shaNI: /* Save hash values for addition after rounds */ movu128 E0, %xmm7 - movu128 ABCD, (%esp) + /*movu128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ /* Rounds 0-3 */ paddd MSG0, E0 @@ -208,7 +206,9 @@ sha1_process_block64_shaNI: /* Add current hash values with previously saved */ sha1nexte %xmm7, E0 - movu128 (%esp), %xmm7 + /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */ + movu128 76(%eax), %xmm7 # recreate original ABCD + shuf128_32 $0x1B, %xmm7, %xmm7 # DCBA -> ABCD paddd %xmm7, ABCD /* Write hash values back in the correct order */ @@ -216,7 +216,6 @@ sha1_process_block64_shaNI: movu128 ABCD, 76(%eax) extr128_32 $3, E0, 76+4*4(%eax) - addl $16, %esp ret .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI -- cgit v1.2.3-55-g6feb From eb8d5f3b8f3c91f3ed82a52b4ce52a154c146ede Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 8 Feb 2022 15:34:02 +0100 Subject: libbb/sha1: shrink x86 hardware accelerated hashing (32-bit) function old new delta sha1_process_block64_shaNI 511 507 -4 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha_x86-32_shaNI.S | 9 ++++----- libbb/hash_md5_sha_x86-64_shaNI.S | 3 +-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index ad814a21b..a61b3cbed 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S @@ -53,8 +53,8 @@ sha1_process_block64_shaNI: pshufb %xmm7, MSG3 /* Save hash values for addition after rounds */ - movu128 E0, %xmm7 - /*movu128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ + mova128 E0, %xmm7 + /*mova128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ /* Rounds 0-3 */ paddd MSG0, E0 @@ -207,12 +207,11 @@ sha1_process_block64_shaNI: /* Add current hash values with previously saved */ sha1nexte %xmm7, E0 /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */ - movu128 76(%eax), %xmm7 # recreate original ABCD - shuf128_32 $0x1B, %xmm7, %xmm7 # DCBA -> ABCD - paddd %xmm7, ABCD + movu128 76(%eax), %xmm7 # get original ABCD (not shuffled)... 
/* Write hash values back in the correct order */ shuf128_32 $0x1B, ABCD, ABCD + paddd %xmm7, ABCD # ...add it to final ABCD movu128 ABCD, 76(%eax) extr128_32 $3, E0, 76+4*4(%eax) diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index fc2ca92e8..b32029360 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S @@ -36,9 +36,8 @@ .balign 8 # allow decoders to fetch at least 2 first insns sha1_process_block64_shaNI: /* load initial hash values */ - - xor128 E0, E0 movu128 80(%rdi), ABCD + xor128 E0, E0 pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD -- cgit v1.2.3-55-g6feb From c0ff0d4528d718c20b9ca2290bd10d59e9f794a3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 9 Feb 2022 00:33:39 +0100 Subject: libbb/sha256: code shrink in 32-bit x86 function old new delta sha256_process_block64_shaNI 713 697 -16 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 130 ++++++++++++++++------------------- libbb/hash_md5_sha256_x86-64_shaNI.S | 107 ++++++++++++++-------------- 2 files changed, 114 insertions(+), 123 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 39e2baf41..a849dfcc2 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -31,35 +31,27 @@ #define MSGTMP1 %xmm4 #define MSGTMP2 %xmm5 #define MSGTMP3 %xmm6 -#define XMMTMP4 %xmm7 - .balign 8 # allow decoders to fetch at least 3 first insns -sha256_process_block64_shaNI: - pushl %ebp - movl %esp, %ebp - subl $32, %esp - andl $~0xF, %esp # paddd needs aligned memory operand +#define XMMTMP %xmm7 + .balign 8 # allow decoders to fetch at least 2 first insns +sha256_process_block64_shaNI: movu128 76+0*16(%eax), STATE0 movu128 76+1*16(%eax), STATE1 - shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ - shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ - mova128 STATE0, XMMTMP4 - palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, XMMTMP4, STATE1 /* CDGH */ + shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ + shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ + mova128 STATE0, XMMTMP + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, XMMTMP, STATE1 /* CDGH */ -/* XMMTMP4 holds flip mask from here... */ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP4 +/* XMMTMP holds flip mask from here... 
*/ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP movl $K256+8*16, SHA256CONSTANTS - /* Save hash values for addition after rounds */ - mova128 STATE0, 0*16(%esp) - mova128 STATE1, 1*16(%esp) - /* Rounds 0-3 */ movu128 0*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG mova128 MSG, MSGTMP0 paddd 0*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -68,7 +60,7 @@ sha256_process_block64_shaNI: /* Rounds 4-7 */ movu128 1*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG mova128 MSG, MSGTMP1 paddd 1*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -78,7 +70,7 @@ sha256_process_block64_shaNI: /* Rounds 8-11 */ movu128 2*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG mova128 MSG, MSGTMP2 paddd 2*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -88,14 +80,14 @@ sha256_process_block64_shaNI: /* Rounds 12-15 */ movu128 3*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG /* ...to here */ mova128 MSG, MSGTMP3 paddd 3*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, XMMTMP4 - palignr $4, MSGTMP2, XMMTMP4 - paddd XMMTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -105,9 +97,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 4*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, XMMTMP4 - palignr $4, MSGTMP3, XMMTMP4 - paddd XMMTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -117,9 +109,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 5*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, XMMTMP4 - palignr $4, MSGTMP0, XMMTMP4 - paddd XMMTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -129,9 +121,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 6*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, XMMTMP4 - palignr $4, MSGTMP1, XMMTMP4 - paddd XMMTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -141,9 +133,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 7*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, XMMTMP4 - palignr $4, MSGTMP2, XMMTMP4 - paddd XMMTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -153,9 +145,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 8*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, XMMTMP4 - palignr $4, MSGTMP3, XMMTMP4 - paddd XMMTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -165,9 +157,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 9*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, XMMTMP4 - palignr $4, MSGTMP0, XMMTMP4 - paddd XMMTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, 
MSG sha256rnds2 STATE1, STATE0 @@ -177,9 +169,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 10*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, XMMTMP4 - palignr $4, MSGTMP1, XMMTMP4 - paddd XMMTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -189,9 +181,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 11*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, XMMTMP4 - palignr $4, MSGTMP2, XMMTMP4 - paddd XMMTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -201,9 +193,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 12*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, XMMTMP4 - palignr $4, MSGTMP3, XMMTMP4 - paddd XMMTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -213,9 +205,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 13*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, XMMTMP4 - palignr $4, MSGTMP0, XMMTMP4 - paddd XMMTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -224,9 +216,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 14*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, XMMTMP4 - palignr $4, MSGTMP1, XMMTMP4 - paddd XMMTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -238,22 +230,20 @@ sha256_process_block64_shaNI: shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 - /* Add current hash values with previously saved */ - paddd 0*16(%esp), STATE0 - paddd 1*16(%esp), STATE1 - /* Write hash values back in the correct order */ - shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ - shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ - mova128 STATE0, XMMTMP4 - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, XMMTMP4, STATE1 /* HGFE */ - + shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ + shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ + mova128 STATE0, XMMTMP + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, XMMTMP, STATE1 /* HGFE */ + /* add current hash values to previous ones */ + movu128 76+0*16(%eax), XMMTMP + paddd XMMTMP, STATE0 + movu128 76+1*16(%eax), XMMTMP movu128 STATE0, 76+0*16(%eax) + paddd XMMTMP, STATE1 movu128 STATE1, 76+1*16(%eax) - movl %ebp, %esp - popl %ebp ret .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index c6c931341..b5c950a9a 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -31,7 +31,8 @@ #define MSGTMP1 %xmm4 #define MSGTMP2 %xmm5 #define MSGTMP3 %xmm6 -#define XMMTMP4 %xmm7 + +#define XMMTMP %xmm7 #define ABEF_SAVE %xmm9 #define CDGH_SAVE %xmm10 @@ -41,14 +42,14 @@ sha256_process_block64_shaNI: movu128 80+0*16(%rdi), STATE0 movu128 80+1*16(%rdi), STATE1 - shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ - shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ - mova128 STATE0, XMMTMP4 - 
palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, XMMTMP4, STATE1 /* CDGH */ + shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ + shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ + mova128 STATE0, XMMTMP + palignr $8, STATE1, STATE0 /* ABEF */ + pblendw $0xF0, XMMTMP, STATE1 /* CDGH */ -/* XMMTMP4 holds flip mask from here... */ - mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP4 +/* XMMTMP holds flip mask from here... */ + mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP leaq K256+8*16(%rip), SHA256CONSTANTS /* Save hash values for addition after rounds */ @@ -57,7 +58,7 @@ sha256_process_block64_shaNI: /* Rounds 0-3 */ movu128 0*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG mova128 MSG, MSGTMP0 paddd 0*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -66,7 +67,7 @@ sha256_process_block64_shaNI: /* Rounds 4-7 */ movu128 1*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG mova128 MSG, MSGTMP1 paddd 1*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -76,7 +77,7 @@ sha256_process_block64_shaNI: /* Rounds 8-11 */ movu128 2*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG mova128 MSG, MSGTMP2 paddd 2*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 @@ -86,14 +87,14 @@ sha256_process_block64_shaNI: /* Rounds 12-15 */ movu128 3*16(DATA_PTR), MSG - pshufb XMMTMP4, MSG + pshufb XMMTMP, MSG /* ...to here */ mova128 MSG, MSGTMP3 paddd 3*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, XMMTMP4 - palignr $4, MSGTMP2, XMMTMP4 - paddd XMMTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -103,9 +104,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 4*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, XMMTMP4 - palignr $4, MSGTMP3, XMMTMP4 - paddd XMMTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -115,9 +116,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 5*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, XMMTMP4 - palignr $4, MSGTMP0, XMMTMP4 - paddd XMMTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -127,9 +128,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 6*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, XMMTMP4 - palignr $4, MSGTMP1, XMMTMP4 - paddd XMMTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -139,9 +140,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 7*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, XMMTMP4 - palignr $4, MSGTMP2, XMMTMP4 - paddd XMMTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -151,9 +152,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 8*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, XMMTMP4 - palignr $4, MSGTMP3, XMMTMP4 - paddd XMMTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 sha256msg2 
MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -163,9 +164,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 9*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, XMMTMP4 - palignr $4, MSGTMP0, XMMTMP4 - paddd XMMTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -175,9 +176,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 10*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, XMMTMP4 - palignr $4, MSGTMP1, XMMTMP4 - paddd XMMTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -187,9 +188,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP3, MSG paddd 11*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP3, XMMTMP4 - palignr $4, MSGTMP2, XMMTMP4 - paddd XMMTMP4, MSGTMP0 + mova128 MSGTMP3, XMMTMP + palignr $4, MSGTMP2, XMMTMP + paddd XMMTMP, MSGTMP0 sha256msg2 MSGTMP3, MSGTMP0 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -199,9 +200,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP0, MSG paddd 12*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP0, XMMTMP4 - palignr $4, MSGTMP3, XMMTMP4 - paddd XMMTMP4, MSGTMP1 + mova128 MSGTMP0, XMMTMP + palignr $4, MSGTMP3, XMMTMP + paddd XMMTMP, MSGTMP1 sha256msg2 MSGTMP0, MSGTMP1 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -211,9 +212,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP1, MSG paddd 13*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP1, XMMTMP4 - palignr $4, MSGTMP0, XMMTMP4 - paddd XMMTMP4, MSGTMP2 + mova128 MSGTMP1, XMMTMP + palignr $4, MSGTMP0, XMMTMP + paddd XMMTMP, MSGTMP2 sha256msg2 MSGTMP1, MSGTMP2 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -222,9 +223,9 @@ sha256_process_block64_shaNI: mova128 MSGTMP2, MSG paddd 14*16-8*16(SHA256CONSTANTS), MSG sha256rnds2 STATE0, STATE1 - mova128 MSGTMP2, XMMTMP4 - palignr $4, MSGTMP1, XMMTMP4 - paddd XMMTMP4, MSGTMP3 + mova128 MSGTMP2, XMMTMP + palignr $4, MSGTMP1, XMMTMP + paddd XMMTMP, MSGTMP3 sha256msg2 MSGTMP2, MSGTMP3 shuf128_32 $0x0E, MSG, MSG sha256rnds2 STATE1, STATE0 @@ -241,11 +242,11 @@ sha256_process_block64_shaNI: paddd CDGH_SAVE, STATE1 /* Write hash values back in the correct order */ - shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ - shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ - mova128 STATE0, XMMTMP4 - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, XMMTMP4, STATE1 /* HGFE */ + shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ + shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ + mova128 STATE0, XMMTMP + pblendw $0xF0, STATE1, STATE0 /* DCBA */ + palignr $8, XMMTMP, STATE1 /* HGFE */ movu128 STATE0, 80+0*16(%rdi) movu128 STATE1, 80+1*16(%rdi) -- cgit v1.2.3-55-g6feb From 461a994b09c5022b93bccccf903b39438d61bbf1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 9 Feb 2022 01:30:23 +0100 Subject: libbb/sha256: code shrink in 32-bit x86 function old new delta sha256_process_block64_shaNI 697 676 -21 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index a849dfcc2..846230e3e 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ 
b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -34,16 +34,18 @@ #define XMMTMP %xmm7 +#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) + .balign 8 # allow decoders to fetch at least 2 first insns sha256_process_block64_shaNI: - movu128 76+0*16(%eax), STATE0 - movu128 76+1*16(%eax), STATE1 - shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ - shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ + movu128 76+0*16(%eax), STATE1 /* DCBA (msb-to-lsb: 3,2,1,0) */ + movu128 76+1*16(%eax), STATE0 /* HGFE */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ mova128 STATE0, XMMTMP - palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, XMMTMP, STATE1 /* CDGH */ + shufps SHUF(1,0,1,0), STATE1, STATE0 /* ABEF */ + shufps SHUF(3,2,3,2), STATE1, XMMTMP /* CDGH */ + mova128 XMMTMP, STATE1 /* XMMTMP holds flip mask from here... */ mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP @@ -231,18 +233,19 @@ sha256_process_block64_shaNI: sha256rnds2 STATE1, STATE0 /* Write hash values back in the correct order */ - shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ - shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ + /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */ + /* STATE1: CDGH */ mova128 STATE0, XMMTMP - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, XMMTMP, STATE1 /* HGFE */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ /* add current hash values to previous ones */ + movu128 76+1*16(%eax), STATE1 + paddd XMMTMP, STATE1 + movu128 STATE1, 76+1*16(%eax) movu128 76+0*16(%eax), XMMTMP paddd XMMTMP, STATE0 - movu128 76+1*16(%eax), XMMTMP movu128 STATE0, 76+0*16(%eax) - paddd XMMTMP, STATE1 - movu128 STATE1, 76+1*16(%eax) ret .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI -- cgit v1.2.3-55-g6feb From 11bcea7ac0ac4b2156c1b2d53f926d789b9792b4 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 9 Feb 2022 01:42:49 +0100 Subject: libbb/sha256: code shrink in 64-bit x86 function old new delta sha256_process_block64_shaNI 701 680 -21 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-64_shaNI.S | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index b5c950a9a..bc063b9cc 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -37,16 +37,18 @@ #define ABEF_SAVE %xmm9 #define CDGH_SAVE %xmm10 +#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) + .balign 8 # allow decoders to fetch at least 2 first insns sha256_process_block64_shaNI: - movu128 80+0*16(%rdi), STATE0 - movu128 80+1*16(%rdi), STATE1 - shuf128_32 $0xB1, STATE0, STATE0 /* CDAB */ - shuf128_32 $0x1B, STATE1, STATE1 /* EFGH */ + movu128 80+0*16(%rdi), STATE1 /* DCBA (msb-to-lsb: 3,2,1,0) */ + movu128 80+1*16(%rdi), STATE0 /* HGFE */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ mova128 STATE0, XMMTMP - palignr $8, STATE1, STATE0 /* ABEF */ - pblendw $0xF0, XMMTMP, STATE1 /* CDGH */ + shufps SHUF(1,0,1,0), STATE1, STATE0 /* ABEF */ + shufps SHUF(3,2,3,2), STATE1, XMMTMP /* CDGH */ + mova128 XMMTMP, STATE1 /* XMMTMP holds flip mask from here... 
*/ mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP @@ -242,14 +244,15 @@ sha256_process_block64_shaNI: paddd CDGH_SAVE, STATE1 /* Write hash values back in the correct order */ - shuf128_32 $0x1B, STATE0, STATE0 /* FEBA */ - shuf128_32 $0xB1, STATE1, STATE1 /* DCHG */ + /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */ + /* STATE1: CDGH */ mova128 STATE0, XMMTMP - pblendw $0xF0, STATE1, STATE0 /* DCBA */ - palignr $8, XMMTMP, STATE1 /* HGFE */ +/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ + shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ + shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ movu128 STATE0, 80+0*16(%rdi) - movu128 STATE1, 80+1*16(%rdi) + movu128 XMMTMP, 80+1*16(%rdi) ret .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI -- cgit v1.2.3-55-g6feb From caa9c4f707b661cf398f2c2d66f54f5b0d8adfe2 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 9 Feb 2022 01:50:22 +0100 Subject: libbb/sha256: code shrink in x86 assembly function old new delta sha256_process_block64_shaNI 32-bit 676 673 -3 sha256_process_block64_shaNI 64-bit 680 677 -3 Signed-off-by: Denys Vlasenko --- libbb/hash_md5_sha256_x86-32_shaNI.S | 11 +++++------ libbb/hash_md5_sha256_x86-64_shaNI.S | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index 846230e3e..aa68193bd 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S @@ -39,13 +39,12 @@ .balign 8 # allow decoders to fetch at least 2 first insns sha256_process_block64_shaNI: - movu128 76+0*16(%eax), STATE1 /* DCBA (msb-to-lsb: 3,2,1,0) */ - movu128 76+1*16(%eax), STATE0 /* HGFE */ + movu128 76+0*16(%eax), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ + movu128 76+1*16(%eax), STATE1 /* HGFE */ /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - mova128 STATE0, XMMTMP - shufps SHUF(1,0,1,0), STATE1, STATE0 /* ABEF */ - shufps SHUF(3,2,3,2), STATE1, XMMTMP /* CDGH */ - mova128 XMMTMP, STATE1 + mova128 STATE1, STATE0 + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ /* XMMTMP holds flip mask from here... */ mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index bc063b9cc..4663f750a 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S @@ -42,13 +42,12 @@ .balign 8 # allow decoders to fetch at least 2 first insns sha256_process_block64_shaNI: - movu128 80+0*16(%rdi), STATE1 /* DCBA (msb-to-lsb: 3,2,1,0) */ - movu128 80+1*16(%rdi), STATE0 /* HGFE */ + movu128 80+0*16(%rdi), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ + movu128 80+1*16(%rdi), STATE1 /* HGFE */ /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ - mova128 STATE0, XMMTMP - shufps SHUF(1,0,1,0), STATE1, STATE0 /* ABEF */ - shufps SHUF(3,2,3,2), STATE1, XMMTMP /* CDGH */ - mova128 XMMTMP, STATE1 + mova128 STATE1, STATE0 + shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ + shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ /* XMMTMP holds flip mask from here... */ mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP -- cgit v1.2.3-55-g6feb
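The shufps trick that runs through the commits above is easiest to see with intrinsics. Below is a minimal standalone C sketch, not taken from busybox: the file name, variable names, and test values are invented for illustration, and GCC or Clang on x86-64 with SSE2 intrinsics is assumed. It reproduces the SHUF() encoding from the patches, contrasts pshufd with shufps, and performs the same DCBA/HGFE -> ABEF/CDGH repacking that the SHA-NI code feeds to sha256rnds2.

/*
 * shuf_demo.c - standalone illustration (not part of the patches above) of
 * the shufps lane selection used by the SHUF() macro and the ABEF/CDGH
 * repacking. Assumptions: GCC/Clang on x86-64; names and values are
 * invented for this sketch only.
 *
 *   $ gcc -O2 shuf_demo.c -o shuf_demo && ./shuf_demo
 */
#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

/* Same encoding as the SHUF(a,b,c,d) macro in the patches: result lanes 0,1
 * are picked from the "destination" operand (first intrinsic argument),
 * lanes 2,3 from the "source" operand (second intrinsic argument). */
#define SHUF(a,b,c,d)  ((a) + ((b) << 2) + ((c) << 4) + ((d) << 6))

static void dump(const char *name, __m128i v)
{
	uint32_t w[4];
	_mm_storeu_si128((__m128i *)w, v);
	/* printed msb-to-lsb, matching the DCBA/HGFE comments in the patches */
	printf("%-7s %08x %08x %08x %08x\n", name,
		(unsigned)w[3], (unsigned)w[2], (unsigned)w[1], (unsigned)w[0]);
}

int main(void)
{
	/* stand-in for ctx->hash[0..7] = A,B,C,D,E,F,G,H */
	uint32_t hash[8] = {
		0xaaaaaaaa, 0xbbbbbbbb, 0xcccccccc, 0xdddddddd, /* A B C D */
		0xeeeeeeee, 0xffffffff, 0x11111111, 0x22222222, /* E F G H */
	};
	__m128i dcba = _mm_loadu_si128((const __m128i *)&hash[0]); /* lanes 3..0 = D,C,B,A */
	__m128i hgfe = _mm_loadu_si128((const __m128i *)&hash[4]); /* lanes 3..0 = H,G,F,E */

	/* pshufd: every result lane comes from the single source operand */
	__m128i swapped = _mm_shuffle_epi32(dcba, SHUF(2, 3, 0, 1)); /* imm 0x4e -> B,A,D,C */

	/* shufps: low two lanes from the first argument (the "dst" register in
	 * AT&T syntax), high two lanes from the second one (the "src") */
	__m128 abef = _mm_shuffle_ps(_mm_castsi128_ps(hgfe),
	                             _mm_castsi128_ps(dcba), SHUF(1, 0, 1, 0));
	__m128 cdgh = _mm_shuffle_ps(_mm_castsi128_ps(hgfe),
	                             _mm_castsi128_ps(dcba), SHUF(3, 2, 3, 2));

	dump("DCBA", dcba);
	dump("HGFE", hgfe);
	dump("pshufd", swapped);
	dump("ABEF", _mm_castps_si128(abef)); /* layout expected in STATE0 */
	dump("CDGH", _mm_castps_si128(cdgh)); /* layout expected in STATE1 */
	return 0;
}

The SHUF(1,0,1,0) and SHUF(3,2,3,2) selections above are the same ones the 32-bit and 64-bit sha256 shaNI routines use to build ABEF/CDGH before the rounds and DCBA/HGFE after them; the pshufd line shows why a lone pshufd cannot combine two registers the way shufps does.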