Merge busybox into merge

Fix conflicts in reset and ash. Redefine the new safe_read_key() as a reference to read_key(). Disable SHA256_HWACCEL.
author: Ron Yorston <rmy@pobox.com> 2022-02-09 09:03:18 +0000
committer: Ron Yorston <rmy@pobox.com> 2022-02-09 09:05:39 +0000
commit: 492d0a7492a57fe8f02c766e25960b0ce0d88759 (patch)
tree: 4f5764a5c2250c031ea05e9aeacbb40d7971f493
parent: 4734416a21312488a5099a297907783bee4ccc22 (diff)
parent: caa9c4f707b661cf398f2c2d66f54f5b0d8adfe2 (diff)
download: busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.tar.gz
busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.tar.bz2
busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.zip
63 files changed, 1969 insertions, 836 deletions
diff --git a/archival/libarchive/decompress_bunzip2.c b/archival/libarchive/decompress_bunzip2.c
index 42e2b4f88..4a2b668aa 100644
--- a/archival/libarchive/decompress_bunzip2.c
+++ b/archival/libarchive/decompress_bunzip2.c
@@ -654,7 +654,7 @@ static int read_bunzip(bunzip_data *bd, char *outbuf, int len)
                                /* Subtract the 1 copy we'd output anyway to get extras */
                                --bd->writeCopies;
                        }
-                } /* for(;;) */
+                } /* for (;;) */
                /* Decompression of this input block completed successfully */
                bd->writeCRC = CRC = ~CRC;
diff --git a/archival/libarchive/get_header_tar.c b/archival/libarchive/get_header_tar.c
index d26868bf8..cc6f3f0ad 100644
--- a/archival/libarchive/get_header_tar.c
+++ b/archival/libarchive/get_header_tar.c
@@ -147,11 +147,13 @@ static void process_pax_hdr(archive_handle_t *archive_handle, unsigned sz, int g
 #endif
 }
+#if ENABLE_FEATURE_TAR_GNU_EXTENSIONS
 /* Sanity check for a size value (judging by the function name, a file name
  * length) before it is used further.
  * sz is cast to uoff_t for the comparison (presumably the unsigned
  * counterpart of off_t - confirm), in which case a negative sz compares as
  * a huge value and is rejected too. Anything above 0xfff (4095) is fatal:
  * bb_simple_error_msg_and_die() is called with "bad archive".
  */
 static void die_if_bad_fnamesize(off_t sz)
 {
        if ((uoff_t)sz > 0xfff) /* more than 4k?! no funny business please */
                bb_simple_error_msg_and_die("bad archive");
 }
+#endif
 char FAST_FUNC get_header_tar(archive_handle_t *archive_handle)
 {
diff --git a/busybox_ldscript.README.txt b/busybox_ldscript.README.txt
new file mode 100644
index 000000000..1625a970a
--- /dev/null
+++ b/busybox_ldscript.README.txt
@@ -0,0 +1,47 @@
+/* Add SORT_BY_ALIGNMENT to linker script (found in busybox_unstripped.out):
+##  .rodata : { *(.rodata SORT_BY_ALIGNMENT(.rodata.*) .gnu.linkonce.r.*) }
+##  .data   : { *(.data SORT_BY_ALIGNMENT(.data.*) .gnu.linkonce.d.*) }
+##  .bss    : { *(.bss SORT_BY_ALIGNMENT(.bss.*) .gnu.linkonce.b.*) }
+## This will eliminate most of the padding (~3kb).
+## Hmm, "ld --sort-section alignment" should do it too.
+##
+## There is a ld hack which is meant to decrease disk usage
+## at the cost of more RAM usage (??!!) in standard ld script:
+##  . = ALIGN (0x1000) - ((0x1000 - .) & (0x1000 - 1)); . = DATA_SEGMENT_ALIGN (0x1000, 0x1000);
+## Replace it with:
+##  . = ALIGN (0x1000); . = DATA_SEGMENT_ALIGN (0x1000, 0x1000);
+## to unconditionally align .data to the next page boundary,
+## instead of "next page, plus current offset in this page"
+*/
+/* To reduce the number of VMAs each bbox process has,
+## move *(.bss SORT_BY_ALIGNMENT(.bss.*) ...)
+## part from .bss : {...} block to .data : { ... } block.
+## (This usually increases .data section by only one page).
+## Result:
+##
+##    text data  bss     dec    hex filename
+## 1050792  560 7580 1058932 102874 busybox.bss
+## 1050792 8149    0 1058941 10287d busybox.nobss
+##
+## $ exec busybox.bss pmap $$
+## 0000000008048000    1028K r-xp  /path/to/busybox.bss
+## 0000000008149000       8K rw-p  /path/to/busybox.bss
+## 000000000814b000       4K rw-p    [ anon ]  <---- this VMA is eliminated
+## 00000000085f5000       4K ---p  [heap]
+## 00000000085f6000       4K rw-p  [heap]
+## 00000000f7778000       8K rw-p    [ anon ]
+## 00000000f777a000      12K r--p  [vvar]
+## 00000000f777d000       8K r-xp  [vdso]
+## 00000000ff7e9000     132K rw-p  [stack]
+##
+## $ exec busybox.nobss pmap $$
+## 0000000008048000    1028K r-xp  /path/to/busybox.nobss
+## 0000000008149000      12K rw-p  /path/to/busybox.nobss
+## 00000000086f0000       4K ---p  [heap]
+## 00000000086f1000       4K rw-p  [heap]
+## 00000000f7783000       8K rw-p    [ anon ]
+## 00000000f7785000      12K r--p  [vvar]
+## 00000000f7788000       8K r-xp  [vdso]
+## 00000000ffac0000     132K rw-p  [stack]
+*/
diff --git a/configs/mingw32_defconfig b/configs/mingw32_defconfig
index 408b13eb8..98288bfb2 100644
--- a/configs/mingw32_defconfig
+++ b/configs/mingw32_defconfig
@@ -114,6 +114,7 @@ CONFIG_PASSWORD_MINLEN=6
 CONFIG_MD5_SMALL=1
 CONFIG_SHA1_SMALL=3
 # CONFIG_SHA1_HWACCEL is not set
+# CONFIG_SHA256_HWACCEL is not set
 CONFIG_SHA3_SMALL=1
 # CONFIG_FEATURE_FAST_TOP is not set
 # CONFIG_FEATURE_ETC_NETWORKS is not set
diff --git a/configs/mingw64_defconfig b/configs/mingw64_defconfig
index 05596ab8e..1ce3831a9 100644
--- a/configs/mingw64_defconfig
+++ b/configs/mingw64_defconfig
@@ -114,6 +114,7 @@ CONFIG_PASSWORD_MINLEN=6
 CONFIG_MD5_SMALL=1
 CONFIG_SHA1_SMALL=3
 # CONFIG_SHA1_HWACCEL is not set
+# CONFIG_SHA256_HWACCEL is not set
 CONFIG_SHA3_SMALL=1
 # CONFIG_FEATURE_FAST_TOP is not set
 # CONFIG_FEATURE_ETC_NETWORKS is not set
diff --git a/console-tools/reset.c b/console-tools/reset.c
index e0d228d50..151bc47d1 100644
--- a/console-tools/reset.c
+++ b/console-tools/reset.c
@@ -40,7 +40,7 @@ int reset_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
 int reset_main(int argc UNUSED_PARAM, char **argv UNUSED_PARAM)
 {
 #if !ENABLE_PLATFORM_MINGW32
-        static const char *const args[] = {
+        static const char *const args[] ALIGN_PTR = {
                "stty", "sane", NULL
        };
diff --git a/coreutils/head.c b/coreutils/head.c
index 9586f869f..c7537a20e 100644
--- a/coreutils/head.c
+++ b/coreutils/head.c
@@ -76,7 +76,7 @@ print_except_N_last_bytes(FILE *fp, unsigned count)
 {
        unsigned char *circle = xmalloc(++count);
        unsigned head = 0;
-        for(;;) {
+        for (;;) {
                int c;
                c = getc(fp);
                if (c == EOF)
@@ -105,7 +105,7 @@ print_except_N_last_lines(FILE *fp, unsigned count)
 {
        char **circle = xzalloc((++count) * sizeof(circle[0]));
        unsigned head = 0;
-        for(;;) {
+        for (;;) {
                char *c;
                c = xmalloc_fgets(fp);
                if (!c)
@@ -127,7 +127,7 @@ print_except_N_last_lines(FILE *fp, unsigned count)
        }
 ret:
        head = 0;
-        for(;;) {
+        for (;;) {
                free(circle[head++]);
                if (head == count)
                        break;
diff --git a/coreutils/od.c b/coreutils/od.c
index 9a888dd5f..6f22331e0 100644
--- a/coreutils/od.c
+++ b/coreutils/od.c
@@ -144,7 +144,7 @@ odoffset(dumper_t *dumper, int argc, char ***argvp)
        }
 }
-static const char *const add_strings[] = {
+static const char *const add_strings[] ALIGN_PTR = {
        "16/1 \"%3_u \" \"\\n\"",              /* a */
        "8/2 \" %06o \" \"\\n\"",              /* B, o */
        "16/1 \"%03o \" \"\\n\"",              /* b */
diff --git a/coreutils/test.c b/coreutils/test.c
index a914c7490..840a0daaf 100644
--- a/coreutils/test.c
+++ b/coreutils/test.c
@@ -242,7 +242,7 @@ int depth;
        depth--; \
        return __res; \
 } while (0)
-static const char *const TOKSTR[] = {
+static const char *const TOKSTR[] ALIGN_PTR = {
        "EOI",
        "FILRD",
        "FILWR",
diff --git a/e2fsprogs/fsck.c b/e2fsprogs/fsck.c
index 96c1e51e0..028f8a803 100644
--- a/e2fsprogs/fsck.c
+++ b/e2fsprogs/fsck.c
@@ -190,7 +190,7 @@ struct globals {
 * Required for the uber-silly devfs /dev/ide/host1/bus2/target3/lun3
 * pathnames.
 */
-static const char *const devfs_hier[] = {
+static const char *const devfs_hier[] ALIGN_PTR = {
        "host", "bus", "target", "lun", NULL
 };
 #endif
diff --git a/editors/cmp.c b/editors/cmp.c
index 6d2b0c6c3..b89e519ad 100644
--- a/editors/cmp.c
+++ b/editors/cmp.c
@@ -54,6 +54,7 @@ int cmp_main(int argc UNUSED_PARAM, char **argv)
        int retval = 0;
        int max_count = -1;
+#if !ENABLE_LONG_OPTS
        opt = getopt32(argv, "^"
                        OPT_STR
                        "\0" "-1"
@@ -62,6 +63,23 @@ int cmp_main(int argc UNUSED_PARAM, char **argv)
                        ":l--s:s--l",
                        &max_count
        );
+#else
+        static const char cmp_longopts[] ALIGN1 =
+                "bytes\0"          Required_argument  "n"
+                "quiet\0"          No_argument        "s"
+                "silent\0"         No_argument        "s"
+                "verbose\0"        No_argument        "l"
+                ;
+        opt = getopt32long(argv, "^"
+                        OPT_STR
+                        "\0" "-1"
+                        IF_DESKTOP(":?4")
+                        IF_NOT_DESKTOP(":?2")
+                        ":l--s:s--l",
+                        cmp_longopts,
+                        &max_count
+        );
+#endif
        argv += optind;
        filename1 = *argv;
diff --git a/editors/patch.c b/editors/patch.c
index 110176630..aebb5073e 100644
--- a/editors/patch.c
+++ b/editors/patch.c
@@ -418,7 +418,7 @@ int patch_main(int argc UNUSED_PARAM, char **argv)
        }
        // Loop through the lines in the patch
-        for(;;) {
+        for (;;) {
                char *patchline;
                patchline = xmalloc_fgetline(stdin);
diff --git a/editors/patch_toybox.c b/editors/patch_toybox.c
index aebab8132..69a508b2e 100644
--- a/editors/patch_toybox.c
+++ b/editors/patch_toybox.c
@@ -441,7 +441,7 @@ int patch_main(int argc UNUSED_PARAM, char **argv)
        TT.filein = TT.fileout = -1;
        // Loop through the lines in the patch
-        for(;;) {
+        for (;;) {
                char *patchline;
                patchline = get_line(TT.filepatch);
diff --git a/editors/sed.c b/editors/sed.c
index 374830f3f..f4a5f7b8a 100644
--- a/editors/sed.c
+++ b/editors/sed.c
@@ -252,7 +252,6 @@ static void cleanup_outname(void)
 }
 /* strcpy, replacing "\from" with 'to'. If to is NUL, replacing "\any" with 'any' */
 static unsigned parse_escapes(char *dest, const char *string, int len, char from, char to)
 {
        char *d = dest;
@@ -282,7 +281,7 @@ static unsigned parse_escapes(char *dest, const char *string, int len, char from
        return d - dest;
 }
-static char *copy_parsing_escapes(const char *string, int len)
+static char *copy_parsing_escapes(const char *string, int len, char delim)
 {
        const char *s;
        char *dest = xmalloc(len + 1);
@@ -293,10 +292,15 @@ static char *copy_parsing_escapes(const char *string, int len)
                len = parse_escapes(dest, string, len, s[1], s[0]);
                string = dest;
        }
+        if (delim) {
+                /* we additionally unescape any instances of escaped delimiter.
+                 * For example, in 's+9\++X+' the pattern is "9+", not "9\+".
+                 */
+                len = parse_escapes(dest, string, len, delim, delim);
+        }
        return dest;
 }
 /*
 * index_of_next_unescaped_regexp_delim - walks left to right through a string
 * beginning at a specified index and returns the index of the next regular
@@ -353,12 +357,14 @@ static int parse_regex_delim(const char *cmdstr, char **match, char **replace)
        /* save the match string */
        idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr);
-        *match = copy_parsing_escapes(cmdstr_ptr, idx);
+        *match = copy_parsing_escapes(cmdstr_ptr, idx, delimiter);
        /* save the replacement string */
        cmdstr_ptr += idx + 1;
        idx = index_of_next_unescaped_regexp_delim(- (int)delimiter, cmdstr_ptr);
-        *replace = copy_parsing_escapes(cmdstr_ptr, idx);
+//GNU sed 4.8:
+// echo 789 | sed 's&8&\&&'       - 7&9  ("\&" remained "\&")
+// echo 789 | sed 's1\(8\)1\1\11' - 7119 ("\1\1" became "11")
+        *replace = copy_parsing_escapes(cmdstr_ptr, idx, delimiter != '&' ? delimiter : 0);
        return ((cmdstr_ptr - cmdstr) + idx);
 }
@@ -386,7 +392,7 @@ static int get_address(const char *my_str, int *linenum, regex_t ** regex)
                        delimiter = *++pos;
                next = index_of_next_unescaped_regexp_delim(delimiter, ++pos);
                if (next != 0) {
-                        temp = copy_parsing_escapes(pos, next);
+                        temp = copy_parsing_escapes(pos, next, 0);
                        G.previous_regex_ptr = *regex = xzalloc(sizeof(regex_t));
                        xregcomp(*regex, temp, G.regex_type);
                        free(temp);
@@ -581,7 +587,7 @@ static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr)
                        cmdstr++;
                }
                len = strlen(cmdstr);
-                sed_cmd->string = copy_parsing_escapes(cmdstr, len);
+                sed_cmd->string = copy_parsing_escapes(cmdstr, len, 0);
                cmdstr += len;
                /* "\anychar" -> "anychar" */
                parse_escapes(sed_cmd->string, sed_cmd->string, -1, '\0', '\0');
diff --git a/editors/vi.c b/editors/vi.c
index b973cc056..b30369302 100644
--- a/editors/vi.c
+++ b/editors/vi.c
@@ -1182,7 +1182,7 @@ static int readit(void) // read (maybe cursor) key from stdin
        // on nonblocking stdin.
        // Note: read_key sets errno to 0 on success.
 again:
-        c = read_key(STDIN_FILENO, readbuffer, /*timeout:*/ -1);
+        c = safe_read_key(STDIN_FILENO, readbuffer, /*timeout:*/ -1);
        if (c == -1) { // EOF/error
                if (errno == EAGAIN) // paranoia
                        goto again;
@@ -4930,7 +4930,7 @@ static void edit_file(char *fn)
                uint64_t k;
                write1(ESC"[999;999H" ESC"[6n");
                fflush_all();
-                k = read_key(STDIN_FILENO, readbuffer, /*timeout_ms:*/ 100);
+                k = safe_read_key(STDIN_FILENO, readbuffer, /*timeout_ms:*/ 100);
                if ((int32_t)k == KEYCODE_CURSOR_POS) {
                        uint32_t rc = (k >> 32);
                        columns = (rc & 0x7fff);
diff --git a/include/libbb.h b/include/libbb.h
index e540f2a90..740c25528 100644
--- a/include/libbb.h
+++ b/include/libbb.h
@@ -691,6 +691,7 @@ void xsetgid(gid_t gid) FAST_FUNC;
 void xsetuid(uid_t uid) FAST_FUNC;
 void xsetegid(gid_t egid) FAST_FUNC;
 void xseteuid(uid_t euid) FAST_FUNC;
+int chdir_or_warn(const char *path) FAST_FUNC;
 void xchdir(const char *path) FAST_FUNC;
 void xfchdir(int fd) FAST_FUNC;
 void xchroot(const char *path) FAST_FUNC;
@@ -1776,7 +1777,7 @@ extern void selinux_or_die(void) FAST_FUNC;
 /* setup_environment:
- * if !SETUP_ENV_NO_CHDIR:
+ * if SETUP_ENV_CHDIR:
 *   if cd(pw->pw_dir): ok: else if SETUP_ENV_TO_TMP: cd(/tmp) else: cd(/) or die
 * if SETUP_ENV_CLEARENV: cd(pw->pw_dir), clear environment, then set
 *   TERM=(old value)
@@ -1784,7 +1785,7 @@ extern void selinux_or_die(void) FAST_FUNC;
 *   PATH=bb_default_[root_]path
 *   HOME=pw->pw_dir
 *   SHELL=shell
- * else if SETUP_ENV_CHANGEENV:
+ * else if SETUP_ENV_CHANGEENV | SETUP_ENV_CHANGEENV_LOGNAME:
 *   if not root (if pw->pw_uid != 0) or if SETUP_ENV_CHANGEENV_LOGNAME:
 *     USER=pw->pw_name, LOGNAME=pw->pw_name
 *   HOME=pw->pw_dir
@@ -1798,7 +1799,7 @@ extern void selinux_or_die(void) FAST_FUNC;
 #define SETUP_ENV_CHANGEENV_LOGNAME (1 << 1)
 #define SETUP_ENV_CLEARENV          (1 << 2)
 #define SETUP_ENV_TO_TMP            (1 << 3)
-#define SETUP_ENV_NO_CHDIR          (1 << 4)
+#define SETUP_ENV_CHDIR             (1 << 4)
 void setup_environment(const char *shell, int flags, const struct passwd *pw) FAST_FUNC;
 void nuke_str(char *str) FAST_FUNC;
 #if ENABLE_FEATURE_SECURETTY && !ENABLE_PAM
@@ -1955,6 +1956,8 @@ enum {
 * (unless fd is in non-blocking mode),
 * subsequent reads will time out after a few milliseconds.
 * Return of -1 means EOF or error (errno == 0 on EOF).
+ * Nonzero errno is not preserved across the call:
+ * if there was no error, errno will be cleared to 0.
 * buffer[0] is used as a counter of buffered chars and must be 0
 * on first call.
 * timeout:
@@ -1963,6 +1966,12 @@ enum {
 * >=0: poll() for TIMEOUT milliseconds, return -1/EAGAIN on timeout
 */
 int64_t read_key(int fd, char *buffer, int timeout) FAST_FUNC;
+#if ENABLE_PLATFORM_MINGW32
+#define safe_read_key(f, b, t) read_key(f, b, t)
+#else
+/* This version loops on EINTR: */
+int64_t safe_read_key(int fd, char *buffer, int timeout) FAST_FUNC;
+#endif
 void read_key_ungets(char *buffer, const char *str, unsigned len) FAST_FUNC;
@@ -2016,7 +2025,8 @@ enum {
        USERNAME_COMPLETION = 4 * ENABLE_FEATURE_USERNAME_COMPLETION,
        VI_MODE          = 8 * ENABLE_FEATURE_EDITING_VI,
        WITH_PATH_LOOKUP = 0x10,
-        FOR_SHELL        = DO_HISTORY | TAB_COMPLETION | USERNAME_COMPLETION,
+        LI_INTERRUPTIBLE = 0x20,
+        FOR_SHELL        = DO_HISTORY | TAB_COMPLETION | USERNAME_COMPLETION | LI_INTERRUPTIBLE,
 };
 line_input_t *new_line_input_t(int flags) FAST_FUNC;
 #if ENABLE_FEATURE_EDITING_SAVEHISTORY
@@ -2361,7 +2371,7 @@ struct globals;
 /* '*const' ptr makes gcc optimize code much better.
 * Magic prevents ptr_to_globals from going into rodata.
 * If you want to assign a value, use SET_PTR_TO_GLOBALS(x) */
-extern struct globals *const ptr_to_globals;
+extern struct globals *BB_GLOBAL_CONST ptr_to_globals;
 #define barrier() asm volatile ("":::"memory")
diff --git a/include/platform.h b/include/platform.h
index 3fb1a2dc8..8ae5ed4bc 100644
--- a/include/platform.h
+++ b/include/platform.h
@@ -367,6 +367,7 @@ typedef unsigned smalluint;
 # define ALIGN4
 #endif
 #define ALIGN8     __attribute__((aligned(8)))
+#define ALIGN_INT  __attribute__((aligned(sizeof(int))))
 #define ALIGN_PTR  __attribute__((aligned(sizeof(void*))))
 /*
diff --git a/libbb/Config.src b/libbb/Config.src
index 708d3b0c8..0ecd5bd46 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -70,6 +70,12 @@ config SHA1_HWACCEL
        On x86, this adds ~590 bytes of code. Throughput
        is about twice as fast as fully-unrolled generic code.
+config SHA256_HWACCEL
+        bool "SHA256: Use hardware accelerated instructions if possible"
+        default y
+        help
+        On x86, this adds ~1k bytes of code.
 config SHA3_SMALL
        int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
        default 1  # all "fast or small" options default to small
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src
index 67d3c7cf7..191984c9d 100644
--- a/libbb/Kbuild.src
+++ b/libbb/Kbuild.src
@@ -48,6 +48,8 @@ lib-y += hash_md5_sha.o
 lib-y += hash_md5_sha_x86-64.o
 lib-y += hash_md5_sha_x86-64_shaNI.o
 lib-y += hash_md5_sha_x86-32_shaNI.o
+lib-y += hash_md5_sha256_x86-64_shaNI.o
+lib-y += hash_md5_sha256_x86-32_shaNI.o
 # Alternative (disabled) MD5 implementation
 #lib-y += hash_md5prime.o
 lib-y += messages.o
@@ -204,6 +206,7 @@ lib-$(CONFIG_PGREP) += xregcomp.o
 lib-$(CONFIG_PKILL) += xregcomp.o
 lib-$(CONFIG_DEVFSD) += xregcomp.o
 lib-$(CONFIG_FEATURE_FIND_REGEX) += xregcomp.o
+lib-$(CONFIG_FEATURE_CUT_REGEX) += xregcomp.o
 # Add the experimental logging functionality, only used by zcip
 lib-$(CONFIG_ZCIP) += logenv.o
diff --git a/libbb/appletlib.c b/libbb/appletlib.c
index 6c0be4a83..a8b82e729 100644
--- a/libbb/appletlib.c
+++ b/libbb/appletlib.c
@@ -671,7 +671,7 @@ static void check_suid(int applet_no)
 # if ENABLE_FEATURE_INSTALLER
 static const char usr_bin [] ALIGN1 = "/usr/bin/";
 static const char usr_sbin[] ALIGN1 = "/usr/sbin/";
-static const char *const install_dir[] = {
+static const char *const install_dir[] ALIGN_PTR = {
        &usr_bin [8], /* "/" */
        &usr_bin [4], /* "/bin/" */
        &usr_sbin[4]  /* "/sbin/" */
diff --git a/libbb/get_console.c b/libbb/get_console.c
index 7f2c75332..9044efea1 100644
--- a/libbb/get_console.c
+++ b/libbb/get_console.c
@@ -37,7 +37,7 @@ static int open_a_console(const char *fnam)
 */
 int FAST_FUNC get_console_fd_or_die(void)
 {
-        static const char *const console_names[] = {
+        static const char *const console_names[] ALIGN_PTR = {
                DEV_CONSOLE, CURRENT_VC, CURRENT_TTY
        };
diff --git a/libbb/getopt32.c b/libbb/getopt32.c
index 5ab4d66f1..e861d0567 100644
--- a/libbb/getopt32.c
+++ b/libbb/getopt32.c
@@ -296,7 +296,7 @@ Special characters:
 /* Code here assumes that 'unsigned' is at least 32 bits wide */
-const char *const bb_argv_dash[] = { "-", NULL };
+const char *const bb_argv_dash[] ALIGN_PTR = { "-", NULL };
 enum {
        PARAM_STRING,
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index a23db5152..880ffab01 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -13,6 +13,27 @@
 #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
+#if ENABLE_SHA1_HWACCEL || ENABLE_SHA256_HWACCEL
+# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
+{       /* Run the CPUID insn: the four pointed-to values are loaded into a/b/c/d registers and overwritten with the results */
+        asm ("cpuid"
+                : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) /* outputs: registers a, b, c, d */
+                : "0"(*eax),  "1"(*ebx),  "2"(*ecx),  "3"(*edx)  /* inputs: matching constraints, same locations as outputs 0..3 */
+        );
+}
+static smallint shaNI;
+void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
+void FAST_FUNC sha256_process_block64_shaNI(sha256_ctx_t *ctx);
+#  if defined(__i386__)
+struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 76)]; };
+#  endif
+#  if defined(__x86_64__)
+struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 80)]; };
+#  endif
+# endif
+#endif
 /* gcc 4.2.1 optimizes rotr64 better with inline than with macro
 * (for rotX32, there is no difference). Why? My guess is that
 * macro requires clever common subexpression elimination heuristics
@@ -1142,25 +1163,6 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
 }
 #endif /* NEED_SHA512 */
-#if ENABLE_SHA1_HWACCEL
-# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
-{
-        asm ("cpuid"
-                : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
-                : "0"(*eax),  "1"(*ebx),  "2"(*ecx),  "3"(*edx)
-        );
-}
-void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
-#  if defined(__i386__)
-struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; };
-#  endif
-#  if defined(__x86_64__)
-struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
-#  endif
-# endif
-#endif
 void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 {
        ctx->hash[0] = 0x67452301;
@@ -1173,7 +1175,6 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 #if ENABLE_SHA1_HWACCEL
 # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
        {
-                static smallint shaNI;
                if (!shaNI) {
                        unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
                        cpuid(&eax, &ebx, &ecx, &edx);
@@ -1225,6 +1226,19 @@ void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
        memcpy(&ctx->total64, init256, sizeof(init256));
        /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
        ctx->process_block = sha256_process_block64;
+#if ENABLE_SHA256_HWACCEL
+# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+        {
+                /* shaNI caches the probe result: 0 = not probed yet,
+                 * positive = SHA insns usable, negative = not usable.
+                 */
+                if (!shaNI) {
+                        unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
+                        cpuid(&eax, &ebx, &ecx, &edx);
+                        /* CPUID.(EAX=7,ECX=0):EBX bit 29 is the SHA flag.
+                         * Isolate that single bit: a plain ">> 29" also lets
+                         * bits 30,31 (AVX512BW, AVX512VL) through, which would
+                         * yield a positive value on CPUs without SHA insns.
+                         */
+                        shaNI = ((ebx >> 28) & 2) - 1; /* bit 29 set: 1, clear: -1 */
+                }
+                if (shaNI > 0)
+                        ctx->process_block = sha256_process_block64_shaNI;
+        }
+# endif
+#endif
 }
 #if NEED_SHA512
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S
new file mode 100644
index 000000000..aa68193bd
--- /dev/null
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@@ -0,0 +1,277 @@
+#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
+/* The code is adapted from Linux kernel's source */
+// We use shorter insns, even though they are for "wrong"
+// data type (fp, not int).
+// For Intel, there is no penalty for doing it at all
+// (CPUs which do have such penalty do not support SHA1 insns).
+// For AMD, the penalty is one extra cycle
+// (allegedly: I failed to find measurable difference).
+//#define mova128 movdqa
+#define mova128 movaps
+//#define movu128 movdqu
+#define movu128 movups
+//#define shuf128_32 pshufd
+#define shuf128_32 shufps
+        .section        .text.sha256_process_block64_shaNI, "ax", @progbits
+        .globl  sha256_process_block64_shaNI
+        .hidden sha256_process_block64_shaNI
+        .type   sha256_process_block64_shaNI, @function
+#define DATA_PTR        %eax
+#define SHA256CONSTANTS %ecx
+#define MSG             %xmm0
+#define STATE0          %xmm1
+#define STATE1          %xmm2
+#define MSGTMP0         %xmm3
+#define MSGTMP1         %xmm4
+#define MSGTMP2         %xmm5
+#define MSGTMP3         %xmm6
+#define XMMTMP          %xmm7
+#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))
+        .balign 8       # allow decoders to fetch at least 2 first insns
+sha256_process_block64_shaNI:
+        movu128         76+0*16(%eax), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */
+        movu128         76+1*16(%eax), STATE1 /* HGFE */
+/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
+        mova128         STATE1, STATE0
+        shufps          SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */
+        shufps          SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */
+/* XMMTMP holds flip mask from here... */
+        mova128         PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
+        movl            $K256+8*16, SHA256CONSTANTS
+        /* Rounds 0-3 */
+        movu128         0*16(DATA_PTR), MSG
+        pshufb          XMMTMP, MSG
+        mova128         MSG, MSGTMP0
+                paddd           0*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        /* Rounds 4-7 */
+        movu128         1*16(DATA_PTR), MSG
+        pshufb          XMMTMP, MSG
+        mova128         MSG, MSGTMP1
+                paddd           1*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP1, MSGTMP0
+        /* Rounds 8-11 */
+        movu128         2*16(DATA_PTR), MSG
+        pshufb          XMMTMP, MSG
+        mova128         MSG, MSGTMP2
+                paddd           2*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP2, MSGTMP1
+        /* Rounds 12-15 */
+        movu128         3*16(DATA_PTR), MSG
+        pshufb          XMMTMP, MSG
+/* ...to here */
+        mova128         MSG, MSGTMP3
+                paddd           3*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP3, XMMTMP
+        palignr         $4, MSGTMP2, XMMTMP
+        paddd           XMMTMP, MSGTMP0
+        sha256msg2      MSGTMP3, MSGTMP0
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP3, MSGTMP2
+        /* Rounds 16-19 */
+        mova128         MSGTMP0, MSG
+                paddd           4*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP0, XMMTMP
+        palignr         $4, MSGTMP3, XMMTMP
+        paddd           XMMTMP, MSGTMP1
+        sha256msg2      MSGTMP0, MSGTMP1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP0, MSGTMP3
+        /* Rounds 20-23 */
+        mova128         MSGTMP1, MSG
+                paddd           5*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP1, XMMTMP
+        palignr         $4, MSGTMP0, XMMTMP
+        paddd           XMMTMP, MSGTMP2
+        sha256msg2      MSGTMP1, MSGTMP2
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP1, MSGTMP0
+        /* Rounds 24-27 */
+        mova128         MSGTMP2, MSG
+                paddd           6*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP2, XMMTMP
+        palignr         $4, MSGTMP1, XMMTMP
+        paddd           XMMTMP, MSGTMP3
+        sha256msg2      MSGTMP2, MSGTMP3
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP2, MSGTMP1
+        /* Rounds 28-31 */
+        mova128         MSGTMP3, MSG
+                paddd           7*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP3, XMMTMP
+        palignr         $4, MSGTMP2, XMMTMP
+        paddd           XMMTMP, MSGTMP0
+        sha256msg2      MSGTMP3, MSGTMP0
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP3, MSGTMP2
+        /* Rounds 32-35 */
+        mova128         MSGTMP0, MSG
+                paddd           8*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP0, XMMTMP
+        palignr         $4, MSGTMP3, XMMTMP
+        paddd           XMMTMP, MSGTMP1
+        sha256msg2      MSGTMP0, MSGTMP1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP0, MSGTMP3
+        /* Rounds 36-39 */
+        mova128         MSGTMP1, MSG
+                paddd           9*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP1, XMMTMP
+        palignr         $4, MSGTMP0, XMMTMP
+        paddd           XMMTMP, MSGTMP2
+        sha256msg2      MSGTMP1, MSGTMP2
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP1, MSGTMP0
+        /* Rounds 40-43 */
+        mova128         MSGTMP2, MSG
+                paddd           10*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP2, XMMTMP
+        palignr         $4, MSGTMP1, XMMTMP
+        paddd           XMMTMP, MSGTMP3
+        sha256msg2      MSGTMP2, MSGTMP3
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP2, MSGTMP1
+        /* Rounds 44-47 */
+        mova128         MSGTMP3, MSG
+                paddd           11*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP3, XMMTMP
+        palignr         $4, MSGTMP2, XMMTMP
+        paddd           XMMTMP, MSGTMP0
+        sha256msg2      MSGTMP3, MSGTMP0
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP3, MSGTMP2
+        /* Rounds 48-51 */
+        mova128         MSGTMP0, MSG
+                paddd           12*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP0, XMMTMP
+        palignr         $4, MSGTMP3, XMMTMP
+        paddd           XMMTMP, MSGTMP1
+        sha256msg2      MSGTMP0, MSGTMP1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP0, MSGTMP3
+        /* Rounds 52-55 */
+        mova128         MSGTMP1, MSG
+                paddd           13*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP1, XMMTMP
+        palignr         $4, MSGTMP0, XMMTMP
+        paddd           XMMTMP, MSGTMP2
+        sha256msg2      MSGTMP1, MSGTMP2
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        /* Rounds 56-59 */
+        mova128         MSGTMP2, MSG
+                paddd           14*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP2, XMMTMP
+        palignr         $4, MSGTMP1, XMMTMP
+        paddd           XMMTMP, MSGTMP3
+        sha256msg2      MSGTMP2, MSGTMP3
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        /* Rounds 60-63 */
+        mova128         MSGTMP3, MSG
+                paddd           15*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        /* Write hash values back in the correct order */
+        /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */
+        /* STATE1: CDGH */
+        mova128         STATE0, XMMTMP
+/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
+        shufps          SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */
+        shufps          SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */
+        /* add current hash values to previous ones */
+        movu128         76+1*16(%eax), STATE1
+        paddd           XMMTMP, STATE1
+        movu128         STATE1, 76+1*16(%eax)
+        movu128         76+0*16(%eax), XMMTMP
+        paddd           XMMTMP, STATE0
+        movu128         STATE0, 76+0*16(%eax)
+        ret
+        .size   sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
+        .section        .rodata.cst256.K256, "aM", @progbits, 256
+        .balign 16
+K256:
+        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+        .section        .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
+        .balign 16
+PSHUFFLE_BSWAP32_FLIP_MASK:
+        .octa 0x0c0d0e0f08090a0b0405060700010203
+#endif
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S
new file mode 100644
index 000000000..4663f750a
--- /dev/null
+++ b/libbb/hash_md5_sha256_x86-64_shaNI.S
@@ -0,0 +1,284 @@
+#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
+/* The code is adapted from Linux kernel's source */
+// We use shorter insns, even though they are for "wrong"
+// data type (fp, not int).
+// For Intel, there is no penalty for doing it at all
+// (CPUs which do have such penalty do not support SHA insns).
+// For AMD, the penalty is one extra cycle
+// (allegedly: I failed to find measurable difference).
+//#define mova128 movdqa
+#define mova128 movaps
+//#define movu128 movdqu
+#define movu128 movups
+//#define shuf128_32 pshufd
+#define shuf128_32 shufps
+        .section        .text.sha256_process_block64_shaNI, "ax", @progbits
+        .globl  sha256_process_block64_shaNI
+        .hidden sha256_process_block64_shaNI
+        .type   sha256_process_block64_shaNI, @function
+/* Register aliases for sha256_process_block64_shaNI */
+/* DATA_PTR: base for the four 16-byte message loads (N*16(DATA_PTR)) */
+#define DATA_PTR        %rdi
+/* SHA256CONSTANTS: set to K256+8*16, rows are addressed as N*16-8*16 */
+#define SHA256CONSTANTS %rax
+/* MSG is %xmm0, the implicit third operand of sha256rnds2 */
+#define MSG             %xmm0
+/* STATE0/STATE1: working hash state, kept as ABEF and CDGH */
+#define STATE0          %xmm1
+#define STATE1          %xmm2
+/* MSGTMP0..3: sliding window of 16 message-schedule words, 4 per register */
+#define MSGTMP0         %xmm3
+#define MSGTMP1         %xmm4
+#define MSGTMP2         %xmm5
+#define MSGTMP3         %xmm6
+/* XMMTMP: byte-swap mask during rounds 0-15, scratch otherwise */
+#define XMMTMP          %xmm7
+/* Copies of the incoming ABEF/CDGH, added back after round 63 */
+#define ABEF_SAVE       %xmm9
+#define CDGH_SAVE       %xmm10
+/* SHUF(a,b,c,d): shufps immediate; a,b,c,d select the source dword */
+/* for result dwords 0,1,2,3 (two bits each, a in the lowest bits) */
+#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))
+/*
+ * sha256_process_block64_shaNI: run the 64 SHA-256 rounds over one
+ * 64-byte block using the SHA extension instructions
+ * (sha256rnds2, sha256msg1, sha256msg2).
+ *
+ * In:  %rdi - base pointer (presumably the hash context - confirm
+ *      against the C caller). The message block is read from offsets
+ *      0..63 and the eight 32-bit hash words from offsets 80..111.
+ * Out: the hash words at 80..111(%rdi) are replaced by old + new state.
+ *
+ * Writes %rax, %xmm0..%xmm7, %xmm9 and %xmm10. No register is saved or
+ * restored and the stack is not used.
+ */
+        .balign 8       # allow decoders to fetch at least 2 first insns
+sha256_process_block64_shaNI:
+        movu128         80+0*16(%rdi), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */
+        movu128         80+1*16(%rdi), STATE1 /* HGFE */
+/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
+        mova128         STATE1, STATE0
+        shufps          SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */
+        shufps          SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */
+/* XMMTMP holds flip mask from here... */
+        mova128         PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP
+        /* Point at row 8 of K256 so that the 16 rows are reached with */
+        /* displacements -128..+112 (presumably to keep every */
+        /* displacement within a signed byte) */
+        leaq            K256+8*16(%rip), SHA256CONSTANTS
+        /* Save hash values for addition after rounds */
+        mova128         STATE0, ABEF_SAVE
+        mova128         STATE1, CDGH_SAVE
+        /* Rounds 0-3 */
+        /* Load 16 message bytes, byte-swap each dword, keep a copy */
+        /* in MSGTMP0 for the message schedule */
+        movu128         0*16(DATA_PTR), MSG
+        pshufb          XMMTMP, MSG
+        mova128         MSG, MSGTMP0
+                /* MSG = W[0..3] + K[0..3]. Each sha256rnds2 performs two */
+                /* rounds using the two low dwords of MSG; the shuffle in */
+                /* between moves the two high dwords of MSG down */
+                paddd           0*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        /* Rounds 4-7 */
+        movu128         1*16(DATA_PTR), MSG
+        pshufb          XMMTMP, MSG
+        mova128         MSG, MSGTMP1
+                paddd           1*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        /* First half of the schedule for W[16..19]; finished in rounds 12-15 */
+        sha256msg1      MSGTMP1, MSGTMP0
+        /* Rounds 8-11 */
+        movu128         2*16(DATA_PTR), MSG
+        pshufb          XMMTMP, MSG
+        mova128         MSG, MSGTMP2
+                paddd           2*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP2, MSGTMP1
+        /* Rounds 12-15 */
+        movu128         3*16(DATA_PTR), MSG
+        pshufb          XMMTMP, MSG
+/* ...to here */
+        mova128         MSG, MSGTMP3
+                paddd           3*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        /* XMMTMP = W[9..12]: MSGTMP3:MSGTMP2 shifted right by one dword. */
+        /* Add it to MSGTMP0, then sha256msg2 completes W[16..19] there */
+        mova128         MSGTMP3, XMMTMP
+        palignr         $4, MSGTMP2, XMMTMP
+        paddd           XMMTMP, MSGTMP0
+        sha256msg2      MSGTMP3, MSGTMP0
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP3, MSGTMP2
+        /* Rounds 16-51 repeat the same pattern, with the four MSGTMP */
+        /* registers rotating roles every four rounds */
+        /* Rounds 16-19 */
+        mova128         MSGTMP0, MSG
+                paddd           4*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP0, XMMTMP
+        palignr         $4, MSGTMP3, XMMTMP
+        paddd           XMMTMP, MSGTMP1
+        sha256msg2      MSGTMP0, MSGTMP1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP0, MSGTMP3
+        /* Rounds 20-23 */
+        mova128         MSGTMP1, MSG
+                paddd           5*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP1, XMMTMP
+        palignr         $4, MSGTMP0, XMMTMP
+        paddd           XMMTMP, MSGTMP2
+        sha256msg2      MSGTMP1, MSGTMP2
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP1, MSGTMP0
+        /* Rounds 24-27 */
+        mova128         MSGTMP2, MSG
+                paddd           6*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP2, XMMTMP
+        palignr         $4, MSGTMP1, XMMTMP
+        paddd           XMMTMP, MSGTMP3
+        sha256msg2      MSGTMP2, MSGTMP3
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP2, MSGTMP1
+        /* Rounds 28-31 */
+        mova128         MSGTMP3, MSG
+                paddd           7*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP3, XMMTMP
+        palignr         $4, MSGTMP2, XMMTMP
+        paddd           XMMTMP, MSGTMP0
+        sha256msg2      MSGTMP3, MSGTMP0
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP3, MSGTMP2
+        /* Rounds 32-35 */
+        mova128         MSGTMP0, MSG
+                paddd           8*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP0, XMMTMP
+        palignr         $4, MSGTMP3, XMMTMP
+        paddd           XMMTMP, MSGTMP1
+        sha256msg2      MSGTMP0, MSGTMP1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP0, MSGTMP3
+        /* Rounds 36-39 */
+        mova128         MSGTMP1, MSG
+                paddd           9*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP1, XMMTMP
+        palignr         $4, MSGTMP0, XMMTMP
+        paddd           XMMTMP, MSGTMP2
+        sha256msg2      MSGTMP1, MSGTMP2
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP1, MSGTMP0
+        /* Rounds 40-43 */
+        mova128         MSGTMP2, MSG
+                paddd           10*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP2, XMMTMP
+        palignr         $4, MSGTMP1, XMMTMP
+        paddd           XMMTMP, MSGTMP3
+        sha256msg2      MSGTMP2, MSGTMP3
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP2, MSGTMP1
+        /* Rounds 44-47 */
+        mova128         MSGTMP3, MSG
+                paddd           11*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP3, XMMTMP
+        palignr         $4, MSGTMP2, XMMTMP
+        paddd           XMMTMP, MSGTMP0
+        sha256msg2      MSGTMP3, MSGTMP0
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP3, MSGTMP2
+        /* Rounds 48-51 */
+        mova128         MSGTMP0, MSG
+                paddd           12*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP0, XMMTMP
+        palignr         $4, MSGTMP3, XMMTMP
+        paddd           XMMTMP, MSGTMP1
+        sha256msg2      MSGTMP0, MSGTMP1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        sha256msg1      MSGTMP0, MSGTMP3
+        /* Rounds 52-55 */
+        /* From here on there is no trailing sha256msg1: it would only */
+        /* start schedule words past W[63], which no round consumes */
+        mova128         MSGTMP1, MSG
+                paddd           13*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP1, XMMTMP
+        palignr         $4, MSGTMP0, XMMTMP
+        paddd           XMMTMP, MSGTMP2
+        sha256msg2      MSGTMP1, MSGTMP2
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        /* Rounds 56-59 */
+        mova128         MSGTMP2, MSG
+                paddd           14*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+        mova128         MSGTMP2, XMMTMP
+        palignr         $4, MSGTMP1, XMMTMP
+        paddd           XMMTMP, MSGTMP3
+        sha256msg2      MSGTMP2, MSGTMP3
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        /* Rounds 60-63 */
+        /* Last four rounds: W[60..63] is already in MSGTMP3 */
+        mova128         MSGTMP3, MSG
+                paddd           15*16-8*16(SHA256CONSTANTS), MSG
+                sha256rnds2     STATE0, STATE1
+                shuf128_32      $0x0E, MSG, MSG
+                sha256rnds2     STATE1, STATE0
+        /* Add current hash values with previously saved */
+        paddd           ABEF_SAVE, STATE0
+        paddd           CDGH_SAVE, STATE1
+        /* Write hash values back in the correct order */
+        /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */
+        /* STATE1: CDGH */
+        mova128         STATE0, XMMTMP
+/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
+        shufps          SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */
+        shufps          SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */
+        /* Store both halves back to the offsets they were loaded from */
+        movu128         STATE0, 80+0*16(%rdi)
+        movu128         XMMTMP, 80+1*16(%rdi)
+        ret
+        .size   sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
+/*
+ * K256: the 64 SHA-256 round constants K[0..63] (values as listed in
+ * FIPS 180-4), four 32-bit words per row, 16 rows = 256 bytes.
+ * sha256_process_block64_shaNI adds one row per four rounds using paddd
+ * with a memory operand, which needs the 16-byte alignment set here.
+ * The section is mergeable ("aM") with entity size 256.
+ */
+        .section        .rodata.cst256.K256, "aM", @progbits, 256
+        .balign 16
+K256:
+        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+/*
+ * pshufb control mask loaded into XMMTMP for rounds 0-15. Read lsb-first
+ * the byte indices are 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12, so it
+ * reverses the bytes inside each of the four 32-bit message dwords.
+ * Loaded with mova128, hence the 16-byte alignment.
+ */
+        .section        .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
+        .balign 16
+PSHUFFLE_BSWAP32_FLIP_MASK:
+        .octa 0x0c0d0e0f08090a0b0405060700010203
+#endif
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S
index 166cfd38a..a61b3cbed 100644
--- a/libbb/hash_md5_sha_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha_x86-32_shaNI.S
@@ -20,7 +20,7 @@
 #define extr128_32 pextrd
 //#define extr128_32 extractps  # not shorter
-        .section        .text.sha1_process_block64_shaNI,"ax",@progbits
+        .section        .text.sha1_process_block64_shaNI, "ax", @progbits
        .globl  sha1_process_block64_shaNI
        .hidden sha1_process_block64_shaNI
        .type   sha1_process_block64_shaNI, @function
@@ -32,45 +32,42 @@
 #define MSG1            %xmm4
 #define MSG2            %xmm5
 #define MSG3            %xmm6
-#define SHUF_MASK       %xmm7
-        .balign 8       # allow decoders to fetch at least 3 first insns
+        .balign 8       # allow decoders to fetch at least 2 first insns
 sha1_process_block64_shaNI:
-        pushl           %ebp
-        movl            %esp, %ebp
-        subl            $32, %esp
-        andl            $~0xF, %esp     # paddd needs aligned memory operand
        /* load initial hash values */
-        xor128          E0, E0
        movu128         76(%eax), ABCD
+        xor128          E0, E0
        pinsrd          $3, 76+4*4(%eax), E0    # load to uppermost 32-bit word
        shuf128_32      $0x1B, ABCD, ABCD       # DCBA -> ABCD
-        mova128         PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK
+        mova128         PSHUFFLE_BYTE_FLIP_MASK, %xmm7
+        movu128         0*16(%eax), MSG0
+        pshufb          %xmm7, MSG0
+        movu128         1*16(%eax), MSG1
+        pshufb          %xmm7, MSG1
+        movu128         2*16(%eax), MSG2
+        pshufb          %xmm7, MSG2
+        movu128         3*16(%eax), MSG3
+        pshufb          %xmm7, MSG3
        /* Save hash values for addition after rounds */
-        movu128         E0, 16(%esp)
+        mova128         E0, %xmm7
-        movu128         ABCD, (%esp)
+        /*mova128       ABCD, %xmm8 - NOPE, 32bit has no xmm8 */
        /* Rounds 0-3 */
-        movu128         0*16(%eax), MSG0
-        pshufb          SHUF_MASK, MSG0
                paddd           MSG0, E0
                mova128         ABCD, E1
                sha1rnds4       $0, E0, ABCD
        /* Rounds 4-7 */
-        movu128         1*16(%eax), MSG1
-        pshufb          SHUF_MASK, MSG1
                sha1nexte       MSG1, E1
                mova128         ABCD, E0
                sha1rnds4       $0, E1, ABCD
        sha1msg1        MSG1, MSG0
        /* Rounds 8-11 */
-        movu128         2*16(%eax), MSG2
-        pshufb          SHUF_MASK, MSG2
                sha1nexte       MSG2, E0
                mova128         ABCD, E1
                sha1rnds4       $0, E0, ABCD
@@ -78,8 +75,6 @@ sha1_process_block64_shaNI:
        xor128          MSG2, MSG0
        /* Rounds 12-15 */
-        movu128         3*16(%eax), MSG3
-        pshufb          SHUF_MASK, MSG3
                sha1nexte       MSG3, E1
                mova128         ABCD, E0
        sha1msg2        MSG3, MSG0
@@ -210,21 +205,21 @@ sha1_process_block64_shaNI:
                sha1rnds4       $3, E1, ABCD
        /* Add current hash values with previously saved */
-        sha1nexte       16(%esp), E0
+        sha1nexte       %xmm7, E0
-        paddd           (%esp), ABCD
+        /*paddd         %xmm8, ABCD - 32-bit mode has no xmm8 */
+        movu128         76(%eax), %xmm7 # get original ABCD (not shuffled)...
        /* Write hash values back in the correct order */
        shuf128_32      $0x1B, ABCD, ABCD
+        paddd           %xmm7, ABCD     # ...add it to final ABCD
        movu128         ABCD, 76(%eax)
        extr128_32      $3, E0, 76+4*4(%eax)
-        movl    %ebp, %esp
-        popl    %ebp
        ret
        .size   sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
-.section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+        .section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
+        .balign 16
 PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x000102030405060708090a0b0c0d0e0f
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 87fb616a1..287cfe547 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -1,7 +1,7 @@
 ### Generated by hash_md5_sha_x86-64.S.sh ###
 #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
-        .section        .text.sha1_process_block64,"ax",@progbits
+        .section        .text.sha1_process_block64, "ax", @progbits
        .globl  sha1_process_block64
        .hidden sha1_process_block64
        .type   sha1_process_block64, @function
@@ -10,7 +10,7 @@
 sha1_process_block64:
        pushq   %rbp    # 1 byte insn
        pushq   %rbx    # 1 byte insn
-        pushq   %r15    # 2 byte insn
+#       pushq   %r15    # 2 byte insn
        pushq   %r14    # 2 byte insn
        pushq   %r13    # 2 byte insn
        pushq   %r12    # 2 byte insn
@@ -19,17 +19,13 @@ sha1_process_block64:
 #Register and stack use:
 # eax..edx: a..d
 # ebp: e
-# esi,edi: temps
+# esi,edi,r8..r14: temps
-# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
+# r15: unused
-# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
+# xmm0..xmm3: W[]
-        movl    $3, %eax
+# xmm4,xmm5: temps
-1:
+# xmm6: current round constant
-        movq    (%rdi,%rax,8), %rsi
+# xmm7: all round constants
-        bswapq  %rsi
+# -64(%rsp): area for passing RCONST + W[] from vector to integer units
-        rolq    $32, %rsi
-        movq    %rsi, -32(%rsp,%rax,8)
-        decl    %eax
-        jns     1b
        movl    80(%rdi), %eax          # a = ctx->hash[0]
        movl    84(%rdi), %ebx          # b = ctx->hash[1]
@@ -37,587 +33,760 @@ sha1_process_block64:
        movl    92(%rdi), %edx          # d = ctx->hash[3]
        movl    96(%rdi), %ebp          # e = ctx->hash[4]
-        movq    4*8(%rdi), %r8
+        movaps  sha1const(%rip), %xmm7
-        movq    4*10(%rdi), %r10
+        pshufd  $0x00, %xmm7, %xmm6
+        # Load W[] to xmm registers, byteswapping on the fly.
+        #
+        # For iterations 0..15, we pass W[] in rsi,r8..r14
+        # for use in RD1As instead of spilling them to stack.
+        # We lose parallelized addition of RCONST, but LEA
+        # can do two additions at once, so it is probably a wash.
+        # (We use rsi instead of rN because this makes two
+        # LEAs in two first RD1As shorter by one byte).
+        movq    4*0(%rdi), %rsi
+        movq    4*2(%rdi), %r8
+        bswapq  %rsi
        bswapq  %r8
+        rolq    $32, %rsi               # rsi = W[1]:W[0]
+        rolq    $32, %r8                # r8  = W[3]:W[2]
+        movq    %rsi, %xmm0
+        movq    %r8, %xmm4
+        punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
+#       movaps  %xmm0, %xmm4            # add RCONST, spill to stack
+#       paddd   %xmm6, %xmm4
+#       movups  %xmm4, -64+16*0(%rsp)
+        movq    4*4(%rdi), %r9
+        movq    4*6(%rdi), %r10
+        bswapq  %r9
        bswapq  %r10
-        movq    4*12(%rdi), %r12
+        rolq    $32, %r9                # r9  = W[5]:W[4]
-        movq    4*14(%rdi), %r14
+        rolq    $32, %r10               # r10 = W[7]:W[6]
+        movq    %r9, %xmm1
+        movq    %r10, %xmm4
+        punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
+        movq    4*8(%rdi), %r11
+        movq    4*10(%rdi), %r12
+        bswapq  %r11
        bswapq  %r12
+        rolq    $32, %r11               # r11  = W[9]:W[8]
+        rolq    $32, %r12               # r12  = W[11]:W[10]
+        movq    %r11, %xmm2
+        movq    %r12, %xmm4
+        punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
+        movq    4*12(%rdi), %r13
+        movq    4*14(%rdi), %r14
+        bswapq  %r13
        bswapq  %r14
-        movl    %r8d, %r9d
+        rolq    $32, %r13               # r13  = W[13]:W[12]
-        shrq    $32, %r8
+        rolq    $32, %r14               # r14  = W[15]:W[14]
-        movl    %r10d, %r11d
+        movq    %r13, %xmm3
-        shrq    $32, %r10
+        movq    %r14, %xmm4
-        movl    %r12d, %r13d
+        punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
-        shrq    $32, %r12
-        movl    %r14d, %r15d
-        shrq    $32, %r14
 # 0
-        # W[0], already in %esi
+        leal    0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
+        shrq    $32, %rsi
        movl    %ecx, %edi              # c
        xorl    %edx, %edi              # ^d
        andl    %ebx, %edi              # &b
        xorl    %edx, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
        addl    %edi, %ebp              # e += (((c ^ d) & b) ^ d)
-        movl    %eax, %esi              #
+        movl    %eax, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %ebp              # e += rotl32(a,5)
+        addl    %edi, %ebp              # e += rotl32(a,5)
        rorl    $2, %ebx                # b = rotl32(b,30)
 # 1
-        movl    -32+4*1(%rsp), %esi             # W[n]
+        leal    0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
        movl    %ebx, %edi              # c
        xorl    %ecx, %edi              # ^d
        andl    %eax, %edi              # &b
        xorl    %ecx, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
        addl    %edi, %edx              # e += (((c ^ d) & b) ^ d)
-        movl    %ebp, %esi              #
+        movl    %ebp, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %edx              # e += rotl32(a,5)
+        addl    %edi, %edx              # e += rotl32(a,5)
        rorl    $2, %eax                # b = rotl32(b,30)
 # 2
-        movl    -32+4*2(%rsp), %esi             # W[n]
+        leal    0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n]
+        shrq    $32, %r8
        movl    %eax, %edi              # c
        xorl    %ebx, %edi              # ^d
        andl    %ebp, %edi              # &b
        xorl    %ebx, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n]
        addl    %edi, %ecx              # e += (((c ^ d) & b) ^ d)
-        movl    %edx, %esi              #
+        movl    %edx, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %ecx              # e += rotl32(a,5)
+        addl    %edi, %ecx              # e += rotl32(a,5)
        rorl    $2, %ebp                # b = rotl32(b,30)
 # 3
-        movl    -32+4*3(%rsp), %esi             # W[n]
+        leal    0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
        movl    %ebp, %edi              # c
        xorl    %eax, %edi              # ^d
        andl    %edx, %edi              # &b
        xorl    %eax, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n]
        addl    %edi, %ebx              # e += (((c ^ d) & b) ^ d)
-        movl    %ecx, %esi              #
+        movl    %ecx, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %ebx              # e += rotl32(a,5)
+        addl    %edi, %ebx              # e += rotl32(a,5)
        rorl    $2, %edx                # b = rotl32(b,30)
 # 4
-        movl    -32+4*4(%rsp), %esi             # W[n]
+        leal    0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
+        shrq    $32, %r9
        movl    %edx, %edi              # c
        xorl    %ebp, %edi              # ^d
        andl    %ecx, %edi              # &b
        xorl    %ebp, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n]
        addl    %edi, %eax              # e += (((c ^ d) & b) ^ d)
-        movl    %ebx, %esi              #
+        movl    %ebx, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %eax              # e += rotl32(a,5)
+        addl    %edi, %eax              # e += rotl32(a,5)
        rorl    $2, %ecx                # b = rotl32(b,30)
 # 5
-        movl    -32+4*5(%rsp), %esi             # W[n]
+        leal    0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n]
        movl    %ecx, %edi              # c
        xorl    %edx, %edi              # ^d
        andl    %ebx, %edi              # &b
        xorl    %edx, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
        addl    %edi, %ebp              # e += (((c ^ d) & b) ^ d)
-        movl    %eax, %esi              #
+        movl    %eax, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %ebp              # e += rotl32(a,5)
+        addl    %edi, %ebp              # e += rotl32(a,5)
        rorl    $2, %ebx                # b = rotl32(b,30)
 # 6
-        movl    -32+4*6(%rsp), %esi             # W[n]
+        leal    0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n]
+        shrq    $32, %r10
        movl    %ebx, %edi              # c
        xorl    %ecx, %edi              # ^d
        andl    %eax, %edi              # &b
        xorl    %ecx, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
        addl    %edi, %edx              # e += (((c ^ d) & b) ^ d)
-        movl    %ebp, %esi              #
+        movl    %ebp, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %edx              # e += rotl32(a,5)
+        addl    %edi, %edx              # e += rotl32(a,5)
        rorl    $2, %eax                # b = rotl32(b,30)
 # 7
-        movl    -32+4*7(%rsp), %esi             # W[n]
+        leal    0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n]
        movl    %eax, %edi              # c
        xorl    %ebx, %edi              # ^d
        andl    %ebp, %edi              # &b
        xorl    %ebx, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n]
        addl    %edi, %ecx              # e += (((c ^ d) & b) ^ d)
-        movl    %edx, %esi              #
+        movl    %edx, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %ecx              # e += rotl32(a,5)
+        addl    %edi, %ecx              # e += rotl32(a,5)
        rorl    $2, %ebp                # b = rotl32(b,30)
+# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
+        movaps  %xmm3, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm0, %xmm5
+        shufps  $0x4e, %xmm1, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm2, %xmm0    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm0    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm0, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm0, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm0, %xmm0    #  shift left by 1
+        psubd   %xmm4, %xmm0    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm0    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm0    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm0, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*0(%rsp)
 # 8
-        # W[n], in %r8
+        leal    0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n]
+        shrq    $32, %r11
        movl    %ebp, %edi              # c
        xorl    %eax, %edi              # ^d
        andl    %edx, %edi              # &b
        xorl    %eax, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
        addl    %edi, %ebx              # e += (((c ^ d) & b) ^ d)
-        movl    %ecx, %esi              #
+        movl    %ecx, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %ebx              # e += rotl32(a,5)
+        addl    %edi, %ebx              # e += rotl32(a,5)
        rorl    $2, %edx                # b = rotl32(b,30)
 # 9
-        # W[n], in %r9
+        leal    0x5A827999(%rax,%r11), %eax # e += RCONST + W[n]
        movl    %edx, %edi              # c
        xorl    %ebp, %edi              # ^d
        andl    %ecx, %edi              # &b
        xorl    %ebp, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
        addl    %edi, %eax              # e += (((c ^ d) & b) ^ d)
-        movl    %ebx, %esi              #
+        movl    %ebx, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %eax              # e += rotl32(a,5)
+        addl    %edi, %eax              # e += rotl32(a,5)
        rorl    $2, %ecx                # b = rotl32(b,30)
 # 10
-        # W[n], in %r10
+        leal    0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n]
+        shrq    $32, %r12
        movl    %ecx, %edi              # c
        xorl    %edx, %edi              # ^d
        andl    %ebx, %edi              # &b
        xorl    %edx, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n]
        addl    %edi, %ebp              # e += (((c ^ d) & b) ^ d)
-        movl    %eax, %esi              #
+        movl    %eax, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %ebp              # e += rotl32(a,5)
+        addl    %edi, %ebp              # e += rotl32(a,5)
        rorl    $2, %ebx                # b = rotl32(b,30)
 # 11
-        # W[n], in %r11
+        leal    0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n]
        movl    %ebx, %edi              # c
        xorl    %ecx, %edi              # ^d
        andl    %eax, %edi              # &b
        xorl    %ecx, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n]
        addl    %edi, %edx              # e += (((c ^ d) & b) ^ d)
-        movl    %ebp, %esi              #
+        movl    %ebp, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %edx              # e += rotl32(a,5)
+        addl    %edi, %edx              # e += rotl32(a,5)
        rorl    $2, %eax                # b = rotl32(b,30)
+        pshufd  $0x55, %xmm7, %xmm6
+# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
+        movaps  %xmm0, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm1, %xmm5
+        shufps  $0x4e, %xmm2, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm3, %xmm1    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm1    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm1, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm1, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm1, %xmm1    #  shift left by 1
+        psubd   %xmm4, %xmm1    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm1    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm1    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm1, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*1(%rsp)
 # 12
-        # W[n], in %r12
+        leal    0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n]
+        shrq    $32, %r13
        movl    %eax, %edi              # c
        xorl    %ebx, %edi              # ^d
        andl    %ebp, %edi              # &b
        xorl    %ebx, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n]
        addl    %edi, %ecx              # e += (((c ^ d) & b) ^ d)
-        movl    %edx, %esi              #
+        movl    %edx, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %ecx              # e += rotl32(a,5)
+        addl    %edi, %ecx              # e += rotl32(a,5)
        rorl    $2, %ebp                # b = rotl32(b,30)
 # 13
-        # W[n], in %r13
+        leal    0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
        movl    %ebp, %edi              # c
        xorl    %eax, %edi              # ^d
        andl    %edx, %edi              # &b
        xorl    %eax, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
        addl    %edi, %ebx              # e += (((c ^ d) & b) ^ d)
-        movl    %ecx, %esi              #
+        movl    %ecx, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %ebx              # e += rotl32(a,5)
+        addl    %edi, %ebx              # e += rotl32(a,5)
        rorl    $2, %edx                # b = rotl32(b,30)
 # 14
-        # W[n], in %r14
+        leal    0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
+        shrq    $32, %r14
        movl    %edx, %edi              # c
        xorl    %ebp, %edi              # ^d
        andl    %ecx, %edi              # &b
        xorl    %ebp, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
        addl    %edi, %eax              # e += (((c ^ d) & b) ^ d)
-        movl    %ebx, %esi              #
+        movl    %ebx, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %eax              # e += rotl32(a,5)
+        addl    %edi, %eax              # e += rotl32(a,5)
        rorl    $2, %ecx                # b = rotl32(b,30)
 # 15
-        # W[n], in %r15
+        leal    0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n]
        movl    %ecx, %edi              # c
        xorl    %edx, %edi              # ^d
        andl    %ebx, %edi              # &b
        xorl    %edx, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n]
        addl    %edi, %ebp              # e += (((c ^ d) & b) ^ d)
-        movl    %eax, %esi              #
+        movl    %eax, %edi              #
-        roll    $5, %esi                # rotl32(a,5)
+        roll    $5, %edi                # rotl32(a,5)
-        addl    %esi, %ebp              # e += rotl32(a,5)
+        addl    %edi, %ebp              # e += rotl32(a,5)
        rorl    $2, %ebx                # b = rotl32(b,30)
+# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
+        movaps  %xmm1, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm2, %xmm5
+        shufps  $0x4e, %xmm3, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm0, %xmm2    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm2    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm2, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm2, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm2, %xmm2    #  shift left by 1
+        psubd   %xmm4, %xmm2    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm2    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm2    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm2, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*2(%rsp)
 # 16
-        movl    %r13d, %esi     # W[(n+13) & 15]
-        xorl    %r8d, %esi              # ^W[(n+8) & 15]
-        xorl    -32+4*2(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*0(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*0(%rsp)             # store to W[n & 15]
        movl    %ebx, %edi              # c
        xorl    %ecx, %edi              # ^d
        andl    %eax, %edi              # &b
        xorl    %ecx, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n & 15]
+        addl    -64+4*0(%rsp), %edx     # e += RCONST + W[n & 15]
        addl    %edi, %edx              # e += (((c ^ d) & b) ^ d)
        movl    %ebp, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %edx              # e += rotl32(a,5)
        rorl    $2, %eax                # b = rotl32(b,30)
 # 17
-        movl    %r14d, %esi     # W[(n+13) & 15]
-        xorl    %r9d, %esi              # ^W[(n+8) & 15]
-        xorl    -32+4*3(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*1(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*1(%rsp)             # store to W[n & 15]
        movl    %eax, %edi              # c
        xorl    %ebx, %edi              # ^d
        andl    %ebp, %edi              # &b
        xorl    %ebx, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n & 15]
+        addl    -64+4*1(%rsp), %ecx     # e += RCONST + W[n & 15]
        addl    %edi, %ecx              # e += (((c ^ d) & b) ^ d)
        movl    %edx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ecx              # e += rotl32(a,5)
        rorl    $2, %ebp                # b = rotl32(b,30)
 # 18
-        movl    %r15d, %esi     # W[(n+13) & 15]
-        xorl    %r10d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*4(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*2(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*2(%rsp)             # store to W[n & 15]
        movl    %ebp, %edi              # c
        xorl    %eax, %edi              # ^d
        andl    %edx, %edi              # &b
        xorl    %eax, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
+        addl    -64+4*2(%rsp), %ebx     # e += RCONST + W[n & 15]
        addl    %edi, %ebx              # e += (((c ^ d) & b) ^ d)
        movl    %ecx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebx              # e += rotl32(a,5)
        rorl    $2, %edx                # b = rotl32(b,30)
 # 19
-        movl    -32+4*0(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r11d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*5(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*3(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*3(%rsp)             # store to W[n & 15]
        movl    %edx, %edi              # c
        xorl    %ebp, %edi              # ^d
        andl    %ecx, %edi              # &b
        xorl    %ebp, %edi              # (((c ^ d) & b) ^ d)
-        leal    0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n & 15]
+        addl    -64+4*3(%rsp), %eax     # e += RCONST + W[n & 15]
        addl    %edi, %eax              # e += (((c ^ d) & b) ^ d)
        movl    %ebx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %eax              # e += rotl32(a,5)
        rorl    $2, %ecx                # b = rotl32(b,30)
+# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
+        movaps  %xmm2, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm3, %xmm5
+        shufps  $0x4e, %xmm0, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm1, %xmm3    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm3    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm3, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm3, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm3, %xmm3    #  shift left by 1
+        psubd   %xmm4, %xmm3    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm3    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm3    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm3, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*3(%rsp)
 # 20
-        movl    -32+4*1(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r12d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*6(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*4(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*4(%rsp)             # store to W[n & 15]
        movl    %ecx, %edi              # c
        xorl    %edx, %edi              # ^d
        xorl    %ebx, %edi              # ^b
-        leal    0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15]
+        addl    -64+4*4(%rsp), %ebp     # e += RCONST + W[n & 15]
        addl    %edi, %ebp              # e += (c ^ d ^ b)
        movl    %eax, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebp              # e += rotl32(a,5)
        rorl    $2, %ebx                # b = rotl32(b,30)
 # 21
-        movl    -32+4*2(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r13d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*7(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*5(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*5(%rsp)             # store to W[n & 15]
        movl    %ebx, %edi              # c
        xorl    %ecx, %edi              # ^d
        xorl    %eax, %edi              # ^b
-        leal    0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15]
+        addl    -64+4*5(%rsp), %edx     # e += RCONST + W[n & 15]
        addl    %edi, %edx              # e += (c ^ d ^ b)
        movl    %ebp, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %edx              # e += rotl32(a,5)
        rorl    $2, %eax                # b = rotl32(b,30)
 # 22
-        movl    -32+4*3(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r14d, %esi             # ^W[(n+8) & 15]
-        xorl    %r8d, %esi              # ^W[(n+2) & 15]
-        xorl    -32+4*6(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*6(%rsp)             # store to W[n & 15]
        movl    %eax, %edi              # c
        xorl    %ebx, %edi              # ^d
        xorl    %ebp, %edi              # ^b
-        leal    0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15]
+        addl    -64+4*6(%rsp), %ecx     # e += RCONST + W[n & 15]
        addl    %edi, %ecx              # e += (c ^ d ^ b)
        movl    %edx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ecx              # e += rotl32(a,5)
        rorl    $2, %ebp                # b = rotl32(b,30)
 # 23
-        movl    -32+4*4(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r15d, %esi             # ^W[(n+8) & 15]
-        xorl    %r9d, %esi              # ^W[(n+2) & 15]
-        xorl    -32+4*7(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*7(%rsp)             # store to W[n & 15]
        movl    %ebp, %edi              # c
        xorl    %eax, %edi              # ^d
        xorl    %edx, %edi              # ^b
-        leal    0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
+        addl    -64+4*7(%rsp), %ebx     # e += RCONST + W[n & 15]
        addl    %edi, %ebx              # e += (c ^ d ^ b)
        movl    %ecx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebx              # e += rotl32(a,5)
        rorl    $2, %edx                # b = rotl32(b,30)
+# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
+        movaps  %xmm3, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm0, %xmm5
+        shufps  $0x4e, %xmm1, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm2, %xmm0    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm0    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm0, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm0, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm0, %xmm0    #  shift left by 1
+        psubd   %xmm4, %xmm0    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm0    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm0    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm0, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*0(%rsp)
 # 24
-        xorl    -32+4*5(%rsp), %r8d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*0(%rsp), %r8d     # ^W[(n+8) & 15]
-        xorl    %r10d, %r8d     # ^W[(n+2) & 15]
-        roll    %r8d            #
        movl    %edx, %edi              # c
        xorl    %ebp, %edi              # ^d
        xorl    %ecx, %edi              # ^b
-        leal    0x6ED9EBA1(%rax,%r8), %eax # e += RCONST + W[n & 15]
+        addl    -64+4*8(%rsp), %eax     # e += RCONST + W[n & 15]
        addl    %edi, %eax              # e += (c ^ d ^ b)
        movl    %ebx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %eax              # e += rotl32(a,5)
        rorl    $2, %ecx                # b = rotl32(b,30)
 # 25
-        xorl    -32+4*6(%rsp), %r9d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*1(%rsp), %r9d     # ^W[(n+8) & 15]
-        xorl    %r11d, %r9d     # ^W[(n+2) & 15]
-        roll    %r9d            #
        movl    %ecx, %edi              # c
        xorl    %edx, %edi              # ^d
        xorl    %ebx, %edi              # ^b
-        leal    0x6ED9EBA1(%rbp,%r9), %ebp # e += RCONST + W[n & 15]
+        addl    -64+4*9(%rsp), %ebp     # e += RCONST + W[n & 15]
        addl    %edi, %ebp              # e += (c ^ d ^ b)
        movl    %eax, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebp              # e += rotl32(a,5)
        rorl    $2, %ebx                # b = rotl32(b,30)
 # 26
-        xorl    -32+4*7(%rsp), %r10d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*2(%rsp), %r10d    # ^W[(n+8) & 15]
-        xorl    %r12d, %r10d    # ^W[(n+2) & 15]
-        roll    %r10d           #
        movl    %ebx, %edi              # c
        xorl    %ecx, %edi              # ^d
        xorl    %eax, %edi              # ^b
-        leal    0x6ED9EBA1(%rdx,%r10), %edx # e += RCONST + W[n & 15]
+        addl    -64+4*10(%rsp), %edx    # e += RCONST + W[n & 15]
        addl    %edi, %edx              # e += (c ^ d ^ b)
        movl    %ebp, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %edx              # e += rotl32(a,5)
        rorl    $2, %eax                # b = rotl32(b,30)
 # 27
-        xorl    %r8d, %r11d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*3(%rsp), %r11d    # ^W[(n+8) & 15]
-        xorl    %r13d, %r11d    # ^W[(n+2) & 15]
-        roll    %r11d           #
        movl    %eax, %edi              # c
        xorl    %ebx, %edi              # ^d
        xorl    %ebp, %edi              # ^b
-        leal    0x6ED9EBA1(%rcx,%r11), %ecx # e += RCONST + W[n & 15]
+        addl    -64+4*11(%rsp), %ecx    # e += RCONST + W[n & 15]
        addl    %edi, %ecx              # e += (c ^ d ^ b)
        movl    %edx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ecx              # e += rotl32(a,5)
        rorl    $2, %ebp                # b = rotl32(b,30)
+# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
+        movaps  %xmm0, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm1, %xmm5
+        shufps  $0x4e, %xmm2, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm3, %xmm1    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm1    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm1, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm1, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm1, %xmm1    #  shift left by 1
+        psubd   %xmm4, %xmm1    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm1    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm1    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm1, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*1(%rsp)
 # 28
-        xorl    %r9d, %r12d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*4(%rsp), %r12d    # ^W[(n+8) & 15]
-        xorl    %r14d, %r12d    # ^W[(n+2) & 15]
-        roll    %r12d           #
        movl    %ebp, %edi              # c
        xorl    %eax, %edi              # ^d
        xorl    %edx, %edi              # ^b
-        leal    0x6ED9EBA1(%rbx,%r12), %ebx # e += RCONST + W[n & 15]
+        addl    -64+4*12(%rsp), %ebx    # e += RCONST + W[n & 15]
        addl    %edi, %ebx              # e += (c ^ d ^ b)
        movl    %ecx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebx              # e += rotl32(a,5)
        rorl    $2, %edx                # b = rotl32(b,30)
 # 29
-        xorl    %r10d, %r13d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*5(%rsp), %r13d    # ^W[(n+8) & 15]
-        xorl    %r15d, %r13d    # ^W[(n+2) & 15]
-        roll    %r13d           #
        movl    %edx, %edi              # c
        xorl    %ebp, %edi              # ^d
        xorl    %ecx, %edi              # ^b
-        leal    0x6ED9EBA1(%rax,%r13), %eax # e += RCONST + W[n & 15]
+        addl    -64+4*13(%rsp), %eax    # e += RCONST + W[n & 15]
        addl    %edi, %eax              # e += (c ^ d ^ b)
        movl    %ebx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %eax              # e += rotl32(a,5)
        rorl    $2, %ecx                # b = rotl32(b,30)
 # 30
-        xorl    %r11d, %r14d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*6(%rsp), %r14d    # ^W[(n+8) & 15]
-        xorl    -32+4*0(%rsp), %r14d    # ^W[(n+2) & 15]
-        roll    %r14d           #
        movl    %ecx, %edi              # c
        xorl    %edx, %edi              # ^d
        xorl    %ebx, %edi              # ^b
-        leal    0x6ED9EBA1(%rbp,%r14), %ebp # e += RCONST + W[n & 15]
+        addl    -64+4*14(%rsp), %ebp    # e += RCONST + W[n & 15]
        addl    %edi, %ebp              # e += (c ^ d ^ b)
        movl    %eax, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebp              # e += rotl32(a,5)
        rorl    $2, %ebx                # b = rotl32(b,30)
 # 31
-        xorl    %r12d, %r15d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*7(%rsp), %r15d    # ^W[(n+8) & 15]
-        xorl    -32+4*1(%rsp), %r15d    # ^W[(n+2) & 15]
-        roll    %r15d           #
        movl    %ebx, %edi              # c
        xorl    %ecx, %edi              # ^d
        xorl    %eax, %edi              # ^b
-        leal    0x6ED9EBA1(%rdx,%r15), %edx # e += RCONST + W[n & 15]
+        addl    -64+4*15(%rsp), %edx    # e += RCONST + W[n & 15]
        addl    %edi, %edx              # e += (c ^ d ^ b)
        movl    %ebp, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %edx              # e += rotl32(a,5)
        rorl    $2, %eax                # b = rotl32(b,30)
+        pshufd  $0xaa, %xmm7, %xmm6
+# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
+        movaps  %xmm1, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm2, %xmm5
+        shufps  $0x4e, %xmm3, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm0, %xmm2    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm2    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm2, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm2, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm2, %xmm2    #  shift left by 1
+        psubd   %xmm4, %xmm2    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm2    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm2    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm2, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*2(%rsp)
 # 32
-        movl    %r13d, %esi     # W[(n+13) & 15]
-        xorl    %r8d, %esi              # ^W[(n+8) & 15]
-        xorl    -32+4*2(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*0(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*0(%rsp)             # store to W[n & 15]
        movl    %eax, %edi              # c
        xorl    %ebx, %edi              # ^d
        xorl    %ebp, %edi              # ^b
-        leal    0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15]
+        addl    -64+4*0(%rsp), %ecx     # e += RCONST + W[n & 15]
        addl    %edi, %ecx              # e += (c ^ d ^ b)
        movl    %edx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ecx              # e += rotl32(a,5)
        rorl    $2, %ebp                # b = rotl32(b,30)
 # 33
-        movl    %r14d, %esi     # W[(n+13) & 15]
-        xorl    %r9d, %esi              # ^W[(n+8) & 15]
-        xorl    -32+4*3(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*1(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*1(%rsp)             # store to W[n & 15]
        movl    %ebp, %edi              # c
        xorl    %eax, %edi              # ^d
        xorl    %edx, %edi              # ^b
-        leal    0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
+        addl    -64+4*1(%rsp), %ebx     # e += RCONST + W[n & 15]
        addl    %edi, %ebx              # e += (c ^ d ^ b)
        movl    %ecx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebx              # e += rotl32(a,5)
        rorl    $2, %edx                # b = rotl32(b,30)
 # 34
-        movl    %r15d, %esi     # W[(n+13) & 15]
-        xorl    %r10d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*4(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*2(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*2(%rsp)             # store to W[n & 15]
        movl    %edx, %edi              # c
        xorl    %ebp, %edi              # ^d
        xorl    %ecx, %edi              # ^b
-        leal    0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15]
+        addl    -64+4*2(%rsp), %eax     # e += RCONST + W[n & 15]
        addl    %edi, %eax              # e += (c ^ d ^ b)
        movl    %ebx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %eax              # e += rotl32(a,5)
        rorl    $2, %ecx                # b = rotl32(b,30)
 # 35
-        movl    -32+4*0(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r11d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*5(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*3(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*3(%rsp)             # store to W[n & 15]
        movl    %ecx, %edi              # c
        xorl    %edx, %edi              # ^d
        xorl    %ebx, %edi              # ^b
-        leal    0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15]
+        addl    -64+4*3(%rsp), %ebp     # e += RCONST + W[n & 15]
        addl    %edi, %ebp              # e += (c ^ d ^ b)
        movl    %eax, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebp              # e += rotl32(a,5)
        rorl    $2, %ebx                # b = rotl32(b,30)
+# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
+        movaps  %xmm2, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm3, %xmm5
+        shufps  $0x4e, %xmm0, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm1, %xmm3    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm3    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm3, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm3, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm3, %xmm3    #  shift left by 1
+        psubd   %xmm4, %xmm3    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm3    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm3    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm3, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*3(%rsp)
 # 36
-        movl    -32+4*1(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r12d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*6(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*4(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*4(%rsp)             # store to W[n & 15]
        movl    %ebx, %edi              # c
        xorl    %ecx, %edi              # ^d
        xorl    %eax, %edi              # ^b
-        leal    0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15]
+        addl    -64+4*4(%rsp), %edx     # e += RCONST + W[n & 15]
        addl    %edi, %edx              # e += (c ^ d ^ b)
        movl    %ebp, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %edx              # e += rotl32(a,5)
        rorl    $2, %eax                # b = rotl32(b,30)
 # 37
-        movl    -32+4*2(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r13d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*7(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*5(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*5(%rsp)             # store to W[n & 15]
        movl    %eax, %edi              # c
        xorl    %ebx, %edi              # ^d
        xorl    %ebp, %edi              # ^b
-        leal    0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15]
+        addl    -64+4*5(%rsp), %ecx     # e += RCONST + W[n & 15]
        addl    %edi, %ecx              # e += (c ^ d ^ b)
        movl    %edx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ecx              # e += rotl32(a,5)
        rorl    $2, %ebp                # b = rotl32(b,30)
 # 38
-        movl    -32+4*3(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r14d, %esi             # ^W[(n+8) & 15]
-        xorl    %r8d, %esi              # ^W[(n+2) & 15]
-        xorl    -32+4*6(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*6(%rsp)             # store to W[n & 15]
        movl    %ebp, %edi              # c
        xorl    %eax, %edi              # ^d
        xorl    %edx, %edi              # ^b
-        leal    0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
+        addl    -64+4*6(%rsp), %ebx     # e += RCONST + W[n & 15]
        addl    %edi, %ebx              # e += (c ^ d ^ b)
        movl    %ecx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebx              # e += rotl32(a,5)
        rorl    $2, %edx                # b = rotl32(b,30)
 # 39
-        movl    -32+4*4(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r15d, %esi             # ^W[(n+8) & 15]
-        xorl    %r9d, %esi              # ^W[(n+2) & 15]
-        xorl    -32+4*7(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*7(%rsp)             # store to W[n & 15]
        movl    %edx, %edi              # c
        xorl    %ebp, %edi              # ^d
        xorl    %ecx, %edi              # ^b
-        leal    0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15]
+        addl    -64+4*7(%rsp), %eax     # e += RCONST + W[n & 15]
        addl    %edi, %eax              # e += (c ^ d ^ b)
        movl    %ebx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %eax              # e += rotl32(a,5)
        rorl    $2, %ecx                # b = rotl32(b,30)
+# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
+        movaps  %xmm3, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm0, %xmm5
+        shufps  $0x4e, %xmm1, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm2, %xmm0    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm0    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm0, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm0, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm0, %xmm0    #  shift left by 1
+        psubd   %xmm4, %xmm0    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm0    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm0    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm0, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*0(%rsp)
 # 40
        movl    %ebx, %edi              # di: b
        movl    %ebx, %esi              # si: b
@@ -625,12 +794,8 @@ sha1_process_block64:
        andl    %ecx, %esi              # si: b & c
        andl    %edx, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        xorl    -32+4*5(%rsp), %r8d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*0(%rsp), %r8d     # ^W[(n+8) & 15]
-        xorl    %r10d, %r8d     # ^W[(n+2) & 15]
-        roll    %r8d            #
        addl    %edi, %ebp              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rbp,%r8), %ebp # e += RCONST + W[n & 15]
+        addl    -64+4*8(%rsp), %ebp     # e += RCONST + W[n & 15]
        movl    %eax, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebp              # e += rotl32(a,5)
@@ -642,12 +807,8 @@ sha1_process_block64:
        andl    %ebx, %esi              # si: b & c
        andl    %ecx, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        xorl    -32+4*6(%rsp), %r9d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*1(%rsp), %r9d     # ^W[(n+8) & 15]
-        xorl    %r11d, %r9d     # ^W[(n+2) & 15]
-        roll    %r9d            #
        addl    %edi, %edx              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rdx,%r9), %edx # e += RCONST + W[n & 15]
+        addl    -64+4*9(%rsp), %edx     # e += RCONST + W[n & 15]
        movl    %ebp, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %edx              # e += rotl32(a,5)
@@ -659,12 +820,8 @@ sha1_process_block64:
        andl    %eax, %esi              # si: b & c
        andl    %ebx, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        xorl    -32+4*7(%rsp), %r10d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*2(%rsp), %r10d    # ^W[(n+8) & 15]
-        xorl    %r12d, %r10d    # ^W[(n+2) & 15]
-        roll    %r10d           #
        addl    %edi, %ecx              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rcx,%r10), %ecx # e += RCONST + W[n & 15]
+        addl    -64+4*10(%rsp), %ecx    # e += RCONST + W[n & 15]
        movl    %edx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ecx              # e += rotl32(a,5)
@@ -676,16 +833,42 @@ sha1_process_block64:
        andl    %ebp, %esi              # si: b & c
        andl    %eax, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        xorl    %r8d, %r11d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*3(%rsp), %r11d    # ^W[(n+8) & 15]
-        xorl    %r13d, %r11d    # ^W[(n+2) & 15]
-        roll    %r11d           #
        addl    %edi, %ebx              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rbx,%r11), %ebx # e += RCONST + W[n & 15]
+        addl    -64+4*11(%rsp), %ebx    # e += RCONST + W[n & 15]
        movl    %ecx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebx              # e += rotl32(a,5)
        rorl    $2, %edx                # b = rotl32(b,30)
+# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
+        movaps  %xmm0, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm1, %xmm5
+        shufps  $0x4e, %xmm2, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm3, %xmm1    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm1    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm1, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm1, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm1, %xmm1    #  shift left by 1
+        psubd   %xmm4, %xmm1    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm1    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm1    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm1, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*1(%rsp)
 # 44
        movl    %ecx, %edi              # di: b
        movl    %ecx, %esi              # si: b
@@ -693,12 +876,8 @@ sha1_process_block64:
        andl    %edx, %esi              # si: b & c
        andl    %ebp, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        xorl    %r9d, %r12d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*4(%rsp), %r12d    # ^W[(n+8) & 15]
-        xorl    %r14d, %r12d    # ^W[(n+2) & 15]
-        roll    %r12d           #
        addl    %edi, %eax              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rax,%r12), %eax # e += RCONST + W[n & 15]
+        addl    -64+4*12(%rsp), %eax    # e += RCONST + W[n & 15]
        movl    %ebx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %eax              # e += rotl32(a,5)
@@ -710,12 +889,8 @@ sha1_process_block64:
        andl    %ecx, %esi              # si: b & c
        andl    %edx, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        xorl    %r10d, %r13d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*5(%rsp), %r13d    # ^W[(n+8) & 15]
-        xorl    %r15d, %r13d    # ^W[(n+2) & 15]
-        roll    %r13d           #
        addl    %edi, %ebp              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rbp,%r13), %ebp # e += RCONST + W[n & 15]
+        addl    -64+4*13(%rsp), %ebp    # e += RCONST + W[n & 15]
        movl    %eax, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebp              # e += rotl32(a,5)
@@ -727,12 +902,8 @@ sha1_process_block64:
        andl    %ebx, %esi              # si: b & c
        andl    %ecx, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        xorl    %r11d, %r14d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*6(%rsp), %r14d    # ^W[(n+8) & 15]
-        xorl    -32+4*0(%rsp), %r14d    # ^W[(n+2) & 15]
-        roll    %r14d           #
        addl    %edi, %edx              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rdx,%r14), %edx # e += RCONST + W[n & 15]
+        addl    -64+4*14(%rsp), %edx    # e += RCONST + W[n & 15]
        movl    %ebp, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %edx              # e += rotl32(a,5)
@@ -744,16 +915,42 @@ sha1_process_block64:
        andl    %eax, %esi              # si: b & c
        andl    %ebx, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        xorl    %r12d, %r15d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*7(%rsp), %r15d    # ^W[(n+8) & 15]
-        xorl    -32+4*1(%rsp), %r15d    # ^W[(n+2) & 15]
-        roll    %r15d           #
        addl    %edi, %ecx              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rcx,%r15), %ecx # e += RCONST + W[n & 15]
+        addl    -64+4*15(%rsp), %ecx    # e += RCONST + W[n & 15]
        movl    %edx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ecx              # e += rotl32(a,5)
        rorl    $2, %ebp                # b = rotl32(b,30)
+# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
+        movaps  %xmm1, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm2, %xmm5
+        shufps  $0x4e, %xmm3, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm0, %xmm2    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm2    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm2, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm2, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm2, %xmm2    #  shift left by 1
+        psubd   %xmm4, %xmm2    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm2    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm2    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm2, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*2(%rsp)
 # 48
        movl    %edx, %edi              # di: b
        movl    %edx, %esi              # si: b
@@ -761,14 +958,8 @@ sha1_process_block64:
        andl    %ebp, %esi              # si: b & c
        andl    %eax, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        movl    %r13d, %esi     # W[(n+13) & 15]
-        xorl    %r8d, %esi              # ^W[(n+8) & 15]
-        xorl    -32+4*2(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*0(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*0(%rsp)             # store to W[n & 15]
        addl    %edi, %ebx              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
+        addl    -64+4*0(%rsp), %ebx     # e += RCONST + W[n & 15]
        movl    %ecx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebx              # e += rotl32(a,5)
@@ -780,14 +971,8 @@ sha1_process_block64:
        andl    %edx, %esi              # si: b & c
        andl    %ebp, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        movl    %r14d, %esi     # W[(n+13) & 15]
-        xorl    %r9d, %esi              # ^W[(n+8) & 15]
-        xorl    -32+4*3(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*1(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*1(%rsp)             # store to W[n & 15]
        addl    %edi, %eax              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15]
+        addl    -64+4*1(%rsp), %eax     # e += RCONST + W[n & 15]
        movl    %ebx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %eax              # e += rotl32(a,5)
@@ -799,14 +984,8 @@ sha1_process_block64:
        andl    %ecx, %esi              # si: b & c
        andl    %edx, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        movl    %r15d, %esi     # W[(n+13) & 15]
-        xorl    %r10d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*4(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*2(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*2(%rsp)             # store to W[n & 15]
        addl    %edi, %ebp              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15]
+        addl    -64+4*2(%rsp), %ebp     # e += RCONST + W[n & 15]
        movl    %eax, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebp              # e += rotl32(a,5)
@@ -818,18 +997,43 @@ sha1_process_block64:
        andl    %ebx, %esi              # si: b & c
        andl    %ecx, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        movl    -32+4*0(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r11d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*5(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*3(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*3(%rsp)             # store to W[n & 15]
        addl    %edi, %edx              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rdx,%rsi), %edx # e += RCONST + W[n & 15]
+        addl    -64+4*3(%rsp), %edx     # e += RCONST + W[n & 15]
        movl    %ebp, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %edx              # e += rotl32(a,5)
        rorl    $2, %eax                # b = rotl32(b,30)
+        pshufd  $0xff, %xmm7, %xmm6
+# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
+        movaps  %xmm2, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm3, %xmm5
+        shufps  $0x4e, %xmm0, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm1, %xmm3    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm3    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm3, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm3, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm3, %xmm3    #  shift left by 1
+        psubd   %xmm4, %xmm3    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm3    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm3    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm3, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*3(%rsp)
 # 52
        movl    %ebp, %edi              # di: b
        movl    %ebp, %esi              # si: b
@@ -837,14 +1041,8 @@ sha1_process_block64:
        andl    %eax, %esi              # si: b & c
        andl    %ebx, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        movl    -32+4*1(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r12d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*6(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*4(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*4(%rsp)             # store to W[n & 15]
        addl    %edi, %ecx              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rcx,%rsi), %ecx # e += RCONST + W[n & 15]
+        addl    -64+4*4(%rsp), %ecx     # e += RCONST + W[n & 15]
        movl    %edx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ecx              # e += rotl32(a,5)
@@ -856,14 +1054,8 @@ sha1_process_block64:
        andl    %ebp, %esi              # si: b & c
        andl    %eax, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        movl    -32+4*2(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r13d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*7(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*5(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*5(%rsp)             # store to W[n & 15]
        addl    %edi, %ebx              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
+        addl    -64+4*5(%rsp), %ebx     # e += RCONST + W[n & 15]
        movl    %ecx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebx              # e += rotl32(a,5)
@@ -875,14 +1067,8 @@ sha1_process_block64:
        andl    %edx, %esi              # si: b & c
        andl    %ebp, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        movl    -32+4*3(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r14d, %esi             # ^W[(n+8) & 15]
-        xorl    %r8d, %esi              # ^W[(n+2) & 15]
-        xorl    -32+4*6(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*6(%rsp)             # store to W[n & 15]
        addl    %edi, %eax              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15]
+        addl    -64+4*6(%rsp), %eax     # e += RCONST + W[n & 15]
        movl    %ebx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %eax              # e += rotl32(a,5)
@@ -894,18 +1080,42 @@ sha1_process_block64:
        andl    %ecx, %esi              # si: b & c
        andl    %edx, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        movl    -32+4*4(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r15d, %esi             # ^W[(n+8) & 15]
-        xorl    %r9d, %esi              # ^W[(n+2) & 15]
-        xorl    -32+4*7(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*7(%rsp)             # store to W[n & 15]
        addl    %edi, %ebp              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15]
+        addl    -64+4*7(%rsp), %ebp     # e += RCONST + W[n & 15]
        movl    %eax, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebp              # e += rotl32(a,5)
        rorl    $2, %ebx                # b = rotl32(b,30)
+# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
+        movaps  %xmm3, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm0, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm0, %xmm5
+        shufps  $0x4e, %xmm1, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm2, %xmm0    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm0    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm0, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm0, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm0, %xmm0    #  shift left by 1
+        psubd   %xmm4, %xmm0    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm0    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm0    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm0, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*0(%rsp)
 # 56
        movl    %eax, %edi              # di: b
        movl    %eax, %esi              # si: b
@@ -913,12 +1123,8 @@ sha1_process_block64:
        andl    %ebx, %esi              # si: b & c
        andl    %ecx, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        xorl    -32+4*5(%rsp), %r8d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*0(%rsp), %r8d     # ^W[(n+8) & 15]
-        xorl    %r10d, %r8d     # ^W[(n+2) & 15]
-        roll    %r8d            #
        addl    %edi, %edx              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rdx,%r8), %edx # e += RCONST + W[n & 15]
+        addl    -64+4*8(%rsp), %edx     # e += RCONST + W[n & 15]
        movl    %ebp, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %edx              # e += rotl32(a,5)
@@ -930,12 +1136,8 @@ sha1_process_block64:
        andl    %eax, %esi              # si: b & c
        andl    %ebx, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        xorl    -32+4*6(%rsp), %r9d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*1(%rsp), %r9d     # ^W[(n+8) & 15]
-        xorl    %r11d, %r9d     # ^W[(n+2) & 15]
-        roll    %r9d            #
        addl    %edi, %ecx              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rcx,%r9), %ecx # e += RCONST + W[n & 15]
+        addl    -64+4*9(%rsp), %ecx     # e += RCONST + W[n & 15]
        movl    %edx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ecx              # e += rotl32(a,5)
@@ -947,12 +1149,8 @@ sha1_process_block64:
        andl    %ebp, %esi              # si: b & c
        andl    %eax, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        xorl    -32+4*7(%rsp), %r10d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*2(%rsp), %r10d    # ^W[(n+8) & 15]
-        xorl    %r12d, %r10d    # ^W[(n+2) & 15]
-        roll    %r10d           #
        addl    %edi, %ebx              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rbx,%r10), %ebx # e += RCONST + W[n & 15]
+        addl    -64+4*10(%rsp), %ebx    # e += RCONST + W[n & 15]
        movl    %ecx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebx              # e += rotl32(a,5)
@@ -964,307 +1162,297 @@ sha1_process_block64:
        andl    %edx, %esi              # si: b & c
        andl    %ebp, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-        xorl    %r8d, %r11d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*3(%rsp), %r11d    # ^W[(n+8) & 15]
-        xorl    %r13d, %r11d    # ^W[(n+2) & 15]
-        roll    %r11d           #
        addl    %edi, %eax              # += ((b | c) & d) | (b & c)
-        leal    -0x70E44324(%rax,%r11), %eax # e += RCONST + W[n & 15]
+        addl    -64+4*11(%rsp), %eax    # e += RCONST + W[n & 15]
        movl    %ebx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %eax              # e += rotl32(a,5)
        rorl    $2, %ecx                # b = rotl32(b,30)
+# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
+        movaps  %xmm0, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm1, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm1, %xmm5
+        shufps  $0x4e, %xmm2, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm3, %xmm1    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm1    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm1, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm1, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm1, %xmm1    #  shift left by 1
+        psubd   %xmm4, %xmm1    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm1    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm1    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm1, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*1(%rsp)
 # 60
-        xorl    %r9d, %r12d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*4(%rsp), %r12d    # ^W[(n+8) & 15]
-        xorl    %r14d, %r12d    # ^W[(n+2) & 15]
-        roll    %r12d           #
        movl    %ecx, %edi              # c
        xorl    %edx, %edi              # ^d
        xorl    %ebx, %edi              # ^b
-        leal    -0x359D3E2A(%rbp,%r12), %ebp # e += RCONST + W[n & 15]
+        addl    -64+4*12(%rsp), %ebp    # e += RCONST + W[n & 15]
        addl    %edi, %ebp              # e += (c ^ d ^ b)
        movl    %eax, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebp              # e += rotl32(a,5)
        rorl    $2, %ebx                # b = rotl32(b,30)
 # 61
-        xorl    %r10d, %r13d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*5(%rsp), %r13d    # ^W[(n+8) & 15]
-        xorl    %r15d, %r13d    # ^W[(n+2) & 15]
-        roll    %r13d           #
        movl    %ebx, %edi              # c
        xorl    %ecx, %edi              # ^d
        xorl    %eax, %edi              # ^b
-        leal    -0x359D3E2A(%rdx,%r13), %edx # e += RCONST + W[n & 15]
+        addl    -64+4*13(%rsp), %edx    # e += RCONST + W[n & 15]
        addl    %edi, %edx              # e += (c ^ d ^ b)
        movl    %ebp, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %edx              # e += rotl32(a,5)
        rorl    $2, %eax                # b = rotl32(b,30)
 # 62
-        xorl    %r11d, %r14d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*6(%rsp), %r14d    # ^W[(n+8) & 15]
-        xorl    -32+4*0(%rsp), %r14d    # ^W[(n+2) & 15]
-        roll    %r14d           #
        movl    %eax, %edi              # c
        xorl    %ebx, %edi              # ^d
        xorl    %ebp, %edi              # ^b
-        leal    -0x359D3E2A(%rcx,%r14), %ecx # e += RCONST + W[n & 15]
+        addl    -64+4*14(%rsp), %ecx    # e += RCONST + W[n & 15]
        addl    %edi, %ecx              # e += (c ^ d ^ b)
        movl    %edx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ecx              # e += rotl32(a,5)
        rorl    $2, %ebp                # b = rotl32(b,30)
 # 63
-        xorl    %r12d, %r15d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*7(%rsp), %r15d    # ^W[(n+8) & 15]
-        xorl    -32+4*1(%rsp), %r15d    # ^W[(n+2) & 15]
-        roll    %r15d           #
        movl    %ebp, %edi              # c
        xorl    %eax, %edi              # ^d
        xorl    %edx, %edi              # ^b
-        leal    -0x359D3E2A(%rbx,%r15), %ebx # e += RCONST + W[n & 15]
+        addl    -64+4*15(%rsp), %ebx    # e += RCONST + W[n & 15]
        addl    %edi, %ebx              # e += (c ^ d ^ b)
        movl    %ecx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebx              # e += rotl32(a,5)
        rorl    $2, %edx                # b = rotl32(b,30)
+# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
+        movaps  %xmm1, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm2, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm2, %xmm5
+        shufps  $0x4e, %xmm3, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm0, %xmm2    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm2    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm2, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm2, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm2, %xmm2    #  shift left by 1
+        psubd   %xmm4, %xmm2    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm2    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm2    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm2, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*2(%rsp)
 # 64
-        movl    %r13d, %esi     # W[(n+13) & 15]
-        xorl    %r8d, %esi              # ^W[(n+8) & 15]
-        xorl    -32+4*2(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*0(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*0(%rsp)             # store to W[n & 15]
        movl    %edx, %edi              # c
        xorl    %ebp, %edi              # ^d
        xorl    %ecx, %edi              # ^b
-        leal    -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15]
+        addl    -64+4*0(%rsp), %eax     # e += RCONST + W[n & 15]
        addl    %edi, %eax              # e += (c ^ d ^ b)
        movl    %ebx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %eax              # e += rotl32(a,5)
        rorl    $2, %ecx                # b = rotl32(b,30)
 # 65
-        movl    %r14d, %esi     # W[(n+13) & 15]
-        xorl    %r9d, %esi              # ^W[(n+8) & 15]
-        xorl    -32+4*3(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*1(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*1(%rsp)             # store to W[n & 15]
        movl    %ecx, %edi              # c
        xorl    %edx, %edi              # ^d
        xorl    %ebx, %edi              # ^b
-        leal    -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15]
+        addl    -64+4*1(%rsp), %ebp     # e += RCONST + W[n & 15]
        addl    %edi, %ebp              # e += (c ^ d ^ b)
        movl    %eax, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebp              # e += rotl32(a,5)
        rorl    $2, %ebx                # b = rotl32(b,30)
 # 66
-        movl    %r15d, %esi     # W[(n+13) & 15]
-        xorl    %r10d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*4(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*2(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*2(%rsp)             # store to W[n & 15]
        movl    %ebx, %edi              # c
        xorl    %ecx, %edi              # ^d
        xorl    %eax, %edi              # ^b
-        leal    -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15]
+        addl    -64+4*2(%rsp), %edx     # e += RCONST + W[n & 15]
        addl    %edi, %edx              # e += (c ^ d ^ b)
        movl    %ebp, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %edx              # e += rotl32(a,5)
        rorl    $2, %eax                # b = rotl32(b,30)
 # 67
-        movl    -32+4*0(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r11d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*5(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*3(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*3(%rsp)             # store to W[n & 15]
        movl    %eax, %edi              # c
        xorl    %ebx, %edi              # ^d
        xorl    %ebp, %edi              # ^b
-        leal    -0x359D3E2A(%rcx,%rsi), %ecx # e += RCONST + W[n & 15]
+        addl    -64+4*3(%rsp), %ecx     # e += RCONST + W[n & 15]
        addl    %edi, %ecx              # e += (c ^ d ^ b)
        movl    %edx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ecx              # e += rotl32(a,5)
        rorl    $2, %ebp                # b = rotl32(b,30)
+# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
+        movaps  %xmm2, %xmm4
+        psrldq  $4, %xmm4       # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  $0x4e, %xmm3, %xmm5     # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  %xmm3, %xmm5
+        shufps  $0x4e, %xmm0, %xmm5     # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   %xmm1, %xmm3    # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   %xmm4, %xmm5    # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   %xmm5, %xmm3    # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  %xmm3, %xmm5
+        xorps   %xmm4, %xmm4    # rol(W0,1):
+        pcmpgtd %xmm3, %xmm4    #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   %xmm3, %xmm3    #  shift left by 1
+        psubd   %xmm4, %xmm3    #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  $12, %xmm5      # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  %xmm5, %xmm4
+        pslld   $2, %xmm5
+        psrld   $30, %xmm4
+#       xorps   %xmm4, %xmm5    # rol((0,0,0,unrotW[0]),2)
+        xorps   %xmm4, %xmm3    # same result, but does not depend on/does not modify T2
+        xorps   %xmm5, %xmm3    # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+        movaps  %xmm3, %xmm5
+        paddd   %xmm6, %xmm5
+        movups  %xmm5, -64+16*3(%rsp)
 # 68
-        movl    -32+4*1(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r12d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*6(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*4(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*4(%rsp)             # store to W[n & 15]
        movl    %ebp, %edi              # c
        xorl    %eax, %edi              # ^d
        xorl    %edx, %edi              # ^b
-        leal    -0x359D3E2A(%rbx,%rsi), %ebx # e += RCONST + W[n & 15]
+        addl    -64+4*4(%rsp), %ebx     # e += RCONST + W[n & 15]
        addl    %edi, %ebx              # e += (c ^ d ^ b)
        movl    %ecx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebx              # e += rotl32(a,5)
        rorl    $2, %edx                # b = rotl32(b,30)
 # 69
-        movl    -32+4*2(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r13d, %esi             # ^W[(n+8) & 15]
-        xorl    -32+4*7(%rsp), %esi             # ^W[(n+2) & 15]
-        xorl    -32+4*5(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*5(%rsp)             # store to W[n & 15]
        movl    %edx, %edi              # c
        xorl    %ebp, %edi              # ^d
        xorl    %ecx, %edi              # ^b
-        leal    -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15]
+        addl    -64+4*5(%rsp), %eax     # e += RCONST + W[n & 15]
        addl    %edi, %eax              # e += (c ^ d ^ b)
        movl    %ebx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %eax              # e += rotl32(a,5)
        rorl    $2, %ecx                # b = rotl32(b,30)
 # 70
-        movl    -32+4*3(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r14d, %esi             # ^W[(n+8) & 15]
-        xorl    %r8d, %esi              # ^W[(n+2) & 15]
-        xorl    -32+4*6(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*6(%rsp)             # store to W[n & 15]
        movl    %ecx, %edi              # c
        xorl    %edx, %edi              # ^d
        xorl    %ebx, %edi              # ^b
-        leal    -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15]
+        addl    -64+4*6(%rsp), %ebp     # e += RCONST + W[n & 15]
        addl    %edi, %ebp              # e += (c ^ d ^ b)
        movl    %eax, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebp              # e += rotl32(a,5)
        rorl    $2, %ebx                # b = rotl32(b,30)
 # 71
-        movl    -32+4*4(%rsp), %esi     # W[(n+13) & 15]
-        xorl    %r15d, %esi             # ^W[(n+8) & 15]
-        xorl    %r9d, %esi              # ^W[(n+2) & 15]
-        xorl    -32+4*7(%rsp), %esi             # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, -32+4*7(%rsp)             # store to W[n & 15]
        movl    %ebx, %edi              # c
        xorl    %ecx, %edi              # ^d
        xorl    %eax, %edi              # ^b
-        leal    -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15]
+        addl    -64+4*7(%rsp), %edx     # e += RCONST + W[n & 15]
        addl    %edi, %edx              # e += (c ^ d ^ b)
        movl    %ebp, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %edx              # e += rotl32(a,5)
        rorl    $2, %eax                # b = rotl32(b,30)
 # 72
-        xorl    -32+4*5(%rsp), %r8d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*0(%rsp), %r8d     # ^W[(n+8) & 15]
-        xorl    %r10d, %r8d     # ^W[(n+2) & 15]
-        roll    %r8d            #
        movl    %eax, %edi              # c
        xorl    %ebx, %edi              # ^d
        xorl    %ebp, %edi              # ^b
-        leal    -0x359D3E2A(%rcx,%r8), %ecx # e += RCONST + W[n & 15]
+        addl    -64+4*8(%rsp), %ecx     # e += RCONST + W[n & 15]
        addl    %edi, %ecx              # e += (c ^ d ^ b)
        movl    %edx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ecx              # e += rotl32(a,5)
        rorl    $2, %ebp                # b = rotl32(b,30)
 # 73
-        xorl    -32+4*6(%rsp), %r9d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*1(%rsp), %r9d     # ^W[(n+8) & 15]
-        xorl    %r11d, %r9d     # ^W[(n+2) & 15]
-        roll    %r9d            #
        movl    %ebp, %edi              # c
        xorl    %eax, %edi              # ^d
        xorl    %edx, %edi              # ^b
-        leal    -0x359D3E2A(%rbx,%r9), %ebx # e += RCONST + W[n & 15]
+        addl    -64+4*9(%rsp), %ebx     # e += RCONST + W[n & 15]
        addl    %edi, %ebx              # e += (c ^ d ^ b)
        movl    %ecx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebx              # e += rotl32(a,5)
        rorl    $2, %edx                # b = rotl32(b,30)
 # 74
-        xorl    -32+4*7(%rsp), %r10d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*2(%rsp), %r10d    # ^W[(n+8) & 15]
-        xorl    %r12d, %r10d    # ^W[(n+2) & 15]
-        roll    %r10d           #
        movl    %edx, %edi              # c
        xorl    %ebp, %edi              # ^d
        xorl    %ecx, %edi              # ^b
-        leal    -0x359D3E2A(%rax,%r10), %eax # e += RCONST + W[n & 15]
+        addl    -64+4*10(%rsp), %eax    # e += RCONST + W[n & 15]
        addl    %edi, %eax              # e += (c ^ d ^ b)
        movl    %ebx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %eax              # e += rotl32(a,5)
        rorl    $2, %ecx                # b = rotl32(b,30)
 # 75
-        xorl    %r8d, %r11d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*3(%rsp), %r11d    # ^W[(n+8) & 15]
-        xorl    %r13d, %r11d    # ^W[(n+2) & 15]
-        roll    %r11d           #
        movl    %ecx, %edi              # c
        xorl    %edx, %edi              # ^d
        xorl    %ebx, %edi              # ^b
-        leal    -0x359D3E2A(%rbp,%r11), %ebp # e += RCONST + W[n & 15]
+        addl    -64+4*11(%rsp), %ebp    # e += RCONST + W[n & 15]
        addl    %edi, %ebp              # e += (c ^ d ^ b)
        movl    %eax, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebp              # e += rotl32(a,5)
        rorl    $2, %ebx                # b = rotl32(b,30)
 # 76
-        xorl    %r9d, %r12d     # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*4(%rsp), %r12d    # ^W[(n+8) & 15]
-        xorl    %r14d, %r12d    # ^W[(n+2) & 15]
-        roll    %r12d           #
        movl    %ebx, %edi              # c
        xorl    %ecx, %edi              # ^d
        xorl    %eax, %edi              # ^b
-        leal    -0x359D3E2A(%rdx,%r12), %edx # e += RCONST + W[n & 15]
+        addl    -64+4*12(%rsp), %edx    # e += RCONST + W[n & 15]
        addl    %edi, %edx              # e += (c ^ d ^ b)
        movl    %ebp, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %edx              # e += rotl32(a,5)
        rorl    $2, %eax                # b = rotl32(b,30)
 # 77
-        xorl    %r10d, %r13d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*5(%rsp), %r13d    # ^W[(n+8) & 15]
-        xorl    %r15d, %r13d    # ^W[(n+2) & 15]
-        roll    %r13d           #
        movl    %eax, %edi              # c
        xorl    %ebx, %edi              # ^d
        xorl    %ebp, %edi              # ^b
-        leal    -0x359D3E2A(%rcx,%r13), %ecx # e += RCONST + W[n & 15]
+        addl    -64+4*13(%rsp), %ecx    # e += RCONST + W[n & 15]
        addl    %edi, %ecx              # e += (c ^ d ^ b)
        movl    %edx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ecx              # e += rotl32(a,5)
        rorl    $2, %ebp                # b = rotl32(b,30)
 # 78
-        xorl    %r11d, %r14d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*6(%rsp), %r14d    # ^W[(n+8) & 15]
-        xorl    -32+4*0(%rsp), %r14d    # ^W[(n+2) & 15]
-        roll    %r14d           #
        movl    %ebp, %edi              # c
        xorl    %eax, %edi              # ^d
        xorl    %edx, %edi              # ^b
-        leal    -0x359D3E2A(%rbx,%r14), %ebx # e += RCONST + W[n & 15]
+        addl    -64+4*14(%rsp), %ebx    # e += RCONST + W[n & 15]
        addl    %edi, %ebx              # e += (c ^ d ^ b)
        movl    %ecx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
        addl    %esi, %ebx              # e += rotl32(a,5)
        rorl    $2, %edx                # b = rotl32(b,30)
 # 79
-        xorl    %r12d, %r15d    # W[n & 15] ^= W[(n+13) & 15]
-        xorl    -32+4*7(%rsp), %r15d    # ^W[(n+8) & 15]
-        xorl    -32+4*1(%rsp), %r15d    # ^W[(n+2) & 15]
-        roll    %r15d           #
        movl    %edx, %edi              # c
        xorl    %ebp, %edi              # ^d
        xorl    %ecx, %edi              # ^b
-        leal    -0x359D3E2A(%rax,%r15), %eax # e += RCONST + W[n & 15]
+        addl    -64+4*15(%rsp), %eax    # e += RCONST + W[n & 15]
        addl    %edi, %eax              # e += (c ^ d ^ b)
        movl    %ebx, %esi              #
        roll    $5, %esi                # rotl32(a,5)
@@ -1278,7 +1466,7 @@ sha1_process_block64:
        addl    %ebx, 84(%rdi)  # ctx->hash[1] += b
        popq    %r14            #
        addl    %ecx, 88(%rdi)  # ctx->hash[2] += c
-        popq    %r15            #
+#       popq    %r15            #
        addl    %edx, 92(%rdi)  # ctx->hash[3] += d
        popq    %rbx            #
        addl    %ebp, 96(%rdi)  # ctx->hash[4] += e
@@ -1286,4 +1474,13 @@ sha1_process_block64:
        ret
        .size   sha1_process_block64, .-sha1_process_block64
+        .section        .rodata.cst16.sha1const, "aM", @progbits, 16
+        .balign 16
+sha1const:
+        .long   0x5A827999
+        .long   0x6ED9EBA1
+        .long   0x8F1BBCDC
+        .long   0xCA62C1D6
 #endif
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 901896e6e..a10ac411d 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -6,33 +6,104 @@
 # also contains the diff of the generated file.
 exec >hash_md5_sha_x86-64.S
-# There is a way to use XMM registers (which always exist for x86-64!) for W[]
+# Based on http://arctic.org/~dean/crypto/sha1.html.
-# For example, if we load W as follows:
+# ("This SHA1 implementation is public domain.")
-#       %xmm0:  w[0x0] w[0x1] w[0x2] w[0x3]
+#
-#       %xmm4:  w[0x4] w[0x5] w[0x6] w[0x7]
+# x86-64 has at least SSE2 vector insns always available.
-#       %xmm8:  w[0x8] w[0x9] w[0xa] w[0xb]
+# We can use them without any CPUID checks (and without a need
-#       %xmm12: w[0xc] w[0xd] w[0xe] w[0xf]
+# for a fallback code if needed insns are not available).
-# then the xor'ing operation to generate next W[0..3] is:
+# This code uses them to calculate W[] ahead of time.
-#       movaps  %xmm0, %xmmT2
+#
-#       palignr $0x8, %xmm4, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5])
+# Unfortunately, results are passed from vector unit to
-#       # Right-shifts xmm4:xmmT2 by 8 bytes. Writes shifted result to xmmT2. SSSE3 insn.
+# integer ALUs on the stack. MOVD/Q insns to move them directly
-#       movaps  %xmm0, %xmmT13
+# from vector to integer registers are slower than store-to-load
-#       palignr $0x4,%xmm0,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0])
+# forwarding in LSU (on Skylake at least).
-#       xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13
+#
-#       xmm0 = rol32(xmm0,1)    # no such insn, have to use pslld+psrld+or
+# The win against a purely integer code is small on Skylake,
-# and then results can be extracted for use:
+# only about 7-8%. We offload about 1/3 of our operations to the vector unit.
-#       movd    %xmm0, %esi     # new W[0]
+# It can do 4 ops at once in one 128-bit register,
-#       pextrd  $1, %xmm0, %esi # new W[1]
+# but we have to use x2 of them because of W[0] complication,
-#       # SSE4.1 insn. Can use EXTRACTPS (also SSE4.1)
+# SSE2 has no "rotate each word by N bits" insns,
-#       pextrd  $2, %xmm0, %esi # new W[2]
+# moving data to/from vector unit is clunky, and Skylake
-#       pextrd  $3, %xmm0, %esi # new W[3]
+# has four integer ALUs unified with three vector ALUs,
-# ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64.
+# which makes pure integer code rather fast, and makes
+# vector ops compete with integer ones.
+#
+# Zen3, with its separate vector ALUs, wins more, about 12%.
+xmmT1="%xmm4"           # vector temp register (T1 in the generated PREP code)
+xmmT2="%xmm5"           # vector temp register (T2 in the generated PREP code)
+xmmRCONST="%xmm6"       # holds the current round constant
+xmmALLRCONST="%xmm7"    # holds all round constants, loaded from sha1const
+T=`printf '\t'`  # a literal tab; INTERLEAVE uses it to match lines that start with a tab
+# SSE instructions are longer than 4 bytes on average.
+# Intel CPUs (up to Tiger Lake at least) can't decode
+# more than 16 bytes of code in one cycle.
+# By interleaving SSE code and integer code
+# we mostly achieve a situation where 16-byte decode fetch window
+# contains 4 (or more) insns.
+#
+# However, on Skylake there was no observed difference,
+# but on Zen3, non-interleaved code is ~3% faster
+# (822 Mb/s versus 795 Mb/s hashing speed).
+# Off for now:
+interleave=false
+INTERLEAVE() {  # $1, $2: multi-line code blocks to print; merged line by line only if $interleave is true
+        $interleave || \
+        {
+                # Interleaving is off: print block $1 in full, then block $2
+                # (non-interleaved code should work correctly too)
+                echo "$1"
+                echo "$2"
+                return
+        }
+        (  # subshell: the fd 3/4 redirections and the IFS change stay local to it
+        echo "$1" | grep -v '^$' >"$0.temp1"  # drop empty lines, so an empty read below means EOF
+        echo "$2" | grep -v '^$' >"$0.temp2"  # scratch files are named after this script ($0)
+        exec 3<"$0.temp1"  # fd 3 reads block $1
+        exec 4<"$0.temp2"  # fd 4 reads block $2
+        IFS=''  # keep leading whitespace (the tab) in lines returned by "read"
+        while :; do  # each pass emits one line of block $1, then one line of block $2
+                line1=''
+                line2=''
+                while :; do  # pass comment lines of block $1 straight through
+                        read -r line1 <&3
+                        if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then  # neither "#..." nor "<tab>#..."
+                                break
+                        fi
+                        echo "$line1"
+                done
+                while :; do  # same for block $2, with a special case for LEA
+                        read -r line2 <&4
+                        if test "${line2:0:4}" = "${T}lea"; then
+                                # We use 7-8 byte long forms of LEA.
+                                # Do not interleave them with SSE insns, which are also long:
+                                # print the LEA line and the line after it back to back.
+                                echo "$line2"
+                                read -r line2 <&4
+                                echo "$line2"
+                                continue
+                        fi
+                        if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then  # neither "#..." nor "<tab>#..."
+                                break
+                        fi
+                        echo "$line2"
+                done
+                test "$line1$line2" || break  # both empty: both inputs are exhausted
+                echo "$line1"  # if only one input is exhausted, its line is printed as a blank line
+                echo "$line2"
+        done
+        rm "$0.temp1" "$0.temp2"  # remove the scratch files
+        )
+}
 echo \
-'### Generated by hash_md5_sha_x86-64.S.sh ###
+"### Generated by hash_md5_sha_x86-64.S.sh ###
 #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
-        .section        .text.sha1_process_block64,"ax",@progbits
+        .section        .text.sha1_process_block64, \"ax\", @progbits
        .globl  sha1_process_block64
        .hidden sha1_process_block64
        .type   sha1_process_block64, @function
@@ -41,7 +112,7 @@ echo \
 sha1_process_block64:
        pushq   %rbp    # 1 byte insn
        pushq   %rbx    # 1 byte insn
-        pushq   %r15    # 2 byte insn
+#       pushq   %r15    # 2 byte insn
        pushq   %r14    # 2 byte insn
        pushq   %r13    # 2 byte insn
        pushq   %r12    # 2 byte insn
@@ -50,17 +121,13 @@ sha1_process_block64:
 #Register and stack use:
 # eax..edx: a..d
 # ebp: e
-# esi,edi: temps
+# esi,edi,r8..r14: temps
-# -32+4*n(%rsp),r8...r15: W[0..7,8..15]
+# r15: unused
-# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?)
+# xmm0..xmm3: W[]
-        movl    $3, %eax
+# xmm4,xmm5: temps
-1:
+# xmm6: current round constant
-        movq    (%rdi,%rax,8), %rsi
+# xmm7: all round constants
-        bswapq  %rsi
+# -64(%rsp): area for passing RCONST + W[] from vector to integer units
-        rolq    $32, %rsi
-        movq    %rsi, -32(%rsp,%rax,8)
-        decl    %eax
-        jns     1b
        movl    80(%rdi), %eax          # a = ctx->hash[0]
        movl    84(%rdi), %ebx          # b = ctx->hash[1]
@@ -68,32 +135,123 @@ sha1_process_block64:
        movl    92(%rdi), %edx          # d = ctx->hash[3]
        movl    96(%rdi), %ebp          # e = ctx->hash[4]
-        movq    4*8(%rdi), %r8
+        movaps  sha1const(%rip), $xmmALLRCONST
-        movq    4*10(%rdi), %r10
+        pshufd  \$0x00, $xmmALLRCONST, $xmmRCONST
+        # Load W[] to xmm registers, byteswapping on the fly.
+        #
+        # For iterations 0..15, we pass W[] in rsi,r8..r14
+        # for use in RD1As instead of spilling them to stack.
+        # We lose parallelized addition of RCONST, but LEA
+        # can do two additions at once, so it is probably a wash.
+        # (We use rsi instead of rN because this makes two
+        # LEAs in two first RD1As shorter by one byte).
+        movq    4*0(%rdi), %rsi
+        movq    4*2(%rdi), %r8
+        bswapq  %rsi
        bswapq  %r8
+        rolq    \$32, %rsi              # rsi = W[1]:W[0]
+        rolq    \$32, %r8               # r8  = W[3]:W[2]
+        movq    %rsi, %xmm0
+        movq    %r8, $xmmT1
+        punpcklqdq $xmmT1, %xmm0        # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
+#       movaps  %xmm0, $xmmT1           # add RCONST, spill to stack
+#       paddd   $xmmRCONST, $xmmT1
+#       movups  $xmmT1, -64+16*0(%rsp)
+        movq    4*4(%rdi), %r9
+        movq    4*6(%rdi), %r10
+        bswapq  %r9
        bswapq  %r10
-        movq    4*12(%rdi), %r12
+        rolq    \$32, %r9               # r9  = W[5]:W[4]
-        movq    4*14(%rdi), %r14
+        rolq    \$32, %r10              # r10 = W[7]:W[6]
+        movq    %r9, %xmm1
+        movq    %r10, $xmmT1
+        punpcklqdq $xmmT1, %xmm1        # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
+        movq    4*8(%rdi), %r11
+        movq    4*10(%rdi), %r12
+        bswapq  %r11
        bswapq  %r12
+        rolq    \$32, %r11              # r11  = W[9]:W[8]
+        rolq    \$32, %r12              # r12  = W[11]:W[10]
+        movq    %r11, %xmm2
+        movq    %r12, $xmmT1
+        punpcklqdq $xmmT1, %xmm2        # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
+        movq    4*12(%rdi), %r13
+        movq    4*14(%rdi), %r14
+        bswapq  %r13
        bswapq  %r14
-        movl    %r8d, %r9d
+        rolq    \$32, %r13              # r13  = W[13]:W[12]
-        shrq    $32, %r8
+        rolq    \$32, %r14              # r14  = W[15]:W[14]
-        movl    %r10d, %r11d
+        movq    %r13, %xmm3
-        shrq    $32, %r10
+        movq    %r14, $xmmT1
-        movl    %r12d, %r13d
+        punpcklqdq $xmmT1, %xmm3        # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
-        shrq    $32, %r12
+"
-        movl    %r14d, %r15d
-        shrq    $32, %r14
+PREP() {
-'
+local xmmW0=$1
-W32() {
+local xmmW4=$2
-test "$1" || exit 1
+local xmmW8=$3
-test "$1" -lt 0 && exit 1
+local xmmW12=$4
-test "$1" -gt 15 && exit 1
+# the above must be %xmm0..3 in some permutation
-test "$1" -lt 8 && echo "-32+4*$1(%rsp)"
+local dstmem=$5
-test "$1" -ge 8 && echo "%r${1}d"
+#W[0] = rol(W[13] ^ W[8]  ^ W[2] ^ W[0], 1);
+#W[1] = rol(W[14] ^ W[9]  ^ W[3] ^ W[1], 1);
+#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1);
+#W[3] = rol(  0   ^ W[11] ^ W[5] ^ W[3], 1);
+#W[3] ^= rol(W[0], 1);
+echo "# PREP $@
+        movaps  $xmmW12, $xmmT1
+        psrldq  \$4, $xmmT1     # rshift by 4 bytes: T1 = ([13],[14],[15],0)
+#       pshufd  \$0x4e, $xmmW0, $xmmT2  # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
+#       punpcklqdq $xmmW4, $xmmT2       # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
+# same result as above, but shorter and faster:
+# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
+# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
+        movaps  $xmmW0, $xmmT2
+        shufps  \$0x4e, $xmmW4, $xmmT2  # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
+        xorps   $xmmW8, $xmmW0  # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
+        xorps   $xmmT1, $xmmT2  # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
+        xorps   $xmmT2, $xmmW0  # ^
+        # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
+        movaps  $xmmW0, $xmmT2
+        xorps   $xmmT1, $xmmT1  # rol(W0,1):
+        pcmpgtd $xmmW0, $xmmT1  #  ffffffff for elements <0 (ones with msb bit 1)
+        paddd   $xmmW0, $xmmW0  #  shift left by 1
+        psubd   $xmmT1, $xmmW0  #  add 1 to those who had msb bit 1
+        # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
+        pslldq  \$12, $xmmT2    # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
+        movaps  $xmmT2, $xmmT1
+        pslld   \$2, $xmmT2
+        psrld   \$30, $xmmT1
+#       xorps   $xmmT1, $xmmT2  # rol((0,0,0,unrotW[0]),2)
+        xorps   $xmmT1, $xmmW0  # same result, but does not depend on/does not modify T2
+        xorps   $xmmT2, $xmmW0  # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
+"
+#       movq    $xmmW0, %r8     # high latency (~6 cycles)
+#       movaps  $xmmW0, $xmmT1
+#       psrldq  \$8, $xmmT1     # rshift by 8 bytes: move upper 64 bits to lower
+#       movq    $xmmT1, %r10    # high latency
+#       movq    %r8, %r9
+#       movq    %r10, %r11
+#       shrq    \$32, %r9
+#       shrq    \$32, %r11
+# ^^^ slower than passing the results on stack (!!!)
+echo "
+        movaps  $xmmW0, $xmmT2
+        paddd   $xmmRCONST, $xmmT2
+        movups  $xmmT2, $dstmem
+"
 }
-# It's possible to interleave insns in rounds to mostly eliminate
+# It's possible to interleave integer insns in rounds to mostly eliminate
 # dependency chains, but this is likely to only help old Pentium-based
 # CPUs (ones without OOO, which can only simultaneously execute a pair
 # of _adjacent_ insns).
@@ -104,28 +262,28 @@ RD1A() {
 local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
 local n=$(($6))
 local n0=$(((n+0) & 15))
+local rN=$((7+n0/2))
 echo "
 # $n
 ";test $n0 = 0 && echo "
-        # W[0], already in %esi
+        leal    $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
-";test $n0 != 0 && test $n0 -lt 8 && echo "
+        shrq    \$32, %rsi
-        movl    `W32 $n0`, %esi         # W[n]
+";test $n0 = 1 && echo "
-";test $n0 -ge 8 && echo "
+        leal    $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
-        # W[n], in %r$n0
+";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "
+        leal    $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
+        shrq    \$32, %r$rN
+";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "
+        leal    $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
 ";echo "
        movl    %e$c, %edi              # c
        xorl    %e$d, %edi              # ^d
        andl    %e$b, %edi              # &b
        xorl    %e$d, %edi              # (((c ^ d) & b) ^ d)
-";test $n0 -lt 8 && echo "
-        leal    $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
-";test $n0 -ge 8 && echo "
-        leal    $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n]
-";echo "
        addl    %edi, %e$e              # e += (((c ^ d) & b) ^ d)
-        movl    %e$a, %esi              #
+        movl    %e$a, %edi              #
-        roll    \$5, %esi               # rotl32(a,5)
+        roll    \$5, %edi               # rotl32(a,5)
-        addl    %esi, %e$e              # e += rotl32(a,5)
+        addl    %edi, %e$e              # e += rotl32(a,5)
        rorl    \$2, %e$b               # b = rotl32(b,30)
 "
 }
@@ -138,28 +296,11 @@ local n2=$(((n+2) & 15))
 local n0=$(((n+0) & 15))
 echo "
 # $n
-";test $n0 -lt 8 && echo "
-        movl    `W32 $n13`, %esi        # W[(n+13) & 15]
-        xorl    `W32 $n8`, %esi         # ^W[(n+8) & 15]
-        xorl    `W32 $n2`, %esi         # ^W[(n+2) & 15]
-        xorl    `W32 $n0`, %esi         # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, `W32 $n0`         # store to W[n & 15]
-";test $n0 -ge 8 && echo "
-        xorl    `W32 $n13`, `W32 $n0`   # W[n & 15] ^= W[(n+13) & 15]
-        xorl    `W32 $n8`, `W32 $n0`    # ^W[(n+8) & 15]
-        xorl    `W32 $n2`, `W32 $n0`    # ^W[(n+2) & 15]
-        roll    `W32 $n0`               #
-";echo "
        movl    %e$c, %edi              # c
        xorl    %e$d, %edi              # ^d
        andl    %e$b, %edi              # &b
        xorl    %e$d, %edi              # (((c ^ d) & b) ^ d)
-";test $n0 -lt 8 && echo "
+        addl    -64+4*$n0(%rsp), %e$e   # e += RCONST + W[n & 15]
-        leal    $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
-";test $n0 -ge 8 && echo "
-        leal    $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
-";echo "
        addl    %edi, %e$e              # e += (((c ^ d) & b) ^ d)
        movl    %e$a, %esi              #
        roll    \$5, %esi               # rotl32(a,5)
@@ -167,13 +308,6 @@ echo "
        rorl    \$2, %e$b               # b = rotl32(b,30)
 "
 }
-{
-RCONST=0x5A827999
-RD1A ax bx cx dx bp  0; RD1A bp ax bx cx dx  1; RD1A dx bp ax bx cx  2; RD1A cx dx bp ax bx  3; RD1A bx cx dx bp ax  4
-RD1A ax bx cx dx bp  5; RD1A bp ax bx cx dx  6; RD1A dx bp ax bx cx  7; RD1A cx dx bp ax bx  8; RD1A bx cx dx bp ax  9
-RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14
-RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19
-} | grep -v '^$'
 RD2() {
 local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
@@ -184,27 +318,10 @@ local n2=$(((n+2) & 15))
 local n0=$(((n+0) & 15))
 echo "
 # $n
-";test $n0 -lt 8 && echo "
-        movl    `W32 $n13`, %esi        # W[(n+13) & 15]
-        xorl    `W32 $n8`, %esi         # ^W[(n+8) & 15]
-        xorl    `W32 $n2`, %esi         # ^W[(n+2) & 15]
-        xorl    `W32 $n0`, %esi         # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, `W32 $n0`         # store to W[n & 15]
-";test $n0 -ge 8 && echo "
-        xorl    `W32 $n13`, `W32 $n0`   # W[n & 15] ^= W[(n+13) & 15]
-        xorl    `W32 $n8`, `W32 $n0`    # ^W[(n+8) & 15]
-        xorl    `W32 $n2`, `W32 $n0`    # ^W[(n+2) & 15]
-        roll    `W32 $n0`               #
-";echo "
        movl    %e$c, %edi              # c
        xorl    %e$d, %edi              # ^d
        xorl    %e$b, %edi              # ^b
-";test $n0 -lt 8 && echo "
+        addl    -64+4*$n0(%rsp), %e$e   # e += RCONST + W[n & 15]
-        leal    $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
-";test $n0 -ge 8 && echo "
-        leal    $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
-";echo "
        addl    %edi, %e$e              # e += (c ^ d ^ b)
        movl    %e$a, %esi              #
        roll    \$5, %esi               # rotl32(a,5)
@@ -212,13 +329,6 @@ echo "
        rorl    \$2, %e$b               # b = rotl32(b,30)
 "
 }
-{
-RCONST=0x6ED9EBA1
-RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24
-RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29
-RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34
-RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39
-} | grep -v '^$'
 RD3() {
 local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
@@ -235,53 +345,82 @@ echo "
        andl    %e$c, %esi              # si: b & c
        andl    %e$d, %edi              # di: (b | c) & d
        orl     %esi, %edi              # ((b | c) & d) | (b & c)
-";test $n0 -lt 8 && echo "
-        movl    `W32 $n13`, %esi        # W[(n+13) & 15]
-        xorl    `W32 $n8`, %esi         # ^W[(n+8) & 15]
-        xorl    `W32 $n2`, %esi         # ^W[(n+2) & 15]
-        xorl    `W32 $n0`, %esi         # ^W[n & 15]
-        roll    %esi                    #
-        movl    %esi, `W32 $n0`         # store to W[n & 15]
-";test $n0 -ge 8 && echo "
-        xorl    `W32 $n13`, `W32 $n0`   # W[n & 15] ^= W[(n+13) & 15]
-        xorl    `W32 $n8`, `W32 $n0`    # ^W[(n+8) & 15]
-        xorl    `W32 $n2`, `W32 $n0`    # ^W[(n+2) & 15]
-        roll    `W32 $n0`               #
-";echo "
        addl    %edi, %e$e              # += ((b | c) & d) | (b & c)
-";test $n0 -lt 8 && echo "
+        addl    -64+4*$n0(%rsp), %e$e   # e += RCONST + W[n & 15]
-        leal    $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
-";test $n0 -ge 8 && echo "
-        leal    $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
-";echo "
        movl    %e$a, %esi              #
        roll    \$5, %esi               # rotl32(a,5)
        addl    %esi, %e$e              # e += rotl32(a,5)
        rorl    \$2, %e$b               # b = rotl32(b,30)
 "
 }
 {
-#RCONST=0x8F1BBCDC "out of range for signed 32bit displacement"
+# Round 1
-RCONST=-0x70E44324
+RCONST=0x5A827999
-RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44
+RD1A ax bx cx dx bp  0; RD1A bp ax bx cx dx  1; RD1A dx bp ax bx cx  2; RD1A cx dx bp ax bx  3;
-RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49
+RD1A bx cx dx bp ax  4; RD1A ax bx cx dx bp  5; RD1A bp ax bx cx dx  6; RD1A dx bp ax bx cx  7;
-RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54
+a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
-RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59
+b=`RD1A cx dx bp ax bx  8; RD1A bx cx dx bp ax  9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;`
-} | grep -v '^$'
+INTERLEAVE "$a" "$b"
+a=`echo "       pshufd  \\$0x55, $xmmALLRCONST, $xmmRCONST"
+   PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
+b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;`
+INTERLEAVE "$a" "$b"
+a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
+b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;`
+INTERLEAVE "$a" "$b"
+# Round 2
+RCONST=0x6ED9EBA1
+a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
+b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;`
+INTERLEAVE "$a" "$b"
+a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
+b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;`
+INTERLEAVE "$a" "$b"
+a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
+b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;`
+INTERLEAVE "$a" "$b"
+a=`echo "       pshufd  \\$0xaa, $xmmALLRCONST, $xmmRCONST"
+   PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
+b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;`
+INTERLEAVE "$a" "$b"
+a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
+b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;`
+INTERLEAVE "$a" "$b"
+# Round 3
+RCONST=0x8F1BBCDC
+a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
+b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;`
+INTERLEAVE "$a" "$b"
+a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
+b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;`
+INTERLEAVE "$a" "$b"
+a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
+b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;`
+INTERLEAVE "$a" "$b"
+a=`echo "       pshufd  \\$0xff, $xmmALLRCONST, $xmmRCONST"
+   PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
+b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;`
+INTERLEAVE "$a" "$b"
+a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
+b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;`
+INTERLEAVE "$a" "$b"
 # Round 4 has the same logic as round 2, only n and RCONST are different
-{
+RCONST=0xCA62C1D6
-#RCONST=0xCA62C1D6 "out of range for signed 32bit displacement"
+a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
-RCONST=-0x359D3E2A
+b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;`
-RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64
+INTERLEAVE "$a" "$b"
-RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69
+a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
-RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74
+b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;`
-RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79
+INTERLEAVE "$a" "$b"
-# Note: new W[n&15] values generated in last 3 iterations
+a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
-# (W[13,14,15]) are unused after each of these iterations.
+b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;`
-# Since we use r8..r15 for W[8..15], this does not matter.
+INTERLEAVE "$a" "$b"
-# If we switch to e.g. using r8..r15 for W[0..7], then saving of W[13,14,15]
+RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75;
-# (the "movl %esi, `W32 $n0`" insn) is a dead store and can be removed.
+RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79;
 } | grep -v '^$'
 echo "
@@ -292,7 +431,7 @@ echo "
        addl    %ebx, 84(%rdi)  # ctx->hash[1] += b
        popq    %r14            #
        addl    %ecx, 88(%rdi)  # ctx->hash[2] += c
-        popq    %r15            #
+#       popq    %r15            #
        addl    %edx, 92(%rdi)  # ctx->hash[3] += d
        popq    %rbx            #
        addl    %ebp, 96(%rdi)  # ctx->hash[4] += e
@@ -300,4 +439,13 @@ echo "
        ret
        .size   sha1_process_block64, .-sha1_process_block64
+        .section        .rodata.cst16.sha1const, \"aM\", @progbits, 16
+        .balign 16
+sha1const:
+        .long   0x5A827999
+        .long   0x6ED9EBA1
+        .long   0x8F1BBCDC
+        .long   0xCA62C1D6
 #endif"
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
index 33cc3bf7f..b32029360 100644
--- a/libbb/hash_md5_sha_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -20,7 +20,7 @@
 #define extr128_32 pextrd
 //#define extr128_32 extractps  # not shorter
-        .section        .text.sha1_process_block64_shaNI,"ax",@progbits
+        .section        .text.sha1_process_block64_shaNI, "ax", @progbits
        .globl  sha1_process_block64_shaNI
        .hidden sha1_process_block64_shaNI
        .type   sha1_process_block64_shaNI, @function
@@ -32,41 +32,42 @@
 #define MSG1            %xmm4
 #define MSG2            %xmm5
 #define MSG3            %xmm6
-#define SHUF_MASK       %xmm7
        .balign 8       # allow decoders to fetch at least 2 first insns
 sha1_process_block64_shaNI:
        /* load initial hash values */
-        xor128          E0, E0
        movu128         80(%rdi), ABCD
+        xor128          E0, E0
        pinsrd          $3, 80+4*4(%rdi), E0    # load to uppermost 32-bit word
        shuf128_32      $0x1B, ABCD, ABCD       # DCBA -> ABCD
-        mova128         PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
+        mova128         PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7
+        movu128         0*16(%rdi), MSG0
+        pshufb          %xmm7, MSG0
+        movu128         1*16(%rdi), MSG1
+        pshufb          %xmm7, MSG1
+        movu128         2*16(%rdi), MSG2
+        pshufb          %xmm7, MSG2
+        movu128         3*16(%rdi), MSG3
+        pshufb          %xmm7, MSG3
        /* Save hash values for addition after rounds */
-        mova128         E0, %xmm9
+        mova128         E0, %xmm7
        mova128         ABCD, %xmm8
        /* Rounds 0-3 */
-        movu128         0*16(%rdi), MSG0
-        pshufb          SHUF_MASK, MSG0
                paddd           MSG0, E0
                mova128         ABCD, E1
                sha1rnds4       $0, E0, ABCD
        /* Rounds 4-7 */
-        movu128         1*16(%rdi), MSG1
-        pshufb          SHUF_MASK, MSG1
                sha1nexte       MSG1, E1
                mova128         ABCD, E0
                sha1rnds4       $0, E1, ABCD
        sha1msg1        MSG1, MSG0
        /* Rounds 8-11 */
-        movu128         2*16(%rdi), MSG2
-        pshufb          SHUF_MASK, MSG2
                sha1nexte       MSG2, E0
                mova128         ABCD, E1
                sha1rnds4       $0, E0, ABCD
@@ -74,8 +75,6 @@ sha1_process_block64_shaNI:
        xor128          MSG2, MSG0
        /* Rounds 12-15 */
-        movu128         3*16(%rdi), MSG3
-        pshufb          SHUF_MASK, MSG3
                sha1nexte       MSG3, E1
                mova128         ABCD, E0
        sha1msg2        MSG3, MSG0
@@ -206,7 +205,7 @@ sha1_process_block64_shaNI:
                sha1rnds4       $3, E1, ABCD
        /* Add current hash values with previously saved */
-        sha1nexte       %xmm9, E0
+        sha1nexte       %xmm7, E0
        paddd           %xmm8, ABCD
        /* Write hash values back in the correct order */
@@ -217,8 +216,8 @@ sha1_process_block64_shaNI:
        ret
        .size   sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
-.section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
+        .section        .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
+        .balign 16
 PSHUFFLE_BYTE_FLIP_MASK:
        .octa 0x000102030405060708090a0b0c0d0e0f
diff --git a/libbb/lineedit.c b/libbb/lineedit.c
index 8abc87976..778511d16 100644
--- a/libbb/lineedit.c
+++ b/libbb/lineedit.c
@@ -2274,17 +2274,41 @@ static int lineedit_read_key(char *read_key_buffer, int timeout)
 #endif
        fflush_all();
-        while (1) {
+        for (;;) {
                /* Wait for input. TIMEOUT = -1 makes read_key wait even
                 * on nonblocking stdin, TIMEOUT = 50 makes sure we won't
                 * insist on full MB_CUR_MAX buffer to declare input like
                 * "\xff\n",pause,"ls\n" invalid and thus won't lose "ls".
                 *
+                 * If LI_INTERRUPTIBLE, return -1 if got EINTR in poll()
+                 * inside read_key, or if bb_got_signal != 0 (IOW: if signal
+                 * arrived before poll() is reached).
+                 *
                 * Note: read_key sets errno to 0 on success.
                 */
-                IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;)
+                for (;;) {
-                ic = read_key(STDIN_FILENO, read_key_buffer, timeout);
+                        if ((state->flags & LI_INTERRUPTIBLE) && bb_got_signal) {
-                IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;)
+                                errno = EINTR;
+                                return -1;
+                        }
+//FIXME: still races here with signals, but small window to poll() inside read_key
+                        IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;)
+                        /* errno = 0; - read_key does this itself */
+                        ic = read_key(STDIN_FILENO, read_key_buffer, timeout);
+                        IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;)
+                        if (errno != EINTR)
+                                break;
+                        if (state->flags & LI_INTERRUPTIBLE) {
+                                /* LI_INTERRUPTIBLE bails out on EINTR,
+                                 * but nothing really guarantees that bb_got_signal
+                                 * is nonzero. Follow the least surprise principle:
+                                 */
+                                if (bb_got_signal == 0)
+                                        bb_got_signal = 255;
+                                goto ret;
+                        }
+                }
                if (errno) {
 #if ENABLE_UNICODE_SUPPORT
                        if (errno == EAGAIN && unicode_idx != 0)
@@ -2352,7 +2376,7 @@ static int lineedit_read_key(char *read_key_buffer, int timeout)
 #endif
                break;
        }
+ ret:
        return ic;
 }
diff --git a/libbb/read_key.c b/libbb/read_key.c
index 03b7da656..cf8ed411e 100644
--- a/libbb/read_key.c
+++ b/libbb/read_key.c
@@ -126,7 +126,10 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout)
                 * if fd can be in non-blocking mode.
                 */
                if (timeout >= -1) {
-                        if (safe_poll(&pfd, 1, timeout) == 0) {
+                        n = poll(&pfd, 1, timeout);
+                        if (n < 0 && errno == EINTR)
+                                return n;
+                        if (n == 0) {
                                /* Timed out */
                                errno = EAGAIN;
                                return -1;
@@ -138,7 +141,7 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout)
                 * When we were reading 3 bytes here, we were eating
                 * "li" too, and cat was getting wrong input.
                 */
-                n = safe_read(fd, buffer, 1);
+                n = read(fd, buffer, 1);
                if (n <= 0)
                        return -1;
        }
@@ -284,6 +287,16 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout)
        goto start_over;
 }
+int64_t FAST_FUNC safe_read_key(int fd, char *buffer, int timeout)
+{
+        int64_t r;
+        do {
+                /* errno = 0; - read_key does this itself */
+                r = read_key(fd, buffer, timeout);
+        } while (errno == EINTR);
+        return r;
+}
 void FAST_FUNC read_key_ungets(char *buffer, const char *str, unsigned len)
 {
        unsigned cur_len = (unsigned char)buffer[0];
diff --git a/libbb/setup_environment.c b/libbb/setup_environment.c
index df2983958..3549e2099 100644
--- a/libbb/setup_environment.c
+++ b/libbb/setup_environment.c
@@ -36,9 +36,8 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass
        /* Change the current working directory to be the home directory
         * of the user */
-        if (!(flags & SETUP_ENV_NO_CHDIR)) {
+        if (flags & SETUP_ENV_CHDIR) {
-                if (chdir(pw->pw_dir) != 0) {
+                if (chdir_or_warn(pw->pw_dir) != 0) {
-                        bb_error_msg("can't change directory to '%s'", pw->pw_dir);
                        xchdir((flags & SETUP_ENV_TO_TMP) ? "/tmp" : "/");
                }
        }
@@ -59,7 +58,8 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass
                //xsetenv("LOGNAME", pw->pw_name);
                //xsetenv("HOME",    pw->pw_dir);
                //xsetenv("SHELL",   shell);
-        } else if (flags & SETUP_ENV_CHANGEENV) {
+        } else
+        if (flags & (SETUP_ENV_CHANGEENV|SETUP_ENV_CHANGEENV_LOGNAME)) {
                /* Set HOME, SHELL, and if not becoming a super-user
                 * or if SETUP_ENV_CHANGEENV_LOGNAME, USER and LOGNAME.  */
                if ((flags & SETUP_ENV_CHANGEENV_LOGNAME) || pw->pw_uid != 0) {
diff --git a/libbb/xfuncs_printf.c b/libbb/xfuncs_printf.c
index aae3b092d..a9add8ab2 100644
--- a/libbb/xfuncs_printf.c
+++ b/libbb/xfuncs_printf.c
@@ -417,11 +417,18 @@ void FAST_FUNC xseteuid(uid_t euid)
        if (seteuid(euid)) bb_simple_perror_msg_and_die("seteuid");
 }
+int FAST_FUNC chdir_or_warn(const char *path)
+{
+        int r = chdir(path);
+        if (r != 0)
+                bb_perror_msg("can't change directory to '%s'", path);
+        return r;
+}
 // Die if we can't chdir to a new path.
 void FAST_FUNC xchdir(const char *path)
 {
-        if (chdir(path))
+        if (chdir_or_warn(path) != 0)
-                bb_perror_msg_and_die("can't change directory to '%s'", path);
+                xfunc_die();
 }
 void FAST_FUNC xfchdir(int fd)
diff --git a/loginutils/login.c b/loginutils/login.c
index cac4349b2..332238181 100644
--- a/loginutils/login.c
+++ b/loginutils/login.c
@@ -564,7 +564,9 @@ int login_main(int argc UNUSED_PARAM, char **argv)
        change_identity(pw);
        setup_environment(pw->pw_shell,
-                        (!(opt & LOGIN_OPT_p) * SETUP_ENV_CLEARENV) + SETUP_ENV_CHANGEENV,
+                        (!(opt & LOGIN_OPT_p) * SETUP_ENV_CLEARENV)
+                                + SETUP_ENV_CHANGEENV
+                                + SETUP_ENV_CHDIR,
                        pw);
 #if ENABLE_PAM
diff --git a/loginutils/su.c b/loginutils/su.c
index 647c97fb1..b61e3753a 100644
--- a/loginutils/su.c
+++ b/loginutils/su.c
@@ -177,10 +177,9 @@ int su_main(int argc UNUSED_PARAM, char **argv)
        change_identity(pw);
        setup_environment(opt_shell,
-                        ((flags & SU_OPT_l) / SU_OPT_l * SETUP_ENV_CLEARENV)
+                ((flags & SU_OPT_l) ? (SETUP_ENV_CLEARENV + SETUP_ENV_CHDIR) : 0)
-                        + (!(flags & SU_OPT_mp) * SETUP_ENV_CHANGEENV)
+                        + (!(flags & SU_OPT_mp) * SETUP_ENV_CHANGEENV),
-                        + (!(flags & SU_OPT_l) * SETUP_ENV_NO_CHDIR),
+                pw);
-                        pw);
        IF_SELINUX(set_current_security_context(NULL);)
        if (opt_command) {
diff --git a/loginutils/sulogin.c b/loginutils/sulogin.c
index c9817960c..681022acb 100644
--- a/loginutils/sulogin.c
+++ b/loginutils/sulogin.c
@@ -94,10 +94,13 @@ int sulogin_main(int argc UNUSED_PARAM, char **argv)
                shell = pwd->pw_shell;
        /* util-linux 2.36.1 compat: cd to root's HOME, set a few envvars */
-        setup_environment(shell, SETUP_ENV_CHANGEENV | SETUP_ENV_CHANGEENV_LOGNAME, pwd);
+        setup_environment(shell, 0
+                + SETUP_ENV_CHANGEENV_LOGNAME
+                + SETUP_ENV_CHDIR
+                , pwd);
        // no SETUP_ENV_CLEARENV
-        // SETUP_ENV_CHANGEENV[+LOGNAME] - set HOME, SHELL, USER,and LOGNAME
+        // SETUP_ENV_CHANGEENV_LOGNAME - set HOME, SHELL, USER, and LOGNAME
-        // no SETUP_ENV_NO_CHDIR - IOW: cd to $HOME
+        // SETUP_ENV_CHDIR - cd to $HOME
        /* util-linux 2.36.1 compat: steal ctty if we don't have it yet
         * (yes, util-linux uses force=1)  */
diff --git a/miscutils/bc.c b/miscutils/bc.c
index e3f7573c9..fe555d018 100644
--- a/miscutils/bc.c
+++ b/miscutils/bc.c
@@ -6011,7 +6011,7 @@ static BC_STATUS zxc_program_assign(char inst)
 #endif
        if (ib || sc || left->t == XC_RESULT_OBASE) {
-                static const char *const msg[] = {
+                static const char *const msg[] ALIGN_PTR = {
                        "bad ibase; must be [2,16]",                 //XC_RESULT_IBASE
                        "bad obase; must be [2,"BC_MAX_OBASE_STR"]", //XC_RESULT_OBASE
                        "bad scale; must be [0,"BC_MAX_SCALE_STR"]", //XC_RESULT_SCALE
diff --git a/miscutils/crond.c b/miscutils/crond.c
index b74427351..1965af656 100644
--- a/miscutils/crond.c
+++ b/miscutils/crond.c
@@ -675,8 +675,7 @@ static void change_user(struct passwd *pas)
 {
        /* careful: we're after vfork! */
        change_identity(pas); /* - initgroups, setgid, setuid */
-        if (chdir(pas->pw_dir) < 0) {
+        if (chdir_or_warn(pas->pw_dir) != 0) {
-                bb_error_msg("can't change directory to '%s'", pas->pw_dir);
                xchdir(CRON_DIR);
        }
 }
diff --git a/miscutils/crontab.c b/miscutils/crontab.c
index 411a18a50..1111f4d54 100644
--- a/miscutils/crontab.c
+++ b/miscutils/crontab.c
@@ -55,8 +55,8 @@ static void edit_file(const struct passwd *pas, const char *file)
        /* initgroups, setgid, setuid */
        change_identity(pas);
        setup_environment(pas->pw_shell,
-                        SETUP_ENV_CHANGEENV | SETUP_ENV_TO_TMP,
+                SETUP_ENV_CHANGEENV | SETUP_ENV_TO_TMP | SETUP_ENV_CHDIR,
-                        pas);
+                pas);
        ptr = getenv("VISUAL");
        if (!ptr) {
                ptr = getenv("EDITOR");
diff --git a/miscutils/devfsd.c b/miscutils/devfsd.c
index 839d00fd0..fb9ebcf60 100644
--- a/miscutils/devfsd.c
+++ b/miscutils/devfsd.c
@@ -928,7 +928,7 @@ static void action_compat(const struct devfsd_notify_struct *info, unsigned int
        unsigned int i;
        char rewind_;
        /* 1 to 5  "scsi/" , 6 to 9 "ide/host" */
-        static const char *const fmt[] = {
+        static const char *const fmt[] ALIGN_PTR = {
                NULL ,
                "sg/c%db%dt%du%d",              /* scsi/generic */
                "sd/c%db%dt%du%d",              /* scsi/disc */
@@ -1468,7 +1468,7 @@ const char *get_old_name(const char *devname, unsigned int namelen,
        const char *pty1;
        const char *pty2;
        /* 1 to 5  "scsi/" , 6 to 9 "ide/host", 10 sbp/, 11 vcc/, 12 pty/ */
-        static const char *const fmt[] = {
+        static const char *const fmt[] ALIGN_PTR = {
                NULL ,
                "sg%u",                 /* scsi/generic */
                NULL,                   /* scsi/disc */
diff --git a/miscutils/hexedit.c b/miscutils/hexedit.c
index f8ff9b62b..15ad78377 100644
--- a/miscutils/hexedit.c
+++ b/miscutils/hexedit.c
@@ -292,7 +292,7 @@ int hexedit_main(int argc UNUSED_PARAM, char **argv)
                fflush_all();
                G.in_read_key = 1;
                if (!bb_got_signal)
-                        key = read_key(STDIN_FILENO, G.read_key_buffer, -1);
+                        key = safe_read_key(STDIN_FILENO, G.read_key_buffer, -1);
                G.in_read_key = 0;
                if (bb_got_signal)
                        key = CTRL('X');
diff --git a/miscutils/i2c_tools.c b/miscutils/i2c_tools.c
index e3741eeba..da26f5e19 100644
--- a/miscutils/i2c_tools.c
+++ b/miscutils/i2c_tools.c
@@ -120,6 +120,7 @@ static int32_t i2c_smbus_access(int fd, char read_write, uint8_t cmd,
        return ioctl(fd, I2C_SMBUS, &args);
 }
+#if ENABLE_I2CGET || ENABLE_I2CSET || ENABLE_I2CDUMP || ENABLE_I2CDETECT
 static int32_t i2c_smbus_read_byte(int fd)
 {
        union i2c_smbus_data data;
@@ -131,6 +132,7 @@ static int32_t i2c_smbus_read_byte(int fd)
        return data.byte;
 }
+#endif
 #if ENABLE_I2CGET || ENABLE_I2CSET || ENABLE_I2CDUMP
 static int32_t i2c_smbus_write_byte(int fd, uint8_t val)
diff --git a/miscutils/less.c b/miscutils/less.c
index 6da991a0e..842031ca3 100644
--- a/miscutils/less.c
+++ b/miscutils/less.c
@@ -1177,9 +1177,9 @@ static int64_t getch_nowait(void)
 #endif
        }
-        /* We have kbd_fd in O_NONBLOCK mode, read inside read_key()
+        /* We have kbd_fd in O_NONBLOCK mode, read inside safe_read_key()
         * would not block even if there is no input available */
-        key64 = read_key(kbd_fd, kbd_input, /*timeout off:*/ -2);
+        key64 = safe_read_key(kbd_fd, kbd_input, /*timeout off:*/ -2);
        if ((int)key64 == -1) {
                if (errno == EAGAIN) {
                        /* No keyboard input available. Since poll() did return,
diff --git a/miscutils/man.c b/miscutils/man.c
index be3b2a000..c3efe4484 100644
--- a/miscutils/man.c
+++ b/miscutils/man.c
@@ -328,7 +328,7 @@ int man_main(int argc UNUSED_PARAM, char **argv)
        }
 #else
        if (!man_path_list) {
-                static const char *const mpl[] = { "/usr/man", "/usr/share/man", NULL };
+                static const char *const mpl[] ALIGN_PTR = { "/usr/man", "/usr/share/man", NULL };
                man_path_list = (char**)mpl;
                /*count_mp = 2; - not used below anyway */
        }
diff --git a/modutils/modutils-24.c b/modutils/modutils-24.c
index ac8632481..d0bc2a6ef 100644
--- a/modutils/modutils-24.c
+++ b/modutils/modutils-24.c
@@ -3458,7 +3458,7 @@ static int obj_load_progbits(char *image, size_t image_size, struct obj_file *f,
 static void hide_special_symbols(struct obj_file *f)
 {
-        static const char *const specials[] = {
+        static const char *const specials[] ALIGN_PTR = {
                SPFX "cleanup_module",
                SPFX "init_module",
                SPFX "kernel_version",
@@ -3484,7 +3484,7 @@ static int obj_gpl_license(struct obj_file *f, const char **license)
         * linux/include/linux/module.h.  Checking for leading "GPL" will not
         * work, somebody will use "GPL sucks, this is proprietary".
         */
-        static const char *const gpl_licenses[] = {
+        static const char *const gpl_licenses[] ALIGN_PTR = {
                "GPL",
                "GPL v2",
                "GPL and additional rights",
diff --git a/networking/httpd.c b/networking/httpd.c
index 5f7b3a4dd..59b4a769c 100644
--- a/networking/httpd.c
+++ b/networking/httpd.c
@@ -1707,8 +1707,7 @@ static void send_cgi_and_exit(
                script = last_slash;
                if (script != url) { /* paranoia */
                        *script = '\0';
-                        if (chdir(url + 1) != 0) {
+                        if (chdir_or_warn(url + 1) != 0) {
-                                bb_perror_msg("can't change directory to '%s'", url + 1);
                                goto error_execing_cgi;
                        }
                        // not needed: *script = '/';
diff --git a/networking/ifupdown.c b/networking/ifupdown.c
index 737113dd4..6c4ae27f2 100644
--- a/networking/ifupdown.c
+++ b/networking/ifupdown.c
@@ -532,7 +532,7 @@ static int FAST_FUNC v4tunnel_down(struct interface_defn_t * ifd, execfn * exec)
 }
 # endif
-static const struct method_t methods6[] = {
+static const struct method_t methods6[] ALIGN_PTR = {
 # if ENABLE_FEATURE_IFUPDOWN_IP
        { "v4tunnel" , v4tunnel_up     , v4tunnel_down   , },
 # endif
@@ -627,7 +627,7 @@ struct dhcp_client_t {
        const char *stopcmd;
 };
-static const struct dhcp_client_t ext_dhcp_clients[] = {
+static const struct dhcp_client_t ext_dhcp_clients[] ALIGN_PTR = {
        { "dhcpcd",
                "dhcpcd[[ -h %hostname%]][[ -i %vendor%]][[ -I %client%]][[ -l %leasetime%]] %iface%",
                "dhcpcd -k %iface%",
@@ -774,7 +774,7 @@ static int FAST_FUNC wvdial_down(struct interface_defn_t *ifd, execfn *exec)
                        "-p /var/run/wvdial.%iface% -s 2", ifd, exec);
 }
-static const struct method_t methods[] = {
+static const struct method_t methods[] ALIGN_PTR = {
        { "manual"  , manual_up_down, manual_up_down, },
        { "wvdial"  , wvdial_up     , wvdial_down   , },
        { "ppp"     , ppp_up        , ppp_down      , },
@@ -797,7 +797,7 @@ static int FAST_FUNC link_up_down(struct interface_defn_t *ifd UNUSED_PARAM, exe
        return 1;
 }
-static const struct method_t link_methods[] = {
+static const struct method_t link_methods[] ALIGN_PTR = {
        { "none", link_up_down, link_up_down }
 };
diff --git a/networking/inetd.c b/networking/inetd.c
index e71be51c3..fb2fbe323 100644
--- a/networking/inetd.c
+++ b/networking/inetd.c
@@ -1538,7 +1538,7 @@ int inetd_main(int argc UNUSED_PARAM, char **argv)
 #if ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_ECHO \
 || ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_DISCARD
 # if !BB_MMU
-static const char *const cat_args[] = { "cat", NULL };
+static const char *const cat_args[] ALIGN_PTR = { "cat", NULL };
 # endif
 #endif
diff --git a/networking/interface.c b/networking/interface.c
index ea6a2c8a8..6b6c0944a 100644
--- a/networking/interface.c
+++ b/networking/interface.c
@@ -446,13 +446,13 @@ static char *get_name(char name[IFNAMSIZ], char *p)
 * %n specifiers (even the size of integers may not match).
 */
 #if INT_MAX == LONG_MAX
-static const char *const ss_fmt[] = {
+static const char *const ss_fmt[] ALIGN_PTR = {
        "%n%llu%u%u%u%u%n%n%n%llu%u%u%u%u%u",
        "%llu%llu%u%u%u%u%n%n%llu%llu%u%u%u%u%u",
        "%llu%llu%u%u%u%u%u%u%llu%llu%u%u%u%u%u%u"
 };
 #else
-static const char *const ss_fmt[] = {
+static const char *const ss_fmt[] ALIGN_PTR = {
        "%n%llu%lu%lu%lu%lu%n%n%n%llu%lu%lu%lu%lu%lu",
        "%llu%llu%lu%lu%lu%lu%n%n%llu%llu%lu%lu%lu%lu%lu",
        "%llu%llu%lu%lu%lu%lu%lu%lu%llu%llu%lu%lu%lu%lu%lu%lu"
@@ -731,7 +731,7 @@ static const struct hwtype ib_hwtype = {
 #endif
-static const struct hwtype *const hwtypes[] = {
+static const struct hwtype *const hwtypes[] ALIGN_PTR = {
        &loop_hwtype,
        &ether_hwtype,
        &ppp_hwtype,
diff --git a/networking/libiproute/ipaddress.c b/networking/libiproute/ipaddress.c
index 17a838411..ecc3848ff 100644
--- a/networking/libiproute/ipaddress.c
+++ b/networking/libiproute/ipaddress.c
@@ -58,7 +58,7 @@ typedef struct filter_t filter_t;
 static void print_link_flags(unsigned flags, unsigned mdown)
 {
-        static const int flag_masks[] = {
+        static const int flag_masks[] ALIGN_INT = {
                IFF_LOOPBACK, IFF_BROADCAST, IFF_POINTOPOINT,
                IFF_MULTICAST, IFF_NOARP, IFF_UP, IFF_LOWER_UP };
        static const char flag_labels[] ALIGN1 =
diff --git a/networking/udhcp/common.c b/networking/udhcp/common.c
index 8e9b93655..ae818db05 100644
--- a/networking/udhcp/common.c
+++ b/networking/udhcp/common.c
@@ -19,7 +19,7 @@ const uint8_t MAC_BCAST_ADDR[6] ALIGN2 = {
 * See RFC2132 for more options.
 * OPTION_REQ: these options are requested by udhcpc (unless -o).
 */
-const struct dhcp_optflag dhcp_optflags[] = {
+const struct dhcp_optflag dhcp_optflags[] ALIGN2 = {
        /* flags                                    code */
        { OPTION_IP                   | OPTION_REQ, 0x01 }, /* DHCP_SUBNET        */
        { OPTION_S32                              , 0x02 }, /* DHCP_TIME_OFFSET   */
diff --git a/networking/udhcp/d6_dhcpc.c b/networking/udhcp/d6_dhcpc.c
index 9d2a8f5d3..9fc690315 100644
--- a/networking/udhcp/d6_dhcpc.c
+++ b/networking/udhcp/d6_dhcpc.c
@@ -65,7 +65,7 @@
 /* "struct client_data_t client_data" is in bb_common_bufsiz1 */
-static const struct dhcp_optflag d6_optflags[] = {
+static const struct dhcp_optflag d6_optflags[] ALIGN2 = {
 #if ENABLE_FEATURE_UDHCPC6_RFC3646
        { OPTION_6RD | OPTION_LIST        | OPTION_REQ, D6_OPT_DNS_SERVERS },
        { OPTION_DNS_STRING | OPTION_LIST | OPTION_REQ, D6_OPT_DOMAIN_LIST },
diff --git a/procps/nmeter.c b/procps/nmeter.c
index 2310e9844..088d366bf 100644
--- a/procps/nmeter.c
+++ b/procps/nmeter.c
@@ -70,7 +70,7 @@ typedef struct proc_file {
        smallint last_gen;
 } proc_file;
-static const char *const proc_name[] = {
+static const char *const proc_name[] ALIGN_PTR = {
        "stat",         // Must match the order of proc_file's!
        "loadavg",
        "net/dev",
diff --git a/procps/top.c b/procps/top.c
index 4cd545c69..804d6f258 100644
--- a/procps/top.c
+++ b/procps/top.c
@@ -913,7 +913,7 @@ static unsigned handle_input(unsigned scan_mask, duration_t interval)
        while (1) {
                int32_t c;
-                c = read_key(STDIN_FILENO, G.kbd_input, interval * 1000);
+                c = safe_read_key(STDIN_FILENO, G.kbd_input, interval * 1000);
                if (c == -1 && errno != EAGAIN) {
                        /* error/EOF */
                        option_mask32 |= OPT_EOF;
diff --git a/selinux/setenforce.c b/selinux/setenforce.c
index 996034f8e..2267be451 100644
--- a/selinux/setenforce.c
+++ b/selinux/setenforce.c
@@ -26,7 +26,7 @@
 /* These strings are arranged so that odd ones
 * result in security_setenforce(1) being done,
 * the rest will do security_setenforce(0) */
-static const char *const setenforce_cmd[] = {
+static const char *const setenforce_cmd[] ALIGN_PTR = {
        "0",
        "1",
        "permissive",
diff --git a/shell/ash.c b/shell/ash.c
index a1d01447a..46c4f1675 100644
--- a/shell/ash.c
+++ b/shell/ash.c
@@ -428,7 +428,7 @@ static void forkshell_print(FILE *fp0, struct forkshell *fs, const char **notes)
 /* ============ Shell options */
 /* If you add/change options hare, update --help text too */
-static const char *const optletters_optnames[] = {
+static const char *const optletters_optnames[] ALIGN_PTR = {
        "e"   "errexit",
        "f"   "noglob",
 /* bash has '-o ignoreeof', but no short synonym -I for it */
@@ -845,7 +845,7 @@ raise_exception(int e)
 /*
 * Called when a SIGINT is received.  (If the user specifies
 * that SIGINT is to be trapped or ignored using the trap builtin, then
- * this routine is not called.)  Suppressint is nonzero when interrupts
+ * this routine is not called.)  suppress_int is nonzero when interrupts
 * are held using the INT_OFF macro.  (The test for iflag is just
 * defensive programming.)
 */
@@ -882,13 +882,12 @@ raise_interrupt(void)
 } while (0)
 #endif
-static IF_ASH_OPTIMIZE_FOR_SIZE(inline) void
+static IF_NOT_ASH_OPTIMIZE_FOR_SIZE(inline) void
 int_on(void)
 {
        barrier();
-        if (--suppress_int == 0 && pending_int) {
+        if (--suppress_int == 0 && pending_int)
                raise_interrupt();
-        }
 }
 #if DEBUG_INTONOFF
 # define INT_ON do { \
@@ -898,7 +897,7 @@ int_on(void)
 #else
 # define INT_ON int_on()
 #endif
-static IF_ASH_OPTIMIZE_FOR_SIZE(inline) void
+static IF_NOT_ASH_OPTIMIZE_FOR_SIZE(inline) void
 force_int_on(void)
 {
        barrier();
@@ -4143,7 +4142,9 @@ signal_handler(int signo)
                if (!trap[SIGCHLD])
                        return;
        }
+#if ENABLE_FEATURE_EDITING
+        bb_got_signal = signo; /* for read_line_input: "we got a signal" */
+#endif
        gotsig[signo - 1] = 1;
        pending_sig = signo;
@@ -11656,33 +11657,56 @@ preadfd(void)
 # endif
                reinit_unicode_for_ash();
 again:
+                /* For shell, LI_INTERRUPTIBLE is set:
+                 * read_line_input will abort on either
+                 * getting EINTR in poll(), or if it sees bb_got_signal != 0
+                 * (IOW: if signal arrives before poll() is reached).
+                 * Interactive testcases:
+                 * (while kill -INT $$; do sleep 1; done) &
+                 * #^^^ prints ^C, prints prompt, repeats
+                 * trap 'echo I' int; (while kill -INT $$; do sleep 1; done) &
+                 * #^^^ prints ^C, prints "I", prints prompt, repeats
+                 * trap 'echo T' term; (while kill $$; do sleep 1; done) &
+                 * #^^^ prints "T", prints prompt, repeats
+                 * #(bash 5.0.17 exits after first "T", looks like a bug)
+                 */
+                bb_got_signal = 0;
+                INT_OFF; /* no longjmp'ing out of read_line_input please */
                nr = read_line_input(line_input_state, cmdedit_prompt, buf, IBUFSIZ);
+                if (bb_got_signal == SIGINT)
+                        write(STDOUT_FILENO, "^C\n", 3);
+                INT_ON; /* here non-blocked SIGINT will longjmp */
                if (nr == 0) {
                        /* ^C pressed, "convert" to SIGINT */
 # if !ENABLE_PLATFORM_MINGW32
-                        write(STDOUT_FILENO, "^C", 2);
+                        write(STDOUT_FILENO, "^C\n", 3);
-                        raise(SIGINT);
+                        raise(SIGINT); /* here non-blocked SIGINT will longjmp */
                        /* raise(SIGINT) did not work! (e.g. if SIGINT
-                         * is SIG_INGed on startup, it stays SIG_IGNed)
+                         * is SIG_IGNed on startup, it stays SIG_IGNed)
                         */
 # else
                        raise_interrupt();
 # endif
                        if (trap[SIGINT]) {
+ empty_line_input:
                                buf[0] = '\n';
                                buf[1] = '\0';
                                return 1;
                        }
                        exitstatus = 128 + SIGINT;
                        /* bash behavior on ^C + ignored SIGINT: */
-                        write(STDOUT_FILENO, "\n", 1);
                        goto again;
                }
                if (nr < 0) {
                        if (errno == 0) {
-                                /* Ctrl+D pressed */
+                                /* ^D pressed */
                                nr = 0;
                        }
+                        else if (errno == EINTR) { /* got signal? */
+                                if (bb_got_signal != SIGINT)
+                                        write(STDOUT_FILENO, "\n", 1);
+                                goto empty_line_input;
+                        }
 # if ENABLE_ASH_IDLE_TIMEOUT
                        else if (errno == EAGAIN && timeout > 0) {
                                puts("\007timed out waiting for input: auto-logout");
diff --git a/shell/hush.c b/shell/hush.c
index 982fc356a..ae81f0da5 100644
--- a/shell/hush.c
+++ b/shell/hush.c
@@ -564,7 +564,7 @@ enum {
 #define NULL_O_STRING { NULL }
 #ifndef debug_printf_parse
-static const char *const assignment_flag[] = {
+static const char *const assignment_flag[] ALIGN_PTR = {
        "MAYBE_ASSIGNMENT",
        "DEFINITELY_ASSIGNMENT",
        "NOT_ASSIGNMENT",
@@ -918,6 +918,7 @@ struct globals {
 #if ENABLE_HUSH_INTERACTIVE
        smallint promptmode; /* 0: PS1, 1: PS2 */
 #endif
+        /* set by signal handler if SIGINT is received _and_ its trap is not set */
        smallint flag_SIGINT;
 #if ENABLE_HUSH_LOOPS
        smallint flag_break_continue;
@@ -1944,6 +1945,9 @@ enum {
 static void record_pending_signo(int sig)
 {
        sigaddset(&G.pending_set, sig);
+#if ENABLE_FEATURE_EDITING
+        bb_got_signal = sig; /* for read_line_input: "we got a signal" */
+#endif
 #if ENABLE_HUSH_FAST
        if (sig == SIGCHLD) {
                G.count_SIGCHLD++;
@@ -2652,30 +2656,53 @@ static int get_user_input(struct in_str *i)
        for (;;) {
                reinit_unicode_for_hush();
                G.flag_SIGINT = 0;
-                /* buglet: SIGINT will not make new prompt to appear _at once_,
-                 * only after <Enter>. (^C works immediately) */
+                bb_got_signal = 0;
-                r = read_line_input(G.line_input_state, prompt_str,
+                if (!sigisemptyset(&G.pending_set)) {
+                        /* Whoops, already got a signal, do not call read_line_input */
+                        bb_got_signal = r = -1;
+                } else {
+                        /* For shell, LI_INTERRUPTIBLE is set:
+                         * read_line_input will abort on either
+                         * getting EINTR in poll(), or if it sees bb_got_signal != 0
+                         * (IOW: if signal arrives before poll() is reached).
+                         * Interactive testcases:
+                         * (while kill -INT $$; do sleep 1; done) &
+                         * #^^^ prints ^C, prints prompt, repeats
+                         * trap 'echo I' int; (while kill -INT $$; do sleep 1; done) &
+                         * #^^^ prints ^C, prints "I", prints prompt, repeats
+                         * trap 'echo T' term; (while kill $$; do sleep 1; done) &
+                         * #^^^ prints "T", prints prompt, repeats
+                         * #(bash 5.0.17 exits after first "T", looks like a bug)
+                         */
+                        r = read_line_input(G.line_input_state, prompt_str,
                                G.user_input_buf, CONFIG_FEATURE_EDITING_MAX_LEN-1
-                );
+                        );
-                /* read_line_input intercepts ^C, "convert" it to SIGINT */
+                        /* read_line_input intercepts ^C, "convert" it to SIGINT */
-                if (r == 0) {
+                        if (r == 0)
-                        raise(SIGINT);
+                                raise(SIGINT);
+                }
+                /* bash prints ^C (before running a trap, if any)
+                 * both on keyboard ^C and on real SIGINT (non-kbd generated).
+                 */
+                if (sigismember(&G.pending_set, SIGINT)) {
+                        write(STDOUT_FILENO, "^C\n", 3);
+                        G.last_exitcode = 128 | SIGINT;
                }
                check_and_run_traps();
-                if (r != 0 && !G.flag_SIGINT)
+                if (r == 0) /* keyboard ^C? */
+                        continue; /* go back, read another input line */
+                if (r > 0) /* normal input? (no ^C, no ^D, no signals) */
                        break;
-                /* ^C or SIGINT: repeat */
+                if (!bb_got_signal) {
-                /* bash prints ^C even on real SIGINT (non-kbd generated) */
+                        /* r < 0: ^D/EOF/error detected (but not signal) */
-                write(STDOUT_FILENO, "^C\n", 3);
+                        /* ^D on interactive input goes to next line before exiting: */
-                G.last_exitcode = 128 | SIGINT;
+                        write(STDOUT_FILENO, "\n", 1);
-        }
+                        i->p = NULL;
-        if (r < 0) {
+                        i->peek_buf[0] = r = EOF;
-                /* EOF/error detected */
+                        return r;
-                /* ^D on interactive input goes to next line before exiting: */
+                }
-                write(STDOUT_FILENO, "\n", 1);
+                /* it was a signal: go back, read another input line */
-                i->p = NULL;
-                i->peek_buf[0] = r = EOF;
-                return r;
        }
        i->p = G.user_input_buf;
        return (unsigned char)*i->p++;
@@ -3655,7 +3682,7 @@ static void free_pipe_list(struct pipe *pi)
 #ifndef debug_print_tree
 static void debug_print_tree(struct pipe *pi, int lvl)
 {
-        static const char *const PIPE[] = {
+        static const char *const PIPE[] ALIGN_PTR = {
                [PIPE_SEQ] = "SEQ",
                [PIPE_AND] = "AND",
                [PIPE_OR ] = "OR" ,
@@ -3690,7 +3717,7 @@ static void debug_print_tree(struct pipe *pi, int lvl)
                [RES_XXXX ] = "XXXX" ,
                [RES_SNTX ] = "SNTX" ,
        };
-        static const char *const CMDTYPE[] = {
+        static const char *const CMDTYPE[] ALIGN_PTR = {
                "{}",
                "()",
                "[noglob]",
@@ -7632,7 +7659,7 @@ static int generate_stream_from_string(const char *s, pid_t *pid_p)
                if (is_prefixed_with(s, "trap")
                 && skip_whitespace(s + 4)[0] == '\0'
                ) {
-                        static const char *const argv[] = { NULL, NULL };
+                        static const char *const argv[] ALIGN_PTR = { NULL, NULL };
                        builtin_trap((char**)argv);
                        fflush_all(); /* important */
                        _exit(0);
@@ -9799,7 +9826,7 @@ static int run_list(struct pipe *pi)
                                static const char encoded_dollar_at[] ALIGN1 = {
                                        SPECIAL_VAR_SYMBOL, '@' | 0x80, SPECIAL_VAR_SYMBOL, '\0'
                                }; /* encoded representation of "$@" */
-                                static const char *const encoded_dollar_at_argv[] = {
+                                static const char *const encoded_dollar_at_argv[] ALIGN_PTR = {
                                        encoded_dollar_at, NULL
                                }; /* argv list with one element: "$@" */
                                char **vals;
@@ -10361,7 +10388,7 @@ int hush_main(int argc, char **argv)
 //it ignores TERM:
 //      bash -i -c 'kill $$; echo ALIVE'
 //      ALIVE
-//it resets SIG_INGed HUP to SIG_DFL:
+//it resets SIG_IGNed HUP to SIG_DFL:
 //      trap '' hup; bash -i -c 'kill -hup $$; echo ALIVE'
 //      Hangup   [the message is not printed by bash, it's the shell which started it]
 //is talkative about jobs and exiting:
diff --git a/shell/shell_common.c b/shell/shell_common.c
index fff356c04..399d5e684 100644
--- a/shell/shell_common.c
+++ b/shell/shell_common.c
@@ -218,6 +218,7 @@ shell_builtin_read(struct builtin_read_params *params)
                 */
                errno = 0;
                pfd[0].events = POLLIN;
+//TODO race with a signal arriving just before the poll!
                if (poll(pfd, 1, timeout) <= 0) {
                        /* timed out, or EINTR */
                        err = errno;
diff --git a/testsuite/sed.tests b/testsuite/sed.tests
index e62b839f7..626542e33 100755
--- a/testsuite/sed.tests
+++ b/testsuite/sed.tests
@@ -324,6 +324,21 @@ testing "sed zero chars match/replace logic must not falsely trigger here 2" \
        "sed 's/ *$/_/g'" \
        "qwerty_\n" "" "qwerty\n"
+# the pattern here is interpreted as "9+", not as "9\+"
+testing "sed special char as s/// delimiter, in pattern" \
+        "sed 's+9\++X+'" \
+        "X8=17\n" "" "9+8=17\n"
+# Matching GNU sed 4.8:
+# in replacement string, "\&" remains "\&", not interpreted as "&"
+testing "sed special char as s/// delimiter, in replacement 1" \
+        "sed 's&9&X\&&'" \
+        "X&+8=17\n" "" "9+8=17\n"
+# in replacement string, "\1" is interpreted as "1"
+testing "sed special char as s/// delimiter, in replacement 2" \
+        "sed 's1\(9\)1X\11'" \
+        "X1+8=17\n" "" "9+8=17\n"
 testing "sed /\$_in_regex/ should not match newlines, only end-of-line" \
        "sed ': testcont; /\\\\$/{ =; N; b testcont }'" \
        "\
diff --git a/util-linux/hexdump.c b/util-linux/hexdump.c
index 57e7e8db7..307a84803 100644
--- a/util-linux/hexdump.c
+++ b/util-linux/hexdump.c
@@ -71,7 +71,7 @@ static void bb_dump_addfile(dumper_t *dumper, char *name)
        fclose(fp);
 }
-static const char *const add_strings[] = {
+static const char *const add_strings[] ALIGN_PTR = {
        "\"%07.7_ax \"16/1 \"%03o \"\"\n\"",   /* b */
        "\"%07.7_ax \"16/1 \"%3_c \"\"\n\"",   /* c */
        "\"%07.7_ax \"8/2 \"  %05u \"\"\n\"",  /* d */
diff --git a/util-linux/mkfs_vfat.c b/util-linux/mkfs_vfat.c
index 844d965f8..821371953 100644
--- a/util-linux/mkfs_vfat.c
+++ b/util-linux/mkfs_vfat.c
@@ -218,8 +218,11 @@ static const char boot_code[] ALIGN1 =
 int mkfs_vfat_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
 int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv)
 {
+        static const char NO_NAME_11[] = "NO NAME    ";
        struct stat st;
-        const char *volume_label = "";
+        const char *arg_volume_label = NO_NAME_11; //default
+        char volume_label11[12];
        char *buf;
        char *device_name;
        uoff_t volume_size_bytes;
@@ -257,14 +260,17 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv)
        opts = getopt32(argv, "^"
                "Ab:cCf:F:h:Ii:l:m:n:r:R:s:S:v"
                "\0" "-1", //:b+:f+:F+:h+:r+:R+:s+:S+:vv:c--l:l--c
-                NULL, NULL, NULL, NULL, NULL,
+                /*b*/NULL, /*f*/NULL, /*F*/NULL, /*h*/NULL, /*i*/NULL,
-                NULL, NULL, &volume_label, NULL, NULL, NULL, NULL);
+                /*l*/NULL, /*m*/NULL, /*n*/&arg_volume_label,
+                /*r*/NULL, /*R*/NULL, /*s*/NULL, /*S*/NULL);
        argv += optind;
        // cache device name
        device_name = argv[0];
        // default volume ID = creation time
        volume_id = time(NULL);
+        // truncate to exactly 11 chars, pad with spaces
+        sprintf(volume_label11, "%-11.11s", arg_volume_label);
        dev = xopen(device_name, O_RDWR);
        xfstat(dev, &st, device_name);
@@ -459,7 +465,7 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv)
                        (int)media_byte,
                        volume_size_sect, (int)total_clust, (int)sect_per_clust,
                        sect_per_fat,
-                        (int)volume_id, volume_label
+                        (int)volume_id, volume_label11
                );
        }
@@ -508,7 +514,7 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv)
                STORE_LE(boot_blk->vi.ext_boot_sign, 0x29);
                STORE_LE(boot_blk->vi.volume_id32, volume_id);
                memcpy(boot_blk->vi.fs_type, "FAT32   ", sizeof(boot_blk->vi.fs_type));
-                strncpy(boot_blk->vi.volume_label, volume_label, sizeof(boot_blk->vi.volume_label));
+                memcpy(boot_blk->vi.volume_label, volume_label11, 11);
                memcpy(boot_blk->boot_code, boot_code, sizeof(boot_code));
                STORE_LE(boot_blk->boot_sign, BOOT_SIGN);
@@ -545,15 +551,18 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv)
        // root directory
        // empty directory is just a set of zero bytes
        memset(buf, 0, sect_per_clust * bytes_per_sect);
-        if (volume_label[0]) {
+        // not "NO NAME", "NO NAME  " etc?
-                // create dir entry for volume_label
+        // (mkfs.fat 4.1 won't create dir entry even with explicit -n 'NO NAME',
+        // but will create one with e.g. -n '', -n '  zZz')
+        if (strcmp(volume_label11, NO_NAME_11) != 0) {
+                // create dir entry for volume label
                struct msdos_dir_entry *de;
 #if 0
                struct tm tm_time;
                uint16_t t, d;
 #endif
                de = (void*)buf;
-                strncpy(de->name, volume_label, sizeof(de->name));
+                memcpy(de->name, volume_label11, 11);
                STORE_LE(de->attr, ATTR_VOLUME);
 #if 0
                localtime_r(&create_time, &tm_time);
diff --git a/util-linux/nsenter.c b/util-linux/nsenter.c
index e6339da2f..1aa045b35 100644
--- a/util-linux/nsenter.c
+++ b/util-linux/nsenter.c
@@ -93,7 +93,7 @@ enum {
 * The user namespace comes first, so that it is entered first.
 * This gives an unprivileged user the potential to enter other namespaces.
 */
-static const struct namespace_descr ns_list[] = {
+static const struct namespace_descr ns_list[] ALIGN_INT = {
        { CLONE_NEWUSER, "ns/user", },
        { CLONE_NEWIPC,  "ns/ipc",  },
        { CLONE_NEWUTS,  "ns/uts",  },
diff --git a/util-linux/unshare.c b/util-linux/unshare.c
index 68ccdd874..06b938074 100644
--- a/util-linux/unshare.c
+++ b/util-linux/unshare.c
@@ -120,7 +120,7 @@ enum {
        NS_USR_POS, /* OPT_user, NS_USR_POS, and ns_list[] index must match! */
        NS_COUNT,
 };
-static const struct namespace_descr ns_list[] = {
+static const struct namespace_descr ns_list[] ALIGN_INT = {
        { CLONE_NEWNS,   "mnt"  },
        { CLONE_NEWUTS,  "uts"  },
        { CLONE_NEWIPC,  "ipc"  },
author	Ron Yorston <rmy@pobox.com>	2022-02-09 09:03:18 +0000
committer	Ron Yorston <rmy@pobox.com>	2022-02-09 09:05:39 +0000
commit	492d0a7492a57fe8f02c766e25960b0ce0d88759 (patch)
tree	4f5764a5c2250c031ea05e9aeacbb40d7971f493
parent	4734416a21312488a5099a297907783bee4ccc22 (diff)
parent	caa9c4f707b661cf398f2c2d66f54f5b0d8adfe2 (diff)
download	busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.tar.gz busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.tar.bz2 busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.zip