aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Ron Yorston <rmy@pobox.com> 2022-02-09 09:03:18 +0000
committer Ron Yorston <rmy@pobox.com> 2022-02-09 09:05:39 +0000
commit 492d0a7492a57fe8f02c766e25960b0ce0d88759 (patch)
tree 4f5764a5c2250c031ea05e9aeacbb40d7971f493
parent 4734416a21312488a5099a297907783bee4ccc22 (diff)
parent caa9c4f707b661cf398f2c2d66f54f5b0d8adfe2 (diff)
download busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.tar.gz
busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.tar.bz2
busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.zip
Merge busybox into merge
Fix conflicts in reset and ash. Redefine the new safe_read_key() as a reference to read_key(). Disable SHA256_HWACCEL.
-rw-r--r-- archival/libarchive/decompress_bunzip2.c 2
-rw-r--r-- archival/libarchive/get_header_tar.c 2
-rw-r--r-- busybox_ldscript.README.txt 47
-rw-r--r-- configs/mingw32_defconfig 1
-rw-r--r-- configs/mingw64_defconfig 1
-rw-r--r-- console-tools/reset.c 2
-rw-r--r-- coreutils/head.c 6
-rw-r--r-- coreutils/od.c 2
-rw-r--r-- coreutils/test.c 2
-rw-r--r-- e2fsprogs/fsck.c 2
-rw-r--r-- editors/cmp.c 18
-rw-r--r-- editors/patch.c 2
-rw-r--r-- editors/patch_toybox.c 2
-rw-r--r-- editors/sed.c 22
-rw-r--r-- editors/vi.c 4
-rw-r--r-- include/libbb.h 20
-rw-r--r-- include/platform.h 1
-rw-r--r-- libbb/Config.src 6
-rw-r--r-- libbb/Kbuild.src 3
-rw-r--r-- libbb/appletlib.c 2
-rw-r--r-- libbb/get_console.c 2
-rw-r--r-- libbb/getopt32.c 2
-rw-r--r-- libbb/hash_md5_sha.c 54
-rw-r--r-- libbb/hash_md5_sha256_x86-32_shaNI.S 277
-rw-r--r-- libbb/hash_md5_sha256_x86-64_shaNI.S 284
-rw-r--r-- libbb/hash_md5_sha_x86-32_shaNI.S 47
-rw-r--r-- libbb/hash_md5_sha_x86-64.S 1177
-rwxr-xr-x libbb/hash_md5_sha_x86-64.S.sh 460
-rw-r--r-- libbb/hash_md5_sha_x86-64_shaNI.S 33
-rw-r--r-- libbb/lineedit.c 34
-rw-r--r-- libbb/read_key.c 17
-rw-r--r-- libbb/setup_environment.c 8
-rw-r--r-- libbb/xfuncs_printf.c 11
-rw-r--r-- loginutils/login.c 4
-rw-r--r-- loginutils/su.c 7
-rw-r--r-- loginutils/sulogin.c 9
-rw-r--r-- miscutils/bc.c 2
-rw-r--r-- miscutils/crond.c 3
-rw-r--r-- miscutils/crontab.c 4
-rw-r--r-- miscutils/devfsd.c 4
-rw-r--r-- miscutils/hexedit.c 2
-rw-r--r-- miscutils/i2c_tools.c 2
-rw-r--r-- miscutils/less.c 4
-rw-r--r-- miscutils/man.c 2
-rw-r--r-- modutils/modutils-24.c 4
-rw-r--r-- networking/httpd.c 3
-rw-r--r-- networking/ifupdown.c 8
-rw-r--r-- networking/inetd.c 2
-rw-r--r-- networking/interface.c 6
-rw-r--r-- networking/libiproute/ipaddress.c 2
-rw-r--r-- networking/udhcp/common.c 2
-rw-r--r-- networking/udhcp/d6_dhcpc.c 2
-rw-r--r-- procps/nmeter.c 2
-rw-r--r-- procps/top.c 2
-rw-r--r-- selinux/setenforce.c 2
-rw-r--r-- shell/ash.c 48
-rw-r--r-- shell/hush.c 79
-rw-r--r-- shell/shell_common.c 1
-rwxr-xr-x testsuite/sed.tests 15
-rw-r--r-- util-linux/hexdump.c 2
-rw-r--r-- util-linux/mkfs_vfat.c 25
-rw-r--r-- util-linux/nsenter.c 2
-rw-r--r-- util-linux/unshare.c 2
63 files changed, 1969 insertions, 836 deletions
diff --git a/archival/libarchive/decompress_bunzip2.c b/archival/libarchive/decompress_bunzip2.c
index 42e2b4f88..4a2b668aa 100644
--- a/archival/libarchive/decompress_bunzip2.c
+++ b/archival/libarchive/decompress_bunzip2.c
@@ -654,7 +654,7 @@ static int read_bunzip(bunzip_data *bd, char *outbuf, int len)
654 /* Subtract the 1 copy we'd output anyway to get extras */ 654 /* Subtract the 1 copy we'd output anyway to get extras */
655 --bd->writeCopies; 655 --bd->writeCopies;
656 } 656 }
657 } /* for(;;) */ 657 } /* for (;;) */
658 658
659 /* Decompression of this input block completed successfully */ 659 /* Decompression of this input block completed successfully */
660 bd->writeCRC = CRC = ~CRC; 660 bd->writeCRC = CRC = ~CRC;
diff --git a/archival/libarchive/get_header_tar.c b/archival/libarchive/get_header_tar.c
index d26868bf8..cc6f3f0ad 100644
--- a/archival/libarchive/get_header_tar.c
+++ b/archival/libarchive/get_header_tar.c
@@ -147,11 +147,13 @@ static void process_pax_hdr(archive_handle_t *archive_handle, unsigned sz, int g
147#endif 147#endif
148} 148}
149 149
150#if ENABLE_FEATURE_TAR_GNU_EXTENSIONS
150static void die_if_bad_fnamesize(off_t sz) 151static void die_if_bad_fnamesize(off_t sz)
151{ 152{
152 if ((uoff_t)sz > 0xfff) /* more than 4k?! no funny business please */ 153 if ((uoff_t)sz > 0xfff) /* more than 4k?! no funny business please */
153 bb_simple_error_msg_and_die("bad archive"); 154 bb_simple_error_msg_and_die("bad archive");
154} 155}
156#endif
155 157
156char FAST_FUNC get_header_tar(archive_handle_t *archive_handle) 158char FAST_FUNC get_header_tar(archive_handle_t *archive_handle)
157{ 159{
diff --git a/busybox_ldscript.README.txt b/busybox_ldscript.README.txt
new file mode 100644
index 000000000..1625a970a
--- /dev/null
+++ b/busybox_ldscript.README.txt
@@ -0,0 +1,47 @@
1/* Add SORT_BY_ALIGNMENT to linker script (found in busybox_unstripped.out):
2## .rodata : { *(.rodata SORT_BY_ALIGNMENT(.rodata.*) .gnu.linkonce.r.*) }
3## .data : { *(.data SORT_BY_ALIGNMENT(.data.*) .gnu.linkonce.d.*) }
4## .bss : { *(.bss SORT_BY_ALIGNMENT(.bss.*) .gnu.linkonce.b.*) }
5## This will eliminate most of the padding (~3kb).
6## Hmm, "ld --sort-section alignment" should do it too.
7##
8## There is a ld hack which is meant to decrease disk usage
9## at the cost of more RAM usage (??!!) in standard ld script:
10## . = ALIGN (0x1000) - ((0x1000 - .) & (0x1000 - 1)); . = DATA_SEGMENT_ALIGN (0x1000, 0x1000);
11## Replace it with:
12## . = ALIGN (0x1000); . = DATA_SEGMENT_ALIGN (0x1000, 0x1000);
13## to unconditionally align .data to the next page boundary,
14## instead of "next page, plus current offset in this page"
15*/
16
17/* To reduce the number of VMAs each bbox process has,
18## move *(.bss SORT_BY_ALIGNMENT(.bss.*) ...)
19## part from .bss : {...} block to .data : { ... } block.
20## (This usually increases .data section by only one page).
21## Result:
22##
23## text data bss dec hex filename
24## 1050792 560 7580 1058932 102874 busybox.bss
25## 1050792 8149 0 1058941 10287d busybox.nobss
26##
27## $ exec busybox.bss pmap $$
28## 0000000008048000 1028K r-xp /path/to/busybox.bss
29## 0000000008149000 8K rw-p /path/to/busybox.bss
30## 000000000814b000 4K rw-p [ anon ] <---- this VMA is eliminated
31## 00000000085f5000 4K ---p [heap]
32## 00000000085f6000 4K rw-p [heap]
33## 00000000f7778000 8K rw-p [ anon ]
34## 00000000f777a000 12K r--p [vvar]
35## 00000000f777d000 8K r-xp [vdso]
36## 00000000ff7e9000 132K rw-p [stack]
37##
38## $ exec busybox.nobss pmap $$
39## 0000000008048000 1028K r-xp /path/to/busybox.nobss
40## 0000000008149000 12K rw-p /path/to/busybox.nobss
41## 00000000086f0000 4K ---p [heap]
42## 00000000086f1000 4K rw-p [heap]
43## 00000000f7783000 8K rw-p [ anon ]
44## 00000000f7785000 12K r--p [vvar]
45## 00000000f7788000 8K r-xp [vdso]
46## 00000000ffac0000 132K rw-p [stack]
47*/
diff --git a/configs/mingw32_defconfig b/configs/mingw32_defconfig
index 408b13eb8..98288bfb2 100644
--- a/configs/mingw32_defconfig
+++ b/configs/mingw32_defconfig
@@ -114,6 +114,7 @@ CONFIG_PASSWORD_MINLEN=6
114CONFIG_MD5_SMALL=1 114CONFIG_MD5_SMALL=1
115CONFIG_SHA1_SMALL=3 115CONFIG_SHA1_SMALL=3
116# CONFIG_SHA1_HWACCEL is not set 116# CONFIG_SHA1_HWACCEL is not set
117# CONFIG_SHA256_HWACCEL is not set
117CONFIG_SHA3_SMALL=1 118CONFIG_SHA3_SMALL=1
118# CONFIG_FEATURE_FAST_TOP is not set 119# CONFIG_FEATURE_FAST_TOP is not set
119# CONFIG_FEATURE_ETC_NETWORKS is not set 120# CONFIG_FEATURE_ETC_NETWORKS is not set
diff --git a/configs/mingw64_defconfig b/configs/mingw64_defconfig
index 05596ab8e..1ce3831a9 100644
--- a/configs/mingw64_defconfig
+++ b/configs/mingw64_defconfig
@@ -114,6 +114,7 @@ CONFIG_PASSWORD_MINLEN=6
114CONFIG_MD5_SMALL=1 114CONFIG_MD5_SMALL=1
115CONFIG_SHA1_SMALL=3 115CONFIG_SHA1_SMALL=3
116# CONFIG_SHA1_HWACCEL is not set 116# CONFIG_SHA1_HWACCEL is not set
117# CONFIG_SHA256_HWACCEL is not set
117CONFIG_SHA3_SMALL=1 118CONFIG_SHA3_SMALL=1
118# CONFIG_FEATURE_FAST_TOP is not set 119# CONFIG_FEATURE_FAST_TOP is not set
119# CONFIG_FEATURE_ETC_NETWORKS is not set 120# CONFIG_FEATURE_ETC_NETWORKS is not set
diff --git a/console-tools/reset.c b/console-tools/reset.c
index e0d228d50..151bc47d1 100644
--- a/console-tools/reset.c
+++ b/console-tools/reset.c
@@ -40,7 +40,7 @@ int reset_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
40int reset_main(int argc UNUSED_PARAM, char **argv UNUSED_PARAM) 40int reset_main(int argc UNUSED_PARAM, char **argv UNUSED_PARAM)
41{ 41{
42#if !ENABLE_PLATFORM_MINGW32 42#if !ENABLE_PLATFORM_MINGW32
43 static const char *const args[] = { 43 static const char *const args[] ALIGN_PTR = {
44 "stty", "sane", NULL 44 "stty", "sane", NULL
45 }; 45 };
46 46
diff --git a/coreutils/head.c b/coreutils/head.c
index 9586f869f..c7537a20e 100644
--- a/coreutils/head.c
+++ b/coreutils/head.c
@@ -76,7 +76,7 @@ print_except_N_last_bytes(FILE *fp, unsigned count)
76{ 76{
77 unsigned char *circle = xmalloc(++count); 77 unsigned char *circle = xmalloc(++count);
78 unsigned head = 0; 78 unsigned head = 0;
79 for(;;) { 79 for (;;) {
80 int c; 80 int c;
81 c = getc(fp); 81 c = getc(fp);
82 if (c == EOF) 82 if (c == EOF)
@@ -105,7 +105,7 @@ print_except_N_last_lines(FILE *fp, unsigned count)
105{ 105{
106 char **circle = xzalloc((++count) * sizeof(circle[0])); 106 char **circle = xzalloc((++count) * sizeof(circle[0]));
107 unsigned head = 0; 107 unsigned head = 0;
108 for(;;) { 108 for (;;) {
109 char *c; 109 char *c;
110 c = xmalloc_fgets(fp); 110 c = xmalloc_fgets(fp);
111 if (!c) 111 if (!c)
@@ -127,7 +127,7 @@ print_except_N_last_lines(FILE *fp, unsigned count)
127 } 127 }
128 ret: 128 ret:
129 head = 0; 129 head = 0;
130 for(;;) { 130 for (;;) {
131 free(circle[head++]); 131 free(circle[head++]);
132 if (head == count) 132 if (head == count)
133 break; 133 break;
diff --git a/coreutils/od.c b/coreutils/od.c
index 9a888dd5f..6f22331e0 100644
--- a/coreutils/od.c
+++ b/coreutils/od.c
@@ -144,7 +144,7 @@ odoffset(dumper_t *dumper, int argc, char ***argvp)
144 } 144 }
145} 145}
146 146
147static const char *const add_strings[] = { 147static const char *const add_strings[] ALIGN_PTR = {
148 "16/1 \"%3_u \" \"\\n\"", /* a */ 148 "16/1 \"%3_u \" \"\\n\"", /* a */
149 "8/2 \" %06o \" \"\\n\"", /* B, o */ 149 "8/2 \" %06o \" \"\\n\"", /* B, o */
150 "16/1 \"%03o \" \"\\n\"", /* b */ 150 "16/1 \"%03o \" \"\\n\"", /* b */
diff --git a/coreutils/test.c b/coreutils/test.c
index a914c7490..840a0daaf 100644
--- a/coreutils/test.c
+++ b/coreutils/test.c
@@ -242,7 +242,7 @@ int depth;
242 depth--; \ 242 depth--; \
243 return __res; \ 243 return __res; \
244} while (0) 244} while (0)
245static const char *const TOKSTR[] = { 245static const char *const TOKSTR[] ALIGN_PTR = {
246 "EOI", 246 "EOI",
247 "FILRD", 247 "FILRD",
248 "FILWR", 248 "FILWR",
diff --git a/e2fsprogs/fsck.c b/e2fsprogs/fsck.c
index 96c1e51e0..028f8a803 100644
--- a/e2fsprogs/fsck.c
+++ b/e2fsprogs/fsck.c
@@ -190,7 +190,7 @@ struct globals {
190 * Required for the uber-silly devfs /dev/ide/host1/bus2/target3/lun3 190 * Required for the uber-silly devfs /dev/ide/host1/bus2/target3/lun3
191 * pathnames. 191 * pathnames.
192 */ 192 */
193static const char *const devfs_hier[] = { 193static const char *const devfs_hier[] ALIGN_PTR = {
194 "host", "bus", "target", "lun", NULL 194 "host", "bus", "target", "lun", NULL
195}; 195};
196#endif 196#endif
diff --git a/editors/cmp.c b/editors/cmp.c
index 6d2b0c6c3..b89e519ad 100644
--- a/editors/cmp.c
+++ b/editors/cmp.c
@@ -54,6 +54,7 @@ int cmp_main(int argc UNUSED_PARAM, char **argv)
54 int retval = 0; 54 int retval = 0;
55 int max_count = -1; 55 int max_count = -1;
56 56
57#if !ENABLE_LONG_OPTS
57 opt = getopt32(argv, "^" 58 opt = getopt32(argv, "^"
58 OPT_STR 59 OPT_STR
59 "\0" "-1" 60 "\0" "-1"
@@ -62,6 +63,23 @@ int cmp_main(int argc UNUSED_PARAM, char **argv)
62 ":l--s:s--l", 63 ":l--s:s--l",
63 &max_count 64 &max_count
64 ); 65 );
66#else
67 static const char cmp_longopts[] ALIGN1 =
68 "bytes\0" Required_argument "n"
69 "quiet\0" No_argument "s"
70 "silent\0" No_argument "s"
71 "verbose\0" No_argument "l"
72 ;
73 opt = getopt32long(argv, "^"
74 OPT_STR
75 "\0" "-1"
76 IF_DESKTOP(":?4")
77 IF_NOT_DESKTOP(":?2")
78 ":l--s:s--l",
79 cmp_longopts,
80 &max_count
81 );
82#endif
65 argv += optind; 83 argv += optind;
66 84
67 filename1 = *argv; 85 filename1 = *argv;
diff --git a/editors/patch.c b/editors/patch.c
index 110176630..aebb5073e 100644
--- a/editors/patch.c
+++ b/editors/patch.c
@@ -418,7 +418,7 @@ int patch_main(int argc UNUSED_PARAM, char **argv)
418 } 418 }
419 419
420 // Loop through the lines in the patch 420 // Loop through the lines in the patch
421 for(;;) { 421 for (;;) {
422 char *patchline; 422 char *patchline;
423 423
424 patchline = xmalloc_fgetline(stdin); 424 patchline = xmalloc_fgetline(stdin);
diff --git a/editors/patch_toybox.c b/editors/patch_toybox.c
index aebab8132..69a508b2e 100644
--- a/editors/patch_toybox.c
+++ b/editors/patch_toybox.c
@@ -441,7 +441,7 @@ int patch_main(int argc UNUSED_PARAM, char **argv)
441 TT.filein = TT.fileout = -1; 441 TT.filein = TT.fileout = -1;
442 442
443 // Loop through the lines in the patch 443 // Loop through the lines in the patch
444 for(;;) { 444 for (;;) {
445 char *patchline; 445 char *patchline;
446 446
447 patchline = get_line(TT.filepatch); 447 patchline = get_line(TT.filepatch);
diff --git a/editors/sed.c b/editors/sed.c
index 374830f3f..f4a5f7b8a 100644
--- a/editors/sed.c
+++ b/editors/sed.c
@@ -252,7 +252,6 @@ static void cleanup_outname(void)
252} 252}
253 253
254/* strcpy, replacing "\from" with 'to'. If to is NUL, replacing "\any" with 'any' */ 254/* strcpy, replacing "\from" with 'to'. If to is NUL, replacing "\any" with 'any' */
255
256static unsigned parse_escapes(char *dest, const char *string, int len, char from, char to) 255static unsigned parse_escapes(char *dest, const char *string, int len, char from, char to)
257{ 256{
258 char *d = dest; 257 char *d = dest;
@@ -282,7 +281,7 @@ static unsigned parse_escapes(char *dest, const char *string, int len, char from
282 return d - dest; 281 return d - dest;
283} 282}
284 283
285static char *copy_parsing_escapes(const char *string, int len) 284static char *copy_parsing_escapes(const char *string, int len, char delim)
286{ 285{
287 const char *s; 286 const char *s;
288 char *dest = xmalloc(len + 1); 287 char *dest = xmalloc(len + 1);
@@ -293,10 +292,15 @@ static char *copy_parsing_escapes(const char *string, int len)
293 len = parse_escapes(dest, string, len, s[1], s[0]); 292 len = parse_escapes(dest, string, len, s[1], s[0]);
294 string = dest; 293 string = dest;
295 } 294 }
295 if (delim) {
296 /* we additionally unescape any instances of escaped delimiter.
297 * For example, in 's+9\++X+' the pattern is "9+", not "9\+".
298 */
299 len = parse_escapes(dest, string, len, delim, delim);
300 }
296 return dest; 301 return dest;
297} 302}
298 303
299
300/* 304/*
301 * index_of_next_unescaped_regexp_delim - walks left to right through a string 305 * index_of_next_unescaped_regexp_delim - walks left to right through a string
302 * beginning at a specified index and returns the index of the next regular 306 * beginning at a specified index and returns the index of the next regular
@@ -353,12 +357,14 @@ static int parse_regex_delim(const char *cmdstr, char **match, char **replace)
353 357
354 /* save the match string */ 358 /* save the match string */
355 idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr); 359 idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr);
356 *match = copy_parsing_escapes(cmdstr_ptr, idx); 360 *match = copy_parsing_escapes(cmdstr_ptr, idx, delimiter);
357
358 /* save the replacement string */ 361 /* save the replacement string */
359 cmdstr_ptr += idx + 1; 362 cmdstr_ptr += idx + 1;
360 idx = index_of_next_unescaped_regexp_delim(- (int)delimiter, cmdstr_ptr); 363 idx = index_of_next_unescaped_regexp_delim(- (int)delimiter, cmdstr_ptr);
361 *replace = copy_parsing_escapes(cmdstr_ptr, idx); 364//GNU sed 4.8:
365// echo 789 | sed 's&8&\&&' - 7&9 ("\&" remained "\&")
366// echo 789 | sed 's1\(8\)1\1\11' - 7119 ("\1\1" become "11")
367 *replace = copy_parsing_escapes(cmdstr_ptr, idx, delimiter != '&' ? delimiter : 0);
362 368
363 return ((cmdstr_ptr - cmdstr) + idx); 369 return ((cmdstr_ptr - cmdstr) + idx);
364} 370}
@@ -386,7 +392,7 @@ static int get_address(const char *my_str, int *linenum, regex_t ** regex)
386 delimiter = *++pos; 392 delimiter = *++pos;
387 next = index_of_next_unescaped_regexp_delim(delimiter, ++pos); 393 next = index_of_next_unescaped_regexp_delim(delimiter, ++pos);
388 if (next != 0) { 394 if (next != 0) {
389 temp = copy_parsing_escapes(pos, next); 395 temp = copy_parsing_escapes(pos, next, 0);
390 G.previous_regex_ptr = *regex = xzalloc(sizeof(regex_t)); 396 G.previous_regex_ptr = *regex = xzalloc(sizeof(regex_t));
391 xregcomp(*regex, temp, G.regex_type); 397 xregcomp(*regex, temp, G.regex_type);
392 free(temp); 398 free(temp);
@@ -581,7 +587,7 @@ static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr)
581 cmdstr++; 587 cmdstr++;
582 } 588 }
583 len = strlen(cmdstr); 589 len = strlen(cmdstr);
584 sed_cmd->string = copy_parsing_escapes(cmdstr, len); 590 sed_cmd->string = copy_parsing_escapes(cmdstr, len, 0);
585 cmdstr += len; 591 cmdstr += len;
586 /* "\anychar" -> "anychar" */ 592 /* "\anychar" -> "anychar" */
587 parse_escapes(sed_cmd->string, sed_cmd->string, -1, '\0', '\0'); 593 parse_escapes(sed_cmd->string, sed_cmd->string, -1, '\0', '\0');
diff --git a/editors/vi.c b/editors/vi.c
index b973cc056..b30369302 100644
--- a/editors/vi.c
+++ b/editors/vi.c
@@ -1182,7 +1182,7 @@ static int readit(void) // read (maybe cursor) key from stdin
1182 // on nonblocking stdin. 1182 // on nonblocking stdin.
1183 // Note: read_key sets errno to 0 on success. 1183 // Note: read_key sets errno to 0 on success.
1184 again: 1184 again:
1185 c = read_key(STDIN_FILENO, readbuffer, /*timeout:*/ -1); 1185 c = safe_read_key(STDIN_FILENO, readbuffer, /*timeout:*/ -1);
1186 if (c == -1) { // EOF/error 1186 if (c == -1) { // EOF/error
1187 if (errno == EAGAIN) // paranoia 1187 if (errno == EAGAIN) // paranoia
1188 goto again; 1188 goto again;
@@ -4930,7 +4930,7 @@ static void edit_file(char *fn)
4930 uint64_t k; 4930 uint64_t k;
4931 write1(ESC"[999;999H" ESC"[6n"); 4931 write1(ESC"[999;999H" ESC"[6n");
4932 fflush_all(); 4932 fflush_all();
4933 k = read_key(STDIN_FILENO, readbuffer, /*timeout_ms:*/ 100); 4933 k = safe_read_key(STDIN_FILENO, readbuffer, /*timeout_ms:*/ 100);
4934 if ((int32_t)k == KEYCODE_CURSOR_POS) { 4934 if ((int32_t)k == KEYCODE_CURSOR_POS) {
4935 uint32_t rc = (k >> 32); 4935 uint32_t rc = (k >> 32);
4936 columns = (rc & 0x7fff); 4936 columns = (rc & 0x7fff);
diff --git a/include/libbb.h b/include/libbb.h
index e540f2a90..740c25528 100644
--- a/include/libbb.h
+++ b/include/libbb.h
@@ -691,6 +691,7 @@ void xsetgid(gid_t gid) FAST_FUNC;
691void xsetuid(uid_t uid) FAST_FUNC; 691void xsetuid(uid_t uid) FAST_FUNC;
692void xsetegid(gid_t egid) FAST_FUNC; 692void xsetegid(gid_t egid) FAST_FUNC;
693void xseteuid(uid_t euid) FAST_FUNC; 693void xseteuid(uid_t euid) FAST_FUNC;
694int chdir_or_warn(const char *path) FAST_FUNC;
694void xchdir(const char *path) FAST_FUNC; 695void xchdir(const char *path) FAST_FUNC;
695void xfchdir(int fd) FAST_FUNC; 696void xfchdir(int fd) FAST_FUNC;
696void xchroot(const char *path) FAST_FUNC; 697void xchroot(const char *path) FAST_FUNC;
@@ -1776,7 +1777,7 @@ extern void selinux_or_die(void) FAST_FUNC;
1776 1777
1777 1778
1778/* setup_environment: 1779/* setup_environment:
1779 * if !SETUP_ENV_NO_CHDIR: 1780 * if SETUP_ENV_CHDIR:
1780 * if cd(pw->pw_dir): ok: else if SETUP_ENV_TO_TMP: cd(/tmp) else: cd(/) or die 1781 * if cd(pw->pw_dir): ok: else if SETUP_ENV_TO_TMP: cd(/tmp) else: cd(/) or die
1781 * if SETUP_ENV_CLEARENV: cd(pw->pw_dir), clear environment, then set 1782 * if SETUP_ENV_CLEARENV: cd(pw->pw_dir), clear environment, then set
1782 * TERM=(old value) 1783 * TERM=(old value)
@@ -1784,7 +1785,7 @@ extern void selinux_or_die(void) FAST_FUNC;
1784 * PATH=bb_default_[root_]path 1785 * PATH=bb_default_[root_]path
1785 * HOME=pw->pw_dir 1786 * HOME=pw->pw_dir
1786 * SHELL=shell 1787 * SHELL=shell
1787 * else if SETUP_ENV_CHANGEENV: 1788 * else if SETUP_ENV_CHANGEENV | SETUP_ENV_CHANGEENV_LOGNAME:
1788 * if not root (if pw->pw_uid != 0) or if SETUP_ENV_CHANGEENV_LOGNAME: 1789 * if not root (if pw->pw_uid != 0) or if SETUP_ENV_CHANGEENV_LOGNAME:
1789 * USER=pw->pw_name, LOGNAME=pw->pw_name 1790 * USER=pw->pw_name, LOGNAME=pw->pw_name
1790 * HOME=pw->pw_dir 1791 * HOME=pw->pw_dir
@@ -1798,7 +1799,7 @@ extern void selinux_or_die(void) FAST_FUNC;
1798#define SETUP_ENV_CHANGEENV_LOGNAME (1 << 1) 1799#define SETUP_ENV_CHANGEENV_LOGNAME (1 << 1)
1799#define SETUP_ENV_CLEARENV (1 << 2) 1800#define SETUP_ENV_CLEARENV (1 << 2)
1800#define SETUP_ENV_TO_TMP (1 << 3) 1801#define SETUP_ENV_TO_TMP (1 << 3)
1801#define SETUP_ENV_NO_CHDIR (1 << 4) 1802#define SETUP_ENV_CHDIR (1 << 4)
1802void setup_environment(const char *shell, int flags, const struct passwd *pw) FAST_FUNC; 1803void setup_environment(const char *shell, int flags, const struct passwd *pw) FAST_FUNC;
1803void nuke_str(char *str) FAST_FUNC; 1804void nuke_str(char *str) FAST_FUNC;
1804#if ENABLE_FEATURE_SECURETTY && !ENABLE_PAM 1805#if ENABLE_FEATURE_SECURETTY && !ENABLE_PAM
@@ -1955,6 +1956,8 @@ enum {
1955 * (unless fd is in non-blocking mode), 1956 * (unless fd is in non-blocking mode),
1956 * subsequent reads will time out after a few milliseconds. 1957 * subsequent reads will time out after a few milliseconds.
1957 * Return of -1 means EOF or error (errno == 0 on EOF). 1958 * Return of -1 means EOF or error (errno == 0 on EOF).
1959 * Nonzero errno is not preserved across the call:
1960 * if there was no error, errno will be cleared to 0.
1958 * buffer[0] is used as a counter of buffered chars and must be 0 1961 * buffer[0] is used as a counter of buffered chars and must be 0
1959 * on first call. 1962 * on first call.
1960 * timeout: 1963 * timeout:
@@ -1963,6 +1966,12 @@ enum {
1963 * >=0: poll() for TIMEOUT milliseconds, return -1/EAGAIN on timeout 1966 * >=0: poll() for TIMEOUT milliseconds, return -1/EAGAIN on timeout
1964 */ 1967 */
1965int64_t read_key(int fd, char *buffer, int timeout) FAST_FUNC; 1968int64_t read_key(int fd, char *buffer, int timeout) FAST_FUNC;
1969#if ENABLE_PLATFORM_MINGW32
1970#define safe_read_key(f, b, t) read_key(f, b, t)
1971#else
1972/* This version loops on EINTR: */
1973int64_t safe_read_key(int fd, char *buffer, int timeout) FAST_FUNC;
1974#endif
1966void read_key_ungets(char *buffer, const char *str, unsigned len) FAST_FUNC; 1975void read_key_ungets(char *buffer, const char *str, unsigned len) FAST_FUNC;
1967 1976
1968 1977
@@ -2016,7 +2025,8 @@ enum {
2016 USERNAME_COMPLETION = 4 * ENABLE_FEATURE_USERNAME_COMPLETION, 2025 USERNAME_COMPLETION = 4 * ENABLE_FEATURE_USERNAME_COMPLETION,
2017 VI_MODE = 8 * ENABLE_FEATURE_EDITING_VI, 2026 VI_MODE = 8 * ENABLE_FEATURE_EDITING_VI,
2018 WITH_PATH_LOOKUP = 0x10, 2027 WITH_PATH_LOOKUP = 0x10,
2019 FOR_SHELL = DO_HISTORY | TAB_COMPLETION | USERNAME_COMPLETION, 2028 LI_INTERRUPTIBLE = 0x20,
2029 FOR_SHELL = DO_HISTORY | TAB_COMPLETION | USERNAME_COMPLETION | LI_INTERRUPTIBLE,
2020}; 2030};
2021line_input_t *new_line_input_t(int flags) FAST_FUNC; 2031line_input_t *new_line_input_t(int flags) FAST_FUNC;
2022#if ENABLE_FEATURE_EDITING_SAVEHISTORY 2032#if ENABLE_FEATURE_EDITING_SAVEHISTORY
@@ -2361,7 +2371,7 @@ struct globals;
2361/* '*const' ptr makes gcc optimize code much better. 2371/* '*const' ptr makes gcc optimize code much better.
2362 * Magic prevents ptr_to_globals from going into rodata. 2372 * Magic prevents ptr_to_globals from going into rodata.
2363 * If you want to assign a value, use SET_PTR_TO_GLOBALS(x) */ 2373 * If you want to assign a value, use SET_PTR_TO_GLOBALS(x) */
2364extern struct globals *const ptr_to_globals; 2374extern struct globals *BB_GLOBAL_CONST ptr_to_globals;
2365 2375
2366#define barrier() asm volatile ("":::"memory") 2376#define barrier() asm volatile ("":::"memory")
2367 2377
diff --git a/include/platform.h b/include/platform.h
index 3fb1a2dc8..8ae5ed4bc 100644
--- a/include/platform.h
+++ b/include/platform.h
@@ -367,6 +367,7 @@ typedef unsigned smalluint;
367# define ALIGN4 367# define ALIGN4
368#endif 368#endif
369#define ALIGN8 __attribute__((aligned(8))) 369#define ALIGN8 __attribute__((aligned(8)))
370#define ALIGN_INT __attribute__((aligned(sizeof(int))))
370#define ALIGN_PTR __attribute__((aligned(sizeof(void*)))) 371#define ALIGN_PTR __attribute__((aligned(sizeof(void*))))
371 372
372/* 373/*
diff --git a/libbb/Config.src b/libbb/Config.src
index 708d3b0c8..0ecd5bd46 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -70,6 +70,12 @@ config SHA1_HWACCEL
70 On x86, this adds ~590 bytes of code. Throughput 70 On x86, this adds ~590 bytes of code. Throughput
71 is about twice as fast as fully-unrolled generic code. 71 is about twice as fast as fully-unrolled generic code.
72 72
73config SHA256_HWACCEL
74 bool "SHA256: Use hardware accelerated instructions if possible"
75 default y
76 help
77 On x86, this adds ~1k bytes of code.
78
73config SHA3_SMALL 79config SHA3_SMALL
74 int "SHA3: Trade bytes for speed (0:fast, 1:slow)" 80 int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
75 default 1 # all "fast or small" options default to small 81 default 1 # all "fast or small" options default to small
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src
index 67d3c7cf7..191984c9d 100644
--- a/libbb/Kbuild.src
+++ b/libbb/Kbuild.src
@@ -48,6 +48,8 @@ lib-y += hash_md5_sha.o
48lib-y += hash_md5_sha_x86-64.o 48lib-y += hash_md5_sha_x86-64.o
49lib-y += hash_md5_sha_x86-64_shaNI.o 49lib-y += hash_md5_sha_x86-64_shaNI.o
50lib-y += hash_md5_sha_x86-32_shaNI.o 50lib-y += hash_md5_sha_x86-32_shaNI.o
51lib-y += hash_md5_sha256_x86-64_shaNI.o
52lib-y += hash_md5_sha256_x86-32_shaNI.o
51# Alternative (disabled) MD5 implementation 53# Alternative (disabled) MD5 implementation
52#lib-y += hash_md5prime.o 54#lib-y += hash_md5prime.o
53lib-y += messages.o 55lib-y += messages.o
@@ -204,6 +206,7 @@ lib-$(CONFIG_PGREP) += xregcomp.o
204lib-$(CONFIG_PKILL) += xregcomp.o 206lib-$(CONFIG_PKILL) += xregcomp.o
205lib-$(CONFIG_DEVFSD) += xregcomp.o 207lib-$(CONFIG_DEVFSD) += xregcomp.o
206lib-$(CONFIG_FEATURE_FIND_REGEX) += xregcomp.o 208lib-$(CONFIG_FEATURE_FIND_REGEX) += xregcomp.o
209lib-$(CONFIG_FEATURE_CUT_REGEX) += xregcomp.o
207 210
208# Add the experimental logging functionality, only used by zcip 211# Add the experimental logging functionality, only used by zcip
209lib-$(CONFIG_ZCIP) += logenv.o 212lib-$(CONFIG_ZCIP) += logenv.o
diff --git a/libbb/appletlib.c b/libbb/appletlib.c
index 6c0be4a83..a8b82e729 100644
--- a/libbb/appletlib.c
+++ b/libbb/appletlib.c
@@ -671,7 +671,7 @@ static void check_suid(int applet_no)
671# if ENABLE_FEATURE_INSTALLER 671# if ENABLE_FEATURE_INSTALLER
672static const char usr_bin [] ALIGN1 = "/usr/bin/"; 672static const char usr_bin [] ALIGN1 = "/usr/bin/";
673static const char usr_sbin[] ALIGN1 = "/usr/sbin/"; 673static const char usr_sbin[] ALIGN1 = "/usr/sbin/";
674static const char *const install_dir[] = { 674static const char *const install_dir[] ALIGN_PTR = {
675 &usr_bin [8], /* "/" */ 675 &usr_bin [8], /* "/" */
676 &usr_bin [4], /* "/bin/" */ 676 &usr_bin [4], /* "/bin/" */
677 &usr_sbin[4] /* "/sbin/" */ 677 &usr_sbin[4] /* "/sbin/" */
diff --git a/libbb/get_console.c b/libbb/get_console.c
index 7f2c75332..9044efea1 100644
--- a/libbb/get_console.c
+++ b/libbb/get_console.c
@@ -37,7 +37,7 @@ static int open_a_console(const char *fnam)
37 */ 37 */
38int FAST_FUNC get_console_fd_or_die(void) 38int FAST_FUNC get_console_fd_or_die(void)
39{ 39{
40 static const char *const console_names[] = { 40 static const char *const console_names[] ALIGN_PTR = {
41 DEV_CONSOLE, CURRENT_VC, CURRENT_TTY 41 DEV_CONSOLE, CURRENT_VC, CURRENT_TTY
42 }; 42 };
43 43
diff --git a/libbb/getopt32.c b/libbb/getopt32.c
index 5ab4d66f1..e861d0567 100644
--- a/libbb/getopt32.c
+++ b/libbb/getopt32.c
@@ -296,7 +296,7 @@ Special characters:
296 296
297/* Code here assumes that 'unsigned' is at least 32 bits wide */ 297/* Code here assumes that 'unsigned' is at least 32 bits wide */
298 298
299const char *const bb_argv_dash[] = { "-", NULL }; 299const char *const bb_argv_dash[] ALIGN_PTR = { "-", NULL };
300 300
301enum { 301enum {
302 PARAM_STRING, 302 PARAM_STRING,
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index a23db5152..880ffab01 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -13,6 +13,27 @@
13 13
14#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA) 14#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
15 15
16#if ENABLE_SHA1_HWACCEL || ENABLE_SHA256_HWACCEL
17# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
18static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
19{
20 asm ("cpuid"
21 : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
22 : "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx)
23 );
24}
25static smallint shaNI;
26void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
27void FAST_FUNC sha256_process_block64_shaNI(sha256_ctx_t *ctx);
28# if defined(__i386__)
29struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 76)]; };
30# endif
31# if defined(__x86_64__)
32struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 80)]; };
33# endif
34# endif
35#endif
36
16/* gcc 4.2.1 optimizes rotr64 better with inline than with macro 37/* gcc 4.2.1 optimizes rotr64 better with inline than with macro
17 * (for rotX32, there is no difference). Why? My guess is that 38 * (for rotX32, there is no difference). Why? My guess is that
18 * macro requires clever common subexpression elimination heuristics 39 * macro requires clever common subexpression elimination heuristics
@@ -1142,25 +1163,6 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
1142} 1163}
1143#endif /* NEED_SHA512 */ 1164#endif /* NEED_SHA512 */
1144 1165
1145#if ENABLE_SHA1_HWACCEL
1146# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
1147static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
1148{
1149 asm ("cpuid"
1150 : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
1151 : "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx)
1152 );
1153}
1154void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
1155# if defined(__i386__)
1156struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; };
1157# endif
1158# if defined(__x86_64__)
1159struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
1160# endif
1161# endif
1162#endif
1163
1164void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) 1166void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
1165{ 1167{
1166 ctx->hash[0] = 0x67452301; 1168 ctx->hash[0] = 0x67452301;
@@ -1173,7 +1175,6 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
1173#if ENABLE_SHA1_HWACCEL 1175#if ENABLE_SHA1_HWACCEL
1174# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) 1176# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
1175 { 1177 {
1176 static smallint shaNI;
1177 if (!shaNI) { 1178 if (!shaNI) {
1178 unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx; 1179 unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
1179 cpuid(&eax, &ebx, &ecx, &edx); 1180 cpuid(&eax, &ebx, &ecx, &edx);
@@ -1225,6 +1226,19 @@ void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
1225 memcpy(&ctx->total64, init256, sizeof(init256)); 1226 memcpy(&ctx->total64, init256, sizeof(init256));
1226 /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */ 1227 /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
1227 ctx->process_block = sha256_process_block64; 1228 ctx->process_block = sha256_process_block64;
1229#if ENABLE_SHA256_HWACCEL
1230# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
1231 {
1232 if (!shaNI) { /* 0: not probed yet; becomes 1 (SHA-NI present) or -1 (absent) */
1233 unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx; /* CPUID leaf 7, subleaf 0; ebx/edx self-init presumably just silences warnings */
1234 cpuid(&eax, &ebx, &ecx, &edx);
 /* Test EBX bit 29 (SHA extensions) alone. The old ((ebx >> 29) << 1) - 1
  * also let bits 30/31 (AVX512BW/AVX512VL) through, giving a positive
  * value on CPUs that have those features but no SHA instructions.
  */
1235 shaNI = ((ebx >> 28) & 2) - 1; /* bit 29 set -> 1, clear -> -1 */
1236 }
1237 if (shaNI > 0) /* only a positive probe result switches to the hardware routine */
1238 ctx->process_block = sha256_process_block64_shaNI;
1239 }
1240# endif
1241#endif
1228} 1242}
1229 1243
1230#if NEED_SHA512 1244#if NEED_SHA512
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S
new file mode 100644
index 000000000..aa68193bd
--- /dev/null
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@@ -0,0 +1,277 @@
1#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
2/* The code is adapted from Linux kernel's source */
3
4// We use shorter insns, even though they are for "wrong"
5// data type (fp, not int).
6// For Intel, there is no penalty for doing it at all
7// (CPUs which do have such a penalty do not support SHA insns).
8// For AMD, the penalty is one extra cycle
9// (allegedly: I failed to find measurable difference).
10
11//#define mova128 movdqa
12#define mova128 movaps
13//#define movu128 movdqu
14#define movu128 movups
15//#define shuf128_32 pshufd
16#define shuf128_32 shufps
17
18 .section .text.sha256_process_block64_shaNI, "ax", @progbits
19 .globl sha256_process_block64_shaNI
20 .hidden sha256_process_block64_shaNI
21 .type sha256_process_block64_shaNI, @function
22
23#define DATA_PTR %eax
24
25#define SHA256CONSTANTS %ecx
26
27#define MSG %xmm0
28#define STATE0 %xmm1
29#define STATE1 %xmm2
30#define MSGTMP0 %xmm3
31#define MSGTMP1 %xmm4
32#define MSGTMP2 %xmm5
33#define MSGTMP3 %xmm6
34
35#define XMMTMP %xmm7
36
37#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))
38
39 .balign 8 # allow decoders to fetch at least 2 first insns
40sha256_process_block64_shaNI: /* in: %eax = base pointer; 64 bytes of block data are read at 0..63(%eax), 8 hash dwords are read/written at 76..107(%eax) - presumably the sha256_ctx_t*, TODO confirm against the C prototype */
41
42 movu128 76+0*16(%eax), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */
43 movu128 76+1*16(%eax), STATE1 /* HGFE */
44/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
45 mova128 STATE1, STATE0
46 shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */
47 shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */
48
49/* XMMTMP holds flip mask from here... */
50 mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
51 movl $K256+8*16, SHA256CONSTANTS /* points into the middle of K256: the N*16-8*16 displacements below span -128..+112 (presumably so each fits a 1-byte displacement) */
52
53 /* Rounds 0-3 */
54 movu128 0*16(DATA_PTR), MSG
55 pshufb XMMTMP, MSG /* byte-swap each dword of the input block */
56 mova128 MSG, MSGTMP0 /* keep W[0..3] for the message schedule */
57 paddd 0*16-8*16(SHA256CONSTANTS), MSG /* MSG = W + K */
58 sha256rnds2 STATE0, STATE1 /* 2 rounds; the insn reads W+K implicitly from the low 2 dwords of %xmm0 (MSG) */
59 shuf128_32 $0x0E, MSG, MSG /* move dwords 2,3 of MSG down to 0,1 for the next 2 rounds */
60 sha256rnds2 STATE1, STATE0
61
62 /* Rounds 4-7 */
63 movu128 1*16(DATA_PTR), MSG
64 pshufb XMMTMP, MSG
65 mova128 MSG, MSGTMP1
66 paddd 1*16-8*16(SHA256CONSTANTS), MSG
67 sha256rnds2 STATE0, STATE1
68 shuf128_32 $0x0E, MSG, MSG
69 sha256rnds2 STATE1, STATE0
70 sha256msg1 MSGTMP1, MSGTMP0 /* first step of deriving W[16..19] from W[0..3] */
71
72 /* Rounds 8-11 */
73 movu128 2*16(DATA_PTR), MSG
74 pshufb XMMTMP, MSG
75 mova128 MSG, MSGTMP2
76 paddd 2*16-8*16(SHA256CONSTANTS), MSG
77 sha256rnds2 STATE0, STATE1
78 shuf128_32 $0x0E, MSG, MSG
79 sha256rnds2 STATE1, STATE0
80 sha256msg1 MSGTMP2, MSGTMP1
81
82 /* Rounds 12-15 */
83 movu128 3*16(DATA_PTR), MSG
84 pshufb XMMTMP, MSG
85/* ...to here */
86 mova128 MSG, MSGTMP3
87 paddd 3*16-8*16(SHA256CONSTANTS), MSG
88 sha256rnds2 STATE0, STATE1
89 mova128 MSGTMP3, XMMTMP /* XMMTMP is plain scratch from now on */
90 palignr $4, MSGTMP2, XMMTMP /* XMMTMP = W[9..12] */
91 paddd XMMTMP, MSGTMP0
92 sha256msg2 MSGTMP3, MSGTMP0 /* MSGTMP0 = W[16..19] */
93 shuf128_32 $0x0E, MSG, MSG
94 sha256rnds2 STATE1, STATE0
95 sha256msg1 MSGTMP3, MSGTMP2
96
97 /* Rounds 16-19 */
98 mova128 MSGTMP0, MSG
99 paddd 4*16-8*16(SHA256CONSTANTS), MSG
100 sha256rnds2 STATE0, STATE1
101 mova128 MSGTMP0, XMMTMP
102 palignr $4, MSGTMP3, XMMTMP
103 paddd XMMTMP, MSGTMP1
104 sha256msg2 MSGTMP0, MSGTMP1
105 shuf128_32 $0x0E, MSG, MSG
106 sha256rnds2 STATE1, STATE0
107 sha256msg1 MSGTMP0, MSGTMP3
108
109 /* Rounds 20-23 */
110 mova128 MSGTMP1, MSG
111 paddd 5*16-8*16(SHA256CONSTANTS), MSG
112 sha256rnds2 STATE0, STATE1
113 mova128 MSGTMP1, XMMTMP
114 palignr $4, MSGTMP0, XMMTMP
115 paddd XMMTMP, MSGTMP2
116 sha256msg2 MSGTMP1, MSGTMP2
117 shuf128_32 $0x0E, MSG, MSG
118 sha256rnds2 STATE1, STATE0
119 sha256msg1 MSGTMP1, MSGTMP0
120
121 /* Rounds 24-27 */
122 mova128 MSGTMP2, MSG
123 paddd 6*16-8*16(SHA256CONSTANTS), MSG
124 sha256rnds2 STATE0, STATE1
125 mova128 MSGTMP2, XMMTMP
126 palignr $4, MSGTMP1, XMMTMP
127 paddd XMMTMP, MSGTMP3
128 sha256msg2 MSGTMP2, MSGTMP3
129 shuf128_32 $0x0E, MSG, MSG
130 sha256rnds2 STATE1, STATE0
131 sha256msg1 MSGTMP2, MSGTMP1
132
133 /* Rounds 28-31 */
134 mova128 MSGTMP3, MSG
135 paddd 7*16-8*16(SHA256CONSTANTS), MSG
136 sha256rnds2 STATE0, STATE1
137 mova128 MSGTMP3, XMMTMP
138 palignr $4, MSGTMP2, XMMTMP
139 paddd XMMTMP, MSGTMP0
140 sha256msg2 MSGTMP3, MSGTMP0
141 shuf128_32 $0x0E, MSG, MSG
142 sha256rnds2 STATE1, STATE0
143 sha256msg1 MSGTMP3, MSGTMP2
144
145 /* Rounds 32-35 */
146 mova128 MSGTMP0, MSG
147 paddd 8*16-8*16(SHA256CONSTANTS), MSG
148 sha256rnds2 STATE0, STATE1
149 mova128 MSGTMP0, XMMTMP
150 palignr $4, MSGTMP3, XMMTMP
151 paddd XMMTMP, MSGTMP1
152 sha256msg2 MSGTMP0, MSGTMP1
153 shuf128_32 $0x0E, MSG, MSG
154 sha256rnds2 STATE1, STATE0
155 sha256msg1 MSGTMP0, MSGTMP3
156
157 /* Rounds 36-39 */
158 mova128 MSGTMP1, MSG
159 paddd 9*16-8*16(SHA256CONSTANTS), MSG
160 sha256rnds2 STATE0, STATE1
161 mova128 MSGTMP1, XMMTMP
162 palignr $4, MSGTMP0, XMMTMP
163 paddd XMMTMP, MSGTMP2
164 sha256msg2 MSGTMP1, MSGTMP2
165 shuf128_32 $0x0E, MSG, MSG
166 sha256rnds2 STATE1, STATE0
167 sha256msg1 MSGTMP1, MSGTMP0
168
169 /* Rounds 40-43 */
170 mova128 MSGTMP2, MSG
171 paddd 10*16-8*16(SHA256CONSTANTS), MSG
172 sha256rnds2 STATE0, STATE1
173 mova128 MSGTMP2, XMMTMP
174 palignr $4, MSGTMP1, XMMTMP
175 paddd XMMTMP, MSGTMP3
176 sha256msg2 MSGTMP2, MSGTMP3
177 shuf128_32 $0x0E, MSG, MSG
178 sha256rnds2 STATE1, STATE0
179 sha256msg1 MSGTMP2, MSGTMP1
180
181 /* Rounds 44-47 */
182 mova128 MSGTMP3, MSG
183 paddd 11*16-8*16(SHA256CONSTANTS), MSG
184 sha256rnds2 STATE0, STATE1
185 mova128 MSGTMP3, XMMTMP
186 palignr $4, MSGTMP2, XMMTMP
187 paddd XMMTMP, MSGTMP0
188 sha256msg2 MSGTMP3, MSGTMP0
189 shuf128_32 $0x0E, MSG, MSG
190 sha256rnds2 STATE1, STATE0
191 sha256msg1 MSGTMP3, MSGTMP2
192
193 /* Rounds 48-51 */
194 mova128 MSGTMP0, MSG
195 paddd 12*16-8*16(SHA256CONSTANTS), MSG
196 sha256rnds2 STATE0, STATE1
197 mova128 MSGTMP0, XMMTMP
198 palignr $4, MSGTMP3, XMMTMP
199 paddd XMMTMP, MSGTMP1
200 sha256msg2 MSGTMP0, MSGTMP1
201 shuf128_32 $0x0E, MSG, MSG
202 sha256rnds2 STATE1, STATE0
203 sha256msg1 MSGTMP0, MSGTMP3
204
205 /* Rounds 52-55 */
206 mova128 MSGTMP1, MSG
207 paddd 13*16-8*16(SHA256CONSTANTS), MSG
208 sha256rnds2 STATE0, STATE1
209 mova128 MSGTMP1, XMMTMP
210 palignr $4, MSGTMP0, XMMTMP
211 paddd XMMTMP, MSGTMP2
212 sha256msg2 MSGTMP1, MSGTMP2
213 shuf128_32 $0x0E, MSG, MSG
214 sha256rnds2 STATE1, STATE0 /* no sha256msg1 from this group on: no further schedule words are started */
215
216 /* Rounds 56-59 */
217 mova128 MSGTMP2, MSG
218 paddd 14*16-8*16(SHA256CONSTANTS), MSG
219 sha256rnds2 STATE0, STATE1
220 mova128 MSGTMP2, XMMTMP
221 palignr $4, MSGTMP1, XMMTMP
222 paddd XMMTMP, MSGTMP3
223 sha256msg2 MSGTMP2, MSGTMP3
224 shuf128_32 $0x0E, MSG, MSG
225 sha256rnds2 STATE1, STATE0
226
227 /* Rounds 60-63 */
228 mova128 MSGTMP3, MSG
229 paddd 15*16-8*16(SHA256CONSTANTS), MSG
230 sha256rnds2 STATE0, STATE1
231 shuf128_32 $0x0E, MSG, MSG
232 sha256rnds2 STATE1, STATE0
233
234 /* Write hash values back in the correct order */
235 /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */
236 /* STATE1: CDGH */
237 mova128 STATE0, XMMTMP
238/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
239 shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */
240 shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */
241 /* add current hash values to previous ones */
242 movu128 76+1*16(%eax), STATE1 /* the previous hash is re-read from memory here; this routine keeps no saved copy in a register */
243 paddd XMMTMP, STATE1
244 movu128 STATE1, 76+1*16(%eax)
245 movu128 76+0*16(%eax), XMMTMP
246 paddd XMMTMP, STATE0
247 movu128 STATE0, 76+0*16(%eax)
248
249 ret
250 .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
251
252 .section .rodata.cst256.K256, "aM", @progbits, 256
253 .balign 16
254K256:
255 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
256 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
257 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
258 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
259 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
260 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
261 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
262 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
263 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
264 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
265 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
266 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
267 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
268 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
269 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
270 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
271
272 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
273 .balign 16
274PSHUFFLE_BSWAP32_FLIP_MASK:
275 .octa 0x0c0d0e0f08090a0b0405060700010203
276
277#endif
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S
new file mode 100644
index 000000000..4663f750a
--- /dev/null
+++ b/libbb/hash_md5_sha256_x86-64_shaNI.S
@@ -0,0 +1,284 @@
1#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
2/* The code is adapted from Linux kernel's source */
3
4// We use shorter insns, even though they are for "wrong"
5// data type (fp, not int).
6// For Intel, there is no penalty for doing it at all
7// (CPUs which do have such a penalty do not support SHA insns).
8// For AMD, the penalty is one extra cycle
9// (allegedly: I failed to find measurable difference).
10
11//#define mova128 movdqa
12#define mova128 movaps
13//#define movu128 movdqu
14#define movu128 movups
15//#define shuf128_32 pshufd
16#define shuf128_32 shufps
17
18 .section .text.sha256_process_block64_shaNI, "ax", @progbits
19 .globl sha256_process_block64_shaNI
20 .hidden sha256_process_block64_shaNI
21 .type sha256_process_block64_shaNI, @function
22
23#define DATA_PTR %rdi
24
25#define SHA256CONSTANTS %rax
26
27#define MSG %xmm0
28#define STATE0 %xmm1
29#define STATE1 %xmm2
30#define MSGTMP0 %xmm3
31#define MSGTMP1 %xmm4
32#define MSGTMP2 %xmm5
33#define MSGTMP3 %xmm6
34
35#define XMMTMP %xmm7
36
37#define ABEF_SAVE %xmm9
38#define CDGH_SAVE %xmm10
39
40#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))
41
42 .balign 8 # allow decoders to fetch at least 2 first insns
43sha256_process_block64_shaNI: /* in: %rdi = base pointer; 64 bytes of block data are read at 0..63(%rdi), 8 hash dwords are read/written at 80..111(%rdi) - presumably the sha256_ctx_t*, TODO confirm against the C prototype. NOTE(review): %rdi as first argument and unsaved xmm9/xmm10 match the SysV x86-64 convention - confirm before enabling on other ABIs */
44
45 movu128 80+0*16(%rdi), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */
46 movu128 80+1*16(%rdi), STATE1 /* HGFE */
47/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
48 mova128 STATE1, STATE0
49 shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */
50 shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */
51
52/* XMMTMP holds flip mask from here... */
53 mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP
54 leaq K256+8*16(%rip), SHA256CONSTANTS /* points into the middle of K256: the N*16-8*16 displacements below span -128..+112 (presumably so each fits a 1-byte displacement) */
55
56 /* Save hash values for addition after rounds */
57 mova128 STATE0, ABEF_SAVE
58 mova128 STATE1, CDGH_SAVE
59
60 /* Rounds 0-3 */
61 movu128 0*16(DATA_PTR), MSG
62 pshufb XMMTMP, MSG /* byte-swap each dword of the input block */
63 mova128 MSG, MSGTMP0 /* keep W[0..3] for the message schedule */
64 paddd 0*16-8*16(SHA256CONSTANTS), MSG /* MSG = W + K */
65 sha256rnds2 STATE0, STATE1 /* 2 rounds; the insn reads W+K implicitly from the low 2 dwords of %xmm0 (MSG) */
66 shuf128_32 $0x0E, MSG, MSG /* move dwords 2,3 of MSG down to 0,1 for the next 2 rounds */
67 sha256rnds2 STATE1, STATE0
68
69 /* Rounds 4-7 */
70 movu128 1*16(DATA_PTR), MSG
71 pshufb XMMTMP, MSG
72 mova128 MSG, MSGTMP1
73 paddd 1*16-8*16(SHA256CONSTANTS), MSG
74 sha256rnds2 STATE0, STATE1
75 shuf128_32 $0x0E, MSG, MSG
76 sha256rnds2 STATE1, STATE0
77 sha256msg1 MSGTMP1, MSGTMP0 /* first step of deriving W[16..19] from W[0..3] */
78
79 /* Rounds 8-11 */
80 movu128 2*16(DATA_PTR), MSG
81 pshufb XMMTMP, MSG
82 mova128 MSG, MSGTMP2
83 paddd 2*16-8*16(SHA256CONSTANTS), MSG
84 sha256rnds2 STATE0, STATE1
85 shuf128_32 $0x0E, MSG, MSG
86 sha256rnds2 STATE1, STATE0
87 sha256msg1 MSGTMP2, MSGTMP1
88
89 /* Rounds 12-15 */
90 movu128 3*16(DATA_PTR), MSG
91 pshufb XMMTMP, MSG
92/* ...to here */
93 mova128 MSG, MSGTMP3
94 paddd 3*16-8*16(SHA256CONSTANTS), MSG
95 sha256rnds2 STATE0, STATE1
96 mova128 MSGTMP3, XMMTMP /* XMMTMP is plain scratch from now on */
97 palignr $4, MSGTMP2, XMMTMP /* XMMTMP = W[9..12] */
98 paddd XMMTMP, MSGTMP0
99 sha256msg2 MSGTMP3, MSGTMP0 /* MSGTMP0 = W[16..19] */
100 shuf128_32 $0x0E, MSG, MSG
101 sha256rnds2 STATE1, STATE0
102 sha256msg1 MSGTMP3, MSGTMP2
103
104 /* Rounds 16-19 */
105 mova128 MSGTMP0, MSG
106 paddd 4*16-8*16(SHA256CONSTANTS), MSG
107 sha256rnds2 STATE0, STATE1
108 mova128 MSGTMP0, XMMTMP
109 palignr $4, MSGTMP3, XMMTMP
110 paddd XMMTMP, MSGTMP1
111 sha256msg2 MSGTMP0, MSGTMP1
112 shuf128_32 $0x0E, MSG, MSG
113 sha256rnds2 STATE1, STATE0
114 sha256msg1 MSGTMP0, MSGTMP3
115
116 /* Rounds 20-23 */
117 mova128 MSGTMP1, MSG
118 paddd 5*16-8*16(SHA256CONSTANTS), MSG
119 sha256rnds2 STATE0, STATE1
120 mova128 MSGTMP1, XMMTMP
121 palignr $4, MSGTMP0, XMMTMP
122 paddd XMMTMP, MSGTMP2
123 sha256msg2 MSGTMP1, MSGTMP2
124 shuf128_32 $0x0E, MSG, MSG
125 sha256rnds2 STATE1, STATE0
126 sha256msg1 MSGTMP1, MSGTMP0
127
128 /* Rounds 24-27 */
129 mova128 MSGTMP2, MSG
130 paddd 6*16-8*16(SHA256CONSTANTS), MSG
131 sha256rnds2 STATE0, STATE1
132 mova128 MSGTMP2, XMMTMP
133 palignr $4, MSGTMP1, XMMTMP
134 paddd XMMTMP, MSGTMP3
135 sha256msg2 MSGTMP2, MSGTMP3
136 shuf128_32 $0x0E, MSG, MSG
137 sha256rnds2 STATE1, STATE0
138 sha256msg1 MSGTMP2, MSGTMP1
139
140 /* Rounds 28-31 */
141 mova128 MSGTMP3, MSG
142 paddd 7*16-8*16(SHA256CONSTANTS), MSG
143 sha256rnds2 STATE0, STATE1
144 mova128 MSGTMP3, XMMTMP
145 palignr $4, MSGTMP2, XMMTMP
146 paddd XMMTMP, MSGTMP0
147 sha256msg2 MSGTMP3, MSGTMP0
148 shuf128_32 $0x0E, MSG, MSG
149 sha256rnds2 STATE1, STATE0
150 sha256msg1 MSGTMP3, MSGTMP2
151
152 /* Rounds 32-35 */
153 mova128 MSGTMP0, MSG
154 paddd 8*16-8*16(SHA256CONSTANTS), MSG
155 sha256rnds2 STATE0, STATE1
156 mova128 MSGTMP0, XMMTMP
157 palignr $4, MSGTMP3, XMMTMP
158 paddd XMMTMP, MSGTMP1
159 sha256msg2 MSGTMP0, MSGTMP1
160 shuf128_32 $0x0E, MSG, MSG
161 sha256rnds2 STATE1, STATE0
162 sha256msg1 MSGTMP0, MSGTMP3
163
164 /* Rounds 36-39 */
165 mova128 MSGTMP1, MSG
166 paddd 9*16-8*16(SHA256CONSTANTS), MSG
167 sha256rnds2 STATE0, STATE1
168 mova128 MSGTMP1, XMMTMP
169 palignr $4, MSGTMP0, XMMTMP
170 paddd XMMTMP, MSGTMP2
171 sha256msg2 MSGTMP1, MSGTMP2
172 shuf128_32 $0x0E, MSG, MSG
173 sha256rnds2 STATE1, STATE0
174 sha256msg1 MSGTMP1, MSGTMP0
175
176 /* Rounds 40-43 */
177 mova128 MSGTMP2, MSG
178 paddd 10*16-8*16(SHA256CONSTANTS), MSG
179 sha256rnds2 STATE0, STATE1
180 mova128 MSGTMP2, XMMTMP
181 palignr $4, MSGTMP1, XMMTMP
182 paddd XMMTMP, MSGTMP3
183 sha256msg2 MSGTMP2, MSGTMP3
184 shuf128_32 $0x0E, MSG, MSG
185 sha256rnds2 STATE1, STATE0
186 sha256msg1 MSGTMP2, MSGTMP1
187
188 /* Rounds 44-47 */
189 mova128 MSGTMP3, MSG
190 paddd 11*16-8*16(SHA256CONSTANTS), MSG
191 sha256rnds2 STATE0, STATE1
192 mova128 MSGTMP3, XMMTMP
193 palignr $4, MSGTMP2, XMMTMP
194 paddd XMMTMP, MSGTMP0
195 sha256msg2 MSGTMP3, MSGTMP0
196 shuf128_32 $0x0E, MSG, MSG
197 sha256rnds2 STATE1, STATE0
198 sha256msg1 MSGTMP3, MSGTMP2
199
200 /* Rounds 48-51 */
201 mova128 MSGTMP0, MSG
202 paddd 12*16-8*16(SHA256CONSTANTS), MSG
203 sha256rnds2 STATE0, STATE1
204 mova128 MSGTMP0, XMMTMP
205 palignr $4, MSGTMP3, XMMTMP
206 paddd XMMTMP, MSGTMP1
207 sha256msg2 MSGTMP0, MSGTMP1
208 shuf128_32 $0x0E, MSG, MSG
209 sha256rnds2 STATE1, STATE0
210 sha256msg1 MSGTMP0, MSGTMP3
211
212 /* Rounds 52-55 */
213 mova128 MSGTMP1, MSG
214 paddd 13*16-8*16(SHA256CONSTANTS), MSG
215 sha256rnds2 STATE0, STATE1
216 mova128 MSGTMP1, XMMTMP
217 palignr $4, MSGTMP0, XMMTMP
218 paddd XMMTMP, MSGTMP2
219 sha256msg2 MSGTMP1, MSGTMP2
220 shuf128_32 $0x0E, MSG, MSG
221 sha256rnds2 STATE1, STATE0 /* no sha256msg1 from this group on: no further schedule words are started */
222
223 /* Rounds 56-59 */
224 mova128 MSGTMP2, MSG
225 paddd 14*16-8*16(SHA256CONSTANTS), MSG
226 sha256rnds2 STATE0, STATE1
227 mova128 MSGTMP2, XMMTMP
228 palignr $4, MSGTMP1, XMMTMP
229 paddd XMMTMP, MSGTMP3
230 sha256msg2 MSGTMP2, MSGTMP3
231 shuf128_32 $0x0E, MSG, MSG
232 sha256rnds2 STATE1, STATE0
233
234 /* Rounds 60-63 */
235 mova128 MSGTMP3, MSG
236 paddd 15*16-8*16(SHA256CONSTANTS), MSG
237 sha256rnds2 STATE0, STATE1
238 shuf128_32 $0x0E, MSG, MSG
239 sha256rnds2 STATE1, STATE0
240
241 /* Add current hash values with previously saved */
242 paddd ABEF_SAVE, STATE0
243 paddd CDGH_SAVE, STATE1
244
245 /* Write hash values back in the correct order */
246 /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */
247 /* STATE1: CDGH */
248 mova128 STATE0, XMMTMP
249/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
250 shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */
251 shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */
252
253 movu128 STATE0, 80+0*16(%rdi) /* the sums were already formed above, so this is a plain store in memory order */
254 movu128 XMMTMP, 80+1*16(%rdi)
255
256 ret
257 .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
258
259 .section .rodata.cst256.K256, "aM", @progbits, 256
260 .balign 16
261K256:
262 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
263 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
264 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
265 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
266 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
267 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
268 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
269 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
270 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
271 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
272 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
273 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
274 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
275 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
276 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
277 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
278
279 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
280 .balign 16
281PSHUFFLE_BSWAP32_FLIP_MASK:
282 .octa 0x0c0d0e0f08090a0b0405060700010203
283
284#endif
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S
index 166cfd38a..a61b3cbed 100644
--- a/libbb/hash_md5_sha_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha_x86-32_shaNI.S
@@ -20,7 +20,7 @@
20#define extr128_32 pextrd 20#define extr128_32 pextrd
21//#define extr128_32 extractps # not shorter 21//#define extr128_32 extractps # not shorter
22 22
23 .section .text.sha1_process_block64_shaNI,"ax",@progbits 23 .section .text.sha1_process_block64_shaNI, "ax", @progbits
24 .globl sha1_process_block64_shaNI 24 .globl sha1_process_block64_shaNI
25 .hidden sha1_process_block64_shaNI 25 .hidden sha1_process_block64_shaNI
26 .type sha1_process_block64_shaNI, @function 26 .type sha1_process_block64_shaNI, @function
@@ -32,45 +32,42 @@
32#define MSG1 %xmm4 32#define MSG1 %xmm4
33#define MSG2 %xmm5 33#define MSG2 %xmm5
34#define MSG3 %xmm6 34#define MSG3 %xmm6
35#define SHUF_MASK %xmm7
36 35
37 .balign 8 # allow decoders to fetch at least 3 first insns 36 .balign 8 # allow decoders to fetch at least 2 first insns
38sha1_process_block64_shaNI: 37sha1_process_block64_shaNI:
39 pushl %ebp
40 movl %esp, %ebp
41 subl $32, %esp
42 andl $~0xF, %esp # paddd needs aligned memory operand
43
44 /* load initial hash values */ 38 /* load initial hash values */
45 xor128 E0, E0
46 movu128 76(%eax), ABCD 39 movu128 76(%eax), ABCD
40 xor128 E0, E0
47 pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word 41 pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word
48 shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD 42 shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD
49 43
50 mova128 PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK 44 mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7
45
46 movu128 0*16(%eax), MSG0
47 pshufb %xmm7, MSG0
48 movu128 1*16(%eax), MSG1
49 pshufb %xmm7, MSG1
50 movu128 2*16(%eax), MSG2
51 pshufb %xmm7, MSG2
52 movu128 3*16(%eax), MSG3
53 pshufb %xmm7, MSG3
51 54
52 /* Save hash values for addition after rounds */ 55 /* Save hash values for addition after rounds */
53 movu128 E0, 16(%esp) 56 mova128 E0, %xmm7
54 movu128 ABCD, (%esp) 57 /*mova128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */
55 58
56 /* Rounds 0-3 */ 59 /* Rounds 0-3 */
57 movu128 0*16(%eax), MSG0
58 pshufb SHUF_MASK, MSG0
59 paddd MSG0, E0 60 paddd MSG0, E0
60 mova128 ABCD, E1 61 mova128 ABCD, E1
61 sha1rnds4 $0, E0, ABCD 62 sha1rnds4 $0, E0, ABCD
62 63
63 /* Rounds 4-7 */ 64 /* Rounds 4-7 */
64 movu128 1*16(%eax), MSG1
65 pshufb SHUF_MASK, MSG1
66 sha1nexte MSG1, E1 65 sha1nexte MSG1, E1
67 mova128 ABCD, E0 66 mova128 ABCD, E0
68 sha1rnds4 $0, E1, ABCD 67 sha1rnds4 $0, E1, ABCD
69 sha1msg1 MSG1, MSG0 68 sha1msg1 MSG1, MSG0
70 69
71 /* Rounds 8-11 */ 70 /* Rounds 8-11 */
72 movu128 2*16(%eax), MSG2
73 pshufb SHUF_MASK, MSG2
74 sha1nexte MSG2, E0 71 sha1nexte MSG2, E0
75 mova128 ABCD, E1 72 mova128 ABCD, E1
76 sha1rnds4 $0, E0, ABCD 73 sha1rnds4 $0, E0, ABCD
@@ -78,8 +75,6 @@ sha1_process_block64_shaNI:
78 xor128 MSG2, MSG0 75 xor128 MSG2, MSG0
79 76
80 /* Rounds 12-15 */ 77 /* Rounds 12-15 */
81 movu128 3*16(%eax), MSG3
82 pshufb SHUF_MASK, MSG3
83 sha1nexte MSG3, E1 78 sha1nexte MSG3, E1
84 mova128 ABCD, E0 79 mova128 ABCD, E0
85 sha1msg2 MSG3, MSG0 80 sha1msg2 MSG3, MSG0
@@ -210,21 +205,21 @@ sha1_process_block64_shaNI:
210 sha1rnds4 $3, E1, ABCD 205 sha1rnds4 $3, E1, ABCD
211 206
212 /* Add current hash values with previously saved */ 207 /* Add current hash values with previously saved */
213 sha1nexte 16(%esp), E0 208 sha1nexte %xmm7, E0
214 paddd (%esp), ABCD 209 /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */
210 movu128 76(%eax), %xmm7 # get original ABCD (not shuffled)...
215 211
216 /* Write hash values back in the correct order */ 212 /* Write hash values back in the correct order */
217 shuf128_32 $0x1B, ABCD, ABCD 213 shuf128_32 $0x1B, ABCD, ABCD
214 paddd %xmm7, ABCD # ...add it to final ABCD
218 movu128 ABCD, 76(%eax) 215 movu128 ABCD, 76(%eax)
219 extr128_32 $3, E0, 76+4*4(%eax) 216 extr128_32 $3, E0, 76+4*4(%eax)
220 217
221 movl %ebp, %esp
222 popl %ebp
223 ret 218 ret
224 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI 219 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
225 220
226.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 221 .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
227.align 16 222 .balign 16
228PSHUFFLE_BYTE_FLIP_MASK: 223PSHUFFLE_BYTE_FLIP_MASK:
229 .octa 0x000102030405060708090a0b0c0d0e0f 224 .octa 0x000102030405060708090a0b0c0d0e0f
230 225
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 87fb616a1..287cfe547 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -1,7 +1,7 @@
1### Generated by hash_md5_sha_x86-64.S.sh ### 1### Generated by hash_md5_sha_x86-64.S.sh ###
2 2
3#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) 3#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
4 .section .text.sha1_process_block64,"ax",@progbits 4 .section .text.sha1_process_block64, "ax", @progbits
5 .globl sha1_process_block64 5 .globl sha1_process_block64
6 .hidden sha1_process_block64 6 .hidden sha1_process_block64
7 .type sha1_process_block64, @function 7 .type sha1_process_block64, @function
@@ -10,7 +10,7 @@
10sha1_process_block64: 10sha1_process_block64:
11 pushq %rbp # 1 byte insn 11 pushq %rbp # 1 byte insn
12 pushq %rbx # 1 byte insn 12 pushq %rbx # 1 byte insn
13 pushq %r15 # 2 byte insn 13# pushq %r15 # 2 byte insn
14 pushq %r14 # 2 byte insn 14 pushq %r14 # 2 byte insn
15 pushq %r13 # 2 byte insn 15 pushq %r13 # 2 byte insn
16 pushq %r12 # 2 byte insn 16 pushq %r12 # 2 byte insn
@@ -19,17 +19,13 @@ sha1_process_block64:
19#Register and stack use: 19#Register and stack use:
20# eax..edx: a..d 20# eax..edx: a..d
21# ebp: e 21# ebp: e
22# esi,edi: temps 22# esi,edi,r8..r14: temps
23# -32+4*n(%rsp),r8...r15: W[0..7,8..15] 23# r15: unused
24# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) 24# xmm0..xmm3: W[]
25 movl $3, %eax 25# xmm4,xmm5: temps
261: 26# xmm6: current round constant
27 movq (%rdi,%rax,8), %rsi 27# xmm7: all round constants
28 bswapq %rsi 28# -64(%rsp): area for passing RCONST + W[] from vector to integer units
29 rolq $32, %rsi
30 movq %rsi, -32(%rsp,%rax,8)
31 decl %eax
32 jns 1b
33 29
34 movl 80(%rdi), %eax # a = ctx->hash[0] 30 movl 80(%rdi), %eax # a = ctx->hash[0]
35 movl 84(%rdi), %ebx # b = ctx->hash[1] 31 movl 84(%rdi), %ebx # b = ctx->hash[1]
@@ -37,587 +33,760 @@ sha1_process_block64:
37 movl 92(%rdi), %edx # d = ctx->hash[3] 33 movl 92(%rdi), %edx # d = ctx->hash[3]
38 movl 96(%rdi), %ebp # e = ctx->hash[4] 34 movl 96(%rdi), %ebp # e = ctx->hash[4]
39 35
40 movq 4*8(%rdi), %r8 36 movaps sha1const(%rip), %xmm7
41 movq 4*10(%rdi), %r10 37 pshufd $0x00, %xmm7, %xmm6
38
39 # Load W[] to xmm registers, byteswapping on the fly.
40 #
41 # For iterations 0..15, we pass W[] in rsi,r8..r14
42 # for use in RD1As instead of spilling them to stack.
43 # We lose parallelized addition of RCONST, but LEA
44 # can do two additions at once, so it is probably a wash.
45 # (We use rsi instead of rN because this makes two
46 # LEAs in two first RD1As shorter by one byte).
47 movq 4*0(%rdi), %rsi
48 movq 4*2(%rdi), %r8
49 bswapq %rsi
42 bswapq %r8 50 bswapq %r8
51 rolq $32, %rsi # rsi = W[1]:W[0]
52 rolq $32, %r8 # r8 = W[3]:W[2]
53 movq %rsi, %xmm0
54 movq %r8, %xmm4
55 punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
56# movaps %xmm0, %xmm4 # add RCONST, spill to stack
57# paddd %xmm6, %xmm4
58# movups %xmm4, -64+16*0(%rsp)
59
60 movq 4*4(%rdi), %r9
61 movq 4*6(%rdi), %r10
62 bswapq %r9
43 bswapq %r10 63 bswapq %r10
44 movq 4*12(%rdi), %r12 64 rolq $32, %r9 # r9 = W[5]:W[4]
45 movq 4*14(%rdi), %r14 65 rolq $32, %r10 # r10 = W[7]:W[6]
66 movq %r9, %xmm1
67 movq %r10, %xmm4
68 punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
69
70 movq 4*8(%rdi), %r11
71 movq 4*10(%rdi), %r12
72 bswapq %r11
46 bswapq %r12 73 bswapq %r12
74 rolq $32, %r11 # r11 = W[9]:W[8]
75 rolq $32, %r12 # r12 = W[11]:W[10]
76 movq %r11, %xmm2
77 movq %r12, %xmm4
78 punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
79
80 movq 4*12(%rdi), %r13
81 movq 4*14(%rdi), %r14
82 bswapq %r13
47 bswapq %r14 83 bswapq %r14
48 movl %r8d, %r9d 84 rolq $32, %r13 # r13 = W[13]:W[12]
49 shrq $32, %r8 85 rolq $32, %r14 # r14 = W[15]:W[14]
50 movl %r10d, %r11d 86 movq %r13, %xmm3
51 shrq $32, %r10 87 movq %r14, %xmm4
52 movl %r12d, %r13d 88 punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
53 shrq $32, %r12
54 movl %r14d, %r15d
55 shrq $32, %r14
56 89
57# 0 90# 0
58 # W[0], already in %esi 91 leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
92 shrq $32, %rsi
59 movl %ecx, %edi # c 93 movl %ecx, %edi # c
60 xorl %edx, %edi # ^d 94 xorl %edx, %edi # ^d
61 andl %ebx, %edi # &b 95 andl %ebx, %edi # &b
62 xorl %edx, %edi # (((c ^ d) & b) ^ d) 96 xorl %edx, %edi # (((c ^ d) & b) ^ d)
63 leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
64 addl %edi, %ebp # e += (((c ^ d) & b) ^ d) 97 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
65 movl %eax, %esi # 98 movl %eax, %edi #
66 roll $5, %esi # rotl32(a,5) 99 roll $5, %edi # rotl32(a,5)
67 addl %esi, %ebp # e += rotl32(a,5) 100 addl %edi, %ebp # e += rotl32(a,5)
68 rorl $2, %ebx # b = rotl32(b,30) 101 rorl $2, %ebx # b = rotl32(b,30)
69# 1 102# 1
70 movl -32+4*1(%rsp), %esi # W[n] 103 leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
71 movl %ebx, %edi # c 104 movl %ebx, %edi # c
72 xorl %ecx, %edi # ^d 105 xorl %ecx, %edi # ^d
73 andl %eax, %edi # &b 106 andl %eax, %edi # &b
74 xorl %ecx, %edi # (((c ^ d) & b) ^ d) 107 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
75 leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
76 addl %edi, %edx # e += (((c ^ d) & b) ^ d) 108 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
77 movl %ebp, %esi # 109 movl %ebp, %edi #
78 roll $5, %esi # rotl32(a,5) 110 roll $5, %edi # rotl32(a,5)
79 addl %esi, %edx # e += rotl32(a,5) 111 addl %edi, %edx # e += rotl32(a,5)
80 rorl $2, %eax # b = rotl32(b,30) 112 rorl $2, %eax # b = rotl32(b,30)
81# 2 113# 2
82 movl -32+4*2(%rsp), %esi # W[n] 114 leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n]
115 shrq $32, %r8
83 movl %eax, %edi # c 116 movl %eax, %edi # c
84 xorl %ebx, %edi # ^d 117 xorl %ebx, %edi # ^d
85 andl %ebp, %edi # &b 118 andl %ebp, %edi # &b
86 xorl %ebx, %edi # (((c ^ d) & b) ^ d) 119 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
87 leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n]
88 addl %edi, %ecx # e += (((c ^ d) & b) ^ d) 120 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
89 movl %edx, %esi # 121 movl %edx, %edi #
90 roll $5, %esi # rotl32(a,5) 122 roll $5, %edi # rotl32(a,5)
91 addl %esi, %ecx # e += rotl32(a,5) 123 addl %edi, %ecx # e += rotl32(a,5)
92 rorl $2, %ebp # b = rotl32(b,30) 124 rorl $2, %ebp # b = rotl32(b,30)
93# 3 125# 3
94 movl -32+4*3(%rsp), %esi # W[n] 126 leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
95 movl %ebp, %edi # c 127 movl %ebp, %edi # c
96 xorl %eax, %edi # ^d 128 xorl %eax, %edi # ^d
97 andl %edx, %edi # &b 129 andl %edx, %edi # &b
98 xorl %eax, %edi # (((c ^ d) & b) ^ d) 130 xorl %eax, %edi # (((c ^ d) & b) ^ d)
99 leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n]
100 addl %edi, %ebx # e += (((c ^ d) & b) ^ d) 131 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
101 movl %ecx, %esi # 132 movl %ecx, %edi #
102 roll $5, %esi # rotl32(a,5) 133 roll $5, %edi # rotl32(a,5)
103 addl %esi, %ebx # e += rotl32(a,5) 134 addl %edi, %ebx # e += rotl32(a,5)
104 rorl $2, %edx # b = rotl32(b,30) 135 rorl $2, %edx # b = rotl32(b,30)
105# 4 136# 4
106 movl -32+4*4(%rsp), %esi # W[n] 137 leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
138 shrq $32, %r9
107 movl %edx, %edi # c 139 movl %edx, %edi # c
108 xorl %ebp, %edi # ^d 140 xorl %ebp, %edi # ^d
109 andl %ecx, %edi # &b 141 andl %ecx, %edi # &b
110 xorl %ebp, %edi # (((c ^ d) & b) ^ d) 142 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
111 leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n]
112 addl %edi, %eax # e += (((c ^ d) & b) ^ d) 143 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
113 movl %ebx, %esi # 144 movl %ebx, %edi #
114 roll $5, %esi # rotl32(a,5) 145 roll $5, %edi # rotl32(a,5)
115 addl %esi, %eax # e += rotl32(a,5) 146 addl %edi, %eax # e += rotl32(a,5)
116 rorl $2, %ecx # b = rotl32(b,30) 147 rorl $2, %ecx # b = rotl32(b,30)
117# 5 148# 5
118 movl -32+4*5(%rsp), %esi # W[n] 149 leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n]
119 movl %ecx, %edi # c 150 movl %ecx, %edi # c
120 xorl %edx, %edi # ^d 151 xorl %edx, %edi # ^d
121 andl %ebx, %edi # &b 152 andl %ebx, %edi # &b
122 xorl %edx, %edi # (((c ^ d) & b) ^ d) 153 xorl %edx, %edi # (((c ^ d) & b) ^ d)
123 leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
124 addl %edi, %ebp # e += (((c ^ d) & b) ^ d) 154 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
125 movl %eax, %esi # 155 movl %eax, %edi #
126 roll $5, %esi # rotl32(a,5) 156 roll $5, %edi # rotl32(a,5)
127 addl %esi, %ebp # e += rotl32(a,5) 157 addl %edi, %ebp # e += rotl32(a,5)
128 rorl $2, %ebx # b = rotl32(b,30) 158 rorl $2, %ebx # b = rotl32(b,30)
129# 6 159# 6
130 movl -32+4*6(%rsp), %esi # W[n] 160 leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n]
161 shrq $32, %r10
131 movl %ebx, %edi # c 162 movl %ebx, %edi # c
132 xorl %ecx, %edi # ^d 163 xorl %ecx, %edi # ^d
133 andl %eax, %edi # &b 164 andl %eax, %edi # &b
134 xorl %ecx, %edi # (((c ^ d) & b) ^ d) 165 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
135 leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
136 addl %edi, %edx # e += (((c ^ d) & b) ^ d) 166 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
137 movl %ebp, %esi # 167 movl %ebp, %edi #
138 roll $5, %esi # rotl32(a,5) 168 roll $5, %edi # rotl32(a,5)
139 addl %esi, %edx # e += rotl32(a,5) 169 addl %edi, %edx # e += rotl32(a,5)
140 rorl $2, %eax # b = rotl32(b,30) 170 rorl $2, %eax # b = rotl32(b,30)
141# 7 171# 7
142 movl -32+4*7(%rsp), %esi # W[n] 172 leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n]
143 movl %eax, %edi # c 173 movl %eax, %edi # c
144 xorl %ebx, %edi # ^d 174 xorl %ebx, %edi # ^d
145 andl %ebp, %edi # &b 175 andl %ebp, %edi # &b
146 xorl %ebx, %edi # (((c ^ d) & b) ^ d) 176 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
147 leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n]
148 addl %edi, %ecx # e += (((c ^ d) & b) ^ d) 177 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
149 movl %edx, %esi # 178 movl %edx, %edi #
150 roll $5, %esi # rotl32(a,5) 179 roll $5, %edi # rotl32(a,5)
151 addl %esi, %ecx # e += rotl32(a,5) 180 addl %edi, %ecx # e += rotl32(a,5)
152 rorl $2, %ebp # b = rotl32(b,30) 181 rorl $2, %ebp # b = rotl32(b,30)
182# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
183 movaps %xmm3, %xmm4
184 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
185# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
186# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
187# same result as above, but shorter and faster:
188# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
189# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
190 movaps %xmm0, %xmm5
191 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
192 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
193 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
194 xorps %xmm5, %xmm0 # ^
195 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
196 movaps %xmm0, %xmm5
197 xorps %xmm4, %xmm4 # rol(W0,1):
198 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
199 paddd %xmm0, %xmm0 # shift left by 1
200 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
201 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
202 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
203 movaps %xmm5, %xmm4
204 pslld $2, %xmm5
205 psrld $30, %xmm4
206# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
207 xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
208 xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
209 movaps %xmm0, %xmm5
210 paddd %xmm6, %xmm5
211 movups %xmm5, -64+16*0(%rsp)
153# 8 212# 8
154 # W[n], in %r8 213 leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n]
214 shrq $32, %r11
155 movl %ebp, %edi # c 215 movl %ebp, %edi # c
156 xorl %eax, %edi # ^d 216 xorl %eax, %edi # ^d
157 andl %edx, %edi # &b 217 andl %edx, %edi # &b
158 xorl %eax, %edi # (((c ^ d) & b) ^ d) 218 xorl %eax, %edi # (((c ^ d) & b) ^ d)
159 leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
160 addl %edi, %ebx # e += (((c ^ d) & b) ^ d) 219 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
161 movl %ecx, %esi # 220 movl %ecx, %edi #
162 roll $5, %esi # rotl32(a,5) 221 roll $5, %edi # rotl32(a,5)
163 addl %esi, %ebx # e += rotl32(a,5) 222 addl %edi, %ebx # e += rotl32(a,5)
164 rorl $2, %edx # b = rotl32(b,30) 223 rorl $2, %edx # b = rotl32(b,30)
165# 9 224# 9
166 # W[n], in %r9 225 leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n]
167 movl %edx, %edi # c 226 movl %edx, %edi # c
168 xorl %ebp, %edi # ^d 227 xorl %ebp, %edi # ^d
169 andl %ecx, %edi # &b 228 andl %ecx, %edi # &b
170 xorl %ebp, %edi # (((c ^ d) & b) ^ d) 229 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
171 leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
172 addl %edi, %eax # e += (((c ^ d) & b) ^ d) 230 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
173 movl %ebx, %esi # 231 movl %ebx, %edi #
174 roll $5, %esi # rotl32(a,5) 232 roll $5, %edi # rotl32(a,5)
175 addl %esi, %eax # e += rotl32(a,5) 233 addl %edi, %eax # e += rotl32(a,5)
176 rorl $2, %ecx # b = rotl32(b,30) 234 rorl $2, %ecx # b = rotl32(b,30)
177# 10 235# 10
178 # W[n], in %r10 236 leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n]
237 shrq $32, %r12
179 movl %ecx, %edi # c 238 movl %ecx, %edi # c
180 xorl %edx, %edi # ^d 239 xorl %edx, %edi # ^d
181 andl %ebx, %edi # &b 240 andl %ebx, %edi # &b
182 xorl %edx, %edi # (((c ^ d) & b) ^ d) 241 xorl %edx, %edi # (((c ^ d) & b) ^ d)
183 leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n]
184 addl %edi, %ebp # e += (((c ^ d) & b) ^ d) 242 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
185 movl %eax, %esi # 243 movl %eax, %edi #
186 roll $5, %esi # rotl32(a,5) 244 roll $5, %edi # rotl32(a,5)
187 addl %esi, %ebp # e += rotl32(a,5) 245 addl %edi, %ebp # e += rotl32(a,5)
188 rorl $2, %ebx # b = rotl32(b,30) 246 rorl $2, %ebx # b = rotl32(b,30)
189# 11 247# 11
190 # W[n], in %r11 248 leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n]
191 movl %ebx, %edi # c 249 movl %ebx, %edi # c
192 xorl %ecx, %edi # ^d 250 xorl %ecx, %edi # ^d
193 andl %eax, %edi # &b 251 andl %eax, %edi # &b
194 xorl %ecx, %edi # (((c ^ d) & b) ^ d) 252 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
195 leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n]
196 addl %edi, %edx # e += (((c ^ d) & b) ^ d) 253 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
197 movl %ebp, %esi # 254 movl %ebp, %edi #
198 roll $5, %esi # rotl32(a,5) 255 roll $5, %edi # rotl32(a,5)
199 addl %esi, %edx # e += rotl32(a,5) 256 addl %edi, %edx # e += rotl32(a,5)
200 rorl $2, %eax # b = rotl32(b,30) 257 rorl $2, %eax # b = rotl32(b,30)
258 pshufd $0x55, %xmm7, %xmm6
259# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
260 movaps %xmm0, %xmm4
261 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
262# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
263# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
264# same result as above, but shorter and faster:
265# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
266# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
267 movaps %xmm1, %xmm5
268 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
269 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
270 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
271 xorps %xmm5, %xmm1 # ^
272 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
273 movaps %xmm1, %xmm5
274 xorps %xmm4, %xmm4 # rol(W0,1):
275 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
276 paddd %xmm1, %xmm1 # shift left by 1
277 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
278 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
279 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
280 movaps %xmm5, %xmm4
281 pslld $2, %xmm5
282 psrld $30, %xmm4
283# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
284 xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
285 xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
286 movaps %xmm1, %xmm5
287 paddd %xmm6, %xmm5
288 movups %xmm5, -64+16*1(%rsp)
201# 12 289# 12
202 # W[n], in %r12 290 leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n]
291 shrq $32, %r13
203 movl %eax, %edi # c 292 movl %eax, %edi # c
204 xorl %ebx, %edi # ^d 293 xorl %ebx, %edi # ^d
205 andl %ebp, %edi # &b 294 andl %ebp, %edi # &b
206 xorl %ebx, %edi # (((c ^ d) & b) ^ d) 295 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
207 leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n]
208 addl %edi, %ecx # e += (((c ^ d) & b) ^ d) 296 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
209 movl %edx, %esi # 297 movl %edx, %edi #
210 roll $5, %esi # rotl32(a,5) 298 roll $5, %edi # rotl32(a,5)
211 addl %esi, %ecx # e += rotl32(a,5) 299 addl %edi, %ecx # e += rotl32(a,5)
212 rorl $2, %ebp # b = rotl32(b,30) 300 rorl $2, %ebp # b = rotl32(b,30)
213# 13 301# 13
214 # W[n], in %r13 302 leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
215 movl %ebp, %edi # c 303 movl %ebp, %edi # c
216 xorl %eax, %edi # ^d 304 xorl %eax, %edi # ^d
217 andl %edx, %edi # &b 305 andl %edx, %edi # &b
218 xorl %eax, %edi # (((c ^ d) & b) ^ d) 306 xorl %eax, %edi # (((c ^ d) & b) ^ d)
219 leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
220 addl %edi, %ebx # e += (((c ^ d) & b) ^ d) 307 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
221 movl %ecx, %esi # 308 movl %ecx, %edi #
222 roll $5, %esi # rotl32(a,5) 309 roll $5, %edi # rotl32(a,5)
223 addl %esi, %ebx # e += rotl32(a,5) 310 addl %edi, %ebx # e += rotl32(a,5)
224 rorl $2, %edx # b = rotl32(b,30) 311 rorl $2, %edx # b = rotl32(b,30)
225# 14 312# 14
226 # W[n], in %r14 313 leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
314 shrq $32, %r14
227 movl %edx, %edi # c 315 movl %edx, %edi # c
228 xorl %ebp, %edi # ^d 316 xorl %ebp, %edi # ^d
229 andl %ecx, %edi # &b 317 andl %ecx, %edi # &b
230 xorl %ebp, %edi # (((c ^ d) & b) ^ d) 318 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
231 leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
232 addl %edi, %eax # e += (((c ^ d) & b) ^ d) 319 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
233 movl %ebx, %esi # 320 movl %ebx, %edi #
234 roll $5, %esi # rotl32(a,5) 321 roll $5, %edi # rotl32(a,5)
235 addl %esi, %eax # e += rotl32(a,5) 322 addl %edi, %eax # e += rotl32(a,5)
236 rorl $2, %ecx # b = rotl32(b,30) 323 rorl $2, %ecx # b = rotl32(b,30)
237# 15 324# 15
238 # W[n], in %r15 325 leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n]
239 movl %ecx, %edi # c 326 movl %ecx, %edi # c
240 xorl %edx, %edi # ^d 327 xorl %edx, %edi # ^d
241 andl %ebx, %edi # &b 328 andl %ebx, %edi # &b
242 xorl %edx, %edi # (((c ^ d) & b) ^ d) 329 xorl %edx, %edi # (((c ^ d) & b) ^ d)
243 leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n]
244 addl %edi, %ebp # e += (((c ^ d) & b) ^ d) 330 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
245 movl %eax, %esi # 331 movl %eax, %edi #
246 roll $5, %esi # rotl32(a,5) 332 roll $5, %edi # rotl32(a,5)
247 addl %esi, %ebp # e += rotl32(a,5) 333 addl %edi, %ebp # e += rotl32(a,5)
248 rorl $2, %ebx # b = rotl32(b,30) 334 rorl $2, %ebx # b = rotl32(b,30)
335# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
336 movaps %xmm1, %xmm4
337 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
338# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
339# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
340# same result as above, but shorter and faster:
341# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
342# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
343 movaps %xmm2, %xmm5
344 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
345 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
346 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
347 xorps %xmm5, %xmm2 # ^
348 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
349 movaps %xmm2, %xmm5
350 xorps %xmm4, %xmm4 # rol(W0,1):
351 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
352 paddd %xmm2, %xmm2 # shift left by 1
353 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
354 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
355 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
356 movaps %xmm5, %xmm4
357 pslld $2, %xmm5
358 psrld $30, %xmm4
359# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
360 xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
361 xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
362 movaps %xmm2, %xmm5
363 paddd %xmm6, %xmm5
364 movups %xmm5, -64+16*2(%rsp)
249# 16 365# 16
250 movl %r13d, %esi # W[(n+13) & 15]
251 xorl %r8d, %esi # ^W[(n+8) & 15]
252 xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15]
253 xorl -32+4*0(%rsp), %esi # ^W[n & 15]
254 roll %esi #
255 movl %esi, -32+4*0(%rsp) # store to W[n & 15]
256 movl %ebx, %edi # c 366 movl %ebx, %edi # c
257 xorl %ecx, %edi # ^d 367 xorl %ecx, %edi # ^d
258 andl %eax, %edi # &b 368 andl %eax, %edi # &b
259 xorl %ecx, %edi # (((c ^ d) & b) ^ d) 369 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
260 leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n & 15] 370 addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15]
261 addl %edi, %edx # e += (((c ^ d) & b) ^ d) 371 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
262 movl %ebp, %esi # 372 movl %ebp, %esi #
263 roll $5, %esi # rotl32(a,5) 373 roll $5, %esi # rotl32(a,5)
264 addl %esi, %edx # e += rotl32(a,5) 374 addl %esi, %edx # e += rotl32(a,5)
265 rorl $2, %eax # b = rotl32(b,30) 375 rorl $2, %eax # b = rotl32(b,30)
266# 17 376# 17
267 movl %r14d, %esi # W[(n+13) & 15]
268 xorl %r9d, %esi # ^W[(n+8) & 15]
269 xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15]
270 xorl -32+4*1(%rsp), %esi # ^W[n & 15]
271 roll %esi #
272 movl %esi, -32+4*1(%rsp) # store to W[n & 15]
273 movl %eax, %edi # c 377 movl %eax, %edi # c
274 xorl %ebx, %edi # ^d 378 xorl %ebx, %edi # ^d
275 andl %ebp, %edi # &b 379 andl %ebp, %edi # &b
276 xorl %ebx, %edi # (((c ^ d) & b) ^ d) 380 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
277 leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] 381 addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15]
278 addl %edi, %ecx # e += (((c ^ d) & b) ^ d) 382 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
279 movl %edx, %esi # 383 movl %edx, %esi #
280 roll $5, %esi # rotl32(a,5) 384 roll $5, %esi # rotl32(a,5)
281 addl %esi, %ecx # e += rotl32(a,5) 385 addl %esi, %ecx # e += rotl32(a,5)
282 rorl $2, %ebp # b = rotl32(b,30) 386 rorl $2, %ebp # b = rotl32(b,30)
283# 18 387# 18
284 movl %r15d, %esi # W[(n+13) & 15]
285 xorl %r10d, %esi # ^W[(n+8) & 15]
286 xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15]
287 xorl -32+4*2(%rsp), %esi # ^W[n & 15]
288 roll %esi #
289 movl %esi, -32+4*2(%rsp) # store to W[n & 15]
290 movl %ebp, %edi # c 388 movl %ebp, %edi # c
291 xorl %eax, %edi # ^d 389 xorl %eax, %edi # ^d
292 andl %edx, %edi # &b 390 andl %edx, %edi # &b
293 xorl %eax, %edi # (((c ^ d) & b) ^ d) 391 xorl %eax, %edi # (((c ^ d) & b) ^ d)
294 leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 392 addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15]
295 addl %edi, %ebx # e += (((c ^ d) & b) ^ d) 393 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
296 movl %ecx, %esi # 394 movl %ecx, %esi #
297 roll $5, %esi # rotl32(a,5) 395 roll $5, %esi # rotl32(a,5)
298 addl %esi, %ebx # e += rotl32(a,5) 396 addl %esi, %ebx # e += rotl32(a,5)
299 rorl $2, %edx # b = rotl32(b,30) 397 rorl $2, %edx # b = rotl32(b,30)
300# 19 398# 19
301 movl -32+4*0(%rsp), %esi # W[(n+13) & 15]
302 xorl %r11d, %esi # ^W[(n+8) & 15]
303 xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15]
304 xorl -32+4*3(%rsp), %esi # ^W[n & 15]
305 roll %esi #
306 movl %esi, -32+4*3(%rsp) # store to W[n & 15]
307 movl %edx, %edi # c 399 movl %edx, %edi # c
308 xorl %ebp, %edi # ^d 400 xorl %ebp, %edi # ^d
309 andl %ecx, %edi # &b 401 andl %ecx, %edi # &b
310 xorl %ebp, %edi # (((c ^ d) & b) ^ d) 402 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
311 leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n & 15] 403 addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15]
312 addl %edi, %eax # e += (((c ^ d) & b) ^ d) 404 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
313 movl %ebx, %esi # 405 movl %ebx, %esi #
314 roll $5, %esi # rotl32(a,5) 406 roll $5, %esi # rotl32(a,5)
315 addl %esi, %eax # e += rotl32(a,5) 407 addl %esi, %eax # e += rotl32(a,5)
316 rorl $2, %ecx # b = rotl32(b,30) 408 rorl $2, %ecx # b = rotl32(b,30)
409# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
410 movaps %xmm2, %xmm4
411 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
412# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
413# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
414# same result as above, but shorter and faster:
415# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
416# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
417 movaps %xmm3, %xmm5
418 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
419 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
420 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
421 xorps %xmm5, %xmm3 # ^
422 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
423 movaps %xmm3, %xmm5
424 xorps %xmm4, %xmm4 # rol(W0,1):
425 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
426 paddd %xmm3, %xmm3 # shift left by 1
427 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
428 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
429 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
430 movaps %xmm5, %xmm4
431 pslld $2, %xmm5
432 psrld $30, %xmm4
433# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
434 xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
435 xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
436 movaps %xmm3, %xmm5
437 paddd %xmm6, %xmm5
438 movups %xmm5, -64+16*3(%rsp)
317# 20 439# 20
318 movl -32+4*1(%rsp), %esi # W[(n+13) & 15]
319 xorl %r12d, %esi # ^W[(n+8) & 15]
320 xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15]
321 xorl -32+4*4(%rsp), %esi # ^W[n & 15]
322 roll %esi #
323 movl %esi, -32+4*4(%rsp) # store to W[n & 15]
324 movl %ecx, %edi # c 440 movl %ecx, %edi # c
325 xorl %edx, %edi # ^d 441 xorl %edx, %edi # ^d
326 xorl %ebx, %edi # ^b 442 xorl %ebx, %edi # ^b
327 leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] 443 addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15]
328 addl %edi, %ebp # e += (c ^ d ^ b) 444 addl %edi, %ebp # e += (c ^ d ^ b)
329 movl %eax, %esi # 445 movl %eax, %esi #
330 roll $5, %esi # rotl32(a,5) 446 roll $5, %esi # rotl32(a,5)
331 addl %esi, %ebp # e += rotl32(a,5) 447 addl %esi, %ebp # e += rotl32(a,5)
332 rorl $2, %ebx # b = rotl32(b,30) 448 rorl $2, %ebx # b = rotl32(b,30)
333# 21 449# 21
334 movl -32+4*2(%rsp), %esi # W[(n+13) & 15]
335 xorl %r13d, %esi # ^W[(n+8) & 15]
336 xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15]
337 xorl -32+4*5(%rsp), %esi # ^W[n & 15]
338 roll %esi #
339 movl %esi, -32+4*5(%rsp) # store to W[n & 15]
340 movl %ebx, %edi # c 450 movl %ebx, %edi # c
341 xorl %ecx, %edi # ^d 451 xorl %ecx, %edi # ^d
342 xorl %eax, %edi # ^b 452 xorl %eax, %edi # ^b
343 leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] 453 addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15]
344 addl %edi, %edx # e += (c ^ d ^ b) 454 addl %edi, %edx # e += (c ^ d ^ b)
345 movl %ebp, %esi # 455 movl %ebp, %esi #
346 roll $5, %esi # rotl32(a,5) 456 roll $5, %esi # rotl32(a,5)
347 addl %esi, %edx # e += rotl32(a,5) 457 addl %esi, %edx # e += rotl32(a,5)
348 rorl $2, %eax # b = rotl32(b,30) 458 rorl $2, %eax # b = rotl32(b,30)
349# 22 459# 22
350 movl -32+4*3(%rsp), %esi # W[(n+13) & 15]
351 xorl %r14d, %esi # ^W[(n+8) & 15]
352 xorl %r8d, %esi # ^W[(n+2) & 15]
353 xorl -32+4*6(%rsp), %esi # ^W[n & 15]
354 roll %esi #
355 movl %esi, -32+4*6(%rsp) # store to W[n & 15]
356 movl %eax, %edi # c 460 movl %eax, %edi # c
357 xorl %ebx, %edi # ^d 461 xorl %ebx, %edi # ^d
358 xorl %ebp, %edi # ^b 462 xorl %ebp, %edi # ^b
359 leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] 463 addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15]
360 addl %edi, %ecx # e += (c ^ d ^ b) 464 addl %edi, %ecx # e += (c ^ d ^ b)
361 movl %edx, %esi # 465 movl %edx, %esi #
362 roll $5, %esi # rotl32(a,5) 466 roll $5, %esi # rotl32(a,5)
363 addl %esi, %ecx # e += rotl32(a,5) 467 addl %esi, %ecx # e += rotl32(a,5)
364 rorl $2, %ebp # b = rotl32(b,30) 468 rorl $2, %ebp # b = rotl32(b,30)
365# 23 469# 23
366 movl -32+4*4(%rsp), %esi # W[(n+13) & 15]
367 xorl %r15d, %esi # ^W[(n+8) & 15]
368 xorl %r9d, %esi # ^W[(n+2) & 15]
369 xorl -32+4*7(%rsp), %esi # ^W[n & 15]
370 roll %esi #
371 movl %esi, -32+4*7(%rsp) # store to W[n & 15]
372 movl %ebp, %edi # c 470 movl %ebp, %edi # c
373 xorl %eax, %edi # ^d 471 xorl %eax, %edi # ^d
374 xorl %edx, %edi # ^b 472 xorl %edx, %edi # ^b
375 leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 473 addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15]
376 addl %edi, %ebx # e += (c ^ d ^ b) 474 addl %edi, %ebx # e += (c ^ d ^ b)
377 movl %ecx, %esi # 475 movl %ecx, %esi #
378 roll $5, %esi # rotl32(a,5) 476 roll $5, %esi # rotl32(a,5)
379 addl %esi, %ebx # e += rotl32(a,5) 477 addl %esi, %ebx # e += rotl32(a,5)
380 rorl $2, %edx # b = rotl32(b,30) 478 rorl $2, %edx # b = rotl32(b,30)
479# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
480 movaps %xmm3, %xmm4
481 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
482# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
483# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
484# same result as above, but shorter and faster:
485# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
486# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
487 movaps %xmm0, %xmm5
488 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
489 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
490 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
491 xorps %xmm5, %xmm0 # ^
492 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
493 movaps %xmm0, %xmm5
494 xorps %xmm4, %xmm4 # rol(W0,1):
495 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
496 paddd %xmm0, %xmm0 # shift left by 1
497 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
498 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
499 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
500 movaps %xmm5, %xmm4
501 pslld $2, %xmm5
502 psrld $30, %xmm4
503# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
504 xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
505 xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
506 movaps %xmm0, %xmm5
507 paddd %xmm6, %xmm5
508 movups %xmm5, -64+16*0(%rsp)
381# 24 509# 24
382 xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15]
383 xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15]
384 xorl %r10d, %r8d # ^W[(n+2) & 15]
385 roll %r8d #
386 movl %edx, %edi # c 510 movl %edx, %edi # c
387 xorl %ebp, %edi # ^d 511 xorl %ebp, %edi # ^d
388 xorl %ecx, %edi # ^b 512 xorl %ecx, %edi # ^b
389 leal 0x6ED9EBA1(%rax,%r8), %eax # e += RCONST + W[n & 15] 513 addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15]
390 addl %edi, %eax # e += (c ^ d ^ b) 514 addl %edi, %eax # e += (c ^ d ^ b)
391 movl %ebx, %esi # 515 movl %ebx, %esi #
392 roll $5, %esi # rotl32(a,5) 516 roll $5, %esi # rotl32(a,5)
393 addl %esi, %eax # e += rotl32(a,5) 517 addl %esi, %eax # e += rotl32(a,5)
394 rorl $2, %ecx # b = rotl32(b,30) 518 rorl $2, %ecx # b = rotl32(b,30)
395# 25 519# 25
396 xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15]
397 xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15]
398 xorl %r11d, %r9d # ^W[(n+2) & 15]
399 roll %r9d #
400 movl %ecx, %edi # c 520 movl %ecx, %edi # c
401 xorl %edx, %edi # ^d 521 xorl %edx, %edi # ^d
402 xorl %ebx, %edi # ^b 522 xorl %ebx, %edi # ^b
403 leal 0x6ED9EBA1(%rbp,%r9), %ebp # e += RCONST + W[n & 15] 523 addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15]
404 addl %edi, %ebp # e += (c ^ d ^ b) 524 addl %edi, %ebp # e += (c ^ d ^ b)
405 movl %eax, %esi # 525 movl %eax, %esi #
406 roll $5, %esi # rotl32(a,5) 526 roll $5, %esi # rotl32(a,5)
407 addl %esi, %ebp # e += rotl32(a,5) 527 addl %esi, %ebp # e += rotl32(a,5)
408 rorl $2, %ebx # b = rotl32(b,30) 528 rorl $2, %ebx # b = rotl32(b,30)
409# 26 529# 26
410 xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15]
411 xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15]
412 xorl %r12d, %r10d # ^W[(n+2) & 15]
413 roll %r10d #
414 movl %ebx, %edi # c 530 movl %ebx, %edi # c
415 xorl %ecx, %edi # ^d 531 xorl %ecx, %edi # ^d
416 xorl %eax, %edi # ^b 532 xorl %eax, %edi # ^b
417 leal 0x6ED9EBA1(%rdx,%r10), %edx # e += RCONST + W[n & 15] 533 addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15]
418 addl %edi, %edx # e += (c ^ d ^ b) 534 addl %edi, %edx # e += (c ^ d ^ b)
419 movl %ebp, %esi # 535 movl %ebp, %esi #
420 roll $5, %esi # rotl32(a,5) 536 roll $5, %esi # rotl32(a,5)
421 addl %esi, %edx # e += rotl32(a,5) 537 addl %esi, %edx # e += rotl32(a,5)
422 rorl $2, %eax # b = rotl32(b,30) 538 rorl $2, %eax # b = rotl32(b,30)
423# 27 539# 27
424 xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15]
425 xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15]
426 xorl %r13d, %r11d # ^W[(n+2) & 15]
427 roll %r11d #
428 movl %eax, %edi # c 540 movl %eax, %edi # c
429 xorl %ebx, %edi # ^d 541 xorl %ebx, %edi # ^d
430 xorl %ebp, %edi # ^b 542 xorl %ebp, %edi # ^b
431 leal 0x6ED9EBA1(%rcx,%r11), %ecx # e += RCONST + W[n & 15] 543 addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15]
432 addl %edi, %ecx # e += (c ^ d ^ b) 544 addl %edi, %ecx # e += (c ^ d ^ b)
433 movl %edx, %esi # 545 movl %edx, %esi #
434 roll $5, %esi # rotl32(a,5) 546 roll $5, %esi # rotl32(a,5)
435 addl %esi, %ecx # e += rotl32(a,5) 547 addl %esi, %ecx # e += rotl32(a,5)
436 rorl $2, %ebp # b = rotl32(b,30) 548 rorl $2, %ebp # b = rotl32(b,30)
549# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
550 movaps %xmm0, %xmm4
551 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
552# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
553# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
554# same result as above, but shorter and faster:
555# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
556# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
557 movaps %xmm1, %xmm5
558 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
559 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
560 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
561 xorps %xmm5, %xmm1 # ^
562 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
563 movaps %xmm1, %xmm5
564 xorps %xmm4, %xmm4 # rol(W0,1):
565 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
566 paddd %xmm1, %xmm1 # shift left by 1
567 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
568 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
569 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
570 movaps %xmm5, %xmm4
571 pslld $2, %xmm5
572 psrld $30, %xmm4
573# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
574 xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
575 xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
576 movaps %xmm1, %xmm5
577 paddd %xmm6, %xmm5
578 movups %xmm5, -64+16*1(%rsp)
437# 28 579# 28
438 xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15]
439 xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15]
440 xorl %r14d, %r12d # ^W[(n+2) & 15]
441 roll %r12d #
442 movl %ebp, %edi # c 580 movl %ebp, %edi # c
443 xorl %eax, %edi # ^d 581 xorl %eax, %edi # ^d
444 xorl %edx, %edi # ^b 582 xorl %edx, %edi # ^b
445 leal 0x6ED9EBA1(%rbx,%r12), %ebx # e += RCONST + W[n & 15] 583 addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15]
446 addl %edi, %ebx # e += (c ^ d ^ b) 584 addl %edi, %ebx # e += (c ^ d ^ b)
447 movl %ecx, %esi # 585 movl %ecx, %esi #
448 roll $5, %esi # rotl32(a,5) 586 roll $5, %esi # rotl32(a,5)
449 addl %esi, %ebx # e += rotl32(a,5) 587 addl %esi, %ebx # e += rotl32(a,5)
450 rorl $2, %edx # b = rotl32(b,30) 588 rorl $2, %edx # b = rotl32(b,30)
451# 29 589# 29
452 xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15]
453 xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15]
454 xorl %r15d, %r13d # ^W[(n+2) & 15]
455 roll %r13d #
456 movl %edx, %edi # c 590 movl %edx, %edi # c
457 xorl %ebp, %edi # ^d 591 xorl %ebp, %edi # ^d
458 xorl %ecx, %edi # ^b 592 xorl %ecx, %edi # ^b
459 leal 0x6ED9EBA1(%rax,%r13), %eax # e += RCONST + W[n & 15] 593 addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15]
460 addl %edi, %eax # e += (c ^ d ^ b) 594 addl %edi, %eax # e += (c ^ d ^ b)
461 movl %ebx, %esi # 595 movl %ebx, %esi #
462 roll $5, %esi # rotl32(a,5) 596 roll $5, %esi # rotl32(a,5)
463 addl %esi, %eax # e += rotl32(a,5) 597 addl %esi, %eax # e += rotl32(a,5)
464 rorl $2, %ecx # b = rotl32(b,30) 598 rorl $2, %ecx # b = rotl32(b,30)
465# 30 599# 30
466 xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15]
467 xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15]
468 xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15]
469 roll %r14d #
470 movl %ecx, %edi # c 600 movl %ecx, %edi # c
471 xorl %edx, %edi # ^d 601 xorl %edx, %edi # ^d
472 xorl %ebx, %edi # ^b 602 xorl %ebx, %edi # ^b
473 leal 0x6ED9EBA1(%rbp,%r14), %ebp # e += RCONST + W[n & 15] 603 addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15]
474 addl %edi, %ebp # e += (c ^ d ^ b) 604 addl %edi, %ebp # e += (c ^ d ^ b)
475 movl %eax, %esi # 605 movl %eax, %esi #
476 roll $5, %esi # rotl32(a,5) 606 roll $5, %esi # rotl32(a,5)
477 addl %esi, %ebp # e += rotl32(a,5) 607 addl %esi, %ebp # e += rotl32(a,5)
478 rorl $2, %ebx # b = rotl32(b,30) 608 rorl $2, %ebx # b = rotl32(b,30)
479# 31 609# 31
480 xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15]
481 xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15]
482 xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15]
483 roll %r15d #
484 movl %ebx, %edi # c 610 movl %ebx, %edi # c
485 xorl %ecx, %edi # ^d 611 xorl %ecx, %edi # ^d
486 xorl %eax, %edi # ^b 612 xorl %eax, %edi # ^b
487 leal 0x6ED9EBA1(%rdx,%r15), %edx # e += RCONST + W[n & 15] 613 addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15]
488 addl %edi, %edx # e += (c ^ d ^ b) 614 addl %edi, %edx # e += (c ^ d ^ b)
489 movl %ebp, %esi # 615 movl %ebp, %esi #
490 roll $5, %esi # rotl32(a,5) 616 roll $5, %esi # rotl32(a,5)
491 addl %esi, %edx # e += rotl32(a,5) 617 addl %esi, %edx # e += rotl32(a,5)
492 rorl $2, %eax # b = rotl32(b,30) 618 rorl $2, %eax # b = rotl32(b,30)
619 pshufd $0xaa, %xmm7, %xmm6
620# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
621 movaps %xmm1, %xmm4
622 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
623# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
624# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
625# same result as above, but shorter and faster:
626# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
627# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
628 movaps %xmm2, %xmm5
629 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
630 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
631 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
632 xorps %xmm5, %xmm2 # ^
633 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
634 movaps %xmm2, %xmm5
635 xorps %xmm4, %xmm4 # rol(W0,1):
636 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
637 paddd %xmm2, %xmm2 # shift left by 1
638 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
639 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
640 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
641 movaps %xmm5, %xmm4
642 pslld $2, %xmm5
643 psrld $30, %xmm4
644# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
645 xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
646 xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
647 movaps %xmm2, %xmm5
648 paddd %xmm6, %xmm5
649 movups %xmm5, -64+16*2(%rsp)
493# 32 650# 32
494 movl %r13d, %esi # W[(n+13) & 15]
495 xorl %r8d, %esi # ^W[(n+8) & 15]
496 xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15]
497 xorl -32+4*0(%rsp), %esi # ^W[n & 15]
498 roll %esi #
499 movl %esi, -32+4*0(%rsp) # store to W[n & 15]
500 movl %eax, %edi # c 651 movl %eax, %edi # c
501 xorl %ebx, %edi # ^d 652 xorl %ebx, %edi # ^d
502 xorl %ebp, %edi # ^b 653 xorl %ebp, %edi # ^b
503 leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] 654 addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15]
504 addl %edi, %ecx # e += (c ^ d ^ b) 655 addl %edi, %ecx # e += (c ^ d ^ b)
505 movl %edx, %esi # 656 movl %edx, %esi #
506 roll $5, %esi # rotl32(a,5) 657 roll $5, %esi # rotl32(a,5)
507 addl %esi, %ecx # e += rotl32(a,5) 658 addl %esi, %ecx # e += rotl32(a,5)
508 rorl $2, %ebp # b = rotl32(b,30) 659 rorl $2, %ebp # b = rotl32(b,30)
509# 33 660# 33
510 movl %r14d, %esi # W[(n+13) & 15]
511 xorl %r9d, %esi # ^W[(n+8) & 15]
512 xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15]
513 xorl -32+4*1(%rsp), %esi # ^W[n & 15]
514 roll %esi #
515 movl %esi, -32+4*1(%rsp) # store to W[n & 15]
516 movl %ebp, %edi # c 661 movl %ebp, %edi # c
517 xorl %eax, %edi # ^d 662 xorl %eax, %edi # ^d
518 xorl %edx, %edi # ^b 663 xorl %edx, %edi # ^b
519 leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 664 addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15]
520 addl %edi, %ebx # e += (c ^ d ^ b) 665 addl %edi, %ebx # e += (c ^ d ^ b)
521 movl %ecx, %esi # 666 movl %ecx, %esi #
522 roll $5, %esi # rotl32(a,5) 667 roll $5, %esi # rotl32(a,5)
523 addl %esi, %ebx # e += rotl32(a,5) 668 addl %esi, %ebx # e += rotl32(a,5)
524 rorl $2, %edx # b = rotl32(b,30) 669 rorl $2, %edx # b = rotl32(b,30)
525# 34 670# 34
526 movl %r15d, %esi # W[(n+13) & 15]
527 xorl %r10d, %esi # ^W[(n+8) & 15]
528 xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15]
529 xorl -32+4*2(%rsp), %esi # ^W[n & 15]
530 roll %esi #
531 movl %esi, -32+4*2(%rsp) # store to W[n & 15]
532 movl %edx, %edi # c 671 movl %edx, %edi # c
533 xorl %ebp, %edi # ^d 672 xorl %ebp, %edi # ^d
534 xorl %ecx, %edi # ^b 673 xorl %ecx, %edi # ^b
535 leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] 674 addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15]
536 addl %edi, %eax # e += (c ^ d ^ b) 675 addl %edi, %eax # e += (c ^ d ^ b)
537 movl %ebx, %esi # 676 movl %ebx, %esi #
538 roll $5, %esi # rotl32(a,5) 677 roll $5, %esi # rotl32(a,5)
539 addl %esi, %eax # e += rotl32(a,5) 678 addl %esi, %eax # e += rotl32(a,5)
540 rorl $2, %ecx # b = rotl32(b,30) 679 rorl $2, %ecx # b = rotl32(b,30)
541# 35 680# 35
542 movl -32+4*0(%rsp), %esi # W[(n+13) & 15]
543 xorl %r11d, %esi # ^W[(n+8) & 15]
544 xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15]
545 xorl -32+4*3(%rsp), %esi # ^W[n & 15]
546 roll %esi #
547 movl %esi, -32+4*3(%rsp) # store to W[n & 15]
548 movl %ecx, %edi # c 681 movl %ecx, %edi # c
549 xorl %edx, %edi # ^d 682 xorl %edx, %edi # ^d
550 xorl %ebx, %edi # ^b 683 xorl %ebx, %edi # ^b
551 leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] 684 addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15]
552 addl %edi, %ebp # e += (c ^ d ^ b) 685 addl %edi, %ebp # e += (c ^ d ^ b)
553 movl %eax, %esi # 686 movl %eax, %esi #
554 roll $5, %esi # rotl32(a,5) 687 roll $5, %esi # rotl32(a,5)
555 addl %esi, %ebp # e += rotl32(a,5) 688 addl %esi, %ebp # e += rotl32(a,5)
556 rorl $2, %ebx # b = rotl32(b,30) 689 rorl $2, %ebx # b = rotl32(b,30)
690# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
691 movaps %xmm2, %xmm4
692 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
693# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
694# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
695# same result as above, but shorter and faster:
696# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
697# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
698 movaps %xmm3, %xmm5
699 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
700 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
701 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
702 xorps %xmm5, %xmm3 # ^
703 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
704 movaps %xmm3, %xmm5
705 xorps %xmm4, %xmm4 # rol(W0,1):
706 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
707 paddd %xmm3, %xmm3 # shift left by 1
708 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
709 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
710 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
711 movaps %xmm5, %xmm4
712 pslld $2, %xmm5
713 psrld $30, %xmm4
714# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
715 xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
716 xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
717 movaps %xmm3, %xmm5
718 paddd %xmm6, %xmm5
719 movups %xmm5, -64+16*3(%rsp)
557# 36 720# 36
558 movl -32+4*1(%rsp), %esi # W[(n+13) & 15]
559 xorl %r12d, %esi # ^W[(n+8) & 15]
560 xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15]
561 xorl -32+4*4(%rsp), %esi # ^W[n & 15]
562 roll %esi #
563 movl %esi, -32+4*4(%rsp) # store to W[n & 15]
564 movl %ebx, %edi # c 721 movl %ebx, %edi # c
565 xorl %ecx, %edi # ^d 722 xorl %ecx, %edi # ^d
566 xorl %eax, %edi # ^b 723 xorl %eax, %edi # ^b
567 leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] 724 addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15]
568 addl %edi, %edx # e += (c ^ d ^ b) 725 addl %edi, %edx # e += (c ^ d ^ b)
569 movl %ebp, %esi # 726 movl %ebp, %esi #
570 roll $5, %esi # rotl32(a,5) 727 roll $5, %esi # rotl32(a,5)
571 addl %esi, %edx # e += rotl32(a,5) 728 addl %esi, %edx # e += rotl32(a,5)
572 rorl $2, %eax # b = rotl32(b,30) 729 rorl $2, %eax # b = rotl32(b,30)
573# 37 730# 37
574 movl -32+4*2(%rsp), %esi # W[(n+13) & 15]
575 xorl %r13d, %esi # ^W[(n+8) & 15]
576 xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15]
577 xorl -32+4*5(%rsp), %esi # ^W[n & 15]
578 roll %esi #
579 movl %esi, -32+4*5(%rsp) # store to W[n & 15]
580 movl %eax, %edi # c 731 movl %eax, %edi # c
581 xorl %ebx, %edi # ^d 732 xorl %ebx, %edi # ^d
582 xorl %ebp, %edi # ^b 733 xorl %ebp, %edi # ^b
583 leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] 734 addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15]
584 addl %edi, %ecx # e += (c ^ d ^ b) 735 addl %edi, %ecx # e += (c ^ d ^ b)
585 movl %edx, %esi # 736 movl %edx, %esi #
586 roll $5, %esi # rotl32(a,5) 737 roll $5, %esi # rotl32(a,5)
587 addl %esi, %ecx # e += rotl32(a,5) 738 addl %esi, %ecx # e += rotl32(a,5)
588 rorl $2, %ebp # b = rotl32(b,30) 739 rorl $2, %ebp # b = rotl32(b,30)
589# 38 740# 38
590 movl -32+4*3(%rsp), %esi # W[(n+13) & 15]
591 xorl %r14d, %esi # ^W[(n+8) & 15]
592 xorl %r8d, %esi # ^W[(n+2) & 15]
593 xorl -32+4*6(%rsp), %esi # ^W[n & 15]
594 roll %esi #
595 movl %esi, -32+4*6(%rsp) # store to W[n & 15]
596 movl %ebp, %edi # c 741 movl %ebp, %edi # c
597 xorl %eax, %edi # ^d 742 xorl %eax, %edi # ^d
598 xorl %edx, %edi # ^b 743 xorl %edx, %edi # ^b
599 leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 744 addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15]
600 addl %edi, %ebx # e += (c ^ d ^ b) 745 addl %edi, %ebx # e += (c ^ d ^ b)
601 movl %ecx, %esi # 746 movl %ecx, %esi #
602 roll $5, %esi # rotl32(a,5) 747 roll $5, %esi # rotl32(a,5)
603 addl %esi, %ebx # e += rotl32(a,5) 748 addl %esi, %ebx # e += rotl32(a,5)
604 rorl $2, %edx # b = rotl32(b,30) 749 rorl $2, %edx # b = rotl32(b,30)
605# 39 750# 39
606 movl -32+4*4(%rsp), %esi # W[(n+13) & 15]
607 xorl %r15d, %esi # ^W[(n+8) & 15]
608 xorl %r9d, %esi # ^W[(n+2) & 15]
609 xorl -32+4*7(%rsp), %esi # ^W[n & 15]
610 roll %esi #
611 movl %esi, -32+4*7(%rsp) # store to W[n & 15]
612 movl %edx, %edi # c 751 movl %edx, %edi # c
613 xorl %ebp, %edi # ^d 752 xorl %ebp, %edi # ^d
614 xorl %ecx, %edi # ^b 753 xorl %ecx, %edi # ^b
615 leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] 754 addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15]
616 addl %edi, %eax # e += (c ^ d ^ b) 755 addl %edi, %eax # e += (c ^ d ^ b)
617 movl %ebx, %esi # 756 movl %ebx, %esi #
618 roll $5, %esi # rotl32(a,5) 757 roll $5, %esi # rotl32(a,5)
619 addl %esi, %eax # e += rotl32(a,5) 758 addl %esi, %eax # e += rotl32(a,5)
620 rorl $2, %ecx # b = rotl32(b,30) 759 rorl $2, %ecx # b = rotl32(b,30)
760# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
761 movaps %xmm3, %xmm4
762 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
763# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
764# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
765# same result as above, but shorter and faster:
766# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
767# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
768 movaps %xmm0, %xmm5
769 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
770 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
771 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
772 xorps %xmm5, %xmm0 # ^
773 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
774 movaps %xmm0, %xmm5
775 xorps %xmm4, %xmm4 # rol(W0,1):
776 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
777 paddd %xmm0, %xmm0 # shift left by 1
778 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
779 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
780 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
781 movaps %xmm5, %xmm4
782 pslld $2, %xmm5
783 psrld $30, %xmm4
784# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
785 xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
786 xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
787 movaps %xmm0, %xmm5
788 paddd %xmm6, %xmm5
789 movups %xmm5, -64+16*0(%rsp)
621# 40 790# 40
622 movl %ebx, %edi # di: b 791 movl %ebx, %edi # di: b
623 movl %ebx, %esi # si: b 792 movl %ebx, %esi # si: b
@@ -625,12 +794,8 @@ sha1_process_block64:
625 andl %ecx, %esi # si: b & c 794 andl %ecx, %esi # si: b & c
626 andl %edx, %edi # di: (b | c) & d 795 andl %edx, %edi # di: (b | c) & d
627 orl %esi, %edi # ((b | c) & d) | (b & c) 796 orl %esi, %edi # ((b | c) & d) | (b & c)
628 xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15]
629 xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15]
630 xorl %r10d, %r8d # ^W[(n+2) & 15]
631 roll %r8d #
632 addl %edi, %ebp # += ((b | c) & d) | (b & c) 797 addl %edi, %ebp # += ((b | c) & d) | (b & c)
633 leal -0x70E44324(%rbp,%r8), %ebp # e += RCONST + W[n & 15] 798 addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15]
634 movl %eax, %esi # 799 movl %eax, %esi #
635 roll $5, %esi # rotl32(a,5) 800 roll $5, %esi # rotl32(a,5)
636 addl %esi, %ebp # e += rotl32(a,5) 801 addl %esi, %ebp # e += rotl32(a,5)
@@ -642,12 +807,8 @@ sha1_process_block64:
642 andl %ebx, %esi # si: b & c 807 andl %ebx, %esi # si: b & c
643 andl %ecx, %edi # di: (b | c) & d 808 andl %ecx, %edi # di: (b | c) & d
644 orl %esi, %edi # ((b | c) & d) | (b & c) 809 orl %esi, %edi # ((b | c) & d) | (b & c)
645 xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15]
646 xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15]
647 xorl %r11d, %r9d # ^W[(n+2) & 15]
648 roll %r9d #
649 addl %edi, %edx # += ((b | c) & d) | (b & c) 810 addl %edi, %edx # += ((b | c) & d) | (b & c)
650 leal -0x70E44324(%rdx,%r9), %edx # e += RCONST + W[n & 15] 811 addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15]
651 movl %ebp, %esi # 812 movl %ebp, %esi #
652 roll $5, %esi # rotl32(a,5) 813 roll $5, %esi # rotl32(a,5)
653 addl %esi, %edx # e += rotl32(a,5) 814 addl %esi, %edx # e += rotl32(a,5)
@@ -659,12 +820,8 @@ sha1_process_block64:
659 andl %eax, %esi # si: b & c 820 andl %eax, %esi # si: b & c
660 andl %ebx, %edi # di: (b | c) & d 821 andl %ebx, %edi # di: (b | c) & d
661 orl %esi, %edi # ((b | c) & d) | (b & c) 822 orl %esi, %edi # ((b | c) & d) | (b & c)
662 xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15]
663 xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15]
664 xorl %r12d, %r10d # ^W[(n+2) & 15]
665 roll %r10d #
666 addl %edi, %ecx # += ((b | c) & d) | (b & c) 823 addl %edi, %ecx # += ((b | c) & d) | (b & c)
667 leal -0x70E44324(%rcx,%r10), %ecx # e += RCONST + W[n & 15] 824 addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15]
668 movl %edx, %esi # 825 movl %edx, %esi #
669 roll $5, %esi # rotl32(a,5) 826 roll $5, %esi # rotl32(a,5)
670 addl %esi, %ecx # e += rotl32(a,5) 827 addl %esi, %ecx # e += rotl32(a,5)
@@ -676,16 +833,42 @@ sha1_process_block64:
676 andl %ebp, %esi # si: b & c 833 andl %ebp, %esi # si: b & c
677 andl %eax, %edi # di: (b | c) & d 834 andl %eax, %edi # di: (b | c) & d
678 orl %esi, %edi # ((b | c) & d) | (b & c) 835 orl %esi, %edi # ((b | c) & d) | (b & c)
679 xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15]
680 xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15]
681 xorl %r13d, %r11d # ^W[(n+2) & 15]
682 roll %r11d #
683 addl %edi, %ebx # += ((b | c) & d) | (b & c) 836 addl %edi, %ebx # += ((b | c) & d) | (b & c)
684 leal -0x70E44324(%rbx,%r11), %ebx # e += RCONST + W[n & 15] 837 addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15]
685 movl %ecx, %esi # 838 movl %ecx, %esi #
686 roll $5, %esi # rotl32(a,5) 839 roll $5, %esi # rotl32(a,5)
687 addl %esi, %ebx # e += rotl32(a,5) 840 addl %esi, %ebx # e += rotl32(a,5)
688 rorl $2, %edx # b = rotl32(b,30) 841 rorl $2, %edx # b = rotl32(b,30)
842# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
843 movaps %xmm0, %xmm4
844 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
845# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
846# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
847# same result as above, but shorter and faster:
848# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
849# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
850 movaps %xmm1, %xmm5
851 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
852 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
853 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
854 xorps %xmm5, %xmm1 # ^
855 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
856 movaps %xmm1, %xmm5
857 xorps %xmm4, %xmm4 # rol(W0,1):
858 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
859 paddd %xmm1, %xmm1 # shift left by 1
860 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
861 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
862 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
863 movaps %xmm5, %xmm4
864 pslld $2, %xmm5
865 psrld $30, %xmm4
866# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
867 xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
868 xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
869 movaps %xmm1, %xmm5
870 paddd %xmm6, %xmm5
871 movups %xmm5, -64+16*1(%rsp)
689# 44 872# 44
690 movl %ecx, %edi # di: b 873 movl %ecx, %edi # di: b
691 movl %ecx, %esi # si: b 874 movl %ecx, %esi # si: b
@@ -693,12 +876,8 @@ sha1_process_block64:
693 andl %edx, %esi # si: b & c 876 andl %edx, %esi # si: b & c
694 andl %ebp, %edi # di: (b | c) & d 877 andl %ebp, %edi # di: (b | c) & d
695 orl %esi, %edi # ((b | c) & d) | (b & c) 878 orl %esi, %edi # ((b | c) & d) | (b & c)
696 xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15]
697 xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15]
698 xorl %r14d, %r12d # ^W[(n+2) & 15]
699 roll %r12d #
700 addl %edi, %eax # += ((b | c) & d) | (b & c) 879 addl %edi, %eax # += ((b | c) & d) | (b & c)
701 leal -0x70E44324(%rax,%r12), %eax # e += RCONST + W[n & 15] 880 addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15]
702 movl %ebx, %esi # 881 movl %ebx, %esi #
703 roll $5, %esi # rotl32(a,5) 882 roll $5, %esi # rotl32(a,5)
704 addl %esi, %eax # e += rotl32(a,5) 883 addl %esi, %eax # e += rotl32(a,5)
@@ -710,12 +889,8 @@ sha1_process_block64:
710 andl %ecx, %esi # si: b & c 889 andl %ecx, %esi # si: b & c
711 andl %edx, %edi # di: (b | c) & d 890 andl %edx, %edi # di: (b | c) & d
712 orl %esi, %edi # ((b | c) & d) | (b & c) 891 orl %esi, %edi # ((b | c) & d) | (b & c)
713 xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15]
714 xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15]
715 xorl %r15d, %r13d # ^W[(n+2) & 15]
716 roll %r13d #
717 addl %edi, %ebp # += ((b | c) & d) | (b & c) 892 addl %edi, %ebp # += ((b | c) & d) | (b & c)
718 leal -0x70E44324(%rbp,%r13), %ebp # e += RCONST + W[n & 15] 893 addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15]
719 movl %eax, %esi # 894 movl %eax, %esi #
720 roll $5, %esi # rotl32(a,5) 895 roll $5, %esi # rotl32(a,5)
721 addl %esi, %ebp # e += rotl32(a,5) 896 addl %esi, %ebp # e += rotl32(a,5)
@@ -727,12 +902,8 @@ sha1_process_block64:
727 andl %ebx, %esi # si: b & c 902 andl %ebx, %esi # si: b & c
728 andl %ecx, %edi # di: (b | c) & d 903 andl %ecx, %edi # di: (b | c) & d
729 orl %esi, %edi # ((b | c) & d) | (b & c) 904 orl %esi, %edi # ((b | c) & d) | (b & c)
730 xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15]
731 xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15]
732 xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15]
733 roll %r14d #
734 addl %edi, %edx # += ((b | c) & d) | (b & c) 905 addl %edi, %edx # += ((b | c) & d) | (b & c)
735 leal -0x70E44324(%rdx,%r14), %edx # e += RCONST + W[n & 15] 906 addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15]
736 movl %ebp, %esi # 907 movl %ebp, %esi #
737 roll $5, %esi # rotl32(a,5) 908 roll $5, %esi # rotl32(a,5)
738 addl %esi, %edx # e += rotl32(a,5) 909 addl %esi, %edx # e += rotl32(a,5)
@@ -744,16 +915,42 @@ sha1_process_block64:
744 andl %eax, %esi # si: b & c 915 andl %eax, %esi # si: b & c
745 andl %ebx, %edi # di: (b | c) & d 916 andl %ebx, %edi # di: (b | c) & d
746 orl %esi, %edi # ((b | c) & d) | (b & c) 917 orl %esi, %edi # ((b | c) & d) | (b & c)
747 xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15]
748 xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15]
749 xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15]
750 roll %r15d #
751 addl %edi, %ecx # += ((b | c) & d) | (b & c) 918 addl %edi, %ecx # += ((b | c) & d) | (b & c)
752 leal -0x70E44324(%rcx,%r15), %ecx # e += RCONST + W[n & 15] 919 addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15]
753 movl %edx, %esi # 920 movl %edx, %esi #
754 roll $5, %esi # rotl32(a,5) 921 roll $5, %esi # rotl32(a,5)
755 addl %esi, %ecx # e += rotl32(a,5) 922 addl %esi, %ecx # e += rotl32(a,5)
756 rorl $2, %ebp # b = rotl32(b,30) 923 rorl $2, %ebp # b = rotl32(b,30)
924# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
925 movaps %xmm1, %xmm4
926 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
927# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
928# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
929# same result as above, but shorter and faster:
930# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
931# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
932 movaps %xmm2, %xmm5
933 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
934 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
935 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
936 xorps %xmm5, %xmm2 # ^
937 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
938 movaps %xmm2, %xmm5
939 xorps %xmm4, %xmm4 # rol(W0,1):
940 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
941 paddd %xmm2, %xmm2 # shift left by 1
942 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
943 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
944 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
945 movaps %xmm5, %xmm4
946 pslld $2, %xmm5
947 psrld $30, %xmm4
948# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
949 xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
950 xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
951 movaps %xmm2, %xmm5
952 paddd %xmm6, %xmm5
953 movups %xmm5, -64+16*2(%rsp)
757# 48 954# 48
758 movl %edx, %edi # di: b 955 movl %edx, %edi # di: b
759 movl %edx, %esi # si: b 956 movl %edx, %esi # si: b
@@ -761,14 +958,8 @@ sha1_process_block64:
761 andl %ebp, %esi # si: b & c 958 andl %ebp, %esi # si: b & c
762 andl %eax, %edi # di: (b | c) & d 959 andl %eax, %edi # di: (b | c) & d
763 orl %esi, %edi # ((b | c) & d) | (b & c) 960 orl %esi, %edi # ((b | c) & d) | (b & c)
764 movl %r13d, %esi # W[(n+13) & 15]
765 xorl %r8d, %esi # ^W[(n+8) & 15]
766 xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15]
767 xorl -32+4*0(%rsp), %esi # ^W[n & 15]
768 roll %esi #
769 movl %esi, -32+4*0(%rsp) # store to W[n & 15]
770 addl %edi, %ebx # += ((b | c) & d) | (b & c) 961 addl %edi, %ebx # += ((b | c) & d) | (b & c)
771 leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 962 addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15]
772 movl %ecx, %esi # 963 movl %ecx, %esi #
773 roll $5, %esi # rotl32(a,5) 964 roll $5, %esi # rotl32(a,5)
774 addl %esi, %ebx # e += rotl32(a,5) 965 addl %esi, %ebx # e += rotl32(a,5)
@@ -780,14 +971,8 @@ sha1_process_block64:
780 andl %edx, %esi # si: b & c 971 andl %edx, %esi # si: b & c
781 andl %ebp, %edi # di: (b | c) & d 972 andl %ebp, %edi # di: (b | c) & d
782 orl %esi, %edi # ((b | c) & d) | (b & c) 973 orl %esi, %edi # ((b | c) & d) | (b & c)
783 movl %r14d, %esi # W[(n+13) & 15]
784 xorl %r9d, %esi # ^W[(n+8) & 15]
785 xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15]
786 xorl -32+4*1(%rsp), %esi # ^W[n & 15]
787 roll %esi #
788 movl %esi, -32+4*1(%rsp) # store to W[n & 15]
789 addl %edi, %eax # += ((b | c) & d) | (b & c) 974 addl %edi, %eax # += ((b | c) & d) | (b & c)
790 leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] 975 addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15]
791 movl %ebx, %esi # 976 movl %ebx, %esi #
792 roll $5, %esi # rotl32(a,5) 977 roll $5, %esi # rotl32(a,5)
793 addl %esi, %eax # e += rotl32(a,5) 978 addl %esi, %eax # e += rotl32(a,5)
@@ -799,14 +984,8 @@ sha1_process_block64:
799 andl %ecx, %esi # si: b & c 984 andl %ecx, %esi # si: b & c
800 andl %edx, %edi # di: (b | c) & d 985 andl %edx, %edi # di: (b | c) & d
801 orl %esi, %edi # ((b | c) & d) | (b & c) 986 orl %esi, %edi # ((b | c) & d) | (b & c)
802 movl %r15d, %esi # W[(n+13) & 15]
803 xorl %r10d, %esi # ^W[(n+8) & 15]
804 xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15]
805 xorl -32+4*2(%rsp), %esi # ^W[n & 15]
806 roll %esi #
807 movl %esi, -32+4*2(%rsp) # store to W[n & 15]
808 addl %edi, %ebp # += ((b | c) & d) | (b & c) 987 addl %edi, %ebp # += ((b | c) & d) | (b & c)
809 leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] 988 addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15]
810 movl %eax, %esi # 989 movl %eax, %esi #
811 roll $5, %esi # rotl32(a,5) 990 roll $5, %esi # rotl32(a,5)
812 addl %esi, %ebp # e += rotl32(a,5) 991 addl %esi, %ebp # e += rotl32(a,5)
@@ -818,18 +997,43 @@ sha1_process_block64:
818 andl %ebx, %esi # si: b & c 997 andl %ebx, %esi # si: b & c
819 andl %ecx, %edi # di: (b | c) & d 998 andl %ecx, %edi # di: (b | c) & d
820 orl %esi, %edi # ((b | c) & d) | (b & c) 999 orl %esi, %edi # ((b | c) & d) | (b & c)
821 movl -32+4*0(%rsp), %esi # W[(n+13) & 15]
822 xorl %r11d, %esi # ^W[(n+8) & 15]
823 xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15]
824 xorl -32+4*3(%rsp), %esi # ^W[n & 15]
825 roll %esi #
826 movl %esi, -32+4*3(%rsp) # store to W[n & 15]
827 addl %edi, %edx # += ((b | c) & d) | (b & c) 1000 addl %edi, %edx # += ((b | c) & d) | (b & c)
828 leal -0x70E44324(%rdx,%rsi), %edx # e += RCONST + W[n & 15] 1001 addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15]
829 movl %ebp, %esi # 1002 movl %ebp, %esi #
830 roll $5, %esi # rotl32(a,5) 1003 roll $5, %esi # rotl32(a,5)
831 addl %esi, %edx # e += rotl32(a,5) 1004 addl %esi, %edx # e += rotl32(a,5)
832 rorl $2, %eax # b = rotl32(b,30) 1005 rorl $2, %eax # b = rotl32(b,30)
1006 pshufd $0xff, %xmm7, %xmm6
1007# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
1008 movaps %xmm2, %xmm4
1009 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1010# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1011# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1012# same result as above, but shorter and faster:
1013# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1014# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1015 movaps %xmm3, %xmm5
1016 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1017 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1018 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1019 xorps %xmm5, %xmm3 # ^
1020 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1021 movaps %xmm3, %xmm5
1022 xorps %xmm4, %xmm4 # rol(W0,1):
1023 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1024 paddd %xmm3, %xmm3 # shift left by 1
1025 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
1026 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1027 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1028 movaps %xmm5, %xmm4
1029 pslld $2, %xmm5
1030 psrld $30, %xmm4
1031# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
1032 xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
1033 xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1034 movaps %xmm3, %xmm5
1035 paddd %xmm6, %xmm5
1036 movups %xmm5, -64+16*3(%rsp)
833# 52 1037# 52
834 movl %ebp, %edi # di: b 1038 movl %ebp, %edi # di: b
835 movl %ebp, %esi # si: b 1039 movl %ebp, %esi # si: b
@@ -837,14 +1041,8 @@ sha1_process_block64:
837 andl %eax, %esi # si: b & c 1041 andl %eax, %esi # si: b & c
838 andl %ebx, %edi # di: (b | c) & d 1042 andl %ebx, %edi # di: (b | c) & d
839 orl %esi, %edi # ((b | c) & d) | (b & c) 1043 orl %esi, %edi # ((b | c) & d) | (b & c)
840 movl -32+4*1(%rsp), %esi # W[(n+13) & 15]
841 xorl %r12d, %esi # ^W[(n+8) & 15]
842 xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15]
843 xorl -32+4*4(%rsp), %esi # ^W[n & 15]
844 roll %esi #
845 movl %esi, -32+4*4(%rsp) # store to W[n & 15]
846 addl %edi, %ecx # += ((b | c) & d) | (b & c) 1044 addl %edi, %ecx # += ((b | c) & d) | (b & c)
847 leal -0x70E44324(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] 1045 addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15]
848 movl %edx, %esi # 1046 movl %edx, %esi #
849 roll $5, %esi # rotl32(a,5) 1047 roll $5, %esi # rotl32(a,5)
850 addl %esi, %ecx # e += rotl32(a,5) 1048 addl %esi, %ecx # e += rotl32(a,5)
@@ -856,14 +1054,8 @@ sha1_process_block64:
856 andl %ebp, %esi # si: b & c 1054 andl %ebp, %esi # si: b & c
857 andl %eax, %edi # di: (b | c) & d 1055 andl %eax, %edi # di: (b | c) & d
858 orl %esi, %edi # ((b | c) & d) | (b & c) 1056 orl %esi, %edi # ((b | c) & d) | (b & c)
859 movl -32+4*2(%rsp), %esi # W[(n+13) & 15]
860 xorl %r13d, %esi # ^W[(n+8) & 15]
861 xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15]
862 xorl -32+4*5(%rsp), %esi # ^W[n & 15]
863 roll %esi #
864 movl %esi, -32+4*5(%rsp) # store to W[n & 15]
865 addl %edi, %ebx # += ((b | c) & d) | (b & c) 1057 addl %edi, %ebx # += ((b | c) & d) | (b & c)
866 leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 1058 addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15]
867 movl %ecx, %esi # 1059 movl %ecx, %esi #
868 roll $5, %esi # rotl32(a,5) 1060 roll $5, %esi # rotl32(a,5)
869 addl %esi, %ebx # e += rotl32(a,5) 1061 addl %esi, %ebx # e += rotl32(a,5)
@@ -875,14 +1067,8 @@ sha1_process_block64:
875 andl %edx, %esi # si: b & c 1067 andl %edx, %esi # si: b & c
876 andl %ebp, %edi # di: (b | c) & d 1068 andl %ebp, %edi # di: (b | c) & d
877 orl %esi, %edi # ((b | c) & d) | (b & c) 1069 orl %esi, %edi # ((b | c) & d) | (b & c)
878 movl -32+4*3(%rsp), %esi # W[(n+13) & 15]
879 xorl %r14d, %esi # ^W[(n+8) & 15]
880 xorl %r8d, %esi # ^W[(n+2) & 15]
881 xorl -32+4*6(%rsp), %esi # ^W[n & 15]
882 roll %esi #
883 movl %esi, -32+4*6(%rsp) # store to W[n & 15]
884 addl %edi, %eax # += ((b | c) & d) | (b & c) 1070 addl %edi, %eax # += ((b | c) & d) | (b & c)
885 leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] 1071 addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15]
886 movl %ebx, %esi # 1072 movl %ebx, %esi #
887 roll $5, %esi # rotl32(a,5) 1073 roll $5, %esi # rotl32(a,5)
888 addl %esi, %eax # e += rotl32(a,5) 1074 addl %esi, %eax # e += rotl32(a,5)
@@ -894,18 +1080,42 @@ sha1_process_block64:
894 andl %ecx, %esi # si: b & c 1080 andl %ecx, %esi # si: b & c
895 andl %edx, %edi # di: (b | c) & d 1081 andl %edx, %edi # di: (b | c) & d
896 orl %esi, %edi # ((b | c) & d) | (b & c) 1082 orl %esi, %edi # ((b | c) & d) | (b & c)
897 movl -32+4*4(%rsp), %esi # W[(n+13) & 15]
898 xorl %r15d, %esi # ^W[(n+8) & 15]
899 xorl %r9d, %esi # ^W[(n+2) & 15]
900 xorl -32+4*7(%rsp), %esi # ^W[n & 15]
901 roll %esi #
902 movl %esi, -32+4*7(%rsp) # store to W[n & 15]
903 addl %edi, %ebp # += ((b | c) & d) | (b & c) 1083 addl %edi, %ebp # += ((b | c) & d) | (b & c)
904 leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] 1084 addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15]
905 movl %eax, %esi # 1085 movl %eax, %esi #
906 roll $5, %esi # rotl32(a,5) 1086 roll $5, %esi # rotl32(a,5)
907 addl %esi, %ebp # e += rotl32(a,5) 1087 addl %esi, %ebp # e += rotl32(a,5)
908 rorl $2, %ebx # b = rotl32(b,30) 1088 rorl $2, %ebx # b = rotl32(b,30)
1089# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
1090 movaps %xmm3, %xmm4
1091 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1092# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1093# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1094# same result as above, but shorter and faster:
1095# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1096# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1097 movaps %xmm0, %xmm5
1098 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1099 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1100 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1101 xorps %xmm5, %xmm0 # ^
1102 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1103 movaps %xmm0, %xmm5
1104 xorps %xmm4, %xmm4 # rol(W0,1):
1105 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1106 paddd %xmm0, %xmm0 # shift left by 1
1107 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
1108 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1109 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1110 movaps %xmm5, %xmm4
1111 pslld $2, %xmm5
1112 psrld $30, %xmm4
1113# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
1114 xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
1115 xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1116 movaps %xmm0, %xmm5
1117 paddd %xmm6, %xmm5
1118 movups %xmm5, -64+16*0(%rsp)
909# 56 1119# 56
910 movl %eax, %edi # di: b 1120 movl %eax, %edi # di: b
911 movl %eax, %esi # si: b 1121 movl %eax, %esi # si: b
@@ -913,12 +1123,8 @@ sha1_process_block64:
913 andl %ebx, %esi # si: b & c 1123 andl %ebx, %esi # si: b & c
914 andl %ecx, %edi # di: (b | c) & d 1124 andl %ecx, %edi # di: (b | c) & d
915 orl %esi, %edi # ((b | c) & d) | (b & c) 1125 orl %esi, %edi # ((b | c) & d) | (b & c)
916 xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15]
917 xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15]
918 xorl %r10d, %r8d # ^W[(n+2) & 15]
919 roll %r8d #
920 addl %edi, %edx # += ((b | c) & d) | (b & c) 1126 addl %edi, %edx # += ((b | c) & d) | (b & c)
921 leal -0x70E44324(%rdx,%r8), %edx # e += RCONST + W[n & 15] 1127 addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15]
922 movl %ebp, %esi # 1128 movl %ebp, %esi #
923 roll $5, %esi # rotl32(a,5) 1129 roll $5, %esi # rotl32(a,5)
924 addl %esi, %edx # e += rotl32(a,5) 1130 addl %esi, %edx # e += rotl32(a,5)
@@ -930,12 +1136,8 @@ sha1_process_block64:
930 andl %eax, %esi # si: b & c 1136 andl %eax, %esi # si: b & c
931 andl %ebx, %edi # di: (b | c) & d 1137 andl %ebx, %edi # di: (b | c) & d
932 orl %esi, %edi # ((b | c) & d) | (b & c) 1138 orl %esi, %edi # ((b | c) & d) | (b & c)
933 xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15]
934 xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15]
935 xorl %r11d, %r9d # ^W[(n+2) & 15]
936 roll %r9d #
937 addl %edi, %ecx # += ((b | c) & d) | (b & c) 1139 addl %edi, %ecx # += ((b | c) & d) | (b & c)
938 leal -0x70E44324(%rcx,%r9), %ecx # e += RCONST + W[n & 15] 1140 addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15]
939 movl %edx, %esi # 1141 movl %edx, %esi #
940 roll $5, %esi # rotl32(a,5) 1142 roll $5, %esi # rotl32(a,5)
941 addl %esi, %ecx # e += rotl32(a,5) 1143 addl %esi, %ecx # e += rotl32(a,5)
@@ -947,12 +1149,8 @@ sha1_process_block64:
947 andl %ebp, %esi # si: b & c 1149 andl %ebp, %esi # si: b & c
948 andl %eax, %edi # di: (b | c) & d 1150 andl %eax, %edi # di: (b | c) & d
949 orl %esi, %edi # ((b | c) & d) | (b & c) 1151 orl %esi, %edi # ((b | c) & d) | (b & c)
950 xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15]
951 xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15]
952 xorl %r12d, %r10d # ^W[(n+2) & 15]
953 roll %r10d #
954 addl %edi, %ebx # += ((b | c) & d) | (b & c) 1152 addl %edi, %ebx # += ((b | c) & d) | (b & c)
955 leal -0x70E44324(%rbx,%r10), %ebx # e += RCONST + W[n & 15] 1153 addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15]
956 movl %ecx, %esi # 1154 movl %ecx, %esi #
957 roll $5, %esi # rotl32(a,5) 1155 roll $5, %esi # rotl32(a,5)
958 addl %esi, %ebx # e += rotl32(a,5) 1156 addl %esi, %ebx # e += rotl32(a,5)
@@ -964,307 +1162,297 @@ sha1_process_block64:
964 andl %edx, %esi # si: b & c 1162 andl %edx, %esi # si: b & c
965 andl %ebp, %edi # di: (b | c) & d 1163 andl %ebp, %edi # di: (b | c) & d
966 orl %esi, %edi # ((b | c) & d) | (b & c) 1164 orl %esi, %edi # ((b | c) & d) | (b & c)
967 xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15]
968 xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15]
969 xorl %r13d, %r11d # ^W[(n+2) & 15]
970 roll %r11d #
971 addl %edi, %eax # += ((b | c) & d) | (b & c) 1165 addl %edi, %eax # += ((b | c) & d) | (b & c)
972 leal -0x70E44324(%rax,%r11), %eax # e += RCONST + W[n & 15] 1166 addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15]
973 movl %ebx, %esi # 1167 movl %ebx, %esi #
974 roll $5, %esi # rotl32(a,5) 1168 roll $5, %esi # rotl32(a,5)
975 addl %esi, %eax # e += rotl32(a,5) 1169 addl %esi, %eax # e += rotl32(a,5)
976 rorl $2, %ecx # b = rotl32(b,30) 1170 rorl $2, %ecx # b = rotl32(b,30)
1171# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
1172 movaps %xmm0, %xmm4
1173 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1174# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1175# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1176# same result as above, but shorter and faster:
1177# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1178# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1179 movaps %xmm1, %xmm5
1180 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1181 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1182 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1183 xorps %xmm5, %xmm1 # ^
1184 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1185 movaps %xmm1, %xmm5
1186 xorps %xmm4, %xmm4 # rol(W0,1):
1187 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1188 paddd %xmm1, %xmm1 # shift left by 1
1189 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
1190 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1191 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1192 movaps %xmm5, %xmm4
1193 pslld $2, %xmm5
1194 psrld $30, %xmm4
1195# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
1196 xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
1197 xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1198 movaps %xmm1, %xmm5
1199 paddd %xmm6, %xmm5
1200 movups %xmm5, -64+16*1(%rsp)
977# 60 1201# 60
978 xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15]
979 xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15]
980 xorl %r14d, %r12d # ^W[(n+2) & 15]
981 roll %r12d #
982 movl %ecx, %edi # c 1202 movl %ecx, %edi # c
983 xorl %edx, %edi # ^d 1203 xorl %edx, %edi # ^d
984 xorl %ebx, %edi # ^b 1204 xorl %ebx, %edi # ^b
985 leal -0x359D3E2A(%rbp,%r12), %ebp # e += RCONST + W[n & 15] 1205 addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15]
986 addl %edi, %ebp # e += (c ^ d ^ b) 1206 addl %edi, %ebp # e += (c ^ d ^ b)
987 movl %eax, %esi # 1207 movl %eax, %esi #
988 roll $5, %esi # rotl32(a,5) 1208 roll $5, %esi # rotl32(a,5)
989 addl %esi, %ebp # e += rotl32(a,5) 1209 addl %esi, %ebp # e += rotl32(a,5)
990 rorl $2, %ebx # b = rotl32(b,30) 1210 rorl $2, %ebx # b = rotl32(b,30)
991# 61 1211# 61
992 xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15]
993 xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15]
994 xorl %r15d, %r13d # ^W[(n+2) & 15]
995 roll %r13d #
996 movl %ebx, %edi # c 1212 movl %ebx, %edi # c
997 xorl %ecx, %edi # ^d 1213 xorl %ecx, %edi # ^d
998 xorl %eax, %edi # ^b 1214 xorl %eax, %edi # ^b
999 leal -0x359D3E2A(%rdx,%r13), %edx # e += RCONST + W[n & 15] 1215 addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15]
1000 addl %edi, %edx # e += (c ^ d ^ b) 1216 addl %edi, %edx # e += (c ^ d ^ b)
1001 movl %ebp, %esi # 1217 movl %ebp, %esi #
1002 roll $5, %esi # rotl32(a,5) 1218 roll $5, %esi # rotl32(a,5)
1003 addl %esi, %edx # e += rotl32(a,5) 1219 addl %esi, %edx # e += rotl32(a,5)
1004 rorl $2, %eax # b = rotl32(b,30) 1220 rorl $2, %eax # b = rotl32(b,30)
1005# 62 1221# 62
1006 xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15]
1007 xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15]
1008 xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15]
1009 roll %r14d #
1010 movl %eax, %edi # c 1222 movl %eax, %edi # c
1011 xorl %ebx, %edi # ^d 1223 xorl %ebx, %edi # ^d
1012 xorl %ebp, %edi # ^b 1224 xorl %ebp, %edi # ^b
1013 leal -0x359D3E2A(%rcx,%r14), %ecx # e += RCONST + W[n & 15] 1225 addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15]
1014 addl %edi, %ecx # e += (c ^ d ^ b) 1226 addl %edi, %ecx # e += (c ^ d ^ b)
1015 movl %edx, %esi # 1227 movl %edx, %esi #
1016 roll $5, %esi # rotl32(a,5) 1228 roll $5, %esi # rotl32(a,5)
1017 addl %esi, %ecx # e += rotl32(a,5) 1229 addl %esi, %ecx # e += rotl32(a,5)
1018 rorl $2, %ebp # b = rotl32(b,30) 1230 rorl $2, %ebp # b = rotl32(b,30)
1019# 63 1231# 63
1020 xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15]
1021 xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15]
1022 xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15]
1023 roll %r15d #
1024 movl %ebp, %edi # c 1232 movl %ebp, %edi # c
1025 xorl %eax, %edi # ^d 1233 xorl %eax, %edi # ^d
1026 xorl %edx, %edi # ^b 1234 xorl %edx, %edi # ^b
1027 leal -0x359D3E2A(%rbx,%r15), %ebx # e += RCONST + W[n & 15] 1235 addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15]
1028 addl %edi, %ebx # e += (c ^ d ^ b) 1236 addl %edi, %ebx # e += (c ^ d ^ b)
1029 movl %ecx, %esi # 1237 movl %ecx, %esi #
1030 roll $5, %esi # rotl32(a,5) 1238 roll $5, %esi # rotl32(a,5)
1031 addl %esi, %ebx # e += rotl32(a,5) 1239 addl %esi, %ebx # e += rotl32(a,5)
1032 rorl $2, %edx # b = rotl32(b,30) 1240 rorl $2, %edx # b = rotl32(b,30)
1241# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
1242 movaps %xmm1, %xmm4
1243 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1244# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1245# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1246# same result as above, but shorter and faster:
1247# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1248# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1249 movaps %xmm2, %xmm5
1250 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1251 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1252 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1253 xorps %xmm5, %xmm2 # ^
1254 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1255 movaps %xmm2, %xmm5
1256 xorps %xmm4, %xmm4 # rol(W0,1):
1257 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1258 paddd %xmm2, %xmm2 # shift left by 1
1259 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
1260 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1261 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1262 movaps %xmm5, %xmm4
1263 pslld $2, %xmm5
1264 psrld $30, %xmm4
1265# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
1266 xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
1267 xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1268 movaps %xmm2, %xmm5
1269 paddd %xmm6, %xmm5
1270 movups %xmm5, -64+16*2(%rsp)
1033# 64 1271# 64
1034 movl %r13d, %esi # W[(n+13) & 15]
1035 xorl %r8d, %esi # ^W[(n+8) & 15]
1036 xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15]
1037 xorl -32+4*0(%rsp), %esi # ^W[n & 15]
1038 roll %esi #
1039 movl %esi, -32+4*0(%rsp) # store to W[n & 15]
1040 movl %edx, %edi # c 1272 movl %edx, %edi # c
1041 xorl %ebp, %edi # ^d 1273 xorl %ebp, %edi # ^d
1042 xorl %ecx, %edi # ^b 1274 xorl %ecx, %edi # ^b
1043 leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] 1275 addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15]
1044 addl %edi, %eax # e += (c ^ d ^ b) 1276 addl %edi, %eax # e += (c ^ d ^ b)
1045 movl %ebx, %esi # 1277 movl %ebx, %esi #
1046 roll $5, %esi # rotl32(a,5) 1278 roll $5, %esi # rotl32(a,5)
1047 addl %esi, %eax # e += rotl32(a,5) 1279 addl %esi, %eax # e += rotl32(a,5)
1048 rorl $2, %ecx # b = rotl32(b,30) 1280 rorl $2, %ecx # b = rotl32(b,30)
1049# 65 1281# 65
1050 movl %r14d, %esi # W[(n+13) & 15]
1051 xorl %r9d, %esi # ^W[(n+8) & 15]
1052 xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15]
1053 xorl -32+4*1(%rsp), %esi # ^W[n & 15]
1054 roll %esi #
1055 movl %esi, -32+4*1(%rsp) # store to W[n & 15]
1056 movl %ecx, %edi # c 1282 movl %ecx, %edi # c
1057 xorl %edx, %edi # ^d 1283 xorl %edx, %edi # ^d
1058 xorl %ebx, %edi # ^b 1284 xorl %ebx, %edi # ^b
1059 leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] 1285 addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15]
1060 addl %edi, %ebp # e += (c ^ d ^ b) 1286 addl %edi, %ebp # e += (c ^ d ^ b)
1061 movl %eax, %esi # 1287 movl %eax, %esi #
1062 roll $5, %esi # rotl32(a,5) 1288 roll $5, %esi # rotl32(a,5)
1063 addl %esi, %ebp # e += rotl32(a,5) 1289 addl %esi, %ebp # e += rotl32(a,5)
1064 rorl $2, %ebx # b = rotl32(b,30) 1290 rorl $2, %ebx # b = rotl32(b,30)
1065# 66 1291# 66
1066 movl %r15d, %esi # W[(n+13) & 15]
1067 xorl %r10d, %esi # ^W[(n+8) & 15]
1068 xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15]
1069 xorl -32+4*2(%rsp), %esi # ^W[n & 15]
1070 roll %esi #
1071 movl %esi, -32+4*2(%rsp) # store to W[n & 15]
1072 movl %ebx, %edi # c 1292 movl %ebx, %edi # c
1073 xorl %ecx, %edi # ^d 1293 xorl %ecx, %edi # ^d
1074 xorl %eax, %edi # ^b 1294 xorl %eax, %edi # ^b
1075 leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] 1295 addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15]
1076 addl %edi, %edx # e += (c ^ d ^ b) 1296 addl %edi, %edx # e += (c ^ d ^ b)
1077 movl %ebp, %esi # 1297 movl %ebp, %esi #
1078 roll $5, %esi # rotl32(a,5) 1298 roll $5, %esi # rotl32(a,5)
1079 addl %esi, %edx # e += rotl32(a,5) 1299 addl %esi, %edx # e += rotl32(a,5)
1080 rorl $2, %eax # b = rotl32(b,30) 1300 rorl $2, %eax # b = rotl32(b,30)
1081# 67 1301# 67
1082 movl -32+4*0(%rsp), %esi # W[(n+13) & 15]
1083 xorl %r11d, %esi # ^W[(n+8) & 15]
1084 xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15]
1085 xorl -32+4*3(%rsp), %esi # ^W[n & 15]
1086 roll %esi #
1087 movl %esi, -32+4*3(%rsp) # store to W[n & 15]
1088 movl %eax, %edi # c 1302 movl %eax, %edi # c
1089 xorl %ebx, %edi # ^d 1303 xorl %ebx, %edi # ^d
1090 xorl %ebp, %edi # ^b 1304 xorl %ebp, %edi # ^b
1091 leal -0x359D3E2A(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] 1305 addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15]
1092 addl %edi, %ecx # e += (c ^ d ^ b) 1306 addl %edi, %ecx # e += (c ^ d ^ b)
1093 movl %edx, %esi # 1307 movl %edx, %esi #
1094 roll $5, %esi # rotl32(a,5) 1308 roll $5, %esi # rotl32(a,5)
1095 addl %esi, %ecx # e += rotl32(a,5) 1309 addl %esi, %ecx # e += rotl32(a,5)
1096 rorl $2, %ebp # b = rotl32(b,30) 1310 rorl $2, %ebp # b = rotl32(b,30)
1311# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
1312 movaps %xmm2, %xmm4
1313 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1314# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1315# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1316# same result as above, but shorter and faster:
1317# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1318# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1319 movaps %xmm3, %xmm5
1320 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1321 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1322 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1323 xorps %xmm5, %xmm3 # ^
1324 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1325 movaps %xmm3, %xmm5
1326 xorps %xmm4, %xmm4 # rol(W0,1):
1327 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1328 paddd %xmm3, %xmm3 # shift left by 1
1329 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
1330 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1331 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1332 movaps %xmm5, %xmm4
1333 pslld $2, %xmm5
1334 psrld $30, %xmm4
1335# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
1336 xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
1337 xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1338 movaps %xmm3, %xmm5
1339 paddd %xmm6, %xmm5
1340 movups %xmm5, -64+16*3(%rsp)
1097# 68 1341# 68
1098 movl -32+4*1(%rsp), %esi # W[(n+13) & 15]
1099 xorl %r12d, %esi # ^W[(n+8) & 15]
1100 xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15]
1101 xorl -32+4*4(%rsp), %esi # ^W[n & 15]
1102 roll %esi #
1103 movl %esi, -32+4*4(%rsp) # store to W[n & 15]
1104 movl %ebp, %edi # c 1342 movl %ebp, %edi # c
1105 xorl %eax, %edi # ^d 1343 xorl %eax, %edi # ^d
1106 xorl %edx, %edi # ^b 1344 xorl %edx, %edi # ^b
1107 leal -0x359D3E2A(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 1345 addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15]
1108 addl %edi, %ebx # e += (c ^ d ^ b) 1346 addl %edi, %ebx # e += (c ^ d ^ b)
1109 movl %ecx, %esi # 1347 movl %ecx, %esi #
1110 roll $5, %esi # rotl32(a,5) 1348 roll $5, %esi # rotl32(a,5)
1111 addl %esi, %ebx # e += rotl32(a,5) 1349 addl %esi, %ebx # e += rotl32(a,5)
1112 rorl $2, %edx # b = rotl32(b,30) 1350 rorl $2, %edx # b = rotl32(b,30)
1113# 69 1351# 69
1114 movl -32+4*2(%rsp), %esi # W[(n+13) & 15]
1115 xorl %r13d, %esi # ^W[(n+8) & 15]
1116 xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15]
1117 xorl -32+4*5(%rsp), %esi # ^W[n & 15]
1118 roll %esi #
1119 movl %esi, -32+4*5(%rsp) # store to W[n & 15]
1120 movl %edx, %edi # c 1352 movl %edx, %edi # c
1121 xorl %ebp, %edi # ^d 1353 xorl %ebp, %edi # ^d
1122 xorl %ecx, %edi # ^b 1354 xorl %ecx, %edi # ^b
1123 leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] 1355 addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15]
1124 addl %edi, %eax # e += (c ^ d ^ b) 1356 addl %edi, %eax # e += (c ^ d ^ b)
1125 movl %ebx, %esi # 1357 movl %ebx, %esi #
1126 roll $5, %esi # rotl32(a,5) 1358 roll $5, %esi # rotl32(a,5)
1127 addl %esi, %eax # e += rotl32(a,5) 1359 addl %esi, %eax # e += rotl32(a,5)
1128 rorl $2, %ecx # b = rotl32(b,30) 1360 rorl $2, %ecx # b = rotl32(b,30)
1129# 70 1361# 70
1130 movl -32+4*3(%rsp), %esi # W[(n+13) & 15]
1131 xorl %r14d, %esi # ^W[(n+8) & 15]
1132 xorl %r8d, %esi # ^W[(n+2) & 15]
1133 xorl -32+4*6(%rsp), %esi # ^W[n & 15]
1134 roll %esi #
1135 movl %esi, -32+4*6(%rsp) # store to W[n & 15]
1136 movl %ecx, %edi # c 1362 movl %ecx, %edi # c
1137 xorl %edx, %edi # ^d 1363 xorl %edx, %edi # ^d
1138 xorl %ebx, %edi # ^b 1364 xorl %ebx, %edi # ^b
1139 leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] 1365 addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15]
1140 addl %edi, %ebp # e += (c ^ d ^ b) 1366 addl %edi, %ebp # e += (c ^ d ^ b)
1141 movl %eax, %esi # 1367 movl %eax, %esi #
1142 roll $5, %esi # rotl32(a,5) 1368 roll $5, %esi # rotl32(a,5)
1143 addl %esi, %ebp # e += rotl32(a,5) 1369 addl %esi, %ebp # e += rotl32(a,5)
1144 rorl $2, %ebx # b = rotl32(b,30) 1370 rorl $2, %ebx # b = rotl32(b,30)
1145# 71 1371# 71
1146 movl -32+4*4(%rsp), %esi # W[(n+13) & 15]
1147 xorl %r15d, %esi # ^W[(n+8) & 15]
1148 xorl %r9d, %esi # ^W[(n+2) & 15]
1149 xorl -32+4*7(%rsp), %esi # ^W[n & 15]
1150 roll %esi #
1151 movl %esi, -32+4*7(%rsp) # store to W[n & 15]
1152 movl %ebx, %edi # c 1372 movl %ebx, %edi # c
1153 xorl %ecx, %edi # ^d 1373 xorl %ecx, %edi # ^d
1154 xorl %eax, %edi # ^b 1374 xorl %eax, %edi # ^b
1155 leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] 1375 addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15]
1156 addl %edi, %edx # e += (c ^ d ^ b) 1376 addl %edi, %edx # e += (c ^ d ^ b)
1157 movl %ebp, %esi # 1377 movl %ebp, %esi #
1158 roll $5, %esi # rotl32(a,5) 1378 roll $5, %esi # rotl32(a,5)
1159 addl %esi, %edx # e += rotl32(a,5) 1379 addl %esi, %edx # e += rotl32(a,5)
1160 rorl $2, %eax # b = rotl32(b,30) 1380 rorl $2, %eax # b = rotl32(b,30)
1161# 72 1381# 72
1162 xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15]
1163 xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15]
1164 xorl %r10d, %r8d # ^W[(n+2) & 15]
1165 roll %r8d #
1166 movl %eax, %edi # c 1382 movl %eax, %edi # c
1167 xorl %ebx, %edi # ^d 1383 xorl %ebx, %edi # ^d
1168 xorl %ebp, %edi # ^b 1384 xorl %ebp, %edi # ^b
1169 leal -0x359D3E2A(%rcx,%r8), %ecx # e += RCONST + W[n & 15] 1385 addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15]
1170 addl %edi, %ecx # e += (c ^ d ^ b) 1386 addl %edi, %ecx # e += (c ^ d ^ b)
1171 movl %edx, %esi # 1387 movl %edx, %esi #
1172 roll $5, %esi # rotl32(a,5) 1388 roll $5, %esi # rotl32(a,5)
1173 addl %esi, %ecx # e += rotl32(a,5) 1389 addl %esi, %ecx # e += rotl32(a,5)
1174 rorl $2, %ebp # b = rotl32(b,30) 1390 rorl $2, %ebp # b = rotl32(b,30)
1175# 73 1391# 73
1176 xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15]
1177 xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15]
1178 xorl %r11d, %r9d # ^W[(n+2) & 15]
1179 roll %r9d #
1180 movl %ebp, %edi # c 1392 movl %ebp, %edi # c
1181 xorl %eax, %edi # ^d 1393 xorl %eax, %edi # ^d
1182 xorl %edx, %edi # ^b 1394 xorl %edx, %edi # ^b
1183 leal -0x359D3E2A(%rbx,%r9), %ebx # e += RCONST + W[n & 15] 1395 addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15]
1184 addl %edi, %ebx # e += (c ^ d ^ b) 1396 addl %edi, %ebx # e += (c ^ d ^ b)
1185 movl %ecx, %esi # 1397 movl %ecx, %esi #
1186 roll $5, %esi # rotl32(a,5) 1398 roll $5, %esi # rotl32(a,5)
1187 addl %esi, %ebx # e += rotl32(a,5) 1399 addl %esi, %ebx # e += rotl32(a,5)
1188 rorl $2, %edx # b = rotl32(b,30) 1400 rorl $2, %edx # b = rotl32(b,30)
1189# 74 1401# 74
1190 xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15]
1191 xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15]
1192 xorl %r12d, %r10d # ^W[(n+2) & 15]
1193 roll %r10d #
1194 movl %edx, %edi # c 1402 movl %edx, %edi # c
1195 xorl %ebp, %edi # ^d 1403 xorl %ebp, %edi # ^d
1196 xorl %ecx, %edi # ^b 1404 xorl %ecx, %edi # ^b
1197 leal -0x359D3E2A(%rax,%r10), %eax # e += RCONST + W[n & 15] 1405 addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15]
1198 addl %edi, %eax # e += (c ^ d ^ b) 1406 addl %edi, %eax # e += (c ^ d ^ b)
1199 movl %ebx, %esi # 1407 movl %ebx, %esi #
1200 roll $5, %esi # rotl32(a,5) 1408 roll $5, %esi # rotl32(a,5)
1201 addl %esi, %eax # e += rotl32(a,5) 1409 addl %esi, %eax # e += rotl32(a,5)
1202 rorl $2, %ecx # b = rotl32(b,30) 1410 rorl $2, %ecx # b = rotl32(b,30)
1203# 75 1411# 75
1204 xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15]
1205 xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15]
1206 xorl %r13d, %r11d # ^W[(n+2) & 15]
1207 roll %r11d #
1208 movl %ecx, %edi # c 1412 movl %ecx, %edi # c
1209 xorl %edx, %edi # ^d 1413 xorl %edx, %edi # ^d
1210 xorl %ebx, %edi # ^b 1414 xorl %ebx, %edi # ^b
1211 leal -0x359D3E2A(%rbp,%r11), %ebp # e += RCONST + W[n & 15] 1415 addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15]
1212 addl %edi, %ebp # e += (c ^ d ^ b) 1416 addl %edi, %ebp # e += (c ^ d ^ b)
1213 movl %eax, %esi # 1417 movl %eax, %esi #
1214 roll $5, %esi # rotl32(a,5) 1418 roll $5, %esi # rotl32(a,5)
1215 addl %esi, %ebp # e += rotl32(a,5) 1419 addl %esi, %ebp # e += rotl32(a,5)
1216 rorl $2, %ebx # b = rotl32(b,30) 1420 rorl $2, %ebx # b = rotl32(b,30)
1217# 76 1421# 76
1218 xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15]
1219 xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15]
1220 xorl %r14d, %r12d # ^W[(n+2) & 15]
1221 roll %r12d #
1222 movl %ebx, %edi # c 1422 movl %ebx, %edi # c
1223 xorl %ecx, %edi # ^d 1423 xorl %ecx, %edi # ^d
1224 xorl %eax, %edi # ^b 1424 xorl %eax, %edi # ^b
1225 leal -0x359D3E2A(%rdx,%r12), %edx # e += RCONST + W[n & 15] 1425 addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15]
1226 addl %edi, %edx # e += (c ^ d ^ b) 1426 addl %edi, %edx # e += (c ^ d ^ b)
1227 movl %ebp, %esi # 1427 movl %ebp, %esi #
1228 roll $5, %esi # rotl32(a,5) 1428 roll $5, %esi # rotl32(a,5)
1229 addl %esi, %edx # e += rotl32(a,5) 1429 addl %esi, %edx # e += rotl32(a,5)
1230 rorl $2, %eax # b = rotl32(b,30) 1430 rorl $2, %eax # b = rotl32(b,30)
1231# 77 1431# 77
1232 xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15]
1233 xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15]
1234 xorl %r15d, %r13d # ^W[(n+2) & 15]
1235 roll %r13d #
1236 movl %eax, %edi # c 1432 movl %eax, %edi # c
1237 xorl %ebx, %edi # ^d 1433 xorl %ebx, %edi # ^d
1238 xorl %ebp, %edi # ^b 1434 xorl %ebp, %edi # ^b
1239 leal -0x359D3E2A(%rcx,%r13), %ecx # e += RCONST + W[n & 15] 1435 addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15]
1240 addl %edi, %ecx # e += (c ^ d ^ b) 1436 addl %edi, %ecx # e += (c ^ d ^ b)
1241 movl %edx, %esi # 1437 movl %edx, %esi #
1242 roll $5, %esi # rotl32(a,5) 1438 roll $5, %esi # rotl32(a,5)
1243 addl %esi, %ecx # e += rotl32(a,5) 1439 addl %esi, %ecx # e += rotl32(a,5)
1244 rorl $2, %ebp # b = rotl32(b,30) 1440 rorl $2, %ebp # b = rotl32(b,30)
1245# 78 1441# 78
1246 xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15]
1247 xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15]
1248 xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15]
1249 roll %r14d #
1250 movl %ebp, %edi # c 1442 movl %ebp, %edi # c
1251 xorl %eax, %edi # ^d 1443 xorl %eax, %edi # ^d
1252 xorl %edx, %edi # ^b 1444 xorl %edx, %edi # ^b
1253 leal -0x359D3E2A(%rbx,%r14), %ebx # e += RCONST + W[n & 15] 1445 addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15]
1254 addl %edi, %ebx # e += (c ^ d ^ b) 1446 addl %edi, %ebx # e += (c ^ d ^ b)
1255 movl %ecx, %esi # 1447 movl %ecx, %esi #
1256 roll $5, %esi # rotl32(a,5) 1448 roll $5, %esi # rotl32(a,5)
1257 addl %esi, %ebx # e += rotl32(a,5) 1449 addl %esi, %ebx # e += rotl32(a,5)
1258 rorl $2, %edx # b = rotl32(b,30) 1450 rorl $2, %edx # b = rotl32(b,30)
1259# 79 1451# 79
1260 xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15]
1261 xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15]
1262 xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15]
1263 roll %r15d #
1264 movl %edx, %edi # c 1452 movl %edx, %edi # c
1265 xorl %ebp, %edi # ^d 1453 xorl %ebp, %edi # ^d
1266 xorl %ecx, %edi # ^b 1454 xorl %ecx, %edi # ^b
1267 leal -0x359D3E2A(%rax,%r15), %eax # e += RCONST + W[n & 15] 1455 addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15]
1268 addl %edi, %eax # e += (c ^ d ^ b) 1456 addl %edi, %eax # e += (c ^ d ^ b)
1269 movl %ebx, %esi # 1457 movl %ebx, %esi #
1270 roll $5, %esi # rotl32(a,5) 1458 roll $5, %esi # rotl32(a,5)
@@ -1278,7 +1466,7 @@ sha1_process_block64:
1278 addl %ebx, 84(%rdi) # ctx->hash[1] += b 1466 addl %ebx, 84(%rdi) # ctx->hash[1] += b
1279 popq %r14 # 1467 popq %r14 #
1280 addl %ecx, 88(%rdi) # ctx->hash[2] += c 1468 addl %ecx, 88(%rdi) # ctx->hash[2] += c
1281 popq %r15 # 1469# popq %r15 #
1282 addl %edx, 92(%rdi) # ctx->hash[3] += d 1470 addl %edx, 92(%rdi) # ctx->hash[3] += d
1283 popq %rbx # 1471 popq %rbx #
1284 addl %ebp, 96(%rdi) # ctx->hash[4] += e 1472 addl %ebp, 96(%rdi) # ctx->hash[4] += e
@@ -1286,4 +1474,13 @@ sha1_process_block64:
1286 1474
1287 ret 1475 ret
1288 .size sha1_process_block64, .-sha1_process_block64 1476 .size sha1_process_block64, .-sha1_process_block64
1477
1478 .section .rodata.cst16.sha1const, "aM", @progbits, 16
1479 .balign 16
1480sha1const:
1481 .long 0x5A827999
1482 .long 0x6ED9EBA1
1483 .long 0x8F1BBCDC
1484 .long 0xCA62C1D6
1485
1289#endif 1486#endif
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 901896e6e..a10ac411d 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -6,33 +6,104 @@
6# also contains the diff of the generated file. 6# also contains the diff of the generated file.
7exec >hash_md5_sha_x86-64.S 7exec >hash_md5_sha_x86-64.S
8 8
9# There is a way to use XMM registers (which always exist for x86-64!) for W[] 9# Based on http://arctic.org/~dean/crypto/sha1.html.
10# For example, if we load W as follows: 10# ("This SHA1 implementation is public domain.")
11# %xmm0: w[0x0] w[0x1] w[0x2] w[0x3] 11#
12# %xmm4: w[0x4] w[0x5] w[0x6] w[0x7] 12# x86-64 has at least SSE2 vector insns always available.
13# %xmm8: w[0x8] w[0x9] w[0xa] w[0xb] 13# We can use them without any CPUID checks (and without a need
14# %xmm12: w[0xc] w[0xd] w[0xe] w[0xf] 14# for a fallback code if needed insns are not available).
15# then the xor'ing operation to generate next W[0..3] is: 15# This code uses them to calculate W[] ahead of time.
16# movaps %xmm0, %xmmT2 16#
17# palignr $0x8, %xmm4, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5]) 17# Unfortunately, results are passed from vector unit to
18# # Right-shifts xmm4:xmmT2 by 8 bytes. Writes shifted result to xmmT2. SSSE3 insn. 18# integer ALUs on the stack. MOVD/Q insns to move them directly
19# movaps %xmm0, %xmmT13 19# from vector to integer registers are slower than store-to-load
20# palignr $0x4,%xmm0,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0]) 20# forwarding in LSU (on Skylake at least).
21# xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13 21#
22# xmm0 = rol32(xmm0,1) # no such insn, have to use pslld+psrld+or 22# The win against a purely integer code is small on Skylake,
23# and then results can be extracted for use: 23# only about 7-8%. We offload about 1/3 of our operations to the vector unit.
24# movd %xmm0, %esi # new W[0] 24# It can do 4 ops at once in one 128-bit register,
25# pextrd $1, %xmm0, %esi # new W[1] 25# but we have to use x2 of them because of W[0] complication,
26# # SSE4.1 insn. Can use EXTRACTPS (also SSE4.1) 26# SSE2 has no "rotate each word by N bits" insns,
27# pextrd $2, %xmm0, %esi # new W[2] 27# moving data to/from vector unit is clunky, and Skylake
28# pextrd $3, %xmm0, %esi # new W[3] 28# has four integer ALUs unified with three vector ALUs,
29# ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64. 29# which makes pure integer code rather fast, and makes
30# vector ops compete with integer ones.
31#
32# Zen3, with its separate vector ALUs, wins more, about 12%.
33
34xmmT1="%xmm4"
35xmmT2="%xmm5"
36xmmRCONST="%xmm6"
37xmmALLRCONST="%xmm7"
38T=`printf '\t'`
39
40# SSE instructions are longer than 4 bytes on average.
41# Intel CPUs (up to Tiger Lake at least) can't decode
42# more than 16 bytes of code in one cycle.
43# By interleaving SSE code and integer code
44# we mostly achieve a situation where 16-byte decode fetch window
45# contains 4 (or more) insns.
46#
47# However. On Skylake, there was no observed difference,
48# but on Zen3, non-interleaved code is ~3% faster
49# (822 Mb/s versus 795 Mb/s hashing speed).
50# Off for now:
51interleave=false
52
53INTERLEAVE() {
54 $interleave || \
55 {
56 # Generate non-interleaved code
57 # (it should work correctly too)
58 echo "$1"
59 echo "$2"
60 return
61 }
62 (
63 echo "$1" | grep -v '^$' >"$0.temp1"
64 echo "$2" | grep -v '^$' >"$0.temp2"
65 exec 3<"$0.temp1"
66 exec 4<"$0.temp2"
67 IFS=''
68 while :; do
69 line1=''
70 line2=''
71 while :; do
72 read -r line1 <&3
73 if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then
74 break
75 fi
76 echo "$line1"
77 done
78 while :; do
79 read -r line2 <&4
80 if test "${line2:0:4}" = "${T}lea"; then
81 # We use 7-8 byte long forms of LEA.
82 # Do not interleave them with SSE insns
83 # which are also long.
84 echo "$line2"
85 read -r line2 <&4
86 echo "$line2"
87 continue
88 fi
89 if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then
90 break
91 fi
92 echo "$line2"
93 done
94 test "$line1$line2" || break
95 echo "$line1"
96 echo "$line2"
97 done
98 rm "$0.temp1" "$0.temp2"
99 )
100}
30 101
31echo \ 102echo \
32'### Generated by hash_md5_sha_x86-64.S.sh ### 103"### Generated by hash_md5_sha_x86-64.S.sh ###
33 104
34#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) 105#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
35 .section .text.sha1_process_block64,"ax",@progbits 106 .section .text.sha1_process_block64, \"ax\", @progbits
36 .globl sha1_process_block64 107 .globl sha1_process_block64
37 .hidden sha1_process_block64 108 .hidden sha1_process_block64
38 .type sha1_process_block64, @function 109 .type sha1_process_block64, @function
@@ -41,7 +112,7 @@ echo \
41sha1_process_block64: 112sha1_process_block64:
42 pushq %rbp # 1 byte insn 113 pushq %rbp # 1 byte insn
43 pushq %rbx # 1 byte insn 114 pushq %rbx # 1 byte insn
44 pushq %r15 # 2 byte insn 115# pushq %r15 # 2 byte insn
45 pushq %r14 # 2 byte insn 116 pushq %r14 # 2 byte insn
46 pushq %r13 # 2 byte insn 117 pushq %r13 # 2 byte insn
47 pushq %r12 # 2 byte insn 118 pushq %r12 # 2 byte insn
@@ -50,17 +121,13 @@ sha1_process_block64:
50#Register and stack use: 121#Register and stack use:
51# eax..edx: a..d 122# eax..edx: a..d
52# ebp: e 123# ebp: e
53# esi,edi: temps 124# esi,edi,r8..r14: temps
54# -32+4*n(%rsp),r8...r15: W[0..7,8..15] 125# r15: unused
55# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) 126# xmm0..xmm3: W[]
56 movl $3, %eax 127# xmm4,xmm5: temps
571: 128# xmm6: current round constant
58 movq (%rdi,%rax,8), %rsi 129# xmm7: all round constants
59 bswapq %rsi 130# -64(%rsp): area for passing RCONST + W[] from vector to integer units
60 rolq $32, %rsi
61 movq %rsi, -32(%rsp,%rax,8)
62 decl %eax
63 jns 1b
64 131
65 movl 80(%rdi), %eax # a = ctx->hash[0] 132 movl 80(%rdi), %eax # a = ctx->hash[0]
66 movl 84(%rdi), %ebx # b = ctx->hash[1] 133 movl 84(%rdi), %ebx # b = ctx->hash[1]
@@ -68,32 +135,123 @@ sha1_process_block64:
68 movl 92(%rdi), %edx # d = ctx->hash[3] 135 movl 92(%rdi), %edx # d = ctx->hash[3]
69 movl 96(%rdi), %ebp # e = ctx->hash[4] 136 movl 96(%rdi), %ebp # e = ctx->hash[4]
70 137
71 movq 4*8(%rdi), %r8 138 movaps sha1const(%rip), $xmmALLRCONST
72 movq 4*10(%rdi), %r10 139 pshufd \$0x00, $xmmALLRCONST, $xmmRCONST
140
141 # Load W[] to xmm registers, byteswapping on the fly.
142 #
143 # For iterations 0..15, we pass W[] in rsi,r8..r14
144 # for use in RD1As instead of spilling them to stack.
145 # We lose parallelized addition of RCONST, but LEA
146 # can do two additions at once, so it is probably a wash.
147 # (We use rsi instead of rN because this makes two
148 # LEAs in two first RD1As shorter by one byte).
149 movq 4*0(%rdi), %rsi
150 movq 4*2(%rdi), %r8
151 bswapq %rsi
73 bswapq %r8 152 bswapq %r8
153 rolq \$32, %rsi # rsi = W[1]:W[0]
154 rolq \$32, %r8 # r8 = W[3]:W[2]
155 movq %rsi, %xmm0
156 movq %r8, $xmmT1
157 punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
158# movaps %xmm0, $xmmT1 # add RCONST, spill to stack
159# paddd $xmmRCONST, $xmmT1
160# movups $xmmT1, -64+16*0(%rsp)
161
162 movq 4*4(%rdi), %r9
163 movq 4*6(%rdi), %r10
164 bswapq %r9
74 bswapq %r10 165 bswapq %r10
75 movq 4*12(%rdi), %r12 166 rolq \$32, %r9 # r9 = W[5]:W[4]
76 movq 4*14(%rdi), %r14 167 rolq \$32, %r10 # r10 = W[7]:W[6]
168 movq %r9, %xmm1
169 movq %r10, $xmmT1
170 punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
171
172 movq 4*8(%rdi), %r11
173 movq 4*10(%rdi), %r12
174 bswapq %r11
77 bswapq %r12 175 bswapq %r12
176 rolq \$32, %r11 # r11 = W[9]:W[8]
177 rolq \$32, %r12 # r12 = W[11]:W[10]
178 movq %r11, %xmm2
179 movq %r12, $xmmT1
180 punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
181
182 movq 4*12(%rdi), %r13
183 movq 4*14(%rdi), %r14
184 bswapq %r13
78 bswapq %r14 185 bswapq %r14
79 movl %r8d, %r9d 186 rolq \$32, %r13 # r13 = W[13]:W[12]
80 shrq $32, %r8 187 rolq \$32, %r14 # r14 = W[15]:W[14]
81 movl %r10d, %r11d 188 movq %r13, %xmm3
82 shrq $32, %r10 189 movq %r14, $xmmT1
83 movl %r12d, %r13d 190 punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
84 shrq $32, %r12 191"
85 movl %r14d, %r15d 192
86 shrq $32, %r14 193PREP() {
87' 194local xmmW0=$1
88W32() { 195local xmmW4=$2
89test "$1" || exit 1 196local xmmW8=$3
90test "$1" -lt 0 && exit 1 197local xmmW12=$4
91test "$1" -gt 15 && exit 1 198# the above must be %xmm0..3 in some permutation
92test "$1" -lt 8 && echo "-32+4*$1(%rsp)" 199local dstmem=$5
93test "$1" -ge 8 && echo "%r${1}d" 200#W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1);
201#W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1);
202#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1);
203#W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1);
204#W[3] ^= rol(W[0], 1);
205echo "# PREP $@
206 movaps $xmmW12, $xmmT1
207 psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
208
209# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
210# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
211# same result as above, but shorter and faster:
212# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
213# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
214 movaps $xmmW0, $xmmT2
215 shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
216
217 xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
218 xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
219 xorps $xmmT2, $xmmW0 # ^
220 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
221 movaps $xmmW0, $xmmT2
222
223 xorps $xmmT1, $xmmT1 # rol(W0,1):
224 pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1)
225 paddd $xmmW0, $xmmW0 # shift left by 1
226 psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1
227 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
228
229 pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
230 movaps $xmmT2, $xmmT1
231 pslld \$2, $xmmT2
232 psrld \$30, $xmmT1
233# xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2)
234 xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2
235
236 xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
237"
238# movq $xmmW0, %r8 # high latency (~6 cycles)
239# movaps $xmmW0, $xmmT1
240# psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower
241# movq $xmmT1, %r10 # high latency
242# movq %r8, %r9
243# movq %r10, %r11
244# shrq \$32, %r9
245# shrq \$32, %r11
246# ^^^ slower than passing the results on stack (!!!)
247echo "
248 movaps $xmmW0, $xmmT2
249 paddd $xmmRCONST, $xmmT2
250 movups $xmmT2, $dstmem
251"
94} 252}
95 253
96# It's possible to interleave insns in rounds to mostly eliminate 254# It's possible to interleave integer insns in rounds to mostly eliminate
97# dependency chains, but this likely to only help old Pentium-based 255# dependency chains, but this likely to only help old Pentium-based
98# CPUs (ones without OOO, which can only simultaneously execute a pair 256# CPUs (ones without OOO, which can only simultaneously execute a pair
99# of _adjacent_ insns). 257# of _adjacent_ insns).
@@ -104,28 +262,28 @@ RD1A() {
104local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 262local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
105local n=$(($6)) 263local n=$(($6))
106local n0=$(((n+0) & 15)) 264local n0=$(((n+0) & 15))
265local rN=$((7+n0/2))
107echo " 266echo "
108# $n 267# $n
109";test $n0 = 0 && echo " 268";test $n0 = 0 && echo "
110 # W[0], already in %esi 269 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
111";test $n0 != 0 && test $n0 -lt 8 && echo " 270 shrq \$32, %rsi
112 movl `W32 $n0`, %esi # W[n] 271";test $n0 = 1 && echo "
113";test $n0 -ge 8 && echo " 272 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
114 # W[n], in %r$n0 273";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "
274 leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
275 shrq \$32, %r$rN
276";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "
277 leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
115";echo " 278";echo "
116 movl %e$c, %edi # c 279 movl %e$c, %edi # c
117 xorl %e$d, %edi # ^d 280 xorl %e$d, %edi # ^d
118 andl %e$b, %edi # &b 281 andl %e$b, %edi # &b
119 xorl %e$d, %edi # (((c ^ d) & b) ^ d) 282 xorl %e$d, %edi # (((c ^ d) & b) ^ d)
120";test $n0 -lt 8 && echo "
121 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
122";test $n0 -ge 8 && echo "
123 leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n]
124";echo "
125 addl %edi, %e$e # e += (((c ^ d) & b) ^ d) 283 addl %edi, %e$e # e += (((c ^ d) & b) ^ d)
126 movl %e$a, %esi # 284 movl %e$a, %edi #
127 roll \$5, %esi # rotl32(a,5) 285 roll \$5, %edi # rotl32(a,5)
128 addl %esi, %e$e # e += rotl32(a,5) 286 addl %edi, %e$e # e += rotl32(a,5)
129 rorl \$2, %e$b # b = rotl32(b,30) 287 rorl \$2, %e$b # b = rotl32(b,30)
130" 288"
131} 289}
@@ -138,28 +296,11 @@ local n2=$(((n+2) & 15))
138local n0=$(((n+0) & 15)) 296local n0=$(((n+0) & 15))
139echo " 297echo "
140# $n 298# $n
141";test $n0 -lt 8 && echo "
142 movl `W32 $n13`, %esi # W[(n+13) & 15]
143 xorl `W32 $n8`, %esi # ^W[(n+8) & 15]
144 xorl `W32 $n2`, %esi # ^W[(n+2) & 15]
145 xorl `W32 $n0`, %esi # ^W[n & 15]
146 roll %esi #
147 movl %esi, `W32 $n0` # store to W[n & 15]
148";test $n0 -ge 8 && echo "
149 xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15]
150 xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15]
151 xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15]
152 roll `W32 $n0` #
153";echo "
154 movl %e$c, %edi # c 299 movl %e$c, %edi # c
155 xorl %e$d, %edi # ^d 300 xorl %e$d, %edi # ^d
156 andl %e$b, %edi # &b 301 andl %e$b, %edi # &b
157 xorl %e$d, %edi # (((c ^ d) & b) ^ d) 302 xorl %e$d, %edi # (((c ^ d) & b) ^ d)
158";test $n0 -lt 8 && echo " 303 addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15]
159 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
160";test $n0 -ge 8 && echo "
161 leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
162";echo "
163 addl %edi, %e$e # e += (((c ^ d) & b) ^ d) 304 addl %edi, %e$e # e += (((c ^ d) & b) ^ d)
164 movl %e$a, %esi # 305 movl %e$a, %esi #
165 roll \$5, %esi # rotl32(a,5) 306 roll \$5, %esi # rotl32(a,5)
@@ -167,13 +308,6 @@ echo "
167 rorl \$2, %e$b # b = rotl32(b,30) 308 rorl \$2, %e$b # b = rotl32(b,30)
168" 309"
169} 310}
170{
171RCONST=0x5A827999
172RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; RD1A bx cx dx bp ax 4
173RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9
174RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14
175RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19
176} | grep -v '^$'
177 311
178RD2() { 312RD2() {
179local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 313local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
@@ -184,27 +318,10 @@ local n2=$(((n+2) & 15))
184local n0=$(((n+0) & 15)) 318local n0=$(((n+0) & 15))
185echo " 319echo "
186# $n 320# $n
187";test $n0 -lt 8 && echo "
188 movl `W32 $n13`, %esi # W[(n+13) & 15]
189 xorl `W32 $n8`, %esi # ^W[(n+8) & 15]
190 xorl `W32 $n2`, %esi # ^W[(n+2) & 15]
191 xorl `W32 $n0`, %esi # ^W[n & 15]
192 roll %esi #
193 movl %esi, `W32 $n0` # store to W[n & 15]
194";test $n0 -ge 8 && echo "
195 xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15]
196 xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15]
197 xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15]
198 roll `W32 $n0` #
199";echo "
200 movl %e$c, %edi # c 321 movl %e$c, %edi # c
201 xorl %e$d, %edi # ^d 322 xorl %e$d, %edi # ^d
202 xorl %e$b, %edi # ^b 323 xorl %e$b, %edi # ^b
203";test $n0 -lt 8 && echo " 324 addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15]
204 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
205";test $n0 -ge 8 && echo "
206 leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
207";echo "
208 addl %edi, %e$e # e += (c ^ d ^ b) 325 addl %edi, %e$e # e += (c ^ d ^ b)
209 movl %e$a, %esi # 326 movl %e$a, %esi #
210 roll \$5, %esi # rotl32(a,5) 327 roll \$5, %esi # rotl32(a,5)
@@ -212,13 +329,6 @@ echo "
212 rorl \$2, %e$b # b = rotl32(b,30) 329 rorl \$2, %e$b # b = rotl32(b,30)
213" 330"
214} 331}
215{
216RCONST=0x6ED9EBA1
217RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24
218RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29
219RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34
220RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39
221} | grep -v '^$'
222 332
223RD3() { 333RD3() {
224local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 334local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
@@ -235,53 +345,82 @@ echo "
235 andl %e$c, %esi # si: b & c 345 andl %e$c, %esi # si: b & c
236 andl %e$d, %edi # di: (b | c) & d 346 andl %e$d, %edi # di: (b | c) & d
237 orl %esi, %edi # ((b | c) & d) | (b & c) 347 orl %esi, %edi # ((b | c) & d) | (b & c)
238";test $n0 -lt 8 && echo "
239 movl `W32 $n13`, %esi # W[(n+13) & 15]
240 xorl `W32 $n8`, %esi # ^W[(n+8) & 15]
241 xorl `W32 $n2`, %esi # ^W[(n+2) & 15]
242 xorl `W32 $n0`, %esi # ^W[n & 15]
243 roll %esi #
244 movl %esi, `W32 $n0` # store to W[n & 15]
245";test $n0 -ge 8 && echo "
246 xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15]
247 xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15]
248 xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15]
249 roll `W32 $n0` #
250";echo "
251 addl %edi, %e$e # += ((b | c) & d) | (b & c) 348 addl %edi, %e$e # += ((b | c) & d) | (b & c)
252";test $n0 -lt 8 && echo " 349 addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15]
253 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
254";test $n0 -ge 8 && echo "
255 leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
256";echo "
257 movl %e$a, %esi # 350 movl %e$a, %esi #
258 roll \$5, %esi # rotl32(a,5) 351 roll \$5, %esi # rotl32(a,5)
259 addl %esi, %e$e # e += rotl32(a,5) 352 addl %esi, %e$e # e += rotl32(a,5)
260 rorl \$2, %e$b # b = rotl32(b,30) 353 rorl \$2, %e$b # b = rotl32(b,30)
261" 354"
262} 355}
356
263{ 357{
264#RCONST=0x8F1BBCDC "out of range for signed 32bit displacement" 358# Round 1
265RCONST=-0x70E44324 359RCONST=0x5A827999
266RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44 360RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3;
267RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49 361RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7;
268RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54 362a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
269RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59 363b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;`
270} | grep -v '^$' 364INTERLEAVE "$a" "$b"
365a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST"
366 PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
367b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;`
368INTERLEAVE "$a" "$b"
369a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
370b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;`
371INTERLEAVE "$a" "$b"
372
373# Round 2
374RCONST=0x6ED9EBA1
375a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
376b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;`
377INTERLEAVE "$a" "$b"
378a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
379b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;`
380INTERLEAVE "$a" "$b"
381a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
382b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;`
383INTERLEAVE "$a" "$b"
384a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST"
385 PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
386b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;`
387INTERLEAVE "$a" "$b"
388a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
389b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;`
390INTERLEAVE "$a" "$b"
391
392# Round 3
393RCONST=0x8F1BBCDC
394a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
395b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;`
396INTERLEAVE "$a" "$b"
397a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
398b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;`
399INTERLEAVE "$a" "$b"
400a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
401b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;`
402INTERLEAVE "$a" "$b"
403a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST"
404 PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
405b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;`
406INTERLEAVE "$a" "$b"
407a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
408b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;`
409INTERLEAVE "$a" "$b"
271 410
272# Round 4 has the same logic as round 2, only n and RCONST are different 411# Round 4 has the same logic as round 2, only n and RCONST are different
273{ 412RCONST=0xCA62C1D6
274#RCONST=0xCA62C1D6 "out of range for signed 32bit displacement" 413a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
275RCONST=-0x359D3E2A 414b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;`
276RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64 415INTERLEAVE "$a" "$b"
277RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69 416a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
278RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74 417b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;`
279RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79 418INTERLEAVE "$a" "$b"
280# Note: new W[n&15] values generated in last 3 iterations 419a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
281# (W[13,14,15]) are unused after each of these iterations. 420b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;`
282# Since we use r8..r15 for W[8..15], this does not matter. 421INTERLEAVE "$a" "$b"
283# If we switch to e.g. using r8..r15 for W[0..7], then saving of W[13,14,15] 422RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75;
284# (the "movl %esi, `W32 $n0`" insn) is a dead store and can be removed. 423RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79;
285} | grep -v '^$' 424} | grep -v '^$'
286 425
287echo " 426echo "
@@ -292,7 +431,7 @@ echo "
292 addl %ebx, 84(%rdi) # ctx->hash[1] += b 431 addl %ebx, 84(%rdi) # ctx->hash[1] += b
293 popq %r14 # 432 popq %r14 #
294 addl %ecx, 88(%rdi) # ctx->hash[2] += c 433 addl %ecx, 88(%rdi) # ctx->hash[2] += c
295 popq %r15 # 434# popq %r15 #
296 addl %edx, 92(%rdi) # ctx->hash[3] += d 435 addl %edx, 92(%rdi) # ctx->hash[3] += d
297 popq %rbx # 436 popq %rbx #
298 addl %ebp, 96(%rdi) # ctx->hash[4] += e 437 addl %ebp, 96(%rdi) # ctx->hash[4] += e
@@ -300,4 +439,13 @@ echo "
300 439
301 ret 440 ret
302 .size sha1_process_block64, .-sha1_process_block64 441 .size sha1_process_block64, .-sha1_process_block64
442
443 .section .rodata.cst16.sha1const, \"aM\", @progbits, 16
444 .balign 16
445sha1const:
446 .long 0x5A827999
447 .long 0x6ED9EBA1
448 .long 0x8F1BBCDC
449 .long 0xCA62C1D6
450
303#endif" 451#endif"
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
index 33cc3bf7f..b32029360 100644
--- a/libbb/hash_md5_sha_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -20,7 +20,7 @@
20#define extr128_32 pextrd 20#define extr128_32 pextrd
21//#define extr128_32 extractps # not shorter 21//#define extr128_32 extractps # not shorter
22 22
23 .section .text.sha1_process_block64_shaNI,"ax",@progbits 23 .section .text.sha1_process_block64_shaNI, "ax", @progbits
24 .globl sha1_process_block64_shaNI 24 .globl sha1_process_block64_shaNI
25 .hidden sha1_process_block64_shaNI 25 .hidden sha1_process_block64_shaNI
26 .type sha1_process_block64_shaNI, @function 26 .type sha1_process_block64_shaNI, @function
@@ -32,41 +32,42 @@
32#define MSG1 %xmm4 32#define MSG1 %xmm4
33#define MSG2 %xmm5 33#define MSG2 %xmm5
34#define MSG3 %xmm6 34#define MSG3 %xmm6
35#define SHUF_MASK %xmm7
36 35
37 .balign 8 # allow decoders to fetch at least 2 first insns 36 .balign 8 # allow decoders to fetch at least 2 first insns
38sha1_process_block64_shaNI: 37sha1_process_block64_shaNI:
39 /* load initial hash values */ 38 /* load initial hash values */
40
41 xor128 E0, E0
42 movu128 80(%rdi), ABCD 39 movu128 80(%rdi), ABCD
40 xor128 E0, E0
43 pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word 41 pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word
44 shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD 42 shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD
45 43
46 mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK 44 mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7
45
46 movu128 0*16(%rdi), MSG0
47 pshufb %xmm7, MSG0
48 movu128 1*16(%rdi), MSG1
49 pshufb %xmm7, MSG1
50 movu128 2*16(%rdi), MSG2
51 pshufb %xmm7, MSG2
52 movu128 3*16(%rdi), MSG3
53 pshufb %xmm7, MSG3
47 54
48 /* Save hash values for addition after rounds */ 55 /* Save hash values for addition after rounds */
49 mova128 E0, %xmm9 56 mova128 E0, %xmm7
50 mova128 ABCD, %xmm8 57 mova128 ABCD, %xmm8
51 58
52 /* Rounds 0-3 */ 59 /* Rounds 0-3 */
53 movu128 0*16(%rdi), MSG0
54 pshufb SHUF_MASK, MSG0
55 paddd MSG0, E0 60 paddd MSG0, E0
56 mova128 ABCD, E1 61 mova128 ABCD, E1
57 sha1rnds4 $0, E0, ABCD 62 sha1rnds4 $0, E0, ABCD
58 63
59 /* Rounds 4-7 */ 64 /* Rounds 4-7 */
60 movu128 1*16(%rdi), MSG1
61 pshufb SHUF_MASK, MSG1
62 sha1nexte MSG1, E1 65 sha1nexte MSG1, E1
63 mova128 ABCD, E0 66 mova128 ABCD, E0
64 sha1rnds4 $0, E1, ABCD 67 sha1rnds4 $0, E1, ABCD
65 sha1msg1 MSG1, MSG0 68 sha1msg1 MSG1, MSG0
66 69
67 /* Rounds 8-11 */ 70 /* Rounds 8-11 */
68 movu128 2*16(%rdi), MSG2
69 pshufb SHUF_MASK, MSG2
70 sha1nexte MSG2, E0 71 sha1nexte MSG2, E0
71 mova128 ABCD, E1 72 mova128 ABCD, E1
72 sha1rnds4 $0, E0, ABCD 73 sha1rnds4 $0, E0, ABCD
@@ -74,8 +75,6 @@ sha1_process_block64_shaNI:
74 xor128 MSG2, MSG0 75 xor128 MSG2, MSG0
75 76
76 /* Rounds 12-15 */ 77 /* Rounds 12-15 */
77 movu128 3*16(%rdi), MSG3
78 pshufb SHUF_MASK, MSG3
79 sha1nexte MSG3, E1 78 sha1nexte MSG3, E1
80 mova128 ABCD, E0 79 mova128 ABCD, E0
81 sha1msg2 MSG3, MSG0 80 sha1msg2 MSG3, MSG0
@@ -206,7 +205,7 @@ sha1_process_block64_shaNI:
206 sha1rnds4 $3, E1, ABCD 205 sha1rnds4 $3, E1, ABCD
207 206
208 /* Add current hash values with previously saved */ 207 /* Add current hash values with previously saved */
209 sha1nexte %xmm9, E0 208 sha1nexte %xmm7, E0
210 paddd %xmm8, ABCD 209 paddd %xmm8, ABCD
211 210
212 /* Write hash values back in the correct order */ 211 /* Write hash values back in the correct order */
@@ -217,8 +216,8 @@ sha1_process_block64_shaNI:
217 ret 216 ret
218 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI 217 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
219 218
220.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 219 .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
221.align 16 220 .balign 16
222PSHUFFLE_BYTE_FLIP_MASK: 221PSHUFFLE_BYTE_FLIP_MASK:
223 .octa 0x000102030405060708090a0b0c0d0e0f 222 .octa 0x000102030405060708090a0b0c0d0e0f
224 223
diff --git a/libbb/lineedit.c b/libbb/lineedit.c
index 8abc87976..778511d16 100644
--- a/libbb/lineedit.c
+++ b/libbb/lineedit.c
@@ -2274,17 +2274,41 @@ static int lineedit_read_key(char *read_key_buffer, int timeout)
2274#endif 2274#endif
2275 2275
2276 fflush_all(); 2276 fflush_all();
2277 while (1) { 2277 for (;;) {
2278 /* Wait for input. TIMEOUT = -1 makes read_key wait even 2278 /* Wait for input. TIMEOUT = -1 makes read_key wait even
2279 * on nonblocking stdin, TIMEOUT = 50 makes sure we won't 2279 * on nonblocking stdin, TIMEOUT = 50 makes sure we won't
2280 * insist on full MB_CUR_MAX buffer to declare input like 2280 * insist on full MB_CUR_MAX buffer to declare input like
2281 * "\xff\n",pause,"ls\n" invalid and thus won't lose "ls". 2281 * "\xff\n",pause,"ls\n" invalid and thus won't lose "ls".
2282 * 2282 *
2283 * If LI_INTERRUPTIBLE, return -1 if got EINTR in poll()
2284 * inside read_key, or if bb_got_signal != 0 (IOW: if signal
2285 * arrived before poll() is reached).
2286 *
2283 * Note: read_key sets errno to 0 on success. 2287 * Note: read_key sets errno to 0 on success.
2284 */ 2288 */
2285 IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;) 2289 for (;;) {
2286 ic = read_key(STDIN_FILENO, read_key_buffer, timeout); 2290 if ((state->flags & LI_INTERRUPTIBLE) && bb_got_signal) {
2287 IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;) 2291 errno = EINTR;
2292 return -1;
2293 }
2294//FIXME: still races here with signals, but small window to poll() inside read_key
2295 IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;)
2296 /* errno = 0; - read_key does this itself */
2297 ic = read_key(STDIN_FILENO, read_key_buffer, timeout);
2298 IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;)
2299 if (errno != EINTR)
2300 break;
2301 if (state->flags & LI_INTERRUPTIBLE) {
2302 /* LI_INTERRUPTIBLE bails out on EINTR,
2303 * but nothing really guarantees that bb_got_signal
2304 * is nonzero. Follow the least surprise principle:
2305 */
2306 if (bb_got_signal == 0)
2307 bb_got_signal = 255;
2308 goto ret;
2309 }
2310 }
2311
2288 if (errno) { 2312 if (errno) {
2289#if ENABLE_UNICODE_SUPPORT 2313#if ENABLE_UNICODE_SUPPORT
2290 if (errno == EAGAIN && unicode_idx != 0) 2314 if (errno == EAGAIN && unicode_idx != 0)
@@ -2352,7 +2376,7 @@ static int lineedit_read_key(char *read_key_buffer, int timeout)
2352#endif 2376#endif
2353 break; 2377 break;
2354 } 2378 }
2355 2379 ret:
2356 return ic; 2380 return ic;
2357} 2381}
2358 2382
diff --git a/libbb/read_key.c b/libbb/read_key.c
index 03b7da656..cf8ed411e 100644
--- a/libbb/read_key.c
+++ b/libbb/read_key.c
@@ -126,7 +126,10 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout)
126 * if fd can be in non-blocking mode. 126 * if fd can be in non-blocking mode.
127 */ 127 */
128 if (timeout >= -1) { 128 if (timeout >= -1) {
129 if (safe_poll(&pfd, 1, timeout) == 0) { 129 n = poll(&pfd, 1, timeout);
130 if (n < 0 && errno == EINTR)
131 return n;
132 if (n == 0) {
130 /* Timed out */ 133 /* Timed out */
131 errno = EAGAIN; 134 errno = EAGAIN;
132 return -1; 135 return -1;
@@ -138,7 +141,7 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout)
138 * When we were reading 3 bytes here, we were eating 141 * When we were reading 3 bytes here, we were eating
139 * "li" too, and cat was getting wrong input. 142 * "li" too, and cat was getting wrong input.
140 */ 143 */
141 n = safe_read(fd, buffer, 1); 144 n = read(fd, buffer, 1);
142 if (n <= 0) 145 if (n <= 0)
143 return -1; 146 return -1;
144 } 147 }
@@ -284,6 +287,16 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout)
284 goto start_over; 287 goto start_over;
285} 288}
286 289
290int64_t FAST_FUNC safe_read_key(int fd, char *buffer, int timeout)
291{
292 int64_t r;
293 do {
294 /* errno = 0; - read_key does this itself */
295 r = read_key(fd, buffer, timeout);
296 } while (errno == EINTR);
297 return r;
298}
299
287void FAST_FUNC read_key_ungets(char *buffer, const char *str, unsigned len) 300void FAST_FUNC read_key_ungets(char *buffer, const char *str, unsigned len)
288{ 301{
289 unsigned cur_len = (unsigned char)buffer[0]; 302 unsigned cur_len = (unsigned char)buffer[0];
diff --git a/libbb/setup_environment.c b/libbb/setup_environment.c
index df2983958..3549e2099 100644
--- a/libbb/setup_environment.c
+++ b/libbb/setup_environment.c
@@ -36,9 +36,8 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass
36 36
37 /* Change the current working directory to be the home directory 37 /* Change the current working directory to be the home directory
38 * of the user */ 38 * of the user */
39 if (!(flags & SETUP_ENV_NO_CHDIR)) { 39 if (flags & SETUP_ENV_CHDIR) {
40 if (chdir(pw->pw_dir) != 0) { 40 if (chdir_or_warn(pw->pw_dir) != 0) {
41 bb_error_msg("can't change directory to '%s'", pw->pw_dir);
42 xchdir((flags & SETUP_ENV_TO_TMP) ? "/tmp" : "/"); 41 xchdir((flags & SETUP_ENV_TO_TMP) ? "/tmp" : "/");
43 } 42 }
44 } 43 }
@@ -59,7 +58,8 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass
59 //xsetenv("LOGNAME", pw->pw_name); 58 //xsetenv("LOGNAME", pw->pw_name);
60 //xsetenv("HOME", pw->pw_dir); 59 //xsetenv("HOME", pw->pw_dir);
61 //xsetenv("SHELL", shell); 60 //xsetenv("SHELL", shell);
62 } else if (flags & SETUP_ENV_CHANGEENV) { 61 } else
62 if (flags & (SETUP_ENV_CHANGEENV|SETUP_ENV_CHANGEENV_LOGNAME)) {
63 /* Set HOME, SHELL, and if not becoming a super-user 63 /* Set HOME, SHELL, and if not becoming a super-user
64 * or if SETUP_ENV_CHANGEENV_LOGNAME, USER and LOGNAME. */ 64 * or if SETUP_ENV_CHANGEENV_LOGNAME, USER and LOGNAME. */
65 if ((flags & SETUP_ENV_CHANGEENV_LOGNAME) || pw->pw_uid != 0) { 65 if ((flags & SETUP_ENV_CHANGEENV_LOGNAME) || pw->pw_uid != 0) {
diff --git a/libbb/xfuncs_printf.c b/libbb/xfuncs_printf.c
index aae3b092d..a9add8ab2 100644
--- a/libbb/xfuncs_printf.c
+++ b/libbb/xfuncs_printf.c
@@ -417,11 +417,18 @@ void FAST_FUNC xseteuid(uid_t euid)
417 if (seteuid(euid)) bb_simple_perror_msg_and_die("seteuid"); 417 if (seteuid(euid)) bb_simple_perror_msg_and_die("seteuid");
418} 418}
419 419
420int FAST_FUNC chdir_or_warn(const char *path)
421{
422 int r = chdir(path);
423 if (r != 0)
424 bb_perror_msg("can't change directory to '%s'", path);
425 return r;
426}
420// Die if we can't chdir to a new path. 427// Die if we can't chdir to a new path.
421void FAST_FUNC xchdir(const char *path) 428void FAST_FUNC xchdir(const char *path)
422{ 429{
423 if (chdir(path)) 430 if (chdir_or_warn(path) != 0)
424 bb_perror_msg_and_die("can't change directory to '%s'", path); 431 xfunc_die();
425} 432}
426 433
427void FAST_FUNC xfchdir(int fd) 434void FAST_FUNC xfchdir(int fd)
diff --git a/loginutils/login.c b/loginutils/login.c
index cac4349b2..332238181 100644
--- a/loginutils/login.c
+++ b/loginutils/login.c
@@ -564,7 +564,9 @@ int login_main(int argc UNUSED_PARAM, char **argv)
564 564
565 change_identity(pw); 565 change_identity(pw);
566 setup_environment(pw->pw_shell, 566 setup_environment(pw->pw_shell,
567 (!(opt & LOGIN_OPT_p) * SETUP_ENV_CLEARENV) + SETUP_ENV_CHANGEENV, 567 (!(opt & LOGIN_OPT_p) * SETUP_ENV_CLEARENV)
568 + SETUP_ENV_CHANGEENV
569 + SETUP_ENV_CHDIR,
568 pw); 570 pw);
569 571
570#if ENABLE_PAM 572#if ENABLE_PAM
diff --git a/loginutils/su.c b/loginutils/su.c
index 647c97fb1..b61e3753a 100644
--- a/loginutils/su.c
+++ b/loginutils/su.c
@@ -177,10 +177,9 @@ int su_main(int argc UNUSED_PARAM, char **argv)
177 177
178 change_identity(pw); 178 change_identity(pw);
179 setup_environment(opt_shell, 179 setup_environment(opt_shell,
180 ((flags & SU_OPT_l) / SU_OPT_l * SETUP_ENV_CLEARENV) 180 ((flags & SU_OPT_l) ? (SETUP_ENV_CLEARENV + SETUP_ENV_CHDIR) : 0)
181 + (!(flags & SU_OPT_mp) * SETUP_ENV_CHANGEENV) 181 + (!(flags & SU_OPT_mp) * SETUP_ENV_CHANGEENV),
182 + (!(flags & SU_OPT_l) * SETUP_ENV_NO_CHDIR), 182 pw);
183 pw);
184 IF_SELINUX(set_current_security_context(NULL);) 183 IF_SELINUX(set_current_security_context(NULL);)
185 184
186 if (opt_command) { 185 if (opt_command) {
diff --git a/loginutils/sulogin.c b/loginutils/sulogin.c
index c9817960c..681022acb 100644
--- a/loginutils/sulogin.c
+++ b/loginutils/sulogin.c
@@ -94,10 +94,13 @@ int sulogin_main(int argc UNUSED_PARAM, char **argv)
94 shell = pwd->pw_shell; 94 shell = pwd->pw_shell;
95 95
96 /* util-linux 2.36.1 compat: cd to root's HOME, set a few envvars */ 96 /* util-linux 2.36.1 compat: cd to root's HOME, set a few envvars */
97 setup_environment(shell, SETUP_ENV_CHANGEENV | SETUP_ENV_CHANGEENV_LOGNAME, pwd); 97 setup_environment(shell, 0
98 + SETUP_ENV_CHANGEENV_LOGNAME
99 + SETUP_ENV_CHDIR
100 , pwd);
98 // no SETUP_ENV_CLEARENV 101 // no SETUP_ENV_CLEARENV
99 // SETUP_ENV_CHANGEENV[+LOGNAME] - set HOME, SHELL, USER,and LOGNAME 102 // SETUP_ENV_CHANGEENV_LOGNAME - set HOME, SHELL, USER,and LOGNAME
100 // no SETUP_ENV_NO_CHDIR - IOW: cd to $HOME 103 // SETUP_ENV_CHDIR - cd to $HOME
101 104
102 /* util-linux 2.36.1 compat: steal ctty if we don't have it yet 105 /* util-linux 2.36.1 compat: steal ctty if we don't have it yet
103 * (yes, util-linux uses force=1) */ 106 * (yes, util-linux uses force=1) */
diff --git a/miscutils/bc.c b/miscutils/bc.c
index e3f7573c9..fe555d018 100644
--- a/miscutils/bc.c
+++ b/miscutils/bc.c
@@ -6011,7 +6011,7 @@ static BC_STATUS zxc_program_assign(char inst)
6011#endif 6011#endif
6012 6012
6013 if (ib || sc || left->t == XC_RESULT_OBASE) { 6013 if (ib || sc || left->t == XC_RESULT_OBASE) {
6014 static const char *const msg[] = { 6014 static const char *const msg[] ALIGN_PTR = {
6015 "bad ibase; must be [2,16]", //XC_RESULT_IBASE 6015 "bad ibase; must be [2,16]", //XC_RESULT_IBASE
6016 "bad obase; must be [2,"BC_MAX_OBASE_STR"]", //XC_RESULT_OBASE 6016 "bad obase; must be [2,"BC_MAX_OBASE_STR"]", //XC_RESULT_OBASE
6017 "bad scale; must be [0,"BC_MAX_SCALE_STR"]", //XC_RESULT_SCALE 6017 "bad scale; must be [0,"BC_MAX_SCALE_STR"]", //XC_RESULT_SCALE
diff --git a/miscutils/crond.c b/miscutils/crond.c
index b74427351..1965af656 100644
--- a/miscutils/crond.c
+++ b/miscutils/crond.c
@@ -675,8 +675,7 @@ static void change_user(struct passwd *pas)
675{ 675{
676 /* careful: we're after vfork! */ 676 /* careful: we're after vfork! */
677 change_identity(pas); /* - initgroups, setgid, setuid */ 677 change_identity(pas); /* - initgroups, setgid, setuid */
678 if (chdir(pas->pw_dir) < 0) { 678 if (chdir_or_warn(pas->pw_dir) != 0) {
679 bb_error_msg("can't change directory to '%s'", pas->pw_dir);
680 xchdir(CRON_DIR); 679 xchdir(CRON_DIR);
681 } 680 }
682} 681}
diff --git a/miscutils/crontab.c b/miscutils/crontab.c
index 411a18a50..1111f4d54 100644
--- a/miscutils/crontab.c
+++ b/miscutils/crontab.c
@@ -55,8 +55,8 @@ static void edit_file(const struct passwd *pas, const char *file)
55 /* initgroups, setgid, setuid */ 55 /* initgroups, setgid, setuid */
56 change_identity(pas); 56 change_identity(pas);
57 setup_environment(pas->pw_shell, 57 setup_environment(pas->pw_shell,
58 SETUP_ENV_CHANGEENV | SETUP_ENV_TO_TMP, 58 SETUP_ENV_CHANGEENV | SETUP_ENV_TO_TMP | SETUP_ENV_CHDIR,
59 pas); 59 pas);
60 ptr = getenv("VISUAL"); 60 ptr = getenv("VISUAL");
61 if (!ptr) { 61 if (!ptr) {
62 ptr = getenv("EDITOR"); 62 ptr = getenv("EDITOR");
diff --git a/miscutils/devfsd.c b/miscutils/devfsd.c
index 839d00fd0..fb9ebcf60 100644
--- a/miscutils/devfsd.c
+++ b/miscutils/devfsd.c
@@ -928,7 +928,7 @@ static void action_compat(const struct devfsd_notify_struct *info, unsigned int
928 unsigned int i; 928 unsigned int i;
929 char rewind_; 929 char rewind_;
930 /* 1 to 5 "scsi/" , 6 to 9 "ide/host" */ 930 /* 1 to 5 "scsi/" , 6 to 9 "ide/host" */
931 static const char *const fmt[] = { 931 static const char *const fmt[] ALIGN_PTR = {
932 NULL , 932 NULL ,
933 "sg/c%db%dt%du%d", /* scsi/generic */ 933 "sg/c%db%dt%du%d", /* scsi/generic */
934 "sd/c%db%dt%du%d", /* scsi/disc */ 934 "sd/c%db%dt%du%d", /* scsi/disc */
@@ -1468,7 +1468,7 @@ const char *get_old_name(const char *devname, unsigned int namelen,
1468 const char *pty1; 1468 const char *pty1;
1469 const char *pty2; 1469 const char *pty2;
1470 /* 1 to 5 "scsi/" , 6 to 9 "ide/host", 10 sbp/, 11 vcc/, 12 pty/ */ 1470 /* 1 to 5 "scsi/" , 6 to 9 "ide/host", 10 sbp/, 11 vcc/, 12 pty/ */
1471 static const char *const fmt[] = { 1471 static const char *const fmt[] ALIGN_PTR = {
1472 NULL , 1472 NULL ,
1473 "sg%u", /* scsi/generic */ 1473 "sg%u", /* scsi/generic */
1474 NULL, /* scsi/disc */ 1474 NULL, /* scsi/disc */
diff --git a/miscutils/hexedit.c b/miscutils/hexedit.c
index f8ff9b62b..15ad78377 100644
--- a/miscutils/hexedit.c
+++ b/miscutils/hexedit.c
@@ -292,7 +292,7 @@ int hexedit_main(int argc UNUSED_PARAM, char **argv)
292 fflush_all(); 292 fflush_all();
293 G.in_read_key = 1; 293 G.in_read_key = 1;
294 if (!bb_got_signal) 294 if (!bb_got_signal)
295 key = read_key(STDIN_FILENO, G.read_key_buffer, -1); 295 key = safe_read_key(STDIN_FILENO, G.read_key_buffer, -1);
296 G.in_read_key = 0; 296 G.in_read_key = 0;
297 if (bb_got_signal) 297 if (bb_got_signal)
298 key = CTRL('X'); 298 key = CTRL('X');
diff --git a/miscutils/i2c_tools.c b/miscutils/i2c_tools.c
index e3741eeba..da26f5e19 100644
--- a/miscutils/i2c_tools.c
+++ b/miscutils/i2c_tools.c
@@ -120,6 +120,7 @@ static int32_t i2c_smbus_access(int fd, char read_write, uint8_t cmd,
120 return ioctl(fd, I2C_SMBUS, &args); 120 return ioctl(fd, I2C_SMBUS, &args);
121} 121}
122 122
123#if ENABLE_I2CGET || ENABLE_I2CSET || ENABLE_I2CDUMP || ENABLE_I2CDETECT
123static int32_t i2c_smbus_read_byte(int fd) 124static int32_t i2c_smbus_read_byte(int fd)
124{ 125{
125 union i2c_smbus_data data; 126 union i2c_smbus_data data;
@@ -131,6 +132,7 @@ static int32_t i2c_smbus_read_byte(int fd)
131 132
132 return data.byte; 133 return data.byte;
133} 134}
135#endif
134 136
135#if ENABLE_I2CGET || ENABLE_I2CSET || ENABLE_I2CDUMP 137#if ENABLE_I2CGET || ENABLE_I2CSET || ENABLE_I2CDUMP
136static int32_t i2c_smbus_write_byte(int fd, uint8_t val) 138static int32_t i2c_smbus_write_byte(int fd, uint8_t val)
diff --git a/miscutils/less.c b/miscutils/less.c
index 6da991a0e..842031ca3 100644
--- a/miscutils/less.c
+++ b/miscutils/less.c
@@ -1177,9 +1177,9 @@ static int64_t getch_nowait(void)
1177#endif 1177#endif
1178 } 1178 }
1179 1179
1180 /* We have kbd_fd in O_NONBLOCK mode, read inside read_key() 1180 /* We have kbd_fd in O_NONBLOCK mode, read inside safe_read_key()
1181 * would not block even if there is no input available */ 1181 * would not block even if there is no input available */
1182 key64 = read_key(kbd_fd, kbd_input, /*timeout off:*/ -2); 1182 key64 = safe_read_key(kbd_fd, kbd_input, /*timeout off:*/ -2);
1183 if ((int)key64 == -1) { 1183 if ((int)key64 == -1) {
1184 if (errno == EAGAIN) { 1184 if (errno == EAGAIN) {
1185 /* No keyboard input available. Since poll() did return, 1185 /* No keyboard input available. Since poll() did return,
diff --git a/miscutils/man.c b/miscutils/man.c
index be3b2a000..c3efe4484 100644
--- a/miscutils/man.c
+++ b/miscutils/man.c
@@ -328,7 +328,7 @@ int man_main(int argc UNUSED_PARAM, char **argv)
328 } 328 }
329#else 329#else
330 if (!man_path_list) { 330 if (!man_path_list) {
331 static const char *const mpl[] = { "/usr/man", "/usr/share/man", NULL }; 331 static const char *const mpl[] ALIGN_PTR = { "/usr/man", "/usr/share/man", NULL };
332 man_path_list = (char**)mpl; 332 man_path_list = (char**)mpl;
333 /*count_mp = 2; - not used below anyway */ 333 /*count_mp = 2; - not used below anyway */
334 } 334 }
diff --git a/modutils/modutils-24.c b/modutils/modutils-24.c
index ac8632481..d0bc2a6ef 100644
--- a/modutils/modutils-24.c
+++ b/modutils/modutils-24.c
@@ -3458,7 +3458,7 @@ static int obj_load_progbits(char *image, size_t image_size, struct obj_file *f,
3458 3458
3459static void hide_special_symbols(struct obj_file *f) 3459static void hide_special_symbols(struct obj_file *f)
3460{ 3460{
3461 static const char *const specials[] = { 3461 static const char *const specials[] ALIGN_PTR = {
3462 SPFX "cleanup_module", 3462 SPFX "cleanup_module",
3463 SPFX "init_module", 3463 SPFX "init_module",
3464 SPFX "kernel_version", 3464 SPFX "kernel_version",
@@ -3484,7 +3484,7 @@ static int obj_gpl_license(struct obj_file *f, const char **license)
3484 * linux/include/linux/module.h. Checking for leading "GPL" will not 3484 * linux/include/linux/module.h. Checking for leading "GPL" will not
3485 * work, somebody will use "GPL sucks, this is proprietary". 3485 * work, somebody will use "GPL sucks, this is proprietary".
3486 */ 3486 */
3487 static const char *const gpl_licenses[] = { 3487 static const char *const gpl_licenses[] ALIGN_PTR = {
3488 "GPL", 3488 "GPL",
3489 "GPL v2", 3489 "GPL v2",
3490 "GPL and additional rights", 3490 "GPL and additional rights",
diff --git a/networking/httpd.c b/networking/httpd.c
index 5f7b3a4dd..59b4a769c 100644
--- a/networking/httpd.c
+++ b/networking/httpd.c
@@ -1707,8 +1707,7 @@ static void send_cgi_and_exit(
1707 script = last_slash; 1707 script = last_slash;
1708 if (script != url) { /* paranoia */ 1708 if (script != url) { /* paranoia */
1709 *script = '\0'; 1709 *script = '\0';
1710 if (chdir(url + 1) != 0) { 1710 if (chdir_or_warn(url + 1) != 0) {
1711 bb_perror_msg("can't change directory to '%s'", url + 1);
1712 goto error_execing_cgi; 1711 goto error_execing_cgi;
1713 } 1712 }
1714 // not needed: *script = '/'; 1713 // not needed: *script = '/';
diff --git a/networking/ifupdown.c b/networking/ifupdown.c
index 737113dd4..6c4ae27f2 100644
--- a/networking/ifupdown.c
+++ b/networking/ifupdown.c
@@ -532,7 +532,7 @@ static int FAST_FUNC v4tunnel_down(struct interface_defn_t * ifd, execfn * exec)
532} 532}
533# endif 533# endif
534 534
535static const struct method_t methods6[] = { 535static const struct method_t methods6[] ALIGN_PTR = {
536# if ENABLE_FEATURE_IFUPDOWN_IP 536# if ENABLE_FEATURE_IFUPDOWN_IP
537 { "v4tunnel" , v4tunnel_up , v4tunnel_down , }, 537 { "v4tunnel" , v4tunnel_up , v4tunnel_down , },
538# endif 538# endif
@@ -627,7 +627,7 @@ struct dhcp_client_t {
627 const char *stopcmd; 627 const char *stopcmd;
628}; 628};
629 629
630static const struct dhcp_client_t ext_dhcp_clients[] = { 630static const struct dhcp_client_t ext_dhcp_clients[] ALIGN_PTR = {
631 { "dhcpcd", 631 { "dhcpcd",
632 "dhcpcd[[ -h %hostname%]][[ -i %vendor%]][[ -I %client%]][[ -l %leasetime%]] %iface%", 632 "dhcpcd[[ -h %hostname%]][[ -i %vendor%]][[ -I %client%]][[ -l %leasetime%]] %iface%",
633 "dhcpcd -k %iface%", 633 "dhcpcd -k %iface%",
@@ -774,7 +774,7 @@ static int FAST_FUNC wvdial_down(struct interface_defn_t *ifd, execfn *exec)
774 "-p /var/run/wvdial.%iface% -s 2", ifd, exec); 774 "-p /var/run/wvdial.%iface% -s 2", ifd, exec);
775} 775}
776 776
777static const struct method_t methods[] = { 777static const struct method_t methods[] ALIGN_PTR = {
778 { "manual" , manual_up_down, manual_up_down, }, 778 { "manual" , manual_up_down, manual_up_down, },
779 { "wvdial" , wvdial_up , wvdial_down , }, 779 { "wvdial" , wvdial_up , wvdial_down , },
780 { "ppp" , ppp_up , ppp_down , }, 780 { "ppp" , ppp_up , ppp_down , },
@@ -797,7 +797,7 @@ static int FAST_FUNC link_up_down(struct interface_defn_t *ifd UNUSED_PARAM, exe
797 return 1; 797 return 1;
798} 798}
799 799
800static const struct method_t link_methods[] = { 800static const struct method_t link_methods[] ALIGN_PTR = {
801 { "none", link_up_down, link_up_down } 801 { "none", link_up_down, link_up_down }
802}; 802};
803 803
diff --git a/networking/inetd.c b/networking/inetd.c
index e71be51c3..fb2fbe323 100644
--- a/networking/inetd.c
+++ b/networking/inetd.c
@@ -1538,7 +1538,7 @@ int inetd_main(int argc UNUSED_PARAM, char **argv)
1538#if ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_ECHO \ 1538#if ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_ECHO \
1539 || ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_DISCARD 1539 || ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_DISCARD
1540# if !BB_MMU 1540# if !BB_MMU
1541static const char *const cat_args[] = { "cat", NULL }; 1541static const char *const cat_args[] ALIGN_PTR = { "cat", NULL };
1542# endif 1542# endif
1543#endif 1543#endif
1544 1544
diff --git a/networking/interface.c b/networking/interface.c
index ea6a2c8a8..6b6c0944a 100644
--- a/networking/interface.c
+++ b/networking/interface.c
@@ -446,13 +446,13 @@ static char *get_name(char name[IFNAMSIZ], char *p)
446 * %n specifiers (even the size of integers may not match). 446 * %n specifiers (even the size of integers may not match).
447 */ 447 */
448#if INT_MAX == LONG_MAX 448#if INT_MAX == LONG_MAX
449static const char *const ss_fmt[] = { 449static const char *const ss_fmt[] ALIGN_PTR = {
450 "%n%llu%u%u%u%u%n%n%n%llu%u%u%u%u%u", 450 "%n%llu%u%u%u%u%n%n%n%llu%u%u%u%u%u",
451 "%llu%llu%u%u%u%u%n%n%llu%llu%u%u%u%u%u", 451 "%llu%llu%u%u%u%u%n%n%llu%llu%u%u%u%u%u",
452 "%llu%llu%u%u%u%u%u%u%llu%llu%u%u%u%u%u%u" 452 "%llu%llu%u%u%u%u%u%u%llu%llu%u%u%u%u%u%u"
453}; 453};
454#else 454#else
455static const char *const ss_fmt[] = { 455static const char *const ss_fmt[] ALIGN_PTR = {
456 "%n%llu%lu%lu%lu%lu%n%n%n%llu%lu%lu%lu%lu%lu", 456 "%n%llu%lu%lu%lu%lu%n%n%n%llu%lu%lu%lu%lu%lu",
457 "%llu%llu%lu%lu%lu%lu%n%n%llu%llu%lu%lu%lu%lu%lu", 457 "%llu%llu%lu%lu%lu%lu%n%n%llu%llu%lu%lu%lu%lu%lu",
458 "%llu%llu%lu%lu%lu%lu%lu%lu%llu%llu%lu%lu%lu%lu%lu%lu" 458 "%llu%llu%lu%lu%lu%lu%lu%lu%llu%llu%lu%lu%lu%lu%lu%lu"
@@ -731,7 +731,7 @@ static const struct hwtype ib_hwtype = {
731#endif 731#endif
732 732
733 733
734static const struct hwtype *const hwtypes[] = { 734static const struct hwtype *const hwtypes[] ALIGN_PTR = {
735 &loop_hwtype, 735 &loop_hwtype,
736 &ether_hwtype, 736 &ether_hwtype,
737 &ppp_hwtype, 737 &ppp_hwtype,
diff --git a/networking/libiproute/ipaddress.c b/networking/libiproute/ipaddress.c
index 17a838411..ecc3848ff 100644
--- a/networking/libiproute/ipaddress.c
+++ b/networking/libiproute/ipaddress.c
@@ -58,7 +58,7 @@ typedef struct filter_t filter_t;
58 58
59static void print_link_flags(unsigned flags, unsigned mdown) 59static void print_link_flags(unsigned flags, unsigned mdown)
60{ 60{
61 static const int flag_masks[] = { 61 static const int flag_masks[] ALIGN_INT = {
62 IFF_LOOPBACK, IFF_BROADCAST, IFF_POINTOPOINT, 62 IFF_LOOPBACK, IFF_BROADCAST, IFF_POINTOPOINT,
63 IFF_MULTICAST, IFF_NOARP, IFF_UP, IFF_LOWER_UP }; 63 IFF_MULTICAST, IFF_NOARP, IFF_UP, IFF_LOWER_UP };
64 static const char flag_labels[] ALIGN1 = 64 static const char flag_labels[] ALIGN1 =
diff --git a/networking/udhcp/common.c b/networking/udhcp/common.c
index 8e9b93655..ae818db05 100644
--- a/networking/udhcp/common.c
+++ b/networking/udhcp/common.c
@@ -19,7 +19,7 @@ const uint8_t MAC_BCAST_ADDR[6] ALIGN2 = {
19 * See RFC2132 for more options. 19 * See RFC2132 for more options.
20 * OPTION_REQ: these options are requested by udhcpc (unless -o). 20 * OPTION_REQ: these options are requested by udhcpc (unless -o).
21 */ 21 */
22const struct dhcp_optflag dhcp_optflags[] = { 22const struct dhcp_optflag dhcp_optflags[] ALIGN2 = {
23 /* flags code */ 23 /* flags code */
24 { OPTION_IP | OPTION_REQ, 0x01 }, /* DHCP_SUBNET */ 24 { OPTION_IP | OPTION_REQ, 0x01 }, /* DHCP_SUBNET */
25 { OPTION_S32 , 0x02 }, /* DHCP_TIME_OFFSET */ 25 { OPTION_S32 , 0x02 }, /* DHCP_TIME_OFFSET */
diff --git a/networking/udhcp/d6_dhcpc.c b/networking/udhcp/d6_dhcpc.c
index 9d2a8f5d3..9fc690315 100644
--- a/networking/udhcp/d6_dhcpc.c
+++ b/networking/udhcp/d6_dhcpc.c
@@ -65,7 +65,7 @@
65 65
66/* "struct client_data_t client_data" is in bb_common_bufsiz1 */ 66/* "struct client_data_t client_data" is in bb_common_bufsiz1 */
67 67
68static const struct dhcp_optflag d6_optflags[] = { 68static const struct dhcp_optflag d6_optflags[] ALIGN2 = {
69#if ENABLE_FEATURE_UDHCPC6_RFC3646 69#if ENABLE_FEATURE_UDHCPC6_RFC3646
70 { OPTION_6RD | OPTION_LIST | OPTION_REQ, D6_OPT_DNS_SERVERS }, 70 { OPTION_6RD | OPTION_LIST | OPTION_REQ, D6_OPT_DNS_SERVERS },
71 { OPTION_DNS_STRING | OPTION_LIST | OPTION_REQ, D6_OPT_DOMAIN_LIST }, 71 { OPTION_DNS_STRING | OPTION_LIST | OPTION_REQ, D6_OPT_DOMAIN_LIST },
diff --git a/procps/nmeter.c b/procps/nmeter.c
index 2310e9844..088d366bf 100644
--- a/procps/nmeter.c
+++ b/procps/nmeter.c
@@ -70,7 +70,7 @@ typedef struct proc_file {
70 smallint last_gen; 70 smallint last_gen;
71} proc_file; 71} proc_file;
72 72
73static const char *const proc_name[] = { 73static const char *const proc_name[] ALIGN_PTR = {
74 "stat", // Must match the order of proc_file's! 74 "stat", // Must match the order of proc_file's!
75 "loadavg", 75 "loadavg",
76 "net/dev", 76 "net/dev",
diff --git a/procps/top.c b/procps/top.c
index 4cd545c69..804d6f258 100644
--- a/procps/top.c
+++ b/procps/top.c
@@ -913,7 +913,7 @@ static unsigned handle_input(unsigned scan_mask, duration_t interval)
913 while (1) { 913 while (1) {
914 int32_t c; 914 int32_t c;
915 915
916 c = read_key(STDIN_FILENO, G.kbd_input, interval * 1000); 916 c = safe_read_key(STDIN_FILENO, G.kbd_input, interval * 1000);
917 if (c == -1 && errno != EAGAIN) { 917 if (c == -1 && errno != EAGAIN) {
918 /* error/EOF */ 918 /* error/EOF */
919 option_mask32 |= OPT_EOF; 919 option_mask32 |= OPT_EOF;
diff --git a/selinux/setenforce.c b/selinux/setenforce.c
index 996034f8e..2267be451 100644
--- a/selinux/setenforce.c
+++ b/selinux/setenforce.c
@@ -26,7 +26,7 @@
26/* These strings are arranged so that odd ones 26/* These strings are arranged so that odd ones
27 * result in security_setenforce(1) being done, 27 * result in security_setenforce(1) being done,
28 * the rest will do security_setenforce(0) */ 28 * the rest will do security_setenforce(0) */
29static const char *const setenforce_cmd[] = { 29static const char *const setenforce_cmd[] ALIGN_PTR = {
30 "0", 30 "0",
31 "1", 31 "1",
32 "permissive", 32 "permissive",
diff --git a/shell/ash.c b/shell/ash.c
index a1d01447a..46c4f1675 100644
--- a/shell/ash.c
+++ b/shell/ash.c
@@ -428,7 +428,7 @@ static void forkshell_print(FILE *fp0, struct forkshell *fs, const char **notes)
428/* ============ Shell options */ 428/* ============ Shell options */
429 429
430/* If you add/change options hare, update --help text too */ 430/* If you add/change options hare, update --help text too */
431static const char *const optletters_optnames[] = { 431static const char *const optletters_optnames[] ALIGN_PTR = {
432 "e" "errexit", 432 "e" "errexit",
433 "f" "noglob", 433 "f" "noglob",
434/* bash has '-o ignoreeof', but no short synonym -I for it */ 434/* bash has '-o ignoreeof', but no short synonym -I for it */
@@ -845,7 +845,7 @@ raise_exception(int e)
845/* 845/*
846 * Called when a SIGINT is received. (If the user specifies 846 * Called when a SIGINT is received. (If the user specifies
847 * that SIGINT is to be trapped or ignored using the trap builtin, then 847 * that SIGINT is to be trapped or ignored using the trap builtin, then
848 * this routine is not called.) Suppressint is nonzero when interrupts 848 * this routine is not called.) suppress_int is nonzero when interrupts
849 * are held using the INT_OFF macro. (The test for iflag is just 849 * are held using the INT_OFF macro. (The test for iflag is just
850 * defensive programming.) 850 * defensive programming.)
851 */ 851 */
@@ -882,13 +882,12 @@ raise_interrupt(void)
882} while (0) 882} while (0)
883#endif 883#endif
884 884
885static IF_ASH_OPTIMIZE_FOR_SIZE(inline) void 885static IF_NOT_ASH_OPTIMIZE_FOR_SIZE(inline) void
886int_on(void) 886int_on(void)
887{ 887{
888 barrier(); 888 barrier();
889 if (--suppress_int == 0 && pending_int) { 889 if (--suppress_int == 0 && pending_int)
890 raise_interrupt(); 890 raise_interrupt();
891 }
892} 891}
893#if DEBUG_INTONOFF 892#if DEBUG_INTONOFF
894# define INT_ON do { \ 893# define INT_ON do { \
@@ -898,7 +897,7 @@ int_on(void)
898#else 897#else
899# define INT_ON int_on() 898# define INT_ON int_on()
900#endif 899#endif
901static IF_ASH_OPTIMIZE_FOR_SIZE(inline) void 900static IF_NOT_ASH_OPTIMIZE_FOR_SIZE(inline) void
902force_int_on(void) 901force_int_on(void)
903{ 902{
904 barrier(); 903 barrier();
@@ -4143,7 +4142,9 @@ signal_handler(int signo)
4143 if (!trap[SIGCHLD]) 4142 if (!trap[SIGCHLD])
4144 return; 4143 return;
4145 } 4144 }
4146 4145#if ENABLE_FEATURE_EDITING
4146 bb_got_signal = signo; /* for read_line_input: "we got a signal" */
4147#endif
4147 gotsig[signo - 1] = 1; 4148 gotsig[signo - 1] = 1;
4148 pending_sig = signo; 4149 pending_sig = signo;
4149 4150
@@ -11656,33 +11657,56 @@ preadfd(void)
11656# endif 11657# endif
11657 reinit_unicode_for_ash(); 11658 reinit_unicode_for_ash();
11658 again: 11659 again:
11660 /* For shell, LI_INTERRUPTIBLE is set:
11661 * read_line_input will abort on either
11662 * getting EINTR in poll(), or if it sees bb_got_signal != 0
11663 * (IOW: if signal arrives before poll() is reached).
11664 * Interactive testcases:
11665 * (while kill -INT $$; do sleep 1; done) &
11666 * #^^^ prints ^C, prints prompt, repeats
11667 * trap 'echo I' int; (while kill -INT $$; do sleep 1; done) &
11668 * #^^^ prints ^C, prints "I", prints prompt, repeats
11669 * trap 'echo T' term; (while kill $$; do sleep 1; done) &
11670 * #^^^ prints "T", prints prompt, repeats
11671 * #(bash 5.0.17 exits after first "T", looks like a bug)
11672 */
11673 bb_got_signal = 0;
11674 INT_OFF; /* no longjmp'ing out of read_line_input please */
11659 nr = read_line_input(line_input_state, cmdedit_prompt, buf, IBUFSIZ); 11675 nr = read_line_input(line_input_state, cmdedit_prompt, buf, IBUFSIZ);
11676 if (bb_got_signal == SIGINT)
11677 write(STDOUT_FILENO, "^C\n", 3);
11678 INT_ON; /* here non-blocked SIGINT will longjmp */
11660 if (nr == 0) { 11679 if (nr == 0) {
11661 /* ^C pressed, "convert" to SIGINT */ 11680 /* ^C pressed, "convert" to SIGINT */
11662# if !ENABLE_PLATFORM_MINGW32 11681# if !ENABLE_PLATFORM_MINGW32
11663 write(STDOUT_FILENO, "^C", 2); 11682 write(STDOUT_FILENO, "^C\n", 3);
11664 raise(SIGINT); 11683 raise(SIGINT); /* here non-blocked SIGINT will longjmp */
11665 /* raise(SIGINT) did not work! (e.g. if SIGINT 11684 /* raise(SIGINT) did not work! (e.g. if SIGINT
11666 * is SIG_INGed on startup, it stays SIG_IGNed) 11685 * is SIG_IGNed on startup, it stays SIG_IGNed)
11667 */ 11686 */
11668# else 11687# else
11669 raise_interrupt(); 11688 raise_interrupt();
11670# endif 11689# endif
11671 if (trap[SIGINT]) { 11690 if (trap[SIGINT]) {
11691 empty_line_input:
11672 buf[0] = '\n'; 11692 buf[0] = '\n';
11673 buf[1] = '\0'; 11693 buf[1] = '\0';
11674 return 1; 11694 return 1;
11675 } 11695 }
11676 exitstatus = 128 + SIGINT; 11696 exitstatus = 128 + SIGINT;
11677 /* bash behavior on ^C + ignored SIGINT: */ 11697 /* bash behavior on ^C + ignored SIGINT: */
11678 write(STDOUT_FILENO, "\n", 1);
11679 goto again; 11698 goto again;
11680 } 11699 }
11681 if (nr < 0) { 11700 if (nr < 0) {
11682 if (errno == 0) { 11701 if (errno == 0) {
11683 /* Ctrl+D pressed */ 11702 /* ^D pressed */
11684 nr = 0; 11703 nr = 0;
11685 } 11704 }
11705 else if (errno == EINTR) { /* got signal? */
11706 if (bb_got_signal != SIGINT)
11707 write(STDOUT_FILENO, "\n", 1);
11708 goto empty_line_input;
11709 }
11686# if ENABLE_ASH_IDLE_TIMEOUT 11710# if ENABLE_ASH_IDLE_TIMEOUT
11687 else if (errno == EAGAIN && timeout > 0) { 11711 else if (errno == EAGAIN && timeout > 0) {
11688 puts("\007timed out waiting for input: auto-logout"); 11712 puts("\007timed out waiting for input: auto-logout");
diff --git a/shell/hush.c b/shell/hush.c
index 982fc356a..ae81f0da5 100644
--- a/shell/hush.c
+++ b/shell/hush.c
@@ -564,7 +564,7 @@ enum {
564#define NULL_O_STRING { NULL } 564#define NULL_O_STRING { NULL }
565 565
566#ifndef debug_printf_parse 566#ifndef debug_printf_parse
567static const char *const assignment_flag[] = { 567static const char *const assignment_flag[] ALIGN_PTR = {
568 "MAYBE_ASSIGNMENT", 568 "MAYBE_ASSIGNMENT",
569 "DEFINITELY_ASSIGNMENT", 569 "DEFINITELY_ASSIGNMENT",
570 "NOT_ASSIGNMENT", 570 "NOT_ASSIGNMENT",
@@ -918,6 +918,7 @@ struct globals {
918#if ENABLE_HUSH_INTERACTIVE 918#if ENABLE_HUSH_INTERACTIVE
919 smallint promptmode; /* 0: PS1, 1: PS2 */ 919 smallint promptmode; /* 0: PS1, 1: PS2 */
920#endif 920#endif
921 /* set by signal handler if SIGINT is received _and_ its trap is not set */
921 smallint flag_SIGINT; 922 smallint flag_SIGINT;
922#if ENABLE_HUSH_LOOPS 923#if ENABLE_HUSH_LOOPS
923 smallint flag_break_continue; 924 smallint flag_break_continue;
@@ -1944,6 +1945,9 @@ enum {
1944static void record_pending_signo(int sig) 1945static void record_pending_signo(int sig)
1945{ 1946{
1946 sigaddset(&G.pending_set, sig); 1947 sigaddset(&G.pending_set, sig);
1948#if ENABLE_FEATURE_EDITING
1949 bb_got_signal = sig; /* for read_line_input: "we got a signal" */
1950#endif
1947#if ENABLE_HUSH_FAST 1951#if ENABLE_HUSH_FAST
1948 if (sig == SIGCHLD) { 1952 if (sig == SIGCHLD) {
1949 G.count_SIGCHLD++; 1953 G.count_SIGCHLD++;
@@ -2652,30 +2656,53 @@ static int get_user_input(struct in_str *i)
2652 for (;;) { 2656 for (;;) {
2653 reinit_unicode_for_hush(); 2657 reinit_unicode_for_hush();
2654 G.flag_SIGINT = 0; 2658 G.flag_SIGINT = 0;
2655 /* buglet: SIGINT will not make new prompt to appear _at once_, 2659
2656 * only after <Enter>. (^C works immediately) */ 2660 bb_got_signal = 0;
2657 r = read_line_input(G.line_input_state, prompt_str, 2661 if (!sigisemptyset(&G.pending_set)) {
2662 /* Whoops, already got a signal, do not call read_line_input */
2663 bb_got_signal = r = -1;
2664 } else {
2665 /* For shell, LI_INTERRUPTIBLE is set:
2666 * read_line_input will abort on either
2667 * getting EINTR in poll(), or if it sees bb_got_signal != 0
2668 * (IOW: if signal arrives before poll() is reached).
2669 * Interactive testcases:
2670 * (while kill -INT $$; do sleep 1; done) &
2671 * #^^^ prints ^C, prints prompt, repeats
2672 * trap 'echo I' int; (while kill -INT $$; do sleep 1; done) &
2673 * #^^^ prints ^C, prints "I", prints prompt, repeats
2674 * trap 'echo T' term; (while kill $$; do sleep 1; done) &
2675 * #^^^ prints "T", prints prompt, repeats
2676 * #(bash 5.0.17 exits after first "T", looks like a bug)
2677 */
2678 r = read_line_input(G.line_input_state, prompt_str,
2658 G.user_input_buf, CONFIG_FEATURE_EDITING_MAX_LEN-1 2679 G.user_input_buf, CONFIG_FEATURE_EDITING_MAX_LEN-1
2659 ); 2680 );
2660 /* read_line_input intercepts ^C, "convert" it to SIGINT */ 2681 /* read_line_input intercepts ^C, "convert" it to SIGINT */
2661 if (r == 0) { 2682 if (r == 0)
2662 raise(SIGINT); 2683 raise(SIGINT);
2684 }
2685 /* bash prints ^C (before running a trap, if any)
2686 * both on keyboard ^C and on real SIGINT (non-kbd generated).
2687 */
2688 if (sigismember(&G.pending_set, SIGINT)) {
2689 write(STDOUT_FILENO, "^C\n", 3);
2690 G.last_exitcode = 128 | SIGINT;
2663 } 2691 }
2664 check_and_run_traps(); 2692 check_and_run_traps();
2665 if (r != 0 && !G.flag_SIGINT) 2693 if (r == 0) /* keyboard ^C? */
2694 continue; /* go back, read another input line */
2695 if (r > 0) /* normal input? (no ^C, no ^D, no signals) */
2666 break; 2696 break;
2667 /* ^C or SIGINT: repeat */ 2697 if (!bb_got_signal) {
2668 /* bash prints ^C even on real SIGINT (non-kbd generated) */ 2698 /* r < 0: ^D/EOF/error detected (but not signal) */
2669 write(STDOUT_FILENO, "^C\n", 3); 2699 /* ^D on interactive input goes to next line before exiting: */
2670 G.last_exitcode = 128 | SIGINT; 2700 write(STDOUT_FILENO, "\n", 1);
2671 } 2701 i->p = NULL;
2672 if (r < 0) { 2702 i->peek_buf[0] = r = EOF;
2673 /* EOF/error detected */ 2703 return r;
2674 /* ^D on interactive input goes to next line before exiting: */ 2704 }
2675 write(STDOUT_FILENO, "\n", 1); 2705 /* it was a signal: go back, read another input line */
2676 i->p = NULL;
2677 i->peek_buf[0] = r = EOF;
2678 return r;
2679 } 2706 }
2680 i->p = G.user_input_buf; 2707 i->p = G.user_input_buf;
2681 return (unsigned char)*i->p++; 2708 return (unsigned char)*i->p++;
@@ -3655,7 +3682,7 @@ static void free_pipe_list(struct pipe *pi)
3655#ifndef debug_print_tree 3682#ifndef debug_print_tree
3656static void debug_print_tree(struct pipe *pi, int lvl) 3683static void debug_print_tree(struct pipe *pi, int lvl)
3657{ 3684{
3658 static const char *const PIPE[] = { 3685 static const char *const PIPE[] ALIGN_PTR = {
3659 [PIPE_SEQ] = "SEQ", 3686 [PIPE_SEQ] = "SEQ",
3660 [PIPE_AND] = "AND", 3687 [PIPE_AND] = "AND",
3661 [PIPE_OR ] = "OR" , 3688 [PIPE_OR ] = "OR" ,
@@ -3690,7 +3717,7 @@ static void debug_print_tree(struct pipe *pi, int lvl)
3690 [RES_XXXX ] = "XXXX" , 3717 [RES_XXXX ] = "XXXX" ,
3691 [RES_SNTX ] = "SNTX" , 3718 [RES_SNTX ] = "SNTX" ,
3692 }; 3719 };
3693 static const char *const CMDTYPE[] = { 3720 static const char *const CMDTYPE[] ALIGN_PTR = {
3694 "{}", 3721 "{}",
3695 "()", 3722 "()",
3696 "[noglob]", 3723 "[noglob]",
@@ -7632,7 +7659,7 @@ static int generate_stream_from_string(const char *s, pid_t *pid_p)
7632 if (is_prefixed_with(s, "trap") 7659 if (is_prefixed_with(s, "trap")
7633 && skip_whitespace(s + 4)[0] == '\0' 7660 && skip_whitespace(s + 4)[0] == '\0'
7634 ) { 7661 ) {
7635 static const char *const argv[] = { NULL, NULL }; 7662 static const char *const argv[] ALIGN_PTR = { NULL, NULL };
7636 builtin_trap((char**)argv); 7663 builtin_trap((char**)argv);
7637 fflush_all(); /* important */ 7664 fflush_all(); /* important */
7638 _exit(0); 7665 _exit(0);
@@ -9799,7 +9826,7 @@ static int run_list(struct pipe *pi)
9799 static const char encoded_dollar_at[] ALIGN1 = { 9826 static const char encoded_dollar_at[] ALIGN1 = {
9800 SPECIAL_VAR_SYMBOL, '@' | 0x80, SPECIAL_VAR_SYMBOL, '\0' 9827 SPECIAL_VAR_SYMBOL, '@' | 0x80, SPECIAL_VAR_SYMBOL, '\0'
9801 }; /* encoded representation of "$@" */ 9828 }; /* encoded representation of "$@" */
9802 static const char *const encoded_dollar_at_argv[] = { 9829 static const char *const encoded_dollar_at_argv[] ALIGN_PTR = {
9803 encoded_dollar_at, NULL 9830 encoded_dollar_at, NULL
9804 }; /* argv list with one element: "$@" */ 9831 }; /* argv list with one element: "$@" */
9805 char **vals; 9832 char **vals;
@@ -10361,7 +10388,7 @@ int hush_main(int argc, char **argv)
10361//it ignores TERM: 10388//it ignores TERM:
10362// bash -i -c 'kill $$; echo ALIVE' 10389// bash -i -c 'kill $$; echo ALIVE'
10363// ALIVE 10390// ALIVE
10364//it resets SIG_INGed HUP to SIG_DFL: 10391//it resets SIG_IGNed HUP to SIG_DFL:
10365// trap '' hup; bash -i -c 'kill -hup $$; echo ALIVE' 10392// trap '' hup; bash -i -c 'kill -hup $$; echo ALIVE'
10366// Hangup [the message is not printed by bash, it's the shell which started it] 10393// Hangup [the message is not printed by bash, it's the shell which started it]
10367//is talkative about jobs and exiting: 10394//is talkative about jobs and exiting:
diff --git a/shell/shell_common.c b/shell/shell_common.c
index fff356c04..399d5e684 100644
--- a/shell/shell_common.c
+++ b/shell/shell_common.c
@@ -218,6 +218,7 @@ shell_builtin_read(struct builtin_read_params *params)
218 */ 218 */
219 errno = 0; 219 errno = 0;
220 pfd[0].events = POLLIN; 220 pfd[0].events = POLLIN;
221//TODO race with a signal arriving just before the poll!
221 if (poll(pfd, 1, timeout) <= 0) { 222 if (poll(pfd, 1, timeout) <= 0) {
222 /* timed out, or EINTR */ 223 /* timed out, or EINTR */
223 err = errno; 224 err = errno;
diff --git a/testsuite/sed.tests b/testsuite/sed.tests
index e62b839f7..626542e33 100755
--- a/testsuite/sed.tests
+++ b/testsuite/sed.tests
@@ -324,6 +324,21 @@ testing "sed zero chars match/replace logic must not falsely trigger here 2" \
324 "sed 's/ *$/_/g'" \ 324 "sed 's/ *$/_/g'" \
325 "qwerty_\n" "" "qwerty\n" 325 "qwerty_\n" "" "qwerty\n"
326 326
327# the pattern here is interpreted as "9+", not as "9\+"
328testing "sed special char as s/// delimiter, in pattern" \
329 "sed 's+9\++X+'" \
330 "X8=17\n" "" "9+8=17\n"
331
332# Matching GNU sed 4.8:
333# in replacement string, "\&" remains "\&", not interpreted as "&"
334testing "sed special char as s/// delimiter, in replacement 1" \
335 "sed 's&9&X\&&'" \
336 "X&+8=17\n" "" "9+8=17\n"
337# in replacement string, "\1" is interpreted as "1"
338testing "sed special char as s/// delimiter, in replacement 2" \
339 "sed 's1\(9\)1X\11'" \
340 "X1+8=17\n" "" "9+8=17\n"
341
327testing "sed /\$_in_regex/ should not match newlines, only end-of-line" \ 342testing "sed /\$_in_regex/ should not match newlines, only end-of-line" \
328 "sed ': testcont; /\\\\$/{ =; N; b testcont }'" \ 343 "sed ': testcont; /\\\\$/{ =; N; b testcont }'" \
329 "\ 344 "\
diff --git a/util-linux/hexdump.c b/util-linux/hexdump.c
index 57e7e8db7..307a84803 100644
--- a/util-linux/hexdump.c
+++ b/util-linux/hexdump.c
@@ -71,7 +71,7 @@ static void bb_dump_addfile(dumper_t *dumper, char *name)
71 fclose(fp); 71 fclose(fp);
72} 72}
73 73
74static const char *const add_strings[] = { 74static const char *const add_strings[] ALIGN_PTR = {
75 "\"%07.7_ax \"16/1 \"%03o \"\"\n\"", /* b */ 75 "\"%07.7_ax \"16/1 \"%03o \"\"\n\"", /* b */
76 "\"%07.7_ax \"16/1 \"%3_c \"\"\n\"", /* c */ 76 "\"%07.7_ax \"16/1 \"%3_c \"\"\n\"", /* c */
77 "\"%07.7_ax \"8/2 \" %05u \"\"\n\"", /* d */ 77 "\"%07.7_ax \"8/2 \" %05u \"\"\n\"", /* d */
diff --git a/util-linux/mkfs_vfat.c b/util-linux/mkfs_vfat.c
index 844d965f8..821371953 100644
--- a/util-linux/mkfs_vfat.c
+++ b/util-linux/mkfs_vfat.c
@@ -218,8 +218,11 @@ static const char boot_code[] ALIGN1 =
218int mkfs_vfat_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; 218int mkfs_vfat_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
219int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) 219int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv)
220{ 220{
221 static const char NO_NAME_11[] = "NO NAME    ";
222
221 struct stat st; 223 struct stat st;
222 const char *volume_label = ""; 224 const char *arg_volume_label = NO_NAME_11; //default
225 char volume_label11[12];
223 char *buf; 226 char *buf;
224 char *device_name; 227 char *device_name;
225 uoff_t volume_size_bytes; 228 uoff_t volume_size_bytes;
@@ -257,14 +260,17 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv)
257 opts = getopt32(argv, "^" 260 opts = getopt32(argv, "^"
258 "Ab:cCf:F:h:Ii:l:m:n:r:R:s:S:v" 261 "Ab:cCf:F:h:Ii:l:m:n:r:R:s:S:v"
259 "\0" "-1", //:b+:f+:F+:h+:r+:R+:s+:S+:vv:c--l:l--c 262 "\0" "-1", //:b+:f+:F+:h+:r+:R+:s+:S+:vv:c--l:l--c
260 NULL, NULL, NULL, NULL, NULL, 263 /*b*/NULL, /*f*/NULL, /*F*/NULL, /*h*/NULL, /*i*/NULL,
261 NULL, NULL, &volume_label, NULL, NULL, NULL, NULL); 264 /*l*/NULL, /*m*/NULL, /*n*/&arg_volume_label,
265 /*r*/NULL, /*R*/NULL, /*s*/NULL, /*S*/NULL);
262 argv += optind; 266 argv += optind;
263 267
264 // cache device name 268 // cache device name
265 device_name = argv[0]; 269 device_name = argv[0];
266 // default volume ID = creation time 270 // default volume ID = creation time
267 volume_id = time(NULL); 271 volume_id = time(NULL);
272 // truncate to exactly 11 chars, pad with spaces
273 sprintf(volume_label11, "%-11.11s", arg_volume_label);
268 274
269 dev = xopen(device_name, O_RDWR); 275 dev = xopen(device_name, O_RDWR);
270 xfstat(dev, &st, device_name); 276 xfstat(dev, &st, device_name);
@@ -459,7 +465,7 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv)
459 (int)media_byte, 465 (int)media_byte,
460 volume_size_sect, (int)total_clust, (int)sect_per_clust, 466 volume_size_sect, (int)total_clust, (int)sect_per_clust,
461 sect_per_fat, 467 sect_per_fat,
462 (int)volume_id, volume_label 468 (int)volume_id, volume_label11
463 ); 469 );
464 } 470 }
465 471
@@ -508,7 +514,7 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv)
508 STORE_LE(boot_blk->vi.ext_boot_sign, 0x29); 514 STORE_LE(boot_blk->vi.ext_boot_sign, 0x29);
509 STORE_LE(boot_blk->vi.volume_id32, volume_id); 515 STORE_LE(boot_blk->vi.volume_id32, volume_id);
510 memcpy(boot_blk->vi.fs_type, "FAT32 ", sizeof(boot_blk->vi.fs_type)); 516 memcpy(boot_blk->vi.fs_type, "FAT32 ", sizeof(boot_blk->vi.fs_type));
511 strncpy(boot_blk->vi.volume_label, volume_label, sizeof(boot_blk->vi.volume_label)); 517 memcpy(boot_blk->vi.volume_label, volume_label11, 11);
512 memcpy(boot_blk->boot_code, boot_code, sizeof(boot_code)); 518 memcpy(boot_blk->boot_code, boot_code, sizeof(boot_code));
513 STORE_LE(boot_blk->boot_sign, BOOT_SIGN); 519 STORE_LE(boot_blk->boot_sign, BOOT_SIGN);
514 520
@@ -545,15 +551,18 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv)
545 // root directory 551 // root directory
546 // empty directory is just a set of zero bytes 552 // empty directory is just a set of zero bytes
547 memset(buf, 0, sect_per_clust * bytes_per_sect); 553 memset(buf, 0, sect_per_clust * bytes_per_sect);
548 if (volume_label[0]) { 554 // not "NO NAME", "NO NAME " etc?
549 // create dir entry for volume_label 555 // (mkfs.fat 4.1 won't create dir entry even with explicit -n 'NO NAME',
556 // but will create one with e.g. -n '', -n ' zZz')
557 if (strcmp(volume_label11, NO_NAME_11) != 0) {
558 // create dir entry for volume label
550 struct msdos_dir_entry *de; 559 struct msdos_dir_entry *de;
551#if 0 560#if 0
552 struct tm tm_time; 561 struct tm tm_time;
553 uint16_t t, d; 562 uint16_t t, d;
554#endif 563#endif
555 de = (void*)buf; 564 de = (void*)buf;
556 strncpy(de->name, volume_label, sizeof(de->name)); 565 memcpy(de->name, volume_label11, 11);
557 STORE_LE(de->attr, ATTR_VOLUME); 566 STORE_LE(de->attr, ATTR_VOLUME);
558#if 0 567#if 0
559 localtime_r(&create_time, &tm_time); 568 localtime_r(&create_time, &tm_time);
diff --git a/util-linux/nsenter.c b/util-linux/nsenter.c
index e6339da2f..1aa045b35 100644
--- a/util-linux/nsenter.c
+++ b/util-linux/nsenter.c
@@ -93,7 +93,7 @@ enum {
93 * The user namespace comes first, so that it is entered first. 93 * The user namespace comes first, so that it is entered first.
94 * This gives an unprivileged user the potential to enter other namespaces. 94 * This gives an unprivileged user the potential to enter other namespaces.
95 */ 95 */
96static const struct namespace_descr ns_list[] = { 96static const struct namespace_descr ns_list[] ALIGN_INT = {
97 { CLONE_NEWUSER, "ns/user", }, 97 { CLONE_NEWUSER, "ns/user", },
98 { CLONE_NEWIPC, "ns/ipc", }, 98 { CLONE_NEWIPC, "ns/ipc", },
99 { CLONE_NEWUTS, "ns/uts", }, 99 { CLONE_NEWUTS, "ns/uts", },
diff --git a/util-linux/unshare.c b/util-linux/unshare.c
index 68ccdd874..06b938074 100644
--- a/util-linux/unshare.c
+++ b/util-linux/unshare.c
@@ -120,7 +120,7 @@ enum {
120 NS_USR_POS, /* OPT_user, NS_USR_POS, and ns_list[] index must match! */ 120 NS_USR_POS, /* OPT_user, NS_USR_POS, and ns_list[] index must match! */
121 NS_COUNT, 121 NS_COUNT,
122}; 122};
123static const struct namespace_descr ns_list[] = { 123static const struct namespace_descr ns_list[] ALIGN_INT = {
124 { CLONE_NEWNS, "mnt" }, 124 { CLONE_NEWNS, "mnt" },
125 { CLONE_NEWUTS, "uts" }, 125 { CLONE_NEWUTS, "uts" },
126 { CLONE_NEWIPC, "ipc" }, 126 { CLONE_NEWIPC, "ipc" },