diff options
author | Ron Yorston <rmy@pobox.com> | 2022-02-09 09:03:18 +0000 |
---|---|---|
committer | Ron Yorston <rmy@pobox.com> | 2022-02-09 09:05:39 +0000 |
commit | 492d0a7492a57fe8f02c766e25960b0ce0d88759 (patch) | |
tree | 4f5764a5c2250c031ea05e9aeacbb40d7971f493 | |
parent | 4734416a21312488a5099a297907783bee4ccc22 (diff) | |
parent | caa9c4f707b661cf398f2c2d66f54f5b0d8adfe2 (diff) | |
download | busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.tar.gz busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.tar.bz2 busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.zip |
Merge busybox into merge
Fix conflicts in reset and ash.
Redefine the new safe_read_key() as a reference to read_key().
Disable SHA256_HWACCEL.
63 files changed, 1969 insertions, 836 deletions
diff --git a/archival/libarchive/decompress_bunzip2.c b/archival/libarchive/decompress_bunzip2.c index 42e2b4f88..4a2b668aa 100644 --- a/archival/libarchive/decompress_bunzip2.c +++ b/archival/libarchive/decompress_bunzip2.c | |||
@@ -654,7 +654,7 @@ static int read_bunzip(bunzip_data *bd, char *outbuf, int len) | |||
654 | /* Subtract the 1 copy we'd output anyway to get extras */ | 654 | /* Subtract the 1 copy we'd output anyway to get extras */ |
655 | --bd->writeCopies; | 655 | --bd->writeCopies; |
656 | } | 656 | } |
657 | } /* for(;;) */ | 657 | } /* for (;;) */ |
658 | 658 | ||
659 | /* Decompression of this input block completed successfully */ | 659 | /* Decompression of this input block completed successfully */ |
660 | bd->writeCRC = CRC = ~CRC; | 660 | bd->writeCRC = CRC = ~CRC; |
diff --git a/archival/libarchive/get_header_tar.c b/archival/libarchive/get_header_tar.c index d26868bf8..cc6f3f0ad 100644 --- a/archival/libarchive/get_header_tar.c +++ b/archival/libarchive/get_header_tar.c | |||
@@ -147,11 +147,13 @@ static void process_pax_hdr(archive_handle_t *archive_handle, unsigned sz, int g | |||
147 | #endif | 147 | #endif |
148 | } | 148 | } |
149 | 149 | ||
150 | #if ENABLE_FEATURE_TAR_GNU_EXTENSIONS | ||
150 | static void die_if_bad_fnamesize(off_t sz) | 151 | static void die_if_bad_fnamesize(off_t sz) |
151 | { | 152 | { |
152 | if ((uoff_t)sz > 0xfff) /* more than 4k?! no funny business please */ | 153 | if ((uoff_t)sz > 0xfff) /* more than 4k?! no funny business please */ |
153 | bb_simple_error_msg_and_die("bad archive"); | 154 | bb_simple_error_msg_and_die("bad archive"); |
154 | } | 155 | } |
156 | #endif | ||
155 | 157 | ||
156 | char FAST_FUNC get_header_tar(archive_handle_t *archive_handle) | 158 | char FAST_FUNC get_header_tar(archive_handle_t *archive_handle) |
157 | { | 159 | { |
diff --git a/busybox_ldscript.README.txt b/busybox_ldscript.README.txt new file mode 100644 index 000000000..1625a970a --- /dev/null +++ b/busybox_ldscript.README.txt | |||
@@ -0,0 +1,47 @@ | |||
1 | /* Add SORT_BY_ALIGNMENT to linker script (found in busybox_unstripped.out): | ||
2 | ## .rodata : { *(.rodata SORT_BY_ALIGNMENT(.rodata.*) .gnu.linkonce.r.*) } | ||
3 | ## .data : { *(.data SORT_BY_ALIGNMENT(.data.*) .gnu.linkonce.d.*) } | ||
4 | ## .bss : { *(.bss SORT_BY_ALIGNMENT(.bss.*) .gnu.linkonce.b.*) } | ||
5 | ## This will eliminate most of the padding (~3kb). | ||
6 | ## Hmm, "ld --sort-section alignment" should do it too. | ||
7 | ## | ||
8 | ## There is a ld hack which is meant to decrease disk usage | ||
9 | ## at the cost of more RAM usage (??!!) in standard ld script: | ||
10 | ## . = ALIGN (0x1000) - ((0x1000 - .) & (0x1000 - 1)); . = DATA_SEGMENT_ALIGN (0x1000, 0x1000); | ||
11 | ## Replace it with: | ||
12 | ## . = ALIGN (0x1000); . = DATA_SEGMENT_ALIGN (0x1000, 0x1000); | ||
13 | ## to unconditionally align .data to the next page boundary, | ||
14 | ## instead of "next page, plus current offset in this page" | ||
15 | */ | ||
16 | |||
17 | /* To reduce the number of VMAs each bbox process has, | ||
18 | ## move *(.bss SORT_BY_ALIGNMENT(.bss.*) ...) | ||
19 | ## part from .bss : {...} block to .data : { ... } block. | ||
20 | ## (This usually increases .data section by only one page). | ||
21 | ## Result: | ||
22 | ## | ||
23 | ## text data bss dec hex filename | ||
24 | ## 1050792 560 7580 1058932 102874 busybox.bss | ||
25 | ## 1050792 8149 0 1058941 10287d busybox.nobss | ||
26 | ## | ||
27 | ## $ exec busybox.bss pmap $$ | ||
28 | ## 0000000008048000 1028K r-xp /path/to/busybox.bss | ||
29 | ## 0000000008149000 8K rw-p /path/to/busybox.bss | ||
30 | ## 000000000814b000 4K rw-p [ anon ] <---- this VMA is eliminated | ||
31 | ## 00000000085f5000 4K ---p [heap] | ||
32 | ## 00000000085f6000 4K rw-p [heap] | ||
33 | ## 00000000f7778000 8K rw-p [ anon ] | ||
34 | ## 00000000f777a000 12K r--p [vvar] | ||
35 | ## 00000000f777d000 8K r-xp [vdso] | ||
36 | ## 00000000ff7e9000 132K rw-p [stack] | ||
37 | ## | ||
38 | ## $ exec busybox.nobss pmap $$ | ||
39 | ## 0000000008048000 1028K r-xp /path/to/busybox.nobss | ||
40 | ## 0000000008149000 12K rw-p /path/to/busybox.nobss | ||
41 | ## 00000000086f0000 4K ---p [heap] | ||
42 | ## 00000000086f1000 4K rw-p [heap] | ||
43 | ## 00000000f7783000 8K rw-p [ anon ] | ||
44 | ## 00000000f7785000 12K r--p [vvar] | ||
45 | ## 00000000f7788000 8K r-xp [vdso] | ||
46 | ## 00000000ffac0000 132K rw-p [stack] | ||
47 | */ | ||
diff --git a/configs/mingw32_defconfig b/configs/mingw32_defconfig index 408b13eb8..98288bfb2 100644 --- a/configs/mingw32_defconfig +++ b/configs/mingw32_defconfig | |||
@@ -114,6 +114,7 @@ CONFIG_PASSWORD_MINLEN=6 | |||
114 | CONFIG_MD5_SMALL=1 | 114 | CONFIG_MD5_SMALL=1 |
115 | CONFIG_SHA1_SMALL=3 | 115 | CONFIG_SHA1_SMALL=3 |
116 | # CONFIG_SHA1_HWACCEL is not set | 116 | # CONFIG_SHA1_HWACCEL is not set |
117 | # CONFIG_SHA256_HWACCEL is not set | ||
117 | CONFIG_SHA3_SMALL=1 | 118 | CONFIG_SHA3_SMALL=1 |
118 | # CONFIG_FEATURE_FAST_TOP is not set | 119 | # CONFIG_FEATURE_FAST_TOP is not set |
119 | # CONFIG_FEATURE_ETC_NETWORKS is not set | 120 | # CONFIG_FEATURE_ETC_NETWORKS is not set |
diff --git a/configs/mingw64_defconfig b/configs/mingw64_defconfig index 05596ab8e..1ce3831a9 100644 --- a/configs/mingw64_defconfig +++ b/configs/mingw64_defconfig | |||
@@ -114,6 +114,7 @@ CONFIG_PASSWORD_MINLEN=6 | |||
114 | CONFIG_MD5_SMALL=1 | 114 | CONFIG_MD5_SMALL=1 |
115 | CONFIG_SHA1_SMALL=3 | 115 | CONFIG_SHA1_SMALL=3 |
116 | # CONFIG_SHA1_HWACCEL is not set | 116 | # CONFIG_SHA1_HWACCEL is not set |
117 | # CONFIG_SHA256_HWACCEL is not set | ||
117 | CONFIG_SHA3_SMALL=1 | 118 | CONFIG_SHA3_SMALL=1 |
118 | # CONFIG_FEATURE_FAST_TOP is not set | 119 | # CONFIG_FEATURE_FAST_TOP is not set |
119 | # CONFIG_FEATURE_ETC_NETWORKS is not set | 120 | # CONFIG_FEATURE_ETC_NETWORKS is not set |
diff --git a/console-tools/reset.c b/console-tools/reset.c index e0d228d50..151bc47d1 100644 --- a/console-tools/reset.c +++ b/console-tools/reset.c | |||
@@ -40,7 +40,7 @@ int reset_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; | |||
40 | int reset_main(int argc UNUSED_PARAM, char **argv UNUSED_PARAM) | 40 | int reset_main(int argc UNUSED_PARAM, char **argv UNUSED_PARAM) |
41 | { | 41 | { |
42 | #if !ENABLE_PLATFORM_MINGW32 | 42 | #if !ENABLE_PLATFORM_MINGW32 |
43 | static const char *const args[] = { | 43 | static const char *const args[] ALIGN_PTR = { |
44 | "stty", "sane", NULL | 44 | "stty", "sane", NULL |
45 | }; | 45 | }; |
46 | 46 | ||
diff --git a/coreutils/head.c b/coreutils/head.c index 9586f869f..c7537a20e 100644 --- a/coreutils/head.c +++ b/coreutils/head.c | |||
@@ -76,7 +76,7 @@ print_except_N_last_bytes(FILE *fp, unsigned count) | |||
76 | { | 76 | { |
77 | unsigned char *circle = xmalloc(++count); | 77 | unsigned char *circle = xmalloc(++count); |
78 | unsigned head = 0; | 78 | unsigned head = 0; |
79 | for(;;) { | 79 | for (;;) { |
80 | int c; | 80 | int c; |
81 | c = getc(fp); | 81 | c = getc(fp); |
82 | if (c == EOF) | 82 | if (c == EOF) |
@@ -105,7 +105,7 @@ print_except_N_last_lines(FILE *fp, unsigned count) | |||
105 | { | 105 | { |
106 | char **circle = xzalloc((++count) * sizeof(circle[0])); | 106 | char **circle = xzalloc((++count) * sizeof(circle[0])); |
107 | unsigned head = 0; | 107 | unsigned head = 0; |
108 | for(;;) { | 108 | for (;;) { |
109 | char *c; | 109 | char *c; |
110 | c = xmalloc_fgets(fp); | 110 | c = xmalloc_fgets(fp); |
111 | if (!c) | 111 | if (!c) |
@@ -127,7 +127,7 @@ print_except_N_last_lines(FILE *fp, unsigned count) | |||
127 | } | 127 | } |
128 | ret: | 128 | ret: |
129 | head = 0; | 129 | head = 0; |
130 | for(;;) { | 130 | for (;;) { |
131 | free(circle[head++]); | 131 | free(circle[head++]); |
132 | if (head == count) | 132 | if (head == count) |
133 | break; | 133 | break; |
diff --git a/coreutils/od.c b/coreutils/od.c index 9a888dd5f..6f22331e0 100644 --- a/coreutils/od.c +++ b/coreutils/od.c | |||
@@ -144,7 +144,7 @@ odoffset(dumper_t *dumper, int argc, char ***argvp) | |||
144 | } | 144 | } |
145 | } | 145 | } |
146 | 146 | ||
147 | static const char *const add_strings[] = { | 147 | static const char *const add_strings[] ALIGN_PTR = { |
148 | "16/1 \"%3_u \" \"\\n\"", /* a */ | 148 | "16/1 \"%3_u \" \"\\n\"", /* a */ |
149 | "8/2 \" %06o \" \"\\n\"", /* B, o */ | 149 | "8/2 \" %06o \" \"\\n\"", /* B, o */ |
150 | "16/1 \"%03o \" \"\\n\"", /* b */ | 150 | "16/1 \"%03o \" \"\\n\"", /* b */ |
diff --git a/coreutils/test.c b/coreutils/test.c index a914c7490..840a0daaf 100644 --- a/coreutils/test.c +++ b/coreutils/test.c | |||
@@ -242,7 +242,7 @@ int depth; | |||
242 | depth--; \ | 242 | depth--; \ |
243 | return __res; \ | 243 | return __res; \ |
244 | } while (0) | 244 | } while (0) |
245 | static const char *const TOKSTR[] = { | 245 | static const char *const TOKSTR[] ALIGN_PTR = { |
246 | "EOI", | 246 | "EOI", |
247 | "FILRD", | 247 | "FILRD", |
248 | "FILWR", | 248 | "FILWR", |
diff --git a/e2fsprogs/fsck.c b/e2fsprogs/fsck.c index 96c1e51e0..028f8a803 100644 --- a/e2fsprogs/fsck.c +++ b/e2fsprogs/fsck.c | |||
@@ -190,7 +190,7 @@ struct globals { | |||
190 | * Required for the uber-silly devfs /dev/ide/host1/bus2/target3/lun3 | 190 | * Required for the uber-silly devfs /dev/ide/host1/bus2/target3/lun3 |
191 | * pathames. | 191 | * pathames. |
192 | */ | 192 | */ |
193 | static const char *const devfs_hier[] = { | 193 | static const char *const devfs_hier[] ALIGN_PTR = { |
194 | "host", "bus", "target", "lun", NULL | 194 | "host", "bus", "target", "lun", NULL |
195 | }; | 195 | }; |
196 | #endif | 196 | #endif |
diff --git a/editors/cmp.c b/editors/cmp.c index 6d2b0c6c3..b89e519ad 100644 --- a/editors/cmp.c +++ b/editors/cmp.c | |||
@@ -54,6 +54,7 @@ int cmp_main(int argc UNUSED_PARAM, char **argv) | |||
54 | int retval = 0; | 54 | int retval = 0; |
55 | int max_count = -1; | 55 | int max_count = -1; |
56 | 56 | ||
57 | #if !ENABLE_LONG_OPTS | ||
57 | opt = getopt32(argv, "^" | 58 | opt = getopt32(argv, "^" |
58 | OPT_STR | 59 | OPT_STR |
59 | "\0" "-1" | 60 | "\0" "-1" |
@@ -62,6 +63,23 @@ int cmp_main(int argc UNUSED_PARAM, char **argv) | |||
62 | ":l--s:s--l", | 63 | ":l--s:s--l", |
63 | &max_count | 64 | &max_count |
64 | ); | 65 | ); |
66 | #else | ||
67 | static const char cmp_longopts[] ALIGN1 = | ||
68 | "bytes\0" Required_argument "n" | ||
69 | "quiet\0" No_argument "s" | ||
70 | "silent\0" No_argument "s" | ||
71 | "verbose\0" No_argument "l" | ||
72 | ; | ||
73 | opt = getopt32long(argv, "^" | ||
74 | OPT_STR | ||
75 | "\0" "-1" | ||
76 | IF_DESKTOP(":?4") | ||
77 | IF_NOT_DESKTOP(":?2") | ||
78 | ":l--s:s--l", | ||
79 | cmp_longopts, | ||
80 | &max_count | ||
81 | ); | ||
82 | #endif | ||
65 | argv += optind; | 83 | argv += optind; |
66 | 84 | ||
67 | filename1 = *argv; | 85 | filename1 = *argv; |
diff --git a/editors/patch.c b/editors/patch.c index 110176630..aebb5073e 100644 --- a/editors/patch.c +++ b/editors/patch.c | |||
@@ -418,7 +418,7 @@ int patch_main(int argc UNUSED_PARAM, char **argv) | |||
418 | } | 418 | } |
419 | 419 | ||
420 | // Loop through the lines in the patch | 420 | // Loop through the lines in the patch |
421 | for(;;) { | 421 | for (;;) { |
422 | char *patchline; | 422 | char *patchline; |
423 | 423 | ||
424 | patchline = xmalloc_fgetline(stdin); | 424 | patchline = xmalloc_fgetline(stdin); |
diff --git a/editors/patch_toybox.c b/editors/patch_toybox.c index aebab8132..69a508b2e 100644 --- a/editors/patch_toybox.c +++ b/editors/patch_toybox.c | |||
@@ -441,7 +441,7 @@ int patch_main(int argc UNUSED_PARAM, char **argv) | |||
441 | TT.filein = TT.fileout = -1; | 441 | TT.filein = TT.fileout = -1; |
442 | 442 | ||
443 | // Loop through the lines in the patch | 443 | // Loop through the lines in the patch |
444 | for(;;) { | 444 | for (;;) { |
445 | char *patchline; | 445 | char *patchline; |
446 | 446 | ||
447 | patchline = get_line(TT.filepatch); | 447 | patchline = get_line(TT.filepatch); |
diff --git a/editors/sed.c b/editors/sed.c index 374830f3f..f4a5f7b8a 100644 --- a/editors/sed.c +++ b/editors/sed.c | |||
@@ -252,7 +252,6 @@ static void cleanup_outname(void) | |||
252 | } | 252 | } |
253 | 253 | ||
254 | /* strcpy, replacing "\from" with 'to'. If to is NUL, replacing "\any" with 'any' */ | 254 | /* strcpy, replacing "\from" with 'to'. If to is NUL, replacing "\any" with 'any' */ |
255 | |||
256 | static unsigned parse_escapes(char *dest, const char *string, int len, char from, char to) | 255 | static unsigned parse_escapes(char *dest, const char *string, int len, char from, char to) |
257 | { | 256 | { |
258 | char *d = dest; | 257 | char *d = dest; |
@@ -282,7 +281,7 @@ static unsigned parse_escapes(char *dest, const char *string, int len, char from | |||
282 | return d - dest; | 281 | return d - dest; |
283 | } | 282 | } |
284 | 283 | ||
285 | static char *copy_parsing_escapes(const char *string, int len) | 284 | static char *copy_parsing_escapes(const char *string, int len, char delim) |
286 | { | 285 | { |
287 | const char *s; | 286 | const char *s; |
288 | char *dest = xmalloc(len + 1); | 287 | char *dest = xmalloc(len + 1); |
@@ -293,10 +292,15 @@ static char *copy_parsing_escapes(const char *string, int len) | |||
293 | len = parse_escapes(dest, string, len, s[1], s[0]); | 292 | len = parse_escapes(dest, string, len, s[1], s[0]); |
294 | string = dest; | 293 | string = dest; |
295 | } | 294 | } |
295 | if (delim) { | ||
296 | /* we additionally unescape any instances of escaped delimiter. | ||
297 | * For example, in 's+9\++X+' the pattern is "9+", not "9\+". | ||
298 | */ | ||
299 | len = parse_escapes(dest, string, len, delim, delim); | ||
300 | } | ||
296 | return dest; | 301 | return dest; |
297 | } | 302 | } |
298 | 303 | ||
299 | |||
300 | /* | 304 | /* |
301 | * index_of_next_unescaped_regexp_delim - walks left to right through a string | 305 | * index_of_next_unescaped_regexp_delim - walks left to right through a string |
302 | * beginning at a specified index and returns the index of the next regular | 306 | * beginning at a specified index and returns the index of the next regular |
@@ -353,12 +357,14 @@ static int parse_regex_delim(const char *cmdstr, char **match, char **replace) | |||
353 | 357 | ||
354 | /* save the match string */ | 358 | /* save the match string */ |
355 | idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr); | 359 | idx = index_of_next_unescaped_regexp_delim(delimiter, cmdstr_ptr); |
356 | *match = copy_parsing_escapes(cmdstr_ptr, idx); | 360 | *match = copy_parsing_escapes(cmdstr_ptr, idx, delimiter); |
357 | |||
358 | /* save the replacement string */ | 361 | /* save the replacement string */ |
359 | cmdstr_ptr += idx + 1; | 362 | cmdstr_ptr += idx + 1; |
360 | idx = index_of_next_unescaped_regexp_delim(- (int)delimiter, cmdstr_ptr); | 363 | idx = index_of_next_unescaped_regexp_delim(- (int)delimiter, cmdstr_ptr); |
361 | *replace = copy_parsing_escapes(cmdstr_ptr, idx); | 364 | //GNU sed 4.8: |
365 | // echo 789 | sed 's&8&\&&' - 7&9 ("\&" remained "\&") | ||
366 | // echo 789 | sed 's1\(8\)1\1\11' - 7119 ("\1\1" become "11") | ||
367 | *replace = copy_parsing_escapes(cmdstr_ptr, idx, delimiter != '&' ? delimiter : 0); | ||
362 | 368 | ||
363 | return ((cmdstr_ptr - cmdstr) + idx); | 369 | return ((cmdstr_ptr - cmdstr) + idx); |
364 | } | 370 | } |
@@ -386,7 +392,7 @@ static int get_address(const char *my_str, int *linenum, regex_t ** regex) | |||
386 | delimiter = *++pos; | 392 | delimiter = *++pos; |
387 | next = index_of_next_unescaped_regexp_delim(delimiter, ++pos); | 393 | next = index_of_next_unescaped_regexp_delim(delimiter, ++pos); |
388 | if (next != 0) { | 394 | if (next != 0) { |
389 | temp = copy_parsing_escapes(pos, next); | 395 | temp = copy_parsing_escapes(pos, next, 0); |
390 | G.previous_regex_ptr = *regex = xzalloc(sizeof(regex_t)); | 396 | G.previous_regex_ptr = *regex = xzalloc(sizeof(regex_t)); |
391 | xregcomp(*regex, temp, G.regex_type); | 397 | xregcomp(*regex, temp, G.regex_type); |
392 | free(temp); | 398 | free(temp); |
@@ -581,7 +587,7 @@ static const char *parse_cmd_args(sed_cmd_t *sed_cmd, const char *cmdstr) | |||
581 | cmdstr++; | 587 | cmdstr++; |
582 | } | 588 | } |
583 | len = strlen(cmdstr); | 589 | len = strlen(cmdstr); |
584 | sed_cmd->string = copy_parsing_escapes(cmdstr, len); | 590 | sed_cmd->string = copy_parsing_escapes(cmdstr, len, 0); |
585 | cmdstr += len; | 591 | cmdstr += len; |
586 | /* "\anychar" -> "anychar" */ | 592 | /* "\anychar" -> "anychar" */ |
587 | parse_escapes(sed_cmd->string, sed_cmd->string, -1, '\0', '\0'); | 593 | parse_escapes(sed_cmd->string, sed_cmd->string, -1, '\0', '\0'); |
diff --git a/editors/vi.c b/editors/vi.c index b973cc056..b30369302 100644 --- a/editors/vi.c +++ b/editors/vi.c | |||
@@ -1182,7 +1182,7 @@ static int readit(void) // read (maybe cursor) key from stdin | |||
1182 | // on nonblocking stdin. | 1182 | // on nonblocking stdin. |
1183 | // Note: read_key sets errno to 0 on success. | 1183 | // Note: read_key sets errno to 0 on success. |
1184 | again: | 1184 | again: |
1185 | c = read_key(STDIN_FILENO, readbuffer, /*timeout:*/ -1); | 1185 | c = safe_read_key(STDIN_FILENO, readbuffer, /*timeout:*/ -1); |
1186 | if (c == -1) { // EOF/error | 1186 | if (c == -1) { // EOF/error |
1187 | if (errno == EAGAIN) // paranoia | 1187 | if (errno == EAGAIN) // paranoia |
1188 | goto again; | 1188 | goto again; |
@@ -4930,7 +4930,7 @@ static void edit_file(char *fn) | |||
4930 | uint64_t k; | 4930 | uint64_t k; |
4931 | write1(ESC"[999;999H" ESC"[6n"); | 4931 | write1(ESC"[999;999H" ESC"[6n"); |
4932 | fflush_all(); | 4932 | fflush_all(); |
4933 | k = read_key(STDIN_FILENO, readbuffer, /*timeout_ms:*/ 100); | 4933 | k = safe_read_key(STDIN_FILENO, readbuffer, /*timeout_ms:*/ 100); |
4934 | if ((int32_t)k == KEYCODE_CURSOR_POS) { | 4934 | if ((int32_t)k == KEYCODE_CURSOR_POS) { |
4935 | uint32_t rc = (k >> 32); | 4935 | uint32_t rc = (k >> 32); |
4936 | columns = (rc & 0x7fff); | 4936 | columns = (rc & 0x7fff); |
diff --git a/include/libbb.h b/include/libbb.h index e540f2a90..740c25528 100644 --- a/include/libbb.h +++ b/include/libbb.h | |||
@@ -691,6 +691,7 @@ void xsetgid(gid_t gid) FAST_FUNC; | |||
691 | void xsetuid(uid_t uid) FAST_FUNC; | 691 | void xsetuid(uid_t uid) FAST_FUNC; |
692 | void xsetegid(gid_t egid) FAST_FUNC; | 692 | void xsetegid(gid_t egid) FAST_FUNC; |
693 | void xseteuid(uid_t euid) FAST_FUNC; | 693 | void xseteuid(uid_t euid) FAST_FUNC; |
694 | int chdir_or_warn(const char *path) FAST_FUNC; | ||
694 | void xchdir(const char *path) FAST_FUNC; | 695 | void xchdir(const char *path) FAST_FUNC; |
695 | void xfchdir(int fd) FAST_FUNC; | 696 | void xfchdir(int fd) FAST_FUNC; |
696 | void xchroot(const char *path) FAST_FUNC; | 697 | void xchroot(const char *path) FAST_FUNC; |
@@ -1776,7 +1777,7 @@ extern void selinux_or_die(void) FAST_FUNC; | |||
1776 | 1777 | ||
1777 | 1778 | ||
1778 | /* setup_environment: | 1779 | /* setup_environment: |
1779 | * if !SETUP_ENV_NO_CHDIR: | 1780 | * if SETUP_ENV_CHDIR: |
1780 | * if cd(pw->pw_dir): ok: else if SETUP_ENV_TO_TMP: cd(/tmp) else: cd(/) or die | 1781 | * if cd(pw->pw_dir): ok: else if SETUP_ENV_TO_TMP: cd(/tmp) else: cd(/) or die |
1781 | * if SETUP_ENV_CLEARENV: cd(pw->pw_dir), clear environment, then set | 1782 | * if SETUP_ENV_CLEARENV: cd(pw->pw_dir), clear environment, then set |
1782 | * TERM=(old value) | 1783 | * TERM=(old value) |
@@ -1784,7 +1785,7 @@ extern void selinux_or_die(void) FAST_FUNC; | |||
1784 | * PATH=bb_default_[root_]path | 1785 | * PATH=bb_default_[root_]path |
1785 | * HOME=pw->pw_dir | 1786 | * HOME=pw->pw_dir |
1786 | * SHELL=shell | 1787 | * SHELL=shell |
1787 | * else if SETUP_ENV_CHANGEENV: | 1788 | * else if SETUP_ENV_CHANGEENV | SETUP_ENV_CHANGEENV_LOGNAME: |
1788 | * if not root (if pw->pw_uid != 0) or if SETUP_ENV_CHANGEENV_LOGNAME: | 1789 | * if not root (if pw->pw_uid != 0) or if SETUP_ENV_CHANGEENV_LOGNAME: |
1789 | * USER=pw->pw_name, LOGNAME=pw->pw_name | 1790 | * USER=pw->pw_name, LOGNAME=pw->pw_name |
1790 | * HOME=pw->pw_dir | 1791 | * HOME=pw->pw_dir |
@@ -1798,7 +1799,7 @@ extern void selinux_or_die(void) FAST_FUNC; | |||
1798 | #define SETUP_ENV_CHANGEENV_LOGNAME (1 << 1) | 1799 | #define SETUP_ENV_CHANGEENV_LOGNAME (1 << 1) |
1799 | #define SETUP_ENV_CLEARENV (1 << 2) | 1800 | #define SETUP_ENV_CLEARENV (1 << 2) |
1800 | #define SETUP_ENV_TO_TMP (1 << 3) | 1801 | #define SETUP_ENV_TO_TMP (1 << 3) |
1801 | #define SETUP_ENV_NO_CHDIR (1 << 4) | 1802 | #define SETUP_ENV_CHDIR (1 << 4) |
1802 | void setup_environment(const char *shell, int flags, const struct passwd *pw) FAST_FUNC; | 1803 | void setup_environment(const char *shell, int flags, const struct passwd *pw) FAST_FUNC; |
1803 | void nuke_str(char *str) FAST_FUNC; | 1804 | void nuke_str(char *str) FAST_FUNC; |
1804 | #if ENABLE_FEATURE_SECURETTY && !ENABLE_PAM | 1805 | #if ENABLE_FEATURE_SECURETTY && !ENABLE_PAM |
@@ -1955,6 +1956,8 @@ enum { | |||
1955 | * (unless fd is in non-blocking mode), | 1956 | * (unless fd is in non-blocking mode), |
1956 | * subsequent reads will time out after a few milliseconds. | 1957 | * subsequent reads will time out after a few milliseconds. |
1957 | * Return of -1 means EOF or error (errno == 0 on EOF). | 1958 | * Return of -1 means EOF or error (errno == 0 on EOF). |
1959 | * Nonzero errno is not preserved across the call: | ||
1960 | * if there was no error, errno will be cleared to 0. | ||
1958 | * buffer[0] is used as a counter of buffered chars and must be 0 | 1961 | * buffer[0] is used as a counter of buffered chars and must be 0 |
1959 | * on first call. | 1962 | * on first call. |
1960 | * timeout: | 1963 | * timeout: |
@@ -1963,6 +1966,12 @@ enum { | |||
1963 | * >=0: poll() for TIMEOUT milliseconds, return -1/EAGAIN on timeout | 1966 | * >=0: poll() for TIMEOUT milliseconds, return -1/EAGAIN on timeout |
1964 | */ | 1967 | */ |
1965 | int64_t read_key(int fd, char *buffer, int timeout) FAST_FUNC; | 1968 | int64_t read_key(int fd, char *buffer, int timeout) FAST_FUNC; |
1969 | #if ENABLE_PLATFORM_MINGW32 | ||
1970 | #define safe_read_key(f, b, t) read_key(f, b, t) | ||
1971 | #else | ||
1972 | /* This version loops on EINTR: */ | ||
1973 | int64_t safe_read_key(int fd, char *buffer, int timeout) FAST_FUNC; | ||
1974 | #endif | ||
1966 | void read_key_ungets(char *buffer, const char *str, unsigned len) FAST_FUNC; | 1975 | void read_key_ungets(char *buffer, const char *str, unsigned len) FAST_FUNC; |
1967 | 1976 | ||
1968 | 1977 | ||
@@ -2016,7 +2025,8 @@ enum { | |||
2016 | USERNAME_COMPLETION = 4 * ENABLE_FEATURE_USERNAME_COMPLETION, | 2025 | USERNAME_COMPLETION = 4 * ENABLE_FEATURE_USERNAME_COMPLETION, |
2017 | VI_MODE = 8 * ENABLE_FEATURE_EDITING_VI, | 2026 | VI_MODE = 8 * ENABLE_FEATURE_EDITING_VI, |
2018 | WITH_PATH_LOOKUP = 0x10, | 2027 | WITH_PATH_LOOKUP = 0x10, |
2019 | FOR_SHELL = DO_HISTORY | TAB_COMPLETION | USERNAME_COMPLETION, | 2028 | LI_INTERRUPTIBLE = 0x20, |
2029 | FOR_SHELL = DO_HISTORY | TAB_COMPLETION | USERNAME_COMPLETION | LI_INTERRUPTIBLE, | ||
2020 | }; | 2030 | }; |
2021 | line_input_t *new_line_input_t(int flags) FAST_FUNC; | 2031 | line_input_t *new_line_input_t(int flags) FAST_FUNC; |
2022 | #if ENABLE_FEATURE_EDITING_SAVEHISTORY | 2032 | #if ENABLE_FEATURE_EDITING_SAVEHISTORY |
@@ -2361,7 +2371,7 @@ struct globals; | |||
2361 | /* '*const' ptr makes gcc optimize code much better. | 2371 | /* '*const' ptr makes gcc optimize code much better. |
2362 | * Magic prevents ptr_to_globals from going into rodata. | 2372 | * Magic prevents ptr_to_globals from going into rodata. |
2363 | * If you want to assign a value, use SET_PTR_TO_GLOBALS(x) */ | 2373 | * If you want to assign a value, use SET_PTR_TO_GLOBALS(x) */ |
2364 | extern struct globals *const ptr_to_globals; | 2374 | extern struct globals *BB_GLOBAL_CONST ptr_to_globals; |
2365 | 2375 | ||
2366 | #define barrier() asm volatile ("":::"memory") | 2376 | #define barrier() asm volatile ("":::"memory") |
2367 | 2377 | ||
diff --git a/include/platform.h b/include/platform.h index 3fb1a2dc8..8ae5ed4bc 100644 --- a/include/platform.h +++ b/include/platform.h | |||
@@ -367,6 +367,7 @@ typedef unsigned smalluint; | |||
367 | # define ALIGN4 | 367 | # define ALIGN4 |
368 | #endif | 368 | #endif |
369 | #define ALIGN8 __attribute__((aligned(8))) | 369 | #define ALIGN8 __attribute__((aligned(8))) |
370 | #define ALIGN_INT __attribute__((aligned(sizeof(int)))) | ||
370 | #define ALIGN_PTR __attribute__((aligned(sizeof(void*)))) | 371 | #define ALIGN_PTR __attribute__((aligned(sizeof(void*)))) |
371 | 372 | ||
372 | /* | 373 | /* |
diff --git a/libbb/Config.src b/libbb/Config.src index 708d3b0c8..0ecd5bd46 100644 --- a/libbb/Config.src +++ b/libbb/Config.src | |||
@@ -70,6 +70,12 @@ config SHA1_HWACCEL | |||
70 | On x86, this adds ~590 bytes of code. Throughput | 70 | On x86, this adds ~590 bytes of code. Throughput |
71 | is about twice as fast as fully-unrolled generic code. | 71 | is about twice as fast as fully-unrolled generic code. |
72 | 72 | ||
73 | config SHA256_HWACCEL | ||
74 | bool "SHA256: Use hardware accelerated instructions if possible" | ||
75 | default y | ||
76 | help | ||
77 | On x86, this adds ~1k bytes of code. | ||
78 | |||
73 | config SHA3_SMALL | 79 | config SHA3_SMALL |
74 | int "SHA3: Trade bytes for speed (0:fast, 1:slow)" | 80 | int "SHA3: Trade bytes for speed (0:fast, 1:slow)" |
75 | default 1 # all "fast or small" options default to small | 81 | default 1 # all "fast or small" options default to small |
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index 67d3c7cf7..191984c9d 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src | |||
@@ -48,6 +48,8 @@ lib-y += hash_md5_sha.o | |||
48 | lib-y += hash_md5_sha_x86-64.o | 48 | lib-y += hash_md5_sha_x86-64.o |
49 | lib-y += hash_md5_sha_x86-64_shaNI.o | 49 | lib-y += hash_md5_sha_x86-64_shaNI.o |
50 | lib-y += hash_md5_sha_x86-32_shaNI.o | 50 | lib-y += hash_md5_sha_x86-32_shaNI.o |
51 | lib-y += hash_md5_sha256_x86-64_shaNI.o | ||
52 | lib-y += hash_md5_sha256_x86-32_shaNI.o | ||
51 | # Alternative (disabled) MD5 implementation | 53 | # Alternative (disabled) MD5 implementation |
52 | #lib-y += hash_md5prime.o | 54 | #lib-y += hash_md5prime.o |
53 | lib-y += messages.o | 55 | lib-y += messages.o |
@@ -204,6 +206,7 @@ lib-$(CONFIG_PGREP) += xregcomp.o | |||
204 | lib-$(CONFIG_PKILL) += xregcomp.o | 206 | lib-$(CONFIG_PKILL) += xregcomp.o |
205 | lib-$(CONFIG_DEVFSD) += xregcomp.o | 207 | lib-$(CONFIG_DEVFSD) += xregcomp.o |
206 | lib-$(CONFIG_FEATURE_FIND_REGEX) += xregcomp.o | 208 | lib-$(CONFIG_FEATURE_FIND_REGEX) += xregcomp.o |
209 | lib-$(CONFIG_FEATURE_CUT_REGEX) += xregcomp.o | ||
207 | 210 | ||
208 | # Add the experimental logging functionality, only used by zcip | 211 | # Add the experimental logging functionality, only used by zcip |
209 | lib-$(CONFIG_ZCIP) += logenv.o | 212 | lib-$(CONFIG_ZCIP) += logenv.o |
diff --git a/libbb/appletlib.c b/libbb/appletlib.c index 6c0be4a83..a8b82e729 100644 --- a/libbb/appletlib.c +++ b/libbb/appletlib.c | |||
@@ -671,7 +671,7 @@ static void check_suid(int applet_no) | |||
671 | # if ENABLE_FEATURE_INSTALLER | 671 | # if ENABLE_FEATURE_INSTALLER |
672 | static const char usr_bin [] ALIGN1 = "/usr/bin/"; | 672 | static const char usr_bin [] ALIGN1 = "/usr/bin/"; |
673 | static const char usr_sbin[] ALIGN1 = "/usr/sbin/"; | 673 | static const char usr_sbin[] ALIGN1 = "/usr/sbin/"; |
674 | static const char *const install_dir[] = { | 674 | static const char *const install_dir[] ALIGN_PTR = { |
675 | &usr_bin [8], /* "/" */ | 675 | &usr_bin [8], /* "/" */ |
676 | &usr_bin [4], /* "/bin/" */ | 676 | &usr_bin [4], /* "/bin/" */ |
677 | &usr_sbin[4] /* "/sbin/" */ | 677 | &usr_sbin[4] /* "/sbin/" */ |
diff --git a/libbb/get_console.c b/libbb/get_console.c index 7f2c75332..9044efea1 100644 --- a/libbb/get_console.c +++ b/libbb/get_console.c | |||
@@ -37,7 +37,7 @@ static int open_a_console(const char *fnam) | |||
37 | */ | 37 | */ |
38 | int FAST_FUNC get_console_fd_or_die(void) | 38 | int FAST_FUNC get_console_fd_or_die(void) |
39 | { | 39 | { |
40 | static const char *const console_names[] = { | 40 | static const char *const console_names[] ALIGN_PTR = { |
41 | DEV_CONSOLE, CURRENT_VC, CURRENT_TTY | 41 | DEV_CONSOLE, CURRENT_VC, CURRENT_TTY |
42 | }; | 42 | }; |
43 | 43 | ||
diff --git a/libbb/getopt32.c b/libbb/getopt32.c index 5ab4d66f1..e861d0567 100644 --- a/libbb/getopt32.c +++ b/libbb/getopt32.c | |||
@@ -296,7 +296,7 @@ Special characters: | |||
296 | 296 | ||
297 | /* Code here assumes that 'unsigned' is at least 32 bits wide */ | 297 | /* Code here assumes that 'unsigned' is at least 32 bits wide */ |
298 | 298 | ||
299 | const char *const bb_argv_dash[] = { "-", NULL }; | 299 | const char *const bb_argv_dash[] ALIGN_PTR = { "-", NULL }; |
300 | 300 | ||
301 | enum { | 301 | enum { |
302 | PARAM_STRING, | 302 | PARAM_STRING, |
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index a23db5152..880ffab01 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c | |||
@@ -13,6 +13,27 @@ | |||
13 | 13 | ||
14 | #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA) | 14 | #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA) |
15 | 15 | ||
16 | #if ENABLE_SHA1_HWACCEL || ENABLE_SHA256_HWACCEL | ||
17 | # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) | ||
18 | static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) | ||
19 | { | ||
20 | asm ("cpuid" | ||
21 | : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) | ||
22 | : "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx) | ||
23 | ); | ||
24 | } | ||
25 | static smallint shaNI; | ||
26 | void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx); | ||
27 | void FAST_FUNC sha256_process_block64_shaNI(sha256_ctx_t *ctx); | ||
28 | # if defined(__i386__) | ||
29 | struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 76)]; }; | ||
30 | # endif | ||
31 | # if defined(__x86_64__) | ||
32 | struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 80)]; }; | ||
33 | # endif | ||
34 | # endif | ||
35 | #endif | ||
36 | |||
16 | /* gcc 4.2.1 optimizes rotr64 better with inline than with macro | 37 | /* gcc 4.2.1 optimizes rotr64 better with inline than with macro |
17 | * (for rotX32, there is no difference). Why? My guess is that | 38 | * (for rotX32, there is no difference). Why? My guess is that |
18 | * macro requires clever common subexpression elimination heuristics | 39 | * macro requires clever common subexpression elimination heuristics |
@@ -1142,25 +1163,6 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx) | |||
1142 | } | 1163 | } |
1143 | #endif /* NEED_SHA512 */ | 1164 | #endif /* NEED_SHA512 */ |
1144 | 1165 | ||
1145 | #if ENABLE_SHA1_HWACCEL | ||
1146 | # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) | ||
1147 | static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) | ||
1148 | { | ||
1149 | asm ("cpuid" | ||
1150 | : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx) | ||
1151 | : "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx) | ||
1152 | ); | ||
1153 | } | ||
1154 | void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx); | ||
1155 | # if defined(__i386__) | ||
1156 | struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; }; | ||
1157 | # endif | ||
1158 | # if defined(__x86_64__) | ||
1159 | struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; | ||
1160 | # endif | ||
1161 | # endif | ||
1162 | #endif | ||
1163 | |||
1164 | void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) | 1166 | void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) |
1165 | { | 1167 | { |
1166 | ctx->hash[0] = 0x67452301; | 1168 | ctx->hash[0] = 0x67452301; |
@@ -1173,7 +1175,6 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) | |||
1173 | #if ENABLE_SHA1_HWACCEL | 1175 | #if ENABLE_SHA1_HWACCEL |
1174 | # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) | 1176 | # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) |
1175 | { | 1177 | { |
1176 | static smallint shaNI; | ||
1177 | if (!shaNI) { | 1178 | if (!shaNI) { |
1178 | unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx; | 1179 | unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx; |
1179 | cpuid(&eax, &ebx, &ecx, &edx); | 1180 | cpuid(&eax, &ebx, &ecx, &edx); |
@@ -1225,6 +1226,19 @@ void FAST_FUNC sha256_begin(sha256_ctx_t *ctx) | |||
1225 | memcpy(&ctx->total64, init256, sizeof(init256)); | 1226 | memcpy(&ctx->total64, init256, sizeof(init256)); |
1226 | /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */ | 1227 | /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */ |
1227 | ctx->process_block = sha256_process_block64; | 1228 | ctx->process_block = sha256_process_block64; |
1229 | #if ENABLE_SHA256_HWACCEL | ||
1230 | # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) | ||
1231 | { | ||
1232 | if (!shaNI) { | ||
1233 | unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx; | ||
1234 | cpuid(&eax, &ebx, &ecx, &edx); | ||
1235 | shaNI = ((ebx >> 29) << 1) - 1; | ||
1236 | } | ||
1237 | if (shaNI > 0) | ||
1238 | ctx->process_block = sha256_process_block64_shaNI; | ||
1239 | } | ||
1240 | # endif | ||
1241 | #endif | ||
1228 | } | 1242 | } |
1229 | 1243 | ||
1230 | #if NEED_SHA512 | 1244 | #if NEED_SHA512 |
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S new file mode 100644 index 000000000..aa68193bd --- /dev/null +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S | |||
@@ -0,0 +1,277 @@ | |||
1 | #if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__) | ||
2 | /* The code is adapted from Linux kernel's source */ | ||
3 | |||
4 | // We use shorter insns, even though they are for "wrong" | ||
5 | // data type (fp, not int). | ||
6 | // For Intel, there is no penalty for doing it at all | ||
7 | // (CPUs which do have such penalty do not support SHA insns). | ||
8 | // For AMD, the penalty is one extra cycle | ||
9 | // (allegedly: I failed to find measurable difference). | ||
10 | |||
11 | //#define mova128 movdqa | ||
12 | #define mova128 movaps | ||
13 | //#define movu128 movdqu | ||
14 | #define movu128 movups | ||
15 | //#define shuf128_32 pshufd | ||
16 | #define shuf128_32 shufps | ||
17 | |||
18 | .section .text.sha256_process_block64_shaNI, "ax", @progbits | ||
19 | .globl sha256_process_block64_shaNI | ||
20 | .hidden sha256_process_block64_shaNI | ||
21 | .type sha256_process_block64_shaNI, @function | ||
22 | |||
23 | #define DATA_PTR %eax | ||
24 | |||
25 | #define SHA256CONSTANTS %ecx | ||
26 | |||
27 | #define MSG %xmm0 | ||
28 | #define STATE0 %xmm1 | ||
29 | #define STATE1 %xmm2 | ||
30 | #define MSGTMP0 %xmm3 | ||
31 | #define MSGTMP1 %xmm4 | ||
32 | #define MSGTMP2 %xmm5 | ||
33 | #define MSGTMP3 %xmm6 | ||
34 | |||
35 | #define XMMTMP %xmm7 | ||
36 | |||
37 | #define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) | ||
38 | |||
39 | .balign 8 # allow decoders to fetch at least 2 first insns | ||
40 | sha256_process_block64_shaNI: | ||
41 | |||
42 | movu128 76+0*16(%eax), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ | ||
43 | movu128 76+1*16(%eax), STATE1 /* HGFE */ | ||
44 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ | ||
45 | mova128 STATE1, STATE0 | ||
46 | shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ | ||
47 | shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ | ||
48 | |||
49 | /* XMMTMP holds flip mask from here... */ | ||
50 | mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP | ||
51 | movl $K256+8*16, SHA256CONSTANTS | ||
52 | |||
53 | /* Rounds 0-3 */ | ||
54 | movu128 0*16(DATA_PTR), MSG | ||
55 | pshufb XMMTMP, MSG | ||
56 | mova128 MSG, MSGTMP0 | ||
57 | paddd 0*16-8*16(SHA256CONSTANTS), MSG | ||
58 | sha256rnds2 STATE0, STATE1 | ||
59 | shuf128_32 $0x0E, MSG, MSG | ||
60 | sha256rnds2 STATE1, STATE0 | ||
61 | |||
62 | /* Rounds 4-7 */ | ||
63 | movu128 1*16(DATA_PTR), MSG | ||
64 | pshufb XMMTMP, MSG | ||
65 | mova128 MSG, MSGTMP1 | ||
66 | paddd 1*16-8*16(SHA256CONSTANTS), MSG | ||
67 | sha256rnds2 STATE0, STATE1 | ||
68 | shuf128_32 $0x0E, MSG, MSG | ||
69 | sha256rnds2 STATE1, STATE0 | ||
70 | sha256msg1 MSGTMP1, MSGTMP0 | ||
71 | |||
72 | /* Rounds 8-11 */ | ||
73 | movu128 2*16(DATA_PTR), MSG | ||
74 | pshufb XMMTMP, MSG | ||
75 | mova128 MSG, MSGTMP2 | ||
76 | paddd 2*16-8*16(SHA256CONSTANTS), MSG | ||
77 | sha256rnds2 STATE0, STATE1 | ||
78 | shuf128_32 $0x0E, MSG, MSG | ||
79 | sha256rnds2 STATE1, STATE0 | ||
80 | sha256msg1 MSGTMP2, MSGTMP1 | ||
81 | |||
82 | /* Rounds 12-15 */ | ||
83 | movu128 3*16(DATA_PTR), MSG | ||
84 | pshufb XMMTMP, MSG | ||
85 | /* ...to here */ | ||
86 | mova128 MSG, MSGTMP3 | ||
87 | paddd 3*16-8*16(SHA256CONSTANTS), MSG | ||
88 | sha256rnds2 STATE0, STATE1 | ||
89 | mova128 MSGTMP3, XMMTMP | ||
90 | palignr $4, MSGTMP2, XMMTMP | ||
91 | paddd XMMTMP, MSGTMP0 | ||
92 | sha256msg2 MSGTMP3, MSGTMP0 | ||
93 | shuf128_32 $0x0E, MSG, MSG | ||
94 | sha256rnds2 STATE1, STATE0 | ||
95 | sha256msg1 MSGTMP3, MSGTMP2 | ||
96 | |||
97 | /* Rounds 16-19 */ | ||
98 | mova128 MSGTMP0, MSG | ||
99 | paddd 4*16-8*16(SHA256CONSTANTS), MSG | ||
100 | sha256rnds2 STATE0, STATE1 | ||
101 | mova128 MSGTMP0, XMMTMP | ||
102 | palignr $4, MSGTMP3, XMMTMP | ||
103 | paddd XMMTMP, MSGTMP1 | ||
104 | sha256msg2 MSGTMP0, MSGTMP1 | ||
105 | shuf128_32 $0x0E, MSG, MSG | ||
106 | sha256rnds2 STATE1, STATE0 | ||
107 | sha256msg1 MSGTMP0, MSGTMP3 | ||
108 | |||
109 | /* Rounds 20-23 */ | ||
110 | mova128 MSGTMP1, MSG | ||
111 | paddd 5*16-8*16(SHA256CONSTANTS), MSG | ||
112 | sha256rnds2 STATE0, STATE1 | ||
113 | mova128 MSGTMP1, XMMTMP | ||
114 | palignr $4, MSGTMP0, XMMTMP | ||
115 | paddd XMMTMP, MSGTMP2 | ||
116 | sha256msg2 MSGTMP1, MSGTMP2 | ||
117 | shuf128_32 $0x0E, MSG, MSG | ||
118 | sha256rnds2 STATE1, STATE0 | ||
119 | sha256msg1 MSGTMP1, MSGTMP0 | ||
120 | |||
121 | /* Rounds 24-27 */ | ||
122 | mova128 MSGTMP2, MSG | ||
123 | paddd 6*16-8*16(SHA256CONSTANTS), MSG | ||
124 | sha256rnds2 STATE0, STATE1 | ||
125 | mova128 MSGTMP2, XMMTMP | ||
126 | palignr $4, MSGTMP1, XMMTMP | ||
127 | paddd XMMTMP, MSGTMP3 | ||
128 | sha256msg2 MSGTMP2, MSGTMP3 | ||
129 | shuf128_32 $0x0E, MSG, MSG | ||
130 | sha256rnds2 STATE1, STATE0 | ||
131 | sha256msg1 MSGTMP2, MSGTMP1 | ||
132 | |||
133 | /* Rounds 28-31 */ | ||
134 | mova128 MSGTMP3, MSG | ||
135 | paddd 7*16-8*16(SHA256CONSTANTS), MSG | ||
136 | sha256rnds2 STATE0, STATE1 | ||
137 | mova128 MSGTMP3, XMMTMP | ||
138 | palignr $4, MSGTMP2, XMMTMP | ||
139 | paddd XMMTMP, MSGTMP0 | ||
140 | sha256msg2 MSGTMP3, MSGTMP0 | ||
141 | shuf128_32 $0x0E, MSG, MSG | ||
142 | sha256rnds2 STATE1, STATE0 | ||
143 | sha256msg1 MSGTMP3, MSGTMP2 | ||
144 | |||
145 | /* Rounds 32-35 */ | ||
146 | mova128 MSGTMP0, MSG | ||
147 | paddd 8*16-8*16(SHA256CONSTANTS), MSG | ||
148 | sha256rnds2 STATE0, STATE1 | ||
149 | mova128 MSGTMP0, XMMTMP | ||
150 | palignr $4, MSGTMP3, XMMTMP | ||
151 | paddd XMMTMP, MSGTMP1 | ||
152 | sha256msg2 MSGTMP0, MSGTMP1 | ||
153 | shuf128_32 $0x0E, MSG, MSG | ||
154 | sha256rnds2 STATE1, STATE0 | ||
155 | sha256msg1 MSGTMP0, MSGTMP3 | ||
156 | |||
157 | /* Rounds 36-39 */ | ||
158 | mova128 MSGTMP1, MSG | ||
159 | paddd 9*16-8*16(SHA256CONSTANTS), MSG | ||
160 | sha256rnds2 STATE0, STATE1 | ||
161 | mova128 MSGTMP1, XMMTMP | ||
162 | palignr $4, MSGTMP0, XMMTMP | ||
163 | paddd XMMTMP, MSGTMP2 | ||
164 | sha256msg2 MSGTMP1, MSGTMP2 | ||
165 | shuf128_32 $0x0E, MSG, MSG | ||
166 | sha256rnds2 STATE1, STATE0 | ||
167 | sha256msg1 MSGTMP1, MSGTMP0 | ||
168 | |||
169 | /* Rounds 40-43 */ | ||
170 | mova128 MSGTMP2, MSG | ||
171 | paddd 10*16-8*16(SHA256CONSTANTS), MSG | ||
172 | sha256rnds2 STATE0, STATE1 | ||
173 | mova128 MSGTMP2, XMMTMP | ||
174 | palignr $4, MSGTMP1, XMMTMP | ||
175 | paddd XMMTMP, MSGTMP3 | ||
176 | sha256msg2 MSGTMP2, MSGTMP3 | ||
177 | shuf128_32 $0x0E, MSG, MSG | ||
178 | sha256rnds2 STATE1, STATE0 | ||
179 | sha256msg1 MSGTMP2, MSGTMP1 | ||
180 | |||
181 | /* Rounds 44-47 */ | ||
182 | mova128 MSGTMP3, MSG | ||
183 | paddd 11*16-8*16(SHA256CONSTANTS), MSG | ||
184 | sha256rnds2 STATE0, STATE1 | ||
185 | mova128 MSGTMP3, XMMTMP | ||
186 | palignr $4, MSGTMP2, XMMTMP | ||
187 | paddd XMMTMP, MSGTMP0 | ||
188 | sha256msg2 MSGTMP3, MSGTMP0 | ||
189 | shuf128_32 $0x0E, MSG, MSG | ||
190 | sha256rnds2 STATE1, STATE0 | ||
191 | sha256msg1 MSGTMP3, MSGTMP2 | ||
192 | |||
193 | /* Rounds 48-51 */ | ||
194 | mova128 MSGTMP0, MSG | ||
195 | paddd 12*16-8*16(SHA256CONSTANTS), MSG | ||
196 | sha256rnds2 STATE0, STATE1 | ||
197 | mova128 MSGTMP0, XMMTMP | ||
198 | palignr $4, MSGTMP3, XMMTMP | ||
199 | paddd XMMTMP, MSGTMP1 | ||
200 | sha256msg2 MSGTMP0, MSGTMP1 | ||
201 | shuf128_32 $0x0E, MSG, MSG | ||
202 | sha256rnds2 STATE1, STATE0 | ||
203 | sha256msg1 MSGTMP0, MSGTMP3 | ||
204 | |||
205 | /* Rounds 52-55 */ | ||
206 | mova128 MSGTMP1, MSG | ||
207 | paddd 13*16-8*16(SHA256CONSTANTS), MSG | ||
208 | sha256rnds2 STATE0, STATE1 | ||
209 | mova128 MSGTMP1, XMMTMP | ||
210 | palignr $4, MSGTMP0, XMMTMP | ||
211 | paddd XMMTMP, MSGTMP2 | ||
212 | sha256msg2 MSGTMP1, MSGTMP2 | ||
213 | shuf128_32 $0x0E, MSG, MSG | ||
214 | sha256rnds2 STATE1, STATE0 | ||
215 | |||
216 | /* Rounds 56-59 */ | ||
217 | mova128 MSGTMP2, MSG | ||
218 | paddd 14*16-8*16(SHA256CONSTANTS), MSG | ||
219 | sha256rnds2 STATE0, STATE1 | ||
220 | mova128 MSGTMP2, XMMTMP | ||
221 | palignr $4, MSGTMP1, XMMTMP | ||
222 | paddd XMMTMP, MSGTMP3 | ||
223 | sha256msg2 MSGTMP2, MSGTMP3 | ||
224 | shuf128_32 $0x0E, MSG, MSG | ||
225 | sha256rnds2 STATE1, STATE0 | ||
226 | |||
227 | /* Rounds 60-63 */ | ||
228 | mova128 MSGTMP3, MSG | ||
229 | paddd 15*16-8*16(SHA256CONSTANTS), MSG | ||
230 | sha256rnds2 STATE0, STATE1 | ||
231 | shuf128_32 $0x0E, MSG, MSG | ||
232 | sha256rnds2 STATE1, STATE0 | ||
233 | |||
234 | /* Write hash values back in the correct order */ | ||
235 | /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */ | ||
236 | /* STATE1: CDGH */ | ||
237 | mova128 STATE0, XMMTMP | ||
238 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ | ||
239 | shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ | ||
240 | shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ | ||
241 | /* add current hash values to previous ones */ | ||
242 | movu128 76+1*16(%eax), STATE1 | ||
243 | paddd XMMTMP, STATE1 | ||
244 | movu128 STATE1, 76+1*16(%eax) | ||
245 | movu128 76+0*16(%eax), XMMTMP | ||
246 | paddd XMMTMP, STATE0 | ||
247 | movu128 STATE0, 76+0*16(%eax) | ||
248 | |||
249 | ret | ||
250 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI | ||
251 | |||
252 | .section .rodata.cst256.K256, "aM", @progbits, 256 | ||
253 | .balign 16 | ||
254 | K256: | ||
255 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
256 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
257 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
258 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
259 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
260 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
261 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
262 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
263 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
264 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
265 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
266 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
267 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
268 | .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
269 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
270 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
271 | |||
272 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 | ||
273 | .balign 16 | ||
274 | PSHUFFLE_BSWAP32_FLIP_MASK: | ||
275 | .octa 0x0c0d0e0f08090a0b0405060700010203 | ||
276 | |||
277 | #endif | ||
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S new file mode 100644 index 000000000..4663f750a --- /dev/null +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S | |||
@@ -0,0 +1,284 @@ | |||
1 | #if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__) | ||
2 | /* The code is adapted from Linux kernel's source */ | ||
3 | |||
4 | // We use shorter insns, even though they are for "wrong" | ||
5 | // data type (fp, not int). | ||
6 | // For Intel, there is no penalty for doing it at all | ||
7 | // (CPUs which do have such penalty do not support SHA insns). | ||
8 | // For AMD, the penalty is one extra cycle | ||
9 | // (allegedly: I failed to find measurable difference). | ||
10 | |||
11 | //#define mova128 movdqa | ||
12 | #define mova128 movaps | ||
13 | //#define movu128 movdqu | ||
14 | #define movu128 movups | ||
15 | //#define shuf128_32 pshufd | ||
16 | #define shuf128_32 shufps | ||
17 | |||
18 | .section .text.sha256_process_block64_shaNI, "ax", @progbits | ||
19 | .globl sha256_process_block64_shaNI | ||
20 | .hidden sha256_process_block64_shaNI | ||
21 | .type sha256_process_block64_shaNI, @function | ||
22 | |||
23 | #define DATA_PTR %rdi | ||
24 | |||
25 | #define SHA256CONSTANTS %rax | ||
26 | |||
27 | #define MSG %xmm0 | ||
28 | #define STATE0 %xmm1 | ||
29 | #define STATE1 %xmm2 | ||
30 | #define MSGTMP0 %xmm3 | ||
31 | #define MSGTMP1 %xmm4 | ||
32 | #define MSGTMP2 %xmm5 | ||
33 | #define MSGTMP3 %xmm6 | ||
34 | |||
35 | #define XMMTMP %xmm7 | ||
36 | |||
37 | #define ABEF_SAVE %xmm9 | ||
38 | #define CDGH_SAVE %xmm10 | ||
39 | |||
40 | #define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) | ||
41 | |||
42 | .balign 8 # allow decoders to fetch at least 2 first insns | ||
43 | sha256_process_block64_shaNI: | ||
44 | |||
45 | movu128 80+0*16(%rdi), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ | ||
46 | movu128 80+1*16(%rdi), STATE1 /* HGFE */ | ||
47 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ | ||
48 | mova128 STATE1, STATE0 | ||
49 | shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ | ||
50 | shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ | ||
51 | |||
52 | /* XMMTMP holds flip mask from here... */ | ||
53 | mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP | ||
54 | leaq K256+8*16(%rip), SHA256CONSTANTS | ||
55 | |||
56 | /* Save hash values for addition after rounds */ | ||
57 | mova128 STATE0, ABEF_SAVE | ||
58 | mova128 STATE1, CDGH_SAVE | ||
59 | |||
60 | /* Rounds 0-3 */ | ||
61 | movu128 0*16(DATA_PTR), MSG | ||
62 | pshufb XMMTMP, MSG | ||
63 | mova128 MSG, MSGTMP0 | ||
64 | paddd 0*16-8*16(SHA256CONSTANTS), MSG | ||
65 | sha256rnds2 STATE0, STATE1 | ||
66 | shuf128_32 $0x0E, MSG, MSG | ||
67 | sha256rnds2 STATE1, STATE0 | ||
68 | |||
69 | /* Rounds 4-7 */ | ||
70 | movu128 1*16(DATA_PTR), MSG | ||
71 | pshufb XMMTMP, MSG | ||
72 | mova128 MSG, MSGTMP1 | ||
73 | paddd 1*16-8*16(SHA256CONSTANTS), MSG | ||
74 | sha256rnds2 STATE0, STATE1 | ||
75 | shuf128_32 $0x0E, MSG, MSG | ||
76 | sha256rnds2 STATE1, STATE0 | ||
77 | sha256msg1 MSGTMP1, MSGTMP0 | ||
78 | |||
79 | /* Rounds 8-11 */ | ||
80 | movu128 2*16(DATA_PTR), MSG | ||
81 | pshufb XMMTMP, MSG | ||
82 | mova128 MSG, MSGTMP2 | ||
83 | paddd 2*16-8*16(SHA256CONSTANTS), MSG | ||
84 | sha256rnds2 STATE0, STATE1 | ||
85 | shuf128_32 $0x0E, MSG, MSG | ||
86 | sha256rnds2 STATE1, STATE0 | ||
87 | sha256msg1 MSGTMP2, MSGTMP1 | ||
88 | |||
89 | /* Rounds 12-15 */ | ||
90 | movu128 3*16(DATA_PTR), MSG | ||
91 | pshufb XMMTMP, MSG | ||
92 | /* ...to here */ | ||
93 | mova128 MSG, MSGTMP3 | ||
94 | paddd 3*16-8*16(SHA256CONSTANTS), MSG | ||
95 | sha256rnds2 STATE0, STATE1 | ||
96 | mova128 MSGTMP3, XMMTMP | ||
97 | palignr $4, MSGTMP2, XMMTMP | ||
98 | paddd XMMTMP, MSGTMP0 | ||
99 | sha256msg2 MSGTMP3, MSGTMP0 | ||
100 | shuf128_32 $0x0E, MSG, MSG | ||
101 | sha256rnds2 STATE1, STATE0 | ||
102 | sha256msg1 MSGTMP3, MSGTMP2 | ||
103 | |||
104 | /* Rounds 16-19 */ | ||
105 | mova128 MSGTMP0, MSG | ||
106 | paddd 4*16-8*16(SHA256CONSTANTS), MSG | ||
107 | sha256rnds2 STATE0, STATE1 | ||
108 | mova128 MSGTMP0, XMMTMP | ||
109 | palignr $4, MSGTMP3, XMMTMP | ||
110 | paddd XMMTMP, MSGTMP1 | ||
111 | sha256msg2 MSGTMP0, MSGTMP1 | ||
112 | shuf128_32 $0x0E, MSG, MSG | ||
113 | sha256rnds2 STATE1, STATE0 | ||
114 | sha256msg1 MSGTMP0, MSGTMP3 | ||
115 | |||
116 | /* Rounds 20-23 */ | ||
117 | mova128 MSGTMP1, MSG | ||
118 | paddd 5*16-8*16(SHA256CONSTANTS), MSG | ||
119 | sha256rnds2 STATE0, STATE1 | ||
120 | mova128 MSGTMP1, XMMTMP | ||
121 | palignr $4, MSGTMP0, XMMTMP | ||
122 | paddd XMMTMP, MSGTMP2 | ||
123 | sha256msg2 MSGTMP1, MSGTMP2 | ||
124 | shuf128_32 $0x0E, MSG, MSG | ||
125 | sha256rnds2 STATE1, STATE0 | ||
126 | sha256msg1 MSGTMP1, MSGTMP0 | ||
127 | |||
128 | /* Rounds 24-27 */ | ||
129 | mova128 MSGTMP2, MSG | ||
130 | paddd 6*16-8*16(SHA256CONSTANTS), MSG | ||
131 | sha256rnds2 STATE0, STATE1 | ||
132 | mova128 MSGTMP2, XMMTMP | ||
133 | palignr $4, MSGTMP1, XMMTMP | ||
134 | paddd XMMTMP, MSGTMP3 | ||
135 | sha256msg2 MSGTMP2, MSGTMP3 | ||
136 | shuf128_32 $0x0E, MSG, MSG | ||
137 | sha256rnds2 STATE1, STATE0 | ||
138 | sha256msg1 MSGTMP2, MSGTMP1 | ||
139 | |||
140 | /* Rounds 28-31 */ | ||
141 | mova128 MSGTMP3, MSG | ||
142 | paddd 7*16-8*16(SHA256CONSTANTS), MSG | ||
143 | sha256rnds2 STATE0, STATE1 | ||
144 | mova128 MSGTMP3, XMMTMP | ||
145 | palignr $4, MSGTMP2, XMMTMP | ||
146 | paddd XMMTMP, MSGTMP0 | ||
147 | sha256msg2 MSGTMP3, MSGTMP0 | ||
148 | shuf128_32 $0x0E, MSG, MSG | ||
149 | sha256rnds2 STATE1, STATE0 | ||
150 | sha256msg1 MSGTMP3, MSGTMP2 | ||
151 | |||
152 | /* Rounds 32-35 */ | ||
153 | mova128 MSGTMP0, MSG | ||
154 | paddd 8*16-8*16(SHA256CONSTANTS), MSG | ||
155 | sha256rnds2 STATE0, STATE1 | ||
156 | mova128 MSGTMP0, XMMTMP | ||
157 | palignr $4, MSGTMP3, XMMTMP | ||
158 | paddd XMMTMP, MSGTMP1 | ||
159 | sha256msg2 MSGTMP0, MSGTMP1 | ||
160 | shuf128_32 $0x0E, MSG, MSG | ||
161 | sha256rnds2 STATE1, STATE0 | ||
162 | sha256msg1 MSGTMP0, MSGTMP3 | ||
163 | |||
164 | /* Rounds 36-39 */ | ||
165 | mova128 MSGTMP1, MSG | ||
166 | paddd 9*16-8*16(SHA256CONSTANTS), MSG | ||
167 | sha256rnds2 STATE0, STATE1 | ||
168 | mova128 MSGTMP1, XMMTMP | ||
169 | palignr $4, MSGTMP0, XMMTMP | ||
170 | paddd XMMTMP, MSGTMP2 | ||
171 | sha256msg2 MSGTMP1, MSGTMP2 | ||
172 | shuf128_32 $0x0E, MSG, MSG | ||
173 | sha256rnds2 STATE1, STATE0 | ||
174 | sha256msg1 MSGTMP1, MSGTMP0 | ||
175 | |||
176 | /* Rounds 40-43 */ | ||
177 | mova128 MSGTMP2, MSG | ||
178 | paddd 10*16-8*16(SHA256CONSTANTS), MSG | ||
179 | sha256rnds2 STATE0, STATE1 | ||
180 | mova128 MSGTMP2, XMMTMP | ||
181 | palignr $4, MSGTMP1, XMMTMP | ||
182 | paddd XMMTMP, MSGTMP3 | ||
183 | sha256msg2 MSGTMP2, MSGTMP3 | ||
184 | shuf128_32 $0x0E, MSG, MSG | ||
185 | sha256rnds2 STATE1, STATE0 | ||
186 | sha256msg1 MSGTMP2, MSGTMP1 | ||
187 | |||
188 | /* Rounds 44-47 */ | ||
189 | mova128 MSGTMP3, MSG | ||
190 | paddd 11*16-8*16(SHA256CONSTANTS), MSG | ||
191 | sha256rnds2 STATE0, STATE1 | ||
192 | mova128 MSGTMP3, XMMTMP | ||
193 | palignr $4, MSGTMP2, XMMTMP | ||
194 | paddd XMMTMP, MSGTMP0 | ||
195 | sha256msg2 MSGTMP3, MSGTMP0 | ||
196 | shuf128_32 $0x0E, MSG, MSG | ||
197 | sha256rnds2 STATE1, STATE0 | ||
198 | sha256msg1 MSGTMP3, MSGTMP2 | ||
199 | |||
200 | /* Rounds 48-51 */ | ||
201 | mova128 MSGTMP0, MSG | ||
202 | paddd 12*16-8*16(SHA256CONSTANTS), MSG | ||
203 | sha256rnds2 STATE0, STATE1 | ||
204 | mova128 MSGTMP0, XMMTMP | ||
205 | palignr $4, MSGTMP3, XMMTMP | ||
206 | paddd XMMTMP, MSGTMP1 | ||
207 | sha256msg2 MSGTMP0, MSGTMP1 | ||
208 | shuf128_32 $0x0E, MSG, MSG | ||
209 | sha256rnds2 STATE1, STATE0 | ||
210 | sha256msg1 MSGTMP0, MSGTMP3 | ||
211 | |||
212 | /* Rounds 52-55 */ | ||
213 | mova128 MSGTMP1, MSG | ||
214 | paddd 13*16-8*16(SHA256CONSTANTS), MSG | ||
215 | sha256rnds2 STATE0, STATE1 | ||
216 | mova128 MSGTMP1, XMMTMP | ||
217 | palignr $4, MSGTMP0, XMMTMP | ||
218 | paddd XMMTMP, MSGTMP2 | ||
219 | sha256msg2 MSGTMP1, MSGTMP2 | ||
220 | shuf128_32 $0x0E, MSG, MSG | ||
221 | sha256rnds2 STATE1, STATE0 | ||
222 | |||
223 | /* Rounds 56-59 */ | ||
224 | mova128 MSGTMP2, MSG | ||
225 | paddd 14*16-8*16(SHA256CONSTANTS), MSG | ||
226 | sha256rnds2 STATE0, STATE1 | ||
227 | mova128 MSGTMP2, XMMTMP | ||
228 | palignr $4, MSGTMP1, XMMTMP | ||
229 | paddd XMMTMP, MSGTMP3 | ||
230 | sha256msg2 MSGTMP2, MSGTMP3 | ||
231 | shuf128_32 $0x0E, MSG, MSG | ||
232 | sha256rnds2 STATE1, STATE0 | ||
233 | |||
234 | /* Rounds 60-63 */ | ||
235 | mova128 MSGTMP3, MSG | ||
236 | paddd 15*16-8*16(SHA256CONSTANTS), MSG | ||
237 | sha256rnds2 STATE0, STATE1 | ||
238 | shuf128_32 $0x0E, MSG, MSG | ||
239 | sha256rnds2 STATE1, STATE0 | ||
240 | |||
241 | /* Add current hash values with previously saved */ | ||
242 | paddd ABEF_SAVE, STATE0 | ||
243 | paddd CDGH_SAVE, STATE1 | ||
244 | |||
245 | /* Write hash values back in the correct order */ | ||
246 | /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */ | ||
247 | /* STATE1: CDGH */ | ||
248 | mova128 STATE0, XMMTMP | ||
249 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ | ||
250 | shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ | ||
251 | shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ | ||
252 | |||
253 | movu128 STATE0, 80+0*16(%rdi) | ||
254 | movu128 XMMTMP, 80+1*16(%rdi) | ||
255 | |||
256 | ret | ||
257 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI | ||
258 | |||
259 | .section .rodata.cst256.K256, "aM", @progbits, 256 | ||
260 | .balign 16 | ||
261 | K256: | ||
262 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | ||
263 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | ||
264 | .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 | ||
265 | .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 | ||
266 | .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc | ||
267 | .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da | ||
268 | .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 | ||
269 | .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 | ||
270 | .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 | ||
271 | .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 | ||
272 | .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 | ||
273 | .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 | ||
274 | .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 | ||
275 | .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 | ||
276 | .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 | ||
277 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | ||
278 | |||
279 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 | ||
280 | .balign 16 | ||
281 | PSHUFFLE_BSWAP32_FLIP_MASK: | ||
282 | .octa 0x0c0d0e0f08090a0b0405060700010203 | ||
283 | |||
284 | #endif | ||
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index 166cfd38a..a61b3cbed 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S | |||
@@ -20,7 +20,7 @@ | |||
20 | #define extr128_32 pextrd | 20 | #define extr128_32 pextrd |
21 | //#define extr128_32 extractps # not shorter | 21 | //#define extr128_32 extractps # not shorter |
22 | 22 | ||
23 | .section .text.sha1_process_block64_shaNI,"ax",@progbits | 23 | .section .text.sha1_process_block64_shaNI, "ax", @progbits |
24 | .globl sha1_process_block64_shaNI | 24 | .globl sha1_process_block64_shaNI |
25 | .hidden sha1_process_block64_shaNI | 25 | .hidden sha1_process_block64_shaNI |
26 | .type sha1_process_block64_shaNI, @function | 26 | .type sha1_process_block64_shaNI, @function |
@@ -32,45 +32,42 @@ | |||
32 | #define MSG1 %xmm4 | 32 | #define MSG1 %xmm4 |
33 | #define MSG2 %xmm5 | 33 | #define MSG2 %xmm5 |
34 | #define MSG3 %xmm6 | 34 | #define MSG3 %xmm6 |
35 | #define SHUF_MASK %xmm7 | ||
36 | 35 | ||
37 | .balign 8 # allow decoders to fetch at least 3 first insns | 36 | .balign 8 # allow decoders to fetch at least 2 first insns |
38 | sha1_process_block64_shaNI: | 37 | sha1_process_block64_shaNI: |
39 | pushl %ebp | ||
40 | movl %esp, %ebp | ||
41 | subl $32, %esp | ||
42 | andl $~0xF, %esp # paddd needs aligned memory operand | ||
43 | |||
44 | /* load initial hash values */ | 38 | /* load initial hash values */ |
45 | xor128 E0, E0 | ||
46 | movu128 76(%eax), ABCD | 39 | movu128 76(%eax), ABCD |
40 | xor128 E0, E0 | ||
47 | pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word | 41 | pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word |
48 | shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD | 42 | shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD |
49 | 43 | ||
50 | mova128 PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK | 44 | mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7 |
45 | |||
46 | movu128 0*16(%eax), MSG0 | ||
47 | pshufb %xmm7, MSG0 | ||
48 | movu128 1*16(%eax), MSG1 | ||
49 | pshufb %xmm7, MSG1 | ||
50 | movu128 2*16(%eax), MSG2 | ||
51 | pshufb %xmm7, MSG2 | ||
52 | movu128 3*16(%eax), MSG3 | ||
53 | pshufb %xmm7, MSG3 | ||
51 | 54 | ||
52 | /* Save hash values for addition after rounds */ | 55 | /* Save hash values for addition after rounds */ |
53 | movu128 E0, 16(%esp) | 56 | mova128 E0, %xmm7 |
54 | movu128 ABCD, (%esp) | 57 | /*mova128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */ |
55 | 58 | ||
56 | /* Rounds 0-3 */ | 59 | /* Rounds 0-3 */ |
57 | movu128 0*16(%eax), MSG0 | ||
58 | pshufb SHUF_MASK, MSG0 | ||
59 | paddd MSG0, E0 | 60 | paddd MSG0, E0 |
60 | mova128 ABCD, E1 | 61 | mova128 ABCD, E1 |
61 | sha1rnds4 $0, E0, ABCD | 62 | sha1rnds4 $0, E0, ABCD |
62 | 63 | ||
63 | /* Rounds 4-7 */ | 64 | /* Rounds 4-7 */ |
64 | movu128 1*16(%eax), MSG1 | ||
65 | pshufb SHUF_MASK, MSG1 | ||
66 | sha1nexte MSG1, E1 | 65 | sha1nexte MSG1, E1 |
67 | mova128 ABCD, E0 | 66 | mova128 ABCD, E0 |
68 | sha1rnds4 $0, E1, ABCD | 67 | sha1rnds4 $0, E1, ABCD |
69 | sha1msg1 MSG1, MSG0 | 68 | sha1msg1 MSG1, MSG0 |
70 | 69 | ||
71 | /* Rounds 8-11 */ | 70 | /* Rounds 8-11 */ |
72 | movu128 2*16(%eax), MSG2 | ||
73 | pshufb SHUF_MASK, MSG2 | ||
74 | sha1nexte MSG2, E0 | 71 | sha1nexte MSG2, E0 |
75 | mova128 ABCD, E1 | 72 | mova128 ABCD, E1 |
76 | sha1rnds4 $0, E0, ABCD | 73 | sha1rnds4 $0, E0, ABCD |
@@ -78,8 +75,6 @@ sha1_process_block64_shaNI: | |||
78 | xor128 MSG2, MSG0 | 75 | xor128 MSG2, MSG0 |
79 | 76 | ||
80 | /* Rounds 12-15 */ | 77 | /* Rounds 12-15 */ |
81 | movu128 3*16(%eax), MSG3 | ||
82 | pshufb SHUF_MASK, MSG3 | ||
83 | sha1nexte MSG3, E1 | 78 | sha1nexte MSG3, E1 |
84 | mova128 ABCD, E0 | 79 | mova128 ABCD, E0 |
85 | sha1msg2 MSG3, MSG0 | 80 | sha1msg2 MSG3, MSG0 |
@@ -210,21 +205,21 @@ sha1_process_block64_shaNI: | |||
210 | sha1rnds4 $3, E1, ABCD | 205 | sha1rnds4 $3, E1, ABCD |
211 | 206 | ||
212 | /* Add current hash values with previously saved */ | 207 | /* Add current hash values with previously saved */ |
213 | sha1nexte 16(%esp), E0 | 208 | sha1nexte %xmm7, E0 |
214 | paddd (%esp), ABCD | 209 | /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */ |
210 | movu128 76(%eax), %xmm7 # get original ABCD (not shuffled)... | ||
215 | 211 | ||
216 | /* Write hash values back in the correct order */ | 212 | /* Write hash values back in the correct order */ |
217 | shuf128_32 $0x1B, ABCD, ABCD | 213 | shuf128_32 $0x1B, ABCD, ABCD |
214 | paddd %xmm7, ABCD # ...add it to final ABCD | ||
218 | movu128 ABCD, 76(%eax) | 215 | movu128 ABCD, 76(%eax) |
219 | extr128_32 $3, E0, 76+4*4(%eax) | 216 | extr128_32 $3, E0, 76+4*4(%eax) |
220 | 217 | ||
221 | movl %ebp, %esp | ||
222 | popl %ebp | ||
223 | ret | 218 | ret |
224 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | 219 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI |
225 | 220 | ||
226 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | 221 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 |
227 | .align 16 | 222 | .balign 16 |
228 | PSHUFFLE_BYTE_FLIP_MASK: | 223 | PSHUFFLE_BYTE_FLIP_MASK: |
229 | .octa 0x000102030405060708090a0b0c0d0e0f | 224 | .octa 0x000102030405060708090a0b0c0d0e0f |
230 | 225 | ||
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 87fb616a1..287cfe547 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S | |||
@@ -1,7 +1,7 @@ | |||
1 | ### Generated by hash_md5_sha_x86-64.S.sh ### | 1 | ### Generated by hash_md5_sha_x86-64.S.sh ### |
2 | 2 | ||
3 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) | 3 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) |
4 | .section .text.sha1_process_block64,"ax",@progbits | 4 | .section .text.sha1_process_block64, "ax", @progbits |
5 | .globl sha1_process_block64 | 5 | .globl sha1_process_block64 |
6 | .hidden sha1_process_block64 | 6 | .hidden sha1_process_block64 |
7 | .type sha1_process_block64, @function | 7 | .type sha1_process_block64, @function |
@@ -10,7 +10,7 @@ | |||
10 | sha1_process_block64: | 10 | sha1_process_block64: |
11 | pushq %rbp # 1 byte insn | 11 | pushq %rbp # 1 byte insn |
12 | pushq %rbx # 1 byte insn | 12 | pushq %rbx # 1 byte insn |
13 | pushq %r15 # 2 byte insn | 13 | # pushq %r15 # 2 byte insn |
14 | pushq %r14 # 2 byte insn | 14 | pushq %r14 # 2 byte insn |
15 | pushq %r13 # 2 byte insn | 15 | pushq %r13 # 2 byte insn |
16 | pushq %r12 # 2 byte insn | 16 | pushq %r12 # 2 byte insn |
@@ -19,17 +19,13 @@ sha1_process_block64: | |||
19 | #Register and stack use: | 19 | #Register and stack use: |
20 | # eax..edx: a..d | 20 | # eax..edx: a..d |
21 | # ebp: e | 21 | # ebp: e |
22 | # esi,edi: temps | 22 | # esi,edi,r8..r14: temps |
23 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] | 23 | # r15: unused |
24 | # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) | 24 | # xmm0..xmm3: W[] |
25 | movl $3, %eax | 25 | # xmm4,xmm5: temps |
26 | 1: | 26 | # xmm6: current round constant |
27 | movq (%rdi,%rax,8), %rsi | 27 | # xmm7: all round constants |
28 | bswapq %rsi | 28 | # -64(%rsp): area for passing RCONST + W[] from vector to integer units |
29 | rolq $32, %rsi | ||
30 | movq %rsi, -32(%rsp,%rax,8) | ||
31 | decl %eax | ||
32 | jns 1b | ||
33 | 29 | ||
34 | movl 80(%rdi), %eax # a = ctx->hash[0] | 30 | movl 80(%rdi), %eax # a = ctx->hash[0] |
35 | movl 84(%rdi), %ebx # b = ctx->hash[1] | 31 | movl 84(%rdi), %ebx # b = ctx->hash[1] |
@@ -37,587 +33,760 @@ sha1_process_block64: | |||
37 | movl 92(%rdi), %edx # d = ctx->hash[3] | 33 | movl 92(%rdi), %edx # d = ctx->hash[3] |
38 | movl 96(%rdi), %ebp # e = ctx->hash[4] | 34 | movl 96(%rdi), %ebp # e = ctx->hash[4] |
39 | 35 | ||
40 | movq 4*8(%rdi), %r8 | 36 | movaps sha1const(%rip), %xmm7 |
41 | movq 4*10(%rdi), %r10 | 37 | pshufd $0x00, %xmm7, %xmm6 |
38 | |||
39 | # Load W[] to xmm registers, byteswapping on the fly. | ||
40 | # | ||
41 | # For iterations 0..15, we pass W[] in rsi,r8..r14 | ||
42 | # for use in RD1As instead of spilling them to stack. | ||
43 | # We lose parallelized addition of RCONST, but LEA | ||
44 | # can do two additions at once, so it is probably a wash. | ||
45 | # (We use rsi instead of rN because this makes two | ||
46 | # LEAs in two first RD1As shorter by one byte). | ||
47 | movq 4*0(%rdi), %rsi | ||
48 | movq 4*2(%rdi), %r8 | ||
49 | bswapq %rsi | ||
42 | bswapq %r8 | 50 | bswapq %r8 |
51 | rolq $32, %rsi # rsi = W[1]:W[0] | ||
52 | rolq $32, %r8 # r8 = W[3]:W[2] | ||
53 | movq %rsi, %xmm0 | ||
54 | movq %r8, %xmm4 | ||
55 | punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) | ||
56 | # movaps %xmm0, %xmm4 # add RCONST, spill to stack | ||
57 | # paddd %xmm6, %xmm4 | ||
58 | # movups %xmm4, -64+16*0(%rsp) | ||
59 | |||
60 | movq 4*4(%rdi), %r9 | ||
61 | movq 4*6(%rdi), %r10 | ||
62 | bswapq %r9 | ||
43 | bswapq %r10 | 63 | bswapq %r10 |
44 | movq 4*12(%rdi), %r12 | 64 | rolq $32, %r9 # r9 = W[5]:W[4] |
45 | movq 4*14(%rdi), %r14 | 65 | rolq $32, %r10 # r10 = W[7]:W[6] |
66 | movq %r9, %xmm1 | ||
67 | movq %r10, %xmm4 | ||
68 | punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) | ||
69 | |||
70 | movq 4*8(%rdi), %r11 | ||
71 | movq 4*10(%rdi), %r12 | ||
72 | bswapq %r11 | ||
46 | bswapq %r12 | 73 | bswapq %r12 |
74 | rolq $32, %r11 # r11 = W[9]:W[8] | ||
75 | rolq $32, %r12 # r12 = W[11]:W[10] | ||
76 | movq %r11, %xmm2 | ||
77 | movq %r12, %xmm4 | ||
78 | punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) | ||
79 | |||
80 | movq 4*12(%rdi), %r13 | ||
81 | movq 4*14(%rdi), %r14 | ||
82 | bswapq %r13 | ||
47 | bswapq %r14 | 83 | bswapq %r14 |
48 | movl %r8d, %r9d | 84 | rolq $32, %r13 # r13 = W[13]:W[12] |
49 | shrq $32, %r8 | 85 | rolq $32, %r14 # r14 = W[15]:W[14] |
50 | movl %r10d, %r11d | 86 | movq %r13, %xmm3 |
51 | shrq $32, %r10 | 87 | movq %r14, %xmm4 |
52 | movl %r12d, %r13d | 88 | punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) |
53 | shrq $32, %r12 | ||
54 | movl %r14d, %r15d | ||
55 | shrq $32, %r14 | ||
56 | 89 | ||
57 | # 0 | 90 | # 0 |
58 | # W[0], already in %esi | 91 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] |
92 | shrq $32, %rsi | ||
59 | movl %ecx, %edi # c | 93 | movl %ecx, %edi # c |
60 | xorl %edx, %edi # ^d | 94 | xorl %edx, %edi # ^d |
61 | andl %ebx, %edi # &b | 95 | andl %ebx, %edi # &b |
62 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 96 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
63 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] | ||
64 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 97 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
65 | movl %eax, %esi # | 98 | movl %eax, %edi # |
66 | roll $5, %esi # rotl32(a,5) | 99 | roll $5, %edi # rotl32(a,5) |
67 | addl %esi, %ebp # e += rotl32(a,5) | 100 | addl %edi, %ebp # e += rotl32(a,5) |
68 | rorl $2, %ebx # b = rotl32(b,30) | 101 | rorl $2, %ebx # b = rotl32(b,30) |
69 | # 1 | 102 | # 1 |
70 | movl -32+4*1(%rsp), %esi # W[n] | 103 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] |
71 | movl %ebx, %edi # c | 104 | movl %ebx, %edi # c |
72 | xorl %ecx, %edi # ^d | 105 | xorl %ecx, %edi # ^d |
73 | andl %eax, %edi # &b | 106 | andl %eax, %edi # &b |
74 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 107 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
75 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] | ||
76 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 108 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
77 | movl %ebp, %esi # | 109 | movl %ebp, %edi # |
78 | roll $5, %esi # rotl32(a,5) | 110 | roll $5, %edi # rotl32(a,5) |
79 | addl %esi, %edx # e += rotl32(a,5) | 111 | addl %edi, %edx # e += rotl32(a,5) |
80 | rorl $2, %eax # b = rotl32(b,30) | 112 | rorl $2, %eax # b = rotl32(b,30) |
81 | # 2 | 113 | # 2 |
82 | movl -32+4*2(%rsp), %esi # W[n] | 114 | leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n] |
115 | shrq $32, %r8 | ||
83 | movl %eax, %edi # c | 116 | movl %eax, %edi # c |
84 | xorl %ebx, %edi # ^d | 117 | xorl %ebx, %edi # ^d |
85 | andl %ebp, %edi # &b | 118 | andl %ebp, %edi # &b |
86 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 119 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
87 | leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] | ||
88 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 120 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
89 | movl %edx, %esi # | 121 | movl %edx, %edi # |
90 | roll $5, %esi # rotl32(a,5) | 122 | roll $5, %edi # rotl32(a,5) |
91 | addl %esi, %ecx # e += rotl32(a,5) | 123 | addl %edi, %ecx # e += rotl32(a,5) |
92 | rorl $2, %ebp # b = rotl32(b,30) | 124 | rorl $2, %ebp # b = rotl32(b,30) |
93 | # 3 | 125 | # 3 |
94 | movl -32+4*3(%rsp), %esi # W[n] | 126 | leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] |
95 | movl %ebp, %edi # c | 127 | movl %ebp, %edi # c |
96 | xorl %eax, %edi # ^d | 128 | xorl %eax, %edi # ^d |
97 | andl %edx, %edi # &b | 129 | andl %edx, %edi # &b |
98 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 130 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
99 | leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n] | ||
100 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 131 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
101 | movl %ecx, %esi # | 132 | movl %ecx, %edi # |
102 | roll $5, %esi # rotl32(a,5) | 133 | roll $5, %edi # rotl32(a,5) |
103 | addl %esi, %ebx # e += rotl32(a,5) | 134 | addl %edi, %ebx # e += rotl32(a,5) |
104 | rorl $2, %edx # b = rotl32(b,30) | 135 | rorl $2, %edx # b = rotl32(b,30) |
105 | # 4 | 136 | # 4 |
106 | movl -32+4*4(%rsp), %esi # W[n] | 137 | leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] |
138 | shrq $32, %r9 | ||
107 | movl %edx, %edi # c | 139 | movl %edx, %edi # c |
108 | xorl %ebp, %edi # ^d | 140 | xorl %ebp, %edi # ^d |
109 | andl %ecx, %edi # &b | 141 | andl %ecx, %edi # &b |
110 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 142 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
111 | leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n] | ||
112 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 143 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
113 | movl %ebx, %esi # | 144 | movl %ebx, %edi # |
114 | roll $5, %esi # rotl32(a,5) | 145 | roll $5, %edi # rotl32(a,5) |
115 | addl %esi, %eax # e += rotl32(a,5) | 146 | addl %edi, %eax # e += rotl32(a,5) |
116 | rorl $2, %ecx # b = rotl32(b,30) | 147 | rorl $2, %ecx # b = rotl32(b,30) |
117 | # 5 | 148 | # 5 |
118 | movl -32+4*5(%rsp), %esi # W[n] | 149 | leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n] |
119 | movl %ecx, %edi # c | 150 | movl %ecx, %edi # c |
120 | xorl %edx, %edi # ^d | 151 | xorl %edx, %edi # ^d |
121 | andl %ebx, %edi # &b | 152 | andl %ebx, %edi # &b |
122 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 153 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
123 | leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n] | ||
124 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 154 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
125 | movl %eax, %esi # | 155 | movl %eax, %edi # |
126 | roll $5, %esi # rotl32(a,5) | 156 | roll $5, %edi # rotl32(a,5) |
127 | addl %esi, %ebp # e += rotl32(a,5) | 157 | addl %edi, %ebp # e += rotl32(a,5) |
128 | rorl $2, %ebx # b = rotl32(b,30) | 158 | rorl $2, %ebx # b = rotl32(b,30) |
129 | # 6 | 159 | # 6 |
130 | movl -32+4*6(%rsp), %esi # W[n] | 160 | leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n] |
161 | shrq $32, %r10 | ||
131 | movl %ebx, %edi # c | 162 | movl %ebx, %edi # c |
132 | xorl %ecx, %edi # ^d | 163 | xorl %ecx, %edi # ^d |
133 | andl %eax, %edi # &b | 164 | andl %eax, %edi # &b |
134 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 165 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
135 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n] | ||
136 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 166 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
137 | movl %ebp, %esi # | 167 | movl %ebp, %edi # |
138 | roll $5, %esi # rotl32(a,5) | 168 | roll $5, %edi # rotl32(a,5) |
139 | addl %esi, %edx # e += rotl32(a,5) | 169 | addl %edi, %edx # e += rotl32(a,5) |
140 | rorl $2, %eax # b = rotl32(b,30) | 170 | rorl $2, %eax # b = rotl32(b,30) |
141 | # 7 | 171 | # 7 |
142 | movl -32+4*7(%rsp), %esi # W[n] | 172 | leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n] |
143 | movl %eax, %edi # c | 173 | movl %eax, %edi # c |
144 | xorl %ebx, %edi # ^d | 174 | xorl %ebx, %edi # ^d |
145 | andl %ebp, %edi # &b | 175 | andl %ebp, %edi # &b |
146 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 176 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
147 | leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n] | ||
148 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 177 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
149 | movl %edx, %esi # | 178 | movl %edx, %edi # |
150 | roll $5, %esi # rotl32(a,5) | 179 | roll $5, %edi # rotl32(a,5) |
151 | addl %esi, %ecx # e += rotl32(a,5) | 180 | addl %edi, %ecx # e += rotl32(a,5) |
152 | rorl $2, %ebp # b = rotl32(b,30) | 181 | rorl $2, %ebp # b = rotl32(b,30) |
182 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
183 | movaps %xmm3, %xmm4 | ||
184 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
185 | # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
186 | # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
187 | # same result as above, but shorter and faster: | ||
188 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
189 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
190 | movaps %xmm0, %xmm5 | ||
191 | shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
192 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
193 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
194 | xorps %xmm5, %xmm0 # ^ | ||
195 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
196 | movaps %xmm0, %xmm5 | ||
197 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
198 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
199 | paddd %xmm0, %xmm0 # shift left by 1 | ||
200 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
201 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
202 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
203 | movaps %xmm5, %xmm4 | ||
204 | pslld $2, %xmm5 | ||
205 | psrld $30, %xmm4 | ||
206 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
207 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
208 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
209 | movaps %xmm0, %xmm5 | ||
210 | paddd %xmm6, %xmm5 | ||
211 | movups %xmm5, -64+16*0(%rsp) | ||
153 | # 8 | 212 | # 8 |
154 | # W[n], in %r8 | 213 | leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n] |
214 | shrq $32, %r11 | ||
155 | movl %ebp, %edi # c | 215 | movl %ebp, %edi # c |
156 | xorl %eax, %edi # ^d | 216 | xorl %eax, %edi # ^d |
157 | andl %edx, %edi # &b | 217 | andl %edx, %edi # &b |
158 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 218 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
159 | leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n] | ||
160 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 219 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
161 | movl %ecx, %esi # | 220 | movl %ecx, %edi # |
162 | roll $5, %esi # rotl32(a,5) | 221 | roll $5, %edi # rotl32(a,5) |
163 | addl %esi, %ebx # e += rotl32(a,5) | 222 | addl %edi, %ebx # e += rotl32(a,5) |
164 | rorl $2, %edx # b = rotl32(b,30) | 223 | rorl $2, %edx # b = rotl32(b,30) |
165 | # 9 | 224 | # 9 |
166 | # W[n], in %r9 | 225 | leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n] |
167 | movl %edx, %edi # c | 226 | movl %edx, %edi # c |
168 | xorl %ebp, %edi # ^d | 227 | xorl %ebp, %edi # ^d |
169 | andl %ecx, %edi # &b | 228 | andl %ecx, %edi # &b |
170 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 229 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
171 | leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n] | ||
172 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 230 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
173 | movl %ebx, %esi # | 231 | movl %ebx, %edi # |
174 | roll $5, %esi # rotl32(a,5) | 232 | roll $5, %edi # rotl32(a,5) |
175 | addl %esi, %eax # e += rotl32(a,5) | 233 | addl %edi, %eax # e += rotl32(a,5) |
176 | rorl $2, %ecx # b = rotl32(b,30) | 234 | rorl $2, %ecx # b = rotl32(b,30) |
177 | # 10 | 235 | # 10 |
178 | # W[n], in %r10 | 236 | leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n] |
237 | shrq $32, %r12 | ||
179 | movl %ecx, %edi # c | 238 | movl %ecx, %edi # c |
180 | xorl %edx, %edi # ^d | 239 | xorl %edx, %edi # ^d |
181 | andl %ebx, %edi # &b | 240 | andl %ebx, %edi # &b |
182 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 241 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
183 | leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n] | ||
184 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 242 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
185 | movl %eax, %esi # | 243 | movl %eax, %edi # |
186 | roll $5, %esi # rotl32(a,5) | 244 | roll $5, %edi # rotl32(a,5) |
187 | addl %esi, %ebp # e += rotl32(a,5) | 245 | addl %edi, %ebp # e += rotl32(a,5) |
188 | rorl $2, %ebx # b = rotl32(b,30) | 246 | rorl $2, %ebx # b = rotl32(b,30) |
189 | # 11 | 247 | # 11 |
190 | # W[n], in %r11 | 248 | leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n] |
191 | movl %ebx, %edi # c | 249 | movl %ebx, %edi # c |
192 | xorl %ecx, %edi # ^d | 250 | xorl %ecx, %edi # ^d |
193 | andl %eax, %edi # &b | 251 | andl %eax, %edi # &b |
194 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 252 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
195 | leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n] | ||
196 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 253 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
197 | movl %ebp, %esi # | 254 | movl %ebp, %edi # |
198 | roll $5, %esi # rotl32(a,5) | 255 | roll $5, %edi # rotl32(a,5) |
199 | addl %esi, %edx # e += rotl32(a,5) | 256 | addl %edi, %edx # e += rotl32(a,5) |
200 | rorl $2, %eax # b = rotl32(b,30) | 257 | rorl $2, %eax # b = rotl32(b,30) |
258 | pshufd $0x55, %xmm7, %xmm6 | ||
259 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
260 | movaps %xmm0, %xmm4 | ||
261 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
262 | # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
263 | # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
264 | # same result as above, but shorter and faster: | ||
265 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
266 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
267 | movaps %xmm1, %xmm5 | ||
268 | shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
269 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
270 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
271 | xorps %xmm5, %xmm1 # ^ | ||
272 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
273 | movaps %xmm1, %xmm5 | ||
274 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
275 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
276 | paddd %xmm1, %xmm1 # shift left by 1 | ||
277 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
278 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
279 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
280 | movaps %xmm5, %xmm4 | ||
281 | pslld $2, %xmm5 | ||
282 | psrld $30, %xmm4 | ||
283 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
284 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
285 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
286 | movaps %xmm1, %xmm5 | ||
287 | paddd %xmm6, %xmm5 | ||
288 | movups %xmm5, -64+16*1(%rsp) | ||
201 | # 12 | 289 | # 12 |
202 | # W[n], in %r12 | 290 | leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n] |
291 | shrq $32, %r13 | ||
203 | movl %eax, %edi # c | 292 | movl %eax, %edi # c |
204 | xorl %ebx, %edi # ^d | 293 | xorl %ebx, %edi # ^d |
205 | andl %ebp, %edi # &b | 294 | andl %ebp, %edi # &b |
206 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 295 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
207 | leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n] | ||
208 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 296 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
209 | movl %edx, %esi # | 297 | movl %edx, %edi # |
210 | roll $5, %esi # rotl32(a,5) | 298 | roll $5, %edi # rotl32(a,5) |
211 | addl %esi, %ecx # e += rotl32(a,5) | 299 | addl %edi, %ecx # e += rotl32(a,5) |
212 | rorl $2, %ebp # b = rotl32(b,30) | 300 | rorl $2, %ebp # b = rotl32(b,30) |
213 | # 13 | 301 | # 13 |
214 | # W[n], in %r13 | 302 | leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] |
215 | movl %ebp, %edi # c | 303 | movl %ebp, %edi # c |
216 | xorl %eax, %edi # ^d | 304 | xorl %eax, %edi # ^d |
217 | andl %edx, %edi # &b | 305 | andl %edx, %edi # &b |
218 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 306 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
219 | leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n] | ||
220 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 307 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
221 | movl %ecx, %esi # | 308 | movl %ecx, %edi # |
222 | roll $5, %esi # rotl32(a,5) | 309 | roll $5, %edi # rotl32(a,5) |
223 | addl %esi, %ebx # e += rotl32(a,5) | 310 | addl %edi, %ebx # e += rotl32(a,5) |
224 | rorl $2, %edx # b = rotl32(b,30) | 311 | rorl $2, %edx # b = rotl32(b,30) |
225 | # 14 | 312 | # 14 |
226 | # W[n], in %r14 | 313 | leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] |
314 | shrq $32, %r14 | ||
227 | movl %edx, %edi # c | 315 | movl %edx, %edi # c |
228 | xorl %ebp, %edi # ^d | 316 | xorl %ebp, %edi # ^d |
229 | andl %ecx, %edi # &b | 317 | andl %ecx, %edi # &b |
230 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 318 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
231 | leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n] | ||
232 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 319 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
233 | movl %ebx, %esi # | 320 | movl %ebx, %edi # |
234 | roll $5, %esi # rotl32(a,5) | 321 | roll $5, %edi # rotl32(a,5) |
235 | addl %esi, %eax # e += rotl32(a,5) | 322 | addl %edi, %eax # e += rotl32(a,5) |
236 | rorl $2, %ecx # b = rotl32(b,30) | 323 | rorl $2, %ecx # b = rotl32(b,30) |
237 | # 15 | 324 | # 15 |
238 | # W[n], in %r15 | 325 | leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n] |
239 | movl %ecx, %edi # c | 326 | movl %ecx, %edi # c |
240 | xorl %edx, %edi # ^d | 327 | xorl %edx, %edi # ^d |
241 | andl %ebx, %edi # &b | 328 | andl %ebx, %edi # &b |
242 | xorl %edx, %edi # (((c ^ d) & b) ^ d) | 329 | xorl %edx, %edi # (((c ^ d) & b) ^ d) |
243 | leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n] | ||
244 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) | 330 | addl %edi, %ebp # e += (((c ^ d) & b) ^ d) |
245 | movl %eax, %esi # | 331 | movl %eax, %edi # |
246 | roll $5, %esi # rotl32(a,5) | 332 | roll $5, %edi # rotl32(a,5) |
247 | addl %esi, %ebp # e += rotl32(a,5) | 333 | addl %edi, %ebp # e += rotl32(a,5) |
248 | rorl $2, %ebx # b = rotl32(b,30) | 334 | rorl $2, %ebx # b = rotl32(b,30) |
335 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
336 | movaps %xmm1, %xmm4 | ||
337 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
338 | # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
339 | # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
340 | # same result as above, but shorter and faster: | ||
341 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
342 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
343 | movaps %xmm2, %xmm5 | ||
344 | shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
345 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
346 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
347 | xorps %xmm5, %xmm2 # ^ | ||
348 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
349 | movaps %xmm2, %xmm5 | ||
350 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
351 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
352 | paddd %xmm2, %xmm2 # shift left by 1 | ||
353 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
354 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
355 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
356 | movaps %xmm5, %xmm4 | ||
357 | pslld $2, %xmm5 | ||
358 | psrld $30, %xmm4 | ||
359 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
360 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
361 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
362 | movaps %xmm2, %xmm5 | ||
363 | paddd %xmm6, %xmm5 | ||
364 | movups %xmm5, -64+16*2(%rsp) | ||
249 | # 16 | 365 | # 16 |
250 | movl %r13d, %esi # W[(n+13) & 15] | ||
251 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
252 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
253 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
254 | roll %esi # | ||
255 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
256 | movl %ebx, %edi # c | 366 | movl %ebx, %edi # c |
257 | xorl %ecx, %edi # ^d | 367 | xorl %ecx, %edi # ^d |
258 | andl %eax, %edi # &b | 368 | andl %eax, %edi # &b |
259 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) | 369 | xorl %ecx, %edi # (((c ^ d) & b) ^ d) |
260 | leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 370 | addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15] |
261 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) | 371 | addl %edi, %edx # e += (((c ^ d) & b) ^ d) |
262 | movl %ebp, %esi # | 372 | movl %ebp, %esi # |
263 | roll $5, %esi # rotl32(a,5) | 373 | roll $5, %esi # rotl32(a,5) |
264 | addl %esi, %edx # e += rotl32(a,5) | 374 | addl %esi, %edx # e += rotl32(a,5) |
265 | rorl $2, %eax # b = rotl32(b,30) | 375 | rorl $2, %eax # b = rotl32(b,30) |
266 | # 17 | 376 | # 17 |
267 | movl %r14d, %esi # W[(n+13) & 15] | ||
268 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
269 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
270 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
271 | roll %esi # | ||
272 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
273 | movl %eax, %edi # c | 377 | movl %eax, %edi # c |
274 | xorl %ebx, %edi # ^d | 378 | xorl %ebx, %edi # ^d |
275 | andl %ebp, %edi # &b | 379 | andl %ebp, %edi # &b |
276 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) | 380 | xorl %ebx, %edi # (((c ^ d) & b) ^ d) |
277 | leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 381 | addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15] |
278 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) | 382 | addl %edi, %ecx # e += (((c ^ d) & b) ^ d) |
279 | movl %edx, %esi # | 383 | movl %edx, %esi # |
280 | roll $5, %esi # rotl32(a,5) | 384 | roll $5, %esi # rotl32(a,5) |
281 | addl %esi, %ecx # e += rotl32(a,5) | 385 | addl %esi, %ecx # e += rotl32(a,5) |
282 | rorl $2, %ebp # b = rotl32(b,30) | 386 | rorl $2, %ebp # b = rotl32(b,30) |
283 | # 18 | 387 | # 18 |
284 | movl %r15d, %esi # W[(n+13) & 15] | ||
285 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
286 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
287 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
288 | roll %esi # | ||
289 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
290 | movl %ebp, %edi # c | 388 | movl %ebp, %edi # c |
291 | xorl %eax, %edi # ^d | 389 | xorl %eax, %edi # ^d |
292 | andl %edx, %edi # &b | 390 | andl %edx, %edi # &b |
293 | xorl %eax, %edi # (((c ^ d) & b) ^ d) | 391 | xorl %eax, %edi # (((c ^ d) & b) ^ d) |
294 | leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 392 | addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15] |
295 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) | 393 | addl %edi, %ebx # e += (((c ^ d) & b) ^ d) |
296 | movl %ecx, %esi # | 394 | movl %ecx, %esi # |
297 | roll $5, %esi # rotl32(a,5) | 395 | roll $5, %esi # rotl32(a,5) |
298 | addl %esi, %ebx # e += rotl32(a,5) | 396 | addl %esi, %ebx # e += rotl32(a,5) |
299 | rorl $2, %edx # b = rotl32(b,30) | 397 | rorl $2, %edx # b = rotl32(b,30) |
300 | # 19 | 398 | # 19 |
301 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
302 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
303 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
304 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
305 | roll %esi # | ||
306 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
307 | movl %edx, %edi # c | 399 | movl %edx, %edi # c |
308 | xorl %ebp, %edi # ^d | 400 | xorl %ebp, %edi # ^d |
309 | andl %ecx, %edi # &b | 401 | andl %ecx, %edi # &b |
310 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) | 402 | xorl %ebp, %edi # (((c ^ d) & b) ^ d) |
311 | leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 403 | addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15] |
312 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) | 404 | addl %edi, %eax # e += (((c ^ d) & b) ^ d) |
313 | movl %ebx, %esi # | 405 | movl %ebx, %esi # |
314 | roll $5, %esi # rotl32(a,5) | 406 | roll $5, %esi # rotl32(a,5) |
315 | addl %esi, %eax # e += rotl32(a,5) | 407 | addl %esi, %eax # e += rotl32(a,5) |
316 | rorl $2, %ecx # b = rotl32(b,30) | 408 | rorl $2, %ecx # b = rotl32(b,30) |
409 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
410 | movaps %xmm2, %xmm4 | ||
411 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
412 | # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
413 | # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
414 | # same result as above, but shorter and faster: | ||
415 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
416 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
417 | movaps %xmm3, %xmm5 | ||
418 | shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
419 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
420 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
421 | xorps %xmm5, %xmm3 # ^ | ||
422 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
423 | movaps %xmm3, %xmm5 | ||
424 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
425 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
426 | paddd %xmm3, %xmm3 # shift left by 1 | ||
427 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
428 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
429 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
430 | movaps %xmm5, %xmm4 | ||
431 | pslld $2, %xmm5 | ||
432 | psrld $30, %xmm4 | ||
433 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
434 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
435 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
436 | movaps %xmm3, %xmm5 | ||
437 | paddd %xmm6, %xmm5 | ||
438 | movups %xmm5, -64+16*3(%rsp) | ||
317 | # 20 | 439 | # 20 |
318 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
319 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
320 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
321 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
322 | roll %esi # | ||
323 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
324 | movl %ecx, %edi # c | 440 | movl %ecx, %edi # c |
325 | xorl %edx, %edi # ^d | 441 | xorl %edx, %edi # ^d |
326 | xorl %ebx, %edi # ^b | 442 | xorl %ebx, %edi # ^b |
327 | leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 443 | addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15] |
328 | addl %edi, %ebp # e += (c ^ d ^ b) | 444 | addl %edi, %ebp # e += (c ^ d ^ b) |
329 | movl %eax, %esi # | 445 | movl %eax, %esi # |
330 | roll $5, %esi # rotl32(a,5) | 446 | roll $5, %esi # rotl32(a,5) |
331 | addl %esi, %ebp # e += rotl32(a,5) | 447 | addl %esi, %ebp # e += rotl32(a,5) |
332 | rorl $2, %ebx # b = rotl32(b,30) | 448 | rorl $2, %ebx # b = rotl32(b,30) |
333 | # 21 | 449 | # 21 |
334 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
335 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
336 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
337 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
338 | roll %esi # | ||
339 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
340 | movl %ebx, %edi # c | 450 | movl %ebx, %edi # c |
341 | xorl %ecx, %edi # ^d | 451 | xorl %ecx, %edi # ^d |
342 | xorl %eax, %edi # ^b | 452 | xorl %eax, %edi # ^b |
343 | leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 453 | addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15] |
344 | addl %edi, %edx # e += (c ^ d ^ b) | 454 | addl %edi, %edx # e += (c ^ d ^ b) |
345 | movl %ebp, %esi # | 455 | movl %ebp, %esi # |
346 | roll $5, %esi # rotl32(a,5) | 456 | roll $5, %esi # rotl32(a,5) |
347 | addl %esi, %edx # e += rotl32(a,5) | 457 | addl %esi, %edx # e += rotl32(a,5) |
348 | rorl $2, %eax # b = rotl32(b,30) | 458 | rorl $2, %eax # b = rotl32(b,30) |
349 | # 22 | 459 | # 22 |
350 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
351 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
352 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
353 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
354 | roll %esi # | ||
355 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
356 | movl %eax, %edi # c | 460 | movl %eax, %edi # c |
357 | xorl %ebx, %edi # ^d | 461 | xorl %ebx, %edi # ^d |
358 | xorl %ebp, %edi # ^b | 462 | xorl %ebp, %edi # ^b |
359 | leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 463 | addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15] |
360 | addl %edi, %ecx # e += (c ^ d ^ b) | 464 | addl %edi, %ecx # e += (c ^ d ^ b) |
361 | movl %edx, %esi # | 465 | movl %edx, %esi # |
362 | roll $5, %esi # rotl32(a,5) | 466 | roll $5, %esi # rotl32(a,5) |
363 | addl %esi, %ecx # e += rotl32(a,5) | 467 | addl %esi, %ecx # e += rotl32(a,5) |
364 | rorl $2, %ebp # b = rotl32(b,30) | 468 | rorl $2, %ebp # b = rotl32(b,30) |
365 | # 23 | 469 | # 23 |
366 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
367 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
368 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
369 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
370 | roll %esi # | ||
371 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
372 | movl %ebp, %edi # c | 470 | movl %ebp, %edi # c |
373 | xorl %eax, %edi # ^d | 471 | xorl %eax, %edi # ^d |
374 | xorl %edx, %edi # ^b | 472 | xorl %edx, %edi # ^b |
375 | leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 473 | addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15] |
376 | addl %edi, %ebx # e += (c ^ d ^ b) | 474 | addl %edi, %ebx # e += (c ^ d ^ b) |
377 | movl %ecx, %esi # | 475 | movl %ecx, %esi # |
378 | roll $5, %esi # rotl32(a,5) | 476 | roll $5, %esi # rotl32(a,5) |
379 | addl %esi, %ebx # e += rotl32(a,5) | 477 | addl %esi, %ebx # e += rotl32(a,5) |
380 | rorl $2, %edx # b = rotl32(b,30) | 478 | rorl $2, %edx # b = rotl32(b,30) |
479 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
480 | movaps %xmm3, %xmm4 | ||
481 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
482 | # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
483 | # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
484 | # same result as above, but shorter and faster: | ||
485 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
486 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
487 | movaps %xmm0, %xmm5 | ||
488 | shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
489 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
490 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
491 | xorps %xmm5, %xmm0 # ^ | ||
492 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
493 | movaps %xmm0, %xmm5 | ||
494 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
495 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
496 | paddd %xmm0, %xmm0 # shift left by 1 | ||
497 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
498 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
499 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
500 | movaps %xmm5, %xmm4 | ||
501 | pslld $2, %xmm5 | ||
502 | psrld $30, %xmm4 | ||
503 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
504 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
505 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
506 | movaps %xmm0, %xmm5 | ||
507 | paddd %xmm6, %xmm5 | ||
508 | movups %xmm5, -64+16*0(%rsp) | ||
381 | # 24 | 509 | # 24 |
382 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
383 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
384 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
385 | roll %r8d # | ||
386 | movl %edx, %edi # c | 510 | movl %edx, %edi # c |
387 | xorl %ebp, %edi # ^d | 511 | xorl %ebp, %edi # ^d |
388 | xorl %ecx, %edi # ^b | 512 | xorl %ecx, %edi # ^b |
389 | leal 0x6ED9EBA1(%rax,%r8), %eax # e += RCONST + W[n & 15] | 513 | addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15] |
390 | addl %edi, %eax # e += (c ^ d ^ b) | 514 | addl %edi, %eax # e += (c ^ d ^ b) |
391 | movl %ebx, %esi # | 515 | movl %ebx, %esi # |
392 | roll $5, %esi # rotl32(a,5) | 516 | roll $5, %esi # rotl32(a,5) |
393 | addl %esi, %eax # e += rotl32(a,5) | 517 | addl %esi, %eax # e += rotl32(a,5) |
394 | rorl $2, %ecx # b = rotl32(b,30) | 518 | rorl $2, %ecx # b = rotl32(b,30) |
395 | # 25 | 519 | # 25 |
396 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
397 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
398 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
399 | roll %r9d # | ||
400 | movl %ecx, %edi # c | 520 | movl %ecx, %edi # c |
401 | xorl %edx, %edi # ^d | 521 | xorl %edx, %edi # ^d |
402 | xorl %ebx, %edi # ^b | 522 | xorl %ebx, %edi # ^b |
403 | leal 0x6ED9EBA1(%rbp,%r9), %ebp # e += RCONST + W[n & 15] | 523 | addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15] |
404 | addl %edi, %ebp # e += (c ^ d ^ b) | 524 | addl %edi, %ebp # e += (c ^ d ^ b) |
405 | movl %eax, %esi # | 525 | movl %eax, %esi # |
406 | roll $5, %esi # rotl32(a,5) | 526 | roll $5, %esi # rotl32(a,5) |
407 | addl %esi, %ebp # e += rotl32(a,5) | 527 | addl %esi, %ebp # e += rotl32(a,5) |
408 | rorl $2, %ebx # b = rotl32(b,30) | 528 | rorl $2, %ebx # b = rotl32(b,30) |
409 | # 26 | 529 | # 26 |
410 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
411 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
412 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
413 | roll %r10d # | ||
414 | movl %ebx, %edi # c | 530 | movl %ebx, %edi # c |
415 | xorl %ecx, %edi # ^d | 531 | xorl %ecx, %edi # ^d |
416 | xorl %eax, %edi # ^b | 532 | xorl %eax, %edi # ^b |
417 | leal 0x6ED9EBA1(%rdx,%r10), %edx # e += RCONST + W[n & 15] | 533 | addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15] |
418 | addl %edi, %edx # e += (c ^ d ^ b) | 534 | addl %edi, %edx # e += (c ^ d ^ b) |
419 | movl %ebp, %esi # | 535 | movl %ebp, %esi # |
420 | roll $5, %esi # rotl32(a,5) | 536 | roll $5, %esi # rotl32(a,5) |
421 | addl %esi, %edx # e += rotl32(a,5) | 537 | addl %esi, %edx # e += rotl32(a,5) |
422 | rorl $2, %eax # b = rotl32(b,30) | 538 | rorl $2, %eax # b = rotl32(b,30) |
423 | # 27 | 539 | # 27 |
424 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
425 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
426 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
427 | roll %r11d # | ||
428 | movl %eax, %edi # c | 540 | movl %eax, %edi # c |
429 | xorl %ebx, %edi # ^d | 541 | xorl %ebx, %edi # ^d |
430 | xorl %ebp, %edi # ^b | 542 | xorl %ebp, %edi # ^b |
431 | leal 0x6ED9EBA1(%rcx,%r11), %ecx # e += RCONST + W[n & 15] | 543 | addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15] |
432 | addl %edi, %ecx # e += (c ^ d ^ b) | 544 | addl %edi, %ecx # e += (c ^ d ^ b) |
433 | movl %edx, %esi # | 545 | movl %edx, %esi # |
434 | roll $5, %esi # rotl32(a,5) | 546 | roll $5, %esi # rotl32(a,5) |
435 | addl %esi, %ecx # e += rotl32(a,5) | 547 | addl %esi, %ecx # e += rotl32(a,5) |
436 | rorl $2, %ebp # b = rotl32(b,30) | 548 | rorl $2, %ebp # b = rotl32(b,30) |
549 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
550 | movaps %xmm0, %xmm4 | ||
551 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
552 | # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
553 | # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
554 | # same result as above, but shorter and faster: | ||
555 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
556 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
557 | movaps %xmm1, %xmm5 | ||
558 | shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
559 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
560 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
561 | xorps %xmm5, %xmm1 # ^ | ||
562 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
563 | movaps %xmm1, %xmm5 | ||
564 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
565 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
566 | paddd %xmm1, %xmm1 # shift left by 1 | ||
567 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
568 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
569 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
570 | movaps %xmm5, %xmm4 | ||
571 | pslld $2, %xmm5 | ||
572 | psrld $30, %xmm4 | ||
573 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
574 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
575 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
576 | movaps %xmm1, %xmm5 | ||
577 | paddd %xmm6, %xmm5 | ||
578 | movups %xmm5, -64+16*1(%rsp) | ||
437 | # 28 | 579 | # 28 |
438 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
439 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
440 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
441 | roll %r12d # | ||
442 | movl %ebp, %edi # c | 580 | movl %ebp, %edi # c |
443 | xorl %eax, %edi # ^d | 581 | xorl %eax, %edi # ^d |
444 | xorl %edx, %edi # ^b | 582 | xorl %edx, %edi # ^b |
445 | leal 0x6ED9EBA1(%rbx,%r12), %ebx # e += RCONST + W[n & 15] | 583 | addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15] |
446 | addl %edi, %ebx # e += (c ^ d ^ b) | 584 | addl %edi, %ebx # e += (c ^ d ^ b) |
447 | movl %ecx, %esi # | 585 | movl %ecx, %esi # |
448 | roll $5, %esi # rotl32(a,5) | 586 | roll $5, %esi # rotl32(a,5) |
449 | addl %esi, %ebx # e += rotl32(a,5) | 587 | addl %esi, %ebx # e += rotl32(a,5) |
450 | rorl $2, %edx # b = rotl32(b,30) | 588 | rorl $2, %edx # b = rotl32(b,30) |
451 | # 29 | 589 | # 29 |
452 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
453 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
454 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
455 | roll %r13d # | ||
456 | movl %edx, %edi # c | 590 | movl %edx, %edi # c |
457 | xorl %ebp, %edi # ^d | 591 | xorl %ebp, %edi # ^d |
458 | xorl %ecx, %edi # ^b | 592 | xorl %ecx, %edi # ^b |
459 | leal 0x6ED9EBA1(%rax,%r13), %eax # e += RCONST + W[n & 15] | 593 | addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15] |
460 | addl %edi, %eax # e += (c ^ d ^ b) | 594 | addl %edi, %eax # e += (c ^ d ^ b) |
461 | movl %ebx, %esi # | 595 | movl %ebx, %esi # |
462 | roll $5, %esi # rotl32(a,5) | 596 | roll $5, %esi # rotl32(a,5) |
463 | addl %esi, %eax # e += rotl32(a,5) | 597 | addl %esi, %eax # e += rotl32(a,5) |
464 | rorl $2, %ecx # b = rotl32(b,30) | 598 | rorl $2, %ecx # b = rotl32(b,30) |
465 | # 30 | 599 | # 30 |
466 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
467 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
468 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
469 | roll %r14d # | ||
470 | movl %ecx, %edi # c | 600 | movl %ecx, %edi # c |
471 | xorl %edx, %edi # ^d | 601 | xorl %edx, %edi # ^d |
472 | xorl %ebx, %edi # ^b | 602 | xorl %ebx, %edi # ^b |
473 | leal 0x6ED9EBA1(%rbp,%r14), %ebp # e += RCONST + W[n & 15] | 603 | addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15] |
474 | addl %edi, %ebp # e += (c ^ d ^ b) | 604 | addl %edi, %ebp # e += (c ^ d ^ b) |
475 | movl %eax, %esi # | 605 | movl %eax, %esi # |
476 | roll $5, %esi # rotl32(a,5) | 606 | roll $5, %esi # rotl32(a,5) |
477 | addl %esi, %ebp # e += rotl32(a,5) | 607 | addl %esi, %ebp # e += rotl32(a,5) |
478 | rorl $2, %ebx # b = rotl32(b,30) | 608 | rorl $2, %ebx # b = rotl32(b,30) |
479 | # 31 | 609 | # 31 |
480 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
481 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
482 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
483 | roll %r15d # | ||
484 | movl %ebx, %edi # c | 610 | movl %ebx, %edi # c |
485 | xorl %ecx, %edi # ^d | 611 | xorl %ecx, %edi # ^d |
486 | xorl %eax, %edi # ^b | 612 | xorl %eax, %edi # ^b |
487 | leal 0x6ED9EBA1(%rdx,%r15), %edx # e += RCONST + W[n & 15] | 613 | addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15] |
488 | addl %edi, %edx # e += (c ^ d ^ b) | 614 | addl %edi, %edx # e += (c ^ d ^ b) |
489 | movl %ebp, %esi # | 615 | movl %ebp, %esi # |
490 | roll $5, %esi # rotl32(a,5) | 616 | roll $5, %esi # rotl32(a,5) |
491 | addl %esi, %edx # e += rotl32(a,5) | 617 | addl %esi, %edx # e += rotl32(a,5) |
492 | rorl $2, %eax # b = rotl32(b,30) | 618 | rorl $2, %eax # b = rotl32(b,30) |
619 | pshufd $0xaa, %xmm7, %xmm6 | ||
620 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
621 | movaps %xmm1, %xmm4 | ||
622 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
623 | # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
624 | # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
625 | # same result as above, but shorter and faster: | ||
626 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
627 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
628 | movaps %xmm2, %xmm5 | ||
629 | shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
630 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
631 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
632 | xorps %xmm5, %xmm2 # ^ | ||
633 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
634 | movaps %xmm2, %xmm5 | ||
635 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
636 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
637 | paddd %xmm2, %xmm2 # shift left by 1 | ||
638 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
639 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
640 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
641 | movaps %xmm5, %xmm4 | ||
642 | pslld $2, %xmm5 | ||
643 | psrld $30, %xmm4 | ||
644 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
645 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
646 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
647 | movaps %xmm2, %xmm5 | ||
648 | paddd %xmm6, %xmm5 | ||
649 | movups %xmm5, -64+16*2(%rsp) | ||
493 | # 32 | 650 | # 32 |
494 | movl %r13d, %esi # W[(n+13) & 15] | ||
495 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
496 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
497 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
498 | roll %esi # | ||
499 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
500 | movl %eax, %edi # c | 651 | movl %eax, %edi # c |
501 | xorl %ebx, %edi # ^d | 652 | xorl %ebx, %edi # ^d |
502 | xorl %ebp, %edi # ^b | 653 | xorl %ebp, %edi # ^b |
503 | leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 654 | addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15] |
504 | addl %edi, %ecx # e += (c ^ d ^ b) | 655 | addl %edi, %ecx # e += (c ^ d ^ b) |
505 | movl %edx, %esi # | 656 | movl %edx, %esi # |
506 | roll $5, %esi # rotl32(a,5) | 657 | roll $5, %esi # rotl32(a,5) |
507 | addl %esi, %ecx # e += rotl32(a,5) | 658 | addl %esi, %ecx # e += rotl32(a,5) |
508 | rorl $2, %ebp # b = rotl32(b,30) | 659 | rorl $2, %ebp # b = rotl32(b,30) |
509 | # 33 | 660 | # 33 |
510 | movl %r14d, %esi # W[(n+13) & 15] | ||
511 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
512 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
513 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
514 | roll %esi # | ||
515 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
516 | movl %ebp, %edi # c | 661 | movl %ebp, %edi # c |
517 | xorl %eax, %edi # ^d | 662 | xorl %eax, %edi # ^d |
518 | xorl %edx, %edi # ^b | 663 | xorl %edx, %edi # ^b |
519 | leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 664 | addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15] |
520 | addl %edi, %ebx # e += (c ^ d ^ b) | 665 | addl %edi, %ebx # e += (c ^ d ^ b) |
521 | movl %ecx, %esi # | 666 | movl %ecx, %esi # |
522 | roll $5, %esi # rotl32(a,5) | 667 | roll $5, %esi # rotl32(a,5) |
523 | addl %esi, %ebx # e += rotl32(a,5) | 668 | addl %esi, %ebx # e += rotl32(a,5) |
524 | rorl $2, %edx # b = rotl32(b,30) | 669 | rorl $2, %edx # b = rotl32(b,30) |
525 | # 34 | 670 | # 34 |
526 | movl %r15d, %esi # W[(n+13) & 15] | ||
527 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
528 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
529 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
530 | roll %esi # | ||
531 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
532 | movl %edx, %edi # c | 671 | movl %edx, %edi # c |
533 | xorl %ebp, %edi # ^d | 672 | xorl %ebp, %edi # ^d |
534 | xorl %ecx, %edi # ^b | 673 | xorl %ecx, %edi # ^b |
535 | leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 674 | addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15] |
536 | addl %edi, %eax # e += (c ^ d ^ b) | 675 | addl %edi, %eax # e += (c ^ d ^ b) |
537 | movl %ebx, %esi # | 676 | movl %ebx, %esi # |
538 | roll $5, %esi # rotl32(a,5) | 677 | roll $5, %esi # rotl32(a,5) |
539 | addl %esi, %eax # e += rotl32(a,5) | 678 | addl %esi, %eax # e += rotl32(a,5) |
540 | rorl $2, %ecx # b = rotl32(b,30) | 679 | rorl $2, %ecx # b = rotl32(b,30) |
541 | # 35 | 680 | # 35 |
542 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
543 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
544 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
545 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
546 | roll %esi # | ||
547 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
548 | movl %ecx, %edi # c | 681 | movl %ecx, %edi # c |
549 | xorl %edx, %edi # ^d | 682 | xorl %edx, %edi # ^d |
550 | xorl %ebx, %edi # ^b | 683 | xorl %ebx, %edi # ^b |
551 | leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 684 | addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15] |
552 | addl %edi, %ebp # e += (c ^ d ^ b) | 685 | addl %edi, %ebp # e += (c ^ d ^ b) |
553 | movl %eax, %esi # | 686 | movl %eax, %esi # |
554 | roll $5, %esi # rotl32(a,5) | 687 | roll $5, %esi # rotl32(a,5) |
555 | addl %esi, %ebp # e += rotl32(a,5) | 688 | addl %esi, %ebp # e += rotl32(a,5) |
556 | rorl $2, %ebx # b = rotl32(b,30) | 689 | rorl $2, %ebx # b = rotl32(b,30) |
690 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
691 | movaps %xmm2, %xmm4 | ||
692 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
693 | # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
694 | # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
695 | # same result as above, but shorter and faster: | ||
696 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
697 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
698 | movaps %xmm3, %xmm5 | ||
699 | shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
700 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
701 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
702 | xorps %xmm5, %xmm3 # ^ | ||
703 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
704 | movaps %xmm3, %xmm5 | ||
705 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
706 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
707 | paddd %xmm3, %xmm3 # shift left by 1 | ||
708 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
709 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
710 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
711 | movaps %xmm5, %xmm4 | ||
712 | pslld $2, %xmm5 | ||
713 | psrld $30, %xmm4 | ||
714 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
715 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
716 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
717 | movaps %xmm3, %xmm5 | ||
718 | paddd %xmm6, %xmm5 | ||
719 | movups %xmm5, -64+16*3(%rsp) | ||
557 | # 36 | 720 | # 36 |
558 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
559 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
560 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
561 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
562 | roll %esi # | ||
563 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
564 | movl %ebx, %edi # c | 721 | movl %ebx, %edi # c |
565 | xorl %ecx, %edi # ^d | 722 | xorl %ecx, %edi # ^d |
566 | xorl %eax, %edi # ^b | 723 | xorl %eax, %edi # ^b |
567 | leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 724 | addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15] |
568 | addl %edi, %edx # e += (c ^ d ^ b) | 725 | addl %edi, %edx # e += (c ^ d ^ b) |
569 | movl %ebp, %esi # | 726 | movl %ebp, %esi # |
570 | roll $5, %esi # rotl32(a,5) | 727 | roll $5, %esi # rotl32(a,5) |
571 | addl %esi, %edx # e += rotl32(a,5) | 728 | addl %esi, %edx # e += rotl32(a,5) |
572 | rorl $2, %eax # b = rotl32(b,30) | 729 | rorl $2, %eax # b = rotl32(b,30) |
573 | # 37 | 730 | # 37 |
574 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
575 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
576 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
577 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
578 | roll %esi # | ||
579 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
580 | movl %eax, %edi # c | 731 | movl %eax, %edi # c |
581 | xorl %ebx, %edi # ^d | 732 | xorl %ebx, %edi # ^d |
582 | xorl %ebp, %edi # ^b | 733 | xorl %ebp, %edi # ^b |
583 | leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 734 | addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15] |
584 | addl %edi, %ecx # e += (c ^ d ^ b) | 735 | addl %edi, %ecx # e += (c ^ d ^ b) |
585 | movl %edx, %esi # | 736 | movl %edx, %esi # |
586 | roll $5, %esi # rotl32(a,5) | 737 | roll $5, %esi # rotl32(a,5) |
587 | addl %esi, %ecx # e += rotl32(a,5) | 738 | addl %esi, %ecx # e += rotl32(a,5) |
588 | rorl $2, %ebp # b = rotl32(b,30) | 739 | rorl $2, %ebp # b = rotl32(b,30) |
589 | # 38 | 740 | # 38 |
590 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
591 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
592 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
593 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
594 | roll %esi # | ||
595 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
596 | movl %ebp, %edi # c | 741 | movl %ebp, %edi # c |
597 | xorl %eax, %edi # ^d | 742 | xorl %eax, %edi # ^d |
598 | xorl %edx, %edi # ^b | 743 | xorl %edx, %edi # ^b |
599 | leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 744 | addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15] |
600 | addl %edi, %ebx # e += (c ^ d ^ b) | 745 | addl %edi, %ebx # e += (c ^ d ^ b) |
601 | movl %ecx, %esi # | 746 | movl %ecx, %esi # |
602 | roll $5, %esi # rotl32(a,5) | 747 | roll $5, %esi # rotl32(a,5) |
603 | addl %esi, %ebx # e += rotl32(a,5) | 748 | addl %esi, %ebx # e += rotl32(a,5) |
604 | rorl $2, %edx # b = rotl32(b,30) | 749 | rorl $2, %edx # b = rotl32(b,30) |
605 | # 39 | 750 | # 39 |
606 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
607 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
608 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
609 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
610 | roll %esi # | ||
611 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
612 | movl %edx, %edi # c | 751 | movl %edx, %edi # c |
613 | xorl %ebp, %edi # ^d | 752 | xorl %ebp, %edi # ^d |
614 | xorl %ecx, %edi # ^b | 753 | xorl %ecx, %edi # ^b |
615 | leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 754 | addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15] |
616 | addl %edi, %eax # e += (c ^ d ^ b) | 755 | addl %edi, %eax # e += (c ^ d ^ b) |
617 | movl %ebx, %esi # | 756 | movl %ebx, %esi # |
618 | roll $5, %esi # rotl32(a,5) | 757 | roll $5, %esi # rotl32(a,5) |
619 | addl %esi, %eax # e += rotl32(a,5) | 758 | addl %esi, %eax # e += rotl32(a,5) |
620 | rorl $2, %ecx # b = rotl32(b,30) | 759 | rorl $2, %ecx # b = rotl32(b,30) |
760 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
761 | movaps %xmm3, %xmm4 | ||
762 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
763 | # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
764 | # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
765 | # same result as above, but shorter and faster: | ||
766 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
767 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
768 | movaps %xmm0, %xmm5 | ||
769 | shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
770 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
771 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
772 | xorps %xmm5, %xmm0 # ^ | ||
773 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
774 | movaps %xmm0, %xmm5 | ||
775 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
776 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
777 | paddd %xmm0, %xmm0 # shift left by 1 | ||
778 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
779 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
780 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
781 | movaps %xmm5, %xmm4 | ||
782 | pslld $2, %xmm5 | ||
783 | psrld $30, %xmm4 | ||
784 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
785 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
786 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
787 | movaps %xmm0, %xmm5 | ||
788 | paddd %xmm6, %xmm5 | ||
789 | movups %xmm5, -64+16*0(%rsp) | ||
621 | # 40 | 790 | # 40 |
622 | movl %ebx, %edi # di: b | 791 | movl %ebx, %edi # di: b |
623 | movl %ebx, %esi # si: b | 792 | movl %ebx, %esi # si: b |
@@ -625,12 +794,8 @@ sha1_process_block64: | |||
625 | andl %ecx, %esi # si: b & c | 794 | andl %ecx, %esi # si: b & c |
626 | andl %edx, %edi # di: (b | c) & d | 795 | andl %edx, %edi # di: (b | c) & d |
627 | orl %esi, %edi # ((b | c) & d) | (b & c) | 796 | orl %esi, %edi # ((b | c) & d) | (b & c) |
628 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
629 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
630 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
631 | roll %r8d # | ||
632 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 797 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
633 | leal -0x70E44324(%rbp,%r8), %ebp # e += RCONST + W[n & 15] | 798 | addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15] |
634 | movl %eax, %esi # | 799 | movl %eax, %esi # |
635 | roll $5, %esi # rotl32(a,5) | 800 | roll $5, %esi # rotl32(a,5) |
636 | addl %esi, %ebp # e += rotl32(a,5) | 801 | addl %esi, %ebp # e += rotl32(a,5) |
@@ -642,12 +807,8 @@ sha1_process_block64: | |||
642 | andl %ebx, %esi # si: b & c | 807 | andl %ebx, %esi # si: b & c |
643 | andl %ecx, %edi # di: (b | c) & d | 808 | andl %ecx, %edi # di: (b | c) & d |
644 | orl %esi, %edi # ((b | c) & d) | (b & c) | 809 | orl %esi, %edi # ((b | c) & d) | (b & c) |
645 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
646 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
647 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
648 | roll %r9d # | ||
649 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 810 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
650 | leal -0x70E44324(%rdx,%r9), %edx # e += RCONST + W[n & 15] | 811 | addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15] |
651 | movl %ebp, %esi # | 812 | movl %ebp, %esi # |
652 | roll $5, %esi # rotl32(a,5) | 813 | roll $5, %esi # rotl32(a,5) |
653 | addl %esi, %edx # e += rotl32(a,5) | 814 | addl %esi, %edx # e += rotl32(a,5) |
@@ -659,12 +820,8 @@ sha1_process_block64: | |||
659 | andl %eax, %esi # si: b & c | 820 | andl %eax, %esi # si: b & c |
660 | andl %ebx, %edi # di: (b | c) & d | 821 | andl %ebx, %edi # di: (b | c) & d |
661 | orl %esi, %edi # ((b | c) & d) | (b & c) | 822 | orl %esi, %edi # ((b | c) & d) | (b & c) |
662 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
663 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
664 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
665 | roll %r10d # | ||
666 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 823 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
667 | leal -0x70E44324(%rcx,%r10), %ecx # e += RCONST + W[n & 15] | 824 | addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15] |
668 | movl %edx, %esi # | 825 | movl %edx, %esi # |
669 | roll $5, %esi # rotl32(a,5) | 826 | roll $5, %esi # rotl32(a,5) |
670 | addl %esi, %ecx # e += rotl32(a,5) | 827 | addl %esi, %ecx # e += rotl32(a,5) |
@@ -676,16 +833,42 @@ sha1_process_block64: | |||
676 | andl %ebp, %esi # si: b & c | 833 | andl %ebp, %esi # si: b & c |
677 | andl %eax, %edi # di: (b | c) & d | 834 | andl %eax, %edi # di: (b | c) & d |
678 | orl %esi, %edi # ((b | c) & d) | (b & c) | 835 | orl %esi, %edi # ((b | c) & d) | (b & c) |
679 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
680 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
681 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
682 | roll %r11d # | ||
683 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 836 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
684 | leal -0x70E44324(%rbx,%r11), %ebx # e += RCONST + W[n & 15] | 837 | addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15] |
685 | movl %ecx, %esi # | 838 | movl %ecx, %esi # |
686 | roll $5, %esi # rotl32(a,5) | 839 | roll $5, %esi # rotl32(a,5) |
687 | addl %esi, %ebx # e += rotl32(a,5) | 840 | addl %esi, %ebx # e += rotl32(a,5) |
688 | rorl $2, %edx # b = rotl32(b,30) | 841 | rorl $2, %edx # b = rotl32(b,30) |
842 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
843 | movaps %xmm0, %xmm4 | ||
844 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
845 | # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
846 | # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
847 | # same result as above, but shorter and faster: | ||
848 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
849 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
850 | movaps %xmm1, %xmm5 | ||
851 | shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
852 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
853 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
854 | xorps %xmm5, %xmm1 # ^ | ||
855 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
856 | movaps %xmm1, %xmm5 | ||
857 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
858 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
859 | paddd %xmm1, %xmm1 # shift left by 1 | ||
860 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
861 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
862 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
863 | movaps %xmm5, %xmm4 | ||
864 | pslld $2, %xmm5 | ||
865 | psrld $30, %xmm4 | ||
866 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
867 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
868 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
869 | movaps %xmm1, %xmm5 | ||
870 | paddd %xmm6, %xmm5 | ||
871 | movups %xmm5, -64+16*1(%rsp) | ||
689 | # 44 | 872 | # 44 |
690 | movl %ecx, %edi # di: b | 873 | movl %ecx, %edi # di: b |
691 | movl %ecx, %esi # si: b | 874 | movl %ecx, %esi # si: b |
@@ -693,12 +876,8 @@ sha1_process_block64: | |||
693 | andl %edx, %esi # si: b & c | 876 | andl %edx, %esi # si: b & c |
694 | andl %ebp, %edi # di: (b | c) & d | 877 | andl %ebp, %edi # di: (b | c) & d |
695 | orl %esi, %edi # ((b | c) & d) | (b & c) | 878 | orl %esi, %edi # ((b | c) & d) | (b & c) |
696 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
697 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
698 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
699 | roll %r12d # | ||
700 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 879 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
701 | leal -0x70E44324(%rax,%r12), %eax # e += RCONST + W[n & 15] | 880 | addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15] |
702 | movl %ebx, %esi # | 881 | movl %ebx, %esi # |
703 | roll $5, %esi # rotl32(a,5) | 882 | roll $5, %esi # rotl32(a,5) |
704 | addl %esi, %eax # e += rotl32(a,5) | 883 | addl %esi, %eax # e += rotl32(a,5) |
@@ -710,12 +889,8 @@ sha1_process_block64: | |||
710 | andl %ecx, %esi # si: b & c | 889 | andl %ecx, %esi # si: b & c |
711 | andl %edx, %edi # di: (b | c) & d | 890 | andl %edx, %edi # di: (b | c) & d |
712 | orl %esi, %edi # ((b | c) & d) | (b & c) | 891 | orl %esi, %edi # ((b | c) & d) | (b & c) |
713 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
714 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
715 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
716 | roll %r13d # | ||
717 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 892 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
718 | leal -0x70E44324(%rbp,%r13), %ebp # e += RCONST + W[n & 15] | 893 | addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15] |
719 | movl %eax, %esi # | 894 | movl %eax, %esi # |
720 | roll $5, %esi # rotl32(a,5) | 895 | roll $5, %esi # rotl32(a,5) |
721 | addl %esi, %ebp # e += rotl32(a,5) | 896 | addl %esi, %ebp # e += rotl32(a,5) |
@@ -727,12 +902,8 @@ sha1_process_block64: | |||
727 | andl %ebx, %esi # si: b & c | 902 | andl %ebx, %esi # si: b & c |
728 | andl %ecx, %edi # di: (b | c) & d | 903 | andl %ecx, %edi # di: (b | c) & d |
729 | orl %esi, %edi # ((b | c) & d) | (b & c) | 904 | orl %esi, %edi # ((b | c) & d) | (b & c) |
730 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
731 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
732 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
733 | roll %r14d # | ||
734 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 905 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
735 | leal -0x70E44324(%rdx,%r14), %edx # e += RCONST + W[n & 15] | 906 | addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15] |
736 | movl %ebp, %esi # | 907 | movl %ebp, %esi # |
737 | roll $5, %esi # rotl32(a,5) | 908 | roll $5, %esi # rotl32(a,5) |
738 | addl %esi, %edx # e += rotl32(a,5) | 909 | addl %esi, %edx # e += rotl32(a,5) |
@@ -744,16 +915,42 @@ sha1_process_block64: | |||
744 | andl %eax, %esi # si: b & c | 915 | andl %eax, %esi # si: b & c |
745 | andl %ebx, %edi # di: (b | c) & d | 916 | andl %ebx, %edi # di: (b | c) & d |
746 | orl %esi, %edi # ((b | c) & d) | (b & c) | 917 | orl %esi, %edi # ((b | c) & d) | (b & c) |
747 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
748 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
749 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
750 | roll %r15d # | ||
751 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 918 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
752 | leal -0x70E44324(%rcx,%r15), %ecx # e += RCONST + W[n & 15] | 919 | addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15] |
753 | movl %edx, %esi # | 920 | movl %edx, %esi # |
754 | roll $5, %esi # rotl32(a,5) | 921 | roll $5, %esi # rotl32(a,5) |
755 | addl %esi, %ecx # e += rotl32(a,5) | 922 | addl %esi, %ecx # e += rotl32(a,5) |
756 | rorl $2, %ebp # b = rotl32(b,30) | 923 | rorl $2, %ebp # b = rotl32(b,30) |
924 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
925 | movaps %xmm1, %xmm4 | ||
926 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
927 | # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
928 | # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
929 | # same result as above, but shorter and faster: | ||
930 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
931 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
932 | movaps %xmm2, %xmm5 | ||
933 | shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
934 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
935 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
936 | xorps %xmm5, %xmm2 # ^ | ||
937 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
938 | movaps %xmm2, %xmm5 | ||
939 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
940 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
941 | paddd %xmm2, %xmm2 # shift left by 1 | ||
942 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
943 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
944 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
945 | movaps %xmm5, %xmm4 | ||
946 | pslld $2, %xmm5 | ||
947 | psrld $30, %xmm4 | ||
948 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
949 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
950 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
951 | movaps %xmm2, %xmm5 | ||
952 | paddd %xmm6, %xmm5 | ||
953 | movups %xmm5, -64+16*2(%rsp) | ||
757 | # 48 | 954 | # 48 |
758 | movl %edx, %edi # di: b | 955 | movl %edx, %edi # di: b |
759 | movl %edx, %esi # si: b | 956 | movl %edx, %esi # si: b |
@@ -761,14 +958,8 @@ sha1_process_block64: | |||
761 | andl %ebp, %esi # si: b & c | 958 | andl %ebp, %esi # si: b & c |
762 | andl %eax, %edi # di: (b | c) & d | 959 | andl %eax, %edi # di: (b | c) & d |
763 | orl %esi, %edi # ((b | c) & d) | (b & c) | 960 | orl %esi, %edi # ((b | c) & d) | (b & c) |
764 | movl %r13d, %esi # W[(n+13) & 15] | ||
765 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
766 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
767 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
768 | roll %esi # | ||
769 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
770 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 961 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
771 | leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 962 | addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15] |
772 | movl %ecx, %esi # | 963 | movl %ecx, %esi # |
773 | roll $5, %esi # rotl32(a,5) | 964 | roll $5, %esi # rotl32(a,5) |
774 | addl %esi, %ebx # e += rotl32(a,5) | 965 | addl %esi, %ebx # e += rotl32(a,5) |
@@ -780,14 +971,8 @@ sha1_process_block64: | |||
780 | andl %edx, %esi # si: b & c | 971 | andl %edx, %esi # si: b & c |
781 | andl %ebp, %edi # di: (b | c) & d | 972 | andl %ebp, %edi # di: (b | c) & d |
782 | orl %esi, %edi # ((b | c) & d) | (b & c) | 973 | orl %esi, %edi # ((b | c) & d) | (b & c) |
783 | movl %r14d, %esi # W[(n+13) & 15] | ||
784 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
785 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
786 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
787 | roll %esi # | ||
788 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
789 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 974 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
790 | leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 975 | addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15] |
791 | movl %ebx, %esi # | 976 | movl %ebx, %esi # |
792 | roll $5, %esi # rotl32(a,5) | 977 | roll $5, %esi # rotl32(a,5) |
793 | addl %esi, %eax # e += rotl32(a,5) | 978 | addl %esi, %eax # e += rotl32(a,5) |
@@ -799,14 +984,8 @@ sha1_process_block64: | |||
799 | andl %ecx, %esi # si: b & c | 984 | andl %ecx, %esi # si: b & c |
800 | andl %edx, %edi # di: (b | c) & d | 985 | andl %edx, %edi # di: (b | c) & d |
801 | orl %esi, %edi # ((b | c) & d) | (b & c) | 986 | orl %esi, %edi # ((b | c) & d) | (b & c) |
802 | movl %r15d, %esi # W[(n+13) & 15] | ||
803 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
804 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
805 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
806 | roll %esi # | ||
807 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
808 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 987 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
809 | leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 988 | addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15] |
810 | movl %eax, %esi # | 989 | movl %eax, %esi # |
811 | roll $5, %esi # rotl32(a,5) | 990 | roll $5, %esi # rotl32(a,5) |
812 | addl %esi, %ebp # e += rotl32(a,5) | 991 | addl %esi, %ebp # e += rotl32(a,5) |
@@ -818,18 +997,43 @@ sha1_process_block64: | |||
818 | andl %ebx, %esi # si: b & c | 997 | andl %ebx, %esi # si: b & c |
819 | andl %ecx, %edi # di: (b | c) & d | 998 | andl %ecx, %edi # di: (b | c) & d |
820 | orl %esi, %edi # ((b | c) & d) | (b & c) | 999 | orl %esi, %edi # ((b | c) & d) | (b & c) |
821 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
822 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
823 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
824 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
825 | roll %esi # | ||
826 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
827 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 1000 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
828 | leal -0x70E44324(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 1001 | addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15] |
829 | movl %ebp, %esi # | 1002 | movl %ebp, %esi # |
830 | roll $5, %esi # rotl32(a,5) | 1003 | roll $5, %esi # rotl32(a,5) |
831 | addl %esi, %edx # e += rotl32(a,5) | 1004 | addl %esi, %edx # e += rotl32(a,5) |
832 | rorl $2, %eax # b = rotl32(b,30) | 1005 | rorl $2, %eax # b = rotl32(b,30) |
1006 | pshufd $0xff, %xmm7, %xmm6 | ||
1007 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
1008 | movaps %xmm2, %xmm4 | ||
1009 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
1010 | # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
1011 | # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
1012 | # same result as above, but shorter and faster: | ||
1013 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
1014 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
1015 | movaps %xmm3, %xmm5 | ||
1016 | shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
1017 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
1018 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
1019 | xorps %xmm5, %xmm3 # ^ | ||
1020 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
1021 | movaps %xmm3, %xmm5 | ||
1022 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
1023 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
1024 | paddd %xmm3, %xmm3 # shift left by 1 | ||
1025 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
1026 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
1027 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
1028 | movaps %xmm5, %xmm4 | ||
1029 | pslld $2, %xmm5 | ||
1030 | psrld $30, %xmm4 | ||
1031 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
1032 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
1033 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
1034 | movaps %xmm3, %xmm5 | ||
1035 | paddd %xmm6, %xmm5 | ||
1036 | movups %xmm5, -64+16*3(%rsp) | ||
833 | # 52 | 1037 | # 52 |
834 | movl %ebp, %edi # di: b | 1038 | movl %ebp, %edi # di: b |
835 | movl %ebp, %esi # si: b | 1039 | movl %ebp, %esi # si: b |
@@ -837,14 +1041,8 @@ sha1_process_block64: | |||
837 | andl %eax, %esi # si: b & c | 1041 | andl %eax, %esi # si: b & c |
838 | andl %ebx, %edi # di: (b | c) & d | 1042 | andl %ebx, %edi # di: (b | c) & d |
839 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1043 | orl %esi, %edi # ((b | c) & d) | (b & c) |
840 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
841 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
842 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
843 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
844 | roll %esi # | ||
845 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
846 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 1044 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
847 | leal -0x70E44324(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 1045 | addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15] |
848 | movl %edx, %esi # | 1046 | movl %edx, %esi # |
849 | roll $5, %esi # rotl32(a,5) | 1047 | roll $5, %esi # rotl32(a,5) |
850 | addl %esi, %ecx # e += rotl32(a,5) | 1048 | addl %esi, %ecx # e += rotl32(a,5) |
@@ -856,14 +1054,8 @@ sha1_process_block64: | |||
856 | andl %ebp, %esi # si: b & c | 1054 | andl %ebp, %esi # si: b & c |
857 | andl %eax, %edi # di: (b | c) & d | 1055 | andl %eax, %edi # di: (b | c) & d |
858 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1056 | orl %esi, %edi # ((b | c) & d) | (b & c) |
859 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
860 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
861 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
862 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
863 | roll %esi # | ||
864 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
865 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 1057 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
866 | leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 1058 | addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15] |
867 | movl %ecx, %esi # | 1059 | movl %ecx, %esi # |
868 | roll $5, %esi # rotl32(a,5) | 1060 | roll $5, %esi # rotl32(a,5) |
869 | addl %esi, %ebx # e += rotl32(a,5) | 1061 | addl %esi, %ebx # e += rotl32(a,5) |
@@ -875,14 +1067,8 @@ sha1_process_block64: | |||
875 | andl %edx, %esi # si: b & c | 1067 | andl %edx, %esi # si: b & c |
876 | andl %ebp, %edi # di: (b | c) & d | 1068 | andl %ebp, %edi # di: (b | c) & d |
877 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1069 | orl %esi, %edi # ((b | c) & d) | (b & c) |
878 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
879 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
880 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
881 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
882 | roll %esi # | ||
883 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
884 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 1070 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
885 | leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 1071 | addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15] |
886 | movl %ebx, %esi # | 1072 | movl %ebx, %esi # |
887 | roll $5, %esi # rotl32(a,5) | 1073 | roll $5, %esi # rotl32(a,5) |
888 | addl %esi, %eax # e += rotl32(a,5) | 1074 | addl %esi, %eax # e += rotl32(a,5) |
@@ -894,18 +1080,42 @@ sha1_process_block64: | |||
894 | andl %ecx, %esi # si: b & c | 1080 | andl %ecx, %esi # si: b & c |
895 | andl %edx, %edi # di: (b | c) & d | 1081 | andl %edx, %edi # di: (b | c) & d |
896 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1082 | orl %esi, %edi # ((b | c) & d) | (b & c) |
897 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
898 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
899 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
900 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
901 | roll %esi # | ||
902 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
903 | addl %edi, %ebp # += ((b | c) & d) | (b & c) | 1083 | addl %edi, %ebp # += ((b | c) & d) | (b & c) |
904 | leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 1084 | addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15] |
905 | movl %eax, %esi # | 1085 | movl %eax, %esi # |
906 | roll $5, %esi # rotl32(a,5) | 1086 | roll $5, %esi # rotl32(a,5) |
907 | addl %esi, %ebp # e += rotl32(a,5) | 1087 | addl %esi, %ebp # e += rotl32(a,5) |
908 | rorl $2, %ebx # b = rotl32(b,30) | 1088 | rorl $2, %ebx # b = rotl32(b,30) |
1089 | # PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp) | ||
1090 | movaps %xmm3, %xmm4 | ||
1091 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
1092 | # pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
1093 | # punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
1094 | # same result as above, but shorter and faster: | ||
1095 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
1096 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
1097 | movaps %xmm0, %xmm5 | ||
1098 | shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
1099 | xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
1100 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
1101 | xorps %xmm5, %xmm0 # ^ | ||
1102 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
1103 | movaps %xmm0, %xmm5 | ||
1104 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
1105 | pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
1106 | paddd %xmm0, %xmm0 # shift left by 1 | ||
1107 | psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1 | ||
1108 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
1109 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
1110 | movaps %xmm5, %xmm4 | ||
1111 | pslld $2, %xmm5 | ||
1112 | psrld $30, %xmm4 | ||
1113 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
1114 | xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2 | ||
1115 | xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
1116 | movaps %xmm0, %xmm5 | ||
1117 | paddd %xmm6, %xmm5 | ||
1118 | movups %xmm5, -64+16*0(%rsp) | ||
909 | # 56 | 1119 | # 56 |
910 | movl %eax, %edi # di: b | 1120 | movl %eax, %edi # di: b |
911 | movl %eax, %esi # si: b | 1121 | movl %eax, %esi # si: b |
@@ -913,12 +1123,8 @@ sha1_process_block64: | |||
913 | andl %ebx, %esi # si: b & c | 1123 | andl %ebx, %esi # si: b & c |
914 | andl %ecx, %edi # di: (b | c) & d | 1124 | andl %ecx, %edi # di: (b | c) & d |
915 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1125 | orl %esi, %edi # ((b | c) & d) | (b & c) |
916 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
917 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
918 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
919 | roll %r8d # | ||
920 | addl %edi, %edx # += ((b | c) & d) | (b & c) | 1126 | addl %edi, %edx # += ((b | c) & d) | (b & c) |
921 | leal -0x70E44324(%rdx,%r8), %edx # e += RCONST + W[n & 15] | 1127 | addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15] |
922 | movl %ebp, %esi # | 1128 | movl %ebp, %esi # |
923 | roll $5, %esi # rotl32(a,5) | 1129 | roll $5, %esi # rotl32(a,5) |
924 | addl %esi, %edx # e += rotl32(a,5) | 1130 | addl %esi, %edx # e += rotl32(a,5) |
@@ -930,12 +1136,8 @@ sha1_process_block64: | |||
930 | andl %eax, %esi # si: b & c | 1136 | andl %eax, %esi # si: b & c |
931 | andl %ebx, %edi # di: (b | c) & d | 1137 | andl %ebx, %edi # di: (b | c) & d |
932 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1138 | orl %esi, %edi # ((b | c) & d) | (b & c) |
933 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
934 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
935 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
936 | roll %r9d # | ||
937 | addl %edi, %ecx # += ((b | c) & d) | (b & c) | 1139 | addl %edi, %ecx # += ((b | c) & d) | (b & c) |
938 | leal -0x70E44324(%rcx,%r9), %ecx # e += RCONST + W[n & 15] | 1140 | addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15] |
939 | movl %edx, %esi # | 1141 | movl %edx, %esi # |
940 | roll $5, %esi # rotl32(a,5) | 1142 | roll $5, %esi # rotl32(a,5) |
941 | addl %esi, %ecx # e += rotl32(a,5) | 1143 | addl %esi, %ecx # e += rotl32(a,5) |
@@ -947,12 +1149,8 @@ sha1_process_block64: | |||
947 | andl %ebp, %esi # si: b & c | 1149 | andl %ebp, %esi # si: b & c |
948 | andl %eax, %edi # di: (b | c) & d | 1150 | andl %eax, %edi # di: (b | c) & d |
949 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1151 | orl %esi, %edi # ((b | c) & d) | (b & c) |
950 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
951 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
952 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
953 | roll %r10d # | ||
954 | addl %edi, %ebx # += ((b | c) & d) | (b & c) | 1152 | addl %edi, %ebx # += ((b | c) & d) | (b & c) |
955 | leal -0x70E44324(%rbx,%r10), %ebx # e += RCONST + W[n & 15] | 1153 | addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15] |
956 | movl %ecx, %esi # | 1154 | movl %ecx, %esi # |
957 | roll $5, %esi # rotl32(a,5) | 1155 | roll $5, %esi # rotl32(a,5) |
958 | addl %esi, %ebx # e += rotl32(a,5) | 1156 | addl %esi, %ebx # e += rotl32(a,5) |
@@ -964,307 +1162,297 @@ sha1_process_block64: | |||
964 | andl %edx, %esi # si: b & c | 1162 | andl %edx, %esi # si: b & c |
965 | andl %ebp, %edi # di: (b | c) & d | 1163 | andl %ebp, %edi # di: (b | c) & d |
966 | orl %esi, %edi # ((b | c) & d) | (b & c) | 1164 | orl %esi, %edi # ((b | c) & d) | (b & c) |
967 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
968 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
969 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
970 | roll %r11d # | ||
971 | addl %edi, %eax # += ((b | c) & d) | (b & c) | 1165 | addl %edi, %eax # += ((b | c) & d) | (b & c) |
972 | leal -0x70E44324(%rax,%r11), %eax # e += RCONST + W[n & 15] | 1166 | addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15] |
973 | movl %ebx, %esi # | 1167 | movl %ebx, %esi # |
974 | roll $5, %esi # rotl32(a,5) | 1168 | roll $5, %esi # rotl32(a,5) |
975 | addl %esi, %eax # e += rotl32(a,5) | 1169 | addl %esi, %eax # e += rotl32(a,5) |
976 | rorl $2, %ecx # b = rotl32(b,30) | 1170 | rorl $2, %ecx # b = rotl32(b,30) |
1171 | # PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp) | ||
1172 | movaps %xmm0, %xmm4 | ||
1173 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
1174 | # pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
1175 | # punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
1176 | # same result as above, but shorter and faster: | ||
1177 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
1178 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
1179 | movaps %xmm1, %xmm5 | ||
1180 | shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
1181 | xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
1182 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
1183 | xorps %xmm5, %xmm1 # ^ | ||
1184 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
1185 | movaps %xmm1, %xmm5 | ||
1186 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
1187 | pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
1188 | paddd %xmm1, %xmm1 # shift left by 1 | ||
1189 | psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1 | ||
1190 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
1191 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
1192 | movaps %xmm5, %xmm4 | ||
1193 | pslld $2, %xmm5 | ||
1194 | psrld $30, %xmm4 | ||
1195 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
1196 | xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2 | ||
1197 | xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
1198 | movaps %xmm1, %xmm5 | ||
1199 | paddd %xmm6, %xmm5 | ||
1200 | movups %xmm5, -64+16*1(%rsp) | ||
977 | # 60 | 1201 | # 60 |
978 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
979 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
980 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
981 | roll %r12d # | ||
982 | movl %ecx, %edi # c | 1202 | movl %ecx, %edi # c |
983 | xorl %edx, %edi # ^d | 1203 | xorl %edx, %edi # ^d |
984 | xorl %ebx, %edi # ^b | 1204 | xorl %ebx, %edi # ^b |
985 | leal -0x359D3E2A(%rbp,%r12), %ebp # e += RCONST + W[n & 15] | 1205 | addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15] |
986 | addl %edi, %ebp # e += (c ^ d ^ b) | 1206 | addl %edi, %ebp # e += (c ^ d ^ b) |
987 | movl %eax, %esi # | 1207 | movl %eax, %esi # |
988 | roll $5, %esi # rotl32(a,5) | 1208 | roll $5, %esi # rotl32(a,5) |
989 | addl %esi, %ebp # e += rotl32(a,5) | 1209 | addl %esi, %ebp # e += rotl32(a,5) |
990 | rorl $2, %ebx # b = rotl32(b,30) | 1210 | rorl $2, %ebx # b = rotl32(b,30) |
991 | # 61 | 1211 | # 61 |
992 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
993 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
994 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
995 | roll %r13d # | ||
996 | movl %ebx, %edi # c | 1212 | movl %ebx, %edi # c |
997 | xorl %ecx, %edi # ^d | 1213 | xorl %ecx, %edi # ^d |
998 | xorl %eax, %edi # ^b | 1214 | xorl %eax, %edi # ^b |
999 | leal -0x359D3E2A(%rdx,%r13), %edx # e += RCONST + W[n & 15] | 1215 | addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15] |
1000 | addl %edi, %edx # e += (c ^ d ^ b) | 1216 | addl %edi, %edx # e += (c ^ d ^ b) |
1001 | movl %ebp, %esi # | 1217 | movl %ebp, %esi # |
1002 | roll $5, %esi # rotl32(a,5) | 1218 | roll $5, %esi # rotl32(a,5) |
1003 | addl %esi, %edx # e += rotl32(a,5) | 1219 | addl %esi, %edx # e += rotl32(a,5) |
1004 | rorl $2, %eax # b = rotl32(b,30) | 1220 | rorl $2, %eax # b = rotl32(b,30) |
1005 | # 62 | 1221 | # 62 |
1006 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
1007 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
1008 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
1009 | roll %r14d # | ||
1010 | movl %eax, %edi # c | 1222 | movl %eax, %edi # c |
1011 | xorl %ebx, %edi # ^d | 1223 | xorl %ebx, %edi # ^d |
1012 | xorl %ebp, %edi # ^b | 1224 | xorl %ebp, %edi # ^b |
1013 | leal -0x359D3E2A(%rcx,%r14), %ecx # e += RCONST + W[n & 15] | 1225 | addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15] |
1014 | addl %edi, %ecx # e += (c ^ d ^ b) | 1226 | addl %edi, %ecx # e += (c ^ d ^ b) |
1015 | movl %edx, %esi # | 1227 | movl %edx, %esi # |
1016 | roll $5, %esi # rotl32(a,5) | 1228 | roll $5, %esi # rotl32(a,5) |
1017 | addl %esi, %ecx # e += rotl32(a,5) | 1229 | addl %esi, %ecx # e += rotl32(a,5) |
1018 | rorl $2, %ebp # b = rotl32(b,30) | 1230 | rorl $2, %ebp # b = rotl32(b,30) |
1019 | # 63 | 1231 | # 63 |
1020 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
1021 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
1022 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
1023 | roll %r15d # | ||
1024 | movl %ebp, %edi # c | 1232 | movl %ebp, %edi # c |
1025 | xorl %eax, %edi # ^d | 1233 | xorl %eax, %edi # ^d |
1026 | xorl %edx, %edi # ^b | 1234 | xorl %edx, %edi # ^b |
1027 | leal -0x359D3E2A(%rbx,%r15), %ebx # e += RCONST + W[n & 15] | 1235 | addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15] |
1028 | addl %edi, %ebx # e += (c ^ d ^ b) | 1236 | addl %edi, %ebx # e += (c ^ d ^ b) |
1029 | movl %ecx, %esi # | 1237 | movl %ecx, %esi # |
1030 | roll $5, %esi # rotl32(a,5) | 1238 | roll $5, %esi # rotl32(a,5) |
1031 | addl %esi, %ebx # e += rotl32(a,5) | 1239 | addl %esi, %ebx # e += rotl32(a,5) |
1032 | rorl $2, %edx # b = rotl32(b,30) | 1240 | rorl $2, %edx # b = rotl32(b,30) |
1241 | # PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp) | ||
1242 | movaps %xmm1, %xmm4 | ||
1243 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
1244 | # pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
1245 | # punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
1246 | # same result as above, but shorter and faster: | ||
1247 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
1248 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
1249 | movaps %xmm2, %xmm5 | ||
1250 | shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
1251 | xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
1252 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
1253 | xorps %xmm5, %xmm2 # ^ | ||
1254 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
1255 | movaps %xmm2, %xmm5 | ||
1256 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
1257 | pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
1258 | paddd %xmm2, %xmm2 # shift left by 1 | ||
1259 | psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1 | ||
1260 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
1261 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
1262 | movaps %xmm5, %xmm4 | ||
1263 | pslld $2, %xmm5 | ||
1264 | psrld $30, %xmm4 | ||
1265 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
1266 | xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2 | ||
1267 | xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
1268 | movaps %xmm2, %xmm5 | ||
1269 | paddd %xmm6, %xmm5 | ||
1270 | movups %xmm5, -64+16*2(%rsp) | ||
1033 | # 64 | 1271 | # 64 |
1034 | movl %r13d, %esi # W[(n+13) & 15] | ||
1035 | xorl %r8d, %esi # ^W[(n+8) & 15] | ||
1036 | xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15] | ||
1037 | xorl -32+4*0(%rsp), %esi # ^W[n & 15] | ||
1038 | roll %esi # | ||
1039 | movl %esi, -32+4*0(%rsp) # store to W[n & 15] | ||
1040 | movl %edx, %edi # c | 1272 | movl %edx, %edi # c |
1041 | xorl %ebp, %edi # ^d | 1273 | xorl %ebp, %edi # ^d |
1042 | xorl %ecx, %edi # ^b | 1274 | xorl %ecx, %edi # ^b |
1043 | leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 1275 | addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15] |
1044 | addl %edi, %eax # e += (c ^ d ^ b) | 1276 | addl %edi, %eax # e += (c ^ d ^ b) |
1045 | movl %ebx, %esi # | 1277 | movl %ebx, %esi # |
1046 | roll $5, %esi # rotl32(a,5) | 1278 | roll $5, %esi # rotl32(a,5) |
1047 | addl %esi, %eax # e += rotl32(a,5) | 1279 | addl %esi, %eax # e += rotl32(a,5) |
1048 | rorl $2, %ecx # b = rotl32(b,30) | 1280 | rorl $2, %ecx # b = rotl32(b,30) |
1049 | # 65 | 1281 | # 65 |
1050 | movl %r14d, %esi # W[(n+13) & 15] | ||
1051 | xorl %r9d, %esi # ^W[(n+8) & 15] | ||
1052 | xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15] | ||
1053 | xorl -32+4*1(%rsp), %esi # ^W[n & 15] | ||
1054 | roll %esi # | ||
1055 | movl %esi, -32+4*1(%rsp) # store to W[n & 15] | ||
1056 | movl %ecx, %edi # c | 1282 | movl %ecx, %edi # c |
1057 | xorl %edx, %edi # ^d | 1283 | xorl %edx, %edi # ^d |
1058 | xorl %ebx, %edi # ^b | 1284 | xorl %ebx, %edi # ^b |
1059 | leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 1285 | addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15] |
1060 | addl %edi, %ebp # e += (c ^ d ^ b) | 1286 | addl %edi, %ebp # e += (c ^ d ^ b) |
1061 | movl %eax, %esi # | 1287 | movl %eax, %esi # |
1062 | roll $5, %esi # rotl32(a,5) | 1288 | roll $5, %esi # rotl32(a,5) |
1063 | addl %esi, %ebp # e += rotl32(a,5) | 1289 | addl %esi, %ebp # e += rotl32(a,5) |
1064 | rorl $2, %ebx # b = rotl32(b,30) | 1290 | rorl $2, %ebx # b = rotl32(b,30) |
1065 | # 66 | 1291 | # 66 |
1066 | movl %r15d, %esi # W[(n+13) & 15] | ||
1067 | xorl %r10d, %esi # ^W[(n+8) & 15] | ||
1068 | xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15] | ||
1069 | xorl -32+4*2(%rsp), %esi # ^W[n & 15] | ||
1070 | roll %esi # | ||
1071 | movl %esi, -32+4*2(%rsp) # store to W[n & 15] | ||
1072 | movl %ebx, %edi # c | 1292 | movl %ebx, %edi # c |
1073 | xorl %ecx, %edi # ^d | 1293 | xorl %ecx, %edi # ^d |
1074 | xorl %eax, %edi # ^b | 1294 | xorl %eax, %edi # ^b |
1075 | leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 1295 | addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15] |
1076 | addl %edi, %edx # e += (c ^ d ^ b) | 1296 | addl %edi, %edx # e += (c ^ d ^ b) |
1077 | movl %ebp, %esi # | 1297 | movl %ebp, %esi # |
1078 | roll $5, %esi # rotl32(a,5) | 1298 | roll $5, %esi # rotl32(a,5) |
1079 | addl %esi, %edx # e += rotl32(a,5) | 1299 | addl %esi, %edx # e += rotl32(a,5) |
1080 | rorl $2, %eax # b = rotl32(b,30) | 1300 | rorl $2, %eax # b = rotl32(b,30) |
1081 | # 67 | 1301 | # 67 |
1082 | movl -32+4*0(%rsp), %esi # W[(n+13) & 15] | ||
1083 | xorl %r11d, %esi # ^W[(n+8) & 15] | ||
1084 | xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15] | ||
1085 | xorl -32+4*3(%rsp), %esi # ^W[n & 15] | ||
1086 | roll %esi # | ||
1087 | movl %esi, -32+4*3(%rsp) # store to W[n & 15] | ||
1088 | movl %eax, %edi # c | 1302 | movl %eax, %edi # c |
1089 | xorl %ebx, %edi # ^d | 1303 | xorl %ebx, %edi # ^d |
1090 | xorl %ebp, %edi # ^b | 1304 | xorl %ebp, %edi # ^b |
1091 | leal -0x359D3E2A(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] | 1305 | addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15] |
1092 | addl %edi, %ecx # e += (c ^ d ^ b) | 1306 | addl %edi, %ecx # e += (c ^ d ^ b) |
1093 | movl %edx, %esi # | 1307 | movl %edx, %esi # |
1094 | roll $5, %esi # rotl32(a,5) | 1308 | roll $5, %esi # rotl32(a,5) |
1095 | addl %esi, %ecx # e += rotl32(a,5) | 1309 | addl %esi, %ecx # e += rotl32(a,5) |
1096 | rorl $2, %ebp # b = rotl32(b,30) | 1310 | rorl $2, %ebp # b = rotl32(b,30) |
1311 | # PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp) | ||
1312 | movaps %xmm2, %xmm4 | ||
1313 | psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
1314 | # pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
1315 | # punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
1316 | # same result as above, but shorter and faster: | ||
1317 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
1318 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
1319 | movaps %xmm3, %xmm5 | ||
1320 | shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
1321 | xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
1322 | xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
1323 | xorps %xmm5, %xmm3 # ^ | ||
1324 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
1325 | movaps %xmm3, %xmm5 | ||
1326 | xorps %xmm4, %xmm4 # rol(W0,1): | ||
1327 | pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1) | ||
1328 | paddd %xmm3, %xmm3 # shift left by 1 | ||
1329 | psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1 | ||
1330 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
1331 | pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
1332 | movaps %xmm5, %xmm4 | ||
1333 | pslld $2, %xmm5 | ||
1334 | psrld $30, %xmm4 | ||
1335 | # xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2) | ||
1336 | xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2 | ||
1337 | xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
1338 | movaps %xmm3, %xmm5 | ||
1339 | paddd %xmm6, %xmm5 | ||
1340 | movups %xmm5, -64+16*3(%rsp) | ||
1097 | # 68 | 1341 | # 68 |
1098 | movl -32+4*1(%rsp), %esi # W[(n+13) & 15] | ||
1099 | xorl %r12d, %esi # ^W[(n+8) & 15] | ||
1100 | xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15] | ||
1101 | xorl -32+4*4(%rsp), %esi # ^W[n & 15] | ||
1102 | roll %esi # | ||
1103 | movl %esi, -32+4*4(%rsp) # store to W[n & 15] | ||
1104 | movl %ebp, %edi # c | 1342 | movl %ebp, %edi # c |
1105 | xorl %eax, %edi # ^d | 1343 | xorl %eax, %edi # ^d |
1106 | xorl %edx, %edi # ^b | 1344 | xorl %edx, %edi # ^b |
1107 | leal -0x359D3E2A(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] | 1345 | addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15] |
1108 | addl %edi, %ebx # e += (c ^ d ^ b) | 1346 | addl %edi, %ebx # e += (c ^ d ^ b) |
1109 | movl %ecx, %esi # | 1347 | movl %ecx, %esi # |
1110 | roll $5, %esi # rotl32(a,5) | 1348 | roll $5, %esi # rotl32(a,5) |
1111 | addl %esi, %ebx # e += rotl32(a,5) | 1349 | addl %esi, %ebx # e += rotl32(a,5) |
1112 | rorl $2, %edx # b = rotl32(b,30) | 1350 | rorl $2, %edx # b = rotl32(b,30) |
1113 | # 69 | 1351 | # 69 |
1114 | movl -32+4*2(%rsp), %esi # W[(n+13) & 15] | ||
1115 | xorl %r13d, %esi # ^W[(n+8) & 15] | ||
1116 | xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15] | ||
1117 | xorl -32+4*5(%rsp), %esi # ^W[n & 15] | ||
1118 | roll %esi # | ||
1119 | movl %esi, -32+4*5(%rsp) # store to W[n & 15] | ||
1120 | movl %edx, %edi # c | 1352 | movl %edx, %edi # c |
1121 | xorl %ebp, %edi # ^d | 1353 | xorl %ebp, %edi # ^d |
1122 | xorl %ecx, %edi # ^b | 1354 | xorl %ecx, %edi # ^b |
1123 | leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] | 1355 | addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15] |
1124 | addl %edi, %eax # e += (c ^ d ^ b) | 1356 | addl %edi, %eax # e += (c ^ d ^ b) |
1125 | movl %ebx, %esi # | 1357 | movl %ebx, %esi # |
1126 | roll $5, %esi # rotl32(a,5) | 1358 | roll $5, %esi # rotl32(a,5) |
1127 | addl %esi, %eax # e += rotl32(a,5) | 1359 | addl %esi, %eax # e += rotl32(a,5) |
1128 | rorl $2, %ecx # b = rotl32(b,30) | 1360 | rorl $2, %ecx # b = rotl32(b,30) |
1129 | # 70 | 1361 | # 70 |
1130 | movl -32+4*3(%rsp), %esi # W[(n+13) & 15] | ||
1131 | xorl %r14d, %esi # ^W[(n+8) & 15] | ||
1132 | xorl %r8d, %esi # ^W[(n+2) & 15] | ||
1133 | xorl -32+4*6(%rsp), %esi # ^W[n & 15] | ||
1134 | roll %esi # | ||
1135 | movl %esi, -32+4*6(%rsp) # store to W[n & 15] | ||
1136 | movl %ecx, %edi # c | 1362 | movl %ecx, %edi # c |
1137 | xorl %edx, %edi # ^d | 1363 | xorl %edx, %edi # ^d |
1138 | xorl %ebx, %edi # ^b | 1364 | xorl %ebx, %edi # ^b |
1139 | leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] | 1365 | addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15] |
1140 | addl %edi, %ebp # e += (c ^ d ^ b) | 1366 | addl %edi, %ebp # e += (c ^ d ^ b) |
1141 | movl %eax, %esi # | 1367 | movl %eax, %esi # |
1142 | roll $5, %esi # rotl32(a,5) | 1368 | roll $5, %esi # rotl32(a,5) |
1143 | addl %esi, %ebp # e += rotl32(a,5) | 1369 | addl %esi, %ebp # e += rotl32(a,5) |
1144 | rorl $2, %ebx # b = rotl32(b,30) | 1370 | rorl $2, %ebx # b = rotl32(b,30) |
1145 | # 71 | 1371 | # 71 |
1146 | movl -32+4*4(%rsp), %esi # W[(n+13) & 15] | ||
1147 | xorl %r15d, %esi # ^W[(n+8) & 15] | ||
1148 | xorl %r9d, %esi # ^W[(n+2) & 15] | ||
1149 | xorl -32+4*7(%rsp), %esi # ^W[n & 15] | ||
1150 | roll %esi # | ||
1151 | movl %esi, -32+4*7(%rsp) # store to W[n & 15] | ||
1152 | movl %ebx, %edi # c | 1372 | movl %ebx, %edi # c |
1153 | xorl %ecx, %edi # ^d | 1373 | xorl %ecx, %edi # ^d |
1154 | xorl %eax, %edi # ^b | 1374 | xorl %eax, %edi # ^b |
1155 | leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] | 1375 | addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15] |
1156 | addl %edi, %edx # e += (c ^ d ^ b) | 1376 | addl %edi, %edx # e += (c ^ d ^ b) |
1157 | movl %ebp, %esi # | 1377 | movl %ebp, %esi # |
1158 | roll $5, %esi # rotl32(a,5) | 1378 | roll $5, %esi # rotl32(a,5) |
1159 | addl %esi, %edx # e += rotl32(a,5) | 1379 | addl %esi, %edx # e += rotl32(a,5) |
1160 | rorl $2, %eax # b = rotl32(b,30) | 1380 | rorl $2, %eax # b = rotl32(b,30) |
1161 | # 72 | 1381 | # 72 |
1162 | xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15] | ||
1163 | xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15] | ||
1164 | xorl %r10d, %r8d # ^W[(n+2) & 15] | ||
1165 | roll %r8d # | ||
1166 | movl %eax, %edi # c | 1382 | movl %eax, %edi # c |
1167 | xorl %ebx, %edi # ^d | 1383 | xorl %ebx, %edi # ^d |
1168 | xorl %ebp, %edi # ^b | 1384 | xorl %ebp, %edi # ^b |
1169 | leal -0x359D3E2A(%rcx,%r8), %ecx # e += RCONST + W[n & 15] | 1385 | addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15] |
1170 | addl %edi, %ecx # e += (c ^ d ^ b) | 1386 | addl %edi, %ecx # e += (c ^ d ^ b) |
1171 | movl %edx, %esi # | 1387 | movl %edx, %esi # |
1172 | roll $5, %esi # rotl32(a,5) | 1388 | roll $5, %esi # rotl32(a,5) |
1173 | addl %esi, %ecx # e += rotl32(a,5) | 1389 | addl %esi, %ecx # e += rotl32(a,5) |
1174 | rorl $2, %ebp # b = rotl32(b,30) | 1390 | rorl $2, %ebp # b = rotl32(b,30) |
1175 | # 73 | 1391 | # 73 |
1176 | xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15] | ||
1177 | xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15] | ||
1178 | xorl %r11d, %r9d # ^W[(n+2) & 15] | ||
1179 | roll %r9d # | ||
1180 | movl %ebp, %edi # c | 1392 | movl %ebp, %edi # c |
1181 | xorl %eax, %edi # ^d | 1393 | xorl %eax, %edi # ^d |
1182 | xorl %edx, %edi # ^b | 1394 | xorl %edx, %edi # ^b |
1183 | leal -0x359D3E2A(%rbx,%r9), %ebx # e += RCONST + W[n & 15] | 1395 | addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15] |
1184 | addl %edi, %ebx # e += (c ^ d ^ b) | 1396 | addl %edi, %ebx # e += (c ^ d ^ b) |
1185 | movl %ecx, %esi # | 1397 | movl %ecx, %esi # |
1186 | roll $5, %esi # rotl32(a,5) | 1398 | roll $5, %esi # rotl32(a,5) |
1187 | addl %esi, %ebx # e += rotl32(a,5) | 1399 | addl %esi, %ebx # e += rotl32(a,5) |
1188 | rorl $2, %edx # b = rotl32(b,30) | 1400 | rorl $2, %edx # b = rotl32(b,30) |
1189 | # 74 | 1401 | # 74 |
1190 | xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15] | ||
1191 | xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15] | ||
1192 | xorl %r12d, %r10d # ^W[(n+2) & 15] | ||
1193 | roll %r10d # | ||
1194 | movl %edx, %edi # c | 1402 | movl %edx, %edi # c |
1195 | xorl %ebp, %edi # ^d | 1403 | xorl %ebp, %edi # ^d |
1196 | xorl %ecx, %edi # ^b | 1404 | xorl %ecx, %edi # ^b |
1197 | leal -0x359D3E2A(%rax,%r10), %eax # e += RCONST + W[n & 15] | 1405 | addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15] |
1198 | addl %edi, %eax # e += (c ^ d ^ b) | 1406 | addl %edi, %eax # e += (c ^ d ^ b) |
1199 | movl %ebx, %esi # | 1407 | movl %ebx, %esi # |
1200 | roll $5, %esi # rotl32(a,5) | 1408 | roll $5, %esi # rotl32(a,5) |
1201 | addl %esi, %eax # e += rotl32(a,5) | 1409 | addl %esi, %eax # e += rotl32(a,5) |
1202 | rorl $2, %ecx # b = rotl32(b,30) | 1410 | rorl $2, %ecx # b = rotl32(b,30) |
1203 | # 75 | 1411 | # 75 |
1204 | xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15] | ||
1205 | xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15] | ||
1206 | xorl %r13d, %r11d # ^W[(n+2) & 15] | ||
1207 | roll %r11d # | ||
1208 | movl %ecx, %edi # c | 1412 | movl %ecx, %edi # c |
1209 | xorl %edx, %edi # ^d | 1413 | xorl %edx, %edi # ^d |
1210 | xorl %ebx, %edi # ^b | 1414 | xorl %ebx, %edi # ^b |
1211 | leal -0x359D3E2A(%rbp,%r11), %ebp # e += RCONST + W[n & 15] | 1415 | addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15] |
1212 | addl %edi, %ebp # e += (c ^ d ^ b) | 1416 | addl %edi, %ebp # e += (c ^ d ^ b) |
1213 | movl %eax, %esi # | 1417 | movl %eax, %esi # |
1214 | roll $5, %esi # rotl32(a,5) | 1418 | roll $5, %esi # rotl32(a,5) |
1215 | addl %esi, %ebp # e += rotl32(a,5) | 1419 | addl %esi, %ebp # e += rotl32(a,5) |
1216 | rorl $2, %ebx # b = rotl32(b,30) | 1420 | rorl $2, %ebx # b = rotl32(b,30) |
1217 | # 76 | 1421 | # 76 |
1218 | xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15] | ||
1219 | xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15] | ||
1220 | xorl %r14d, %r12d # ^W[(n+2) & 15] | ||
1221 | roll %r12d # | ||
1222 | movl %ebx, %edi # c | 1422 | movl %ebx, %edi # c |
1223 | xorl %ecx, %edi # ^d | 1423 | xorl %ecx, %edi # ^d |
1224 | xorl %eax, %edi # ^b | 1424 | xorl %eax, %edi # ^b |
1225 | leal -0x359D3E2A(%rdx,%r12), %edx # e += RCONST + W[n & 15] | 1425 | addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15] |
1226 | addl %edi, %edx # e += (c ^ d ^ b) | 1426 | addl %edi, %edx # e += (c ^ d ^ b) |
1227 | movl %ebp, %esi # | 1427 | movl %ebp, %esi # |
1228 | roll $5, %esi # rotl32(a,5) | 1428 | roll $5, %esi # rotl32(a,5) |
1229 | addl %esi, %edx # e += rotl32(a,5) | 1429 | addl %esi, %edx # e += rotl32(a,5) |
1230 | rorl $2, %eax # b = rotl32(b,30) | 1430 | rorl $2, %eax # b = rotl32(b,30) |
1231 | # 77 | 1431 | # 77 |
1232 | xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15] | ||
1233 | xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15] | ||
1234 | xorl %r15d, %r13d # ^W[(n+2) & 15] | ||
1235 | roll %r13d # | ||
1236 | movl %eax, %edi # c | 1432 | movl %eax, %edi # c |
1237 | xorl %ebx, %edi # ^d | 1433 | xorl %ebx, %edi # ^d |
1238 | xorl %ebp, %edi # ^b | 1434 | xorl %ebp, %edi # ^b |
1239 | leal -0x359D3E2A(%rcx,%r13), %ecx # e += RCONST + W[n & 15] | 1435 | addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15] |
1240 | addl %edi, %ecx # e += (c ^ d ^ b) | 1436 | addl %edi, %ecx # e += (c ^ d ^ b) |
1241 | movl %edx, %esi # | 1437 | movl %edx, %esi # |
1242 | roll $5, %esi # rotl32(a,5) | 1438 | roll $5, %esi # rotl32(a,5) |
1243 | addl %esi, %ecx # e += rotl32(a,5) | 1439 | addl %esi, %ecx # e += rotl32(a,5) |
1244 | rorl $2, %ebp # b = rotl32(b,30) | 1440 | rorl $2, %ebp # b = rotl32(b,30) |
1245 | # 78 | 1441 | # 78 |
1246 | xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15] | ||
1247 | xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15] | ||
1248 | xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15] | ||
1249 | roll %r14d # | ||
1250 | movl %ebp, %edi # c | 1442 | movl %ebp, %edi # c |
1251 | xorl %eax, %edi # ^d | 1443 | xorl %eax, %edi # ^d |
1252 | xorl %edx, %edi # ^b | 1444 | xorl %edx, %edi # ^b |
1253 | leal -0x359D3E2A(%rbx,%r14), %ebx # e += RCONST + W[n & 15] | 1445 | addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15] |
1254 | addl %edi, %ebx # e += (c ^ d ^ b) | 1446 | addl %edi, %ebx # e += (c ^ d ^ b) |
1255 | movl %ecx, %esi # | 1447 | movl %ecx, %esi # |
1256 | roll $5, %esi # rotl32(a,5) | 1448 | roll $5, %esi # rotl32(a,5) |
1257 | addl %esi, %ebx # e += rotl32(a,5) | 1449 | addl %esi, %ebx # e += rotl32(a,5) |
1258 | rorl $2, %edx # b = rotl32(b,30) | 1450 | rorl $2, %edx # b = rotl32(b,30) |
1259 | # 79 | 1451 | # 79 |
1260 | xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15] | ||
1261 | xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15] | ||
1262 | xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15] | ||
1263 | roll %r15d # | ||
1264 | movl %edx, %edi # c | 1452 | movl %edx, %edi # c |
1265 | xorl %ebp, %edi # ^d | 1453 | xorl %ebp, %edi # ^d |
1266 | xorl %ecx, %edi # ^b | 1454 | xorl %ecx, %edi # ^b |
1267 | leal -0x359D3E2A(%rax,%r15), %eax # e += RCONST + W[n & 15] | 1455 | addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15] |
1268 | addl %edi, %eax # e += (c ^ d ^ b) | 1456 | addl %edi, %eax # e += (c ^ d ^ b) |
1269 | movl %ebx, %esi # | 1457 | movl %ebx, %esi # |
1270 | roll $5, %esi # rotl32(a,5) | 1458 | roll $5, %esi # rotl32(a,5) |
@@ -1278,7 +1466,7 @@ sha1_process_block64: | |||
1278 | addl %ebx, 84(%rdi) # ctx->hash[1] += b | 1466 | addl %ebx, 84(%rdi) # ctx->hash[1] += b |
1279 | popq %r14 # | 1467 | popq %r14 # |
1280 | addl %ecx, 88(%rdi) # ctx->hash[2] += c | 1468 | addl %ecx, 88(%rdi) # ctx->hash[2] += c |
1281 | popq %r15 # | 1469 | # popq %r15 # |
1282 | addl %edx, 92(%rdi) # ctx->hash[3] += d | 1470 | addl %edx, 92(%rdi) # ctx->hash[3] += d |
1283 | popq %rbx # | 1471 | popq %rbx # |
1284 | addl %ebp, 96(%rdi) # ctx->hash[4] += e | 1472 | addl %ebp, 96(%rdi) # ctx->hash[4] += e |
@@ -1286,4 +1474,13 @@ sha1_process_block64: | |||
1286 | 1474 | ||
1287 | ret | 1475 | ret |
1288 | .size sha1_process_block64, .-sha1_process_block64 | 1476 | .size sha1_process_block64, .-sha1_process_block64 |
1477 | |||
1478 | .section .rodata.cst16.sha1const, "aM", @progbits, 16 | ||
1479 | .balign 16 | ||
1480 | sha1const: | ||
1481 | .long 0x5A827999 | ||
1482 | .long 0x6ED9EBA1 | ||
1483 | .long 0x8F1BBCDC | ||
1484 | .long 0xCA62C1D6 | ||
1485 | |||
1289 | #endif | 1486 | #endif |
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index 901896e6e..a10ac411d 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh | |||
@@ -6,33 +6,104 @@ | |||
6 | # also contains the diff of the generated file. | 6 | # also contains the diff of the generated file. |
7 | exec >hash_md5_sha_x86-64.S | 7 | exec >hash_md5_sha_x86-64.S |
8 | 8 | ||
9 | # There is a way to use XMM registers (which always exist for x86-64!) for W[] | 9 | # Based on http://arctic.org/~dean/crypto/sha1.html. |
10 | # For example, if we load W as follows: | 10 | # ("This SHA1 implementation is public domain.") |
11 | # %xmm0: w[0x0] w[0x1] w[0x2] w[0x3] | 11 | # |
12 | # %xmm4: w[0x4] w[0x5] w[0x6] w[0x7] | 12 | # x86-64 has at least SSE2 vector insns always available. |
13 | # %xmm8: w[0x8] w[0x9] w[0xa] w[0xb] | 13 | # We can use them without any CPUID checks (and without a need |
14 | # %xmm12: w[0xc] w[0xd] w[0xe] w[0xf] | 14 | # for a fallback code if needed insns are not available). |
15 | # then the xor'ing operation to generate next W[0..3] is: | 15 | # This code uses them to calculate W[] ahead of time. |
16 | # movaps %xmm0, %xmmT2 | 16 | # |
17 | # palignr $0x8, %xmm4, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5]) | 17 | # Unfortunately, results are passed from vector unit to |
18 | # # Right-shifts xmm4:xmmT2 by 8 bytes. Writes shifted result to xmmT2. SSSE3 insn. | 18 | # integer ALUs on the stack. MOVD/Q insns to move them directly |
19 | # movaps %xmm0, %xmmT13 | 19 | # from vector to integer registers are slower than store-to-load |
20 | # palignr $0x4,%xmm0,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0]) | 20 | # forwarding in LSU (on Skylake at least). |
21 | # xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13 | 21 | # |
22 | # xmm0 = rol32(xmm0,1) # no such insn, have to use pslld+psrld+or | 22 | # The win against a purely integer code is small on Skylake, |
23 | # and then results can be extracted for use: | 23 | # only about 7-8%. We offload about 1/3 of our operations to the vector unit. |
24 | # movd %xmm0, %esi # new W[0] | 24 | # It can do 4 ops at once in one 128-bit register, |
25 | # pextrd $1, %xmm0, %esi # new W[1] | 25 | # but we have to use x2 of them because of W[0] complication, |
26 | # # SSE4.1 insn. Can use EXTRACTPS (also SSE4.1) | 26 | # SSE2 has no "rotate each word by N bits" insns, |
27 | # pextrd $2, %xmm0, %esi # new W[2] | 27 | # moving data to/from vector unit is clunky, and Skylake |
28 | # pextrd $3, %xmm0, %esi # new W[3] | 28 | # has four integer ALUs unified with three vector ALUs, |
29 | # ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64. | 29 | # which makes pure integer code rather fast, and makes |
30 | # vector ops compete with integer ones. | ||
31 | # | ||
32 | # Zen3, with its separate vector ALUs, wins more, about 12%. | ||
33 | |||
34 | xmmT1="%xmm4" | ||
35 | xmmT2="%xmm5" | ||
36 | xmmRCONST="%xmm6" | ||
37 | xmmALLRCONST="%xmm7" | ||
38 | T=`printf '\t'` | ||
39 | |||
40 | # SSE instructions are longer than 4 bytes on average. | ||
41 | # Intel CPUs (up to Tiger Lake at least) can't decode | ||
42 | # more than 16 bytes of code in one cycle. | ||
43 | # By interleaving SSE code and integer code | ||
44 | # we mostly achieve a situation where 16-byte decode fetch window | ||
45 | # contains 4 (or more) insns. | ||
46 | # | ||
47 | # However. On Skylake, there was no observed difference, | ||
48 | # but on Zen3, non-interleaved code is ~3% faster | ||
49 | # (822 Mb/s versus 795 Mb/s hashing speed). | ||
50 | # Off for now: | ||
51 | interleave=false | ||
52 | |||
53 | INTERLEAVE() { | ||
54 | $interleave || \ | ||
55 | { | ||
56 | # Generate non-interleaved code | ||
57 | # (it should work correctly too) | ||
58 | echo "$1" | ||
59 | echo "$2" | ||
60 | return | ||
61 | } | ||
62 | ( | ||
63 | echo "$1" | grep -v '^$' >"$0.temp1" | ||
64 | echo "$2" | grep -v '^$' >"$0.temp2" | ||
65 | exec 3<"$0.temp1" | ||
66 | exec 4<"$0.temp2" | ||
67 | IFS='' | ||
68 | while :; do | ||
69 | line1='' | ||
70 | line2='' | ||
71 | while :; do | ||
72 | read -r line1 <&3 | ||
73 | if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then | ||
74 | break | ||
75 | fi | ||
76 | echo "$line1" | ||
77 | done | ||
78 | while :; do | ||
79 | read -r line2 <&4 | ||
80 | if test "${line2:0:4}" = "${T}lea"; then | ||
81 | # We use 7-8 byte long forms of LEA. | ||
82 | # Do not interleave them with SSE insns | ||
83 | # which are also long. | ||
84 | echo "$line2" | ||
85 | read -r line2 <&4 | ||
86 | echo "$line2" | ||
87 | continue | ||
88 | fi | ||
89 | if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then | ||
90 | break | ||
91 | fi | ||
92 | echo "$line2" | ||
93 | done | ||
94 | test "$line1$line2" || break | ||
95 | echo "$line1" | ||
96 | echo "$line2" | ||
97 | done | ||
98 | rm "$0.temp1" "$0.temp2" | ||
99 | ) | ||
100 | } | ||
30 | 101 | ||
31 | echo \ | 102 | echo \ |
32 | '### Generated by hash_md5_sha_x86-64.S.sh ### | 103 | "### Generated by hash_md5_sha_x86-64.S.sh ### |
33 | 104 | ||
34 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) | 105 | #if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) |
35 | .section .text.sha1_process_block64,"ax",@progbits | 106 | .section .text.sha1_process_block64, \"ax\", @progbits |
36 | .globl sha1_process_block64 | 107 | .globl sha1_process_block64 |
37 | .hidden sha1_process_block64 | 108 | .hidden sha1_process_block64 |
38 | .type sha1_process_block64, @function | 109 | .type sha1_process_block64, @function |
@@ -41,7 +112,7 @@ echo \ | |||
41 | sha1_process_block64: | 112 | sha1_process_block64: |
42 | pushq %rbp # 1 byte insn | 113 | pushq %rbp # 1 byte insn |
43 | pushq %rbx # 1 byte insn | 114 | pushq %rbx # 1 byte insn |
44 | pushq %r15 # 2 byte insn | 115 | # pushq %r15 # 2 byte insn |
45 | pushq %r14 # 2 byte insn | 116 | pushq %r14 # 2 byte insn |
46 | pushq %r13 # 2 byte insn | 117 | pushq %r13 # 2 byte insn |
47 | pushq %r12 # 2 byte insn | 118 | pushq %r12 # 2 byte insn |
@@ -50,17 +121,13 @@ sha1_process_block64: | |||
50 | #Register and stack use: | 121 | #Register and stack use: |
51 | # eax..edx: a..d | 122 | # eax..edx: a..d |
52 | # ebp: e | 123 | # ebp: e |
53 | # esi,edi: temps | 124 | # esi,edi,r8..r14: temps |
54 | # -32+4*n(%rsp),r8...r15: W[0..7,8..15] | 125 | # r15: unused |
55 | # (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) | 126 | # xmm0..xmm3: W[] |
56 | movl $3, %eax | 127 | # xmm4,xmm5: temps |
57 | 1: | 128 | # xmm6: current round constant |
58 | movq (%rdi,%rax,8), %rsi | 129 | # xmm7: all round constants |
59 | bswapq %rsi | 130 | # -64(%rsp): area for passing RCONST + W[] from vector to integer units |
60 | rolq $32, %rsi | ||
61 | movq %rsi, -32(%rsp,%rax,8) | ||
62 | decl %eax | ||
63 | jns 1b | ||
64 | 131 | ||
65 | movl 80(%rdi), %eax # a = ctx->hash[0] | 132 | movl 80(%rdi), %eax # a = ctx->hash[0] |
66 | movl 84(%rdi), %ebx # b = ctx->hash[1] | 133 | movl 84(%rdi), %ebx # b = ctx->hash[1] |
@@ -68,32 +135,123 @@ sha1_process_block64: | |||
68 | movl 92(%rdi), %edx # d = ctx->hash[3] | 135 | movl 92(%rdi), %edx # d = ctx->hash[3] |
69 | movl 96(%rdi), %ebp # e = ctx->hash[4] | 136 | movl 96(%rdi), %ebp # e = ctx->hash[4] |
70 | 137 | ||
71 | movq 4*8(%rdi), %r8 | 138 | movaps sha1const(%rip), $xmmALLRCONST |
72 | movq 4*10(%rdi), %r10 | 139 | pshufd \$0x00, $xmmALLRCONST, $xmmRCONST |
140 | |||
141 | # Load W[] to xmm registers, byteswapping on the fly. | ||
142 | # | ||
143 | # For iterations 0..15, we pass W[] in rsi,r8..r14 | ||
144 | # for use in RD1As instead of spilling them to stack. | ||
145 | # We lose parallelized addition of RCONST, but LEA | ||
146 | # can do two additions at once, so it is probably a wash. | ||
147 | # (We use rsi instead of rN because this makes two | ||
148 | # LEAs in two first RD1As shorter by one byte). | ||
149 | movq 4*0(%rdi), %rsi | ||
150 | movq 4*2(%rdi), %r8 | ||
151 | bswapq %rsi | ||
73 | bswapq %r8 | 152 | bswapq %r8 |
153 | rolq \$32, %rsi # rsi = W[1]:W[0] | ||
154 | rolq \$32, %r8 # r8 = W[3]:W[2] | ||
155 | movq %rsi, %xmm0 | ||
156 | movq %r8, $xmmT1 | ||
157 | punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3]) | ||
158 | # movaps %xmm0, $xmmT1 # add RCONST, spill to stack | ||
159 | # paddd $xmmRCONST, $xmmT1 | ||
160 | # movups $xmmT1, -64+16*0(%rsp) | ||
161 | |||
162 | movq 4*4(%rdi), %r9 | ||
163 | movq 4*6(%rdi), %r10 | ||
164 | bswapq %r9 | ||
74 | bswapq %r10 | 165 | bswapq %r10 |
75 | movq 4*12(%rdi), %r12 | 166 | rolq \$32, %r9 # r9 = W[5]:W[4] |
76 | movq 4*14(%rdi), %r14 | 167 | rolq \$32, %r10 # r10 = W[7]:W[6] |
168 | movq %r9, %xmm1 | ||
169 | movq %r10, $xmmT1 | ||
170 | punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7]) | ||
171 | |||
172 | movq 4*8(%rdi), %r11 | ||
173 | movq 4*10(%rdi), %r12 | ||
174 | bswapq %r11 | ||
77 | bswapq %r12 | 175 | bswapq %r12 |
176 | rolq \$32, %r11 # r11 = W[9]:W[8] | ||
177 | rolq \$32, %r12 # r12 = W[11]:W[10] | ||
178 | movq %r11, %xmm2 | ||
179 | movq %r12, $xmmT1 | ||
180 | punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) | ||
181 | |||
182 | movq 4*12(%rdi), %r13 | ||
183 | movq 4*14(%rdi), %r14 | ||
184 | bswapq %r13 | ||
78 | bswapq %r14 | 185 | bswapq %r14 |
79 | movl %r8d, %r9d | 186 | rolq \$32, %r13 # r13 = W[13]:W[12] |
80 | shrq $32, %r8 | 187 | rolq \$32, %r14 # r14 = W[15]:W[14] |
81 | movl %r10d, %r11d | 188 | movq %r13, %xmm3 |
82 | shrq $32, %r10 | 189 | movq %r14, $xmmT1 |
83 | movl %r12d, %r13d | 190 | punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) |
84 | shrq $32, %r12 | 191 | " |
85 | movl %r14d, %r15d | 192 | |
86 | shrq $32, %r14 | 193 | PREP() { |
87 | ' | 194 | local xmmW0=$1 |
88 | W32() { | 195 | local xmmW4=$2 |
89 | test "$1" || exit 1 | 196 | local xmmW8=$3 |
90 | test "$1" -lt 0 && exit 1 | 197 | local xmmW12=$4 |
91 | test "$1" -gt 15 && exit 1 | 198 | # the above must be %xmm0..3 in some permutation |
92 | test "$1" -lt 8 && echo "-32+4*$1(%rsp)" | 199 | local dstmem=$5 |
93 | test "$1" -ge 8 && echo "%r${1}d" | 200 | #W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1); |
201 | #W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1); | ||
202 | #W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1); | ||
203 | #W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1); | ||
204 | #W[3] ^= rol(W[0], 1); | ||
205 | echo "# PREP $@ | ||
206 | movaps $xmmW12, $xmmT1 | ||
207 | psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0) | ||
208 | |||
209 | # pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x) | ||
210 | # punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5]) | ||
211 | # same result as above, but shorter and faster: | ||
212 | # pshufd/shufps are subtly different: pshufd takes all dwords from source operand, | ||
213 | # shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one! | ||
214 | movaps $xmmW0, $xmmT2 | ||
215 | shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5]) | ||
216 | |||
217 | xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3]) | ||
218 | xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5]) | ||
219 | xorps $xmmT2, $xmmW0 # ^ | ||
220 | # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup | ||
221 | movaps $xmmW0, $xmmT2 | ||
222 | |||
223 | xorps $xmmT1, $xmmT1 # rol(W0,1): | ||
224 | pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1) | ||
225 | paddd $xmmW0, $xmmW0 # shift left by 1 | ||
226 | psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1 | ||
227 | # W0 = rotated (W[0]..W[3]), still needs W[3] fixup | ||
228 | |||
229 | pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0]) | ||
230 | movaps $xmmT2, $xmmT1 | ||
231 | pslld \$2, $xmmT2 | ||
232 | psrld \$30, $xmmT1 | ||
233 | # xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2) | ||
234 | xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2 | ||
235 | |||
236 | xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2)) | ||
237 | " | ||
238 | # movq $xmmW0, %r8 # high latency (~6 cycles) | ||
239 | # movaps $xmmW0, $xmmT1 | ||
240 | # psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower | ||
241 | # movq $xmmT1, %r10 # high latency | ||
242 | # movq %r8, %r9 | ||
243 | # movq %r10, %r11 | ||
244 | # shrq \$32, %r9 | ||
245 | # shrq \$32, %r11 | ||
246 | # ^^^ slower than passing the results on stack (!!!) | ||
247 | echo " | ||
248 | movaps $xmmW0, $xmmT2 | ||
249 | paddd $xmmRCONST, $xmmT2 | ||
250 | movups $xmmT2, $dstmem | ||
251 | " | ||
94 | } | 252 | } |
95 | 253 | ||
96 | # It's possible to interleave insns in rounds to mostly eliminate | 254 | # It's possible to interleave integer insns in rounds to mostly eliminate |
97 | # dependency chains, but this likely to only help old Pentium-based | 255 | # dependency chains, but this likely to only help old Pentium-based |
98 | # CPUs (ones without OOO, which can only simultaneously execute a pair | 256 | # CPUs (ones without OOO, which can only simultaneously execute a pair |
99 | # of _adjacent_ insns). | 257 | # of _adjacent_ insns). |
@@ -104,28 +262,28 @@ RD1A() { | |||
104 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 | 262 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 |
105 | local n=$(($6)) | 263 | local n=$(($6)) |
106 | local n0=$(((n+0) & 15)) | 264 | local n0=$(((n+0) & 15)) |
265 | local rN=$((7+n0/2)) | ||
107 | echo " | 266 | echo " |
108 | # $n | 267 | # $n |
109 | ";test $n0 = 0 && echo " | 268 | ";test $n0 = 0 && echo " |
110 | # W[0], already in %esi | 269 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] |
111 | ";test $n0 != 0 && test $n0 -lt 8 && echo " | 270 | shrq \$32, %rsi |
112 | movl `W32 $n0`, %esi # W[n] | 271 | ";test $n0 = 1 && echo " |
113 | ";test $n0 -ge 8 && echo " | 272 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] |
114 | # W[n], in %r$n0 | 273 | ";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo " |
274 | leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] | ||
275 | shrq \$32, %r$rN | ||
276 | ";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo " | ||
277 | leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n] | ||
115 | ";echo " | 278 | ";echo " |
116 | movl %e$c, %edi # c | 279 | movl %e$c, %edi # c |
117 | xorl %e$d, %edi # ^d | 280 | xorl %e$d, %edi # ^d |
118 | andl %e$b, %edi # &b | 281 | andl %e$b, %edi # &b |
119 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) | 282 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) |
120 | ";test $n0 -lt 8 && echo " | ||
121 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] | ||
122 | ";test $n0 -ge 8 && echo " | ||
123 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n] | ||
124 | ";echo " | ||
125 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) | 283 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) |
126 | movl %e$a, %esi # | 284 | movl %e$a, %edi # |
127 | roll \$5, %esi # rotl32(a,5) | 285 | roll \$5, %edi # rotl32(a,5) |
128 | addl %esi, %e$e # e += rotl32(a,5) | 286 | addl %edi, %e$e # e += rotl32(a,5) |
129 | rorl \$2, %e$b # b = rotl32(b,30) | 287 | rorl \$2, %e$b # b = rotl32(b,30) |
130 | " | 288 | " |
131 | } | 289 | } |
@@ -138,28 +296,11 @@ local n2=$(((n+2) & 15)) | |||
138 | local n0=$(((n+0) & 15)) | 296 | local n0=$(((n+0) & 15)) |
139 | echo " | 297 | echo " |
140 | # $n | 298 | # $n |
141 | ";test $n0 -lt 8 && echo " | ||
142 | movl `W32 $n13`, %esi # W[(n+13) & 15] | ||
143 | xorl `W32 $n8`, %esi # ^W[(n+8) & 15] | ||
144 | xorl `W32 $n2`, %esi # ^W[(n+2) & 15] | ||
145 | xorl `W32 $n0`, %esi # ^W[n & 15] | ||
146 | roll %esi # | ||
147 | movl %esi, `W32 $n0` # store to W[n & 15] | ||
148 | ";test $n0 -ge 8 && echo " | ||
149 | xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] | ||
150 | xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] | ||
151 | xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] | ||
152 | roll `W32 $n0` # | ||
153 | ";echo " | ||
154 | movl %e$c, %edi # c | 299 | movl %e$c, %edi # c |
155 | xorl %e$d, %edi # ^d | 300 | xorl %e$d, %edi # ^d |
156 | andl %e$b, %edi # &b | 301 | andl %e$b, %edi # &b |
157 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) | 302 | xorl %e$d, %edi # (((c ^ d) & b) ^ d) |
158 | ";test $n0 -lt 8 && echo " | 303 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] |
159 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] | ||
160 | ";test $n0 -ge 8 && echo " | ||
161 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] | ||
162 | ";echo " | ||
163 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) | 304 | addl %edi, %e$e # e += (((c ^ d) & b) ^ d) |
164 | movl %e$a, %esi # | 305 | movl %e$a, %esi # |
165 | roll \$5, %esi # rotl32(a,5) | 306 | roll \$5, %esi # rotl32(a,5) |
@@ -167,13 +308,6 @@ echo " | |||
167 | rorl \$2, %e$b # b = rotl32(b,30) | 308 | rorl \$2, %e$b # b = rotl32(b,30) |
168 | " | 309 | " |
169 | } | 310 | } |
170 | { | ||
171 | RCONST=0x5A827999 | ||
172 | RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; RD1A bx cx dx bp ax 4 | ||
173 | RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9 | ||
174 | RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14 | ||
175 | RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19 | ||
176 | } | grep -v '^$' | ||
177 | 311 | ||
178 | RD2() { | 312 | RD2() { |
179 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 | 313 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 |
@@ -184,27 +318,10 @@ local n2=$(((n+2) & 15)) | |||
184 | local n0=$(((n+0) & 15)) | 318 | local n0=$(((n+0) & 15)) |
185 | echo " | 319 | echo " |
186 | # $n | 320 | # $n |
187 | ";test $n0 -lt 8 && echo " | ||
188 | movl `W32 $n13`, %esi # W[(n+13) & 15] | ||
189 | xorl `W32 $n8`, %esi # ^W[(n+8) & 15] | ||
190 | xorl `W32 $n2`, %esi # ^W[(n+2) & 15] | ||
191 | xorl `W32 $n0`, %esi # ^W[n & 15] | ||
192 | roll %esi # | ||
193 | movl %esi, `W32 $n0` # store to W[n & 15] | ||
194 | ";test $n0 -ge 8 && echo " | ||
195 | xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] | ||
196 | xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] | ||
197 | xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] | ||
198 | roll `W32 $n0` # | ||
199 | ";echo " | ||
200 | movl %e$c, %edi # c | 321 | movl %e$c, %edi # c |
201 | xorl %e$d, %edi # ^d | 322 | xorl %e$d, %edi # ^d |
202 | xorl %e$b, %edi # ^b | 323 | xorl %e$b, %edi # ^b |
203 | ";test $n0 -lt 8 && echo " | 324 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] |
204 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] | ||
205 | ";test $n0 -ge 8 && echo " | ||
206 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] | ||
207 | ";echo " | ||
208 | addl %edi, %e$e # e += (c ^ d ^ b) | 325 | addl %edi, %e$e # e += (c ^ d ^ b) |
209 | movl %e$a, %esi # | 326 | movl %e$a, %esi # |
210 | roll \$5, %esi # rotl32(a,5) | 327 | roll \$5, %esi # rotl32(a,5) |
@@ -212,13 +329,6 @@ echo " | |||
212 | rorl \$2, %e$b # b = rotl32(b,30) | 329 | rorl \$2, %e$b # b = rotl32(b,30) |
213 | " | 330 | " |
214 | } | 331 | } |
215 | { | ||
216 | RCONST=0x6ED9EBA1 | ||
217 | RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24 | ||
218 | RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29 | ||
219 | RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34 | ||
220 | RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39 | ||
221 | } | grep -v '^$' | ||
222 | 332 | ||
223 | RD3() { | 333 | RD3() { |
224 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 | 334 | local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 |
@@ -235,53 +345,82 @@ echo " | |||
235 | andl %e$c, %esi # si: b & c | 345 | andl %e$c, %esi # si: b & c |
236 | andl %e$d, %edi # di: (b | c) & d | 346 | andl %e$d, %edi # di: (b | c) & d |
237 | orl %esi, %edi # ((b | c) & d) | (b & c) | 347 | orl %esi, %edi # ((b | c) & d) | (b & c) |
238 | ";test $n0 -lt 8 && echo " | ||
239 | movl `W32 $n13`, %esi # W[(n+13) & 15] | ||
240 | xorl `W32 $n8`, %esi # ^W[(n+8) & 15] | ||
241 | xorl `W32 $n2`, %esi # ^W[(n+2) & 15] | ||
242 | xorl `W32 $n0`, %esi # ^W[n & 15] | ||
243 | roll %esi # | ||
244 | movl %esi, `W32 $n0` # store to W[n & 15] | ||
245 | ";test $n0 -ge 8 && echo " | ||
246 | xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15] | ||
247 | xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15] | ||
248 | xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15] | ||
249 | roll `W32 $n0` # | ||
250 | ";echo " | ||
251 | addl %edi, %e$e # += ((b | c) & d) | (b & c) | 348 | addl %edi, %e$e # += ((b | c) & d) | (b & c) |
252 | ";test $n0 -lt 8 && echo " | 349 | addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15] |
253 | leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15] | ||
254 | ";test $n0 -ge 8 && echo " | ||
255 | leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15] | ||
256 | ";echo " | ||
257 | movl %e$a, %esi # | 350 | movl %e$a, %esi # |
258 | roll \$5, %esi # rotl32(a,5) | 351 | roll \$5, %esi # rotl32(a,5) |
259 | addl %esi, %e$e # e += rotl32(a,5) | 352 | addl %esi, %e$e # e += rotl32(a,5) |
260 | rorl \$2, %e$b # b = rotl32(b,30) | 353 | rorl \$2, %e$b # b = rotl32(b,30) |
261 | " | 354 | " |
262 | } | 355 | } |
356 | |||
263 | { | 357 | { |
264 | #RCONST=0x8F1BBCDC "out of range for signed 32bit displacement" | 358 | # Round 1 |
265 | RCONST=-0x70E44324 | 359 | RCONST=0x5A827999 |
266 | RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44 | 360 | RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; |
267 | RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49 | 361 | RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; |
268 | RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54 | 362 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` |
269 | RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59 | 363 | b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;` |
270 | } | grep -v '^$' | 364 | INTERLEAVE "$a" "$b" |
365 | a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST" | ||
366 | PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` | ||
367 | b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;` | ||
368 | INTERLEAVE "$a" "$b" | ||
369 | a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` | ||
370 | b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;` | ||
371 | INTERLEAVE "$a" "$b" | ||
372 | |||
373 | # Round 2 | ||
374 | RCONST=0x6ED9EBA1 | ||
375 | a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` | ||
376 | b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;` | ||
377 | INTERLEAVE "$a" "$b" | ||
378 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` | ||
379 | b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;` | ||
380 | INTERLEAVE "$a" "$b" | ||
381 | a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` | ||
382 | b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;` | ||
383 | INTERLEAVE "$a" "$b" | ||
384 | a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST" | ||
385 | PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` | ||
386 | b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;` | ||
387 | INTERLEAVE "$a" "$b" | ||
388 | a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` | ||
389 | b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;` | ||
390 | INTERLEAVE "$a" "$b" | ||
391 | |||
392 | # Round 3 | ||
393 | RCONST=0x8F1BBCDC | ||
394 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` | ||
395 | b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;` | ||
396 | INTERLEAVE "$a" "$b" | ||
397 | a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` | ||
398 | b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;` | ||
399 | INTERLEAVE "$a" "$b" | ||
400 | a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` | ||
401 | b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;` | ||
402 | INTERLEAVE "$a" "$b" | ||
403 | a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST" | ||
404 | PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` | ||
405 | b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;` | ||
406 | INTERLEAVE "$a" "$b" | ||
407 | a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"` | ||
408 | b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;` | ||
409 | INTERLEAVE "$a" "$b" | ||
271 | 410 | ||
272 | # Round 4 has the same logic as round 2, only n and RCONST are different | 411 | # Round 4 has the same logic as round 2, only n and RCONST are different |
273 | { | 412 | RCONST=0xCA62C1D6 |
274 | #RCONST=0xCA62C1D6 "out of range for signed 32bit displacement" | 413 | a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"` |
275 | RCONST=-0x359D3E2A | 414 | b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;` |
276 | RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64 | 415 | INTERLEAVE "$a" "$b" |
277 | RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69 | 416 | a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"` |
278 | RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74 | 417 | b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;` |
279 | RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79 | 418 | INTERLEAVE "$a" "$b" |
280 | # Note: new W[n&15] values generated in last 3 iterations | 419 | a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"` |
281 | # (W[13,14,15]) are unused after each of these iterations. | 420 | b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;` |
282 | # Since we use r8..r15 for W[8..15], this does not matter. | 421 | INTERLEAVE "$a" "$b" |
283 | # If we switch to e.g. using r8..r15 for W[0..7], then saving of W[13,14,15] | 422 | RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75; |
284 | # (the "movl %esi, `W32 $n0`" insn) is a dead store and can be removed. | 423 | RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79; |
285 | } | grep -v '^$' | 424 | } | grep -v '^$' |
286 | 425 | ||
287 | echo " | 426 | echo " |
@@ -292,7 +431,7 @@ echo " | |||
292 | addl %ebx, 84(%rdi) # ctx->hash[1] += b | 431 | addl %ebx, 84(%rdi) # ctx->hash[1] += b |
293 | popq %r14 # | 432 | popq %r14 # |
294 | addl %ecx, 88(%rdi) # ctx->hash[2] += c | 433 | addl %ecx, 88(%rdi) # ctx->hash[2] += c |
295 | popq %r15 # | 434 | # popq %r15 # |
296 | addl %edx, 92(%rdi) # ctx->hash[3] += d | 435 | addl %edx, 92(%rdi) # ctx->hash[3] += d |
297 | popq %rbx # | 436 | popq %rbx # |
298 | addl %ebp, 96(%rdi) # ctx->hash[4] += e | 437 | addl %ebp, 96(%rdi) # ctx->hash[4] += e |
@@ -300,4 +439,13 @@ echo " | |||
300 | 439 | ||
301 | ret | 440 | ret |
302 | .size sha1_process_block64, .-sha1_process_block64 | 441 | .size sha1_process_block64, .-sha1_process_block64 |
442 | |||
443 | .section .rodata.cst16.sha1const, \"aM\", @progbits, 16 | ||
444 | .balign 16 | ||
445 | sha1const: | ||
446 | .long 0x5A827999 | ||
447 | .long 0x6ED9EBA1 | ||
448 | .long 0x8F1BBCDC | ||
449 | .long 0xCA62C1D6 | ||
450 | |||
303 | #endif" | 451 | #endif" |
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index 33cc3bf7f..b32029360 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S | |||
@@ -20,7 +20,7 @@ | |||
20 | #define extr128_32 pextrd | 20 | #define extr128_32 pextrd |
21 | //#define extr128_32 extractps # not shorter | 21 | //#define extr128_32 extractps # not shorter |
22 | 22 | ||
23 | .section .text.sha1_process_block64_shaNI,"ax",@progbits | 23 | .section .text.sha1_process_block64_shaNI, "ax", @progbits |
24 | .globl sha1_process_block64_shaNI | 24 | .globl sha1_process_block64_shaNI |
25 | .hidden sha1_process_block64_shaNI | 25 | .hidden sha1_process_block64_shaNI |
26 | .type sha1_process_block64_shaNI, @function | 26 | .type sha1_process_block64_shaNI, @function |
@@ -32,41 +32,42 @@ | |||
32 | #define MSG1 %xmm4 | 32 | #define MSG1 %xmm4 |
33 | #define MSG2 %xmm5 | 33 | #define MSG2 %xmm5 |
34 | #define MSG3 %xmm6 | 34 | #define MSG3 %xmm6 |
35 | #define SHUF_MASK %xmm7 | ||
36 | 35 | ||
37 | .balign 8 # allow decoders to fetch at least 2 first insns | 36 | .balign 8 # allow decoders to fetch at least 2 first insns |
38 | sha1_process_block64_shaNI: | 37 | sha1_process_block64_shaNI: |
39 | /* load initial hash values */ | 38 | /* load initial hash values */ |
40 | |||
41 | xor128 E0, E0 | ||
42 | movu128 80(%rdi), ABCD | 39 | movu128 80(%rdi), ABCD |
40 | xor128 E0, E0 | ||
43 | pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word | 41 | pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word |
44 | shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD | 42 | shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD |
45 | 43 | ||
46 | mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK | 44 | mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7 |
45 | |||
46 | movu128 0*16(%rdi), MSG0 | ||
47 | pshufb %xmm7, MSG0 | ||
48 | movu128 1*16(%rdi), MSG1 | ||
49 | pshufb %xmm7, MSG1 | ||
50 | movu128 2*16(%rdi), MSG2 | ||
51 | pshufb %xmm7, MSG2 | ||
52 | movu128 3*16(%rdi), MSG3 | ||
53 | pshufb %xmm7, MSG3 | ||
47 | 54 | ||
48 | /* Save hash values for addition after rounds */ | 55 | /* Save hash values for addition after rounds */ |
49 | mova128 E0, %xmm9 | 56 | mova128 E0, %xmm7 |
50 | mova128 ABCD, %xmm8 | 57 | mova128 ABCD, %xmm8 |
51 | 58 | ||
52 | /* Rounds 0-3 */ | 59 | /* Rounds 0-3 */ |
53 | movu128 0*16(%rdi), MSG0 | ||
54 | pshufb SHUF_MASK, MSG0 | ||
55 | paddd MSG0, E0 | 60 | paddd MSG0, E0 |
56 | mova128 ABCD, E1 | 61 | mova128 ABCD, E1 |
57 | sha1rnds4 $0, E0, ABCD | 62 | sha1rnds4 $0, E0, ABCD |
58 | 63 | ||
59 | /* Rounds 4-7 */ | 64 | /* Rounds 4-7 */ |
60 | movu128 1*16(%rdi), MSG1 | ||
61 | pshufb SHUF_MASK, MSG1 | ||
62 | sha1nexte MSG1, E1 | 65 | sha1nexte MSG1, E1 |
63 | mova128 ABCD, E0 | 66 | mova128 ABCD, E0 |
64 | sha1rnds4 $0, E1, ABCD | 67 | sha1rnds4 $0, E1, ABCD |
65 | sha1msg1 MSG1, MSG0 | 68 | sha1msg1 MSG1, MSG0 |
66 | 69 | ||
67 | /* Rounds 8-11 */ | 70 | /* Rounds 8-11 */ |
68 | movu128 2*16(%rdi), MSG2 | ||
69 | pshufb SHUF_MASK, MSG2 | ||
70 | sha1nexte MSG2, E0 | 71 | sha1nexte MSG2, E0 |
71 | mova128 ABCD, E1 | 72 | mova128 ABCD, E1 |
72 | sha1rnds4 $0, E0, ABCD | 73 | sha1rnds4 $0, E0, ABCD |
@@ -74,8 +75,6 @@ sha1_process_block64_shaNI: | |||
74 | xor128 MSG2, MSG0 | 75 | xor128 MSG2, MSG0 |
75 | 76 | ||
76 | /* Rounds 12-15 */ | 77 | /* Rounds 12-15 */ |
77 | movu128 3*16(%rdi), MSG3 | ||
78 | pshufb SHUF_MASK, MSG3 | ||
79 | sha1nexte MSG3, E1 | 78 | sha1nexte MSG3, E1 |
80 | mova128 ABCD, E0 | 79 | mova128 ABCD, E0 |
81 | sha1msg2 MSG3, MSG0 | 80 | sha1msg2 MSG3, MSG0 |
@@ -206,7 +205,7 @@ sha1_process_block64_shaNI: | |||
206 | sha1rnds4 $3, E1, ABCD | 205 | sha1rnds4 $3, E1, ABCD |
207 | 206 | ||
208 | /* Add current hash values with previously saved */ | 207 | /* Add current hash values with previously saved */ |
209 | sha1nexte %xmm9, E0 | 208 | sha1nexte %xmm7, E0 |
210 | paddd %xmm8, ABCD | 209 | paddd %xmm8, ABCD |
211 | 210 | ||
212 | /* Write hash values back in the correct order */ | 211 | /* Write hash values back in the correct order */ |
@@ -217,8 +216,8 @@ sha1_process_block64_shaNI: | |||
217 | ret | 216 | ret |
218 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | 217 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI |
219 | 218 | ||
220 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | 219 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 |
221 | .align 16 | 220 | .balign 16 |
222 | PSHUFFLE_BYTE_FLIP_MASK: | 221 | PSHUFFLE_BYTE_FLIP_MASK: |
223 | .octa 0x000102030405060708090a0b0c0d0e0f | 222 | .octa 0x000102030405060708090a0b0c0d0e0f |
224 | 223 | ||
diff --git a/libbb/lineedit.c b/libbb/lineedit.c index 8abc87976..778511d16 100644 --- a/libbb/lineedit.c +++ b/libbb/lineedit.c | |||
@@ -2274,17 +2274,41 @@ static int lineedit_read_key(char *read_key_buffer, int timeout) | |||
2274 | #endif | 2274 | #endif |
2275 | 2275 | ||
2276 | fflush_all(); | 2276 | fflush_all(); |
2277 | while (1) { | 2277 | for (;;) { |
2278 | /* Wait for input. TIMEOUT = -1 makes read_key wait even | 2278 | /* Wait for input. TIMEOUT = -1 makes read_key wait even |
2279 | * on nonblocking stdin, TIMEOUT = 50 makes sure we won't | 2279 | * on nonblocking stdin, TIMEOUT = 50 makes sure we won't |
2280 | * insist on full MB_CUR_MAX buffer to declare input like | 2280 | * insist on full MB_CUR_MAX buffer to declare input like |
2281 | * "\xff\n",pause,"ls\n" invalid and thus won't lose "ls". | 2281 | * "\xff\n",pause,"ls\n" invalid and thus won't lose "ls". |
2282 | * | 2282 | * |
2283 | * If LI_INTERRUPTIBLE, return -1 if got EINTR in poll() | ||
2284 | * inside read_key, or if bb_got_signal != 0 (IOW: if signal | ||
2285 | * arrived before poll() is reached). | ||
2286 | * | ||
2283 | * Note: read_key sets errno to 0 on success. | 2287 | * Note: read_key sets errno to 0 on success. |
2284 | */ | 2288 | */ |
2285 | IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;) | 2289 | for (;;) { |
2286 | ic = read_key(STDIN_FILENO, read_key_buffer, timeout); | 2290 | if ((state->flags & LI_INTERRUPTIBLE) && bb_got_signal) { |
2287 | IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;) | 2291 | errno = EINTR; |
2292 | return -1; | ||
2293 | } | ||
2294 | //FIXME: still races here with signals, but small window to poll() inside read_key | ||
2295 | IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;) | ||
2296 | /* errno = 0; - read_key does this itself */ | ||
2297 | ic = read_key(STDIN_FILENO, read_key_buffer, timeout); | ||
2298 | IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;) | ||
2299 | if (errno != EINTR) | ||
2300 | break; | ||
2301 | if (state->flags & LI_INTERRUPTIBLE) { | ||
2302 | /* LI_INTERRUPTIBLE bails out on EINTR, | ||
2303 | * but nothing really guarantees that bb_got_signal | ||
2304 | * is nonzero. Follow the least surprise principle: | ||
2305 | */ | ||
2306 | if (bb_got_signal == 0) | ||
2307 | bb_got_signal = 255; | ||
2308 | goto ret; | ||
2309 | } | ||
2310 | } | ||
2311 | |||
2288 | if (errno) { | 2312 | if (errno) { |
2289 | #if ENABLE_UNICODE_SUPPORT | 2313 | #if ENABLE_UNICODE_SUPPORT |
2290 | if (errno == EAGAIN && unicode_idx != 0) | 2314 | if (errno == EAGAIN && unicode_idx != 0) |
@@ -2352,7 +2376,7 @@ static int lineedit_read_key(char *read_key_buffer, int timeout) | |||
2352 | #endif | 2376 | #endif |
2353 | break; | 2377 | break; |
2354 | } | 2378 | } |
2355 | 2379 | ret: | |
2356 | return ic; | 2380 | return ic; |
2357 | } | 2381 | } |
2358 | 2382 | ||
diff --git a/libbb/read_key.c b/libbb/read_key.c index 03b7da656..cf8ed411e 100644 --- a/libbb/read_key.c +++ b/libbb/read_key.c | |||
@@ -126,7 +126,10 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout) | |||
126 | * if fd can be in non-blocking mode. | 126 | * if fd can be in non-blocking mode. |
127 | */ | 127 | */ |
128 | if (timeout >= -1) { | 128 | if (timeout >= -1) { |
129 | if (safe_poll(&pfd, 1, timeout) == 0) { | 129 | n = poll(&pfd, 1, timeout); |
130 | if (n < 0 && errno == EINTR) | ||
131 | return n; | ||
132 | if (n == 0) { | ||
130 | /* Timed out */ | 133 | /* Timed out */ |
131 | errno = EAGAIN; | 134 | errno = EAGAIN; |
132 | return -1; | 135 | return -1; |
@@ -138,7 +141,7 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout) | |||
138 | * When we were reading 3 bytes here, we were eating | 141 | * When we were reading 3 bytes here, we were eating |
139 | * "li" too, and cat was getting wrong input. | 142 | * "li" too, and cat was getting wrong input. |
140 | */ | 143 | */ |
141 | n = safe_read(fd, buffer, 1); | 144 | n = read(fd, buffer, 1); |
142 | if (n <= 0) | 145 | if (n <= 0) |
143 | return -1; | 146 | return -1; |
144 | } | 147 | } |
@@ -284,6 +287,16 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout) | |||
284 | goto start_over; | 287 | goto start_over; |
285 | } | 288 | } |
286 | 289 | ||
290 | int64_t FAST_FUNC safe_read_key(int fd, char *buffer, int timeout) | ||
291 | { | ||
292 | int64_t r; | ||
293 | do { | ||
294 | /* errno = 0; - read_key does this itself */ | ||
295 | r = read_key(fd, buffer, timeout); | ||
296 | } while (errno == EINTR); | ||
297 | return r; | ||
298 | } | ||
299 | |||
287 | void FAST_FUNC read_key_ungets(char *buffer, const char *str, unsigned len) | 300 | void FAST_FUNC read_key_ungets(char *buffer, const char *str, unsigned len) |
288 | { | 301 | { |
289 | unsigned cur_len = (unsigned char)buffer[0]; | 302 | unsigned cur_len = (unsigned char)buffer[0]; |
diff --git a/libbb/setup_environment.c b/libbb/setup_environment.c index df2983958..3549e2099 100644 --- a/libbb/setup_environment.c +++ b/libbb/setup_environment.c | |||
@@ -36,9 +36,8 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass | |||
36 | 36 | ||
37 | /* Change the current working directory to be the home directory | 37 | /* Change the current working directory to be the home directory |
38 | * of the user */ | 38 | * of the user */ |
39 | if (!(flags & SETUP_ENV_NO_CHDIR)) { | 39 | if (flags & SETUP_ENV_CHDIR) { |
40 | if (chdir(pw->pw_dir) != 0) { | 40 | if (chdir_or_warn(pw->pw_dir) != 0) { |
41 | bb_error_msg("can't change directory to '%s'", pw->pw_dir); | ||
42 | xchdir((flags & SETUP_ENV_TO_TMP) ? "/tmp" : "/"); | 41 | xchdir((flags & SETUP_ENV_TO_TMP) ? "/tmp" : "/"); |
43 | } | 42 | } |
44 | } | 43 | } |
@@ -59,7 +58,8 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass | |||
59 | //xsetenv("LOGNAME", pw->pw_name); | 58 | //xsetenv("LOGNAME", pw->pw_name); |
60 | //xsetenv("HOME", pw->pw_dir); | 59 | //xsetenv("HOME", pw->pw_dir); |
61 | //xsetenv("SHELL", shell); | 60 | //xsetenv("SHELL", shell); |
62 | } else if (flags & SETUP_ENV_CHANGEENV) { | 61 | } else |
62 | if (flags & (SETUP_ENV_CHANGEENV|SETUP_ENV_CHANGEENV_LOGNAME)) { | ||
63 | /* Set HOME, SHELL, and if not becoming a super-user | 63 | /* Set HOME, SHELL, and if not becoming a super-user |
64 | * or if SETUP_ENV_CHANGEENV_LOGNAME, USER and LOGNAME. */ | 64 | * or if SETUP_ENV_CHANGEENV_LOGNAME, USER and LOGNAME. */ |
65 | if ((flags & SETUP_ENV_CHANGEENV_LOGNAME) || pw->pw_uid != 0) { | 65 | if ((flags & SETUP_ENV_CHANGEENV_LOGNAME) || pw->pw_uid != 0) { |
diff --git a/libbb/xfuncs_printf.c b/libbb/xfuncs_printf.c index aae3b092d..a9add8ab2 100644 --- a/libbb/xfuncs_printf.c +++ b/libbb/xfuncs_printf.c | |||
@@ -417,11 +417,18 @@ void FAST_FUNC xseteuid(uid_t euid) | |||
417 | if (seteuid(euid)) bb_simple_perror_msg_and_die("seteuid"); | 417 | if (seteuid(euid)) bb_simple_perror_msg_and_die("seteuid"); |
418 | } | 418 | } |
419 | 419 | ||
420 | int FAST_FUNC chdir_or_warn(const char *path) | ||
421 | { | ||
422 | int r = chdir(path); | ||
423 | if (r != 0) | ||
424 | bb_perror_msg("can't change directory to '%s'", path); | ||
425 | return r; | ||
426 | } | ||
420 | // Die if we can't chdir to a new path. | 427 | // Die if we can't chdir to a new path. |
421 | void FAST_FUNC xchdir(const char *path) | 428 | void FAST_FUNC xchdir(const char *path) |
422 | { | 429 | { |
423 | if (chdir(path)) | 430 | if (chdir_or_warn(path) != 0) |
424 | bb_perror_msg_and_die("can't change directory to '%s'", path); | 431 | xfunc_die(); |
425 | } | 432 | } |
426 | 433 | ||
427 | void FAST_FUNC xfchdir(int fd) | 434 | void FAST_FUNC xfchdir(int fd) |
diff --git a/loginutils/login.c b/loginutils/login.c index cac4349b2..332238181 100644 --- a/loginutils/login.c +++ b/loginutils/login.c | |||
@@ -564,7 +564,9 @@ int login_main(int argc UNUSED_PARAM, char **argv) | |||
564 | 564 | ||
565 | change_identity(pw); | 565 | change_identity(pw); |
566 | setup_environment(pw->pw_shell, | 566 | setup_environment(pw->pw_shell, |
567 | (!(opt & LOGIN_OPT_p) * SETUP_ENV_CLEARENV) + SETUP_ENV_CHANGEENV, | 567 | (!(opt & LOGIN_OPT_p) * SETUP_ENV_CLEARENV) |
568 | + SETUP_ENV_CHANGEENV | ||
569 | + SETUP_ENV_CHDIR, | ||
568 | pw); | 570 | pw); |
569 | 571 | ||
570 | #if ENABLE_PAM | 572 | #if ENABLE_PAM |
diff --git a/loginutils/su.c b/loginutils/su.c index 647c97fb1..b61e3753a 100644 --- a/loginutils/su.c +++ b/loginutils/su.c | |||
@@ -177,10 +177,9 @@ int su_main(int argc UNUSED_PARAM, char **argv) | |||
177 | 177 | ||
178 | change_identity(pw); | 178 | change_identity(pw); |
179 | setup_environment(opt_shell, | 179 | setup_environment(opt_shell, |
180 | ((flags & SU_OPT_l) / SU_OPT_l * SETUP_ENV_CLEARENV) | 180 | ((flags & SU_OPT_l) ? (SETUP_ENV_CLEARENV + SETUP_ENV_CHDIR) : 0) |
181 | + (!(flags & SU_OPT_mp) * SETUP_ENV_CHANGEENV) | 181 | + (!(flags & SU_OPT_mp) * SETUP_ENV_CHANGEENV), |
182 | + (!(flags & SU_OPT_l) * SETUP_ENV_NO_CHDIR), | 182 | pw); |
183 | pw); | ||
184 | IF_SELINUX(set_current_security_context(NULL);) | 183 | IF_SELINUX(set_current_security_context(NULL);) |
185 | 184 | ||
186 | if (opt_command) { | 185 | if (opt_command) { |
diff --git a/loginutils/sulogin.c b/loginutils/sulogin.c index c9817960c..681022acb 100644 --- a/loginutils/sulogin.c +++ b/loginutils/sulogin.c | |||
@@ -94,10 +94,13 @@ int sulogin_main(int argc UNUSED_PARAM, char **argv) | |||
94 | shell = pwd->pw_shell; | 94 | shell = pwd->pw_shell; |
95 | 95 | ||
96 | /* util-linux 2.36.1 compat: cd to root's HOME, set a few envvars */ | 96 | /* util-linux 2.36.1 compat: cd to root's HOME, set a few envvars */ |
97 | setup_environment(shell, SETUP_ENV_CHANGEENV | SETUP_ENV_CHANGEENV_LOGNAME, pwd); | 97 | setup_environment(shell, 0 |
98 | + SETUP_ENV_CHANGEENV_LOGNAME | ||
99 | + SETUP_ENV_CHDIR | ||
100 | , pwd); | ||
98 | // no SETUP_ENV_CLEARENV | 101 | // no SETUP_ENV_CLEARENV |
99 | // SETUP_ENV_CHANGEENV[+LOGNAME] - set HOME, SHELL, USER,and LOGNAME | 102 | // SETUP_ENV_CHANGEENV_LOGNAME - set HOME, SHELL, USER,and LOGNAME |
100 | // no SETUP_ENV_NO_CHDIR - IOW: cd to $HOME | 103 | // SETUP_ENV_CHDIR - cd to $HOME |
101 | 104 | ||
102 | /* util-linux 2.36.1 compat: steal ctty if we don't have it yet | 105 | /* util-linux 2.36.1 compat: steal ctty if we don't have it yet |
103 | * (yes, util-linux uses force=1) */ | 106 | * (yes, util-linux uses force=1) */ |
diff --git a/miscutils/bc.c b/miscutils/bc.c index e3f7573c9..fe555d018 100644 --- a/miscutils/bc.c +++ b/miscutils/bc.c | |||
@@ -6011,7 +6011,7 @@ static BC_STATUS zxc_program_assign(char inst) | |||
6011 | #endif | 6011 | #endif |
6012 | 6012 | ||
6013 | if (ib || sc || left->t == XC_RESULT_OBASE) { | 6013 | if (ib || sc || left->t == XC_RESULT_OBASE) { |
6014 | static const char *const msg[] = { | 6014 | static const char *const msg[] ALIGN_PTR = { |
6015 | "bad ibase; must be [2,16]", //XC_RESULT_IBASE | 6015 | "bad ibase; must be [2,16]", //XC_RESULT_IBASE |
6016 | "bad obase; must be [2,"BC_MAX_OBASE_STR"]", //XC_RESULT_OBASE | 6016 | "bad obase; must be [2,"BC_MAX_OBASE_STR"]", //XC_RESULT_OBASE |
6017 | "bad scale; must be [0,"BC_MAX_SCALE_STR"]", //XC_RESULT_SCALE | 6017 | "bad scale; must be [0,"BC_MAX_SCALE_STR"]", //XC_RESULT_SCALE |
diff --git a/miscutils/crond.c b/miscutils/crond.c index b74427351..1965af656 100644 --- a/miscutils/crond.c +++ b/miscutils/crond.c | |||
@@ -675,8 +675,7 @@ static void change_user(struct passwd *pas) | |||
675 | { | 675 | { |
676 | /* careful: we're after vfork! */ | 676 | /* careful: we're after vfork! */ |
677 | change_identity(pas); /* - initgroups, setgid, setuid */ | 677 | change_identity(pas); /* - initgroups, setgid, setuid */ |
678 | if (chdir(pas->pw_dir) < 0) { | 678 | if (chdir_or_warn(pas->pw_dir) != 0) { |
679 | bb_error_msg("can't change directory to '%s'", pas->pw_dir); | ||
680 | xchdir(CRON_DIR); | 679 | xchdir(CRON_DIR); |
681 | } | 680 | } |
682 | } | 681 | } |
diff --git a/miscutils/crontab.c b/miscutils/crontab.c index 411a18a50..1111f4d54 100644 --- a/miscutils/crontab.c +++ b/miscutils/crontab.c | |||
@@ -55,8 +55,8 @@ static void edit_file(const struct passwd *pas, const char *file) | |||
55 | /* initgroups, setgid, setuid */ | 55 | /* initgroups, setgid, setuid */ |
56 | change_identity(pas); | 56 | change_identity(pas); |
57 | setup_environment(pas->pw_shell, | 57 | setup_environment(pas->pw_shell, |
58 | SETUP_ENV_CHANGEENV | SETUP_ENV_TO_TMP, | 58 | SETUP_ENV_CHANGEENV | SETUP_ENV_TO_TMP | SETUP_ENV_CHDIR, |
59 | pas); | 59 | pas); |
60 | ptr = getenv("VISUAL"); | 60 | ptr = getenv("VISUAL"); |
61 | if (!ptr) { | 61 | if (!ptr) { |
62 | ptr = getenv("EDITOR"); | 62 | ptr = getenv("EDITOR"); |
diff --git a/miscutils/devfsd.c b/miscutils/devfsd.c index 839d00fd0..fb9ebcf60 100644 --- a/miscutils/devfsd.c +++ b/miscutils/devfsd.c | |||
@@ -928,7 +928,7 @@ static void action_compat(const struct devfsd_notify_struct *info, unsigned int | |||
928 | unsigned int i; | 928 | unsigned int i; |
929 | char rewind_; | 929 | char rewind_; |
930 | /* 1 to 5 "scsi/" , 6 to 9 "ide/host" */ | 930 | /* 1 to 5 "scsi/" , 6 to 9 "ide/host" */ |
931 | static const char *const fmt[] = { | 931 | static const char *const fmt[] ALIGN_PTR = { |
932 | NULL , | 932 | NULL , |
933 | "sg/c%db%dt%du%d", /* scsi/generic */ | 933 | "sg/c%db%dt%du%d", /* scsi/generic */ |
934 | "sd/c%db%dt%du%d", /* scsi/disc */ | 934 | "sd/c%db%dt%du%d", /* scsi/disc */ |
@@ -1468,7 +1468,7 @@ const char *get_old_name(const char *devname, unsigned int namelen, | |||
1468 | const char *pty1; | 1468 | const char *pty1; |
1469 | const char *pty2; | 1469 | const char *pty2; |
1470 | /* 1 to 5 "scsi/" , 6 to 9 "ide/host", 10 sbp/, 11 vcc/, 12 pty/ */ | 1470 | /* 1 to 5 "scsi/" , 6 to 9 "ide/host", 10 sbp/, 11 vcc/, 12 pty/ */ |
1471 | static const char *const fmt[] = { | 1471 | static const char *const fmt[] ALIGN_PTR = { |
1472 | NULL , | 1472 | NULL , |
1473 | "sg%u", /* scsi/generic */ | 1473 | "sg%u", /* scsi/generic */ |
1474 | NULL, /* scsi/disc */ | 1474 | NULL, /* scsi/disc */ |
diff --git a/miscutils/hexedit.c b/miscutils/hexedit.c index f8ff9b62b..15ad78377 100644 --- a/miscutils/hexedit.c +++ b/miscutils/hexedit.c | |||
@@ -292,7 +292,7 @@ int hexedit_main(int argc UNUSED_PARAM, char **argv) | |||
292 | fflush_all(); | 292 | fflush_all(); |
293 | G.in_read_key = 1; | 293 | G.in_read_key = 1; |
294 | if (!bb_got_signal) | 294 | if (!bb_got_signal) |
295 | key = read_key(STDIN_FILENO, G.read_key_buffer, -1); | 295 | key = safe_read_key(STDIN_FILENO, G.read_key_buffer, -1); |
296 | G.in_read_key = 0; | 296 | G.in_read_key = 0; |
297 | if (bb_got_signal) | 297 | if (bb_got_signal) |
298 | key = CTRL('X'); | 298 | key = CTRL('X'); |
diff --git a/miscutils/i2c_tools.c b/miscutils/i2c_tools.c index e3741eeba..da26f5e19 100644 --- a/miscutils/i2c_tools.c +++ b/miscutils/i2c_tools.c | |||
@@ -120,6 +120,7 @@ static int32_t i2c_smbus_access(int fd, char read_write, uint8_t cmd, | |||
120 | return ioctl(fd, I2C_SMBUS, &args); | 120 | return ioctl(fd, I2C_SMBUS, &args); |
121 | } | 121 | } |
122 | 122 | ||
123 | #if ENABLE_I2CGET || ENABLE_I2CSET || ENABLE_I2CDUMP || ENABLE_I2CDETECT | ||
123 | static int32_t i2c_smbus_read_byte(int fd) | 124 | static int32_t i2c_smbus_read_byte(int fd) |
124 | { | 125 | { |
125 | union i2c_smbus_data data; | 126 | union i2c_smbus_data data; |
@@ -131,6 +132,7 @@ static int32_t i2c_smbus_read_byte(int fd) | |||
131 | 132 | ||
132 | return data.byte; | 133 | return data.byte; |
133 | } | 134 | } |
135 | #endif | ||
134 | 136 | ||
135 | #if ENABLE_I2CGET || ENABLE_I2CSET || ENABLE_I2CDUMP | 137 | #if ENABLE_I2CGET || ENABLE_I2CSET || ENABLE_I2CDUMP |
136 | static int32_t i2c_smbus_write_byte(int fd, uint8_t val) | 138 | static int32_t i2c_smbus_write_byte(int fd, uint8_t val) |
diff --git a/miscutils/less.c b/miscutils/less.c index 6da991a0e..842031ca3 100644 --- a/miscutils/less.c +++ b/miscutils/less.c | |||
@@ -1177,9 +1177,9 @@ static int64_t getch_nowait(void) | |||
1177 | #endif | 1177 | #endif |
1178 | } | 1178 | } |
1179 | 1179 | ||
1180 | /* We have kbd_fd in O_NONBLOCK mode, read inside read_key() | 1180 | /* We have kbd_fd in O_NONBLOCK mode, read inside safe_read_key() |
1181 | * would not block even if there is no input available */ | 1181 | * would not block even if there is no input available */ |
1182 | key64 = read_key(kbd_fd, kbd_input, /*timeout off:*/ -2); | 1182 | key64 = safe_read_key(kbd_fd, kbd_input, /*timeout off:*/ -2); |
1183 | if ((int)key64 == -1) { | 1183 | if ((int)key64 == -1) { |
1184 | if (errno == EAGAIN) { | 1184 | if (errno == EAGAIN) { |
1185 | /* No keyboard input available. Since poll() did return, | 1185 | /* No keyboard input available. Since poll() did return, |
diff --git a/miscutils/man.c b/miscutils/man.c index be3b2a000..c3efe4484 100644 --- a/miscutils/man.c +++ b/miscutils/man.c | |||
@@ -328,7 +328,7 @@ int man_main(int argc UNUSED_PARAM, char **argv) | |||
328 | } | 328 | } |
329 | #else | 329 | #else |
330 | if (!man_path_list) { | 330 | if (!man_path_list) { |
331 | static const char *const mpl[] = { "/usr/man", "/usr/share/man", NULL }; | 331 | static const char *const mpl[] ALIGN_PTR = { "/usr/man", "/usr/share/man", NULL }; |
332 | man_path_list = (char**)mpl; | 332 | man_path_list = (char**)mpl; |
333 | /*count_mp = 2; - not used below anyway */ | 333 | /*count_mp = 2; - not used below anyway */ |
334 | } | 334 | } |
diff --git a/modutils/modutils-24.c b/modutils/modutils-24.c index ac8632481..d0bc2a6ef 100644 --- a/modutils/modutils-24.c +++ b/modutils/modutils-24.c | |||
@@ -3458,7 +3458,7 @@ static int obj_load_progbits(char *image, size_t image_size, struct obj_file *f, | |||
3458 | 3458 | ||
3459 | static void hide_special_symbols(struct obj_file *f) | 3459 | static void hide_special_symbols(struct obj_file *f) |
3460 | { | 3460 | { |
3461 | static const char *const specials[] = { | 3461 | static const char *const specials[] ALIGN_PTR = { |
3462 | SPFX "cleanup_module", | 3462 | SPFX "cleanup_module", |
3463 | SPFX "init_module", | 3463 | SPFX "init_module", |
3464 | SPFX "kernel_version", | 3464 | SPFX "kernel_version", |
@@ -3484,7 +3484,7 @@ static int obj_gpl_license(struct obj_file *f, const char **license) | |||
3484 | * linux/include/linux/module.h. Checking for leading "GPL" will not | 3484 | * linux/include/linux/module.h. Checking for leading "GPL" will not |
3485 | * work, somebody will use "GPL sucks, this is proprietary". | 3485 | * work, somebody will use "GPL sucks, this is proprietary". |
3486 | */ | 3486 | */ |
3487 | static const char *const gpl_licenses[] = { | 3487 | static const char *const gpl_licenses[] ALIGN_PTR = { |
3488 | "GPL", | 3488 | "GPL", |
3489 | "GPL v2", | 3489 | "GPL v2", |
3490 | "GPL and additional rights", | 3490 | "GPL and additional rights", |
diff --git a/networking/httpd.c b/networking/httpd.c index 5f7b3a4dd..59b4a769c 100644 --- a/networking/httpd.c +++ b/networking/httpd.c | |||
@@ -1707,8 +1707,7 @@ static void send_cgi_and_exit( | |||
1707 | script = last_slash; | 1707 | script = last_slash; |
1708 | if (script != url) { /* paranoia */ | 1708 | if (script != url) { /* paranoia */ |
1709 | *script = '\0'; | 1709 | *script = '\0'; |
1710 | if (chdir(url + 1) != 0) { | 1710 | if (chdir_or_warn(url + 1) != 0) { |
1711 | bb_perror_msg("can't change directory to '%s'", url + 1); | ||
1712 | goto error_execing_cgi; | 1711 | goto error_execing_cgi; |
1713 | } | 1712 | } |
1714 | // not needed: *script = '/'; | 1713 | // not needed: *script = '/'; |
diff --git a/networking/ifupdown.c b/networking/ifupdown.c index 737113dd4..6c4ae27f2 100644 --- a/networking/ifupdown.c +++ b/networking/ifupdown.c | |||
@@ -532,7 +532,7 @@ static int FAST_FUNC v4tunnel_down(struct interface_defn_t * ifd, execfn * exec) | |||
532 | } | 532 | } |
533 | # endif | 533 | # endif |
534 | 534 | ||
535 | static const struct method_t methods6[] = { | 535 | static const struct method_t methods6[] ALIGN_PTR = { |
536 | # if ENABLE_FEATURE_IFUPDOWN_IP | 536 | # if ENABLE_FEATURE_IFUPDOWN_IP |
537 | { "v4tunnel" , v4tunnel_up , v4tunnel_down , }, | 537 | { "v4tunnel" , v4tunnel_up , v4tunnel_down , }, |
538 | # endif | 538 | # endif |
@@ -627,7 +627,7 @@ struct dhcp_client_t { | |||
627 | const char *stopcmd; | 627 | const char *stopcmd; |
628 | }; | 628 | }; |
629 | 629 | ||
630 | static const struct dhcp_client_t ext_dhcp_clients[] = { | 630 | static const struct dhcp_client_t ext_dhcp_clients[] ALIGN_PTR = { |
631 | { "dhcpcd", | 631 | { "dhcpcd", |
632 | "dhcpcd[[ -h %hostname%]][[ -i %vendor%]][[ -I %client%]][[ -l %leasetime%]] %iface%", | 632 | "dhcpcd[[ -h %hostname%]][[ -i %vendor%]][[ -I %client%]][[ -l %leasetime%]] %iface%", |
633 | "dhcpcd -k %iface%", | 633 | "dhcpcd -k %iface%", |
@@ -774,7 +774,7 @@ static int FAST_FUNC wvdial_down(struct interface_defn_t *ifd, execfn *exec) | |||
774 | "-p /var/run/wvdial.%iface% -s 2", ifd, exec); | 774 | "-p /var/run/wvdial.%iface% -s 2", ifd, exec); |
775 | } | 775 | } |
776 | 776 | ||
777 | static const struct method_t methods[] = { | 777 | static const struct method_t methods[] ALIGN_PTR = { |
778 | { "manual" , manual_up_down, manual_up_down, }, | 778 | { "manual" , manual_up_down, manual_up_down, }, |
779 | { "wvdial" , wvdial_up , wvdial_down , }, | 779 | { "wvdial" , wvdial_up , wvdial_down , }, |
780 | { "ppp" , ppp_up , ppp_down , }, | 780 | { "ppp" , ppp_up , ppp_down , }, |
@@ -797,7 +797,7 @@ static int FAST_FUNC link_up_down(struct interface_defn_t *ifd UNUSED_PARAM, exe | |||
797 | return 1; | 797 | return 1; |
798 | } | 798 | } |
799 | 799 | ||
800 | static const struct method_t link_methods[] = { | 800 | static const struct method_t link_methods[] ALIGN_PTR = { |
801 | { "none", link_up_down, link_up_down } | 801 | { "none", link_up_down, link_up_down } |
802 | }; | 802 | }; |
803 | 803 | ||
diff --git a/networking/inetd.c b/networking/inetd.c index e71be51c3..fb2fbe323 100644 --- a/networking/inetd.c +++ b/networking/inetd.c | |||
@@ -1538,7 +1538,7 @@ int inetd_main(int argc UNUSED_PARAM, char **argv) | |||
1538 | #if ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_ECHO \ | 1538 | #if ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_ECHO \ |
1539 | || ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_DISCARD | 1539 | || ENABLE_FEATURE_INETD_SUPPORT_BUILTIN_DISCARD |
1540 | # if !BB_MMU | 1540 | # if !BB_MMU |
1541 | static const char *const cat_args[] = { "cat", NULL }; | 1541 | static const char *const cat_args[] ALIGN_PTR = { "cat", NULL }; |
1542 | # endif | 1542 | # endif |
1543 | #endif | 1543 | #endif |
1544 | 1544 | ||
diff --git a/networking/interface.c b/networking/interface.c index ea6a2c8a8..6b6c0944a 100644 --- a/networking/interface.c +++ b/networking/interface.c | |||
@@ -446,13 +446,13 @@ static char *get_name(char name[IFNAMSIZ], char *p) | |||
446 | * %n specifiers (even the size of integers may not match). | 446 | * %n specifiers (even the size of integers may not match). |
447 | */ | 447 | */ |
448 | #if INT_MAX == LONG_MAX | 448 | #if INT_MAX == LONG_MAX |
449 | static const char *const ss_fmt[] = { | 449 | static const char *const ss_fmt[] ALIGN_PTR = { |
450 | "%n%llu%u%u%u%u%n%n%n%llu%u%u%u%u%u", | 450 | "%n%llu%u%u%u%u%n%n%n%llu%u%u%u%u%u", |
451 | "%llu%llu%u%u%u%u%n%n%llu%llu%u%u%u%u%u", | 451 | "%llu%llu%u%u%u%u%n%n%llu%llu%u%u%u%u%u", |
452 | "%llu%llu%u%u%u%u%u%u%llu%llu%u%u%u%u%u%u" | 452 | "%llu%llu%u%u%u%u%u%u%llu%llu%u%u%u%u%u%u" |
453 | }; | 453 | }; |
454 | #else | 454 | #else |
455 | static const char *const ss_fmt[] = { | 455 | static const char *const ss_fmt[] ALIGN_PTR = { |
456 | "%n%llu%lu%lu%lu%lu%n%n%n%llu%lu%lu%lu%lu%lu", | 456 | "%n%llu%lu%lu%lu%lu%n%n%n%llu%lu%lu%lu%lu%lu", |
457 | "%llu%llu%lu%lu%lu%lu%n%n%llu%llu%lu%lu%lu%lu%lu", | 457 | "%llu%llu%lu%lu%lu%lu%n%n%llu%llu%lu%lu%lu%lu%lu", |
458 | "%llu%llu%lu%lu%lu%lu%lu%lu%llu%llu%lu%lu%lu%lu%lu%lu" | 458 | "%llu%llu%lu%lu%lu%lu%lu%lu%llu%llu%lu%lu%lu%lu%lu%lu" |
@@ -731,7 +731,7 @@ static const struct hwtype ib_hwtype = { | |||
731 | #endif | 731 | #endif |
732 | 732 | ||
733 | 733 | ||
734 | static const struct hwtype *const hwtypes[] = { | 734 | static const struct hwtype *const hwtypes[] ALIGN_PTR = { |
735 | &loop_hwtype, | 735 | &loop_hwtype, |
736 | ðer_hwtype, | 736 | ðer_hwtype, |
737 | &ppp_hwtype, | 737 | &ppp_hwtype, |
diff --git a/networking/libiproute/ipaddress.c b/networking/libiproute/ipaddress.c index 17a838411..ecc3848ff 100644 --- a/networking/libiproute/ipaddress.c +++ b/networking/libiproute/ipaddress.c | |||
@@ -58,7 +58,7 @@ typedef struct filter_t filter_t; | |||
58 | 58 | ||
59 | static void print_link_flags(unsigned flags, unsigned mdown) | 59 | static void print_link_flags(unsigned flags, unsigned mdown) |
60 | { | 60 | { |
61 | static const int flag_masks[] = { | 61 | static const int flag_masks[] ALIGN_INT = { |
62 | IFF_LOOPBACK, IFF_BROADCAST, IFF_POINTOPOINT, | 62 | IFF_LOOPBACK, IFF_BROADCAST, IFF_POINTOPOINT, |
63 | IFF_MULTICAST, IFF_NOARP, IFF_UP, IFF_LOWER_UP }; | 63 | IFF_MULTICAST, IFF_NOARP, IFF_UP, IFF_LOWER_UP }; |
64 | static const char flag_labels[] ALIGN1 = | 64 | static const char flag_labels[] ALIGN1 = |
diff --git a/networking/udhcp/common.c b/networking/udhcp/common.c index 8e9b93655..ae818db05 100644 --- a/networking/udhcp/common.c +++ b/networking/udhcp/common.c | |||
@@ -19,7 +19,7 @@ const uint8_t MAC_BCAST_ADDR[6] ALIGN2 = { | |||
19 | * See RFC2132 for more options. | 19 | * See RFC2132 for more options. |
20 | * OPTION_REQ: these options are requested by udhcpc (unless -o). | 20 | * OPTION_REQ: these options are requested by udhcpc (unless -o). |
21 | */ | 21 | */ |
22 | const struct dhcp_optflag dhcp_optflags[] = { | 22 | const struct dhcp_optflag dhcp_optflags[] ALIGN2 = { |
23 | /* flags code */ | 23 | /* flags code */ |
24 | { OPTION_IP | OPTION_REQ, 0x01 }, /* DHCP_SUBNET */ | 24 | { OPTION_IP | OPTION_REQ, 0x01 }, /* DHCP_SUBNET */ |
25 | { OPTION_S32 , 0x02 }, /* DHCP_TIME_OFFSET */ | 25 | { OPTION_S32 , 0x02 }, /* DHCP_TIME_OFFSET */ |
diff --git a/networking/udhcp/d6_dhcpc.c b/networking/udhcp/d6_dhcpc.c index 9d2a8f5d3..9fc690315 100644 --- a/networking/udhcp/d6_dhcpc.c +++ b/networking/udhcp/d6_dhcpc.c | |||
@@ -65,7 +65,7 @@ | |||
65 | 65 | ||
66 | /* "struct client_data_t client_data" is in bb_common_bufsiz1 */ | 66 | /* "struct client_data_t client_data" is in bb_common_bufsiz1 */ |
67 | 67 | ||
68 | static const struct dhcp_optflag d6_optflags[] = { | 68 | static const struct dhcp_optflag d6_optflags[] ALIGN2 = { |
69 | #if ENABLE_FEATURE_UDHCPC6_RFC3646 | 69 | #if ENABLE_FEATURE_UDHCPC6_RFC3646 |
70 | { OPTION_6RD | OPTION_LIST | OPTION_REQ, D6_OPT_DNS_SERVERS }, | 70 | { OPTION_6RD | OPTION_LIST | OPTION_REQ, D6_OPT_DNS_SERVERS }, |
71 | { OPTION_DNS_STRING | OPTION_LIST | OPTION_REQ, D6_OPT_DOMAIN_LIST }, | 71 | { OPTION_DNS_STRING | OPTION_LIST | OPTION_REQ, D6_OPT_DOMAIN_LIST }, |
diff --git a/procps/nmeter.c b/procps/nmeter.c index 2310e9844..088d366bf 100644 --- a/procps/nmeter.c +++ b/procps/nmeter.c | |||
@@ -70,7 +70,7 @@ typedef struct proc_file { | |||
70 | smallint last_gen; | 70 | smallint last_gen; |
71 | } proc_file; | 71 | } proc_file; |
72 | 72 | ||
73 | static const char *const proc_name[] = { | 73 | static const char *const proc_name[] ALIGN_PTR = { |
74 | "stat", // Must match the order of proc_file's! | 74 | "stat", // Must match the order of proc_file's! |
75 | "loadavg", | 75 | "loadavg", |
76 | "net/dev", | 76 | "net/dev", |
diff --git a/procps/top.c b/procps/top.c index 4cd545c69..804d6f258 100644 --- a/procps/top.c +++ b/procps/top.c | |||
@@ -913,7 +913,7 @@ static unsigned handle_input(unsigned scan_mask, duration_t interval) | |||
913 | while (1) { | 913 | while (1) { |
914 | int32_t c; | 914 | int32_t c; |
915 | 915 | ||
916 | c = read_key(STDIN_FILENO, G.kbd_input, interval * 1000); | 916 | c = safe_read_key(STDIN_FILENO, G.kbd_input, interval * 1000); |
917 | if (c == -1 && errno != EAGAIN) { | 917 | if (c == -1 && errno != EAGAIN) { |
918 | /* error/EOF */ | 918 | /* error/EOF */ |
919 | option_mask32 |= OPT_EOF; | 919 | option_mask32 |= OPT_EOF; |
diff --git a/selinux/setenforce.c b/selinux/setenforce.c index 996034f8e..2267be451 100644 --- a/selinux/setenforce.c +++ b/selinux/setenforce.c | |||
@@ -26,7 +26,7 @@ | |||
26 | /* These strings are arranged so that odd ones | 26 | /* These strings are arranged so that odd ones |
27 | * result in security_setenforce(1) being done, | 27 | * result in security_setenforce(1) being done, |
28 | * the rest will do security_setenforce(0) */ | 28 | * the rest will do security_setenforce(0) */ |
29 | static const char *const setenforce_cmd[] = { | 29 | static const char *const setenforce_cmd[] ALIGN_PTR = { |
30 | "0", | 30 | "0", |
31 | "1", | 31 | "1", |
32 | "permissive", | 32 | "permissive", |
diff --git a/shell/ash.c b/shell/ash.c index a1d01447a..46c4f1675 100644 --- a/shell/ash.c +++ b/shell/ash.c | |||
@@ -428,7 +428,7 @@ static void forkshell_print(FILE *fp0, struct forkshell *fs, const char **notes) | |||
428 | /* ============ Shell options */ | 428 | /* ============ Shell options */ |
429 | 429 | ||
430 | /* If you add/change options hare, update --help text too */ | 430 | /* If you add/change options hare, update --help text too */ |
431 | static const char *const optletters_optnames[] = { | 431 | static const char *const optletters_optnames[] ALIGN_PTR = { |
432 | "e" "errexit", | 432 | "e" "errexit", |
433 | "f" "noglob", | 433 | "f" "noglob", |
434 | /* bash has '-o ignoreeof', but no short synonym -I for it */ | 434 | /* bash has '-o ignoreeof', but no short synonym -I for it */ |
@@ -845,7 +845,7 @@ raise_exception(int e) | |||
845 | /* | 845 | /* |
846 | * Called when a SIGINT is received. (If the user specifies | 846 | * Called when a SIGINT is received. (If the user specifies |
847 | * that SIGINT is to be trapped or ignored using the trap builtin, then | 847 | * that SIGINT is to be trapped or ignored using the trap builtin, then |
848 | * this routine is not called.) Suppressint is nonzero when interrupts | 848 | * this routine is not called.) suppress_int is nonzero when interrupts |
849 | * are held using the INT_OFF macro. (The test for iflag is just | 849 | * are held using the INT_OFF macro. (The test for iflag is just |
850 | * defensive programming.) | 850 | * defensive programming.) |
851 | */ | 851 | */ |
@@ -882,13 +882,12 @@ raise_interrupt(void) | |||
882 | } while (0) | 882 | } while (0) |
883 | #endif | 883 | #endif |
884 | 884 | ||
885 | static IF_ASH_OPTIMIZE_FOR_SIZE(inline) void | 885 | static IF_NOT_ASH_OPTIMIZE_FOR_SIZE(inline) void |
886 | int_on(void) | 886 | int_on(void) |
887 | { | 887 | { |
888 | barrier(); | 888 | barrier(); |
889 | if (--suppress_int == 0 && pending_int) { | 889 | if (--suppress_int == 0 && pending_int) |
890 | raise_interrupt(); | 890 | raise_interrupt(); |
891 | } | ||
892 | } | 891 | } |
893 | #if DEBUG_INTONOFF | 892 | #if DEBUG_INTONOFF |
894 | # define INT_ON do { \ | 893 | # define INT_ON do { \ |
@@ -898,7 +897,7 @@ int_on(void) | |||
898 | #else | 897 | #else |
899 | # define INT_ON int_on() | 898 | # define INT_ON int_on() |
900 | #endif | 899 | #endif |
901 | static IF_ASH_OPTIMIZE_FOR_SIZE(inline) void | 900 | static IF_NOT_ASH_OPTIMIZE_FOR_SIZE(inline) void |
902 | force_int_on(void) | 901 | force_int_on(void) |
903 | { | 902 | { |
904 | barrier(); | 903 | barrier(); |
@@ -4143,7 +4142,9 @@ signal_handler(int signo) | |||
4143 | if (!trap[SIGCHLD]) | 4142 | if (!trap[SIGCHLD]) |
4144 | return; | 4143 | return; |
4145 | } | 4144 | } |
4146 | 4145 | #if ENABLE_FEATURE_EDITING | |
4146 | bb_got_signal = signo; /* for read_line_input: "we got a signal" */ | ||
4147 | #endif | ||
4147 | gotsig[signo - 1] = 1; | 4148 | gotsig[signo - 1] = 1; |
4148 | pending_sig = signo; | 4149 | pending_sig = signo; |
4149 | 4150 | ||
@@ -11656,33 +11657,56 @@ preadfd(void) | |||
11656 | # endif | 11657 | # endif |
11657 | reinit_unicode_for_ash(); | 11658 | reinit_unicode_for_ash(); |
11658 | again: | 11659 | again: |
11660 | /* For shell, LI_INTERRUPTIBLE is set: | ||
11661 | * read_line_input will abort on either | ||
11662 | * getting EINTR in poll(), or if it sees bb_got_signal != 0 | ||
11663 | * (IOW: if signal arrives before poll() is reached). | ||
11664 | * Interactive testcases: | ||
11665 | * (while kill -INT $$; do sleep 1; done) & | ||
11666 | * #^^^ prints ^C, prints prompt, repeats | ||
11667 | * trap 'echo I' int; (while kill -INT $$; do sleep 1; done) & | ||
11668 | * #^^^ prints ^C, prints "I", prints prompt, repeats | ||
11669 | * trap 'echo T' term; (while kill $$; do sleep 1; done) & | ||
11670 | * #^^^ prints "T", prints prompt, repeats | ||
11671 | * #(bash 5.0.17 exits after first "T", looks like a bug) | ||
11672 | */ | ||
11673 | bb_got_signal = 0; | ||
11674 | INT_OFF; /* no longjmp'ing out of read_line_input please */ | ||
11659 | nr = read_line_input(line_input_state, cmdedit_prompt, buf, IBUFSIZ); | 11675 | nr = read_line_input(line_input_state, cmdedit_prompt, buf, IBUFSIZ); |
11676 | if (bb_got_signal == SIGINT) | ||
11677 | write(STDOUT_FILENO, "^C\n", 3); | ||
11678 | INT_ON; /* here non-blocked SIGINT will longjmp */ | ||
11660 | if (nr == 0) { | 11679 | if (nr == 0) { |
11661 | /* ^C pressed, "convert" to SIGINT */ | 11680 | /* ^C pressed, "convert" to SIGINT */ |
11662 | # if !ENABLE_PLATFORM_MINGW32 | 11681 | # if !ENABLE_PLATFORM_MINGW32 |
11663 | write(STDOUT_FILENO, "^C", 2); | 11682 | write(STDOUT_FILENO, "^C\n", 3); |
11664 | raise(SIGINT); | 11683 | raise(SIGINT); /* here non-blocked SIGINT will longjmp */ |
11665 | /* raise(SIGINT) did not work! (e.g. if SIGINT | 11684 | /* raise(SIGINT) did not work! (e.g. if SIGINT |
11666 | * is SIG_INGed on startup, it stays SIG_IGNed) | 11685 | * is SIG_IGNed on startup, it stays SIG_IGNed) |
11667 | */ | 11686 | */ |
11668 | # else | 11687 | # else |
11669 | raise_interrupt(); | 11688 | raise_interrupt(); |
11670 | # endif | 11689 | # endif |
11671 | if (trap[SIGINT]) { | 11690 | if (trap[SIGINT]) { |
11691 | empty_line_input: | ||
11672 | buf[0] = '\n'; | 11692 | buf[0] = '\n'; |
11673 | buf[1] = '\0'; | 11693 | buf[1] = '\0'; |
11674 | return 1; | 11694 | return 1; |
11675 | } | 11695 | } |
11676 | exitstatus = 128 + SIGINT; | 11696 | exitstatus = 128 + SIGINT; |
11677 | /* bash behavior on ^C + ignored SIGINT: */ | 11697 | /* bash behavior on ^C + ignored SIGINT: */ |
11678 | write(STDOUT_FILENO, "\n", 1); | ||
11679 | goto again; | 11698 | goto again; |
11680 | } | 11699 | } |
11681 | if (nr < 0) { | 11700 | if (nr < 0) { |
11682 | if (errno == 0) { | 11701 | if (errno == 0) { |
11683 | /* Ctrl+D pressed */ | 11702 | /* ^D pressed */ |
11684 | nr = 0; | 11703 | nr = 0; |
11685 | } | 11704 | } |
11705 | else if (errno == EINTR) { /* got signal? */ | ||
11706 | if (bb_got_signal != SIGINT) | ||
11707 | write(STDOUT_FILENO, "\n", 1); | ||
11708 | goto empty_line_input; | ||
11709 | } | ||
11686 | # if ENABLE_ASH_IDLE_TIMEOUT | 11710 | # if ENABLE_ASH_IDLE_TIMEOUT |
11687 | else if (errno == EAGAIN && timeout > 0) { | 11711 | else if (errno == EAGAIN && timeout > 0) { |
11688 | puts("\007timed out waiting for input: auto-logout"); | 11712 | puts("\007timed out waiting for input: auto-logout"); |
diff --git a/shell/hush.c b/shell/hush.c index 982fc356a..ae81f0da5 100644 --- a/shell/hush.c +++ b/shell/hush.c | |||
@@ -564,7 +564,7 @@ enum { | |||
564 | #define NULL_O_STRING { NULL } | 564 | #define NULL_O_STRING { NULL } |
565 | 565 | ||
566 | #ifndef debug_printf_parse | 566 | #ifndef debug_printf_parse |
567 | static const char *const assignment_flag[] = { | 567 | static const char *const assignment_flag[] ALIGN_PTR = { |
568 | "MAYBE_ASSIGNMENT", | 568 | "MAYBE_ASSIGNMENT", |
569 | "DEFINITELY_ASSIGNMENT", | 569 | "DEFINITELY_ASSIGNMENT", |
570 | "NOT_ASSIGNMENT", | 570 | "NOT_ASSIGNMENT", |
@@ -918,6 +918,7 @@ struct globals { | |||
918 | #if ENABLE_HUSH_INTERACTIVE | 918 | #if ENABLE_HUSH_INTERACTIVE |
919 | smallint promptmode; /* 0: PS1, 1: PS2 */ | 919 | smallint promptmode; /* 0: PS1, 1: PS2 */ |
920 | #endif | 920 | #endif |
921 | /* set by signal handler if SIGINT is received _and_ its trap is not set */ | ||
921 | smallint flag_SIGINT; | 922 | smallint flag_SIGINT; |
922 | #if ENABLE_HUSH_LOOPS | 923 | #if ENABLE_HUSH_LOOPS |
923 | smallint flag_break_continue; | 924 | smallint flag_break_continue; |
@@ -1944,6 +1945,9 @@ enum { | |||
1944 | static void record_pending_signo(int sig) | 1945 | static void record_pending_signo(int sig) |
1945 | { | 1946 | { |
1946 | sigaddset(&G.pending_set, sig); | 1947 | sigaddset(&G.pending_set, sig); |
1948 | #if ENABLE_FEATURE_EDITING | ||
1949 | bb_got_signal = sig; /* for read_line_input: "we got a signal" */ | ||
1950 | #endif | ||
1947 | #if ENABLE_HUSH_FAST | 1951 | #if ENABLE_HUSH_FAST |
1948 | if (sig == SIGCHLD) { | 1952 | if (sig == SIGCHLD) { |
1949 | G.count_SIGCHLD++; | 1953 | G.count_SIGCHLD++; |
@@ -2652,30 +2656,53 @@ static int get_user_input(struct in_str *i) | |||
2652 | for (;;) { | 2656 | for (;;) { |
2653 | reinit_unicode_for_hush(); | 2657 | reinit_unicode_for_hush(); |
2654 | G.flag_SIGINT = 0; | 2658 | G.flag_SIGINT = 0; |
2655 | /* buglet: SIGINT will not make new prompt to appear _at once_, | 2659 | |
2656 | * only after <Enter>. (^C works immediately) */ | 2660 | bb_got_signal = 0; |
2657 | r = read_line_input(G.line_input_state, prompt_str, | 2661 | if (!sigisemptyset(&G.pending_set)) { |
2662 | /* Whoops, already got a signal, do not call read_line_input */ | ||
2663 | bb_got_signal = r = -1; | ||
2664 | } else { | ||
2665 | /* For shell, LI_INTERRUPTIBLE is set: | ||
2666 | * read_line_input will abort on either | ||
2667 | * getting EINTR in poll(), or if it sees bb_got_signal != 0 | ||
2668 | * (IOW: if signal arrives before poll() is reached). | ||
2669 | * Interactive testcases: | ||
2670 | * (while kill -INT $$; do sleep 1; done) & | ||
2671 | * #^^^ prints ^C, prints prompt, repeats | ||
2672 | * trap 'echo I' int; (while kill -INT $$; do sleep 1; done) & | ||
2673 | * #^^^ prints ^C, prints "I", prints prompt, repeats | ||
2674 | * trap 'echo T' term; (while kill $$; do sleep 1; done) & | ||
2675 | * #^^^ prints "T", prints prompt, repeats | ||
2676 | * #(bash 5.0.17 exits after first "T", looks like a bug) | ||
2677 | */ | ||
2678 | r = read_line_input(G.line_input_state, prompt_str, | ||
2658 | G.user_input_buf, CONFIG_FEATURE_EDITING_MAX_LEN-1 | 2679 | G.user_input_buf, CONFIG_FEATURE_EDITING_MAX_LEN-1 |
2659 | ); | 2680 | ); |
2660 | /* read_line_input intercepts ^C, "convert" it to SIGINT */ | 2681 | /* read_line_input intercepts ^C, "convert" it to SIGINT */ |
2661 | if (r == 0) { | 2682 | if (r == 0) |
2662 | raise(SIGINT); | 2683 | raise(SIGINT); |
2684 | } | ||
2685 | /* bash prints ^C (before running a trap, if any) | ||
2686 | * both on keyboard ^C and on real SIGINT (non-kbd generated). | ||
2687 | */ | ||
2688 | if (sigismember(&G.pending_set, SIGINT)) { | ||
2689 | write(STDOUT_FILENO, "^C\n", 3); | ||
2690 | G.last_exitcode = 128 | SIGINT; | ||
2663 | } | 2691 | } |
2664 | check_and_run_traps(); | 2692 | check_and_run_traps(); |
2665 | if (r != 0 && !G.flag_SIGINT) | 2693 | if (r == 0) /* keyboard ^C? */ |
2694 | continue; /* go back, read another input line */ | ||
2695 | if (r > 0) /* normal input? (no ^C, no ^D, no signals) */ | ||
2666 | break; | 2696 | break; |
2667 | /* ^C or SIGINT: repeat */ | 2697 | if (!bb_got_signal) { |
2668 | /* bash prints ^C even on real SIGINT (non-kbd generated) */ | 2698 | /* r < 0: ^D/EOF/error detected (but not signal) */ |
2669 | write(STDOUT_FILENO, "^C\n", 3); | 2699 | /* ^D on interactive input goes to next line before exiting: */ |
2670 | G.last_exitcode = 128 | SIGINT; | 2700 | write(STDOUT_FILENO, "\n", 1); |
2671 | } | 2701 | i->p = NULL; |
2672 | if (r < 0) { | 2702 | i->peek_buf[0] = r = EOF; |
2673 | /* EOF/error detected */ | 2703 | return r; |
2674 | /* ^D on interactive input goes to next line before exiting: */ | 2704 | } |
2675 | write(STDOUT_FILENO, "\n", 1); | 2705 | /* it was a signal: go back, read another input line */ |
2676 | i->p = NULL; | ||
2677 | i->peek_buf[0] = r = EOF; | ||
2678 | return r; | ||
2679 | } | 2706 | } |
2680 | i->p = G.user_input_buf; | 2707 | i->p = G.user_input_buf; |
2681 | return (unsigned char)*i->p++; | 2708 | return (unsigned char)*i->p++; |
@@ -3655,7 +3682,7 @@ static void free_pipe_list(struct pipe *pi) | |||
3655 | #ifndef debug_print_tree | 3682 | #ifndef debug_print_tree |
3656 | static void debug_print_tree(struct pipe *pi, int lvl) | 3683 | static void debug_print_tree(struct pipe *pi, int lvl) |
3657 | { | 3684 | { |
3658 | static const char *const PIPE[] = { | 3685 | static const char *const PIPE[] ALIGN_PTR = { |
3659 | [PIPE_SEQ] = "SEQ", | 3686 | [PIPE_SEQ] = "SEQ", |
3660 | [PIPE_AND] = "AND", | 3687 | [PIPE_AND] = "AND", |
3661 | [PIPE_OR ] = "OR" , | 3688 | [PIPE_OR ] = "OR" , |
@@ -3690,7 +3717,7 @@ static void debug_print_tree(struct pipe *pi, int lvl) | |||
3690 | [RES_XXXX ] = "XXXX" , | 3717 | [RES_XXXX ] = "XXXX" , |
3691 | [RES_SNTX ] = "SNTX" , | 3718 | [RES_SNTX ] = "SNTX" , |
3692 | }; | 3719 | }; |
3693 | static const char *const CMDTYPE[] = { | 3720 | static const char *const CMDTYPE[] ALIGN_PTR = { |
3694 | "{}", | 3721 | "{}", |
3695 | "()", | 3722 | "()", |
3696 | "[noglob]", | 3723 | "[noglob]", |
@@ -7632,7 +7659,7 @@ static int generate_stream_from_string(const char *s, pid_t *pid_p) | |||
7632 | if (is_prefixed_with(s, "trap") | 7659 | if (is_prefixed_with(s, "trap") |
7633 | && skip_whitespace(s + 4)[0] == '\0' | 7660 | && skip_whitespace(s + 4)[0] == '\0' |
7634 | ) { | 7661 | ) { |
7635 | static const char *const argv[] = { NULL, NULL }; | 7662 | static const char *const argv[] ALIGN_PTR = { NULL, NULL }; |
7636 | builtin_trap((char**)argv); | 7663 | builtin_trap((char**)argv); |
7637 | fflush_all(); /* important */ | 7664 | fflush_all(); /* important */ |
7638 | _exit(0); | 7665 | _exit(0); |
@@ -9799,7 +9826,7 @@ static int run_list(struct pipe *pi) | |||
9799 | static const char encoded_dollar_at[] ALIGN1 = { | 9826 | static const char encoded_dollar_at[] ALIGN1 = { |
9800 | SPECIAL_VAR_SYMBOL, '@' | 0x80, SPECIAL_VAR_SYMBOL, '\0' | 9827 | SPECIAL_VAR_SYMBOL, '@' | 0x80, SPECIAL_VAR_SYMBOL, '\0' |
9801 | }; /* encoded representation of "$@" */ | 9828 | }; /* encoded representation of "$@" */ |
9802 | static const char *const encoded_dollar_at_argv[] = { | 9829 | static const char *const encoded_dollar_at_argv[] ALIGN_PTR = { |
9803 | encoded_dollar_at, NULL | 9830 | encoded_dollar_at, NULL |
9804 | }; /* argv list with one element: "$@" */ | 9831 | }; /* argv list with one element: "$@" */ |
9805 | char **vals; | 9832 | char **vals; |
@@ -10361,7 +10388,7 @@ int hush_main(int argc, char **argv) | |||
10361 | //it ignores TERM: | 10388 | //it ignores TERM: |
10362 | // bash -i -c 'kill $$; echo ALIVE' | 10389 | // bash -i -c 'kill $$; echo ALIVE' |
10363 | // ALIVE | 10390 | // ALIVE |
10364 | //it resets SIG_INGed HUP to SIG_DFL: | 10391 | //it resets SIG_IGNed HUP to SIG_DFL: |
10365 | // trap '' hup; bash -i -c 'kill -hup $$; echo ALIVE' | 10392 | // trap '' hup; bash -i -c 'kill -hup $$; echo ALIVE' |
10366 | // Hangup [the message is not printed by bash, it's the shell which started it] | 10393 | // Hangup [the message is not printed by bash, it's the shell which started it] |
10367 | //is talkative about jobs and exiting: | 10394 | //is talkative about jobs and exiting: |
diff --git a/shell/shell_common.c b/shell/shell_common.c index fff356c04..399d5e684 100644 --- a/shell/shell_common.c +++ b/shell/shell_common.c | |||
@@ -218,6 +218,7 @@ shell_builtin_read(struct builtin_read_params *params) | |||
218 | */ | 218 | */ |
219 | errno = 0; | 219 | errno = 0; |
220 | pfd[0].events = POLLIN; | 220 | pfd[0].events = POLLIN; |
221 | //TODO race with a signal arriving just before the poll! | ||
221 | if (poll(pfd, 1, timeout) <= 0) { | 222 | if (poll(pfd, 1, timeout) <= 0) { |
222 | /* timed out, or EINTR */ | 223 | /* timed out, or EINTR */ |
223 | err = errno; | 224 | err = errno; |
diff --git a/testsuite/sed.tests b/testsuite/sed.tests index e62b839f7..626542e33 100755 --- a/testsuite/sed.tests +++ b/testsuite/sed.tests | |||
@@ -324,6 +324,21 @@ testing "sed zero chars match/replace logic must not falsely trigger here 2" \ | |||
324 | "sed 's/ *$/_/g'" \ | 324 | "sed 's/ *$/_/g'" \ |
325 | "qwerty_\n" "" "qwerty\n" | 325 | "qwerty_\n" "" "qwerty\n" |
326 | 326 | ||
327 | # the pattern here is interpreted as "9+", not as "9\+" | ||
328 | testing "sed special char as s/// delimiter, in pattern" \ | ||
329 | "sed 's+9\++X+'" \ | ||
330 | "X8=17\n" "" "9+8=17\n" | ||
331 | |||
332 | # Matching GNU sed 4.8: | ||
333 | # in replacement string, "\&" remains "\&", not interpreted as "&" | ||
334 | testing "sed special char as s/// delimiter, in replacement 1" \ | ||
335 | "sed 's&9&X\&&'" \ | ||
336 | "X&+8=17\n" "" "9+8=17\n" | ||
337 | # in replacement string, "\1" is interpreted as "1" | ||
338 | testing "sed special char as s/// delimiter, in replacement 2" \ | ||
339 | "sed 's1\(9\)1X\11'" \ | ||
340 | "X1+8=17\n" "" "9+8=17\n" | ||
341 | |||
327 | testing "sed /\$_in_regex/ should not match newlines, only end-of-line" \ | 342 | testing "sed /\$_in_regex/ should not match newlines, only end-of-line" \ |
328 | "sed ': testcont; /\\\\$/{ =; N; b testcont }'" \ | 343 | "sed ': testcont; /\\\\$/{ =; N; b testcont }'" \ |
329 | "\ | 344 | "\ |
diff --git a/util-linux/hexdump.c b/util-linux/hexdump.c index 57e7e8db7..307a84803 100644 --- a/util-linux/hexdump.c +++ b/util-linux/hexdump.c | |||
@@ -71,7 +71,7 @@ static void bb_dump_addfile(dumper_t *dumper, char *name) | |||
71 | fclose(fp); | 71 | fclose(fp); |
72 | } | 72 | } |
73 | 73 | ||
74 | static const char *const add_strings[] = { | 74 | static const char *const add_strings[] ALIGN_PTR = { |
75 | "\"%07.7_ax \"16/1 \"%03o \"\"\n\"", /* b */ | 75 | "\"%07.7_ax \"16/1 \"%03o \"\"\n\"", /* b */ |
76 | "\"%07.7_ax \"16/1 \"%3_c \"\"\n\"", /* c */ | 76 | "\"%07.7_ax \"16/1 \"%3_c \"\"\n\"", /* c */ |
77 | "\"%07.7_ax \"8/2 \" %05u \"\"\n\"", /* d */ | 77 | "\"%07.7_ax \"8/2 \" %05u \"\"\n\"", /* d */ |
diff --git a/util-linux/mkfs_vfat.c b/util-linux/mkfs_vfat.c index 844d965f8..821371953 100644 --- a/util-linux/mkfs_vfat.c +++ b/util-linux/mkfs_vfat.c | |||
@@ -218,8 +218,11 @@ static const char boot_code[] ALIGN1 = | |||
218 | int mkfs_vfat_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; | 218 | int mkfs_vfat_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; |
219 | int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) | 219 | int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) |
220 | { | 220 | { |
221 | static const char NO_NAME_11[] = "NO NAME "; | ||
222 | |||
221 | struct stat st; | 223 | struct stat st; |
222 | const char *volume_label = ""; | 224 | const char *arg_volume_label = NO_NAME_11; //default |
225 | char volume_label11[12]; | ||
223 | char *buf; | 226 | char *buf; |
224 | char *device_name; | 227 | char *device_name; |
225 | uoff_t volume_size_bytes; | 228 | uoff_t volume_size_bytes; |
@@ -257,14 +260,17 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) | |||
257 | opts = getopt32(argv, "^" | 260 | opts = getopt32(argv, "^" |
258 | "Ab:cCf:F:h:Ii:l:m:n:r:R:s:S:v" | 261 | "Ab:cCf:F:h:Ii:l:m:n:r:R:s:S:v" |
259 | "\0" "-1", //:b+:f+:F+:h+:r+:R+:s+:S+:vv:c--l:l--c | 262 | "\0" "-1", //:b+:f+:F+:h+:r+:R+:s+:S+:vv:c--l:l--c |
260 | NULL, NULL, NULL, NULL, NULL, | 263 | /*b*/NULL, /*f*/NULL, /*F*/NULL, /*h*/NULL, /*i*/NULL, |
261 | NULL, NULL, &volume_label, NULL, NULL, NULL, NULL); | 264 | /*l*/NULL, /*m*/NULL, /*n*/&arg_volume_label, |
265 | /*r*/NULL, /*R*/NULL, /*s*/NULL, /*S*/NULL); | ||
262 | argv += optind; | 266 | argv += optind; |
263 | 267 | ||
264 | // cache device name | 268 | // cache device name |
265 | device_name = argv[0]; | 269 | device_name = argv[0]; |
266 | // default volume ID = creation time | 270 | // default volume ID = creation time |
267 | volume_id = time(NULL); | 271 | volume_id = time(NULL); |
272 | // truncate to exactly 11 chars, pad with spaces | ||
273 | sprintf(volume_label11, "%-11.11s", arg_volume_label); | ||
268 | 274 | ||
269 | dev = xopen(device_name, O_RDWR); | 275 | dev = xopen(device_name, O_RDWR); |
270 | xfstat(dev, &st, device_name); | 276 | xfstat(dev, &st, device_name); |
@@ -459,7 +465,7 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) | |||
459 | (int)media_byte, | 465 | (int)media_byte, |
460 | volume_size_sect, (int)total_clust, (int)sect_per_clust, | 466 | volume_size_sect, (int)total_clust, (int)sect_per_clust, |
461 | sect_per_fat, | 467 | sect_per_fat, |
462 | (int)volume_id, volume_label | 468 | (int)volume_id, volume_label11 |
463 | ); | 469 | ); |
464 | } | 470 | } |
465 | 471 | ||
@@ -508,7 +514,7 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) | |||
508 | STORE_LE(boot_blk->vi.ext_boot_sign, 0x29); | 514 | STORE_LE(boot_blk->vi.ext_boot_sign, 0x29); |
509 | STORE_LE(boot_blk->vi.volume_id32, volume_id); | 515 | STORE_LE(boot_blk->vi.volume_id32, volume_id); |
510 | memcpy(boot_blk->vi.fs_type, "FAT32 ", sizeof(boot_blk->vi.fs_type)); | 516 | memcpy(boot_blk->vi.fs_type, "FAT32 ", sizeof(boot_blk->vi.fs_type)); |
511 | strncpy(boot_blk->vi.volume_label, volume_label, sizeof(boot_blk->vi.volume_label)); | 517 | memcpy(boot_blk->vi.volume_label, volume_label11, 11); |
512 | memcpy(boot_blk->boot_code, boot_code, sizeof(boot_code)); | 518 | memcpy(boot_blk->boot_code, boot_code, sizeof(boot_code)); |
513 | STORE_LE(boot_blk->boot_sign, BOOT_SIGN); | 519 | STORE_LE(boot_blk->boot_sign, BOOT_SIGN); |
514 | 520 | ||
@@ -545,15 +551,18 @@ int mkfs_vfat_main(int argc UNUSED_PARAM, char **argv) | |||
545 | // root directory | 551 | // root directory |
546 | // empty directory is just a set of zero bytes | 552 | // empty directory is just a set of zero bytes |
547 | memset(buf, 0, sect_per_clust * bytes_per_sect); | 553 | memset(buf, 0, sect_per_clust * bytes_per_sect); |
548 | if (volume_label[0]) { | 554 | // not "NO NAME", "NO NAME " etc? |
549 | // create dir entry for volume_label | 555 | // (mkfs.fat 4.1 won't create dir entry even with explicit -n 'NO NAME', |
556 | // but will create one with e.g. -n '', -n ' zZz') | ||
557 | if (strcmp(volume_label11, NO_NAME_11) != 0) { | ||
558 | // create dir entry for volume label | ||
550 | struct msdos_dir_entry *de; | 559 | struct msdos_dir_entry *de; |
551 | #if 0 | 560 | #if 0 |
552 | struct tm tm_time; | 561 | struct tm tm_time; |
553 | uint16_t t, d; | 562 | uint16_t t, d; |
554 | #endif | 563 | #endif |
555 | de = (void*)buf; | 564 | de = (void*)buf; |
556 | strncpy(de->name, volume_label, sizeof(de->name)); | 565 | memcpy(de->name, volume_label11, 11); |
557 | STORE_LE(de->attr, ATTR_VOLUME); | 566 | STORE_LE(de->attr, ATTR_VOLUME); |
558 | #if 0 | 567 | #if 0 |
559 | localtime_r(&create_time, &tm_time); | 568 | localtime_r(&create_time, &tm_time); |
diff --git a/util-linux/nsenter.c b/util-linux/nsenter.c index e6339da2f..1aa045b35 100644 --- a/util-linux/nsenter.c +++ b/util-linux/nsenter.c | |||
@@ -93,7 +93,7 @@ enum { | |||
93 | * The user namespace comes first, so that it is entered first. | 93 | * The user namespace comes first, so that it is entered first. |
94 | * This gives an unprivileged user the potential to enter other namespaces. | 94 | * This gives an unprivileged user the potential to enter other namespaces. |
95 | */ | 95 | */ |
96 | static const struct namespace_descr ns_list[] = { | 96 | static const struct namespace_descr ns_list[] ALIGN_INT = { |
97 | { CLONE_NEWUSER, "ns/user", }, | 97 | { CLONE_NEWUSER, "ns/user", }, |
98 | { CLONE_NEWIPC, "ns/ipc", }, | 98 | { CLONE_NEWIPC, "ns/ipc", }, |
99 | { CLONE_NEWUTS, "ns/uts", }, | 99 | { CLONE_NEWUTS, "ns/uts", }, |
diff --git a/util-linux/unshare.c b/util-linux/unshare.c index 68ccdd874..06b938074 100644 --- a/util-linux/unshare.c +++ b/util-linux/unshare.c | |||
@@ -120,7 +120,7 @@ enum { | |||
120 | NS_USR_POS, /* OPT_user, NS_USR_POS, and ns_list[] index must match! */ | 120 | NS_USR_POS, /* OPT_user, NS_USR_POS, and ns_list[] index must match! */ |
121 | NS_COUNT, | 121 | NS_COUNT, |
122 | }; | 122 | }; |
123 | static const struct namespace_descr ns_list[] = { | 123 | static const struct namespace_descr ns_list[] ALIGN_INT = { |
124 | { CLONE_NEWNS, "mnt" }, | 124 | { CLONE_NEWNS, "mnt" }, |
125 | { CLONE_NEWUTS, "uts" }, | 125 | { CLONE_NEWUTS, "uts" }, |
126 | { CLONE_NEWIPC, "ipc" }, | 126 | { CLONE_NEWIPC, "ipc" }, |