diff options
author | Ron Yorston <rmy@pobox.com> | 2022-04-07 11:41:52 +0100 |
---|---|---|
committer | Ron Yorston <rmy@pobox.com> | 2022-04-07 11:41:52 +0100 |
commit | b34845ed2e1583bec6581b6881cc7d6c84454276 (patch) | |
tree | f1ecff71fa8f84e44f0b8794f0d2d33031f69dc8 | |
parent | af41de68901d48753eb73491d54931a99d1a13b5 (diff) | |
parent | fc7868602ecf0d761a9a877141add4a9b6918d02 (diff) | |
download | busybox-w32-b34845ed2e1583bec6581b6881cc7d6c84454276.tar.gz busybox-w32-b34845ed2e1583bec6581b6881cc7d6c84454276.tar.bz2 busybox-w32-b34845ed2e1583bec6581b6881cc7d6c84454276.zip |
Merge branch 'busybox'
-rw-r--r-- | editors/vi.c | 34 | ||||
-rw-r--r-- | libbb/Config.src | 7 | ||||
-rw-r--r-- | libbb/hash_md5_sha256_x86-32_shaNI.S | 92 | ||||
-rw-r--r-- | libbb/hash_md5_sha256_x86-64_shaNI.S | 105 | ||||
-rw-r--r-- | libbb/hash_md5_sha_x86-32_shaNI.S | 11 | ||||
-rw-r--r-- | libbb/hash_md5_sha_x86-64.S | 10 | ||||
-rwxr-xr-x | libbb/hash_md5_sha_x86-64.S.sh | 34 | ||||
-rw-r--r-- | libbb/hash_md5_sha_x86-64_shaNI.S | 11 | ||||
-rw-r--r-- | shell/ash.c | 16 | ||||
-rw-r--r-- | shell/ash_test/ash-vars/var_bash_repl_unterminated.right | 1 | ||||
-rwxr-xr-x | shell/ash_test/ash-vars/var_bash_repl_unterminated.tests | 2 | ||||
-rw-r--r-- | shell/hush_test/hush-vars/var_bash_repl_unterminated.right | 1 | ||||
-rwxr-xr-x | shell/hush_test/hush-vars/var_bash_repl_unterminated.tests | 2 | ||||
-rw-r--r-- | util-linux/taskset.c | 3 |
14 files changed, 196 insertions, 133 deletions
diff --git a/editors/vi.c b/editors/vi.c index b30369302..dd8dd488a 100644 --- a/editors/vi.c +++ b/editors/vi.c | |||
@@ -233,6 +233,11 @@ | |||
233 | 233 | ||
234 | #endif | 234 | #endif |
235 | 235 | ||
236 | #if !ENABLE_PLATFORM_MINGW32 | ||
237 | #define isbackspace(c) ((c) == term_orig.c_cc[VERASE] || (c) == 8 || (c) == 127) | ||
238 | #else | ||
239 | #define isbackspace(c) ((c) == 8 || (c) == 127) | ||
240 | #endif | ||
236 | 241 | ||
237 | enum { | 242 | enum { |
238 | MAX_TABSTOP = 32, // sanity limit | 243 | MAX_TABSTOP = 32, // sanity limit |
@@ -371,6 +376,7 @@ struct globals { | |||
371 | int last_modified_count; // = -1; | 376 | int last_modified_count; // = -1; |
372 | int cmdline_filecnt; // how many file names on cmd line | 377 | int cmdline_filecnt; // how many file names on cmd line |
373 | int cmdcnt; // repetition count | 378 | int cmdcnt; // repetition count |
379 | char *rstart; // start of text in Replace mode | ||
374 | unsigned rows, columns; // the terminal screen is this size | 380 | unsigned rows, columns; // the terminal screen is this size |
375 | #if ENABLE_FEATURE_VI_ASK_TERMINAL | 381 | #if ENABLE_FEATURE_VI_ASK_TERMINAL |
376 | int get_rowcol_error; | 382 | int get_rowcol_error; |
@@ -509,6 +515,7 @@ struct globals { | |||
509 | #define last_modified_count (G.last_modified_count) | 515 | #define last_modified_count (G.last_modified_count) |
510 | #define cmdline_filecnt (G.cmdline_filecnt ) | 516 | #define cmdline_filecnt (G.cmdline_filecnt ) |
511 | #define cmdcnt (G.cmdcnt ) | 517 | #define cmdcnt (G.cmdcnt ) |
518 | #define rstart (G.rstart ) | ||
512 | #define rows (G.rows ) | 519 | #define rows (G.rows ) |
513 | #define columns (G.columns ) | 520 | #define columns (G.columns ) |
514 | #define crow (G.crow ) | 521 | #define crow (G.crow ) |
@@ -1272,11 +1279,7 @@ static char *get_input_line(const char *prompt) | |||
1272 | c = get_one_char(); | 1279 | c = get_one_char(); |
1273 | if (c == '\n' || c == '\r' || c == 27) | 1280 | if (c == '\n' || c == '\r' || c == 27) |
1274 | break; // this is end of input | 1281 | break; // this is end of input |
1275 | #if !ENABLE_PLATFORM_MINGW32 | 1282 | if (isbackspace(c)) { |
1276 | if (c == term_orig.c_cc[VERASE] || c == 8 || c == 127) { | ||
1277 | #else | ||
1278 | if (c == 8 || c == 127) { | ||
1279 | #endif | ||
1280 | // user wants to erase prev char | 1283 | // user wants to erase prev char |
1281 | write1("\b \b"); // erase char on screen | 1284 | write1("\b \b"); // erase char on screen |
1282 | buf[--i] = '\0'; | 1285 | buf[--i] = '\0'; |
@@ -2265,12 +2268,16 @@ static char *char_insert(char *p, char c, int undo) // insert the char c at 'p' | |||
2265 | p += 1 + stupid_insert(p, ' '); | 2268 | p += 1 + stupid_insert(p, ' '); |
2266 | } | 2269 | } |
2267 | #endif | 2270 | #endif |
2268 | #if !ENABLE_PLATFORM_MINGW32 | 2271 | } else if (isbackspace(c)) { |
2269 | } else if (c == term_orig.c_cc[VERASE] || c == 8 || c == 127) { // Is this a BS | 2272 | if (cmd_mode == 2) { |
2270 | #else | 2273 | // special treatment for backspace in Replace mode |
2271 | } else if (c == 8 || c == 127) { // Is this a BS | 2274 | if (p > rstart) { |
2275 | p--; | ||
2276 | #if ENABLE_FEATURE_VI_UNDO | ||
2277 | undo_pop(); | ||
2272 | #endif | 2278 | #endif |
2273 | if (p > text) { | 2279 | } |
2280 | } else if (p > text) { | ||
2274 | p--; | 2281 | p--; |
2275 | p = text_hole_delete(p, p, ALLOW_UNDO_QUEUED); // shrink buffer 1 char | 2282 | p = text_hole_delete(p, p, ALLOW_UNDO_QUEUED); // shrink buffer 1 char |
2276 | } | 2283 | } |
@@ -3863,9 +3870,9 @@ static void do_cmd(int c) | |||
3863 | undo_queue_commit(); | 3870 | undo_queue_commit(); |
3864 | } else { | 3871 | } else { |
3865 | if (1 <= c || Isprint(c)) { | 3872 | if (1 <= c || Isprint(c)) { |
3866 | if (c != 27) | 3873 | if (c != 27 && !isbackspace(c)) |
3867 | dot = yank_delete(dot, dot, PARTIAL, YANKDEL, ALLOW_UNDO); // delete char | 3874 | dot = yank_delete(dot, dot, PARTIAL, YANKDEL, ALLOW_UNDO); |
3868 | dot = char_insert(dot, c, ALLOW_UNDO_CHAIN); // insert new char | 3875 | dot = char_insert(dot, c, ALLOW_UNDO_CHAIN); |
3869 | } | 3876 | } |
3870 | goto dc1; | 3877 | goto dc1; |
3871 | } | 3878 | } |
@@ -4424,6 +4431,7 @@ static void do_cmd(int c) | |||
4424 | dc5: | 4431 | dc5: |
4425 | cmd_mode = 2; | 4432 | cmd_mode = 2; |
4426 | undo_queue_commit(); | 4433 | undo_queue_commit(); |
4434 | rstart = dot; | ||
4427 | break; | 4435 | break; |
4428 | case KEYCODE_DELETE: | 4436 | case KEYCODE_DELETE: |
4429 | if (dot < end - 1) | 4437 | if (dot < end - 1) |
diff --git a/libbb/Config.src b/libbb/Config.src index 0ecd5bd46..66a3ffa23 100644 --- a/libbb/Config.src +++ b/libbb/Config.src | |||
@@ -57,11 +57,12 @@ config SHA1_SMALL | |||
57 | range 0 3 | 57 | range 0 3 |
58 | help | 58 | help |
59 | Trade binary size versus speed for the sha1 algorithm. | 59 | Trade binary size versus speed for the sha1 algorithm. |
60 | With FEATURE_COPYBUF_KB=64: | ||
60 | throughput MB/s size of sha1_process_block64 | 61 | throughput MB/s size of sha1_process_block64 |
61 | value 486 x86-64 486 x86-64 | 62 | value 486 x86-64 486 x86-64 |
62 | 0 367 375 3657 3502 | 63 | 0 440 485 3481 3502 |
63 | 1 224 229 654 732 | 64 | 1 265 265 641 696 |
64 | 2,3 200 195 358 380 | 65 | 2,3 220 210 342 364 |
65 | 66 | ||
66 | config SHA1_HWACCEL | 67 | config SHA1_HWACCEL |
67 | bool "SHA1: Use hardware accelerated instructions if possible" | 68 | bool "SHA1: Use hardware accelerated instructions if possible" |
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S index aa68193bd..3905bad9a 100644 --- a/libbb/hash_md5_sha256_x86-32_shaNI.S +++ b/libbb/hash_md5_sha256_x86-32_shaNI.S | |||
@@ -4,7 +4,7 @@ | |||
4 | // We use shorter insns, even though they are for "wrong" | 4 | // We use shorter insns, even though they are for "wrong" |
5 | // data type (fp, not int). | 5 | // data type (fp, not int). |
6 | // For Intel, there is no penalty for doing it at all | 6 | // For Intel, there is no penalty for doing it at all |
7 | // (CPUs which do have such penalty do not support SHA1 insns). | 7 | // (CPUs which do have such penalty do not support SHA insns). |
8 | // For AMD, the penalty is one extra cycle | 8 | // For AMD, the penalty is one extra cycle |
9 | // (allegedly: I failed to find measurable difference). | 9 | // (allegedly: I failed to find measurable difference). |
10 | 10 | ||
@@ -15,6 +15,10 @@ | |||
15 | //#define shuf128_32 pshufd | 15 | //#define shuf128_32 pshufd |
16 | #define shuf128_32 shufps | 16 | #define shuf128_32 shufps |
17 | 17 | ||
18 | // pshufb and palignr are SSSE3 insns. | ||
19 | // We do not check SSSE3 in cpuid, | ||
20 | // all SHA-capable CPUs support it as well. | ||
21 | |||
18 | .section .text.sha256_process_block64_shaNI, "ax", @progbits | 22 | .section .text.sha256_process_block64_shaNI, "ax", @progbits |
19 | .globl sha256_process_block64_shaNI | 23 | .globl sha256_process_block64_shaNI |
20 | .hidden sha256_process_block64_shaNI | 24 | .hidden sha256_process_block64_shaNI |
@@ -39,12 +43,13 @@ | |||
39 | .balign 8 # allow decoders to fetch at least 2 first insns | 43 | .balign 8 # allow decoders to fetch at least 2 first insns |
40 | sha256_process_block64_shaNI: | 44 | sha256_process_block64_shaNI: |
41 | 45 | ||
42 | movu128 76+0*16(%eax), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ | 46 | movu128 76+0*16(%eax), XMMTMP /* ABCD (little-endian dword order) */ |
43 | movu128 76+1*16(%eax), STATE1 /* HGFE */ | 47 | movu128 76+1*16(%eax), STATE1 /* EFGH */ |
44 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ | 48 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ |
45 | mova128 STATE1, STATE0 | 49 | mova128 STATE1, STATE0 |
46 | shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ | 50 | /* --- -------------- ABCD -- EFGH */ |
47 | shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ | 51 | shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ |
52 | shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ | ||
48 | 53 | ||
49 | /* XMMTMP holds flip mask from here... */ | 54 | /* XMMTMP holds flip mask from here... */ |
50 | mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP | 55 | mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP |
@@ -55,18 +60,18 @@ sha256_process_block64_shaNI: | |||
55 | pshufb XMMTMP, MSG | 60 | pshufb XMMTMP, MSG |
56 | mova128 MSG, MSGTMP0 | 61 | mova128 MSG, MSGTMP0 |
57 | paddd 0*16-8*16(SHA256CONSTANTS), MSG | 62 | paddd 0*16-8*16(SHA256CONSTANTS), MSG |
58 | sha256rnds2 STATE0, STATE1 | 63 | sha256rnds2 MSG, STATE0, STATE1 |
59 | shuf128_32 $0x0E, MSG, MSG | 64 | shuf128_32 $0x0E, MSG, MSG |
60 | sha256rnds2 STATE1, STATE0 | 65 | sha256rnds2 MSG, STATE1, STATE0 |
61 | 66 | ||
62 | /* Rounds 4-7 */ | 67 | /* Rounds 4-7 */ |
63 | movu128 1*16(DATA_PTR), MSG | 68 | movu128 1*16(DATA_PTR), MSG |
64 | pshufb XMMTMP, MSG | 69 | pshufb XMMTMP, MSG |
65 | mova128 MSG, MSGTMP1 | 70 | mova128 MSG, MSGTMP1 |
66 | paddd 1*16-8*16(SHA256CONSTANTS), MSG | 71 | paddd 1*16-8*16(SHA256CONSTANTS), MSG |
67 | sha256rnds2 STATE0, STATE1 | 72 | sha256rnds2 MSG, STATE0, STATE1 |
68 | shuf128_32 $0x0E, MSG, MSG | 73 | shuf128_32 $0x0E, MSG, MSG |
69 | sha256rnds2 STATE1, STATE0 | 74 | sha256rnds2 MSG, STATE1, STATE0 |
70 | sha256msg1 MSGTMP1, MSGTMP0 | 75 | sha256msg1 MSGTMP1, MSGTMP0 |
71 | 76 | ||
72 | /* Rounds 8-11 */ | 77 | /* Rounds 8-11 */ |
@@ -74,9 +79,9 @@ sha256_process_block64_shaNI: | |||
74 | pshufb XMMTMP, MSG | 79 | pshufb XMMTMP, MSG |
75 | mova128 MSG, MSGTMP2 | 80 | mova128 MSG, MSGTMP2 |
76 | paddd 2*16-8*16(SHA256CONSTANTS), MSG | 81 | paddd 2*16-8*16(SHA256CONSTANTS), MSG |
77 | sha256rnds2 STATE0, STATE1 | 82 | sha256rnds2 MSG, STATE0, STATE1 |
78 | shuf128_32 $0x0E, MSG, MSG | 83 | shuf128_32 $0x0E, MSG, MSG |
79 | sha256rnds2 STATE1, STATE0 | 84 | sha256rnds2 MSG, STATE1, STATE0 |
80 | sha256msg1 MSGTMP2, MSGTMP1 | 85 | sha256msg1 MSGTMP2, MSGTMP1 |
81 | 86 | ||
82 | /* Rounds 12-15 */ | 87 | /* Rounds 12-15 */ |
@@ -85,159 +90,158 @@ sha256_process_block64_shaNI: | |||
85 | /* ...to here */ | 90 | /* ...to here */ |
86 | mova128 MSG, MSGTMP3 | 91 | mova128 MSG, MSGTMP3 |
87 | paddd 3*16-8*16(SHA256CONSTANTS), MSG | 92 | paddd 3*16-8*16(SHA256CONSTANTS), MSG |
88 | sha256rnds2 STATE0, STATE1 | 93 | sha256rnds2 MSG, STATE0, STATE1 |
89 | mova128 MSGTMP3, XMMTMP | 94 | mova128 MSGTMP3, XMMTMP |
90 | palignr $4, MSGTMP2, XMMTMP | 95 | palignr $4, MSGTMP2, XMMTMP |
91 | paddd XMMTMP, MSGTMP0 | 96 | paddd XMMTMP, MSGTMP0 |
92 | sha256msg2 MSGTMP3, MSGTMP0 | 97 | sha256msg2 MSGTMP3, MSGTMP0 |
93 | shuf128_32 $0x0E, MSG, MSG | 98 | shuf128_32 $0x0E, MSG, MSG |
94 | sha256rnds2 STATE1, STATE0 | 99 | sha256rnds2 MSG, STATE1, STATE0 |
95 | sha256msg1 MSGTMP3, MSGTMP2 | 100 | sha256msg1 MSGTMP3, MSGTMP2 |
96 | 101 | ||
97 | /* Rounds 16-19 */ | 102 | /* Rounds 16-19 */ |
98 | mova128 MSGTMP0, MSG | 103 | mova128 MSGTMP0, MSG |
99 | paddd 4*16-8*16(SHA256CONSTANTS), MSG | 104 | paddd 4*16-8*16(SHA256CONSTANTS), MSG |
100 | sha256rnds2 STATE0, STATE1 | 105 | sha256rnds2 MSG, STATE0, STATE1 |
101 | mova128 MSGTMP0, XMMTMP | 106 | mova128 MSGTMP0, XMMTMP |
102 | palignr $4, MSGTMP3, XMMTMP | 107 | palignr $4, MSGTMP3, XMMTMP |
103 | paddd XMMTMP, MSGTMP1 | 108 | paddd XMMTMP, MSGTMP1 |
104 | sha256msg2 MSGTMP0, MSGTMP1 | 109 | sha256msg2 MSGTMP0, MSGTMP1 |
105 | shuf128_32 $0x0E, MSG, MSG | 110 | shuf128_32 $0x0E, MSG, MSG |
106 | sha256rnds2 STATE1, STATE0 | 111 | sha256rnds2 MSG, STATE1, STATE0 |
107 | sha256msg1 MSGTMP0, MSGTMP3 | 112 | sha256msg1 MSGTMP0, MSGTMP3 |
108 | 113 | ||
109 | /* Rounds 20-23 */ | 114 | /* Rounds 20-23 */ |
110 | mova128 MSGTMP1, MSG | 115 | mova128 MSGTMP1, MSG |
111 | paddd 5*16-8*16(SHA256CONSTANTS), MSG | 116 | paddd 5*16-8*16(SHA256CONSTANTS), MSG |
112 | sha256rnds2 STATE0, STATE1 | 117 | sha256rnds2 MSG, STATE0, STATE1 |
113 | mova128 MSGTMP1, XMMTMP | 118 | mova128 MSGTMP1, XMMTMP |
114 | palignr $4, MSGTMP0, XMMTMP | 119 | palignr $4, MSGTMP0, XMMTMP |
115 | paddd XMMTMP, MSGTMP2 | 120 | paddd XMMTMP, MSGTMP2 |
116 | sha256msg2 MSGTMP1, MSGTMP2 | 121 | sha256msg2 MSGTMP1, MSGTMP2 |
117 | shuf128_32 $0x0E, MSG, MSG | 122 | shuf128_32 $0x0E, MSG, MSG |
118 | sha256rnds2 STATE1, STATE0 | 123 | sha256rnds2 MSG, STATE1, STATE0 |
119 | sha256msg1 MSGTMP1, MSGTMP0 | 124 | sha256msg1 MSGTMP1, MSGTMP0 |
120 | 125 | ||
121 | /* Rounds 24-27 */ | 126 | /* Rounds 24-27 */ |
122 | mova128 MSGTMP2, MSG | 127 | mova128 MSGTMP2, MSG |
123 | paddd 6*16-8*16(SHA256CONSTANTS), MSG | 128 | paddd 6*16-8*16(SHA256CONSTANTS), MSG |
124 | sha256rnds2 STATE0, STATE1 | 129 | sha256rnds2 MSG, STATE0, STATE1 |
125 | mova128 MSGTMP2, XMMTMP | 130 | mova128 MSGTMP2, XMMTMP |
126 | palignr $4, MSGTMP1, XMMTMP | 131 | palignr $4, MSGTMP1, XMMTMP |
127 | paddd XMMTMP, MSGTMP3 | 132 | paddd XMMTMP, MSGTMP3 |
128 | sha256msg2 MSGTMP2, MSGTMP3 | 133 | sha256msg2 MSGTMP2, MSGTMP3 |
129 | shuf128_32 $0x0E, MSG, MSG | 134 | shuf128_32 $0x0E, MSG, MSG |
130 | sha256rnds2 STATE1, STATE0 | 135 | sha256rnds2 MSG, STATE1, STATE0 |
131 | sha256msg1 MSGTMP2, MSGTMP1 | 136 | sha256msg1 MSGTMP2, MSGTMP1 |
132 | 137 | ||
133 | /* Rounds 28-31 */ | 138 | /* Rounds 28-31 */ |
134 | mova128 MSGTMP3, MSG | 139 | mova128 MSGTMP3, MSG |
135 | paddd 7*16-8*16(SHA256CONSTANTS), MSG | 140 | paddd 7*16-8*16(SHA256CONSTANTS), MSG |
136 | sha256rnds2 STATE0, STATE1 | 141 | sha256rnds2 MSG, STATE0, STATE1 |
137 | mova128 MSGTMP3, XMMTMP | 142 | mova128 MSGTMP3, XMMTMP |
138 | palignr $4, MSGTMP2, XMMTMP | 143 | palignr $4, MSGTMP2, XMMTMP |
139 | paddd XMMTMP, MSGTMP0 | 144 | paddd XMMTMP, MSGTMP0 |
140 | sha256msg2 MSGTMP3, MSGTMP0 | 145 | sha256msg2 MSGTMP3, MSGTMP0 |
141 | shuf128_32 $0x0E, MSG, MSG | 146 | shuf128_32 $0x0E, MSG, MSG |
142 | sha256rnds2 STATE1, STATE0 | 147 | sha256rnds2 MSG, STATE1, STATE0 |
143 | sha256msg1 MSGTMP3, MSGTMP2 | 148 | sha256msg1 MSGTMP3, MSGTMP2 |
144 | 149 | ||
145 | /* Rounds 32-35 */ | 150 | /* Rounds 32-35 */ |
146 | mova128 MSGTMP0, MSG | 151 | mova128 MSGTMP0, MSG |
147 | paddd 8*16-8*16(SHA256CONSTANTS), MSG | 152 | paddd 8*16-8*16(SHA256CONSTANTS), MSG |
148 | sha256rnds2 STATE0, STATE1 | 153 | sha256rnds2 MSG, STATE0, STATE1 |
149 | mova128 MSGTMP0, XMMTMP | 154 | mova128 MSGTMP0, XMMTMP |
150 | palignr $4, MSGTMP3, XMMTMP | 155 | palignr $4, MSGTMP3, XMMTMP |
151 | paddd XMMTMP, MSGTMP1 | 156 | paddd XMMTMP, MSGTMP1 |
152 | sha256msg2 MSGTMP0, MSGTMP1 | 157 | sha256msg2 MSGTMP0, MSGTMP1 |
153 | shuf128_32 $0x0E, MSG, MSG | 158 | shuf128_32 $0x0E, MSG, MSG |
154 | sha256rnds2 STATE1, STATE0 | 159 | sha256rnds2 MSG, STATE1, STATE0 |
155 | sha256msg1 MSGTMP0, MSGTMP3 | 160 | sha256msg1 MSGTMP0, MSGTMP3 |
156 | 161 | ||
157 | /* Rounds 36-39 */ | 162 | /* Rounds 36-39 */ |
158 | mova128 MSGTMP1, MSG | 163 | mova128 MSGTMP1, MSG |
159 | paddd 9*16-8*16(SHA256CONSTANTS), MSG | 164 | paddd 9*16-8*16(SHA256CONSTANTS), MSG |
160 | sha256rnds2 STATE0, STATE1 | 165 | sha256rnds2 MSG, STATE0, STATE1 |
161 | mova128 MSGTMP1, XMMTMP | 166 | mova128 MSGTMP1, XMMTMP |
162 | palignr $4, MSGTMP0, XMMTMP | 167 | palignr $4, MSGTMP0, XMMTMP |
163 | paddd XMMTMP, MSGTMP2 | 168 | paddd XMMTMP, MSGTMP2 |
164 | sha256msg2 MSGTMP1, MSGTMP2 | 169 | sha256msg2 MSGTMP1, MSGTMP2 |
165 | shuf128_32 $0x0E, MSG, MSG | 170 | shuf128_32 $0x0E, MSG, MSG |
166 | sha256rnds2 STATE1, STATE0 | 171 | sha256rnds2 MSG, STATE1, STATE0 |
167 | sha256msg1 MSGTMP1, MSGTMP0 | 172 | sha256msg1 MSGTMP1, MSGTMP0 |
168 | 173 | ||
169 | /* Rounds 40-43 */ | 174 | /* Rounds 40-43 */ |
170 | mova128 MSGTMP2, MSG | 175 | mova128 MSGTMP2, MSG |
171 | paddd 10*16-8*16(SHA256CONSTANTS), MSG | 176 | paddd 10*16-8*16(SHA256CONSTANTS), MSG |
172 | sha256rnds2 STATE0, STATE1 | 177 | sha256rnds2 MSG, STATE0, STATE1 |
173 | mova128 MSGTMP2, XMMTMP | 178 | mova128 MSGTMP2, XMMTMP |
174 | palignr $4, MSGTMP1, XMMTMP | 179 | palignr $4, MSGTMP1, XMMTMP |
175 | paddd XMMTMP, MSGTMP3 | 180 | paddd XMMTMP, MSGTMP3 |
176 | sha256msg2 MSGTMP2, MSGTMP3 | 181 | sha256msg2 MSGTMP2, MSGTMP3 |
177 | shuf128_32 $0x0E, MSG, MSG | 182 | shuf128_32 $0x0E, MSG, MSG |
178 | sha256rnds2 STATE1, STATE0 | 183 | sha256rnds2 MSG, STATE1, STATE0 |
179 | sha256msg1 MSGTMP2, MSGTMP1 | 184 | sha256msg1 MSGTMP2, MSGTMP1 |
180 | 185 | ||
181 | /* Rounds 44-47 */ | 186 | /* Rounds 44-47 */ |
182 | mova128 MSGTMP3, MSG | 187 | mova128 MSGTMP3, MSG |
183 | paddd 11*16-8*16(SHA256CONSTANTS), MSG | 188 | paddd 11*16-8*16(SHA256CONSTANTS), MSG |
184 | sha256rnds2 STATE0, STATE1 | 189 | sha256rnds2 MSG, STATE0, STATE1 |
185 | mova128 MSGTMP3, XMMTMP | 190 | mova128 MSGTMP3, XMMTMP |
186 | palignr $4, MSGTMP2, XMMTMP | 191 | palignr $4, MSGTMP2, XMMTMP |
187 | paddd XMMTMP, MSGTMP0 | 192 | paddd XMMTMP, MSGTMP0 |
188 | sha256msg2 MSGTMP3, MSGTMP0 | 193 | sha256msg2 MSGTMP3, MSGTMP0 |
189 | shuf128_32 $0x0E, MSG, MSG | 194 | shuf128_32 $0x0E, MSG, MSG |
190 | sha256rnds2 STATE1, STATE0 | 195 | sha256rnds2 MSG, STATE1, STATE0 |
191 | sha256msg1 MSGTMP3, MSGTMP2 | 196 | sha256msg1 MSGTMP3, MSGTMP2 |
192 | 197 | ||
193 | /* Rounds 48-51 */ | 198 | /* Rounds 48-51 */ |
194 | mova128 MSGTMP0, MSG | 199 | mova128 MSGTMP0, MSG |
195 | paddd 12*16-8*16(SHA256CONSTANTS), MSG | 200 | paddd 12*16-8*16(SHA256CONSTANTS), MSG |
196 | sha256rnds2 STATE0, STATE1 | 201 | sha256rnds2 MSG, STATE0, STATE1 |
197 | mova128 MSGTMP0, XMMTMP | 202 | mova128 MSGTMP0, XMMTMP |
198 | palignr $4, MSGTMP3, XMMTMP | 203 | palignr $4, MSGTMP3, XMMTMP |
199 | paddd XMMTMP, MSGTMP1 | 204 | paddd XMMTMP, MSGTMP1 |
200 | sha256msg2 MSGTMP0, MSGTMP1 | 205 | sha256msg2 MSGTMP0, MSGTMP1 |
201 | shuf128_32 $0x0E, MSG, MSG | 206 | shuf128_32 $0x0E, MSG, MSG |
202 | sha256rnds2 STATE1, STATE0 | 207 | sha256rnds2 MSG, STATE1, STATE0 |
203 | sha256msg1 MSGTMP0, MSGTMP3 | 208 | sha256msg1 MSGTMP0, MSGTMP3 |
204 | 209 | ||
205 | /* Rounds 52-55 */ | 210 | /* Rounds 52-55 */ |
206 | mova128 MSGTMP1, MSG | 211 | mova128 MSGTMP1, MSG |
207 | paddd 13*16-8*16(SHA256CONSTANTS), MSG | 212 | paddd 13*16-8*16(SHA256CONSTANTS), MSG |
208 | sha256rnds2 STATE0, STATE1 | 213 | sha256rnds2 MSG, STATE0, STATE1 |
209 | mova128 MSGTMP1, XMMTMP | 214 | mova128 MSGTMP1, XMMTMP |
210 | palignr $4, MSGTMP0, XMMTMP | 215 | palignr $4, MSGTMP0, XMMTMP |
211 | paddd XMMTMP, MSGTMP2 | 216 | paddd XMMTMP, MSGTMP2 |
212 | sha256msg2 MSGTMP1, MSGTMP2 | 217 | sha256msg2 MSGTMP1, MSGTMP2 |
213 | shuf128_32 $0x0E, MSG, MSG | 218 | shuf128_32 $0x0E, MSG, MSG |
214 | sha256rnds2 STATE1, STATE0 | 219 | sha256rnds2 MSG, STATE1, STATE0 |
215 | 220 | ||
216 | /* Rounds 56-59 */ | 221 | /* Rounds 56-59 */ |
217 | mova128 MSGTMP2, MSG | 222 | mova128 MSGTMP2, MSG |
218 | paddd 14*16-8*16(SHA256CONSTANTS), MSG | 223 | paddd 14*16-8*16(SHA256CONSTANTS), MSG |
219 | sha256rnds2 STATE0, STATE1 | 224 | sha256rnds2 MSG, STATE0, STATE1 |
220 | mova128 MSGTMP2, XMMTMP | 225 | mova128 MSGTMP2, XMMTMP |
221 | palignr $4, MSGTMP1, XMMTMP | 226 | palignr $4, MSGTMP1, XMMTMP |
222 | paddd XMMTMP, MSGTMP3 | 227 | paddd XMMTMP, MSGTMP3 |
223 | sha256msg2 MSGTMP2, MSGTMP3 | 228 | sha256msg2 MSGTMP2, MSGTMP3 |
224 | shuf128_32 $0x0E, MSG, MSG | 229 | shuf128_32 $0x0E, MSG, MSG |
225 | sha256rnds2 STATE1, STATE0 | 230 | sha256rnds2 MSG, STATE1, STATE0 |
226 | 231 | ||
227 | /* Rounds 60-63 */ | 232 | /* Rounds 60-63 */ |
228 | mova128 MSGTMP3, MSG | 233 | mova128 MSGTMP3, MSG |
229 | paddd 15*16-8*16(SHA256CONSTANTS), MSG | 234 | paddd 15*16-8*16(SHA256CONSTANTS), MSG |
230 | sha256rnds2 STATE0, STATE1 | 235 | sha256rnds2 MSG, STATE0, STATE1 |
231 | shuf128_32 $0x0E, MSG, MSG | 236 | shuf128_32 $0x0E, MSG, MSG |
232 | sha256rnds2 STATE1, STATE0 | 237 | sha256rnds2 MSG, STATE1, STATE0 |
233 | 238 | ||
234 | /* Write hash values back in the correct order */ | 239 | /* Write hash values back in the correct order */ |
235 | /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */ | ||
236 | /* STATE1: CDGH */ | ||
237 | mova128 STATE0, XMMTMP | 240 | mova128 STATE0, XMMTMP |
238 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ | 241 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ |
239 | shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ | 242 | /* --- -------------- HGDC -- FEBA */ |
240 | shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ | 243 | shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ |
244 | shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ | ||
241 | /* add current hash values to previous ones */ | 245 | /* add current hash values to previous ones */ |
242 | movu128 76+1*16(%eax), STATE1 | 246 | movu128 76+1*16(%eax), STATE1 |
243 | paddd XMMTMP, STATE1 | 247 | paddd XMMTMP, STATE1 |
@@ -250,7 +254,7 @@ sha256_process_block64_shaNI: | |||
250 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI | 254 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI |
251 | 255 | ||
252 | .section .rodata.cst256.K256, "aM", @progbits, 256 | 256 | .section .rodata.cst256.K256, "aM", @progbits, 256 |
253 | .balign 16 | 257 | .balign 16 |
254 | K256: | 258 | K256: |
255 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | 259 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 |
256 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | 260 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 |
@@ -270,8 +274,8 @@ K256: | |||
270 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | 274 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 |
271 | 275 | ||
272 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 | 276 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 |
273 | .balign 16 | 277 | .balign 16 |
274 | PSHUFFLE_BSWAP32_FLIP_MASK: | 278 | PSHUFFLE_BSWAP32_FLIP_MASK: |
275 | .octa 0x0c0d0e0f08090a0b0405060700010203 | 279 | .octa 0x0c0d0e0f08090a0b0405060700010203 |
276 | 280 | ||
277 | #endif | 281 | #endif |
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S index 4663f750a..082ceafe4 100644 --- a/libbb/hash_md5_sha256_x86-64_shaNI.S +++ b/libbb/hash_md5_sha256_x86-64_shaNI.S | |||
@@ -4,7 +4,7 @@ | |||
4 | // We use shorter insns, even though they are for "wrong" | 4 | // We use shorter insns, even though they are for "wrong" |
5 | // data type (fp, not int). | 5 | // data type (fp, not int). |
6 | // For Intel, there is no penalty for doing it at all | 6 | // For Intel, there is no penalty for doing it at all |
7 | // (CPUs which do have such penalty do not support SHA1 insns). | 7 | // (CPUs which do have such penalty do not support SHA insns). |
8 | // For AMD, the penalty is one extra cycle | 8 | // For AMD, the penalty is one extra cycle |
9 | // (allegedly: I failed to find measurable difference). | 9 | // (allegedly: I failed to find measurable difference). |
10 | 10 | ||
@@ -15,6 +15,10 @@ | |||
15 | //#define shuf128_32 pshufd | 15 | //#define shuf128_32 pshufd |
16 | #define shuf128_32 shufps | 16 | #define shuf128_32 shufps |
17 | 17 | ||
18 | // pshufb and palignr are SSSE3 insns. | ||
19 | // We do not check SSSE3 in cpuid, | ||
20 | // all SHA-capable CPUs support it as well. | ||
21 | |||
18 | .section .text.sha256_process_block64_shaNI, "ax", @progbits | 22 | .section .text.sha256_process_block64_shaNI, "ax", @progbits |
19 | .globl sha256_process_block64_shaNI | 23 | .globl sha256_process_block64_shaNI |
20 | .hidden sha256_process_block64_shaNI | 24 | .hidden sha256_process_block64_shaNI |
@@ -34,46 +38,47 @@ | |||
34 | 38 | ||
35 | #define XMMTMP %xmm7 | 39 | #define XMMTMP %xmm7 |
36 | 40 | ||
37 | #define ABEF_SAVE %xmm9 | 41 | #define SAVE0 %xmm8 |
38 | #define CDGH_SAVE %xmm10 | 42 | #define SAVE1 %xmm9 |
39 | 43 | ||
40 | #define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) | 44 | #define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6)) |
41 | 45 | ||
42 | .balign 8 # allow decoders to fetch at least 2 first insns | 46 | .balign 8 # allow decoders to fetch at least 2 first insns |
43 | sha256_process_block64_shaNI: | 47 | sha256_process_block64_shaNI: |
44 | 48 | ||
45 | movu128 80+0*16(%rdi), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */ | 49 | movu128 80+0*16(%rdi), XMMTMP /* ABCD (little-endian dword order) */ |
46 | movu128 80+1*16(%rdi), STATE1 /* HGFE */ | 50 | movu128 80+1*16(%rdi), STATE1 /* EFGH */ |
47 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ | 51 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ |
48 | mova128 STATE1, STATE0 | 52 | mova128 STATE1, STATE0 |
49 | shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */ | 53 | /* --- -------------- ABCD -- EFGH */ |
50 | shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */ | 54 | shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* FEBA */ |
55 | shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* HGDC */ | ||
51 | 56 | ||
52 | /* XMMTMP holds flip mask from here... */ | 57 | /* XMMTMP holds flip mask from here... */ |
53 | mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP | 58 | mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP |
54 | leaq K256+8*16(%rip), SHA256CONSTANTS | 59 | leaq K256+8*16(%rip), SHA256CONSTANTS |
55 | 60 | ||
56 | /* Save hash values for addition after rounds */ | 61 | /* Save hash values for addition after rounds */ |
57 | mova128 STATE0, ABEF_SAVE | 62 | mova128 STATE0, SAVE0 |
58 | mova128 STATE1, CDGH_SAVE | 63 | mova128 STATE1, SAVE1 |
59 | 64 | ||
60 | /* Rounds 0-3 */ | 65 | /* Rounds 0-3 */ |
61 | movu128 0*16(DATA_PTR), MSG | 66 | movu128 0*16(DATA_PTR), MSG |
62 | pshufb XMMTMP, MSG | 67 | pshufb XMMTMP, MSG |
63 | mova128 MSG, MSGTMP0 | 68 | mova128 MSG, MSGTMP0 |
64 | paddd 0*16-8*16(SHA256CONSTANTS), MSG | 69 | paddd 0*16-8*16(SHA256CONSTANTS), MSG |
65 | sha256rnds2 STATE0, STATE1 | 70 | sha256rnds2 MSG, STATE0, STATE1 |
66 | shuf128_32 $0x0E, MSG, MSG | 71 | shuf128_32 $0x0E, MSG, MSG |
67 | sha256rnds2 STATE1, STATE0 | 72 | sha256rnds2 MSG, STATE1, STATE0 |
68 | 73 | ||
69 | /* Rounds 4-7 */ | 74 | /* Rounds 4-7 */ |
70 | movu128 1*16(DATA_PTR), MSG | 75 | movu128 1*16(DATA_PTR), MSG |
71 | pshufb XMMTMP, MSG | 76 | pshufb XMMTMP, MSG |
72 | mova128 MSG, MSGTMP1 | 77 | mova128 MSG, MSGTMP1 |
73 | paddd 1*16-8*16(SHA256CONSTANTS), MSG | 78 | paddd 1*16-8*16(SHA256CONSTANTS), MSG |
74 | sha256rnds2 STATE0, STATE1 | 79 | sha256rnds2 MSG, STATE0, STATE1 |
75 | shuf128_32 $0x0E, MSG, MSG | 80 | shuf128_32 $0x0E, MSG, MSG |
76 | sha256rnds2 STATE1, STATE0 | 81 | sha256rnds2 MSG, STATE1, STATE0 |
77 | sha256msg1 MSGTMP1, MSGTMP0 | 82 | sha256msg1 MSGTMP1, MSGTMP0 |
78 | 83 | ||
79 | /* Rounds 8-11 */ | 84 | /* Rounds 8-11 */ |
@@ -81,9 +86,9 @@ sha256_process_block64_shaNI: | |||
81 | pshufb XMMTMP, MSG | 86 | pshufb XMMTMP, MSG |
82 | mova128 MSG, MSGTMP2 | 87 | mova128 MSG, MSGTMP2 |
83 | paddd 2*16-8*16(SHA256CONSTANTS), MSG | 88 | paddd 2*16-8*16(SHA256CONSTANTS), MSG |
84 | sha256rnds2 STATE0, STATE1 | 89 | sha256rnds2 MSG, STATE0, STATE1 |
85 | shuf128_32 $0x0E, MSG, MSG | 90 | shuf128_32 $0x0E, MSG, MSG |
86 | sha256rnds2 STATE1, STATE0 | 91 | sha256rnds2 MSG, STATE1, STATE0 |
87 | sha256msg1 MSGTMP2, MSGTMP1 | 92 | sha256msg1 MSGTMP2, MSGTMP1 |
88 | 93 | ||
89 | /* Rounds 12-15 */ | 94 | /* Rounds 12-15 */ |
@@ -92,164 +97,162 @@ sha256_process_block64_shaNI: | |||
92 | /* ...to here */ | 97 | /* ...to here */ |
93 | mova128 MSG, MSGTMP3 | 98 | mova128 MSG, MSGTMP3 |
94 | paddd 3*16-8*16(SHA256CONSTANTS), MSG | 99 | paddd 3*16-8*16(SHA256CONSTANTS), MSG |
95 | sha256rnds2 STATE0, STATE1 | 100 | sha256rnds2 MSG, STATE0, STATE1 |
96 | mova128 MSGTMP3, XMMTMP | 101 | mova128 MSGTMP3, XMMTMP |
97 | palignr $4, MSGTMP2, XMMTMP | 102 | palignr $4, MSGTMP2, XMMTMP |
98 | paddd XMMTMP, MSGTMP0 | 103 | paddd XMMTMP, MSGTMP0 |
99 | sha256msg2 MSGTMP3, MSGTMP0 | 104 | sha256msg2 MSGTMP3, MSGTMP0 |
100 | shuf128_32 $0x0E, MSG, MSG | 105 | shuf128_32 $0x0E, MSG, MSG |
101 | sha256rnds2 STATE1, STATE0 | 106 | sha256rnds2 MSG, STATE1, STATE0 |
102 | sha256msg1 MSGTMP3, MSGTMP2 | 107 | sha256msg1 MSGTMP3, MSGTMP2 |
103 | 108 | ||
104 | /* Rounds 16-19 */ | 109 | /* Rounds 16-19 */ |
105 | mova128 MSGTMP0, MSG | 110 | mova128 MSGTMP0, MSG |
106 | paddd 4*16-8*16(SHA256CONSTANTS), MSG | 111 | paddd 4*16-8*16(SHA256CONSTANTS), MSG |
107 | sha256rnds2 STATE0, STATE1 | 112 | sha256rnds2 MSG, STATE0, STATE1 |
108 | mova128 MSGTMP0, XMMTMP | 113 | mova128 MSGTMP0, XMMTMP |
109 | palignr $4, MSGTMP3, XMMTMP | 114 | palignr $4, MSGTMP3, XMMTMP |
110 | paddd XMMTMP, MSGTMP1 | 115 | paddd XMMTMP, MSGTMP1 |
111 | sha256msg2 MSGTMP0, MSGTMP1 | 116 | sha256msg2 MSGTMP0, MSGTMP1 |
112 | shuf128_32 $0x0E, MSG, MSG | 117 | shuf128_32 $0x0E, MSG, MSG |
113 | sha256rnds2 STATE1, STATE0 | 118 | sha256rnds2 MSG, STATE1, STATE0 |
114 | sha256msg1 MSGTMP0, MSGTMP3 | 119 | sha256msg1 MSGTMP0, MSGTMP3 |
115 | 120 | ||
116 | /* Rounds 20-23 */ | 121 | /* Rounds 20-23 */ |
117 | mova128 MSGTMP1, MSG | 122 | mova128 MSGTMP1, MSG |
118 | paddd 5*16-8*16(SHA256CONSTANTS), MSG | 123 | paddd 5*16-8*16(SHA256CONSTANTS), MSG |
119 | sha256rnds2 STATE0, STATE1 | 124 | sha256rnds2 MSG, STATE0, STATE1 |
120 | mova128 MSGTMP1, XMMTMP | 125 | mova128 MSGTMP1, XMMTMP |
121 | palignr $4, MSGTMP0, XMMTMP | 126 | palignr $4, MSGTMP0, XMMTMP |
122 | paddd XMMTMP, MSGTMP2 | 127 | paddd XMMTMP, MSGTMP2 |
123 | sha256msg2 MSGTMP1, MSGTMP2 | 128 | sha256msg2 MSGTMP1, MSGTMP2 |
124 | shuf128_32 $0x0E, MSG, MSG | 129 | shuf128_32 $0x0E, MSG, MSG |
125 | sha256rnds2 STATE1, STATE0 | 130 | sha256rnds2 MSG, STATE1, STATE0 |
126 | sha256msg1 MSGTMP1, MSGTMP0 | 131 | sha256msg1 MSGTMP1, MSGTMP0 |
127 | 132 | ||
128 | /* Rounds 24-27 */ | 133 | /* Rounds 24-27 */ |
129 | mova128 MSGTMP2, MSG | 134 | mova128 MSGTMP2, MSG |
130 | paddd 6*16-8*16(SHA256CONSTANTS), MSG | 135 | paddd 6*16-8*16(SHA256CONSTANTS), MSG |
131 | sha256rnds2 STATE0, STATE1 | 136 | sha256rnds2 MSG, STATE0, STATE1 |
132 | mova128 MSGTMP2, XMMTMP | 137 | mova128 MSGTMP2, XMMTMP |
133 | palignr $4, MSGTMP1, XMMTMP | 138 | palignr $4, MSGTMP1, XMMTMP |
134 | paddd XMMTMP, MSGTMP3 | 139 | paddd XMMTMP, MSGTMP3 |
135 | sha256msg2 MSGTMP2, MSGTMP3 | 140 | sha256msg2 MSGTMP2, MSGTMP3 |
136 | shuf128_32 $0x0E, MSG, MSG | 141 | shuf128_32 $0x0E, MSG, MSG |
137 | sha256rnds2 STATE1, STATE0 | 142 | sha256rnds2 MSG, STATE1, STATE0 |
138 | sha256msg1 MSGTMP2, MSGTMP1 | 143 | sha256msg1 MSGTMP2, MSGTMP1 |
139 | 144 | ||
140 | /* Rounds 28-31 */ | 145 | /* Rounds 28-31 */ |
141 | mova128 MSGTMP3, MSG | 146 | mova128 MSGTMP3, MSG |
142 | paddd 7*16-8*16(SHA256CONSTANTS), MSG | 147 | paddd 7*16-8*16(SHA256CONSTANTS), MSG |
143 | sha256rnds2 STATE0, STATE1 | 148 | sha256rnds2 MSG, STATE0, STATE1 |
144 | mova128 MSGTMP3, XMMTMP | 149 | mova128 MSGTMP3, XMMTMP |
145 | palignr $4, MSGTMP2, XMMTMP | 150 | palignr $4, MSGTMP2, XMMTMP |
146 | paddd XMMTMP, MSGTMP0 | 151 | paddd XMMTMP, MSGTMP0 |
147 | sha256msg2 MSGTMP3, MSGTMP0 | 152 | sha256msg2 MSGTMP3, MSGTMP0 |
148 | shuf128_32 $0x0E, MSG, MSG | 153 | shuf128_32 $0x0E, MSG, MSG |
149 | sha256rnds2 STATE1, STATE0 | 154 | sha256rnds2 MSG, STATE1, STATE0 |
150 | sha256msg1 MSGTMP3, MSGTMP2 | 155 | sha256msg1 MSGTMP3, MSGTMP2 |
151 | 156 | ||
152 | /* Rounds 32-35 */ | 157 | /* Rounds 32-35 */ |
153 | mova128 MSGTMP0, MSG | 158 | mova128 MSGTMP0, MSG |
154 | paddd 8*16-8*16(SHA256CONSTANTS), MSG | 159 | paddd 8*16-8*16(SHA256CONSTANTS), MSG |
155 | sha256rnds2 STATE0, STATE1 | 160 | sha256rnds2 MSG, STATE0, STATE1 |
156 | mova128 MSGTMP0, XMMTMP | 161 | mova128 MSGTMP0, XMMTMP |
157 | palignr $4, MSGTMP3, XMMTMP | 162 | palignr $4, MSGTMP3, XMMTMP |
158 | paddd XMMTMP, MSGTMP1 | 163 | paddd XMMTMP, MSGTMP1 |
159 | sha256msg2 MSGTMP0, MSGTMP1 | 164 | sha256msg2 MSGTMP0, MSGTMP1 |
160 | shuf128_32 $0x0E, MSG, MSG | 165 | shuf128_32 $0x0E, MSG, MSG |
161 | sha256rnds2 STATE1, STATE0 | 166 | sha256rnds2 MSG, STATE1, STATE0 |
162 | sha256msg1 MSGTMP0, MSGTMP3 | 167 | sha256msg1 MSGTMP0, MSGTMP3 |
163 | 168 | ||
164 | /* Rounds 36-39 */ | 169 | /* Rounds 36-39 */ |
165 | mova128 MSGTMP1, MSG | 170 | mova128 MSGTMP1, MSG |
166 | paddd 9*16-8*16(SHA256CONSTANTS), MSG | 171 | paddd 9*16-8*16(SHA256CONSTANTS), MSG |
167 | sha256rnds2 STATE0, STATE1 | 172 | sha256rnds2 MSG, STATE0, STATE1 |
168 | mova128 MSGTMP1, XMMTMP | 173 | mova128 MSGTMP1, XMMTMP |
169 | palignr $4, MSGTMP0, XMMTMP | 174 | palignr $4, MSGTMP0, XMMTMP |
170 | paddd XMMTMP, MSGTMP2 | 175 | paddd XMMTMP, MSGTMP2 |
171 | sha256msg2 MSGTMP1, MSGTMP2 | 176 | sha256msg2 MSGTMP1, MSGTMP2 |
172 | shuf128_32 $0x0E, MSG, MSG | 177 | shuf128_32 $0x0E, MSG, MSG |
173 | sha256rnds2 STATE1, STATE0 | 178 | sha256rnds2 MSG, STATE1, STATE0 |
174 | sha256msg1 MSGTMP1, MSGTMP0 | 179 | sha256msg1 MSGTMP1, MSGTMP0 |
175 | 180 | ||
176 | /* Rounds 40-43 */ | 181 | /* Rounds 40-43 */ |
177 | mova128 MSGTMP2, MSG | 182 | mova128 MSGTMP2, MSG |
178 | paddd 10*16-8*16(SHA256CONSTANTS), MSG | 183 | paddd 10*16-8*16(SHA256CONSTANTS), MSG |
179 | sha256rnds2 STATE0, STATE1 | 184 | sha256rnds2 MSG, STATE0, STATE1 |
180 | mova128 MSGTMP2, XMMTMP | 185 | mova128 MSGTMP2, XMMTMP |
181 | palignr $4, MSGTMP1, XMMTMP | 186 | palignr $4, MSGTMP1, XMMTMP |
182 | paddd XMMTMP, MSGTMP3 | 187 | paddd XMMTMP, MSGTMP3 |
183 | sha256msg2 MSGTMP2, MSGTMP3 | 188 | sha256msg2 MSGTMP2, MSGTMP3 |
184 | shuf128_32 $0x0E, MSG, MSG | 189 | shuf128_32 $0x0E, MSG, MSG |
185 | sha256rnds2 STATE1, STATE0 | 190 | sha256rnds2 MSG, STATE1, STATE0 |
186 | sha256msg1 MSGTMP2, MSGTMP1 | 191 | sha256msg1 MSGTMP2, MSGTMP1 |
187 | 192 | ||
188 | /* Rounds 44-47 */ | 193 | /* Rounds 44-47 */ |
189 | mova128 MSGTMP3, MSG | 194 | mova128 MSGTMP3, MSG |
190 | paddd 11*16-8*16(SHA256CONSTANTS), MSG | 195 | paddd 11*16-8*16(SHA256CONSTANTS), MSG |
191 | sha256rnds2 STATE0, STATE1 | 196 | sha256rnds2 MSG, STATE0, STATE1 |
192 | mova128 MSGTMP3, XMMTMP | 197 | mova128 MSGTMP3, XMMTMP |
193 | palignr $4, MSGTMP2, XMMTMP | 198 | palignr $4, MSGTMP2, XMMTMP |
194 | paddd XMMTMP, MSGTMP0 | 199 | paddd XMMTMP, MSGTMP0 |
195 | sha256msg2 MSGTMP3, MSGTMP0 | 200 | sha256msg2 MSGTMP3, MSGTMP0 |
196 | shuf128_32 $0x0E, MSG, MSG | 201 | shuf128_32 $0x0E, MSG, MSG |
197 | sha256rnds2 STATE1, STATE0 | 202 | sha256rnds2 MSG, STATE1, STATE0 |
198 | sha256msg1 MSGTMP3, MSGTMP2 | 203 | sha256msg1 MSGTMP3, MSGTMP2 |
199 | 204 | ||
200 | /* Rounds 48-51 */ | 205 | /* Rounds 48-51 */ |
201 | mova128 MSGTMP0, MSG | 206 | mova128 MSGTMP0, MSG |
202 | paddd 12*16-8*16(SHA256CONSTANTS), MSG | 207 | paddd 12*16-8*16(SHA256CONSTANTS), MSG |
203 | sha256rnds2 STATE0, STATE1 | 208 | sha256rnds2 MSG, STATE0, STATE1 |
204 | mova128 MSGTMP0, XMMTMP | 209 | mova128 MSGTMP0, XMMTMP |
205 | palignr $4, MSGTMP3, XMMTMP | 210 | palignr $4, MSGTMP3, XMMTMP |
206 | paddd XMMTMP, MSGTMP1 | 211 | paddd XMMTMP, MSGTMP1 |
207 | sha256msg2 MSGTMP0, MSGTMP1 | 212 | sha256msg2 MSGTMP0, MSGTMP1 |
208 | shuf128_32 $0x0E, MSG, MSG | 213 | shuf128_32 $0x0E, MSG, MSG |
209 | sha256rnds2 STATE1, STATE0 | 214 | sha256rnds2 MSG, STATE1, STATE0 |
210 | sha256msg1 MSGTMP0, MSGTMP3 | 215 | sha256msg1 MSGTMP0, MSGTMP3 |
211 | 216 | ||
212 | /* Rounds 52-55 */ | 217 | /* Rounds 52-55 */ |
213 | mova128 MSGTMP1, MSG | 218 | mova128 MSGTMP1, MSG |
214 | paddd 13*16-8*16(SHA256CONSTANTS), MSG | 219 | paddd 13*16-8*16(SHA256CONSTANTS), MSG |
215 | sha256rnds2 STATE0, STATE1 | 220 | sha256rnds2 MSG, STATE0, STATE1 |
216 | mova128 MSGTMP1, XMMTMP | 221 | mova128 MSGTMP1, XMMTMP |
217 | palignr $4, MSGTMP0, XMMTMP | 222 | palignr $4, MSGTMP0, XMMTMP |
218 | paddd XMMTMP, MSGTMP2 | 223 | paddd XMMTMP, MSGTMP2 |
219 | sha256msg2 MSGTMP1, MSGTMP2 | 224 | sha256msg2 MSGTMP1, MSGTMP2 |
220 | shuf128_32 $0x0E, MSG, MSG | 225 | shuf128_32 $0x0E, MSG, MSG |
221 | sha256rnds2 STATE1, STATE0 | 226 | sha256rnds2 MSG, STATE1, STATE0 |
222 | 227 | ||
223 | /* Rounds 56-59 */ | 228 | /* Rounds 56-59 */ |
224 | mova128 MSGTMP2, MSG | 229 | mova128 MSGTMP2, MSG |
225 | paddd 14*16-8*16(SHA256CONSTANTS), MSG | 230 | paddd 14*16-8*16(SHA256CONSTANTS), MSG |
226 | sha256rnds2 STATE0, STATE1 | 231 | sha256rnds2 MSG, STATE0, STATE1 |
227 | mova128 MSGTMP2, XMMTMP | 232 | mova128 MSGTMP2, XMMTMP |
228 | palignr $4, MSGTMP1, XMMTMP | 233 | palignr $4, MSGTMP1, XMMTMP |
229 | paddd XMMTMP, MSGTMP3 | 234 | paddd XMMTMP, MSGTMP3 |
230 | sha256msg2 MSGTMP2, MSGTMP3 | 235 | sha256msg2 MSGTMP2, MSGTMP3 |
231 | shuf128_32 $0x0E, MSG, MSG | 236 | shuf128_32 $0x0E, MSG, MSG |
232 | sha256rnds2 STATE1, STATE0 | 237 | sha256rnds2 MSG, STATE1, STATE0 |
233 | 238 | ||
234 | /* Rounds 60-63 */ | 239 | /* Rounds 60-63 */ |
235 | mova128 MSGTMP3, MSG | 240 | mova128 MSGTMP3, MSG |
236 | paddd 15*16-8*16(SHA256CONSTANTS), MSG | 241 | paddd 15*16-8*16(SHA256CONSTANTS), MSG |
237 | sha256rnds2 STATE0, STATE1 | 242 | sha256rnds2 MSG, STATE0, STATE1 |
238 | shuf128_32 $0x0E, MSG, MSG | 243 | shuf128_32 $0x0E, MSG, MSG |
239 | sha256rnds2 STATE1, STATE0 | 244 | sha256rnds2 MSG, STATE1, STATE0 |
240 | 245 | ||
241 | /* Add current hash values with previously saved */ | 246 | /* Add current hash values with previously saved */ |
242 | paddd ABEF_SAVE, STATE0 | 247 | paddd SAVE0, STATE0 |
243 | paddd CDGH_SAVE, STATE1 | 248 | paddd SAVE1, STATE1 |
244 | 249 | ||
245 | /* Write hash values back in the correct order */ | 250 | /* Write hash values back in the correct order */ |
246 | /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */ | ||
247 | /* STATE1: CDGH */ | ||
248 | mova128 STATE0, XMMTMP | 251 | mova128 STATE0, XMMTMP |
249 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ | 252 | /* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */ |
250 | shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */ | 253 | /* --- -------------- HGDC -- FEBA */ |
251 | shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */ | 254 | shufps SHUF(3,2,3,2), STATE1, STATE0 /* ABCD */ |
252 | 255 | shufps SHUF(1,0,1,0), STATE1, XMMTMP /* EFGH */ | |
253 | movu128 STATE0, 80+0*16(%rdi) | 256 | movu128 STATE0, 80+0*16(%rdi) |
254 | movu128 XMMTMP, 80+1*16(%rdi) | 257 | movu128 XMMTMP, 80+1*16(%rdi) |
255 | 258 | ||
@@ -257,7 +260,7 @@ sha256_process_block64_shaNI: | |||
257 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI | 260 | .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI |
258 | 261 | ||
259 | .section .rodata.cst256.K256, "aM", @progbits, 256 | 262 | .section .rodata.cst256.K256, "aM", @progbits, 256 |
260 | .balign 16 | 263 | .balign 16 |
261 | K256: | 264 | K256: |
262 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 | 265 | .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 |
263 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 | 266 | .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 |
@@ -277,8 +280,8 @@ K256: | |||
277 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 | 280 | .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 |
278 | 281 | ||
279 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 | 282 | .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16 |
280 | .balign 16 | 283 | .balign 16 |
281 | PSHUFFLE_BSWAP32_FLIP_MASK: | 284 | PSHUFFLE_BSWAP32_FLIP_MASK: |
282 | .octa 0x0c0d0e0f08090a0b0405060700010203 | 285 | .octa 0x0c0d0e0f08090a0b0405060700010203 |
283 | 286 | ||
284 | #endif | 287 | #endif |
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S index a61b3cbed..2366b046a 100644 --- a/libbb/hash_md5_sha_x86-32_shaNI.S +++ b/libbb/hash_md5_sha_x86-32_shaNI.S | |||
@@ -4,7 +4,7 @@ | |||
4 | // We use shorter insns, even though they are for "wrong" | 4 | // We use shorter insns, even though they are for "wrong" |
5 | // data type (fp, not int). | 5 | // data type (fp, not int). |
6 | // For Intel, there is no penalty for doing it at all | 6 | // For Intel, there is no penalty for doing it at all |
7 | // (CPUs which do have such penalty do not support SHA1 insns). | 7 | // (CPUs which do have such penalty do not support SHA insns). |
8 | // For AMD, the penalty is one extra cycle | 8 | // For AMD, the penalty is one extra cycle |
9 | // (allegedly: I failed to find measurable difference). | 9 | // (allegedly: I failed to find measurable difference). |
10 | 10 | ||
@@ -20,6 +20,11 @@ | |||
20 | #define extr128_32 pextrd | 20 | #define extr128_32 pextrd |
21 | //#define extr128_32 extractps # not shorter | 21 | //#define extr128_32 extractps # not shorter |
22 | 22 | ||
23 | // pshufb is a SSSE3 insn. | ||
24 | // pinsrd, pextrd, extractps are SSE4.1 insns. | ||
25 | // We do not check SSSE3/SSE4.1 in cpuid, | ||
26 | // all SHA-capable CPUs support them as well. | ||
27 | |||
23 | .section .text.sha1_process_block64_shaNI, "ax", @progbits | 28 | .section .text.sha1_process_block64_shaNI, "ax", @progbits |
24 | .globl sha1_process_block64_shaNI | 29 | .globl sha1_process_block64_shaNI |
25 | .hidden sha1_process_block64_shaNI | 30 | .hidden sha1_process_block64_shaNI |
@@ -219,8 +224,8 @@ sha1_process_block64_shaNI: | |||
219 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | 224 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI |
220 | 225 | ||
221 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | 226 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 |
222 | .balign 16 | 227 | .balign 16 |
223 | PSHUFFLE_BYTE_FLIP_MASK: | 228 | PSHUFFLE_BYTE_FLIP_MASK: |
224 | .octa 0x000102030405060708090a0b0c0d0e0f | 229 | .octa 0x000102030405060708090a0b0c0d0e0f |
225 | 230 | ||
226 | #endif | 231 | #endif |
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S index 287cfe547..1d55b91f8 100644 --- a/libbb/hash_md5_sha_x86-64.S +++ b/libbb/hash_md5_sha_x86-64.S | |||
@@ -36,7 +36,7 @@ sha1_process_block64: | |||
36 | movaps sha1const(%rip), %xmm7 | 36 | movaps sha1const(%rip), %xmm7 |
37 | pshufd $0x00, %xmm7, %xmm6 | 37 | pshufd $0x00, %xmm7, %xmm6 |
38 | 38 | ||
39 | # Load W[] to xmm registers, byteswapping on the fly. | 39 | # Load W[] to xmm0..3, byteswapping on the fly. |
40 | # | 40 | # |
41 | # For iterations 0..15, we pass W[] in rsi,r8..r14 | 41 | # For iterations 0..15, we pass W[] in rsi,r8..r14 |
42 | # for use in RD1As instead of spilling them to stack. | 42 | # for use in RD1As instead of spilling them to stack. |
@@ -71,8 +71,8 @@ sha1_process_block64: | |||
71 | movq 4*10(%rdi), %r12 | 71 | movq 4*10(%rdi), %r12 |
72 | bswapq %r11 | 72 | bswapq %r11 |
73 | bswapq %r12 | 73 | bswapq %r12 |
74 | rolq $32, %r11 # r11 = W[9]:W[8] | 74 | rolq $32, %r11 # r11 = W[9]:W[8] |
75 | rolq $32, %r12 # r12 = W[11]:W[10] | 75 | rolq $32, %r12 # r12 = W[11]:W[10] |
76 | movq %r11, %xmm2 | 76 | movq %r11, %xmm2 |
77 | movq %r12, %xmm4 | 77 | movq %r12, %xmm4 |
78 | punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) | 78 | punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) |
@@ -81,8 +81,8 @@ sha1_process_block64: | |||
81 | movq 4*14(%rdi), %r14 | 81 | movq 4*14(%rdi), %r14 |
82 | bswapq %r13 | 82 | bswapq %r13 |
83 | bswapq %r14 | 83 | bswapq %r14 |
84 | rolq $32, %r13 # r13 = W[13]:W[12] | 84 | rolq $32, %r13 # r13 = W[13]:W[12] |
85 | rolq $32, %r14 # r14 = W[15]:W[14] | 85 | rolq $32, %r14 # r14 = W[15]:W[14] |
86 | movq %r13, %xmm3 | 86 | movq %r13, %xmm3 |
87 | movq %r14, %xmm4 | 87 | movq %r14, %xmm4 |
88 | punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) | 88 | punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) |
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh index a10ac411d..40c979d35 100755 --- a/libbb/hash_md5_sha_x86-64.S.sh +++ b/libbb/hash_md5_sha_x86-64.S.sh | |||
@@ -99,6 +99,30 @@ INTERLEAVE() { | |||
99 | ) | 99 | ) |
100 | } | 100 | } |
101 | 101 | ||
102 | # movaps bswap32_mask(%rip), $xmmT1 | ||
103 | # Load W[] to xmm0..3, byteswapping on the fly. | ||
104 | # For iterations 0..15, we pass RCONST+W[] in rsi,r8..r14 | ||
105 | # for use in RD1As instead of spilling them to stack. | ||
106 | # (We use rsi instead of rN because this makes two | ||
107 | # ADDs in two first RD1As shorter by one byte). | ||
108 | # movups 16*0(%rdi), %xmm0 | ||
109 | # pshufb $xmmT1, %xmm0 #SSSE3 insn | ||
110 | # movaps %xmm0, $xmmT2 | ||
111 | # paddd $xmmRCONST, $xmmT2 | ||
112 | # movq $xmmT2, %rsi | ||
113 | # #pextrq \$1, $xmmT2, %r8 #SSE4.1 insn | ||
114 | # #movhpd $xmmT2, %r8 #can only move to mem, not to reg | ||
115 | # shufps \$0x0e, $xmmT2, $xmmT2 # have to use two-insn sequence | ||
116 | # movq $xmmT2, %r8 # instead | ||
117 | # ... | ||
118 | # <repeat for xmm1,2,3> | ||
119 | # ... | ||
120 | #- leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n] | ||
121 | #+ addl %esi, %e$e # e += RCONST + W[n] | ||
122 | # ^^^^^^^^^^^^^^^^^^^^^^^^ | ||
123 | # The above is -97 bytes of code... | ||
124 | # ...but pshufb is a SSSE3 insn. Can't use it. | ||
125 | |||
102 | echo \ | 126 | echo \ |
103 | "### Generated by hash_md5_sha_x86-64.S.sh ### | 127 | "### Generated by hash_md5_sha_x86-64.S.sh ### |
104 | 128 | ||
@@ -138,7 +162,7 @@ sha1_process_block64: | |||
138 | movaps sha1const(%rip), $xmmALLRCONST | 162 | movaps sha1const(%rip), $xmmALLRCONST |
139 | pshufd \$0x00, $xmmALLRCONST, $xmmRCONST | 163 | pshufd \$0x00, $xmmALLRCONST, $xmmRCONST |
140 | 164 | ||
141 | # Load W[] to xmm registers, byteswapping on the fly. | 165 | # Load W[] to xmm0..3, byteswapping on the fly. |
142 | # | 166 | # |
143 | # For iterations 0..15, we pass W[] in rsi,r8..r14 | 167 | # For iterations 0..15, we pass W[] in rsi,r8..r14 |
144 | # for use in RD1As instead of spilling them to stack. | 168 | # for use in RD1As instead of spilling them to stack. |
@@ -173,8 +197,8 @@ sha1_process_block64: | |||
173 | movq 4*10(%rdi), %r12 | 197 | movq 4*10(%rdi), %r12 |
174 | bswapq %r11 | 198 | bswapq %r11 |
175 | bswapq %r12 | 199 | bswapq %r12 |
176 | rolq \$32, %r11 # r11 = W[9]:W[8] | 200 | rolq \$32, %r11 # r11 = W[9]:W[8] |
177 | rolq \$32, %r12 # r12 = W[11]:W[10] | 201 | rolq \$32, %r12 # r12 = W[11]:W[10] |
178 | movq %r11, %xmm2 | 202 | movq %r11, %xmm2 |
179 | movq %r12, $xmmT1 | 203 | movq %r12, $xmmT1 |
180 | punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) | 204 | punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11]) |
@@ -183,8 +207,8 @@ sha1_process_block64: | |||
183 | movq 4*14(%rdi), %r14 | 207 | movq 4*14(%rdi), %r14 |
184 | bswapq %r13 | 208 | bswapq %r13 |
185 | bswapq %r14 | 209 | bswapq %r14 |
186 | rolq \$32, %r13 # r13 = W[13]:W[12] | 210 | rolq \$32, %r13 # r13 = W[13]:W[12] |
187 | rolq \$32, %r14 # r14 = W[15]:W[14] | 211 | rolq \$32, %r14 # r14 = W[15]:W[14] |
188 | movq %r13, %xmm3 | 212 | movq %r13, %xmm3 |
189 | movq %r14, $xmmT1 | 213 | movq %r14, $xmmT1 |
190 | punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) | 214 | punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15]) |
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S index b32029360..794e97040 100644 --- a/libbb/hash_md5_sha_x86-64_shaNI.S +++ b/libbb/hash_md5_sha_x86-64_shaNI.S | |||
@@ -4,7 +4,7 @@ | |||
4 | // We use shorter insns, even though they are for "wrong" | 4 | // We use shorter insns, even though they are for "wrong" |
5 | // data type (fp, not int). | 5 | // data type (fp, not int). |
6 | // For Intel, there is no penalty for doing it at all | 6 | // For Intel, there is no penalty for doing it at all |
7 | // (CPUs which do have such penalty do not support SHA1 insns). | 7 | // (CPUs which do have such penalty do not support SHA insns). |
8 | // For AMD, the penalty is one extra cycle | 8 | // For AMD, the penalty is one extra cycle |
9 | // (allegedly: I failed to find measurable difference). | 9 | // (allegedly: I failed to find measurable difference). |
10 | 10 | ||
@@ -20,6 +20,11 @@ | |||
20 | #define extr128_32 pextrd | 20 | #define extr128_32 pextrd |
21 | //#define extr128_32 extractps # not shorter | 21 | //#define extr128_32 extractps # not shorter |
22 | 22 | ||
23 | // pshufb is a SSSE3 insn. | ||
24 | // pinsrd, pextrd, extractps are SSE4.1 insns. | ||
25 | // We do not check SSSE3/SSE4.1 in cpuid, | ||
26 | // all SHA-capable CPUs support them as well. | ||
27 | |||
23 | .section .text.sha1_process_block64_shaNI, "ax", @progbits | 28 | .section .text.sha1_process_block64_shaNI, "ax", @progbits |
24 | .globl sha1_process_block64_shaNI | 29 | .globl sha1_process_block64_shaNI |
25 | .hidden sha1_process_block64_shaNI | 30 | .hidden sha1_process_block64_shaNI |
@@ -217,8 +222,8 @@ sha1_process_block64_shaNI: | |||
217 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | 222 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI |
218 | 223 | ||
219 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | 224 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 |
220 | .balign 16 | 225 | .balign 16 |
221 | PSHUFFLE_BYTE_FLIP_MASK: | 226 | PSHUFFLE_BYTE_FLIP_MASK: |
222 | .octa 0x000102030405060708090a0b0c0d0e0f | 227 | .octa 0x000102030405060708090a0b0c0d0e0f |
223 | 228 | ||
224 | #endif | 229 | #endif |
diff --git a/shell/ash.c b/shell/ash.c index e8a1e853c..35aa3f6e6 100644 --- a/shell/ash.c +++ b/shell/ash.c | |||
@@ -7163,9 +7163,7 @@ exptilde(char *startp, int flag) | |||
7163 | home = lookupvar("HOME"); | 7163 | home = lookupvar("HOME"); |
7164 | } else { | 7164 | } else { |
7165 | pw = getpwnam(name); | 7165 | pw = getpwnam(name); |
7166 | if (pw == NULL) | 7166 | home = pw ? pw->pw_dir : NULL; |
7167 | goto lose; | ||
7168 | home = pw->pw_dir; | ||
7169 | } | 7167 | } |
7170 | *p = c; | 7168 | *p = c; |
7171 | if (!home) | 7169 | if (!home) |
@@ -7724,6 +7722,10 @@ subevalvar(char *start, char *str, int strloc, | |||
7724 | *repl = '\0'; | 7722 | *repl = '\0'; |
7725 | break; | 7723 | break; |
7726 | } | 7724 | } |
7725 | if ((unsigned char)*repl == CTLENDVAR) { /* ${v/pattern} (no trailing /, no repl) */ | ||
7726 | repl = NULL; | ||
7727 | break; | ||
7728 | } | ||
7727 | /* Handle escaped slashes, e.g. "${v/\//_}" (they are CTLESC'ed by this point) */ | 7729 | /* Handle escaped slashes, e.g. "${v/\//_}" (they are CTLESC'ed by this point) */ |
7728 | if ((unsigned char)*repl == CTLESC && repl[1]) | 7730 | if ((unsigned char)*repl == CTLESC && repl[1]) |
7729 | repl++; | 7731 | repl++; |
@@ -7830,7 +7832,13 @@ subevalvar(char *start, char *str, int strloc, | |||
7830 | len = orig_len - pos; | 7832 | len = orig_len - pos; |
7831 | 7833 | ||
7832 | if (!quotes) { | 7834 | if (!quotes) { |
7833 | loc = mempcpy(startp, startp + pos, len); | 7835 | /* want: loc = mempcpy(startp, startp + pos, len) |
7836 | * but it does not allow overlapping arguments */ | ||
7837 | loc = startp; | ||
7838 | while (--len >= 0) { | ||
7839 | *loc = loc[pos]; | ||
7840 | loc++; | ||
7841 | } | ||
7834 | } else { | 7842 | } else { |
7835 | for (vstr = startp; pos != 0; pos--) { | 7843 | for (vstr = startp; pos != 0; pos--) { |
7836 | if ((unsigned char)*vstr == CTLESC) | 7844 | if ((unsigned char)*vstr == CTLESC) |
diff --git a/shell/ash_test/ash-vars/var_bash_repl_unterminated.right b/shell/ash_test/ash-vars/var_bash_repl_unterminated.right new file mode 100644 index 000000000..5bff3a6fa --- /dev/null +++ b/shell/ash_test/ash-vars/var_bash_repl_unterminated.right | |||
@@ -0,0 +1 @@ | |||
b/d | |||
diff --git a/shell/ash_test/ash-vars/var_bash_repl_unterminated.tests b/shell/ash_test/ash-vars/var_bash_repl_unterminated.tests new file mode 100755 index 000000000..c9513343d --- /dev/null +++ b/shell/ash_test/ash-vars/var_bash_repl_unterminated.tests | |||
@@ -0,0 +1,2 @@ | |||
1 | a=b-c | ||
2 | echo ${a/-*}/d | ||
diff --git a/shell/hush_test/hush-vars/var_bash_repl_unterminated.right b/shell/hush_test/hush-vars/var_bash_repl_unterminated.right new file mode 100644 index 000000000..5bff3a6fa --- /dev/null +++ b/shell/hush_test/hush-vars/var_bash_repl_unterminated.right | |||
@@ -0,0 +1 @@ | |||
b/d | |||
diff --git a/shell/hush_test/hush-vars/var_bash_repl_unterminated.tests b/shell/hush_test/hush-vars/var_bash_repl_unterminated.tests new file mode 100755 index 000000000..c9513343d --- /dev/null +++ b/shell/hush_test/hush-vars/var_bash_repl_unterminated.tests | |||
@@ -0,0 +1,2 @@ | |||
1 | a=b-c | ||
2 | echo ${a/-*}/d | ||
diff --git a/util-linux/taskset.c b/util-linux/taskset.c index d2ef9b98f..8b410f369 100644 --- a/util-linux/taskset.c +++ b/util-linux/taskset.c | |||
@@ -55,7 +55,6 @@ | |||
55 | * Not yet implemented: | 55 | * Not yet implemented: |
56 | * -a/--all-tasks (affect all threads) | 56 | * -a/--all-tasks (affect all threads) |
57 | * needs to get TIDs from /proc/PID/task/ and use _them_ as "pid" in sched_setaffinity(pid) | 57 | * needs to get TIDs from /proc/PID/task/ and use _them_ as "pid" in sched_setaffinity(pid) |
58 | * -c/--cpu-list (specify CPUs via "1,3,5-7") | ||
59 | */ | 58 | */ |
60 | 59 | ||
61 | #include <sched.h> | 60 | #include <sched.h> |
@@ -91,7 +90,7 @@ static char *from_mask(const ul *mask, unsigned sz_in_bytes) | |||
91 | } | 90 | } |
92 | #else | 91 | #else |
93 | #define TASKSET_PRINTF_MASK "%lx" | 92 | #define TASKSET_PRINTF_MASK "%lx" |
94 | static unsigned long long from_mask(ul *mask, unsigned sz_in_bytes UNUSED_PARAM) | 93 | static unsigned long from_mask(ul *mask, unsigned sz_in_bytes UNUSED_PARAM) |
95 | { | 94 | { |
96 | return *mask; | 95 | return *mask; |
97 | } | 96 | } |