author     Ron Yorston <rmy@pobox.com>  2022-02-09 09:03:18 +0000
committer  Ron Yorston <rmy@pobox.com>  2022-02-09 09:05:39 +0000
commit     492d0a7492a57fe8f02c766e25960b0ce0d88759 (patch)
tree       4f5764a5c2250c031ea05e9aeacbb40d7971f493 /libbb
parent     4734416a21312488a5099a297907783bee4ccc22 (diff)
parent     caa9c4f707b661cf398f2c2d66f54f5b0d8adfe2 (diff)
download   busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.tar.gz
           busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.tar.bz2
           busybox-w32-492d0a7492a57fe8f02c766e25960b0ce0d88759.zip
Merge busybox into merge
Fix conflicts in reset and ash. Redefine the new safe_read_key() as a reference to read_key(). Disable SHA256_HWACCEL.
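
The safe_read_key() resolution described above can be pictured with a short sketch. This is an illustration only, assuming the read_key() prototype from libbb/read_key.c; it is not the actual busybox-w32 change:

    /* Hypothetical sketch: let callers of the new upstream name
     * safe_read_key() fall through to the existing read_key().
     * Prototype assumed to match libbb/read_key.c. */
    #include <stdint.h>

    int64_t read_key(int fd, char *buffer, int timeout);

    #define safe_read_key(fd, buffer, timeout) read_key(fd, buffer, timeout)
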
Diffstat (limited to 'libbb')
-rw-r--r--  libbb/Config.src                     |    6
-rw-r--r--  libbb/Kbuild.src                     |    3
-rw-r--r--  libbb/appletlib.c                    |    2
-rw-r--r--  libbb/get_console.c                  |    2
-rw-r--r--  libbb/getopt32.c                     |    2
-rw-r--r--  libbb/hash_md5_sha.c                 |   54
-rw-r--r--  libbb/hash_md5_sha256_x86-32_shaNI.S |  277
-rw-r--r--  libbb/hash_md5_sha256_x86-64_shaNI.S |  284
-rw-r--r--  libbb/hash_md5_sha_x86-32_shaNI.S    |   47
-rw-r--r--  libbb/hash_md5_sha_x86-64.S          | 1177
-rwxr-xr-x  libbb/hash_md5_sha_x86-64.S.sh       |  460
-rw-r--r--  libbb/hash_md5_sha_x86-64_shaNI.S    |   33
-rw-r--r--  libbb/lineedit.c                     |   34
-rw-r--r--  libbb/read_key.c                     |   17
-rw-r--r--  libbb/setup_environment.c            |    8
-rw-r--r--  libbb/xfuncs_printf.c                |   11
16 files changed, 1692 insertions, 725 deletions
diff --git a/libbb/Config.src b/libbb/Config.src
index 708d3b0c8..0ecd5bd46 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -70,6 +70,12 @@ config SHA1_HWACCEL
70 On x86, this adds ~590 bytes of code. Throughput 70 On x86, this adds ~590 bytes of code. Throughput
71 is about twice as fast as fully-unrolled generic code. 71 is about twice as fast as fully-unrolled generic code.
72 72
73config SHA256_HWACCEL
74 bool "SHA256: Use hardware accelerated instructions if possible"
75 default y
76 help
77 On x86, this adds ~1k bytes of code.
78
73config SHA3_SMALL 79config SHA3_SMALL
74 int "SHA3: Trade bytes for speed (0:fast, 1:slow)" 80 int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
75 default 1 # all "fast or small" options default to small 81 default 1 # all "fast or small" options default to small
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src
index 67d3c7cf7..191984c9d 100644
--- a/libbb/Kbuild.src
+++ b/libbb/Kbuild.src
@@ -48,6 +48,8 @@ lib-y += hash_md5_sha.o
48lib-y += hash_md5_sha_x86-64.o 48lib-y += hash_md5_sha_x86-64.o
49lib-y += hash_md5_sha_x86-64_shaNI.o 49lib-y += hash_md5_sha_x86-64_shaNI.o
50lib-y += hash_md5_sha_x86-32_shaNI.o 50lib-y += hash_md5_sha_x86-32_shaNI.o
51lib-y += hash_md5_sha256_x86-64_shaNI.o
52lib-y += hash_md5_sha256_x86-32_shaNI.o
51# Alternative (disabled) MD5 implementation 53# Alternative (disabled) MD5 implementation
52#lib-y += hash_md5prime.o 54#lib-y += hash_md5prime.o
53lib-y += messages.o 55lib-y += messages.o
@@ -204,6 +206,7 @@ lib-$(CONFIG_PGREP) += xregcomp.o
204lib-$(CONFIG_PKILL) += xregcomp.o 206lib-$(CONFIG_PKILL) += xregcomp.o
205lib-$(CONFIG_DEVFSD) += xregcomp.o 207lib-$(CONFIG_DEVFSD) += xregcomp.o
206lib-$(CONFIG_FEATURE_FIND_REGEX) += xregcomp.o 208lib-$(CONFIG_FEATURE_FIND_REGEX) += xregcomp.o
209lib-$(CONFIG_FEATURE_CUT_REGEX) += xregcomp.o
207 210
208# Add the experimental logging functionality, only used by zcip 211# Add the experimental logging functionality, only used by zcip
209lib-$(CONFIG_ZCIP) += logenv.o 212lib-$(CONFIG_ZCIP) += logenv.o
diff --git a/libbb/appletlib.c b/libbb/appletlib.c
index 6c0be4a83..a8b82e729 100644
--- a/libbb/appletlib.c
+++ b/libbb/appletlib.c
@@ -671,7 +671,7 @@ static void check_suid(int applet_no)
671# if ENABLE_FEATURE_INSTALLER 671# if ENABLE_FEATURE_INSTALLER
672static const char usr_bin [] ALIGN1 = "/usr/bin/"; 672static const char usr_bin [] ALIGN1 = "/usr/bin/";
673static const char usr_sbin[] ALIGN1 = "/usr/sbin/"; 673static const char usr_sbin[] ALIGN1 = "/usr/sbin/";
674static const char *const install_dir[] = { 674static const char *const install_dir[] ALIGN_PTR = {
675 &usr_bin [8], /* "/" */ 675 &usr_bin [8], /* "/" */
676 &usr_bin [4], /* "/bin/" */ 676 &usr_bin [4], /* "/bin/" */
677 &usr_sbin[4] /* "/sbin/" */ 677 &usr_sbin[4] /* "/sbin/" */
diff --git a/libbb/get_console.c b/libbb/get_console.c
index 7f2c75332..9044efea1 100644
--- a/libbb/get_console.c
+++ b/libbb/get_console.c
@@ -37,7 +37,7 @@ static int open_a_console(const char *fnam)
37 */ 37 */
38int FAST_FUNC get_console_fd_or_die(void) 38int FAST_FUNC get_console_fd_or_die(void)
39{ 39{
40 static const char *const console_names[] = { 40 static const char *const console_names[] ALIGN_PTR = {
41 DEV_CONSOLE, CURRENT_VC, CURRENT_TTY 41 DEV_CONSOLE, CURRENT_VC, CURRENT_TTY
42 }; 42 };
43 43
diff --git a/libbb/getopt32.c b/libbb/getopt32.c
index 5ab4d66f1..e861d0567 100644
--- a/libbb/getopt32.c
+++ b/libbb/getopt32.c
@@ -296,7 +296,7 @@ Special characters:
296 296
297/* Code here assumes that 'unsigned' is at least 32 bits wide */ 297/* Code here assumes that 'unsigned' is at least 32 bits wide */
298 298
299const char *const bb_argv_dash[] = { "-", NULL }; 299const char *const bb_argv_dash[] ALIGN_PTR = { "-", NULL };
300 300
301enum { 301enum {
302 PARAM_STRING, 302 PARAM_STRING,
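
The appletlib.c, get_console.c and getopt32.c hunks above all tag pointer tables with ALIGN_PTR. A minimal standalone sketch of the idea follows; the ALIGN1/ALIGN_PTR definitions here are assumptions about their intent (byte vs. pointer-sized alignment), not copies of busybox's include/platform.h:

    /* Sketch only: assumed macro definitions, GCC/Clang attribute syntax. */
    #include <stdio.h>

    #define ALIGN1    __attribute__((aligned(1)))
    #define ALIGN_PTR __attribute__((aligned(sizeof(void *))))

    static const char usr_bin[] ALIGN1 = "/usr/bin/";
    static const char *const install_dir[] ALIGN_PTR = {
        &usr_bin[8],  /* "/"     */
        &usr_bin[4],  /* "/bin/" */
    };

    int main(void)
    {
        printf("%s %s\n", install_dir[0], install_dir[1]);
        printf("alignment of install_dir: %zu\n", __alignof__(install_dir));
        return 0;
    }

The annotation makes the alignment of these pointer tables explicit, presumably so they cannot end up under-aligned when placed next to byte-aligned (ALIGN1) data.
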
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index a23db5152..880ffab01 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -13,6 +13,27 @@
13 13
14#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA) 14#define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
15 15
16#if ENABLE_SHA1_HWACCEL || ENABLE_SHA256_HWACCEL
17# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
18static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
19{
20 asm ("cpuid"
21 : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
22 : "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx)
23 );
24}
25static smallint shaNI;
26void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
27void FAST_FUNC sha256_process_block64_shaNI(sha256_ctx_t *ctx);
28# if defined(__i386__)
29struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 76)]; };
30# endif
31# if defined(__x86_64__)
32struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 80)]; };
33# endif
34# endif
35#endif
36
16/* gcc 4.2.1 optimizes rotr64 better with inline than with macro 37/* gcc 4.2.1 optimizes rotr64 better with inline than with macro
17 * (for rotX32, there is no difference). Why? My guess is that 38 * (for rotX32, there is no difference). Why? My guess is that
18 * macro requires clever common subexpression elimination heuristics 39 * macro requires clever common subexpression elimination heuristics
@@ -1142,25 +1163,6 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
1142} 1163}
1143#endif /* NEED_SHA512 */ 1164#endif /* NEED_SHA512 */
1144 1165
1145#if ENABLE_SHA1_HWACCEL
1146# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
1147static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
1148{
1149 asm ("cpuid"
1150 : "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
1151 : "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx)
1152 );
1153}
1154void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
1155# if defined(__i386__)
1156struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; };
1157# endif
1158# if defined(__x86_64__)
1159struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
1160# endif
1161# endif
1162#endif
1163
1164void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) 1166void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
1165{ 1167{
1166 ctx->hash[0] = 0x67452301; 1168 ctx->hash[0] = 0x67452301;
@@ -1173,7 +1175,6 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
1173#if ENABLE_SHA1_HWACCEL 1175#if ENABLE_SHA1_HWACCEL
1174# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) 1176# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
1175 { 1177 {
1176 static smallint shaNI;
1177 if (!shaNI) { 1178 if (!shaNI) {
1178 unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx; 1179 unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
1179 cpuid(&eax, &ebx, &ecx, &edx); 1180 cpuid(&eax, &ebx, &ecx, &edx);
@@ -1225,6 +1226,19 @@ void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
1225 memcpy(&ctx->total64, init256, sizeof(init256)); 1226 memcpy(&ctx->total64, init256, sizeof(init256));
1226 /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */ 1227 /*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
1227 ctx->process_block = sha256_process_block64; 1228 ctx->process_block = sha256_process_block64;
1229#if ENABLE_SHA256_HWACCEL
1230# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
1231 {
1232 if (!shaNI) {
1233 unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
1234 cpuid(&eax, &ebx, &ecx, &edx);
1235 shaNI = ((ebx >> 29) << 1) - 1;
1236 }
1237 if (shaNI > 0)
1238 ctx->process_block = sha256_process_block64_shaNI;
1239 }
1240# endif
1241#endif
1228} 1242}
1229 1243
1230#if NEED_SHA512 1244#if NEED_SHA512
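
The hash_md5_sha.c changes above hook in the SHA-NI code paths by testing CPUID leaf 7 (EBX bit 29, the SHA extensions flag) and dispatching to sha1/sha256_process_block64_shaNI when it is set. The same check can be written portably with GCC's <cpuid.h> helper; a small standalone sketch, not code from the patch:

    /* Sketch: report whether the CPU advertises the SHA extensions
     * (CPUID leaf 7, sub-leaf 0, EBX bit 29) - the bit tested above. */
    #include <cpuid.h>
    #include <stdio.h>

    static int cpu_has_sha_ni(void)
    {
        unsigned eax, ebx, ecx, edx;

        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
            return 0;            /* CPUID leaf 7 not available */
        return (ebx >> 29) & 1;  /* bit 29: SHA extensions */
    }

    int main(void)
    {
        printf("SHA-NI: %s\n", cpu_has_sha_ni() ? "yes" : "no");
        return 0;
    }

In the patch itself the result is cached in the static smallint shaNI, so the CPUID check runs only on the first sha1_begin()/sha256_begin() call.
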
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S
new file mode 100644
index 000000000..aa68193bd
--- /dev/null
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@@ -0,0 +1,277 @@
1#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
2/* The code is adapted from Linux kernel's source */
3
4// We use shorter insns, even though they are for "wrong"
5// data type (fp, not int).
6// For Intel, there is no penalty for doing it at all
7// (CPUs which do have such penalty do not support SHA1 insns).
8// For AMD, the penalty is one extra cycle
9// (allegedly: I failed to find measurable difference).
10
11//#define mova128 movdqa
12#define mova128 movaps
13//#define movu128 movdqu
14#define movu128 movups
15//#define shuf128_32 pshufd
16#define shuf128_32 shufps
17
18 .section .text.sha256_process_block64_shaNI, "ax", @progbits
19 .globl sha256_process_block64_shaNI
20 .hidden sha256_process_block64_shaNI
21 .type sha256_process_block64_shaNI, @function
22
23#define DATA_PTR %eax
24
25#define SHA256CONSTANTS %ecx
26
27#define MSG %xmm0
28#define STATE0 %xmm1
29#define STATE1 %xmm2
30#define MSGTMP0 %xmm3
31#define MSGTMP1 %xmm4
32#define MSGTMP2 %xmm5
33#define MSGTMP3 %xmm6
34
35#define XMMTMP %xmm7
36
37#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))
38
39 .balign 8 # allow decoders to fetch at least 2 first insns
40sha256_process_block64_shaNI:
41
42 movu128 76+0*16(%eax), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */
43 movu128 76+1*16(%eax), STATE1 /* HGFE */
44/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
45 mova128 STATE1, STATE0
46 shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */
47 shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */
48
49/* XMMTMP holds flip mask from here... */
50 mova128 PSHUFFLE_BSWAP32_FLIP_MASK, XMMTMP
51 movl $K256+8*16, SHA256CONSTANTS
52
53 /* Rounds 0-3 */
54 movu128 0*16(DATA_PTR), MSG
55 pshufb XMMTMP, MSG
56 mova128 MSG, MSGTMP0
57 paddd 0*16-8*16(SHA256CONSTANTS), MSG
58 sha256rnds2 STATE0, STATE1
59 shuf128_32 $0x0E, MSG, MSG
60 sha256rnds2 STATE1, STATE0
61
62 /* Rounds 4-7 */
63 movu128 1*16(DATA_PTR), MSG
64 pshufb XMMTMP, MSG
65 mova128 MSG, MSGTMP1
66 paddd 1*16-8*16(SHA256CONSTANTS), MSG
67 sha256rnds2 STATE0, STATE1
68 shuf128_32 $0x0E, MSG, MSG
69 sha256rnds2 STATE1, STATE0
70 sha256msg1 MSGTMP1, MSGTMP0
71
72 /* Rounds 8-11 */
73 movu128 2*16(DATA_PTR), MSG
74 pshufb XMMTMP, MSG
75 mova128 MSG, MSGTMP2
76 paddd 2*16-8*16(SHA256CONSTANTS), MSG
77 sha256rnds2 STATE0, STATE1
78 shuf128_32 $0x0E, MSG, MSG
79 sha256rnds2 STATE1, STATE0
80 sha256msg1 MSGTMP2, MSGTMP1
81
82 /* Rounds 12-15 */
83 movu128 3*16(DATA_PTR), MSG
84 pshufb XMMTMP, MSG
85/* ...to here */
86 mova128 MSG, MSGTMP3
87 paddd 3*16-8*16(SHA256CONSTANTS), MSG
88 sha256rnds2 STATE0, STATE1
89 mova128 MSGTMP3, XMMTMP
90 palignr $4, MSGTMP2, XMMTMP
91 paddd XMMTMP, MSGTMP0
92 sha256msg2 MSGTMP3, MSGTMP0
93 shuf128_32 $0x0E, MSG, MSG
94 sha256rnds2 STATE1, STATE0
95 sha256msg1 MSGTMP3, MSGTMP2
96
97 /* Rounds 16-19 */
98 mova128 MSGTMP0, MSG
99 paddd 4*16-8*16(SHA256CONSTANTS), MSG
100 sha256rnds2 STATE0, STATE1
101 mova128 MSGTMP0, XMMTMP
102 palignr $4, MSGTMP3, XMMTMP
103 paddd XMMTMP, MSGTMP1
104 sha256msg2 MSGTMP0, MSGTMP1
105 shuf128_32 $0x0E, MSG, MSG
106 sha256rnds2 STATE1, STATE0
107 sha256msg1 MSGTMP0, MSGTMP3
108
109 /* Rounds 20-23 */
110 mova128 MSGTMP1, MSG
111 paddd 5*16-8*16(SHA256CONSTANTS), MSG
112 sha256rnds2 STATE0, STATE1
113 mova128 MSGTMP1, XMMTMP
114 palignr $4, MSGTMP0, XMMTMP
115 paddd XMMTMP, MSGTMP2
116 sha256msg2 MSGTMP1, MSGTMP2
117 shuf128_32 $0x0E, MSG, MSG
118 sha256rnds2 STATE1, STATE0
119 sha256msg1 MSGTMP1, MSGTMP0
120
121 /* Rounds 24-27 */
122 mova128 MSGTMP2, MSG
123 paddd 6*16-8*16(SHA256CONSTANTS), MSG
124 sha256rnds2 STATE0, STATE1
125 mova128 MSGTMP2, XMMTMP
126 palignr $4, MSGTMP1, XMMTMP
127 paddd XMMTMP, MSGTMP3
128 sha256msg2 MSGTMP2, MSGTMP3
129 shuf128_32 $0x0E, MSG, MSG
130 sha256rnds2 STATE1, STATE0
131 sha256msg1 MSGTMP2, MSGTMP1
132
133 /* Rounds 28-31 */
134 mova128 MSGTMP3, MSG
135 paddd 7*16-8*16(SHA256CONSTANTS), MSG
136 sha256rnds2 STATE0, STATE1
137 mova128 MSGTMP3, XMMTMP
138 palignr $4, MSGTMP2, XMMTMP
139 paddd XMMTMP, MSGTMP0
140 sha256msg2 MSGTMP3, MSGTMP0
141 shuf128_32 $0x0E, MSG, MSG
142 sha256rnds2 STATE1, STATE0
143 sha256msg1 MSGTMP3, MSGTMP2
144
145 /* Rounds 32-35 */
146 mova128 MSGTMP0, MSG
147 paddd 8*16-8*16(SHA256CONSTANTS), MSG
148 sha256rnds2 STATE0, STATE1
149 mova128 MSGTMP0, XMMTMP
150 palignr $4, MSGTMP3, XMMTMP
151 paddd XMMTMP, MSGTMP1
152 sha256msg2 MSGTMP0, MSGTMP1
153 shuf128_32 $0x0E, MSG, MSG
154 sha256rnds2 STATE1, STATE0
155 sha256msg1 MSGTMP0, MSGTMP3
156
157 /* Rounds 36-39 */
158 mova128 MSGTMP1, MSG
159 paddd 9*16-8*16(SHA256CONSTANTS), MSG
160 sha256rnds2 STATE0, STATE1
161 mova128 MSGTMP1, XMMTMP
162 palignr $4, MSGTMP0, XMMTMP
163 paddd XMMTMP, MSGTMP2
164 sha256msg2 MSGTMP1, MSGTMP2
165 shuf128_32 $0x0E, MSG, MSG
166 sha256rnds2 STATE1, STATE0
167 sha256msg1 MSGTMP1, MSGTMP0
168
169 /* Rounds 40-43 */
170 mova128 MSGTMP2, MSG
171 paddd 10*16-8*16(SHA256CONSTANTS), MSG
172 sha256rnds2 STATE0, STATE1
173 mova128 MSGTMP2, XMMTMP
174 palignr $4, MSGTMP1, XMMTMP
175 paddd XMMTMP, MSGTMP3
176 sha256msg2 MSGTMP2, MSGTMP3
177 shuf128_32 $0x0E, MSG, MSG
178 sha256rnds2 STATE1, STATE0
179 sha256msg1 MSGTMP2, MSGTMP1
180
181 /* Rounds 44-47 */
182 mova128 MSGTMP3, MSG
183 paddd 11*16-8*16(SHA256CONSTANTS), MSG
184 sha256rnds2 STATE0, STATE1
185 mova128 MSGTMP3, XMMTMP
186 palignr $4, MSGTMP2, XMMTMP
187 paddd XMMTMP, MSGTMP0
188 sha256msg2 MSGTMP3, MSGTMP0
189 shuf128_32 $0x0E, MSG, MSG
190 sha256rnds2 STATE1, STATE0
191 sha256msg1 MSGTMP3, MSGTMP2
192
193 /* Rounds 48-51 */
194 mova128 MSGTMP0, MSG
195 paddd 12*16-8*16(SHA256CONSTANTS), MSG
196 sha256rnds2 STATE0, STATE1
197 mova128 MSGTMP0, XMMTMP
198 palignr $4, MSGTMP3, XMMTMP
199 paddd XMMTMP, MSGTMP1
200 sha256msg2 MSGTMP0, MSGTMP1
201 shuf128_32 $0x0E, MSG, MSG
202 sha256rnds2 STATE1, STATE0
203 sha256msg1 MSGTMP0, MSGTMP3
204
205 /* Rounds 52-55 */
206 mova128 MSGTMP1, MSG
207 paddd 13*16-8*16(SHA256CONSTANTS), MSG
208 sha256rnds2 STATE0, STATE1
209 mova128 MSGTMP1, XMMTMP
210 palignr $4, MSGTMP0, XMMTMP
211 paddd XMMTMP, MSGTMP2
212 sha256msg2 MSGTMP1, MSGTMP2
213 shuf128_32 $0x0E, MSG, MSG
214 sha256rnds2 STATE1, STATE0
215
216 /* Rounds 56-59 */
217 mova128 MSGTMP2, MSG
218 paddd 14*16-8*16(SHA256CONSTANTS), MSG
219 sha256rnds2 STATE0, STATE1
220 mova128 MSGTMP2, XMMTMP
221 palignr $4, MSGTMP1, XMMTMP
222 paddd XMMTMP, MSGTMP3
223 sha256msg2 MSGTMP2, MSGTMP3
224 shuf128_32 $0x0E, MSG, MSG
225 sha256rnds2 STATE1, STATE0
226
227 /* Rounds 60-63 */
228 mova128 MSGTMP3, MSG
229 paddd 15*16-8*16(SHA256CONSTANTS), MSG
230 sha256rnds2 STATE0, STATE1
231 shuf128_32 $0x0E, MSG, MSG
232 sha256rnds2 STATE1, STATE0
233
234 /* Write hash values back in the correct order */
235 /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */
236 /* STATE1: CDGH */
237 mova128 STATE0, XMMTMP
238/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
239 shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */
240 shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */
241 /* add current hash values to previous ones */
242 movu128 76+1*16(%eax), STATE1
243 paddd XMMTMP, STATE1
244 movu128 STATE1, 76+1*16(%eax)
245 movu128 76+0*16(%eax), XMMTMP
246 paddd XMMTMP, STATE0
247 movu128 STATE0, 76+0*16(%eax)
248
249 ret
250 .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
251
252 .section .rodata.cst256.K256, "aM", @progbits, 256
253 .balign 16
254K256:
255 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
256 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
257 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
258 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
259 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
260 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
261 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
262 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
263 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
264 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
265 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
266 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
267 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
268 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
269 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
270 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
271
272 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
273 .balign 16
274PSHUFFLE_BSWAP32_FLIP_MASK:
275 .octa 0x0c0d0e0f08090a0b0405060700010203
276
277#endif
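
The prologue and epilogue above shuffle the eight state words between their natural A..H order in sha256_ctx_t and the ABEF/CDGH pairing that sha256rnds2 operates on. The same data movement can be expressed with SSE intrinsics; the sketch below only demonstrates the shuffles (1..8 stand in for A..H), not the hashing itself:

    /* Sketch: pack hash[0..7] = A..H into the ABEF / CDGH layout.
     * _mm_shuffle_ps(x, y, imm) takes dwords 0,1 from x and dwords 2,3
     * from y - the shufps behaviour noted in the comments above.
     * SHUF(1,0,1,0) in the .S file == 0x11 == _MM_SHUFFLE(0,1,0,1). */
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t hash[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };  /* A=1 ... H=8 */

        __m128 dcba = _mm_loadu_ps((const float *)&hash[0]); /* A,B,C,D (A in dword 0) */
        __m128 hgfe = _mm_loadu_ps((const float *)&hash[4]); /* E,F,G,H (E in dword 0) */

        __m128 abef = _mm_shuffle_ps(hgfe, dcba, _MM_SHUFFLE(0, 1, 0, 1)); /* F,E | B,A */
        __m128 cdgh = _mm_shuffle_ps(hgfe, dcba, _MM_SHUFFLE(2, 3, 2, 3)); /* H,G | D,C */

        uint32_t out[4];
        _mm_storeu_ps((float *)out, abef);
        printf("ABEF dwords 0..3: %u %u %u %u\n", out[0], out[1], out[2], out[3]);
        _mm_storeu_ps((float *)out, cdgh);
        printf("CDGH dwords 0..3: %u %u %u %u\n", out[0], out[1], out[2], out[3]);
        return 0;
    }

It prints 6 5 2 1 and 8 7 4 3, i.e. A,B,E,F and C,D,G,H when the dwords are read from most significant to least significant, matching the register comments in the code above.
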
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S
new file mode 100644
index 000000000..4663f750a
--- /dev/null
+++ b/libbb/hash_md5_sha256_x86-64_shaNI.S
@@ -0,0 +1,284 @@
1#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
2/* The code is adapted from Linux kernel's source */
3
4// We use shorter insns, even though they are for "wrong"
5// data type (fp, not int).
6// For Intel, there is no penalty for doing it at all
7// (CPUs which do have such penalty do not support SHA1 insns).
8// For AMD, the penalty is one extra cycle
9// (allegedly: I failed to find measurable difference).
10
11//#define mova128 movdqa
12#define mova128 movaps
13//#define movu128 movdqu
14#define movu128 movups
15//#define shuf128_32 pshufd
16#define shuf128_32 shufps
17
18 .section .text.sha256_process_block64_shaNI, "ax", @progbits
19 .globl sha256_process_block64_shaNI
20 .hidden sha256_process_block64_shaNI
21 .type sha256_process_block64_shaNI, @function
22
23#define DATA_PTR %rdi
24
25#define SHA256CONSTANTS %rax
26
27#define MSG %xmm0
28#define STATE0 %xmm1
29#define STATE1 %xmm2
30#define MSGTMP0 %xmm3
31#define MSGTMP1 %xmm4
32#define MSGTMP2 %xmm5
33#define MSGTMP3 %xmm6
34
35#define XMMTMP %xmm7
36
37#define ABEF_SAVE %xmm9
38#define CDGH_SAVE %xmm10
39
40#define SHUF(a,b,c,d) $(a+(b<<2)+(c<<4)+(d<<6))
41
42 .balign 8 # allow decoders to fetch at least 2 first insns
43sha256_process_block64_shaNI:
44
45 movu128 80+0*16(%rdi), XMMTMP /* DCBA (msb-to-lsb: 3,2,1,0) */
46 movu128 80+1*16(%rdi), STATE1 /* HGFE */
47/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
48 mova128 STATE1, STATE0
49 shufps SHUF(1,0,1,0), XMMTMP, STATE0 /* ABEF */
50 shufps SHUF(3,2,3,2), XMMTMP, STATE1 /* CDGH */
51
52/* XMMTMP holds flip mask from here... */
53 mova128 PSHUFFLE_BSWAP32_FLIP_MASK(%rip), XMMTMP
54 leaq K256+8*16(%rip), SHA256CONSTANTS
55
56 /* Save hash values for addition after rounds */
57 mova128 STATE0, ABEF_SAVE
58 mova128 STATE1, CDGH_SAVE
59
60 /* Rounds 0-3 */
61 movu128 0*16(DATA_PTR), MSG
62 pshufb XMMTMP, MSG
63 mova128 MSG, MSGTMP0
64 paddd 0*16-8*16(SHA256CONSTANTS), MSG
65 sha256rnds2 STATE0, STATE1
66 shuf128_32 $0x0E, MSG, MSG
67 sha256rnds2 STATE1, STATE0
68
69 /* Rounds 4-7 */
70 movu128 1*16(DATA_PTR), MSG
71 pshufb XMMTMP, MSG
72 mova128 MSG, MSGTMP1
73 paddd 1*16-8*16(SHA256CONSTANTS), MSG
74 sha256rnds2 STATE0, STATE1
75 shuf128_32 $0x0E, MSG, MSG
76 sha256rnds2 STATE1, STATE0
77 sha256msg1 MSGTMP1, MSGTMP0
78
79 /* Rounds 8-11 */
80 movu128 2*16(DATA_PTR), MSG
81 pshufb XMMTMP, MSG
82 mova128 MSG, MSGTMP2
83 paddd 2*16-8*16(SHA256CONSTANTS), MSG
84 sha256rnds2 STATE0, STATE1
85 shuf128_32 $0x0E, MSG, MSG
86 sha256rnds2 STATE1, STATE0
87 sha256msg1 MSGTMP2, MSGTMP1
88
89 /* Rounds 12-15 */
90 movu128 3*16(DATA_PTR), MSG
91 pshufb XMMTMP, MSG
92/* ...to here */
93 mova128 MSG, MSGTMP3
94 paddd 3*16-8*16(SHA256CONSTANTS), MSG
95 sha256rnds2 STATE0, STATE1
96 mova128 MSGTMP3, XMMTMP
97 palignr $4, MSGTMP2, XMMTMP
98 paddd XMMTMP, MSGTMP0
99 sha256msg2 MSGTMP3, MSGTMP0
100 shuf128_32 $0x0E, MSG, MSG
101 sha256rnds2 STATE1, STATE0
102 sha256msg1 MSGTMP3, MSGTMP2
103
104 /* Rounds 16-19 */
105 mova128 MSGTMP0, MSG
106 paddd 4*16-8*16(SHA256CONSTANTS), MSG
107 sha256rnds2 STATE0, STATE1
108 mova128 MSGTMP0, XMMTMP
109 palignr $4, MSGTMP3, XMMTMP
110 paddd XMMTMP, MSGTMP1
111 sha256msg2 MSGTMP0, MSGTMP1
112 shuf128_32 $0x0E, MSG, MSG
113 sha256rnds2 STATE1, STATE0
114 sha256msg1 MSGTMP0, MSGTMP3
115
116 /* Rounds 20-23 */
117 mova128 MSGTMP1, MSG
118 paddd 5*16-8*16(SHA256CONSTANTS), MSG
119 sha256rnds2 STATE0, STATE1
120 mova128 MSGTMP1, XMMTMP
121 palignr $4, MSGTMP0, XMMTMP
122 paddd XMMTMP, MSGTMP2
123 sha256msg2 MSGTMP1, MSGTMP2
124 shuf128_32 $0x0E, MSG, MSG
125 sha256rnds2 STATE1, STATE0
126 sha256msg1 MSGTMP1, MSGTMP0
127
128 /* Rounds 24-27 */
129 mova128 MSGTMP2, MSG
130 paddd 6*16-8*16(SHA256CONSTANTS), MSG
131 sha256rnds2 STATE0, STATE1
132 mova128 MSGTMP2, XMMTMP
133 palignr $4, MSGTMP1, XMMTMP
134 paddd XMMTMP, MSGTMP3
135 sha256msg2 MSGTMP2, MSGTMP3
136 shuf128_32 $0x0E, MSG, MSG
137 sha256rnds2 STATE1, STATE0
138 sha256msg1 MSGTMP2, MSGTMP1
139
140 /* Rounds 28-31 */
141 mova128 MSGTMP3, MSG
142 paddd 7*16-8*16(SHA256CONSTANTS), MSG
143 sha256rnds2 STATE0, STATE1
144 mova128 MSGTMP3, XMMTMP
145 palignr $4, MSGTMP2, XMMTMP
146 paddd XMMTMP, MSGTMP0
147 sha256msg2 MSGTMP3, MSGTMP0
148 shuf128_32 $0x0E, MSG, MSG
149 sha256rnds2 STATE1, STATE0
150 sha256msg1 MSGTMP3, MSGTMP2
151
152 /* Rounds 32-35 */
153 mova128 MSGTMP0, MSG
154 paddd 8*16-8*16(SHA256CONSTANTS), MSG
155 sha256rnds2 STATE0, STATE1
156 mova128 MSGTMP0, XMMTMP
157 palignr $4, MSGTMP3, XMMTMP
158 paddd XMMTMP, MSGTMP1
159 sha256msg2 MSGTMP0, MSGTMP1
160 shuf128_32 $0x0E, MSG, MSG
161 sha256rnds2 STATE1, STATE0
162 sha256msg1 MSGTMP0, MSGTMP3
163
164 /* Rounds 36-39 */
165 mova128 MSGTMP1, MSG
166 paddd 9*16-8*16(SHA256CONSTANTS), MSG
167 sha256rnds2 STATE0, STATE1
168 mova128 MSGTMP1, XMMTMP
169 palignr $4, MSGTMP0, XMMTMP
170 paddd XMMTMP, MSGTMP2
171 sha256msg2 MSGTMP1, MSGTMP2
172 shuf128_32 $0x0E, MSG, MSG
173 sha256rnds2 STATE1, STATE0
174 sha256msg1 MSGTMP1, MSGTMP0
175
176 /* Rounds 40-43 */
177 mova128 MSGTMP2, MSG
178 paddd 10*16-8*16(SHA256CONSTANTS), MSG
179 sha256rnds2 STATE0, STATE1
180 mova128 MSGTMP2, XMMTMP
181 palignr $4, MSGTMP1, XMMTMP
182 paddd XMMTMP, MSGTMP3
183 sha256msg2 MSGTMP2, MSGTMP3
184 shuf128_32 $0x0E, MSG, MSG
185 sha256rnds2 STATE1, STATE0
186 sha256msg1 MSGTMP2, MSGTMP1
187
188 /* Rounds 44-47 */
189 mova128 MSGTMP3, MSG
190 paddd 11*16-8*16(SHA256CONSTANTS), MSG
191 sha256rnds2 STATE0, STATE1
192 mova128 MSGTMP3, XMMTMP
193 palignr $4, MSGTMP2, XMMTMP
194 paddd XMMTMP, MSGTMP0
195 sha256msg2 MSGTMP3, MSGTMP0
196 shuf128_32 $0x0E, MSG, MSG
197 sha256rnds2 STATE1, STATE0
198 sha256msg1 MSGTMP3, MSGTMP2
199
200 /* Rounds 48-51 */
201 mova128 MSGTMP0, MSG
202 paddd 12*16-8*16(SHA256CONSTANTS), MSG
203 sha256rnds2 STATE0, STATE1
204 mova128 MSGTMP0, XMMTMP
205 palignr $4, MSGTMP3, XMMTMP
206 paddd XMMTMP, MSGTMP1
207 sha256msg2 MSGTMP0, MSGTMP1
208 shuf128_32 $0x0E, MSG, MSG
209 sha256rnds2 STATE1, STATE0
210 sha256msg1 MSGTMP0, MSGTMP3
211
212 /* Rounds 52-55 */
213 mova128 MSGTMP1, MSG
214 paddd 13*16-8*16(SHA256CONSTANTS), MSG
215 sha256rnds2 STATE0, STATE1
216 mova128 MSGTMP1, XMMTMP
217 palignr $4, MSGTMP0, XMMTMP
218 paddd XMMTMP, MSGTMP2
219 sha256msg2 MSGTMP1, MSGTMP2
220 shuf128_32 $0x0E, MSG, MSG
221 sha256rnds2 STATE1, STATE0
222
223 /* Rounds 56-59 */
224 mova128 MSGTMP2, MSG
225 paddd 14*16-8*16(SHA256CONSTANTS), MSG
226 sha256rnds2 STATE0, STATE1
227 mova128 MSGTMP2, XMMTMP
228 palignr $4, MSGTMP1, XMMTMP
229 paddd XMMTMP, MSGTMP3
230 sha256msg2 MSGTMP2, MSGTMP3
231 shuf128_32 $0x0E, MSG, MSG
232 sha256rnds2 STATE1, STATE0
233
234 /* Rounds 60-63 */
235 mova128 MSGTMP3, MSG
236 paddd 15*16-8*16(SHA256CONSTANTS), MSG
237 sha256rnds2 STATE0, STATE1
238 shuf128_32 $0x0E, MSG, MSG
239 sha256rnds2 STATE1, STATE0
240
241 /* Add current hash values with previously saved */
242 paddd ABEF_SAVE, STATE0
243 paddd CDGH_SAVE, STATE1
244
245 /* Write hash values back in the correct order */
246 /* STATE0: ABEF (msb-to-lsb: 3,2,1,0) */
247 /* STATE1: CDGH */
248 mova128 STATE0, XMMTMP
249/* shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one */
250 shufps SHUF(3,2,3,2), STATE1, STATE0 /* DCBA */
251 shufps SHUF(1,0,1,0), STATE1, XMMTMP /* HGFE */
252
253 movu128 STATE0, 80+0*16(%rdi)
254 movu128 XMMTMP, 80+1*16(%rdi)
255
256 ret
257 .size sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
258
259 .section .rodata.cst256.K256, "aM", @progbits, 256
260 .balign 16
261K256:
262 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
263 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
264 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
265 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
266 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
267 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
268 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
269 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
270 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
271 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
272 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
273 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
274 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
275 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
276 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
277 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
278
279 .section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
280 .balign 16
281PSHUFFLE_BSWAP32_FLIP_MASK:
282 .octa 0x0c0d0e0f08090a0b0405060700010203
283
284#endif
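
Both new files feed the message block through pshufb with PSHUFFLE_BSWAP32_FLIP_MASK, which byte-swaps each 32-bit word (SHA-256 is specified on big-endian words). A standalone sketch of that trick with the same shuffle-control bytes; this is an illustration, not code from the patch:

    /* Sketch: byte-swap four 32-bit words at once with pshufb (SSSE3).
     * The control bytes spell out the same 128-bit constant as
     * PSHUFFLE_BSWAP32_FLIP_MASK above, lowest byte first.
     * Compile with e.g.: cc -mssse3 bswap32x4.c  (file name hypothetical) */
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t words[4] = { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f };
        static const uint8_t flip[16] = {
            3, 2, 1, 0,  7, 6, 5, 4,  11, 10, 9, 8,  15, 14, 13, 12
        };

        __m128i v = _mm_loadu_si128((const __m128i *)words);
        __m128i m = _mm_loadu_si128((const __m128i *)flip);
        v = _mm_shuffle_epi8(v, m);   /* reverse the bytes within each dword */
        _mm_storeu_si128((__m128i *)words, v);

        printf("%08x %08x %08x %08x\n", words[0], words[1], words[2], words[3]);
        return 0;
    }

Output: 03020100 07060504 0b0a0908 0f0e0d0c, i.e. each input dword with its bytes reversed.
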
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S
index 166cfd38a..a61b3cbed 100644
--- a/libbb/hash_md5_sha_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha_x86-32_shaNI.S
@@ -20,7 +20,7 @@
20#define extr128_32 pextrd 20#define extr128_32 pextrd
21//#define extr128_32 extractps # not shorter 21//#define extr128_32 extractps # not shorter
22 22
23 .section .text.sha1_process_block64_shaNI,"ax",@progbits 23 .section .text.sha1_process_block64_shaNI, "ax", @progbits
24 .globl sha1_process_block64_shaNI 24 .globl sha1_process_block64_shaNI
25 .hidden sha1_process_block64_shaNI 25 .hidden sha1_process_block64_shaNI
26 .type sha1_process_block64_shaNI, @function 26 .type sha1_process_block64_shaNI, @function
@@ -32,45 +32,42 @@
32#define MSG1 %xmm4 32#define MSG1 %xmm4
33#define MSG2 %xmm5 33#define MSG2 %xmm5
34#define MSG3 %xmm6 34#define MSG3 %xmm6
35#define SHUF_MASK %xmm7
36 35
37 .balign 8 # allow decoders to fetch at least 3 first insns 36 .balign 8 # allow decoders to fetch at least 2 first insns
38sha1_process_block64_shaNI: 37sha1_process_block64_shaNI:
39 pushl %ebp
40 movl %esp, %ebp
41 subl $32, %esp
42 andl $~0xF, %esp # paddd needs aligned memory operand
43
44 /* load initial hash values */ 38 /* load initial hash values */
45 xor128 E0, E0
46 movu128 76(%eax), ABCD 39 movu128 76(%eax), ABCD
40 xor128 E0, E0
47 pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word 41 pinsrd $3, 76+4*4(%eax), E0 # load to uppermost 32-bit word
48 shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD 42 shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD
49 43
50 mova128 PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK 44 mova128 PSHUFFLE_BYTE_FLIP_MASK, %xmm7
45
46 movu128 0*16(%eax), MSG0
47 pshufb %xmm7, MSG0
48 movu128 1*16(%eax), MSG1
49 pshufb %xmm7, MSG1
50 movu128 2*16(%eax), MSG2
51 pshufb %xmm7, MSG2
52 movu128 3*16(%eax), MSG3
53 pshufb %xmm7, MSG3
51 54
52 /* Save hash values for addition after rounds */ 55 /* Save hash values for addition after rounds */
53 movu128 E0, 16(%esp) 56 mova128 E0, %xmm7
54 movu128 ABCD, (%esp) 57 /*mova128 ABCD, %xmm8 - NOPE, 32bit has no xmm8 */
55 58
56 /* Rounds 0-3 */ 59 /* Rounds 0-3 */
57 movu128 0*16(%eax), MSG0
58 pshufb SHUF_MASK, MSG0
59 paddd MSG0, E0 60 paddd MSG0, E0
60 mova128 ABCD, E1 61 mova128 ABCD, E1
61 sha1rnds4 $0, E0, ABCD 62 sha1rnds4 $0, E0, ABCD
62 63
63 /* Rounds 4-7 */ 64 /* Rounds 4-7 */
64 movu128 1*16(%eax), MSG1
65 pshufb SHUF_MASK, MSG1
66 sha1nexte MSG1, E1 65 sha1nexte MSG1, E1
67 mova128 ABCD, E0 66 mova128 ABCD, E0
68 sha1rnds4 $0, E1, ABCD 67 sha1rnds4 $0, E1, ABCD
69 sha1msg1 MSG1, MSG0 68 sha1msg1 MSG1, MSG0
70 69
71 /* Rounds 8-11 */ 70 /* Rounds 8-11 */
72 movu128 2*16(%eax), MSG2
73 pshufb SHUF_MASK, MSG2
74 sha1nexte MSG2, E0 71 sha1nexte MSG2, E0
75 mova128 ABCD, E1 72 mova128 ABCD, E1
76 sha1rnds4 $0, E0, ABCD 73 sha1rnds4 $0, E0, ABCD
@@ -78,8 +75,6 @@ sha1_process_block64_shaNI:
78 xor128 MSG2, MSG0 75 xor128 MSG2, MSG0
79 76
80 /* Rounds 12-15 */ 77 /* Rounds 12-15 */
81 movu128 3*16(%eax), MSG3
82 pshufb SHUF_MASK, MSG3
83 sha1nexte MSG3, E1 78 sha1nexte MSG3, E1
84 mova128 ABCD, E0 79 mova128 ABCD, E0
85 sha1msg2 MSG3, MSG0 80 sha1msg2 MSG3, MSG0
@@ -210,21 +205,21 @@ sha1_process_block64_shaNI:
210 sha1rnds4 $3, E1, ABCD 205 sha1rnds4 $3, E1, ABCD
211 206
212 /* Add current hash values with previously saved */ 207 /* Add current hash values with previously saved */
213 sha1nexte 16(%esp), E0 208 sha1nexte %xmm7, E0
214 paddd (%esp), ABCD 209 /*paddd %xmm8, ABCD - 32-bit mode has no xmm8 */
210 movu128 76(%eax), %xmm7 # get original ABCD (not shuffled)...
215 211
216 /* Write hash values back in the correct order */ 212 /* Write hash values back in the correct order */
217 shuf128_32 $0x1B, ABCD, ABCD 213 shuf128_32 $0x1B, ABCD, ABCD
214 paddd %xmm7, ABCD # ...add it to final ABCD
218 movu128 ABCD, 76(%eax) 215 movu128 ABCD, 76(%eax)
219 extr128_32 $3, E0, 76+4*4(%eax) 216 extr128_32 $3, E0, 76+4*4(%eax)
220 217
221 movl %ebp, %esp
222 popl %ebp
223 ret 218 ret
224 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI 219 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
225 220
226.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 221 .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
227.align 16 222 .balign 16
228PSHUFFLE_BYTE_FLIP_MASK: 223PSHUFFLE_BYTE_FLIP_MASK:
229 .octa 0x000102030405060708090a0b0c0d0e0f 224 .octa 0x000102030405060708090a0b0c0d0e0f
230 225
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 87fb616a1..287cfe547 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -1,7 +1,7 @@
1### Generated by hash_md5_sha_x86-64.S.sh ### 1### Generated by hash_md5_sha_x86-64.S.sh ###
2 2
3#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) 3#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
4 .section .text.sha1_process_block64,"ax",@progbits 4 .section .text.sha1_process_block64, "ax", @progbits
5 .globl sha1_process_block64 5 .globl sha1_process_block64
6 .hidden sha1_process_block64 6 .hidden sha1_process_block64
7 .type sha1_process_block64, @function 7 .type sha1_process_block64, @function
@@ -10,7 +10,7 @@
10sha1_process_block64: 10sha1_process_block64:
11 pushq %rbp # 1 byte insn 11 pushq %rbp # 1 byte insn
12 pushq %rbx # 1 byte insn 12 pushq %rbx # 1 byte insn
13 pushq %r15 # 2 byte insn 13# pushq %r15 # 2 byte insn
14 pushq %r14 # 2 byte insn 14 pushq %r14 # 2 byte insn
15 pushq %r13 # 2 byte insn 15 pushq %r13 # 2 byte insn
16 pushq %r12 # 2 byte insn 16 pushq %r12 # 2 byte insn
@@ -19,17 +19,13 @@ sha1_process_block64:
19#Register and stack use: 19#Register and stack use:
20# eax..edx: a..d 20# eax..edx: a..d
21# ebp: e 21# ebp: e
22# esi,edi: temps 22# esi,edi,r8..r14: temps
23# -32+4*n(%rsp),r8...r15: W[0..7,8..15] 23# r15: unused
24# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) 24# xmm0..xmm3: W[]
25 movl $3, %eax 25# xmm4,xmm5: temps
261: 26# xmm6: current round constant
27 movq (%rdi,%rax,8), %rsi 27# xmm7: all round constants
28 bswapq %rsi 28# -64(%rsp): area for passing RCONST + W[] from vector to integer units
29 rolq $32, %rsi
30 movq %rsi, -32(%rsp,%rax,8)
31 decl %eax
32 jns 1b
33 29
34 movl 80(%rdi), %eax # a = ctx->hash[0] 30 movl 80(%rdi), %eax # a = ctx->hash[0]
35 movl 84(%rdi), %ebx # b = ctx->hash[1] 31 movl 84(%rdi), %ebx # b = ctx->hash[1]
@@ -37,587 +33,760 @@ sha1_process_block64:
37 movl 92(%rdi), %edx # d = ctx->hash[3] 33 movl 92(%rdi), %edx # d = ctx->hash[3]
38 movl 96(%rdi), %ebp # e = ctx->hash[4] 34 movl 96(%rdi), %ebp # e = ctx->hash[4]
39 35
40 movq 4*8(%rdi), %r8 36 movaps sha1const(%rip), %xmm7
41 movq 4*10(%rdi), %r10 37 pshufd $0x00, %xmm7, %xmm6
38
39 # Load W[] to xmm registers, byteswapping on the fly.
40 #
41 # For iterations 0..15, we pass W[] in rsi,r8..r14
42 # for use in RD1As instead of spilling them to stack.
43 # We lose parallelized addition of RCONST, but LEA
44 # can do two additions at once, so it is probably a wash.
45 # (We use rsi instead of rN because this makes two
46 # LEAs in two first RD1As shorter by one byte).
47 movq 4*0(%rdi), %rsi
48 movq 4*2(%rdi), %r8
49 bswapq %rsi
42 bswapq %r8 50 bswapq %r8
51 rolq $32, %rsi # rsi = W[1]:W[0]
52 rolq $32, %r8 # r8 = W[3]:W[2]
53 movq %rsi, %xmm0
54 movq %r8, %xmm4
55 punpcklqdq %xmm4, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
56# movaps %xmm0, %xmm4 # add RCONST, spill to stack
57# paddd %xmm6, %xmm4
58# movups %xmm4, -64+16*0(%rsp)
59
60 movq 4*4(%rdi), %r9
61 movq 4*6(%rdi), %r10
62 bswapq %r9
43 bswapq %r10 63 bswapq %r10
44 movq 4*12(%rdi), %r12 64 rolq $32, %r9 # r9 = W[5]:W[4]
45 movq 4*14(%rdi), %r14 65 rolq $32, %r10 # r10 = W[7]:W[6]
66 movq %r9, %xmm1
67 movq %r10, %xmm4
68 punpcklqdq %xmm4, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
69
70 movq 4*8(%rdi), %r11
71 movq 4*10(%rdi), %r12
72 bswapq %r11
46 bswapq %r12 73 bswapq %r12
74 rolq $32, %r11 # r11 = W[9]:W[8]
75 rolq $32, %r12 # r12 = W[11]:W[10]
76 movq %r11, %xmm2
77 movq %r12, %xmm4
78 punpcklqdq %xmm4, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
79
80 movq 4*12(%rdi), %r13
81 movq 4*14(%rdi), %r14
82 bswapq %r13
47 bswapq %r14 83 bswapq %r14
48 movl %r8d, %r9d 84 rolq $32, %r13 # r13 = W[13]:W[12]
49 shrq $32, %r8 85 rolq $32, %r14 # r14 = W[15]:W[14]
50 movl %r10d, %r11d 86 movq %r13, %xmm3
51 shrq $32, %r10 87 movq %r14, %xmm4
52 movl %r12d, %r13d 88 punpcklqdq %xmm4, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
53 shrq $32, %r12
54 movl %r14d, %r15d
55 shrq $32, %r14
56 89
57# 0 90# 0
58 # W[0], already in %esi 91 leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
92 shrq $32, %rsi
59 movl %ecx, %edi # c 93 movl %ecx, %edi # c
60 xorl %edx, %edi # ^d 94 xorl %edx, %edi # ^d
61 andl %ebx, %edi # &b 95 andl %ebx, %edi # &b
62 xorl %edx, %edi # (((c ^ d) & b) ^ d) 96 xorl %edx, %edi # (((c ^ d) & b) ^ d)
63 leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
64 addl %edi, %ebp # e += (((c ^ d) & b) ^ d) 97 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
65 movl %eax, %esi # 98 movl %eax, %edi #
66 roll $5, %esi # rotl32(a,5) 99 roll $5, %edi # rotl32(a,5)
67 addl %esi, %ebp # e += rotl32(a,5) 100 addl %edi, %ebp # e += rotl32(a,5)
68 rorl $2, %ebx # b = rotl32(b,30) 101 rorl $2, %ebx # b = rotl32(b,30)
69# 1 102# 1
70 movl -32+4*1(%rsp), %esi # W[n] 103 leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
71 movl %ebx, %edi # c 104 movl %ebx, %edi # c
72 xorl %ecx, %edi # ^d 105 xorl %ecx, %edi # ^d
73 andl %eax, %edi # &b 106 andl %eax, %edi # &b
74 xorl %ecx, %edi # (((c ^ d) & b) ^ d) 107 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
75 leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
76 addl %edi, %edx # e += (((c ^ d) & b) ^ d) 108 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
77 movl %ebp, %esi # 109 movl %ebp, %edi #
78 roll $5, %esi # rotl32(a,5) 110 roll $5, %edi # rotl32(a,5)
79 addl %esi, %edx # e += rotl32(a,5) 111 addl %edi, %edx # e += rotl32(a,5)
80 rorl $2, %eax # b = rotl32(b,30) 112 rorl $2, %eax # b = rotl32(b,30)
81# 2 113# 2
82 movl -32+4*2(%rsp), %esi # W[n] 114 leal 0x5A827999(%rcx,%r8), %ecx # e += RCONST + W[n]
115 shrq $32, %r8
83 movl %eax, %edi # c 116 movl %eax, %edi # c
84 xorl %ebx, %edi # ^d 117 xorl %ebx, %edi # ^d
85 andl %ebp, %edi # &b 118 andl %ebp, %edi # &b
86 xorl %ebx, %edi # (((c ^ d) & b) ^ d) 119 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
87 leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n]
88 addl %edi, %ecx # e += (((c ^ d) & b) ^ d) 120 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
89 movl %edx, %esi # 121 movl %edx, %edi #
90 roll $5, %esi # rotl32(a,5) 122 roll $5, %edi # rotl32(a,5)
91 addl %esi, %ecx # e += rotl32(a,5) 123 addl %edi, %ecx # e += rotl32(a,5)
92 rorl $2, %ebp # b = rotl32(b,30) 124 rorl $2, %ebp # b = rotl32(b,30)
93# 3 125# 3
94 movl -32+4*3(%rsp), %esi # W[n] 126 leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
95 movl %ebp, %edi # c 127 movl %ebp, %edi # c
96 xorl %eax, %edi # ^d 128 xorl %eax, %edi # ^d
97 andl %edx, %edi # &b 129 andl %edx, %edi # &b
98 xorl %eax, %edi # (((c ^ d) & b) ^ d) 130 xorl %eax, %edi # (((c ^ d) & b) ^ d)
99 leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n]
100 addl %edi, %ebx # e += (((c ^ d) & b) ^ d) 131 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
101 movl %ecx, %esi # 132 movl %ecx, %edi #
102 roll $5, %esi # rotl32(a,5) 133 roll $5, %edi # rotl32(a,5)
103 addl %esi, %ebx # e += rotl32(a,5) 134 addl %edi, %ebx # e += rotl32(a,5)
104 rorl $2, %edx # b = rotl32(b,30) 135 rorl $2, %edx # b = rotl32(b,30)
105# 4 136# 4
106 movl -32+4*4(%rsp), %esi # W[n] 137 leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
138 shrq $32, %r9
107 movl %edx, %edi # c 139 movl %edx, %edi # c
108 xorl %ebp, %edi # ^d 140 xorl %ebp, %edi # ^d
109 andl %ecx, %edi # &b 141 andl %ecx, %edi # &b
110 xorl %ebp, %edi # (((c ^ d) & b) ^ d) 142 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
111 leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n]
112 addl %edi, %eax # e += (((c ^ d) & b) ^ d) 143 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
113 movl %ebx, %esi # 144 movl %ebx, %edi #
114 roll $5, %esi # rotl32(a,5) 145 roll $5, %edi # rotl32(a,5)
115 addl %esi, %eax # e += rotl32(a,5) 146 addl %edi, %eax # e += rotl32(a,5)
116 rorl $2, %ecx # b = rotl32(b,30) 147 rorl $2, %ecx # b = rotl32(b,30)
117# 5 148# 5
118 movl -32+4*5(%rsp), %esi # W[n] 149 leal 0x5A827999(%rbp,%r9), %ebp # e += RCONST + W[n]
119 movl %ecx, %edi # c 150 movl %ecx, %edi # c
120 xorl %edx, %edi # ^d 151 xorl %edx, %edi # ^d
121 andl %ebx, %edi # &b 152 andl %ebx, %edi # &b
122 xorl %edx, %edi # (((c ^ d) & b) ^ d) 153 xorl %edx, %edi # (((c ^ d) & b) ^ d)
123 leal 0x5A827999(%rbp,%rsi), %ebp # e += RCONST + W[n]
124 addl %edi, %ebp # e += (((c ^ d) & b) ^ d) 154 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
125 movl %eax, %esi # 155 movl %eax, %edi #
126 roll $5, %esi # rotl32(a,5) 156 roll $5, %edi # rotl32(a,5)
127 addl %esi, %ebp # e += rotl32(a,5) 157 addl %edi, %ebp # e += rotl32(a,5)
128 rorl $2, %ebx # b = rotl32(b,30) 158 rorl $2, %ebx # b = rotl32(b,30)
129# 6 159# 6
130 movl -32+4*6(%rsp), %esi # W[n] 160 leal 0x5A827999(%rdx,%r10), %edx # e += RCONST + W[n]
161 shrq $32, %r10
131 movl %ebx, %edi # c 162 movl %ebx, %edi # c
132 xorl %ecx, %edi # ^d 163 xorl %ecx, %edi # ^d
133 andl %eax, %edi # &b 164 andl %eax, %edi # &b
134 xorl %ecx, %edi # (((c ^ d) & b) ^ d) 165 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
135 leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n]
136 addl %edi, %edx # e += (((c ^ d) & b) ^ d) 166 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
137 movl %ebp, %esi # 167 movl %ebp, %edi #
138 roll $5, %esi # rotl32(a,5) 168 roll $5, %edi # rotl32(a,5)
139 addl %esi, %edx # e += rotl32(a,5) 169 addl %edi, %edx # e += rotl32(a,5)
140 rorl $2, %eax # b = rotl32(b,30) 170 rorl $2, %eax # b = rotl32(b,30)
141# 7 171# 7
142 movl -32+4*7(%rsp), %esi # W[n] 172 leal 0x5A827999(%rcx,%r10), %ecx # e += RCONST + W[n]
143 movl %eax, %edi # c 173 movl %eax, %edi # c
144 xorl %ebx, %edi # ^d 174 xorl %ebx, %edi # ^d
145 andl %ebp, %edi # &b 175 andl %ebp, %edi # &b
146 xorl %ebx, %edi # (((c ^ d) & b) ^ d) 176 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
147 leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n]
148 addl %edi, %ecx # e += (((c ^ d) & b) ^ d) 177 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
149 movl %edx, %esi # 178 movl %edx, %edi #
150 roll $5, %esi # rotl32(a,5) 179 roll $5, %edi # rotl32(a,5)
151 addl %esi, %ecx # e += rotl32(a,5) 180 addl %edi, %ecx # e += rotl32(a,5)
152 rorl $2, %ebp # b = rotl32(b,30) 181 rorl $2, %ebp # b = rotl32(b,30)
182# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
183 movaps %xmm3, %xmm4
184 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
185# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
186# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
187# same result as above, but shorter and faster:
188# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
189# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
190 movaps %xmm0, %xmm5
191 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
192 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
193 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
194 xorps %xmm5, %xmm0 # ^
195 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
196 movaps %xmm0, %xmm5
197 xorps %xmm4, %xmm4 # rol(W0,1):
198 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
199 paddd %xmm0, %xmm0 # shift left by 1
200 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
201 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
202 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
203 movaps %xmm5, %xmm4
204 pslld $2, %xmm5
205 psrld $30, %xmm4
206# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
207 xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
208 xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
209 movaps %xmm0, %xmm5
210 paddd %xmm6, %xmm5
211 movups %xmm5, -64+16*0(%rsp)
153# 8 212# 8
154 # W[n], in %r8 213 leal 0x5A827999(%rbx,%r11), %ebx # e += RCONST + W[n]
214 shrq $32, %r11
155 movl %ebp, %edi # c 215 movl %ebp, %edi # c
156 xorl %eax, %edi # ^d 216 xorl %eax, %edi # ^d
157 andl %edx, %edi # &b 217 andl %edx, %edi # &b
158 xorl %eax, %edi # (((c ^ d) & b) ^ d) 218 xorl %eax, %edi # (((c ^ d) & b) ^ d)
159 leal 0x5A827999(%rbx,%r8), %ebx # e += RCONST + W[n]
160 addl %edi, %ebx # e += (((c ^ d) & b) ^ d) 219 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
161 movl %ecx, %esi # 220 movl %ecx, %edi #
162 roll $5, %esi # rotl32(a,5) 221 roll $5, %edi # rotl32(a,5)
163 addl %esi, %ebx # e += rotl32(a,5) 222 addl %edi, %ebx # e += rotl32(a,5)
164 rorl $2, %edx # b = rotl32(b,30) 223 rorl $2, %edx # b = rotl32(b,30)
165# 9 224# 9
166 # W[n], in %r9 225 leal 0x5A827999(%rax,%r11), %eax # e += RCONST + W[n]
167 movl %edx, %edi # c 226 movl %edx, %edi # c
168 xorl %ebp, %edi # ^d 227 xorl %ebp, %edi # ^d
169 andl %ecx, %edi # &b 228 andl %ecx, %edi # &b
170 xorl %ebp, %edi # (((c ^ d) & b) ^ d) 229 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
171 leal 0x5A827999(%rax,%r9), %eax # e += RCONST + W[n]
172 addl %edi, %eax # e += (((c ^ d) & b) ^ d) 230 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
173 movl %ebx, %esi # 231 movl %ebx, %edi #
174 roll $5, %esi # rotl32(a,5) 232 roll $5, %edi # rotl32(a,5)
175 addl %esi, %eax # e += rotl32(a,5) 233 addl %edi, %eax # e += rotl32(a,5)
176 rorl $2, %ecx # b = rotl32(b,30) 234 rorl $2, %ecx # b = rotl32(b,30)
177# 10 235# 10
178 # W[n], in %r10 236 leal 0x5A827999(%rbp,%r12), %ebp # e += RCONST + W[n]
237 shrq $32, %r12
179 movl %ecx, %edi # c 238 movl %ecx, %edi # c
180 xorl %edx, %edi # ^d 239 xorl %edx, %edi # ^d
181 andl %ebx, %edi # &b 240 andl %ebx, %edi # &b
182 xorl %edx, %edi # (((c ^ d) & b) ^ d) 241 xorl %edx, %edi # (((c ^ d) & b) ^ d)
183 leal 0x5A827999(%rbp,%r10), %ebp # e += RCONST + W[n]
184 addl %edi, %ebp # e += (((c ^ d) & b) ^ d) 242 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
185 movl %eax, %esi # 243 movl %eax, %edi #
186 roll $5, %esi # rotl32(a,5) 244 roll $5, %edi # rotl32(a,5)
187 addl %esi, %ebp # e += rotl32(a,5) 245 addl %edi, %ebp # e += rotl32(a,5)
188 rorl $2, %ebx # b = rotl32(b,30) 246 rorl $2, %ebx # b = rotl32(b,30)
189# 11 247# 11
190 # W[n], in %r11 248 leal 0x5A827999(%rdx,%r12), %edx # e += RCONST + W[n]
191 movl %ebx, %edi # c 249 movl %ebx, %edi # c
192 xorl %ecx, %edi # ^d 250 xorl %ecx, %edi # ^d
193 andl %eax, %edi # &b 251 andl %eax, %edi # &b
194 xorl %ecx, %edi # (((c ^ d) & b) ^ d) 252 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
195 leal 0x5A827999(%rdx,%r11), %edx # e += RCONST + W[n]
196 addl %edi, %edx # e += (((c ^ d) & b) ^ d) 253 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
197 movl %ebp, %esi # 254 movl %ebp, %edi #
198 roll $5, %esi # rotl32(a,5) 255 roll $5, %edi # rotl32(a,5)
199 addl %esi, %edx # e += rotl32(a,5) 256 addl %edi, %edx # e += rotl32(a,5)
200 rorl $2, %eax # b = rotl32(b,30) 257 rorl $2, %eax # b = rotl32(b,30)
258 pshufd $0x55, %xmm7, %xmm6
259# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
260 movaps %xmm0, %xmm4
261 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
262# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
263# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
264# same result as above, but shorter and faster:
265# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
266# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
267 movaps %xmm1, %xmm5
268 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
269 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
270 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
271 xorps %xmm5, %xmm1 # ^
272 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
273 movaps %xmm1, %xmm5
274 xorps %xmm4, %xmm4 # rol(W0,1):
275 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
276 paddd %xmm1, %xmm1 # shift left by 1
277 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
278 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
279 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
280 movaps %xmm5, %xmm4
281 pslld $2, %xmm5
282 psrld $30, %xmm4
283# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
284 xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
285 xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
286 movaps %xmm1, %xmm5
287 paddd %xmm6, %xmm5
288 movups %xmm5, -64+16*1(%rsp)
201# 12 289# 12
202 # W[n], in %r12 290 leal 0x5A827999(%rcx,%r13), %ecx # e += RCONST + W[n]
291 shrq $32, %r13
203 movl %eax, %edi # c 292 movl %eax, %edi # c
204 xorl %ebx, %edi # ^d 293 xorl %ebx, %edi # ^d
205 andl %ebp, %edi # &b 294 andl %ebp, %edi # &b
206 xorl %ebx, %edi # (((c ^ d) & b) ^ d) 295 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
207 leal 0x5A827999(%rcx,%r12), %ecx # e += RCONST + W[n]
208 addl %edi, %ecx # e += (((c ^ d) & b) ^ d) 296 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
209 movl %edx, %esi # 297 movl %edx, %edi #
210 roll $5, %esi # rotl32(a,5) 298 roll $5, %edi # rotl32(a,5)
211 addl %esi, %ecx # e += rotl32(a,5) 299 addl %edi, %ecx # e += rotl32(a,5)
212 rorl $2, %ebp # b = rotl32(b,30) 300 rorl $2, %ebp # b = rotl32(b,30)
213# 13 301# 13
214 # W[n], in %r13 302 leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
215 movl %ebp, %edi # c 303 movl %ebp, %edi # c
216 xorl %eax, %edi # ^d 304 xorl %eax, %edi # ^d
217 andl %edx, %edi # &b 305 andl %edx, %edi # &b
218 xorl %eax, %edi # (((c ^ d) & b) ^ d) 306 xorl %eax, %edi # (((c ^ d) & b) ^ d)
219 leal 0x5A827999(%rbx,%r13), %ebx # e += RCONST + W[n]
220 addl %edi, %ebx # e += (((c ^ d) & b) ^ d) 307 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
221 movl %ecx, %esi # 308 movl %ecx, %edi #
222 roll $5, %esi # rotl32(a,5) 309 roll $5, %edi # rotl32(a,5)
223 addl %esi, %ebx # e += rotl32(a,5) 310 addl %edi, %ebx # e += rotl32(a,5)
224 rorl $2, %edx # b = rotl32(b,30) 311 rorl $2, %edx # b = rotl32(b,30)
225# 14 312# 14
226 # W[n], in %r14 313 leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
314 shrq $32, %r14
227 movl %edx, %edi # c 315 movl %edx, %edi # c
228 xorl %ebp, %edi # ^d 316 xorl %ebp, %edi # ^d
229 andl %ecx, %edi # &b 317 andl %ecx, %edi # &b
230 xorl %ebp, %edi # (((c ^ d) & b) ^ d) 318 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
231 leal 0x5A827999(%rax,%r14), %eax # e += RCONST + W[n]
232 addl %edi, %eax # e += (((c ^ d) & b) ^ d) 319 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
233 movl %ebx, %esi # 320 movl %ebx, %edi #
234 roll $5, %esi # rotl32(a,5) 321 roll $5, %edi # rotl32(a,5)
235 addl %esi, %eax # e += rotl32(a,5) 322 addl %edi, %eax # e += rotl32(a,5)
236 rorl $2, %ecx # b = rotl32(b,30) 323 rorl $2, %ecx # b = rotl32(b,30)
237# 15 324# 15
238 # W[n], in %r15 325 leal 0x5A827999(%rbp,%r14), %ebp # e += RCONST + W[n]
239 movl %ecx, %edi # c 326 movl %ecx, %edi # c
240 xorl %edx, %edi # ^d 327 xorl %edx, %edi # ^d
241 andl %ebx, %edi # &b 328 andl %ebx, %edi # &b
242 xorl %edx, %edi # (((c ^ d) & b) ^ d) 329 xorl %edx, %edi # (((c ^ d) & b) ^ d)
243 leal 0x5A827999(%rbp,%r15), %ebp # e += RCONST + W[n]
244 addl %edi, %ebp # e += (((c ^ d) & b) ^ d) 330 addl %edi, %ebp # e += (((c ^ d) & b) ^ d)
245 movl %eax, %esi # 331 movl %eax, %edi #
246 roll $5, %esi # rotl32(a,5) 332 roll $5, %edi # rotl32(a,5)
247 addl %esi, %ebp # e += rotl32(a,5) 333 addl %edi, %ebp # e += rotl32(a,5)
248 rorl $2, %ebx # b = rotl32(b,30) 334 rorl $2, %ebx # b = rotl32(b,30)
335# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
336 movaps %xmm1, %xmm4
337 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
338# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
339# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
340# same result as above, but shorter and faster:
341# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
342# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
343 movaps %xmm2, %xmm5
344 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
345 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
346 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
347 xorps %xmm5, %xmm2 # ^
348 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
349 movaps %xmm2, %xmm5
350 xorps %xmm4, %xmm4 # rol(W0,1):
351 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
352 paddd %xmm2, %xmm2 # shift left by 1
353 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
354 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
355 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
356 movaps %xmm5, %xmm4
357 pslld $2, %xmm5
358 psrld $30, %xmm4
359# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
360 xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
361 xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
362 movaps %xmm2, %xmm5
363 paddd %xmm6, %xmm5
364 movups %xmm5, -64+16*2(%rsp)
249# 16 365# 16
250 movl %r13d, %esi # W[(n+13) & 15]
251 xorl %r8d, %esi # ^W[(n+8) & 15]
252 xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15]
253 xorl -32+4*0(%rsp), %esi # ^W[n & 15]
254 roll %esi #
255 movl %esi, -32+4*0(%rsp) # store to W[n & 15]
256 movl %ebx, %edi # c 366 movl %ebx, %edi # c
257 xorl %ecx, %edi # ^d 367 xorl %ecx, %edi # ^d
258 andl %eax, %edi # &b 368 andl %eax, %edi # &b
259 xorl %ecx, %edi # (((c ^ d) & b) ^ d) 369 xorl %ecx, %edi # (((c ^ d) & b) ^ d)
260 leal 0x5A827999(%rdx,%rsi), %edx # e += RCONST + W[n & 15] 370 addl -64+4*0(%rsp), %edx # e += RCONST + W[n & 15]
261 addl %edi, %edx # e += (((c ^ d) & b) ^ d) 371 addl %edi, %edx # e += (((c ^ d) & b) ^ d)
262 movl %ebp, %esi # 372 movl %ebp, %esi #
263 roll $5, %esi # rotl32(a,5) 373 roll $5, %esi # rotl32(a,5)
264 addl %esi, %edx # e += rotl32(a,5) 374 addl %esi, %edx # e += rotl32(a,5)
265 rorl $2, %eax # b = rotl32(b,30) 375 rorl $2, %eax # b = rotl32(b,30)
266# 17 376# 17
267 movl %r14d, %esi # W[(n+13) & 15]
268 xorl %r9d, %esi # ^W[(n+8) & 15]
269 xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15]
270 xorl -32+4*1(%rsp), %esi # ^W[n & 15]
271 roll %esi #
272 movl %esi, -32+4*1(%rsp) # store to W[n & 15]
273 movl %eax, %edi # c 377 movl %eax, %edi # c
274 xorl %ebx, %edi # ^d 378 xorl %ebx, %edi # ^d
275 andl %ebp, %edi # &b 379 andl %ebp, %edi # &b
276 xorl %ebx, %edi # (((c ^ d) & b) ^ d) 380 xorl %ebx, %edi # (((c ^ d) & b) ^ d)
277 leal 0x5A827999(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] 381 addl -64+4*1(%rsp), %ecx # e += RCONST + W[n & 15]
278 addl %edi, %ecx # e += (((c ^ d) & b) ^ d) 382 addl %edi, %ecx # e += (((c ^ d) & b) ^ d)
279 movl %edx, %esi # 383 movl %edx, %esi #
280 roll $5, %esi # rotl32(a,5) 384 roll $5, %esi # rotl32(a,5)
281 addl %esi, %ecx # e += rotl32(a,5) 385 addl %esi, %ecx # e += rotl32(a,5)
282 rorl $2, %ebp # b = rotl32(b,30) 386 rorl $2, %ebp # b = rotl32(b,30)
283# 18 387# 18
284 movl %r15d, %esi # W[(n+13) & 15]
285 xorl %r10d, %esi # ^W[(n+8) & 15]
286 xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15]
287 xorl -32+4*2(%rsp), %esi # ^W[n & 15]
288 roll %esi #
289 movl %esi, -32+4*2(%rsp) # store to W[n & 15]
290 movl %ebp, %edi # c 388 movl %ebp, %edi # c
291 xorl %eax, %edi # ^d 389 xorl %eax, %edi # ^d
292 andl %edx, %edi # &b 390 andl %edx, %edi # &b
293 xorl %eax, %edi # (((c ^ d) & b) ^ d) 391 xorl %eax, %edi # (((c ^ d) & b) ^ d)
294 leal 0x5A827999(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 392 addl -64+4*2(%rsp), %ebx # e += RCONST + W[n & 15]
295 addl %edi, %ebx # e += (((c ^ d) & b) ^ d) 393 addl %edi, %ebx # e += (((c ^ d) & b) ^ d)
296 movl %ecx, %esi # 394 movl %ecx, %esi #
297 roll $5, %esi # rotl32(a,5) 395 roll $5, %esi # rotl32(a,5)
298 addl %esi, %ebx # e += rotl32(a,5) 396 addl %esi, %ebx # e += rotl32(a,5)
299 rorl $2, %edx # b = rotl32(b,30) 397 rorl $2, %edx # b = rotl32(b,30)
300# 19 398# 19
301 movl -32+4*0(%rsp), %esi # W[(n+13) & 15]
302 xorl %r11d, %esi # ^W[(n+8) & 15]
303 xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15]
304 xorl -32+4*3(%rsp), %esi # ^W[n & 15]
305 roll %esi #
306 movl %esi, -32+4*3(%rsp) # store to W[n & 15]
307 movl %edx, %edi # c 399 movl %edx, %edi # c
308 xorl %ebp, %edi # ^d 400 xorl %ebp, %edi # ^d
309 andl %ecx, %edi # &b 401 andl %ecx, %edi # &b
310 xorl %ebp, %edi # (((c ^ d) & b) ^ d) 402 xorl %ebp, %edi # (((c ^ d) & b) ^ d)
311 leal 0x5A827999(%rax,%rsi), %eax # e += RCONST + W[n & 15] 403 addl -64+4*3(%rsp), %eax # e += RCONST + W[n & 15]
312 addl %edi, %eax # e += (((c ^ d) & b) ^ d) 404 addl %edi, %eax # e += (((c ^ d) & b) ^ d)
313 movl %ebx, %esi # 405 movl %ebx, %esi #
314 roll $5, %esi # rotl32(a,5) 406 roll $5, %esi # rotl32(a,5)
315 addl %esi, %eax # e += rotl32(a,5) 407 addl %esi, %eax # e += rotl32(a,5)
316 rorl $2, %ecx # b = rotl32(b,30) 408 rorl $2, %ecx # b = rotl32(b,30)
409# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
410 movaps %xmm2, %xmm4
411 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
412# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
413# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
414# same result as above, but shorter and faster:
415# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
416# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
417 movaps %xmm3, %xmm5
418 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
419 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
420 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
421 xorps %xmm5, %xmm3 # ^
422 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
423 movaps %xmm3, %xmm5
424 xorps %xmm4, %xmm4 # rol(W0,1):
425 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
426 paddd %xmm3, %xmm3 # shift left by 1
427 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
428 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
429 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
430 movaps %xmm5, %xmm4
431 pslld $2, %xmm5
432 psrld $30, %xmm4
433# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
434 xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
435 xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
436 movaps %xmm3, %xmm5
437 paddd %xmm6, %xmm5
438 movups %xmm5, -64+16*3(%rsp)
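
The PREP block that ends here replaces the removed scalar W[] recurrence: it computes four message-schedule words at once and stores W+RCONST to the stack slot named in its comment, which is why the round code above now uses a plain addl instead of leal with an RCONST displacement. Below is a compilable SSE2-intrinsics sketch of one PREP block, checked against the scalar recurrence; the function and variable names are illustrative only, and lane 0 is assumed to hold the lowest-indexed word, as in the listing's comments.

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>	/* SSE2 intrinsics */

static uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

/* One PREP step: inputs hold W[t-16..t-13], W[t-12..t-9], W[t-8..t-5],
 * W[t-4..t-1]; returns the next four schedule words and writes W+K. */
static __m128i sha1_prep_sse2(__m128i w0, __m128i w4, __m128i w8, __m128i w12,
                              __m128i k, __m128i *w_plus_k)
{
	__m128i t1 = _mm_srli_si128(w12, 4);	/* ([13],[14],[15],0), as psrldq $4 */
	__m128i t2 = _mm_castps_si128(		/* ([2],[3],[4],[5]), as shufps $0x4e */
		_mm_shuffle_ps(_mm_castsi128_ps(w0), _mm_castsi128_ps(w4), 0x4e));
	__m128i x = _mm_xor_si128(_mm_xor_si128(w0, w8), _mm_xor_si128(t1, t2));
	/* rotate each 32-bit lane left by 1: double it, then +1 where msb was set */
	__m128i msb = _mm_cmpgt_epi32(_mm_setzero_si128(), x);
	__m128i r = _mm_sub_epi32(_mm_add_epi32(x, x), msb);
	/* lane 3 lacked the just-computed W[t]; patch in rol(unrotated lane 0, 2) */
	__m128i fix = _mm_slli_si128(x, 12);	/* (0,0,0,unrotW[0]), as pslldq $12 */
	r = _mm_xor_si128(r, _mm_xor_si128(_mm_slli_epi32(fix, 2),
	                                   _mm_srli_epi32(fix, 30)));
	*w_plus_k = _mm_add_epi32(r, k);	/* paddd of the broadcast constant */
	return r;
}

int main(void)
{
	uint32_t W[20], out[4];
	int i;

	for (i = 0; i < 16; i++)
		W[i] = 0x9E3779B9u * (i + 1);	/* arbitrary test pattern */
	for (i = 16; i < 20; i++)		/* scalar reference recurrence */
		W[i] = rotl32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1);

	__m128i wk;
	__m128i r = sha1_prep_sse2(_mm_loadu_si128((const __m128i *)&W[0]),
	                           _mm_loadu_si128((const __m128i *)&W[4]),
	                           _mm_loadu_si128((const __m128i *)&W[8]),
	                           _mm_loadu_si128((const __m128i *)&W[12]),
	                           _mm_set1_epi32(0x5A827999), &wk);
	_mm_storeu_si128((__m128i *)out, r);
	for (i = 0; i < 4; i++)
		printf("W[%d] scalar %08x vector %08x\n", 16 + i, W[16 + i], out[i]);
	return 0;
}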
317# 20 439# 20
318 movl -32+4*1(%rsp), %esi # W[(n+13) & 15]
319 xorl %r12d, %esi # ^W[(n+8) & 15]
320 xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15]
321 xorl -32+4*4(%rsp), %esi # ^W[n & 15]
322 roll %esi #
323 movl %esi, -32+4*4(%rsp) # store to W[n & 15]
324 movl %ecx, %edi # c 440 movl %ecx, %edi # c
325 xorl %edx, %edi # ^d 441 xorl %edx, %edi # ^d
326 xorl %ebx, %edi # ^b 442 xorl %ebx, %edi # ^b
327 leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] 443 addl -64+4*4(%rsp), %ebp # e += RCONST + W[n & 15]
328 addl %edi, %ebp # e += (c ^ d ^ b) 444 addl %edi, %ebp # e += (c ^ d ^ b)
329 movl %eax, %esi # 445 movl %eax, %esi #
330 roll $5, %esi # rotl32(a,5) 446 roll $5, %esi # rotl32(a,5)
331 addl %esi, %ebp # e += rotl32(a,5) 447 addl %esi, %ebp # e += rotl32(a,5)
332 rorl $2, %ebx # b = rotl32(b,30) 448 rorl $2, %ebx # b = rotl32(b,30)
333# 21 449# 21
334 movl -32+4*2(%rsp), %esi # W[(n+13) & 15]
335 xorl %r13d, %esi # ^W[(n+8) & 15]
336 xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15]
337 xorl -32+4*5(%rsp), %esi # ^W[n & 15]
338 roll %esi #
339 movl %esi, -32+4*5(%rsp) # store to W[n & 15]
340 movl %ebx, %edi # c 450 movl %ebx, %edi # c
341 xorl %ecx, %edi # ^d 451 xorl %ecx, %edi # ^d
342 xorl %eax, %edi # ^b 452 xorl %eax, %edi # ^b
343 leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] 453 addl -64+4*5(%rsp), %edx # e += RCONST + W[n & 15]
344 addl %edi, %edx # e += (c ^ d ^ b) 454 addl %edi, %edx # e += (c ^ d ^ b)
345 movl %ebp, %esi # 455 movl %ebp, %esi #
346 roll $5, %esi # rotl32(a,5) 456 roll $5, %esi # rotl32(a,5)
347 addl %esi, %edx # e += rotl32(a,5) 457 addl %esi, %edx # e += rotl32(a,5)
348 rorl $2, %eax # b = rotl32(b,30) 458 rorl $2, %eax # b = rotl32(b,30)
349# 22 459# 22
350 movl -32+4*3(%rsp), %esi # W[(n+13) & 15]
351 xorl %r14d, %esi # ^W[(n+8) & 15]
352 xorl %r8d, %esi # ^W[(n+2) & 15]
353 xorl -32+4*6(%rsp), %esi # ^W[n & 15]
354 roll %esi #
355 movl %esi, -32+4*6(%rsp) # store to W[n & 15]
356 movl %eax, %edi # c 460 movl %eax, %edi # c
357 xorl %ebx, %edi # ^d 461 xorl %ebx, %edi # ^d
358 xorl %ebp, %edi # ^b 462 xorl %ebp, %edi # ^b
359 leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] 463 addl -64+4*6(%rsp), %ecx # e += RCONST + W[n & 15]
360 addl %edi, %ecx # e += (c ^ d ^ b) 464 addl %edi, %ecx # e += (c ^ d ^ b)
361 movl %edx, %esi # 465 movl %edx, %esi #
362 roll $5, %esi # rotl32(a,5) 466 roll $5, %esi # rotl32(a,5)
363 addl %esi, %ecx # e += rotl32(a,5) 467 addl %esi, %ecx # e += rotl32(a,5)
364 rorl $2, %ebp # b = rotl32(b,30) 468 rorl $2, %ebp # b = rotl32(b,30)
365# 23 469# 23
366 movl -32+4*4(%rsp), %esi # W[(n+13) & 15]
367 xorl %r15d, %esi # ^W[(n+8) & 15]
368 xorl %r9d, %esi # ^W[(n+2) & 15]
369 xorl -32+4*7(%rsp), %esi # ^W[n & 15]
370 roll %esi #
371 movl %esi, -32+4*7(%rsp) # store to W[n & 15]
372 movl %ebp, %edi # c 470 movl %ebp, %edi # c
373 xorl %eax, %edi # ^d 471 xorl %eax, %edi # ^d
374 xorl %edx, %edi # ^b 472 xorl %edx, %edi # ^b
375 leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 473 addl -64+4*7(%rsp), %ebx # e += RCONST + W[n & 15]
376 addl %edi, %ebx # e += (c ^ d ^ b) 474 addl %edi, %ebx # e += (c ^ d ^ b)
377 movl %ecx, %esi # 475 movl %ecx, %esi #
378 roll $5, %esi # rotl32(a,5) 476 roll $5, %esi # rotl32(a,5)
379 addl %esi, %ebx # e += rotl32(a,5) 477 addl %esi, %ebx # e += rotl32(a,5)
380 rorl $2, %edx # b = rotl32(b,30) 478 rorl $2, %edx # b = rotl32(b,30)
479# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
480 movaps %xmm3, %xmm4
481 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
482# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
483# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
484# same result as above, but shorter and faster:
485# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
486# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
487 movaps %xmm0, %xmm5
488 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
489 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
490 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
491 xorps %xmm5, %xmm0 # ^
492 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
493 movaps %xmm0, %xmm5
494 xorps %xmm4, %xmm4 # rol(W0,1):
495 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
496 paddd %xmm0, %xmm0 # shift left by 1
497 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
498 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
499 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
500 movaps %xmm5, %xmm4
501 pslld $2, %xmm5
502 psrld $30, %xmm4
503# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
504 xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
505 xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
506 movaps %xmm0, %xmm5
507 paddd %xmm6, %xmm5
508 movups %xmm5, -64+16*0(%rsp)
381# 24 509# 24
382 xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15]
383 xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15]
384 xorl %r10d, %r8d # ^W[(n+2) & 15]
385 roll %r8d #
386 movl %edx, %edi # c 510 movl %edx, %edi # c
387 xorl %ebp, %edi # ^d 511 xorl %ebp, %edi # ^d
388 xorl %ecx, %edi # ^b 512 xorl %ecx, %edi # ^b
389 leal 0x6ED9EBA1(%rax,%r8), %eax # e += RCONST + W[n & 15] 513 addl -64+4*8(%rsp), %eax # e += RCONST + W[n & 15]
390 addl %edi, %eax # e += (c ^ d ^ b) 514 addl %edi, %eax # e += (c ^ d ^ b)
391 movl %ebx, %esi # 515 movl %ebx, %esi #
392 roll $5, %esi # rotl32(a,5) 516 roll $5, %esi # rotl32(a,5)
393 addl %esi, %eax # e += rotl32(a,5) 517 addl %esi, %eax # e += rotl32(a,5)
394 rorl $2, %ecx # b = rotl32(b,30) 518 rorl $2, %ecx # b = rotl32(b,30)
395# 25 519# 25
396 xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15]
397 xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15]
398 xorl %r11d, %r9d # ^W[(n+2) & 15]
399 roll %r9d #
400 movl %ecx, %edi # c 520 movl %ecx, %edi # c
401 xorl %edx, %edi # ^d 521 xorl %edx, %edi # ^d
402 xorl %ebx, %edi # ^b 522 xorl %ebx, %edi # ^b
403 leal 0x6ED9EBA1(%rbp,%r9), %ebp # e += RCONST + W[n & 15] 523 addl -64+4*9(%rsp), %ebp # e += RCONST + W[n & 15]
404 addl %edi, %ebp # e += (c ^ d ^ b) 524 addl %edi, %ebp # e += (c ^ d ^ b)
405 movl %eax, %esi # 525 movl %eax, %esi #
406 roll $5, %esi # rotl32(a,5) 526 roll $5, %esi # rotl32(a,5)
407 addl %esi, %ebp # e += rotl32(a,5) 527 addl %esi, %ebp # e += rotl32(a,5)
408 rorl $2, %ebx # b = rotl32(b,30) 528 rorl $2, %ebx # b = rotl32(b,30)
409# 26 529# 26
410 xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15]
411 xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15]
412 xorl %r12d, %r10d # ^W[(n+2) & 15]
413 roll %r10d #
414 movl %ebx, %edi # c 530 movl %ebx, %edi # c
415 xorl %ecx, %edi # ^d 531 xorl %ecx, %edi # ^d
416 xorl %eax, %edi # ^b 532 xorl %eax, %edi # ^b
417 leal 0x6ED9EBA1(%rdx,%r10), %edx # e += RCONST + W[n & 15] 533 addl -64+4*10(%rsp), %edx # e += RCONST + W[n & 15]
418 addl %edi, %edx # e += (c ^ d ^ b) 534 addl %edi, %edx # e += (c ^ d ^ b)
419 movl %ebp, %esi # 535 movl %ebp, %esi #
420 roll $5, %esi # rotl32(a,5) 536 roll $5, %esi # rotl32(a,5)
421 addl %esi, %edx # e += rotl32(a,5) 537 addl %esi, %edx # e += rotl32(a,5)
422 rorl $2, %eax # b = rotl32(b,30) 538 rorl $2, %eax # b = rotl32(b,30)
423# 27 539# 27
424 xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15]
425 xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15]
426 xorl %r13d, %r11d # ^W[(n+2) & 15]
427 roll %r11d #
428 movl %eax, %edi # c 540 movl %eax, %edi # c
429 xorl %ebx, %edi # ^d 541 xorl %ebx, %edi # ^d
430 xorl %ebp, %edi # ^b 542 xorl %ebp, %edi # ^b
431 leal 0x6ED9EBA1(%rcx,%r11), %ecx # e += RCONST + W[n & 15] 543 addl -64+4*11(%rsp), %ecx # e += RCONST + W[n & 15]
432 addl %edi, %ecx # e += (c ^ d ^ b) 544 addl %edi, %ecx # e += (c ^ d ^ b)
433 movl %edx, %esi # 545 movl %edx, %esi #
434 roll $5, %esi # rotl32(a,5) 546 roll $5, %esi # rotl32(a,5)
435 addl %esi, %ecx # e += rotl32(a,5) 547 addl %esi, %ecx # e += rotl32(a,5)
436 rorl $2, %ebp # b = rotl32(b,30) 548 rorl $2, %ebp # b = rotl32(b,30)
549# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
550 movaps %xmm0, %xmm4
551 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
552# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
553# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
554# same result as above, but shorter and faster:
555# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
556# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
557 movaps %xmm1, %xmm5
558 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
559 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
560 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
561 xorps %xmm5, %xmm1 # ^
562 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
563 movaps %xmm1, %xmm5
564 xorps %xmm4, %xmm4 # rol(W0,1):
565 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
566 paddd %xmm1, %xmm1 # shift left by 1
567 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
568 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
569 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
570 movaps %xmm5, %xmm4
571 pslld $2, %xmm5
572 psrld $30, %xmm4
573# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
574 xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
575 xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
576 movaps %xmm1, %xmm5
577 paddd %xmm6, %xmm5
578 movups %xmm5, -64+16*1(%rsp)
437# 28 579# 28
438 xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15]
439 xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15]
440 xorl %r14d, %r12d # ^W[(n+2) & 15]
441 roll %r12d #
442 movl %ebp, %edi # c 580 movl %ebp, %edi # c
443 xorl %eax, %edi # ^d 581 xorl %eax, %edi # ^d
444 xorl %edx, %edi # ^b 582 xorl %edx, %edi # ^b
445 leal 0x6ED9EBA1(%rbx,%r12), %ebx # e += RCONST + W[n & 15] 583 addl -64+4*12(%rsp), %ebx # e += RCONST + W[n & 15]
446 addl %edi, %ebx # e += (c ^ d ^ b) 584 addl %edi, %ebx # e += (c ^ d ^ b)
447 movl %ecx, %esi # 585 movl %ecx, %esi #
448 roll $5, %esi # rotl32(a,5) 586 roll $5, %esi # rotl32(a,5)
449 addl %esi, %ebx # e += rotl32(a,5) 587 addl %esi, %ebx # e += rotl32(a,5)
450 rorl $2, %edx # b = rotl32(b,30) 588 rorl $2, %edx # b = rotl32(b,30)
451# 29 589# 29
452 xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15]
453 xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15]
454 xorl %r15d, %r13d # ^W[(n+2) & 15]
455 roll %r13d #
456 movl %edx, %edi # c 590 movl %edx, %edi # c
457 xorl %ebp, %edi # ^d 591 xorl %ebp, %edi # ^d
458 xorl %ecx, %edi # ^b 592 xorl %ecx, %edi # ^b
459 leal 0x6ED9EBA1(%rax,%r13), %eax # e += RCONST + W[n & 15] 593 addl -64+4*13(%rsp), %eax # e += RCONST + W[n & 15]
460 addl %edi, %eax # e += (c ^ d ^ b) 594 addl %edi, %eax # e += (c ^ d ^ b)
461 movl %ebx, %esi # 595 movl %ebx, %esi #
462 roll $5, %esi # rotl32(a,5) 596 roll $5, %esi # rotl32(a,5)
463 addl %esi, %eax # e += rotl32(a,5) 597 addl %esi, %eax # e += rotl32(a,5)
464 rorl $2, %ecx # b = rotl32(b,30) 598 rorl $2, %ecx # b = rotl32(b,30)
465# 30 599# 30
466 xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15]
467 xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15]
468 xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15]
469 roll %r14d #
470 movl %ecx, %edi # c 600 movl %ecx, %edi # c
471 xorl %edx, %edi # ^d 601 xorl %edx, %edi # ^d
472 xorl %ebx, %edi # ^b 602 xorl %ebx, %edi # ^b
473 leal 0x6ED9EBA1(%rbp,%r14), %ebp # e += RCONST + W[n & 15] 603 addl -64+4*14(%rsp), %ebp # e += RCONST + W[n & 15]
474 addl %edi, %ebp # e += (c ^ d ^ b) 604 addl %edi, %ebp # e += (c ^ d ^ b)
475 movl %eax, %esi # 605 movl %eax, %esi #
476 roll $5, %esi # rotl32(a,5) 606 roll $5, %esi # rotl32(a,5)
477 addl %esi, %ebp # e += rotl32(a,5) 607 addl %esi, %ebp # e += rotl32(a,5)
478 rorl $2, %ebx # b = rotl32(b,30) 608 rorl $2, %ebx # b = rotl32(b,30)
479# 31 609# 31
480 xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15]
481 xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15]
482 xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15]
483 roll %r15d #
484 movl %ebx, %edi # c 610 movl %ebx, %edi # c
485 xorl %ecx, %edi # ^d 611 xorl %ecx, %edi # ^d
486 xorl %eax, %edi # ^b 612 xorl %eax, %edi # ^b
487 leal 0x6ED9EBA1(%rdx,%r15), %edx # e += RCONST + W[n & 15] 613 addl -64+4*15(%rsp), %edx # e += RCONST + W[n & 15]
488 addl %edi, %edx # e += (c ^ d ^ b) 614 addl %edi, %edx # e += (c ^ d ^ b)
489 movl %ebp, %esi # 615 movl %ebp, %esi #
490 roll $5, %esi # rotl32(a,5) 616 roll $5, %esi # rotl32(a,5)
491 addl %esi, %edx # e += rotl32(a,5) 617 addl %esi, %edx # e += rotl32(a,5)
492 rorl $2, %eax # b = rotl32(b,30) 618 rorl $2, %eax # b = rotl32(b,30)
619 pshufd $0xaa, %xmm7, %xmm6
620# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
621 movaps %xmm1, %xmm4
622 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
623# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
624# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
625# same result as above, but shorter and faster:
626# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
627# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
628 movaps %xmm2, %xmm5
629 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
630 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
631 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
632 xorps %xmm5, %xmm2 # ^
633 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
634 movaps %xmm2, %xmm5
635 xorps %xmm4, %xmm4 # rol(W0,1):
636 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
637 paddd %xmm2, %xmm2 # shift left by 1
638 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
639 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
640 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
641 movaps %xmm5, %xmm4
642 pslld $2, %xmm5
643 psrld $30, %xmm4
644# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
645 xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
646 xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
647 movaps %xmm2, %xmm5
648 paddd %xmm6, %xmm5
649 movups %xmm5, -64+16*2(%rsp)
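
The pshufd $0xaa, %xmm7, %xmm6 a few lines above switches which round constant PREP folds into the stored W values: the immediate broadcasts one 32-bit lane of %xmm7 into all lanes of %xmm6. That %xmm7 holds the four SHA-1 constants, loaded earlier from a constant table, is an assumption here since the load is outside this hunk. A small illustrative equivalent with intrinsics:

#include <stdint.h>
#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
	/* lane 0..3; pshufd $0x00/$0x55/$0xaa/$0xff picks lane 0/1/2/3 */
	const __m128i k_table = _mm_setr_epi32(0x5A827999, 0x6ED9EBA1,
	                                       (int)0x8F1BBCDC, (int)0xCA62C1D6);
	__m128i k2 = _mm_shuffle_epi32(k_table, 0xaa);	/* broadcast lane 2 */
	__m128i k3 = _mm_shuffle_epi32(k_table, 0xff);	/* broadcast lane 3 */

	uint32_t out[4];
	_mm_storeu_si128((__m128i *)out, k2);
	printf("k2 lanes: %08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
	_mm_storeu_si128((__m128i *)out, k3);
	printf("k3 lanes: %08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
	return 0;
}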
493# 32 650# 32
494 movl %r13d, %esi # W[(n+13) & 15]
495 xorl %r8d, %esi # ^W[(n+8) & 15]
496 xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15]
497 xorl -32+4*0(%rsp), %esi # ^W[n & 15]
498 roll %esi #
499 movl %esi, -32+4*0(%rsp) # store to W[n & 15]
500 movl %eax, %edi # c 651 movl %eax, %edi # c
501 xorl %ebx, %edi # ^d 652 xorl %ebx, %edi # ^d
502 xorl %ebp, %edi # ^b 653 xorl %ebp, %edi # ^b
503 leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] 654 addl -64+4*0(%rsp), %ecx # e += RCONST + W[n & 15]
504 addl %edi, %ecx # e += (c ^ d ^ b) 655 addl %edi, %ecx # e += (c ^ d ^ b)
505 movl %edx, %esi # 656 movl %edx, %esi #
506 roll $5, %esi # rotl32(a,5) 657 roll $5, %esi # rotl32(a,5)
507 addl %esi, %ecx # e += rotl32(a,5) 658 addl %esi, %ecx # e += rotl32(a,5)
508 rorl $2, %ebp # b = rotl32(b,30) 659 rorl $2, %ebp # b = rotl32(b,30)
509# 33 660# 33
510 movl %r14d, %esi # W[(n+13) & 15]
511 xorl %r9d, %esi # ^W[(n+8) & 15]
512 xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15]
513 xorl -32+4*1(%rsp), %esi # ^W[n & 15]
514 roll %esi #
515 movl %esi, -32+4*1(%rsp) # store to W[n & 15]
516 movl %ebp, %edi # c 661 movl %ebp, %edi # c
517 xorl %eax, %edi # ^d 662 xorl %eax, %edi # ^d
518 xorl %edx, %edi # ^b 663 xorl %edx, %edi # ^b
519 leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 664 addl -64+4*1(%rsp), %ebx # e += RCONST + W[n & 15]
520 addl %edi, %ebx # e += (c ^ d ^ b) 665 addl %edi, %ebx # e += (c ^ d ^ b)
521 movl %ecx, %esi # 666 movl %ecx, %esi #
522 roll $5, %esi # rotl32(a,5) 667 roll $5, %esi # rotl32(a,5)
523 addl %esi, %ebx # e += rotl32(a,5) 668 addl %esi, %ebx # e += rotl32(a,5)
524 rorl $2, %edx # b = rotl32(b,30) 669 rorl $2, %edx # b = rotl32(b,30)
525# 34 670# 34
526 movl %r15d, %esi # W[(n+13) & 15]
527 xorl %r10d, %esi # ^W[(n+8) & 15]
528 xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15]
529 xorl -32+4*2(%rsp), %esi # ^W[n & 15]
530 roll %esi #
531 movl %esi, -32+4*2(%rsp) # store to W[n & 15]
532 movl %edx, %edi # c 671 movl %edx, %edi # c
533 xorl %ebp, %edi # ^d 672 xorl %ebp, %edi # ^d
534 xorl %ecx, %edi # ^b 673 xorl %ecx, %edi # ^b
535 leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] 674 addl -64+4*2(%rsp), %eax # e += RCONST + W[n & 15]
536 addl %edi, %eax # e += (c ^ d ^ b) 675 addl %edi, %eax # e += (c ^ d ^ b)
537 movl %ebx, %esi # 676 movl %ebx, %esi #
538 roll $5, %esi # rotl32(a,5) 677 roll $5, %esi # rotl32(a,5)
539 addl %esi, %eax # e += rotl32(a,5) 678 addl %esi, %eax # e += rotl32(a,5)
540 rorl $2, %ecx # b = rotl32(b,30) 679 rorl $2, %ecx # b = rotl32(b,30)
541# 35 680# 35
542 movl -32+4*0(%rsp), %esi # W[(n+13) & 15]
543 xorl %r11d, %esi # ^W[(n+8) & 15]
544 xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15]
545 xorl -32+4*3(%rsp), %esi # ^W[n & 15]
546 roll %esi #
547 movl %esi, -32+4*3(%rsp) # store to W[n & 15]
548 movl %ecx, %edi # c 681 movl %ecx, %edi # c
549 xorl %edx, %edi # ^d 682 xorl %edx, %edi # ^d
550 xorl %ebx, %edi # ^b 683 xorl %ebx, %edi # ^b
551 leal 0x6ED9EBA1(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] 684 addl -64+4*3(%rsp), %ebp # e += RCONST + W[n & 15]
552 addl %edi, %ebp # e += (c ^ d ^ b) 685 addl %edi, %ebp # e += (c ^ d ^ b)
553 movl %eax, %esi # 686 movl %eax, %esi #
554 roll $5, %esi # rotl32(a,5) 687 roll $5, %esi # rotl32(a,5)
555 addl %esi, %ebp # e += rotl32(a,5) 688 addl %esi, %ebp # e += rotl32(a,5)
556 rorl $2, %ebx # b = rotl32(b,30) 689 rorl $2, %ebx # b = rotl32(b,30)
690# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
691 movaps %xmm2, %xmm4
692 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
693# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
694# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
695# same result as above, but shorter and faster:
696# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
697# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
698 movaps %xmm3, %xmm5
699 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
700 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
701 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
702 xorps %xmm5, %xmm3 # ^
703 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
704 movaps %xmm3, %xmm5
705 xorps %xmm4, %xmm4 # rol(W0,1):
706 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
707 paddd %xmm3, %xmm3 # shift left by 1
708 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
709 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
710 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
711 movaps %xmm5, %xmm4
712 pslld $2, %xmm5
713 psrld $30, %xmm4
714# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
715 xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
716 xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
717 movaps %xmm3, %xmm5
718 paddd %xmm6, %xmm5
719 movups %xmm5, -64+16*3(%rsp)
557# 36 720# 36
558 movl -32+4*1(%rsp), %esi # W[(n+13) & 15]
559 xorl %r12d, %esi # ^W[(n+8) & 15]
560 xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15]
561 xorl -32+4*4(%rsp), %esi # ^W[n & 15]
562 roll %esi #
563 movl %esi, -32+4*4(%rsp) # store to W[n & 15]
564 movl %ebx, %edi # c 721 movl %ebx, %edi # c
565 xorl %ecx, %edi # ^d 722 xorl %ecx, %edi # ^d
566 xorl %eax, %edi # ^b 723 xorl %eax, %edi # ^b
567 leal 0x6ED9EBA1(%rdx,%rsi), %edx # e += RCONST + W[n & 15] 724 addl -64+4*4(%rsp), %edx # e += RCONST + W[n & 15]
568 addl %edi, %edx # e += (c ^ d ^ b) 725 addl %edi, %edx # e += (c ^ d ^ b)
569 movl %ebp, %esi # 726 movl %ebp, %esi #
570 roll $5, %esi # rotl32(a,5) 727 roll $5, %esi # rotl32(a,5)
571 addl %esi, %edx # e += rotl32(a,5) 728 addl %esi, %edx # e += rotl32(a,5)
572 rorl $2, %eax # b = rotl32(b,30) 729 rorl $2, %eax # b = rotl32(b,30)
573# 37 730# 37
574 movl -32+4*2(%rsp), %esi # W[(n+13) & 15]
575 xorl %r13d, %esi # ^W[(n+8) & 15]
576 xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15]
577 xorl -32+4*5(%rsp), %esi # ^W[n & 15]
578 roll %esi #
579 movl %esi, -32+4*5(%rsp) # store to W[n & 15]
580 movl %eax, %edi # c 731 movl %eax, %edi # c
581 xorl %ebx, %edi # ^d 732 xorl %ebx, %edi # ^d
582 xorl %ebp, %edi # ^b 733 xorl %ebp, %edi # ^b
583 leal 0x6ED9EBA1(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] 734 addl -64+4*5(%rsp), %ecx # e += RCONST + W[n & 15]
584 addl %edi, %ecx # e += (c ^ d ^ b) 735 addl %edi, %ecx # e += (c ^ d ^ b)
585 movl %edx, %esi # 736 movl %edx, %esi #
586 roll $5, %esi # rotl32(a,5) 737 roll $5, %esi # rotl32(a,5)
587 addl %esi, %ecx # e += rotl32(a,5) 738 addl %esi, %ecx # e += rotl32(a,5)
588 rorl $2, %ebp # b = rotl32(b,30) 739 rorl $2, %ebp # b = rotl32(b,30)
589# 38 740# 38
590 movl -32+4*3(%rsp), %esi # W[(n+13) & 15]
591 xorl %r14d, %esi # ^W[(n+8) & 15]
592 xorl %r8d, %esi # ^W[(n+2) & 15]
593 xorl -32+4*6(%rsp), %esi # ^W[n & 15]
594 roll %esi #
595 movl %esi, -32+4*6(%rsp) # store to W[n & 15]
596 movl %ebp, %edi # c 741 movl %ebp, %edi # c
597 xorl %eax, %edi # ^d 742 xorl %eax, %edi # ^d
598 xorl %edx, %edi # ^b 743 xorl %edx, %edi # ^b
599 leal 0x6ED9EBA1(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 744 addl -64+4*6(%rsp), %ebx # e += RCONST + W[n & 15]
600 addl %edi, %ebx # e += (c ^ d ^ b) 745 addl %edi, %ebx # e += (c ^ d ^ b)
601 movl %ecx, %esi # 746 movl %ecx, %esi #
602 roll $5, %esi # rotl32(a,5) 747 roll $5, %esi # rotl32(a,5)
603 addl %esi, %ebx # e += rotl32(a,5) 748 addl %esi, %ebx # e += rotl32(a,5)
604 rorl $2, %edx # b = rotl32(b,30) 749 rorl $2, %edx # b = rotl32(b,30)
605# 39 750# 39
606 movl -32+4*4(%rsp), %esi # W[(n+13) & 15]
607 xorl %r15d, %esi # ^W[(n+8) & 15]
608 xorl %r9d, %esi # ^W[(n+2) & 15]
609 xorl -32+4*7(%rsp), %esi # ^W[n & 15]
610 roll %esi #
611 movl %esi, -32+4*7(%rsp) # store to W[n & 15]
612 movl %edx, %edi # c 751 movl %edx, %edi # c
613 xorl %ebp, %edi # ^d 752 xorl %ebp, %edi # ^d
614 xorl %ecx, %edi # ^b 753 xorl %ecx, %edi # ^b
615 leal 0x6ED9EBA1(%rax,%rsi), %eax # e += RCONST + W[n & 15] 754 addl -64+4*7(%rsp), %eax # e += RCONST + W[n & 15]
616 addl %edi, %eax # e += (c ^ d ^ b) 755 addl %edi, %eax # e += (c ^ d ^ b)
617 movl %ebx, %esi # 756 movl %ebx, %esi #
618 roll $5, %esi # rotl32(a,5) 757 roll $5, %esi # rotl32(a,5)
619 addl %esi, %eax # e += rotl32(a,5) 758 addl %esi, %eax # e += rotl32(a,5)
620 rorl $2, %ecx # b = rotl32(b,30) 759 rorl $2, %ecx # b = rotl32(b,30)
760# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
761 movaps %xmm3, %xmm4
762 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
763# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
764# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
765# same result as above, but shorter and faster:
766# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
767# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
768 movaps %xmm0, %xmm5
769 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
770 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
771 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
772 xorps %xmm5, %xmm0 # ^
773 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
774 movaps %xmm0, %xmm5
775 xorps %xmm4, %xmm4 # rol(W0,1):
776 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
777 paddd %xmm0, %xmm0 # shift left by 1
778 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
779 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
780 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
781 movaps %xmm5, %xmm4
782 pslld $2, %xmm5
783 psrld $30, %xmm4
784# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
785 xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
786 xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
787 movaps %xmm0, %xmm5
788 paddd %xmm6, %xmm5
789 movups %xmm5, -64+16*0(%rsp)
621# 40 790# 40
622 movl %ebx, %edi # di: b 791 movl %ebx, %edi # di: b
623 movl %ebx, %esi # si: b 792 movl %ebx, %esi # si: b
@@ -625,12 +794,8 @@ sha1_process_block64:
625 andl %ecx, %esi # si: b & c 794 andl %ecx, %esi # si: b & c
626 andl %edx, %edi # di: (b | c) & d 795 andl %edx, %edi # di: (b | c) & d
627 orl %esi, %edi # ((b | c) & d) | (b & c) 796 orl %esi, %edi # ((b | c) & d) | (b & c)
628 xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15]
629 xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15]
630 xorl %r10d, %r8d # ^W[(n+2) & 15]
631 roll %r8d #
632 addl %edi, %ebp # += ((b | c) & d) | (b & c) 797 addl %edi, %ebp # += ((b | c) & d) | (b & c)
633 leal -0x70E44324(%rbp,%r8), %ebp # e += RCONST + W[n & 15] 798 addl -64+4*8(%rsp), %ebp # e += RCONST + W[n & 15]
634 movl %eax, %esi # 799 movl %eax, %esi #
635 roll $5, %esi # rotl32(a,5) 800 roll $5, %esi # rotl32(a,5)
636 addl %esi, %ebp # e += rotl32(a,5) 801 addl %esi, %ebp # e += rotl32(a,5)
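
Two details in the rounds-40-and-up blocks are easy to verify independently: ((b | c) & d) | (b & c) is the majority function written with one operation fewer than (b & c) | (b & d) | (c & d), and the displacement -0x70E44324 in the removed leal lines is the constant 0x8F1BBCDC expressed as a signed 32-bit immediate (likewise -0x359D3E2A is 0xCA62C1D6 further down). A brute-force check, purely illustrative:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t b, c, d;
	int ok = 1;

	/* checking single-bit inputs is enough: both forms are purely bitwise */
	for (b = 0; b <= 1; b++)
		for (c = 0; c <= 1; c++)
			for (d = 0; d <= 1; d++) {
				uint32_t maj  = (b & c) | (b & d) | (c & d);
				uint32_t impl = ((b | c) & d) | (b & c);
				if (maj != impl)
					ok = 0;
			}
	printf("majority identity: %s\n", ok ? "ok" : "BROKEN");
	printf("0x8F1BBCDC as signed: %d (== -0x70E44324? %s)\n",
	       (int32_t)0x8F1BBCDCu,
	       (int32_t)0x8F1BBCDCu == -0x70E44324 ? "yes" : "no");
	printf("0xCA62C1D6 as signed: %d (== -0x359D3E2A? %s)\n",
	       (int32_t)0xCA62C1D6u,
	       (int32_t)0xCA62C1D6u == -0x359D3E2A ? "yes" : "no");
	return 0;
}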
@@ -642,12 +807,8 @@ sha1_process_block64:
642 andl %ebx, %esi # si: b & c 807 andl %ebx, %esi # si: b & c
643 andl %ecx, %edi # di: (b | c) & d 808 andl %ecx, %edi # di: (b | c) & d
644 orl %esi, %edi # ((b | c) & d) | (b & c) 809 orl %esi, %edi # ((b | c) & d) | (b & c)
645 xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15]
646 xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15]
647 xorl %r11d, %r9d # ^W[(n+2) & 15]
648 roll %r9d #
649 addl %edi, %edx # += ((b | c) & d) | (b & c) 810 addl %edi, %edx # += ((b | c) & d) | (b & c)
650 leal -0x70E44324(%rdx,%r9), %edx # e += RCONST + W[n & 15] 811 addl -64+4*9(%rsp), %edx # e += RCONST + W[n & 15]
651 movl %ebp, %esi # 812 movl %ebp, %esi #
652 roll $5, %esi # rotl32(a,5) 813 roll $5, %esi # rotl32(a,5)
653 addl %esi, %edx # e += rotl32(a,5) 814 addl %esi, %edx # e += rotl32(a,5)
@@ -659,12 +820,8 @@ sha1_process_block64:
659 andl %eax, %esi # si: b & c 820 andl %eax, %esi # si: b & c
660 andl %ebx, %edi # di: (b | c) & d 821 andl %ebx, %edi # di: (b | c) & d
661 orl %esi, %edi # ((b | c) & d) | (b & c) 822 orl %esi, %edi # ((b | c) & d) | (b & c)
662 xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15]
663 xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15]
664 xorl %r12d, %r10d # ^W[(n+2) & 15]
665 roll %r10d #
666 addl %edi, %ecx # += ((b | c) & d) | (b & c) 823 addl %edi, %ecx # += ((b | c) & d) | (b & c)
667 leal -0x70E44324(%rcx,%r10), %ecx # e += RCONST + W[n & 15] 824 addl -64+4*10(%rsp), %ecx # e += RCONST + W[n & 15]
668 movl %edx, %esi # 825 movl %edx, %esi #
669 roll $5, %esi # rotl32(a,5) 826 roll $5, %esi # rotl32(a,5)
670 addl %esi, %ecx # e += rotl32(a,5) 827 addl %esi, %ecx # e += rotl32(a,5)
@@ -676,16 +833,42 @@ sha1_process_block64:
676 andl %ebp, %esi # si: b & c 833 andl %ebp, %esi # si: b & c
677 andl %eax, %edi # di: (b | c) & d 834 andl %eax, %edi # di: (b | c) & d
678 orl %esi, %edi # ((b | c) & d) | (b & c) 835 orl %esi, %edi # ((b | c) & d) | (b & c)
679 xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15]
680 xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15]
681 xorl %r13d, %r11d # ^W[(n+2) & 15]
682 roll %r11d #
683 addl %edi, %ebx # += ((b | c) & d) | (b & c) 836 addl %edi, %ebx # += ((b | c) & d) | (b & c)
684 leal -0x70E44324(%rbx,%r11), %ebx # e += RCONST + W[n & 15] 837 addl -64+4*11(%rsp), %ebx # e += RCONST + W[n & 15]
685 movl %ecx, %esi # 838 movl %ecx, %esi #
686 roll $5, %esi # rotl32(a,5) 839 roll $5, %esi # rotl32(a,5)
687 addl %esi, %ebx # e += rotl32(a,5) 840 addl %esi, %ebx # e += rotl32(a,5)
688 rorl $2, %edx # b = rotl32(b,30) 841 rorl $2, %edx # b = rotl32(b,30)
842# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
843 movaps %xmm0, %xmm4
844 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
845# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
846# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
847# same result as above, but shorter and faster:
848# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
849# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
850 movaps %xmm1, %xmm5
851 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
852 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
853 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
854 xorps %xmm5, %xmm1 # ^
855 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
856 movaps %xmm1, %xmm5
857 xorps %xmm4, %xmm4 # rol(W0,1):
858 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
859 paddd %xmm1, %xmm1 # shift left by 1
860 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
861 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
862 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
863 movaps %xmm5, %xmm4
864 pslld $2, %xmm5
865 psrld $30, %xmm4
866# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
867 xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
868 xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
869 movaps %xmm1, %xmm5
870 paddd %xmm6, %xmm5
871 movups %xmm5, -64+16*1(%rsp)
689# 44 872# 44
690 movl %ecx, %edi # di: b 873 movl %ecx, %edi # di: b
691 movl %ecx, %esi # si: b 874 movl %ecx, %esi # si: b
@@ -693,12 +876,8 @@ sha1_process_block64:
693 andl %edx, %esi # si: b & c 876 andl %edx, %esi # si: b & c
694 andl %ebp, %edi # di: (b | c) & d 877 andl %ebp, %edi # di: (b | c) & d
695 orl %esi, %edi # ((b | c) & d) | (b & c) 878 orl %esi, %edi # ((b | c) & d) | (b & c)
696 xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15]
697 xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15]
698 xorl %r14d, %r12d # ^W[(n+2) & 15]
699 roll %r12d #
700 addl %edi, %eax # += ((b | c) & d) | (b & c) 879 addl %edi, %eax # += ((b | c) & d) | (b & c)
701 leal -0x70E44324(%rax,%r12), %eax # e += RCONST + W[n & 15] 880 addl -64+4*12(%rsp), %eax # e += RCONST + W[n & 15]
702 movl %ebx, %esi # 881 movl %ebx, %esi #
703 roll $5, %esi # rotl32(a,5) 882 roll $5, %esi # rotl32(a,5)
704 addl %esi, %eax # e += rotl32(a,5) 883 addl %esi, %eax # e += rotl32(a,5)
@@ -710,12 +889,8 @@ sha1_process_block64:
710 andl %ecx, %esi # si: b & c 889 andl %ecx, %esi # si: b & c
711 andl %edx, %edi # di: (b | c) & d 890 andl %edx, %edi # di: (b | c) & d
712 orl %esi, %edi # ((b | c) & d) | (b & c) 891 orl %esi, %edi # ((b | c) & d) | (b & c)
713 xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15]
714 xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15]
715 xorl %r15d, %r13d # ^W[(n+2) & 15]
716 roll %r13d #
717 addl %edi, %ebp # += ((b | c) & d) | (b & c) 892 addl %edi, %ebp # += ((b | c) & d) | (b & c)
718 leal -0x70E44324(%rbp,%r13), %ebp # e += RCONST + W[n & 15] 893 addl -64+4*13(%rsp), %ebp # e += RCONST + W[n & 15]
719 movl %eax, %esi # 894 movl %eax, %esi #
720 roll $5, %esi # rotl32(a,5) 895 roll $5, %esi # rotl32(a,5)
721 addl %esi, %ebp # e += rotl32(a,5) 896 addl %esi, %ebp # e += rotl32(a,5)
@@ -727,12 +902,8 @@ sha1_process_block64:
727 andl %ebx, %esi # si: b & c 902 andl %ebx, %esi # si: b & c
728 andl %ecx, %edi # di: (b | c) & d 903 andl %ecx, %edi # di: (b | c) & d
729 orl %esi, %edi # ((b | c) & d) | (b & c) 904 orl %esi, %edi # ((b | c) & d) | (b & c)
730 xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15]
731 xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15]
732 xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15]
733 roll %r14d #
734 addl %edi, %edx # += ((b | c) & d) | (b & c) 905 addl %edi, %edx # += ((b | c) & d) | (b & c)
735 leal -0x70E44324(%rdx,%r14), %edx # e += RCONST + W[n & 15] 906 addl -64+4*14(%rsp), %edx # e += RCONST + W[n & 15]
736 movl %ebp, %esi # 907 movl %ebp, %esi #
737 roll $5, %esi # rotl32(a,5) 908 roll $5, %esi # rotl32(a,5)
738 addl %esi, %edx # e += rotl32(a,5) 909 addl %esi, %edx # e += rotl32(a,5)
@@ -744,16 +915,42 @@ sha1_process_block64:
744 andl %eax, %esi # si: b & c 915 andl %eax, %esi # si: b & c
745 andl %ebx, %edi # di: (b | c) & d 916 andl %ebx, %edi # di: (b | c) & d
746 orl %esi, %edi # ((b | c) & d) | (b & c) 917 orl %esi, %edi # ((b | c) & d) | (b & c)
747 xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15]
748 xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15]
749 xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15]
750 roll %r15d #
751 addl %edi, %ecx # += ((b | c) & d) | (b & c) 918 addl %edi, %ecx # += ((b | c) & d) | (b & c)
752 leal -0x70E44324(%rcx,%r15), %ecx # e += RCONST + W[n & 15] 919 addl -64+4*15(%rsp), %ecx # e += RCONST + W[n & 15]
753 movl %edx, %esi # 920 movl %edx, %esi #
754 roll $5, %esi # rotl32(a,5) 921 roll $5, %esi # rotl32(a,5)
755 addl %esi, %ecx # e += rotl32(a,5) 922 addl %esi, %ecx # e += rotl32(a,5)
756 rorl $2, %ebp # b = rotl32(b,30) 923 rorl $2, %ebp # b = rotl32(b,30)
924# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
925 movaps %xmm1, %xmm4
926 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
927# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
928# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
929# same result as above, but shorter and faster:
930# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
931# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
932 movaps %xmm2, %xmm5
933 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
934 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
935 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
936 xorps %xmm5, %xmm2 # ^
937 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
938 movaps %xmm2, %xmm5
939 xorps %xmm4, %xmm4 # rol(W0,1):
940 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
941 paddd %xmm2, %xmm2 # shift left by 1
942 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
943 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
944 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
945 movaps %xmm5, %xmm4
946 pslld $2, %xmm5
947 psrld $30, %xmm4
948# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
949 xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
950 xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
951 movaps %xmm2, %xmm5
952 paddd %xmm6, %xmm5
953 movups %xmm5, -64+16*2(%rsp)
757# 48 954# 48
758 movl %edx, %edi # di: b 955 movl %edx, %edi # di: b
759 movl %edx, %esi # si: b 956 movl %edx, %esi # si: b
@@ -761,14 +958,8 @@ sha1_process_block64:
761 andl %ebp, %esi # si: b & c 958 andl %ebp, %esi # si: b & c
762 andl %eax, %edi # di: (b | c) & d 959 andl %eax, %edi # di: (b | c) & d
763 orl %esi, %edi # ((b | c) & d) | (b & c) 960 orl %esi, %edi # ((b | c) & d) | (b & c)
764 movl %r13d, %esi # W[(n+13) & 15]
765 xorl %r8d, %esi # ^W[(n+8) & 15]
766 xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15]
767 xorl -32+4*0(%rsp), %esi # ^W[n & 15]
768 roll %esi #
769 movl %esi, -32+4*0(%rsp) # store to W[n & 15]
770 addl %edi, %ebx # += ((b | c) & d) | (b & c) 961 addl %edi, %ebx # += ((b | c) & d) | (b & c)
771 leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 962 addl -64+4*0(%rsp), %ebx # e += RCONST + W[n & 15]
772 movl %ecx, %esi # 963 movl %ecx, %esi #
773 roll $5, %esi # rotl32(a,5) 964 roll $5, %esi # rotl32(a,5)
774 addl %esi, %ebx # e += rotl32(a,5) 965 addl %esi, %ebx # e += rotl32(a,5)
@@ -780,14 +971,8 @@ sha1_process_block64:
780 andl %edx, %esi # si: b & c 971 andl %edx, %esi # si: b & c
781 andl %ebp, %edi # di: (b | c) & d 972 andl %ebp, %edi # di: (b | c) & d
782 orl %esi, %edi # ((b | c) & d) | (b & c) 973 orl %esi, %edi # ((b | c) & d) | (b & c)
783 movl %r14d, %esi # W[(n+13) & 15]
784 xorl %r9d, %esi # ^W[(n+8) & 15]
785 xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15]
786 xorl -32+4*1(%rsp), %esi # ^W[n & 15]
787 roll %esi #
788 movl %esi, -32+4*1(%rsp) # store to W[n & 15]
789 addl %edi, %eax # += ((b | c) & d) | (b & c) 974 addl %edi, %eax # += ((b | c) & d) | (b & c)
790 leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] 975 addl -64+4*1(%rsp), %eax # e += RCONST + W[n & 15]
791 movl %ebx, %esi # 976 movl %ebx, %esi #
792 roll $5, %esi # rotl32(a,5) 977 roll $5, %esi # rotl32(a,5)
793 addl %esi, %eax # e += rotl32(a,5) 978 addl %esi, %eax # e += rotl32(a,5)
@@ -799,14 +984,8 @@ sha1_process_block64:
799 andl %ecx, %esi # si: b & c 984 andl %ecx, %esi # si: b & c
800 andl %edx, %edi # di: (b | c) & d 985 andl %edx, %edi # di: (b | c) & d
801 orl %esi, %edi # ((b | c) & d) | (b & c) 986 orl %esi, %edi # ((b | c) & d) | (b & c)
802 movl %r15d, %esi # W[(n+13) & 15]
803 xorl %r10d, %esi # ^W[(n+8) & 15]
804 xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15]
805 xorl -32+4*2(%rsp), %esi # ^W[n & 15]
806 roll %esi #
807 movl %esi, -32+4*2(%rsp) # store to W[n & 15]
808 addl %edi, %ebp # += ((b | c) & d) | (b & c) 987 addl %edi, %ebp # += ((b | c) & d) | (b & c)
809 leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] 988 addl -64+4*2(%rsp), %ebp # e += RCONST + W[n & 15]
810 movl %eax, %esi # 989 movl %eax, %esi #
811 roll $5, %esi # rotl32(a,5) 990 roll $5, %esi # rotl32(a,5)
812 addl %esi, %ebp # e += rotl32(a,5) 991 addl %esi, %ebp # e += rotl32(a,5)
@@ -818,18 +997,43 @@ sha1_process_block64:
818 andl %ebx, %esi # si: b & c 997 andl %ebx, %esi # si: b & c
819 andl %ecx, %edi # di: (b | c) & d 998 andl %ecx, %edi # di: (b | c) & d
820 orl %esi, %edi # ((b | c) & d) | (b & c) 999 orl %esi, %edi # ((b | c) & d) | (b & c)
821 movl -32+4*0(%rsp), %esi # W[(n+13) & 15]
822 xorl %r11d, %esi # ^W[(n+8) & 15]
823 xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15]
824 xorl -32+4*3(%rsp), %esi # ^W[n & 15]
825 roll %esi #
826 movl %esi, -32+4*3(%rsp) # store to W[n & 15]
827 addl %edi, %edx # += ((b | c) & d) | (b & c) 1000 addl %edi, %edx # += ((b | c) & d) | (b & c)
828 leal -0x70E44324(%rdx,%rsi), %edx # e += RCONST + W[n & 15] 1001 addl -64+4*3(%rsp), %edx # e += RCONST + W[n & 15]
829 movl %ebp, %esi # 1002 movl %ebp, %esi #
830 roll $5, %esi # rotl32(a,5) 1003 roll $5, %esi # rotl32(a,5)
831 addl %esi, %edx # e += rotl32(a,5) 1004 addl %esi, %edx # e += rotl32(a,5)
832 rorl $2, %eax # b = rotl32(b,30) 1005 rorl $2, %eax # b = rotl32(b,30)
1006 pshufd $0xff, %xmm7, %xmm6
1007# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
1008 movaps %xmm2, %xmm4
1009 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1010# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1011# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1012# same result as above, but shorter and faster:
1013# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1014# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1015 movaps %xmm3, %xmm5
1016 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1017 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1018 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1019 xorps %xmm5, %xmm3 # ^
1020 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1021 movaps %xmm3, %xmm5
1022 xorps %xmm4, %xmm4 # rol(W0,1):
1023 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1024 paddd %xmm3, %xmm3 # shift left by 1
1025 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
1026 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1027 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1028 movaps %xmm5, %xmm4
1029 pslld $2, %xmm5
1030 psrld $30, %xmm4
1031# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
1032 xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
1033 xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1034 movaps %xmm3, %xmm5
1035 paddd %xmm6, %xmm5
1036 movups %xmm5, -64+16*3(%rsp)
833# 52 1037# 52
834 movl %ebp, %edi # di: b 1038 movl %ebp, %edi # di: b
835 movl %ebp, %esi # si: b 1039 movl %ebp, %esi # si: b
@@ -837,14 +1041,8 @@ sha1_process_block64:
837 andl %eax, %esi # si: b & c 1041 andl %eax, %esi # si: b & c
838 andl %ebx, %edi # di: (b | c) & d 1042 andl %ebx, %edi # di: (b | c) & d
839 orl %esi, %edi # ((b | c) & d) | (b & c) 1043 orl %esi, %edi # ((b | c) & d) | (b & c)
840 movl -32+4*1(%rsp), %esi # W[(n+13) & 15]
841 xorl %r12d, %esi # ^W[(n+8) & 15]
842 xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15]
843 xorl -32+4*4(%rsp), %esi # ^W[n & 15]
844 roll %esi #
845 movl %esi, -32+4*4(%rsp) # store to W[n & 15]
846 addl %edi, %ecx # += ((b | c) & d) | (b & c) 1044 addl %edi, %ecx # += ((b | c) & d) | (b & c)
847 leal -0x70E44324(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] 1045 addl -64+4*4(%rsp), %ecx # e += RCONST + W[n & 15]
848 movl %edx, %esi # 1046 movl %edx, %esi #
849 roll $5, %esi # rotl32(a,5) 1047 roll $5, %esi # rotl32(a,5)
850 addl %esi, %ecx # e += rotl32(a,5) 1048 addl %esi, %ecx # e += rotl32(a,5)
@@ -856,14 +1054,8 @@ sha1_process_block64:
856 andl %ebp, %esi # si: b & c 1054 andl %ebp, %esi # si: b & c
857 andl %eax, %edi # di: (b | c) & d 1055 andl %eax, %edi # di: (b | c) & d
858 orl %esi, %edi # ((b | c) & d) | (b & c) 1056 orl %esi, %edi # ((b | c) & d) | (b & c)
859 movl -32+4*2(%rsp), %esi # W[(n+13) & 15]
860 xorl %r13d, %esi # ^W[(n+8) & 15]
861 xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15]
862 xorl -32+4*5(%rsp), %esi # ^W[n & 15]
863 roll %esi #
864 movl %esi, -32+4*5(%rsp) # store to W[n & 15]
865 addl %edi, %ebx # += ((b | c) & d) | (b & c) 1057 addl %edi, %ebx # += ((b | c) & d) | (b & c)
866 leal -0x70E44324(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 1058 addl -64+4*5(%rsp), %ebx # e += RCONST + W[n & 15]
867 movl %ecx, %esi # 1059 movl %ecx, %esi #
868 roll $5, %esi # rotl32(a,5) 1060 roll $5, %esi # rotl32(a,5)
869 addl %esi, %ebx # e += rotl32(a,5) 1061 addl %esi, %ebx # e += rotl32(a,5)
@@ -875,14 +1067,8 @@ sha1_process_block64:
875 andl %edx, %esi # si: b & c 1067 andl %edx, %esi # si: b & c
876 andl %ebp, %edi # di: (b | c) & d 1068 andl %ebp, %edi # di: (b | c) & d
877 orl %esi, %edi # ((b | c) & d) | (b & c) 1069 orl %esi, %edi # ((b | c) & d) | (b & c)
878 movl -32+4*3(%rsp), %esi # W[(n+13) & 15]
879 xorl %r14d, %esi # ^W[(n+8) & 15]
880 xorl %r8d, %esi # ^W[(n+2) & 15]
881 xorl -32+4*6(%rsp), %esi # ^W[n & 15]
882 roll %esi #
883 movl %esi, -32+4*6(%rsp) # store to W[n & 15]
884 addl %edi, %eax # += ((b | c) & d) | (b & c) 1070 addl %edi, %eax # += ((b | c) & d) | (b & c)
885 leal -0x70E44324(%rax,%rsi), %eax # e += RCONST + W[n & 15] 1071 addl -64+4*6(%rsp), %eax # e += RCONST + W[n & 15]
886 movl %ebx, %esi # 1072 movl %ebx, %esi #
887 roll $5, %esi # rotl32(a,5) 1073 roll $5, %esi # rotl32(a,5)
888 addl %esi, %eax # e += rotl32(a,5) 1074 addl %esi, %eax # e += rotl32(a,5)
@@ -894,18 +1080,42 @@ sha1_process_block64:
894 andl %ecx, %esi # si: b & c 1080 andl %ecx, %esi # si: b & c
895 andl %edx, %edi # di: (b | c) & d 1081 andl %edx, %edi # di: (b | c) & d
896 orl %esi, %edi # ((b | c) & d) | (b & c) 1082 orl %esi, %edi # ((b | c) & d) | (b & c)
897 movl -32+4*4(%rsp), %esi # W[(n+13) & 15]
898 xorl %r15d, %esi # ^W[(n+8) & 15]
899 xorl %r9d, %esi # ^W[(n+2) & 15]
900 xorl -32+4*7(%rsp), %esi # ^W[n & 15]
901 roll %esi #
902 movl %esi, -32+4*7(%rsp) # store to W[n & 15]
903 addl %edi, %ebp # += ((b | c) & d) | (b & c) 1083 addl %edi, %ebp # += ((b | c) & d) | (b & c)
904 leal -0x70E44324(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] 1084 addl -64+4*7(%rsp), %ebp # e += RCONST + W[n & 15]
905 movl %eax, %esi # 1085 movl %eax, %esi #
906 roll $5, %esi # rotl32(a,5) 1086 roll $5, %esi # rotl32(a,5)
907 addl %esi, %ebp # e += rotl32(a,5) 1087 addl %esi, %ebp # e += rotl32(a,5)
908 rorl $2, %ebx # b = rotl32(b,30) 1088 rorl $2, %ebx # b = rotl32(b,30)
1089# PREP %xmm0 %xmm1 %xmm2 %xmm3 -64+16*0(%rsp)
1090 movaps %xmm3, %xmm4
1091 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1092# pshufd $0x4e, %xmm0, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1093# punpcklqdq %xmm1, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1094# same result as above, but shorter and faster:
1095# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1096# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1097 movaps %xmm0, %xmm5
1098 shufps $0x4e, %xmm1, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1099 xorps %xmm2, %xmm0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1100 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1101 xorps %xmm5, %xmm0 # ^
1102 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1103 movaps %xmm0, %xmm5
1104 xorps %xmm4, %xmm4 # rol(W0,1):
1105 pcmpgtd %xmm0, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1106 paddd %xmm0, %xmm0 # shift left by 1
1107 psubd %xmm4, %xmm0 # add 1 to those who had msb bit 1
1108 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1109 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1110 movaps %xmm5, %xmm4
1111 pslld $2, %xmm5
1112 psrld $30, %xmm4
1113# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
1114 xorps %xmm4, %xmm0 # same result, but does not depend on/does not modify T2
1115 xorps %xmm5, %xmm0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1116 movaps %xmm0, %xmm5
1117 paddd %xmm6, %xmm5
1118 movups %xmm5, -64+16*0(%rsp)
909# 56 1119# 56
910 movl %eax, %edi # di: b 1120 movl %eax, %edi # di: b
911 movl %eax, %esi # si: b 1121 movl %eax, %esi # si: b
@@ -913,12 +1123,8 @@ sha1_process_block64:
913 andl %ebx, %esi # si: b & c 1123 andl %ebx, %esi # si: b & c
914 andl %ecx, %edi # di: (b | c) & d 1124 andl %ecx, %edi # di: (b | c) & d
915 orl %esi, %edi # ((b | c) & d) | (b & c) 1125 orl %esi, %edi # ((b | c) & d) | (b & c)
916 xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15]
917 xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15]
918 xorl %r10d, %r8d # ^W[(n+2) & 15]
919 roll %r8d #
920 addl %edi, %edx # += ((b | c) & d) | (b & c) 1126 addl %edi, %edx # += ((b | c) & d) | (b & c)
921 leal -0x70E44324(%rdx,%r8), %edx # e += RCONST + W[n & 15] 1127 addl -64+4*8(%rsp), %edx # e += RCONST + W[n & 15]
922 movl %ebp, %esi # 1128 movl %ebp, %esi #
923 roll $5, %esi # rotl32(a,5) 1129 roll $5, %esi # rotl32(a,5)
924 addl %esi, %edx # e += rotl32(a,5) 1130 addl %esi, %edx # e += rotl32(a,5)
@@ -930,12 +1136,8 @@ sha1_process_block64:
930 andl %eax, %esi # si: b & c 1136 andl %eax, %esi # si: b & c
931 andl %ebx, %edi # di: (b | c) & d 1137 andl %ebx, %edi # di: (b | c) & d
932 orl %esi, %edi # ((b | c) & d) | (b & c) 1138 orl %esi, %edi # ((b | c) & d) | (b & c)
933 xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15]
934 xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15]
935 xorl %r11d, %r9d # ^W[(n+2) & 15]
936 roll %r9d #
937 addl %edi, %ecx # += ((b | c) & d) | (b & c) 1139 addl %edi, %ecx # += ((b | c) & d) | (b & c)
938 leal -0x70E44324(%rcx,%r9), %ecx # e += RCONST + W[n & 15] 1140 addl -64+4*9(%rsp), %ecx # e += RCONST + W[n & 15]
939 movl %edx, %esi # 1141 movl %edx, %esi #
940 roll $5, %esi # rotl32(a,5) 1142 roll $5, %esi # rotl32(a,5)
941 addl %esi, %ecx # e += rotl32(a,5) 1143 addl %esi, %ecx # e += rotl32(a,5)
@@ -947,12 +1149,8 @@ sha1_process_block64:
947 andl %ebp, %esi # si: b & c 1149 andl %ebp, %esi # si: b & c
948 andl %eax, %edi # di: (b | c) & d 1150 andl %eax, %edi # di: (b | c) & d
949 orl %esi, %edi # ((b | c) & d) | (b & c) 1151 orl %esi, %edi # ((b | c) & d) | (b & c)
950 xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15]
951 xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15]
952 xorl %r12d, %r10d # ^W[(n+2) & 15]
953 roll %r10d #
954 addl %edi, %ebx # += ((b | c) & d) | (b & c) 1152 addl %edi, %ebx # += ((b | c) & d) | (b & c)
955 leal -0x70E44324(%rbx,%r10), %ebx # e += RCONST + W[n & 15] 1153 addl -64+4*10(%rsp), %ebx # e += RCONST + W[n & 15]
956 movl %ecx, %esi # 1154 movl %ecx, %esi #
957 roll $5, %esi # rotl32(a,5) 1155 roll $5, %esi # rotl32(a,5)
958 addl %esi, %ebx # e += rotl32(a,5) 1156 addl %esi, %ebx # e += rotl32(a,5)
@@ -964,307 +1162,297 @@ sha1_process_block64:
964 andl %edx, %esi # si: b & c 1162 andl %edx, %esi # si: b & c
965 andl %ebp, %edi # di: (b | c) & d 1163 andl %ebp, %edi # di: (b | c) & d
966 orl %esi, %edi # ((b | c) & d) | (b & c) 1164 orl %esi, %edi # ((b | c) & d) | (b & c)
967 xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15]
968 xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15]
969 xorl %r13d, %r11d # ^W[(n+2) & 15]
970 roll %r11d #
971 addl %edi, %eax # += ((b | c) & d) | (b & c) 1165 addl %edi, %eax # += ((b | c) & d) | (b & c)
972 leal -0x70E44324(%rax,%r11), %eax # e += RCONST + W[n & 15] 1166 addl -64+4*11(%rsp), %eax # e += RCONST + W[n & 15]
973 movl %ebx, %esi # 1167 movl %ebx, %esi #
974 roll $5, %esi # rotl32(a,5) 1168 roll $5, %esi # rotl32(a,5)
975 addl %esi, %eax # e += rotl32(a,5) 1169 addl %esi, %eax # e += rotl32(a,5)
976 rorl $2, %ecx # b = rotl32(b,30) 1170 rorl $2, %ecx # b = rotl32(b,30)
1171# PREP %xmm1 %xmm2 %xmm3 %xmm0 -64+16*1(%rsp)
1172 movaps %xmm0, %xmm4
1173 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1174# pshufd $0x4e, %xmm1, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1175# punpcklqdq %xmm2, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1176# same result as above, but shorter and faster:
1177# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1178# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1179 movaps %xmm1, %xmm5
1180 shufps $0x4e, %xmm2, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1181 xorps %xmm3, %xmm1 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1182 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1183 xorps %xmm5, %xmm1 # ^
1184 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1185 movaps %xmm1, %xmm5
1186 xorps %xmm4, %xmm4 # rol(W0,1):
1187 pcmpgtd %xmm1, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1188 paddd %xmm1, %xmm1 # shift left by 1
1189 psubd %xmm4, %xmm1 # add 1 to those who had msb bit 1
1190 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1191 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1192 movaps %xmm5, %xmm4
1193 pslld $2, %xmm5
1194 psrld $30, %xmm4
1195# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
1196 xorps %xmm4, %xmm1 # same result, but does not depend on/does not modify T2
1197 xorps %xmm5, %xmm1 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1198 movaps %xmm1, %xmm5
1199 paddd %xmm6, %xmm5
1200 movups %xmm5, -64+16*1(%rsp)
977# 60 1201# 60
978 xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15]
979 xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15]
980 xorl %r14d, %r12d # ^W[(n+2) & 15]
981 roll %r12d #
982 movl %ecx, %edi # c 1202 movl %ecx, %edi # c
983 xorl %edx, %edi # ^d 1203 xorl %edx, %edi # ^d
984 xorl %ebx, %edi # ^b 1204 xorl %ebx, %edi # ^b
985 leal -0x359D3E2A(%rbp,%r12), %ebp # e += RCONST + W[n & 15] 1205 addl -64+4*12(%rsp), %ebp # e += RCONST + W[n & 15]
986 addl %edi, %ebp # e += (c ^ d ^ b) 1206 addl %edi, %ebp # e += (c ^ d ^ b)
987 movl %eax, %esi # 1207 movl %eax, %esi #
988 roll $5, %esi # rotl32(a,5) 1208 roll $5, %esi # rotl32(a,5)
989 addl %esi, %ebp # e += rotl32(a,5) 1209 addl %esi, %ebp # e += rotl32(a,5)
990 rorl $2, %ebx # b = rotl32(b,30) 1210 rorl $2, %ebx # b = rotl32(b,30)
991# 61 1211# 61
992 xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15]
993 xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15]
994 xorl %r15d, %r13d # ^W[(n+2) & 15]
995 roll %r13d #
996 movl %ebx, %edi # c 1212 movl %ebx, %edi # c
997 xorl %ecx, %edi # ^d 1213 xorl %ecx, %edi # ^d
998 xorl %eax, %edi # ^b 1214 xorl %eax, %edi # ^b
999 leal -0x359D3E2A(%rdx,%r13), %edx # e += RCONST + W[n & 15] 1215 addl -64+4*13(%rsp), %edx # e += RCONST + W[n & 15]
1000 addl %edi, %edx # e += (c ^ d ^ b) 1216 addl %edi, %edx # e += (c ^ d ^ b)
1001 movl %ebp, %esi # 1217 movl %ebp, %esi #
1002 roll $5, %esi # rotl32(a,5) 1218 roll $5, %esi # rotl32(a,5)
1003 addl %esi, %edx # e += rotl32(a,5) 1219 addl %esi, %edx # e += rotl32(a,5)
1004 rorl $2, %eax # b = rotl32(b,30) 1220 rorl $2, %eax # b = rotl32(b,30)
1005# 62 1221# 62
1006 xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15]
1007 xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15]
1008 xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15]
1009 roll %r14d #
1010 movl %eax, %edi # c 1222 movl %eax, %edi # c
1011 xorl %ebx, %edi # ^d 1223 xorl %ebx, %edi # ^d
1012 xorl %ebp, %edi # ^b 1224 xorl %ebp, %edi # ^b
1013 leal -0x359D3E2A(%rcx,%r14), %ecx # e += RCONST + W[n & 15] 1225 addl -64+4*14(%rsp), %ecx # e += RCONST + W[n & 15]
1014 addl %edi, %ecx # e += (c ^ d ^ b) 1226 addl %edi, %ecx # e += (c ^ d ^ b)
1015 movl %edx, %esi # 1227 movl %edx, %esi #
1016 roll $5, %esi # rotl32(a,5) 1228 roll $5, %esi # rotl32(a,5)
1017 addl %esi, %ecx # e += rotl32(a,5) 1229 addl %esi, %ecx # e += rotl32(a,5)
1018 rorl $2, %ebp # b = rotl32(b,30) 1230 rorl $2, %ebp # b = rotl32(b,30)
1019# 63 1231# 63
1020 xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15]
1021 xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15]
1022 xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15]
1023 roll %r15d #
1024 movl %ebp, %edi # c 1232 movl %ebp, %edi # c
1025 xorl %eax, %edi # ^d 1233 xorl %eax, %edi # ^d
1026 xorl %edx, %edi # ^b 1234 xorl %edx, %edi # ^b
1027 leal -0x359D3E2A(%rbx,%r15), %ebx # e += RCONST + W[n & 15] 1235 addl -64+4*15(%rsp), %ebx # e += RCONST + W[n & 15]
1028 addl %edi, %ebx # e += (c ^ d ^ b) 1236 addl %edi, %ebx # e += (c ^ d ^ b)
1029 movl %ecx, %esi # 1237 movl %ecx, %esi #
1030 roll $5, %esi # rotl32(a,5) 1238 roll $5, %esi # rotl32(a,5)
1031 addl %esi, %ebx # e += rotl32(a,5) 1239 addl %esi, %ebx # e += rotl32(a,5)
1032 rorl $2, %edx # b = rotl32(b,30) 1240 rorl $2, %edx # b = rotl32(b,30)
1241# PREP %xmm2 %xmm3 %xmm0 %xmm1 -64+16*2(%rsp)
1242 movaps %xmm1, %xmm4
1243 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1244# pshufd $0x4e, %xmm2, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1245# punpcklqdq %xmm3, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1246# same result as above, but shorter and faster:
1247# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1248# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1249 movaps %xmm2, %xmm5
1250 shufps $0x4e, %xmm3, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1251 xorps %xmm0, %xmm2 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1252 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1253 xorps %xmm5, %xmm2 # ^
1254 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1255 movaps %xmm2, %xmm5
1256 xorps %xmm4, %xmm4 # rol(W0,1):
1257 pcmpgtd %xmm2, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1258 paddd %xmm2, %xmm2 # shift left by 1
1259 psubd %xmm4, %xmm2 # add 1 to those who had msb bit 1
1260 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1261 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1262 movaps %xmm5, %xmm4
1263 pslld $2, %xmm5
1264 psrld $30, %xmm4
1265# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
1266 xorps %xmm4, %xmm2 # same result, but does not depend on/does not modify T2
1267 xorps %xmm5, %xmm2 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1268 movaps %xmm2, %xmm5
1269 paddd %xmm6, %xmm5
1270 movups %xmm5, -64+16*2(%rsp)
1033# 64 1271# 64
1034 movl %r13d, %esi # W[(n+13) & 15]
1035 xorl %r8d, %esi # ^W[(n+8) & 15]
1036 xorl -32+4*2(%rsp), %esi # ^W[(n+2) & 15]
1037 xorl -32+4*0(%rsp), %esi # ^W[n & 15]
1038 roll %esi #
1039 movl %esi, -32+4*0(%rsp) # store to W[n & 15]
1040 movl %edx, %edi # c 1272 movl %edx, %edi # c
1041 xorl %ebp, %edi # ^d 1273 xorl %ebp, %edi # ^d
1042 xorl %ecx, %edi # ^b 1274 xorl %ecx, %edi # ^b
1043 leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] 1275 addl -64+4*0(%rsp), %eax # e += RCONST + W[n & 15]
1044 addl %edi, %eax # e += (c ^ d ^ b) 1276 addl %edi, %eax # e += (c ^ d ^ b)
1045 movl %ebx, %esi # 1277 movl %ebx, %esi #
1046 roll $5, %esi # rotl32(a,5) 1278 roll $5, %esi # rotl32(a,5)
1047 addl %esi, %eax # e += rotl32(a,5) 1279 addl %esi, %eax # e += rotl32(a,5)
1048 rorl $2, %ecx # b = rotl32(b,30) 1280 rorl $2, %ecx # b = rotl32(b,30)
1049# 65 1281# 65
1050 movl %r14d, %esi # W[(n+13) & 15]
1051 xorl %r9d, %esi # ^W[(n+8) & 15]
1052 xorl -32+4*3(%rsp), %esi # ^W[(n+2) & 15]
1053 xorl -32+4*1(%rsp), %esi # ^W[n & 15]
1054 roll %esi #
1055 movl %esi, -32+4*1(%rsp) # store to W[n & 15]
1056 movl %ecx, %edi # c 1282 movl %ecx, %edi # c
1057 xorl %edx, %edi # ^d 1283 xorl %edx, %edi # ^d
1058 xorl %ebx, %edi # ^b 1284 xorl %ebx, %edi # ^b
1059 leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] 1285 addl -64+4*1(%rsp), %ebp # e += RCONST + W[n & 15]
1060 addl %edi, %ebp # e += (c ^ d ^ b) 1286 addl %edi, %ebp # e += (c ^ d ^ b)
1061 movl %eax, %esi # 1287 movl %eax, %esi #
1062 roll $5, %esi # rotl32(a,5) 1288 roll $5, %esi # rotl32(a,5)
1063 addl %esi, %ebp # e += rotl32(a,5) 1289 addl %esi, %ebp # e += rotl32(a,5)
1064 rorl $2, %ebx # b = rotl32(b,30) 1290 rorl $2, %ebx # b = rotl32(b,30)
1065# 66 1291# 66
1066 movl %r15d, %esi # W[(n+13) & 15]
1067 xorl %r10d, %esi # ^W[(n+8) & 15]
1068 xorl -32+4*4(%rsp), %esi # ^W[(n+2) & 15]
1069 xorl -32+4*2(%rsp), %esi # ^W[n & 15]
1070 roll %esi #
1071 movl %esi, -32+4*2(%rsp) # store to W[n & 15]
1072 movl %ebx, %edi # c 1292 movl %ebx, %edi # c
1073 xorl %ecx, %edi # ^d 1293 xorl %ecx, %edi # ^d
1074 xorl %eax, %edi # ^b 1294 xorl %eax, %edi # ^b
1075 leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] 1295 addl -64+4*2(%rsp), %edx # e += RCONST + W[n & 15]
1076 addl %edi, %edx # e += (c ^ d ^ b) 1296 addl %edi, %edx # e += (c ^ d ^ b)
1077 movl %ebp, %esi # 1297 movl %ebp, %esi #
1078 roll $5, %esi # rotl32(a,5) 1298 roll $5, %esi # rotl32(a,5)
1079 addl %esi, %edx # e += rotl32(a,5) 1299 addl %esi, %edx # e += rotl32(a,5)
1080 rorl $2, %eax # b = rotl32(b,30) 1300 rorl $2, %eax # b = rotl32(b,30)
1081# 67 1301# 67
1082 movl -32+4*0(%rsp), %esi # W[(n+13) & 15]
1083 xorl %r11d, %esi # ^W[(n+8) & 15]
1084 xorl -32+4*5(%rsp), %esi # ^W[(n+2) & 15]
1085 xorl -32+4*3(%rsp), %esi # ^W[n & 15]
1086 roll %esi #
1087 movl %esi, -32+4*3(%rsp) # store to W[n & 15]
1088 movl %eax, %edi # c 1302 movl %eax, %edi # c
1089 xorl %ebx, %edi # ^d 1303 xorl %ebx, %edi # ^d
1090 xorl %ebp, %edi # ^b 1304 xorl %ebp, %edi # ^b
1091 leal -0x359D3E2A(%rcx,%rsi), %ecx # e += RCONST + W[n & 15] 1305 addl -64+4*3(%rsp), %ecx # e += RCONST + W[n & 15]
1092 addl %edi, %ecx # e += (c ^ d ^ b) 1306 addl %edi, %ecx # e += (c ^ d ^ b)
1093 movl %edx, %esi # 1307 movl %edx, %esi #
1094 roll $5, %esi # rotl32(a,5) 1308 roll $5, %esi # rotl32(a,5)
1095 addl %esi, %ecx # e += rotl32(a,5) 1309 addl %esi, %ecx # e += rotl32(a,5)
1096 rorl $2, %ebp # b = rotl32(b,30) 1310 rorl $2, %ebp # b = rotl32(b,30)
1311# PREP %xmm3 %xmm0 %xmm1 %xmm2 -64+16*3(%rsp)
1312 movaps %xmm2, %xmm4
1313 psrldq $4, %xmm4 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
1314# pshufd $0x4e, %xmm3, %xmm5 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
1315# punpcklqdq %xmm0, %xmm5 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
1316# same result as above, but shorter and faster:
1317# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
1318# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
1319 movaps %xmm3, %xmm5
1320 shufps $0x4e, %xmm0, %xmm5 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
1321 xorps %xmm1, %xmm3 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
1322 xorps %xmm4, %xmm5 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
1323 xorps %xmm5, %xmm3 # ^
1324 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
1325 movaps %xmm3, %xmm5
1326 xorps %xmm4, %xmm4 # rol(W0,1):
1327 pcmpgtd %xmm3, %xmm4 # ffffffff for elements <0 (ones with msb bit 1)
1328 paddd %xmm3, %xmm3 # shift left by 1
1329 psubd %xmm4, %xmm3 # add 1 to those who had msb bit 1
1330 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
1331 pslldq $12, %xmm5 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
1332 movaps %xmm5, %xmm4
1333 pslld $2, %xmm5
1334 psrld $30, %xmm4
1335# xorps %xmm4, %xmm5 # rol((0,0,0,unrotW[0]),2)
1336 xorps %xmm4, %xmm3 # same result, but does not depend on/does not modify T2
1337 xorps %xmm5, %xmm3 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
1338 movaps %xmm3, %xmm5
1339 paddd %xmm6, %xmm5
1340 movups %xmm5, -64+16*3(%rsp)
1097# 68 1341# 68
1098 movl -32+4*1(%rsp), %esi # W[(n+13) & 15]
1099 xorl %r12d, %esi # ^W[(n+8) & 15]
1100 xorl -32+4*6(%rsp), %esi # ^W[(n+2) & 15]
1101 xorl -32+4*4(%rsp), %esi # ^W[n & 15]
1102 roll %esi #
1103 movl %esi, -32+4*4(%rsp) # store to W[n & 15]
1104 movl %ebp, %edi # c 1342 movl %ebp, %edi # c
1105 xorl %eax, %edi # ^d 1343 xorl %eax, %edi # ^d
1106 xorl %edx, %edi # ^b 1344 xorl %edx, %edi # ^b
1107 leal -0x359D3E2A(%rbx,%rsi), %ebx # e += RCONST + W[n & 15] 1345 addl -64+4*4(%rsp), %ebx # e += RCONST + W[n & 15]
1108 addl %edi, %ebx # e += (c ^ d ^ b) 1346 addl %edi, %ebx # e += (c ^ d ^ b)
1109 movl %ecx, %esi # 1347 movl %ecx, %esi #
1110 roll $5, %esi # rotl32(a,5) 1348 roll $5, %esi # rotl32(a,5)
1111 addl %esi, %ebx # e += rotl32(a,5) 1349 addl %esi, %ebx # e += rotl32(a,5)
1112 rorl $2, %edx # b = rotl32(b,30) 1350 rorl $2, %edx # b = rotl32(b,30)
1113# 69 1351# 69
1114 movl -32+4*2(%rsp), %esi # W[(n+13) & 15]
1115 xorl %r13d, %esi # ^W[(n+8) & 15]
1116 xorl -32+4*7(%rsp), %esi # ^W[(n+2) & 15]
1117 xorl -32+4*5(%rsp), %esi # ^W[n & 15]
1118 roll %esi #
1119 movl %esi, -32+4*5(%rsp) # store to W[n & 15]
1120 movl %edx, %edi # c 1352 movl %edx, %edi # c
1121 xorl %ebp, %edi # ^d 1353 xorl %ebp, %edi # ^d
1122 xorl %ecx, %edi # ^b 1354 xorl %ecx, %edi # ^b
1123 leal -0x359D3E2A(%rax,%rsi), %eax # e += RCONST + W[n & 15] 1355 addl -64+4*5(%rsp), %eax # e += RCONST + W[n & 15]
1124 addl %edi, %eax # e += (c ^ d ^ b) 1356 addl %edi, %eax # e += (c ^ d ^ b)
1125 movl %ebx, %esi # 1357 movl %ebx, %esi #
1126 roll $5, %esi # rotl32(a,5) 1358 roll $5, %esi # rotl32(a,5)
1127 addl %esi, %eax # e += rotl32(a,5) 1359 addl %esi, %eax # e += rotl32(a,5)
1128 rorl $2, %ecx # b = rotl32(b,30) 1360 rorl $2, %ecx # b = rotl32(b,30)
1129# 70 1361# 70
1130 movl -32+4*3(%rsp), %esi # W[(n+13) & 15]
1131 xorl %r14d, %esi # ^W[(n+8) & 15]
1132 xorl %r8d, %esi # ^W[(n+2) & 15]
1133 xorl -32+4*6(%rsp), %esi # ^W[n & 15]
1134 roll %esi #
1135 movl %esi, -32+4*6(%rsp) # store to W[n & 15]
1136 movl %ecx, %edi # c 1362 movl %ecx, %edi # c
1137 xorl %edx, %edi # ^d 1363 xorl %edx, %edi # ^d
1138 xorl %ebx, %edi # ^b 1364 xorl %ebx, %edi # ^b
1139 leal -0x359D3E2A(%rbp,%rsi), %ebp # e += RCONST + W[n & 15] 1365 addl -64+4*6(%rsp), %ebp # e += RCONST + W[n & 15]
1140 addl %edi, %ebp # e += (c ^ d ^ b) 1366 addl %edi, %ebp # e += (c ^ d ^ b)
1141 movl %eax, %esi # 1367 movl %eax, %esi #
1142 roll $5, %esi # rotl32(a,5) 1368 roll $5, %esi # rotl32(a,5)
1143 addl %esi, %ebp # e += rotl32(a,5) 1369 addl %esi, %ebp # e += rotl32(a,5)
1144 rorl $2, %ebx # b = rotl32(b,30) 1370 rorl $2, %ebx # b = rotl32(b,30)
1145# 71 1371# 71
1146 movl -32+4*4(%rsp), %esi # W[(n+13) & 15]
1147 xorl %r15d, %esi # ^W[(n+8) & 15]
1148 xorl %r9d, %esi # ^W[(n+2) & 15]
1149 xorl -32+4*7(%rsp), %esi # ^W[n & 15]
1150 roll %esi #
1151 movl %esi, -32+4*7(%rsp) # store to W[n & 15]
1152 movl %ebx, %edi # c 1372 movl %ebx, %edi # c
1153 xorl %ecx, %edi # ^d 1373 xorl %ecx, %edi # ^d
1154 xorl %eax, %edi # ^b 1374 xorl %eax, %edi # ^b
1155 leal -0x359D3E2A(%rdx,%rsi), %edx # e += RCONST + W[n & 15] 1375 addl -64+4*7(%rsp), %edx # e += RCONST + W[n & 15]
1156 addl %edi, %edx # e += (c ^ d ^ b) 1376 addl %edi, %edx # e += (c ^ d ^ b)
1157 movl %ebp, %esi # 1377 movl %ebp, %esi #
1158 roll $5, %esi # rotl32(a,5) 1378 roll $5, %esi # rotl32(a,5)
1159 addl %esi, %edx # e += rotl32(a,5) 1379 addl %esi, %edx # e += rotl32(a,5)
1160 rorl $2, %eax # b = rotl32(b,30) 1380 rorl $2, %eax # b = rotl32(b,30)
1161# 72 1381# 72
1162 xorl -32+4*5(%rsp), %r8d # W[n & 15] ^= W[(n+13) & 15]
1163 xorl -32+4*0(%rsp), %r8d # ^W[(n+8) & 15]
1164 xorl %r10d, %r8d # ^W[(n+2) & 15]
1165 roll %r8d #
1166 movl %eax, %edi # c 1382 movl %eax, %edi # c
1167 xorl %ebx, %edi # ^d 1383 xorl %ebx, %edi # ^d
1168 xorl %ebp, %edi # ^b 1384 xorl %ebp, %edi # ^b
1169 leal -0x359D3E2A(%rcx,%r8), %ecx # e += RCONST + W[n & 15] 1385 addl -64+4*8(%rsp), %ecx # e += RCONST + W[n & 15]
1170 addl %edi, %ecx # e += (c ^ d ^ b) 1386 addl %edi, %ecx # e += (c ^ d ^ b)
1171 movl %edx, %esi # 1387 movl %edx, %esi #
1172 roll $5, %esi # rotl32(a,5) 1388 roll $5, %esi # rotl32(a,5)
1173 addl %esi, %ecx # e += rotl32(a,5) 1389 addl %esi, %ecx # e += rotl32(a,5)
1174 rorl $2, %ebp # b = rotl32(b,30) 1390 rorl $2, %ebp # b = rotl32(b,30)
1175# 73 1391# 73
1176 xorl -32+4*6(%rsp), %r9d # W[n & 15] ^= W[(n+13) & 15]
1177 xorl -32+4*1(%rsp), %r9d # ^W[(n+8) & 15]
1178 xorl %r11d, %r9d # ^W[(n+2) & 15]
1179 roll %r9d #
1180 movl %ebp, %edi # c 1392 movl %ebp, %edi # c
1181 xorl %eax, %edi # ^d 1393 xorl %eax, %edi # ^d
1182 xorl %edx, %edi # ^b 1394 xorl %edx, %edi # ^b
1183 leal -0x359D3E2A(%rbx,%r9), %ebx # e += RCONST + W[n & 15] 1395 addl -64+4*9(%rsp), %ebx # e += RCONST + W[n & 15]
1184 addl %edi, %ebx # e += (c ^ d ^ b) 1396 addl %edi, %ebx # e += (c ^ d ^ b)
1185 movl %ecx, %esi # 1397 movl %ecx, %esi #
1186 roll $5, %esi # rotl32(a,5) 1398 roll $5, %esi # rotl32(a,5)
1187 addl %esi, %ebx # e += rotl32(a,5) 1399 addl %esi, %ebx # e += rotl32(a,5)
1188 rorl $2, %edx # b = rotl32(b,30) 1400 rorl $2, %edx # b = rotl32(b,30)
1189# 74 1401# 74
1190 xorl -32+4*7(%rsp), %r10d # W[n & 15] ^= W[(n+13) & 15]
1191 xorl -32+4*2(%rsp), %r10d # ^W[(n+8) & 15]
1192 xorl %r12d, %r10d # ^W[(n+2) & 15]
1193 roll %r10d #
1194 movl %edx, %edi # c 1402 movl %edx, %edi # c
1195 xorl %ebp, %edi # ^d 1403 xorl %ebp, %edi # ^d
1196 xorl %ecx, %edi # ^b 1404 xorl %ecx, %edi # ^b
1197 leal -0x359D3E2A(%rax,%r10), %eax # e += RCONST + W[n & 15] 1405 addl -64+4*10(%rsp), %eax # e += RCONST + W[n & 15]
1198 addl %edi, %eax # e += (c ^ d ^ b) 1406 addl %edi, %eax # e += (c ^ d ^ b)
1199 movl %ebx, %esi # 1407 movl %ebx, %esi #
1200 roll $5, %esi # rotl32(a,5) 1408 roll $5, %esi # rotl32(a,5)
1201 addl %esi, %eax # e += rotl32(a,5) 1409 addl %esi, %eax # e += rotl32(a,5)
1202 rorl $2, %ecx # b = rotl32(b,30) 1410 rorl $2, %ecx # b = rotl32(b,30)
1203# 75 1411# 75
1204 xorl %r8d, %r11d # W[n & 15] ^= W[(n+13) & 15]
1205 xorl -32+4*3(%rsp), %r11d # ^W[(n+8) & 15]
1206 xorl %r13d, %r11d # ^W[(n+2) & 15]
1207 roll %r11d #
1208 movl %ecx, %edi # c 1412 movl %ecx, %edi # c
1209 xorl %edx, %edi # ^d 1413 xorl %edx, %edi # ^d
1210 xorl %ebx, %edi # ^b 1414 xorl %ebx, %edi # ^b
1211 leal -0x359D3E2A(%rbp,%r11), %ebp # e += RCONST + W[n & 15] 1415 addl -64+4*11(%rsp), %ebp # e += RCONST + W[n & 15]
1212 addl %edi, %ebp # e += (c ^ d ^ b) 1416 addl %edi, %ebp # e += (c ^ d ^ b)
1213 movl %eax, %esi # 1417 movl %eax, %esi #
1214 roll $5, %esi # rotl32(a,5) 1418 roll $5, %esi # rotl32(a,5)
1215 addl %esi, %ebp # e += rotl32(a,5) 1419 addl %esi, %ebp # e += rotl32(a,5)
1216 rorl $2, %ebx # b = rotl32(b,30) 1420 rorl $2, %ebx # b = rotl32(b,30)
1217# 76 1421# 76
1218 xorl %r9d, %r12d # W[n & 15] ^= W[(n+13) & 15]
1219 xorl -32+4*4(%rsp), %r12d # ^W[(n+8) & 15]
1220 xorl %r14d, %r12d # ^W[(n+2) & 15]
1221 roll %r12d #
1222 movl %ebx, %edi # c 1422 movl %ebx, %edi # c
1223 xorl %ecx, %edi # ^d 1423 xorl %ecx, %edi # ^d
1224 xorl %eax, %edi # ^b 1424 xorl %eax, %edi # ^b
1225 leal -0x359D3E2A(%rdx,%r12), %edx # e += RCONST + W[n & 15] 1425 addl -64+4*12(%rsp), %edx # e += RCONST + W[n & 15]
1226 addl %edi, %edx # e += (c ^ d ^ b) 1426 addl %edi, %edx # e += (c ^ d ^ b)
1227 movl %ebp, %esi # 1427 movl %ebp, %esi #
1228 roll $5, %esi # rotl32(a,5) 1428 roll $5, %esi # rotl32(a,5)
1229 addl %esi, %edx # e += rotl32(a,5) 1429 addl %esi, %edx # e += rotl32(a,5)
1230 rorl $2, %eax # b = rotl32(b,30) 1430 rorl $2, %eax # b = rotl32(b,30)
1231# 77 1431# 77
1232 xorl %r10d, %r13d # W[n & 15] ^= W[(n+13) & 15]
1233 xorl -32+4*5(%rsp), %r13d # ^W[(n+8) & 15]
1234 xorl %r15d, %r13d # ^W[(n+2) & 15]
1235 roll %r13d #
1236 movl %eax, %edi # c 1432 movl %eax, %edi # c
1237 xorl %ebx, %edi # ^d 1433 xorl %ebx, %edi # ^d
1238 xorl %ebp, %edi # ^b 1434 xorl %ebp, %edi # ^b
1239 leal -0x359D3E2A(%rcx,%r13), %ecx # e += RCONST + W[n & 15] 1435 addl -64+4*13(%rsp), %ecx # e += RCONST + W[n & 15]
1240 addl %edi, %ecx # e += (c ^ d ^ b) 1436 addl %edi, %ecx # e += (c ^ d ^ b)
1241 movl %edx, %esi # 1437 movl %edx, %esi #
1242 roll $5, %esi # rotl32(a,5) 1438 roll $5, %esi # rotl32(a,5)
1243 addl %esi, %ecx # e += rotl32(a,5) 1439 addl %esi, %ecx # e += rotl32(a,5)
1244 rorl $2, %ebp # b = rotl32(b,30) 1440 rorl $2, %ebp # b = rotl32(b,30)
1245# 78 1441# 78
1246 xorl %r11d, %r14d # W[n & 15] ^= W[(n+13) & 15]
1247 xorl -32+4*6(%rsp), %r14d # ^W[(n+8) & 15]
1248 xorl -32+4*0(%rsp), %r14d # ^W[(n+2) & 15]
1249 roll %r14d #
1250 movl %ebp, %edi # c 1442 movl %ebp, %edi # c
1251 xorl %eax, %edi # ^d 1443 xorl %eax, %edi # ^d
1252 xorl %edx, %edi # ^b 1444 xorl %edx, %edi # ^b
1253 leal -0x359D3E2A(%rbx,%r14), %ebx # e += RCONST + W[n & 15] 1445 addl -64+4*14(%rsp), %ebx # e += RCONST + W[n & 15]
1254 addl %edi, %ebx # e += (c ^ d ^ b) 1446 addl %edi, %ebx # e += (c ^ d ^ b)
1255 movl %ecx, %esi # 1447 movl %ecx, %esi #
1256 roll $5, %esi # rotl32(a,5) 1448 roll $5, %esi # rotl32(a,5)
1257 addl %esi, %ebx # e += rotl32(a,5) 1449 addl %esi, %ebx # e += rotl32(a,5)
1258 rorl $2, %edx # b = rotl32(b,30) 1450 rorl $2, %edx # b = rotl32(b,30)
1259# 79 1451# 79
1260 xorl %r12d, %r15d # W[n & 15] ^= W[(n+13) & 15]
1261 xorl -32+4*7(%rsp), %r15d # ^W[(n+8) & 15]
1262 xorl -32+4*1(%rsp), %r15d # ^W[(n+2) & 15]
1263 roll %r15d #
1264 movl %edx, %edi # c 1452 movl %edx, %edi # c
1265 xorl %ebp, %edi # ^d 1453 xorl %ebp, %edi # ^d
1266 xorl %ecx, %edi # ^b 1454 xorl %ecx, %edi # ^b
1267 leal -0x359D3E2A(%rax,%r15), %eax # e += RCONST + W[n & 15] 1455 addl -64+4*15(%rsp), %eax # e += RCONST + W[n & 15]
1268 addl %edi, %eax # e += (c ^ d ^ b) 1456 addl %edi, %eax # e += (c ^ d ^ b)
1269 movl %ebx, %esi # 1457 movl %ebx, %esi #
1270 roll $5, %esi # rotl32(a,5) 1458 roll $5, %esi # rotl32(a,5)
@@ -1278,7 +1466,7 @@ sha1_process_block64:
1278 addl %ebx, 84(%rdi) # ctx->hash[1] += b 1466 addl %ebx, 84(%rdi) # ctx->hash[1] += b
1279 popq %r14 # 1467 popq %r14 #
1280 addl %ecx, 88(%rdi) # ctx->hash[2] += c 1468 addl %ecx, 88(%rdi) # ctx->hash[2] += c
1281 popq %r15 # 1469# popq %r15 #
1282 addl %edx, 92(%rdi) # ctx->hash[3] += d 1470 addl %edx, 92(%rdi) # ctx->hash[3] += d
1283 popq %rbx # 1471 popq %rbx #
1284 addl %ebp, 96(%rdi) # ctx->hash[4] += e 1472 addl %ebp, 96(%rdi) # ctx->hash[4] += e
@@ -1286,4 +1474,13 @@ sha1_process_block64:
1286 1474
1287 ret 1475 ret
1288 .size sha1_process_block64, .-sha1_process_block64 1476 .size sha1_process_block64, .-sha1_process_block64
1477
1478 .section .rodata.cst16.sha1const, "aM", @progbits, 16
1479 .balign 16
1480sha1const:
1481 .long 0x5A827999
1482 .long 0x6ED9EBA1
1483 .long 0x8F1BBCDC
1484 .long 0xCA62C1D6
1485
1289#endif 1486#endif
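The PREP blocks above compute the SHA-1 message schedule four words at a time. As a reference only (this sketch is not part of the commit, the function names are mine, and the paddd of the round constant plus the movups spill to the stack are omitted), the same step can be written with SSE2 intrinsics in C, with lane 0 holding the lowest-numbered word as in the generated code:

#include <xmmintrin.h>   /* SSE:  _mm_shuffle_ps, _MM_SHUFFLE */
#include <emmintrin.h>   /* SSE2: integer vector ops */

/* Rotate each 32-bit lane left by one: SSE2 has no per-lane rotate, so the
 * sign bit is extracted with a compare and added back after doubling,
 * exactly as the pcmpgtd/paddd/psubd triple above does. */
static __m128i rol1_epi32(__m128i w)
{
    __m128i msb = _mm_cmpgt_epi32(_mm_setzero_si128(), w);  /* -1 where msb is set */
    w = _mm_add_epi32(w, w);                                /* lane << 1 */
    return _mm_sub_epi32(w, msb);                           /* +1 where msb was set */
}

/* One schedule step: from W[0..3], W[4..7], W[8..11], W[12..15]
 * (lane 0 = lowest index) produce the next four words W[16..19]. */
static __m128i sha1_prep_sketch(__m128i w0, __m128i w4, __m128i w8, __m128i w12)
{
    __m128i t1 = _mm_srli_si128(w12, 4);                    /* ([13],[14],[15],0) */
    __m128i t2 = _mm_castps_si128(_mm_shuffle_ps(           /* ([2],[3],[4],[5])  */
        _mm_castsi128_ps(w0), _mm_castsi128_ps(w4), _MM_SHUFFLE(1, 0, 3, 2)));
    __m128i unrot = _mm_xor_si128(_mm_xor_si128(w0, w8), _mm_xor_si128(t1, t2));
    __m128i rot = rol1_epi32(unrot);
    /* W[3] fixup: xor in rol(unrotated W[0], 2), moved into lane 3 */
    __m128i fix = _mm_slli_si128(unrot, 12);                /* (0,0,0,unrotW[0])  */
    fix = _mm_or_si128(_mm_slli_epi32(fix, 2), _mm_srli_epi32(fix, 30));
    return _mm_xor_si128(rot, fix);
}

The shufps in the real code (and _mm_shuffle_ps here) is the shorter substitute for the pshufd+punpcklqdq pair discussed in the comments: it takes two dwords from each operand, which is exactly what is needed to form ([2],[3],[4],[5]).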
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 901896e6e..a10ac411d 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -6,33 +6,104 @@
6# also contains the diff of the generated file. 6# also contains the diff of the generated file.
7exec >hash_md5_sha_x86-64.S 7exec >hash_md5_sha_x86-64.S
8 8
9# There is a way to use XMM registers (which always exist for x86-64!) for W[] 9# Based on http://arctic.org/~dean/crypto/sha1.html.
10# For example, if we load W as follows: 10# ("This SHA1 implementation is public domain.")
11# %xmm0: w[0x0] w[0x1] w[0x2] w[0x3] 11#
12# %xmm4: w[0x4] w[0x5] w[0x6] w[0x7] 12# x86-64 has at least SSE2 vector insns always available.
13# %xmm8: w[0x8] w[0x9] w[0xa] w[0xb] 13# We can use them without any CPUID checks (and without a need
14# %xmm12: w[0xc] w[0xd] w[0xe] w[0xf] 14# for a fallback code if needed insns are not available).
15# then the xor'ing operation to generate next W[0..3] is: 15# This code uses them to calculate W[] ahead of time.
16# movaps %xmm0, %xmmT2 16#
17# palignr $0x8, %xmm4, %xmmT2 # form (w[0x2],w[0x3],w[0x4],w[0x5]) 17# Unfortunately, results are passed from vector unit to
18# # Right-shifts xmm4:xmmT2 by 8 bytes. Writes shifted result to xmmT2. SSSE3 insn. 18# integer ALUs on the stack. MOVD/Q insns to move them directly
19# movaps %xmm0, %xmmT13 19# from vector to integer registers are slower than store-to-load
20# palignr $0x4,%xmm0,%xmmT13 # form (w[0xd],w[0xe],w[0xf],w[0x0]) 20# forwarding in LSU (on Skylake at least).
21# xmm0 = xmm0 ^ t2 ^ xmm8 ^ t13 21#
22# xmm0 = rol32(xmm0,1) # no such insn, have to use pslld+psrld+or 22# The win against a purely integer code is small on Skylake,
23# and then results can be extracted for use: 23# only about 7-8%. We offload about 1/3 of our operations to the vector unit.
24# movd %xmm0, %esi # new W[0] 24# It can do 4 ops at once in one 128-bit register,
25# pextrd $1, %xmm0, %esi # new W[1] 25# but we have to use x2 of them because of W[0] complication,
26# # SSE4.1 insn. Can use EXTRACTPS (also SSE4.1) 26# SSE2 has no "rotate each word by N bits" insns,
27# pextrd $2, %xmm0, %esi # new W[2] 27# moving data to/from vector unit is clunky, and Skylake
28# pextrd $3, %xmm0, %esi # new W[3] 28# has four integer ALUs unified with three vector ALUs,
29# ... but this requires SSE4.1 and SSSE3, which are not universally available on x86-64. 29# which makes pure integer code rather fast, and makes
30# vector ops compete with integer ones.
31#
32# Zen3, with its separate vector ALUs, wins more, about 12%.
33
34xmmT1="%xmm4"
35xmmT2="%xmm5"
36xmmRCONST="%xmm6"
37xmmALLRCONST="%xmm7"
38T=`printf '\t'`
39
40# SSE instructions are longer than 4 bytes on average.
41# Intel CPUs (up to Tiger Lake at least) can't decode
42# more than 16 bytes of code in one cycle.
43# By interleaving SSE code and integer code
44# we mostly achieve a situation where 16-byte decode fetch window
45# contains 4 (or more) insns.
46#
47# However. On Skylake, there was no observed difference,
48# but on Zen3, non-interleaved code is ~3% faster
49# (822 Mb/s versus 795 Mb/s hashing speed).
50# Off for now:
51interleave=false
52
53INTERLEAVE() {
54 $interleave || \
55 {
56 # Generate non-interleaved code
57 # (it should work correctly too)
58 echo "$1"
59 echo "$2"
60 return
61 }
62 (
63 echo "$1" | grep -v '^$' >"$0.temp1"
64 echo "$2" | grep -v '^$' >"$0.temp2"
65 exec 3<"$0.temp1"
66 exec 4<"$0.temp2"
67 IFS=''
68 while :; do
69 line1=''
70 line2=''
71 while :; do
72 read -r line1 <&3
73 if test "${line1:0:1}" != "#" && test "${line1:0:2}" != "$T#"; then
74 break
75 fi
76 echo "$line1"
77 done
78 while :; do
79 read -r line2 <&4
80 if test "${line2:0:4}" = "${T}lea"; then
81 # We use 7-8 byte long forms of LEA.
82 # Do not interleave them with SSE insns
83 # which are also long.
84 echo "$line2"
85 read -r line2 <&4
86 echo "$line2"
87 continue
88 fi
89 if test "${line2:0:1}" != "#" && test "${line2:0:2}" != "$T#"; then
90 break
91 fi
92 echo "$line2"
93 done
94 test "$line1$line2" || break
95 echo "$line1"
96 echo "$line2"
97 done
98 rm "$0.temp1" "$0.temp2"
99 )
100}
30 101
31echo \ 102echo \
32'### Generated by hash_md5_sha_x86-64.S.sh ### 103"### Generated by hash_md5_sha_x86-64.S.sh ###
33 104
34#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__) 105#if CONFIG_SHA1_SMALL == 0 && defined(__GNUC__) && defined(__x86_64__)
35 .section .text.sha1_process_block64,"ax",@progbits 106 .section .text.sha1_process_block64, \"ax\", @progbits
36 .globl sha1_process_block64 107 .globl sha1_process_block64
37 .hidden sha1_process_block64 108 .hidden sha1_process_block64
38 .type sha1_process_block64, @function 109 .type sha1_process_block64, @function
@@ -41,7 +112,7 @@ echo \
41sha1_process_block64: 112sha1_process_block64:
42 pushq %rbp # 1 byte insn 113 pushq %rbp # 1 byte insn
43 pushq %rbx # 1 byte insn 114 pushq %rbx # 1 byte insn
44 pushq %r15 # 2 byte insn 115# pushq %r15 # 2 byte insn
45 pushq %r14 # 2 byte insn 116 pushq %r14 # 2 byte insn
46 pushq %r13 # 2 byte insn 117 pushq %r13 # 2 byte insn
47 pushq %r12 # 2 byte insn 118 pushq %r12 # 2 byte insn
@@ -50,17 +121,13 @@ sha1_process_block64:
50#Register and stack use: 121#Register and stack use:
51# eax..edx: a..d 122# eax..edx: a..d
52# ebp: e 123# ebp: e
53# esi,edi: temps 124# esi,edi,r8..r14: temps
54# -32+4*n(%rsp),r8...r15: W[0..7,8..15] 125# r15: unused
55# (TODO: actually W[0..7] are used a bit more often, put _them_ into r8..r15?) 126# xmm0..xmm3: W[]
56 movl $3, %eax 127# xmm4,xmm5: temps
571: 128# xmm6: current round constant
58 movq (%rdi,%rax,8), %rsi 129# xmm7: all round constants
59 bswapq %rsi 130# -64(%rsp): area for passing RCONST + W[] from vector to integer units
60 rolq $32, %rsi
61 movq %rsi, -32(%rsp,%rax,8)
62 decl %eax
63 jns 1b
64 131
65 movl 80(%rdi), %eax # a = ctx->hash[0] 132 movl 80(%rdi), %eax # a = ctx->hash[0]
66 movl 84(%rdi), %ebx # b = ctx->hash[1] 133 movl 84(%rdi), %ebx # b = ctx->hash[1]
@@ -68,32 +135,123 @@ sha1_process_block64:
68 movl 92(%rdi), %edx # d = ctx->hash[3] 135 movl 92(%rdi), %edx # d = ctx->hash[3]
69 movl 96(%rdi), %ebp # e = ctx->hash[4] 136 movl 96(%rdi), %ebp # e = ctx->hash[4]
70 137
71 movq 4*8(%rdi), %r8 138 movaps sha1const(%rip), $xmmALLRCONST
72 movq 4*10(%rdi), %r10 139 pshufd \$0x00, $xmmALLRCONST, $xmmRCONST
140
141 # Load W[] to xmm registers, byteswapping on the fly.
142 #
143 # For iterations 0..15, we pass W[] in rsi,r8..r14
144 # for use in RD1As instead of spilling them to stack.
145 # We lose parallelized addition of RCONST, but LEA
146 # can do two additions at once, so it is probably a wash.
147 # (We use rsi instead of rN because this makes two
148 # LEAs in two first RD1As shorter by one byte).
149 movq 4*0(%rdi), %rsi
150 movq 4*2(%rdi), %r8
151 bswapq %rsi
73 bswapq %r8 152 bswapq %r8
153 rolq \$32, %rsi # rsi = W[1]:W[0]
154 rolq \$32, %r8 # r8 = W[3]:W[2]
155 movq %rsi, %xmm0
156 movq %r8, $xmmT1
157 punpcklqdq $xmmT1, %xmm0 # xmm0 = r8:rsi = (W[0],W[1],W[2],W[3])
158# movaps %xmm0, $xmmT1 # add RCONST, spill to stack
159# paddd $xmmRCONST, $xmmT1
160# movups $xmmT1, -64+16*0(%rsp)
161
162 movq 4*4(%rdi), %r9
163 movq 4*6(%rdi), %r10
164 bswapq %r9
74 bswapq %r10 165 bswapq %r10
75 movq 4*12(%rdi), %r12 166 rolq \$32, %r9 # r9 = W[5]:W[4]
76 movq 4*14(%rdi), %r14 167 rolq \$32, %r10 # r10 = W[7]:W[6]
168 movq %r9, %xmm1
169 movq %r10, $xmmT1
170 punpcklqdq $xmmT1, %xmm1 # xmm1 = r10:r9 = (W[4],W[5],W[6],W[7])
171
172 movq 4*8(%rdi), %r11
173 movq 4*10(%rdi), %r12
174 bswapq %r11
77 bswapq %r12 175 bswapq %r12
176 rolq \$32, %r11 # r11 = W[9]:W[8]
177 rolq \$32, %r12 # r12 = W[11]:W[10]
178 movq %r11, %xmm2
179 movq %r12, $xmmT1
180 punpcklqdq $xmmT1, %xmm2 # xmm2 = r12:r11 = (W[8],W[9],W[10],W[11])
181
182 movq 4*12(%rdi), %r13
183 movq 4*14(%rdi), %r14
184 bswapq %r13
78 bswapq %r14 185 bswapq %r14
79 movl %r8d, %r9d 186 rolq \$32, %r13 # r13 = W[13]:W[12]
80 shrq $32, %r8 187 rolq \$32, %r14 # r14 = W[15]:W[14]
81 movl %r10d, %r11d 188 movq %r13, %xmm3
82 shrq $32, %r10 189 movq %r14, $xmmT1
83 movl %r12d, %r13d 190 punpcklqdq $xmmT1, %xmm3 # xmm3 = r14:r13 = (W[12],W[13],W[14],W[15])
84 shrq $32, %r12 191"
85 movl %r14d, %r15d 192
86 shrq $32, %r14 193PREP() {
87' 194local xmmW0=$1
88W32() { 195local xmmW4=$2
89test "$1" || exit 1 196local xmmW8=$3
90test "$1" -lt 0 && exit 1 197local xmmW12=$4
91test "$1" -gt 15 && exit 1 198# the above must be %xmm0..3 in some permutation
92test "$1" -lt 8 && echo "-32+4*$1(%rsp)" 199local dstmem=$5
93test "$1" -ge 8 && echo "%r${1}d" 200#W[0] = rol(W[13] ^ W[8] ^ W[2] ^ W[0], 1);
201#W[1] = rol(W[14] ^ W[9] ^ W[3] ^ W[1], 1);
202#W[2] = rol(W[15] ^ W[10] ^ W[4] ^ W[2], 1);
203#W[3] = rol( 0 ^ W[11] ^ W[5] ^ W[3], 1);
204#W[3] ^= rol(W[0], 1);
205echo "# PREP $@
206 movaps $xmmW12, $xmmT1
207 psrldq \$4, $xmmT1 # rshift by 4 bytes: T1 = ([13],[14],[15],0)
208
209# pshufd \$0x4e, $xmmW0, $xmmT2 # 01001110=2,3,0,1 shuffle, ([2],[3],x,x)
210# punpcklqdq $xmmW4, $xmmT2 # T2 = W4[0..63]:T2[0..63] = ([2],[3],[4],[5])
211# same result as above, but shorter and faster:
212# pshufd/shufps are subtly different: pshufd takes all dwords from source operand,
213# shufps takes dwords 0,1 from *2nd* operand, and dwords 2,3 from 1st one!
214 movaps $xmmW0, $xmmT2
215 shufps \$0x4e, $xmmW4, $xmmT2 # 01001110=(T2.dw[2], T2.dw[3], W4.dw[0], W4.dw[1]) = ([2],[3],[4],[5])
216
217 xorps $xmmW8, $xmmW0 # ([8],[9],[10],[11]) ^ ([0],[1],[2],[3])
218 xorps $xmmT1, $xmmT2 # ([13],[14],[15],0) ^ ([2],[3],[4],[5])
219 xorps $xmmT2, $xmmW0 # ^
220 # W0 = unrotated (W[0]..W[3]), still needs W[3] fixup
221 movaps $xmmW0, $xmmT2
222
223 xorps $xmmT1, $xmmT1 # rol(W0,1):
224 pcmpgtd $xmmW0, $xmmT1 # ffffffff for elements <0 (ones with msb bit 1)
225 paddd $xmmW0, $xmmW0 # shift left by 1
226 psubd $xmmT1, $xmmW0 # add 1 to those who had msb bit 1
227 # W0 = rotated (W[0]..W[3]), still needs W[3] fixup
228
229 pslldq \$12, $xmmT2 # lshift by 12 bytes: T2 = (0,0,0,unrotW[0])
230 movaps $xmmT2, $xmmT1
231 pslld \$2, $xmmT2
232 psrld \$30, $xmmT1
233# xorps $xmmT1, $xmmT2 # rol((0,0,0,unrotW[0]),2)
234 xorps $xmmT1, $xmmW0 # same result, but does not depend on/does not modify T2
235
236 xorps $xmmT2, $xmmW0 # W0 = rol(W[0]..W[3],1) ^ (0,0,0,rol(unrotW[0],2))
237"
238# movq $xmmW0, %r8 # high latency (~6 cycles)
239# movaps $xmmW0, $xmmT1
240# psrldq \$8, $xmmT1 # rshift by 8 bytes: move upper 64 bits to lower
241# movq $xmmT1, %r10 # high latency
242# movq %r8, %r9
243# movq %r10, %r11
244# shrq \$32, %r9
245# shrq \$32, %r11
246# ^^^ slower than passing the results on stack (!!!)
247echo "
248 movaps $xmmW0, $xmmT2
249 paddd $xmmRCONST, $xmmT2
250 movups $xmmT2, $dstmem
251"
94} 252}
95 253
96# It's possible to interleave insns in rounds to mostly eliminate 254# It's possible to interleave integer insns in rounds to mostly eliminate
 97# dependency chains, but this is likely to help only old Pentium-based 255# dependency chains, but this is likely to help only old Pentium-based
98# CPUs (ones without OOO, which can only simultaneously execute a pair 256# CPUs (ones without OOO, which can only simultaneously execute a pair
99# of _adjacent_ insns). 257# of _adjacent_ insns).
@@ -104,28 +262,28 @@ RD1A() {
104local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 262local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
105local n=$(($6)) 263local n=$(($6))
106local n0=$(((n+0) & 15)) 264local n0=$(((n+0) & 15))
265local rN=$((7+n0/2))
107echo " 266echo "
108# $n 267# $n
109";test $n0 = 0 && echo " 268";test $n0 = 0 && echo "
110 # W[0], already in %esi 269 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
111";test $n0 != 0 && test $n0 -lt 8 && echo " 270 shrq \$32, %rsi
112 movl `W32 $n0`, %esi # W[n] 271";test $n0 = 1 && echo "
113";test $n0 -ge 8 && echo " 272 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
114 # W[n], in %r$n0 273";test $n0 -ge 2 && test $((n0 & 1)) = 0 && echo "
274 leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
275 shrq \$32, %r$rN
276";test $n0 -ge 2 && test $((n0 & 1)) = 1 && echo "
277 leal $RCONST(%r$e,%r$rN), %e$e # e += RCONST + W[n]
115";echo " 278";echo "
116 movl %e$c, %edi # c 279 movl %e$c, %edi # c
117 xorl %e$d, %edi # ^d 280 xorl %e$d, %edi # ^d
118 andl %e$b, %edi # &b 281 andl %e$b, %edi # &b
119 xorl %e$d, %edi # (((c ^ d) & b) ^ d) 282 xorl %e$d, %edi # (((c ^ d) & b) ^ d)
120";test $n0 -lt 8 && echo "
121 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n]
122";test $n0 -ge 8 && echo "
123 leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n]
124";echo "
125 addl %edi, %e$e # e += (((c ^ d) & b) ^ d) 283 addl %edi, %e$e # e += (((c ^ d) & b) ^ d)
126 movl %e$a, %esi # 284 movl %e$a, %edi #
127 roll \$5, %esi # rotl32(a,5) 285 roll \$5, %edi # rotl32(a,5)
128 addl %esi, %e$e # e += rotl32(a,5) 286 addl %edi, %e$e # e += rotl32(a,5)
129 rorl \$2, %e$b # b = rotl32(b,30) 287 rorl \$2, %e$b # b = rotl32(b,30)
130" 288"
131} 289}
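For comparison with the assembly that RD1A/RD1B emit, here is a plain-C model of the round-1 update (illustration only, not code from the commit); k_plus_w stands for the RCONST+W[n] sum that the new code forms with LEA in rounds 0..15 and reads back pre-added from the stack in rounds 16 and later:

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));
}

/* e += k_plus_w + F1(b,c,d) + rol(a,5); b = rol(b,30)
 * F1 is written with the same xor/and/xor sequence as the assembly:
 * ((c ^ d) & b) ^ d  ==  (b & c) | (~b & d). */
static void sha1_round1(uint32_t a, uint32_t *b, uint32_t c, uint32_t d,
                        uint32_t *e, uint32_t k_plus_w)
{
    uint32_t f1 = ((c ^ d) & *b) ^ d;
    *e += k_plus_w + f1 + rol32(a, 5);
    *b = rol32(*b, 30);
}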
@@ -138,28 +296,11 @@ local n2=$(((n+2) & 15))
138local n0=$(((n+0) & 15)) 296local n0=$(((n+0) & 15))
139echo " 297echo "
140# $n 298# $n
141";test $n0 -lt 8 && echo "
142 movl `W32 $n13`, %esi # W[(n+13) & 15]
143 xorl `W32 $n8`, %esi # ^W[(n+8) & 15]
144 xorl `W32 $n2`, %esi # ^W[(n+2) & 15]
145 xorl `W32 $n0`, %esi # ^W[n & 15]
146 roll %esi #
147 movl %esi, `W32 $n0` # store to W[n & 15]
148";test $n0 -ge 8 && echo "
149 xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15]
150 xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15]
151 xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15]
152 roll `W32 $n0` #
153";echo "
154 movl %e$c, %edi # c 299 movl %e$c, %edi # c
155 xorl %e$d, %edi # ^d 300 xorl %e$d, %edi # ^d
156 andl %e$b, %edi # &b 301 andl %e$b, %edi # &b
157 xorl %e$d, %edi # (((c ^ d) & b) ^ d) 302 xorl %e$d, %edi # (((c ^ d) & b) ^ d)
158";test $n0 -lt 8 && echo " 303 addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15]
159 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
160";test $n0 -ge 8 && echo "
161 leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
162";echo "
163 addl %edi, %e$e # e += (((c ^ d) & b) ^ d) 304 addl %edi, %e$e # e += (((c ^ d) & b) ^ d)
164 movl %e$a, %esi # 305 movl %e$a, %esi #
165 roll \$5, %esi # rotl32(a,5) 306 roll \$5, %esi # rotl32(a,5)
@@ -167,13 +308,6 @@ echo "
167 rorl \$2, %e$b # b = rotl32(b,30) 308 rorl \$2, %e$b # b = rotl32(b,30)
168" 309"
169} 310}
170{
171RCONST=0x5A827999
172RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3; RD1A bx cx dx bp ax 4
173RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7; RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9
174RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11; RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14
175RD1A ax bx cx dx bp 15; RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19
176} | grep -v '^$'
177 311
178RD2() { 312RD2() {
179local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 313local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
@@ -184,27 +318,10 @@ local n2=$(((n+2) & 15))
184local n0=$(((n+0) & 15)) 318local n0=$(((n+0) & 15))
185echo " 319echo "
186# $n 320# $n
187";test $n0 -lt 8 && echo "
188 movl `W32 $n13`, %esi # W[(n+13) & 15]
189 xorl `W32 $n8`, %esi # ^W[(n+8) & 15]
190 xorl `W32 $n2`, %esi # ^W[(n+2) & 15]
191 xorl `W32 $n0`, %esi # ^W[n & 15]
192 roll %esi #
193 movl %esi, `W32 $n0` # store to W[n & 15]
194";test $n0 -ge 8 && echo "
195 xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15]
196 xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15]
197 xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15]
198 roll `W32 $n0` #
199";echo "
200 movl %e$c, %edi # c 321 movl %e$c, %edi # c
201 xorl %e$d, %edi # ^d 322 xorl %e$d, %edi # ^d
202 xorl %e$b, %edi # ^b 323 xorl %e$b, %edi # ^b
203";test $n0 -lt 8 && echo " 324 addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15]
204 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
205";test $n0 -ge 8 && echo "
206 leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
207";echo "
208 addl %edi, %e$e # e += (c ^ d ^ b) 325 addl %edi, %e$e # e += (c ^ d ^ b)
209 movl %e$a, %esi # 326 movl %e$a, %esi #
210 roll \$5, %esi # rotl32(a,5) 327 roll \$5, %esi # rotl32(a,5)
@@ -212,13 +329,6 @@ echo "
212 rorl \$2, %e$b # b = rotl32(b,30) 329 rorl \$2, %e$b # b = rotl32(b,30)
213" 330"
214} 331}
215{
216RCONST=0x6ED9EBA1
217RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23; RD2 bx cx dx bp ax 24
218RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27; RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29
219RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31; RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34
220RD2 ax bx cx dx bp 35; RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39
221} | grep -v '^$'
222 332
223RD3() { 333RD3() {
224local a=$1;local b=$2;local c=$3;local d=$4;local e=$5 334local a=$1;local b=$2;local c=$3;local d=$4;local e=$5
@@ -235,53 +345,82 @@ echo "
235 andl %e$c, %esi # si: b & c 345 andl %e$c, %esi # si: b & c
236 andl %e$d, %edi # di: (b | c) & d 346 andl %e$d, %edi # di: (b | c) & d
237 orl %esi, %edi # ((b | c) & d) | (b & c) 347 orl %esi, %edi # ((b | c) & d) | (b & c)
238";test $n0 -lt 8 && echo "
239 movl `W32 $n13`, %esi # W[(n+13) & 15]
240 xorl `W32 $n8`, %esi # ^W[(n+8) & 15]
241 xorl `W32 $n2`, %esi # ^W[(n+2) & 15]
242 xorl `W32 $n0`, %esi # ^W[n & 15]
243 roll %esi #
244 movl %esi, `W32 $n0` # store to W[n & 15]
245";test $n0 -ge 8 && echo "
246 xorl `W32 $n13`, `W32 $n0` # W[n & 15] ^= W[(n+13) & 15]
247 xorl `W32 $n8`, `W32 $n0` # ^W[(n+8) & 15]
248 xorl `W32 $n2`, `W32 $n0` # ^W[(n+2) & 15]
249 roll `W32 $n0` #
250";echo "
251 addl %edi, %e$e # += ((b | c) & d) | (b & c) 348 addl %edi, %e$e # += ((b | c) & d) | (b & c)
252";test $n0 -lt 8 && echo " 349 addl -64+4*$n0(%rsp), %e$e # e += RCONST + W[n & 15]
253 leal $RCONST(%r$e,%rsi), %e$e # e += RCONST + W[n & 15]
254";test $n0 -ge 8 && echo "
255 leal $RCONST(%r$e,%r$n0), %e$e # e += RCONST + W[n & 15]
256";echo "
257 movl %e$a, %esi # 350 movl %e$a, %esi #
258 roll \$5, %esi # rotl32(a,5) 351 roll \$5, %esi # rotl32(a,5)
259 addl %esi, %e$e # e += rotl32(a,5) 352 addl %esi, %e$e # e += rotl32(a,5)
260 rorl \$2, %e$b # b = rotl32(b,30) 353 rorl \$2, %e$b # b = rotl32(b,30)
261" 354"
262} 355}
356
263{ 357{
264#RCONST=0x8F1BBCDC "out of range for signed 32bit displacement" 358# Round 1
265RCONST=-0x70E44324 359RCONST=0x5A827999
266RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43; RD3 bx cx dx bp ax 44 360RD1A ax bx cx dx bp 0; RD1A bp ax bx cx dx 1; RD1A dx bp ax bx cx 2; RD1A cx dx bp ax bx 3;
267RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47; RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49 361RD1A bx cx dx bp ax 4; RD1A ax bx cx dx bp 5; RD1A bp ax bx cx dx 6; RD1A dx bp ax bx cx 7;
268RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51; RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54 362a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
269RD3 ax bx cx dx bp 55; RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59 363b=`RD1A cx dx bp ax bx 8; RD1A bx cx dx bp ax 9; RD1A ax bx cx dx bp 10; RD1A bp ax bx cx dx 11;`
270} | grep -v '^$' 364INTERLEAVE "$a" "$b"
365a=`echo " pshufd \\$0x55, $xmmALLRCONST, $xmmRCONST"
366 PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
367b=`RD1A dx bp ax bx cx 12; RD1A cx dx bp ax bx 13; RD1A bx cx dx bp ax 14; RD1A ax bx cx dx bp 15;`
368INTERLEAVE "$a" "$b"
369a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
370b=`RD1B bp ax bx cx dx 16; RD1B dx bp ax bx cx 17; RD1B cx dx bp ax bx 18; RD1B bx cx dx bp ax 19;`
371INTERLEAVE "$a" "$b"
372
373# Round 2
374RCONST=0x6ED9EBA1
375a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
376b=`RD2 ax bx cx dx bp 20; RD2 bp ax bx cx dx 21; RD2 dx bp ax bx cx 22; RD2 cx dx bp ax bx 23;`
377INTERLEAVE "$a" "$b"
378a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
379b=`RD2 bx cx dx bp ax 24; RD2 ax bx cx dx bp 25; RD2 bp ax bx cx dx 26; RD2 dx bp ax bx cx 27;`
380INTERLEAVE "$a" "$b"
381a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
382b=`RD2 cx dx bp ax bx 28; RD2 bx cx dx bp ax 29; RD2 ax bx cx dx bp 30; RD2 bp ax bx cx dx 31;`
383INTERLEAVE "$a" "$b"
384a=`echo " pshufd \\$0xaa, $xmmALLRCONST, $xmmRCONST"
385 PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
386b=`RD2 dx bp ax bx cx 32; RD2 cx dx bp ax bx 33; RD2 bx cx dx bp ax 34; RD2 ax bx cx dx bp 35;`
387INTERLEAVE "$a" "$b"
388a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
389b=`RD2 bp ax bx cx dx 36; RD2 dx bp ax bx cx 37; RD2 cx dx bp ax bx 38; RD2 bx cx dx bp ax 39;`
390INTERLEAVE "$a" "$b"
391
392# Round 3
393RCONST=0x8F1BBCDC
394a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
395b=`RD3 ax bx cx dx bp 40; RD3 bp ax bx cx dx 41; RD3 dx bp ax bx cx 42; RD3 cx dx bp ax bx 43;`
396INTERLEAVE "$a" "$b"
397a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
398b=`RD3 bx cx dx bp ax 44; RD3 ax bx cx dx bp 45; RD3 bp ax bx cx dx 46; RD3 dx bp ax bx cx 47;`
399INTERLEAVE "$a" "$b"
400a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
401b=`RD3 cx dx bp ax bx 48; RD3 bx cx dx bp ax 49; RD3 ax bx cx dx bp 50; RD3 bp ax bx cx dx 51;`
402INTERLEAVE "$a" "$b"
403a=`echo " pshufd \\$0xff, $xmmALLRCONST, $xmmRCONST"
404 PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
405b=`RD3 dx bp ax bx cx 52; RD3 cx dx bp ax bx 53; RD3 bx cx dx bp ax 54; RD3 ax bx cx dx bp 55;`
406INTERLEAVE "$a" "$b"
407a=`PREP %xmm0 %xmm1 %xmm2 %xmm3 "-64+16*0(%rsp)"`
408b=`RD3 bp ax bx cx dx 56; RD3 dx bp ax bx cx 57; RD3 cx dx bp ax bx 58; RD3 bx cx dx bp ax 59;`
409INTERLEAVE "$a" "$b"
271 410
272# Round 4 has the same logic as round 2, only n and RCONST are different 411# Round 4 has the same logic as round 2, only n and RCONST are different
273{ 412RCONST=0xCA62C1D6
274#RCONST=0xCA62C1D6 "out of range for signed 32bit displacement" 413a=`PREP %xmm1 %xmm2 %xmm3 %xmm0 "-64+16*1(%rsp)"`
275RCONST=-0x359D3E2A 414b=`RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63;`
276RD2 ax bx cx dx bp 60; RD2 bp ax bx cx dx 61; RD2 dx bp ax bx cx 62; RD2 cx dx bp ax bx 63; RD2 bx cx dx bp ax 64 415INTERLEAVE "$a" "$b"
277RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67; RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69 416a=`PREP %xmm2 %xmm3 %xmm0 %xmm1 "-64+16*2(%rsp)"`
278RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71; RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74 417b=`RD2 bx cx dx bp ax 64; RD2 ax bx cx dx bp 65; RD2 bp ax bx cx dx 66; RD2 dx bp ax bx cx 67;`
279RD2 ax bx cx dx bp 75; RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79 418INTERLEAVE "$a" "$b"
280# Note: new W[n&15] values generated in last 3 iterations 419a=`PREP %xmm3 %xmm0 %xmm1 %xmm2 "-64+16*3(%rsp)"`
281# (W[13,14,15]) are unused after each of these iterations. 420b=`RD2 cx dx bp ax bx 68; RD2 bx cx dx bp ax 69; RD2 ax bx cx dx bp 70; RD2 bp ax bx cx dx 71;`
282# Since we use r8..r15 for W[8..15], this does not matter. 421INTERLEAVE "$a" "$b"
283# If we switch to e.g. using r8..r15 for W[0..7], then saving of W[13,14,15] 422RD2 dx bp ax bx cx 72; RD2 cx dx bp ax bx 73; RD2 bx cx dx bp ax 74; RD2 ax bx cx dx bp 75;
284# (the "movl %esi, `W32 $n0`" insn) is a dead store and can be removed. 423RD2 bp ax bx cx dx 76; RD2 dx bp ax bx cx 77; RD2 cx dx bp ax bx 78; RD2 bx cx dx bp ax 79;
285} | grep -v '^$' 424} | grep -v '^$'
286 425
287echo " 426echo "
@@ -292,7 +431,7 @@ echo "
292 addl %ebx, 84(%rdi) # ctx->hash[1] += b 431 addl %ebx, 84(%rdi) # ctx->hash[1] += b
293 popq %r14 # 432 popq %r14 #
294 addl %ecx, 88(%rdi) # ctx->hash[2] += c 433 addl %ecx, 88(%rdi) # ctx->hash[2] += c
295 popq %r15 # 434# popq %r15 #
296 addl %edx, 92(%rdi) # ctx->hash[3] += d 435 addl %edx, 92(%rdi) # ctx->hash[3] += d
297 popq %rbx # 436 popq %rbx #
298 addl %ebp, 96(%rdi) # ctx->hash[4] += e 437 addl %ebp, 96(%rdi) # ctx->hash[4] += e
@@ -300,4 +439,13 @@ echo "
300 439
301 ret 440 ret
302 .size sha1_process_block64, .-sha1_process_block64 441 .size sha1_process_block64, .-sha1_process_block64
442
443 .section .rodata.cst16.sha1const, \"aM\", @progbits, 16
444 .balign 16
445sha1const:
446 .long 0x5A827999
447 .long 0x6ED9EBA1
448 .long 0x8F1BBCDC
449 .long 0xCA62C1D6
450
303#endif" 451#endif"
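The generated file now carries all four round constants in the sha1const table and the generator broadcasts the active one with pshufd $0x00/$0x55/$0xaa/$0xff before each 20-round group; because rounds 16..79 take RCONST+W[n] pre-added from the stack, the old negative-displacement workaround for 0x8F1BBCDC and 0xCA62C1D6 disappears. A minimal intrinsics sketch of the broadcast (illustration only, names are mine):

#include <stdint.h>
#include <emmintrin.h>

/* Mirrors the sha1const .rodata table emitted above. */
static const uint32_t sha1_k[4] = {
    0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6
};

/* Broadcast the constant for round group g (0..3) into all four lanes,
 * as pshufd with immediate 0x00/0x55/0xaa/0xff does on xmmALLRCONST. */
static __m128i broadcast_rconst(int g)
{
    __m128i all = _mm_loadu_si128((const __m128i *)sha1_k);
    switch (g) {
    case 0:  return _mm_shuffle_epi32(all, 0x00);
    case 1:  return _mm_shuffle_epi32(all, 0x55);
    case 2:  return _mm_shuffle_epi32(all, 0xaa);
    default: return _mm_shuffle_epi32(all, 0xff);
    }
}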
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
index 33cc3bf7f..b32029360 100644
--- a/libbb/hash_md5_sha_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -20,7 +20,7 @@
20#define extr128_32 pextrd 20#define extr128_32 pextrd
21//#define extr128_32 extractps # not shorter 21//#define extr128_32 extractps # not shorter
22 22
23 .section .text.sha1_process_block64_shaNI,"ax",@progbits 23 .section .text.sha1_process_block64_shaNI, "ax", @progbits
24 .globl sha1_process_block64_shaNI 24 .globl sha1_process_block64_shaNI
25 .hidden sha1_process_block64_shaNI 25 .hidden sha1_process_block64_shaNI
26 .type sha1_process_block64_shaNI, @function 26 .type sha1_process_block64_shaNI, @function
@@ -32,41 +32,42 @@
32#define MSG1 %xmm4 32#define MSG1 %xmm4
33#define MSG2 %xmm5 33#define MSG2 %xmm5
34#define MSG3 %xmm6 34#define MSG3 %xmm6
35#define SHUF_MASK %xmm7
36 35
37 .balign 8 # allow decoders to fetch at least 2 first insns 36 .balign 8 # allow decoders to fetch at least 2 first insns
38sha1_process_block64_shaNI: 37sha1_process_block64_shaNI:
39 /* load initial hash values */ 38 /* load initial hash values */
40
41 xor128 E0, E0
42 movu128 80(%rdi), ABCD 39 movu128 80(%rdi), ABCD
40 xor128 E0, E0
43 pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word 41 pinsrd $3, 80+4*4(%rdi), E0 # load to uppermost 32-bit word
44 shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD 42 shuf128_32 $0x1B, ABCD, ABCD # DCBA -> ABCD
45 43
46 mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK 44 mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), %xmm7
45
46 movu128 0*16(%rdi), MSG0
47 pshufb %xmm7, MSG0
48 movu128 1*16(%rdi), MSG1
49 pshufb %xmm7, MSG1
50 movu128 2*16(%rdi), MSG2
51 pshufb %xmm7, MSG2
52 movu128 3*16(%rdi), MSG3
53 pshufb %xmm7, MSG3
47 54
48 /* Save hash values for addition after rounds */ 55 /* Save hash values for addition after rounds */
49 mova128 E0, %xmm9 56 mova128 E0, %xmm7
50 mova128 ABCD, %xmm8 57 mova128 ABCD, %xmm8
51 58
52 /* Rounds 0-3 */ 59 /* Rounds 0-3 */
53 movu128 0*16(%rdi), MSG0
54 pshufb SHUF_MASK, MSG0
55 paddd MSG0, E0 60 paddd MSG0, E0
56 mova128 ABCD, E1 61 mova128 ABCD, E1
57 sha1rnds4 $0, E0, ABCD 62 sha1rnds4 $0, E0, ABCD
58 63
59 /* Rounds 4-7 */ 64 /* Rounds 4-7 */
60 movu128 1*16(%rdi), MSG1
61 pshufb SHUF_MASK, MSG1
62 sha1nexte MSG1, E1 65 sha1nexte MSG1, E1
63 mova128 ABCD, E0 66 mova128 ABCD, E0
64 sha1rnds4 $0, E1, ABCD 67 sha1rnds4 $0, E1, ABCD
65 sha1msg1 MSG1, MSG0 68 sha1msg1 MSG1, MSG0
66 69
67 /* Rounds 8-11 */ 70 /* Rounds 8-11 */
68 movu128 2*16(%rdi), MSG2
69 pshufb SHUF_MASK, MSG2
70 sha1nexte MSG2, E0 71 sha1nexte MSG2, E0
71 mova128 ABCD, E1 72 mova128 ABCD, E1
72 sha1rnds4 $0, E0, ABCD 73 sha1rnds4 $0, E0, ABCD
@@ -74,8 +75,6 @@ sha1_process_block64_shaNI:
74 xor128 MSG2, MSG0 75 xor128 MSG2, MSG0
75 76
76 /* Rounds 12-15 */ 77 /* Rounds 12-15 */
77 movu128 3*16(%rdi), MSG3
78 pshufb SHUF_MASK, MSG3
79 sha1nexte MSG3, E1 78 sha1nexte MSG3, E1
80 mova128 ABCD, E0 79 mova128 ABCD, E0
81 sha1msg2 MSG3, MSG0 80 sha1msg2 MSG3, MSG0
@@ -206,7 +205,7 @@ sha1_process_block64_shaNI:
206 sha1rnds4 $3, E1, ABCD 205 sha1rnds4 $3, E1, ABCD
207 206
208 /* Add current hash values with previously saved */ 207 /* Add current hash values with previously saved */
209 sha1nexte %xmm9, E0 208 sha1nexte %xmm7, E0
210 paddd %xmm8, ABCD 209 paddd %xmm8, ABCD
211 210
212 /* Write hash values back in the correct order */ 211 /* Write hash values back in the correct order */
@@ -217,8 +216,8 @@ sha1_process_block64_shaNI:
217 ret 216 ret
218 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI 217 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
219 218
220.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 219 .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
221.align 16 220 .balign 16
222PSHUFFLE_BYTE_FLIP_MASK: 221PSHUFFLE_BYTE_FLIP_MASK:
223 .octa 0x000102030405060708090a0b0c0d0e0f 222 .octa 0x000102030405060708090a0b0c0d0e0f
224 223
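In the SHA-NI path the four message loads and their byte shuffles are hoisted to the top of the function, so the shuffle mask only occupies %xmm7 briefly and that register can then hold the saved E0. For reference, the PSHUFFLE_BYTE_FLIP_MASK shuffle amounts to reversing all 16 bytes of each loaded block quarter; a small SSSE3-intrinsics sketch (illustration only, the name is mine):

#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */

/* Apply the 0x000102...0f mask: pshufb writes source byte mask[i] to
 * destination byte i, so this reverses the 16 bytes of the lane, which
 * byte-swaps each 32-bit word and reverses the word order at once. */
static __m128i flip_message_words(__m128i msg)
{
    const __m128i flip = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                      8, 9, 10, 11, 12, 13, 14, 15);
    return _mm_shuffle_epi8(msg, flip);
}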
diff --git a/libbb/lineedit.c b/libbb/lineedit.c
index 8abc87976..778511d16 100644
--- a/libbb/lineedit.c
+++ b/libbb/lineedit.c
@@ -2274,17 +2274,41 @@ static int lineedit_read_key(char *read_key_buffer, int timeout)
2274#endif 2274#endif
2275 2275
2276 fflush_all(); 2276 fflush_all();
2277 while (1) { 2277 for (;;) {
2278 /* Wait for input. TIMEOUT = -1 makes read_key wait even 2278 /* Wait for input. TIMEOUT = -1 makes read_key wait even
2279 * on nonblocking stdin, TIMEOUT = 50 makes sure we won't 2279 * on nonblocking stdin, TIMEOUT = 50 makes sure we won't
2280 * insist on full MB_CUR_MAX buffer to declare input like 2280 * insist on full MB_CUR_MAX buffer to declare input like
2281 * "\xff\n",pause,"ls\n" invalid and thus won't lose "ls". 2281 * "\xff\n",pause,"ls\n" invalid and thus won't lose "ls".
2282 * 2282 *
2283 * If LI_INTERRUPTIBLE, return -1 if got EINTR in poll()
2284 * inside read_key, or if bb_got_signal != 0 (IOW: if signal
2285 * arrived before poll() is reached).
2286 *
2283 * Note: read_key sets errno to 0 on success. 2287 * Note: read_key sets errno to 0 on success.
2284 */ 2288 */
2285 IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;) 2289 for (;;) {
2286 ic = read_key(STDIN_FILENO, read_key_buffer, timeout); 2290 if ((state->flags & LI_INTERRUPTIBLE) && bb_got_signal) {
2287 IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;) 2291 errno = EINTR;
2292 return -1;
2293 }
2294//FIXME: still races here with signals, but small window to poll() inside read_key
2295 IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 1;)
2296 /* errno = 0; - read_key does this itself */
2297 ic = read_key(STDIN_FILENO, read_key_buffer, timeout);
2298 IF_FEATURE_EDITING_WINCH(S.ok_to_redraw = 0;)
2299 if (errno != EINTR)
2300 break;
2301 if (state->flags & LI_INTERRUPTIBLE) {
2302 /* LI_INTERRUPTIBLE bails out on EINTR,
2303 * but nothing really guarantees that bb_got_signal
2304 * is nonzero. Follow the least surprise principle:
2305 */
2306 if (bb_got_signal == 0)
2307 bb_got_signal = 255;
2308 goto ret;
2309 }
2310 }
2311
2288 if (errno) { 2312 if (errno) {
2289#if ENABLE_UNICODE_SUPPORT 2313#if ENABLE_UNICODE_SUPPORT
2290 if (errno == EAGAIN && unicode_idx != 0) 2314 if (errno == EAGAIN && unicode_idx != 0)
@@ -2352,7 +2376,7 @@ static int lineedit_read_key(char *read_key_buffer, int timeout)
2352#endif 2376#endif
2353 break; 2377 break;
2354 } 2378 }
2355 2379 ret:
2356 return ic; 2380 return ic;
2357} 2381}
2358 2382
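The control flow added to lineedit_read_key() above can be summarized by the following simplified model (not the busybox code: read_one_key() stands in for read_key(), assumed to clear errno on success and to fail with EINTR when a signal interrupts its poll(), and got_signal stands in for bb_got_signal):

#include <errno.h>
#include <signal.h>

extern volatile sig_atomic_t got_signal;                 /* models bb_got_signal */
long long read_one_key(int fd, char *buf, int timeout);  /* models read_key()    */

static long long wait_for_key(int fd, char *buf, int timeout, int interruptible)
{
    long long ic;
    for (;;) {
        if (interruptible && got_signal) {
            errno = EINTR;        /* a signal arrived before we even polled */
            return -1;
        }
        /* a signal can still land between the check above and the poll()
         * inside read_one_key() - a small window, but not zero */
        ic = read_one_key(fd, buf, timeout);
        if (errno != EINTR)
            break;                /* got input, or a non-EINTR error */
        if (interruptible) {
            if (got_signal == 0)
                got_signal = 255; /* keep the flag observable to the caller */
            return -1;
        }
        /* non-interruptible callers simply wait again */
    }
    return ic;
}

As the FIXME in the hunk says, this narrows the race with signals rather than closing it.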
diff --git a/libbb/read_key.c b/libbb/read_key.c
index 03b7da656..cf8ed411e 100644
--- a/libbb/read_key.c
+++ b/libbb/read_key.c
@@ -126,7 +126,10 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout)
126 * if fd can be in non-blocking mode. 126 * if fd can be in non-blocking mode.
127 */ 127 */
128 if (timeout >= -1) { 128 if (timeout >= -1) {
129 if (safe_poll(&pfd, 1, timeout) == 0) { 129 n = poll(&pfd, 1, timeout);
130 if (n < 0 && errno == EINTR)
131 return n;
132 if (n == 0) {
130 /* Timed out */ 133 /* Timed out */
131 errno = EAGAIN; 134 errno = EAGAIN;
132 return -1; 135 return -1;
@@ -138,7 +141,7 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout)
138 * When we were reading 3 bytes here, we were eating 141 * When we were reading 3 bytes here, we were eating
139 * "li" too, and cat was getting wrong input. 142 * "li" too, and cat was getting wrong input.
140 */ 143 */
141 n = safe_read(fd, buffer, 1); 144 n = read(fd, buffer, 1);
142 if (n <= 0) 145 if (n <= 0)
143 return -1; 146 return -1;
144 } 147 }
@@ -284,6 +287,16 @@ int64_t FAST_FUNC read_key(int fd, char *buffer, int timeout)
284 goto start_over; 287 goto start_over;
285} 288}
286 289
290int64_t FAST_FUNC safe_read_key(int fd, char *buffer, int timeout)
291{
292 int64_t r;
293 do {
294 /* errno = 0; - read_key does this itself */
295 r = read_key(fd, buffer, timeout);
296 } while (errno == EINTR);
297 return r;
298}
299
287void FAST_FUNC read_key_ungets(char *buffer, const char *str, unsigned len) 300void FAST_FUNC read_key_ungets(char *buffer, const char *str, unsigned len)
288{ 301{
289 unsigned cur_len = (unsigned char)buffer[0]; 302 unsigned cur_len = (unsigned char)buffer[0];
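The read_key() change replaces the retrying safe_poll()/safe_read() calls with raw poll()/read() so that EINTR is propagated to the caller, and the new safe_read_key() wrapper restores the old retry-until-success behaviour for callers that do not care about signals. The two policies, in a minimal standalone sketch (illustration only, not busybox code):

#include <errno.h>
#include <poll.h>

/* Retrying policy: hide EINTR from the caller (what the old safe_poll()
 * call effectively did). */
static int poll_retrying(struct pollfd *pfd, int timeout)
{
    int n;
    do {
        n = poll(pfd, 1, timeout);
    } while (n < 0 && errno == EINTR);
    return n;
}

/* Interruptible policy: let the caller observe EINTR and decide, which is
 * what read_key() now does with its raw poll(). */
static int poll_interruptible(struct pollfd *pfd, int timeout)
{
    int n = poll(pfd, 1, timeout);
    if (n < 0 && errno == EINTR)
        return -1;
    return n;
}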
diff --git a/libbb/setup_environment.c b/libbb/setup_environment.c
index df2983958..3549e2099 100644
--- a/libbb/setup_environment.c
+++ b/libbb/setup_environment.c
@@ -36,9 +36,8 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass
36 36
37 /* Change the current working directory to be the home directory 37 /* Change the current working directory to be the home directory
38 * of the user */ 38 * of the user */
39 if (!(flags & SETUP_ENV_NO_CHDIR)) { 39 if (flags & SETUP_ENV_CHDIR) {
40 if (chdir(pw->pw_dir) != 0) { 40 if (chdir_or_warn(pw->pw_dir) != 0) {
41 bb_error_msg("can't change directory to '%s'", pw->pw_dir);
42 xchdir((flags & SETUP_ENV_TO_TMP) ? "/tmp" : "/"); 41 xchdir((flags & SETUP_ENV_TO_TMP) ? "/tmp" : "/");
43 } 42 }
44 } 43 }
@@ -59,7 +58,8 @@ void FAST_FUNC setup_environment(const char *shell, int flags, const struct pass
59 //xsetenv("LOGNAME", pw->pw_name); 58 //xsetenv("LOGNAME", pw->pw_name);
60 //xsetenv("HOME", pw->pw_dir); 59 //xsetenv("HOME", pw->pw_dir);
61 //xsetenv("SHELL", shell); 60 //xsetenv("SHELL", shell);
62 } else if (flags & SETUP_ENV_CHANGEENV) { 61 } else
62 if (flags & (SETUP_ENV_CHANGEENV|SETUP_ENV_CHANGEENV_LOGNAME)) {
63 /* Set HOME, SHELL, and if not becoming a super-user 63 /* Set HOME, SHELL, and if not becoming a super-user
64 * or if SETUP_ENV_CHANGEENV_LOGNAME, USER and LOGNAME. */ 64 * or if SETUP_ENV_CHANGEENV_LOGNAME, USER and LOGNAME. */
65 if ((flags & SETUP_ENV_CHANGEENV_LOGNAME) || pw->pw_uid != 0) { 65 if ((flags & SETUP_ENV_CHANGEENV_LOGNAME) || pw->pw_uid != 0) {
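With SETUP_ENV_NO_CHDIR inverted into SETUP_ENV_CHDIR, changing to the user's home directory becomes opt-in, and the warning on failure now comes from the shared chdir_or_warn() helper with xchdir() as the fatal fallback. A hypothetical caller (illustration only, not from this commit) would request the behaviour explicitly:

#include "libbb.h"

/* Hypothetical login-style caller: the chdir to pw->pw_dir must now be
 * asked for via SETUP_ENV_CHDIR instead of being on by default. */
static void enter_user_session(const struct passwd *pw)
{
    setup_environment(pw->pw_shell,
                      SETUP_ENV_CHDIR | SETUP_ENV_CHANGEENV,
                      pw);
}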
diff --git a/libbb/xfuncs_printf.c b/libbb/xfuncs_printf.c
index aae3b092d..a9add8ab2 100644
--- a/libbb/xfuncs_printf.c
+++ b/libbb/xfuncs_printf.c
@@ -417,11 +417,18 @@ void FAST_FUNC xseteuid(uid_t euid)
417 if (seteuid(euid)) bb_simple_perror_msg_and_die("seteuid"); 417 if (seteuid(euid)) bb_simple_perror_msg_and_die("seteuid");
418} 418}
419 419
420int FAST_FUNC chdir_or_warn(const char *path)
421{
422 int r = chdir(path);
423 if (r != 0)
424 bb_perror_msg("can't change directory to '%s'", path);
425 return r;
426}
420// Die if we can't chdir to a new path. 427// Die if we can't chdir to a new path.
421void FAST_FUNC xchdir(const char *path) 428void FAST_FUNC xchdir(const char *path)
422{ 429{
423 if (chdir(path)) 430 if (chdir_or_warn(path) != 0)
424 bb_perror_msg_and_die("can't change directory to '%s'", path); 431 xfunc_die();
425} 432}
426 433
427void FAST_FUNC xfchdir(int fd) 434void FAST_FUNC xfchdir(int fd)