author    Denys Vlasenko <vda.linux@googlemail.com>  2022-02-03 14:15:20 +0100
committer Denys Vlasenko <vda.linux@googlemail.com>  2022-02-03 14:58:02 +0100
commit    6472ac942898437e040171cec991de1c0b962f72 (patch)
tree      db3bad21943c0f946f3485d71ac336c57a7ea846
parent    205042c07a3bf6c8e685c434713f2a9e46630cd0 (diff)
download  busybox-w32-6472ac942898437e040171cec991de1c0b962f72.tar.gz
          busybox-w32-6472ac942898437e040171cec991de1c0b962f72.tar.bz2
          busybox-w32-6472ac942898437e040171cec991de1c0b962f72.zip
libbb/sha256: optional x86 hardware accelerated hashing
64 bit:
function                                             old     new   delta
sha256_process_block64_shaNI                           -     730    +730
.rodata                                           108314  108586    +272
sha256_begin                                          31      83     +52
------------------------------------------------------------------------------
(add/remove: 5/1 grow/shrink: 2/0 up/down: 1055/-1)           Total: 1054 bytes

32 bit:
function                                             old     new   delta
sha256_process_block64_shaNI                           -     747    +747
.rodata                                           104318  104590    +272
sha256_begin                                          29      84     +55
------------------------------------------------------------------------------
(add/remove: 5/1 grow/shrink: 2/0 up/down: 1075/-1)           Total: 1074 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--  libbb/Config.src                      |   6
-rw-r--r--  libbb/Kbuild.src                      |   2
-rw-r--r--  libbb/hash_md5_sha.c                  |  54
-rw-r--r--  libbb/hash_md5_sha256_x86-32_shaNI.S  | 283
-rw-r--r--  libbb/hash_md5_sha256_x86-64_shaNI.S  | 281
-rw-r--r--  libbb/hash_md5_sha_x86-32_shaNI.S     |   4
-rw-r--r--  libbb/hash_md5_sha_x86-64.S           |   2
-rwxr-xr-x  libbb/hash_md5_sha_x86-64.S.sh        |   2
-rw-r--r--  libbb/hash_md5_sha_x86-64_shaNI.S     |   4
9 files changed, 612 insertions(+), 26 deletions(-)
diff --git a/libbb/Config.src b/libbb/Config.src
index 708d3b0c8..0ecd5bd46 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -70,6 +70,12 @@ config SHA1_HWACCEL
 	  On x86, this adds ~590 bytes of code. Throughput
 	  is about twice as fast as fully-unrolled generic code.
 
+config SHA256_HWACCEL
+	bool "SHA256: Use hardware accelerated instructions if possible"
+	default y
+	help
+	  On x86, this adds ~1k bytes of code.
+
 config SHA3_SMALL
 	int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
 	default 1  # all "fast or small" options default to small
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src
index b9d34de8e..653025e56 100644
--- a/libbb/Kbuild.src
+++ b/libbb/Kbuild.src
@@ -59,6 +59,8 @@ lib-y += hash_md5_sha.o
 lib-y += hash_md5_sha_x86-64.o
 lib-y += hash_md5_sha_x86-64_shaNI.o
 lib-y += hash_md5_sha_x86-32_shaNI.o
+lib-y += hash_md5_sha256_x86-64_shaNI.o
+lib-y += hash_md5_sha256_x86-32_shaNI.o
 # Alternative (disabled) MD5 implementation
 #lib-y += hash_md5prime.o
 lib-y += messages.o
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index a23db5152..880ffab01 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -13,6 +13,27 @@
 
 #define NEED_SHA512 (ENABLE_SHA512SUM || ENABLE_USE_BB_CRYPT_SHA)
 
+#if ENABLE_SHA1_HWACCEL || ENABLE_SHA256_HWACCEL
+# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
+{
+	asm ("cpuid"
+		: "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
+		: "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx)
+	);
+}
+static smallint shaNI;
+void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
+void FAST_FUNC sha256_process_block64_shaNI(sha256_ctx_t *ctx);
+# if defined(__i386__)
+struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 76)]; };
+# endif
+# if defined(__x86_64__)
+struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha256_ctx_t, hash) != 80)]; };
+# endif
+# endif
+#endif
+
 /* gcc 4.2.1 optimizes rotr64 better with inline than with macro
  * (for rotX32, there is no difference). Why? My guess is that
  * macro requires clever common subexpression elimination heuristics
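
The ASM_expects_76/80_shaNI structs added above are a pre-C11 compile-time check: if the offset hard-coded in the .S files ever disagrees with offsetof(), the array size goes negative and the build fails. A minimal sketch of the idiom follows, using a hypothetical context struct for illustration only (the real sha256_ctx_t places hash[] at offset 76 on i386 and 80 on x86-64, exactly what the patch asserts):

#include <stddef.h>

/* hypothetical layout, not busybox's sha256_ctx_t */
struct demo_ctx {
	unsigned long long total64;   /* 8 bytes                   */
	unsigned char wbuffer[64];    /* so hash[] lands at 72     */
	unsigned hash[8];
};

/* pre-C11 idiom, as in the patch: negative array size on mismatch */
struct ASM_expects_72_demo { char t[1 - 2*(offsetof(struct demo_ctx, hash) != 72)]; };

/* C11 spelling of the same check */
_Static_assert(offsetof(struct demo_ctx, hash) == 72, "asm expects hash at offset 72");
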
@@ -1142,25 +1163,6 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
 }
 #endif /* NEED_SHA512 */
 
-#if ENABLE_SHA1_HWACCEL
-# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
-static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
-{
-	asm ("cpuid"
-		: "=a"(*eax), "=b"(*ebx), "=c"(*ecx), "=d"(*edx)
-		: "0"(*eax), "1"(*ebx), "2"(*ecx), "3"(*edx)
-	);
-}
-void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
-# if defined(__i386__)
-struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; };
-# endif
-# if defined(__x86_64__)
-struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
-# endif
-# endif
-#endif
-
 void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 {
 	ctx->hash[0] = 0x67452301;
@@ -1173,7 +1175,6 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
 #if ENABLE_SHA1_HWACCEL
 # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
 	{
-		static smallint shaNI;
 		if (!shaNI) {
 			unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
 			cpuid(&eax, &ebx, &ecx, &edx);
@@ -1225,6 +1226,19 @@ void FAST_FUNC sha256_begin(sha256_ctx_t *ctx)
 	memcpy(&ctx->total64, init256, sizeof(init256));
 	/*ctx->total64 = 0; - done by prepending two 32-bit zeros to init256 */
 	ctx->process_block = sha256_process_block64;
+#if ENABLE_SHA256_HWACCEL
+# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+	{
+		if (!shaNI) {
+			unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
+			cpuid(&eax, &ebx, &ecx, &edx);
+			shaNI = ((ebx >> 29) << 1) - 1;
+		}
+		if (shaNI > 0)
+			ctx->process_block = sha256_process_block64_shaNI;
+	}
+# endif
+#endif
 }
 
 #if NEED_SHA512
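
The begin() hooks above probe CPUID leaf 7 (sub-leaf 0) once, cache the result in the shared static shaNI flag (the patch encodes present/absent as +1/-1 in one expression), and swap ctx->process_block to the SHA-NI routine when EBX bit 29 reports the SHA extensions. A self-contained sketch of that detection pattern, with hypothetical names and GCC's <cpuid.h> wrapper instead of the patch's inline asm:

#include <cpuid.h>	/* GCC/clang, x86 only */

static int sha_ni_state;	/* 0 = not probed yet, >0 = present, <0 = absent */

static int have_sha_ni(void)
{
	if (sha_ni_state == 0) {
		unsigned eax, ebx, ecx, edx;
		/* CPUID leaf 7, sub-leaf 0: EBX bit 29 = SHA extensions */
		if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) && (ebx & (1u << 29)))
			sha_ni_state = 1;
		else
			sha_ni_state = -1;
	}
	return sha_ni_state > 0;
}
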
diff --git a/libbb/hash_md5_sha256_x86-32_shaNI.S b/libbb/hash_md5_sha256_x86-32_shaNI.S
new file mode 100644
index 000000000..56e37fa38
--- /dev/null
+++ b/libbb/hash_md5_sha256_x86-32_shaNI.S
@@ -0,0 +1,283 @@
+#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__i386__)
+/* The code is adapted from Linux kernel's source */
+
+// We use shorter insns, even though they are for "wrong"
+// data type (fp, not int).
+// For Intel, there is no penalty for doing it at all
+// (CPUs which do have such penalty do not support SHA1 insns).
+// For AMD, the penalty is one extra cycle
+// (allegedly: I failed to find measurable difference).
+
+//#define mova128 movdqa
+#define mova128 movaps
+//#define movu128 movdqu
+#define movu128 movups
+//#define shuf128_32 pshufd
+#define shuf128_32 shufps
+
+	.section .text.sha256_process_block64_shaNI, "ax", @progbits
+	.globl	sha256_process_block64_shaNI
+	.hidden	sha256_process_block64_shaNI
+	.type	sha256_process_block64_shaNI, @function
+
+#define DATA_PTR	%eax
+
+#define SHA256CONSTANTS	%ecx
+
+#define MSG		%xmm0
+#define STATE0		%xmm1
+#define STATE1		%xmm2
+#define MSGTMP0		%xmm3
+#define MSGTMP1		%xmm4
+#define MSGTMP2		%xmm5
+#define MSGTMP3		%xmm6
+#define MSGTMP4		%xmm7
+
+	.balign	8	# allow decoders to fetch at least 3 first insns
+sha256_process_block64_shaNI:
+	pushl	%ebp
+	movl	%esp, %ebp
+	subl	$32, %esp
+	andl	$~0xF, %esp	# paddd needs aligned memory operand
+
+	movu128	76+0*16(%eax), STATE0
+	movu128	76+1*16(%eax), STATE1
+
+	shuf128_32 $0xB1, STATE0, STATE0	/* CDAB */
+	shuf128_32 $0x1B, STATE1, STATE1	/* EFGH */
+	mova128	STATE0, MSGTMP4
+	palignr	$8, STATE1, STATE0	/* ABEF */
+	pblendw	$0xF0, MSGTMP4, STATE1	/* CDGH */
+
+#	mova128	PSHUFFLE_BSWAP32_FLIP_MASK, SHUF_MASK
+	lea	K256, SHA256CONSTANTS
+
+	/* Save hash values for addition after rounds */
+	mova128	STATE0, 0*16(%esp)
+	mova128	STATE1, 1*16(%esp)
+
+	/* Rounds 0-3 */
+	movu128	0*16(DATA_PTR), MSG
+	pshufb	PSHUFFLE_BSWAP32_FLIP_MASK, MSG
+	mova128	MSG, MSGTMP0
+	paddd	0*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+
+	/* Rounds 4-7 */
+	movu128	1*16(DATA_PTR), MSG
+	pshufb	PSHUFFLE_BSWAP32_FLIP_MASK, MSG
+	mova128	MSG, MSGTMP1
+	paddd	1*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP1, MSGTMP0
+
+	/* Rounds 8-11 */
+	movu128	2*16(DATA_PTR), MSG
+	pshufb	PSHUFFLE_BSWAP32_FLIP_MASK, MSG
+	mova128	MSG, MSGTMP2
+	paddd	2*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP2, MSGTMP1
+
+	/* Rounds 12-15 */
+	movu128	3*16(DATA_PTR), MSG
+	pshufb	PSHUFFLE_BSWAP32_FLIP_MASK, MSG
+	mova128	MSG, MSGTMP3
+	paddd	3*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP3, MSGTMP4
+	palignr	$4, MSGTMP2, MSGTMP4
+	paddd	MSGTMP4, MSGTMP0
+	sha256msg2 MSGTMP3, MSGTMP0
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP3, MSGTMP2
+
+	/* Rounds 16-19 */
+	mova128	MSGTMP0, MSG
+	paddd	4*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP0, MSGTMP4
+	palignr	$4, MSGTMP3, MSGTMP4
+	paddd	MSGTMP4, MSGTMP1
+	sha256msg2 MSGTMP0, MSGTMP1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP0, MSGTMP3
+
+	/* Rounds 20-23 */
+	mova128	MSGTMP1, MSG
+	paddd	5*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP1, MSGTMP4
+	palignr	$4, MSGTMP0, MSGTMP4
+	paddd	MSGTMP4, MSGTMP2
+	sha256msg2 MSGTMP1, MSGTMP2
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP1, MSGTMP0
+
+	/* Rounds 24-27 */
+	mova128	MSGTMP2, MSG
+	paddd	6*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP2, MSGTMP4
+	palignr	$4, MSGTMP1, MSGTMP4
+	paddd	MSGTMP4, MSGTMP3
+	sha256msg2 MSGTMP2, MSGTMP3
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP2, MSGTMP1
+
+	/* Rounds 28-31 */
+	mova128	MSGTMP3, MSG
+	paddd	7*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP3, MSGTMP4
+	palignr	$4, MSGTMP2, MSGTMP4
+	paddd	MSGTMP4, MSGTMP0
+	sha256msg2 MSGTMP3, MSGTMP0
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP3, MSGTMP2
+
+	/* Rounds 32-35 */
+	mova128	MSGTMP0, MSG
+	paddd	8*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP0, MSGTMP4
+	palignr	$4, MSGTMP3, MSGTMP4
+	paddd	MSGTMP4, MSGTMP1
+	sha256msg2 MSGTMP0, MSGTMP1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP0, MSGTMP3
+
+	/* Rounds 36-39 */
+	mova128	MSGTMP1, MSG
+	paddd	9*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP1, MSGTMP4
+	palignr	$4, MSGTMP0, MSGTMP4
+	paddd	MSGTMP4, MSGTMP2
+	sha256msg2 MSGTMP1, MSGTMP2
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP1, MSGTMP0
+
+	/* Rounds 40-43 */
+	mova128	MSGTMP2, MSG
+	paddd	10*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP2, MSGTMP4
+	palignr	$4, MSGTMP1, MSGTMP4
+	paddd	MSGTMP4, MSGTMP3
+	sha256msg2 MSGTMP2, MSGTMP3
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP2, MSGTMP1
+
+	/* Rounds 44-47 */
+	mova128	MSGTMP3, MSG
+	paddd	11*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP3, MSGTMP4
+	palignr	$4, MSGTMP2, MSGTMP4
+	paddd	MSGTMP4, MSGTMP0
+	sha256msg2 MSGTMP3, MSGTMP0
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP3, MSGTMP2
+
+	/* Rounds 48-51 */
+	mova128	MSGTMP0, MSG
+	paddd	12*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP0, MSGTMP4
+	palignr	$4, MSGTMP3, MSGTMP4
+	paddd	MSGTMP4, MSGTMP1
+	sha256msg2 MSGTMP0, MSGTMP1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP0, MSGTMP3
+
+	/* Rounds 52-55 */
+	mova128	MSGTMP1, MSG
+	paddd	13*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP1, MSGTMP4
+	palignr	$4, MSGTMP0, MSGTMP4
+	paddd	MSGTMP4, MSGTMP2
+	sha256msg2 MSGTMP1, MSGTMP2
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+
+	/* Rounds 56-59 */
+	mova128	MSGTMP2, MSG
+	paddd	14*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP2, MSGTMP4
+	palignr	$4, MSGTMP1, MSGTMP4
+	paddd	MSGTMP4, MSGTMP3
+	sha256msg2 MSGTMP2, MSGTMP3
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+
+	/* Rounds 60-63 */
+	mova128	MSGTMP3, MSG
+	paddd	15*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+
+	/* Add current hash values with previously saved */
+	paddd	0*16(%esp), STATE0
+	paddd	1*16(%esp), STATE1
+
+	/* Write hash values back in the correct order */
+	shuf128_32 $0x1B, STATE0, STATE0	/* FEBA */
+	shuf128_32 $0xB1, STATE1, STATE1	/* DCHG */
+	mova128	STATE0, MSGTMP4
+	pblendw	$0xF0, STATE1, STATE0	/* DCBA */
+	palignr	$8, MSGTMP4, STATE1	/* HGFE */
+
+	movu128	STATE0, 76+0*16(%eax)
+	movu128	STATE1, 76+1*16(%eax)
+
+	movl	%ebp, %esp
+	popl	%ebp
+	ret
+	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.balign 16
+K256:
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
+.balign 16
+PSHUFFLE_BSWAP32_FLIP_MASK:
+	.octa	0x0c0d0e0f08090a0b0405060700010203
+
+#endif
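
Each sha256msg1/sha256msg2 pair in the rounds above evaluates four words of the SHA-256 message schedule at once. For reference, this is the scalar recurrence from FIPS 180-4 that those instructions implement (an illustrative sketch, not part of the patch):

#include <stdint.h>

static uint32_t ror32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }
static uint32_t sigma0(uint32_t x) { return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3); }
static uint32_t sigma1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

/* W[0..15] hold the 16 big-endian words of the 64-byte input block */
static void sha256_expand_message(uint32_t W[64])
{
	for (int t = 16; t < 64; t++)
		W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16];
}
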
diff --git a/libbb/hash_md5_sha256_x86-64_shaNI.S b/libbb/hash_md5_sha256_x86-64_shaNI.S
new file mode 100644
index 000000000..1c2b75af3
--- /dev/null
+++ b/libbb/hash_md5_sha256_x86-64_shaNI.S
@@ -0,0 +1,281 @@
+#if ENABLE_SHA256_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
+/* The code is adapted from Linux kernel's source */
+
+// We use shorter insns, even though they are for "wrong"
+// data type (fp, not int).
+// For Intel, there is no penalty for doing it at all
+// (CPUs which do have such penalty do not support SHA1 insns).
+// For AMD, the penalty is one extra cycle
+// (allegedly: I failed to find measurable difference).
+
+//#define mova128 movdqa
+#define mova128 movaps
+//#define movu128 movdqu
+#define movu128 movups
+//#define shuf128_32 pshufd
+#define shuf128_32 shufps
+
+	.section .text.sha256_process_block64_shaNI, "ax", @progbits
+	.globl	sha256_process_block64_shaNI
+	.hidden	sha256_process_block64_shaNI
+	.type	sha256_process_block64_shaNI, @function
+
+#define DATA_PTR	%rdi
+
+#define SHA256CONSTANTS	%rax
+
+#define MSG		%xmm0
+#define STATE0		%xmm1
+#define STATE1		%xmm2
+#define MSGTMP0		%xmm3
+#define MSGTMP1		%xmm4
+#define MSGTMP2		%xmm5
+#define MSGTMP3		%xmm6
+#define MSGTMP4		%xmm7
+
+#define SHUF_MASK	%xmm8
+
+#define ABEF_SAVE	%xmm9
+#define CDGH_SAVE	%xmm10
+
+	.balign	8	# allow decoders to fetch at least 2 first insns
+sha256_process_block64_shaNI:
+	movu128	80+0*16(%rdi), STATE0
+	movu128	80+1*16(%rdi), STATE1
+
+	shuf128_32 $0xB1, STATE0, STATE0	/* CDAB */
+	shuf128_32 $0x1B, STATE1, STATE1	/* EFGH */
+	mova128	STATE0, MSGTMP4
+	palignr	$8, STATE1, STATE0	/* ABEF */
+	pblendw	$0xF0, MSGTMP4, STATE1	/* CDGH */
+
+	mova128	PSHUFFLE_BSWAP32_FLIP_MASK(%rip), SHUF_MASK
+	lea	K256(%rip), SHA256CONSTANTS
+
+	/* Save hash values for addition after rounds */
+	mova128	STATE0, ABEF_SAVE
+	mova128	STATE1, CDGH_SAVE
+
+	/* Rounds 0-3 */
+	movu128	0*16(DATA_PTR), MSG
+	pshufb	SHUF_MASK, MSG
+	mova128	MSG, MSGTMP0
+	paddd	0*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+
+	/* Rounds 4-7 */
+	movu128	1*16(DATA_PTR), MSG
+	pshufb	SHUF_MASK, MSG
+	mova128	MSG, MSGTMP1
+	paddd	1*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP1, MSGTMP0
+
+	/* Rounds 8-11 */
+	movu128	2*16(DATA_PTR), MSG
+	pshufb	SHUF_MASK, MSG
+	mova128	MSG, MSGTMP2
+	paddd	2*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP2, MSGTMP1
+
+	/* Rounds 12-15 */
+	movu128	3*16(DATA_PTR), MSG
+	pshufb	SHUF_MASK, MSG
+	mova128	MSG, MSGTMP3
+	paddd	3*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP3, MSGTMP4
+	palignr	$4, MSGTMP2, MSGTMP4
+	paddd	MSGTMP4, MSGTMP0
+	sha256msg2 MSGTMP3, MSGTMP0
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP3, MSGTMP2
+
+	/* Rounds 16-19 */
+	mova128	MSGTMP0, MSG
+	paddd	4*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP0, MSGTMP4
+	palignr	$4, MSGTMP3, MSGTMP4
+	paddd	MSGTMP4, MSGTMP1
+	sha256msg2 MSGTMP0, MSGTMP1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP0, MSGTMP3
+
+	/* Rounds 20-23 */
+	mova128	MSGTMP1, MSG
+	paddd	5*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP1, MSGTMP4
+	palignr	$4, MSGTMP0, MSGTMP4
+	paddd	MSGTMP4, MSGTMP2
+	sha256msg2 MSGTMP1, MSGTMP2
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP1, MSGTMP0
+
+	/* Rounds 24-27 */
+	mova128	MSGTMP2, MSG
+	paddd	6*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP2, MSGTMP4
+	palignr	$4, MSGTMP1, MSGTMP4
+	paddd	MSGTMP4, MSGTMP3
+	sha256msg2 MSGTMP2, MSGTMP3
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP2, MSGTMP1
+
+	/* Rounds 28-31 */
+	mova128	MSGTMP3, MSG
+	paddd	7*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP3, MSGTMP4
+	palignr	$4, MSGTMP2, MSGTMP4
+	paddd	MSGTMP4, MSGTMP0
+	sha256msg2 MSGTMP3, MSGTMP0
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP3, MSGTMP2
+
+	/* Rounds 32-35 */
+	mova128	MSGTMP0, MSG
+	paddd	8*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP0, MSGTMP4
+	palignr	$4, MSGTMP3, MSGTMP4
+	paddd	MSGTMP4, MSGTMP1
+	sha256msg2 MSGTMP0, MSGTMP1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP0, MSGTMP3
+
+	/* Rounds 36-39 */
+	mova128	MSGTMP1, MSG
+	paddd	9*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP1, MSGTMP4
+	palignr	$4, MSGTMP0, MSGTMP4
+	paddd	MSGTMP4, MSGTMP2
+	sha256msg2 MSGTMP1, MSGTMP2
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP1, MSGTMP0
+
+	/* Rounds 40-43 */
+	mova128	MSGTMP2, MSG
+	paddd	10*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP2, MSGTMP4
+	palignr	$4, MSGTMP1, MSGTMP4
+	paddd	MSGTMP4, MSGTMP3
+	sha256msg2 MSGTMP2, MSGTMP3
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP2, MSGTMP1
+
+	/* Rounds 44-47 */
+	mova128	MSGTMP3, MSG
+	paddd	11*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP3, MSGTMP4
+	palignr	$4, MSGTMP2, MSGTMP4
+	paddd	MSGTMP4, MSGTMP0
+	sha256msg2 MSGTMP3, MSGTMP0
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP3, MSGTMP2
+
+	/* Rounds 48-51 */
+	mova128	MSGTMP0, MSG
+	paddd	12*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP0, MSGTMP4
+	palignr	$4, MSGTMP3, MSGTMP4
+	paddd	MSGTMP4, MSGTMP1
+	sha256msg2 MSGTMP0, MSGTMP1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+	sha256msg1 MSGTMP0, MSGTMP3
+
+	/* Rounds 52-55 */
+	mova128	MSGTMP1, MSG
+	paddd	13*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP1, MSGTMP4
+	palignr	$4, MSGTMP0, MSGTMP4
+	paddd	MSGTMP4, MSGTMP2
+	sha256msg2 MSGTMP1, MSGTMP2
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+
+	/* Rounds 56-59 */
+	mova128	MSGTMP2, MSG
+	paddd	14*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	mova128	MSGTMP2, MSGTMP4
+	palignr	$4, MSGTMP1, MSGTMP4
+	paddd	MSGTMP4, MSGTMP3
+	sha256msg2 MSGTMP2, MSGTMP3
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+
+	/* Rounds 60-63 */
+	mova128	MSGTMP3, MSG
+	paddd	15*16(SHA256CONSTANTS), MSG
+	sha256rnds2 STATE0, STATE1
+	shuf128_32 $0x0E, MSG, MSG
+	sha256rnds2 STATE1, STATE0
+
+	/* Add current hash values with previously saved */
+	paddd	ABEF_SAVE, STATE0
+	paddd	CDGH_SAVE, STATE1
+
+	/* Write hash values back in the correct order */
+	shuf128_32 $0x1B, STATE0, STATE0	/* FEBA */
+	shuf128_32 $0xB1, STATE1, STATE1	/* DCHG */
+	mova128	STATE0, MSGTMP4
+	pblendw	$0xF0, STATE1, STATE0	/* DCBA */
+	palignr	$8, MSGTMP4, STATE1	/* HGFE */
+
+	movu128	STATE0, 80+0*16(%rdi)
+	movu128	STATE1, 80+1*16(%rdi)
+
+	ret
+	.size	sha256_process_block64_shaNI, .-sha256_process_block64_shaNI
+
+.section .rodata.cst256.K256, "aM", @progbits, 256
+.balign 16
+K256:
+	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.section .rodata.cst16.PSHUFFLE_BSWAP32_FLIP_MASK, "aM", @progbits, 16
+.balign 16
+PSHUFFLE_BSWAP32_FLIP_MASK:
+	.octa	0x0c0d0e0f08090a0b0405060700010203
+
+#endif
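
The pshufb with PSHUFFLE_BSWAP32_FLIP_MASK byte-swaps each 32-bit word of the input block so the little-endian data matches SHA-256's big-endian word order. A small stand-alone demonstration with SSE intrinsics (illustrative only; compile with -mssse3):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* same byte pattern as PSHUFFLE_BSWAP32_FLIP_MASK: 3,2,1,0, 7,6,5,4, ... */
	const __m128i bswap32_mask = _mm_set_epi8(
		12, 13, 14, 15,  8, 9, 10, 11,  4, 5, 6, 7,  0, 1, 2, 3);
	__m128i words   = _mm_set_epi32(0x44444444, 0x33333333, 0x22222222, 0x11223344);
	__m128i flipped = _mm_shuffle_epi8(words, bswap32_mask);	/* pshufb */

	uint32_t out[4];
	_mm_storeu_si128((__m128i *)out, flipped);
	printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
	/* prints: 44332211 22222222 33333333 44444444 */
	return 0;
}
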
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S
index 166cfd38a..11b855e26 100644
--- a/libbb/hash_md5_sha_x86-32_shaNI.S
+++ b/libbb/hash_md5_sha_x86-32_shaNI.S
@@ -20,7 +20,7 @@
 #define extr128_32 pextrd
 //#define extr128_32 extractps # not shorter
 
-	.section .text.sha1_process_block64_shaNI,"ax",@progbits
+	.section .text.sha1_process_block64_shaNI, "ax", @progbits
 	.globl	sha1_process_block64_shaNI
 	.hidden	sha1_process_block64_shaNI
 	.type	sha1_process_block64_shaNI, @function
@@ -224,7 +224,7 @@ sha1_process_block64_shaNI:
 	.size	sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
 
 .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
+.balign 16
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa	0x000102030405060708090a0b0c0d0e0f
 
diff --git a/libbb/hash_md5_sha_x86-64.S b/libbb/hash_md5_sha_x86-64.S
index 743269d98..47ace60de 100644
--- a/libbb/hash_md5_sha_x86-64.S
+++ b/libbb/hash_md5_sha_x86-64.S
@@ -1394,7 +1394,7 @@ sha1_process_block64:
 	.size	sha1_process_block64, .-sha1_process_block64
 
 	.section .rodata.cst16.sha1const, "aM", @progbits, 16
-	.align	16
+	.balign	16
 rconst0x5A827999:
 	.long	0x5A827999
 	.long	0x5A827999
diff --git a/libbb/hash_md5_sha_x86-64.S.sh b/libbb/hash_md5_sha_x86-64.S.sh
index 47c40af0d..656fb5414 100755
--- a/libbb/hash_md5_sha_x86-64.S.sh
+++ b/libbb/hash_md5_sha_x86-64.S.sh
@@ -433,7 +433,7 @@ echo "
 	.size	sha1_process_block64, .-sha1_process_block64
 
 	.section .rodata.cst16.sha1const, \"aM\", @progbits, 16
-	.align	16
+	.balign	16
 rconst0x5A827999:
 	.long	0x5A827999
 	.long	0x5A827999
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
index 33cc3bf7f..ba92f09df 100644
--- a/libbb/hash_md5_sha_x86-64_shaNI.S
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -20,7 +20,7 @@
 #define extr128_32 pextrd
 //#define extr128_32 extractps # not shorter
 
-	.section .text.sha1_process_block64_shaNI,"ax",@progbits
+	.section .text.sha1_process_block64_shaNI, "ax", @progbits
 	.globl	sha1_process_block64_shaNI
 	.hidden	sha1_process_block64_shaNI
 	.type	sha1_process_block64_shaNI, @function
@@ -218,7 +218,7 @@ sha1_process_block64_shaNI:
 	.size	sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
 
 .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
-.align 16
+.balign 16
 PSHUFFLE_BYTE_FLIP_MASK:
 	.octa	0x000102030405060708090a0b0c0d0e0f
 