diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-07 01:32:13 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-07 01:32:13 +0100 |
commit | a96ccbefe417aaac6a2ce59c788e01fc0f83902f (patch) | |
tree | 4151347572dcbce6777ba66c40cf0bc0135ee491 | |
parent | 711e20ecb85d13f98ba3e2bdcb344ee7534829c4 (diff) | |
download | busybox-w32-a96ccbefe417aaac6a2ce59c788e01fc0f83902f.tar.gz busybox-w32-a96ccbefe417aaac6a2ce59c788e01fc0f83902f.tar.bz2 busybox-w32-a96ccbefe417aaac6a2ce59c788e01fc0f83902f.zip |
libbb/sha1: optional i686 hardware accelerated hashing
function old new delta
sha1_process_block64_shaNI - 524 +524
sha1_begin 57 114 +57
.rodata 104353 104369 +16
static.shaNI - 1 +1
------------------------------------------------------------------------------
(add/remove: 4/0 grow/shrink: 2/0 up/down: 598/0) Total: 598 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | libbb/Kbuild.src | 1 | ||||
-rw-r--r-- | libbb/hash_md5_sha.c | 21 | ||||
-rw-r--r-- | libbb/hash_md5_sha_x86-32_shaNI.S | 231 |
3 files changed, 252 insertions, 1 deletions
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index a3db02b6f..e8bb24f6d 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src | |||
@@ -58,6 +58,7 @@ lib-y += makedev.o | |||
58 | lib-y += hash_md5_sha.o | 58 | lib-y += hash_md5_sha.o |
59 | lib-y += hash_md5_sha_x86-64.o | 59 | lib-y += hash_md5_sha_x86-64.o |
60 | lib-y += hash_md5_sha_x86-64_shaNI.o | 60 | lib-y += hash_md5_sha_x86-64_shaNI.o |
61 | lib-y += hash_md5_sha_x86-32_shaNI.o | ||
61 | # Alternative (disabled) MD5 implementation | 62 | # Alternative (disabled) MD5 implementation |
62 | #lib-y += hash_md5prime.o | 63 | #lib-y += hash_md5prime.o |
63 | lib-y += messages.o | 64 | lib-y += messages.o |
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index 4c6904b48..0b3af723a 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c | |||
@@ -1143,6 +1143,25 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx) | |||
1143 | #endif /* NEED_SHA512 */ | 1143 | #endif /* NEED_SHA512 */ |
1144 | 1144 | ||
1145 | #if ENABLE_SHA1_HWACCEL | 1145 | #if ENABLE_SHA1_HWACCEL |
1146 | # if defined(__GNUC__) && defined(__i386__) | ||
1147 | static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) /* Run the CPUID instruction: EAX/EBX/ECX/EDX are loaded from *eax..*edx and the results are stored back through the same four pointers */ | ||
1148 | { | ||
1149 | asm ( | ||
1150 | " cpuid\n" | ||
1151 | : "=a"(*eax), /* Output */ | ||
1152 | "=b"(*ebx), | ||
1153 | "=c"(*ecx), | ||
1154 | "=d"(*edx) | ||
1155 | : "0"(*eax), /* Input: "0".."3" place each value in the same register as the matching output operand */ | ||
1156 | "1"(*ebx), | ||
1157 | "2"(*ecx), | ||
1158 | "3"(*edx) | ||
1159 | /* No clobbered registers: all four registers CPUID writes are already listed as outputs */ | ||
1160 | ); | ||
1161 | } | ||
1162 | struct ASM_expects_76_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 76)]; }; /* Compile-time check: the array size becomes -1 (a compile error) unless offsetof(sha1_ctx_t, hash) is exactly 76, the displacement hard-coded in the assembly routine */ | ||
1163 | void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx); /* Prototype only: the body is the assembly routine of the same name in hash_md5_sha_x86-32_shaNI.S */ | ||
1164 | # endif | ||
1146 | # if defined(__GNUC__) && defined(__x86_64__) | 1165 | # if defined(__GNUC__) && defined(__x86_64__) |
1147 | static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) | 1166 | static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) |
1148 | { | 1167 | { |
@@ -1174,7 +1193,7 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) | |||
1174 | ctx->total64 = 0; | 1193 | ctx->total64 = 0; |
1175 | ctx->process_block = sha1_process_block64; | 1194 | ctx->process_block = sha1_process_block64; |
1176 | #if ENABLE_SHA1_HWACCEL | 1195 | #if ENABLE_SHA1_HWACCEL |
1177 | # if defined(__GNUC__) && defined(__x86_64__) | 1196 | # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) |
1178 | { | 1197 | { |
1179 | static smallint shaNI; | 1198 | static smallint shaNI; |
1180 | if (!shaNI) { | 1199 | if (!shaNI) { |
diff --git a/libbb/hash_md5_sha_x86-32_shaNI.S b/libbb/hash_md5_sha_x86-32_shaNI.S new file mode 100644 index 000000000..7202c7673 --- /dev/null +++ b/libbb/hash_md5_sha_x86-32_shaNI.S | |||
@@ -0,0 +1,231 @@ | |||
1 | #if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__i386__) | ||
2 | /* The code is adapted from Linux kernel's source */ | ||
3 | |||
4 | // We use shorter insns, even though they are for "wrong" | ||
5 | // data type (fp, not int). | ||
6 | // For Intel, there is no penalty for doing it at all | ||
7 | // (CPUs which do have such penalty do not support SHA1 insns). | ||
8 | // For AMD, the penalty is one extra cycle | ||
9 | // (allegedly: I failed to find measurable difference). | ||
10 | |||
11 | //#define mova128 movdqa | ||
12 | #define mova128 movaps /* 128-bit move; a memory operand must be 16-byte aligned */ | ||
13 | //#define movu128 movdqu | ||
14 | #define movu128 movups /* 128-bit move with no alignment requirement */ | ||
15 | //#define xor128 pxor | ||
16 | #define xor128 xorps /* 128-bit bitwise XOR */ | ||
17 | //#define shuf128_32 pshufd | ||
18 | #define shuf128_32 shufps /* matches pshufd only when source and destination are the same register, which holds for every use in this file */ | ||
19 | |||
20 | #define extr128_32 pextrd /* extract one 32-bit lane to a register or memory */ | ||
21 | //#define extr128_32 extractps # not shorter | ||
22 | |||
23 | .section .text.sha1_process_block64_shaNI,"ax",@progbits | ||
24 | .globl sha1_process_block64_shaNI | ||
25 | .hidden sha1_process_block64_shaNI | ||
26 | .type sha1_process_block64_shaNI, @function | ||
27 | |||
28 | #define ABCD %xmm0 /* four hash words loaded from 76(%eax) */ | ||
29 | #define E0 %xmm1 /* Need two E's b/c they ping pong */ | ||
30 | #define E1 %xmm2 | ||
31 | #define MSG0 %xmm3 /* MSG0..MSG3: the four 16-byte quarters of the input block, later reused for message-schedule words */ | ||
32 | #define MSG1 %xmm4 | ||
33 | #define MSG2 %xmm5 | ||
34 | #define MSG3 %xmm6 | ||
35 | #define SHUF_MASK %xmm7 /* holds PSHUFFLE_BYTE_FLIP_MASK for pshufb */ | ||
36 | |||
37 | .balign 8 # allow decoders to fetch at least 2 first insns | ||
38 | sha1_process_block64_shaNI: /* Runs the 80 SHA-1 rounds over one 64-byte block using the SHA-NI instructions. All memory access goes through %eax: block bytes at 0..63(%eax), five 32-bit hash words at 76..95(%eax), updated in place. NOTE(review): presumably %eax carries the ctx argument via FAST_FUNC on i386 - confirm */ | ||
39 | pushl %ebp | ||
40 | movl %esp, %ebp | ||
41 | subl $32, %esp /* scratch space for the two 16-byte saved values (ABCD and E0) */ | ||
42 | andl $~0xF, %esp # paddd needs aligned memory operand | ||
43 | |||
44 | /* load initial hash values */ | ||
45 | xor128 E0, E0 | ||
46 | movu128 76(%eax), ABCD | ||
47 | pinsrd $3, 76+4*4(%eax), E0 # load to upper 32-bit word | ||
48 | shuf128_32 $0x1B, ABCD, ABCD # 00011011: bswap | ||
49 | |||
50 | mova128 PSHUFFLE_BYTE_FLIP_MASK, SHUF_MASK | ||
51 | |||
52 | /* Save hash values for addition after rounds */ | ||
53 | movu128 E0, 16(%esp) | ||
54 | movu128 ABCD, (%esp) | ||
55 | |||
56 | /* Rounds 0-3 */ | ||
57 | movu128 0*16(%eax), MSG0 | ||
58 | pshufb SHUF_MASK, MSG0 | ||
59 | paddd MSG0, E0 | ||
60 | mova128 ABCD, E1 | ||
61 | sha1rnds4 $0, E0, ABCD /* the immediate (0..3) selects the round function and constant; it steps up once every 20 rounds */ | ||
62 | |||
63 | /* Rounds 4-7 */ | ||
64 | movu128 1*16(%eax), MSG1 | ||
65 | pshufb SHUF_MASK, MSG1 | ||
66 | sha1nexte MSG1, E1 | ||
67 | mova128 ABCD, E0 | ||
68 | sha1rnds4 $0, E1, ABCD | ||
69 | sha1msg1 MSG1, MSG0 | ||
70 | |||
71 | /* Rounds 8-11 */ | ||
72 | movu128 2*16(%eax), MSG2 | ||
73 | pshufb SHUF_MASK, MSG2 | ||
74 | sha1nexte MSG2, E0 | ||
75 | mova128 ABCD, E1 | ||
76 | sha1rnds4 $0, E0, ABCD | ||
77 | sha1msg1 MSG2, MSG1 | ||
78 | xor128 MSG2, MSG0 | ||
79 | |||
80 | /* Rounds 12-15 */ | ||
81 | movu128 3*16(%eax), MSG3 | ||
82 | pshufb SHUF_MASK, MSG3 | ||
83 | sha1nexte MSG3, E1 | ||
84 | mova128 ABCD, E0 | ||
85 | sha1msg2 MSG3, MSG0 | ||
86 | sha1rnds4 $0, E1, ABCD | ||
87 | sha1msg1 MSG3, MSG2 | ||
88 | xor128 MSG3, MSG1 | ||
89 | |||
90 | /* Rounds 16-19 */ | ||
91 | sha1nexte MSG0, E0 | ||
92 | mova128 ABCD, E1 | ||
93 | sha1msg2 MSG0, MSG1 | ||
94 | sha1rnds4 $0, E0, ABCD | ||
95 | sha1msg1 MSG0, MSG3 | ||
96 | xor128 MSG0, MSG2 | ||
97 | |||
98 | /* Rounds 20-23 */ | ||
99 | sha1nexte MSG1, E1 | ||
100 | mova128 ABCD, E0 | ||
101 | sha1msg2 MSG1, MSG2 | ||
102 | sha1rnds4 $1, E1, ABCD | ||
103 | sha1msg1 MSG1, MSG0 | ||
104 | xor128 MSG1, MSG3 | ||
105 | |||
106 | /* Rounds 24-27 */ | ||
107 | sha1nexte MSG2, E0 | ||
108 | mova128 ABCD, E1 | ||
109 | sha1msg2 MSG2, MSG3 | ||
110 | sha1rnds4 $1, E0, ABCD | ||
111 | sha1msg1 MSG2, MSG1 | ||
112 | xor128 MSG2, MSG0 | ||
113 | |||
114 | /* Rounds 28-31 */ | ||
115 | sha1nexte MSG3, E1 | ||
116 | mova128 ABCD, E0 | ||
117 | sha1msg2 MSG3, MSG0 | ||
118 | sha1rnds4 $1, E1, ABCD | ||
119 | sha1msg1 MSG3, MSG2 | ||
120 | xor128 MSG3, MSG1 | ||
121 | |||
122 | /* Rounds 32-35 */ | ||
123 | sha1nexte MSG0, E0 | ||
124 | mova128 ABCD, E1 | ||
125 | sha1msg2 MSG0, MSG1 | ||
126 | sha1rnds4 $1, E0, ABCD | ||
127 | sha1msg1 MSG0, MSG3 | ||
128 | xor128 MSG0, MSG2 | ||
129 | |||
130 | /* Rounds 36-39 */ | ||
131 | sha1nexte MSG1, E1 | ||
132 | mova128 ABCD, E0 | ||
133 | sha1msg2 MSG1, MSG2 | ||
134 | sha1rnds4 $1, E1, ABCD | ||
135 | sha1msg1 MSG1, MSG0 | ||
136 | xor128 MSG1, MSG3 | ||
137 | |||
138 | /* Rounds 40-43 */ | ||
139 | sha1nexte MSG2, E0 | ||
140 | mova128 ABCD, E1 | ||
141 | sha1msg2 MSG2, MSG3 | ||
142 | sha1rnds4 $2, E0, ABCD | ||
143 | sha1msg1 MSG2, MSG1 | ||
144 | xor128 MSG2, MSG0 | ||
145 | |||
146 | /* Rounds 44-47 */ | ||
147 | sha1nexte MSG3, E1 | ||
148 | mova128 ABCD, E0 | ||
149 | sha1msg2 MSG3, MSG0 | ||
150 | sha1rnds4 $2, E1, ABCD | ||
151 | sha1msg1 MSG3, MSG2 | ||
152 | xor128 MSG3, MSG1 | ||
153 | |||
154 | /* Rounds 48-51 */ | ||
155 | sha1nexte MSG0, E0 | ||
156 | mova128 ABCD, E1 | ||
157 | sha1msg2 MSG0, MSG1 | ||
158 | sha1rnds4 $2, E0, ABCD | ||
159 | sha1msg1 MSG0, MSG3 | ||
160 | xor128 MSG0, MSG2 | ||
161 | |||
162 | /* Rounds 52-55 */ | ||
163 | sha1nexte MSG1, E1 | ||
164 | mova128 ABCD, E0 | ||
165 | sha1msg2 MSG1, MSG2 | ||
166 | sha1rnds4 $2, E1, ABCD | ||
167 | sha1msg1 MSG1, MSG0 | ||
168 | xor128 MSG1, MSG3 | ||
169 | |||
170 | /* Rounds 56-59 */ | ||
171 | sha1nexte MSG2, E0 | ||
172 | mova128 ABCD, E1 | ||
173 | sha1msg2 MSG2, MSG3 | ||
174 | sha1rnds4 $2, E0, ABCD | ||
175 | sha1msg1 MSG2, MSG1 | ||
176 | xor128 MSG2, MSG0 | ||
177 | |||
178 | /* Rounds 60-63 */ | ||
179 | sha1nexte MSG3, E1 | ||
180 | mova128 ABCD, E0 | ||
181 | sha1msg2 MSG3, MSG0 | ||
182 | sha1rnds4 $3, E1, ABCD | ||
183 | sha1msg1 MSG3, MSG2 | ||
184 | xor128 MSG3, MSG1 | ||
185 | |||
186 | /* Rounds 64-67 */ | ||
187 | sha1nexte MSG0, E0 | ||
188 | mova128 ABCD, E1 | ||
189 | sha1msg2 MSG0, MSG1 | ||
190 | sha1rnds4 $3, E0, ABCD | ||
191 | sha1msg1 MSG0, MSG3 | ||
192 | xor128 MSG0, MSG2 | ||
193 | |||
194 | /* Rounds 68-71 */ | ||
195 | sha1nexte MSG1, E1 | ||
196 | mova128 ABCD, E0 | ||
197 | sha1msg2 MSG1, MSG2 | ||
198 | sha1rnds4 $3, E1, ABCD | ||
199 | xor128 MSG1, MSG3 | ||
200 | |||
201 | /* Rounds 72-75 */ | ||
202 | sha1nexte MSG2, E0 | ||
203 | mova128 ABCD, E1 | ||
204 | sha1msg2 MSG2, MSG3 | ||
205 | sha1rnds4 $3, E0, ABCD | ||
206 | |||
207 | /* Rounds 76-79 */ | ||
208 | sha1nexte MSG3, E1 | ||
209 | mova128 ABCD, E0 | ||
210 | sha1rnds4 $3, E1, ABCD | ||
211 | |||
212 | /* Add current hash values with previously saved */ | ||
213 | sha1nexte 16(%esp), E0 | ||
214 | paddd (%esp), ABCD | ||
215 | |||
216 | /* Write hash values back in the correct order */ | ||
217 | shuf128_32 $0x1B, ABCD, ABCD | ||
218 | movu128 ABCD, 76(%eax) | ||
219 | extr128_32 $3, E0, 76+4*4(%eax) | ||
220 | |||
221 | movl %ebp, %esp /* drop the aligned scratch area; %ebp still holds the entry-time stack pointer */ | ||
222 | popl %ebp | ||
223 | ret | ||
224 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | ||
225 | |||
226 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | ||
227 | .align 16 /* the constant is loaded with mova128 (movaps), which needs a 16-byte aligned memory operand */ | ||
228 | PSHUFFLE_BYTE_FLIP_MASK: /* pshufb control mask: used this way it reverses the order of all 16 bytes of a register */ | ||
229 | .octa 0x000102030405060708090a0b0c0d0e0f | ||
230 | |||
231 | #endif | ||