diff options
author | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-07 00:43:59 +0100 |
---|---|---|
committer | Denys Vlasenko <vda.linux@googlemail.com> | 2022-01-07 00:43:59 +0100 |
commit | 711e20ecb85d13f98ba3e2bdcb344ee7534829c4 (patch) | |
tree | 170fa55e39133e3ba7182fa56d1643e25b55010a | |
parent | a93668cc4277b14eaff07fcfdef9693c990ec824 (diff) | |
download | busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.tar.gz busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.tar.bz2 busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.zip |
libbb/sha1: optional x86-64 hardware accelerated hashing
function old new delta
sha1_process_block64_shaNI - 510 +510
sha1_begin 52 107 +55
.rodata 108285 108301 +16
static.shaNI - 1 +1
------------------------------------------------------------------------------
(add/remove: 4/0 grow/shrink: 2/0 up/down: 582/0) Total: 582 bytes
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | libbb/Config.src | 7 | ||||
-rw-r--r-- | libbb/Kbuild.src | 1 | ||||
-rw-r--r-- | libbb/hash_md5_sha.c | 38 | ||||
-rw-r--r-- | libbb/hash_md5_sha_x86-64_shaNI.S | 225 |
4 files changed, 270 insertions, 1 deletions
diff --git a/libbb/Config.src b/libbb/Config.src index c80bee286..708d3b0c8 100644 --- a/libbb/Config.src +++ b/libbb/Config.src | |||
@@ -63,6 +63,13 @@ config SHA1_SMALL | |||
63 | 1 224 229 654 732 | 63 | 1 224 229 654 732 |
64 | 2,3 200 195 358 380 | 64 | 2,3 200 195 358 380 |
65 | 65 | ||
66 | config SHA1_HWACCEL | ||
67 | bool "SHA1: Use hardware accelerated instructions if possible" | ||
68 | default y | ||
69 | help | ||
70 | On x86, this adds ~590 bytes of code. Throughput | ||
71 | is about twice as fast as fully-unrolled generic code. | ||
72 | |||
66 | config SHA3_SMALL | 73 | config SHA3_SMALL |
67 | int "SHA3: Trade bytes for speed (0:fast, 1:slow)" | 74 | int "SHA3: Trade bytes for speed (0:fast, 1:slow)" |
68 | default 1 # all "fast or small" options default to small | 75 | default 1 # all "fast or small" options default to small |
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src index 19b8aad60..a3db02b6f 100644 --- a/libbb/Kbuild.src +++ b/libbb/Kbuild.src | |||
@@ -57,6 +57,7 @@ lib-y += make_directory.o | |||
57 | lib-y += makedev.o | 57 | lib-y += makedev.o |
58 | lib-y += hash_md5_sha.o | 58 | lib-y += hash_md5_sha.o |
59 | lib-y += hash_md5_sha_x86-64.o | 59 | lib-y += hash_md5_sha_x86-64.o |
60 | lib-y += hash_md5_sha_x86-64_shaNI.o | ||
60 | # Alternative (disabled) MD5 implementation | 61 | # Alternative (disabled) MD5 implementation |
61 | #lib-y += hash_md5prime.o | 62 | #lib-y += hash_md5prime.o |
62 | lib-y += messages.o | 63 | lib-y += messages.o |
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c index ee19c1cb7..4c6904b48 100644 --- a/libbb/hash_md5_sha.c +++ b/libbb/hash_md5_sha.c | |||
@@ -699,7 +699,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM) | |||
699 | 699 | ||
700 | /* in hash_md5_sha_x86-64.S */ | 700 | /* in hash_md5_sha_x86-64.S */ |
701 | struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; | 701 | struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; |
702 | void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM); | 702 | void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx); |
703 | 703 | ||
704 | # else | 704 | # else |
705 | /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. | 705 | /* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. |
@@ -1142,6 +1142,28 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx) | |||
1142 | } | 1142 | } |
1143 | #endif /* NEED_SHA512 */ | 1143 | #endif /* NEED_SHA512 */ |
1144 | 1144 | ||
1145 | #if ENABLE_SHA1_HWACCEL | ||
1146 | # if defined(__GNUC__) && defined(__x86_64__) | ||
1147 | static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) | ||
1148 | { | ||
1149 | asm ( | ||
1150 | "cpuid\n" | ||
1151 | : "=a"(*eax), /* Output */ | ||
1152 | "=b"(*ebx), | ||
1153 | "=c"(*ecx), | ||
1154 | "=d"(*edx) | ||
1155 | : "0"(*eax), /* Input */ | ||
1156 | "1"(*ebx), | ||
1157 | "2"(*ecx), | ||
1158 | "3"(*edx) | ||
1159 | /* No clobbered registers */ | ||
1160 | ); | ||
1161 | } | ||
1162 | struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; | ||
1163 | void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx); | ||
1164 | # endif | ||
1165 | #endif | ||
1166 | |||
1145 | void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) | 1167 | void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) |
1146 | { | 1168 | { |
1147 | ctx->hash[0] = 0x67452301; | 1169 | ctx->hash[0] = 0x67452301; |
@@ -1151,6 +1173,20 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) | |||
1151 | ctx->hash[4] = 0xc3d2e1f0; | 1173 | ctx->hash[4] = 0xc3d2e1f0; |
1152 | ctx->total64 = 0; | 1174 | ctx->total64 = 0; |
1153 | ctx->process_block = sha1_process_block64; | 1175 | ctx->process_block = sha1_process_block64; |
1176 | #if ENABLE_SHA1_HWACCEL | ||
1177 | # if defined(__GNUC__) && defined(__x86_64__) | ||
1178 | { | ||
1179 | static smallint shaNI; | ||
1180 | if (!shaNI) { | ||
1181 | unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx; | ||
1182 | cpuid(&eax, &ebx, &ecx, &edx); | ||
1183 | shaNI = ((ebx >> 28) & 2) - 1; | ||
1184 | } | ||
1185 | if (shaNI > 0) | ||
1186 | ctx->process_block = sha1_process_block64_shaNI; | ||
1187 | } | ||
1188 | # endif | ||
1189 | #endif | ||
1154 | } | 1190 | } |
1155 | 1191 | ||
1156 | static const uint32_t init256[] ALIGN4 = { | 1192 | static const uint32_t init256[] ALIGN4 = { |
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S new file mode 100644 index 000000000..473b472f1 --- /dev/null +++ b/libbb/hash_md5_sha_x86-64_shaNI.S | |||
@@ -0,0 +1,225 @@ | |||
1 | #if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__) | ||
2 | /* The code is adapted from Linux kernel's source */ | ||
3 | |||
4 | // We use shorter insns, even though they are for "wrong" | ||
5 | // data type (fp, not int). | ||
6 | // For Intel, there is no penalty for doing it at all | ||
7 | // (CPUs which do have such penalty do not support SHA1 insns). | ||
8 | // For AMD, the penalty is one extra cycle | ||
9 | // (allegedly: I failed to find measurable difference). | ||
10 | |||
11 | //#define mova128 movdqa | ||
12 | #define mova128 movaps | ||
13 | //#define movu128 movdqu | ||
14 | #define movu128 movups | ||
15 | //#define xor128 pxor | ||
16 | #define xor128 xorps | ||
17 | //#define shuf128_32 pshufd | ||
18 | #define shuf128_32 shufps | ||
19 | |||
20 | #define extr128_32 pextrd | ||
21 | //#define extr128_32 extractps # not shorter | ||
22 | |||
23 | .section .text.sha1_process_block64_shaNI,"ax",@progbits | ||
24 | .globl sha1_process_block64_shaNI | ||
25 | .hidden sha1_process_block64_shaNI | ||
26 | .type sha1_process_block64_shaNI, @function | ||
27 | |||
28 | #define ABCD %xmm0 | ||
29 | #define E0 %xmm1 /* Need two E's b/c they ping pong */ | ||
30 | #define E1 %xmm2 | ||
31 | #define MSG0 %xmm3 | ||
32 | #define MSG1 %xmm4 | ||
33 | #define MSG2 %xmm5 | ||
34 | #define MSG3 %xmm6 | ||
35 | #define SHUF_MASK %xmm7 | ||
36 | |||
37 | .balign 8 # allow decoders to fetch at least 2 first insns | ||
38 | sha1_process_block64_shaNI: | ||
39 | /* load initial hash values */ | ||
40 | |||
41 | xor128 E0, E0 | ||
42 | movu128 80(%rdi), ABCD | ||
43 | pinsrd $3, 80+4*4(%rdi), E0 # load to upper 32-bit word | ||
44 | shuf128_32 $0x1B, ABCD, ABCD # 00011011: bswap | ||
45 | |||
46 | mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK | ||
47 | |||
48 | /* Save hash values for addition after rounds */ | ||
49 | mova128 E0, %xmm9 | ||
50 | mova128 ABCD, %xmm8 | ||
51 | |||
52 | /* Rounds 0-3 */ | ||
53 | movu128 0*16(%rdi), MSG0 | ||
54 | pshufb SHUF_MASK, MSG0 | ||
55 | paddd MSG0, E0 | ||
56 | mova128 ABCD, E1 | ||
57 | sha1rnds4 $0, E0, ABCD | ||
58 | |||
59 | /* Rounds 4-7 */ | ||
60 | movu128 1*16(%rdi), MSG1 | ||
61 | pshufb SHUF_MASK, MSG1 | ||
62 | sha1nexte MSG1, E1 | ||
63 | mova128 ABCD, E0 | ||
64 | sha1rnds4 $0, E1, ABCD | ||
65 | sha1msg1 MSG1, MSG0 | ||
66 | |||
67 | /* Rounds 8-11 */ | ||
68 | movu128 2*16(%rdi), MSG2 | ||
69 | pshufb SHUF_MASK, MSG2 | ||
70 | sha1nexte MSG2, E0 | ||
71 | mova128 ABCD, E1 | ||
72 | sha1rnds4 $0, E0, ABCD | ||
73 | sha1msg1 MSG2, MSG1 | ||
74 | xor128 MSG2, MSG0 | ||
75 | |||
76 | /* Rounds 12-15 */ | ||
77 | movu128 3*16(%rdi), MSG3 | ||
78 | pshufb SHUF_MASK, MSG3 | ||
79 | sha1nexte MSG3, E1 | ||
80 | mova128 ABCD, E0 | ||
81 | sha1msg2 MSG3, MSG0 | ||
82 | sha1rnds4 $0, E1, ABCD | ||
83 | sha1msg1 MSG3, MSG2 | ||
84 | xor128 MSG3, MSG1 | ||
85 | |||
86 | /* Rounds 16-19 */ | ||
87 | sha1nexte MSG0, E0 | ||
88 | mova128 ABCD, E1 | ||
89 | sha1msg2 MSG0, MSG1 | ||
90 | sha1rnds4 $0, E0, ABCD | ||
91 | sha1msg1 MSG0, MSG3 | ||
92 | xor128 MSG0, MSG2 | ||
93 | |||
94 | /* Rounds 20-23 */ | ||
95 | sha1nexte MSG1, E1 | ||
96 | mova128 ABCD, E0 | ||
97 | sha1msg2 MSG1, MSG2 | ||
98 | sha1rnds4 $1, E1, ABCD | ||
99 | sha1msg1 MSG1, MSG0 | ||
100 | xor128 MSG1, MSG3 | ||
101 | |||
102 | /* Rounds 24-27 */ | ||
103 | sha1nexte MSG2, E0 | ||
104 | mova128 ABCD, E1 | ||
105 | sha1msg2 MSG2, MSG3 | ||
106 | sha1rnds4 $1, E0, ABCD | ||
107 | sha1msg1 MSG2, MSG1 | ||
108 | xor128 MSG2, MSG0 | ||
109 | |||
110 | /* Rounds 28-31 */ | ||
111 | sha1nexte MSG3, E1 | ||
112 | mova128 ABCD, E0 | ||
113 | sha1msg2 MSG3, MSG0 | ||
114 | sha1rnds4 $1, E1, ABCD | ||
115 | sha1msg1 MSG3, MSG2 | ||
116 | xor128 MSG3, MSG1 | ||
117 | |||
118 | /* Rounds 32-35 */ | ||
119 | sha1nexte MSG0, E0 | ||
120 | mova128 ABCD, E1 | ||
121 | sha1msg2 MSG0, MSG1 | ||
122 | sha1rnds4 $1, E0, ABCD | ||
123 | sha1msg1 MSG0, MSG3 | ||
124 | xor128 MSG0, MSG2 | ||
125 | |||
126 | /* Rounds 36-39 */ | ||
127 | sha1nexte MSG1, E1 | ||
128 | mova128 ABCD, E0 | ||
129 | sha1msg2 MSG1, MSG2 | ||
130 | sha1rnds4 $1, E1, ABCD | ||
131 | sha1msg1 MSG1, MSG0 | ||
132 | xor128 MSG1, MSG3 | ||
133 | |||
134 | /* Rounds 40-43 */ | ||
135 | sha1nexte MSG2, E0 | ||
136 | mova128 ABCD, E1 | ||
137 | sha1msg2 MSG2, MSG3 | ||
138 | sha1rnds4 $2, E0, ABCD | ||
139 | sha1msg1 MSG2, MSG1 | ||
140 | xor128 MSG2, MSG0 | ||
141 | |||
142 | /* Rounds 44-47 */ | ||
143 | sha1nexte MSG3, E1 | ||
144 | mova128 ABCD, E0 | ||
145 | sha1msg2 MSG3, MSG0 | ||
146 | sha1rnds4 $2, E1, ABCD | ||
147 | sha1msg1 MSG3, MSG2 | ||
148 | xor128 MSG3, MSG1 | ||
149 | |||
150 | /* Rounds 48-51 */ | ||
151 | sha1nexte MSG0, E0 | ||
152 | mova128 ABCD, E1 | ||
153 | sha1msg2 MSG0, MSG1 | ||
154 | sha1rnds4 $2, E0, ABCD | ||
155 | sha1msg1 MSG0, MSG3 | ||
156 | xor128 MSG0, MSG2 | ||
157 | |||
158 | /* Rounds 52-55 */ | ||
159 | sha1nexte MSG1, E1 | ||
160 | mova128 ABCD, E0 | ||
161 | sha1msg2 MSG1, MSG2 | ||
162 | sha1rnds4 $2, E1, ABCD | ||
163 | sha1msg1 MSG1, MSG0 | ||
164 | xor128 MSG1, MSG3 | ||
165 | |||
166 | /* Rounds 56-59 */ | ||
167 | sha1nexte MSG2, E0 | ||
168 | mova128 ABCD, E1 | ||
169 | sha1msg2 MSG2, MSG3 | ||
170 | sha1rnds4 $2, E0, ABCD | ||
171 | sha1msg1 MSG2, MSG1 | ||
172 | xor128 MSG2, MSG0 | ||
173 | |||
174 | /* Rounds 60-63 */ | ||
175 | sha1nexte MSG3, E1 | ||
176 | mova128 ABCD, E0 | ||
177 | sha1msg2 MSG3, MSG0 | ||
178 | sha1rnds4 $3, E1, ABCD | ||
179 | sha1msg1 MSG3, MSG2 | ||
180 | xor128 MSG3, MSG1 | ||
181 | |||
182 | /* Rounds 64-67 */ | ||
183 | sha1nexte MSG0, E0 | ||
184 | mova128 ABCD, E1 | ||
185 | sha1msg2 MSG0, MSG1 | ||
186 | sha1rnds4 $3, E0, ABCD | ||
187 | sha1msg1 MSG0, MSG3 | ||
188 | xor128 MSG0, MSG2 | ||
189 | |||
190 | /* Rounds 68-71 */ | ||
191 | sha1nexte MSG1, E1 | ||
192 | mova128 ABCD, E0 | ||
193 | sha1msg2 MSG1, MSG2 | ||
194 | sha1rnds4 $3, E1, ABCD | ||
195 | xor128 MSG1, MSG3 | ||
196 | |||
197 | /* Rounds 72-75 */ | ||
198 | sha1nexte MSG2, E0 | ||
199 | mova128 ABCD, E1 | ||
200 | sha1msg2 MSG2, MSG3 | ||
201 | sha1rnds4 $3, E0, ABCD | ||
202 | |||
203 | /* Rounds 76-79 */ | ||
204 | sha1nexte MSG3, E1 | ||
205 | mova128 ABCD, E0 | ||
206 | sha1rnds4 $3, E1, ABCD | ||
207 | |||
208 | /* Add current hash values with previously saved */ | ||
209 | sha1nexte %xmm9, E0 | ||
210 | paddd %xmm8, ABCD | ||
211 | |||
212 | /* Write hash values back in the correct order */ | ||
213 | shuf128_32 $0x1B, ABCD, ABCD | ||
214 | movu128 ABCD, 80(%rdi) | ||
215 | extr128_32 $3, E0, 80+4*4(%rdi) | ||
216 | |||
217 | ret | ||
218 | .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI | ||
219 | |||
220 | .section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16 | ||
221 | .align 16 | ||
222 | PSHUFFLE_BYTE_FLIP_MASK: | ||
223 | .octa 0x000102030405060708090a0b0c0d0e0f | ||
224 | |||
225 | #endif | ||