aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDenys Vlasenko <vda.linux@googlemail.com>2022-01-07 00:43:59 +0100
committerDenys Vlasenko <vda.linux@googlemail.com>2022-01-07 00:43:59 +0100
commit711e20ecb85d13f98ba3e2bdcb344ee7534829c4 (patch)
tree170fa55e39133e3ba7182fa56d1643e25b55010a
parenta93668cc4277b14eaff07fcfdef9693c990ec824 (diff)
downloadbusybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.tar.gz
busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.tar.bz2
busybox-w32-711e20ecb85d13f98ba3e2bdcb344ee7534829c4.zip
libbb/sha1: optional x86-64 hardware accelerated hashing
function                                             old     new   delta
sha1_process_block64_shaNI                             -     510    +510
sha1_begin                                            52     107     +55
.rodata                                           108285  108301     +16
static.shaNI                                           -       1      +1
------------------------------------------------------------------------------
(add/remove: 4/0 grow/shrink: 2/0 up/down: 582/0)             Total: 582 bytes

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r--libbb/Config.src7
-rw-r--r--libbb/Kbuild.src1
-rw-r--r--libbb/hash_md5_sha.c38
-rw-r--r--libbb/hash_md5_sha_x86-64_shaNI.S225
4 files changed, 270 insertions, 1 deletions
diff --git a/libbb/Config.src b/libbb/Config.src
index c80bee286..708d3b0c8 100644
--- a/libbb/Config.src
+++ b/libbb/Config.src
@@ -63,6 +63,13 @@ config SHA1_SMALL
63 1 224 229 654 732 63 1 224 229 654 732
64 2,3 200 195 358 380 64 2,3 200 195 358 380
65 65
66config SHA1_HWACCEL
67 bool "SHA1: Use hardware accelerated instructions if possible"
68 default y
69 help
70 On x86, this adds ~590 bytes of code. Throughput
71 is about twice as fast as fully-unrolled generic code.
72
66config SHA3_SMALL 73config SHA3_SMALL
67 int "SHA3: Trade bytes for speed (0:fast, 1:slow)" 74 int "SHA3: Trade bytes for speed (0:fast, 1:slow)"
68 default 1 # all "fast or small" options default to small 75 default 1 # all "fast or small" options default to small
diff --git a/libbb/Kbuild.src b/libbb/Kbuild.src
index 19b8aad60..a3db02b6f 100644
--- a/libbb/Kbuild.src
+++ b/libbb/Kbuild.src
@@ -57,6 +57,7 @@ lib-y += make_directory.o
57lib-y += makedev.o 57lib-y += makedev.o
58lib-y += hash_md5_sha.o 58lib-y += hash_md5_sha.o
59lib-y += hash_md5_sha_x86-64.o 59lib-y += hash_md5_sha_x86-64.o
60lib-y += hash_md5_sha_x86-64_shaNI.o
60# Alternative (disabled) MD5 implementation 61# Alternative (disabled) MD5 implementation
61#lib-y += hash_md5prime.o 62#lib-y += hash_md5prime.o
62lib-y += messages.o 63lib-y += messages.o
diff --git a/libbb/hash_md5_sha.c b/libbb/hash_md5_sha.c
index ee19c1cb7..4c6904b48 100644
--- a/libbb/hash_md5_sha.c
+++ b/libbb/hash_md5_sha.c
@@ -699,7 +699,7 @@ static void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM)
699 699
700/* in hash_md5_sha_x86-64.S */ 700/* in hash_md5_sha_x86-64.S */
701struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; }; 701struct ASM_expects_80 { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
702void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx UNUSED_PARAM); 702void FAST_FUNC sha1_process_block64(sha1_ctx_t *ctx);
703 703
704# else 704# else
705/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86. 705/* Fast, fully-unrolled SHA1. +3800 bytes of code on x86.
@@ -1142,6 +1142,28 @@ static void FAST_FUNC sha512_process_block128(sha512_ctx_t *ctx)
1142} 1142}
1143#endif /* NEED_SHA512 */ 1143#endif /* NEED_SHA512 */
1144 1144
1145#if ENABLE_SHA1_HWACCEL
1146# if defined(__GNUC__) && defined(__x86_64__)
/*
 * Execute the x86 CPUID instruction.
 *
 * All four parameters are in/out: on entry *eax, *ebx, *ecx and *edx are
 * loaded into EAX, EBX, ECX and EDX (the "0".."3" matching constraints tie
 * each input to the same register as the correspondingly numbered output);
 * on return they hold whatever the instruction left in those registers.
 * The asm declares no clobbers beyond these four registers.
 */
1147static void cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
1148{
1149 asm (
1150 "cpuid\n"
1151 : "=a"(*eax), /* Output */
1152 "=b"(*ebx),
1153 "=c"(*ecx),
1154 "=d"(*edx)
1155 : "0"(*eax), /* Input */
1156 "1"(*ebx),
1157 "2"(*ecx),
1158 "3"(*edx)
1159 /* No clobbered registers */
1160 );
1161}
/*
 * Compile-time layout check: the array size evaluates to 1 when
 * sha1_ctx_t.hash is at byte offset 80 and to -1 (a compile error)
 * otherwise. hash_md5_sha_x86-64_shaNI.S hardcodes that offset
 * in its 80(%rdi) operands.
 */
1162struct ASM_expects_80_shaNI { char t[1 - 2*(offsetof(sha1_ctx_t, hash) != 80)]; };
/* Implemented in hash_md5_sha_x86-64_shaNI.S */
1163void FAST_FUNC sha1_process_block64_shaNI(sha1_ctx_t *ctx);
1164# endif
1165#endif
1166
1145void FAST_FUNC sha1_begin(sha1_ctx_t *ctx) 1167void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
1146{ 1168{
1147 ctx->hash[0] = 0x67452301; 1169 ctx->hash[0] = 0x67452301;
@@ -1151,6 +1173,20 @@ void FAST_FUNC sha1_begin(sha1_ctx_t *ctx)
1151 ctx->hash[4] = 0xc3d2e1f0; 1173 ctx->hash[4] = 0xc3d2e1f0;
1152 ctx->total64 = 0; 1174 ctx->total64 = 0;
1153 ctx->process_block = sha1_process_block64; 1175 ctx->process_block = sha1_process_block64;
/*
 * Optional run-time dispatch: if the CPU reports the feature bit tested
 * below, replace the generic block function installed just above with
 * the SHA-NI assembly routine.
 */
1176#if ENABLE_SHA1_HWACCEL
1177# if defined(__GNUC__) && defined(__x86_64__)
1178 {
 /*
  * Cached probe result, shared by all contexts:
  *  0 = not probed yet (initial value of a static),
  *  1 = feature bit set, -1 = feature bit clear.
  * NOTE(review): relies on smallint being a signed type so that
  * -1 fails the "> 0" test below - confirm against its typedef.
  */
1179 static smallint shaNI;
1180 if (!shaNI) {
 /*
  * Query CPUID with EAX=7, ECX=0. ebx and edx are self-initialized
  * (value irrelevant on input); presumably this avoids an
  * "uninitialized" warning without emitting a store - TODO confirm.
  */
1181 unsigned eax = 7, ebx = ebx, ecx = 0, edx = edx;
1182 cpuid(&eax, &ebx, &ecx, &edx);
 /*
  * (ebx >> 28) & 2 isolates EBX bit 29 as the value 2 or 0;
  * subtracting 1 yields 1 or -1, never 0, so the probe is
  * not repeated on later calls.
  */
1183 shaNI = ((ebx >> 28) & 2) - 1;
1184 }
1185 if (shaNI > 0)
1186 ctx->process_block = sha1_process_block64_shaNI;
1187 }
1188# endif
1189#endif
1154} 1190}
1155 1191
1156static const uint32_t init256[] ALIGN4 = { 1192static const uint32_t init256[] ALIGN4 = {
diff --git a/libbb/hash_md5_sha_x86-64_shaNI.S b/libbb/hash_md5_sha_x86-64_shaNI.S
new file mode 100644
index 000000000..473b472f1
--- /dev/null
+++ b/libbb/hash_md5_sha_x86-64_shaNI.S
@@ -0,0 +1,225 @@
1#if ENABLE_SHA1_HWACCEL && defined(__GNUC__) && defined(__x86_64__)
2/* The code is adapted from Linux kernel's source */
3
4// We use shorter insns, even though they are for "wrong"
5// data type (fp, not int).
6// For Intel, there is no penalty for doing it at all
7// (CPUs which do have such penalty do not support SHA1 insns).
8// For AMD, the penalty is one extra cycle
9// (allegedly: I failed to find measurable difference).
10
11//#define mova128 movdqa
12#define mova128 movaps
13//#define movu128 movdqu
14#define movu128 movups
15//#define xor128 pxor
16#define xor128 xorps
17//#define shuf128_32 pshufd
18#define shuf128_32 shufps
19
20#define extr128_32 pextrd
21//#define extr128_32 extractps # not shorter
22
23 .section .text.sha1_process_block64_shaNI,"ax",@progbits
24 .globl sha1_process_block64_shaNI
25 .hidden sha1_process_block64_shaNI
26 .type sha1_process_block64_shaNI, @function
27
28#define ABCD %xmm0
29#define E0 %xmm1 /* Need two E's b/c they ping pong */
30#define E1 %xmm2
31#define MSG0 %xmm3
32#define MSG1 %xmm4
33#define MSG2 %xmm5
34#define MSG3 %xmm6
35#define SHUF_MASK %xmm7
36
37 .balign 8 # allow decoders to fetch at least 2 first insns
/*
 * void sha1_process_block64_shaNI(sha1_ctx_t *ctx)
 *
 * Processes one 64-byte block using the SHA1 instructions.
 * The context pointer is taken from %rdi. The message block is read from
 * bytes 0..63 of the context (four 16-byte loads), the hash words A..D from
 * offset 80 and E from offset 80+16; the updated hash is written back to
 * the same offsets.
 *
 * Registers used: %xmm0-%xmm7 via the aliases defined above, plus
 * %xmm8/%xmm9 as scratch for the incoming hash. No stack is used.
 *
 * Structure: 20 groups of 4 rounds. E0 and E1 alternate as the E operand
 * from one group to the next. The sha1rnds4 immediate is $0 for rounds
 * 0-19, $1 for 20-39, $2 for 40-59 and $3 for 60-79. MSG0..MSG3 are reused
 * in rotation for the message schedule via sha1msg1 / xor128 / sha1msg2
 * (see the Intel SDM for the exact semantics of these instructions).
 */
38sha1_process_block64_shaNI:
39 /* load initial hash values */
40
41 xor128 E0, E0
42 movu128 80(%rdi), ABCD
43 pinsrd $3, 80+4*4(%rdi), E0 # load to upper 32-bit word
44 shuf128_32 $0x1B, ABCD, ABCD # 00011011: bswap
45
46 mova128 PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
47
48 /* Save hash values for addition after rounds */
49 mova128 E0, %xmm9
50 mova128 ABCD, %xmm8
51
52 /* Rounds 0-3 */
53 movu128 0*16(%rdi), MSG0
54 pshufb SHUF_MASK, MSG0
55 paddd MSG0, E0
56 mova128 ABCD, E1
57 sha1rnds4 $0, E0, ABCD
58
59 /* Rounds 4-7 */
60 movu128 1*16(%rdi), MSG1
61 pshufb SHUF_MASK, MSG1
62 sha1nexte MSG1, E1
63 mova128 ABCD, E0
64 sha1rnds4 $0, E1, ABCD
65 sha1msg1 MSG1, MSG0
66
67 /* Rounds 8-11 */
68 movu128 2*16(%rdi), MSG2
69 pshufb SHUF_MASK, MSG2
70 sha1nexte MSG2, E0
71 mova128 ABCD, E1
72 sha1rnds4 $0, E0, ABCD
73 sha1msg1 MSG2, MSG1
74 xor128 MSG2, MSG0
75
76 /* Rounds 12-15 */
 /* Last of the four loads from the message block */
77 movu128 3*16(%rdi), MSG3
78 pshufb SHUF_MASK, MSG3
79 sha1nexte MSG3, E1
80 mova128 ABCD, E0
81 sha1msg2 MSG3, MSG0
82 sha1rnds4 $0, E1, ABCD
83 sha1msg1 MSG3, MSG2
84 xor128 MSG3, MSG1
85
86 /* Rounds 16-19 */
87 sha1nexte MSG0, E0
88 mova128 ABCD, E1
89 sha1msg2 MSG0, MSG1
90 sha1rnds4 $0, E0, ABCD
91 sha1msg1 MSG0, MSG3
92 xor128 MSG0, MSG2
93
 /* sha1rnds4 immediate is $1 for rounds 20-39 */
94 /* Rounds 20-23 */
95 sha1nexte MSG1, E1
96 mova128 ABCD, E0
97 sha1msg2 MSG1, MSG2
98 sha1rnds4 $1, E1, ABCD
99 sha1msg1 MSG1, MSG0
100 xor128 MSG1, MSG3
101
102 /* Rounds 24-27 */
103 sha1nexte MSG2, E0
104 mova128 ABCD, E1
105 sha1msg2 MSG2, MSG3
106 sha1rnds4 $1, E0, ABCD
107 sha1msg1 MSG2, MSG1
108 xor128 MSG2, MSG0
109
110 /* Rounds 28-31 */
111 sha1nexte MSG3, E1
112 mova128 ABCD, E0
113 sha1msg2 MSG3, MSG0
114 sha1rnds4 $1, E1, ABCD
115 sha1msg1 MSG3, MSG2
116 xor128 MSG3, MSG1
117
118 /* Rounds 32-35 */
119 sha1nexte MSG0, E0
120 mova128 ABCD, E1
121 sha1msg2 MSG0, MSG1
122 sha1rnds4 $1, E0, ABCD
123 sha1msg1 MSG0, MSG3
124 xor128 MSG0, MSG2
125
126 /* Rounds 36-39 */
127 sha1nexte MSG1, E1
128 mova128 ABCD, E0
129 sha1msg2 MSG1, MSG2
130 sha1rnds4 $1, E1, ABCD
131 sha1msg1 MSG1, MSG0
132 xor128 MSG1, MSG3
133
 /* sha1rnds4 immediate is $2 for rounds 40-59 */
134 /* Rounds 40-43 */
135 sha1nexte MSG2, E0
136 mova128 ABCD, E1
137 sha1msg2 MSG2, MSG3
138 sha1rnds4 $2, E0, ABCD
139 sha1msg1 MSG2, MSG1
140 xor128 MSG2, MSG0
141
142 /* Rounds 44-47 */
143 sha1nexte MSG3, E1
144 mova128 ABCD, E0
145 sha1msg2 MSG3, MSG0
146 sha1rnds4 $2, E1, ABCD
147 sha1msg1 MSG3, MSG2
148 xor128 MSG3, MSG1
149
150 /* Rounds 48-51 */
151 sha1nexte MSG0, E0
152 mova128 ABCD, E1
153 sha1msg2 MSG0, MSG1
154 sha1rnds4 $2, E0, ABCD
155 sha1msg1 MSG0, MSG3
156 xor128 MSG0, MSG2
157
158 /* Rounds 52-55 */
159 sha1nexte MSG1, E1
160 mova128 ABCD, E0
161 sha1msg2 MSG1, MSG2
162 sha1rnds4 $2, E1, ABCD
163 sha1msg1 MSG1, MSG0
164 xor128 MSG1, MSG3
165
166 /* Rounds 56-59 */
167 sha1nexte MSG2, E0
168 mova128 ABCD, E1
169 sha1msg2 MSG2, MSG3
170 sha1rnds4 $2, E0, ABCD
171 sha1msg1 MSG2, MSG1
172 xor128 MSG2, MSG0
173
 /* sha1rnds4 immediate is $3 for rounds 60-79 */
174 /* Rounds 60-63 */
175 sha1nexte MSG3, E1
176 mova128 ABCD, E0
177 sha1msg2 MSG3, MSG0
178 sha1rnds4 $3, E1, ABCD
179 sha1msg1 MSG3, MSG2
180 xor128 MSG3, MSG1
181
182 /* Rounds 64-67 */
183 sha1nexte MSG0, E0
184 mova128 ABCD, E1
185 sha1msg2 MSG0, MSG1
186 sha1rnds4 $3, E0, ABCD
187 sha1msg1 MSG0, MSG3
188 xor128 MSG0, MSG2
189
 /* From here on the schedule steps taper off: no sha1msg1 below */
190 /* Rounds 68-71 */
191 sha1nexte MSG1, E1
192 mova128 ABCD, E0
193 sha1msg2 MSG1, MSG2
194 sha1rnds4 $3, E1, ABCD
195 xor128 MSG1, MSG3
196
197 /* Rounds 72-75 */
198 sha1nexte MSG2, E0
199 mova128 ABCD, E1
200 sha1msg2 MSG2, MSG3
201 sha1rnds4 $3, E0, ABCD
202
203 /* Rounds 76-79 */
204 sha1nexte MSG3, E1
205 mova128 ABCD, E0
206 sha1rnds4 $3, E1, ABCD
207
208 /* Add current hash values with previously saved */
209 sha1nexte %xmm9, E0
210 paddd %xmm8, ABCD
211
212 /* Write hash values back in the correct order */
 /* Same $0x1B shuffle as on load, then A..D to offset 80 and E to 80+16 */
213 shuf128_32 $0x1B, ABCD, ABCD
214 movu128 ABCD, 80(%rdi)
215 extr128_32 $3, E0, 80+4*4(%rdi)
216
217 ret
218 .size sha1_process_block64_shaNI, .-sha1_process_block64_shaNI
219
/*
 * 16-byte pshufb control mask applied to each loaded message chunk.
 * Stored little-endian, the .octa value puts 0x0f in byte 0 and 0x00 in
 * byte 15, so the shuffle reverses all 16 bytes of the register.
 * The routine loads it with mova128 (an aligned load), hence the
 * 16-byte alignment.
 */
220.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
221.align 16
222PSHUFFLE_BYTE_FLIP_MASK:
223 .octa 0x000102030405060708090a0b0c0d0e0f
224
225#endif