diff options
author | jsing <> | 2024-12-04 13:13:33 +0000 |
---|---|---|
committer | jsing <> | 2024-12-04 13:13:33 +0000 |
commit | 1c3ce6cc8e538cecc33ed58f89d969af28952dea (patch) | |
tree | 6adf1634c082704fca00fea488f843d1345662b2 /src | |
parent | 54b7e03a99e6dbd79315380653c1bf578c8444b0 (diff) | |
download | openbsd-1c3ce6cc8e538cecc33ed58f89d969af28952dea.tar.gz openbsd-1c3ce6cc8e538cecc33ed58f89d969af28952dea.tar.bz2 openbsd-1c3ce6cc8e538cecc33ed58f89d969af28952dea.zip |
Provide a replacement assembly implementation for SHA-1 on amd64.
As already done for SHA-256 and SHA-512, replace the perlasm generated
SHA-1 assembly implementation with one that is actually readable. Call the
assembly implementation from a C wrapper that can, in the future, dispatch
to alternate implementations. On a modern CPU the performance is around
5% faster than the base implementation generated by sha1-x86_64.pl, however
it is around 15% slower than the excessively complex SSSE3/AVX version that
is also generated by the same script (a SHA-NI version will greatly
outperform this and is much cleaner/simpler).
ok tb@
Diffstat (limited to 'src')
-rw-r--r-- | src/lib/libcrypto/arch/amd64/Makefile.inc | 5 | ||||
-rw-r--r-- | src/lib/libcrypto/sha/sha1_amd64.c | 28 | ||||
-rw-r--r-- | src/lib/libcrypto/sha/sha1_amd64_generic.S | 314 |
3 files changed, 345 insertions, 2 deletions
diff --git a/src/lib/libcrypto/arch/amd64/Makefile.inc b/src/lib/libcrypto/arch/amd64/Makefile.inc index fe22385633..33c7dbba26 100644 --- a/src/lib/libcrypto/arch/amd64/Makefile.inc +++ b/src/lib/libcrypto/arch/amd64/Makefile.inc | |||
@@ -1,4 +1,4 @@ | |||
1 | # $OpenBSD: Makefile.inc,v 1.33 2024/11/16 15:31:36 jsing Exp $ | 1 | # $OpenBSD: Makefile.inc,v 1.34 2024/12/04 13:13:33 jsing Exp $ |
2 | 2 | ||
3 | # amd64-specific libcrypto build rules | 3 | # amd64-specific libcrypto build rules |
4 | 4 | ||
@@ -49,7 +49,8 @@ SSLASM+= rc4 rc4-x86_64 | |||
49 | # ripemd | 49 | # ripemd |
50 | # sha | 50 | # sha |
51 | CFLAGS+= -DSHA1_ASM | 51 | CFLAGS+= -DSHA1_ASM |
52 | SSLASM+= sha sha1-x86_64 | 52 | SRCS+= sha1_amd64.c |
53 | SRCS+= sha1_amd64_generic.S | ||
53 | CFLAGS+= -DSHA256_ASM | 54 | CFLAGS+= -DSHA256_ASM |
54 | SRCS+= sha256_amd64.c | 55 | SRCS+= sha256_amd64.c |
55 | SRCS+= sha256_amd64_generic.S | 56 | SRCS+= sha256_amd64_generic.S |
diff --git a/src/lib/libcrypto/sha/sha1_amd64.c b/src/lib/libcrypto/sha/sha1_amd64.c new file mode 100644 index 0000000000..b3d4ab1263 --- /dev/null +++ b/src/lib/libcrypto/sha/sha1_amd64.c | |||
@@ -0,0 +1,28 @@ | |||
1 | /* $OpenBSD: sha1_amd64.c,v 1.1 2024/12/04 13:13:33 jsing Exp $ */ | ||
2 | /* | ||
3 | * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> | ||
4 | * | ||
5 | * Permission to use, copy, modify, and distribute this software for any | ||
6 | * purpose with or without fee is hereby granted, provided that the above | ||
7 | * copyright notice and this permission notice appear in all copies. | ||
8 | * | ||
9 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | */ | ||
17 | |||
18 | #include <openssl/sha.h> | ||
19 | |||
20 | #include "crypto_arch.h" | ||
21 | |||
22 | void sha1_block_generic(SHA_CTX *ctx, const void *in, size_t num); | ||
23 | |||
24 | void | ||
25 | sha1_block_data_order(SHA_CTX *ctx, const void *in, size_t num) | ||
26 | { | ||
27 | sha1_block_generic(ctx, in, num); | ||
28 | } | ||
diff --git a/src/lib/libcrypto/sha/sha1_amd64_generic.S b/src/lib/libcrypto/sha/sha1_amd64_generic.S new file mode 100644 index 0000000000..d3e184dbca --- /dev/null +++ b/src/lib/libcrypto/sha/sha1_amd64_generic.S | |||
@@ -0,0 +1,314 @@ | |||
1 | /* $OpenBSD: sha1_amd64_generic.S,v 1.1 2024/12/04 13:13:33 jsing Exp $ */ | ||
2 | /* | ||
3 | * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> | ||
4 | * | ||
5 | * Permission to use, copy, modify, and distribute this software for any | ||
6 | * purpose with or without fee is hereby granted, provided that the above | ||
7 | * copyright notice and this permission notice appear in all copies. | ||
8 | * | ||
9 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | */ | ||
17 | |||
18 | #ifdef __CET__ | ||
19 | #include <cet.h> | ||
20 | #else | ||
21 | #define _CET_ENDBR | ||
22 | #endif | ||
23 | |||
24 | #define ctx %rdi | ||
25 | #define in %rsi | ||
26 | #define num %rdx | ||
27 | |||
28 | #define end %rbp | ||
29 | |||
30 | #define hs0 %r8d | ||
31 | #define hs1 %r9d | ||
32 | #define hs2 %r10d | ||
33 | #define hs3 %r11d | ||
34 | #define hs4 %r12d | ||
35 | |||
36 | #define tmp0 %eax | ||
37 | #define tmp1 %ebx | ||
38 | #define tmp2 %ecx | ||
39 | #define tmp3 %edx | ||
40 | |||
41 | /* | ||
42 | * Load message into wt, storing a copy in the message schedule: | ||
43 | * | ||
44 | * Wt = Mt | ||
45 | */ | ||
46 | #define sha1_message_schedule_load(idx, m, w, wt) \ | ||
47 | movl ((idx&0xf)*4)(m), wt; \ | ||
48 | bswapl wt; \ | ||
49 | movl wt, ((idx&0xf)*4)(w); | ||
50 | |||
51 | /* | ||
52 | * Update message schedule and return current value in wt: | ||
53 | * | ||
54 | * W0 = rol(W13 ^ W8 ^ W2 ^ W0, 1) | ||
55 | */ | ||
56 | #define sha1_message_schedule_update(idx, w, wt) \ | ||
57 | movl (((idx-3)&0xf)*4)(w), wt; /* W13 */ \ | ||
58 | xorl (((idx-8)&0xf)*4)(w), wt; /* W8 */ \ | ||
59 | xorl (((idx-14)&0xf)*4)(w), wt; /* W2 */ \ | ||
60 | xorl (((idx)&0xf)*4)(w), wt; /* W0 */ \ | ||
61 | roll $1, wt; \ | ||
62 | \ | ||
63 | movl wt, ((idx&0xf)*4)(w); | ||
64 | |||
65 | /* | ||
66 | * Compute a SHA-1 round without logic function: | ||
67 | * | ||
68 | * T = rol(a, 5) + e + Kt + Wt | ||
69 | * | ||
70 | * The caller is required to compute the appropriate logic function | ||
71 | * (Ch, Maj, Parity) and add it to e. | ||
72 | * | ||
73 | * Upon completion b = rol(b, 30), e = T, pending rotation. | ||
74 | */ | ||
75 | #define sha1_round(a, b, c, d, e, kt, wt) \ | ||
76 | leal kt(wt, e, 1), e; /* Kt + Wt */ \ | ||
77 | \ | ||
78 | movl a, tmp1; /* rol(a, 5) */ \ | ||
79 | roll $5, tmp1; \ | ||
80 | addl tmp1, e; \ | ||
81 | \ | ||
82 | roll $30, b; /* rol(b, 30) */ | ||
83 | |||
84 | /* | ||
85 | * Compute a SHA-1 round with Ch: | ||
86 | * | ||
87 | * T = rol(a, 5) + Ch(b, c, d) + e + Kt + Wt | ||
88 | * | ||
89 | * Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z | ||
90 | * | ||
91 | * Upon completion b = rol(b, 30), e = T, pending rotation. | ||
92 | */ | ||
93 | #define sha1_round_ch(a, b, c, d, e, kt, wt) \ | ||
94 | movl c, tmp2; /* Ch */ \ | ||
95 | xorl d, tmp2; /* Ch */ \ | ||
96 | andl b, tmp2; /* Ch */ \ | ||
97 | xorl d, tmp2; /* Ch */ \ | ||
98 | addl tmp2, e; /* Ch */ \ | ||
99 | \ | ||
100 | sha1_round(a, b, c, d, e, kt, wt); | ||
101 | |||
102 | /* | ||
103 | * Compute a SHA-1 round with Parity: | ||
104 | * | ||
105 | * T = rol(a, 5) + Parity(b, c, d) + e + Kt + Wt | ||
106 | * | ||
107 | * Parity(x, y, z) = x ^ y ^ z | ||
108 | * | ||
109 | * Upon completion b = rol(b, 30), e = T, pending rotation. | ||
110 | */ | ||
111 | #define sha1_round_parity(a, b, c, d, e, kt, wt) \ | ||
112 | movl b, tmp2; /* Parity */ \ | ||
113 | xorl c, tmp2; /* Parity */ \ | ||
114 | xorl d, tmp2; /* Parity */ \ | ||
115 | addl tmp2, e; /* Parity */ \ | ||
116 | \ | ||
117 | sha1_round(a, b, c, d, e, kt, wt); | ||
118 | |||
119 | /* | ||
120 | * Compute a SHA-1 round with Maj: | ||
121 | * | ||
122 | * T = rol(a, 5) + Maj(b, c, d) + e + Kt + Wt | ||
123 | * | ||
124 | * Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z) | ||
125 | * | ||
126 | * Upon completion b = rol(b, 30), e = T, pending rotation. | ||
127 | */ | ||
128 | #define sha1_round_maj(a, b, c, d, e, kt, wt) \ | ||
129 | movl c, tmp2; /* Maj */ \ | ||
130 | xorl d, tmp2; /* Maj */ \ | ||
131 | andl b, tmp2; /* Maj */ \ | ||
132 | movl c, tmp3; /* Maj */ \ | ||
133 | andl d, tmp3; /* Maj */ \ | ||
134 | xorl tmp2, tmp3; /* Maj */ \ | ||
135 | addl tmp3, e; /* Maj */ \ | ||
136 | \ | ||
137 | sha1_round(a, b, c, d, e, kt, wt); | ||
138 | |||
139 | #define sha1_round1_load(idx, a, b, c, d, e) \ | ||
140 | sha1_message_schedule_load(idx, in, %rsp, tmp0) \ | ||
141 | sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0) | ||
142 | |||
143 | #define sha1_round1_update(idx, a, b, c, d, e) \ | ||
144 | sha1_message_schedule_update(idx, %rsp, tmp0) \ | ||
145 | sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0) | ||
146 | |||
147 | #define sha1_round2_update(idx, a, b, c, d, e) \ | ||
148 | sha1_message_schedule_update(idx, %rsp, tmp0) \ | ||
149 | sha1_round_parity(a, b, c, d, e, 0x6ed9eba1, tmp0) | ||
150 | |||
151 | #define sha1_round3_update(idx, a, b, c, d, e) \ | ||
152 | sha1_message_schedule_update(idx, %rsp, tmp0) \ | ||
153 | sha1_round_maj(a, b, c, d, e, 0x8f1bbcdc, tmp0) | ||
154 | |||
155 | #define sha1_round4_update(idx, a, b, c, d, e) \ | ||
156 | sha1_message_schedule_update(idx, %rsp, tmp0) \ | ||
157 | sha1_round_parity(a, b, c, d, e, 0xca62c1d6, tmp0) | ||
158 | |||
159 | .text | ||
160 | |||
161 | /* | ||
162 | * void sha1_block_generic(SHA_CTX *ctx, const void *in, size_t num); | ||
163 | * | ||
164 | * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num | ||
165 | */ | ||
166 | .align 16 | ||
167 | .globl sha1_block_generic | ||
168 | .type sha1_block_generic,@function | ||
169 | sha1_block_generic: | ||
170 | _CET_ENDBR | ||
171 | |||
172 | /* Save callee save registers. */ | ||
173 | pushq %rbx | ||
174 | pushq %rbp | ||
175 | pushq %r12 | ||
176 | |||
177 | /* Allocate space for message schedule. */ | ||
178 | movq %rsp, %rax | ||
179 | subq $(64+1*8), %rsp | ||
180 | andq $~63, %rsp | ||
181 | movq %rax, (64+0*8)(%rsp) | ||
182 | |||
183 | /* Compute and store end of message. */ | ||
184 | shlq $6, num | ||
185 | leaq (in, num, 1), %rbp | ||
186 | |||
187 | /* Load current hash state from context. */ | ||
188 | movl (0*4)(ctx), hs0 | ||
189 | movl (1*4)(ctx), hs1 | ||
190 | movl (2*4)(ctx), hs2 | ||
191 | movl (3*4)(ctx), hs3 | ||
192 | movl (4*4)(ctx), hs4 | ||
193 | |||
194 | jmp .Lblock_loop | ||
195 | |||
196 | .align 16 | ||
197 | .Lblock_loop: | ||
198 | |||
199 | /* Round 0 through 15. */ | ||
200 | sha1_round1_load(0, hs0, hs1, hs2, hs3, hs4) | ||
201 | sha1_round1_load(1, hs4, hs0, hs1, hs2, hs3) | ||
202 | sha1_round1_load(2, hs3, hs4, hs0, hs1, hs2) | ||
203 | sha1_round1_load(3, hs2, hs3, hs4, hs0, hs1) | ||
204 | sha1_round1_load(4, hs1, hs2, hs3, hs4, hs0) | ||
205 | sha1_round1_load(5, hs0, hs1, hs2, hs3, hs4) | ||
206 | sha1_round1_load(6, hs4, hs0, hs1, hs2, hs3) | ||
207 | sha1_round1_load(7, hs3, hs4, hs0, hs1, hs2) | ||
208 | sha1_round1_load(8, hs2, hs3, hs4, hs0, hs1) | ||
209 | sha1_round1_load(9, hs1, hs2, hs3, hs4, hs0) | ||
210 | sha1_round1_load(10, hs0, hs1, hs2, hs3, hs4) | ||
211 | sha1_round1_load(11, hs4, hs0, hs1, hs2, hs3) | ||
212 | sha1_round1_load(12, hs3, hs4, hs0, hs1, hs2) | ||
213 | sha1_round1_load(13, hs2, hs3, hs4, hs0, hs1) | ||
214 | sha1_round1_load(14, hs1, hs2, hs3, hs4, hs0) | ||
215 | sha1_round1_load(15, hs0, hs1, hs2, hs3, hs4) | ||
216 | |||
217 | /* Round 16 through 31. */ | ||
218 | sha1_round1_update(16, hs4, hs0, hs1, hs2, hs3); | ||
219 | sha1_round1_update(17, hs3, hs4, hs0, hs1, hs2); | ||
220 | sha1_round1_update(18, hs2, hs3, hs4, hs0, hs1); | ||
221 | sha1_round1_update(19, hs1, hs2, hs3, hs4, hs0); | ||
222 | sha1_round2_update(20, hs0, hs1, hs2, hs3, hs4); | ||
223 | sha1_round2_update(21, hs4, hs0, hs1, hs2, hs3); | ||
224 | sha1_round2_update(22, hs3, hs4, hs0, hs1, hs2); | ||
225 | sha1_round2_update(23, hs2, hs3, hs4, hs0, hs1); | ||
226 | sha1_round2_update(24, hs1, hs2, hs3, hs4, hs0); | ||
227 | sha1_round2_update(25, hs0, hs1, hs2, hs3, hs4); | ||
228 | sha1_round2_update(26, hs4, hs0, hs1, hs2, hs3); | ||
229 | sha1_round2_update(27, hs3, hs4, hs0, hs1, hs2); | ||
230 | sha1_round2_update(28, hs2, hs3, hs4, hs0, hs1); | ||
231 | sha1_round2_update(29, hs1, hs2, hs3, hs4, hs0); | ||
232 | sha1_round2_update(30, hs0, hs1, hs2, hs3, hs4); | ||
233 | sha1_round2_update(31, hs4, hs0, hs1, hs2, hs3); | ||
234 | |||
235 | /* Round 32 through 47. */ | ||
236 | sha1_round2_update(32, hs3, hs4, hs0, hs1, hs2); | ||
237 | sha1_round2_update(33, hs2, hs3, hs4, hs0, hs1); | ||
238 | sha1_round2_update(34, hs1, hs2, hs3, hs4, hs0); | ||
239 | sha1_round2_update(35, hs0, hs1, hs2, hs3, hs4); | ||
240 | sha1_round2_update(36, hs4, hs0, hs1, hs2, hs3); | ||
241 | sha1_round2_update(37, hs3, hs4, hs0, hs1, hs2); | ||
242 | sha1_round2_update(38, hs2, hs3, hs4, hs0, hs1); | ||
243 | sha1_round2_update(39, hs1, hs2, hs3, hs4, hs0); | ||
244 | sha1_round3_update(40, hs0, hs1, hs2, hs3, hs4); | ||
245 | sha1_round3_update(41, hs4, hs0, hs1, hs2, hs3); | ||
246 | sha1_round3_update(42, hs3, hs4, hs0, hs1, hs2); | ||
247 | sha1_round3_update(43, hs2, hs3, hs4, hs0, hs1); | ||
248 | sha1_round3_update(44, hs1, hs2, hs3, hs4, hs0); | ||
249 | sha1_round3_update(45, hs0, hs1, hs2, hs3, hs4); | ||
250 | sha1_round3_update(46, hs4, hs0, hs1, hs2, hs3); | ||
251 | sha1_round3_update(47, hs3, hs4, hs0, hs1, hs2); | ||
252 | |||
253 | /* Round 48 through 63. */ | ||
254 | sha1_round3_update(48, hs2, hs3, hs4, hs0, hs1); | ||
255 | sha1_round3_update(49, hs1, hs2, hs3, hs4, hs0); | ||
256 | sha1_round3_update(50, hs0, hs1, hs2, hs3, hs4); | ||
257 | sha1_round3_update(51, hs4, hs0, hs1, hs2, hs3); | ||
258 | sha1_round3_update(52, hs3, hs4, hs0, hs1, hs2); | ||
259 | sha1_round3_update(53, hs2, hs3, hs4, hs0, hs1); | ||
260 | sha1_round3_update(54, hs1, hs2, hs3, hs4, hs0); | ||
261 | sha1_round3_update(55, hs0, hs1, hs2, hs3, hs4); | ||
262 | sha1_round3_update(56, hs4, hs0, hs1, hs2, hs3); | ||
263 | sha1_round3_update(57, hs3, hs4, hs0, hs1, hs2); | ||
264 | sha1_round3_update(58, hs2, hs3, hs4, hs0, hs1); | ||
265 | sha1_round3_update(59, hs1, hs2, hs3, hs4, hs0); | ||
266 | sha1_round4_update(60, hs0, hs1, hs2, hs3, hs4); | ||
267 | sha1_round4_update(61, hs4, hs0, hs1, hs2, hs3); | ||
268 | sha1_round4_update(62, hs3, hs4, hs0, hs1, hs2); | ||
269 | sha1_round4_update(63, hs2, hs3, hs4, hs0, hs1); | ||
270 | |||
271 | /* Round 64 through 79. */ | ||
272 | sha1_round4_update(64, hs1, hs2, hs3, hs4, hs0); | ||
273 | sha1_round4_update(65, hs0, hs1, hs2, hs3, hs4); | ||
274 | sha1_round4_update(66, hs4, hs0, hs1, hs2, hs3); | ||
275 | sha1_round4_update(67, hs3, hs4, hs0, hs1, hs2); | ||
276 | sha1_round4_update(68, hs2, hs3, hs4, hs0, hs1); | ||
277 | sha1_round4_update(69, hs1, hs2, hs3, hs4, hs0); | ||
278 | sha1_round4_update(70, hs0, hs1, hs2, hs3, hs4); | ||
279 | sha1_round4_update(71, hs4, hs0, hs1, hs2, hs3); | ||
280 | sha1_round4_update(72, hs3, hs4, hs0, hs1, hs2); | ||
281 | sha1_round4_update(73, hs2, hs3, hs4, hs0, hs1); | ||
282 | sha1_round4_update(74, hs1, hs2, hs3, hs4, hs0); | ||
283 | sha1_round4_update(75, hs0, hs1, hs2, hs3, hs4); | ||
284 | sha1_round4_update(76, hs4, hs0, hs1, hs2, hs3); | ||
285 | sha1_round4_update(77, hs3, hs4, hs0, hs1, hs2); | ||
286 | sha1_round4_update(78, hs2, hs3, hs4, hs0, hs1); | ||
287 | sha1_round4_update(79, hs1, hs2, hs3, hs4, hs0); | ||
288 | |||
289 | /* Add intermediate state to hash state. */ | ||
290 | addl (0*4)(ctx), hs0 | ||
291 | addl (1*4)(ctx), hs1 | ||
292 | addl (2*4)(ctx), hs2 | ||
293 | addl (3*4)(ctx), hs3 | ||
294 | addl (4*4)(ctx), hs4 | ||
295 | |||
296 | /* Store new hash state to context. */ | ||
297 | movl hs0, (0*4)(ctx) | ||
298 | movl hs1, (1*4)(ctx) | ||
299 | movl hs2, (2*4)(ctx) | ||
300 | movl hs3, (3*4)(ctx) | ||
301 | movl hs4, (4*4)(ctx) | ||
302 | |||
303 | addq $64, in | ||
304 | cmpq end, in | ||
305 | jb .Lblock_loop | ||
306 | |||
307 | movq (64+0*8)(%rsp), %rsp | ||
308 | |||
309 | /* Restore callee save registers. */ | ||
310 | popq %r12 | ||
311 | popq %rbp | ||
312 | popq %rbx | ||
313 | |||
314 | ret | ||