author     jsing <>  2024-12-04 13:13:33 +0000
committer  jsing <>  2024-12-04 13:13:33 +0000
commit     1c3ce6cc8e538cecc33ed58f89d969af28952dea (patch)
tree       6adf1634c082704fca00fea488f843d1345662b2 /src
parent     54b7e03a99e6dbd79315380653c1bf578c8444b0 (diff)
Provide a replacement assembly implementation for SHA-1 on amd64.
As already done for SHA-256 and SHA-512, replace the perlasm-generated SHA-1 assembly implementation with one that is actually readable. Call the assembly implementation from a C wrapper that can, in the future, dispatch to alternate implementations.

On a modern CPU the performance is around 5% faster than the base implementation generated by sha1-x86_64.pl; however, it is around 15% slower than the excessively complex SSSE2/AVX version that the same script also generates (a SHA-NI version will greatly outperform both and is much cleaner/simpler).

ok tb@
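For orientation (not part of the commit), a minimal C sketch of the per-block compression that the new assembly implements is shown below; the function name sha1_block_sketch and the bare uint32_t h[5] state argument are illustrative assumptions, not libcrypto interfaces. Rounds 0-19 use Ch with K = 0x5a827999, rounds 20-39 Parity with 0x6ed9eba1, rounds 40-59 Maj with 0x8f1bbcdc and rounds 60-79 Parity with 0xca62c1d6, matching the sha1_round1 through sha1_round4 macros in sha1_amd64_generic.S.

#include <stddef.h>
#include <stdint.h>

static inline uint32_t
rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

static void
sha1_block_sketch(uint32_t h[5], const uint8_t *in, size_t num)
{
	uint32_t a, b, c, d, e, f, k, t, wt, w[16];
	int i;

	while (num-- > 0) {
		a = h[0]; b = h[1]; c = h[2]; d = h[3]; e = h[4];

		for (i = 0; i < 80; i++) {
			if (i < 16) {
				/* Wt = Mt - load message word big endian. */
				wt = (uint32_t)in[i * 4] << 24 |
				    (uint32_t)in[i * 4 + 1] << 16 |
				    (uint32_t)in[i * 4 + 2] << 8 |
				    (uint32_t)in[i * 4 + 3];
			} else {
				/* Wt = rol(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16, 1). */
				wt = rol32(w[(i - 3) & 0xf] ^ w[(i - 8) & 0xf] ^
				    w[(i - 14) & 0xf] ^ w[i & 0xf], 1);
			}
			w[i & 0xf] = wt;

			if (i < 20) {
				f = (b & c) ^ (~b & d);			/* Ch */
				k = 0x5a827999;
			} else if (i < 40) {
				f = b ^ c ^ d;				/* Parity */
				k = 0x6ed9eba1;
			} else if (i < 60) {
				f = (b & c) ^ (b & d) ^ (c & d);	/* Maj */
				k = 0x8f1bbcdc;
			} else {
				f = b ^ c ^ d;				/* Parity */
				k = 0xca62c1d6;
			}

			/* T = rol(a, 5) + f + e + Kt + Wt. */
			t = rol32(a, 5) + f + e + k + wt;
			e = d;
			d = c;
			c = rol32(b, 30);
			b = a;
			a = t;
		}

		/* Add intermediate state to hash state. */
		h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;

		in += 64;
	}
}

The assembly below unrolls this loop completely, keeps the five working variables in registers and stores only the 16-word message schedule on the stack.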
Diffstat (limited to 'src')
-rw-r--r--  src/lib/libcrypto/arch/amd64/Makefile.inc     5
-rw-r--r--  src/lib/libcrypto/sha/sha1_amd64.c           28
-rw-r--r--  src/lib/libcrypto/sha/sha1_amd64_generic.S  314
3 files changed, 345 insertions(+), 2 deletions(-)
diff --git a/src/lib/libcrypto/arch/amd64/Makefile.inc b/src/lib/libcrypto/arch/amd64/Makefile.inc
index fe22385633..33c7dbba26 100644
--- a/src/lib/libcrypto/arch/amd64/Makefile.inc
+++ b/src/lib/libcrypto/arch/amd64/Makefile.inc
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile.inc,v 1.33 2024/11/16 15:31:36 jsing Exp $
+# $OpenBSD: Makefile.inc,v 1.34 2024/12/04 13:13:33 jsing Exp $
 
 # amd64-specific libcrypto build rules
 
@@ -49,7 +49,8 @@ SSLASM+= rc4 rc4-x86_64
 # ripemd
 # sha
 CFLAGS+= -DSHA1_ASM
-SSLASM+= sha sha1-x86_64
+SRCS+= sha1_amd64.c
+SRCS+= sha1_amd64_generic.S
 CFLAGS+= -DSHA256_ASM
 SRCS+= sha256_amd64.c
 SRCS+= sha256_amd64_generic.S
diff --git a/src/lib/libcrypto/sha/sha1_amd64.c b/src/lib/libcrypto/sha/sha1_amd64.c
new file mode 100644
index 0000000000..b3d4ab1263
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha1_amd64.c
@@ -0,0 +1,28 @@
+/* $OpenBSD: sha1_amd64.c,v 1.1 2024/12/04 13:13:33 jsing Exp $ */
+/*
+ * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <openssl/sha.h>
+
+#include "crypto_arch.h"
+
+void sha1_block_generic(SHA_CTX *ctx, const void *in, size_t num);
+
+void
+sha1_block_data_order(SHA_CTX *ctx, const void *in, size_t num)
+{
+	sha1_block_generic(ctx, in, num);
+}
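The wrapper is deliberately a single indirection point so that a faster implementation can be selected at run time later, as the commit message notes. A hypothetical sketch of that future dispatch (the have_sha_extensions probe and sha1_block_shani are illustrative names only, not existing libcrypto symbols):

void
sha1_block_data_order(SHA_CTX *ctx, const void *in, size_t num)
{
	/* Hypothetical: prefer a SHA-NI implementation when the CPU has one. */
	if (have_sha_extensions())
		sha1_block_shani(ctx, in, num);
	else
		sha1_block_generic(ctx, in, num);
}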
diff --git a/src/lib/libcrypto/sha/sha1_amd64_generic.S b/src/lib/libcrypto/sha/sha1_amd64_generic.S
new file mode 100644
index 0000000000..d3e184dbca
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha1_amd64_generic.S
@@ -0,0 +1,314 @@
+/* $OpenBSD: sha1_amd64_generic.S,v 1.1 2024/12/04 13:13:33 jsing Exp $ */
+/*
+ * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifdef __CET__
+#include <cet.h>
+#else
+#define _CET_ENDBR
+#endif
+
+#define ctx	%rdi
+#define in	%rsi
+#define num	%rdx
+
+#define end	%rbp
+
+#define hs0	%r8d
+#define hs1	%r9d
+#define hs2	%r10d
+#define hs3	%r11d
+#define hs4	%r12d
+
+#define tmp0	%eax
+#define tmp1	%ebx
+#define tmp2	%ecx
+#define tmp3	%edx
+
+/*
+ * Load message into wt, storing a copy in the message schedule:
+ *
+ * Wt = Mt
+ */
+#define sha1_message_schedule_load(idx, m, w, wt) \
+	movl ((idx&0xf)*4)(m), wt; \
+	bswapl wt; \
+	movl wt, ((idx&0xf)*4)(w);
+
+/*
+ * Update message schedule and return current value in wt:
+ *
+ * W0 = rol(W13 ^ W8 ^ W2 ^ W0, 1)
+ */
+#define sha1_message_schedule_update(idx, w, wt) \
+	movl (((idx-3)&0xf)*4)(w), wt;	/* W13 */ \
+	xorl (((idx-8)&0xf)*4)(w), wt;	/* W8 */ \
+	xorl (((idx-14)&0xf)*4)(w), wt;	/* W2 */ \
+	xorl (((idx)&0xf)*4)(w), wt;	/* W0 */ \
+	roll $1, wt; \
+	\
+	movl wt, ((idx&0xf)*4)(w);
+
+/*
+ * Compute a SHA-1 round without logic function:
+ *
+ * T = rol(a, 5) + e + Kt + Wt
+ *
+ * The caller is required to compute the appropriate logic function
+ * (Ch, Maj, Parity) and add it to e.
+ *
+ * Upon completion b = rol(b, 30), e = T, pending rotation.
+ */
+#define sha1_round(a, b, c, d, e, kt, wt) \
+	leal kt(wt, e, 1), e;	/* Kt + Wt */ \
+	\
+	movl a, tmp1;		/* rol(a, 5) */ \
+	roll $5, tmp1; \
+	addl tmp1, e; \
+	\
+	roll $30, b;		/* rol(b, 30) */
+
+/*
+ * Compute a SHA-1 round with Ch:
+ *
+ * T = rol(a, 5) + Ch(b, c, d) + e + Kt + Wt
+ *
+ * Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
+ *
+ * Upon completion b = rol(b, 30), e = T, pending rotation.
+ */
+#define sha1_round_ch(a, b, c, d, e, kt, wt) \
+	movl c, tmp2;		/* Ch */ \
+	xorl d, tmp2;		/* Ch */ \
+	andl b, tmp2;		/* Ch */ \
+	xorl d, tmp2;		/* Ch */ \
+	addl tmp2, e;		/* Ch */ \
+	\
+	sha1_round(a, b, c, d, e, kt, wt);
+
+/*
+ * Compute a SHA-1 round with Parity:
+ *
+ * T = rol(a, 5) + Parity(b, c, d) + e + Kt + Wt
+ *
+ * Parity(x, y, z) = x ^ y ^ z
+ *
+ * Upon completion b = rol(b, 30), e = T, pending rotation.
+ */
+#define sha1_round_parity(a, b, c, d, e, kt, wt) \
+	movl b, tmp2;		/* Parity */ \
+	xorl c, tmp2;		/* Parity */ \
+	xorl d, tmp2;		/* Parity */ \
+	addl tmp2, e;		/* Parity */ \
+	\
+	sha1_round(a, b, c, d, e, kt, wt);
+
+/*
+ * Compute a SHA-1 round with Maj:
+ *
+ * T = rol(a, 5) + Maj(b, c, d) + e + Kt + Wt
+ *
+ * Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
+ *
+ * Upon completion b = rol(b, 30), e = T, pending rotation.
+ */
+#define sha1_round_maj(a, b, c, d, e, kt, wt) \
+	movl c, tmp2;		/* Maj */ \
+	xorl d, tmp2;		/* Maj */ \
+	andl b, tmp2;		/* Maj */ \
+	movl c, tmp3;		/* Maj */ \
+	andl d, tmp3;		/* Maj */ \
+	xorl tmp2, tmp3;	/* Maj */ \
+	addl tmp3, e;		/* Maj */ \
+	\
+	sha1_round(a, b, c, d, e, kt, wt);
+
+#define sha1_round1_load(idx, a, b, c, d, e) \
+	sha1_message_schedule_load(idx, in, %rsp, tmp0) \
+	sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)
+
+#define sha1_round1_update(idx, a, b, c, d, e) \
+	sha1_message_schedule_update(idx, %rsp, tmp0) \
+	sha1_round_ch(a, b, c, d, e, 0x5a827999, tmp0)
+
+#define sha1_round2_update(idx, a, b, c, d, e) \
+	sha1_message_schedule_update(idx, %rsp, tmp0) \
+	sha1_round_parity(a, b, c, d, e, 0x6ed9eba1, tmp0)
+
+#define sha1_round3_update(idx, a, b, c, d, e) \
+	sha1_message_schedule_update(idx, %rsp, tmp0) \
+	sha1_round_maj(a, b, c, d, e, 0x8f1bbcdc, tmp0)
+
+#define sha1_round4_update(idx, a, b, c, d, e) \
+	sha1_message_schedule_update(idx, %rsp, tmp0) \
+	sha1_round_parity(a, b, c, d, e, 0xca62c1d6, tmp0)
+
+.text
+
+/*
+ * void sha1_block_generic(SHA_CTX *ctx, const void *in, size_t num);
+ *
+ * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
+ */
+.align 16
+.globl sha1_block_generic
+.type sha1_block_generic,@function
+sha1_block_generic:
+	_CET_ENDBR
+
+	/* Save callee save registers. */
+	pushq %rbx
+	pushq %rbp
+	pushq %r12
+
+	/* Allocate space for message schedule. */
+	movq %rsp, %rax
+	subq $(64+1*8), %rsp
+	andq $~63, %rsp
+	movq %rax, (64+0*8)(%rsp)
+
+	/* Compute and store end of message. */
+	shlq $6, num
+	leaq (in, num, 1), %rbp
+
+	/* Load current hash state from context. */
+	movl (0*4)(ctx), hs0
+	movl (1*4)(ctx), hs1
+	movl (2*4)(ctx), hs2
+	movl (3*4)(ctx), hs3
+	movl (4*4)(ctx), hs4
+
+	jmp .Lblock_loop
+
+.align 16
+.Lblock_loop:
+
+	/* Round 0 through 15. */
+	sha1_round1_load(0, hs0, hs1, hs2, hs3, hs4)
+	sha1_round1_load(1, hs4, hs0, hs1, hs2, hs3)
+	sha1_round1_load(2, hs3, hs4, hs0, hs1, hs2)
+	sha1_round1_load(3, hs2, hs3, hs4, hs0, hs1)
+	sha1_round1_load(4, hs1, hs2, hs3, hs4, hs0)
+	sha1_round1_load(5, hs0, hs1, hs2, hs3, hs4)
+	sha1_round1_load(6, hs4, hs0, hs1, hs2, hs3)
+	sha1_round1_load(7, hs3, hs4, hs0, hs1, hs2)
+	sha1_round1_load(8, hs2, hs3, hs4, hs0, hs1)
+	sha1_round1_load(9, hs1, hs2, hs3, hs4, hs0)
+	sha1_round1_load(10, hs0, hs1, hs2, hs3, hs4)
+	sha1_round1_load(11, hs4, hs0, hs1, hs2, hs3)
+	sha1_round1_load(12, hs3, hs4, hs0, hs1, hs2)
+	sha1_round1_load(13, hs2, hs3, hs4, hs0, hs1)
+	sha1_round1_load(14, hs1, hs2, hs3, hs4, hs0)
+	sha1_round1_load(15, hs0, hs1, hs2, hs3, hs4)
+
+	/* Round 16 through 31. */
+	sha1_round1_update(16, hs4, hs0, hs1, hs2, hs3);
+	sha1_round1_update(17, hs3, hs4, hs0, hs1, hs2);
+	sha1_round1_update(18, hs2, hs3, hs4, hs0, hs1);
+	sha1_round1_update(19, hs1, hs2, hs3, hs4, hs0);
+	sha1_round2_update(20, hs0, hs1, hs2, hs3, hs4);
+	sha1_round2_update(21, hs4, hs0, hs1, hs2, hs3);
+	sha1_round2_update(22, hs3, hs4, hs0, hs1, hs2);
+	sha1_round2_update(23, hs2, hs3, hs4, hs0, hs1);
+	sha1_round2_update(24, hs1, hs2, hs3, hs4, hs0);
+	sha1_round2_update(25, hs0, hs1, hs2, hs3, hs4);
+	sha1_round2_update(26, hs4, hs0, hs1, hs2, hs3);
+	sha1_round2_update(27, hs3, hs4, hs0, hs1, hs2);
+	sha1_round2_update(28, hs2, hs3, hs4, hs0, hs1);
+	sha1_round2_update(29, hs1, hs2, hs3, hs4, hs0);
+	sha1_round2_update(30, hs0, hs1, hs2, hs3, hs4);
+	sha1_round2_update(31, hs4, hs0, hs1, hs2, hs3);
+
+	/* Round 32 through 47. */
+	sha1_round2_update(32, hs3, hs4, hs0, hs1, hs2);
+	sha1_round2_update(33, hs2, hs3, hs4, hs0, hs1);
+	sha1_round2_update(34, hs1, hs2, hs3, hs4, hs0);
+	sha1_round2_update(35, hs0, hs1, hs2, hs3, hs4);
+	sha1_round2_update(36, hs4, hs0, hs1, hs2, hs3);
+	sha1_round2_update(37, hs3, hs4, hs0, hs1, hs2);
+	sha1_round2_update(38, hs2, hs3, hs4, hs0, hs1);
+	sha1_round2_update(39, hs1, hs2, hs3, hs4, hs0);
+	sha1_round3_update(40, hs0, hs1, hs2, hs3, hs4);
+	sha1_round3_update(41, hs4, hs0, hs1, hs2, hs3);
+	sha1_round3_update(42, hs3, hs4, hs0, hs1, hs2);
+	sha1_round3_update(43, hs2, hs3, hs4, hs0, hs1);
+	sha1_round3_update(44, hs1, hs2, hs3, hs4, hs0);
+	sha1_round3_update(45, hs0, hs1, hs2, hs3, hs4);
+	sha1_round3_update(46, hs4, hs0, hs1, hs2, hs3);
+	sha1_round3_update(47, hs3, hs4, hs0, hs1, hs2);
+
+	/* Round 48 through 63. */
+	sha1_round3_update(48, hs2, hs3, hs4, hs0, hs1);
+	sha1_round3_update(49, hs1, hs2, hs3, hs4, hs0);
+	sha1_round3_update(50, hs0, hs1, hs2, hs3, hs4);
+	sha1_round3_update(51, hs4, hs0, hs1, hs2, hs3);
+	sha1_round3_update(52, hs3, hs4, hs0, hs1, hs2);
+	sha1_round3_update(53, hs2, hs3, hs4, hs0, hs1);
+	sha1_round3_update(54, hs1, hs2, hs3, hs4, hs0);
+	sha1_round3_update(55, hs0, hs1, hs2, hs3, hs4);
+	sha1_round3_update(56, hs4, hs0, hs1, hs2, hs3);
+	sha1_round3_update(57, hs3, hs4, hs0, hs1, hs2);
+	sha1_round3_update(58, hs2, hs3, hs4, hs0, hs1);
+	sha1_round3_update(59, hs1, hs2, hs3, hs4, hs0);
+	sha1_round4_update(60, hs0, hs1, hs2, hs3, hs4);
+	sha1_round4_update(61, hs4, hs0, hs1, hs2, hs3);
+	sha1_round4_update(62, hs3, hs4, hs0, hs1, hs2);
+	sha1_round4_update(63, hs2, hs3, hs4, hs0, hs1);
+
+	/* Round 64 through 79. */
+	sha1_round4_update(64, hs1, hs2, hs3, hs4, hs0);
+	sha1_round4_update(65, hs0, hs1, hs2, hs3, hs4);
+	sha1_round4_update(66, hs4, hs0, hs1, hs2, hs3);
+	sha1_round4_update(67, hs3, hs4, hs0, hs1, hs2);
+	sha1_round4_update(68, hs2, hs3, hs4, hs0, hs1);
+	sha1_round4_update(69, hs1, hs2, hs3, hs4, hs0);
+	sha1_round4_update(70, hs0, hs1, hs2, hs3, hs4);
+	sha1_round4_update(71, hs4, hs0, hs1, hs2, hs3);
+	sha1_round4_update(72, hs3, hs4, hs0, hs1, hs2);
+	sha1_round4_update(73, hs2, hs3, hs4, hs0, hs1);
+	sha1_round4_update(74, hs1, hs2, hs3, hs4, hs0);
+	sha1_round4_update(75, hs0, hs1, hs2, hs3, hs4);
+	sha1_round4_update(76, hs4, hs0, hs1, hs2, hs3);
+	sha1_round4_update(77, hs3, hs4, hs0, hs1, hs2);
+	sha1_round4_update(78, hs2, hs3, hs4, hs0, hs1);
+	sha1_round4_update(79, hs1, hs2, hs3, hs4, hs0);
+
+	/* Add intermediate state to hash state. */
+	addl (0*4)(ctx), hs0
+	addl (1*4)(ctx), hs1
+	addl (2*4)(ctx), hs2
+	addl (3*4)(ctx), hs3
+	addl (4*4)(ctx), hs4
+
+	/* Store new hash state to context. */
+	movl hs0, (0*4)(ctx)
+	movl hs1, (1*4)(ctx)
+	movl hs2, (2*4)(ctx)
+	movl hs3, (3*4)(ctx)
+	movl hs4, (4*4)(ctx)
+
+	addq $64, in
+	cmpq end, in
+	jb .Lblock_loop
+
+	movq (64+0*8)(%rsp), %rsp
+
+	/* Restore callee save registers. */
+	popq %r12
+	popq %rbp
+	popq %rbx
+
+	ret
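Once libcrypto is rebuilt with these files, the new code path can be sanity-checked through the public SHA1() interface using the well-known FIPS 180 test vector for "abc" (expected digest a9993e364706816aba3e25717850c26c9cd0d89d):

#include <stdio.h>
#include <openssl/sha.h>

int
main(void)
{
	unsigned char md[SHA_DIGEST_LENGTH];
	int i;

	/* Known answer test: SHA-1("abc"). */
	SHA1((const unsigned char *)"abc", 3, md);

	for (i = 0; i < SHA_DIGEST_LENGTH; i++)
		printf("%02x", md[i]);
	printf("\n");

	return 0;
}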