summaryrefslogtreecommitdiff
path: root/src/lib
diff options
context:
space:
mode:
author jsing <> 2024-11-08 15:09:48 +0000
committer jsing <> 2024-11-08 15:09:48 +0000
commit 04e057f72ee941e676968c3b04b6dd1f0f220d9a (patch)
tree 84d6d8d2658114716d976529462bcebc29b879ca /src/lib
parent 7bfb9335fb0ae857523adc3e3686eeb719aafd2e (diff)
downloadopenbsd-04e057f72ee941e676968c3b04b6dd1f0f220d9a.tar.gz
openbsd-04e057f72ee941e676968c3b04b6dd1f0f220d9a.tar.bz2
openbsd-04e057f72ee941e676968c3b04b6dd1f0f220d9a.zip
Provide a replacement assembly implementation for SHA-256 on amd64.
Replace the perlasm generated SHA-256 assembly implementation with one that is actually readable. Call the assembly implementation from a C wrapper that can, in the future, dispatch to alternate implementations. Performance is similar (or even better) on modern CPUs, while somewhat slower on older CPUs (this is in part due to the wrapper, the impact of which is more noticeable with small block sizes). Thanks to gkoehler@ and tb@ for testing. ok tb@
Diffstat (limited to 'src/lib')
-rw-r--r-- src/lib/libcrypto/arch/amd64/Makefile.inc | 9
-rw-r--r-- src/lib/libcrypto/sha/sha256_amd64.c | 26
-rw-r--r-- src/lib/libcrypto/sha/sha256_amd64_generic.S | 301
3 files changed, 330 insertions, 6 deletions
diff --git a/src/lib/libcrypto/arch/amd64/Makefile.inc b/src/lib/libcrypto/arch/amd64/Makefile.inc
index 2f41f44381..07fcf46ed5 100644
--- a/src/lib/libcrypto/arch/amd64/Makefile.inc
+++ b/src/lib/libcrypto/arch/amd64/Makefile.inc
@@ -1,4 +1,4 @@
1# $OpenBSD: Makefile.inc,v 1.30 2024/10/18 13:36:24 jsing Exp $ 1# $OpenBSD: Makefile.inc,v 1.31 2024/11/08 15:09:48 jsing Exp $
2 2
3# amd64-specific libcrypto build rules 3# amd64-specific libcrypto build rules
4 4
@@ -51,11 +51,8 @@ SSLASM+= rc4 rc4-x86_64
51CFLAGS+= -DSHA1_ASM 51CFLAGS+= -DSHA1_ASM
52SSLASM+= sha sha1-x86_64 52SSLASM+= sha sha1-x86_64
53CFLAGS+= -DSHA256_ASM 53CFLAGS+= -DSHA256_ASM
54SRCS+= sha256-x86_64.S 54SRCS+= sha256_amd64.c
55GENERATED+= sha256-x86_64.S 55SRCS+= sha256_amd64_generic.S
56sha256-x86_64.S: ${LCRYPTO_SRC}/sha/asm/sha512-x86_64.pl ${EXTRA_PL}
57 cd ${LCRYPTO_SRC}/sha/asm ; \
58 /usr/bin/perl ./sha512-x86_64.pl ${.OBJDIR}/${.TARGET}
59CFLAGS+= -DSHA512_ASM 56CFLAGS+= -DSHA512_ASM
60SRCS+= sha512-x86_64.S 57SRCS+= sha512-x86_64.S
61GENERATED+= sha512-x86_64.S 58GENERATED+= sha512-x86_64.S
diff --git a/src/lib/libcrypto/sha/sha256_amd64.c b/src/lib/libcrypto/sha/sha256_amd64.c
new file mode 100644
index 0000000000..f7531b340f
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha256_amd64.c
@@ -0,0 +1,26 @@
1/* $OpenBSD: sha256_amd64.c,v 1.1 2024/11/08 15:09:48 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <openssl/sha.h>
19
20void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num);
21
22void
23sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num)
24{
25 sha256_block_generic(ctx, in, num);
26}
diff --git a/src/lib/libcrypto/sha/sha256_amd64_generic.S b/src/lib/libcrypto/sha/sha256_amd64_generic.S
new file mode 100644
index 0000000000..07078fb0d5
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha256_amd64_generic.S
@@ -0,0 +1,301 @@
1/* $OpenBSD: sha256_amd64_generic.S,v 1.1 2024/11/08 15:09:48 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#ifdef __CET__
19#include <cet.h>
20#else
21#define _CET_ENDBR
22#endif
23
24#define ctx %rdi
25#define in %rsi
26#define num %rdx
27
28#define round %rdi
29
30#define hs0 %r8d
31#define hs1 %r9d
32#define hs2 %r10d
33#define hs3 %r11d
34#define hs4 %r12d
35#define hs5 %r13d
36#define hs6 %r14d
37#define hs7 %r15d
38
39#define k256 %rbp
40
41#define tmp0 %eax
42#define tmp1 %ebx
43#define tmp2 %ecx
44#define tmp3 %edx
45
46/*
47 * Load message into wt, storing a copy in the message schedule:
48 *
49 * Wt = Mt
50 */
51#define sha256_message_schedule_load(idx, m, w, wt) \
52 movl (m, round, 4), wt; \
53 bswapl wt; \
54 movl wt, ((idx&0xf)*4)(w);
55
56/*
57 * Update message schedule and return current value in wt:
58 *
59 * Wt = sigma1(W(t-2)) + W(t-7) + sigma0(W(t-15)) + W(t-16)
60 *
61 * sigma0(x) = ror(x, 7) ^ ror(x, 18) ^ (x >> 3)
62 * sigma1(x) = ror(x, 17) ^ ror(x, 19) ^ (x >> 10)
63 */
64#define sha256_message_schedule_update(idx, w, wt) \
65 movl (((idx-2)&0xf)*4)(w), wt; /* sigma1 */ \
66 movl wt, tmp1; /* sigma1 */ \
67 rorl $(19-17), tmp1; /* sigma1 */ \
68 xorl wt, tmp1; /* sigma1 */ \
69 rorl $17, tmp1; /* sigma1 */ \
70 shrl $10, wt; /* sigma1 */ \
71 xorl tmp1, wt; /* sigma1 */ \
72 \
73 addl (((idx-7)&0xf)*4)(w), wt; /* Wt-7 */ \
74 addl (((idx-16)&0xf)*4)(w), wt; /* Wt-16 */ \
75 \
76 movl (((idx-15)&0xf)*4)(w), tmp2; /* sigma0 */ \
77 movl tmp2, tmp3; /* sigma0 */ \
78 rorl $(18-7), tmp2; /* sigma0 */ \
79 xorl tmp3, tmp2; /* sigma0 */ \
80 rorl $7, tmp2; /* sigma0 */ \
81 shrl $3, tmp3; /* sigma0 */ \
82 xorl tmp3, tmp2; /* sigma0 */ \
83 addl tmp2, wt; /* sigma0 */ \
84 \
85 movl wt, ((idx&0xf)*4)(w);
86
87/*
88 * Compute a SHA-256 round:
89 *
90 * T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt
91 * T2 = Sigma0(a) + Maj(a, b, c)
92 *
93 * Sigma0(x) = ror(x, 2) ^ ror(x, 13) ^ ror(x, 22)
94 * Sigma1(x) = ror(x, 6) ^ ror(x, 11) ^ ror(x, 25)
95 * Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
96 * Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
97 *
98 * Upon completion d = d + T1, h = T1 + T2, pending rotation.
99 */
100#define sha256_round(idx, a, b, c, d, e, f, g, h, k, w, wt) \
101 addl wt, h; /* T1 Wt */ \
102 addl (k256, round, 4), h; /* T1 Kt */ \
103 \
104 movl e, tmp1; /* T1 Sigma1 */ \
105 rorl $(25-11), tmp1; /* T1 Sigma1 */ \
106 xorl e, tmp1; /* T1 Sigma1 */ \
107 rorl $(11-6), tmp1; /* T1 Sigma1 */ \
108 xorl e, tmp1; /* T1 Sigma1 */ \
109 rorl $6, tmp1; /* T1 Sigma1 */ \
110 addl tmp1, h; /* T1 Sigma1 */ \
111 \
112 movl f, tmp2; /* T1 Ch */ \
113 xorl g, tmp2; /* T1 Ch */ \
114 andl e, tmp2; /* T1 Ch */ \
115 xorl g, tmp2; /* T1 Ch */ \
116 addl tmp2, h; /* T1 Ch */ \
117 \
118 addl h, d; /* d += T1 */ \
119 \
120 movl a, tmp1; /* T2 Sigma0 */ \
121 rorl $(22-13), tmp1; /* T2 Sigma0 */ \
122 xorl a, tmp1; /* T2 Sigma0 */ \
123 rorl $(13-2), tmp1; /* T2 Sigma0 */ \
124 xorl a, tmp1; /* T2 Sigma0 */ \
125 rorl $2, tmp1; /* T2 Sigma0 */ \
126 addl tmp1, h; /* T2 Sigma0 */ \
127 \
128 movl b, tmp2; /* T2 Maj */ \
129 xorl c, tmp2; /* T2 Maj */ \
130 andl a, tmp2; /* T2 Maj */ \
131 movl b, tmp3; /* T2 Maj */ \
132 andl c, tmp3; /* T2 Maj */ \
133 xorl tmp2, tmp3; /* T2 Maj */ \
134 addl tmp3, h; /* T2 Maj */ \
135 \
136 addq $1, round;
137
138#define sha256_round_load(idx, a, b, c, d, e, f, g, h) \
139 sha256_message_schedule_load(idx, in, %rsp, tmp0) \
140 sha256_round(idx, a, b, c, d, e, f, g, h, k256, %rsp, tmp0)
141
142#define sha256_round_update(idx, a, b, c, d, e, f, g, h) \
143 sha256_message_schedule_update(idx, %rsp, tmp0) \
144 sha256_round(idx, a, b, c, d, e, f, g, h, k256, %rsp, tmp0)
145
146.text
147
148/*
149 * void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num);
150 *
151 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
152 */
153.align 16
154.globl sha256_block_generic
155.type sha256_block_generic,@function
156sha256_block_generic:
157 _CET_ENDBR
158
159 /* Save callee save registers. */
160 pushq %rbx
161 pushq %rbp
162 pushq %r12
163 pushq %r13
164 pushq %r14
165 pushq %r15
166
167 /* Allocate space for message schedule and context pointer. */
168 movq %rsp, %rax
169 subq $(64+32), %rsp
170 andq $~63, %rsp
171 movq %rax, (64+16)(%rsp)
172 movq ctx, (64+8)(%rsp)
173
174 /* Compute and store end of message. */
175 shlq $6, num
176 leaq (in, num, 1), %rbx
177 movq %rbx, (64+0)(%rsp)
178
179 /* Address of SHA-256 constants. */
180 leaq K256(%rip), k256
181
182 /* Load current hash state from context. */
183 movl (0*4)(ctx), hs0
184 movl (1*4)(ctx), hs1
185 movl (2*4)(ctx), hs2
186 movl (3*4)(ctx), hs3
187 movl (4*4)(ctx), hs4
188 movl (5*4)(ctx), hs5
189 movl (6*4)(ctx), hs6
190 movl (7*4)(ctx), hs7
191
192 jmp .Lblock_loop0
193
194.align 16
195.Lblock_loop0:
196 mov $0, round
197
198 /* Round 0 through 15. */
199 sha256_round_load(0, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
200 sha256_round_load(1, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
201 sha256_round_load(2, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
202 sha256_round_load(3, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
203 sha256_round_load(4, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
204 sha256_round_load(5, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
205 sha256_round_load(6, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
206 sha256_round_load(7, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
207 sha256_round_load(8, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
208 sha256_round_load(9, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
209 sha256_round_load(10, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
210 sha256_round_load(11, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
211 sha256_round_load(12, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
212 sha256_round_load(13, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
213 sha256_round_load(14, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
214 sha256_round_load(15, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
215
216 jmp .Lblock_loop16
217
218.align 16
219.Lblock_loop16:
220 /* Round 16 through 63. */
221 sha256_round_update(16, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
222 sha256_round_update(17, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
223 sha256_round_update(18, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
224 sha256_round_update(19, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
225 sha256_round_update(20, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
226 sha256_round_update(21, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
227 sha256_round_update(22, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
228 sha256_round_update(23, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
229 sha256_round_update(24, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
230 sha256_round_update(25, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
231 sha256_round_update(26, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
232 sha256_round_update(27, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
233 sha256_round_update(28, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
234 sha256_round_update(29, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
235 sha256_round_update(30, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
236 sha256_round_update(31, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
237
238 cmp $64, round
239 jb .Lblock_loop16
240
241 movq (64+8)(%rsp), ctx
242
243 /* Add intermediate state to hash state. */
244 addl (0*4)(ctx), hs0
245 addl (1*4)(ctx), hs1
246 addl (2*4)(ctx), hs2
247 addl (3*4)(ctx), hs3
248 addl (4*4)(ctx), hs4
249 addl (5*4)(ctx), hs5
250 addl (6*4)(ctx), hs6
251 addl (7*4)(ctx), hs7
252
253 /* Store new hash state to context. */
254 movl hs0, (0*4)(ctx)
255 movl hs1, (1*4)(ctx)
256 movl hs2, (2*4)(ctx)
257 movl hs3, (3*4)(ctx)
258 movl hs4, (4*4)(ctx)
259 movl hs5, (5*4)(ctx)
260 movl hs6, (6*4)(ctx)
261 movl hs7, (7*4)(ctx)
262
263 addq $64, in
264 cmpq (64+0)(%rsp), in
265 jb .Lblock_loop0
266
267 movq (64+16)(%rsp), %rsp
268
269 /* Restore callee save registers. */
270 popq %r15
271 popq %r14
272 popq %r13
273 popq %r12
274 popq %rbp
275 popq %rbx
276
277 ret
278
279/*
280 * SHA-256 constants - see FIPS 180-4 section 4.2.2.
281 */
282.rodata
283.align 64
284.type K256,@object
285K256:
286.long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
287.long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
288.long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
289.long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
290.long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
291.long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
292.long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
293.long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
294.long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
295.long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
296.long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
297.long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
298.long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
299.long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
300.long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
301.long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2