summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjsing <>2024-11-16 14:56:39 +0000
committerjsing <>2024-11-16 14:56:39 +0000
commit8e55140e006194dd39836fcc054babaf7f7e6c5d (patch)
tree440bdcfa47dd4c46b50b948e6099723d92a1e59b
parentfe7294a5c93bf90f080d28c3a7684b6e91757a35 (diff)
downloadopenbsd-8e55140e006194dd39836fcc054babaf7f7e6c5d.tar.gz
openbsd-8e55140e006194dd39836fcc054babaf7f7e6c5d.tar.bz2
openbsd-8e55140e006194dd39836fcc054babaf7f7e6c5d.zip
Provide a replacement assembly implementation for SHA-512 on amd64.
Replace the perlasm generated SHA-512 assembly with a more readable version and the same C wrapper introduced for SHA-256. As for SHA-256, on a modern CPU the performance is largely the same. ok tb@
-rw-r--r--src/lib/libcrypto/arch/amd64/Makefile.inc9
-rw-r--r--src/lib/libcrypto/sha/sha512_amd64.c26
-rw-r--r--src/lib/libcrypto/sha/sha512_amd64_generic.S307
3 files changed, 336 insertions, 6 deletions
diff --git a/src/lib/libcrypto/arch/amd64/Makefile.inc b/src/lib/libcrypto/arch/amd64/Makefile.inc
index 07fcf46ed5..9ba5634f87 100644
--- a/src/lib/libcrypto/arch/amd64/Makefile.inc
+++ b/src/lib/libcrypto/arch/amd64/Makefile.inc
@@ -1,4 +1,4 @@
1# $OpenBSD: Makefile.inc,v 1.31 2024/11/08 15:09:48 jsing Exp $ 1# $OpenBSD: Makefile.inc,v 1.32 2024/11/16 14:56:39 jsing Exp $
2 2
3# amd64-specific libcrypto build rules 3# amd64-specific libcrypto build rules
4 4
@@ -54,11 +54,8 @@ CFLAGS+= -DSHA256_ASM
54SRCS+= sha256_amd64.c 54SRCS+= sha256_amd64.c
55SRCS+= sha256_amd64_generic.S 55SRCS+= sha256_amd64_generic.S
56CFLAGS+= -DSHA512_ASM 56CFLAGS+= -DSHA512_ASM
57SRCS+= sha512-x86_64.S 57SRCS+= sha512_amd64.c
58GENERATED+= sha512-x86_64.S 58SRCS+= sha512_amd64_generic.S
59sha512-x86_64.S: ${LCRYPTO_SRC}/sha/asm/sha512-x86_64.pl ${EXTRA_PL}
60 cd ${LCRYPTO_SRC}/sha/asm ; \
61 /usr/bin/perl ./sha512-x86_64.pl ${.OBJDIR}/${.TARGET}
62 59
63.for dir f in ${SSLASM} 60.for dir f in ${SSLASM}
64SRCS+= ${f}.S 61SRCS+= ${f}.S
diff --git a/src/lib/libcrypto/sha/sha512_amd64.c b/src/lib/libcrypto/sha/sha512_amd64.c
new file mode 100644
index 0000000000..0b54243020
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha512_amd64.c
@@ -0,0 +1,26 @@
1/* $OpenBSD: sha512_amd64.c,v 1.1 2024/11/16 14:56:39 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <openssl/sha.h>
19
20void sha512_block_generic(SHA512_CTX *ctx, const void *in, size_t num);
21
22void
23sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num)
24{
25 sha512_block_generic(ctx, in, num);
26}
diff --git a/src/lib/libcrypto/sha/sha512_amd64_generic.S b/src/lib/libcrypto/sha/sha512_amd64_generic.S
new file mode 100644
index 0000000000..8419d60b8e
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha512_amd64_generic.S
@@ -0,0 +1,307 @@
1/* $OpenBSD: sha512_amd64_generic.S,v 1.1 2024/11/16 14:56:39 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#ifdef __CET__
19#include <cet.h>
20#else
21#define _CET_ENDBR
22#endif
23
24#define ctx %rdi
25#define in %rsi
26#define num %rdx
27
28#define round %rdi
29
30#define hs0 %r8
31#define hs1 %r9
32#define hs2 %r10
33#define hs3 %r11
34#define hs4 %r12
35#define hs5 %r13
36#define hs6 %r14
37#define hs7 %r15
38
39#define k512 %rbp
40
41#define tmp0 %rax
42#define tmp1 %rbx
43#define tmp2 %rcx
44#define tmp3 %rdx
45
46/*
47 * Load message into wt, storing a copy in the message schedule:
48 *
49 * Wt = Mt
50 */
51#define sha512_message_schedule_load(idx, m, w, wt) \
52 movq (m, round, 8), wt; \
53 bswapq wt; \
54 movq wt, ((idx&0xf)*8)(w);
55
56/*
57 * Update message schedule and return current value in wt:
58 *
59 * Wt = sigma1(W(t-2)) + W(t-7) + sigma0(W(t-15)) + W(t-16)
60 *
61 * sigma0(x) = ror(x, 1) ^ ror(x, 8) ^ (x >> 7)
62 * sigma1(x) = ror(x, 19) ^ ror(x, 61) ^ (x >> 6)
63 *
64 */
65#define sha512_message_schedule_update(idx, w, wt) \
66 movq (((idx-2)&0xf)*8)(w), wt; /* sigma1 */ \
67 movq wt, tmp1; /* sigma1 */ \
68 rorq $(61-19), tmp1; /* sigma1 */ \
69 xorq wt, tmp1; /* sigma1 */ \
70 rorq $19, tmp1; /* sigma1 */ \
71 shrq $6, wt; /* sigma1 */ \
72 xorq tmp1, wt; /* sigma1 */ \
73 \
74 addq (((idx-7)&0xf)*8)(w), wt; /* Wt-7 */ \
75 addq (((idx-16)&0xf)*8)(w), wt; /* Wt-16 */ \
76 \
77 movq (((idx-15)&0xf)*8)(w), tmp2; /* sigma0 */ \
78 movq tmp2, tmp3; /* sigma0 */ \
79 rorq $(8-1), tmp2; /* sigma0 */ \
80 xorq tmp3, tmp2; /* sigma0 */ \
81 rorq $1, tmp2; /* sigma0 */ \
82 shrq $7, tmp3; /* sigma0 */ \
83 xorq tmp3, tmp2; /* sigma0 */ \
84 addq tmp2, wt; /* sigma0 */ \
85 \
86 movq wt, ((idx&0xf)*8)(w);
87
88/*
89 * Compute a SHA-512 round:
90 *
91 * T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt
92 * T2 = Sigma0(a) + Maj(a, b, c)
93 *
94 * Sigma0(x) = ror(x, 28) ^ ror(x, 34) ^ ror(x, 39)
95 * Sigma1(x) = ror(x, 14) ^ ror(x, 18) ^ ror(x, 41)
96 * Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
97 * Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
98 *
99 * Upon completion d = d + T1, h = T1 + T2, pending rotation.
100 */
101#define sha512_round(idx, a, b, c, d, e, f, g, h, k, w, wt) \
102 addq wt, h; /* T1 Wt */ \
103 addq (k512, round, 8), h; /* T1 Kt */ \
104 \
105 movq e, tmp1; /* T1 Sigma1 */ \
106 rorq $(41-18), tmp1; /* T1 Sigma1 */ \
107 xorq e, tmp1; /* T1 Sigma1 */ \
108 rorq $(18-14), tmp1; /* T1 Sigma1 */ \
109 xorq e, tmp1; /* T1 Sigma1 */ \
110 rorq $14, tmp1; /* T1 Sigma1 */ \
111 addq tmp1, h; /* T1 Sigma1 */ \
112 \
113 movq f, tmp2; /* T1 Ch */ \
114 xorq g, tmp2; /* T1 Ch */ \
115 andq e, tmp2; /* T1 Ch */ \
116 xorq g, tmp2; /* T1 Ch */ \
117 addq tmp2, h; /* T1 Ch */ \
118 \
119 addq h, d; /* d += T1 */ \
120 \
121 movq a, tmp1; /* T2 Sigma0 */ \
122 rorq $(39-34), tmp1; /* T2 Sigma0 */ \
123 xorq a, tmp1; /* T2 Sigma0 */ \
124 rorq $(34-28), tmp1; /* T2 Sigma0 */ \
125 xorq a, tmp1; /* T2 Sigma0 */ \
126 rorq $28, tmp1; /* T2 Sigma0 */ \
127 addq tmp1, h; /* T2 Sigma0 */ \
128 \
129 movq b, tmp2; /* T2 Maj */ \
130 xorq c, tmp2; /* T2 Maj */ \
131 andq a, tmp2; /* T2 Maj */ \
132 movq b, tmp3; /* T2 Maj */ \
133 andq c, tmp3; /* T2 Maj */ \
134 xorq tmp2, tmp3; /* T2 Maj */ \
135 addq tmp3, h; /* T2 Maj */ \
136 \
137 addq $1, round;
138
139#define sha512_round_load(idx, a, b, c, d, e, f, g, h) \
140 sha512_message_schedule_load(idx, in, %rsp, tmp0) \
141 sha512_round(idx, a, b, c, d, e, f, g, h, k512, %rsp, tmp0)
142
143#define sha512_round_update(idx, a, b, c, d, e, f, g, h) \
144 sha512_message_schedule_update(idx, %rsp, tmp0) \
145 sha512_round(idx, a, b, c, d, e, f, g, h, k512, %rsp, tmp0)
146
147.text
148
149/*
150 * void sha512_block_generic(SHA512_CTX *ctx, const void *in, size_t num);
151 *
152 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
153 */
154.align 16
155.globl sha512_block_generic
156.type sha512_block_generic,@function
157sha512_block_generic:
158 _CET_ENDBR
159
160 /* Save callee save registers. */
161 pushq %rbx
162 pushq %rbp
163 pushq %r12
164 pushq %r13
165 pushq %r14
166 pushq %r15
167
168 /* Allocate space for message schedule and context pointer. */
169 movq %rsp, %rax
170 subq $(128+3*8), %rsp
171 andq $~63, %rsp
172 movq %rax, (128+2*8)(%rsp)
173 movq ctx, (128+1*8)(%rsp)
174
175 /* Compute and store end of message. */
176 shlq $7, num
177 leaq (in, num, 1), %rbx
178 movq %rbx, (128+0*8)(%rsp)
179
180 /* Address of SHA-512 constants. */
181 leaq K512(%rip), k512
182
183 /* Load current hash state from context. */
184 movq (0*8)(ctx), hs0
185 movq (1*8)(ctx), hs1
186 movq (2*8)(ctx), hs2
187 movq (3*8)(ctx), hs3
188 movq (4*8)(ctx), hs4
189 movq (5*8)(ctx), hs5
190 movq (6*8)(ctx), hs6
191 movq (7*8)(ctx), hs7
192
193 jmp .Lblock_loop0
194
195.align 16
196.Lblock_loop0:
197 mov $0, round
198
199 /* Round 0 through 15. */
200 sha512_round_load(0, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
201 sha512_round_load(1, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
202 sha512_round_load(2, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
203 sha512_round_load(3, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
204 sha512_round_load(4, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
205 sha512_round_load(5, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
206 sha512_round_load(6, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
207 sha512_round_load(7, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
208 sha512_round_load(8, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
209 sha512_round_load(9, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
210 sha512_round_load(10, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
211 sha512_round_load(11, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
212 sha512_round_load(12, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
213 sha512_round_load(13, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
214 sha512_round_load(14, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
215 sha512_round_load(15, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
216
217 jmp .Lblock_loop16
218
219.align 16
220.Lblock_loop16:
221 /* Round 16 through 79. */
222 sha512_round_update(16, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
223 sha512_round_update(17, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
224 sha512_round_update(18, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
225 sha512_round_update(19, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
226 sha512_round_update(20, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
227 sha512_round_update(21, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
228 sha512_round_update(22, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
229 sha512_round_update(23, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
230 sha512_round_update(24, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
231 sha512_round_update(25, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
232 sha512_round_update(26, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
233 sha512_round_update(27, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
234 sha512_round_update(28, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
235 sha512_round_update(29, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
236 sha512_round_update(30, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
237 sha512_round_update(31, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
238
239 cmp $80, round
240 jb .Lblock_loop16
241
242 movq (128+1*8)(%rsp), ctx
243
244 /* Add intermediate state to hash state. */
245 addq (0*8)(ctx), hs0
246 addq (1*8)(ctx), hs1
247 addq (2*8)(ctx), hs2
248 addq (3*8)(ctx), hs3
249 addq (4*8)(ctx), hs4
250 addq (5*8)(ctx), hs5
251 addq (6*8)(ctx), hs6
252 addq (7*8)(ctx), hs7
253
254 /* Store new hash state to context. */
255 movq hs0, (0*8)(ctx)
256 movq hs1, (1*8)(ctx)
257 movq hs2, (2*8)(ctx)
258 movq hs3, (3*8)(ctx)
259 movq hs4, (4*8)(ctx)
260 movq hs5, (5*8)(ctx)
261 movq hs6, (6*8)(ctx)
262 movq hs7, (7*8)(ctx)
263
264 addq $128, in
265 cmpq (128+0*8)(%rsp), in
266 jb .Lblock_loop0
267
268 movq (128+2*8)(%rsp), %rsp
269
270 /* Restore callee save registers. */
271 popq %r15
272 popq %r14
273 popq %r13
274 popq %r12
275 popq %rbp
276 popq %rbx
277
278 ret
279
280/*
281 * SHA-512 constants - see FIPS 180-4 section 4.2.3.
282 */
283.rodata
284.align 64
285.type K512,@object
286K512:
287.quad 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
288.quad 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
289.quad 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
290.quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694
291.quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
292.quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
293.quad 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4
294.quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70
295.quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
296.quad 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b
297.quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30
298.quad 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8
299.quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
300.quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
301.quad 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec
302.quad 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b
303.quad 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
304.quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b
305.quad 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
306.quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
307.size K512,.-K512