summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorjsing <>2025-03-07 14:21:22 +0000
committerjsing <>2025-03-07 14:21:22 +0000
commit98a22bb72e4a20765f43cd5778f45ccd8072fa26 (patch)
treecdaa4718601092c89a7ce80734ed3833143670e9 /src
parent44ed550ca5e0b6e9358fa1dc3fe19cf4014eb338 (diff)
downloadopenbsd-98a22bb72e4a20765f43cd5778f45ccd8072fa26.tar.gz
openbsd-98a22bb72e4a20765f43cd5778f45ccd8072fa26.tar.bz2
openbsd-98a22bb72e4a20765f43cd5778f45ccd8072fa26.zip
Provide an accelerated SHA-256 assembly implementation for aarch64.
This provides a SHA-256 assembly implementation that makes use of the ARM Cryptographic Extension (CE), which is found on many arm64 CPUs. This gives a performance gain of up to 7.5x on an Apple M2 (dependent on block size). If an aarch64 machine does not have SHA2 support, then we'll fall back to using the existing C implementation. ok kettenis@ tb@
Diffstat (limited to 'src')
-rw-r--r--src/lib/libcrypto/arch/aarch64/Makefile.inc5
-rw-r--r--src/lib/libcrypto/arch/aarch64/crypto_arch.h6
-rw-r--r--src/lib/libcrypto/sha/sha256_aarch64.c34
-rw-r--r--src/lib/libcrypto/sha/sha256_aarch64_ce.S189
4 files changed, 232 insertions, 2 deletions
diff --git a/src/lib/libcrypto/arch/aarch64/Makefile.inc b/src/lib/libcrypto/arch/aarch64/Makefile.inc
index 20a634dc99..41198b069f 100644
--- a/src/lib/libcrypto/arch/aarch64/Makefile.inc
+++ b/src/lib/libcrypto/arch/aarch64/Makefile.inc
@@ -1,7 +1,10 @@
1# $OpenBSD: Makefile.inc,v 1.14 2024/11/08 13:34:24 jsing Exp $ 1# $OpenBSD: Makefile.inc,v 1.15 2025/03/07 14:21:22 jsing Exp $
2 2
3# aarch64-specific libcrypto build rules 3# aarch64-specific libcrypto build rules
4 4
5SRCS += crypto_cpu_caps.c 5SRCS += crypto_cpu_caps.c
6 6
7SRCS += sha256_aarch64.c
8SRCS += sha256_aarch64_ce.S
9
7AFLAGS+= -mmark-bti-property 10AFLAGS+= -mmark-bti-property
diff --git a/src/lib/libcrypto/arch/aarch64/crypto_arch.h b/src/lib/libcrypto/arch/aarch64/crypto_arch.h
index b0188c498a..adc91cd19f 100644
--- a/src/lib/libcrypto/arch/aarch64/crypto_arch.h
+++ b/src/lib/libcrypto/arch/aarch64/crypto_arch.h
@@ -1,4 +1,4 @@
1/* $OpenBSD: crypto_arch.h,v 1.2 2024/11/08 13:34:24 jsing Exp $ */ 1/* $OpenBSD: crypto_arch.h,v 1.3 2025/03/07 14:21:22 jsing Exp $ */
2/* 2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> 3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 * 4 *
@@ -33,4 +33,8 @@ extern uint64_t crypto_cpu_caps_aarch64;
33#define CRYPTO_CPU_CAPS_AARCH64_SHA512 (1ULL << 4) 33#define CRYPTO_CPU_CAPS_AARCH64_SHA512 (1ULL << 4)
34#define CRYPTO_CPU_CAPS_AARCH64_SHA3 (1ULL << 5) 34#define CRYPTO_CPU_CAPS_AARCH64_SHA3 (1ULL << 5)
35 35
36#ifndef OPENSSL_NO_ASM
37#define HAVE_SHA256_BLOCK_DATA_ORDER
38#endif
39
36#endif 40#endif
diff --git a/src/lib/libcrypto/sha/sha256_aarch64.c b/src/lib/libcrypto/sha/sha256_aarch64.c
new file mode 100644
index 0000000000..ecac64390d
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha256_aarch64.c
@@ -0,0 +1,34 @@
1/* $OpenBSD: sha256_aarch64.c,v 1.1 2025/03/07 14:21:22 jsing Exp $ */
2/*
3 * Copyright (c) 2025 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#include <openssl/sha.h>
19
20#include "crypto_arch.h"
21
22void sha256_block_ce(SHA256_CTX *ctx, const void *in, size_t num);
23void sha256_block_generic(SHA256_CTX *ctx, const void *in, size_t num);
24
25void
26sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num)
27{
28 if ((crypto_cpu_caps_aarch64 & CRYPTO_CPU_CAPS_AARCH64_SHA2) != 0) {
29 sha256_block_ce(ctx, in, num);
30 return;
31 }
32
33 sha256_block_generic(ctx, in, num);
34}
diff --git a/src/lib/libcrypto/sha/sha256_aarch64_ce.S b/src/lib/libcrypto/sha/sha256_aarch64_ce.S
new file mode 100644
index 0000000000..a1644bb7a7
--- /dev/null
+++ b/src/lib/libcrypto/sha/sha256_aarch64_ce.S
@@ -0,0 +1,189 @@
1/* $OpenBSD: sha256_aarch64_ce.S,v 1.1 2025/03/07 14:21:22 jsing Exp $ */
2/*
3 * Copyright (c) 2023,2025 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18/*
19 * SHA-256 implementation using the ARM Cryptographic Extension (CE).
20 *
21 * There are four instructions that enable hardware acceleration of SHA-256,
22 * however the documentation for these is woefully inadequate:
23 *
24 * sha256h: hash update - part 1 (without a number to be inconsistent)
25 * sha256h2: hash update - part 2
26 * sha256su0: message schedule update with sigma0 for four rounds
27 * sha256su1: message schedule update with sigma1 for four rounds
28 */
29
/* Function arguments, in the order they appear in the C prototype. */
#define ctx		x0
#define in		x1
#define num		x2

/* Start of the K256 table and a cursor that is advanced through it. */
#define k256_base	x9
#define k256		x10

/*
 * Vector registers are allocated from v16 upwards. The lower 64 bits of v8
 * through v15 are callee save, hence none of those registers are used.
 */

/* Hash state carried from one block to the next. */
#define hc0		v16
#define hc1		v17

/* Working copy of the hash state, modified by the rounds of one block. */
#define hs0		v18
#define hs1		v19

/* Message schedule - sixteen 32 bit words, four per register. */
#define w0		v20
#define w1		v21
#define w2		v22
#define w3		v23

/* Round constants - sixteen 32 bit words, four per register. */
#define k0		v24
#define k1		v25
#define k2		v26
#define k3		v27

/* Scratch registers used within sha256_round. */
#define tmp0		v28
#define tmp1		v29
57
58/*
59 * Update message schedule for m0 (W0:W1:W2:W3), using m1 (W4:W5:W6:W7),
60 * m2 (W8:W9:W10:W11) and m3 (W12:W13:W14:W15). The sha256su0 instruction
61 * computes the sigma0 component of the message schedule update as:
62 * W0:W1:W2:W3 = sigma0(W1:W2:W3:W4) + W0:W1:W2:W3
63 * while sha256su1 computes the sigma1 component and adds in W9 as:
64 * W0:W1:W2:W3 = sigma1(W14:W15:W0:W1) + W9:W10:W11:W12 + W0:W1:W2:W3
65 */
#define sha256_message_schedule_update(m0, m1, m2, m3) \
	sha256su0	m0.4s, m1.4s;		/* sigma0 part, reads m0 and m1 */ \
	sha256su1	m0.4s, m2.4s, m3.4s;	/* sigma1 part, reads m2 and m3 */
69
70/*
71 * Compute four SHA-256 rounds by adding W0:W1:W2:W3 + K0:K1:K2:K3, then
72 * computing the remainder of each round (including the shuffle) via
73 * sha256h/sha256h2.
74 */
#define sha256_round(abcd, efgh, wt, kt) \
	add		tmp0.4s, wt.4s, kt.4s;	/* Tt = Wt + Kt */ \
	mov		tmp1.4s, abcd.4s;	/* keep a copy for sha256h2 */ \
	sha256h		abcd, efgh, tmp0.4s;	/* hash update - part 1 */ \
	sha256h2	efgh, tmp1, tmp0.4s;	/* hash update - part 2 */
80
/*
 * Update the message schedule words held in m0 (using m1, m2 and m3) and
 * then compute four rounds with the updated m0 and the round constants in kt.
 * The first macro expands to complete statements, so no separator is needed
 * between the two.
 */
#define sha256_round_update(abcd, efgh, m0, m1, m2, m3, kt) \
	sha256_message_schedule_update(m0, m1, m2, m3) \
	sha256_round(abcd, efgh, m0, kt)
84
.cpu generic+sha2

.text

/*
 * void sha256_block_ce(SHA256_CTX *ctx, const void *in, size_t num);
 *
 * Standard ARM ABI: x0 = ctx, x1 = in, x2 = num
 *
 * Processes num consecutive 64 byte blocks starting at in. The hash state is
 * the first 32 bytes at ctx (eight 32 bit words); it is loaded once, kept in
 * hc0/hc1 across all blocks and stored back once after the final block. The
 * input is read as bytes and byte swapped per 32 bit word before use.
 *
 * If num is zero the function returns immediately, without reading from in
 * and without loading or storing the hash state.
 */
.globl	sha256_block_ce
.type	sha256_block_ce,@function
sha256_block_ce:

	/*
	 * The block loop tests num only after a block has been processed, so
	 * it must not be entered with a count of zero.
	 */
	cbz	num, block_done

	/* Address of SHA-256 constants. */
	adrp	k256_base, K256
	add	k256_base, k256_base, :lo12:K256

	/*
	 * Load current hash state from context.
	 * hc0 = a:b:c:d, hc1 = e:f:g:h
	 */
	ld1	{hc0.4s, hc1.4s}, [ctx]

block_loop:
	/* Rewind the constant cursor to the start of K256 for this block. */
	mov	k256, k256_base

	/* Copy current hash state. */
	mov	hs0.4s, hc0.4s
	mov	hs1.4s, hc1.4s

	/* Load and byte swap message schedule, advancing in by 64 bytes. */
	ld1	{w0.16b, w1.16b, w2.16b, w3.16b}, [in], #64
	rev32	w0.16b, w0.16b
	rev32	w1.16b, w1.16b
	rev32	w2.16b, w2.16b
	rev32	w3.16b, w3.16b

	/*
	 * Rounds 0 through 15 (four rounds at a time). These consume the
	 * message words as loaded, so no schedule update is needed.
	 */
	ld1	{k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64

	sha256_round(hs0, hs1, w0, k0)
	sha256_round(hs0, hs1, w1, k1)
	sha256_round(hs0, hs1, w2, k2)
	sha256_round(hs0, hs1, w3, k3)

	/*
	 * Rounds 16 through 31 (four rounds at a time). From here on each
	 * step first updates four schedule words in place and the register
	 * arguments rotate by one position per step.
	 */
	ld1	{k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64

	sha256_round_update(hs0, hs1, w0, w1, w2, w3, k0)
	sha256_round_update(hs0, hs1, w1, w2, w3, w0, k1)
	sha256_round_update(hs0, hs1, w2, w3, w0, w1, k2)
	sha256_round_update(hs0, hs1, w3, w0, w1, w2, k3)

	/* Rounds 32 through 47 (four rounds at a time). */
	ld1	{k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64

	sha256_round_update(hs0, hs1, w0, w1, w2, w3, k0)
	sha256_round_update(hs0, hs1, w1, w2, w3, w0, k1)
	sha256_round_update(hs0, hs1, w2, w3, w0, w1, k2)
	sha256_round_update(hs0, hs1, w3, w0, w1, w2, k3)

	/* Rounds 48 through 63 (four rounds at a time). */
	ld1	{k0.4s, k1.4s, k2.4s, k3.4s}, [k256], #64

	sha256_round_update(hs0, hs1, w0, w1, w2, w3, k0)
	sha256_round_update(hs0, hs1, w1, w2, w3, w0, k1)
	sha256_round_update(hs0, hs1, w2, w3, w0, w1, k2)
	sha256_round_update(hs0, hs1, w3, w0, w1, w2, k3)

	/* Add intermediate state to hash state. */
	add	hc0.4s, hc0.4s, hs0.4s
	add	hc1.4s, hc1.4s, hs1.4s

	/* One block done - go around again while any remain. */
	sub	num, num, #1
	cbnz	num, block_loop

	/* Store hash state to context. */
	st1	{hc0.4s, hc1.4s}, [ctx]

block_done:
	ret

.size	sha256_block_ce,.-sha256_block_ce
165
/*
 * SHA-256 round constants K0 through K63 - see FIPS 180-4 section 4.2.2.
 *
 * Stored as 64 consecutive 32 bit words in round order. Each row below holds
 * the constants for four rounds; sha256_block_ce reads the table sixteen
 * words (four rows) at a time.
 */
.rodata
.align	4
.type	K256,@object
K256:
	/* Rounds 0 through 15. */
	.long	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.long	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.long	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.long	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174

	/* Rounds 16 through 31. */
	.long	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.long	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.long	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.long	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967

	/* Rounds 32 through 47. */
	.long	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.long	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.long	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.long	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070

	/* Rounds 48 through 63. */
	.long	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.long	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.long	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.long	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
.size	K256,.-K256