diff options
author | jsing <> | 2025-03-12 14:13:41 +0000 |
---|---|---|
committer | jsing <> | 2025-03-12 14:13:41 +0000 |
commit | e7d0828785b67e44a3306faac670eee789fc7b10 (patch) | |
tree | b0d2ca8b4fd6a1b59a0984597c85260e091e41e2 /src/lib | |
parent | 8627628c24553fa0821bff8f761d620577f97c3b (diff) | |
download | openbsd-e7d0828785b67e44a3306faac670eee789fc7b10.tar.gz openbsd-e7d0828785b67e44a3306faac670eee789fc7b10.tar.bz2 openbsd-e7d0828785b67e44a3306faac670eee789fc7b10.zip |
Provide an accelerated SHA-512 assembly implementation for aarch64.
This provides a SHA-512 assembly implementation that makes use of the ARM
Cryptographic Extension (CE), which is found on many arm64 CPUs. This gives
a performance gain of up to 2.5x on an Apple M2 (dependent on block size).
If an aarch64 machine does not have SHA512 support, then we'll fall back to
using the existing C implementation.
ok kettenis@ tb@
Diffstat (limited to 'src/lib')
-rw-r--r-- | src/lib/libcrypto/arch/aarch64/Makefile.inc | 4 | ||||
-rw-r--r-- | src/lib/libcrypto/arch/aarch64/crypto_arch.h | 5 | ||||
-rw-r--r-- | src/lib/libcrypto/sha/sha512_aarch64.c | 34 | ||||
-rw-r--r-- | src/lib/libcrypto/sha/sha512_aarch64_ce.S | 312 |
4 files changed, 353 insertions, 2 deletions
diff --git a/src/lib/libcrypto/arch/aarch64/Makefile.inc b/src/lib/libcrypto/arch/aarch64/Makefile.inc index 41198b069f..d93cb815ef 100644 --- a/src/lib/libcrypto/arch/aarch64/Makefile.inc +++ b/src/lib/libcrypto/arch/aarch64/Makefile.inc | |||
@@ -1,4 +1,4 @@ | |||
1 | # $OpenBSD: Makefile.inc,v 1.15 2025/03/07 14:21:22 jsing Exp $ | 1 | # $OpenBSD: Makefile.inc,v 1.16 2025/03/12 14:13:41 jsing Exp $ |
2 | 2 | ||
3 | # aarch64-specific libcrypto build rules | 3 | # aarch64-specific libcrypto build rules |
4 | 4 | ||
@@ -6,5 +6,7 @@ SRCS += crypto_cpu_caps.c | |||
6 | 6 | ||
7 | SRCS += sha256_aarch64.c | 7 | SRCS += sha256_aarch64.c |
8 | SRCS += sha256_aarch64_ce.S | 8 | SRCS += sha256_aarch64_ce.S |
9 | SRCS += sha512_aarch64.c | ||
10 | SRCS += sha512_aarch64_ce.S | ||
9 | 11 | ||
10 | AFLAGS+= -mmark-bti-property | 12 | AFLAGS+= -mmark-bti-property |
diff --git a/src/lib/libcrypto/arch/aarch64/crypto_arch.h b/src/lib/libcrypto/arch/aarch64/crypto_arch.h index adc91cd19f..35ecba9394 100644 --- a/src/lib/libcrypto/arch/aarch64/crypto_arch.h +++ b/src/lib/libcrypto/arch/aarch64/crypto_arch.h | |||
@@ -1,4 +1,4 @@ | |||
1 | /* $OpenBSD: crypto_arch.h,v 1.3 2025/03/07 14:21:22 jsing Exp $ */ | 1 | /* $OpenBSD: crypto_arch.h,v 1.4 2025/03/12 14:13:41 jsing Exp $ */ |
2 | /* | 2 | /* |
3 | * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> | 3 | * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> |
4 | * | 4 | * |
@@ -34,7 +34,10 @@ extern uint64_t crypto_cpu_caps_aarch64; | |||
34 | #define CRYPTO_CPU_CAPS_AARCH64_SHA3 (1ULL << 5) | 34 | #define CRYPTO_CPU_CAPS_AARCH64_SHA3 (1ULL << 5) |
35 | 35 | ||
36 | #ifndef OPENSSL_NO_ASM | 36 | #ifndef OPENSSL_NO_ASM |
37 | |||
37 | #define HAVE_SHA256_BLOCK_DATA_ORDER | 38 | #define HAVE_SHA256_BLOCK_DATA_ORDER |
39 | #define HAVE_SHA512_BLOCK_DATA_ORDER | ||
40 | |||
38 | #endif | 41 | #endif |
39 | 42 | ||
40 | #endif | 43 | #endif |
diff --git a/src/lib/libcrypto/sha/sha512_aarch64.c b/src/lib/libcrypto/sha/sha512_aarch64.c new file mode 100644 index 0000000000..3c997e3e89 --- /dev/null +++ b/src/lib/libcrypto/sha/sha512_aarch64.c | |||
@@ -0,0 +1,34 @@ | |||
1 | /* $OpenBSD: sha512_aarch64.c,v 1.1 2025/03/12 14:13:41 jsing Exp $ */ | ||
2 | /* | ||
3 | * Copyright (c) 2024 Joel Sing <jsing@openbsd.org> | ||
4 | * | ||
5 | * Permission to use, copy, modify, and distribute this software for any | ||
6 | * purpose with or without fee is hereby granted, provided that the above | ||
7 | * copyright notice and this permission notice appear in all copies. | ||
8 | * | ||
9 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | */ | ||
17 | |||
18 | #include <openssl/sha.h> | ||
19 | |||
20 | #include "crypto_arch.h" | ||
21 | |||
/*
 * Block-function backends. sha512_block_ce is the Cryptographic Extension
 * assembly routine from sha512_aarch64_ce.S; sha512_block_generic is defined
 * outside this file.
 */
22 | void sha512_block_ce(SHA512_CTX *ctx, const void *in, size_t num); | |
23 | void sha512_block_generic(SHA512_CTX *ctx, const void *in, size_t num); | |
24 | |
/*
 * Runtime dispatch for the SHA-512 block function: use the CE assembly when
 * crypto_cpu_caps_aarch64 has CRYPTO_CPU_CAPS_AARCH64_SHA512 set, otherwise
 * fall back to the generic implementation. ctx, in and num are forwarded
 * unchanged to whichever backend is selected.
 */
25 | void | |
26 | sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num) | |
27 | { | |
28 | if ((crypto_cpu_caps_aarch64 & CRYPTO_CPU_CAPS_AARCH64_SHA512) != 0) { | |
29 | sha512_block_ce(ctx, in, num); | |
30 | return; | |
31 | } | |
32 | |
33 | sha512_block_generic(ctx, in, num); | |
34 | } | |
diff --git a/src/lib/libcrypto/sha/sha512_aarch64_ce.S b/src/lib/libcrypto/sha/sha512_aarch64_ce.S new file mode 100644 index 0000000000..89109a78ba --- /dev/null +++ b/src/lib/libcrypto/sha/sha512_aarch64_ce.S | |||
@@ -0,0 +1,312 @@ | |||
1 | /* $OpenBSD: sha512_aarch64_ce.S,v 1.1 2025/03/12 14:13:41 jsing Exp $ */ | ||
2 | /* | ||
3 | * Copyright (c) 2023,2025 Joel Sing <jsing@openbsd.org> | ||
4 | * | ||
5 | * Permission to use, copy, modify, and distribute this software for any | ||
6 | * purpose with or without fee is hereby granted, provided that the above | ||
7 | * copyright notice and this permission notice appear in all copies. | ||
8 | * | ||
9 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
10 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
11 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
12 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
13 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
14 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
15 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
16 | */ | ||
17 | |||
18 | /* | ||
19 | * SHA-512 implementation using the ARM Cryptographic Extension (CE). | ||
20 | * | ||
21 | * The documentation for these is rather inadequate - each instruction is | ||
22 | * described in a mechanical sense, however their combined usage does not | ||
23 | * seem to be detailed anywhere. | ||
24 | * | ||
25 | * There are four instructions that enable hardware acceleration of SHA-512: | ||
26 | * | ||
27 | * sha512h - hash update, part 1 (without a number to be inconsistent): | ||
28 | * inputs <W1:W0 + K1:K0 + g:h>, <f:g>, <d:e> | ||
29 | * output <T1 for W0, T1 for W1> | |
30 | * | ||
31 | * sha512h2 - hash update, part 2: | ||
32 | * inputs <T1 for W0, T1 for W1>, <c:d>, <a:b> | ||
33 | * output <T1 + T2 for W0, T1 + T2 for W1> | ||
34 | * | ||
35 | * sha512su0 - message schedule update with sigma0 for two rounds: | ||
36 | * inputs <W0:W1>, <W2:W3> | ||
37 | * output W0 += sigma0(W1), W1 += sigma0(W2) | ||
38 | * | ||
39 | * sha512su1 - message schedule update with sigma1 for two rounds: | ||
40 | * inputs <W0:W1>, <W14:W15>, <W9:W10> | ||
41 | * output W0 += sigma1(W14) + W9, W1 += sigma1(W15) + W10 | ||
42 | */ | ||
43 | |||
44 | #define ctx x0 | ||
45 | #define in x1 | ||
46 | #define num x2 | ||
47 | |||
48 | #define k512_base x3 | ||
49 | #define k512 x4 | ||
50 | |||
51 | /* Note: the lower 64 bits of v8 through v15 are callee-saved. */ | |
52 | |||
53 | #define hc0 v28 | ||
54 | #define hc1 v29 | ||
55 | #define hc2 v30 | ||
56 | #define hc3 v31 | ||
57 | |||
58 | #define hs0 v0 | ||
59 | #define hs1 v1 | ||
60 | #define hs2 v2 | ||
61 | #define hs3 v3 | ||
62 | #define hs4 v4 | ||
63 | #define hs5 v5 | ||
64 | #define hs6 v6 | ||
65 | #define hs7 v7 | ||
66 | |||
67 | #define w0 v10 | ||
68 | #define w1 v11 | ||
69 | #define w2 v12 | ||
70 | #define w3 v13 | ||
71 | #define w4 v14 | ||
72 | #define w5 v15 | ||
73 | #define w6 v16 | ||
74 | #define w7 v17 | ||
75 | |||
76 | #define k0 v20 | ||
77 | #define k1 v21 | ||
78 | #define k2 v22 | ||
79 | #define k3 v23 | ||
80 | #define k4 v24 | ||
81 | #define k5 v25 | ||
82 | #define k6 v26 | ||
83 | #define k7 v27 | ||
84 | |||
85 | #define tmp0 v8 | ||
86 | #define tmp1 v9 | ||
87 | #define tmp2 v18 | ||
88 | |||
89 | /* | ||
90 | * Update message schedule for m0 (W0:W1), using m1 (W2:W3), m4 (W8:W9), | ||
91 | * m5 (W10:W11) and m7 (W14:W15). The sha512su0 instruction computes the sigma0 | ||
92 | * component of the message schedule update as m0 = sigma0(m1) + m0, while | ||
93 | * sha512su1 computes the sigma1 component as m0 = sigma1(m7) + W9:W10 + m0. | ||
94 | * Note that W9:W10 is split across two registers, hence this needs to be | ||
95 | * constructed before it is passed to sha512su1: | ||
96 | * | ||
97 | * W0 = sigma1(W14) + W9 + sigma0(W1) + W0 | ||
98 | */ | ||
/* m0 is updated in place; tmp2 is overwritten with the assembled W9:W10 pair. */
99 | #define sha512_message_schedule_update(m0, m1, m4, m5, m7) \ | |
100 | sha512su0 m0.2d, m1.2d; /* W0 += sigma0(W1) */ \ | |
101 | ext tmp2.16b, m4.16b, m5.16b, #8; /* W9:W10 */ \ | |
102 | sha512su1 m0.2d, m7.2d, tmp2.2d; /* W0 += sigma1(W14) + W9 */ | |
103 | |||
104 | /* | ||
105 | * Compute two SHA-512 rounds by adding W0:W1 + K0:K1, then computing T1 for two | ||
106 | * rounds by swapping the double words, adding g:h and calling sha512h with this | ||
107 | * value (W1:W0 = W1:W0 + K1:K0 + g:h), f:g and d:e. The new e:f value is then | ||
108 | * computed by adding T1 + c:d (producing the next e:f values), before calling | ||
109 | * sha512h2 with T1, c:d and a:b, computing T1 + T2 for two rounds (producing | ||
110 | * the next a:b values): | ||
111 | * | ||
112 | * T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt | ||
113 | * T2 = Sigma0(a) + Maj(a, b, c) | ||
114 | * | ||
115 | * h = g | ||
116 | * g = f | ||
117 | * f = e | ||
118 | * e = d + T1 | ||
119 | * d = c | ||
120 | * c = b | ||
121 | * b = a | ||
122 | * a = T1 + T2 | ||
123 | * | ||
124 | * The inputs are: | ||
125 | * | ||
126 | * h0 = a:b | ||
127 | * h1 = c:d | ||
128 | * h2 = e:f | ||
129 | * h3 = g:h | ||
130 | * | ||
131 | * Producing the following outputs: | ||
132 | * | ||
133 | * h4 = next a:b | ||
134 | * h5 = next e:f | ||
135 | * | ||
136 | * These values are then rotated by the caller to perform the next two rounds. | ||
137 | */ | ||
/*
 * Scratch registers: tmp0 receives f:g and tmp1 receives d:e. h4 first
 * accumulates W + K + g:h, is turned into T1 by sha512h and finally into
 * T1 + T2 by sha512h2; h5 receives c:d + T1. w and k are only read.
 */
138 | #define sha512_round(h0, h1, h2, h3, h4, h5, w, k) \ | |
139 | add h4.2d, w.2d, k.2d; /* W0:W1 += K0:K1 */ \ | |
140 | ext h4.16b, h4.16b, h4.16b, #8; /* W1:W0 (swap) */ \ | |
141 | add h4.2d, h4.2d, h3.2d; /* W1:W0 += g:h */ \ | |
142 | ext tmp0.16b, h2.16b, h3.16b, #8; /* f:g */ \ | |
143 | ext tmp1.16b, h1.16b, h2.16b, #8; /* d:e */ \ | |
144 | sha512h h4, tmp0, tmp1.2d; /* T1 */ \ | |
145 | add h5.2d, h1.2d, h4.2d; /* c:d + T1 */ \ | |
146 | sha512h2 h4, h1, h0.2d; /* T1 + T2 */ | |
147 | |||
/*
 * Extend the message schedule for m0 in place, then run two rounds using the
 * updated m0 and the round constants in k. Note the parameter naming: m2, m3
 * and m4 here are forwarded as the m4, m5 and m7 arguments of
 * sha512_message_schedule_update.
 */
148 | #define sha512_round_update(h0, h1, h2, h3, h4, h5, m0, m1, m2, m3, m4, k) \ | |
149 | sha512_message_schedule_update(m0, m1, m2, m3, m4) \ | |
150 | sha512_round(h0, h1, h2, h3, h4, h5, m0, k) | |
151 | |||
152 | .arch armv8-a+sha3 | ||
153 | |||
154 | .text | ||
155 | |||
156 | /* | ||
157 | * void sha512_block_ce(SHA512_CTX *ctx, const void *in, size_t num); | ||
158 | * | ||
159 | * Standard ARM ABI: x0 = ctx, x1 = in, x2 = num | ||
160 | */ | ||
161 | .globl sha512_block_ce | |
162 | sha512_block_ce: | |
/*
 * NOTE(review): num is only tested after a block has been processed (see the
 * sub/cbnz at the end of block_loop), so num == 0 would still consume a block
 * and wrap the counter. Callers presumably guarantee num >= 1 - confirm.
 */
163 | |
/* v8 through v15 are overwritten below: they back tmp0, tmp1 and w0-w5. */
164 | /* Save low 64 bits of v8 through v15 to the stack. */ | |
165 | sub sp, sp, #32 | |
166 | st4 {v8.d, v9.d, v10.d, v11.d}[0], [sp] | |
167 | sub sp, sp, #32 | |
168 | st4 {v12.d, v13.d, v14.d, v15.d}[0], [sp] | |
169 | |
170 | /* Address of SHA-512 constants. */ | |
171 | adrp k512_base, K512 | |
172 | add k512_base, k512_base, :lo12:K512 | |
173 | |
174 | /* | |
175 | * Load current hash state from context. | |
176 | * hc0 = a:b, hc1 = c:d, hc2 = e:f, hc3 = g:h | |
177 | */ | |
178 | ld1 {hc0.2d, hc1.2d, hc2.2d, hc3.2d}, [ctx] | |
179 | |
180 | block_loop: | |
/* Rewind the constant pointer: every block walks the K512 table from its start. */
181 | mov k512, k512_base | |
182 | |
183 | /* Copy current hash state. */ | |
184 | mov hs0.2d, hc0.2d | |
185 | mov hs1.2d, hc1.2d | |
186 | mov hs2.2d, hc2.2d | |
187 | mov hs3.2d, hc3.2d | |
188 | |
/* in advances by 128 bytes per block (two post-indexed 64-byte loads). */
189 | /* Load and byte swap message schedule. */ | |
190 | ld1 {w0.16b, w1.16b, w2.16b, w3.16b}, [in], #64 | |
191 | rev64 w0.16b, w0.16b | |
192 | rev64 w1.16b, w1.16b | |
193 | rev64 w2.16b, w2.16b | |
194 | rev64 w3.16b, w3.16b | |
195 | |
196 | ld1 {w4.2d, w5.2d, w6.2d, w7.2d}, [in], #64 | |
197 | rev64 w4.16b, w4.16b | |
198 | rev64 w5.16b, w5.16b | |
199 | rev64 w6.16b, w6.16b | |
200 | rev64 w7.16b, w7.16b | |
201 | |
/*
 * Each sha512_round writes the next a:b and e:f into its h4/h5 arguments, so
 * the roles of hs0-hs7 rotate from call to call. The rotation has a period of
 * four calls, which is why every group of eight ends with the state back in
 * hs0 (a:b), hs1 (c:d), hs2 (e:f) and hs3 (g:h).
 */
202 | /* Rounds 0 through 15 (two rounds at a time). */ | |
203 | ld1 {k0.2d, k1.2d, k2.2d, k3.2d}, [k512], #64 | |
204 | ld1 {k4.2d, k5.2d, k6.2d, k7.2d}, [k512], #64 | |
205 | |
206 | sha512_round(hs0, hs1, hs2, hs3, hs4, hs5, w0, k0) | |
207 | sha512_round(hs4, hs0, hs5, hs2, hs6, hs7, w1, k1) | |
208 | sha512_round(hs6, hs4, hs7, hs5, hs1, hs3, w2, k2) | |
209 | sha512_round(hs1, hs6, hs3, hs7, hs0, hs2, w3, k3) | |
210 | sha512_round(hs0, hs1, hs2, hs3, hs4, hs5, w4, k4) | |
211 | sha512_round(hs4, hs0, hs5, hs2, hs6, hs7, w5, k5) | |
212 | sha512_round(hs6, hs4, hs7, hs5, hs1, hs3, w6, k6) | |
213 | sha512_round(hs1, hs6, hs3, hs7, hs0, hs2, w7, k7) | |
214 | |
/* From here on each pair of rounds first extends the message schedule in place. */
215 | /* Rounds 16 through 31 (two rounds at a time). */ | |
216 | ld1 {k0.2d, k1.2d, k2.2d, k3.2d}, [k512], #64 | |
217 | ld1 {k4.2d, k5.2d, k6.2d, k7.2d}, [k512], #64 | |
218 | |
219 | sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w0, w1, w4, w5, w7, k0) | |
220 | sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w1, w2, w5, w6, w0, k1) | |
221 | sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w2, w3, w6, w7, w1, k2) | |
222 | sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w3, w4, w7, w0, w2, k3) | |
223 | sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w4, w5, w0, w1, w3, k4) | |
224 | sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w5, w6, w1, w2, w4, k5) | |
225 | sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w6, w7, w2, w3, w5, k6) | |
226 | sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w7, w0, w3, w4, w6, k7) | |
227 | |
228 | /* Rounds 32 through 47 (two rounds at a time). */ | |
229 | ld1 {k0.2d, k1.2d, k2.2d, k3.2d}, [k512], #64 | |
230 | ld1 {k4.2d, k5.2d, k6.2d, k7.2d}, [k512], #64 | |
231 | |
232 | sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w0, w1, w4, w5, w7, k0) | |
233 | sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w1, w2, w5, w6, w0, k1) | |
234 | sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w2, w3, w6, w7, w1, k2) | |
235 | sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w3, w4, w7, w0, w2, k3) | |
236 | sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w4, w5, w0, w1, w3, k4) | |
237 | sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w5, w6, w1, w2, w4, k5) | |
238 | sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w6, w7, w2, w3, w5, k6) | |
239 | sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w7, w0, w3, w4, w6, k7) | |
240 | |
241 | /* Rounds 48 through 63 (two rounds at a time). */ | |
242 | ld1 {k0.2d, k1.2d, k2.2d, k3.2d}, [k512], #64 | |
243 | ld1 {k4.2d, k5.2d, k6.2d, k7.2d}, [k512], #64 | |
244 | |
245 | sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w0, w1, w4, w5, w7, k0) | |
246 | sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w1, w2, w5, w6, w0, k1) | |
247 | sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w2, w3, w6, w7, w1, k2) | |
248 | sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w3, w4, w7, w0, w2, k3) | |
249 | sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w4, w5, w0, w1, w3, k4) | |
250 | sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w5, w6, w1, w2, w4, k5) | |
251 | sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w6, w7, w2, w3, w5, k6) | |
252 | sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w7, w0, w3, w4, w6, k7) | |
253 | |
254 | /* Rounds 64 through 79 (two rounds at a time). */ | |
255 | ld1 {k0.2d, k1.2d, k2.2d, k3.2d}, [k512], #64 | |
256 | ld1 {k4.2d, k5.2d, k6.2d, k7.2d}, [k512], #64 | |
257 | |
258 | sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w0, w1, w4, w5, w7, k0) | |
259 | sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w1, w2, w5, w6, w0, k1) | |
260 | sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w2, w3, w6, w7, w1, k2) | |
261 | sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w3, w4, w7, w0, w2, k3) | |
262 | sha512_round_update(hs0, hs1, hs2, hs3, hs4, hs5, w4, w5, w0, w1, w3, k4) | |
263 | sha512_round_update(hs4, hs0, hs5, hs2, hs6, hs7, w5, w6, w1, w2, w4, k5) | |
264 | sha512_round_update(hs6, hs4, hs7, hs5, hs1, hs3, w6, w7, w2, w3, w5, k6) | |
265 | sha512_round_update(hs1, hs6, hs3, hs7, hs0, hs2, w7, w0, w3, w4, w6, k7) | |
266 | |
267 | /* Add intermediate state to hash state. */ | |
268 | add hc0.2d, hc0.2d, hs0.2d | |
269 | add hc1.2d, hc1.2d, hs1.2d | |
270 | add hc2.2d, hc2.2d, hs2.2d | |
271 | add hc3.2d, hc3.2d, hs3.2d | |
272 | |
/* One block done; keep looping while the block counter is non-zero. */
273 | sub num, num, #1 | |
274 | cbnz num, block_loop | |
275 | |
276 | /* Store hash state to context. */ | |
277 | st1 {hc0.2d, hc1.2d, hc2.2d, hc3.2d}, [ctx] | |
278 | |
/* Restores happen in the reverse order of the saves, popping 32 bytes each. */
279 | /* Restore low 64 bits of v8 through v15 from the stack. */ | |
280 | ld4 {v12.d, v13.d, v14.d, v15.d}[0], [sp], #32 | |
281 | ld4 {v8.d, v9.d, v10.d, v11.d}[0], [sp], #32 | |
282 | |
283 | ret | |
284 | |||
285 | /* | ||
286 | * SHA-512 constants - see FIPS 180-4 section 4.2.3. | ||
287 | */ | ||
/*
 * 80 64-bit round constants, four per row. sha512_block_ce consumes them in
 * five batches of 16 (128 bytes each) via post-indexed loads through k512.
 */
288 | .rodata | |
289 | .align 4 | |
290 | .type K512,@object | |
291 | K512: | |
292 | .quad 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc | |
293 | .quad 0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 | |
294 | .quad 0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 | |
295 | .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694 | |
296 | .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 | |
297 | .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 | |
298 | .quad 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4 | |
299 | .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70 | |
300 | .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df | |
301 | .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b | |
302 | .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30 | |
303 | .quad 0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8 | |
304 | .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 | |
305 | .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 | |
306 | .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec | |
307 | .quad 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b | |
308 | .quad 0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 | |
309 | .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b | |
310 | .quad 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c | |
311 | .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 | |
312 | .size K512,.-K512 | |