diff options
author | jsing <> | 2023-06-12 16:42:11 +0000 |
---|---|---|
committer | jsing <> | 2023-06-12 16:42:11 +0000 |
commit | 38c39b6373ebdb2cab5dc7287cf23a31076e26bf (patch) | |
tree | 8beed90a7ab14d84c174b3a562626473cdcf930d | |
parent | 9555359b43b00ed20a16eba7b602909bc52f32b0 (diff) | |
download | openbsd-38c39b6373ebdb2cab5dc7287cf23a31076e26bf.tar.gz openbsd-38c39b6373ebdb2cab5dc7287cf23a31076e26bf.tar.bz2 openbsd-38c39b6373ebdb2cab5dc7287cf23a31076e26bf.zip |
Optimise quad word primitives on aarch64.
This provides a performance gain across most BN operations.
-rw-r--r-- | src/lib/libcrypto/bn/arch/aarch64/bn_arch.h | 137 |
1 files changed, 136 insertions, 1 deletions
diff --git a/src/lib/libcrypto/bn/arch/aarch64/bn_arch.h b/src/lib/libcrypto/bn/arch/aarch64/bn_arch.h index 708083aaf2..f658510c73 100644 --- a/src/lib/libcrypto/bn/arch/aarch64/bn_arch.h +++ b/src/lib/libcrypto/bn/arch/aarch64/bn_arch.h | |||
@@ -1,4 +1,4 @@ | |||
1 | /* $OpenBSD: bn_arch.h,v 1.9 2023/05/28 17:42:30 jsing Exp $ */ | 1 | /* $OpenBSD: bn_arch.h,v 1.10 2023/06/12 16:42:11 jsing Exp $ */ |
2 | /* | 2 | /* |
3 | * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> | 3 | * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> |
4 | * | 4 | * |
@@ -63,6 +63,35 @@ bn_addw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_r1, | |||
63 | *out_r0 = r0; | 63 | *out_r0 = r0; |
64 | } | 64 | } |
65 | 65 | ||
66 | #define HAVE_BN_QWADDQW | ||
67 | |||
/*
 * bn_qwaddqw() adds two four-word values together with an incoming carry:
 *
 *   (out_carry:r3:r2:r1:r0) = (a3:a2:a1:a0) + (b3:b2:b1:b0) + carry
 *
 * Any non-zero carry is treated as a carry in of one. The value stored via
 * out_carry is always 0 or 1.
 */
68 | static inline void | |
69 | bn_qwaddqw(BN_ULONG a3, BN_ULONG a2, BN_ULONG a1, BN_ULONG a0, BN_ULONG b3, | |
70 | BN_ULONG b2, BN_ULONG b1, BN_ULONG b0, BN_ULONG carry, BN_ULONG *out_carry, | |
71 | BN_ULONG *out_r3, BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0) | |
72 | { | |
73 | BN_ULONG r3, r2, r1, r0; | |
74 | | |
75 | __asm__ ( | |
/*
 * Seed the carry flag from the carry argument: adding -1 and discarding the
 * result into xzr sets C if and only if carry is non-zero.
 */
76 | "adds xzr, %[carry], #-1 \n" | |
/* Add word by word, propagating the carry flag from r0 up to r3. */
77 | "adcs %[r0], %[a0], %[b0] \n" | |
78 | "adcs %[r1], %[a1], %[b1] \n" | |
79 | "adcs %[r2], %[a2], %[b2] \n" | |
80 | "adcs %[r3], %[a3], %[b3] \n" | |
/* Turn the final carry flag back into a 0/1 value. */
81 | "cset %[carry], cs \n" | |
/*
 * The result registers are early-clobber since they are written before all
 * of the inputs have been consumed.
 */
82 | : [carry]"+r"(carry), [r3]"=&r"(r3), [r2]"=&r"(r2), | |
83 | [r1]"=&r"(r1), [r0]"=&r"(r0) | |
84 | : [a3]"r"(a3), [a2]"r"(a2), [a1]"r"(a1), [a0]"r"(a0), | |
85 | [b3]"r"(b3), [b2]"r"(b2), [b1]"r"(b1), [b0]"r"(b0) | |
86 | : "cc"); | |
87 | | |
88 | *out_carry = carry; | |
89 | *out_r3 = r3; | |
90 | *out_r2 = r2; | |
91 | *out_r1 = r1; | |
92 | *out_r0 = r0; | |
93 | } | |
94 | |||
66 | #define HAVE_BN_MULW | 95 | #define HAVE_BN_MULW |
67 | 96 | ||
68 | static inline void | 97 | static inline void |
@@ -148,6 +177,83 @@ bn_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0, | |||
148 | *out_r0 = r0; | 177 | *out_r0 = r0; |
149 | } | 178 | } |
150 | 179 | ||
180 | #define HAVE_BN_QWMULW_ADDW | ||
181 | |||
/*
 * bn_qwmulw_addw() multiplies a four-word value by a single word and adds a
 * single word, producing a five-word result:
 *
 *   (r4:r3:r2:r1:r0) = (a3:a2:a1:a0) * b + c
 */
182 | static inline void | |
183 | bn_qwmulw_addw(BN_ULONG a3, BN_ULONG a2, BN_ULONG a1, BN_ULONG a0, BN_ULONG b, | |
184 | BN_ULONG c, BN_ULONG *out_r4, BN_ULONG *out_r3, BN_ULONG *out_r2, | |
185 | BN_ULONG *out_r1, BN_ULONG *out_r0) | |
186 | { | |
187 | BN_ULONG r4, r3, r2, r1, r0; | |
188 | | |
189 | __asm__ ( | |
/*
 * For each word a[i], the high half of a[i] * b is placed directly in
 * r[i + 1], while the low half is accumulated into r[i] through the carry
 * chain. Neither umulh nor mul alters the flags, so the carry survives from
 * one adds/adcs to the next.
 */
190 | "umulh %[r1], %[a0], %[b] \n" | |
191 | "mul %[r0], %[a0], %[b] \n" | |
192 | "adds %[r0], %[r0], %[c] \n" | |
193 | "umulh %[r2], %[a1], %[b] \n" | |
/* From here on c has been consumed and is reused as a scratch register. */
194 | "mul %[c], %[a1], %[b] \n" | |
195 | "adcs %[r1], %[r1], %[c] \n" | |
196 | "umulh %[r3], %[a2], %[b] \n" | |
197 | "mul %[c], %[a2], %[b] \n" | |
198 | "adcs %[r2], %[r2], %[c] \n" | |
199 | "umulh %[r4], %[a3], %[b] \n" | |
200 | "mul %[c], %[a3], %[b] \n" | |
201 | "adcs %[r3], %[r3], %[c] \n" | |
/* Fold the last carry into the top word. */
202 | "adc %[r4], %[r4], xzr \n" | |
/*
 * c is read-write because it is overwritten above; the result registers are
 * early-clobber since they are written before all inputs have been consumed.
 */
203 | : [c]"+r"(c), [r4]"=&r"(r4), [r3]"=&r"(r3), [r2]"=&r"(r2), | |
204 | [r1]"=&r"(r1), [r0]"=&r"(r0) | |
205 | : [a3]"r"(a3), [a2]"r"(a2), [a1]"r"(a1), [a0]"r"(a0), [b]"r"(b) | |
206 | : "cc"); | |
207 | | |
208 | *out_r4 = r4; | |
209 | *out_r3 = r3; | |
210 | *out_r2 = r2; | |
211 | *out_r1 = r1; | |
212 | *out_r0 = r0; | |
213 | } | |
214 | |||
215 | #define HAVE_BN_QWMULW_ADDQW_ADDW | ||
216 | |||
/*
 * bn_qwmulw_addqw_addw() multiplies a four-word value by a single word, then
 * adds both a four-word value and a single word, producing a five-word
 * result:
 *
 *   (r4:r3:r2:r1:r0) = (a3:a2:a1:a0) * b + (c3:c2:c1:c0) + d
 *
 * With every input at its maximum the sum is exactly the largest five-word
 * value, so the result always fits in r4:r3:r2:r1:r0.
 */
217 | static inline void | |
218 | bn_qwmulw_addqw_addw(BN_ULONG a3, BN_ULONG a2, BN_ULONG a1, BN_ULONG a0, | |
219 | BN_ULONG b, BN_ULONG c3, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0, BN_ULONG d, | |
220 | BN_ULONG *out_r4, BN_ULONG *out_r3, BN_ULONG *out_r2, BN_ULONG *out_r1, | |
221 | BN_ULONG *out_r0) | |
222 | { | |
223 | BN_ULONG r4, r3, r2, r1, r0; | |
224 | | |
225 | __asm__ ( | |
/*
 * First pass: compute (a3:a2:a1:a0) * b + d. The high half of each product
 * goes directly into r[i + 1] and the low half is accumulated into r[i]
 * through the carry chain (umulh and mul leave the flags untouched).
 */
226 | "umulh %[r1], %[a0], %[b] \n" | |
227 | "mul %[r0], %[a0], %[b] \n" | |
228 | "adds %[r0], %[r0], %[d] \n" | |
229 | "umulh %[r2], %[a1], %[b] \n" | |
/* From here on d has been consumed and is reused as a scratch register. */
230 | "mul %[d], %[a1], %[b] \n" | |
231 | "adcs %[r1], %[r1], %[d] \n" | |
232 | "umulh %[r3], %[a2], %[b] \n" | |
233 | "mul %[d], %[a2], %[b] \n" | |
234 | "adcs %[r2], %[r2], %[d] \n" | |
235 | "umulh %[r4], %[a3], %[b] \n" | |
236 | "mul %[d], %[a3], %[b] \n" | |
237 | "adcs %[r3], %[r3], %[d] \n" | |
238 | "adc %[r4], %[r4], xzr \n" | |
/* Second pass: add (c3:c2:c1:c0) using a fresh carry chain. */
239 | "adds %[r0], %[r0], %[c0] \n" | |
240 | "adcs %[r1], %[r1], %[c1] \n" | |
241 | "adcs %[r2], %[r2], %[c2] \n" | |
242 | "adcs %[r3], %[r3], %[c3] \n" | |
243 | "adc %[r4], %[r4], xzr \n" | |
/*
 * d is read-write because it is overwritten above; the result registers are
 * early-clobber since they are written before all inputs have been consumed.
 */
244 | : [d]"+r"(d), [r4]"=&r"(r4), [r3]"=&r"(r3), [r2]"=&r"(r2), | |
245 | [r1]"=&r"(r1), [r0]"=&r"(r0) | |
246 | : [a3]"r"(a3), [a2]"r"(a2), [a1]"r"(a1), [a0]"r"(a0), [b]"r"(b), | |
247 | [c3]"r"(c3), [c2]"r"(c2), [c1]"r"(c1), [c0]"r"(c0) | |
248 | : "cc"); | |
249 | | |
250 | *out_r4 = r4; | |
251 | *out_r3 = r3; | |
252 | *out_r2 = r2; | |
253 | *out_r1 = r1; | |
254 | *out_r0 = r0; | |
255 | } | |
256 | |||
151 | #define HAVE_BN_SUBW | 257 | #define HAVE_BN_SUBW |
152 | 258 | ||
153 | static inline void | 259 | static inline void |
@@ -187,6 +293,35 @@ bn_subw_subw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_borrow, | |||
187 | *out_r0 = r0; | 293 | *out_r0 = r0; |
188 | } | 294 | } |
189 | 295 | ||
296 | #define HAVE_BN_QWSUBQW | ||
297 | |||
/*
 * bn_qwsubqw() subtracts one four-word value from another, together with an
 * incoming borrow:
 *
 *   (r3:r2:r1:r0) = (a3:a2:a1:a0) - (b3:b2:b1:b0) - borrow
 *
 * Any non-zero borrow is treated as a borrow in of one. The value stored via
 * out_borrow is 1 if the subtraction underflowed and 0 otherwise.
 */
298 | static inline void | |
299 | bn_qwsubqw(BN_ULONG a3, BN_ULONG a2, BN_ULONG a1, BN_ULONG a0, BN_ULONG b3, | |
300 | BN_ULONG b2, BN_ULONG b1, BN_ULONG b0, BN_ULONG borrow, BN_ULONG *out_borrow, | |
301 | BN_ULONG *out_r3, BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0) | |
302 | { | |
303 | BN_ULONG r3, r2, r1, r0; | |
304 | | |
305 | __asm__ ( | |
/*
 * Seed the carry flag from the borrow argument. On aarch64 a subtraction
 * clears C when it borrows, so computing 0 - borrow leaves C clear if and
 * only if borrow is non-zero.
 */
306 | "subs xzr, xzr, %[borrow] \n" | |
/* Subtract word by word; sbcs subtracts an extra one whenever C is clear. */
307 | "sbcs %[r0], %[a0], %[b0] \n" | |
308 | "sbcs %[r1], %[a1], %[b1] \n" | |
309 | "sbcs %[r2], %[a2], %[b2] \n" | |
310 | "sbcs %[r3], %[a3], %[b3] \n" | |
/* A clear carry flag after the last word means a borrow out. */
311 | "cset %[borrow], cc \n" | |
/*
 * The result registers are early-clobber since they are written before all
 * of the inputs have been consumed.
 */
312 | : [borrow]"+r"(borrow), [r3]"=&r"(r3), [r2]"=&r"(r2), | |
313 | [r1]"=&r"(r1), [r0]"=&r"(r0) | |
314 | : [a3]"r"(a3), [a2]"r"(a2), [a1]"r"(a1), [a0]"r"(a0), | |
315 | [b3]"r"(b3), [b2]"r"(b2), [b1]"r"(b1), [b0]"r"(b0) | |
316 | : "cc"); | |
317 | | |
318 | *out_borrow = borrow; | |
319 | *out_r3 = r3; | |
320 | *out_r2 = r2; | |
321 | *out_r1 = r1; | |
322 | *out_r0 = r0; | |
323 | } | |
324 | |||
190 | #endif /* __GNUC__ */ | 325 | #endif /* __GNUC__ */ |
191 | 326 | ||
192 | #endif | 327 | #endif |