author    jsing <>    2023-06-25 11:42:26 +0000
committer jsing <>    2023-06-25 11:42:26 +0000
commit    ee2a1487217437d0cbc8d2cba036b6b755509997 (patch)
tree      2db3519d7bc9b4956b88feeca83e317d2f563b07 /src
parent    a2fd97ea81459f82bf6ddf4a963c48c495818f5d (diff)
Provide additional BN primitives for BN_ULLONG architectures.
On BN_ULLONG architectures, the C compiler can usually do a decent job of optimising primitives; however, it struggles to see through primitive calls due to type narrowing. As such, providing explicit versions of compound primitives can result in more efficient code. For example, on arm the bn_mulw_addw_addw() primitive can be replaced with a single umaal instruction, which provides significant performance gains.

Rather than intermingling #ifdef/#else throughout the header, the BN_ULLONG defines are pulled up above the normal functions. This also allows complex compound primitives to be reused. The conditionals have also been changed from BN_LLONG to BN_ULLONG, since that is what really matters.

ok tb@
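As an illustration of the umaal point above, here is a minimal sketch (not part of this commit) of how bn_mulw_addw_addw() could be expressed directly on 32-bit arm, assuming a GCC-style inline assembly extension; any real arch-specific implementation may differ:

static inline void
bn_mulw_addw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG d,
    BN_ULONG *out_r1, BN_ULONG *out_r0)
{
	BN_ULONG r1 = c, r0 = d;

	/* Illustrative only: umaal computes (r1:r0) = a * b + r1 + r0. */
	__asm__ ("umaal %[r0], %[r1], %[a], %[b]"
	    : [r0] "+r" (r0), [r1] "+r" (r1)
	    : [a] "r" (a), [b] "r" (b));

	*out_r1 = r1;
	*out_r0 = r0;
}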
Diffstat (limited to 'src')
-rw-r--r--  src/lib/libcrypto/bn/bn_internal.h | 100
1 file changed, 79 insertions(+), 21 deletions(-)
diff --git a/src/lib/libcrypto/bn/bn_internal.h b/src/lib/libcrypto/bn/bn_internal.h
index b712b736f6..fd04bc9f8a 100644
--- a/src/lib/libcrypto/bn/bn_internal.h
+++ b/src/lib/libcrypto/bn/bn_internal.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: bn_internal.h,v 1.14 2023/06/21 07:48:41 jsing Exp $ */
+/* $OpenBSD: bn_internal.h,v 1.15 2023/06/25 11:42:26 jsing Exp $ */
 /*
  * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
  *
@@ -80,12 +80,18 @@ bn_clzw(BN_ULONG w)
  */
 
 /*
- * bn_addw() computes (r1:r0) = a + b, where both inputs are single words,
- * producing a double word result. The value of r1 is the carry from the
- * addition.
+ * Default implementations for BN_ULLONG architectures.
+ *
+ * On these platforms the C compiler is generally better at optimising without
+ * the use of inline assembly primitives. However, it can be difficult for the
+ * compiler to see through primitives in order to combine operations, due to
+ * type changes/narrowing. For this reason compound primitives are usually
+ * explicitly provided.
  */
+#ifdef BN_ULLONG
+
 #ifndef HAVE_BN_ADDW
-#ifdef BN_LLONG
+#define HAVE_BN_ADDW
 static inline void
 bn_addw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
 {
@@ -96,8 +102,75 @@ bn_addw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
 	*out_r1 = r >> BN_BITS2;
 	*out_r0 = r & BN_MASK2;
 }
-#else
+#endif
+
+#ifndef HAVE_BN_ADDW_ADDW
+#define HAVE_BN_ADDW_ADDW
+static inline void
+bn_addw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_r1,
+    BN_ULONG *out_r0)
+{
+	BN_ULLONG r;
+
+	r = (BN_ULLONG)a + (BN_ULLONG)b + (BN_ULLONG)c;
+
+	*out_r1 = r >> BN_BITS2;
+	*out_r0 = r & BN_MASK2;
+}
+#endif
+
+#ifndef HAVE_BN_MULW
+#define HAVE_BN_MULW
+static inline void
+bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULLONG r;
+
+	r = (BN_ULLONG)a * (BN_ULLONG)b;
+
+	*out_r1 = r >> BN_BITS2;
+	*out_r0 = r & BN_MASK2;
+}
+#endif
+
+#ifndef HAVE_BN_MULW_ADDW
+#define HAVE_BN_MULW_ADDW
+static inline void
+bn_mulw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG *out_r1,
+    BN_ULONG *out_r0)
+{
+	BN_ULLONG r;
+
+	r = (BN_ULLONG)a * (BN_ULLONG)b + (BN_ULLONG)c;
+
+	*out_r1 = r >> BN_BITS2;
+	*out_r0 = r & BN_MASK2;
+}
+#endif
 
+#ifndef HAVE_BN_MULW_ADDW_ADDW
+#define HAVE_BN_MULW_ADDW_ADDW
+static inline void
+bn_mulw_addw_addw(BN_ULONG a, BN_ULONG b, BN_ULONG c, BN_ULONG d,
+    BN_ULONG *out_r1, BN_ULONG *out_r0)
+{
+	BN_ULLONG r;
+
+	r = (BN_ULLONG)a * (BN_ULLONG)b + (BN_ULLONG)c + (BN_ULLONG)d;
+
+	*out_r1 = r >> BN_BITS2;
+	*out_r0 = r & BN_MASK2;
+}
+#endif
+
+#endif /* !BN_ULLONG */
+
+/*
+ * bn_addw() computes (r1:r0) = a + b, where both inputs are single words,
+ * producing a double word result. The value of r1 is the carry from the
+ * addition.
+ */
+#ifndef HAVE_BN_ADDW
 static inline void
 bn_addw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
 {
@@ -112,7 +185,6 @@ bn_addw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
 	*out_r0 = r0;
 }
 #endif
-#endif
 
 /*
  * bn_addw_addw() computes (r1:r0) = a + b + c, where all inputs are single
@@ -230,19 +302,6 @@ bn_qwsubqw(BN_ULONG a3, BN_ULONG a2, BN_ULONG a1, BN_ULONG a0, BN_ULONG b3,
  * producing a double word result.
  */
 #ifndef HAVE_BN_MULW
-#ifdef BN_LLONG
-static inline void
-bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
-{
-	BN_ULLONG r;
-
-	r = (BN_ULLONG)a * (BN_ULLONG)b;
-
-	*out_r1 = r >> BN_BITS2;
-	*out_r0 = r & BN_MASK2;
-}
-
-#else /* !BN_LLONG */
 /*
  * Multiply two words (a * b) producing a double word result (h:l).
  *
@@ -339,7 +398,6 @@ bn_mulw(BN_ULONG a, BN_ULONG b, BN_ULONG *out_r1, BN_ULONG *out_r0)
 	*out_r0 = (acc1 << BN_BITS4) | acc0;
 }
 #endif
-#endif /* !BN_LLONG */
 #endif
 
 #ifndef HAVE_BN_MULW_LO
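To see why the compound primitive matters, consider a hypothetical caller (not part of this diff): the inner loop of a schoolbook multiplication, which repeatedly computes (carry:r[i]) = a[i] * w + r[i] + carry. With bn_mulw_addw_addw() each step is a single call that the compiler, or an arch-specific version such as the umaal form, can lower to one multiply-accumulate. The function name below is illustrative only:

static BN_ULONG
bn_mul_add_words_sketch(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG w)
{
	BN_ULONG carry = 0;
	int i;

	/* Each iteration: (carry:r[i]) = a[i] * w + r[i] + carry. */
	for (i = 0; i < n; i++)
		bn_mulw_addw_addw(a[i], w, r[i], carry, &carry, &r[i]);

	return carry;
}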