diff options
author | jsing <> | 2023-02-17 05:13:34 +0000 |
---|---|---|
committer | jsing <> | 2023-02-17 05:13:34 +0000 |
commit | b7f1c098b1a50519f08f8112820c6dcf50a9f2f0 (patch) | |
tree | b83e94b45b8911aac35bd66431cc5c5c9cd89015 | |
parent | 78b6c67232479df5c8a2d1b39ea628f0341a4202 (diff) | |
download | openbsd-b7f1c098b1a50519f08f8112820c6dcf50a9f2f0.tar.gz openbsd-b7f1c098b1a50519f08f8112820c6dcf50a9f2f0.tar.bz2 openbsd-b7f1c098b1a50519f08f8112820c6dcf50a9f2f0.zip |
Reimplement bn_sqr_comba{4,8}().
Use bignum primitives rather than the current mess of macros.The sqr_add_c
macro gets replaced with bn_mulw_addtw(), while the sqr_add_c2 macro gets
replaced with bn_mul2_mulw_addtw().
The variables in the comba functions have also been reordered, so that the
patterns are easier to understand - the compiler can take care of
optimising the inputs and outputs to avoid register moves.
ok tb@
-rw-r--r-- | src/lib/libcrypto/bn/bn_internal.h | 30 | ||||
-rw-r--r-- | src/lib/libcrypto/bn/bn_sqr.c | 182 |
2 files changed, 110 insertions, 102 deletions
diff --git a/src/lib/libcrypto/bn/bn_internal.h b/src/lib/libcrypto/bn/bn_internal.h index acee2b4020..859f00d05d 100644 --- a/src/lib/libcrypto/bn/bn_internal.h +++ b/src/lib/libcrypto/bn/bn_internal.h | |||
@@ -1,4 +1,4 @@ | |||
1 | /* $OpenBSD: bn_internal.h,v 1.8 2023/02/16 10:58:06 jsing Exp $ */ | 1 | /* $OpenBSD: bn_internal.h,v 1.9 2023/02/17 05:13:34 jsing Exp $ */ |
2 | /* | 2 | /* |
3 | * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> | 3 | * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> |
4 | * | 4 | * |
@@ -361,4 +361,32 @@ bn_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0, | |||
361 | } | 361 | } |
362 | #endif | 362 | #endif |
363 | 363 | ||
364 | /* | ||
365 | * bn_mulw_addtw() computes (r2:r1:r0) = 2 * a * b + (c2:c1:c0), where a and b | ||
366 | * are single words and (c2:c1:c0) is a triple word, producing a triple word | ||
367 | * result. The caller must ensure that the inputs provided do not result in c2 | ||
368 | * overflowing. | ||
369 | */ | ||
370 | #ifndef HAVE_BN_MUL2_MULW_ADDTW | ||
371 | static inline void | ||
372 | bn_mul2_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0, | ||
373 | BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0) | ||
374 | { | ||
375 | BN_ULONG r2, r1, r0, x1, x0; | ||
376 | BN_ULONG carry; | ||
377 | |||
378 | bn_mulw(a, b, &x1, &x0); | ||
379 | bn_addw(c0, x0, &carry, &r0); | ||
380 | bn_addw(c1, x1 + carry, &r2, &r1); | ||
381 | bn_addw(c2, r2, &carry, &r2); | ||
382 | bn_addw(r0, x0, &carry, &r0); | ||
383 | bn_addw(r1, x1 + carry, &carry, &r1); | ||
384 | r2 += carry; | ||
385 | |||
386 | *out_r2 = r2; | ||
387 | *out_r1 = r1; | ||
388 | *out_r0 = r0; | ||
389 | } | ||
390 | #endif | ||
391 | |||
364 | #endif | 392 | #endif |
diff --git a/src/lib/libcrypto/bn/bn_sqr.c b/src/lib/libcrypto/bn/bn_sqr.c index f649b9bce8..6e784541bd 100644 --- a/src/lib/libcrypto/bn/bn_sqr.c +++ b/src/lib/libcrypto/bn/bn_sqr.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* $OpenBSD: bn_sqr.c,v 1.26 2023/02/16 10:41:03 jsing Exp $ */ | 1 | /* $OpenBSD: bn_sqr.c,v 1.27 2023/02/17 05:13:34 jsing Exp $ */ |
2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) | 2 | /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) |
3 | * All rights reserved. | 3 | * All rights reserved. |
4 | * | 4 | * |
@@ -66,117 +66,97 @@ | |||
66 | 66 | ||
67 | int bn_sqr(BIGNUM *r, const BIGNUM *a, int max, BN_CTX *ctx); | 67 | int bn_sqr(BIGNUM *r, const BIGNUM *a, int max, BN_CTX *ctx); |
68 | 68 | ||
69 | /* | ||
70 | * bn_sqr_comba4() computes r[] = a[] * a[] using Comba multiplication | ||
71 | * (https://everything2.com/title/Comba+multiplication), where a is a | ||
72 | * four word array, producing an eight word array result. | ||
73 | */ | ||
69 | #ifndef HAVE_BN_SQR_COMBA4 | 74 | #ifndef HAVE_BN_SQR_COMBA4 |
70 | void | 75 | void |
71 | bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) | 76 | bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) |
72 | { | 77 | { |
73 | BN_ULONG c1, c2, c3; | 78 | BN_ULONG c2, c1, c0; |
74 | 79 | ||
75 | c1 = 0; | 80 | bn_mulw_addtw(a[0], a[0], 0, 0, 0, &c2, &c1, &r[0]); |
76 | c2 = 0; | 81 | |
77 | c3 = 0; | 82 | bn_mul2_mulw_addtw(a[1], a[0], 0, c2, c1, &c2, &c1, &r[1]); |
78 | sqr_add_c(a, 0, c1, c2, c3); | 83 | |
79 | r[0] = c1; | 84 | bn_mulw_addtw(a[1], a[1], 0, c2, c1, &c2, &c1, &c0); |
80 | c1 = 0; | 85 | bn_mul2_mulw_addtw(a[2], a[0], c2, c1, c0, &c2, &c1, &r[2]); |
81 | sqr_add_c2(a, 1, 0, c2, c3, c1); | 86 | |
82 | r[1] = c2; | 87 | bn_mul2_mulw_addtw(a[3], a[0], 0, c2, c1, &c2, &c1, &c0); |
83 | c2 = 0; | 88 | bn_mul2_mulw_addtw(a[2], a[1], c2, c1, c0, &c2, &c1, &r[3]); |
84 | sqr_add_c(a, 1, c3, c1, c2); | 89 | |
85 | sqr_add_c2(a, 2, 0, c3, c1, c2); | 90 | bn_mulw_addtw(a[2], a[2], 0, c2, c1, &c2, &c1, &c0); |
86 | r[2] = c3; | 91 | bn_mul2_mulw_addtw(a[3], a[1], c2, c1, c0, &c2, &c1, &r[4]); |
87 | c3 = 0; | 92 | |
88 | sqr_add_c2(a, 3, 0, c1, c2, c3); | 93 | bn_mul2_mulw_addtw(a[3], a[2], 0, c2, c1, &c2, &c1, &r[5]); |
89 | sqr_add_c2(a, 2, 1, c1, c2, c3); | 94 | |
90 | r[3] = c1; | 95 | bn_mulw_addtw(a[3], a[3], 0, c2, c1, &c2, &r[7], &r[6]); |
91 | c1 = 0; | ||
92 | sqr_add_c(a, 2, c2, c3, c1); | ||
93 | sqr_add_c2(a, 3, 1, c2, c3, c1); | ||
94 | r[4] = c2; | ||
95 | c2 = 0; | ||
96 | sqr_add_c2(a, 3, 2, c3, c1, c2); | ||
97 | r[5] = c3; | ||
98 | c3 = 0; | ||
99 | sqr_add_c(a, 3, c1, c2, c3); | ||
100 | r[6] = c1; | ||
101 | r[7] = c2; | ||
102 | } | 96 | } |
103 | #endif | 97 | #endif |
104 | 98 | ||
99 | /* | ||
100 | * bn_sqr_comba8() computes r[] = a[] * a[] using Comba multiplication | ||
101 | * (https://everything2.com/title/Comba+multiplication), where a is an | ||
102 | * eight word array, producing an 16 word array result. | ||
103 | */ | ||
105 | #ifndef HAVE_BN_SQR_COMBA8 | 104 | #ifndef HAVE_BN_SQR_COMBA8 |
106 | void | 105 | void |
107 | bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) | 106 | bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) |
108 | { | 107 | { |
109 | BN_ULONG c1, c2, c3; | 108 | BN_ULONG c2, c1, c0; |
110 | 109 | ||
111 | c1 = 0; | 110 | bn_mulw_addtw(a[0], a[0], 0, 0, 0, &c2, &c1, &r[0]); |
112 | c2 = 0; | 111 | |
113 | c3 = 0; | 112 | bn_mul2_mulw_addtw(a[1], a[0], 0, c2, c1, &c2, &c1, &r[1]); |
114 | sqr_add_c(a, 0, c1, c2, c3); | 113 | |
115 | r[0] = c1; | 114 | bn_mulw_addtw(a[1], a[1], 0, c2, c1, &c2, &c1, &c0); |
116 | c1 = 0; | 115 | bn_mul2_mulw_addtw(a[2], a[0], c2, c1, c0, &c2, &c1, &r[2]); |
117 | sqr_add_c2(a, 1, 0, c2, c3, c1); | 116 | |
118 | r[1] = c2; | 117 | bn_mul2_mulw_addtw(a[3], a[0], 0, c2, c1, &c2, &c1, &c0); |
119 | c2 = 0; | 118 | bn_mul2_mulw_addtw(a[2], a[1], c2, c1, c0, &c2, &c1, &r[3]); |
120 | sqr_add_c(a, 1, c3, c1, c2); | 119 | |
121 | sqr_add_c2(a, 2, 0, c3, c1, c2); | 120 | bn_mulw_addtw(a[2], a[2], 0, c2, c1, &c2, &c1, &c0); |
122 | r[2] = c3; | 121 | bn_mul2_mulw_addtw(a[3], a[1], c2, c1, c0, &c2, &c1, &c0); |
123 | c3 = 0; | 122 | bn_mul2_mulw_addtw(a[4], a[0], c2, c1, c0, &c2, &c1, &r[4]); |
124 | sqr_add_c2(a, 3, 0, c1, c2, c3); | 123 | |
125 | sqr_add_c2(a, 2, 1, c1, c2, c3); | 124 | bn_mul2_mulw_addtw(a[5], a[0], 0, c2, c1, &c2, &c1, &c0); |
126 | r[3] = c1; | 125 | bn_mul2_mulw_addtw(a[4], a[1], c2, c1, c0, &c2, &c1, &c0); |
127 | c1 = 0; | 126 | bn_mul2_mulw_addtw(a[3], a[2], c2, c1, c0, &c2, &c1, &r[5]); |
128 | sqr_add_c(a, 2, c2, c3, c1); | 127 | |
129 | sqr_add_c2(a, 3, 1, c2, c3, c1); | 128 | bn_mulw_addtw(a[3], a[3], 0, c2, c1, &c2, &c1, &c0); |
130 | sqr_add_c2(a, 4, 0, c2, c3, c1); | 129 | bn_mul2_mulw_addtw(a[4], a[2], c2, c1, c0, &c2, &c1, &c0); |
131 | r[4] = c2; | 130 | bn_mul2_mulw_addtw(a[5], a[1], c2, c1, c0, &c2, &c1, &c0); |
132 | c2 = 0; | 131 | bn_mul2_mulw_addtw(a[6], a[0], c2, c1, c0, &c2, &c1, &r[6]); |
133 | sqr_add_c2(a, 5, 0, c3, c1, c2); | 132 | |
134 | sqr_add_c2(a, 4, 1, c3, c1, c2); | 133 | bn_mul2_mulw_addtw(a[7], a[0], 0, c2, c1, &c2, &c1, &c0); |
135 | sqr_add_c2(a, 3, 2, c3, c1, c2); | 134 | bn_mul2_mulw_addtw(a[6], a[1], c2, c1, c0, &c2, &c1, &c0); |
136 | r[5] = c3; | 135 | bn_mul2_mulw_addtw(a[5], a[2], c2, c1, c0, &c2, &c1, &c0); |
137 | c3 = 0; | 136 | bn_mul2_mulw_addtw(a[4], a[3], c2, c1, c0, &c2, &c1, &r[7]); |
138 | sqr_add_c(a, 3, c1, c2, c3); | 137 | |
139 | sqr_add_c2(a, 4, 2, c1, c2, c3); | 138 | bn_mulw_addtw(a[4], a[4], 0, c2, c1, &c2, &c1, &c0); |
140 | sqr_add_c2(a, 5, 1, c1, c2, c3); | 139 | bn_mul2_mulw_addtw(a[5], a[3], c2, c1, c0, &c2, &c1, &c0); |
141 | sqr_add_c2(a, 6, 0, c1, c2, c3); | 140 | bn_mul2_mulw_addtw(a[6], a[2], c2, c1, c0, &c2, &c1, &c0); |
142 | r[6] = c1; | 141 | bn_mul2_mulw_addtw(a[7], a[1], c2, c1, c0, &c2, &c1, &r[8]); |
143 | c1 = 0; | 142 | |
144 | sqr_add_c2(a, 7, 0, c2, c3, c1); | 143 | bn_mul2_mulw_addtw(a[7], a[2], 0, c2, c1, &c2, &c1, &c0); |
145 | sqr_add_c2(a, 6, 1, c2, c3, c1); | 144 | bn_mul2_mulw_addtw(a[6], a[3], c2, c1, c0, &c2, &c1, &c0); |
146 | sqr_add_c2(a, 5, 2, c2, c3, c1); | 145 | bn_mul2_mulw_addtw(a[5], a[4], c2, c1, c0, &c2, &c1, &r[9]); |
147 | sqr_add_c2(a, 4, 3, c2, c3, c1); | 146 | |
148 | r[7] = c2; | 147 | bn_mulw_addtw(a[5], a[5], 0, c2, c1, &c2, &c1, &c0); |
149 | c2 = 0; | 148 | bn_mul2_mulw_addtw(a[6], a[4], c2, c1, c0, &c2, &c1, &c0); |
150 | sqr_add_c(a, 4, c3, c1, c2); | 149 | bn_mul2_mulw_addtw(a[7], a[3], c2, c1, c0, &c2, &c1, &r[10]); |
151 | sqr_add_c2(a, 5, 3, c3, c1, c2); | 150 | |
152 | sqr_add_c2(a, 6, 2, c3, c1, c2); | 151 | bn_mul2_mulw_addtw(a[7], a[4], 0, c2, c1, &c2, &c1, &c0); |
153 | sqr_add_c2(a, 7, 1, c3, c1, c2); | 152 | bn_mul2_mulw_addtw(a[6], a[5], c2, c1, c0, &c2, &c1, &r[11]); |
154 | r[8] = c3; | 153 | |
155 | c3 = 0; | 154 | bn_mulw_addtw(a[6], a[6], 0, c2, c1, &c2, &c1, &c0); |
156 | sqr_add_c2(a, 7, 2, c1, c2, c3); | 155 | bn_mul2_mulw_addtw(a[7], a[5], c2, c1, c0, &c2, &c1, &r[12]); |
157 | sqr_add_c2(a, 6, 3, c1, c2, c3); | 156 | |
158 | sqr_add_c2(a, 5, 4, c1, c2, c3); | 157 | bn_mul2_mulw_addtw(a[7], a[6], 0, c2, c1, &c2, &c1, &r[13]); |
159 | r[9] = c1; | 158 | |
160 | c1 = 0; | 159 | bn_mulw_addtw(a[7], a[7], 0, c2, c1, &c2, &r[15], &r[14]); |
161 | sqr_add_c(a, 5, c2, c3, c1); | ||
162 | sqr_add_c2(a, 6, 4, c2, c3, c1); | ||
163 | sqr_add_c2(a, 7, 3, c2, c3, c1); | ||
164 | r[10] = c2; | ||
165 | c2 = 0; | ||
166 | sqr_add_c2(a, 7, 4, c3, c1, c2); | ||
167 | sqr_add_c2(a, 6, 5, c3, c1, c2); | ||
168 | r[11] = c3; | ||
169 | c3 = 0; | ||
170 | sqr_add_c(a, 6, c1, c2, c3); | ||
171 | sqr_add_c2(a, 7, 5, c1, c2, c3); | ||
172 | r[12] = c1; | ||
173 | c1 = 0; | ||
174 | sqr_add_c2(a, 7, 6, c2, c3, c1); | ||
175 | r[13] = c2; | ||
176 | c2 = 0; | ||
177 | sqr_add_c(a, 7, c3, c1, c2); | ||
178 | r[14] = c3; | ||
179 | r[15] = c1; | ||
180 | } | 160 | } |
181 | #endif | 161 | #endif |
182 | 162 | ||