summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjsing <>2023-02-17 05:13:34 +0000
committerjsing <>2023-02-17 05:13:34 +0000
commitb7f1c098b1a50519f08f8112820c6dcf50a9f2f0 (patch)
treeb83e94b45b8911aac35bd66431cc5c5c9cd89015
parent78b6c67232479df5c8a2d1b39ea628f0341a4202 (diff)
downloadopenbsd-b7f1c098b1a50519f08f8112820c6dcf50a9f2f0.tar.gz
openbsd-b7f1c098b1a50519f08f8112820c6dcf50a9f2f0.tar.bz2
openbsd-b7f1c098b1a50519f08f8112820c6dcf50a9f2f0.zip
Reimplement bn_sqr_comba{4,8}().
Use bignum primitives rather than the current mess of macros.The sqr_add_c macro gets replaced with bn_mulw_addtw(), while the sqr_add_c2 macro gets replaced with bn_mul2_mulw_addtw(). The variables in the comba functions have also been reordered, so that the patterns are easier to understand - the compiler can take care of optimising the inputs and outputs to avoid register moves. ok tb@
-rw-r--r--src/lib/libcrypto/bn/bn_internal.h30
-rw-r--r--src/lib/libcrypto/bn/bn_sqr.c182
2 files changed, 110 insertions, 102 deletions
diff --git a/src/lib/libcrypto/bn/bn_internal.h b/src/lib/libcrypto/bn/bn_internal.h
index acee2b4020..859f00d05d 100644
--- a/src/lib/libcrypto/bn/bn_internal.h
+++ b/src/lib/libcrypto/bn/bn_internal.h
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_internal.h,v 1.8 2023/02/16 10:58:06 jsing Exp $ */ 1/* $OpenBSD: bn_internal.h,v 1.9 2023/02/17 05:13:34 jsing Exp $ */
2/* 2/*
3 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org> 3 * Copyright (c) 2023 Joel Sing <jsing@openbsd.org>
4 * 4 *
@@ -361,4 +361,32 @@ bn_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0,
361} 361}
362#endif 362#endif
363 363
364/*
365 * bn_mulw_addtw() computes (r2:r1:r0) = 2 * a * b + (c2:c1:c0), where a and b
366 * are single words and (c2:c1:c0) is a triple word, producing a triple word
367 * result. The caller must ensure that the inputs provided do not result in c2
368 * overflowing.
369 */
370#ifndef HAVE_BN_MUL2_MULW_ADDTW
371static inline void
372bn_mul2_mulw_addtw(BN_ULONG a, BN_ULONG b, BN_ULONG c2, BN_ULONG c1, BN_ULONG c0,
373 BN_ULONG *out_r2, BN_ULONG *out_r1, BN_ULONG *out_r0)
374{
375 BN_ULONG r2, r1, r0, x1, x0;
376 BN_ULONG carry;
377
378 bn_mulw(a, b, &x1, &x0);
379 bn_addw(c0, x0, &carry, &r0);
380 bn_addw(c1, x1 + carry, &r2, &r1);
381 bn_addw(c2, r2, &carry, &r2);
382 bn_addw(r0, x0, &carry, &r0);
383 bn_addw(r1, x1 + carry, &carry, &r1);
384 r2 += carry;
385
386 *out_r2 = r2;
387 *out_r1 = r1;
388 *out_r0 = r0;
389}
390#endif
391
364#endif 392#endif
diff --git a/src/lib/libcrypto/bn/bn_sqr.c b/src/lib/libcrypto/bn/bn_sqr.c
index f649b9bce8..6e784541bd 100644
--- a/src/lib/libcrypto/bn/bn_sqr.c
+++ b/src/lib/libcrypto/bn/bn_sqr.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: bn_sqr.c,v 1.26 2023/02/16 10:41:03 jsing Exp $ */ 1/* $OpenBSD: bn_sqr.c,v 1.27 2023/02/17 05:13:34 jsing Exp $ */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) 2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved. 3 * All rights reserved.
4 * 4 *
@@ -66,117 +66,97 @@
66 66
67int bn_sqr(BIGNUM *r, const BIGNUM *a, int max, BN_CTX *ctx); 67int bn_sqr(BIGNUM *r, const BIGNUM *a, int max, BN_CTX *ctx);
68 68
69/*
70 * bn_sqr_comba4() computes r[] = a[] * a[] using Comba multiplication
71 * (https://everything2.com/title/Comba+multiplication), where a is a
72 * four word array, producing an eight word array result.
73 */
69#ifndef HAVE_BN_SQR_COMBA4 74#ifndef HAVE_BN_SQR_COMBA4
70void 75void
71bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) 76bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
72{ 77{
73 BN_ULONG c1, c2, c3; 78 BN_ULONG c2, c1, c0;
74 79
75 c1 = 0; 80 bn_mulw_addtw(a[0], a[0], 0, 0, 0, &c2, &c1, &r[0]);
76 c2 = 0; 81
77 c3 = 0; 82 bn_mul2_mulw_addtw(a[1], a[0], 0, c2, c1, &c2, &c1, &r[1]);
78 sqr_add_c(a, 0, c1, c2, c3); 83
79 r[0] = c1; 84 bn_mulw_addtw(a[1], a[1], 0, c2, c1, &c2, &c1, &c0);
80 c1 = 0; 85 bn_mul2_mulw_addtw(a[2], a[0], c2, c1, c0, &c2, &c1, &r[2]);
81 sqr_add_c2(a, 1, 0, c2, c3, c1); 86
82 r[1] = c2; 87 bn_mul2_mulw_addtw(a[3], a[0], 0, c2, c1, &c2, &c1, &c0);
83 c2 = 0; 88 bn_mul2_mulw_addtw(a[2], a[1], c2, c1, c0, &c2, &c1, &r[3]);
84 sqr_add_c(a, 1, c3, c1, c2); 89
85 sqr_add_c2(a, 2, 0, c3, c1, c2); 90 bn_mulw_addtw(a[2], a[2], 0, c2, c1, &c2, &c1, &c0);
86 r[2] = c3; 91 bn_mul2_mulw_addtw(a[3], a[1], c2, c1, c0, &c2, &c1, &r[4]);
87 c3 = 0; 92
88 sqr_add_c2(a, 3, 0, c1, c2, c3); 93 bn_mul2_mulw_addtw(a[3], a[2], 0, c2, c1, &c2, &c1, &r[5]);
89 sqr_add_c2(a, 2, 1, c1, c2, c3); 94
90 r[3] = c1; 95 bn_mulw_addtw(a[3], a[3], 0, c2, c1, &c2, &r[7], &r[6]);
91 c1 = 0;
92 sqr_add_c(a, 2, c2, c3, c1);
93 sqr_add_c2(a, 3, 1, c2, c3, c1);
94 r[4] = c2;
95 c2 = 0;
96 sqr_add_c2(a, 3, 2, c3, c1, c2);
97 r[5] = c3;
98 c3 = 0;
99 sqr_add_c(a, 3, c1, c2, c3);
100 r[6] = c1;
101 r[7] = c2;
102} 96}
103#endif 97#endif
104 98
99/*
100 * bn_sqr_comba8() computes r[] = a[] * a[] using Comba multiplication
101 * (https://everything2.com/title/Comba+multiplication), where a is an
102 * eight word array, producing an 16 word array result.
103 */
105#ifndef HAVE_BN_SQR_COMBA8 104#ifndef HAVE_BN_SQR_COMBA8
106void 105void
107bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) 106bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
108{ 107{
109 BN_ULONG c1, c2, c3; 108 BN_ULONG c2, c1, c0;
110 109
111 c1 = 0; 110 bn_mulw_addtw(a[0], a[0], 0, 0, 0, &c2, &c1, &r[0]);
112 c2 = 0; 111
113 c3 = 0; 112 bn_mul2_mulw_addtw(a[1], a[0], 0, c2, c1, &c2, &c1, &r[1]);
114 sqr_add_c(a, 0, c1, c2, c3); 113
115 r[0] = c1; 114 bn_mulw_addtw(a[1], a[1], 0, c2, c1, &c2, &c1, &c0);
116 c1 = 0; 115 bn_mul2_mulw_addtw(a[2], a[0], c2, c1, c0, &c2, &c1, &r[2]);
117 sqr_add_c2(a, 1, 0, c2, c3, c1); 116
118 r[1] = c2; 117 bn_mul2_mulw_addtw(a[3], a[0], 0, c2, c1, &c2, &c1, &c0);
119 c2 = 0; 118 bn_mul2_mulw_addtw(a[2], a[1], c2, c1, c0, &c2, &c1, &r[3]);
120 sqr_add_c(a, 1, c3, c1, c2); 119
121 sqr_add_c2(a, 2, 0, c3, c1, c2); 120 bn_mulw_addtw(a[2], a[2], 0, c2, c1, &c2, &c1, &c0);
122 r[2] = c3; 121 bn_mul2_mulw_addtw(a[3], a[1], c2, c1, c0, &c2, &c1, &c0);
123 c3 = 0; 122 bn_mul2_mulw_addtw(a[4], a[0], c2, c1, c0, &c2, &c1, &r[4]);
124 sqr_add_c2(a, 3, 0, c1, c2, c3); 123
125 sqr_add_c2(a, 2, 1, c1, c2, c3); 124 bn_mul2_mulw_addtw(a[5], a[0], 0, c2, c1, &c2, &c1, &c0);
126 r[3] = c1; 125 bn_mul2_mulw_addtw(a[4], a[1], c2, c1, c0, &c2, &c1, &c0);
127 c1 = 0; 126 bn_mul2_mulw_addtw(a[3], a[2], c2, c1, c0, &c2, &c1, &r[5]);
128 sqr_add_c(a, 2, c2, c3, c1); 127
129 sqr_add_c2(a, 3, 1, c2, c3, c1); 128 bn_mulw_addtw(a[3], a[3], 0, c2, c1, &c2, &c1, &c0);
130 sqr_add_c2(a, 4, 0, c2, c3, c1); 129 bn_mul2_mulw_addtw(a[4], a[2], c2, c1, c0, &c2, &c1, &c0);
131 r[4] = c2; 130 bn_mul2_mulw_addtw(a[5], a[1], c2, c1, c0, &c2, &c1, &c0);
132 c2 = 0; 131 bn_mul2_mulw_addtw(a[6], a[0], c2, c1, c0, &c2, &c1, &r[6]);
133 sqr_add_c2(a, 5, 0, c3, c1, c2); 132
134 sqr_add_c2(a, 4, 1, c3, c1, c2); 133 bn_mul2_mulw_addtw(a[7], a[0], 0, c2, c1, &c2, &c1, &c0);
135 sqr_add_c2(a, 3, 2, c3, c1, c2); 134 bn_mul2_mulw_addtw(a[6], a[1], c2, c1, c0, &c2, &c1, &c0);
136 r[5] = c3; 135 bn_mul2_mulw_addtw(a[5], a[2], c2, c1, c0, &c2, &c1, &c0);
137 c3 = 0; 136 bn_mul2_mulw_addtw(a[4], a[3], c2, c1, c0, &c2, &c1, &r[7]);
138 sqr_add_c(a, 3, c1, c2, c3); 137
139 sqr_add_c2(a, 4, 2, c1, c2, c3); 138 bn_mulw_addtw(a[4], a[4], 0, c2, c1, &c2, &c1, &c0);
140 sqr_add_c2(a, 5, 1, c1, c2, c3); 139 bn_mul2_mulw_addtw(a[5], a[3], c2, c1, c0, &c2, &c1, &c0);
141 sqr_add_c2(a, 6, 0, c1, c2, c3); 140 bn_mul2_mulw_addtw(a[6], a[2], c2, c1, c0, &c2, &c1, &c0);
142 r[6] = c1; 141 bn_mul2_mulw_addtw(a[7], a[1], c2, c1, c0, &c2, &c1, &r[8]);
143 c1 = 0; 142
144 sqr_add_c2(a, 7, 0, c2, c3, c1); 143 bn_mul2_mulw_addtw(a[7], a[2], 0, c2, c1, &c2, &c1, &c0);
145 sqr_add_c2(a, 6, 1, c2, c3, c1); 144 bn_mul2_mulw_addtw(a[6], a[3], c2, c1, c0, &c2, &c1, &c0);
146 sqr_add_c2(a, 5, 2, c2, c3, c1); 145 bn_mul2_mulw_addtw(a[5], a[4], c2, c1, c0, &c2, &c1, &r[9]);
147 sqr_add_c2(a, 4, 3, c2, c3, c1); 146
148 r[7] = c2; 147 bn_mulw_addtw(a[5], a[5], 0, c2, c1, &c2, &c1, &c0);
149 c2 = 0; 148 bn_mul2_mulw_addtw(a[6], a[4], c2, c1, c0, &c2, &c1, &c0);
150 sqr_add_c(a, 4, c3, c1, c2); 149 bn_mul2_mulw_addtw(a[7], a[3], c2, c1, c0, &c2, &c1, &r[10]);
151 sqr_add_c2(a, 5, 3, c3, c1, c2); 150
152 sqr_add_c2(a, 6, 2, c3, c1, c2); 151 bn_mul2_mulw_addtw(a[7], a[4], 0, c2, c1, &c2, &c1, &c0);
153 sqr_add_c2(a, 7, 1, c3, c1, c2); 152 bn_mul2_mulw_addtw(a[6], a[5], c2, c1, c0, &c2, &c1, &r[11]);
154 r[8] = c3; 153
155 c3 = 0; 154 bn_mulw_addtw(a[6], a[6], 0, c2, c1, &c2, &c1, &c0);
156 sqr_add_c2(a, 7, 2, c1, c2, c3); 155 bn_mul2_mulw_addtw(a[7], a[5], c2, c1, c0, &c2, &c1, &r[12]);
157 sqr_add_c2(a, 6, 3, c1, c2, c3); 156
158 sqr_add_c2(a, 5, 4, c1, c2, c3); 157 bn_mul2_mulw_addtw(a[7], a[6], 0, c2, c1, &c2, &c1, &r[13]);
159 r[9] = c1; 158
160 c1 = 0; 159 bn_mulw_addtw(a[7], a[7], 0, c2, c1, &c2, &r[15], &r[14]);
161 sqr_add_c(a, 5, c2, c3, c1);
162 sqr_add_c2(a, 6, 4, c2, c3, c1);
163 sqr_add_c2(a, 7, 3, c2, c3, c1);
164 r[10] = c2;
165 c2 = 0;
166 sqr_add_c2(a, 7, 4, c3, c1, c2);
167 sqr_add_c2(a, 6, 5, c3, c1, c2);
168 r[11] = c3;
169 c3 = 0;
170 sqr_add_c(a, 6, c1, c2, c3);
171 sqr_add_c2(a, 7, 5, c1, c2, c3);
172 r[12] = c1;
173 c1 = 0;
174 sqr_add_c2(a, 7, 6, c2, c3, c1);
175 r[13] = c2;
176 c2 = 0;
177 sqr_add_c(a, 7, c3, c1, c2);
178 r[14] = c3;
179 r[15] = c1;
180} 160}
181#endif 161#endif
182 162