diff options
author | jsing <> | 2023-02-11 12:15:02 +0000 |
---|---|---|
committer | jsing <> | 2023-02-11 12:15:02 +0000 |
commit | f3bce1e189bb4d596a7d1feae8b66b6c11c62761 (patch) | |
tree | 9c771840455a831a47e2a3c9c68970885a9a45c3 /src | |
parent | 67afc07de0ed3a0ccc272df42853ba565a8277c6 (diff) | |
download | openbsd-f3bce1e189bb4d596a7d1feae8b66b6c11c62761.tar.gz openbsd-f3bce1e189bb4d596a7d1feae8b66b6c11c62761.tar.bz2 openbsd-f3bce1e189bb4d596a7d1feae8b66b6c11c62761.zip |
Bye bye x86_64-gcc.c.
This is no longer used, since we're now using s2n-bignum functions instead.
Diffstat (limited to 'src')
-rw-r--r-- | src/lib/libcrypto/bn/asm/x86_64-gcc.c | 559 |
1 files changed, 0 insertions, 559 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c deleted file mode 100644 index c6d6239bc2..0000000000 --- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c +++ /dev/null | |||
@@ -1,559 +0,0 @@ | |||
1 | /* $OpenBSD: x86_64-gcc.c,v 1.8 2023/01/20 17:26:03 jsing Exp $ */ | ||
2 | #include "../bn_local.h" | ||
3 | /* | ||
4 | * x86_64 BIGNUM accelerator version 0.1, December 2002. | ||
5 | * | ||
6 | * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
7 | * project. | ||
8 | * | ||
9 | * Rights for redistribution and usage in source and binary forms are | ||
10 | * granted according to the OpenSSL license. Warranty of any kind is | ||
11 | * disclaimed. | ||
12 | * | ||
13 | * Q. Version 0.1? It doesn't sound like Andy, he used to assign real | ||
14 | * versions, like 1.0... | ||
15 | * A. Well, that's because this code is basically a quick-n-dirty | ||
16 | * proof-of-concept hack. As you can see it's implemented with | ||
17 | * inline assembler, which means that you're bound to GCC and that | ||
18 | * there might be enough room for further improvement. | ||
19 | * | ||
20 | * Q. Why inline assembler? | ||
21 | * A. x86_64 features its own ABI, which I'm not familiar with. This is | ||
22 | * why I decided to let the compiler take care of subroutine | ||
23 | * prologue/epilogue as well as register allocation. For reference: | ||
24 | * Win64 implements a different ABI for AMD64 than Linux does. | ||
25 | * | ||
26 | * Q. How much faster does it get? | ||
27 | * A. 'apps/openssl speed rsa dsa' output with no-asm: | ||
28 | * | ||
29 | * sign verify sign/s verify/s | ||
30 | * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 | ||
31 | * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 | ||
32 | * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 | ||
33 | * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 | ||
34 | * sign verify sign/s verify/s | ||
35 | * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 | ||
36 | * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 | ||
37 | * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 | ||
38 | * | ||
39 | * 'apps/openssl speed rsa dsa' output with this module: | ||
40 | * | ||
41 | * sign verify sign/s verify/s | ||
42 | * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 | ||
43 | * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 | ||
44 | * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 | ||
45 | * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 | ||
46 | * sign verify sign/s verify/s | ||
47 | * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 | ||
48 | * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 | ||
49 | * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 | ||
50 | * | ||
51 | * For reference, the IA-32 assembler implementation performs | ||
52 | * very much like 64-bit code compiled with no-asm on the same | ||
53 | * machine. | ||
54 | */ | ||
55 | |||
#define BN_ULONG unsigned long

#undef mul
#undef mul_add
#undef sqr

/*
 * Single-word multiply helpers built on "mulq", which multiplies %rax by
 * its operand and leaves the 128-bit product in %rdx:%rax.
 *
 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
 * "g"(0) lets the compiler decide where it wants to keep the value
 * of zero.  The zero is cast to BN_ULONG so that, should the compiler
 * ever place it in a register, a 64-bit register name is substituted
 * into the "adcq" instruction.
 *
 * The keyword is spelled __asm__ so that the macros are also accepted
 * when the compiler runs in a strict ISO C mode.
 *
 * mul_add() and mul() declare locals named "high" and "low"; do not pass
 * arguments with those names.
 */

/* mul_add(r,a,word,carry): (carry,r) = r + a*word + carry */
#define mul_add(r,a,word,carry) do {		\
	BN_ULONG high,low;			\
	__asm__ ("mulq %3"			\
		: "=a"(low),"=d"(high)		\
		: "a"(word),"m"(a)		\
		: "cc");			\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)	\
		: "a"(low),"g"((BN_ULONG)0)	\
		: "cc");			\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+m"(r),"+d"(high)		\
		: "r"(carry),"g"((BN_ULONG)0)	\
		: "cc");			\
	carry=high;				\
} while (0)

/*
 * mul(r,a,word,carry): (carry,r) = a*word + carry
 *
 * The multiplicand uses "rm" rather than "g": mulq has no immediate
 * form, so an immediate must never be selected for it.
 */
#define mul(r,a,word,carry) do {		\
	BN_ULONG high,low;			\
	__asm__ ("mulq %3"			\
		: "=a"(low),"=d"(high)		\
		: "a"(word),"rm"(a)		\
		: "cc");			\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)	\
		: "a"(low),"g"((BN_ULONG)0)	\
		: "cc");			\
	(r)=carry, carry=high;			\
} while (0)

/*
 * sqr(r0,r1,a): (r1,r0) = a*a
 *
 * Wrapped in do/while(0) like its siblings so that "sqr(...);" is a
 * single statement wherever it is used.
 */
#define sqr(r0,r1,a) do {			\
	__asm__ ("mulq %2"			\
		: "=a"(r0),"=d"(r1)		\
		: "a"(a)			\
		: "cc");			\
} while (0)
102 | |||
103 | BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | ||
104 | { | ||
105 | BN_ULONG c1=0; | ||
106 | |||
107 | if (num <= 0) return(c1); | ||
108 | |||
109 | while (num&~3) | ||
110 | { | ||
111 | mul_add(rp[0],ap[0],w,c1); | ||
112 | mul_add(rp[1],ap[1],w,c1); | ||
113 | mul_add(rp[2],ap[2],w,c1); | ||
114 | mul_add(rp[3],ap[3],w,c1); | ||
115 | ap+=4; rp+=4; num-=4; | ||
116 | } | ||
117 | if (num) | ||
118 | { | ||
119 | mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1; | ||
120 | mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1; | ||
121 | mul_add(rp[2],ap[2],w,c1); return c1; | ||
122 | } | ||
123 | |||
124 | return(c1); | ||
125 | } | ||
126 | |||
127 | BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) | ||
128 | { | ||
129 | BN_ULONG c1=0; | ||
130 | |||
131 | if (num <= 0) return(c1); | ||
132 | |||
133 | while (num&~3) | ||
134 | { | ||
135 | mul(rp[0],ap[0],w,c1); | ||
136 | mul(rp[1],ap[1],w,c1); | ||
137 | mul(rp[2],ap[2],w,c1); | ||
138 | mul(rp[3],ap[3],w,c1); | ||
139 | ap+=4; rp+=4; num-=4; | ||
140 | } | ||
141 | if (num) | ||
142 | { | ||
143 | mul(rp[0],ap[0],w,c1); if (--num == 0) return c1; | ||
144 | mul(rp[1],ap[1],w,c1); if (--num == 0) return c1; | ||
145 | mul(rp[2],ap[2],w,c1); | ||
146 | } | ||
147 | return(c1); | ||
148 | } | ||
149 | |||
150 | void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) | ||
151 | { | ||
152 | if (n <= 0) return; | ||
153 | |||
154 | while (n&~3) | ||
155 | { | ||
156 | sqr(r[0],r[1],a[0]); | ||
157 | sqr(r[2],r[3],a[1]); | ||
158 | sqr(r[4],r[5],a[2]); | ||
159 | sqr(r[6],r[7],a[3]); | ||
160 | a+=4; r+=8; n-=4; | ||
161 | } | ||
162 | if (n) | ||
163 | { | ||
164 | sqr(r[0],r[1],a[0]); if (--n == 0) return; | ||
165 | sqr(r[2],r[3],a[1]); if (--n == 0) return; | ||
166 | sqr(r[4],r[5],a[2]); | ||
167 | } | ||
168 | } | ||
169 | |||
/*
 * bn_div_words() - divide the double word (h,l) by d using "divq".
 *
 * Returns the quotient; the remainder that divq leaves in %rdx is
 * discarded.
 *
 * divq raises a divide error when d is zero or when the quotient does not
 * fit in one word (h >= d), so the caller must rule both cases out.
 *
 * The divisor uses "rm" rather than "g": divq has no immediate form, so an
 * immediate must never be selected for it (which "g" would permit if d
 * were ever known to be constant, e.g. after inlining).
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
	BN_ULONG ret, waste;

	__asm__ ("divq	%4"
		: "=a"(ret), "=d"(waste)
		: "a"(l), "d"(h), "rm"(d)
		: "cc");

	return ret;
}
180 | |||
/*
 * bn_add_words() - add two n-word vectors with carry propagation.
 *
 * Computes rp[i] = ap[i] + bp[i] + carry for i in [0, n), least
 * significant word first, and returns the carry out of the last word
 * (0 or 1).  Returns 0 without touching rp when n <= 0.
 *
 * Notes on the assembly:
 *  - "subq %2,%2" zeroes the index and clears the carry flag; leaq and
 *    loop are used for the loop bookkeeping because neither modifies the
 *    flags, so the carry survives from one adcq to the next.
 *  - "loop" counts with the full 64-bit %rcx, so the count is widened to
 *    a 64-bit local first instead of binding the 32-bit int n to "c",
 *    which would leave the upper half of %rcx unspecified.
 *  - The asm reads ap[]/bp[] and writes rp[] through bare pointers, so it
 *    is marked volatile and clobbers "memory" to keep the compiler from
 *    caching or reordering accesses to those arrays around it.
 */
BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
{
	BN_ULONG ret, i, cnt;

	if (n <= 0)
		return 0;

	cnt = (BN_ULONG)n;

	__asm__ volatile (
	"	subq	%2,%2		\n"
	".p2align 4			\n"
	"1:	movq	(%4,%2,8),%0	\n"
	"	adcq	(%5,%2,8),%0	\n"
	"	movq	%0,(%3,%2,8)	\n"
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
		: "=&a"(ret), "+c"(cnt), "=&r"(i)
		: "r"(rp), "r"(ap), "r"(bp)
		: "cc", "memory"
	);

	/* sbbq left all-ones in ret when the final carry was set. */
	return ret & 1;
}
202 | |||
/*
 * bn_sub_words() - subtract two n-word vectors with borrow propagation.
 *
 * Computes rp[i] = ap[i] - bp[i] - borrow for i in [0, n), least
 * significant word first, and returns the borrow out of the last word
 * (0 or 1).  Returns 0 without touching rp when n <= 0.
 *
 * Notes on the assembly:
 *  - "subq %2,%2" zeroes the index and clears the carry flag; leaq and
 *    loop are used for the loop bookkeeping because neither modifies the
 *    flags, so the borrow survives from one sbbq to the next.
 *  - "loop" counts with the full 64-bit %rcx, so the count is widened to
 *    a 64-bit local first instead of binding the 32-bit int n to "c",
 *    which would leave the upper half of %rcx unspecified.
 *  - The asm reads ap[]/bp[] and writes rp[] through bare pointers, so it
 *    is marked volatile and clobbers "memory" to keep the compiler from
 *    caching or reordering accesses to those arrays around it.
 */
BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
{
	BN_ULONG ret, i, cnt;

	if (n <= 0)
		return 0;

	cnt = (BN_ULONG)n;

	__asm__ volatile (
	"	subq	%2,%2		\n"
	".p2align 4			\n"
	"1:	movq	(%4,%2,8),%0	\n"
	"	sbbq	(%5,%2,8),%0	\n"
	"	movq	%0,(%3,%2,8)	\n"
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
		: "=&a"(ret), "+c"(cnt), "=&r"(i)
		: "r"(rp), "r"(ap), "r"(bp)
		: "cc", "memory"
	);

	/* sbbq left all-ones in ret when the final borrow was set. */
	return ret & 1;
}
224 | |||
/* mul_add_c(a,b,c0,c1,c2)    -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)   -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)    -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */

#undef mul_add_c
#undef mul_add_c2
#undef sqr_add_c
#undef sqr_add_c2

/*
 * Keep in mind that carrying into high part of multiplication result
 * can not overflow, because it cannot be all-ones.
 *
 * The assembler versions declare locals named "t1" and "t2"; do not pass
 * arguments with those names.  In mul_add_c() and mul_add_c2() the second
 * factor is a memory operand, so "b" must be an lvalue.  The zero added
 * into the top word is cast to BN_ULONG so that a 64-bit register name is
 * substituted should the compiler keep it in a register.
 */
#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) do {		\
	BN_ULONG ta = (a), tb = (b);		\
	BN_ULONG lo, hi;			\
	BN_UMULT_LOHI(lo,hi,ta,tb);		\
	c0 += lo; hi += (c0<lo)?1:0;		\
	c1 += hi; c2 += (c1<hi)?1:0;		\
} while(0)

#define mul_add_c2(a,b,c0,c1,c2) do {		\
	BN_ULONG ta = (a), tb = (b);		\
	BN_ULONG lo, hi, tt;			\
	BN_UMULT_LOHI(lo,hi,ta,tb);		\
	c0 += lo; tt = hi+((c0<lo)?1:0);	\
	c1 += tt; c2 += (c1<tt)?1:0;		\
	c0 += lo; hi += (c0<lo)?1:0;		\
	c1 += hi; c2 += (c1<hi)?1:0;		\
} while(0)

#define sqr_add_c(a,i,c0,c1,c2) do {		\
	BN_ULONG ta = (a)[i];			\
	BN_ULONG lo, hi;			\
	BN_UMULT_LOHI(lo,hi,ta,ta);		\
	c0 += lo; hi += (c0<lo)?1:0;		\
	c1 += hi; c2 += (c1<hi)?1:0;		\
} while(0)
#else
/* (c2,c1,c0) += a*b: one mulq, then a three-word add of the product. */
#define mul_add_c(a,b,c0,c1,c2) do {			\
	BN_ULONG t1,t2;					\
	__asm__ ("mulq %3"				\
		: "=a"(t1),"=d"(t2)			\
		: "a"(a),"m"(b)				\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"((BN_ULONG)0)	\
		: "cc");				\
} while (0)

/* (c2,c1,c0) += a[i]*a[i] */
#define sqr_add_c(a,i,c0,c1,c2) do {			\
	BN_ULONG t1,t2;					\
	__asm__ ("mulq %2"				\
		: "=a"(t1),"=d"(t2)			\
		: "a"((a)[i])				\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"((BN_ULONG)0)	\
		: "cc");				\
} while (0)

/* (c2,c1,c0) += 2*a*b: the product is computed once and added twice. */
#define mul_add_c2(a,b,c0,c1,c2) do {			\
	BN_ULONG t1,t2;					\
	__asm__ ("mulq %3"				\
		: "=a"(t1),"=d"(t2)			\
		: "a"(a),"m"(b)				\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"((BN_ULONG)0)	\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"((BN_ULONG)0)	\
		: "cc");				\
} while (0)
#endif

/* (c2,c1,c0) += 2*a[i]*a[j] */
#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
310 | |||
311 | void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
312 | { | ||
313 | BN_ULONG c1,c2,c3; | ||
314 | |||
315 | c1=0; | ||
316 | c2=0; | ||
317 | c3=0; | ||
318 | mul_add_c(a[0],b[0],c1,c2,c3); | ||
319 | r[0]=c1; | ||
320 | c1=0; | ||
321 | mul_add_c(a[0],b[1],c2,c3,c1); | ||
322 | mul_add_c(a[1],b[0],c2,c3,c1); | ||
323 | r[1]=c2; | ||
324 | c2=0; | ||
325 | mul_add_c(a[2],b[0],c3,c1,c2); | ||
326 | mul_add_c(a[1],b[1],c3,c1,c2); | ||
327 | mul_add_c(a[0],b[2],c3,c1,c2); | ||
328 | r[2]=c3; | ||
329 | c3=0; | ||
330 | mul_add_c(a[0],b[3],c1,c2,c3); | ||
331 | mul_add_c(a[1],b[2],c1,c2,c3); | ||
332 | mul_add_c(a[2],b[1],c1,c2,c3); | ||
333 | mul_add_c(a[3],b[0],c1,c2,c3); | ||
334 | r[3]=c1; | ||
335 | c1=0; | ||
336 | mul_add_c(a[4],b[0],c2,c3,c1); | ||
337 | mul_add_c(a[3],b[1],c2,c3,c1); | ||
338 | mul_add_c(a[2],b[2],c2,c3,c1); | ||
339 | mul_add_c(a[1],b[3],c2,c3,c1); | ||
340 | mul_add_c(a[0],b[4],c2,c3,c1); | ||
341 | r[4]=c2; | ||
342 | c2=0; | ||
343 | mul_add_c(a[0],b[5],c3,c1,c2); | ||
344 | mul_add_c(a[1],b[4],c3,c1,c2); | ||
345 | mul_add_c(a[2],b[3],c3,c1,c2); | ||
346 | mul_add_c(a[3],b[2],c3,c1,c2); | ||
347 | mul_add_c(a[4],b[1],c3,c1,c2); | ||
348 | mul_add_c(a[5],b[0],c3,c1,c2); | ||
349 | r[5]=c3; | ||
350 | c3=0; | ||
351 | mul_add_c(a[6],b[0],c1,c2,c3); | ||
352 | mul_add_c(a[5],b[1],c1,c2,c3); | ||
353 | mul_add_c(a[4],b[2],c1,c2,c3); | ||
354 | mul_add_c(a[3],b[3],c1,c2,c3); | ||
355 | mul_add_c(a[2],b[4],c1,c2,c3); | ||
356 | mul_add_c(a[1],b[5],c1,c2,c3); | ||
357 | mul_add_c(a[0],b[6],c1,c2,c3); | ||
358 | r[6]=c1; | ||
359 | c1=0; | ||
360 | mul_add_c(a[0],b[7],c2,c3,c1); | ||
361 | mul_add_c(a[1],b[6],c2,c3,c1); | ||
362 | mul_add_c(a[2],b[5],c2,c3,c1); | ||
363 | mul_add_c(a[3],b[4],c2,c3,c1); | ||
364 | mul_add_c(a[4],b[3],c2,c3,c1); | ||
365 | mul_add_c(a[5],b[2],c2,c3,c1); | ||
366 | mul_add_c(a[6],b[1],c2,c3,c1); | ||
367 | mul_add_c(a[7],b[0],c2,c3,c1); | ||
368 | r[7]=c2; | ||
369 | c2=0; | ||
370 | mul_add_c(a[7],b[1],c3,c1,c2); | ||
371 | mul_add_c(a[6],b[2],c3,c1,c2); | ||
372 | mul_add_c(a[5],b[3],c3,c1,c2); | ||
373 | mul_add_c(a[4],b[4],c3,c1,c2); | ||
374 | mul_add_c(a[3],b[5],c3,c1,c2); | ||
375 | mul_add_c(a[2],b[6],c3,c1,c2); | ||
376 | mul_add_c(a[1],b[7],c3,c1,c2); | ||
377 | r[8]=c3; | ||
378 | c3=0; | ||
379 | mul_add_c(a[2],b[7],c1,c2,c3); | ||
380 | mul_add_c(a[3],b[6],c1,c2,c3); | ||
381 | mul_add_c(a[4],b[5],c1,c2,c3); | ||
382 | mul_add_c(a[5],b[4],c1,c2,c3); | ||
383 | mul_add_c(a[6],b[3],c1,c2,c3); | ||
384 | mul_add_c(a[7],b[2],c1,c2,c3); | ||
385 | r[9]=c1; | ||
386 | c1=0; | ||
387 | mul_add_c(a[7],b[3],c2,c3,c1); | ||
388 | mul_add_c(a[6],b[4],c2,c3,c1); | ||
389 | mul_add_c(a[5],b[5],c2,c3,c1); | ||
390 | mul_add_c(a[4],b[6],c2,c3,c1); | ||
391 | mul_add_c(a[3],b[7],c2,c3,c1); | ||
392 | r[10]=c2; | ||
393 | c2=0; | ||
394 | mul_add_c(a[4],b[7],c3,c1,c2); | ||
395 | mul_add_c(a[5],b[6],c3,c1,c2); | ||
396 | mul_add_c(a[6],b[5],c3,c1,c2); | ||
397 | mul_add_c(a[7],b[4],c3,c1,c2); | ||
398 | r[11]=c3; | ||
399 | c3=0; | ||
400 | mul_add_c(a[7],b[5],c1,c2,c3); | ||
401 | mul_add_c(a[6],b[6],c1,c2,c3); | ||
402 | mul_add_c(a[5],b[7],c1,c2,c3); | ||
403 | r[12]=c1; | ||
404 | c1=0; | ||
405 | mul_add_c(a[6],b[7],c2,c3,c1); | ||
406 | mul_add_c(a[7],b[6],c2,c3,c1); | ||
407 | r[13]=c2; | ||
408 | c2=0; | ||
409 | mul_add_c(a[7],b[7],c3,c1,c2); | ||
410 | r[14]=c3; | ||
411 | r[15]=c1; | ||
412 | } | ||
413 | |||
414 | void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) | ||
415 | { | ||
416 | BN_ULONG c1,c2,c3; | ||
417 | |||
418 | c1=0; | ||
419 | c2=0; | ||
420 | c3=0; | ||
421 | mul_add_c(a[0],b[0],c1,c2,c3); | ||
422 | r[0]=c1; | ||
423 | c1=0; | ||
424 | mul_add_c(a[0],b[1],c2,c3,c1); | ||
425 | mul_add_c(a[1],b[0],c2,c3,c1); | ||
426 | r[1]=c2; | ||
427 | c2=0; | ||
428 | mul_add_c(a[2],b[0],c3,c1,c2); | ||
429 | mul_add_c(a[1],b[1],c3,c1,c2); | ||
430 | mul_add_c(a[0],b[2],c3,c1,c2); | ||
431 | r[2]=c3; | ||
432 | c3=0; | ||
433 | mul_add_c(a[0],b[3],c1,c2,c3); | ||
434 | mul_add_c(a[1],b[2],c1,c2,c3); | ||
435 | mul_add_c(a[2],b[1],c1,c2,c3); | ||
436 | mul_add_c(a[3],b[0],c1,c2,c3); | ||
437 | r[3]=c1; | ||
438 | c1=0; | ||
439 | mul_add_c(a[3],b[1],c2,c3,c1); | ||
440 | mul_add_c(a[2],b[2],c2,c3,c1); | ||
441 | mul_add_c(a[1],b[3],c2,c3,c1); | ||
442 | r[4]=c2; | ||
443 | c2=0; | ||
444 | mul_add_c(a[2],b[3],c3,c1,c2); | ||
445 | mul_add_c(a[3],b[2],c3,c1,c2); | ||
446 | r[5]=c3; | ||
447 | c3=0; | ||
448 | mul_add_c(a[3],b[3],c1,c2,c3); | ||
449 | r[6]=c1; | ||
450 | r[7]=c2; | ||
451 | } | ||
452 | |||
453 | void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) | ||
454 | { | ||
455 | BN_ULONG c1,c2,c3; | ||
456 | |||
457 | c1=0; | ||
458 | c2=0; | ||
459 | c3=0; | ||
460 | sqr_add_c(a,0,c1,c2,c3); | ||
461 | r[0]=c1; | ||
462 | c1=0; | ||
463 | sqr_add_c2(a,1,0,c2,c3,c1); | ||
464 | r[1]=c2; | ||
465 | c2=0; | ||
466 | sqr_add_c(a,1,c3,c1,c2); | ||
467 | sqr_add_c2(a,2,0,c3,c1,c2); | ||
468 | r[2]=c3; | ||
469 | c3=0; | ||
470 | sqr_add_c2(a,3,0,c1,c2,c3); | ||
471 | sqr_add_c2(a,2,1,c1,c2,c3); | ||
472 | r[3]=c1; | ||
473 | c1=0; | ||
474 | sqr_add_c(a,2,c2,c3,c1); | ||
475 | sqr_add_c2(a,3,1,c2,c3,c1); | ||
476 | sqr_add_c2(a,4,0,c2,c3,c1); | ||
477 | r[4]=c2; | ||
478 | c2=0; | ||
479 | sqr_add_c2(a,5,0,c3,c1,c2); | ||
480 | sqr_add_c2(a,4,1,c3,c1,c2); | ||
481 | sqr_add_c2(a,3,2,c3,c1,c2); | ||
482 | r[5]=c3; | ||
483 | c3=0; | ||
484 | sqr_add_c(a,3,c1,c2,c3); | ||
485 | sqr_add_c2(a,4,2,c1,c2,c3); | ||
486 | sqr_add_c2(a,5,1,c1,c2,c3); | ||
487 | sqr_add_c2(a,6,0,c1,c2,c3); | ||
488 | r[6]=c1; | ||
489 | c1=0; | ||
490 | sqr_add_c2(a,7,0,c2,c3,c1); | ||
491 | sqr_add_c2(a,6,1,c2,c3,c1); | ||
492 | sqr_add_c2(a,5,2,c2,c3,c1); | ||
493 | sqr_add_c2(a,4,3,c2,c3,c1); | ||
494 | r[7]=c2; | ||
495 | c2=0; | ||
496 | sqr_add_c(a,4,c3,c1,c2); | ||
497 | sqr_add_c2(a,5,3,c3,c1,c2); | ||
498 | sqr_add_c2(a,6,2,c3,c1,c2); | ||
499 | sqr_add_c2(a,7,1,c3,c1,c2); | ||
500 | r[8]=c3; | ||
501 | c3=0; | ||
502 | sqr_add_c2(a,7,2,c1,c2,c3); | ||
503 | sqr_add_c2(a,6,3,c1,c2,c3); | ||
504 | sqr_add_c2(a,5,4,c1,c2,c3); | ||
505 | r[9]=c1; | ||
506 | c1=0; | ||
507 | sqr_add_c(a,5,c2,c3,c1); | ||
508 | sqr_add_c2(a,6,4,c2,c3,c1); | ||
509 | sqr_add_c2(a,7,3,c2,c3,c1); | ||
510 | r[10]=c2; | ||
511 | c2=0; | ||
512 | sqr_add_c2(a,7,4,c3,c1,c2); | ||
513 | sqr_add_c2(a,6,5,c3,c1,c2); | ||
514 | r[11]=c3; | ||
515 | c3=0; | ||
516 | sqr_add_c(a,6,c1,c2,c3); | ||
517 | sqr_add_c2(a,7,5,c1,c2,c3); | ||
518 | r[12]=c1; | ||
519 | c1=0; | ||
520 | sqr_add_c2(a,7,6,c2,c3,c1); | ||
521 | r[13]=c2; | ||
522 | c2=0; | ||
523 | sqr_add_c(a,7,c3,c1,c2); | ||
524 | r[14]=c3; | ||
525 | r[15]=c1; | ||
526 | } | ||
527 | |||
528 | void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) | ||
529 | { | ||
530 | BN_ULONG c1,c2,c3; | ||
531 | |||
532 | c1=0; | ||
533 | c2=0; | ||
534 | c3=0; | ||
535 | sqr_add_c(a,0,c1,c2,c3); | ||
536 | r[0]=c1; | ||
537 | c1=0; | ||
538 | sqr_add_c2(a,1,0,c2,c3,c1); | ||
539 | r[1]=c2; | ||
540 | c2=0; | ||
541 | sqr_add_c(a,1,c3,c1,c2); | ||
542 | sqr_add_c2(a,2,0,c3,c1,c2); | ||
543 | r[2]=c3; | ||
544 | c3=0; | ||
545 | sqr_add_c2(a,3,0,c1,c2,c3); | ||
546 | sqr_add_c2(a,2,1,c1,c2,c3); | ||
547 | r[3]=c1; | ||
548 | c1=0; | ||
549 | sqr_add_c(a,2,c2,c3,c1); | ||
550 | sqr_add_c2(a,3,1,c2,c3,c1); | ||
551 | r[4]=c2; | ||
552 | c2=0; | ||
553 | sqr_add_c2(a,3,2,c3,c1,c2); | ||
554 | r[5]=c3; | ||
555 | c3=0; | ||
556 | sqr_add_c(a,3,c1,c2,c3); | ||
557 | r[6]=c1; | ||
558 | r[7]=c2; | ||
559 | } | ||