summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn/asm/x86_64-gcc.c
diff options
context:
space:
mode:
authorcvs2svn <admin@example.com>2015-08-02 21:54:22 +0000
committercvs2svn <admin@example.com>2015-08-02 21:54:22 +0000
commited3760bf4be4a96a89233fb8f8b84a0d44725862 (patch)
tree5609c82060f75c53af0a7641d9b33a88574876cd /src/lib/libcrypto/bn/asm/x86_64-gcc.c
parentf8b563fb5ba1524c821d37308f4e6abfc866bc3f (diff)
downloadopenbsd-OPENBSD_5_8_BASE.tar.gz
openbsd-OPENBSD_5_8_BASE.tar.bz2
openbsd-OPENBSD_5_8_BASE.zip
This commit was manufactured by cvs2git to create tag 'OPENBSD_5_8_BASE'.OPENBSD_5_8_BASE
Diffstat (limited to 'src/lib/libcrypto/bn/asm/x86_64-gcc.c')
-rw-r--r--src/lib/libcrypto/bn/asm/x86_64-gcc.c598
1 files changed, 0 insertions, 598 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
deleted file mode 100644
index 9deffa71f1..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ /dev/null
@@ -1,598 +0,0 @@
1/* $OpenBSD: x86_64-gcc.c,v 1.5 2015/02/25 15:39:49 bcook Exp $ */
2#include "../bn_lcl.h"
3#if !(defined(__GNUC__) && __GNUC__>=2)
4# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
5#else
6/*
7 * x86_64 BIGNUM accelerator version 0.1, December 2002.
8 *
9 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
10 * project.
11 *
12 * Rights for redistribution and usage in source and binary forms are
13 * granted according to the OpenSSL license. Warranty of any kind is
14 * disclaimed.
15 *
16 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
17 * versions, like 1.0...
18 * A. Well, that's because this code is basically a quick-n-dirty
19 * proof-of-concept hack. As you can see it's implemented with
20 * inline assembler, which means that you're bound to GCC and that
21 * there might be enough room for further improvement.
22 *
23 * Q. Why inline assembler?
24 * A. x86_64 features own ABI which I'm not familiar with. This is
25 * why I decided to let the compiler take care of subroutine
26 * prologue/epilogue as well as register allocation. For reference.
27 * Win64 implements different ABI for AMD64, different from Linux.
28 *
29 * Q. How much faster does it get?
30 * A. 'apps/openssl speed rsa dsa' output with no-asm:
31 *
32 * sign verify sign/s verify/s
33 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
34 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
35 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
36 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
37 * sign verify sign/s verify/s
38 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
39 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
40 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
41 *
42 * 'apps/openssl speed rsa dsa' output with this module:
43 *
44 * sign verify sign/s verify/s
45 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
46 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
47 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
48 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
49 * sign verify sign/s verify/s
50 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
51 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
52 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
53 *
54 * For the reference. IA-32 assembler implementation performs
55 * very much like 64-bit code compiled with no-asm on the same
56 * machine.
57 */
58
59#define BN_ULONG unsigned long
60
61#undef mul
62#undef mul_add
63#undef sqr
64
65/*
66 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
67 * "g"(0) let the compiler to decide where does it
68 * want to keep the value of zero;
69 */
/*
 * mul_add(r, a, word, carry): r += a * word + carry; "carry" receives
 * the resulting high word.  Implemented as three asm statements:
 *   1. MULQ: (high:low) = a * word (rdx:rax).
 *   2. carry += low; high += CF  — folds the incoming carry.
 *   3. r += carry; high += CF    — accumulates into the memory word r.
 * Finally carry = high.  The "m"(a)/"+m"(r) constraints are chosen to
 * favor DirectPath micro-code (see note above); r and carry must be
 * lvalues of type BN_ULONG.
 */
70#define mul_add(r,a,word,carry) do { \
71 BN_ULONG high,low; \
72 asm ("mulq %3" \
73 : "=a"(low),"=d"(high) \
74 : "a"(word),"m"(a) \
75 : "cc"); \
76 asm ("addq %2,%0; adcq %3,%1" \
77 : "+r"(carry),"+d"(high)\
78 : "a"(low),"g"(0) \
79 : "cc"); \
80 asm ("addq %2,%0; adcq %3,%1" \
81 : "+m"(r),"+d"(high) \
82 : "r"(carry),"g"(0) \
83 : "cc"); \
84 carry=high; \
85 } while (0)
86
/*
 * mul(r, a, word, carry): r = low64(a * word + carry); carry = high
 * word of the sum.  Same structure as mul_add but overwrites r instead
 * of accumulating into it, so only two asm statements are needed.
 */
87#define mul(r,a,word,carry) do { \
88 BN_ULONG high,low; \
89 asm ("mulq %3" \
90 : "=a"(low),"=d"(high) \
91 : "a"(word),"g"(a) \
92 : "cc"); \
93 asm ("addq %2,%0; adcq %3,%1" \
94 : "+r"(carry),"+d"(high)\
95 : "a"(low),"g"(0) \
96 : "cc"); \
97 (r)=carry, carry=high; \
98 } while (0)
99
100#define sqr(r0,r1,a) \
101 asm ("mulq %2" \
102 : "=a"(r0),"=d"(r1) \
103 : "a"(a) \
104 : "cc");
105
/*
 * bn_mul_add_words: rp[i] += ap[i] * w for i in [0, num); returns the
 * final carry word.  The main loop is unrolled four ways; the tail
 * handles the remaining 1-3 words.  num <= 0 returns 0 immediately.
 * The carry c1 is threaded through every mul_add, so statement order
 * is significant.
 */
106BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
107 {
108 BN_ULONG c1=0;
109
110 if (num <= 0) return(c1);
111
112 while (num&~3)
113 {
114 mul_add(rp[0],ap[0],w,c1);
115 mul_add(rp[1],ap[1],w,c1);
116 mul_add(rp[2],ap[2],w,c1);
117 mul_add(rp[3],ap[3],w,c1);
118 ap+=4; rp+=4; num-=4;
119 }
120 if (num)
121 {
122 mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
123 mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
124 mul_add(rp[2],ap[2],w,c1); return c1;
125 }
126
127 return(c1);
128 }
129
/*
 * bn_mul_words: rp[i] = low word of ap[i] * w plus the running carry;
 * returns the final carry word.  Same 4-way unrolled structure as
 * bn_mul_add_words, but rp[] is overwritten (mul) rather than
 * accumulated into (mul_add).  num <= 0 returns 0.
 */
130BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
131 {
132 BN_ULONG c1=0;
133
134 if (num <= 0) return(c1);
135
136 while (num&~3)
137 {
138 mul(rp[0],ap[0],w,c1);
139 mul(rp[1],ap[1],w,c1);
140 mul(rp[2],ap[2],w,c1);
141 mul(rp[3],ap[3],w,c1);
142 ap+=4; rp+=4; num-=4;
143 }
144 if (num)
145 {
146 mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
147 mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
148 mul(rp[2],ap[2],w,c1);
149 }
150 return(c1);
151 }
152
/*
 * bn_sqr_words: word-wise squaring — for each input word a[i], stores
 * the 128-bit square into the result pair r[2i] (low) and r[2i+1]
 * (high).  There is no carry between words; r must have room for 2*n
 * words.  The loop is unrolled four input words (eight output words)
 * per iteration.
 */
153void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
154 {
155 if (n <= 0) return;
156
157 while (n&~3)
158 {
159 sqr(r[0],r[1],a[0]);
160 sqr(r[2],r[3],a[1]);
161 sqr(r[4],r[5],a[2]);
162 sqr(r[6],r[7],a[3]);
163 a+=4; r+=8; n-=4;
164 }
165 if (n)
166 {
167 sqr(r[0],r[1],a[0]); if (--n == 0) return;
168 sqr(r[2],r[3],a[1]); if (--n == 0) return;
169 sqr(r[4],r[5],a[2]);
170 }
171 }
172
/*
 * bn_div_words: 128-by-64-bit division — returns the quotient of the
 * double word (h:l) divided by d, using DIVQ (rdx:rax / operand).
 * The remainder lands in rdx and is discarded ("waste").
 * NOTE(review): DIVQ raises #DE if d == 0 or if the quotient does not
 * fit in 64 bits (i.e. when h >= d) — callers must guarantee h < d.
 */
173BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
174{ BN_ULONG ret,waste;
175
176 asm ("divq %4"
177 : "=a"(ret),"=d"(waste)
178 : "a"(l),"d"(h),"g"(d)
179 : "cc");
180
181 return ret;
182}
183
/*
 * bn_add_words: rp = ap + bp over n words; returns the final carry
 * (0 or 1).  The asm loop keeps the word index in %2 and propagates
 * the CPU carry flag across iterations via ADCQ:
 *   - "subq %2,%2" zeroes the index AND clears CF before the loop;
 *   - "leaq 1(%2),%2" increments the index without touching flags
 *     (ADDQ/INCQ would clobber/preserve CF incorrectly);
 *   - LOOP decrements rcx (bound to n via "+c") without touching CF;
 *   - the final SBBQ materializes CF as all-ones/all-zeros in ret,
 *     masked to bit 0 on return.
 * n <= 0 returns 0 without entering the asm (LOOP would otherwise
 * iterate on a decremented counter).
 */
184BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
185{ BN_ULONG ret=0,i=0;
186
187 if (n <= 0) return 0;
188
189 asm (
190 " subq %2,%2 \n"
191 ".p2align 4 \n"
192 "1: movq (%4,%2,8),%0 \n"
193 " adcq (%5,%2,8),%0 \n"
194 " movq %0,(%3,%2,8) \n"
195 " leaq 1(%2),%2 \n"
196 " loop 1b \n"
197 " sbbq %0,%0 \n"
198 : "=&a"(ret),"+c"(n),"=&r"(i)
199 : "r"(rp),"r"(ap),"r"(bp)
200 : "cc"
201 );
202
203 return ret&1;
204}
205
206#ifndef SIMICS
/*
 * bn_sub_words: rp = ap - bp over n words; returns the final borrow
 * (0 or 1).  Mirror image of bn_add_words with SBBQ in place of ADCQ:
 * "subq %2,%2" clears the index and the carry/borrow flag, LEAQ
 * advances the index without disturbing CF, LOOP drives the count in
 * rcx, and the trailing SBBQ turns the final borrow into ret (masked
 * to bit 0).  n <= 0 returns 0 before entering the asm.
 */
207BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
208{ BN_ULONG ret=0,i=0;
209
210 if (n <= 0) return 0;
211
212 asm (
213 " subq %2,%2 \n"
214 ".p2align 4 \n"
215 "1: movq (%4,%2,8),%0 \n"
216 " sbbq (%5,%2,8),%0 \n"
217 " movq %0,(%3,%2,8) \n"
218 " leaq 1(%2),%2 \n"
219 " loop 1b \n"
220 " sbbq %0,%0 \n"
221 : "=&a"(ret),"+c"(n),"=&r"(i)
222 : "r"(rp),"r"(ap),"r"(bp)
223 : "cc"
224 );
225
226 return ret&1;
227}
228#else
229/* Simics 1.4<7 has buggy sbbq:-( */
230#define BN_MASK2 0xffffffffffffffffL
231BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
232 {
233 BN_ULONG t1,t2;
234 int c=0;
235
236 if (n <= 0) return((BN_ULONG)0);
237
238 for (;;)
239 {
240 t1=a[0]; t2=b[0];
241 r[0]=(t1-t2-c)&BN_MASK2;
242 if (t1 != t2) c=(t1 < t2);
243 if (--n <= 0) break;
244
245 t1=a[1]; t2=b[1];
246 r[1]=(t1-t2-c)&BN_MASK2;
247 if (t1 != t2) c=(t1 < t2);
248 if (--n <= 0) break;
249
250 t1=a[2]; t2=b[2];
251 r[2]=(t1-t2-c)&BN_MASK2;
252 if (t1 != t2) c=(t1 < t2);
253 if (--n <= 0) break;
254
255 t1=a[3]; t2=b[3];
256 r[3]=(t1-t2-c)&BN_MASK2;
257 if (t1 != t2) c=(t1 < t2);
258 if (--n <= 0) break;
259
260 a+=4;
261 b+=4;
262 r+=4;
263 }
264 return(c);
265 }
266#endif
267
268/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
269/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
270/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
271/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
272
273/*
274 * Keep in mind that a carry into the high part of a multiplication
275 * result cannot overflow, because the high part can never be all-ones.
276 */
277#if 0
278/* original macros are kept for reference purposes */
279#define mul_add_c(a,b,c0,c1,c2) do { \
280 BN_ULONG ta = (a), tb = (b); \
281 BN_ULONG lo, hi; \
282 BN_UMULT_LOHI(lo,hi,ta,tb); \
283 c0 += lo; hi += (c0<lo)?1:0; \
284 c1 += hi; c2 += (c1<hi)?1:0; \
285 } while(0)
286
287#define mul_add_c2(a,b,c0,c1,c2) do { \
288 BN_ULONG ta = (a), tb = (b); \
289 BN_ULONG lo, hi, tt; \
290 BN_UMULT_LOHI(lo,hi,ta,tb); \
291 c0 += lo; tt = hi+((c0<lo)?1:0); \
292 c1 += tt; c2 += (c1<tt)?1:0; \
293 c0 += lo; hi += (c0<lo)?1:0; \
294 c1 += hi; c2 += (c1<hi)?1:0; \
295 } while(0)
296
297#define sqr_add_c(a,i,c0,c1,c2) do { \
298 BN_ULONG ta = (a)[i]; \
299 BN_ULONG lo, hi; \
300 BN_UMULT_LOHI(lo,hi,ta,ta); \
301 c0 += lo; hi += (c0<lo)?1:0; \
302 c1 += hi; c2 += (c1<hi)?1:0; \
303 } while(0)
304#else
/*
 * mul_add_c(a, b, c0, c1, c2): (c2:c1:c0) += a * b.
 * MULQ forms the 128-bit product (t2:t1), then a single three-word
 * ADDQ/ADCQ/ADCQ chain folds it into the accumulator with full carry
 * propagation.
 */
305#define mul_add_c(a,b,c0,c1,c2) do { \
306 BN_ULONG t1,t2; \
307 asm ("mulq %3" \
308 : "=a"(t1),"=d"(t2) \
309 : "a"(a),"m"(b) \
310 : "cc"); \
311 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
312 : "+r"(c0),"+r"(c1),"+r"(c2) \
313 : "r"(t1),"r"(t2),"g"(0) \
314 : "cc"); \
315 } while (0)
316
/*
 * sqr_add_c(a, i, c0, c1, c2): (c2:c1:c0) += a[i]^2.
 * Same accumulation as mul_add_c, with MULQ squaring rax (a[i] is both
 * the implicit rax operand and %2).
 */
317#define sqr_add_c(a,i,c0,c1,c2) do { \
318 BN_ULONG t1,t2; \
319 asm ("mulq %2" \
320 : "=a"(t1),"=d"(t2) \
321 : "a"(a[i]) \
322 : "cc"); \
323 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
324 : "+r"(c0),"+r"(c1),"+r"(c2) \
325 : "r"(t1),"r"(t2),"g"(0) \
326 : "cc"); \
327 } while (0)
328
/*
 * mul_add_c2(a, b, c0, c1, c2): (c2:c1:c0) += 2 * a * b.
 * The product (t2:t1) is added into the three-word accumulator twice,
 * each time with a full ADDQ/ADCQ/ADCQ carry chain.  This is the
 * corrected form (cf. CVE-2014-3570): doubling by two independent
 * full-width additions cannot lose a carry, unlike shifting the
 * product before adding.
 */
329#define mul_add_c2(a,b,c0,c1,c2) do { \
330 BN_ULONG t1,t2; \
331 asm ("mulq %3" \
332 : "=a"(t1),"=d"(t2) \
333 : "a"(a),"m"(b) \
334 : "cc"); \
335 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
336 : "+r"(c0),"+r"(c1),"+r"(c2) \
337 : "r"(t1),"r"(t2),"g"(0) \
338 : "cc"); \
339 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
340 : "+r"(c0),"+r"(c1),"+r"(c2) \
341 : "r"(t1),"r"(t2),"g"(0) \
342 : "cc"); \
343 } while (0)
344#endif
345
/*
 * sqr_add_c2(a, i, j, c0, c1, c2): (c2:c1:c0) += 2 * a[i] * a[j] —
 * the doubled cross-term of a squaring, delegated to mul_add_c2.
 */
346#define sqr_add_c2(a,i,j,c0,c1,c2) \
347 mul_add_c2((a)[i],(a)[j],c0,c1,c2)
348
/*
 * bn_mul_comba8: comba (column-wise schoolbook) multiplication of two
 * 8-word numbers, r = a * b, producing 16 result words.  For each
 * output column k, every partial product a[i]*b[j] with i+j == k is
 * accumulated into a three-word accumulator; the roles of c1/c2/c3
 * rotate from column to column (low word is stored to r[k] and then
 * reset to 0 to become the next high-order limb), so the exact
 * statement order is significant and must not be rearranged.
 */
349void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
350 {
351 BN_ULONG c1,c2,c3;
352
353 c1=0;
354 c2=0;
355 c3=0;
356 mul_add_c(a[0],b[0],c1,c2,c3);
357 r[0]=c1;
358 c1=0;
359 mul_add_c(a[0],b[1],c2,c3,c1);
360 mul_add_c(a[1],b[0],c2,c3,c1);
361 r[1]=c2;
362 c2=0;
363 mul_add_c(a[2],b[0],c3,c1,c2);
364 mul_add_c(a[1],b[1],c3,c1,c2);
365 mul_add_c(a[0],b[2],c3,c1,c2);
366 r[2]=c3;
367 c3=0;
368 mul_add_c(a[0],b[3],c1,c2,c3);
369 mul_add_c(a[1],b[2],c1,c2,c3);
370 mul_add_c(a[2],b[1],c1,c2,c3);
371 mul_add_c(a[3],b[0],c1,c2,c3);
372 r[3]=c1;
373 c1=0;
374 mul_add_c(a[4],b[0],c2,c3,c1);
375 mul_add_c(a[3],b[1],c2,c3,c1);
376 mul_add_c(a[2],b[2],c2,c3,c1);
377 mul_add_c(a[1],b[3],c2,c3,c1);
378 mul_add_c(a[0],b[4],c2,c3,c1);
379 r[4]=c2;
380 c2=0;
381 mul_add_c(a[0],b[5],c3,c1,c2);
382 mul_add_c(a[1],b[4],c3,c1,c2);
383 mul_add_c(a[2],b[3],c3,c1,c2);
384 mul_add_c(a[3],b[2],c3,c1,c2);
385 mul_add_c(a[4],b[1],c3,c1,c2);
386 mul_add_c(a[5],b[0],c3,c1,c2);
387 r[5]=c3;
388 c3=0;
389 mul_add_c(a[6],b[0],c1,c2,c3);
390 mul_add_c(a[5],b[1],c1,c2,c3);
391 mul_add_c(a[4],b[2],c1,c2,c3);
392 mul_add_c(a[3],b[3],c1,c2,c3);
393 mul_add_c(a[2],b[4],c1,c2,c3);
394 mul_add_c(a[1],b[5],c1,c2,c3);
395 mul_add_c(a[0],b[6],c1,c2,c3);
396 r[6]=c1;
397 c1=0;
398 mul_add_c(a[0],b[7],c2,c3,c1);
399 mul_add_c(a[1],b[6],c2,c3,c1);
400 mul_add_c(a[2],b[5],c2,c3,c1);
401 mul_add_c(a[3],b[4],c2,c3,c1);
402 mul_add_c(a[4],b[3],c2,c3,c1);
403 mul_add_c(a[5],b[2],c2,c3,c1);
404 mul_add_c(a[6],b[1],c2,c3,c1);
405 mul_add_c(a[7],b[0],c2,c3,c1);
406 r[7]=c2;
407 c2=0;
408 mul_add_c(a[7],b[1],c3,c1,c2);
409 mul_add_c(a[6],b[2],c3,c1,c2);
410 mul_add_c(a[5],b[3],c3,c1,c2);
411 mul_add_c(a[4],b[4],c3,c1,c2);
412 mul_add_c(a[3],b[5],c3,c1,c2);
413 mul_add_c(a[2],b[6],c3,c1,c2);
414 mul_add_c(a[1],b[7],c3,c1,c2);
415 r[8]=c3;
416 c3=0;
417 mul_add_c(a[2],b[7],c1,c2,c3);
418 mul_add_c(a[3],b[6],c1,c2,c3);
419 mul_add_c(a[4],b[5],c1,c2,c3);
420 mul_add_c(a[5],b[4],c1,c2,c3);
421 mul_add_c(a[6],b[3],c1,c2,c3);
422 mul_add_c(a[7],b[2],c1,c2,c3);
423 r[9]=c1;
424 c1=0;
425 mul_add_c(a[7],b[3],c2,c3,c1);
426 mul_add_c(a[6],b[4],c2,c3,c1);
427 mul_add_c(a[5],b[5],c2,c3,c1);
428 mul_add_c(a[4],b[6],c2,c3,c1);
429 mul_add_c(a[3],b[7],c2,c3,c1);
430 r[10]=c2;
431 c2=0;
432 mul_add_c(a[4],b[7],c3,c1,c2);
433 mul_add_c(a[5],b[6],c3,c1,c2);
434 mul_add_c(a[6],b[5],c3,c1,c2);
435 mul_add_c(a[7],b[4],c3,c1,c2);
436 r[11]=c3;
437 c3=0;
438 mul_add_c(a[7],b[5],c1,c2,c3);
439 mul_add_c(a[6],b[6],c1,c2,c3);
440 mul_add_c(a[5],b[7],c1,c2,c3);
441 r[12]=c1;
442 c1=0;
443 mul_add_c(a[6],b[7],c2,c3,c1);
444 mul_add_c(a[7],b[6],c2,c3,c1);
445 r[13]=c2;
446 c2=0;
447 mul_add_c(a[7],b[7],c3,c1,c2);
448 r[14]=c3;
449 r[15]=c1;
450 }
451
/*
 * bn_mul_comba4: comba multiplication of two 4-word numbers,
 * r = a * b, producing 8 result words.  Same column-wise scheme as
 * bn_mul_comba8 with rotating c1/c2/c3 accumulator roles; statement
 * order is significant.
 */
452void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
453 {
454 BN_ULONG c1,c2,c3;
455
456 c1=0;
457 c2=0;
458 c3=0;
459 mul_add_c(a[0],b[0],c1,c2,c3);
460 r[0]=c1;
461 c1=0;
462 mul_add_c(a[0],b[1],c2,c3,c1);
463 mul_add_c(a[1],b[0],c2,c3,c1);
464 r[1]=c2;
465 c2=0;
466 mul_add_c(a[2],b[0],c3,c1,c2);
467 mul_add_c(a[1],b[1],c3,c1,c2);
468 mul_add_c(a[0],b[2],c3,c1,c2);
469 r[2]=c3;
470 c3=0;
471 mul_add_c(a[0],b[3],c1,c2,c3);
472 mul_add_c(a[1],b[2],c1,c2,c3);
473 mul_add_c(a[2],b[1],c1,c2,c3);
474 mul_add_c(a[3],b[0],c1,c2,c3);
475 r[3]=c1;
476 c1=0;
477 mul_add_c(a[3],b[1],c2,c3,c1);
478 mul_add_c(a[2],b[2],c2,c3,c1);
479 mul_add_c(a[1],b[3],c2,c3,c1);
480 r[4]=c2;
481 c2=0;
482 mul_add_c(a[2],b[3],c3,c1,c2);
483 mul_add_c(a[3],b[2],c3,c1,c2);
484 r[5]=c3;
485 c3=0;
486 mul_add_c(a[3],b[3],c1,c2,c3);
487 r[6]=c1;
488 r[7]=c2;
489 }
490
/*
 * bn_sqr_comba8: comba squaring of an 8-word number, r = a * a,
 * producing 16 result words.  Off-diagonal cross products a[i]*a[j]
 * (i != j) are added twice via sqr_add_c2; diagonal squares a[i]^2
 * are added once via sqr_add_c.  The c1/c2/c3 accumulator roles
 * rotate per column exactly as in bn_mul_comba8 — order matters.
 */
491void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
492 {
493 BN_ULONG c1,c2,c3;
494
495 c1=0;
496 c2=0;
497 c3=0;
498 sqr_add_c(a,0,c1,c2,c3);
499 r[0]=c1;
500 c1=0;
501 sqr_add_c2(a,1,0,c2,c3,c1);
502 r[1]=c2;
503 c2=0;
504 sqr_add_c(a,1,c3,c1,c2);
505 sqr_add_c2(a,2,0,c3,c1,c2);
506 r[2]=c3;
507 c3=0;
508 sqr_add_c2(a,3,0,c1,c2,c3);
509 sqr_add_c2(a,2,1,c1,c2,c3);
510 r[3]=c1;
511 c1=0;
512 sqr_add_c(a,2,c2,c3,c1);
513 sqr_add_c2(a,3,1,c2,c3,c1);
514 sqr_add_c2(a,4,0,c2,c3,c1);
515 r[4]=c2;
516 c2=0;
517 sqr_add_c2(a,5,0,c3,c1,c2);
518 sqr_add_c2(a,4,1,c3,c1,c2);
519 sqr_add_c2(a,3,2,c3,c1,c2);
520 r[5]=c3;
521 c3=0;
522 sqr_add_c(a,3,c1,c2,c3);
523 sqr_add_c2(a,4,2,c1,c2,c3);
524 sqr_add_c2(a,5,1,c1,c2,c3);
525 sqr_add_c2(a,6,0,c1,c2,c3);
526 r[6]=c1;
527 c1=0;
528 sqr_add_c2(a,7,0,c2,c3,c1);
529 sqr_add_c2(a,6,1,c2,c3,c1);
530 sqr_add_c2(a,5,2,c2,c3,c1);
531 sqr_add_c2(a,4,3,c2,c3,c1);
532 r[7]=c2;
533 c2=0;
534 sqr_add_c(a,4,c3,c1,c2);
535 sqr_add_c2(a,5,3,c3,c1,c2);
536 sqr_add_c2(a,6,2,c3,c1,c2);
537 sqr_add_c2(a,7,1,c3,c1,c2);
538 r[8]=c3;
539 c3=0;
540 sqr_add_c2(a,7,2,c1,c2,c3);
541 sqr_add_c2(a,6,3,c1,c2,c3);
542 sqr_add_c2(a,5,4,c1,c2,c3);
543 r[9]=c1;
544 c1=0;
545 sqr_add_c(a,5,c2,c3,c1);
546 sqr_add_c2(a,6,4,c2,c3,c1);
547 sqr_add_c2(a,7,3,c2,c3,c1);
548 r[10]=c2;
549 c2=0;
550 sqr_add_c2(a,7,4,c3,c1,c2);
551 sqr_add_c2(a,6,5,c3,c1,c2);
552 r[11]=c3;
553 c3=0;
554 sqr_add_c(a,6,c1,c2,c3);
555 sqr_add_c2(a,7,5,c1,c2,c3);
556 r[12]=c1;
557 c1=0;
558 sqr_add_c2(a,7,6,c2,c3,c1);
559 r[13]=c2;
560 c2=0;
561 sqr_add_c(a,7,c3,c1,c2);
562 r[14]=c3;
563 r[15]=c1;
564 }
565
/*
 * bn_sqr_comba4: comba squaring of a 4-word number, r = a * a,
 * producing 8 result words.  Cross products are doubled via
 * sqr_add_c2, diagonal squares added once via sqr_add_c; accumulator
 * roles rotate per column — do not reorder statements.
 */
566void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
567 {
568 BN_ULONG c1,c2,c3;
569
570 c1=0;
571 c2=0;
572 c3=0;
573 sqr_add_c(a,0,c1,c2,c3);
574 r[0]=c1;
575 c1=0;
576 sqr_add_c2(a,1,0,c2,c3,c1);
577 r[1]=c2;
578 c2=0;
579 sqr_add_c(a,1,c3,c1,c2);
580 sqr_add_c2(a,2,0,c3,c1,c2);
581 r[2]=c3;
582 c3=0;
583 sqr_add_c2(a,3,0,c1,c2,c3);
584 sqr_add_c2(a,2,1,c1,c2,c3);
585 r[3]=c1;
586 c1=0;
587 sqr_add_c(a,2,c2,c3,c1);
588 sqr_add_c2(a,3,1,c2,c3,c1);
589 r[4]=c2;
590 c2=0;
591 sqr_add_c2(a,3,2,c3,c1,c2);
592 r[5]=c3;
593 c3=0;
594 sqr_add_c(a,3,c1,c2,c3);
595 r[6]=c1;
596 r[7]=c2;
597 }
598#endif