summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn/asm/x86_64-gcc.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/bn/asm/x86_64-gcc.c')
-rw-r--r--src/lib/libcrypto/bn/asm/x86_64-gcc.c593
1 files changed, 0 insertions, 593 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
deleted file mode 100644
index 7378344251..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ /dev/null
@@ -1,593 +0,0 @@
1/*
2 * x86_64 BIGNUM accelerator version 0.1, December 2002.
3 *
4 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 * project.
6 *
7 * Rights for redistribution and usage in source and binary forms are
8 * granted according to the OpenSSL license. Warranty of any kind is
9 * disclaimed.
10 *
11 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
12 * versions, like 1.0...
13 * A. Well, that's because this code is basically a quick-n-dirty
14 * proof-of-concept hack. As you can see it's implemented with
15 * inline assembler, which means that you're bound to GCC and that
16 * there might be enough room for further improvement.
17 *
18 * Q. Why inline assembler?
19 * A. x86_64 features own ABI which I'm not familiar with. This is
20 * why I decided to let the compiler take care of subroutine
 21 * prologue/epilogue as well as register allocation. For reference,
 22 * Win64 implements a different ABI for AMD64 than Linux does.
23 *
24 * Q. How much faster does it get?
25 * A. 'apps/openssl speed rsa dsa' output with no-asm:
26 *
27 * sign verify sign/s verify/s
28 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
29 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
30 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
31 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
32 * sign verify sign/s verify/s
33 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
34 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
35 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
36 *
37 * 'apps/openssl speed rsa dsa' output with this module:
38 *
39 * sign verify sign/s verify/s
40 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
41 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
42 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
43 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
44 * sign verify sign/s verify/s
45 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
46 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
47 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
48 *
49 * For the reference. IA-32 assembler implementation performs
50 * very much like 64-bit code compiled with no-asm on the same
51 * machine.
52 */
53
54#define BN_ULONG unsigned long
55
56/*
57 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
58 * "g"(0) let the compiler to decide where does it
59 * want to keep the value of zero;
60 */
61#define mul_add(r,a,word,carry) do { \
62 register BN_ULONG high,low; \
63 asm ("mulq %3" \
64 : "=a"(low),"=d"(high) \
65 : "a"(word),"m"(a) \
66 : "cc"); \
67 asm ("addq %2,%0; adcq %3,%1" \
68 : "+r"(carry),"+d"(high)\
69 : "a"(low),"g"(0) \
70 : "cc"); \
71 asm ("addq %2,%0; adcq %3,%1" \
72 : "+m"(r),"+d"(high) \
73 : "r"(carry),"g"(0) \
74 : "cc"); \
75 carry=high; \
76 } while (0)
77
78#define mul(r,a,word,carry) do { \
79 register BN_ULONG high,low; \
80 asm ("mulq %3" \
81 : "=a"(low),"=d"(high) \
82 : "a"(word),"g"(a) \
83 : "cc"); \
84 asm ("addq %2,%0; adcq %3,%1" \
85 : "+r"(carry),"+d"(high)\
86 : "a"(low),"g"(0) \
87 : "cc"); \
88 (r)=carry, carry=high; \
89 } while (0)
90
91#define sqr(r0,r1,a) \
92 asm ("mulq %2" \
93 : "=a"(r0),"=d"(r1) \
94 : "a"(a) \
95 : "cc");
96
97BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
98 {
99 BN_ULONG c1=0;
100
101 if (num <= 0) return(c1);
102
103 while (num&~3)
104 {
105 mul_add(rp[0],ap[0],w,c1);
106 mul_add(rp[1],ap[1],w,c1);
107 mul_add(rp[2],ap[2],w,c1);
108 mul_add(rp[3],ap[3],w,c1);
109 ap+=4; rp+=4; num-=4;
110 }
111 if (num)
112 {
113 mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
114 mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
115 mul_add(rp[2],ap[2],w,c1); return c1;
116 }
117
118 return(c1);
119 }
120
121BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
122 {
123 BN_ULONG c1=0;
124
125 if (num <= 0) return(c1);
126
127 while (num&~3)
128 {
129 mul(rp[0],ap[0],w,c1);
130 mul(rp[1],ap[1],w,c1);
131 mul(rp[2],ap[2],w,c1);
132 mul(rp[3],ap[3],w,c1);
133 ap+=4; rp+=4; num-=4;
134 }
135 if (num)
136 {
137 mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
138 mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
139 mul(rp[2],ap[2],w,c1);
140 }
141 return(c1);
142 }
143
144void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
145 {
146 if (n <= 0) return;
147
148 while (n&~3)
149 {
150 sqr(r[0],r[1],a[0]);
151 sqr(r[2],r[3],a[1]);
152 sqr(r[4],r[5],a[2]);
153 sqr(r[6],r[7],a[3]);
154 a+=4; r+=8; n-=4;
155 }
156 if (n)
157 {
158 sqr(r[0],r[1],a[0]); if (--n == 0) return;
159 sqr(r[2],r[3],a[1]); if (--n == 0) return;
160 sqr(r[4],r[5],a[2]);
161 }
162 }
163
164BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
165{ BN_ULONG ret,waste;
166
167 asm ("divq %4"
168 : "=a"(ret),"=d"(waste)
169 : "a"(l),"d"(h),"g"(d)
170 : "cc");
171
172 return ret;
173}
174
175BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
176{ BN_ULONG ret=0,i=0;
177
178 if (n <= 0) return 0;
179
180 asm (
181 " subq %2,%2 \n"
182 ".align 16 \n"
183 "1: movq (%4,%2,8),%0 \n"
184 " adcq (%5,%2,8),%0 \n"
185 " movq %0,(%3,%2,8) \n"
186 " leaq 1(%2),%2 \n"
187 " loop 1b \n"
188 " sbbq %0,%0 \n"
189 : "=&a"(ret),"+c"(n),"=&r"(i)
190 : "r"(rp),"r"(ap),"r"(bp)
191 : "cc"
192 );
193
194 return ret&1;
195}
196
197#ifndef SIMICS
198BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
199{ BN_ULONG ret=0,i=0;
200
201 if (n <= 0) return 0;
202
203 asm (
204 " subq %2,%2 \n"
205 ".align 16 \n"
206 "1: movq (%4,%2,8),%0 \n"
207 " sbbq (%5,%2,8),%0 \n"
208 " movq %0,(%3,%2,8) \n"
209 " leaq 1(%2),%2 \n"
210 " loop 1b \n"
211 " sbbq %0,%0 \n"
212 : "=&a"(ret),"+c"(n),"=&r"(i)
213 : "r"(rp),"r"(ap),"r"(bp)
214 : "cc"
215 );
216
217 return ret&1;
218}
219#else
220/* Simics 1.4<7 has buggy sbbq:-( */
221#define BN_MASK2 0xffffffffffffffffL
222BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
223 {
224 BN_ULONG t1,t2;
225 int c=0;
226
227 if (n <= 0) return((BN_ULONG)0);
228
229 for (;;)
230 {
231 t1=a[0]; t2=b[0];
232 r[0]=(t1-t2-c)&BN_MASK2;
233 if (t1 != t2) c=(t1 < t2);
234 if (--n <= 0) break;
235
236 t1=a[1]; t2=b[1];
237 r[1]=(t1-t2-c)&BN_MASK2;
238 if (t1 != t2) c=(t1 < t2);
239 if (--n <= 0) break;
240
241 t1=a[2]; t2=b[2];
242 r[2]=(t1-t2-c)&BN_MASK2;
243 if (t1 != t2) c=(t1 < t2);
244 if (--n <= 0) break;
245
246 t1=a[3]; t2=b[3];
247 r[3]=(t1-t2-c)&BN_MASK2;
248 if (t1 != t2) c=(t1 < t2);
249 if (--n <= 0) break;
250
251 a+=4;
252 b+=4;
253 r+=4;
254 }
255 return(c);
256 }
257#endif
258
259/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
260/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
261/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
262/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
263
264#if 0
265/* original macros are kept for reference purposes */
266#define mul_add_c(a,b,c0,c1,c2) { \
267 BN_ULONG ta=(a),tb=(b); \
268 t1 = ta * tb; \
269 t2 = BN_UMULT_HIGH(ta,tb); \
270 c0 += t1; t2 += (c0<t1)?1:0; \
271 c1 += t2; c2 += (c1<t2)?1:0; \
272 }
273
274#define mul_add_c2(a,b,c0,c1,c2) { \
275 BN_ULONG ta=(a),tb=(b),t0; \
276 t1 = BN_UMULT_HIGH(ta,tb); \
277 t0 = ta * tb; \
278 t2 = t1+t1; c2 += (t2<t1)?1:0; \
279 t1 = t0+t0; t2 += (t1<t0)?1:0; \
280 c0 += t1; t2 += (c0<t1)?1:0; \
281 c1 += t2; c2 += (c1<t2)?1:0; \
282 }
283#else
284#define mul_add_c(a,b,c0,c1,c2) do { \
285 asm ("mulq %3" \
286 : "=a"(t1),"=d"(t2) \
287 : "a"(a),"m"(b) \
288 : "cc"); \
289 asm ("addq %2,%0; adcq %3,%1" \
290 : "+r"(c0),"+d"(t2) \
291 : "a"(t1),"g"(0) \
292 : "cc"); \
293 asm ("addq %2,%0; adcq %3,%1" \
294 : "+r"(c1),"+r"(c2) \
295 : "d"(t2),"g"(0) \
296 : "cc"); \
297 } while (0)
298
299#define sqr_add_c(a,i,c0,c1,c2) do { \
300 asm ("mulq %2" \
301 : "=a"(t1),"=d"(t2) \
302 : "a"(a[i]) \
303 : "cc"); \
304 asm ("addq %2,%0; adcq %3,%1" \
305 : "+r"(c0),"+d"(t2) \
306 : "a"(t1),"g"(0) \
307 : "cc"); \
308 asm ("addq %2,%0; adcq %3,%1" \
309 : "+r"(c1),"+r"(c2) \
310 : "d"(t2),"g"(0) \
311 : "cc"); \
312 } while (0)
313
314#define mul_add_c2(a,b,c0,c1,c2) do { \
315 asm ("mulq %3" \
316 : "=a"(t1),"=d"(t2) \
317 : "a"(a),"m"(b) \
318 : "cc"); \
319 asm ("addq %0,%0; adcq %2,%1" \
320 : "+d"(t2),"+r"(c2) \
321 : "g"(0) \
322 : "cc"); \
323 asm ("addq %0,%0; adcq %2,%1" \
324 : "+a"(t1),"+d"(t2) \
325 : "g"(0) \
326 : "cc"); \
327 asm ("addq %2,%0; adcq %3,%1" \
328 : "+r"(c0),"+d"(t2) \
329 : "a"(t1),"g"(0) \
330 : "cc"); \
331 asm ("addq %2,%0; adcq %3,%1" \
332 : "+r"(c1),"+r"(c2) \
333 : "d"(t2),"g"(0) \
334 : "cc"); \
335 } while (0)
336#endif
337
338#define sqr_add_c2(a,i,j,c0,c1,c2) \
339 mul_add_c2((a)[i],(a)[j],c0,c1,c2)
340
341void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
342 {
343 BN_ULONG t1,t2;
344 BN_ULONG c1,c2,c3;
345
346 c1=0;
347 c2=0;
348 c3=0;
349 mul_add_c(a[0],b[0],c1,c2,c3);
350 r[0]=c1;
351 c1=0;
352 mul_add_c(a[0],b[1],c2,c3,c1);
353 mul_add_c(a[1],b[0],c2,c3,c1);
354 r[1]=c2;
355 c2=0;
356 mul_add_c(a[2],b[0],c3,c1,c2);
357 mul_add_c(a[1],b[1],c3,c1,c2);
358 mul_add_c(a[0],b[2],c3,c1,c2);
359 r[2]=c3;
360 c3=0;
361 mul_add_c(a[0],b[3],c1,c2,c3);
362 mul_add_c(a[1],b[2],c1,c2,c3);
363 mul_add_c(a[2],b[1],c1,c2,c3);
364 mul_add_c(a[3],b[0],c1,c2,c3);
365 r[3]=c1;
366 c1=0;
367 mul_add_c(a[4],b[0],c2,c3,c1);
368 mul_add_c(a[3],b[1],c2,c3,c1);
369 mul_add_c(a[2],b[2],c2,c3,c1);
370 mul_add_c(a[1],b[3],c2,c3,c1);
371 mul_add_c(a[0],b[4],c2,c3,c1);
372 r[4]=c2;
373 c2=0;
374 mul_add_c(a[0],b[5],c3,c1,c2);
375 mul_add_c(a[1],b[4],c3,c1,c2);
376 mul_add_c(a[2],b[3],c3,c1,c2);
377 mul_add_c(a[3],b[2],c3,c1,c2);
378 mul_add_c(a[4],b[1],c3,c1,c2);
379 mul_add_c(a[5],b[0],c3,c1,c2);
380 r[5]=c3;
381 c3=0;
382 mul_add_c(a[6],b[0],c1,c2,c3);
383 mul_add_c(a[5],b[1],c1,c2,c3);
384 mul_add_c(a[4],b[2],c1,c2,c3);
385 mul_add_c(a[3],b[3],c1,c2,c3);
386 mul_add_c(a[2],b[4],c1,c2,c3);
387 mul_add_c(a[1],b[5],c1,c2,c3);
388 mul_add_c(a[0],b[6],c1,c2,c3);
389 r[6]=c1;
390 c1=0;
391 mul_add_c(a[0],b[7],c2,c3,c1);
392 mul_add_c(a[1],b[6],c2,c3,c1);
393 mul_add_c(a[2],b[5],c2,c3,c1);
394 mul_add_c(a[3],b[4],c2,c3,c1);
395 mul_add_c(a[4],b[3],c2,c3,c1);
396 mul_add_c(a[5],b[2],c2,c3,c1);
397 mul_add_c(a[6],b[1],c2,c3,c1);
398 mul_add_c(a[7],b[0],c2,c3,c1);
399 r[7]=c2;
400 c2=0;
401 mul_add_c(a[7],b[1],c3,c1,c2);
402 mul_add_c(a[6],b[2],c3,c1,c2);
403 mul_add_c(a[5],b[3],c3,c1,c2);
404 mul_add_c(a[4],b[4],c3,c1,c2);
405 mul_add_c(a[3],b[5],c3,c1,c2);
406 mul_add_c(a[2],b[6],c3,c1,c2);
407 mul_add_c(a[1],b[7],c3,c1,c2);
408 r[8]=c3;
409 c3=0;
410 mul_add_c(a[2],b[7],c1,c2,c3);
411 mul_add_c(a[3],b[6],c1,c2,c3);
412 mul_add_c(a[4],b[5],c1,c2,c3);
413 mul_add_c(a[5],b[4],c1,c2,c3);
414 mul_add_c(a[6],b[3],c1,c2,c3);
415 mul_add_c(a[7],b[2],c1,c2,c3);
416 r[9]=c1;
417 c1=0;
418 mul_add_c(a[7],b[3],c2,c3,c1);
419 mul_add_c(a[6],b[4],c2,c3,c1);
420 mul_add_c(a[5],b[5],c2,c3,c1);
421 mul_add_c(a[4],b[6],c2,c3,c1);
422 mul_add_c(a[3],b[7],c2,c3,c1);
423 r[10]=c2;
424 c2=0;
425 mul_add_c(a[4],b[7],c3,c1,c2);
426 mul_add_c(a[5],b[6],c3,c1,c2);
427 mul_add_c(a[6],b[5],c3,c1,c2);
428 mul_add_c(a[7],b[4],c3,c1,c2);
429 r[11]=c3;
430 c3=0;
431 mul_add_c(a[7],b[5],c1,c2,c3);
432 mul_add_c(a[6],b[6],c1,c2,c3);
433 mul_add_c(a[5],b[7],c1,c2,c3);
434 r[12]=c1;
435 c1=0;
436 mul_add_c(a[6],b[7],c2,c3,c1);
437 mul_add_c(a[7],b[6],c2,c3,c1);
438 r[13]=c2;
439 c2=0;
440 mul_add_c(a[7],b[7],c3,c1,c2);
441 r[14]=c3;
442 r[15]=c1;
443 }
444
445void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
446 {
447 BN_ULONG t1,t2;
448 BN_ULONG c1,c2,c3;
449
450 c1=0;
451 c2=0;
452 c3=0;
453 mul_add_c(a[0],b[0],c1,c2,c3);
454 r[0]=c1;
455 c1=0;
456 mul_add_c(a[0],b[1],c2,c3,c1);
457 mul_add_c(a[1],b[0],c2,c3,c1);
458 r[1]=c2;
459 c2=0;
460 mul_add_c(a[2],b[0],c3,c1,c2);
461 mul_add_c(a[1],b[1],c3,c1,c2);
462 mul_add_c(a[0],b[2],c3,c1,c2);
463 r[2]=c3;
464 c3=0;
465 mul_add_c(a[0],b[3],c1,c2,c3);
466 mul_add_c(a[1],b[2],c1,c2,c3);
467 mul_add_c(a[2],b[1],c1,c2,c3);
468 mul_add_c(a[3],b[0],c1,c2,c3);
469 r[3]=c1;
470 c1=0;
471 mul_add_c(a[3],b[1],c2,c3,c1);
472 mul_add_c(a[2],b[2],c2,c3,c1);
473 mul_add_c(a[1],b[3],c2,c3,c1);
474 r[4]=c2;
475 c2=0;
476 mul_add_c(a[2],b[3],c3,c1,c2);
477 mul_add_c(a[3],b[2],c3,c1,c2);
478 r[5]=c3;
479 c3=0;
480 mul_add_c(a[3],b[3],c1,c2,c3);
481 r[6]=c1;
482 r[7]=c2;
483 }
484
485void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
486 {
487 BN_ULONG t1,t2;
488 BN_ULONG c1,c2,c3;
489
490 c1=0;
491 c2=0;
492 c3=0;
493 sqr_add_c(a,0,c1,c2,c3);
494 r[0]=c1;
495 c1=0;
496 sqr_add_c2(a,1,0,c2,c3,c1);
497 r[1]=c2;
498 c2=0;
499 sqr_add_c(a,1,c3,c1,c2);
500 sqr_add_c2(a,2,0,c3,c1,c2);
501 r[2]=c3;
502 c3=0;
503 sqr_add_c2(a,3,0,c1,c2,c3);
504 sqr_add_c2(a,2,1,c1,c2,c3);
505 r[3]=c1;
506 c1=0;
507 sqr_add_c(a,2,c2,c3,c1);
508 sqr_add_c2(a,3,1,c2,c3,c1);
509 sqr_add_c2(a,4,0,c2,c3,c1);
510 r[4]=c2;
511 c2=0;
512 sqr_add_c2(a,5,0,c3,c1,c2);
513 sqr_add_c2(a,4,1,c3,c1,c2);
514 sqr_add_c2(a,3,2,c3,c1,c2);
515 r[5]=c3;
516 c3=0;
517 sqr_add_c(a,3,c1,c2,c3);
518 sqr_add_c2(a,4,2,c1,c2,c3);
519 sqr_add_c2(a,5,1,c1,c2,c3);
520 sqr_add_c2(a,6,0,c1,c2,c3);
521 r[6]=c1;
522 c1=0;
523 sqr_add_c2(a,7,0,c2,c3,c1);
524 sqr_add_c2(a,6,1,c2,c3,c1);
525 sqr_add_c2(a,5,2,c2,c3,c1);
526 sqr_add_c2(a,4,3,c2,c3,c1);
527 r[7]=c2;
528 c2=0;
529 sqr_add_c(a,4,c3,c1,c2);
530 sqr_add_c2(a,5,3,c3,c1,c2);
531 sqr_add_c2(a,6,2,c3,c1,c2);
532 sqr_add_c2(a,7,1,c3,c1,c2);
533 r[8]=c3;
534 c3=0;
535 sqr_add_c2(a,7,2,c1,c2,c3);
536 sqr_add_c2(a,6,3,c1,c2,c3);
537 sqr_add_c2(a,5,4,c1,c2,c3);
538 r[9]=c1;
539 c1=0;
540 sqr_add_c(a,5,c2,c3,c1);
541 sqr_add_c2(a,6,4,c2,c3,c1);
542 sqr_add_c2(a,7,3,c2,c3,c1);
543 r[10]=c2;
544 c2=0;
545 sqr_add_c2(a,7,4,c3,c1,c2);
546 sqr_add_c2(a,6,5,c3,c1,c2);
547 r[11]=c3;
548 c3=0;
549 sqr_add_c(a,6,c1,c2,c3);
550 sqr_add_c2(a,7,5,c1,c2,c3);
551 r[12]=c1;
552 c1=0;
553 sqr_add_c2(a,7,6,c2,c3,c1);
554 r[13]=c2;
555 c2=0;
556 sqr_add_c(a,7,c3,c1,c2);
557 r[14]=c3;
558 r[15]=c1;
559 }
560
561void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
562 {
563 BN_ULONG t1,t2;
564 BN_ULONG c1,c2,c3;
565
566 c1=0;
567 c2=0;
568 c3=0;
569 sqr_add_c(a,0,c1,c2,c3);
570 r[0]=c1;
571 c1=0;
572 sqr_add_c2(a,1,0,c2,c3,c1);
573 r[1]=c2;
574 c2=0;
575 sqr_add_c(a,1,c3,c1,c2);
576 sqr_add_c2(a,2,0,c3,c1,c2);
577 r[2]=c3;
578 c3=0;
579 sqr_add_c2(a,3,0,c1,c2,c3);
580 sqr_add_c2(a,2,1,c1,c2,c3);
581 r[3]=c1;
582 c1=0;
583 sqr_add_c(a,2,c2,c3,c1);
584 sqr_add_c2(a,3,1,c2,c3,c1);
585 r[4]=c2;
586 c2=0;
587 sqr_add_c2(a,3,2,c3,c1,c2);
588 r[5]=c3;
589 c3=0;
590 sqr_add_c(a,3,c1,c2,c3);
591 r[6]=c1;
592 r[7]=c2;
593 }