summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorjsing <>2023-02-11 12:15:02 +0000
committerjsing <>2023-02-11 12:15:02 +0000
commitf3bce1e189bb4d596a7d1feae8b66b6c11c62761 (patch)
tree9c771840455a831a47e2a3c9c68970885a9a45c3 /src
parent67afc07de0ed3a0ccc272df42853ba565a8277c6 (diff)
downloadopenbsd-f3bce1e189bb4d596a7d1feae8b66b6c11c62761.tar.gz
openbsd-f3bce1e189bb4d596a7d1feae8b66b6c11c62761.tar.bz2
openbsd-f3bce1e189bb4d596a7d1feae8b66b6c11c62761.zip
Bye bye x86_64-gcc.c.
This is no longer used, since we're now using s2n-bignum functions instead.
Diffstat (limited to 'src')
-rw-r--r--src/lib/libcrypto/bn/asm/x86_64-gcc.c559
1 files changed, 0 insertions, 559 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
deleted file mode 100644
index c6d6239bc2..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ /dev/null
@@ -1,559 +0,0 @@
1/* $OpenBSD: x86_64-gcc.c,v 1.8 2023/01/20 17:26:03 jsing Exp $ */
2#include "../bn_local.h"
3/*
4 * x86_64 BIGNUM accelerator version 0.1, December 2002.
5 *
6 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7 * project.
8 *
9 * Rights for redistribution and usage in source and binary forms are
10 * granted according to the OpenSSL license. Warranty of any kind is
11 * disclaimed.
12 *
13 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
14 * versions, like 1.0...
15 * A. Well, that's because this code is basically a quick-n-dirty
16 * proof-of-concept hack. As you can see it's implemented with
17 * inline assembler, which means that you're bound to GCC and that
18 * there might be enough room for further improvement.
19 *
 * Q. Why inline assembler?
 * A. x86_64 has its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
25 *
26 * Q. How much faster does it get?
27 * A. 'apps/openssl speed rsa dsa' output with no-asm:
28 *
29 * sign verify sign/s verify/s
30 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
31 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
32 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
33 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
34 * sign verify sign/s verify/s
35 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
36 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
37 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
38 *
39 * 'apps/openssl speed rsa dsa' output with this module:
40 *
41 * sign verify sign/s verify/s
42 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
43 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
44 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
45 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
46 * sign verify sign/s verify/s
47 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
48 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
49 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
50 *
51 * For the reference. IA-32 assembler implementation performs
52 * very much like 64-bit code compiled with no-asm on the same
53 * machine.
54 */
55
56#define BN_ULONG unsigned long
57
58#undef mul
59#undef mul_add
60#undef sqr
61
/*
 * "m"(a), "+m"(r)	favor DirectPath µ-code;
 * "g"(0)		lets the compiler decide where it wants to
 *			keep the value of zero.
 */
/*
 * mul_add(r, a, word, carry):
 *   r     = low word of (r + a * word + carry)
 *   carry = high word of that sum
 * r must be an lvalue in memory and carry a BN_ULONG lvalue.
 */
#define mul_add(r,a,word,carry) do {			\
	BN_ULONG prod_hi, prod_lo;			\
	/* prod_hi:prod_lo = a * word */		\
	__asm__ ("mulq %3"				\
	    : "=a"(prod_lo), "=d"(prod_hi)		\
	    : "a"(word), "m"(a)				\
	    : "cc");					\
	/* carry += prod_lo, ripple into prod_hi */	\
	__asm__ ("addq %2,%0; adcq %3,%1"		\
	    : "+r"(carry), "+d"(prod_hi)		\
	    : "a"(prod_lo), "g"(0)			\
	    : "cc");					\
	/* r += carry, ripple into prod_hi */		\
	__asm__ ("addq %2,%0; adcq %3,%1"		\
	    : "+m"(r), "+d"(prod_hi)			\
	    : "r"(carry), "g"(0)			\
	    : "cc");					\
	carry = prod_hi;				\
} while (0)
83
/*
 * mul(r, a, word, carry):
 *   r     = low word of (a * word + carry)
 *   carry = high word of that sum
 * r must be an assignable lvalue and carry a BN_ULONG lvalue.
 *
 * The multiplicand is constrained to "rm" rather than "g": mulq only
 * takes a register or memory operand, so the immediate alternative that
 * "g" permits must never be selected.  The cast keeps the operand
 * BN_ULONG-wide whatever expression the caller passes.
 */
#define mul(r,a,word,carry) do {		\
	BN_ULONG high,low;			\
	/* high:low = a * word */		\
	__asm__ ("mulq %3"			\
		: "=a"(low),"=d"(high)		\
		: "a"(word),"rm"((BN_ULONG)(a))	\
		: "cc");			\
	/* carry += low, ripple into high */	\
	__asm__ ("addq %2,%0; adcq %3,%1"	\
		: "+r"(carry),"+d"(high)	\
		: "a"(low),"g"(0)		\
		: "cc");			\
	(r)=carry, carry=high;			\
	} while (0)
96
/*
 * sqr(r0, r1, a): r1:r0 = a * a (r0 receives the low word, r1 the high
 * word).  r0 and r1 must be assignable lvalues.
 *
 * Wrapped in do { } while (0) without a trailing semicolon so that the
 * macro expands to exactly one statement and is safe in if/else bodies;
 * callers supply the terminating ';' themselves.
 */
#define sqr(r0,r1,a) do {		\
	__asm__ ("mulq %2"		\
		: "=a"(r0),"=d"(r1)	\
		: "a"(a)		\
		: "cc");		\
	} while (0)
102
103BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
104 {
105 BN_ULONG c1=0;
106
107 if (num <= 0) return(c1);
108
109 while (num&~3)
110 {
111 mul_add(rp[0],ap[0],w,c1);
112 mul_add(rp[1],ap[1],w,c1);
113 mul_add(rp[2],ap[2],w,c1);
114 mul_add(rp[3],ap[3],w,c1);
115 ap+=4; rp+=4; num-=4;
116 }
117 if (num)
118 {
119 mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
120 mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
121 mul_add(rp[2],ap[2],w,c1); return c1;
122 }
123
124 return(c1);
125 }
126
127BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
128 {
129 BN_ULONG c1=0;
130
131 if (num <= 0) return(c1);
132
133 while (num&~3)
134 {
135 mul(rp[0],ap[0],w,c1);
136 mul(rp[1],ap[1],w,c1);
137 mul(rp[2],ap[2],w,c1);
138 mul(rp[3],ap[3],w,c1);
139 ap+=4; rp+=4; num-=4;
140 }
141 if (num)
142 {
143 mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
144 mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
145 mul(rp[2],ap[2],w,c1);
146 }
147 return(c1);
148 }
149
150void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
151 {
152 if (n <= 0) return;
153
154 while (n&~3)
155 {
156 sqr(r[0],r[1],a[0]);
157 sqr(r[2],r[3],a[1]);
158 sqr(r[4],r[5],a[2]);
159 sqr(r[6],r[7],a[3]);
160 a+=4; r+=8; n-=4;
161 }
162 if (n)
163 {
164 sqr(r[0],r[1],a[0]); if (--n == 0) return;
165 sqr(r[2],r[3],a[1]); if (--n == 0) return;
166 sqr(r[4],r[5],a[2]);
167 }
168 }
169
170BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
171{ BN_ULONG ret,waste;
172
173 asm ("divq %4"
174 : "=a"(ret),"=d"(waste)
175 : "a"(l),"d"(h),"g"(d)
176 : "cc");
177
178 return ret;
179}
180
181BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
182{ BN_ULONG ret=0,i=0;
183
184 if (n <= 0) return 0;
185
186 asm (
187 " subq %2,%2 \n"
188 ".p2align 4 \n"
189 "1: movq (%4,%2,8),%0 \n"
190 " adcq (%5,%2,8),%0 \n"
191 " movq %0,(%3,%2,8) \n"
192 " leaq 1(%2),%2 \n"
193 " loop 1b \n"
194 " sbbq %0,%0 \n"
195 : "=&a"(ret),"+c"(n),"=&r"(i)
196 : "r"(rp),"r"(ap),"r"(bp)
197 : "cc"
198 );
199
200 return ret&1;
201}
202
203BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
204{ BN_ULONG ret=0,i=0;
205
206 if (n <= 0) return 0;
207
208 asm (
209 " subq %2,%2 \n"
210 ".p2align 4 \n"
211 "1: movq (%4,%2,8),%0 \n"
212 " sbbq (%5,%2,8),%0 \n"
213 " movq %0,(%3,%2,8) \n"
214 " leaq 1(%2),%2 \n"
215 " loop 1b \n"
216 " sbbq %0,%0 \n"
217 : "=&a"(ret),"+c"(n),"=&r"(i)
218 : "r"(rp),"r"(ap),"r"(bp)
219 : "cc"
220 );
221
222 return ret&1;
223}
224
/* mul_add_c(a,b,c0,c1,c2)      -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)     -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)      -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2)   -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
229
230#undef mul_add_c
231#undef mul_add_c2
232#undef sqr_add_c
233#undef sqr_add_c2
234
235/*
236 * Keep in mind that carrying into high part of multiplication result
237 * can not overflow, because it cannot be all-ones.
238 */
#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a,b,c0,c1,c2) do {	\
	BN_ULONG ta = (a), tb = (b);		\
	BN_ULONG lo, hi;			\
	BN_UMULT_LOHI(lo,hi,ta,tb);		\
	c0 += lo; hi += (c0<lo)?1:0;		\
	c1 += hi; c2 += (c1<hi)?1:0;		\
	} while(0)

#define mul_add_c2(a,b,c0,c1,c2) do {	\
	BN_ULONG ta = (a), tb = (b);		\
	BN_ULONG lo, hi, tt;			\
	BN_UMULT_LOHI(lo,hi,ta,tb);		\
	c0 += lo; tt = hi+((c0<lo)?1:0);	\
	c1 += tt; c2 += (c1<tt)?1:0;		\
	c0 += lo; hi += (c0<lo)?1:0;		\
	c1 += hi; c2 += (c1<hi)?1:0;		\
	} while(0)

#define sqr_add_c(a,i,c0,c1,c2) do {	\
	BN_ULONG ta = (a)[i];			\
	BN_ULONG lo, hi;			\
	BN_UMULT_LOHI(lo,hi,ta,ta);		\
	c0 += lo; hi += (c0<lo)?1:0;		\
	c1 += hi; c2 += (c1<hi)?1:0;		\
	} while(0)
#else
/*
 * mul_add_c(a, b, c0, c1, c2): (c2,c1,c0) += a * b.
 * The product is formed with mulq and then added into the three-word
 * accumulator with an add / adc / adc chain.
 */
#define mul_add_c(a,b,c0,c1,c2) do {	\
	BN_ULONG t1,t2;				\
	__asm__ ("mulq %3"			\
		: "=a"(t1),"=d"(t2)		\
		: "a"(a),"m"(b)			\
		: "cc");			\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	} while (0)

/*
 * sqr_add_c(a, i, c0, c1, c2): (c2,c1,c0) += a[i] * a[i].
 * The array argument is parenthesised before indexing so that an
 * expression such as "p + 1" indexes as intended, matching sqr_add_c2
 * and the reference version of this macro.
 */
#define sqr_add_c(a,i,c0,c1,c2) do {	\
	BN_ULONG t1,t2;				\
	__asm__ ("mulq %2"			\
		: "=a"(t1),"=d"(t2)		\
		: "a"((a)[i])			\
		: "cc");			\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	} while (0)

/*
 * mul_add_c2(a, b, c0, c1, c2): (c2,c1,c0) += 2 * a * b.
 * The product is computed once and added into the accumulator twice.
 */
#define mul_add_c2(a,b,c0,c1,c2) do {	\
	BN_ULONG t1,t2;				\
	__asm__ ("mulq %3"			\
		: "=a"(t1),"=d"(t2)		\
		: "a"(a),"m"(b)			\
		: "cc");			\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	__asm__ ("addq %3,%0; adcq %4,%1; adcq %5,%2"	\
		: "+r"(c0),"+r"(c1),"+r"(c2)		\
		: "r"(t1),"r"(t2),"g"(0)		\
		: "cc");				\
	} while (0)
#endif

/* sqr_add_c2(a, i, j, c0, c1, c2): (c2,c1,c0) += 2 * a[i] * a[j]. */
#define sqr_add_c2(a,i,j,c0,c1,c2)	\
	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
310
311void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
312 {
313 BN_ULONG c1,c2,c3;
314
315 c1=0;
316 c2=0;
317 c3=0;
318 mul_add_c(a[0],b[0],c1,c2,c3);
319 r[0]=c1;
320 c1=0;
321 mul_add_c(a[0],b[1],c2,c3,c1);
322 mul_add_c(a[1],b[0],c2,c3,c1);
323 r[1]=c2;
324 c2=0;
325 mul_add_c(a[2],b[0],c3,c1,c2);
326 mul_add_c(a[1],b[1],c3,c1,c2);
327 mul_add_c(a[0],b[2],c3,c1,c2);
328 r[2]=c3;
329 c3=0;
330 mul_add_c(a[0],b[3],c1,c2,c3);
331 mul_add_c(a[1],b[2],c1,c2,c3);
332 mul_add_c(a[2],b[1],c1,c2,c3);
333 mul_add_c(a[3],b[0],c1,c2,c3);
334 r[3]=c1;
335 c1=0;
336 mul_add_c(a[4],b[0],c2,c3,c1);
337 mul_add_c(a[3],b[1],c2,c3,c1);
338 mul_add_c(a[2],b[2],c2,c3,c1);
339 mul_add_c(a[1],b[3],c2,c3,c1);
340 mul_add_c(a[0],b[4],c2,c3,c1);
341 r[4]=c2;
342 c2=0;
343 mul_add_c(a[0],b[5],c3,c1,c2);
344 mul_add_c(a[1],b[4],c3,c1,c2);
345 mul_add_c(a[2],b[3],c3,c1,c2);
346 mul_add_c(a[3],b[2],c3,c1,c2);
347 mul_add_c(a[4],b[1],c3,c1,c2);
348 mul_add_c(a[5],b[0],c3,c1,c2);
349 r[5]=c3;
350 c3=0;
351 mul_add_c(a[6],b[0],c1,c2,c3);
352 mul_add_c(a[5],b[1],c1,c2,c3);
353 mul_add_c(a[4],b[2],c1,c2,c3);
354 mul_add_c(a[3],b[3],c1,c2,c3);
355 mul_add_c(a[2],b[4],c1,c2,c3);
356 mul_add_c(a[1],b[5],c1,c2,c3);
357 mul_add_c(a[0],b[6],c1,c2,c3);
358 r[6]=c1;
359 c1=0;
360 mul_add_c(a[0],b[7],c2,c3,c1);
361 mul_add_c(a[1],b[6],c2,c3,c1);
362 mul_add_c(a[2],b[5],c2,c3,c1);
363 mul_add_c(a[3],b[4],c2,c3,c1);
364 mul_add_c(a[4],b[3],c2,c3,c1);
365 mul_add_c(a[5],b[2],c2,c3,c1);
366 mul_add_c(a[6],b[1],c2,c3,c1);
367 mul_add_c(a[7],b[0],c2,c3,c1);
368 r[7]=c2;
369 c2=0;
370 mul_add_c(a[7],b[1],c3,c1,c2);
371 mul_add_c(a[6],b[2],c3,c1,c2);
372 mul_add_c(a[5],b[3],c3,c1,c2);
373 mul_add_c(a[4],b[4],c3,c1,c2);
374 mul_add_c(a[3],b[5],c3,c1,c2);
375 mul_add_c(a[2],b[6],c3,c1,c2);
376 mul_add_c(a[1],b[7],c3,c1,c2);
377 r[8]=c3;
378 c3=0;
379 mul_add_c(a[2],b[7],c1,c2,c3);
380 mul_add_c(a[3],b[6],c1,c2,c3);
381 mul_add_c(a[4],b[5],c1,c2,c3);
382 mul_add_c(a[5],b[4],c1,c2,c3);
383 mul_add_c(a[6],b[3],c1,c2,c3);
384 mul_add_c(a[7],b[2],c1,c2,c3);
385 r[9]=c1;
386 c1=0;
387 mul_add_c(a[7],b[3],c2,c3,c1);
388 mul_add_c(a[6],b[4],c2,c3,c1);
389 mul_add_c(a[5],b[5],c2,c3,c1);
390 mul_add_c(a[4],b[6],c2,c3,c1);
391 mul_add_c(a[3],b[7],c2,c3,c1);
392 r[10]=c2;
393 c2=0;
394 mul_add_c(a[4],b[7],c3,c1,c2);
395 mul_add_c(a[5],b[6],c3,c1,c2);
396 mul_add_c(a[6],b[5],c3,c1,c2);
397 mul_add_c(a[7],b[4],c3,c1,c2);
398 r[11]=c3;
399 c3=0;
400 mul_add_c(a[7],b[5],c1,c2,c3);
401 mul_add_c(a[6],b[6],c1,c2,c3);
402 mul_add_c(a[5],b[7],c1,c2,c3);
403 r[12]=c1;
404 c1=0;
405 mul_add_c(a[6],b[7],c2,c3,c1);
406 mul_add_c(a[7],b[6],c2,c3,c1);
407 r[13]=c2;
408 c2=0;
409 mul_add_c(a[7],b[7],c3,c1,c2);
410 r[14]=c3;
411 r[15]=c1;
412 }
413
414void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
415 {
416 BN_ULONG c1,c2,c3;
417
418 c1=0;
419 c2=0;
420 c3=0;
421 mul_add_c(a[0],b[0],c1,c2,c3);
422 r[0]=c1;
423 c1=0;
424 mul_add_c(a[0],b[1],c2,c3,c1);
425 mul_add_c(a[1],b[0],c2,c3,c1);
426 r[1]=c2;
427 c2=0;
428 mul_add_c(a[2],b[0],c3,c1,c2);
429 mul_add_c(a[1],b[1],c3,c1,c2);
430 mul_add_c(a[0],b[2],c3,c1,c2);
431 r[2]=c3;
432 c3=0;
433 mul_add_c(a[0],b[3],c1,c2,c3);
434 mul_add_c(a[1],b[2],c1,c2,c3);
435 mul_add_c(a[2],b[1],c1,c2,c3);
436 mul_add_c(a[3],b[0],c1,c2,c3);
437 r[3]=c1;
438 c1=0;
439 mul_add_c(a[3],b[1],c2,c3,c1);
440 mul_add_c(a[2],b[2],c2,c3,c1);
441 mul_add_c(a[1],b[3],c2,c3,c1);
442 r[4]=c2;
443 c2=0;
444 mul_add_c(a[2],b[3],c3,c1,c2);
445 mul_add_c(a[3],b[2],c3,c1,c2);
446 r[5]=c3;
447 c3=0;
448 mul_add_c(a[3],b[3],c1,c2,c3);
449 r[6]=c1;
450 r[7]=c2;
451 }
452
453void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
454 {
455 BN_ULONG c1,c2,c3;
456
457 c1=0;
458 c2=0;
459 c3=0;
460 sqr_add_c(a,0,c1,c2,c3);
461 r[0]=c1;
462 c1=0;
463 sqr_add_c2(a,1,0,c2,c3,c1);
464 r[1]=c2;
465 c2=0;
466 sqr_add_c(a,1,c3,c1,c2);
467 sqr_add_c2(a,2,0,c3,c1,c2);
468 r[2]=c3;
469 c3=0;
470 sqr_add_c2(a,3,0,c1,c2,c3);
471 sqr_add_c2(a,2,1,c1,c2,c3);
472 r[3]=c1;
473 c1=0;
474 sqr_add_c(a,2,c2,c3,c1);
475 sqr_add_c2(a,3,1,c2,c3,c1);
476 sqr_add_c2(a,4,0,c2,c3,c1);
477 r[4]=c2;
478 c2=0;
479 sqr_add_c2(a,5,0,c3,c1,c2);
480 sqr_add_c2(a,4,1,c3,c1,c2);
481 sqr_add_c2(a,3,2,c3,c1,c2);
482 r[5]=c3;
483 c3=0;
484 sqr_add_c(a,3,c1,c2,c3);
485 sqr_add_c2(a,4,2,c1,c2,c3);
486 sqr_add_c2(a,5,1,c1,c2,c3);
487 sqr_add_c2(a,6,0,c1,c2,c3);
488 r[6]=c1;
489 c1=0;
490 sqr_add_c2(a,7,0,c2,c3,c1);
491 sqr_add_c2(a,6,1,c2,c3,c1);
492 sqr_add_c2(a,5,2,c2,c3,c1);
493 sqr_add_c2(a,4,3,c2,c3,c1);
494 r[7]=c2;
495 c2=0;
496 sqr_add_c(a,4,c3,c1,c2);
497 sqr_add_c2(a,5,3,c3,c1,c2);
498 sqr_add_c2(a,6,2,c3,c1,c2);
499 sqr_add_c2(a,7,1,c3,c1,c2);
500 r[8]=c3;
501 c3=0;
502 sqr_add_c2(a,7,2,c1,c2,c3);
503 sqr_add_c2(a,6,3,c1,c2,c3);
504 sqr_add_c2(a,5,4,c1,c2,c3);
505 r[9]=c1;
506 c1=0;
507 sqr_add_c(a,5,c2,c3,c1);
508 sqr_add_c2(a,6,4,c2,c3,c1);
509 sqr_add_c2(a,7,3,c2,c3,c1);
510 r[10]=c2;
511 c2=0;
512 sqr_add_c2(a,7,4,c3,c1,c2);
513 sqr_add_c2(a,6,5,c3,c1,c2);
514 r[11]=c3;
515 c3=0;
516 sqr_add_c(a,6,c1,c2,c3);
517 sqr_add_c2(a,7,5,c1,c2,c3);
518 r[12]=c1;
519 c1=0;
520 sqr_add_c2(a,7,6,c2,c3,c1);
521 r[13]=c2;
522 c2=0;
523 sqr_add_c(a,7,c3,c1,c2);
524 r[14]=c3;
525 r[15]=c1;
526 }
527
528void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
529 {
530 BN_ULONG c1,c2,c3;
531
532 c1=0;
533 c2=0;
534 c3=0;
535 sqr_add_c(a,0,c1,c2,c3);
536 r[0]=c1;
537 c1=0;
538 sqr_add_c2(a,1,0,c2,c3,c1);
539 r[1]=c2;
540 c2=0;
541 sqr_add_c(a,1,c3,c1,c2);
542 sqr_add_c2(a,2,0,c3,c1,c2);
543 r[2]=c3;
544 c3=0;
545 sqr_add_c2(a,3,0,c1,c2,c3);
546 sqr_add_c2(a,2,1,c1,c2,c3);
547 r[3]=c1;
548 c1=0;
549 sqr_add_c(a,2,c2,c3,c1);
550 sqr_add_c2(a,3,1,c2,c3,c1);
551 r[4]=c2;
552 c2=0;
553 sqr_add_c2(a,3,2,c3,c1,c2);
554 r[5]=c3;
555 c3=0;
556 sqr_add_c(a,3,c1,c2,c3);
557 r[6]=c1;
558 r[7]=c2;
559 }