Bye bye x86_64-gcc.c.

This is no longer used, since we're now using s2n-bignum functions instead.
author: jsing <> 2023-02-11 12:15:02 +0000
committer: jsing <> 2023-02-11 12:15:02 +0000
commit: f3bce1e189bb4d596a7d1feae8b66b6c11c62761 (patch)
tree: 9c771840455a831a47e2a3c9c68970885a9a45c3 /src
parent: 67afc07de0ed3a0ccc272df42853ba565a8277c6 (diff)
download: openbsd-f3bce1e189bb4d596a7d1feae8b66b6c11c62761.tar.gz
openbsd-f3bce1e189bb4d596a7d1feae8b66b6c11c62761.tar.bz2
openbsd-f3bce1e189bb4d596a7d1feae8b66b6c11c62761.zip
1 files changed, 0 insertions, 559 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
deleted file mode 100644
index c6d6239bc2..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ /dev/null
@@ -1,559 +0,0 @@
-/* $OpenBSD: x86_64-gcc.c,v 1.8 2023/01/20 17:26:03 jsing Exp $ */
-#include "../bn_local.h"
-/*
- * x86_64 BIGNUM accelerator version 0.1, December 2002.
- *
- * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
- * project.
- *
- * Rights for redistribution and usage in source and binary forms are
- * granted according to the OpenSSL license. Warranty of any kind is
- * disclaimed.
- *
- * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
- *    versions, like 1.0...
- * A. Well, that's because this code is basically a quick-n-dirty
- *    proof-of-concept hack. As you can see it's implemented with
- *    inline assembler, which means that you're bound to GCC and that
- *    there might be enough room for further improvement.
- *
- * Q. Why inline assembler?
- * A. x86_64 features own ABI which I'm not familiar with. This is
- *    why I decided to let the compiler take care of subroutine
- *    prologue/epilogue as well as register allocation. For reference.
- *    Win64 implements different ABI for AMD64, different from Linux.
- *
- * Q. How much faster does it get?
- * A. 'apps/openssl speed rsa dsa' output with no-asm:
- *
- *                        sign    verify    sign/s verify/s
- *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
- *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
- *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
- *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
- *                        sign    verify    sign/s verify/s
- *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
- *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
- *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
- *
- *    'apps/openssl speed rsa dsa' output with this module:
- *
- *                        sign    verify    sign/s verify/s
- *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
- *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
- *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
- *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
- *                        sign    verify    sign/s verify/s
- *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
- *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
- *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
- *
- *    For the reference. IA-32 assembler implementation performs
- *    very much like 64-bit code compiled with no-asm on the same
- *    machine.
- */
-#define BN_ULONG unsigned long
-#undef mul
-#undef mul_add
-#undef sqr
-/*
- * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
- * "g"(0)               let the compiler to decide where does it
- *                      want to keep the value of zero;
- */
-#define mul_add(r,a,word,carry) do {    \
-        BN_ULONG high,low;      \
-        asm ("mulq %3"                  \
-                : "=a"(low),"=d"(high)  \
-                : "a"(word),"m"(a)      \
-                : "cc");                \
-        asm ("addq %2,%0; adcq %3,%1"   \
-                : "+r"(carry),"+d"(high)\
-                : "a"(low),"g"(0)       \
-                : "cc");                \
-        asm ("addq %2,%0; adcq %3,%1"   \
-                : "+m"(r),"+d"(high)    \
-                : "r"(carry),"g"(0)     \
-                : "cc");                \
-        carry=high;                     \
-        } while (0)
-#define mul(r,a,word,carry) do {        \
-        BN_ULONG high,low;      \
-        asm ("mulq %3"                  \
-                : "=a"(low),"=d"(high)  \
-                : "a"(word),"g"(a)      \
-                : "cc");                \
-        asm ("addq %2,%0; adcq %3,%1"   \
-                : "+r"(carry),"+d"(high)\
-                : "a"(low),"g"(0)       \
-                : "cc");                \
-        (r)=carry, carry=high;          \
-        } while (0)
-#define sqr(r0,r1,a)                    \
-        asm ("mulq %2"                  \
-                : "=a"(r0),"=d"(r1)     \
-                : "a"(a)                \
-                : "cc");
-BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
-        {
-        BN_ULONG c1=0;
-        if (num <= 0) return(c1);
-        while (num&~3)
-                {
-                mul_add(rp[0],ap[0],w,c1);
-                mul_add(rp[1],ap[1],w,c1);
-                mul_add(rp[2],ap[2],w,c1);
-                mul_add(rp[3],ap[3],w,c1);
-                ap+=4; rp+=4; num-=4;
-                }
-        if (num)
-                {
-                mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
-                mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
-                mul_add(rp[2],ap[2],w,c1); return c1;
-                }
-        
-        return(c1);
-        } 
-BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
-        {
-        BN_ULONG c1=0;
-        if (num <= 0) return(c1);
-        while (num&~3)
-                {
-                mul(rp[0],ap[0],w,c1);
-                mul(rp[1],ap[1],w,c1);
-                mul(rp[2],ap[2],w,c1);
-                mul(rp[3],ap[3],w,c1);
-                ap+=4; rp+=4; num-=4;
-                }
-        if (num)
-                {
-                mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
-                mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
-                mul(rp[2],ap[2],w,c1);
-                }
-        return(c1);
-        } 
-void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
-        {
-        if (n <= 0) return;
-        while (n&~3)
-                {
-                sqr(r[0],r[1],a[0]);
-                sqr(r[2],r[3],a[1]);
-                sqr(r[4],r[5],a[2]);
-                sqr(r[6],r[7],a[3]);
-                a+=4; r+=8; n-=4;
-                }
-        if (n)
-                {
-                sqr(r[0],r[1],a[0]); if (--n == 0) return;
-                sqr(r[2],r[3],a[1]); if (--n == 0) return;
-                sqr(r[4],r[5],a[2]);
-                }
-        }
-BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
-{       BN_ULONG ret,waste;
-        asm ("divq      %4"
-                : "=a"(ret),"=d"(waste)
-                : "a"(l),"d"(h),"g"(d)
-                : "cc");
-        return ret;
-}
-BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
-{ BN_ULONG ret=0,i=0;
-        if (n <= 0) return 0;
-        asm (
-        "       subq    %2,%2           \n"
-        ".p2align 4                     \n"
-        "1:     movq    (%4,%2,8),%0    \n"
-        "       adcq    (%5,%2,8),%0    \n"
-        "       movq    %0,(%3,%2,8)    \n"
-        "       leaq    1(%2),%2        \n"
-        "       loop    1b              \n"
-        "       sbbq    %0,%0           \n"
-                : "=&a"(ret),"+c"(n),"=&r"(i)
-                : "r"(rp),"r"(ap),"r"(bp)
-                : "cc"
-        );
-  return ret&1;
-}
-BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
-{ BN_ULONG ret=0,i=0;
-        if (n <= 0) return 0;
-        asm (
-        "       subq    %2,%2           \n"
-        ".p2align 4                     \n"
-        "1:     movq    (%4,%2,8),%0    \n"
-        "       sbbq    (%5,%2,8),%0    \n"
-        "       movq    %0,(%3,%2,8)    \n"
-        "       leaq    1(%2),%2        \n"
-        "       loop    1b              \n"
-        "       sbbq    %0,%0           \n"
-                : "=&a"(ret),"+c"(n),"=&r"(i)
-                : "r"(rp),"r"(ap),"r"(bp)
-                : "cc"
-        );
-  return ret&1;
-}
-/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
-/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
-/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
-/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
-#undef mul_add_c
-#undef mul_add_c2
-#undef sqr_add_c
-#undef sqr_add_c2
-/*
- * Keep in mind that carrying into high part of multiplication result
- * can not overflow, because it cannot be all-ones.
- */
-#if 0
-/* original macros are kept for reference purposes */
-#define mul_add_c(a,b,c0,c1,c2)         do {    \
-        BN_ULONG ta = (a), tb = (b);            \
-        BN_ULONG lo, hi;                        \
-        BN_UMULT_LOHI(lo,hi,ta,tb);             \
-        c0 += lo; hi += (c0<lo)?1:0;            \
-        c1 += hi; c2 += (c1<hi)?1:0;            \
-        } while(0)
-#define mul_add_c2(a,b,c0,c1,c2)        do {    \
-        BN_ULONG ta = (a), tb = (b);            \
-        BN_ULONG lo, hi, tt;                    \
-        BN_UMULT_LOHI(lo,hi,ta,tb);             \
-        c0 += lo; tt = hi+((c0<lo)?1:0);        \
-        c1 += tt; c2 += (c1<tt)?1:0;            \
-        c0 += lo; hi += (c0<lo)?1:0;            \
-        c1 += hi; c2 += (c1<hi)?1:0;            \
-        } while(0)
-#define sqr_add_c(a,i,c0,c1,c2)         do {    \
-        BN_ULONG ta = (a)[i];                   \
-        BN_ULONG lo, hi;                        \
-        BN_UMULT_LOHI(lo,hi,ta,ta);             \
-        c0 += lo; hi += (c0<lo)?1:0;            \
-        c1 += hi; c2 += (c1<hi)?1:0;            \
-        } while(0)
-#else
-#define mul_add_c(a,b,c0,c1,c2) do {    \
-        BN_ULONG t1,t2;                 \
-        asm ("mulq %3"                  \
-                : "=a"(t1),"=d"(t2)     \
-                : "a"(a),"m"(b)         \
-                : "cc");                \
-        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
-                : "+r"(c0),"+r"(c1),"+r"(c2)            \
-                : "r"(t1),"r"(t2),"g"(0)                \
-                : "cc");                                \
-        } while (0)
-#define sqr_add_c(a,i,c0,c1,c2) do {    \
-        BN_ULONG t1,t2;                 \
-        asm ("mulq %2"                  \
-                : "=a"(t1),"=d"(t2)     \
-                : "a"(a[i])             \
-                : "cc");                \
-        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
-                : "+r"(c0),"+r"(c1),"+r"(c2)            \
-                : "r"(t1),"r"(t2),"g"(0)                \
-                : "cc");                                \
-        } while (0)
-#define mul_add_c2(a,b,c0,c1,c2) do {   \
-        BN_ULONG t1,t2;                 \
-        asm ("mulq %3"                  \
-                : "=a"(t1),"=d"(t2)     \
-                : "a"(a),"m"(b)         \
-                : "cc");                \
-        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
-                : "+r"(c0),"+r"(c1),"+r"(c2)            \
-                : "r"(t1),"r"(t2),"g"(0)                \
-                : "cc");                                \
-        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
-                : "+r"(c0),"+r"(c1),"+r"(c2)            \
-                : "r"(t1),"r"(t2),"g"(0)                \
-                : "cc");                                \
-        } while (0)
-#endif
-#define sqr_add_c2(a,i,j,c0,c1,c2)      \
-        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
-void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-        {
-        BN_ULONG c1,c2,c3;
-        c1=0;
-        c2=0;
-        c3=0;
-        mul_add_c(a[0],b[0],c1,c2,c3);
-        r[0]=c1;
-        c1=0;
-        mul_add_c(a[0],b[1],c2,c3,c1);
-        mul_add_c(a[1],b[0],c2,c3,c1);
-        r[1]=c2;
-        c2=0;
-        mul_add_c(a[2],b[0],c3,c1,c2);
-        mul_add_c(a[1],b[1],c3,c1,c2);
-        mul_add_c(a[0],b[2],c3,c1,c2);
-        r[2]=c3;
-        c3=0;
-        mul_add_c(a[0],b[3],c1,c2,c3);
-        mul_add_c(a[1],b[2],c1,c2,c3);
-        mul_add_c(a[2],b[1],c1,c2,c3);
-        mul_add_c(a[3],b[0],c1,c2,c3);
-        r[3]=c1;
-        c1=0;
-        mul_add_c(a[4],b[0],c2,c3,c1);
-        mul_add_c(a[3],b[1],c2,c3,c1);
-        mul_add_c(a[2],b[2],c2,c3,c1);
-        mul_add_c(a[1],b[3],c2,c3,c1);
-        mul_add_c(a[0],b[4],c2,c3,c1);
-        r[4]=c2;
-        c2=0;
-        mul_add_c(a[0],b[5],c3,c1,c2);
-        mul_add_c(a[1],b[4],c3,c1,c2);
-        mul_add_c(a[2],b[3],c3,c1,c2);
-        mul_add_c(a[3],b[2],c3,c1,c2);
-        mul_add_c(a[4],b[1],c3,c1,c2);
-        mul_add_c(a[5],b[0],c3,c1,c2);
-        r[5]=c3;
-        c3=0;
-        mul_add_c(a[6],b[0],c1,c2,c3);
-        mul_add_c(a[5],b[1],c1,c2,c3);
-        mul_add_c(a[4],b[2],c1,c2,c3);
-        mul_add_c(a[3],b[3],c1,c2,c3);
-        mul_add_c(a[2],b[4],c1,c2,c3);
-        mul_add_c(a[1],b[5],c1,c2,c3);
-        mul_add_c(a[0],b[6],c1,c2,c3);
-        r[6]=c1;
-        c1=0;
-        mul_add_c(a[0],b[7],c2,c3,c1);
-        mul_add_c(a[1],b[6],c2,c3,c1);
-        mul_add_c(a[2],b[5],c2,c3,c1);
-        mul_add_c(a[3],b[4],c2,c3,c1);
-        mul_add_c(a[4],b[3],c2,c3,c1);
-        mul_add_c(a[5],b[2],c2,c3,c1);
-        mul_add_c(a[6],b[1],c2,c3,c1);
-        mul_add_c(a[7],b[0],c2,c3,c1);
-        r[7]=c2;
-        c2=0;
-        mul_add_c(a[7],b[1],c3,c1,c2);
-        mul_add_c(a[6],b[2],c3,c1,c2);
-        mul_add_c(a[5],b[3],c3,c1,c2);
-        mul_add_c(a[4],b[4],c3,c1,c2);
-        mul_add_c(a[3],b[5],c3,c1,c2);
-        mul_add_c(a[2],b[6],c3,c1,c2);
-        mul_add_c(a[1],b[7],c3,c1,c2);
-        r[8]=c3;
-        c3=0;
-        mul_add_c(a[2],b[7],c1,c2,c3);
-        mul_add_c(a[3],b[6],c1,c2,c3);
-        mul_add_c(a[4],b[5],c1,c2,c3);
-        mul_add_c(a[5],b[4],c1,c2,c3);
-        mul_add_c(a[6],b[3],c1,c2,c3);
-        mul_add_c(a[7],b[2],c1,c2,c3);
-        r[9]=c1;
-        c1=0;
-        mul_add_c(a[7],b[3],c2,c3,c1);
-        mul_add_c(a[6],b[4],c2,c3,c1);
-        mul_add_c(a[5],b[5],c2,c3,c1);
-        mul_add_c(a[4],b[6],c2,c3,c1);
-        mul_add_c(a[3],b[7],c2,c3,c1);
-        r[10]=c2;
-        c2=0;
-        mul_add_c(a[4],b[7],c3,c1,c2);
-        mul_add_c(a[5],b[6],c3,c1,c2);
-        mul_add_c(a[6],b[5],c3,c1,c2);
-        mul_add_c(a[7],b[4],c3,c1,c2);
-        r[11]=c3;
-        c3=0;
-        mul_add_c(a[7],b[5],c1,c2,c3);
-        mul_add_c(a[6],b[6],c1,c2,c3);
-        mul_add_c(a[5],b[7],c1,c2,c3);
-        r[12]=c1;
-        c1=0;
-        mul_add_c(a[6],b[7],c2,c3,c1);
-        mul_add_c(a[7],b[6],c2,c3,c1);
-        r[13]=c2;
-        c2=0;
-        mul_add_c(a[7],b[7],c3,c1,c2);
-        r[14]=c3;
-        r[15]=c1;
-        }
-void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-        {
-        BN_ULONG c1,c2,c3;
-        c1=0;
-        c2=0;
-        c3=0;
-        mul_add_c(a[0],b[0],c1,c2,c3);
-        r[0]=c1;
-        c1=0;
-        mul_add_c(a[0],b[1],c2,c3,c1);
-        mul_add_c(a[1],b[0],c2,c3,c1);
-        r[1]=c2;
-        c2=0;
-        mul_add_c(a[2],b[0],c3,c1,c2);
-        mul_add_c(a[1],b[1],c3,c1,c2);
-        mul_add_c(a[0],b[2],c3,c1,c2);
-        r[2]=c3;
-        c3=0;
-        mul_add_c(a[0],b[3],c1,c2,c3);
-        mul_add_c(a[1],b[2],c1,c2,c3);
-        mul_add_c(a[2],b[1],c1,c2,c3);
-        mul_add_c(a[3],b[0],c1,c2,c3);
-        r[3]=c1;
-        c1=0;
-        mul_add_c(a[3],b[1],c2,c3,c1);
-        mul_add_c(a[2],b[2],c2,c3,c1);
-        mul_add_c(a[1],b[3],c2,c3,c1);
-        r[4]=c2;
-        c2=0;
-        mul_add_c(a[2],b[3],c3,c1,c2);
-        mul_add_c(a[3],b[2],c3,c1,c2);
-        r[5]=c3;
-        c3=0;
-        mul_add_c(a[3],b[3],c1,c2,c3);
-        r[6]=c1;
-        r[7]=c2;
-        }
-void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
-        {
-        BN_ULONG c1,c2,c3;
-        c1=0;
-        c2=0;
-        c3=0;
-        sqr_add_c(a,0,c1,c2,c3);
-        r[0]=c1;
-        c1=0;
-        sqr_add_c2(a,1,0,c2,c3,c1);
-        r[1]=c2;
-        c2=0;
-        sqr_add_c(a,1,c3,c1,c2);
-        sqr_add_c2(a,2,0,c3,c1,c2);
-        r[2]=c3;
-        c3=0;
-        sqr_add_c2(a,3,0,c1,c2,c3);
-        sqr_add_c2(a,2,1,c1,c2,c3);
-        r[3]=c1;
-        c1=0;
-        sqr_add_c(a,2,c2,c3,c1);
-        sqr_add_c2(a,3,1,c2,c3,c1);
-        sqr_add_c2(a,4,0,c2,c3,c1);
-        r[4]=c2;
-        c2=0;
-        sqr_add_c2(a,5,0,c3,c1,c2);
-        sqr_add_c2(a,4,1,c3,c1,c2);
-        sqr_add_c2(a,3,2,c3,c1,c2);
-        r[5]=c3;
-        c3=0;
-        sqr_add_c(a,3,c1,c2,c3);
-        sqr_add_c2(a,4,2,c1,c2,c3);
-        sqr_add_c2(a,5,1,c1,c2,c3);
-        sqr_add_c2(a,6,0,c1,c2,c3);
-        r[6]=c1;
-        c1=0;
-        sqr_add_c2(a,7,0,c2,c3,c1);
-        sqr_add_c2(a,6,1,c2,c3,c1);
-        sqr_add_c2(a,5,2,c2,c3,c1);
-        sqr_add_c2(a,4,3,c2,c3,c1);
-        r[7]=c2;
-        c2=0;
-        sqr_add_c(a,4,c3,c1,c2);
-        sqr_add_c2(a,5,3,c3,c1,c2);
-        sqr_add_c2(a,6,2,c3,c1,c2);
-        sqr_add_c2(a,7,1,c3,c1,c2);
-        r[8]=c3;
-        c3=0;
-        sqr_add_c2(a,7,2,c1,c2,c3);
-        sqr_add_c2(a,6,3,c1,c2,c3);
-        sqr_add_c2(a,5,4,c1,c2,c3);
-        r[9]=c1;
-        c1=0;
-        sqr_add_c(a,5,c2,c3,c1);
-        sqr_add_c2(a,6,4,c2,c3,c1);
-        sqr_add_c2(a,7,3,c2,c3,c1);
-        r[10]=c2;
-        c2=0;
-        sqr_add_c2(a,7,4,c3,c1,c2);
-        sqr_add_c2(a,6,5,c3,c1,c2);
-        r[11]=c3;
-        c3=0;
-        sqr_add_c(a,6,c1,c2,c3);
-        sqr_add_c2(a,7,5,c1,c2,c3);
-        r[12]=c1;
-        c1=0;
-        sqr_add_c2(a,7,6,c2,c3,c1);
-        r[13]=c2;
-        c2=0;
-        sqr_add_c(a,7,c3,c1,c2);
-        r[14]=c3;
-        r[15]=c1;
-        }
-void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
-        {
-        BN_ULONG c1,c2,c3;
-        c1=0;
-        c2=0;
-        c3=0;
-        sqr_add_c(a,0,c1,c2,c3);
-        r[0]=c1;
-        c1=0;
-        sqr_add_c2(a,1,0,c2,c3,c1);
-        r[1]=c2;
-        c2=0;
-        sqr_add_c(a,1,c3,c1,c2);
-        sqr_add_c2(a,2,0,c3,c1,c2);
-        r[2]=c3;
-        c3=0;
-        sqr_add_c2(a,3,0,c1,c2,c3);
-        sqr_add_c2(a,2,1,c1,c2,c3);
-        r[3]=c1;
-        c1=0;
-        sqr_add_c(a,2,c2,c3,c1);
-        sqr_add_c2(a,3,1,c2,c3,c1);
-        r[4]=c2;
-        c2=0;
-        sqr_add_c2(a,3,2,c3,c1,c2);
-        r[5]=c3;
-        c3=0;
-        sqr_add_c(a,3,c1,c2,c3);
-        r[6]=c1;
-        r[7]=c2;
-        }
author	jsing <>	2023-02-11 12:15:02 +0000
committer	jsing <>	2023-02-11 12:15:02 +0000
commit	f3bce1e189bb4d596a7d1feae8b66b6c11c62761 (patch)
tree	9c771840455a831a47e2a3c9c68970885a9a45c3 /src
parent	67afc07de0ed3a0ccc272df42853ba565a8277c6 (diff)
download	openbsd-f3bce1e189bb4d596a7d1feae8b66b6c11c62761.tar.gz openbsd-f3bce1e189bb4d596a7d1feae8b66b6c11c62761.tar.bz2 openbsd-f3bce1e189bb4d596a7d1feae8b66b6c11c62761.zip

diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c deleted file mode 100644 index c6d6239bc2..0000000000 --- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c +++ /dev/null
@@ -1,559 +0,0 @@
1	/* $OpenBSD: x86_64-gcc.c,v 1.8 2023/01/20 17:26:03 jsing Exp $ */
2	#include "../bn_local.h"
3	/*
4	* x86_64 BIGNUM accelerator version 0.1, December 2002.
5	*
6	* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7	* project.
8	*
9	* Rights for redistribution and usage in source and binary forms are
10	* granted according to the OpenSSL license. Warranty of any kind is
11	* disclaimed.
12	*
13	* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
14	* versions, like 1.0...
15	* A. Well, that's because this code is basically a quick-n-dirty
16	* proof-of-concept hack. As you can see it's implemented with
17	* inline assembler, which means that you're bound to GCC and that
18	* there might be enough room for further improvement.
19	*
20	* Q. Why inline assembler?
21	* A. x86_64 features own ABI which I'm not familiar with. This is
22	* why I decided to let the compiler take care of subroutine
23	* prologue/epilogue as well as register allocation. For reference.
24	* Win64 implements different ABI for AMD64, different from Linux.
25	*
26	* Q. How much faster does it get?
27	* A. 'apps/openssl speed rsa dsa' output with no-asm:
28	*
29	* sign verify sign/s verify/s
30	* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
31	* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
32	* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
33	* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
34	* sign verify sign/s verify/s
35	* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
36	* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
37	* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
38	*
39	* 'apps/openssl speed rsa dsa' output with this module:
40	*
41	* sign verify sign/s verify/s
42	* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
43	* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
44	* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
45	* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
46	* sign verify sign/s verify/s
47	* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
48	* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
49	* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
50	*
51	* For the reference. IA-32 assembler implementation performs
52	* very much like 64-bit code compiled with no-asm on the same
53	* machine.
54	*/
55
56	#define BN_ULONG unsigned long
57
58	#undef mul
59	#undef mul_add
60	#undef sqr
61
62	/*
63	* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
64	* "g"(0) let the compiler to decide where does it
65	* want to keep the value of zero;
66	*/
67	#define mul_add(r,a,word,carry) do { \
68	BN_ULONG high,low; \
69	asm ("mulq %3" \
70	: "=a"(low),"=d"(high) \
71	: "a"(word),"m"(a) \
72	: "cc"); \
73	asm ("addq %2,%0; adcq %3,%1" \
74	: "+r"(carry),"+d"(high)\
75	: "a"(low),"g"(0) \
76	: "cc"); \
77	asm ("addq %2,%0; adcq %3,%1" \
78	: "+m"(r),"+d"(high) \
79	: "r"(carry),"g"(0) \
80	: "cc"); \
81	carry=high; \
82	} while (0)
83
84	#define mul(r,a,word,carry) do { \
85	BN_ULONG high,low; \
86	asm ("mulq %3" \
87	: "=a"(low),"=d"(high) \
88	: "a"(word),"g"(a) \
89	: "cc"); \
90	asm ("addq %2,%0; adcq %3,%1" \
91	: "+r"(carry),"+d"(high)\
92	: "a"(low),"g"(0) \
93	: "cc"); \
94	(r)=carry, carry=high; \
95	} while (0)
96
97	#define sqr(r0,r1,a) \
98	asm ("mulq %2" \
99	: "=a"(r0),"=d"(r1) \
100	: "a"(a) \
101	: "cc");
102
103	BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
104	{
105	BN_ULONG c1=0;
106
107	if (num <= 0) return(c1);
108
109	while (num&~3)
110	{
111	mul_add(rp[0],ap[0],w,c1);
112	mul_add(rp[1],ap[1],w,c1);
113	mul_add(rp[2],ap[2],w,c1);
114	mul_add(rp[3],ap[3],w,c1);
115	ap+=4; rp+=4; num-=4;
116	}
117	if (num)
118	{
119	mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
120	mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
121	mul_add(rp[2],ap[2],w,c1); return c1;
122	}
123
124	return(c1);
125	}
126
127	BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
128	{
129	BN_ULONG c1=0;
130
131	if (num <= 0) return(c1);
132
133	while (num&~3)
134	{
135	mul(rp[0],ap[0],w,c1);
136	mul(rp[1],ap[1],w,c1);
137	mul(rp[2],ap[2],w,c1);
138	mul(rp[3],ap[3],w,c1);
139	ap+=4; rp+=4; num-=4;
140	}
141	if (num)
142	{
143	mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
144	mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
145	mul(rp[2],ap[2],w,c1);
146	}
147	return(c1);
148	}
149
150	void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
151	{
152	if (n <= 0) return;
153
154	while (n&~3)
155	{
156	sqr(r[0],r[1],a[0]);
157	sqr(r[2],r[3],a[1]);
158	sqr(r[4],r[5],a[2]);
159	sqr(r[6],r[7],a[3]);
160	a+=4; r+=8; n-=4;
161	}
162	if (n)
163	{
164	sqr(r[0],r[1],a[0]); if (--n == 0) return;
165	sqr(r[2],r[3],a[1]); if (--n == 0) return;
166	sqr(r[4],r[5],a[2]);
167	}
168	}
169
170	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
171	{ BN_ULONG ret,waste;
172
173	asm ("divq %4"
174	: "=a"(ret),"=d"(waste)
175	: "a"(l),"d"(h),"g"(d)
176	: "cc");
177
178	return ret;
179	}
180
181	BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
182	{ BN_ULONG ret=0,i=0;
183
184	if (n <= 0) return 0;
185
186	asm (
187	" subq %2,%2 \n"
188	".p2align 4 \n"
189	"1: movq (%4,%2,8),%0 \n"
190	" adcq (%5,%2,8),%0 \n"
191	" movq %0,(%3,%2,8) \n"
192	" leaq 1(%2),%2 \n"
193	" loop 1b \n"
194	" sbbq %0,%0 \n"
195	: "=&a"(ret),"+c"(n),"=&r"(i)
196	: "r"(rp),"r"(ap),"r"(bp)
197	: "cc"
198	);
199
200	return ret&1;
201	}
202
203	BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
204	{ BN_ULONG ret=0,i=0;
205
206	if (n <= 0) return 0;
207
208	asm (
209	" subq %2,%2 \n"
210	".p2align 4 \n"
211	"1: movq (%4,%2,8),%0 \n"
212	" sbbq (%5,%2,8),%0 \n"
213	" movq %0,(%3,%2,8) \n"
214	" leaq 1(%2),%2 \n"
215	" loop 1b \n"
216	" sbbq %0,%0 \n"
217	: "=&a"(ret),"+c"(n),"=&r"(i)
218	: "r"(rp),"r"(ap),"r"(bp)
219	: "cc"
220	);
221
222	return ret&1;
223	}
224
225	/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
226	/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
227	/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
228	/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
229
230	#undef mul_add_c
231	#undef mul_add_c2
232	#undef sqr_add_c
233	#undef sqr_add_c2
234
235	/*
236	* Keep in mind that carrying into high part of multiplication result
237	* can not overflow, because it cannot be all-ones.
238	*/
239	#if 0
240	/* original macros are kept for reference purposes */
241	#define mul_add_c(a,b,c0,c1,c2) do { \
242	BN_ULONG ta = (a), tb = (b); \
243	BN_ULONG lo, hi; \
244	BN_UMULT_LOHI(lo,hi,ta,tb); \
245	c0 += lo; hi += (c0<lo)?1:0; \
246	c1 += hi; c2 += (c1<hi)?1:0; \
247	} while(0)
248
249	#define mul_add_c2(a,b,c0,c1,c2) do { \
250	BN_ULONG ta = (a), tb = (b); \
251	BN_ULONG lo, hi, tt; \
252	BN_UMULT_LOHI(lo,hi,ta,tb); \
253	c0 += lo; tt = hi+((c0<lo)?1:0); \
254	c1 += tt; c2 += (c1<tt)?1:0; \
255	c0 += lo; hi += (c0<lo)?1:0; \
256	c1 += hi; c2 += (c1<hi)?1:0; \
257	} while(0)
258
259	#define sqr_add_c(a,i,c0,c1,c2) do { \
260	BN_ULONG ta = (a)[i]; \
261	BN_ULONG lo, hi; \
262	BN_UMULT_LOHI(lo,hi,ta,ta); \
263	c0 += lo; hi += (c0<lo)?1:0; \
264	c1 += hi; c2 += (c1<hi)?1:0; \
265	} while(0)
266	#else
267	#define mul_add_c(a,b,c0,c1,c2) do { \
268	BN_ULONG t1,t2; \
269	asm ("mulq %3" \
270	: "=a"(t1),"=d"(t2) \
271	: "a"(a),"m"(b) \
272	: "cc"); \
273	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
274	: "+r"(c0),"+r"(c1),"+r"(c2) \
275	: "r"(t1),"r"(t2),"g"(0) \
276	: "cc"); \
277	} while (0)
278
279	#define sqr_add_c(a,i,c0,c1,c2) do { \
280	BN_ULONG t1,t2; \
281	asm ("mulq %2" \
282	: "=a"(t1),"=d"(t2) \
283	: "a"(a[i]) \
284	: "cc"); \
285	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
286	: "+r"(c0),"+r"(c1),"+r"(c2) \
287	: "r"(t1),"r"(t2),"g"(0) \
288	: "cc"); \
289	} while (0)
290
291	#define mul_add_c2(a,b,c0,c1,c2) do { \
292	BN_ULONG t1,t2; \
293	asm ("mulq %3" \
294	: "=a"(t1),"=d"(t2) \
295	: "a"(a),"m"(b) \
296	: "cc"); \
297	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
298	: "+r"(c0),"+r"(c1),"+r"(c2) \
299	: "r"(t1),"r"(t2),"g"(0) \
300	: "cc"); \
301	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
302	: "+r"(c0),"+r"(c1),"+r"(c2) \
303	: "r"(t1),"r"(t2),"g"(0) \
304	: "cc"); \
305	} while (0)
306	#endif
307
308	#define sqr_add_c2(a,i,j,c0,c1,c2) \
309	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
310
311	void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
312	{
313	BN_ULONG c1,c2,c3;
314
315	c1=0;
316	c2=0;
317	c3=0;
318	mul_add_c(a[0],b[0],c1,c2,c3);
319	r[0]=c1;
320	c1=0;
321	mul_add_c(a[0],b[1],c2,c3,c1);
322	mul_add_c(a[1],b[0],c2,c3,c1);
323	r[1]=c2;
324	c2=0;
325	mul_add_c(a[2],b[0],c3,c1,c2);
326	mul_add_c(a[1],b[1],c3,c1,c2);
327	mul_add_c(a[0],b[2],c3,c1,c2);
328	r[2]=c3;
329	c3=0;
330	mul_add_c(a[0],b[3],c1,c2,c3);
331	mul_add_c(a[1],b[2],c1,c2,c3);
332	mul_add_c(a[2],b[1],c1,c2,c3);
333	mul_add_c(a[3],b[0],c1,c2,c3);
334	r[3]=c1;
335	c1=0;
336	mul_add_c(a[4],b[0],c2,c3,c1);
337	mul_add_c(a[3],b[1],c2,c3,c1);
338	mul_add_c(a[2],b[2],c2,c3,c1);
339	mul_add_c(a[1],b[3],c2,c3,c1);
340	mul_add_c(a[0],b[4],c2,c3,c1);
341	r[4]=c2;
342	c2=0;
343	mul_add_c(a[0],b[5],c3,c1,c2);
344	mul_add_c(a[1],b[4],c3,c1,c2);
345	mul_add_c(a[2],b[3],c3,c1,c2);
346	mul_add_c(a[3],b[2],c3,c1,c2);
347	mul_add_c(a[4],b[1],c3,c1,c2);
348	mul_add_c(a[5],b[0],c3,c1,c2);
349	r[5]=c3;
350	c3=0;
351	mul_add_c(a[6],b[0],c1,c2,c3);
352	mul_add_c(a[5],b[1],c1,c2,c3);
353	mul_add_c(a[4],b[2],c1,c2,c3);
354	mul_add_c(a[3],b[3],c1,c2,c3);
355	mul_add_c(a[2],b[4],c1,c2,c3);
356	mul_add_c(a[1],b[5],c1,c2,c3);
357	mul_add_c(a[0],b[6],c1,c2,c3);
358	r[6]=c1;
359	c1=0;
360	mul_add_c(a[0],b[7],c2,c3,c1);
361	mul_add_c(a[1],b[6],c2,c3,c1);
362	mul_add_c(a[2],b[5],c2,c3,c1);
363	mul_add_c(a[3],b[4],c2,c3,c1);
364	mul_add_c(a[4],b[3],c2,c3,c1);
365	mul_add_c(a[5],b[2],c2,c3,c1);
366	mul_add_c(a[6],b[1],c2,c3,c1);
367	mul_add_c(a[7],b[0],c2,c3,c1);
368	r[7]=c2;
369	c2=0;
370	mul_add_c(a[7],b[1],c3,c1,c2);
371	mul_add_c(a[6],b[2],c3,c1,c2);
372	mul_add_c(a[5],b[3],c3,c1,c2);
373	mul_add_c(a[4],b[4],c3,c1,c2);
374	mul_add_c(a[3],b[5],c3,c1,c2);
375	mul_add_c(a[2],b[6],c3,c1,c2);
376	mul_add_c(a[1],b[7],c3,c1,c2);
377	r[8]=c3;
378	c3=0;
379	mul_add_c(a[2],b[7],c1,c2,c3);
380	mul_add_c(a[3],b[6],c1,c2,c3);
381	mul_add_c(a[4],b[5],c1,c2,c3);
382	mul_add_c(a[5],b[4],c1,c2,c3);
383	mul_add_c(a[6],b[3],c1,c2,c3);
384	mul_add_c(a[7],b[2],c1,c2,c3);
385	r[9]=c1;
386	c1=0;
387	mul_add_c(a[7],b[3],c2,c3,c1);
388	mul_add_c(a[6],b[4],c2,c3,c1);
389	mul_add_c(a[5],b[5],c2,c3,c1);
390	mul_add_c(a[4],b[6],c2,c3,c1);
391	mul_add_c(a[3],b[7],c2,c3,c1);
392	r[10]=c2;
393	c2=0;
394	mul_add_c(a[4],b[7],c3,c1,c2);
395	mul_add_c(a[5],b[6],c3,c1,c2);
396	mul_add_c(a[6],b[5],c3,c1,c2);
397	mul_add_c(a[7],b[4],c3,c1,c2);
398	r[11]=c3;
399	c3=0;
400	mul_add_c(a[7],b[5],c1,c2,c3);
401	mul_add_c(a[6],b[6],c1,c2,c3);
402	mul_add_c(a[5],b[7],c1,c2,c3);
403	r[12]=c1;
404	c1=0;
405	mul_add_c(a[6],b[7],c2,c3,c1);
406	mul_add_c(a[7],b[6],c2,c3,c1);
407	r[13]=c2;
408	c2=0;
409	mul_add_c(a[7],b[7],c3,c1,c2);
410	r[14]=c3;
411	r[15]=c1;
412	}
413
414	void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
415	{
416	BN_ULONG c1,c2,c3;
417
418	c1=0;
419	c2=0;
420	c3=0;
421	mul_add_c(a[0],b[0],c1,c2,c3);
422	r[0]=c1;
423	c1=0;
424	mul_add_c(a[0],b[1],c2,c3,c1);
425	mul_add_c(a[1],b[0],c2,c3,c1);
426	r[1]=c2;
427	c2=0;
428	mul_add_c(a[2],b[0],c3,c1,c2);
429	mul_add_c(a[1],b[1],c3,c1,c2);
430	mul_add_c(a[0],b[2],c3,c1,c2);
431	r[2]=c3;
432	c3=0;
433	mul_add_c(a[0],b[3],c1,c2,c3);
434	mul_add_c(a[1],b[2],c1,c2,c3);
435	mul_add_c(a[2],b[1],c1,c2,c3);
436	mul_add_c(a[3],b[0],c1,c2,c3);
437	r[3]=c1;
438	c1=0;
439	mul_add_c(a[3],b[1],c2,c3,c1);
440	mul_add_c(a[2],b[2],c2,c3,c1);
441	mul_add_c(a[1],b[3],c2,c3,c1);
442	r[4]=c2;
443	c2=0;
444	mul_add_c(a[2],b[3],c3,c1,c2);
445	mul_add_c(a[3],b[2],c3,c1,c2);
446	r[5]=c3;
447	c3=0;
448	mul_add_c(a[3],b[3],c1,c2,c3);
449	r[6]=c1;
450	r[7]=c2;
451	}
452
453	void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
454	{
455	BN_ULONG c1,c2,c3;
456
457	c1=0;
458	c2=0;
459	c3=0;
460	sqr_add_c(a,0,c1,c2,c3);
461	r[0]=c1;
462	c1=0;
463	sqr_add_c2(a,1,0,c2,c3,c1);
464	r[1]=c2;
465	c2=0;
466	sqr_add_c(a,1,c3,c1,c2);
467	sqr_add_c2(a,2,0,c3,c1,c2);
468	r[2]=c3;
469	c3=0;
470	sqr_add_c2(a,3,0,c1,c2,c3);
471	sqr_add_c2(a,2,1,c1,c2,c3);
472	r[3]=c1;
473	c1=0;
474	sqr_add_c(a,2,c2,c3,c1);
475	sqr_add_c2(a,3,1,c2,c3,c1);
476	sqr_add_c2(a,4,0,c2,c3,c1);
477	r[4]=c2;
478	c2=0;
479	sqr_add_c2(a,5,0,c3,c1,c2);
480	sqr_add_c2(a,4,1,c3,c1,c2);
481	sqr_add_c2(a,3,2,c3,c1,c2);
482	r[5]=c3;
483	c3=0;
484	sqr_add_c(a,3,c1,c2,c3);
485	sqr_add_c2(a,4,2,c1,c2,c3);
486	sqr_add_c2(a,5,1,c1,c2,c3);
487	sqr_add_c2(a,6,0,c1,c2,c3);
488	r[6]=c1;
489	c1=0;
490	sqr_add_c2(a,7,0,c2,c3,c1);
491	sqr_add_c2(a,6,1,c2,c3,c1);
492	sqr_add_c2(a,5,2,c2,c3,c1);
493	sqr_add_c2(a,4,3,c2,c3,c1);
494	r[7]=c2;
495	c2=0;
496	sqr_add_c(a,4,c3,c1,c2);
497	sqr_add_c2(a,5,3,c3,c1,c2);
498	sqr_add_c2(a,6,2,c3,c1,c2);
499	sqr_add_c2(a,7,1,c3,c1,c2);
500	r[8]=c3;
501	c3=0;
502	sqr_add_c2(a,7,2,c1,c2,c3);
503	sqr_add_c2(a,6,3,c1,c2,c3);
504	sqr_add_c2(a,5,4,c1,c2,c3);
505	r[9]=c1;
506	c1=0;
507	sqr_add_c(a,5,c2,c3,c1);
508	sqr_add_c2(a,6,4,c2,c3,c1);
509	sqr_add_c2(a,7,3,c2,c3,c1);
510	r[10]=c2;
511	c2=0;
512	sqr_add_c2(a,7,4,c3,c1,c2);
513	sqr_add_c2(a,6,5,c3,c1,c2);
514	r[11]=c3;
515	c3=0;
516	sqr_add_c(a,6,c1,c2,c3);
517	sqr_add_c2(a,7,5,c1,c2,c3);
518	r[12]=c1;
519	c1=0;
520	sqr_add_c2(a,7,6,c2,c3,c1);
521	r[13]=c2;
522	c2=0;
523	sqr_add_c(a,7,c3,c1,c2);
524	r[14]=c3;
525	r[15]=c1;
526	}
527
528	void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
529	{
530	BN_ULONG c1,c2,c3;
531
532	c1=0;
533	c2=0;
534	c3=0;
535	sqr_add_c(a,0,c1,c2,c3);
536	r[0]=c1;
537	c1=0;
538	sqr_add_c2(a,1,0,c2,c3,c1);
539	r[1]=c2;
540	c2=0;
541	sqr_add_c(a,1,c3,c1,c2);
542	sqr_add_c2(a,2,0,c3,c1,c2);
543	r[2]=c3;
544	c3=0;
545	sqr_add_c2(a,3,0,c1,c2,c3);
546	sqr_add_c2(a,2,1,c1,c2,c3);
547	r[3]=c1;
548	c1=0;
549	sqr_add_c(a,2,c2,c3,c1);
550	sqr_add_c2(a,3,1,c2,c3,c1);
551	r[4]=c2;
552	c2=0;
553	sqr_add_c2(a,3,2,c3,c1,c2);
554	r[5]=c3;
555	c3=0;
556	sqr_add_c(a,3,c1,c2,c3);
557	r[6]=c1;
558	r[7]=c2;
559	}