summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/bn/asm/x86_64-gcc.c
diff options
context:
space:
mode:
authorcvs2svn <admin@example.com>2015-08-02 21:54:22 +0000
committercvs2svn <admin@example.com>2015-08-02 21:54:22 +0000
commited3760bf4be4a96a89233fb8f8b84a0d44725862 (patch)
tree5609c82060f75c53af0a7641d9b33a88574876cd /src/lib/libcrypto/bn/asm/x86_64-gcc.c
parentf8b563fb5ba1524c821d37308f4e6abfc866bc3f (diff)
downloadopenbsd-OPENBSD_5_8_BASE.tar.gz
openbsd-OPENBSD_5_8_BASE.tar.bz2
openbsd-OPENBSD_5_8_BASE.zip
This commit was manufactured by cvs2git to create tag 'OPENBSD_5_8_BASE'.OPENBSD_5_8_BASE
Diffstat (limited to 'src/lib/libcrypto/bn/asm/x86_64-gcc.c')
-rw-r--r--src/lib/libcrypto/bn/asm/x86_64-gcc.c598
1 files changed, 0 insertions, 598 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
deleted file mode 100644
index 9deffa71f1..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ /dev/null
@@ -1,598 +0,0 @@
1/* $OpenBSD: x86_64-gcc.c,v 1.5 2015/02/25 15:39:49 bcook Exp $ */
2#include "../bn_lcl.h"
3#if !(defined(__GNUC__) && __GNUC__>=2)
4# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
5#else
6/*
7 * x86_64 BIGNUM accelerator version 0.1, December 2002.
8 *
9 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
10 * project.
11 *
12 * Rights for redistribution and usage in source and binary forms are
13 * granted according to the OpenSSL license. Warranty of any kind is
14 * disclaimed.
15 *
16 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
17 * versions, like 1.0...
18 * A. Well, that's because this code is basically a quick-n-dirty
19 * proof-of-concept hack. As you can see it's implemented with
20 * inline assembler, which means that you're bound to GCC and that
21 * there might be enough room for further improvement.
22 *
23 * Q. Why inline assembler?
24 * A. x86_64 features own ABI which I'm not familiar with. This is
25 * why I decided to let the compiler take care of subroutine
26 * prologue/epilogue as well as register allocation. For reference.
27 * Win64 implements different ABI for AMD64, different from Linux.
28 *
29 * Q. How much faster does it get?
30 * A. 'apps/openssl speed rsa dsa' output with no-asm:
31 *
32 * sign verify sign/s verify/s
33 * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
34 * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
35 * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
36 * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
37 * sign verify sign/s verify/s
38 * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
39 * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
40 * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
41 *
42 * 'apps/openssl speed rsa dsa' output with this module:
43 *
44 * sign verify sign/s verify/s
45 * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
46 * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
47 * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
48 * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
49 * sign verify sign/s verify/s
50 * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
51 * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
52 * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
53 *
54 * For the reference. IA-32 assembler implementation performs
55 * very much like 64-bit code compiled with no-asm on the same
56 * machine.
57 */
58
59#define BN_ULONG unsigned long
60
61#undef mul
62#undef mul_add
63#undef sqr
64
65/*
66 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
67 * "g"(0) let the compiler to decide where does it
68 * want to keep the value of zero;
69 */
/*
 * mul_add(r, a, word, carry): r += a * word + carry; "carry" receives
 * the resulting high word.  Implemented as three asm statements:
 *   1. MULQ: (high:low) = a * word (rdx:rax).
 *   2. carry += low; high += CF  — folds the incoming carry.
 *   3. r += carry; high += CF    — accumulates into the memory word r.
 * Finally carry = high.  The "m"(a)/"+m"(r) constraints are chosen to
 * favor DirectPath micro-code (see note above); r and carry must be
 * lvalues of type BN_ULONG.
 */
70#define mul_add(r,a,word,carry) do { \
71 BN_ULONG high,low; \
72 asm ("mulq %3" \
73 : "=a"(low),"=d"(high) \
74 : "a"(word),"m"(a) \
75 : "cc"); \
76 asm ("addq %2,%0; adcq %3,%1" \
77 : "+r"(carry),"+d"(high)\
78 : "a"(low),"g"(0) \
79 : "cc"); \
80 asm ("addq %2,%0; adcq %3,%1" \
81 : "+m"(r),"+d"(high) \
82 : "r"(carry),"g"(0) \
83 : "cc"); \
84 carry=high; \
85 } while (0)
86
/*
 * mul(r, a, word, carry): r = low64(a * word + carry); carry = high
 * word of the sum.  Same structure as mul_add but overwrites r instead
 * of accumulating into it, so only two asm statements are needed.
 */
87#define mul(r,a,word,carry) do { \
88 BN_ULONG high,low; \
89 asm ("mulq %3" \
90 : "=a"(low),"=d"(high) \
91 : "a"(word),"g"(a) \
92 : "cc"); \
93 asm ("addq %2,%0; adcq %3,%1" \
94 : "+r"(carry),"+d"(high)\
95 : "a"(low),"g"(0) \
96 : "cc"); \
97 (r)=carry, carry=high; \
98 } while (0)
99
100#define sqr(r0,r1,a) \
101 asm ("mulq %2" \
102 : "=a"(r0),"=d"(r1) \
103 : "a"(a) \
104 : "cc");
105
/*
 * bn_mul_add_words: rp[i] += ap[i] * w for i in [0, num); returns the
 * final carry word.  The main loop is unrolled four ways; the tail
 * handles the remaining 1-3 words.  num <= 0 returns 0 immediately.
 * The carry c1 is threaded through every mul_add, so statement order
 * is significant.
 */
106BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
107 {
108 BN_ULONG c1=0;
109
110 if (num <= 0) return(c1);
111
112 while (num&~3)
113 {
114 mul_add(rp[0],ap[0],w,c1);
115 mul_add(rp[1],ap[1],w,c1);
116 mul_add(rp[2],ap[2],w,c1);
117 mul_add(rp[3],ap[3],w,c1);
118 ap+=4; rp+=4; num-=4;
119 }
120 if (num)
121 {
122 mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
123 mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
124 mul_add(rp[2],ap[2],w,c1); return c1;
125 }
126
127 return(c1);
128 }
129
/*
 * bn_mul_words: rp[i] = low word of ap[i] * w plus the running carry;
 * returns the final carry word.  Same 4-way unrolled structure as
 * bn_mul_add_words, but rp[] is overwritten (mul) rather than
 * accumulated into (mul_add).  num <= 0 returns 0.
 */
130BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
131 {
132 BN_ULONG c1=0;
133
134 if (num <= 0) return(c1);
135
136 while (num&~3)
137 {
138 mul(rp[0],ap[0],w,c1);
139 mul(rp[1],ap[1],w,c1);
140 mul(rp[2],ap[2],w,c1);
141 mul(rp[3],ap[3],w,c1);
142 ap+=4; rp+=4; num-=4;
143 }
144 if (num)
145 {
146 mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
147 mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
148 mul(rp[2],ap[2],w,c1);
149 }
150 return(c1);
151 }
152
/*
 * bn_sqr_words: word-wise squaring — for each input word a[i], stores
 * the 128-bit square into the result pair r[2i] (low) and r[2i+1]
 * (high).  There is no carry between words; r must have room for 2*n
 * words.  The loop is unrolled four input words (eight output words)
 * per iteration.
 */
153void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
154 {
155 if (n <= 0) return;
156
157 while (n&~3)
158 {
159 sqr(r[0],r[1],a[0]);
160 sqr(r[2],r[3],a[1]);
161 sqr(r[4],r[5],a[2]);
162 sqr(r[6],r[7],a[3]);
163 a+=4; r+=8; n-=4;
164 }
165 if (n)
166 {
167 sqr(r[0],r[1],a[0]); if (--n == 0) return;
168 sqr(r[2],r[3],a[1]); if (--n == 0) return;
169 sqr(r[4],r[5],a[2]);
170 }
171 }
172
/*
 * bn_div_words: 128-by-64-bit division — returns the quotient of the
 * double word (h:l) divided by d, using DIVQ (rdx:rax / operand).
 * The remainder lands in rdx and is discarded ("waste").
 * NOTE(review): DIVQ raises #DE if d == 0 or if the quotient does not
 * fit in 64 bits (i.e. when h >= d) — callers must guarantee h < d.
 */
173BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
174{ BN_ULONG ret,waste;
175
176 asm ("divq %4"
177 : "=a"(ret),"=d"(waste)
178 : "a"(l),"d"(h),"g"(d)
179 : "cc");
180
181 return ret;
182}
183
/*
 * bn_add_words: rp = ap + bp over n words; returns the final carry
 * (0 or 1).  The asm loop keeps the word index in %2 and propagates
 * the CPU carry flag across iterations via ADCQ:
 *   - "subq %2,%2" zeroes the index AND clears CF before the loop;
 *   - "leaq 1(%2),%2" increments the index without touching flags
 *     (ADDQ/INCQ would clobber/preserve CF incorrectly);
 *   - LOOP decrements rcx (bound to n via "+c") without touching CF;
 *   - the final SBBQ materializes CF as all-ones/all-zeros in ret,
 *     masked to bit 0 on return.
 * n <= 0 returns 0 without entering the asm (LOOP would otherwise
 * iterate on a decremented counter).
 */
184BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
185{ BN_ULONG ret=0,i=0;
186
187 if (n <= 0) return 0;
188
189 asm (
190 " subq %2,%2 \n"
191 ".p2align 4 \n"
192 "1: movq (%4,%2,8),%0 \n"
193 " adcq (%5,%2,8),%0 \n"
194 " movq %0,(%3,%2,8) \n"
195 " leaq 1(%2),%2 \n"
196 " loop 1b \n"
197 " sbbq %0,%0 \n"
198 : "=&a"(ret),"+c"(n),"=&r"(i)
199 : "r"(rp),"r"(ap),"r"(bp)
200 : "cc"
201 );
202
203 return ret&1;
204}
205
206#ifndef SIMICS
/*
 * bn_sub_words: rp = ap - bp over n words; returns the final borrow
 * (0 or 1).  Mirror image of bn_add_words with SBBQ in place of ADCQ:
 * "subq %2,%2" clears the index and the carry/borrow flag, LEAQ
 * advances the index without disturbing CF, LOOP drives the count in
 * rcx, and the trailing SBBQ turns the final borrow into ret (masked
 * to bit 0).  n <= 0 returns 0 before entering the asm.
 */
207BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
208{ BN_ULONG ret=0,i=0;
209
210 if (n <= 0) return 0;
211
212 asm (
213 " subq %2,%2 \n"
214 ".p2align 4 \n"
215 "1: movq (%4,%2,8),%0 \n"
216 " sbbq (%5,%2,8),%0 \n"
217 " movq %0,(%3,%2,8) \n"
218 " leaq 1(%2),%2 \n"
219 " loop 1b \n"
220 " sbbq %0,%0 \n"
221 : "=&a"(ret),"+c"(n),"=&r"(i)
222 : "r"(rp),"r"(ap),"r"(bp)
223 : "cc"
224 );
225
226 return ret&1;
227}
228#else
229/* Simics 1.4<7 has buggy sbbq:-( */
230#define BN_MASK2 0xffffffffffffffffL
231BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
232 {
233 BN_ULONG t1,t2;
234 int c=0;
235
236 if (n <= 0) return((BN_ULONG)0);
237
238 for (;;)
239 {
240 t1=a[0]; t2=b[0];
241 r[0]=(t1-t2-c)&BN_MASK2;
242 if (t1 != t2) c=(t1 < t2);
243 if (--n <= 0) break;
244
245 t1=a[1]; t2=b[1];
246 r[1]=(t1-t2-c)&BN_MASK2;
247 if (t1 != t2) c=(t1 < t2);
248 if (--n <= 0) break;
249
250 t1=a[2]; t2=b[2];
251 r[2]=(t1-t2-c)&BN_MASK2;
252 if (t1 != t2) c=(t1 < t2);
253 if (--n <= 0) break;
254
255 t1=a[3]; t2=b[3];
256 r[3]=(t1-t2-c)&BN_MASK2;
257 if (t1 != t2) c=(t1 < t2);
258 if (--n <= 0) break;
259
260 a+=4;
261 b+=4;
262 r+=4;
263 }
264 return(c);
265 }
266#endif
267
268/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
269/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
270/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
271/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
272
273/*
274 * Keep in mind that a carry into the high part of a multiplication
275 * result cannot overflow, because the high part can never be all-ones.
276 */
277#if 0
278/* original macros are kept for reference purposes */
279#define mul_add_c(a,b,c0,c1,c2) do { \
280 BN_ULONG ta = (a), tb = (b); \
281 BN_ULONG lo, hi; \
282 BN_UMULT_LOHI(lo,hi,ta,tb); \
283 c0 += lo; hi += (c0<lo)?1:0; \
284 c1 += hi; c2 += (c1<hi)?1:0; \
285 } while(0)
286
287#define mul_add_c2(a,b,c0,c1,c2) do { \
288 BN_ULONG ta = (a), tb = (b); \
289 BN_ULONG lo, hi, tt; \
290 BN_UMULT_LOHI(lo,hi,ta,tb); \
291 c0 += lo; tt = hi+((c0<lo)?1:0); \
292 c1 += tt; c2 += (c1<tt)?1:0; \
293 c0 += lo; hi += (c0<lo)?1:0; \
294 c1 += hi; c2 += (c1<hi)?1:0; \
295 } while(0)
296
297#define sqr_add_c(a,i,c0,c1,c2) do { \
298 BN_ULONG ta = (a)[i]; \
299 BN_ULONG lo, hi; \
300 BN_UMULT_LOHI(lo,hi,ta,ta); \
301 c0 += lo; hi += (c0<lo)?1:0; \
302 c1 += hi; c2 += (c1<hi)?1:0; \
303 } while(0)
304#else
/*
 * mul_add_c(a, b, c0, c1, c2): (c2:c1:c0) += a * b.
 * MULQ forms the 128-bit product (t2:t1), then a single three-word
 * ADDQ/ADCQ/ADCQ chain folds it into the accumulator with full carry
 * propagation.
 */
305#define mul_add_c(a,b,c0,c1,c2) do { \
306 BN_ULONG t1,t2; \
307 asm ("mulq %3" \
308 : "=a"(t1),"=d"(t2) \
309 : "a"(a),"m"(b) \
310 : "cc"); \
311 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
312 : "+r"(c0),"+r"(c1),"+r"(c2) \
313 : "r"(t1),"r"(t2),"g"(0) \
314 : "cc"); \
315 } while (0)
316
/*
 * sqr_add_c(a, i, c0, c1, c2): (c2:c1:c0) += a[i]^2.
 * Same accumulation as mul_add_c, with MULQ squaring rax (a[i] is both
 * the implicit rax operand and %2).
 */
317#define sqr_add_c(a,i,c0,c1,c2) do { \
318 BN_ULONG t1,t2; \
319 asm ("mulq %2" \
320 : "=a"(t1),"=d"(t2) \
321 : "a"(a[i]) \
322 : "cc"); \
323 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
324 : "+r"(c0),"+r"(c1),"+r"(c2) \
325 : "r"(t1),"r"(t2),"g"(0) \
326 : "cc"); \
327 } while (0)
328
/*
 * mul_add_c2(a, b, c0, c1, c2): (c2:c1:c0) += 2 * a * b.
 * The product (t2:t1) is added into the three-word accumulator twice,
 * each time with a full ADDQ/ADCQ/ADCQ carry chain.  This is the
 * corrected form (cf. CVE-2014-3570): doubling by two independent
 * full-width additions cannot lose a carry, unlike shifting the
 * product before adding.
 */
329#define mul_add_c2(a,b,c0,c1,c2) do { \
330 BN_ULONG t1,t2; \
331 asm ("mulq %3" \
332 : "=a"(t1),"=d"(t2) \
333 : "a"(a),"m"(b) \
334 : "cc"); \
335 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
336 : "+r"(c0),"+r"(c1),"+r"(c2) \
337 : "r"(t1),"r"(t2),"g"(0) \
338 : "cc"); \
339 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
340 : "+r"(c0),"+r"(c1),"+r"(c2) \
341 : "r"(t1),"r"(t2),"g"(0) \
342 : "cc"); \
343 } while (0)
344#endif
345
/*
 * sqr_add_c2(a, i, j, c0, c1, c2): (c2:c1:c0) += 2 * a[i] * a[j] —
 * the doubled cross-term of a squaring, delegated to mul_add_c2.
 */
346#define sqr_add_c2(a,i,j,c0,c1,c2) \
347 mul_add_c2((a)[i],(a)[j],c0,c1,c2)
348
/*
 * bn_mul_comba8: comba (column-wise schoolbook) multiplication of two
 * 8-word numbers, r = a * b, producing 16 result words.  For each
 * output column k, every partial product a[i]*b[j] with i+j == k is
 * accumulated into a three-word accumulator; the roles of c1/c2/c3
 * rotate from column to column (low word is stored to r[k] and then
 * reset to 0 to become the next high-order limb), so the exact
 * statement order is significant and must not be rearranged.
 */
349void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
350 {
351 BN_ULONG c1,c2,c3;
352
353 c1=0;
354 c2=0;
355 c3=0;
356 mul_add_c(a[0],b[0],c1,c2,c3);
357 r[0]=c1;
358 c1=0;
359 mul_add_c(a[0],b[1],c2,c3,c1);
360 mul_add_c(a[1],b[0],c2,c3,c1);
361 r[1]=c2;
362 c2=0;
363 mul_add_c(a[2],b[0],c3,c1,c2);
364 mul_add_c(a[1],b[1],c3,c1,c2);
365 mul_add_c(a[0],b[2],c3,c1,c2);
366 r[2]=c3;
367 c3=0;
368 mul_add_c(a[0],b[3],c1,c2,c3);
369 mul_add_c(a[1],b[2],c1,c2,c3);
370 mul_add_c(a[2],b[1],c1,c2,c3);
371 mul_add_c(a[3],b[0],c1,c2,c3);
372 r[3]=c1;
373 c1=0;
374 mul_add_c(a[4],b[0],c2,c3,c1);
375 mul_add_c(a[3],b[1],c2,c3,c1);
376 mul_add_c(a[2],b[2],c2,c3,c1);
377 mul_add_c(a[1],b[3],c2,c3,c1);
378 mul_add_c(a[0],b[4],c2,c3,c1);
379 r[4]=c2;
380 c2=0;
381 mul_add_c(a[0],b[5],c3,c1,c2);
382 mul_add_c(a[1],b[4],c3,c1,c2);
383 mul_add_c(a[2],b[3],c3,c1,c2);
384 mul_add_c(a[3],b[2],c3,c1,c2);
385 mul_add_c(a[4],b[1],c3,c1,c2);
386 mul_add_c(a[5],b[0],c3,c1,c2);
387 r[5]=c3;
388 c3=0;
389 mul_add_c(a[6],b[0],c1,c2,c3);
390 mul_add_c(a[5],b[1],c1,c2,c3);
391 mul_add_c(a[4],b[2],c1,c2,c3);
392 mul_add_c(a[3],b[3],c1,c2,c3);
393 mul_add_c(a[2],b[4],c1,c2,c3);
394 mul_add_c(a[1],b[5],c1,c2,c3);
395 mul_add_c(a[0],b[6],c1,c2,c3);
396 r[6]=c1;
397 c1=0;
398 mul_add_c(a[0],b[7],c2,c3,c1);
399 mul_add_c(a[1],b[6],c2,c3,c1);
400 mul_add_c(a[2],b[5],c2,c3,c1);
401 mul_add_c(a[3],b[4],c2,c3,c1);
402 mul_add_c(a[4],b[3],c2,c3,c1);
403 mul_add_c(a[5],b[2],c2,c3,c1);
404 mul_add_c(a[6],b[1],c2,c3,c1);
405 mul_add_c(a[7],b[0],c2,c3,c1);
406 r[7]=c2;
407 c2=0;
408 mul_add_c(a[7],b[1],c3,c1,c2);
409 mul_add_c(a[6],b[2],c3,c1,c2);
410 mul_add_c(a[5],b[3],c3,c1,c2);
411 mul_add_c(a[4],b[4],c3,c1,c2);
412 mul_add_c(a[3],b[5],c3,c1,c2);
413 mul_add_c(a[2],b[6],c3,c1,c2);
414 mul_add_c(a[1],b[7],c3,c1,c2);
415 r[8]=c3;
416 c3=0;
417 mul_add_c(a[2],b[7],c1,c2,c3);
418 mul_add_c(a[3],b[6],c1,c2,c3);
419 mul_add_c(a[4],b[5],c1,c2,c3);
420 mul_add_c(a[5],b[4],c1,c2,c3);
421 mul_add_c(a[6],b[3],c1,c2,c3);
422 mul_add_c(a[7],b[2],c1,c2,c3);
423 r[9]=c1;
424 c1=0;
425 mul_add_c(a[7],b[3],c2,c3,c1);
426 mul_add_c(a[6],b[4],c2,c3,c1);
427 mul_add_c(a[5],b[5],c2,c3,c1);
428 mul_add_c(a[4],b[6],c2,c3,c1);
429 mul_add_c(a[3],b[7],c2,c3,c1);
430 r[10]=c2;
431 c2=0;
432 mul_add_c(a[4],b[7],c3,c1,c2);
433 mul_add_c(a[5],b[6],c3,c1,c2);
434 mul_add_c(a[6],b[5],c3,c1,c2);
435 mul_add_c(a[7],b[4],c3,c1,c2);
436 r[11]=c3;
437 c3=0;
438 mul_add_c(a[7],b[5],c1,c2,c3);
439 mul_add_c(a[6],b[6],c1,c2,c3);
440 mul_add_c(a[5],b[7],c1,c2,c3);
441 r[12]=c1;
442 c1=0;
443 mul_add_c(a[6],b[7],c2,c3,c1);
444 mul_add_c(a[7],b[6],c2,c3,c1);
445 r[13]=c2;
446 c2=0;
447 mul_add_c(a[7],b[7],c3,c1,c2);
448 r[14]=c3;
449 r[15]=c1;
450 }
451
/*
 * bn_mul_comba4: comba multiplication of two 4-word numbers,
 * r = a * b, producing 8 result words.  Same column-wise scheme as
 * bn_mul_comba8 with rotating c1/c2/c3 accumulator roles; statement
 * order is significant.
 */
452void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
453 {
454 BN_ULONG c1,c2,c3;
455
456 c1=0;
457 c2=0;
458 c3=0;
459 mul_add_c(a[0],b[0],c1,c2,c3);
460 r[0]=c1;
461 c1=0;
462 mul_add_c(a[0],b[1],c2,c3,c1);
463 mul_add_c(a[1],b[0],c2,c3,c1);
464 r[1]=c2;
465 c2=0;
466 mul_add_c(a[2],b[0],c3,c1,c2);
467 mul_add_c(a[1],b[1],c3,c1,c2);
468 mul_add_c(a[0],b[2],c3,c1,c2);
469 r[2]=c3;
470 c3=0;
471 mul_add_c(a[0],b[3],c1,c2,c3);
472 mul_add_c(a[1],b[2],c1,c2,c3);
473 mul_add_c(a[2],b[1],c1,c2,c3);
474 mul_add_c(a[3],b[0],c1,c2,c3);
475 r[3]=c1;
476 c1=0;
477 mul_add_c(a[3],b[1],c2,c3,c1);
478 mul_add_c(a[2],b[2],c2,c3,c1);
479 mul_add_c(a[1],b[3],c2,c3,c1);
480 r[4]=c2;
481 c2=0;
482 mul_add_c(a[2],b[3],c3,c1,c2);
483 mul_add_c(a[3],b[2],c3,c1,c2);
484 r[5]=c3;
485 c3=0;
486 mul_add_c(a[3],b[3],c1,c2,c3);
487 r[6]=c1;
488 r[7]=c2;
489 }
490
/*
 * bn_sqr_comba8: comba squaring of an 8-word number, r = a * a,
 * producing 16 result words.  Off-diagonal cross products a[i]*a[j]
 * (i != j) are added twice via sqr_add_c2; diagonal squares a[i]^2
 * are added once via sqr_add_c.  The c1/c2/c3 accumulator roles
 * rotate per column exactly as in bn_mul_comba8 — order matters.
 */
491void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
492 {
493 BN_ULONG c1,c2,c3;
494
495 c1=0;
496 c2=0;
497 c3=0;
498 sqr_add_c(a,0,c1,c2,c3);
499 r[0]=c1;
500 c1=0;
501 sqr_add_c2(a,1,0,c2,c3,c1);
502 r[1]=c2;
503 c2=0;
504 sqr_add_c(a,1,c3,c1,c2);
505 sqr_add_c2(a,2,0,c3,c1,c2);
506 r[2]=c3;
507 c3=0;
508 sqr_add_c2(a,3,0,c1,c2,c3);
509 sqr_add_c2(a,2,1,c1,c2,c3);
510 r[3]=c1;
511 c1=0;
512 sqr_add_c(a,2,c2,c3,c1);
513 sqr_add_c2(a,3,1,c2,c3,c1);
514 sqr_add_c2(a,4,0,c2,c3,c1);
515 r[4]=c2;
516 c2=0;
517 sqr_add_c2(a,5,0,c3,c1,c2);
518 sqr_add_c2(a,4,1,c3,c1,c2);
519 sqr_add_c2(a,3,2,c3,c1,c2);
520 r[5]=c3;
521 c3=0;
522 sqr_add_c(a,3,c1,c2,c3);
523 sqr_add_c2(a,4,2,c1,c2,c3);
524 sqr_add_c2(a,5,1,c1,c2,c3);
525 sqr_add_c2(a,6,0,c1,c2,c3);
526 r[6]=c1;
527 c1=0;
528 sqr_add_c2(a,7,0,c2,c3,c1);
529 sqr_add_c2(a,6,1,c2,c3,c1);
530 sqr_add_c2(a,5,2,c2,c3,c1);
531 sqr_add_c2(a,4,3,c2,c3,c1);
532 r[7]=c2;
533 c2=0;
534 sqr_add_c(a,4,c3,c1,c2);
535 sqr_add_c2(a,5,3,c3,c1,c2);
536 sqr_add_c2(a,6,2,c3,c1,c2);
537 sqr_add_c2(a,7,1,c3,c1,c2);
538 r[8]=c3;
539 c3=0;
540 sqr_add_c2(a,7,2,c1,c2,c3);
541 sqr_add_c2(a,6,3,c1,c2,c3);
542 sqr_add_c2(a,5,4,c1,c2,c3);
543 r[9]=c1;
544 c1=0;
545 sqr_add_c(a,5,c2,c3,c1);
546 sqr_add_c2(a,6,4,c2,c3,c1);
547 sqr_add_c2(a,7,3,c2,c3,c1);
548 r[10]=c2;
549 c2=0;
550 sqr_add_c2(a,7,4,c3,c1,c2);
551 sqr_add_c2(a,6,5,c3,c1,c2);
552 r[11]=c3;
553 c3=0;
554 sqr_add_c(a,6,c1,c2,c3);
555 sqr_add_c2(a,7,5,c1,c2,c3);
556 r[12]=c1;
557 c1=0;
558 sqr_add_c2(a,7,6,c2,c3,c1);
559 r[13]=c2;
560 c2=0;
561 sqr_add_c(a,7,c3,c1,c2);
562 r[14]=c3;
563 r[15]=c1;
564 }
565
/*
 * bn_sqr_comba4: comba squaring of a 4-word number, r = a * a,
 * producing 8 result words.  Cross products are doubled via
 * sqr_add_c2, diagonal squares added once via sqr_add_c; accumulator
 * roles rotate per column — do not reorder statements.
 */
566void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
567 {
568 BN_ULONG c1,c2,c3;
569
570 c1=0;
571 c2=0;
572 c3=0;
573 sqr_add_c(a,0,c1,c2,c3);
574 r[0]=c1;
575 c1=0;
576 sqr_add_c2(a,1,0,c2,c3,c1);
577 r[1]=c2;
578 c2=0;
579 sqr_add_c(a,1,c3,c1,c2);
580 sqr_add_c2(a,2,0,c3,c1,c2);
581 r[2]=c3;
582 c3=0;
583 sqr_add_c2(a,3,0,c1,c2,c3);
584 sqr_add_c2(a,2,1,c1,c2,c3);
585 r[3]=c1;
586 c1=0;
587 sqr_add_c(a,2,c2,c3,c1);
588 sqr_add_c2(a,3,1,c2,c3,c1);
589 r[4]=c2;
590 c2=0;
591 sqr_add_c2(a,3,2,c3,c1,c2);
592 r[5]=c3;
593 c3=0;
594 sqr_add_c(a,3,c1,c2,c3);
595 r[6]=c1;
596 r[7]=c2;
597 }
598#endif