1 files changed, 0 insertions, 554 deletions
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
deleted file mode 100644
index bd068cfb51..0000000000
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ /dev/null
@@ -1,554 +0,0 @@
-/* $OpenBSD: x86_64-gcc.c,v 1.6 2015/09/12 09:04:12 miod Exp $ */
-#include "../bn_lcl.h"
-/*
- * x86_64 BIGNUM accelerator version 0.1, December 2002.
- *
- * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
- * project.
- *
- * Rights for redistribution and usage in source and binary forms are
- * granted according to the OpenSSL license. Warranty of any kind is
- * disclaimed.
- *
- * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
- *    versions, like 1.0...
- * A. Well, that's because this code is basically a quick-n-dirty
- *    proof-of-concept hack. As you can see it's implemented with
- *    inline assembler, which means that you're bound to GCC and that
- *    there might be enough room for further improvement.
- *
- * Q. Why inline assembler?
- * A. x86_64 features own ABI which I'm not familiar with. This is
- *    why I decided to let the compiler take care of subroutine
- *    prologue/epilogue as well as register allocation. For reference.
- *    Win64 implements different ABI for AMD64, different from Linux.
- *
- * Q. How much faster does it get?
- * A. 'apps/openssl speed rsa dsa' output with no-asm:
- *
- *                        sign    verify    sign/s verify/s
- *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
- *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
- *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
- *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
- *                        sign    verify    sign/s verify/s
- *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
- *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
- *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
- *
- *    'apps/openssl speed rsa dsa' output with this module:
- *
- *                        sign    verify    sign/s verify/s
- *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
- *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
- *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
- *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
- *                        sign    verify    sign/s verify/s
- *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
- *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
- *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
- *
- *    For the reference. IA-32 assembler implementation performs
- *    very much like 64-bit code compiled with no-asm on the same
- *    machine.
- */
-#define BN_ULONG unsigned long
-#undef mul
-#undef mul_add
-#undef sqr
-/*
- * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
- * "g"(0)               let the compiler to decide where does it
- *                      want to keep the value of zero;
- */
-#define mul_add(r,a,word,carry) do {    \
-        BN_ULONG high,low;      \
-        asm ("mulq %3"                  \
-                : "=a"(low),"=d"(high)  \
-                : "a"(word),"m"(a)      \
-                : "cc");                \
-        asm ("addq %2,%0; adcq %3,%1"   \
-                : "+r"(carry),"+d"(high)\
-                : "a"(low),"g"(0)       \
-                : "cc");                \
-        asm ("addq %2,%0; adcq %3,%1"   \
-                : "+m"(r),"+d"(high)    \
-                : "r"(carry),"g"(0)     \
-                : "cc");                \
-        carry=high;                     \
-        } while (0)
-#define mul(r,a,word,carry) do {        \
-        BN_ULONG high,low;      \
-        asm ("mulq %3"                  \
-                : "=a"(low),"=d"(high)  \
-                : "a"(word),"g"(a)      \
-                : "cc");                \
-        asm ("addq %2,%0; adcq %3,%1"   \
-                : "+r"(carry),"+d"(high)\
-                : "a"(low),"g"(0)       \
-                : "cc");                \
-        (r)=carry, carry=high;          \
-        } while (0)
-#define sqr(r0,r1,a)                    \
-        asm ("mulq %2"                  \
-                : "=a"(r0),"=d"(r1)     \
-                : "a"(a)                \
-                : "cc");
-BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
-        {
-        BN_ULONG c1=0;
-        if (num <= 0) return(c1);
-        while (num&~3)
-                {
-                mul_add(rp[0],ap[0],w,c1);
-                mul_add(rp[1],ap[1],w,c1);
-                mul_add(rp[2],ap[2],w,c1);
-                mul_add(rp[3],ap[3],w,c1);
-                ap+=4; rp+=4; num-=4;
-                }
-        if (num)
-                {
-                mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
-                mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
-                mul_add(rp[2],ap[2],w,c1); return c1;
-                }
-        
-        return(c1);
-        } 
-BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
-        {
-        BN_ULONG c1=0;
-        if (num <= 0) return(c1);
-        while (num&~3)
-                {
-                mul(rp[0],ap[0],w,c1);
-                mul(rp[1],ap[1],w,c1);
-                mul(rp[2],ap[2],w,c1);
-                mul(rp[3],ap[3],w,c1);
-                ap+=4; rp+=4; num-=4;
-                }
-        if (num)
-                {
-                mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
-                mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
-                mul(rp[2],ap[2],w,c1);
-                }
-        return(c1);
-        } 
-void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
-        {
-        if (n <= 0) return;
-        while (n&~3)
-                {
-                sqr(r[0],r[1],a[0]);
-                sqr(r[2],r[3],a[1]);
-                sqr(r[4],r[5],a[2]);
-                sqr(r[6],r[7],a[3]);
-                a+=4; r+=8; n-=4;
-                }
-        if (n)
-                {
-                sqr(r[0],r[1],a[0]); if (--n == 0) return;
-                sqr(r[2],r[3],a[1]); if (--n == 0) return;
-                sqr(r[4],r[5],a[2]);
-                }
-        }
-BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
-{       BN_ULONG ret,waste;
-        asm ("divq      %4"
-                : "=a"(ret),"=d"(waste)
-                : "a"(l),"d"(h),"g"(d)
-                : "cc");
-        return ret;
-}
-BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
-{ BN_ULONG ret=0,i=0;
-        if (n <= 0) return 0;
-        asm (
-        "       subq    %2,%2           \n"
-        ".p2align 4                     \n"
-        "1:     movq    (%4,%2,8),%0    \n"
-        "       adcq    (%5,%2,8),%0    \n"
-        "       movq    %0,(%3,%2,8)    \n"
-        "       leaq    1(%2),%2        \n"
-        "       loop    1b              \n"
-        "       sbbq    %0,%0           \n"
-                : "=&a"(ret),"+c"(n),"=&r"(i)
-                : "r"(rp),"r"(ap),"r"(bp)
-                : "cc"
-        );
-  return ret&1;
-}
-BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
-{ BN_ULONG ret=0,i=0;
-        if (n <= 0) return 0;
-        asm (
-        "       subq    %2,%2           \n"
-        ".p2align 4                     \n"
-        "1:     movq    (%4,%2,8),%0    \n"
-        "       sbbq    (%5,%2,8),%0    \n"
-        "       movq    %0,(%3,%2,8)    \n"
-        "       leaq    1(%2),%2        \n"
-        "       loop    1b              \n"
-        "       sbbq    %0,%0           \n"
-                : "=&a"(ret),"+c"(n),"=&r"(i)
-                : "r"(rp),"r"(ap),"r"(bp)
-                : "cc"
-        );
-  return ret&1;
-}
-/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
-/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
-/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
-/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
-/*
- * Keep in mind that carrying into high part of multiplication result
- * can not overflow, because it cannot be all-ones.
- */
-#if 0
-/* original macros are kept for reference purposes */
-#define mul_add_c(a,b,c0,c1,c2)         do {    \
-        BN_ULONG ta = (a), tb = (b);            \
-        BN_ULONG lo, hi;                        \
-        BN_UMULT_LOHI(lo,hi,ta,tb);             \
-        c0 += lo; hi += (c0<lo)?1:0;            \
-        c1 += hi; c2 += (c1<hi)?1:0;            \
-        } while(0)
-#define mul_add_c2(a,b,c0,c1,c2)        do {    \
-        BN_ULONG ta = (a), tb = (b);            \
-        BN_ULONG lo, hi, tt;                    \
-        BN_UMULT_LOHI(lo,hi,ta,tb);             \
-        c0 += lo; tt = hi+((c0<lo)?1:0);        \
-        c1 += tt; c2 += (c1<tt)?1:0;            \
-        c0 += lo; hi += (c0<lo)?1:0;            \
-        c1 += hi; c2 += (c1<hi)?1:0;            \
-        } while(0)
-#define sqr_add_c(a,i,c0,c1,c2)         do {    \
-        BN_ULONG ta = (a)[i];                   \
-        BN_ULONG lo, hi;                        \
-        BN_UMULT_LOHI(lo,hi,ta,ta);             \
-        c0 += lo; hi += (c0<lo)?1:0;            \
-        c1 += hi; c2 += (c1<hi)?1:0;            \
-        } while(0)
-#else
-#define mul_add_c(a,b,c0,c1,c2) do {    \
-        BN_ULONG t1,t2;                 \
-        asm ("mulq %3"                  \
-                : "=a"(t1),"=d"(t2)     \
-                : "a"(a),"m"(b)         \
-                : "cc");                \
-        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
-                : "+r"(c0),"+r"(c1),"+r"(c2)            \
-                : "r"(t1),"r"(t2),"g"(0)                \
-                : "cc");                                \
-        } while (0)
-#define sqr_add_c(a,i,c0,c1,c2) do {    \
-        BN_ULONG t1,t2;                 \
-        asm ("mulq %2"                  \
-                : "=a"(t1),"=d"(t2)     \
-                : "a"(a[i])             \
-                : "cc");                \
-        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
-                : "+r"(c0),"+r"(c1),"+r"(c2)            \
-                : "r"(t1),"r"(t2),"g"(0)                \
-                : "cc");                                \
-        } while (0)
-#define mul_add_c2(a,b,c0,c1,c2) do {   \
-        BN_ULONG t1,t2;                 \
-        asm ("mulq %3"                  \
-                : "=a"(t1),"=d"(t2)     \
-                : "a"(a),"m"(b)         \
-                : "cc");                \
-        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
-                : "+r"(c0),"+r"(c1),"+r"(c2)            \
-                : "r"(t1),"r"(t2),"g"(0)                \
-                : "cc");                                \
-        asm ("addq %3,%0; adcq %4,%1; adcq %5,%2"       \
-                : "+r"(c0),"+r"(c1),"+r"(c2)            \
-                : "r"(t1),"r"(t2),"g"(0)                \
-                : "cc");                                \
-        } while (0)
-#endif
-#define sqr_add_c2(a,i,j,c0,c1,c2)      \
-        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
-void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-        {
-        BN_ULONG c1,c2,c3;
-        c1=0;
-        c2=0;
-        c3=0;
-        mul_add_c(a[0],b[0],c1,c2,c3);
-        r[0]=c1;
-        c1=0;
-        mul_add_c(a[0],b[1],c2,c3,c1);
-        mul_add_c(a[1],b[0],c2,c3,c1);
-        r[1]=c2;
-        c2=0;
-        mul_add_c(a[2],b[0],c3,c1,c2);
-        mul_add_c(a[1],b[1],c3,c1,c2);
-        mul_add_c(a[0],b[2],c3,c1,c2);
-        r[2]=c3;
-        c3=0;
-        mul_add_c(a[0],b[3],c1,c2,c3);
-        mul_add_c(a[1],b[2],c1,c2,c3);
-        mul_add_c(a[2],b[1],c1,c2,c3);
-        mul_add_c(a[3],b[0],c1,c2,c3);
-        r[3]=c1;
-        c1=0;
-        mul_add_c(a[4],b[0],c2,c3,c1);
-        mul_add_c(a[3],b[1],c2,c3,c1);
-        mul_add_c(a[2],b[2],c2,c3,c1);
-        mul_add_c(a[1],b[3],c2,c3,c1);
-        mul_add_c(a[0],b[4],c2,c3,c1);
-        r[4]=c2;
-        c2=0;
-        mul_add_c(a[0],b[5],c3,c1,c2);
-        mul_add_c(a[1],b[4],c3,c1,c2);
-        mul_add_c(a[2],b[3],c3,c1,c2);
-        mul_add_c(a[3],b[2],c3,c1,c2);
-        mul_add_c(a[4],b[1],c3,c1,c2);
-        mul_add_c(a[5],b[0],c3,c1,c2);
-        r[5]=c3;
-        c3=0;
-        mul_add_c(a[6],b[0],c1,c2,c3);
-        mul_add_c(a[5],b[1],c1,c2,c3);
-        mul_add_c(a[4],b[2],c1,c2,c3);
-        mul_add_c(a[3],b[3],c1,c2,c3);
-        mul_add_c(a[2],b[4],c1,c2,c3);
-        mul_add_c(a[1],b[5],c1,c2,c3);
-        mul_add_c(a[0],b[6],c1,c2,c3);
-        r[6]=c1;
-        c1=0;
-        mul_add_c(a[0],b[7],c2,c3,c1);
-        mul_add_c(a[1],b[6],c2,c3,c1);
-        mul_add_c(a[2],b[5],c2,c3,c1);
-        mul_add_c(a[3],b[4],c2,c3,c1);
-        mul_add_c(a[4],b[3],c2,c3,c1);
-        mul_add_c(a[5],b[2],c2,c3,c1);
-        mul_add_c(a[6],b[1],c2,c3,c1);
-        mul_add_c(a[7],b[0],c2,c3,c1);
-        r[7]=c2;
-        c2=0;
-        mul_add_c(a[7],b[1],c3,c1,c2);
-        mul_add_c(a[6],b[2],c3,c1,c2);
-        mul_add_c(a[5],b[3],c3,c1,c2);
-        mul_add_c(a[4],b[4],c3,c1,c2);
-        mul_add_c(a[3],b[5],c3,c1,c2);
-        mul_add_c(a[2],b[6],c3,c1,c2);
-        mul_add_c(a[1],b[7],c3,c1,c2);
-        r[8]=c3;
-        c3=0;
-        mul_add_c(a[2],b[7],c1,c2,c3);
-        mul_add_c(a[3],b[6],c1,c2,c3);
-        mul_add_c(a[4],b[5],c1,c2,c3);
-        mul_add_c(a[5],b[4],c1,c2,c3);
-        mul_add_c(a[6],b[3],c1,c2,c3);
-        mul_add_c(a[7],b[2],c1,c2,c3);
-        r[9]=c1;
-        c1=0;
-        mul_add_c(a[7],b[3],c2,c3,c1);
-        mul_add_c(a[6],b[4],c2,c3,c1);
-        mul_add_c(a[5],b[5],c2,c3,c1);
-        mul_add_c(a[4],b[6],c2,c3,c1);
-        mul_add_c(a[3],b[7],c2,c3,c1);
-        r[10]=c2;
-        c2=0;
-        mul_add_c(a[4],b[7],c3,c1,c2);
-        mul_add_c(a[5],b[6],c3,c1,c2);
-        mul_add_c(a[6],b[5],c3,c1,c2);
-        mul_add_c(a[7],b[4],c3,c1,c2);
-        r[11]=c3;
-        c3=0;
-        mul_add_c(a[7],b[5],c1,c2,c3);
-        mul_add_c(a[6],b[6],c1,c2,c3);
-        mul_add_c(a[5],b[7],c1,c2,c3);
-        r[12]=c1;
-        c1=0;
-        mul_add_c(a[6],b[7],c2,c3,c1);
-        mul_add_c(a[7],b[6],c2,c3,c1);
-        r[13]=c2;
-        c2=0;
-        mul_add_c(a[7],b[7],c3,c1,c2);
-        r[14]=c3;
-        r[15]=c1;
-        }
-void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
-        {
-        BN_ULONG c1,c2,c3;
-        c1=0;
-        c2=0;
-        c3=0;
-        mul_add_c(a[0],b[0],c1,c2,c3);
-        r[0]=c1;
-        c1=0;
-        mul_add_c(a[0],b[1],c2,c3,c1);
-        mul_add_c(a[1],b[0],c2,c3,c1);
-        r[1]=c2;
-        c2=0;
-        mul_add_c(a[2],b[0],c3,c1,c2);
-        mul_add_c(a[1],b[1],c3,c1,c2);
-        mul_add_c(a[0],b[2],c3,c1,c2);
-        r[2]=c3;
-        c3=0;
-        mul_add_c(a[0],b[3],c1,c2,c3);
-        mul_add_c(a[1],b[2],c1,c2,c3);
-        mul_add_c(a[2],b[1],c1,c2,c3);
-        mul_add_c(a[3],b[0],c1,c2,c3);
-        r[3]=c1;
-        c1=0;
-        mul_add_c(a[3],b[1],c2,c3,c1);
-        mul_add_c(a[2],b[2],c2,c3,c1);
-        mul_add_c(a[1],b[3],c2,c3,c1);
-        r[4]=c2;
-        c2=0;
-        mul_add_c(a[2],b[3],c3,c1,c2);
-        mul_add_c(a[3],b[2],c3,c1,c2);
-        r[5]=c3;
-        c3=0;
-        mul_add_c(a[3],b[3],c1,c2,c3);
-        r[6]=c1;
-        r[7]=c2;
-        }
-void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
-        {
-        BN_ULONG c1,c2,c3;
-        c1=0;
-        c2=0;
-        c3=0;
-        sqr_add_c(a,0,c1,c2,c3);
-        r[0]=c1;
-        c1=0;
-        sqr_add_c2(a,1,0,c2,c3,c1);
-        r[1]=c2;
-        c2=0;
-        sqr_add_c(a,1,c3,c1,c2);
-        sqr_add_c2(a,2,0,c3,c1,c2);
-        r[2]=c3;
-        c3=0;
-        sqr_add_c2(a,3,0,c1,c2,c3);
-        sqr_add_c2(a,2,1,c1,c2,c3);
-        r[3]=c1;
-        c1=0;
-        sqr_add_c(a,2,c2,c3,c1);
-        sqr_add_c2(a,3,1,c2,c3,c1);
-        sqr_add_c2(a,4,0,c2,c3,c1);
-        r[4]=c2;
-        c2=0;
-        sqr_add_c2(a,5,0,c3,c1,c2);
-        sqr_add_c2(a,4,1,c3,c1,c2);
-        sqr_add_c2(a,3,2,c3,c1,c2);
-        r[5]=c3;
-        c3=0;
-        sqr_add_c(a,3,c1,c2,c3);
-        sqr_add_c2(a,4,2,c1,c2,c3);
-        sqr_add_c2(a,5,1,c1,c2,c3);
-        sqr_add_c2(a,6,0,c1,c2,c3);
-        r[6]=c1;
-        c1=0;
-        sqr_add_c2(a,7,0,c2,c3,c1);
-        sqr_add_c2(a,6,1,c2,c3,c1);
-        sqr_add_c2(a,5,2,c2,c3,c1);
-        sqr_add_c2(a,4,3,c2,c3,c1);
-        r[7]=c2;
-        c2=0;
-        sqr_add_c(a,4,c3,c1,c2);
-        sqr_add_c2(a,5,3,c3,c1,c2);
-        sqr_add_c2(a,6,2,c3,c1,c2);
-        sqr_add_c2(a,7,1,c3,c1,c2);
-        r[8]=c3;
-        c3=0;
-        sqr_add_c2(a,7,2,c1,c2,c3);
-        sqr_add_c2(a,6,3,c1,c2,c3);
-        sqr_add_c2(a,5,4,c1,c2,c3);
-        r[9]=c1;
-        c1=0;
-        sqr_add_c(a,5,c2,c3,c1);
-        sqr_add_c2(a,6,4,c2,c3,c1);
-        sqr_add_c2(a,7,3,c2,c3,c1);
-        r[10]=c2;
-        c2=0;
-        sqr_add_c2(a,7,4,c3,c1,c2);
-        sqr_add_c2(a,6,5,c3,c1,c2);
-        r[11]=c3;
-        c3=0;
-        sqr_add_c(a,6,c1,c2,c3);
-        sqr_add_c2(a,7,5,c1,c2,c3);
-        r[12]=c1;
-        c1=0;
-        sqr_add_c2(a,7,6,c2,c3,c1);
-        r[13]=c2;
-        c2=0;
-        sqr_add_c(a,7,c3,c1,c2);
-        r[14]=c3;
-        r[15]=c1;
-        }
-void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
-        {
-        BN_ULONG c1,c2,c3;
-        c1=0;
-        c2=0;
-        c3=0;
-        sqr_add_c(a,0,c1,c2,c3);
-        r[0]=c1;
-        c1=0;
-        sqr_add_c2(a,1,0,c2,c3,c1);
-        r[1]=c2;
-        c2=0;
-        sqr_add_c(a,1,c3,c1,c2);
-        sqr_add_c2(a,2,0,c3,c1,c2);
-        r[2]=c3;
-        c3=0;
-        sqr_add_c2(a,3,0,c1,c2,c3);
-        sqr_add_c2(a,2,1,c1,c2,c3);
-        r[3]=c1;
-        c1=0;
-        sqr_add_c(a,2,c2,c3,c1);
-        sqr_add_c2(a,3,1,c2,c3,c1);
-        r[4]=c2;
-        c2=0;
-        sqr_add_c2(a,3,2,c3,c1,c2);
-        r[5]=c3;
-        c3=0;
-        sqr_add_c(a,3,c1,c2,c3);
-        r[6]=c1;
-        r[7]=c2;
-        }

diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c deleted file mode 100644 index bd068cfb51..0000000000 --- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c +++ /dev/null
@@ -1,554 +0,0 @@
1	/* $OpenBSD: x86_64-gcc.c,v 1.6 2015/09/12 09:04:12 miod Exp $ */
2	#include "../bn_lcl.h"
3	/*
4	* x86_64 BIGNUM accelerator version 0.1, December 2002.
5	*
6	* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
7	* project.
8	*
9	* Rights for redistribution and usage in source and binary forms are
10	* granted according to the OpenSSL license. Warranty of any kind is
11	* disclaimed.
12	*
13	* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
14	* versions, like 1.0...
15	* A. Well, that's because this code is basically a quick-n-dirty
16	* proof-of-concept hack. As you can see it's implemented with
17	* inline assembler, which means that you're bound to GCC and that
18	* there might be enough room for further improvement.
19	*
20	* Q. Why inline assembler?
21	* A. x86_64 features own ABI which I'm not familiar with. This is
22	* why I decided to let the compiler take care of subroutine
23	* prologue/epilogue as well as register allocation. For reference.
24	* Win64 implements different ABI for AMD64, different from Linux.
25	*
26	* Q. How much faster does it get?
27	* A. 'apps/openssl speed rsa dsa' output with no-asm:
28	*
29	* sign verify sign/s verify/s
30	* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
31	* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
32	* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
33	* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
34	* sign verify sign/s verify/s
35	* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
36	* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
37	* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
38	*
39	* 'apps/openssl speed rsa dsa' output with this module:
40	*
41	* sign verify sign/s verify/s
42	* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
43	* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
44	* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
45	* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
46	* sign verify sign/s verify/s
47	* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
48	* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
49	* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
50	*
51	* For the reference. IA-32 assembler implementation performs
52	* very much like 64-bit code compiled with no-asm on the same
53	* machine.
54	*/
55
56	#define BN_ULONG unsigned long
57
58	#undef mul
59	#undef mul_add
60	#undef sqr
61
62	/*
63	* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
64	* "g"(0) let the compiler to decide where does it
65	* want to keep the value of zero;
66	*/
67	#define mul_add(r,a,word,carry) do { \
68	BN_ULONG high,low; \
69	asm ("mulq %3" \
70	: "=a"(low),"=d"(high) \
71	: "a"(word),"m"(a) \
72	: "cc"); \
73	asm ("addq %2,%0; adcq %3,%1" \
74	: "+r"(carry),"+d"(high)\
75	: "a"(low),"g"(0) \
76	: "cc"); \
77	asm ("addq %2,%0; adcq %3,%1" \
78	: "+m"(r),"+d"(high) \
79	: "r"(carry),"g"(0) \
80	: "cc"); \
81	carry=high; \
82	} while (0)
83
84	#define mul(r,a,word,carry) do { \
85	BN_ULONG high,low; \
86	asm ("mulq %3" \
87	: "=a"(low),"=d"(high) \
88	: "a"(word),"g"(a) \
89	: "cc"); \
90	asm ("addq %2,%0; adcq %3,%1" \
91	: "+r"(carry),"+d"(high)\
92	: "a"(low),"g"(0) \
93	: "cc"); \
94	(r)=carry, carry=high; \
95	} while (0)
96
97	#define sqr(r0,r1,a) \
98	asm ("mulq %2" \
99	: "=a"(r0),"=d"(r1) \
100	: "a"(a) \
101	: "cc");
102
103	BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
104	{
105	BN_ULONG c1=0;
106
107	if (num <= 0) return(c1);
108
109	while (num&~3)
110	{
111	mul_add(rp[0],ap[0],w,c1);
112	mul_add(rp[1],ap[1],w,c1);
113	mul_add(rp[2],ap[2],w,c1);
114	mul_add(rp[3],ap[3],w,c1);
115	ap+=4; rp+=4; num-=4;
116	}
117	if (num)
118	{
119	mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
120	mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
121	mul_add(rp[2],ap[2],w,c1); return c1;
122	}
123
124	return(c1);
125	}
126
127	BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
128	{
129	BN_ULONG c1=0;
130
131	if (num <= 0) return(c1);
132
133	while (num&~3)
134	{
135	mul(rp[0],ap[0],w,c1);
136	mul(rp[1],ap[1],w,c1);
137	mul(rp[2],ap[2],w,c1);
138	mul(rp[3],ap[3],w,c1);
139	ap+=4; rp+=4; num-=4;
140	}
141	if (num)
142	{
143	mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
144	mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
145	mul(rp[2],ap[2],w,c1);
146	}
147	return(c1);
148	}
149
150	void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
151	{
152	if (n <= 0) return;
153
154	while (n&~3)
155	{
156	sqr(r[0],r[1],a[0]);
157	sqr(r[2],r[3],a[1]);
158	sqr(r[4],r[5],a[2]);
159	sqr(r[6],r[7],a[3]);
160	a+=4; r+=8; n-=4;
161	}
162	if (n)
163	{
164	sqr(r[0],r[1],a[0]); if (--n == 0) return;
165	sqr(r[2],r[3],a[1]); if (--n == 0) return;
166	sqr(r[4],r[5],a[2]);
167	}
168	}
169
170	BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
171	{ BN_ULONG ret,waste;
172
173	asm ("divq %4"
174	: "=a"(ret),"=d"(waste)
175	: "a"(l),"d"(h),"g"(d)
176	: "cc");
177
178	return ret;
179	}
180
181	BN_ULONG bn_add_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
182	{ BN_ULONG ret=0,i=0;
183
184	if (n <= 0) return 0;
185
186	asm (
187	" subq %2,%2 \n"
188	".p2align 4 \n"
189	"1: movq (%4,%2,8),%0 \n"
190	" adcq (%5,%2,8),%0 \n"
191	" movq %0,(%3,%2,8) \n"
192	" leaq 1(%2),%2 \n"
193	" loop 1b \n"
194	" sbbq %0,%0 \n"
195	: "=&a"(ret),"+c"(n),"=&r"(i)
196	: "r"(rp),"r"(ap),"r"(bp)
197	: "cc"
198	);
199
200	return ret&1;
201	}
202
203	BN_ULONG bn_sub_words (BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int n)
204	{ BN_ULONG ret=0,i=0;
205
206	if (n <= 0) return 0;
207
208	asm (
209	" subq %2,%2 \n"
210	".p2align 4 \n"
211	"1: movq (%4,%2,8),%0 \n"
212	" sbbq (%5,%2,8),%0 \n"
213	" movq %0,(%3,%2,8) \n"
214	" leaq 1(%2),%2 \n"
215	" loop 1b \n"
216	" sbbq %0,%0 \n"
217	: "=&a"(ret),"+c"(n),"=&r"(i)
218	: "r"(rp),"r"(ap),"r"(bp)
219	: "cc"
220	);
221
222	return ret&1;
223	}
224
225	/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
226	/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
227	/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
228	/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
229
230	/*
231	* Keep in mind that carrying into high part of multiplication result
232	* can not overflow, because it cannot be all-ones.
233	*/
234	#if 0
235	/* original macros are kept for reference purposes */
236	#define mul_add_c(a,b,c0,c1,c2) do { \
237	BN_ULONG ta = (a), tb = (b); \
238	BN_ULONG lo, hi; \
239	BN_UMULT_LOHI(lo,hi,ta,tb); \
240	c0 += lo; hi += (c0<lo)?1:0; \
241	c1 += hi; c2 += (c1<hi)?1:0; \
242	} while(0)
243
244	#define mul_add_c2(a,b,c0,c1,c2) do { \
245	BN_ULONG ta = (a), tb = (b); \
246	BN_ULONG lo, hi, tt; \
247	BN_UMULT_LOHI(lo,hi,ta,tb); \
248	c0 += lo; tt = hi+((c0<lo)?1:0); \
249	c1 += tt; c2 += (c1<tt)?1:0; \
250	c0 += lo; hi += (c0<lo)?1:0; \
251	c1 += hi; c2 += (c1<hi)?1:0; \
252	} while(0)
253
254	#define sqr_add_c(a,i,c0,c1,c2) do { \
255	BN_ULONG ta = (a)[i]; \
256	BN_ULONG lo, hi; \
257	BN_UMULT_LOHI(lo,hi,ta,ta); \
258	c0 += lo; hi += (c0<lo)?1:0; \
259	c1 += hi; c2 += (c1<hi)?1:0; \
260	} while(0)
261	#else
262	#define mul_add_c(a,b,c0,c1,c2) do { \
263	BN_ULONG t1,t2; \
264	asm ("mulq %3" \
265	: "=a"(t1),"=d"(t2) \
266	: "a"(a),"m"(b) \
267	: "cc"); \
268	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
269	: "+r"(c0),"+r"(c1),"+r"(c2) \
270	: "r"(t1),"r"(t2),"g"(0) \
271	: "cc"); \
272	} while (0)
273
274	#define sqr_add_c(a,i,c0,c1,c2) do { \
275	BN_ULONG t1,t2; \
276	asm ("mulq %2" \
277	: "=a"(t1),"=d"(t2) \
278	: "a"(a[i]) \
279	: "cc"); \
280	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
281	: "+r"(c0),"+r"(c1),"+r"(c2) \
282	: "r"(t1),"r"(t2),"g"(0) \
283	: "cc"); \
284	} while (0)
285
286	#define mul_add_c2(a,b,c0,c1,c2) do { \
287	BN_ULONG t1,t2; \
288	asm ("mulq %3" \
289	: "=a"(t1),"=d"(t2) \
290	: "a"(a),"m"(b) \
291	: "cc"); \
292	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
293	: "+r"(c0),"+r"(c1),"+r"(c2) \
294	: "r"(t1),"r"(t2),"g"(0) \
295	: "cc"); \
296	asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
297	: "+r"(c0),"+r"(c1),"+r"(c2) \
298	: "r"(t1),"r"(t2),"g"(0) \
299	: "cc"); \
300	} while (0)
301	#endif
302
303	#define sqr_add_c2(a,i,j,c0,c1,c2) \
304	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
305
306	void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
307	{
308	BN_ULONG c1,c2,c3;
309
310	c1=0;
311	c2=0;
312	c3=0;
313	mul_add_c(a[0],b[0],c1,c2,c3);
314	r[0]=c1;
315	c1=0;
316	mul_add_c(a[0],b[1],c2,c3,c1);
317	mul_add_c(a[1],b[0],c2,c3,c1);
318	r[1]=c2;
319	c2=0;
320	mul_add_c(a[2],b[0],c3,c1,c2);
321	mul_add_c(a[1],b[1],c3,c1,c2);
322	mul_add_c(a[0],b[2],c3,c1,c2);
323	r[2]=c3;
324	c3=0;
325	mul_add_c(a[0],b[3],c1,c2,c3);
326	mul_add_c(a[1],b[2],c1,c2,c3);
327	mul_add_c(a[2],b[1],c1,c2,c3);
328	mul_add_c(a[3],b[0],c1,c2,c3);
329	r[3]=c1;
330	c1=0;
331	mul_add_c(a[4],b[0],c2,c3,c1);
332	mul_add_c(a[3],b[1],c2,c3,c1);
333	mul_add_c(a[2],b[2],c2,c3,c1);
334	mul_add_c(a[1],b[3],c2,c3,c1);
335	mul_add_c(a[0],b[4],c2,c3,c1);
336	r[4]=c2;
337	c2=0;
338	mul_add_c(a[0],b[5],c3,c1,c2);
339	mul_add_c(a[1],b[4],c3,c1,c2);
340	mul_add_c(a[2],b[3],c3,c1,c2);
341	mul_add_c(a[3],b[2],c3,c1,c2);
342	mul_add_c(a[4],b[1],c3,c1,c2);
343	mul_add_c(a[5],b[0],c3,c1,c2);
344	r[5]=c3;
345	c3=0;
346	mul_add_c(a[6],b[0],c1,c2,c3);
347	mul_add_c(a[5],b[1],c1,c2,c3);
348	mul_add_c(a[4],b[2],c1,c2,c3);
349	mul_add_c(a[3],b[3],c1,c2,c3);
350	mul_add_c(a[2],b[4],c1,c2,c3);
351	mul_add_c(a[1],b[5],c1,c2,c3);
352	mul_add_c(a[0],b[6],c1,c2,c3);
353	r[6]=c1;
354	c1=0;
355	mul_add_c(a[0],b[7],c2,c3,c1);
356	mul_add_c(a[1],b[6],c2,c3,c1);
357	mul_add_c(a[2],b[5],c2,c3,c1);
358	mul_add_c(a[3],b[4],c2,c3,c1);
359	mul_add_c(a[4],b[3],c2,c3,c1);
360	mul_add_c(a[5],b[2],c2,c3,c1);
361	mul_add_c(a[6],b[1],c2,c3,c1);
362	mul_add_c(a[7],b[0],c2,c3,c1);
363	r[7]=c2;
364	c2=0;
365	mul_add_c(a[7],b[1],c3,c1,c2);
366	mul_add_c(a[6],b[2],c3,c1,c2);
367	mul_add_c(a[5],b[3],c3,c1,c2);
368	mul_add_c(a[4],b[4],c3,c1,c2);
369	mul_add_c(a[3],b[5],c3,c1,c2);
370	mul_add_c(a[2],b[6],c3,c1,c2);
371	mul_add_c(a[1],b[7],c3,c1,c2);
372	r[8]=c3;
373	c3=0;
374	mul_add_c(a[2],b[7],c1,c2,c3);
375	mul_add_c(a[3],b[6],c1,c2,c3);
376	mul_add_c(a[4],b[5],c1,c2,c3);
377	mul_add_c(a[5],b[4],c1,c2,c3);
378	mul_add_c(a[6],b[3],c1,c2,c3);
379	mul_add_c(a[7],b[2],c1,c2,c3);
380	r[9]=c1;
381	c1=0;
382	mul_add_c(a[7],b[3],c2,c3,c1);
383	mul_add_c(a[6],b[4],c2,c3,c1);
384	mul_add_c(a[5],b[5],c2,c3,c1);
385	mul_add_c(a[4],b[6],c2,c3,c1);
386	mul_add_c(a[3],b[7],c2,c3,c1);
387	r[10]=c2;
388	c2=0;
389	mul_add_c(a[4],b[7],c3,c1,c2);
390	mul_add_c(a[5],b[6],c3,c1,c2);
391	mul_add_c(a[6],b[5],c3,c1,c2);
392	mul_add_c(a[7],b[4],c3,c1,c2);
393	r[11]=c3;
394	c3=0;
395	mul_add_c(a[7],b[5],c1,c2,c3);
396	mul_add_c(a[6],b[6],c1,c2,c3);
397	mul_add_c(a[5],b[7],c1,c2,c3);
398	r[12]=c1;
399	c1=0;
400	mul_add_c(a[6],b[7],c2,c3,c1);
401	mul_add_c(a[7],b[6],c2,c3,c1);
402	r[13]=c2;
403	c2=0;
404	mul_add_c(a[7],b[7],c3,c1,c2);
405	r[14]=c3;
406	r[15]=c1;
407	}
408
409	void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
410	{
411	BN_ULONG c1,c2,c3;
412
413	c1=0;
414	c2=0;
415	c3=0;
416	mul_add_c(a[0],b[0],c1,c2,c3);
417	r[0]=c1;
418	c1=0;
419	mul_add_c(a[0],b[1],c2,c3,c1);
420	mul_add_c(a[1],b[0],c2,c3,c1);
421	r[1]=c2;
422	c2=0;
423	mul_add_c(a[2],b[0],c3,c1,c2);
424	mul_add_c(a[1],b[1],c3,c1,c2);
425	mul_add_c(a[0],b[2],c3,c1,c2);
426	r[2]=c3;
427	c3=0;
428	mul_add_c(a[0],b[3],c1,c2,c3);
429	mul_add_c(a[1],b[2],c1,c2,c3);
430	mul_add_c(a[2],b[1],c1,c2,c3);
431	mul_add_c(a[3],b[0],c1,c2,c3);
432	r[3]=c1;
433	c1=0;
434	mul_add_c(a[3],b[1],c2,c3,c1);
435	mul_add_c(a[2],b[2],c2,c3,c1);
436	mul_add_c(a[1],b[3],c2,c3,c1);
437	r[4]=c2;
438	c2=0;
439	mul_add_c(a[2],b[3],c3,c1,c2);
440	mul_add_c(a[3],b[2],c3,c1,c2);
441	r[5]=c3;
442	c3=0;
443	mul_add_c(a[3],b[3],c1,c2,c3);
444	r[6]=c1;
445	r[7]=c2;
446	}
447
448	void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
449	{
450	BN_ULONG c1,c2,c3;
451
452	c1=0;
453	c2=0;
454	c3=0;
455	sqr_add_c(a,0,c1,c2,c3);
456	r[0]=c1;
457	c1=0;
458	sqr_add_c2(a,1,0,c2,c3,c1);
459	r[1]=c2;
460	c2=0;
461	sqr_add_c(a,1,c3,c1,c2);
462	sqr_add_c2(a,2,0,c3,c1,c2);
463	r[2]=c3;
464	c3=0;
465	sqr_add_c2(a,3,0,c1,c2,c3);
466	sqr_add_c2(a,2,1,c1,c2,c3);
467	r[3]=c1;
468	c1=0;
469	sqr_add_c(a,2,c2,c3,c1);
470	sqr_add_c2(a,3,1,c2,c3,c1);
471	sqr_add_c2(a,4,0,c2,c3,c1);
472	r[4]=c2;
473	c2=0;
474	sqr_add_c2(a,5,0,c3,c1,c2);
475	sqr_add_c2(a,4,1,c3,c1,c2);
476	sqr_add_c2(a,3,2,c3,c1,c2);
477	r[5]=c3;
478	c3=0;
479	sqr_add_c(a,3,c1,c2,c3);
480	sqr_add_c2(a,4,2,c1,c2,c3);
481	sqr_add_c2(a,5,1,c1,c2,c3);
482	sqr_add_c2(a,6,0,c1,c2,c3);
483	r[6]=c1;
484	c1=0;
485	sqr_add_c2(a,7,0,c2,c3,c1);
486	sqr_add_c2(a,6,1,c2,c3,c1);
487	sqr_add_c2(a,5,2,c2,c3,c1);
488	sqr_add_c2(a,4,3,c2,c3,c1);
489	r[7]=c2;
490	c2=0;
491	sqr_add_c(a,4,c3,c1,c2);
492	sqr_add_c2(a,5,3,c3,c1,c2);
493	sqr_add_c2(a,6,2,c3,c1,c2);
494	sqr_add_c2(a,7,1,c3,c1,c2);
495	r[8]=c3;
496	c3=0;
497	sqr_add_c2(a,7,2,c1,c2,c3);
498	sqr_add_c2(a,6,3,c1,c2,c3);
499	sqr_add_c2(a,5,4,c1,c2,c3);
500	r[9]=c1;
501	c1=0;
502	sqr_add_c(a,5,c2,c3,c1);
503	sqr_add_c2(a,6,4,c2,c3,c1);
504	sqr_add_c2(a,7,3,c2,c3,c1);
505	r[10]=c2;
506	c2=0;
507	sqr_add_c2(a,7,4,c3,c1,c2);
508	sqr_add_c2(a,6,5,c3,c1,c2);
509	r[11]=c3;
510	c3=0;
511	sqr_add_c(a,6,c1,c2,c3);
512	sqr_add_c2(a,7,5,c1,c2,c3);
513	r[12]=c1;
514	c1=0;
515	sqr_add_c2(a,7,6,c2,c3,c1);
516	r[13]=c2;
517	c2=0;
518	sqr_add_c(a,7,c3,c1,c2);
519	r[14]=c3;
520	r[15]=c1;
521	}
522
523	void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
524	{
525	BN_ULONG c1,c2,c3;
526
527	c1=0;
528	c2=0;
529	c3=0;
530	sqr_add_c(a,0,c1,c2,c3);
531	r[0]=c1;
532	c1=0;
533	sqr_add_c2(a,1,0,c2,c3,c1);
534	r[1]=c2;
535	c2=0;
536	sqr_add_c(a,1,c3,c1,c2);
537	sqr_add_c2(a,2,0,c3,c1,c2);
538	r[2]=c3;
539	c3=0;
540	sqr_add_c2(a,3,0,c1,c2,c3);
541	sqr_add_c2(a,2,1,c1,c2,c3);
542	r[3]=c1;
543	c1=0;
544	sqr_add_c(a,2,c2,c3,c1);
545	sqr_add_c2(a,3,1,c2,c3,c1);
546	r[4]=c2;
547	c2=0;
548	sqr_add_c2(a,3,2,c3,c1,c2);
549	r[5]=c3;
550	c3=0;
551	sqr_add_c(a,3,c1,c2,c3);
552	r[6]=c1;
553	r[7]=c2;
554	}