1 files changed, 246 insertions, 76 deletions
diff --git a/src/lib/libcrypto/bn/bn_asm.c b/src/lib/libcrypto/bn/bn_asm.c
index 99bc2de491..c43c91cc09 100644
--- a/src/lib/libcrypto/bn/bn_asm.c
+++ b/src/lib/libcrypto/bn/bn_asm.c
@@ -75,6 +75,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
        assert(num >= 0);
        if (num <= 0) return(c1);
+#ifndef OPENSSL_SMALL_FOOTPRINT
        while (num&~3)
                {
                mul_add(rp[0],ap[0],w,c1);
@@ -83,11 +84,11 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
                mul_add(rp[3],ap[3],w,c1);
                ap+=4; rp+=4; num-=4;
                }
-        if (num)
+#endif
+        while (num)
                {
-                mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
+                mul_add(rp[0],ap[0],w,c1);
-                mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
+                ap++; rp++; num--;
-                mul_add(rp[2],ap[2],w,c1); return c1;
                }
        
        return(c1);
@@ -100,6 +101,7 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
        assert(num >= 0);
        if (num <= 0) return(c1);
+#ifndef OPENSSL_SMALL_FOOTPRINT
        while (num&~3)
                {
                mul(rp[0],ap[0],w,c1);
@@ -108,11 +110,11 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
                mul(rp[3],ap[3],w,c1);
                ap+=4; rp+=4; num-=4;
                }
-        if (num)
+#endif
+        while (num)
                {
-                mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
+                mul(rp[0],ap[0],w,c1);
-                mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
+                ap++; rp++; num--;
-                mul(rp[2],ap[2],w,c1);
                }
        return(c1);
        } 
@@ -121,6 +123,8 @@ void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
        {
        assert(n >= 0);
        if (n <= 0) return;
+#ifndef OPENSSL_SMALL_FOOTPRINT
        while (n&~3)
                {
                sqr(r[0],r[1],a[0]);
@@ -129,11 +133,11 @@ void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
                sqr(r[6],r[7],a[3]);
                a+=4; r+=8; n-=4;
                }
-        if (n)
+#endif
+        while (n)
                {
-                sqr(r[0],r[1],a[0]); if (--n == 0) return;
+                sqr(r[0],r[1],a[0]);
-                sqr(r[2],r[3],a[1]); if (--n == 0) return;
+                a++; r+=2; n--;
-                sqr(r[4],r[5],a[2]);
                }
        }
@@ -150,18 +154,20 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
        bl=LBITS(w);
        bh=HBITS(w);
-        for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+        while (num&~3)
                {
                mul_add(rp[0],ap[0],bl,bh,c);
-                if (--num == 0) break;
                mul_add(rp[1],ap[1],bl,bh,c);
-                if (--num == 0) break;
                mul_add(rp[2],ap[2],bl,bh,c);
-                if (--num == 0) break;
                mul_add(rp[3],ap[3],bl,bh,c);
-                if (--num == 0) break;
+                ap+=4; rp+=4; num-=4;
-                ap+=4;
+                }
-                rp+=4;
+#endif
+        while (num)
+                {
+                mul_add(rp[0],ap[0],bl,bh,c);
+                ap++; rp++; num--;
                }
        return(c);
        } 
@@ -177,18 +183,20 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
        bl=LBITS(w);
        bh=HBITS(w);
-        for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+        while (num&~3)
                {
                mul(rp[0],ap[0],bl,bh,carry);
-                if (--num == 0) break;
                mul(rp[1],ap[1],bl,bh,carry);
-                if (--num == 0) break;
                mul(rp[2],ap[2],bl,bh,carry);
-                if (--num == 0) break;
                mul(rp[3],ap[3],bl,bh,carry);
-                if (--num == 0) break;
+                ap+=4; rp+=4; num-=4;
-                ap+=4;
+                }
-                rp+=4;
+#endif
+        while (num)
+                {
+                mul(rp[0],ap[0],bl,bh,carry);
+                ap++; rp++; num--;
                }
        return(carry);
        } 
@@ -197,22 +205,21 @@ void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
        {
        assert(n >= 0);
        if (n <= 0) return;
-        for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+        while (n&~3)
                {
                sqr64(r[0],r[1],a[0]);
-                if (--n == 0) break;
                sqr64(r[2],r[3],a[1]);
-                if (--n == 0) break;
                sqr64(r[4],r[5],a[2]);
-                if (--n == 0) break;
                sqr64(r[6],r[7],a[3]);
-                if (--n == 0) break;
+                a+=4; r+=8; n-=4;
+                }
-                a+=4;
+#endif
-                r+=8;
+        while (n)
+                {
+                sqr64(r[0],r[1],a[0]);
+                a++; r+=2; n--;
                }
        }
@@ -303,31 +310,30 @@ BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
        assert(n >= 0);
        if (n <= 0) return((BN_ULONG)0);
-        for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+        while (n&~3)
                {
                ll+=(BN_ULLONG)a[0]+b[0];
                r[0]=(BN_ULONG)ll&BN_MASK2;
                ll>>=BN_BITS2;
-                if (--n <= 0) break;
                ll+=(BN_ULLONG)a[1]+b[1];
                r[1]=(BN_ULONG)ll&BN_MASK2;
                ll>>=BN_BITS2;
-                if (--n <= 0) break;
                ll+=(BN_ULLONG)a[2]+b[2];
                r[2]=(BN_ULONG)ll&BN_MASK2;
                ll>>=BN_BITS2;
-                if (--n <= 0) break;
                ll+=(BN_ULLONG)a[3]+b[3];
                r[3]=(BN_ULONG)ll&BN_MASK2;
                ll>>=BN_BITS2;
-                if (--n <= 0) break;
+                a+=4; b+=4; r+=4; n-=4;
+                }
-                a+=4;
+#endif
-                b+=4;
+        while (n)
-                r+=4;
+                {
+                ll+=(BN_ULLONG)a[0]+b[0];
+                r[0]=(BN_ULONG)ll&BN_MASK2;
+                ll>>=BN_BITS2;
+                a++; b++; r++; n--;
                }
        return((BN_ULONG)ll);
        }
@@ -340,7 +346,8 @@ BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
        if (n <= 0) return((BN_ULONG)0);
        c=0;
-        for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+        while (n&~3)
                {
                t=a[0];
                t=(t+c)&BN_MASK2;
@@ -348,35 +355,36 @@ BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
                l=(t+b[0])&BN_MASK2;
                c+=(l < t);
                r[0]=l;
-                if (--n <= 0) break;
                t=a[1];
                t=(t+c)&BN_MASK2;
                c=(t < c);
                l=(t+b[1])&BN_MASK2;
                c+=(l < t);
                r[1]=l;
-                if (--n <= 0) break;
                t=a[2];
                t=(t+c)&BN_MASK2;
                c=(t < c);
                l=(t+b[2])&BN_MASK2;
                c+=(l < t);
                r[2]=l;
-                if (--n <= 0) break;
                t=a[3];
                t=(t+c)&BN_MASK2;
                c=(t < c);
                l=(t+b[3])&BN_MASK2;
                c+=(l < t);
                r[3]=l;
-                if (--n <= 0) break;
+                a+=4; b+=4; r+=4; n-=4;
+                }
-                a+=4;
+#endif
-                b+=4;
+        while(n)
-                r+=4;
+                {
+                t=a[0];
+                t=(t+c)&BN_MASK2;
+                c=(t < c);
+                l=(t+b[0])&BN_MASK2;
+                c+=(l < t);
+                r[0]=l;
+                a++; b++; r++; n--;
                }
        return((BN_ULONG)c);
        }
@@ -390,36 +398,35 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
        assert(n >= 0);
        if (n <= 0) return((BN_ULONG)0);
-        for (;;)
+#ifndef OPENSSL_SMALL_FOOTPRINT
+        while (n&~3)
                {
                t1=a[0]; t2=b[0];
                r[0]=(t1-t2-c)&BN_MASK2;
                if (t1 != t2) c=(t1 < t2);
-                if (--n <= 0) break;
                t1=a[1]; t2=b[1];
                r[1]=(t1-t2-c)&BN_MASK2;
                if (t1 != t2) c=(t1 < t2);
-                if (--n <= 0) break;
                t1=a[2]; t2=b[2];
                r[2]=(t1-t2-c)&BN_MASK2;
                if (t1 != t2) c=(t1 < t2);
-                if (--n <= 0) break;
                t1=a[3]; t2=b[3];
                r[3]=(t1-t2-c)&BN_MASK2;
                if (t1 != t2) c=(t1 < t2);
-                if (--n <= 0) break;
+                a+=4; b+=4; r+=4; n-=4;
+                }
-                a+=4;
+#endif
-                b+=4;
+        while (n)
-                r+=4;
+                {
+                t1=a[0]; t2=b[0];
+                r[0]=(t1-t2-c)&BN_MASK2;
+                if (t1 != t2) c=(t1 < t2);
+                a++; b++; r++; n--;
                }
        return(c);
        }
-#ifdef BN_MUL_COMBA
+#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
 #undef bn_mul_comba8
 #undef bn_mul_comba4
@@ -820,18 +827,134 @@ void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
        r[6]=c1;
        r[7]=c2;
        }
+#ifdef OPENSSL_NO_ASM
+#ifdef OPENSSL_BN_ASM_MONT
+#include <alloca.h>
+/*
+ * bn_mul_mont() writes a num-word result to rp, computed from the
+ * num-word inputs ap, bp and np and from the single word *n0p. Going
+ * by its name this is a word-serial Montgomery multiplication, with np
+ * the modulus -- NOTE(review): confirm, and confirm the relation that
+ * *n0p must have to np; this code does not show it. The function
+ * always returns 1 (the stub variant returns 0 to tell the caller that
+ * nothing was computed). It uses num+2 words of alloca() scratch,
+ * which it zeroes before returning. NOTE(review): num is assumed to be
+ * at least 1 -- bp[0], tp[num-1] and np[num-1] are accessed
+ * unconditionally.
+ *
+ * This is essentially a reference implementation, which may or may not
+ * result in a performance improvement. E.g. on IA-32 this routine was
+ * observed to give 40% faster rsa1024 private key operations and 10%
+ * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
+ * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
+ * reference implementation, one to be used as a starting point for
+ * platform-specific assembler. The numbers mentioned apply to
+ * compiler-generated code compiled with and without
+ * -DOPENSSL_BN_ASM_MONT and can vary not only from platform to
+ * platform, but even between compiler versions. Assembler vs.
+ * assembler improvement coefficients can [and are known to] differ
+ * and are to be documented elsewhere.
+ */
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
+        {
+        BN_ULONG c0,c1,ml,*tp,n0;
+#ifdef mul64
+        BN_ULONG mh;    /* high half of the current multiplier; only the mul64 macro variants take it */
+#endif
+        volatile BN_ULONG *vp;  /* volatile alias of tp, used only for the final zeroing */
+        int i=0,j;      /* i must be 0 here: the "goto enter" below skips the for-loop initialiser */
+#if 0   /* template for platform-specific implementation */
+        if (ap==bp)     return bn_sqr_mont(rp,ap,np,n0p,num);
+#endif
+        vp = tp = alloca((num+2)*sizeof(BN_ULONG));     /* NOTE(review): result is not checked and the size is not bounded */
+        n0 = *n0p;
+        c0 = 0;
+        ml = bp[0];     /* first pass: tp = ap * bp[0], done with mul() rather than mul_add() */
+#ifdef mul64
+        mh = HBITS(ml);
+        ml = LBITS(ml);
+        for (j=0;j<num;++j)
+                mul(tp[j],ap[j],ml,mh,c0);
+#else
+        for (j=0;j<num;++j)
+                mul(tp[j],ap[j],ml,c0);
+#endif
+        tp[num]   = c0; /* carry out of the first pass becomes word num */
+        tp[num+1] = 0;
+        goto enter;     /* join the loop at its second half; the for-increment then continues with i=1 */
+        for(i=0;i<num;i++)
+                {
+                c0 = 0;
+                ml = bp[i];     /* first half: tp += ap * bp[i] */
+#ifdef mul64
+                mh = HBITS(ml);
+                ml = LBITS(ml);
+                for (j=0;j<num;++j)
+                        mul_add(tp[j],ap[j],ml,mh,c0);
+#else
+                for (j=0;j<num;++j)
+                        mul_add(tp[j],ap[j],ml,c0);
+#endif
+                c1 = (tp[num] + c0)&BN_MASK2;   /* fold the carry into word num ... */
+                tp[num]   = c1;
+                tp[num+1] = (c1<c0?1:0);        /* ... and record the wrap-around of that addition in word num+1 */
+        enter:  /* second half: add ml*np to tp and store the sum one word lower */
+                c1  = tp[0];
+                ml = (c1*n0)&BN_MASK2;  /* presumably chosen so that tp[0]+ml*np[0] has a zero low word -- TODO confirm */
+                c0 = 0;
+#ifdef mul64
+                mh = HBITS(ml);
+                ml = LBITS(ml);
+                mul_add(c1,np[0],ml,mh,c0);     /* the low word left in c1 is not stored; only the carry c0 is used */
+#else
+                mul_add(c1,ml,np[0],c0);        /* the low word left in c1 is not stored; only the carry c0 is used */
+#endif
+                for(j=1;j<num;j++)
+                        {
+                        c1 = tp[j];
+#ifdef mul64
+                        mul_add(c1,np[j],ml,mh,c0);
+#else
+                        mul_add(c1,ml,np[j],c0);
+#endif
+                        tp[j-1] = c1&BN_MASK2;  /* written to index j-1: this is the shift down by one word */
+                        }
+                c1        = (tp[num] + c0)&BN_MASK2;
+                tp[num-1] = c1;
+                tp[num]   = tp[num+1] + (c1<c0?1:0);    /* new top word: earlier overflow plus the wrap-around of this addition */
+                }
+        if (tp[num]!=0 || tp[num-1]>=np[num-1]) /* try the subtraction only if tp may be >= np */
+                {
+                c0 = bn_sub_words(rp,tp,np,num);        /* rp = tp - np over num words; c0 receives the borrow */
+                if (tp[num]!=0 || c0==0)        /* keep the difference if there was an overflow word or no borrow */
+                        {
+                        for(i=0;i<num+2;i++)    vp[i] = 0;      /* zero the scratch words */
+                        return 1;
+                        }
+                }
+        for(i=0;i<num;i++)      rp[i] = tp[i],  vp[i] = 0;      /* otherwise tp is the result: copy it out, zeroing as we go */
+        vp[num]   = 0;
+        vp[num+1] = 0;
+        return 1;
+        }
+#else
+/*
+ * Stub used when OPENSSL_BN_ASM_MONT is not defined: it ignores all of
+ * its arguments and returns 0. A return value of 0 indicates that the
+ * multiplication/convolution was not performed, which signals the
+ * caller to fall back on the alternative/original code path.
+ */
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
+{       return 0;       }
+#endif /* OPENSSL_BN_ASM_MONT */
+#endif
 #else /* !BN_MUL_COMBA */
 /* hmm... is it faster just to do a multiply? */
 #undef bn_sqr_comba4
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
        {
        BN_ULONG t[8];
        bn_sqr_normal(r,a,4,t);
        }
 #undef bn_sqr_comba8
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
        {
        BN_ULONG t[16];
        bn_sqr_normal(r,a,8,t);
@@ -857,4 +980,51 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
        r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
        }
+#ifdef OPENSSL_NO_ASM
+#ifdef OPENSSL_BN_ASM_MONT
+#include <alloca.h>
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
+        {       /*
+                 * Variant built on bn_mul_add_words(). Writes a num-word
+                 * result to rp, computed from ap, bp, np (num words each)
+                 * and the word *n0p, and always returns 1. Going by its
+                 * name this is a Montgomery multiplication with modulus
+                 * np -- NOTE(review): confirm, including the relation
+                 * required between *n0p and np. num is assumed to be at
+                 * least 1 (tp[num-1] and np[num-1] are read
+                 * unconditionally).
+                 */
+        BN_ULONG c0,c1,*tp,n0=*n0p;
+        volatile BN_ULONG *vp;  /* volatile alias of tp, used only for the final zeroing */
+        int i=0,j;
+        vp = tp = alloca((num+2)*sizeof(BN_ULONG));     /* NOTE(review): result is not checked and the size is not bounded */
+        for(i=0;i<=num;i++)     tp[i]=0;        /* clears words 0..num; word num+1 is assigned in the loop before it is read */
+        for(i=0;i<num;i++)
+                {
+                c0         = bn_mul_add_words(tp,ap,num,bp[i]);        /* tp += ap * bp[i]; c0 is the returned carry word */
+                c1         = (tp[num] + c0)&BN_MASK2;
+                tp[num]    = c1;
+                tp[num+1]  = (c1<c0?1:0);       /* wrap-around of the addition into word num */
+                c0         = bn_mul_add_words(tp,np,num,tp[0]*n0);     /* tp += np * (tp[0]*n0); presumably zeroes tp[0] -- TODO confirm */
+                c1         = (tp[num] + c0)&BN_MASK2;
+                tp[num]    = c1;
+                tp[num+1] += (c1<c0?1:0);
+                for(j=0;j<=num;j++)     tp[j]=tp[j+1];  /* shift down by one word, discarding tp[0] */
+                }
+        if (tp[num]!=0 || tp[num-1]>=np[num-1]) /* try the subtraction only if tp may be >= np */
+                {
+                c0 = bn_sub_words(rp,tp,np,num);        /* rp = tp - np over num words; c0 receives the borrow */
+                if (tp[num]!=0 || c0==0)        /* keep the difference if there was an overflow word or no borrow */
+                        {
+                        for(i=0;i<num+2;i++)    vp[i] = 0;      /* zero the scratch words */
+                        return 1;
+                        }
+                }
+        for(i=0;i<num;i++)      rp[i] = tp[i],  vp[i] = 0;      /* otherwise tp is the result: copy it out, zeroing as we go */
+        vp[num]   = 0;
+        vp[num+1] = 0;
+        return 1;
+        }
+#else
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
+{       return 0;       }       /* stub for builds without OPENSSL_BN_ASM_MONT: ignores its arguments; 0 means nothing was computed */
+#endif /* OPENSSL_BN_ASM_MONT */
+#endif
 #endif /* !BN_MUL_COMBA */

diff --git a/src/lib/libcrypto/bn/bn_asm.c b/src/lib/libcrypto/bn/bn_asm.c index 99bc2de491..c43c91cc09 100644 --- a/src/lib/libcrypto/bn/bn_asm.c +++ b/src/lib/libcrypto/bn/bn_asm.c
@@ -75,6 +75,7 @@ BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
75	assert(num >= 0);	75	assert(num >= 0);
76	if (num <= 0) return(c1);	76	if (num <= 0) return(c1);
77		77
		78	#ifndef OPENSSL_SMALL_FOOTPRINT
78	while (num&~3)	79	while (num&~3)
79	{	80	{
80	mul_add(rp[0],ap[0],w,c1);	81	mul_add(rp[0],ap[0],w,c1);
@@ -83,11 +84,11 @@ BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
83	mul_add(rp[3],ap[3],w,c1);	84	mul_add(rp[3],ap[3],w,c1);
84	ap+=4; rp+=4; num-=4;	85	ap+=4; rp+=4; num-=4;
85	}	86	}
86	if (num)	87	#endif
		88	while (num)
87	{	89	{
88	mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;	90	mul_add(rp[0],ap[0],w,c1);
89	mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;	91	ap++; rp++; num--;
90	mul_add(rp[2],ap[2],w,c1); return c1;
91	}	92	}
92		93
93	return(c1);	94	return(c1);
@@ -100,6 +101,7 @@ BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
100	assert(num >= 0);	101	assert(num >= 0);
101	if (num <= 0) return(c1);	102	if (num <= 0) return(c1);
102		103
		104	#ifndef OPENSSL_SMALL_FOOTPRINT
103	while (num&~3)	105	while (num&~3)
104	{	106	{
105	mul(rp[0],ap[0],w,c1);	107	mul(rp[0],ap[0],w,c1);
@@ -108,11 +110,11 @@ BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
108	mul(rp[3],ap[3],w,c1);	110	mul(rp[3],ap[3],w,c1);
109	ap+=4; rp+=4; num-=4;	111	ap+=4; rp+=4; num-=4;
110	}	112	}
111	if (num)	113	#endif
		114	while (num)
112	{	115	{
113	mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;	116	mul(rp[0],ap[0],w,c1);
114	mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;	117	ap++; rp++; num--;
115	mul(rp[2],ap[2],w,c1);
116	}	118	}
117	return(c1);	119	return(c1);
118	}	120	}
@@ -121,6 +123,8 @@ void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
121	{	123	{
122	assert(n >= 0);	124	assert(n >= 0);
123	if (n <= 0) return;	125	if (n <= 0) return;
		126
		127	#ifndef OPENSSL_SMALL_FOOTPRINT
124	while (n&~3)	128	while (n&~3)
125	{	129	{
126	sqr(r[0],r[1],a[0]);	130	sqr(r[0],r[1],a[0]);
@@ -129,11 +133,11 @@ void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
129	sqr(r[6],r[7],a[3]);	133	sqr(r[6],r[7],a[3]);
130	a+=4; r+=8; n-=4;	134	a+=4; r+=8; n-=4;
131	}	135	}
132	if (n)	136	#endif
		137	while (n)
133	{	138	{
134	sqr(r[0],r[1],a[0]); if (--n == 0) return;	139	sqr(r[0],r[1],a[0]);
135	sqr(r[2],r[3],a[1]); if (--n == 0) return;	140	a++; r+=2; n--;
136	sqr(r[4],r[5],a[2]);
137	}	141	}
138	}	142	}
139		143
@@ -150,18 +154,20 @@ BN_ULONG bn_mul_add_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
150	bl=LBITS(w);	154	bl=LBITS(w);
151	bh=HBITS(w);	155	bh=HBITS(w);
152		156
153	for (;;)	157	#ifndef OPENSSL_SMALL_FOOTPRINT
		158	while (num&~3)
154	{	159	{
155	mul_add(rp[0],ap[0],bl,bh,c);	160	mul_add(rp[0],ap[0],bl,bh,c);
156	if (--num == 0) break;
157	mul_add(rp[1],ap[1],bl,bh,c);	161	mul_add(rp[1],ap[1],bl,bh,c);
158	if (--num == 0) break;
159	mul_add(rp[2],ap[2],bl,bh,c);	162	mul_add(rp[2],ap[2],bl,bh,c);
160	if (--num == 0) break;
161	mul_add(rp[3],ap[3],bl,bh,c);	163	mul_add(rp[3],ap[3],bl,bh,c);
162	if (--num == 0) break;	164	ap+=4; rp+=4; num-=4;
163	ap+=4;	165	}
164	rp+=4;	166	#endif
		167	while (num)
		168	{
		169	mul_add(rp[0],ap[0],bl,bh,c);
		170	ap++; rp++; num--;
165	}	171	}
166	return(c);	172	return(c);
167	}	173	}
@@ -177,18 +183,20 @@ BN_ULONG bn_mul_words(BN_ULONG rp, const BN_ULONG ap, int num, BN_ULONG w)
177	bl=LBITS(w);	183	bl=LBITS(w);
178	bh=HBITS(w);	184	bh=HBITS(w);
179		185
180	for (;;)	186	#ifndef OPENSSL_SMALL_FOOTPRINT
		187	while (num&~3)
181	{	188	{
182	mul(rp[0],ap[0],bl,bh,carry);	189	mul(rp[0],ap[0],bl,bh,carry);
183	if (--num == 0) break;
184	mul(rp[1],ap[1],bl,bh,carry);	190	mul(rp[1],ap[1],bl,bh,carry);
185	if (--num == 0) break;
186	mul(rp[2],ap[2],bl,bh,carry);	191	mul(rp[2],ap[2],bl,bh,carry);
187	if (--num == 0) break;
188	mul(rp[3],ap[3],bl,bh,carry);	192	mul(rp[3],ap[3],bl,bh,carry);
189	if (--num == 0) break;	193	ap+=4; rp+=4; num-=4;
190	ap+=4;	194	}
191	rp+=4;	195	#endif
		196	while (num)
		197	{
		198	mul(rp[0],ap[0],bl,bh,carry);
		199	ap++; rp++; num--;
192	}	200	}
193	return(carry);	201	return(carry);
194	}	202	}
@@ -197,22 +205,21 @@ void bn_sqr_words(BN_ULONG r, const BN_ULONG a, int n)
197	{	205	{
198	assert(n >= 0);	206	assert(n >= 0);
199	if (n <= 0) return;	207	if (n <= 0) return;
200	for (;;)	208
		209	#ifndef OPENSSL_SMALL_FOOTPRINT
		210	while (n&~3)
201	{	211	{
202	sqr64(r[0],r[1],a[0]);	212	sqr64(r[0],r[1],a[0]);
203	if (--n == 0) break;
204
205	sqr64(r[2],r[3],a[1]);	213	sqr64(r[2],r[3],a[1]);
206	if (--n == 0) break;
207
208	sqr64(r[4],r[5],a[2]);	214	sqr64(r[4],r[5],a[2]);
209	if (--n == 0) break;
210
211	sqr64(r[6],r[7],a[3]);	215	sqr64(r[6],r[7],a[3]);
212	if (--n == 0) break;	216	a+=4; r+=8; n-=4;
213		217	}
214	a+=4;	218	#endif
215	r+=8;	219	while (n)
		220	{
		221	sqr64(r[0],r[1],a[0]);
		222	a++; r+=2; n--;
216	}	223	}
217	}	224	}
218		225
@@ -303,31 +310,30 @@ BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b, int n)
303	assert(n >= 0);	310	assert(n >= 0);
304	if (n <= 0) return((BN_ULONG)0);	311	if (n <= 0) return((BN_ULONG)0);
305		312
306	for (;;)	313	#ifndef OPENSSL_SMALL_FOOTPRINT
		314	while (n&~3)
307	{	315	{
308	ll+=(BN_ULLONG)a[0]+b[0];	316	ll+=(BN_ULLONG)a[0]+b[0];
309	r[0]=(BN_ULONG)ll&BN_MASK2;	317	r[0]=(BN_ULONG)ll&BN_MASK2;
310	ll>>=BN_BITS2;	318	ll>>=BN_BITS2;
311	if (--n <= 0) break;
312
313	ll+=(BN_ULLONG)a[1]+b[1];	319	ll+=(BN_ULLONG)a[1]+b[1];
314	r[1]=(BN_ULONG)ll&BN_MASK2;	320	r[1]=(BN_ULONG)ll&BN_MASK2;
315	ll>>=BN_BITS2;	321	ll>>=BN_BITS2;
316	if (--n <= 0) break;
317
318	ll+=(BN_ULLONG)a[2]+b[2];	322	ll+=(BN_ULLONG)a[2]+b[2];
319	r[2]=(BN_ULONG)ll&BN_MASK2;	323	r[2]=(BN_ULONG)ll&BN_MASK2;
320	ll>>=BN_BITS2;	324	ll>>=BN_BITS2;
321	if (--n <= 0) break;
322
323	ll+=(BN_ULLONG)a[3]+b[3];	325	ll+=(BN_ULLONG)a[3]+b[3];
324	r[3]=(BN_ULONG)ll&BN_MASK2;	326	r[3]=(BN_ULONG)ll&BN_MASK2;
325	ll>>=BN_BITS2;	327	ll>>=BN_BITS2;
326	if (--n <= 0) break;	328	a+=4; b+=4; r+=4; n-=4;
327		329	}
328	a+=4;	330	#endif
329	b+=4;	331	while (n)
330	r+=4;	332	{
		333	ll+=(BN_ULLONG)a[0]+b[0];
		334	r[0]=(BN_ULONG)ll&BN_MASK2;
		335	ll>>=BN_BITS2;
		336	a++; b++; r++; n--;
331	}	337	}
332	return((BN_ULONG)ll);	338	return((BN_ULONG)ll);
333	}	339	}
@@ -340,7 +346,8 @@ BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b, int n)
340	if (n <= 0) return((BN_ULONG)0);	346	if (n <= 0) return((BN_ULONG)0);
341		347
342	c=0;	348	c=0;
343	for (;;)	349	#ifndef OPENSSL_SMALL_FOOTPRINT
		350	while (n&~3)
344	{	351	{
345	t=a[0];	352	t=a[0];
346	t=(t+c)&BN_MASK2;	353	t=(t+c)&BN_MASK2;
@@ -348,35 +355,36 @@ BN_ULONG bn_add_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b, int n)
348	l=(t+b[0])&BN_MASK2;	355	l=(t+b[0])&BN_MASK2;
349	c+=(l < t);	356	c+=(l < t);
350	r[0]=l;	357	r[0]=l;
351	if (--n <= 0) break;
352
353	t=a[1];	358	t=a[1];
354	t=(t+c)&BN_MASK2;	359	t=(t+c)&BN_MASK2;
355	c=(t < c);	360	c=(t < c);
356	l=(t+b[1])&BN_MASK2;	361	l=(t+b[1])&BN_MASK2;
357	c+=(l < t);	362	c+=(l < t);
358	r[1]=l;	363	r[1]=l;
359	if (--n <= 0) break;
360
361	t=a[2];	364	t=a[2];
362	t=(t+c)&BN_MASK2;	365	t=(t+c)&BN_MASK2;
363	c=(t < c);	366	c=(t < c);
364	l=(t+b[2])&BN_MASK2;	367	l=(t+b[2])&BN_MASK2;
365	c+=(l < t);	368	c+=(l < t);
366	r[2]=l;	369	r[2]=l;
367	if (--n <= 0) break;
368
369	t=a[3];	370	t=a[3];
370	t=(t+c)&BN_MASK2;	371	t=(t+c)&BN_MASK2;
371	c=(t < c);	372	c=(t < c);
372	l=(t+b[3])&BN_MASK2;	373	l=(t+b[3])&BN_MASK2;
373	c+=(l < t);	374	c+=(l < t);
374	r[3]=l;	375	r[3]=l;
375	if (--n <= 0) break;	376	a+=4; b+=4; r+=4; n-=4;
376		377	}
377	a+=4;	378	#endif
378	b+=4;	379	while(n)
379	r+=4;	380	{
		381	t=a[0];
		382	t=(t+c)&BN_MASK2;
		383	c=(t < c);
		384	l=(t+b[0])&BN_MASK2;
		385	c+=(l < t);
		386	r[0]=l;
		387	a++; b++; r++; n--;
380	}	388	}
381	return((BN_ULONG)c);	389	return((BN_ULONG)c);
382	}	390	}
@@ -390,36 +398,35 @@ BN_ULONG bn_sub_words(BN_ULONG r, const BN_ULONG a, const BN_ULONG *b, int n)
390	assert(n >= 0);	398	assert(n >= 0);
391	if (n <= 0) return((BN_ULONG)0);	399	if (n <= 0) return((BN_ULONG)0);
392		400
393	for (;;)	401	#ifndef OPENSSL_SMALL_FOOTPRINT
		402	while (n&~3)
394	{	403	{
395	t1=a[0]; t2=b[0];	404	t1=a[0]; t2=b[0];
396	r[0]=(t1-t2-c)&BN_MASK2;	405	r[0]=(t1-t2-c)&BN_MASK2;
397	if (t1 != t2) c=(t1 < t2);	406	if (t1 != t2) c=(t1 < t2);
398	if (--n <= 0) break;
399
400	t1=a[1]; t2=b[1];	407	t1=a[1]; t2=b[1];
401	r[1]=(t1-t2-c)&BN_MASK2;	408	r[1]=(t1-t2-c)&BN_MASK2;
402	if (t1 != t2) c=(t1 < t2);	409	if (t1 != t2) c=(t1 < t2);
403	if (--n <= 0) break;
404
405	t1=a[2]; t2=b[2];	410	t1=a[2]; t2=b[2];
406	r[2]=(t1-t2-c)&BN_MASK2;	411	r[2]=(t1-t2-c)&BN_MASK2;
407	if (t1 != t2) c=(t1 < t2);	412	if (t1 != t2) c=(t1 < t2);
408	if (--n <= 0) break;
409
410	t1=a[3]; t2=b[3];	413	t1=a[3]; t2=b[3];
411	r[3]=(t1-t2-c)&BN_MASK2;	414	r[3]=(t1-t2-c)&BN_MASK2;
412	if (t1 != t2) c=(t1 < t2);	415	if (t1 != t2) c=(t1 < t2);
413	if (--n <= 0) break;	416	a+=4; b+=4; r+=4; n-=4;
414		417	}
415	a+=4;	418	#endif
416	b+=4;	419	while (n)
417	r+=4;	420	{
		421	t1=a[0]; t2=b[0];
		422	r[0]=(t1-t2-c)&BN_MASK2;
		423	if (t1 != t2) c=(t1 < t2);
		424	a++; b++; r++; n--;
418	}	425	}
419	return(c);	426	return(c);
420	}	427	}
421		428
422	#ifdef BN_MUL_COMBA	429	#if defined(BN_MUL_COMBA) && !defined(OPENSSL_SMALL_FOOTPRINT)
423		430
424	#undef bn_mul_comba8	431	#undef bn_mul_comba8
425	#undef bn_mul_comba4	432	#undef bn_mul_comba4
@@ -820,18 +827,134 @@ void bn_sqr_comba4(BN_ULONG r, const BN_ULONG a)
820	r[6]=c1;	827	r[6]=c1;
821	r[7]=c2;	828	r[7]=c2;
822	}	829	}
		830
		831	#ifdef OPENSSL_NO_ASM
		832	#ifdef OPENSSL_BN_ASM_MONT
		833	#include <alloca.h>
		834	/*
		835	* This is essentially reference implementation, which may or may not
		836	* result in performance improvement. E.g. on IA-32 this routine was
		837	* observed to give 40% faster rsa1024 private key operations and 10%
		838	* faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
		839	* by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
		840	* reference implementation, one to be used as starting point for
		841	* platform-specific assembler. Mentioned numbers apply to compiler
		842	* generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
		843	* can vary not only from platform to platform, but even for compiler
		844	* versions. Assembler vs. assembler improvement coefficients can
		845	* [and are known to] differ and are to be documented elsewhere.
		846	*/
		847	int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
		848	{
		849	BN_ULONG c0,c1,ml,*tp,n0;
		850	#ifdef mul64
		851	BN_ULONG mh;
		852	#endif
		853	volatile BN_ULONG *vp;
		854	int i=0,j;
		855
		856	#if 0 /* template for platform-specific implementation */
		857	if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num);
		858	#endif
		859	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
		860
		861	n0 = *n0p;
		862
		863	c0 = 0;
		864	ml = bp[0];
		865	#ifdef mul64
		866	mh = HBITS(ml);
		867	ml = LBITS(ml);
		868	for (j=0;j<num;++j)
		869	mul(tp[j],ap[j],ml,mh,c0);
		870	#else
		871	for (j=0;j<num;++j)
		872	mul(tp[j],ap[j],ml,c0);
		873	#endif
		874
		875	tp[num] = c0;
		876	tp[num+1] = 0;
		877	goto enter;
		878
		879	for(i=0;i<num;i++)
		880	{
		881	c0 = 0;
		882	ml = bp[i];
		883	#ifdef mul64
		884	mh = HBITS(ml);
		885	ml = LBITS(ml);
		886	for (j=0;j<num;++j)
		887	mul_add(tp[j],ap[j],ml,mh,c0);
		888	#else
		889	for (j=0;j<num;++j)
		890	mul_add(tp[j],ap[j],ml,c0);
		891	#endif
		892	c1 = (tp[num] + c0)&BN_MASK2;
		893	tp[num] = c1;
		894	tp[num+1] = (c1<c0?1:0);
		895	enter:
		896	c1 = tp[0];
		897	ml = (c1*n0)&BN_MASK2;
		898	c0 = 0;
		899	#ifdef mul64
		900	mh = HBITS(ml);
		901	ml = LBITS(ml);
		902	mul_add(c1,np[0],ml,mh,c0);
		903	#else
		904	mul_add(c1,ml,np[0],c0);
		905	#endif
		906	for(j=1;j<num;j++)
		907	{
		908	c1 = tp[j];
		909	#ifdef mul64
		910	mul_add(c1,np[j],ml,mh,c0);
		911	#else
		912	mul_add(c1,ml,np[j],c0);
		913	#endif
		914	tp[j-1] = c1&BN_MASK2;
		915	}
		916	c1 = (tp[num] + c0)&BN_MASK2;
		917	tp[num-1] = c1;
		918	tp[num] = tp[num+1] + (c1<c0?1:0);
		919	}
		920
		921	if (tp[num]!=0 || tp[num-1]>=np[num-1])
		922	{
		923	c0 = bn_sub_words(rp,tp,np,num);
		924	if (tp[num]!=0 || c0==0)
		925	{
		926	for(i=0;i<num+2;i++) vp[i] = 0;
		927	return 1;
		928	}
		929	}
		930	for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0;
		931	vp[num] = 0;
		932	vp[num+1] = 0;
		933	return 1;
		934	}
		935	#else
		936	/*
		937	* Return value of 0 indicates that multiplication/convolution was not
		938	* performed to signal the caller to fall down to alternative/original
		939	* code-path.
		940	*/
		941	int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
		942	{ return 0; }
		943	#endif /* OPENSSL_BN_ASM_MONT */
		944	#endif
		945
823	#else /* !BN_MUL_COMBA */	946	#else /* !BN_MUL_COMBA */
824		947
825	/* hmm... is it faster just to do a multiply? */	948	/* hmm... is it faster just to do a multiply? */
826	#undef bn_sqr_comba4	949	#undef bn_sqr_comba4
827	void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)	950	void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
828	{	951	{
829	BN_ULONG t[8];	952	BN_ULONG t[8];
830	bn_sqr_normal(r,a,4,t);	953	bn_sqr_normal(r,a,4,t);
831	}	954	}
832		955
833	#undef bn_sqr_comba8	956	#undef bn_sqr_comba8
834	void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)	957	void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
835	{	958	{
836	BN_ULONG t[16];	959	BN_ULONG t[16];
837	bn_sqr_normal(r,a,8,t);	960	bn_sqr_normal(r,a,8,t);
@@ -857,4 +980,51 @@ void bn_mul_comba8(BN_ULONG r, BN_ULONG a, BN_ULONG *b)
857	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);	980	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
858	}	981	}
859		982
		983	#ifdef OPENSSL_NO_ASM
		984	#ifdef OPENSSL_BN_ASM_MONT
		985	#include <alloca.h>
		986	int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num)
		987	{
		988	BN_ULONG c0,c1,*tp,n0=*n0p;
		989	volatile BN_ULONG *vp;
		990	int i=0,j;
		991
		992	vp = tp = alloca((num+2)*sizeof(BN_ULONG));
		993
		994	for(i=0;i<=num;i++) tp[i]=0;
		995
		996	for(i=0;i<num;i++)
		997	{
		998	c0 = bn_mul_add_words(tp,ap,num,bp[i]);
		999	c1 = (tp[num] + c0)&BN_MASK2;
		1000	tp[num] = c1;
		1001	tp[num+1] = (c1<c0?1:0);
		1002
		1003	c0 = bn_mul_add_words(tp,np,num,tp[0]*n0);
		1004	c1 = (tp[num] + c0)&BN_MASK2;
		1005	tp[num] = c1;
		1006	tp[num+1] += (c1<c0?1:0);
		1007	for(j=0;j<=num;j++) tp[j]=tp[j+1];
		1008	}
		1009
		1010	if (tp[num]!=0 || tp[num-1]>=np[num-1])
		1011	{
		1012	c0 = bn_sub_words(rp,tp,np,num);
		1013	if (tp[num]!=0 || c0==0)
		1014	{
		1015	for(i=0;i<num+2;i++) vp[i] = 0;
		1016	return 1;
		1017	}
		1018	}
		1019	for(i=0;i<num;i++) rp[i] = tp[i], vp[i] = 0;
		1020	vp[num] = 0;
		1021	vp[num+1] = 0;
		1022	return 1;
		1023	}
		1024	#else
		1025	int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num)
		1026	{ return 0; }
		1027	#endif /* OPENSSL_BN_ASM_MONT */
		1028	#endif
		1029
860	#endif /* !BN_MUL_COMBA */	1030	#endif /* !BN_MUL_COMBA */