From 15b5d84f9da2ce4bfae8580e56e34a859f74ad71 Mon Sep 17 00:00:00 2001
From: markus <>
Date: Thu, 5 Sep 2002 12:51:50 +0000
Subject: import openssl-0.9.7-beta1

---
 src/lib/libcrypto/bn/asm/bn-586.pl   |  295 ++++-
 src/lib/libcrypto/bn/asm/pa-risc2.s  | 2024 +++++++++++++++++++++++++++-------
 src/lib/libcrypto/bn/asm/pa-risc2W.s |    2 +-
 src/lib/libcrypto/bn/bn.h            |  290 +++--
 src/lib/libcrypto/bn/bn_add.c        |  206 +++-
 src/lib/libcrypto/bn/bn_asm.c        |  178 +--
 src/lib/libcrypto/bn/bn_blind.c      |   47 +-
 src/lib/libcrypto/bn/bn_ctx.c        |   17 +-
 src/lib/libcrypto/bn/bn_div.c        |  221 ++--
 src/lib/libcrypto/bn/bn_err.c        |  142 ++-
 src/lib/libcrypto/bn/bn_exp.c        |  600 ++++++----
 src/lib/libcrypto/bn/bn_exp2.c       |  382 ++++---
 src/lib/libcrypto/bn/bn_gcd.c        |  389 ++++++-
 src/lib/libcrypto/bn/bn_lcl.h        |  291 ++++-
 src/lib/libcrypto/bn/bn_lib.c        |  571 +++++++---
 src/lib/libcrypto/bn/bn_mod.c        |  251 ++++-
 src/lib/libcrypto/bn/bn_mont.c       |  335 +++---
 src/lib/libcrypto/bn/bn_mpi.c        |   11 +-
 src/lib/libcrypto/bn/bn_mul.c        | 1158 +++++++++++++++++--
 src/lib/libcrypto/bn/bn_prime.c      |  459 ++++----
 src/lib/libcrypto/bn/bn_prime.h      |    4 +-
 src/lib/libcrypto/bn/bn_prime.pl     |   71 +-
 src/lib/libcrypto/bn/bn_print.c      |   67 +-
 src/lib/libcrypto/bn/bn_rand.c       |  216 +++-
 src/lib/libcrypto/bn/bn_recp.c       |  183 ++-
 src/lib/libcrypto/bn/bn_shift.c      |   27 +-
 src/lib/libcrypto/bn/bn_sqr.c        |  214 +++-
 src/lib/libcrypto/bn/bn_word.c       |   47 +-
 28 files changed, 6630 insertions(+), 2068 deletions(-)

(limited to 'src/lib/libcrypto/bn')

diff --git a/src/lib/libcrypto/bn/asm/bn-586.pl b/src/lib/libcrypto/bn/asm/bn-586.pl
index 19d425ee96..33f6125920 100644
--- a/src/lib/libcrypto/bn/asm/bn-586.pl
+++ b/src/lib/libcrypto/bn/asm/bn-586.pl
@@ -1,18 +1,17 @@
-#!/usr/bin/perl
-#
-
 #!/usr/local/bin/perl
 
 push(@INC,"perlasm","../../perlasm");
 require "x86asm.pl";
 
-&asm_init($ARGV[0],"bn-586.pl");
+&asm_init($ARGV[0],$0);
 
 &bn_mul_add_words("bn_mul_add_words");
 &bn_mul_words("bn_mul_words");
 &bn_sqr_words("bn_sqr_words");
-&bn_div64("bn_div64");
+&bn_div_words("bn_div_words");
 &bn_add_words("bn_add_words");
+&bn_sub_words("bn_sub_words");
+&bn_sub_part_words("bn_sub_part_words");
 
 &asm_finish();
 
@@ -228,7 +227,7 @@ sub bn_sqr_words
 	&function_end($name);
 	}
 
-sub bn_div64
+sub bn_div_words
 	{
 	local($name)=@_;
 
@@ -302,12 +301,292 @@ sub bn_add_words
 		 &add($tmp1,$tmp2);
 		&adc($c,0);
 		 &dec($num) if ($i != 6);
-		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *a
+		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
 		 &jz(&label("aw_end")) if ($i != 6);
 		}
 	&set_label("aw_end",0);
 
-	&mov("eax",$c);
+#	&mov("eax",$c);		# $c is "eax"
+
+	&function_end($name);
+	}
+
+sub bn_sub_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$a="esi";
+	$b="edi";
+	$c="eax";
+	$r="ebx";
+	$tmp1="ecx";
+	$tmp2="edx";
+	$num="ebp";
+
+	&mov($r,&wparam(0));	# get r
+	 &mov($a,&wparam(1));	# get a
+	&mov($b,&wparam(2));	# get b
+	 &mov($num,&wparam(3));	# get num
+	&xor($c,$c);		# clear carry
+	 &and($num,0xfffffff8);	# num / 8
+
+	&jz(&label("aw_finish"));
+
+	&set_label("aw_loop",0);
+	for ($i=0; $i<8; $i++)
+		{
+		&comment("Round $i");
+
+		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+		&sub($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &sub($tmp1,$tmp2);
+		&adc($c,0);
+		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+		}
+
+	&comment("");
+	&add($a,32);
+	 &add($b,32);
+	&add($r,32);
+	 &sub($num,8);
+	&jnz(&label("aw_loop"));
+
+	&set_label("aw_finish",0);
+	&mov($num,&wparam(3));	# get num
+	&and($num,7);
+	 &jz(&label("aw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+		&sub($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &sub($tmp1,$tmp2);
+		&adc($c,0);
+		 &dec($num) if ($i != 6);
+		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+		 &jz(&label("aw_end")) if ($i != 6);
+		}
+	&set_label("aw_end",0);
+
+#	&mov("eax",$c);		# $c is "eax"
+
+	&function_end($name);
+	}
+
+sub bn_sub_part_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$a="esi";
+	$b="edi";
+	$c="eax";
+	$r="ebx";
+	$tmp1="ecx";
+	$tmp2="edx";
+	$num="ebp";
+
+	&mov($r,&wparam(0));	# get r
+	 &mov($a,&wparam(1));	# get a
+	&mov($b,&wparam(2));	# get b
+	 &mov($num,&wparam(3));	# get num
+	&xor($c,$c);		# clear carry
+	 &and($num,0xfffffff8);	# num / 8
+
+	&jz(&label("aw_finish"));
+
+	&set_label("aw_loop",0);
+	for ($i=0; $i<8; $i++)
+		{
+		&comment("Round $i");
+
+		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+		&sub($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &sub($tmp1,$tmp2);
+		&adc($c,0);
+		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+		}
+
+	&comment("");
+	&add($a,32);
+	 &add($b,32);
+	&add($r,32);
+	 &sub($num,8);
+	&jnz(&label("aw_loop"));
+
+	&set_label("aw_finish",0);
+	&mov($num,&wparam(3));	# get num
+	&and($num,7);
+	 &jz(&label("aw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov($tmp1,&DWP(0,$a,"",0));	# *a
+		 &mov($tmp2,&DWP(0,$b,"",0));# *b
+		&sub($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &sub($tmp1,$tmp2);
+		&adc($c,0);
+		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
+		&add($a, 4);
+		&add($b, 4);
+		&add($r, 4);
+		 &dec($num) if ($i != 6);
+		 &jz(&label("aw_end")) if ($i != 6);
+		}
+	&set_label("aw_end",0);
+
+	&cmp(&wparam(4),0);
+	&je(&label("pw_end"));
+
+	&mov($num,&wparam(4));	# get dl
+	&cmp($num,0);
+	&je(&label("pw_end"));
+	&jge(&label("pw_pos"));
+
+	&comment("pw_neg");
+	&mov($tmp2,0);
+	&sub($tmp2,$num);
+	&mov($num,$tmp2);
+	&and($num,0xfffffff8);	# num / 8
+	&jz(&label("pw_neg_finish"));
+
+	&set_label("pw_neg_loop",0);
+	for ($i=0; $i<8; $i++)
+	{
+	    &comment("dl<0 Round $i");
+
+	    &mov($tmp1,0);
+	    &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+	    &sub($tmp1,$c);
+	    &mov($c,0);
+	    &adc($c,$c);
+	    &sub($tmp1,$tmp2);
+	    &adc($c,0);
+	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+	}
+	    
+	&comment("");
+	&add($b,32);
+	&add($r,32);
+	&sub($num,8);
+	&jnz(&label("pw_neg_loop"));
+	    
+	&set_label("pw_neg_finish",0);
+	&mov($tmp2,&wparam(4));	# get dl
+	&mov($num,0);
+	&sub($num,$tmp2);
+	&and($num,7);
+	&jz(&label("pw_end"));
+	    
+	for ($i=0; $i<7; $i++)
+	{
+	    &comment("dl<0 Tail Round $i");
+	    &mov($tmp1,0);
+	    &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+	    &sub($tmp1,$c);
+	    &mov($c,0);
+	    &adc($c,$c);
+	    &sub($tmp1,$tmp2);
+	    &adc($c,0);
+	    &dec($num) if ($i != 6);
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &jz(&label("pw_end")) if ($i != 6);
+	}
+
+	&jmp(&label("pw_end"));
+	
+	&set_label("pw_pos",0);
+	
+	&and($num,0xfffffff8);	# num / 8
+	&jz(&label("pw_pos_finish"));
+
+	&set_label("pw_pos_loop",0);
+
+	for ($i=0; $i<8; $i++)
+	{
+	    &comment("dl>0 Round $i");
+
+	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+	    &sub($tmp1,$c);
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &jnc(&label("pw_nc".$i));
+	}
+	    
+	&comment("");
+	&add($a,32);
+	&add($r,32);
+	&sub($num,8);
+	&jnz(&label("pw_pos_loop"));
+	    
+	&set_label("pw_pos_finish",0);
+	&mov($num,&wparam(4));	# get dl
+	&and($num,7);
+	&jz(&label("pw_end"));
+	    
+	for ($i=0; $i<7; $i++)
+	{
+	    &comment("dl>0 Tail Round $i");
+	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+	    &sub($tmp1,$c);
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &jnc(&label("pw_tail_nc".$i));
+	    &dec($num) if ($i != 6);
+	    &jz(&label("pw_end")) if ($i != 6);
+	}
+	&mov($c,1);
+	&jmp(&label("pw_end"));
+
+	&set_label("pw_nc_loop",0);
+	for ($i=0; $i<8; $i++)
+	{
+	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &set_label("pw_nc".$i,0);
+	}
+	    
+	&comment("");
+	&add($a,32);
+	&add($r,32);
+	&sub($num,8);
+	&jnz(&label("pw_nc_loop"));
+	    
+	&mov($num,&wparam(4));	# get dl
+	&and($num,7);
+	&jz(&label("pw_nc_end"));
+	    
+	for ($i=0; $i<7; $i++)
+	{
+	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &set_label("pw_tail_nc".$i,0);
+	    &dec($num) if ($i != 6);
+	    &jz(&label("pw_nc_end")) if ($i != 6);
+	}
+
+	&set_label("pw_nc_end",0);
+	&mov($c,0);
+
+	&set_label("pw_end",0);
+
+#	&mov("eax",$c);		# $c is "eax"
 
 	&function_end($name);
 	}
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2.s b/src/lib/libcrypto/bn/asm/pa-risc2.s
index c2725996a4..af9730d062 100644
--- a/src/lib/libcrypto/bn/asm/pa-risc2.s
+++ b/src/lib/libcrypto/bn/asm/pa-risc2.s
@@ -1,416 +1,1618 @@
-	.SPACE $PRIVATE$
-	.SUBSPA $DATA$,QUAD=1,ALIGN=8,ACCESS=31
-	.SUBSPA $BSS$,QUAD=1,ALIGN=8,ACCESS=31,ZERO,SORT=82
-	.SPACE $TEXT$
-	.SUBSPA $LIT$,QUAD=0,ALIGN=8,ACCESS=44
-	.SUBSPA $CODE$,QUAD=0,ALIGN=8,ACCESS=44,CODE_ONLY
-	.IMPORT $global$,DATA
-	.IMPORT $$dyncall,MILLICODE
-; gcc_compiled.:
-	.SPACE $TEXT$
-	.SUBSPA $CODE$
-
-	.align 4
-	.EXPORT bn_mul_add_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR
+;
+; PA-RISC 2.0 implementation of bn_asm code, based on the
+; 64-bit version of the code.  This code is effectively the
+; same as the 64-bit version except the register model is
+; slightly different given all values must be 32-bit between
+; function calls.  Thus the 64-bit return values are returned
+; in %ret0 and %ret1 vs just %ret0 as is done in 64-bit
+;
+;
+; This code is approximately 2x faster than the C version
+; for RSA/DSA.
+;
+; See http://devresource.hp.com/  for more details on the PA-RISC
+; architecture.  Also see the book "PA-RISC 2.0 Architecture"
+; by Gerry Kane for information on the instruction set architecture.
+;
+; Code written by Chris Ruemmler (with some help from the HP C
+; compiler).
+;
+; The code compiles with HP's assembler
+;
+
+	.level	2.0N
+	.space	$TEXT$
+	.subspa	$CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
+
+;
+; Global Register definitions used for the routines.
+;
+; Some information about HP's runtime architecture for 32-bits.
+;
+; "Caller save" means the calling function must save the register
+; if it wants the register to be preserved.
+; "Callee save" means if a function uses the register, it must save
+; the value before using it.
+;
+; For the floating point registers 
+;
+;    "caller save" registers: fr4-fr11, fr22-fr31
+;    "callee save" registers: fr12-fr21
+;    "special" registers: fr0-fr3 (status and exception registers)
+;
+; For the integer registers
+;     value zero             :  r0
+;     "caller save" registers: r1,r19-r26
+;     "callee save" registers: r3-r18
+;     return register        :  r2  (rp)
+;     return values          ; r28,r29  (ret0,ret1)
+;     Stack pointer          ; r30  (sp) 
+;     millicode return ptr   ; r31  (also a caller save register)
+
+
+;
+; Arguments to the routines
+;
+r_ptr       .reg %r26
+a_ptr       .reg %r25
+b_ptr       .reg %r24
+num         .reg %r24
+n           .reg %r23
+
+;
+; Note that the "w" argument for bn_mul_add_words and bn_mul_words
+; is passed on the stack at a delta of -56 from the top of stack
+; as the routine is entered.
+;
+
+;
+; Globals used in some routines
+;
+
+top_overflow .reg %r23
+high_mask    .reg %r22    ; value 0xffffffff80000000L
+
+
+;------------------------------------------------------------------------------
+;
+; bn_mul_add_words
+;
+;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr, 
+;								int num, BN_ULONG w)
+;
+; arg0 = r_ptr
+; arg1 = a_ptr
+; arg2 = num
+; -56(sp) =  w
+;
+; Local register definitions
+;
+
+fm1          .reg %fr22
+fm           .reg %fr23
+ht_temp      .reg %fr24
+ht_temp_1    .reg %fr25
+lt_temp      .reg %fr26
+lt_temp_1    .reg %fr27
+fm1_1        .reg %fr28
+fm_1         .reg %fr29
+
+fw_h         .reg %fr7L
+fw_l         .reg %fr7R
+fw           .reg %fr7
+
+fht_0        .reg %fr8L
+flt_0        .reg %fr8R
+t_float_0    .reg %fr8
+
+fht_1        .reg %fr9L
+flt_1        .reg %fr9R
+t_float_1    .reg %fr9
+
+tmp_0        .reg %r31
+tmp_1        .reg %r21
+m_0          .reg %r20 
+m_1          .reg %r19 
+ht_0         .reg %r1  
+ht_1         .reg %r3
+lt_0         .reg %r4
+lt_1         .reg %r5
+m1_0         .reg %r6 
+m1_1         .reg %r7 
+rp_val       .reg %r8
+rp_val_1     .reg %r9
+
 bn_mul_add_words
-	.PROC
-	.CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=4
-	.ENTRY
-	stw %r2,-20(0,%r30)
-	stwm %r4,64(0,%r30)
-	copy %r24,%r31
-	stw %r3,-60(0,%r30)
-	ldi 0,%r20
-	ldo 12(%r26),%r2
-	stw %r23,-16(0,%r30)
-	copy %r25,%r3
-	ldo 12(%r3),%r1
-	fldws -16(0,%r30),%fr8L
-L$0010
-	copy %r20,%r25
-	ldi 0,%r24
-	fldws 0(0,%r3),%fr9L
-	ldw 0(0,%r26),%r19
-	xmpyu %fr8L,%fr9L,%fr9
-	fstds %fr9,-16(0,%r30)
-	copy %r19,%r23
-	ldw -16(0,%r30),%r28
-	ldw -12(0,%r30),%r29
-	ldi 0,%r22
-	add %r23,%r29,%r29
-	addc %r22,%r28,%r28
-	add %r25,%r29,%r29
-	addc %r24,%r28,%r28
-	copy %r28,%r21
-	ldi 0,%r20
-	copy %r21,%r20
-	addib,= -1,%r31,L$0011
-	stw %r29,0(0,%r26)
-	copy %r20,%r25
-	ldi 0,%r24
-	fldws -8(0,%r1),%fr9L
-	ldw -8(0,%r2),%r19
-	xmpyu %fr8L,%fr9L,%fr9
-	fstds %fr9,-16(0,%r30)
-	copy %r19,%r23
-	ldw -16(0,%r30),%r28
-	ldw -12(0,%r30),%r29
-	ldi 0,%r22
-	add %r23,%r29,%r29
-	addc %r22,%r28,%r28
-	add %r25,%r29,%r29
-	addc %r24,%r28,%r28
-	copy %r28,%r21
-	ldi 0,%r20
-	copy %r21,%r20
-	addib,= -1,%r31,L$0011
-	stw %r29,-8(0,%r2)
-	copy %r20,%r25
-	ldi 0,%r24
-	fldws -4(0,%r1),%fr9L
-	ldw -4(0,%r2),%r19
-	xmpyu %fr8L,%fr9L,%fr9
-	fstds %fr9,-16(0,%r30)
-	copy %r19,%r23
-	ldw -16(0,%r30),%r28
-	ldw -12(0,%r30),%r29
-	ldi 0,%r22
-	add %r23,%r29,%r29
-	addc %r22,%r28,%r28
-	add %r25,%r29,%r29
-	addc %r24,%r28,%r28
-	copy %r28,%r21
-	ldi 0,%r20
-	copy %r21,%r20
-	addib,= -1,%r31,L$0011
-	stw %r29,-4(0,%r2)
-	copy %r20,%r25
-	ldi 0,%r24
-	fldws 0(0,%r1),%fr9L
-	ldw 0(0,%r2),%r19
-	xmpyu %fr8L,%fr9L,%fr9
-	fstds %fr9,-16(0,%r30)
-	copy %r19,%r23
-	ldw -16(0,%r30),%r28
-	ldw -12(0,%r30),%r29
-	ldi 0,%r22
-	add %r23,%r29,%r29
-	addc %r22,%r28,%r28
-	add %r25,%r29,%r29
-	addc %r24,%r28,%r28
-	copy %r28,%r21
-	ldi 0,%r20
-	copy %r21,%r20
-	addib,= -1,%r31,L$0011
-	stw %r29,0(0,%r2)
-	ldo 16(%r1),%r1
-	ldo 16(%r3),%r3
-	ldo 16(%r2),%r2
-	bl L$0010,0
-	ldo 16(%r26),%r26
-L$0011
-	copy %r20,%r28
-	ldw -84(0,%r30),%r2
-	ldw -60(0,%r30),%r3
-	bv 0(%r2)
-	ldwm -64(0,%r30),%r4
-	.EXIT
-	.PROCEND
-	.align 4
-	.EXPORT bn_mul_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR
+	.export	bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
+	.proc
+	.callinfo frame=128
+    .entry
+	.align 64
+
+    STD     %r3,0(%sp)          ; save r3  
+    STD     %r4,8(%sp)          ; save r4  
+	NOP                         ; Needed to make the loop 16-byte aligned
+	NOP                         ; needed to make the loop 16-byte aligned
+
+    STD     %r5,16(%sp)         ; save r5  
+	NOP
+    STD     %r6,24(%sp)         ; save r6  
+    STD     %r7,32(%sp)         ; save r7  
+
+    STD     %r8,40(%sp)         ; save r8  
+    STD     %r9,48(%sp)         ; save r9  
+    COPY    %r0,%ret1           ; return 0 by default
+    DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32    
+
+    CMPIB,>= 0,num,bn_mul_add_words_exit  ; if (num <= 0) then exit
+	LDO     128(%sp),%sp        ; bump stack
+
+	;
+	; The loop is unrolled twice, so if there is only 1 number
+    ; then go straight to the cleanup code.
+	;
+	CMPIB,= 1,num,bn_mul_add_words_single_top
+	FLDD    -184(%sp),fw        ; (-56-128) load up w into fw (fw_h/fw_l)
+
+	;
+	; This loop is unrolled 2 times (64-byte aligned as well)
+	;
+	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
+    ; two 32-bit multiplies can be issued per cycle.
+    ; 
+bn_mul_add_words_unroll2
+
+    FLDD    0(a_ptr),t_float_0       ; load up 64-bit value (fr8L) ht(L)/lt(R)
+    FLDD    8(a_ptr),t_float_1       ; load up 64-bit value (fr8L) ht(L)/lt(R)
+    LDD     0(r_ptr),rp_val          ; rp[0]
+    LDD     8(r_ptr),rp_val_1        ; rp[1]
+
+    XMPYU   fht_0,fw_l,fm1           ; m1[0] = fht_0*fw_l
+    XMPYU   fht_1,fw_l,fm1_1         ; m1[1] = fht_1*fw_l
+    FSTD    fm1,-16(%sp)             ; -16(sp) = m1[0]
+    FSTD    fm1_1,-48(%sp)           ; -48(sp) = m1[1]
+
+    XMPYU   flt_0,fw_h,fm            ; m[0] = flt_0*fw_h
+    XMPYU   flt_1,fw_h,fm_1          ; m[1] = flt_1*fw_h
+    FSTD    fm,-8(%sp)               ; -8(sp) = m[0]
+    FSTD    fm_1,-40(%sp)            ; -40(sp) = m[1]
+
+    XMPYU   fht_0,fw_h,ht_temp       ; ht_temp   = fht_0*fw_h
+    XMPYU   fht_1,fw_h,ht_temp_1     ; ht_temp_1 = fht_1*fw_h
+    FSTD    ht_temp,-24(%sp)         ; -24(sp)   = ht_temp
+    FSTD    ht_temp_1,-56(%sp)       ; -56(sp)   = ht_temp_1
+
+    XMPYU   flt_0,fw_l,lt_temp       ; lt_temp = lt*fw_l
+    XMPYU   flt_1,fw_l,lt_temp_1     ; lt_temp = lt*fw_l
+    FSTD    lt_temp,-32(%sp)         ; -32(sp) = lt_temp 
+    FSTD    lt_temp_1,-64(%sp)       ; -64(sp) = lt_temp_1 
+
+    LDD     -8(%sp),m_0              ; m[0] 
+    LDD     -40(%sp),m_1             ; m[1]
+    LDD     -16(%sp),m1_0            ; m1[0]
+    LDD     -48(%sp),m1_1            ; m1[1]
+
+    LDD     -24(%sp),ht_0            ; ht[0]
+    LDD     -56(%sp),ht_1            ; ht[1]
+    ADD,L   m1_0,m_0,tmp_0           ; tmp_0 = m[0] + m1[0]; 
+    ADD,L   m1_1,m_1,tmp_1           ; tmp_1 = m[1] + m1[1]; 
+
+    LDD     -32(%sp),lt_0            
+    LDD     -64(%sp),lt_1            
+    CMPCLR,*>>= tmp_0,m1_0, %r0      ; if (m[0] < m1[0])
+    ADD,L   ht_0,top_overflow,ht_0   ; ht[0] += (1<<32)
+
+    CMPCLR,*>>= tmp_1,m1_1,%r0       ; if (m[1] < m1[1])
+    ADD,L   ht_1,top_overflow,ht_1   ; ht[1] += (1<<32)
+    EXTRD,U tmp_0,31,32,m_0          ; m[0]>>32  
+    DEPD,Z  tmp_0,31,32,m1_0         ; m1[0] = m[0]<<32 
+
+    EXTRD,U tmp_1,31,32,m_1          ; m[1]>>32  
+    DEPD,Z  tmp_1,31,32,m1_1         ; m1[1] = m[1]<<32 
+    ADD,L   ht_0,m_0,ht_0            ; ht[0]+= (m[0]>>32)
+    ADD,L   ht_1,m_1,ht_1            ; ht[1]+= (m[1]>>32)
+
+    ADD     lt_0,m1_0,lt_0           ; lt[0] = lt[0]+m1[0];
+	ADD,DC  ht_0,%r0,ht_0            ; ht[0]++
+    ADD     lt_1,m1_1,lt_1           ; lt[1] = lt[1]+m1[1];
+    ADD,DC  ht_1,%r0,ht_1            ; ht[1]++
+
+    ADD    %ret1,lt_0,lt_0           ; lt[0] = lt[0] + c;
+	ADD,DC  ht_0,%r0,ht_0            ; ht[0]++
+    ADD     lt_0,rp_val,lt_0         ; lt[0] = lt[0]+rp[0]
+    ADD,DC  ht_0,%r0,ht_0            ; ht[0]++
+
+	LDO    -2(num),num               ; num = num - 2;
+    ADD     ht_0,lt_1,lt_1           ; lt[1] = lt[1] + ht_0 (c);
+    ADD,DC  ht_1,%r0,ht_1            ; ht[1]++
+    STD     lt_0,0(r_ptr)            ; rp[0] = lt[0]
+
+    ADD     lt_1,rp_val_1,lt_1       ; lt[1] = lt[1]+rp[1]
+    ADD,DC  ht_1,%r0,%ret1           ; ht[1]++
+    LDO     16(a_ptr),a_ptr          ; a_ptr += 2
+
+    STD     lt_1,8(r_ptr)            ; rp[1] = lt[1]
+	CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
+    LDO     16(r_ptr),r_ptr          ; r_ptr += 2
+
+    CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
+
+	;
+	; Top of loop aligned on 64-byte boundary
+	;
+bn_mul_add_words_single_top
+    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8L) ht(L)/lt(R)
+    LDD     0(r_ptr),rp_val           ; rp[0]
+    LDO     8(a_ptr),a_ptr            ; a_ptr++
+    XMPYU   fht_0,fw_l,fm1            ; m1 = ht*fw_l
+    FSTD    fm1,-16(%sp)              ; -16(sp) = m1
+    XMPYU   flt_0,fw_h,fm             ; m = lt*fw_h
+    FSTD    fm,-8(%sp)                ; -8(sp) = m
+    XMPYU   fht_0,fw_h,ht_temp        ; ht_temp = ht*fw_h
+    FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht
+    XMPYU   flt_0,fw_l,lt_temp        ; lt_temp = lt*fw_l
+    FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt 
+
+    LDD     -8(%sp),m_0               
+    LDD    -16(%sp),m1_0              ; m1 = temp1 
+    ADD,L   m_0,m1_0,tmp_0            ; tmp_0 = m + m1; 
+    LDD     -24(%sp),ht_0             
+    LDD     -32(%sp),lt_0             
+
+    CMPCLR,*>>= tmp_0,m1_0,%r0        ; if (m < m1)
+    ADD,L   ht_0,top_overflow,ht_0    ; ht += (1<<32)
+
+    EXTRD,U tmp_0,31,32,m_0           ; m>>32  
+    DEPD,Z  tmp_0,31,32,m1_0          ; m1 = m<<32 
+
+    ADD,L   ht_0,m_0,ht_0             ; ht+= (m>>32)
+    ADD     lt_0,m1_0,tmp_0           ; tmp_0 = lt+m1;
+    ADD,DC  ht_0,%r0,ht_0             ; ht++
+    ADD     %ret1,tmp_0,lt_0          ; lt = lt + c;
+    ADD,DC  ht_0,%r0,ht_0             ; ht++
+    ADD     lt_0,rp_val,lt_0          ; lt = lt+rp[0]
+    ADD,DC  ht_0,%r0,%ret1            ; ht++
+    STD     lt_0,0(r_ptr)             ; rp[0] = lt
+
+bn_mul_add_words_exit
+    .EXIT
+	
+    EXTRD,U %ret1,31,32,%ret0         ; for 32-bit, return in ret0/ret1
+    LDD     -80(%sp),%r9              ; restore r9  
+    LDD     -88(%sp),%r8              ; restore r8  
+    LDD     -96(%sp),%r7              ; restore r7  
+    LDD     -104(%sp),%r6             ; restore r6  
+    LDD     -112(%sp),%r5             ; restore r5  
+    LDD     -120(%sp),%r4             ; restore r4  
+    BVE     (%rp)
+    LDD,MB  -128(%sp),%r3             ; restore r3
+	.PROCEND	;in=23,24,25,26,29;out=28;
+
+;----------------------------------------------------------------------------
+;
+;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+;
+; arg0 = rp
+; arg1 = ap
+; arg2 = num
+; w on stack at -56(sp)
+
 bn_mul_words
-	.PROC
-	.CALLINFO FRAME=64,CALLS,SAVE_RP,ENTRY_GR=3
-	.ENTRY
-	stw %r2,-20(0,%r30)
-	copy %r25,%r2
-	stwm %r4,64(0,%r30)
-	copy %r24,%r19
-	ldi 0,%r28
-	stw %r23,-16(0,%r30)
-	ldo 12(%r26),%r31
-	ldo 12(%r2),%r29
-	fldws -16(0,%r30),%fr8L
-L$0026
-	fldws 0(0,%r2),%fr9L
-	xmpyu %fr8L,%fr9L,%fr9
-	fstds %fr9,-16(0,%r30)
-	copy %r28,%r21
-	ldi 0,%r20
-	ldw -16(0,%r30),%r24
-	ldw -12(0,%r30),%r25
-	add %r21,%r25,%r25
-	addc %r20,%r24,%r24
-	copy %r24,%r23
-	ldi 0,%r22
-	copy %r23,%r28
-	addib,= -1,%r19,L$0027
-	stw %r25,0(0,%r26)
-	fldws -8(0,%r29),%fr9L
-	xmpyu %fr8L,%fr9L,%fr9
-	fstds %fr9,-16(0,%r30)
-	copy %r28,%r21
-	ldi 0,%r20
-	ldw -16(0,%r30),%r24
-	ldw -12(0,%r30),%r25
-	add %r21,%r25,%r25
-	addc %r20,%r24,%r24
-	copy %r24,%r23
-	ldi 0,%r22
-	copy %r23,%r28
-	addib,= -1,%r19,L$0027
-	stw %r25,-8(0,%r31)
-	fldws -4(0,%r29),%fr9L
-	xmpyu %fr8L,%fr9L,%fr9
-	fstds %fr9,-16(0,%r30)
-	copy %r28,%r21
-	ldi 0,%r20
-	ldw -16(0,%r30),%r24
-	ldw -12(0,%r30),%r25
-	add %r21,%r25,%r25
-	addc %r20,%r24,%r24
-	copy %r24,%r23
-	ldi 0,%r22
-	copy %r23,%r28
-	addib,= -1,%r19,L$0027
-	stw %r25,-4(0,%r31)
-	fldws 0(0,%r29),%fr9L
-	xmpyu %fr8L,%fr9L,%fr9
-	fstds %fr9,-16(0,%r30)
-	copy %r28,%r21
-	ldi 0,%r20
-	ldw -16(0,%r30),%r24
-	ldw -12(0,%r30),%r25
-	add %r21,%r25,%r25
-	addc %r20,%r24,%r24
-	copy %r24,%r23
-	ldi 0,%r22
-	copy %r23,%r28
-	addib,= -1,%r19,L$0027
-	stw %r25,0(0,%r31)
-	ldo 16(%r29),%r29
-	ldo 16(%r2),%r2
-	ldo 16(%r31),%r31
-	bl L$0026,0
-	ldo 16(%r26),%r26
-L$0027
-	ldw -84(0,%r30),%r2
-	bv 0(%r2)
-	ldwm -64(0,%r30),%r4
-	.EXIT
-	.PROCEND
-	.align 4
-	.EXPORT bn_sqr_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
+	.proc
+	.callinfo frame=128
+    .entry
+	.EXPORT	bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+	.align 64
+
+    STD     %r3,0(%sp)          ; save r3  
+    STD     %r4,8(%sp)          ; save r4  
+	NOP
+    STD     %r5,16(%sp)         ; save r5  
+
+    STD     %r6,24(%sp)         ; save r6  
+    STD     %r7,32(%sp)         ; save r7  
+    COPY    %r0,%ret1           ; return 0 by default
+    DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32    
+
+    CMPIB,>= 0,num,bn_mul_words_exit
+	LDO     128(%sp),%sp    ; bump stack
+
+	;
+	; See if only 1 word to do, thus just do cleanup
+	;
+	CMPIB,= 1,num,bn_mul_words_single_top
+	FLDD    -184(%sp),fw        ; (-56-128) load up w into fw (fw_h/fw_l)
+
+	;
+	; This loop is unrolled 2 times (64-byte aligned as well)
+	;
+	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
+    ; two 32-bit multiplies can be issued per cycle.
+    ; 
+bn_mul_words_unroll2
+
+    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8L) ht(L)/lt(R)
+    FLDD    8(a_ptr),t_float_1        ; load up 64-bit value (fr8L) ht(L)/lt(R)
+    XMPYU   fht_0,fw_l,fm1            ; m1[0] = fht_0*fw_l
+    XMPYU   fht_1,fw_l,fm1_1          ; m1[1] = ht*fw_l
+
+    FSTD    fm1,-16(%sp)              ; -16(sp) = m1
+    FSTD    fm1_1,-48(%sp)            ; -48(sp) = m1
+    XMPYU   flt_0,fw_h,fm             ; m = lt*fw_h
+    XMPYU   flt_1,fw_h,fm_1           ; m = lt*fw_h
+
+    FSTD    fm,-8(%sp)                ; -8(sp) = m
+    FSTD    fm_1,-40(%sp)             ; -40(sp) = m
+    XMPYU   fht_0,fw_h,ht_temp        ; ht_temp = fht_0*fw_h
+    XMPYU   fht_1,fw_h,ht_temp_1      ; ht_temp = ht*fw_h
+
+    FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht
+    FSTD    ht_temp_1,-56(%sp)        ; -56(sp) = ht
+    XMPYU   flt_0,fw_l,lt_temp        ; lt_temp = lt*fw_l
+    XMPYU   flt_1,fw_l,lt_temp_1      ; lt_temp = lt*fw_l
+
+    FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt 
+    FSTD    lt_temp_1,-64(%sp)        ; -64(sp) = lt 
+    LDD     -8(%sp),m_0               
+    LDD     -40(%sp),m_1              
+
+    LDD    -16(%sp),m1_0              
+    LDD    -48(%sp),m1_1              
+    LDD     -24(%sp),ht_0             
+    LDD     -56(%sp),ht_1             
+
+    ADD,L   m1_0,m_0,tmp_0            ; tmp_0 = m + m1; 
+    ADD,L   m1_1,m_1,tmp_1            ; tmp_1 = m + m1; 
+    LDD     -32(%sp),lt_0             
+    LDD     -64(%sp),lt_1             
+
+    CMPCLR,*>>= tmp_0,m1_0, %r0       ; if (m < m1)
+    ADD,L   ht_0,top_overflow,ht_0    ; ht += (1<<32)
+    CMPCLR,*>>= tmp_1,m1_1,%r0        ; if (m < m1)
+    ADD,L   ht_1,top_overflow,ht_1    ; ht += (1<<32)
+
+    EXTRD,U tmp_0,31,32,m_0           ; m>>32  
+    DEPD,Z  tmp_0,31,32,m1_0          ; m1 = m<<32 
+    EXTRD,U tmp_1,31,32,m_1           ; m>>32  
+    DEPD,Z  tmp_1,31,32,m1_1          ; m1 = m<<32 
+
+    ADD,L   ht_0,m_0,ht_0             ; ht+= (m>>32)
+    ADD,L   ht_1,m_1,ht_1             ; ht+= (m>>32)
+    ADD     lt_0,m1_0,lt_0            ; lt = lt+m1;
+	ADD,DC  ht_0,%r0,ht_0             ; ht++
+
+    ADD     lt_1,m1_1,lt_1            ; lt = lt+m1;
+    ADD,DC  ht_1,%r0,ht_1             ; ht++
+    ADD    %ret1,lt_0,lt_0            ; lt = lt + c (ret1);
+	ADD,DC  ht_0,%r0,ht_0             ; ht++
+
+    ADD     ht_0,lt_1,lt_1            ; lt = lt + c (ht_0)
+    ADD,DC  ht_1,%r0,ht_1             ; ht++
+    STD     lt_0,0(r_ptr)             ; rp[0] = lt
+    STD     lt_1,8(r_ptr)             ; rp[1] = lt
+
+	COPY    ht_1,%ret1                ; carry = ht
+	LDO    -2(num),num                ; num = num - 2;
+    LDO     16(a_ptr),a_ptr           ; ap += 2
+	CMPIB,<= 2,num,bn_mul_words_unroll2
+    LDO     16(r_ptr),r_ptr           ; rp++
+
+    CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
+
+	;
+	; Top of loop aligned on 64-byte boundary
+	;
+bn_mul_words_single_top
+    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8L) ht(L)/lt(R)
+
+    XMPYU   fht_0,fw_l,fm1            ; m1 = ht*fw_l
+    FSTD    fm1,-16(%sp)              ; -16(sp) = m1
+    XMPYU   flt_0,fw_h,fm             ; m = lt*fw_h
+    FSTD    fm,-8(%sp)                ; -8(sp) = m
+    XMPYU   fht_0,fw_h,ht_temp        ; ht_temp = ht*fw_h
+    FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht
+    XMPYU   flt_0,fw_l,lt_temp        ; lt_temp = lt*fw_l
+    FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt 
+
+    LDD     -8(%sp),m_0               
+    LDD    -16(%sp),m1_0              
+    ADD,L   m_0,m1_0,tmp_0            ; tmp_0 = m + m1; 
+    LDD     -24(%sp),ht_0             
+    LDD     -32(%sp),lt_0             
+
+    CMPCLR,*>>= tmp_0,m1_0,%r0        ; if (m < m1)
+    ADD,L   ht_0,top_overflow,ht_0    ; ht += (1<<32)
+
+    EXTRD,U tmp_0,31,32,m_0           ; m>>32  
+    DEPD,Z  tmp_0,31,32,m1_0          ; m1 = m<<32 
+
+    ADD,L   ht_0,m_0,ht_0             ; ht+= (m>>32)
+    ADD     lt_0,m1_0,lt_0            ; lt= lt+m1;
+    ADD,DC  ht_0,%r0,ht_0             ; ht++
+
+    ADD     %ret1,lt_0,lt_0           ; lt = lt + c;
+    ADD,DC  ht_0,%r0,ht_0             ; ht++
+
+    COPY    ht_0,%ret1                ; copy carry
+    STD     lt_0,0(r_ptr)             ; rp[0] = lt
+
+bn_mul_words_exit
+    .EXIT
+    EXTRD,U %ret1,31,32,%ret0           ; for 32-bit, return in ret0/ret1
+    LDD     -96(%sp),%r7              ; restore r7  
+    LDD     -104(%sp),%r6             ; restore r6  
+    LDD     -112(%sp),%r5             ; restore r5  
+    LDD     -120(%sp),%r4             ; restore r4  
+    BVE     (%rp)
+    LDD,MB  -128(%sp),%r3             ; restore r3
+	.PROCEND	
+
+;----------------------------------------------------------------------------
+;
+;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
+;
+; arg0 = rp
+; arg1 = ap
+; arg2 = num
+;
+
 bn_sqr_words
+	.proc
+	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+	.EXPORT	bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+    .entry
+	.align 64
+
+    STD     %r3,0(%sp)          ; save r3  
+    STD     %r4,8(%sp)          ; save r4  
+	NOP
+    STD     %r5,16(%sp)         ; save r5  
+
+    CMPIB,>= 0,num,bn_sqr_words_exit
+	LDO     128(%sp),%sp       ; bump stack
+
+	;
+	; If only 1, then go straight to cleanup
+	;
+	CMPIB,= 1,num,bn_sqr_words_single_top
+    DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L
+
+	;
+	; This loop is unrolled 2 times (64-byte aligned as well)
+	;
+
+bn_sqr_words_unroll2
+    FLDD    0(a_ptr),t_float_0        ; a[0]
+    FLDD    8(a_ptr),t_float_1        ; a[1]
+    XMPYU   fht_0,flt_0,fm            ; m[0]
+    XMPYU   fht_1,flt_1,fm_1          ; m[1]
+
+    FSTD    fm,-24(%sp)               ; store m[0]
+    FSTD    fm_1,-56(%sp)             ; store m[1]
+    XMPYU   flt_0,flt_0,lt_temp       ; lt[0]
+    XMPYU   flt_1,flt_1,lt_temp_1     ; lt[1]
+
+    FSTD    lt_temp,-16(%sp)          ; store lt[0]
+    FSTD    lt_temp_1,-48(%sp)        ; store lt[1]
+    XMPYU   fht_0,fht_0,ht_temp       ; ht[0]
+    XMPYU   fht_1,fht_1,ht_temp_1     ; ht[1]
+
+    FSTD    ht_temp,-8(%sp)           ; store ht[0]
+    FSTD    ht_temp_1,-40(%sp)        ; store ht[1]
+    LDD     -24(%sp),m_0             
+    LDD     -56(%sp),m_1              
+
+    AND     m_0,high_mask,tmp_0       ; m[0] & Mask
+    AND     m_1,high_mask,tmp_1       ; m[1] & Mask
+    DEPD,Z  m_0,30,31,m_0             ; m[0] << 32+1
+    DEPD,Z  m_1,30,31,m_1             ; m[1] << 32+1
+
+    LDD     -16(%sp),lt_0        
+    LDD     -48(%sp),lt_1        
+    EXTRD,U tmp_0,32,33,tmp_0         ; tmp_0 = m[0]&Mask >> 32-1
+    EXTRD,U tmp_1,32,33,tmp_1         ; tmp_1 = m[1]&Mask >> 32-1
+
+    LDD     -8(%sp),ht_0            
+    LDD     -40(%sp),ht_1           
+    ADD,L   ht_0,tmp_0,ht_0           ; ht[0] += tmp_0
+    ADD,L   ht_1,tmp_1,ht_1           ; ht[1] += tmp_1
+
+    ADD     lt_0,m_0,lt_0             ; lt = lt+m
+    ADD,DC  ht_0,%r0,ht_0             ; ht[0]++
+    STD     lt_0,0(r_ptr)             ; rp[0] = lt[0]
+    STD     ht_0,8(r_ptr)             ; rp[1] = ht[1]
+
+    ADD     lt_1,m_1,lt_1             ; lt = lt+m
+    ADD,DC  ht_1,%r0,ht_1             ; ht[1]++
+    STD     lt_1,16(r_ptr)            ; rp[2] = lt[1]
+    STD     ht_1,24(r_ptr)            ; rp[3] = ht[1]
+
+	LDO    -2(num),num                ; num = num - 2;
+    LDO     16(a_ptr),a_ptr           ; ap += 2
+	CMPIB,<= 2,num,bn_sqr_words_unroll2
+    LDO     32(r_ptr),r_ptr           ; rp += 4
+
+    CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
+
+	;
+	; Top of loop aligned on 64-byte boundary
+	;
+bn_sqr_words_single_top
+    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8L) ht(L)/lt(R)
+
+    XMPYU   fht_0,flt_0,fm            ; m
+    FSTD    fm,-24(%sp)               ; store m
+
+    XMPYU   flt_0,flt_0,lt_temp       ; lt
+    FSTD    lt_temp,-16(%sp)          ; store lt
+
+    XMPYU   fht_0,fht_0,ht_temp       ; ht
+    FSTD    ht_temp,-8(%sp)           ; store ht
+
+    LDD     -24(%sp),m_0              ; load m
+    AND     m_0,high_mask,tmp_0       ; m & Mask
+    DEPD,Z  m_0,30,31,m_0             ; m << 32+1
+    LDD     -16(%sp),lt_0             ; lt
+
+    LDD     -8(%sp),ht_0              ; ht
+    EXTRD,U tmp_0,32,33,tmp_0         ; tmp_0 = m&Mask >> 32-1
+    ADD     m_0,lt_0,lt_0             ; lt = lt+m
+    ADD,L   ht_0,tmp_0,ht_0           ; ht += tmp_0
+    ADD,DC  ht_0,%r0,ht_0             ; ht++
+
+    STD     lt_0,0(r_ptr)             ; rp[0] = lt
+    STD     ht_0,8(r_ptr)             ; rp[1] = ht
+
+bn_sqr_words_exit
+    .EXIT
+    LDD     -112(%sp),%r5       ; restore r5  
+    LDD     -120(%sp),%r4       ; restore r4  
+    BVE     (%rp)
+    LDD,MB  -128(%sp),%r3 
+	.PROCEND	;in=23,24,25,26,29;out=28;
+
+
+;----------------------------------------------------------------------------
+;
+;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+; Computes r[i] = a[i] + b[i] + carry for i = 0..n-1; returns the final carry.
+; arg0 = rp 
+; arg1 = ap
+; arg2 = bp 
+; arg3 = n
+
+t  .reg %r22   ; current word of a (plus carry-in)
+b  .reg %r21   ; current word of b
+l  .reg %r20   ; result word to store
+
+bn_add_words
+	.proc
+    .entry
+	.callinfo
+	.EXPORT	bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+	.align 64
+
+    CMPIB,>= 0,n,bn_add_words_exit ; nothing to do when n <= 0
+    COPY    %r0,%ret1           ; carry c = 0 (also the default return value)
+
+	;
+	; A single word skips to the tail; 2 or more words do the loop
+	;
+	CMPIB,= 1,n,bn_add_words_single_top
+	NOP
+
+	;
+	; This loop is unrolled 2 times (64-byte aligned as well)
+	;
+bn_add_words_unroll2
+	LDD     0(a_ptr),t                   ; t = a[0]
+	LDD     0(b_ptr),b                   ; b = b[0]
+	ADD     t,%ret1,t                    ; t = t+c;
+	ADD,DC  %r0,%r0,%ret1                ; set c to carry
+	ADD     t,b,l                        ; l = t + b[0]
+	ADD,DC  %ret1,%r0,%ret1              ; c+= carry
+	STD     l,0(r_ptr)                   ; r[0] = l
+
+	LDD     8(a_ptr),t                    ; t = a[1]
+	LDD     8(b_ptr),b                    ; b = b[1]
+	ADD     t,%ret1,t                     ; t = t+c;
+	ADD,DC  %r0,%r0,%ret1                 ; set c to carry
+	ADD     t,b,l                         ; l = t + b[1]
+	ADD,DC  %ret1,%r0,%ret1               ; c+= carry
+	STD     l,8(r_ptr)                    ; r[1] = l
+
+	LDO     -2(n),n                       ; n -= 2
+	LDO     16(a_ptr),a_ptr               ; ap += 2
+	LDO     16(b_ptr),b_ptr               ; bp += 2
+
+	CMPIB,<= 2,n,bn_add_words_unroll2     ; loop again while n >= 2
+	LDO     16(r_ptr),r_ptr               ; rp += 2
+
+    CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
+
+bn_add_words_single_top
+	LDD     0(a_ptr),t                ; t = a[0] (last remaining word)
+	LDD     0(b_ptr),b                ; b = b[0]
+
+	ADD     t,%ret1,t                 ; t = t+c;
+	ADD,DC  %r0,%r0,%ret1             ; set c to carry (could use CMPCLR??)
+	ADD     t,b,l                     ; l = t + b[0]
+	ADD,DC  %ret1,%r0,%ret1           ; c+= carry
+	STD     l,0(r_ptr)                ; r[0] = l
+
+bn_add_words_exit
+    .EXIT
+    BVE     (%rp)
+    EXTRD,U %ret1,31,32,%ret0           ; for 32-bit, return in ret0/ret1
+	.PROCEND	;in=23,24,25,26,29;out=28;
+
+;----------------------------------------------------------------------------
+;
+;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+; Computes r[i] = a[i] - b[i] - borrow for i = 0..n-1; returns the final borrow.
+; arg0 = rp 
+; arg1 = ap
+; arg2 = bp 
+; arg3 = n
+
+t1       .reg %r22   ; current word of a
+t2       .reg %r21   ; current word of b
+sub_tmp1 .reg %r20   ; difference word to store
+sub_tmp2 .reg %r19   ; candidate for the new borrow
+
+
+bn_sub_words
+	.proc
+	.callinfo 
+	.EXPORT	bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+    .entry
+	.align 64
+
+    CMPIB,>=  0,n,bn_sub_words_exit ; nothing to do when n <= 0
+    COPY    %r0,%ret1           ; borrow c = 0 (also the default return value)
+
+	;
+	; A single word skips to the tail; 2 or more words do the loop
+	;
+	CMPIB,= 1,n,bn_sub_words_single_top
+	NOP
+
+	;
+	; This loop is unrolled 2 times (64-byte aligned as well)
+	;
+bn_sub_words_unroll2
+	LDD     0(a_ptr),t1              ; t1 = a[0]
+	LDD     0(b_ptr),t2              ; t2 = b[0]
+	SUB     t1,t2,sub_tmp1           ; t3 = t1-t2;
+	SUB     sub_tmp1,%ret1,sub_tmp1  ; t3 = t3- c;
+
+	CMPCLR,*>> t1,t2,sub_tmp2        ; clear if t1 > t2
+	LDO      1(%r0),sub_tmp2         ; otherwise borrow candidate = 1
+	
+	CMPCLR,*= t1,t2,%r0              ; t1 == t2: keep the incoming borrow
+	COPY    sub_tmp2,%ret1           ; else c = borrow candidate
+	STD     sub_tmp1,0(r_ptr)        ; r[0] = t3
+
+	LDD     8(a_ptr),t1               ; t1 = a[1]
+	LDD     8(b_ptr),t2               ; t2 = b[1]
+	SUB     t1,t2,sub_tmp1            ; t3 = t1-t2;
+	SUB     sub_tmp1,%ret1,sub_tmp1   ; t3 = t3- c;
+	CMPCLR,*>> t1,t2,sub_tmp2         ; clear if t1 > t2
+	LDO      1(%r0),sub_tmp2          ; otherwise borrow candidate = 1
+	
+	CMPCLR,*= t1,t2,%r0               ; t1 == t2: keep the incoming borrow
+	COPY    sub_tmp2,%ret1            ; else c = borrow candidate
+	STD     sub_tmp1,8(r_ptr)         ; r[1] = t3
+
+	LDO     -2(n),n                   ; n -= 2
+	LDO     16(a_ptr),a_ptr           ; ap += 2
+	LDO     16(b_ptr),b_ptr           ; bp += 2
+
+	CMPIB,<= 2,n,bn_sub_words_unroll2 ; loop again while n >= 2
+	LDO     16(r_ptr),r_ptr           ; rp += 2
+
+    CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
+
+bn_sub_words_single_top
+	LDD     0(a_ptr),t1               ; t1 = a[0] (last remaining word)
+	LDD     0(b_ptr),t2               ; t2 = b[0]
+	SUB     t1,t2,sub_tmp1            ; t3 = t1-t2;
+	SUB     sub_tmp1,%ret1,sub_tmp1   ; t3 = t3- c;
+	CMPCLR,*>> t1,t2,sub_tmp2         ; clear if t1 > t2
+	LDO      1(%r0),sub_tmp2          ; otherwise borrow candidate = 1
+	
+	CMPCLR,*= t1,t2,%r0               ; t1 == t2: keep the incoming borrow
+	COPY    sub_tmp2,%ret1            ; else c = borrow candidate
+
+	STD     sub_tmp1,0(r_ptr)         ; r[0] = t3
+
+bn_sub_words_exit
+    .EXIT
+    BVE     (%rp)
+    EXTRD,U %ret1,31,32,%ret0           ; for 32-bit, return in ret0/ret1
+	.PROCEND	;in=23,24,25,26,29;out=28;
+
+;------------------------------------------------------------------------------
+;
+; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
+;
+; arg0 = h
+; arg1 = l
+; arg2 = d
+;
+; This is mainly just output from the HP C compiler.  
+;
+;------------------------------------------------------------------------------
+bn_div_words
 	.PROC
-	.CALLINFO FRAME=0,NO_CALLS
-	.ENTRY
-	ldo 28(%r26),%r19
-	ldo 12(%r25),%r28
-L$0042
-	fldws 0(0,%r25),%fr8L
-	fldws 0(0,%r25),%fr8R
-	xmpyu %fr8L,%fr8R,%fr8
-	fstds %fr8,-16(0,%r30)
-	ldw -16(0,%r30),%r22
-	ldw -12(0,%r30),%r23
-	stw %r23,0(0,%r26)
-	copy %r22,%r21
-	ldi 0,%r20
-	addib,= -1,%r24,L$0049
-	stw %r21,-24(0,%r19)
-	fldws -8(0,%r28),%fr8L
-	fldws -8(0,%r28),%fr8R
-	xmpyu %fr8L,%fr8R,%fr8
-	fstds %fr8,-16(0,%r30)
-	ldw -16(0,%r30),%r22
-	ldw -12(0,%r30),%r23
-	stw %r23,-20(0,%r19)
-	copy %r22,%r21
-	ldi 0,%r20
-	addib,= -1,%r24,L$0049
-	stw %r21,-16(0,%r19)
-	fldws -4(0,%r28),%fr8L
-	fldws -4(0,%r28),%fr8R
-	xmpyu %fr8L,%fr8R,%fr8
-	fstds %fr8,-16(0,%r30)
-	ldw -16(0,%r30),%r22
-	ldw -12(0,%r30),%r23
-	stw %r23,-12(0,%r19)
-	copy %r22,%r21
-	ldi 0,%r20
-	addib,= -1,%r24,L$0049
-	stw %r21,-8(0,%r19)
-	fldws 0(0,%r28),%fr8L
-	fldws 0(0,%r28),%fr8R
-	xmpyu %fr8L,%fr8R,%fr8
-	fstds %fr8,-16(0,%r30)
-	ldw -16(0,%r30),%r22
-	ldw -12(0,%r30),%r23
-	stw %r23,-4(0,%r19)
-	copy %r22,%r21
-	ldi 0,%r20
-	addib,= -1,%r24,L$0049
-	stw %r21,0(0,%r19)
-	ldo 16(%r28),%r28
-	ldo 16(%r25),%r25
-	ldo 32(%r19),%r19
-	bl L$0042,0
-	ldo 32(%r26),%r26
-L$0049
-	bv,n 0(%r2)
-	.EXIT
-	.PROCEND
-	.IMPORT BN_num_bits_word,CODE
-	.IMPORT fprintf,CODE
-	.IMPORT __iob,DATA
-	.SPACE $TEXT$
-	.SUBSPA $LIT$
-
-	.align 4
-L$C0000
-	.STRING "Division would overflow (%d)\x0a\x00"
-	.IMPORT abort,CODE
-	.SPACE $TEXT$
-	.SUBSPA $CODE$
-
-	.align 4
-	.EXPORT bn_div64,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR
-bn_div64
+	.EXPORT	bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
+	.IMPORT	BN_num_bits_word,CODE
+	.IMPORT	__iob,DATA
+	.IMPORT	fprintf,CODE
+	.IMPORT	abort,CODE
+	.IMPORT	$$div2U,MILLICODE
+	.CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
+        .ENTRY
+        STW     %r2,-20(%r30)   ;offset 0x8ec
+        STW,MA  %r3,192(%r30)   ;offset 0x8f0
+        STW     %r4,-188(%r30)  ;offset 0x8f4
+        DEPD    %r5,31,32,%r6   ;offset 0x8f8
+        STD     %r6,-184(%r30)  ;offset 0x8fc
+        DEPD    %r7,31,32,%r8   ;offset 0x900
+        STD     %r8,-176(%r30)  ;offset 0x904
+        STW     %r9,-168(%r30)  ;offset 0x908
+        LDD     -248(%r30),%r3  ;offset 0x90c
+        COPY    %r26,%r4        ;offset 0x910
+        COPY    %r24,%r5        ;offset 0x914
+        DEPD    %r25,31,32,%r4  ;offset 0x918
+        CMPB,*<>        %r3,%r0,$0006000C       ;offset 0x91c
+        DEPD    %r23,31,32,%r5  ;offset 0x920
+        MOVIB,TR        -1,%r29,$00060002       ;offset 0x924
+        EXTRD,U %r29,31,32,%r28 ;offset 0x928
+$0006002A
+        LDO     -1(%r29),%r29   ;offset 0x92c
+        SUB     %r23,%r7,%r23   ;offset 0x930
+$00060024
+        SUB     %r4,%r31,%r25   ;offset 0x934
+        AND     %r25,%r19,%r26  ;offset 0x938
+        CMPB,*<>,N      %r0,%r26,$00060046      ;offset 0x93c
+        DEPD,Z  %r25,31,32,%r20 ;offset 0x940
+        OR      %r20,%r24,%r21  ;offset 0x944
+        CMPB,*<<,N      %r21,%r23,$0006002A     ;offset 0x948
+        SUB     %r31,%r2,%r31   ;offset 0x94c
+$00060046
+$0006002E
+        DEPD,Z  %r23,31,32,%r25 ;offset 0x950
+        EXTRD,U %r23,31,32,%r26 ;offset 0x954
+        AND     %r25,%r19,%r24  ;offset 0x958
+        ADD,L   %r31,%r26,%r31  ;offset 0x95c
+        CMPCLR,*>>=     %r5,%r24,%r0    ;offset 0x960
+        LDO     1(%r31),%r31    ;offset 0x964
+$00060032
+        CMPB,*<<=,N     %r31,%r4,$00060036      ;offset 0x968
+        LDO     -1(%r29),%r29   ;offset 0x96c
+        ADD,L   %r4,%r3,%r4     ;offset 0x970
+$00060036
+        ADDIB,=,N       -1,%r8,$D0      ;offset 0x974
+        SUB     %r5,%r24,%r28   ;offset 0x978
+$0006003A
+        SUB     %r4,%r31,%r24   ;offset 0x97c
+        SHRPD   %r24,%r28,32,%r4        ;offset 0x980
+        DEPD,Z  %r29,31,32,%r9  ;offset 0x984
+        DEPD,Z  %r28,31,32,%r5  ;offset 0x988
+$0006001C
+        EXTRD,U %r4,31,32,%r31  ;offset 0x98c
+        CMPB,*<>,N      %r31,%r2,$00060020      ;offset 0x990
+        MOVB,TR %r6,%r29,$D1    ;offset 0x994
+        STD     %r29,-152(%r30) ;offset 0x998
+$0006000C
+        EXTRD,U %r3,31,32,%r25  ;offset 0x99c
+        COPY    %r3,%r26        ;offset 0x9a0
+        EXTRD,U %r3,31,32,%r9   ;offset 0x9a4
+        EXTRD,U %r4,31,32,%r8   ;offset 0x9a8
+        .CALL   ARGW0=GR,ARGW1=GR,RTNVAL=GR     ;in=25,26;out=28;
+        B,L     BN_num_bits_word,%r2    ;offset 0x9ac
+        EXTRD,U %r5,31,32,%r7   ;offset 0x9b0
+        LDI     64,%r20 ;offset 0x9b4
+        DEPD    %r7,31,32,%r5   ;offset 0x9b8
+        DEPD    %r8,31,32,%r4   ;offset 0x9bc
+        DEPD    %r9,31,32,%r3   ;offset 0x9c0
+        CMPB,=  %r28,%r20,$00060012     ;offset 0x9c4
+        COPY    %r28,%r24       ;offset 0x9c8
+        MTSARCM %r24    ;offset 0x9cc
+        DEPDI,Z -1,%sar,1,%r19  ;offset 0x9d0
+        CMPB,*>>,N      %r4,%r19,$D2    ;offset 0x9d4
+$00060012
+        SUBI    64,%r24,%r31    ;offset 0x9d8
+        CMPCLR,*<<      %r4,%r3,%r0     ;offset 0x9dc
+        SUB     %r4,%r3,%r4     ;offset 0x9e0
+$00060016
+        CMPB,=  %r31,%r0,$0006001A      ;offset 0x9e4
+        COPY    %r0,%r9 ;offset 0x9e8
+        MTSARCM %r31    ;offset 0x9ec
+        DEPD,Z  %r3,%sar,64,%r3 ;offset 0x9f0
+        SUBI    64,%r31,%r26    ;offset 0x9f4
+        MTSAR   %r26    ;offset 0x9f8
+        SHRPD   %r4,%r5,%sar,%r4        ;offset 0x9fc
+        MTSARCM %r31    ;offset 0xa00
+        DEPD,Z  %r5,%sar,64,%r5 ;offset 0xa04
+$0006001A
+        DEPDI,Z -1,31,32,%r19   ;offset 0xa08
+        AND     %r3,%r19,%r29   ;offset 0xa0c
+        EXTRD,U %r29,31,32,%r2  ;offset 0xa10
+        DEPDI,Z -1,63,32,%r6    ;offset 0xa14
+        MOVIB,TR        2,%r8,$0006001C ;offset 0xa18
+        EXTRD,U %r3,63,32,%r7   ;offset 0xa1c
+$D2
+        ADDIL   LR'__iob-$global$,%r27,%r1      ;offset 0xa20
+        LDIL    LR'C$7,%r21     ;offset 0xa24
+        LDO     RR'__iob-$global$+32(%r1),%r26  ;offset 0xa28
+        .CALL   ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR    ;in=24,25,26;out=28;
+        B,L     fprintf,%r2     ;offset 0xa2c
+        LDO     RR'C$7(%r21),%r25       ;offset 0xa30
+        .CALL           ;
+        B,L     abort,%r2       ;offset 0xa34
+        NOP             ;offset 0xa38
+        B       $D3     ;offset 0xa3c
+        LDW     -212(%r30),%r2  ;offset 0xa40
+$00060020
+        COPY    %r4,%r26        ;offset 0xa44
+        EXTRD,U %r4,31,32,%r25  ;offset 0xa48
+        COPY    %r2,%r24        ;offset 0xa4c
+        .CALL   ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
+        B,L     $$div2U,%r31    ;offset 0xa50
+        EXTRD,U %r2,31,32,%r23  ;offset 0xa54
+        DEPD    %r28,31,32,%r29 ;offset 0xa58
+$00060022
+        STD     %r29,-152(%r30) ;offset 0xa5c
+$D1
+        AND     %r5,%r19,%r24   ;offset 0xa60
+        EXTRD,U %r24,31,32,%r24 ;offset 0xa64
+        STW     %r2,-160(%r30)  ;offset 0xa68
+        STW     %r7,-128(%r30)  ;offset 0xa6c
+        FLDD    -152(%r30),%fr4 ;offset 0xa70
+        FLDD    -152(%r30),%fr7 ;offset 0xa74
+        FLDW    -160(%r30),%fr8L        ;offset 0xa78
+        FLDW    -128(%r30),%fr5L        ;offset 0xa7c
+        XMPYU   %fr8L,%fr7L,%fr10       ;offset 0xa80
+        FSTD    %fr10,-136(%r30)        ;offset 0xa84
+        XMPYU   %fr8L,%fr7R,%fr22       ;offset 0xa88
+        FSTD    %fr22,-144(%r30)        ;offset 0xa8c
+        XMPYU   %fr5L,%fr4L,%fr11       ;offset 0xa90
+        XMPYU   %fr5L,%fr4R,%fr23       ;offset 0xa94
+        FSTD    %fr11,-112(%r30)        ;offset 0xa98
+        FSTD    %fr23,-120(%r30)        ;offset 0xa9c
+        LDD     -136(%r30),%r28 ;offset 0xaa0
+        DEPD,Z  %r28,31,32,%r31 ;offset 0xaa4
+        LDD     -144(%r30),%r20 ;offset 0xaa8
+        ADD,L   %r20,%r31,%r31  ;offset 0xaac
+        LDD     -112(%r30),%r22 ;offset 0xab0
+        DEPD,Z  %r22,31,32,%r22 ;offset 0xab4
+        LDD     -120(%r30),%r21 ;offset 0xab8
+        B       $00060024       ;offset 0xabc
+        ADD,L   %r21,%r22,%r23  ;offset 0xac0
+$D0
+        OR      %r9,%r29,%r29   ;offset 0xac4
+$00060040
+        EXTRD,U %r29,31,32,%r28 ;offset 0xac8
+$00060002
+$L2
+        LDW     -212(%r30),%r2  ;offset 0xacc
+$D3
+        LDW     -168(%r30),%r9  ;offset 0xad0
+        LDD     -176(%r30),%r8  ;offset 0xad4
+        EXTRD,U %r8,31,32,%r7   ;offset 0xad8
+        LDD     -184(%r30),%r6  ;offset 0xadc
+        EXTRD,U %r6,31,32,%r5   ;offset 0xae0
+        LDW     -188(%r30),%r4  ;offset 0xae4
+        BVE     (%r2)   ;offset 0xae8
+        .EXIT
+        LDW,MB  -192(%r30),%r3  ;offset 0xaec
+	.PROCEND	;in=23,25;out=28,29;fpin=105,107;
+
+
+
+
+;----------------------------------------------------------------------------
+;
+; Registers to hold 64-bit values to manipulate.  The "L" part
+; of the register corresponds to the upper 32-bits, while the "R"
+; part corresponds to the lower 32-bits
+; 
+; Note, that when using b6 and b7, the code must save these before
+; using them because they are callee save registers 
+; 
+;
+; Floating point registers to use to save values that
+; are manipulated.  These don't collide with ftemp1-6 and
+; are all caller save registers
+;
+a0        .reg %fr22
+a0L       .reg %fr22L
+a0R       .reg %fr22R
+
+a1        .reg %fr23
+a1L       .reg %fr23L
+a1R       .reg %fr23R
+
+a2        .reg %fr24
+a2L       .reg %fr24L
+a2R       .reg %fr24R
+
+a3        .reg %fr25
+a3L       .reg %fr25L
+a3R       .reg %fr25R
+
+a4        .reg %fr26
+a4L       .reg %fr26L
+a4R       .reg %fr26R
+
+a5        .reg %fr27
+a5L       .reg %fr27L
+a5R       .reg %fr27R
+
+a6        .reg %fr28
+a6L       .reg %fr28L
+a6R       .reg %fr28R
+
+a7        .reg %fr29
+a7L       .reg %fr29L
+a7R       .reg %fr29R
+
+b0        .reg %fr30
+b0L       .reg %fr30L
+b0R       .reg %fr30R
+
+b1        .reg %fr31
+b1L       .reg %fr31L
+b1R       .reg %fr31R
+
+;
+; Temporary floating point variables, these are all caller save
+; registers
+;
+ftemp1    .reg %fr4
+ftemp2    .reg %fr5
+ftemp3    .reg %fr6
+ftemp4    .reg %fr7
+
+;
+; The B set of registers when used.
+;
+
+b2        .reg %fr8
+b2L       .reg %fr8L
+b2R       .reg %fr8R
+
+b3        .reg %fr9
+b3L       .reg %fr9L
+b3R       .reg %fr9R
+
+b4        .reg %fr10
+b4L       .reg %fr10L
+b4R       .reg %fr10R
+
+b5        .reg %fr11
+b5L       .reg %fr11L
+b5R       .reg %fr11R
+
+b6        .reg %fr12
+b6L       .reg %fr12L
+b6R       .reg %fr12R
+
+b7        .reg %fr13
+b7L       .reg %fr13L
+b7R       .reg %fr13R
+
+c1           .reg %r21   ; only reg
+temp1        .reg %r20   ; only reg
+temp2        .reg %r19   ; only reg
+temp3        .reg %r31   ; only reg
+
+m1           .reg %r28   
+c2           .reg %r23   
+high_one     .reg %r1
+ht           .reg %r6
+lt           .reg %r5
+m            .reg %r4
+c3           .reg %r3
+
+SQR_ADD_C  .macro  A0L,A0R,C1,C2,C3
+    XMPYU   A0L,A0R,ftemp1       ; m = A0L*A0R (macro adds A0^2 into C3:C2:C1)
+    FSTD    ftemp1,-24(%sp)      ; store m
+
+    XMPYU   A0R,A0R,ftemp2       ; lt = A0R*A0R
+    FSTD    ftemp2,-16(%sp)      ; store lt
+
+    XMPYU   A0L,A0L,ftemp3       ; ht = A0L*A0L
+    FSTD    ftemp3,-8(%sp)       ; store ht
+
+    LDD     -24(%sp),m           ; load m
+    AND     m,high_mask,temp2    ; m & Mask
+    DEPD,Z  m,30,31,temp3        ; m << 32+1
+    LDD     -16(%sp),lt          ; lt
+
+    LDD     -8(%sp),ht           ; ht
+    EXTRD,U temp2,32,33,temp1    ; temp1 = m&Mask >> 32-1
+    ADD     temp3,lt,lt          ; lt = lt+m
+    ADD,L   ht,temp1,ht          ; ht += temp1
+    ADD,DC  ht,%r0,ht            ; ht += carry out of lt+m
+
+    ADD     C1,lt,C1             ; c1=c1+lt
+    ADD,DC  ht,%r0,ht            ; ht += carry out of c1+lt
+
+    ADD     C2,ht,C2             ; c2=c2+ht
+    ADD,DC  C3,%r0,C3            ; c3 += carry out of c2+ht
+.endm
+
+SQR_ADD_C2 .macro  A0L,A0R,A1L,A1R,C1,C2,C3
+    XMPYU   A0L,A1R,ftemp1          ; m1 = A0L*A1R (macro adds 2*A0*A1 into C3:C2:C1)
+    FSTD    ftemp1,-16(%sp)         ;
+    XMPYU   A0R,A1L,ftemp2          ; m = A0R*A1L
+    FSTD    ftemp2,-8(%sp)          ;
+    XMPYU   A0R,A1R,ftemp3          ; lt = A0R*A1R
+    FSTD    ftemp3,-32(%sp)
+    XMPYU   A0L,A1L,ftemp4          ; ht = A0L*A1L
+    FSTD    ftemp4,-24(%sp)         ;
+
+    LDD     -8(%sp),m               ; load m  (A0R*A1L)
+    LDD     -16(%sp),m1             ; load m1 (A0L*A1R)
+    ADD,L   m,m1,m                  ; m+m1
+
+    DEPD,Z  m,31,32,temp3           ; (m+m1<<32)
+    LDD     -24(%sp),ht             ; load ht (A0L*A1L)
+
+    CMPCLR,*>>= m,m1,%r0            ; if (m < m1)
+    ADD,L   ht,high_one,ht          ; ht+=high_one
+
+    EXTRD,U m,31,32,temp1           ; m >> 32
+    LDD     -32(%sp),lt             ; lt
+    ADD,L   ht,temp1,ht             ; ht+= m>>32
+    ADD     lt,temp3,lt             ; lt += (m+m1)<<32
+    ADD,DC  ht,%r0,ht               ; ht += carry out of lt
+
+    ADD     ht,ht,ht                ; ht=ht+ht;
+    ADD,DC  C3,%r0,C3               ; add in carry (c3++)
+
+    ADD     lt,lt,lt                ; lt=lt+lt;
+    ADD,DC  ht,%r0,ht               ; add in carry (ht++)
+
+    ADD     C1,lt,C1                ; c1=c1+lt
+    ADD,DC,*NUV ht,%r0,ht           ; add in carry (ht++)
+    LDO     1(C3),C3              ; bump c3 if overflow,nullify otherwise
+
+    ADD     C2,ht,C2                ; c2 = c2 + ht
+    ADD,DC  C3,%r0,C3             ; add in carry (c3++)
+.endm
+
+;
+;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+; arg0 = r_ptr
+; arg1 = a_ptr
+;
+
+bn_sqr_comba8
 	.PROC
-	.CALLINFO FRAME=128,CALLS,SAVE_RP,ENTRY_GR=8
-	.ENTRY
-	stw %r2,-20(0,%r30)
-	stwm %r8,128(0,%r30)
-	stw %r7,-124(0,%r30)
-	stw %r4,-112(0,%r30)
-	stw %r3,-108(0,%r30)
-	copy %r26,%r3
-	copy %r25,%r4
-	stw %r6,-120(0,%r30)
-	ldi 0,%r7
-	stw %r5,-116(0,%r30)
-	movb,<> %r24,%r5,L$0051
-	ldi 2,%r6
-	bl L$0068,0
-	ldi -1,%r28
-L$0051
-	.CALL ARGW0=GR
-	bl BN_num_bits_word,%r2
-	copy %r5,%r26
-	copy %r28,%r24
-	ldi 32,%r19
-	comb,= %r19,%r24,L$0052
-	subi 31,%r24,%r19
-	mtsar %r19
-	zvdepi 1,32,%r19
-	comb,>>= %r19,%r3,L$0052
-	addil LR'__iob-$global$+32,%r27
-	ldo RR'__iob-$global$+32(%r1),%r26
-	ldil LR'L$C0000,%r25
-	.CALL ARGW0=GR,ARGW1=GR,ARGW2=GR
-	bl fprintf,%r2
-	ldo RR'L$C0000(%r25),%r25
-	.CALL 
-	bl abort,%r2
-	nop
-L$0052
-	comb,>> %r5,%r3,L$0053
-	subi 32,%r24,%r24
-	sub %r3,%r5,%r3
-L$0053
-	comib,= 0,%r24,L$0054
-	subi 31,%r24,%r19
-	mtsar %r19
-	zvdep %r5,32,%r5
-	zvdep %r3,32,%r21
-	subi 32,%r24,%r20
-	mtsar %r20
-	vshd 0,%r4,%r20
-	or %r21,%r20,%r3
-	mtsar %r19
-	zvdep %r4,32,%r4
-L$0054
-	extru %r5,15,16,%r23
-	extru %r5,31,16,%r28
-L$0055
-	extru %r3,15,16,%r19
-	comb,<> %r23,%r19,L$0058
-	copy %r3,%r26
-	bl L$0059,0
-	zdepi -1,31,16,%r29
-L$0058
-	.IMPORT $$divU,MILLICODE
-	bl $$divU,%r31
-	copy %r23,%r25
-L$0059
-	stw %r29,-16(0,%r30)
-	fldws -16(0,%r30),%fr10L
-	stw %r28,-16(0,%r30)
-	fldws -16(0,%r30),%fr10R
-	stw %r23,-16(0,%r30)
-	xmpyu %fr10L,%fr10R,%fr8
-	fldws -16(0,%r30),%fr10R
-	fstws %fr8R,-16(0,%r30)
-	xmpyu %fr10L,%fr10R,%fr9
-	ldw -16(0,%r30),%r8
-	fstws %fr9R,-16(0,%r30)
-	copy %r8,%r22
-	ldw -16(0,%r30),%r8
-	extru %r4,15,16,%r24
-	copy %r8,%r21
-L$0060
-	sub %r3,%r21,%r20
-	copy %r20,%r19
-	depi 0,31,16,%r19
-	comib,<> 0,%r19,L$0061
-	zdep %r20,15,16,%r19
-	addl %r19,%r24,%r19
-	comb,>>= %r19,%r22,L$0061
-	sub %r22,%r28,%r22
-	sub %r21,%r23,%r21
-	bl L$0060,0
-	ldo -1(%r29),%r29
-L$0061
-	stw %r29,-16(0,%r30)
-	fldws -16(0,%r30),%fr10L
-	stw %r28,-16(0,%r30)
-	fldws -16(0,%r30),%fr10R
-	xmpyu %fr10L,%fr10R,%fr8
-	fstws %fr8R,-16(0,%r30)
-	ldw -16(0,%r30),%r8
-	stw %r23,-16(0,%r30)
-	fldws -16(0,%r30),%fr10R
-	copy %r8,%r19
-	xmpyu %fr10L,%fr10R,%fr8
-	fstws %fr8R,-16(0,%r30)
-	extru %r19,15,16,%r20
-	ldw -16(0,%r30),%r8
-	zdep %r19,15,16,%r19
-	addl %r8,%r20,%r20
-	comclr,<<= %r19,%r4,0
-	addi 1,%r20,%r20
-	comb,<<= %r20,%r3,L$0066
-	sub %r4,%r19,%r4
-	addl %r3,%r5,%r3
-	ldo -1(%r29),%r29
-L$0066
-	addib,= -1,%r6,L$0056
-	sub %r3,%r20,%r3
-	zdep %r29,15,16,%r7
-	shd %r3,%r4,16,%r3
-	bl L$0055,0
-	zdep %r4,15,16,%r4
-L$0056
-	or %r7,%r29,%r28
-L$0068
-	ldw -148(0,%r30),%r2
-	ldw -124(0,%r30),%r7
-	ldw -120(0,%r30),%r6
-	ldw -116(0,%r30),%r5
-	ldw -112(0,%r30),%r4
-	ldw -108(0,%r30),%r3
-	bv 0(%r2)
-	ldwm -128(0,%r30),%r8
-	.EXIT
-	.PROCEND
+	.CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+	.EXPORT	bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+    .ENTRY
+	.align 64
+
+    STD     %r3,0(%sp)          ; save r3
+    STD     %r4,8(%sp)          ; save r4
+    STD     %r5,16(%sp)         ; save r5
+    STD     %r6,24(%sp)         ; save r6
+
+	;
+	; Zero out carries
+	;
+	COPY     %r0,c1
+	COPY     %r0,c2
+	COPY     %r0,c3
+
+	LDO      128(%sp),%sp       ; bump stack
+    DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L
+    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
+
+	;
+	; Load up all of the values we are going to use
+	;
+    FLDD     0(a_ptr),a0       
+    FLDD     8(a_ptr),a1       
+    FLDD    16(a_ptr),a2       
+    FLDD    24(a_ptr),a3       
+    FLDD    32(a_ptr),a4       
+    FLDD    40(a_ptr),a5       
+    FLDD    48(a_ptr),a6       
+    FLDD    56(a_ptr),a7       
+
+	SQR_ADD_C a0L,a0R,c1,c2,c3
+	STD     c1,0(r_ptr)          ; r[0] = c1;
+	COPY    %r0,c1
+
+	SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
+	STD     c2,8(r_ptr)          ; r[1] = c2;
+	COPY    %r0,c2
+
+	SQR_ADD_C a1L,a1R,c3,c1,c2
+	SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
+	STD     c3,16(r_ptr)            ; r[2] = c3;
+	COPY    %r0,c3
+
+	SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
+	SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
+	STD     c1,24(r_ptr)           ; r[3] = c1;
+	COPY    %r0,c1
+
+	SQR_ADD_C a2L,a2R,c2,c3,c1
+	SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
+	SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
+	STD     c2,32(r_ptr)          ; r[4] = c2;
+	COPY    %r0,c2
+
+	SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
+	SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
+	SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
+	STD     c3,40(r_ptr)          ; r[5] = c3;
+	COPY    %r0,c3
+
+	SQR_ADD_C a3L,a3R,c1,c2,c3
+	SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
+	SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
+	SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
+	STD     c1,48(r_ptr)          ; r[6] = c1;
+	COPY    %r0,c1
+
+	SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
+	SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
+	SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
+	SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
+	STD     c2,56(r_ptr)          ; r[7] = c2;
+	COPY    %r0,c2
+
+	SQR_ADD_C a4L,a4R,c3,c1,c2
+	SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
+	SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
+	SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
+	STD     c3,64(r_ptr)          ; r[8] = c3;
+	COPY    %r0,c3
+
+	SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
+	SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
+	SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
+	STD     c1,72(r_ptr)          ; r[9] = c1;
+	COPY    %r0,c1
+
+	SQR_ADD_C a5L,a5R,c2,c3,c1
+	SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
+	SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
+	STD     c2,80(r_ptr)          ; r[10] = c2;
+	COPY    %r0,c2
+
+	SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
+	SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
+	STD     c3,88(r_ptr)          ; r[11] = c3;
+	COPY    %r0,c3
+	
+	SQR_ADD_C a6L,a6R,c1,c2,c3
+	SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
+	STD     c1,96(r_ptr)          ; r[12] = c1;
+	COPY    %r0,c1
+
+	SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
+	STD     c2,104(r_ptr)         ; r[13] = c2;
+	COPY    %r0,c2
+
+	SQR_ADD_C a7L,a7R,c3,c1,c2
+	STD     c3, 112(r_ptr)       ; r[14] = c3
+	STD     c1, 120(r_ptr)       ; r[15] = c1
+
+    .EXIT
+    LDD     -104(%sp),%r6        ; restore r6
+    LDD     -112(%sp),%r5        ; restore r5
+    LDD     -120(%sp),%r4        ; restore r4
+    BVE     (%rp)
+    LDD,MB  -128(%sp),%r3
+
+	.PROCEND	
+
+;-----------------------------------------------------------------------------
+;
+;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+; arg0 = r_ptr
+; arg1 = a_ptr
+; Stores the 8-word square of the 4-word number a[0..3] into r[0..7].
+
+bn_sqr_comba4
+	.proc
+	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+	.EXPORT	bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+    .entry
+	.align 64
+    STD     %r3,0(%sp)          ; save r3
+    STD     %r4,8(%sp)          ; save r4
+    STD     %r5,16(%sp)         ; save r5
+    STD     %r6,24(%sp)         ; save r6
+
+	;
+	; Zero out carries
+	;
+	COPY     %r0,c1
+	COPY     %r0,c2
+	COPY     %r0,c3
+
+	LDO      128(%sp),%sp       ; bump stack
+    DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L
+    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
+
+	;
+	; Load the four input words a[0..3].  The source array holds only
+	; 4 BN_ULONGs (32 bytes), so nothing at or beyond 32(a_ptr) may be
+	; read; a4..a7 are not needed by the 4-word square below.
+	;
+    FLDD     0(a_ptr),a0       
+    FLDD     8(a_ptr),a1       
+    FLDD    16(a_ptr),a2       
+    FLDD    24(a_ptr),a3       
+
+	; Each group below sums one result column into the rotating
+	; three-word accumulator (c1,c2,c3) and stores its low word.
+	SQR_ADD_C a0L,a0R,c1,c2,c3
+
+	STD     c1,0(r_ptr)          ; r[0] = c1;
+	COPY    %r0,c1
+
+	SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
+
+	STD     c2,8(r_ptr)          ; r[1] = c2;
+	COPY    %r0,c2
+
+	SQR_ADD_C a1L,a1R,c3,c1,c2
+	SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
+
+	STD     c3,16(r_ptr)            ; r[2] = c3;
+	COPY    %r0,c3
+
+	SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
+	SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
+
+	STD     c1,24(r_ptr)           ; r[3] = c1;
+	COPY    %r0,c1
+
+	SQR_ADD_C a2L,a2R,c2,c3,c1
+	SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
+
+	STD     c2,32(r_ptr)           ; r[4] = c2;
+	COPY    %r0,c2
+
+	SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
+	STD     c3,40(r_ptr)           ; r[5] = c3;
+	COPY    %r0,c3
+
+	SQR_ADD_C a3L,a3R,c1,c2,c3
+	STD     c1,48(r_ptr)           ; r[6] = c1;
+	STD     c2,56(r_ptr)           ; r[7] = c2;
+
+    .EXIT
+    LDD     -104(%sp),%r6        ; restore r6
+    LDD     -112(%sp),%r5        ; restore r5
+    LDD     -120(%sp),%r4        ; restore r4
+    BVE     (%rp)
+    LDD,MB  -128(%sp),%r3        ; restore r3 and pop the 128-byte frame
+
+	.PROCEND	
+
+
+;---------------------------------------------------------------------------
+; MUL_ADD_C: add the product A0*B0 (A0 = A0L:A0R, B0 = B0L:B0R) into C3:C2:C1
+MUL_ADD_C  .macro  A0L,A0R,B0L,B0R,C1,C2,C3
+    XMPYU   A0L,B0R,ftemp1        ; m1 = A0L*B0R
+    FSTD    ftemp1,-16(%sp)       ;
+    XMPYU   A0R,B0L,ftemp2        ; m = A0R*B0L
+    FSTD    ftemp2,-8(%sp)        ;
+    XMPYU   A0R,B0R,ftemp3        ; lt = A0R*B0R
+    FSTD    ftemp3,-32(%sp)
+    XMPYU   A0L,B0L,ftemp4        ; ht = A0L*B0L
+    FSTD    ftemp4,-24(%sp)       ;
+
+    LDD     -8(%sp),m             ; load m  (A0R*B0L)
+    LDD     -16(%sp),m1           ; load m1 (A0L*B0R)
+    ADD,L   m,m1,m                ; m+m1
+
+    DEPD,Z  m,31,32,temp3         ; (m+m1<<32)
+    LDD     -24(%sp),ht           ; load ht (A0L*B0L)
+
+    CMPCLR,*>>= m,m1,%r0          ; if (m < m1)
+    ADD,L   ht,high_one,ht        ; ht+=high_one
+
+    EXTRD,U m,31,32,temp1         ; m >> 32
+    LDD     -32(%sp),lt           ; lt
+    ADD,L   ht,temp1,ht           ; ht+= m>>32
+    ADD     lt,temp3,lt           ; lt += (m+m1)<<32
+    ADD,DC  ht,%r0,ht             ; ht += carry out of lt
+
+    ADD     C1,lt,C1              ; c1=c1+lt
+    ADD,DC  ht,%r0,ht             ; add in carry (ht++)
+
+    ADD     C2,ht,C2              ; c2 = c2 + ht
+    ADD,DC  C3,%r0,C3             ; add in carry (c3++)
+.endm
+
+
+;
+;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+; arg0 = r_ptr
+; arg1 = a_ptr
+; arg2 = b_ptr
+; Stores the 16-word product of a[0..7] and b[0..7] into r[0..15].
+
+bn_mul_comba8
+	.proc
+	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+	.EXPORT	bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+    .entry
+	.align 64
+
+    STD     %r3,0(%sp)          ; save r3
+    STD     %r4,8(%sp)          ; save r4
+    STD     %r5,16(%sp)         ; save r5
+    STD     %r6,24(%sp)         ; save r6
+    FSTD    %fr12,32(%sp)       ; save fr12 (used as b6)
+    FSTD    %fr13,40(%sp)       ; save fr13 (used as b7)
+
+	;
+	; Zero out carries
+	;
+	COPY     %r0,c1
+	COPY     %r0,c2
+	COPY     %r0,c3
+
+	LDO      128(%sp),%sp       ; bump stack
+    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
+
+	;
+	; Load up all of the values we are going to use
+	;
+    FLDD      0(a_ptr),a0       
+    FLDD      8(a_ptr),a1       
+    FLDD     16(a_ptr),a2       
+    FLDD     24(a_ptr),a3       
+    FLDD     32(a_ptr),a4       
+    FLDD     40(a_ptr),a5       
+    FLDD     48(a_ptr),a6       
+    FLDD     56(a_ptr),a7       
+
+    FLDD      0(b_ptr),b0       
+    FLDD      8(b_ptr),b1       
+    FLDD     16(b_ptr),b2       
+    FLDD     24(b_ptr),b3       
+    FLDD     32(b_ptr),b4       
+    FLDD     40(b_ptr),b5       
+    FLDD     48(b_ptr),b6       
+    FLDD     56(b_ptr),b7       
+
+	MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
+	STD       c1,0(r_ptr)           ; r[0] = c1
+	COPY      %r0,c1
+
+	MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
+	MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
+	STD       c2,8(r_ptr)           ; r[1] = c2
+	COPY      %r0,c2
+
+	MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
+	MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
+	MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
+	STD       c3,16(r_ptr)          ; r[2] = c3
+	COPY      %r0,c3
+
+	MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
+	MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
+	MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
+	MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
+	STD       c1,24(r_ptr)          ; r[3] = c1
+	COPY      %r0,c1
+
+	MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
+	MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
+	MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
+	MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
+	MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
+	STD       c2,32(r_ptr)          ; r[4] = c2
+	COPY      %r0,c2
+
+	MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
+	MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
+	MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
+	MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
+	MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
+	MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
+	STD       c3,40(r_ptr)          ; r[5] = c3
+	COPY      %r0,c3
+
+	MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
+	MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
+	MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
+	MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
+	MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
+	MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
+	MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
+	STD       c1,48(r_ptr)          ; r[6] = c1
+	COPY      %r0,c1
+	
+	MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
+	MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
+	MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
+	MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
+	MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
+	MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
+	MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
+	MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
+	STD       c2,56(r_ptr)          ; r[7] = c2
+	COPY      %r0,c2
+
+	MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
+	MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
+	MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
+	MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
+	MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
+	MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
+	MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
+	STD       c3,64(r_ptr)          ; r[8] = c3
+	COPY      %r0,c3
+
+	MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
+	MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
+	MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
+	MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
+	MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
+	MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
+	STD       c1,72(r_ptr)          ; r[9] = c1
+	COPY      %r0,c1
+
+	MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
+	MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
+	MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
+	MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
+	MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
+	STD       c2,80(r_ptr)          ; r[10] = c2
+	COPY      %r0,c2
+
+	MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
+	MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
+	MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
+	MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
+	STD       c3,88(r_ptr)          ; r[11] = c3
+	COPY      %r0,c3
+
+	MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
+	MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
+	MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
+	STD       c1,96(r_ptr)          ; r[12] = c1
+	COPY      %r0,c1
+
+	MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
+	MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
+	STD       c2,104(r_ptr)         ; r[13] = c2
+	COPY      %r0,c2
+
+	MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
+	STD       c3,112(r_ptr)         ; r[14] = c3
+	STD       c1,120(r_ptr)         ; r[15] = c1
+
+    .EXIT
+    FLDD    -88(%sp),%fr13       ; restore fr13
+    FLDD    -96(%sp),%fr12       ; restore fr12
+    LDD     -104(%sp),%r6        ; restore r6
+    LDD     -112(%sp),%r5        ; restore r5
+    LDD     -120(%sp),%r4        ; restore r4
+    BVE     (%rp)
+    LDD,MB  -128(%sp),%r3        ; restore r3 and pop the 128-byte frame
+
+	.PROCEND	
+
+;-----------------------------------------------------------------------------
+;
+;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
+; arg0 = r_ptr
+; arg1 = a_ptr
+; arg2 = b_ptr
+; Stores the 8-word product of a[0..3] and b[0..3] into r[0..7].
+
+bn_mul_comba4
+	.proc
+	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
+	.EXPORT	bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
+    .entry
+	.align 64
+
+    STD     %r3,0(%sp)          ; save r3
+    STD     %r4,8(%sp)          ; save r4
+    STD     %r5,16(%sp)         ; save r5
+    STD     %r6,24(%sp)         ; save r6
+    FSTD    %fr12,32(%sp)       ; save fr12
+    FSTD    %fr13,40(%sp)       ; save fr13
+
+	;
+	; Zero out carries
+	;
+	COPY     %r0,c1
+	COPY     %r0,c2
+	COPY     %r0,c3
+
+	LDO      128(%sp),%sp       ; bump stack
+    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
+
+	;
+	; Load up all of the values we are going to use
+	;
+    FLDD      0(a_ptr),a0       
+    FLDD      8(a_ptr),a1       
+    FLDD     16(a_ptr),a2       
+    FLDD     24(a_ptr),a3       
+
+    FLDD      0(b_ptr),b0       
+    FLDD      8(b_ptr),b1       
+    FLDD     16(b_ptr),b2       
+    FLDD     24(b_ptr),b3       
+
+	MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
+	STD       c1,0(r_ptr)           ; r[0] = c1
+	COPY      %r0,c1
+
+	MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
+	MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
+	STD       c2,8(r_ptr)           ; r[1] = c2
+	COPY      %r0,c2
+
+	MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
+	MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
+	MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
+	STD       c3,16(r_ptr)          ; r[2] = c3
+	COPY      %r0,c3
+
+	MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
+	MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
+	MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
+	MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
+	STD       c1,24(r_ptr)          ; r[3] = c1
+	COPY      %r0,c1
+
+	MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
+	MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
+	MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
+	STD       c2,32(r_ptr)          ; r[4] = c2
+	COPY      %r0,c2
+
+	MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
+	MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
+	STD       c3,40(r_ptr)          ; r[5] = c3
+	COPY      %r0,c3
+
+	MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
+	STD       c1,48(r_ptr)          ; r[6] = c1
+	STD       c2,56(r_ptr)          ; r[7] = c2
+
+    .EXIT
+    FLDD    -88(%sp),%fr13       ; restore fr13
+    FLDD    -96(%sp),%fr12       ; restore fr12
+    LDD     -104(%sp),%r6        ; restore r6
+    LDD     -112(%sp),%r5        ; restore r5
+    LDD     -120(%sp),%r4        ; restore r4
+    BVE     (%rp)
+    LDD,MB  -128(%sp),%r3        ; restore r3 and pop the 128-byte frame
+
+	.PROCEND	
+
+
+	.SPACE	$TEXT$
+	.SUBSPA	$CODE$
+	.SPACE	$PRIVATE$,SORT=16
+	.IMPORT	$global$,DATA
+	.SPACE	$TEXT$
+	.SUBSPA	$CODE$
+	.SUBSPA	$LIT$,ACCESS=0x2c
+C$7
+	.ALIGN	8
+	.STRINGZ	"Division would overflow (%d)\n"
+	.END
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2W.s b/src/lib/libcrypto/bn/asm/pa-risc2W.s
index 54b6606252..a99545754d 100644
--- a/src/lib/libcrypto/bn/asm/pa-risc2W.s
+++ b/src/lib/libcrypto/bn/asm/pa-risc2W.s
@@ -1598,7 +1598,7 @@ bn_mul_comba4
 	.IMPORT	$global$,DATA
 	.SPACE	$TEXT$
 	.SUBSPA	$CODE$
-	.SUBSPA	$LIT$,QUAD=0,ALIGN=8,ACCESS=0x2c,SORT=16
+	.SUBSPA	$LIT$,ACCESS=0x2c
 C$4
 	.ALIGN	8
 	.STRINGZ	"Division would overflow (%d)\n"
diff --git a/src/lib/libcrypto/bn/bn.h b/src/lib/libcrypto/bn/bn.h
index f935e1ca79..1eaf879553 100644
--- a/src/lib/libcrypto/bn/bn.h
+++ b/src/lib/libcrypto/bn/bn.h
@@ -59,38 +59,39 @@
 #ifndef HEADER_BN_H
 #define HEADER_BN_H
 
-#ifndef WIN16
+#include <openssl/e_os2.h>
+#ifndef OPENSSL_NO_FP_API
 #include <stdio.h> /* FILE */
 #endif
-#include <openssl/opensslconf.h>
 
 #ifdef  __cplusplus
 extern "C" {
 #endif
 
-#ifdef VMS
+#ifdef OPENSSL_SYS_VMS
 #undef BN_LLONG /* experimental, so far... */
 #endif
 
 #define BN_MUL_COMBA
 #define BN_SQR_COMBA
 #define BN_RECURSION
-#define RECP_MUL_MOD
-#define MONT_MUL_MOD
 
 /* This next option uses the C libraries (2 word)/(1 word) function.
  * If it is not defined, I use my C version (which is slower).
  * The reason for this flag is that when the particular C compiler
  * library routine is used, and the library is linked with a different
  * compiler, the library is missing.  This mostly happens when the
- * library is built with gcc and then linked using nornal cc.  This would
- * be a common occurance because gcc normally produces code that is
+ * library is built with gcc and then linked using normal cc.  This would
+ * be a common occurrence because gcc normally produces code that is
  * 2 times faster than system compilers for the big number stuff.
  * For machines with only one compiler (or shared libraries), this should
  * be on.  Again this in only really a problem on machines
- * using "long long's", are 32bit, and are not using my assember code. */
-#if defined(MSDOS) || defined(WINDOWS) || defined(linux)
-#define BN_DIV2W
+ * using "long long's", are 32bit, and are not using my assembler code. */
+#if defined(OPENSSL_SYS_MSDOS) || defined(OPENSSL_SYS_WINDOWS) || \
+    defined(OPENSSL_SYS_WIN32) || defined(linux)
+# ifndef BN_DIV2W
+#  define BN_DIV2W
+# endif
 #endif
 
 /* assuming long is 64bit - this is the DEC Alpha
@@ -118,8 +119,8 @@ extern "C" {
 
 /* This is where the long long data type is 64 bits, but long is 32.
  * For machines where there are 64bit registers, this is the mode to use.
- * IRIX, on R4000 and above should use this mode, along with the relevent
- * assember code :-).  Do NOT define BN_LLONG.
+ * IRIX, on R4000 and above should use this mode, along with the relevant
+ * assembler code :-).  Do NOT define BN_LLONG.
  */
 #ifdef SIXTY_FOUR_BIT
 #undef BN_LLONG
@@ -135,14 +136,14 @@ extern "C" {
 #define BN_MASK2h	(0xffffffff00000000LL)
 #define BN_MASK2h1	(0xffffffff80000000LL)
 #define BN_TBIT		(0x8000000000000000LL)
-#define BN_DEC_CONV	(10000000000000000000LL)
+#define BN_DEC_CONV	(10000000000000000000ULL)
 #define BN_DEC_FMT1	"%llu"
 #define BN_DEC_FMT2	"%019llu"
 #define BN_DEC_NUM	19
 #endif
 
 #ifdef THIRTY_TWO_BIT
-#if defined(WIN32) && !defined(__GNUC__)
+#if defined(OPENSSL_SYS_WIN32) && !defined(__GNUC__)
 #define BN_ULLONG	unsigned _int64
 #else
 #define BN_ULLONG	unsigned long long
@@ -153,7 +154,7 @@ extern "C" {
 #define BN_BYTES	4
 #define BN_BITS2	32
 #define BN_BITS4	16
-#ifdef WIN32
+#ifdef OPENSSL_SYS_WIN32
 /* VC++ doesn't like the LL suffix */
 #define BN_MASK		(0xffffffffffffffffL)
 #else
@@ -233,19 +234,13 @@ typedef struct bignum_st
 	BN_ULONG *d;	/* Pointer to an array of 'BN_BITS2' bit chunks. */
 	int top;	/* Index of last used d +1. */
 	/* The next are internal book keeping for bn_expand. */
-	int max;	/* Size of the d array. */
+	int dmax;	/* Size of the d array. */
 	int neg;	/* one if the number is negative */
 	int flags;
 	} BIGNUM;
 
-/* Used for temp variables */
-#define BN_CTX_NUM	12
-typedef struct bignum_ctx
-	{
-	int tos;
-	BIGNUM bn[BN_CTX_NUM+1];
-	int flags;
-	} BN_CTX;
+/* Used for temp variables (declaration hidden in bn_lcl.h) */
+typedef struct bignum_ctx BN_CTX;
 
 typedef struct bn_blinding_st
 	{
@@ -257,16 +252,15 @@ typedef struct bn_blinding_st
 
 /* Used for montgomery multiplication */
 typedef struct bn_mont_ctx_st
-        {
-	int use_word;	/* 0 for word form, 1 for long form */
-        int ri;         /* number of bits in R */
-        BIGNUM RR;     /* used to convert to montgomery form */
-        BIGNUM N;      /* The modulus */
-        BIGNUM Ni;     /* The inverse of N */
-	BN_ULONG n0;	/* word form of inverse, normally only one of
-			 * Ni or n0 is defined */
+	{
+	int ri;        /* number of bits in R */
+	BIGNUM RR;     /* used to convert to montgomery form */
+	BIGNUM N;      /* The modulus */
+	BIGNUM Ni;     /* R*(1/R mod N) - N*Ni = 1
+	                * (Ni is only stored for bignum algorithm) */
+	BN_ULONG n0;   /* least significant word of Ni */
 	int flags;
-        } BN_MONT_CTX;
+	} BN_MONT_CTX;
 
 /* Used for reciprocal division/mod functions
  * It cannot be shared between threads
@@ -280,97 +274,129 @@ typedef struct bn_recp_ctx_st
 	int flags;
 	} BN_RECP_CTX;
 
-#define BN_to_montgomery(r,a,mont,ctx)	BN_mod_mul_montgomery(\
-	r,a,&((mont)->RR),(mont),ctx)
-
-#define BN_prime_checks		(5)
+#define BN_prime_checks 0 /* default: select number of iterations
+			     based on the size of the number */
+
+/* number of Miller-Rabin iterations for an error rate  of less than 2^-80
+ * for random 'b'-bit input, b >= 100 (taken from table 4.4 in the Handbook
+ * of Applied Cryptography [Menezes, van Oorschot, Vanstone; CRC Press 1996];
+ * original paper: Damgaard, Landrock, Pomerance: Average case error estimates
+ * for the strong probable prime test. -- Math. Comp. 61 (1993) 177-194) */
+#define BN_prime_checks_for_size(b) ((b) >= 1300 ?  2 : \
+                                (b) >=  850 ?  3 : \
+                                (b) >=  650 ?  4 : \
+                                (b) >=  550 ?  5 : \
+                                (b) >=  450 ?  6 : \
+                                (b) >=  400 ?  7 : \
+                                (b) >=  350 ?  8 : \
+                                (b) >=  300 ?  9 : \
+                                (b) >=  250 ? 12 : \
+                                (b) >=  200 ? 15 : \
+                                (b) >=  150 ? 18 : \
+                                /* b >= 100 */ 27)
 
 #define BN_num_bytes(a)	((BN_num_bits(a)+7)/8)
-#define BN_is_word(a,w)	(((a)->top == 1) && ((a)->d[0] == (BN_ULONG)(w)))
-#define BN_is_zero(a)	(((a)->top == 0) || BN_is_word(a,0))
-#define BN_is_one(a)	(BN_is_word((a),1))
-#define BN_is_odd(a)	(((a)->top > 0) && ((a)->d[0] & 1))
+
+/* Note that BN_abs_is_word does not work reliably for w == 0 */
+#define BN_abs_is_word(a,w) (((a)->top == 1) && ((a)->d[0] == (BN_ULONG)(w)))
+#define BN_is_zero(a)       (((a)->top == 0) || BN_abs_is_word(a,0))
+#define BN_is_one(a)        (BN_abs_is_word((a),1) && !(a)->neg)
+#define BN_is_word(a,w)     ((w) ? BN_abs_is_word((a),(w)) && !(a)->neg : \
+                                   BN_is_zero((a)))
+#define BN_is_odd(a)	    (((a)->top > 0) && ((a)->d[0] & 1))
+
 #define BN_one(a)	(BN_set_word((a),1))
 #define BN_zero(a)	(BN_set_word((a),0))
 
 /*#define BN_ascii2bn(a)	BN_hex2bn(a) */
 /*#define BN_bn2ascii(a)	BN_bn2hex(a) */
 
-#define bn_expand(n,b) ((((((b+BN_BITS2-1))/BN_BITS2)) <= (n)->max)?\
-	(n):bn_expand2((n),(b)/BN_BITS2+1))
-#define bn_wexpand(n,b) (((b) <= (n)->max)?(n):bn_expand2((n),(b)))
-
-#define bn_fix_top(a) \
-        { \
-        BN_ULONG *ftl; \
-	if ((a)->top > 0) \
-		{ \
-		for (ftl= &((a)->d[(a)->top-1]); (a)->top > 0; (a)->top--) \
-		if (*(ftl--)) break; \
-		} \
-	}
-
-BIGNUM *BN_value_one(void);
+const BIGNUM *BN_value_one(void);
 char *	BN_options(void);
 BN_CTX *BN_CTX_new(void);
 void	BN_CTX_init(BN_CTX *c);
 void	BN_CTX_free(BN_CTX *c);
+void	BN_CTX_start(BN_CTX *ctx);
+BIGNUM *BN_CTX_get(BN_CTX *ctx);
+void	BN_CTX_end(BN_CTX *ctx);
 int     BN_rand(BIGNUM *rnd, int bits, int top,int bottom);
+int     BN_pseudo_rand(BIGNUM *rnd, int bits, int top,int bottom);
+int	BN_rand_range(BIGNUM *rnd, BIGNUM *range);
+int	BN_pseudo_rand_range(BIGNUM *rnd, BIGNUM *range);
 int	BN_num_bits(const BIGNUM *a);
 int	BN_num_bits_word(BN_ULONG);
 BIGNUM *BN_new(void);
 void	BN_init(BIGNUM *);
 void	BN_clear_free(BIGNUM *a);
 BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b);
+void	BN_swap(BIGNUM *a, BIGNUM *b);
 BIGNUM *BN_bin2bn(const unsigned char *s,int len,BIGNUM *ret);
 int	BN_bn2bin(const BIGNUM *a, unsigned char *to);
-BIGNUM *BN_mpi2bn(unsigned char *s,int len,BIGNUM *ret);
+BIGNUM *BN_mpi2bn(const unsigned char *s,int len,BIGNUM *ret);
 int	BN_bn2mpi(const BIGNUM *a, unsigned char *to);
 int	BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
 int	BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
 int	BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
-int	BN_add(BIGNUM *r, BIGNUM *a, BIGNUM *b);
-int	BN_mod(BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx);
+int	BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
+int	BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+int	BN_sqr(BIGNUM *r, const BIGNUM *a,BN_CTX *ctx);
+
 int	BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
-	       BN_CTX *ctx);
-int	BN_mul(BIGNUM *r, BIGNUM *a, BIGNUM *b,BN_CTX *ctx);
-int	BN_sqr(BIGNUM *r, BIGNUM *a,BN_CTX *ctx);
-BN_ULONG BN_mod_word(BIGNUM *a, BN_ULONG w);
+	BN_CTX *ctx);
+#define BN_mod(rem,m,d,ctx) BN_div(NULL,(rem),(m),(d),(ctx))
+int	BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx);
+int	BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, BN_CTX *ctx);
+int	BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m);
+int	BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, BN_CTX *ctx);
+int	BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m);
+int	BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+	const BIGNUM *m, BN_CTX *ctx);
+int	BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx);
+int	BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx);
+int	BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m);
+int	BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m, BN_CTX *ctx);
+int	BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m);
+
+BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w);
 BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w);
 int	BN_mul_word(BIGNUM *a, BN_ULONG w);
 int	BN_add_word(BIGNUM *a, BN_ULONG w);
 int	BN_sub_word(BIGNUM *a, BN_ULONG w);
 int	BN_set_word(BIGNUM *a, BN_ULONG w);
-BN_ULONG BN_get_word(BIGNUM *a);
+BN_ULONG BN_get_word(const BIGNUM *a);
+
 int	BN_cmp(const BIGNUM *a, const BIGNUM *b);
 void	BN_free(BIGNUM *a);
 int	BN_is_bit_set(const BIGNUM *a, int n);
 int	BN_lshift(BIGNUM *r, const BIGNUM *a, int n);
-int	BN_lshift1(BIGNUM *r, BIGNUM *a);
-int	BN_exp(BIGNUM *r, BIGNUM *a, BIGNUM *p,BN_CTX *ctx);
-int	BN_mod_exp(BIGNUM *r, BIGNUM *a, const BIGNUM *p,
-		   const BIGNUM *m,BN_CTX *ctx);
-int	BN_mod_exp_mont(BIGNUM *r, BIGNUM *a, const BIGNUM *p,
-			const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
-int	BN_mod_exp2_mont(BIGNUM *r, BIGNUM *a1, BIGNUM *p1,BIGNUM *a2,
-		BIGNUM *p2,BIGNUM *m,BN_CTX *ctx,BN_MONT_CTX *m_ctx);
-int	BN_mod_exp_simple(BIGNUM *r, BIGNUM *a, BIGNUM *p,
-	BIGNUM *m,BN_CTX *ctx);
+int	BN_lshift1(BIGNUM *r, const BIGNUM *a);
+int	BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,BN_CTX *ctx);
+
+int	BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+	const BIGNUM *m,BN_CTX *ctx);
+int	BN_mod_exp_mont(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+	const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
+int	BN_mod_exp_mont_word(BIGNUM *r, BN_ULONG a, const BIGNUM *p,
+	const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
+int	BN_mod_exp2_mont(BIGNUM *r, const BIGNUM *a1, const BIGNUM *p1,
+	const BIGNUM *a2, const BIGNUM *p2,const BIGNUM *m,
+	BN_CTX *ctx,BN_MONT_CTX *m_ctx);
+int	BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+	const BIGNUM *m,BN_CTX *ctx);
+
 int	BN_mask_bits(BIGNUM *a,int n);
-int	BN_mod_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, const BIGNUM *m, BN_CTX *ctx);
-#ifndef WIN16
-int	BN_print_fp(FILE *fp, BIGNUM *a);
+#ifndef OPENSSL_NO_FP_API
+int	BN_print_fp(FILE *fp, const BIGNUM *a);
 #endif
 #ifdef HEADER_BIO_H
 int	BN_print(BIO *fp, const BIGNUM *a);
 #else
-int	BN_print(char *fp, const BIGNUM *a);
+int	BN_print(void *fp, const BIGNUM *a);
 #endif
-int	BN_reciprocal(BIGNUM *r, BIGNUM *m, int len, BN_CTX *ctx);
-int	BN_rshift(BIGNUM *r, BIGNUM *a, int n);
-int	BN_rshift1(BIGNUM *r, BIGNUM *a);
+int	BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx);
+int	BN_rshift(BIGNUM *r, const BIGNUM *a, int n);
+int	BN_rshift1(BIGNUM *r, const BIGNUM *a);
 void	BN_clear(BIGNUM *a);
-BIGNUM *bn_expand2(BIGNUM *b, int bits);
 BIGNUM *BN_dup(const BIGNUM *a);
 int	BN_ucmp(const BIGNUM *a, const BIGNUM *b);
 int	BN_set_bit(BIGNUM *a, int n);
@@ -379,26 +405,30 @@ char *	BN_bn2hex(const BIGNUM *a);
 char *	BN_bn2dec(const BIGNUM *a);
 int 	BN_hex2bn(BIGNUM **a, const char *str);
 int 	BN_dec2bn(BIGNUM **a, const char *str);
-int	BN_gcd(BIGNUM *r,BIGNUM *in_a,BIGNUM *in_b,BN_CTX *ctx);
-BIGNUM *BN_mod_inverse(BIGNUM *ret,BIGNUM *a, const BIGNUM *n,BN_CTX *ctx);
-BIGNUM *BN_generate_prime(BIGNUM *ret,int bits,int strong,BIGNUM *add,
-		BIGNUM *rem,void (*callback)(int,int,void *),void *cb_arg);
-int	BN_is_prime(BIGNUM *p,int nchecks,void (*callback)(int,int,void *),
-		BN_CTX *ctx,void *cb_arg);
-void	ERR_load_BN_strings(void );
-
-BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w);
-BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w);
-void     bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num);
-BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
-BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num);
-BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num);
+int	BN_gcd(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,BN_CTX *ctx);
+int	BN_kronecker(const BIGNUM *a,const BIGNUM *b,BN_CTX *ctx); /* returns -2 for error */
+BIGNUM *BN_mod_inverse(BIGNUM *ret,
+	const BIGNUM *a, const BIGNUM *n,BN_CTX *ctx);
+BIGNUM *BN_mod_sqrt(BIGNUM *ret,
+	const BIGNUM *a, const BIGNUM *n,BN_CTX *ctx);
+BIGNUM *BN_generate_prime(BIGNUM *ret,int bits,int safe,
+	const BIGNUM *add, const BIGNUM *rem,
+	void (*callback)(int,int,void *),void *cb_arg);
+int	BN_is_prime(const BIGNUM *p,int nchecks,
+	void (*callback)(int,int,void *),
+	BN_CTX *ctx,void *cb_arg);
+int	BN_is_prime_fasttest(const BIGNUM *p,int nchecks,
+	void (*callback)(int,int,void *),BN_CTX *ctx,void *cb_arg,
+	int do_trial_division);
 
 BN_MONT_CTX *BN_MONT_CTX_new(void );
 void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
-int BN_mod_mul_montgomery(BIGNUM *r,BIGNUM *a,BIGNUM *b,BN_MONT_CTX *mont,
-			  BN_CTX *ctx);
-int BN_from_montgomery(BIGNUM *r,BIGNUM *a,BN_MONT_CTX *mont,BN_CTX *ctx);
+int BN_mod_mul_montgomery(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,
+	BN_MONT_CTX *mont, BN_CTX *ctx);
+#define BN_to_montgomery(r,a,mont,ctx)	BN_mod_mul_montgomery(\
+	(r),(a),&((mont)->RR),(mont),(ctx))
+int BN_from_montgomery(BIGNUM *r,const BIGNUM *a,
+	BN_MONT_CTX *mont, BN_CTX *ctx);
 void BN_MONT_CTX_free(BN_MONT_CTX *mont);
 int BN_MONT_CTX_set(BN_MONT_CTX *mont,const BIGNUM *modulus,BN_CTX *ctx);
 BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to,BN_MONT_CTX *from);
@@ -416,18 +446,55 @@ void	BN_RECP_CTX_init(BN_RECP_CTX *recp);
 BN_RECP_CTX *BN_RECP_CTX_new(void);
 void	BN_RECP_CTX_free(BN_RECP_CTX *recp);
 int	BN_RECP_CTX_set(BN_RECP_CTX *recp,const BIGNUM *rdiv,BN_CTX *ctx);
-int	BN_mod_mul_reciprocal(BIGNUM *r, BIGNUM *x, BIGNUM *y,
-		BN_RECP_CTX *recp,BN_CTX *ctx);
+int	BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
+	BN_RECP_CTX *recp,BN_CTX *ctx);
 int	BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
-			const BIGNUM *m, BN_CTX *ctx);
-int	BN_div_recp(BIGNUM *dv, BIGNUM *rem, BIGNUM *m,
-		BN_RECP_CTX *recp, BN_CTX *ctx);
+	const BIGNUM *m, BN_CTX *ctx);
+int	BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
+	BN_RECP_CTX *recp, BN_CTX *ctx);
+
+/* library internal functions */
+
+#define bn_expand(a,bits) (((((bits)+BN_BITS2-1)/BN_BITS2) <= (a)->dmax)?\
+	(a):bn_expand2((a),(bits)/BN_BITS2+1))
+#define bn_wexpand(a,words) (((words) <= (a)->dmax)?(a):bn_expand2((a),(words)))
+BIGNUM *bn_expand2(BIGNUM *a, int words);
+BIGNUM *bn_dup_expand(const BIGNUM *a, int words);
+
+#define bn_fix_top(a) \
+        { \
+        BN_ULONG *ftl; \
+	if ((a)->top > 0) \
+		{ \
+		for (ftl= &((a)->d[(a)->top-1]); (a)->top > 0; (a)->top--) \
+		if (*(ftl--)) break; \
+		} \
+	}
 
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
+void     bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
+BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
+BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
+BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
+
+#ifdef BN_DEBUG
+void bn_dump1(FILE *o, const char *a, const BN_ULONG *b,int n);
+# define bn_print(a) {fprintf(stderr, #a "="); BN_print_fp(stderr,a); \
+   fprintf(stderr,"\n");}
+# define bn_dump(a,n) bn_dump1(stderr,#a,a,n);
+#else
+# define bn_print(a)
+# define bn_dump(a,b)
+#endif
+
+int BN_bntest_rand(BIGNUM *rnd, int bits, int top,int bottom);
 
 /* BEGIN ERROR CODES */
 /* The following lines are auto generated by the script mkerr.pl. Any changes
  * made after this point may be overwritten when the script is next run.
  */
+void ERR_load_BN_strings(void);
 
 /* Error codes for the BN functions. */
 
@@ -438,30 +505,43 @@ int	BN_div_recp(BIGNUM *dv, BIGNUM *rem, BIGNUM *m,
 #define BN_F_BN_BLINDING_UPDATE				 103
 #define BN_F_BN_BN2DEC					 104
 #define BN_F_BN_BN2HEX					 105
+#define BN_F_BN_CTX_GET					 116
 #define BN_F_BN_CTX_NEW					 106
 #define BN_F_BN_DIV					 107
 #define BN_F_BN_EXPAND2					 108
+#define BN_F_BN_EXPAND_INTERNAL				 120
+#define BN_F_BN_MOD_EXP2_MONT				 118
 #define BN_F_BN_MOD_EXP_MONT				 109
+#define BN_F_BN_MOD_EXP_MONT_WORD			 117
 #define BN_F_BN_MOD_INVERSE				 110
+#define BN_F_BN_MOD_LSHIFT_QUICK			 119
 #define BN_F_BN_MOD_MUL_RECIPROCAL			 111
+#define BN_F_BN_MOD_SQRT				 121
 #define BN_F_BN_MPI2BN					 112
 #define BN_F_BN_NEW					 113
 #define BN_F_BN_RAND					 114
+#define BN_F_BN_RAND_RANGE				 122
 #define BN_F_BN_USUB					 115
 
 /* Reason codes. */
 #define BN_R_ARG2_LT_ARG3				 100
 #define BN_R_BAD_RECIPROCAL				 101
+#define BN_R_BIGNUM_TOO_LONG				 114
 #define BN_R_CALLED_WITH_EVEN_MODULUS			 102
 #define BN_R_DIV_BY_ZERO				 103
 #define BN_R_ENCODING_ERROR				 104
 #define BN_R_EXPAND_ON_STATIC_BIGNUM_DATA		 105
+#define BN_R_INPUT_NOT_REDUCED				 110
 #define BN_R_INVALID_LENGTH				 106
+#define BN_R_INVALID_RANGE				 115
+#define BN_R_NOT_A_SQUARE				 111
 #define BN_R_NOT_INITIALIZED				 107
 #define BN_R_NO_INVERSE					 108
+#define BN_R_P_IS_NOT_PRIME				 112
+#define BN_R_TOO_MANY_ITERATIONS			 113
+#define BN_R_TOO_MANY_TEMPORARY_VARIABLES		 109
 
 #ifdef  __cplusplus
 }
 #endif
 #endif
-
diff --git a/src/lib/libcrypto/bn/bn_add.c b/src/lib/libcrypto/bn/bn_add.c
index efb2e312e8..6cba07e9f6 100644
--- a/src/lib/libcrypto/bn/bn_add.c
+++ b/src/lib/libcrypto/bn/bn_add.c
@@ -61,76 +61,70 @@
 #include "bn_lcl.h"
 
 /* r can == a or b */
-int BN_add(r, a, b)
-BIGNUM *r;
-BIGNUM *a;
-BIGNUM *b;
+int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
 	{
-	int i;
-	BIGNUM *tmp;
+	const BIGNUM *tmp;
+	int a_neg = a->neg;
+
+	bn_check_top(a);
+	bn_check_top(b);
 
 	/*  a +  b	a+b
 	 *  a + -b	a-b
 	 * -a +  b	b-a
 	 * -a + -b	-(a+b)
 	 */
-	if (a->neg ^ b->neg)
+	if (a_neg ^ b->neg)
 		{
 		/* only one is negative */
-		if (a->neg)
+		if (a_neg)
 			{ tmp=a; a=b; b=tmp; }
 
 		/* we are now a - b */
 
 		if (BN_ucmp(a,b) < 0)
 			{
-			if (bn_wexpand(r,b->top) == NULL) return(0);
-			bn_qsub(r,b,a);
+			if (!BN_usub(r,b,a)) return(0);
 			r->neg=1;
 			}
 		else
 			{
-			if (bn_wexpand(r,a->top) == NULL) return(0);
-			bn_qsub(r,a,b);
+			if (!BN_usub(r,a,b)) return(0);
 			r->neg=0;
 			}
 		return(1);
 		}
 
-	if (a->neg) /* both are neg */
+	if (!BN_uadd(r,a,b)) return(0);
+	if (a_neg) /* both are neg */
 		r->neg=1;
 	else
 		r->neg=0;
-
-	i=(a->top > b->top);
-
-	if (i)
-		{
-		if (bn_wexpand(r,a->top+1) == NULL) return(0);
-		bn_qadd(r,a,b);
-		}
-	else
-		{
-		if (bn_wexpand(r,b->top+1) == NULL) return(0);
-		bn_qadd(r,b,a);
-		}
 	return(1);
 	}
 
 /* unsigned add of b to a, r must be large enough */
-void bn_qadd(r,a,b)
-BIGNUM *r;
-BIGNUM *a;
-BIGNUM *b;
+int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
 	{
 	register int i;
 	int max,min;
 	BN_ULONG *ap,*bp,*rp,carry,t1;
+	const BIGNUM *tmp;
+
+	bn_check_top(a);
+	bn_check_top(b);
 
+	if (a->top < b->top)
+		{ tmp=a; a=b; b=tmp; }
 	max=a->top;
 	min=b->top;
+
+	if (bn_wexpand(r,max+1) == NULL)
+		return(0);
+
 	r->top=max;
 
+
 	ap=a->d;
 	bp=b->d;
 	rp=r->d;
@@ -160,8 +154,156 @@ BIGNUM *b;
 			r->top++;
 			}
 		}
-	for (; i<max; i++)
-		*(rp++)= *(ap++);
+	if (rp != ap)
+		{
+		for (; i<max; i++)
+			*(rp++)= *(ap++);
+		}
 	/* memcpy(rp,ap,sizeof(*ap)*(max-i));*/
+	r->neg = 0;
+	return(1);
+	}
+
+/* unsigned subtraction of b from a: r = |a| - |b|; requires |a| >= |b| */
+int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
+	{
+	int max,min;
+	register BN_ULONG t1,t2,*ap,*bp,*rp;
+	int i,carry;
+#if defined(IRIX_CC_BUG) && !defined(LINT)
+	int dummy;
+#endif
+
+	bn_check_top(a);
+	bn_check_top(b);
+
+	if (a->top < b->top) /* hmm... should not be happening */
+		{
+		BNerr(BN_F_BN_USUB,BN_R_ARG2_LT_ARG3);
+		return(0);
+		}
+
+	max=a->top;
+	min=b->top;
+	if (bn_wexpand(r,max) == NULL) return(0);
+
+	ap=a->d;
+	bp=b->d;
+	rp=r->d;
+
+#if 1
+	carry=0;
+	for (i=0; i<min; i++)
+		{
+		t1= *(ap++);
+		t2= *(bp++);
+		if (carry)
+			{
+			carry=(t1 <= t2);
+			t1=(t1-t2-1)&BN_MASK2;
+			}
+		else
+			{
+			carry=(t1 < t2);
+			t1=(t1-t2)&BN_MASK2;
+			}
+#if defined(IRIX_CC_BUG) && !defined(LINT)
+		dummy=t1;
+#endif
+		*(rp++)=t1&BN_MASK2;
+		}
+#else
+	carry=bn_sub_words(rp,ap,bp,min);
+	ap+=min;
+	bp+=min;
+	rp+=min;
+	i=min;
+#endif
+	if (carry) /* subtracted */
+		{
+		while (i < max)
+			{
+			i++;
+			t1= *(ap++);
+			t2=(t1-1)&BN_MASK2;
+			*(rp++)=t2;
+			if (t1 > t2) break;
+			}
+		}
+#if 0
+	memcpy(rp,ap,sizeof(*rp)*(max-i));
+#else
+	if (rp != ap)
+		{
+		for (;;)
+			{
+			if (i++ >= max) break;
+			rp[0]=ap[0];
+			if (i++ >= max) break;
+			rp[1]=ap[1];
+			if (i++ >= max) break;
+			rp[2]=ap[2];
+			if (i++ >= max) break;
+			rp[3]=ap[3];
+			rp+=4;
+			ap+=4;
+			}
+		}
+#endif
+
+	r->top=max;
+	r->neg=0;
+	bn_fix_top(r);
+	return(1);
+	}
+
+int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b)
+	{
+	int max;
+	int add=0,neg=0;
+	const BIGNUM *tmp;
+
+	bn_check_top(a);
+	bn_check_top(b);
+
+	/*  a -  b	a-b
+	 *  a - -b	a+b
+	 * -a -  b	-(a+b)
+	 * -a - -b	b-a
+	 */
+	if (a->neg)
+		{
+		if (b->neg)
+			{ tmp=a; a=b; b=tmp; }
+		else
+			{ add=1; neg=1; }
+		}
+	else
+		{
+		if (b->neg) { add=1; neg=0; }
+		}
+
+	if (add)
+		{
+		if (!BN_uadd(r,a,b)) return(0);
+		r->neg=neg;
+		return(1);
+		}
+
+	/* We are actually doing a - b :-) */
+
+	max=(a->top > b->top)?a->top:b->top;
+	if (bn_wexpand(r,max) == NULL) return(0);
+	if (BN_ucmp(a,b) < 0)
+		{
+		if (!BN_usub(r,b,a)) return(0);
+		r->neg=1;
+		}
+	else
+		{
+		if (!BN_usub(r,a,b)) return(0);
+		r->neg=0;
+		}
+	return(1);
 	}
 
diff --git a/src/lib/libcrypto/bn/bn_asm.c b/src/lib/libcrypto/bn/bn_asm.c
index 4d3da16a0c..be8aa3ffc5 100644
--- a/src/lib/libcrypto/bn/bn_asm.c
+++ b/src/lib/libcrypto/bn/bn_asm.c
@@ -56,97 +56,95 @@
  * [including the GNU Public Licence.]
  */
 
+#ifndef BN_DEBUG
+# undef NDEBUG /* avoid conflicting definitions */
+# define NDEBUG
+#endif
+
 #include <stdio.h>
+#include <assert.h>
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-#ifdef BN_LLONG 
+#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
 
-BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 	{
 	BN_ULONG c1=0;
 
-	bn_check_num(num);
+	assert(num >= 0);
 	if (num <= 0) return(c1);
 
-	for (;;)
+	while (num&~3)
 		{
 		mul_add(rp[0],ap[0],w,c1);
-		if (--num == 0) break;
 		mul_add(rp[1],ap[1],w,c1);
-		if (--num == 0) break;
 		mul_add(rp[2],ap[2],w,c1);
-		if (--num == 0) break;
 		mul_add(rp[3],ap[3],w,c1);
-		if (--num == 0) break;
-		ap+=4;
-		rp+=4;
+		ap+=4; rp+=4; num-=4;
+		}
+	if (num)
+		{
+		mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
+		mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
+		mul_add(rp[2],ap[2],w,c1); return c1;
 		}
 	
 	return(c1);
 	} 
 
-BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 	{
 	BN_ULONG c1=0;
 
-	bn_check_num(num);
+	assert(num >= 0);
 	if (num <= 0) return(c1);
 
-	/* for (;;) */
-	while (1) /* circumvent egcs-1.1.2 bug */
+	while (num&~3)
 		{
 		mul(rp[0],ap[0],w,c1);
-		if (--num == 0) break;
 		mul(rp[1],ap[1],w,c1);
-		if (--num == 0) break;
 		mul(rp[2],ap[2],w,c1);
-		if (--num == 0) break;
 		mul(rp[3],ap[3],w,c1);
-		if (--num == 0) break;
-		ap+=4;
-		rp+=4;
+		ap+=4; rp+=4; num-=4;
+		}
+	if (num)
+		{
+		mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
+		mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
+		mul(rp[2],ap[2],w,c1);
 		}
 	return(c1);
 	} 
 
-void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
         {
-	bn_check_num(n);
+	assert(n >= 0);
 	if (n <= 0) return;
-	for (;;)
+	while (n&~3)
 		{
-		BN_ULLONG t;
-
-		t=(BN_ULLONG)(a[0])*(a[0]);
-		r[0]=Lw(t); r[1]=Hw(t);
-		if (--n == 0) break;
-
-		t=(BN_ULLONG)(a[1])*(a[1]);
-		r[2]=Lw(t); r[3]=Hw(t);
-		if (--n == 0) break;
-
-		t=(BN_ULLONG)(a[2])*(a[2]);
-		r[4]=Lw(t); r[5]=Hw(t);
-		if (--n == 0) break;
-
-		t=(BN_ULLONG)(a[3])*(a[3]);
-		r[6]=Lw(t); r[7]=Hw(t);
-		if (--n == 0) break;
-
-		a+=4;
-		r+=8;
+		sqr(r[0],r[1],a[0]);
+		sqr(r[2],r[3],a[1]);
+		sqr(r[4],r[5],a[2]);
+		sqr(r[6],r[7],a[3]);
+		a+=4; r+=8; n-=4;
+		}
+	if (n)
+		{
+		sqr(r[0],r[1],a[0]); if (--n == 0) return;
+		sqr(r[2],r[3],a[1]); if (--n == 0) return;
+		sqr(r[4],r[5],a[2]);
 		}
 	}
 
-#else
+#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
 
-BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 	{
 	BN_ULONG c=0;
 	BN_ULONG bl,bh;
 
-	bn_check_num(num);
+	assert(num >= 0);
 	if (num <= 0) return((BN_ULONG)0);
 
 	bl=LBITS(w);
@@ -168,12 +166,12 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 	return(c);
 	} 
 
-BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
 	{
 	BN_ULONG carry=0;
 	BN_ULONG bl,bh;
 
-	bn_check_num(num);
+	assert(num >= 0);
 	if (num <= 0) return((BN_ULONG)0);
 
 	bl=LBITS(w);
@@ -195,9 +193,9 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 	return(carry);
 	} 
 
-void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
         {
-	bn_check_num(n);
+	assert(n >= 0);
 	if (n <= 0) return;
 	for (;;)
 		{
@@ -218,7 +216,7 @@ void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
 		}
 	}
 
-#endif
+#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
 
 #if defined(BN_LLONG) && defined(BN_DIV2W)
 
@@ -229,7 +227,7 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 
 #else
 
-/* Divide h-l by d and return the result. */
+/* Divide h,l by d and return the result. */
 /* I need to test this some more :-( */
 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 	{
@@ -239,13 +237,8 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 	if (d == 0) return(BN_MASK2);
 
 	i=BN_num_bits_word(d);
-	if ((i != BN_BITS2) && (h > (BN_ULONG)1<<i))
-		{
-#if !defined(NO_STDIO) && !defined(WIN16)
-		fprintf(stderr,"Division would overflow (%d)\n",i);
-#endif
-		abort();
-		}
+	assert((i == BN_BITS2) || (h <= (BN_ULONG)1<<i)); /* else division would overflow */
+
 	i=BN_BITS2-i;
 	if (h >= d) h-=d;
 
@@ -300,14 +293,14 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 	ret|=q;
 	return(ret);
 	}
-#endif
+#endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
 
 #ifdef BN_LLONG
-BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
         {
 	BN_ULLONG ll=0;
 
-	bn_check_num(n);
+	assert(n >= 0);
 	if (n <= 0) return((BN_ULONG)0);
 
 	for (;;)
@@ -338,12 +331,12 @@ BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
 		}
 	return((BN_ULONG)ll);
 	}
-#else
-BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+#else /* !BN_LLONG */
+BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
         {
 	BN_ULONG c,l,t;
 
-	bn_check_num(n);
+	assert(n >= 0);
 	if (n <= 0) return((BN_ULONG)0);
 
 	c=0;
@@ -387,14 +380,14 @@ BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
 		}
 	return((BN_ULONG)c);
 	}
-#endif
+#endif /* !BN_LLONG */
 
-BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
         {
 	BN_ULONG t1,t2;
 	int c=0;
 
-	bn_check_num(n);
+	assert(n >= 0);
 	if (n <= 0) return((BN_ULONG)0);
 
 	for (;;)
@@ -433,6 +426,11 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
 #undef bn_sqr_comba8
 #undef bn_sqr_comba4
 
+/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
+/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
+/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
+/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
+
 #ifdef BN_LLONG
 #define mul_add_c(a,b,c0,c1,c2) \
 	t=(BN_ULLONG)a*b; \
@@ -460,7 +458,39 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
 
 #define sqr_add_c2(a,i,j,c0,c1,c2) \
 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
-#else
+
+#elif defined(BN_UMULT_HIGH)
+
+#define mul_add_c(a,b,c0,c1,c2)	{	\
+	BN_ULONG ta=(a),tb=(b);		\
+	t1 = ta * tb;			\
+	t2 = BN_UMULT_HIGH(ta,tb);	\
+	c0 += t1; t2 += (c0<t1)?1:0;	\
+	c1 += t2; c2 += (c1<t2)?1:0;	\
+	}
+
+#define mul_add_c2(a,b,c0,c1,c2) {	\
+	BN_ULONG ta=(a),tb=(b),t0;	\
+	t1 = BN_UMULT_HIGH(ta,tb);	\
+	t0 = ta * tb;			\
+	t2 = t1+t1; c2 += (t2<t1)?1:0;	\
+	t1 = t0+t0; t2 += (t1<t0)?1:0;	\
+	c0 += t1; t2 += (c0<t1)?1:0;	\
+	c1 += t2; c2 += (c1<t2)?1:0;	\
+	}
+
+#define sqr_add_c(a,i,c0,c1,c2)	{	\
+	BN_ULONG ta=(a)[i];		\
+	t1 = ta * ta;			\
+	t2 = BN_UMULT_HIGH(ta,ta);	\
+	c0 += t1; t2 += (c0<t1)?1:0;	\
+	c1 += t2; c2 += (c1<t2)?1:0;	\
+	}
+
+#define sqr_add_c2(a,i,j,c0,c1,c2)	\
+	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
+
+#else /* !BN_LLONG */
 #define mul_add_c(a,b,c0,c1,c2) \
 	t1=LBITS(a); t2=HBITS(a); \
 	bl=LBITS(b); bh=HBITS(b); \
@@ -487,7 +517,7 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
 
 #define sqr_add_c2(a,i,j,c0,c1,c2) \
 	mul_add_c2((a)[i],(a)[j],c0,c1,c2)
-#endif
+#endif /* !BN_LLONG */
 
 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 	{
@@ -643,7 +673,7 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 	r[7]=c2;
 	}
 
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
 	{
 #ifdef BN_LLONG
 	BN_ULLONG t,tt;
@@ -724,7 +754,7 @@ void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
 	r[15]=c1;
 	}
 
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
 	{
 #ifdef BN_LLONG
 	BN_ULLONG t,tt;
@@ -762,7 +792,7 @@ void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
 	r[6]=c1;
 	r[7]=c2;
 	}
-#else
+#else /* !BN_MUL_COMBA */
 
 /* hmm... is it faster just to do a multiply? */
 #undef bn_sqr_comba4
@@ -799,4 +829,4 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 	r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
 	}
 
-#endif /* BN_COMBA */
+#endif /* !BN_MUL_COMBA */
diff --git a/src/lib/libcrypto/bn/bn_blind.c b/src/lib/libcrypto/bn/bn_blind.c
index a7b34f0bf0..2d287e6d1b 100644
--- a/src/lib/libcrypto/bn/bn_blind.c
+++ b/src/lib/libcrypto/bn/bn_blind.c
@@ -60,15 +60,18 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-BN_BLINDING *BN_BLINDING_new(A,Ai,mod)
-BIGNUM *A;
-BIGNUM *Ai;
-BIGNUM *mod;
+BN_BLINDING *BN_BLINDING_new(BIGNUM *A, BIGNUM *Ai, BIGNUM *mod)
 	{
 	BN_BLINDING *ret=NULL;
 
-	if ((ret=(BN_BLINDING *)Malloc(sizeof(BN_BLINDING))) == NULL)
+	bn_check_top(Ai);
+	bn_check_top(mod);
+
+	if ((ret=(BN_BLINDING *)OPENSSL_malloc(sizeof(BN_BLINDING))) == NULL)
+		{
 		BNerr(BN_F_BN_BLINDING_NEW,ERR_R_MALLOC_FAILURE);
+		return(NULL);
+		}
 	memset(ret,0,sizeof(BN_BLINDING));
 	if ((ret->A=BN_new()) == NULL) goto err;
 	if ((ret->Ai=BN_new()) == NULL) goto err;
@@ -78,26 +81,26 @@ BIGNUM *mod;
 	return(ret);
 err:
 	if (ret != NULL) BN_BLINDING_free(ret);
-	return(ret);
+	return(NULL);
 	}
 
-void BN_BLINDING_free(r)
-BN_BLINDING *r;
+void BN_BLINDING_free(BN_BLINDING *r)
 	{
+	if(r == NULL)
+	    return;
+
 	if (r->A  != NULL) BN_free(r->A );
 	if (r->Ai != NULL) BN_free(r->Ai);
-	Free(r);
+	OPENSSL_free(r);
 	}
 
-int BN_BLINDING_update(b,ctx)
-BN_BLINDING *b;
-BN_CTX *ctx;
+int BN_BLINDING_update(BN_BLINDING *b, BN_CTX *ctx)
 	{
 	int ret=0;
 
 	if ((b->A == NULL) || (b->Ai == NULL))
 		{
-		BNerr(BN_F_BN_BLINDING_UPDATE,BN_R_NOT_INITALISED);
+		BNerr(BN_F_BN_BLINDING_UPDATE,BN_R_NOT_INITIALIZED);
 		goto err;
 		}
 		
@@ -109,28 +112,26 @@ err:
 	return(ret);
 	}
 
-int BN_BLINDING_convert(n,b,ctx)
-BIGNUM *n;
-BN_BLINDING *b;
-BN_CTX *ctx;
+int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
 	{
+	bn_check_top(n);
+
 	if ((b->A == NULL) || (b->Ai == NULL))
 		{
-		BNerr(BN_F_BN_BLINDING_CONVERT,BN_R_NOT_INITALISED);
+		BNerr(BN_F_BN_BLINDING_CONVERT,BN_R_NOT_INITIALIZED);
 		return(0);
 		}
 	return(BN_mod_mul(n,n,b->A,b->mod,ctx));
 	}
 
-int BN_BLINDING_invert(n,b,ctx)
-BIGNUM *n;
-BN_BLINDING *b;
-BN_CTX *ctx;
+int BN_BLINDING_invert(BIGNUM *n, BN_BLINDING *b, BN_CTX *ctx)
 	{
 	int ret;
+
+	bn_check_top(n);
 	if ((b->A == NULL) || (b->Ai == NULL))
 		{
-		BNerr(BN_F_BN_BLINDING_INVERT,BN_R_NOT_INITALISED);
+		BNerr(BN_F_BN_BLINDING_INVERT,BN_R_NOT_INITIALIZED);
 		return(0);
 		}
 	if ((ret=BN_mod_mul(n,n,b->Ai,b->mod,ctx)) >= 0)
diff --git a/src/lib/libcrypto/bn/bn_ctx.c b/src/lib/libcrypto/bn/bn_ctx.c
index 46132fd180..7daf19eb84 100644
--- a/src/lib/libcrypto/bn/bn_ctx.c
+++ b/src/lib/libcrypto/bn/bn_ctx.c
@@ -61,15 +61,16 @@
 
 #include <stdio.h>
 #include <assert.h>
+
 #include "cryptlib.h"
-#include <openssl/bn.h>
+#include "bn_lcl.h"
 
 
 BN_CTX *BN_CTX_new(void)
 	{
 	BN_CTX *ret;
 
-	ret=(BN_CTX *)Malloc(sizeof(BN_CTX));
+	ret=(BN_CTX *)OPENSSL_malloc(sizeof(BN_CTX));
 	if (ret == NULL)
 		{
 		BNerr(BN_F_BN_CTX_NEW,ERR_R_MALLOC_FAILURE);
@@ -83,6 +84,7 @@ BN_CTX *BN_CTX_new(void)
 
 void BN_CTX_init(BN_CTX *ctx)
 	{
+#if 0 /* explicit version */
 	int i;
 	ctx->tos = 0;
 	ctx->flags = 0;
@@ -90,6 +92,9 @@ void BN_CTX_init(BN_CTX *ctx)
 	ctx->too_many = 0;
 	for (i = 0; i < BN_CTX_NUM; i++)
 		BN_init(&(ctx->bn[i]));
+#else
+	memset(ctx, 0, sizeof *ctx);
+#endif
 	}
 
 void BN_CTX_free(BN_CTX *ctx)
@@ -102,7 +107,7 @@ void BN_CTX_free(BN_CTX *ctx)
 	for (i=0; i < BN_CTX_NUM; i++)
 		BN_clear_free(&(ctx->bn[i]));
 	if (ctx->flags & BN_FLG_MALLOCED)
-		Free(ctx);
+		OPENSSL_free(ctx);
 	}
 
 void BN_CTX_start(BN_CTX *ctx)
@@ -112,8 +117,14 @@ void BN_CTX_start(BN_CTX *ctx)
 	ctx->depth++;
 	}
 
+
 BIGNUM *BN_CTX_get(BN_CTX *ctx)
 	{
+	/* Note: If BN_CTX_get is ever changed to allocate BIGNUMs dynamically,
+	 * make sure that if BN_CTX_get fails once it will return NULL again
+	 * until BN_CTX_end is called.  (This is so that callers have to check
+	 * only the last return value.)
+	 */
 	if (ctx->depth > BN_CTX_NUM_POS || ctx->tos >= BN_CTX_NUM)
 		{
 		if (!ctx->too_many)
diff --git a/src/lib/libcrypto/bn/bn_div.c b/src/lib/libcrypto/bn/bn_div.c
index 2263bdc7da..f9a095e3b3 100644
--- a/src/lib/libcrypto/bn/bn_div.c
+++ b/src/lib/libcrypto/bn/bn_div.c
@@ -57,21 +57,22 @@
  */
 
 #include <stdio.h>
+#include <openssl/bn.h>
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
+
 /* The old slow way */
 #if 0
-int BN_div(dv, rem, m, d,ctx)
-BIGNUM *dv;
-BIGNUM *rem;
-BIGNUM *m;
-BIGNUM *d;
-BN_CTX *ctx;
+int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
+	   BN_CTX *ctx)
 	{
 	int i,nm,nd;
+	int ret = 0;
 	BIGNUM *D;
 
+	bn_check_top(m);
+	bn_check_top(d);
 	if (BN_is_zero(d))
 		{
 		BNerr(BN_F_BN_DIV,BN_R_DIV_BY_ZERO);
@@ -86,45 +87,83 @@ BN_CTX *ctx;
 		return(1);
 		}
 
-	D=ctx->bn[ctx->tos];
-	if (dv == NULL) dv=ctx->bn[ctx->tos+1];
-	if (rem == NULL) rem=ctx->bn[ctx->tos+2];
+	BN_CTX_start(ctx);
+	D = BN_CTX_get(ctx);
+	if (dv == NULL) dv = BN_CTX_get(ctx);
+	if (rem == NULL) rem = BN_CTX_get(ctx);
+	if (D == NULL || dv == NULL || rem == NULL)
+		goto end;
 
 	nd=BN_num_bits(d);
 	nm=BN_num_bits(m);
-	if (BN_copy(D,d) == NULL) return(0);
-	if (BN_copy(rem,m) == NULL) return(0);
+	if (BN_copy(D,d) == NULL) goto end;
+	if (BN_copy(rem,m) == NULL) goto end;
 
 	/* The next 2 are needed so we can do a dv->d[0]|=1 later
 	 * since BN_lshift1 will only work once there is a value :-) */
 	BN_zero(dv);
+	bn_wexpand(dv,1);
 	dv->top=1;
 
-	if (!BN_lshift(D,D,nm-nd)) return(0);
+	if (!BN_lshift(D,D,nm-nd)) goto end;
 	for (i=nm-nd; i>=0; i--)
 		{
-		if (!BN_lshift1(dv,dv)) return(0);
+		if (!BN_lshift1(dv,dv)) goto end;
 		if (BN_ucmp(rem,D) >= 0)
 			{
 			dv->d[0]|=1;
-			bn_qsub(rem,rem,D);
+			if (!BN_usub(rem,rem,D)) goto end;
 			}
 /* CAN IMPROVE (and have now :=) */
-		if (!BN_rshift1(D,D)) return(0);
+		if (!BN_rshift1(D,D)) goto end;
 		}
 	rem->neg=BN_is_zero(rem)?0:m->neg;
 	dv->neg=m->neg^d->neg;
-	return(1);
+	ret = 1;
+ end:
+	BN_CTX_end(ctx);
+	return(ret);
 	}
 
 #else
 
-int BN_div(dv, rm, num, divisor,ctx)
-BIGNUM *dv;
-BIGNUM *rm;
-BIGNUM *num;
-BIGNUM *divisor;
-BN_CTX *ctx;
+#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) \
+    && !defined(PEDANTIC) && !defined(BN_DIV3W)
+# if defined(__GNUC__) && __GNUC__>=2
+#  if defined(__i386) || defined (__i386__)
+   /*
+    * There were two reasons for implementing this template:
+    * - GNU C generates a call to a function (__udivdi3 to be exact)
+    *   in reply to ((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0 (I fail to
+    *   understand why...);
+    * - divl doesn't only calculate quotient, but also leaves
+    *   remainder in %edx which we can definitely use here:-)
+    *
+    *					<appro@fy.chalmers.se>
+    */
+#  define bn_div_words(n0,n1,d0)		\
+	({  asm volatile (			\
+		"divl	%4"			\
+		: "=a"(q), "=d"(rem)		\
+		: "a"(n1), "d"(n0), "g"(d0)	\
+		: "cc");			\
+	    q;					\
+	})
+#  define REMAINDER_IS_ALREADY_CALCULATED
+#  endif /* __<cpu> */
+# endif /* __GNUC__ */
+#endif /* OPENSSL_NO_ASM */
+
+
+/* BN_div computes  dv := num / divisor,  rounding towards zero, and sets up
+ * rm  such that  dv*divisor + rm = num  holds.
+ * Thus:
+ *     dv->neg == num->neg ^ divisor->neg  (unless the result is zero)
+ *     rm->neg == num->neg                 (unless the remainder is zero)
+ * If 'dv' or 'rm' is NULL, the respective value is not returned.
+ */
+int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
+	   BN_CTX *ctx)
 	{
 	int norm_shift,i,j,loop;
 	BIGNUM *tmp,wnum,*snum,*sdiv,*res;
@@ -132,6 +171,9 @@ BN_CTX *ctx;
 	BN_ULONG d0,d1;
 	int num_n,div_n;
 
+	bn_check_top(num);
+	bn_check_top(divisor);
+
 	if (BN_is_zero(divisor))
 		{
 		BNerr(BN_F_BN_DIV,BN_R_DIV_BY_ZERO);
@@ -146,20 +188,22 @@ BN_CTX *ctx;
 		return(1);
 		}
 
-	tmp=ctx->bn[ctx->tos]; 
-	tmp->neg=0;
-	snum=ctx->bn[ctx->tos+1];
-	sdiv=ctx->bn[ctx->tos+2];
+	BN_CTX_start(ctx);
+	tmp=BN_CTX_get(ctx);
+	snum=BN_CTX_get(ctx);
+	sdiv=BN_CTX_get(ctx);
 	if (dv == NULL)
-		res=ctx->bn[ctx->tos+3];
+		res=BN_CTX_get(ctx);
 	else	res=dv;
+	if (sdiv == NULL || res == NULL) goto err;
+	tmp->neg=0;
 
 	/* First we normalise the numbers */
 	norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
-	BN_lshift(sdiv,divisor,norm_shift);
+	if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err;
 	sdiv->neg=0;
 	norm_shift+=BN_BITS2;
-	BN_lshift(snum,num,norm_shift);
+	if (!(BN_lshift(snum,num,norm_shift))) goto err;
 	snum->neg=0;
 	div_n=sdiv->top;
 	num_n=snum->top;
@@ -168,10 +212,10 @@ BN_CTX *ctx;
 	/* Lets setup a 'window' into snum
 	 * This is the part that corresponds to the current
 	 * 'area' being divided */
+	BN_init(&wnum);
 	wnum.d=	 &(snum->d[loop]);
 	wnum.top= div_n;
-	wnum.max= snum->max; /* a bit of a lie */
-	wnum.neg= 0;
+	wnum.dmax= snum->dmax+1; /* a bit of a lie */
 
 	/* Get the top 2 words of sdiv */
 	/* i=sdiv->top; */
@@ -183,8 +227,8 @@ BN_CTX *ctx;
 
 	/* Setup to 'res' */
 	res->neg= (num->neg^divisor->neg);
-	res->top=loop;
 	if (!bn_wexpand(res,(loop+1))) goto err;
+	res->top=loop;
 	resp= &(res->d[loop-1]);
 
 	/* space for temp */
@@ -192,74 +236,98 @@ BN_CTX *ctx;
 
 	if (BN_ucmp(&wnum,sdiv) >= 0)
 		{
-		bn_qsub(&wnum,&wnum,sdiv);
+		if (!BN_usub(&wnum,&wnum,sdiv)) goto err;
 		*resp=1;
 		res->d[res->top-1]=1;
 		}
 	else
 		res->top--;
+	if (res->top == 0)
+		res->neg = 0;
 	resp--;
 
 	for (i=0; i<loop-1; i++)
 		{
-		BN_ULONG q,n0,n1;
-		BN_ULONG l0;
+		BN_ULONG q,l0;
+#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
+		BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG);
+		q=bn_div_3_words(wnump,d1,d0);
+#else
+		BN_ULONG n0,n1,rem=0;
 
-		wnum.d--; wnum.top++;
 		n0=wnump[0];
 		n1=wnump[-1];
 		if (n0 == d0)
 			q=BN_MASK2;
-		else
-			q=bn_div64(n0,n1,d0);
-		{
-#ifdef BN_LLONG
-		BN_ULLONG t1,t2,rem;
-		t1=((BN_ULLONG)n0<<BN_BITS2)|n1;
-		for (;;)
+		else 			/* n0 < d0 */
 			{
+#ifdef BN_LLONG
+			BN_ULLONG t2;
+
+#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
+			q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0);
+#else
+			q=bn_div_words(n0,n1,d0);
+#endif
+
+#ifndef REMAINDER_IS_ALREADY_CALCULATED
+			/*
+			 * rem doesn't have to be BN_ULLONG. The least we know
+			 * is that it's less than d0, isn't it?
+			 */
+			rem=(n1-q*d0)&BN_MASK2;
+#endif
 			t2=(BN_ULLONG)d1*q;
-			rem=t1-(BN_ULLONG)q*d0;
-			if ((rem>>BN_BITS2) ||
-				(t2 <= ((BN_ULLONG)(rem<<BN_BITS2)+wnump[-2])))
-				break;
-			q--;
-			}
+
+			for (;;)
+				{
+				if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2]))
+					break;
+				q--;
+				rem += d0;
+				if (rem < d0) break; /* don't let rem overflow */
+				t2 -= d1;
+				}
+#else /* !BN_LLONG */
+			BN_ULONG t2l,t2h,ql,qh;
+
+			q=bn_div_words(n0,n1,d0);
+#ifndef REMAINDER_IS_ALREADY_CALCULATED
+			rem=(n1-q*d0)&BN_MASK2;
+#endif
+
+#ifdef BN_UMULT_HIGH
+			t2l = d1 * q;
+			t2h = BN_UMULT_HIGH(d1,q);
 #else
-		BN_ULONG t1l,t1h,t2l,t2h,t3l,t3h,ql,qh,t3t;
-		t1h=n0;
-		t1l=n1;
-		for (;;)
-			{
 			t2l=LBITS(d1); t2h=HBITS(d1);
 			ql =LBITS(q);  qh =HBITS(q);
 			mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+#endif
 
-			t3t=LBITS(d0); t3h=HBITS(d0);
-			mul64(t3t,t3h,ql,qh); /* t3=t1-(BN_ULLONG)q*d0; */
-			t3l=(t1l-t3t)&BN_MASK2;
-			if (t3l > t1l) t3h++;
-			t3h=(t1h-t3h)&BN_MASK2;
-
-			/*if ((t3>>BN_BITS2) ||
-				(t2 <= ((t3<<BN_BITS2)+wnump[-2])))
-				break; */
-			if (t3h) break;
-			if (t2h < t3l) break;
-			if ((t2h == t3l) && (t2l <= wnump[-2])) break;
-
-			q--;
+			for (;;)
+				{
+				if ((t2h < rem) ||
+					((t2h == rem) && (t2l <= wnump[-2])))
+					break;
+				q--;
+				rem += d0;
+				if (rem < d0) break; /* don't let rem overflow */
+				if (t2l < d1) t2h--; t2l -= d1;
+				}
+#endif /* !BN_LLONG */
 			}
-#endif
-		}
+#endif /* !BN_DIV3W */
+
 		l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
+		wnum.d--; wnum.top++;
 		tmp->d[div_n]=l0;
 		for (j=div_n+1; j>0; j--)
 			if (tmp->d[j-1]) break;
 		tmp->top=j;
 
 		j=wnum.top;
-		BN_sub(&wnum,&wnum,tmp);
+		if (!BN_sub(&wnum,&wnum,tmp)) goto err;
 
 		snum->top=snum->top+wnum.top-j;
 
@@ -267,7 +335,7 @@ BN_CTX *ctx;
 			{
 			q--;
 			j=wnum.top;
-			BN_add(&wnum,&wnum,sdiv);
+			if (!BN_add(&wnum,&wnum,sdiv)) goto err;
 			snum->top+=wnum.top-j;
 			}
 		*(resp--)=q;
@@ -275,11 +343,18 @@ BN_CTX *ctx;
 		}
 	if (rm != NULL)
 		{
+		/* Keep a copy of the neg flag in num because if rm==num
+		 * BN_rshift() will overwrite it.
+		 */
+		int neg = num->neg;
 		BN_rshift(rm,snum,norm_shift);
-		rm->neg=num->neg;
+		if (!BN_is_zero(rm))
+			rm->neg = neg;
 		}
+	BN_CTX_end(ctx);
 	return(1);
 err:
+	BN_CTX_end(ctx);
 	return(0);
 	}
 
diff --git a/src/lib/libcrypto/bn/bn_err.c b/src/lib/libcrypto/bn/bn_err.c
index 029ae810d5..fb84ee96d8 100644
--- a/src/lib/libcrypto/bn/bn_err.c
+++ b/src/lib/libcrypto/bn/bn_err.c
@@ -1,66 +1,69 @@
-/* lib/bn/bn_err.c */
-/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
- * All rights reserved.
+/* crypto/bn/bn_err.c */
+/* ====================================================================
+ * Copyright (c) 1999 The OpenSSL Project.  All rights reserved.
  *
- * This package is an SSL implementation written
- * by Eric Young (eay@cryptsoft.com).
- * The implementation was written so as to conform with Netscapes SSL.
- * 
- * This library is free for commercial and non-commercial use as long as
- * the following conditions are aheared to.  The following conditions
- * apply to all code found in this distribution, be it the RC4, RSA,
- * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
- * included with this distribution is covered by the same copyright terms
- * except that the holder is Tim Hudson (tjh@cryptsoft.com).
- * 
- * Copyright remains Eric Young's, and as such any Copyright notices in
- * the code are not to be removed.
- * If this package is used in a product, Eric Young should be given attribution
- * as the author of the parts of the library used.
- * This can be in the form of a textual message at program startup or
- * in documentation (online or textual) provided with the package.
- * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
- * 1. Redistributions of source code must retain the copyright
- *    notice, this list of conditions and the following disclaimer.
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
  * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *    "This product includes cryptographic software written by
- *     Eric Young (eay@cryptsoft.com)"
- *    The word 'cryptographic' can be left out if the rouines from the library
- *    being used are not cryptographic related :-).
- * 4. If you include any Windows specific code (or a derivative thereof) from 
- *    the apps directory (application code) you must include an acknowledgement:
- *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
- * 
- * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * 
- * The licence and distribution terms for any publically available version or
- * derivative of this code cannot be changed.  i.e. this code cannot simply be
- * copied and put under another distribution licence
- * [including the GNU Public Licence.]
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
  */
+
+/* NOTE: this file was auto generated by the mkerr.pl script: any changes
+ * made to it will be overwritten when the script next updates this file,
+ * only reason strings will be preserved.
+ */
+
 #include <stdio.h>
-#include "err.h"
-#include "bn.h"
+#include <openssl/err.h>
+#include <openssl/bn.h>
 
 /* BEGIN ERROR CODES */
-#ifndef NO_ERR
+#ifndef OPENSSL_NO_ERR
 static ERR_STRING_DATA BN_str_functs[]=
 	{
 {ERR_PACK(0,BN_F_BN_BLINDING_CONVERT,0),	"BN_BLINDING_convert"},
@@ -69,40 +72,57 @@ static ERR_STRING_DATA BN_str_functs[]=
 {ERR_PACK(0,BN_F_BN_BLINDING_UPDATE,0),	"BN_BLINDING_update"},
 {ERR_PACK(0,BN_F_BN_BN2DEC,0),	"BN_bn2dec"},
 {ERR_PACK(0,BN_F_BN_BN2HEX,0),	"BN_bn2hex"},
+{ERR_PACK(0,BN_F_BN_CTX_GET,0),	"BN_CTX_get"},
 {ERR_PACK(0,BN_F_BN_CTX_NEW,0),	"BN_CTX_new"},
 {ERR_PACK(0,BN_F_BN_DIV,0),	"BN_div"},
 {ERR_PACK(0,BN_F_BN_EXPAND2,0),	"bn_expand2"},
+{ERR_PACK(0,BN_F_BN_EXPAND_INTERNAL,0),	"BN_EXPAND_INTERNAL"},
+{ERR_PACK(0,BN_F_BN_MOD_EXP2_MONT,0),	"BN_mod_exp2_mont"},
 {ERR_PACK(0,BN_F_BN_MOD_EXP_MONT,0),	"BN_mod_exp_mont"},
+{ERR_PACK(0,BN_F_BN_MOD_EXP_MONT_WORD,0),	"BN_mod_exp_mont_word"},
 {ERR_PACK(0,BN_F_BN_MOD_INVERSE,0),	"BN_mod_inverse"},
+{ERR_PACK(0,BN_F_BN_MOD_LSHIFT_QUICK,0),	"BN_mod_lshift_quick"},
 {ERR_PACK(0,BN_F_BN_MOD_MUL_RECIPROCAL,0),	"BN_mod_mul_reciprocal"},
+{ERR_PACK(0,BN_F_BN_MOD_SQRT,0),	"BN_mod_sqrt"},
 {ERR_PACK(0,BN_F_BN_MPI2BN,0),	"BN_mpi2bn"},
 {ERR_PACK(0,BN_F_BN_NEW,0),	"BN_new"},
 {ERR_PACK(0,BN_F_BN_RAND,0),	"BN_rand"},
-{0,NULL},
+{ERR_PACK(0,BN_F_BN_RAND_RANGE,0),	"BN_rand_range"},
+{ERR_PACK(0,BN_F_BN_USUB,0),	"BN_usub"},
+{0,NULL}
 	};
 
 static ERR_STRING_DATA BN_str_reasons[]=
 	{
+{BN_R_ARG2_LT_ARG3                       ,"arg2 lt arg3"},
 {BN_R_BAD_RECIPROCAL                     ,"bad reciprocal"},
+{BN_R_BIGNUM_TOO_LONG                    ,"bignum too long"},
 {BN_R_CALLED_WITH_EVEN_MODULUS           ,"called with even modulus"},
 {BN_R_DIV_BY_ZERO                        ,"div by zero"},
 {BN_R_ENCODING_ERROR                     ,"encoding error"},
+{BN_R_EXPAND_ON_STATIC_BIGNUM_DATA       ,"expand on static bignum data"},
+{BN_R_INPUT_NOT_REDUCED                  ,"input not reduced"},
 {BN_R_INVALID_LENGTH                     ,"invalid length"},
-{BN_R_NOT_INITALISED                     ,"not initalised"},
+{BN_R_INVALID_RANGE                      ,"invalid range"},
+{BN_R_NOT_A_SQUARE                       ,"not a square"},
+{BN_R_NOT_INITIALIZED                    ,"not initialized"},
 {BN_R_NO_INVERSE                         ,"no inverse"},
-{0,NULL},
+{BN_R_P_IS_NOT_PRIME                     ,"p is not prime"},
+{BN_R_TOO_MANY_ITERATIONS                ,"too many iterations"},
+{BN_R_TOO_MANY_TEMPORARY_VARIABLES       ,"too many temporary variables"},
+{0,NULL}
 	};
 
 #endif
 
-void ERR_load_BN_strings()
+void ERR_load_BN_strings(void)
 	{
 	static int init=1;
 
-	if (init);
-		{;
+	if (init)
+		{
 		init=0;
-#ifndef NO_ERR
+#ifndef OPENSSL_NO_ERR
 		ERR_load_strings(ERR_LIB_BN,BN_str_functs);
 		ERR_load_strings(ERR_LIB_BN,BN_str_reasons);
 #endif
diff --git a/src/lib/libcrypto/bn/bn_exp.c b/src/lib/libcrypto/bn/bn_exp.c
index c056a5083f..afdfd580fb 100644
--- a/src/lib/libcrypto/bn/bn_exp.c
+++ b/src/lib/libcrypto/bn/bn_exp.c
@@ -55,112 +55,145 @@
  * copied and put under another distribution licence
  * [including the GNU Public Licence.]
  */
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
 
-#include <stdio.h>
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-/* slow but works */
-int BN_mod_mul(ret, a, b, m, ctx)
-BIGNUM *ret;
-BIGNUM *a;
-BIGNUM *b;
-BIGNUM *m;
-BN_CTX *ctx;
-	{
-	BIGNUM *t;
-	int r=0;
-
-	t=ctx->bn[ctx->tos++];
-	if (a == b)
-		{ if (!BN_sqr(t,a,ctx)) goto err; }
-	else
-		{ if (!BN_mul(t,a,b)) goto err; }
-	if (!BN_mod(ret,t,m,ctx)) goto err;
-	r=1;
-err:
-	ctx->tos--;
-	return(r);
-	}
+#define TABLE_SIZE	32
 
-#if 0
 /* this one works - simple but works */
-int BN_mod_exp(r,a,p,m,ctx)
-BIGNUM *r,*a,*p,*m;
-BN_CTX *ctx;
+int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 	{
 	int i,bits,ret=0;
-	BIGNUM *v,*tmp;
+	BIGNUM *v,*rr;
 
-	v=ctx->bn[ctx->tos++];
-	tmp=ctx->bn[ctx->tos++];
+	BN_CTX_start(ctx);
+	if ((r == a) || (r == p))
+		rr = BN_CTX_get(ctx);
+	else
+		rr = r;
+	if ((v = BN_CTX_get(ctx)) == NULL) goto err;
 
 	if (BN_copy(v,a) == NULL) goto err;
 	bits=BN_num_bits(p);
 
 	if (BN_is_odd(p))
-		{ if (BN_copy(r,a) == NULL) goto err; }
-	else	{ if (BN_one(r)) goto err; }
+		{ if (BN_copy(rr,a) == NULL) goto err; }
+	else	{ if (!BN_one(rr)) goto err; }
 
 	for (i=1; i<bits; i++)
 		{
-		if (!BN_sqr(tmp,v,ctx)) goto err;
-		if (!BN_mod(v,tmp,m,ctx)) goto err;
+		if (!BN_sqr(v,v,ctx)) goto err;
 		if (BN_is_bit_set(p,i))
 			{
-			if (!BN_mul(tmp,r,v)) goto err;
-			if (!BN_mod(r,tmp,m,ctx)) goto err;
+			if (!BN_mul(rr,rr,v,ctx)) goto err;
 			}
 		}
 	ret=1;
 err:
-	ctx->tos-=2;
+	if (r != rr) BN_copy(r,rr);
+	BN_CTX_end(ctx);
 	return(ret);
 	}
 
-#endif
-
-/* this one works - simple but works */
-int BN_exp(r,a,p,ctx)
-BIGNUM *r,*a,*p;
-BN_CTX *ctx;
-	{
-	int i,bits,ret=0;
-	BIGNUM *v,*tmp;
-
-	v=ctx->bn[ctx->tos++];
-	tmp=ctx->bn[ctx->tos++];
-
-	if (BN_copy(v,a) == NULL) goto err;
-	bits=BN_num_bits(p);
-
-	if (BN_is_odd(p))
-		{ if (BN_copy(r,a) == NULL) goto err; }
-	else	{ if (BN_one(r)) goto err; }
-
-	for (i=1; i<bits; i++)
-		{
-		if (!BN_sqr(tmp,v,ctx)) goto err;
-		if (BN_is_bit_set(p,i))
-			{
-			if (!BN_mul(tmp,r,v)) goto err;
-			}
-		}
-	ret=1;
-err:
-	ctx->tos-=2;
-	return(ret);
-	}
 
-int BN_mod_exp(r,a,p,m,ctx)
-BIGNUM *r;
-BIGNUM *a;
-BIGNUM *p;
-BIGNUM *m;
-BN_CTX *ctx;
+int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
+	       BN_CTX *ctx)
 	{
 	int ret;
 
+	bn_check_top(a);
+	bn_check_top(p);
+	bn_check_top(m);
+
+	/* For even modulus  m = 2^k*m_odd,  it might make sense to compute
+	 * a^p mod m_odd  and  a^p mod 2^k  separately (with Montgomery
+	 * exponentiation for the odd part), using appropriate exponent
+	 * reductions, and combine the results using the CRT.
+	 *
+	 * For now, we use Montgomery only if the modulus is odd; otherwise,
+	 * exponentiation using the reciprocal-based quick remaindering
+	 * algorithm is used.
+	 *
+	 * (Timing obtained with expspeed.c [computations  a^p mod m
+	 * where  a, p, m  are of the same length: 256, 512, 1024, 2048,
+	 * 4096, 8192 bits], compared to the running time of the
+	 * standard algorithm:
+	 *
+	 *   BN_mod_exp_mont   33 .. 40 %  [AMD K6-2, Linux, debug configuration]
+         *                     55 .. 77 %  [UltraSparc processor, but
+	 *                                  debug-solaris-sparcv8-gcc conf.]
+	 * 
+	 *   BN_mod_exp_recp   50 .. 70 %  [AMD K6-2, Linux, debug configuration]
+	 *                     62 .. 118 % [UltraSparc, debug-solaris-sparcv8-gcc]
+	 *
+	 * On the Sparc, BN_mod_exp_recp was faster than BN_mod_exp_mont
+	 * at 2048 and more bits, but at 512 and 1024 bits, it was
+	 * slower even than the standard algorithm!
+	 *
+	 * "Real" timings [linux-elf, solaris-sparcv9-gcc configurations]
+	 * should be obtained when the new Montgomery reduction code
+	 * has been integrated into OpenSSL.)
+	 */
+
+#define MONT_MUL_MOD
+#define MONT_EXP_WORD
+#define RECP_MUL_MOD
+
 #ifdef MONT_MUL_MOD
 	/* I have finally been able to take out this pre-condition of
 	 * the top bit being set.  It was caused by an error in BN_div
@@ -169,7 +202,17 @@ BN_CTX *ctx;
 /*	if ((m->d[m->top-1]&BN_TBIT) && BN_is_odd(m)) */
 
 	if (BN_is_odd(m))
-		{ ret=BN_mod_exp_mont(r,a,p,m,ctx,NULL); }
+		{
+#  ifdef MONT_EXP_WORD
+		if (a->top == 1 && !a->neg)
+			{
+			BN_ULONG A = a->d[0];
+			ret=BN_mod_exp_mont_word(r,A,p,m,ctx,NULL);
+			}
+		else
+#  endif
+			ret=BN_mod_exp_mont(r,a,p,m,ctx,NULL);
+		}
 	else
 #endif
 #ifdef RECP_MUL_MOD
@@ -181,55 +224,65 @@ BN_CTX *ctx;
 	return(ret);
 	}
 
-/* #ifdef RECP_MUL_MOD */
-int BN_mod_exp_recp(r,a,p,m,ctx)
-BIGNUM *r;
-BIGNUM *a;
-BIGNUM *p;
-BIGNUM *m;
-BN_CTX *ctx;
+
+int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+		    const BIGNUM *m, BN_CTX *ctx)
 	{
-	int nb,i,j,bits,ret=0,wstart,wend,window,wvalue;
-	int start=1;
-	BIGNUM *d,*aa;
-	BIGNUM *val[16];
+	int i,j,bits,ret=0,wstart,wend,window,wvalue;
+	int start=1,ts=0;
+	BIGNUM *aa;
+	BIGNUM val[TABLE_SIZE];
+	BN_RECP_CTX recp;
 
-	d=ctx->bn[ctx->tos++];
-	aa=ctx->bn[ctx->tos++];
 	bits=BN_num_bits(p);
 
 	if (bits == 0)
 		{
-		BN_one(r);
-		return(1);
+		ret = BN_one(r);
+		return ret;
+		}
+
+	BN_CTX_start(ctx);
+	if ((aa = BN_CTX_get(ctx)) == NULL) goto err;
+
+	BN_RECP_CTX_init(&recp);
+	if (m->neg)
+		{
+		/* ignore sign of 'm' */
+		if (!BN_copy(aa, m)) goto err;
+		aa->neg = 0;
+		if (BN_RECP_CTX_set(&recp,aa,ctx) <= 0) goto err;
 		}
-	nb=BN_reciprocal(d,m,ctx);
-	if (nb == -1) goto err;
-
-	val[0]=BN_new();
-	if (!BN_mod(val[0],a,m,ctx)) goto err;		/* 1 */
-	if (!BN_mod_mul_reciprocal(aa,val[0],val[0],m,d,nb,ctx))
-		goto err;				/* 2 */
-
-	if (bits <= 17) /* This is probably 3 or 0x10001, so just do singles */
-		window=1;
-	else if (bits >= 256)
-		window=5;	/* max size of window */
-	else if (bits >= 128)
-		window=4;
 	else
-		window=3;
+		{
+		if (BN_RECP_CTX_set(&recp,m,ctx) <= 0) goto err;
+		}
+
+	BN_init(&(val[0]));
+	ts=1;
 
-	j=1<<(window-1);
-	for (i=1; i<j; i++)
+	if (!BN_nnmod(&(val[0]),a,m,ctx)) goto err;		/* 1 */
+	if (BN_is_zero(&(val[0])))
 		{
-		val[i]=BN_new();
-		if (!BN_mod_mul_reciprocal(val[i],val[i-1],aa,m,d,nb,ctx))
-			goto err;
+		ret = BN_zero(r);
+		goto err;
 		}
-	for (; i<16; i++)
-		val[i]=NULL;
 
+	window = BN_window_bits_for_exponent_size(bits);
+	if (window > 1)
+		{
+		if (!BN_mod_mul_reciprocal(aa,&(val[0]),&(val[0]),&recp,ctx))
+			goto err;				/* 2 */
+		j=1<<(window-1);
+		for (i=1; i<j; i++)
+			{
+			BN_init(&val[i]);
+			if (!BN_mod_mul_reciprocal(&(val[i]),&(val[i-1]),aa,&recp,ctx))
+				goto err;
+			}
+		ts=i;
+		}
+		
 	start=1;	/* This is used to avoid multiplication etc
 			 * when there is only the value '1' in the
 			 * buffer. */
@@ -244,7 +297,7 @@ BN_CTX *ctx;
 		if (BN_is_bit_set(p,wstart) == 0)
 			{
 			if (!start)
-				if (!BN_mod_mul_reciprocal(r,r,r,m,d,nb,ctx))
+				if (!BN_mod_mul_reciprocal(r,r,r,&recp,ctx))
 				goto err;
 			if (wstart == 0) break;
 			wstart--;
@@ -274,12 +327,12 @@ BN_CTX *ctx;
 		if (!start)
 			for (i=0; i<j; i++)
 				{
-				if (!BN_mod_mul_reciprocal(r,r,r,m,d,nb,ctx))
+				if (!BN_mod_mul_reciprocal(r,r,r,&recp,ctx))
 					goto err;
 				}
 		
 		/* wvalue will be an odd number < 2^window */
-		if (!BN_mod_mul_reciprocal(r,r,val[wvalue>>1],m,d,nb,ctx))
+		if (!BN_mod_mul_reciprocal(r,r,&(val[wvalue>>1]),&recp,ctx))
 			goto err;
 
 		/* move the 'window' down further */
@@ -290,84 +343,86 @@ BN_CTX *ctx;
 		}
 	ret=1;
 err:
-	ctx->tos-=2;
-	for (i=0; i<16; i++)
-		if (val[i] != NULL) BN_clear_free(val[i]);
+	BN_CTX_end(ctx);
+	for (i=0; i<ts; i++)
+		BN_clear_free(&(val[i]));
+	BN_RECP_CTX_free(&recp);
 	return(ret);
 	}
-/* #endif */
-
-/* #ifdef MONT_MUL_MOD */
-int BN_mod_exp_mont(r,a,p,m,ctx,in_mont)
-BIGNUM *r;
-BIGNUM *a;
-BIGNUM *p;
-BIGNUM *m;
-BN_CTX *ctx;
-BN_MONT_CTX *in_mont;
+
+
+int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+		    const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
 	{
-#define TABLE_SIZE	16
 	int i,j,bits,ret=0,wstart,wend,window,wvalue;
-	int start=1;
-	BIGNUM *d,*aa;
-	BIGNUM *val[TABLE_SIZE];
+	int start=1,ts=0;
+	BIGNUM *d,*r;
+	const BIGNUM *aa;
+	BIGNUM val[TABLE_SIZE];
 	BN_MONT_CTX *mont=NULL;
 
+	bn_check_top(a);
+	bn_check_top(p);
+	bn_check_top(m);
+
 	if (!(m->d[0] & 1))
 		{
 		BNerr(BN_F_BN_MOD_EXP_MONT,BN_R_CALLED_WITH_EVEN_MODULUS);
 		return(0);
 		}
-	d=ctx->bn[ctx->tos++];
 	bits=BN_num_bits(p);
 	if (bits == 0)
 		{
-		BN_one(r);
-		return(1);
+		ret = BN_one(rr);
+		return ret;
 		}
 
+	BN_CTX_start(ctx);
+	d = BN_CTX_get(ctx);
+	r = BN_CTX_get(ctx);
+	if (d == NULL || r == NULL) goto err;
+
 	/* If this is not done, things will break in the montgomery
 	 * part */
 
-#if 1
 	if (in_mont != NULL)
 		mont=in_mont;
 	else
-#endif
 		{
 		if ((mont=BN_MONT_CTX_new()) == NULL) goto err;
 		if (!BN_MONT_CTX_set(mont,m,ctx)) goto err;
 		}
 
-	val[0]=BN_new();
-	if (BN_ucmp(a,m) >= 0)
+	BN_init(&val[0]);
+	ts=1;
+	if (a->neg || BN_ucmp(a,m) >= 0)
 		{
-		BN_mod(val[0],a,m,ctx);
-		aa=val[0];
+		if (!BN_nnmod(&(val[0]),a,m,ctx))
+			goto err;
+		aa= &(val[0]);
 		}
 	else
 		aa=a;
-	if (!BN_to_montgomery(val[0],aa,mont,ctx)) goto err; /* 1 */
-	if (!BN_mod_mul_montgomery(d,val[0],val[0],mont,ctx)) goto err; /* 2 */
-
-	if (bits <= 20) /* This is probably 3 or 0x10001, so just do singles */
-		window=1;
-	else if (bits > 250)
-		window=5;	/* max size of window */
-	else if (bits >= 120)
-		window=4;
-	else
-		window=3;
+	if (BN_is_zero(aa))
+		{
+		ret = BN_zero(rr);
+		goto err;
+		}
+	if (!BN_to_montgomery(&(val[0]),aa,mont,ctx)) goto err; /* 1 */
 
-	j=1<<(window-1);
-	for (i=1; i<j; i++)
+	window = BN_window_bits_for_exponent_size(bits);
+	if (window > 1)
 		{
-		val[i]=BN_new();
-		if (!BN_mod_mul_montgomery(val[i],val[i-1],d,mont,ctx))
-			goto err;
+		if (!BN_mod_mul_montgomery(d,&(val[0]),&(val[0]),mont,ctx)) goto err; /* 2 */
+		j=1<<(window-1);
+		for (i=1; i<j; i++)
+			{
+			BN_init(&(val[i]));
+			if (!BN_mod_mul_montgomery(&(val[i]),&(val[i-1]),d,mont,ctx))
+				goto err;
+			}
+		ts=i;
 		}
-	for (; i<TABLE_SIZE; i++)
-		val[i]=NULL;
 
 	start=1;	/* This is used to avoid multiplication etc
 			 * when there is only the value '1' in the
@@ -376,7 +431,7 @@ BN_MONT_CTX *in_mont;
 	wstart=bits-1;	/* The top bit of the window */
 	wend=0;		/* The bottom bit of the window */
 
-        if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
+	if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
 	for (;;)
 		{
 		if (BN_is_bit_set(p,wstart) == 0)
@@ -419,7 +474,7 @@ BN_MONT_CTX *in_mont;
 				}
 		
 		/* wvalue will be an odd number < 2^window */
-		if (!BN_mod_mul_montgomery(r,r,val[wvalue>>1],mont,ctx))
+		if (!BN_mod_mul_montgomery(r,r,&(val[wvalue>>1]),mont,ctx))
 			goto err;
 
 		/* move the 'window' down further */
@@ -428,62 +483,201 @@ BN_MONT_CTX *in_mont;
 		start=0;
 		if (wstart < 0) break;
 		}
-	BN_from_montgomery(r,r,mont,ctx);
+	if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
 	ret=1;
 err:
 	if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
-	ctx->tos--;
-	for (i=0; i<TABLE_SIZE; i++)
-		if (val[i] != NULL) BN_clear_free(val[i]);
+	BN_CTX_end(ctx);
+	for (i=0; i<ts; i++)
+		BN_clear_free(&(val[i]));
 	return(ret);
 	}
-/* #endif */
+
+int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
+                         const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
+	{
+	BN_MONT_CTX *mont = NULL;
+	int b, bits, ret=0;
+	int r_is_one;
+	BN_ULONG w, next_w;
+	BIGNUM *d, *r, *t;
+	BIGNUM *swap_tmp;
+#define BN_MOD_MUL_WORD(r, w, m) \
+		(BN_mul_word(r, (w)) && \
+		(/* BN_ucmp(r, (m)) < 0 ? 1 :*/  \
+			(BN_mod(t, r, m, ctx) && (swap_tmp = r, r = t, t = swap_tmp, 1))))
+		/* BN_MOD_MUL_WORD is only used with 'w' large,
+		 * so the BN_ucmp test is probably more overhead
+		 * than always using BN_mod (which uses BN_copy if
+		 * a similar test returns true). */
+		/* We can use BN_mod and do not need BN_nnmod because our
+		 * accumulator is never negative (the result of BN_mod does
+		 * not depend on the sign of the modulus).
+		 */
+#define BN_TO_MONTGOMERY_WORD(r, w, mont) \
+		(BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))
+
+	bn_check_top(p);
+	bn_check_top(m);
+
+	if (m->top == 0 || !(m->d[0] & 1))
+		{
+		BNerr(BN_F_BN_MOD_EXP_MONT_WORD,BN_R_CALLED_WITH_EVEN_MODULUS);
+		return(0);
+		}
+	if (m->top == 1)
+		a %= m->d[0]; /* make sure that 'a' is reduced */
+
+	bits = BN_num_bits(p);
+	if (bits == 0)
+		{
+		ret = BN_one(rr);
+		return ret;
+		}
+	if (a == 0)
+		{
+		ret = BN_zero(rr);
+		return ret;
+		}
+
+	BN_CTX_start(ctx);
+	d = BN_CTX_get(ctx);
+	r = BN_CTX_get(ctx);
+	t = BN_CTX_get(ctx);
+	if (d == NULL || r == NULL || t == NULL) goto err;
+
+	if (in_mont != NULL)
+		mont=in_mont;
+	else
+		{
+		if ((mont = BN_MONT_CTX_new()) == NULL) goto err;
+		if (!BN_MONT_CTX_set(mont, m, ctx)) goto err;
+		}
+
+	r_is_one = 1; /* except for Montgomery factor */
+
+	/* bits-1 >= 0 */
+
+	/* The result is accumulated in the product r*w. */
+	w = a; /* bit 'bits-1' of 'p' is always set */
+	for (b = bits-2; b >= 0; b--)
+		{
+		/* First, square r*w. */
+		next_w = w*w;
+		if ((next_w/w) != w) /* overflow */
+			{
+			if (r_is_one)
+				{
+				if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) goto err;
+				r_is_one = 0;
+				}
+			else
+				{
+				if (!BN_MOD_MUL_WORD(r, w, m)) goto err;
+				}
+			next_w = 1;
+			}
+		w = next_w;
+		if (!r_is_one)
+			{
+			if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) goto err;
+			}
+
+		/* Second, multiply r*w by 'a' if exponent bit is set. */
+		if (BN_is_bit_set(p, b))
+			{
+			next_w = w*a;
+			if ((next_w/a) != w) /* overflow */
+				{
+				if (r_is_one)
+					{
+					if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) goto err;
+					r_is_one = 0;
+					}
+				else
+					{
+					if (!BN_MOD_MUL_WORD(r, w, m)) goto err;
+					}
+				next_w = a;
+				}
+			w = next_w;
+			}
+		}
+
+	/* Finally, set r:=r*w. */
+	if (w != 1)
+		{
+		if (r_is_one)
+			{
+			if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) goto err;
+			r_is_one = 0;
+			}
+		else
+			{
+			if (!BN_MOD_MUL_WORD(r, w, m)) goto err;
+			}
+		}
+
+	if (r_is_one) /* can happen only if a == 1*/
+		{
+		if (!BN_one(rr)) goto err;
+		}
+	else
+		{
+		if (!BN_from_montgomery(rr, r, mont, ctx)) goto err;
+		}
+	ret = 1;
+err:
+	if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
+	BN_CTX_end(ctx);
+	return(ret);
+	}
+
 
 /* The old fallback, simple version :-) */
-int BN_mod_exp_simple(r,a,p,m,ctx)
-BIGNUM *r;
-BIGNUM *a;
-BIGNUM *p;
-BIGNUM *m;
-BN_CTX *ctx;
+int BN_mod_exp_simple(BIGNUM *r,
+	const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
+	BN_CTX *ctx)
 	{
-	int i,j,bits,ret=0,wstart,wend,window,wvalue;
+	int i,j,bits,ret=0,wstart,wend,window,wvalue,ts=0;
 	int start=1;
 	BIGNUM *d;
-	BIGNUM *val[16];
+	BIGNUM val[TABLE_SIZE];
 
-	d=ctx->bn[ctx->tos++];
 	bits=BN_num_bits(p);
 
 	if (bits == 0)
 		{
-		BN_one(r);
-		return(1);
+		ret = BN_one(r);
+		return ret;
 		}
 
-	val[0]=BN_new();
-	if (!BN_mod(val[0],a,m,ctx)) goto err;		/* 1 */
-	if (!BN_mod_mul(d,val[0],val[0],m,ctx))
-		goto err;				/* 2 */
-
-	if (bits <= 17) /* This is probably 3 or 0x10001, so just do singles */
-		window=1;
-	else if (bits >= 256)
-		window=5;	/* max size of window */
-	else if (bits >= 128)
-		window=4;
-	else
-		window=3;
+	BN_CTX_start(ctx);
+	if ((d = BN_CTX_get(ctx)) == NULL) goto err;
 
-	j=1<<(window-1);
-	for (i=1; i<j; i++)
+	BN_init(&(val[0]));
+	ts=1;
+	if (!BN_nnmod(&(val[0]),a,m,ctx)) goto err;		/* 1 */
+	if (BN_is_zero(&(val[0])))
 		{
-		val[i]=BN_new();
-		if (!BN_mod_mul(val[i],val[i-1],d,m,ctx))
-			goto err;
+		ret = BN_zero(r);
+		goto err;
+		}
+
+	window = BN_window_bits_for_exponent_size(bits);
+	if (window > 1)
+		{
+		if (!BN_mod_mul(d,&(val[0]),&(val[0]),m,ctx))
+			goto err;				/* 2 */
+		j=1<<(window-1);
+		for (i=1; i<j; i++)
+			{
+			BN_init(&(val[i]));
+			if (!BN_mod_mul(&(val[i]),&(val[i-1]),d,m,ctx))
+				goto err;
+			}
+		ts=i;
 		}
-	for (; i<16; i++)
-		val[i]=NULL;
 
 	start=1;	/* This is used to avoid multiplication etc
 			 * when there is only the value '1' in the
@@ -534,7 +728,7 @@ BN_CTX *ctx;
 				}
 		
 		/* wvalue will be an odd number < 2^window */
-		if (!BN_mod_mul(r,r,val[wvalue>>1],m,ctx))
+		if (!BN_mod_mul(r,r,&(val[wvalue>>1]),m,ctx))
 			goto err;
 
 		/* move the 'window' down further */
@@ -545,9 +739,9 @@ BN_CTX *ctx;
 		}
 	ret=1;
 err:
-	ctx->tos--;
-	for (i=0; i<16; i++)
-		if (val[i] != NULL) BN_clear_free(val[i]);
+	BN_CTX_end(ctx);
+	for (i=0; i<ts; i++)
+		BN_clear_free(&(val[i]));
 	return(ret);
 	}
 
diff --git a/src/lib/libcrypto/bn/bn_exp2.c b/src/lib/libcrypto/bn/bn_exp2.c
index 1132d53365..73ccd58a83 100644
--- a/src/lib/libcrypto/bn/bn_exp2.c
+++ b/src/lib/libcrypto/bn/bn_exp2.c
@@ -1,27 +1,129 @@
+/* crypto/bn/bn_exp2.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
 #include <stdio.h>
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-/* I've done some timing with different table sizes.
- * The main hassle is that even with bits set at 3, this requires
- * 63 BIGNUMs to store the pre-calculated values.
- *          512   1024 
- * bits=1  75.4%  79.4%
- * bits=2  61.2%  62.4%
- * bits=3  61.3%  59.3%
- * The lack of speed improvment is also a function of the pre-calculation
- * which could be removed.
- */
-#define EXP2_TABLE_BITS	2 /* 1  2  3  4  5  */
-#define EXP2_TABLE_SIZE	4 /* 2  4  8 16 32  */
+#define TABLE_SIZE	32
 
-int BN_mod_exp2_mont(BIGNUM *rr, BIGNUM *a1, BIGNUM *p1, BIGNUM *a2,
-	     BIGNUM *p2, BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
+int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1,
+	const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m,
+	BN_CTX *ctx, BN_MONT_CTX *in_mont)
 	{
-	int i,j,k,bits,bits1,bits2,ret=0,wstart,wend,window,xvalue,yvalue;
-	int start=1,ts=0,x,y;
-	BIGNUM *d,*aa1,*aa2,*r;
-	BIGNUM val[EXP2_TABLE_SIZE][EXP2_TABLE_SIZE];
+	int i,j,bits,b,bits1,bits2,ret=0,wpos1,wpos2,window1,window2,wvalue1,wvalue2;
+	int r_is_one=1,ts1=0,ts2=0;
+	BIGNUM *d,*r;
+	const BIGNUM *a_mod_m;
+	BIGNUM val1[TABLE_SIZE], val2[TABLE_SIZE];
 	BN_MONT_CTX *mont=NULL;
 
 	bn_check_top(a1);
@@ -32,22 +134,23 @@ int BN_mod_exp2_mont(BIGNUM *rr, BIGNUM *a1, BIGNUM *p1, BIGNUM *a2,
 
 	if (!(m->d[0] & 1))
 		{
-		BNerr(BN_F_BN_MOD_EXP_MONT,BN_R_CALLED_WITH_EVEN_MODULUS);
+		BNerr(BN_F_BN_MOD_EXP2_MONT,BN_R_CALLED_WITH_EVEN_MODULUS);
 		return(0);
 		}
-	d= &(ctx->bn[ctx->tos++]);
-	r= &(ctx->bn[ctx->tos++]);
 	bits1=BN_num_bits(p1);
 	bits2=BN_num_bits(p2);
 	if ((bits1 == 0) && (bits2 == 0))
 		{
-		BN_one(r);
-		return(1);
+		ret = BN_one(rr);
+		return ret;
 		}
+	
 	bits=(bits1 > bits2)?bits1:bits2;
 
-	/* If this is not done, things will break in the montgomery
-	 * part */
+	BN_CTX_start(ctx);
+	d = BN_CTX_get(ctx);
+	r = BN_CTX_get(ctx);
+	if (d == NULL || r == NULL) goto err;
 
 	if (in_mont != NULL)
 		mont=in_mont;
@@ -57,139 +160,154 @@ int BN_mod_exp2_mont(BIGNUM *rr, BIGNUM *a1, BIGNUM *p1, BIGNUM *a2,
 		if (!BN_MONT_CTX_set(mont,m,ctx)) goto err;
 		}
 
-	BN_init(&(val[0][0]));
-	BN_init(&(val[1][1]));
-	BN_init(&(val[0][1]));
-	BN_init(&(val[1][0]));
-	ts=1;
-	if (BN_ucmp(a1,m) >= 0)
+	window1 = BN_window_bits_for_exponent_size(bits1);
+	window2 = BN_window_bits_for_exponent_size(bits2);
+
+	/*
+	 * Build table for a1:   val1[i] := a1^(2*i + 1) mod m  for i = 0 .. 2^(window1-1) - 1
+	 */
+	BN_init(&val1[0]);
+	ts1=1;
+	if (a1->neg || BN_ucmp(a1,m) >= 0)
 		{
-		BN_mod(&(val[1][0]),a1,m,ctx);
-		aa1= &(val[1][0]);
+		if (!BN_mod(&(val1[0]),a1,m,ctx))
+			goto err;
+		a_mod_m = &(val1[0]);
 		}
 	else
-		aa1=a1;
-	if (BN_ucmp(a2,m) >= 0)
+		a_mod_m = a1;
+	if (BN_is_zero(a_mod_m))
 		{
-		BN_mod(&(val[0][1]),a2,m,ctx);
-		aa2= &(val[0][1]);
-		}
-	else
-		aa2=a2;
-	if (!BN_to_montgomery(&(val[1][0]),aa1,mont,ctx)) goto err;
-	if (!BN_to_montgomery(&(val[0][1]),aa2,mont,ctx)) goto err;
-	if (!BN_mod_mul_montgomery(&(val[1][1]),
-		&(val[1][0]),&(val[0][1]),mont,ctx))
+		ret = BN_zero(rr);
 		goto err;
+		}
 
-#if 0
-	if (bits <= 20) /* This is probably 3 or 0x10001, so just do singles */
-		window=1;
-	else if (bits > 250)
-		window=5;	/* max size of window */
-	else if (bits >= 120)
-		window=4;
-	else
-		window=3;
-#else
-	window=EXP2_TABLE_BITS;
-#endif
-
-	k=1<<window;
-	for (x=0; x<k; x++)
+	if (!BN_to_montgomery(&(val1[0]),a_mod_m,mont,ctx)) goto err;
+	if (window1 > 1)
 		{
-		if (x >= 2)
+		if (!BN_mod_mul_montgomery(d,&(val1[0]),&(val1[0]),mont,ctx)) goto err;
+
+		j=1<<(window1-1);
+		for (i=1; i<j; i++)
 			{
-			BN_init(&(val[x][0]));
-			BN_init(&(val[x][1]));
-			if (!BN_mod_mul_montgomery(&(val[x][0]),
-				&(val[1][0]),&(val[x-1][0]),mont,ctx)) goto err;
-			if (!BN_mod_mul_montgomery(&(val[x][1]),
-				&(val[1][0]),&(val[x-1][1]),mont,ctx)) goto err;
+			BN_init(&(val1[i]));
+			if (!BN_mod_mul_montgomery(&(val1[i]),&(val1[i-1]),d,mont,ctx))
+				goto err;
 			}
-		for (y=2; y<k; y++)
+		ts1=i;
+		}
+
+
+	/*
+	 * Build table for a2:   val2[i] := a2^(2*i + 1) mod m  for i = 0 .. 2^(window2-1) - 1
+	 */
+	BN_init(&val2[0]);
+	ts2=1;
+	if (a2->neg || BN_ucmp(a2,m) >= 0)
+		{
+		if (!BN_mod(&(val2[0]),a2,m,ctx))
+			goto err;
+		a_mod_m = &(val2[0]);
+		}
+	else
+		a_mod_m = a2;
+	if (BN_is_zero(a_mod_m))
+		{
+		ret = BN_zero(rr);
+		goto err;
+		}
+	if (!BN_to_montgomery(&(val2[0]),a_mod_m,mont,ctx)) goto err;
+	if (window2 > 1)
+		{
+		if (!BN_mod_mul_montgomery(d,&(val2[0]),&(val2[0]),mont,ctx)) goto err;
+
+		j=1<<(window2-1);
+		for (i=1; i<j; i++)
 			{
-			BN_init(&(val[x][y]));
-			if (!BN_mod_mul_montgomery(&(val[x][y]),
-				&(val[x][y-1]),&(val[0][1]),mont,ctx))
+			BN_init(&(val2[i]));
+			if (!BN_mod_mul_montgomery(&(val2[i]),&(val2[i-1]),d,mont,ctx))
 				goto err;
 			}
+		ts2=i;
 		}
-	ts=k;
-
-	start=1;	/* This is used to avoid multiplication etc
-			 * when there is only the value '1' in the
-			 * buffer. */
-	xvalue=0;	/* The 'x value' of the window */
-	yvalue=0;	/* The 'y value' of the window */
-	wstart=bits-1;	/* The top bit of the window */
-	wend=0;		/* The bottom bit of the window */
-
-        if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
-	for (;;)
+
+
+	/* Now compute the power product, using independent windows. */
+	r_is_one=1;
+	wvalue1=0;  /* The 'value' of the first window */
+	wvalue2=0;  /* The 'value' of the second window */
+	wpos1=0;    /* If wvalue1 > 0, the bottom bit of the first window */
+	wpos2=0;    /* If wvalue2 > 0, the bottom bit of the second window */
+
+	if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
+	for (b=bits-1; b>=0; b--)
 		{
-		xvalue=BN_is_bit_set(p1,wstart);
-		yvalue=BN_is_bit_set(p2,wstart);
-		if (!(xvalue || yvalue))
+		if (!r_is_one)
 			{
-			if (!start)
+			if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))
+				goto err;
+			}
+		
+		if (!wvalue1)
+			if (BN_is_bit_set(p1, b))
 				{
-				if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))
-					goto err;
+				/* consider bits b-window1+1 .. b for this window */
+				i = b-window1+1;
+				while (!BN_is_bit_set(p1, i)) /* works for i<0 */
+					i++;
+				wpos1 = i;
+				wvalue1 = 1;
+				for (i = b-1; i >= wpos1; i--)
+					{
+					wvalue1 <<= 1;
+					if (BN_is_bit_set(p1, i))
+						wvalue1++;
+					}
 				}
-			wstart--;
-			if (wstart < 0) break;
-			continue;
-			}
-		/* We now have wstart on a 'set' bit, we now need to work out
-		 * how bit a window to do.  To do this we need to scan
-		 * forward until the last set bit before the end of the
-		 * window */
-		j=wstart;
-		/* xvalue=BN_is_bit_set(p1,wstart); already set */
-		/* yvalue=BN_is_bit_set(p1,wstart); already set */
-		wend=0;
-		for (i=1; i<window; i++)
-			{
-			if (wstart-i < 0) break;
-			xvalue+=xvalue;
-			xvalue|=BN_is_bit_set(p1,wstart-i);
-			yvalue+=yvalue;
-			yvalue|=BN_is_bit_set(p2,wstart-i);
-			}
-
-		/* i is the size of the current window */
-		/* add the 'bytes above' */
-		if (!start)
-			for (j=0; j<i; j++)
+		
+		if (!wvalue2)
+			if (BN_is_bit_set(p2, b))
 				{
-				if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))
-					goto err;
+				/* consider bits b-window2+1 .. b for this window */
+				i = b-window2+1;
+				while (!BN_is_bit_set(p2, i))
+					i++;
+				wpos2 = i;
+				wvalue2 = 1;
+				for (i = b-1; i >= wpos2; i--)
+					{
+					wvalue2 <<= 1;
+					if (BN_is_bit_set(p2, i))
+						wvalue2++;
+					}
 				}
+
+		if (wvalue1 && b == wpos1)
+			{
+			/* wvalue1 is odd and < 2^window1 */
+			if (!BN_mod_mul_montgomery(r,r,&(val1[wvalue1>>1]),mont,ctx))
+				goto err;
+			wvalue1 = 0;
+			r_is_one = 0;
+			}
 		
-		/* wvalue will be an odd number < 2^window */
-		if (xvalue || yvalue)
+		if (wvalue2 && b == wpos2)
 			{
-			if (!BN_mod_mul_montgomery(r,r,&(val[xvalue][yvalue]),
-				mont,ctx)) goto err;
+			/* wvalue2 is odd and < 2^window2 */
+			if (!BN_mod_mul_montgomery(r,r,&(val2[wvalue2>>1]),mont,ctx))
+				goto err;
+			wvalue2 = 0;
+			r_is_one = 0;
 			}
-
-		/* move the 'window' down further */
-		wstart-=i;
-		start=0;
-		if (wstart < 0) break;
 		}
 	BN_from_montgomery(rr,r,mont,ctx);
 	ret=1;
 err:
 	if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
-	ctx->tos-=2;
-	for (i=0; i<ts; i++)
-		{
-		for (j=0; j<ts; j++)
-			{
-			BN_clear_free(&(val[i][j]));
-			}
-		}
+	BN_CTX_end(ctx);
+	for (i=0; i<ts1; i++)
+		BN_clear_free(&(val1[i]));
+	for (i=0; i<ts2; i++)
+		BN_clear_free(&(val2[i]));
 	return(ret);
 	}
diff --git a/src/lib/libcrypto/bn/bn_gcd.c b/src/lib/libcrypto/bn/bn_gcd.c
index 071bba3b4b..7649f63fd2 100644
--- a/src/lib/libcrypto/bn/bn_gcd.c
+++ b/src/lib/libcrypto/bn/bn_gcd.c
@@ -55,29 +55,82 @@
  * copied and put under another distribution licence
  * [including the GNU Public Licence.]
  */
+/* ====================================================================
+ * Copyright (c) 1998-2001 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
 
-#include <stdio.h>
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-#ifndef NOPROTO
 static BIGNUM *euclid(BIGNUM *a, BIGNUM *b);
-#else
-static BIGNUM *euclid();
-#endif
 
-int BN_gcd(r,in_a,in_b,ctx)
-BIGNUM *r,*in_a,*in_b;
-BN_CTX *ctx;
+int BN_gcd(BIGNUM *r, const BIGNUM *in_a, const BIGNUM *in_b, BN_CTX *ctx)
 	{
 	BIGNUM *a,*b,*t;
 	int ret=0;
 
-	a=ctx->bn[ctx->tos];
-	b=ctx->bn[ctx->tos+1];
+	bn_check_top(in_a);
+	bn_check_top(in_b);
+
+	BN_CTX_start(ctx);
+	a = BN_CTX_get(ctx);
+	b = BN_CTX_get(ctx);
+	if (a == NULL || b == NULL) goto err;
 
 	if (BN_copy(a,in_a) == NULL) goto err;
 	if (BN_copy(b,in_b) == NULL) goto err;
+	a->neg = 0;
+	b->neg = 0;
 
 	if (BN_cmp(a,b) < 0) { t=a; a=b; b=t; }
 	t=euclid(a,b);
@@ -86,19 +139,22 @@ BN_CTX *ctx;
 	if (BN_copy(r,t) == NULL) goto err;
 	ret=1;
 err:
+	BN_CTX_end(ctx);
 	return(ret);
 	}
 
-static BIGNUM *euclid(a,b)
-BIGNUM *a,*b;
+static BIGNUM *euclid(BIGNUM *a, BIGNUM *b)
 	{
 	BIGNUM *t;
 	int shifts=0;
 
-	for (;;)
+	bn_check_top(a);
+	bn_check_top(b);
+
+	/* 0 <= b <= a */
+	while (!BN_is_zero(b))
 		{
-		if (BN_is_zero(b))
-			break;
+		/* 0 < b <= a */
 
 		if (BN_is_odd(a))
 			{
@@ -131,7 +187,9 @@ BIGNUM *a,*b;
 				shifts++;
 				}
 			}
+		/* 0 <= b <= a */
 		}
+
 	if (shifts)
 		{
 		if (!BN_lshift(a,a,shifts)) goto err;
@@ -141,54 +199,284 @@ err:
 	return(NULL);
 	}
 
+
 /* solves ax == 1 (mod n) */
-BIGNUM *BN_mod_inverse(a, n, ctx)
-BIGNUM *a;
-BIGNUM *n;
-BN_CTX *ctx;
+BIGNUM *BN_mod_inverse(BIGNUM *in,
+	const BIGNUM *a, const BIGNUM *n, BN_CTX *ctx)
 	{
-	BIGNUM *A,*B,*X,*Y,*M,*D,*R;
-	BIGNUM *ret=NULL,*T;
+	BIGNUM *A,*B,*X,*Y,*M,*D,*T,*R=NULL;
+	BIGNUM *ret=NULL;
 	int sign;
 
-	A=ctx->bn[ctx->tos];
-	B=ctx->bn[ctx->tos+1];
-	X=ctx->bn[ctx->tos+2];
-	D=ctx->bn[ctx->tos+3];
-	M=ctx->bn[ctx->tos+4];
-	Y=ctx->bn[ctx->tos+5];
-	ctx->tos+=6;
-	R=BN_new();
+	bn_check_top(a);
+	bn_check_top(n);
+
+	BN_CTX_start(ctx);
+	A = BN_CTX_get(ctx);
+	B = BN_CTX_get(ctx);
+	X = BN_CTX_get(ctx);
+	D = BN_CTX_get(ctx);
+	M = BN_CTX_get(ctx);
+	Y = BN_CTX_get(ctx);
+	T = BN_CTX_get(ctx);
+	if (T == NULL) goto err;
+
+	if (in == NULL)
+		R=BN_new();
+	else
+		R=in;
 	if (R == NULL) goto err;
 
-	BN_zero(X);
-	BN_one(Y);
-	if (BN_copy(A,a) == NULL) goto err;
-	if (BN_copy(B,n) == NULL) goto err;
-	sign=1;
+	BN_one(X);
+	BN_zero(Y);
+	if (BN_copy(B,a) == NULL) goto err;
+	if (BN_copy(A,n) == NULL) goto err;
+	A->neg = 0;
+	if (B->neg || (BN_ucmp(B, A) >= 0))
+		{
+		if (!BN_nnmod(B, B, A, ctx)) goto err;
+		}
+	sign = -1;
+	/* From  B = a mod |n|,  A = |n|  it follows that
+	 *
+	 *      0 <= B < A,
+	 *     -sign*X*a  ==  B   (mod |n|),
+	 *      sign*Y*a  ==  A   (mod |n|).
+	 */
+
+	if (BN_is_odd(n) && (BN_num_bits(n) <= (BN_BITS <= 32 ? 450 : 2048)))
+		{
+		/* Binary inversion algorithm; requires odd modulus.
+		 * This is faster than the general algorithm if the modulus
+		 * is sufficiently small (about 400 .. 500 bits on 32-bit
+		 * systems, but much more on 64-bit systems) */
+		int shift;
+		
+		while (!BN_is_zero(B))
+			{
+			/*
+			 *      0 < B < |n|,
+			 *      0 < A <= |n|,
+			 * (1) -sign*X*a  ==  B   (mod |n|),
+			 * (2)  sign*Y*a  ==  A   (mod |n|)
+			 */
+
+			/* Now divide  B  by the maximum possible power of two in the integers,
+			 * and divide  X  by the same value mod |n|.
+			 * When we're done, (1) still holds. */
+			shift = 0;
+			while (!BN_is_bit_set(B, shift)) /* note that 0 < B */
+				{
+				shift++;
+				
+				if (BN_is_odd(X))
+					{
+					if (!BN_uadd(X, X, n)) goto err;
+					}
+				/* now X is even, so we can easily divide it by two */
+				if (!BN_rshift1(X, X)) goto err;
+				}
+			if (shift > 0)
+				{
+				if (!BN_rshift(B, B, shift)) goto err;
+				}
+
 
-	while (!BN_is_zero(B))
+			/* Same for  A  and  Y.  Afterwards, (2) still holds. */
+			shift = 0;
+			while (!BN_is_bit_set(A, shift)) /* note that 0 < A */
+				{
+				shift++;
+				
+				if (BN_is_odd(Y))
+					{
+					if (!BN_uadd(Y, Y, n)) goto err;
+					}
+				/* now Y is even */
+				if (!BN_rshift1(Y, Y)) goto err;
+				}
+			if (shift > 0)
+				{
+				if (!BN_rshift(A, A, shift)) goto err;
+				}
+
+			
+			/* We still have (1) and (2).
+			 * Both  A  and  B  are odd.
+			 * The following computations ensure that
+			 *
+			 *     0 <= B < |n|,
+			 *      0 < A < |n|,
+			 * (1) -sign*X*a  ==  B   (mod |n|),
+			 * (2)  sign*Y*a  ==  A   (mod |n|),
+			 *
+			 * and that either  A  or  B  is even in the next iteration.
+			 */
+			if (BN_ucmp(B, A) >= 0)
+				{
+				/* -sign*(X + Y)*a == B - A  (mod |n|) */
+				if (!BN_uadd(X, X, Y)) goto err;
+				/* NB: we could use BN_mod_add_quick(X, X, Y, n), but that
+				 * actually makes the algorithm slower */
+				if (!BN_usub(B, B, A)) goto err;
+				}
+			else
+				{
+				/*  sign*(X + Y)*a == A - B  (mod |n|) */
+				if (!BN_uadd(Y, Y, X)) goto err;
+				/* as above, BN_mod_add_quick(Y, Y, X, n) would slow things down */
+				if (!BN_usub(A, A, B)) goto err;
+				}
+			}
+		}
+	else
 		{
-		if (!BN_div(D,M,A,B,ctx)) goto err;
-		T=A;
-		A=B;
-		B=M;
-		/* T has a struct, M does not */
-
-		if (!BN_mul(T,D,X)) goto err;
-		if (!BN_add(T,T,Y)) goto err;
-		M=Y;
-		Y=X;
-		X=T;
-		sign= -sign;
+		/* general inversion algorithm */
+
+		while (!BN_is_zero(B))
+			{
+			BIGNUM *tmp;
+			
+			/*
+			 *      0 < B < A,
+			 * (*) -sign*X*a  ==  B   (mod |n|),
+			 *      sign*Y*a  ==  A   (mod |n|)
+			 */
+			
+			/* (D, M) := (A/B, A%B) ... */
+			if (BN_num_bits(A) == BN_num_bits(B))
+				{
+				if (!BN_one(D)) goto err;
+				if (!BN_sub(M,A,B)) goto err;
+				}
+			else if (BN_num_bits(A) == BN_num_bits(B) + 1)
+				{
+				/* A/B is 1, 2, or 3 */
+				if (!BN_lshift1(T,B)) goto err;
+				if (BN_ucmp(A,T) < 0)
+					{
+					/* A < 2*B, so D=1 */
+					if (!BN_one(D)) goto err;
+					if (!BN_sub(M,A,B)) goto err;
+					}
+				else
+					{
+					/* A >= 2*B, so D=2 or D=3 */
+					if (!BN_sub(M,A,T)) goto err;
+					if (!BN_add(D,T,B)) goto err; /* use D (:= 3*B) as temp */
+					if (BN_ucmp(A,D) < 0)
+						{
+						/* A < 3*B, so D=2 */
+						if (!BN_set_word(D,2)) goto err;
+						/* M (= A - 2*B) already has the correct value */
+						}
+					else
+						{
+						/* only D=3 remains */
+						if (!BN_set_word(D,3)) goto err;
+						/* currently  M = A - 2*B,  but we need  M = A - 3*B */
+						if (!BN_sub(M,M,B)) goto err;
+						}
+					}
+				}
+			else
+				{
+				if (!BN_div(D,M,A,B,ctx)) goto err;
+				}
+			
+			/* Now
+			 *      A = D*B + M;
+			 * thus we have
+			 * (**)  sign*Y*a  ==  D*B + M   (mod |n|).
+			 */
+			
+			tmp=A; /* keep the BIGNUM object, the value does not matter */
+			
+			/* (A, B) := (B, A mod B) ... */
+			A=B;
+			B=M;
+			/* ... so we have  0 <= B < A  again */
+			
+			/* Since the former  M  is now  B  and the former  B  is now  A,
+			 * (**) translates into
+			 *       sign*Y*a  ==  D*A + B    (mod |n|),
+			 * i.e.
+			 *       sign*Y*a - D*A  ==  B    (mod |n|).
+			 * Similarly, (*) translates into
+			 *      -sign*X*a  ==  A          (mod |n|).
+			 *
+			 * Thus,
+			 *   sign*Y*a + D*sign*X*a  ==  B  (mod |n|),
+			 * i.e.
+			 *        sign*(Y + D*X)*a  ==  B  (mod |n|).
+			 *
+			 * So if we set  (X, Y, sign) := (Y + D*X, X, -sign),  we arrive back at
+			 *      -sign*X*a  ==  B   (mod |n|),
+			 *       sign*Y*a  ==  A   (mod |n|).
+			 * Note that  X  and  Y  stay non-negative all the time.
+			 */
+			
+			/* most of the time D is very small, so we can optimize tmp := D*X+Y */
+			if (BN_is_one(D))
+				{
+				if (!BN_add(tmp,X,Y)) goto err;
+				}
+			else
+				{
+				if (BN_is_word(D,2))
+					{
+					if (!BN_lshift1(tmp,X)) goto err;
+					}
+				else if (BN_is_word(D,4))
+					{
+					if (!BN_lshift(tmp,X,2)) goto err;
+					}
+				else if (D->top == 1)
+					{
+					if (!BN_copy(tmp,X)) goto err;
+					if (!BN_mul_word(tmp,D->d[0])) goto err;
+					}
+				else
+					{
+					if (!BN_mul(tmp,D,X,ctx)) goto err;
+					}
+				if (!BN_add(tmp,tmp,Y)) goto err;
+				}
+			
+			M=Y; /* keep the BIGNUM object, the value does not matter */
+			Y=X;
+			X=tmp;
+			sign = -sign;
+			}
 		}
+		
+	/*
+	 * The while loop (Euclid's algorithm) ends when
+	 *      A == gcd(a,n);
+	 * we have
+	 *       sign*Y*a  ==  A  (mod |n|),
+	 * where  Y  is non-negative.
+	 */
+
 	if (sign < 0)
 		{
 		if (!BN_sub(Y,n,Y)) goto err;
 		}
+	/* Now  Y*a  ==  A  (mod |n|).  */
+	
 
 	if (BN_is_one(A))
-		{ if (!BN_mod(R,Y,n,ctx)) goto err; }
+		{
+		/* Y*a == 1  (mod |n|) */
+		if (!Y->neg && BN_ucmp(Y,n) < 0)
+			{
+			if (!BN_copy(R,Y)) goto err;
+			}
+		else
+			{
+			if (!BN_nnmod(R,Y,n,ctx)) goto err;
+			}
+		}
 	else
 		{
 		BNerr(BN_F_BN_MOD_INVERSE,BN_R_NO_INVERSE);
@@ -196,8 +484,7 @@ BN_CTX *ctx;
 		}
 	ret=R;
 err:
-	if ((ret == NULL) && (R != NULL)) BN_free(R);
-	ctx->tos-=6;
+	if ((ret == NULL) && (in == NULL)) BN_free(R);
+	BN_CTX_end(ctx);
 	return(ret);
 	}
-
diff --git a/src/lib/libcrypto/bn/bn_lcl.h b/src/lib/libcrypto/bn/bn_lcl.h
index edfd788338..8a4dba375a 100644
--- a/src/lib/libcrypto/bn/bn_lcl.h
+++ b/src/lib/libcrypto/bn/bn_lcl.h
@@ -55,30 +55,228 @@
  * copied and put under another distribution licence
  * [including the GNU Public Licence.]
  */
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
 
 #ifndef HEADER_BN_LCL_H
 #define HEADER_BN_LCL_H
 
-#include "bn.h"
+#include <openssl/bn.h>
 
 #ifdef  __cplusplus
 extern "C" {
 #endif
 
+
+/* Used for temp variables */
+#define BN_CTX_NUM	32
+#define BN_CTX_NUM_POS	12
+struct bignum_ctx
+	{
+	int tos;
+	BIGNUM bn[BN_CTX_NUM];
+	int flags;
+	int depth;
+	int pos[BN_CTX_NUM_POS];
+	int too_many;
+	} /* BN_CTX */;
+
+
+/*
+ * BN_window_bits_for_exponent_size -- macro for sliding window mod_exp functions
+ *
+ *
+ * For window size 'w' (w >= 2) and a random 'b' bits exponent,
+ * the number of multiplications is a constant plus on average
+ *
+ *    2^(w-1) + (b-w)/(w+1);
+ *
+ * here  2^(w-1)  is for precomputing the table (we actually need
+ * entries only for windows that have the lowest bit set), and
+ * (b-w)/(w+1)  is an approximation for the expected number of
+ * w-bit windows, not counting the first one.
+ *
+ * Thus we should use
+ *
+ *    w >= 6  if        b > 671
+ *     w = 5  if  671 > b > 239
+ *     w = 4  if  239 > b >  79
+ *     w = 3  if   79 > b >  23
+ *    w <= 2  if   23 > b
+ *
+ * (with draws in between).  Very small exponents are often selected
+ * with low Hamming weight, so we use  w = 1  for b <= 23.
+ */
+#if 1
+#define BN_window_bits_for_exponent_size(b) \
+		((b) > 671 ? 6 : \
+		 (b) > 239 ? 5 : \
+		 (b) >  79 ? 4 : \
+		 (b) >  23 ? 3 : 1)
+#else
+/* Old SSLeay/OpenSSL table.
+ * Maximum window size was 5, so this table differs for b==1024;
+ * but it coincides for other interesting values (b==160, b==512).
+ */
+#define BN_window_bits_for_exponent_size(b) \
+		((b) > 255 ? 5 : \
+		 (b) > 127 ? 4 : \
+		 (b) >  17 ? 3 : 1)
+#endif	 
+
+
+
+/* Pentium pro 16,16,16,32,64 */
+/* Alpha       16,16,16,16,64 */
+#define BN_MULL_SIZE_NORMAL			(16) /* 32 */
+#define BN_MUL_RECURSIVE_SIZE_NORMAL		(16) /* 32 less than */
+#define BN_SQR_RECURSIVE_SIZE_NORMAL		(16) /* 32 */
+#define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL	(32) /* 32 */
+#define BN_MONT_CTX_SET_SIZE_WORD		(64) /* 32 */
+
+#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) && !defined(PEDANTIC)
+/*
+ * BN_UMULT_HIGH section.
+ *
+ * No, I'm not trying to overwhelm you when stating that the
+ * product of N-bit numbers is 2*N bits wide:-) No, I don't expect
+ * you to be impressed when I say that if the compiler doesn't
+ * support 2*N integer type, then you have to replace every N*N
+ * multiplication with 4 (N/2)*(N/2) accompanied by some shifts
+ * and additions which unavoidably results in severe performance
+ * penalties. Of course provided that the hardware is capable of
+ * producing 2*N result... That's when you normally start
+ * considering assembler implementation. However! It should be
+ * pointed out that some CPUs (most notably Alpha, PowerPC and
+ * upcoming IA-64 family:-) provide *separate* instruction
+ * calculating the upper half of the product placing the result
+ * into a general purpose register. Now *if* the compiler supports
+ * inline assembler, then it's not impossible to implement the
+ * "bignum" routines (and have the compiler optimize 'em)
+ * exhibiting "native" performance in C. That's what BN_UMULT_HIGH
+ * macro is about:-)
+ *
+ *					<appro@fy.chalmers.se>
+ */
+# if defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
+#  if defined(__DECC)
+#   include <c_asm.h>
+#   define BN_UMULT_HIGH(a,b)	(BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
+#  elif defined(__GNUC__)
+#   define BN_UMULT_HIGH(a,b)	({	\
+	register BN_ULONG ret;		\
+	asm ("umulh	%1,%2,%0"	\
+	     : "=r"(ret)		\
+	     : "r"(a), "r"(b));		\
+	ret;			})
+#  endif	/* compiler */
+# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
+#  if defined(__GNUC__)
+#   define BN_UMULT_HIGH(a,b)	({	\
+	register BN_ULONG ret;		\
+	asm ("mulhdu	%0,%1,%2"	\
+	     : "=r"(ret)		\
+	     : "r"(a), "r"(b));		\
+	ret;			})
+#  endif	/* compiler */
+# endif		/* cpu */
+#endif		/* OPENSSL_NO_ASM */
+
 /*************************************************************
  * Using the long long type
  */
 #define Lw(t)    (((BN_ULONG)(t))&BN_MASK2)
 #define Hw(t)    (((BN_ULONG)((t)>>BN_BITS2))&BN_MASK2)
 
-#define bn_fix_top(a) \
-        { \
-        BN_ULONG *fix_top_l; \
-        for (fix_top_l= &((a)->d[(a)->top-1]); (a)->top > 0; (a)->top--) \
-		if (*(fix_top_l--)) break; \
+/* This is used for internal error checking and is not normally used */
+#ifdef BN_DEBUG
+# include <assert.h>
+# define bn_check_top(a) assert ((a)->top >= 0 && (a)->top <= (a)->dmax);
+#else
+# define bn_check_top(a)
+#endif
+
+/* Development aid: in BN_DEBUG builds, clamp dmax to top and mark r static */
+#ifdef BN_DEBUG
+#define	bn_set_max(r) ((r)->dmax=(r)->top,BN_set_flags((r),BN_FLG_STATIC_DATA))
+#else
+#define	bn_set_max(r)
+#endif
+
+/* These macros are used to 'take' a section of a bignum for read only use */
+#define bn_set_low(r,a,n) \
+	{ \
+	(r)->top=((a)->top > (n))?(n):(a)->top; \
+	(r)->d=(a)->d; \
+	(r)->neg=(a)->neg; \
+	(r)->flags|=BN_FLG_STATIC_DATA; \
+	bn_set_max(r); \
 	}
 
-/* #define bn_expand(n,b) ((((b)/BN_BITS2) <= (n)->max)?(n):bn_expand2((n),(b))) */
+#define bn_set_high(r,a,n) /* r := view of a's words from index n up */ \
+	{ \
+	if ((a)->top > (n)) \
+		{ \
+		(r)->top=(a)->top-(n); /* parenthesised: n may be an expression */ \
+		(r)->d= &((a)->d[(n)]); \
+		} \
+	else \
+		(r)->top=0; /* nothing above n; (r)->d is left untouched */ \
+	(r)->neg=(a)->neg; \
+	(r)->flags|=BN_FLG_STATIC_DATA; \
+	bn_set_max(r); \
+	}
 
 #ifdef BN_LLONG
 #define mul_add(r,a,w,c) { \
@@ -95,6 +293,43 @@ extern "C" {
 	(c)= Hw(t); \
 	}
 
+#define sqr(r0,r1,a) { \
+	BN_ULLONG t; \
+	t=(BN_ULLONG)(a)*(a); \
+	(r0)=Lw(t); \
+	(r1)=Hw(t); \
+	}
+
+#elif defined(BN_UMULT_HIGH)
+#define mul_add(r,a,w,c) {		\
+	BN_ULONG high,low,ret,tmp=(a);	\
+	ret =  (r);			\
+	high=  BN_UMULT_HIGH(w,tmp);	\
+	ret += (c);			\
+	low =  (w) * tmp;		\
+	(c) =  (ret<(c))?1:0;		\
+	(c) += high;			\
+	ret += low;			\
+	(c) += (ret<low)?1:0;		\
+	(r) =  ret;			\
+	}
+
+#define mul(r,a,w,c)	{		\
+	BN_ULONG high,low,ret,ta=(a);	\
+	low =  (w) * ta;		\
+	high=  BN_UMULT_HIGH(w,ta);	\
+	ret =  low + (c);		\
+	(c) =  high;			\
+	(c) += (ret<low)?1:0;		\
+	(r) =  ret;			\
+	}
+
+#define sqr(r0,r1,a)	{		\
+	BN_ULONG tmp=(a);		\
+	(r0) = tmp * tmp;		\
+	(r1) = BN_UMULT_HIGH(tmp,tmp);	\
+	}
+
 #else
 /*************************************************************
  * No long long type
@@ -172,25 +407,31 @@ extern "C" {
 	(c)=h&BN_MASK2; \
 	(r)=l&BN_MASK2; \
 	}
+#endif /* !BN_LLONG */
 
-#endif
-
-#ifndef NOPROTO
-
-BIGNUM *bn_expand2(BIGNUM *b, int bits);
-
-#ifdef X86_ASM
-void bn_add_words(BN_ULONG *r,BN_ULONG *a,int num);
-#endif
-
-#else
-
-BIGNUM *bn_expand2();
-#ifdef X86_ASM
-BN_ULONG bn_add_words();
-#endif
-
-#endif
+void bn_mul_normal(BN_ULONG *r,BN_ULONG *a,int na,BN_ULONG *b,int nb);
+void bn_mul_comba8(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
+void bn_mul_comba4(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
+void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp);
+void bn_sqr_comba8(BN_ULONG *r,const BN_ULONG *a);
+void bn_sqr_comba4(BN_ULONG *r,const BN_ULONG *a);
+int bn_cmp_words(const BN_ULONG *a,const BN_ULONG *b,int n);
+int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b,
+	int cl, int dl);
+void bn_mul_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,int n2,
+	int dna,int dnb,BN_ULONG *t);
+void bn_mul_part_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,
+	int n,int tna,int tnb,BN_ULONG *t);
+void bn_sqr_recursive(BN_ULONG *r,const BN_ULONG *a, int n2, BN_ULONG *t);
+void bn_mul_low_normal(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b, int n);
+void bn_mul_low_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,int n2,
+	BN_ULONG *t);
+void bn_mul_high(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,BN_ULONG *l,int n2,
+	BN_ULONG *t);
+BN_ULONG bn_add_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+	int cl, int dl);
+BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+	int cl, int dl);
 
 #ifdef  __cplusplus
 }
diff --git a/src/lib/libcrypto/bn/bn_lib.c b/src/lib/libcrypto/bn/bn_lib.c
index bfe7628ad4..a016cb7f53 100644
--- a/src/lib/libcrypto/bn/bn_lib.c
+++ b/src/lib/libcrypto/bn/bn_lib.c
@@ -56,13 +56,79 @@
  * [including the GNU Public Licence.]
  */
 
+#ifndef BN_DEBUG
+# undef NDEBUG /* avoid conflicting definitions */
+# define NDEBUG
+#endif
+
+#include <assert.h>
+#include <limits.h>
 #include <stdio.h>
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-char *BN_version="Big Number part of SSLeay 0.9.0b 29-Jun-1998";
+const char *BN_version="Big Number" OPENSSL_VERSION_PTEXT;
 
-BIGNUM *BN_value_one()
+/* For a 32 bit machine
+ * 2 -   4 ==  128
+ * 3 -   8 ==  256
+ * 4 -  16 ==  512
+ * 5 -  32 == 1024
+ * 6 -  64 == 2048
+ * 7 - 128 == 4096
+ * 8 - 256 == 8192
+ */
+static int bn_limit_bits=0;
+static int bn_limit_num=8;        /* (1<<bn_limit_bits) */
+static int bn_limit_bits_low=0;
+static int bn_limit_num_low=8;    /* (1<<bn_limit_bits_low) */
+static int bn_limit_bits_high=0;
+static int bn_limit_num_high=8;   /* (1<<bn_limit_bits_high) */
+static int bn_limit_bits_mont=0;
+static int bn_limit_num_mont=8;   /* (1<<bn_limit_bits_mont) */
+
+void BN_set_params(int mult, int high, int low, int mont)
+	{
+	if (mult >= 0)
+		{
+		if (mult > (sizeof(int)*8)-1)
+			mult=sizeof(int)*8-1;
+		bn_limit_bits=mult;
+		bn_limit_num=1<<mult;
+		}
+	if (high >= 0)
+		{
+		if (high > (sizeof(int)*8)-1)
+			high=sizeof(int)*8-1;
+		bn_limit_bits_high=high;
+		bn_limit_num_high=1<<high;
+		}
+	if (low >= 0)
+		{
+		if (low > (sizeof(int)*8)-1)
+			low=sizeof(int)*8-1;
+		bn_limit_bits_low=low;
+		bn_limit_num_low=1<<low;
+		}
+	if (mont >= 0)
+		{
+		if (mont > (sizeof(int)*8)-1)
+			mont=sizeof(int)*8-1;
+		bn_limit_bits_mont=mont;
+		bn_limit_num_mont=1<<mont;
+		}
+	}
+
+int BN_get_params(int which)
+	{
+	if      (which == 0) return(bn_limit_bits);
+	else if (which == 1) return(bn_limit_bits_high);
+	else if (which == 2) return(bn_limit_bits_low);
+	else if (which == 3) return(bn_limit_bits_mont);
+	else return(0);
+	}
+
+const BIGNUM *BN_value_one(void)
 	{
 	static BN_ULONG data_one=1L;
 	static BIGNUM const_one={&data_one,1,1,0};
@@ -70,7 +136,7 @@ BIGNUM *BN_value_one()
 	return(&const_one);
 	}
 
-char *BN_options()
+char *BN_options(void)
 	{
 	static int init=0;
 	static char data[16];
@@ -89,10 +155,9 @@ char *BN_options()
 	return(data);
 	}
 
-int BN_num_bits_word(l)
-BN_ULONG l;
+int BN_num_bits_word(BN_ULONG l)
 	{
-	static char bits[256]={
+	static const char bits[256]={
 		0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,
 		5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
 		6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
@@ -111,24 +176,24 @@ BN_ULONG l;
 		8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
 		};
 
-#ifdef SIXTY_FOUR_BIT_LONG
+#if defined(SIXTY_FOUR_BIT_LONG)
 	if (l & 0xffffffff00000000L)
 		{
 		if (l & 0xffff000000000000L)
 			{
 			if (l & 0xff00000000000000L)
 				{
-				return(bits[l>>56]+56);
+				return(bits[(int)(l>>56)]+56);
 				}
-			else	return(bits[l>>48]+48);
+			else	return(bits[(int)(l>>48)]+48);
 			}
 		else
 			{
 			if (l & 0x0000ff0000000000L)
 				{
-				return(bits[l>>40]+40);
+				return(bits[(int)(l>>40)]+40);
 				}
-			else	return(bits[l>>32]+32);
+			else	return(bits[(int)(l>>32)]+32);
 			}
 		}
 	else
@@ -140,17 +205,17 @@ BN_ULONG l;
 			{
 			if (l & 0xff00000000000000LL)
 				{
-				return(bits[l>>56]+56);
+				return(bits[(int)(l>>56)]+56);
 				}
-			else	return(bits[l>>48]+48);
+			else	return(bits[(int)(l>>48)]+48);
 			}
 		else
 			{
 			if (l & 0x0000ff0000000000LL)
 				{
-				return(bits[l>>40]+40);
+				return(bits[(int)(l>>40)]+40);
 				}
-			else	return(bits[l>>32]+32);
+			else	return(bits[(int)(l>>32)]+32);
 			}
 		}
 	else
@@ -161,161 +226,256 @@ BN_ULONG l;
 		if (l & 0xffff0000L)
 			{
 			if (l & 0xff000000L)
-				return(bits[l>>24L]+24);
-			else	return(bits[l>>16L]+16);
+				return(bits[(int)(l>>24L)]+24);
+			else	return(bits[(int)(l>>16L)]+16);
 			}
 		else
 #endif
 			{
 #if defined(SIXTEEN_BIT) || defined(THIRTY_TWO_BIT) || defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)
 			if (l & 0xff00L)
-				return(bits[l>>8]+8);
+				return(bits[(int)(l>>8)]+8);
 			else	
 #endif
-				return(bits[l   ]  );
+				return(bits[(int)(l   )]  );
 			}
 		}
 	}
 
-int BN_num_bits(a)
-BIGNUM *a;
+int BN_num_bits(const BIGNUM *a)
 	{
 	BN_ULONG l;
 	int i;
 
+	bn_check_top(a);
+
 	if (a->top == 0) return(0);
 	l=a->d[a->top-1];
+	assert(l != 0);
 	i=(a->top-1)*BN_BITS2;
-	if (l == 0)
-		{
-#if !defined(NO_STDIO) && !defined(WIN16)
-		fprintf(stderr,"BAD TOP VALUE\n");
-#endif
-		abort();
-		}
 	return(i+BN_num_bits_word(l));
 	}
 
-void BN_clear_free(a)
-BIGNUM *a;
+void BN_clear_free(BIGNUM *a)
 	{
+	int i;
+
 	if (a == NULL) return;
 	if (a->d != NULL)
 		{
-		memset(a->d,0,a->max*sizeof(a->d[0]));
-		Free(a->d);
+		memset(a->d,0,a->dmax*sizeof(a->d[0]));
+		if (!(BN_get_flags(a,BN_FLG_STATIC_DATA)))
+			OPENSSL_free(a->d);
 		}
+	i=BN_get_flags(a,BN_FLG_MALLOCED);
 	memset(a,0,sizeof(BIGNUM));
-	Free(a);
+	if (i)
+		OPENSSL_free(a);
 	}
 
-void BN_free(a)
-BIGNUM *a;
+void BN_free(BIGNUM *a)
 	{
 	if (a == NULL) return;
-	if (a->d != NULL) Free(a->d);
-	Free(a);
+	if ((a->d != NULL) && !(BN_get_flags(a,BN_FLG_STATIC_DATA)))
+		OPENSSL_free(a->d);
+	a->flags|=BN_FLG_FREE; /* REMOVE? */
+	if (a->flags & BN_FLG_MALLOCED)
+		OPENSSL_free(a);
 	}
 
-BIGNUM *BN_new()
+void BN_init(BIGNUM *a)
+	{
+	memset(a,0,sizeof(BIGNUM));
+	}
+
+BIGNUM *BN_new(void)
 	{
 	BIGNUM *ret;
-	BN_ULONG *p;
 
-	ret=(BIGNUM *)Malloc(sizeof(BIGNUM));
-	if (ret == NULL) goto err;
+	if ((ret=(BIGNUM *)OPENSSL_malloc(sizeof(BIGNUM))) == NULL)
+		{
+		BNerr(BN_F_BN_NEW,ERR_R_MALLOC_FAILURE);
+		return(NULL);
+		}
+	ret->flags=BN_FLG_MALLOCED;
 	ret->top=0;
 	ret->neg=0;
-	ret->max=(BN_DEFAULT_BITS/BN_BITS2);
-	p=(BN_ULONG *)Malloc(sizeof(BN_ULONG)*(ret->max+1));
-	if (p == NULL) goto err;
-	ret->d=p;
-
-	memset(p,0,(ret->max+1)*sizeof(p[0]));
+	ret->dmax=0;
+	ret->d=NULL;
 	return(ret);
-err:
-	BNerr(BN_F_BN_NEW,ERR_R_MALLOC_FAILURE);
-	return(NULL);
 	}
 
-BN_CTX *BN_CTX_new()
+/* This is used both by bn_expand2() and bn_dup_expand() */
+/* The caller MUST check that words > b->dmax before calling this */
+static BN_ULONG *bn_expand_internal(const BIGNUM *b, int words)
 	{
-	BN_CTX *ret;
-	BIGNUM *n;
-	int i,j;
+	BN_ULONG *A,*a = NULL;
+	const BN_ULONG *B;
+	int i;
 
-	ret=(BN_CTX *)Malloc(sizeof(BN_CTX));
-	if (ret == NULL) goto err2;
+	if (words > (INT_MAX/(4*BN_BITS2)))
+		{
+		BNerr(BN_F_BN_EXPAND_INTERNAL,BN_R_BIGNUM_TOO_LONG);
+		return NULL;
+		}
 
-	for (i=0; i<BN_CTX_NUM; i++)
+	bn_check_top(b);	
+	if (BN_get_flags(b,BN_FLG_STATIC_DATA))
 		{
-		n=BN_new();
-		if (n == NULL) goto err;
-		ret->bn[i]=n;
+		BNerr(BN_F_BN_EXPAND_INTERNAL,BN_R_EXPAND_ON_STATIC_BIGNUM_DATA);
+		return(NULL);
+		}
+	a=A=(BN_ULONG *)OPENSSL_malloc(sizeof(BN_ULONG)*(words+1));
+	if (A == NULL)
+		{
+		BNerr(BN_F_BN_EXPAND_INTERNAL,ERR_R_MALLOC_FAILURE);
+		return(NULL);
+		}
+#if 1
+	B=b->d;
+	/* Check if the previous number needs to be copied */
+	if (B != NULL)
+		{
+		for (i=b->top>>2; i>0; i--,A+=4,B+=4)
+			{
+			/*
+			 * The fact that the loop is unrolled
+			 * 4-wise is a tribute to Intel. It's
+			 * the one that doesn't have enough
+			 * registers to accommodate more data.
+			 * I'd unroll it 8-wise otherwise:-)
+			 *
+			 *		<appro@fy.chalmers.se>
+			 */
+			BN_ULONG a0,a1,a2,a3;
+			a0=B[0]; a1=B[1]; a2=B[2]; a3=B[3];
+			A[0]=a0; A[1]=a1; A[2]=a2; A[3]=a3;
+			}
+		switch (b->top&3)
+			{
+		case 3:	A[2]=B[2];
+		case 2:	A[1]=B[1];
+		case 1:	A[0]=B[0];
+		case 0: /* workaround for ultrix cc: without 'case 0', the optimizer does
+		         * the switch table by doing a=top&3; a--; goto jump_table[a];
+		         * which fails for top== 0 */
+			;
+			}
 		}
 
-	/* There is actually an extra one, this is for debugging my
-	 * stuff */
-	ret->bn[BN_CTX_NUM]=NULL;
+	/* Now need to zero any data between b->top and words */
+	/* XXX Why? */
 
-	ret->tos=0;
-	return(ret);
-err:
-	for (j=0; j<i; j++)
-		BN_free(ret->bn[j]);
-	Free(ret);
-err2:
-	BNerr(BN_F_BN_CTX_NEW,ERR_R_MALLOC_FAILURE);
-	return(NULL);
+	A= &(a[b->top]);
+	for (i=(words - b->top)>>3; i>0; i--,A+=8)
+		{
+		A[0]=0; A[1]=0; A[2]=0; A[3]=0;
+		A[4]=0; A[5]=0; A[6]=0; A[7]=0;
+		}
+	for (i=(words - b->top)&7; i>0; i--,A++)
+		A[0]=0;
+#else
+	memset(A,0,sizeof(BN_ULONG)*(words+1));
+	memcpy(A,b->d,sizeof(b->d[0])*b->top);
+#endif
+		
+	return(a);
 	}
 
-void BN_CTX_free(c)
-BN_CTX *c;
+/* This is an internal function that can be used instead of bn_expand2()
+ * when there is a need to copy BIGNUMs instead of only expanding the
+ * data part, while still expanding them.
+ * Especially useful when needing to expand BIGNUMs that are declared
+ * 'const' and should therefore not be changed.
+ * The reason to use this instead of a BN_dup() followed by a bn_expand2()
+ * is memory allocation overhead.  A BN_dup() followed by a bn_expand2()
+ * will allocate new memory for the BIGNUM data twice, and free it once,
+ * while bn_dup_expand() makes sure allocation is made only once.
+ */
+
+BIGNUM *bn_dup_expand(const BIGNUM *b, int words)
 	{
-	int i;
+	BIGNUM *r = NULL;
+
+	if (words > b->dmax)
+		{
+		BN_ULONG *a = bn_expand_internal(b, words);
 
-	for (i=0; i<BN_CTX_NUM; i++)
-		BN_clear_free(c->bn[i]);
-	Free(c);
+		if (a)
+			{
+			r = BN_new();
+			if (r)
+				{
+				r->top = b->top;
+				r->dmax = words;
+				r->neg = b->neg;
+				r->d = a;
+				}
+			else
+				{
+				/* r == NULL, BN_new failure */
+				OPENSSL_free(a);
+				}
+			}
+		/* If a == NULL, there was an error in allocation in
+		   bn_expand_internal(), and NULL should be returned */
+		}
+	else
+		{
+		r = BN_dup(b);
+		}
+
+	return r;
 	}
 
-BIGNUM *bn_expand2(b, words)
-BIGNUM *b;
-int words;
-	{
-	BN_ULONG *p;
+/* This is an internal function that should not be used in applications.
+ * It ensures that 'b' has enough room for a 'words' word number.
+ * It is mostly used by the various BIGNUM routines. If there is an error,
+ * NULL is returned. If not, 'b' is returned. */
 
-	if (words > b->max)
+BIGNUM *bn_expand2(BIGNUM *b, int words)
+	{
+	if (words > b->dmax)
 		{
-		p=(BN_ULONG *)Realloc(b->d,sizeof(BN_ULONG)*(words+1));
-		if (p == NULL)
+		BN_ULONG *a = bn_expand_internal(b, words);
+
+		if (a)
 			{
-			BNerr(BN_F_BN_EXPAND2,ERR_R_MALLOC_FAILURE);
-			return(NULL);
+			if (b->d)
+				OPENSSL_free(b->d);
+			b->d=a;
+			b->dmax=words;
 			}
-		b->d=p;
-		memset(&(p[b->max]),0,((words+1)-b->max)*sizeof(BN_ULONG));
-		b->max=words;
+		else
+			b = NULL;
 		}
-	return(b);
+	return b;
 	}
 
-BIGNUM *BN_dup(a)
-BIGNUM *a;
+BIGNUM *BN_dup(const BIGNUM *a)
 	{
-	BIGNUM *r;
+	BIGNUM *r, *t;
 
-	r=BN_new();
-	if (r == NULL) return(NULL);
-	return((BIGNUM *)BN_copy(r,a));
+	if (a == NULL) return NULL;
+
+	bn_check_top(a);
+
+	t = BN_new();
+	if (t == NULL) return(NULL);
+	r = BN_copy(t, a);
+	/* now  r == t || r == NULL */
+	if (r == NULL)
+		BN_free(t);
+	return r;
 	}
 
-BIGNUM *BN_copy(a, b)
-BIGNUM *a;
-BIGNUM *b;
+BIGNUM *BN_copy(BIGNUM *a, const BIGNUM *b)
 	{
 	int i;
-	BN_ULONG *A,*B;
+	BN_ULONG *A;
+	const BN_ULONG *B;
+
+	bn_check_top(b);
 
 	if (a == b) return(a);
 	if (bn_wexpand(a,b->top) == NULL) return(NULL);
@@ -323,35 +483,18 @@ BIGNUM *b;
 #if 1
 	A=a->d;
 	B=b->d;
-	for (i=b->top&(~7); i>0; i-=8)
-		{
-		A[0]=B[0];
-		A[1]=B[1];
-		A[2]=B[2];
-		A[3]=B[3];
-		A[4]=B[4];
-		A[5]=B[5];
-		A[6]=B[6];
-		A[7]=B[7];
-		A+=8;
-		B+=8;
-		}
-	switch (b->top&7)
-		{
-	case 7:
-		A[6]=B[6];
-	case 6:
-		A[5]=B[5];
-	case 5:
-		A[4]=B[4];
-	case 4:
-		A[3]=B[3];
-	case 3:
-		A[2]=B[2];
-	case 2:
-		A[1]=B[1];
-	case 1:
-		A[0]=B[0];
+	for (i=b->top>>2; i>0; i--,A+=4,B+=4)
+		{
+		BN_ULONG a0,a1,a2,a3;
+		a0=B[0]; a1=B[1]; a2=B[2]; a3=B[3];
+		A[0]=a0; A[1]=a1; A[2]=a2; A[3]=a3;
+		}
+	switch (b->top&3)
+		{
+		case 3: A[2]=B[2];
+		case 2: A[1]=B[1];
+		case 1: A[0]=B[0];
+		case 0: ; /* ultrix cc workaround, see comments in bn_expand_internal */
 		}
 #else
 	memcpy(a->d,b->d,sizeof(b->d[0])*b->top);
@@ -359,52 +502,76 @@ BIGNUM *b;
 
 /*	memset(&(a->d[b->top]),0,sizeof(a->d[0])*(a->max-b->top));*/
 	a->top=b->top;
-	if (a->top == 0)
+	if ((a->top == 0) && (a->d != NULL))
 		a->d[0]=0;
 	a->neg=b->neg;
 	return(a);
 	}
 
-void BN_clear(a)
-BIGNUM *a;
+void BN_swap(BIGNUM *a, BIGNUM *b)
+	{
+	/* Exchange the values of a and b by trading their d pointers and
+	 * top/dmax/neg fields; no word data is copied. */
+	const int a_flags = a->flags;
+	const int b_flags = b->flags;
+	BN_ULONG *words;
+	int n;
+
+	words = a->d;
+	a->d = b->d;
+	b->d = words;
+	n = a->top;
+	a->top = b->top;
+	b->top = n;
+	n = a->dmax;
+	a->dmax = b->dmax;
+	b->dmax = n;
+	n = a->neg;
+	a->neg = b->neg;
+	b->neg = n;
+
+	/* BN_FLG_MALLOCED stays with each BIGNUM object, BN_FLG_STATIC_DATA
+	 * follows the d array; every other flag bit is cleared. */
+	a->flags = (a_flags & BN_FLG_MALLOCED) | (b_flags & BN_FLG_STATIC_DATA);
+	b->flags = (b_flags & BN_FLG_MALLOCED) | (a_flags & BN_FLG_STATIC_DATA);
+	}
+
+
+void BN_clear(BIGNUM *a)
 	{
-	memset(a->d,0,a->max*sizeof(a->d[0]));
+	if (a->d != NULL)
+		memset(a->d,0,a->dmax*sizeof(a->d[0]));
 	a->top=0;
 	a->neg=0;
 	}
 
-unsigned long BN_get_word(a)
-BIGNUM *a;
+BN_ULONG BN_get_word(const BIGNUM *a)
 	{
 	int i,n;
-	unsigned long ret=0;
+	BN_ULONG ret=0;
 
 	n=BN_num_bytes(a);
-	if (n > sizeof(unsigned long))
-#ifdef SIXTY_FOUR_BIT_LONG
+	if (n > sizeof(BN_ULONG))
 		return(BN_MASK2);
-#else
-		return(0xFFFFFFFFL);
-#endif
 	for (i=a->top-1; i>=0; i--)
 		{
 #ifndef SIXTY_FOUR_BIT /* the data item > unsigned long */
 		ret<<=BN_BITS4; /* stops the compiler complaining */
 		ret<<=BN_BITS4;
+#else
+		ret=0;
 #endif
 		ret|=a->d[i];
 		}
 	return(ret);
 	}
 
-int BN_set_word(a,w)
-BIGNUM *a;
-unsigned long w;
+int BN_set_word(BIGNUM *a, BN_ULONG w)
 	{
 	int i,n;
-	if (bn_expand(a,sizeof(unsigned long)*8) == NULL) return(0);
+	if (bn_expand(a,sizeof(BN_ULONG)*8) == NULL) return(0);
 
-	n=sizeof(unsigned long)/BN_BYTES;
+	n=sizeof(BN_ULONG)/BN_BYTES;
 	a->neg=0;
 	a->top=0;
 	a->d[0]=(BN_ULONG)w&BN_MASK2;
@@ -417,6 +584,8 @@ unsigned long w;
 #ifndef SIXTY_FOUR_BIT /* the data item > unsigned long */
 		w>>=BN_BITS4;
 		w>>=BN_BITS4;
+#else
+		w=0;
 #endif
 		a->d[i]=(BN_ULONG)w&BN_MASK2;
 		if (a->d[i] != 0) a->top=i+1;
@@ -424,11 +593,7 @@ unsigned long w;
 	return(1);
 	}
 
-/* ignore negative */
-BIGNUM *BN_bin2bn(s, len, ret)
-unsigned char *s;
-int len;
-BIGNUM *ret;
+BIGNUM *BN_bin2bn(const unsigned char *s, int len, BIGNUM *ret)
 	{
 	unsigned int i,m;
 	unsigned int n;
@@ -448,6 +613,7 @@ BIGNUM *ret;
 	i=((n-1)/BN_BYTES)+1;
 	m=((n-1)%(BN_BYTES));
 	ret->top=i;
+	ret->neg=0;
 	while (n-- > 0)
 		{
 		l=(l<<8L)| *(s++);
@@ -465,9 +631,7 @@ BIGNUM *ret;
 	}
 
 /* ignore negative */
-int BN_bn2bin(a, to)
-BIGNUM *a;
-unsigned char *to;
+int BN_bn2bin(const BIGNUM *a, unsigned char *to)
 	{
 	int n,i;
 	BN_ULONG l;
@@ -481,13 +645,14 @@ unsigned char *to;
 	return(n);
 	}
 
-int BN_ucmp(a, b)
-BIGNUM *a;
-BIGNUM *b;
+int BN_ucmp(const BIGNUM *a, const BIGNUM *b)
 	{
 	int i;
 	BN_ULONG t1,t2,*ap,*bp;
 
+	bn_check_top(a);
+	bn_check_top(b);
+
 	i=a->top-b->top;
 	if (i != 0) return(i);
 	ap=a->d;
@@ -502,9 +667,7 @@ BIGNUM *b;
 	return(0);
 	}
 
-int BN_cmp(a, b)
-BIGNUM *a;
-BIGNUM *b;
+int BN_cmp(const BIGNUM *a, const BIGNUM *b)
 	{
 	int i;
 	int gt,lt;
@@ -519,6 +682,10 @@ BIGNUM *b;
 		else
 			return(0);
 		}
+
+	bn_check_top(a);
+	bn_check_top(b);
+
 	if (a->neg != b->neg)
 		{
 		if (a->neg)
@@ -541,27 +708,25 @@ BIGNUM *b;
 	return(0);
 	}
 
-int BN_set_bit(a, n)
-BIGNUM *a;
-int n;
+int BN_set_bit(BIGNUM *a, int n)
 	{
-	int i,j;
+	int i,j,k;
 
 	i=n/BN_BITS2;
 	j=n%BN_BITS2;
 	if (a->top <= i)
 		{
-		if (bn_expand(a,n) == NULL) return(0);
+		if (bn_wexpand(a,i+1) == NULL) return(0);
+		for(k=a->top; k<i+1; k++)
+			a->d[k]=0;
 		a->top=i+1;
 		}
 
-	a->d[i]|=(1L<<j);
+	a->d[i]|=(((BN_ULONG)1)<<j);
 	return(1);
 	}
 
-int BN_clear_bit(a, n)
-BIGNUM *a;
-int n;
+int BN_clear_bit(BIGNUM *a, int n)
 	{
 	int i,j;
 
@@ -569,13 +734,12 @@ int n;
 	j=n%BN_BITS2;
 	if (a->top <= i) return(0);
 
-	a->d[i]&=(~(1L<<j));
+	a->d[i]&=(~(((BN_ULONG)1)<<j));
+	bn_fix_top(a);
 	return(1);
 	}
 
-int BN_is_bit_set(a, n)
-BIGNUM *a;
-int n;
+int BN_is_bit_set(const BIGNUM *a, int n)
 	{
 	int i,j;
 
@@ -586,9 +750,7 @@ int n;
 	return((a->d[i]&(((BN_ULONG)1)<<j))?1:0);
 	}
 
-int BN_mask_bits(a,n)
-BIGNUM *a;
-int n;
+int BN_mask_bits(BIGNUM *a, int n)
 	{
 	int b,w;
 
@@ -601,11 +763,56 @@ int n;
 		{
 		a->top=w+1;
 		a->d[w]&= ~(BN_MASK2<<b);
-		while ((w >= 0) && (a->d[w] == 0))
+		}
+	bn_fix_top(a);
+	return(1);
+	}
+
+/* Compare the n-word arrays a and b, most significant word (index n-1)
+ * first.  Returns 1 if a > b, -1 if a < b and 0 if they are equal; for
+ * n <= 0 nothing is read and 0 is returned. */
+int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n)
+	{
+	int i;
+	BN_ULONG aa,bb;
+
+	for (i=n-1; i>=0; i--)
+		{
+		aa=a[i];
+		bb=b[i];
+		if (aa != bb) return((aa > bb)?1:-1);
+		}
+	return(0);
+	}
+
+/* Here follows a specialised variant of bn_cmp_words().  It has the
+   property of performing the operation on arrays of different sizes.
+   The sizes of those arrays are expressed through cl, which is the
+   common length ( basically, min(len(a),len(b)) ), and dl, which is the
+   delta between the two lengths, calculated as len(a)-len(b).
+   All lengths are the number of BN_ULONGs...  */
+
+int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b,
+	int cl, int dl)
+	{
+	int n,i;
+	n = cl-1;
+
+	if (dl < 0)
+		{
+		for (i=dl; i<0; i++)
 			{
-			a->top--;
-			w--;
+			if (b[n-i] != 0)
+				return -1; /* a < b */
 			}
 		}
-	return(1);
+	if (dl > 0)
+		{
+		for (i=dl; i>0; i--)
+			{
+			if (a[n+i] != 0)
+				return 1; /* a > b */
+			}
+		}
+	return bn_cmp_words(a,b,cl);
 	}
diff --git a/src/lib/libcrypto/bn/bn_mod.c b/src/lib/libcrypto/bn/bn_mod.c
index c351aac14f..5cf82480d7 100644
--- a/src/lib/libcrypto/bn/bn_mod.c
+++ b/src/lib/libcrypto/bn/bn_mod.c
@@ -1,4 +1,59 @@
 /* crypto/bn/bn_mod.c */
+/* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
+ * for the OpenSSL project. */
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
  * All rights reserved.
  *
@@ -56,42 +111,186 @@
  * [including the GNU Public Licence.]
  */
 
-#include <stdio.h>
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-/* rem != m */
-int BN_mod(rem, m, d,ctx)
-BIGNUM *rem;
-BIGNUM *m;
-BIGNUM *d;
-BN_CTX *ctx;
+
+#if 0 /* now just a #define */
+int BN_mod(BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx)
+	{
+	return(BN_div(NULL,rem,m,d,ctx));
+	/* note that  rem->neg == m->neg  (unless the remainder is zero) */
+	}
+#endif
+
+
+int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx)
+	{
+	/* Reduce m modulo d like BN_mod, then normalise the remainder so
+	 * that  0 <= r < |d|  always holds.  0 is returned if BN_mod fails. */
+
+	if (BN_mod(r,m,d,ctx) == 0)
+		return 0;
+	/* a negative r satisfies  -|d| < r < 0,  so  r := r + |d|  fixes it */
+	if (r->neg)
+		return d->neg ? BN_sub(r, r, d) : BN_add(r, r, d);
+	return 1;
+}
+
+
+int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, BN_CTX *ctx)
+	{
+	/* r := a + b, then reduced by BN_nnmod; 0 if the addition fails */
+	return BN_add(r, a, b) ? BN_nnmod(r, r, m, ctx) : 0;
+	}
+
+
+/* Faster BN_mod_add for callers that guarantee  0 <= a < m  and  0 <= b < m:
+ * one conditional subtraction of  m  replaces the full reduction. */
+int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m)
+	{
+	if (BN_add(r, a, b) == 0) return 0;
+	if (BN_ucmp(r, m) < 0)
+		return 1;
+	return BN_usub(r, r, m);
+	}
+
+
+int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, BN_CTX *ctx)
+	{
+	/* r := a - b, then reduced by BN_nnmod; 0 if the subtraction fails */
+	return BN_sub(r, a, b) ? BN_nnmod(r, r, m, ctx) : 0;
+	}
+
+
+/* Faster BN_mod_sub for callers that guarantee  0 <= a < m  and  0 <= b < m:
+ * a negative difference is corrected by adding  m  once. */
+int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m)
+	{
+	if (BN_sub(r, a, b) == 0) return 0;
+	if (!r->neg)
+		return 1;
+	return BN_add(r, r, m);
+	}
+
+
+/* slow but works: r = a*b reduced by BN_nnmod, via a full product held in a BN_CTX temporary; returns 1 on success, 0 on error */
+int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+	BN_CTX *ctx)
 	{
-#if 0 /* The old slow way */
-	int i,nm,nd;
-	BIGNUM *dv;
+	BIGNUM *t;	/* holds the unreduced product */
+	int ret=0;	/* stays 0 unless every step below succeeds */
+
+	bn_check_top(a);
+	bn_check_top(b);
+	bn_check_top(m);
+
+	BN_CTX_start(ctx);
+	if ((t = BN_CTX_get(ctx)) == NULL) goto err;
+	if (a == b)	/* same operand object: use the squaring routine */
+		{ if (!BN_sqr(t,a,ctx)) goto err; }
+	else
+		{ if (!BN_mul(t,a,b,ctx)) goto err; }
+	if (!BN_nnmod(r,t,m,ctx)) goto err;	/* non-negative remainder of t goes to r */
+	ret=1;
+err:
+	BN_CTX_end(ctx);	/* reached on success and on every goto err */
+	return(ret);
+	}
 
-	if (BN_ucmp(m,d) < 0)
-		return((BN_copy(rem,m) == NULL)?0:1);
 
-	dv=ctx->bn[ctx->tos];
+int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx)
+	{
+	/* A square is never negative (r->neg == 0), so plain BN_mod is
+	 * enough here and BN_nnmod is not needed; 0 if the squaring fails. */
+	return BN_sqr(r, a, ctx) ? BN_mod(r, r, m, ctx) : 0;
+	}
 
-	if (!BN_copy(rem,m)) return(0);
 
-	nm=BN_num_bits(rem);
-	nd=BN_num_bits(d);
-	if (!BN_lshift(dv,d,nm-nd)) return(0);
-	for (i=nm-nd; i>=0; i--)
+int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx)
+	{
+	/* r := 2*a, then reduced by BN_nnmod; 0 if the shift fails */
+	return BN_lshift1(r, a) ? BN_nnmod(r, r, m, ctx) : 0;
+	}
+
+
+/* Faster BN_mod_lshift1 for callers that guarantee  0 <= a < m:
+ * then  2*a < 2*m,  so at most one subtraction of  m  is required. */
+int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m)
+	{
+	if (BN_lshift1(r, a) == 0) return 0;
+	if (BN_cmp(r, m) < 0)
+		return 1;
+	return BN_sub(r, r, m);
+	}
+
+
+int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m, BN_CTX *ctx)
+	{
+	BIGNUM *abs_m = NULL;	/* copy of m with the sign cleared; only made when m is negative */
+	int ret;
+	/* r = (a << n) reduced modulo |m|; returns 1 on success, 0 on error */
+	if (!BN_nnmod(r, a, m, ctx)) return 0;	/* reduce a first: the quick variant expects 0 <= r < m */
+
+	if (m->neg)
 		{
-		if (BN_cmp(rem,dv) >= 0)
+		abs_m = BN_dup(m);
+		if (abs_m == NULL) return 0;
+		abs_m->neg = 0;
+		}
+	
+	ret = BN_mod_lshift_quick(r, r, n, (abs_m ? abs_m : m));
+
+	if (abs_m)	/* release the private copy, if one was made */
+		BN_free(abs_m);
+	return ret;
+	}
+
+
+/* BN_mod_lshift variant that may be used if  a  is non-negative
+ * and less than  m.  r = (a << n) mod m; returns 1 on success, 0 on error. */
+int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m)
+	{
+	if (r != a)
+		{
+		if (BN_copy(r, a) == NULL) return 0;
+		}
+	/* shift in chunks, reducing after each chunk so r stays below m */
+	while (n > 0)
+		{
+		int max_shift;
+		
+		/* expected on entry to each round: 0 <= r < m */
+		max_shift = BN_num_bits(m) - BN_num_bits(r);
+		/* max_shift >= 0 whenever r < m; a negative value means the input was not reduced */
+
+		if (max_shift < 0)
+			{
+			BNerr(BN_F_BN_MOD_LSHIFT_QUICK, BN_R_INPUT_NOT_REDUCED);
+			return 0;
+			}
+
+		if (max_shift > n)	/* never shift by more than the remaining count */
+			max_shift = n;
+
+		if (max_shift)
+			{
+			if (!BN_lshift(r, r, max_shift)) return 0;
+			n -= max_shift;
+			}
+		else	/* r already has as many bits as m: advance by a single bit */
+			{
+			if (!BN_lshift1(r, r)) return 0;
+			--n;
+			}
+
+		/* given r < m before the shift, r < 2*m now, so one subtraction suffices */
+
+		if (BN_cmp(r, m) >= 0) 
 			{
-			if (!BN_sub(rem,rem,dv)) return(0);
+			if (!BN_sub(r, r, m)) return 0;
 			}
-		if (!BN_rshift1(dv,dv)) return(0);
 		}
-	return(1);
-#else
-	return(BN_div(NULL,rem,m,d,ctx));
-#endif
+	
+	return 1;
 	}
-
diff --git a/src/lib/libcrypto/bn/bn_mont.c b/src/lib/libcrypto/bn/bn_mont.c
index e435df61f8..c9ebdbaabe 100644
--- a/src/lib/libcrypto/bn/bn_mont.c
+++ b/src/lib/libcrypto/bn/bn_mont.c
@@ -56,59 +56,67 @@
  * [including the GNU Public Licence.]
  */
 
+/*
+ * Details about Montgomery multiplication algorithms can be found at
+ * http://security.ece.orst.edu/publications.html, e.g.
+ * http://security.ece.orst.edu/koc/papers/j37acmon.pdf and
+ * sections 3.8 and 4.2 in http://security.ece.orst.edu/koc/papers/r01rsasw.pdf
+ */
+
 #include <stdio.h>
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-int BN_mod_mul_montgomery(r,a,b,mont,ctx)
-BIGNUM *r,*a,*b;
-BN_MONT_CTX *mont;
-BN_CTX *ctx;
+#define MONT_WORD /* use the faster word-based algorithm */
+
+int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+			  BN_MONT_CTX *mont, BN_CTX *ctx)	/* r = BN_from_montgomery(a*b); returns 1 on success, 0 on error */
 	{
 	BIGNUM *tmp;
+	int ret=0;	/* stays 0 unless every step below succeeds */
 
-        tmp=ctx->bn[ctx->tos++];
+	BN_CTX_start(ctx);
+	tmp = BN_CTX_get(ctx);	/* temporary for the unreduced product */
+	if (tmp == NULL) goto err;
 
+	bn_check_top(tmp);
 	if (a == b)
 		{
 		if (!BN_sqr(tmp,a,ctx)) goto err;
 		}
 	else
 		{
-		if (!BN_mul(tmp,a,b)) goto err;
+		if (!BN_mul(tmp,a,b,ctx)) goto err;
 		}
 	/* reduce from aRR to aR */
 	if (!BN_from_montgomery(r,tmp,mont,ctx)) goto err;
-	ctx->tos--;
-	return(1);
+	ret=1;
 err:
-	return(0);
+	BN_CTX_end(ctx);	/* reached on success and on every goto err */
+	return(ret);
 	}
 
-#define MONT_WORD
+int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
+	     BN_CTX *ctx)
+	{
+	int retn=0;
 
 #ifdef MONT_WORD
-int BN_from_montgomery(ret,a,mont,ctx)
-BIGNUM *ret;
-BIGNUM *a;
-BN_MONT_CTX *mont;
-BN_CTX *ctx;
-	{
-	BIGNUM *n,*t1,*r;
-	BN_ULONG *ap,*np,*rp,n0,v;
+	BIGNUM *n,*r;
+	BN_ULONG *ap,*np,*rp,n0,v,*nrp;
 	int al,nl,max,i,x,ri;
-	int retn=0;
 
-	t1=ctx->bn[ctx->tos];
-	r=ctx->bn[ctx->tos+1];
+	BN_CTX_start(ctx);
+	if ((r = BN_CTX_get(ctx)) == NULL) goto err;
 
 	if (!BN_copy(r,a)) goto err;
-	n=mont->N;
+	n= &(mont->N);
 
 	ap=a->d;
-	/* mont->ri is the size of mont->N in bits/words */
+	/* mont->ri is the size of mont->N in bits (rounded up
+	   to the word size) */
 	al=ri=mont->ri/BN_BITS2;
-
+	
 	nl=n->top;
 	if ((al == 0) || (nl == 0)) { r->top=0; return(1); }
 
@@ -119,6 +127,7 @@ BN_CTX *ctx;
 	r->neg=a->neg^n->neg;
 	np=n->d;
 	rp=r->d;
+	nrp= &(r->d[nl]);
 
 	/* clear the top words of T */
 #if 1
@@ -131,176 +140,210 @@ BN_CTX *ctx;
 	r->top=max;
 	n0=mont->n0;
 
+#ifdef BN_COUNT
+	fprintf(stderr,"word BN_from_montgomery %d * %d\n",nl,nl);
+#endif
 	for (i=0; i<nl; i++)
 		{
-#if 0
-		int x1,x2;
-
-		if (i+4 > nl)
-			{
-			x2=nl;
-			x1=0;
-			}
-		else
-			{
-			x2=i+4;
-			x1=nl-x2;
-			}
-		v=bn_mul_add_words(&(rp[x1]),&(np[x1]),x2,(rp[x1]*n0)&BN_MASK2);
+#ifdef __TANDEM
+                {
+                   long long t1;
+                   long long t2;
+                   long long t3;
+                   t1 = rp[0] * (n0 & 0177777);
+                   t2 = 037777600000l;
+                   t2 = n0 & t2;
+                   t3 = rp[0] & 0177777;
+                   t2 = (t3 * t2) & BN_MASK2;
+                   t1 = t1 + t2;
+                   v=bn_mul_add_words(rp,np,nl,(BN_ULONG) t1);
+                }
 #else
 		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
 #endif
-
-		if (((rp[nl]+=v)&BN_MASK2) < v)
+		nrp++;
+		rp++;
+		if (((nrp[-1]+=v)&BN_MASK2) >= v)
+			continue;
+		else
 			{
-			for (x=(nl+1); (((++rp[x])&BN_MASK2) == 0); x++)
-				;
+			if (((++nrp[0])&BN_MASK2) != 0) continue;
+			if (((++nrp[1])&BN_MASK2) != 0) continue;
+			for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ;
 			}
-		rp++;
 		}
-	while (r->d[r->top-1] == 0)
-		r->top--;
-
+	bn_fix_top(r);
+	
 	/* mont->ri will be a multiple of the word size */
 #if 0
 	BN_rshift(ret,r,mont->ri);
 #else
-	ap=r->d;
-	rp=ret->d;
+	ret->neg = r->neg;
 	x=ri;
-	al=r->top-x;
-	for (i=0; i<al; i++)
-		{
-		rp[i]=ap[i+x];
-		}
+	rp=ret->d;
+	ap= &(r->d[x]);
+	if (r->top < x)
+		al=0;
+	else
+		al=r->top-x;
 	ret->top=al;
-#endif
-
-	if (BN_ucmp(ret,mont->N) >= 0)
+	al-=4;
+	for (i=0; i<al; i+=4)
 		{
-		bn_qsub(ret,ret,mont->N); /* XXX */
+		BN_ULONG t1,t2,t3,t4;
+		
+		t1=ap[i+0];
+		t2=ap[i+1];
+		t3=ap[i+2];
+		t4=ap[i+3];
+		rp[i+0]=t1;
+		rp[i+1]=t2;
+		rp[i+2]=t3;
+		rp[i+3]=t4;
 		}
-	retn=1;
-err:
-	return(retn);
-	}
-#else
-int BN_from_montgomery(r,a,mont,ctx)
-BIGNUM *r;
-BIGNUM *a;
-BN_MONT_CTX *mont;
-BN_CTX *ctx;
-	{
+	al+=4;
+	for (; i<al; i++)
+		rp[i]=ap[i];
+#endif
+#else /* !MONT_WORD */ 
 	BIGNUM *t1,*t2;
 
-	t1=ctx->bn[ctx->tos];
-	t2=ctx->bn[ctx->tos+1];
-
+	BN_CTX_start(ctx);
+	t1 = BN_CTX_get(ctx);
+	t2 = BN_CTX_get(ctx);
+	if (t1 == NULL || t2 == NULL) goto err;
+	
 	if (!BN_copy(t1,a)) goto err;
-	/* can cheat */
 	BN_mask_bits(t1,mont->ri);
 
-	if (!BN_mul(t2,t1,mont->Ni)) goto err;
+	if (!BN_mul(t2,t1,&mont->Ni,ctx)) goto err;
 	BN_mask_bits(t2,mont->ri);
 
-	if (!BN_mul(t1,t2,mont->N)) goto err;
+	if (!BN_mul(t1,t2,&mont->N,ctx)) goto err;
 	if (!BN_add(t2,a,t1)) goto err;
-	BN_rshift(r,t2,mont->ri);
+	if (!BN_rshift(ret,t2,mont->ri)) goto err;
+#endif /* MONT_WORD */
 
-	if (BN_ucmp(r,mont->N) >= 0)
-		bn_qsub(r,r,mont->N);
-
-	return(1);
-err:
-	return(0);
+	if (BN_ucmp(ret, &(mont->N)) >= 0)
+		{
+		if (!BN_usub(ret,ret,&(mont->N))) goto err;
+		}
+	retn=1;
+ err:
+	BN_CTX_end(ctx);
+	return(retn);
 	}
-#endif
 
-BN_MONT_CTX *BN_MONT_CTX_new()
+BN_MONT_CTX *BN_MONT_CTX_new(void)	/* allocate and initialise a context; NULL if allocation fails */
 	{
 	BN_MONT_CTX *ret;
 
-	if ((ret=(BN_MONT_CTX *)Malloc(sizeof(BN_MONT_CTX))) == NULL)
-		return(NULL);
-	ret->ri=0;
-	ret->RR=BN_new();
-	ret->N=BN_new();
-	ret->Ni=NULL;
-	if ((ret->RR == NULL) || (ret->N == NULL))
-		{
-		BN_MONT_CTX_free(ret);
+	if ((ret=(BN_MONT_CTX *)OPENSSL_malloc(sizeof(BN_MONT_CTX))) == NULL)
 		return(NULL);
-		}
+	/* BN_MONT_CTX_init clears flags, so the MALLOCED mark is set afterwards */
+	BN_MONT_CTX_init(ret);
+	ret->flags=BN_FLG_MALLOCED;	/* tells BN_MONT_CTX_free to release the struct itself */
 	return(ret);
 	}
 
-void BN_MONT_CTX_free(mont)
-BN_MONT_CTX *mont;
+void BN_MONT_CTX_init(BN_MONT_CTX *ctx)	/* reset ri and flags to 0 and BN_init the embedded RR, N and Ni */
 	{
-	if (mont->RR != NULL) BN_free(mont->RR);
-	if (mont->N != NULL) BN_free(mont->N);
-	if (mont->Ni != NULL) BN_free(mont->Ni);
-	Free(mont);
+	ctx->ri=0;
+	BN_init(&(ctx->RR));
+	BN_init(&(ctx->N));
+	BN_init(&(ctx->Ni));
+	ctx->flags=0;	/* in particular BN_FLG_MALLOCED is not set here */
 	}
 
-int BN_MONT_CTX_set(mont,mod,ctx)
-BN_MONT_CTX *mont;
-BIGNUM *mod;
-BN_CTX *ctx;
+void BN_MONT_CTX_free(BN_MONT_CTX *mont)	/* release RR, N and Ni; the struct itself only if flagged BN_FLG_MALLOCED */
 	{
-	BIGNUM *Ri=NULL,*R=NULL;
+	if(mont == NULL)	/* a NULL context is accepted and ignored */
+	    return;
+
+	BN_free(&(mont->RR));
+	BN_free(&(mont->N));
+	BN_free(&(mont->Ni));
+	if (mont->flags & BN_FLG_MALLOCED)	/* set by BN_MONT_CTX_new */
+		OPENSSL_free(mont);
+	}
 
-	if (mont->RR == NULL) mont->RR=BN_new();
-	if (mont->N == NULL)  mont->N=BN_new();
+int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
+	{	/* derive N, ri, n0 (or Ni) and RR of mont from mod; returns 1 on success, 0 on error */
+	BIGNUM Ri,*R;	/* Ri: scratch for R^-1 and the values derived from it */
 
-	if (mont->RR == NULL) mont->RR=BN_new();
-	if (mont->N == NULL)  mont->N=BN_new();
+	BN_init(&Ri);	/* NOTE(review): the err: exit does not release storage Ri may have acquired */
+	R= &(mont->RR);					/* grab RR as a temp */
+	if (!BN_copy(&(mont->N),mod)) goto err;		/* Set N; a failed copy must not be ignored */
+	mont->N.neg = 0;				/* the stored modulus is always |mod| */
 
 #ifdef MONT_WORD
-{
-	BIGNUM tmod;
-	BN_ULONG buf[2];
-	/* int z; */
-
-	mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2;
-	BN_lshift(R,BN_value_one(),BN_BITS2);		/* R */
-	/* I was bad, this modification of a passed variable was
-	 * breaking the multithreaded stuff :-(
-	 * z=mod->top;
-	 * mod->top=1; */
-
-	buf[0]=mod->d[0];
-	buf[1]=0;
-	tmod.d=buf;
-	tmod.top=1;
-	tmod.max=mod->max;
-	tmod.neg=mod->neg;
-
-	if ((Ri=BN_mod_inverse(R,&tmod,ctx)) == NULL) goto err; /* Ri */
-	BN_lshift(Ri,Ri,BN_BITS2);			/* R*Ri */
-	bn_qsub(Ri,Ri,BN_value_one());			/* R*Ri - 1 */
-	BN_div(Ri,NULL,Ri,&tmod,ctx);
-	mont->n0=Ri->d[0];
-	BN_free(Ri);
-	/* mod->top=z; */
-}
-#else
-	mont->ri=BN_num_bits(mod);
-	BN_lshift(R,BN_value_one(),mont->ri);			/* R */
-	if ((Ri=BN_mod_inverse(R,mod,ctx)) == NULL) goto err;	/* Ri */
-	BN_lshift(Ri,Ri,mont->ri);				/* R*Ri */
-	bn_qsub(Ri,Ri,BN_value_one());				/* R*Ri - 1 */
-	BN_div(Ri,NULL,Ri,mod,ctx);
-	if (mont->Ni != NULL) BN_free(mont->Ni);
-	mont->Ni=Ri;					/* Ni=(R*Ri-1)/N */
+		{
+		BIGNUM tmod;	/* one-word stand-in for the modulus, backed by buf */
+		BN_ULONG buf[2];
+		/* ri = bit length of mod rounded up to a multiple of BN_BITS2 */
+		mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2;
+		if (!(BN_zero(R))) goto err;
+		if (!(BN_set_bit(R,BN_BITS2))) goto err;	/* R = 2^BN_BITS2 */
+
+		buf[0]=mod->d[0]; /* tmod = N mod word size; NOTE(review): assumes mod has at least one word */
+		buf[1]=0;
+		tmod.d=buf;
+		tmod.top=1;
+		tmod.dmax=2;
+		tmod.neg=0;
+							/* Ri = R^-1 mod tmod */
+		if ((BN_mod_inverse(&Ri,R,&tmod,ctx)) == NULL)
+			goto err;
+		if (!BN_lshift(&Ri,&Ri,BN_BITS2)) goto err; /* R*Ri */
+		if (!BN_is_zero(&Ri))
+			{
+			if (!BN_sub_word(&Ri,1)) goto err;	/* R*Ri - 1 */
+			}
+		else /* if N mod word size == 1 */
+			{
+			if (!BN_set_word(&Ri,BN_MASK2)) goto err;  /* Ri-- (mod word size) */
+			}
+		if (!BN_div(&Ri,NULL,&Ri,&tmod,ctx)) goto err;
+		/* Ni = (R*Ri-1)/N,
+		 * keep only least significant word: */
+		mont->n0 = (Ri.top > 0) ? Ri.d[0] : 0;
+		BN_free(&Ri);
+		}
+#else /* !MONT_WORD */
+		{ /* bignum version */
+		mont->ri=BN_num_bits(&mont->N);
+		if (!BN_zero(R)) goto err;
+		if (!BN_set_bit(R,mont->ri)) goto err;  /* R = 2^ri */
+		                                        /* Ri = R^-1 mod N */
+		if ((BN_mod_inverse(&Ri,R,&mont->N,ctx)) == NULL)
+			goto err;
+		if (!BN_lshift(&Ri,&Ri,mont->ri)) goto err; /* R*Ri */
+		if (!BN_sub_word(&Ri,1)) goto err;	/* R*Ri - 1 */
+							/* Ni = (R*Ri-1) / N */
+		if (!BN_div(&(mont->Ni),NULL,&Ri,&mont->N,ctx)) goto err;
+		BN_free(&Ri);
+		}
 #endif
 
 	/* setup RR for conversions */
-	BN_lshift(mont->RR,BN_value_one(),mont->ri*2);
-	BN_mod(mont->RR,mont->RR,mont->N,ctx);
+	if (!BN_zero(&(mont->RR))) goto err;
+	if (!BN_set_bit(&(mont->RR),mont->ri*2)) goto err;	/* RR = 2^(2*ri) ... */
+	if (!BN_mod(&(mont->RR),&(mont->RR),&(mont->N),ctx)) goto err;	/* ... reduced mod N */
 
 	return(1);
 err:
 	return(0);
 	}
 
+BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from)
+	{
+	/* Copies RR, N, Ni, ri and n0 (flags are left alone); NULL on failure. */
+	if (to == from) return to;
+	if (BN_copy(&to->RR, &from->RR) == NULL
+	    || BN_copy(&to->N, &from->N) == NULL
+	    || BN_copy(&to->Ni, &from->Ni) == NULL) return NULL;
+	to->ri = from->ri;
+	to->n0 = from->n0;
+	return to;
+	}
+
diff --git a/src/lib/libcrypto/bn/bn_mpi.c b/src/lib/libcrypto/bn/bn_mpi.c
index 53945c1057..05fa9d1e9a 100644
--- a/src/lib/libcrypto/bn/bn_mpi.c
+++ b/src/lib/libcrypto/bn/bn_mpi.c
@@ -60,9 +60,7 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-int BN_bn2mpi(a,d)
-BIGNUM *a;
-unsigned char *d;
+int BN_bn2mpi(const BIGNUM *a, unsigned char *d)
 	{
 	int bits;
 	int num=0;
@@ -90,10 +88,7 @@ unsigned char *d;
 	return(num+4+ext);
 	}
 
-BIGNUM *BN_mpi2bn(d,n,a)
-unsigned char *d;
-int n;
-BIGNUM *a;
+BIGNUM *BN_mpi2bn(const unsigned char *d, int n, BIGNUM *a)
 	{
 	long len;
 	int neg=0;
@@ -103,7 +98,7 @@ BIGNUM *a;
 		BNerr(BN_F_BN_MPI2BN,BN_R_INVALID_LENGTH);
 		return(NULL);
 		}
-	len=(d[0]<<24)|(d[1]<<16)|(d[2]<<8)|d[3];
+	len=((long)d[0]<<24)|((long)d[1]<<16)|((int)d[2]<<8)|(int)d[3];
 	if ((len+4) != n)
 		{
 		BNerr(BN_F_BN_MPI2BN,BN_R_ENCODING_ERROR);
diff --git a/src/lib/libcrypto/bn/bn_mul.c b/src/lib/libcrypto/bn/bn_mul.c
index d0c04e1d4b..fd598b8b3d 100644
--- a/src/lib/libcrypto/bn/bn_mul.c
+++ b/src/lib/libcrypto/bn/bn_mul.c
@@ -56,154 +56,1108 @@
  * [including the GNU Public Licence.]
  */
 
+#ifndef BN_DEBUG
+# undef NDEBUG /* avoid conflicting definitions */
+# define NDEBUG
+#endif
+
 #include <stdio.h>
+#include <assert.h>
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-/* r must be different to a and b */
-/* int BN_mmul(r, a, b) */
-int BN_mul(r, a, b)
-BIGNUM *r;
-BIGNUM *a;
-BIGNUM *b;
+#if defined(OPENSSL_NO_ASM) || !(defined(__i386) || defined(__i386__))/* Assembler implementation exists only for x86 */
+/* Here follow specialised variants of bn_add_words() and
+   bn_sub_words().  They have the property of performing operations on
+   arrays of different sizes.  The sizes of those arrays are expressed through
+   cl, which is the common length ( basically, min(len(a),len(b)) ), and dl,
+   which is the delta between the two lengths, calculated as len(a)-len(b).
+   All lengths are the number of BN_ULONGs...  For the operations that require
+   a result array as parameter, it must have the length cl+abs(dl).
+   These functions should probably end up in bn_asm.c as soon as there are
+   assembler counterparts for the systems that use assembler files.  */
+
+BN_ULONG bn_sub_part_words(BN_ULONG *r,
+	const BN_ULONG *a, const BN_ULONG *b,
+	int cl, int dl)
 	{
-	int i;
-	int max,al,bl;
-	BN_ULONG *ap,*bp,*rp;
+	BN_ULONG c, t;
 
-	al=a->top;
-	bl=b->top;
-	if ((al == 0) || (bl == 0))
+	assert(cl >= 0);
+	c = bn_sub_words(r, a, b, cl);
+
+	if (dl == 0)
+		return c;
+
+	r += cl;
+	a += cl;
+	b += cl;
+
+	if (dl < 0)
 		{
-		r->top=0;
-		return(1);
-		}
+#ifdef BN_COUNT
+		fprintf(stderr, "  bn_sub_part_words %d + %d (dl < 0, c = %d)\n", cl, dl, c);
+#endif
+		for (;;)
+			{
+			t = b[0];
+			r[0] = (0-t-c)&BN_MASK2;
+			if (t != 0) c=1;
+			if (++dl >= 0) break;
+
+			t = b[1];
+			r[1] = (0-t-c)&BN_MASK2;
+			if (t != 0) c=1;
+			if (++dl >= 0) break;
+
+			t = b[2];
+			r[2] = (0-t-c)&BN_MASK2;
+			if (t != 0) c=1;
+			if (++dl >= 0) break;
 
-	max=(al+bl);
-	if (bn_wexpand(r,max) == NULL) return(0);
-	r->top=max;
-	r->neg=a->neg^b->neg;
-	ap=a->d;
-	bp=b->d;
-	rp=r->d;
+			t = b[3];
+			r[3] = (0-t-c)&BN_MASK2;
+			if (t != 0) c=1;
+			if (++dl >= 0) break;
 
-	rp[al]=bn_mul_words(rp,ap,al,*(bp++));
-	rp++;
-	for (i=1; i<bl; i++)
+			b += 4;
+			r += 4;
+			}
+		}
+	else
 		{
-		rp[al]=bn_mul_add_words(rp,ap,al,*(bp++));
-		rp++;
+		int save_dl = dl;
+#ifdef BN_COUNT
+		fprintf(stderr, "  bn_sub_part_words %d + %d (dl > 0, c = %d)\n", cl, dl, c);
+#endif
+		while(c)
+			{
+			t = a[0];
+			r[0] = (t-c)&BN_MASK2;
+			if (t != 0) c=0;
+			if (--dl <= 0) break;
+
+			t = a[1];
+			r[1] = (t-c)&BN_MASK2;
+			if (t != 0) c=0;
+			if (--dl <= 0) break;
+
+			t = a[2];
+			r[2] = (t-c)&BN_MASK2;
+			if (t != 0) c=0;
+			if (--dl <= 0) break;
+
+			t = a[3];
+			r[3] = (t-c)&BN_MASK2;
+			if (t != 0) c=0;
+			if (--dl <= 0) break;
+
+			save_dl = dl;
+			a += 4;
+			r += 4;
+			}
+		if (dl > 0)
+			{
+#ifdef BN_COUNT
+			fprintf(stderr, "  bn_sub_part_words %d + %d (dl > 0, c == 0)\n", cl, dl);
+#endif
+			if (save_dl > dl)
+				{
+				switch (save_dl - dl)
+					{
+				case 1:
+					r[1] = a[1];
+					if (--dl <= 0) break;
+				case 2:
+					r[2] = a[2];
+					if (--dl <= 0) break;
+				case 3:
+					r[3] = a[3];
+					if (--dl <= 0) break;
+					}
+				a += 4;
+				r += 4;
+				}
+			}
+		if (dl > 0)
+			{
+#ifdef BN_COUNT
+			fprintf(stderr, "  bn_sub_part_words %d + %d (dl > 0, copy)\n", cl, dl);
+#endif
+			for(;;)
+				{
+				r[0] = a[0];
+				if (--dl <= 0) break;
+				r[1] = a[1];
+				if (--dl <= 0) break;
+				r[2] = a[2];
+				if (--dl <= 0) break;
+				r[3] = a[3];
+				if (--dl <= 0) break;
+
+				a += 4;
+				r += 4;
+				}
+			}
 		}
-	if (r->d[max-1] == 0) r->top--;
-	return(1);
+	return c;
 	}
+#endif
 
-#if 0
-#include "stack.h"
+BN_ULONG bn_add_part_words(BN_ULONG *r,
+	const BN_ULONG *a, const BN_ULONG *b,
+	int cl, int dl)
+	{
+	BN_ULONG c, l, t;
+
+	assert(cl >= 0);
+	c = bn_add_words(r, a, b, cl);
+
+	if (dl == 0)
+		return c;
+
+	r += cl;
+	a += cl;
+	b += cl;
+
+	if (dl < 0)
+		{
+		int save_dl = dl;
+#ifdef BN_COUNT
+		fprintf(stderr, "  bn_add_part_words %d + %d (dl < 0, c = %d)\n", cl, dl, c);
+#endif
+		while (c)
+			{
+			l=(c+b[0])&BN_MASK2;
+			c=(l < c);
+			r[0]=l;
+			if (++dl >= 0) break;
+
+			l=(c+b[1])&BN_MASK2;
+			c=(l < c);
+			r[1]=l;
+			if (++dl >= 0) break;
+
+			l=(c+b[2])&BN_MASK2;
+			c=(l < c);
+			r[2]=l;
+			if (++dl >= 0) break;
 
-int limit=16;
+			l=(c+b[3])&BN_MASK2;
+			c=(l < c);
+			r[3]=l;
+			if (++dl >= 0) break;
 
-typedef struct bn_pool_st
+			save_dl = dl;
+			b+=4;
+			r+=4;
+			}
+		if (dl < 0)
+			{
+#ifdef BN_COUNT
+			fprintf(stderr, "  bn_add_part_words %d + %d (dl < 0, c == 0)\n", cl, dl);
+#endif
+			if (save_dl < dl)
+				{
+				switch (dl - save_dl)
+					{
+				case 1:
+					r[1] = b[1];
+					if (++dl >= 0) break;
+				case 2:
+					r[2] = b[2];
+					if (++dl >= 0) break;
+				case 3:
+					r[3] = b[3];
+					if (++dl >= 0) break;
+					}
+				b += 4;
+				r += 4;
+				}
+			}
+		if (dl < 0)
+			{
+#ifdef BN_COUNT
+			fprintf(stderr, "  bn_add_part_words %d + %d (dl < 0, copy)\n", cl, dl);
+#endif
+			for(;;)
+				{
+				r[0] = b[0];
+				if (++dl >= 0) break;
+				r[1] = b[1];
+				if (++dl >= 0) break;
+				r[2] = b[2];
+				if (++dl >= 0) break;
+				r[3] = b[3];
+				if (++dl >= 0) break;
+
+				b += 4;
+				r += 4;
+				}
+			}
+		}
+	else
+		{
+		int save_dl = dl;
+#ifdef BN_COUNT
+		fprintf(stderr, "  bn_add_part_words %d + %d (dl > 0)\n", cl, dl);
+#endif
+		while (c)
+			{
+			t=(a[0]+c)&BN_MASK2;
+			c=(t < c);
+			r[0]=t;
+			if (--dl <= 0) break;
+
+			t=(a[1]+c)&BN_MASK2;
+			c=(t < c);
+			r[1]=t;
+			if (--dl <= 0) break;
+
+			t=(a[2]+c)&BN_MASK2;
+			c=(t < c);
+			r[2]=t;
+			if (--dl <= 0) break;
+
+			t=(a[3]+c)&BN_MASK2;
+			c=(t < c);
+			r[3]=t;
+			if (--dl <= 0) break;
+
+			save_dl = dl;
+			a+=4;
+			r+=4;
+			}
+#ifdef BN_COUNT
+		fprintf(stderr, "  bn_add_part_words %d + %d (dl > 0, c == 0)\n", cl, dl);
+#endif
+		if (dl > 0)
+			{
+			if (save_dl > dl)
+				{
+				switch (save_dl - dl)
+					{
+				case 1:
+					r[1] = a[1];
+					if (--dl <= 0) break;
+				case 2:
+					r[2] = a[2];
+					if (--dl <= 0) break;
+				case 3:
+					r[3] = a[3];
+					if (--dl <= 0) break;
+					}
+				a += 4;
+				r += 4;
+				}
+			}
+		if (dl > 0)
+			{
+#ifdef BN_COUNT
+			fprintf(stderr, "  bn_add_part_words %d + %d (dl > 0, copy)\n", cl, dl);
+#endif
+			for(;;)
+				{
+				r[0] = a[0];
+				if (--dl <= 0) break;
+				r[1] = a[1];
+				if (--dl <= 0) break;
+				r[2] = a[2];
+				if (--dl <= 0) break;
+				r[3] = a[3];
+				if (--dl <= 0) break;
+
+				a += 4;
+				r += 4;
+				}
+			}
+		}
+	return c;
+	}
+
+#ifdef BN_RECURSION
+/* Karatsuba recursive multiplication algorithm
+ * (cf. Knuth, The Art of Computer Programming, Vol. 2) */
+
+/* r is 2*n2 words in size,
+ * a and b are both n2 words in size.
+ * n2 must be a power of 2.
+ * We multiply and return the result.
+ * t must be 2*n2 words in size
+ * We calculate
+ * a[0]*b[0]
+ * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
+ * a[1]*b[1]
+ */
+void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
+	int dna, int dnb, BN_ULONG *t)
 	{
-	int used;
-	int tos;
-	STACK *sk; 
-	} BN_POOL;
+	int n=n2/2,c1,c2;
+	int tna=n+dna, tnb=n+dnb;
+	unsigned int neg,zero;
+	BN_ULONG ln,lo,*p;
+
+# ifdef BN_COUNT
+	fprintf(stderr," bn_mul_recursive %d * %d\n",n2,n2);
+# endif
+# ifdef BN_MUL_COMBA
+#  if 0
+	if (n2 == 4)
+		{
+		bn_mul_comba4(r,a,b);
+		return;
+		}
+#  endif
+	/* Only call bn_mul_comba 8 if n2 == 8 and the
+	 * two arrays are complete [steve]
+	 */
+	if (n2 == 8 && dna == 0 && dnb == 0)
+		{
+		bn_mul_comba8(r,a,b);
+		return; 
+		}
+# endif /* BN_MUL_COMBA */
+	/* Else do normal multiply */
+	if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL)
+		{
+		bn_mul_normal(r,a,n2+dna,b,n2+dnb);
+		if ((dna + dnb) < 0)
+			memset(&r[2*n2 + dna + dnb], 0,
+				sizeof(BN_ULONG) * -(dna + dnb));
+		return;
+		}
+	/* r=(a[0]-a[1])*(b[1]-b[0]) */
+	c1=bn_cmp_part_words(a,&(a[n]),tna,n-tna);
+	c2=bn_cmp_part_words(&(b[n]),b,tnb,tnb-n);
+	zero=neg=0;
+	switch (c1*3+c2)
+		{
+	case -4:
+		bn_sub_part_words(t,      &(a[n]),a,      tna,tna-n); /* - */
+		bn_sub_part_words(&(t[n]),b,      &(b[n]),tnb,n-tnb); /* - */
+		break;
+	case -3:
+		zero=1;
+		break;
+	case -2:
+		bn_sub_part_words(t,      &(a[n]),a,      tna,tna-n); /* - */
+		bn_sub_part_words(&(t[n]),&(b[n]),b,      tnb,tnb-n); /* + */
+		neg=1;
+		break;
+	case -1:
+	case 0:
+	case 1:
+		zero=1;
+		break;
+	case 2:
+		bn_sub_part_words(t,      a,      &(a[n]),tna,n-tna); /* + */
+		bn_sub_part_words(&(t[n]),b,      &(b[n]),tnb,n-tnb); /* - */
+		neg=1;
+		break;
+	case 3:
+		zero=1;
+		break;
+	case 4:
+		bn_sub_part_words(t,      a,      &(a[n]),tna,n-tna);
+		bn_sub_part_words(&(t[n]),&(b[n]),b,      tnb,tnb-n);
+		break;
+		}
+
+# ifdef BN_MUL_COMBA
+	if (n == 4 && dna == 0 && dnb == 0) /* XXX: bn_mul_comba4 could take
+					       extra args to do this well */
+		{
+		if (!zero)
+			bn_mul_comba4(&(t[n2]),t,&(t[n]));
+		else
+			memset(&(t[n2]),0,8*sizeof(BN_ULONG));
+		
+		bn_mul_comba4(r,a,b);
+		bn_mul_comba4(&(r[n2]),&(a[n]),&(b[n]));
+		}
+	else if (n == 8 && dna == 0 && dnb == 0) /* XXX: bn_mul_comba8 could
+						    take extra args to do this
+						    well */
+		{
+		if (!zero)
+			bn_mul_comba8(&(t[n2]),t,&(t[n]));
+		else
+			memset(&(t[n2]),0,16*sizeof(BN_ULONG));
+		
+		bn_mul_comba8(r,a,b);
+		bn_mul_comba8(&(r[n2]),&(a[n]),&(b[n]));
+		}
+	else
+# endif /* BN_MUL_COMBA */
+		{
+		p= &(t[n2*2]);
+		if (!zero)
+			bn_mul_recursive(&(t[n2]),t,&(t[n]),n,0,0,p);
+		else
+			memset(&(t[n2]),0,n2*sizeof(BN_ULONG));
+		bn_mul_recursive(r,a,b,n,0,0,p);
+		bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),n,dna,dnb,p);
+		}
 
-BIGNUM *BN_POOL_push(bp)
-BN_POOL *bp;
+	/* t[32] holds (a[0]-a[1])*(b[1]-b[0]), neg is the sign
+	 * r[10] holds (a[0]*b[0])
+	 * r[32] holds (a[1]*b[1])
+	 */
+
+	c1=(int)(bn_add_words(t,r,&(r[n2]),n2));
+
+	if (neg) /* if t[32] is negative */
+		{
+		c1-=(int)(bn_sub_words(&(t[n2]),t,&(t[n2]),n2));
+		}
+	else
+		{
+		/* Might have a carry */
+		c1+=(int)(bn_add_words(&(t[n2]),&(t[n2]),t,n2));
+		}
+
+	/* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
+	 * r[10] holds (a[0]*b[0])
+	 * r[32] holds (a[1]*b[1])
+	 * c1 holds the carry bits
+	 */
+	c1+=(int)(bn_add_words(&(r[n]),&(r[n]),&(t[n2]),n2));
+	if (c1)
+		{
+		p= &(r[n+n2]);
+		lo= *p;
+		ln=(lo+c1)&BN_MASK2;
+		*p=ln;
+
+		/* The overflow will stop before we over write
+		 * words we should not overwrite */
+		if (ln < (BN_ULONG)c1)
+			{
+			do	{
+				p++;
+				lo= *p;
+				ln=(lo+1)&BN_MASK2;
+				*p=ln;
+				} while (ln == 0);
+			}
+		}
+	}
+
+/* n+tn is the word length
+ * t needs to be n*4 in size, as does r */
+void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
+	     int tna, int tnb, BN_ULONG *t)
 	{
-	BIGNUM *ret;
+	int i,j,n2=n*2;
+	unsigned int c1,c2,neg,zero;
+	BN_ULONG ln,lo,*p;
+
+# ifdef BN_COUNT
+	fprintf(stderr," bn_mul_part_recursive (%d+%d) * (%d+%d)\n",
+		tna, n, tnb, n);
+# endif
+	if (n < 8)
+		{
+		bn_mul_normal(r,a,n+tna,b,n+tnb);
+		return;
+		}
 
-	if (bp->used >= bp->tos)
+	/* r=(a[0]-a[1])*(b[1]-b[0]) */
+	c1=bn_cmp_part_words(a,&(a[n]),tna,n-tna);
+	c2=bn_cmp_part_words(&(b[n]),b,tnb,tnb-n);
+	zero=neg=0;
+	switch (c1*3+c2)
+		{
+	case -4:
+		bn_sub_part_words(t,      &(a[n]),a,      tna,tna-n); /* - */
+		bn_sub_part_words(&(t[n]),b,      &(b[n]),tnb,n-tnb); /* - */
+		break;
+	case -3:
+		zero=1;
+		/* break; */
+	case -2:
+		bn_sub_part_words(t,      &(a[n]),a,      tna,tna-n); /* - */
+		bn_sub_part_words(&(t[n]),&(b[n]),b,      tnb,tnb-n); /* + */
+		neg=1;
+		break;
+	case -1:
+	case 0:
+	case 1:
+		zero=1;
+		/* break; */
+	case 2:
+		bn_sub_part_words(t,      a,      &(a[n]),tna,n-tna); /* + */
+		bn_sub_part_words(&(t[n]),b,      &(b[n]),tnb,n-tnb); /* - */
+		neg=1;
+		break;
+	case 3:
+		zero=1;
+		/* break; */
+	case 4:
+		bn_sub_part_words(t,      a,      &(a[n]),tna,n-tna);
+		bn_sub_part_words(&(t[n]),&(b[n]),b,      tnb,tnb-n);
+		break;
+		}
+		/* The zero case isn't yet implemented here. The speedup
+		   would probably be negligible. */
+# if 0
+	if (n == 4)
 		{
-		ret=BN_new();
-		sk_push(bp->sk,(char *)ret);
-		bp->tos++;
-		bp->used++;
+		bn_mul_comba4(&(t[n2]),t,&(t[n]));
+		bn_mul_comba4(r,a,b);
+		bn_mul_normal(&(r[n2]),&(a[n]),tn,&(b[n]),tn);
+		memset(&(r[n2+tn*2]),0,sizeof(BN_ULONG)*(n2-tn*2));
 		}
 	else
+# endif
+	if (n == 8)
 		{
-		ret=(BIGNUM *)sk_value(bp->sk,bp->used);
-		bp->used++;
+		bn_mul_comba8(&(t[n2]),t,&(t[n]));
+		bn_mul_comba8(r,a,b);
+		bn_mul_normal(&(r[n2]),&(a[n]),tna,&(b[n]),tnb);
+		memset(&(r[n2+tna+tnb]),0,sizeof(BN_ULONG)*(n2-tna-tnb));
+		}
+	else
+		{
+		p= &(t[n2*2]);
+		bn_mul_recursive(&(t[n2]),t,&(t[n]),n,0,0,p);
+		bn_mul_recursive(r,a,b,n,0,0,p);
+		i=n/2;
+		/* If there is only a bottom half to the number,
+		 * just do it */
+		if (tna > tnb)
+			j = tna - i;
+		else
+			j = tnb - i;
+		if (j == 0)
+			{
+			bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),
+				i,tna-i,tnb-i,p);
+			memset(&(r[n2+i*2]),0,sizeof(BN_ULONG)*(n2-i*2));
+			}
+		else if (j > 0) /* eg, n == 16, i == 8 and tn == 11 */
+				{
+				bn_mul_part_recursive(&(r[n2]),&(a[n]),&(b[n]),
+					i,tna-i,tnb-i,p);
+				memset(&(r[n2+tna+tnb]),0,
+					sizeof(BN_ULONG)*(n2-tna-tnb));
+				}
+		else /* (j < 0) eg, n == 16, i == 8 and tn == 5 */
+			{
+			memset(&(r[n2]),0,sizeof(BN_ULONG)*n2);
+			if (tna < BN_MUL_RECURSIVE_SIZE_NORMAL
+				&& tnb < BN_MUL_RECURSIVE_SIZE_NORMAL)
+				{
+				bn_mul_normal(&(r[n2]),&(a[n]),tna,&(b[n]),tnb);
+				}
+			else
+				{
+				for (;;)
+					{
+					i/=2;
+					if (i < tna && i < tnb)
+						{
+						bn_mul_part_recursive(&(r[n2]),
+							&(a[n]),&(b[n]),
+							i,tna-i,tnb-i,p);
+						break;
+						}
+					else if (i <= tna && i <= tnb)
+						{
+						bn_mul_recursive(&(r[n2]),
+							&(a[n]),&(b[n]),
+							i,tna-i,tnb-i,p);
+						break;
+						}
+					}
+				}
+			}
+		}
+
+	/* t[32] holds (a[0]-a[1])*(b[1]-b[0]), neg is the sign
+	 * r[10] holds (a[0]*b[0])
+	 * r[32] holds (a[1]*b[1])
+	 */
+
+	c1=(int)(bn_add_words(t,r,&(r[n2]),n2));
+
+	if (neg) /* if t[32] is negative */
+		{
+		c1-=(int)(bn_sub_words(&(t[n2]),t,&(t[n2]),n2));
+		}
+	else
+		{
+		/* Might have a carry */
+		c1+=(int)(bn_add_words(&(t[n2]),&(t[n2]),t,n2));
+		}
+
+	/* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
+	 * r[10] holds (a[0]*b[0])
+	 * r[32] holds (a[1]*b[1])
+	 * c1 holds the carry bits
+	 */
+	c1+=(int)(bn_add_words(&(r[n]),&(r[n]),&(t[n2]),n2));
+	if (c1)
+		{
+		p= &(r[n+n2]);
+		lo= *p;
+		ln=(lo+c1)&BN_MASK2;
+		*p=ln;
+
+		/* The overflow will stop before we over write
+		 * words we should not overwrite */
+		if (ln < c1)
+			{
+			do	{
+				p++;
+				lo= *p;
+				ln=(lo+1)&BN_MASK2;
+				*p=ln;
+				} while (ln == 0);
+			}
 		}
-	return(ret);
 	}
 
-void BN_POOL_pop(bp,num)
-BN_POOL *bp;
-int num;
+/* Low half of a product: a and b must be the same size, which is n2.
+ * r needs to be n2 words and t needs to be n2*2
+ */
+void bn_mul_low_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
+	     BN_ULONG *t)
 	{
-	bp->used-=num;
+	int n=n2/2;	/* half length: operands are split into low and high halves of n words */
+
+# ifdef BN_COUNT
+	fprintf(stderr," bn_mul_low_recursive %d * %d\n",n2,n2);
+# endif
+	/* r[0..n2) = full product of the low halves; the low words of both cross products are then added into r[n..n2) */
+	bn_mul_recursive(r,a,b,n,0,0,&(t[0]));
+	if (n >= BN_MUL_LOW_RECURSIVE_SIZE_NORMAL)
+		{
+		bn_mul_low_recursive(&(t[0]),&(a[0]),&(b[n]),n,&(t[n2]));
+		bn_add_words(&(r[n]),&(r[n]),&(t[0]),n);
+		bn_mul_low_recursive(&(t[0]),&(a[n]),&(b[0]),n,&(t[n2]));
+		bn_add_words(&(r[n]),&(r[n]),&(t[0]),n);
+		}
+	else	/* halves below the threshold: use the non-recursive low multiply */
+		{
+		bn_mul_low_normal(&(t[0]),&(a[0]),&(b[n]),n);
+		bn_mul_low_normal(&(t[n]),&(a[n]),&(b[0]),n);
+		bn_add_words(&(r[n]),&(r[n]),&(t[0]),n);
+		bn_add_words(&(r[n]),&(r[n]),&(t[n]),n);
+		}
 	}
 
-int BN_mul(r,a,b)
-BIGNUM *r,*a,*b;
+/* a and b must be the same size, which is n2.
+ * r needs to be n2 words.
+ * l is the low words of the output; it may be NULL (both cases are handled).
+ * t needs to be n2*3 (NOTE(review): this comment previously also said n2*2 -- confirm)
+ */
+void bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l, int n2,
+	     BN_ULONG *t)
 	{
-	static BN_POOL bp;
-	static init=1;
+	int i,n;
+	int c1,c2;
+	int neg,oneg,zero; /* NOTE(review): zero is assigned below but never read in this function */
+	BN_ULONG ll,lc,*lp,*mp;
+
+# ifdef BN_COUNT
+	fprintf(stderr," bn_mul_high %d * %d\n",n2,n2);
+# endif
+	n=n2/2;
+
+	/* Calculate (al-ah)*(bh-bl) */
+	neg=zero=0;
+	c1=bn_cmp_words(&(a[0]),&(a[n]),n);
+	c2=bn_cmp_words(&(b[n]),&(b[0]),n);
+	switch (c1*3+c2) /* the cases below cover c1 and c2 each in -1..1 */
+		{
+	case -4:
+		bn_sub_words(&(r[0]),&(a[n]),&(a[0]),n);
+		bn_sub_words(&(r[n]),&(b[0]),&(b[n]),n);
+		break;
+	case -3:
+		zero=1;
+		break;
+	case -2:
+		bn_sub_words(&(r[0]),&(a[n]),&(a[0]),n);
+		bn_sub_words(&(r[n]),&(b[n]),&(b[0]),n);
+		neg=1;
+		break;
+	case -1:
+	case 0:
+	case 1:
+		zero=1;
+		break;
+	case 2:
+		bn_sub_words(&(r[0]),&(a[0]),&(a[n]),n);
+		bn_sub_words(&(r[n]),&(b[0]),&(b[n]),n);
+		neg=1;
+		break;
+	case 3:
+		zero=1;
+		break;
+	case 4:
+		bn_sub_words(&(r[0]),&(a[0]),&(a[n]),n);
+		bn_sub_words(&(r[n]),&(b[n]),&(b[0]),n);
+		break;
+		}
+		
+	oneg=neg; /* keep the sign of the middle product; neg itself is overwritten below */
+	/* t[10] = (a[0]-a[1])*(b[1]-b[0]) */
+	/* r[10] = (a[1]*b[1]) */
+# ifdef BN_MUL_COMBA
+	if (n == 8)
+		{
+		bn_mul_comba8(&(t[0]),&(r[0]),&(r[n]));
+		bn_mul_comba8(r,&(a[n]),&(b[n]));
+		}
+	else
+# endif
+		{
+		bn_mul_recursive(&(t[0]),&(r[0]),&(r[n]),n,0,0,&(t[n2]));
+		bn_mul_recursive(r,&(a[n]),&(b[n]),n,0,0,&(t[n2]));
+		}
+
+	/* s0 == low(al*bl)
+	 * s1 == low(ah*bh)+low((al-ah)*(bh-bl))+low(al*bl)+high(al*bl)
+	 * We know s0 and s1 so the only unknown is high(al*bl)
+	 * high(al*bl) == s1 - low(ah*bh+s0+(al-ah)*(bh-bl))
+	 * high(al*bl) == s1 - (r[0]+l[0]+t[0])
+	 */
+	if (l != NULL)
+		{
+		lp= &(t[n2+n]);
+		c1=(int)(bn_add_words(lp,&(r[0]),&(l[0]),n));
+		}
+	else
+		{
+		c1=0;
+		lp= &(r[0]);
+		}
+
+	if (neg)
+		neg=(int)(bn_sub_words(&(t[n2]),lp,&(t[0]),n));
+	else
+		{
+		bn_add_words(&(t[n2]),lp,&(t[0]),n);
+		neg=0;
+		}
+
+	if (l != NULL)
+		{
+		bn_sub_words(&(t[n2+n]),&(l[n]),&(t[n2]),n);
+		}
+	else
+		{
+		lp= &(t[n2+n]);
+		mp= &(t[n2]);
+		for (i=0; i<n; i++)
+			lp[i]=((~mp[i])+1)&BN_MASK2; /* word-by-word two's complement of mp[i] */
+		}
 
-	if (init)
+	/* s[0] = low(al*bl)
+	 * t[3] = high(al*bl)
+	 * t[10] = (a[0]-a[1])*(b[1]-b[0]) neg is the sign
+	 * r[10] = (a[1]*b[1])
+	 */
+	/* R[10] = al*bl
+	 * R[21] = al*bl + ah*bh + (a[0]-a[1])*(b[1]-b[0])
+	 * R[32] = ah*bh
+	 */
+	/* R[1]=t[3]+l[0]+r[0](+-)t[0] (have carry/borrow)
+	 * R[2]=r[0]+t[3]+r[1](+-)t[1] (have carry/borrow)
+	 * R[3]=r[1]+(carry/borrow)
+	 */
+	if (l != NULL)
+		{
+		lp= &(t[n2]);
+		c1= (int)(bn_add_words(lp,&(t[n2+n]),&(l[0]),n));
+		}
+	else
 		{
-		bp.used=0;
-		bp.tos=0;
-		bp.sk=sk_new_null();
-		init=0;
+		lp= &(t[n2+n]);
+		c1=0;
+		}
+	c1+=(int)(bn_add_words(&(t[n2]),lp,  &(r[0]),n));
+	if (oneg)
+		c1-=(int)(bn_sub_words(&(t[n2]),&(t[n2]),&(t[0]),n));
+	else
+		c1+=(int)(bn_add_words(&(t[n2]),&(t[n2]),&(t[0]),n));
+
+	c2 =(int)(bn_add_words(&(r[0]),&(r[0]),&(t[n2+n]),n));
+	c2+=(int)(bn_add_words(&(r[0]),&(r[0]),&(r[n]),n));
+	if (oneg)
+		c2-=(int)(bn_sub_words(&(r[0]),&(r[0]),&(t[n]),n));
+	else
+		c2+=(int)(bn_add_words(&(r[0]),&(r[0]),&(t[n]),n));
+	
+	if (c1 != 0) /* Add starting at r[0], could be +ve or -ve */
+		{
+		i=0;
+		if (c1 > 0)
+			{
+			lc=c1;
+			do	{
+				ll=(r[i]+lc)&BN_MASK2;
+				r[i++]=ll;
+				lc=(lc > ll);
+				} while (lc);
+			}
+		else
+			{
+			lc= -c1;
+			do	{
+				ll=r[i];
+				r[i++]=(ll-lc)&BN_MASK2;
+				lc=(lc > ll);
+				} while (lc);
+			}
+		}
+	if (c2 != 0) /* Add starting at r[1] */
+		{
+		i=n;
+		if (c2 > 0)
+			{
+			lc=c2;
+			do	{
+				ll=(r[i]+lc)&BN_MASK2;
+				r[i++]=ll;
+				lc=(lc > ll);
+				} while (lc);
+			}
+		else
+			{
+			lc= -c2;
+			do	{
+				ll=r[i];
+				r[i++]=(ll-lc)&BN_MASK2;
+				lc=(lc > ll);
+				} while (lc);
+			}
 		}
-	return(BN_mm(r,a,b,&bp));
 	}
+#endif /* BN_RECURSION */
 
-/* r must be different to a and b */
-int BN_mm(m, A, B, bp)
-BIGNUM *m,*A,*B;
-BN_POOL *bp;
+int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) /* r = a*b; returns 1 on success, 0 on error */
 	{
-	int i,num;
-	int an,bn;
-	BIGNUM *a,*b,*c,*d,*ac,*bd;
+	int ret=0;
+	int top,al,bl;
+	BIGNUM *rr;
+#if defined(BN_MUL_COMBA) || defined(BN_RECURSION)
+	int i;
+#endif
+#ifdef BN_RECURSION
+	BIGNUM *t=NULL;
+	int j=0,k;
+#endif
+
+#ifdef BN_COUNT
+	fprintf(stderr,"BN_mul %d * %d\n",a->top,b->top);
+#endif
 
-	an=A->top;
-	bn=B->top;
-	if ((an <= limit) || (bn <= limit))
+	bn_check_top(a);
+	bn_check_top(b);
+	bn_check_top(r);
+
+	al=a->top;
+	bl=b->top;
+
+	if ((al == 0) || (bl == 0))
 		{
-		return(BN_mmul(m,A,B));
+		if (!BN_zero(r)) return(0); /* BN_CTX_start() has not run yet, so 'err' (BN_CTX_end) must not be reached */
+		return(1);
 		}
+	top=al+bl;
 
-	a=BN_POOL_push(bp);
-	b=BN_POOL_push(bp);
-	c=BN_POOL_push(bp);
-	d=BN_POOL_push(bp);
-	ac=BN_POOL_push(bp);
-	bd=BN_POOL_push(bp);
+	BN_CTX_start(ctx);
+	if ((r == a) || (r == b)) /* result aliases an input: build it in a temporary */
+		{
+		if ((rr = BN_CTX_get(ctx)) == NULL) goto err;
+		}
+	else
+		rr = r;
+	rr->neg=a->neg^b->neg;
 
-	num=(an <= bn)?an:bn;
-	num=1<<(BN_num_bits_word(num-1)-1);
+#if defined(BN_MUL_COMBA) || defined(BN_RECURSION)
+	i = al-bl;
+#endif
+#ifdef BN_MUL_COMBA
+	if (i == 0)
+		{
+# if 0
+		if (al == 4)
+			{
+			if (bn_wexpand(rr,8) == NULL) goto err;
+			rr->top=8;
+			bn_mul_comba4(rr->d,a->d,b->d);
+			goto end;
+			}
+# endif
+		if (al == 8)
+			{
+			if (bn_wexpand(rr,16) == NULL) goto err;
+			rr->top=16;
+			bn_mul_comba8(rr->d,a->d,b->d);
+			goto end;
+			}
+		}
+#endif /* BN_MUL_COMBA */
+#ifdef BN_RECURSION
+	if ((al >= BN_MULL_SIZE_NORMAL) && (bl >= BN_MULL_SIZE_NORMAL))
+		{
+		if (i >= -1 && i <= 1)
+			{
+			int sav_j =0;
+			/* Find out the power of two lower or equal
+			   to the longest of the two numbers */
+			if (i >= 0)
+				{
+				j = BN_num_bits_word((BN_ULONG)al);
+				}
+			if (i == -1)
+				{
+				j = BN_num_bits_word((BN_ULONG)bl);
+				}
+			sav_j = j;
+			j = 1<<(j-1);
+			assert(j <= al || j <= bl);
+			k = j+j;
+			if ((t = BN_CTX_get(ctx)) == NULL) goto err; /* t->d is dereferenced below */
+			if (al > j || bl > j)
+				{
+				if (bn_wexpand(t,k*4) == NULL) goto err;
+				if (bn_wexpand(rr,k*4) == NULL) goto err;
+				bn_mul_part_recursive(rr->d,a->d,b->d,
+					j,al-j,bl-j,t->d);
+				}
+			else	/* al <= j && bl <= j */
+				{
+				if (bn_wexpand(t,k*2) == NULL) goto err;
+				if (bn_wexpand(rr,k*2) == NULL) goto err;
+				bn_mul_recursive(rr->d,a->d,b->d,
+					j,al-j,bl-j,t->d);
+				}
+			rr->top=top;
+			goto end;
+			}
+#if 0
+		if (i == 1 && !BN_get_flags(b,BN_FLG_STATIC_DATA))
+			{
+			BIGNUM *tmp_bn = (BIGNUM *)b;
+			if (bn_wexpand(tmp_bn,al) == NULL) goto err;
+			tmp_bn->d[bl]=0;
+			bl++;
+			i--;
+			}
+		else if (i == -1 && !BN_get_flags(a,BN_FLG_STATIC_DATA))
+			{
+			BIGNUM *tmp_bn = (BIGNUM *)a;
+			if (bn_wexpand(tmp_bn,bl) == NULL) goto err;
+			tmp_bn->d[al]=0;
+			al++;
+			i++;
+			}
+		if (i == 0)
+			{
+			/* symmetric and > 4 */
+			/* 16 or larger */
+			j=BN_num_bits_word((BN_ULONG)al);
+			j=1<<(j-1);
+			k=j+j;
+			t = BN_CTX_get(ctx);
+			if (al == j) /* exact multiple */
+				{
+				if (bn_wexpand(t,k*2) == NULL) goto err;
+				if (bn_wexpand(rr,k*2) == NULL) goto err;
+				bn_mul_recursive(rr->d,a->d,b->d,al,t->d);
+				}
+			else
+				{
+				if (bn_wexpand(t,k*4) == NULL) goto err;
+				if (bn_wexpand(rr,k*4) == NULL) goto err;
+				bn_mul_part_recursive(rr->d,a->d,b->d,al-j,j,t->d);
+				}
+			rr->top=top;
+			goto end;
+			}
+#endif
+		}
+#endif /* BN_RECURSION */
+	if (bn_wexpand(rr,top) == NULL) goto err;
+	rr->top=top;
+	bn_mul_normal(rr->d,a->d,al,b->d,bl);
 
-	/* Are going to now chop things into 'num' word chunks. */
-	num*=BN_BITS2;
+#if defined(BN_MUL_COMBA) || defined(BN_RECURSION)
+end:
+#endif
+	bn_fix_top(rr);
+	if ((r != rr) && (BN_copy(r,rr) == NULL)) goto err; /* a failed copy must not be reported as success */
+	ret=1;
+err:
+	BN_CTX_end(ctx);
+	return(ret);
+	}
 
-	BN_copy(a,A);
-	BN_mask_bits(a,num);
-	BN_rshift(b,A,num);
+void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb)
+	{
+	BN_ULONG *rr; /* set to &r[na]; rr[i] receives the value returned for row i */
 
-	BN_copy(c,B);
-	BN_mask_bits(c,num);
-	BN_rshift(d,B,num);
+#ifdef BN_COUNT
+	fprintf(stderr," bn_mul_normal %d * %d\n",na,nb);
+#endif
 
-	BN_sub(ac ,b,a);
-	BN_sub(bd,c,d);
-	BN_mm(m,ac,bd,bp);
-	BN_mm(ac,a,c,bp);
-	BN_mm(bd,b,d,bp);
+	if (na < nb) /* swap operands so that a is the longer one (na >= nb) */
+		{
+		int itmp;
+		BN_ULONG *ltmp;
 
-	BN_add(m,m,ac);
-	BN_add(m,m,bd);
-	BN_lshift(m,m,num);
-	BN_lshift(bd,bd,num*2);
+		itmp=na; na=nb; nb=itmp;
+		ltmp=a;   a=b;   b=ltmp;
 
-	BN_add(m,m,ac);
-	BN_add(m,m,bd);
-	BN_POOL_pop(bp,6);
-	return(1);
+		}
+	rr= &(r[na]);
+	if (nb <= 0) /* shorter operand is empty: multiply a by the word 0 and stop */
+		{
+		(void)bn_mul_words(r,a,na,0);
+		return;
+		}
+	else
+		rr[0]=bn_mul_words(r,a,na,b[0]); /* row 0; return value (presumably the carry word) goes to r[na] */
+
+	for (;;) /* remaining rows, unrolled four at a time; returns once nb is used up */
+		{
+		if (--nb <= 0) return;
+		rr[1]=bn_mul_add_words(&(r[1]),a,na,b[1]);
+		if (--nb <= 0) return;
+		rr[2]=bn_mul_add_words(&(r[2]),a,na,b[2]);
+		if (--nb <= 0) return;
+		rr[3]=bn_mul_add_words(&(r[3]),a,na,b[3]);
+		if (--nb <= 0) return;
+		rr[4]=bn_mul_add_words(&(r[4]),a,na,b[4]);
+		rr+=4;
+		r+=4;
+		b+=4;
+		}
 	}
+
+void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) /* writes only r[0..n-1]; return values of the word routines are dropped */
+	{
+#ifdef BN_COUNT
+	fprintf(stderr," bn_mul_low_normal %d * %d\n",n,n);
 #endif
+	bn_mul_words(r,a,n,b[0]); /* row 0: n words of a times b[0] */
+
+	for (;;) /* each later row starts one word higher in r and is one word shorter */
+		{
+		if (--n <= 0) return;
+		bn_mul_add_words(&(r[1]),a,n,b[1]);
+		if (--n <= 0) return;
+		bn_mul_add_words(&(r[2]),a,n,b[2]);
+		if (--n <= 0) return;
+		bn_mul_add_words(&(r[3]),a,n,b[3]);
+		if (--n <= 0) return;
+		bn_mul_add_words(&(r[4]),a,n,b[4]);
+		r+=4;
+		b+=4;
+		}
+	}
diff --git a/src/lib/libcrypto/bn/bn_prime.c b/src/lib/libcrypto/bn/bn_prime.c
index 0c85f70b59..918b9237c6 100644
--- a/src/lib/libcrypto/bn/bn_prime.c
+++ b/src/lib/libcrypto/bn/bn_prime.c
@@ -55,53 +55,100 @@
  * copied and put under another distribution licence
  * [including the GNU Public Licence.]
  */
+/* ====================================================================
+ * Copyright (c) 1998-2001 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
 
 #include <stdio.h>
 #include <time.h>
 #include "cryptlib.h"
 #include "bn_lcl.h"
-#include "rand.h"
+#include <openssl/rand.h>
 
-/* The quick seive algorithm approach to weeding out primes is
+/* The quick sieve algorithm approach to weeding out primes is
  * Philip Zimmermann's, as implemented in PGP.  I have had a read of
  * his comments and implemented my own version.
  */
 #include "bn_prime.h"
 
-#ifndef NOPROTO
-static int witness(BIGNUM *a, BIGNUM *n, BN_CTX *ctx,BN_CTX *ctx2,
-	BN_MONT_CTX *mont);
+static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
+	const BIGNUM *a1_odd, int k, BN_CTX *ctx, BN_MONT_CTX *mont);
 static int probable_prime(BIGNUM *rnd, int bits);
 static int probable_prime_dh(BIGNUM *rnd, int bits,
-	BIGNUM *add, BIGNUM *rem, BN_CTX *ctx);
-static int probable_prime_dh_strong(BIGNUM *rnd, int bits,
-	BIGNUM *add, BIGNUM *rem, BN_CTX *ctx);
-#else
-static int witness();
-static int probable_prime();
-static int probable_prime_dh();
-static int probable_prime_dh_strong();
-#endif
-
-BIGNUM *BN_generate_prime(bits,strong,add,rem,callback,cb_arg)
-int bits;
-int strong;
-BIGNUM *add;
-BIGNUM *rem;
-void (*callback)(P_I_I_P); 
-char *cb_arg;
+	const BIGNUM *add, const BIGNUM *rem, BN_CTX *ctx);
+static int probable_prime_dh_safe(BIGNUM *rnd, int bits,
+	const BIGNUM *add, const BIGNUM *rem, BN_CTX *ctx);
+
+BIGNUM *BN_generate_prime(BIGNUM *ret, int bits, int safe,
+	const BIGNUM *add, const BIGNUM *rem,
+	void (*callback)(int,int,void *), void *cb_arg)
 	{
 	BIGNUM *rnd=NULL;
-	BIGNUM *ret=NULL;
-	BIGNUM *t=NULL;
+	BIGNUM t;
+	int found=0;
 	int i,j,c1=0;
 	BN_CTX *ctx;
+	int checks = BN_prime_checks_for_size(bits);
 
 	ctx=BN_CTX_new();
 	if (ctx == NULL) goto err;
-	if ((rnd=BN_new()) == NULL) goto err;
-	if (strong)
-		if ((t=BN_new()) == NULL) goto err;
+	if (ret == NULL)
+		{
+		if ((rnd=BN_new()) == NULL) goto err;
+		}
+	else
+		rnd=ret;
+	BN_init(&t);
 loop: 
 	/* make a random number and set the top and bottom bits */
 	if (add == NULL)
@@ -110,9 +157,9 @@ loop:
 		}
 	else
 		{
-		if (strong)
+		if (safe)
 			{
-			if (!probable_prime_dh_strong(rnd,bits,add,rem,ctx))
+			if (!probable_prime_dh_safe(rnd,bits,add,rem,ctx))
 				 goto err;
 			}
 		else
@@ -124,171 +171,188 @@ loop:
 	/* if (BN_mod_word(rnd,(BN_ULONG)3) == 1) goto loop; */
 	if (callback != NULL) callback(0,c1++,cb_arg);
 
-	if (!strong)
+	if (!safe)
 		{
-		i=BN_is_prime(rnd,BN_prime_checks,callback,ctx,cb_arg);
+		i=BN_is_prime_fasttest(rnd,checks,callback,ctx,cb_arg,0);
 		if (i == -1) goto err;
 		if (i == 0) goto loop;
 		}
 	else
 		{
-		/* for a strong prime generation,
+		/* for "safe prime" generation,
 		 * check that (p-1)/2 is prime.
 		 * Since a prime is odd, We just
 		 * need to divide by 2 */
-		if (!BN_rshift1(t,rnd)) goto err;
+		if (!BN_rshift1(&t,rnd)) goto err;
 
-		for (i=0; i<BN_prime_checks; i++)
+		for (i=0; i<checks; i++)
 			{
-			j=BN_is_prime(rnd,1,callback,ctx,cb_arg);
+			j=BN_is_prime_fasttest(rnd,1,callback,ctx,cb_arg,0);
 			if (j == -1) goto err;
 			if (j == 0) goto loop;
 
-			j=BN_is_prime(t,1,callback,ctx,cb_arg);
+			j=BN_is_prime_fasttest(&t,1,callback,ctx,cb_arg,0);
 			if (j == -1) goto err;
 			if (j == 0) goto loop;
 
 			if (callback != NULL) callback(2,c1-1,cb_arg);
-			/* We have a strong prime test pass */
+			/* We have a safe prime test pass */
 			}
 		}
 	/* we have a prime :-) */
-	ret=rnd;
+	found = 1;
 err:
-	if ((ret == NULL) && (rnd != NULL)) BN_free(rnd);
-	if (t != NULL) BN_free(t);
+	if (!found && (ret == NULL) && (rnd != NULL)) BN_free(rnd);
+	BN_free(&t);
 	if (ctx != NULL) BN_CTX_free(ctx);
-	return(ret);
+	return(found ? rnd : NULL);
 	}
 
-int BN_is_prime(a,checks,callback,ctx_passed,cb_arg)
-BIGNUM *a;
-int checks;
-void (*callback)(P_I_I_P);
-BN_CTX *ctx_passed;
-char *cb_arg;
+int BN_is_prime(const BIGNUM *a, int checks, void (*callback)(int,int,void *),
+	BN_CTX *ctx_passed, void *cb_arg)
 	{
-	int i,j,c2=0,ret= -1;
-	BIGNUM *check;
-	BN_CTX *ctx=NULL,*ctx2=NULL;
-	BN_MONT_CTX *mont=NULL;
+	return BN_is_prime_fasttest(a, checks, callback, ctx_passed, cb_arg, 0); /* thin wrapper: final 0 is do_trial_division (off) */
+	}
 
+int BN_is_prime_fasttest(const BIGNUM *a, int checks, /* returns 1: probably prime, 0: not prime, -1: error */
+		void (*callback)(int,int,void *),
+		BN_CTX *ctx_passed, void *cb_arg,
+		int do_trial_division)
+	{
+	int i, j, ret = -1;
+	int k;
+	BN_CTX *ctx = NULL;
+	BIGNUM *A1, *A1_odd, *check; /* taken from ctx */
+	BN_MONT_CTX *mont = NULL;
+	const BIGNUM *A = NULL;
+
+	if (BN_cmp(a, BN_value_one()) <= 0)
+		return 0;
+	
+	if (checks == BN_prime_checks)
+		checks = BN_prime_checks_for_size(BN_num_bits(a));
+
+	/* first look for small factors */
 	if (!BN_is_odd(a))
-		return(0);
+		return BN_is_word(a, 2); /* 2 is the only even prime */
+	if (do_trial_division)
+		{
+		for (i = 1; i < NUMPRIMES; i++)
+			if (BN_mod_word(a, primes[i]) == 0) 
+				return BN_is_word(a, primes[i]); /* a table prime divides itself and is still prime */
+		if (callback != NULL) callback(1, -1, cb_arg);
+		}
+
 	if (ctx_passed != NULL)
-		ctx=ctx_passed;
+		ctx = ctx_passed;
 	else
-		if ((ctx=BN_CTX_new()) == NULL) goto err;
-
-	if ((ctx2=BN_CTX_new()) == NULL) goto err;
-	if ((mont=BN_MONT_CTX_new()) == NULL) goto err;
-
-	check=ctx->bn[ctx->tos++];
+		if ((ctx=BN_CTX_new()) == NULL)
+			goto err;
+	BN_CTX_start(ctx);
 
-	/* Setup the montgomery structure */
-	if (!BN_MONT_CTX_set(mont,a,ctx2)) goto err;
+	/* A := abs(a) */
+	if (a->neg)
+		{
+		BIGNUM *t;
+		if ((t = BN_CTX_get(ctx)) == NULL) goto err;
+		if (!BN_copy(t, a)) goto err; /* do not continue with an uncopied value */
+		t->neg = 0;
+		A = t;
+		}
+	else
+		A = a;
+	A1 = BN_CTX_get(ctx);
+	A1_odd = BN_CTX_get(ctx);
+	check = BN_CTX_get(ctx);
+	if (check == NULL) goto err;
+
+	/* compute A1 := A - 1 */
+	if (!BN_copy(A1, A))
+		goto err;
+	if (!BN_sub_word(A1, 1))
+		goto err;
+	if (BN_is_zero(A1))
+		{
+		ret = 0;
+		goto err;
+		}
 
-	for (i=0; i<checks; i++)
+	/* write  A1  as  A1_odd * 2^k */
+	k = 1; /* A is odd here, so bit 0 of A1 is clear and the scan starts at bit 1 */
+	while (!BN_is_bit_set(A1, k))
+		k++;
+	if (!BN_rshift(A1_odd, A1, k))
+		goto err;
+
+	/* Montgomery setup for computations mod A */
+	mont = BN_MONT_CTX_new();
+	if (mont == NULL)
+		goto err;
+	if (!BN_MONT_CTX_set(mont, A, ctx))
+		goto err;
+	
+	for (i = 0; i < checks; i++)
 		{
-		if (!BN_rand(check,BN_num_bits(a)-1,0,0)) goto err;
-		j=witness(check,a,ctx,ctx2,mont);
+		if (!BN_pseudo_rand_range(check, A1))
+			goto err;
+		if (!BN_add_word(check, 1))
+			goto err;
+		/* now 1 <= check < A */
+
+		j = witness(check, A, A1, A1_odd, k, ctx, mont);
 		if (j == -1) goto err;
 		if (j)
 			{
 			ret=0;
 			goto err;
 			}
-		if (callback != NULL) callback(1,c2++,cb_arg);
+		if (callback != NULL) callback(1,i,cb_arg);
 		}
 	ret=1;
 err:
-	ctx->tos--;
-	if ((ctx_passed == NULL) && (ctx != NULL))
-		BN_CTX_free(ctx);
-	if (ctx2 != NULL)
-		BN_CTX_free(ctx2);
-	if (mont != NULL) BN_MONT_CTX_free(mont);
-		
+	if (ctx != NULL)
+		{
+		BN_CTX_end(ctx);
+		if (ctx_passed == NULL)
+			BN_CTX_free(ctx);
+		}
+	if (mont != NULL)
+		BN_MONT_CTX_free(mont);
+
 	return(ret);
 	}
 
-#define RECP_MUL_MOD
-
-static int witness(a,n,ctx,ctx2,mont)
-BIGNUM *a;
-BIGNUM *n;
-BN_CTX *ctx,*ctx2;
-BN_MONT_CTX *mont;
+static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
+	const BIGNUM *a1_odd, int k, BN_CTX *ctx, BN_MONT_CTX *mont) /* returns 1: 'a' is composite, 0: probably prime, -1: error; w is overwritten */
 	{
-	int k,i,ret= -1,good;
-	BIGNUM *d,*dd,*tmp,*d1,*d2,*n1;
-	BIGNUM *mont_one,*mont_n1,*mont_a;
-
-	d1=ctx->bn[ctx->tos];
-	d2=ctx->bn[ctx->tos+1];
-	n1=ctx->bn[ctx->tos+2];
-	ctx->tos+=3;
-
-	mont_one=ctx2->bn[ctx2->tos];
-	mont_n1=ctx2->bn[ctx2->tos+1];
-	mont_a=ctx2->bn[ctx2->tos+2];
-	ctx2->tos+=3;
-
-	d=d1;
-	dd=d2;
-	if (!BN_one(d)) goto err;
-	if (!BN_sub(n1,n,d)) goto err; /* n1=n-1; */
-	k=BN_num_bits(n1);
-
-	if (!BN_to_montgomery(mont_one,BN_value_one(),mont,ctx2)) goto err;
-	if (!BN_to_montgomery(mont_n1,n1,mont,ctx2)) goto err;
-	if (!BN_to_montgomery(mont_a,a,mont,ctx2)) goto err;
-
-	BN_copy(d,mont_one);
-	for (i=k-1; i>=0; i--)
+	if (!BN_mod_exp_mont(w, w, a1_odd, a, ctx, mont)) /* w := w^a1_odd mod a */
+		return -1;
+	if (BN_is_one(w))
+		return 0; /* probably prime */
+	if (BN_cmp(w, a1) == 0)
+		return 0; /* w == -1 (mod a),  'a' is probably prime */
+	while (--k) /* at most k-1 squarings; k is the caller's exponent of 2 in a1 */
 		{
-		if (	(BN_cmp(d,mont_one) != 0) &&
-			(BN_cmp(d,mont_n1) != 0))
-			good=1;
-		else
-			good=0;
-
-		BN_mod_mul_montgomery(dd,d,d,mont,ctx2);
-		
-		if (good && (BN_cmp(dd,mont_one) == 0))
-			{
-			ret=1;
-			goto err;
-			}
-		if (BN_is_bit_set(n1,i))
-			{
-			BN_mod_mul_montgomery(d,dd,mont_a,mont,ctx2);
-			}
-		else
-			{
-			tmp=d;
-			d=dd;
-			dd=tmp;
-			}
+		if (!BN_mod_mul(w, w, w, a, ctx)) /* w := w^2 mod a */
+			return -1;
+		if (BN_is_one(w))
+			return 1; /* 'a' is composite, otherwise a previous 'w' would
+			           * have been == -1 (mod 'a') */
+		if (BN_cmp(w, a1) == 0)
+			return 0; /* w == -1 (mod a), 'a' is probably prime */
 		}
-	if (BN_cmp(d,mont_one) == 0)
-		i=0;
-	else	i=1;
-	ret=i;
-err:
-	ctx->tos-=3;
-	ctx2->tos-=3;
-	return(ret);
+	/* If we get here, 'w' is the (a-1)/2-th power of the original 'w',
+	 * and it is neither -1 nor +1 -- so 'a' cannot be prime */
+	return 1;
 	}
 
-static int probable_prime(rnd, bits)
-BIGNUM *rnd;
-int bits;
+static int probable_prime(BIGNUM *rnd, int bits)
 	{
 	int i;
-	MS_STATIC BN_ULONG mods[NUMPRIMES];
-	BN_ULONG delta;
+	BN_ULONG mods[NUMPRIMES];
+	BN_ULONG delta,d;
 
+again:
 	if (!BN_rand(rnd,bits,1,1)) return(0);
 	/* we now have a random number 'rand' to test. */
 	for (i=1; i<NUMPRIMES; i++)
@@ -300,9 +364,12 @@ int bits;
 		 * that gcd(rnd-1,primes) == 1 (except for 2) */
 		if (((mods[i]+delta)%primes[i]) <= 1)
 			{
+			d=delta;
 			delta+=2;
 			/* perhaps need to check for overflow of
-			 * delta (but delta can be upto 2^32) */
+			 * delta (but delta can be up to 2^32)
+			 * 21-May-98 eay - added overflow check */
+			if (delta < d) goto again;
 			goto loop;
 			}
 		}
@@ -310,17 +377,14 @@ int bits;
 	return(1);
 	}
 
-static int probable_prime_dh(rnd, bits, add, rem,ctx)
-BIGNUM *rnd;
-int bits;
-BIGNUM *add;
-BIGNUM *rem;
-BN_CTX *ctx;
+static int probable_prime_dh(BIGNUM *rnd, int bits,
+	const BIGNUM *add, const BIGNUM *rem, BN_CTX *ctx)
 	{
 	int i,ret=0;
 	BIGNUM *t1;
 
-	t1=ctx->bn[ctx->tos++];
+	BN_CTX_start(ctx);
+	if ((t1 = BN_CTX_get(ctx)) == NULL) goto err;
 
 	if (!BN_rand(rnd,bits,0,1)) goto err;
 
@@ -338,7 +402,7 @@ BN_CTX *ctx;
 	loop: for (i=1; i<NUMPRIMES; i++)
 		{
 		/* check that rnd is a prime */
-		if (BN_mod_word(rnd,(BN_LONG)primes[i]) <= 1)
+		if (BN_mod_word(rnd,(BN_ULONG)primes[i]) <= 1)
 			{
 			if (!BN_add(rnd,rnd,add)) goto err;
 			goto loop;
@@ -346,24 +410,22 @@ BN_CTX *ctx;
 		}
 	ret=1;
 err:
-	ctx->tos--;
+	BN_CTX_end(ctx);
 	return(ret);
 	}
 
-static int probable_prime_dh_strong(p, bits, padd, rem,ctx)
-BIGNUM *p;
-int bits;
-BIGNUM *padd;
-BIGNUM *rem;
-BN_CTX *ctx;
+static int probable_prime_dh_safe(BIGNUM *p, int bits, const BIGNUM *padd,
+	const BIGNUM *rem, BN_CTX *ctx)
 	{
 	int i,ret=0;
-	BIGNUM *t1,*qadd=NULL,*q=NULL;
+	BIGNUM *t1,*qadd,*q;
 
 	bits--;
-	t1=ctx->bn[ctx->tos++];
-	q=ctx->bn[ctx->tos++];
-	qadd=ctx->bn[ctx->tos++];
+	BN_CTX_start(ctx);
+	t1 = BN_CTX_get(ctx);
+	q = BN_CTX_get(ctx);
+	qadd = BN_CTX_get(ctx);
+	if (qadd == NULL) goto err;
 
 	if (!BN_rshift1(qadd,padd)) goto err;
 		
@@ -389,8 +451,8 @@ BN_CTX *ctx;
 		/* check that p and q are prime */
 		/* check that for p and q
 		 * gcd(p-1,primes) == 1 (except for 2) */
-		if (	(BN_mod_word(p,(BN_LONG)primes[i]) == 0) ||
-			(BN_mod_word(q,(BN_LONG)primes[i]) == 0))
+		if (	(BN_mod_word(p,(BN_ULONG)primes[i]) == 0) ||
+			(BN_mod_word(q,(BN_ULONG)primes[i]) == 0))
 			{
 			if (!BN_add(p,p,padd)) goto err;
 			if (!BN_add(q,q,qadd)) goto err;
@@ -399,75 +461,6 @@ BN_CTX *ctx;
 		}
 	ret=1;
 err:
-	ctx->tos-=3;
-	return(ret);
-	}
-
-#if 0
-static int witness(a, n,ctx)
-BIGNUM *a;
-BIGNUM *n;
-BN_CTX *ctx;
-	{
-	int k,i,nb,ret= -1;
-	BIGNUM *d,*dd,*tmp;
-	BIGNUM *d1,*d2,*x,*n1,*inv;
-
-	d1=ctx->bn[ctx->tos];
-	d2=ctx->bn[ctx->tos+1];
-	x=ctx->bn[ctx->tos+2];
-	n1=ctx->bn[ctx->tos+3];
-	inv=ctx->bn[ctx->tos+4];
-	ctx->tos+=5;
-
-	d=d1;
-	dd=d2;
-	if (!BN_one(d)) goto err;
-	if (!BN_sub(n1,n,d)) goto err; /* n1=n-1; */
-	k=BN_num_bits(n1);
-
-	/* i=BN_num_bits(n); */
-#ifdef RECP_MUL_MOD
-	nb=BN_reciprocal(inv,n,ctx); /**/
-	if (nb == -1) goto err;
-#endif
-
-	for (i=k-1; i>=0; i--)
-		{
-		if (BN_copy(x,d) == NULL) goto err;
-#ifndef RECP_MUL_MOD
-		if (!BN_mod_mul(dd,d,d,n,ctx)) goto err;
-#else
-		if (!BN_mod_mul_reciprocal(dd,d,d,n,inv,nb,ctx)) goto err;
-#endif
-		if (	BN_is_one(dd) &&
-			!BN_is_one(x) &&
-			(BN_cmp(x,n1) != 0))
-			{
-			ret=1;
-			goto err;
-			}
-		if (BN_is_bit_set(n1,i))
-			{
-#ifndef RECP_MUL_MOD
-			if (!BN_mod_mul(d,dd,a,n,ctx)) goto err;
-#else
-			if (!BN_mod_mul_reciprocal(d,dd,a,n,inv,nb,ctx)) goto err; 
-#endif
-			}
-		else
-			{
-			tmp=d;
-			d=dd;
-			dd=tmp;
-			}
-		}
-	if (BN_is_one(d))
-		i=0;
-	else	i=1;
-	ret=i;
-err:
-	ctx->tos-=5;
+	BN_CTX_end(ctx);
 	return(ret);
 	}
-#endif
diff --git a/src/lib/libcrypto/bn/bn_prime.h b/src/lib/libcrypto/bn/bn_prime.h
index 6fce0210cd..b7cf9a9bfe 100644
--- a/src/lib/libcrypto/bn/bn_prime.h
+++ b/src/lib/libcrypto/bn/bn_prime.h
@@ -1,4 +1,4 @@
-/* crypto/bn/bn_prime.h */
+/* Auto generated by bn_prime.pl */
 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
  * All rights reserved.
  *
@@ -61,7 +61,7 @@
 #else
 #define NUMPRIMES 54
 #endif
-static unsigned int primes[NUMPRIMES]=
+static const unsigned int primes[NUMPRIMES]=
 	{
 	   2,   3,   5,   7,  11,  13,  17,  19,
 	  23,  29,  31,  37,  41,  43,  47,  53,
diff --git a/src/lib/libcrypto/bn/bn_prime.pl b/src/lib/libcrypto/bn/bn_prime.pl
index 1b00c21a77..9fc3765486 100644
--- a/src/lib/libcrypto/bn/bn_prime.pl
+++ b/src/lib/libcrypto/bn/bn_prime.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl
+#!/usr/local/bin/perl
 # bn_prime.pl
 
 $num=2048;
@@ -18,13 +18,74 @@ loop: while ($#primes < $num-1)
 	push(@primes,$p);
 	}
 
-print <<"EOF";
+# print <<"EOF";
+# /* Auto generated by bn_prime.pl */
+# /* Copyright (C) 1995-1997 Eric Young (eay\@mincom.oz.au).
+#  * All rights reserved.
+#  * Copyright remains Eric Young's, and as such any Copyright notices in
+#  * the code are not to be removed.
+#  * See the COPYRIGHT file in the SSLeay distribution for more details.
+#  */
+# 
+# EOF
+
+print <<\EOF;
 /* Auto generated by bn_prime.pl */
-/* Copyright (C) 1995-1997 Eric Young (eay\@mincom.oz.au).
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
  * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
  * Copyright remains Eric Young's, and as such any Copyright notices in
  * the code are not to be removed.
- * See the COPYRIGHT file in the SSLeay distribution for more details.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
  */
 
 EOF
@@ -43,7 +104,7 @@ printf "#define NUMPRIMES %d\n",$num;
 printf "#else\n";
 printf "#define NUMPRIMES %d\n",$eight;
 printf "#endif\n";
-print "static unsigned int primes[NUMPRIMES]=\n\t{\n\t";
+print "static const unsigned int primes[NUMPRIMES]=\n\t{\n\t";
 $init=0;
 for ($i=0; $i <= $#primes; $i++)
 	{
diff --git a/src/lib/libcrypto/bn/bn_print.c b/src/lib/libcrypto/bn/bn_print.c
index 2bcc11c852..5f46b1826c 100644
--- a/src/lib/libcrypto/bn/bn_print.c
+++ b/src/lib/libcrypto/bn/bn_print.c
@@ -59,20 +59,19 @@
 #include <stdio.h>
 #include <ctype.h>
 #include "cryptlib.h"
-#include "buffer.h"
+#include <openssl/buffer.h>
 #include "bn_lcl.h"
 
-static char *Hex="0123456789ABCDEF";
+static const char *Hex="0123456789ABCDEF";
 
-/* Must 'Free' the returned data */
-char *BN_bn2hex(a)
-BIGNUM *a;
+/* Must 'OPENSSL_free' the returned data */
+char *BN_bn2hex(const BIGNUM *a)
 	{
 	int i,j,v,z=0;
 	char *buf;
 	char *p;
 
-	buf=(char *)Malloc(a->top*BN_BYTES*2+2);
+	buf=(char *)OPENSSL_malloc(a->top*BN_BYTES*2+2);
 	if (buf == NULL)
 		{
 		BNerr(BN_F_BN_BN2HEX,ERR_R_MALLOC_FAILURE);
@@ -100,9 +99,8 @@ err:
 	return(buf);
 	}
 
-/* Must 'Free' the returned data */
-char *BN_bn2dec(a)
-BIGNUM *a;
+/* Must 'OPENSSL_free' the returned data */
+char *BN_bn2dec(const BIGNUM *a)
 	{
 	int i=0,num;
 	char *buf=NULL;
@@ -112,8 +110,8 @@ BIGNUM *a;
 
 	i=BN_num_bits(a)*3;
 	num=(i/10+i/1000+3)+1;
-	bn_data=(BN_ULONG *)Malloc((num/BN_DEC_NUM+1)*sizeof(BN_ULONG));
-	buf=(char *)Malloc(num+3);
+	bn_data=(BN_ULONG *)OPENSSL_malloc((num/BN_DEC_NUM+1)*sizeof(BN_ULONG));
+	buf=(char *)OPENSSL_malloc(num+3);
 	if ((buf == NULL) || (bn_data == NULL))
 		{
 		BNerr(BN_F_BN_BN2DEC,ERR_R_MALLOC_FAILURE);
@@ -139,7 +137,7 @@ BIGNUM *a;
 			}
 		lp--;
 		/* We now have a series of blocks, BN_DEC_NUM chars
-		 * in length, where the last one needs trucation.
+		 * in length, where the last one needs truncation.
 		 * The blocks need to be reversed in order. */
 		sprintf(p,BN_DEC_FMT1,*lp);
 		while (*p) p++;
@@ -151,14 +149,12 @@ BIGNUM *a;
 			}
 		}
 err:
-	if (bn_data != NULL) Free(bn_data);
+	if (bn_data != NULL) OPENSSL_free(bn_data);
 	if (t != NULL) BN_free(t);
 	return(buf);
 	}
 
-int BN_hex2bn(bn,a)
-BIGNUM **bn;
-char *a;
+int BN_hex2bn(BIGNUM **bn, const char *a)
 	{
 	BIGNUM *ret=NULL;
 	BN_ULONG l=0;
@@ -169,13 +165,13 @@ char *a;
 
 	if (*a == '-') { neg=1; a++; }
 
-	for (i=0; isxdigit(a[i]); i++)
+	for (i=0; isxdigit((unsigned char) a[i]); i++)
 		;
 
 	num=i+neg;
 	if (bn == NULL) return(num);
 
-	/* a is the start of the hex digets, and it is 'i' long */
+	/* a is the start of the hex digits, and it is 'i' long */
 	if (*bn == NULL)
 		{
 		if ((ret=BN_new()) == NULL) return(0);
@@ -189,7 +185,7 @@ char *a;
 	/* i is the number of hex digests; */
 	if (bn_expand(ret,i*4) == NULL) goto err;
 
-	j=i; /* least significate 'hex' */
+	j=i; /* least significant 'hex' */
 	m=0;
 	h=0;
 	while (j > 0)
@@ -224,9 +220,7 @@ err:
 	return(0);
 	}
 
-int BN_dec2bn(bn,a)
-BIGNUM **bn;
-char *a;
+int BN_dec2bn(BIGNUM **bn, const char *a)
 	{
 	BIGNUM *ret=NULL;
 	BN_ULONG l=0;
@@ -236,14 +230,14 @@ char *a;
 	if ((a == NULL) || (*a == '\0')) return(0);
 	if (*a == '-') { neg=1; a++; }
 
-	for (i=0; isdigit(a[i]); i++)
+	for (i=0; isdigit((unsigned char) a[i]); i++)
 		;
 
 	num=i+neg;
 	if (bn == NULL) return(num);
 
-	/* a is the start of the digets, and it is 'i' long.
-	 * We chop it into BN_DEC_NUM digets at a time */
+	/* a is the start of the digits, and it is 'i' long.
+	 * We chop it into BN_DEC_NUM digits at a time */
 	if (*bn == NULL)
 		{
 		if ((ret=BN_new()) == NULL) return(0);
@@ -283,12 +277,9 @@ err:
 	return(0);
 	}
 
-#ifndef NO_BIO
-
-#ifndef NO_FP_API
-int BN_print_fp(fp, a)
-FILE *fp;
-BIGNUM *a;
+#ifndef OPENSSL_NO_BIO
+#ifndef OPENSSL_NO_FP_API
+int BN_print_fp(FILE *fp, const BIGNUM *a)
 	{
 	BIO *b;
 	int ret;
@@ -302,9 +293,7 @@ BIGNUM *a;
 	}
 #endif
 
-int BN_print(bp, a)
-BIO *bp;
-BIGNUM *a;
+int BN_print(BIO *bp, const BIGNUM *a)
 	{
 	int i,j,v,z=0;
 	int ret=0;
@@ -329,5 +318,15 @@ BIGNUM *a;
 end:
 	return(ret);
 	}
+#endif
 
+#ifdef BN_DEBUG
+void bn_dump1(FILE *o, const char *a, const BN_ULONG *b,int n) /* debug aid: write label a and the n words of b to o as one hex line */
+	{
+	int i;
+	fprintf(o, "%s=", a); /* the label comes first, followed by '=' */
+	for (i=n-1;i>=0;i--) /* walk from b[n-1] down to b[0], so b[n-1] is printed leftmost */
+		fprintf(o, "%08lX", b[i]); /* assumes 32-bit BN_ULONG */
+	fprintf(o, "\n");
+	}
 #endif
diff --git a/src/lib/libcrypto/bn/bn_rand.c b/src/lib/libcrypto/bn/bn_rand.c
index 75b6b0493b..9e08ccd22e 100644
--- a/src/lib/libcrypto/bn/bn_rand.c
+++ b/src/lib/libcrypto/bn/bn_rand.c
@@ -55,28 +55,83 @@
  * copied and put under another distribution licence
  * [including the GNU Public Licence.]
  */
+/* ====================================================================
+ * Copyright (c) 1998-2001 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
 
 #include <stdio.h>
 #include <time.h>
 #include "cryptlib.h"
 #include "bn_lcl.h"
-#include "rand.h"
+#include <openssl/rand.h>
 
-int BN_rand(rnd, bits, top, bottom)
-BIGNUM *rnd;
-int bits;
-int top;
-int bottom;
+static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
 	{
 	unsigned char *buf=NULL;
 	int ret=0,bit,bytes,mask;
 	time_t tim;
 
+	if (bits == 0)
+		{
+		BN_zero(rnd);
+		return 1;
+		}
+
 	bytes=(bits+7)/8;
 	bit=(bits-1)%8;
-	mask=0xff<<bit;
+	mask=0xff<<(bit+1);
 
-	buf=(unsigned char *)Malloc(bytes);
+	buf=(unsigned char *)OPENSSL_malloc(bytes);
 	if (buf == NULL)
 		{
 		BNerr(BN_F_BN_RAND,ERR_R_MALLOC_FAILURE);
@@ -85,28 +140,61 @@ int bottom;
 
 	/* make a random number and set the top and bottom bits */
 	time(&tim);
-	RAND_seed((unsigned char *)&tim,sizeof(tim));
+	RAND_add(&tim,sizeof(tim),0);
 
-	RAND_bytes(buf,(int)bytes);
-	if (top)
+	if (pseudorand)
 		{
-		if (bit == 0)
+		if (RAND_pseudo_bytes(buf, bytes) == -1)
+			goto err;
+		}
+	else
+		{
+		if (RAND_bytes(buf, bytes) <= 0)
+			goto err;
+		}
+
+#if 1
+	if (pseudorand == 2)
+		{
+		/* generate patterns that are more likely to trigger BN
+		   library bugs */
+		int i;
+		unsigned char c;
+
+		for (i = 0; i < bytes; i++)
 			{
-			buf[0]=1;
-			buf[1]|=0x80;
+			RAND_pseudo_bytes(&c, 1);
+			if (c >= 128 && i > 0)
+				buf[i] = buf[i-1];
+			else if (c < 42)
+				buf[i] = 0;
+			else if (c < 84)
+				buf[i] = 255;
+			}
+		}
+#endif
+
+	if (top != -1)
+		{
+		if (top)
+			{
+			if (bit == 0)
+				{
+				buf[0]=1;
+				buf[1]|=0x80;
+				}
+			else
+				{
+				buf[0]|=(3<<(bit-1));
+				}
 			}
 		else
 			{
-			buf[0]|=(3<<(bit-1));
-			buf[0]&= ~(mask<<1);
+			buf[0]|=(1<<bit);
 			}
 		}
-	else
-		{
-		buf[0]|=(1<<bit);
-		buf[0]&= ~(mask<<1);
-		}
-	if (bottom) /* set bottom bits to whatever odd is */
+	buf[0] &= ~mask;
+	if (bottom) /* set bottom bit if requested */
 		buf[bytes-1]|=1;
 	if (!BN_bin2bn(buf,bytes,rnd)) goto err;
 	ret=1;
@@ -114,8 +202,90 @@ err:
 	if (buf != NULL)
 		{
 		memset(buf,0,bytes);
-		Free(buf);
+		OPENSSL_free(buf);
 		}
 	return(ret);
 	}
 
+int     BN_rand(BIGNUM *rnd, int bits, int top, int bottom) /* random number of the given bit length; returns what bnrand returns */
+	{
+	return bnrand(0, rnd, bits, top, bottom); /* pseudorand == 0: bnrand fills the buffer with RAND_bytes */
+	}
+
+int     BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom) /* same interface as BN_rand, different byte source */
+	{
+	return bnrand(1, rnd, bits, top, bottom); /* pseudorand == 1: bnrand fills the buffer with RAND_pseudo_bytes */
+	}
+
+#if 1
+int     BN_bntest_rand(BIGNUM *rnd, int bits, int top, int bottom) /* test-oriented variant of BN_pseudo_rand */
+	{
+	return bnrand(2, rnd, bits, top, bottom); /* pseudorand == 2: bnrand also rewrites bytes into patterns meant to trigger BN library bugs */
+	}
+#endif
+
+
+/* random number r:  0 <= r < range; pseudo != 0 selects BN_pseudo_rand instead of BN_rand; returns 1 on success, 0 on error */
+static int bn_rand_range(int pseudo, BIGNUM *r, BIGNUM *range)
+	{
+	int (*bn_rand)(BIGNUM *, int, int, int) = pseudo ? BN_pseudo_rand : BN_rand;
+	int n;
+
+	if (range->neg || BN_is_zero(range)) /* only a strictly positive range is accepted */
+		{
+		BNerr(BN_F_BN_RAND_RANGE, BN_R_INVALID_RANGE);
+		return 0;
+		}
+
+	n = BN_num_bits(range); /* n > 0 */
+
+	/* BN_is_bit_set(range, n - 1) always holds */
+
+	if (n == 1)
+		{
+		if (!BN_zero(r)) return 0; /* range == 1, so 0 is the only value below it */
+		}
+	else if (!BN_is_bit_set(range, n - 2) && !BN_is_bit_set(range, n - 3)) /* NOTE(review): for n == 2 this asks for bit -1; presumably BN_is_bit_set returns 0 for a negative index -- confirm */
+		{
+		/* range = 100..._2,
+		 * so  3*range (= 11..._2)  is exactly one bit longer than  range */
+		do
+			{
+			if (!bn_rand(r, n + 1, -1, 0)) return 0; /* top == -1: no leading bit is forced, bottom == 0: low bit left alone */
+			/* If  r < 3*range,  use  r := r MOD range
+			 * (which is either  r, r - range,  or  r - 2*range).
+			 * Otherwise, iterate once more.
+			 * Since  3*range = 11..._2, each iteration succeeds with
+			 * probability >= .75. */
+			if (BN_cmp(r ,range) >= 0)
+				{
+				if (!BN_sub(r, r, range)) return 0;
+				if (BN_cmp(r, range) >= 0)
+					if (!BN_sub(r, r, range)) return 0;
+				}
+			}
+		while (BN_cmp(r, range) >= 0); /* still too large after two subtractions means r >= 3*range: draw again */
+		}
+	else
+		{
+		do
+			{
+			/* range = 11..._2  or  range = 101..._2 */
+			if (!bn_rand(r, n, -1, 0)) return 0; /* draw n bits with no forced top bit */
+			}
+		while (BN_cmp(r, range) >= 0); /* plain rejection: retry until r is below range */
+		}
+
+	return 1;
+	}
+
+
+int	BN_rand_range(BIGNUM *r, BIGNUM *range) /* r := random value with 0 <= r < range; returns 1 on success, 0 on error */
+	{
+	return bn_rand_range(0, r, range); /* pseudo == 0: the helper draws through BN_rand */
+	}
+
+int	BN_pseudo_rand_range(BIGNUM *r, BIGNUM *range) /* same contract as BN_rand_range, different source */
+	{
+	return bn_rand_range(1, r, range); /* pseudo == 1: the helper draws through BN_pseudo_rand */
+	}
diff --git a/src/lib/libcrypto/bn/bn_recp.c b/src/lib/libcrypto/bn/bn_recp.c
index 72cd69d3fc..ef5fdd4708 100644
--- a/src/lib/libcrypto/bn/bn_recp.c
+++ b/src/lib/libcrypto/bn/bn_recp.c
@@ -60,66 +60,171 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-int BN_mod_mul_reciprocal(r, x, y, m, i, nb, ctx)
-BIGNUM *r;
-BIGNUM *x;
-BIGNUM *y;
-BIGNUM *m;
-BIGNUM *i;
-int nb;
-BN_CTX *ctx;
+void BN_RECP_CTX_init(BN_RECP_CTX *recp) /* put a caller-provided BN_RECP_CTX into its empty state */
 	{
+	BN_init(&(recp->N)); /* N: the divisor, filled in by BN_RECP_CTX_set */
+	BN_init(&(recp->Nr)); /* Nr: the reciprocal, computed on demand in BN_div_recp */
+	recp->num_bits=0;
+	recp->flags=0; /* BN_FLG_MALLOCED not set: BN_RECP_CTX_free will not release the struct itself */
+	}
+
+BN_RECP_CTX *BN_RECP_CTX_new(void) /* allocate and initialise a BN_RECP_CTX; returns NULL if the allocation fails */
+	{
+	BN_RECP_CTX *ret;
+
+	if ((ret=(BN_RECP_CTX *)OPENSSL_malloc(sizeof(BN_RECP_CTX))) == NULL)
+		return(NULL);
+
+	BN_RECP_CTX_init(ret);
+	ret->flags=BN_FLG_MALLOCED; /* marks the struct as heap-owned so BN_RECP_CTX_free releases it */
+	return(ret);
+	}
+
+void BN_RECP_CTX_free(BN_RECP_CTX *recp) /* release N and Nr, and the struct itself when it was allocated by BN_RECP_CTX_new */
+	{
+	if(recp == NULL) /* a NULL argument is accepted and ignored */
+	    return;
+
+	BN_free(&(recp->N));
+	BN_free(&(recp->Nr));
+	if (recp->flags & BN_FLG_MALLOCED) /* only set by BN_RECP_CTX_new; BN_RECP_CTX_init clears it */
+		OPENSSL_free(recp);
+	}
 
-	a=ctx->bn[ctx->tos++];
-	b=ctx->bn[ctx->tos++];
-	c=ctx->bn[ctx->tos++];
-	d=ctx->bn[ctx->tos++];
+int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx) /* install d as the divisor N; returns 1 on success, 0 on error; ctx is not used in this body */
+	{
+	if (!BN_copy(&(recp->N),d)) return 0;
+	if (!BN_zero(&(recp->Nr))) return 0; /* any previous reciprocal is discarded */
+	recp->num_bits=BN_num_bits(d);
+	recp->shift=0; /* BN_div_recp recomputes Nr whenever its shift differs from recp->shift */
+	return(1);
+	}
+
+int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
+	BN_RECP_CTX *recp, BN_CTX *ctx) /* r := remainder of x*y (or of x alone when y == NULL) divided by recp->N; returns 1 on success, 0 on error */
+	{
+	int ret=0;
+	BIGNUM *a;
+	const BIGNUM *ca;
 
-	if (x == y)
-		{ if (!BN_sqr(a,x,ctx)) goto err; }
+	BN_CTX_start(ctx);
+	if ((a = BN_CTX_get(ctx)) == NULL) goto err;
+	if (y != NULL)
+		{
+		if (x == y) /* same operand twice: square instead of multiplying */
+			{ if (!BN_sqr(a,x,ctx)) goto err; }
+		else
+			{ if (!BN_mul(a,x,y,ctx)) goto err; }
+		ca = a; /* reduce the product held in the temporary */
+		}
 	else
-		{ if (!BN_mul(a,x,y)) goto err; }
-	if (!BN_rshift(d,a,nb)) goto err;
-	if (!BN_mul(b,d,i)) goto err;
-	if (!BN_rshift(c,b,nb)) goto err;
-	if (!BN_mul(b,m,c)) goto err;
-	if (!BN_sub(r,a,b)) goto err;
+		ca=x; /* Just do the mod */
+
+	ret = BN_div_recp(NULL,r,ca,recp,ctx); /* dv == NULL: only the remainder is wanted */
+err:
+	BN_CTX_end(ctx);
+	return(ret);
+	}
+
+int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
+	BN_RECP_CTX *recp, BN_CTX *ctx) /* dv := m / N, rem := m - N*dv with N = recp->N; returns 1 on success, 0 on error */
+	{
+	int i,j,ret=0;
+	BIGNUM *a,*b,*d,*r;
+
+	BN_CTX_start(ctx);
+	a=BN_CTX_get(ctx);
+	b=BN_CTX_get(ctx);
+	if (dv != NULL) /* dv and rem may each be NULL; a ctx temporary then stands in */
+		d=dv;
+	else
+		d=BN_CTX_get(ctx);
+	if (rem != NULL)
+		r=rem;
+	else
+		r=BN_CTX_get(ctx);
+	if (a == NULL || b == NULL || d == NULL || r == NULL) goto err;
+
+	if (BN_ucmp(m,&(recp->N)) < 0) /* |m| < |N|: quotient 0, remainder m */
+		{
+		if (!BN_zero(d)) goto err; /* leave through err so BN_CTX_end balances BN_CTX_start */
+		if (!BN_copy(r,m)) goto err; /* ret is still 0 here, so err reports the failure */
+		BN_CTX_end(ctx);
+		return(1);
+		}
+
+	/* We want the remainder
+	 * Given input of ABCDEF / ab
+	 * we need multiply ABCDEF by 3 digits of the reciprocal of ab
+	 * (the estimate is corrected by the loop further down)
+	 */
+
+	/* i := max(BN_num_bits(m), 2*BN_num_bits(N)) */
+	i=BN_num_bits(m);
+	j=recp->num_bits<<1;
+	if (j>i) i=j;
+
+	/* Nr := round(2^i / N) */
+	if (i != recp->shift) /* the cached reciprocal is reused only when it was built for this same i */
+		recp->shift=BN_reciprocal(&(recp->Nr),&(recp->N),
+			i,ctx); /* BN_reciprocal returns i, or -1 for an error */
+	if (recp->shift == -1) goto err;
+
+	/* d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i - BN_num_bits(N)))|
+	 *    = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i - BN_num_bits(N)))|
+	 *   <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)|
+	 *    = |m/N|
+	 */
+	if (!BN_rshift(a,m,recp->num_bits)) goto err;
+	if (!BN_mul(b,a,&(recp->Nr),ctx)) goto err;
+	if (!BN_rshift(d,b,i-recp->num_bits)) goto err;
+	d->neg=0; /* work on magnitudes; signs are restored at the end */
+
+	if (!BN_mul(b,&(recp->N),d,ctx)) goto err;
+	if (!BN_usub(r,m,b)) goto err; /* r := |m| - |N|*d, the remainder for the estimated quotient */
+	r->neg=0;
+
+#if 1
 	j=0;
-	while (BN_cmp(r,m) >= 0)
+	while (BN_ucmp(r,&(recp->N)) >= 0) /* estimate too small: move one N from r into d */
 		{
 		if (j++ > 2)
 			{
 			BNerr(BN_F_BN_MOD_MUL_RECIPROCAL,BN_R_BAD_RECIPROCAL);
 			goto err;
 			}
-		if (!BN_sub(r,r,m)) goto err;
+		if (!BN_usub(r,r,&(recp->N))) goto err;
+		if (!BN_add_word(d,1)) goto err;
 		}
+#endif
 
+	r->neg=BN_is_zero(r)?0:m->neg; /* a non-zero remainder takes the sign of m */
+	d->neg=m->neg^recp->N.neg; /* quotient is negative when m and N differ in sign */
 	ret=1;
 err:
-	ctx->tos-=4;
+	BN_CTX_end(ctx);
 	return(ret);
-	}
+	} 
 
-int BN_reciprocal(r, m,ctx)
-BIGNUM *r;
-BIGNUM *m;
-BN_CTX *ctx;
+/* len is the expected size of the result
+ * We actually calculate with an extra word of precision, so
+ * we can do faster division if the remainder is not required.
+ */
+/* r := 2^len / m; returns len on success, -1 on error */
+int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx)
 	{
-	int nm,ret= -1;
-	BIGNUM *t;
+	int ret= -1;
+	BIGNUM t; /* local temporary, released with BN_free at err */
 
-	t=ctx->bn[ctx->tos++];
+	BN_init(&t);
 
-	nm=BN_num_bits(m);
-	if (!BN_lshift(t,BN_value_one(),nm*2)) goto err;
+	if (!BN_zero(&t)) goto err;
+	if (!BN_set_bit(&t,len)) goto err; /* t := 2^len */
 
-	if (!BN_div(r,NULL,t,m,ctx)) goto err;
-	ret=nm;
+	if (!BN_div(r,NULL,&t,m,ctx)) goto err; /* quotient only; the remainder argument is NULL */
+
+	ret=len;
 err:
-	ctx->tos--;
+	BN_free(&t);
 	return(ret);
 	}
-
diff --git a/src/lib/libcrypto/bn/bn_shift.c b/src/lib/libcrypto/bn/bn_shift.c
index 944bf1794b..70f785ea18 100644
--- a/src/lib/libcrypto/bn/bn_shift.c
+++ b/src/lib/libcrypto/bn/bn_shift.c
@@ -60,9 +60,7 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-int BN_lshift1(r, a)
-BIGNUM *r;
-BIGNUM *a;
+int BN_lshift1(BIGNUM *r, const BIGNUM *a)
 	{
 	register BN_ULONG *ap,*rp,t,c;
 	int i;
@@ -94,9 +92,7 @@ BIGNUM *a;
 	return(1);
 	}
 
-int BN_rshift1(r, a)
-BIGNUM *r;
-BIGNUM *a;
+int BN_rshift1(BIGNUM *r, const BIGNUM *a)
 	{
 	BN_ULONG *ap,*rp,t,c;
 	int i;
@@ -125,18 +121,15 @@ BIGNUM *a;
 	return(1);
 	}
 
-int BN_lshift(r, a, n)
-BIGNUM *r;
-BIGNUM *a;
-int n;
+int BN_lshift(BIGNUM *r, const BIGNUM *a, int n)
 	{
 	int i,nw,lb,rb;
 	BN_ULONG *t,*f;
 	BN_ULONG l;
 
 	r->neg=a->neg;
-	if (bn_wexpand(r,a->top+(n/BN_BITS2)+1) == NULL) return(0);
 	nw=n/BN_BITS2;
+	if (bn_wexpand(r,a->top+nw+1) == NULL) return(0);
 	lb=n%BN_BITS2;
 	rb=BN_BITS2-lb;
 	f=a->d;
@@ -160,10 +153,7 @@ int n;
 	return(1);
 	}
 
-int BN_rshift(r, a, n)
-BIGNUM *r;
-BIGNUM *a;
-int n;
+int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
 	{
 	int i,j,nw,lb,rb;
 	BN_ULONG *t,*f;
@@ -172,7 +162,7 @@ int n;
 	nw=n/BN_BITS2;
 	rb=n%BN_BITS2;
 	lb=BN_BITS2-rb;
-	if (nw > a->top)
+	if (nw > a->top || a->top == 0)
 		{
 		BN_zero(r);
 		return(1);
@@ -182,6 +172,11 @@ int n;
 		r->neg=a->neg;
 		if (bn_wexpand(r,a->top-nw+1) == NULL) return(0);
 		}
+	else
+		{
+		if (n == 0)
+			return 1; /* or the copying loop will go berserk */
+		}
 
 	f= &(a->d[nw]);
 	t=r->d;
diff --git a/src/lib/libcrypto/bn/bn_sqr.c b/src/lib/libcrypto/bn/bn_sqr.c
index a8464610e5..c1d0cca438 100644
--- a/src/lib/libcrypto/bn/bn_sqr.c
+++ b/src/lib/libcrypto/bn/bn_sqr.c
@@ -62,35 +62,105 @@
 
 /* r must not be a */
 /* I've just gone over this and it is now %20 faster on x86 - eay - 27 Jun 96 */
-int BN_sqr(r, a, ctx)
-BIGNUM *r;
-BIGNUM *a;
-BN_CTX *ctx;
+int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) /* r := a*a; r == a is handled through a ctx temporary; returns 1 on success, 0 on error */
 	{
-	int i,j,max,al;
-	BIGNUM *tmp;
-	BN_ULONG *ap,*rp;
+	int max,al;
+	int ret = 0;
+	BIGNUM *tmp,*rr;
 
-	tmp=ctx->bn[ctx->tos];
+#ifdef BN_COUNT
+	fprintf(stderr,"BN_sqr %d * %d\n",a->top,a->top);
+#endif
+	bn_check_top(a);
 
 	al=a->top;
-	if (al == 0)
+	if (al <= 0) /* nothing to square: the result has no words */
 		{
 		r->top=0;
 		return(1);
 		}
 
-	max=(al*2);
-	if (bn_wexpand(r,1+max) == NULL) return(0);
-	if (bn_wexpand(tmp,1+max) == NULL) return(0);
+	BN_CTX_start(ctx);
+	rr=(a != r) ? r : BN_CTX_get(ctx); /* compute into a temporary when the output aliases the input */
+	tmp=BN_CTX_get(ctx);
+	if (rr == NULL || tmp == NULL) goto err; /* check both results explicitly rather than relying on tmp alone */
 
-	r->neg=0;
+	max=(al+al); /* a square of al words needs at most 2*al words */
+	if (bn_wexpand(rr,max+1) == NULL) goto err;
 
-	ap=a->d;
-	rp=r->d;
+	if (al == 4)
+		{
+#ifndef BN_SQR_COMBA
+		BN_ULONG t[8];
+		bn_sqr_normal(rr->d,a->d,4,t);
+#else
+		bn_sqr_comba4(rr->d,a->d);
+#endif
+		}
+	else if (al == 8)
+		{
+#ifndef BN_SQR_COMBA
+		BN_ULONG t[16];
+		bn_sqr_normal(rr->d,a->d,8,t);
+#else
+		bn_sqr_comba8(rr->d,a->d);
+#endif
+		}
+	else 
+		{
+#if defined(BN_RECURSION)
+		if (al < BN_SQR_RECURSIVE_SIZE_NORMAL)
+			{
+			BN_ULONG t[BN_SQR_RECURSIVE_SIZE_NORMAL*2]; /* bn_sqr_normal wants 2*al scratch words; al is below the limit here */
+			bn_sqr_normal(rr->d,a->d,al,t);
+			}
+		else
+			{
+			int j,k;
+
+			j=BN_num_bits_word((BN_ULONG)al);
+			j=1<<(j-1); /* j := largest power of two not above al */
+			k=j+j;
+			if (al == j) /* the recursive routine is used only for power-of-two lengths */
+				{
+				if (bn_wexpand(tmp,k*2) == NULL) goto err; /* 4*al scratch words for the recursion */
+				bn_sqr_recursive(rr->d,a->d,al,tmp->d);
+				}
+			else
+				{
+				if (bn_wexpand(tmp,max) == NULL) goto err;
+				bn_sqr_normal(rr->d,a->d,al,tmp->d);
+				}
+			}
+#else
+		if (bn_wexpand(tmp,max) == NULL) goto err;
+		bn_sqr_normal(rr->d,a->d,al,tmp->d);
+#endif
+		}
+
+	rr->top=max;
+	rr->neg=0; /* a square is never negative */
+	if ((max > 0) && (rr->d[max-1] == 0)) rr->top--; /* drop the top word when it came out zero */
+	if (rr != r && BN_copy(r,rr) == NULL) goto err; /* a failed copy-back must not be reported as success */
+	ret = 1;
+ err:
+	BN_CTX_end(ctx);
+	return(ret);
+	}
+
+/* tmp must have 2*n words */
+void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp)
+	{
+	int i,j,max;
+	const BN_ULONG *ap;
+	BN_ULONG *rp;
+
+	max=n*2;
+	ap=a;
+	rp=r;
 	rp[0]=rp[max-1]=0;
 	rp++;
-	j=al;
+	j=n;
 
 	if (--j > 0)
 		{
@@ -99,7 +169,7 @@ BN_CTX *ctx;
 		rp+=2;
 		}
 
-	for (i=2; i<al; i++)
+	for (i=n-2; i>0; i--)
 		{
 		j--;
 		ap++;
@@ -107,16 +177,112 @@ BN_CTX *ctx;
 		rp+=2;
 		}
 
-	bn_add_words(r->d,r->d,r->d,max);
+	bn_add_words(r,r,r,max);
 
 	/* There will not be a carry */
 
-	bn_sqr_words(tmp->d,a->d,al);
+	bn_sqr_words(tmp,a,n);
 
-	bn_add_words(r->d,r->d,tmp->d,max);
-
-	r->top=max;
-	if (r->d[max-1] == 0) r->top--;
-	return(1);
+	bn_add_words(r,r,tmp,max);
 	}
 
+#ifdef BN_RECURSION
+/* Square the n2-word number a into r, which is 2*n2 words in size.
+ * a is split into a low half a[0] and a high half a[1] of n = n2/2 words.
+ * n must be a power of 2.
+ * t is scratch space: this level uses t[0 .. 2*n2-1] and hands &t[2*n2]
+ * to the recursive calls (BN_sqr supplies 4*n2 words in total).
+ * We calculate
+ * a[0]*a[0]
+ * a[0]*a[0]+a[1]*a[1]+(a[0]-a[1])*(a[1]-a[0])    (which is 2*a[0]*a[1])
+ * a[1]*a[1]
+ */
+void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2, BN_ULONG *t)
+	{
+	int n=n2/2;
+	int zero,c1;
+	BN_ULONG ln,lo,*p;
+
+#ifdef BN_COUNT
+	fprintf(stderr," bn_sqr_recursive %d * %d\n",n2,n2);
+#endif
+	if (n2 == 4)
+		{
+#ifndef BN_SQR_COMBA
+		bn_sqr_normal(r,a,4,t);
+#else
+		bn_sqr_comba4(r,a);
+#endif
+		return;
+		}
+	else if (n2 == 8)
+		{
+#ifndef BN_SQR_COMBA
+		bn_sqr_normal(r,a,8,t);
+#else
+		bn_sqr_comba8(r,a);
+#endif
+		return;
+		}
+	if (n2 < BN_SQR_RECURSIVE_SIZE_NORMAL) /* below the cut-off the schoolbook routine is used */
+		{
+		bn_sqr_normal(r,a,n2,t);
+		return;
+		}
+	/* t[0 .. n-1] = |a[0]-a[1]|; its square is formed below */
+	c1=bn_cmp_words(a,&(a[n]),n);
+	zero=0;
+	if (c1 > 0)
+		bn_sub_words(t,a,&(a[n]),n);
+	else if (c1 < 0)
+		bn_sub_words(t,&(a[n]),a,n);
+	else
+		zero=1;
+
+	/* (a[0]-a[1])*(a[1]-a[0]) will always be negative unless it is zero */
+	p= &(t[n2*2]); /* scratch area for the three recursive calls */
+
+	if (!zero)
+		bn_sqr_recursive(&(t[n2]),t,n,p);
+	else
+		memset(&(t[n2]),0,n2*sizeof(BN_ULONG));
+	bn_sqr_recursive(r,a,n,p);
+	bn_sqr_recursive(&(r[n2]),&(a[n]),n,p);
+
+	/* t[32] holds |a[0]-a[1]|^2, i.e. the magnitude of (a[0]-a[1])*(a[1]-a[0])
+	 * r[10] holds (a[0]*a[0])
+	 * r[32] holds (a[1]*a[1])
+	 */
+
+	c1=(int)(bn_add_words(t,r,&(r[n2]),n2));
+
+	/* the (a[0]-a[1])*(a[1]-a[0]) term is negative, so its magnitude in t[32] is subtracted */
+	c1-=(int)(bn_sub_words(&(t[n2]),t,&(t[n2]),n2));
+
+	/* t[32] holds (a[0]-a[1])*(a[1]-a[0])+(a[0]*a[0])+(a[1]*a[1])
+	 * r[10] holds (a[0]*a[0])
+	 * r[32] holds (a[1]*a[1])
+	 * c1 holds the carry bits
+	 */
+	c1+=(int)(bn_add_words(&(r[n]),&(r[n]),&(t[n2]),n2)); /* add the middle term in at word offset n */
+	if (c1)
+		{
+		p= &(r[n+n2]); /* first word above the range just added to */
+		lo= *p;
+		ln=(lo+c1)&BN_MASK2;
+		*p=ln;
+
+		/* The overflow will stop before we over write
+		 * words we should not overwrite */
+		if (ln < (BN_ULONG)c1) /* the addition wrapped: ripple a carry of 1 upwards */
+			{
+			do	{
+				p++;
+				lo= *p;
+				ln=(lo+1)&BN_MASK2;
+				*p=ln;
+				} while (ln == 0);
+			}
+		}
+	}
+#endif
diff --git a/src/lib/libcrypto/bn/bn_word.c b/src/lib/libcrypto/bn/bn_word.c
index 4b3d0f011d..cd59baa2c4 100644
--- a/src/lib/libcrypto/bn/bn_word.c
+++ b/src/lib/libcrypto/bn/bn_word.c
@@ -60,9 +60,7 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-BN_ULONG BN_mod_word(a, w)
-BIGNUM *a;
-unsigned long w;
+BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w)
 	{
 #ifndef BN_LLONG
 	BN_ULONG ret=0;
@@ -75,8 +73,8 @@ unsigned long w;
 	for (i=a->top-1; i>=0; i--)
 		{
 #ifndef BN_LLONG
-		ret=((ret<<BN_BITS4)|((a->d[i]>>BN_BITS4)&BN_MASK2l))%(unsigned long)w;
-		ret=((ret<<BN_BITS4)|(a->d[i]&BN_MASK2l))%(unsigned long)w;
+		ret=((ret<<BN_BITS4)|((a->d[i]>>BN_BITS4)&BN_MASK2l))%w;
+		ret=((ret<<BN_BITS4)|(a->d[i]&BN_MASK2l))%w;
 #else
 		ret=(BN_ULLONG)(((ret<<(BN_ULLONG)BN_BITS2)|a->d[i])%
 			(BN_ULLONG)w);
@@ -85,9 +83,7 @@ unsigned long w;
 	return((BN_ULONG)ret);
 	}
 
-BN_ULONG BN_div_word(a, w)
-BIGNUM *a;
-unsigned long w;
+BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w)
 	{
 	BN_ULONG ret;
 	int i;
@@ -100,18 +96,16 @@ unsigned long w;
 		BN_ULONG l,d;
 		
 		l=a->d[i];
-		d=bn_div64(ret,l,w);
+		d=bn_div_words(ret,l,w);
 		ret=(l-((d*w)&BN_MASK2))&BN_MASK2;
 		a->d[i]=d;
 		}
-	if (a->d[a->top-1] == 0)
+	if ((a->top > 0) && (a->d[a->top-1] == 0))
 		a->top--;
 	return(ret);
 	}
 
-int BN_add_word(a, w)
-BIGNUM *a;
-unsigned long w;
+int BN_add_word(BIGNUM *a, BN_ULONG w)
 	{
 	BN_ULONG l;
 	int i;
@@ -121,7 +115,7 @@ unsigned long w;
 		a->neg=0;
 		i=BN_sub_word(a,w);
 		if (!BN_is_zero(a))
-			a->neg=1;
+			a->neg=!(a->neg);
 		return(i);
 		}
 	w&=BN_MASK2;
@@ -142,13 +136,11 @@ unsigned long w;
 	return(1);
 	}
 
-int BN_sub_word(a, w)
-BIGNUM *a;
-unsigned long w;
+int BN_sub_word(BIGNUM *a, BN_ULONG w)
 	{
 	int i;
 
-	if (a->neg)
+	if (BN_is_zero(a) || a->neg)
 		{
 		a->neg=0;
 		i=BN_add_word(a,w);
@@ -183,22 +175,25 @@ unsigned long w;
 	return(1);
 	}
 
-int BN_mul_word(a,w)
-BIGNUM *a;
-unsigned long w;
+int BN_mul_word(BIGNUM *a, BN_ULONG w) /* a := a * w in place; returns 1 on success, 0 if a could not be expanded */
 	{
 	BN_ULONG ll;
 
 	w&=BN_MASK2;
 	if (a->top)
 		{
-		ll=bn_mul_words(a->d,a->d,a->top,w);
-		if (ll)
+		if (w == 0) /* multiplying by zero: set a to zero rather than leaving zero words behind */
+			BN_zero(a);
+		else
 			{
-			if (bn_wexpand(a,a->top+1) == NULL) return(0);
-			a->d[a->top++]=ll;
+			ll=bn_mul_words(a->d,a->d,a->top,w); /* ll receives the word carried out of the top */
+			if (ll)
+				{
+				if (bn_wexpand(a,a->top+1) == NULL) return(0);
+				a->d[a->top++]=ll; /* the carry becomes a new most significant word */
+				}
 			}
 		}
-	return(0);
+	return(1); /* success, including the case where a was already zero */
 	}
 
-- 
cgit v1.2.3-55-g6feb