From d4fcd82bb7f6d603bd61e19a81ba97337b89dfca Mon Sep 17 00:00:00 2001
From: markus <>
Date: Mon, 12 May 2003 02:18:40 +0000
Subject: merge 0.9.7b with local changes; crank majors for libssl/libcrypto

---
 src/lib/libcrypto/bn/Makefile.ssl   |  33 +--
 src/lib/libcrypto/bn/asm/ia64.S     | 235 +++++++++++-----
 src/lib/libcrypto/bn/asm/pa-risc2.s |  36 +--
 src/lib/libcrypto/bn/asm/vms.mar    | 254 +++++++++--------
 src/lib/libcrypto/bn/bn.h           |   2 +
 src/lib/libcrypto/bn/bn_div.c       |  28 +-
 src/lib/libcrypto/bn/bn_lcl.h       |  26 +-
 src/lib/libcrypto/bn/bn_lib.c       |   4 +-
 src/lib/libcrypto/bn/bn_mul.c       | 529 ++++++------------------------------
 src/lib/libcrypto/bn/bn_prime.c     |   2 +-
 src/lib/libcrypto/bn/bn_rand.c      |   2 +-
 src/lib/libcrypto/bn/bn_word.c      |   5 +-
 src/lib/libcrypto/bn/bntest.c       |  23 +-
 src/lib/libcrypto/bn/divtest.c      |   6 +-
 src/lib/libcrypto/bn/exptest.c      |  22 +-
 15 files changed, 500 insertions(+), 707 deletions(-)

(limited to 'src/lib/libcrypto/bn')

diff --git a/src/lib/libcrypto/bn/Makefile.ssl b/src/lib/libcrypto/bn/Makefile.ssl
index 6a479726c4..fa17d3c7d8 100644
--- a/src/lib/libcrypto/bn/Makefile.ssl
+++ b/src/lib/libcrypto/bn/Makefile.ssl
@@ -23,14 +23,6 @@ BN_ASM=		bn_asm.o
 
 CFLAGS= $(INCLUDES) $(CFLAG)
 
-# We let the C compiler driver to take care of .s files. This is done in
-# order to be excused from maintaining a separate set of architecture
-# dependent assembler flags. E.g. if you throw -mcpu=ultrasparc at SPARC
-# gcc, then the driver will automatically translate it to -xarch=v8plus
-# and pass it down to assembler.
-AS=$(CC) -c
-ASFLAGS=$(CFLAGS)
-
 GENERAL=Makefile
 TEST=bntest.c exptest.c
 APPS=
@@ -73,22 +65,11 @@ lib:	$(LIBOBJ)
 	@touch lib
 
 # elf
-asm/bn86-elf.o: asm/bn86unix.cpp
-	$(CPP) -DELF -x c asm/bn86unix.cpp | as -o asm/bn86-elf.o
-
-asm/co86-elf.o: asm/co86unix.cpp
-	$(CPP) -DELF -x c asm/co86unix.cpp | as -o asm/co86-elf.o
+asm/bn86-elf.s:	asm/bn-586.pl ../perlasm/x86asm.pl
+	(cd asm; $(PERL) bn-586.pl elf $(CFLAGS) > bn86-elf.s)
 
-# solaris
-asm/bn86-sol.o: asm/bn86unix.cpp
-	$(CC) -E -DSOL asm/bn86unix.cpp | sed 's/^#.*//' > asm/bn86-sol.s
-	as -o asm/bn86-sol.o asm/bn86-sol.s
-	rm -f asm/bn86-sol.s
-
-asm/co86-sol.o: asm/co86unix.cpp
-	$(CC) -E -DSOL asm/co86unix.cpp | sed 's/^#.*//' > asm/co86-sol.s
-	as -o asm/co86-sol.o asm/co86-sol.s
-	rm -f asm/co86-sol.s
+asm/co86-elf.s:	asm/co-586.pl ../perlasm/x86asm.pl
+	(cd asm; $(PERL) co-586.pl elf $(CFLAGS) > co86-elf.s)
 
 # a.out
 asm/bn86-out.o: asm/bn86unix.cpp
@@ -136,6 +117,8 @@ asm/ia64-cpp.o:	asm/ia64.S
 	$(CC) $(ASFLAGS) -c -o asm/ia64-cpp.o /tmp/ia64.$$$$.s;	\
 	rm -f /tmp/ia64.$$$$.s
 
+asm/x86_64-gcc.o: asm/x86_64-gcc.c
+
 files:
 	$(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO
 
@@ -169,14 +152,14 @@ lint:
 	lint -DLINT $(INCLUDES) $(SRC)>fluff
 
 depend:
-	$(MAKEDEPEND) $(CFLAG) $(INCLUDES) $(DEPFLAG) $(PROGS) $(LIBSRC)
+	$(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
 
 dclean:
 	$(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
 	mv -f Makefile.new $(MAKEFILE)
 
 clean:
-	rm -f asm/co86unix.cpp asm/bn86unix.cpp *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff bn_asm.s
+	rm -f asm/co86unix.cpp asm/bn86unix.cpp asm/*-elf.* *.o */*.o *.obj lib tags core .pure .nfs* *.old *.bak fluff bn_asm.s
 
 # DO NOT DELETE THIS LINE -- make depend depends on it.
 
diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S
index ae56066310..7dfda85566 100644
--- a/src/lib/libcrypto/bn/asm/ia64.S
+++ b/src/lib/libcrypto/bn/asm/ia64.S
@@ -1,6 +1,6 @@
 .explicit
 .text
-.ident	"ia64.S, Version 1.1"
+.ident	"ia64.S, Version 2.0"
 .ident	"IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
 
 //
@@ -13,6 +13,35 @@
 // disclaimed.
 // ====================================================================
 //
+// Version 2.x is Itanium2 re-tune. Few words about how Itanium2 is
+// different from Itanium to this module viewpoint. Most notably, is it
+// "wider" than Itanium? Can you experience loop scalability as
+// discussed in commentary sections? Not really:-( Itanium2 has 6
+// integer ALU ports, i.e. it's 2 ports wider, but it's not enough to
+// spin twice as fast, as I need 8 IALU ports. Amount of floating point
+// ports is the same, i.e. 2, while I need 4. In other words, to this
+// module Itanium2 remains effectively as "wide" as Itanium. Yet it's
+// essentially different in respect to this module, and a re-tune was
+// required. Well, because some instruction latencies have changed. Most
+// noticeably those intensively used:
+//
+//			Itanium	Itanium2
+//	ldf8		9	6		L2 hit
+//	ld8		2	1		L1 hit
+//	getf		2	5
+//	xma[->getf]	7[+1]	4[+0]
+//	add[->st8]	1[+1]	1[+0]
+//
+// What does it mean? You might ratiocinate that the original code
+// should run just faster... Because sum of latencies is smaller...
+// Wrong! Note that getf latency increased. This means that if a loop is
+// scheduled for lower latency (and they are), then it will suffer from
+// stall condition and the code will therefore turn anti-scalable, e.g.
+// original bn_mul_words spun at 5*n or 2.5 times slower than expected
+// on Itanium2! What to do? Reschedule loops for Itanium2? But then
+// Itanium would exhibit anti-scalability. So I've chosen to reschedule
+// for worst latency for every instruction aiming for best *all-round*
+// performance.  
 
 // Q.	How much faster does it get?
 // A.	Here is the output from 'openssl speed rsa dsa' for vanilla
@@ -149,12 +178,27 @@ bn_add_words:
 	brp.loop.imp	.L_bn_add_words_ctop,.L_bn_add_words_cend-16
 					}
 	.body
-{ .mib;	mov		r14=r32			// rp
+{ .mib;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+	addp4		r14=0,r32		// rp
+#else
+	mov		r14=r32			// rp
+#endif
 	mov		r9=pr		};;
-{ .mii;	mov		r15=r33			// ap
+{ .mii;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+	addp4		r15=0,r33		// ap
+#else
+	mov		r15=r33			// ap
+#endif
 	mov		ar.lc=r10
 	mov		ar.ec=6		}
-{ .mib;	mov		r16=r34			// bp
+{ .mib;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+	addp4		r16=0,r34		// bp
+#else
+	mov		r16=r34			// bp
+#endif
 	mov		pr.rot=1<<16	};;
 
 .L_bn_add_words_ctop:
@@ -174,7 +218,7 @@ bn_add_words:
 
 { .mii;
 (p59)	add		r8=1,r8		// return value
-	mov		pr=r9,-1
+	mov		pr=r9,0x1ffff
 	mov		ar.lc=r3	}
 { .mbb;	nop.b		0x0
 	br.ret.sptk.many	b0	};;
@@ -202,12 +246,27 @@ bn_sub_words:
 	brp.loop.imp	.L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
 					}
 	.body
-{ .mib;	mov		r14=r32			// rp
+{ .mib;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+	addp4		r14=0,r32		// rp
+#else
+	mov		r14=r32			// rp
+#endif
 	mov		r9=pr		};;
-{ .mii;	mov		r15=r33			// ap
+{ .mii;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+	addp4		r15=0,r33		// ap
+#else
+	mov		r15=r33			// ap
+#endif
 	mov		ar.lc=r10
 	mov		ar.ec=6		}
-{ .mib;	mov		r16=r34			// bp
+{ .mib;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+	addp4		r16=0,r34		// bp
+#else
+	mov		r16=r34			// bp
+#endif
 	mov		pr.rot=1<<16	};;
 
 .L_bn_sub_words_ctop:
@@ -227,7 +286,7 @@ bn_sub_words:
 
 { .mii;
 (p59)	add		r8=1,r8		// return value
-	mov		pr=r9,-1
+	mov		pr=r9,0x1ffff
 	mov		ar.lc=r3	}
 { .mbb;	nop.b		0x0
 	br.ret.sptk.many	b0	};;
@@ -253,7 +312,7 @@ bn_mul_words:
 #ifdef XMA_TEMPTATION
 { .mfi;	alloc		r2=ar.pfs,4,0,0,0	};;
 #else
-{ .mfi;	alloc		r2=ar.pfs,4,4,0,8	};;
+{ .mfi;	alloc		r2=ar.pfs,4,12,0,16	};;
 #endif
 { .mib;	mov		r8=r0			// return value
 	cmp4.le		p6,p0=r34,r0
@@ -266,24 +325,30 @@ bn_mul_words:
 
 	.body
 { .mib;	setf.sig	f8=r35	// w
-	mov		pr.rot=0x400001<<16
-			// ------^----- serves as (p48) at first (p26)
+	mov		pr.rot=0x800001<<16
+			// ------^----- serves as (p50) at first (p27)
 	brp.loop.imp	.L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
 					}
 
 #ifndef XMA_TEMPTATION
 
-{ .mii;	mov		r14=r32	// rp
-	mov		r15=r33	// ap
+{ .mii;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+	addp4		r14=0,r32	// rp
+	addp4		r15=0,r33	// ap
+#else
+	mov		r14=r32		// rp
+	mov		r15=r33		// ap
+#endif
 	mov		ar.lc=r10	}
-{ .mii;	mov		r39=0	// serves as r33 at first (p26)
-	mov		ar.ec=12	};;
+{ .mii;	mov		r40=0	// serves as r35 at first (p27)
+	mov		ar.ec=13	};;
 
-// This loop spins in 2*(n+11) ticks. It's scheduled for data in L2
-// cache (i.e. 9 ticks away) as floating point load/store instructions
+// This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
+// L2 cache (i.e. 9 ticks away) as floating point load/store instructions
 // bypass L1 cache and L2 latency is actually best-case scenario for
-// ldf8. The loop is not scalable and shall run in 2*(n+11) even on
-// "wider" IA-64 implementations. It's a trade-off here. n+22 loop
+// ldf8. The loop is not scalable and shall run in 2*(n+12) even on
+// "wider" IA-64 implementations. It's a trade-off here. n+24 loop
 // would give us ~5% in *overall* performance improvement on "wider"
 // IA-64, but would hurt Itanium for about same because of longer
 // epilogue. As it's a matter of few percents in either case I've
@@ -291,25 +356,25 @@ bn_mul_words:
 // this very instruction sequence in bn_mul_add_words loop which in
 // turn is scalable).
 .L_bn_mul_words_ctop:
-{ .mfi;	(p25)	getf.sig	r36=f49			// low
-	(p21)	xmpy.lu		f45=f37,f8
-	(p27)	cmp.ltu		p52,p48=r39,r38	}
+{ .mfi;	(p25)	getf.sig	r36=f52			// low
+	(p21)	xmpy.lu		f48=f37,f8
+	(p28)	cmp.ltu		p54,p50=r41,r39	}
 { .mfi;	(p16)	ldf8		f32=[r15],8
-	(p21)	xmpy.hu		f38=f37,f8
+	(p21)	xmpy.hu		f40=f37,f8
 	(p0)	nop.i		0x0		};;
-{ .mii;	(p26)	getf.sig	r32=f43			// high
-	.pred.rel	"mutex",p48,p52
-	(p48)	add		r38=r37,r33		// (p26)
-	(p52)	add		r38=r37,r33,1	}	// (p26)
-{ .mfb;	(p27)	st8		[r14]=r39,8
+{ .mii;	(p25)	getf.sig	r32=f44			// high
+	.pred.rel	"mutex",p50,p54
+	(p50)	add		r40=r38,r35		// (p27)
+	(p54)	add		r40=r38,r35,1	}	// (p27)
+{ .mfb;	(p28)	st8		[r14]=r41,8
 	(p0)	nop.f		0x0
 	br.ctop.sptk	.L_bn_mul_words_ctop	};;
 .L_bn_mul_words_cend:
 
 { .mii;	nop.m		0x0
-.pred.rel	"mutex",p49,p53
-(p49)	add		r8=r34,r0
-(p53)	add		r8=r34,r0,1	}
+.pred.rel	"mutex",p51,p55
+(p51)	add		r8=r36,r0
+(p55)	add		r8=r36,r0,1	}
 { .mfb;	nop.m	0x0
 	nop.f	0x0
 	nop.b	0x0			}
@@ -344,7 +409,7 @@ bn_mul_words:
 #endif	// XMA_TEMPTATION
 
 { .mii;	nop.m		0x0
-	mov		pr=r9,-1
+	mov		pr=r9,0x1ffff
 	mov		ar.lc=r3	}
 { .mfb;	rum		1<<5		// clear um.mfh
 	nop.f		0x0
@@ -376,59 +441,69 @@ bn_mul_add_words:
 
 	.body
 { .mib;	setf.sig	f8=r35	// w
-	mov		pr.rot=0x400001<<16
-			// ------^----- serves as (p48) at first (p26)
+	mov		pr.rot=0x800001<<16
+			// ------^----- serves as (p50) at first (p27)
 	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
 					}
-{ .mii;	mov		r14=r32	// rp
-	mov		r15=r33	// ap
+{ .mii;
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+	addp4		r14=0,r32	// rp
+	addp4		r15=0,r33	// ap
+#else
+	mov		r14=r32		// rp
+	mov		r15=r33		// ap
+#endif
 	mov		ar.lc=r10	}
-{ .mii;	mov		r39=0	// serves as r33 at first (p26)
-	mov		r18=r32	// rp copy
-	mov		ar.ec=14	};;
+{ .mii;	mov		r40=0	// serves as r35 at first (p27)
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+	addp4		r18=0,r32	// rp copy
+#else
+	mov		r18=r32		// rp copy
+#endif
+	mov		ar.ec=15	};;
 
-// This loop spins in 3*(n+13) ticks on Itanium and should spin in
-// 2*(n+13) on "wider" IA-64 implementations (to be verified with new
+// This loop spins in 3*(n+14) ticks on Itanium and should spin in
+// 2*(n+14) on "wider" IA-64 implementations (to be verified with new
 // µ-architecture manuals as they become available). As usual it's
 // possible to compress the epilogue, down to 10 in this case, at the
 // cost of scalability. Compressed (and therefore non-scalable) loop
-// running at 3*(n+10) would buy you ~10% on Itanium but take ~35%
+// running at 3*(n+11) would buy you ~10% on Itanium but take ~35%
 // from "wider" IA-64 so let it be scalable! Special attention was
 // paid for having the loop body split at 64-byte boundary. ld8 is
 // scheduled for L1 cache as the data is more than likely there.
 // Indeed, bn_mul_words has put it there a moment ago:-)
 .L_bn_mul_add_words_ctop:
-{ .mfi;	(p25)	getf.sig	r36=f49			// low
-	(p21)	xmpy.lu		f45=f37,f8
-	(p27)	cmp.ltu		p52,p48=r39,r38	}
+{ .mfi;	(p25)	getf.sig	r36=f52			// low
+	(p21)	xmpy.lu		f48=f37,f8
+	(p28)	cmp.ltu		p54,p50=r41,r39	}
 { .mfi;	(p16)	ldf8		f32=[r15],8
-	(p21)	xmpy.hu		f38=f37,f8
-	(p27)	add		r43=r43,r39	};;
-{ .mii;	(p26)	getf.sig	r32=f43			// high
-	.pred.rel	"mutex",p48,p52
-	(p48)	add		r38=r37,r33		// (p26)
-	(p52)	add		r38=r37,r33,1	}	// (p26)
-{ .mfb;	(p27)	cmp.ltu.unc	p56,p0=r43,r39
+	(p21)	xmpy.hu		f40=f37,f8
+	(p28)	add		r45=r45,r41	};;
+{ .mii;	(p25)	getf.sig	r32=f44			// high
+	.pred.rel	"mutex",p50,p54
+	(p50)	add		r40=r38,r35		// (p27)
+	(p54)	add		r40=r38,r35,1	}	// (p27)
+{ .mfb;	(p28)	cmp.ltu.unc	p60,p0=r45,r41
 	(p0)	nop.f		0x0
 	(p0)	nop.b		0x0		}
-{ .mii;	(p26)	ld8		r42=[r18],8
-	(p58)	cmp.eq.or	p57,p0=-1,r44
-	(p58)	add		r44=1,r44	}
-{ .mfb;	(p29)	st8		[r14]=r45,8
+{ .mii;	(p27)	ld8		r44=[r18],8
+	(p62)	cmp.eq.or	p61,p0=-1,r46
+	(p62)	add		r46=1,r46	}
+{ .mfb;	(p30)	st8		[r14]=r47,8
 	(p0)	nop.f		0x0
 	br.ctop.sptk	.L_bn_mul_add_words_ctop};;
 .L_bn_mul_add_words_cend:
 
 { .mii;	nop.m		0x0
-.pred.rel	"mutex",p51,p55
-(p51)	add		r8=r36,r0
-(p55)	add		r8=r36,r0,1	}
+.pred.rel	"mutex",p53,p57
+(p53)	add		r8=r38,r0
+(p57)	add		r8=r38,r0,1	}
 { .mfb;	nop.m	0x0
 	nop.f	0x0
 	nop.b	0x0			};;
 { .mii;
-(p59)	add		r8=1,r8
-	mov		pr=r9,-1
+(p63)	add		r8=1,r8
+	mov		pr=r9,0x1ffff
 	mov		ar.lc=r3	}
 { .mfb;	rum		1<<5		// clear um.mfh
 	nop.f		0x0
@@ -461,6 +536,10 @@ bn_sqr_words:
 	mov	r9=pr			};;
 
 	.body
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+{ .mii; addp4		r32=0,r32
+	addp4		r33=0,r33	};;
+#endif
 { .mib;
 	mov		pr.rot=1<<16
 	brp.loop.imp	.L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
@@ -492,7 +571,7 @@ bn_sqr_words:
 .L_bn_sqr_words_cend:
 
 { .mii;	nop.m		0x0
-	mov		pr=r9,-1
+	mov		pr=r9,0x1ffff
 	mov		ar.lc=r3	}
 { .mfb;	rum		1<<5		// clear um.mfh
 	nop.f		0x0
@@ -526,7 +605,14 @@ bn_sqr_comba8:
 	.prologue
 	.fframe	0
 	.save	ar.pfs,r2
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
 { .mii;	alloc	r2=ar.pfs,2,1,0,0
+	addp4	r33=0,r33
+	addp4	r32=0,r32		};;
+{ .mii;
+#else
+{ .mii;	alloc	r2=ar.pfs,2,1,0,0
+#endif
 	mov	r34=r33
 	add	r14=8,r33		};;
 	.body
@@ -587,7 +673,14 @@ bn_mul_comba8:
 	.prologue
 	.fframe	0
 	.save	ar.pfs,r2
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
 { .mii;	alloc	r2=ar.pfs,3,0,0,0
+	addp4	r33=0,r33
+	addp4	r34=0,r34		};;
+{ .mii;	addp4	r32=0,r32
+#else
+{ .mii;	alloc   r2=ar.pfs,3,0,0,0
+#endif
 	add	r14=8,r33
 	add	r17=8,r34		}
 	.body
@@ -1138,7 +1231,14 @@ bn_sqr_comba4:
 	.prologue
 	.fframe	0
 	.save	ar.pfs,r2
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+{ .mii;	alloc   r2=ar.pfs,2,1,0,0
+	addp4	r32=0,r32
+	addp4	r33=0,r33		};;
+{ .mii;
+#else
 { .mii;	alloc	r2=ar.pfs,2,1,0,0
+#endif
 	mov	r34=r33
 	add	r14=8,r33		};;
 	.body
@@ -1164,7 +1264,14 @@ bn_mul_comba4:
 	.prologue
 	.fframe	0
 	.save	ar.pfs,r2
+#if defined(_HPUX_SOURCE) && defined(_ILP32)
+{ .mii;	alloc   r2=ar.pfs,3,0,0,0
+	addp4	r33=0,r33
+	addp4	r34=0,r34		};;
+{ .mii;	addp4	r32=0,r32
+#else
 { .mii;	alloc	r2=ar.pfs,3,0,0,0
+#endif
 	add	r14=8,r33
 	add	r17=8,r34		}
 	.body
@@ -1464,7 +1571,7 @@ bn_div_words:
 	or	r8=r8,r33
 	mov	ar.pfs=r2		};;
 { .mii;	shr.u	r9=H,I			// remainder if anybody wants it
-	mov	pr=r10,-1		}
+	mov	pr=r10,0x1ffff		}
 { .mfb;	br.ret.sptk.many	b0	};;
 
 // Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
diff --git a/src/lib/libcrypto/bn/asm/pa-risc2.s b/src/lib/libcrypto/bn/asm/pa-risc2.s
index af9730d062..f3b16290eb 100644
--- a/src/lib/libcrypto/bn/asm/pa-risc2.s
+++ b/src/lib/libcrypto/bn/asm/pa-risc2.s
@@ -747,8 +747,8 @@ bn_div_words
 	.PROC
 	.EXPORT	bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
 	.IMPORT	BN_num_bits_word,CODE
-	.IMPORT	__iob,DATA
-	.IMPORT	fprintf,CODE
+	;--- not PIC	.IMPORT	__iob,DATA
+	;--- not PIC	.IMPORT	fprintf,CODE
 	.IMPORT	abort,CODE
 	.IMPORT	$$div2U,MILLICODE
 	.CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
@@ -844,12 +844,12 @@ $0006001A
         MOVIB,TR        2,%r8,$0006001C ;offset 0xa18
         EXTRD,U %r3,63,32,%r7   ;offset 0xa1c
 $D2
-        ADDIL   LR'__iob-$global$,%r27,%r1      ;offset 0xa20
-        LDIL    LR'C$7,%r21     ;offset 0xa24
-        LDO     RR'__iob-$global$+32(%r1),%r26  ;offset 0xa28
-        .CALL   ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR    ;in=24,25,26;out=28;
-        B,L     fprintf,%r2     ;offset 0xa2c
-        LDO     RR'C$7(%r21),%r25       ;offset 0xa30
+        ;--- not PIC	ADDIL   LR'__iob-$global$,%r27,%r1      ;offset 0xa20
+        ;--- not PIC	LDIL    LR'C$7,%r21     ;offset 0xa24
+        ;--- not PIC	LDO     RR'__iob-$global$+32(%r1),%r26  ;offset 0xa28
+        ;--- not PIC	.CALL   ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR    ;in=24,25,26;out=28;
+        ;--- not PIC	B,L     fprintf,%r2     ;offset 0xa2c
+        ;--- not PIC	LDO     RR'C$7(%r21),%r25       ;offset 0xa30
         .CALL           ;
         B,L     abort,%r2       ;offset 0xa34
         NOP             ;offset 0xa38
@@ -1605,14 +1605,14 @@ bn_mul_comba4
 	.PROCEND	
 
 
-	.SPACE	$TEXT$
-	.SUBSPA	$CODE$
-	.SPACE	$PRIVATE$,SORT=16
-	.IMPORT	$global$,DATA
-	.SPACE	$TEXT$
-	.SUBSPA	$CODE$
-	.SUBSPA	$LIT$,ACCESS=0x2c
-C$7
-	.ALIGN	8
-	.STRINGZ	"Division would overflow (%d)\n"
+;--- not PIC	.SPACE	$TEXT$
+;--- not PIC	.SUBSPA	$CODE$
+;--- not PIC	.SPACE	$PRIVATE$,SORT=16
+;--- not PIC	.IMPORT	$global$,DATA
+;--- not PIC	.SPACE	$TEXT$
+;--- not PIC	.SUBSPA	$CODE$
+;--- not PIC	.SUBSPA	$LIT$,ACCESS=0x2c
+;--- not PIC	C$7
+;--- not PIC	.ALIGN	8
+;--- not PIC	.STRINGZ	"Division would overflow (%d)\n"
 	.END
diff --git a/src/lib/libcrypto/bn/asm/vms.mar b/src/lib/libcrypto/bn/asm/vms.mar
index 465f2774b6..aefab15cdb 100644
--- a/src/lib/libcrypto/bn/asm/vms.mar
+++ b/src/lib/libcrypto/bn/asm/vms.mar
@@ -1,4 +1,4 @@
-	.title	vax_bn_mul_add_word  unsigned multiply & add, 32*32+32+32=>64
+	.title	vax_bn_mul_add_words  unsigned multiply & add, 32*32+32+32=>64
 ;
 ; w.j.m. 15-jan-1999
 ;
@@ -59,7 +59,7 @@ w=16 ;(AP)	w	by value (input)
 	movl	r6,r0			; return c
 	ret
 
-	.title	vax_bn_mul_word  unsigned multiply & add, 32*32+32=>64
+	.title	vax_bn_mul_words  unsigned multiply & add, 32*32+32=>64
 ;
 ; w.j.m. 15-jan-1999
 ;
@@ -172,147 +172,175 @@ n=12 ;(AP)	n	by value (input)
 ; }
 ;
 ; Using EDIV would be very easy, if it didn't do signed calculations.
-; Therefore, som extra things have to happen around it.  The way to
-; handle that is to shift all operands right one step (basically dividing
-; them by 2) and handle the different cases depending on what the lowest
-; bit of each operand was.
+; Any time any of the input numbers are signed, there are problems,
+; usually with integer overflow, at which point it returns useless
+; data (the quotient gets the value of l, and the remainder becomes 0).
 ;
-; To start with, let's define the following:
+; If it was just for the dividend, it would be very easy, just divide
+; it by 2 (unsigned), do the division, multiply the resulting quotient
+; and remainder by 2, add the bit that was dropped when dividing by 2
+; to the remainder, and do some adjustment so the remainder doesn't
+; end up larger than the divisor.  For some cases when the divisor is
+; negative (from EDIV's point of view, i.e. when the highest bit is set),
+; dividing the dividend by 2 isn't enough, and since some operations
+; might generate integer overflows even when the dividend is divided by
+; 4 (when the high part of the shifted down dividend ends up being exactly
+; half of the divisor, the result is the quotient 0x80000000, which is
+; negative...) it needs to be divided by 8.  Furthermore, the divisor needs
+; to be divided by 2 (unsigned) as well, to avoid more problems with the sign.
+; In this case, a little extra fiddling with the remainder is required.
 ;
-; a' = l & 1
-; a2 = <h,l> >> 1	# UNSIGNED shift!
-; b' = d & 1
-; b2 = d >> 1		# UNSIGNED shift!
+; So, the simplest way to handle this is always to divide the dividend
+; by 8, and to divide the divisor by 2 if its highest bit is set.
+; After EDIV has been used, the quotient gets multiplied by 8 if the
+; original divisor was positive, otherwise 4.  The remainder, oddly
+; enough, is *always* multiplied by 8.
+; NOTE: in the case mentioned above, where the high part of the shifted
+; down dividend ends up being exactly half the shifted down divisor, we
+; end up with a 33 bit quotient.  That's no problem however, it usually
+; means we have ended up with a too large remainder as well, and the
+; problem is fixed by the last part of the algorithm (next paragraph).
 ;
-; Now, use EDIV to calculate a quotient and a remainder:
+; The routine ends with comparing the resulting remainder with the
+; original divisor and if the remainder is larger, subtract the
+; original divisor from it, and increase the quotient by 1.  This is
+; done until the remainder is smaller than the divisor.
 ;
-; q'' = a2/b2
-; r'' = a2 - q''*b2
+; The complete algorithm looks like this:
 ;
-; If b' is 0, the quotient is already correct, we just need to adjust the
-; remainder:
+; d'    = d
+; l'    = l & 7
+; [h,l] = [h,l] >> 3
+; [q,r] = floor([h,l] / d)	# This is the EDIV operation
+; if (q < 0) q = -q		# I doubt this is necessary any more
 ;
-; if (b' == 0)
-;   {
-;     r = 2*r'' + a'
-;     q = q''
-;   }
-;
-; If b' is 1, we need to do other adjustements.  The first thought is the
-; following (note that r' will not always have the right value, but an
-; adjustement follows further down):
-;
-; if (b' == 1)
-;   {
-;     q' = q''
-;     r' = a - q'*b
-;
-; However, one can note the folowing relationship:
-;
-;                         r'' = a2 - q''*b2
-;                  =>   2*r'' = 2*a2 - 2*q''*b2
-;                             = { a = 2*a2 + a', b = 2*b2 + b' = 2*b2 + 1,
-;                                 q' = q'' }
-;                             = a - a' - q'*(b - 1)
-;                             = a - q'*b - a' + q'
-;                             = r' - a' + q'
-;                  =>     r'  = 2*r'' - q' + a'
+; r'    = r >> 29
+; if (d' >= 0)
+;   q'  = q >> 29
+;   q   = q << 3
+; else
+;   q'  = q >> 30
+;   q   = q << 2
+; r     = (r << 3) + l'
 ;
-; This enables us to use r'' instead of discarding and calculating another
-; modulo:
-;
-; if (b' == 1)
+; if (d' < 0)
 ;   {
-;     q' = q''
-;     r' = (r'' << 1) - q' + a'
-;
-; Now, all we have to do is adjust r', because it might be < 0:
-;
-;     while (r' < 0)
+;     [r',r] = [r',r] - q
+;     while ([r',r] < 0)
 ;       {
-;         r' = r' + b
-;         q' = q' - 1
+;         [r',r] = [r',r] + d
+;         [q',q] = [q',q] - 1
 ;       }
 ;   }
 ;
-; return q'
+; while ([r',r] >= d')
+;   {
+;     [r',r] = [r',r] - d'
+;     [q',q] = [q',q] + 1
+;   }
+;
+; return q
 
 h=4 ;(AP)	h	by value (input)
 l=8 ;(AP)	l	by value (input)
 d=12 ;(AP)	d	by value (input)
 
-;aprim=r5
-;a2=r6
-;a20=r6
-;a21=r7
-;bprim=r8
-;b2=r9
-;qprim=r10	; initially used as q''
-;rprim=r11	; initially used as r''
-
+;r2 = l, q
+;r3 = h, r
+;r4 = d
+;r5 = l'
+;r6 = r'
+;r7 = d'
+;r8 = q'
 
 	.psect	code,nowrt
 
-.entry	bn_div_words,^m<r2,r3,r4,r5,r6,r7,r8,r9,r10,r11>
+.entry	bn_div_words,^m<r2,r3,r4,r5,r6,r7,r8>
 	movl	l(ap),r2
 	movl	h(ap),r3
 	movl	d(ap),r4
 
-	movl	#0,r5
-	movl	#0,r8
-	movl	#0,r0
-;	movl	#0,r1
+	bicl3	#^XFFFFFFF8,r2,r5 ; l' = l & 7
+	bicl3	#^X00000007,r2,r2
 
-	rotl	#-1,r2,r6	; a20 = l >> 1 (almost)
-	rotl	#-1,r3,r7	; a21 = h >> 1 (almost)
-	rotl	#-1,r4,r9	; b2 = d >> 1 (almost)
+	bicl3	#^XFFFFFFF8,r3,r6
+	bicl3	#^X00000007,r3,r3
+        
+	addl	r6,r2
 
-	tstl	r6
-	bgeq	1$
-	xorl2	#^X80000000,r6	; fixup a20 so highest bit is 0
-	incl	r5		; a' = 1
-1$:
-	tstl	r7
-	bgeq	2$
-	xorl2	#^X80000000,r6	; fixup a20 so highest bit is 1,
-				; since that's what was lowest in a21
-	xorl2	#^X80000000,r7	; fixup a21 so highest bit is 1
-2$:
-	tstl	r9
+	rotl	#-3,r2,r2	; l = l >> 3
+	rotl	#-3,r3,r3	; h = h >> 3
+                
+	movl	r4,r7		; d' = d
+
+	movl	#0,r6		; r' = 0
+	movl	#0,r8		; q' = 0
+
+	tstl	r4
 	beql	666$		; Uh-oh, the divisor is 0...
-	bgtr	3$
-	xorl2	#^X80000000,r9	; fixup b2 so highest bit is 0
-	incl	r8		; b' = 1
-3$:
-	tstl	r9
-	bneq	4$		; if b2 is 0, we know that b' is 1
-	tstl	r3
-	bneq	666$		; if higher half isn't 0, we overflow
-	movl	r2,r10		; otherwise, we have our result
-	brb	42$		; This is a success, really.
-4$:
-	ediv	r9,r6,r10,r11
-
-	tstl	r8
-	bneq	5$		; If b' != 0, go to the other part
-;	addl3	r11,r11,r1
-;	addl2	r5,r1
-	brb	42$
-5$:
-	ashl	#1,r11,r11
-	subl2	r10,r11
-	addl2	r5,r11
-	bgeq	7$
-6$:
-	decl	r10
-	addl2	r4,r11
-	blss	6$
-7$:
-;	movl	r11,r1
+	bgtr	1$
+	rotl	#-1,r4,r4	; If d is negative, shift it right.
+	bicl2	#^X80000000,r4	; Since d is then a large number, the
+				; lowest bit is insignificant
+				; (contradict that, and I'll fix the problem!)
+1$:     
+	ediv	r4,r2,r2,r3	; Do the actual division
+
+	tstl	r2
+	bgeq	3$
+	mnegl	r2,r2		; if q < 0, negate it
+3$:     
+	tstl	r7
+	blss	4$
+	rotl	#3,r2,r2	;   q = q << 3
+	bicl3	#^XFFFFFFF8,r2,r8 ;    q' gets the high bits from q
+	bicl3	#^X00000007,r2,r2
+	bsb	41$
+4$:				; else
+	rotl	#2,r2,r2	;   q = q << 2
+	bicl3	#^XFFFFFFFC,r2,r8 ;   q' gets the high bits from q
+	bicl3	#^X00000003,r2,r2
+41$:
+	rotl	#3,r3,r3	; r = r << 3
+	bicl3	#^XFFFFFFF8,r3,r6 ; r' gets the high bits from r
+	bicl3	#^X00000007,r3,r3
+	addl	r5,r3		; r = r + l'
+
+	tstl	r7
+	bgeq	5$
+	bitl	#1,r7
+	beql	5$		; if d' < 0 && d' & 1
+	subl	r2,r3		;   [r',r] = [r',r] - [q',q]
+	sbwc	r8,r6
+45$:
+	bgeq	5$		;   while r < 0
+	decl	r2		;     [q',q] = [q',q] - 1
+	sbwc	#0,r8
+	addl	r7,r3		;     [r',r] = [r',r] + d'
+	adwc	#0,r6
+	brb	45$
+
+; The return points are placed in the middle to keep a short distance from
+; all the branch points
 42$:
-	movl	r10,r0
+;	movl	r3,r1
+	movl	r2,r0
+	ret
 666$:
+	movl	#^XFFFFFFFF,r0
 	ret
+
+5$:
+	tstl	r6
+	bneq	6$
+	cmpl	r3,r7
+	blssu	42$		; while [r',r] >= d'
+6$:
+	subl	r7,r3		;   [r',r] = [r',r] - d'
+	sbwc	#0,r6
+	incl	r2		;   [q',q] = [q',q] + 1
+	adwc	#0,r8
+	brb	5$	
 
 	.title	vax_bn_add_words  unsigned add of two arrays
 ;
diff --git a/src/lib/libcrypto/bn/bn.h b/src/lib/libcrypto/bn/bn.h
index b40682f831..3da6d8ced9 100644
--- a/src/lib/libcrypto/bn/bn.h
+++ b/src/lib/libcrypto/bn/bn.h
@@ -248,6 +248,8 @@ typedef struct bn_blinding_st
 	BIGNUM *A;
 	BIGNUM *Ai;
 	BIGNUM *mod; /* just a reference */
+	unsigned long thread_id; /* added in OpenSSL 0.9.6j and 0.9.7b;
+				  * used only by crypto/rsa/rsa_eay.c, rsa_lib.c */
 	} BN_BLINDING;
 
 /* Used for montgomery multiplication */
diff --git a/src/lib/libcrypto/bn/bn_div.c b/src/lib/libcrypto/bn/bn_div.c
index f9a095e3b3..580d1201bc 100644
--- a/src/lib/libcrypto/bn/bn_div.c
+++ b/src/lib/libcrypto/bn/bn_div.c
@@ -150,6 +150,20 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
 	    q;					\
 	})
 #  define REMAINDER_IS_ALREADY_CALCULATED
+#  elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG)
+   /*
+    * Same story here, but it's 128-bit by 64-bit division. Wow!
+    *					<appro@fy.chalmers.se>
+    */
+#  define bn_div_words(n0,n1,d0)		\
+	({  asm volatile (			\
+		"divq	%4"			\
+		: "=a"(q), "=d"(rem)		\
+		: "a"(n1), "d"(n0), "g"(d0)	\
+		: "cc");			\
+	    q;					\
+	})
+#  define REMAINDER_IS_ALREADY_CALCULATED
 #  endif /* __<cpu> */
 # endif /* __GNUC__ */
 #endif /* OPENSSL_NO_ASM */
@@ -268,6 +282,11 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 			q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0);
 #else
 			q=bn_div_words(n0,n1,d0);
+#ifdef BN_DEBUG_LEVITTE
+			fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
+X) -> 0x%08X\n",
+				n0, n1, d0, q);
+#endif
 #endif
 
 #ifndef REMAINDER_IS_ALREADY_CALCULATED
@@ -292,11 +311,18 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 			BN_ULONG t2l,t2h,ql,qh;
 
 			q=bn_div_words(n0,n1,d0);
+#ifdef BN_DEBUG_LEVITTE
+			fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
+X) -> 0x%08X\n",
+				n0, n1, d0, q);
+#endif
 #ifndef REMAINDER_IS_ALREADY_CALCULATED
 			rem=(n1-q*d0)&BN_MASK2;
 #endif
 
-#ifdef BN_UMULT_HIGH
+#if defined(BN_UMULT_LOHI)
+			BN_UMULT_LOHI(t2l,t2h,d1,q);
+#elif defined(BN_UMULT_HIGH)
 			t2l = d1 * q;
 			t2h = BN_UMULT_HIGH(d1,q);
 #else
diff --git a/src/lib/libcrypto/bn/bn_lcl.h b/src/lib/libcrypto/bn/bn_lcl.h
index 8a4dba375a..5614bc6164 100644
--- a/src/lib/libcrypto/bn/bn_lcl.h
+++ b/src/lib/libcrypto/bn/bn_lcl.h
@@ -230,6 +230,21 @@ struct bignum_ctx
 	     : "r"(a), "r"(b));		\
 	ret;			})
 #  endif	/* compiler */
+# elif defined(__x86_64) && defined(SIXTY_FOUR_BIT_LONG)
+#  if defined(__GNUC__)
+#   define BN_UMULT_HIGH(a,b)	({	\
+	register BN_ULONG ret,discard;	\
+	asm ("mulq	%3"		\
+	     : "=a"(discard),"=d"(ret)	\
+	     : "a"(a), "g"(b)		\
+	     : "cc");			\
+	ret;			})
+#   define BN_UMULT_LOHI(low,high,a,b)	\
+	asm ("mulq	%3"		\
+		: "=a"(low),"=d"(high)	\
+		: "a"(a),"g"(b)		\
+		: "cc");
+#  endif
 # endif		/* cpu */
 #endif		/* OPENSSL_NO_ASM */
 
@@ -337,7 +352,7 @@ struct bignum_ctx
 
 #define LBITS(a)	((a)&BN_MASK2l)
 #define HBITS(a)	(((a)>>BN_BITS4)&BN_MASK2l)
-#define	L2HBITS(a)	((BN_ULONG)((a)&BN_MASK2l)<<BN_BITS4)
+#define	L2HBITS(a)	(((a)<<BN_BITS4)&BN_MASK2)
 
 #define LLBITS(a)	((a)&BN_MASKl)
 #define LHBITS(a)	(((a)>>BN_BITS2)&BN_MASKl)
@@ -353,7 +368,7 @@ struct bignum_ctx
 	lt=(bl)*(lt); \
 	m1=(bl)*(ht); \
 	ht =(bh)*(ht); \
-	m=(m+m1)&BN_MASK2; if (m < m1) ht+=L2HBITS(1L); \
+	m=(m+m1)&BN_MASK2; if (m < m1) ht+=L2HBITS((BN_ULONG)1); \
 	ht+=HBITS(m); \
 	m1=L2HBITS(m); \
 	lt=(lt+m1)&BN_MASK2; if (lt < m1) ht++; \
@@ -418,20 +433,19 @@ void bn_sqr_comba4(BN_ULONG *r,const BN_ULONG *a);
 int bn_cmp_words(const BN_ULONG *a,const BN_ULONG *b,int n);
 int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b,
 	int cl, int dl);
+#if 0
+/* bn_mul.c rollback <appro> */
 void bn_mul_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,int n2,
 	int dna,int dnb,BN_ULONG *t);
 void bn_mul_part_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,
 	int n,int tna,int tnb,BN_ULONG *t);
+#endif
 void bn_sqr_recursive(BN_ULONG *r,const BN_ULONG *a, int n2, BN_ULONG *t);
 void bn_mul_low_normal(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b, int n);
 void bn_mul_low_recursive(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,int n2,
 	BN_ULONG *t);
 void bn_mul_high(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b,BN_ULONG *l,int n2,
 	BN_ULONG *t);
-BN_ULONG bn_add_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
-	int cl, int dl);
-BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
-	int cl, int dl);
 
 #ifdef  __cplusplus
 }
diff --git a/src/lib/libcrypto/bn/bn_lib.c b/src/lib/libcrypto/bn/bn_lib.c
index ce2ae78419..463463cfcb 100644
--- a/src/lib/libcrypto/bn/bn_lib.c
+++ b/src/lib/libcrypto/bn/bn_lib.c
@@ -263,12 +263,12 @@ void BN_clear_free(BIGNUM *a)
 	if (a == NULL) return;
 	if (a->d != NULL)
 		{
-		memset(a->d,0,a->dmax*sizeof(a->d[0]));
+		OPENSSL_cleanse(a->d,a->dmax*sizeof(a->d[0]));
 		if (!(BN_get_flags(a,BN_FLG_STATIC_DATA)))
 			OPENSSL_free(a->d);
 		}
 	i=BN_get_flags(a,BN_FLG_MALLOCED);
-	memset(a,0,sizeof(BIGNUM));
+	OPENSSL_cleanse(a,sizeof(BIGNUM));
 	if (i)
 		OPENSSL_free(a);
 	}
diff --git a/src/lib/libcrypto/bn/bn_mul.c b/src/lib/libcrypto/bn/bn_mul.c
index b03458d002..cb93ac3356 100644
--- a/src/lib/libcrypto/bn/bn_mul.c
+++ b/src/lib/libcrypto/bn/bn_mul.c
@@ -56,325 +56,10 @@
  * [including the GNU Public Licence.]
  */
 
-#ifndef BN_DEBUG
-# undef NDEBUG /* avoid conflicting definitions */
-# define NDEBUG
-#endif
-
 #include <stdio.h>
-#include <assert.h>
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-#if defined(OPENSSL_NO_ASM) || !(defined(__i386) || defined(__i386__)) || defined(__DJGPP__) /* Assembler implementation exists only for x86 */
-/* Here follows specialised variants of bn_add_words() and
-   bn_sub_words().  They have the property performing operations on
-   arrays of different sizes.  The sizes of those arrays is expressed through
-   cl, which is the common length ( basicall, min(len(a),len(b)) ), and dl,
-   which is the delta between the two lengths, calculated as len(a)-len(b).
-   All lengths are the number of BN_ULONGs...  For the operations that require
-   a result array as parameter, it must have the length cl+abs(dl).
-   These functions should probably end up in bn_asm.c as soon as there are
-   assembler counterparts for the systems that use assembler files.  */
-
-BN_ULONG bn_sub_part_words(BN_ULONG *r,
-	const BN_ULONG *a, const BN_ULONG *b,
-	int cl, int dl)
-	{
-	BN_ULONG c, t;
-
-	assert(cl >= 0);
-	c = bn_sub_words(r, a, b, cl);
-
-	if (dl == 0)
-		return c;
-
-	r += cl;
-	a += cl;
-	b += cl;
-
-	if (dl < 0)
-		{
-#ifdef BN_COUNT
-		fprintf(stderr, "  bn_sub_part_words %d + %d (dl < 0, c = %d)\n", cl, dl, c);
-#endif
-		for (;;)
-			{
-			t = b[0];
-			r[0] = (0-t-c)&BN_MASK2;
-			if (t != 0) c=1;
-			if (++dl >= 0) break;
-
-			t = b[1];
-			r[1] = (0-t-c)&BN_MASK2;
-			if (t != 0) c=1;
-			if (++dl >= 0) break;
-
-			t = b[2];
-			r[2] = (0-t-c)&BN_MASK2;
-			if (t != 0) c=1;
-			if (++dl >= 0) break;
-
-			t = b[3];
-			r[3] = (0-t-c)&BN_MASK2;
-			if (t != 0) c=1;
-			if (++dl >= 0) break;
-
-			b += 4;
-			r += 4;
-			}
-		}
-	else
-		{
-		int save_dl = dl;
-#ifdef BN_COUNT
-		fprintf(stderr, "  bn_sub_part_words %d + %d (dl > 0, c = %d)\n", cl, dl, c);
-#endif
-		while(c)
-			{
-			t = a[0];
-			r[0] = (t-c)&BN_MASK2;
-			if (t != 0) c=0;
-			if (--dl <= 0) break;
-
-			t = a[1];
-			r[1] = (t-c)&BN_MASK2;
-			if (t != 0) c=0;
-			if (--dl <= 0) break;
-
-			t = a[2];
-			r[2] = (t-c)&BN_MASK2;
-			if (t != 0) c=0;
-			if (--dl <= 0) break;
-
-			t = a[3];
-			r[3] = (t-c)&BN_MASK2;
-			if (t != 0) c=0;
-			if (--dl <= 0) break;
-
-			save_dl = dl;
-			a += 4;
-			r += 4;
-			}
-		if (dl > 0)
-			{
-#ifdef BN_COUNT
-			fprintf(stderr, "  bn_sub_part_words %d + %d (dl > 0, c == 0)\n", cl, dl);
-#endif
-			if (save_dl > dl)
-				{
-				switch (save_dl - dl)
-					{
-				case 1:
-					r[1] = a[1];
-					if (--dl <= 0) break;
-				case 2:
-					r[2] = a[2];
-					if (--dl <= 0) break;
-				case 3:
-					r[3] = a[3];
-					if (--dl <= 0) break;
-					}
-				a += 4;
-				r += 4;
-				}
-			}
-		if (dl > 0)
-			{
-#ifdef BN_COUNT
-			fprintf(stderr, "  bn_sub_part_words %d + %d (dl > 0, copy)\n", cl, dl);
-#endif
-			for(;;)
-				{
-				r[0] = a[0];
-				if (--dl <= 0) break;
-				r[1] = a[1];
-				if (--dl <= 0) break;
-				r[2] = a[2];
-				if (--dl <= 0) break;
-				r[3] = a[3];
-				if (--dl <= 0) break;
-
-				a += 4;
-				r += 4;
-				}
-			}
-		}
-	return c;
-	}
-#endif
-
-BN_ULONG bn_add_part_words(BN_ULONG *r,
-	const BN_ULONG *a, const BN_ULONG *b,
-	int cl, int dl)
-	{
-	BN_ULONG c, l, t;
-
-	assert(cl >= 0);
-	c = bn_add_words(r, a, b, cl);
-
-	if (dl == 0)
-		return c;
-
-	r += cl;
-	a += cl;
-	b += cl;
-
-	if (dl < 0)
-		{
-		int save_dl = dl;
-#ifdef BN_COUNT
-		fprintf(stderr, "  bn_add_part_words %d + %d (dl < 0, c = %d)\n", cl, dl, c);
-#endif
-		while (c)
-			{
-			l=(c+b[0])&BN_MASK2;
-			c=(l < c);
-			r[0]=l;
-			if (++dl >= 0) break;
-
-			l=(c+b[1])&BN_MASK2;
-			c=(l < c);
-			r[1]=l;
-			if (++dl >= 0) break;
-
-			l=(c+b[2])&BN_MASK2;
-			c=(l < c);
-			r[2]=l;
-			if (++dl >= 0) break;
-
-			l=(c+b[3])&BN_MASK2;
-			c=(l < c);
-			r[3]=l;
-			if (++dl >= 0) break;
-
-			save_dl = dl;
-			b+=4;
-			r+=4;
-			}
-		if (dl < 0)
-			{
-#ifdef BN_COUNT
-			fprintf(stderr, "  bn_add_part_words %d + %d (dl < 0, c == 0)\n", cl, dl);
-#endif
-			if (save_dl < dl)
-				{
-				switch (dl - save_dl)
-					{
-				case 1:
-					r[1] = b[1];
-					if (++dl >= 0) break;
-				case 2:
-					r[2] = b[2];
-					if (++dl >= 0) break;
-				case 3:
-					r[3] = b[3];
-					if (++dl >= 0) break;
-					}
-				b += 4;
-				r += 4;
-				}
-			}
-		if (dl < 0)
-			{
-#ifdef BN_COUNT
-			fprintf(stderr, "  bn_add_part_words %d + %d (dl < 0, copy)\n", cl, dl);
-#endif
-			for(;;)
-				{
-				r[0] = b[0];
-				if (++dl >= 0) break;
-				r[1] = b[1];
-				if (++dl >= 0) break;
-				r[2] = b[2];
-				if (++dl >= 0) break;
-				r[3] = b[3];
-				if (++dl >= 0) break;
-
-				b += 4;
-				r += 4;
-				}
-			}
-		}
-	else
-		{
-		int save_dl = dl;
-#ifdef BN_COUNT
-		fprintf(stderr, "  bn_add_part_words %d + %d (dl > 0)\n", cl, dl);
-#endif
-		while (c)
-			{
-			t=(a[0]+c)&BN_MASK2;
-			c=(t < c);
-			r[0]=t;
-			if (--dl <= 0) break;
-
-			t=(a[1]+c)&BN_MASK2;
-			c=(t < c);
-			r[1]=t;
-			if (--dl <= 0) break;
-
-			t=(a[2]+c)&BN_MASK2;
-			c=(t < c);
-			r[2]=t;
-			if (--dl <= 0) break;
-
-			t=(a[3]+c)&BN_MASK2;
-			c=(t < c);
-			r[3]=t;
-			if (--dl <= 0) break;
-
-			save_dl = dl;
-			a+=4;
-			r+=4;
-			}
-#ifdef BN_COUNT
-		fprintf(stderr, "  bn_add_part_words %d + %d (dl > 0, c == 0)\n", cl, dl);
-#endif
-		if (dl > 0)
-			{
-			if (save_dl > dl)
-				{
-				switch (save_dl - dl)
-					{
-				case 1:
-					r[1] = a[1];
-					if (--dl <= 0) break;
-				case 2:
-					r[2] = a[2];
-					if (--dl <= 0) break;
-				case 3:
-					r[3] = a[3];
-					if (--dl <= 0) break;
-					}
-				a += 4;
-				r += 4;
-				}
-			}
-		if (dl > 0)
-			{
-#ifdef BN_COUNT
-			fprintf(stderr, "  bn_add_part_words %d + %d (dl > 0, copy)\n", cl, dl);
-#endif
-			for(;;)
-				{
-				r[0] = a[0];
-				if (--dl <= 0) break;
-				r[1] = a[1];
-				if (--dl <= 0) break;
-				r[2] = a[2];
-				if (--dl <= 0) break;
-				r[3] = a[3];
-				if (--dl <= 0) break;
-
-				a += 4;
-				r += 4;
-				}
-			}
-		}
-	return c;
-	}
-
 #ifdef BN_RECURSION
 /* Karatsuba recursive multiplication algorithm
  * (cf. Knuth, The Art of Computer Programming, Vol. 2) */
@@ -390,15 +75,14 @@ BN_ULONG bn_add_part_words(BN_ULONG *r,
  * a[1]*b[1]
  */
 void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
-	int dna, int dnb, BN_ULONG *t)
+	     BN_ULONG *t)
 	{
 	int n=n2/2,c1,c2;
-	int tna=n+dna, tnb=n+dnb;
 	unsigned int neg,zero;
 	BN_ULONG ln,lo,*p;
 
 # ifdef BN_COUNT
-	fprintf(stderr," bn_mul_recursive %d * %d\n",n2,n2);
+	printf(" bn_mul_recursive %d * %d\n",n2,n2);
 # endif
 # ifdef BN_MUL_COMBA
 #  if 0
@@ -408,40 +92,34 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
 		return;
 		}
 #  endif
-	/* Only call bn_mul_comba 8 if n2 == 8 and the
-	 * two arrays are complete [steve]
-	 */
-	if (n2 == 8 && dna == 0 && dnb == 0)
+	if (n2 == 8)
 		{
 		bn_mul_comba8(r,a,b);
 		return; 
 		}
 # endif /* BN_MUL_COMBA */
-	/* Else do normal multiply */
 	if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL)
 		{
-		bn_mul_normal(r,a,n2+dna,b,n2+dnb);
-		if ((dna + dnb) < 0)
-			memset(&r[2*n2 + dna + dnb], 0,
-				sizeof(BN_ULONG) * -(dna + dnb));
+		/* This should not happen */
+		bn_mul_normal(r,a,n2,b,n2);
 		return;
 		}
 	/* r=(a[0]-a[1])*(b[1]-b[0]) */
-	c1=bn_cmp_part_words(a,&(a[n]),tna,n-tna);
-	c2=bn_cmp_part_words(&(b[n]),b,tnb,tnb-n);
+	c1=bn_cmp_words(a,&(a[n]),n);
+	c2=bn_cmp_words(&(b[n]),b,n);
 	zero=neg=0;
 	switch (c1*3+c2)
 		{
 	case -4:
-		bn_sub_part_words(t,      &(a[n]),a,      tna,tna-n); /* - */
-		bn_sub_part_words(&(t[n]),b,      &(b[n]),tnb,n-tnb); /* - */
+		bn_sub_words(t,      &(a[n]),a,      n); /* - */
+		bn_sub_words(&(t[n]),b,      &(b[n]),n); /* - */
 		break;
 	case -3:
 		zero=1;
 		break;
 	case -2:
-		bn_sub_part_words(t,      &(a[n]),a,      tna,tna-n); /* - */
-		bn_sub_part_words(&(t[n]),&(b[n]),b,      tnb,tnb-n); /* + */
+		bn_sub_words(t,      &(a[n]),a,      n); /* - */
+		bn_sub_words(&(t[n]),&(b[n]),b,      n); /* + */
 		neg=1;
 		break;
 	case -1:
@@ -450,22 +128,21 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
 		zero=1;
 		break;
 	case 2:
-		bn_sub_part_words(t,      a,      &(a[n]),tna,n-tna); /* + */
-		bn_sub_part_words(&(t[n]),b,      &(b[n]),tnb,n-tnb); /* - */
+		bn_sub_words(t,      a,      &(a[n]),n); /* + */
+		bn_sub_words(&(t[n]),b,      &(b[n]),n); /* - */
 		neg=1;
 		break;
 	case 3:
 		zero=1;
 		break;
 	case 4:
-		bn_sub_part_words(t,      a,      &(a[n]),tna,n-tna);
-		bn_sub_part_words(&(t[n]),&(b[n]),b,      tnb,tnb-n);
+		bn_sub_words(t,      a,      &(a[n]),n);
+		bn_sub_words(&(t[n]),&(b[n]),b,      n);
 		break;
 		}
 
 # ifdef BN_MUL_COMBA
-	if (n == 4 && dna == 0 && dnb == 0) /* XXX: bn_mul_comba4 could take
-					       extra args to do this well */
+	if (n == 4)
 		{
 		if (!zero)
 			bn_mul_comba4(&(t[n2]),t,&(t[n]));
@@ -475,9 +152,7 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
 		bn_mul_comba4(r,a,b);
 		bn_mul_comba4(&(r[n2]),&(a[n]),&(b[n]));
 		}
-	else if (n == 8 && dna == 0 && dnb == 0) /* XXX: bn_mul_comba8 could
-						    take extra args to do this
-						    well */
+	else if (n == 8)
 		{
 		if (!zero)
 			bn_mul_comba8(&(t[n2]),t,&(t[n]));
@@ -492,11 +167,11 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
 		{
 		p= &(t[n2*2]);
 		if (!zero)
-			bn_mul_recursive(&(t[n2]),t,&(t[n]),n,0,0,p);
+			bn_mul_recursive(&(t[n2]),t,&(t[n]),n,p);
 		else
 			memset(&(t[n2]),0,n2*sizeof(BN_ULONG));
-		bn_mul_recursive(r,a,b,n,0,0,p);
-		bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),n,dna,dnb,p);
+		bn_mul_recursive(r,a,b,n,p);
+		bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),n,p);
 		}
 
 	/* t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign
@@ -545,39 +220,39 @@ void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
 
 /* n+tn is the word length
  * t needs to be n*4 is size, as does r */
-void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
-	     int tna, int tnb, BN_ULONG *t)
+void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int tn,
+	     int n, BN_ULONG *t)
 	{
 	int i,j,n2=n*2;
 	unsigned int c1,c2,neg,zero;
 	BN_ULONG ln,lo,*p;
 
 # ifdef BN_COUNT
-	fprintf(stderr," bn_mul_part_recursive (%d+%d) * (%d+%d)\n",
-		tna, n, tnb, n);
+	printf(" bn_mul_part_recursive %d * %d\n",tn+n,tn+n);
 # endif
 	if (n < 8)
 		{
-		bn_mul_normal(r,a,n+tna,b,n+tnb);
+		i=tn+n;
+		bn_mul_normal(r,a,i,b,i);
 		return;
 		}
 
 	/* r=(a[0]-a[1])*(b[1]-b[0]) */
-	c1=bn_cmp_part_words(a,&(a[n]),tna,n-tna);
-	c2=bn_cmp_part_words(&(b[n]),b,tnb,tnb-n);
+	c1=bn_cmp_words(a,&(a[n]),n);
+	c2=bn_cmp_words(&(b[n]),b,n);
 	zero=neg=0;
 	switch (c1*3+c2)
 		{
 	case -4:
-		bn_sub_part_words(t,      &(a[n]),a,      tna,tna-n); /* - */
-		bn_sub_part_words(&(t[n]),b,      &(b[n]),tnb,n-tnb); /* - */
+		bn_sub_words(t,      &(a[n]),a,      n); /* - */
+		bn_sub_words(&(t[n]),b,      &(b[n]),n); /* - */
 		break;
 	case -3:
 		zero=1;
 		/* break; */
 	case -2:
-		bn_sub_part_words(t,      &(a[n]),a,      tna,tna-n); /* - */
-		bn_sub_part_words(&(t[n]),&(b[n]),b,      tnb,tnb-n); /* + */
+		bn_sub_words(t,      &(a[n]),a,      n); /* - */
+		bn_sub_words(&(t[n]),&(b[n]),b,      n); /* + */
 		neg=1;
 		break;
 	case -1:
@@ -586,16 +261,16 @@ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
 		zero=1;
 		/* break; */
 	case 2:
-		bn_sub_part_words(t,      a,      &(a[n]),tna,n-tna); /* + */
-		bn_sub_part_words(&(t[n]),b,      &(b[n]),tnb,n-tnb); /* - */
+		bn_sub_words(t,      a,      &(a[n]),n); /* + */
+		bn_sub_words(&(t[n]),b,      &(b[n]),n); /* - */
 		neg=1;
 		break;
 	case 3:
 		zero=1;
 		/* break; */
 	case 4:
-		bn_sub_part_words(t,      a,      &(a[n]),tna,n-tna);
-		bn_sub_part_words(&(t[n]),&(b[n]),b,      tnb,tnb-n);
+		bn_sub_words(t,      a,      &(a[n]),n);
+		bn_sub_words(&(t[n]),&(b[n]),b,      n);
 		break;
 		}
 		/* The zero case isn't yet implemented here. The speedup
@@ -614,59 +289,54 @@ void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
 		{
 		bn_mul_comba8(&(t[n2]),t,&(t[n]));
 		bn_mul_comba8(r,a,b);
-		bn_mul_normal(&(r[n2]),&(a[n]),tna,&(b[n]),tnb);
-		memset(&(r[n2+tna+tnb]),0,sizeof(BN_ULONG)*(n2-tna-tnb));
+		bn_mul_normal(&(r[n2]),&(a[n]),tn,&(b[n]),tn);
+		memset(&(r[n2+tn*2]),0,sizeof(BN_ULONG)*(n2-tn*2));
 		}
 	else
 		{
 		p= &(t[n2*2]);
-		bn_mul_recursive(&(t[n2]),t,&(t[n]),n,0,0,p);
-		bn_mul_recursive(r,a,b,n,0,0,p);
+		bn_mul_recursive(&(t[n2]),t,&(t[n]),n,p);
+		bn_mul_recursive(r,a,b,n,p);
 		i=n/2;
 		/* If there is only a bottom half to the number,
 		 * just do it */
-		if (tna > tnb)
-			j = tna - i;
-		else
-			j = tnb - i;
+		j=tn-i;
 		if (j == 0)
 			{
-			bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),
-				i,tna-i,tnb-i,p);
+			bn_mul_recursive(&(r[n2]),&(a[n]),&(b[n]),i,p);
 			memset(&(r[n2+i*2]),0,sizeof(BN_ULONG)*(n2-i*2));
 			}
 		else if (j > 0) /* eg, n == 16, i == 8 and tn == 11 */
 				{
 				bn_mul_part_recursive(&(r[n2]),&(a[n]),&(b[n]),
-					i,tna-i,tnb-i,p);
-				memset(&(r[n2+tna+tnb]),0,
-					sizeof(BN_ULONG)*(n2-tna-tnb));
+					j,i,p);
+				memset(&(r[n2+tn*2]),0,
+					sizeof(BN_ULONG)*(n2-tn*2));
 				}
 		else /* (j < 0) eg, n == 16, i == 8 and tn == 5 */
 			{
 			memset(&(r[n2]),0,sizeof(BN_ULONG)*n2);
-			if (tna < BN_MUL_RECURSIVE_SIZE_NORMAL
-				&& tnb < BN_MUL_RECURSIVE_SIZE_NORMAL)
+			if (tn < BN_MUL_RECURSIVE_SIZE_NORMAL)
 				{
-				bn_mul_normal(&(r[n2]),&(a[n]),tna,&(b[n]),tnb);
+				bn_mul_normal(&(r[n2]),&(a[n]),tn,&(b[n]),tn);
 				}
 			else
 				{
 				for (;;)
 					{
 					i/=2;
-					if (i < tna && i < tnb)
+					if (i < tn)
 						{
 						bn_mul_part_recursive(&(r[n2]),
 							&(a[n]),&(b[n]),
-							i,tna-i,tnb-i,p);
+							tn-i,i,p);
 						break;
 						}
-					else if (i <= tna && i <= tnb)
+					else if (i == tn)
 						{
 						bn_mul_recursive(&(r[n2]),
 							&(a[n]),&(b[n]),
-							i,tna-i,tnb-i,p);
+							i,p);
 						break;
 						}
 					}
@@ -727,10 +397,10 @@ void bn_mul_low_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
 	int n=n2/2;
 
 # ifdef BN_COUNT
-	fprintf(stderr," bn_mul_low_recursive %d * %d\n",n2,n2);
+	printf(" bn_mul_low_recursive %d * %d\n",n2,n2);
 # endif
 
-	bn_mul_recursive(r,a,b,n,0,0,&(t[0]));
+	bn_mul_recursive(r,a,b,n,&(t[0]));
 	if (n >= BN_MUL_LOW_RECURSIVE_SIZE_NORMAL)
 		{
 		bn_mul_low_recursive(&(t[0]),&(a[0]),&(b[n]),n,&(t[n2]));
@@ -761,7 +431,7 @@ void bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l, int n2,
 	BN_ULONG ll,lc,*lp,*mp;
 
 # ifdef BN_COUNT
-	fprintf(stderr," bn_mul_high %d * %d\n",n2,n2);
+	printf(" bn_mul_high %d * %d\n",n2,n2);
 # endif
 	n=n2/2;
 
@@ -814,8 +484,8 @@ void bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l, int n2,
 	else
 # endif
 		{
-		bn_mul_recursive(&(t[0]),&(r[0]),&(r[n]),n,0,0,&(t[n2]));
-		bn_mul_recursive(r,&(a[n]),&(b[n]),n,0,0,&(t[n2]));
+		bn_mul_recursive(&(t[0]),&(r[0]),&(r[n]),n,&(t[n2]));
+		bn_mul_recursive(r,&(a[n]),&(b[n]),n,&(t[n2]));
 		}
 
 	/* s0 == low(al*bl)
@@ -940,19 +610,19 @@ void bn_mul_high(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, BN_ULONG *l, int n2,
 
 int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
 	{
-	int ret=0;
 	int top,al,bl;
 	BIGNUM *rr;
+	int ret = 0;
 #if defined(BN_MUL_COMBA) || defined(BN_RECURSION)
 	int i;
 #endif
 #ifdef BN_RECURSION
-	BIGNUM *t=NULL;
-	int j=0,k;
+	BIGNUM *t;
+	int j,k;
 #endif
 
 #ifdef BN_COUNT
-	fprintf(stderr,"BN_mul %d * %d\n",a->top,b->top);
+	printf("BN_mul %d * %d\n",a->top,b->top);
 #endif
 
 	bn_check_top(a);
@@ -1005,55 +675,21 @@ int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
 #ifdef BN_RECURSION
 	if ((al >= BN_MULL_SIZE_NORMAL) && (bl >= BN_MULL_SIZE_NORMAL))
 		{
-		if (i >= -1 && i <= 1)
+		if (i == 1 && !BN_get_flags(b,BN_FLG_STATIC_DATA) && bl<b->dmax)
 			{
-			int sav_j =0;
-			/* Find out the power of two lower or equal
-			   to the longest of the two numbers */
-			if (i >= 0)
-				{
-				j = BN_num_bits_word((BN_ULONG)al);
-				}
-			if (i == -1)
-				{
-				j = BN_num_bits_word((BN_ULONG)bl);
-				}
-			sav_j = j;
-			j = 1<<(j-1);
-			assert(j <= al || j <= bl);
-			k = j+j;
-			t = BN_CTX_get(ctx);
-			if (al > j || bl > j)
-				{
-				bn_wexpand(t,k*4);
-				bn_wexpand(rr,k*4);
-				bn_mul_part_recursive(rr->d,a->d,b->d,
-					j,al-j,bl-j,t->d);
-				}
-			else	/* al <= j || bl <= j */
-				{
-				bn_wexpand(t,k*2);
-				bn_wexpand(rr,k*2);
-				bn_mul_recursive(rr->d,a->d,b->d,
-					j,al-j,bl-j,t->d);
-				}
-			rr->top=top;
-			goto end;
-			}
-#if 0
-		if (i == 1 && !BN_get_flags(b,BN_FLG_STATIC_DATA))
-			{
-			BIGNUM *tmp_bn = (BIGNUM *)b;
-			if (bn_wexpand(tmp_bn,al) == NULL) goto err;
-			tmp_bn->d[bl]=0;
+#if 0	/* tribute to const-ification, bl<b->dmax above covers for this */
+			if (bn_wexpand(b,al) == NULL) goto err;
+#endif
+			b->d[bl]=0;
 			bl++;
 			i--;
 			}
-		else if (i == -1 && !BN_get_flags(a,BN_FLG_STATIC_DATA))
+		else if (i == -1 && !BN_get_flags(a,BN_FLG_STATIC_DATA) && al<a->dmax)
 			{
-			BIGNUM *tmp_bn = (BIGNUM *)a;
-			if (bn_wexpand(tmp_bn,bl) == NULL) goto err;
-			tmp_bn->d[al]=0;
+#if 0	/* tribute to const-ification, al<a->dmax above covers for this */
+			if (bn_wexpand(a,bl) == NULL) goto err;
+#endif
+			a->d[al]=0;
 			al++;
 			i++;
 			}
@@ -1070,17 +706,26 @@ int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
 				if (bn_wexpand(t,k*2) == NULL) goto err;
 				if (bn_wexpand(rr,k*2) == NULL) goto err;
 				bn_mul_recursive(rr->d,a->d,b->d,al,t->d);
+				rr->top=top;
+				goto end;
 				}
+#if 0	/* tribute to const-ification, rsa/dsa performance is not affected */
 			else
 				{
-				if (bn_wexpand(t,k*4) == NULL) goto err;
-				if (bn_wexpand(rr,k*4) == NULL) goto err;
+				if (bn_wexpand(a,k) == NULL ) goto err;
+				if (bn_wexpand(b,k) == NULL ) goto err;
+				if (bn_wexpand(t,k*4) == NULL ) goto err;
+				if (bn_wexpand(rr,k*4) == NULL ) goto err;
+				for (i=a->top; i<k; i++)
+					a->d[i]=0;
+				for (i=b->top; i<k; i++)
+					b->d[i]=0;
 				bn_mul_part_recursive(rr->d,a->d,b->d,al-j,j,t->d);
 				}
 			rr->top=top;
 			goto end;
-			}
 #endif
+			}
 		}
 #endif /* BN_RECURSION */
 	if (bn_wexpand(rr,top) == NULL) goto err;
@@ -1103,7 +748,7 @@ void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb)
 	BN_ULONG *rr;
 
 #ifdef BN_COUNT
-	fprintf(stderr," bn_mul_normal %d * %d\n",na,nb);
+	printf(" bn_mul_normal %d * %d\n",na,nb);
 #endif
 
 	if (na < nb)
@@ -1116,13 +761,7 @@ void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb)
 
 		}
 	rr= &(r[na]);
-	if (nb <= 0)
-		{
-		(void)bn_mul_words(r,a,na,0);
-		return;
-		}
-	else
-		rr[0]=bn_mul_words(r,a,na,b[0]);
+	rr[0]=bn_mul_words(r,a,na,b[0]);
 
 	for (;;)
 		{
@@ -1143,7 +782,7 @@ void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb)
 void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
 	{
 #ifdef BN_COUNT
-	fprintf(stderr," bn_mul_low_normal %d * %d\n",n,n);
+	printf(" bn_mul_low_normal %d * %d\n",n,n);
 #endif
 	bn_mul_words(r,a,n,b[0]);
 
diff --git a/src/lib/libcrypto/bn/bn_prime.c b/src/lib/libcrypto/bn/bn_prime.c
index 918b9237c6..e072d9255c 100644
--- a/src/lib/libcrypto/bn/bn_prime.c
+++ b/src/lib/libcrypto/bn/bn_prime.c
@@ -140,6 +140,7 @@ BIGNUM *BN_generate_prime(BIGNUM *ret, int bits, int safe,
 	BN_CTX *ctx;
 	int checks = BN_prime_checks_for_size(bits);
 
+	BN_init(&t);
 	ctx=BN_CTX_new();
 	if (ctx == NULL) goto err;
 	if (ret == NULL)
@@ -148,7 +149,6 @@ BIGNUM *BN_generate_prime(BIGNUM *ret, int bits, int safe,
 		}
 	else
 		rnd=ret;
-	BN_init(&t);
 loop: 
 	/* make a random number and set the top and bottom bits */
 	if (add == NULL)
diff --git a/src/lib/libcrypto/bn/bn_rand.c b/src/lib/libcrypto/bn/bn_rand.c
index 9e08ccd22e..893c9d2af9 100644
--- a/src/lib/libcrypto/bn/bn_rand.c
+++ b/src/lib/libcrypto/bn/bn_rand.c
@@ -201,7 +201,7 @@ static int bnrand(int pseudorand, BIGNUM *rnd, int bits, int top, int bottom)
 err:
 	if (buf != NULL)
 		{
-		memset(buf,0,bytes);
+		OPENSSL_cleanse(buf,bytes);
 		OPENSSL_free(buf);
 		}
 	return(ret);
diff --git a/src/lib/libcrypto/bn/bn_word.c b/src/lib/libcrypto/bn/bn_word.c
index cd59baa2c4..988e0ca7b3 100644
--- a/src/lib/libcrypto/bn/bn_word.c
+++ b/src/lib/libcrypto/bn/bn_word.c
@@ -123,7 +123,10 @@ int BN_add_word(BIGNUM *a, BN_ULONG w)
 	i=0;
 	for (;;)
 		{
-		l=(a->d[i]+(BN_ULONG)w)&BN_MASK2;
+		if (i >= a->top)
+			l=w;
+		else
+			l=(a->d[i]+(BN_ULONG)w)&BN_MASK2;
 		a->d[i]=l;
 		if (w > l)
 			w=1;
diff --git a/src/lib/libcrypto/bn/bntest.c b/src/lib/libcrypto/bn/bntest.c
index 8158a67374..3c8c540387 100644
--- a/src/lib/libcrypto/bn/bntest.c
+++ b/src/lib/libcrypto/bn/bntest.c
@@ -68,10 +68,6 @@
 #include <openssl/x509.h>
 #include <openssl/err.h>
 
-#ifdef OPENSSL_SYS_WINDOWS
-#include "../bio/bss_file.c"
-#endif
-
 const int num0 = 100; /* number of tests */
 const int num1 = 50;  /* additional tests for some functions */
 const int num2 = 5;   /* number of tests for slow functions */
@@ -96,11 +92,6 @@ int test_sqrt(BIO *bp,BN_CTX *ctx);
 int rand_neg(void);
 static int results=0;
 
-#ifdef OPENSSL_NO_STDIO
-#define APPS_WIN16
-#include "bss_file.c"
-#endif
-
 static unsigned char lst[]="\xC6\x4F\x43\x04\x2A\xEA\xCA\x6E\x58\x36\x80\x5B\xE8\xC9"
 "\x9B\x04\x5D\x48\x36\xC2\xFD\x16\xC9\x64\xF0";
 
@@ -141,10 +132,10 @@ int main(int argc, char *argv[])
 
 
 	ctx=BN_CTX_new();
-	if (ctx == NULL) exit(1);
+	if (ctx == NULL) EXIT(1);
 
 	out=BIO_new(BIO_s_file());
-	if (out == NULL) exit(1);
+	if (out == NULL) EXIT(1);
 	if (outfile == NULL)
 		{
 		BIO_set_fp(out,stdout,BIO_NOCLOSE);
@@ -154,7 +145,7 @@ int main(int argc, char *argv[])
 		if (!BIO_write_filename(out,outfile))
 			{
 			perror(outfile);
-			exit(1);
+			EXIT(1);
 			}
 		}
 
@@ -238,14 +229,14 @@ int main(int argc, char *argv[])
 	BIO_free(out);
 
 /**/
-	exit(0);
+	EXIT(0);
 err:
 	BIO_puts(out,"1\n"); /* make sure the Perl script fed by bc notices
 	                      * the failure, see test_bn in test/Makefile.ssl*/
 	BIO_flush(out);
 	ERR_load_crypto_strings();
 	ERR_print_errors_fp(stderr);
-	exit(1);
+	EXIT(1);
 	return(1);
 	}
 
@@ -488,7 +479,7 @@ int test_mul(BIO *bp)
 	BN_CTX *ctx;
 
 	ctx = BN_CTX_new();
-	if (ctx == NULL) exit(1);
+	if (ctx == NULL) EXIT(1);
 	
 	BN_init(&a);
 	BN_init(&b);
@@ -726,7 +717,7 @@ int test_mod_mul(BIO *bp, BN_CTX *ctx)
 			while ((l=ERR_get_error()))
 				fprintf(stderr,"ERROR:%s\n",
 					ERR_error_string(l,NULL));
-			exit(1);
+			EXIT(1);
 			}
 		if (bp != NULL)
 			{
diff --git a/src/lib/libcrypto/bn/divtest.c b/src/lib/libcrypto/bn/divtest.c
index 13ba86e3c4..d3fc688f33 100644
--- a/src/lib/libcrypto/bn/divtest.c
+++ b/src/lib/libcrypto/bn/divtest.c
@@ -1,7 +1,7 @@
 #include <openssl/bn.h>
 #include <openssl/rand.h>
 
-static int rand(n)
+static int Rand(n)
 {
     unsigned char x[2];
     RAND_pseudo_bytes(x,2);
@@ -26,8 +26,8 @@ main()
     BN_CTX *ctx=BN_CTX_new();
 
     for(;;) {
-	BN_pseudo_rand(a,rand(),0,0);
-	BN_pseudo_rand(b,rand(),0,0);
+	BN_pseudo_rand(a,Rand(),0,0);
+	BN_pseudo_rand(b,Rand(),0,0);
 	if (BN_is_zero(b)) continue;
 
 	BN_RECP_CTX_set(recp,b,ctx);
diff --git a/src/lib/libcrypto/bn/exptest.c b/src/lib/libcrypto/bn/exptest.c
index 5ca570d1a8..b09cf88705 100644
--- a/src/lib/libcrypto/bn/exptest.c
+++ b/src/lib/libcrypto/bn/exptest.c
@@ -59,13 +59,13 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
+#include "../e_os.h"
+
 #include <openssl/bio.h>
 #include <openssl/bn.h>
 #include <openssl/rand.h>
 #include <openssl/err.h>
-#ifdef OPENSSL_SYS_WINDOWS
-#include "../bio/bss_file.c"
-#endif
 
 #define NUM_BITS	(BN_BITS*2)
 
@@ -86,7 +86,7 @@ int main(int argc, char *argv[])
 	ERR_load_BN_strings();
 
 	ctx=BN_CTX_new();
-	if (ctx == NULL) exit(1);
+	if (ctx == NULL) EXIT(1);
 	r_mont=BN_new();
 	r_recp=BN_new();
 	r_simple=BN_new();
@@ -99,7 +99,7 @@ int main(int argc, char *argv[])
 
 	out=BIO_new(BIO_s_file());
 
-	if (out == NULL) exit(1);
+	if (out == NULL) EXIT(1);
 	BIO_set_fp(out,stdout,BIO_NOCLOSE);
 
 	for (i=0; i<200; i++)
@@ -124,7 +124,7 @@ int main(int argc, char *argv[])
 			{
 			printf("BN_mod_exp_mont() problems\n");
 			ERR_print_errors(out);
-			exit(1);
+			EXIT(1);
 			}
 
 		ret=BN_mod_exp_recp(r_recp,a,b,m,ctx);
@@ -132,7 +132,7 @@ int main(int argc, char *argv[])
 			{
 			printf("BN_mod_exp_recp() problems\n");
 			ERR_print_errors(out);
-			exit(1);
+			EXIT(1);
 			}
 
 		ret=BN_mod_exp_simple(r_simple,a,b,m,ctx);
@@ -140,7 +140,7 @@ int main(int argc, char *argv[])
 			{
 			printf("BN_mod_exp_simple() problems\n");
 			ERR_print_errors(out);
-			exit(1);
+			EXIT(1);
 			}
 
 		if (BN_cmp(r_simple, r_mont) == 0
@@ -163,7 +163,7 @@ int main(int argc, char *argv[])
 			printf("\nrecp     =");	BN_print(out,r_recp);
 			printf("\nmont     ="); BN_print(out,r_mont);
 			printf("\n");
-			exit(1);
+			EXIT(1);
 			}
 		}
 	BN_free(r_mont);
@@ -177,11 +177,11 @@ int main(int argc, char *argv[])
 	CRYPTO_mem_leaks(out);
 	BIO_free(out);
 	printf(" done\n");
-	exit(0);
+	EXIT(0);
 err:
 	ERR_load_crypto_strings();
 	ERR_print_errors(out);
-	exit(1);
+	EXIT(1);
 	return(1);
 	}
 
-- 
cgit v1.2.3-55-g6feb