From 5a3c0a05c7f2c5d3c584b7c8d6aec836dd724c80 Mon Sep 17 00:00:00 2001
From: djm <>
Date: Sat, 6 Sep 2008 12:15:56 +0000
Subject: import of OpenSSL 0.9.8h

---
 src/lib/libcrypto/rc4/asm/rc4-586.pl    |  11 +-
 src/lib/libcrypto/rc4/asm/rc4-x86_64.pl | 286 ++++++++++++++++++++++++++++----
 src/lib/libcrypto/rc4/rc4.h             |   6 +-
 src/lib/libcrypto/rc4/rc4_enc.c         |   4 +-
 src/lib/libcrypto/rc4/rc4_skey.c        |  55 ++++--
 5 files changed, 302 insertions(+), 60 deletions(-)

(limited to 'src/lib/libcrypto/rc4')

diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl
index d6e98f0811..ef7eee766c 100644
--- a/src/lib/libcrypto/rc4/asm/rc4-586.pl
+++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl
@@ -200,22 +200,23 @@ sub RC4
 
 	&lea	($ty,&DWP(0,$in,$ty));
 	&mov	(&swtmp(2),$ty);
+	&movz	($tx,&BP(0,$d,$x));
 
 	# strangely enough unrolled loop performs over 20% slower...
 	&set_label("RC4_CHAR_loop");
-		&movz	($tx,&BP(0,$d,$x));
 		&add	(&LB($y),&LB($tx));
 		&movz	($ty,&BP(0,$d,$y));
 		&movb	(&BP(0,$d,$y),&LB($tx));
 		&movb	(&BP(0,$d,$x),&LB($ty));
 		&add	(&LB($ty),&LB($tx));
 		&movz	($ty,&BP(0,$d,$ty));
+		&add	(&LB($x),1);
 		&xorb	(&LB($ty),&BP(0,$in));
-		&movb	(&BP(0,$out),&LB($ty));
-		&inc	(&LB($x));
-		&inc	($in);
-		&inc	($out);
+		&lea	($in,&DWP(1,$in));
+		&movz	($tx,&BP(0,$d,$x));
 		&cmp	($in,&swtmp(2));
+		&movb	(&BP(0,$out),&LB($ty));
+		&lea	($out,&DWP(1,$out));
 	&jb	(&label("RC4_CHAR_loop"));
 
 	&set_label("finished");
diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
index b628daca70..2d47320485 100755
--- a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
+++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
@@ -2,29 +2,70 @@
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
-# Unlike 0.9.7f this code expects RC4_CHAR back in config line! See
-# commentary section in corresponding script in development branch
-# for background information about this option carousel. For those
-# who don't have energy to figure out these gory details, here is
-# basis in form of performance matrix relative to the original
-# 0.9.7e C code-base:
-#
-#		0.9.7e	0.9.7f	this
-# AMD64		1x	3.3x	2.4x
-# EM64T		1x	0.8x	1.5x
-#
-# In other words idea is to trade -25% AMD64 performance to compensate
-# for deterioration and gain +90% on EM64T core. Development branch
-# maintains best performance for either target, i.e. 3.3x for AMD64
-# and 1.5x for EM64T.
+# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
+# "hand-coded assembler"] doesn't stand for the whole improvement
+# coefficient. It turned out that eliminating RC4_CHAR from config
+# line results in ~40% improvement (yes, even for C implementation).
+# Presumably it has everything to do with AMD cache architecture and
+# RAW or whatever penalties. Once again! The module *requires* config
+# line *without* RC4_CHAR! As for coding "secret," I bet on partial
+# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
+# I simply 'inc %r8b'. Even though optimization manual discourages
+# to operate on partial registers, it turned out to be the best bet.
+# At least for AMD... How IA32E would perform remains to be seen...
+
+# As was shown by Marc Bevand reordering of couple of load operations
+# results in even higher performance gain of 3.3x:-) At least on
+# Opteron... For reference, 1x in this case is RC4_CHAR C-code
+# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
+# Latter means that if you want to *estimate* what to expect from
+# *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
+
+# Intel P4 EM64T core was found to run the AMD64 code really slow...
+# The only way to achieve comparable performance on P4 was to keep
+# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
+# compose blended code, which would perform even within 30% marginal
+# on both AMD and Intel platforms, I implement both cases. See
+# rc4_skey.c for further details...
+
+# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing 
+# those with add/sub results in 50% performance improvement of folded
+# loop...
+
+# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
+# performance by >30% [unlike P4 32-bit case that is]. But this is
+# provided that loads are reordered even more aggressively! Both code
+# paths, AMD64 and EM64T, reorder loads in essentially same manner
+# as my IA-64 implementation. On Opteron this resulted in modest 5%
+# improvement [I had to test it], while final Intel P4 performance
+# achieves respectable 432MBps on 2.8GHz processor now. For reference.
+# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
+# RC4_INT code-path. While if executed on Opteron, it's only 25%
+# slower than the RC4_INT one [meaning that if CPU µ-arch detection
+# is not implemented, then this final RC4_CHAR code-path should be
+# preferred, as it provides better *all-round* performance].
+
+# Intel Core2 was observed to perform poorly on both code paths:-( It
+# apparently suffers from some kind of partial register stall, which
+# occurs in 64-bit mode only [as virtually identical 32-bit loop was
+# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
+# cloop1 boosts its performance by 80%! This loop appears to be optimal
+# fit for Core2 and therefore the code was modified to skip cloop8 on
+# this CPU.
 
 $output=shift;
 
-open STDOUT,">$output" || die "can't open $output: $!";
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $output";
 
 $dat="%rdi";	    # arg1
 $len="%rsi";	    # arg2
@@ -36,29 +77,101 @@ $out="%rcx";	    # arg4
 $YY="%r12";
 $TY="%r13";
 
-$code=<<___;;
+$code=<<___;
 .text
 
 .globl	RC4
-.type	RC4,\@function
+.type	RC4,\@function,4
 .align	16
 RC4:	or	$len,$len
 	jne	.Lentry
-	repret
+	ret
 .Lentry:
 	push	%r12
 	push	%r13
 
-	add	\$2,$dat
-	movzb	-2($dat),$XX[0]#d
-	movzb	-1($dat),$YY#d
+	add	\$8,$dat
+	movl	-8($dat),$XX[0]#d
+	movl	-4($dat),$YY#d
+	cmpl	\$-1,256($dat)
+	je	.LRC4_CHAR
+	inc	$XX[0]#b
+	movl	($dat,$XX[0],4),$TX[0]#d
+	test	\$-8,$len
+	jz	.Lloop1
+	jmp	.Lloop8
+.align	16
+.Lloop8:
+___
+for ($i=0;$i<8;$i++) {
+$code.=<<___;
+	add	$TX[0]#b,$YY#b
+	mov	$XX[0],$XX[1]
+	movl	($dat,$YY,4),$TY#d
+	ror	\$8,%rax			# ror is redundant when $i=0
+	inc	$XX[1]#b
+	movl	($dat,$XX[1],4),$TX[1]#d
+	cmp	$XX[1],$YY
+	movl	$TX[0]#d,($dat,$YY,4)
+	cmove	$TX[0],$TX[1]
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TX[0]#b,$TY#b
+	movb	($dat,$TY,4),%al
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
+}
+$code.=<<___;
+	ror	\$8,%rax
+	sub	\$8,$len
+
+	xor	($inp),%rax
+	add	\$8,$inp
+	mov	%rax,($out)
+	add	\$8,$out
 
+	test	\$-8,$len
+	jnz	.Lloop8
+	cmp	\$0,$len
+	jne	.Lloop1
+___
+$code.=<<___;
+.Lexit:
+	sub	\$1,$XX[0]#b
+	movl	$XX[0]#d,-8($dat)
+	movl	$YY#d,-4($dat)
+
+	pop	%r13
+	pop	%r12
+	ret
+.align	16
+.Lloop1:
+	add	$TX[0]#b,$YY#b
+	movl	($dat,$YY,4),$TY#d
+	movl	$TX[0]#d,($dat,$YY,4)
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TY#b,$TX[0]#b
+	inc	$XX[0]#b
+	movl	($dat,$TX[0],4),$TY#d
+	movl	($dat,$XX[0],4),$TX[0]#d
+	xorb	($inp),$TY#b
+	inc	$inp
+	movb	$TY#b,($out)
+	inc	$out
+	dec	$len
+	jnz	.Lloop1
+	jmp	.Lexit
+
+.align	16
+.LRC4_CHAR:
 	add	\$1,$XX[0]#b
 	movzb	($dat,$XX[0]),$TX[0]#d
 	test	\$-8,$len
 	jz	.Lcloop1
+	cmp	\$0,260($dat)
+	jnz	.Lcloop1
 	push	%rbx
-.align	16	# incidentally aligned already
+	jmp	.Lcloop8
+.align	16
 .Lcloop8:
 	mov	($inp),%eax
 	mov	4($inp),%ebx
@@ -114,15 +227,9 @@ $code.=<<___;
 	pop	%rbx
 	cmp	\$0,$len
 	jne	.Lcloop1
-.Lexit:
-	sub	\$1,$XX[0]#b
-	movb	$XX[0]#b,-2($dat)
-	movb	$YY#b,-1($dat)
-
-	pop	%r13
-	pop	%r12
-	repret
-
+	jmp	.Lexit
+___
+$code.=<<___;
 .align	16
 .Lcloop1:
 	add	$TX[0]#b,$YY#b
@@ -131,6 +238,8 @@ $code.=<<___;
 	movb	$TY#b,($dat,$XX[0])
 	add	$TX[0]#b,$TY#b
 	add	\$1,$XX[0]#b
+	movzb	$TY#b,$TY#d
+	movzb	$XX[0]#b,$XX[0]#d
 	movzb	($dat,$TY),$TY#d
 	movzb	($dat,$XX[0]),$TX[0]#d
 	xorb	($inp),$TY#b
@@ -143,8 +252,113 @@ $code.=<<___;
 .size	RC4,.-RC4
 ___
 
-$code =~ s/#([bwd])/$1/gm;
+$idx="%r8";
+$ido="%r9";
+
+$code.=<<___;
+.extern	OPENSSL_ia32cap_P
+.globl	RC4_set_key
+.type	RC4_set_key,\@function,3
+.align	16
+RC4_set_key:
+	lea	8($dat),$dat
+	lea	($inp,$len),$inp
+	neg	$len
+	mov	$len,%rcx
+	xor	%eax,%eax
+	xor	$ido,$ido
+	xor	%r10,%r10
+	xor	%r11,%r11
 
-$code =~ s/repret/.byte\t0xF3,0xC3/gm;
+	mov	OPENSSL_ia32cap_P(%rip),$idx#d
+	bt	\$20,$idx#d
+	jnc	.Lw1stloop
+	bt	\$30,$idx#d
+	setc	$ido#b
+	mov	$ido#d,260($dat)
+	jmp	.Lc1stloop
+
+.align	16
+.Lw1stloop:
+	mov	%eax,($dat,%rax,4)
+	add	\$1,%al
+	jnc	.Lw1stloop
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lw2ndloop:
+	mov	($dat,$ido,4),%r10d
+	add	($inp,$len,1),$idx#b
+	add	%r10b,$idx#b
+	add	\$1,$len
+	mov	($dat,$idx,4),%r11d
+	cmovz	%rcx,$len
+	mov	%r10d,($dat,$idx,4)
+	mov	%r11d,($dat,$ido,4)
+	add	\$1,$ido#b
+	jnc	.Lw2ndloop
+	jmp	.Lexit_key
+
+.align	16
+.Lc1stloop:
+	mov	%al,($dat,%rax)
+	add	\$1,%al
+	jnc	.Lc1stloop
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lc2ndloop:
+	mov	($dat,$ido),%r10b
+	add	($inp,$len),$idx#b
+	add	%r10b,$idx#b
+	add	\$1,$len
+	mov	($dat,$idx),%r11b
+	jnz	.Lcnowrap
+	mov	%rcx,$len
+.Lcnowrap:
+	mov	%r10b,($dat,$idx)
+	mov	%r11b,($dat,$ido)
+	add	\$1,$ido#b
+	jnc	.Lc2ndloop
+	movl	\$-1,256($dat)
+
+.align	16
+.Lexit_key:
+	xor	%eax,%eax
+	mov	%eax,-8($dat)
+	mov	%eax,-4($dat)
+	ret
+.size	RC4_set_key,.-RC4_set_key
+
+.globl	RC4_options
+.type	RC4_options,\@function,0
+.align	16
+RC4_options:
+	.picmeup %rax
+	lea	.Lopts-.(%rax),%rax
+	mov	OPENSSL_ia32cap_P(%rip),%edx
+	bt	\$20,%edx
+	jnc	.Ldone
+	add	\$12,%rax
+	bt	\$30,%edx
+	jnc	.Ldone
+	add	\$13,%rax
+.Ldone:
+	ret
+.align	64
+.Lopts:
+.asciz	"rc4(8x,int)"
+.asciz	"rc4(8x,char)"
+.asciz	"rc4(1x,char)"
+.asciz	"RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+.size	RC4_options,.-RC4_options
+___
+
+$code =~ s/#([bwd])/$1/gm;
 
 print $code;
+
+close STDOUT;
diff --git a/src/lib/libcrypto/rc4/rc4.h b/src/lib/libcrypto/rc4/rc4.h
index ae0cea75b8..7aec04fe93 100644
--- a/src/lib/libcrypto/rc4/rc4.h
+++ b/src/lib/libcrypto/rc4/rc4.h
@@ -59,12 +59,11 @@
 #ifndef HEADER_RC4_H
 #define HEADER_RC4_H
 
+#include <openssl/opensslconf.h> /* OPENSSL_NO_RC4, RC4_INT */
 #ifdef OPENSSL_NO_RC4
 #error RC4 is disabled.
 #endif
 
-#include <openssl/opensslconf.h> /* RC4_INT */
-
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -77,9 +76,6 @@ typedef struct rc4_key_st
 
  
 const char *RC4_options(void);
-#ifdef OPENSSL_FIPS
-void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
-#endif
 void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
 void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata,
 		unsigned char *outdata);
diff --git a/src/lib/libcrypto/rc4/rc4_enc.c b/src/lib/libcrypto/rc4/rc4_enc.c
index d5f18a3a70..0660ea60a2 100644
--- a/src/lib/libcrypto/rc4/rc4_enc.c
+++ b/src/lib/libcrypto/rc4/rc4_enc.c
@@ -157,7 +157,7 @@ void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata,
 		if (!is_endian.little)
 			{	/* BIG-ENDIAN CASE */
 # define BESHFT(c)	(((sizeof(RC4_CHUNK)-(c)-1)*8)&(sizeof(RC4_CHUNK)*8-1))
-			for (;len&-sizeof(RC4_CHUNK);len-=sizeof(RC4_CHUNK))
+			for (;len&~(sizeof(RC4_CHUNK)-1);len-=sizeof(RC4_CHUNK))
 				{
 				ichunk  = *(RC4_CHUNK *)indata;
 				otp  = RC4_STEP<<BESHFT(0);
@@ -210,7 +210,7 @@ void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata,
 		else
 			{	/* LITTLE-ENDIAN CASE */
 # define LESHFT(c)	(((c)*8)&(sizeof(RC4_CHUNK)*8-1))
-			for (;len&-sizeof(RC4_CHUNK);len-=sizeof(RC4_CHUNK))
+			for (;len&~(sizeof(RC4_CHUNK)-1);len-=sizeof(RC4_CHUNK))
 				{
 				ichunk  = *(RC4_CHUNK *)indata;
 				otp  = RC4_STEP;
diff --git a/src/lib/libcrypto/rc4/rc4_skey.c b/src/lib/libcrypto/rc4/rc4_skey.c
index 60510624fd..46b77ec321 100644
--- a/src/lib/libcrypto/rc4/rc4_skey.c
+++ b/src/lib/libcrypto/rc4/rc4_skey.c
@@ -57,12 +57,10 @@
  */
 
 #include <openssl/rc4.h>
-#include <openssl/crypto.h>
-#include <openssl/fips.h>
 #include "rc4_locl.h"
 #include <openssl/opensslv.h>
 
-const char *RC4_version="RC4" OPENSSL_VERSION_PTEXT;
+const char RC4_version[]="RC4" OPENSSL_VERSION_PTEXT;
 
 const char *RC4_options(void)
 	{
@@ -87,7 +85,7 @@ const char *RC4_options(void)
  * Date: Wed, 14 Sep 1994 06:35:31 GMT
  */
 
-FIPS_NON_FIPS_VCIPHER_Init(RC4)
+void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
 	{
         register RC4_INT tmp;
         register int id1,id2;
@@ -95,26 +93,59 @@ FIPS_NON_FIPS_VCIPHER_Init(RC4)
         unsigned int i;
         
         d= &(key->data[0]);
-
-	for (i=0; i<256; i++)
-		d[i]=i;
         key->x = 0;     
         key->y = 0;     
         id1=id2=0;     
 
-#define SK_LOOP(n) { \
+#define SK_LOOP(d,n) { \
 		tmp=d[(n)]; \
 		id2 = (data[id1] + tmp + id2) & 0xff; \
 		if (++id1 == len) id1=0; \
 		d[(n)]=d[id2]; \
 		d[id2]=tmp; }
 
+#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM)
+# if	defined(__i386)   || defined(__i386__)   || defined(_M_IX86) || \
+	defined(__INTEL__) || \
+	defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64)
+	if (sizeof(RC4_INT) > 1) {
+		/*
+		 * Unlike all other x86 [and x86_64] implementations,
+		 * Intel P4 core [including EM64T] was found to perform
+		 * poorly with wider RC4_INT. Performance improvement
+		 * for IA-32 hand-coded assembler turned out to be 2.8x
+		 * if re-coded for RC4_CHAR! It's however inappropriate
+		 * to just switch to RC4_CHAR for x86[_64], as non-P4
+		 * implementations suffer from significant performance
+		 * losses then, e.g. PIII exhibits >2x deterioration,
+		 * and so does Opteron. In order to assure optimal
+		 * all-round performance, we detect P4 at run-time by
+		 * checking upon reserved bit 20 in CPU capability
+		 * vector and set up compressed key schedule, which is
+		 * recognized by correspondingly updated assembler
+		 * module... Bit 20 is set up by OPENSSL_ia32_cpuid.
+		 *
+		 *				<appro@fy.chalmers.se>
+		 */
+		if (OPENSSL_ia32cap_P & (1<<20)) {
+			unsigned char *cp=(unsigned char *)d;
+
+			for (i=0;i<256;i++) cp[i]=i;
+			for (i=0;i<256;i++) SK_LOOP(cp,i);
+			/* mark schedule as compressed! */
+			d[256/sizeof(RC4_INT)]=-1;
+			return;
+		}
+	}
+# endif
+#endif
+	for (i=0; i < 256; i++) d[i]=i;
 	for (i=0; i < 256; i+=4)
 		{
-		SK_LOOP(i+0);
-		SK_LOOP(i+1);
-		SK_LOOP(i+2);
-		SK_LOOP(i+3);
+		SK_LOOP(d,i+0);
+		SK_LOOP(d,i+1);
+		SK_LOOP(d,i+2);
+		SK_LOOP(d,i+3);
 		}
 	}
     
-- 
cgit v1.2.3-55-g6feb