From f6198d4d0ab97685dc56be2d48715ed39fcc74b9 Mon Sep 17 00:00:00 2001
From: djm <>
Date: Tue, 27 Jun 2006 05:05:42 +0000
Subject: import of openssl-0.9.7j

---
 src/lib/libcrypto/rc4/asm/rc4-x86_64.pl | 150 ++++++++++++++++++++++++++++++++
 src/lib/libcrypto/rc4/rc4.h             |   4 -
 src/lib/libcrypto/rc4/rc4_enc.c         |   4 -
 src/lib/libcrypto/rc4/rc4_skey.c        |   5 +-
 4 files changed, 151 insertions(+), 12 deletions(-)
 create mode 100755 src/lib/libcrypto/rc4/asm/rc4-x86_64.pl

(limited to 'src/lib/libcrypto/rc4')

diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
new file mode 100755
index 0000000000..b628daca70
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
@@ -0,0 +1,150 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+#
+# Unlike 0.9.7f this code expects RC4_CHAR back in config line! See
+# commentary section in corresponding script in development branch
+# for background information about this option carousel. For those
+# who don't have energy to figure out these gory details, here is
+# basis in form of performance matrix relative to the original
+# 0.9.7e C code-base:
+#
+#		0.9.7e	0.9.7f	this
+# AMD64		1x	3.3x	2.4x
+# EM64T		1x	0.8x	1.5x
+#
+# In other words, the idea is to trade 25% of the AMD64 performance to
+# compensate for the deterioration and gain +90% on the EM64T core. The
+# development branch maintains the best performance for either target,
+# i.e. 3.3x for AMD64 and 1.5x for EM64T.
+
+$output=shift;	# first command-line argument: file that receives the generated assembly
+
+open STDOUT,">$output" or die "can't open $output: $!";	# "or", not "||": "||" bound to the filename string, so die could never run
+
+# Argument registers: key state pointer, byte count, input pointer, output pointer.
+($dat,$len,$inp,$out)=("%rdi","%rsi","%rdx","%rcx");
+
+# Working registers.  @XX/@TX are two-entry pools that the unrolled loop
+# swaps after every step; $YY/$TY live in %r12/%r13, which the generated
+# code pushes on entry and pops on exit.
+@XX=qw(%r8 %r10);		# index x (current, next)
+@TX=qw(%r9 %r11);		# table byte at x (current, next)
+($YY,$TY)=("%r12","%r13");	# index y, table byte at y
+
+$code=<<___;;	# entry: return at once when len is 0; else save %r12/%r13, load x and y, step x and preload its table byte, then pick the 8-byte loop (len >= 8) or the byte loop
+.text
+
+.globl	RC4
+.type	RC4,\@function
+.align	16
+RC4:	or	$len,$len
+	jne	.Lentry
+	repret
+.Lentry:
+	push	%r12
+	push	%r13
+
+	add	\$2,$dat
+	movzb	-2($dat),$XX[0]#d
+	movzb	-1($dat),$YY#d
+
+	add	\$1,$XX[0]#b
+	movzb	($dat,$XX[0]),$TX[0]#d
+	test	\$-8,$len
+	jz	.Lcloop1
+	push	%rbx
+.align	16	# incidentally aligned already
+.Lcloop8:
+	mov	($inp),%eax
+	mov	4($inp),%ebx
+___
+# unroll 2x4-wise, because 64-bit rotates kill Intel P4... so the 8 input bytes are handled as two 32-bit halves; this first group of four steps works on %eax
+for ($i=0;$i<4;$i++) {	# $i is only used to make the .Lcmov labels unique
+$code.=<<___;
+	add	$TX[0]#b,$YY#b
+	lea	1($XX[0]),$XX[1]
+	movzb	($dat,$YY),$TY#d
+	movzb	$XX[1]#b,$XX[1]#d
+	movzb	($dat,$XX[1]),$TX[1]#d
+	movb	$TX[0]#b,($dat,$YY)
+	cmp	$XX[1],$YY
+	movb	$TY#b,($dat,$XX[0])
+	jne	.Lcmov$i			# Intel cmov is sloooow...
+	mov	$TX[0],$TX[1]
+.Lcmov$i:
+	add	$TX[0]#b,$TY#b
+	xor	($dat,$TY),%al
+	ror	\$8,%eax
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers: swap the two entries so the preloaded x+1 and its table byte become current
+}
+for ($i=4;$i<8;$i++) {	# second group of four steps: same sequence, keystream XORed into %ebx; $i keeps the .Lcmov labels unique
+$code.=<<___;
+	add	$TX[0]#b,$YY#b
+	lea	1($XX[0]),$XX[1]
+	movzb	($dat,$YY),$TY#d
+	movzb	$XX[1]#b,$XX[1]#d
+	movzb	($dat,$XX[1]),$TX[1]#d
+	movb	$TX[0]#b,($dat,$YY)
+	cmp	$XX[1],$YY
+	movb	$TY#b,($dat,$XX[0])
+	jne	.Lcmov$i			# Intel cmov is sloooow...
+	mov	$TX[0],$TX[1]
+.Lcmov$i:
+	add	$TX[0]#b,$TY#b
+	xor	($dat,$TY),%bl
+	ror	\$8,%ebx
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers: swap the two entries so the preloaded x+1 and its table byte become current
+}
+$code.=<<___;	# tail of the 8-byte loop (store both halves, advance pointers, repeat while len >= 8), then .Lexit (undo the x pre-increment, write x and y back, restore registers) and the byte-at-a-time loop .Lcloop1
+	lea	-8($len),$len
+	mov	%eax,($out)
+	lea	8($inp),$inp
+	mov	%ebx,4($out)
+	lea	8($out),$out
+
+	test	\$-8,$len
+	jnz	.Lcloop8
+	pop	%rbx
+	cmp	\$0,$len
+	jne	.Lcloop1
+.Lexit:
+	sub	\$1,$XX[0]#b
+	movb	$XX[0]#b,-2($dat)
+	movb	$YY#b,-1($dat)
+
+	pop	%r13
+	pop	%r12
+	repret
+
+.align	16
+.Lcloop1:
+	add	$TX[0]#b,$YY#b
+	movzb	($dat,$YY),$TY#d
+	movb	$TX[0]#b,($dat,$YY)
+	movb	$TY#b,($dat,$XX[0])
+	add	$TX[0]#b,$TY#b
+	add	\$1,$XX[0]#b
+	movzb	($dat,$TY),$TY#d
+	movzb	($dat,$XX[0]),$TX[0]#d
+	xorb	($inp),$TY#b
+	lea	1($inp),$inp
+	movb	$TY#b,($out)
+	lea	1($out),$out
+	sub	\$1,$len
+	jnz	.Lcloop1
+	jmp	.Lexit
+.size	RC4,.-RC4
+___
+
+# Post-process the template: size markers such as "%r8#b" become real
+# register names ("%r8b"), and "repret" becomes the raw bytes F3 C3.
+for ($code) { s/#([bwd])/$1/g; s/repret/.byte\t0xF3,0xC3/g; }
+
+print STDOUT $code;
diff --git a/src/lib/libcrypto/rc4/rc4.h b/src/lib/libcrypto/rc4/rc4.h
index dd90d9fde0..ae0cea75b8 100644
--- a/src/lib/libcrypto/rc4/rc4.h
+++ b/src/lib/libcrypto/rc4/rc4.h
@@ -73,10 +73,6 @@ typedef struct rc4_key_st
 	{
 	RC4_INT x,y;
 	RC4_INT data[256];
-#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
-	/* see crypto/rc4/asm/rc4-ia64.S for further details... */
-	RC4_INT pad[512-256-2];
-#endif
 	} RC4_KEY;
 
  
diff --git a/src/lib/libcrypto/rc4/rc4_enc.c b/src/lib/libcrypto/rc4/rc4_enc.c
index 81a97ea3b7..d5f18a3a70 100644
--- a/src/lib/libcrypto/rc4/rc4_enc.c
+++ b/src/lib/libcrypto/rc4/rc4_enc.c
@@ -77,10 +77,6 @@ void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata,
         x=key->x;     
         y=key->y;     
         d=key->data; 
-#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
-	/* see crypto/rc4/asm/rc4-ia64.S for further details... */
-	d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1));
-#endif
 
 #if defined(RC4_CHUNK)
 	/*
diff --git a/src/lib/libcrypto/rc4/rc4_skey.c b/src/lib/libcrypto/rc4/rc4_skey.c
index 07234f061a..60510624fd 100644
--- a/src/lib/libcrypto/rc4/rc4_skey.c
+++ b/src/lib/libcrypto/rc4/rc4_skey.c
@@ -58,6 +58,7 @@
 
 #include <openssl/rc4.h>
 #include <openssl/crypto.h>
+#include <openssl/fips.h>
 #include "rc4_locl.h"
 #include <openssl/opensslv.h>
 
@@ -94,10 +95,6 @@ FIPS_NON_FIPS_VCIPHER_Init(RC4)
         unsigned int i;
         
         d= &(key->data[0]);
-#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
-	/* see crypto/rc4/asm/rc4-ia64.S for further details... */
-	d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1));
-#endif
 
 	for (i=0; i<256; i++)
 		d[i]=i;
-- 
cgit v1.2.3-55-g6feb