From f6198d4d0ab97685dc56be2d48715ed39fcc74b9 Mon Sep 17 00:00:00 2001
From: djm <>
Date: Tue, 27 Jun 2006 05:05:42 +0000
Subject: import of openssl-0.9.7j

---
Review note: relative to the imported script, rc4-x86_64.pl below uses
"open ... or die" instead of "open ... || die" and carries a few trailing
comments, so its blob no longer matches the b628daca70 recorded on the
index line. The #include context lines in the rc4_skey.c hunk carry no
header names in this copy. NOTE(review): presumably they were stripped
together with the address in "From: djm <>"; restore them from the tree
before applying.

 src/lib/libcrypto/rc4/asm/rc4-x86_64.pl | 150 ++++++++++++++++++++++++++++++++
 src/lib/libcrypto/rc4/rc4.h             |   4 -
 src/lib/libcrypto/rc4/rc4_enc.c         |   4 -
 src/lib/libcrypto/rc4/rc4_skey.c        |   5 +-
 4 files changed, 151 insertions(+), 12 deletions(-)
 create mode 100755 src/lib/libcrypto/rc4/asm/rc4-x86_64.pl

(limited to 'src/lib/libcrypto/rc4')

diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
new file mode 100755
index 0000000000..b628daca70
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
@@ -0,0 +1,150 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+#
+# Unlike 0.9.7f this code expects RC4_CHAR back in config line! See
+# commentary section in corresponding script in development branch
+# for background information about this option carousel. For those
+# who don't have energy to figure out these gory details, here is
+# basis in form of performance matrix relative to the original
+# 0.9.7e C code-base:
+#
+#		0.9.7e	0.9.7f	this
+# AMD64		1x	3.3x	2.4x
+# EM64T		1x	0.8x	1.5x
+#
+# In other words idea is to trade -25% AMD64 performance to compensate
+# for deterioration and gain +90% on EM64T core. Development branch
+# maintains best performance for either target, i.e. 3.3x for AMD64
+# and 1.5x for EM64T.
+
+$output=shift;	# output file name: first command-line argument
+
+open STDOUT,">$output" or die "can't open $output: $!";	# "or", not "||": "||" binds to the filename string, so die could never run
+
+$dat="%rdi";	    # arg1
+$len="%rsi";	    # arg2
+$inp="%rdx";	    # arg3
+$out="%rcx";	    # arg4
+
+@XX=("%r8","%r10");	# index x: [0] current, [1] next; swapped after every unrolled step
+@TX=("%r9","%r11");	# table byte loaded at x: [0] for current x, [1] for next x
+$YY="%r12";	# index y
+$TY="%r13";	# table byte loaded at y, then reused as the key-stream index
+
+$code=<<___;;
+.text
+
+.globl	RC4
+.type	RC4,\@function
+.align	16
+RC4:	or	$len,$len
+	jne	.Lentry
+	repret
+.Lentry:
+	push	%r12
+	push	%r13
+
+	add	\$2,$dat
+	movzb	-2($dat),$XX[0]#d
+	movzb	-1($dat),$YY#d
+
+	add	\$1,$XX[0]#b
+	movzb	($dat,$XX[0]),$TX[0]#d
+	test	\$-8,$len
+	jz	.Lcloop1
+	push	%rbx
+.align	16	# incidentally aligned already
+.Lcloop8:
+	mov	($inp),%eax
+	mov	4($inp),%ebx
+___
+# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
+for ($i=0;$i<4;$i++) {
+$code.=<<___;
+	add	$TX[0]#b,$YY#b
+	lea	1($XX[0]),$XX[1]
+	movzb	($dat,$YY),$TY#d
+	movzb	$XX[1]#b,$XX[1]#d
+	movzb	($dat,$XX[1]),$TX[1]#d
+	movb	$TX[0]#b,($dat,$YY)
+	cmp	$XX[1],$YY
+	movb	$TY#b,($dat,$XX[0])
+	jne	.Lcmov$i		# Intel cmov is sloooow...
+	mov	$TX[0],$TX[1]
+.Lcmov$i:
+	add	$TX[0]#b,$TY#b
+	xor	($dat,$TY),%al
+	ror	\$8,%eax
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
+}
+for ($i=4;$i<8;$i++) {
+$code.=<<___;
+	add	$TX[0]#b,$YY#b
+	lea	1($XX[0]),$XX[1]
+	movzb	($dat,$YY),$TY#d
+	movzb	$XX[1]#b,$XX[1]#d
+	movzb	($dat,$XX[1]),$TX[1]#d
+	movb	$TX[0]#b,($dat,$YY)
+	cmp	$XX[1],$YY
+	movb	$TY#b,($dat,$XX[0])
+	jne	.Lcmov$i		# Intel cmov is sloooow...
+	mov	$TX[0],$TX[1]
+.Lcmov$i:
+	add	$TX[0]#b,$TY#b
+	xor	($dat,$TY),%bl
+	ror	\$8,%ebx
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
+}
+$code.=<<___;
+	lea	-8($len),$len
+	mov	%eax,($out)
+	lea	8($inp),$inp
+	mov	%ebx,4($out)
+	lea	8($out),$out
+
+	test	\$-8,$len
+	jnz	.Lcloop8
+	pop	%rbx
+	cmp	\$0,$len
+	jne	.Lcloop1
+.Lexit:
+	sub	\$1,$XX[0]#b
+	movb	$XX[0]#b,-2($dat)
+	movb	$YY#b,-1($dat)
+
+	pop	%r13
+	pop	%r12
+	repret
+
+.align	16
+.Lcloop1:
+	add	$TX[0]#b,$YY#b
+	movzb	($dat,$YY),$TY#d
+	movb	$TX[0]#b,($dat,$YY)
+	movb	$TY#b,($dat,$XX[0])
+	add	$TX[0]#b,$TY#b
+	add	\$1,$XX[0]#b
+	movzb	($dat,$TY),$TY#d
+	movzb	($dat,$XX[0]),$TX[0]#d
+	xorb	($inp),$TY#b
+	lea	1($inp),$inp
+	movb	$TY#b,($out)
+	lea	1($out),$out
+	sub	\$1,$len
+	jnz	.Lcloop1
+	jmp	.Lexit
+.size	RC4,.-RC4
+___
+
+$code =~ s/#([bwd])/$1/gm;	# "%r8#d" -> "%r8d": drop the "#" in front of the b/w/d register-size suffix
+
+$code =~ s/repret/.byte\t0xF3,0xC3/gm;	# spell the "repret" placeholder as raw bytes F3 C3
+
+print $code;	# STDOUT was reopened onto $output above
diff --git a/src/lib/libcrypto/rc4/rc4.h b/src/lib/libcrypto/rc4/rc4.h
index dd90d9fde0..ae0cea75b8 100644
--- a/src/lib/libcrypto/rc4/rc4.h
+++ b/src/lib/libcrypto/rc4/rc4.h
@@ -73,10 +73,6 @@ typedef struct rc4_key_st
 	{
 	RC4_INT x,y;
 	RC4_INT data[256];
-#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
-	/* see crypto/rc4/asm/rc4-ia64.S for further details... */
-	RC4_INT pad[512-256-2];
-#endif
 	} RC4_KEY;
 
 
diff --git a/src/lib/libcrypto/rc4/rc4_enc.c b/src/lib/libcrypto/rc4/rc4_enc.c
index 81a97ea3b7..d5f18a3a70 100644
--- a/src/lib/libcrypto/rc4/rc4_enc.c
+++ b/src/lib/libcrypto/rc4/rc4_enc.c
@@ -77,10 +77,6 @@ void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata,
 	x=key->x;
 	y=key->y;
 	d=key->data;
-#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
-	/* see crypto/rc4/asm/rc4-ia64.S for further details... */
-	d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1));
-#endif
 
 #if defined(RC4_CHUNK)
 	/*
diff --git a/src/lib/libcrypto/rc4/rc4_skey.c b/src/lib/libcrypto/rc4/rc4_skey.c
index 07234f061a..60510624fd 100644
--- a/src/lib/libcrypto/rc4/rc4_skey.c
+++ b/src/lib/libcrypto/rc4/rc4_skey.c
@@ -58,6 +58,7 @@
 
 #include
 #include
+#include
 #include "rc4_locl.h"
 #include
 
@@ -94,10 +95,6 @@ FIPS_NON_FIPS_VCIPHER_Init(RC4)
 	unsigned int i;
 
 	d= &(key->data[0]);
-#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64)
-	/* see crypto/rc4/asm/rc4-ia64.S for further details... */
-	d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1));
-#endif
 
 	for (i=0; i<256; i++)
 		d[i]=i;
-- 
cgit v1.2.3-55-g6feb