From 5a3c0a05c7f2c5d3c584b7c8d6aec836dd724c80 Mon Sep 17 00:00:00 2001 From: djm <> Date: Sat, 6 Sep 2008 12:15:56 +0000 Subject: import of OpenSSL 0.9.8h --- src/lib/libcrypto/rc4/asm/rc4-586.pl | 11 +- src/lib/libcrypto/rc4/asm/rc4-x86_64.pl | 286 ++++++++++++++++++++++++++++---- src/lib/libcrypto/rc4/rc4.h | 6 +- src/lib/libcrypto/rc4/rc4_enc.c | 4 +- src/lib/libcrypto/rc4/rc4_skey.c | 55 ++++-- 5 files changed, 302 insertions(+), 60 deletions(-) (limited to 'src/lib/libcrypto/rc4') diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl index d6e98f0811..ef7eee766c 100644 --- a/src/lib/libcrypto/rc4/asm/rc4-586.pl +++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl @@ -200,22 +200,23 @@ sub RC4 &lea ($ty,&DWP(0,$in,$ty)); &mov (&swtmp(2),$ty); + &movz ($tx,&BP(0,$d,$x)); # strangely enough unrolled loop performs over 20% slower... &set_label("RC4_CHAR_loop"); - &movz ($tx,&BP(0,$d,$x)); &add (&LB($y),&LB($tx)); &movz ($ty,&BP(0,$d,$y)); &movb (&BP(0,$d,$y),&LB($tx)); &movb (&BP(0,$d,$x),&LB($ty)); &add (&LB($ty),&LB($tx)); &movz ($ty,&BP(0,$d,$ty)); + &add (&LB($x),1); &xorb (&LB($ty),&BP(0,$in)); - &movb (&BP(0,$out),&LB($ty)); - &inc (&LB($x)); - &inc ($in); - &inc ($out); + &lea ($in,&DWP(1,$in)); + &movz ($tx,&BP(0,$d,$x)); &cmp ($in,&swtmp(2)); + &movb (&BP(0,$out),&LB($ty)); + &lea ($out,&DWP(1,$out)); &jb (&label("RC4_CHAR_loop")); &set_label("finished"); diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl index b628daca70..2d47320485 100755 --- a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl +++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl @@ -2,29 +2,70 @@ # # ==================================================================== # Written by Andy Polyakov for the OpenSSL -# project. Rights for redistribution and usage in source and binary -# forms are granted according to the OpenSSL license. +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # -# Unlike 0.9.7f this code expects RC4_CHAR back in config line! See -# commentary section in corresponding script in development branch -# for background information about this option carousel. For those -# who don't have energy to figure out these gory details, here is -# basis in form of performance matrix relative to the original -# 0.9.7e C code-base: -# -# 0.9.7e 0.9.7f this -# AMD64 1x 3.3x 2.4x -# EM64T 1x 0.8x 1.5x -# -# In other words idea is to trade -25% AMD64 performance to compensate -# for deterioration and gain +90% on EM64T core. Development branch -# maintains best performance for either target, i.e. 3.3x for AMD64 -# and 1.5x for EM64T. +# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in +# "hand-coded assembler"] doesn't stand for the whole improvement +# coefficient. It turned out that eliminating RC4_CHAR from config +# line results in ~40% improvement (yes, even for C implementation). +# Presumably it has everything to do with AMD cache architecture and +# RAW or whatever penalties. Once again! The module *requires* config +# line *without* RC4_CHAR! As for coding "secret," I bet on partial +# register arithmetics. For example instead of 'inc %r8; and $255,%r8' +# I simply 'inc %r8b'. Even though optimization manual discourages +# to operate on partial registers, it turned out to be the best bet. +# At least for AMD... How IA32E would perform remains to be seen... + +# As was shown by Marc Bevand reordering of couple of load operations +# results in even higher performance gain of 3.3x:-) At least on +# Opteron... For reference, 1x in this case is RC4_CHAR C-code +# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock. +# Latter means that if you want to *estimate* what to expect from +# *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz. + +# Intel P4 EM64T core was found to run the AMD64 code really slow... +# The only way to achieve comparable performance on P4 was to keep +# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to +# compose blended code, which would perform even within 30% marginal +# on either AMD and Intel platforms, I implement both cases. See +# rc4_skey.c for further details... + +# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing +# those with add/sub results in 50% performance improvement of folded +# loop... + +# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T +# performance by >30% [unlike P4 32-bit case that is]. But this is +# provided that loads are reordered even more aggressively! Both code +# pathes, AMD64 and EM64T, reorder loads in essentially same manner +# as my IA-64 implementation. On Opteron this resulted in modest 5% +# improvement [I had to test it], while final Intel P4 performance +# achieves respectful 432MBps on 2.8GHz processor now. For reference. +# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than +# RC4_INT code-path. While if executed on Opteron, it's only 25% +# slower than the RC4_INT one [meaning that if CPU µ-arch detection +# is not implemented, then this final RC4_CHAR code-path should be +# preferred, as it provides better *all-round* performance]. + +# Intel Core2 was observed to perform poorly on both code paths:-( It +# apparently suffers from some kind of partial register stall, which +# occurs in 64-bit mode only [as virtually identical 32-bit loop was +# observed to outperform 64-bit one by almost 50%]. Adding two movzb to +# cloop1 boosts its performance by 80%! This loop appears to be optimal +# fit for Core2 and therefore the code was modified to skip cloop8 on +# this CPU. $output=shift; -open STDOUT,">$output" || die "can't open $output: $!"; +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $output"; $dat="%rdi"; # arg1 $len="%rsi"; # arg2 @@ -36,29 +77,101 @@ $out="%rcx"; # arg4 $YY="%r12"; $TY="%r13"; -$code=<<___;; +$code=<<___; .text .globl RC4 -.type RC4,\@function +.type RC4,\@function,4 .align 16 RC4: or $len,$len jne .Lentry - repret + ret .Lentry: push %r12 push %r13 - add \$2,$dat - movzb -2($dat),$XX[0]#d - movzb -1($dat),$YY#d + add \$8,$dat + movl -8($dat),$XX[0]#d + movl -4($dat),$YY#d + cmpl \$-1,256($dat) + je .LRC4_CHAR + inc $XX[0]#b + movl ($dat,$XX[0],4),$TX[0]#d + test \$-8,$len + jz .Lloop1 + jmp .Lloop8 +.align 16 +.Lloop8: +___ +for ($i=0;$i<8;$i++) { +$code.=<<___; + add $TX[0]#b,$YY#b + mov $XX[0],$XX[1] + movl ($dat,$YY,4),$TY#d + ror \$8,%rax # ror is redundant when $i=0 + inc $XX[1]#b + movl ($dat,$XX[1],4),$TX[1]#d + cmp $XX[1],$YY + movl $TX[0]#d,($dat,$YY,4) + cmove $TX[0],$TX[1] + movl $TY#d,($dat,$XX[0],4) + add $TX[0]#b,$TY#b + movb ($dat,$TY,4),%al +___ +push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers +} +$code.=<<___; + ror \$8,%rax + sub \$8,$len + + xor ($inp),%rax + add \$8,$inp + mov %rax,($out) + add \$8,$out + test \$-8,$len + jnz .Lloop8 + cmp \$0,$len + jne .Lloop1 +___ +$code.=<<___; +.Lexit: + sub \$1,$XX[0]#b + movl $XX[0]#d,-8($dat) + movl $YY#d,-4($dat) + + pop %r13 + pop %r12 + ret +.align 16 +.Lloop1: + add $TX[0]#b,$YY#b + movl ($dat,$YY,4),$TY#d + movl $TX[0]#d,($dat,$YY,4) + movl $TY#d,($dat,$XX[0],4) + add $TY#b,$TX[0]#b + inc $XX[0]#b + movl ($dat,$TX[0],4),$TY#d + movl ($dat,$XX[0],4),$TX[0]#d + xorb ($inp),$TY#b + inc $inp + movb $TY#b,($out) + inc $out + dec $len + jnz .Lloop1 + jmp .Lexit + +.align 16 +.LRC4_CHAR: add \$1,$XX[0]#b movzb ($dat,$XX[0]),$TX[0]#d test \$-8,$len jz .Lcloop1 + cmp \$0,260($dat) + jnz .Lcloop1 push %rbx -.align 16 # incidentally aligned already + jmp .Lcloop8 +.align 16 .Lcloop8: mov ($inp),%eax mov 4($inp),%ebx @@ -114,15 +227,9 @@ $code.=<<___; pop %rbx cmp \$0,$len jne .Lcloop1 -.Lexit: - sub \$1,$XX[0]#b - movb $XX[0]#b,-2($dat) - movb $YY#b,-1($dat) - - pop %r13 - pop %r12 - repret - + jmp .Lexit +___ +$code.=<<___; .align 16 .Lcloop1: add $TX[0]#b,$YY#b @@ -131,6 +238,8 @@ $code.=<<___; movb $TY#b,($dat,$XX[0]) add $TX[0]#b,$TY#b add \$1,$XX[0]#b + movzb $TY#b,$TY#d + movzb $XX[0]#b,$XX[0]#d movzb ($dat,$TY),$TY#d movzb ($dat,$XX[0]),$TX[0]#d xorb ($inp),$TY#b @@ -143,8 +252,113 @@ $code.=<<___; .size RC4,.-RC4 ___ -$code =~ s/#([bwd])/$1/gm; +$idx="%r8"; +$ido="%r9"; + +$code.=<<___; +.extern OPENSSL_ia32cap_P +.globl RC4_set_key +.type RC4_set_key,\@function,3 +.align 16 +RC4_set_key: + lea 8($dat),$dat + lea ($inp,$len),$inp + neg $len + mov $len,%rcx + xor %eax,%eax + xor $ido,$ido + xor %r10,%r10 + xor %r11,%r11 -$code =~ s/repret/.byte\t0xF3,0xC3/gm; + mov OPENSSL_ia32cap_P(%rip),$idx#d + bt \$20,$idx#d + jnc .Lw1stloop + bt \$30,$idx#d + setc $ido#b + mov $ido#d,260($dat) + jmp .Lc1stloop + +.align 16 +.Lw1stloop: + mov %eax,($dat,%rax,4) + add \$1,%al + jnc .Lw1stloop + + xor $ido,$ido + xor $idx,$idx +.align 16 +.Lw2ndloop: + mov ($dat,$ido,4),%r10d + add ($inp,$len,1),$idx#b + add %r10b,$idx#b + add \$1,$len + mov ($dat,$idx,4),%r11d + cmovz %rcx,$len + mov %r10d,($dat,$idx,4) + mov %r11d,($dat,$ido,4) + add \$1,$ido#b + jnc .Lw2ndloop + jmp .Lexit_key + +.align 16 +.Lc1stloop: + mov %al,($dat,%rax) + add \$1,%al + jnc .Lc1stloop + + xor $ido,$ido + xor $idx,$idx +.align 16 +.Lc2ndloop: + mov ($dat,$ido),%r10b + add ($inp,$len),$idx#b + add %r10b,$idx#b + add \$1,$len + mov ($dat,$idx),%r11b + jnz .Lcnowrap + mov %rcx,$len +.Lcnowrap: + mov %r10b,($dat,$idx) + mov %r11b,($dat,$ido) + add \$1,$ido#b + jnc .Lc2ndloop + movl \$-1,256($dat) + +.align 16 +.Lexit_key: + xor %eax,%eax + mov %eax,-8($dat) + mov %eax,-4($dat) + ret +.size RC4_set_key,.-RC4_set_key + +.globl RC4_options +.type RC4_options,\@function,0 +.align 16 +RC4_options: + .picmeup %rax + lea .Lopts-.(%rax),%rax + mov OPENSSL_ia32cap_P(%rip),%edx + bt \$20,%edx + jnc .Ldone + add \$12,%rax + bt \$30,%edx + jnc .Ldone + add \$13,%rax +.Ldone: + ret +.align 64 +.Lopts: +.asciz "rc4(8x,int)" +.asciz "rc4(8x,char)" +.asciz "rc4(1x,char)" +.asciz "RC4 for x86_64, CRYPTOGAMS by " +.align 64 +.size RC4_options,.-RC4_options +___ + +$code =~ s/#([bwd])/$1/gm; print $code; + +close STDOUT; diff --git a/src/lib/libcrypto/rc4/rc4.h b/src/lib/libcrypto/rc4/rc4.h index ae0cea75b8..7aec04fe93 100644 --- a/src/lib/libcrypto/rc4/rc4.h +++ b/src/lib/libcrypto/rc4/rc4.h @@ -59,12 +59,11 @@ #ifndef HEADER_RC4_H #define HEADER_RC4_H +#include /* OPENSSL_NO_RC4, RC4_INT */ #ifdef OPENSSL_NO_RC4 #error RC4 is disabled. #endif -#include /* RC4_INT */ - #ifdef __cplusplus extern "C" { #endif @@ -77,9 +76,6 @@ typedef struct rc4_key_st const char *RC4_options(void); -#ifdef OPENSSL_FIPS -void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data); -#endif void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data); void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata, unsigned char *outdata); diff --git a/src/lib/libcrypto/rc4/rc4_enc.c b/src/lib/libcrypto/rc4/rc4_enc.c index d5f18a3a70..0660ea60a2 100644 --- a/src/lib/libcrypto/rc4/rc4_enc.c +++ b/src/lib/libcrypto/rc4/rc4_enc.c @@ -157,7 +157,7 @@ void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata, if (!is_endian.little) { /* BIG-ENDIAN CASE */ # define BESHFT(c) (((sizeof(RC4_CHUNK)-(c)-1)*8)&(sizeof(RC4_CHUNK)*8-1)) - for (;len&-sizeof(RC4_CHUNK);len-=sizeof(RC4_CHUNK)) + for (;len&~(sizeof(RC4_CHUNK)-1);len-=sizeof(RC4_CHUNK)) { ichunk = *(RC4_CHUNK *)indata; otp = RC4_STEP< -#include -#include #include "rc4_locl.h" #include -const char *RC4_version="RC4" OPENSSL_VERSION_PTEXT; +const char RC4_version[]="RC4" OPENSSL_VERSION_PTEXT; const char *RC4_options(void) { @@ -87,7 +85,7 @@ const char *RC4_options(void) * Date: Wed, 14 Sep 1994 06:35:31 GMT */ -FIPS_NON_FIPS_VCIPHER_Init(RC4) +void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data) { register RC4_INT tmp; register int id1,id2; @@ -95,26 +93,59 @@ FIPS_NON_FIPS_VCIPHER_Init(RC4) unsigned int i; d= &(key->data[0]); - - for (i=0; i<256; i++) - d[i]=i; key->x = 0; key->y = 0; id1=id2=0; -#define SK_LOOP(n) { \ +#define SK_LOOP(d,n) { \ tmp=d[(n)]; \ id2 = (data[id1] + tmp + id2) & 0xff; \ if (++id1 == len) id1=0; \ d[(n)]=d[id2]; \ d[id2]=tmp; } +#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) +# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__INTEL__) || \ + defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) + if (sizeof(RC4_INT) > 1) { + /* + * Unlike all other x86 [and x86_64] implementations, + * Intel P4 core [including EM64T] was found to perform + * poorly with wider RC4_INT. Performance improvement + * for IA-32 hand-coded assembler turned out to be 2.8x + * if re-coded for RC4_CHAR! It's however inappropriate + * to just switch to RC4_CHAR for x86[_64], as non-P4 + * implementations suffer from significant performance + * losses then, e.g. PIII exhibits >2x deterioration, + * and so does Opteron. In order to assure optimal + * all-round performance, we detect P4 at run-time by + * checking upon reserved bit 20 in CPU capability + * vector and set up compressed key schedule, which is + * recognized by correspondingly updated assembler + * module... Bit 20 is set up by OPENSSL_ia32_cpuid. + * + * + */ + if (OPENSSL_ia32cap_P & (1<<20)) { + unsigned char *cp=(unsigned char *)d; + + for (i=0;i<256;i++) cp[i]=i; + for (i=0;i<256;i++) SK_LOOP(cp,i); + /* mark schedule as compressed! */ + d[256/sizeof(RC4_INT)]=-1; + return; + } + } +# endif +#endif + for (i=0; i < 256; i++) d[i]=i; for (i=0; i < 256; i+=4) { - SK_LOOP(i+0); - SK_LOOP(i+1); - SK_LOOP(i+2); - SK_LOOP(i+3); + SK_LOOP(d,i+0); + SK_LOOP(d,i+1); + SK_LOOP(d,i+2); + SK_LOOP(d,i+3); } } -- cgit v1.2.3-55-g6feb