diff options
Diffstat (limited to 'src/lib/libcrypto/rc4/asm/rc4-586.pl')
| -rw-r--r-- | src/lib/libcrypto/rc4/asm/rc4-586.pl | 162 |
1 files changed, 11 insertions, 151 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl index 5c9ac6ad28..38a44a70ef 100644 --- a/src/lib/libcrypto/rc4/asm/rc4-586.pl +++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl | |||
| @@ -28,34 +28,6 @@ | |||
| 28 | # | 28 | # |
| 29 | # <appro@fy.chalmers.se> | 29 | # <appro@fy.chalmers.se> |
| 30 | 30 | ||
| 31 | # May 2011 | ||
| 32 | # | ||
| 33 | # Optimize for Core2 and Westmere [and incidentally Opteron]. Current | ||
| 34 | # performance in cycles per processed byte (less is better) and | ||
| 35 | # improvement relative to previous version of this module is: | ||
| 36 | # | ||
| 37 | # Pentium 10.2 # original numbers | ||
| 38 | # Pentium III 7.8(*) | ||
| 39 | # Intel P4 7.5 | ||
| 40 | # | ||
| 41 | # Opteron 6.1/+20% # new MMX numbers | ||
| 42 | # Core2 5.3/+67%(**) | ||
| 43 | # Westmere 5.1/+94%(**) | ||
| 44 | # Sandy Bridge 5.0/+8% | ||
| 45 | # Atom 12.6/+6% | ||
| 46 | # | ||
| 47 | # (*) PIII can actually deliver 6.6 cycles per byte with MMX code, | ||
| 48 | # but this specific code performs poorly on Core2. And vice | ||
| 49 | # versa, below MMX/SSE code delivering 5.8/7.1 on Core2 performs | ||
| 50 | # poorly on PIII, at 8.0/14.5:-( As PIII is not a "hot" CPU | ||
| 51 | # [anymore], I chose to discard PIII-specific code path and opt | ||
| 52 | # for original IALU-only code, which is why MMX/SSE code path | ||
| 53 | # is guarded by SSE2 bit (see below), not MMX/SSE. | ||
| 54 | # (**) Performance vs. block size on Core2 and Westmere had a maximum | ||
| 55 | # at ... 64 bytes block size. And it was quite a maximum, 40-60% | ||
| 56 | # in comparison to largest 8KB block size. Above improvement | ||
| 57 | # coefficients are for the largest block size. | ||
| 58 | |||
| 59 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | 31 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; |
| 60 | push(@INC,"${dir}","${dir}../../perlasm"); | 32 | push(@INC,"${dir}","${dir}../../perlasm"); |
| 61 | require "x86asm.pl"; | 33 | require "x86asm.pl"; |
| @@ -90,68 +62,6 @@ sub RC4_loop { | |||
| 90 | &$func ($out,&DWP(0,$dat,$ty,4)); | 62 | &$func ($out,&DWP(0,$dat,$ty,4)); |
| 91 | } | 63 | } |
| 92 | 64 | ||
| 93 | if ($alt=0) { | ||
| 94 | # >20% faster on Atom and Sandy Bridge[!], 8% faster on Opteron, | ||
| 95 | # but ~40% slower on Core2 and Westmere... Attempt to add movz | ||
| 96 | # brings down Opteron by 25%, Atom and Sandy Bridge by 15%, yet | ||
| 97 | # on Core2 with movz it's almost 20% slower than below alternative | ||
| 98 | # code... Yes, it's a total mess... | ||
| 99 | my @XX=($xx,$out); | ||
| 100 | $RC4_loop_mmx = sub { # SSE actually... | ||
| 101 | my $i=shift; | ||
| 102 | my $j=$i<=0?0:$i>>1; | ||
| 103 | my $mm=$i<=0?"mm0":"mm".($i&1); | ||
| 104 | |||
| 105 | &add (&LB($yy),&LB($tx)); | ||
| 106 | &lea (@XX[1],&DWP(1,@XX[0])); | ||
| 107 | &pxor ("mm2","mm0") if ($i==0); | ||
| 108 | &psllq ("mm1",8) if ($i==0); | ||
| 109 | &and (@XX[1],0xff); | ||
| 110 | &pxor ("mm0","mm0") if ($i<=0); | ||
| 111 | &mov ($ty,&DWP(0,$dat,$yy,4)); | ||
| 112 | &mov (&DWP(0,$dat,$yy,4),$tx); | ||
| 113 | &pxor ("mm1","mm2") if ($i==0); | ||
| 114 | &mov (&DWP(0,$dat,$XX[0],4),$ty); | ||
| 115 | &add (&LB($ty),&LB($tx)); | ||
| 116 | &movd (@XX[0],"mm7") if ($i==0); | ||
| 117 | &mov ($tx,&DWP(0,$dat,@XX[1],4)); | ||
| 118 | &pxor ("mm1","mm1") if ($i==1); | ||
| 119 | &movq ("mm2",&QWP(0,$inp)) if ($i==1); | ||
| 120 | &movq (&QWP(-8,(@XX[0],$inp)),"mm1") if ($i==0); | ||
| 121 | &pinsrw ($mm,&DWP(0,$dat,$ty,4),$j); | ||
| 122 | |||
| 123 | push (@XX,shift(@XX)) if ($i>=0); | ||
| 124 | } | ||
| 125 | } else { | ||
| 126 | # Using pinsrw here improves performance on Intel CPUs by 2-3%, but | ||
| 127 | # brings down AMD by 7%... | ||
| 128 | $RC4_loop_mmx = sub { | ||
| 129 | my $i=shift; | ||
| 130 | |||
| 131 | &add (&LB($yy),&LB($tx)); | ||
| 132 | &psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1); | ||
| 133 | &mov ($ty,&DWP(0,$dat,$yy,4)); | ||
| 134 | &mov (&DWP(0,$dat,$yy,4),$tx); | ||
| 135 | &mov (&DWP(0,$dat,$xx,4),$ty); | ||
| 136 | &inc ($xx); | ||
| 137 | &add ($ty,$tx); | ||
| 138 | &movz ($xx,&LB($xx)); # (*) | ||
| 139 | &movz ($ty,&LB($ty)); # (*) | ||
| 140 | &pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0); | ||
| 141 | &movq ("mm0",&QWP(0,$inp)) if ($i<=0); | ||
| 142 | &movq (&QWP(-8,($out,$inp)),"mm2") if ($i==0); | ||
| 143 | &mov ($tx,&DWP(0,$dat,$xx,4)); | ||
| 144 | &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4)); | ||
| 145 | |||
| 146 | # (*) This is the key to Core2 and Westmere performance. | ||
| 147 | # Without movz out-of-order execution logic confuses | ||
| 148 | # itself and fails to reorder loads and stores. Problem | ||
| 149 | # appears to be fixed in Sandy Bridge... | ||
| 150 | } | ||
| 151 | } | ||
| 152 | |||
| 153 | &external_label("OPENSSL_ia32cap_P"); | ||
| 154 | |||
| 155 | # void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out); | 65 | # void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out); |
| 156 | &function_begin("RC4"); | 66 | &function_begin("RC4"); |
| 157 | &mov ($dat,&wparam(0)); # load key schedule pointer | 67 | &mov ($dat,&wparam(0)); # load key schedule pointer |
| @@ -184,56 +94,11 @@ if ($alt=0) { | |||
| 184 | &and ($ty,-4); # how many 4-byte chunks? | 94 | &and ($ty,-4); # how many 4-byte chunks? |
| 185 | &jz (&label("loop1")); | 95 | &jz (&label("loop1")); |
| 186 | 96 | ||
| 187 | &test ($ty,-8); | ||
| 188 | &mov (&wparam(3),$out); # $out as accumulator in these loops | ||
| 189 | &jz (&label("go4loop4")); | ||
| 190 | |||
| 191 | &picmeup($out,"OPENSSL_ia32cap_P"); | ||
| 192 | &bt (&DWP(0,$out),26); # check SSE2 bit [could have been MMX] | ||
| 193 | &jnc (&label("go4loop4")); | ||
| 194 | |||
| 195 | &mov ($out,&wparam(3)) if (!$alt); | ||
| 196 | &movd ("mm7",&wparam(3)) if ($alt); | ||
| 197 | &and ($ty,-8); | ||
| 198 | &lea ($ty,&DWP(-8,$inp,$ty)); | ||
| 199 | &mov (&DWP(-4,$dat),$ty); # save input+(len/8)*8-8 | ||
| 200 | |||
| 201 | &$RC4_loop_mmx(-1); | ||
| 202 | &jmp(&label("loop_mmx_enter")); | ||
| 203 | |||
| 204 | &set_label("loop_mmx",16); | ||
| 205 | &$RC4_loop_mmx(0); | ||
| 206 | &set_label("loop_mmx_enter"); | ||
| 207 | for ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); } | ||
| 208 | &mov ($ty,$yy); | ||
| 209 | &xor ($yy,$yy); # this is second key to Core2 | ||
| 210 | &mov (&LB($yy),&LB($ty)); # and Westmere performance... | ||
| 211 | &cmp ($inp,&DWP(-4,$dat)); | ||
| 212 | &lea ($inp,&DWP(8,$inp)); | ||
| 213 | &jb (&label("loop_mmx")); | ||
| 214 | |||
| 215 | if ($alt) { | ||
| 216 | &movd ($out,"mm7"); | ||
| 217 | &pxor ("mm2","mm0"); | ||
| 218 | &psllq ("mm1",8); | ||
| 219 | &pxor ("mm1","mm2"); | ||
| 220 | &movq (&QWP(-8,$out,$inp),"mm1"); | ||
| 221 | } else { | ||
| 222 | &psllq ("mm1",56); | ||
| 223 | &pxor ("mm2","mm1"); | ||
| 224 | &movq (&QWP(-8,$out,$inp),"mm2"); | ||
| 225 | } | ||
| 226 | &emms (); | ||
| 227 | |||
| 228 | &cmp ($inp,&wparam(1)); # compare to input+len | ||
| 229 | &je (&label("done")); | ||
| 230 | &jmp (&label("loop1")); | ||
| 231 | |||
| 232 | &set_label("go4loop4",16); | ||
| 233 | &lea ($ty,&DWP(-4,$inp,$ty)); | 97 | &lea ($ty,&DWP(-4,$inp,$ty)); |
| 234 | &mov (&wparam(2),$ty); # save input+(len/4)*4-4 | 98 | &mov (&wparam(2),$ty); # save input+(len/4)*4-4 |
| 99 | &mov (&wparam(3),$out); # $out as accumulator in this loop | ||
| 235 | 100 | ||
| 236 | &set_label("loop4"); | 101 | &set_label("loop4",16); |
| 237 | for ($i=0;$i<4;$i++) { RC4_loop($i); } | 102 | for ($i=0;$i<4;$i++) { RC4_loop($i); } |
| 238 | &ror ($out,8); | 103 | &ror ($out,8); |
| 239 | &xor ($out,&DWP(0,$inp)); | 104 | &xor ($out,&DWP(0,$inp)); |
| @@ -286,7 +151,7 @@ if ($alt=0) { | |||
| 286 | 151 | ||
| 287 | &set_label("done"); | 152 | &set_label("done"); |
| 288 | &dec (&LB($xx)); | 153 | &dec (&LB($xx)); |
| 289 | &mov (&DWP(-4,$dat),$yy); # save key->y | 154 | &mov (&BP(-4,$dat),&LB($yy)); # save key->y |
| 290 | &mov (&BP(-8,$dat),&LB($xx)); # save key->x | 155 | &mov (&BP(-8,$dat),&LB($xx)); # save key->x |
| 291 | &set_label("abort"); | 156 | &set_label("abort"); |
| 292 | &function_end("RC4"); | 157 | &function_end("RC4"); |
| @@ -299,8 +164,10 @@ $idi="ebp"; | |||
| 299 | $ido="ecx"; | 164 | $ido="ecx"; |
| 300 | $idx="edx"; | 165 | $idx="edx"; |
| 301 | 166 | ||
| 167 | &external_label("OPENSSL_ia32cap_P"); | ||
| 168 | |||
| 302 | # void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data); | 169 | # void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data); |
| 303 | &function_begin("private_RC4_set_key"); | 170 | &function_begin("RC4_set_key"); |
| 304 | &mov ($out,&wparam(0)); # load key | 171 | &mov ($out,&wparam(0)); # load key |
| 305 | &mov ($idi,&wparam(1)); # load len | 172 | &mov ($idi,&wparam(1)); # load len |
| 306 | &mov ($inp,&wparam(2)); # load data | 173 | &mov ($inp,&wparam(2)); # load data |
| @@ -378,7 +245,7 @@ $idx="edx"; | |||
| 378 | &xor ("eax","eax"); | 245 | &xor ("eax","eax"); |
| 379 | &mov (&DWP(-8,$out),"eax"); # key->x=0; | 246 | &mov (&DWP(-8,$out),"eax"); # key->x=0; |
| 380 | &mov (&DWP(-4,$out),"eax"); # key->y=0; | 247 | &mov (&DWP(-4,$out),"eax"); # key->y=0; |
| 381 | &function_end("private_RC4_set_key"); | 248 | &function_end("RC4_set_key"); |
| 382 | 249 | ||
| 383 | # const char *RC4_options(void); | 250 | # const char *RC4_options(void); |
| 384 | &function_begin_B("RC4_options"); | 251 | &function_begin_B("RC4_options"); |
| @@ -387,21 +254,14 @@ $idx="edx"; | |||
| 387 | &blindpop("eax"); | 254 | &blindpop("eax"); |
| 388 | &lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax")); | 255 | &lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax")); |
| 389 | &picmeup("edx","OPENSSL_ia32cap_P"); | 256 | &picmeup("edx","OPENSSL_ia32cap_P"); |
| 390 | &mov ("edx",&DWP(0,"edx")); | 257 | &bt (&DWP(0,"edx"),20); |
| 391 | &bt ("edx",20); | 258 | &jnc (&label("skip")); |
| 392 | &jc (&label("1xchar")); | 259 | &add ("eax",12); |
| 393 | &bt ("edx",26); | 260 | &set_label("skip"); |
| 394 | &jnc (&label("ret")); | ||
| 395 | &add ("eax",25); | ||
| 396 | &ret (); | ||
| 397 | &set_label("1xchar"); | ||
| 398 | &add ("eax",12); | ||
| 399 | &set_label("ret"); | ||
| 400 | &ret (); | 261 | &ret (); |
| 401 | &set_label("opts",64); | 262 | &set_label("opts",64); |
| 402 | &asciz ("rc4(4x,int)"); | 263 | &asciz ("rc4(4x,int)"); |
| 403 | &asciz ("rc4(1x,char)"); | 264 | &asciz ("rc4(1x,char)"); |
| 404 | &asciz ("rc4(8x,mmx)"); | ||
| 405 | &asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>"); | 265 | &asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>"); |
| 406 | &align (64); | 266 | &align (64); |
| 407 | &function_end_B("RC4_options"); | 267 | &function_end_B("RC4_options"); |
