1 files changed, 244 insertions, 204 deletions
diff --git a/src/lib/libssl/src/crypto/rc4/asm/rc4-586.pl b/src/lib/libssl/src/crypto/rc4/asm/rc4-586.pl
index ef7eee766c..38a44a70ef 100644
--- a/src/lib/libssl/src/crypto/rc4/asm/rc4-586.pl
+++ b/src/lib/libssl/src/crypto/rc4/asm/rc4-586.pl
@@ -1,14 +1,21 @@
-#!/usr/local/bin/perl
+#!/usr/bin/env perl
+# ====================================================================
+# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
 # At some point it became apparent that the original SSLeay RC4
-# assembler implementation performs suboptimaly on latest IA-32
+# assembler implementation performs suboptimally on latest IA-32
 # microarchitectures. After re-tuning performance has changed as
 # following:
 #
-# Pentium       +0%
+# Pentium       -10%
-# Pentium III   +17%
+# Pentium III   +12%
-# AMD           +52%(*)
+# AMD           +50%(*)
-# P4            +180%(**)
+# P4            +250%(**)
 #
 # (*)   This number is actually a trade-off:-) It's possible to
 #       achieve +72%, but at the cost of -48% off PIII performance.
@@ -17,214 +24,247 @@
 #       For reference! This code delivers ~80% of rc4-amd64.pl
 #       performance on the same Opteron machine.
 # (**)  This number requires compressed key schedule set up by
-#       RC4_set_key and therefore doesn't apply to 0.9.7 [option for
+#       RC4_set_key [see commentary below for further details].
-#       compressed key schedule is implemented in 0.9.8 and later,
-#       see commentary section in rc4_skey.c for further details].
 #
 #                                       <appro@fy.chalmers.se>
-push(@INC,"perlasm","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 &asm_init($ARGV[0],"rc4-586.pl");
-$x="eax";
+$xx="eax";
-$y="ebx";
+$yy="ebx";
 $tx="ecx";
 $ty="edx";
-$in="esi";
+$inp="esi";
-$out="edi";
+$out="ebp";
-$d="ebp";
+$dat="edi";
-&RC4("RC4");
+sub RC4_loop {
+  my $i=shift;
-&asm_finish();
+  my $func = ($i==0)?*mov:*or;
-sub RC4_loop
+        &add    (&LB($yy),&LB($tx));
-        {
+        &mov    ($ty,&DWP(0,$dat,$yy,4));
-        local($n,$p,$char)=@_;
+        &mov    (&DWP(0,$dat,$yy,4),$tx);
+        &mov    (&DWP(0,$dat,$xx,4),$ty);
-        &comment("Round $n");
+        &add    ($ty,$tx);
+        &inc    (&LB($xx));
-        if ($char)
+        &and    ($ty,0xff);
-                {
+        &ror    ($out,8)        if ($i!=0);
-                if ($p >= 0)
+        if ($i<3) {
-                        {
+          &mov  ($tx,&DWP(0,$dat,$xx,4));
-                         &mov($ty,      &swtmp(2));
+        } else {
-                        &cmp($ty,       $in);
+          &mov  ($tx,&wparam(3));       # reload [re-biased] out
-                         &jbe(&label("finished"));
-                        &inc($in);
-                        }
-                else
-                        {
-                        &add($ty,       8);
-                         &inc($in);
-                        &cmp($ty,       $in);
-                         &jb(&label("finished"));
-                        &mov(&swtmp(2), $ty);
-                        }
-                }
-        # Moved out
-        # &mov( $tx,            &DWP(0,$d,$x,4)) if $p < 0;
-        &add(   &LB($y),        &LB($tx));
-        &mov(   $ty,            &DWP(0,$d,$y,4));
-         # XXX
-        &mov(   &DWP(0,$d,$x,4),$ty);
-         &add(  $ty,            $tx);
-        &mov(   &DWP(0,$d,$y,4),$tx);
-         &and(  $ty,            0xff);
-         &inc(  &LB($x));                       # NEXT ROUND
-        &mov(   $tx,            &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
-         &mov(  $ty,            &DWP(0,$d,$ty,4));
-        if (!$char)
-                {
-                #moved up into last round
-                if ($p >= 1)
-                        {
-                        &add(   $out,   8)
-                        }
-                &movb(  &BP($n,"esp","",0),     &LB($ty));
-                }
-        else
-                {
-                # Note in+=8 has occured
-                &movb(  &HB($ty),       &BP(-1,$in,"",0));
-                 # XXX
-                &xorb(&LB($ty),         &HB($ty));
-                 # XXX
-                &movb(&BP($n,$out,"",0),&LB($ty));
-                }
        }
+        &$func  ($out,&DWP(0,$dat,$ty,4));
+}
-sub RC4
-        {
+# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
-        local($name)=@_;
+&function_begin("RC4");
+        &mov    ($dat,&wparam(0));      # load key schedule pointer
-        &function_begin_B($name,"");
+        &mov    ($ty, &wparam(1));      # load len
+        &mov    ($inp,&wparam(2));      # load inp
-        &mov($ty,&wparam(1));           # len
+        &mov    ($out,&wparam(3));      # load out
-        &cmp($ty,0);
-        &jne(&label("proceed"));
+        &xor    ($xx,$xx);              # avoid partial register stalls
-        &ret();
+        &xor    ($yy,$yy);
-        &set_label("proceed");
+        &cmp    ($ty,0);                # safety net
-        &comment("");
+        &je     (&label("abort"));
-        &push("ebp");
+        &mov    (&LB($xx),&BP(0,$dat)); # load key->x
-         &push("ebx");
+        &mov    (&LB($yy),&BP(4,$dat)); # load key->y
-        &push("esi");
+        &add    ($dat,8);
-         &xor(  $x,     $x);            # avoid partial register stalls
-        &push("edi");
+        &lea    ($tx,&DWP(0,$inp,$ty));
-         &xor(  $y,     $y);            # avoid partial register stalls
+        &sub    ($out,$inp);            # re-bias out
-        &mov(   $d,     &wparam(0));    # key
+        &mov    (&wparam(1),$tx);       # save input+len
-         &mov(  $in,    &wparam(2));
+        &inc    (&LB($xx));
-        &movb(  &LB($x),        &BP(0,$d,"",1));
-         &movb( &LB($y),        &BP(4,$d,"",1));
+        # detect compressed key schedule...
+        &cmp    (&DWP(256,$dat),-1);
-        &mov(   $out,   &wparam(3));
+        &je     (&label("RC4_CHAR"));
-         &inc(  &LB($x));
+        &mov    ($tx,&DWP(0,$dat,$xx,4));
-        &stack_push(3); # 3 temp variables
-         &add(  $d,     8);
+        &and    ($ty,-4);               # how many 4-byte chunks?
+        &jz     (&label("loop1"));
-        # detect compressed schedule, see commentary section in rc4_skey.c...
-        # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
+        &lea    ($ty,&DWP(-4,$inp,$ty));
-        # as compressed key schedule is set up in 0.9.8 and later.
+        &mov    (&wparam(2),$ty);       # save input+(len/4)*4-4
-        &cmp(&DWP(256,$d),-1);
+        &mov    (&wparam(3),$out);      # $out as accumulator in this loop
-        &je(&label("RC4_CHAR"));
+        &set_label("loop4",16);
-         &lea(  $ty,    &DWP(-8,$ty,$in));
+                for ($i=0;$i<4;$i++) { RC4_loop($i); }
+                &ror    ($out,8);
-        # check for 0 length input
+                &xor    ($out,&DWP(0,$inp));
+                &cmp    ($inp,&wparam(2));      # compare to input+(len/4)*4-4
-         &mov(  &swtmp(2),      $ty);   # this is now address to exit at
+                &mov    (&DWP(0,$tx,$inp),$out);# $tx holds re-biased out here
-        &mov(   $tx,    &DWP(0,$d,$x,4));
+                &lea    ($inp,&DWP(4,$inp));
+                &mov    ($tx,&DWP(0,$dat,$xx,4));
-         &cmp(  $ty,    $in);
+        &jb     (&label("loop4"));
-        &jb(    &label("end")); # less than 8 bytes
+        &cmp    ($inp,&wparam(1));      # compare to input+len
-        &set_label("start");
+        &je     (&label("done"));
+        &mov    ($out,&wparam(3));      # restore $out
-        # filling DELAY SLOT
-        &add(   $in,    8);
+        &set_label("loop1",16);
+                &add    (&LB($yy),&LB($tx));
-        &RC4_loop(0,-1,0);
+                &mov    ($ty,&DWP(0,$dat,$yy,4));
-        &RC4_loop(1,0,0);
+                &mov    (&DWP(0,$dat,$yy,4),$tx);
-        &RC4_loop(2,0,0);
+                &mov    (&DWP(0,$dat,$xx,4),$ty);
-        &RC4_loop(3,0,0);
+                &add    ($ty,$tx);
-        &RC4_loop(4,0,0);
+                &inc    (&LB($xx));
-        &RC4_loop(5,0,0);
+                &and    ($ty,0xff);
-        &RC4_loop(6,0,0);
+                &mov    ($ty,&DWP(0,$dat,$ty,4));
-        &RC4_loop(7,1,0);
+                &xor    (&LB($ty),&BP(0,$inp));
-        
+                &lea    ($inp,&DWP(1,$inp));
-        &comment("apply the cipher text");
+                &mov    ($tx,&DWP(0,$dat,$xx,4));
-        # xor the cipher data with input
+                &cmp    ($inp,&wparam(1));      # compare to input+len
+                &mov    (&BP(-1,$out,$inp),&LB($ty));
-        #&add(  $out,   8); #moved up into last round
+        &jb     (&label("loop1"));
-        &mov(   $tx,    &swtmp(0));
+        &jmp    (&label("done"));
-         &mov(  $ty,    &DWP(-8,$in,"",0));
-        &xor(   $tx,    $ty);
+# this is essentially Intel P4 specific codepath...
-         &mov(  $ty,    &DWP(-4,$in,"",0)); 
+&set_label("RC4_CHAR",16);
-        &mov(   &DWP(-8,$out,"",0),     $tx);
+        &movz   ($tx,&BP(0,$dat,$xx));
-         &mov(  $tx,    &swtmp(1));
-        &xor(   $tx,    $ty);
-         &mov(  $ty,    &swtmp(2));     # load end ptr;
-        &mov(   &DWP(-4,$out,"",0),     $tx);
-         &mov(  $tx,            &DWP(0,$d,$x,4));
-        &cmp($in,       $ty);
-         &jbe(&label("start"));
-        &set_label("end");
-        # There is quite a bit of extra crap in RC4_loop() for this
-        # first round
-        &RC4_loop(0,-1,1);
-        &RC4_loop(1,0,1);
-        &RC4_loop(2,0,1);
-        &RC4_loop(3,0,1);
-        &RC4_loop(4,0,1);
-        &RC4_loop(5,0,1);
-        &RC4_loop(6,1,1);
-        &jmp(&label("finished"));
-        &align(16);
-        # this is essentially Intel P4 specific codepath, see rc4_skey.c,
-        # and is engaged in 0.9.8 and later context...
-        &set_label("RC4_CHAR");
-        &lea    ($ty,&DWP(0,$in,$ty));
-        &mov    (&swtmp(2),$ty);
-        &movz   ($tx,&BP(0,$d,$x));
        # strangely enough unrolled loop performs over 20% slower...
-        &set_label("RC4_CHAR_loop");
+        &set_label("cloop1");
-                &add    (&LB($y),&LB($tx));
+                &add    (&LB($yy),&LB($tx));
-                &movz   ($ty,&BP(0,$d,$y));
+                &movz   ($ty,&BP(0,$dat,$yy));
-                &movb   (&BP(0,$d,$y),&LB($tx));
+                &mov    (&BP(0,$dat,$yy),&LB($tx));
-                &movb   (&BP(0,$d,$x),&LB($ty));
+                &mov    (&BP(0,$dat,$xx),&LB($ty));
                &add    (&LB($ty),&LB($tx));
-                &movz   ($ty,&BP(0,$d,$ty));
+                &movz   ($ty,&BP(0,$dat,$ty));
-                &add    (&LB($x),1);
+                &add    (&LB($xx),1);
-                &xorb   (&LB($ty),&BP(0,$in));
+                &xor    (&LB($ty),&BP(0,$inp));
-                &lea    ($in,&DWP(1,$in));
+                &lea    ($inp,&DWP(1,$inp));
-                &movz   ($tx,&BP(0,$d,$x));
+                &movz   ($tx,&BP(0,$dat,$xx));
-                &cmp    ($in,&swtmp(2));
+                &cmp    ($inp,&wparam(1));
-                &movb   (&BP(0,$out),&LB($ty));
+                &mov    (&BP(-1,$out,$inp),&LB($ty));
-                &lea    ($out,&DWP(1,$out));
+        &jb     (&label("cloop1"));
-        &jb     (&label("RC4_CHAR_loop"));
+&set_label("done");
-        &set_label("finished");
+        &dec    (&LB($xx));
-        &dec(   $x);
+        &mov    (&BP(-4,$dat),&LB($yy));        # save key->y
-         &stack_pop(3);
+        &mov    (&BP(-8,$dat),&LB($xx));        # save key->x
-        &movb(  &BP(-4,$d,"",0),&LB($y));
+&set_label("abort");
-         &movb( &BP(-8,$d,"",0),&LB($x));
+&function_end("RC4");
-        &function_end($name);
+########################################################################
-        }
+$inp="esi";
+$out="edi";
+$idi="ebp";
+$ido="ecx";
+$idx="edx";
+&external_label("OPENSSL_ia32cap_P");
+# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
+&function_begin("RC4_set_key");
+        &mov    ($out,&wparam(0));              # load key
+        &mov    ($idi,&wparam(1));              # load len
+        &mov    ($inp,&wparam(2));              # load data
+        &picmeup($idx,"OPENSSL_ia32cap_P");
+        &lea    ($out,&DWP(2*4,$out));          # &key->data
+        &lea    ($inp,&DWP(0,$inp,$idi));       # $inp to point at the end
+        &neg    ($idi);
+        &xor    ("eax","eax");
+        &mov    (&DWP(-4,$out),$idi);           # borrow key->y
+        &bt     (&DWP(0,$idx),20);              # check for bit#20
+        &jc     (&label("c1stloop"));
+&set_label("w1stloop",16);
+        &mov    (&DWP(0,$out,"eax",4),"eax");   # key->data[i]=i;
+        &add    (&LB("eax"),1);                 # i++;
+        &jnc    (&label("w1stloop"));
+        &xor    ($ido,$ido);
+        &xor    ($idx,$idx);
+&set_label("w2ndloop",16);
+        &mov    ("eax",&DWP(0,$out,$ido,4));
+        &add    (&LB($idx),&BP(0,$inp,$idi));
+        &add    (&LB($idx),&LB("eax"));
+        &add    ($idi,1);
+        &mov    ("ebx",&DWP(0,$out,$idx,4));
+        &jnz    (&label("wnowrap"));
+          &mov  ($idi,&DWP(-4,$out));
+        &set_label("wnowrap");
+        &mov    (&DWP(0,$out,$idx,4),"eax");
+        &mov    (&DWP(0,$out,$ido,4),"ebx");
+        &add    (&LB($ido),1);
+        &jnc    (&label("w2ndloop"));
+&jmp    (&label("exit"));
+# Unlike all other x86 [and x86_64] implementations, Intel P4 core
+# [including EM64T] was found to perform poorly with above "32-bit" key
+# schedule, a.k.a. RC4_INT. Performance improvement for IA-32 hand-coded
+# assembler turned out to be 3.5x if re-coded for compressed 8-bit one,
+# a.k.a. RC4_CHAR! It's however inappropriate to just switch to 8-bit
+# schedule for x86[_64], because non-P4 implementations suffer from
+# significant performance losses then, e.g. PIII exhibits >2x
+# deterioration, and so does Opteron. In order to assure optimal
+# all-round performance, we detect P4 at run-time and set up compressed
+# key schedule, which is recognized by RC4 procedure.
+&set_label("c1stloop",16);
+        &mov    (&BP(0,$out,"eax"),&LB("eax")); # key->data[i]=i;
+        &add    (&LB("eax"),1);                 # i++;
+        &jnc    (&label("c1stloop"));
+        &xor    ($ido,$ido);
+        &xor    ($idx,$idx);
+        &xor    ("ebx","ebx");
+&set_label("c2ndloop",16);
+        &mov    (&LB("eax"),&BP(0,$out,$ido));
+        &add    (&LB($idx),&BP(0,$inp,$idi));
+        &add    (&LB($idx),&LB("eax"));
+        &add    ($idi,1);
+        &mov    (&LB("ebx"),&BP(0,$out,$idx));
+        &jnz    (&label("cnowrap"));
+          &mov  ($idi,&DWP(-4,$out));
+        &set_label("cnowrap");
+        &mov    (&BP(0,$out,$idx),&LB("eax"));
+        &mov    (&BP(0,$out,$ido),&LB("ebx"));
+        &add    (&LB($ido),1);
+        &jnc    (&label("c2ndloop"));
+        &mov    (&DWP(256,$out),-1);            # mark schedule as compressed
+&set_label("exit");
+        &xor    ("eax","eax");
+        &mov    (&DWP(-8,$out),"eax");          # key->x=0;
+        &mov    (&DWP(-4,$out),"eax");          # key->y=0;
+&function_end("RC4_set_key");
+# const char *RC4_options(void);
+&function_begin_B("RC4_options");
+        &call   (&label("pic_point"));
+&set_label("pic_point");
+        &blindpop("eax");
+        &lea    ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
+        &picmeup("edx","OPENSSL_ia32cap_P");
+        &bt     (&DWP(0,"edx"),20);
+        &jnc    (&label("skip"));
+          &add  ("eax",12);
+        &set_label("skip");
+        &ret    ();
+&set_label("opts",64);
+&asciz  ("rc4(4x,int)");
+&asciz  ("rc4(1x,char)");
+&asciz  ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
+&align  (64);
+&function_end_B("RC4_options");
+&asm_finish();

diff --git a/src/lib/libssl/src/crypto/rc4/asm/rc4-586.pl b/src/lib/libssl/src/crypto/rc4/asm/rc4-586.pl
index ef7eee766c..38a44a70ef 100644
--- a/src/lib/libssl/src/crypto/rc4/asm/rc4-586.pl
+++ b/src/lib/libssl/src/crypto/rc4/asm/rc4-586.pl
@@ -1,14 +1,21 @@
1	#!/usr/local/bin/perl	1	#!/usr/bin/env perl
		2
		3	# ====================================================================
		4	# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
		5	# project. The module is, however, dual licensed under OpenSSL and
		6	# CRYPTOGAMS licenses depending on where you obtain it. For further
		7	# details see http://www.openssl.org/~appro/cryptogams/.
		8	# ====================================================================
2		9
3	# At some point it became apparent that the original SSLeay RC4	10	# At some point it became apparent that the original SSLeay RC4
4	# assembler implementation performs suboptimaly on latest IA-32	11	# assembler implementation performs suboptimally on latest IA-32
5	# microarchitectures. After re-tuning performance has changed as	12	# microarchitectures. After re-tuning performance has changed as
6	# following:	13	# following:
7	#	14	#
8	# Pentium +0%	15	# Pentium -10%
9	# Pentium III +17%	16	# Pentium III +12%
10	# AMD +52%(*)	17	# AMD +50%(*)
11	# P4 +180%(**)	18	# P4 +250%(**)
12	#	19	#
13	# (*) This number is actually a trade-off:-) It's possible to	20	# (*) This number is actually a trade-off:-) It's possible to
14	# achieve +72%, but at the cost of -48% off PIII performance.	21	# achieve +72%, but at the cost of -48% off PIII performance.
@@ -17,214 +24,247 @@
17	# For reference! This code delivers ~80% of rc4-amd64.pl	24	# For reference! This code delivers ~80% of rc4-amd64.pl
18	# performance on the same Opteron machine.	25	# performance on the same Opteron machine.
19	# (**) This number requires compressed key schedule set up by	26	# (**) This number requires compressed key schedule set up by
20	# RC4_set_key and therefore doesn't apply to 0.9.7 [option for	27	# RC4_set_key [see commentary below for further details].
21	# compressed key schedule is implemented in 0.9.8 and later,
22	# see commentary section in rc4_skey.c for further details].
23	#	28	#
24	# <appro@fy.chalmers.se>	29	# <appro@fy.chalmers.se>
25		30
26	push(@INC,"perlasm","../../perlasm");	31	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
		32	push(@INC,"${dir}","${dir}../../perlasm");
27	require "x86asm.pl";	33	require "x86asm.pl";
28		34
29	&asm_init($ARGV[0],"rc4-586.pl");	35	&asm_init($ARGV[0],"rc4-586.pl");
30		36
31	$x="eax";	37	$xx="eax";
32	$y="ebx";	38	$yy="ebx";
33	$tx="ecx";	39	$tx="ecx";
34	$ty="edx";	40	$ty="edx";
35	$in="esi";	41	$inp="esi";
36	$out="edi";	42	$out="ebp";
37	$d="ebp";	43	$dat="edi";
38		44
39	&RC4("RC4");	45	sub RC4_loop {
40		46	my $i=shift;
41	&asm_finish();	47	my $func = ($i==0)?*mov:*or;
42		48
43	sub RC4_loop	49	&add (&LB($yy),&LB($tx));
44	{	50	&mov ($ty,&DWP(0,$dat,$yy,4));
45	local($n,$p,$char)=@_;	51	&mov (&DWP(0,$dat,$yy,4),$tx);
46		52	&mov (&DWP(0,$dat,$xx,4),$ty);
47	&comment("Round $n");	53	&add ($ty,$tx);
48		54	&inc (&LB($xx));
49	if ($char)	55	&and ($ty,0xff);
50	{	56	&ror ($out,8) if ($i!=0);
51	if ($p >= 0)	57	if ($i<3) {
52	{	58	&mov ($tx,&DWP(0,$dat,$xx,4));
53	&mov($ty, &swtmp(2));	59	} else {
54	&cmp($ty, $in);	60	&mov ($tx,&wparam(3)); # reload [re-biased] out
55	&jbe(&label("finished"));
56	&inc($in);
57	}
58	else
59	{
60	&add($ty, 8);
61	&inc($in);
62	&cmp($ty, $in);
63	&jb(&label("finished"));
64	&mov(&swtmp(2), $ty);
65	}
66	}
67	# Moved out
68	# &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0;
69
70	&add( &LB($y), &LB($tx));
71	&mov( $ty, &DWP(0,$d,$y,4));
72	# XXX
73	&mov( &DWP(0,$d,$x,4),$ty);
74	&add( $ty, $tx);
75	&mov( &DWP(0,$d,$y,4),$tx);
76	&and( $ty, 0xff);
77	&inc( &LB($x)); # NEXT ROUND
78	&mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
79	&mov( $ty, &DWP(0,$d,$ty,4));
80
81	if (!$char)
82	{
83	#moved up into last round
84	if ($p >= 1)
85	{
86	&add( $out, 8)
87	}
88	&movb( &BP($n,"esp","",0), &LB($ty));
89	}
90	else
91	{
92	# Note in+=8 has occured
93	&movb( &HB($ty), &BP(-1,$in,"",0));
94	# XXX
95	&xorb(&LB($ty), &HB($ty));
96	# XXX
97	&movb(&BP($n,$out,"",0),&LB($ty));
98	}
99	}	61	}
100		62	&$func ($out,&DWP(0,$dat,$ty,4));
101		63	}
102	sub RC4	64
103	{	65	# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
104	local($name)=@_;	66	&function_begin("RC4");
105		67	&mov ($dat,&wparam(0)); # load key schedule pointer
106	&function_begin_B($name,"");	68	&mov ($ty, &wparam(1)); # load len
107		69	&mov ($inp,&wparam(2)); # load inp
108	&mov($ty,&wparam(1)); # len	70	&mov ($out,&wparam(3)); # load out
109	&cmp($ty,0);	71
110	&jne(&label("proceed"));	72	&xor ($xx,$xx); # avoid partial register stalls
111	&ret();	73	&xor ($yy,$yy);
112	&set_label("proceed");	74
113		75	&cmp ($ty,0); # safety net
114	&comment("");	76	&je (&label("abort"));
115		77
116	&push("ebp");	78	&mov (&LB($xx),&BP(0,$dat)); # load key->x
117	&push("ebx");	79	&mov (&LB($yy),&BP(4,$dat)); # load key->y
118	&push("esi");	80	&add ($dat,8);
119	&xor( $x, $x); # avoid partial register stalls	81
120	&push("edi");	82	&lea ($tx,&DWP(0,$inp,$ty));
121	&xor( $y, $y); # avoid partial register stalls	83	&sub ($out,$inp); # re-bias out
122	&mov( $d, &wparam(0)); # key	84	&mov (&wparam(1),$tx); # save input+len
123	&mov( $in, &wparam(2));	85
124		86	&inc (&LB($xx));
125	&movb( &LB($x), &BP(0,$d,"",1));	87
126	&movb( &LB($y), &BP(4,$d,"",1));	88	# detect compressed key schedule...
127		89	&cmp (&DWP(256,$dat),-1);
128	&mov( $out, &wparam(3));	90	&je (&label("RC4_CHAR"));
129	&inc( &LB($x));	91
130		92	&mov ($tx,&DWP(0,$dat,$xx,4));
131	&stack_push(3); # 3 temp variables	93
132	&add( $d, 8);	94	&and ($ty,-4); # how many 4-byte chunks?
133		95	&jz (&label("loop1"));
134	# detect compressed schedule, see commentary section in rc4_skey.c...	96
135	# in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,	97	&lea ($ty,&DWP(-4,$inp,$ty));
136	# as compressed key schedule is set up in 0.9.8 and later.	98	&mov (&wparam(2),$ty); # save input+(len/4)*4-4
137	&cmp(&DWP(256,$d),-1);	99	&mov (&wparam(3),$out); # $out as accumulator in this loop
138	&je(&label("RC4_CHAR"));	100
139		101	&set_label("loop4",16);
140	&lea( $ty, &DWP(-8,$ty,$in));	102	for ($i=0;$i<4;$i++) { RC4_loop($i); }
141		103	&ror ($out,8);
142	# check for 0 length input	104	&xor ($out,&DWP(0,$inp));
143		105	&cmp ($inp,&wparam(2)); # compare to input+(len/4)*4-4
144	&mov( &swtmp(2), $ty); # this is now address to exit at	106	&mov (&DWP(0,$tx,$inp),$out);# $tx holds re-biased out here
145	&mov( $tx, &DWP(0,$d,$x,4));	107	&lea ($inp,&DWP(4,$inp));
146		108	&mov ($tx,&DWP(0,$dat,$xx,4));
147	&cmp( $ty, $in);	109	&jb (&label("loop4"));
148	&jb( &label("end")); # less than 8 bytes	110
149		111	&cmp ($inp,&wparam(1)); # compare to input+len
150	&set_label("start");	112	&je (&label("done"));
151		113	&mov ($out,&wparam(3)); # restore $out
152	# filling DELAY SLOT	114
153	&add( $in, 8);	115	&set_label("loop1",16);
154		116	&add (&LB($yy),&LB($tx));
155	&RC4_loop(0,-1,0);	117	&mov ($ty,&DWP(0,$dat,$yy,4));
156	&RC4_loop(1,0,0);	118	&mov (&DWP(0,$dat,$yy,4),$tx);
157	&RC4_loop(2,0,0);	119	&mov (&DWP(0,$dat,$xx,4),$ty);
158	&RC4_loop(3,0,0);	120	&add ($ty,$tx);
159	&RC4_loop(4,0,0);	121	&inc (&LB($xx));
160	&RC4_loop(5,0,0);	122	&and ($ty,0xff);
161	&RC4_loop(6,0,0);	123	&mov ($ty,&DWP(0,$dat,$ty,4));
162	&RC4_loop(7,1,0);	124	&xor (&LB($ty),&BP(0,$inp));
163		125	&lea ($inp,&DWP(1,$inp));
164	&comment("apply the cipher text");	126	&mov ($tx,&DWP(0,$dat,$xx,4));
165	# xor the cipher data with input	127	&cmp ($inp,&wparam(1)); # compare to input+len
166		128	&mov (&BP(-1,$out,$inp),&LB($ty));
167	#&add( $out, 8); #moved up into last round	129	&jb (&label("loop1"));
168		130
169	&mov( $tx, &swtmp(0));	131	&jmp (&label("done"));
170	&mov( $ty, &DWP(-8,$in,"",0));	132
171	&xor( $tx, $ty);	133	# this is essentially Intel P4 specific codepath...
172	&mov( $ty, &DWP(-4,$in,"",0));	134	&set_label("RC4_CHAR",16);
173	&mov( &DWP(-8,$out,"",0), $tx);	135	&movz ($tx,&BP(0,$dat,$xx));
174	&mov( $tx, &swtmp(1));
175	&xor( $tx, $ty);
176	&mov( $ty, &swtmp(2)); # load end ptr;
177	&mov( &DWP(-4,$out,"",0), $tx);
178	&mov( $tx, &DWP(0,$d,$x,4));
179	&cmp($in, $ty);
180	&jbe(&label("start"));
181
182	&set_label("end");
183
184	# There is quite a bit of extra crap in RC4_loop() for this
185	# first round
186	&RC4_loop(0,-1,1);
187	&RC4_loop(1,0,1);
188	&RC4_loop(2,0,1);
189	&RC4_loop(3,0,1);
190	&RC4_loop(4,0,1);
191	&RC4_loop(5,0,1);
192	&RC4_loop(6,1,1);
193
194	&jmp(&label("finished"));
195
196	&align(16);
197	# this is essentially Intel P4 specific codepath, see rc4_skey.c,
198	# and is engaged in 0.9.8 and later context...
199	&set_label("RC4_CHAR");
200
201	&lea ($ty,&DWP(0,$in,$ty));
202	&mov (&swtmp(2),$ty);
203	&movz ($tx,&BP(0,$d,$x));
204
205	# strangely enough unrolled loop performs over 20% slower...	136	# strangely enough unrolled loop performs over 20% slower...
206	&set_label("RC4_CHAR_loop");	137	&set_label("cloop1");
207	&add (&LB($y),&LB($tx));	138	&add (&LB($yy),&LB($tx));
208	&movz ($ty,&BP(0,$d,$y));	139	&movz ($ty,&BP(0,$dat,$yy));
209	&movb (&BP(0,$d,$y),&LB($tx));	140	&mov (&BP(0,$dat,$yy),&LB($tx));
210	&movb (&BP(0,$d,$x),&LB($ty));	141	&mov (&BP(0,$dat,$xx),&LB($ty));
211	&add (&LB($ty),&LB($tx));	142	&add (&LB($ty),&LB($tx));
212	&movz ($ty,&BP(0,$d,$ty));	143	&movz ($ty,&BP(0,$dat,$ty));
213	&add (&LB($x),1);	144	&add (&LB($xx),1);
214	&xorb (&LB($ty),&BP(0,$in));	145	&xor (&LB($ty),&BP(0,$inp));
215	&lea ($in,&DWP(1,$in));	146	&lea ($inp,&DWP(1,$inp));
216	&movz ($tx,&BP(0,$d,$x));	147	&movz ($tx,&BP(0,$dat,$xx));
217	&cmp ($in,&swtmp(2));	148	&cmp ($inp,&wparam(1));
218	&movb (&BP(0,$out),&LB($ty));	149	&mov (&BP(-1,$out,$inp),&LB($ty));
219	&lea ($out,&DWP(1,$out));	150	&jb (&label("cloop1"));
220	&jb (&label("RC4_CHAR_loop"));	151
221		152	&set_label("done");
222	&set_label("finished");	153	&dec (&LB($xx));
223	&dec( $x);	154	&mov (&BP(-4,$dat),&LB($yy)); # save key->y
224	&stack_pop(3);	155	&mov (&BP(-8,$dat),&LB($xx)); # save key->x
225	&movb( &BP(-4,$d,"",0),&LB($y));	156	&set_label("abort");
226	&movb( &BP(-8,$d,"",0),&LB($x));	157	&function_end("RC4");
227		158
228	&function_end($name);	159	########################################################################
229	}	160
		161	$inp="esi";
		162	$out="edi";
		163	$idi="ebp";
		164	$ido="ecx";
		165	$idx="edx";
		166
		167	&external_label("OPENSSL_ia32cap_P");
		168
		169	# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
		170	&function_begin("RC4_set_key");
		171	&mov ($out,&wparam(0)); # load key
		172	&mov ($idi,&wparam(1)); # load len
		173	&mov ($inp,&wparam(2)); # load data
		174	&picmeup($idx,"OPENSSL_ia32cap_P");
		175
		176	&lea ($out,&DWP(2*4,$out)); # &key->data
		177	&lea ($inp,&DWP(0,$inp,$idi)); # $inp to point at the end
		178	&neg ($idi);
		179	&xor ("eax","eax");
		180	&mov (&DWP(-4,$out),$idi); # borrow key->y
		181
		182	&bt (&DWP(0,$idx),20); # check for bit#20
		183	&jc (&label("c1stloop"));
		184
		185	&set_label("w1stloop",16);
		186	&mov (&DWP(0,$out,"eax",4),"eax"); # key->data[i]=i;
		187	&add (&LB("eax"),1); # i++;
		188	&jnc (&label("w1stloop"));
		189
		190	&xor ($ido,$ido);
		191	&xor ($idx,$idx);
		192
		193	&set_label("w2ndloop",16);
		194	&mov ("eax",&DWP(0,$out,$ido,4));
		195	&add (&LB($idx),&BP(0,$inp,$idi));
		196	&add (&LB($idx),&LB("eax"));
		197	&add ($idi,1);
		198	&mov ("ebx",&DWP(0,$out,$idx,4));
		199	&jnz (&label("wnowrap"));
		200	&mov ($idi,&DWP(-4,$out));
		201	&set_label("wnowrap");
		202	&mov (&DWP(0,$out,$idx,4),"eax");
		203	&mov (&DWP(0,$out,$ido,4),"ebx");
		204	&add (&LB($ido),1);
		205	&jnc (&label("w2ndloop"));
		206	&jmp (&label("exit"));
		207
		208	# Unlike all other x86 [and x86_64] implementations, Intel P4 core
		209	# [including EM64T] was found to perform poorly with above "32-bit" key
		210	# schedule, a.k.a. RC4_INT. Performance improvement for IA-32 hand-coded
		211	# assembler turned out to be 3.5x if re-coded for compressed 8-bit one,
		212	# a.k.a. RC4_CHAR! It's however inappropriate to just switch to 8-bit
		213	# schedule for x86[_64], because non-P4 implementations suffer from
		214	# significant performance losses then, e.g. PIII exhibits >2x
		215	# deterioration, and so does Opteron. In order to assure optimal
		216	# all-round performance, we detect P4 at run-time and set up compressed
		217	# key schedule, which is recognized by RC4 procedure.
		218
		219	&set_label("c1stloop",16);
		220	&mov (&BP(0,$out,"eax"),&LB("eax")); # key->data[i]=i;
		221	&add (&LB("eax"),1); # i++;
		222	&jnc (&label("c1stloop"));
		223
		224	&xor ($ido,$ido);
		225	&xor ($idx,$idx);
		226	&xor ("ebx","ebx");
		227
		228	&set_label("c2ndloop",16);
		229	&mov (&LB("eax"),&BP(0,$out,$ido));
		230	&add (&LB($idx),&BP(0,$inp,$idi));
		231	&add (&LB($idx),&LB("eax"));
		232	&add ($idi,1);
		233	&mov (&LB("ebx"),&BP(0,$out,$idx));
		234	&jnz (&label("cnowrap"));
		235	&mov ($idi,&DWP(-4,$out));
		236	&set_label("cnowrap");
		237	&mov (&BP(0,$out,$idx),&LB("eax"));
		238	&mov (&BP(0,$out,$ido),&LB("ebx"));
		239	&add (&LB($ido),1);
		240	&jnc (&label("c2ndloop"));
		241
		242	&mov (&DWP(256,$out),-1); # mark schedule as compressed
		243
		244	&set_label("exit");
		245	&xor ("eax","eax");
		246	&mov (&DWP(-8,$out),"eax"); # key->x=0;
		247	&mov (&DWP(-4,$out),"eax"); # key->y=0;
		248	&function_end("RC4_set_key");
		249
		250	# const char *RC4_options(void);
		251	&function_begin_B("RC4_options");
		252	&call (&label("pic_point"));
		253	&set_label("pic_point");
		254	&blindpop("eax");
		255	&lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
		256	&picmeup("edx","OPENSSL_ia32cap_P");
		257	&bt (&DWP(0,"edx"),20);
		258	&jnc (&label("skip"));
		259	&add ("eax",12);
		260	&set_label("skip");
		261	&ret ();
		262	&set_label("opts",64);
		263	&asciz ("rc4(4x,int)");
		264	&asciz ("rc4(1x,char)");
		265	&asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
		266	&align (64);
		267	&function_end_B("RC4_options");
		268
		269	&asm_finish();
230		270