1 files changed, 85 insertions, 29 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl
index 7ef889e5a1..d6e98f0811 100644
--- a/src/lib/libcrypto/rc4/asm/rc4-586.pl
+++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl
@@ -1,16 +1,37 @@
 #!/usr/local/bin/perl
-# define for pentium pro friendly version
+# At some point it became apparent that the original SSLeay RC4
+# assembler implementation performs suboptimally on latest IA-32
+# microarchitectures. After re-tuning performance has changed as
+# follows:
+#
+# Pentium       +0%
+# Pentium III   +17%
+# AMD           +52%(*)
+# P4            +180%(**)
+#
+# (*)   This number is actually a trade-off:-) It's possible to
+#       achieve +72%, but at the cost of -48% off PIII performance.
+#       In other words code performing further 13% faster on AMD
+#       would perform almost 2 times slower on Intel PIII...
+#       For reference! This code delivers ~80% of rc4-amd64.pl
+#       performance on the same Opteron machine.
+# (**)  This number requires compressed key schedule set up by
+#       RC4_set_key and therefore doesn't apply to 0.9.7 [option for
+#       compressed key schedule is implemented in 0.9.8 and later,
+#       see commentary section in rc4_skey.c for further details].
+#
+#                                       <appro@fy.chalmers.se>
 push(@INC,"perlasm","../../perlasm");
 require "x86asm.pl";
 &asm_init($ARGV[0],"rc4-586.pl");
-$tx="eax";
+$x="eax";
-$ty="ebx";
+$y="ebx";
-$x="ecx";
+$tx="ecx";
-$y="edx";
+$ty="edx";
 $in="esi";
 $out="edi";
 $d="ebp";
@@ -31,7 +52,7 @@ sub RC4_loop
                        {
                         &mov($ty,      &swtmp(2));
                        &cmp($ty,       $in);
-                         &jle(&label("finished"));
+                         &jbe(&label("finished"));
                        &inc($in);
                        }
                else
@@ -39,27 +60,23 @@ sub RC4_loop
                        &add($ty,       8);
                         &inc($in);
                        &cmp($ty,       $in);
-                         &jl(&label("finished"));
+                         &jb(&label("finished"));
                        &mov(&swtmp(2), $ty);
                        }
                }
        # Moved out
        # &mov( $tx,            &DWP(0,$d,$x,4)) if $p < 0;
-         &add(  $y,             $tx);
+        &add(   &LB($y),        &LB($tx));
-        &and(   $y,             0xff);
-         &inc(  $x);                    # NEXT ROUND 
        &mov(   $ty,            &DWP(0,$d,$y,4));
         # XXX
-        &mov(   &DWP(-4,$d,$x,4),$ty);                  # AGI
+        &mov(   &DWP(0,$d,$x,4),$ty);
         &add(  $ty,            $tx);
-        &and(   $x,             0xff);  # NEXT ROUND
-         &and(  $ty,            0xff);
        &mov(   &DWP(0,$d,$y,4),$tx);
-         &nop();
+         &and(  $ty,            0xff);
-        &mov(   $ty,            &DWP(0,$d,$ty,4));
+         &inc(  &LB($x));                       # NEXT ROUND
-         &mov(  $tx,            &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
+        &mov(   $tx,            &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
-         # XXX
+         &mov(  $ty,            &DWP(0,$d,$ty,4));
        if (!$char)
                {
@@ -88,35 +105,47 @@ sub RC4
        &function_begin_B($name,"");
+        &mov($ty,&wparam(1));           # len
+        &cmp($ty,0);
+        &jne(&label("proceed"));
+        &ret();
+        &set_label("proceed");
        &comment("");
        &push("ebp");
         &push("ebx");
-        &mov(   $d,     &wparam(0));    # key
-         &mov(  $ty,    &wparam(1));    # num
        &push("esi");
-         &push("edi");
+         &xor(  $x,     $x);            # avoid partial register stalls
+        &push("edi");
+         &xor(  $y,     $y);            # avoid partial register stalls
+        &mov(   $d,     &wparam(0));    # key
+         &mov(  $in,    &wparam(2));
-        &mov(   $x,     &DWP(0,$d,"",1));
+        &movb(  &LB($x),        &BP(0,$d,"",1));
-         &mov(  $y,     &DWP(4,$d,"",1));
+         &movb( &LB($y),        &BP(4,$d,"",1));
-        &mov(   $in,    &wparam(2));
+        &mov(   $out,   &wparam(3));
-         &inc(  $x);
+         &inc(  &LB($x));
        &stack_push(3); # 3 temp variables
         &add(  $d,     8);
-        &and(   $x,             0xff);
+        # detect compressed schedule, see commentary section in rc4_skey.c...
+        # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
+        # as compressed key schedule is set up in 0.9.8 and later.
+        &cmp(&DWP(256,$d),-1);
+        &je(&label("RC4_CHAR"));
         &lea(  $ty,    &DWP(-8,$ty,$in));
        # check for 0 length input
-        &mov(   $out,   &wparam(3));
         &mov(  &swtmp(2),      $ty);   # this is now address to exit at
        &mov(   $tx,    &DWP(0,$d,$x,4));
         &cmp(  $ty,    $in);
-        &jl(    &label("end")); # less than 8 bytes
+        &jb(    &label("end")); # less than 8 bytes
        &set_label("start");
@@ -148,7 +177,7 @@ sub RC4
        &mov(   &DWP(-4,$out,"",0),     $tx);
         &mov(  $tx,            &DWP(0,$d,$x,4));
        &cmp($in,       $ty);
-         &jle(&label("start"));
+         &jbe(&label("start"));
        &set_label("end");
@@ -162,10 +191,37 @@ sub RC4
        &RC4_loop(5,0,1);
        &RC4_loop(6,1,1);
+        &jmp(&label("finished"));
+        &align(16);
+        # this is essentially Intel P4 specific codepath, see rc4_skey.c,
+        # and is engaged in 0.9.8 and later context...
+        &set_label("RC4_CHAR");
+        &lea    ($ty,&DWP(0,$in,$ty));
+        &mov    (&swtmp(2),$ty);
+        # strangely enough unrolled loop performs over 20% slower...
+        &set_label("RC4_CHAR_loop");
+                &movz   ($tx,&BP(0,$d,$x));
+                &add    (&LB($y),&LB($tx));
+                &movz   ($ty,&BP(0,$d,$y));
+                &movb   (&BP(0,$d,$y),&LB($tx));
+                &movb   (&BP(0,$d,$x),&LB($ty));
+                &add    (&LB($ty),&LB($tx));
+                &movz   ($ty,&BP(0,$d,$ty));
+                &xorb   (&LB($ty),&BP(0,$in));
+                &movb   (&BP(0,$out),&LB($ty));
+                &inc    (&LB($x));
+                &inc    ($in);
+                &inc    ($out);
+                &cmp    ($in,&swtmp(2));
+        &jb     (&label("RC4_CHAR_loop"));
        &set_label("finished");
        &dec(   $x);
         &stack_pop(3);
-        &mov(   &DWP(-4,$d,"",0),$y);
+        &movb(  &BP(-4,$d,"",0),&LB($y));
         &movb( &BP(-8,$d,"",0),&LB($x));
        &function_end($name);

diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl index 7ef889e5a1..d6e98f0811 100644 --- a/src/lib/libcrypto/rc4/asm/rc4-586.pl +++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl
@@ -1,16 +1,37 @@
1	#!/usr/local/bin/perl	1	#!/usr/local/bin/perl
2		2
3	# define for pentium pro friendly version	3	# At some point it became apparent that the original SSLeay RC4
		4	# assembler implementation performs suboptimally on latest IA-32
		5	# microarchitectures. After re-tuning performance has changed as
		6	# follows:
		7	#
		8	# Pentium +0%
		9	# Pentium III +17%
		10	# AMD +52%(*)
		11	# P4 +180%(**)
		12	#
		13	# (*) This number is actually a trade-off:-) It's possible to
		14	# achieve +72%, but at the cost of -48% off PIII performance.
		15	# In other words code performing further 13% faster on AMD
		16	# would perform almost 2 times slower on Intel PIII...
		17	# For reference! This code delivers ~80% of rc4-amd64.pl
		18	# performance on the same Opteron machine.
		19	# (**) This number requires compressed key schedule set up by
		20	# RC4_set_key and therefore doesn't apply to 0.9.7 [option for
		21	# compressed key schedule is implemented in 0.9.8 and later,
		22	# see commentary section in rc4_skey.c for further details].
		23	#
		24	# <appro@fy.chalmers.se>
4		25
5	push(@INC,"perlasm","../../perlasm");	26	push(@INC,"perlasm","../../perlasm");
6	require "x86asm.pl";	27	require "x86asm.pl";
7		28
8	&asm_init($ARGV[0],"rc4-586.pl");	29	&asm_init($ARGV[0],"rc4-586.pl");
9		30
10	$tx="eax";	31	$x="eax";
11	$ty="ebx";	32	$y="ebx";
12	$x="ecx";	33	$tx="ecx";
13	$y="edx";	34	$ty="edx";
14	$in="esi";	35	$in="esi";
15	$out="edi";	36	$out="edi";
16	$d="ebp";	37	$d="ebp";
@@ -31,7 +52,7 @@ sub RC4_loop
31	{	52	{
32	&mov($ty, &swtmp(2));	53	&mov($ty, &swtmp(2));
33	&cmp($ty, $in);	54	&cmp($ty, $in);
34	&jle(&label("finished"));	55	&jbe(&label("finished"));
35	&inc($in);	56	&inc($in);
36	}	57	}
37	else	58	else
@@ -39,27 +60,23 @@ sub RC4_loop
39	&add($ty, 8);	60	&add($ty, 8);
40	&inc($in);	61	&inc($in);
41	&cmp($ty, $in);	62	&cmp($ty, $in);
42	&jl(&label("finished"));	63	&jb(&label("finished"));
43	&mov(&swtmp(2), $ty);	64	&mov(&swtmp(2), $ty);
44	}	65	}
45	}	66	}
46	# Moved out	67	# Moved out
47	# &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0;	68	# &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0;
48		69
49	&add( $y, $tx);	70	&add( &LB($y), &LB($tx));
50	&and( $y, 0xff);
51	&inc( $x); # NEXT ROUND
52	&mov( $ty, &DWP(0,$d,$y,4));	71	&mov( $ty, &DWP(0,$d,$y,4));
53	# XXX	72	# XXX
54	&mov( &DWP(-4,$d,$x,4),$ty); # AGI	73	&mov( &DWP(0,$d,$x,4),$ty);
55	&add( $ty, $tx);	74	&add( $ty, $tx);
56	&and( $x, 0xff); # NEXT ROUND
57	&and( $ty, 0xff);
58	&mov( &DWP(0,$d,$y,4),$tx);	75	&mov( &DWP(0,$d,$y,4),$tx);
59	&nop();	76	&and( $ty, 0xff);
60	&mov( $ty, &DWP(0,$d,$ty,4));	77	&inc( &LB($x)); # NEXT ROUND
61	&mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND	78	&mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
62	# XXX	79	&mov( $ty, &DWP(0,$d,$ty,4));
63		80
64	if (!$char)	81	if (!$char)
65	{	82	{
@@ -88,35 +105,47 @@ sub RC4
88		105
89	&function_begin_B($name,"");	106	&function_begin_B($name,"");
90		107
		108	&mov($ty,&wparam(1)); # len
		109	&cmp($ty,0);
		110	&jne(&label("proceed"));
		111	&ret();
		112	&set_label("proceed");
		113
91	&comment("");	114	&comment("");
92		115
93	&push("ebp");	116	&push("ebp");
94	&push("ebx");	117	&push("ebx");
95	&mov( $d, &wparam(0)); # key
96	&mov( $ty, &wparam(1)); # num
97	&push("esi");	118	&push("esi");
98	&push("edi");	119	&xor( $x, $x); # avoid partial register stalls
		120	&push("edi");
		121	&xor( $y, $y); # avoid partial register stalls
		122	&mov( $d, &wparam(0)); # key
		123	&mov( $in, &wparam(2));
99		124
100	&mov( $x, &DWP(0,$d,"",1));	125	&movb( &LB($x), &BP(0,$d,"",1));
101	&mov( $y, &DWP(4,$d,"",1));	126	&movb( &LB($y), &BP(4,$d,"",1));
102		127
103	&mov( $in, &wparam(2));	128	&mov( $out, &wparam(3));
104	&inc( $x);	129	&inc( &LB($x));
105		130
106	&stack_push(3); # 3 temp variables	131	&stack_push(3); # 3 temp variables
107	&add( $d, 8);	132	&add( $d, 8);
108	&and( $x, 0xff);	133
		134	# detect compressed schedule, see commentary section in rc4_skey.c...
		135	# in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
		136	# as compressed key schedule is set up in 0.9.8 and later.
		137	&cmp(&DWP(256,$d),-1);
		138	&je(&label("RC4_CHAR"));
109		139
110	&lea( $ty, &DWP(-8,$ty,$in));	140	&lea( $ty, &DWP(-8,$ty,$in));
111		141
112	# check for 0 length input	142	# check for 0 length input
113		143
114	&mov( $out, &wparam(3));
115	&mov( &swtmp(2), $ty); # this is now address to exit at	144	&mov( &swtmp(2), $ty); # this is now address to exit at
116	&mov( $tx, &DWP(0,$d,$x,4));	145	&mov( $tx, &DWP(0,$d,$x,4));
117		146
118	&cmp( $ty, $in);	147	&cmp( $ty, $in);
119	&jl( &label("end")); # less than 8 bytes	148	&jb( &label("end")); # less than 8 bytes
120		149
121	&set_label("start");	150	&set_label("start");
122		151
@@ -148,7 +177,7 @@ sub RC4
148	&mov( &DWP(-4,$out,"",0), $tx);	177	&mov( &DWP(-4,$out,"",0), $tx);
149	&mov( $tx, &DWP(0,$d,$x,4));	178	&mov( $tx, &DWP(0,$d,$x,4));
150	&cmp($in, $ty);	179	&cmp($in, $ty);
151	&jle(&label("start"));	180	&jbe(&label("start"));
152		181
153	&set_label("end");	182	&set_label("end");
154		183
@@ -162,10 +191,37 @@ sub RC4
162	&RC4_loop(5,0,1);	191	&RC4_loop(5,0,1);
163	&RC4_loop(6,1,1);	192	&RC4_loop(6,1,1);
164		193
		194	&jmp(&label("finished"));
		195
		196	&align(16);
		197	# this is essentially Intel P4 specific codepath, see rc4_skey.c,
		198	# and is engaged in 0.9.8 and later context...
		199	&set_label("RC4_CHAR");
		200
		201	&lea ($ty,&DWP(0,$in,$ty));
		202	&mov (&swtmp(2),$ty);
		203
		204	# strangely enough unrolled loop performs over 20% slower...
		205	&set_label("RC4_CHAR_loop");
		206	&movz ($tx,&BP(0,$d,$x));
		207	&add (&LB($y),&LB($tx));
		208	&movz ($ty,&BP(0,$d,$y));
		209	&movb (&BP(0,$d,$y),&LB($tx));
		210	&movb (&BP(0,$d,$x),&LB($ty));
		211	&add (&LB($ty),&LB($tx));
		212	&movz ($ty,&BP(0,$d,$ty));
		213	&xorb (&LB($ty),&BP(0,$in));
		214	&movb (&BP(0,$out),&LB($ty));
		215	&inc (&LB($x));
		216	&inc ($in);
		217	&inc ($out);
		218	&cmp ($in,&swtmp(2));
		219	&jb (&label("RC4_CHAR_loop"));
		220
165	&set_label("finished");	221	&set_label("finished");
166	&dec( $x);	222	&dec( $x);
167	&stack_pop(3);	223	&stack_pop(3);
168	&mov( &DWP(-4,$d,"",0),$y);	224	&movb( &BP(-4,$d,"",0),&LB($y));
169	&movb( &BP(-8,$d,"",0),&LB($x));	225	&movb( &BP(-8,$d,"",0),&LB($x));
170		226
171	&function_end($name);	227	&function_end($name);