1 files changed, 230 insertions, 0 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl
new file mode 100644
index 0000000000..ef7eee766c
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl
@@ -0,0 +1,230 @@
+#!/usr/local/bin/perl
+# At some point it became apparent that the original SSLeay RC4
+# assembler implementation performs suboptimally on the latest IA-32
+# microarchitectures. After re-tuning, performance has changed as
+# follows:
+#
+# Pentium       +0%
+# Pentium III   +17%
+# AMD           +52%(*)
+# P4            +180%(**)
+#
+# (*)   This number is actually a trade-off:-) It's possible to
+#       achieve +72%, but at the cost of -48% off PIII performance.
+#       In other words code performing further 13% faster on AMD
+#       would perform almost 2 times slower on Intel PIII...
+#       For reference! This code delivers ~80% of rc4-amd64.pl
+#       performance on the same Opteron machine.
+# (**)  This number requires compressed key schedule set up by
+#       RC4_set_key and therefore doesn't apply to 0.9.7 [option for
+#       compressed key schedule is implemented in 0.9.8 and later,
+#       see commentary section in rc4_skey.c for further details].
+#
+#                                       <appro@fy.chalmers.se>
+push(@INC,"perlasm","../../perlasm");  # extend the module search path so the perlasm helpers can be found
+require "x86asm.pl";  # not shown here; presumably provides the &mov/&label/&wparam/... emitter subs used below
+&asm_init($ARGV[0],"rc4-586.pl");  # $ARGV[0] is passed straight through (assumed to select the output flavour -- confirm in x86asm.pl)
+$x="eax";  # state index x (RC4() loads it from key offset 0)
+$y="ebx";  # state index y (RC4() loads it from key offset 4)
+$tx="ecx";  # table entry fetched at index x
+$ty="edx";  # table entry fetched at index y; also reused for len, the end pointer and the keystream byte
+$in="esi";  # input pointer, loaded from wparam(2)
+$out="edi";  # output pointer, loaded from wparam(3)
+$d="ebp";  # key pointer, loaded from wparam(0) and then advanced by 8 to address the table
+&RC4("RC4");  # emit the routine under the symbol name RC4 (the sub itself is defined further down)
+&asm_finish();  # presumably flushes the generated assembly -- defined in x86asm.pl
+sub RC4_loop  # Emits the x86 instructions for one RC4 round, i.e. one keystream byte. "S[]" below = table addressed via $d (4-byte entries).
+        {  # The emitted code expects $tx to already hold S[x] on entry.
+        local($n,$p,$char)=@_;  # $n: round number, also the byte offset of the result; $p: <0 = first round, >=1 = last round of a group; $char: true = byte-at-a-time tail variant
+        &comment("Round $n");
+        if ($char)  # tail variant: check for end of input before every byte
+                {
+                if ($p >= 0)  # every tail round except the first
+                        {
+                         &mov($ty,      &swtmp(2));  # reload the end-of-input pointer saved by the first tail round
+                        &cmp($ty,       $in);
+                         &jbe(&label("finished"));  # end <= $in (unsigned): no input left
+                        &inc($in);  # consume one input byte
+                        }
+                else  # first tail round
+                        {
+                        &add($ty,       8);  # $ty arrives holding the swtmp(2) limit (end of input - 8); +8 yields the true end of input
+                         &inc($in);  # consume one input byte
+                        &cmp($ty,       $in);
+                         &jb(&label("finished"));  # end < advanced $in (unsigned): nothing was left to process
+                        &mov(&swtmp(2), $ty);  # keep the true end for the following tail rounds
+                        }
+                }
+        # Moved out: the load of S[x] into $tx is emitted by the previous round (or by RC4() ahead of the first round)
+        # &mov( $tx,            &DWP(0,$d,$x,4)) if $p < 0;
+        &add(   &LB($y),        &LB($tx));  # y = (y + S[x]) mod 256; the byte-sized add wraps
+        &mov(   $ty,            &DWP(0,$d,$y,4));  # $ty = S[y]
+         # XXX
+        &mov(   &DWP(0,$d,$x,4),$ty);  # S[x] = old S[y]   (first half of the swap)
+         &add(  $ty,            $tx);  # $ty = S[x] + S[y]
+        &mov(   &DWP(0,$d,$y,4),$tx);  # S[y] = old S[x]   (second half of the swap)
+         &and(  $ty,            0xff);  # reduce the sum to a table index
+         &inc(  &LB($x));                       # NEXT ROUND: x = (x + 1) mod 256
+        &mov(   $tx,            &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND: preload S[x]; not emitted for the last round of a group ($p >= 1)
+         &mov(  $ty,            &DWP(0,$d,$ty,4));  # keystream value = S[(S[x] + S[y]) & 0xff]
+        if (!$char)  # block variant: only collect the keystream byte, RC4() applies it later
+                {
+                #moved up into last round: advance $out past the 8-byte block (RC4() then stores at negative offsets)
+                if ($p >= 1)
+                        {
+                        &add(   $out,   8)
+                        }
+                &movb(  &BP($n,"esp","",0),     &LB($ty));  # park keystream byte $n at esp+$n; RC4() reads these back via swtmp(0)/swtmp(1) (assumes the swtmp slots start at esp -- confirm in x86asm.pl)
+                }
+        else  # tail variant: apply the keystream byte immediately
+                {
+                # Note: $in has already been advanced past the current byte (inc above), hence the -1 offset
+                &movb(  &HB($ty),       &BP(-1,$in,"",0));  # input byte goes to the high byte of $ty; the low byte holds the keystream byte
+                 # XXX
+                &xorb(&LB($ty),         &HB($ty));  # low byte = keystream ^ input
+                 # XXX
+                &movb(&BP($n,$out,"",0),&LB($ty));  # store at $out+$n; $out itself is not advanced in the tail
+                }
+        }
+sub RC4  # Emits the complete routine: prologue, 8-bytes-per-iteration main loop, byte-wise tail, compressed-schedule loop, epilogue.
+        {  # The generated code reads its arguments through wparam(0..3): key, len, input pointer, output pointer.
+        local($name)=@_;  # $name: symbol name of the emitted function
+        &function_begin_B($name,"");  # helper from x86asm.pl (not shown); the register saves are emitted explicitly below
+        &mov($ty,&wparam(1));           # len
+        &cmp($ty,0);
+        &jne(&label("proceed"));
+        &ret();  # len == 0: return immediately, before any register has been pushed
+        &set_label("proceed");
+        &comment("");
+        &push("ebp");
+         &push("ebx");
+        &push("esi");
+         &xor(  $x,     $x);            # avoid partial register stalls (only the low byte is loaded below)
+        &push("edi");
+         &xor(  $y,     $y);            # avoid partial register stalls (only the low byte is loaded below)
+        &mov(   $d,     &wparam(0));    # key
+         &mov(  $in,    &wparam(2));  # input pointer
+        &movb(  &LB($x),        &BP(0,$d,"",1));  # x = byte at key offset 0
+         &movb( &LB($y),        &BP(4,$d,"",1));  # y = byte at key offset 4
+        &mov(   $out,   &wparam(3));  # output pointer
+         &inc(  &LB($x));  # x is kept one step ahead while processing; undone by the dec at "finished"
+        &stack_push(3); # 3 temp variables: swtmp(0)/swtmp(1) = keystream block, swtmp(2) = end pointer
+         &add(  $d,     8);  # skip the x and y slots; $d now addresses the table itself
+        # detect compressed schedule, see commentary section in rc4_skey.c...
+        # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
+        # as compressed key schedule is set up in 0.9.8 and later.
+        &cmp(&DWP(256,$d),-1);  # a dword of -1 at table offset 256 marks the compressed schedule
+        &je(&label("RC4_CHAR"));
+         &lea(  $ty,    &DWP(-8,$ty,$in));  # $ty = in + len - 8: last address at which a full 8-byte block can start
+        # (zero-length input was already rejected at function entry; what follows is the "fewer than 8 bytes" check)
+         &mov(  &swtmp(2),      $ty);   # this is now address to exit at
+        &mov(   $tx,    &DWP(0,$d,$x,4));  # preload the table entry at x for the first round
+         &cmp(  $ty,    $in);
+        &jb(    &label("end")); # less than 8 bytes: go straight to the byte-wise tail
+        &set_label("start");
+        # filling DELAY SLOT
+        &add(   $in,    8);  # advance early; the block is read back below at offsets -8 and -4
+        &RC4_loop(0,-1,0);  # 8 rounds, each parking one keystream byte on the stack
+        &RC4_loop(1,0,0);
+        &RC4_loop(2,0,0);
+        &RC4_loop(3,0,0);
+        &RC4_loop(4,0,0);
+        &RC4_loop(5,0,0);
+        &RC4_loop(6,0,0);
+        &RC4_loop(7,1,0);  # last round: also emits "add $out,8" and skips the preload of the next table entry
+        
+        &comment("apply the cipher text");
+        # xor the cipher data with input
+        #&add(  $out,   8); #moved up into last round
+        &mov(   $tx,    &swtmp(0));  # first keystream dword (presumably bytes 0..3 parked by the rounds above)
+         &mov(  $ty,    &DWP(-8,$in,"",0));  # first input dword
+        &xor(   $tx,    $ty);
+         &mov(  $ty,    &DWP(-4,$in,"",0)); # second input dword
+        &mov(   &DWP(-8,$out,"",0),     $tx);  # store first output dword
+         &mov(  $tx,    &swtmp(1));  # second keystream dword
+        &xor(   $tx,    $ty);
+         &mov(  $ty,    &swtmp(2));     # load end ptr;
+        &mov(   &DWP(-4,$out,"",0),     $tx);  # store second output dword
+         &mov(  $tx,            &DWP(0,$d,$x,4));  # preload the table entry at x for the next round (skipped by round 7)
+        &cmp($in,       $ty);
+         &jbe(&label("start"));  # loop while $in <= limit (unsigned), i.e. at least 8 bytes remain
+        &set_label("end");
+        # There is quite a bit of extra crap in RC4_loop() for this
+        # first round
+        &RC4_loop(0,-1,1);  # first tail round: turns the limit in $ty into the true end pointer
+        &RC4_loop(1,0,1);  # every tail round jumps to "finished" as soon as the input is exhausted
+        &RC4_loop(2,0,1);
+        &RC4_loop(3,0,1);
+        &RC4_loop(4,0,1);
+        &RC4_loop(5,0,1);
+        &RC4_loop(6,1,1);  # 7th and last tail round (at most 7 bytes can remain here)
+        &jmp(&label("finished"));
+        &align(16);
+        # this is essentially Intel P4 specific codepath, see rc4_skey.c,
+        # and is engaged in 0.9.8 and later context...
+        &set_label("RC4_CHAR");
+        &lea    ($ty,&DWP(0,$in,$ty));  # $ty still holds len, so this is end = in + len
+        &mov    (&swtmp(2),$ty);  # save the end pointer for the loop test
+        &movz   ($tx,&BP(0,$d,$x));  # preload the table entry at x (1-byte entries on this path)
+        # strangely enough unrolled loop performs over 20% slower...
+        &set_label("RC4_CHAR_loop");
+                &add    (&LB($y),&LB($tx));  # y = (y + S[x]) mod 256
+                &movz   ($ty,&BP(0,$d,$y));  # $ty = S[y]
+                &movb   (&BP(0,$d,$y),&LB($tx));  # S[y] = old S[x]
+                &movb   (&BP(0,$d,$x),&LB($ty));  # S[x] = old S[y]
+                &add    (&LB($ty),&LB($tx));  # index = (S[x] + S[y]) mod 256
+                &movz   ($ty,&BP(0,$d,$ty));  # keystream byte
+                &add    (&LB($x),1);  # x = (x + 1) mod 256
+                &xorb   (&LB($ty),&BP(0,$in));  # keystream ^ input byte
+                &lea    ($in,&DWP(1,$in));  # in++
+                &movz   ($tx,&BP(0,$d,$x));  # preload S[x] for the next iteration
+                &cmp    ($in,&swtmp(2));  # reached the end pointer?
+                &movb   (&BP(0,$out),&LB($ty));  # store the output byte
+                &lea    ($out,&DWP(1,$out));  # out++ via lea, which leaves the flags from cmp intact
+        &jb     (&label("RC4_CHAR_loop"));  # loop while $in < end (unsigned)
+        &set_label("finished");
+        &dec(   $x);  # undo the look-ahead increment of x; only the low byte is written back
+         &stack_pop(3);  # release the 3 temp variables
+        &movb(  &BP(-4,$d,"",0),&LB($y));  # write y back to key offset 4 ($d was advanced by 8)
+         &movb( &BP(-8,$d,"",0),&LB($x));  # write x back to key offset 0
+        &function_end($name);  # helper from x86asm.pl; presumably restores the pushed registers and returns
+        }

diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl new file mode 100644 index 0000000000..ef7eee766c --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl
@@ -0,0 +1,230 @@
	1	#!/usr/local/bin/perl
	2
	3	# At some point it became apparent that the original SSLeay RC4
	4	# assembler implementation performs suboptimally on the latest IA-32
	5	# microarchitectures. After re-tuning, performance has changed as
	6	# follows:
	7	#
	8	# Pentium +0%
	9	# Pentium III +17%
	10	# AMD +52%(*)
	11	# P4 +180%(**)
	12	#
	13	# (*) This number is actually a trade-off:-) It's possible to
	14	# achieve +72%, but at the cost of -48% off PIII performance.
	15	# In other words code performing further 13% faster on AMD
	16	# would perform almost 2 times slower on Intel PIII...
	17	# For reference! This code delivers ~80% of rc4-amd64.pl
	18	# performance on the same Opteron machine.
	19	# (**) This number requires compressed key schedule set up by
	20	# RC4_set_key and therefore doesn't apply to 0.9.7 [option for
	21	# compressed key schedule is implemented in 0.9.8 and later,
	22	# see commentary section in rc4_skey.c for further details].
	23	#
	24	# <appro@fy.chalmers.se>
	25
	26	push(@INC,"perlasm","../../perlasm");  # extend the module search path so the perlasm helpers can be found
	27	require "x86asm.pl";  # not shown here; presumably provides the &mov/&label/&wparam/... emitter subs used below
	28
	29	&asm_init($ARGV[0],"rc4-586.pl");  # $ARGV[0] is passed straight through (assumed to select the output flavour -- confirm in x86asm.pl)
	30
	31	$x="eax";  # state index x (RC4() loads it from key offset 0)
	32	$y="ebx";  # state index y (RC4() loads it from key offset 4)
	33	$tx="ecx";  # table entry fetched at index x
	34	$ty="edx";  # table entry fetched at index y; also reused for len, the end pointer and the keystream byte
	35	$in="esi";  # input pointer, loaded from wparam(2)
	36	$out="edi";  # output pointer, loaded from wparam(3)
	37	$d="ebp";  # key pointer, loaded from wparam(0) and then advanced by 8 to address the table
	38
	39	&RC4("RC4");  # emit the routine under the symbol name RC4 (the sub itself is defined further down)
	40
	41	&asm_finish();  # presumably flushes the generated assembly -- defined in x86asm.pl
	42
	43	sub RC4_loop  # Emits the x86 instructions for one RC4 round, i.e. one keystream byte. "S[]" below = table addressed via $d (4-byte entries).
	44	{  # The emitted code expects $tx to already hold S[x] on entry.
	45	local($n,$p,$char)=@_;  # $n: round number, also the byte offset of the result; $p: <0 = first round, >=1 = last round of a group; $char: true = byte-at-a-time tail variant
	46
	47	&comment("Round $n");
	48
	49	if ($char)  # tail variant: check for end of input before every byte
	50	{
	51	if ($p >= 0)  # every tail round except the first
	52	{
	53	&mov($ty, &swtmp(2));  # reload the end-of-input pointer saved by the first tail round
	54	&cmp($ty, $in);
	55	&jbe(&label("finished"));  # end <= $in (unsigned): no input left
	56	&inc($in);  # consume one input byte
	57	}
	58	else  # first tail round
	59	{
	60	&add($ty, 8);  # $ty arrives holding the swtmp(2) limit (end of input - 8); +8 yields the true end of input
	61	&inc($in);  # consume one input byte
	62	&cmp($ty, $in);
	63	&jb(&label("finished"));  # end < advanced $in (unsigned): nothing was left to process
	64	&mov(&swtmp(2), $ty);  # keep the true end for the following tail rounds
	65	}
	66	}
	67	# Moved out: the load of S[x] into $tx is emitted by the previous round (or by RC4() ahead of the first round)
	68	# &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0;
	69
	70	&add( &LB($y), &LB($tx));  # y = (y + S[x]) mod 256; the byte-sized add wraps
	71	&mov( $ty, &DWP(0,$d,$y,4));  # $ty = S[y]
	72	# XXX
	73	&mov( &DWP(0,$d,$x,4),$ty);  # S[x] = old S[y]   (first half of the swap)
	74	&add( $ty, $tx);  # $ty = S[x] + S[y]
	75	&mov( &DWP(0,$d,$y,4),$tx);  # S[y] = old S[x]   (second half of the swap)
	76	&and( $ty, 0xff);  # reduce the sum to a table index
	77	&inc( &LB($x)); # NEXT ROUND: x = (x + 1) mod 256
	78	&mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND: preload S[x]; not emitted for the last round of a group ($p >= 1)
	79	&mov( $ty, &DWP(0,$d,$ty,4));  # keystream value = S[(S[x] + S[y]) & 0xff]
	80
	81	if (!$char)  # block variant: only collect the keystream byte, RC4() applies it later
	82	{
	83	#moved up into last round: advance $out past the 8-byte block (RC4() then stores at negative offsets)
	84	if ($p >= 1)
	85	{
	86	&add( $out, 8)
	87	}
	88	&movb( &BP($n,"esp","",0), &LB($ty));  # park keystream byte $n at esp+$n; RC4() reads these back via swtmp(0)/swtmp(1) (assumes the swtmp slots start at esp -- confirm in x86asm.pl)
	89	}
	90	else  # tail variant: apply the keystream byte immediately
	91	{
	92	# Note: $in has already been advanced past the current byte (inc above), hence the -1 offset
	93	&movb( &HB($ty), &BP(-1,$in,"",0));  # input byte goes to the high byte of $ty; the low byte holds the keystream byte
	94	# XXX
	95	&xorb(&LB($ty), &HB($ty));  # low byte = keystream ^ input
	96	# XXX
	97	&movb(&BP($n,$out,"",0),&LB($ty));  # store at $out+$n; $out itself is not advanced in the tail
	98	}
	99	}
	100
	101
	102	sub RC4  # Emits the complete routine: prologue, 8-bytes-per-iteration main loop, byte-wise tail, compressed-schedule loop, epilogue.
	103	{  # The generated code reads its arguments through wparam(0..3): key, len, input pointer, output pointer.
	104	local($name)=@_;  # $name: symbol name of the emitted function
	105
	106	&function_begin_B($name,"");  # helper from x86asm.pl (not shown); the register saves are emitted explicitly below
	107
	108	&mov($ty,&wparam(1)); # len
	109	&cmp($ty,0);
	110	&jne(&label("proceed"));
	111	&ret();  # len == 0: return immediately, before any register has been pushed
	112	&set_label("proceed");
	113
	114	&comment("");
	115
	116	&push("ebp");
	117	&push("ebx");
	118	&push("esi");
	119	&xor( $x, $x); # avoid partial register stalls (only the low byte is loaded below)
	120	&push("edi");
	121	&xor( $y, $y); # avoid partial register stalls (only the low byte is loaded below)
	122	&mov( $d, &wparam(0)); # key
	123	&mov( $in, &wparam(2));  # input pointer
	124
	125	&movb( &LB($x), &BP(0,$d,"",1));  # x = byte at key offset 0
	126	&movb( &LB($y), &BP(4,$d,"",1));  # y = byte at key offset 4
	127
	128	&mov( $out, &wparam(3));  # output pointer
	129	&inc( &LB($x));  # x is kept one step ahead while processing; undone by the dec at "finished"
	130
	131	&stack_push(3); # 3 temp variables: swtmp(0)/swtmp(1) = keystream block, swtmp(2) = end pointer
	132	&add( $d, 8);  # skip the x and y slots; $d now addresses the table itself
	133
	134	# detect compressed schedule, see commentary section in rc4_skey.c...
	135	# in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
	136	# as compressed key schedule is set up in 0.9.8 and later.
	137	&cmp(&DWP(256,$d),-1);  # a dword of -1 at table offset 256 marks the compressed schedule
	138	&je(&label("RC4_CHAR"));
	139
	140	&lea( $ty, &DWP(-8,$ty,$in));  # $ty = in + len - 8: last address at which a full 8-byte block can start
	141
	142	# (zero-length input was already rejected at function entry; what follows is the "fewer than 8 bytes" check)
	143
	144	&mov( &swtmp(2), $ty); # this is now address to exit at
	145	&mov( $tx, &DWP(0,$d,$x,4));  # preload the table entry at x for the first round
	146
	147	&cmp( $ty, $in);
	148	&jb( &label("end")); # less than 8 bytes: go straight to the byte-wise tail
	149
	150	&set_label("start");
	151
	152	# filling DELAY SLOT
	153	&add( $in, 8);  # advance early; the block is read back below at offsets -8 and -4
	154
	155	&RC4_loop(0,-1,0);  # 8 rounds, each parking one keystream byte on the stack
	156	&RC4_loop(1,0,0);
	157	&RC4_loop(2,0,0);
	158	&RC4_loop(3,0,0);
	159	&RC4_loop(4,0,0);
	160	&RC4_loop(5,0,0);
	161	&RC4_loop(6,0,0);
	162	&RC4_loop(7,1,0);  # last round: also emits "add $out,8" and skips the preload of the next table entry
	163
	164	&comment("apply the cipher text");
	165	# xor the cipher data with input
	166
	167	#&add( $out, 8); #moved up into last round
	168
	169	&mov( $tx, &swtmp(0));  # first keystream dword (presumably bytes 0..3 parked by the rounds above)
	170	&mov( $ty, &DWP(-8,$in,"",0));  # first input dword
	171	&xor( $tx, $ty);
	172	&mov( $ty, &DWP(-4,$in,"",0));  # second input dword
	173	&mov( &DWP(-8,$out,"",0), $tx);  # store first output dword
	174	&mov( $tx, &swtmp(1));  # second keystream dword
	175	&xor( $tx, $ty);
	176	&mov( $ty, &swtmp(2)); # load end ptr;
	177	&mov( &DWP(-4,$out,"",0), $tx);  # store second output dword
	178	&mov( $tx, &DWP(0,$d,$x,4));  # preload the table entry at x for the next round (skipped by round 7)
	179	&cmp($in, $ty);
	180	&jbe(&label("start"));  # loop while $in <= limit (unsigned), i.e. at least 8 bytes remain
	181
	182	&set_label("end");
	183
	184	# There is quite a bit of extra crap in RC4_loop() for this
	185	# first round
	186	&RC4_loop(0,-1,1);  # first tail round: turns the limit in $ty into the true end pointer
	187	&RC4_loop(1,0,1);  # every tail round jumps to "finished" as soon as the input is exhausted
	188	&RC4_loop(2,0,1);
	189	&RC4_loop(3,0,1);
	190	&RC4_loop(4,0,1);
	191	&RC4_loop(5,0,1);
	192	&RC4_loop(6,1,1);  # 7th and last tail round (at most 7 bytes can remain here)
	193
	194	&jmp(&label("finished"));
	195
	196	&align(16);
	197	# this is essentially Intel P4 specific codepath, see rc4_skey.c,
	198	# and is engaged in 0.9.8 and later context...
	199	&set_label("RC4_CHAR");
	200
	201	&lea ($ty,&DWP(0,$in,$ty));  # $ty still holds len, so this is end = in + len
	202	&mov (&swtmp(2),$ty);  # save the end pointer for the loop test
	203	&movz ($tx,&BP(0,$d,$x));  # preload the table entry at x (1-byte entries on this path)
	204
	205	# strangely enough unrolled loop performs over 20% slower...
	206	&set_label("RC4_CHAR_loop");
	207	&add (&LB($y),&LB($tx));  # y = (y + S[x]) mod 256
	208	&movz ($ty,&BP(0,$d,$y));  # $ty = S[y]
	209	&movb (&BP(0,$d,$y),&LB($tx));  # S[y] = old S[x]
	210	&movb (&BP(0,$d,$x),&LB($ty));  # S[x] = old S[y]
	211	&add (&LB($ty),&LB($tx));  # index = (S[x] + S[y]) mod 256
	212	&movz ($ty,&BP(0,$d,$ty));  # keystream byte
	213	&add (&LB($x),1);  # x = (x + 1) mod 256
	214	&xorb (&LB($ty),&BP(0,$in));  # keystream ^ input byte
	215	&lea ($in,&DWP(1,$in));  # in++
	216	&movz ($tx,&BP(0,$d,$x));  # preload S[x] for the next iteration
	217	&cmp ($in,&swtmp(2));  # reached the end pointer?
	218	&movb (&BP(0,$out),&LB($ty));  # store the output byte
	219	&lea ($out,&DWP(1,$out));  # out++ via lea, which leaves the flags from cmp intact
	220	&jb (&label("RC4_CHAR_loop"));  # loop while $in < end (unsigned)
	221
	222	&set_label("finished");
	223	&dec( $x);  # undo the look-ahead increment of x; only the low byte is written back
	224	&stack_pop(3);  # release the 3 temp variables
	225	&movb( &BP(-4,$d,"",0),&LB($y));  # write y back to key offset 4 ($d was advanced by 8)
	226	&movb( &BP(-8,$d,"",0),&LB($x));  # write x back to key offset 0
	227
	228	&function_end($name);  # helper from x86asm.pl; presumably restores the pushed registers and returns
	229	}
	230