summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/rc4/asm/rc4-586.pl
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/rc4/asm/rc4-586.pl')
-rw-r--r--src/lib/libcrypto/rc4/asm/rc4-586.pl114
1 files changed, 85 insertions, 29 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl
index 7ef889e5a1..d6e98f0811 100644
--- a/src/lib/libcrypto/rc4/asm/rc4-586.pl
+++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl
@@ -1,16 +1,37 @@
1#!/usr/local/bin/perl 1#!/usr/local/bin/perl
2 2
3# define for pentium pro friendly version 3# At some point it became apparent that the original SSLeay RC4
4# assembler implementation performs suboptimally on latest IA-32
5# microarchitectures. After re-tuning, performance has changed as
6# follows:
7#
8# Pentium +0%
9# Pentium III +17%
10# AMD +52%(*)
11# P4 +180%(**)
12#
13# (*) This number is actually a trade-off:-) It's possible to
14# achieve +72%, but at the cost of -48% off PIII performance.
15# In other words, code performing a further 13% faster on AMD
16# would perform almost 2 times slower on Intel PIII...
17# For reference! This code delivers ~80% of rc4-amd64.pl
18# performance on the same Opteron machine.
19# (**) This number requires compressed key schedule set up by
20# RC4_set_key and therefore doesn't apply to 0.9.7 [option for
21# compressed key schedule is implemented in 0.9.8 and later,
22# see commentary section in rc4_skey.c for further details].
23#
24# <appro@fy.chalmers.se>
4 25
5push(@INC,"perlasm","../../perlasm"); 26push(@INC,"perlasm","../../perlasm");
6require "x86asm.pl"; 27require "x86asm.pl";
7 28
8&asm_init($ARGV[0],"rc4-586.pl"); 29&asm_init($ARGV[0],"rc4-586.pl");
9 30
10$tx="eax"; 31$x="eax";
11$ty="ebx"; 32$y="ebx";
12$x="ecx"; 33$tx="ecx";
13$y="edx"; 34$ty="edx";
14$in="esi"; 35$in="esi";
15$out="edi"; 36$out="edi";
16$d="ebp"; 37$d="ebp";
@@ -31,7 +52,7 @@ sub RC4_loop
31 { 52 {
32 &mov($ty, &swtmp(2)); 53 &mov($ty, &swtmp(2));
33 &cmp($ty, $in); 54 &cmp($ty, $in);
34 &jle(&label("finished")); 55 &jbe(&label("finished"));
35 &inc($in); 56 &inc($in);
36 } 57 }
37 else 58 else
@@ -39,27 +60,23 @@ sub RC4_loop
39 &add($ty, 8); 60 &add($ty, 8);
40 &inc($in); 61 &inc($in);
41 &cmp($ty, $in); 62 &cmp($ty, $in);
42 &jl(&label("finished")); 63 &jb(&label("finished"));
43 &mov(&swtmp(2), $ty); 64 &mov(&swtmp(2), $ty);
44 } 65 }
45 } 66 }
46 # Moved out 67 # Moved out
47 # &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0; 68 # &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0;
48 69
49 &add( $y, $tx); 70 &add( &LB($y), &LB($tx));
50 &and( $y, 0xff);
51 &inc( $x); # NEXT ROUND
52 &mov( $ty, &DWP(0,$d,$y,4)); 71 &mov( $ty, &DWP(0,$d,$y,4));
53 # XXX 72 # XXX
54 &mov( &DWP(-4,$d,$x,4),$ty); # AGI 73 &mov( &DWP(0,$d,$x,4),$ty);
55 &add( $ty, $tx); 74 &add( $ty, $tx);
56 &and( $x, 0xff); # NEXT ROUND
57 &and( $ty, 0xff);
58 &mov( &DWP(0,$d,$y,4),$tx); 75 &mov( &DWP(0,$d,$y,4),$tx);
59 &nop(); 76 &and( $ty, 0xff);
60 &mov( $ty, &DWP(0,$d,$ty,4)); 77 &inc( &LB($x)); # NEXT ROUND
61 &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND 78 &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
62 # XXX 79 &mov( $ty, &DWP(0,$d,$ty,4));
63 80
64 if (!$char) 81 if (!$char)
65 { 82 {
@@ -88,35 +105,47 @@ sub RC4
88 105
89 &function_begin_B($name,""); 106 &function_begin_B($name,"");
90 107
108 &mov($ty,&wparam(1)); # len
109 &cmp($ty,0);
110 &jne(&label("proceed"));
111 &ret();
112 &set_label("proceed");
113
91 &comment(""); 114 &comment("");
92 115
93 &push("ebp"); 116 &push("ebp");
94 &push("ebx"); 117 &push("ebx");
95 &mov( $d, &wparam(0)); # key
96 &mov( $ty, &wparam(1)); # num
97 &push("esi"); 118 &push("esi");
98 &push("edi"); 119 &xor( $x, $x); # avoid partial register stalls
120 &push("edi");
121 &xor( $y, $y); # avoid partial register stalls
122 &mov( $d, &wparam(0)); # key
123 &mov( $in, &wparam(2));
99 124
100 &mov( $x, &DWP(0,$d,"",1)); 125 &movb( &LB($x), &BP(0,$d,"",1));
101 &mov( $y, &DWP(4,$d,"",1)); 126 &movb( &LB($y), &BP(4,$d,"",1));
102 127
103 &mov( $in, &wparam(2)); 128 &mov( $out, &wparam(3));
104 &inc( $x); 129 &inc( &LB($x));
105 130
106 &stack_push(3); # 3 temp variables 131 &stack_push(3); # 3 temp variables
107 &add( $d, 8); 132 &add( $d, 8);
108 &and( $x, 0xff); 133
134 # detect compressed schedule, see commentary section in rc4_skey.c...
135 # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
136 # as compressed key schedule is set up in 0.9.8 and later.
137 &cmp(&DWP(256,$d),-1);
138 &je(&label("RC4_CHAR"));
109 139
110 &lea( $ty, &DWP(-8,$ty,$in)); 140 &lea( $ty, &DWP(-8,$ty,$in));
111 141
112 # check for 0 length input 142 # check for 0 length input
113 143
114 &mov( $out, &wparam(3));
115 &mov( &swtmp(2), $ty); # this is now address to exit at 144 &mov( &swtmp(2), $ty); # this is now address to exit at
116 &mov( $tx, &DWP(0,$d,$x,4)); 145 &mov( $tx, &DWP(0,$d,$x,4));
117 146
118 &cmp( $ty, $in); 147 &cmp( $ty, $in);
119 &jl( &label("end")); # less than 8 bytes 148 &jb( &label("end")); # less than 8 bytes
120 149
121 &set_label("start"); 150 &set_label("start");
122 151
@@ -148,7 +177,7 @@ sub RC4
148 &mov( &DWP(-4,$out,"",0), $tx); 177 &mov( &DWP(-4,$out,"",0), $tx);
149 &mov( $tx, &DWP(0,$d,$x,4)); 178 &mov( $tx, &DWP(0,$d,$x,4));
150 &cmp($in, $ty); 179 &cmp($in, $ty);
151 &jle(&label("start")); 180 &jbe(&label("start"));
152 181
153 &set_label("end"); 182 &set_label("end");
154 183
@@ -162,10 +191,37 @@ sub RC4
162 &RC4_loop(5,0,1); 191 &RC4_loop(5,0,1);
163 &RC4_loop(6,1,1); 192 &RC4_loop(6,1,1);
164 193
194 &jmp(&label("finished"));
195
196 &align(16);
197 # this is essentially Intel P4 specific codepath, see rc4_skey.c,
198 # and is engaged in 0.9.8 and later context...
199 &set_label("RC4_CHAR");
200
201 &lea ($ty,&DWP(0,$in,$ty));
202 &mov (&swtmp(2),$ty);
203
204 # strangely enough unrolled loop performs over 20% slower...
205 &set_label("RC4_CHAR_loop");
206 &movz ($tx,&BP(0,$d,$x));
207 &add (&LB($y),&LB($tx));
208 &movz ($ty,&BP(0,$d,$y));
209 &movb (&BP(0,$d,$y),&LB($tx));
210 &movb (&BP(0,$d,$x),&LB($ty));
211 &add (&LB($ty),&LB($tx));
212 &movz ($ty,&BP(0,$d,$ty));
213 &xorb (&LB($ty),&BP(0,$in));
214 &movb (&BP(0,$out),&LB($ty));
215 &inc (&LB($x));
216 &inc ($in);
217 &inc ($out);
218 &cmp ($in,&swtmp(2));
219 &jb (&label("RC4_CHAR_loop"));
220
165 &set_label("finished"); 221 &set_label("finished");
166 &dec( $x); 222 &dec( $x);
167 &stack_pop(3); 223 &stack_pop(3);
168 &mov( &DWP(-4,$d,"",0),$y); 224 &movb( &BP(-4,$d,"",0),&LB($y));
169 &movb( &BP(-8,$d,"",0),&LB($x)); 225 &movb( &BP(-8,$d,"",0),&LB($x));
170 226
171 &function_end($name); 227 &function_end($name);