diff options
Diffstat (limited to 'src/lib/libcrypto/rc4/asm/rc4-586.pl')
-rw-r--r-- | src/lib/libcrypto/rc4/asm/rc4-586.pl | 270 |
1 files changed, 270 insertions, 0 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl new file mode 100644 index 0000000000..38a44a70ef --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl | |||
@@ -0,0 +1,270 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | |||
3 | # ==================================================================== | ||
4 | # [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | |||
10 | # At some point it became apparent that the original SSLeay RC4 | ||
11 | # assembler implementation performs suboptimally on latest IA-32 | ||
12 | # microarchitectures. After re-tuning, performance has changed as | ||
13 | # follows: | ||
14 | # | ||
15 | # Pentium -10% | ||
16 | # Pentium III +12% | ||
17 | # AMD +50%(*) | ||
18 | # P4 +250%(**) | ||
19 | # | ||
20 | # (*) This number is actually a trade-off:-) It's possible to | ||
21 | # achieve +72%, but at the cost of -48% off PIII performance. | ||
22 | # In other words code performing further 13% faster on AMD | ||
23 | # would perform almost 2 times slower on Intel PIII... | ||
24 | # For reference! This code delivers ~80% of rc4-amd64.pl | ||
25 | # performance on the same Opteron machine. | ||
26 | # (**) This number requires compressed key schedule set up by | ||
27 | # RC4_set_key [see commentary below for further details]. | ||
28 | # | ||
29 | # <appro@fy.chalmers.se> | ||
30 | |||
31 | $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | ||
32 | push(@INC,"${dir}","${dir}../../perlasm"); | ||
33 | require "x86asm.pl"; | ||
34 | |||
# Initialise the perlasm x86 backend. The first command-line argument is
# passed through unchanged, together with this script's name.
35 | &asm_init($ARGV[0],"rc4-586.pl"); | ||
36 | |||
# Register assignment used while emitting the RC4 routine:
#   $xx  - index x; its low byte is loaded from key->x
#   $yy  - index y; its low byte is loaded from key->y
#   $tx  - schedule entry at index x; also reused as scratch
#          (input+len, and the re-biased output pointer in the 4x loop)
#   $ty  - len on entry, afterwards schedule entry at index y and the
#          index of the keystream entry
#   $inp - input pointer
#   $out - output pointer, kept re-biased as (out - inp); doubles as the
#          32-bit keystream accumulator inside the 4x loop
#   $dat - key schedule pointer
37 | $xx="eax"; | ||
38 | $yy="ebx"; | ||
39 | $tx="ecx"; | ||
40 | $ty="edx"; | ||
41 | $inp="esi"; | ||
42 | $out="ebp"; | ||
43 | $dat="edi"; | ||
44 | |||
# Emit one byte-step of the 4x unrolled RC4 loop on the dword-wide schedule.
#   $i - position of this step inside the unrolled group (called with 0..3).
# On entry $tx must already hold the schedule entry at index $xx.
# Step 0 moves the looked-up keystream entry into $out; later steps first
# rotate $out right by 8 and then OR the entry in, so $out accumulates the
# keystream bytes of the group.
45 | sub RC4_loop { | ||
46 | my $i=shift; | ||
47 | my $func = ($i==0)?*mov:*or; | ||
48 | |||
# y += tx (low byte only, so it wraps modulo 256)
49 | &add (&LB($yy),&LB($tx)); | ||
# swap the schedule entries at x and y: $ty <- dat[y]; dat[y] <- $tx; dat[x] <- $ty
50 | &mov ($ty,&DWP(0,$dat,$yy,4)); | ||
51 | &mov (&DWP(0,$dat,$yy,4),$tx); | ||
52 | &mov (&DWP(0,$dat,$xx,4),$ty); | ||
# $ty = ($ty + $tx) & 0xff is the index of the keystream entry;
# x is advanced in between (low byte only)
53 | &add ($ty,$tx); | ||
54 | &inc (&LB($xx)); | ||
55 | &and ($ty,0xff); | ||
56 | &ror ($out,8) if ($i!=0); | ||
# steps 0..2 preload dat[x] for the following step; the last step instead
# loads wparam(3) into $tx
57 | if ($i<3) { | ||
58 | &mov ($tx,&DWP(0,$dat,$xx,4)); | ||
59 | } else { | ||
60 | &mov ($tx,&wparam(3)); # reload [re-biased] out | ||
61 | } | ||
# fetch the keystream entry: mov for step 0, or for steps 1..3
62 | &$func ($out,&DWP(0,$dat,$ty,4)); | ||
63 | } | ||
64 | |||
# Emit the RC4 cipher routine (C prototype on the next line).
# x and y are read from the first two dwords of *key and the schedule starts
# at key+8. If the dword at schedule offset 256 equals -1 the schedule is
# treated as compressed (byte-wide entries) and the "RC4_CHAR" path is used;
# otherwise entries are dwords and the data is processed four bytes at a
# time in "loop4", with a byte-wise "loop1" for the remainder.
# A zero len jumps straight to "abort", leaving key->x and key->y untouched.
# The wparam(1..3) stack slots are reused as scratch storage by this routine.
65 | # void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out); | ||
66 | &function_begin("RC4"); | ||
67 | &mov ($dat,&wparam(0)); # load key schedule pointer | ||
68 | &mov ($ty, &wparam(1)); # load len | ||
69 | &mov ($inp,&wparam(2)); # load inp | ||
70 | &mov ($out,&wparam(3)); # load out | ||
71 | |||
72 | &xor ($xx,$xx); # avoid partial register stalls | ||
73 | &xor ($yy,$yy); | ||
74 | |||
75 | &cmp ($ty,0); # safety net | ||
76 | &je (&label("abort")); | ||
77 | |||
78 | &mov (&LB($xx),&BP(0,$dat)); # load key->x | ||
79 | &mov (&LB($yy),&BP(4,$dat)); # load key->y | ||
# from here on $dat points at the schedule itself, so key->x and key->y
# live at offsets -8 and -4
80 | &add ($dat,8); | ||
81 | |||
82 | &lea ($tx,&DWP(0,$inp,$ty)); | ||
83 | &sub ($out,$inp); # re-bias out | ||
84 | &mov (&wparam(1),$tx); # save input+len | ||
85 | |||
# x is pre-incremented here and decremented again at "done", so inside the
# loops $xx always indexes the entry to be used next
86 | &inc (&LB($xx)); | ||
87 | |||
88 | # detect compressed key schedule... | ||
89 | &cmp (&DWP(256,$dat),-1); | ||
90 | &je (&label("RC4_CHAR")); | ||
91 | |||
# preload the schedule entry at x, as the loops below expect it in $tx
92 | &mov ($tx,&DWP(0,$dat,$xx,4)); | ||
93 | |||
94 | &and ($ty,-4); # how many 4-byte chunks? | ||
95 | &jz (&label("loop1")); | ||
96 | |||
97 | &lea ($ty,&DWP(-4,$inp,$ty)); | ||
98 | &mov (&wparam(2),$ty); # save input+(len/4)*4-4 | ||
99 | &mov (&wparam(3),$out); # $out as accumulator in this loop | ||
100 | |||
101 | &set_label("loop4",16); | ||
102 | for ($i=0;$i<4;$i++) { RC4_loop($i); } | ||
# final rotate completes the 4-byte keystream word, which is XOR-ed with
# the input word
103 | &ror ($out,8); | ||
104 | &xor ($out,&DWP(0,$inp)); | ||
# the flags of this cmp are consumed by the jb below; the mov/lea/mov in
# between do not modify flags
105 | &cmp ($inp,&wparam(2)); # compare to input+(len/4)*4-4 | ||
106 | &mov (&DWP(0,$tx,$inp),$out);# $tx holds re-biased out here | ||
107 | &lea ($inp,&DWP(4,$inp)); | ||
108 | &mov ($tx,&DWP(0,$dat,$xx,4)); | ||
109 | &jb (&label("loop4")); | ||
110 | |||
111 | &cmp ($inp,&wparam(1)); # compare to input+len | ||
112 | &je (&label("done")); | ||
113 | &mov ($out,&wparam(3)); # restore $out | ||
114 | |||
# byte-at-a-time loop on the dword-wide schedule; handles the 1..3 trailing
# bytes, or the whole input when len < 4
115 | &set_label("loop1",16); | ||
116 | &add (&LB($yy),&LB($tx)); | ||
117 | &mov ($ty,&DWP(0,$dat,$yy,4)); | ||
118 | &mov (&DWP(0,$dat,$yy,4),$tx); | ||
119 | &mov (&DWP(0,$dat,$xx,4),$ty); | ||
120 | &add ($ty,$tx); | ||
121 | &inc (&LB($xx)); | ||
122 | &and ($ty,0xff); | ||
123 | &mov ($ty,&DWP(0,$dat,$ty,4)); | ||
124 | &xor (&LB($ty),&BP(0,$inp)); | ||
125 | &lea ($inp,&DWP(1,$inp)); | ||
126 | &mov ($tx,&DWP(0,$dat,$xx,4)); | ||
127 | &cmp ($inp,&wparam(1)); # compare to input+len | ||
# $inp has already been advanced, hence the -1 displacement; $out is the
# re-biased (out - inp) value
128 | &mov (&BP(-1,$out,$inp),&LB($ty)); | ||
129 | &jb (&label("loop1")); | ||
130 | |||
131 | &jmp (&label("done")); | ||
132 | |||
133 | # this is essentially Intel P4 specific codepath... | ||
# byte-at-a-time loop on the compressed schedule: entries are single bytes,
# loaded zero-extended with movz and indexed without scaling
134 | &set_label("RC4_CHAR",16); | ||
135 | &movz ($tx,&BP(0,$dat,$xx)); | ||
136 | # strangely enough unrolled loop performs over 20% slower... | ||
137 | &set_label("cloop1"); | ||
138 | &add (&LB($yy),&LB($tx)); | ||
139 | &movz ($ty,&BP(0,$dat,$yy)); | ||
140 | &mov (&BP(0,$dat,$yy),&LB($tx)); | ||
141 | &mov (&BP(0,$dat,$xx),&LB($ty)); | ||
# byte-wide add wraps modulo 256, so no explicit masking is emitted here
142 | &add (&LB($ty),&LB($tx)); | ||
143 | &movz ($ty,&BP(0,$dat,$ty)); | ||
144 | &add (&LB($xx),1); | ||
145 | &xor (&LB($ty),&BP(0,$inp)); | ||
146 | &lea ($inp,&DWP(1,$inp)); | ||
147 | &movz ($tx,&BP(0,$dat,$xx)); | ||
148 | &cmp ($inp,&wparam(1)); | ||
149 | &mov (&BP(-1,$out,$inp),&LB($ty)); | ||
150 | &jb (&label("cloop1")); | ||
151 | |||
# common exit: undo the pre-increment of x and write the low bytes of x and
# y back to the key structure
152 | &set_label("done"); | ||
153 | &dec (&LB($xx)); | ||
154 | &mov (&BP(-4,$dat),&LB($yy)); # save key->y | ||
155 | &mov (&BP(-8,$dat),&LB($xx)); # save key->x | ||
156 | &set_label("abort"); | ||
157 | &function_end("RC4"); | ||
158 | |||
159 | ######################################################################## | ||
160 | |||
# Register assignment used while emitting RC4_set_key ($inp and $out are
# reassigned; the RC4 routine above has already been emitted):
#   $inp - key data pointer, moved to point at the end of the data
#   $out - key structure pointer, then the address of key->data
#   $idi - negated key length, counted up towards zero as a data index
#   $ido - position i in the second (mixing) pass
#   $idx - first the address of OPENSSL_ia32cap_P, then the running index j
161 | $inp="esi"; | ||
162 | $out="edi"; | ||
163 | $idi="ebp"; | ||
164 | $ido="ecx"; | ||
165 | $idx="edx"; | ||
166 | |||
# presumably declares the capability vector as a symbol defined outside this
# module - confirm against x86asm.pl
167 | &external_label("OPENSSL_ia32cap_P"); | ||
168 | |||
# Emit the key schedule setup routine (C prototype on the next line).
# The schedule at key+8 is first filled with the values 0..255 and then
# permuted in a second 256-iteration pass that mixes in the key bytes,
# cycling through the key data as often as needed.
# Bit 20 of the first dword of OPENSSL_ia32cap_P selects the layout: when
# set, a compressed byte-wide schedule is built ("c" loops) and marked by
# storing -1 at schedule offset 256; otherwise a dword-wide schedule is
# built ("w" loops). In both cases key->x and key->y end up as zero.
169 | # void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data); | ||
170 | &function_begin("RC4_set_key"); | ||
171 | &mov ($out,&wparam(0)); # load key | ||
172 | &mov ($idi,&wparam(1)); # load len | ||
173 | &mov ($inp,&wparam(2)); # load data | ||
174 | &picmeup($idx,"OPENSSL_ia32cap_P"); | ||
175 | |||
176 | &lea ($out,&DWP(2*4,$out)); # &key->data | ||
177 | &lea ($inp,&DWP(0,$inp,$idi)); # $inp to point at the end | ||
# with $inp at the end of the data, the negative index $idi runs from -len
# up to 0; -len is stashed in the key->y slot so it can be reloaded on wrap
178 | &neg ($idi); | ||
179 | &xor ("eax","eax"); | ||
180 | &mov (&DWP(-4,$out),$idi); # borrow key->y | ||
181 | |||
182 | &bt (&DWP(0,$idx),20); # check for bit#20 | ||
183 | &jc (&label("c1stloop")); | ||
184 | |||
# first pass, dword-wide: the byte add carries out after 256 iterations,
# which ends the loop with eax back at zero
185 | &set_label("w1stloop",16); | ||
186 | &mov (&DWP(0,$out,"eax",4),"eax"); # key->data[i]=i; | ||
187 | &add (&LB("eax"),1); # i++; | ||
188 | &jnc (&label("w1stloop")); | ||
189 | |||
190 | &xor ($ido,$ido); | ||
191 | &xor ($idx,$idx); | ||
192 | |||
# second pass, dword-wide: j += data[k] + S[i], then swap S[i] and S[j]
193 | &set_label("w2ndloop",16); | ||
194 | &mov ("eax",&DWP(0,$out,$ido,4)); | ||
195 | &add (&LB($idx),&BP(0,$inp,$idi)); | ||
196 | &add (&LB($idx),&LB("eax")); | ||
# the zero flag from this add is tested by the jnz below; the mov in between
# does not modify flags
197 | &add ($idi,1); | ||
198 | &mov ("ebx",&DWP(0,$out,$idx,4)); | ||
199 | &jnz (&label("wnowrap")); | ||
# end of key data reached: restart from -len saved in the key->y slot
200 | &mov ($idi,&DWP(-4,$out)); | ||
201 | &set_label("wnowrap"); | ||
202 | &mov (&DWP(0,$out,$idx,4),"eax"); | ||
203 | &mov (&DWP(0,$out,$ido,4),"ebx"); | ||
204 | &add (&LB($ido),1); | ||
205 | &jnc (&label("w2ndloop")); | ||
206 | &jmp (&label("exit")); | ||
207 | |||
208 | # Unlike all other x86 [and x86_64] implementations, Intel P4 core | ||
209 | # [including EM64T] was found to perform poorly with above "32-bit" key | ||
210 | # schedule, a.k.a. RC4_INT. Performance improvement for IA-32 hand-coded | ||
211 | # assembler turned out to be 3.5x if re-coded for compressed 8-bit one, | ||
212 | # a.k.a. RC4_CHAR! It's however inappropriate to just switch to 8-bit | ||
213 | # schedule for x86[_64], because non-P4 implementations suffer from | ||
214 | # significant performance losses then, e.g. PIII exhibits >2x | ||
215 | # deterioration, and so does Opteron. In order to assure optimal | ||
216 | # all-round performance, we detect P4 at run-time and set up compressed | ||
217 | # key schedule, which is recognized by RC4 procedure. | ||
218 | |||
# first pass, byte-wide: same identity fill with one byte per entry
219 | &set_label("c1stloop",16); | ||
220 | &mov (&BP(0,$out,"eax"),&LB("eax")); # key->data[i]=i; | ||
221 | &add (&LB("eax"),1); # i++; | ||
222 | &jnc (&label("c1stloop")); | ||
223 | |||
224 | &xor ($ido,$ido); | ||
225 | &xor ($idx,$idx); | ||
# ebx is cleared here because only its low byte is written in the loop below
226 | &xor ("ebx","ebx"); | ||
227 | |||
# second pass, byte-wide: same mixing as "w2ndloop" on byte entries
228 | &set_label("c2ndloop",16); | ||
229 | &mov (&LB("eax"),&BP(0,$out,$ido)); | ||
230 | &add (&LB($idx),&BP(0,$inp,$idi)); | ||
231 | &add (&LB($idx),&LB("eax")); | ||
232 | &add ($idi,1); | ||
233 | &mov (&LB("ebx"),&BP(0,$out,$idx)); | ||
234 | &jnz (&label("cnowrap")); | ||
235 | &mov ($idi,&DWP(-4,$out)); | ||
236 | &set_label("cnowrap"); | ||
237 | &mov (&BP(0,$out,$idx),&LB("eax")); | ||
238 | &mov (&BP(0,$out,$ido),&LB("ebx")); | ||
239 | &add (&LB($ido),1); | ||
240 | &jnc (&label("c2ndloop")); | ||
241 | |||
# this marker dword is what the RC4 routine compares against -1
242 | &mov (&DWP(256,$out),-1); # mark schedule as compressed | ||
243 | |||
# common exit: this also overwrites the -len value borrowed in key->y
244 | &set_label("exit"); | ||
245 | &xor ("eax","eax"); | ||
246 | &mov (&DWP(-8,$out),"eax"); # key->x=0; | ||
247 | &mov (&DWP(-4,$out),"eax"); # key->y=0; | ||
248 | &function_end("RC4_set_key"); | ||
249 | |||
# Emit a routine returning a pointer to a string describing the active RC4
# variant (C prototype on the next line).
# The result is the address of "rc4(4x,int)" by default; when bit 20 of the
# first dword of OPENSSL_ia32cap_P is set, 12 is added so that it points at
# "rc4(1x,char)" instead.
250 | # const char *RC4_options(void); | ||
251 | &function_begin_B("RC4_options"); | ||
# call/pop pair obtains the address of "pic_point" at run time, so the
# string table can be addressed relative to it
252 | &call (&label("pic_point")); | ||
253 | &set_label("pic_point"); | ||
254 | &blindpop("eax"); | ||
255 | &lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax")); | ||
256 | &picmeup("edx","OPENSSL_ia32cap_P"); | ||
257 | &bt (&DWP(0,"edx"),20); | ||
258 | &jnc (&label("skip")); | ||
# 12 = the 11 characters of "rc4(4x,int)" plus one terminator byte
# (assumes asciz appends a single NUL - confirm against x86asm.pl)
259 | &add ("eax",12); | ||
260 | &set_label("skip"); | ||
261 | &ret (); | ||
# string table; the order of the first two entries must match the offset above
262 | &set_label("opts",64); | ||
263 | &asciz ("rc4(4x,int)"); | ||
264 | &asciz ("rc4(1x,char)"); | ||
265 | &asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>"); | ||
266 | &align (64); | ||
267 | &function_end_B("RC4_options"); | ||
268 | |||
# Finish the perlasm session started by asm_init (presumably this is where
# the generated assembly is written out - confirm against x86asm.pl).
269 | &asm_finish(); | ||
270 | |||