path: root/src/lib/libcrypto/rc4/asm
Diffstat (limited to 'src/lib/libcrypto/rc4/asm')
-rw-r--r--  src/lib/libcrypto/rc4/asm/rc4-586.pl         410
-rw-r--r--  src/lib/libcrypto/rc4/asm/rc4-ia64.pl        755
-rw-r--r--  src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl  525
-rw-r--r--  src/lib/libcrypto/rc4/asm/rc4-parisc.pl      320
-rw-r--r--  src/lib/libcrypto/rc4/asm/rc4-s390x.pl       234
-rwxr-xr-x  src/lib/libcrypto/rc4/asm/rc4-x86_64.pl      543
6 files changed, 0 insertions, 2787 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl
deleted file mode 100644
index 84f1a798cb..0000000000
--- a/src/lib/libcrypto/rc4/asm/rc4-586.pl
+++ /dev/null
@@ -1,410 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# At some point it became apparent that the original SSLeay RC4
11# assembler implementation performs suboptimally on the latest IA-32
12# microarchitectures. After re-tuning, performance has changed as
13# follows:
14#
15# Pentium -10%
16# Pentium III +12%
17# AMD +50%(*)
18# P4 +250%(**)
19#
20# (*) This number is actually a trade-off:-) It's possible to
21# achieve +72%, but at the cost of -48% off PIII performance.
22# In other words, code performing a further 13% faster on AMD
23# would perform almost 2 times slower on an Intel PIII...
24# For reference! This code delivers ~80% of rc4-amd64.pl
25# performance on the same Opteron machine.
26# (**) This number requires compressed key schedule set up by
27# RC4_set_key [see commentary below for further details].
28#
29# <appro@fy.chalmers.se>
30
31# May 2011
32#
33# Optimize for Core2 and Westmere [and incidentally Opteron]. Current
34# performance in cycles per processed byte (less is better) and
35# improvement relative to the previous version of this module are:
36#
37# Pentium 10.2 # original numbers
38# Pentium III 7.8(*)
39# Intel P4 7.5
40#
41# Opteron 6.1/+20% # new MMX numbers
42# Core2 5.3/+67%(**)
43# Westmere 5.1/+94%(**)
44# Sandy Bridge 5.0/+8%
45# Atom 12.6/+6%
46#
47# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
48# but this specific code performs poorly on Core2. And vice
49# versa, below MMX/SSE code delivering 5.8/7.1 on Core2 performs
50# poorly on PIII, at 8.0/14.5:-( As PIII is not a "hot" CPU
51# [anymore], I chose to discard PIII-specific code path and opt
52# for original IALU-only code, which is why MMX/SSE code path
53# is guarded by SSE2 bit (see below), not MMX/SSE.
54# (**) Performance vs. block size on Core2 and Westmere had a maximum
55# at ... 64-byte block size. And it was quite a maximum, 40-60%
56# in comparison to the largest 8KB block size. The above improvement
57# coefficients are for the largest block size.
58
59$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60push(@INC,"${dir}","${dir}../../perlasm");
61require "x86asm.pl";
62
63&asm_init($ARGV[0],"rc4-586.pl");
64
65$xx="eax";
66$yy="ebx";
67$tx="ecx";
68$ty="edx";
69$inp="esi";
70$out="ebp";
71$dat="edi";
72
73sub RC4_loop {
74 my $i=shift;
75 my $func = ($i==0)?*mov:*or;
76
77 &add (&LB($yy),&LB($tx));
78 &mov ($ty,&DWP(0,$dat,$yy,4));
79 &mov (&DWP(0,$dat,$yy,4),$tx);
80 &mov (&DWP(0,$dat,$xx,4),$ty);
81 &add ($ty,$tx);
82 &inc (&LB($xx));
83 &and ($ty,0xff);
84 &ror ($out,8) if ($i!=0);
85 if ($i<3) {
86 &mov ($tx,&DWP(0,$dat,$xx,4));
87 } else {
88 &mov ($tx,&wparam(3)); # reload [re-biased] out
89 }
90 &$func ($out,&DWP(0,$dat,$ty,4));
91}
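
# For orientation: one call to RC4_loop above emits one RC4 round, and the
# "loop4" code below interleaves four of them, accumulating four key-stream
# bytes in $out. A minimal scalar Perl reference for a single round (a
# hypothetical sketch, not part of the original module -- $S is a 256-entry
# array ref holding the key schedule):
#
#	sub rc4_byte {
#		my ($S, $x, $y, $byte) = @_;
#		$x = ($x + 1) & 0xff;
#		$y = ($y + $S->[$x]) & 0xff;
#		@$S[$x, $y] = @$S[$y, $x];	# swap S[x] and S[y]
#		my $k = $S->[($S->[$x] + $S->[$y]) & 0xff];
#		return ($x, $y, $byte ^ $k);	# updated state + output byte
#	}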
92
93if ($alt=0) {
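	# note: "=" above is an assignment rather than a comparison -- it
	# deliberately pins $alt to 0 so that the pinsrw-based variant in
	# the else branch is used; change it to "$alt=1" to select this path.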
94 # >20% faster on Atom and Sandy Bridge[!], 8% faster on Opteron,
95 # but ~40% slower on Core2 and Westmere... Attempt to add movz
96 # brings down Opteron by 25%, Atom and Sandy Bridge by 15%, yet
97 # on Core2 with movz it's almost 20% slower than below alternative
98 # code... Yes, it's a total mess...
99 my @XX=($xx,$out);
100 $RC4_loop_mmx = sub { # SSE actually...
101 my $i=shift;
102 my $j=$i<=0?0:$i>>1;
103 my $mm=$i<=0?"mm0":"mm".($i&1);
104
105 &add (&LB($yy),&LB($tx));
106 &lea (@XX[1],&DWP(1,@XX[0]));
107 &pxor ("mm2","mm0") if ($i==0);
108 &psllq ("mm1",8) if ($i==0);
109 &and (@XX[1],0xff);
110 &pxor ("mm0","mm0") if ($i<=0);
111 &mov ($ty,&DWP(0,$dat,$yy,4));
112 &mov (&DWP(0,$dat,$yy,4),$tx);
113 &pxor ("mm1","mm2") if ($i==0);
114 &mov (&DWP(0,$dat,$XX[0],4),$ty);
115 &add (&LB($ty),&LB($tx));
116 &movd (@XX[0],"mm7") if ($i==0);
117 &mov ($tx,&DWP(0,$dat,@XX[1],4));
118 &pxor ("mm1","mm1") if ($i==1);
119 &movq ("mm2",&QWP(0,$inp)) if ($i==1);
120 &movq (&QWP(-8,(@XX[0],$inp)),"mm1") if ($i==0);
121 &pinsrw ($mm,&DWP(0,$dat,$ty,4),$j);
122
123 push (@XX,shift(@XX)) if ($i>=0);
124 }
125} else {
126# Using pinsrw here improves performance on Intel CPUs by 2-3%, but
127 # brings down AMD by 7%...
128 $RC4_loop_mmx = sub {
129 my $i=shift;
130
131 &add (&LB($yy),&LB($tx));
132 &psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1);
133 &mov ($ty,&DWP(0,$dat,$yy,4));
134 &mov (&DWP(0,$dat,$yy,4),$tx);
135 &mov (&DWP(0,$dat,$xx,4),$ty);
136 &inc ($xx);
137 &add ($ty,$tx);
138 &movz ($xx,&LB($xx)); # (*)
139 &movz ($ty,&LB($ty)); # (*)
140 &pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0);
141 &movq ("mm0",&QWP(0,$inp)) if ($i<=0);
142 &movq (&QWP(-8,($out,$inp)),"mm2") if ($i==0);
143 &mov ($tx,&DWP(0,$dat,$xx,4));
144 &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));
145
146 # (*) This is the key to Core2 and Westmere performance.
147# Without movz, the out-of-order execution logic confuses
148# itself and fails to reorder loads and stores. The problem
149# appears to be fixed in Sandy Bridge...
150 }
151}
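
# Either way the idea of the 8x code path below is the same: each pass
# through "loop_mmx" consumes 8 input bytes with a single movq, gathers 8
# key-stream bytes into an MMX register one piece at a time via
# movd/pinsrw/psllq, then xors and stores 8 output bytes with another movq.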
152
153&external_label("OPENSSL_ia32cap_P");
154
155# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
156&function_begin("RC4");
157 &mov ($dat,&wparam(0)); # load key schedule pointer
158 &mov ($ty, &wparam(1)); # load len
159 &mov ($inp,&wparam(2)); # load inp
160 &mov ($out,&wparam(3)); # load out
161
162 &xor ($xx,$xx); # avoid partial register stalls
163 &xor ($yy,$yy);
164
165 &cmp ($ty,0); # safety net
166 &je (&label("abort"));
167
168 &mov (&LB($xx),&BP(0,$dat)); # load key->x
169 &mov (&LB($yy),&BP(4,$dat)); # load key->y
170 &add ($dat,8);
171
172 &lea ($tx,&DWP(0,$inp,$ty));
173 &sub ($out,$inp); # re-bias out
174 &mov (&wparam(1),$tx); # save input+len
175
176 &inc (&LB($xx));
177
178 # detect compressed key schedule...
179 &cmp (&DWP(256,$dat),-1);
180 &je (&label("RC4_CHAR"));
181
182 &mov ($tx,&DWP(0,$dat,$xx,4));
183
184 &and ($ty,-4); # how many 4-byte chunks?
185 &jz (&label("loop1"));
186
187 &test ($ty,-8);
188 &mov (&wparam(3),$out); # $out as accumulator in these loops
189 &jz (&label("go4loop4"));
190
191 &picmeup($out,"OPENSSL_ia32cap_P");
192 &bt (&DWP(0,$out),26); # check SSE2 bit [could have been MMX]
193 &jnc (&label("go4loop4"));
194
195 &mov ($out,&wparam(3)) if (!$alt);
196 &movd ("mm7",&wparam(3)) if ($alt);
197 &and ($ty,-8);
198 &lea ($ty,&DWP(-8,$inp,$ty));
199 &mov (&DWP(-4,$dat),$ty); # save input+(len/8)*8-8
200
201 &$RC4_loop_mmx(-1);
202 &jmp(&label("loop_mmx_enter"));
203
204 &set_label("loop_mmx",16);
205 &$RC4_loop_mmx(0);
206 &set_label("loop_mmx_enter");
207 for ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); }
208 &mov ($ty,$yy);
209 &xor ($yy,$yy); # this is second key to Core2
210 &mov (&LB($yy),&LB($ty)); # and Westmere performance...
211 &cmp ($inp,&DWP(-4,$dat));
212 &lea ($inp,&DWP(8,$inp));
213 &jb (&label("loop_mmx"));
214
215 if ($alt) {
216 &movd ($out,"mm7");
217 &pxor ("mm2","mm0");
218 &psllq ("mm1",8);
219 &pxor ("mm1","mm2");
220 &movq (&QWP(-8,$out,$inp),"mm1");
221 } else {
222 &psllq ("mm1",56);
223 &pxor ("mm2","mm1");
224 &movq (&QWP(-8,$out,$inp),"mm2");
225 }
226 &emms ();
227
228 &cmp ($inp,&wparam(1)); # compare to input+len
229 &je (&label("done"));
230 &jmp (&label("loop1"));
231
232&set_label("go4loop4",16);
233 &lea ($ty,&DWP(-4,$inp,$ty));
234 &mov (&wparam(2),$ty); # save input+(len/4)*4-4
235
236 &set_label("loop4");
237 for ($i=0;$i<4;$i++) { RC4_loop($i); }
238 &ror ($out,8);
239 &xor ($out,&DWP(0,$inp));
240 &cmp ($inp,&wparam(2)); # compare to input+(len/4)*4-4
241 &mov (&DWP(0,$tx,$inp),$out);# $tx holds re-biased out here
242 &lea ($inp,&DWP(4,$inp));
243 &mov ($tx,&DWP(0,$dat,$xx,4));
244 &jb (&label("loop4"));
245
246 &cmp ($inp,&wparam(1)); # compare to input+len
247 &je (&label("done"));
248 &mov ($out,&wparam(3)); # restore $out
249
250 &set_label("loop1",16);
251 &add (&LB($yy),&LB($tx));
252 &mov ($ty,&DWP(0,$dat,$yy,4));
253 &mov (&DWP(0,$dat,$yy,4),$tx);
254 &mov (&DWP(0,$dat,$xx,4),$ty);
255 &add ($ty,$tx);
256 &inc (&LB($xx));
257 &and ($ty,0xff);
258 &mov ($ty,&DWP(0,$dat,$ty,4));
259 &xor (&LB($ty),&BP(0,$inp));
260 &lea ($inp,&DWP(1,$inp));
261 &mov ($tx,&DWP(0,$dat,$xx,4));
262 &cmp ($inp,&wparam(1)); # compare to input+len
263 &mov (&BP(-1,$out,$inp),&LB($ty));
264 &jb (&label("loop1"));
265
266 &jmp (&label("done"));
267
268# this is essentially Intel P4 specific codepath...
269&set_label("RC4_CHAR",16);
270 &movz ($tx,&BP(0,$dat,$xx));
271	# strangely enough, an unrolled loop performs over 20% slower...
272 &set_label("cloop1");
273 &add (&LB($yy),&LB($tx));
274 &movz ($ty,&BP(0,$dat,$yy));
275 &mov (&BP(0,$dat,$yy),&LB($tx));
276 &mov (&BP(0,$dat,$xx),&LB($ty));
277 &add (&LB($ty),&LB($tx));
278 &movz ($ty,&BP(0,$dat,$ty));
279 &add (&LB($xx),1);
280 &xor (&LB($ty),&BP(0,$inp));
281 &lea ($inp,&DWP(1,$inp));
282 &movz ($tx,&BP(0,$dat,$xx));
283 &cmp ($inp,&wparam(1));
284 &mov (&BP(-1,$out,$inp),&LB($ty));
285 &jb (&label("cloop1"));
286
287&set_label("done");
288 &dec (&LB($xx));
289 &mov (&DWP(-4,$dat),$yy); # save key->y
290 &mov (&BP(-8,$dat),&LB($xx)); # save key->x
291&set_label("abort");
292&function_end("RC4");
293
294########################################################################
295
296$inp="esi";
297$out="edi";
298$idi="ebp";
299$ido="ecx";
300$idx="edx";
301
302# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
303&function_begin("RC4_set_key");
304 &mov ($out,&wparam(0)); # load key
305 &mov ($idi,&wparam(1)); # load len
306 &mov ($inp,&wparam(2)); # load data
307 &picmeup($idx,"OPENSSL_ia32cap_P");
308
309 &lea ($out,&DWP(2*4,$out)); # &key->data
310 &lea ($inp,&DWP(0,$inp,$idi)); # $inp to point at the end
311 &neg ($idi);
312 &xor ("eax","eax");
313 &mov (&DWP(-4,$out),$idi); # borrow key->y
314
315 &bt (&DWP(0,$idx),20); # check for bit#20
316 &jc (&label("c1stloop"));
317
318&set_label("w1stloop",16);
319 &mov (&DWP(0,$out,"eax",4),"eax"); # key->data[i]=i;
320 &add (&LB("eax"),1); # i++;
321 &jnc (&label("w1stloop"));
322
323 &xor ($ido,$ido);
324 &xor ($idx,$idx);
325
326&set_label("w2ndloop",16);
327 &mov ("eax",&DWP(0,$out,$ido,4));
328 &add (&LB($idx),&BP(0,$inp,$idi));
329 &add (&LB($idx),&LB("eax"));
330 &add ($idi,1);
331 &mov ("ebx",&DWP(0,$out,$idx,4));
332 &jnz (&label("wnowrap"));
333 &mov ($idi,&DWP(-4,$out));
334 &set_label("wnowrap");
335 &mov (&DWP(0,$out,$idx,4),"eax");
336 &mov (&DWP(0,$out,$ido,4),"ebx");
337 &add (&LB($ido),1);
338 &jnc (&label("w2ndloop"));
339&jmp (&label("exit"));
340
341# Unlike all other x86 [and x86_64] implementations, the Intel P4 core
342# [including EM64T] was found to perform poorly with the above "32-bit"
343# key schedule, a.k.a. RC4_INT. The performance improvement for IA-32
344# hand-coded assembler turned out to be 3.5x when re-coded for the
345# compressed 8-bit one, a.k.a. RC4_CHAR! It is, however, inappropriate
346# to just switch to the 8-bit schedule for x86[_64], because non-P4
347# implementations then suffer significant performance losses, e.g. PIII
348# exhibits >2x deterioration, and so does Opteron. In order to assure
349# optimal all-round performance, we detect P4 at run-time and set up a
350# compressed key schedule, which is recognized by the RC4 procedure.
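#
# Concretely (cross-reference, no new logic): c2ndloop below ends with
# "&mov (&DWP(256,$out),-1)", planting a -1 marker word right past the
# 256 schedule entries, and RC4() above tests exactly that word with
# "&cmp (&DWP(256,$dat),-1)" before branching to the RC4_CHAR codepath.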
351
352&set_label("c1stloop",16);
353 &mov (&BP(0,$out,"eax"),&LB("eax")); # key->data[i]=i;
354 &add (&LB("eax"),1); # i++;
355 &jnc (&label("c1stloop"));
356
357 &xor ($ido,$ido);
358 &xor ($idx,$idx);
359 &xor ("ebx","ebx");
360
361&set_label("c2ndloop",16);
362 &mov (&LB("eax"),&BP(0,$out,$ido));
363 &add (&LB($idx),&BP(0,$inp,$idi));
364 &add (&LB($idx),&LB("eax"));
365 &add ($idi,1);
366 &mov (&LB("ebx"),&BP(0,$out,$idx));
367 &jnz (&label("cnowrap"));
368 &mov ($idi,&DWP(-4,$out));
369 &set_label("cnowrap");
370 &mov (&BP(0,$out,$idx),&LB("eax"));
371 &mov (&BP(0,$out,$ido),&LB("ebx"));
372 &add (&LB($ido),1);
373 &jnc (&label("c2ndloop"));
374
375 &mov (&DWP(256,$out),-1); # mark schedule as compressed
376
377&set_label("exit");
378 &xor ("eax","eax");
379 &mov (&DWP(-8,$out),"eax"); # key->x=0;
380 &mov (&DWP(-4,$out),"eax"); # key->y=0;
381&function_end("RC4_set_key");
382
383# const char *RC4_options(void);
384&function_begin_B("RC4_options");
385 &call (&label("pic_point"));
386&set_label("pic_point");
387 &blindpop("eax");
388 &lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
389 &picmeup("edx","OPENSSL_ia32cap_P");
390 &mov ("edx",&DWP(0,"edx"));
391 &bt ("edx",20);
392 &jc (&label("1xchar"));
393 &bt ("edx",26);
394 &jnc (&label("ret"));
395 &add ("eax",25);
396 &ret ();
397&set_label("1xchar");
398 &add ("eax",12);
399&set_label("ret");
400 &ret ();
401&set_label("opts",64);
402&asciz ("rc4(4x,int)");
403&asciz ("rc4(1x,char)");
404&asciz ("rc4(8x,mmx)");
405&asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
406&align (64);
407&function_end_B("RC4_options");
408
409&asm_finish();
410
diff --git a/src/lib/libcrypto/rc4/asm/rc4-ia64.pl b/src/lib/libcrypto/rc4/asm/rc4-ia64.pl
deleted file mode 100644
index 49cd5b5e69..0000000000
--- a/src/lib/libcrypto/rc4/asm/rc4-ia64.pl
+++ /dev/null
@@ -1,755 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by David Mosberger <David.Mosberger@acm.org> based on the
5# Itanium optimized Crypto code which was released by HP Labs at
6# http://www.hpl.hp.com/research/linux/crypto/.
7#
8# Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
9#
10# Permission is hereby granted, free of charge, to any person obtaining
11# a copy of this software and associated documentation files (the
12# "Software"), to deal in the Software without restriction, including
13# without limitation the rights to use, copy, modify, merge, publish,
14# distribute, sublicense, and/or sell copies of the Software, and to
15# permit persons to whom the Software is furnished to do so, subject to
16# the following conditions:
17#
18# The above copyright notice and this permission notice shall be
19# included in all copies or substantial portions of the Software.
20
21# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
28
29
30
31# This is a little helper program which generates a software-pipelined
32# loop for RC4 encryption. The basic algorithm looks like this:
33#
34# for (counter = 0; counter < len; ++counter)
35# {
36# in = inp[counter];
37# SI = S[I];
38# J = (SI + J) & 0xff;
39# SJ = S[J];
40# T = (SI + SJ) & 0xff;
41# S[I] = SJ, S[J] = SI;
42# ST = S[T];
43# outp[counter] = in ^ ST;
44# I = (I + 1) & 0xff;
45# }
46#
47# Pipelining this loop isn't easy, because the stores to the S[] array
48# need to be observed in the right order. The loop generated by the
49# code below has the following pipeline diagram:
50#
51# cycle
52# | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
53# iter
54# 1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
55# 2: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
56# 3: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
57#
58# where:
59# LDI = load of S[I]
60# LDJ = load of S[J]
61# SWP = swap of S[I] and S[J]
62# LDT = load of S[T]
63#
64# Note that in the above diagram, the major trouble-spot is that LDI
65# of the 2nd iteration is performed BEFORE the SWP of the first
66# iteration. Fortunately, this is easy to detect (I of the 1st
67# iteration will be equal to J of the 2nd iteration) and when this
68# happens, we simply forward the proper value from the 1st iteration
69# to the 2nd one. The proper value in this case is simply the value
70# of S[I] from the first iteration (thanks to the fact that SWP
71# simply swaps the contents of S[I] and S[J]).
72#
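# In pseudocode the forwarding amounts to (sketch only):
#
#	if (I of iteration n == J of iteration n+1)
#		SI of iteration n+1 = SI of iteration n;
#
# which is what the out-of-line .rc4Bypass blocks emitted further down
# implement.
#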
73# Another potential trouble-spot is in cycle 7, where SWP of the 1st
74# iteration issues at the same time as the LDI of the 3rd iteration.
75# However, thanks to IA-64 execution semantics, this can be taken
76# care of simply by placing LDI later in the instruction-group than
77# SWP. IA-64 CPUs will automatically forward the value if they
78# detect that the SWP and LDI are accessing the same memory-location.
79
80# The core-loop that can be pipelined then looks like this (annotated
81# with McKinley/Madison issue port & latency numbers, assuming L1
82# cache hits for the most part):
83
84# operation: instruction: issue-ports: latency
85# ------------------ ----------------------------- ------------- -------
86
87# Data = *inp++ ld1 data = [inp], 1 M0-M1 1 cyc c0
88# shladd Iptr = I, KeyTable, 3 M0-M3, I0, I1 1 cyc
89# I = (I + 1) & 0xff padd1 nextI = I, one M0-M3, I0, I1 3 cyc
90# ;;
91# SI = S[I] ld8 SI = [Iptr] M0-M1 1 cyc c1 * after SWAP!
92# ;;
93# cmp.eq.unc pBypass = I, J * after J is valid!
94# J = SI + J add J = J, SI M0-M3, I0, I1 1 cyc c2
95# (pBypass) br.cond.spnt Bypass
96# ;;
97# ---------------------------------------------------------------------------------------
98# J = J & 0xff zxt1 J = J I0, I1, 1 cyc c3
99# ;;
100# shladd Jptr = J, KeyTable, 3 M0-M3, I0, I1 1 cyc c4
101# ;;
102# SJ = S[J] ld8 SJ = [Jptr] M0-M1 1 cyc c5
103# ;;
104# ---------------------------------------------------------------------------------------
105# T = (SI + SJ) add T = SI, SJ M0-M3, I0, I1 1 cyc c6
106# ;;
107# T = T & 0xff zxt1 T = T I0, I1 1 cyc
108# S[I] = SJ st8 [Iptr] = SJ M2-M3 c7
109# S[J] = SI st8 [Jptr] = SI M2-M3
110# ;;
111# shladd Tptr = T, KeyTable, 3 M0-M3, I0, I1 1 cyc c8
112# ;;
113# ---------------------------------------------------------------------------------------
114# T = S[T] ld8 T = [Tptr] M0-M1 1 cyc c9
115# ;;
116# data ^= T xor data = data, T M0-M3, I0, I1 1 cyc c10
117# ;;
118# *out++ = Data ^ T dep word = word, data, 8, POS I0, I1 1 cyc c11
119# ;;
120# ---------------------------------------------------------------------------------------
121
122# There are several points worth making here:
123
124# - Note that due to the bypass/forwarding-path, the first two
125# phases of the loop are strangely mingled together. In
126# particular, note that the first stage of the pipeline is
127# using the value of "J", as calculated by the second stage.
128# - Each bundle-pair will have exactly 6 instructions.
129# - Pipelined, the loop can execute in 3 cycles/iteration and
130# 4 stages. However, McKinley/Madison can issue "st1" to
131# the same bank at a rate of at most one per 4 cycles. Thus,
132# instead of storing each byte, we accumulate them in a word
133# and then write them back at once with a single "st8" (this
134# implies that the setup code needs to ensure that the output
135# buffer is properly aligned, if need be, by encoding the
136# first few bytes separately).
137# - There is no space for a "br.ctop" instruction. For this
138# reason we can't use modulo-loop support on IA-64 and have
139# to do a traditional, purely software-pipelined loop.
140# - We can't replace any of the remaining "add/zxt1" pairs with
141# "padd1" because the latency for that instruction is too high
142# and would push the loop to the point where more bypasses
143# would be needed, which we don't have space for.
144# - The above loop runs at around 3.26 cycles/byte, or roughly
145# 440 MByte/sec on a 1.5GHz Madison. This is well below the
146# system bus bandwidth and hence with judicious use of
147# "lfetch" this loop can run at (almost) peak speed even when
148# the input and output data reside in memory. The
149# max. latency that can be tolerated is (PREFETCH_DISTANCE *
150# L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
151# least) 1-ahead prefetching of 128 byte cache-lines. Note
152# that we do NOT prefetch into L1, since that would only
153# interfere with the S[] table values stored there. This is
154# acceptable because there is a 10 cycle latency between
155# load and first use of the input data.
156# - We use a branch to out-of-line bypass-code because of cycle-pressure:
157# we calculate the next J, check for the need to activate the
158# bypass path, and activate the bypass path ALL IN THE SAME
159# CYCLE. If we didn't have these constraints, we could do
160# the bypass with a simple conditional move instruction.
161# Fortunately, the bypass paths get activated relatively
162# infrequently, so the extra branches don't cost all that much
163# (about 0.04 cycles/byte, measured on a 16396 byte file with
164# random input data).
165#
166
167$phases = 4; # number of stages/phases in the pipelined-loop
168$unroll_count = 6; # number of times we unrolled it
169$pComI = (1 << 0);
170$pComJ = (1 << 1);
171$pComT = (1 << 2);
172$pOut = (1 << 3);
173
174$NData = 4;
175$NIP = 3;
176$NJP = 2;
177$NI = 2;
178$NSI = 3;
179$NSJ = 2;
180$NT = 2;
181$NOutWord = 2;
182
183#
184# $threshold is the minimum length before we attempt to use the
185# big software-pipelined loop. It MUST be greater-or-equal
186# to:
187# PHASES * (UNROLL_COUNT + 1) + 7
188#
189# The "+ 7" comes from the fact we may have to encode up to
190# 7 bytes separately before the output pointer is aligned.
191#
192$threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
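
# With the values above ($phases = 4, $unroll_count = 6) this evaluates
# to 3 * (4 * 7) + 7 = 91 bytes, comfortably above the stated minimum of
# 4 * 7 + 7 = 35.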
193
194sub I {
195 local *code = shift;
196 local $format = shift;
197 $code .= sprintf ("\t\t".$format."\n", @_);
198}
199
200sub P {
201 local *code = shift;
202 local $format = shift;
203 $code .= sprintf ($format."\n", @_);
204}
205
206sub STOP {
207 local *code = shift;
208 $code .=<<___;
209 ;;
210___
211}
212
213sub emit_body {
214 local *c = shift;
215 local *bypass = shift;
216 local ($iteration, $p) = @_;
217
218 local $i0 = $iteration;
219 local $i1 = $iteration - 1;
220 local $i2 = $iteration - 2;
221 local $i3 = $iteration - 3;
222 local $iw0 = ($iteration - 3) / 8;
223 local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
224 local $byte_num = ($iteration - 3) % 8;
225 local $label = $iteration + 1;
226 local $pAny = ($p & 0xf) == 0xf;
227 local $pByp = (($p & $pComI) && ($iteration > 0));
228
229 $c.=<<___;
230//////////////////////////////////////////////////
231___
232
233 if (($p & 0xf) == 0) {
234 $c.="#ifdef HOST_IS_BIG_ENDIAN\n";
235 &I(\$c,"shr.u OutWord[%u] = OutWord[%u], 32;;",
236 $iw1 % $NOutWord, $iw1 % $NOutWord);
237 $c.="#endif\n";
238 &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
239 return;
240 }
241
242 # Cycle 0
243 &I(\$c, "{ .mmi") if ($pAny);
244 &I(\$c, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData) if ($p & $pComI);
245 &I(\$c, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
246 &I(\$c, "zxt1 J = J") if ($p & $pComJ);
247 &I(\$c, "}") if ($pAny);
248 &I(\$c, "{ .mmi") if ($pAny);
249 &I(\$c, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT) if ($p & $pOut);
250 &I(\$c, "add T[%u] = SI[%u], SJ[%u]",
251 $i0 % $NT, $i2 % $NSI, $i1 % $NSJ) if ($p & $pComT);
252 &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
253 &I(\$c, "}") if ($pAny);
254 &STOP(\$c);
255
256 # Cycle 1
257 &I(\$c, "{ .mmi") if ($pAny);
258 &I(\$c, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
259 &I(\$c, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
260 &I(\$c, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
261 &I(\$c, "}") if ($pAny);
262 &I(\$c, "{ .mmi") if ($pAny);
263 &I(\$c, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
264 &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP) if ($p & $pComJ);
265 &I(\$c, "xor Data[%u] = Data[%u], T[%u]",
266 $i3 % $NData, $i3 % $NData, $i1 % $NT) if ($p & $pOut);
267 &I(\$c, "}") if ($pAny);
268 &STOP(\$c);
269
270 # Cycle 2
271 &I(\$c, "{ .mmi") if ($pAny);
272 &I(\$c, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
273 &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI) if ($pByp);
274 &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
275 $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
276 &I(\$c, "}") if ($pAny);
277 &I(\$c, "{ .mmb") if ($pAny);
278 &I(\$c, "add J = J, SI[%u]", $i0 % $NSI) if ($p & $pComI);
279 &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
280 &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
281 &I(\$c, "}") if ($pAny);
282 &STOP(\$c);
283
284 &P(\$c, ".rc4Resume%u:", $label) if ($pByp);
285 if ($byte_num == 0 && $iteration >= $phases) {
286 &I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
287 $iw1 % $NOutWord) if ($p & $pOut);
288 if ($iteration == (1 + $unroll_count) * $phases - 1) {
289 if ($unroll_count == 6) {
290 &I(\$c, "mov OutWord[%u] = OutWord[%u]",
291 $iw1 % $NOutWord, $iw0 % $NOutWord);
292 }
293 &I(\$c, "lfetch.nt1 [InPrefetch], %u",
294 $unroll_count * $phases);
295 &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
296 $unroll_count * $phases);
297 &I(\$c, "br.cloop.sptk.few .rc4Loop");
298 }
299 }
300
301 if ($pByp) {
302 &P(\$bypass, ".rc4Bypass%u:", $label);
303 &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
304 &I(\$bypass, "nop 0");
305 &I(\$bypass, "nop 0");
306 &I(\$bypass, ";;");
307 &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
308 &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
309 &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
310 &I(\$bypass, ";;");
311 }
312}
313
314$code=<<___;
315.ident \"rc4-ia64.s, version 3.0\"
316.ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
317
318#define LCSave r8
319#define PRSave r9
320
321/* Inputs become invalid once rotation begins! */
322
323#define StateTable in0
324#define DataLen in1
325#define InputBuffer in2
326#define OutputBuffer in3
327
328#define KTable r14
329#define J r15
330#define InPtr r16
331#define OutPtr r17
332#define InPrefetch r18
333#define OutPrefetch r19
334#define One r20
335#define LoopCount r21
336#define Remainder r22
337#define IFinal r23
338#define EndPtr r24
339
340#define tmp0 r25
341#define tmp1 r26
342
343#define pBypass p6
344#define pDone p7
345#define pSmall p8
346#define pAligned p9
347#define pUnaligned p10
348
349#define pComputeI pPhase[0]
350#define pComputeJ pPhase[1]
351#define pComputeT pPhase[2]
352#define pOutput pPhase[3]
353
354#define RetVal r8
355#define L_OK p7
356#define L_NOK p8
357
358#define _NINPUTS 4
359#define _NOUTPUT 0
360
361#define _NROTATE 24
362#define _NLOCALS (_NROTATE - _NINPUTS - _NOUTPUT)
363
364#ifndef SZ
365# define SZ 4 // this must be set to sizeof(RC4_INT)
366#endif
367
368#if SZ == 1
369# define LKEY ld1
370# define SKEY st1
371# define KEYADDR(dst, i) add dst = i, KTable
372#elif SZ == 2
373# define LKEY ld2
374# define SKEY st2
375# define KEYADDR(dst, i) shladd dst = i, 1, KTable
376#elif SZ == 4
377# define LKEY ld4
378# define SKEY st4
379# define KEYADDR(dst, i) shladd dst = i, 2, KTable
380#else
381# define LKEY ld8
382# define SKEY st8
383# define KEYADDR(dst, i) shladd dst = i, 3, KTable
384#endif
385
386#if defined(_HPUX_SOURCE) && !defined(_LP64)
387# define ADDP addp4
388#else
389# define ADDP add
390#endif
391
392/* Define a macro for the bit number of the n-th byte: */
393
394#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
395# define HOST_IS_BIG_ENDIAN
396# define BYTE_POS(n) (56 - (8 * (n)))
397#else
398# define BYTE_POS(n) (8 * (n))
399#endif
400
401/*
402 We must perform the first phase of the pipeline explicitly since
403 we will always load from the stable the first time. The br.cexit
404 will never be taken since regardless of the number of bytes because
405 the epilogue count is 4.
406*/
407/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
408 assembler failed on original macro with syntax error. <appro> */
409#define MODSCHED_RC4_PROLOGUE \\
410 { \\
411 ld1 Data[0] = [InPtr], 1; \\
412 add IFinal = 1, I[1]; \\
413 KEYADDR(IPr[0], I[1]); \\
414 } ;; \\
415 { \\
416 LKEY SI[0] = [IPr[0]]; \\
417 mov pr.rot = 0x10000; \\
418 mov ar.ec = 4; \\
419 } ;; \\
420 { \\
421 add J = J, SI[0]; \\
422 zxt1 I[0] = IFinal; \\
423 br.cexit.spnt.few .+16; /* never taken */ \\
424 } ;;
425#define MODSCHED_RC4_LOOP(label) \\
426label: \\
427 { .mmi; \\
428 (pComputeI) ld1 Data[0] = [InPtr], 1; \\
429 (pComputeI) add IFinal = 1, I[1]; \\
430 (pComputeJ) zxt1 J = J; \\
431 }{ .mmi; \\
432 (pOutput) LKEY T[1] = [T[1]]; \\
433 (pComputeT) add T[0] = SI[2], SJ[1]; \\
434 (pComputeI) KEYADDR(IPr[0], I[1]); \\
435 } ;; \\
436 { .mmi; \\
437 (pComputeT) SKEY [IPr[2]] = SJ[1]; \\
438 (pComputeT) SKEY [JP[1]] = SI[2]; \\
439 (pComputeT) zxt1 T[0] = T[0]; \\
440 }{ .mmi; \\
441 (pComputeI) LKEY SI[0] = [IPr[0]]; \\
442 (pComputeJ) KEYADDR(JP[0], J); \\
443 (pComputeI) cmp.eq.unc pBypass, p0 = I[1], J; \\
444 } ;; \\
445 { .mmi; \\
446 (pComputeJ) LKEY SJ[0] = [JP[0]]; \\
447 (pOutput) xor Data[3] = Data[3], T[1]; \\
448 nop 0x0; \\
449 }{ .mmi; \\
450 (pComputeT) KEYADDR(T[0], T[0]); \\
451 (pBypass) mov SI[0] = SI[1]; \\
452 (pComputeI) zxt1 I[0] = IFinal; \\
453 } ;; \\
454 { .mmb; \\
455 (pOutput) st1 [OutPtr] = Data[3], 1; \\
456 (pComputeI) add J = J, SI[0]; \\
457 br.ctop.sptk.few label; \\
458 } ;;
459
460 .text
461
462 .align 32
463
464 .type RC4, \@function
465 .global RC4
466
467 .proc RC4
468 .prologue
469
470RC4:
471 {
472 .mmi
473 alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
474
475 .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
476 OutWord[2]
477 .rotp pPhase[4]
478
479 ADDP InPrefetch = 0, InputBuffer
480 ADDP KTable = 0, StateTable
481 }
482 {
483 .mmi
484 ADDP InPtr = 0, InputBuffer
485 ADDP OutPtr = 0, OutputBuffer
486 mov RetVal = r0
487 }
488 ;;
489 {
490 .mmi
491 lfetch.nt1 [InPrefetch], 0x80
492 ADDP OutPrefetch = 0, OutputBuffer
493 }
494 { // Return 0 if the input length is nonsensical
495 .mib
496 ADDP StateTable = 0, StateTable
497 cmp.ge.unc L_NOK, L_OK = r0, DataLen
498 (L_NOK) br.ret.sptk.few rp
499 }
500 ;;
501 {
502 .mib
503 cmp.eq.or L_NOK, L_OK = r0, InPtr
504 cmp.eq.or L_NOK, L_OK = r0, OutPtr
505 nop 0x0
506 }
507 {
508 .mib
509 cmp.eq.or L_NOK, L_OK = r0, StateTable
510 nop 0x0
511 (L_NOK) br.ret.sptk.few rp
512 }
513 ;;
514 LKEY I[1] = [KTable], SZ
515/* Prefetch the state-table. It contains 256 elements of size SZ */
516
517#if SZ == 1
518 ADDP tmp0 = 1*128, StateTable
519#elif SZ == 2
520 ADDP tmp0 = 3*128, StateTable
521 ADDP tmp1 = 2*128, StateTable
522#elif SZ == 4
523 ADDP tmp0 = 7*128, StateTable
524 ADDP tmp1 = 6*128, StateTable
525#elif SZ == 8
526 ADDP tmp0 = 15*128, StateTable
527 ADDP tmp1 = 14*128, StateTable
528#endif
529 ;;
530#if SZ >= 8
531 lfetch.fault.nt1 [tmp0], -256 // 15
532 lfetch.fault.nt1 [tmp1], -256;;
533 lfetch.fault.nt1 [tmp0], -256 // 13
534 lfetch.fault.nt1 [tmp1], -256;;
535 lfetch.fault.nt1 [tmp0], -256 // 11
536 lfetch.fault.nt1 [tmp1], -256;;
537 lfetch.fault.nt1 [tmp0], -256 // 9
538 lfetch.fault.nt1 [tmp1], -256;;
539#endif
540#if SZ >= 4
541 lfetch.fault.nt1 [tmp0], -256 // 7
542 lfetch.fault.nt1 [tmp1], -256;;
543 lfetch.fault.nt1 [tmp0], -256 // 5
544 lfetch.fault.nt1 [tmp1], -256;;
545#endif
546#if SZ >= 2
547 lfetch.fault.nt1 [tmp0], -256 // 3
548 lfetch.fault.nt1 [tmp1], -256;;
549#endif
550 {
551 .mii
552 lfetch.fault.nt1 [tmp0] // 1
553 add I[1]=1,I[1];;
554 zxt1 I[1]=I[1]
555 }
556 {
557 .mmi
558 lfetch.nt1 [InPrefetch], 0x80
559 lfetch.excl.nt1 [OutPrefetch], 0x80
560 .save pr, PRSave
561 mov PRSave = pr
562 } ;;
563 {
564 .mmi
565 lfetch.excl.nt1 [OutPrefetch], 0x80
566 LKEY J = [KTable], SZ
567 ADDP EndPtr = DataLen, InPtr
568 } ;;
569 {
570 .mmi
571 ADDP EndPtr = -1, EndPtr // Make it point to
572 // last data byte.
573 mov One = 1
574 .save ar.lc, LCSave
575 mov LCSave = ar.lc
576 .body
577 } ;;
578 {
579 .mmb
580 sub Remainder = 0, OutPtr
581 cmp.gtu pSmall, p0 = $threshold, DataLen
582(pSmall) br.cond.dpnt .rc4Remainder // Data too small for
583 // big loop.
584 } ;;
585 {
586 .mmi
587 and Remainder = 0x7, Remainder
588 ;;
589 cmp.eq pAligned, pUnaligned = Remainder, r0
590 nop 0x0
591 } ;;
592 {
593 .mmb
594.pred.rel "mutex",pUnaligned,pAligned
595(pUnaligned) add Remainder = -1, Remainder
596(pAligned) sub Remainder = EndPtr, InPtr
597(pAligned) br.cond.dptk.many .rc4Aligned
598 } ;;
599 {
600 .mmi
601 nop 0x0
602 nop 0x0
603 mov.i ar.lc = Remainder
604 }
605
606/* Do the initial few bytes via the compact, modulo-scheduled loop
607 until the output pointer is 8-byte-aligned. */
608
609 MODSCHED_RC4_PROLOGUE
610 MODSCHED_RC4_LOOP(.RC4AlignLoop)
611
612 {
613 .mib
614 sub Remainder = EndPtr, InPtr
615 zxt1 IFinal = IFinal
616 clrrrb // Clear CFM.rrb.pr so
617 ;; // next "mov pr.rot = N"
618 // does the right thing.
619 }
620 {
621 .mmi
622 mov I[1] = IFinal
623 nop 0x0
624 nop 0x0
625 } ;;
626
627
628.rc4Aligned:
629
630/*
631 Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
632 */
633
634 {
635 .mlx
636 add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
637 movl Remainder = 0xaaaaaaaaaaaaaaab
638 } ;;
639 {
640 .mmi
641 setf.sig f6 = LoopCount // M2, M3 6 cyc
642 setf.sig f7 = Remainder // M2, M3 6 cyc
643 nop 0x0
644 } ;;
645 {
646 .mfb
647 nop 0x0
648 xmpy.hu f6 = f6, f7
649 nop 0x0
650 } ;;
651 {
652 .mmi
653 getf.sig LoopCount = f6;; // M2 5 cyc
654 nop 0x0
655 shr.u LoopCount = LoopCount, 4
656 } ;;
657 {
658 .mmi
659 nop 0x0
660 nop 0x0
661 mov.i ar.lc = LoopCount
662 } ;;
663
664/* Now comes the unrolled loop: */
665
666.rc4Prologue:
667___
668
669$iteration = 0;
670
671# Generate the prologue:
672$predicates = 1;
673for ($i = 0; $i < $phases; ++$i) {
674 &emit_body (\$code, \$bypass, $iteration++, $predicates);
675 $predicates = ($predicates << 1) | 1;
676}
677
678$code.=<<___;
679.rc4Loop:
680___
681
682# Generate the body:
683for ($i = 0; $i < $unroll_count*$phases; ++$i) {
684 &emit_body (\$code, \$bypass, $iteration++, $predicates);
685}
686
687$code.=<<___;
688.rc4Epilogue:
689___
690
691# Generate the epilogue:
692for ($i = 0; $i < $phases; ++$i) {
693 $predicates <<= 1;
694 &emit_body (\$code, \$bypass, $iteration++, $predicates);
695}
696
697$code.=<<___;
698 {
699 .mmi
700 lfetch.nt1 [EndPtr] // fetch line with last byte
701 mov IFinal = I[1]
702 nop 0x0
703 }
704
705.rc4Remainder:
706 {
707 .mmi
708 sub Remainder = EndPtr, InPtr // Calculate
709 // # of bytes
710 // left - 1
711 nop 0x0
712 nop 0x0
713 } ;;
714 {
715 .mib
716 cmp.eq pDone, p0 = -1, Remainder // done already?
717 mov.i ar.lc = Remainder
718(pDone) br.cond.dptk.few .rc4Complete
719 }
720
721/* Do the remaining bytes via the compact, modulo-scheduled loop */
722
723 MODSCHED_RC4_PROLOGUE
724 MODSCHED_RC4_LOOP(.RC4RestLoop)
725
726.rc4Complete:
727 {
728 .mmi
729 add KTable = -SZ, KTable
730 add IFinal = -1, IFinal
731 mov ar.lc = LCSave
732 } ;;
733 {
734 .mii
735 SKEY [KTable] = J,-SZ
736 zxt1 IFinal = IFinal
737 mov pr = PRSave, 0x1FFFF
738 } ;;
739 {
740 .mib
741 SKEY [KTable] = IFinal
742 add RetVal = 1, r0
743 br.ret.sptk.few rp
744 } ;;
745___
746
747# Last but not least, emit the code for the bypass-code of the unrolled loop:
748
749$code.=$bypass;
750
751$code.=<<___;
752 .endp RC4
753___
754
755print $code;
diff --git a/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl
deleted file mode 100644
index 501d9e936b..0000000000
--- a/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl
+++ /dev/null
@@ -1,525 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# June 2011
11#
12# This is an RC4+MD5 "stitch" implementation. The idea, as spelled out in
13# http://download.intel.com/design/intarch/papers/323686.pdf, is that
14# since both algorithms exhibit instruction-level parallelism, ILP,
15# below the theoretical maximum, interleaving them allows better
16# utilization of processor resources and better performance. The RC4
17# instruction sequence is virtually identical to rc4-x86_64.pl, which
18# is heavily based on a submission by Maxim Perminov, Maxim Locktyukhin
19# and Jim Guilford of Intel. MD5 is a fresh implementation aiming to
20# minimize register usage; it is used as the "main thread", with RC4
21# weaved into it, one RC4 round per MD5 round. In addition to the
22# stitched subroutine the script can generate standalone replacement
23# md5_block_asm_data_order and RC4 routines. Below are performance
24# numbers in cycles per processed byte (less is better) for the
25# standalone subroutines, their sum, and the stitched one:
26#
27# RC4 MD5 RC4+MD5 stitch gain
28# Opteron 6.5(*) 5.4 11.9 7.0 +70%(*)
29# Core2 6.5 5.8 12.3 7.7 +60%
30# Westmere 4.3 5.2 9.5 7.0 +36%
31# Sandy Bridge 4.2 5.5 9.7 6.8 +43%
32# Atom 9.3 6.5 15.8 11.1 +42%
33#
34# (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
35# is +53%...
36
37my ($rc4,$md5)=(1,1); # what to generate?
38my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(),
39			# but its result is discarded. The idea here is
40 # to be able to use 'openssl speed rc4' for
41 # benchmarking the stitched subroutine...
42
43my $flavour = shift;
44my $output = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
48( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
49( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
50die "can't locate x86_64-xlate.pl";
51
52open OUT,"| \"$^X\" $xlate $flavour $output";
53*STDOUT=*OUT;
54
55my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);
56
57if ($rc4 && !$md5) {
58 ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx");
59 $func="RC4"; $nargs=4;
60} elsif ($md5 && !$rc4) {
61 ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx");
62 $func="md5_block_asm_data_order"; $nargs=3;
63} else {
64 ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
65 $func="rc4_md5_enc"; $nargs=6;
66 # void rc4_md5_enc(
67 # RC4_KEY *key, #
68 # const void *in0, # RC4 input
69 # void *out, # RC4 output
70 # MD5_CTX *ctx, #
71 # const void *inp, # MD5 input
72 # size_t len); # number of 64-byte blocks
73}
74
75my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
76 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
77 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
78 0x6b901122,0xfd987193,0xa679438e,0x49b40821,
79
80 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
81 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
82 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
83 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
84
85 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
86 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
87 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
88 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
89
90 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
91 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
92 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
93 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 );
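
# (These are the standard MD5 sine-table constants, K[i] = floor(2^32 *
# abs(sin(i+1))); in Perl they could be regenerated with, e.g.,
# "my @K = map { int(2**32 * abs(sin($_))) } 1..64;" -- kept literal
# above for clarity.)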
94
95my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers
96my $tmp="%r12d";
97
98my @XX=("%rbp","%rsi"); # RC4 registers
99my @TX=("%rax","%rbx");
100my $YY="%rcx";
101my $TY="%rdx";
102
103my $MOD=32; # 16, 32 or 64
104
105$code.=<<___;
106.text
107.align 16
108
109.globl $func
110.type $func,\@function,$nargs
111$func:
112 cmp \$0,$len
113 je .Labort
114 push %rbx
115 push %rbp
116 push %r12
117 push %r13
118 push %r14
119 push %r15
120 sub \$40,%rsp
121.Lbody:
122___
123if ($rc4) {
124$code.=<<___;
125$D#md5# mov $ctx,%r11 # reassign arguments
126 mov $len,%r12
127 mov $in0,%r13
128 mov $out,%r14
129$D#md5# mov $inp,%r15
130___
131 $ctx="%r11" if ($md5); # reassign arguments
132 $len="%r12";
133 $in0="%r13";
134 $out="%r14";
135 $inp="%r15" if ($md5);
136 $inp=$in0 if (!$md5);
137$code.=<<___;
138 xor $XX[0],$XX[0]
139 xor $YY,$YY
140
141 lea 8($dat),$dat
142 mov -8($dat),$XX[0]#b
143 mov -4($dat),$YY#b
144
145 inc $XX[0]#b
146 sub $in0,$out
147 movl ($dat,$XX[0],4),$TX[0]#d
148___
149$code.=<<___ if (!$md5);
150 xor $TX[1],$TX[1]
151 test \$-128,$len
152 jz .Loop1
153 sub $XX[0],$TX[1]
154 and \$`$MOD-1`,$TX[1]
155 jz .Loop${MOD}_is_hot
156 sub $TX[1],$len
157.Loop${MOD}_warmup:
158 add $TX[0]#b,$YY#b
159 movl ($dat,$YY,4),$TY#d
160 movl $TX[0]#d,($dat,$YY,4)
161 movl $TY#d,($dat,$XX[0],4)
162 add $TY#b,$TX[0]#b
163 inc $XX[0]#b
164 movl ($dat,$TX[0],4),$TY#d
165 movl ($dat,$XX[0],4),$TX[0]#d
166 xorb ($in0),$TY#b
167 movb $TY#b,($out,$in0)
168 lea 1($in0),$in0
169 dec $TX[1]
170 jnz .Loop${MOD}_warmup
171
172 mov $YY,$TX[1]
173 xor $YY,$YY
174 mov $TX[1]#b,$YY#b
175
176.Loop${MOD}_is_hot:
177 mov $len,32(%rsp) # save original $len
178 shr \$6,$len # number of 64-byte blocks
179___
180 if ($D && !$md5) { # stitch in dummy MD5
181 $md5=1;
182 $ctx="%r11";
183 $inp="%r15";
184 $code.=<<___;
185 mov %rsp,$ctx
186 mov $in0,$inp
187___
188 }
189}
190$code.=<<___;
191#rc4# add $TX[0]#b,$YY#b
192#rc4# lea ($dat,$XX[0],4),$XX[1]
193 shl \$6,$len
194 add $inp,$len # pointer to the end of input
195 mov $len,16(%rsp)
196
197#md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX
198#md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX
199#md5# mov 1*4($ctx),$V[1]
200#md5# mov 2*4($ctx),$V[2]
201#md5# mov 3*4($ctx),$V[3]
202 jmp .Loop
203
204.align 16
205.Loop:
206#md5# mov $V[0],0*4(%rsp) # put aside current hash value
207#md5# mov $V[1],1*4(%rsp)
208#md5# mov $V[2],2*4(%rsp)
209#md5# mov $V[3],$tmp # forward reference
210#md5# mov $V[3],3*4(%rsp)
211___
212
213sub R0 {
214 my ($i,$a,$b,$c,$d)=@_;
215 my @rot0=(7,12,17,22);
216 my $j=$i%16;
217 my $k=$i%$MOD;
218 my $xmm="%xmm".($j&1);
219 $code.=" movdqu ($in0),%xmm2\n" if ($rc4 && $j==15);
220 $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
221 $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
222 $code.=<<___;
223#rc4# movl ($dat,$YY,4),$TY#d
224#md5# xor $c,$tmp
225#rc4# movl $TX[0]#d,($dat,$YY,4)
226#md5# and $b,$tmp
227#md5# add 4*`$j`($inp),$a
228#rc4# add $TY#b,$TX[0]#b
229#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
230#md5# add \$$K[$i],$a
231#md5# xor $d,$tmp
232#rc4# movz $TX[0]#b,$TX[0]#d
233#rc4# movl $TY#d,4*$k($XX[1])
234#md5# add $tmp,$a
235#rc4# add $TX[1]#b,$YY#b
236#md5# rol \$$rot0[$j%4],$a
237#md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference
238#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
239#md5# add $b,$a
240___
241 $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
242 mov $YY,$XX[1]
243 xor $YY,$YY # keyword to partial register
244 mov $XX[1]#b,$YY#b
245 lea ($dat,$XX[0],4),$XX[1]
246___
247 $code.=<<___ if ($rc4 && $j==15);
248 psllq \$8,%xmm1
249 pxor %xmm0,%xmm2
250 pxor %xmm1,%xmm2
251___
252}
253sub R1 {
254 my ($i,$a,$b,$c,$d)=@_;
255 my @rot1=(5,9,14,20);
256 my $j=$i%16;
257 my $k=$i%$MOD;
258 my $xmm="%xmm".($j&1);
259 $code.=" movdqu 16($in0),%xmm3\n" if ($rc4 && $j==15);
260 $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
261 $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
262 $code.=<<___;
263#rc4# movl ($dat,$YY,4),$TY#d
264#md5# xor $b,$tmp
265#rc4# movl $TX[0]#d,($dat,$YY,4)
266#md5# and $d,$tmp
267#md5# add 4*`((1+5*$j)%16)`($inp),$a
268#rc4# add $TY#b,$TX[0]#b
269#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
270#md5# add \$$K[$i],$a
271#md5# xor $c,$tmp
272#rc4# movz $TX[0]#b,$TX[0]#d
273#rc4# movl $TY#d,4*$k($XX[1])
274#md5# add $tmp,$a
275#rc4# add $TX[1]#b,$YY#b
276#md5# rol \$$rot1[$j%4],$a
277#md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference
278#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
279#md5# add $b,$a
280___
281 $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
282 mov $YY,$XX[1]
283 xor $YY,$YY # keyword to partial register
284 mov $XX[1]#b,$YY#b
285 lea ($dat,$XX[0],4),$XX[1]
286___
287 $code.=<<___ if ($rc4 && $j==15);
288 psllq \$8,%xmm1
289 pxor %xmm0,%xmm3
290 pxor %xmm1,%xmm3
291___
292}
293sub R2 {
294 my ($i,$a,$b,$c,$d)=@_;
295 my @rot2=(4,11,16,23);
296 my $j=$i%16;
297 my $k=$i%$MOD;
298 my $xmm="%xmm".($j&1);
299 $code.=" movdqu 32($in0),%xmm4\n" if ($rc4 && $j==15);
300 $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
301 $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
302 $code.=<<___;
303#rc4# movl ($dat,$YY,4),$TY#d
304#md5# xor $c,$tmp
305#rc4# movl $TX[0]#d,($dat,$YY,4)
306#md5# xor $b,$tmp
307#md5# add 4*`((5+3*$j)%16)`($inp),$a
308#rc4# add $TY#b,$TX[0]#b
309#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
310#md5# add \$$K[$i],$a
311#rc4# movz $TX[0]#b,$TX[0]#d
312#md5# add $tmp,$a
313#rc4# movl $TY#d,4*$k($XX[1])
314#rc4# add $TX[1]#b,$YY#b
315#md5# rol \$$rot2[$j%4],$a
316#md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference
317#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
318#md5# add $b,$a
319___
320 $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
321 mov $YY,$XX[1]
322 xor $YY,$YY # keyword to partial register
323 mov $XX[1]#b,$YY#b
324 lea ($dat,$XX[0],4),$XX[1]
325___
326 $code.=<<___ if ($rc4 && $j==15);
327 psllq \$8,%xmm1
328 pxor %xmm0,%xmm4
329 pxor %xmm1,%xmm4
330___
331}
332sub R3 {
333 my ($i,$a,$b,$c,$d)=@_;
334 my @rot3=(6,10,15,21);
335 my $j=$i%16;
336 my $k=$i%$MOD;
337 my $xmm="%xmm".($j&1);
338 $code.=" movdqu 48($in0),%xmm5\n" if ($rc4 && $j==15);
339 $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
340 $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
341 $code.=<<___;
342#rc4# movl ($dat,$YY,4),$TY#d
343#md5# xor $d,$tmp
344#rc4# movl $TX[0]#d,($dat,$YY,4)
345#md5# or $b,$tmp
346#md5# add 4*`((7*$j)%16)`($inp),$a
347#rc4# add $TY#b,$TX[0]#b
348#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
349#md5# add \$$K[$i],$a
350#rc4# movz $TX[0]#b,$TX[0]#d
351#md5# xor $c,$tmp
352#rc4# movl $TY#d,4*$k($XX[1])
353#md5# add $tmp,$a
354#rc4# add $TX[1]#b,$YY#b
355#md5# rol \$$rot3[$j%4],$a
356#md5# mov \$-1,$tmp # forward reference
357#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
358#md5# add $b,$a
359___
360 $code.=<<___ if ($rc4 && $j==15);
361 mov $XX[0],$XX[1]
362 xor $XX[0],$XX[0] # keyword to partial register
363 mov $XX[1]#b,$XX[0]#b
364 mov $YY,$XX[1]
365 xor $YY,$YY # keyword to partial register
366 mov $XX[1]#b,$YY#b
367 lea ($dat,$XX[0],4),$XX[1]
368 psllq \$8,%xmm1
369 pxor %xmm0,%xmm5
370 pxor %xmm1,%xmm5
371___
372}
373
374my $i=0;
375for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
376for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
377for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
378for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
379
380$code.=<<___;
381#md5# add 0*4(%rsp),$V[0] # accumulate hash value
382#md5# add 1*4(%rsp),$V[1]
383#md5# add 2*4(%rsp),$V[2]
384#md5# add 3*4(%rsp),$V[3]
385
386#rc4# movdqu %xmm2,($out,$in0) # write RC4 output
387#rc4# movdqu %xmm3,16($out,$in0)
388#rc4# movdqu %xmm4,32($out,$in0)
389#rc4# movdqu %xmm5,48($out,$in0)
390#md5# lea 64($inp),$inp
391#rc4# lea 64($in0),$in0
392 cmp 16(%rsp),$inp # are we done?
393 jb .Loop
394
395#md5# mov 24(%rsp),$len # restore pointer to MD5_CTX
396#rc4# sub $TX[0]#b,$YY#b # correct $YY
397#md5# mov $V[0],0*4($len) # write MD5_CTX
398#md5# mov $V[1],1*4($len)
399#md5# mov $V[2],2*4($len)
400#md5# mov $V[3],3*4($len)
401___
402$code.=<<___ if ($rc4 && (!$md5 || $D));
403 mov 32(%rsp),$len # restore original $len
404 and \$63,$len # remaining bytes
405 jnz .Loop1
406 jmp .Ldone
407
408.align 16
409.Loop1:
410 add $TX[0]#b,$YY#b
411 movl ($dat,$YY,4),$TY#d
412 movl $TX[0]#d,($dat,$YY,4)
413 movl $TY#d,($dat,$XX[0],4)
414 add $TY#b,$TX[0]#b
415 inc $XX[0]#b
416 movl ($dat,$TX[0],4),$TY#d
417 movl ($dat,$XX[0],4),$TX[0]#d
418 xorb ($in0),$TY#b
419 movb $TY#b,($out,$in0)
420 lea 1($in0),$in0
421 dec $len
422 jnz .Loop1
423
424.Ldone:
425___
426$code.=<<___;
427#rc4# sub \$1,$XX[0]#b
428#rc4# movl $XX[0]#d,-8($dat)
429#rc4# movl $YY#d,-4($dat)
430
431 mov 40(%rsp),%r15
432 mov 48(%rsp),%r14
433 mov 56(%rsp),%r13
434 mov 64(%rsp),%r12
435 mov 72(%rsp),%rbp
436 mov 80(%rsp),%rbx
437 lea 88(%rsp),%rsp
438.Lepilogue:
439.Labort:
440 ret
441.size $func,.-$func
442___
443
444if ($rc4 && $D) { # sole purpose of this section is to provide
445 # option to use the generated module as drop-in
446 # replacement for rc4-x86_64.pl for debugging
447 # and testing purposes...
448my ($idx,$ido)=("%r8","%r9");
449my ($dat,$len,$inp)=("%rdi","%rsi","%rdx");
450
451$code.=<<___;
452.globl RC4_set_key
453.type RC4_set_key,\@function,3
454.align 16
455RC4_set_key:
456 lea 8($dat),$dat
457 lea ($inp,$len),$inp
458 neg $len
459 mov $len,%rcx
460 xor %eax,%eax
461 xor $ido,$ido
462 xor %r10,%r10
463 xor %r11,%r11
464 jmp .Lw1stloop
465
466.align 16
467.Lw1stloop:
468 mov %eax,($dat,%rax,4)
469 add \$1,%al
470 jnc .Lw1stloop
471
472 xor $ido,$ido
473 xor $idx,$idx
474.align 16
475.Lw2ndloop:
476 mov ($dat,$ido,4),%r10d
477 add ($inp,$len,1),$idx#b
478 add %r10b,$idx#b
479 add \$1,$len
480 mov ($dat,$idx,4),%r11d
481 cmovz %rcx,$len
482 mov %r10d,($dat,$idx,4)
483 mov %r11d,($dat,$ido,4)
484 add \$1,$ido#b
485 jnc .Lw2ndloop
486
487 xor %eax,%eax
488 mov %eax,-8($dat)
489 mov %eax,-4($dat)
490 ret
491.size RC4_set_key,.-RC4_set_key
492
493.globl RC4_options
494.type RC4_options,\@abi-omnipotent
495.align 16
496RC4_options:
497 lea .Lopts(%rip),%rax
498 ret
499.align 64
500.Lopts:
501.asciz "rc4(64x,int)"
502.align 64
503.size RC4_options,.-RC4_options
504___
505}
506
507sub reg_part {
508my ($reg,$conv)=@_;
509 if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
510 elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
511 elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
512 elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
513 return $reg;
514}
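
# e.g. "%rax#b" becomes "%al", "%rax#d" becomes "%eax", while "%r12#b"
# and "%r12#d" become "%r12b" and "%r12d" -- this is what lets the code
# above use size-agnostic register names such as $TX[0]#b.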
515
516$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
517$code =~ s/\`([^\`]*)\`/eval $1/gem;
518$code =~ s/pinsrw\s+\$0,/movd /gm;
519
520$code =~ s/#md5#//gm if ($md5);
521$code =~ s/#rc4#//gm if ($rc4);
522
523print $code;
524
525close STDOUT;
diff --git a/src/lib/libcrypto/rc4/asm/rc4-parisc.pl b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl
deleted file mode 100644
index 7e7974430a..0000000000
--- a/src/lib/libcrypto/rc4/asm/rc4-parisc.pl
+++ /dev/null
@@ -1,320 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# RC4 for PA-RISC.
11
12# June 2009.
13#
14# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
15# For reference, [4x] unrolled loop is >40% faster than folded one.
16# It's possible to unroll the loop 8 times on PA-RISC 2.0, but the
17# improvement is believed to be insufficient to justify the effort...
18#
19# Special thanks to polarhome.com for providing HP-UX account.
20
21$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
22
23$flavour = shift;
24$output = shift;
25open STDOUT,">$output";
26
27if ($flavour =~ /64/) {
28 $LEVEL ="2.0W";
29 $SIZE_T =8;
30 $FRAME_MARKER =80;
31 $SAVED_RP =16;
32 $PUSH ="std";
33 $PUSHMA ="std,ma";
34 $POP ="ldd";
35 $POPMB ="ldd,mb";
36} else {
37 $LEVEL ="1.0";
38 $SIZE_T =4;
39 $FRAME_MARKER =48;
40 $SAVED_RP =20;
41 $PUSH ="stw";
42 $PUSHMA ="stwm";
43 $POP ="ldw";
44 $POPMB ="ldwm";
45}
46
47$FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker
48 # [+ argument transfer]
49$SZ=1; # defaults to RC4_CHAR
50if (open CONF,"<${dir}../../opensslconf.h") {
51 while(<CONF>) {
52 if (m/#\s*define\s+RC4_INT\s+(.*)/) {
53 $SZ = ($1=~/char$/) ? 1 : 4;
54 last;
55 }
56 }
57 close CONF;
58}
59
60if ($SZ==1) { # RC4_CHAR
61 $LD="ldb";
62 $LDX="ldbx";
63 $MKX="addl";
64 $ST="stb";
65} else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
66 $LD="ldw";
67 $LDX="ldwx,s";
68 $MKX="sh2addl";
69 $ST="stw";
70}
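
# ($MKX forms &key[index]: a plain "addl" suffices for 1-byte RC4_CHAR
# entries, while "sh2addl" -- shift-left-by-2-and-add -- scales the index
# by 4 for RC4_INT entries; $LD/$LDX/$ST are sized to match.)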
71
72$key="%r26";
73$len="%r25";
74$inp="%r24";
75$out="%r23";
76
77@XX=("%r19","%r20");
78@TX=("%r21","%r22");
79$YY="%r28";
80$TY="%r29";
81
82$acc="%r1";
83$ix="%r2";
84$iy="%r3";
85$dat0="%r4";
86$dat1="%r5";
87$rem="%r6";
88$mask="%r31";
89
90sub unrolledloopbody {
91for ($i=0;$i<4;$i++) {
92$code.=<<___;
93 ldo 1($XX[0]),$XX[1]
94 `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
95 and $mask,$XX[1],$XX[1]
96 $LDX $YY($key),$TY
97 $MKX $YY,$key,$ix
98 $LDX $XX[1]($key),$TX[1]
99 $MKX $XX[0],$key,$iy
100 $ST $TX[0],0($ix)
101 comclr,<> $XX[1],$YY,%r0 ; conditional
102 copy $TX[0],$TX[1] ; move
103 `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
104 $ST $TY,0($iy)
105 addl $TX[0],$TY,$TY
106 addl $TX[1],$YY,$YY
107 and $mask,$TY,$TY
108 and $mask,$YY,$YY
109___
110push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
111} }
112
113sub foldedloop {
114my ($label,$count)=@_;
115$code.=<<___;
116$label
117 $MKX $YY,$key,$iy
118 $LDX $YY($key),$TY
119 $MKX $XX[0],$key,$ix
120 $ST $TX[0],0($iy)
121 ldo 1($XX[0]),$XX[0]
122 $ST $TY,0($ix)
123 addl $TX[0],$TY,$TY
124 ldbx $inp($out),$dat1
125 and $mask,$TY,$TY
126 and $mask,$XX[0],$XX[0]
127 $LDX $TY($key),$acc
128 $LDX $XX[0]($key),$TX[0]
129 ldo 1($out),$out
130 xor $dat1,$acc,$acc
131 addl $TX[0],$YY,$YY
132 stb $acc,-1($out)
133 addib,<> -1,$count,$label ; $count is always small
134 and $mask,$YY,$YY
135___
136}
137
138$code=<<___;
139 .LEVEL $LEVEL
140#if 0
141 .SPACE \$TEXT\$
142 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
143#else
144 .text
145#endif
146
147 .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
148RC4
149 .PROC
150 .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
151 .ENTRY
152 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
153 $PUSHMA %r3,$FRAME(%sp)
154 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
155 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
156 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
157
158 cmpib,*= 0,$len,L\$abort
159 sub $inp,$out,$inp ; distance between $inp and $out
160
161 $LD `0*$SZ`($key),$XX[0]
162 $LD `1*$SZ`($key),$YY
163 ldo `2*$SZ`($key),$key
164
165 ldi 0xff,$mask
166 ldi 3,$dat0
167
168 ldo 1($XX[0]),$XX[0] ; warm up loop
169 and $mask,$XX[0],$XX[0]
170 $LDX $XX[0]($key),$TX[0]
171 addl $TX[0],$YY,$YY
172 cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother?
173 and $mask,$YY,$YY
174
175 and,<> $out,$dat0,$rem ; is $out aligned?
176 b L\$alignedout
177 subi 4,$rem,$rem
178 sub $len,$rem,$len
179___
180&foldedloop("L\$alignout",$rem); # process till $out is aligned
181
182$code.=<<___;
183L\$alignedout ; $len is at least 4 here
184 and,<> $inp,$dat0,$acc ; is $inp aligned?
185 b L\$oop4
186 sub $inp,$acc,$rem ; align $inp
187
188 sh3addl $acc,%r0,$acc
189 subi 32,$acc,$acc
190 mtctl $acc,%cr11 ; load %sar with vshd align factor
191 ldwx $rem($out),$dat0
192 ldo 4($rem),$rem
193L\$oop4misalignedinp
194___
195&unrolledloopbody();
196$code.=<<___;
197 $LDX $TY($key),$ix
198 ldwx $rem($out),$dat1
199 ldo -4($len),$len
200 or $ix,$acc,$acc ; last piece, no need to dep
201 vshd $dat0,$dat1,$iy ; align data
202 copy $dat1,$dat0
203 xor $iy,$acc,$acc
204 stw $acc,0($out)
205 cmpib,*<< 3,$len,L\$oop4misalignedinp
206 ldo 4($out),$out
207 cmpib,*= 0,$len,L\$done
208 nop
209 b L\$oop1
210 nop
211
212 .ALIGN 8
213L\$oop4
214___
215&unrolledloopbody();
216$code.=<<___;
217 $LDX $TY($key),$ix
218 ldwx $inp($out),$dat0
219 ldo -4($len),$len
220 or $ix,$acc,$acc ; last piece, no need to dep
221 xor $dat0,$acc,$acc
222 stw $acc,0($out)
223 cmpib,*<< 3,$len,L\$oop4
224 ldo 4($out),$out
225 cmpib,*= 0,$len,L\$done
226 nop
227___
228&foldedloop("L\$oop1",$len);
229$code.=<<___;
230L\$done
231 $POP `-$FRAME-$SAVED_RP`(%sp),%r2
232 ldo -1($XX[0]),$XX[0] ; chill out loop
233 sub $YY,$TX[0],$YY
234 and $mask,$XX[0],$XX[0]
235 and $mask,$YY,$YY
236 $ST $XX[0],`-2*$SZ`($key)
237 $ST $YY,`-1*$SZ`($key)
238 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
239 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
240 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
241L\$abort
242 bv (%r2)
243 .EXIT
244 $POPMB -$FRAME(%sp),%r3
245 .PROCEND
246___
247
248$code.=<<___;
249
250 .EXPORT RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
251 .ALIGN 8
252RC4_set_key
253 .PROC
254 .CALLINFO NO_CALLS
255 .ENTRY
256 $ST %r0,`0*$SZ`($key)
257 $ST %r0,`1*$SZ`($key)
258 ldo `2*$SZ`($key),$key
259 copy %r0,@XX[0]
260L\$1st
261 $ST @XX[0],0($key)
262 ldo 1(@XX[0]),@XX[0]
263 bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256
264 ldo $SZ($key),$key
265
266 ldo `-256*$SZ`($key),$key ; rewind $key
267 addl $len,$inp,$inp ; $inp to point at the end
268 sub %r0,$len,%r23 ; inverse index
269 copy %r0,@XX[0]
270 copy %r0,@XX[1]
271 ldi 0xff,$mask
272
273L\$2nd
274 $LDX @XX[0]($key),@TX[0]
275 ldbx %r23($inp),@TX[1]
276 addi,nuv 1,%r23,%r23 ; increment and conditional
277 sub %r0,$len,%r23 ; inverse index
278 addl @TX[0],@XX[1],@XX[1]
279 addl @TX[1],@XX[1],@XX[1]
280 and $mask,@XX[1],@XX[1]
281 $MKX @XX[0],$key,$TY
282 $LDX @XX[1]($key),@TX[1]
283 $MKX @XX[1],$key,$YY
284 ldo 1(@XX[0]),@XX[0]
285 $ST @TX[0],0($YY)
286 bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256
287 $ST @TX[1],0($TY)
288
289 bv,n (%r2)
290 .EXIT
291 nop
292 .PROCEND
293
294 .EXPORT RC4_options,ENTRY
295 .ALIGN 8
296RC4_options
297 .PROC
298 .CALLINFO NO_CALLS
299 .ENTRY
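	; note (added): PIC idiom - blr %r0 deposits the address of L\$pic
	; in %r28, andcm strips its two low privilege bits, and the ldo then
	; forms the address of the L\$opts string PC-relatively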
300 blr %r0,%r28
301 ldi 3,%r1
302L\$pic
303 andcm %r28,%r1,%r28
304 bv (%r2)
305 .EXIT
306 ldo L\$opts-L\$pic(%r28),%r28
307 .PROCEND
308
309 .data
310 .ALIGN 8
311L\$opts
312 .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
313 .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
314___
315$code =~ s/\`([^\`]*)\`/eval $1/gem;
316$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
317$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
318
319print $code;
320close STDOUT;
diff --git a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl
deleted file mode 100644
index 1aa754820c..0000000000
--- a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl
+++ /dev/null
@@ -1,234 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# February 2009
11#
12# Performance is 2x that of code generated by gcc 3.4.6 on z10. The
13# coding "secret" is to "cluster" Address Generation Interlocks, so
14# that one pipeline stall resolves several dependencies.
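# (Illustration, added: "clustering" means issuing the updated indices
# back to back, as in the .Loop8 body below --
#
#	la	$YY,0($YY,$TX[0])
#	nill	$YY,255
#	la	$XX[1],1($XX[0])
#	nill	$XX[1],255
#
# -- so that the address-generation stalls of the dependent llgc loads
# overlap instead of being paid one by one.)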
15
16# November 2010
17#
18# Adapt for the -m31 build. If the kernel supports the so-called
19# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
20# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
21# legacy application context. The feature is not specific to any
22# particular processor, as long as it's a "z" CPU; the latter implies
23# that the code remains z/Architecture specific. On z990 it was
24# measured to perform 50% better than code generated by gcc 4.3.
25
26$flavour = shift;
27
28if ($flavour =~ /3[12]/) {
29 $SIZE_T=4;
30 $g="";
31} else {
32 $SIZE_T=8;
33 $g="g";
34}
35
36while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
37open STDOUT,">$output";
38
39$rp="%r14";
40$sp="%r15";
41$code=<<___;
42.text
43
44___
45
46# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
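# For reference, a minimal pure-Perl sketch of the byte-at-a-time RC4 round
# that the code below unrolls 8x (hypothetical helper, not part of the
# original module; $S is the 256-entry state array, $x/$y the two indices):
sub rc4_ref {
	my ($S,$x,$y,$in)=@_;
	my $ret="";
	for my $c (unpack("C*",$in)) {
		$x=($x+1)&0xff;
		$y=($y+$S->[$x])&0xff;
		@{$S}[$x,$y]=@{$S}[$y,$x];	# swap S[x] and S[y]
		$ret.=chr($c^$S->[($S->[$x]+$S->[$y])&0xff]);
	}
	return ($ret,$x,$y);			# caller writes x,y back to the key
}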
47{
48$acc="%r0";
49$cnt="%r1";
50$key="%r2";
51$len="%r3";
52$inp="%r4";
53$out="%r5";
54
55@XX=("%r6","%r7");
56@TX=("%r8","%r9");
57$YY="%r10";
58$TY="%r11";
59
60$code.=<<___;
61.globl RC4
62.type RC4,\@function
63.align 64
64RC4:
65 stm${g} %r6,%r11,6*$SIZE_T($sp)
66___
67$code.=<<___ if ($flavour =~ /3[12]/);
68 llgfr $len,$len
69___
70$code.=<<___;
71 llgc $XX[0],0($key)
72 llgc $YY,1($key)
73 la $XX[0],1($XX[0])
74 nill $XX[0],0xff
75 srlg $cnt,$len,3
76 ltgr $cnt,$cnt
77 llgc $TX[0],2($XX[0],$key)
78 jz .Lshort
79 j .Loop8
80
81.align 64
82.Loop8:
83___
84for ($i=0;$i<8;$i++) {
85$code.=<<___;
86 la $YY,0($YY,$TX[0]) # $i
87 nill $YY,255
88 la $XX[1],1($XX[0])
89 nill $XX[1],255
90___
91$code.=<<___ if ($i==1);
92 llgc $acc,2($TY,$key)
93___
94$code.=<<___ if ($i>1);
95 sllg $acc,$acc,8
96 ic $acc,2($TY,$key)
97___
98$code.=<<___;
99 llgc $TY,2($YY,$key)
100 stc $TX[0],2($YY,$key)
101 llgc $TX[1],2($XX[1],$key)
102 stc $TY,2($XX[0],$key)
103 cr $XX[1],$YY
104 jne .Lcmov$i
105 la $TX[1],0($TX[0])
106.Lcmov$i:
107 la $TY,0($TY,$TX[0])
108 nill $TY,255
109___
110push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
111}
112
113$code.=<<___;
114 lg $TX[1],0($inp)
115 sllg $acc,$acc,8
116 la $inp,8($inp)
117 ic $acc,2($TY,$key)
118 xgr $acc,$TX[1]
119 stg $acc,0($out)
120 la $out,8($out)
121 brctg $cnt,.Loop8
122
123.Lshort:
124 lghi $acc,7
125 ngr $len,$acc
126 jz .Lexit
127 j .Loop1
128
129.align 16
130.Loop1:
131 la $YY,0($YY,$TX[0])
132 nill $YY,255
133 llgc $TY,2($YY,$key)
134 stc $TX[0],2($YY,$key)
135 stc $TY,2($XX[0],$key)
136 ar $TY,$TX[0]
137 ahi $XX[0],1
138 nill $TY,255
139 nill $XX[0],255
140 llgc $acc,0($inp)
141 la $inp,1($inp)
142 llgc $TY,2($TY,$key)
143 llgc $TX[0],2($XX[0],$key)
144 xr $acc,$TY
145 stc $acc,0($out)
146 la $out,1($out)
147 brct $len,.Loop1
148
149.Lexit:
150 ahi $XX[0],-1
151 stc $XX[0],0($key)
152 stc $YY,1($key)
153 lm${g} %r6,%r11,6*$SIZE_T($sp)
154 br $rp
155.size RC4,.-RC4
156.string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
157
158___
159}
160
161# void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
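# For reference, a minimal pure-Perl sketch of the key schedule set up
# below (hypothetical helper, not part of the original module):
sub rc4_set_key_ref {
	my ($keybytes)=@_;		# raw key as a byte string
	my @S=(0..255);
	my @K=unpack("C*",$keybytes);
	my $j=0;
	for my $i (0..255) {
		$j=($j+$S[$i]+$K[$i % @K])&0xff;
		@S[$i,$j]=@S[$j,$i];	# swap S[i] and S[j]
	}
	return \@S;			# x and y indices start at 0
}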
162{
163$cnt="%r0";
164$idx="%r1";
165$key="%r2";
166$len="%r3";
167$inp="%r4";
168$acc="%r5";
169$dat="%r6";
170$ikey="%r7";
171$iinp="%r8";
172
173$code.=<<___;
174.globl RC4_set_key
175.type RC4_set_key,\@function
176.align 64
177RC4_set_key:
178 stm${g} %r6,%r8,6*$SIZE_T($sp)
179 lhi $cnt,256
180 la $idx,0(%r0)
181 sth $idx,0($key)
182.align 4
183.L1stloop:
184 stc $idx,2($idx,$key)
185 la $idx,1($idx)
186 brct $cnt,.L1stloop
187
188 lghi $ikey,-256
189 lr $cnt,$len
190 la $iinp,0(%r0)
191 la $idx,0(%r0)
192.align 16
193.L2ndloop:
194 llgc $acc,2+256($ikey,$key)
195 llgc $dat,0($iinp,$inp)
196 la $idx,0($idx,$acc)
197 la $ikey,1($ikey)
198 la $idx,0($idx,$dat)
199 nill $idx,255
200 la $iinp,1($iinp)
201 tml $ikey,255
202 llgc $dat,2($idx,$key)
203 stc $dat,2+256-1($ikey,$key)
204 stc $acc,2($idx,$key)
205 jz .Ldone
206 brct $cnt,.L2ndloop
207 lr $cnt,$len
208 la $iinp,0(%r0)
209 j .L2ndloop
210.Ldone:
211 lm${g} %r6,%r8,6*$SIZE_T($sp)
212 br $rp
213.size RC4_set_key,.-RC4_set_key
214
215___
216}
217
218# const char *RC4_options()
219$code.=<<___;
220.globl RC4_options
221.type RC4_options,\@function
222.align 16
223RC4_options:
224 larl %r2,.Loptions
225 br %r14
226.size RC4_options,.-RC4_options
227.section .rodata
228.Loptions:
229.align 8
230.string "rc4(8x,char)"
231___
232
233print $code;
234close STDOUT; # force flush
diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
deleted file mode 100755
index 197749dda7..0000000000
--- a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
+++ /dev/null
@@ -1,543 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# July 2004
11#
12# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
13# "hand-coded assembler"] doesn't account for the whole improvement
14# coefficient. It turned out that eliminating RC4_CHAR from the config
15# line results in ~40% improvement (yes, even for the C implementation).
16# Presumably it has everything to do with AMD cache architecture and
17# read-after-write (RAW) penalties. Once again! The module *requires* a
18# config line *without* RC4_CHAR! As for the coding "secret," I bet on
19# partial-register arithmetic. For example, instead of 'inc %r8; and
20# $255,%r8' I simply use 'inc %r8b'. Even though the optimization manual
21# discourages operating on partial registers, it turned out to be the
22# best bet. At least for AMD... How IA32E would perform remains to be seen...
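# (Added note: the trick works because arithmetic on the 8-bit subregister
# wraps modulo 256 by itself, which is why the explicit 'and' can go.)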
23
24# November 2004
25#
26# As was shown by Marc Bevand, reordering a couple of load operations
27# results in an even higher performance gain of 3.3x:-) At least on
28# Opteron... For reference, 1x in this case is RC4_CHAR C code
29# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
30# The latter means that if you want to *estimate* what to expect from
31# *your* Opteron, multiply 54 by 3.3 and by the clock frequency in GHz.
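# (Worked example, added: a hypothetical 2.2GHz Opteron would thus be
# estimated at roughly 54*3.3*2.2 ~= 392MBps.)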
32
33# November 2004
34#
35# The Intel P4 EM64T core was found to run the AMD64 code really
36# slowly... The only way to achieve comparable performance on P4 was to
37# keep RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
38# compose blended code that would perform within even a 30% margin on
39# both AMD and Intel platforms, I implement both cases. See
40# rc4_skey.c for further details...
41
42# April 2005
43#
44# The P4 EM64T core appears to be "allergic" to 64-bit inc/dec.
45# Replacing those with add/sub results in a 50% performance improvement
46# of the folded loop...
47
48# May 2005
49#
50# As was shown by Zou Nanhai, loop unrolling can improve Intel EM64T
51# performance by >30% [unlike the P4 32-bit case, that is]. But this is
52# provided that loads are reordered even more aggressively! Both code
53# paths, AMD64 and EM64T, reorder loads in essentially the same manner
54# as my IA-64 implementation. On Opteron this resulted in a modest 5%
55# improvement [I had to test it], while final Intel P4 performance now
56# achieves a respectable 432MBps on a 2.8GHz processor. For reference:
57# if executed on Xeon, the current RC4_CHAR code path is 2.7x faster
58# than the RC4_INT code path, while if executed on Opteron, it's only
59# 25% slower than the RC4_INT one [meaning that if CPU µ-arch detection
60# is not implemented, then this final RC4_CHAR code path should be
61# preferred, as it provides better *all-round* performance].
62
63# March 2007
64#
65# Intel Core2 was observed to perform poorly on both code paths:-( It
66# apparently suffers from some kind of partial-register stall, which
67# occurs in 64-bit mode only [the virtually identical 32-bit loop was
68# observed to outperform the 64-bit one by almost 50%]. Adding two
69# movzb to cloop1 boosts its performance by 80%! This loop appears to
70# be an optimal fit for Core2, and therefore the code was modified to
71# skip cloop8 on this CPU.
72
73# May 2010
74#
75# Intel Westmere was observed to perform suboptimally. Adding yet
76# another movzb to cloop1 improved performance by almost 50%! Core2
77# performance improves too, but only nominally...
78
79# May 2011
80#
81# The only code path that was not modified is the P4-specific one. The
82# non-P4 Intel code path optimization is heavily based on a submission
83# by Maxim Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've
84# used some of the ideas even in an attempt to optimize the original
85# RC4_INT code path... Current performance in cycles per processed byte
86# (less is better) and improvement coefficients relative to the previous
87# version of this module are:
88#
89# Opteron 5.3/+0%(*)
90# P4 6.5
91# Core2 6.2/+15%(**)
92# Westmere 4.2/+60%
93# Sandy Bridge 4.2/+120%
94# Atom 9.3/+80%
95#
96# (*) But the corresponding loop has fewer instructions, which should
97# have a positive effect on the upcoming Bulldozer, which has one
98# less ALU. For reference, the Intel code runs at 6.8 cpb on Opteron.
99# (**) Note that the Core2 result is ~15% lower than the corresponding
100# result for 32-bit code, meaning that it's possible to improve it,
101# but more than likely at the cost of the others (see rc4-586.pl
102# to get the idea)...
103
104$flavour = shift;
105$output = shift;
106if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
107
108$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
109( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
110( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
111die "can't locate x86_64-xlate.pl";
112
113open OUT,"| \"$^X\" $xlate $flavour $output";
114*STDOUT=*OUT;
115
116$dat="%rdi"; # arg1
117$len="%rsi"; # arg2
118$inp="%rdx"; # arg3
119$out="%rcx"; # arg4
120
121{
122$code=<<___;
123.text
124.extern OPENSSL_ia32cap_P
125
126.globl RC4
127.type RC4,\@function,4
128.align 16
129RC4: or $len,$len
130 jne .Lentry
131 ret
132.Lentry:
133 push %rbx
134 push %r12
135 push %r13
136.Lprologue:
137 mov $len,%r11
138 mov $inp,%r12
139 mov $out,%r13
140___
141my $len="%r11"; # reassign input arguments
142my $inp="%r12";
143my $out="%r13";
144
145my @XX=("%r10","%rsi");
146my @TX=("%rax","%rbx");
147my $YY="%rcx";
148my $TY="%rdx";
149
150$code.=<<___;
151 xor $XX[0],$XX[0]
152 xor $YY,$YY
153
154 lea 8($dat),$dat
155 mov -8($dat),$XX[0]#b
156 mov -4($dat),$YY#b
157 cmpl \$-1,256($dat)
158 je .LRC4_CHAR
159 mov OPENSSL_ia32cap_P(%rip),%r8d
160 xor $TX[1],$TX[1]
161 inc $XX[0]#b
162 sub $XX[0],$TX[1]
163 sub $inp,$out
164 movl ($dat,$XX[0],4),$TX[0]#d
165 test \$-16,$len
166 jz .Lloop1
167 bt \$30,%r8d # Intel CPU?
168 jc .Lintel
169 and \$7,$TX[1]
170 lea 1($XX[0]),$XX[1]
171 jz .Loop8
172 sub $TX[1],$len
173.Loop8_warmup:
174 add $TX[0]#b,$YY#b
175 movl ($dat,$YY,4),$TY#d
176 movl $TX[0]#d,($dat,$YY,4)
177 movl $TY#d,($dat,$XX[0],4)
178 add $TY#b,$TX[0]#b
179 inc $XX[0]#b
180 movl ($dat,$TX[0],4),$TY#d
181 movl ($dat,$XX[0],4),$TX[0]#d
182 xorb ($inp),$TY#b
183 movb $TY#b,($out,$inp)
184 lea 1($inp),$inp
185 dec $TX[1]
186 jnz .Loop8_warmup
187
188 lea 1($XX[0]),$XX[1]
189 jmp .Loop8
190.align 16
191.Loop8:
192___
193for ($i=0;$i<8;$i++) {
194$code.=<<___ if ($i==7);
195 add \$8,$XX[1]#b
196___
197$code.=<<___;
198 add $TX[0]#b,$YY#b
199 movl ($dat,$YY,4),$TY#d
200 movl $TX[0]#d,($dat,$YY,4)
201 movl `4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
202 ror \$8,%r8 # ror is redundant when $i=0
203 movl $TY#d,4*$i($dat,$XX[0],4)
204 add $TX[0]#b,$TY#b
205 movb ($dat,$TY,4),%r8b
206___
207push(@TX,shift(@TX)); #push(@XX,shift(@XX)); # "rotate" registers
208}
209$code.=<<___;
210 add \$8,$XX[0]#b
211 ror \$8,%r8
212 sub \$8,$len
213
214 xor ($inp),%r8
215 mov %r8,($out,$inp)
216 lea 8($inp),$inp
217
218 test \$-8,$len
219 jnz .Loop8
220 cmp \$0,$len
221 jne .Lloop1
222 jmp .Lexit
223
224.align 16
225.Lintel:
226 test \$-32,$len
227 jz .Lloop1
228 and \$15,$TX[1]
229 jz .Loop16_is_hot
230 sub $TX[1],$len
231.Loop16_warmup:
232 add $TX[0]#b,$YY#b
233 movl ($dat,$YY,4),$TY#d
234 movl $TX[0]#d,($dat,$YY,4)
235 movl $TY#d,($dat,$XX[0],4)
236 add $TY#b,$TX[0]#b
237 inc $XX[0]#b
238 movl ($dat,$TX[0],4),$TY#d
239 movl ($dat,$XX[0],4),$TX[0]#d
240 xorb ($inp),$TY#b
241 movb $TY#b,($out,$inp)
242 lea 1($inp),$inp
243 dec $TX[1]
244 jnz .Loop16_warmup
245
246 mov $YY,$TX[1]
247 xor $YY,$YY
248 mov $TX[1]#b,$YY#b
249
250.Loop16_is_hot:
251 lea ($dat,$XX[0],4),$XX[1]
252___
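# (Descriptive comment, added) RC4_loop below emits one of 16 interleaved
# steps: each step gathers a key-stream byte into %xmm0 or %xmm1 via
# pinsrw (even/odd steps respectively); %xmm1 is then shifted left by 8
# and both are pxor-ed with 16 input bytes loaded into %xmm2.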
253sub RC4_loop {
254 my $i=shift;
255 my $j=$i<0?0:$i;
256 my $xmm="%xmm".($j&1);
257
258 $code.=" add \$16,$XX[0]#b\n" if ($i==15);
259 $code.=" movdqu ($inp),%xmm2\n" if ($i==15);
260 $code.=" add $TX[0]#b,$YY#b\n" if ($i<=0);
261 $code.=" movl ($dat,$YY,4),$TY#d\n";
262 $code.=" pxor %xmm0,%xmm2\n" if ($i==0);
263 $code.=" psllq \$8,%xmm1\n" if ($i==0);
264 $code.=" pxor $xmm,$xmm\n" if ($i<=1);
265 $code.=" movl $TX[0]#d,($dat,$YY,4)\n";
266 $code.=" add $TY#b,$TX[0]#b\n";
267 $code.=" movl `4*($j+1)`($XX[1]),$TX[1]#d\n" if ($i<15);
268 $code.=" movz $TX[0]#b,$TX[0]#d\n";
269 $code.=" movl $TY#d,4*$j($XX[1])\n";
270 $code.=" pxor %xmm1,%xmm2\n" if ($i==0);
271 $code.=" lea ($dat,$XX[0],4),$XX[1]\n" if ($i==15);
272 $code.=" add $TX[1]#b,$YY#b\n" if ($i<15);
273 $code.=" pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n";
274 $code.=" movdqu %xmm2,($out,$inp)\n" if ($i==0);
275 $code.=" lea 16($inp),$inp\n" if ($i==0);
276 $code.=" movl ($XX[1]),$TX[1]#d\n" if ($i==15);
277}
278 RC4_loop(-1);
279$code.=<<___;
280 jmp .Loop16_enter
281.align 16
282.Loop16:
283___
284
285for ($i=0;$i<16;$i++) {
286 $code.=".Loop16_enter:\n" if ($i==1);
287 RC4_loop($i);
288 push(@TX,shift(@TX)); # "rotate" registers
289}
290$code.=<<___;
291 mov $YY,$TX[1]
292	xor	$YY,$YY				# zero $YY to avoid a partial-register stall
293 sub \$16,$len
294 mov $TX[1]#b,$YY#b
295 test \$-16,$len
296 jnz .Loop16
297
298 psllq \$8,%xmm1
299 pxor %xmm0,%xmm2
300 pxor %xmm1,%xmm2
301 movdqu %xmm2,($out,$inp)
302 lea 16($inp),$inp
303
304 cmp \$0,$len
305 jne .Lloop1
306 jmp .Lexit
307
308.align 16
309.Lloop1:
310 add $TX[0]#b,$YY#b
311 movl ($dat,$YY,4),$TY#d
312 movl $TX[0]#d,($dat,$YY,4)
313 movl $TY#d,($dat,$XX[0],4)
314 add $TY#b,$TX[0]#b
315 inc $XX[0]#b
316 movl ($dat,$TX[0],4),$TY#d
317 movl ($dat,$XX[0],4),$TX[0]#d
318 xorb ($inp),$TY#b
319 movb $TY#b,($out,$inp)
320 lea 1($inp),$inp
321 dec $len
322 jnz .Lloop1
323 jmp .Lexit
324
325.align 16
326.LRC4_CHAR:
327 add \$1,$XX[0]#b
328 movzb ($dat,$XX[0]),$TX[0]#d
329 test \$-8,$len
330 jz .Lcloop1
331 jmp .Lcloop8
332.align 16
333.Lcloop8:
334 mov ($inp),%r8d
335 mov 4($inp),%r9d
336___
337# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
338for ($i=0;$i<4;$i++) {
339$code.=<<___;
340 add $TX[0]#b,$YY#b
341 lea 1($XX[0]),$XX[1]
342 movzb ($dat,$YY),$TY#d
343 movzb $XX[1]#b,$XX[1]#d
344 movzb ($dat,$XX[1]),$TX[1]#d
345 movb $TX[0]#b,($dat,$YY)
346 cmp $XX[1],$YY
347 movb $TY#b,($dat,$XX[0])
348 jne .Lcmov$i # Intel cmov is sloooow...
349 mov $TX[0],$TX[1]
350.Lcmov$i:
351 add $TX[0]#b,$TY#b
352 xor ($dat,$TY),%r8b
353 ror \$8,%r8d
354___
355push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
356}
357for ($i=4;$i<8;$i++) {
358$code.=<<___;
359 add $TX[0]#b,$YY#b
360 lea 1($XX[0]),$XX[1]
361 movzb ($dat,$YY),$TY#d
362 movzb $XX[1]#b,$XX[1]#d
363 movzb ($dat,$XX[1]),$TX[1]#d
364 movb $TX[0]#b,($dat,$YY)
365 cmp $XX[1],$YY
366 movb $TY#b,($dat,$XX[0])
367 jne .Lcmov$i # Intel cmov is sloooow...
368 mov $TX[0],$TX[1]
369.Lcmov$i:
370 add $TX[0]#b,$TY#b
371 xor ($dat,$TY),%r9b
372 ror \$8,%r9d
373___
374push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
375}
376$code.=<<___;
377 lea -8($len),$len
378 mov %r8d,($out)
379 lea 8($inp),$inp
380 mov %r9d,4($out)
381 lea 8($out),$out
382
383 test \$-8,$len
384 jnz .Lcloop8
385 cmp \$0,$len
386 jne .Lcloop1
387 jmp .Lexit
388___
389$code.=<<___;
390.align 16
391.Lcloop1:
392 add $TX[0]#b,$YY#b
393 movzb $YY#b,$YY#d
394 movzb ($dat,$YY),$TY#d
395 movb $TX[0]#b,($dat,$YY)
396 movb $TY#b,($dat,$XX[0])
397 add $TX[0]#b,$TY#b
398 add \$1,$XX[0]#b
399 movzb $TY#b,$TY#d
400 movzb $XX[0]#b,$XX[0]#d
401 movzb ($dat,$TY),$TY#d
402 movzb ($dat,$XX[0]),$TX[0]#d
403 xorb ($inp),$TY#b
404 lea 1($inp),$inp
405 movb $TY#b,($out)
406 lea 1($out),$out
407 sub \$1,$len
408 jnz .Lcloop1
409 jmp .Lexit
410
411.align 16
412.Lexit:
413 sub \$1,$XX[0]#b
414 movl $XX[0]#d,-8($dat)
415 movl $YY#d,-4($dat)
416
417 mov (%rsp),%r13
418 mov 8(%rsp),%r12
419 mov 16(%rsp),%rbx
420 add \$24,%rsp
421.Lepilogue:
422 ret
423.size RC4,.-RC4
424___
425}
426
427$idx="%r8";
428$ido="%r9";
429
430$code.=<<___;
431.globl RC4_set_key
432.type RC4_set_key,\@function,3
433.align 16
434RC4_set_key:
435 lea 8($dat),$dat
436 lea ($inp,$len),$inp
437 neg $len
438 mov $len,%rcx
439 xor %eax,%eax
440 xor $ido,$ido
441 xor %r10,%r10
442 xor %r11,%r11
443
444 mov OPENSSL_ia32cap_P(%rip),$idx#d
445 bt \$20,$idx#d # RC4_CHAR?
446 jc .Lc1stloop
447 jmp .Lw1stloop
448
449.align 16
450.Lw1stloop:
451 mov %eax,($dat,%rax,4)
452 add \$1,%al
453 jnc .Lw1stloop
454
455 xor $ido,$ido
456 xor $idx,$idx
457.align 16
458.Lw2ndloop:
459 mov ($dat,$ido,4),%r10d
460 add ($inp,$len,1),$idx#b
461 add %r10b,$idx#b
462 add \$1,$len
463 mov ($dat,$idx,4),%r11d
464 cmovz %rcx,$len
465 mov %r10d,($dat,$idx,4)
466 mov %r11d,($dat,$ido,4)
467 add \$1,$ido#b
468 jnc .Lw2ndloop
469 jmp .Lexit_key
470
471.align 16
472.Lc1stloop:
473 mov %al,($dat,%rax)
474 add \$1,%al
475 jnc .Lc1stloop
476
477 xor $ido,$ido
478 xor $idx,$idx
479.align 16
480.Lc2ndloop:
481 mov ($dat,$ido),%r10b
482 add ($inp,$len),$idx#b
483 add %r10b,$idx#b
484 add \$1,$len
485 mov ($dat,$idx),%r11b
486 jnz .Lcnowrap
487 mov %rcx,$len
488.Lcnowrap:
489 mov %r10b,($dat,$idx)
490 mov %r11b,($dat,$ido)
491 add \$1,$ido#b
492 jnc .Lc2ndloop
493 movl \$-1,256($dat)
494
495.align 16
496.Lexit_key:
497 xor %eax,%eax
498 mov %eax,-8($dat)
499 mov %eax,-4($dat)
500 ret
501.size RC4_set_key,.-RC4_set_key
502
503.globl RC4_options
504.type RC4_options,\@abi-omnipotent
505.align 16
506RC4_options:
507 lea .Lopts(%rip),%rax
508 mov OPENSSL_ia32cap_P(%rip),%edx
509 bt \$20,%edx
510 jc .L8xchar
511 bt \$30,%edx
512 jnc .Ldone
513 add \$25,%rax
514 ret
515.L8xchar:
516 add \$12,%rax
517.Ldone:
518 ret
519.align 64
520.Lopts:
521.asciz "rc4(8x,int)"
522.asciz "rc4(8x,char)"
523.asciz "rc4(16x,int)"
524.asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
525.align 64
526.size RC4_options,.-RC4_options
527___
528
529sub reg_part {
530my ($reg,$conv)=@_;
531 if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
532 elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
533 elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
534 elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
535 return $reg;
536}
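# (Usage illustration, added: the substitution below rewrites "#b"/"#w"/"#d"
# operand suffixes via reg_part, e.g. "%rax#b" -> "%al", "%rax#d" -> "%eax",
# and "%r8#b" -> "%r8b".)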
537
538$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
539$code =~ s/\`([^\`]*)\`/eval $1/gem;
540
541print $code;
542
543close STDOUT;