author     djm <>  2010-10-01 22:54:21 +0000
committer  djm <>  2010-10-01 22:54:21 +0000
commit     829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2 (patch)
tree       e03b9f1bd051e844b971936729e9df549a209130 /src/lib/libcrypto/rc4
parent     e6b755d2a53d3cac7a344dfdd6bf7c951cac754c (diff)
download   openbsd-829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2.tar.gz
           openbsd-829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2.tar.bz2
           openbsd-829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2.zip
import OpenSSL-1.0.0a
Diffstat (limited to 'src/lib/libcrypto/rc4')
-rw-r--r--  src/lib/libcrypto/rc4/asm/rc4-586.pl    | 448
-rw-r--r--  src/lib/libcrypto/rc4/asm/rc4-ia64.pl   | 755
-rw-r--r--  src/lib/libcrypto/rc4/asm/rc4-s390x.pl  | 205
-rwxr-xr-x  src/lib/libcrypto/rc4/asm/rc4-x86_64.pl | 176
-rw-r--r--  src/lib/libcrypto/rc4/rc4.h             |   7
-rw-r--r--  src/lib/libcrypto/rc4/rc4_enc.c         |  16
-rw-r--r--  src/lib/libcrypto/rc4/rc4_skey.c        |  21
7 files changed, 1375 insertions, 253 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl
index ef7eee766c..38a44a70ef 100644
--- a/src/lib/libcrypto/rc4/asm/rc4-586.pl
+++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl
@@ -1,14 +1,21 @@
1#!/usr/local/bin/perl 1#!/usr/bin/env perl
2
3# ====================================================================
4# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
2 9
3# At some point it became apparent that the original SSLeay RC4 10# At some point it became apparent that the original SSLeay RC4
4# assembler implementation performs suboptimaly on latest IA-32 11# assembler implementation performs suboptimally on latest IA-32
5# microarchitectures. After re-tuning performance has changed as 12# microarchitectures. After re-tuning performance has changed as
6# following: 13# following:
7# 14#
8# Pentium +0% 15# Pentium -10%
9# Pentium III +17% 16# Pentium III +12%
10# AMD +52%(*) 17# AMD +50%(*)
11# P4 +180%(**) 18# P4 +250%(**)
12# 19#
13# (*) This number is actually a trade-off:-) It's possible to 20# (*) This number is actually a trade-off:-) It's possible to
14# achieve +72%, but at the cost of -48% off PIII performance. 21# achieve +72%, but at the cost of -48% off PIII performance.
@@ -17,214 +24,247 @@
17# For reference! This code delivers ~80% of rc4-amd64.pl 24# For reference! This code delivers ~80% of rc4-amd64.pl
18# performance on the same Opteron machine. 25# performance on the same Opteron machine.
19# (**) This number requires compressed key schedule set up by 26# (**) This number requires compressed key schedule set up by
20# RC4_set_key and therefore doesn't apply to 0.9.7 [option for 27# RC4_set_key [see commentary below for further details].
21# compressed key schedule is implemented in 0.9.8 and later,
22# see commentary section in rc4_skey.c for further details].
23# 28#
24# <appro@fy.chalmers.se> 29# <appro@fy.chalmers.se>
25 30
26push(@INC,"perlasm","../../perlasm"); 31$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
32push(@INC,"${dir}","${dir}../../perlasm");
27require "x86asm.pl"; 33require "x86asm.pl";
28 34
29&asm_init($ARGV[0],"rc4-586.pl"); 35&asm_init($ARGV[0],"rc4-586.pl");
30 36
31$x="eax"; 37$xx="eax";
32$y="ebx"; 38$yy="ebx";
33$tx="ecx"; 39$tx="ecx";
34$ty="edx"; 40$ty="edx";
35$in="esi"; 41$inp="esi";
36$out="edi"; 42$out="ebp";
37$d="ebp"; 43$dat="edi";
38 44
39&RC4("RC4"); 45sub RC4_loop {
40 46 my $i=shift;
41&asm_finish(); 47 my $func = ($i==0)?*mov:*or;
42 48
43sub RC4_loop 49 &add (&LB($yy),&LB($tx));
44 { 50 &mov ($ty,&DWP(0,$dat,$yy,4));
45 local($n,$p,$char)=@_; 51 &mov (&DWP(0,$dat,$yy,4),$tx);
46 52 &mov (&DWP(0,$dat,$xx,4),$ty);
47 &comment("Round $n"); 53 &add ($ty,$tx);
48 54 &inc (&LB($xx));
49 if ($char) 55 &and ($ty,0xff);
50 { 56 &ror ($out,8) if ($i!=0);
51 if ($p >= 0) 57 if ($i<3) {
52 { 58 &mov ($tx,&DWP(0,$dat,$xx,4));
53 &mov($ty, &swtmp(2)); 59 } else {
54 &cmp($ty, $in); 60 &mov ($tx,&wparam(3)); # reload [re-biased] out
55 &jbe(&label("finished"));
56 &inc($in);
57 }
58 else
59 {
60 &add($ty, 8);
61 &inc($in);
62 &cmp($ty, $in);
63 &jb(&label("finished"));
64 &mov(&swtmp(2), $ty);
65 }
66 }
67 # Moved out
68 # &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0;
69
70 &add( &LB($y), &LB($tx));
71 &mov( $ty, &DWP(0,$d,$y,4));
72 # XXX
73 &mov( &DWP(0,$d,$x,4),$ty);
74 &add( $ty, $tx);
75 &mov( &DWP(0,$d,$y,4),$tx);
76 &and( $ty, 0xff);
77 &inc( &LB($x)); # NEXT ROUND
78 &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
79 &mov( $ty, &DWP(0,$d,$ty,4));
80
81 if (!$char)
82 {
83 #moved up into last round
84 if ($p >= 1)
85 {
86 &add( $out, 8)
87 }
88 &movb( &BP($n,"esp","",0), &LB($ty));
89 }
90 else
91 {
92 # Note in+=8 has occured
93 &movb( &HB($ty), &BP(-1,$in,"",0));
94 # XXX
95 &xorb(&LB($ty), &HB($ty));
96 # XXX
97 &movb(&BP($n,$out,"",0),&LB($ty));
98 }
99 } 61 }
100 62 &$func ($out,&DWP(0,$dat,$ty,4));
101 63}
102sub RC4 64
103 { 65# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
104 local($name)=@_; 66&function_begin("RC4");
105 67 &mov ($dat,&wparam(0)); # load key schedule pointer
106 &function_begin_B($name,""); 68 &mov ($ty, &wparam(1)); # load len
107 69 &mov ($inp,&wparam(2)); # load inp
108 &mov($ty,&wparam(1)); # len 70 &mov ($out,&wparam(3)); # load out
109 &cmp($ty,0); 71
110 &jne(&label("proceed")); 72 &xor ($xx,$xx); # avoid partial register stalls
111 &ret(); 73 &xor ($yy,$yy);
112 &set_label("proceed"); 74
113 75 &cmp ($ty,0); # safety net
114 &comment(""); 76 &je (&label("abort"));
115 77
116 &push("ebp"); 78 &mov (&LB($xx),&BP(0,$dat)); # load key->x
117 &push("ebx"); 79 &mov (&LB($yy),&BP(4,$dat)); # load key->y
118 &push("esi"); 80 &add ($dat,8);
119 &xor( $x, $x); # avoid partial register stalls 81
120 &push("edi"); 82 &lea ($tx,&DWP(0,$inp,$ty));
121 &xor( $y, $y); # avoid partial register stalls 83 &sub ($out,$inp); # re-bias out
122 &mov( $d, &wparam(0)); # key 84 &mov (&wparam(1),$tx); # save input+len
123 &mov( $in, &wparam(2)); 85
124 86 &inc (&LB($xx));
125 &movb( &LB($x), &BP(0,$d,"",1)); 87
126 &movb( &LB($y), &BP(4,$d,"",1)); 88 # detect compressed key schedule...
127 89 &cmp (&DWP(256,$dat),-1);
128 &mov( $out, &wparam(3)); 90 &je (&label("RC4_CHAR"));
129 &inc( &LB($x)); 91
130 92 &mov ($tx,&DWP(0,$dat,$xx,4));
131 &stack_push(3); # 3 temp variables 93
132 &add( $d, 8); 94 &and ($ty,-4); # how many 4-byte chunks?
133 95 &jz (&label("loop1"));
134 # detect compressed schedule, see commentary section in rc4_skey.c... 96
135 # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant, 97 &lea ($ty,&DWP(-4,$inp,$ty));
136 # as compressed key schedule is set up in 0.9.8 and later. 98 &mov (&wparam(2),$ty); # save input+(len/4)*4-4
137 &cmp(&DWP(256,$d),-1); 99 &mov (&wparam(3),$out); # $out as accumulator in this loop
138 &je(&label("RC4_CHAR")); 100
139 101 &set_label("loop4",16);
140 &lea( $ty, &DWP(-8,$ty,$in)); 102 for ($i=0;$i<4;$i++) { RC4_loop($i); }
141 103 &ror ($out,8);
142 # check for 0 length input 104 &xor ($out,&DWP(0,$inp));
143 105 &cmp ($inp,&wparam(2)); # compare to input+(len/4)*4-4
144 &mov( &swtmp(2), $ty); # this is now address to exit at 106 &mov (&DWP(0,$tx,$inp),$out);# $tx holds re-biased out here
145 &mov( $tx, &DWP(0,$d,$x,4)); 107 &lea ($inp,&DWP(4,$inp));
146 108 &mov ($tx,&DWP(0,$dat,$xx,4));
147 &cmp( $ty, $in); 109 &jb (&label("loop4"));
148 &jb( &label("end")); # less than 8 bytes 110
149 111 &cmp ($inp,&wparam(1)); # compare to input+len
150 &set_label("start"); 112 &je (&label("done"));
151 113 &mov ($out,&wparam(3)); # restore $out
152 # filling DELAY SLOT 114
153 &add( $in, 8); 115 &set_label("loop1",16);
154 116 &add (&LB($yy),&LB($tx));
155 &RC4_loop(0,-1,0); 117 &mov ($ty,&DWP(0,$dat,$yy,4));
156 &RC4_loop(1,0,0); 118 &mov (&DWP(0,$dat,$yy,4),$tx);
157 &RC4_loop(2,0,0); 119 &mov (&DWP(0,$dat,$xx,4),$ty);
158 &RC4_loop(3,0,0); 120 &add ($ty,$tx);
159 &RC4_loop(4,0,0); 121 &inc (&LB($xx));
160 &RC4_loop(5,0,0); 122 &and ($ty,0xff);
161 &RC4_loop(6,0,0); 123 &mov ($ty,&DWP(0,$dat,$ty,4));
162 &RC4_loop(7,1,0); 124 &xor (&LB($ty),&BP(0,$inp));
163 125 &lea ($inp,&DWP(1,$inp));
164 &comment("apply the cipher text"); 126 &mov ($tx,&DWP(0,$dat,$xx,4));
165 # xor the cipher data with input 127 &cmp ($inp,&wparam(1)); # compare to input+len
166 128 &mov (&BP(-1,$out,$inp),&LB($ty));
167 #&add( $out, 8); #moved up into last round 129 &jb (&label("loop1"));
168 130
169 &mov( $tx, &swtmp(0)); 131 &jmp (&label("done"));
170 &mov( $ty, &DWP(-8,$in,"",0)); 132
171 &xor( $tx, $ty); 133# this is essentially Intel P4 specific codepath...
172 &mov( $ty, &DWP(-4,$in,"",0)); 134&set_label("RC4_CHAR",16);
173 &mov( &DWP(-8,$out,"",0), $tx); 135 &movz ($tx,&BP(0,$dat,$xx));
174 &mov( $tx, &swtmp(1));
175 &xor( $tx, $ty);
176 &mov( $ty, &swtmp(2)); # load end ptr;
177 &mov( &DWP(-4,$out,"",0), $tx);
178 &mov( $tx, &DWP(0,$d,$x,4));
179 &cmp($in, $ty);
180 &jbe(&label("start"));
181
182 &set_label("end");
183
184 # There is quite a bit of extra crap in RC4_loop() for this
185 # first round
186 &RC4_loop(0,-1,1);
187 &RC4_loop(1,0,1);
188 &RC4_loop(2,0,1);
189 &RC4_loop(3,0,1);
190 &RC4_loop(4,0,1);
191 &RC4_loop(5,0,1);
192 &RC4_loop(6,1,1);
193
194 &jmp(&label("finished"));
195
196 &align(16);
197 # this is essentially Intel P4 specific codepath, see rc4_skey.c,
198 # and is engaged in 0.9.8 and later context...
199 &set_label("RC4_CHAR");
200
201 &lea ($ty,&DWP(0,$in,$ty));
202 &mov (&swtmp(2),$ty);
203 &movz ($tx,&BP(0,$d,$x));
204
205 # strangely enough unrolled loop performs over 20% slower... 136 # strangely enough unrolled loop performs over 20% slower...
206 &set_label("RC4_CHAR_loop"); 137 &set_label("cloop1");
207 &add (&LB($y),&LB($tx)); 138 &add (&LB($yy),&LB($tx));
208 &movz ($ty,&BP(0,$d,$y)); 139 &movz ($ty,&BP(0,$dat,$yy));
209 &movb (&BP(0,$d,$y),&LB($tx)); 140 &mov (&BP(0,$dat,$yy),&LB($tx));
210 &movb (&BP(0,$d,$x),&LB($ty)); 141 &mov (&BP(0,$dat,$xx),&LB($ty));
211 &add (&LB($ty),&LB($tx)); 142 &add (&LB($ty),&LB($tx));
212 &movz ($ty,&BP(0,$d,$ty)); 143 &movz ($ty,&BP(0,$dat,$ty));
213 &add (&LB($x),1); 144 &add (&LB($xx),1);
214 &xorb (&LB($ty),&BP(0,$in)); 145 &xor (&LB($ty),&BP(0,$inp));
215 &lea ($in,&DWP(1,$in)); 146 &lea ($inp,&DWP(1,$inp));
216 &movz ($tx,&BP(0,$d,$x)); 147 &movz ($tx,&BP(0,$dat,$xx));
217 &cmp ($in,&swtmp(2)); 148 &cmp ($inp,&wparam(1));
218 &movb (&BP(0,$out),&LB($ty)); 149 &mov (&BP(-1,$out,$inp),&LB($ty));
219 &lea ($out,&DWP(1,$out)); 150 &jb (&label("cloop1"));
220 &jb (&label("RC4_CHAR_loop")); 151
221 152&set_label("done");
222 &set_label("finished"); 153 &dec (&LB($xx));
223 &dec( $x); 154 &mov (&BP(-4,$dat),&LB($yy)); # save key->y
224 &stack_pop(3); 155 &mov (&BP(-8,$dat),&LB($xx)); # save key->x
225 &movb( &BP(-4,$d,"",0),&LB($y)); 156&set_label("abort");
226 &movb( &BP(-8,$d,"",0),&LB($x)); 157&function_end("RC4");
227 158
228 &function_end($name); 159########################################################################
229 } 160
161$inp="esi";
162$out="edi";
163$idi="ebp";
164$ido="ecx";
165$idx="edx";
166
167&external_label("OPENSSL_ia32cap_P");
168
169# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
170&function_begin("RC4_set_key");
171 &mov ($out,&wparam(0)); # load key
172 &mov ($idi,&wparam(1)); # load len
173 &mov ($inp,&wparam(2)); # load data
174 &picmeup($idx,"OPENSSL_ia32cap_P");
175
176 &lea ($out,&DWP(2*4,$out)); # &key->data
177 &lea ($inp,&DWP(0,$inp,$idi)); # $inp to point at the end
178 &neg ($idi);
179 &xor ("eax","eax");
180 &mov (&DWP(-4,$out),$idi); # borrow key->y
181
182 &bt (&DWP(0,$idx),20); # check for bit#20
183 &jc (&label("c1stloop"));
184
185&set_label("w1stloop",16);
186 &mov (&DWP(0,$out,"eax",4),"eax"); # key->data[i]=i;
187 &add (&LB("eax"),1); # i++;
188 &jnc (&label("w1stloop"));
189
190 &xor ($ido,$ido);
191 &xor ($idx,$idx);
192
193&set_label("w2ndloop",16);
194 &mov ("eax",&DWP(0,$out,$ido,4));
195 &add (&LB($idx),&BP(0,$inp,$idi));
196 &add (&LB($idx),&LB("eax"));
197 &add ($idi,1);
198 &mov ("ebx",&DWP(0,$out,$idx,4));
199 &jnz (&label("wnowrap"));
200 &mov ($idi,&DWP(-4,$out));
201 &set_label("wnowrap");
202 &mov (&DWP(0,$out,$idx,4),"eax");
203 &mov (&DWP(0,$out,$ido,4),"ebx");
204 &add (&LB($ido),1);
205 &jnc (&label("w2ndloop"));
206&jmp (&label("exit"));
207
208# Unlike all other x86 [and x86_64] implementations, Intel P4 core
209# [including EM64T] was found to perform poorly with above "32-bit" key
210# schedule, a.k.a. RC4_INT. Performance improvement for IA-32 hand-coded
211# assembler turned out to be 3.5x if re-coded for compressed 8-bit one,
212# a.k.a. RC4_CHAR! It's however inappropriate to just switch to 8-bit
213# schedule for x86[_64], because non-P4 implementations suffer from
214# significant performance losses then, e.g. PIII exhibits >2x
215# deterioration, and so does Opteron. In order to assure optimal
216# all-round performance, we detect P4 at run-time and set up compressed
217# key schedule, which is recognized by RC4 procedure.
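
To make the layout switch described in the comment above concrete, here is a small self-contained C sketch of the same idea: fill either a 32-bit-per-entry schedule or a packed 8-bit one at set-key time, and leave a marker word at byte offset 256 of the schedule for the encrypt routine to test. The struct and the cpu_prefers_byte_schedule() helper are hypothetical stand-ins, not the OpenSSL code.

#include <stdint.h>

/* Hypothetical key layout: 256 32-bit entries. In the compressed case the
 * 256 state bytes occupy data[0..63] and data[64] (byte offset 256) holds
 * the -1 marker that the RC4 routine above looks for. */
typedef struct {
    uint32_t x, y;
    uint32_t data[256];
} rc4_sketch_key;

/* Stand-in for the run-time CPU capability test (a bit in OPENSSL_ia32cap_P). */
static int cpu_prefers_byte_schedule(void) { return 0; }

static void sketch_set_key(rc4_sketch_key *key, const unsigned char *k, int len)
{
    int i, j = 0;                       /* len must be > 0 */

    key->x = key->y = 0;
    if (cpu_prefers_byte_schedule()) {
        unsigned char *cp = (unsigned char *)key->data;
        for (i = 0; i < 256; i++) cp[i] = (unsigned char)i;
        for (i = 0; i < 256; i++) {
            unsigned char t = cp[i];
            j = (j + t + k[i % len]) & 0xff;
            cp[i] = cp[j]; cp[j] = t;   /* swap */
        }
        key->data[64] = (uint32_t)-1;   /* byte offset 256: "compressed" marker */
    } else {
        for (i = 0; i < 256; i++) key->data[i] = (uint32_t)i;
        for (i = 0; i < 256; i++) {
            uint32_t t = key->data[i];
            j = (int)((j + t + k[i % len]) & 0xff);
            key->data[i] = key->data[j]; key->data[j] = t;  /* swap */
        }
        /* data[64] now holds S[64] (always < 256), so it can never be
         * mistaken for the -1 marker. */
    }
}

An encrypt routine written against this layout would branch on key->data[64] == (uint32_t)-1, exactly as the cmp against -1 at offset 256 does in the generated assembler above.
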
218
219&set_label("c1stloop",16);
220 &mov (&BP(0,$out,"eax"),&LB("eax")); # key->data[i]=i;
221 &add (&LB("eax"),1); # i++;
222 &jnc (&label("c1stloop"));
223
224 &xor ($ido,$ido);
225 &xor ($idx,$idx);
226 &xor ("ebx","ebx");
227
228&set_label("c2ndloop",16);
229 &mov (&LB("eax"),&BP(0,$out,$ido));
230 &add (&LB($idx),&BP(0,$inp,$idi));
231 &add (&LB($idx),&LB("eax"));
232 &add ($idi,1);
233 &mov (&LB("ebx"),&BP(0,$out,$idx));
234 &jnz (&label("cnowrap"));
235 &mov ($idi,&DWP(-4,$out));
236 &set_label("cnowrap");
237 &mov (&BP(0,$out,$idx),&LB("eax"));
238 &mov (&BP(0,$out,$ido),&LB("ebx"));
239 &add (&LB($ido),1);
240 &jnc (&label("c2ndloop"));
241
242 &mov (&DWP(256,$out),-1); # mark schedule as compressed
243
244&set_label("exit");
245 &xor ("eax","eax");
246 &mov (&DWP(-8,$out),"eax"); # key->x=0;
247 &mov (&DWP(-4,$out),"eax"); # key->y=0;
248&function_end("RC4_set_key");
249
250# const char *RC4_options(void);
251&function_begin_B("RC4_options");
252 &call (&label("pic_point"));
253&set_label("pic_point");
254 &blindpop("eax");
255 &lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
256 &picmeup("edx","OPENSSL_ia32cap_P");
257 &bt (&DWP(0,"edx"),20);
258 &jnc (&label("skip"));
259 &add ("eax",12);
260 &set_label("skip");
261 &ret ();
262&set_label("opts",64);
263&asciz ("rc4(4x,int)");
264&asciz ("rc4(1x,char)");
265&asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
266&align (64);
267&function_end_B("RC4_options");
268
269&asm_finish();
230 270
diff --git a/src/lib/libcrypto/rc4/asm/rc4-ia64.pl b/src/lib/libcrypto/rc4/asm/rc4-ia64.pl
new file mode 100644
index 0000000000..49cd5b5e69
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-ia64.pl
@@ -0,0 +1,755 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by David Mosberger <David.Mosberger@acm.org> based on the
5# Itanium optimized Crypto code which was released by HP Labs at
6# http://www.hpl.hp.com/research/linux/crypto/.
7#
8# Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
9#
10# Permission is hereby granted, free of charge, to any person obtaining
11# a copy of this software and associated documentation files (the
12# "Software"), to deal in the Software without restriction, including
13# without limitation the rights to use, copy, modify, merge, publish,
14# distribute, sublicense, and/or sell copies of the Software, and to
15# permit persons to whom the Software is furnished to do so, subject to
16# the following conditions:
17#
18# The above copyright notice and this permission notice shall be
19# included in all copies or substantial portions of the Software.
20
21# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
28
29
30
31# This is a little helper program which generates a software-pipelined
32# for RC4 encryption. The basic algorithm looks like this:
33#
34# for (counter = 0; counter < len; ++counter)
35# {
36# in = inp[counter];
37# SI = S[I];
38# J = (SI + J) & 0xff;
39# SJ = S[J];
40# T = (SI + SJ) & 0xff;
41# S[I] = SJ, S[J] = SI;
42# ST = S[T];
43# outp[counter] = in ^ ST;
44# I = (I + 1) & 0xff;
45# }
46#
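
For readers following along in C rather than IA-64 assembler, the pseudocode above transcribes to the following self-contained routine (names are ad hoc; I is assumed to have been advanced to the first index to use, as the assembler versions arrange before the loop):

#include <stddef.h>

/* Direct C transcription of the pseudocode above: one byte of keystream per
 * iteration, with the S[T] load happening after the swap, as written. */
static void rc4_bytewise(unsigned char S[256], unsigned char *I, unsigned char *J,
                         const unsigned char *inp, unsigned char *outp, size_t len)
{
    unsigned char i = *I, j = *J;
    size_t counter;

    for (counter = 0; counter < len; ++counter) {
        unsigned char in = inp[counter];
        unsigned char si = S[i];
        j = (unsigned char)(si + j);               /* J = (SI + J) & 0xff  */
        unsigned char sj = S[j];
        unsigned char t = (unsigned char)(si + sj); /* T = (SI + SJ) & 0xff */
        S[i] = sj; S[j] = si;                      /* swap S[I], S[J]      */
        outp[counter] = in ^ S[t];                 /* ST = S[T], after swap */
        i = (unsigned char)(i + 1);                /* I = (I + 1) & 0xff   */
    }
    *I = i; *J = j;
}

Everything that follows in this file is about overlapping several such iterations while keeping the S[] stores and loads correctly ordered.
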
47# Pipelining this loop isn't easy, because the stores to the S[] array
48# need to be observed in the right order. The loop generated by the
49# code below has the following pipeline diagram:
50#
51# cycle
52# | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
53# iter
54# 1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
55# 2: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
56# 3: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
57#
58# where:
59# LDI = load of S[I]
60# LDJ = load of S[J]
61# SWP = swap of S[I] and S[J]
62# LDT = load of S[T]
63#
64# Note that in the above diagram, the major trouble-spot is that LDI
65# of the 2nd iteration is performed BEFORE the SWP of the first
66# iteration. Fortunately, this is easy to detect (I of the 1st
67# iteration will be equal to J of the 2nd iteration) and when this
68# happens, we simply forward the proper value from the 1st iteration
69# to the 2nd one. The proper value in this case is simply the value
70# of S[I] from the first iteration (thanks to the fact that SWP
71# simply swaps the contents of S[I] and S[J]).
72#
73# Another potential trouble-spot is in cycle 7, where SWP of the 1st
74# iteration issues at the same time as the LDI of the 3rd iteration.
75# However, thanks to IA-64 execution semantics, this can be taken
76# care of simply by placing LDI later in the instruction-group than
77# SWP. IA-64 CPUs will automatically forward the value if they
78# detect that the SWP and LDI are accessing the same memory-location.
79
80# The core-loop that can be pipelined then looks like this (annotated
81# with McKinley/Madison issue port & latency numbers, assuming L1
82# cache hits for the most part):
83
84# operation: instruction: issue-ports: latency
85# ------------------ ----------------------------- ------------- -------
86
87# Data = *inp++ ld1 data = [inp], 1 M0-M1 1 cyc c0
88# shladd Iptr = I, KeyTable, 3 M0-M3, I0, I1 1 cyc
89# I = (I + 1) & 0xff padd1 nextI = I, one M0-M3, I0, I1 3 cyc
90# ;;
91# SI = S[I] ld8 SI = [Iptr] M0-M1 1 cyc c1 * after SWAP!
92# ;;
93# cmp.eq.unc pBypass = I, J * after J is valid!
94# J = SI + J add J = J, SI M0-M3, I0, I1 1 cyc c2
95# (pBypass) br.cond.spnt Bypass
96# ;;
97# ---------------------------------------------------------------------------------------
98# J = J & 0xff zxt1 J = J I0, I1, 1 cyc c3
99# ;;
100# shladd Jptr = J, KeyTable, 3 M0-M3, I0, I1 1 cyc c4
101# ;;
102# SJ = S[J] ld8 SJ = [Jptr] M0-M1 1 cyc c5
103# ;;
104# ---------------------------------------------------------------------------------------
105# T = (SI + SJ) add T = SI, SJ M0-M3, I0, I1 1 cyc c6
106# ;;
107# T = T & 0xff zxt1 T = T I0, I1 1 cyc
108# S[I] = SJ st8 [Iptr] = SJ M2-M3 c7
109# S[J] = SI st8 [Jptr] = SI M2-M3
110# ;;
111# shladd Tptr = T, KeyTable, 3 M0-M3, I0, I1 1 cyc c8
112# ;;
113# ---------------------------------------------------------------------------------------
114# T = S[T] ld8 T = [Tptr] M0-M1 1 cyc c9
115# ;;
116# data ^= T xor data = data, T M0-M3, I0, I1 1 cyc c10
117# ;;
118# *out++ = Data ^ T dep word = word, data, 8, POS I0, I1 1 cyc c11
119# ;;
120# ---------------------------------------------------------------------------------------
121
122# There are several points worth making here:
123
124# - Note that due to the bypass/forwarding-path, the first two
125# phases of the loop are strangly mingled together. In
126# particular, note that the first stage of the pipeline is
127# using the value of "J", as calculated by the second stage.
128# - Each bundle-pair will have exactly 6 instructions.
129# - Pipelined, the loop can execute in 3 cycles/iteration and
130# 4 stages. However, McKinley/Madison can issue "st1" to
131# the same bank at a rate of at most one per 4 cycles. Thus,
132# instead of storing each byte, we accumulate them in a word
133# and then write them back at once with a single "st8" (this
134# implies that the setup code needs to ensure that the output
135# buffer is properly aligned, if need be, by encoding the
136# first few bytes separately).
137# - There is no space for a "br.ctop" instruction. For this
138# reason we can't use module-loop support in IA-64 and have
139# to do a traditional, purely software-pipelined loop.
140# - We can't replace any of the remaining "add/zxt1" pairs with
141# "padd1" because the latency for that instruction is too high
142# and would push the loop to the point where more bypasses
143# would be needed, which we don't have space for.
144# - The above loop runs at around 3.26 cycles/byte, or roughly
145# 440 MByte/sec on a 1.5GHz Madison. This is well below the
146# system bus bandwidth and hence with judicious use of
147# "lfetch" this loop can run at (almost) peak speed even when
148# the input and output data reside in memory. The
149# max. latency that can be tolerated is (PREFETCH_DISTANCE *
150# L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
151# least) 1-ahead prefetching of 128 byte cache-lines. Note
152# that we do NOT prefetch into L1, since that would only
153# interfere with the S[] table values stored there. This is
154# acceptable because there is a 10 cycle latency between
155# load and first use of the input data.
156# - We use a branch to out-of-line bypass-code of cycle-pressure:
157# we calculate the next J, check for the need to activate the
158# bypass path, and activate the bypass path ALL IN THE SAME
159# CYCLE. If we didn't have these constraints, we could do
160# the bypass with a simple conditional move instruction.
161# Fortunately, the bypass paths get activated relatively
162# infrequently, so the extra branches don't cost all that much
163# (about 0.04 cycles/byte, measured on a 16396 byte file with
164# random input data).
165#
166
167$phases = 4; # number of stages/phases in the pipelined-loop
168$unroll_count = 6; # number of times we unrolled it
169$pComI = (1 << 0);
170$pComJ = (1 << 1);
171$pComT = (1 << 2);
172$pOut = (1 << 3);
173
174$NData = 4;
175$NIP = 3;
176$NJP = 2;
177$NI = 2;
178$NSI = 3;
179$NSJ = 2;
180$NT = 2;
181$NOutWord = 2;
182
183#
184# $threshold is the minimum length before we attempt to use the
185# big software-pipelined loop. It MUST be greater-or-equal
186# to:
187# PHASES * (UNROLL_COUNT + 1) + 7
188#
189# The "+ 7" comes from the fact we may have to encode up to
190# 7 bytes separately before the output pointer is aligned.
191#
192$threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
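
With the values used here ($phases = 4, $unroll_count = 6) this works out to 3 * (4 * 7) + 7 = 91, comfortably above the stated minimum of 4 * (6 + 1) + 7 = 35; inputs shorter than that are sent straight to the modulo-scheduled remainder loop by the pSmall test further down.
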
193
194sub I {
195 local *code = shift;
196 local $format = shift;
197 $code .= sprintf ("\t\t".$format."\n", @_);
198}
199
200sub P {
201 local *code = shift;
202 local $format = shift;
203 $code .= sprintf ($format."\n", @_);
204}
205
206sub STOP {
207 local *code = shift;
208 $code .=<<___;
209 ;;
210___
211}
212
213sub emit_body {
214 local *c = shift;
215 local *bypass = shift;
216 local ($iteration, $p) = @_;
217
218 local $i0 = $iteration;
219 local $i1 = $iteration - 1;
220 local $i2 = $iteration - 2;
221 local $i3 = $iteration - 3;
222 local $iw0 = ($iteration - 3) / 8;
223 local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
224 local $byte_num = ($iteration - 3) % 8;
225 local $label = $iteration + 1;
226 local $pAny = ($p & 0xf) == 0xf;
227 local $pByp = (($p & $pComI) && ($iteration > 0));
228
229 $c.=<<___;
230//////////////////////////////////////////////////
231___
232
233 if (($p & 0xf) == 0) {
234 $c.="#ifdef HOST_IS_BIG_ENDIAN\n";
235 &I(\$c,"shr.u OutWord[%u] = OutWord[%u], 32;;",
236 $iw1 % $NOutWord, $iw1 % $NOutWord);
237 $c.="#endif\n";
238 &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
239 return;
240 }
241
242 # Cycle 0
243 &I(\$c, "{ .mmi") if ($pAny);
244 &I(\$c, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData) if ($p & $pComI);
245 &I(\$c, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
246 &I(\$c, "zxt1 J = J") if ($p & $pComJ);
247 &I(\$c, "}") if ($pAny);
248 &I(\$c, "{ .mmi") if ($pAny);
249 &I(\$c, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT) if ($p & $pOut);
250 &I(\$c, "add T[%u] = SI[%u], SJ[%u]",
251 $i0 % $NT, $i2 % $NSI, $i1 % $NSJ) if ($p & $pComT);
252 &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
253 &I(\$c, "}") if ($pAny);
254 &STOP(\$c);
255
256 # Cycle 1
257 &I(\$c, "{ .mmi") if ($pAny);
258 &I(\$c, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
259 &I(\$c, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
260 &I(\$c, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
261 &I(\$c, "}") if ($pAny);
262 &I(\$c, "{ .mmi") if ($pAny);
263 &I(\$c, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
264 &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP) if ($p & $pComJ);
265 &I(\$c, "xor Data[%u] = Data[%u], T[%u]",
266 $i3 % $NData, $i3 % $NData, $i1 % $NT) if ($p & $pOut);
267 &I(\$c, "}") if ($pAny);
268 &STOP(\$c);
269
270 # Cycle 2
271 &I(\$c, "{ .mmi") if ($pAny);
272 &I(\$c, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
273 &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI) if ($pByp);
274 &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
275 $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
276 &I(\$c, "}") if ($pAny);
277 &I(\$c, "{ .mmb") if ($pAny);
278 &I(\$c, "add J = J, SI[%u]", $i0 % $NSI) if ($p & $pComI);
279 &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
280 &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
281 &I(\$c, "}") if ($pAny);
282 &STOP(\$c);
283
284 &P(\$c, ".rc4Resume%u:", $label) if ($pByp);
285 if ($byte_num == 0 && $iteration >= $phases) {
286 &I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
287 $iw1 % $NOutWord) if ($p & $pOut);
288 if ($iteration == (1 + $unroll_count) * $phases - 1) {
289 if ($unroll_count == 6) {
290 &I(\$c, "mov OutWord[%u] = OutWord[%u]",
291 $iw1 % $NOutWord, $iw0 % $NOutWord);
292 }
293 &I(\$c, "lfetch.nt1 [InPrefetch], %u",
294 $unroll_count * $phases);
295 &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
296 $unroll_count * $phases);
297 &I(\$c, "br.cloop.sptk.few .rc4Loop");
298 }
299 }
300
301 if ($pByp) {
302 &P(\$bypass, ".rc4Bypass%u:", $label);
303 &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
304 &I(\$bypass, "nop 0");
305 &I(\$bypass, "nop 0");
306 &I(\$bypass, ";;");
307 &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
308 &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
309 &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
310 &I(\$bypass, ";;");
311 }
312}
313
314$code=<<___;
315.ident \"rc4-ia64.s, version 3.0\"
316.ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
317
318#define LCSave r8
319#define PRSave r9
320
321/* Inputs become invalid once rotation begins! */
322
323#define StateTable in0
324#define DataLen in1
325#define InputBuffer in2
326#define OutputBuffer in3
327
328#define KTable r14
329#define J r15
330#define InPtr r16
331#define OutPtr r17
332#define InPrefetch r18
333#define OutPrefetch r19
334#define One r20
335#define LoopCount r21
336#define Remainder r22
337#define IFinal r23
338#define EndPtr r24
339
340#define tmp0 r25
341#define tmp1 r26
342
343#define pBypass p6
344#define pDone p7
345#define pSmall p8
346#define pAligned p9
347#define pUnaligned p10
348
349#define pComputeI pPhase[0]
350#define pComputeJ pPhase[1]
351#define pComputeT pPhase[2]
352#define pOutput pPhase[3]
353
354#define RetVal r8
355#define L_OK p7
356#define L_NOK p8
357
358#define _NINPUTS 4
359#define _NOUTPUT 0
360
361#define _NROTATE 24
362#define _NLOCALS (_NROTATE - _NINPUTS - _NOUTPUT)
363
364#ifndef SZ
365# define SZ 4 // this must be set to sizeof(RC4_INT)
366#endif
367
368#if SZ == 1
369# define LKEY ld1
370# define SKEY st1
371# define KEYADDR(dst, i) add dst = i, KTable
372#elif SZ == 2
373# define LKEY ld2
374# define SKEY st2
375# define KEYADDR(dst, i) shladd dst = i, 1, KTable
376#elif SZ == 4
377# define LKEY ld4
378# define SKEY st4
379# define KEYADDR(dst, i) shladd dst = i, 2, KTable
380#else
381# define LKEY ld8
382# define SKEY st8
383# define KEYADDR(dst, i) shladd dst = i, 3, KTable
384#endif
385
386#if defined(_HPUX_SOURCE) && !defined(_LP64)
387# define ADDP addp4
388#else
389# define ADDP add
390#endif
391
392/* Define a macro for the bit number of the n-th byte: */
393
394#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
395# define HOST_IS_BIG_ENDIAN
396# define BYTE_POS(n) (56 - (8 * (n)))
397#else
398# define BYTE_POS(n) (8 * (n))
399#endif
400
401/*
402 We must perform the first phase of the pipeline explicitly since
403 we will always load from the stable the first time. The br.cexit
404 will never be taken since regardless of the number of bytes because
405 the epilogue count is 4.
406*/
407/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
408 assembler failed on original macro with syntax error. <appro> */
409#define MODSCHED_RC4_PROLOGUE \\
410 { \\
411 ld1 Data[0] = [InPtr], 1; \\
412 add IFinal = 1, I[1]; \\
413 KEYADDR(IPr[0], I[1]); \\
414 } ;; \\
415 { \\
416 LKEY SI[0] = [IPr[0]]; \\
417 mov pr.rot = 0x10000; \\
418 mov ar.ec = 4; \\
419 } ;; \\
420 { \\
421 add J = J, SI[0]; \\
422 zxt1 I[0] = IFinal; \\
423 br.cexit.spnt.few .+16; /* never taken */ \\
424 } ;;
425#define MODSCHED_RC4_LOOP(label) \\
426label: \\
427 { .mmi; \\
428 (pComputeI) ld1 Data[0] = [InPtr], 1; \\
429 (pComputeI) add IFinal = 1, I[1]; \\
430 (pComputeJ) zxt1 J = J; \\
431 }{ .mmi; \\
432 (pOutput) LKEY T[1] = [T[1]]; \\
433 (pComputeT) add T[0] = SI[2], SJ[1]; \\
434 (pComputeI) KEYADDR(IPr[0], I[1]); \\
435 } ;; \\
436 { .mmi; \\
437 (pComputeT) SKEY [IPr[2]] = SJ[1]; \\
438 (pComputeT) SKEY [JP[1]] = SI[2]; \\
439 (pComputeT) zxt1 T[0] = T[0]; \\
440 }{ .mmi; \\
441 (pComputeI) LKEY SI[0] = [IPr[0]]; \\
442 (pComputeJ) KEYADDR(JP[0], J); \\
443 (pComputeI) cmp.eq.unc pBypass, p0 = I[1], J; \\
444 } ;; \\
445 { .mmi; \\
446 (pComputeJ) LKEY SJ[0] = [JP[0]]; \\
447 (pOutput) xor Data[3] = Data[3], T[1]; \\
448 nop 0x0; \\
449 }{ .mmi; \\
450 (pComputeT) KEYADDR(T[0], T[0]); \\
451 (pBypass) mov SI[0] = SI[1]; \\
452 (pComputeI) zxt1 I[0] = IFinal; \\
453 } ;; \\
454 { .mmb; \\
455 (pOutput) st1 [OutPtr] = Data[3], 1; \\
456 (pComputeI) add J = J, SI[0]; \\
457 br.ctop.sptk.few label; \\
458 } ;;
459
460 .text
461
462 .align 32
463
464 .type RC4, \@function
465 .global RC4
466
467 .proc RC4
468 .prologue
469
470RC4:
471 {
472 .mmi
473 alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
474
475 .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
476 OutWord[2]
477 .rotp pPhase[4]
478
479 ADDP InPrefetch = 0, InputBuffer
480 ADDP KTable = 0, StateTable
481 }
482 {
483 .mmi
484 ADDP InPtr = 0, InputBuffer
485 ADDP OutPtr = 0, OutputBuffer
486 mov RetVal = r0
487 }
488 ;;
489 {
490 .mmi
491 lfetch.nt1 [InPrefetch], 0x80
492 ADDP OutPrefetch = 0, OutputBuffer
493 }
494 { // Return 0 if the input length is nonsensical
495 .mib
496 ADDP StateTable = 0, StateTable
497 cmp.ge.unc L_NOK, L_OK = r0, DataLen
498 (L_NOK) br.ret.sptk.few rp
499 }
500 ;;
501 {
502 .mib
503 cmp.eq.or L_NOK, L_OK = r0, InPtr
504 cmp.eq.or L_NOK, L_OK = r0, OutPtr
505 nop 0x0
506 }
507 {
508 .mib
509 cmp.eq.or L_NOK, L_OK = r0, StateTable
510 nop 0x0
511 (L_NOK) br.ret.sptk.few rp
512 }
513 ;;
514 LKEY I[1] = [KTable], SZ
515/* Prefetch the state-table. It contains 256 elements of size SZ */
516
517#if SZ == 1
518 ADDP tmp0 = 1*128, StateTable
519#elif SZ == 2
520 ADDP tmp0 = 3*128, StateTable
521 ADDP tmp1 = 2*128, StateTable
522#elif SZ == 4
523 ADDP tmp0 = 7*128, StateTable
524 ADDP tmp1 = 6*128, StateTable
525#elif SZ == 8
526 ADDP tmp0 = 15*128, StateTable
527 ADDP tmp1 = 14*128, StateTable
528#endif
529 ;;
530#if SZ >= 8
531 lfetch.fault.nt1 [tmp0], -256 // 15
532 lfetch.fault.nt1 [tmp1], -256;;
533 lfetch.fault.nt1 [tmp0], -256 // 13
534 lfetch.fault.nt1 [tmp1], -256;;
535 lfetch.fault.nt1 [tmp0], -256 // 11
536 lfetch.fault.nt1 [tmp1], -256;;
537 lfetch.fault.nt1 [tmp0], -256 // 9
538 lfetch.fault.nt1 [tmp1], -256;;
539#endif
540#if SZ >= 4
541 lfetch.fault.nt1 [tmp0], -256 // 7
542 lfetch.fault.nt1 [tmp1], -256;;
543 lfetch.fault.nt1 [tmp0], -256 // 5
544 lfetch.fault.nt1 [tmp1], -256;;
545#endif
546#if SZ >= 2
547 lfetch.fault.nt1 [tmp0], -256 // 3
548 lfetch.fault.nt1 [tmp1], -256;;
549#endif
550 {
551 .mii
552 lfetch.fault.nt1 [tmp0] // 1
553 add I[1]=1,I[1];;
554 zxt1 I[1]=I[1]
555 }
556 {
557 .mmi
558 lfetch.nt1 [InPrefetch], 0x80
559 lfetch.excl.nt1 [OutPrefetch], 0x80
560 .save pr, PRSave
561 mov PRSave = pr
562 } ;;
563 {
564 .mmi
565 lfetch.excl.nt1 [OutPrefetch], 0x80
566 LKEY J = [KTable], SZ
567 ADDP EndPtr = DataLen, InPtr
568 } ;;
569 {
570 .mmi
571 ADDP EndPtr = -1, EndPtr // Make it point to
572 // last data byte.
573 mov One = 1
574 .save ar.lc, LCSave
575 mov LCSave = ar.lc
576 .body
577 } ;;
578 {
579 .mmb
580 sub Remainder = 0, OutPtr
581 cmp.gtu pSmall, p0 = $threshold, DataLen
582(pSmall) br.cond.dpnt .rc4Remainder // Data too small for
583 // big loop.
584 } ;;
585 {
586 .mmi
587 and Remainder = 0x7, Remainder
588 ;;
589 cmp.eq pAligned, pUnaligned = Remainder, r0
590 nop 0x0
591 } ;;
592 {
593 .mmb
594.pred.rel "mutex",pUnaligned,pAligned
595(pUnaligned) add Remainder = -1, Remainder
596(pAligned) sub Remainder = EndPtr, InPtr
597(pAligned) br.cond.dptk.many .rc4Aligned
598 } ;;
599 {
600 .mmi
601 nop 0x0
602 nop 0x0
603 mov.i ar.lc = Remainder
604 }
605
606/* Do the initial few bytes via the compact, modulo-scheduled loop
607 until the output pointer is 8-byte-aligned. */
608
609 MODSCHED_RC4_PROLOGUE
610 MODSCHED_RC4_LOOP(.RC4AlignLoop)
611
612 {
613 .mib
614 sub Remainder = EndPtr, InPtr
615 zxt1 IFinal = IFinal
616 clrrrb // Clear CFM.rrb.pr so
617 ;; // next "mov pr.rot = N"
618 // does the right thing.
619 }
620 {
621 .mmi
622 mov I[1] = IFinal
623 nop 0x0
624 nop 0x0
625 } ;;
626
627
628.rc4Aligned:
629
630/*
631 Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
632 */
633
634 {
635 .mlx
636 add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
637 movl Remainder = 0xaaaaaaaaaaaaaaab
638 } ;;
639 {
640 .mmi
641 setf.sig f6 = LoopCount // M2, M3 6 cyc
642 setf.sig f7 = Remainder // M2, M3 6 cyc
643 nop 0x0
644 } ;;
645 {
646 .mfb
647 nop 0x0
648 xmpy.hu f6 = f6, f7
649 nop 0x0
650 } ;;
651 {
652 .mmi
653 getf.sig LoopCount = f6;; // M2 5 cyc
654 nop 0x0
655 shr.u LoopCount = LoopCount, 4
656 } ;;
657 {
658 .mmi
659 nop 0x0
660 nop 0x0
661 mov.i ar.lc = LoopCount
662 } ;;
663
664/* Now comes the unrolled loop: */
665
666.rc4Prologue:
667___
668
669$iteration = 0;
670
671# Generate the prologue:
672$predicates = 1;
673for ($i = 0; $i < $phases; ++$i) {
674 &emit_body (\$code, \$bypass, $iteration++, $predicates);
675 $predicates = ($predicates << 1) | 1;
676}
677
678$code.=<<___;
679.rc4Loop:
680___
681
682# Generate the body:
683for ($i = 0; $i < $unroll_count*$phases; ++$i) {
684 &emit_body (\$code, \$bypass, $iteration++, $predicates);
685}
686
687$code.=<<___;
688.rc4Epilogue:
689___
690
691# Generate the epilogue:
692for ($i = 0; $i < $phases; ++$i) {
693 $predicates <<= 1;
694 &emit_body (\$code, \$bypass, $iteration++, $predicates);
695}
696
697$code.=<<___;
698 {
699 .mmi
700 lfetch.nt1 [EndPtr] // fetch line with last byte
701 mov IFinal = I[1]
702 nop 0x0
703 }
704
705.rc4Remainder:
706 {
707 .mmi
708 sub Remainder = EndPtr, InPtr // Calculate
709 // # of bytes
710 // left - 1
711 nop 0x0
712 nop 0x0
713 } ;;
714 {
715 .mib
716 cmp.eq pDone, p0 = -1, Remainder // done already?
717 mov.i ar.lc = Remainder
718(pDone) br.cond.dptk.few .rc4Complete
719 }
720
721/* Do the remaining bytes via the compact, modulo-scheduled loop */
722
723 MODSCHED_RC4_PROLOGUE
724 MODSCHED_RC4_LOOP(.RC4RestLoop)
725
726.rc4Complete:
727 {
728 .mmi
729 add KTable = -SZ, KTable
730 add IFinal = -1, IFinal
731 mov ar.lc = LCSave
732 } ;;
733 {
734 .mii
735 SKEY [KTable] = J,-SZ
736 zxt1 IFinal = IFinal
737 mov pr = PRSave, 0x1FFFF
738 } ;;
739 {
740 .mib
741 SKEY [KTable] = IFinal
742 add RetVal = 1, r0
743 br.ret.sptk.few rp
744 } ;;
745___
746
747# Last but not least, emit the code for the bypass-code of the unrolled loop:
748
749$code.=$bypass;
750
751$code.=<<___;
752 .endp RC4
753___
754
755print $code;
diff --git a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl
new file mode 100644
index 0000000000..96681fa05e
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl
@@ -0,0 +1,205 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# February 2009
11#
12# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to
13# "cluster" Address Generation Interlocks, so that one pipeline stall
14# resolves several dependencies.
15
16$rp="%r14";
17$sp="%r15";
18$code=<<___;
19.text
20
21___
22
23# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
24{
25$acc="%r0";
26$cnt="%r1";
27$key="%r2";
28$len="%r3";
29$inp="%r4";
30$out="%r5";
31
32@XX=("%r6","%r7");
33@TX=("%r8","%r9");
34$YY="%r10";
35$TY="%r11";
36
37$code.=<<___;
38.globl RC4
39.type RC4,\@function
40.align 64
41RC4:
42 stmg %r6,%r11,48($sp)
43 llgc $XX[0],0($key)
44 llgc $YY,1($key)
45 la $XX[0],1($XX[0])
46 nill $XX[0],0xff
47 srlg $cnt,$len,3
48 ltgr $cnt,$cnt
49 llgc $TX[0],2($XX[0],$key)
50 jz .Lshort
51 j .Loop8
52
53.align 64
54.Loop8:
55___
56for ($i=0;$i<8;$i++) {
57$code.=<<___;
58 la $YY,0($YY,$TX[0]) # $i
59 nill $YY,255
60 la $XX[1],1($XX[0])
61 nill $XX[1],255
62___
63$code.=<<___ if ($i==1);
64 llgc $acc,2($TY,$key)
65___
66$code.=<<___ if ($i>1);
67 sllg $acc,$acc,8
68 ic $acc,2($TY,$key)
69___
70$code.=<<___;
71 llgc $TY,2($YY,$key)
72 stc $TX[0],2($YY,$key)
73 llgc $TX[1],2($XX[1],$key)
74 stc $TY,2($XX[0],$key)
75 cr $XX[1],$YY
76 jne .Lcmov$i
77 la $TX[1],0($TX[0])
78.Lcmov$i:
79 la $TY,0($TY,$TX[0])
80 nill $TY,255
81___
82push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
83}
84
85$code.=<<___;
86 lg $TX[1],0($inp)
87 sllg $acc,$acc,8
88 la $inp,8($inp)
89 ic $acc,2($TY,$key)
90 xgr $acc,$TX[1]
91 stg $acc,0($out)
92 la $out,8($out)
93 brct $cnt,.Loop8
94
95.Lshort:
96 lghi $acc,7
97 ngr $len,$acc
98 jz .Lexit
99 j .Loop1
100
101.align 16
102.Loop1:
103 la $YY,0($YY,$TX[0])
104 nill $YY,255
105 llgc $TY,2($YY,$key)
106 stc $TX[0],2($YY,$key)
107 stc $TY,2($XX[0],$key)
108 ar $TY,$TX[0]
109 ahi $XX[0],1
110 nill $TY,255
111 nill $XX[0],255
112 llgc $acc,0($inp)
113 la $inp,1($inp)
114 llgc $TY,2($TY,$key)
115 llgc $TX[0],2($XX[0],$key)
116 xr $acc,$TY
117 stc $acc,0($out)
118 la $out,1($out)
119 brct $len,.Loop1
120
121.Lexit:
122 ahi $XX[0],-1
123 stc $XX[0],0($key)
124 stc $YY,1($key)
125 lmg %r6,%r11,48($sp)
126 br $rp
127.size RC4,.-RC4
128.string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
129
130___
131}
132
133# void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
134{
135$cnt="%r0";
136$idx="%r1";
137$key="%r2";
138$len="%r3";
139$inp="%r4";
140$acc="%r5";
141$dat="%r6";
142$ikey="%r7";
143$iinp="%r8";
144
145$code.=<<___;
146.globl RC4_set_key
147.type RC4_set_key,\@function
148.align 64
149RC4_set_key:
150 stmg %r6,%r8,48($sp)
151 lhi $cnt,256
152 la $idx,0(%r0)
153 sth $idx,0($key)
154.align 4
155.L1stloop:
156 stc $idx,2($idx,$key)
157 la $idx,1($idx)
158 brct $cnt,.L1stloop
159
160 lghi $ikey,-256
161 lr $cnt,$len
162 la $iinp,0(%r0)
163 la $idx,0(%r0)
164.align 16
165.L2ndloop:
166 llgc $acc,2+256($ikey,$key)
167 llgc $dat,0($iinp,$inp)
168 la $idx,0($idx,$acc)
169 la $ikey,1($ikey)
170 la $idx,0($idx,$dat)
171 nill $idx,255
172 la $iinp,1($iinp)
173 tml $ikey,255
174 llgc $dat,2($idx,$key)
175 stc $dat,2+256-1($ikey,$key)
176 stc $acc,2($idx,$key)
177 jz .Ldone
178 brct $cnt,.L2ndloop
179 lr $cnt,$len
180 la $iinp,0(%r0)
181 j .L2ndloop
182.Ldone:
183 lmg %r6,%r8,48($sp)
184 br $rp
185.size RC4_set_key,.-RC4_set_key
186
187___
188}
189
190# const char *RC4_options()
191$code.=<<___;
192.globl RC4_options
193.type RC4_options,\@function
194.align 16
195RC4_options:
196 larl %r2,.Loptions
197 br %r14
198.size RC4_options,.-RC4_options
199.section .rodata
200.Loptions:
201.align 8
202.string "rc4(8x,char)"
203___
204
205print $code;
diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
index 00c6fa28aa..677be5fe25 100755
--- a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
+++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
@@ -58,14 +58,18 @@
58# fit for Core2 and therefore the code was modified to skip cloop8 on 58# fit for Core2 and therefore the code was modified to skip cloop8 on
59# this CPU. 59# this CPU.
60 60
61$output=shift; 61$flavour = shift;
62$output = shift;
63if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
64
65$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
62 66
63$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 67$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
64( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 68( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
65( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 69( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
66die "can't locate x86_64-xlate.pl"; 70die "can't locate x86_64-xlate.pl";
67 71
68open STDOUT,"| $^X $xlate $output"; 72open STDOUT,"| $^X $xlate $flavour $output";
69 73
70$dat="%rdi"; # arg1 74$dat="%rdi"; # arg1
71$len="%rsi"; # arg2 75$len="%rsi"; # arg2
@@ -87,8 +91,10 @@ RC4: or $len,$len
87 jne .Lentry 91 jne .Lentry
88 ret 92 ret
89.Lentry: 93.Lentry:
94 push %rbx
90 push %r12 95 push %r12
91 push %r13 96 push %r13
97.Lprologue:
92 98
93 add \$8,$dat 99 add \$8,$dat
94 movl -8($dat),$XX[0]#d 100 movl -8($dat),$XX[0]#d
@@ -133,16 +139,8 @@ $code.=<<___;
133 jnz .Lloop8 139 jnz .Lloop8
134 cmp \$0,$len 140 cmp \$0,$len
135 jne .Lloop1 141 jne .Lloop1
136___ 142 jmp .Lexit
137$code.=<<___;
138.Lexit:
139 sub \$1,$XX[0]#b
140 movl $XX[0]#d,-8($dat)
141 movl $YY#d,-4($dat)
142 143
143 pop %r13
144 pop %r12
145 ret
146.align 16 144.align 16
147.Lloop1: 145.Lloop1:
148 add $TX[0]#b,$YY#b 146 add $TX[0]#b,$YY#b
@@ -167,9 +165,8 @@ $code.=<<___;
167 movzb ($dat,$XX[0]),$TX[0]#d 165 movzb ($dat,$XX[0]),$TX[0]#d
168 test \$-8,$len 166 test \$-8,$len
169 jz .Lcloop1 167 jz .Lcloop1
170 cmp \$0,260($dat) 168 cmpl \$0,260($dat)
171 jnz .Lcloop1 169 jnz .Lcloop1
172 push %rbx
173 jmp .Lcloop8 170 jmp .Lcloop8
174.align 16 171.align 16
175.Lcloop8: 172.Lcloop8:
@@ -224,7 +221,6 @@ $code.=<<___;
224 221
225 test \$-8,$len 222 test \$-8,$len
226 jnz .Lcloop8 223 jnz .Lcloop8
227 pop %rbx
228 cmp \$0,$len 224 cmp \$0,$len
229 jne .Lcloop1 225 jne .Lcloop1
230 jmp .Lexit 226 jmp .Lexit
@@ -249,6 +245,19 @@ $code.=<<___;
249 sub \$1,$len 245 sub \$1,$len
250 jnz .Lcloop1 246 jnz .Lcloop1
251 jmp .Lexit 247 jmp .Lexit
248
249.align 16
250.Lexit:
251 sub \$1,$XX[0]#b
252 movl $XX[0]#d,-8($dat)
253 movl $YY#d,-4($dat)
254
255 mov (%rsp),%r13
256 mov 8(%rsp),%r12
257 mov 16(%rsp),%rbx
258 add \$24,%rsp
259.Lepilogue:
260 ret
252.size RC4,.-RC4 261.size RC4,.-RC4
253___ 262___
254 263
@@ -333,11 +342,10 @@ RC4_set_key:
333.size RC4_set_key,.-RC4_set_key 342.size RC4_set_key,.-RC4_set_key
334 343
335.globl RC4_options 344.globl RC4_options
336.type RC4_options,\@function,0 345.type RC4_options,\@abi-omnipotent
337.align 16 346.align 16
338RC4_options: 347RC4_options:
339 .picmeup %rax 348 lea .Lopts(%rip),%rax
340 lea .Lopts-.(%rax),%rax
341 mov OPENSSL_ia32cap_P(%rip),%edx 349 mov OPENSSL_ia32cap_P(%rip),%edx
342 bt \$20,%edx 350 bt \$20,%edx
343 jnc .Ldone 351 jnc .Ldone
@@ -357,9 +365,139 @@ RC4_options:
357.size RC4_options,.-RC4_options 365.size RC4_options,.-RC4_options
358___ 366___
359 367
360$code =~ s/#([bwd])/$1/gm; 368# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
369# CONTEXT *context,DISPATCHER_CONTEXT *disp)
370if ($win64) {
371$rec="%rcx";
372$frame="%rdx";
373$context="%r8";
374$disp="%r9";
375
376$code.=<<___;
377.extern __imp_RtlVirtualUnwind
378.type stream_se_handler,\@abi-omnipotent
379.align 16
380stream_se_handler:
381 push %rsi
382 push %rdi
383 push %rbx
384 push %rbp
385 push %r12
386 push %r13
387 push %r14
388 push %r15
389 pushfq
390 sub \$64,%rsp
361 391
362$code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPSCANLIB} ne ""); 392 mov 120($context),%rax # pull context->Rax
393 mov 248($context),%rbx # pull context->Rip
394
395 lea .Lprologue(%rip),%r10
396 cmp %r10,%rbx # context->Rip<prologue label
397 jb .Lin_prologue
398
399 mov 152($context),%rax # pull context->Rsp
400
401 lea .Lepilogue(%rip),%r10
402 cmp %r10,%rbx # context->Rip>=epilogue label
403 jae .Lin_prologue
404
405 lea 24(%rax),%rax
406
407 mov -8(%rax),%rbx
408 mov -16(%rax),%r12
409 mov -24(%rax),%r13
410 mov %rbx,144($context) # restore context->Rbx
411 mov %r12,216($context) # restore context->R12
412 mov %r13,224($context) # restore context->R13
413
414.Lin_prologue:
415 mov 8(%rax),%rdi
416 mov 16(%rax),%rsi
417 mov %rax,152($context) # restore context->Rsp
418 mov %rsi,168($context) # restore context->Rsi
419 mov %rdi,176($context) # restore context->Rdi
420
421 jmp .Lcommon_seh_exit
422.size stream_se_handler,.-stream_se_handler
423
424.type key_se_handler,\@abi-omnipotent
425.align 16
426key_se_handler:
427 push %rsi
428 push %rdi
429 push %rbx
430 push %rbp
431 push %r12
432 push %r13
433 push %r14
434 push %r15
435 pushfq
436 sub \$64,%rsp
437
438 mov 152($context),%rax # pull context->Rsp
439 mov 8(%rax),%rdi
440 mov 16(%rax),%rsi
441 mov %rsi,168($context) # restore context->Rsi
442 mov %rdi,176($context) # restore context->Rdi
443
444.Lcommon_seh_exit:
445
446 mov 40($disp),%rdi # disp->ContextRecord
447 mov $context,%rsi # context
448 mov \$154,%ecx # sizeof(CONTEXT)
449 .long 0xa548f3fc # cld; rep movsq
450
451 mov $disp,%rsi
452 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
453 mov 8(%rsi),%rdx # arg2, disp->ImageBase
454 mov 0(%rsi),%r8 # arg3, disp->ControlPc
455 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
456 mov 40(%rsi),%r10 # disp->ContextRecord
457 lea 56(%rsi),%r11 # &disp->HandlerData
458 lea 24(%rsi),%r12 # &disp->EstablisherFrame
459 mov %r10,32(%rsp) # arg5
460 mov %r11,40(%rsp) # arg6
461 mov %r12,48(%rsp) # arg7
462 mov %rcx,56(%rsp) # arg8, (NULL)
463 call *__imp_RtlVirtualUnwind(%rip)
464
465 mov \$1,%eax # ExceptionContinueSearch
466 add \$64,%rsp
467 popfq
468 pop %r15
469 pop %r14
470 pop %r13
471 pop %r12
472 pop %rbp
473 pop %rbx
474 pop %rdi
475 pop %rsi
476 ret
477.size key_se_handler,.-key_se_handler
478
479.section .pdata
480.align 4
481 .rva .LSEH_begin_RC4
482 .rva .LSEH_end_RC4
483 .rva .LSEH_info_RC4
484
485 .rva .LSEH_begin_RC4_set_key
486 .rva .LSEH_end_RC4_set_key
487 .rva .LSEH_info_RC4_set_key
488
489.section .xdata
490.align 8
491.LSEH_info_RC4:
492 .byte 9,0,0,0
493 .rva stream_se_handler
494.LSEH_info_RC4_set_key:
495 .byte 9,0,0,0
496 .rva key_se_handler
497___
498}
499
500$code =~ s/#([bwd])/$1/gm;
363 501
364print $code; 502print $code;
365 503
diff --git a/src/lib/libcrypto/rc4/rc4.h b/src/lib/libcrypto/rc4/rc4.h
index 2d8620d33b..29d1acccf5 100644
--- a/src/lib/libcrypto/rc4/rc4.h
+++ b/src/lib/libcrypto/rc4/rc4.h
@@ -64,6 +64,8 @@
64#error RC4 is disabled. 64#error RC4 is disabled.
65#endif 65#endif
66 66
67#include <stddef.h>
68
67#ifdef __cplusplus 69#ifdef __cplusplus
68extern "C" { 70extern "C" {
69#endif 71#endif
@@ -76,11 +78,8 @@ typedef struct rc4_key_st
76 78
77 79
78const char *RC4_options(void); 80const char *RC4_options(void);
79#ifdef OPENSSL_FIPS
80void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
81#endif
82void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data); 81void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
83void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata, 82void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
84 unsigned char *outdata); 83 unsigned char *outdata);
85 84
86#ifdef __cplusplus 85#ifdef __cplusplus
diff --git a/src/lib/libcrypto/rc4/rc4_enc.c b/src/lib/libcrypto/rc4/rc4_enc.c
index 0660ea60a2..8c4fc6c7a3 100644
--- a/src/lib/libcrypto/rc4/rc4_enc.c
+++ b/src/lib/libcrypto/rc4/rc4_enc.c
@@ -67,12 +67,12 @@
67 * Date: Wed, 14 Sep 1994 06:35:31 GMT 67 * Date: Wed, 14 Sep 1994 06:35:31 GMT
68 */ 68 */
69 69
70void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata, 70void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
71 unsigned char *outdata) 71 unsigned char *outdata)
72 { 72 {
73 register RC4_INT *d; 73 register RC4_INT *d;
74 register RC4_INT x,y,tx,ty; 74 register RC4_INT x,y,tx,ty;
75 int i; 75 size_t i;
76 76
77 x=key->x; 77 x=key->x;
78 y=key->y; 78 y=key->y;
@@ -120,8 +120,8 @@ void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata,
120 (RC4_CHUNK)d[(tx+ty)&0xff]\ 120 (RC4_CHUNK)d[(tx+ty)&0xff]\
121 ) 121 )
122 122
123 if ( ( ((unsigned long)indata & (sizeof(RC4_CHUNK)-1)) | 123 if ( ( ((size_t)indata & (sizeof(RC4_CHUNK)-1)) |
124 ((unsigned long)outdata & (sizeof(RC4_CHUNK)-1)) ) == 0 ) 124 ((size_t)outdata & (sizeof(RC4_CHUNK)-1)) ) == 0 )
125 { 125 {
126 RC4_CHUNK ichunk,otp; 126 RC4_CHUNK ichunk,otp;
127 const union { long one; char little; } is_endian = {1}; 127 const union { long one; char little; } is_endian = {1};
@@ -157,7 +157,7 @@ void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata,
157 if (!is_endian.little) 157 if (!is_endian.little)
158 { /* BIG-ENDIAN CASE */ 158 { /* BIG-ENDIAN CASE */
159# define BESHFT(c) (((sizeof(RC4_CHUNK)-(c)-1)*8)&(sizeof(RC4_CHUNK)*8-1)) 159# define BESHFT(c) (((sizeof(RC4_CHUNK)-(c)-1)*8)&(sizeof(RC4_CHUNK)*8-1))
160 for (;len&~(sizeof(RC4_CHUNK)-1);len-=sizeof(RC4_CHUNK)) 160 for (;len&(0-sizeof(RC4_CHUNK));len-=sizeof(RC4_CHUNK))
161 { 161 {
162 ichunk = *(RC4_CHUNK *)indata; 162 ichunk = *(RC4_CHUNK *)indata;
163 otp = RC4_STEP<<BESHFT(0); 163 otp = RC4_STEP<<BESHFT(0);
@@ -210,7 +210,7 @@ void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata,
210 else 210 else
211 { /* LITTLE-ENDIAN CASE */ 211 { /* LITTLE-ENDIAN CASE */
212# define LESHFT(c) (((c)*8)&(sizeof(RC4_CHUNK)*8-1)) 212# define LESHFT(c) (((c)*8)&(sizeof(RC4_CHUNK)*8-1))
213 for (;len&~(sizeof(RC4_CHUNK)-1);len-=sizeof(RC4_CHUNK)) 213 for (;len&(0-sizeof(RC4_CHUNK));len-=sizeof(RC4_CHUNK))
214 { 214 {
215 ichunk = *(RC4_CHUNK *)indata; 215 ichunk = *(RC4_CHUNK *)indata;
216 otp = RC4_STEP; 216 otp = RC4_STEP;
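
Both loops above swap `len & ~(sizeof(RC4_CHUNK)-1)` for `len & (0-sizeof(RC4_CHUNK))`. For unsigned arithmetic and a power-of-two chunk size the two masks are identical, so the loop still runs over whole chunks only; presumably the rewrite just keeps the mask expression in size_t arithmetic now that len is a size_t. A quick self-check of the equivalence (RC4_CHUNK stood in for by unsigned long, which is only an assumption about the usual configuration):

#include <assert.h>
#include <stddef.h>

int main(void)
{
    /* RC4_CHUNK is stood in for by unsigned long here (an assumption about
     * the usual build configuration, not taken from this diff). */
    size_t n = sizeof(unsigned long);
    size_t len = 23;

    /* For power-of-two n, (0 - n) and ~(n - 1) are the same unsigned mask... */
    assert((size_t)(0 - n) == (size_t)~(n - 1));

    /* ...so "len & (0 - n)" rounds len down to a whole number of chunks. */
    assert((len & (0 - n)) == (len / n) * n);
    return 0;
}
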
@@ -276,7 +276,7 @@ void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata,
276#define RC4_LOOP(a,b,i) LOOP(a[i],b[i]) 276#define RC4_LOOP(a,b,i) LOOP(a[i],b[i])
277#endif 277#endif
278 278
279 i=(int)(len>>3L); 279 i=len>>3;
280 if (i) 280 if (i)
281 { 281 {
282 for (;;) 282 for (;;)
@@ -296,7 +296,7 @@ void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata,
296 if (--i == 0) break; 296 if (--i == 0) break;
297 } 297 }
298 } 298 }
299 i=(int)len&0x07; 299 i=len&0x07;
300 if (i) 300 if (i)
301 { 301 {
302 for (;;) 302 for (;;)
diff --git a/src/lib/libcrypto/rc4/rc4_skey.c b/src/lib/libcrypto/rc4/rc4_skey.c
index 4478d1a4b3..b22c40b0bd 100644
--- a/src/lib/libcrypto/rc4/rc4_skey.c
+++ b/src/lib/libcrypto/rc4/rc4_skey.c
@@ -59,11 +59,6 @@
59#include <openssl/rc4.h> 59#include <openssl/rc4.h>
60#include "rc4_locl.h" 60#include "rc4_locl.h"
61#include <openssl/opensslv.h> 61#include <openssl/opensslv.h>
62#include <openssl/crypto.h>
63#ifdef OPENSSL_FIPS
64#include <openssl/fips.h>
65#endif
66
67 62
68const char RC4_version[]="RC4" OPENSSL_VERSION_PTEXT; 63const char RC4_version[]="RC4" OPENSSL_VERSION_PTEXT;
69 64
@@ -90,11 +85,7 @@ const char *RC4_options(void)
90 * Date: Wed, 14 Sep 1994 06:35:31 GMT 85 * Date: Wed, 14 Sep 1994 06:35:31 GMT
91 */ 86 */
92 87
93#ifdef OPENSSL_FIPS
94void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
95#else
96void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data) 88void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
97#endif
98 { 89 {
99 register RC4_INT tmp; 90 register RC4_INT tmp;
100 register int id1,id2; 91 register int id1,id2;
@@ -128,20 +119,14 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
128 * implementations suffer from significant performance 119 * implementations suffer from significant performance
129 * losses then, e.g. PIII exhibits >2x deterioration, 120 * losses then, e.g. PIII exhibits >2x deterioration,
130 * and so does Opteron. In order to assure optimal 121 * and so does Opteron. In order to assure optimal
131 * all-round performance, we detect P4 at run-time by 122 * all-round performance, let us [try to] detect P4 at
132 * checking upon reserved bit 20 in CPU capability 123 * run-time by checking upon HTT bit in CPU capability
133 * vector and set up compressed key schedule, which is 124 * vector and set up compressed key schedule, which is
134 * recognized by correspondingly updated assembler 125 * recognized by correspondingly updated assembler
135 * module... Bit 20 is set up by OPENSSL_ia32_cpuid. 126 * module...
136 *
137 * <appro@fy.chalmers.se> 127 * <appro@fy.chalmers.se>
138 */ 128 */
139#ifdef OPENSSL_FIPS
140 unsigned long *ia32cap_ptr = OPENSSL_ia32cap_loc();
141 if (ia32cap_ptr && (*ia32cap_ptr & (1<<28))) {
142#else
143 if (OPENSSL_ia32cap_P & (1<<28)) { 129 if (OPENSSL_ia32cap_P & (1<<28)) {
144#endif
145 unsigned char *cp=(unsigned char *)d; 130 unsigned char *cp=(unsigned char *)d;
146 131
147 for (i=0;i<256;i++) cp[i]=i; 132 for (i=0;i<256;i++) cp[i]=i;