summaryrefslogtreecommitdiff
path: root/src/lib/libcrypto/rc4
diff options
context:
space:
mode:
Diffstat (limited to 'src/lib/libcrypto/rc4')
-rw-r--r--src/lib/libcrypto/rc4/Makefile103
-rw-r--r--src/lib/libcrypto/rc4/Makefile.ssl110
-rw-r--r--src/lib/libcrypto/rc4/asm/rc4-586.pl410
-rwxr-xr-xsrc/lib/libcrypto/rc4/asm/rc4-amd64.pl227
-rw-r--r--src/lib/libcrypto/rc4/asm/rc4-ia64.pl755
-rw-r--r--src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl632
-rw-r--r--src/lib/libcrypto/rc4/asm/rc4-parisc.pl314
-rw-r--r--src/lib/libcrypto/rc4/asm/rc4-s390x.pl234
-rwxr-xr-xsrc/lib/libcrypto/rc4/asm/rc4-x86_64.pl677
-rw-r--r--src/lib/libcrypto/rc4/rc4.c193
-rw-r--r--src/lib/libcrypto/rc4/rc4.h90
-rw-r--r--src/lib/libcrypto/rc4/rc4_enc.c315
-rw-r--r--src/lib/libcrypto/rc4/rc4_locl.h5
-rw-r--r--src/lib/libcrypto/rc4/rc4_skey.c116
-rw-r--r--src/lib/libcrypto/rc4/rc4speed.c253
-rw-r--r--src/lib/libcrypto/rc4/rc4test.c242
-rw-r--r--src/lib/libcrypto/rc4/rrc4.doc278
17 files changed, 4617 insertions, 337 deletions
diff --git a/src/lib/libcrypto/rc4/Makefile b/src/lib/libcrypto/rc4/Makefile
new file mode 100644
index 0000000000..f623f6e354
--- /dev/null
+++ b/src/lib/libcrypto/rc4/Makefile
@@ -0,0 +1,103 @@
1#
2# OpenSSL/crypto/rc4/Makefile
3#
4
5DIR= rc4
6TOP= ../..
7CC= cc
8CPP= $(CC) -E
9INCLUDES=
10CFLAG=-g
11AR= ar r
12
13RC4_ENC=rc4_enc.o rc4_skey.o
14
15CFLAGS= $(INCLUDES) $(CFLAG)
16ASFLAGS= $(INCLUDES) $(ASFLAG)
17AFLAGS= $(ASFLAGS)
18
19GENERAL=Makefile
20TEST=rc4test.c
21APPS=
22
23LIB=$(TOP)/libcrypto.a
24LIBSRC=rc4_skey.c rc4_enc.c rc4_utl.c
25LIBOBJ=$(RC4_ENC) rc4_utl.o
26
27SRC= $(LIBSRC)
28
29EXHEADER= rc4.h
30HEADER= $(EXHEADER) rc4_locl.h
31
32ALL= $(GENERAL) $(SRC) $(HEADER)
33
34top:
35 (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
36
37all: lib
38
39lib: $(LIBOBJ)
40 $(AR) $(LIB) $(LIBOBJ)
41 $(RANLIB) $(LIB) || echo Never mind.
42 @touch lib
43
44rc4-586.s: asm/rc4-586.pl ../perlasm/x86asm.pl
45 $(PERL) asm/rc4-586.pl $(PERLASM_SCHEME) $(CFLAGS) > $@
46
47rc4-x86_64.s: asm/rc4-x86_64.pl
48 $(PERL) asm/rc4-x86_64.pl $(PERLASM_SCHEME) > $@
49rc4-md5-x86_64.s: asm/rc4-md5-x86_64.pl
50 $(PERL) asm/rc4-md5-x86_64.pl $(PERLASM_SCHEME) > $@
51
52rc4-ia64.S: asm/rc4-ia64.pl
53 $(PERL) asm/rc4-ia64.pl $(CFLAGS) > $@
54
55rc4-parisc.s: asm/rc4-parisc.pl
56 $(PERL) asm/rc4-parisc.pl $(PERLASM_SCHEME) $@
57
58rc4-ia64.s: rc4-ia64.S
59 @case `awk '/^#define RC4_INT/{print$$NF}' $(TOP)/include/openssl/opensslconf.h` in \
60 int) set -x; $(CC) $(CFLAGS) -DSZ=4 -E rc4-ia64.S > $@ ;; \
61 char) set -x; $(CC) $(CFLAGS) -DSZ=1 -E rc4-ia64.S > $@ ;; \
62 *) exit 1 ;; \
63 esac
64
65# GNU make "catch all"
66rc4-%.s: asm/rc4-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
67
68files:
69 $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
70
71links:
72 @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
73 @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
74 @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
75
76install:
77 @[ -n "$(INSTALLTOP)" ] # should be set by top Makefile...
78 @headerlist="$(EXHEADER)"; for i in $$headerlist ; \
79 do \
80 (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
81 chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
82 done;
83
84tags:
85 ctags $(SRC)
86
87tests:
88
89lint:
90 lint -DLINT $(INCLUDES) $(SRC)>fluff
91
92depend:
93 @[ -n "$(MAKEDEPEND)" ] # should be set by upper Makefile...
94 $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
95
96dclean:
97 $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
98 mv -f Makefile.new $(MAKEFILE)
99
100clean:
101 rm -f *.s *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff
102
103# DO NOT DELETE THIS LINE -- make depend depends on it.
diff --git a/src/lib/libcrypto/rc4/Makefile.ssl b/src/lib/libcrypto/rc4/Makefile.ssl
deleted file mode 100644
index 3e602662be..0000000000
--- a/src/lib/libcrypto/rc4/Makefile.ssl
+++ /dev/null
@@ -1,110 +0,0 @@
1#
2# SSLeay/crypto/rc4/Makefile
3#
4
5DIR= rc4
6TOP= ../..
7CC= cc
8CPP= $(CC) -E
9INCLUDES=
10CFLAG=-g
11INSTALL_PREFIX=
12OPENSSLDIR= /usr/local/ssl
13INSTALLTOP=/usr/local/ssl
14MAKE= make -f Makefile.ssl
15MAKEDEPPROG= makedepend
16MAKEDEPEND= $(TOP)/util/domd $(TOP) -MD $(MAKEDEPPROG)
17MAKEFILE= Makefile.ssl
18AR= ar r
19
20RC4_ENC=rc4_enc.o
21# or use
22#RC4_ENC=asm/rx86-elf.o
23#RC4_ENC=asm/rx86-out.o
24#RC4_ENC=asm/rx86-sol.o
25#RC4_ENC=asm/rx86bdsi.o
26
27CFLAGS= $(INCLUDES) $(CFLAG)
28ASFLAGS= $(INCLUDES) $(ASFLAG)
29
30GENERAL=Makefile
31TEST=rc4test.c
32APPS=
33
34LIB=$(TOP)/libcrypto.a
35LIBSRC=rc4_skey.c rc4_enc.c
36LIBOBJ=rc4_skey.o $(RC4_ENC)
37
38SRC= $(LIBSRC)
39
40EXHEADER= rc4.h
41HEADER= $(EXHEADER) rc4_locl.h
42
43ALL= $(GENERAL) $(SRC) $(HEADER)
44
45top:
46 (cd ../..; $(MAKE) DIRS=crypto SDIRS=$(DIR) sub_all)
47
48all: lib
49
50lib: $(LIBOBJ)
51 $(AR) $(LIB) $(LIBOBJ)
52 $(RANLIB) $(LIB) || echo Never mind.
53 @touch lib
54
55# elf
56asm/rx86-elf.s: asm/rc4-586.pl ../perlasm/x86asm.pl
57 (cd asm; $(PERL) rc4-586.pl elf $(CFLAGS) > rx86-elf.s)
58
59# a.out
60asm/rx86-out.o: asm/rx86unix.cpp
61 $(CPP) -DOUT asm/rx86unix.cpp | as -o asm/rx86-out.o
62
63# bsdi
64asm/rx86bsdi.o: asm/rx86unix.cpp
65 $(CPP) -DBSDI asm/rx86unix.cpp | sed 's/ :/:/' | as -o asm/rx86bsdi.o
66
67asm/rx86unix.cpp: asm/rc4-586.pl ../perlasm/x86asm.pl
68 (cd asm; $(PERL) rc4-586.pl cpp >rx86unix.cpp)
69
70files:
71 $(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO
72
73links:
74 @sh $(TOP)/util/point.sh Makefile.ssl Makefile
75 @$(PERL) $(TOP)/util/mklink.pl ../../include/openssl $(EXHEADER)
76 @$(PERL) $(TOP)/util/mklink.pl ../../test $(TEST)
77 @$(PERL) $(TOP)/util/mklink.pl ../../apps $(APPS)
78
79install:
80 @for i in $(EXHEADER) ; \
81 do \
82 (cp $$i $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i; \
83 chmod 644 $(INSTALL_PREFIX)$(INSTALLTOP)/include/openssl/$$i ); \
84 done;
85
86tags:
87 ctags $(SRC)
88
89tests:
90
91lint:
92 lint -DLINT $(INCLUDES) $(SRC)>fluff
93
94depend:
95 $(MAKEDEPEND) -- $(CFLAG) $(INCLUDES) $(DEPFLAG) -- $(PROGS) $(LIBSRC)
96
97dclean:
98 $(PERL) -pe 'if (/^# DO NOT DELETE THIS LINE/) {print; exit(0);}' $(MAKEFILE) >Makefile.new
99 mv -f Makefile.new $(MAKEFILE)
100
101clean:
102 rm -f asm/rx86unix.cpp asm/*-elf.* *.o *.obj lib tags core .pure .nfs* *.old *.bak fluff asm/*.o
103
104# DO NOT DELETE THIS LINE -- make depend depends on it.
105
106rc4_enc.o: ../../include/openssl/opensslconf.h ../../include/openssl/rc4.h
107rc4_enc.o: rc4_enc.c rc4_locl.h
108rc4_skey.o: ../../include/openssl/opensslconf.h
109rc4_skey.o: ../../include/openssl/opensslv.h ../../include/openssl/rc4.h
110rc4_skey.o: rc4_locl.h rc4_skey.c
diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl
new file mode 100644
index 0000000000..5c9ac6ad28
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl
@@ -0,0 +1,410 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# At some point it became apparent that the original SSLeay RC4
11# assembler implementation performs suboptimally on latest IA-32
12# microarchitectures. After re-tuning performance has changed as
13# following:
14#
15# Pentium -10%
16# Pentium III +12%
17# AMD +50%(*)
18# P4 +250%(**)
19#
20# (*) This number is actually a trade-off:-) It's possible to
21# achieve +72%, but at the cost of -48% off PIII performance.
22# In other words code performing further 13% faster on AMD
23# would perform almost 2 times slower on Intel PIII...
24# For reference! This code delivers ~80% of rc4-amd64.pl
25# performance on the same Opteron machine.
26# (**) This number requires compressed key schedule set up by
27# RC4_set_key [see commentary below for further details].
28#
29# <appro@fy.chalmers.se>
30
31# May 2011
32#
33# Optimize for Core2 and Westmere [and incidentally Opteron]. Current
34# performance in cycles per processed byte (less is better) and
35# improvement relative to previous version of this module is:
36#
37# Pentium 10.2 # original numbers
38# Pentium III 7.8(*)
39# Intel P4 7.5
40#
41# Opteron 6.1/+20% # new MMX numbers
42# Core2 5.3/+67%(**)
43# Westmere 5.1/+94%(**)
44# Sandy Bridge 5.0/+8%
45# Atom 12.6/+6%
46#
47# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
48# but this specific code performs poorly on Core2. And vice
49# versa, below MMX/SSE code delivering 5.8/7.1 on Core2 performs
50# poorly on PIII, at 8.0/14.5:-( As PIII is not a "hot" CPU
51# [anymore], I chose to discard PIII-specific code path and opt
52# for original IALU-only code, which is why MMX/SSE code path
53# is guarded by SSE2 bit (see below), not MMX/SSE.
54# (**) Performance vs. block size on Core2 and Westmere had a maximum
55# at ... 64 bytes block size. And it was quite a maximum, 40-60%
56# in comparison to largest 8KB block size. Above improvement
57# coefficients are for the largest block size.
58
59$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60push(@INC,"${dir}","${dir}../../perlasm");
61require "x86asm.pl";
62
63&asm_init($ARGV[0],"rc4-586.pl");
64
65$xx="eax";
66$yy="ebx";
67$tx="ecx";
68$ty="edx";
69$inp="esi";
70$out="ebp";
71$dat="edi";
72
73sub RC4_loop {
74 my $i=shift;
75 my $func = ($i==0)?*mov:*or;
76
77 &add (&LB($yy),&LB($tx));
78 &mov ($ty,&DWP(0,$dat,$yy,4));
79 &mov (&DWP(0,$dat,$yy,4),$tx);
80 &mov (&DWP(0,$dat,$xx,4),$ty);
81 &add ($ty,$tx);
82 &inc (&LB($xx));
83 &and ($ty,0xff);
84 &ror ($out,8) if ($i!=0);
85 if ($i<3) {
86 &mov ($tx,&DWP(0,$dat,$xx,4));
87 } else {
88 &mov ($tx,&wparam(3)); # reload [re-biased] out
89 }
90 &$func ($out,&DWP(0,$dat,$ty,4));
91}
92
93if ($alt=0) {
94 # >20% faster on Atom and Sandy Bridge[!], 8% faster on Opteron,
95 # but ~40% slower on Core2 and Westmere... Attempt to add movz
96 # brings down Opteron by 25%, Atom and Sandy Bridge by 15%, yet
97 # on Core2 with movz it's almost 20% slower than below alternative
98 # code... Yes, it's a total mess...
99 my @XX=($xx,$out);
100 $RC4_loop_mmx = sub { # SSE actually...
101 my $i=shift;
102 my $j=$i<=0?0:$i>>1;
103 my $mm=$i<=0?"mm0":"mm".($i&1);
104
105 &add (&LB($yy),&LB($tx));
106 &lea (@XX[1],&DWP(1,@XX[0]));
107 &pxor ("mm2","mm0") if ($i==0);
108 &psllq ("mm1",8) if ($i==0);
109 &and (@XX[1],0xff);
110 &pxor ("mm0","mm0") if ($i<=0);
111 &mov ($ty,&DWP(0,$dat,$yy,4));
112 &mov (&DWP(0,$dat,$yy,4),$tx);
113 &pxor ("mm1","mm2") if ($i==0);
114 &mov (&DWP(0,$dat,$XX[0],4),$ty);
115 &add (&LB($ty),&LB($tx));
116 &movd (@XX[0],"mm7") if ($i==0);
117 &mov ($tx,&DWP(0,$dat,@XX[1],4));
118 &pxor ("mm1","mm1") if ($i==1);
119 &movq ("mm2",&QWP(0,$inp)) if ($i==1);
120 &movq (&QWP(-8,(@XX[0],$inp)),"mm1") if ($i==0);
121 &pinsrw ($mm,&DWP(0,$dat,$ty,4),$j);
122
123 push (@XX,shift(@XX)) if ($i>=0);
124 }
125} else {
126 # Using pinsrw here improves performane on Intel CPUs by 2-3%, but
127 # brings down AMD by 7%...
128 $RC4_loop_mmx = sub {
129 my $i=shift;
130
131 &add (&LB($yy),&LB($tx));
132 &psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1);
133 &mov ($ty,&DWP(0,$dat,$yy,4));
134 &mov (&DWP(0,$dat,$yy,4),$tx);
135 &mov (&DWP(0,$dat,$xx,4),$ty);
136 &inc ($xx);
137 &add ($ty,$tx);
138 &movz ($xx,&LB($xx)); # (*)
139 &movz ($ty,&LB($ty)); # (*)
140 &pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0);
141 &movq ("mm0",&QWP(0,$inp)) if ($i<=0);
142 &movq (&QWP(-8,($out,$inp)),"mm2") if ($i==0);
143 &mov ($tx,&DWP(0,$dat,$xx,4));
144 &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));
145
146 # (*) This is the key to Core2 and Westmere performance.
147 # Whithout movz out-of-order execution logic confuses
148 # itself and fails to reorder loads and stores. Problem
149 # appears to be fixed in Sandy Bridge...
150 }
151}
152
153&external_label("OPENSSL_ia32cap_P");
154
155# void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
156&function_begin("RC4");
157 &mov ($dat,&wparam(0)); # load key schedule pointer
158 &mov ($ty, &wparam(1)); # load len
159 &mov ($inp,&wparam(2)); # load inp
160 &mov ($out,&wparam(3)); # load out
161
162 &xor ($xx,$xx); # avoid partial register stalls
163 &xor ($yy,$yy);
164
165 &cmp ($ty,0); # safety net
166 &je (&label("abort"));
167
168 &mov (&LB($xx),&BP(0,$dat)); # load key->x
169 &mov (&LB($yy),&BP(4,$dat)); # load key->y
170 &add ($dat,8);
171
172 &lea ($tx,&DWP(0,$inp,$ty));
173 &sub ($out,$inp); # re-bias out
174 &mov (&wparam(1),$tx); # save input+len
175
176 &inc (&LB($xx));
177
178 # detect compressed key schedule...
179 &cmp (&DWP(256,$dat),-1);
180 &je (&label("RC4_CHAR"));
181
182 &mov ($tx,&DWP(0,$dat,$xx,4));
183
184 &and ($ty,-4); # how many 4-byte chunks?
185 &jz (&label("loop1"));
186
187 &test ($ty,-8);
188 &mov (&wparam(3),$out); # $out as accumulator in these loops
189 &jz (&label("go4loop4"));
190
191 &picmeup($out,"OPENSSL_ia32cap_P");
192 &bt (&DWP(0,$out),26); # check SSE2 bit [could have been MMX]
193 &jnc (&label("go4loop4"));
194
195 &mov ($out,&wparam(3)) if (!$alt);
196 &movd ("mm7",&wparam(3)) if ($alt);
197 &and ($ty,-8);
198 &lea ($ty,&DWP(-8,$inp,$ty));
199 &mov (&DWP(-4,$dat),$ty); # save input+(len/8)*8-8
200
201 &$RC4_loop_mmx(-1);
202 &jmp(&label("loop_mmx_enter"));
203
204 &set_label("loop_mmx",16);
205 &$RC4_loop_mmx(0);
206 &set_label("loop_mmx_enter");
207 for ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); }
208 &mov ($ty,$yy);
209 &xor ($yy,$yy); # this is second key to Core2
210 &mov (&LB($yy),&LB($ty)); # and Westmere performance...
211 &cmp ($inp,&DWP(-4,$dat));
212 &lea ($inp,&DWP(8,$inp));
213 &jb (&label("loop_mmx"));
214
215 if ($alt) {
216 &movd ($out,"mm7");
217 &pxor ("mm2","mm0");
218 &psllq ("mm1",8);
219 &pxor ("mm1","mm2");
220 &movq (&QWP(-8,$out,$inp),"mm1");
221 } else {
222 &psllq ("mm1",56);
223 &pxor ("mm2","mm1");
224 &movq (&QWP(-8,$out,$inp),"mm2");
225 }
226 &emms ();
227
228 &cmp ($inp,&wparam(1)); # compare to input+len
229 &je (&label("done"));
230 &jmp (&label("loop1"));
231
232&set_label("go4loop4",16);
233 &lea ($ty,&DWP(-4,$inp,$ty));
234 &mov (&wparam(2),$ty); # save input+(len/4)*4-4
235
236 &set_label("loop4");
237 for ($i=0;$i<4;$i++) { RC4_loop($i); }
238 &ror ($out,8);
239 &xor ($out,&DWP(0,$inp));
240 &cmp ($inp,&wparam(2)); # compare to input+(len/4)*4-4
241 &mov (&DWP(0,$tx,$inp),$out);# $tx holds re-biased out here
242 &lea ($inp,&DWP(4,$inp));
243 &mov ($tx,&DWP(0,$dat,$xx,4));
244 &jb (&label("loop4"));
245
246 &cmp ($inp,&wparam(1)); # compare to input+len
247 &je (&label("done"));
248 &mov ($out,&wparam(3)); # restore $out
249
250 &set_label("loop1",16);
251 &add (&LB($yy),&LB($tx));
252 &mov ($ty,&DWP(0,$dat,$yy,4));
253 &mov (&DWP(0,$dat,$yy,4),$tx);
254 &mov (&DWP(0,$dat,$xx,4),$ty);
255 &add ($ty,$tx);
256 &inc (&LB($xx));
257 &and ($ty,0xff);
258 &mov ($ty,&DWP(0,$dat,$ty,4));
259 &xor (&LB($ty),&BP(0,$inp));
260 &lea ($inp,&DWP(1,$inp));
261 &mov ($tx,&DWP(0,$dat,$xx,4));
262 &cmp ($inp,&wparam(1)); # compare to input+len
263 &mov (&BP(-1,$out,$inp),&LB($ty));
264 &jb (&label("loop1"));
265
266 &jmp (&label("done"));
267
268# this is essentially Intel P4 specific codepath...
269&set_label("RC4_CHAR",16);
270 &movz ($tx,&BP(0,$dat,$xx));
271 # strangely enough unrolled loop performs over 20% slower...
272 &set_label("cloop1");
273 &add (&LB($yy),&LB($tx));
274 &movz ($ty,&BP(0,$dat,$yy));
275 &mov (&BP(0,$dat,$yy),&LB($tx));
276 &mov (&BP(0,$dat,$xx),&LB($ty));
277 &add (&LB($ty),&LB($tx));
278 &movz ($ty,&BP(0,$dat,$ty));
279 &add (&LB($xx),1);
280 &xor (&LB($ty),&BP(0,$inp));
281 &lea ($inp,&DWP(1,$inp));
282 &movz ($tx,&BP(0,$dat,$xx));
283 &cmp ($inp,&wparam(1));
284 &mov (&BP(-1,$out,$inp),&LB($ty));
285 &jb (&label("cloop1"));
286
287&set_label("done");
288 &dec (&LB($xx));
289 &mov (&DWP(-4,$dat),$yy); # save key->y
290 &mov (&BP(-8,$dat),&LB($xx)); # save key->x
291&set_label("abort");
292&function_end("RC4");
293
294########################################################################
295
296$inp="esi";
297$out="edi";
298$idi="ebp";
299$ido="ecx";
300$idx="edx";
301
302# void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
303&function_begin("private_RC4_set_key");
304 &mov ($out,&wparam(0)); # load key
305 &mov ($idi,&wparam(1)); # load len
306 &mov ($inp,&wparam(2)); # load data
307 &picmeup($idx,"OPENSSL_ia32cap_P");
308
309 &lea ($out,&DWP(2*4,$out)); # &key->data
310 &lea ($inp,&DWP(0,$inp,$idi)); # $inp to point at the end
311 &neg ($idi);
312 &xor ("eax","eax");
313 &mov (&DWP(-4,$out),$idi); # borrow key->y
314
315 &bt (&DWP(0,$idx),20); # check for bit#20
316 &jc (&label("c1stloop"));
317
318&set_label("w1stloop",16);
319 &mov (&DWP(0,$out,"eax",4),"eax"); # key->data[i]=i;
320 &add (&LB("eax"),1); # i++;
321 &jnc (&label("w1stloop"));
322
323 &xor ($ido,$ido);
324 &xor ($idx,$idx);
325
326&set_label("w2ndloop",16);
327 &mov ("eax",&DWP(0,$out,$ido,4));
328 &add (&LB($idx),&BP(0,$inp,$idi));
329 &add (&LB($idx),&LB("eax"));
330 &add ($idi,1);
331 &mov ("ebx",&DWP(0,$out,$idx,4));
332 &jnz (&label("wnowrap"));
333 &mov ($idi,&DWP(-4,$out));
334 &set_label("wnowrap");
335 &mov (&DWP(0,$out,$idx,4),"eax");
336 &mov (&DWP(0,$out,$ido,4),"ebx");
337 &add (&LB($ido),1);
338 &jnc (&label("w2ndloop"));
339&jmp (&label("exit"));
340
341# Unlike all other x86 [and x86_64] implementations, Intel P4 core
342# [including EM64T] was found to perform poorly with above "32-bit" key
343# schedule, a.k.a. RC4_INT. Performance improvement for IA-32 hand-coded
344# assembler turned out to be 3.5x if re-coded for compressed 8-bit one,
345# a.k.a. RC4_CHAR! It's however inappropriate to just switch to 8-bit
346# schedule for x86[_64], because non-P4 implementations suffer from
347# significant performance losses then, e.g. PIII exhibits >2x
348# deterioration, and so does Opteron. In order to assure optimal
349# all-round performance, we detect P4 at run-time and set up compressed
350# key schedule, which is recognized by RC4 procedure.
351
352&set_label("c1stloop",16);
353 &mov (&BP(0,$out,"eax"),&LB("eax")); # key->data[i]=i;
354 &add (&LB("eax"),1); # i++;
355 &jnc (&label("c1stloop"));
356
357 &xor ($ido,$ido);
358 &xor ($idx,$idx);
359 &xor ("ebx","ebx");
360
361&set_label("c2ndloop",16);
362 &mov (&LB("eax"),&BP(0,$out,$ido));
363 &add (&LB($idx),&BP(0,$inp,$idi));
364 &add (&LB($idx),&LB("eax"));
365 &add ($idi,1);
366 &mov (&LB("ebx"),&BP(0,$out,$idx));
367 &jnz (&label("cnowrap"));
368 &mov ($idi,&DWP(-4,$out));
369 &set_label("cnowrap");
370 &mov (&BP(0,$out,$idx),&LB("eax"));
371 &mov (&BP(0,$out,$ido),&LB("ebx"));
372 &add (&LB($ido),1);
373 &jnc (&label("c2ndloop"));
374
375 &mov (&DWP(256,$out),-1); # mark schedule as compressed
376
377&set_label("exit");
378 &xor ("eax","eax");
379 &mov (&DWP(-8,$out),"eax"); # key->x=0;
380 &mov (&DWP(-4,$out),"eax"); # key->y=0;
381&function_end("private_RC4_set_key");
382
383# const char *RC4_options(void);
384&function_begin_B("RC4_options");
385 &call (&label("pic_point"));
386&set_label("pic_point");
387 &blindpop("eax");
388 &lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
389 &picmeup("edx","OPENSSL_ia32cap_P");
390 &mov ("edx",&DWP(0,"edx"));
391 &bt ("edx",20);
392 &jc (&label("1xchar"));
393 &bt ("edx",26);
394 &jnc (&label("ret"));
395 &add ("eax",25);
396 &ret ();
397&set_label("1xchar");
398 &add ("eax",12);
399&set_label("ret");
400 &ret ();
401&set_label("opts",64);
402&asciz ("rc4(4x,int)");
403&asciz ("rc4(1x,char)");
404&asciz ("rc4(8x,mmx)");
405&asciz ("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
406&align (64);
407&function_end_B("RC4_options");
408
409&asm_finish();
410
diff --git a/src/lib/libcrypto/rc4/asm/rc4-amd64.pl b/src/lib/libcrypto/rc4/asm/rc4-amd64.pl
deleted file mode 100755
index 9e0da8af99..0000000000
--- a/src/lib/libcrypto/rc4/asm/rc4-amd64.pl
+++ /dev/null
@@ -1,227 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
10# "hand-coded assembler"] doesn't stand for the whole improvement
11# coefficient. It turned out that eliminating RC4_CHAR from config
12# line results in ~40% improvement (yes, even for C implementation).
13# Presumably it has everything to do with AMD cache architecture and
14# RAW or whatever penalties. Once again! The module *requires* config
15# line *without* RC4_CHAR! As for coding "secret," I bet on partial
16# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
17# I simply 'inc %r8b'. Even though optimization manual discourages
18# to operate on partial registers, it turned out to be the best bet.
19# At least for AMD... How IA32E would perform remains to be seen...
20
21# As was shown by Marc Bevand reordering of couple of load operations
22# results in even higher performance gain of 3.3x:-) At least on
23# Opteron... For reference, 1x in this case is RC4_CHAR C-code
24# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
25# Latter means that if you want to *estimate* what to expect from
26# *your* CPU, then multiply 54 by 3.3 and clock frequency in GHz.
27
28# Intel P4 EM64T core was found to run the AMD64 code really slow...
29# The only way to achieve comparable performance on P4 is to keep
30# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
31# compose blended code, which would perform even within 30% marginal
32# on either AMD and Intel platforms, I implement both cases. See
33# rc4_skey.c for further details... This applies to 0.9.8 and later.
34# In 0.9.7 context RC4_CHAR codepath is never engaged and ~70 bytes
35# of code remain redundant.
36
37$output=shift;
38
39$win64a=1 if ($output =~ /win64a.[s|asm]/);
40
41open STDOUT,">$output" || die "can't open $output: $!";
42
43if (defined($win64a)) {
44 $dat="%rcx"; # arg1
45 $len="%rdx"; # arg2
46 $inp="%rsi"; # r8, arg3 moves here
47 $out="%rdi"; # r9, arg4 moves here
48} else {
49 $dat="%rdi"; # arg1
50 $len="%rsi"; # arg2
51 $inp="%rdx"; # arg3
52 $out="%rcx"; # arg4
53}
54
55$XX="%r10";
56$TX="%r8";
57$YY="%r11";
58$TY="%r9";
59
60sub PTR() {
61 my $ret=shift;
62 if (defined($win64a)) {
63 $ret =~ s/\[([\S]+)\+([\S]+)\]/[$2+$1]/g; # [%rN+%rM*4]->[%rM*4+%rN]
64 $ret =~ s/:([^\[]+)\[([^\]]+)\]/:[$2+$1]/g; # :off[ea]->:[ea+off]
65 } else {
66 $ret =~ s/[\+\*]/,/g; # [%rN+%rM*4]->[%rN,%rM,4]
67 $ret =~ s/\[([^\]]+)\]/($1)/g; # [%rN]->(%rN)
68 }
69 $ret;
70}
71
72$code=<<___ if (!defined($win64a));
73.text
74
75.globl RC4
76.type RC4,\@function
77.align 16
78RC4: or $len,$len
79 jne .Lentry
80 repret
81.Lentry:
82___
83$code=<<___ if (defined($win64a));
84_TEXT SEGMENT
85PUBLIC RC4
86ALIGN 16
87RC4 PROC
88 or $len,$len
89 jne .Lentry
90 repret
91.Lentry:
92 push %rdi
93 push %rsi
94 sub \$40,%rsp
95 mov %r8,$inp
96 mov %r9,$out
97___
98$code.=<<___;
99 add \$8,$dat
100 movl `&PTR("DWORD:-8[$dat]")`,$XX#d
101 movl `&PTR("DWORD:-4[$dat]")`,$YY#d
102 cmpl \$-1,`&PTR("DWORD:256[$dat]")`
103 je .LRC4_CHAR
104 test \$-8,$len
105 jz .Lloop1
106.align 16
107.Lloop8:
108 inc $XX#b
109 movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
110 add $TX#b,$YY#b
111 movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
112 movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
113 movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
114 add $TX#b,$TY#b
115 inc $XX#b
116 movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
117 movb `&PTR("BYTE:[$dat+$TY*4]")`,%al
118___
119for ($i=1;$i<=6;$i++) {
120$code.=<<___;
121 add $TX#b,$YY#b
122 ror \$8,%rax
123 movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
124 movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
125 movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
126 add $TX#b,$TY#b
127 inc $XX#b
128 movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
129 movb `&PTR("BYTE:[$dat+$TY*4]")`,%al
130___
131}
132$code.=<<___;
133 add $TX#b,$YY#b
134 ror \$8,%rax
135 movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
136 movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
137 movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
138 sub \$8,$len
139 add $TY#b,$TX#b
140 movb `&PTR("BYTE:[$dat+$TX*4]")`,%al
141 ror \$8,%rax
142 add \$8,$inp
143 add \$8,$out
144
145 xor `&PTR("QWORD:-8[$inp]")`,%rax
146 mov %rax,`&PTR("QWORD:-8[$out]")`
147
148 test \$-8,$len
149 jnz .Lloop8
150 cmp \$0,$len
151 jne .Lloop1
152.Lexit:
153 movl $XX#d,`&PTR("DWORD:-8[$dat]")`
154 movl $YY#d,`&PTR("DWORD:-4[$dat]")`
155___
156$code.=<<___ if (defined($win64a));
157 add \$40,%rsp
158 pop %rsi
159 pop %rdi
160___
161$code.=<<___;
162 repret
163.align 16
164.Lloop1:
165 movzb `&PTR("BYTE:[$inp]")`,%eax
166 inc $XX#b
167 movl `&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
168 add $TX#b,$YY#b
169 movl `&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
170 movl $TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
171 movl $TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
172 add $TY#b,$TX#b
173 movl `&PTR("DWORD:[$dat+$TX*4]")`,$TY#d
174 xor $TY,%rax
175 inc $inp
176 movb %al,`&PTR("BYTE:[$out]")`
177 inc $out
178 dec $len
179 jnz .Lloop1
180 jmp .Lexit
181
182.align 16
183.LRC4_CHAR:
184 inc $XX#b
185 movzb `&PTR("BYTE:[$dat+$XX]")`,$TX#d
186 add $TX#b,$YY#b
187 movzb `&PTR("BYTE:[$dat+$YY]")`,$TY#d
188 movb $TX#b,`&PTR("BYTE:[$dat+$YY]")`
189 movb $TY#b,`&PTR("BYTE:[$dat+$XX]")`
190 add $TX#b,$TY#b
191 movzb `&PTR("BYTE:[$dat+$TY]")`,$TY#d
192 xorb `&PTR("BYTE:[$inp]")`,$TY#b
193 movb $TY#b,`&PTR("BYTE:[$out]")`
194 inc $inp
195 inc $out
196 dec $len
197 jnz .LRC4_CHAR
198 jmp .Lexit
199___
200$code.=<<___ if (defined($win64a));
201RC4 ENDP
202_TEXT ENDS
203END
204___
205$code.=<<___ if (!defined($win64a));
206.size RC4,.-RC4
207___
208
209$code =~ s/#([bwd])/$1/gm;
210$code =~ s/\`([^\`]*)\`/eval $1/gem;
211
212if (defined($win64a)) {
213 $code =~ s/\.align/ALIGN/gm;
214 $code =~ s/[\$%]//gm;
215 $code =~ s/\.L/\$L/gm;
216 $code =~ s/([\w]+)([\s]+)([\S]+),([\S]+)/$1$2$4,$3/gm;
217 $code =~ s/([QD]*WORD|BYTE):/$1 PTR/gm;
218 $code =~ s/mov[bwlq]/mov/gm;
219 $code =~ s/movzb/movzx/gm;
220 $code =~ s/repret/DB\t0F3h,0C3h/gm;
221 $code =~ s/cmpl/cmp/gm;
222 $code =~ s/xorb/xor/gm;
223} else {
224 $code =~ s/([QD]*WORD|BYTE)://gm;
225 $code =~ s/repret/.byte\t0xF3,0xC3/gm;
226}
227print $code;
diff --git a/src/lib/libcrypto/rc4/asm/rc4-ia64.pl b/src/lib/libcrypto/rc4/asm/rc4-ia64.pl
new file mode 100644
index 0000000000..49cd5b5e69
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-ia64.pl
@@ -0,0 +1,755 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by David Mosberger <David.Mosberger@acm.org> based on the
5# Itanium optimized Crypto code which was released by HP Labs at
6# http://www.hpl.hp.com/research/linux/crypto/.
7#
8# Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
9#
10# Permission is hereby granted, free of charge, to any person obtaining
11# a copy of this software and associated documentation files (the
12# "Software"), to deal in the Software without restriction, including
13# without limitation the rights to use, copy, modify, merge, publish,
14# distribute, sublicense, and/or sell copies of the Software, and to
15# permit persons to whom the Software is furnished to do so, subject to
16# the following conditions:
17#
18# The above copyright notice and this permission notice shall be
19# included in all copies or substantial portions of the Software.
20
21# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
28
29
30
31# This is a little helper program which generates a software-pipelined
32# for RC4 encryption. The basic algorithm looks like this:
33#
34# for (counter = 0; counter < len; ++counter)
35# {
36# in = inp[counter];
37# SI = S[I];
38# J = (SI + J) & 0xff;
39# SJ = S[J];
40# T = (SI + SJ) & 0xff;
41# S[I] = SJ, S[J] = SI;
42# ST = S[T];
43# outp[counter] = in ^ ST;
44# I = (I + 1) & 0xff;
45# }
46#
47# Pipelining this loop isn't easy, because the stores to the S[] array
48# need to be observed in the right order. The loop generated by the
49# code below has the following pipeline diagram:
50#
51# cycle
52# | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
53# iter
54# 1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
55# 2: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
56# 3: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
57#
58# where:
59# LDI = load of S[I]
60# LDJ = load of S[J]
61# SWP = swap of S[I] and S[J]
62# LDT = load of S[T]
63#
64# Note that in the above diagram, the major trouble-spot is that LDI
65# of the 2nd iteration is performed BEFORE the SWP of the first
66# iteration. Fortunately, this is easy to detect (I of the 1st
67# iteration will be equal to J of the 2nd iteration) and when this
68# happens, we simply forward the proper value from the 1st iteration
69# to the 2nd one. The proper value in this case is simply the value
70# of S[I] from the first iteration (thanks to the fact that SWP
71# simply swaps the contents of S[I] and S[J]).
72#
73# Another potential trouble-spot is in cycle 7, where SWP of the 1st
74# iteration issues at the same time as the LDI of the 3rd iteration.
75# However, thanks to IA-64 execution semantics, this can be taken
76# care of simply by placing LDI later in the instruction-group than
77# SWP. IA-64 CPUs will automatically forward the value if they
78# detect that the SWP and LDI are accessing the same memory-location.
79
80# The core-loop that can be pipelined then looks like this (annotated
81# with McKinley/Madison issue port & latency numbers, assuming L1
82# cache hits for the most part):
83
84# operation: instruction: issue-ports: latency
85# ------------------ ----------------------------- ------------- -------
86
87# Data = *inp++ ld1 data = [inp], 1 M0-M1 1 cyc c0
88# shladd Iptr = I, KeyTable, 3 M0-M3, I0, I1 1 cyc
89# I = (I + 1) & 0xff padd1 nextI = I, one M0-M3, I0, I1 3 cyc
90# ;;
91# SI = S[I] ld8 SI = [Iptr] M0-M1 1 cyc c1 * after SWAP!
92# ;;
93# cmp.eq.unc pBypass = I, J * after J is valid!
94# J = SI + J add J = J, SI M0-M3, I0, I1 1 cyc c2
95# (pBypass) br.cond.spnt Bypass
96# ;;
97# ---------------------------------------------------------------------------------------
98# J = J & 0xff zxt1 J = J I0, I1, 1 cyc c3
99# ;;
100# shladd Jptr = J, KeyTable, 3 M0-M3, I0, I1 1 cyc c4
101# ;;
102# SJ = S[J] ld8 SJ = [Jptr] M0-M1 1 cyc c5
103# ;;
104# ---------------------------------------------------------------------------------------
105# T = (SI + SJ) add T = SI, SJ M0-M3, I0, I1 1 cyc c6
106# ;;
107# T = T & 0xff zxt1 T = T I0, I1 1 cyc
108# S[I] = SJ st8 [Iptr] = SJ M2-M3 c7
109# S[J] = SI st8 [Jptr] = SI M2-M3
110# ;;
111# shladd Tptr = T, KeyTable, 3 M0-M3, I0, I1 1 cyc c8
112# ;;
113# ---------------------------------------------------------------------------------------
114# T = S[T] ld8 T = [Tptr] M0-M1 1 cyc c9
115# ;;
116# data ^= T xor data = data, T M0-M3, I0, I1 1 cyc c10
117# ;;
118# *out++ = Data ^ T dep word = word, data, 8, POS I0, I1 1 cyc c11
119# ;;
120# ---------------------------------------------------------------------------------------
121
122# There are several points worth making here:
123
124# - Note that due to the bypass/forwarding-path, the first two
125# phases of the loop are strangly mingled together. In
126# particular, note that the first stage of the pipeline is
127# using the value of "J", as calculated by the second stage.
128# - Each bundle-pair will have exactly 6 instructions.
129# - Pipelined, the loop can execute in 3 cycles/iteration and
130# 4 stages. However, McKinley/Madison can issue "st1" to
131# the same bank at a rate of at most one per 4 cycles. Thus,
132# instead of storing each byte, we accumulate them in a word
133# and then write them back at once with a single "st8" (this
134# implies that the setup code needs to ensure that the output
135# buffer is properly aligned, if need be, by encoding the
136# first few bytes separately).
137# - There is no space for a "br.ctop" instruction. For this
138# reason we can't use module-loop support in IA-64 and have
139# to do a traditional, purely software-pipelined loop.
140# - We can't replace any of the remaining "add/zxt1" pairs with
141# "padd1" because the latency for that instruction is too high
142# and would push the loop to the point where more bypasses
143# would be needed, which we don't have space for.
144# - The above loop runs at around 3.26 cycles/byte, or roughly
145# 440 MByte/sec on a 1.5GHz Madison. This is well below the
146# system bus bandwidth and hence with judicious use of
147# "lfetch" this loop can run at (almost) peak speed even when
148# the input and output data reside in memory. The
149# max. latency that can be tolerated is (PREFETCH_DISTANCE *
150# L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
151# least) 1-ahead prefetching of 128 byte cache-lines. Note
152# that we do NOT prefetch into L1, since that would only
153# interfere with the S[] table values stored there. This is
154# acceptable because there is a 10 cycle latency between
155# load and first use of the input data.
156# - We use a branch to out-of-line bypass-code of cycle-pressure:
157# we calculate the next J, check for the need to activate the
158# bypass path, and activate the bypass path ALL IN THE SAME
159# CYCLE. If we didn't have these constraints, we could do
160# the bypass with a simple conditional move instruction.
161# Fortunately, the bypass paths get activated relatively
162# infrequently, so the extra branches don't cost all that much
163# (about 0.04 cycles/byte, measured on a 16396 byte file with
164# random input data).
165#
166
167$phases = 4; # number of stages/phases in the pipelined-loop
168$unroll_count = 6; # number of times we unrolled it
169$pComI = (1 << 0);
170$pComJ = (1 << 1);
171$pComT = (1 << 2);
172$pOut = (1 << 3);
173
174$NData = 4;
175$NIP = 3;
176$NJP = 2;
177$NI = 2;
178$NSI = 3;
179$NSJ = 2;
180$NT = 2;
181$NOutWord = 2;
182
183#
184# $threshold is the minimum length before we attempt to use the
185# big software-pipelined loop. It MUST be greater-or-equal
186# to:
187# PHASES * (UNROLL_COUNT + 1) + 7
188#
189# The "+ 7" comes from the fact we may have to encode up to
190# 7 bytes separately before the output pointer is aligned.
191#
192$threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
193
194sub I {
195 local *code = shift;
196 local $format = shift;
197 $code .= sprintf ("\t\t".$format."\n", @_);
198}
199
200sub P {
201 local *code = shift;
202 local $format = shift;
203 $code .= sprintf ($format."\n", @_);
204}
205
206sub STOP {
207 local *code = shift;
208 $code .=<<___;
209 ;;
210___
211}
212
213sub emit_body {
214 local *c = shift;
215 local *bypass = shift;
216 local ($iteration, $p) = @_;
217
218 local $i0 = $iteration;
219 local $i1 = $iteration - 1;
220 local $i2 = $iteration - 2;
221 local $i3 = $iteration - 3;
222 local $iw0 = ($iteration - 3) / 8;
223 local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
224 local $byte_num = ($iteration - 3) % 8;
225 local $label = $iteration + 1;
226 local $pAny = ($p & 0xf) == 0xf;
227 local $pByp = (($p & $pComI) && ($iteration > 0));
228
229 $c.=<<___;
230//////////////////////////////////////////////////
231___
232
233 if (($p & 0xf) == 0) {
234 $c.="#ifdef HOST_IS_BIG_ENDIAN\n";
235 &I(\$c,"shr.u OutWord[%u] = OutWord[%u], 32;;",
236 $iw1 % $NOutWord, $iw1 % $NOutWord);
237 $c.="#endif\n";
238 &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
239 return;
240 }
241
242 # Cycle 0
243 &I(\$c, "{ .mmi") if ($pAny);
244 &I(\$c, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData) if ($p & $pComI);
245 &I(\$c, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
246 &I(\$c, "zxt1 J = J") if ($p & $pComJ);
247 &I(\$c, "}") if ($pAny);
248 &I(\$c, "{ .mmi") if ($pAny);
249 &I(\$c, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT) if ($p & $pOut);
250 &I(\$c, "add T[%u] = SI[%u], SJ[%u]",
251 $i0 % $NT, $i2 % $NSI, $i1 % $NSJ) if ($p & $pComT);
252 &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
253 &I(\$c, "}") if ($pAny);
254 &STOP(\$c);
255
256 # Cycle 1
257 &I(\$c, "{ .mmi") if ($pAny);
258 &I(\$c, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
259 &I(\$c, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
260 &I(\$c, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
261 &I(\$c, "}") if ($pAny);
262 &I(\$c, "{ .mmi") if ($pAny);
263 &I(\$c, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
264 &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP) if ($p & $pComJ);
265 &I(\$c, "xor Data[%u] = Data[%u], T[%u]",
266 $i3 % $NData, $i3 % $NData, $i1 % $NT) if ($p & $pOut);
267 &I(\$c, "}") if ($pAny);
268 &STOP(\$c);
269
270 # Cycle 2
271 &I(\$c, "{ .mmi") if ($pAny);
272 &I(\$c, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
273 &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI) if ($pByp);
274 &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
275 $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
276 &I(\$c, "}") if ($pAny);
277 &I(\$c, "{ .mmb") if ($pAny);
278 &I(\$c, "add J = J, SI[%u]", $i0 % $NSI) if ($p & $pComI);
279 &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
280 &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
281 &I(\$c, "}") if ($pAny);
282 &STOP(\$c);
283
284 &P(\$c, ".rc4Resume%u:", $label) if ($pByp);
285 if ($byte_num == 0 && $iteration >= $phases) {
286 &I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
287 $iw1 % $NOutWord) if ($p & $pOut);
288 if ($iteration == (1 + $unroll_count) * $phases - 1) {
289 if ($unroll_count == 6) {
290 &I(\$c, "mov OutWord[%u] = OutWord[%u]",
291 $iw1 % $NOutWord, $iw0 % $NOutWord);
292 }
293 &I(\$c, "lfetch.nt1 [InPrefetch], %u",
294 $unroll_count * $phases);
295 &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
296 $unroll_count * $phases);
297 &I(\$c, "br.cloop.sptk.few .rc4Loop");
298 }
299 }
300
301 if ($pByp) {
302 &P(\$bypass, ".rc4Bypass%u:", $label);
303 &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
304 &I(\$bypass, "nop 0");
305 &I(\$bypass, "nop 0");
306 &I(\$bypass, ";;");
307 &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
308 &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
309 &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
310 &I(\$bypass, ";;");
311 }
312}
313
314$code=<<___;
315.ident \"rc4-ia64.s, version 3.0\"
316.ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
317
318#define LCSave r8
319#define PRSave r9
320
321/* Inputs become invalid once rotation begins! */
322
323#define StateTable in0
324#define DataLen in1
325#define InputBuffer in2
326#define OutputBuffer in3
327
328#define KTable r14
329#define J r15
330#define InPtr r16
331#define OutPtr r17
332#define InPrefetch r18
333#define OutPrefetch r19
334#define One r20
335#define LoopCount r21
336#define Remainder r22
337#define IFinal r23
338#define EndPtr r24
339
340#define tmp0 r25
341#define tmp1 r26
342
343#define pBypass p6
344#define pDone p7
345#define pSmall p8
346#define pAligned p9
347#define pUnaligned p10
348
349#define pComputeI pPhase[0]
350#define pComputeJ pPhase[1]
351#define pComputeT pPhase[2]
352#define pOutput pPhase[3]
353
354#define RetVal r8
355#define L_OK p7
356#define L_NOK p8
357
358#define _NINPUTS 4
359#define _NOUTPUT 0
360
361#define _NROTATE 24
362#define _NLOCALS (_NROTATE - _NINPUTS - _NOUTPUT)
363
364#ifndef SZ
365# define SZ 4 // this must be set to sizeof(RC4_INT)
366#endif
367
368#if SZ == 1
369# define LKEY ld1
370# define SKEY st1
371# define KEYADDR(dst, i) add dst = i, KTable
372#elif SZ == 2
373# define LKEY ld2
374# define SKEY st2
375# define KEYADDR(dst, i) shladd dst = i, 1, KTable
376#elif SZ == 4
377# define LKEY ld4
378# define SKEY st4
379# define KEYADDR(dst, i) shladd dst = i, 2, KTable
380#else
381# define LKEY ld8
382# define SKEY st8
383# define KEYADDR(dst, i) shladd dst = i, 3, KTable
384#endif
385
386#if defined(_HPUX_SOURCE) && !defined(_LP64)
387# define ADDP addp4
388#else
389# define ADDP add
390#endif
391
392/* Define a macro for the bit number of the n-th byte: */
393
394#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
395# define HOST_IS_BIG_ENDIAN
396# define BYTE_POS(n) (56 - (8 * (n)))
397#else
398# define BYTE_POS(n) (8 * (n))
399#endif
400
401/*
402 We must perform the first phase of the pipeline explicitly since
403 we will always load from the stable the first time. The br.cexit
404 will never be taken since regardless of the number of bytes because
405 the epilogue count is 4.
406*/
407/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
408 assembler failed on original macro with syntax error. <appro> */
409#define MODSCHED_RC4_PROLOGUE \\
410 { \\
411 ld1 Data[0] = [InPtr], 1; \\
412 add IFinal = 1, I[1]; \\
413 KEYADDR(IPr[0], I[1]); \\
414 } ;; \\
415 { \\
416 LKEY SI[0] = [IPr[0]]; \\
417 mov pr.rot = 0x10000; \\
418 mov ar.ec = 4; \\
419 } ;; \\
420 { \\
421 add J = J, SI[0]; \\
422 zxt1 I[0] = IFinal; \\
423 br.cexit.spnt.few .+16; /* never taken */ \\
424 } ;;
425#define MODSCHED_RC4_LOOP(label) \\
426label: \\
427 { .mmi; \\
428 (pComputeI) ld1 Data[0] = [InPtr], 1; \\
429 (pComputeI) add IFinal = 1, I[1]; \\
430 (pComputeJ) zxt1 J = J; \\
431 }{ .mmi; \\
432 (pOutput) LKEY T[1] = [T[1]]; \\
433 (pComputeT) add T[0] = SI[2], SJ[1]; \\
434 (pComputeI) KEYADDR(IPr[0], I[1]); \\
435 } ;; \\
436 { .mmi; \\
437 (pComputeT) SKEY [IPr[2]] = SJ[1]; \\
438 (pComputeT) SKEY [JP[1]] = SI[2]; \\
439 (pComputeT) zxt1 T[0] = T[0]; \\
440 }{ .mmi; \\
441 (pComputeI) LKEY SI[0] = [IPr[0]]; \\
442 (pComputeJ) KEYADDR(JP[0], J); \\
443 (pComputeI) cmp.eq.unc pBypass, p0 = I[1], J; \\
444 } ;; \\
445 { .mmi; \\
446 (pComputeJ) LKEY SJ[0] = [JP[0]]; \\
447 (pOutput) xor Data[3] = Data[3], T[1]; \\
448 nop 0x0; \\
449 }{ .mmi; \\
450 (pComputeT) KEYADDR(T[0], T[0]); \\
451 (pBypass) mov SI[0] = SI[1]; \\
452 (pComputeI) zxt1 I[0] = IFinal; \\
453 } ;; \\
454 { .mmb; \\
455 (pOutput) st1 [OutPtr] = Data[3], 1; \\
456 (pComputeI) add J = J, SI[0]; \\
457 br.ctop.sptk.few label; \\
458 } ;;
459
460 .text
461
462 .align 32
463
464 .type RC4, \@function
465 .global RC4
466
467 .proc RC4
468 .prologue
469
470RC4:
471 {
472 .mmi
473 alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
474
475 .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
476 OutWord[2]
477 .rotp pPhase[4]
478
479 ADDP InPrefetch = 0, InputBuffer
480 ADDP KTable = 0, StateTable
481 }
482 {
483 .mmi
484 ADDP InPtr = 0, InputBuffer
485 ADDP OutPtr = 0, OutputBuffer
486 mov RetVal = r0
487 }
488 ;;
489 {
490 .mmi
491 lfetch.nt1 [InPrefetch], 0x80
492 ADDP OutPrefetch = 0, OutputBuffer
493 }
494 { // Return 0 if the input length is nonsensical
495 .mib
496 ADDP StateTable = 0, StateTable
497 cmp.ge.unc L_NOK, L_OK = r0, DataLen
498 (L_NOK) br.ret.sptk.few rp
499 }
500 ;;
501 {
502 .mib
503 cmp.eq.or L_NOK, L_OK = r0, InPtr
504 cmp.eq.or L_NOK, L_OK = r0, OutPtr
505 nop 0x0
506 }
507 {
508 .mib
509 cmp.eq.or L_NOK, L_OK = r0, StateTable
510 nop 0x0
511 (L_NOK) br.ret.sptk.few rp
512 }
513 ;;
514 LKEY I[1] = [KTable], SZ
515/* Prefetch the state-table. It contains 256 elements of size SZ */
516
517#if SZ == 1
518 ADDP tmp0 = 1*128, StateTable
519#elif SZ == 2
520 ADDP tmp0 = 3*128, StateTable
521 ADDP tmp1 = 2*128, StateTable
522#elif SZ == 4
523 ADDP tmp0 = 7*128, StateTable
524 ADDP tmp1 = 6*128, StateTable
525#elif SZ == 8
526 ADDP tmp0 = 15*128, StateTable
527 ADDP tmp1 = 14*128, StateTable
528#endif
529 ;;
530#if SZ >= 8
531 lfetch.fault.nt1 [tmp0], -256 // 15
532 lfetch.fault.nt1 [tmp1], -256;;
533 lfetch.fault.nt1 [tmp0], -256 // 13
534 lfetch.fault.nt1 [tmp1], -256;;
535 lfetch.fault.nt1 [tmp0], -256 // 11
536 lfetch.fault.nt1 [tmp1], -256;;
537 lfetch.fault.nt1 [tmp0], -256 // 9
538 lfetch.fault.nt1 [tmp1], -256;;
539#endif
540#if SZ >= 4
541 lfetch.fault.nt1 [tmp0], -256 // 7
542 lfetch.fault.nt1 [tmp1], -256;;
543 lfetch.fault.nt1 [tmp0], -256 // 5
544 lfetch.fault.nt1 [tmp1], -256;;
545#endif
546#if SZ >= 2
547 lfetch.fault.nt1 [tmp0], -256 // 3
548 lfetch.fault.nt1 [tmp1], -256;;
549#endif
550 {
551 .mii
552 lfetch.fault.nt1 [tmp0] // 1
553 add I[1]=1,I[1];;
554 zxt1 I[1]=I[1]
555 }
556 {
557 .mmi
558 lfetch.nt1 [InPrefetch], 0x80
559 lfetch.excl.nt1 [OutPrefetch], 0x80
560 .save pr, PRSave
561 mov PRSave = pr
562 } ;;
563 {
564 .mmi
565 lfetch.excl.nt1 [OutPrefetch], 0x80
566 LKEY J = [KTable], SZ
567 ADDP EndPtr = DataLen, InPtr
568 } ;;
569 {
570 .mmi
571 ADDP EndPtr = -1, EndPtr // Make it point to
572 // last data byte.
573 mov One = 1
574 .save ar.lc, LCSave
575 mov LCSave = ar.lc
576 .body
577 } ;;
578 {
579 .mmb
580 sub Remainder = 0, OutPtr
581 cmp.gtu pSmall, p0 = $threshold, DataLen
582(pSmall) br.cond.dpnt .rc4Remainder // Data too small for
583 // big loop.
584 } ;;
585 {
586 .mmi
587 and Remainder = 0x7, Remainder
588 ;;
589 cmp.eq pAligned, pUnaligned = Remainder, r0
590 nop 0x0
591 } ;;
592 {
593 .mmb
594.pred.rel "mutex",pUnaligned,pAligned
595(pUnaligned) add Remainder = -1, Remainder
596(pAligned) sub Remainder = EndPtr, InPtr
597(pAligned) br.cond.dptk.many .rc4Aligned
598 } ;;
599 {
600 .mmi
601 nop 0x0
602 nop 0x0
603 mov.i ar.lc = Remainder
604 }
605
606/* Do the initial few bytes via the compact, modulo-scheduled loop
607 until the output pointer is 8-byte-aligned. */
608
609 MODSCHED_RC4_PROLOGUE
610 MODSCHED_RC4_LOOP(.RC4AlignLoop)
611
612 {
613 .mib
614 sub Remainder = EndPtr, InPtr
615 zxt1 IFinal = IFinal
616 clrrrb // Clear CFM.rrb.pr so
617 ;; // next "mov pr.rot = N"
618 // does the right thing.
619 }
620 {
621 .mmi
622 mov I[1] = IFinal
623 nop 0x0
624 nop 0x0
625 } ;;
626
627
628.rc4Aligned:
629
630/*
631 Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
632 */
633
634 {
635 .mlx
636 add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
637 movl Remainder = 0xaaaaaaaaaaaaaaab
638 } ;;
639 {
640 .mmi
641 setf.sig f6 = LoopCount // M2, M3 6 cyc
642 setf.sig f7 = Remainder // M2, M3 6 cyc
643 nop 0x0
644 } ;;
645 {
646 .mfb
647 nop 0x0
648 xmpy.hu f6 = f6, f7
649 nop 0x0
650 } ;;
651 {
652 .mmi
653 getf.sig LoopCount = f6;; // M2 5 cyc
654 nop 0x0
655 shr.u LoopCount = LoopCount, 4
656 } ;;
657 {
658 .mmi
659 nop 0x0
660 nop 0x0
661 mov.i ar.lc = LoopCount
662 } ;;
663
664/* Now comes the unrolled loop: */
665
666.rc4Prologue:
667___
668
669$iteration = 0;
670
671# Generate the prologue:
672$predicates = 1;
673for ($i = 0; $i < $phases; ++$i) {
674 &emit_body (\$code, \$bypass, $iteration++, $predicates);
675 $predicates = ($predicates << 1) | 1;
676}
677
678$code.=<<___;
679.rc4Loop:
680___
681
682# Generate the body:
683for ($i = 0; $i < $unroll_count*$phases; ++$i) {
684 &emit_body (\$code, \$bypass, $iteration++, $predicates);
685}
686
687$code.=<<___;
688.rc4Epilogue:
689___
690
691# Generate the epilogue:
692for ($i = 0; $i < $phases; ++$i) {
693 $predicates <<= 1;
694 &emit_body (\$code, \$bypass, $iteration++, $predicates);
695}
696
697$code.=<<___;
698 {
699 .mmi
700 lfetch.nt1 [EndPtr] // fetch line with last byte
701 mov IFinal = I[1]
702 nop 0x0
703 }
704
705.rc4Remainder:
706 {
707 .mmi
708 sub Remainder = EndPtr, InPtr // Calculate
709 // # of bytes
710 // left - 1
711 nop 0x0
712 nop 0x0
713 } ;;
714 {
715 .mib
716 cmp.eq pDone, p0 = -1, Remainder // done already?
717 mov.i ar.lc = Remainder
718(pDone) br.cond.dptk.few .rc4Complete
719 }
720
721/* Do the remaining bytes via the compact, modulo-scheduled loop */
722
723 MODSCHED_RC4_PROLOGUE
724 MODSCHED_RC4_LOOP(.RC4RestLoop)
725
726.rc4Complete:
727 {
728 .mmi
729 add KTable = -SZ, KTable
730 add IFinal = -1, IFinal
731 mov ar.lc = LCSave
732 } ;;
733 {
734 .mii
735 SKEY [KTable] = J,-SZ
736 zxt1 IFinal = IFinal
737 mov pr = PRSave, 0x1FFFF
738 } ;;
739 {
740 .mib
741 SKEY [KTable] = IFinal
742 add RetVal = 1, r0
743 br.ret.sptk.few rp
744 } ;;
745___
746
747# Last but not least, emit the code for the bypass-code of the unrolled loop:
748
749$code.=$bypass;
750
751$code.=<<___;
752 .endp RC4
753___
754
755print $code;
diff --git a/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl
new file mode 100644
index 0000000000..272fa91e1a
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl
@@ -0,0 +1,632 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# June 2011
11#
12# This is RC4+MD5 "stitch" implementation. The idea, as spelled in
13# http://download.intel.com/design/intarch/papers/323686.pdf, is that
14# since both algorithms exhibit instruction-level parallelism, ILP,
15# below theoretical maximum, interleaving them would allow to utilize
16# processor resources better and achieve better performance. RC4
17# instruction sequence is virtually identical to rc4-x86_64.pl, which
18# is heavily based on submission by Maxim Perminov, Maxim Locktyukhin
19# and Jim Guilford of Intel. MD5 is fresh implementation aiming to
20# minimize register usage, which was used as "main thread" with RC4
21# weaved into it, one RC4 round per one MD5 round. In addition to the
22# stiched subroutine the script can generate standalone replacement
23# md5_block_asm_data_order and RC4. Below are performance numbers in
24# cycles per processed byte, less is better, for these the standalone
25# subroutines, sum of them, and stitched one:
26#
27# RC4 MD5 RC4+MD5 stitch gain
28# Opteron 6.5(*) 5.4 11.9 7.0 +70%(*)
29# Core2 6.5 5.8 12.3 7.7 +60%
30# Westmere 4.3 5.2 9.5 7.0 +36%
31# Sandy Bridge 4.2 5.5 9.7 6.8 +43%
32# Atom 9.3 6.5 15.8 11.1 +42%
33#
34# (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
35# is +53%...
36
37my ($rc4,$md5)=(1,1); # what to generate?
38my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(),
39 # but its result is discarded. Idea here is
40 # to be able to use 'openssl speed rc4' for
41 # benchmarking the stitched subroutine...
42
43my $flavour = shift;
44my $output = shift;
45if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
47my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
50( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52die "can't locate x86_64-xlate.pl";
53
54open OUT,"| \"$^X\" $xlate $flavour $output";
55*STDOUT=*OUT;
56
57my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);
58
59if ($rc4 && !$md5) {
60 ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx");
61 $func="RC4"; $nargs=4;
62} elsif ($md5 && !$rc4) {
63 ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx");
64 $func="md5_block_asm_data_order"; $nargs=3;
65} else {
66 ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
67 $func="rc4_md5_enc"; $nargs=6;
68 # void rc4_md5_enc(
69 # RC4_KEY *key, #
70 # const void *in0, # RC4 input
71 # void *out, # RC4 output
72 # MD5_CTX *ctx, #
73 # const void *inp, # MD5 input
74 # size_t len); # number of 64-byte blocks
75}
76
77my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
78 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
79 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
80 0x6b901122,0xfd987193,0xa679438e,0x49b40821,
81
82 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
83 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
84 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
85 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
86
87 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
88 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
89 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
90 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
91
92 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
93 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
94 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
95 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 );
96
97my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers
98my $tmp="%r12d";
99
100my @XX=("%rbp","%rsi"); # RC4 registers
101my @TX=("%rax","%rbx");
102my $YY="%rcx";
103my $TY="%rdx";
104
105my $MOD=32; # 16, 32 or 64
106
107$code.=<<___;
108.text
109.align 16
110
111.globl $func
112.type $func,\@function,$nargs
113$func:
114 cmp \$0,$len
115 je .Labort
116 push %rbx
117 push %rbp
118 push %r12
119 push %r13
120 push %r14
121 push %r15
122 sub \$40,%rsp
123.Lbody:
124___
125if ($rc4) {
126$code.=<<___;
127$D#md5# mov $ctx,%r11 # reassign arguments
128 mov $len,%r12
129 mov $in0,%r13
130 mov $out,%r14
131$D#md5# mov $inp,%r15
132___
133 $ctx="%r11" if ($md5); # reassign arguments
134 $len="%r12";
135 $in0="%r13";
136 $out="%r14";
137 $inp="%r15" if ($md5);
138 $inp=$in0 if (!$md5);
139$code.=<<___;
140 xor $XX[0],$XX[0]
141 xor $YY,$YY
142
143 lea 8($dat),$dat
144 mov -8($dat),$XX[0]#b
145 mov -4($dat),$YY#b
146
147 inc $XX[0]#b
148 sub $in0,$out
149 movl ($dat,$XX[0],4),$TX[0]#d
150___
151$code.=<<___ if (!$md5);
152 xor $TX[1],$TX[1]
153 test \$-128,$len
154 jz .Loop1
155 sub $XX[0],$TX[1]
156 and \$`$MOD-1`,$TX[1]
157 jz .Loop${MOD}_is_hot
158 sub $TX[1],$len
159.Loop${MOD}_warmup:
160 add $TX[0]#b,$YY#b
161 movl ($dat,$YY,4),$TY#d
162 movl $TX[0]#d,($dat,$YY,4)
163 movl $TY#d,($dat,$XX[0],4)
164 add $TY#b,$TX[0]#b
165 inc $XX[0]#b
166 movl ($dat,$TX[0],4),$TY#d
167 movl ($dat,$XX[0],4),$TX[0]#d
168 xorb ($in0),$TY#b
169 movb $TY#b,($out,$in0)
170 lea 1($in0),$in0
171 dec $TX[1]
172 jnz .Loop${MOD}_warmup
173
174 mov $YY,$TX[1]
175 xor $YY,$YY
176 mov $TX[1]#b,$YY#b
177
178.Loop${MOD}_is_hot:
179 mov $len,32(%rsp) # save original $len
180 shr \$6,$len # number of 64-byte blocks
181___
182 if ($D && !$md5) { # stitch in dummy MD5
183 $md5=1;
184 $ctx="%r11";
185 $inp="%r15";
186 $code.=<<___;
187 mov %rsp,$ctx
188 mov $in0,$inp
189___
190 }
191}
192$code.=<<___;
193#rc4# add $TX[0]#b,$YY#b
194#rc4# lea ($dat,$XX[0],4),$XX[1]
195 shl \$6,$len
196 add $inp,$len # pointer to the end of input
197 mov $len,16(%rsp)
198
199#md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX
200#md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX
201#md5# mov 1*4($ctx),$V[1]
202#md5# mov 2*4($ctx),$V[2]
203#md5# mov 3*4($ctx),$V[3]
204 jmp .Loop
205
206.align 16
207.Loop:
208#md5# mov $V[0],0*4(%rsp) # put aside current hash value
209#md5# mov $V[1],1*4(%rsp)
210#md5# mov $V[2],2*4(%rsp)
211#md5# mov $V[3],$tmp # forward reference
212#md5# mov $V[3],3*4(%rsp)
213___
214
215sub R0 {
216 my ($i,$a,$b,$c,$d)=@_;
217 my @rot0=(7,12,17,22);
218 my $j=$i%16;
219 my $k=$i%$MOD;
220 my $xmm="%xmm".($j&1);
221 $code.=" movdqu ($in0),%xmm2\n" if ($rc4 && $j==15);
222 $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
223 $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
224 $code.=<<___;
225#rc4# movl ($dat,$YY,4),$TY#d
226#md5# xor $c,$tmp
227#rc4# movl $TX[0]#d,($dat,$YY,4)
228#md5# and $b,$tmp
229#md5# add 4*`$j`($inp),$a
230#rc4# add $TY#b,$TX[0]#b
231#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
232#md5# add \$$K[$i],$a
233#md5# xor $d,$tmp
234#rc4# movz $TX[0]#b,$TX[0]#d
235#rc4# movl $TY#d,4*$k($XX[1])
236#md5# add $tmp,$a
237#rc4# add $TX[1]#b,$YY#b
238#md5# rol \$$rot0[$j%4],$a
239#md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference
240#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
241#md5# add $b,$a
242___
243 $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
244 mov $YY,$XX[1]
245 xor $YY,$YY # keyword to partial register
246 mov $XX[1]#b,$YY#b
247 lea ($dat,$XX[0],4),$XX[1]
248___
249 $code.=<<___ if ($rc4 && $j==15);
250 psllq \$8,%xmm1
251 pxor %xmm0,%xmm2
252 pxor %xmm1,%xmm2
253___
254}
255sub R1 {
256 my ($i,$a,$b,$c,$d)=@_;
257 my @rot1=(5,9,14,20);
258 my $j=$i%16;
259 my $k=$i%$MOD;
260 my $xmm="%xmm".($j&1);
261 $code.=" movdqu 16($in0),%xmm3\n" if ($rc4 && $j==15);
262 $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
263 $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
264 $code.=<<___;
265#rc4# movl ($dat,$YY,4),$TY#d
266#md5# xor $b,$tmp
267#rc4# movl $TX[0]#d,($dat,$YY,4)
268#md5# and $d,$tmp
269#md5# add 4*`((1+5*$j)%16)`($inp),$a
270#rc4# add $TY#b,$TX[0]#b
271#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
272#md5# add \$$K[$i],$a
273#md5# xor $c,$tmp
274#rc4# movz $TX[0]#b,$TX[0]#d
275#rc4# movl $TY#d,4*$k($XX[1])
276#md5# add $tmp,$a
277#rc4# add $TX[1]#b,$YY#b
278#md5# rol \$$rot1[$j%4],$a
279#md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference
280#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
281#md5# add $b,$a
282___
283 $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
284 mov $YY,$XX[1]
285 xor $YY,$YY # keyword to partial register
286 mov $XX[1]#b,$YY#b
287 lea ($dat,$XX[0],4),$XX[1]
288___
289 $code.=<<___ if ($rc4 && $j==15);
290 psllq \$8,%xmm1
291 pxor %xmm0,%xmm3
292 pxor %xmm1,%xmm3
293___
294}
295sub R2 {
296 my ($i,$a,$b,$c,$d)=@_;
297 my @rot2=(4,11,16,23);
298 my $j=$i%16;
299 my $k=$i%$MOD;
300 my $xmm="%xmm".($j&1);
301 $code.=" movdqu 32($in0),%xmm4\n" if ($rc4 && $j==15);
302 $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
303 $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
304 $code.=<<___;
305#rc4# movl ($dat,$YY,4),$TY#d
306#md5# xor $c,$tmp
307#rc4# movl $TX[0]#d,($dat,$YY,4)
308#md5# xor $b,$tmp
309#md5# add 4*`((5+3*$j)%16)`($inp),$a
310#rc4# add $TY#b,$TX[0]#b
311#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
312#md5# add \$$K[$i],$a
313#rc4# movz $TX[0]#b,$TX[0]#d
314#md5# add $tmp,$a
315#rc4# movl $TY#d,4*$k($XX[1])
316#rc4# add $TX[1]#b,$YY#b
317#md5# rol \$$rot2[$j%4],$a
318#md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference
319#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
320#md5# add $b,$a
321___
322 $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
323 mov $YY,$XX[1]
324 xor $YY,$YY # keyword to partial register
325 mov $XX[1]#b,$YY#b
326 lea ($dat,$XX[0],4),$XX[1]
327___
328 $code.=<<___ if ($rc4 && $j==15);
329 psllq \$8,%xmm1
330 pxor %xmm0,%xmm4
331 pxor %xmm1,%xmm4
332___
333}
334sub R3 {
335 my ($i,$a,$b,$c,$d)=@_;
336 my @rot3=(6,10,15,21);
337 my $j=$i%16;
338 my $k=$i%$MOD;
339 my $xmm="%xmm".($j&1);
340 $code.=" movdqu 48($in0),%xmm5\n" if ($rc4 && $j==15);
341 $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1);
342 $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1);
343 $code.=<<___;
344#rc4# movl ($dat,$YY,4),$TY#d
345#md5# xor $d,$tmp
346#rc4# movl $TX[0]#d,($dat,$YY,4)
347#md5# or $b,$tmp
348#md5# add 4*`((7*$j)%16)`($inp),$a
349#rc4# add $TY#b,$TX[0]#b
350#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
351#md5# add \$$K[$i],$a
352#rc4# movz $TX[0]#b,$TX[0]#d
353#md5# xor $c,$tmp
354#rc4# movl $TY#d,4*$k($XX[1])
355#md5# add $tmp,$a
356#rc4# add $TX[1]#b,$YY#b
357#md5# rol \$$rot3[$j%4],$a
358#md5# mov \$-1,$tmp # forward reference
359#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
360#md5# add $b,$a
361___
362 $code.=<<___ if ($rc4 && $j==15);
363 mov $XX[0],$XX[1]
364 xor $XX[0],$XX[0] # keyword to partial register
365 mov $XX[1]#b,$XX[0]#b
366 mov $YY,$XX[1]
367 xor $YY,$YY # keyword to partial register
368 mov $XX[1]#b,$YY#b
369 lea ($dat,$XX[0],4),$XX[1]
370 psllq \$8,%xmm1
371 pxor %xmm0,%xmm5
372 pxor %xmm1,%xmm5
373___
374}
375
376my $i=0;
377for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
378for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
379for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
380for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
381
382$code.=<<___;
383#md5# add 0*4(%rsp),$V[0] # accumulate hash value
384#md5# add 1*4(%rsp),$V[1]
385#md5# add 2*4(%rsp),$V[2]
386#md5# add 3*4(%rsp),$V[3]
387
388#rc4# movdqu %xmm2,($out,$in0) # write RC4 output
389#rc4# movdqu %xmm3,16($out,$in0)
390#rc4# movdqu %xmm4,32($out,$in0)
391#rc4# movdqu %xmm5,48($out,$in0)
392#md5# lea 64($inp),$inp
393#rc4# lea 64($in0),$in0
394 cmp 16(%rsp),$inp # are we done?
395 jb .Loop
396
397#md5# mov 24(%rsp),$len # restore pointer to MD5_CTX
398#rc4# sub $TX[0]#b,$YY#b # correct $YY
399#md5# mov $V[0],0*4($len) # write MD5_CTX
400#md5# mov $V[1],1*4($len)
401#md5# mov $V[2],2*4($len)
402#md5# mov $V[3],3*4($len)
403___
404$code.=<<___ if ($rc4 && (!$md5 || $D));
405 mov 32(%rsp),$len # restore original $len
406 and \$63,$len # remaining bytes
407 jnz .Loop1
408 jmp .Ldone
409
410.align 16
411.Loop1:
412 add $TX[0]#b,$YY#b
413 movl ($dat,$YY,4),$TY#d
414 movl $TX[0]#d,($dat,$YY,4)
415 movl $TY#d,($dat,$XX[0],4)
416 add $TY#b,$TX[0]#b
417 inc $XX[0]#b
418 movl ($dat,$TX[0],4),$TY#d
419 movl ($dat,$XX[0],4),$TX[0]#d
420 xorb ($in0),$TY#b
421 movb $TY#b,($out,$in0)
422 lea 1($in0),$in0
423 dec $len
424 jnz .Loop1
425
426.Ldone:
427___
428$code.=<<___;
429#rc4# sub \$1,$XX[0]#b
430#rc4# movl $XX[0]#d,-8($dat)
431#rc4# movl $YY#d,-4($dat)
432
433 mov 40(%rsp),%r15
434 mov 48(%rsp),%r14
435 mov 56(%rsp),%r13
436 mov 64(%rsp),%r12
437 mov 72(%rsp),%rbp
438 mov 80(%rsp),%rbx
439 lea 88(%rsp),%rsp
440.Lepilogue:
441.Labort:
442 ret
443.size $func,.-$func
444___
445
446if ($rc4 && $D) { # sole purpose of this section is to provide
447 # option to use the generated module as drop-in
448 # replacement for rc4-x86_64.pl for debugging
449 # and testing purposes...
450my ($idx,$ido)=("%r8","%r9");
451my ($dat,$len,$inp)=("%rdi","%rsi","%rdx");
452
453$code.=<<___;
454.globl RC4_set_key
455.type RC4_set_key,\@function,3
456.align 16
457RC4_set_key:
458 lea 8($dat),$dat
459 lea ($inp,$len),$inp
460 neg $len
461 mov $len,%rcx
462 xor %eax,%eax
463 xor $ido,$ido
464 xor %r10,%r10
465 xor %r11,%r11
466 jmp .Lw1stloop
467
468.align 16
469.Lw1stloop:
470 mov %eax,($dat,%rax,4)
471 add \$1,%al
472 jnc .Lw1stloop
473
474 xor $ido,$ido
475 xor $idx,$idx
476.align 16
477.Lw2ndloop:
478 mov ($dat,$ido,4),%r10d
479 add ($inp,$len,1),$idx#b
480 add %r10b,$idx#b
481 add \$1,$len
482 mov ($dat,$idx,4),%r11d
483 cmovz %rcx,$len
484 mov %r10d,($dat,$idx,4)
485 mov %r11d,($dat,$ido,4)
486 add \$1,$ido#b
487 jnc .Lw2ndloop
488
489 xor %eax,%eax
490 mov %eax,-8($dat)
491 mov %eax,-4($dat)
492 ret
493.size RC4_set_key,.-RC4_set_key
494
495.globl RC4_options
496.type RC4_options,\@abi-omnipotent
497.align 16
498RC4_options:
499 lea .Lopts(%rip),%rax
500 ret
501.align 64
502.Lopts:
503.asciz "rc4(64x,int)"
504.align 64
505.size RC4_options,.-RC4_options
506___
507}
508# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
509# CONTEXT *context,DISPATCHER_CONTEXT *disp)
510if ($win64) {
511my $rec="%rcx";
512my $frame="%rdx";
513my $context="%r8";
514my $disp="%r9";
515
516$code.=<<___;
517.extern __imp_RtlVirtualUnwind
518.type se_handler,\@abi-omnipotent
519.align 16
520se_handler:
521 push %rsi
522 push %rdi
523 push %rbx
524 push %rbp
525 push %r12
526 push %r13
527 push %r14
528 push %r15
529 pushfq
530 sub \$64,%rsp
531
532 mov 120($context),%rax # pull context->Rax
533 mov 248($context),%rbx # pull context->Rip
534
535 lea .Lbody(%rip),%r10
536 cmp %r10,%rbx # context->Rip<.Lbody
537 jb .Lin_prologue
538
539 mov 152($context),%rax # pull context->Rsp
540
541 lea .Lepilogue(%rip),%r10
542 cmp %r10,%rbx # context->Rip>=.Lepilogue
543 jae .Lin_prologue
544
545 mov 40(%rax),%r15
546 mov 48(%rax),%r14
547 mov 56(%rax),%r13
548 mov 64(%rax),%r12
549 mov 72(%rax),%rbp
550 mov 80(%rax),%rbx
551 lea 88(%rax),%rax
552
553 mov %rbx,144($context) # restore context->Rbx
554 mov %rbp,160($context) # restore context->Rbp
555 mov %r12,216($context) # restore context->R12
556 mov %r13,224($context) # restore context->R12
557 mov %r14,232($context) # restore context->R14
558 mov %r15,240($context) # restore context->R15
559
560.Lin_prologue:
561 mov 8(%rax),%rdi
562 mov 16(%rax),%rsi
563 mov %rax,152($context) # restore context->Rsp
564 mov %rsi,168($context) # restore context->Rsi
565 mov %rdi,176($context) # restore context->Rdi
566
567 mov 40($disp),%rdi # disp->ContextRecord
568 mov $context,%rsi # context
569 mov \$154,%ecx # sizeof(CONTEXT)
570 .long 0xa548f3fc # cld; rep movsq
571
572 mov $disp,%rsi
573 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
574 mov 8(%rsi),%rdx # arg2, disp->ImageBase
575 mov 0(%rsi),%r8 # arg3, disp->ControlPc
576 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
577 mov 40(%rsi),%r10 # disp->ContextRecord
578 lea 56(%rsi),%r11 # &disp->HandlerData
579 lea 24(%rsi),%r12 # &disp->EstablisherFrame
580 mov %r10,32(%rsp) # arg5
581 mov %r11,40(%rsp) # arg6
582 mov %r12,48(%rsp) # arg7
583 mov %rcx,56(%rsp) # arg8, (NULL)
584 call *__imp_RtlVirtualUnwind(%rip)
585
586 mov \$1,%eax # ExceptionContinueSearch
587 add \$64,%rsp
588 popfq
589 pop %r15
590 pop %r14
591 pop %r13
592 pop %r12
593 pop %rbp
594 pop %rbx
595 pop %rdi
596 pop %rsi
597 ret
598.size se_handler,.-se_handler
599
600.section .pdata
601.align 4
602 .rva .LSEH_begin_$func
603 .rva .LSEH_end_$func
604 .rva .LSEH_info_$func
605
606.section .xdata
607.align 8
608.LSEH_info_$func:
609 .byte 9,0,0,0
610 .rva se_handler
611___
612}
613
614sub reg_part {
615my ($reg,$conv)=@_;
616 if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
617 elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
618 elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
619 elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
620 return $reg;
621}
622
623$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
624$code =~ s/\`([^\`]*)\`/eval $1/gem;
625$code =~ s/pinsrw\s+\$0,/movd /gm;
626
627$code =~ s/#md5#//gm if ($md5);
628$code =~ s/#rc4#//gm if ($rc4);
629
630print $code;
631
632close STDOUT;
diff --git a/src/lib/libcrypto/rc4/asm/rc4-parisc.pl b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl
new file mode 100644
index 0000000000..ad7e65651c
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl
@@ -0,0 +1,314 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# RC4 for PA-RISC.
11
12# June 2009.
13#
14# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
15# For reference, [4x] unrolled loop is >40% faster than folded one.
16# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
17# is believed to be not sufficient to justify the effort...
18#
19# Special thanks to polarhome.com for providing HP-UX account.
20
21$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
22
23$flavour = shift;
24$output = shift;
25open STDOUT,">$output";
26
27if ($flavour =~ /64/) {
28 $LEVEL ="2.0W";
29 $SIZE_T =8;
30 $FRAME_MARKER =80;
31 $SAVED_RP =16;
32 $PUSH ="std";
33 $PUSHMA ="std,ma";
34 $POP ="ldd";
35 $POPMB ="ldd,mb";
36} else {
37 $LEVEL ="1.0";
38 $SIZE_T =4;
39 $FRAME_MARKER =48;
40 $SAVED_RP =20;
41 $PUSH ="stw";
42 $PUSHMA ="stwm";
43 $POP ="ldw";
44 $POPMB ="ldwm";
45}
46
47$FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker
48 # [+ argument transfer]
49$SZ=1; # defaults to RC4_CHAR
50if (open CONF,"<${dir}../../opensslconf.h") {
51 while(<CONF>) {
52 if (m/#\s*define\s+RC4_INT\s+(.*)/) {
53 $SZ = ($1=~/char$/) ? 1 : 4;
54 last;
55 }
56 }
57 close CONF;
58}
59
60if ($SZ==1) { # RC4_CHAR
61 $LD="ldb";
62 $LDX="ldbx";
63 $MKX="addl";
64 $ST="stb";
65} else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
66 $LD="ldw";
67 $LDX="ldwx,s";
68 $MKX="sh2addl";
69 $ST="stw";
70}
71
72$key="%r26";
73$len="%r25";
74$inp="%r24";
75$out="%r23";
76
77@XX=("%r19","%r20");
78@TX=("%r21","%r22");
79$YY="%r28";
80$TY="%r29";
81
82$acc="%r1";
83$ix="%r2";
84$iy="%r3";
85$dat0="%r4";
86$dat1="%r5";
87$rem="%r6";
88$mask="%r31";
89
90sub unrolledloopbody {
91for ($i=0;$i<4;$i++) {
92$code.=<<___;
93 ldo 1($XX[0]),$XX[1]
94 `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
95 and $mask,$XX[1],$XX[1]
96 $LDX $YY($key),$TY
97 $MKX $YY,$key,$ix
98 $LDX $XX[1]($key),$TX[1]
99 $MKX $XX[0],$key,$iy
100 $ST $TX[0],0($ix)
101 comclr,<> $XX[1],$YY,%r0 ; conditional
102 copy $TX[0],$TX[1] ; move
103 `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
104 $ST $TY,0($iy)
105 addl $TX[0],$TY,$TY
106 addl $TX[1],$YY,$YY
107 and $mask,$TY,$TY
108 and $mask,$YY,$YY
109___
110push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
111} }
112
113sub foldedloop {
114my ($label,$count)=@_;
115$code.=<<___;
116$label
117 $MKX $YY,$key,$iy
118 $LDX $YY($key),$TY
119 $MKX $XX[0],$key,$ix
120 $ST $TX[0],0($iy)
121 ldo 1($XX[0]),$XX[0]
122 $ST $TY,0($ix)
123 addl $TX[0],$TY,$TY
124 ldbx $inp($out),$dat1
125 and $mask,$TY,$TY
126 and $mask,$XX[0],$XX[0]
127 $LDX $TY($key),$acc
128 $LDX $XX[0]($key),$TX[0]
129 ldo 1($out),$out
130 xor $dat1,$acc,$acc
131 addl $TX[0],$YY,$YY
132 stb $acc,-1($out)
133 addib,<> -1,$count,$label ; $count is always small
134 and $mask,$YY,$YY
135___
136}
137
138$code=<<___;
139 .LEVEL $LEVEL
140 .SPACE \$TEXT\$
141 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
142
143 .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
144RC4
145 .PROC
146 .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
147 .ENTRY
148 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
149 $PUSHMA %r3,$FRAME(%sp)
150 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
151 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
152 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
153
154 cmpib,*= 0,$len,L\$abort
155 sub $inp,$out,$inp ; distance between $inp and $out
156
157 $LD `0*$SZ`($key),$XX[0]
158 $LD `1*$SZ`($key),$YY
159 ldo `2*$SZ`($key),$key
160
161 ldi 0xff,$mask
162 ldi 3,$dat0
163
164 ldo 1($XX[0]),$XX[0] ; warm up loop
165 and $mask,$XX[0],$XX[0]
166 $LDX $XX[0]($key),$TX[0]
167 addl $TX[0],$YY,$YY
168 cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother?
169 and $mask,$YY,$YY
170
171 and,<> $out,$dat0,$rem ; is $out aligned?
172 b L\$alignedout
173 subi 4,$rem,$rem
174 sub $len,$rem,$len
175___
176&foldedloop("L\$alignout",$rem); # process till $out is aligned
177
178$code.=<<___;
179L\$alignedout ; $len is at least 4 here
180 and,<> $inp,$dat0,$acc ; is $inp aligned?
181 b L\$oop4
182 sub $inp,$acc,$rem ; align $inp
183
184 sh3addl $acc,%r0,$acc
185 subi 32,$acc,$acc
186 mtctl $acc,%cr11 ; load %sar with vshd align factor
187 ldwx $rem($out),$dat0
188 ldo 4($rem),$rem
189L\$oop4misalignedinp
190___
191&unrolledloopbody();
192$code.=<<___;
193 $LDX $TY($key),$ix
194 ldwx $rem($out),$dat1
195 ldo -4($len),$len
196 or $ix,$acc,$acc ; last piece, no need to dep
197 vshd $dat0,$dat1,$iy ; align data
198 copy $dat1,$dat0
199 xor $iy,$acc,$acc
200 stw $acc,0($out)
201 cmpib,*<< 3,$len,L\$oop4misalignedinp
202 ldo 4($out),$out
203 cmpib,*= 0,$len,L\$done
204 nop
205 b L\$oop1
206 nop
207
208 .ALIGN 8
209L\$oop4
210___
211&unrolledloopbody();
212$code.=<<___;
213 $LDX $TY($key),$ix
214 ldwx $inp($out),$dat0
215 ldo -4($len),$len
216 or $ix,$acc,$acc ; last piece, no need to dep
217 xor $dat0,$acc,$acc
218 stw $acc,0($out)
219 cmpib,*<< 3,$len,L\$oop4
220 ldo 4($out),$out
221 cmpib,*= 0,$len,L\$done
222 nop
223___
224&foldedloop("L\$oop1",$len);
225$code.=<<___;
226L\$done
227 $POP `-$FRAME-$SAVED_RP`(%sp),%r2
228 ldo -1($XX[0]),$XX[0] ; chill out loop
229 sub $YY,$TX[0],$YY
230 and $mask,$XX[0],$XX[0]
231 and $mask,$YY,$YY
232 $ST $XX[0],`-2*$SZ`($key)
233 $ST $YY,`-1*$SZ`($key)
234 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
235 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
236 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
237L\$abort
238 bv (%r2)
239 .EXIT
240 $POPMB -$FRAME(%sp),%r3
241 .PROCEND
242___
243
244$code.=<<___;
245
246 .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
247 .ALIGN 8
248private_RC4_set_key
249 .PROC
250 .CALLINFO NO_CALLS
251 .ENTRY
252 $ST %r0,`0*$SZ`($key)
253 $ST %r0,`1*$SZ`($key)
254 ldo `2*$SZ`($key),$key
255 copy %r0,@XX[0]
256L\$1st
257 $ST @XX[0],0($key)
258 ldo 1(@XX[0]),@XX[0]
259 bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256
260 ldo $SZ($key),$key
261
262 ldo `-256*$SZ`($key),$key ; rewind $key
263 addl $len,$inp,$inp ; $inp to point at the end
264 sub %r0,$len,%r23 ; inverse index
265 copy %r0,@XX[0]
266 copy %r0,@XX[1]
267 ldi 0xff,$mask
268
269L\$2nd
270 $LDX @XX[0]($key),@TX[0]
271 ldbx %r23($inp),@TX[1]
272 addi,nuv 1,%r23,%r23 ; increment and conditional
273 sub %r0,$len,%r23 ; inverse index
274 addl @TX[0],@XX[1],@XX[1]
275 addl @TX[1],@XX[1],@XX[1]
276 and $mask,@XX[1],@XX[1]
277 $MKX @XX[0],$key,$TY
278 $LDX @XX[1]($key),@TX[1]
279 $MKX @XX[1],$key,$YY
280 ldo 1(@XX[0]),@XX[0]
281 $ST @TX[0],0($YY)
282 bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256
283 $ST @TX[1],0($TY)
284
285 bv,n (%r2)
286 .EXIT
287 nop
288 .PROCEND
289
290 .EXPORT RC4_options,ENTRY
291 .ALIGN 8
292RC4_options
293 .PROC
294 .CALLINFO NO_CALLS
295 .ENTRY
296 blr %r0,%r28
297 ldi 3,%r1
298L\$pic
299 andcm %r28,%r1,%r28
300 bv (%r2)
301 .EXIT
302 ldo L\$opts-L\$pic(%r28),%r28
303 .PROCEND
304 .ALIGN 8
305L\$opts
306 .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
307 .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
308___
309$code =~ s/\`([^\`]*)\`/eval $1/gem;
310$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
311$code =~ s/\bbv\b/bve/gm if ($SIZE_T==8);
312
313print $code;
314close STDOUT;
diff --git a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl
new file mode 100644
index 0000000000..7528ece13c
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl
@@ -0,0 +1,234 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# February 2009
11#
12# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to
13# "cluster" Address Generation Interlocks, so that one pipeline stall
14# resolves several dependencies.
15
16# November 2010.
17#
18# Adapt for -m31 build. If kernel supports what's called "highgprs"
19# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
20# instructions and achieve "64-bit" performance even in 31-bit legacy
21# application context. The feature is not specific to any particular
22# processor, as long as it's "z-CPU". Latter implies that the code
23# remains z/Architecture specific. On z990 it was measured to perform
24# 50% better than code generated by gcc 4.3.
25
26$flavour = shift;
27
28if ($flavour =~ /3[12]/) {
29 $SIZE_T=4;
30 $g="";
31} else {
32 $SIZE_T=8;
33 $g="g";
34}
35
36while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
37open STDOUT,">$output";
38
39$rp="%r14";
40$sp="%r15";
41$code=<<___;
42.text
43
44___
45
46# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
47{
48$acc="%r0";
49$cnt="%r1";
50$key="%r2";
51$len="%r3";
52$inp="%r4";
53$out="%r5";
54
55@XX=("%r6","%r7");
56@TX=("%r8","%r9");
57$YY="%r10";
58$TY="%r11";
59
60$code.=<<___;
61.globl RC4
62.type RC4,\@function
63.align 64
64RC4:
65 stm${g} %r6,%r11,6*$SIZE_T($sp)
66___
67$code.=<<___ if ($flavour =~ /3[12]/);
68 llgfr $len,$len
69___
70$code.=<<___;
71 llgc $XX[0],0($key)
72 llgc $YY,1($key)
73 la $XX[0],1($XX[0])
74 nill $XX[0],0xff
75 srlg $cnt,$len,3
76 ltgr $cnt,$cnt
77 llgc $TX[0],2($XX[0],$key)
78 jz .Lshort
79 j .Loop8
80
81.align 64
82.Loop8:
83___
84for ($i=0;$i<8;$i++) {
85$code.=<<___;
86 la $YY,0($YY,$TX[0]) # $i
87 nill $YY,255
88 la $XX[1],1($XX[0])
89 nill $XX[1],255
90___
91$code.=<<___ if ($i==1);
92 llgc $acc,2($TY,$key)
93___
94$code.=<<___ if ($i>1);
95 sllg $acc,$acc,8
96 ic $acc,2($TY,$key)
97___
98$code.=<<___;
99 llgc $TY,2($YY,$key)
100 stc $TX[0],2($YY,$key)
101 llgc $TX[1],2($XX[1],$key)
102 stc $TY,2($XX[0],$key)
103 cr $XX[1],$YY
104 jne .Lcmov$i
105 la $TX[1],0($TX[0])
106.Lcmov$i:
107 la $TY,0($TY,$TX[0])
108 nill $TY,255
109___
110push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
111}
112
113$code.=<<___;
114 lg $TX[1],0($inp)
115 sllg $acc,$acc,8
116 la $inp,8($inp)
117 ic $acc,2($TY,$key)
118 xgr $acc,$TX[1]
119 stg $acc,0($out)
120 la $out,8($out)
121 brctg $cnt,.Loop8
122
123.Lshort:
124 lghi $acc,7
125 ngr $len,$acc
126 jz .Lexit
127 j .Loop1
128
129.align 16
130.Loop1:
131 la $YY,0($YY,$TX[0])
132 nill $YY,255
133 llgc $TY,2($YY,$key)
134 stc $TX[0],2($YY,$key)
135 stc $TY,2($XX[0],$key)
136 ar $TY,$TX[0]
137 ahi $XX[0],1
138 nill $TY,255
139 nill $XX[0],255
140 llgc $acc,0($inp)
141 la $inp,1($inp)
142 llgc $TY,2($TY,$key)
143 llgc $TX[0],2($XX[0],$key)
144 xr $acc,$TY
145 stc $acc,0($out)
146 la $out,1($out)
147 brct $len,.Loop1
148
149.Lexit:
150 ahi $XX[0],-1
151 stc $XX[0],0($key)
152 stc $YY,1($key)
153 lm${g} %r6,%r11,6*$SIZE_T($sp)
154 br $rp
155.size RC4,.-RC4
156.string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
157
158___
159}
160
161# void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
162{
163$cnt="%r0";
164$idx="%r1";
165$key="%r2";
166$len="%r3";
167$inp="%r4";
168$acc="%r5";
169$dat="%r6";
170$ikey="%r7";
171$iinp="%r8";
172
173$code.=<<___;
174.globl private_RC4_set_key
175.type private_RC4_set_key,\@function
176.align 64
177private_RC4_set_key:
178 stm${g} %r6,%r8,6*$SIZE_T($sp)
179 lhi $cnt,256
180 la $idx,0(%r0)
181 sth $idx,0($key)
182.align 4
183.L1stloop:
184 stc $idx,2($idx,$key)
185 la $idx,1($idx)
186 brct $cnt,.L1stloop
187
188 lghi $ikey,-256
189 lr $cnt,$len
190 la $iinp,0(%r0)
191 la $idx,0(%r0)
192.align 16
193.L2ndloop:
194 llgc $acc,2+256($ikey,$key)
195 llgc $dat,0($iinp,$inp)
196 la $idx,0($idx,$acc)
197 la $ikey,1($ikey)
198 la $idx,0($idx,$dat)
199 nill $idx,255
200 la $iinp,1($iinp)
201 tml $ikey,255
202 llgc $dat,2($idx,$key)
203 stc $dat,2+256-1($ikey,$key)
204 stc $acc,2($idx,$key)
205 jz .Ldone
206 brct $cnt,.L2ndloop
207 lr $cnt,$len
208 la $iinp,0(%r0)
209 j .L2ndloop
210.Ldone:
211 lm${g} %r6,%r8,6*$SIZE_T($sp)
212 br $rp
213.size private_RC4_set_key,.-private_RC4_set_key
214
215___
216}
217
218# const char *RC4_options()
219$code.=<<___;
220.globl RC4_options
221.type RC4_options,\@function
222.align 16
223RC4_options:
224 larl %r2,.Loptions
225 br %r14
226.size RC4_options,.-RC4_options
227.section .rodata
228.Loptions:
229.align 8
230.string "rc4(8x,char)"
231___
232
233print $code;
234close STDOUT; # force flush
diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
new file mode 100755
index 0000000000..75750dbf33
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
@@ -0,0 +1,677 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# July 2004
11#
12# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
13# "hand-coded assembler"] doesn't stand for the whole improvement
14# coefficient. It turned out that eliminating RC4_CHAR from config
15# line results in ~40% improvement (yes, even for C implementation).
16# Presumably it has everything to do with AMD cache architecture and
17# RAW or whatever penalties. Once again! The module *requires* config
18# line *without* RC4_CHAR! As for coding "secret," I bet on partial
19# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
20# I simply 'inc %r8b'. Even though optimization manual discourages
21# to operate on partial registers, it turned out to be the best bet.
22# At least for AMD... How IA32E would perform remains to be seen...
23
24# November 2004
25#
26# As was shown by Marc Bevand reordering of couple of load operations
27# results in even higher performance gain of 3.3x:-) At least on
28# Opteron... For reference, 1x in this case is RC4_CHAR C-code
29# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
30# Latter means that if you want to *estimate* what to expect from
31# *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
32
33# November 2004
34#
35# Intel P4 EM64T core was found to run the AMD64 code really slow...
36# The only way to achieve comparable performance on P4 was to keep
37# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
38# compose blended code, which would perform even within 30% marginal
39# on either AMD and Intel platforms, I implement both cases. See
40# rc4_skey.c for further details...
41
42# April 2005
43#
44# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
45# those with add/sub results in 50% performance improvement of folded
46# loop...
47
48# May 2005
49#
50# As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
51# performance by >30% [unlike P4 32-bit case that is]. But this is
52# provided that loads are reordered even more aggressively! Both code
53# pathes, AMD64 and EM64T, reorder loads in essentially same manner
54# as my IA-64 implementation. On Opteron this resulted in modest 5%
55# improvement [I had to test it], while final Intel P4 performance
56# achieves respectful 432MBps on 2.8GHz processor now. For reference.
57# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
58# RC4_INT code-path. While if executed on Opteron, it's only 25%
59# slower than the RC4_INT one [meaning that if CPU µ-arch detection
60# is not implemented, then this final RC4_CHAR code-path should be
61# preferred, as it provides better *all-round* performance].
62
63# March 2007
64#
65# Intel Core2 was observed to perform poorly on both code paths:-( It
66# apparently suffers from some kind of partial register stall, which
67# occurs in 64-bit mode only [as virtually identical 32-bit loop was
68# observed to outperform 64-bit one by almost 50%]. Adding two movzb to
69# cloop1 boosts its performance by 80%! This loop appears to be optimal
70# fit for Core2 and therefore the code was modified to skip cloop8 on
71# this CPU.
72
73# May 2010
74#
75# Intel Westmere was observed to perform suboptimally. Adding yet
76# another movzb to cloop1 improved performance by almost 50%! Core2
77# performance is improved too, but nominally...
78
79# May 2011
80#
81# The only code path that was not modified is P4-specific one. Non-P4
82# Intel code path optimization is heavily based on submission by Maxim
83# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
84# some of the ideas even in attempt to optmize the original RC4_INT
85# code path... Current performance in cycles per processed byte (less
86# is better) and improvement coefficients relative to previous
87# version of this module are:
88#
89# Opteron 5.3/+0%(*)
90# P4 6.5
91# Core2 6.2/+15%(**)
92# Westmere 4.2/+60%
93# Sandy Bridge 4.2/+120%
94# Atom 9.3/+80%
95#
96# (*) But corresponding loop has less instructions, which should have
97# positive effect on upcoming Bulldozer, which has one less ALU.
98# For reference, Intel code runs at 6.8 cpb rate on Opteron.
99# (**) Note that Core2 result is ~15% lower than corresponding result
100# for 32-bit code, meaning that it's possible to improve it,
101# but more than likely at the cost of the others (see rc4-586.pl
102# to get the idea)...
103
104$flavour = shift;
105$output = shift;
106if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
107
108$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
109
110$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
111( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
112( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
113die "can't locate x86_64-xlate.pl";
114
115open OUT,"| \"$^X\" $xlate $flavour $output";
116*STDOUT=*OUT;
117
118$dat="%rdi"; # arg1
119$len="%rsi"; # arg2
120$inp="%rdx"; # arg3
121$out="%rcx"; # arg4
122
123{
124$code=<<___;
125.text
126.extern OPENSSL_ia32cap_P
127
128.globl RC4
129.type RC4,\@function,4
130.align 16
131RC4: or $len,$len
132 jne .Lentry
133 ret
134.Lentry:
135 push %rbx
136 push %r12
137 push %r13
138.Lprologue:
139 mov $len,%r11
140 mov $inp,%r12
141 mov $out,%r13
142___
143my $len="%r11"; # reassign input arguments
144my $inp="%r12";
145my $out="%r13";
146
147my @XX=("%r10","%rsi");
148my @TX=("%rax","%rbx");
149my $YY="%rcx";
150my $TY="%rdx";
151
152$code.=<<___;
153 xor $XX[0],$XX[0]
154 xor $YY,$YY
155
156 lea 8($dat),$dat
157 mov -8($dat),$XX[0]#b
158 mov -4($dat),$YY#b
159 cmpl \$-1,256($dat)
160 je .LRC4_CHAR
161 mov OPENSSL_ia32cap_P(%rip),%r8d
162 xor $TX[1],$TX[1]
163 inc $XX[0]#b
164 sub $XX[0],$TX[1]
165 sub $inp,$out
166 movl ($dat,$XX[0],4),$TX[0]#d
167 test \$-16,$len
168 jz .Lloop1
169 bt \$30,%r8d # Intel CPU?
170 jc .Lintel
171 and \$7,$TX[1]
172 lea 1($XX[0]),$XX[1]
173 jz .Loop8
174 sub $TX[1],$len
175.Loop8_warmup:
176 add $TX[0]#b,$YY#b
177 movl ($dat,$YY,4),$TY#d
178 movl $TX[0]#d,($dat,$YY,4)
179 movl $TY#d,($dat,$XX[0],4)
180 add $TY#b,$TX[0]#b
181 inc $XX[0]#b
182 movl ($dat,$TX[0],4),$TY#d
183 movl ($dat,$XX[0],4),$TX[0]#d
184 xorb ($inp),$TY#b
185 movb $TY#b,($out,$inp)
186 lea 1($inp),$inp
187 dec $TX[1]
188 jnz .Loop8_warmup
189
190 lea 1($XX[0]),$XX[1]
191 jmp .Loop8
192.align 16
193.Loop8:
194___
195for ($i=0;$i<8;$i++) {
196$code.=<<___ if ($i==7);
197 add \$8,$XX[1]#b
198___
199$code.=<<___;
200 add $TX[0]#b,$YY#b
201 movl ($dat,$YY,4),$TY#d
202 movl $TX[0]#d,($dat,$YY,4)
203 movl `4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
204 ror \$8,%r8 # ror is redundant when $i=0
205 movl $TY#d,4*$i($dat,$XX[0],4)
206 add $TX[0]#b,$TY#b
207 movb ($dat,$TY,4),%r8b
208___
209push(@TX,shift(@TX)); #push(@XX,shift(@XX)); # "rotate" registers
210}
211$code.=<<___;
212 add \$8,$XX[0]#b
213 ror \$8,%r8
214 sub \$8,$len
215
216 xor ($inp),%r8
217 mov %r8,($out,$inp)
218 lea 8($inp),$inp
219
220 test \$-8,$len
221 jnz .Loop8
222 cmp \$0,$len
223 jne .Lloop1
224 jmp .Lexit
225
226.align 16
227.Lintel:
228 test \$-32,$len
229 jz .Lloop1
230 and \$15,$TX[1]
231 jz .Loop16_is_hot
232 sub $TX[1],$len
233.Loop16_warmup:
234 add $TX[0]#b,$YY#b
235 movl ($dat,$YY,4),$TY#d
236 movl $TX[0]#d,($dat,$YY,4)
237 movl $TY#d,($dat,$XX[0],4)
238 add $TY#b,$TX[0]#b
239 inc $XX[0]#b
240 movl ($dat,$TX[0],4),$TY#d
241 movl ($dat,$XX[0],4),$TX[0]#d
242 xorb ($inp),$TY#b
243 movb $TY#b,($out,$inp)
244 lea 1($inp),$inp
245 dec $TX[1]
246 jnz .Loop16_warmup
247
248 mov $YY,$TX[1]
249 xor $YY,$YY
250 mov $TX[1]#b,$YY#b
251
252.Loop16_is_hot:
253 lea ($dat,$XX[0],4),$XX[1]
254___
255sub RC4_loop {
256 my $i=shift;
257 my $j=$i<0?0:$i;
258 my $xmm="%xmm".($j&1);
259
260 $code.=" add \$16,$XX[0]#b\n" if ($i==15);
261 $code.=" movdqu ($inp),%xmm2\n" if ($i==15);
262 $code.=" add $TX[0]#b,$YY#b\n" if ($i<=0);
263 $code.=" movl ($dat,$YY,4),$TY#d\n";
264 $code.=" pxor %xmm0,%xmm2\n" if ($i==0);
265 $code.=" psllq \$8,%xmm1\n" if ($i==0);
266 $code.=" pxor $xmm,$xmm\n" if ($i<=1);
267 $code.=" movl $TX[0]#d,($dat,$YY,4)\n";
268 $code.=" add $TY#b,$TX[0]#b\n";
269 $code.=" movl `4*($j+1)`($XX[1]),$TX[1]#d\n" if ($i<15);
270 $code.=" movz $TX[0]#b,$TX[0]#d\n";
271 $code.=" movl $TY#d,4*$j($XX[1])\n";
272 $code.=" pxor %xmm1,%xmm2\n" if ($i==0);
273 $code.=" lea ($dat,$XX[0],4),$XX[1]\n" if ($i==15);
274 $code.=" add $TX[1]#b,$YY#b\n" if ($i<15);
275 $code.=" pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n";
276 $code.=" movdqu %xmm2,($out,$inp)\n" if ($i==0);
277 $code.=" lea 16($inp),$inp\n" if ($i==0);
278 $code.=" movl ($XX[1]),$TX[1]#d\n" if ($i==15);
279}
280 RC4_loop(-1);
281$code.=<<___;
282 jmp .Loop16_enter
283.align 16
284.Loop16:
285___
286
287for ($i=0;$i<16;$i++) {
288 $code.=".Loop16_enter:\n" if ($i==1);
289 RC4_loop($i);
290 push(@TX,shift(@TX)); # "rotate" registers
291}
292$code.=<<___;
293 mov $YY,$TX[1]
294 xor $YY,$YY # keyword to partial register
295 sub \$16,$len
296 mov $TX[1]#b,$YY#b
297 test \$-16,$len
298 jnz .Loop16
299
300 psllq \$8,%xmm1
301 pxor %xmm0,%xmm2
302 pxor %xmm1,%xmm2
303 movdqu %xmm2,($out,$inp)
304 lea 16($inp),$inp
305
306 cmp \$0,$len
307 jne .Lloop1
308 jmp .Lexit
309
310.align 16
311.Lloop1:
312 add $TX[0]#b,$YY#b
313 movl ($dat,$YY,4),$TY#d
314 movl $TX[0]#d,($dat,$YY,4)
315 movl $TY#d,($dat,$XX[0],4)
316 add $TY#b,$TX[0]#b
317 inc $XX[0]#b
318 movl ($dat,$TX[0],4),$TY#d
319 movl ($dat,$XX[0],4),$TX[0]#d
320 xorb ($inp),$TY#b
321 movb $TY#b,($out,$inp)
322 lea 1($inp),$inp
323 dec $len
324 jnz .Lloop1
325 jmp .Lexit
326
327.align 16
328.LRC4_CHAR:
329 add \$1,$XX[0]#b
330 movzb ($dat,$XX[0]),$TX[0]#d
331 test \$-8,$len
332 jz .Lcloop1
333 jmp .Lcloop8
334.align 16
335.Lcloop8:
336 mov ($inp),%r8d
337 mov 4($inp),%r9d
338___
339# unroll 2x4-wise, because 64-bit rotates kill Intel P4...
340for ($i=0;$i<4;$i++) {
341$code.=<<___;
342 add $TX[0]#b,$YY#b
343 lea 1($XX[0]),$XX[1]
344 movzb ($dat,$YY),$TY#d
345 movzb $XX[1]#b,$XX[1]#d
346 movzb ($dat,$XX[1]),$TX[1]#d
347 movb $TX[0]#b,($dat,$YY)
348 cmp $XX[1],$YY
349 movb $TY#b,($dat,$XX[0])
350 jne .Lcmov$i # Intel cmov is sloooow...
351 mov $TX[0],$TX[1]
352.Lcmov$i:
353 add $TX[0]#b,$TY#b
354 xor ($dat,$TY),%r8b
355 ror \$8,%r8d
356___
357push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
358}
359for ($i=4;$i<8;$i++) {
360$code.=<<___;
361 add $TX[0]#b,$YY#b
362 lea 1($XX[0]),$XX[1]
363 movzb ($dat,$YY),$TY#d
364 movzb $XX[1]#b,$XX[1]#d
365 movzb ($dat,$XX[1]),$TX[1]#d
366 movb $TX[0]#b,($dat,$YY)
367 cmp $XX[1],$YY
368 movb $TY#b,($dat,$XX[0])
369 jne .Lcmov$i # Intel cmov is sloooow...
370 mov $TX[0],$TX[1]
371.Lcmov$i:
372 add $TX[0]#b,$TY#b
373 xor ($dat,$TY),%r9b
374 ror \$8,%r9d
375___
376push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
377}
378$code.=<<___;
379 lea -8($len),$len
380 mov %r8d,($out)
381 lea 8($inp),$inp
382 mov %r9d,4($out)
383 lea 8($out),$out
384
385 test \$-8,$len
386 jnz .Lcloop8
387 cmp \$0,$len
388 jne .Lcloop1
389 jmp .Lexit
390___
391$code.=<<___;
392.align 16
393.Lcloop1:
394 add $TX[0]#b,$YY#b
395 movzb $YY#b,$YY#d
396 movzb ($dat,$YY),$TY#d
397 movb $TX[0]#b,($dat,$YY)
398 movb $TY#b,($dat,$XX[0])
399 add $TX[0]#b,$TY#b
400 add \$1,$XX[0]#b
401 movzb $TY#b,$TY#d
402 movzb $XX[0]#b,$XX[0]#d
403 movzb ($dat,$TY),$TY#d
404 movzb ($dat,$XX[0]),$TX[0]#d
405 xorb ($inp),$TY#b
406 lea 1($inp),$inp
407 movb $TY#b,($out)
408 lea 1($out),$out
409 sub \$1,$len
410 jnz .Lcloop1
411 jmp .Lexit
412
413.align 16
414.Lexit:
415 sub \$1,$XX[0]#b
416 movl $XX[0]#d,-8($dat)
417 movl $YY#d,-4($dat)
418
419 mov (%rsp),%r13
420 mov 8(%rsp),%r12
421 mov 16(%rsp),%rbx
422 add \$24,%rsp
423.Lepilogue:
424 ret
425.size RC4,.-RC4
426___
427}
428
429$idx="%r8";
430$ido="%r9";
431
432$code.=<<___;
433.globl private_RC4_set_key
434.type private_RC4_set_key,\@function,3
435.align 16
436private_RC4_set_key:
437 lea 8($dat),$dat
438 lea ($inp,$len),$inp
439 neg $len
440 mov $len,%rcx
441 xor %eax,%eax
442 xor $ido,$ido
443 xor %r10,%r10
444 xor %r11,%r11
445
446 mov OPENSSL_ia32cap_P(%rip),$idx#d
447 bt \$20,$idx#d # RC4_CHAR?
448 jc .Lc1stloop
449 jmp .Lw1stloop
450
451.align 16
452.Lw1stloop:
453 mov %eax,($dat,%rax,4)
454 add \$1,%al
455 jnc .Lw1stloop
456
457 xor $ido,$ido
458 xor $idx,$idx
459.align 16
460.Lw2ndloop:
461 mov ($dat,$ido,4),%r10d
462 add ($inp,$len,1),$idx#b
463 add %r10b,$idx#b
464 add \$1,$len
465 mov ($dat,$idx,4),%r11d
466 cmovz %rcx,$len
467 mov %r10d,($dat,$idx,4)
468 mov %r11d,($dat,$ido,4)
469 add \$1,$ido#b
470 jnc .Lw2ndloop
471 jmp .Lexit_key
472
473.align 16
474.Lc1stloop:
475 mov %al,($dat,%rax)
476 add \$1,%al
477 jnc .Lc1stloop
478
479 xor $ido,$ido
480 xor $idx,$idx
481.align 16
482.Lc2ndloop:
483 mov ($dat,$ido),%r10b
484 add ($inp,$len),$idx#b
485 add %r10b,$idx#b
486 add \$1,$len
487 mov ($dat,$idx),%r11b
488 jnz .Lcnowrap
489 mov %rcx,$len
490.Lcnowrap:
491 mov %r10b,($dat,$idx)
492 mov %r11b,($dat,$ido)
493 add \$1,$ido#b
494 jnc .Lc2ndloop
495 movl \$-1,256($dat)
496
497.align 16
498.Lexit_key:
499 xor %eax,%eax
500 mov %eax,-8($dat)
501 mov %eax,-4($dat)
502 ret
503.size private_RC4_set_key,.-private_RC4_set_key
504
505.globl RC4_options
506.type RC4_options,\@abi-omnipotent
507.align 16
508RC4_options:
509 lea .Lopts(%rip),%rax
510 mov OPENSSL_ia32cap_P(%rip),%edx
511 bt \$20,%edx
512 jc .L8xchar
513 bt \$30,%edx
514 jnc .Ldone
515 add \$25,%rax
516 ret
517.L8xchar:
518 add \$12,%rax
519.Ldone:
520 ret
521.align 64
522.Lopts:
523.asciz "rc4(8x,int)"
524.asciz "rc4(8x,char)"
525.asciz "rc4(16x,int)"
526.asciz "RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
527.align 64
528.size RC4_options,.-RC4_options
529___
530
531# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
532# CONTEXT *context,DISPATCHER_CONTEXT *disp)
533if ($win64) {
534$rec="%rcx";
535$frame="%rdx";
536$context="%r8";
537$disp="%r9";
538
539$code.=<<___;
540.extern __imp_RtlVirtualUnwind
541.type stream_se_handler,\@abi-omnipotent
542.align 16
543stream_se_handler:
544 push %rsi
545 push %rdi
546 push %rbx
547 push %rbp
548 push %r12
549 push %r13
550 push %r14
551 push %r15
552 pushfq
553 sub \$64,%rsp
554
555 mov 120($context),%rax # pull context->Rax
556 mov 248($context),%rbx # pull context->Rip
557
558 lea .Lprologue(%rip),%r10
559 cmp %r10,%rbx # context->Rip<prologue label
560 jb .Lin_prologue
561
562 mov 152($context),%rax # pull context->Rsp
563
564 lea .Lepilogue(%rip),%r10
565 cmp %r10,%rbx # context->Rip>=epilogue label
566 jae .Lin_prologue
567
568 lea 24(%rax),%rax
569
570 mov -8(%rax),%rbx
571 mov -16(%rax),%r12
572 mov -24(%rax),%r13
573 mov %rbx,144($context) # restore context->Rbx
574 mov %r12,216($context) # restore context->R12
575 mov %r13,224($context) # restore context->R13
576
577.Lin_prologue:
578 mov 8(%rax),%rdi
579 mov 16(%rax),%rsi
580 mov %rax,152($context) # restore context->Rsp
581 mov %rsi,168($context) # restore context->Rsi
582 mov %rdi,176($context) # restore context->Rdi
583
584 jmp .Lcommon_seh_exit
585.size stream_se_handler,.-stream_se_handler
586
587.type key_se_handler,\@abi-omnipotent
588.align 16
589key_se_handler:
590 push %rsi
591 push %rdi
592 push %rbx
593 push %rbp
594 push %r12
595 push %r13
596 push %r14
597 push %r15
598 pushfq
599 sub \$64,%rsp
600
601 mov 152($context),%rax # pull context->Rsp
602 mov 8(%rax),%rdi
603 mov 16(%rax),%rsi
604 mov %rsi,168($context) # restore context->Rsi
605 mov %rdi,176($context) # restore context->Rdi
606
607.Lcommon_seh_exit:
608
609 mov 40($disp),%rdi # disp->ContextRecord
610 mov $context,%rsi # context
611 mov \$154,%ecx # sizeof(CONTEXT)
612 .long 0xa548f3fc # cld; rep movsq
613
614 mov $disp,%rsi
615 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
616 mov 8(%rsi),%rdx # arg2, disp->ImageBase
617 mov 0(%rsi),%r8 # arg3, disp->ControlPc
618 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
619 mov 40(%rsi),%r10 # disp->ContextRecord
620 lea 56(%rsi),%r11 # &disp->HandlerData
621 lea 24(%rsi),%r12 # &disp->EstablisherFrame
622 mov %r10,32(%rsp) # arg5
623 mov %r11,40(%rsp) # arg6
624 mov %r12,48(%rsp) # arg7
625 mov %rcx,56(%rsp) # arg8, (NULL)
626 call *__imp_RtlVirtualUnwind(%rip)
627
628 mov \$1,%eax # ExceptionContinueSearch
629 add \$64,%rsp
630 popfq
631 pop %r15
632 pop %r14
633 pop %r13
634 pop %r12
635 pop %rbp
636 pop %rbx
637 pop %rdi
638 pop %rsi
639 ret
640.size key_se_handler,.-key_se_handler
641
642.section .pdata
643.align 4
644 .rva .LSEH_begin_RC4
645 .rva .LSEH_end_RC4
646 .rva .LSEH_info_RC4
647
648 .rva .LSEH_begin_private_RC4_set_key
649 .rva .LSEH_end_private_RC4_set_key
650 .rva .LSEH_info_private_RC4_set_key
651
652.section .xdata
653.align 8
654.LSEH_info_RC4:
655 .byte 9,0,0,0
656 .rva stream_se_handler
657.LSEH_info_private_RC4_set_key:
658 .byte 9,0,0,0
659 .rva key_se_handler
660___
661}
662
663sub reg_part {
664my ($reg,$conv)=@_;
665 if ($reg =~ /%r[0-9]+/) { $reg .= $conv; }
666 elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; }
667 elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; }
668 elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; }
669 return $reg;
670}
671
672$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
673$code =~ s/\`([^\`]*)\`/eval $1/gem;
674
675print $code;
676
677close STDOUT;
diff --git a/src/lib/libcrypto/rc4/rc4.c b/src/lib/libcrypto/rc4/rc4.c
new file mode 100644
index 0000000000..c900b26055
--- /dev/null
+++ b/src/lib/libcrypto/rc4/rc4.c
@@ -0,0 +1,193 @@
1/* crypto/rc4/rc4.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include <stdlib.h>
61#include <string.h>
62#include <openssl/rc4.h>
63#include <openssl/evp.h>
64
65char *usage[]={
66"usage: rc4 args\n",
67"\n",
68" -in arg - input file - default stdin\n",
69" -out arg - output file - default stdout\n",
70" -key key - password\n",
71NULL
72};
73
74int main(int argc, char *argv[])
75 {
76 FILE *in=NULL,*out=NULL;
77 char *infile=NULL,*outfile=NULL,*keystr=NULL;
78 RC4_KEY key;
79 char buf[BUFSIZ];
80 int badops=0,i;
81 char **pp;
82 unsigned char md[MD5_DIGEST_LENGTH];
83
84 argc--;
85 argv++;
86 while (argc >= 1)
87 {
88 if (strcmp(*argv,"-in") == 0)
89 {
90 if (--argc < 1) goto bad;
91 infile= *(++argv);
92 }
93 else if (strcmp(*argv,"-out") == 0)
94 {
95 if (--argc < 1) goto bad;
96 outfile= *(++argv);
97 }
98 else if (strcmp(*argv,"-key") == 0)
99 {
100 if (--argc < 1) goto bad;
101 keystr= *(++argv);
102 }
103 else
104 {
105 fprintf(stderr,"unknown option %s\n",*argv);
106 badops=1;
107 break;
108 }
109 argc--;
110 argv++;
111 }
112
113 if (badops)
114 {
115bad:
116 for (pp=usage; (*pp != NULL); pp++)
117 fprintf(stderr,"%s",*pp);
118 exit(1);
119 }
120
121 if (infile == NULL)
122 in=stdin;
123 else
124 {
125 in=fopen(infile,"r");
126 if (in == NULL)
127 {
128 perror("open");
129 exit(1);
130 }
131
132 }
133 if (outfile == NULL)
134 out=stdout;
135 else
136 {
137 out=fopen(outfile,"w");
138 if (out == NULL)
139 {
140 perror("open");
141 exit(1);
142 }
143 }
144
145#ifdef OPENSSL_SYS_MSDOS
146 /* This should set the file to binary mode. */
147 {
148#include <fcntl.h>
149 setmode(fileno(in),O_BINARY);
150 setmode(fileno(out),O_BINARY);
151 }
152#endif
153
154 if (keystr == NULL)
155 { /* get key */
156 i=EVP_read_pw_string(buf,BUFSIZ,"Enter RC4 password:",0);
157 if (i != 0)
158 {
159 OPENSSL_cleanse(buf,BUFSIZ);
160 fprintf(stderr,"bad password read\n");
161 exit(1);
162 }
163 keystr=buf;
164 }
165
166 EVP_Digest((unsigned char *)keystr,strlen(keystr),md,NULL,EVP_md5(),NULL);
167 OPENSSL_cleanse(keystr,strlen(keystr));
168 RC4_set_key(&key,MD5_DIGEST_LENGTH,md);
169
170 for(;;)
171 {
172 i=fread(buf,1,BUFSIZ,in);
173 if (i == 0) break;
174 if (i < 0)
175 {
176 perror("read");
177 exit(1);
178 }
179 RC4(&key,(unsigned int)i,(unsigned char *)buf,
180 (unsigned char *)buf);
181 i=fwrite(buf,(unsigned int)i,1,out);
182 if (i != 1)
183 {
184 perror("write");
185 exit(1);
186 }
187 }
188 fclose(out);
189 fclose(in);
190 exit(0);
191 return(1);
192 }
193
diff --git a/src/lib/libcrypto/rc4/rc4.h b/src/lib/libcrypto/rc4/rc4.h
new file mode 100644
index 0000000000..88ceb46bc5
--- /dev/null
+++ b/src/lib/libcrypto/rc4/rc4.h
@@ -0,0 +1,90 @@
1/* crypto/rc4/rc4.h */
2/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#ifndef HEADER_RC4_H
60#define HEADER_RC4_H
61
62#include <openssl/opensslconf.h> /* OPENSSL_NO_RC4, RC4_INT */
63#ifdef OPENSSL_NO_RC4
64#error RC4 is disabled.
65#endif
66
67#include <stddef.h>
68
69#ifdef __cplusplus
70extern "C" {
71#endif
72
73typedef struct rc4_key_st
74 {
75 RC4_INT x,y;
76 RC4_INT data[256];
77 } RC4_KEY;
78
79
80const char *RC4_options(void);
81void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
82void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
83void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
84 unsigned char *outdata);
85
86#ifdef __cplusplus
87}
88#endif
89
90#endif
diff --git a/src/lib/libcrypto/rc4/rc4_enc.c b/src/lib/libcrypto/rc4/rc4_enc.c
new file mode 100644
index 0000000000..8c4fc6c7a3
--- /dev/null
+++ b/src/lib/libcrypto/rc4/rc4_enc.c
@@ -0,0 +1,315 @@
1/* crypto/rc4/rc4_enc.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <openssl/rc4.h>
60#include "rc4_locl.h"
61
62/* RC4 as implemented from a posting from
63 * Newsgroups: sci.crypt
64 * From: sterndark@netcom.com (David Sterndark)
65 * Subject: RC4 Algorithm revealed.
66 * Message-ID: <sternCvKL4B.Hyy@netcom.com>
67 * Date: Wed, 14 Sep 1994 06:35:31 GMT
68 */
69
70void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
71 unsigned char *outdata)
72 {
73 register RC4_INT *d;
74 register RC4_INT x,y,tx,ty;
75 size_t i;
76
77 x=key->x;
78 y=key->y;
79 d=key->data;
80
81#if defined(RC4_CHUNK)
82 /*
83 * The original reason for implementing this(*) was the fact that
84 * pre-21164a Alpha CPUs don't have byte load/store instructions
85 * and e.g. a byte store has to be done with 64-bit load, shift,
86 * and, or and finally 64-bit store. Peaking data and operating
87 * at natural word size made it possible to reduce amount of
88 * instructions as well as to perform early read-ahead without
89 * suffering from RAW (read-after-write) hazard. This resulted
90 * in ~40%(**) performance improvement on 21064 box with gcc.
91 * But it's not only Alpha users who win here:-) Thanks to the
92 * early-n-wide read-ahead this implementation also exhibits
93 * >40% speed-up on SPARC and 20-30% on 64-bit MIPS (depending
94 * on sizeof(RC4_INT)).
95 *
96 * (*) "this" means code which recognizes the case when input
97 * and output pointers appear to be aligned at natural CPU
98 * word boundary
99 * (**) i.e. according to 'apps/openssl speed rc4' benchmark,
100 * crypto/rc4/rc4speed.c exhibits almost 70% speed-up...
101 *
102 * Cavets.
103 *
104 * - RC4_CHUNK="unsigned long long" should be a #1 choice for
105 * UltraSPARC. Unfortunately gcc generates very slow code
106 * (2.5-3 times slower than one generated by Sun's WorkShop
107 * C) and therefore gcc (at least 2.95 and earlier) should
108 * always be told that RC4_CHUNK="unsigned long".
109 *
110 * <appro@fy.chalmers.se>
111 */
112
113# define RC4_STEP ( \
114 x=(x+1) &0xff, \
115 tx=d[x], \
116 y=(tx+y)&0xff, \
117 ty=d[y], \
118 d[y]=tx, \
119 d[x]=ty, \
120 (RC4_CHUNK)d[(tx+ty)&0xff]\
121 )
122
123 if ( ( ((size_t)indata & (sizeof(RC4_CHUNK)-1)) |
124 ((size_t)outdata & (sizeof(RC4_CHUNK)-1)) ) == 0 )
125 {
126 RC4_CHUNK ichunk,otp;
127 const union { long one; char little; } is_endian = {1};
128
129 /*
130 * I reckon we can afford to implement both endian
131 * cases and to decide which way to take at run-time
132 * because the machine code appears to be very compact
133 * and redundant 1-2KB is perfectly tolerable (i.e.
134 * in case the compiler fails to eliminate it:-). By
135 * suggestion from Terrel Larson <terr@terralogic.net>
136 * who also stands for the is_endian union:-)
137 *
138 * Special notes.
139 *
140 * - is_endian is declared automatic as doing otherwise
141 * (declaring static) prevents gcc from eliminating
142 * the redundant code;
143 * - compilers (those I've tried) don't seem to have
144 * problems eliminating either the operators guarded
145 * by "if (sizeof(RC4_CHUNK)==8)" or the condition
146 * expressions themselves so I've got 'em to replace
147 * corresponding #ifdefs from the previous version;
148 * - I chose to let the redundant switch cases when
149 * sizeof(RC4_CHUNK)!=8 be (were also #ifdefed
150 * before);
151 * - in case you wonder "&(sizeof(RC4_CHUNK)*8-1)" in
152 * [LB]ESHFT guards against "shift is out of range"
153 * warnings when sizeof(RC4_CHUNK)!=8
154 *
155 * <appro@fy.chalmers.se>
156 */
157 if (!is_endian.little)
158 { /* BIG-ENDIAN CASE */
159# define BESHFT(c) (((sizeof(RC4_CHUNK)-(c)-1)*8)&(sizeof(RC4_CHUNK)*8-1))
160 for (;len&(0-sizeof(RC4_CHUNK));len-=sizeof(RC4_CHUNK))
161 {
162 ichunk = *(RC4_CHUNK *)indata;
163 otp = RC4_STEP<<BESHFT(0);
164 otp |= RC4_STEP<<BESHFT(1);
165 otp |= RC4_STEP<<BESHFT(2);
166 otp |= RC4_STEP<<BESHFT(3);
167 if (sizeof(RC4_CHUNK)==8)
168 {
169 otp |= RC4_STEP<<BESHFT(4);
170 otp |= RC4_STEP<<BESHFT(5);
171 otp |= RC4_STEP<<BESHFT(6);
172 otp |= RC4_STEP<<BESHFT(7);
173 }
174 *(RC4_CHUNK *)outdata = otp^ichunk;
175 indata += sizeof(RC4_CHUNK);
176 outdata += sizeof(RC4_CHUNK);
177 }
178 if (len)
179 {
180 RC4_CHUNK mask=(RC4_CHUNK)-1, ochunk;
181
182 ichunk = *(RC4_CHUNK *)indata;
183 ochunk = *(RC4_CHUNK *)outdata;
184 otp = 0;
185 i = BESHFT(0);
186 mask <<= (sizeof(RC4_CHUNK)-len)<<3;
187 switch (len&(sizeof(RC4_CHUNK)-1))
188 {
189 case 7: otp = RC4_STEP<<i, i-=8;
190 case 6: otp |= RC4_STEP<<i, i-=8;
191 case 5: otp |= RC4_STEP<<i, i-=8;
192 case 4: otp |= RC4_STEP<<i, i-=8;
193 case 3: otp |= RC4_STEP<<i, i-=8;
194 case 2: otp |= RC4_STEP<<i, i-=8;
195 case 1: otp |= RC4_STEP<<i, i-=8;
196 case 0: ; /*
197 * it's never the case,
198 * but it has to be here
199 * for ultrix?
200 */
201 }
202 ochunk &= ~mask;
203 ochunk |= (otp^ichunk) & mask;
204 *(RC4_CHUNK *)outdata = ochunk;
205 }
206 key->x=x;
207 key->y=y;
208 return;
209 }
210 else
211 { /* LITTLE-ENDIAN CASE */
212# define LESHFT(c) (((c)*8)&(sizeof(RC4_CHUNK)*8-1))
213 for (;len&(0-sizeof(RC4_CHUNK));len-=sizeof(RC4_CHUNK))
214 {
215 ichunk = *(RC4_CHUNK *)indata;
216 otp = RC4_STEP;
217 otp |= RC4_STEP<<8;
218 otp |= RC4_STEP<<16;
219 otp |= RC4_STEP<<24;
220 if (sizeof(RC4_CHUNK)==8)
221 {
222 otp |= RC4_STEP<<LESHFT(4);
223 otp |= RC4_STEP<<LESHFT(5);
224 otp |= RC4_STEP<<LESHFT(6);
225 otp |= RC4_STEP<<LESHFT(7);
226 }
227 *(RC4_CHUNK *)outdata = otp^ichunk;
228 indata += sizeof(RC4_CHUNK);
229 outdata += sizeof(RC4_CHUNK);
230 }
231 if (len)
232 {
233 RC4_CHUNK mask=(RC4_CHUNK)-1, ochunk;
234
235 ichunk = *(RC4_CHUNK *)indata;
236 ochunk = *(RC4_CHUNK *)outdata;
237 otp = 0;
238 i = 0;
239 mask >>= (sizeof(RC4_CHUNK)-len)<<3;
240 switch (len&(sizeof(RC4_CHUNK)-1))
241 {
242 case 7: otp = RC4_STEP, i+=8;
243 case 6: otp |= RC4_STEP<<i, i+=8;
244 case 5: otp |= RC4_STEP<<i, i+=8;
245 case 4: otp |= RC4_STEP<<i, i+=8;
246 case 3: otp |= RC4_STEP<<i, i+=8;
247 case 2: otp |= RC4_STEP<<i, i+=8;
248 case 1: otp |= RC4_STEP<<i, i+=8;
249 case 0: ; /*
250 * it's never the case,
251 * but it has to be here
252 * for ultrix?
253 */
254 }
255 ochunk &= ~mask;
256 ochunk |= (otp^ichunk) & mask;
257 *(RC4_CHUNK *)outdata = ochunk;
258 }
259 key->x=x;
260 key->y=y;
261 return;
262 }
263 }
264#endif
265#define LOOP(in,out) \
266 x=((x+1)&0xff); \
267 tx=d[x]; \
268 y=(tx+y)&0xff; \
269 d[x]=ty=d[y]; \
270 d[y]=tx; \
271 (out) = d[(tx+ty)&0xff]^ (in);
272
273#ifndef RC4_INDEX
274#define RC4_LOOP(a,b,i) LOOP(*((a)++),*((b)++))
275#else
276#define RC4_LOOP(a,b,i) LOOP(a[i],b[i])
277#endif
278
279 i=len>>3;
280 if (i)
281 {
282 for (;;)
283 {
284 RC4_LOOP(indata,outdata,0);
285 RC4_LOOP(indata,outdata,1);
286 RC4_LOOP(indata,outdata,2);
287 RC4_LOOP(indata,outdata,3);
288 RC4_LOOP(indata,outdata,4);
289 RC4_LOOP(indata,outdata,5);
290 RC4_LOOP(indata,outdata,6);
291 RC4_LOOP(indata,outdata,7);
292#ifdef RC4_INDEX
293 indata+=8;
294 outdata+=8;
295#endif
296 if (--i == 0) break;
297 }
298 }
299 i=len&0x07;
300 if (i)
301 {
302 for (;;)
303 {
304 RC4_LOOP(indata,outdata,0); if (--i == 0) break;
305 RC4_LOOP(indata,outdata,1); if (--i == 0) break;
306 RC4_LOOP(indata,outdata,2); if (--i == 0) break;
307 RC4_LOOP(indata,outdata,3); if (--i == 0) break;
308 RC4_LOOP(indata,outdata,4); if (--i == 0) break;
309 RC4_LOOP(indata,outdata,5); if (--i == 0) break;
310 RC4_LOOP(indata,outdata,6); if (--i == 0) break;
311 }
312 }
313 key->x=x;
314 key->y=y;
315 }
diff --git a/src/lib/libcrypto/rc4/rc4_locl.h b/src/lib/libcrypto/rc4/rc4_locl.h
new file mode 100644
index 0000000000..c712e1632e
--- /dev/null
+++ b/src/lib/libcrypto/rc4/rc4_locl.h
@@ -0,0 +1,5 @@
1#ifndef HEADER_RC4_LOCL_H
2#define HEADER_RC4_LOCL_H
3#include <openssl/opensslconf.h>
4#include <cryptlib.h>
5#endif
diff --git a/src/lib/libcrypto/rc4/rc4_skey.c b/src/lib/libcrypto/rc4/rc4_skey.c
new file mode 100644
index 0000000000..fda27636e7
--- /dev/null
+++ b/src/lib/libcrypto/rc4/rc4_skey.c
@@ -0,0 +1,116 @@
1/* crypto/rc4/rc4_skey.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <openssl/rc4.h>
60#include "rc4_locl.h"
61#include <openssl/opensslv.h>
62
63const char RC4_version[]="RC4" OPENSSL_VERSION_PTEXT;
64
65const char *RC4_options(void)
66 {
67#ifdef RC4_INDEX
68 if (sizeof(RC4_INT) == 1)
69 return("rc4(idx,char)");
70 else
71 return("rc4(idx,int)");
72#else
73 if (sizeof(RC4_INT) == 1)
74 return("rc4(ptr,char)");
75 else
76 return("rc4(ptr,int)");
77#endif
78 }
79
80/* RC4 as implemented from a posting from
81 * Newsgroups: sci.crypt
82 * From: sterndark@netcom.com (David Sterndark)
83 * Subject: RC4 Algorithm revealed.
84 * Message-ID: <sternCvKL4B.Hyy@netcom.com>
85 * Date: Wed, 14 Sep 1994 06:35:31 GMT
86 */
87
88void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
89 {
90 register RC4_INT tmp;
91 register int id1,id2;
92 register RC4_INT *d;
93 unsigned int i;
94
95 d= &(key->data[0]);
96 key->x = 0;
97 key->y = 0;
98 id1=id2=0;
99
100#define SK_LOOP(d,n) { \
101 tmp=d[(n)]; \
102 id2 = (data[id1] + tmp + id2) & 0xff; \
103 if (++id1 == len) id1=0; \
104 d[(n)]=d[id2]; \
105 d[id2]=tmp; }
106
107 for (i=0; i < 256; i++) d[i]=i;
108 for (i=0; i < 256; i+=4)
109 {
110 SK_LOOP(d,i+0);
111 SK_LOOP(d,i+1);
112 SK_LOOP(d,i+2);
113 SK_LOOP(d,i+3);
114 }
115 }
116
diff --git a/src/lib/libcrypto/rc4/rc4speed.c b/src/lib/libcrypto/rc4/rc4speed.c
new file mode 100644
index 0000000000..0ebd38123d
--- /dev/null
+++ b/src/lib/libcrypto/rc4/rc4speed.c
@@ -0,0 +1,253 @@
1/* crypto/rc4/rc4speed.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59/* 11-Sep-92 Andrew Daviel Support for Silicon Graphics IRIX added */
60/* 06-Apr-92 Luke Brennan Support for VMS and add extra signal calls */
61
62#if !defined(OPENSSL_SYS_MSDOS) && (!defined(OPENSSL_SYS_VMS) || defined(__DECC)) && !defined(OPENSSL_SYS_MACOSX)
63#define TIMES
64#endif
65
66#include <stdio.h>
67
68#include <openssl/e_os2.h>
69#include OPENSSL_UNISTD_IO
70OPENSSL_DECLARE_EXIT
71
72#ifndef OPENSSL_SYS_NETWARE
73#include <signal.h>
74#endif
75
76#ifndef _IRIX
77#include <time.h>
78#endif
79#ifdef TIMES
80#include <sys/types.h>
81#include <sys/times.h>
82#endif
83
84/* Depending on the VMS version, the tms structure is perhaps defined.
85 The __TMS macro will show if it was. If it wasn't defined, we should
86 undefine TIMES, since that tells the rest of the program how things
87 should be handled. -- Richard Levitte */
88#if defined(OPENSSL_SYS_VMS_DECC) && !defined(__TMS)
89#undef TIMES
90#endif
91
92#ifndef TIMES
93#include <sys/timeb.h>
94#endif
95
96#if defined(sun) || defined(__ultrix)
97#define _POSIX_SOURCE
98#include <limits.h>
99#include <sys/param.h>
100#endif
101
102#include <openssl/rc4.h>
103
104/* The following if from times(3) man page. It may need to be changed */
105#ifndef HZ
106#ifndef CLK_TCK
107#define HZ 100.0
108#else /* CLK_TCK */
109#define HZ ((double)CLK_TCK)
110#endif
111#endif
112
113#define BUFSIZE ((long)1024)
114long run=0;
115
116double Time_F(int s);
117#ifdef SIGALRM
118#if defined(__STDC__) || defined(sgi) || defined(_AIX)
119#define SIGRETTYPE void
120#else
121#define SIGRETTYPE int
122#endif
123
124SIGRETTYPE sig_done(int sig);
125SIGRETTYPE sig_done(int sig)
126 {
127 signal(SIGALRM,sig_done);
128 run=0;
129#ifdef LINT
130 sig=sig;
131#endif
132 }
133#endif
134
135#define START 0
136#define STOP 1
137
138double Time_F(int s)
139 {
140 double ret;
141#ifdef TIMES
142 static struct tms tstart,tend;
143
144 if (s == START)
145 {
146 times(&tstart);
147 return(0);
148 }
149 else
150 {
151 times(&tend);
152 ret=((double)(tend.tms_utime-tstart.tms_utime))/HZ;
153 return((ret == 0.0)?1e-6:ret);
154 }
155#else /* !times() */
156 static struct timeb tstart,tend;
157 long i;
158
159 if (s == START)
160 {
161 ftime(&tstart);
162 return(0);
163 }
164 else
165 {
166 ftime(&tend);
167 i=(long)tend.millitm-(long)tstart.millitm;
168 ret=((double)(tend.time-tstart.time))+((double)i)/1e3;
169 return((ret == 0.0)?1e-6:ret);
170 }
171#endif
172 }
173
174int main(int argc, char **argv)
175 {
176 long count;
177 static unsigned char buf[BUFSIZE];
178 static unsigned char key[] ={
179 0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,
180 0xfe,0xdc,0xba,0x98,0x76,0x54,0x32,0x10,
181 };
182 RC4_KEY sch;
183 double a,b,c,d;
184#ifndef SIGALRM
185 long ca,cb,cc;
186#endif
187
188#ifndef TIMES
189 printf("To get the most accurate results, try to run this\n");
190 printf("program when this computer is idle.\n");
191#endif
192
193#ifndef SIGALRM
194 printf("First we calculate the approximate speed ...\n");
195 RC4_set_key(&sch,16,key);
196 count=10;
197 do {
198 long i;
199 unsigned long data[2];
200
201 count*=2;
202 Time_F(START);
203 for (i=count; i; i--)
204 RC4(&sch,8,buf,buf);
205 d=Time_F(STOP);
206 } while (d < 3.0);
207 ca=count/512;
208 cc=count*8/BUFSIZE+1;
209 printf("Doing RC4_set_key %ld times\n",ca);
210#define COND(d) (count != (d))
211#define COUNT(d) (d)
212#else
213#define COND(c) (run)
214#define COUNT(d) (count)
215 signal(SIGALRM,sig_done);
216 printf("Doing RC4_set_key for 10 seconds\n");
217 alarm(10);
218#endif
219
220 Time_F(START);
221 for (count=0,run=1; COND(ca); count+=4)
222 {
223 RC4_set_key(&sch,16,key);
224 RC4_set_key(&sch,16,key);
225 RC4_set_key(&sch,16,key);
226 RC4_set_key(&sch,16,key);
227 }
228 d=Time_F(STOP);
229 printf("%ld RC4_set_key's in %.2f seconds\n",count,d);
230 a=((double)COUNT(ca))/d;
231
232#ifdef SIGALRM
233 printf("Doing RC4 on %ld byte blocks for 10 seconds\n",BUFSIZE);
234 alarm(10);
235#else
236 printf("Doing RC4 %ld times on %ld byte blocks\n",cc,BUFSIZE);
237#endif
238 Time_F(START);
239 for (count=0,run=1; COND(cc); count++)
240 RC4(&sch,BUFSIZE,buf,buf);
241 d=Time_F(STOP);
242 printf("%ld RC4's of %ld byte blocks in %.2f second\n",
243 count,BUFSIZE,d);
244 c=((double)COUNT(cc)*BUFSIZE)/d;
245
246 printf("RC4 set_key per sec = %12.2f (%9.3fuS)\n",a,1.0e6/a);
247 printf("RC4 bytes per sec = %12.2f (%9.3fuS)\n",c,8.0e6/c);
248 exit(0);
249#if defined(LINT) || defined(OPENSSL_SYS_MSDOS)
250 return(0);
251#endif
252 }
253
diff --git a/src/lib/libcrypto/rc4/rc4test.c b/src/lib/libcrypto/rc4/rc4test.c
new file mode 100644
index 0000000000..4312605ccb
--- /dev/null
+++ b/src/lib/libcrypto/rc4/rc4test.c
@@ -0,0 +1,242 @@
1/* crypto/rc4/rc4test.c */
2/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
3 * All rights reserved.
4 *
5 * This package is an SSL implementation written
6 * by Eric Young (eay@cryptsoft.com).
7 * The implementation was written so as to conform with Netscapes SSL.
8 *
9 * This library is free for commercial and non-commercial use as long as
10 * the following conditions are aheared to. The following conditions
11 * apply to all code found in this distribution, be it the RC4, RSA,
12 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
13 * included with this distribution is covered by the same copyright terms
14 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 *
16 * Copyright remains Eric Young's, and as such any Copyright notices in
17 * the code are not to be removed.
18 * If this package is used in a product, Eric Young should be given attribution
19 * as the author of the parts of the library used.
20 * This can be in the form of a textual message at program startup or
21 * in documentation (online or textual) provided with the package.
22 *
23 * Redistribution and use in source and binary forms, with or without
24 * modification, are permitted provided that the following conditions
25 * are met:
26 * 1. Redistributions of source code must retain the copyright
27 * notice, this list of conditions and the following disclaimer.
28 * 2. Redistributions in binary form must reproduce the above copyright
29 * notice, this list of conditions and the following disclaimer in the
30 * documentation and/or other materials provided with the distribution.
31 * 3. All advertising materials mentioning features or use of this software
32 * must display the following acknowledgement:
33 * "This product includes cryptographic software written by
34 * Eric Young (eay@cryptsoft.com)"
35 * The word 'cryptographic' can be left out if the rouines from the library
36 * being used are not cryptographic related :-).
37 * 4. If you include any Windows specific code (or a derivative thereof) from
38 * the apps directory (application code) you must include an acknowledgement:
39 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 *
41 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 *
53 * The licence and distribution terms for any publically available version or
54 * derivative of this code cannot be changed. i.e. this code cannot simply be
55 * copied and put under another distribution licence
56 * [including the GNU Public Licence.]
57 */
58
59#include <stdio.h>
60#include <stdlib.h>
61#include <string.h>
62
63#include "../e_os.h"
64
65#ifdef OPENSSL_NO_RC4
66int main(int argc, char *argv[])
67{
68 printf("No RC4 support\n");
69 return(0);
70}
71#else
72#include <openssl/rc4.h>
73#include <openssl/sha.h>
74
75static unsigned char keys[7][30]={
76 {8,0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef},
77 {8,0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef},
78 {8,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
79 {4,0xef,0x01,0x23,0x45},
80 {8,0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef},
81 {4,0xef,0x01,0x23,0x45},
82 };
83
84static unsigned char data_len[7]={8,8,8,20,28,10};
85static unsigned char data[7][30]={
86 {0x01,0x23,0x45,0x67,0x89,0xab,0xcd,0xef,0xff},
87 {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff},
88 {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff},
89 {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
90 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
91 0x00,0x00,0x00,0x00,0xff},
92 {0x12,0x34,0x56,0x78,0x9A,0xBC,0xDE,0xF0,
93 0x12,0x34,0x56,0x78,0x9A,0xBC,0xDE,0xF0,
94 0x12,0x34,0x56,0x78,0x9A,0xBC,0xDE,0xF0,
95 0x12,0x34,0x56,0x78,0xff},
96 {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff},
97 {0},
98 };
99
100static unsigned char output[7][30]={
101 {0x75,0xb7,0x87,0x80,0x99,0xe0,0xc5,0x96,0x00},
102 {0x74,0x94,0xc2,0xe7,0x10,0x4b,0x08,0x79,0x00},
103 {0xde,0x18,0x89,0x41,0xa3,0x37,0x5d,0x3a,0x00},
104 {0xd6,0xa1,0x41,0xa7,0xec,0x3c,0x38,0xdf,
105 0xbd,0x61,0x5a,0x11,0x62,0xe1,0xc7,0xba,
106 0x36,0xb6,0x78,0x58,0x00},
107 {0x66,0xa0,0x94,0x9f,0x8a,0xf7,0xd6,0x89,
108 0x1f,0x7f,0x83,0x2b,0xa8,0x33,0xc0,0x0c,
109 0x89,0x2e,0xbe,0x30,0x14,0x3c,0xe2,0x87,
110 0x40,0x01,0x1e,0xcf,0x00},
111 {0xd6,0xa1,0x41,0xa7,0xec,0x3c,0x38,0xdf,0xbd,0x61,0x00},
112 {0},
113 };
114
115int main(int argc, char *argv[])
116 {
117 int i,err=0;
118 int j;
119 unsigned char *p;
120 RC4_KEY key;
121 unsigned char obuf[512];
122
123#if !defined(OPENSSL_PIC)
124 void OPENSSL_cpuid_setup(void);
125
126 OPENSSL_cpuid_setup();
127#endif
128
129 for (i=0; i<6; i++)
130 {
131 RC4_set_key(&key,keys[i][0],&(keys[i][1]));
132 memset(obuf,0x00,sizeof(obuf));
133 RC4(&key,data_len[i],&(data[i][0]),obuf);
134 if (memcmp(obuf,output[i],data_len[i]+1) != 0)
135 {
136 printf("error calculating RC4\n");
137 printf("output:");
138 for (j=0; j<data_len[i]+1; j++)
139 printf(" %02x",obuf[j]);
140 printf("\n");
141 printf("expect:");
142 p= &(output[i][0]);
143 for (j=0; j<data_len[i]+1; j++)
144 printf(" %02x",*(p++));
145 printf("\n");
146 err++;
147 }
148 else
149 printf("test %d ok\n",i);
150 }
151 printf("test end processing ");
152 for (i=0; i<data_len[3]; i++)
153 {
154 RC4_set_key(&key,keys[3][0],&(keys[3][1]));
155 memset(obuf,0x00,sizeof(obuf));
156 RC4(&key,i,&(data[3][0]),obuf);
157 if ((memcmp(obuf,output[3],i) != 0) || (obuf[i] != 0))
158 {
159 printf("error in RC4 length processing\n");
160 printf("output:");
161 for (j=0; j<i+1; j++)
162 printf(" %02x",obuf[j]);
163 printf("\n");
164 printf("expect:");
165 p= &(output[3][0]);
166 for (j=0; j<i; j++)
167 printf(" %02x",*(p++));
168 printf(" 00\n");
169 err++;
170 }
171 else
172 {
173 printf(".");
174 fflush(stdout);
175 }
176 }
177 printf("done\n");
178 printf("test multi-call ");
179 for (i=0; i<data_len[3]; i++)
180 {
181 RC4_set_key(&key,keys[3][0],&(keys[3][1]));
182 memset(obuf,0x00,sizeof(obuf));
183 RC4(&key,i,&(data[3][0]),obuf);
184 RC4(&key,data_len[3]-i,&(data[3][i]),&(obuf[i]));
185 if (memcmp(obuf,output[3],data_len[3]+1) != 0)
186 {
187 printf("error in RC4 multi-call processing\n");
188 printf("output:");
189 for (j=0; j<data_len[3]+1; j++)
190 printf(" %02x",obuf[j]);
191 printf("\n");
192 printf("expect:");
193 p= &(output[3][0]);
194 for (j=0; j<data_len[3]+1; j++)
195 printf(" %02x",*(p++));
196 err++;
197 }
198 else
199 {
200 printf(".");
201 fflush(stdout);
202 }
203 }
204 printf("done\n");
205 printf("bulk test ");
206 { unsigned char buf[513];
207 SHA_CTX c;
208 unsigned char md[SHA_DIGEST_LENGTH];
209 static unsigned char expected[]={
210 0xa4,0x7b,0xcc,0x00,0x3d,0xd0,0xbd,0xe1,0xac,0x5f,
211 0x12,0x1e,0x45,0xbc,0xfb,0x1a,0xa1,0xf2,0x7f,0xc5 };
212
213 RC4_set_key(&key,keys[0][0],&(keys[3][1]));
214 memset(buf,'\0',sizeof(buf));
215 SHA1_Init(&c);
216 for (i=0;i<2571;i++) {
217 RC4(&key,sizeof(buf),buf,buf);
218 SHA1_Update(&c,buf,sizeof(buf));
219 }
220 SHA1_Final(md,&c);
221
222 if (memcmp(md,expected,sizeof(md))) {
223 printf("error in RC4 bulk test\n");
224 printf("output:");
225 for (j=0; j<(int)sizeof(md); j++)
226 printf(" %02x",md[j]);
227 printf("\n");
228 printf("expect:");
229 for (j=0; j<(int)sizeof(md); j++)
230 printf(" %02x",expected[j]);
231 printf("\n");
232 err++;
233 }
234 else printf("ok\n");
235 }
236#ifdef OPENSSL_SYS_NETWARE
237 if (err) printf("ERROR: %d\n", err);
238#endif
239 EXIT(err);
240 return(0);
241 }
242#endif
diff --git a/src/lib/libcrypto/rc4/rrc4.doc b/src/lib/libcrypto/rc4/rrc4.doc
new file mode 100644
index 0000000000..2f9a953c12
--- /dev/null
+++ b/src/lib/libcrypto/rc4/rrc4.doc
@@ -0,0 +1,278 @@
1Newsgroups: sci.crypt,alt.security,comp.security.misc,alt.privacy
2Path: ghost.dsi.unimi.it!univ-lyon1.fr!jussieu.fr!zaphod.crihan.fr!warwick!clyde.open.ac.uk!strath-cs!bnr.co.uk!bt!pipex!howland.reston.ans.net!europa.eng.gtefsd.com!MathWorks.Com!yeshua.marcam.com!charnel.ecst.csuchico.edu!csusac!csus.edu!netcom.com!sterndark
3From: sterndark@netcom.com (David Sterndark)
4Subject: RC4 Algorithm revealed.
5Message-ID: <sternCvKL4B.Hyy@netcom.com>
6Sender: sterndark@netcom.com
7Organization: NETCOM On-line Communication Services (408 261-4700 guest)
8X-Newsreader: TIN [version 1.2 PL1]
9Date: Wed, 14 Sep 1994 06:35:31 GMT
10Lines: 263
11Xref: ghost.dsi.unimi.it sci.crypt:27332 alt.security:14732 comp.security.misc:11701 alt.privacy:16026
12
13I am shocked, shocked, I tell you, shocked, to discover
14that the cypherpunks have illegaly and criminally revealed
15a crucial RSA trade secret and harmed the security of
16America by reverse engineering the RC4 algorithm and
17publishing it to the world.
18
19On Saturday morning an anonymous cypherpunk wrote:
20
21
22 SUBJECT: RC4 Source Code
23
24
25 I've tested this. It is compatible with the RC4 object module
26 that comes in the various RSA toolkits.
27
28 /* rc4.h */
29 typedef struct rc4_key
30 {
31 unsigned char state[256];
32 unsigned char x;
33 unsigned char y;
34 } rc4_key;
35 void prepare_key(unsigned char *key_data_ptr,int key_data_len,
36 rc4_key *key);
37 void rc4(unsigned char *buffer_ptr,int buffer_len,rc4_key * key);
38
39
40 /*rc4.c */
41 #include "rc4.h"
42 static void swap_byte(unsigned char *a, unsigned char *b);
43 void prepare_key(unsigned char *key_data_ptr, int key_data_len,
44 rc4_key *key)
45 {
46 unsigned char swapByte;
47 unsigned char index1;
48 unsigned char index2;
49 unsigned char* state;
50 short counter;
51
52 state = &key->state[0];
53 for(counter = 0; counter < 256; counter++)
54 state[counter] = counter;
55 key->x = 0;
56 key->y = 0;
57 index1 = 0;
58 index2 = 0;
59 for(counter = 0; counter < 256; counter++)
60 {
61 index2 = (key_data_ptr[index1] + state[counter] +
62 index2) % 256;
63 swap_byte(&state[counter], &state[index2]);
64
65 index1 = (index1 + 1) % key_data_len;
66 }
67 }
68
69 void rc4(unsigned char *buffer_ptr, int buffer_len, rc4_key *key)
70 {
71 unsigned char x;
72 unsigned char y;
73 unsigned char* state;
74 unsigned char xorIndex;
75 short counter;
76
77 x = key->x;
78 y = key->y;
79
80 state = &key->state[0];
81 for(counter = 0; counter < buffer_len; counter ++)
82 {
83 x = (x + 1) % 256;
84 y = (state[x] + y) % 256;
85 swap_byte(&state[x], &state[y]);
86
87 xorIndex = (state[x] + state[y]) % 256;
88
89 buffer_ptr[counter] ^= state[xorIndex];
90 }
91 key->x = x;
92 key->y = y;
93 }
94
95 static void swap_byte(unsigned char *a, unsigned char *b)
96 {
97 unsigned char swapByte;
98
99 swapByte = *a;
100 *a = *b;
101 *b = swapByte;
102 }
103
104
105
106Another cypherpunk, this one not anonymous, tested the
107output from this algorithm against the output from
108official RC4 object code
109
110
111 Date: Tue, 13 Sep 94 18:37:56 PDT
112 From: ekr@eit.COM (Eric Rescorla)
113 Message-Id: <9409140137.AA17743@eitech.eit.com>
114 Subject: RC4 compatibility testing
115 Cc: cypherpunks@toad.com
116
117 One data point:
118
119 I can't say anything about the internals of RC4 versus the
120 algorithm that Bill Sommerfeld is rightly calling 'Alleged RC4',
121 since I don't know anything about RC4's internals.
122
123 However, I do have a (legitimately acquired) copy of BSAFE2 and
124 so I'm able to compare the output of this algorithm to the output
125 of genuine RC4 as found in BSAFE. I chose a set of test vectors
126 and ran them through both algorithms. The algorithms appear to
127 give identical results, at least with these key/plaintext pairs.
128
129 I note that this is the algorithm _without_ Hal Finney's
130 proposed modification
131
132 (see <199409130605.XAA24133@jobe.shell.portal.com>).
133
134 The vectors I used (together with the ciphertext they produce)
135 follow at the end of this message.
136
137 -Ekr
138
139 Disclaimer: This posting does not reflect the opinions of EIT.
140
141 --------------------results follow--------------
142 Test vector 0
143 Key: 0x01 0x23 0x45 0x67 0x89 0xab 0xcd 0xef
144 Input: 0x01 0x23 0x45 0x67 0x89 0xab 0xcd 0xef
145 0 Output: 0x75 0xb7 0x87 0x80 0x99 0xe0 0xc5 0x96
146
147 Test vector 1
148 Key: 0x01 0x23 0x45 0x67 0x89 0xab 0xcd 0xef
149 Input: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
150 0 Output: 0x74 0x94 0xc2 0xe7 0x10 0x4b 0x08 0x79
151
152 Test vector 2
153 Key: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
154 Input: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
155 0 Output: 0xde 0x18 0x89 0x41 0xa3 0x37 0x5d 0x3a
156
157 Test vector 3
158 Key: 0xef 0x01 0x23 0x45
159 Input: 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
160 0 Output: 0xd6 0xa1 0x41 0xa7 0xec 0x3c 0x38 0xdf 0xbd 0x61
161
162 Test vector 4
163 Key: 0x01 0x23 0x45 0x67 0x89 0xab 0xcd 0xef
164 Input: 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
165 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
166 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
167 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
168 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
169 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
170 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
171 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
172 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
173 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
174 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
175 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
176 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
177 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
178 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
179 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
180 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
181 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
182 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
183 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
184 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
185 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
186 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
187 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
188 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
189 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
190 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
191 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
192 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
193 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
194 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
195 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
196 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
197 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
198 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
199 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
200 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
201 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
202 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
203 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
204 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
205 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
206 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
207 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
208 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
209 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
210 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
211 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
212 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
213 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
214 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01
215 0x01
216 0 Output: 0x75 0x95 0xc3 0xe6 0x11 0x4a 0x09 0x78 0x0c 0x4a 0xd4
217 0x52 0x33 0x8e 0x1f 0xfd 0x9a 0x1b 0xe9 0x49 0x8f
218 0x81 0x3d 0x76 0x53 0x34 0x49 0xb6 0x77 0x8d 0xca
219 0xd8 0xc7 0x8a 0x8d 0x2b 0xa9 0xac 0x66 0x08 0x5d
220 0x0e 0x53 0xd5 0x9c 0x26 0xc2 0xd1 0xc4 0x90 0xc1
221 0xeb 0xbe 0x0c 0xe6 0x6d 0x1b 0x6b 0x1b 0x13 0xb6
222 0xb9 0x19 0xb8 0x47 0xc2 0x5a 0x91 0x44 0x7a 0x95
223 0xe7 0x5e 0x4e 0xf1 0x67 0x79 0xcd 0xe8 0xbf 0x0a
224 0x95 0x85 0x0e 0x32 0xaf 0x96 0x89 0x44 0x4f 0xd3
225 0x77 0x10 0x8f 0x98 0xfd 0xcb 0xd4 0xe7 0x26 0x56
226 0x75 0x00 0x99 0x0b 0xcc 0x7e 0x0c 0xa3 0xc4 0xaa
227 0xa3 0x04 0xa3 0x87 0xd2 0x0f 0x3b 0x8f 0xbb 0xcd
228 0x42 0xa1 0xbd 0x31 0x1d 0x7a 0x43 0x03 0xdd 0xa5
229 0xab 0x07 0x88 0x96 0xae 0x80 0xc1 0x8b 0x0a 0xf6
230 0x6d 0xff 0x31 0x96 0x16 0xeb 0x78 0x4e 0x49 0x5a
231 0xd2 0xce 0x90 0xd7 0xf7 0x72 0xa8 0x17 0x47 0xb6
232 0x5f 0x62 0x09 0x3b 0x1e 0x0d 0xb9 0xe5 0xba 0x53
233 0x2f 0xaf 0xec 0x47 0x50 0x83 0x23 0xe6 0x71 0x32
234 0x7d 0xf9 0x44 0x44 0x32 0xcb 0x73 0x67 0xce 0xc8
235 0x2f 0x5d 0x44 0xc0 0xd0 0x0b 0x67 0xd6 0x50 0xa0
236 0x75 0xcd 0x4b 0x70 0xde 0xdd 0x77 0xeb 0x9b 0x10
237 0x23 0x1b 0x6b 0x5b 0x74 0x13 0x47 0x39 0x6d 0x62
238 0x89 0x74 0x21 0xd4 0x3d 0xf9 0xb4 0x2e 0x44 0x6e
239 0x35 0x8e 0x9c 0x11 0xa9 0xb2 0x18 0x4e 0xcb 0xef
240 0x0c 0xd8 0xe7 0xa8 0x77 0xef 0x96 0x8f 0x13 0x90
241 0xec 0x9b 0x3d 0x35 0xa5 0x58 0x5c 0xb0 0x09 0x29
242 0x0e 0x2f 0xcd 0xe7 0xb5 0xec 0x66 0xd9 0x08 0x4b
243 0xe4 0x40 0x55 0xa6 0x19 0xd9 0xdd 0x7f 0xc3 0x16
244 0x6f 0x94 0x87 0xf7 0xcb 0x27 0x29 0x12 0x42 0x64
245 0x45 0x99 0x85 0x14 0xc1 0x5d 0x53 0xa1 0x8c 0x86
246 0x4c 0xe3 0xa2 0xb7 0x55 0x57 0x93 0x98 0x81 0x26
247 0x52 0x0e 0xac 0xf2 0xe3 0x06 0x6e 0x23 0x0c 0x91
248 0xbe 0xe4 0xdd 0x53 0x04 0xf5 0xfd 0x04 0x05 0xb3
249 0x5b 0xd9 0x9c 0x73 0x13 0x5d 0x3d 0x9b 0xc3 0x35
250 0xee 0x04 0x9e 0xf6 0x9b 0x38 0x67 0xbf 0x2d 0x7b
251 0xd1 0xea 0xa5 0x95 0xd8 0xbf 0xc0 0x06 0x6f 0xf8
252 0xd3 0x15 0x09 0xeb 0x0c 0x6c 0xaa 0x00 0x6c 0x80
253 0x7a 0x62 0x3e 0xf8 0x4c 0x3d 0x33 0xc1 0x95 0xd2
254 0x3e 0xe3 0x20 0xc4 0x0d 0xe0 0x55 0x81 0x57 0xc8
255 0x22 0xd4 0xb8 0xc5 0x69 0xd8 0x49 0xae 0xd5 0x9d
256 0x4e 0x0f 0xd7 0xf3 0x79 0x58 0x6b 0x4b 0x7f 0xf6
257 0x84 0xed 0x6a 0x18 0x9f 0x74 0x86 0xd4 0x9b 0x9c
258 0x4b 0xad 0x9b 0xa2 0x4b 0x96 0xab 0xf9 0x24 0x37
259 0x2c 0x8a 0x8f 0xff 0xb1 0x0d 0x55 0x35 0x49 0x00
260 0xa7 0x7a 0x3d 0xb5 0xf2 0x05 0xe1 0xb9 0x9f 0xcd
261 0x86 0x60 0x86 0x3a 0x15 0x9a 0xd4 0xab 0xe4 0x0f
262 0xa4 0x89 0x34 0x16 0x3d 0xdd 0xe5 0x42 0xa6 0x58
263 0x55 0x40 0xfd 0x68 0x3c 0xbf 0xd8 0xc0 0x0f 0x12
264 0x12 0x9a 0x28 0x4d 0xea 0xcc 0x4c 0xde 0xfe 0x58
265 0xbe 0x71 0x37 0x54 0x1c 0x04 0x71 0x26 0xc8 0xd4
266 0x9e 0x27 0x55 0xab 0x18 0x1a 0xb7 0xe9 0x40 0xb0
267 0xc0
268
269
270
271--
272 ---------------------------------------------------------------------
273We have the right to defend ourselves and our
274property, because of the kind of animals that we James A. Donald
275are. True law derives from this right, not from
276the arbitrary power of the omnipotent state. jamesd@netcom.com
277
278