diff options
Diffstat (limited to 'src/lib/libcrypto/rc4/rc4_enc.c')
-rw-r--r-- | src/lib/libcrypto/rc4/rc4_enc.c | 186 |
1 files changed, 185 insertions, 1 deletions
diff --git a/src/lib/libcrypto/rc4/rc4_enc.c b/src/lib/libcrypto/rc4/rc4_enc.c index 3256bea8cc..d5f18a3a70 100644 --- a/src/lib/libcrypto/rc4/rc4_enc.c +++ b/src/lib/libcrypto/rc4/rc4_enc.c | |||
@@ -67,7 +67,7 @@ | |||
67 | * Date: Wed, 14 Sep 1994 06:35:31 GMT | 67 | * Date: Wed, 14 Sep 1994 06:35:31 GMT |
68 | */ | 68 | */ |
69 | 69 | ||
70 | void RC4(RC4_KEY *key, unsigned long len, unsigned char *indata, | 70 | void RC4(RC4_KEY *key, unsigned long len, const unsigned char *indata, |
71 | unsigned char *outdata) | 71 | unsigned char *outdata) |
72 | { | 72 | { |
73 | register RC4_INT *d; | 73 | register RC4_INT *d; |
@@ -78,6 +78,190 @@ void RC4(RC4_KEY *key, unsigned long len, unsigned char *indata, | |||
78 | y=key->y; | 78 | y=key->y; |
79 | d=key->data; | 79 | d=key->data; |
80 | 80 | ||
81 | #if defined(RC4_CHUNK) | ||
82 | /* | ||
83 | * The original reason for implementing this(*) was the fact that | ||
84 | * pre-21164a Alpha CPUs don't have byte load/store instructions | ||
85 | * and e.g. a byte store has to be done with 64-bit load, shift, | ||
86 | * and, or and finally 64-bit store. Peeking at data and operating | ||
87 | * at natural word size made it possible to reduce amount of | ||
88 | * instructions as well as to perform early read-ahead without | ||
89 | * suffering from RAW (read-after-write) hazard. This resulted | ||
90 | * in ~40%(**) performance improvement on 21064 box with gcc. | ||
91 | * But it's not only Alpha users who win here:-) Thanks to the | ||
92 | * early-n-wide read-ahead this implementation also exhibits | ||
93 | * >40% speed-up on SPARC and 20-30% on 64-bit MIPS (depending | ||
94 | * on sizeof(RC4_INT)). | ||
95 | * | ||
96 | * (*) "this" means code which recognizes the case when input | ||
97 | * and output pointers appear to be aligned at natural CPU | ||
98 | * word boundary | ||
99 | * (**) i.e. according to 'apps/openssl speed rc4' benchmark, | ||
100 | * crypto/rc4/rc4speed.c exhibits almost 70% speed-up... | ||
101 | * | ||
102 | * Caveats. | ||
103 | * | ||
104 | * - RC4_CHUNK="unsigned long long" should be a #1 choice for | ||
105 | * UltraSPARC. Unfortunately gcc generates very slow code | ||
106 | * (2.5-3 times slower than one generated by Sun's WorkShop | ||
107 | * C) and therefore gcc (at least 2.95 and earlier) should | ||
108 | * always be told that RC4_CHUNK="unsigned long". | ||
109 | * | ||
110 | * <appro@fy.chalmers.se> | ||
111 | */ | ||
112 | |||
113 | # define RC4_STEP ( \ | ||
114 | x=(x+1) &0xff, \ | ||
115 | tx=d[x], \ | ||
116 | y=(tx+y)&0xff, \ | ||
117 | ty=d[y], \ | ||
118 | d[y]=tx, \ | ||
119 | d[x]=ty, \ | ||
120 | (RC4_CHUNK)d[(tx+ty)&0xff]\ | ||
121 | ) | ||
122 | |||
123 | if ( ( ((unsigned long)indata & (sizeof(RC4_CHUNK)-1)) | | ||
124 | ((unsigned long)outdata & (sizeof(RC4_CHUNK)-1)) ) == 0 ) | ||
125 | { | ||
126 | RC4_CHUNK ichunk,otp; | ||
127 | const union { long one; char little; } is_endian = {1}; | ||
128 | |||
129 | /* | ||
130 | * I reckon we can afford to implement both endian | ||
131 | * cases and to decide which way to take at run-time | ||
132 | * because the machine code appears to be very compact | ||
133 | * and redundant 1-2KB is perfectly tolerable (i.e. | ||
134 | * in case the compiler fails to eliminate it:-). By | ||
135 | * suggestion from Terrel Larson <terr@terralogic.net> | ||
136 | * who also stands for the is_endian union:-) | ||
137 | * | ||
138 | * Special notes. | ||
139 | * | ||
140 | * - is_endian is declared automatic as doing otherwise | ||
141 | * (declaring static) prevents gcc from eliminating | ||
142 | * the redundant code; | ||
143 | * - compilers (those I've tried) don't seem to have | ||
144 | * problems eliminating either the operators guarded | ||
145 | * by "if (sizeof(RC4_CHUNK)==8)" or the condition | ||
146 | * expressions themselves so I've got 'em to replace | ||
147 | * corresponding #ifdefs from the previous version; | ||
148 | * - I chose to let the redundant switch cases when | ||
149 | * sizeof(RC4_CHUNK)!=8 be (were also #ifdefed | ||
150 | * before); | ||
151 | * - in case you wonder "&(sizeof(RC4_CHUNK)*8-1)" in | ||
152 | * [LB]ESHFT guards against "shift is out of range" | ||
153 | * warnings when sizeof(RC4_CHUNK)!=8 | ||
154 | * | ||
155 | * <appro@fy.chalmers.se> | ||
156 | */ | ||
157 | if (!is_endian.little) | ||
158 | { /* BIG-ENDIAN CASE */ | ||
159 | # define BESHFT(c) (((sizeof(RC4_CHUNK)-(c)-1)*8)&(sizeof(RC4_CHUNK)*8-1)) | ||
160 | for (;len&-sizeof(RC4_CHUNK);len-=sizeof(RC4_CHUNK)) | ||
161 | { | ||
162 | ichunk = *(RC4_CHUNK *)indata; | ||
163 | otp = RC4_STEP<<BESHFT(0); | ||
164 | otp |= RC4_STEP<<BESHFT(1); | ||
165 | otp |= RC4_STEP<<BESHFT(2); | ||
166 | otp |= RC4_STEP<<BESHFT(3); | ||
167 | if (sizeof(RC4_CHUNK)==8) | ||
168 | { | ||
169 | otp |= RC4_STEP<<BESHFT(4); | ||
170 | otp |= RC4_STEP<<BESHFT(5); | ||
171 | otp |= RC4_STEP<<BESHFT(6); | ||
172 | otp |= RC4_STEP<<BESHFT(7); | ||
173 | } | ||
174 | *(RC4_CHUNK *)outdata = otp^ichunk; | ||
175 | indata += sizeof(RC4_CHUNK); | ||
176 | outdata += sizeof(RC4_CHUNK); | ||
177 | } | ||
178 | if (len) | ||
179 | { | ||
180 | RC4_CHUNK mask=(RC4_CHUNK)-1, ochunk; | ||
181 | |||
182 | ichunk = *(RC4_CHUNK *)indata; | ||
183 | ochunk = *(RC4_CHUNK *)outdata; | ||
184 | otp = 0; | ||
185 | i = BESHFT(0); | ||
186 | mask <<= (sizeof(RC4_CHUNK)-len)<<3; | ||
187 | switch (len&(sizeof(RC4_CHUNK)-1)) | ||
188 | { | ||
189 | case 7: otp = RC4_STEP<<i, i-=8; | ||
190 | case 6: otp |= RC4_STEP<<i, i-=8; | ||
191 | case 5: otp |= RC4_STEP<<i, i-=8; | ||
192 | case 4: otp |= RC4_STEP<<i, i-=8; | ||
193 | case 3: otp |= RC4_STEP<<i, i-=8; | ||
194 | case 2: otp |= RC4_STEP<<i, i-=8; | ||
195 | case 1: otp |= RC4_STEP<<i, i-=8; | ||
196 | case 0: ; /* | ||
197 | * it's never the case, | ||
198 | * but it has to be here | ||
199 | * for ultrix? | ||
200 | */ | ||
201 | } | ||
202 | ochunk &= ~mask; | ||
203 | ochunk |= (otp^ichunk) & mask; | ||
204 | *(RC4_CHUNK *)outdata = ochunk; | ||
205 | } | ||
206 | key->x=x; | ||
207 | key->y=y; | ||
208 | return; | ||
209 | } | ||
210 | else | ||
211 | { /* LITTLE-ENDIAN CASE */ | ||
212 | # define LESHFT(c) (((c)*8)&(sizeof(RC4_CHUNK)*8-1)) | ||
213 | for (;len&-sizeof(RC4_CHUNK);len-=sizeof(RC4_CHUNK)) | ||
214 | { | ||
215 | ichunk = *(RC4_CHUNK *)indata; | ||
216 | otp = RC4_STEP; | ||
217 | otp |= RC4_STEP<<8; | ||
218 | otp |= RC4_STEP<<16; | ||
219 | otp |= RC4_STEP<<24; | ||
220 | if (sizeof(RC4_CHUNK)==8) | ||
221 | { | ||
222 | otp |= RC4_STEP<<LESHFT(4); | ||
223 | otp |= RC4_STEP<<LESHFT(5); | ||
224 | otp |= RC4_STEP<<LESHFT(6); | ||
225 | otp |= RC4_STEP<<LESHFT(7); | ||
226 | } | ||
227 | *(RC4_CHUNK *)outdata = otp^ichunk; | ||
228 | indata += sizeof(RC4_CHUNK); | ||
229 | outdata += sizeof(RC4_CHUNK); | ||
230 | } | ||
231 | if (len) | ||
232 | { | ||
233 | RC4_CHUNK mask=(RC4_CHUNK)-1, ochunk; | ||
234 | |||
235 | ichunk = *(RC4_CHUNK *)indata; | ||
236 | ochunk = *(RC4_CHUNK *)outdata; | ||
237 | otp = 0; | ||
238 | i = 0; | ||
239 | mask >>= (sizeof(RC4_CHUNK)-len)<<3; | ||
240 | switch (len&(sizeof(RC4_CHUNK)-1)) | ||
241 | { | ||
242 | case 7: otp = RC4_STEP, i+=8; | ||
243 | case 6: otp |= RC4_STEP<<i, i+=8; | ||
244 | case 5: otp |= RC4_STEP<<i, i+=8; | ||
245 | case 4: otp |= RC4_STEP<<i, i+=8; | ||
246 | case 3: otp |= RC4_STEP<<i, i+=8; | ||
247 | case 2: otp |= RC4_STEP<<i, i+=8; | ||
248 | case 1: otp |= RC4_STEP<<i, i+=8; | ||
249 | case 0: ; /* | ||
250 | * it's never the case, | ||
251 | * but it has to be here | ||
252 | * for ultrix? | ||
253 | */ | ||
254 | } | ||
255 | ochunk &= ~mask; | ||
256 | ochunk |= (otp^ichunk) & mask; | ||
257 | *(RC4_CHUNK *)outdata = ochunk; | ||
258 | } | ||
259 | key->x=x; | ||
260 | key->y=y; | ||
261 | return; | ||
262 | } | ||
263 | } | ||
264 | #endif | ||
81 | #define LOOP(in,out) \ | 265 | #define LOOP(in,out) \ |
82 | x=((x+1)&0xff); \ | 266 | x=((x+1)&0xff); \ |
83 | tx=d[x]; \ | 267 | tx=d[x]; \ |