diff options
Diffstat (limited to 'src/lib/libcrypto/aes/asm/aes-x86_64.pl')
-rwxr-xr-x | src/lib/libcrypto/aes/asm/aes-x86_64.pl | 2809 |
1 files changed, 2809 insertions, 0 deletions
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl new file mode 100755 index 0000000000..53e4ef85fd --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl | |||
@@ -0,0 +1,2809 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # Version 2.1. | ||
11 | # | ||
12 | # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on | ||
13 | # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version | ||
14 | # [you'll notice a lot of resemblance], such as compressed S-boxes | ||
15 | # in little-endian byte order, prefetch of these tables in CBC mode, | ||
16 | # as well as avoiding L1 cache aliasing between stack frame and key | ||
17 | # schedule and already mentioned tables, compressed Td4... | ||
18 | # | ||
19 | # Performance in number of cycles per processed byte for 128-bit key: | ||
20 | # | ||
21 | # ECB encrypt ECB decrypt CBC large chunk | ||
22 | # AMD64 33 41 13.0 | ||
23 | # EM64T 38 59 18.6(*) | ||
24 | # Core 2 30 43 14.5(*) | ||
25 | # | ||
26 | # (*) with hyper-threading off | ||
27 | |||
# Command line: [flavour] [output].  A single argument that contains a dot
# is taken to be the output file name, leaving the flavour undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 is selected by a nasm/masm/mingw64 flavour or by an .asm output file.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Directory part of the script path, including the trailing separator.  It is
# the empty string when $0 has no directory component, so $dir never depends
# on whatever $1 happened to hold after a failed match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# From here on everything printed to STDOUT is piped through the translator.
# The interpreter and translator paths are quoted so that paths containing
# blanks survive the shell, and a failure to set up the pipe is fatal rather
# than ignored.
open STDOUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";

$verticalspin=1;	# unlike 32-bit version $verticalspin performs
			# ~15% better on both AMD and Intel cores
$speed_limit=512;	# see aes-586.pl for details

# Accumulator for the generated assembly text.
$code=".text\n";
46 | |||
# AES state: one 32-bit register per state word.
($s0,$s1,$s2,$s3)=("%eax","%ebx","%ecx","%edx");

# Scratch accumulators.  Each 64-bit mask name on the same line refers to
# the same physical register as the 32-bit accumulator next to it.
($acc0,$mask80)=("%esi","%rsi");
($acc1,$maskfe)=("%edi","%rdi");
($acc2,$mask1b)=("%ebp","%rbp");

# Input and output pointers.
($inp,$out)=("%r8","%r9");

# 32-bit temporaries and the round counter.
($t0,$t1,$t2)=("%r10d","%r11d","%r12d");
$rnds="%r13d";

# Lookup table base and key schedule pointer.
($sbox,$key)=("%r14","%r15");
62 | |||
# hi(REG) - return the name of the "high byte" register of REG: %eax/%rax
# becomes %ah, and likewise for the b, c and d registers.  Any other name is
# returned unchanged.
#
# Declared without a prototype: the sub takes one argument, so an empty
# prototype would make a plain hi(...) call a compile-time error and force
# every caller to use the &-form.  Existing &hi(...) calls keep working.
sub hi
{	my $reg=shift;
	$reg =~ s/%[er]([a-d])x/%${1}h/;
	return $reg;
}
# lo(REG) - return the name of the lowest byte register of REG.
# Names that match none of the three patterns are returned unchanged.
#
# Declared without a prototype because the sub takes one argument; existing
# &lo(...) calls keep working and plain lo(...) calls now compile as well.
sub lo
{	my $reg=shift;
	$reg =~ s/%[er]([a-d])x/%${1}l/;	# %eax, %rax   -> %al
	$reg =~ s/%[er]([sd]i)/%${1}l/;		# %esi, %rsi   -> %sil
	$reg =~ s/%(r[0-9]+)[d]?/%${1}b/;	# %r10, %r10d  -> %r10b
	return $reg;
}
# LO(REG) - return the 32-bit name of a 64-bit register: %rax becomes %eax
# and %r8 becomes %r8d.  Names that match neither pattern are returned
# unchanged.
#
# Declared without a prototype because the sub takes one argument; existing
# &LO(...) calls keep working and plain LO(...) calls now compile as well.
sub LO
{	my $reg=shift;
	$reg =~ s/%r([a-z]+)/%e${1}/;		# %rax -> %eax
	$reg =~ s/%r([0-9]+)/%r${1}d/;		# %r8  -> %r8d
	return $reg;
}
# _data_word(LIST) - append one ".long" line per value to $code, with the
# value written twice on that line as an 8-digit hex constant.  Processing
# stops at the first undefined element.
#
# Declared without a prototype because the sub takes a list of arguments;
# existing &_data_word(...) calls keep working.
sub _data_word
{	foreach my $word (@_) {
		last unless defined($word);
		$code.=sprintf".long\t0x%08x,0x%08x\n",$word,$word;
	}
}
# data_word(LIST) - append a single ".long" line to $code holding all values
# as comma-separated 8-digit hex constants.
#
# Declared without a prototype because the sub takes a list of arguments;
# existing &data_word(...) calls keep working.  A call with no values emits
# nothing instead of a stray ".long 0x00000000".
sub data_word
{	my @words=@_;
	return unless @words;
	$code.=".long\t".join(",",map { sprintf "0x%08x",$_ } @words)."\n";
}
80 | |||
# data_byte(LIST) - append a single ".byte" line to $code holding the low
# eight bits of every value as comma-separated 2-digit hex constants.
#
# Declared without a prototype because the sub takes a list of arguments;
# existing &data_byte(...) calls keep working.  A call with no values emits
# nothing instead of a stray ".byte 0x00".
sub data_byte
{	my @bytes=@_;
	return unless @bytes;
	$code.=".byte\t".join(",",map { sprintf "0x%02x",$_&0xff } @bytes)."\n";
}
88 | |||
# encvert() - append the assembly for one full encryption round ("vertical"
# variant, all four state words interleaved) to $code.
#
# The emitted code indexes the table at $sbox with each byte of $s0..$s3
# (8-byte stride, displacements 0, 3, 2 and 1), accumulates the results in
# $t0..$t2 and %r8d, advances $key by 16 and finally reloads $s0..$s3 from
# the four key words at 0..12($key) XORed with the accumulated values.
# %r8d serves as the fourth temporary, so the 64-bit $inp register (%r8) is
# clobbered.  Takes no arguments.
89 | sub encvert() | ||
90 | { my $t3="%r8d"; # zaps $inp! | ||
91 | |||
92 | $code.=<<___; | ||
93 | # favor 3-way issue Opteron pipeline... | ||
94 | movzb `&lo("$s0")`,$acc0 | ||
95 | movzb `&lo("$s1")`,$acc1 | ||
96 | movzb `&lo("$s2")`,$acc2 | ||
97 | mov 0($sbox,$acc0,8),$t0 | ||
98 | mov 0($sbox,$acc1,8),$t1 | ||
99 | mov 0($sbox,$acc2,8),$t2 | ||
100 | |||
101 | movzb `&hi("$s1")`,$acc0 | ||
102 | movzb `&hi("$s2")`,$acc1 | ||
103 | movzb `&lo("$s3")`,$acc2 | ||
104 | xor 3($sbox,$acc0,8),$t0 | ||
105 | xor 3($sbox,$acc1,8),$t1 | ||
106 | mov 0($sbox,$acc2,8),$t3 | ||
107 | |||
108 | movzb `&hi("$s3")`,$acc0 | ||
109 | shr \$16,$s2 | ||
110 | movzb `&hi("$s0")`,$acc2 | ||
111 | xor 3($sbox,$acc0,8),$t2 | ||
112 | shr \$16,$s3 | ||
113 | xor 3($sbox,$acc2,8),$t3 | ||
114 | |||
115 | shr \$16,$s1 | ||
116 | lea 16($key),$key | ||
117 | shr \$16,$s0 | ||
118 | |||
119 | movzb `&lo("$s2")`,$acc0 | ||
120 | movzb `&lo("$s3")`,$acc1 | ||
121 | movzb `&lo("$s0")`,$acc2 | ||
122 | xor 2($sbox,$acc0,8),$t0 | ||
123 | xor 2($sbox,$acc1,8),$t1 | ||
124 | xor 2($sbox,$acc2,8),$t2 | ||
125 | |||
126 | movzb `&hi("$s3")`,$acc0 | ||
127 | movzb `&hi("$s0")`,$acc1 | ||
128 | movzb `&lo("$s1")`,$acc2 | ||
129 | xor 1($sbox,$acc0,8),$t0 | ||
130 | xor 1($sbox,$acc1,8),$t1 | ||
131 | xor 2($sbox,$acc2,8),$t3 | ||
132 | |||
133 | mov 12($key),$s3 | ||
134 | movzb `&hi("$s1")`,$acc1 | ||
135 | movzb `&hi("$s2")`,$acc2 | ||
136 | mov 0($key),$s0 | ||
137 | xor 1($sbox,$acc1,8),$t2 | ||
138 | xor 1($sbox,$acc2,8),$t3 | ||
139 | |||
140 | mov 4($key),$s1 | ||
141 | mov 8($key),$s2 | ||
142 | xor $t0,$s0 | ||
143 | xor $t1,$s1 | ||
144 | xor $t2,$s2 | ||
145 | xor $t3,$s3 | ||
146 | ___ | ||
147 | } | ||
148 | |||
# enclastvert() - append the assembly for the final encryption round
# ("vertical" variant) to $code.
#
# Unlike encvert() the emitted code extracts a single byte from each table
# entry: byte loads at displacement 2 for the low byte, and 32-bit loads at
# displacement 0 or 2 masked with 0x0000ff00, 0x00ff0000 or 0xff000000 for
# the other byte positions.  The results are XORed with the key words at
# 16+0..16+12($key); $key itself is not advanced.  %r8d serves as the fourth
# temporary, so the 64-bit $inp register (%r8) is clobbered.
# Takes no arguments.
149 | sub enclastvert() | ||
150 | { my $t3="%r8d"; # zaps $inp! | ||
151 | |||
152 | $code.=<<___; | ||
153 | movzb `&lo("$s0")`,$acc0 | ||
154 | movzb `&lo("$s1")`,$acc1 | ||
155 | movzb `&lo("$s2")`,$acc2 | ||
156 | movzb 2($sbox,$acc0,8),$t0 | ||
157 | movzb 2($sbox,$acc1,8),$t1 | ||
158 | movzb 2($sbox,$acc2,8),$t2 | ||
159 | |||
160 | movzb `&lo("$s3")`,$acc0 | ||
161 | movzb `&hi("$s1")`,$acc1 | ||
162 | movzb `&hi("$s2")`,$acc2 | ||
163 | movzb 2($sbox,$acc0,8),$t3 | ||
164 | mov 0($sbox,$acc1,8),$acc1 #$t0 | ||
165 | mov 0($sbox,$acc2,8),$acc2 #$t1 | ||
166 | |||
167 | and \$0x0000ff00,$acc1 | ||
168 | and \$0x0000ff00,$acc2 | ||
169 | |||
170 | xor $acc1,$t0 | ||
171 | xor $acc2,$t1 | ||
172 | shr \$16,$s2 | ||
173 | |||
174 | movzb `&hi("$s3")`,$acc0 | ||
175 | movzb `&hi("$s0")`,$acc1 | ||
176 | shr \$16,$s3 | ||
177 | mov 0($sbox,$acc0,8),$acc0 #$t2 | ||
178 | mov 0($sbox,$acc1,8),$acc1 #$t3 | ||
179 | |||
180 | and \$0x0000ff00,$acc0 | ||
181 | and \$0x0000ff00,$acc1 | ||
182 | shr \$16,$s1 | ||
183 | xor $acc0,$t2 | ||
184 | xor $acc1,$t3 | ||
185 | shr \$16,$s0 | ||
186 | |||
187 | movzb `&lo("$s2")`,$acc0 | ||
188 | movzb `&lo("$s3")`,$acc1 | ||
189 | movzb `&lo("$s0")`,$acc2 | ||
190 | mov 0($sbox,$acc0,8),$acc0 #$t0 | ||
191 | mov 0($sbox,$acc1,8),$acc1 #$t1 | ||
192 | mov 0($sbox,$acc2,8),$acc2 #$t2 | ||
193 | |||
194 | and \$0x00ff0000,$acc0 | ||
195 | and \$0x00ff0000,$acc1 | ||
196 | and \$0x00ff0000,$acc2 | ||
197 | |||
198 | xor $acc0,$t0 | ||
199 | xor $acc1,$t1 | ||
200 | xor $acc2,$t2 | ||
201 | |||
202 | movzb `&lo("$s1")`,$acc0 | ||
203 | movzb `&hi("$s3")`,$acc1 | ||
204 | movzb `&hi("$s0")`,$acc2 | ||
205 | mov 0($sbox,$acc0,8),$acc0 #$t3 | ||
206 | mov 2($sbox,$acc1,8),$acc1 #$t0 | ||
207 | mov 2($sbox,$acc2,8),$acc2 #$t1 | ||
208 | |||
209 | and \$0x00ff0000,$acc0 | ||
210 | and \$0xff000000,$acc1 | ||
211 | and \$0xff000000,$acc2 | ||
212 | |||
213 | xor $acc0,$t3 | ||
214 | xor $acc1,$t0 | ||
215 | xor $acc2,$t1 | ||
216 | |||
217 | movzb `&hi("$s1")`,$acc0 | ||
218 | movzb `&hi("$s2")`,$acc1 | ||
219 | mov 16+12($key),$s3 | ||
220 | mov 2($sbox,$acc0,8),$acc0 #$t2 | ||
221 | mov 2($sbox,$acc1,8),$acc1 #$t3 | ||
222 | mov 16+0($key),$s0 | ||
223 | |||
224 | and \$0xff000000,$acc0 | ||
225 | and \$0xff000000,$acc1 | ||
226 | |||
227 | xor $acc0,$t2 | ||
228 | xor $acc1,$t3 | ||
229 | |||
230 | mov 16+4($key),$s1 | ||
231 | mov 16+8($key),$s2 | ||
232 | xor $t0,$s0 | ||
233 | xor $t1,$s1 | ||
234 | xor $t2,$s2 | ||
235 | xor $t3,$s3 | ||
236 | ___ | ||
237 | } | ||
238 | |||
# encstep($i,@s) - append the assembly that computes output word $i (0..3)
# of one encryption round to $code.  This is the non-vertical alternative to
# encvert() and is only used when $verticalspin is false.
#
#   $i  index of the output word; also selects the key word at 4*$i($key)
#   @s  the four state registers, ordered so that $s[0] supplies the low
#       byte, $s[1] the second byte, $s[2] the third and $s[3] the top byte
#
# For $i = 0..2 the result is built in $t0..$t2 and $s[2]/$s[3] are copied
# into scratch registers first.  For $i == 3 the result is built in $s[0]
# itself, $s[1..3] are consumed in place as scratch, and $t0..$t2 are then
# moved into $s[1..3].
239 | sub encstep() | ||
240 | { my ($i,@s) = @_; | ||
241 | my $tmp0=$acc0; | ||
242 | my $tmp1=$acc1; | ||
243 | my $tmp2=$acc2; | ||
# destination register; this lexical shadows the global $out
244 | my $out=($t0,$t1,$t2,$s[0])[$i]; | ||
245 | |||
246 | if ($i==3) { | ||
247 | $tmp0=$s[1]; | ||
248 | $tmp1=$s[2]; | ||
249 | $tmp2=$s[3]; | ||
250 | } | ||
251 | $code.=" movzb ".&lo($s[0]).",$out\n"; | ||
252 | $code.=" mov $s[2],$tmp1\n" if ($i!=3); | ||
# the key pointer is advanced once per round, by the first step
253 | $code.=" lea 16($key),$key\n" if ($i==0); | ||
254 | |||
255 | $code.=" movzb ".&hi($s[1]).",$tmp0\n"; | ||
256 | $code.=" mov 0($sbox,$out,8),$out\n"; | ||
257 | |||
258 | $code.=" shr \$16,$tmp1\n"; | ||
259 | $code.=" mov $s[3],$tmp2\n" if ($i!=3); | ||
260 | $code.=" xor 3($sbox,$tmp0,8),$out\n"; | ||
261 | |||
262 | $code.=" movzb ".&lo($tmp1).",$tmp1\n"; | ||
263 | $code.=" shr \$24,$tmp2\n"; | ||
264 | $code.=" xor 4*$i($key),$out\n"; | ||
265 | |||
266 | $code.=" xor 2($sbox,$tmp1,8),$out\n"; | ||
267 | $code.=" xor 1($sbox,$tmp2,8),$out\n"; | ||
268 | |||
# last step: move the three buffered words back into the state registers
269 | $code.=" mov $t0,$s[1]\n" if ($i==3); | ||
270 | $code.=" mov $t1,$s[2]\n" if ($i==3); | ||
271 | $code.=" mov $t2,$s[3]\n" if ($i==3); | ||
272 | $code.="\n"; | ||
273 | } | ||
274 | |||
# enclast($i,@s) - append the assembly that computes output word $i (0..3)
# of the final encryption round to $code.  This is the non-vertical
# alternative to enclastvert() and is only used when $verticalspin is false.
#
#   $i  index of the output word
#   @s  the four state registers, ordered as for encstep()
#
# Each table entry is masked down to one byte position (0x000000ff,
# 0x0000ff00, 0x00ff0000, 0xff000000) before being combined.  No key
# material is touched here; the caller emits the key XOR after the fourth
# step.  Register use for $i == 3 versus $i = 0..2 follows encstep().
275 | sub enclast() | ||
276 | { my ($i,@s)=@_; | ||
277 | my $tmp0=$acc0; | ||
278 | my $tmp1=$acc1; | ||
279 | my $tmp2=$acc2; | ||
# destination register; this lexical shadows the global $out
280 | my $out=($t0,$t1,$t2,$s[0])[$i]; | ||
281 | |||
282 | if ($i==3) { | ||
283 | $tmp0=$s[1]; | ||
284 | $tmp1=$s[2]; | ||
285 | $tmp2=$s[3]; | ||
286 | } | ||
287 | $code.=" movzb ".&lo($s[0]).",$out\n"; | ||
288 | $code.=" mov $s[2],$tmp1\n" if ($i!=3); | ||
289 | |||
290 | $code.=" mov 2($sbox,$out,8),$out\n"; | ||
291 | $code.=" shr \$16,$tmp1\n"; | ||
292 | $code.=" mov $s[3],$tmp2\n" if ($i!=3); | ||
293 | |||
294 | $code.=" and \$0x000000ff,$out\n"; | ||
295 | $code.=" movzb ".&hi($s[1]).",$tmp0\n"; | ||
296 | $code.=" movzb ".&lo($tmp1).",$tmp1\n"; | ||
297 | $code.=" shr \$24,$tmp2\n"; | ||
298 | |||
299 | $code.=" mov 0($sbox,$tmp0,8),$tmp0\n"; | ||
300 | $code.=" mov 0($sbox,$tmp1,8),$tmp1\n"; | ||
301 | $code.=" mov 2($sbox,$tmp2,8),$tmp2\n"; | ||
302 | |||
303 | $code.=" and \$0x0000ff00,$tmp0\n"; | ||
304 | $code.=" and \$0x00ff0000,$tmp1\n"; | ||
305 | $code.=" and \$0xff000000,$tmp2\n"; | ||
306 | |||
# last step: the moves of $t0..$t2 back into the state registers are
# interleaved with the final XORs
307 | $code.=" xor $tmp0,$out\n"; | ||
308 | $code.=" mov $t0,$s[1]\n" if ($i==3); | ||
309 | $code.=" xor $tmp1,$out\n"; | ||
310 | $code.=" mov $t1,$s[2]\n" if ($i==3); | ||
311 | $code.=" xor $tmp2,$out\n"; | ||
312 | $code.=" mov $t2,$s[3]\n" if ($i==3); | ||
313 | $code.="\n"; | ||
314 | } | ||
315 | |||
# Emit the internal routine _x86_64_AES_encrypt.
#
# The generated code expects the state in $s0..$s3, the key schedule in $key
# and the table base in $sbox.  It XORs the state with the first four key
# words, loads the round count from 240($key), runs rounds-1 iterations of
# the round body at .Lenc_loop, then the final round, and returns with the
# result in $s0..$s3.
316 | $code.=<<___; | ||
317 | .type _x86_64_AES_encrypt,\@abi-omnipotent | ||
318 | .align 16 | ||
319 | _x86_64_AES_encrypt: | ||
320 | xor 0($key),$s0 # xor with key | ||
321 | xor 4($key),$s1 | ||
322 | xor 8($key),$s2 | ||
323 | xor 12($key),$s3 | ||
324 | |||
325 | mov 240($key),$rnds # load key->rounds | ||
326 | sub \$1,$rnds | ||
327 | jmp .Lenc_loop | ||
328 | .align 16 | ||
329 | .Lenc_loop: | ||
330 | ___ | ||
# Round body: either the interleaved version or four per-word steps with the
# state registers rotated by one position for each step.
331 | if ($verticalspin) { &encvert(); } | ||
332 | else { &encstep(0,$s0,$s1,$s2,$s3); | ||
333 | &encstep(1,$s1,$s2,$s3,$s0); | ||
334 | &encstep(2,$s2,$s3,$s0,$s1); | ||
335 | &encstep(3,$s3,$s0,$s1,$s2); | ||
336 | } | ||
337 | $code.=<<___; | ||
338 | sub \$1,$rnds | ||
339 | jnz .Lenc_loop | ||
340 | ___ | ||
# Final round.  enclastvert() applies the last round key itself; the
# per-word variant needs the explicit key XOR emitted below.
341 | if ($verticalspin) { &enclastvert(); } | ||
342 | else { &enclast(0,$s0,$s1,$s2,$s3); | ||
343 | &enclast(1,$s1,$s2,$s3,$s0); | ||
344 | &enclast(2,$s2,$s3,$s0,$s1); | ||
345 | &enclast(3,$s3,$s0,$s1,$s2); | ||
346 | $code.=<<___; | ||
347 | xor 16+0($key),$s0 # xor with key | ||
348 | xor 16+4($key),$s1 | ||
349 | xor 16+8($key),$s2 | ||
350 | xor 16+12($key),$s3 | ||
351 | ___ | ||
352 | } | ||
353 | $code.=<<___; | ||
354 | .byte 0xf3,0xc3 # rep ret | ||
355 | .size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt | ||
356 | ___ | ||
357 | |||
358 | # it's possible to implement this by shifting tN by 8, filling least | ||
359 | # significant byte with byte load and finally bswap-ing at the end, | ||
360 | # but such partial register load kills Core 2... | ||
# enccompactvert() - append the byte-substitution step of the compact
# encryption path to $code.
#
# The emitted code looks up every byte of $s0..$s3 with a byte load from the
# table at $sbox (1-byte stride) and reassembles the four output words with
# shifts by 8, 16 and 24; the result is left in $s0..$s3.  Which source word
# feeds which byte position of each output word is indicated by the #$tN
# remarks in the emitted code.  %r8d, %r9d and %r13d serve as extra
# temporaries, so $inp (%r8), $out (%r9) and $rnds (%r13d) are clobbered.
# Takes no arguments.
361 | sub enccompactvert() | ||
362 | { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); | ||
363 | |||
364 | $code.=<<___; | ||
365 | movzb `&lo("$s0")`,$t0 | ||
366 | movzb `&lo("$s1")`,$t1 | ||
367 | movzb `&lo("$s2")`,$t2 | ||
368 | movzb ($sbox,$t0,1),$t0 | ||
369 | movzb ($sbox,$t1,1),$t1 | ||
370 | movzb ($sbox,$t2,1),$t2 | ||
371 | |||
372 | movzb `&lo("$s3")`,$t3 | ||
373 | movzb `&hi("$s1")`,$acc0 | ||
374 | movzb `&hi("$s2")`,$acc1 | ||
375 | movzb ($sbox,$t3,1),$t3 | ||
376 | movzb ($sbox,$acc0,1),$t4 #$t0 | ||
377 | movzb ($sbox,$acc1,1),$t5 #$t1 | ||
378 | |||
379 | movzb `&hi("$s3")`,$acc2 | ||
380 | movzb `&hi("$s0")`,$acc0 | ||
381 | shr \$16,$s2 | ||
382 | movzb ($sbox,$acc2,1),$acc2 #$t2 | ||
383 | movzb ($sbox,$acc0,1),$acc0 #$t3 | ||
384 | shr \$16,$s3 | ||
385 | |||
386 | movzb `&lo("$s2")`,$acc1 | ||
387 | shl \$8,$t4 | ||
388 | shl \$8,$t5 | ||
389 | movzb ($sbox,$acc1,1),$acc1 #$t0 | ||
390 | xor $t4,$t0 | ||
391 | xor $t5,$t1 | ||
392 | |||
393 | movzb `&lo("$s3")`,$t4 | ||
394 | shr \$16,$s0 | ||
395 | shr \$16,$s1 | ||
396 | movzb `&lo("$s0")`,$t5 | ||
397 | shl \$8,$acc2 | ||
398 | shl \$8,$acc0 | ||
399 | movzb ($sbox,$t4,1),$t4 #$t1 | ||
400 | movzb ($sbox,$t5,1),$t5 #$t2 | ||
401 | xor $acc2,$t2 | ||
402 | xor $acc0,$t3 | ||
403 | |||
404 | movzb `&lo("$s1")`,$acc2 | ||
405 | movzb `&hi("$s3")`,$acc0 | ||
406 | shl \$16,$acc1 | ||
407 | movzb ($sbox,$acc2,1),$acc2 #$t3 | ||
408 | movzb ($sbox,$acc0,1),$acc0 #$t0 | ||
409 | xor $acc1,$t0 | ||
410 | |||
411 | movzb `&hi("$s0")`,$acc1 | ||
412 | shr \$8,$s2 | ||
413 | shr \$8,$s1 | ||
414 | movzb ($sbox,$acc1,1),$acc1 #$t1 | ||
415 | movzb ($sbox,$s2,1),$s3 #$t3 | ||
416 | movzb ($sbox,$s1,1),$s2 #$t2 | ||
417 | shl \$16,$t4 | ||
418 | shl \$16,$t5 | ||
419 | shl \$16,$acc2 | ||
420 | xor $t4,$t1 | ||
421 | xor $t5,$t2 | ||
422 | xor $acc2,$t3 | ||
423 | |||
424 | shl \$24,$acc0 | ||
425 | shl \$24,$acc1 | ||
426 | shl \$24,$s3 | ||
427 | xor $acc0,$t0 | ||
428 | shl \$24,$s2 | ||
429 | xor $acc1,$t1 | ||
430 | mov $t0,$s0 | ||
431 | mov $t1,$s1 | ||
432 | xor $t2,$s2 | ||
433 | xor $t3,$s3 | ||
434 | ___ | ||
435 | } | ||
436 | |||
# enctransform_ref($sn) - append the assembly that transforms a single
# 32-bit state word, held in register $sn, in place.
#
# The emitted code first forms a per-byte doubled value in %r9d: bytes whose
# top bit is set (mask 0x80808080) get 0x1b XORed into the left-shifted
# value (mask 0xfefefefe).  $sn then becomes
#     rol($sn ^ r2, 24) ^ r2 ^ ror($sn, 16) ^ ror($sn, 24)
# where $sn on the right-hand side is the input value and r2 is the doubled
# value.  %r8d, %r9d and %r13d are used as scratch.
# NOTE(review): this looks like a one-word reference version of
# enctransform(); no call to it is visible in this part of the file.
437 | sub enctransform_ref() | ||
438 | { my $sn = shift; | ||
439 | my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d"); | ||
440 | |||
441 | $code.=<<___; | ||
442 | mov $sn,$acc | ||
443 | and \$0x80808080,$acc | ||
444 | mov $acc,$tmp | ||
445 | shr \$7,$tmp | ||
446 | lea ($sn,$sn),$r2 | ||
447 | sub $tmp,$acc | ||
448 | and \$0xfefefefe,$r2 | ||
449 | and \$0x1b1b1b1b,$acc | ||
450 | mov $sn,$tmp | ||
451 | xor $acc,$r2 | ||
452 | |||
453 | xor $r2,$sn | ||
454 | rol \$24,$sn | ||
455 | xor $r2,$sn | ||
456 | ror \$16,$tmp | ||
457 | xor $tmp,$sn | ||
458 | ror \$8,$tmp | ||
459 | xor $tmp,$sn | ||
460 | ___ | ||
461 | } | ||
462 | |||
463 | # unlike decrypt case it does not pay off to parallelize enctransform | ||
# enctransform() - append the assembly that transforms all four state words
# $s0..$s3 in place for the compact encryption path.
#
# The words are processed in two interleaved pairs ($s0/$s1, then $s2/$s3).
# For each word a per-byte doubled value is formed with the masks
# 0x80808080, 0xfefefefe and 0x1b1b1b1b, and the word is replaced by
#     rol(s ^ r2, 24) ^ r2 ^ ror(s, 16) ^ ror(s, 24)
# exactly as in enctransform_ref().  $acc0/$acc1, $t0..$t2, $acc2 (as the
# fourth temporary), %r8d and %r9d are used as scratch.  The loads from
# 0, 64, 128 and 192($sbox) woven into the tail only prefetch the table;
# their results are not used.  Takes no arguments.
464 | sub enctransform() | ||
465 | { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d"); | ||
466 | |||
467 | $code.=<<___; | ||
468 | mov $s0,$acc0 | ||
469 | mov $s1,$acc1 | ||
470 | and \$0x80808080,$acc0 | ||
471 | and \$0x80808080,$acc1 | ||
472 | mov $acc0,$t0 | ||
473 | mov $acc1,$t1 | ||
474 | shr \$7,$t0 | ||
475 | lea ($s0,$s0),$r20 | ||
476 | shr \$7,$t1 | ||
477 | lea ($s1,$s1),$r21 | ||
478 | sub $t0,$acc0 | ||
479 | sub $t1,$acc1 | ||
480 | and \$0xfefefefe,$r20 | ||
481 | and \$0xfefefefe,$r21 | ||
482 | and \$0x1b1b1b1b,$acc0 | ||
483 | and \$0x1b1b1b1b,$acc1 | ||
484 | mov $s0,$t0 | ||
485 | mov $s1,$t1 | ||
486 | xor $acc0,$r20 | ||
487 | xor $acc1,$r21 | ||
488 | |||
489 | xor $r20,$s0 | ||
490 | xor $r21,$s1 | ||
491 | mov $s2,$acc0 | ||
492 | mov $s3,$acc1 | ||
493 | rol \$24,$s0 | ||
494 | rol \$24,$s1 | ||
495 | and \$0x80808080,$acc0 | ||
496 | and \$0x80808080,$acc1 | ||
497 | xor $r20,$s0 | ||
498 | xor $r21,$s1 | ||
499 | mov $acc0,$t2 | ||
500 | mov $acc1,$t3 | ||
501 | ror \$16,$t0 | ||
502 | ror \$16,$t1 | ||
503 | shr \$7,$t2 | ||
504 | lea ($s2,$s2),$r20 | ||
505 | xor $t0,$s0 | ||
506 | xor $t1,$s1 | ||
507 | shr \$7,$t3 | ||
508 | lea ($s3,$s3),$r21 | ||
509 | ror \$8,$t0 | ||
510 | ror \$8,$t1 | ||
511 | sub $t2,$acc0 | ||
512 | sub $t3,$acc1 | ||
513 | xor $t0,$s0 | ||
514 | xor $t1,$s1 | ||
515 | |||
516 | and \$0xfefefefe,$r20 | ||
517 | and \$0xfefefefe,$r21 | ||
518 | and \$0x1b1b1b1b,$acc0 | ||
519 | and \$0x1b1b1b1b,$acc1 | ||
520 | mov $s2,$t2 | ||
521 | mov $s3,$t3 | ||
522 | xor $acc0,$r20 | ||
523 | xor $acc1,$r21 | ||
524 | |||
525 | xor $r20,$s2 | ||
526 | xor $r21,$s3 | ||
527 | rol \$24,$s2 | ||
528 | rol \$24,$s3 | ||
529 | xor $r20,$s2 | ||
530 | xor $r21,$s3 | ||
531 | mov 0($sbox),$acc0 # prefetch Te4 | ||
532 | ror \$16,$t2 | ||
533 | ror \$16,$t3 | ||
534 | mov 64($sbox),$acc1 | ||
535 | xor $t2,$s2 | ||
536 | xor $t3,$s3 | ||
537 | mov 128($sbox),$r20 | ||
538 | ror \$8,$t2 | ||
539 | ror \$8,$t3 | ||
540 | mov 192($sbox),$r21 | ||
541 | xor $t2,$s2 | ||
542 | xor $t3,$s3 | ||
543 | ___ | ||
544 | } | ||
545 | |||
# Emit the internal routine _x86_64_AES_encrypt_compact.
#
# The generated code expects the state in $s0..$s3, the key schedule in $key
# and the table base in $sbox.  After touching the table at 32-byte
# intervals (prefetch), each loop iteration XORs the state with the current
# round key, advances $key by 16 and performs the byte substitution.  The
# loop ends when $key equals the pointer stored at 16(%rsp); as seen from
# inside this routine that is the "end of key schedule" value AES_encrypt
# stores at 8(%rsp) before the call pushes the return address.  Otherwise
# the word transform is applied and the loop repeats.  On exit the last
# round key is XORed in and the result is returned in $s0..$s3.
546 | $code.=<<___; | ||
547 | .type _x86_64_AES_encrypt_compact,\@abi-omnipotent | ||
548 | .align 16 | ||
549 | _x86_64_AES_encrypt_compact: | ||
550 | lea 128($sbox),$inp # size optimization | ||
551 | mov 0-128($inp),$acc1 # prefetch Te4 | ||
552 | mov 32-128($inp),$acc2 | ||
553 | mov 64-128($inp),$t0 | ||
554 | mov 96-128($inp),$t1 | ||
555 | mov 128-128($inp),$acc1 | ||
556 | mov 160-128($inp),$acc2 | ||
557 | mov 192-128($inp),$t0 | ||
558 | mov 224-128($inp),$t1 | ||
559 | jmp .Lenc_loop_compact | ||
560 | .align 16 | ||
561 | .Lenc_loop_compact: | ||
562 | xor 0($key),$s0 # xor with key | ||
563 | xor 4($key),$s1 | ||
564 | xor 8($key),$s2 | ||
565 | xor 12($key),$s3 | ||
566 | lea 16($key),$key | ||
567 | ___ | ||
568 | &enccompactvert(); | ||
569 | $code.=<<___; | ||
570 | cmp 16(%rsp),$key | ||
571 | je .Lenc_compact_done | ||
572 | ___ | ||
573 | &enctransform(); | ||
574 | $code.=<<___; | ||
575 | jmp .Lenc_loop_compact | ||
576 | .align 16 | ||
577 | .Lenc_compact_done: | ||
578 | xor 0($key),$s0 | ||
579 | xor 4($key),$s1 | ||
580 | xor 8($key),$s2 | ||
581 | xor 12($key),$s3 | ||
582 | .byte 0xf3,0xc3 # rep ret | ||
583 | .size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact | ||
584 | ___ | ||
585 | |||
586 | # void AES_encrypt (const void *inp,void *out,const AES_KEY *key); | ||
# Emit the public entry point AES_encrypt(inp, out, key); the arguments
# arrive in %rdi, %rsi and %rdx respectively.
#
# The generated code saves %rbx, %rbp and %r12-%r15, then builds a 64-byte
# aligned frame whose position is derived from the key address, laid out as:
#      0(%rsp)  key schedule pointer
#      8(%rsp)  end of key schedule (key + 16*rounds)
#     16(%rsp)  out pointer
#     24(%rsp)  stack pointer as it was after the register pushes
# It loads the four input words into $s0..$s3, selects a table address of
# .LAES_Te+2048 plus a multiple of 256 bytes (0..768) derived from the frame
# address, calls _x86_64_AES_encrypt_compact, stores the result to out and
# restores the saved registers and the stack pointer.
587 | $code.=<<___; | ||
588 | .globl AES_encrypt | ||
589 | .type AES_encrypt,\@function,3 | ||
590 | .align 16 | ||
591 | AES_encrypt: | ||
592 | push %rbx | ||
593 | push %rbp | ||
594 | push %r12 | ||
595 | push %r13 | ||
596 | push %r14 | ||
597 | push %r15 | ||
598 | |||
599 | # allocate frame "above" key schedule | ||
600 | mov %rsp,%r10 | ||
601 | lea -63(%rdx),%rcx # %rdx is key argument | ||
602 | and \$-64,%rsp | ||
603 | sub %rsp,%rcx | ||
604 | neg %rcx | ||
605 | and \$0x3c0,%rcx | ||
606 | sub %rcx,%rsp | ||
607 | sub \$32,%rsp | ||
608 | |||
609 | mov %rsi,16(%rsp) # save out | ||
610 | mov %r10,24(%rsp) # save real stack pointer | ||
611 | .Lenc_prologue: | ||
612 | |||
613 | mov %rdx,$key | ||
614 | mov 240($key),$rnds # load rounds | ||
615 | |||
616 | mov 0(%rdi),$s0 # load input vector | ||
617 | mov 4(%rdi),$s1 | ||
618 | mov 8(%rdi),$s2 | ||
619 | mov 12(%rdi),$s3 | ||
620 | |||
621 | shl \$4,$rnds | ||
622 | lea ($key,$rnds),%rbp | ||
623 | mov $key,(%rsp) # key schedule | ||
624 | mov %rbp,8(%rsp) # end of key schedule | ||
625 | |||
626 | # pick Te4 copy which can't "overlap" with stack frame or key schedule | ||
627 | lea .LAES_Te+2048(%rip),$sbox | ||
628 | lea 768(%rsp),%rbp | ||
629 | sub $sbox,%rbp | ||
630 | and \$0x300,%rbp | ||
631 | lea ($sbox,%rbp),$sbox | ||
632 | |||
633 | call _x86_64_AES_encrypt_compact | ||
634 | |||
635 | mov 16(%rsp),$out # restore out | ||
636 | mov 24(%rsp),%rsi # restore saved stack pointer | ||
637 | mov $s0,0($out) # write output vector | ||
638 | mov $s1,4($out) | ||
639 | mov $s2,8($out) | ||
640 | mov $s3,12($out) | ||
641 | |||
642 | mov (%rsi),%r15 | ||
643 | mov 8(%rsi),%r14 | ||
644 | mov 16(%rsi),%r13 | ||
645 | mov 24(%rsi),%r12 | ||
646 | mov 32(%rsi),%rbp | ||
647 | mov 40(%rsi),%rbx | ||
648 | lea 48(%rsi),%rsp | ||
649 | .Lenc_epilogue: | ||
650 | ret | ||
651 | .size AES_encrypt,.-AES_encrypt | ||
652 | ___ | ||
653 | |||
654 | #------------------------------------------------------------------# | ||
655 | |||
# decvert() - append the assembly for one full decryption round ("vertical"
# variant, all four state words interleaved) to $code.
#
# Same structure as encvert(), but the bytes feeding each output word are
# taken from the state words in the opposite rotation (for $t0: $s0, $s3,
# $s2, $s1).  The emitted code indexes the table at $sbox with 8-byte stride
# and displacements 0, 3, 2 and 1, advances $key by 16 and reloads $s0..$s3
# from the key words at 0..12($key) XORed with the accumulated values.
# %r8d serves as the fourth temporary, so the 64-bit $inp register (%r8) is
# clobbered.  Takes no arguments.
656 | sub decvert() | ||
657 | { my $t3="%r8d"; # zaps $inp! | ||
658 | |||
659 | $code.=<<___; | ||
660 | # favor 3-way issue Opteron pipeline... | ||
661 | movzb `&lo("$s0")`,$acc0 | ||
662 | movzb `&lo("$s1")`,$acc1 | ||
663 | movzb `&lo("$s2")`,$acc2 | ||
664 | mov 0($sbox,$acc0,8),$t0 | ||
665 | mov 0($sbox,$acc1,8),$t1 | ||
666 | mov 0($sbox,$acc2,8),$t2 | ||
667 | |||
668 | movzb `&hi("$s3")`,$acc0 | ||
669 | movzb `&hi("$s0")`,$acc1 | ||
670 | movzb `&lo("$s3")`,$acc2 | ||
671 | xor 3($sbox,$acc0,8),$t0 | ||
672 | xor 3($sbox,$acc1,8),$t1 | ||
673 | mov 0($sbox,$acc2,8),$t3 | ||
674 | |||
675 | movzb `&hi("$s1")`,$acc0 | ||
676 | shr \$16,$s0 | ||
677 | movzb `&hi("$s2")`,$acc2 | ||
678 | xor 3($sbox,$acc0,8),$t2 | ||
679 | shr \$16,$s3 | ||
680 | xor 3($sbox,$acc2,8),$t3 | ||
681 | |||
682 | shr \$16,$s1 | ||
683 | lea 16($key),$key | ||
684 | shr \$16,$s2 | ||
685 | |||
686 | movzb `&lo("$s2")`,$acc0 | ||
687 | movzb `&lo("$s3")`,$acc1 | ||
688 | movzb `&lo("$s0")`,$acc2 | ||
689 | xor 2($sbox,$acc0,8),$t0 | ||
690 | xor 2($sbox,$acc1,8),$t1 | ||
691 | xor 2($sbox,$acc2,8),$t2 | ||
692 | |||
693 | movzb `&hi("$s1")`,$acc0 | ||
694 | movzb `&hi("$s2")`,$acc1 | ||
695 | movzb `&lo("$s1")`,$acc2 | ||
696 | xor 1($sbox,$acc0,8),$t0 | ||
697 | xor 1($sbox,$acc1,8),$t1 | ||
698 | xor 2($sbox,$acc2,8),$t3 | ||
699 | |||
700 | movzb `&hi("$s3")`,$acc0 | ||
701 | mov 12($key),$s3 | ||
702 | movzb `&hi("$s0")`,$acc2 | ||
703 | xor 1($sbox,$acc0,8),$t2 | ||
704 | mov 0($key),$s0 | ||
705 | xor 1($sbox,$acc2,8),$t3 | ||
706 | |||
707 | xor $t0,$s0 | ||
708 | mov 4($key),$s1 | ||
709 | mov 8($key),$s2 | ||
710 | xor $t2,$s2 | ||
711 | xor $t1,$s1 | ||
712 | xor $t3,$s3 | ||
713 | ___ | ||
714 | } | ||
715 | |||
# declastvert() - append the assembly for the final decryption round
# ("vertical" variant) to $code.
#
# The emitted code temporarily advances $sbox by 2048 bytes, looks up every
# state byte with a byte load (1-byte stride) from that address, shifts the
# results into place (8, 16, 24) and XORs them together.  The output words
# are then XORed with the key words at 16+0..16+12($key), and $sbox is
# restored before the end; $key is not advanced.  %r8d serves as the fourth
# temporary, so the 64-bit $inp register (%r8) is clobbered.
# Takes no arguments.
716 | sub declastvert() | ||
717 | { my $t3="%r8d"; # zaps $inp! | ||
718 | |||
719 | $code.=<<___; | ||
720 | lea 2048($sbox),$sbox # size optimization | ||
721 | movzb `&lo("$s0")`,$acc0 | ||
722 | movzb `&lo("$s1")`,$acc1 | ||
723 | movzb `&lo("$s2")`,$acc2 | ||
724 | movzb ($sbox,$acc0,1),$t0 | ||
725 | movzb ($sbox,$acc1,1),$t1 | ||
726 | movzb ($sbox,$acc2,1),$t2 | ||
727 | |||
728 | movzb `&lo("$s3")`,$acc0 | ||
729 | movzb `&hi("$s3")`,$acc1 | ||
730 | movzb `&hi("$s0")`,$acc2 | ||
731 | movzb ($sbox,$acc0,1),$t3 | ||
732 | movzb ($sbox,$acc1,1),$acc1 #$t0 | ||
733 | movzb ($sbox,$acc2,1),$acc2 #$t1 | ||
734 | |||
735 | shl \$8,$acc1 | ||
736 | shl \$8,$acc2 | ||
737 | |||
738 | xor $acc1,$t0 | ||
739 | xor $acc2,$t1 | ||
740 | shr \$16,$s3 | ||
741 | |||
742 | movzb `&hi("$s1")`,$acc0 | ||
743 | movzb `&hi("$s2")`,$acc1 | ||
744 | shr \$16,$s0 | ||
745 | movzb ($sbox,$acc0,1),$acc0 #$t2 | ||
746 | movzb ($sbox,$acc1,1),$acc1 #$t3 | ||
747 | |||
748 | shl \$8,$acc0 | ||
749 | shl \$8,$acc1 | ||
750 | shr \$16,$s1 | ||
751 | xor $acc0,$t2 | ||
752 | xor $acc1,$t3 | ||
753 | shr \$16,$s2 | ||
754 | |||
755 | movzb `&lo("$s2")`,$acc0 | ||
756 | movzb `&lo("$s3")`,$acc1 | ||
757 | movzb `&lo("$s0")`,$acc2 | ||
758 | movzb ($sbox,$acc0,1),$acc0 #$t0 | ||
759 | movzb ($sbox,$acc1,1),$acc1 #$t1 | ||
760 | movzb ($sbox,$acc2,1),$acc2 #$t2 | ||
761 | |||
762 | shl \$16,$acc0 | ||
763 | shl \$16,$acc1 | ||
764 | shl \$16,$acc2 | ||
765 | |||
766 | xor $acc0,$t0 | ||
767 | xor $acc1,$t1 | ||
768 | xor $acc2,$t2 | ||
769 | |||
770 | movzb `&lo("$s1")`,$acc0 | ||
771 | movzb `&hi("$s1")`,$acc1 | ||
772 | movzb `&hi("$s2")`,$acc2 | ||
773 | movzb ($sbox,$acc0,1),$acc0 #$t3 | ||
774 | movzb ($sbox,$acc1,1),$acc1 #$t0 | ||
775 | movzb ($sbox,$acc2,1),$acc2 #$t1 | ||
776 | |||
777 | shl \$16,$acc0 | ||
778 | shl \$24,$acc1 | ||
779 | shl \$24,$acc2 | ||
780 | |||
781 | xor $acc0,$t3 | ||
782 | xor $acc1,$t0 | ||
783 | xor $acc2,$t1 | ||
784 | |||
785 | movzb `&hi("$s3")`,$acc0 | ||
786 | movzb `&hi("$s0")`,$acc1 | ||
787 | mov 16+12($key),$s3 | ||
788 | movzb ($sbox,$acc0,1),$acc0 #$t2 | ||
789 | movzb ($sbox,$acc1,1),$acc1 #$t3 | ||
790 | mov 16+0($key),$s0 | ||
791 | |||
792 | shl \$24,$acc0 | ||
793 | shl \$24,$acc1 | ||
794 | |||
795 | xor $acc0,$t2 | ||
796 | xor $acc1,$t3 | ||
797 | |||
798 | mov 16+4($key),$s1 | ||
799 | mov 16+8($key),$s2 | ||
800 | lea -2048($sbox),$sbox | ||
801 | xor $t0,$s0 | ||
802 | xor $t1,$s1 | ||
803 | xor $t2,$s2 | ||
804 | xor $t3,$s3 | ||
805 | ___ | ||
806 | } | ||
807 | |||
# decstep($i,@s) - append the assembly that computes output word $i (0..3)
# of one decryption round to $code.  This is the non-vertical alternative to
# decvert() and is only used when $verticalspin is false.
#
#   $i  index of the output word
#   @s  the four state registers, ordered so that $s[0] supplies the low
#       byte, $s[1] the second byte, $s[2] the third and $s[3] the top byte
#
# For $i = 0..2 the result is built in $t0..$t2 from copies of the state
# registers.  For $i == 3 the result is built in $s[0] itself, $s[1..3] are
# consumed in place as scratch, and $t2, $t1, $t0 are then moved into $s[1],
# $s[2], $s[3].  No key material is touched here; the caller emits the key
# pointer update and the key XOR after the fourth step.
808 | sub decstep() | ||
809 | { my ($i,@s) = @_; | ||
810 | my $tmp0=$acc0; | ||
811 | my $tmp1=$acc1; | ||
812 | my $tmp2=$acc2; | ||
# destination register; this lexical shadows the global $out
813 | my $out=($t0,$t1,$t2,$s[0])[$i]; | ||
814 | |||
815 | $code.=" mov $s[0],$out\n" if ($i!=3); | ||
816 | $tmp1=$s[2] if ($i==3); | ||
817 | $code.=" mov $s[2],$tmp1\n" if ($i!=3); | ||
818 | $code.=" and \$0xFF,$out\n"; | ||
819 | |||
820 | $code.=" mov 0($sbox,$out,8),$out\n"; | ||
821 | $code.=" shr \$16,$tmp1\n"; | ||
822 | $tmp2=$s[3] if ($i==3); | ||
823 | $code.=" mov $s[3],$tmp2\n" if ($i!=3); | ||
824 | |||
825 | $tmp0=$s[1] if ($i==3); | ||
826 | $code.=" movzb ".&hi($s[1]).",$tmp0\n"; | ||
827 | $code.=" and \$0xFF,$tmp1\n"; | ||
828 | $code.=" shr \$24,$tmp2\n"; | ||
829 | |||
830 | $code.=" xor 3($sbox,$tmp0,8),$out\n"; | ||
831 | $code.=" xor 2($sbox,$tmp1,8),$out\n"; | ||
832 | $code.=" xor 1($sbox,$tmp2,8),$out\n"; | ||
833 | |||
# last step: move the three buffered words back into the state registers
834 | $code.=" mov $t2,$s[1]\n" if ($i==3); | ||
835 | $code.=" mov $t1,$s[2]\n" if ($i==3); | ||
836 | $code.=" mov $t0,$s[3]\n" if ($i==3); | ||
837 | $code.="\n"; | ||
838 | } | ||
839 | |||
# declast($i,@s) - append the assembly that computes output word $i (0..3)
# of the final decryption round to $code.  This is the non-vertical
# alternative to declastvert() and is only used when $verticalspin is false.
#
#   $i  index of the output word
#   @s  the four state registers, ordered as for decstep()
#
# Every byte is looked up with a byte load at displacement 2048 from $sbox
# (1-byte stride) and shifted into place by 8, 16 or 24 bits.  No key
# material is touched here; the caller emits the key XOR after the fourth
# step.  Register use for $i == 3 versus $i = 0..2 follows decstep().
840 | sub declast() | ||
841 | { my ($i,@s)=@_; | ||
842 | my $tmp0=$acc0; | ||
843 | my $tmp1=$acc1; | ||
844 | my $tmp2=$acc2; | ||
# destination register; this lexical shadows the global $out
845 | my $out=($t0,$t1,$t2,$s[0])[$i]; | ||
846 | |||
847 | $code.=" mov $s[0],$out\n" if ($i!=3); | ||
848 | $tmp1=$s[2] if ($i==3); | ||
849 | $code.=" mov $s[2],$tmp1\n" if ($i!=3); | ||
850 | $code.=" and \$0xFF,$out\n"; | ||
851 | |||
852 | $code.=" movzb 2048($sbox,$out,1),$out\n"; | ||
853 | $code.=" shr \$16,$tmp1\n"; | ||
854 | $tmp2=$s[3] if ($i==3); | ||
855 | $code.=" mov $s[3],$tmp2\n" if ($i!=3); | ||
856 | |||
857 | $tmp0=$s[1] if ($i==3); | ||
858 | $code.=" movzb ".&hi($s[1]).",$tmp0\n"; | ||
859 | $code.=" and \$0xFF,$tmp1\n"; | ||
860 | $code.=" shr \$24,$tmp2\n"; | ||
861 | |||
862 | $code.=" movzb 2048($sbox,$tmp0,1),$tmp0\n"; | ||
863 | $code.=" movzb 2048($sbox,$tmp1,1),$tmp1\n"; | ||
864 | $code.=" movzb 2048($sbox,$tmp2,1),$tmp2\n"; | ||
865 | |||
866 | $code.=" shl \$8,$tmp0\n"; | ||
867 | $code.=" shl \$16,$tmp1\n"; | ||
868 | $code.=" shl \$24,$tmp2\n"; | ||
869 | |||
# last step: the moves of $t2, $t1, $t0 back into the state registers are
# interleaved with the final XORs
870 | $code.=" xor $tmp0,$out\n"; | ||
871 | $code.=" mov $t2,$s[1]\n" if ($i==3); | ||
872 | $code.=" xor $tmp1,$out\n"; | ||
873 | $code.=" mov $t1,$s[2]\n" if ($i==3); | ||
874 | $code.=" xor $tmp2,$out\n"; | ||
875 | $code.=" mov $t0,$s[3]\n" if ($i==3); | ||
876 | $code.="\n"; | ||
877 | } | ||
878 | |||
# Emit the internal routine _x86_64_AES_decrypt.
#
# The generated code expects the state in $s0..$s3, the key schedule in $key
# and the table base in $sbox.  It XORs the state with the first four key
# words, loads the round count from 240($key), runs rounds-1 iterations of
# the round body at .Ldec_loop, then the final round, and returns with the
# result in $s0..$s3.
879 | $code.=<<___; | ||
880 | .type _x86_64_AES_decrypt,\@abi-omnipotent | ||
881 | .align 16 | ||
882 | _x86_64_AES_decrypt: | ||
883 | xor 0($key),$s0 # xor with key | ||
884 | xor 4($key),$s1 | ||
885 | xor 8($key),$s2 | ||
886 | xor 12($key),$s3 | ||
887 | |||
888 | mov 240($key),$rnds # load key->rounds | ||
889 | sub \$1,$rnds | ||
890 | jmp .Ldec_loop | ||
891 | .align 16 | ||
892 | .Ldec_loop: | ||
893 | ___ | ||
# Round body.  decvert() advances $key and applies the round key itself; the
# per-word variant needs the explicit key update and XOR emitted below.
894 | if ($verticalspin) { &decvert(); } | ||
895 | else { &decstep(0,$s0,$s3,$s2,$s1); | ||
896 | &decstep(1,$s1,$s0,$s3,$s2); | ||
897 | &decstep(2,$s2,$s1,$s0,$s3); | ||
898 | &decstep(3,$s3,$s2,$s1,$s0); | ||
899 | $code.=<<___; | ||
900 | lea 16($key),$key | ||
901 | xor 0($key),$s0 # xor with key | ||
902 | xor 4($key),$s1 | ||
903 | xor 8($key),$s2 | ||
904 | xor 12($key),$s3 | ||
905 | ___ | ||
906 | } | ||
907 | $code.=<<___; | ||
908 | sub \$1,$rnds | ||
909 | jnz .Ldec_loop | ||
910 | ___ | ||
# Final round.  declastvert() applies the last round key itself; the
# per-word variant needs the explicit key XOR emitted below.
911 | if ($verticalspin) { &declastvert(); } | ||
912 | else { &declast(0,$s0,$s3,$s2,$s1); | ||
913 | &declast(1,$s1,$s0,$s3,$s2); | ||
914 | &declast(2,$s2,$s1,$s0,$s3); | ||
915 | &declast(3,$s3,$s2,$s1,$s0); | ||
916 | $code.=<<___; | ||
917 | xor 16+0($key),$s0 # xor with key | ||
918 | xor 16+4($key),$s1 | ||
919 | xor 16+8($key),$s2 | ||
920 | xor 16+12($key),$s3 | ||
921 | ___ | ||
922 | } | ||
923 | $code.=<<___; | ||
924 | .byte 0xf3,0xc3 # rep ret | ||
925 | .size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt | ||
926 | ___ | ||
927 | |||
# deccompactvert() - append the byte-substitution step of the compact
# decryption path to $code.
#
# Same structure as enccompactvert(): every byte of $s0..$s3 is looked up
# with a byte load from the table at $sbox (1-byte stride) and the four
# output words are reassembled with shifts by 8, 16 and 24, leaving the
# result in $s0..$s3.  The bytes feeding each output word come from the
# state words in the opposite rotation (for $t0: $s0, $s3, $s2, $s1), as
# indicated by the #$tN remarks in the emitted code.  %r8d, %r9d and %r13d
# serve as extra temporaries, so $inp (%r8), $out (%r9) and $rnds (%r13d)
# are clobbered.  Takes no arguments.
928 | sub deccompactvert() | ||
929 | { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); | ||
930 | |||
931 | $code.=<<___; | ||
932 | movzb `&lo("$s0")`,$t0 | ||
933 | movzb `&lo("$s1")`,$t1 | ||
934 | movzb `&lo("$s2")`,$t2 | ||
935 | movzb ($sbox,$t0,1),$t0 | ||
936 | movzb ($sbox,$t1,1),$t1 | ||
937 | movzb ($sbox,$t2,1),$t2 | ||
938 | |||
939 | movzb `&lo("$s3")`,$t3 | ||
940 | movzb `&hi("$s3")`,$acc0 | ||
941 | movzb `&hi("$s0")`,$acc1 | ||
942 | movzb ($sbox,$t3,1),$t3 | ||
943 | movzb ($sbox,$acc0,1),$t4 #$t0 | ||
944 | movzb ($sbox,$acc1,1),$t5 #$t1 | ||
945 | |||
946 | movzb `&hi("$s1")`,$acc2 | ||
947 | movzb `&hi("$s2")`,$acc0 | ||
948 | shr \$16,$s2 | ||
949 | movzb ($sbox,$acc2,1),$acc2 #$t2 | ||
950 | movzb ($sbox,$acc0,1),$acc0 #$t3 | ||
951 | shr \$16,$s3 | ||
952 | |||
953 | movzb `&lo("$s2")`,$acc1 | ||
954 | shl \$8,$t4 | ||
955 | shl \$8,$t5 | ||
956 | movzb ($sbox,$acc1,1),$acc1 #$t0 | ||
957 | xor $t4,$t0 | ||
958 | xor $t5,$t1 | ||
959 | |||
960 | movzb `&lo("$s3")`,$t4 | ||
961 | shr \$16,$s0 | ||
962 | shr \$16,$s1 | ||
963 | movzb `&lo("$s0")`,$t5 | ||
964 | shl \$8,$acc2 | ||
965 | shl \$8,$acc0 | ||
966 | movzb ($sbox,$t4,1),$t4 #$t1 | ||
967 | movzb ($sbox,$t5,1),$t5 #$t2 | ||
968 | xor $acc2,$t2 | ||
969 | xor $acc0,$t3 | ||
970 | |||
971 | movzb `&lo("$s1")`,$acc2 | ||
972 | movzb `&hi("$s1")`,$acc0 | ||
973 | shl \$16,$acc1 | ||
974 | movzb ($sbox,$acc2,1),$acc2 #$t3 | ||
975 | movzb ($sbox,$acc0,1),$acc0 #$t0 | ||
976 | xor $acc1,$t0 | ||
977 | |||
978 | movzb `&hi("$s2")`,$acc1 | ||
979 | shl \$16,$t4 | ||
980 | shl \$16,$t5 | ||
981 | movzb ($sbox,$acc1,1),$s1 #$t1 | ||
982 | xor $t4,$t1 | ||
983 | xor $t5,$t2 | ||
984 | |||
985 | movzb `&hi("$s3")`,$acc1 | ||
986 | shr \$8,$s0 | ||
987 | shl \$16,$acc2 | ||
988 | movzb ($sbox,$acc1,1),$s2 #$t2 | ||
989 | movzb ($sbox,$s0,1),$s3 #$t3 | ||
990 | xor $acc2,$t3 | ||
991 | |||
992 | shl \$24,$acc0 | ||
993 | shl \$24,$s1 | ||
994 | shl \$24,$s2 | ||
995 | xor $acc0,$t0 | ||
996 | shl \$24,$s3 | ||
997 | xor $t1,$s1 | ||
998 | mov $t0,$s0 | ||
999 | xor $t2,$s2 | ||
1000 | xor $t3,$s3 | ||
1001 | ___ | ||
1002 | } | ||
1003 | |||
# dectransform([$prefetch]) -- append to $code the column transform used
# between decryption rounds and on the decryption key schedule.
#
# Four 32-bit words are processed at once, two per 64-bit register; the
# "...0" names are the %rax lane (s1.s0) and the "...8" names the %rcx lane
# (s3.s2).  Three doubling stages derive tp2, tp4 and tp8 from tp1 with the
# $mask80/$maskfe/$mask1b registers (named outside this chunk; deckey_ref
# does the same steps with 0x80808080/0xfefefefe/0x1b1b1b1b immediates, so
# these presumably hold the byte-replicated 64-bit equivalents -- confirm).
# Each 32-bit word of the result is then
#   (tp8^tp4^tp2) ^ ROTATE(tp1^tp8,8) ^ ROTATE(tp2^tp1^tp8,24)
#                 ^ ROTATE(tp4^tp1^tp8,16)
# i.e. byte multipliers 0x0e, 0x09, 0x0b and 0x0d, the AES InvMixColumns
# coefficients.
#
# $prefetch (optional): when true, five loads from 0/64/128/192/256($sbox)
# are interleaved with the final rotates.  They overwrite the three mask
# registers and the tp8 scratch registers, so a caller passing a true value
# must reload the masks before the next call (.Ldec_loop_compact does).
#
# Clobbers %r8-%r13 in addition to %rax-%rdx.  The empty prototype is
# bypassed because callers use the &name(...) call form.  The backtick
# expressions are presumably evaluated by a later pass over $code (not
# visible in this chunk).
1004 | # parallelized version! input is pair of 64-bit values: %rax=s1.s0 | ||
1005 | # and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1, | ||
1006 | # %ecx=s2 and %edx=s3. | ||
1007 | sub dectransform() | ||
1008 | { my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx"); | ||
1009 | my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx"); | ||
1010 | my $prefetch = shift; | ||
1011 | |||
# Stage 1-3: tp2, tp4, tp8; then combine and split into %eax..%edx.
1012 | $code.=<<___; | ||
1013 | mov $tp10,$acc0 | ||
1014 | mov $tp18,$acc8 | ||
1015 | and $mask80,$acc0 | ||
1016 | and $mask80,$acc8 | ||
1017 | mov $acc0,$tp40 | ||
1018 | mov $acc8,$tp48 | ||
1019 | shr \$7,$tp40 | ||
1020 | lea ($tp10,$tp10),$tp20 | ||
1021 | shr \$7,$tp48 | ||
1022 | lea ($tp18,$tp18),$tp28 | ||
1023 | sub $tp40,$acc0 | ||
1024 | sub $tp48,$acc8 | ||
1025 | and $maskfe,$tp20 | ||
1026 | and $maskfe,$tp28 | ||
1027 | and $mask1b,$acc0 | ||
1028 | and $mask1b,$acc8 | ||
1029 | xor $tp20,$acc0 | ||
1030 | xor $tp28,$acc8 | ||
1031 | mov $acc0,$tp20 | ||
1032 | mov $acc8,$tp28 | ||
1033 | |||
1034 | and $mask80,$acc0 | ||
1035 | and $mask80,$acc8 | ||
1036 | mov $acc0,$tp80 | ||
1037 | mov $acc8,$tp88 | ||
1038 | shr \$7,$tp80 | ||
1039 | lea ($tp20,$tp20),$tp40 | ||
1040 | shr \$7,$tp88 | ||
1041 | lea ($tp28,$tp28),$tp48 | ||
1042 | sub $tp80,$acc0 | ||
1043 | sub $tp88,$acc8 | ||
1044 | and $maskfe,$tp40 | ||
1045 | and $maskfe,$tp48 | ||
1046 | and $mask1b,$acc0 | ||
1047 | and $mask1b,$acc8 | ||
1048 | xor $tp40,$acc0 | ||
1049 | xor $tp48,$acc8 | ||
1050 | mov $acc0,$tp40 | ||
1051 | mov $acc8,$tp48 | ||
1052 | |||
1053 | and $mask80,$acc0 | ||
1054 | and $mask80,$acc8 | ||
1055 | mov $acc0,$tp80 | ||
1056 | mov $acc8,$tp88 | ||
1057 | shr \$7,$tp80 | ||
1058 | xor $tp10,$tp20 # tp2^=tp1 | ||
1059 | shr \$7,$tp88 | ||
1060 | xor $tp18,$tp28 # tp2^=tp1 | ||
1061 | sub $tp80,$acc0 | ||
1062 | sub $tp88,$acc8 | ||
1063 | lea ($tp40,$tp40),$tp80 | ||
1064 | lea ($tp48,$tp48),$tp88 | ||
1065 | xor $tp10,$tp40 # tp4^=tp1 | ||
1066 | xor $tp18,$tp48 # tp4^=tp1 | ||
1067 | and $maskfe,$tp80 | ||
1068 | and $maskfe,$tp88 | ||
1069 | and $mask1b,$acc0 | ||
1070 | and $mask1b,$acc8 | ||
1071 | xor $acc0,$tp80 | ||
1072 | xor $acc8,$tp88 | ||
1073 | |||
1074 | xor $tp80,$tp10 # tp1^=tp8 | ||
1075 | xor $tp88,$tp18 # tp1^=tp8 | ||
1076 | xor $tp80,$tp20 # tp2^tp1^=tp8 | ||
1077 | xor $tp88,$tp28 # tp2^tp1^=tp8 | ||
1078 | mov $tp10,$acc0 | ||
1079 | mov $tp18,$acc8 | ||
1080 | xor $tp80,$tp40 # tp4^tp1^=tp8 | ||
1081 | xor $tp88,$tp48 # tp4^tp1^=tp8 | ||
1082 | shr \$32,$acc0 | ||
1083 | shr \$32,$acc8 | ||
1084 | xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1 | ||
1085 | xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1 | ||
1086 | rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8) | ||
1087 | rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8) | ||
1088 | xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 | ||
1089 | xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 | ||
1090 | |||
1091 | rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8) | ||
1092 | rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8) | ||
1093 | xor `&LO("$tp80")`,`&LO("$tp10")` | ||
1094 | xor `&LO("$tp88")`,`&LO("$tp18")` | ||
1095 | shr \$32,$tp80 | ||
1096 | shr \$32,$tp88 | ||
1097 | xor `&LO("$tp80")`,`&LO("$acc0")` | ||
1098 | xor `&LO("$tp88")`,`&LO("$acc8")` | ||
1099 | |||
1100 | mov $tp20,$tp80 | ||
1101 | mov $tp28,$tp88 | ||
1102 | shr \$32,$tp80 | ||
1103 | shr \$32,$tp88 | ||
1104 | rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24) | ||
1105 | rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24) | ||
1106 | rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24) | ||
1107 | rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24) | ||
1108 | xor `&LO("$tp20")`,`&LO("$tp10")` | ||
1109 | xor `&LO("$tp28")`,`&LO("$tp18")` | ||
1110 | mov $tp40,$tp20 | ||
1111 | mov $tp48,$tp28 | ||
1112 | xor `&LO("$tp80")`,`&LO("$acc0")` | ||
1113 | xor `&LO("$tp88")`,`&LO("$acc8")` | ||
1114 | |||
1115 | `"mov 0($sbox),$mask80" if ($prefetch)` | ||
1116 | shr \$32,$tp20 | ||
1117 | shr \$32,$tp28 | ||
1118 | `"mov 64($sbox),$maskfe" if ($prefetch)` | ||
1119 | rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16) | ||
1120 | rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16) | ||
1121 | `"mov 128($sbox),$mask1b" if ($prefetch)` | ||
1122 | rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16) | ||
1123 | rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16) | ||
1124 | `"mov 192($sbox),$tp80" if ($prefetch)` | ||
1125 | xor `&LO("$tp40")`,`&LO("$tp10")` | ||
1126 | xor `&LO("$tp48")`,`&LO("$tp18")` | ||
1127 | `"mov 256($sbox),$tp88" if ($prefetch)` | ||
1128 | xor `&LO("$tp20")`,`&LO("$acc0")` | ||
1129 | xor `&LO("$tp28")`,`&LO("$acc8")` | ||
1130 | ___ | ||
1131 | } | ||
1132 | |||
# _x86_64_AES_decrypt_compact -- internal single-block decryption routine
# built on the compact (byte-wide) Td4 table addressed through $sbox.
#
# Contract with the caller, as far as this chunk shows:
#   - state in $s0..$s3, key schedule pointer in $key, table in $sbox
#     (these Perl variables are bound to registers outside this chunk);
#   - 16(%rsp) inside the routine must hold the address of the last round
#     key.  Callers store it at 8(%rsp) of their own frame; the extra 8
#     bytes are the return address pushed by `call`.
#
# The eight loads at the top touch the 256-byte table at a 32-byte stride
# and discard the values (prefetch).  Every loop pass xors in a round key,
# advances $key by 16 and runs one compact round; unless that was the last
# round, the four words are packed into %rax/%rcx, the three masks are
# reloaded from 256+0/8/16($sbox) and the column transform is applied.
# The final round key is xored in at .Ldec_compact_done.
1133 | $code.=<<___; | ||
1134 | .type _x86_64_AES_decrypt_compact,\@abi-omnipotent | ||
1135 | .align 16 | ||
1136 | _x86_64_AES_decrypt_compact: | ||
1137 | lea 128($sbox),$inp # size optimization | ||
1138 | mov 0-128($inp),$acc1 # prefetch Td4 | ||
1139 | mov 32-128($inp),$acc2 | ||
1140 | mov 64-128($inp),$t0 | ||
1141 | mov 96-128($inp),$t1 | ||
1142 | mov 128-128($inp),$acc1 | ||
1143 | mov 160-128($inp),$acc2 | ||
1144 | mov 192-128($inp),$t0 | ||
1145 | mov 224-128($inp),$t1 | ||
1146 | jmp .Ldec_loop_compact | ||
1147 | |||
1148 | .align 16 | ||
1149 | .Ldec_loop_compact: | ||
1150 | xor 0($key),$s0 # xor with key | ||
1151 | xor 4($key),$s1 | ||
1152 | xor 8($key),$s2 | ||
1153 | xor 12($key),$s3 | ||
1154 | lea 16($key),$key | ||
1155 | ___ | ||
# One compact round on $s0..$s3; the generator is defined earlier in the
# file (only its tail is visible in this chunk).
1156 | &deccompactvert(); | ||
1157 | $code.=<<___; | ||
1158 | cmp 16(%rsp),$key | ||
1159 | je .Ldec_compact_done | ||
1160 | |||
1161 | mov 256+0($sbox),$mask80 | ||
1162 | shl \$32,%rbx | ||
1163 | shl \$32,%rdx | ||
1164 | mov 256+8($sbox),$maskfe | ||
1165 | or %rbx,%rax | ||
1166 | or %rdx,%rcx | ||
1167 | mov 256+16($sbox),$mask1b | ||
1168 | ___ | ||
# Column transform on the packed state, with the prefetch loads enabled
# (which is why the masks are reloaded above on every pass).
1169 | &dectransform(1); | ||
1170 | $code.=<<___; | ||
1171 | jmp .Ldec_loop_compact | ||
1172 | .align 16 | ||
1173 | .Ldec_compact_done: | ||
1174 | xor 0($key),$s0 | ||
1175 | xor 4($key),$s1 | ||
1176 | xor 8($key),$s2 | ||
1177 | xor 12($key),$s3 | ||
1178 | .byte 0xf3,0xc3 # rep ret | ||
1179 | .size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact | ||
1180 | ___ | ||
1181 | |||
1182 | # void AES_decrypt (const void *inp,void *out,const AES_KEY *key); | ||
# Public single-block decryption entry point.  On entry inp=%rdi, out=%rsi
# and key=%rdx (see the loads and saves below).
#
# Prologue: the six callee-saved registers are pushed, then %rsp is aligned
# down to 64 bytes and lowered by ((%rsp - key + 63) & 0x3c0) + 32, which
# places the frame at a fixed relation to the key schedule modulo 1024.
# NOTE(review): presumably this is the "avoid cache aliasing between stack
# frame and key schedule" measure mentioned in the file header -- confirm.
#
# Frame after the prologue:
#    0(%rsp)  key schedule pointer
#    8(%rsp)  key + 16*rounds, rounds being read from 240(key); the compact
#             routine compares $key against this slot to detect the end
#   16(%rsp)  out
#   24(%rsp)  %rsp as it was right after the pushes (used by the epilogue)
#
# Table selection: starting at .LAES_Td+2048, $sbox is advanced by
# k*0x100 + k*0x20 with k*0x100 = ((%rsp+768) - (.LAES_Td+2048)) & 0x300,
# i.e. one of four copies laid out 288 bytes apart (256 table bytes followed
# by the constants that the compact routine loads from 256+0/8/16($sbox)).
1183 | $code.=<<___; | ||
1184 | .globl AES_decrypt | ||
1185 | .type AES_decrypt,\@function,3 | ||
1186 | .align 16 | ||
1187 | AES_decrypt: | ||
1188 | push %rbx | ||
1189 | push %rbp | ||
1190 | push %r12 | ||
1191 | push %r13 | ||
1192 | push %r14 | ||
1193 | push %r15 | ||
1194 | |||
1195 | # allocate frame "above" key schedule | ||
1196 | mov %rsp,%r10 | ||
1197 | lea -63(%rdx),%rcx # %rdx is key argument | ||
1198 | and \$-64,%rsp | ||
1199 | sub %rsp,%rcx | ||
1200 | neg %rcx | ||
1201 | and \$0x3c0,%rcx | ||
1202 | sub %rcx,%rsp | ||
1203 | sub \$32,%rsp | ||
1204 | |||
1205 | mov %rsi,16(%rsp) # save out | ||
1206 | mov %r10,24(%rsp) # save real stack pointer | ||
1207 | .Ldec_prologue: | ||
1208 | |||
1209 | mov %rdx,$key | ||
1210 | mov 240($key),$rnds # load rounds | ||
1211 | |||
1212 | mov 0(%rdi),$s0 # load input vector | ||
1213 | mov 4(%rdi),$s1 | ||
1214 | mov 8(%rdi),$s2 | ||
1215 | mov 12(%rdi),$s3 | ||
1216 | |||
1217 | shl \$4,$rnds | ||
1218 | lea ($key,$rnds),%rbp | ||
1219 | mov $key,(%rsp) # key schedule | ||
1220 | mov %rbp,8(%rsp) # end of key schedule | ||
1221 | |||
1222 | # pick Td4 copy which can't "overlap" with stack frame or key schedule | ||
1223 | lea .LAES_Td+2048(%rip),$sbox | ||
1224 | lea 768(%rsp),%rbp | ||
1225 | sub $sbox,%rbp | ||
1226 | and \$0x300,%rbp | ||
1227 | lea ($sbox,%rbp),$sbox | ||
1228 | shr \$3,%rbp # recall "magic" constants! | ||
1229 | add %rbp,$sbox | ||
1230 | |||
1231 | call _x86_64_AES_decrypt_compact | ||
1232 | |||
1233 | mov 16(%rsp),$out # restore out | ||
1234 | mov 24(%rsp),%rsi # restore saved stack pointer | ||
1235 | mov $s0,0($out) # write output vector | ||
1236 | mov $s1,4($out) | ||
1237 | mov $s2,8($out) | ||
1238 | mov $s3,12($out) | ||
1239 | |||
1240 | mov (%rsi),%r15 | ||
1241 | mov 8(%rsi),%r14 | ||
1242 | mov 16(%rsi),%r13 | ||
1243 | mov 24(%rsi),%r12 | ||
1244 | mov 32(%rsi),%rbp | ||
1245 | mov 40(%rsi),%rbx | ||
1246 | lea 48(%rsi),%rsp | ||
1247 | .Ldec_epilogue: | ||
1248 | ret | ||
1249 | .size AES_decrypt,.-AES_decrypt | ||
1250 | ___ | ||
1251 | #------------------------------------------------------------------# | ||
1252 | |||
# enckey() -- append one key-expansion core step to $code.
#
# Register contract of the emitted code:
#   %edx  word to substitute (the last word of the previous round key)
#   %eax  word being updated
#   %rcx  index into the round-constant table
#   %rbp  byte table base + 128 (the key-setup routine points it at
#         .LAES_Te+2048+128)
# Effect, with b0..b3 the bytes of %edx from least to most significant and
# S the byte table at -128(%rbp):
#   %eax ^= S[b0]<<24 ^ S[b1] ^ S[b2]<<8 ^ S[b3]<<16 ^ rcon[%rcx]
# where rcon is the dword table at 1024-128(%rbp).
# Clobbers %ebx and %esi; %edx is left shifted right by 16.
1253 | sub enckey() | ||
1254 | { | ||
1255 | $code.=<<___; | ||
1256 | movz %dl,%esi # rk[i]>>0 | ||
1257 | movzb -128(%rbp,%rsi),%ebx | ||
1258 | movz %dh,%esi # rk[i]>>8 | ||
1259 | shl \$24,%ebx | ||
1260 | xor %ebx,%eax | ||
1261 | |||
1262 | movzb -128(%rbp,%rsi),%ebx | ||
1263 | shr \$16,%edx | ||
1264 | movz %dl,%esi # rk[i]>>16 | ||
1265 | xor %ebx,%eax | ||
1266 | |||
1267 | movzb -128(%rbp,%rsi),%ebx | ||
1268 | movz %dh,%esi # rk[i]>>24 | ||
1269 | shl \$8,%ebx | ||
1270 | xor %ebx,%eax | ||
1271 | |||
1272 | movzb -128(%rbp,%rsi),%ebx | ||
1273 | shl \$16,%ebx | ||
1274 | xor %ebx,%eax | ||
1275 | |||
1276 | xor 1024-128(%rbp,%rcx,4),%eax # rcon | ||
1277 | ___ | ||
1278 | } | ||
1279 | |||
1280 | # int AES_set_encrypt_key(const unsigned char *userKey, const int bits, | ||
1281 | # AES_KEY *key) | ||
# Two symbols are emitted here.
#
# AES_set_encrypt_key (public wrapper): pushes the six callee-saved
# registers plus 8 bytes of padding (56 bytes in total), calls the internal
# routine and restores the registers from 8..48(%rsp).
#
# _x86_64_AES_set_encrypt_key (internal): on entry userKey=%rdi, bits=%esi,
# key=%rdx.  Result in %rax:
#    0  success
#   -1  userKey or key is NULL
#   -2  bits is not 128, 192 or 256
# The user key is copied with 64-bit loads and the schedule is extended in
# place while %rdi walks forward.  The round count (10/12/14) is stored at
# the end of each path; all three stores resolve to offset 240 from the
# original key pointer (80+10*16, 72+7*24, 48+6*32).
#
# Loop shapes: 128-bit runs 10 passes producing 4 words each; 192-bit
# produces 6 words per pass and leaves after the first 4 words of the 8th
# pass; 256-bit produces 8 words per pass and leaves after the first 4 words
# of the 7th pass.  The 256-bit path has a second substitution step between
# the two halves that applies the byte table without rotation or round
# constant.
#
# NOTE(review): inside .L14loop the load from 28(%rdi) is annotated
# "rk[4]"; byte offset 28 is word index 7.  The annotation lives in emitted
# text and is left untouched here.
1282 | $code.=<<___; | ||
1283 | .globl AES_set_encrypt_key | ||
1284 | .type AES_set_encrypt_key,\@function,3 | ||
1285 | .align 16 | ||
1286 | AES_set_encrypt_key: | ||
1287 | push %rbx | ||
1288 | push %rbp | ||
1289 | push %r12 # redundant, but allows to share | ||
1290 | push %r13 # exception handler... | ||
1291 | push %r14 | ||
1292 | push %r15 | ||
1293 | sub \$8,%rsp | ||
1294 | .Lenc_key_prologue: | ||
1295 | |||
1296 | call _x86_64_AES_set_encrypt_key | ||
1297 | |||
1298 | mov 8(%rsp),%r15 | ||
1299 | mov 16(%rsp),%r14 | ||
1300 | mov 24(%rsp),%r13 | ||
1301 | mov 32(%rsp),%r12 | ||
1302 | mov 40(%rsp),%rbp | ||
1303 | mov 48(%rsp),%rbx | ||
1304 | add \$56,%rsp | ||
1305 | .Lenc_key_epilogue: | ||
1306 | ret | ||
1307 | .size AES_set_encrypt_key,.-AES_set_encrypt_key | ||
1308 | |||
1309 | .type _x86_64_AES_set_encrypt_key,\@abi-omnipotent | ||
1310 | .align 16 | ||
1311 | _x86_64_AES_set_encrypt_key: | ||
1312 | mov %esi,%ecx # %ecx=bits | ||
1313 | mov %rdi,%rsi # %rsi=userKey | ||
1314 | mov %rdx,%rdi # %rdi=key | ||
1315 | |||
1316 | test \$-1,%rsi | ||
1317 | jz .Lbadpointer | ||
1318 | test \$-1,%rdi | ||
1319 | jz .Lbadpointer | ||
1320 | |||
1321 | lea .LAES_Te(%rip),%rbp | ||
1322 | lea 2048+128(%rbp),%rbp | ||
1323 | |||
1324 | # prefetch Te4 | ||
1325 | mov 0-128(%rbp),%eax | ||
1326 | mov 32-128(%rbp),%ebx | ||
1327 | mov 64-128(%rbp),%r8d | ||
1328 | mov 96-128(%rbp),%edx | ||
1329 | mov 128-128(%rbp),%eax | ||
1330 | mov 160-128(%rbp),%ebx | ||
1331 | mov 192-128(%rbp),%r8d | ||
1332 | mov 224-128(%rbp),%edx | ||
1333 | |||
1334 | cmp \$128,%ecx | ||
1335 | je .L10rounds | ||
1336 | cmp \$192,%ecx | ||
1337 | je .L12rounds | ||
1338 | cmp \$256,%ecx | ||
1339 | je .L14rounds | ||
1340 | mov \$-2,%rax # invalid number of bits | ||
1341 | jmp .Lexit | ||
1342 | |||
1343 | .L10rounds: | ||
1344 | mov 0(%rsi),%rax # copy first 4 dwords | ||
1345 | mov 8(%rsi),%rdx | ||
1346 | mov %rax,0(%rdi) | ||
1347 | mov %rdx,8(%rdi) | ||
1348 | |||
1349 | shr \$32,%rdx | ||
1350 | xor %ecx,%ecx | ||
1351 | jmp .L10shortcut | ||
1352 | .align 4 | ||
1353 | .L10loop: | ||
1354 | mov 0(%rdi),%eax # rk[0] | ||
1355 | mov 12(%rdi),%edx # rk[3] | ||
1356 | .L10shortcut: | ||
1357 | ___ | ||
# Key-expansion core: %eax (rk[0]) is combined with the substituted and
# rotated %edx (rk[3]) and the round constant selected by %rcx.
1358 | &enckey (); | ||
1359 | $code.=<<___; | ||
1360 | mov %eax,16(%rdi) # rk[4] | ||
1361 | xor 4(%rdi),%eax | ||
1362 | mov %eax,20(%rdi) # rk[5] | ||
1363 | xor 8(%rdi),%eax | ||
1364 | mov %eax,24(%rdi) # rk[6] | ||
1365 | xor 12(%rdi),%eax | ||
1366 | mov %eax,28(%rdi) # rk[7] | ||
1367 | add \$1,%ecx | ||
1368 | lea 16(%rdi),%rdi | ||
1369 | cmp \$10,%ecx | ||
1370 | jl .L10loop | ||
1371 | |||
1372 | movl \$10,80(%rdi) # setup number of rounds | ||
1373 | xor %rax,%rax | ||
1374 | jmp .Lexit | ||
1375 | |||
1376 | .L12rounds: | ||
1377 | mov 0(%rsi),%rax # copy first 6 dwords | ||
1378 | mov 8(%rsi),%rbx | ||
1379 | mov 16(%rsi),%rdx | ||
1380 | mov %rax,0(%rdi) | ||
1381 | mov %rbx,8(%rdi) | ||
1382 | mov %rdx,16(%rdi) | ||
1383 | |||
1384 | shr \$32,%rdx | ||
1385 | xor %ecx,%ecx | ||
1386 | jmp .L12shortcut | ||
1387 | .align 4 | ||
1388 | .L12loop: | ||
1389 | mov 0(%rdi),%eax # rk[0] | ||
1390 | mov 20(%rdi),%edx # rk[5] | ||
1391 | .L12shortcut: | ||
1392 | ___ | ||
# Same core step, here fed with rk[5] in %edx.
1393 | &enckey (); | ||
1394 | $code.=<<___; | ||
1395 | mov %eax,24(%rdi) # rk[6] | ||
1396 | xor 4(%rdi),%eax | ||
1397 | mov %eax,28(%rdi) # rk[7] | ||
1398 | xor 8(%rdi),%eax | ||
1399 | mov %eax,32(%rdi) # rk[8] | ||
1400 | xor 12(%rdi),%eax | ||
1401 | mov %eax,36(%rdi) # rk[9] | ||
1402 | |||
1403 | cmp \$7,%ecx | ||
1404 | je .L12break | ||
1405 | add \$1,%ecx | ||
1406 | |||
1407 | xor 16(%rdi),%eax | ||
1408 | mov %eax,40(%rdi) # rk[10] | ||
1409 | xor 20(%rdi),%eax | ||
1410 | mov %eax,44(%rdi) # rk[11] | ||
1411 | |||
1412 | lea 24(%rdi),%rdi | ||
1413 | jmp .L12loop | ||
1414 | .L12break: | ||
1415 | movl \$12,72(%rdi) # setup number of rounds | ||
1416 | xor %rax,%rax | ||
1417 | jmp .Lexit | ||
1418 | |||
1419 | .L14rounds: | ||
1420 | mov 0(%rsi),%rax # copy first 8 dwords | ||
1421 | mov 8(%rsi),%rbx | ||
1422 | mov 16(%rsi),%rcx | ||
1423 | mov 24(%rsi),%rdx | ||
1424 | mov %rax,0(%rdi) | ||
1425 | mov %rbx,8(%rdi) | ||
1426 | mov %rcx,16(%rdi) | ||
1427 | mov %rdx,24(%rdi) | ||
1428 | |||
1429 | shr \$32,%rdx | ||
1430 | xor %ecx,%ecx | ||
1431 | jmp .L14shortcut | ||
1432 | .align 4 | ||
1433 | .L14loop: | ||
1434 | mov 0(%rdi),%eax # rk[0] | ||
1435 | mov 28(%rdi),%edx # rk[4] | ||
1436 | .L14shortcut: | ||
1437 | ___ | ||
# Same core step, here fed with the word at byte offset 28 in %edx.
1438 | &enckey (); | ||
1439 | $code.=<<___; | ||
1440 | mov %eax,32(%rdi) # rk[8] | ||
1441 | xor 4(%rdi),%eax | ||
1442 | mov %eax,36(%rdi) # rk[9] | ||
1443 | xor 8(%rdi),%eax | ||
1444 | mov %eax,40(%rdi) # rk[10] | ||
1445 | xor 12(%rdi),%eax | ||
1446 | mov %eax,44(%rdi) # rk[11] | ||
1447 | |||
1448 | cmp \$6,%ecx | ||
1449 | je .L14break | ||
1450 | add \$1,%ecx | ||
1451 | |||
1452 | mov %eax,%edx | ||
1453 | mov 16(%rdi),%eax # rk[4] | ||
1454 | movz %dl,%esi # rk[11]>>0 | ||
1455 | movzb -128(%rbp,%rsi),%ebx | ||
1456 | movz %dh,%esi # rk[11]>>8 | ||
1457 | xor %ebx,%eax | ||
1458 | |||
1459 | movzb -128(%rbp,%rsi),%ebx | ||
1460 | shr \$16,%edx | ||
1461 | shl \$8,%ebx | ||
1462 | movz %dl,%esi # rk[11]>>16 | ||
1463 | xor %ebx,%eax | ||
1464 | |||
1465 | movzb -128(%rbp,%rsi),%ebx | ||
1466 | movz %dh,%esi # rk[11]>>24 | ||
1467 | shl \$16,%ebx | ||
1468 | xor %ebx,%eax | ||
1469 | |||
1470 | movzb -128(%rbp,%rsi),%ebx | ||
1471 | shl \$24,%ebx | ||
1472 | xor %ebx,%eax | ||
1473 | |||
1474 | mov %eax,48(%rdi) # rk[12] | ||
1475 | xor 20(%rdi),%eax | ||
1476 | mov %eax,52(%rdi) # rk[13] | ||
1477 | xor 24(%rdi),%eax | ||
1478 | mov %eax,56(%rdi) # rk[14] | ||
1479 | xor 28(%rdi),%eax | ||
1480 | mov %eax,60(%rdi) # rk[15] | ||
1481 | |||
1482 | lea 32(%rdi),%rdi | ||
1483 | jmp .L14loop | ||
1484 | .L14break: | ||
1485 | movl \$14,48(%rdi) # setup number of rounds | ||
1486 | xor %rax,%rax | ||
1487 | jmp .Lexit | ||
1488 | |||
1489 | .Lbadpointer: | ||
1490 | mov \$-1,%rax | ||
1491 | .Lexit: | ||
1492 | .byte 0xf3,0xc3 # rep ret | ||
1493 | .size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key | ||
1494 | ___ | ||
1495 | |||
# deckey_ref($i, $ptr, $te, $td) -- append to $code a one-word-at-a-time
# version of the column transform: the 32-bit word at $i($ptr) is loaded,
# transformed and stored back to the same address.
#
#   $i    displacement, interpolated verbatim in front of ($ptr)
#   $ptr  base register operand
#   $te, $td  accepted but not used anywhere in the body
#
# The arithmetic mirrors dectransform: three doubling stages using the
# immediates 0x80808080 / 0xfefefefe / 0x1b1b1b1b, then
#   (tp8^tp4^tp2) ^ ROTATE(tp1^tp8,8) ^ ROTATE(tp2^tp1^tp8,24)
#                 ^ ROTATE(tp4^tp1^tp8,16)
# Emitted code uses %eax, %ebx, %edi, %edx and %r8d.
# NOTE(review): no call to this sub appears in the visible part of the
# file; the "_ref" suffix suggests a reference implementation -- confirm.
1496 | sub deckey_ref() | ||
1497 | { my ($i,$ptr,$te,$td) = @_; | ||
1498 | my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d"); | ||
1499 | $code.=<<___; | ||
1500 | mov $i($ptr),$tp1 | ||
1501 | mov $tp1,$acc | ||
1502 | and \$0x80808080,$acc | ||
1503 | mov $acc,$tp4 | ||
1504 | shr \$7,$tp4 | ||
1505 | lea 0($tp1,$tp1),$tp2 | ||
1506 | sub $tp4,$acc | ||
1507 | and \$0xfefefefe,$tp2 | ||
1508 | and \$0x1b1b1b1b,$acc | ||
1509 | xor $tp2,$acc | ||
1510 | mov $acc,$tp2 | ||
1511 | |||
1512 | and \$0x80808080,$acc | ||
1513 | mov $acc,$tp8 | ||
1514 | shr \$7,$tp8 | ||
1515 | lea 0($tp2,$tp2),$tp4 | ||
1516 | sub $tp8,$acc | ||
1517 | and \$0xfefefefe,$tp4 | ||
1518 | and \$0x1b1b1b1b,$acc | ||
1519 | xor $tp1,$tp2 # tp2^tp1 | ||
1520 | xor $tp4,$acc | ||
1521 | mov $acc,$tp4 | ||
1522 | |||
1523 | and \$0x80808080,$acc | ||
1524 | mov $acc,$tp8 | ||
1525 | shr \$7,$tp8 | ||
1526 | sub $tp8,$acc | ||
1527 | lea 0($tp4,$tp4),$tp8 | ||
1528 | xor $tp1,$tp4 # tp4^tp1 | ||
1529 | and \$0xfefefefe,$tp8 | ||
1530 | and \$0x1b1b1b1b,$acc | ||
1531 | xor $acc,$tp8 | ||
1532 | |||
1533 | xor $tp8,$tp1 # tp1^tp8 | ||
1534 | rol \$8,$tp1 # ROTATE(tp1^tp8,8) | ||
1535 | xor $tp8,$tp2 # tp2^tp1^tp8 | ||
1536 | xor $tp8,$tp4 # tp4^tp1^tp8 | ||
1537 | xor $tp2,$tp8 | ||
1538 | xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2 | ||
1539 | |||
1540 | xor $tp8,$tp1 | ||
1541 | rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24) | ||
1542 | xor $tp2,$tp1 | ||
1543 | rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16) | ||
1544 | xor $tp4,$tp1 | ||
1545 | |||
1546 | mov $tp1,$i($ptr) | ||
1547 | ___ | ||
1548 | } | ||
1549 | |||
1550 | # int AES_set_decrypt_key(const unsigned char *userKey, const int bits, | ||
1551 | # AES_KEY *key) | ||
# Builds the decryption key schedule in three steps:
#   1. Call _x86_64_AES_set_encrypt_key with the caller's arguments still in
#      place.  A non-zero result jumps straight to .Labort and is returned
#      unchanged.
#   2. .Linvert: swap 16-byte round keys from both ends of the schedule
#      (first with last, second with second-to-last, ...) until the two
#      pointers meet.  The round counts written by the encrypt setup are
#      10/12/14, so the number of round keys is odd and the pointers do meet.
#   3. .Lpermute: apply the column transform to rounds-1 round keys starting
#      at key+16, i.e. every round key except the first and the last.
#
# The three mask registers are loaded from offsets 40/48/56 of the table at
# .LAES_Te+2048+1024 (its contents are outside this chunk).  The transform
# is emitted without the prefetch loads, so the masks stay valid across the
# loop.
#
# Stack: the key pointer is pushed last and read back from (%rsp) after the
# call; the epilogue skips that slot and restores the six saved registers
# from 8..48(%rsp).  Returns 0 in %rax on success.
1552 | $code.=<<___; | ||
1553 | .globl AES_set_decrypt_key | ||
1554 | .type AES_set_decrypt_key,\@function,3 | ||
1555 | .align 16 | ||
1556 | AES_set_decrypt_key: | ||
1557 | push %rbx | ||
1558 | push %rbp | ||
1559 | push %r12 | ||
1560 | push %r13 | ||
1561 | push %r14 | ||
1562 | push %r15 | ||
1563 | push %rdx # save key schedule | ||
1564 | .Ldec_key_prologue: | ||
1565 | |||
1566 | call _x86_64_AES_set_encrypt_key | ||
1567 | mov (%rsp),%r8 # restore key schedule | ||
1568 | cmp \$0,%eax | ||
1569 | jne .Labort | ||
1570 | |||
1571 | mov 240(%r8),%r14d # pull number of rounds | ||
1572 | xor %rdi,%rdi | ||
1573 | lea (%rdi,%r14d,4),%rcx | ||
1574 | mov %r8,%rsi | ||
1575 | lea (%r8,%rcx,4),%rdi # pointer to last chunk | ||
1576 | .align 4 | ||
1577 | .Linvert: | ||
1578 | mov 0(%rsi),%rax | ||
1579 | mov 8(%rsi),%rbx | ||
1580 | mov 0(%rdi),%rcx | ||
1581 | mov 8(%rdi),%rdx | ||
1582 | mov %rax,0(%rdi) | ||
1583 | mov %rbx,8(%rdi) | ||
1584 | mov %rcx,0(%rsi) | ||
1585 | mov %rdx,8(%rsi) | ||
1586 | lea 16(%rsi),%rsi | ||
1587 | lea -16(%rdi),%rdi | ||
1588 | cmp %rsi,%rdi | ||
1589 | jne .Linvert | ||
1590 | |||
1591 | lea .LAES_Te+2048+1024(%rip),%rax # rcon | ||
1592 | |||
1593 | mov 40(%rax),$mask80 | ||
1594 | mov 48(%rax),$maskfe | ||
1595 | mov 56(%rax),$mask1b | ||
1596 | |||
1597 | mov %r8,$key | ||
1598 | sub \$1,%r14d | ||
1599 | .align 4 | ||
1600 | .Lpermute: | ||
1601 | lea 16($key),$key | ||
1602 | mov 0($key),%rax | ||
1603 | mov 8($key),%rcx | ||
1604 | ___ | ||
# Column transform on the round key held in %rax/%rcx; results come back
# in %eax, %ebx, %ecx and %edx.
1605 | &dectransform (); | ||
1606 | $code.=<<___; | ||
1607 | mov %eax,0($key) | ||
1608 | mov %ebx,4($key) | ||
1609 | mov %ecx,8($key) | ||
1610 | mov %edx,12($key) | ||
1611 | sub \$1,%r14d | ||
1612 | jnz .Lpermute | ||
1613 | |||
1614 | xor %rax,%rax | ||
1615 | .Labort: | ||
1616 | mov 8(%rsp),%r15 | ||
1617 | mov 16(%rsp),%r14 | ||
1618 | mov 24(%rsp),%r13 | ||
1619 | mov 32(%rsp),%r12 | ||
1620 | mov 40(%rsp),%rbp | ||
1621 | mov 48(%rsp),%rbx | ||
1622 | add \$56,%rsp | ||
1623 | .Ldec_key_epilogue: | ||
1624 | ret | ||
1625 | .size AES_set_decrypt_key,.-AES_set_decrypt_key | ||
1626 | ___ | ||
1627 | |||
1628 | # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out, | ||
1629 | # size_t length, const AES_KEY *key, | ||
1630 | # unsigned char *ivp,const int enc); | ||
# CBC encryption/decryption.  On entry inp=%rdi, out=%rsi, length=%rdx,
# key=%rcx, ivp=%r8 and enc=%r9d (non-zero selects encryption).
#
# Overall flow of the emitted code:
#   - length == 0 returns at once, before anything is pushed.
#   - Flags are saved with pushfq and restored with popfq around the body;
#     the body executes cld and uses rep movs/stos.
#   - Path selection: the fast path is taken only when length is at least
#     $speed_limit (set outside this chunk), length is a multiple of 16 and
#     bit 28 of OPENSSL_ia32cap_P is clear.  NOTE(review): bit 28 is
#     presumably the hyper-threading flag, given the benchmark footnote in
#     the file header -- confirm.  Everything else goes to the slow path.
#
# Fast path (_x86_64_AES_encrypt/_decrypt, defined outside this chunk):
#   - The 64-byte aligned frame is positioned so that it does not alias the
#     2304-byte table modulo 4096.
#   - The key schedule is copied to $aes_key on the stack when
#     (key - table) & 0xfff is below 2304 or at least 4096-248.  $mark is
#     zeroed first and receives the round count only if the copy happens;
#     .Lcbc_fast_cleanup tests it and, if non-zero, zeroes the 240-byte copy.
#   - The table is prefetched with 18 passes of 128 bytes (2304 bytes).
#   - Decryption has a separate in-place variant (inp == out) that keeps the
#     previous ciphertext block in $ivec on the stack, because writing the
#     output overwrites the input.
#
# Slow path (compact routines, any length):
#   - The frame is placed relative to the key schedule, and one of the
#     compact table copies is chosen from the frame address.
#   - Encryption: a trailing partial block is copied to out, zero-padded to
#     16 bytes and encrypted from there, so a full 16 bytes are written.
#   - Decryption: a trailing partial block is decrypted to the stack and
#     only the remaining length is copied to out; the full 16-byte block is
#     still read from inp.
#
# NOTE(review): a few annotations inside the emitted text are inaccurate and
# are left untouched because they are part of the heredoc: "e =
# ($sbox+2048)&0xfff" sits next to an lea that uses 2304, "p=>e" / "=-" are
# meant as ">=" / "-=", and "scdedule" is a typo for "schedule".
1631 | { | ||
1632 | # stack frame layout | ||
1633 | # -8(%rsp) return address | ||
1634 | my $keyp="0(%rsp)"; # one to pass as $key | ||
1635 | my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds]) | ||
1636 | my $_rsp="16(%rsp)"; # saved %rsp | ||
1637 | my $_inp="24(%rsp)"; # copy of 1st parameter, inp | ||
1638 | my $_out="32(%rsp)"; # copy of 2nd parameter, out | ||
1639 | my $_len="40(%rsp)"; # copy of 3rd parameter, length | ||
1640 | my $_key="48(%rsp)"; # copy of 4th parameter, key | ||
1641 | my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp | ||
1642 | my $ivec="64(%rsp)"; # ivec[16] | ||
1643 | my $aes_key="80(%rsp)"; # copy of aes_key | ||
1644 | my $mark="80+240(%rsp)"; # copy of aes_key->rounds | ||
1645 | |||
1646 | $code.=<<___; | ||
1647 | .globl AES_cbc_encrypt | ||
1648 | .type AES_cbc_encrypt,\@function,6 | ||
1649 | .align 16 | ||
1650 | .extern OPENSSL_ia32cap_P | ||
1651 | AES_cbc_encrypt: | ||
1652 | cmp \$0,%rdx # check length | ||
1653 | je .Lcbc_epilogue | ||
1654 | pushfq | ||
1655 | push %rbx | ||
1656 | push %rbp | ||
1657 | push %r12 | ||
1658 | push %r13 | ||
1659 | push %r14 | ||
1660 | push %r15 | ||
1661 | .Lcbc_prologue: | ||
1662 | |||
1663 | cld | ||
1664 | mov %r9d,%r9d # clear upper half of enc | ||
1665 | |||
1666 | lea .LAES_Te(%rip),$sbox | ||
1667 | cmp \$0,%r9 | ||
1668 | jne .Lcbc_picked_te | ||
1669 | lea .LAES_Td(%rip),$sbox | ||
1670 | .Lcbc_picked_te: | ||
1671 | |||
1672 | mov PIC_GOT(OPENSSL_ia32cap_P),%r10d | ||
1673 | cmp \$$speed_limit,%rdx | ||
1674 | jb .Lcbc_slow_prologue | ||
1675 | test \$15,%rdx | ||
1676 | jnz .Lcbc_slow_prologue | ||
1677 | bt \$28,%r10d | ||
1678 | jc .Lcbc_slow_prologue | ||
1679 | |||
1680 | # allocate aligned stack frame... | ||
1681 | lea -88-248(%rsp),$key | ||
1682 | and \$-64,$key | ||
1683 | |||
1684 | # ... and make sure it doesn't alias with AES_T[ed] modulo 4096 | ||
1685 | mov $sbox,%r10 | ||
1686 | lea 2304($sbox),%r11 | ||
1687 | mov $key,%r12 | ||
1688 | and \$0xFFF,%r10 # s = $sbox&0xfff | ||
1689 | and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff | ||
1690 | and \$0xFFF,%r12 # p = %rsp&0xfff | ||
1691 | |||
1692 | cmp %r11,%r12 # if (p=>e) %rsp =- (p-e); | ||
1693 | jb .Lcbc_te_break_out | ||
1694 | sub %r11,%r12 | ||
1695 | sub %r12,$key | ||
1696 | jmp .Lcbc_te_ok | ||
1697 | .Lcbc_te_break_out: # else %rsp -= (p-s)&0xfff + framesz | ||
1698 | sub %r10,%r12 | ||
1699 | and \$0xFFF,%r12 | ||
1700 | add \$320,%r12 | ||
1701 | sub %r12,$key | ||
1702 | .align 4 | ||
1703 | .Lcbc_te_ok: | ||
1704 | |||
1705 | xchg %rsp,$key | ||
1706 | #add \$8,%rsp # reserve for return address! | ||
1707 | mov $key,$_rsp # save %rsp | ||
1708 | .Lcbc_fast_body: | ||
1709 | mov %rdi,$_inp # save copy of inp | ||
1710 | mov %rsi,$_out # save copy of out | ||
1711 | mov %rdx,$_len # save copy of len | ||
1712 | mov %rcx,$_key # save copy of key | ||
1713 | mov %r8,$_ivp # save copy of ivp | ||
1714 | movl \$0,$mark # copy of aes_key->rounds = 0; | ||
1715 | mov %r8,%rbp # rearrange input arguments | ||
1716 | mov %r9,%rbx | ||
1717 | mov %rsi,$out | ||
1718 | mov %rdi,$inp | ||
1719 | mov %rcx,$key | ||
1720 | |||
1721 | mov 240($key),%eax # key->rounds | ||
1722 | # do we copy key schedule to stack? | ||
1723 | mov $key,%r10 | ||
1724 | sub $sbox,%r10 | ||
1725 | and \$0xfff,%r10 | ||
1726 | cmp \$2304,%r10 | ||
1727 | jb .Lcbc_do_ecopy | ||
1728 | cmp \$4096-248,%r10 | ||
1729 | jb .Lcbc_skip_ecopy | ||
1730 | .align 4 | ||
1731 | .Lcbc_do_ecopy: | ||
1732 | mov $key,%rsi | ||
1733 | lea $aes_key,%rdi | ||
1734 | lea $aes_key,$key | ||
1735 | mov \$240/8,%ecx | ||
1736 | .long 0x90A548F3 # rep movsq | ||
1737 | mov %eax,(%rdi) # copy aes_key->rounds | ||
1738 | .Lcbc_skip_ecopy: | ||
1739 | mov $key,$keyp # save key pointer | ||
1740 | |||
1741 | mov \$18,%ecx | ||
1742 | .align 4 | ||
1743 | .Lcbc_prefetch_te: | ||
1744 | mov 0($sbox),%r10 | ||
1745 | mov 32($sbox),%r11 | ||
1746 | mov 64($sbox),%r12 | ||
1747 | mov 96($sbox),%r13 | ||
1748 | lea 128($sbox),$sbox | ||
1749 | sub \$1,%ecx | ||
1750 | jnz .Lcbc_prefetch_te | ||
1751 | lea -2304($sbox),$sbox | ||
1752 | |||
1753 | cmp \$0,%rbx | ||
1754 | je .LFAST_DECRYPT | ||
1755 | |||
1756 | #----------------------------- ENCRYPT -----------------------------# | ||
1757 | mov 0(%rbp),$s0 # load iv | ||
1758 | mov 4(%rbp),$s1 | ||
1759 | mov 8(%rbp),$s2 | ||
1760 | mov 12(%rbp),$s3 | ||
1761 | |||
1762 | .align 4 | ||
1763 | .Lcbc_fast_enc_loop: | ||
1764 | xor 0($inp),$s0 | ||
1765 | xor 4($inp),$s1 | ||
1766 | xor 8($inp),$s2 | ||
1767 | xor 12($inp),$s3 | ||
1768 | mov $keyp,$key # restore key | ||
1769 | mov $inp,$_inp # if ($verticalspin) save inp | ||
1770 | |||
1771 | call _x86_64_AES_encrypt | ||
1772 | |||
1773 | mov $_inp,$inp # if ($verticalspin) restore inp | ||
1774 | mov $_len,%r10 | ||
1775 | mov $s0,0($out) | ||
1776 | mov $s1,4($out) | ||
1777 | mov $s2,8($out) | ||
1778 | mov $s3,12($out) | ||
1779 | |||
1780 | lea 16($inp),$inp | ||
1781 | lea 16($out),$out | ||
1782 | sub \$16,%r10 | ||
1783 | test \$-16,%r10 | ||
1784 | mov %r10,$_len | ||
1785 | jnz .Lcbc_fast_enc_loop | ||
1786 | mov $_ivp,%rbp # restore ivp | ||
1787 | mov $s0,0(%rbp) # save ivec | ||
1788 | mov $s1,4(%rbp) | ||
1789 | mov $s2,8(%rbp) | ||
1790 | mov $s3,12(%rbp) | ||
1791 | |||
1792 | jmp .Lcbc_fast_cleanup | ||
1793 | |||
1794 | #----------------------------- DECRYPT -----------------------------# | ||
1795 | .align 16 | ||
1796 | .LFAST_DECRYPT: | ||
1797 | cmp $inp,$out | ||
1798 | je .Lcbc_fast_dec_in_place | ||
1799 | |||
1800 | mov %rbp,$ivec | ||
1801 | .align 4 | ||
1802 | .Lcbc_fast_dec_loop: | ||
1803 | mov 0($inp),$s0 # read input | ||
1804 | mov 4($inp),$s1 | ||
1805 | mov 8($inp),$s2 | ||
1806 | mov 12($inp),$s3 | ||
1807 | mov $keyp,$key # restore key | ||
1808 | mov $inp,$_inp # if ($verticalspin) save inp | ||
1809 | |||
1810 | call _x86_64_AES_decrypt | ||
1811 | |||
1812 | mov $ivec,%rbp # load ivp | ||
1813 | mov $_inp,$inp # if ($verticalspin) restore inp | ||
1814 | mov $_len,%r10 # load len | ||
1815 | xor 0(%rbp),$s0 # xor iv | ||
1816 | xor 4(%rbp),$s1 | ||
1817 | xor 8(%rbp),$s2 | ||
1818 | xor 12(%rbp),$s3 | ||
1819 | mov $inp,%rbp # current input, next iv | ||
1820 | |||
1821 | sub \$16,%r10 | ||
1822 | mov %r10,$_len # update len | ||
1823 | mov %rbp,$ivec # update ivp | ||
1824 | |||
1825 | mov $s0,0($out) # write output | ||
1826 | mov $s1,4($out) | ||
1827 | mov $s2,8($out) | ||
1828 | mov $s3,12($out) | ||
1829 | |||
1830 | lea 16($inp),$inp | ||
1831 | lea 16($out),$out | ||
1832 | jnz .Lcbc_fast_dec_loop | ||
1833 | mov $_ivp,%r12 # load user ivp | ||
1834 | mov 0(%rbp),%r10 # load iv | ||
1835 | mov 8(%rbp),%r11 | ||
1836 | mov %r10,0(%r12) # copy back to user | ||
1837 | mov %r11,8(%r12) | ||
1838 | jmp .Lcbc_fast_cleanup | ||
1839 | |||
1840 | .align 16 | ||
1841 | .Lcbc_fast_dec_in_place: | ||
1842 | mov 0(%rbp),%r10 # copy iv to stack | ||
1843 | mov 8(%rbp),%r11 | ||
1844 | mov %r10,0+$ivec | ||
1845 | mov %r11,8+$ivec | ||
1846 | .align 4 | ||
1847 | .Lcbc_fast_dec_in_place_loop: | ||
1848 | mov 0($inp),$s0 # load input | ||
1849 | mov 4($inp),$s1 | ||
1850 | mov 8($inp),$s2 | ||
1851 | mov 12($inp),$s3 | ||
1852 | mov $keyp,$key # restore key | ||
1853 | mov $inp,$_inp # if ($verticalspin) save inp | ||
1854 | |||
1855 | call _x86_64_AES_decrypt | ||
1856 | |||
1857 | mov $_inp,$inp # if ($verticalspin) restore inp | ||
1858 | mov $_len,%r10 | ||
1859 | xor 0+$ivec,$s0 | ||
1860 | xor 4+$ivec,$s1 | ||
1861 | xor 8+$ivec,$s2 | ||
1862 | xor 12+$ivec,$s3 | ||
1863 | |||
1864 | mov 0($inp),%r11 # load input | ||
1865 | mov 8($inp),%r12 | ||
1866 | sub \$16,%r10 | ||
1867 | jz .Lcbc_fast_dec_in_place_done | ||
1868 | |||
1869 | mov %r11,0+$ivec # copy input to iv | ||
1870 | mov %r12,8+$ivec | ||
1871 | |||
1872 | mov $s0,0($out) # save output [zaps input] | ||
1873 | mov $s1,4($out) | ||
1874 | mov $s2,8($out) | ||
1875 | mov $s3,12($out) | ||
1876 | |||
1877 | lea 16($inp),$inp | ||
1878 | lea 16($out),$out | ||
1879 | mov %r10,$_len | ||
1880 | jmp .Lcbc_fast_dec_in_place_loop | ||
1881 | .Lcbc_fast_dec_in_place_done: | ||
1882 | mov $_ivp,%rdi | ||
1883 | mov %r11,0(%rdi) # copy iv back to user | ||
1884 | mov %r12,8(%rdi) | ||
1885 | |||
1886 | mov $s0,0($out) # save output [zaps input] | ||
1887 | mov $s1,4($out) | ||
1888 | mov $s2,8($out) | ||
1889 | mov $s3,12($out) | ||
1890 | |||
1891 | .align 4 | ||
1892 | .Lcbc_fast_cleanup: | ||
1893 | cmpl \$0,$mark # was the key schedule copied? | ||
1894 | lea $aes_key,%rdi | ||
1895 | je .Lcbc_exit | ||
1896 | mov \$240/8,%ecx | ||
1897 | xor %rax,%rax | ||
1898 | .long 0x90AB48F3 # rep stosq | ||
1899 | |||
1900 | jmp .Lcbc_exit | ||
1901 | |||
1902 | #--------------------------- SLOW ROUTINE ---------------------------# | ||
1903 | .align 16 | ||
1904 | .Lcbc_slow_prologue: | ||
1905 | # allocate aligned stack frame... | ||
1906 | lea -88(%rsp),%rbp | ||
1907 | and \$-64,%rbp | ||
1908 | # ... just "above" key schedule | ||
1909 | lea -88-63(%rcx),%r10 | ||
1910 | sub %rbp,%r10 | ||
1911 | neg %r10 | ||
1912 | and \$0x3c0,%r10 | ||
1913 | sub %r10,%rbp | ||
1914 | |||
1915 | xchg %rsp,%rbp | ||
1916 | #add \$8,%rsp # reserve for return address! | ||
1917 | mov %rbp,$_rsp # save %rsp | ||
1918 | .Lcbc_slow_body: | ||
1919 | #mov %rdi,$_inp # save copy of inp | ||
1920 | #mov %rsi,$_out # save copy of out | ||
1921 | #mov %rdx,$_len # save copy of len | ||
1922 | #mov %rcx,$_key # save copy of key | ||
1923 | mov %r8,$_ivp # save copy of ivp | ||
1924 | mov %r8,%rbp # rearrange input arguments | ||
1925 | mov %r9,%rbx | ||
1926 | mov %rsi,$out | ||
1927 | mov %rdi,$inp | ||
1928 | mov %rcx,$key | ||
1929 | mov %rdx,%r10 | ||
1930 | |||
1931 | mov 240($key),%eax | ||
1932 | mov $key,$keyp # save key pointer | ||
1933 | shl \$4,%eax | ||
1934 | lea ($key,%rax),%rax | ||
1935 | mov %rax,$keyend | ||
1936 | |||
1937 | # pick Te4 copy which can't "overlap" with stack frame or key scdedule | ||
1938 | lea 2048($sbox),$sbox | ||
1939 | lea 768-8(%rsp),%rax | ||
1940 | sub $sbox,%rax | ||
1941 | and \$0x300,%rax | ||
1942 | lea ($sbox,%rax),$sbox | ||
1943 | |||
1944 | cmp \$0,%rbx | ||
1945 | je .LSLOW_DECRYPT | ||
1946 | |||
1947 | #--------------------------- SLOW ENCRYPT ---------------------------# | ||
1948 | test \$-16,%r10 # check upon length | ||
1949 | mov 0(%rbp),$s0 # load iv | ||
1950 | mov 4(%rbp),$s1 | ||
1951 | mov 8(%rbp),$s2 | ||
1952 | mov 12(%rbp),$s3 | ||
1953 | jz .Lcbc_slow_enc_tail # short input... | ||
1954 | |||
1955 | .align 4 | ||
1956 | .Lcbc_slow_enc_loop: | ||
1957 | xor 0($inp),$s0 | ||
1958 | xor 4($inp),$s1 | ||
1959 | xor 8($inp),$s2 | ||
1960 | xor 12($inp),$s3 | ||
1961 | mov $keyp,$key # restore key | ||
1962 | mov $inp,$_inp # save inp | ||
1963 | mov $out,$_out # save out | ||
1964 | mov %r10,$_len # save len | ||
1965 | |||
1966 | call _x86_64_AES_encrypt_compact | ||
1967 | |||
1968 | mov $_inp,$inp # restore inp | ||
1969 | mov $_out,$out # restore out | ||
1970 | mov $_len,%r10 # restore len | ||
1971 | mov $s0,0($out) | ||
1972 | mov $s1,4($out) | ||
1973 | mov $s2,8($out) | ||
1974 | mov $s3,12($out) | ||
1975 | |||
1976 | lea 16($inp),$inp | ||
1977 | lea 16($out),$out | ||
1978 | sub \$16,%r10 | ||
1979 | test \$-16,%r10 | ||
1980 | jnz .Lcbc_slow_enc_loop | ||
1981 | test \$15,%r10 | ||
1982 | jnz .Lcbc_slow_enc_tail | ||
1983 | mov $_ivp,%rbp # restore ivp | ||
1984 | mov $s0,0(%rbp) # save ivec | ||
1985 | mov $s1,4(%rbp) | ||
1986 | mov $s2,8(%rbp) | ||
1987 | mov $s3,12(%rbp) | ||
1988 | |||
1989 | jmp .Lcbc_exit | ||
1990 | |||
1991 | .align 4 | ||
1992 | .Lcbc_slow_enc_tail: | ||
1993 | mov %rax,%r11 | ||
1994 | mov %rcx,%r12 | ||
1995 | mov %r10,%rcx | ||
1996 | mov $inp,%rsi | ||
1997 | mov $out,%rdi | ||
1998 | .long 0x9066A4F3 # rep movsb | ||
1999 | mov \$16,%rcx # zero tail | ||
2000 | sub %r10,%rcx | ||
2001 | xor %rax,%rax | ||
2002 | .long 0x9066AAF3 # rep stosb | ||
2003 | mov $out,$inp # this is not a mistake! | ||
2004 | mov \$16,%r10 # len=16 | ||
2005 | mov %r11,%rax | ||
2006 | mov %r12,%rcx | ||
2007 | jmp .Lcbc_slow_enc_loop # one more spin... | ||
2008 | #--------------------------- SLOW DECRYPT ---------------------------# | ||
2009 | .align 16 | ||
2010 | .LSLOW_DECRYPT: | ||
2011 | shr \$3,%rax | ||
2012 | add %rax,$sbox # recall "magic" constants! | ||
2013 | |||
2014 | mov 0(%rbp),%r11 # copy iv to stack | ||
2015 | mov 8(%rbp),%r12 | ||
2016 | mov %r11,0+$ivec | ||
2017 | mov %r12,8+$ivec | ||
2018 | |||
2019 | .align 4 | ||
2020 | .Lcbc_slow_dec_loop: | ||
2021 | mov 0($inp),$s0 # load input | ||
2022 | mov 4($inp),$s1 | ||
2023 | mov 8($inp),$s2 | ||
2024 | mov 12($inp),$s3 | ||
2025 | mov $keyp,$key # restore key | ||
2026 | mov $inp,$_inp # save inp | ||
2027 | mov $out,$_out # save out | ||
2028 | mov %r10,$_len # save len | ||
2029 | |||
2030 | call _x86_64_AES_decrypt_compact | ||
2031 | |||
2032 | mov $_inp,$inp # restore inp | ||
2033 | mov $_out,$out # restore out | ||
2034 | mov $_len,%r10 | ||
2035 | xor 0+$ivec,$s0 | ||
2036 | xor 4+$ivec,$s1 | ||
2037 | xor 8+$ivec,$s2 | ||
2038 | xor 12+$ivec,$s3 | ||
2039 | |||
2040 | mov 0($inp),%r11 # load input | ||
2041 | mov 8($inp),%r12 | ||
2042 | sub \$16,%r10 | ||
2043 | jc .Lcbc_slow_dec_partial | ||
2044 | jz .Lcbc_slow_dec_done | ||
2045 | |||
2046 | mov %r11,0+$ivec # copy input to iv | ||
2047 | mov %r12,8+$ivec | ||
2048 | |||
2049 | mov $s0,0($out) # save output [can zap input] | ||
2050 | mov $s1,4($out) | ||
2051 | mov $s2,8($out) | ||
2052 | mov $s3,12($out) | ||
2053 | |||
2054 | lea 16($inp),$inp | ||
2055 | lea 16($out),$out | ||
2056 | jmp .Lcbc_slow_dec_loop | ||
2057 | .Lcbc_slow_dec_done: | ||
2058 | mov $_ivp,%rdi | ||
2059 | mov %r11,0(%rdi) # copy iv back to user | ||
2060 | mov %r12,8(%rdi) | ||
2061 | |||
2062 | mov $s0,0($out) # save output [can zap input] | ||
2063 | mov $s1,4($out) | ||
2064 | mov $s2,8($out) | ||
2065 | mov $s3,12($out) | ||
2066 | |||
2067 | jmp .Lcbc_exit | ||
2068 | |||
2069 | .align 4 | ||
2070 | .Lcbc_slow_dec_partial: | ||
2071 | mov $_ivp,%rdi | ||
2072 | mov %r11,0(%rdi) # copy iv back to user | ||
2073 | mov %r12,8(%rdi) | ||
2074 | |||
2075 | mov $s0,0+$ivec # save output to stack | ||
2076 | mov $s1,4+$ivec | ||
2077 | mov $s2,8+$ivec | ||
2078 | mov $s3,12+$ivec | ||
2079 | |||
2080 | mov $out,%rdi | ||
2081 | lea $ivec,%rsi | ||
2082 | lea 16(%r10),%rcx | ||
2083 | .long 0x9066A4F3 # rep movsb | ||
2084 | jmp .Lcbc_exit | ||
2085 | |||
2086 | .align 16 | ||
2087 | .Lcbc_exit: | ||
2088 | mov $_rsp,%rsi | ||
2089 | mov (%rsi),%r15 | ||
2090 | mov 8(%rsi),%r14 | ||
2091 | mov 16(%rsi),%r13 | ||
2092 | mov 24(%rsi),%r12 | ||
2093 | mov 32(%rsi),%rbp | ||
2094 | mov 40(%rsi),%rbx | ||
2095 | lea 48(%rsi),%rsp | ||
2096 | .Lcbc_popfq: | ||
2097 | popfq | ||
2098 | .Lcbc_epilogue: | ||
2099 | ret | ||
2100 | .size AES_cbc_encrypt,.-AES_cbc_encrypt | ||
2101 | ___ | ||
2102 | } | ||
2103 | |||
# Align to 64 bytes and emit the .LAES_Te label. The 64 &_data_word rows that
# follow (256 values in total) are generated directly under this label.
2104 | $code.=<<___; | ||
2105 | .align 64 | ||
2106 | .LAES_Te: | ||
2107 | ___ | ||
2108 | &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); | ||
2109 | &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); | ||
2110 | &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56); | ||
2111 | &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec); | ||
2112 | &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa); | ||
2113 | &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb); | ||
2114 | &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45); | ||
2115 | &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b); | ||
2116 | &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c); | ||
2117 | &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83); | ||
2118 | &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9); | ||
2119 | &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a); | ||
2120 | &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d); | ||
2121 | &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f); | ||
2122 | &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df); | ||
2123 | &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea); | ||
2124 | &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34); | ||
2125 | &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b); | ||
2126 | &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d); | ||
2127 | &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413); | ||
2128 | &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1); | ||
2129 | &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6); | ||
2130 | &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972); | ||
2131 | &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85); | ||
2132 | &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed); | ||
2133 | &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511); | ||
2134 | &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe); | ||
2135 | &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b); | ||
2136 | &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05); | ||
2137 | &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1); | ||
2138 | &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142); | ||
2139 | &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf); | ||
2140 | &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3); | ||
2141 | &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e); | ||
2142 | &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a); | ||
2143 | &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6); | ||
2144 | &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3); | ||
2145 | &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b); | ||
2146 | &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428); | ||
2147 | &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad); | ||
2148 | &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14); | ||
2149 | &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8); | ||
2150 | &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4); | ||
2151 | &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2); | ||
2152 | &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda); | ||
2153 | &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949); | ||
2154 | &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf); | ||
2155 | &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810); | ||
2156 | &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c); | ||
2157 | &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697); | ||
2158 | &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e); | ||
2159 | &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f); | ||
2160 | &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc); | ||
2161 | &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c); | ||
2162 | &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969); | ||
2163 | &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27); | ||
2164 | &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122); | ||
2165 | &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433); | ||
2166 | &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9); | ||
2167 | &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5); | ||
2168 | &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a); | ||
2169 | &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); | ||
2170 | &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); | ||
2171 | &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); | ||
2172 | |||
2173 | #Te4: # four copies of the 256-entry Te4 to choose from to avoid L1 aliasing | ||
2174 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
2175 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
2176 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
2177 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
2178 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
2179 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
2180 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
2181 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
2182 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
2183 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
2184 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
2185 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
2186 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
2187 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
2188 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
2189 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
2190 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
2191 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
2192 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
2193 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
2194 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
2195 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
2196 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
2197 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
2198 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
2199 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
2200 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
2201 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
2202 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
2203 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
2204 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
2205 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
2206 | |||
2207 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
2208 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
2209 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
2210 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
2211 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
2212 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
2213 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
2214 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
2215 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
2216 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
2217 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
2218 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
2219 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
2220 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
2221 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
2222 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
2223 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
2224 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
2225 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
2226 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
2227 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
2228 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
2229 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
2230 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
2231 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
2232 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
2233 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
2234 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
2235 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
2236 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
2237 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
2238 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
2239 | |||
2240 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
2241 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
2242 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
2243 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
2244 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
2245 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
2246 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
2247 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
2248 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
2249 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
2250 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
2251 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
2252 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
2253 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
2254 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
2255 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
2256 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
2257 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
2258 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
2259 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
2260 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
2261 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
2262 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
2263 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
2264 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
2265 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
2266 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
2267 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
2268 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
2269 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
2270 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
2271 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
2272 | |||
2273 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
2274 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
2275 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
2276 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
2277 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
2278 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
2279 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
2280 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
2281 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
2282 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
2283 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
2284 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
2285 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
2286 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
2287 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
2288 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
2289 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
2290 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
2291 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
2292 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
2293 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
2294 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
2295 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
2296 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
2297 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
2298 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
2299 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
2300 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
2301 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
2302 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
2303 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
2304 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
# No assembler label is emitted for this data: the words below simply follow
# the fourth Te4 copy, and "rcon" is only a reader-facing name.
2305 | #rcon: ten round constants 0x01..0x36, then two words each of the 0x80808080, 0xfefefefe and 0x1b1b1b1b masks | ||
2306 | $code.=<<___; | ||
2307 | .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 | ||
2308 | .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 | ||
2309 | .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080 | ||
2310 | .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b | ||
2311 | ___ | ||
# Realign to 64 bytes and emit the .LAES_Td label. The 64 &_data_word rows
# that follow (256 values in total) are generated directly under this label.
2312 | $code.=<<___; | ||
2313 | .align 64 | ||
2314 | .LAES_Td: | ||
2315 | ___ | ||
2316 | &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); | ||
2317 | &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); | ||
2318 | &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5); | ||
2319 | &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5); | ||
2320 | &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d); | ||
2321 | &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b); | ||
2322 | &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295); | ||
2323 | &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e); | ||
2324 | &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927); | ||
2325 | &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d); | ||
2326 | &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362); | ||
2327 | &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9); | ||
2328 | &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52); | ||
2329 | &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566); | ||
2330 | &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3); | ||
2331 | &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed); | ||
2332 | &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e); | ||
2333 | &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4); | ||
2334 | &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4); | ||
2335 | &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd); | ||
2336 | &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d); | ||
2337 | &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060); | ||
2338 | &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967); | ||
2339 | &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879); | ||
2340 | &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000); | ||
2341 | &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c); | ||
2342 | &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36); | ||
2343 | &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624); | ||
2344 | &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b); | ||
2345 | &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c); | ||
2346 | &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12); | ||
2347 | &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14); | ||
2348 | &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3); | ||
2349 | &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b); | ||
2350 | &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8); | ||
2351 | &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684); | ||
2352 | &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7); | ||
2353 | &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177); | ||
2354 | &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947); | ||
2355 | &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322); | ||
2356 | &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498); | ||
2357 | &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f); | ||
2358 | &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54); | ||
2359 | &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382); | ||
2360 | &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf); | ||
2361 | &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb); | ||
2362 | &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83); | ||
2363 | &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef); | ||
2364 | &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029); | ||
2365 | &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235); | ||
2366 | &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733); | ||
2367 | &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117); | ||
2368 | &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4); | ||
2369 | &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546); | ||
2370 | &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb); | ||
2371 | &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d); | ||
2372 | &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb); | ||
2373 | &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a); | ||
2374 | &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773); | ||
2375 | &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478); | ||
2376 | &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2); | ||
2377 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); | ||
2378 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); | ||
2379 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); | ||
2380 | |||
2381 | #Td4: # four copies of the 256-entry Td4 to choose from to avoid L1 aliasing; each copy is followed by 0x80808080/0xfefefefe/0x1b1b1b1b mask words | ||
2382 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
2383 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
2384 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
2385 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
2386 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
2387 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
2388 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
2389 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
2390 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
2391 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
2392 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
2393 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
2394 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
2395 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
2396 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
2397 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
2398 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
2399 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
2400 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
2401 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
2402 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
2403 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
2404 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
2405 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
2406 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
2407 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
2408 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
2409 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
2410 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
2411 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
2412 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
2413 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
2414 | $code.=<<___; | ||
2415 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
2416 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
2417 | ___ | ||
2418 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
2419 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
2420 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
2421 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
2422 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
2423 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
2424 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
2425 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
2426 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
2427 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
2428 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
2429 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
2430 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
2431 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
2432 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
2433 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
2434 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
2435 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
2436 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
2437 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
2438 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
2439 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
2440 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
2441 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
2442 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
2443 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
2444 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
2445 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
2446 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
2447 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
2448 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
2449 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
2450 | $code.=<<___; | ||
2451 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
2452 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
2453 | ___ | ||
2454 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
2455 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
2456 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
2457 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
2458 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
2459 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
2460 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
2461 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
2462 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
2463 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
2464 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
2465 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
2466 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
2467 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
2468 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
2469 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
2470 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
2471 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
2472 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
2473 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
2474 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
2475 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
2476 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
2477 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
2478 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
2479 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
2480 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
2481 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
2482 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
2483 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
2484 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
2485 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
2486 | $code.=<<___; | ||
2487 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
2488 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
2489 | ___ | ||
2490 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
2491 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
2492 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
2493 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
2494 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
2495 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
2496 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
2497 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
2498 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
2499 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
2500 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
2501 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
2502 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
2503 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
2504 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
2505 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
2506 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
2507 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
2508 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
2509 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
2510 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
2511 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
2512 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
2513 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
2514 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
2515 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
2516 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
2517 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
2518 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
2519 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
2520 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
2521 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
2522 | $code.=<<___; | ||
2523 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
2524 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
2525 | .asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>" | ||
2526 | .align 64 | ||
2527 | ___ | ||
2528 | |||
2529 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
2530 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
2531 | if ($win64) { | ||
2532 | $rec="%rcx"; | ||
2533 | $frame="%rdx"; | ||
2534 | $context="%r8"; | ||
2535 | $disp="%r9"; | ||
2536 | |||
2537 | $code.=<<___; | ||
2538 | .extern __imp_RtlVirtualUnwind | ||
# block_se_handler: Win64 exception handler, emitted only on the win64 path.
# Arguments arrive as rec=%rcx, frame=%rdx, context=%r8, disp=%r9.
# It chooses a frame base in %rax from where context->Rip falls relative to
# the two labels stored as image-relative offsets in disp->HandlerData:
#   Rip <  HandlerData[0]: keep context->Rax as the base;
#   Rip >= HandlerData[1]: use context->Rsp as the base;
#   otherwise: follow the stack pointer saved at 24(Rsp), step 48 bytes past
#   it, and copy the six registers stored just below it into the context.
# In every case Rdi and Rsi are then read from 8 and 16 bytes above the base
# and written back together with Rsp, before jumping to .Lcommon_seh_exit,
# which is defined outside this routine.
# NOTE(review): using context->Rax as the base presumably relies on the
# covered routines keeping their entry stack pointer in rax until the
# prologue label - confirm against their prologues.
2539 | .type block_se_handler,\@abi-omnipotent | ||
2540 | .align 16 | ||
2541 | block_se_handler: | ||
# save eight general registers and the flags, then reserve 64 bytes of stack
2542 | push %rsi | ||
2543 | push %rdi | ||
2544 | push %rbx | ||
2545 | push %rbp | ||
2546 | push %r12 | ||
2547 | push %r13 | ||
2548 | push %r14 | ||
2549 | push %r15 | ||
2550 | pushfq | ||
2551 | sub \$64,%rsp | ||
2552 | |||
2553 | mov 120($context),%rax # pull context->Rax (base if Rip is before the prologue label) | ||
2554 | mov 248($context),%rbx # pull context->Rip | ||
2555 | |||
2556 | mov 8($disp),%rsi # disp->ImageBase | ||
2557 | mov 56($disp),%r11 # disp->HandlerData | ||
2558 | |||
2559 | mov 0(%r11),%r10d # HandlerData[0], 32-bit offset from ImageBase | ||
2560 | lea (%rsi,%r10),%r10 # prologue label | ||
2561 | cmp %r10,%rbx # context->Rip<prologue label | ||
2562 | jb .Lin_block_prologue | ||
2563 | |||
2564 | mov 152($context),%rax # pull context->Rsp (base from here on) | ||
2565 | |||
2566 | mov 4(%r11),%r10d # HandlerData[1], 32-bit offset from ImageBase | ||
2567 | lea (%rsi,%r10),%r10 # epilogue label | ||
2568 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2569 | jae .Lin_block_prologue | ||
2570 | |||
2571 | mov 24(%rax),%rax # pull saved real stack pointer | ||
2572 | lea 48(%rax),%rax # adjust... to just above the six saved registers | ||
2573 | |||
# reload the six saved registers from the slots below the adjusted base
2574 | mov -8(%rax),%rbx | ||
2575 | mov -16(%rax),%rbp | ||
2576 | mov -24(%rax),%r12 | ||
2577 | mov -32(%rax),%r13 | ||
2578 | mov -40(%rax),%r14 | ||
2579 | mov -48(%rax),%r15 | ||
2580 | mov %rbx,144($context) # restore context->Rbx | ||
2581 | mov %rbp,160($context) # restore context->Rbp | ||
2582 | mov %r12,216($context) # restore context->R12 | ||
2583 | mov %r13,224($context) # restore context->R13 | ||
2584 | mov %r14,232($context) # restore context->R14 | ||
2585 | mov %r15,240($context) # restore context->R15 | ||
2586 | |||
# shared tail: also reached by both early exits above, with rax as the base
2587 | .Lin_block_prologue: | ||
2588 | mov 8(%rax),%rdi | ||
2589 | mov 16(%rax),%rsi | ||
2590 | mov %rax,152($context) # restore context->Rsp | ||
2591 | mov %rsi,168($context) # restore context->Rsi | ||
2592 | mov %rdi,176($context) # restore context->Rdi | ||
2593 | |||
2594 | jmp .Lcommon_seh_exit | ||
2595 | .size block_se_handler,.-block_se_handler | ||
2596 | |||
2597 | .type key_se_handler,\@abi-omnipotent | ||
2598 | .align 16 | ||
2599 | key_se_handler: | ||
2600 | push %rsi | ||
2601 | push %rdi | ||
2602 | push %rbx | ||
2603 | push %rbp | ||
2604 | push %r12 | ||
2605 | push %r13 | ||
2606 | push %r14 | ||
2607 | push %r15 | ||
2608 | pushfq | ||
2609 | sub \$64,%rsp | ||
2610 | |||
2611 | mov 120($context),%rax # pull context->Rax | ||
2612 | mov 248($context),%rbx # pull context->Rip | ||
2613 | |||
2614 | mov 8($disp),%rsi # disp->ImageBase | ||
2615 | mov 56($disp),%r11 # disp->HandlerData | ||
2616 | |||
2617 | mov 0(%r11),%r10d # HandlerData[0] | ||
2618 | lea (%rsi,%r10),%r10 # prologue label | ||
2619 | cmp %r10,%rbx # context->Rip<prologue label | ||
2620 | jb .Lin_key_prologue | ||
2621 | |||
2622 | mov 152($context),%rax # pull context->Rsp | ||
2623 | |||
2624 | mov 4(%r11),%r10d # HandlerData[1] | ||
2625 | lea (%rsi,%r10),%r10 # epilogue label | ||
2626 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2627 | jae .Lin_key_prologue | ||
2628 | |||
2629 | lea 56(%rax),%rax | ||
2630 | |||
2631 | mov -8(%rax),%rbx | ||
2632 | mov -16(%rax),%rbp | ||
2633 | mov -24(%rax),%r12 | ||
2634 | mov -32(%rax),%r13 | ||
2635 | mov -40(%rax),%r14 | ||
2636 | mov -48(%rax),%r15 | ||
2637 | mov %rbx,144($context) # restore context->Rbx | ||
2638 | mov %rbp,160($context) # restore context->Rbp | ||
2639 | mov %r12,216($context) # restore context->R12 | ||
2640 | mov %r13,224($context) # restore context->R13 | ||
2641 | mov %r14,232($context) # restore context->R14 | ||
2642 | mov %r15,240($context) # restore context->R15 | ||
2643 | |||
2644 | .Lin_key_prologue: | ||
2645 | mov 8(%rax),%rdi | ||
2646 | mov 16(%rax),%rsi | ||
2647 | mov %rax,152($context) # restore context->Rsp | ||
2648 | mov %rsi,168($context) # restore context->Rsi | ||
2649 | mov %rdi,176($context) # restore context->Rdi | ||
2650 | |||
2651 | jmp .Lcommon_seh_exit | ||
2652 | .size key_se_handler,.-key_se_handler | ||
2653 | |||
2654 | .type cbc_se_handler,\@abi-omnipotent | ||
2655 | .align 16 | ||
2656 | cbc_se_handler: | ||
2657 | push %rsi | ||
2658 | push %rdi | ||
2659 | push %rbx | ||
2660 | push %rbp | ||
2661 | push %r12 | ||
2662 | push %r13 | ||
2663 | push %r14 | ||
2664 | push %r15 | ||
2665 | pushfq | ||
2666 | sub \$64,%rsp | ||
2667 | |||
2668 | mov 120($context),%rax # pull context->Rax | ||
2669 | mov 248($context),%rbx # pull context->Rip | ||
2670 | |||
2671 | lea .Lcbc_prologue(%rip),%r10 | ||
2672 | cmp %r10,%rbx # context->Rip<.Lcbc_prologue | ||
2673 | jb .Lin_cbc_prologue | ||
2674 | |||
2675 | lea .Lcbc_fast_body(%rip),%r10 | ||
2676 | cmp %r10,%rbx # context->Rip<.Lcbc_fast_body | ||
2677 | jb .Lin_cbc_frame_setup | ||
2678 | |||
2679 | lea .Lcbc_slow_prologue(%rip),%r10 | ||
2680 | cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue | ||
2681 | jb .Lin_cbc_body | ||
2682 | |||
2683 | lea .Lcbc_slow_body(%rip),%r10 | ||
2684 | cmp %r10,%rbx # context->Rip<.Lcbc_slow_body | ||
2685 | jb .Lin_cbc_frame_setup | ||
2686 | |||
2687 | .Lin_cbc_body: | ||
2688 | mov 152($context),%rax # pull context->Rsp | ||
2689 | |||
2690 | lea .Lcbc_epilogue(%rip),%r10 | ||
2691 | cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue | ||
2692 | jae .Lin_cbc_prologue | ||
2693 | |||
2694 | lea 8(%rax),%rax | ||
2695 | |||
2696 | lea .Lcbc_popfq(%rip),%r10 | ||
2697 | cmp %r10,%rbx # context->Rip>=.Lcbc_popfq | ||
2698 | jae .Lin_cbc_prologue | ||
2699 | |||
2700 | mov `16-8`(%rax),%rax # biased $_rsp | ||
2701 | lea 56(%rax),%rax | ||
2702 | |||
2703 | .Lin_cbc_frame_setup: | ||
2704 | mov -16(%rax),%rbx | ||
2705 | mov -24(%rax),%rbp | ||
2706 | mov -32(%rax),%r12 | ||
2707 | mov -40(%rax),%r13 | ||
2708 | mov -48(%rax),%r14 | ||
2709 | mov -56(%rax),%r15 | ||
2710 | mov %rbx,144($context) # restore context->Rbx | ||
2711 | mov %rbp,160($context) # restore context->Rbp | ||
2712 | mov %r12,216($context) # restore context->R12 | ||
2713 | mov %r13,224($context) # restore context->R13 | ||
2714 | mov %r14,232($context) # restore context->R14 | ||
2715 | mov %r15,240($context) # restore context->R15 | ||
2716 | |||
2717 | .Lin_cbc_prologue: | ||
2718 | mov 8(%rax),%rdi | ||
2719 | mov 16(%rax),%rsi | ||
2720 | mov %rax,152($context) # restore context->Rsp | ||
2721 | mov %rsi,168($context) # restore context->Rsi | ||
2722 | mov %rdi,176($context) # restore context->Rdi | ||
2723 | |||
2724 | .Lcommon_seh_exit: | ||
2725 | |||
2726 | mov 40($disp),%rdi # disp->ContextRecord | ||
2727 | mov $context,%rsi # context | ||
2728 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
2729 | .long 0xa548f3fc # cld; rep movsq | ||
2730 | |||
2731 | mov $disp,%rsi | ||
2732 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
2733 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
2734 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
2735 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
2736 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
2737 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
2738 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
2739 | mov %r10,32(%rsp) # arg5 | ||
2740 | mov %r11,40(%rsp) # arg6 | ||
2741 | mov %r12,48(%rsp) # arg7 | ||
2742 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
2743 | call *__imp_RtlVirtualUnwind(%rip) | ||
2744 | |||
2745 | mov \$1,%eax # ExceptionContinueSearch | ||
2746 | add \$64,%rsp | ||
2747 | popfq | ||
2748 | pop %r15 | ||
2749 | pop %r14 | ||
2750 | pop %r13 | ||
2751 | pop %r12 | ||
2752 | pop %rbp | ||
2753 | pop %rbx | ||
2754 | pop %rdi | ||
2755 | pop %rsi | ||
2756 | ret | ||
2757 | .size cbc_se_handler,.-cbc_se_handler | ||
2758 | |||
2759 | .section .pdata | ||
2760 | .align 4 | ||
2761 | .rva .LSEH_begin_AES_encrypt | ||
2762 | .rva .LSEH_end_AES_encrypt | ||
2763 | .rva .LSEH_info_AES_encrypt | ||
2764 | |||
2765 | .rva .LSEH_begin_AES_decrypt | ||
2766 | .rva .LSEH_end_AES_decrypt | ||
2767 | .rva .LSEH_info_AES_decrypt | ||
2768 | |||
2769 | .rva .LSEH_begin_AES_set_encrypt_key | ||
2770 | .rva .LSEH_end_AES_set_encrypt_key | ||
2771 | .rva .LSEH_info_AES_set_encrypt_key | ||
2772 | |||
2773 | .rva .LSEH_begin_AES_set_decrypt_key | ||
2774 | .rva .LSEH_end_AES_set_decrypt_key | ||
2775 | .rva .LSEH_info_AES_set_decrypt_key | ||
2776 | |||
2777 | .rva .LSEH_begin_AES_cbc_encrypt | ||
2778 | .rva .LSEH_end_AES_cbc_encrypt | ||
2779 | .rva .LSEH_info_AES_cbc_encrypt | ||
2780 | |||
2781 | .section .xdata | ||
2782 | .align 8 | ||
2783 | .LSEH_info_AES_encrypt: | ||
2784 | .byte 9,0,0,0 | ||
2785 | .rva block_se_handler | ||
2786 | .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[] | ||
2787 | .LSEH_info_AES_decrypt: | ||
2788 | .byte 9,0,0,0 | ||
2789 | .rva block_se_handler | ||
2790 | .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] | ||
2791 | .LSEH_info_AES_set_encrypt_key: | ||
2792 | .byte 9,0,0,0 | ||
2793 | .rva key_se_handler | ||
2794 | .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] | ||
2795 | .LSEH_info_AES_set_decrypt_key: | ||
2796 | .byte 9,0,0,0 | ||
2797 | .rva key_se_handler | ||
2798 | .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] | ||
2799 | .LSEH_info_AES_cbc_encrypt: | ||
2800 | .byte 9,0,0,0 | ||
2801 | .rva cbc_se_handler | ||
2802 | ___ | ||
2803 | } | ||
2804 | |||
# Post-process the accumulated assembly text and emit it.
#
#   1. Every `...` span in $code is replaced by the result of eval()-ing
#      its contents as Perl, so constant arithmetic written inline in the
#      templates (for example `16-8` or `1232/8`) is folded to a number
#      before the text is written out.
#   2. The finished text is printed to STDOUT.
#   3. STDOUT is closed explicitly and the result is checked.  Buffered
#      write errors are only reported at close time, so an unchecked close
#      lets a truncated or missing output file pass as success.
#      NOTE(review): where STDOUT is opened (possibly as a pipe to a
#      translator script) is outside this chunk; for a pipe, close also
#      returns false when the child exits with a non-zero status.
2805 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
2806 | |||
2807 | print $code; | ||
2808 | |||
2809 | close STDOUT or die "error closing STDOUT: $!"; | ||