diff options
Diffstat (limited to '')
-rwxr-xr-x | src/lib/libcrypto/aes/asm/aes-x86_64.pl | 2834 |
1 files changed, 0 insertions, 2834 deletions
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl deleted file mode 100755 index 324c4a2be2..0000000000 --- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl +++ /dev/null | |||
@@ -1,2834 +0,0 @@ | |||
1 | #!/usr/bin/env perl | ||
2 | # | ||
3 | # ==================================================================== | ||
4 | # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL | ||
5 | # project. The module is, however, dual licensed under OpenSSL and | ||
6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
8 | # ==================================================================== | ||
9 | # | ||
10 | # Version 2.1. | ||
11 | # | ||
12 | # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on | ||
13 | # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version | ||
14 | # [you'll notice a lot of resemblance], such as compressed S-boxes | ||
15 | # in little-endian byte order, prefetch of these tables in CBC mode, | ||
16 | # as well as avoiding L1 cache aliasing between stack frame and key | ||
17 | # schedule and already mentioned tables, compressed Td4... | ||
18 | # | ||
19 | # Performance in number of cycles per processed byte for 128-bit key: | ||
20 | # | ||
21 | # ECB encrypt ECB decrypt CBC large chunk | ||
22 | # AMD64 33 41 13.0 | ||
23 | # EM64T 38 59 18.6(*) | ||
24 | # Core 2 30 43 14.5(*) | ||
25 | # | ||
26 | # (*) with hyper-threading off | ||
27 | |||
# Command line: [flavour] output.  If the first argument contains a dot
# it is taken to be the output file name and no flavour is in effect.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 mode is selected by a nasm/masm/mingw64 flavour or by an output
# file name ending in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator: first in the directory this
# script was started from, then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# From here on everything printed to STDOUT is piped through the
# translator.  Die if the pipe cannot be started: carrying on would
# silently produce no (or truncated) assembler output.
open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

$verticalspin=1;	# unlike 32-bit version $verticalspin performs
			# ~15% better on both AMD and Intel cores
$speed_limit=512;	# see aes-586.pl for details

# Accumulator for the generated assembler text; every generator below
# appends to it.
$code=".text\n";
47 | |||
# Register allocation shared by the code generators below.
#   $s0..$s3       32-bit registers holding the four state words
#   $acc0..$acc2   32-bit scratch registers; the same physical registers
#                  are also named $mask80/$maskfe/$mask1b in 64-bit form
#   $inp, $out     64-bit registers; several generators reuse %r8d/%r9d
#                  as extra temporaries and thereby overwrite them
#   $t0..$t2       32-bit temporaries
#   $rnds          round counter (loaded from 240($key) by the callers)
#   $sbox          base address of the lookup table
#   $key           pointer into the key schedule
48 | $s0="%eax"; | ||
49 | $s1="%ebx"; | ||
50 | $s2="%ecx"; | ||
51 | $s3="%edx"; | ||
52 | $acc0="%esi"; $mask80="%rsi"; | ||
53 | $acc1="%edi"; $maskfe="%rdi"; | ||
54 | $acc2="%ebp"; $mask1b="%rbp"; | ||
55 | $inp="%r8"; | ||
56 | $out="%r9"; | ||
57 | $t0="%r10d"; | ||
58 | $t1="%r11d"; | ||
59 | $t2="%r12d"; | ||
60 | $rnds="%r13d"; | ||
61 | $sbox="%r14"; | ||
62 | $key="%r15"; | ||
63 | |||
# hi(REG): map %eax/%rax, %ebx/%rbx, %ecx/%rcx, %edx/%rdx to the matching
# high-byte register (%ah, %bh, %ch, %dh).  Any other register name is
# returned unchanged.  No prototype: the sub takes one argument, and the
# callers use the &hi(...) form anyway.
sub hi {
    my $r = shift;
    $r =~ s/%[er]([a-d])x/%${1}h/;	# $1, not the deprecated \1, on the RHS
    return $r;
}
# lo(REG): return the low-byte name of a general purpose register.
#   %eax/%rax .. %edx/%rdx  ->  %al .. %dl
#   %esi/%rsi, %edi/%rdi    ->  %sil, %dil
#   %rN or %rNd             ->  %rNb
# Names matching none of the patterns are returned unchanged.  The three
# substitutions are applied in sequence, exactly as before.
sub lo {
    my $r = shift;
    $r =~ s/%[er]([a-d])x/%${1}l/;
    $r =~ s/%[er]([sd]i)/%${1}l/;
    $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
    return $r;
}
# LO(REG): return the 32-bit name of a 64-bit register:
#   %rax, %rbx, ...  ->  %eax, %ebx, ...
#   %rN              ->  %rNd
# Names matching neither pattern are returned unchanged.
sub LO {
    my $r = shift;
    $r =~ s/%r([a-z]+)/%e${1}/;
    $r =~ s/%r([0-9]+)/%r${1}d/;
    return $r;
}
# _data_word(LIST): for every value in LIST append one ".long" line to
# $code that carries the value twice, i.e. one 8-byte table entry made of
# two identical 32-bit words.  No prototype: the sub takes a list.
sub _data_word {
    $code .= sprintf(".long\t0x%08x,0x%08x\n", $_, $_) for @_;
    return;
}
# data_word(LIST): append a single ".long" line to $code holding all
# values of LIST as comma separated 32-bit hex constants.
# An empty LIST emits nothing (previously it emitted a spurious
# 0x00000000 formatted from undef).
sub data_word {
    return unless @_;
    $code .= ".long\t" . join(",", map { sprintf("0x%08x", $_) } @_) . "\n";
    return;
}
81 | |||
# data_byte(LIST): append a single ".byte" line to $code holding all
# values of LIST, each reduced to its low 8 bits, as comma separated hex
# constants.  An empty LIST emits nothing (previously it emitted a
# spurious 0x00 formatted from undef).
sub data_byte {
    return unless @_;
    $code .= ".byte\t" . join(",", map { sprintf("0x%02x", $_ & 0xff) } @_) . "\n";
    return;
}
89 | |||
# encvert(): append the assembler for one full encryption round in
# "vertical" form to $code.  Each byte of $s0..$s3 indexes the table at
# $sbox (8-byte entries, read at byte offsets 0, 3, 2 and 1) and the
# lookups are accumulated in $t0..$t3.  $key is advanced by 16, the four
# words it then points at are loaded into $s0..$s3 and xored with the
# accumulated $t0..$t3.  $t3 is %r8d, so $inp is overwritten.
# The `&lo(...)`/`&hi(...)` backtick expressions are left in the text as
# is; they are presumably evaluated when $code is post-processed outside
# this excerpt -- TODO confirm.
90 | sub encvert() | ||
91 | { my $t3="%r8d"; # zaps $inp! | ||
92 | |||
93 | $code.=<<___; | ||
94 | # favor 3-way issue Opteron pipeline... | ||
95 | movzb `&lo("$s0")`,$acc0 | ||
96 | movzb `&lo("$s1")`,$acc1 | ||
97 | movzb `&lo("$s2")`,$acc2 | ||
98 | mov 0($sbox,$acc0,8),$t0 | ||
99 | mov 0($sbox,$acc1,8),$t1 | ||
100 | mov 0($sbox,$acc2,8),$t2 | ||
101 | |||
102 | movzb `&hi("$s1")`,$acc0 | ||
103 | movzb `&hi("$s2")`,$acc1 | ||
104 | movzb `&lo("$s3")`,$acc2 | ||
105 | xor 3($sbox,$acc0,8),$t0 | ||
106 | xor 3($sbox,$acc1,8),$t1 | ||
107 | mov 0($sbox,$acc2,8),$t3 | ||
108 | |||
109 | movzb `&hi("$s3")`,$acc0 | ||
110 | shr \$16,$s2 | ||
111 | movzb `&hi("$s0")`,$acc2 | ||
112 | xor 3($sbox,$acc0,8),$t2 | ||
113 | shr \$16,$s3 | ||
114 | xor 3($sbox,$acc2,8),$t3 | ||
115 | |||
116 | shr \$16,$s1 | ||
117 | lea 16($key),$key | ||
118 | shr \$16,$s0 | ||
119 | |||
120 | movzb `&lo("$s2")`,$acc0 | ||
121 | movzb `&lo("$s3")`,$acc1 | ||
122 | movzb `&lo("$s0")`,$acc2 | ||
123 | xor 2($sbox,$acc0,8),$t0 | ||
124 | xor 2($sbox,$acc1,8),$t1 | ||
125 | xor 2($sbox,$acc2,8),$t2 | ||
126 | |||
127 | movzb `&hi("$s3")`,$acc0 | ||
128 | movzb `&hi("$s0")`,$acc1 | ||
129 | movzb `&lo("$s1")`,$acc2 | ||
130 | xor 1($sbox,$acc0,8),$t0 | ||
131 | xor 1($sbox,$acc1,8),$t1 | ||
132 | xor 2($sbox,$acc2,8),$t3 | ||
133 | |||
134 | mov 12($key),$s3 | ||
135 | movzb `&hi("$s1")`,$acc1 | ||
136 | movzb `&hi("$s2")`,$acc2 | ||
137 | mov 0($key),$s0 | ||
138 | xor 1($sbox,$acc1,8),$t2 | ||
139 | xor 1($sbox,$acc2,8),$t3 | ||
140 | |||
141 | mov 4($key),$s1 | ||
142 | mov 8($key),$s2 | ||
143 | xor $t0,$s0 | ||
144 | xor $t1,$s1 | ||
145 | xor $t2,$s2 | ||
146 | xor $t3,$s3 | ||
147 | ___ | ||
148 | } | ||
149 | |||
# enclastvert(): append the assembler for the last encryption round in
# "vertical" form to $code.  Table entries at $sbox are fetched and
# reduced to a single byte lane (movzb, or a 32-bit load masked with
# 0x0000ff00 / 0x00ff0000 / 0xff000000) before being combined in
# $t0..$t3.  The round key is read from 16+N($key) -- $key itself is not
# advanced -- and the result is left in $s0..$s3.  $t3 is %r8d, so $inp
# is overwritten.
150 | sub enclastvert() | ||
151 | { my $t3="%r8d"; # zaps $inp! | ||
152 | |||
153 | $code.=<<___; | ||
154 | movzb `&lo("$s0")`,$acc0 | ||
155 | movzb `&lo("$s1")`,$acc1 | ||
156 | movzb `&lo("$s2")`,$acc2 | ||
157 | movzb 2($sbox,$acc0,8),$t0 | ||
158 | movzb 2($sbox,$acc1,8),$t1 | ||
159 | movzb 2($sbox,$acc2,8),$t2 | ||
160 | |||
161 | movzb `&lo("$s3")`,$acc0 | ||
162 | movzb `&hi("$s1")`,$acc1 | ||
163 | movzb `&hi("$s2")`,$acc2 | ||
164 | movzb 2($sbox,$acc0,8),$t3 | ||
165 | mov 0($sbox,$acc1,8),$acc1 #$t0 | ||
166 | mov 0($sbox,$acc2,8),$acc2 #$t1 | ||
167 | |||
168 | and \$0x0000ff00,$acc1 | ||
169 | and \$0x0000ff00,$acc2 | ||
170 | |||
171 | xor $acc1,$t0 | ||
172 | xor $acc2,$t1 | ||
173 | shr \$16,$s2 | ||
174 | |||
175 | movzb `&hi("$s3")`,$acc0 | ||
176 | movzb `&hi("$s0")`,$acc1 | ||
177 | shr \$16,$s3 | ||
178 | mov 0($sbox,$acc0,8),$acc0 #$t2 | ||
179 | mov 0($sbox,$acc1,8),$acc1 #$t3 | ||
180 | |||
181 | and \$0x0000ff00,$acc0 | ||
182 | and \$0x0000ff00,$acc1 | ||
183 | shr \$16,$s1 | ||
184 | xor $acc0,$t2 | ||
185 | xor $acc1,$t3 | ||
186 | shr \$16,$s0 | ||
187 | |||
188 | movzb `&lo("$s2")`,$acc0 | ||
189 | movzb `&lo("$s3")`,$acc1 | ||
190 | movzb `&lo("$s0")`,$acc2 | ||
191 | mov 0($sbox,$acc0,8),$acc0 #$t0 | ||
192 | mov 0($sbox,$acc1,8),$acc1 #$t1 | ||
193 | mov 0($sbox,$acc2,8),$acc2 #$t2 | ||
194 | |||
195 | and \$0x00ff0000,$acc0 | ||
196 | and \$0x00ff0000,$acc1 | ||
197 | and \$0x00ff0000,$acc2 | ||
198 | |||
199 | xor $acc0,$t0 | ||
200 | xor $acc1,$t1 | ||
201 | xor $acc2,$t2 | ||
202 | |||
203 | movzb `&lo("$s1")`,$acc0 | ||
204 | movzb `&hi("$s3")`,$acc1 | ||
205 | movzb `&hi("$s0")`,$acc2 | ||
206 | mov 0($sbox,$acc0,8),$acc0 #$t3 | ||
207 | mov 2($sbox,$acc1,8),$acc1 #$t0 | ||
208 | mov 2($sbox,$acc2,8),$acc2 #$t1 | ||
209 | |||
210 | and \$0x00ff0000,$acc0 | ||
211 | and \$0xff000000,$acc1 | ||
212 | and \$0xff000000,$acc2 | ||
213 | |||
214 | xor $acc0,$t3 | ||
215 | xor $acc1,$t0 | ||
216 | xor $acc2,$t1 | ||
217 | |||
218 | movzb `&hi("$s1")`,$acc0 | ||
219 | movzb `&hi("$s2")`,$acc1 | ||
220 | mov 16+12($key),$s3 | ||
221 | mov 2($sbox,$acc0,8),$acc0 #$t2 | ||
222 | mov 2($sbox,$acc1,8),$acc1 #$t3 | ||
223 | mov 16+0($key),$s0 | ||
224 | |||
225 | and \$0xff000000,$acc0 | ||
226 | and \$0xff000000,$acc1 | ||
227 | |||
228 | xor $acc0,$t2 | ||
229 | xor $acc1,$t3 | ||
230 | |||
231 | mov 16+4($key),$s1 | ||
232 | mov 16+8($key),$s2 | ||
233 | xor $t0,$s0 | ||
234 | xor $t1,$s1 | ||
235 | xor $t2,$s2 | ||
236 | xor $t3,$s3 | ||
237 | ___ | ||
238 | } | ||
239 | |||
# encstep($i, @s): append the assembler computing output word $i (0..3)
# of one encryption round to $code.  @s holds the four state registers,
# rotated by the caller so that $s[0] supplies the low byte, $s[1] the
# second byte, $s[2] the third and $s[3] the top byte of the lookups.
# For $i = 0..2 the result goes to $t0/$t1/$t2 and $acc0..$acc2 serve as
# scratch.  For $i == 3 the result is built in $s[0] itself, $s[1..3]
# are used directly as scratch (their old contents are no longer needed)
# and are finally reloaded from $t0..$t2.
# Round key word 4*$i($key) is xored in; $key is advanced by 16 once per
# round, in the $i == 0 step.
240 | sub encstep() | ||
241 | { my ($i,@s) = @_; | ||
242 | my $tmp0=$acc0; | ||
243 | my $tmp1=$acc1; | ||
244 | my $tmp2=$acc2; | ||
245 | my $out=($t0,$t1,$t2,$s[0])[$i]; | ||
246 | |||
247 | if ($i==3) { | ||
248 | $tmp0=$s[1]; | ||
249 | $tmp1=$s[2]; | ||
250 | $tmp2=$s[3]; | ||
251 | } | ||
252 | $code.=" movzb ".&lo($s[0]).",$out\n"; | ||
253 | $code.=" mov $s[2],$tmp1\n" if ($i!=3); | ||
254 | $code.=" lea 16($key),$key\n" if ($i==0); | ||
255 | |||
256 | $code.=" movzb ".&hi($s[1]).",$tmp0\n"; | ||
257 | $code.=" mov 0($sbox,$out,8),$out\n"; | ||
258 | |||
259 | $code.=" shr \$16,$tmp1\n"; | ||
260 | $code.=" mov $s[3],$tmp2\n" if ($i!=3); | ||
261 | $code.=" xor 3($sbox,$tmp0,8),$out\n"; | ||
262 | |||
263 | $code.=" movzb ".&lo($tmp1).",$tmp1\n"; | ||
264 | $code.=" shr \$24,$tmp2\n"; | ||
265 | $code.=" xor 4*$i($key),$out\n"; | ||
266 | |||
267 | $code.=" xor 2($sbox,$tmp1,8),$out\n"; | ||
268 | $code.=" xor 1($sbox,$tmp2,8),$out\n"; | ||
269 | |||
270 | $code.=" mov $t0,$s[1]\n" if ($i==3); | ||
271 | $code.=" mov $t1,$s[2]\n" if ($i==3); | ||
272 | $code.=" mov $t2,$s[3]\n" if ($i==3); | ||
273 | $code.="\n"; | ||
274 | } | ||
275 | |||
# enclast($i, @s): append the assembler computing output word $i (0..3)
# of the last encryption round to $code.  Register roles are the same as
# in encstep(): $t0..$t2 receive words 0..2, word 3 is built in $s[0]
# with $s[1..3] as scratch, which are then reloaded from $t0..$t2.
# Each table entry is masked down to one byte lane (0x000000ff,
# 0x0000ff00, 0x00ff0000, 0xff000000) before being combined.  No round
# key is applied here; the caller emits the final key xor.
276 | sub enclast() | ||
277 | { my ($i,@s)=@_; | ||
278 | my $tmp0=$acc0; | ||
279 | my $tmp1=$acc1; | ||
280 | my $tmp2=$acc2; | ||
281 | my $out=($t0,$t1,$t2,$s[0])[$i]; | ||
282 | |||
283 | if ($i==3) { | ||
284 | $tmp0=$s[1]; | ||
285 | $tmp1=$s[2]; | ||
286 | $tmp2=$s[3]; | ||
287 | } | ||
288 | $code.=" movzb ".&lo($s[0]).",$out\n"; | ||
289 | $code.=" mov $s[2],$tmp1\n" if ($i!=3); | ||
290 | |||
291 | $code.=" mov 2($sbox,$out,8),$out\n"; | ||
292 | $code.=" shr \$16,$tmp1\n"; | ||
293 | $code.=" mov $s[3],$tmp2\n" if ($i!=3); | ||
294 | |||
295 | $code.=" and \$0x000000ff,$out\n"; | ||
296 | $code.=" movzb ".&hi($s[1]).",$tmp0\n"; | ||
297 | $code.=" movzb ".&lo($tmp1).",$tmp1\n"; | ||
298 | $code.=" shr \$24,$tmp2\n"; | ||
299 | |||
300 | $code.=" mov 0($sbox,$tmp0,8),$tmp0\n"; | ||
301 | $code.=" mov 0($sbox,$tmp1,8),$tmp1\n"; | ||
302 | $code.=" mov 2($sbox,$tmp2,8),$tmp2\n"; | ||
303 | |||
304 | $code.=" and \$0x0000ff00,$tmp0\n"; | ||
305 | $code.=" and \$0x00ff0000,$tmp1\n"; | ||
306 | $code.=" and \$0xff000000,$tmp2\n"; | ||
307 | |||
308 | $code.=" xor $tmp0,$out\n"; | ||
309 | $code.=" mov $t0,$s[1]\n" if ($i==3); | ||
310 | $code.=" xor $tmp1,$out\n"; | ||
311 | $code.=" mov $t1,$s[2]\n" if ($i==3); | ||
312 | $code.=" xor $tmp2,$out\n"; | ||
313 | $code.=" mov $t2,$s[3]\n" if ($i==3); | ||
314 | $code.="\n"; | ||
315 | } | ||
316 | |||
# Emit _x86_64_AES_encrypt.  The routine xors the state ($s0..$s3) with
# the round key at 0..12($key), loads the round count from 240($key) and
# runs .Lenc_loop (rounds - 1) times before the last-round code.
# $verticalspin selects the generators: encvert/enclastvert, or four
# encstep/enclast calls with the state registers rotated by one position
# per output word.
317 | $code.=<<___; | ||
318 | .type _x86_64_AES_encrypt,\@abi-omnipotent | ||
319 | .align 16 | ||
320 | _x86_64_AES_encrypt: | ||
321 | _CET_ENDBR | ||
322 | xor 0($key),$s0 # xor with key | ||
323 | xor 4($key),$s1 | ||
324 | xor 8($key),$s2 | ||
325 | xor 12($key),$s3 | ||
326 | |||
327 | mov 240($key),$rnds # load key->rounds | ||
328 | sub \$1,$rnds | ||
329 | jmp .Lenc_loop | ||
330 | .align 16 | ||
331 | .Lenc_loop: | ||
332 | ___ | ||
333 | if ($verticalspin) { &encvert(); } | ||
334 | else { &encstep(0,$s0,$s1,$s2,$s3); | ||
335 | &encstep(1,$s1,$s2,$s3,$s0); | ||
336 | &encstep(2,$s2,$s3,$s0,$s1); | ||
337 | &encstep(3,$s3,$s0,$s1,$s2); | ||
338 | } | ||
339 | $code.=<<___; | ||
340 | sub \$1,$rnds | ||
341 | jnz .Lenc_loop | ||
342 | ___ | ||
# enclastvert() applies the final round key itself; the enclast() path
# does not, so the key xor is emitted explicitly after it.
343 | if ($verticalspin) { &enclastvert(); } | ||
344 | else { &enclast(0,$s0,$s1,$s2,$s3); | ||
345 | &enclast(1,$s1,$s2,$s3,$s0); | ||
346 | &enclast(2,$s2,$s3,$s0,$s1); | ||
347 | &enclast(3,$s3,$s0,$s1,$s2); | ||
348 | $code.=<<___; | ||
349 | xor 16+0($key),$s0 # xor with key | ||
350 | xor 16+4($key),$s1 | ||
351 | xor 16+8($key),$s2 | ||
352 | xor 16+12($key),$s3 | ||
353 | ___ | ||
354 | } | ||
355 | $code.=<<___; | ||
356 | retq | ||
357 | .size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt | ||
358 | ___ | ||
359 | |||
360 | # it's possible to implement this by shifting tN by 8, filling least | ||
361 | # significant byte with byte load and finally bswap-ing at the end, | ||
362 | # but such partial register load kills Core 2... | ||
# enccompactvert(): append the byte-substitution part of a "compact"
# encryption round to $code.  Every byte of $s0..$s3 is looked up in the
# byte table at $sbox (scale 1), shifted into its lane (8, 16 or 24 bits)
# and combined; the result is left in $s0..$s3.  No key material is
# touched.  %r8d, %r9d and %r13d are used as extra temporaries, so the
# registers otherwise named $inp, $out and $rnds are overwritten.
363 | sub enccompactvert() | ||
364 | { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); | ||
365 | |||
366 | $code.=<<___; | ||
367 | movzb `&lo("$s0")`,$t0 | ||
368 | movzb `&lo("$s1")`,$t1 | ||
369 | movzb `&lo("$s2")`,$t2 | ||
370 | movzb ($sbox,$t0,1),$t0 | ||
371 | movzb ($sbox,$t1,1),$t1 | ||
372 | movzb ($sbox,$t2,1),$t2 | ||
373 | |||
374 | movzb `&lo("$s3")`,$t3 | ||
375 | movzb `&hi("$s1")`,$acc0 | ||
376 | movzb `&hi("$s2")`,$acc1 | ||
377 | movzb ($sbox,$t3,1),$t3 | ||
378 | movzb ($sbox,$acc0,1),$t4 #$t0 | ||
379 | movzb ($sbox,$acc1,1),$t5 #$t1 | ||
380 | |||
381 | movzb `&hi("$s3")`,$acc2 | ||
382 | movzb `&hi("$s0")`,$acc0 | ||
383 | shr \$16,$s2 | ||
384 | movzb ($sbox,$acc2,1),$acc2 #$t2 | ||
385 | movzb ($sbox,$acc0,1),$acc0 #$t3 | ||
386 | shr \$16,$s3 | ||
387 | |||
388 | movzb `&lo("$s2")`,$acc1 | ||
389 | shl \$8,$t4 | ||
390 | shl \$8,$t5 | ||
391 | movzb ($sbox,$acc1,1),$acc1 #$t0 | ||
392 | xor $t4,$t0 | ||
393 | xor $t5,$t1 | ||
394 | |||
395 | movzb `&lo("$s3")`,$t4 | ||
396 | shr \$16,$s0 | ||
397 | shr \$16,$s1 | ||
398 | movzb `&lo("$s0")`,$t5 | ||
399 | shl \$8,$acc2 | ||
400 | shl \$8,$acc0 | ||
401 | movzb ($sbox,$t4,1),$t4 #$t1 | ||
402 | movzb ($sbox,$t5,1),$t5 #$t2 | ||
403 | xor $acc2,$t2 | ||
404 | xor $acc0,$t3 | ||
405 | |||
406 | movzb `&lo("$s1")`,$acc2 | ||
407 | movzb `&hi("$s3")`,$acc0 | ||
408 | shl \$16,$acc1 | ||
409 | movzb ($sbox,$acc2,1),$acc2 #$t3 | ||
410 | movzb ($sbox,$acc0,1),$acc0 #$t0 | ||
411 | xor $acc1,$t0 | ||
412 | |||
413 | movzb `&hi("$s0")`,$acc1 | ||
414 | shr \$8,$s2 | ||
415 | shr \$8,$s1 | ||
416 | movzb ($sbox,$acc1,1),$acc1 #$t1 | ||
417 | movzb ($sbox,$s2,1),$s3 #$t3 | ||
418 | movzb ($sbox,$s1,1),$s2 #$t2 | ||
419 | shl \$16,$t4 | ||
420 | shl \$16,$t5 | ||
421 | shl \$16,$acc2 | ||
422 | xor $t4,$t1 | ||
423 | xor $t5,$t2 | ||
424 | xor $acc2,$t3 | ||
425 | |||
426 | shl \$24,$acc0 | ||
427 | shl \$24,$acc1 | ||
428 | shl \$24,$s3 | ||
429 | xor $acc0,$t0 | ||
430 | shl \$24,$s2 | ||
431 | xor $acc1,$t1 | ||
432 | mov $t0,$s0 | ||
433 | mov $t1,$s1 | ||
434 | xor $t2,$s2 | ||
435 | xor $t3,$s3 | ||
436 | ___ | ||
437 | } | ||
438 | |||
# enctransform_ref($sn): append the assembler transforming the single
# 32-bit state register $sn in place.  With x the input word:
#   r2 = ((x + x) & 0xfefefefe) ^ (((x & 0x80808080) -
#         ((x & 0x80808080) >> 7)) & 0x1b1b1b1b)
#   $sn = rol(x ^ r2, 24) ^ r2 ^ ror(x, 16) ^ ror(x, 24)
# i.e. r2 is the per-byte doubling of x with 0x1b reduction.  This looks
# like the AES MixColumns step for one column -- NOTE(review): confirm.
# Uses %r8d, %r9d and %r13d as scratch.  No caller is visible in this
# excerpt; presumably kept as the readable reference for enctransform().
439 | sub enctransform_ref() | ||
440 | { my $sn = shift; | ||
441 | my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d"); | ||
442 | |||
443 | $code.=<<___; | ||
444 | mov $sn,$acc | ||
445 | and \$0x80808080,$acc | ||
446 | mov $acc,$tmp | ||
447 | shr \$7,$tmp | ||
448 | lea ($sn,$sn),$r2 | ||
449 | sub $tmp,$acc | ||
450 | and \$0xfefefefe,$r2 | ||
451 | and \$0x1b1b1b1b,$acc | ||
452 | mov $sn,$tmp | ||
453 | xor $acc,$r2 | ||
454 | |||
455 | xor $r2,$sn | ||
456 | rol \$24,$sn | ||
457 | xor $r2,$sn | ||
458 | ror \$16,$tmp | ||
459 | xor $tmp,$sn | ||
460 | ror \$8,$tmp | ||
461 | xor $tmp,$sn | ||
462 | ___ | ||
463 | } | ||
464 | |||
465 | # unlike decrypt case it does not pay off to parallelize enctransform | ||
# enctransform(): append the assembler applying the enctransform_ref()
# computation to all four state words $s0..$s3 in place.  The words are
# handled as two interleaved pairs ($s0/$s1, then $s2/$s3).  $t3 is
# $acc2 here, and %r8d/%r9d hold the doubled values.  While the second
# pair is finished, the words at 0, 64, 128 and 192($sbox) are loaded
# into dead registers purely to touch the table (the "prefetch Te4"
# loads).
466 | sub enctransform() | ||
467 | { my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d"); | ||
468 | |||
469 | $code.=<<___; | ||
470 | mov $s0,$acc0 | ||
471 | mov $s1,$acc1 | ||
472 | and \$0x80808080,$acc0 | ||
473 | and \$0x80808080,$acc1 | ||
474 | mov $acc0,$t0 | ||
475 | mov $acc1,$t1 | ||
476 | shr \$7,$t0 | ||
477 | lea ($s0,$s0),$r20 | ||
478 | shr \$7,$t1 | ||
479 | lea ($s1,$s1),$r21 | ||
480 | sub $t0,$acc0 | ||
481 | sub $t1,$acc1 | ||
482 | and \$0xfefefefe,$r20 | ||
483 | and \$0xfefefefe,$r21 | ||
484 | and \$0x1b1b1b1b,$acc0 | ||
485 | and \$0x1b1b1b1b,$acc1 | ||
486 | mov $s0,$t0 | ||
487 | mov $s1,$t1 | ||
488 | xor $acc0,$r20 | ||
489 | xor $acc1,$r21 | ||
490 | |||
491 | xor $r20,$s0 | ||
492 | xor $r21,$s1 | ||
493 | mov $s2,$acc0 | ||
494 | mov $s3,$acc1 | ||
495 | rol \$24,$s0 | ||
496 | rol \$24,$s1 | ||
497 | and \$0x80808080,$acc0 | ||
498 | and \$0x80808080,$acc1 | ||
499 | xor $r20,$s0 | ||
500 | xor $r21,$s1 | ||
501 | mov $acc0,$t2 | ||
502 | mov $acc1,$t3 | ||
503 | ror \$16,$t0 | ||
504 | ror \$16,$t1 | ||
505 | shr \$7,$t2 | ||
506 | lea ($s2,$s2),$r20 | ||
507 | xor $t0,$s0 | ||
508 | xor $t1,$s1 | ||
509 | shr \$7,$t3 | ||
510 | lea ($s3,$s3),$r21 | ||
511 | ror \$8,$t0 | ||
512 | ror \$8,$t1 | ||
513 | sub $t2,$acc0 | ||
514 | sub $t3,$acc1 | ||
515 | xor $t0,$s0 | ||
516 | xor $t1,$s1 | ||
517 | |||
518 | and \$0xfefefefe,$r20 | ||
519 | and \$0xfefefefe,$r21 | ||
520 | and \$0x1b1b1b1b,$acc0 | ||
521 | and \$0x1b1b1b1b,$acc1 | ||
522 | mov $s2,$t2 | ||
523 | mov $s3,$t3 | ||
524 | xor $acc0,$r20 | ||
525 | xor $acc1,$r21 | ||
526 | |||
527 | xor $r20,$s2 | ||
528 | xor $r21,$s3 | ||
529 | rol \$24,$s2 | ||
530 | rol \$24,$s3 | ||
531 | xor $r20,$s2 | ||
532 | xor $r21,$s3 | ||
533 | mov 0($sbox),$acc0 # prefetch Te4 | ||
534 | ror \$16,$t2 | ||
535 | ror \$16,$t3 | ||
536 | mov 64($sbox),$acc1 | ||
537 | xor $t2,$s2 | ||
538 | xor $t3,$s3 | ||
539 | mov 128($sbox),$r20 | ||
540 | ror \$8,$t2 | ||
541 | ror \$8,$t3 | ||
542 | mov 192($sbox),$r21 | ||
543 | xor $t2,$s2 | ||
544 | xor $t3,$s3 | ||
545 | ___ | ||
546 | } | ||
547 | |||
# Emit _x86_64_AES_encrypt_compact.  On entry the routine touches the
# 256-byte table at $sbox in 32-byte steps (results are discarded), then
# loops: xor the state with the round key at $key, advance $key by 16,
# run enccompactvert(), and -- unless $key has reached the value stored
# at 16(%rsp) -- run enctransform() and repeat.  After the loop the
# final round key is xored in.
# 16(%rsp) as seen here is presumably the "end of key schedule" slot the
# caller stored at 8(%rsp) before the call pushed the return address --
# NOTE(review): confirm against the callers.
548 | $code.=<<___; | ||
549 | .type _x86_64_AES_encrypt_compact,\@abi-omnipotent | ||
550 | .align 16 | ||
551 | _x86_64_AES_encrypt_compact: | ||
552 | _CET_ENDBR | ||
553 | lea 128($sbox),$inp # size optimization | ||
554 | mov 0-128($inp),$acc1 # prefetch Te4 | ||
555 | mov 32-128($inp),$acc2 | ||
556 | mov 64-128($inp),$t0 | ||
557 | mov 96-128($inp),$t1 | ||
558 | mov 128-128($inp),$acc1 | ||
559 | mov 160-128($inp),$acc2 | ||
560 | mov 192-128($inp),$t0 | ||
561 | mov 224-128($inp),$t1 | ||
562 | jmp .Lenc_loop_compact | ||
563 | .align 16 | ||
564 | .Lenc_loop_compact: | ||
565 | xor 0($key),$s0 # xor with key | ||
566 | xor 4($key),$s1 | ||
567 | xor 8($key),$s2 | ||
568 | xor 12($key),$s3 | ||
569 | lea 16($key),$key | ||
570 | ___ | ||
571 | &enccompactvert(); | ||
572 | $code.=<<___; | ||
573 | cmp 16(%rsp),$key | ||
574 | je .Lenc_compact_done | ||
575 | ___ | ||
576 | &enctransform(); | ||
577 | $code.=<<___; | ||
578 | jmp .Lenc_loop_compact | ||
579 | .align 16 | ||
580 | .Lenc_compact_done: | ||
581 | xor 0($key),$s0 | ||
582 | xor 4($key),$s1 | ||
583 | xor 8($key),$s2 | ||
584 | xor 12($key),$s3 | ||
585 | retq | ||
586 | .size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact | ||
587 | ___ | ||
588 | |||
589 | # void aes_encrypt_internal(const void *inp, void *out, const AES_KEY *key); | ||
# Emit the public entry point; asm_AES_encrypt is a second, hidden
# global label on the same address.  The generated code:
#   - pushes %rbx, %rbp and %r12..%r15;
#   - carves out a 64-byte aligned frame whose position is derived from
#     the key address (see "allocate frame" below), and stores there:
#     0(%rsp) key schedule, 8(%rsp) end of key schedule (key + 16*rounds),
#     16(%rsp) the out pointer, 24(%rsp) the original stack pointer;
#   - loads the four input words from (%rdi) into $s0..$s3;
#   - selects a table address at .LAES_Te+2048 plus a multiple of 0x100
#     derived from the frame address;
#   - calls _x86_64_AES_encrypt_compact, stores the four result words to
#     out, restores the saved registers and returns.
590 | $code.=<<___; | ||
591 | .globl aes_encrypt_internal | ||
592 | .type aes_encrypt_internal,\@function,3 | ||
593 | .align 16 | ||
594 | .globl asm_AES_encrypt | ||
595 | .hidden asm_AES_encrypt | ||
596 | asm_AES_encrypt: | ||
597 | aes_encrypt_internal: | ||
598 | _CET_ENDBR | ||
599 | push %rbx | ||
600 | push %rbp | ||
601 | push %r12 | ||
602 | push %r13 | ||
603 | push %r14 | ||
604 | push %r15 | ||
605 | |||
606 | # allocate frame "above" key schedule | ||
607 | mov %rsp,%r10 | ||
608 | lea -63(%rdx),%rcx # %rdx is key argument | ||
609 | and \$-64,%rsp | ||
610 | sub %rsp,%rcx | ||
611 | neg %rcx | ||
612 | and \$0x3c0,%rcx | ||
613 | sub %rcx,%rsp | ||
614 | sub \$32,%rsp | ||
615 | |||
616 | mov %rsi,16(%rsp) # save out | ||
617 | mov %r10,24(%rsp) # save real stack pointer | ||
618 | .Lenc_prologue: | ||
619 | |||
620 | mov %rdx,$key | ||
621 | mov 240($key),$rnds # load rounds | ||
622 | |||
623 | mov 0(%rdi),$s0 # load input vector | ||
624 | mov 4(%rdi),$s1 | ||
625 | mov 8(%rdi),$s2 | ||
626 | mov 12(%rdi),$s3 | ||
627 | |||
628 | shl \$4,$rnds | ||
629 | lea ($key,$rnds),%rbp | ||
630 | mov $key,(%rsp) # key schedule | ||
631 | mov %rbp,8(%rsp) # end of key schedule | ||
632 | |||
633 | # pick Te4 copy which can't "overlap" with stack frame or key schedule | ||
634 | lea .LAES_Te+2048(%rip),$sbox | ||
635 | lea 768(%rsp),%rbp | ||
636 | sub $sbox,%rbp | ||
637 | and \$0x300,%rbp | ||
638 | lea ($sbox,%rbp),$sbox | ||
639 | |||
640 | call _x86_64_AES_encrypt_compact | ||
641 | |||
642 | mov 16(%rsp),$out # restore out | ||
643 | mov 24(%rsp),%rsi # restore saved stack pointer | ||
644 | mov $s0,0($out) # write output vector | ||
645 | mov $s1,4($out) | ||
646 | mov $s2,8($out) | ||
647 | mov $s3,12($out) | ||
648 | |||
649 | mov (%rsi),%r15 | ||
650 | mov 8(%rsi),%r14 | ||
651 | mov 16(%rsi),%r13 | ||
652 | mov 24(%rsi),%r12 | ||
653 | mov 32(%rsi),%rbp | ||
654 | mov 40(%rsi),%rbx | ||
655 | lea 48(%rsi),%rsp | ||
656 | .Lenc_epilogue: | ||
657 | ret | ||
658 | .size aes_encrypt_internal,.-aes_encrypt_internal | ||
659 | ___ | ||
660 | |||
661 | #------------------------------------------------------------------# | ||
662 | |||
# decvert(): append the assembler for one full decryption round in
# "vertical" form to $code.  Same structure as encvert() -- table reads
# at byte offsets 0, 3, 2 and 1 of the 8-byte entries at $sbox,
# accumulated in $t0..$t3 -- but the source word for each byte position
# is picked in the opposite rotation.  $key is advanced by 16 and the
# four words it then points at are xored in; the result is left in
# $s0..$s3.  $t3 is %r8d, so $inp is overwritten.
663 | sub decvert() | ||
664 | { my $t3="%r8d"; # zaps $inp! | ||
665 | |||
666 | $code.=<<___; | ||
667 | # favor 3-way issue Opteron pipeline... | ||
668 | movzb `&lo("$s0")`,$acc0 | ||
669 | movzb `&lo("$s1")`,$acc1 | ||
670 | movzb `&lo("$s2")`,$acc2 | ||
671 | mov 0($sbox,$acc0,8),$t0 | ||
672 | mov 0($sbox,$acc1,8),$t1 | ||
673 | mov 0($sbox,$acc2,8),$t2 | ||
674 | |||
675 | movzb `&hi("$s3")`,$acc0 | ||
676 | movzb `&hi("$s0")`,$acc1 | ||
677 | movzb `&lo("$s3")`,$acc2 | ||
678 | xor 3($sbox,$acc0,8),$t0 | ||
679 | xor 3($sbox,$acc1,8),$t1 | ||
680 | mov 0($sbox,$acc2,8),$t3 | ||
681 | |||
682 | movzb `&hi("$s1")`,$acc0 | ||
683 | shr \$16,$s0 | ||
684 | movzb `&hi("$s2")`,$acc2 | ||
685 | xor 3($sbox,$acc0,8),$t2 | ||
686 | shr \$16,$s3 | ||
687 | xor 3($sbox,$acc2,8),$t3 | ||
688 | |||
689 | shr \$16,$s1 | ||
690 | lea 16($key),$key | ||
691 | shr \$16,$s2 | ||
692 | |||
693 | movzb `&lo("$s2")`,$acc0 | ||
694 | movzb `&lo("$s3")`,$acc1 | ||
695 | movzb `&lo("$s0")`,$acc2 | ||
696 | xor 2($sbox,$acc0,8),$t0 | ||
697 | xor 2($sbox,$acc1,8),$t1 | ||
698 | xor 2($sbox,$acc2,8),$t2 | ||
699 | |||
700 | movzb `&hi("$s1")`,$acc0 | ||
701 | movzb `&hi("$s2")`,$acc1 | ||
702 | movzb `&lo("$s1")`,$acc2 | ||
703 | xor 1($sbox,$acc0,8),$t0 | ||
704 | xor 1($sbox,$acc1,8),$t1 | ||
705 | xor 2($sbox,$acc2,8),$t3 | ||
706 | |||
707 | movzb `&hi("$s3")`,$acc0 | ||
708 | mov 12($key),$s3 | ||
709 | movzb `&hi("$s0")`,$acc2 | ||
710 | xor 1($sbox,$acc0,8),$t2 | ||
711 | mov 0($key),$s0 | ||
712 | xor 1($sbox,$acc2,8),$t3 | ||
713 | |||
714 | xor $t0,$s0 | ||
715 | mov 4($key),$s1 | ||
716 | mov 8($key),$s2 | ||
717 | xor $t2,$s2 | ||
718 | xor $t1,$s1 | ||
719 | xor $t3,$s3 | ||
720 | ___ | ||
721 | } | ||
722 | |||
# declastvert(): append the assembler for the last decryption round in
# "vertical" form to $code.  $sbox is temporarily moved forward by 2048
# bytes to a byte-wide table (scale-1 movzb lookups) and moved back by
# 2048 before the end.  Looked-up bytes are shifted into their lanes (8,
# 16, 24 bits) and combined in $t0..$t3; the round key is read from
# 16+N($key) without advancing $key, and the result is left in $s0..$s3.
# $t3 is %r8d, so $inp is overwritten.
723 | sub declastvert() | ||
724 | { my $t3="%r8d"; # zaps $inp! | ||
725 | |||
726 | $code.=<<___; | ||
727 | lea 2048($sbox),$sbox # size optimization | ||
728 | movzb `&lo("$s0")`,$acc0 | ||
729 | movzb `&lo("$s1")`,$acc1 | ||
730 | movzb `&lo("$s2")`,$acc2 | ||
731 | movzb ($sbox,$acc0,1),$t0 | ||
732 | movzb ($sbox,$acc1,1),$t1 | ||
733 | movzb ($sbox,$acc2,1),$t2 | ||
734 | |||
735 | movzb `&lo("$s3")`,$acc0 | ||
736 | movzb `&hi("$s3")`,$acc1 | ||
737 | movzb `&hi("$s0")`,$acc2 | ||
738 | movzb ($sbox,$acc0,1),$t3 | ||
739 | movzb ($sbox,$acc1,1),$acc1 #$t0 | ||
740 | movzb ($sbox,$acc2,1),$acc2 #$t1 | ||
741 | |||
742 | shl \$8,$acc1 | ||
743 | shl \$8,$acc2 | ||
744 | |||
745 | xor $acc1,$t0 | ||
746 | xor $acc2,$t1 | ||
747 | shr \$16,$s3 | ||
748 | |||
749 | movzb `&hi("$s1")`,$acc0 | ||
750 | movzb `&hi("$s2")`,$acc1 | ||
751 | shr \$16,$s0 | ||
752 | movzb ($sbox,$acc0,1),$acc0 #$t2 | ||
753 | movzb ($sbox,$acc1,1),$acc1 #$t3 | ||
754 | |||
755 | shl \$8,$acc0 | ||
756 | shl \$8,$acc1 | ||
757 | shr \$16,$s1 | ||
758 | xor $acc0,$t2 | ||
759 | xor $acc1,$t3 | ||
760 | shr \$16,$s2 | ||
761 | |||
762 | movzb `&lo("$s2")`,$acc0 | ||
763 | movzb `&lo("$s3")`,$acc1 | ||
764 | movzb `&lo("$s0")`,$acc2 | ||
765 | movzb ($sbox,$acc0,1),$acc0 #$t0 | ||
766 | movzb ($sbox,$acc1,1),$acc1 #$t1 | ||
767 | movzb ($sbox,$acc2,1),$acc2 #$t2 | ||
768 | |||
769 | shl \$16,$acc0 | ||
770 | shl \$16,$acc1 | ||
771 | shl \$16,$acc2 | ||
772 | |||
773 | xor $acc0,$t0 | ||
774 | xor $acc1,$t1 | ||
775 | xor $acc2,$t2 | ||
776 | |||
777 | movzb `&lo("$s1")`,$acc0 | ||
778 | movzb `&hi("$s1")`,$acc1 | ||
779 | movzb `&hi("$s2")`,$acc2 | ||
780 | movzb ($sbox,$acc0,1),$acc0 #$t3 | ||
781 | movzb ($sbox,$acc1,1),$acc1 #$t0 | ||
782 | movzb ($sbox,$acc2,1),$acc2 #$t1 | ||
783 | |||
784 | shl \$16,$acc0 | ||
785 | shl \$24,$acc1 | ||
786 | shl \$24,$acc2 | ||
787 | |||
788 | xor $acc0,$t3 | ||
789 | xor $acc1,$t0 | ||
790 | xor $acc2,$t1 | ||
791 | |||
792 | movzb `&hi("$s3")`,$acc0 | ||
793 | movzb `&hi("$s0")`,$acc1 | ||
794 | mov 16+12($key),$s3 | ||
795 | movzb ($sbox,$acc0,1),$acc0 #$t2 | ||
796 | movzb ($sbox,$acc1,1),$acc1 #$t3 | ||
797 | mov 16+0($key),$s0 | ||
798 | |||
799 | shl \$24,$acc0 | ||
800 | shl \$24,$acc1 | ||
801 | |||
802 | xor $acc0,$t2 | ||
803 | xor $acc1,$t3 | ||
804 | |||
805 | mov 16+4($key),$s1 | ||
806 | mov 16+8($key),$s2 | ||
807 | lea -2048($sbox),$sbox | ||
808 | xor $t0,$s0 | ||
809 | xor $t1,$s1 | ||
810 | xor $t2,$s2 | ||
811 | xor $t3,$s3 | ||
812 | ___ | ||
813 | } | ||
814 | |||
# decstep($i, @s): append the assembler computing output word $i (0..3)
# of one decryption round to $code.  @s holds the state registers in the
# order chosen by the caller: $s[0] supplies the low byte, $s[1] the
# second byte, $s[2] the third and $s[3] the top byte of the lookups.
# For $i = 0..2 the result goes to $t0/$t1/$t2 with $acc0..$acc2 as
# scratch.  For $i == 3 the result is built in $s[0] and $s[1..3] are
# used directly as scratch, then reloaded from $t2, $t1 and $t0 (in that
# order, matching the reversed register order the caller passes).
# Unlike encstep(), no round key is applied and $key is not advanced.
815 | sub decstep() | ||
816 | { my ($i,@s) = @_; | ||
817 | my $tmp0=$acc0; | ||
818 | my $tmp1=$acc1; | ||
819 | my $tmp2=$acc2; | ||
820 | my $out=($t0,$t1,$t2,$s[0])[$i]; | ||
821 | |||
822 | $code.=" mov $s[0],$out\n" if ($i!=3); | ||
823 | $tmp1=$s[2] if ($i==3); | ||
824 | $code.=" mov $s[2],$tmp1\n" if ($i!=3); | ||
825 | $code.=" and \$0xFF,$out\n"; | ||
826 | |||
827 | $code.=" mov 0($sbox,$out,8),$out\n"; | ||
828 | $code.=" shr \$16,$tmp1\n"; | ||
829 | $tmp2=$s[3] if ($i==3); | ||
830 | $code.=" mov $s[3],$tmp2\n" if ($i!=3); | ||
831 | |||
832 | $tmp0=$s[1] if ($i==3); | ||
833 | $code.=" movzb ".&hi($s[1]).",$tmp0\n"; | ||
834 | $code.=" and \$0xFF,$tmp1\n"; | ||
835 | $code.=" shr \$24,$tmp2\n"; | ||
836 | |||
837 | $code.=" xor 3($sbox,$tmp0,8),$out\n"; | ||
838 | $code.=" xor 2($sbox,$tmp1,8),$out\n"; | ||
839 | $code.=" xor 1($sbox,$tmp2,8),$out\n"; | ||
840 | |||
841 | $code.=" mov $t2,$s[1]\n" if ($i==3); | ||
842 | $code.=" mov $t1,$s[2]\n" if ($i==3); | ||
843 | $code.=" mov $t0,$s[3]\n" if ($i==3); | ||
844 | $code.="\n"; | ||
845 | } | ||
846 | |||
# declast($i, @s): append the assembler computing output word $i (0..3)
# of the last decryption round to $code.  Register roles are the same as
# in decstep().  Each byte is looked up in the byte-wide table at
# 2048($sbox) (scale 1), shifted into its lane (8, 16, 24 bits) and
# combined.  No round key is applied here; the caller emits the final
# key xor.
847 | sub declast() | ||
848 | { my ($i,@s)=@_; | ||
849 | my $tmp0=$acc0; | ||
850 | my $tmp1=$acc1; | ||
851 | my $tmp2=$acc2; | ||
852 | my $out=($t0,$t1,$t2,$s[0])[$i]; | ||
853 | |||
854 | $code.=" mov $s[0],$out\n" if ($i!=3); | ||
855 | $tmp1=$s[2] if ($i==3); | ||
856 | $code.=" mov $s[2],$tmp1\n" if ($i!=3); | ||
857 | $code.=" and \$0xFF,$out\n"; | ||
858 | |||
859 | $code.=" movzb 2048($sbox,$out,1),$out\n"; | ||
860 | $code.=" shr \$16,$tmp1\n"; | ||
861 | $tmp2=$s[3] if ($i==3); | ||
862 | $code.=" mov $s[3],$tmp2\n" if ($i!=3); | ||
863 | |||
864 | $tmp0=$s[1] if ($i==3); | ||
865 | $code.=" movzb ".&hi($s[1]).",$tmp0\n"; | ||
866 | $code.=" and \$0xFF,$tmp1\n"; | ||
867 | $code.=" shr \$24,$tmp2\n"; | ||
868 | |||
869 | $code.=" movzb 2048($sbox,$tmp0,1),$tmp0\n"; | ||
870 | $code.=" movzb 2048($sbox,$tmp1,1),$tmp1\n"; | ||
871 | $code.=" movzb 2048($sbox,$tmp2,1),$tmp2\n"; | ||
872 | |||
873 | $code.=" shl \$8,$tmp0\n"; | ||
874 | $code.=" shl \$16,$tmp1\n"; | ||
875 | $code.=" shl \$24,$tmp2\n"; | ||
876 | |||
877 | $code.=" xor $tmp0,$out\n"; | ||
878 | $code.=" mov $t2,$s[1]\n" if ($i==3); | ||
879 | $code.=" xor $tmp1,$out\n"; | ||
880 | $code.=" mov $t1,$s[2]\n" if ($i==3); | ||
881 | $code.=" xor $tmp2,$out\n"; | ||
882 | $code.=" mov $t0,$s[3]\n" if ($i==3); | ||
883 | $code.="\n"; | ||
884 | } | ||
885 | |||
# Emit _x86_64_AES_decrypt.  The routine xors the state ($s0..$s3) with
# the round key at 0..12($key), loads the round count from 240($key) and
# runs .Ldec_loop (rounds - 1) times before the last-round code.
# $verticalspin selects the generators: decvert/declastvert, or four
# decstep/declast calls.  decstep()/declast() do not touch the key, so
# on that path the key advance and key xor are emitted explicitly after
# each group of four calls.
886 | $code.=<<___; | ||
887 | .type _x86_64_AES_decrypt,\@abi-omnipotent | ||
888 | .align 16 | ||
889 | _x86_64_AES_decrypt: | ||
890 | _CET_ENDBR | ||
891 | xor 0($key),$s0 # xor with key | ||
892 | xor 4($key),$s1 | ||
893 | xor 8($key),$s2 | ||
894 | xor 12($key),$s3 | ||
895 | |||
896 | mov 240($key),$rnds # load key->rounds | ||
897 | sub \$1,$rnds | ||
898 | jmp .Ldec_loop | ||
899 | .align 16 | ||
900 | .Ldec_loop: | ||
901 | ___ | ||
902 | if ($verticalspin) { &decvert(); } | ||
903 | else { &decstep(0,$s0,$s3,$s2,$s1); | ||
904 | &decstep(1,$s1,$s0,$s3,$s2); | ||
905 | &decstep(2,$s2,$s1,$s0,$s3); | ||
906 | &decstep(3,$s3,$s2,$s1,$s0); | ||
907 | $code.=<<___; | ||
908 | lea 16($key),$key | ||
909 | xor 0($key),$s0 # xor with key | ||
910 | xor 4($key),$s1 | ||
911 | xor 8($key),$s2 | ||
912 | xor 12($key),$s3 | ||
913 | ___ | ||
914 | } | ||
915 | $code.=<<___; | ||
916 | sub \$1,$rnds | ||
917 | jnz .Ldec_loop | ||
918 | ___ | ||
919 | if ($verticalspin) { &declastvert(); } | ||
920 | else { &declast(0,$s0,$s3,$s2,$s1); | ||
921 | &declast(1,$s1,$s0,$s3,$s2); | ||
922 | &declast(2,$s2,$s1,$s0,$s3); | ||
923 | &declast(3,$s3,$s2,$s1,$s0); | ||
924 | $code.=<<___; | ||
925 | xor 16+0($key),$s0 # xor with key | ||
926 | xor 16+4($key),$s1 | ||
927 | xor 16+8($key),$s2 | ||
928 | xor 16+12($key),$s3 | ||
929 | ___ | ||
930 | } | ||
931 | $code.=<<___; | ||
932 | retq | ||
933 | .size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt | ||
934 | ___ | ||
935 | |||
# deccompactvert(): append the byte-substitution part of a "compact"
# decryption round to $code.  Every byte of $s0..$s3 is looked up in the
# byte table at $sbox (scale 1), shifted into its lane (8, 16 or 24 bits)
# and combined; the result is left in $s0..$s3.  No key material is
# touched.  %r8d, %r9d and %r13d are used as extra temporaries, so the
# registers otherwise named $inp, $out and $rnds are overwritten.
936 | sub deccompactvert() | ||
937 | { my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d"); | ||
938 | |||
939 | $code.=<<___; | ||
940 | movzb `&lo("$s0")`,$t0 | ||
941 | movzb `&lo("$s1")`,$t1 | ||
942 | movzb `&lo("$s2")`,$t2 | ||
943 | movzb ($sbox,$t0,1),$t0 | ||
944 | movzb ($sbox,$t1,1),$t1 | ||
945 | movzb ($sbox,$t2,1),$t2 | ||
946 | |||
947 | movzb `&lo("$s3")`,$t3 | ||
948 | movzb `&hi("$s3")`,$acc0 | ||
949 | movzb `&hi("$s0")`,$acc1 | ||
950 | movzb ($sbox,$t3,1),$t3 | ||
951 | movzb ($sbox,$acc0,1),$t4 #$t0 | ||
952 | movzb ($sbox,$acc1,1),$t5 #$t1 | ||
953 | |||
954 | movzb `&hi("$s1")`,$acc2 | ||
955 | movzb `&hi("$s2")`,$acc0 | ||
956 | shr \$16,$s2 | ||
957 | movzb ($sbox,$acc2,1),$acc2 #$t2 | ||
958 | movzb ($sbox,$acc0,1),$acc0 #$t3 | ||
959 | shr \$16,$s3 | ||
960 | |||
961 | movzb `&lo("$s2")`,$acc1 | ||
962 | shl \$8,$t4 | ||
963 | shl \$8,$t5 | ||
964 | movzb ($sbox,$acc1,1),$acc1 #$t0 | ||
965 | xor $t4,$t0 | ||
966 | xor $t5,$t1 | ||
967 | |||
968 | movzb `&lo("$s3")`,$t4 | ||
969 | shr \$16,$s0 | ||
970 | shr \$16,$s1 | ||
971 | movzb `&lo("$s0")`,$t5 | ||
972 | shl \$8,$acc2 | ||
973 | shl \$8,$acc0 | ||
974 | movzb ($sbox,$t4,1),$t4 #$t1 | ||
975 | movzb ($sbox,$t5,1),$t5 #$t2 | ||
976 | xor $acc2,$t2 | ||
977 | xor $acc0,$t3 | ||
978 | |||
979 | movzb `&lo("$s1")`,$acc2 | ||
980 | movzb `&hi("$s1")`,$acc0 | ||
981 | shl \$16,$acc1 | ||
982 | movzb ($sbox,$acc2,1),$acc2 #$t3 | ||
983 | movzb ($sbox,$acc0,1),$acc0 #$t0 | ||
984 | xor $acc1,$t0 | ||
985 | |||
986 | movzb `&hi("$s2")`,$acc1 | ||
987 | shl \$16,$t4 | ||
988 | shl \$16,$t5 | ||
989 | movzb ($sbox,$acc1,1),$s1 #$t1 | ||
990 | xor $t4,$t1 | ||
991 | xor $t5,$t2 | ||
992 | |||
993 | movzb `&hi("$s3")`,$acc1 | ||
994 | shr \$8,$s0 | ||
995 | shl \$16,$acc2 | ||
996 | movzb ($sbox,$acc1,1),$s2 #$t2 | ||
997 | movzb ($sbox,$s0,1),$s3 #$t3 | ||
998 | xor $acc2,$t3 | ||
999 | |||
1000 | shl \$24,$acc0 | ||
1001 | shl \$24,$s1 | ||
1002 | shl \$24,$s2 | ||
1003 | xor $acc0,$t0 | ||
1004 | shl \$24,$s3 | ||
1005 | xor $t1,$s1 | ||
1006 | mov $t0,$s0 | ||
1007 | xor $t2,$s2 | ||
1008 | xor $t3,$s3 | ||
1009 | ___ | ||
1010 | } | ||
1011 | |||
1012 | # parallelized version! input is pair of 64-bit values: %rax=s1.s0 | ||
1013 | # and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1, | ||
1014 | # %ecx=s2 and %edx=s3. | ||
#
# dectransform([$prefetch]) appends one straight-line instruction sequence
# to $code. The sequence handles both 64-bit lanes at once: registers named
# ...0 belong to the %rax lane, registers named ...8 to the %rcx lane, and
# the instructions of the two lanes are interleaved.
#
# Steps, as spelled out by the inline comments:
#   1. three packed "doubling" passes derive tp2, tp4 and tp8 from tp1 using
#      $mask80/$maskfe/$mask1b (the same and/shr/sub/and/xor pattern that
#      deckey_ref writes with the immediates 0x80808080/0xfefefefe/0x1b1b1b1b);
#   2. each 32-bit word becomes
#        (tp8^tp4^tp2) ^ ROTATE(tp1^tp8,8) ^ ROTATE(tp2^tp1^tp8,24)
#                      ^ ROTATE(tp4^tp1^tp8,16)
#      with the upper half of each lane split off into $acc0/$acc8 (%rbx/%rdx)
#      by the "shr $32" instructions.
#
# $prefetch (optional, taken with shift): when true, five extra loads from
# 0/64/128/192/256($sbox) are interleaved into the tail. Their destinations
# ($mask80, $maskfe, $mask1b, $tp80, $tp88) are no longer read by this
# sequence at that point, so here they only touch those addresses.
#
# NOTE: the sub is declared with an empty prototype yet reads an argument;
# this works because it is invoked as &dectransform(...), which bypasses
# prototype checking.
# NOTE(review): the `...` expressions (&LO(...), the conditional prefetch
# lines) are presumably expanded when $code is post-processed later in the
# file -- that step is not visible in this section.
1015 | sub dectransform() | ||
1016 | { my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx"); | ||
1017 | my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx"); | ||
1018 | my $prefetch = shift; | ||
1019 | |||
1020 | $code.=<<___; | ||
1021 | mov $tp10,$acc0 | ||
1022 | mov $tp18,$acc8 | ||
1023 | and $mask80,$acc0 | ||
1024 | and $mask80,$acc8 | ||
1025 | mov $acc0,$tp40 | ||
1026 | mov $acc8,$tp48 | ||
1027 | shr \$7,$tp40 | ||
1028 | lea ($tp10,$tp10),$tp20 | ||
1029 | shr \$7,$tp48 | ||
1030 | lea ($tp18,$tp18),$tp28 | ||
1031 | sub $tp40,$acc0 | ||
1032 | sub $tp48,$acc8 | ||
1033 | and $maskfe,$tp20 | ||
1034 | and $maskfe,$tp28 | ||
1035 | and $mask1b,$acc0 | ||
1036 | and $mask1b,$acc8 | ||
1037 | xor $tp20,$acc0 | ||
1038 | xor $tp28,$acc8 | ||
1039 | mov $acc0,$tp20 | ||
1040 | mov $acc8,$tp28 | ||
1041 | |||
1042 | and $mask80,$acc0 | ||
1043 | and $mask80,$acc8 | ||
1044 | mov $acc0,$tp80 | ||
1045 | mov $acc8,$tp88 | ||
1046 | shr \$7,$tp80 | ||
1047 | lea ($tp20,$tp20),$tp40 | ||
1048 | shr \$7,$tp88 | ||
1049 | lea ($tp28,$tp28),$tp48 | ||
1050 | sub $tp80,$acc0 | ||
1051 | sub $tp88,$acc8 | ||
1052 | and $maskfe,$tp40 | ||
1053 | and $maskfe,$tp48 | ||
1054 | and $mask1b,$acc0 | ||
1055 | and $mask1b,$acc8 | ||
1056 | xor $tp40,$acc0 | ||
1057 | xor $tp48,$acc8 | ||
1058 | mov $acc0,$tp40 | ||
1059 | mov $acc8,$tp48 | ||
1060 | |||
1061 | and $mask80,$acc0 | ||
1062 | and $mask80,$acc8 | ||
1063 | mov $acc0,$tp80 | ||
1064 | mov $acc8,$tp88 | ||
1065 | shr \$7,$tp80 | ||
1066 | xor $tp10,$tp20 # tp2^=tp1 | ||
1067 | shr \$7,$tp88 | ||
1068 | xor $tp18,$tp28 # tp2^=tp1 | ||
1069 | sub $tp80,$acc0 | ||
1070 | sub $tp88,$acc8 | ||
1071 | lea ($tp40,$tp40),$tp80 | ||
1072 | lea ($tp48,$tp48),$tp88 | ||
1073 | xor $tp10,$tp40 # tp4^=tp1 | ||
1074 | xor $tp18,$tp48 # tp4^=tp1 | ||
1075 | and $maskfe,$tp80 | ||
1076 | and $maskfe,$tp88 | ||
1077 | and $mask1b,$acc0 | ||
1078 | and $mask1b,$acc8 | ||
1079 | xor $acc0,$tp80 | ||
1080 | xor $acc8,$tp88 | ||
1081 | |||
1082 | xor $tp80,$tp10 # tp1^=tp8 | ||
1083 | xor $tp88,$tp18 # tp1^=tp8 | ||
1084 | xor $tp80,$tp20 # tp2^tp1^=tp8 | ||
1085 | xor $tp88,$tp28 # tp2^tp1^=tp8 | ||
1086 | mov $tp10,$acc0 | ||
1087 | mov $tp18,$acc8 | ||
1088 | xor $tp80,$tp40 # tp4^tp1^=tp8 | ||
1089 | xor $tp88,$tp48 # tp4^tp1^=tp8 | ||
1090 | shr \$32,$acc0 | ||
1091 | shr \$32,$acc8 | ||
1092 | xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1 | ||
1093 | xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1 | ||
1094 | rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8) | ||
1095 | rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8) | ||
1096 | xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 | ||
1097 | xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2 | ||
1098 | |||
1099 | rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8) | ||
1100 | rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8) | ||
1101 | xor `&LO("$tp80")`,`&LO("$tp10")` | ||
1102 | xor `&LO("$tp88")`,`&LO("$tp18")` | ||
1103 | shr \$32,$tp80 | ||
1104 | shr \$32,$tp88 | ||
1105 | xor `&LO("$tp80")`,`&LO("$acc0")` | ||
1106 | xor `&LO("$tp88")`,`&LO("$acc8")` | ||
1107 | |||
1108 | mov $tp20,$tp80 | ||
1109 | mov $tp28,$tp88 | ||
1110 | shr \$32,$tp80 | ||
1111 | shr \$32,$tp88 | ||
1112 | rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24) | ||
1113 | rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24) | ||
1114 | rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24) | ||
1115 | rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24) | ||
1116 | xor `&LO("$tp20")`,`&LO("$tp10")` | ||
1117 | xor `&LO("$tp28")`,`&LO("$tp18")` | ||
1118 | mov $tp40,$tp20 | ||
1119 | mov $tp48,$tp28 | ||
1120 | xor `&LO("$tp80")`,`&LO("$acc0")` | ||
1121 | xor `&LO("$tp88")`,`&LO("$acc8")` | ||
1122 | |||
1123 | `"mov 0($sbox),$mask80" if ($prefetch)` | ||
1124 | shr \$32,$tp20 | ||
1125 | shr \$32,$tp28 | ||
1126 | `"mov 64($sbox),$maskfe" if ($prefetch)` | ||
1127 | rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16) | ||
1128 | rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16) | ||
1129 | `"mov 128($sbox),$mask1b" if ($prefetch)` | ||
1130 | rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16) | ||
1131 | rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16) | ||
1132 | `"mov 192($sbox),$tp80" if ($prefetch)` | ||
1133 | xor `&LO("$tp40")`,`&LO("$tp10")` | ||
1134 | xor `&LO("$tp48")`,`&LO("$tp18")` | ||
1135 | `"mov 256($sbox),$tp88" if ($prefetch)` | ||
1136 | xor `&LO("$tp20")`,`&LO("$acc0")` | ||
1137 | xor `&LO("$tp28")`,`&LO("$acc8")` | ||
1138 | ___ | ||
1139 | } | ||
1140 | |||
# Emit the internal routine _x86_64_AES_decrypt_compact.
#
# Prologue: eight loads at 32-byte steps across the 256 bytes starting at
# $sbox ("prefetch Td4"); the loaded values are immediately overwritten.
# Loop (.Ldec_loop_compact): xor the state words with the 16-byte round key
# at $key, advance $key by 16, run the code emitted by deccompactvert(), then
# compare $key with the pointer found at 16(%rsp). The callers in this file
# store the end-of-key-schedule pointer at their own 8(%rsp) before the call,
# which is 16(%rsp) once the return address has been pushed.
1141 | $code.=<<___; | ||
1142 | .type _x86_64_AES_decrypt_compact,\@abi-omnipotent | ||
1143 | .align 16 | ||
1144 | _x86_64_AES_decrypt_compact: | ||
1145 | _CET_ENDBR | ||
1146 | lea 128($sbox),$inp # size optimization | ||
1147 | mov 0-128($inp),$acc1 # prefetch Td4 | ||
1148 | mov 32-128($inp),$acc2 | ||
1149 | mov 64-128($inp),$t0 | ||
1150 | mov 96-128($inp),$t1 | ||
1151 | mov 128-128($inp),$acc1 | ||
1152 | mov 160-128($inp),$acc2 | ||
1153 | mov 192-128($inp),$t0 | ||
1154 | mov 224-128($inp),$t1 | ||
1155 | jmp .Ldec_loop_compact | ||
1156 | |||
1157 | .align 16 | ||
1158 | .Ldec_loop_compact: | ||
1159 | xor 0($key),$s0 # xor with key | ||
1160 | xor 4($key),$s1 | ||
1161 | xor 8($key),$s2 | ||
1162 | xor 12($key),$s3 | ||
1163 | lea 16($key),$key | ||
1164 | ___ | ||
1165 | &deccompactvert(); | ||
# Not the last round yet: reload the three mask constants from 256+0/8/16
# bytes past $sbox and pack the four 32-bit words into %rax (=%rbx:%eax) and
# %rcx (=%rdx:%ecx), which is the input layout dectransform expects.
1166 | $code.=<<___; | ||
1167 | cmp 16(%rsp),$key | ||
1168 | je .Ldec_compact_done | ||
1169 | |||
1170 | mov 256+0($sbox),$mask80 | ||
1171 | shl \$32,%rbx | ||
1172 | shl \$32,%rdx | ||
1173 | mov 256+8($sbox),$maskfe | ||
1174 | or %rbx,%rax | ||
1175 | or %rdx,%rcx | ||
1176 | mov 256+16($sbox),$mask1b | ||
1177 | ___ | ||
# Called with a true argument so the emitted transform also carries the
# interleaved table loads; those overwrite the mask registers, which is why
# the masks are reloaded on every iteration above.
1178 | &dectransform(1); | ||
# Exit path: after the last substitution step only the final round key is
# xored in before returning.
1179 | $code.=<<___; | ||
1180 | jmp .Ldec_loop_compact | ||
1181 | .align 16 | ||
1182 | .Ldec_compact_done: | ||
1183 | xor 0($key),$s0 | ||
1184 | xor 4($key),$s1 | ||
1185 | xor 8($key),$s2 | ||
1186 | xor 12($key),$s3 | ||
1187 | retq | ||
1188 | .size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact | ||
1189 | ___ | ||
1190 | |||
1191 | # void aes_decrypt_internal(const void *inp, void *out, const AES_KEY *key); | ||
#
# Emit the public single-block decrypt entry point; the same address is also
# exported under the hidden alias asm_AES_decrypt.
#   %rdi = inp (16 bytes are read), %rsi = out (16 bytes are written),
#   %rdx = key (round count is read from offset 240).
# Flow: push the six callee-saved registers, carve a 64-byte-aligned frame
# whose position is derived from the key schedule address, record
#   0(%rsp) key schedule, 8(%rsp) end of key schedule (key + 16*rounds),
#   16(%rsp) out, 24(%rsp) original %rsp,
# choose one of the table copies past .LAES_Td+2048, call
# _x86_64_AES_decrypt_compact, store the four result words and unwind.
#
# Table selection: %rbp is reduced to one of 0/0x100/0x200/0x300 and added to
# $sbox, then %rbp/8 (0/32/64/96) is added as well, so the selectable copies
# are 256+32 bytes apart. NOTE(review): the extra 32 bytes presumably hold
# the constants read at 256+0/8/16($sbox) by the compact routine -- the table
# data itself is outside this section, confirm there.
1192 | $code.=<<___; | ||
1193 | .globl aes_decrypt_internal | ||
1194 | .type aes_decrypt_internal,\@function,3 | ||
1195 | .align 16 | ||
1196 | .globl asm_AES_decrypt | ||
1197 | .hidden asm_AES_decrypt | ||
1198 | asm_AES_decrypt: | ||
1199 | aes_decrypt_internal: | ||
1200 | _CET_ENDBR | ||
1201 | push %rbx | ||
1202 | push %rbp | ||
1203 | push %r12 | ||
1204 | push %r13 | ||
1205 | push %r14 | ||
1206 | push %r15 | ||
1207 | |||
1208 | # allocate frame "above" key schedule | ||
1209 | mov %rsp,%r10 | ||
1210 | lea -63(%rdx),%rcx # %rdx is key argument | ||
1211 | and \$-64,%rsp | ||
1212 | sub %rsp,%rcx | ||
1213 | neg %rcx | ||
1214 | and \$0x3c0,%rcx | ||
1215 | sub %rcx,%rsp | ||
1216 | sub \$32,%rsp | ||
1217 | |||
1218 | mov %rsi,16(%rsp) # save out | ||
1219 | mov %r10,24(%rsp) # save real stack pointer | ||
1220 | .Ldec_prologue: | ||
1221 | |||
1222 | mov %rdx,$key | ||
1223 | mov 240($key),$rnds # load rounds | ||
1224 | |||
1225 | mov 0(%rdi),$s0 # load input vector | ||
1226 | mov 4(%rdi),$s1 | ||
1227 | mov 8(%rdi),$s2 | ||
1228 | mov 12(%rdi),$s3 | ||
1229 | |||
1230 | shl \$4,$rnds | ||
1231 | lea ($key,$rnds),%rbp | ||
1232 | mov $key,(%rsp) # key schedule | ||
1233 | mov %rbp,8(%rsp) # end of key schedule | ||
1234 | |||
1235 | # pick Td4 copy which can't "overlap" with stack frame or key schedule | ||
1236 | lea .LAES_Td+2048(%rip),$sbox | ||
1237 | lea 768(%rsp),%rbp | ||
1238 | sub $sbox,%rbp | ||
1239 | and \$0x300,%rbp | ||
1240 | lea ($sbox,%rbp),$sbox | ||
1241 | shr \$3,%rbp # recall "magic" constants! | ||
1242 | add %rbp,$sbox | ||
1243 | |||
1244 | call _x86_64_AES_decrypt_compact | ||
1245 | |||
1246 | mov 16(%rsp),$out # restore out | ||
1247 | mov 24(%rsp),%rsi # restore saved stack pointer | ||
1248 | mov $s0,0($out) # write output vector | ||
1249 | mov $s1,4($out) | ||
1250 | mov $s2,8($out) | ||
1251 | mov $s3,12($out) | ||
1252 | |||
1253 | mov (%rsi),%r15 | ||
1254 | mov 8(%rsi),%r14 | ||
1255 | mov 16(%rsi),%r13 | ||
1256 | mov 24(%rsi),%r12 | ||
1257 | mov 32(%rsi),%rbp | ||
1258 | mov 40(%rsi),%rbx | ||
1259 | lea 48(%rsi),%rsp | ||
1260 | .Ldec_epilogue: | ||
1261 | ret | ||
1262 | .size aes_decrypt_internal,.-aes_decrypt_internal | ||
1263 | ___ | ||
1264 | #------------------------------------------------------------------# | ||
1265 | |||
# enckey(): append the shared key-expansion step to $code. Takes no
# arguments.
#
# Register contract of the emitted code:
#   in:  %eax = word to be updated, %edx = previous round-key word,
#        %rbp = byte-table base + 128 (lookups use -128(%rbp,idx)),
#        %rcx = index of the round constant (read at 1024-128(%rbp,%rcx,4)).
#   out: %eax ^= substituted-and-rotated %edx ^ round constant.
#   clobbers %ebx, %esi and %edx (shifted right by 16).
# The four bytes of %edx are looked up one at a time and placed one byte
# position lower than where they came from (byte 0 goes to bits 24..31,
# byte 1 to bits 0..7, byte 2 to bits 8..15, byte 3 to bits 16..23).
1266 | sub enckey() | ||
1267 | { | ||
1268 | $code.=<<___; | ||
1269 | movz %dl,%esi # rk[i]>>0 | ||
1270 | movzb -128(%rbp,%rsi),%ebx | ||
1271 | movz %dh,%esi # rk[i]>>8 | ||
1272 | shl \$24,%ebx | ||
1273 | xor %ebx,%eax | ||
1274 | |||
1275 | movzb -128(%rbp,%rsi),%ebx | ||
1276 | shr \$16,%edx | ||
1277 | movz %dl,%esi # rk[i]>>16 | ||
1278 | xor %ebx,%eax | ||
1279 | |||
1280 | movzb -128(%rbp,%rsi),%ebx | ||
1281 | movz %dh,%esi # rk[i]>>24 | ||
1282 | shl \$8,%ebx | ||
1283 | xor %ebx,%eax | ||
1284 | |||
1285 | movzb -128(%rbp,%rsi),%ebx | ||
1286 | shl \$16,%ebx | ||
1287 | xor %ebx,%eax | ||
1288 | |||
1289 | xor 1024-128(%rbp,%rcx,4),%eax # rcon | ||
1290 | ___ | ||
1291 | } | ||
1292 | |||
1293 | # int aes_set_encrypt_key_internal(const unsigned char *userKey, const int bits, | ||
1294 | # AES_KEY *key) | ||
#
# Emits two routines:
#   aes_set_encrypt_key_internal - public wrapper: pushes six registers plus
#     an 8-byte pad (56 bytes in total), calls the worker, restores and
#     returns the worker's %rax.
#   _x86_64_AES_set_encrypt_key  - worker. Result in %rax:
#        0  success,
#       -1  userKey or key is NULL (.Lbadpointer),
#       -2  bits is not 128, 192 or 256.
#     After the argument shuffle %rsi = userKey, %rdi = key, %ecx = bits, and
#     %rbp points 128 bytes into the table at .LAES_Te+2048.
# This heredoc ends at .L10shortcut, the point where the 128-bit loop falls
# into the code emitted by enckey().
1295 | $code.=<<___; | ||
1296 | .globl aes_set_encrypt_key_internal | ||
1297 | .type aes_set_encrypt_key_internal,\@function,3 | ||
1298 | .align 16 | ||
1299 | aes_set_encrypt_key_internal: | ||
1300 | _CET_ENDBR | ||
1301 | push %rbx | ||
1302 | push %rbp | ||
1303 | push %r12 # redundant, but allows to share | ||
1304 | push %r13 # exception handler... | ||
1305 | push %r14 | ||
1306 | push %r15 | ||
1307 | sub \$8,%rsp | ||
1308 | .Lenc_key_prologue: | ||
1309 | |||
1310 | call _x86_64_AES_set_encrypt_key | ||
1311 | |||
1312 | mov 8(%rsp),%r15 | ||
1313 | mov 16(%rsp),%r14 | ||
1314 | mov 24(%rsp),%r13 | ||
1315 | mov 32(%rsp),%r12 | ||
1316 | mov 40(%rsp),%rbp | ||
1317 | mov 48(%rsp),%rbx | ||
1318 | add \$56,%rsp | ||
1319 | .Lenc_key_epilogue: | ||
1320 | ret | ||
1321 | .size aes_set_encrypt_key_internal,.-aes_set_encrypt_key_internal | ||
1322 | |||
1323 | .type _x86_64_AES_set_encrypt_key,\@abi-omnipotent | ||
1324 | .align 16 | ||
1325 | _x86_64_AES_set_encrypt_key: | ||
1326 | _CET_ENDBR | ||
1327 | mov %esi,%ecx # %ecx=bits | ||
1328 | mov %rdi,%rsi # %rsi=userKey | ||
1329 | mov %rdx,%rdi # %rdi=key | ||
1330 | |||
1331 | test \$-1,%rsi | ||
1332 | jz .Lbadpointer | ||
1333 | test \$-1,%rdi | ||
1334 | jz .Lbadpointer | ||
1335 | |||
1336 | lea .LAES_Te(%rip),%rbp | ||
1337 | lea 2048+128(%rbp),%rbp | ||
1338 | |||
1339 | # prefetch Te4 | ||
1340 | mov 0-128(%rbp),%eax | ||
1341 | mov 32-128(%rbp),%ebx | ||
1342 | mov 64-128(%rbp),%r8d | ||
1343 | mov 96-128(%rbp),%edx | ||
1344 | mov 128-128(%rbp),%eax | ||
1345 | mov 160-128(%rbp),%ebx | ||
1346 | mov 192-128(%rbp),%r8d | ||
1347 | mov 224-128(%rbp),%edx | ||
1348 | |||
1349 | cmp \$128,%ecx | ||
1350 | je .L10rounds | ||
1351 | cmp \$192,%ecx | ||
1352 | je .L12rounds | ||
1353 | cmp \$256,%ecx | ||
1354 | je .L14rounds | ||
1355 | mov \$-2,%rax # invalid number of bits | ||
1356 | jmp .Lexit | ||
1357 | |||
1358 | .L10rounds: | ||
1359 | mov 0(%rsi),%rax # copy first 4 dwords | ||
1360 | mov 8(%rsi),%rdx | ||
1361 | mov %rax,0(%rdi) | ||
1362 | mov %rdx,8(%rdi) | ||
1363 | |||
1364 | shr \$32,%rdx | ||
1365 | xor %ecx,%ecx | ||
1366 | jmp .L10shortcut | ||
1367 | .align 4 | ||
1368 | .L10loop: | ||
1369 | mov 0(%rdi),%eax # rk[0] | ||
1370 | mov 12(%rdi),%edx # rk[3] | ||
1371 | .L10shortcut: | ||
1372 | ___ | ||
# 128-bit path, continued: enckey() supplies the update of %eax from %edx and
# the round constant selected by %rcx; the text below chains that value
# through rk[4..7]. %rdi advances 16 bytes per pass and the loop runs while
# %ecx < 10, so afterwards %rdi = key + 160 and 80(%rdi) is offset 240 of the
# key structure (the field other routines in this file read as the round
# count). The heredoc then opens the 192-bit path up to .L12shortcut.
1373 | &enckey (); | ||
1374 | $code.=<<___; | ||
1375 | mov %eax,16(%rdi) # rk[4] | ||
1376 | xor 4(%rdi),%eax | ||
1377 | mov %eax,20(%rdi) # rk[5] | ||
1378 | xor 8(%rdi),%eax | ||
1379 | mov %eax,24(%rdi) # rk[6] | ||
1380 | xor 12(%rdi),%eax | ||
1381 | mov %eax,28(%rdi) # rk[7] | ||
1382 | add \$1,%ecx | ||
1383 | lea 16(%rdi),%rdi | ||
1384 | cmp \$10,%ecx | ||
1385 | jl .L10loop | ||
1386 | |||
1387 | movl \$10,80(%rdi) # setup number of rounds | ||
1388 | xor %rax,%rax | ||
1389 | jmp .Lexit | ||
1390 | |||
1391 | .L12rounds: | ||
1392 | mov 0(%rsi),%rax # copy first 6 dwords | ||
1393 | mov 8(%rsi),%rbx | ||
1394 | mov 16(%rsi),%rdx | ||
1395 | mov %rax,0(%rdi) | ||
1396 | mov %rbx,8(%rdi) | ||
1397 | mov %rdx,16(%rdi) | ||
1398 | |||
1399 | shr \$32,%rdx | ||
1400 | xor %ecx,%ecx | ||
1401 | jmp .L12shortcut | ||
1402 | .align 4 | ||
1403 | .L12loop: | ||
1404 | mov 0(%rdi),%eax # rk[0] | ||
1405 | mov 20(%rdi),%edx # rk[5] | ||
1406 | .L12shortcut: | ||
1407 | ___ | ||
# 192-bit path, continued: each pass derives rk[6..9] and, unless this is the
# final pass (%ecx == 7), also rk[10..11] before advancing %rdi by 24 bytes.
# At .L12break %rdi = key + 7*24 = key + 168, so 72(%rdi) is offset 240.
# The heredoc then opens the 256-bit path up to .L14shortcut; the word
# fetched from 28(%rdi) there is rk[7], the last word of the current 8-word
# group.
1408 | &enckey (); | ||
1409 | $code.=<<___; | ||
1410 | mov %eax,24(%rdi) # rk[6] | ||
1411 | xor 4(%rdi),%eax | ||
1412 | mov %eax,28(%rdi) # rk[7] | ||
1413 | xor 8(%rdi),%eax | ||
1414 | mov %eax,32(%rdi) # rk[8] | ||
1415 | xor 12(%rdi),%eax | ||
1416 | mov %eax,36(%rdi) # rk[9] | ||
1417 | |||
1418 | cmp \$7,%ecx | ||
1419 | je .L12break | ||
1420 | add \$1,%ecx | ||
1421 | |||
1422 | xor 16(%rdi),%eax | ||
1423 | mov %eax,40(%rdi) # rk[10] | ||
1424 | xor 20(%rdi),%eax | ||
1425 | mov %eax,44(%rdi) # rk[11] | ||
1426 | |||
1427 | lea 24(%rdi),%rdi | ||
1428 | jmp .L12loop | ||
1429 | .L12break: | ||
1430 | movl \$12,72(%rdi) # setup number of rounds | ||
1431 | xor %rax,%rax | ||
1432 | jmp .Lexit | ||
1433 | |||
1434 | .L14rounds: | ||
1435 | mov 0(%rsi),%rax # copy first 8 dwords | ||
1436 | mov 8(%rsi),%rbx | ||
1437 | mov 16(%rsi),%rcx | ||
1438 | mov 24(%rsi),%rdx | ||
1439 | mov %rax,0(%rdi) | ||
1440 | mov %rbx,8(%rdi) | ||
1441 | mov %rcx,16(%rdi) | ||
1442 | mov %rdx,24(%rdi) | ||
1443 | |||
1444 | shr \$32,%rdx | ||
1445 | xor %ecx,%ecx | ||
1446 | jmp .L14shortcut | ||
1447 | .align 4 | ||
1448 | .L14loop: | ||
1449 | mov 0(%rdi),%eax # rk[0] | ||
1450 | mov 28(%rdi),%edx # rk[7] | ||
1451 | .L14shortcut: | ||
1452 | ___ | ||
# 256-bit path, continued: each pass derives rk[8..11] via enckey() and,
# unless this is the final pass (%ecx == 6), rk[12..15] via a second table
# substitution of rk[11]. That second substitution keeps every byte in its
# own position (shifts 0/8/16/24) and xors no round constant, unlike the
# enckey() step. %rdi advances 32 bytes per pass; at .L14break
# %rdi = key + 6*32 = key + 192, so 48(%rdi) is offset 240.
# The heredoc also holds the shared exits: .Lbadpointer (returns -1) and
# .Lexit.
1453 | &enckey (); | ||
1454 | $code.=<<___; | ||
1455 | mov %eax,32(%rdi) # rk[8] | ||
1456 | xor 4(%rdi),%eax | ||
1457 | mov %eax,36(%rdi) # rk[9] | ||
1458 | xor 8(%rdi),%eax | ||
1459 | mov %eax,40(%rdi) # rk[10] | ||
1460 | xor 12(%rdi),%eax | ||
1461 | mov %eax,44(%rdi) # rk[11] | ||
1462 | |||
1463 | cmp \$6,%ecx | ||
1464 | je .L14break | ||
1465 | add \$1,%ecx | ||
1466 | |||
1467 | mov %eax,%edx | ||
1468 | mov 16(%rdi),%eax # rk[4] | ||
1469 | movz %dl,%esi # rk[11]>>0 | ||
1470 | movzb -128(%rbp,%rsi),%ebx | ||
1471 | movz %dh,%esi # rk[11]>>8 | ||
1472 | xor %ebx,%eax | ||
1473 | |||
1474 | movzb -128(%rbp,%rsi),%ebx | ||
1475 | shr \$16,%edx | ||
1476 | shl \$8,%ebx | ||
1477 | movz %dl,%esi # rk[11]>>16 | ||
1478 | xor %ebx,%eax | ||
1479 | |||
1480 | movzb -128(%rbp,%rsi),%ebx | ||
1481 | movz %dh,%esi # rk[11]>>24 | ||
1482 | shl \$16,%ebx | ||
1483 | xor %ebx,%eax | ||
1484 | |||
1485 | movzb -128(%rbp,%rsi),%ebx | ||
1486 | shl \$24,%ebx | ||
1487 | xor %ebx,%eax | ||
1488 | |||
1489 | mov %eax,48(%rdi) # rk[12] | ||
1490 | xor 20(%rdi),%eax | ||
1491 | mov %eax,52(%rdi) # rk[13] | ||
1492 | xor 24(%rdi),%eax | ||
1493 | mov %eax,56(%rdi) # rk[14] | ||
1494 | xor 28(%rdi),%eax | ||
1495 | mov %eax,60(%rdi) # rk[15] | ||
1496 | |||
1497 | lea 32(%rdi),%rdi | ||
1498 | jmp .L14loop | ||
1499 | .L14break: | ||
1500 | movl \$14,48(%rdi) # setup number of rounds | ||
1501 | xor %rax,%rax | ||
1502 | jmp .Lexit | ||
1503 | |||
1504 | .Lbadpointer: | ||
1505 | mov \$-1,%rax | ||
1506 | .Lexit: | ||
1507 | retq | ||
1508 | .size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key | ||
1509 | ___ | ||
1510 | |||
# deckey_ref($i, $ptr, $te, $td): append code that transforms, in place, the
# single 32-bit word at displacement $i from register $ptr.
#
# This is the one-word, 32-bit-register form of the computation that
# dectransform performs on two 64-bit lanes: three packed doubling passes
# (masks 0x80808080 / 0xfefefefe / 0x1b1b1b1b) give tp2, tp4 and tp8, and the
# stored result is
#   (tp8^tp4^tp2) ^ ROTATE(tp1^tp8,8) ^ ROTATE(tp2^tp1^tp8,24)
#                 ^ ROTATE(tp4^tp1^tp8,16).
# The emitted code clobbers %eax, %ebx, %edi, %edx and %r8d.
#
# $te and $td are unpacked but never used in the body.
# NOTE(review): no call to this sub appears in this section, and the "_ref"
# suffix suggests it is kept as a reference version -- confirm before relying
# on it. As with dectransform, the empty prototype is only compatible with
# &deckey_ref(...) style calls.
1511 | sub deckey_ref() | ||
1512 | { my ($i,$ptr,$te,$td) = @_; | ||
1513 | my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d"); | ||
1514 | $code.=<<___; | ||
1515 | mov $i($ptr),$tp1 | ||
1516 | mov $tp1,$acc | ||
1517 | and \$0x80808080,$acc | ||
1518 | mov $acc,$tp4 | ||
1519 | shr \$7,$tp4 | ||
1520 | lea 0($tp1,$tp1),$tp2 | ||
1521 | sub $tp4,$acc | ||
1522 | and \$0xfefefefe,$tp2 | ||
1523 | and \$0x1b1b1b1b,$acc | ||
1524 | xor $tp2,$acc | ||
1525 | mov $acc,$tp2 | ||
1526 | |||
1527 | and \$0x80808080,$acc | ||
1528 | mov $acc,$tp8 | ||
1529 | shr \$7,$tp8 | ||
1530 | lea 0($tp2,$tp2),$tp4 | ||
1531 | sub $tp8,$acc | ||
1532 | and \$0xfefefefe,$tp4 | ||
1533 | and \$0x1b1b1b1b,$acc | ||
1534 | xor $tp1,$tp2 # tp2^tp1 | ||
1535 | xor $tp4,$acc | ||
1536 | mov $acc,$tp4 | ||
1537 | |||
1538 | and \$0x80808080,$acc | ||
1539 | mov $acc,$tp8 | ||
1540 | shr \$7,$tp8 | ||
1541 | sub $tp8,$acc | ||
1542 | lea 0($tp4,$tp4),$tp8 | ||
1543 | xor $tp1,$tp4 # tp4^tp1 | ||
1544 | and \$0xfefefefe,$tp8 | ||
1545 | and \$0x1b1b1b1b,$acc | ||
1546 | xor $acc,$tp8 | ||
1547 | |||
1548 | xor $tp8,$tp1 # tp1^tp8 | ||
1549 | rol \$8,$tp1 # ROTATE(tp1^tp8,8) | ||
1550 | xor $tp8,$tp2 # tp2^tp1^tp8 | ||
1551 | xor $tp8,$tp4 # tp4^tp1^tp8 | ||
1552 | xor $tp2,$tp8 | ||
1553 | xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2 | ||
1554 | |||
1555 | xor $tp8,$tp1 | ||
1556 | rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24) | ||
1557 | xor $tp2,$tp1 | ||
1558 | rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16) | ||
1559 | xor $tp4,$tp1 | ||
1560 | |||
1561 | mov $tp1,$i($ptr) | ||
1562 | ___ | ||
1563 | } | ||
1564 | |||
1565 | # int aes_set_decrypt_key_internal(const unsigned char *userKey, const int bits, | ||
1566 | # AES_KEY *key) | ||
#
# Emit the decrypt key setup. Steps:
#   1. push six registers plus %rdx (the key pointer; 56 bytes in total) and
#      build the encryption schedule with _x86_64_AES_set_encrypt_key; a
#      non-zero result is returned unchanged through .Labort.
#   2. .Linvert: swap 16-byte round keys pairwise from both ends of the
#      schedule (%rsi walks up from key, %rdi walks down from
#      key + 16*rounds) until the two pointers meet.
#   3. .Lpermute: run the dectransform sequence over every round key except
#      the first and the last (rounds-1 iterations, starting at key+16),
#      using the three mask constants loaded from 40/48/56 bytes past
#      .LAES_Te+2048+1024.
#   4. return 0 in %rax.
1567 | $code.=<<___; | ||
1568 | .globl aes_set_decrypt_key_internal | ||
1569 | .type aes_set_decrypt_key_internal,\@function,3 | ||
1570 | .align 16 | ||
1571 | aes_set_decrypt_key_internal: | ||
1572 | _CET_ENDBR | ||
1573 | push %rbx | ||
1574 | push %rbp | ||
1575 | push %r12 | ||
1576 | push %r13 | ||
1577 | push %r14 | ||
1578 | push %r15 | ||
1579 | push %rdx # save key schedule | ||
1580 | .Ldec_key_prologue: | ||
1581 | |||
1582 | call _x86_64_AES_set_encrypt_key | ||
1583 | mov (%rsp),%r8 # restore key schedule | ||
1584 | cmp \$0,%eax | ||
1585 | jne .Labort | ||
1586 | |||
1587 | mov 240(%r8),%r14d # pull number of rounds | ||
1588 | xor %rdi,%rdi | ||
1589 | lea (%rdi,%r14d,4),%rcx | ||
1590 | mov %r8,%rsi | ||
1591 | lea (%r8,%rcx,4),%rdi # pointer to last chunk | ||
1592 | .align 4 | ||
1593 | .Linvert: | ||
1594 | mov 0(%rsi),%rax | ||
1595 | mov 8(%rsi),%rbx | ||
1596 | mov 0(%rdi),%rcx | ||
1597 | mov 8(%rdi),%rdx | ||
1598 | mov %rax,0(%rdi) | ||
1599 | mov %rbx,8(%rdi) | ||
1600 | mov %rcx,0(%rsi) | ||
1601 | mov %rdx,8(%rsi) | ||
1602 | lea 16(%rsi),%rsi | ||
1603 | lea -16(%rdi),%rdi | ||
1604 | cmp %rsi,%rdi | ||
1605 | jne .Linvert | ||
1606 | |||
1607 | lea .LAES_Te+2048+1024(%rip),%rax # rcon | ||
1608 | |||
1609 | mov 40(%rax),$mask80 | ||
1610 | mov 48(%rax),$maskfe | ||
1611 | mov 56(%rax),$mask1b | ||
1612 | |||
1613 | mov %r8,$key | ||
1614 | sub \$1,%r14d | ||
1615 | .align 4 | ||
1616 | .Lpermute: | ||
1617 | lea 16($key),$key | ||
1618 | mov 0($key),%rax | ||
1619 | mov 8($key),%rcx | ||
1620 | ___ | ||
# Called without an argument, so the emitted transform carries no interleaved
# table loads and the three mask registers stay valid across iterations.
1621 | &dectransform (); | ||
1622 | $code.=<<___; | ||
1623 | mov %eax,0($key) | ||
1624 | mov %ebx,4($key) | ||
1625 | mov %ecx,8($key) | ||
1626 | mov %edx,12($key) | ||
1627 | sub \$1,%r14d | ||
1628 | jnz .Lpermute | ||
1629 | |||
1630 | xor %rax,%rax | ||
1631 | .Labort: | ||
1632 | mov 8(%rsp),%r15 | ||
1633 | mov 16(%rsp),%r14 | ||
1634 | mov 24(%rsp),%r13 | ||
1635 | mov 32(%rsp),%r12 | ||
1636 | mov 40(%rsp),%rbp | ||
1637 | mov 48(%rsp),%rbx | ||
1638 | add \$56,%rsp | ||
1639 | .Ldec_key_epilogue: | ||
1640 | ret | ||
1641 | .size aes_set_decrypt_key_internal,.-aes_set_decrypt_key_internal | ||
1642 | ___ | ||
1643 | |||
1644 | # void aes_cbc_encrypt_internal(const unsigned char *inp, unsigned char *out, | ||
1645 | # size_t length, const AES_KEY *key, unsigned char *ivp,const int enc); | ||
1646 | { | ||
1647 | # stack frame layout | ||
1648 | # -8(%rsp) return address | ||
1649 | my $keyp="0(%rsp)"; # one to pass as $key | ||
1650 | my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds]) | ||
1651 | my $_rsp="16(%rsp)"; # saved %rsp | ||
1652 | my $_inp="24(%rsp)"; # copy of 1st parameter, inp | ||
1653 | my $_out="32(%rsp)"; # copy of 2nd parameter, out | ||
1654 | my $_len="40(%rsp)"; # copy of 3rd parameter, length | ||
1655 | my $_key="48(%rsp)"; # copy of 4th parameter, key | ||
1656 | my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp | ||
1657 | my $ivec="64(%rsp)"; # ivec[16] | ||
1658 | my $aes_key="80(%rsp)"; # copy of aes_key | ||
1659 | my $mark="80+240(%rsp)"; # copy of aes_key->rounds | ||
1660 | |||
1661 | $code.=<<___; | ||
1662 | .globl aes_cbc_encrypt_internal | ||
1663 | .type aes_cbc_encrypt_internal,\@function,6 | ||
1664 | .align 16 | ||
1665 | .extern OPENSSL_ia32cap_P | ||
1666 | .hidden OPENSSL_ia32cap_P | ||
1667 | .globl asm_AES_cbc_encrypt | ||
1668 | .hidden asm_AES_cbc_encrypt | ||
1669 | asm_AES_cbc_encrypt: | ||
1670 | aes_cbc_encrypt_internal: | ||
1671 | _CET_ENDBR | ||
1672 | cmp \$0,%rdx # check length | ||
1673 | je .Lcbc_epilogue | ||
1674 | pushfq | ||
1675 | push %rbx | ||
1676 | push %rbp | ||
1677 | push %r12 | ||
1678 | push %r13 | ||
1679 | push %r14 | ||
1680 | push %r15 | ||
1681 | .Lcbc_prologue: | ||
1682 | |||
1683 | cld | ||
1684 | mov %r9d,%r9d # clear upper half of enc | ||
1685 | |||
1686 | lea .LAES_Te(%rip),$sbox | ||
1687 | cmp \$0,%r9 | ||
1688 | jne .Lcbc_picked_te | ||
1689 | lea .LAES_Td(%rip),$sbox | ||
1690 | .Lcbc_picked_te: | ||
1691 | |||
1692 | mov OPENSSL_ia32cap_P(%rip),%r10d | ||
1693 | cmp \$$speed_limit,%rdx | ||
1694 | jb .Lcbc_slow_prologue | ||
1695 | test \$15,%rdx | ||
1696 | jnz .Lcbc_slow_prologue | ||
1697 | bt \$IA32CAP_BIT0_HT,%r10d | ||
1698 | jc .Lcbc_slow_prologue | ||
1699 | |||
1700 | # allocate aligned stack frame... | ||
1701 | lea -88-248(%rsp),$key | ||
1702 | and \$-64,$key | ||
1703 | |||
1704 | # ... and make sure it doesn't alias with AES_T[ed] modulo 4096 | ||
1705 | mov $sbox,%r10 | ||
1706 | lea 2304($sbox),%r11 | ||
1707 | mov $key,%r12 | ||
1708 | and \$0xFFF,%r10 # s = $sbox&0xfff | ||
1709 | and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff | ||
1710 | and \$0xFFF,%r12 # p = %rsp&0xfff | ||
1711 | |||
1712 | cmp %r11,%r12 # if (p=>e) %rsp =- (p-e); | ||
1713 | jb .Lcbc_te_break_out | ||
1714 | sub %r11,%r12 | ||
1715 | sub %r12,$key | ||
1716 | jmp .Lcbc_te_ok | ||
1717 | .Lcbc_te_break_out: # else %rsp -= (p-s)&0xfff + framesz | ||
1718 | sub %r10,%r12 | ||
1719 | and \$0xFFF,%r12 | ||
1720 | add \$320,%r12 | ||
1721 | sub %r12,$key | ||
1722 | .align 4 | ||
1723 | .Lcbc_te_ok: | ||
1724 | |||
1725 | xchg %rsp,$key | ||
1726 | #add \$8,%rsp # reserve for return address! | ||
1727 | mov $key,$_rsp # save %rsp | ||
1728 | .Lcbc_fast_body: | ||
1729 | mov %rdi,$_inp # save copy of inp | ||
1730 | mov %rsi,$_out # save copy of out | ||
1731 | mov %rdx,$_len # save copy of len | ||
1732 | mov %rcx,$_key # save copy of key | ||
1733 | mov %r8,$_ivp # save copy of ivp | ||
1734 | movl \$0,$mark # copy of aes_key->rounds = 0; | ||
1735 | mov %r8,%rbp # rearrange input arguments | ||
1736 | mov %r9,%rbx | ||
1737 | mov %rsi,$out | ||
1738 | mov %rdi,$inp | ||
1739 | mov %rcx,$key | ||
1740 | |||
1741 | mov 240($key),%eax # key->rounds | ||
1742 | # do we copy key schedule to stack? | ||
1743 | mov $key,%r10 | ||
1744 | sub $sbox,%r10 | ||
1745 | and \$0xfff,%r10 | ||
1746 | cmp \$2304,%r10 | ||
1747 | jb .Lcbc_do_ecopy | ||
1748 | cmp \$4096-248,%r10 | ||
1749 | jb .Lcbc_skip_ecopy | ||
1750 | .align 4 | ||
1751 | .Lcbc_do_ecopy: | ||
1752 | mov $key,%rsi | ||
1753 | lea $aes_key,%rdi | ||
1754 | lea $aes_key,$key | ||
1755 | mov \$240/8,%ecx | ||
1756 | .long 0x90A548F3 # rep movsq | ||
1757 | mov %eax,(%rdi) # copy aes_key->rounds | ||
1758 | .Lcbc_skip_ecopy: | ||
1759 | mov $key,$keyp # save key pointer | ||
1760 | |||
1761 | mov \$18,%ecx | ||
1762 | .align 4 | ||
1763 | .Lcbc_prefetch_te: | ||
1764 | mov 0($sbox),%r10 | ||
1765 | mov 32($sbox),%r11 | ||
1766 | mov 64($sbox),%r12 | ||
1767 | mov 96($sbox),%r13 | ||
1768 | lea 128($sbox),$sbox | ||
1769 | sub \$1,%ecx | ||
1770 | jnz .Lcbc_prefetch_te | ||
1771 | lea -2304($sbox),$sbox | ||
1772 | |||
1773 | cmp \$0,%rbx | ||
1774 | je .LFAST_DECRYPT | ||
1775 | |||
1776 | #----------------------------- ENCRYPT -----------------------------# | ||
1777 | mov 0(%rbp),$s0 # load iv | ||
1778 | mov 4(%rbp),$s1 | ||
1779 | mov 8(%rbp),$s2 | ||
1780 | mov 12(%rbp),$s3 | ||
1781 | |||
1782 | .align 4 | ||
1783 | .Lcbc_fast_enc_loop: | ||
1784 | xor 0($inp),$s0 | ||
1785 | xor 4($inp),$s1 | ||
1786 | xor 8($inp),$s2 | ||
1787 | xor 12($inp),$s3 | ||
1788 | mov $keyp,$key # restore key | ||
1789 | mov $inp,$_inp # if ($verticalspin) save inp | ||
1790 | |||
1791 | call _x86_64_AES_encrypt | ||
1792 | |||
1793 | mov $_inp,$inp # if ($verticalspin) restore inp | ||
1794 | mov $_len,%r10 | ||
1795 | mov $s0,0($out) | ||
1796 | mov $s1,4($out) | ||
1797 | mov $s2,8($out) | ||
1798 | mov $s3,12($out) | ||
1799 | |||
1800 | lea 16($inp),$inp | ||
1801 | lea 16($out),$out | ||
1802 | sub \$16,%r10 | ||
1803 | test \$-16,%r10 | ||
1804 | mov %r10,$_len | ||
1805 | jnz .Lcbc_fast_enc_loop | ||
1806 | mov $_ivp,%rbp # restore ivp | ||
1807 | mov $s0,0(%rbp) # save ivec | ||
1808 | mov $s1,4(%rbp) | ||
1809 | mov $s2,8(%rbp) | ||
1810 | mov $s3,12(%rbp) | ||
1811 | |||
1812 | jmp .Lcbc_fast_cleanup | ||
1813 | |||
1814 | #----------------------------- DECRYPT -----------------------------# | ||
1815 | .align 16 | ||
1816 | .LFAST_DECRYPT: | ||
1817 | cmp $inp,$out | ||
1818 | je .Lcbc_fast_dec_in_place | ||
1819 | |||
1820 | mov %rbp,$ivec | ||
1821 | .align 4 | ||
1822 | .Lcbc_fast_dec_loop: | ||
1823 | mov 0($inp),$s0 # read input | ||
1824 | mov 4($inp),$s1 | ||
1825 | mov 8($inp),$s2 | ||
1826 | mov 12($inp),$s3 | ||
1827 | mov $keyp,$key # restore key | ||
1828 | mov $inp,$_inp # if ($verticalspin) save inp | ||
1829 | |||
1830 | call _x86_64_AES_decrypt | ||
1831 | |||
1832 | mov $ivec,%rbp # load ivp | ||
1833 | mov $_inp,$inp # if ($verticalspin) restore inp | ||
1834 | mov $_len,%r10 # load len | ||
1835 | xor 0(%rbp),$s0 # xor iv | ||
1836 | xor 4(%rbp),$s1 | ||
1837 | xor 8(%rbp),$s2 | ||
1838 | xor 12(%rbp),$s3 | ||
1839 | mov $inp,%rbp # current input, next iv | ||
1840 | |||
1841 | sub \$16,%r10 | ||
1842 | mov %r10,$_len # update len | ||
1843 | mov %rbp,$ivec # update ivp | ||
1844 | |||
1845 | mov $s0,0($out) # write output | ||
1846 | mov $s1,4($out) | ||
1847 | mov $s2,8($out) | ||
1848 | mov $s3,12($out) | ||
1849 | |||
1850 | lea 16($inp),$inp | ||
1851 | lea 16($out),$out | ||
1852 | jnz .Lcbc_fast_dec_loop | ||
1853 | mov $_ivp,%r12 # load user ivp | ||
1854 | mov 0(%rbp),%r10 # load iv | ||
1855 | mov 8(%rbp),%r11 | ||
1856 | mov %r10,0(%r12) # copy back to user | ||
1857 | mov %r11,8(%r12) | ||
1858 | jmp .Lcbc_fast_cleanup | ||
1859 | |||
1860 | .align 16 | ||
1861 | .Lcbc_fast_dec_in_place: | ||
1862 | mov 0(%rbp),%r10 # copy iv to stack | ||
1863 | mov 8(%rbp),%r11 | ||
1864 | mov %r10,0+$ivec | ||
1865 | mov %r11,8+$ivec | ||
1866 | .align 4 | ||
1867 | .Lcbc_fast_dec_in_place_loop: | ||
1868 | mov 0($inp),$s0 # load input | ||
1869 | mov 4($inp),$s1 | ||
1870 | mov 8($inp),$s2 | ||
1871 | mov 12($inp),$s3 | ||
1872 | mov $keyp,$key # restore key | ||
1873 | mov $inp,$_inp # if ($verticalspin) save inp | ||
1874 | |||
1875 | call _x86_64_AES_decrypt | ||
1876 | |||
1877 | mov $_inp,$inp # if ($verticalspin) restore inp | ||
1878 | mov $_len,%r10 | ||
1879 | xor 0+$ivec,$s0 | ||
1880 | xor 4+$ivec,$s1 | ||
1881 | xor 8+$ivec,$s2 | ||
1882 | xor 12+$ivec,$s3 | ||
1883 | |||
1884 | mov 0($inp),%r11 # load input | ||
1885 | mov 8($inp),%r12 | ||
1886 | sub \$16,%r10 | ||
1887 | jz .Lcbc_fast_dec_in_place_done | ||
1888 | |||
1889 | mov %r11,0+$ivec # copy input to iv | ||
1890 | mov %r12,8+$ivec | ||
1891 | |||
1892 | mov $s0,0($out) # save output [zaps input] | ||
1893 | mov $s1,4($out) | ||
1894 | mov $s2,8($out) | ||
1895 | mov $s3,12($out) | ||
1896 | |||
1897 | lea 16($inp),$inp | ||
1898 | lea 16($out),$out | ||
1899 | mov %r10,$_len | ||
1900 | jmp .Lcbc_fast_dec_in_place_loop | ||
1901 | .Lcbc_fast_dec_in_place_done: | ||
1902 | mov $_ivp,%rdi | ||
1903 | mov %r11,0(%rdi) # copy iv back to user | ||
1904 | mov %r12,8(%rdi) | ||
1905 | |||
1906 | mov $s0,0($out) # save output [zaps input] | ||
1907 | mov $s1,4($out) | ||
1908 | mov $s2,8($out) | ||
1909 | mov $s3,12($out) | ||
1910 | |||
1911 | .align 4 | ||
1912 | .Lcbc_fast_cleanup: | ||
1913 | cmpl \$0,$mark # was the key schedule copied? | ||
1914 | lea $aes_key,%rdi | ||
1915 | je .Lcbc_exit | ||
1916 | mov \$240/8,%ecx | ||
1917 | xor %rax,%rax | ||
1918 | .long 0x90AB48F3 # rep stosq | ||
1919 | |||
1920 | jmp .Lcbc_exit | ||
1921 | |||
1922 | #--------------------------- SLOW ROUTINE ---------------------------# | ||
1923 | .align 16 | ||
1924 | .Lcbc_slow_prologue: | ||
1925 | # allocate aligned stack frame... | ||
1926 | lea -88(%rsp),%rbp | ||
1927 | and \$-64,%rbp | ||
1928 | # ... just "above" key schedule | ||
1929 | lea -88-63(%rcx),%r10 | ||
1930 | sub %rbp,%r10 | ||
1931 | neg %r10 | ||
1932 | and \$0x3c0,%r10 | ||
1933 | sub %r10,%rbp | ||
1934 | |||
1935 | xchg %rsp,%rbp | ||
1936 | #add \$8,%rsp # reserve for return address! | ||
1937 | mov %rbp,$_rsp # save %rsp | ||
1938 | .Lcbc_slow_body: | ||
1939 | #mov %rdi,$_inp # save copy of inp | ||
1940 | #mov %rsi,$_out # save copy of out | ||
1941 | #mov %rdx,$_len # save copy of len | ||
1942 | #mov %rcx,$_key # save copy of key | ||
1943 | mov %r8,$_ivp # save copy of ivp | ||
1944 | mov %r8,%rbp # rearrange input arguments | ||
1945 | mov %r9,%rbx | ||
1946 | mov %rsi,$out | ||
1947 | mov %rdi,$inp | ||
1948 | mov %rcx,$key | ||
1949 | mov %rdx,%r10 | ||
1950 | |||
1951 | mov 240($key),%eax | ||
1952 | mov $key,$keyp # save key pointer | ||
1953 | shl \$4,%eax | ||
1954 | lea ($key,%rax),%rax | ||
1955 | mov %rax,$keyend | ||
1956 | |||
1957 | # pick Te4 copy which can't "overlap" with stack frame or key schedule | ||
1958 | lea 2048($sbox),$sbox | ||
1959 | lea 768-8(%rsp),%rax | ||
1960 | sub $sbox,%rax | ||
1961 | and \$0x300,%rax | ||
1962 | lea ($sbox,%rax),$sbox | ||
1963 | |||
1964 | cmp \$0,%rbx | ||
1965 | je .LSLOW_DECRYPT | ||
1966 | |||
1967 | #--------------------------- SLOW ENCRYPT ---------------------------# | ||
1968 | test \$-16,%r10 # check upon length | ||
1969 | mov 0(%rbp),$s0 # load iv | ||
1970 | mov 4(%rbp),$s1 | ||
1971 | mov 8(%rbp),$s2 | ||
1972 | mov 12(%rbp),$s3 | ||
1973 | jz .Lcbc_slow_enc_tail # short input... | ||
1974 | |||
1975 | .align 4 | ||
1976 | .Lcbc_slow_enc_loop: | ||
1977 | xor 0($inp),$s0 | ||
1978 | xor 4($inp),$s1 | ||
1979 | xor 8($inp),$s2 | ||
1980 | xor 12($inp),$s3 | ||
1981 | mov $keyp,$key # restore key | ||
1982 | mov $inp,$_inp # save inp | ||
1983 | mov $out,$_out # save out | ||
1984 | mov %r10,$_len # save len | ||
1985 | |||
1986 | call _x86_64_AES_encrypt_compact | ||
1987 | |||
1988 | mov $_inp,$inp # restore inp | ||
1989 | mov $_out,$out # restore out | ||
1990 | mov $_len,%r10 # restore len | ||
1991 | mov $s0,0($out) | ||
1992 | mov $s1,4($out) | ||
1993 | mov $s2,8($out) | ||
1994 | mov $s3,12($out) | ||
1995 | |||
1996 | lea 16($inp),$inp | ||
1997 | lea 16($out),$out | ||
1998 | sub \$16,%r10 | ||
1999 | test \$-16,%r10 | ||
2000 | jnz .Lcbc_slow_enc_loop | ||
2001 | test \$15,%r10 | ||
2002 | jnz .Lcbc_slow_enc_tail | ||
2003 | mov $_ivp,%rbp # restore ivp | ||
2004 | mov $s0,0(%rbp) # save ivec | ||
2005 | mov $s1,4(%rbp) | ||
2006 | mov $s2,8(%rbp) | ||
2007 | mov $s3,12(%rbp) | ||
2008 | |||
2009 | jmp .Lcbc_exit | ||
2010 | |||
2011 | .align 4 | ||
2012 | .Lcbc_slow_enc_tail: | ||
2013 | mov %rax,%r11 | ||
2014 | mov %rcx,%r12 | ||
2015 | mov %r10,%rcx | ||
2016 | mov $inp,%rsi | ||
2017 | mov $out,%rdi | ||
2018 | .long 0x9066A4F3 # rep movsb | ||
2019 | mov \$16,%rcx # zero tail | ||
2020 | sub %r10,%rcx | ||
2021 | xor %rax,%rax | ||
2022 | .long 0x9066AAF3 # rep stosb | ||
2023 | mov $out,$inp # this is not a mistake! | ||
2024 | mov \$16,%r10 # len=16 | ||
2025 | mov %r11,%rax | ||
2026 | mov %r12,%rcx | ||
2027 | jmp .Lcbc_slow_enc_loop # one more spin... | ||
2028 | #--------------------------- SLOW DECRYPT ---------------------------# | ||
2029 | .align 16 | ||
2030 | .LSLOW_DECRYPT: | ||
2031 | shr \$3,%rax | ||
2032 | add %rax,$sbox # recall "magic" constants! | ||
2033 | |||
2034 | mov 0(%rbp),%r11 # copy iv to stack | ||
2035 | mov 8(%rbp),%r12 | ||
2036 | mov %r11,0+$ivec | ||
2037 | mov %r12,8+$ivec | ||
2038 | |||
2039 | .align 4 | ||
2040 | .Lcbc_slow_dec_loop: | ||
2041 | mov 0($inp),$s0 # load input | ||
2042 | mov 4($inp),$s1 | ||
2043 | mov 8($inp),$s2 | ||
2044 | mov 12($inp),$s3 | ||
2045 | mov $keyp,$key # restore key | ||
2046 | mov $inp,$_inp # save inp | ||
2047 | mov $out,$_out # save out | ||
2048 | mov %r10,$_len # save len | ||
2049 | |||
2050 | call _x86_64_AES_decrypt_compact | ||
2051 | |||
2052 | mov $_inp,$inp # restore inp | ||
2053 | mov $_out,$out # restore out | ||
2054 | mov $_len,%r10 | ||
2055 | xor 0+$ivec,$s0 | ||
2056 | xor 4+$ivec,$s1 | ||
2057 | xor 8+$ivec,$s2 | ||
2058 | xor 12+$ivec,$s3 | ||
2059 | |||
2060 | mov 0($inp),%r11 # load input | ||
2061 | mov 8($inp),%r12 | ||
2062 | sub \$16,%r10 | ||
2063 | jc .Lcbc_slow_dec_partial | ||
2064 | jz .Lcbc_slow_dec_done | ||
2065 | |||
2066 | mov %r11,0+$ivec # copy input to iv | ||
2067 | mov %r12,8+$ivec | ||
2068 | |||
2069 | mov $s0,0($out) # save output [can zap input] | ||
2070 | mov $s1,4($out) | ||
2071 | mov $s2,8($out) | ||
2072 | mov $s3,12($out) | ||
2073 | |||
2074 | lea 16($inp),$inp | ||
2075 | lea 16($out),$out | ||
2076 | jmp .Lcbc_slow_dec_loop | ||
2077 | .Lcbc_slow_dec_done: | ||
2078 | mov $_ivp,%rdi | ||
2079 | mov %r11,0(%rdi) # copy iv back to user | ||
2080 | mov %r12,8(%rdi) | ||
2081 | |||
2082 | mov $s0,0($out) # save output [can zap input] | ||
2083 | mov $s1,4($out) | ||
2084 | mov $s2,8($out) | ||
2085 | mov $s3,12($out) | ||
2086 | |||
2087 | jmp .Lcbc_exit | ||
2088 | |||
2089 | .align 4 | ||
2090 | .Lcbc_slow_dec_partial: | ||
2091 | mov $_ivp,%rdi | ||
2092 | mov %r11,0(%rdi) # copy iv back to user | ||
2093 | mov %r12,8(%rdi) | ||
2094 | |||
2095 | mov $s0,0+$ivec # save output to stack | ||
2096 | mov $s1,4+$ivec | ||
2097 | mov $s2,8+$ivec | ||
2098 | mov $s3,12+$ivec | ||
2099 | |||
2100 | mov $out,%rdi | ||
2101 | lea $ivec,%rsi | ||
2102 | lea 16(%r10),%rcx | ||
2103 | .long 0x9066A4F3 # rep movsb | ||
2104 | jmp .Lcbc_exit | ||
2105 | |||
2106 | .align 16 | ||
2107 | .Lcbc_exit: | ||
2108 | mov $_rsp,%rsi | ||
2109 | mov (%rsi),%r15 | ||
2110 | mov 8(%rsi),%r14 | ||
2111 | mov 16(%rsi),%r13 | ||
2112 | mov 24(%rsi),%r12 | ||
2113 | mov 32(%rsi),%rbp | ||
2114 | mov 40(%rsi),%rbx | ||
2115 | lea 48(%rsi),%rsp | ||
2116 | .Lcbc_popfq: | ||
2117 | popfq | ||
2118 | .Lcbc_epilogue: | ||
2119 | ret | ||
2120 | .size aes_cbc_encrypt_internal,.-aes_cbc_encrypt_internal | ||
2121 | ___ | ||
2122 | } | ||
2123 | |||
2124 | $code.=<<___; | ||
2125 | .section .rodata | ||
2126 | .align 64 | ||
2127 | .LAES_Te: | ||
2128 | ___ | ||
2129 | &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); | ||
2130 | &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); | ||
2131 | &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56); | ||
2132 | &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec); | ||
2133 | &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa); | ||
2134 | &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb); | ||
2135 | &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45); | ||
2136 | &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b); | ||
2137 | &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c); | ||
2138 | &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83); | ||
2139 | &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9); | ||
2140 | &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a); | ||
2141 | &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d); | ||
2142 | &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f); | ||
2143 | &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df); | ||
2144 | &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea); | ||
2145 | &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34); | ||
2146 | &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b); | ||
2147 | &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d); | ||
2148 | &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413); | ||
2149 | &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1); | ||
2150 | &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6); | ||
2151 | &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972); | ||
2152 | &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85); | ||
2153 | &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed); | ||
2154 | &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511); | ||
2155 | &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe); | ||
2156 | &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b); | ||
2157 | &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05); | ||
2158 | &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1); | ||
2159 | &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142); | ||
2160 | &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf); | ||
2161 | &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3); | ||
2162 | &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e); | ||
2163 | &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a); | ||
2164 | &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6); | ||
2165 | &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3); | ||
2166 | &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b); | ||
2167 | &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428); | ||
2168 | &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad); | ||
2169 | &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14); | ||
2170 | &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8); | ||
2171 | &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4); | ||
2172 | &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2); | ||
2173 | &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda); | ||
2174 | &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949); | ||
2175 | &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf); | ||
2176 | &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810); | ||
2177 | &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c); | ||
2178 | &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697); | ||
2179 | &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e); | ||
2180 | &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f); | ||
2181 | &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc); | ||
2182 | &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c); | ||
2183 | &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969); | ||
2184 | &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27); | ||
2185 | &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122); | ||
2186 | &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433); | ||
2187 | &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9); | ||
2188 | &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5); | ||
2189 | &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a); | ||
2190 | &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); | ||
2191 | &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); | ||
2192 | &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); | ||
2193 | |||
2194 | #Te4: # four copies of the 256-entry Te4 table to choose from to avoid L1 aliasing | ||
2195 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
2196 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
2197 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
2198 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
2199 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
2200 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
2201 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
2202 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
2203 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
2204 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
2205 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
2206 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
2207 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
2208 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
2209 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
2210 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
2211 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
2212 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
2213 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
2214 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
2215 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
2216 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
2217 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
2218 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
2219 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
2220 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
2221 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
2222 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
2223 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
2224 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
2225 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
2226 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
2227 | |||
2228 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
2229 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
2230 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
2231 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
2232 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
2233 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
2234 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
2235 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
2236 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
2237 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
2238 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
2239 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
2240 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
2241 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
2242 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
2243 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
2244 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
2245 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
2246 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
2247 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
2248 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
2249 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
2250 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
2251 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
2252 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
2253 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
2254 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
2255 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
2256 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
2257 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
2258 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
2259 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
2260 | |||
2261 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
2262 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
2263 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
2264 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
2265 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
2266 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
2267 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
2268 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
2269 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
2270 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
2271 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
2272 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
2273 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
2274 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
2275 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
2276 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
2277 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
2278 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
2279 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
2280 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
2281 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
2282 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
2283 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
2284 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
2285 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
2286 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
2287 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
2288 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
2289 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
2290 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
2291 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
2292 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
2293 | |||
2294 | &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5); | ||
2295 | &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76); | ||
2296 | &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0); | ||
2297 | &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0); | ||
2298 | &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc); | ||
2299 | &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15); | ||
2300 | &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a); | ||
2301 | &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75); | ||
2302 | &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0); | ||
2303 | &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84); | ||
2304 | &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b); | ||
2305 | &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf); | ||
2306 | &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85); | ||
2307 | &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8); | ||
2308 | &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5); | ||
2309 | &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2); | ||
2310 | &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17); | ||
2311 | &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73); | ||
2312 | &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88); | ||
2313 | &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb); | ||
2314 | &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c); | ||
2315 | &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79); | ||
2316 | &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9); | ||
2317 | &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08); | ||
2318 | &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6); | ||
2319 | &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a); | ||
2320 | &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e); | ||
2321 | &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e); | ||
2322 | &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94); | ||
2323 | &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf); | ||
2324 | &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68); | ||
2325 | &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16); | ||
2326 | #rcon: # ten round constants 0x01..0x36, then the 0x80808080/0xfefefefe/0x1b1b1b1b mask words | ||
2327 | $code.=<<___; | ||
2328 | .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 | ||
2329 | .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 | ||
2330 | .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080 | ||
2331 | .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b | ||
2332 | ___ | ||
2333 | $code.=<<___; | ||
2334 | .align 64 | ||
2335 | .LAES_Td: | ||
2336 | ___ | ||
2337 | &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); | ||
2338 | &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); | ||
2339 | &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5); | ||
2340 | &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5); | ||
2341 | &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d); | ||
2342 | &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b); | ||
2343 | &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295); | ||
2344 | &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e); | ||
2345 | &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927); | ||
2346 | &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d); | ||
2347 | &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362); | ||
2348 | &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9); | ||
2349 | &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52); | ||
2350 | &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566); | ||
2351 | &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3); | ||
2352 | &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed); | ||
2353 | &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e); | ||
2354 | &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4); | ||
2355 | &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4); | ||
2356 | &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd); | ||
2357 | &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d); | ||
2358 | &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060); | ||
2359 | &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967); | ||
2360 | &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879); | ||
2361 | &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000); | ||
2362 | &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c); | ||
2363 | &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36); | ||
2364 | &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624); | ||
2365 | &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b); | ||
2366 | &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c); | ||
2367 | &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12); | ||
2368 | &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14); | ||
2369 | &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3); | ||
2370 | &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b); | ||
2371 | &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8); | ||
2372 | &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684); | ||
2373 | &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7); | ||
2374 | &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177); | ||
2375 | &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947); | ||
2376 | &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322); | ||
2377 | &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498); | ||
2378 | &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f); | ||
2379 | &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54); | ||
2380 | &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382); | ||
2381 | &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf); | ||
2382 | &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb); | ||
2383 | &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83); | ||
2384 | &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef); | ||
2385 | &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029); | ||
2386 | &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235); | ||
2387 | &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733); | ||
2388 | &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117); | ||
2389 | &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4); | ||
2390 | &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546); | ||
2391 | &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb); | ||
2392 | &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d); | ||
2393 | &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb); | ||
2394 | &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a); | ||
2395 | &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773); | ||
2396 | &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478); | ||
2397 | &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2); | ||
2398 | &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); | ||
2399 | &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); | ||
2400 | &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); | ||
2401 | |||
2402 | #Td4: # four copies of the 256-entry Td4 table to choose from to avoid L1 aliasing; each copy is followed by the 0x80808080/0xfefefefe/0x1b1b1b1b mask words | ||
2403 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
2404 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
2405 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
2406 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
2407 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
2408 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
2409 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
2410 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
2411 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
2412 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
2413 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
2414 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
2415 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
2416 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
2417 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
2418 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
2419 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
2420 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
2421 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
2422 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
2423 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
2424 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
2425 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
2426 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
2427 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
2428 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
2429 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
2430 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
2431 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
2432 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
2433 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
2434 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
2435 | $code.=<<___; | ||
2436 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
2437 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
2438 | ___ | ||
2439 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
2440 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
2441 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
2442 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
2443 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
2444 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
2445 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
2446 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
2447 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
2448 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
2449 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
2450 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
2451 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
2452 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
2453 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
2454 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
2455 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
2456 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
2457 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
2458 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
2459 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
2460 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
2461 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
2462 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
2463 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
2464 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
2465 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
2466 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
2467 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
2468 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
2469 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
2470 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
2471 | $code.=<<___; | ||
2472 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
2473 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
2474 | ___ | ||
2475 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
2476 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
2477 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
2478 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
2479 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
2480 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
2481 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
2482 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
2483 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
2484 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
2485 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
2486 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
2487 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
2488 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
2489 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
2490 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
2491 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
2492 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
2493 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
2494 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
2495 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
2496 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
2497 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
2498 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
2499 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
2500 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
2501 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
2502 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
2503 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
2504 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
2505 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
2506 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
2507 | $code.=<<___; | ||
2508 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
2509 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
2510 | ___ | ||
2511 | &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); | ||
2512 | &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); | ||
2513 | &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); | ||
2514 | &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb); | ||
2515 | &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d); | ||
2516 | &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e); | ||
2517 | &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2); | ||
2518 | &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25); | ||
2519 | &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16); | ||
2520 | &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92); | ||
2521 | &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda); | ||
2522 | &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84); | ||
2523 | &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a); | ||
2524 | &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06); | ||
2525 | &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02); | ||
2526 | &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b); | ||
2527 | &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea); | ||
2528 | &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73); | ||
2529 | &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85); | ||
2530 | &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e); | ||
2531 | &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89); | ||
2532 | &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b); | ||
2533 | &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20); | ||
2534 | &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4); | ||
2535 | &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31); | ||
2536 | &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f); | ||
2537 | &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d); | ||
2538 | &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef); | ||
2539 | &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0); | ||
2540 | &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); | ||
2541 | &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); | ||
2542 | &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); | ||
2543 | $code.=<<___; | ||
2544 | .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe | ||
2545 | .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0 | ||
2546 | .align 64 | ||
2547 | .text | ||
2548 | ___ | ||
2549 | |||
2550 | # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, | ||
2551 | # CONTEXT *context,DISPATCHER_CONTEXT *disp) | ||
2552 | if ($win64) { | ||
2553 | $rec="%rcx"; | ||
2554 | $frame="%rdx"; | ||
2555 | $context="%r8"; | ||
2556 | $disp="%r9"; | ||
2557 | |||
2558 | $code.=<<___; | ||
2559 | .extern __imp_RtlVirtualUnwind | ||
2560 | .type block_se_handler,\@abi-omnipotent | ||
2561 | .align 16 | ||
2562 | block_se_handler: | ||
2563 | _CET_ENDBR | ||
2564 | push %rsi | ||
2565 | push %rdi | ||
2566 | push %rbx | ||
2567 | push %rbp | ||
2568 | push %r12 | ||
2569 | push %r13 | ||
2570 | push %r14 | ||
2571 | push %r15 | ||
2572 | pushfq | ||
2573 | sub \$64,%rsp | ||
2574 | |||
2575 | mov 120($context),%rax # pull context->Rax (used as the frame base if Rip is still before the prologue label) | ||
2576 | mov 248($context),%rbx # pull context->Rip (the address the two range checks below compare) | ||
2577 | |||
2578 | mov 8($disp),%rsi # disp->ImageBase | ||
2579 | mov 56($disp),%r11 # disp->HandlerData | ||
2580 | |||
2581 | mov 0(%r11),%r10d # HandlerData[0]: 32-bit offset of the prologue label from ImageBase | ||
2582 | lea (%rsi,%r10),%r10 # ImageBase+offset = prologue label address | ||
2583 | cmp %r10,%rbx # context->Rip<prologue label | ||
2584 | jb .Lin_block_prologue | ||
2585 | |||
2586 | mov 152($context),%rax # pull context->Rsp | ||
2587 | |||
2588 | mov 4(%r11),%r10d # HandlerData[1]: 32-bit offset of the epilogue label from ImageBase | ||
2589 | lea (%rsi,%r10),%r10 # ImageBase+offset = epilogue label address | ||
2590 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2591 | jae .Lin_block_prologue | ||
2592 | |||
2593 | mov 24(%rax),%rax # pull saved real stack pointer | ||
2594 | lea 48(%rax),%rax # adjust: step over the six 8-byte slots read back below | ||
2595 | |||
2596 | mov -8(%rax),%rbx | ||
2597 | mov -16(%rax),%rbp | ||
2598 | mov -24(%rax),%r12 | ||
2599 | mov -32(%rax),%r13 | ||
2600 | mov -40(%rax),%r14 | ||
2601 | mov -48(%rax),%r15 | ||
2602 | mov %rbx,144($context) # restore context->Rbx | ||
2603 | mov %rbp,160($context) # restore context->Rbp | ||
2604 | mov %r12,216($context) # restore context->R12 | ||
2605 | mov %r13,224($context) # restore context->R13 | ||
2606 | mov %r14,232($context) # restore context->R14 | ||
2607 | mov %r15,240($context) # restore context->R15 | ||
2608 | |||
2609 | .Lin_block_prologue: | ||
2610 | mov 8(%rax),%rdi | ||
2611 | mov 16(%rax),%rsi | ||
2612 | mov %rax,152($context) # restore context->Rsp | ||
2613 | mov %rsi,168($context) # restore context->Rsi | ||
2614 | mov %rdi,176($context) # restore context->Rdi | ||
2615 | |||
2616 | jmp .Lcommon_seh_exit | ||
2617 | .size block_se_handler,.-block_se_handler | ||
2618 | |||
2619 | .type key_se_handler,\@abi-omnipotent | ||
2620 | .align 16 | ||
2621 | key_se_handler: | ||
2622 | _CET_ENDBR | ||
2623 | push %rsi | ||
2624 | push %rdi | ||
2625 | push %rbx | ||
2626 | push %rbp | ||
2627 | push %r12 | ||
2628 | push %r13 | ||
2629 | push %r14 | ||
2630 | push %r15 | ||
2631 | pushfq | ||
2632 | sub \$64,%rsp | ||
2633 | |||
2634 | mov 120($context),%rax # pull context->Rax | ||
2635 | mov 248($context),%rbx # pull context->Rip | ||
2636 | |||
2637 | mov 8($disp),%rsi # disp->ImageBase | ||
2638 | mov 56($disp),%r11 # disp->HandlerData | ||
2639 | |||
2640 | mov 0(%r11),%r10d # HandlerData[0] | ||
2641 | lea (%rsi,%r10),%r10 # prologue label | ||
2642 | cmp %r10,%rbx # context->Rip<prologue label | ||
2643 | jb .Lin_key_prologue | ||
2644 | |||
2645 | mov 152($context),%rax # pull context->Rsp | ||
2646 | |||
2647 | mov 4(%r11),%r10d # HandlerData[1] | ||
2648 | lea (%rsi,%r10),%r10 # epilogue label | ||
2649 | cmp %r10,%rbx # context->Rip>=epilogue label | ||
2650 | jae .Lin_key_prologue | ||
2651 | |||
2652 | lea 56(%rax),%rax | ||
2653 | |||
2654 | mov -8(%rax),%rbx | ||
2655 | mov -16(%rax),%rbp | ||
2656 | mov -24(%rax),%r12 | ||
2657 | mov -32(%rax),%r13 | ||
2658 | mov -40(%rax),%r14 | ||
2659 | mov -48(%rax),%r15 | ||
2660 | mov %rbx,144($context) # restore context->Rbx | ||
2661 | mov %rbp,160($context) # restore context->Rbp | ||
2662 | mov %r12,216($context) # restore context->R12 | ||
2663 | mov %r13,224($context) # restore context->R13 | ||
2664 | mov %r14,232($context) # restore context->R14 | ||
2665 | mov %r15,240($context) # restore context->R15 | ||
2666 | |||
2667 | .Lin_key_prologue: | ||
2668 | mov 8(%rax),%rdi | ||
2669 | mov 16(%rax),%rsi | ||
2670 | mov %rax,152($context) # restore context->Rsp | ||
2671 | mov %rsi,168($context) # restore context->Rsi | ||
2672 | mov %rdi,176($context) # restore context->Rdi | ||
2673 | |||
2674 | jmp .Lcommon_seh_exit | ||
2675 | .size key_se_handler,.-key_se_handler | ||
2676 | |||
2677 | .type cbc_se_handler,\@abi-omnipotent | ||
2678 | .align 16 | ||
2679 | cbc_se_handler: | ||
2680 | _CET_ENDBR | ||
2681 | push %rsi | ||
2682 | push %rdi | ||
2683 | push %rbx | ||
2684 | push %rbp | ||
2685 | push %r12 | ||
2686 | push %r13 | ||
2687 | push %r14 | ||
2688 | push %r15 | ||
2689 | pushfq | ||
2690 | sub \$64,%rsp | ||
2691 | |||
2692 | mov 120($context),%rax # pull context->Rax | ||
2693 | mov 248($context),%rbx # pull context->Rip | ||
2694 | |||
2695 | lea .Lcbc_prologue(%rip),%r10 | ||
2696 | cmp %r10,%rbx # context->Rip<.Lcbc_prologue | ||
2697 | jb .Lin_cbc_prologue | ||
2698 | |||
2699 | lea .Lcbc_fast_body(%rip),%r10 | ||
2700 | cmp %r10,%rbx # context->Rip<.Lcbc_fast_body | ||
2701 | jb .Lin_cbc_frame_setup | ||
2702 | |||
2703 | lea .Lcbc_slow_prologue(%rip),%r10 | ||
2704 | cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue | ||
2705 | jb .Lin_cbc_body | ||
2706 | |||
2707 | lea .Lcbc_slow_body(%rip),%r10 | ||
2708 | cmp %r10,%rbx # context->Rip<.Lcbc_slow_body | ||
2709 | jb .Lin_cbc_frame_setup | ||
2710 | |||
2711 | .Lin_cbc_body: | ||
2712 | mov 152($context),%rax # pull context->Rsp | ||
2713 | |||
2714 | lea .Lcbc_epilogue(%rip),%r10 | ||
2715 | cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue | ||
2716 | jae .Lin_cbc_prologue | ||
2717 | |||
2718 | lea 8(%rax),%rax | ||
2719 | |||
2720 | lea .Lcbc_popfq(%rip),%r10 | ||
2721 | cmp %r10,%rbx # context->Rip>=.Lcbc_popfq | ||
2722 | jae .Lin_cbc_prologue | ||
2723 | |||
2724 | mov `16-8`(%rax),%rax # biased $_rsp | ||
2725 | lea 56(%rax),%rax | ||
2726 | |||
2727 | .Lin_cbc_frame_setup: | ||
2728 | mov -16(%rax),%rbx | ||
2729 | mov -24(%rax),%rbp | ||
2730 | mov -32(%rax),%r12 | ||
2731 | mov -40(%rax),%r13 | ||
2732 | mov -48(%rax),%r14 | ||
2733 | mov -56(%rax),%r15 | ||
2734 | mov %rbx,144($context) # restore context->Rbx | ||
2735 | mov %rbp,160($context) # restore context->Rbp | ||
2736 | mov %r12,216($context) # restore context->R12 | ||
2737 | mov %r13,224($context) # restore context->R13 | ||
2738 | mov %r14,232($context) # restore context->R14 | ||
2739 | mov %r15,240($context) # restore context->R15 | ||
2740 | |||
2741 | .Lin_cbc_prologue: | ||
2742 | mov 8(%rax),%rdi | ||
2743 | mov 16(%rax),%rsi | ||
2744 | mov %rax,152($context) # restore context->Rsp | ||
2745 | mov %rsi,168($context) # restore context->Rsi | ||
2746 | mov %rdi,176($context) # restore context->Rdi | ||
2747 | |||
2748 | .Lcommon_seh_exit: | ||
2749 | |||
2750 | mov 40($disp),%rdi # disp->ContextRecord | ||
2751 | mov $context,%rsi # context | ||
2752 | mov \$`1232/8`,%ecx # sizeof(CONTEXT) | ||
2753 | .long 0xa548f3fc # cld; rep movsq | ||
2754 | |||
2755 | mov $disp,%rsi | ||
2756 | xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER | ||
2757 | mov 8(%rsi),%rdx # arg2, disp->ImageBase | ||
2758 | mov 0(%rsi),%r8 # arg3, disp->ControlPc | ||
2759 | mov 16(%rsi),%r9 # arg4, disp->FunctionEntry | ||
2760 | mov 40(%rsi),%r10 # disp->ContextRecord | ||
2761 | lea 56(%rsi),%r11 # &disp->HandlerData | ||
2762 | lea 24(%rsi),%r12 # &disp->EstablisherFrame | ||
2763 | mov %r10,32(%rsp) # arg5 | ||
2764 | mov %r11,40(%rsp) # arg6 | ||
2765 | mov %r12,48(%rsp) # arg7 | ||
2766 | mov %rcx,56(%rsp) # arg8, (NULL) | ||
2767 | call *__imp_RtlVirtualUnwind(%rip) | ||
2768 | |||
2769 | mov \$1,%eax # ExceptionContinueSearch | ||
2770 | add \$64,%rsp | ||
2771 | popfq | ||
2772 | pop %r15 | ||
2773 | pop %r14 | ||
2774 | pop %r13 | ||
2775 | pop %r12 | ||
2776 | pop %rbp | ||
2777 | pop %rbx | ||
2778 | pop %rdi | ||
2779 | pop %rsi | ||
2780 | ret | ||
2781 | .size cbc_se_handler,.-cbc_se_handler | ||
2782 | |||
2783 | .section .pdata | ||
2784 | .align 4 | ||
2785 | .rva .LSEH_begin_aes_encrypt_internal | ||
2786 | .rva .LSEH_end_aes_encrypt_internal | ||
2787 | .rva .LSEH_info_aes_encrypt_internal | ||
2788 | |||
2789 | .rva .LSEH_begin_aes_decrypt_internal | ||
2790 | .rva .LSEH_end_aes_decrypt_internal | ||
2791 | .rva .LSEH_info_aes_decrypt_internal | ||
2792 | |||
2793 | .rva .LSEH_begin_aes_set_encrypt_key_internal | ||
2794 | .rva .LSEH_end_aes_set_encrypt_key_internal | ||
2795 | .rva .LSEH_info_aes_set_encrypt_key_internal | ||
2796 | |||
2797 | .rva .LSEH_begin_aes_set_decrypt_key_internal | ||
2798 | .rva .LSEH_end_aes_set_decrypt_key_internal | ||
2799 | .rva .LSEH_info_aes_set_decrypt_key_internal | ||
2800 | |||
2801 | .rva .LSEH_begin_aes_cbc_encrypt_internal | ||
2802 | .rva .LSEH_end_aes_cbc_encrypt_internal | ||
2803 | .rva .LSEH_info_aes_cbc_encrypt_internal | ||
2804 | |||
2805 | .section .xdata | ||
2806 | .align 8 | ||
2807 | .LSEH_info_aes_encrypt_internal: | ||
2808 | .byte 9,0,0,0 | ||
2809 | .rva block_se_handler | ||
2810 | .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[] | ||
2811 | .LSEH_info_aes_decrypt_internal: | ||
2812 | .byte 9,0,0,0 | ||
2813 | .rva block_se_handler | ||
2814 | .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] | ||
2815 | .LSEH_info_aes_set_encrypt_key_internal: | ||
2816 | .byte 9,0,0,0 | ||
2817 | .rva key_se_handler | ||
2818 | .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] | ||
2819 | .LSEH_info_aes_set_decrypt_key_internal: | ||
2820 | .byte 9,0,0,0 | ||
2821 | .rva key_se_handler | ||
2822 | .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] | ||
2823 | .LSEH_info_aes_cbc_encrypt_internal: | ||
2824 | .byte 9,0,0,0 | ||
2825 | .rva cbc_se_handler | ||
2826 | ___ | ||
2827 | } | ||
2828 | |||
# Final pass over the accumulated assembly text: every `...` span (for
# example `1232/8` or `16-8` in the handlers) is evaluated as a Perl
# expression and replaced by its value.
2829 | $code =~ s/\`([^\`]*)\`/eval($1)/gem; | ||
2830 | |||
# Emit the x86_arch.h include line first, then the generated code.
2831 | print "#include \"x86_arch.h\"\n"; | ||
2832 | print $code; | ||
2833 | |||
# Buffered write errors only surface when the handle is closed, so the
# result of close must be checked; otherwise truncated output would be
# accepted silently.  NOTE(review): if STDOUT was redirected into the
# x86_64-xlate.pl filter earlier in the file, a non-zero exit of that
# filter is reported here as well - confirm against the setup code.
2834 | close STDOUT or die "error closing STDOUT: $!"; | ||