author    djm <>  2010-10-01 22:54:21 +0000
committer djm <>  2010-10-01 22:54:21 +0000
commit    829fd51d4f8dde4a7f3bf54754f3c1d1a502f5e2 (patch)
tree      e03b9f1bd051e844b971936729e9df549a209130 /src/lib/libcrypto/aes/asm
parent    e6b755d2a53d3cac7a344dfdd6bf7c951cac754c (diff)
import OpenSSL-1.0.0a
Diffstat (limited to 'src/lib/libcrypto/aes/asm')
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-586.pl    | 2401
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-armv4.pl  |    1
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-ppc.pl    |  269
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-s390x.pl  |    6
-rwxr-xr-x  src/lib/libcrypto/aes/asm/aes-x86_64.pl | 2012
5 files changed, 3693 insertions, 996 deletions
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl
index 3bc46a968e..aab40e6f1c 100644
--- a/src/lib/libcrypto/aes/asm/aes-586.pl
+++ b/src/lib/libcrypto/aes/asm/aes-586.pl
@@ -2,11 +2,12 @@
2# 2#
3# ==================================================================== 3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary 5# project. The module is, however, dual licensed under OpenSSL and
6# forms are granted according to the OpenSSL license. 6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
7# ==================================================================== 8# ====================================================================
8# 9#
9# Version 3.6. 10# Version 4.3.
10# 11#
11# You might fail to appreciate this module performance from the first 12# You might fail to appreciate this module performance from the first
12# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered 13# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
@@ -81,11 +82,117 @@
81# AMD K8 20 19 82# AMD K8 20 19
82# PIII 25 23 83# PIII 25 23
83# Pentium 81 78 84# Pentium 81 78
84 85#
85push(@INC,"perlasm","../../perlasm"); 86# Version 3.7 reimplements outer rounds as "compact." Meaning that
87# first and last rounds reference compact 256 bytes S-box. This means
88# that first round consumes a lot more CPU cycles and that encrypt
89# and decrypt performance becomes asymmetric. Encrypt performance
90# drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is
91# aggressively pre-fetched.
92#
93# Version 4.0 effectively rolls back to 3.6 and instead implements
94# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
95# which use exclusively 256 byte S-box. These functions are to be
96# called in modes not concealing plain text, such as ECB, or when
97# we're asked to process smaller amount of data [or unconditionally
98# on hyper-threading CPU]. Currently it's called unconditionally from
99# AES_[en|de]crypt, which affects all modes, but CBC. CBC routine
100# still needs to be modified to switch between slower and faster
101# mode when appropriate... But in either case benchmark landscape
102# changes dramatically and below numbers are CPU cycles per processed
103# byte for 128-bit key.
104#
105# ECB encrypt ECB decrypt CBC large chunk
106# P4 56[60] 84[100] 23
107# AMD K8 48[44] 70[79] 18
108# PIII 41[50] 61[91] 24
109# Core 2 32[38] 45[70] 18.5
110# Pentium 120 160 77
111#
112# Version 4.1 switches to compact S-box even in key schedule setup.
113#
114# Version 4.2 prefetches compact S-box in every SSE round or in other
115# words every cache-line is *guaranteed* to be accessed within ~50
116# cycles window. Why just SSE? Because it's needed on hyper-threading
117# CPU! Which is also why it's prefetched with 64 byte stride. Best
118# part is that it has no negative effect on performance:-)
119#
120# Version 4.3 implements switch between compact and non-compact block
121# functions in AES_cbc_encrypt depending on how much data was asked
122# to be processed in one stroke.
123#
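A minimal pure-Perl sketch of what one "compact" round gathers, not part of this commit and with a made-up helper name: each output word is assembled from four S-box bytes chosen by ShiftRows, which is exactly the access pattern the 256-byte table has to serve (MixColumns and the round-key XOR are left out).

sub compact_column {			# hypothetical reference model only
    my ($s0,$s1,$s2,$s3,$sbox) = @_;	# $sbox: ref to the 256 S-box byte values
    return  $sbox->[ $s0        & 0xff]
	 | ($sbox->[($s1 >>  8) & 0xff] <<  8)
	 | ($sbox->[($s2 >> 16) & 0xff] << 16)
	 | ($sbox->[($s3 >> 24) & 0xff] << 24);	# MixColumns/AddRoundKey omitted
}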
124######################################################################
125# Timing attacks are classified in two classes: synchronous when
126# attacker consciously initiates cryptographic operation and collects
127# timing data of various character afterwards, and asynchronous when
128# malicious code is executed on same CPU simultaneously with AES,
129# instruments itself and performs statistical analysis of this data.
130#
131# As far as synchronous attacks go the root of the AES timing
132# vulnerability is twofold. Firstly, of 256 S-box elements at most 160
133# are referred to in single 128-bit block operation. Well, in C
134# implementation with 4 distinct tables it's actually as little as 40
135# references per 256 elements table, but anyway... Secondly, even
136# though S-box elements are clustered into smaller amount of cache-
137# lines, smaller than 160 and even 40, it turned out that for certain
138# plain-text pattern[s] or simply put chosen plain-text and given key
139# few cache-lines remain unaccessed during block operation. Now, if
140# attacker can figure out this access pattern, he can deduce the key
141# [or at least part of it]. The natural way to mitigate this kind of
142# attacks is to minimize the amount of cache-lines in S-box and/or
143# prefetch them to ensure that every one is accessed for more uniform
144# timing. But note that *if* plain-text was concealed in such way that
145# input to block function is distributed *uniformly*, then attack
146# wouldn't apply. Now note that some encryption modes, most notably
147# CBC, do mask the plain-text in this exact way [secure cipher output
148# is distributed uniformly]. Yes, one still might find input that
149# would reveal the information about given key, but if amount of
150# candidate inputs to be tried is larger than amount of possible key
151# combinations then attack becomes infeasible. This is why revised
152# AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
153# of data is to be processed in one stroke. The current size limit of
154# 512 bytes is chosen to provide same [diminishingly low] probability
155# for cache-line to remain untouched in large chunk operation with
156# large S-box as for single block operation with compact S-box and
157# surely needs more careful consideration...
158#
159# As for asynchronous attacks. There are two flavours: attacker code
160# being interleaved with AES on hyper-threading CPU at *instruction*
161# level, and two processes time sharing single core. As for latter.
162# Two vectors. 1. Given that attacker process has higher priority,
163# yield execution to process performing AES just before timer fires
164# off the scheduler, immediately regain control of CPU and analyze the
165# cache state. For this attack to be efficient attacker would have to
166# effectively slow down the operation by several *orders* of magnitude,
167# by ratio of time slice to duration of handful of AES rounds, which
168# is unlikely to remain unnoticed. Not to mention that this also means
169# that he would spend correspondingly more time to collect enough
170# statistical data to mount the attack. It's probably appropriate to
171# say that if an adversary reckons that this attack is beneficial and
172# worth the risk of being noticed, you probably have larger problems than
173# merely giving him the opportunity. In other words the suggested code design expects you
174# to preclude/mitigate this attack by overall system security design.
175# 2. Attacker manages to make his code interrupt driven. In order for
176# this kind of attack to be feasible, interrupt rate has to be high
177# enough, again comparable to duration of handful of AES rounds. But
178# is there interrupt source of such rate? Hardly, not even 1Gbps NIC
179# generates interrupts at such raging rate...
180#
181# And now back to the former, hyper-threading CPU or more specifically
182# Intel P4. Recall that asynchronous attack implies that malicious
183# code instruments itself. And naturally instrumentation granularity
184# has to be noticeably lower than duration of codepath accessing S-box.
185# Given that all cache-lines are accessed during that time that is.
186# Current implementation accesses *all* cache-lines within ~50 cycles
187# window, which is actually *less* than RDTSC latency on Intel P4!
188
189$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
190push(@INC,"${dir}","${dir}../../perlasm");
86require "x86asm.pl"; 191require "x86asm.pl";
87 192
88&asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386"); 193&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
194&static_label("AES_Te");
195&static_label("AES_Td");
89 196
90$s0="eax"; 197$s0="eax";
91$s1="ebx"; 198$s1="ebx";
@@ -93,21 +200,36 @@ $s2="ecx";
93$s3="edx"; 200$s3="edx";
94$key="edi"; 201$key="edi";
95$acc="esi"; 202$acc="esi";
203$tbl="ebp";
204
205# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
206# by caller
207$__ra=&DWP(0,"esp"); # return address
208$__s0=&DWP(4,"esp"); # s0 backing store
209$__s1=&DWP(8,"esp"); # s1 backing store
210$__s2=&DWP(12,"esp"); # s2 backing store
211$__s3=&DWP(16,"esp"); # s3 backing store
212$__key=&DWP(20,"esp"); # pointer to key schedule
213$__end=&DWP(24,"esp"); # pointer to end of key schedule
214$__tbl=&DWP(28,"esp"); # %ebp backing store
215
216# stack frame layout in AES_[en|de]crypt routines, which differs from
217# above by 4 and overlaps by %ebp backing store
218$_tbl=&DWP(24,"esp");
219$_esp=&DWP(28,"esp");
96 220
97$compromise=0; # $compromise=128 abstains from copying key 221sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
98 # schedule to stack when encrypting inputs 222
99 # shorter than 128 bytes at the cost of 223$speed_limit=512; # chunks smaller than $speed_limit are
100 # risksing aliasing with S-boxes. In return 224 # processed with compact routine in CBC mode
101 # you get way better, up to +70%, small block
102 # performance.
103$small_footprint=1; # $small_footprint=1 code is ~5% slower [on 225$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
104 # recent µ-archs], but ~5 times smaller! 226 # recent µ-archs], but ~5 times smaller!
105 # I favor compact code to minimize cache 227 # I favor compact code to minimize cache
106 # contention and in hope to "collect" 5% back 228 # contention and in hope to "collect" 5% back
107 # in real-life applications... 229 # in real-life applications...
230
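A hypothetical sketch of the $speed_limit policy just declared; the real switch lives in AES_cbc_encrypt further down the file (outside this hunk) and keys off the byte count passed in one call, roughly like this:

sub pick_block_fn {			# illustrative only, nothing emits this
    my ($len,$sse_ok) = @_;
    if ($len < $speed_limit) {		# small chunk: stay on the leak-resistant path
	return $sse_ok ? "_sse_AES_encrypt_compact" : "_x86_AES_encrypt_compact";
    }
    return "_x86_AES_encrypt";		# large chunk: full T-tables, fewer cycles per byte
}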
108$vertical_spin=0; # shift "verticaly" defaults to 0, because of 231$vertical_spin=0; # shift "verticaly" defaults to 0, because of
109 # its proof-of-concept status... 232 # its proof-of-concept status...
110
111# Note that there is no decvert(), as well as last encryption round is 233# Note that there is no decvert(), as well as last encryption round is
112# performed with "horizontal" shifts. This is because this "vertical" 234# performed with "horizontal" shifts. This is because this "vertical"
113# implementation [one which groups shifts on a given $s[i] to form a 235# implementation [one which groups shifts on a given $s[i] to form a
@@ -170,17 +292,484 @@ sub encvert()
170 &movz ($v0,&HB($v1)); 292 &movz ($v0,&HB($v1));
171 &and ($v1,0xFF); 293 &and ($v1,0xFF);
172 &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16 294 &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16
173 &mov ($key,&DWP(12,"esp")); # reincarnate v1 as key 295 &mov ($key,$__key); # reincarnate v1 as key
174 &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24 296 &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24
175} 297}
176 298
299# Another experimental routine, which features "horizontal spin," but
300# eliminates one reference to stack. Strangely enough runs slower...
301sub enchoriz()
302{ my $v0 = $key, $v1 = $acc;
303
304 &movz ($v0,&LB($s0)); # 3, 2, 1, 0*
305 &rotr ($s2,8); # 8,11,10, 9
306 &mov ($v1,&DWP(0,$te,$v0,8)); # 0
307 &movz ($v0,&HB($s1)); # 7, 6, 5*, 4
308 &rotr ($s3,16); # 13,12,15,14
309 &xor ($v1,&DWP(3,$te,$v0,8)); # 5
310 &movz ($v0,&HB($s2)); # 8,11,10*, 9
311 &rotr ($s0,16); # 1, 0, 3, 2
312 &xor ($v1,&DWP(2,$te,$v0,8)); # 10
313 &movz ($v0,&HB($s3)); # 13,12,15*,14
314 &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected
315 &mov ($__s0,$v1); # t[0] saved
316
317 &movz ($v0,&LB($s1)); # 7, 6, 5, 4*
318 &shr ($s1,16); # -, -, 7, 6
319 &mov ($v1,&DWP(0,$te,$v0,8)); # 4
320 &movz ($v0,&LB($s3)); # 13,12,15,14*
321 &xor ($v1,&DWP(2,$te,$v0,8)); # 14
322 &movz ($v0,&HB($s0)); # 1, 0, 3*, 2
323 &and ($s3,0xffff0000); # 13,12, -, -
324 &xor ($v1,&DWP(1,$te,$v0,8)); # 3
325 &movz ($v0,&LB($s2)); # 8,11,10, 9*
326 &or ($s3,$s1); # 13,12, 7, 6
327 &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected
328 &mov ($s1,$v1); # s[1]=t[1]
329
330 &movz ($v0,&LB($s0)); # 1, 0, 3, 2*
331 &shr ($s2,16); # -, -, 8,11
332 &mov ($v1,&DWP(2,$te,$v0,8)); # 2
333 &movz ($v0,&HB($s3)); # 13,12, 7*, 6
334 &xor ($v1,&DWP(1,$te,$v0,8)); # 7
335 &movz ($v0,&HB($s2)); # -, -, 8*,11
336 &xor ($v1,&DWP(0,$te,$v0,8)); # 8
337 &mov ($v0,$s3);
338 &shr ($v0,24); # 13
339 &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected
340
341 &movz ($v0,&LB($s2)); # -, -, 8,11*
342 &shr ($s0,24); # 1*
343 &mov ($s2,&DWP(1,$te,$v0,8)); # 11
344 &xor ($s2,&DWP(3,$te,$s0,8)); # 1
345 &mov ($s0,$__s0); # s[0]=t[0]
346 &movz ($v0,&LB($s3)); # 13,12, 7, 6*
347 &shr ($s3,16); # , ,13,12
348 &xor ($s2,&DWP(2,$te,$v0,8)); # 6
349 &mov ($key,$__key); # reincarnate v0 as key
350 &and ($s3,0xff); # , ,13,12*
351 &mov ($s3,&DWP(0,$te,$s3,8)); # 12
352 &xor ($s3,$s2); # s[2]=t[3] collected
353 &mov ($s2,$v1); # s[2]=t[2]
354}
355
356# More experimental code... SSE one... Even though this one eliminates
357# *all* references to stack, it's not faster...
358sub sse_encbody()
359{
360 &movz ($acc,&LB("eax")); # 0
361 &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0
362 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
363 &movz ("edx",&HB("eax")); # 1
364 &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1
365 &shr ("eax",16); # 5, 4
366
367 &movz ($acc,&LB("ebx")); # 10
368 &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10
369 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
370 &movz ($acc,&HB("ebx")); # 11
371 &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11
372 &shr ("ebx",16); # 15,14
373
374 &movz ($acc,&HB("eax")); # 5
375 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5
376 &movq ("mm3",QWP(16,$key));
377 &movz ($acc,&HB("ebx")); # 15
378 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15
379 &movd ("mm0","ecx"); # t[0] collected
380
381 &movz ($acc,&LB("eax")); # 4
382 &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4
383 &movd ("eax","mm2"); # 7, 6, 3, 2
384 &movz ($acc,&LB("ebx")); # 14
385 &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14
386 &movd ("ebx","mm6"); # 13,12, 9, 8
387
388 &movz ($acc,&HB("eax")); # 3
389 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3
390 &movz ($acc,&HB("ebx")); # 9
391 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9
392 &movd ("mm1","ecx"); # t[1] collected
393
394 &movz ($acc,&LB("eax")); # 2
395 &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2
396 &shr ("eax",16); # 7, 6
397 &punpckldq ("mm0","mm1"); # t[0,1] collected
398 &movz ($acc,&LB("ebx")); # 8
399 &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8
400 &shr ("ebx",16); # 13,12
401
402 &movz ($acc,&HB("eax")); # 7
403 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7
404 &pxor ("mm0","mm3");
405 &movz ("eax",&LB("eax")); # 6
406 &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6
407 &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
408 &movz ($acc,&HB("ebx")); # 13
409 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13
410 &xor ("ecx",&DWP(24,$key)); # t[2]
411 &movd ("mm4","ecx"); # t[2] collected
412 &movz ("ebx",&LB("ebx")); # 12
413 &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12
414 &shr ("ecx",16);
415 &movd ("eax","mm1"); # 5, 4, 1, 0
416 &mov ("ebx",&DWP(28,$key)); # t[3]
417 &xor ("ebx","edx");
418 &movd ("mm5","ebx"); # t[3] collected
419 &and ("ebx",0xffff0000);
420 &or ("ebx","ecx");
421
422 &punpckldq ("mm4","mm5"); # t[2,3] collected
423}
424
425######################################################################
426# "Compact" block function
427######################################################################
428
429sub enccompact()
430{ my $Fn = mov;
431 while ($#_>5) { pop(@_); $Fn=sub{}; }
432 my ($i,$te,@s)=@_;
433 my $tmp = $key;
434 my $out = $i==3?$s[0]:$acc;
435
436 # $Fn is used in first compact round and its purpose is to
437 # void restoration of some values from stack, so that after
438 # 4xenccompact with extra argument $key value is left there...
439 if ($i==3) { &$Fn ($key,$__key); }##%edx
440 else { &mov ($out,$s[0]); }
441 &and ($out,0xFF);
442 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
443 if ($i==2) { &shr ($s[0],24); }#%ecx[2]
444 &movz ($out,&BP(-128,$te,$out,1));
445
446 if ($i==3) { $tmp=$s[1]; }##%eax
447 &movz ($tmp,&HB($s[1]));
448 &movz ($tmp,&BP(-128,$te,$tmp,1));
449 &shl ($tmp,8);
450 &xor ($out,$tmp);
451
452 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
453 else { &mov ($tmp,$s[2]);
454 &shr ($tmp,16); }
455 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
456 &and ($tmp,0xFF);
457 &movz ($tmp,&BP(-128,$te,$tmp,1));
458 &shl ($tmp,16);
459 &xor ($out,$tmp);
460
461 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
462 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
463 else { &mov ($tmp,$s[3]);
464 &shr ($tmp,24); }
465 &movz ($tmp,&BP(-128,$te,$tmp,1));
466 &shl ($tmp,24);
467 &xor ($out,$tmp);
468 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
469 if ($i==3) { &mov ($s[3],$acc); }
470 &comment();
471}
472
473sub enctransform()
474{ my @s = ($s0,$s1,$s2,$s3);
475 my $i = shift;
476 my $tmp = $tbl;
477 my $r2 = $key ;
478
479 &mov ($acc,$s[$i]);
480 &and ($acc,0x80808080);
481 &mov ($tmp,$acc);
482 &shr ($tmp,7);
483 &lea ($r2,&DWP(0,$s[$i],$s[$i]));
484 &sub ($acc,$tmp);
485 &and ($r2,0xfefefefe);
486 &and ($acc,0x1b1b1b1b);
487 &mov ($tmp,$s[$i]);
488 &xor ($acc,$r2); # r2
489
490 &xor ($s[$i],$acc); # r0 ^ r2
491 &rotl ($s[$i],24);
492 &xor ($s[$i],$acc) # ROTATE(r2^r0,24) ^ r2
493 &rotr ($tmp,16);
494 &xor ($s[$i],$tmp);
495 &rotr ($tmp,8);
496 &xor ($s[$i],$tmp);
497}
498
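The bit twiddling in enctransform() is the classic packed "xtime": doubling four GF(2^8) bytes inside one 32-bit register. A pure-Perl rendition of the same masks, for reference only (the sub name is made up):

sub xtime4 {				# double four packed GF(2^8) bytes
    my $s   = shift;
    my $hi  = $s & 0x80808080;			# MSB of every byte
    my $rep = ($hi - ($hi >> 7)) & 0x1b1b1b1b;	# 0x1b wherever an MSB was set
    return (($s << 1) & 0xfefefefe) ^ $rep;	# byte-wise multiplication by x
}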
499&function_begin_B("_x86_AES_encrypt_compact");
500 # note that caller is expected to allocate stack frame for me!
501 &mov ($__key,$key); # save key
502
503 &xor ($s0,&DWP(0,$key)); # xor with key
504 &xor ($s1,&DWP(4,$key));
505 &xor ($s2,&DWP(8,$key));
506 &xor ($s3,&DWP(12,$key));
507
508 &mov ($acc,&DWP(240,$key)); # load key->rounds
509 &lea ($acc,&DWP(-2,$acc,$acc));
510 &lea ($acc,&DWP(0,$key,$acc,8));
511 &mov ($__end,$acc); # end of key schedule
512
513 # prefetch Te4
514 &mov ($key,&DWP(0-128,$tbl));
515 &mov ($acc,&DWP(32-128,$tbl));
516 &mov ($key,&DWP(64-128,$tbl));
517 &mov ($acc,&DWP(96-128,$tbl));
518 &mov ($key,&DWP(128-128,$tbl));
519 &mov ($acc,&DWP(160-128,$tbl));
520 &mov ($key,&DWP(192-128,$tbl));
521 &mov ($acc,&DWP(224-128,$tbl));
522
523 &set_label("loop",16);
524
525 &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);
526 &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
527 &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
528 &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
529 &enctransform(2);
530 &enctransform(3);
531 &enctransform(0);
532 &enctransform(1);
533 &mov ($key,$__key);
534 &mov ($tbl,$__tbl);
535 &add ($key,16); # advance rd_key
536 &xor ($s0,&DWP(0,$key));
537 &xor ($s1,&DWP(4,$key));
538 &xor ($s2,&DWP(8,$key));
539 &xor ($s3,&DWP(12,$key));
540
541 &cmp ($key,$__end);
542 &mov ($__key,$key);
543 &jb (&label("loop"));
544
545 &enccompact(0,$tbl,$s0,$s1,$s2,$s3);
546 &enccompact(1,$tbl,$s1,$s2,$s3,$s0);
547 &enccompact(2,$tbl,$s2,$s3,$s0,$s1);
548 &enccompact(3,$tbl,$s3,$s0,$s1,$s2);
549
550 &xor ($s0,&DWP(16,$key));
551 &xor ($s1,&DWP(20,$key));
552 &xor ($s2,&DWP(24,$key));
553 &xor ($s3,&DWP(28,$key));
554
555 &ret ();
556&function_end_B("_x86_AES_encrypt_compact");
557
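For reference, the two lea instructions above that locate the end of the key schedule amount to the arithmetic below (hypothetical helper, not generated code); the loop exits 16 bytes short of the final round key, which is then applied after the last, lookup-only compact round.

sub end_of_schedule {			# hypothetical check of the lea pair
    my ($key,$rounds) = @_;		# $rounds is 10, 12 or 14
    my $acc = 2*$rounds - 2;		# lea acc,[acc+acc-2]
    return $key + 8*$acc;		# lea acc,[key+acc*8] == key + 16*(rounds-1)
}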
558######################################################################
559# "Compact" SSE block function.
560######################################################################
561#
562# Performance is not actually extraordinary in comparison to pure
563# x86 code. In particular encrypt performance is virtually the same.
564# Decrypt performance on the other hand is 15-20% better on newer
565# µ-archs [but we're thankful for *any* improvement here], and ~50%
566# better on PIII:-) And additionally on the pros side this code
567# eliminates redundant references to stack and thus relieves/
568# minimizes the pressure on the memory bus.
569#
570# MMX register layout lsb
571# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
572# | mm4 | mm0 |
573# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
574# | s3 | s2 | s1 | s0 |
575# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
576# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
577# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
578#
579# Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
580# In this terms encryption and decryption "compact" permutation
581# matrices can be depicted as following:
582#
583# encryption lsb # decryption lsb
584# +----++----+----+----+----+ # +----++----+----+----+----+
585# | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 |
586# +----++----+----+----+----+ # +----++----+----+----+----+
587# | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 |
588# +----++----+----+----+----+ # +----++----+----+----+----+
589# | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 |
590# +----++----+----+----+----+ # +----++----+----+----+----+
591# | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 |
592# +----++----+----+----+----+ # +----++----+----+----+----+
593#
594######################################################################
595# Why not xmm registers? Short answer. It was actually tested and
596# was not any faster, but *contrary*, most notably on Intel CPUs.
597# Longer answer. Main advantage of using mm registers is that movd
598# latency is lower, especially on Intel P4. While arithmetic
599# instructions are twice as many, they can be scheduled every cycle
600# and not every second one when they are operating on xmm register,
601# so that "arithmetic throughput" remains virtually the same. And
602# finally the code can be executed even on elder SSE-only CPUs:-)
603
604sub sse_enccompact()
605{
606 &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
607 &pshufw ("mm5","mm4",0x0d); # 15,14,11,10
608 &movd ("eax","mm1"); # 5, 4, 1, 0
609 &movd ("ebx","mm5"); # 15,14,11,10
610
611 &movz ($acc,&LB("eax")); # 0
612 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
613 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
614 &movz ("edx",&HB("eax")); # 1
615 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
616 &shl ("edx",8); # 1
617 &shr ("eax",16); # 5, 4
618
619 &movz ($acc,&LB("ebx")); # 10
620 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
621 &shl ($acc,16); # 10
622 &or ("ecx",$acc); # 10
623 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
624 &movz ($acc,&HB("ebx")); # 11
625 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
626 &shl ($acc,24); # 11
627 &or ("edx",$acc); # 11
628 &shr ("ebx",16); # 15,14
629
630 &movz ($acc,&HB("eax")); # 5
631 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5
632 &shl ($acc,8); # 5
633 &or ("ecx",$acc); # 5
634 &movz ($acc,&HB("ebx")); # 15
635 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
636 &shl ($acc,24); # 15
637 &or ("ecx",$acc); # 15
638 &movd ("mm0","ecx"); # t[0] collected
639
640 &movz ($acc,&LB("eax")); # 4
641 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4
642 &movd ("eax","mm2"); # 7, 6, 3, 2
643 &movz ($acc,&LB("ebx")); # 14
644 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
645 &shl ($acc,16); # 14
646 &or ("ecx",$acc); # 14
647
648 &movd ("ebx","mm6"); # 13,12, 9, 8
649 &movz ($acc,&HB("eax")); # 3
650 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3
651 &shl ($acc,24); # 3
652 &or ("ecx",$acc); # 3
653 &movz ($acc,&HB("ebx")); # 9
654 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
655 &shl ($acc,8); # 9
656 &or ("ecx",$acc); # 9
657 &movd ("mm1","ecx"); # t[1] collected
658
659 &movz ($acc,&LB("ebx")); # 8
660 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8
661 &shr ("ebx",16); # 13,12
662 &movz ($acc,&LB("eax")); # 2
663 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
664 &shl ($acc,16); # 2
665 &or ("ecx",$acc); # 2
666 &shr ("eax",16); # 7, 6
667
668 &punpckldq ("mm0","mm1"); # t[0,1] collected
669
670 &movz ($acc,&HB("eax")); # 7
671 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
672 &shl ($acc,24); # 7
673 &or ("ecx",$acc); # 7
674 &and ("eax",0xff); # 6
675 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
676 &shl ("eax",16); # 6
677 &or ("edx","eax"); # 6
678 &movz ($acc,&HB("ebx")); # 13
679 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
680 &shl ($acc,8); # 13
681 &or ("ecx",$acc); # 13
682 &movd ("mm4","ecx"); # t[2] collected
683 &and ("ebx",0xff); # 12
684 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
685 &or ("edx","ebx"); # 12
686 &movd ("mm5","edx"); # t[3] collected
687
688 &punpckldq ("mm4","mm5"); # t[2,3] collected
689}
690
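The pshufw immediates used above (0x08, 0x0d, ...) are plain 2-bit word selectors; a hypothetical one-liner to decode them while reading the code: 0x08 picks source words (0,2,0,0), which with mm0 holding s1:s0 drops bytes "5, 4, 1, 0" into the low dword ahead of the movd.

sub pshufw_sel {			# hypothetical decoder for the immediate byte
    my $imm = shift;
    return map { ($imm >> (2*$_)) & 3 } 0..3;	# source word for destination words 0..3
}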
691 if (!$x86only) {
692&function_begin_B("_sse_AES_encrypt_compact");
693 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
694 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
695
696 # note that caller is expected to allocate stack frame for me!
697 &mov ($acc,&DWP(240,$key)); # load key->rounds
698 &lea ($acc,&DWP(-2,$acc,$acc));
699 &lea ($acc,&DWP(0,$key,$acc,8));
700 &mov ($__end,$acc); # end of key schedule
701
702 &mov ($s0,0x1b1b1b1b); # magic constant
703 &mov (&DWP(8,"esp"),$s0);
704 &mov (&DWP(12,"esp"),$s0);
705
706 # prefetch Te4
707 &mov ($s0,&DWP(0-128,$tbl));
708 &mov ($s1,&DWP(32-128,$tbl));
709 &mov ($s2,&DWP(64-128,$tbl));
710 &mov ($s3,&DWP(96-128,$tbl));
711 &mov ($s0,&DWP(128-128,$tbl));
712 &mov ($s1,&DWP(160-128,$tbl));
713 &mov ($s2,&DWP(192-128,$tbl));
714 &mov ($s3,&DWP(224-128,$tbl));
715
716 &set_label("loop",16);
717 &sse_enccompact();
718 &add ($key,16);
719 &cmp ($key,$__end);
720 &ja (&label("out"));
721
722 &movq ("mm2",&QWP(8,"esp"));
723 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
724 &movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0
725 &pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4");
726 &pand ("mm3","mm2"); &pand ("mm7","mm2");
727 &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)
728 &paddb ("mm0","mm0"); &paddb ("mm4","mm4");
729 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2
730 &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0
731 &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2
732 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16)
733
734 &movq ("mm2","mm3"); &movq ("mm6","mm7");
735 &pslld ("mm3",8); &pslld ("mm7",8);
736 &psrld ("mm2",24); &psrld ("mm6",24);
737 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8
738 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24
739
740 &movq ("mm3","mm1"); &movq ("mm7","mm5");
741 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
742 &psrld ("mm1",8); &psrld ("mm5",8);
743 &mov ($s0,&DWP(0-128,$tbl));
744 &pslld ("mm3",24); &pslld ("mm7",24);
745 &mov ($s1,&DWP(64-128,$tbl));
746 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)<<8
747 &mov ($s2,&DWP(128-128,$tbl));
748 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)>>24
749 &mov ($s3,&DWP(192-128,$tbl));
750
751 &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
752 &jmp (&label("loop"));
753
754 &set_label("out",16);
755 &pxor ("mm0",&QWP(0,$key));
756 &pxor ("mm4",&QWP(8,$key));
757
758 &ret ();
759&function_end_B("_sse_AES_encrypt_compact");
760 }
761
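The pcmpgtb/pand/paddb/pxor cluster in the loop above is the SIMD flavour of the same doubling: a signed byte compare against zero yields 0xff exactly where a byte has its top bit set, and that mask gates the 0x1b reduction constant kept at 8(%esp). A hypothetical single-lane model:

sub mmx_xtime_byte {			# one byte lane of the pcmpgtb-based doubling
    my $b    = shift;
    my $mask = ($b & 0x80) ? 0xff : 0x00;	# pcmpgtb: 0 > signed byte?
    return (($b << 1) & 0xff) ^ ($mask & 0x1b);	# paddb, then pxor with masked 0x1b
}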
762######################################################################
763# Vanilla block function.
764######################################################################
765
177sub encstep() 766sub encstep()
178{ my ($i,$te,@s) = @_; 767{ my ($i,$te,@s) = @_;
179 my $tmp = $key; 768 my $tmp = $key;
180 my $out = $i==3?$s[0]:$acc; 769 my $out = $i==3?$s[0]:$acc;
181 770
182 # lines marked with #%e?x[i] denote "reordered" instructions... 771 # lines marked with #%e?x[i] denote "reordered" instructions...
183 if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx 772 if ($i==3) { &mov ($key,$__key); }##%edx
184 else { &mov ($out,$s[0]); 773 else { &mov ($out,$s[0]);
185 &and ($out,0xFF); } 774 &and ($out,0xFF); }
186 if ($i==1) { &shr ($s[0],16); }#%ebx[1] 775 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
@@ -191,14 +780,14 @@ sub encstep()
191 &movz ($tmp,&HB($s[1])); 780 &movz ($tmp,&HB($s[1]));
192 &xor ($out,&DWP(3,$te,$tmp,8)); 781 &xor ($out,&DWP(3,$te,$tmp,8));
193 782
194 if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx 783 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
195 else { &mov ($tmp,$s[2]); 784 else { &mov ($tmp,$s[2]);
196 &shr ($tmp,16); } 785 &shr ($tmp,16); }
197 if ($i==2) { &and ($s[1],0xFF); }#%edx[2] 786 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
198 &and ($tmp,0xFF); 787 &and ($tmp,0xFF);
199 &xor ($out,&DWP(2,$te,$tmp,8)); 788 &xor ($out,&DWP(2,$te,$tmp,8));
200 789
201 if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx 790 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
202 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] 791 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
203 else { &mov ($tmp,$s[3]); 792 else { &mov ($tmp,$s[3]);
204 &shr ($tmp,24) } 793 &shr ($tmp,24) }
@@ -213,7 +802,7 @@ sub enclast()
213 my $tmp = $key; 802 my $tmp = $key;
214 my $out = $i==3?$s[0]:$acc; 803 my $out = $i==3?$s[0]:$acc;
215 804
216 if ($i==3) { &mov ($key,&DWP(12,"esp")); }##%edx 805 if ($i==3) { &mov ($key,$__key); }##%edx
217 else { &mov ($out,$s[0]); } 806 else { &mov ($out,$s[0]); }
218 &and ($out,0xFF); 807 &and ($out,0xFF);
219 if ($i==1) { &shr ($s[0],16); }#%ebx[1] 808 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
@@ -227,8 +816,8 @@ sub enclast()
227 &and ($tmp,0x0000ff00); 816 &and ($tmp,0x0000ff00);
228 &xor ($out,$tmp); 817 &xor ($out,$tmp);
229 818
230 if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx 819 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
231 else { mov ($tmp,$s[2]); 820 else { &mov ($tmp,$s[2]);
232 &shr ($tmp,16); } 821 &shr ($tmp,16); }
233 if ($i==2) { &and ($s[1],0xFF); }#%edx[2] 822 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
234 &and ($tmp,0xFF); 823 &and ($tmp,0xFF);
@@ -236,7 +825,7 @@ sub enclast()
236 &and ($tmp,0x00ff0000); 825 &and ($tmp,0x00ff0000);
237 &xor ($out,$tmp); 826 &xor ($out,$tmp);
238 827
239 if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx 828 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
240 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] 829 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
241 else { &mov ($tmp,$s[3]); 830 else { &mov ($tmp,$s[3]);
242 &shr ($tmp,24); } 831 &shr ($tmp,24); }
@@ -247,9 +836,6 @@ sub enclast()
247 if ($i==3) { &mov ($s[3],$acc); } 836 if ($i==3) { &mov ($s[3],$acc); }
248} 837}
249 838
250sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
251
252&public_label("AES_Te");
253&function_begin_B("_x86_AES_encrypt"); 839&function_begin_B("_x86_AES_encrypt");
254 if ($vertical_spin) { 840 if ($vertical_spin) {
255 # I need high parts of volatile registers to be accessible... 841 # I need high parts of volatile registers to be accessible...
@@ -258,7 +844,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
258 } 844 }
259 845
260 # note that caller is expected to allocate stack frame for me! 846 # note that caller is expected to allocate stack frame for me!
261 &mov (&DWP(12,"esp"),$key); # save key 847 &mov ($__key,$key); # save key
262 848
263 &xor ($s0,&DWP(0,$key)); # xor with key 849 &xor ($s0,&DWP(0,$key)); # xor with key
264 &xor ($s1,&DWP(4,$key)); 850 &xor ($s1,&DWP(4,$key));
@@ -270,24 +856,24 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
270 if ($small_footprint) { 856 if ($small_footprint) {
271 &lea ($acc,&DWP(-2,$acc,$acc)); 857 &lea ($acc,&DWP(-2,$acc,$acc));
272 &lea ($acc,&DWP(0,$key,$acc,8)); 858 &lea ($acc,&DWP(0,$key,$acc,8));
273 &mov (&DWP(16,"esp"),$acc); # end of key schedule 859 &mov ($__end,$acc); # end of key schedule
274 &align (4); 860
275 &set_label("loop"); 861 &set_label("loop",16);
276 if ($vertical_spin) { 862 if ($vertical_spin) {
277 &encvert("ebp",$s0,$s1,$s2,$s3); 863 &encvert($tbl,$s0,$s1,$s2,$s3);
278 } else { 864 } else {
279 &encstep(0,"ebp",$s0,$s1,$s2,$s3); 865 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
280 &encstep(1,"ebp",$s1,$s2,$s3,$s0); 866 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
281 &encstep(2,"ebp",$s2,$s3,$s0,$s1); 867 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
282 &encstep(3,"ebp",$s3,$s0,$s1,$s2); 868 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
283 } 869 }
284 &add ($key,16); # advance rd_key 870 &add ($key,16); # advance rd_key
285 &xor ($s0,&DWP(0,$key)); 871 &xor ($s0,&DWP(0,$key));
286 &xor ($s1,&DWP(4,$key)); 872 &xor ($s1,&DWP(4,$key));
287 &xor ($s2,&DWP(8,$key)); 873 &xor ($s2,&DWP(8,$key));
288 &xor ($s3,&DWP(12,$key)); 874 &xor ($s3,&DWP(12,$key));
289 &cmp ($key,&DWP(16,"esp")); 875 &cmp ($key,$__end);
290 &mov (&DWP(12,"esp"),$key); 876 &mov ($__key,$key);
291 &jb (&label("loop")); 877 &jb (&label("loop"));
292 } 878 }
293 else { 879 else {
@@ -296,15 +882,15 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
296 &cmp ($acc,12); 882 &cmp ($acc,12);
297 &jle (&label("12rounds")); 883 &jle (&label("12rounds"));
298 884
299 &set_label("14rounds"); 885 &set_label("14rounds",4);
300 for ($i=1;$i<3;$i++) { 886 for ($i=1;$i<3;$i++) {
301 if ($vertical_spin) { 887 if ($vertical_spin) {
302 &encvert("ebp",$s0,$s1,$s2,$s3); 888 &encvert($tbl,$s0,$s1,$s2,$s3);
303 } else { 889 } else {
304 &encstep(0,"ebp",$s0,$s1,$s2,$s3); 890 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
305 &encstep(1,"ebp",$s1,$s2,$s3,$s0); 891 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
306 &encstep(2,"ebp",$s2,$s3,$s0,$s1); 892 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
307 &encstep(3,"ebp",$s3,$s0,$s1,$s2); 893 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
308 } 894 }
309 &xor ($s0,&DWP(16*$i+0,$key)); 895 &xor ($s0,&DWP(16*$i+0,$key));
310 &xor ($s1,&DWP(16*$i+4,$key)); 896 &xor ($s1,&DWP(16*$i+4,$key));
@@ -312,16 +898,16 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
312 &xor ($s3,&DWP(16*$i+12,$key)); 898 &xor ($s3,&DWP(16*$i+12,$key));
313 } 899 }
314 &add ($key,32); 900 &add ($key,32);
315 &mov (&DWP(12,"esp"),$key); # advance rd_key 901 &mov ($__key,$key); # advance rd_key
316 &set_label("12rounds"); 902 &set_label("12rounds",4);
317 for ($i=1;$i<3;$i++) { 903 for ($i=1;$i<3;$i++) {
318 if ($vertical_spin) { 904 if ($vertical_spin) {
319 &encvert("ebp",$s0,$s1,$s2,$s3); 905 &encvert($tbl,$s0,$s1,$s2,$s3);
320 } else { 906 } else {
321 &encstep(0,"ebp",$s0,$s1,$s2,$s3); 907 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
322 &encstep(1,"ebp",$s1,$s2,$s3,$s0); 908 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
323 &encstep(2,"ebp",$s2,$s3,$s0,$s1); 909 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
324 &encstep(3,"ebp",$s3,$s0,$s1,$s2); 910 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
325 } 911 }
326 &xor ($s0,&DWP(16*$i+0,$key)); 912 &xor ($s0,&DWP(16*$i+0,$key));
327 &xor ($s1,&DWP(16*$i+4,$key)); 913 &xor ($s1,&DWP(16*$i+4,$key));
@@ -329,16 +915,16 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
329 &xor ($s3,&DWP(16*$i+12,$key)); 915 &xor ($s3,&DWP(16*$i+12,$key));
330 } 916 }
331 &add ($key,32); 917 &add ($key,32);
332 &mov (&DWP(12,"esp"),$key); # advance rd_key 918 &mov ($__key,$key); # advance rd_key
333 &set_label("10rounds"); 919 &set_label("10rounds",4);
334 for ($i=1;$i<10;$i++) { 920 for ($i=1;$i<10;$i++) {
335 if ($vertical_spin) { 921 if ($vertical_spin) {
336 &encvert("ebp",$s0,$s1,$s2,$s3); 922 &encvert($tbl,$s0,$s1,$s2,$s3);
337 } else { 923 } else {
338 &encstep(0,"ebp",$s0,$s1,$s2,$s3); 924 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
339 &encstep(1,"ebp",$s1,$s2,$s3,$s0); 925 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
340 &encstep(2,"ebp",$s2,$s3,$s0,$s1); 926 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
341 &encstep(3,"ebp",$s3,$s0,$s1,$s2); 927 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
342 } 928 }
343 &xor ($s0,&DWP(16*$i+0,$key)); 929 &xor ($s0,&DWP(16*$i+0,$key));
344 &xor ($s1,&DWP(16*$i+4,$key)); 930 &xor ($s1,&DWP(16*$i+4,$key));
@@ -352,10 +938,10 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
352 &mov ($s1="ebx",$key="edi"); 938 &mov ($s1="ebx",$key="edi");
353 &mov ($s2="ecx",$acc="esi"); 939 &mov ($s2="ecx",$acc="esi");
354 } 940 }
355 &enclast(0,"ebp",$s0,$s1,$s2,$s3); 941 &enclast(0,$tbl,$s0,$s1,$s2,$s3);
356 &enclast(1,"ebp",$s1,$s2,$s3,$s0); 942 &enclast(1,$tbl,$s1,$s2,$s3,$s0);
357 &enclast(2,"ebp",$s2,$s3,$s0,$s1); 943 &enclast(2,$tbl,$s2,$s3,$s0,$s1);
358 &enclast(3,"ebp",$s3,$s0,$s1,$s2); 944 &enclast(3,$tbl,$s3,$s0,$s1,$s2);
359 945
360 &add ($key,$small_footprint?16:160); 946 &add ($key,$small_footprint?16:160);
361 &xor ($s0,&DWP(0,$key)); 947 &xor ($s0,&DWP(0,$key));
@@ -430,38 +1016,198 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
430 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); 1016 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
431 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); 1017 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
432 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); 1018 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
1019
1020#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
1021 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1022 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1023 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1024 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1025 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1026 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1027 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1028 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1029 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1030 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1031 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1032 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1033 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1034 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1035 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1036 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1037 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1038 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1039 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1040 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1041 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1042 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1043 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1044 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1045 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1046 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1047 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1048 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1049 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1050 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1051 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1052 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1053
1054 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1055 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1056 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1057 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1058 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1059 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1060 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1061 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1062 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1063 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1064 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1065 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1066 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1067 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1068 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1069 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1070 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1071 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1072 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1073 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1074 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1075 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1076 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1077 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1078 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1079 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1080 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1081 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1082 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1083 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1084 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1085 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1086
1087 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1088 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1089 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1090 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1091 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1092 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1093 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1094 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1095 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1096 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1097 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1098 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1099 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1100 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1101 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1102 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1103 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1104 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1105 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1106 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1107 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1108 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1109 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1110 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1111 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1112 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1113 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1114 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1115 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1116 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1117 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1118 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1119
1120 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1121 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1122 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1123 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1124 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1125 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1126 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1127 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1128 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1129 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1130 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1131 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1132 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1133 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1134 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1135 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1136 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1137 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1138 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1139 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1140 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1141 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1142 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1143 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1144 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1145 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1146 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1147 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1148 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1149 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1150 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1151 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
433#rcon: 1152#rcon:
434 &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008); 1153 &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
435 &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080); 1154 &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
436 &data_word(0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0); 1155 &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
1156 &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
437&function_end_B("_x86_AES_encrypt"); 1157&function_end_B("_x86_AES_encrypt");
438 1158
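As a sanity check on the #rcon table above (hypothetical snippet, not part of the file): each round constant is the previous one doubled in GF(2^8), which is why the sequence jumps from 0x80 to 0x1b.

my @rcon = (0x01);		# reproduces 0x01,0x02,0x04,...,0x80,0x1b,0x36
push @rcon, (($rcon[-1] << 1) ^ (($rcon[-1] & 0x80) ? 0x11b : 0)) & 0xff for (1..9);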
439# void AES_encrypt (const void *inp,void *out,const AES_KEY *key); 1159# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
440&public_label("AES_Te");
441&function_begin("AES_encrypt"); 1160&function_begin("AES_encrypt");
442 &mov ($acc,&wparam(0)); # load inp 1161 &mov ($acc,&wparam(0)); # load inp
443 &mov ($key,&wparam(2)); # load key 1162 &mov ($key,&wparam(2)); # load key
444 1163
445 &mov ($s0,"esp"); 1164 &mov ($s0,"esp");
446 &sub ("esp",24); 1165 &sub ("esp",36);
447 &and ("esp",-64); 1166 &and ("esp",-64); # align to cache-line
448 &add ("esp",4); 1167
449 &mov (&DWP(16,"esp"),$s0); 1168 # place stack frame just "above" the key schedule
1169 &lea ($s1,&DWP(-64-63,$key));
1170 &sub ($s1,"esp");
1171 &neg ($s1);
1172 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1173 &sub ("esp",$s1);
1174 &add ("esp",4); # 4 is reserved for caller's return address
1175 &mov ($_esp,$s0); # save stack pointer
450 1176
451 &call (&label("pic_point")); # make it PIC! 1177 &call (&label("pic_point")); # make it PIC!
452 &set_label("pic_point"); 1178 &set_label("pic_point");
453 &blindpop("ebp"); 1179 &blindpop($tbl);
454 &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); 1180 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);
455 1181 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
1182
1183 # pick Te4 copy which can't "overlap" with stack frame or key schedule
1184 &lea ($s1,&DWP(768-4,"esp"));
1185 &sub ($s1,$tbl);
1186 &and ($s1,0x300);
1187 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
1188
1189 if (!$x86only) {
1190 &bt (&DWP(0,$s0),25); # check for SSE bit
1191 &jnc (&label("x86"));
1192
1193 &movq ("mm0",&QWP(0,$acc));
1194 &movq ("mm4",&QWP(8,$acc));
1195 &call ("_sse_AES_encrypt_compact");
1196 &mov ("esp",$_esp); # restore stack pointer
1197 &mov ($acc,&wparam(1)); # load out
1198 &movq (&QWP(0,$acc),"mm0"); # write output data
1199 &movq (&QWP(8,$acc),"mm4");
1200 &emms ();
1201 &function_end_A();
1202 }
1203 &set_label("x86",16);
1204 &mov ($_tbl,$tbl);
456 &mov ($s0,&DWP(0,$acc)); # load input data 1205 &mov ($s0,&DWP(0,$acc)); # load input data
457 &mov ($s1,&DWP(4,$acc)); 1206 &mov ($s1,&DWP(4,$acc));
458 &mov ($s2,&DWP(8,$acc)); 1207 &mov ($s2,&DWP(8,$acc));
459 &mov ($s3,&DWP(12,$acc)); 1208 &mov ($s3,&DWP(12,$acc));
460 1209 &call ("_x86_AES_encrypt_compact");
461 &call ("_x86_AES_encrypt"); 1210 &mov ("esp",$_esp); # restore stack pointer
462
463 &mov ("esp",&DWP(16,"esp"));
464
465 &mov ($acc,&wparam(1)); # load out 1211 &mov ($acc,&wparam(1)); # load out
466 &mov (&DWP(0,$acc),$s0); # write output data 1212 &mov (&DWP(0,$acc),$s0); # write output data
467 &mov (&DWP(4,$acc),$s1); 1213 &mov (&DWP(4,$acc),$s1);
@@ -469,7 +1215,370 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
469 &mov (&DWP(12,$acc),$s3); 1215 &mov (&DWP(12,$acc),$s3);
470&function_end("AES_encrypt"); 1216&function_end("AES_encrypt");
471 1217
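AES_encrypt now selects one of the four identical Te4 copies emitted above so that the byte lookups cannot land in the same L1 sets as the stack frame or the key schedule; a hypothetical helper reproducing the lea/sub/and/lea arithmetic (the +128 bias is what the &BP(-128,...) lookups cancel out):

sub pick_te4 {				# hypothetical model of the copy selection
    my ($tbl,$esp) = @_;		# $tbl -> AES_Te, $esp = stack frame
    my $delta = (($esp + 768 - 4) - $tbl) & 0x300;	# pick copy 0..3
    return $tbl + 2048 + 128 + $delta;	# Te4 copies start 2KB into AES_Te
}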
472#------------------------------------------------------------------# 1218#--------------------------------------------------------------------#
1219
1220######################################################################
1221# "Compact" block function
1222######################################################################
1223
1224sub deccompact()
1225{ my $Fn = mov;
1226 while ($#_>5) { pop(@_); $Fn=sub{}; }
1227 my ($i,$td,@s)=@_;
1228 my $tmp = $key;
1229 my $out = $i==3?$s[0]:$acc;
1230
1231 # $Fn is used in first compact round and its purpose is to
1232 # void restoration of some values from stack, so that after
1233 # 4xdeccompact with extra argument $key, $s0 and $s1 values
1234 # are left there...
1235 if($i==3) { &$Fn ($key,$__key); }
1236 else { &mov ($out,$s[0]); }
1237 &and ($out,0xFF);
1238 &movz ($out,&BP(-128,$td,$out,1));
1239
1240 if ($i==3) { $tmp=$s[1]; }
1241 &movz ($tmp,&HB($s[1]));
1242 &movz ($tmp,&BP(-128,$td,$tmp,1));
1243 &shl ($tmp,8);
1244 &xor ($out,$tmp);
1245
1246 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1247 else { mov ($tmp,$s[2]); }
1248 &shr ($tmp,16);
1249 &and ($tmp,0xFF);
1250 &movz ($tmp,&BP(-128,$td,$tmp,1));
1251 &shl ($tmp,16);
1252 &xor ($out,$tmp);
1253
1254 if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
1255 else { &mov ($tmp,$s[3]); }
1256 &shr ($tmp,24);
1257 &movz ($tmp,&BP(-128,$td,$tmp,1));
1258 &shl ($tmp,24);
1259 &xor ($out,$tmp);
1260 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1261 if ($i==3) { &$Fn ($s[3],$__s0); }
1262}
1263
1264# must be called with 2,3,0,1 as argument sequence!!!
1265sub dectransform()
1266{ my @s = ($s0,$s1,$s2,$s3);
1267 my $i = shift;
1268 my $tmp = $key;
1269 my $tp2 = @s[($i+2)%4]; $tp2 = @s[2] if ($i==1);
1270 my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
1271 my $tp8 = $tbl;
1272
1273 &mov ($acc,$s[$i]);
1274 &and ($acc,0x80808080);
1275 &mov ($tmp,$acc);
1276 &shr ($tmp,7);
1277 &lea ($tp2,&DWP(0,$s[$i],$s[$i]));
1278 &sub ($acc,$tmp);
1279 &and ($tp2,0xfefefefe);
1280 &and ($acc,0x1b1b1b1b);
1281 &xor ($acc,$tp2);
1282 &mov ($tp2,$acc);
1283
1284 &and ($acc,0x80808080);
1285 &mov ($tmp,$acc);
1286 &shr ($tmp,7);
1287 &lea ($tp4,&DWP(0,$tp2,$tp2));
1288 &sub ($acc,$tmp);
1289 &and ($tp4,0xfefefefe);
1290 &and ($acc,0x1b1b1b1b);
1291 &xor ($tp2,$s[$i]); # tp2^tp1
1292 &xor ($acc,$tp4);
1293 &mov ($tp4,$acc);
1294
1295 &and ($acc,0x80808080);
1296 &mov ($tmp,$acc);
1297 &shr ($tmp,7);
1298 &lea ($tp8,&DWP(0,$tp4,$tp4));
1299 &sub ($acc,$tmp);
1300 &and ($tp8,0xfefefefe);
1301 &and ($acc,0x1b1b1b1b);
1302 &xor ($tp4,$s[$i]); # tp4^tp1
1303 &rotl ($s[$i],8); # = ROTATE(tp1,8)
1304 &xor ($tp8,$acc);
1305
1306 &xor ($s[$i],$tp2);
1307 &xor ($tp2,$tp8);
1308 &rotl ($tp2,24);
1309 &xor ($s[$i],$tp4);
1310 &xor ($tp4,$tp8);
1311 &rotl ($tp4,16);
1312 &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
1313 &rotl ($tp8,8);
1314 &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
1315 &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
1316 &mov ($s[0],$__s0) if($i==2); #prefetch $s0
1317 &mov ($s[1],$__s1) if($i==3); #prefetch $s1
1318 &mov ($s[2],$__s2) if($i==1);
1319 &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)
1320
1321 &mov ($s[3],$__s3) if($i==1);
1322 &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);
1323}
1324
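dectransform() is the packed form of the usual InvMixColumns decomposition: three successive doublings give tp2, tp4 and tp8, and the coefficients 09/0b/0d/0e fall out as XOR combinations of them. A hypothetical single-byte rendition (the real code handles all four bytes of a word at once and folds the rotations in):

sub xtime { my $x = shift; (($x << 1) ^ (($x & 0x80) ? 0x1b : 0)) & 0xff }
sub inv_mix_products {			# hypothetical byte-level model
    my $tp1 = shift;
    my $tp2 = xtime($tp1);
    my $tp4 = xtime($tp2);
    my $tp8 = xtime($tp4);
    return ($tp8 ^ $tp1,		# 0x09 * tp1
	    $tp8 ^ $tp2 ^ $tp1,		# 0x0b * tp1
	    $tp8 ^ $tp4 ^ $tp1,		# 0x0d * tp1
	    $tp8 ^ $tp4 ^ $tp2);	# 0x0e * tp1
}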
1325&function_begin_B("_x86_AES_decrypt_compact");
1326 # note that caller is expected to allocate stack frame for me!
1327 &mov ($__key,$key); # save key
1328
1329 &xor ($s0,&DWP(0,$key)); # xor with key
1330 &xor ($s1,&DWP(4,$key));
1331 &xor ($s2,&DWP(8,$key));
1332 &xor ($s3,&DWP(12,$key));
1333
1334 &mov ($acc,&DWP(240,$key)); # load key->rounds
1335
1336 &lea ($acc,&DWP(-2,$acc,$acc));
1337 &lea ($acc,&DWP(0,$key,$acc,8));
1338 &mov ($__end,$acc); # end of key schedule
1339
1340 # prefetch Td4
1341 &mov ($key,&DWP(0-128,$tbl));
1342 &mov ($acc,&DWP(32-128,$tbl));
1343 &mov ($key,&DWP(64-128,$tbl));
1344 &mov ($acc,&DWP(96-128,$tbl));
1345 &mov ($key,&DWP(128-128,$tbl));
1346 &mov ($acc,&DWP(160-128,$tbl));
1347 &mov ($key,&DWP(192-128,$tbl));
1348 &mov ($acc,&DWP(224-128,$tbl));
1349
1350 &set_label("loop",16);
1351
1352 &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);
1353 &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
1354 &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
1355 &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
1356 &dectransform(2);
1357 &dectransform(3);
1358 &dectransform(0);
1359 &dectransform(1);
1360 &mov ($key,$__key);
1361 &mov ($tbl,$__tbl);
1362 &add ($key,16); # advance rd_key
1363 &xor ($s0,&DWP(0,$key));
1364 &xor ($s1,&DWP(4,$key));
1365 &xor ($s2,&DWP(8,$key));
1366 &xor ($s3,&DWP(12,$key));
1367
1368 &cmp ($key,$__end);
1369 &mov ($__key,$key);
1370 &jb (&label("loop"));
1371
1372 &deccompact(0,$tbl,$s0,$s3,$s2,$s1);
1373 &deccompact(1,$tbl,$s1,$s0,$s3,$s2);
1374 &deccompact(2,$tbl,$s2,$s1,$s0,$s3);
1375 &deccompact(3,$tbl,$s3,$s2,$s1,$s0);
1376
1377 &xor ($s0,&DWP(16,$key));
1378 &xor ($s1,&DWP(20,$key));
1379 &xor ($s2,&DWP(24,$key));
1380 &xor ($s3,&DWP(28,$key));
1381
1382 &ret ();
1383&function_end_B("_x86_AES_decrypt_compact");
1384
1385######################################################################
1386# "Compact" SSE block function.
1387######################################################################
1388
1389sub sse_deccompact()
1390{
1391 &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
1392 &movd ("eax","mm1"); # 7, 6, 1, 0
1393
1394 &pshufw ("mm5","mm4",0x09); # 13,12,11,10
1395 &movz ($acc,&LB("eax")); # 0
1396 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
1397 &movd ("ebx","mm5"); # 13,12,11,10
1398 &movz ("edx",&HB("eax")); # 1
1399 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
1400 &shl ("edx",8); # 1
1401
1402 &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
1403 &movz ($acc,&LB("ebx")); # 10
1404 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
1405 &shl ($acc,16); # 10
1406 &or ("ecx",$acc); # 10
1407 &shr ("eax",16); # 7, 6
1408 &movz ($acc,&HB("ebx")); # 11
1409 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
1410 &shl ($acc,24); # 11
1411 &or ("edx",$acc); # 11
1412 &shr ("ebx",16); # 13,12
1413
1414 &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
1415 &movz ($acc,&HB("eax")); # 7
1416 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
1417 &shl ($acc,24); # 7
1418 &or ("ecx",$acc); # 7
1419 &movz ($acc,&HB("ebx")); # 13
1420 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
1421 &shl ($acc,8); # 13
1422 &or ("ecx",$acc); # 13
1423 &movd ("mm0","ecx"); # t[0] collected
1424
1425 &movz ($acc,&LB("eax")); # 6
1426 &movd ("eax","mm2"); # 3, 2, 5, 4
1427 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
1428 &shl ("ecx",16); # 6
1429 &movz ($acc,&LB("ebx")); # 12
1430 &movd ("ebx","mm6"); # 9, 8,15,14
1431 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
1432 &or ("ecx",$acc); # 12
1433
1434 &movz ($acc,&LB("eax")); # 4
1435 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
1436 &or ("edx",$acc); # 4
1437 &movz ($acc,&LB("ebx")); # 14
1438 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
1439 &shl ($acc,16); # 14
1440 &or ("edx",$acc); # 14
1441 &movd ("mm1","edx"); # t[1] collected
1442
1443 &movz ($acc,&HB("eax")); # 5
1444 &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
1445 &shl ("edx",8); # 5
1446 &movz ($acc,&HB("ebx")); # 15
1447 &shr ("eax",16); # 3, 2
1448 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
1449 &shl ($acc,24); # 15
1450 &or ("edx",$acc); # 15
1451 &shr ("ebx",16); # 9, 8
1452
1453 &punpckldq ("mm0","mm1"); # t[0,1] collected
1454
1455 &movz ($acc,&HB("ebx")); # 9
1456 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
1457 &shl ($acc,8); # 9
1458 &or ("ecx",$acc); # 9
1459 &and ("ebx",0xff); # 8
1460 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
1461 &or ("edx","ebx"); # 8
1462 &movz ($acc,&LB("eax")); # 2
1463 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
1464 &shl ($acc,16); # 2
1465 &or ("edx",$acc); # 2
1466 &movd ("mm4","edx"); # t[2] collected
1467 &movz ("eax",&HB("eax")); # 3
1468 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
1469 &shl ("eax",24); # 3
1470 &or ("ecx","eax"); # 3
1471 &movd ("mm5","ecx"); # t[3] collected
1472
1473 &punpckldq ("mm4","mm5"); # t[2,3] collected
1474}
1475
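sse_deccompact gathers its lookup indices by funnelling the state through pshufw first: the immediate byte selects, two bits per destination word, which source word lands where, so a single movd then exposes exactly the four bytes named in the comments ("7, 6, 1, 0" and so on). A small decoder for those selectors (a sketch, not part of the module):

    # destination word j of pshufw takes source word (imm >> 2*j) & 3
    sub pshufw_words {
        my ($imm) = @_;
        return map { ($imm >> (2 * $_)) & 3 } 0 .. 3;
    }
    # pshufw_words(0x0c) -> (0, 3, 0, 0): the low 32 bits hold source words 0 and 3,
    # i.e. bytes 1,0 and 7,6 -- the "7, 6, 1, 0" that movd moves into %eax above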
1476 if (!$x86only) {
1477&function_begin_B("_sse_AES_decrypt_compact");
1478 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
1479 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
1480
1481 # note that caller is expected to allocate stack frame for me!
1482 &mov ($acc,&DWP(240,$key)); # load key->rounds
1483 &lea ($acc,&DWP(-2,$acc,$acc));
1484 &lea ($acc,&DWP(0,$key,$acc,8));
1485 &mov ($__end,$acc); # end of key schedule
1486
1487 &mov ($s0,0x1b1b1b1b); # magic constant
1488 &mov (&DWP(8,"esp"),$s0);
1489 &mov (&DWP(12,"esp"),$s0);
1490
1491 # prefetch Td4
1492 &mov ($s0,&DWP(0-128,$tbl));
1493 &mov ($s1,&DWP(32-128,$tbl));
1494 &mov ($s2,&DWP(64-128,$tbl));
1495 &mov ($s3,&DWP(96-128,$tbl));
1496 &mov ($s0,&DWP(128-128,$tbl));
1497 &mov ($s1,&DWP(160-128,$tbl));
1498 &mov ($s2,&DWP(192-128,$tbl));
1499 &mov ($s3,&DWP(224-128,$tbl));
1500
1501 &set_label("loop",16);
1502 &sse_deccompact();
1503 &add ($key,16);
1504 &cmp ($key,$__end);
1505 &ja (&label("out"));
1506
1507 # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
1508 &movq ("mm3","mm0"); &movq ("mm7","mm4");
1509 &movq ("mm2","mm0",1); &movq ("mm6","mm4",1);
1510 &movq ("mm1","mm0"); &movq ("mm5","mm4");
1511 &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16)
1512 &pslld ("mm2",8); &pslld ("mm6",8);
1513 &psrld ("mm3",8); &psrld ("mm7",8);
1514 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8
1515 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8
1516 &pslld ("mm2",16); &pslld ("mm6",16);
1517 &psrld ("mm3",16); &psrld ("mm7",16);
1518 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24
1519 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24
1520
1521 &movq ("mm3",&QWP(8,"esp"));
1522 &pxor ("mm2","mm2"); &pxor ("mm6","mm6");
1523 &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5");
1524 &pand ("mm2","mm3"); &pand ("mm6","mm3");
1525 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1526 &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2
1527 &movq ("mm3","mm1"); &movq ("mm7","mm5");
1528 &movq ("mm2","mm1"); &movq ("mm6","mm5");
1529 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2
1530 &pslld ("mm3",24); &pslld ("mm7",24);
1531 &psrld ("mm2",8); &psrld ("mm6",8);
1532 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24
1533 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8
1534
1535 &movq ("mm2",&QWP(8,"esp"));
1536 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1537 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1538 &pand ("mm3","mm2"); &pand ("mm7","mm2");
1539 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1540 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
1541 &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
1542 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
1543 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
1544
1545 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1546 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1547 &pand ("mm3","mm2"); &pand ("mm7","mm2");
1548 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1549 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8
1550 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8
1551 &movq ("mm3","mm1"); &movq ("mm7","mm5");
1552 &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1);
1553 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)
1554 &pslld ("mm1",8); &pslld ("mm5",8);
1555 &psrld ("mm3",8); &psrld ("mm7",8);
1556 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
1557 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8
1558 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8
1559 &mov ($s0,&DWP(0-128,$tbl));
1560 &pslld ("mm1",16); &pslld ("mm5",16);
1561 &mov ($s1,&DWP(64-128,$tbl));
1562 &psrld ("mm3",16); &psrld ("mm7",16);
1563 &mov ($s2,&DWP(128-128,$tbl));
1564 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24
1565 &mov ($s3,&DWP(192-128,$tbl));
1566 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24
1567
1568 &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
1569 &jmp (&label("loop"));
1570
1571 &set_label("out",16);
1572 &pxor ("mm0",&QWP(0,$key));
1573 &pxor ("mm4",&QWP(8,$key));
1574
1575 &ret ();
1576&function_end_B("_sse_AES_decrypt_compact");
1577 }
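Between rounds the MMX loop above has to apply InvMixColumns itself, so it doubles all sixteen state bytes at once: pcmpgtb against zero marks every byte whose top bit is set, pand keeps the 0x1b reduction constant in exactly those lanes, paddb doubles each byte, and pxor folds the reduction back in; tp2, tp4 and tp8 are three applications of that pattern, and the ROTATE(x^y,N)==ROTATE(x,N)^ROTATE(y,N) identity noted at the top of the loop is what lets the byte rotations be applied to the combined values. The same GF(2^8) doubling on one 32-bit word, as scalar Perl (a sketch, not part of the module):

    sub xtime32 {                                   # multiply four packed bytes by 2 in GF(2^8)
        my ($x) = @_;
        my $hi  = $x & 0x80808080;                  # bytes that will overflow
        my $lo  = ($x << 1) & 0xfefefefe;           # per-byte left shift, carries dropped
        my $red = ($hi - ($hi >> 7)) & 0x1b1b1b1b;  # 0x1b in every overflowing lane
        return $lo ^ $red;                          # reduce modulo x^8+x^4+x^3+x+1
    }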
1578
1579######################################################################
1580# Vanilla block function.
1581######################################################################
473 1582
474sub decstep() 1583sub decstep()
475{ my ($i,$td,@s) = @_; 1584{ my ($i,$td,@s) = @_;
@@ -480,7 +1589,7 @@ sub decstep()
480 # optimal... or rather that all attempts to reorder didn't 1589 # optimal... or rather that all attempts to reorder didn't
481 # result in better performance [which by the way is not a 1590 # result in better performance [which by the way is not a
482 # bit lower than encryption]. 1591 # bit lower than encryption].
483 if($i==3) { &mov ($key,&DWP(12,"esp")); } 1592 if($i==3) { &mov ($key,$__key); }
484 else { &mov ($out,$s[0]); } 1593 else { &mov ($out,$s[0]); }
485 &and ($out,0xFF); 1594 &and ($out,0xFF);
486 &mov ($out,&DWP(0,$td,$out,8)); 1595 &mov ($out,&DWP(0,$td,$out,8));
@@ -495,12 +1604,12 @@ sub decstep()
495 &and ($tmp,0xFF); 1604 &and ($tmp,0xFF);
496 &xor ($out,&DWP(2,$td,$tmp,8)); 1605 &xor ($out,&DWP(2,$td,$tmp,8));
497 1606
498 if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); } 1607 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
499 else { &mov ($tmp,$s[3]); } 1608 else { &mov ($tmp,$s[3]); }
500 &shr ($tmp,24); 1609 &shr ($tmp,24);
501 &xor ($out,&DWP(1,$td,$tmp,8)); 1610 &xor ($out,&DWP(1,$td,$tmp,8));
502 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } 1611 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
503 if ($i==3) { &mov ($s[3],&DWP(4,"esp")); } 1612 if ($i==3) { &mov ($s[3],$__s0); }
504 &comment(); 1613 &comment();
505} 1614}
506 1615
@@ -509,14 +1618,24 @@ sub declast()
509 my $tmp = $key; 1618 my $tmp = $key;
510 my $out = $i==3?$s[0]:$acc; 1619 my $out = $i==3?$s[0]:$acc;
511 1620
512 if($i==3) { &mov ($key,&DWP(12,"esp")); } 1621 if($i==0) { &lea ($td,&DWP(2048+128,$td));
1622 &mov ($tmp,&DWP(0-128,$td));
1623 &mov ($acc,&DWP(32-128,$td));
1624 &mov ($tmp,&DWP(64-128,$td));
1625 &mov ($acc,&DWP(96-128,$td));
1626 &mov ($tmp,&DWP(128-128,$td));
1627 &mov ($acc,&DWP(160-128,$td));
1628 &mov ($tmp,&DWP(192-128,$td));
1629 &mov ($acc,&DWP(224-128,$td));
1630 &lea ($td,&DWP(-128,$td)); }
1631 if($i==3) { &mov ($key,$__key); }
513 else { &mov ($out,$s[0]); } 1632 else { &mov ($out,$s[0]); }
514 &and ($out,0xFF); 1633 &and ($out,0xFF);
515 &movz ($out,&BP(2048,$td,$out,1)); 1634 &movz ($out,&BP(0,$td,$out,1));
516 1635
517 if ($i==3) { $tmp=$s[1]; } 1636 if ($i==3) { $tmp=$s[1]; }
518 &movz ($tmp,&HB($s[1])); 1637 &movz ($tmp,&HB($s[1]));
519 &movz ($tmp,&BP(2048,$td,$tmp,1)); 1638 &movz ($tmp,&BP(0,$td,$tmp,1));
520 &shl ($tmp,8); 1639 &shl ($tmp,8);
521 &xor ($out,$tmp); 1640 &xor ($out,$tmp);
522 1641
@@ -524,24 +1643,24 @@ sub declast()
524 else { mov ($tmp,$s[2]); } 1643 else { mov ($tmp,$s[2]); }
525 &shr ($tmp,16); 1644 &shr ($tmp,16);
526 &and ($tmp,0xFF); 1645 &and ($tmp,0xFF);
527 &movz ($tmp,&BP(2048,$td,$tmp,1)); 1646 &movz ($tmp,&BP(0,$td,$tmp,1));
528 &shl ($tmp,16); 1647 &shl ($tmp,16);
529 &xor ($out,$tmp); 1648 &xor ($out,$tmp);
530 1649
531 if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); } 1650 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
532 else { &mov ($tmp,$s[3]); } 1651 else { &mov ($tmp,$s[3]); }
533 &shr ($tmp,24); 1652 &shr ($tmp,24);
534 &movz ($tmp,&BP(2048,$td,$tmp,1)); 1653 &movz ($tmp,&BP(0,$td,$tmp,1));
535 &shl ($tmp,24); 1654 &shl ($tmp,24);
536 &xor ($out,$tmp); 1655 &xor ($out,$tmp);
537 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } 1656 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
538 if ($i==3) { &mov ($s[3],&DWP(4,"esp")); } 1657 if ($i==3) { &mov ($s[3],$__s0);
1658 &lea ($td,&DWP(-2048,$td)); }
539} 1659}
540 1660
541&public_label("AES_Td");
542&function_begin_B("_x86_AES_decrypt"); 1661&function_begin_B("_x86_AES_decrypt");
543 # note that caller is expected to allocate stack frame for me! 1662 # note that caller is expected to allocate stack frame for me!
544 &mov (&DWP(12,"esp"),$key); # save key 1663 &mov ($__key,$key); # save key
545 1664
546 &xor ($s0,&DWP(0,$key)); # xor with key 1665 &xor ($s0,&DWP(0,$key)); # xor with key
547 &xor ($s1,&DWP(4,$key)); 1666 &xor ($s1,&DWP(4,$key));
@@ -553,20 +1672,19 @@ sub declast()
553 if ($small_footprint) { 1672 if ($small_footprint) {
554 &lea ($acc,&DWP(-2,$acc,$acc)); 1673 &lea ($acc,&DWP(-2,$acc,$acc));
555 &lea ($acc,&DWP(0,$key,$acc,8)); 1674 &lea ($acc,&DWP(0,$key,$acc,8));
556 &mov (&DWP(16,"esp"),$acc); # end of key schedule 1675 &mov ($__end,$acc); # end of key schedule
557 &align (4); 1676 &set_label("loop",16);
558 &set_label("loop"); 1677 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
559 &decstep(0,"ebp",$s0,$s3,$s2,$s1); 1678 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
560 &decstep(1,"ebp",$s1,$s0,$s3,$s2); 1679 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
561 &decstep(2,"ebp",$s2,$s1,$s0,$s3); 1680 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
562 &decstep(3,"ebp",$s3,$s2,$s1,$s0);
563 &add ($key,16); # advance rd_key 1681 &add ($key,16); # advance rd_key
564 &xor ($s0,&DWP(0,$key)); 1682 &xor ($s0,&DWP(0,$key));
565 &xor ($s1,&DWP(4,$key)); 1683 &xor ($s1,&DWP(4,$key));
566 &xor ($s2,&DWP(8,$key)); 1684 &xor ($s2,&DWP(8,$key));
567 &xor ($s3,&DWP(12,$key)); 1685 &xor ($s3,&DWP(12,$key));
568 &cmp ($key,&DWP(16,"esp")); 1686 &cmp ($key,$__end);
569 &mov (&DWP(12,"esp"),$key); 1687 &mov ($__key,$key);
570 &jb (&label("loop")); 1688 &jb (&label("loop"));
571 } 1689 }
572 else { 1690 else {
@@ -575,38 +1693,38 @@ sub declast()
575 &cmp ($acc,12); 1693 &cmp ($acc,12);
576 &jle (&label("12rounds")); 1694 &jle (&label("12rounds"));
577 1695
578 &set_label("14rounds"); 1696 &set_label("14rounds",4);
579 for ($i=1;$i<3;$i++) { 1697 for ($i=1;$i<3;$i++) {
580 &decstep(0,"ebp",$s0,$s3,$s2,$s1); 1698 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
581 &decstep(1,"ebp",$s1,$s0,$s3,$s2); 1699 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
582 &decstep(2,"ebp",$s2,$s1,$s0,$s3); 1700 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
583 &decstep(3,"ebp",$s3,$s2,$s1,$s0); 1701 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
584 &xor ($s0,&DWP(16*$i+0,$key)); 1702 &xor ($s0,&DWP(16*$i+0,$key));
585 &xor ($s1,&DWP(16*$i+4,$key)); 1703 &xor ($s1,&DWP(16*$i+4,$key));
586 &xor ($s2,&DWP(16*$i+8,$key)); 1704 &xor ($s2,&DWP(16*$i+8,$key));
587 &xor ($s3,&DWP(16*$i+12,$key)); 1705 &xor ($s3,&DWP(16*$i+12,$key));
588 } 1706 }
589 &add ($key,32); 1707 &add ($key,32);
590 &mov (&DWP(12,"esp"),$key); # advance rd_key 1708 &mov ($__key,$key); # advance rd_key
591 &set_label("12rounds"); 1709 &set_label("12rounds",4);
592 for ($i=1;$i<3;$i++) { 1710 for ($i=1;$i<3;$i++) {
593 &decstep(0,"ebp",$s0,$s3,$s2,$s1); 1711 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
594 &decstep(1,"ebp",$s1,$s0,$s3,$s2); 1712 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
595 &decstep(2,"ebp",$s2,$s1,$s0,$s3); 1713 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
596 &decstep(3,"ebp",$s3,$s2,$s1,$s0); 1714 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
597 &xor ($s0,&DWP(16*$i+0,$key)); 1715 &xor ($s0,&DWP(16*$i+0,$key));
598 &xor ($s1,&DWP(16*$i+4,$key)); 1716 &xor ($s1,&DWP(16*$i+4,$key));
599 &xor ($s2,&DWP(16*$i+8,$key)); 1717 &xor ($s2,&DWP(16*$i+8,$key));
600 &xor ($s3,&DWP(16*$i+12,$key)); 1718 &xor ($s3,&DWP(16*$i+12,$key));
601 } 1719 }
602 &add ($key,32); 1720 &add ($key,32);
603 &mov (&DWP(12,"esp"),$key); # advance rd_key 1721 &mov ($__key,$key); # advance rd_key
604 &set_label("10rounds"); 1722 &set_label("10rounds",4);
605 for ($i=1;$i<10;$i++) { 1723 for ($i=1;$i<10;$i++) {
606 &decstep(0,"ebp",$s0,$s3,$s2,$s1); 1724 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
607 &decstep(1,"ebp",$s1,$s0,$s3,$s2); 1725 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
608 &decstep(2,"ebp",$s2,$s1,$s0,$s3); 1726 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
609 &decstep(3,"ebp",$s3,$s2,$s1,$s0); 1727 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
610 &xor ($s0,&DWP(16*$i+0,$key)); 1728 &xor ($s0,&DWP(16*$i+0,$key));
611 &xor ($s1,&DWP(16*$i+4,$key)); 1729 &xor ($s1,&DWP(16*$i+4,$key));
612 &xor ($s2,&DWP(16*$i+8,$key)); 1730 &xor ($s2,&DWP(16*$i+8,$key));
@@ -614,10 +1732,10 @@ sub declast()
614 } 1732 }
615 } 1733 }
616 1734
617 &declast(0,"ebp",$s0,$s3,$s2,$s1); 1735 &declast(0,$tbl,$s0,$s3,$s2,$s1);
618 &declast(1,"ebp",$s1,$s0,$s3,$s2); 1736 &declast(1,$tbl,$s1,$s0,$s3,$s2);
619 &declast(2,"ebp",$s2,$s1,$s0,$s3); 1737 &declast(2,$tbl,$s2,$s1,$s0,$s3);
620 &declast(3,"ebp",$s3,$s2,$s1,$s0); 1738 &declast(3,$tbl,$s3,$s2,$s1,$s0);
621 1739
622 &add ($key,$small_footprint?16:160); 1740 &add ($key,$small_footprint?16:160);
623 &xor ($s0,&DWP(0,$key)); 1741 &xor ($s0,&DWP(0,$key));
@@ -692,7 +1810,107 @@ sub declast()
692 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); 1810 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
693 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); 1811 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
694 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); 1812 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
695#Td4: 1813
1814#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
1815 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1816 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1817 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1818 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1819 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1820 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1821 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1822 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1823 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1824 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1825 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1826 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1827 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1828 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1829 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1830 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1831 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1832 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1833 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1834 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1835 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1836 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1837 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1838 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1839 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1840 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1841 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1842 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1843 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1844 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1845 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1846 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1847
1848 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1849 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1850 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1851 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1852 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1853 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1854 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1855 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1856 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1857 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1858 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1859 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1860 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1861 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1862 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1863 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1864 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1865 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1866 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1867 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1868 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1869 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1870 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1871 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1872 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1873 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1874 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1875 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1876 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1877 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1878 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1879 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1880
1881 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1882 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1883 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1884 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1885 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1886 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1887 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1888 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1889 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1890 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1891 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1892 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1893 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1894 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1895 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1896 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1897 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1898 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1899 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1900 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1901 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1902 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1903 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1904 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1905 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1906 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1907 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1908 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1909 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1910 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1911 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1912 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1913
696 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); 1914 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
697 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); 1915 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
698 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); 1916 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
@@ -728,43 +1946,57 @@ sub declast()
728&function_end_B("_x86_AES_decrypt"); 1946&function_end_B("_x86_AES_decrypt");
729 1947
730# void AES_decrypt (const void *inp,void *out,const AES_KEY *key); 1948# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
731&public_label("AES_Td");
732&function_begin("AES_decrypt"); 1949&function_begin("AES_decrypt");
733 &mov ($acc,&wparam(0)); # load inp 1950 &mov ($acc,&wparam(0)); # load inp
734 &mov ($key,&wparam(2)); # load key 1951 &mov ($key,&wparam(2)); # load key
735 1952
736 &mov ($s0,"esp"); 1953 &mov ($s0,"esp");
737 &sub ("esp",24); 1954 &sub ("esp",36);
738 &and ("esp",-64); 1955 &and ("esp",-64); # align to cache-line
739 &add ("esp",4); 1956
740 &mov (&DWP(16,"esp"),$s0); 1957 # place stack frame just "above" the key schedule
1958 &lea ($s1,&DWP(-64-63,$key));
1959 &sub ($s1,"esp");
1960 &neg ($s1);
1961 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1962 &sub ("esp",$s1);
1963 &add ("esp",4); # 4 is reserved for caller's return address
1964 &mov ($_esp,$s0); # save stack pointer
741 1965
742 &call (&label("pic_point")); # make it PIC! 1966 &call (&label("pic_point")); # make it PIC!
743 &set_label("pic_point"); 1967 &set_label("pic_point");
744 &blindpop("ebp"); 1968 &blindpop($tbl);
745 &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); 1969 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
746 1970 &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
747 # prefetch Td4 1971
748 &lea ("ebp",&DWP(2048+128,"ebp")); 1972 # pick Td4 copy which can't "overlap" with stack frame or key schedule
749 &mov ($s0,&DWP(0-128,"ebp")); 1973 &lea ($s1,&DWP(768-4,"esp"));
750 &mov ($s1,&DWP(32-128,"ebp")); 1974 &sub ($s1,$tbl);
751 &mov ($s2,&DWP(64-128,"ebp")); 1975 &and ($s1,0x300);
752 &mov ($s3,&DWP(96-128,"ebp")); 1976 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
753 &mov ($s0,&DWP(128-128,"ebp")); 1977
754 &mov ($s1,&DWP(160-128,"ebp")); 1978 if (!$x86only) {
755 &mov ($s2,&DWP(192-128,"ebp")); 1979 &bt (&DWP(0,$s0),25); # check for SSE bit
756 &mov ($s3,&DWP(224-128,"ebp")); 1980 &jnc (&label("x86"));
757 &lea ("ebp",&DWP(-2048-128,"ebp")); 1981
758 1982 &movq ("mm0",&QWP(0,$acc));
1983 &movq ("mm4",&QWP(8,$acc));
1984 &call ("_sse_AES_decrypt_compact");
1985 &mov ("esp",$_esp); # restore stack pointer
1986 &mov ($acc,&wparam(1)); # load out
1987 &movq (&QWP(0,$acc),"mm0"); # write output data
1988 &movq (&QWP(8,$acc),"mm4");
1989 &emms ();
1990 &function_end_A();
1991 }
1992 &set_label("x86",16);
1993 &mov ($_tbl,$tbl);
759 &mov ($s0,&DWP(0,$acc)); # load input data 1994 &mov ($s0,&DWP(0,$acc)); # load input data
760 &mov ($s1,&DWP(4,$acc)); 1995 &mov ($s1,&DWP(4,$acc));
761 &mov ($s2,&DWP(8,$acc)); 1996 &mov ($s2,&DWP(8,$acc));
762 &mov ($s3,&DWP(12,$acc)); 1997 &mov ($s3,&DWP(12,$acc));
763 1998 &call ("_x86_AES_decrypt_compact");
764 &call ("_x86_AES_decrypt"); 1999 &mov ("esp",$_esp); # restore stack pointer
765
766 &mov ("esp",&DWP(16,"esp"));
767
768 &mov ($acc,&wparam(1)); # load out 2000 &mov ($acc,&wparam(1)); # load out
769 &mov (&DWP(0,$acc),$s0); # write output data 2001 &mov (&DWP(0,$acc),$s0); # write output data
770 &mov (&DWP(4,$acc),$s1); 2002 &mov (&DWP(4,$acc),$s1);
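Instead of prefetching a single Td4, the rewritten AES_decrypt above picks one of the four identical 256-byte copies emitted after Td, choosing the copy that, as the comment puts it, can't "overlap" cache-wise with the freshly placed stack frame or the key schedule, and it biases the pointer by +128 so the compact routines can index the table with signed 8-bit displacements. The selection arithmetic, written out over stand-in addresses (a sketch; $esp and $td are hypothetical values for the frame pointer and the start of AES_Td):

    my ($esp, $td) = (0xbfffe0c4, 0x08070000);    # stand-in addresses
    my $pick = (($esp + 768 - 4) - $td) & 0x300;  # 0, 0x100, 0x200 or 0x300
    my $td4  = $td + 2048 + $pick + 128;          # chosen copy, +128 for the &BP(-128,...) lookups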
@@ -777,126 +2009,136 @@ sub declast()
777# unsigned char *ivp,const int enc); 2009# unsigned char *ivp,const int enc);
778{ 2010{
779# stack frame layout 2011# stack frame layout
780# -4(%esp) 0(%esp) return address 2012# -4(%esp) # return address 0(%esp)
781# 0(%esp) 4(%esp) tmp1 2013# 0(%esp) # s0 backing store 4(%esp)
782# 4(%esp) 8(%esp) tmp2 2014# 4(%esp) # s1 backing store 8(%esp)
783# 8(%esp) 12(%esp) key 2015# 8(%esp) # s2 backing store 12(%esp)
784# 12(%esp) 16(%esp) end of key schedule 2016# 12(%esp) # s3 backing store 16(%esp)
785my $_esp=&DWP(16,"esp"); #saved %esp 2017# 16(%esp) # key backup 20(%esp)
786my $_inp=&DWP(20,"esp"); #copy of wparam(0) 2018# 20(%esp) # end of key schedule 24(%esp)
787my $_out=&DWP(24,"esp"); #copy of wparam(1) 2019# 24(%esp) # %ebp backup 28(%esp)
788my $_len=&DWP(28,"esp"); #copy of wparam(2) 2020# 28(%esp) # %esp backup
789my $_key=&DWP(32,"esp"); #copy of wparam(3) 2021my $_inp=&DWP(32,"esp"); # copy of wparam(0)
790my $_ivp=&DWP(36,"esp"); #copy of wparam(4) 2022my $_out=&DWP(36,"esp"); # copy of wparam(1)
791my $_tmp=&DWP(40,"esp"); #volatile variable 2023my $_len=&DWP(40,"esp"); # copy of wparam(2)
792my $ivec=&DWP(44,"esp"); #ivec[16] 2024my $_key=&DWP(44,"esp"); # copy of wparam(3)
793my $aes_key=&DWP(60,"esp"); #copy of aes_key 2025my $_ivp=&DWP(48,"esp"); # copy of wparam(4)
794my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds 2026my $_tmp=&DWP(52,"esp"); # volatile variable
795 2027#
796&public_label("AES_Te"); 2028my $ivec=&DWP(60,"esp"); # ivec[16]
797&public_label("AES_Td"); 2029my $aes_key=&DWP(76,"esp"); # copy of aes_key
2030my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
2031
798&function_begin("AES_cbc_encrypt"); 2032&function_begin("AES_cbc_encrypt");
799 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len 2033 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
800 &cmp ($s2,0); 2034 &cmp ($s2,0);
801 &je (&label("enc_out")); 2035 &je (&label("drop_out"));
802 2036
803 &call (&label("pic_point")); # make it PIC! 2037 &call (&label("pic_point")); # make it PIC!
804 &set_label("pic_point"); 2038 &set_label("pic_point");
805 &blindpop("ebp"); 2039 &blindpop($tbl);
806 2040 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
807 &pushf ();
808 &cld ();
809 2041
810 &cmp (&wparam(5),0); 2042 &cmp (&wparam(5),0);
811 &je (&label("DECRYPT")); 2043 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
812 2044 &jne (&label("picked_te"));
813 &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); 2045 &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));
2046 &set_label("picked_te");
814 2047
815 # allocate aligned stack frame... 2048 # one can argue if this is required
816 &lea ($key,&DWP(-64-244,"esp")); 2049 &pushf ();
817 &and ($key,-64); 2050 &cld ();
818 2051
819 # ... and make sure it doesn't alias with AES_Te modulo 4096 2052 &cmp ($s2,$speed_limit);
820 &mov ($s0,"ebp"); 2053 &jb (&label("slow_way"));
821 &lea ($s1,&DWP(2048,"ebp")); 2054 &test ($s2,15);
822 &mov ($s3,$key); 2055 &jnz (&label("slow_way"));
2056 if (!$x86only) {
2057 &bt (&DWP(0,$s0),28); # check for hyper-threading bit
2058 &jc (&label("slow_way"));
2059 }
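The three checks just above gate the fast CBC path: it is taken only for inputs of at least $speed_limit bytes, made up of whole 16-byte blocks, and only when bit 28 of OPENSSL_ia32cap_P (the hyper-threading bit the comment refers to) is clear, presumably because the large-table code is the one most exposed to cache observation by a sibling thread. As a single expression (a sketch; $len and $ia32cap are stand-ins, $speed_limit is the threshold defined earlier in the module):

    my $take_fast_path = $len >= $speed_limit        # worth the key-schedule copy and prefetch
                      && ($len & 15) == 0            # whole blocks only
                      && !($ia32cap & (1 << 28));    # no hyper-threading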
2060 # pre-allocate aligned stack frame...
2061 &lea ($acc,&DWP(-80-244,"esp"));
2062 &and ($acc,-64);
2063
2064 # ... and make sure it doesn't alias with $tbl modulo 4096
2065 &mov ($s0,$tbl);
2066 &lea ($s1,&DWP(2048+256,$tbl));
2067 &mov ($s3,$acc);
823 &and ($s0,0xfff); # s = %ebp&0xfff 2068 &and ($s0,0xfff); # s = %ebp&0xfff
824 &and ($s1,0xfff); # e = (%ebp+2048)&0xfff 2069 &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
825 &and ($s3,0xfff); # p = %esp&0xfff 2070 &and ($s3,0xfff); # p = %esp&0xfff
826 2071
827 &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e); 2072 &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e);
828 &jb (&label("te_break_out")); 2073 &jb (&label("tbl_break_out"));
829 &sub ($s3,$s1); 2074 &sub ($s3,$s1);
830 &sub ($key,$s3); 2075 &sub ($acc,$s3);
831 &jmp (&label("te_ok")); 2076 &jmp (&label("tbl_ok"));
832 &set_label("te_break_out"); # else %esp -= (p-s)&0xfff + framesz; 2077 &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz;
833 &sub ($s3,$s0); 2078 &sub ($s3,$s0);
834 &and ($s3,0xfff); 2079 &and ($s3,0xfff);
835 &add ($s3,64+256); 2080 &add ($s3,384);
836 &sub ($key,$s3); 2081 &sub ($acc,$s3);
837 &align (4); 2082 &set_label("tbl_ok",4);
838 &set_label("te_ok");
839
840 &mov ($s0,&wparam(0)); # load inp
841 &mov ($s1,&wparam(1)); # load out
842 &mov ($s3,&wparam(3)); # load key
843 &mov ($acc,&wparam(4)); # load ivp
844 2083
845 &exch ("esp",$key); 2084 &lea ($s3,&wparam(0)); # obtain pointer to parameter block
2085 &exch ("esp",$acc); # allocate stack frame
846 &add ("esp",4); # reserve for return address! 2086 &add ("esp",4); # reserve for return address!
847 &mov ($_esp,$key); # save %esp 2087 &mov ($_tbl,$tbl); # save %ebp
2088 &mov ($_esp,$acc); # save %esp
2089
2090 &mov ($s0,&DWP(0,$s3)); # load inp
2091 &mov ($s1,&DWP(4,$s3)); # load out
2092 #&mov ($s2,&DWP(8,$s3)); # load len
2093 &mov ($key,&DWP(12,$s3)); # load key
2094 &mov ($acc,&DWP(16,$s3)); # load ivp
2095 &mov ($s3,&DWP(20,$s3)); # load enc flag
848 2096
849 &mov ($_inp,$s0); # save copy of inp 2097 &mov ($_inp,$s0); # save copy of inp
850 &mov ($_out,$s1); # save copy of out 2098 &mov ($_out,$s1); # save copy of out
851 &mov ($_len,$s2); # save copy of len 2099 &mov ($_len,$s2); # save copy of len
852 &mov ($_key,$s3); # save copy of key 2100 &mov ($_key,$key); # save copy of key
853 &mov ($_ivp,$acc); # save copy of ivp 2101 &mov ($_ivp,$acc); # save copy of ivp
854 2102
855 &mov ($mark,0); # copy of aes_key->rounds = 0; 2103 &mov ($mark,0); # copy of aes_key->rounds = 0;
856 if ($compromise) {
857 &cmp ($s2,$compromise);
858 &jb (&label("skip_ecopy"));
859 }
860 # do we copy key schedule to stack? 2104 # do we copy key schedule to stack?
861 &mov ($s1 eq "ebx" ? $s1 : "",$s3); 2105 &mov ($s1 eq "ebx" ? $s1 : "",$key);
862 &mov ($s2 eq "ecx" ? $s2 : "",244/4); 2106 &mov ($s2 eq "ecx" ? $s2 : "",244/4);
863 &sub ($s1,"ebp"); 2107 &sub ($s1,$tbl);
864 &mov ("esi",$s3); 2108 &mov ("esi",$key);
865 &and ($s1,0xfff); 2109 &and ($s1,0xfff);
866 &lea ("edi",$aes_key); 2110 &lea ("edi",$aes_key);
867 &cmp ($s1,2048); 2111 &cmp ($s1,2048+256);
868 &jb (&label("do_ecopy")); 2112 &jb (&label("do_copy"));
869 &cmp ($s1,4096-244); 2113 &cmp ($s1,4096-244);
870 &jb (&label("skip_ecopy")); 2114 &jb (&label("skip_copy"));
871 &align (4); 2115 &set_label("do_copy",4);
872 &set_label("do_ecopy");
873 &mov ($_key,"edi"); 2116 &mov ($_key,"edi");
874 &data_word(0xA5F3F689); # rep movsd 2117 &data_word(0xA5F3F689); # rep movsd
875 &set_label("skip_ecopy"); 2118 &set_label("skip_copy");
876 2119
877 &mov ($acc,$s0);
878 &mov ($key,16); 2120 &mov ($key,16);
879 &align (4); 2121 &set_label("prefetch_tbl",4);
880 &set_label("prefetch_te"); 2122 &mov ($s0,&DWP(0,$tbl));
881 &mov ($s0,&DWP(0,"ebp")); 2123 &mov ($s1,&DWP(32,$tbl));
882 &mov ($s1,&DWP(32,"ebp")); 2124 &mov ($s2,&DWP(64,$tbl));
883 &mov ($s2,&DWP(64,"ebp")); 2125 &mov ($acc,&DWP(96,$tbl));
884 &mov ($s3,&DWP(96,"ebp")); 2126 &lea ($tbl,&DWP(128,$tbl));
885 &lea ("ebp",&DWP(128,"ebp")); 2127 &sub ($key,1);
886 &dec ($key); 2128 &jnz (&label("prefetch_tbl"));
887 &jnz (&label("prefetch_te")); 2129 &sub ($tbl,2048);
888 &sub ("ebp",2048); 2130
889 2131 &mov ($acc,$_inp);
890 &mov ($s2,$_len);
891 &mov ($key,$_ivp); 2132 &mov ($key,$_ivp);
892 &test ($s2,0xFFFFFFF0);
893 &jz (&label("enc_tail")); # short input...
894 2133
2134 &cmp ($s3,0);
2135 &je (&label("fast_decrypt"));
2136
2137#----------------------------- ENCRYPT -----------------------------#
895 &mov ($s0,&DWP(0,$key)); # load iv 2138 &mov ($s0,&DWP(0,$key)); # load iv
896 &mov ($s1,&DWP(4,$key)); 2139 &mov ($s1,&DWP(4,$key));
897 2140
898 &align (4); 2141 &set_label("fast_enc_loop",16);
899 &set_label("enc_loop");
900 &mov ($s2,&DWP(8,$key)); 2142 &mov ($s2,&DWP(8,$key));
901 &mov ($s3,&DWP(12,$key)); 2143 &mov ($s3,&DWP(12,$key));
902 2144
@@ -916,22 +2158,16 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
916 &mov (&DWP(8,$key),$s2); 2158 &mov (&DWP(8,$key),$s2);
917 &mov (&DWP(12,$key),$s3); 2159 &mov (&DWP(12,$key),$s3);
918 2160
2161 &lea ($acc,&DWP(16,$acc)); # advance inp
919 &mov ($s2,$_len); # load len 2162 &mov ($s2,$_len); # load len
920
921 &lea ($acc,&DWP(16,$acc));
922 &mov ($_inp,$acc); # save inp 2163 &mov ($_inp,$acc); # save inp
923 2164 &lea ($s3,&DWP(16,$key)); # advance out
924 &lea ($s3,&DWP(16,$key));
925 &mov ($_out,$s3); # save out 2165 &mov ($_out,$s3); # save out
926 2166 &sub ($s2,16); # decrease len
927 &sub ($s2,16);
928 &test ($s2,0xFFFFFFF0);
929 &mov ($_len,$s2); # save len 2167 &mov ($_len,$s2); # save len
930 &jnz (&label("enc_loop")); 2168 &jnz (&label("fast_enc_loop"));
931 &test ($s2,15);
932 &jnz (&label("enc_tail"));
933 &mov ($acc,$_ivp); # load ivp 2169 &mov ($acc,$_ivp); # load ivp
934 &mov ($s2,&DWP(8,$key)); # restore last dwords 2170 &mov ($s2,&DWP(8,$key)); # restore last 2 dwords
935 &mov ($s3,&DWP(12,$key)); 2171 &mov ($s3,&DWP(12,$key));
936 &mov (&DWP(0,$acc),$s0); # save ivec 2172 &mov (&DWP(0,$acc),$s0); # save ivec
937 &mov (&DWP(4,$acc),$s1); 2173 &mov (&DWP(4,$acc),$s1);
@@ -949,125 +2185,20 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
949 &set_label("skip_ezero") 2185 &set_label("skip_ezero")
950 &mov ("esp",$_esp); 2186 &mov ("esp",$_esp);
951 &popf (); 2187 &popf ();
952 &set_label("enc_out"); 2188 &set_label("drop_out");
953 &function_end_A(); 2189 &function_end_A();
954 &pushf (); # kludge, never executed 2190 &pushf (); # kludge, never executed
955 2191
956 &align (4);
957 &set_label("enc_tail");
958 &mov ($s0,$key eq "edi" ? $key : "");
959 &mov ($key,$_out); # load out
960 &push ($s0); # push ivp
961 &mov ($s1,16);
962 &sub ($s1,$s2);
963 &cmp ($key,$acc); # compare with inp
964 &je (&label("enc_in_place"));
965 &align (4);
966 &data_word(0xA4F3F689); # rep movsb # copy input
967 &jmp (&label("enc_skip_in_place"));
968 &set_label("enc_in_place");
969 &lea ($key,&DWP(0,$key,$s2));
970 &set_label("enc_skip_in_place");
971 &mov ($s2,$s1);
972 &xor ($s0,$s0);
973 &align (4);
974 &data_word(0xAAF3F689); # rep stosb # zero tail
975 &pop ($key); # pop ivp
976
977 &mov ($acc,$_out); # output as input
978 &mov ($s0,&DWP(0,$key));
979 &mov ($s1,&DWP(4,$key));
980 &mov ($_len,16); # len=16
981 &jmp (&label("enc_loop")); # one more spin...
982
983#----------------------------- DECRYPT -----------------------------# 2192#----------------------------- DECRYPT -----------------------------#
984&align (4); 2193&set_label("fast_decrypt",16);
985&set_label("DECRYPT");
986 &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));
987
988 # allocate aligned stack frame...
989 &lea ($key,&DWP(-64-244,"esp"));
990 &and ($key,-64);
991
992 # ... and make sure it doesn't alias with AES_Td modulo 4096
993 &mov ($s0,"ebp");
994 &lea ($s1,&DWP(2048+256,"ebp"));
995 &mov ($s3,$key);
996 &and ($s0,0xfff); # s = %ebp&0xfff
997 &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
998 &and ($s3,0xfff); # p = %esp&0xfff
999
1000 &cmp ($s3,$s1); # if (p>=e) %esp =- (p-e);
1001 &jb (&label("td_break_out"));
1002 &sub ($s3,$s1);
1003 &sub ($key,$s3);
1004 &jmp (&label("td_ok"));
1005 &set_label("td_break_out"); # else %esp -= (p-s)&0xfff + framesz;
1006 &sub ($s3,$s0);
1007 &and ($s3,0xfff);
1008 &add ($s3,64+256);
1009 &sub ($key,$s3);
1010 &align (4);
1011 &set_label("td_ok");
1012
1013 &mov ($s0,&wparam(0)); # load inp
1014 &mov ($s1,&wparam(1)); # load out
1015 &mov ($s3,&wparam(3)); # load key
1016 &mov ($acc,&wparam(4)); # load ivp
1017
1018 &exch ("esp",$key);
1019 &add ("esp",4); # reserve for return address!
1020 &mov ($_esp,$key); # save %esp
1021
1022 &mov ($_inp,$s0); # save copy of inp
1023 &mov ($_out,$s1); # save copy of out
1024 &mov ($_len,$s2); # save copy of len
1025 &mov ($_key,$s3); # save copy of key
1026 &mov ($_ivp,$acc); # save copy of ivp
1027
1028 &mov ($mark,0); # copy of aes_key->rounds = 0;
1029 if ($compromise) {
1030 &cmp ($s2,$compromise);
1031 &jb (&label("skip_dcopy"));
1032 }
1033 # do we copy key schedule to stack?
1034 &mov ($s1 eq "ebx" ? $s1 : "",$s3);
1035 &mov ($s2 eq "ecx" ? $s2 : "",244/4);
1036 &sub ($s1,"ebp");
1037 &mov ("esi",$s3);
1038 &and ($s1,0xfff);
1039 &lea ("edi",$aes_key);
1040 &cmp ($s1,2048+256);
1041 &jb (&label("do_dcopy"));
1042 &cmp ($s1,4096-244);
1043 &jb (&label("skip_dcopy"));
1044 &align (4);
1045 &set_label("do_dcopy");
1046 &mov ($_key,"edi");
1047 &data_word(0xA5F3F689); # rep movsd
1048 &set_label("skip_dcopy");
1049
1050 &mov ($acc,$s0);
1051 &mov ($key,18);
1052 &align (4);
1053 &set_label("prefetch_td");
1054 &mov ($s0,&DWP(0,"ebp"));
1055 &mov ($s1,&DWP(32,"ebp"));
1056 &mov ($s2,&DWP(64,"ebp"));
1057 &mov ($s3,&DWP(96,"ebp"));
1058 &lea ("ebp",&DWP(128,"ebp"));
1059 &dec ($key);
1060 &jnz (&label("prefetch_td"));
1061 &sub ("ebp",2048+256);
1062 2194
1063 &cmp ($acc,$_out); 2195 &cmp ($acc,$_out);
1064 &je (&label("dec_in_place")); # in-place processing... 2196 &je (&label("fast_dec_in_place")); # in-place processing...
1065 2197
1066 &mov ($key,$_ivp); # load ivp
1067 &mov ($_tmp,$key); 2198 &mov ($_tmp,$key);
1068 2199
1069 &align (4); 2200 &align (4);
1070 &set_label("dec_loop"); 2201 &set_label("fast_dec_loop",16);
1071 &mov ($s0,&DWP(0,$acc)); # read input 2202 &mov ($s0,&DWP(0,$acc)); # read input
1072 &mov ($s1,&DWP(4,$acc)); 2203 &mov ($s1,&DWP(4,$acc));
1073 &mov ($s2,&DWP(8,$acc)); 2204 &mov ($s2,&DWP(8,$acc));
@@ -1083,27 +2214,24 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
1083 &xor ($s2,&DWP(8,$key)); 2214 &xor ($s2,&DWP(8,$key));
1084 &xor ($s3,&DWP(12,$key)); 2215 &xor ($s3,&DWP(12,$key));
1085 2216
1086 &sub ($acc,16);
1087 &jc (&label("dec_partial"));
1088 &mov ($_len,$acc); # save len
1089 &mov ($acc,$_inp); # load inp
1090 &mov ($key,$_out); # load out 2217 &mov ($key,$_out); # load out
2218 &mov ($acc,$_inp); # load inp
1091 2219
1092 &mov (&DWP(0,$key),$s0); # write output 2220 &mov (&DWP(0,$key),$s0); # write output
1093 &mov (&DWP(4,$key),$s1); 2221 &mov (&DWP(4,$key),$s1);
1094 &mov (&DWP(8,$key),$s2); 2222 &mov (&DWP(8,$key),$s2);
1095 &mov (&DWP(12,$key),$s3); 2223 &mov (&DWP(12,$key),$s3);
1096 2224
2225 &mov ($s2,$_len); # load len
1097 &mov ($_tmp,$acc); # save ivp 2226 &mov ($_tmp,$acc); # save ivp
1098 &lea ($acc,&DWP(16,$acc)); 2227 &lea ($acc,&DWP(16,$acc)); # advance inp
1099 &mov ($_inp,$acc); # save inp 2228 &mov ($_inp,$acc); # save inp
1100 2229 &lea ($key,&DWP(16,$key)); # advance out
1101 &lea ($key,&DWP(16,$key));
1102 &mov ($_out,$key); # save out 2230 &mov ($_out,$key); # save out
1103 2231 &sub ($s2,16); # decrease len
1104 &jnz (&label("dec_loop")); 2232 &mov ($_len,$s2); # save len
2233 &jnz (&label("fast_dec_loop"));
1105 &mov ($key,$_tmp); # load temp ivp 2234 &mov ($key,$_tmp); # load temp ivp
1106 &set_label("dec_end");
1107 &mov ($acc,$_ivp); # load user ivp 2235 &mov ($acc,$_ivp); # load user ivp
1108 &mov ($s0,&DWP(0,$key)); # load iv 2236 &mov ($s0,&DWP(0,$key)); # load iv
1109 &mov ($s1,&DWP(4,$key)); 2237 &mov ($s1,&DWP(4,$key));
@@ -1113,31 +2241,16 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
1113 &mov (&DWP(4,$acc),$s1); 2241 &mov (&DWP(4,$acc),$s1);
1114 &mov (&DWP(8,$acc),$s2); 2242 &mov (&DWP(8,$acc),$s2);
1115 &mov (&DWP(12,$acc),$s3); 2243 &mov (&DWP(12,$acc),$s3);
1116 &jmp (&label("dec_out")); 2244 &jmp (&label("fast_dec_out"));
1117 2245
1118 &align (4); 2246 &set_label("fast_dec_in_place",16);
1119 &set_label("dec_partial"); 2247 &set_label("fast_dec_in_place_loop");
1120 &lea ($key,$ivec);
1121 &mov (&DWP(0,$key),$s0); # dump output to stack
1122 &mov (&DWP(4,$key),$s1);
1123 &mov (&DWP(8,$key),$s2);
1124 &mov (&DWP(12,$key),$s3);
1125 &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc));
1126 &mov ($acc eq "esi" ? $acc : "",$key);
1127 &mov ($key eq "edi" ? $key : "",$_out); # load out
1128 &data_word(0xA4F3F689); # rep movsb # copy output
1129 &mov ($key,$_inp); # use inp as temp ivp
1130 &jmp (&label("dec_end"));
1131
1132 &align (4);
1133 &set_label("dec_in_place");
1134 &set_label("dec_in_place_loop");
1135 &lea ($key,$ivec);
1136 &mov ($s0,&DWP(0,$acc)); # read input 2248 &mov ($s0,&DWP(0,$acc)); # read input
1137 &mov ($s1,&DWP(4,$acc)); 2249 &mov ($s1,&DWP(4,$acc));
1138 &mov ($s2,&DWP(8,$acc)); 2250 &mov ($s2,&DWP(8,$acc));
1139 &mov ($s3,&DWP(12,$acc)); 2251 &mov ($s3,&DWP(12,$acc));
1140 2252
2253 &lea ($key,$ivec);
1141 &mov (&DWP(0,$key),$s0); # copy to temp 2254 &mov (&DWP(0,$key),$s0); # copy to temp
1142 &mov (&DWP(4,$key),$s1); 2255 &mov (&DWP(4,$key),$s1);
1143 &mov (&DWP(8,$key),$s2); 2256 &mov (&DWP(8,$key),$s2);
@@ -1158,7 +2271,7 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
1158 &mov (&DWP(8,$acc),$s2); 2271 &mov (&DWP(8,$acc),$s2);
1159 &mov (&DWP(12,$acc),$s3); 2272 &mov (&DWP(12,$acc),$s3);
1160 2273
1161 &lea ($acc,&DWP(16,$acc)); 2274 &lea ($acc,&DWP(16,$acc)); # advance out
1162 &mov ($_out,$acc); # save out 2275 &mov ($_out,$acc); # save out
1163 2276
1164 &lea ($acc,$ivec); 2277 &lea ($acc,$ivec);
@@ -1173,40 +2286,340 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
1173 &mov (&DWP(12,$key),$s3); 2286 &mov (&DWP(12,$key),$s3);
1174 2287
1175 &mov ($acc,$_inp); # load inp 2288 &mov ($acc,$_inp); # load inp
2289 &mov ($s2,$_len); # load len
2290 &lea ($acc,&DWP(16,$acc)); # advance inp
2291 &mov ($_inp,$acc); # save inp
2292 &sub ($s2,16); # decrease len
2293 &mov ($_len,$s2); # save len
2294 &jnz (&label("fast_dec_in_place_loop"));
2295
2296 &set_label("fast_dec_out",4);
2297 &cmp ($mark,0); # was the key schedule copied?
2298 &mov ("edi",$_key);
2299 &je (&label("skip_dzero"));
2300 # zero copy of key schedule
2301 &mov ("ecx",240/4);
2302 &xor ("eax","eax");
2303 &align (4);
2304 &data_word(0xABF3F689); # rep stosd
2305 &set_label("skip_dzero")
2306 &mov ("esp",$_esp);
2307 &popf ();
2308 &function_end_A();
2309 &pushf (); # kludge, never executed
2310
2311#--------------------------- SLOW ROUTINE ---------------------------#
2312&set_label("slow_way",16);
2313
2314 &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap
2315 &mov ($key,&wparam(3)); # load key
2316
2317 # pre-allocate aligned stack frame...
2318 &lea ($acc,&DWP(-80,"esp"));
2319 &and ($acc,-64);
2320
2321 # ... and make sure it doesn't alias with $key modulo 1024
2322 &lea ($s1,&DWP(-80-63,$key));
2323 &sub ($s1,$acc);
2324 &neg ($s1);
2325 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
2326 &sub ($acc,$s1);
2327
2328 # pick S-box copy which can't overlap with stack frame or $key
2329 &lea ($s1,&DWP(768,$acc));
2330 &sub ($s1,$tbl);
2331 &and ($s1,0x300);
2332 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
2333
2334 &lea ($s3,&wparam(0)); # pointer to parameter block
2335
2336 &exch ("esp",$acc);
2337 &add ("esp",4); # reserve for return address!
2338 &mov ($_tbl,$tbl); # save %ebp
2339 &mov ($_esp,$acc); # save %esp
2340 &mov ($_tmp,$s0); # save OPENSSL_ia32cap
2341
2342 &mov ($s0,&DWP(0,$s3)); # load inp
2343 &mov ($s1,&DWP(4,$s3)); # load out
2344 #&mov ($s2,&DWP(8,$s3)); # load len
2345 #&mov ($key,&DWP(12,$s3)); # load key
2346 &mov ($acc,&DWP(16,$s3)); # load ivp
2347 &mov ($s3,&DWP(20,$s3)); # load enc flag
2348
2349 &mov ($_inp,$s0); # save copy of inp
2350 &mov ($_out,$s1); # save copy of out
2351 &mov ($_len,$s2); # save copy of len
2352 &mov ($_key,$key); # save copy of key
2353 &mov ($_ivp,$acc); # save copy of ivp
2354
2355 &mov ($key,$acc);
2356 &mov ($acc,$s0);
2357
2358 &cmp ($s3,0);
2359 &je (&label("slow_decrypt"));
2360
2361#--------------------------- SLOW ENCRYPT ---------------------------#
2362 &cmp ($s2,16);
2363 &mov ($s3,$s1);
2364 &jb (&label("slow_enc_tail"));
2365
2366 if (!$x86only) {
2367 &bt ($_tmp,25); # check for SSE bit
2368 &jnc (&label("slow_enc_x86"));
1176 2369
1177 &lea ($acc,&DWP(16,$acc)); 2370 &movq ("mm0",&QWP(0,$key)); # load iv
2371 &movq ("mm4",&QWP(8,$key));
2372
2373 &set_label("slow_enc_loop_sse",16);
2374 &pxor ("mm0",&QWP(0,$acc)); # xor input data
2375 &pxor ("mm4",&QWP(8,$acc));
2376
2377 &mov ($key,$_key);
2378 &call ("_sse_AES_encrypt_compact");
2379
2380 &mov ($acc,$_inp); # load inp
2381 &mov ($key,$_out); # load out
2382 &mov ($s2,$_len); # load len
2383
2384 &movq (&QWP(0,$key),"mm0"); # save output data
2385 &movq (&QWP(8,$key),"mm4");
2386
2387 &lea ($acc,&DWP(16,$acc)); # advance inp
1178 &mov ($_inp,$acc); # save inp 2388 &mov ($_inp,$acc); # save inp
2389 &lea ($s3,&DWP(16,$key)); # advance out
2390 &mov ($_out,$s3); # save out
2391 &sub ($s2,16); # decrease len
2392 &cmp ($s2,16);
2393 &mov ($_len,$s2); # save len
2394 &jae (&label("slow_enc_loop_sse"));
2395 &test ($s2,15);
2396 &jnz (&label("slow_enc_tail"));
2397 &mov ($acc,$_ivp); # load ivp
2398 &movq (&QWP(0,$acc),"mm0"); # save ivec
2399 &movq (&QWP(8,$acc),"mm4");
2400 &emms ();
2401 &mov ("esp",$_esp);
2402 &popf ();
2403 &function_end_A();
2404 &pushf (); # kludge, never executed
2405 }
2406 &set_label("slow_enc_x86",16);
2407 &mov ($s0,&DWP(0,$key)); # load iv
2408 &mov ($s1,&DWP(4,$key));
2409
2410 &set_label("slow_enc_loop_x86",4);
2411 &mov ($s2,&DWP(8,$key));
2412 &mov ($s3,&DWP(12,$key));
2413
2414 &xor ($s0,&DWP(0,$acc)); # xor input data
2415 &xor ($s1,&DWP(4,$acc));
2416 &xor ($s2,&DWP(8,$acc));
2417 &xor ($s3,&DWP(12,$acc));
2418
2419 &mov ($key,$_key); # load key
2420 &call ("_x86_AES_encrypt_compact");
2421
2422 &mov ($acc,$_inp); # load inp
2423 &mov ($key,$_out); # load out
2424
2425 &mov (&DWP(0,$key),$s0); # save output data
2426 &mov (&DWP(4,$key),$s1);
2427 &mov (&DWP(8,$key),$s2);
2428 &mov (&DWP(12,$key),$s3);
1179 2429
1180 &mov ($s2,$_len); # load len 2430 &mov ($s2,$_len); # load len
1181 &sub ($s2,16); 2431 &lea ($acc,&DWP(16,$acc)); # advance inp
1182 &jc (&label("dec_in_place_partial")); 2432 &mov ($_inp,$acc); # save inp
2433 &lea ($s3,&DWP(16,$key)); # advance out
2434 &mov ($_out,$s3); # save out
2435 &sub ($s2,16); # decrease len
2436 &cmp ($s2,16);
1183 &mov ($_len,$s2); # save len 2437 &mov ($_len,$s2); # save len
1184 &jnz (&label("dec_in_place_loop")); 2438 &jae (&label("slow_enc_loop_x86"));
1185 &jmp (&label("dec_out")); 2439 &test ($s2,15);
1186 2440 &jnz (&label("slow_enc_tail"));
1187 &align (4); 2441 &mov ($acc,$_ivp); # load ivp
1188 &set_label("dec_in_place_partial"); 2442 &mov ($s2,&DWP(8,$key)); # restore last dwords
1189 # one can argue if this is actually required... 2443 &mov ($s3,&DWP(12,$key));
1190 &mov ($key eq "edi" ? $key : "",$_out); 2444 &mov (&DWP(0,$acc),$s0); # save ivec
1191 &lea ($acc eq "esi" ? $acc : "",$ivec); 2445 &mov (&DWP(4,$acc),$s1);
2446 &mov (&DWP(8,$acc),$s2);
2447 &mov (&DWP(12,$acc),$s3);
2448
2449 &mov ("esp",$_esp);
2450 &popf ();
2451 &function_end_A();
2452 &pushf (); # kludge, never executed
2453
2454 &set_label("slow_enc_tail",16);
2455 &emms () if (!$x86only);
2456 &mov ($key eq "edi"? $key:"",$s3); # load out to edi
2457 &mov ($s1,16);
2458 &sub ($s1,$s2);
2459 &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp
2460 &je (&label("enc_in_place"));
2461 &align (4);
2462 &data_word(0xA4F3F689); # rep movsb # copy input
2463 &jmp (&label("enc_skip_in_place"));
2464 &set_label("enc_in_place");
1192 &lea ($key,&DWP(0,$key,$s2)); 2465 &lea ($key,&DWP(0,$key,$s2));
1193 &lea ($acc,&DWP(16,$acc,$s2)); 2466 &set_label("enc_skip_in_place");
1194 &neg ($s2 eq "ecx" ? $s2 : ""); 2467 &mov ($s2,$s1);
1195 &data_word(0xA4F3F689); # rep movsb # restore tail 2468 &xor ($s0,$s0);
1196 2469 &align (4);
1197 &align (4); 2470 &data_word(0xAAF3F689); # rep stosb # zero tail
1198 &set_label("dec_out"); 2471
1199 &cmp ($mark,0); # was the key schedule copied? 2472 &mov ($key,$_ivp); # restore ivp
1200 &mov ("edi",$_key); 2473 &mov ($acc,$s3); # output as input
1201 &je (&label("skip_dzero")); 2474 &mov ($s0,&DWP(0,$key));
1202 # zero copy of key schedule 2475 &mov ($s1,&DWP(4,$key));
1203 &mov ("ecx",240/4); 2476 &mov ($_len,16); # len=16
1204 &xor ("eax","eax"); 2477 &jmp (&label("slow_enc_loop_x86")); # one more spin...
1205 &align (4); 2478
1206 &data_word(0xABF3F689); # rep stosd 2479#--------------------------- SLOW DECRYPT ---------------------------#
1207 &set_label("skip_dzero") 2480&set_label("slow_decrypt",16);
1208 &mov ("esp",$_esp); 2481 if (!$x86only) {
1209 &popf (); 2482 &bt ($_tmp,25); # check for SSE bit
2483 &jnc (&label("slow_dec_loop_x86"));
2484
2485 &set_label("slow_dec_loop_sse",4);
2486 &movq ("mm0",&QWP(0,$acc)); # read input
2487 &movq ("mm4",&QWP(8,$acc));
2488
2489 &mov ($key,$_key);
2490 &call ("_sse_AES_decrypt_compact");
2491
2492 &mov ($acc,$_inp); # load inp
2493 &lea ($s0,$ivec);
2494 &mov ($s1,$_out); # load out
2495 &mov ($s2,$_len); # load len
2496 &mov ($key,$_ivp); # load ivp
2497
2498 &movq ("mm1",&QWP(0,$acc)); # re-read input
2499 &movq ("mm5",&QWP(8,$acc));
2500
2501 &pxor ("mm0",&QWP(0,$key)); # xor iv
2502 &pxor ("mm4",&QWP(8,$key));
2503
2504 &movq (&QWP(0,$key),"mm1"); # copy input to iv
2505 &movq (&QWP(8,$key),"mm5");
2506
2507 &sub ($s2,16); # decrease len
2508 &jc (&label("slow_dec_partial_sse"));
2509
2510 &movq (&QWP(0,$s1),"mm0"); # write output
2511 &movq (&QWP(8,$s1),"mm4");
2512
2513 &lea ($s1,&DWP(16,$s1)); # advance out
2514 &mov ($_out,$s1); # save out
2515 &lea ($acc,&DWP(16,$acc)); # advance inp
2516 &mov ($_inp,$acc); # save inp
2517 &mov ($_len,$s2); # save len
2518 &jnz (&label("slow_dec_loop_sse"));
2519 &emms ();
2520 &mov ("esp",$_esp);
2521 &popf ();
2522 &function_end_A();
2523 &pushf (); # kludge, never executed
2524
2525 &set_label("slow_dec_partial_sse",16);
2526 &movq (&QWP(0,$s0),"mm0"); # save output to temp
2527 &movq (&QWP(8,$s0),"mm4");
2528 &emms ();
2529
2530 &add ($s2 eq "ecx" ? "ecx":"",16);
2531 &mov ("edi",$s1); # out
2532 &mov ("esi",$s0); # temp
2533 &align (4);
2534 &data_word(0xA4F3F689); # rep movsb # copy partial output
2535
2536 &mov ("esp",$_esp);
2537 &popf ();
2538 &function_end_A();
2539 &pushf (); # kludge, never executed
2540 }
2541 &set_label("slow_dec_loop_x86",16);
2542 &mov ($s0,&DWP(0,$acc)); # read input
2543 &mov ($s1,&DWP(4,$acc));
2544 &mov ($s2,&DWP(8,$acc));
2545 &mov ($s3,&DWP(12,$acc));
2546
2547 &lea ($key,$ivec);
2548 &mov (&DWP(0,$key),$s0); # copy to temp
2549 &mov (&DWP(4,$key),$s1);
2550 &mov (&DWP(8,$key),$s2);
2551 &mov (&DWP(12,$key),$s3);
2552
2553 &mov ($key,$_key); # load key
2554 &call ("_x86_AES_decrypt_compact");
2555
2556 &mov ($key,$_ivp); # load ivp
2557 &mov ($acc,$_len); # load len
2558 &xor ($s0,&DWP(0,$key)); # xor iv
2559 &xor ($s1,&DWP(4,$key));
2560 &xor ($s2,&DWP(8,$key));
2561 &xor ($s3,&DWP(12,$key));
2562
2563 &sub ($acc,16);
2564 &jc (&label("slow_dec_partial_x86"));
2565
2566 &mov ($_len,$acc); # save len
2567 &mov ($acc,$_out); # load out
2568
2569 &mov (&DWP(0,$acc),$s0); # write output
2570 &mov (&DWP(4,$acc),$s1);
2571 &mov (&DWP(8,$acc),$s2);
2572 &mov (&DWP(12,$acc),$s3);
2573
2574 &lea ($acc,&DWP(16,$acc)); # advance out
2575 &mov ($_out,$acc); # save out
2576
2577 &lea ($acc,$ivec);
2578 &mov ($s0,&DWP(0,$acc)); # read temp
2579 &mov ($s1,&DWP(4,$acc));
2580 &mov ($s2,&DWP(8,$acc));
2581 &mov ($s3,&DWP(12,$acc));
2582
2583 &mov (&DWP(0,$key),$s0); # copy it to iv
2584 &mov (&DWP(4,$key),$s1);
2585 &mov (&DWP(8,$key),$s2);
2586 &mov (&DWP(12,$key),$s3);
2587
2588 &mov ($acc,$_inp); # load inp
2589 &lea ($acc,&DWP(16,$acc)); # advance inp
2590 &mov ($_inp,$acc); # save inp
2591 &jnz (&label("slow_dec_loop_x86"));
2592 &mov ("esp",$_esp);
2593 &popf ();
2594 &function_end_A();
2595 &pushf (); # kludge, never executed
2596
2597 &set_label("slow_dec_partial_x86",16);
2598 &lea ($acc,$ivec);
2599 &mov (&DWP(0,$acc),$s0); # save output to temp
2600 &mov (&DWP(4,$acc),$s1);
2601 &mov (&DWP(8,$acc),$s2);
2602 &mov (&DWP(12,$acc),$s3);
2603
2604 &mov ($acc,$_inp);
2605 &mov ($s0,&DWP(0,$acc)); # re-read input
2606 &mov ($s1,&DWP(4,$acc));
2607 &mov ($s2,&DWP(8,$acc));
2608 &mov ($s3,&DWP(12,$acc));
2609
2610 &mov (&DWP(0,$key),$s0); # copy it to iv
2611 &mov (&DWP(4,$key),$s1);
2612 &mov (&DWP(8,$key),$s2);
2613 &mov (&DWP(12,$key),$s3);
2614
2615 &mov ("ecx",$_len);
2616 &mov ("edi",$_out);
2617 &lea ("esi",$ivec);
2618 &align (4);
2619 &data_word(0xA4F3F689); # rep movsb # copy partial output
2620
2621 &mov ("esp",$_esp);
2622 &popf ();
1210&function_end("AES_cbc_encrypt"); 2623&function_end("AES_cbc_encrypt");
1211} 2624}
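Before taking the fast path, AES_cbc_encrypt above also nudges its stack frame so that, within a 4KB page, it does not sit on top of the 2048+256 bytes of lookup table; that keeps the frame, the (possibly copied) key schedule and the tables from fighting over the same cache lines. The tbl_break_out/tbl_ok arithmetic as scalar Perl (a sketch over hypothetical addresses, not part of the module):

    my ($frame, $tbl) = (0xbfffd000, 0x08070000);   # stand-in %esp and table base
    my $s = $tbl & 0xfff;                           # tables' offset within the page
    my $e = ($tbl + 2048 + 256) & 0xfff;            # where they end, modulo 4K
    my $p = $frame & 0xfff;                         # where the frame would start
    if ($p >= $e) { $frame -= $p - $e; }                      # drop it down to the tables' end
    else          { $frame -= (($p - $s) & 0xfff) + 384; }    # or clear below their start, plus frame size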
1212 2625
@@ -1215,35 +2628,31 @@ my $mark=&DWP(60+240,"esp"); #copy of aes_key->rounds
1215sub enckey() 2628sub enckey()
1216{ 2629{
1217 &movz ("esi",&LB("edx")); # rk[i]>>0 2630 &movz ("esi",&LB("edx")); # rk[i]>>0
1218 &mov ("ebx",&DWP(2,"ebp","esi",8)); 2631 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1219 &movz ("esi",&HB("edx")); # rk[i]>>8 2632 &movz ("esi",&HB("edx")); # rk[i]>>8
1220 &and ("ebx",0xFF000000); 2633 &shl ("ebx",24);
1221 &xor ("eax","ebx"); 2634 &xor ("eax","ebx");
1222 2635
1223 &mov ("ebx",&DWP(2,"ebp","esi",8)); 2636 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1224 &shr ("edx",16); 2637 &shr ("edx",16);
1225 &and ("ebx",0x000000FF);
1226 &movz ("esi",&LB("edx")); # rk[i]>>16 2638 &movz ("esi",&LB("edx")); # rk[i]>>16
1227 &xor ("eax","ebx"); 2639 &xor ("eax","ebx");
1228 2640
1229 &mov ("ebx",&DWP(0,"ebp","esi",8)); 2641 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1230 &movz ("esi",&HB("edx")); # rk[i]>>24 2642 &movz ("esi",&HB("edx")); # rk[i]>>24
1231 &and ("ebx",0x0000FF00); 2643 &shl ("ebx",8);
1232 &xor ("eax","ebx"); 2644 &xor ("eax","ebx");
1233 2645
1234 &mov ("ebx",&DWP(0,"ebp","esi",8)); 2646 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1235 &and ("ebx",0x00FF0000); 2647 &shl ("ebx",16);
1236 &xor ("eax","ebx"); 2648 &xor ("eax","ebx");
1237 2649
1238 &xor ("eax",&DWP(2048,"ebp","ecx",4)); # rcon 2650 &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon
1239} 2651}
1240 2652
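enckey() now takes its SubWord bytes from the 256-byte Te4 copy through the -128-biased $tbl pointer instead of masking 32-bit Te entries; the four lookups and shifts together implement SubWord(RotWord(.)) on the previous round-key word before the rcon is xored in. One expansion step over little-endian words, as the module stores them (a scalar sketch; @sbox stands for the 256-byte S-box and is not part of the module):

    sub expand_step {          # rk[4] = rk[0] ^ SubWord(RotWord(rk[3])) ^ rcon
        my ($rk0, $rk3, $rcon, @sbox) = @_;
        my $t = $sbox[($rk3 >>  8) & 0xff]
              | $sbox[($rk3 >> 16) & 0xff] <<  8
              | $sbox[($rk3 >> 24) & 0xff] << 16
              | $sbox[ $rk3        & 0xff] << 24;
        return $rk0 ^ $t ^ $rcon;
    }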
1241# int AES_set_encrypt_key(const unsigned char *userKey, const int bits, 2653&function_begin("_x86_AES_set_encrypt_key");
1242# AES_KEY *key) 2654 &mov ("esi",&wparam(1)); # user supplied key
1243&public_label("AES_Te"); 2655 &mov ("edi",&wparam(3)); # private key schedule
1244&function_begin("AES_set_encrypt_key");
1245 &mov ("esi",&wparam(0)); # user supplied key
1246 &mov ("edi",&wparam(2)); # private key schedule
1247 2656
1248 &test ("esi",-1); 2657 &test ("esi",-1);
1249 &jz (&label("badpointer")); 2658 &jz (&label("badpointer"));
@@ -1252,10 +2661,21 @@ sub enckey()
1252 2661
1253 &call (&label("pic_point")); 2662 &call (&label("pic_point"));
1254 &set_label("pic_point"); 2663 &set_label("pic_point");
1255 &blindpop("ebp"); 2664 &blindpop($tbl);
1256 &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); 2665 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
1257 2666 &lea ($tbl,&DWP(2048+128,$tbl));
1258 &mov ("ecx",&wparam(1)); # number of bits in key 2667
2668 # prefetch Te4
2669 &mov ("eax",&DWP(0-128,$tbl));
2670 &mov ("ebx",&DWP(32-128,$tbl));
2671 &mov ("ecx",&DWP(64-128,$tbl));
2672 &mov ("edx",&DWP(96-128,$tbl));
2673 &mov ("eax",&DWP(128-128,$tbl));
2674 &mov ("ebx",&DWP(160-128,$tbl));
2675 &mov ("ecx",&DWP(192-128,$tbl));
2676 &mov ("edx",&DWP(224-128,$tbl));
2677
2678 &mov ("ecx",&wparam(2)); # number of bits in key
1259 &cmp ("ecx",128); 2679 &cmp ("ecx",128);
1260 &je (&label("10rounds")); 2680 &je (&label("10rounds"));
1261 &cmp ("ecx",192); 2681 &cmp ("ecx",192);
@@ -1394,24 +2814,23 @@ sub enckey()
1394 &mov ("edx","eax"); 2814 &mov ("edx","eax");
1395 &mov ("eax",&DWP(16,"edi")); # rk[4] 2815 &mov ("eax",&DWP(16,"edi")); # rk[4]
1396 &movz ("esi",&LB("edx")); # rk[11]>>0 2816 &movz ("esi",&LB("edx")); # rk[11]>>0
1397 &mov ("ebx",&DWP(2,"ebp","esi",8)); 2817 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1398 &movz ("esi",&HB("edx")); # rk[11]>>8 2818 &movz ("esi",&HB("edx")); # rk[11]>>8
1399 &and ("ebx",0x000000FF);
1400 &xor ("eax","ebx"); 2819 &xor ("eax","ebx");
1401 2820
1402 &mov ("ebx",&DWP(0,"ebp","esi",8)); 2821 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1403 &shr ("edx",16); 2822 &shr ("edx",16);
1404 &and ("ebx",0x0000FF00); 2823 &shl ("ebx",8);
1405 &movz ("esi",&LB("edx")); # rk[11]>>16 2824 &movz ("esi",&LB("edx")); # rk[11]>>16
1406 &xor ("eax","ebx"); 2825 &xor ("eax","ebx");
1407 2826
1408 &mov ("ebx",&DWP(0,"ebp","esi",8)); 2827 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1409 &movz ("esi",&HB("edx")); # rk[11]>>24 2828 &movz ("esi",&HB("edx")); # rk[11]>>24
1410 &and ("ebx",0x00FF0000); 2829 &shl ("ebx",16);
1411 &xor ("eax","ebx"); 2830 &xor ("eax","ebx");
1412 2831
1413 &mov ("ebx",&DWP(2,"ebp","esi",8)); 2832 &movz ("ebx",&BP(-128,$tbl,"esi",1));
1414 &and ("ebx",0xFF000000); 2833 &shl ("ebx",24);
1415 &xor ("eax","ebx"); 2834 &xor ("eax","ebx");
1416 2835
1417 &mov (&DWP(48,"edi"),"eax"); # rk[12] 2836 &mov (&DWP(48,"edi"),"eax"); # rk[12]
@@ -1433,43 +2852,74 @@ sub enckey()
1433 &set_label("badpointer"); 2852 &set_label("badpointer");
1434 &mov ("eax",-1); 2853 &mov ("eax",-1);
1435 &set_label("exit"); 2854 &set_label("exit");
1436&function_end("AES_set_encrypt_key"); 2855&function_end("_x86_AES_set_encrypt_key");
1437 2856
1438sub deckey() 2857# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
1439{ my ($i,$ptr,$te,$td) = @_; 2858# AES_KEY *key)
2859&function_begin_B("AES_set_encrypt_key");
2860 &call ("_x86_AES_set_encrypt_key");
2861 &ret ();
2862&function_end_B("AES_set_encrypt_key");
1440 2863
1441 &mov ("eax",&DWP($i,$ptr)); 2864sub deckey()
1442 &mov ("edx","eax"); 2865{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
1443 &movz ("ebx",&HB("eax")); 2866 my $tmp = $tbl;
1444 &shr ("edx",16); 2867
1445 &and ("eax",0xFF); 2868 &mov ($acc,$tp1);
1446 &movz ("eax",&BP(2,$te,"eax",8)); 2869 &and ($acc,0x80808080);
1447 &movz ("ebx",&BP(2,$te,"ebx",8)); 2870 &mov ($tmp,$acc);
1448 &mov ("eax",&DWP(0,$td,"eax",8)); 2871 &shr ($tmp,7);
1449 &xor ("eax",&DWP(3,$td,"ebx",8)); 2872 &lea ($tp2,&DWP(0,$tp1,$tp1));
1450 &movz ("ebx",&HB("edx")); 2873 &sub ($acc,$tmp);
1451 &and ("edx",0xFF); 2874 &and ($tp2,0xfefefefe);
1452 &movz ("edx",&BP(2,$te,"edx",8)); 2875 &and ($acc,0x1b1b1b1b);
1453 &movz ("ebx",&BP(2,$te,"ebx",8)); 2876 &xor ($acc,$tp2);
1454 &xor ("eax",&DWP(2,$td,"edx",8)); 2877 &mov ($tp2,$acc);
1455 &xor ("eax",&DWP(1,$td,"ebx",8)); 2878
1456 &mov (&DWP($i,$ptr),"eax"); 2879 &and ($acc,0x80808080);
2880 &mov ($tmp,$acc);
2881 &shr ($tmp,7);
2882 &lea ($tp4,&DWP(0,$tp2,$tp2));
2883 &sub ($acc,$tmp);
2884 &and ($tp4,0xfefefefe);
2885 &and ($acc,0x1b1b1b1b);
2886 &xor ($tp2,$tp1); # tp2^tp1
2887 &xor ($acc,$tp4);
2888 &mov ($tp4,$acc);
2889
2890 &and ($acc,0x80808080);
2891 &mov ($tmp,$acc);
2892 &shr ($tmp,7);
2893 &lea ($tp8,&DWP(0,$tp4,$tp4));
2894 &xor ($tp4,$tp1); # tp4^tp1
2895 &sub ($acc,$tmp);
2896 &and ($tp8,0xfefefefe);
2897 &and ($acc,0x1b1b1b1b);
2898 &rotl ($tp1,8); # = ROTATE(tp1,8)
2899 &xor ($tp8,$acc);
2900
2901 &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load
2902
2903 &xor ($tp1,$tp2);
2904 &xor ($tp2,$tp8);
2905 &xor ($tp1,$tp4);
2906 &rotl ($tp2,24);
2907 &xor ($tp4,$tp8);
2908 &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
2909 &rotl ($tp4,16);
2910 &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
2911 &rotl ($tp8,8);
2912 &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
2913 &mov ($tp2,$tmp);
2914 &xor ($tp1,$tp8); # ^= ROTATE(tp8,8)
2915
2916 &mov (&DWP(4*$i,$key),$tp1);
1457} 2917}
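The rewritten deckey() above converts one encryption round-key word to its decryption form by applying InvMixColumns to it in place: the 0x80808080/0xfefefefe/0x1b1b1b1b masks compute the per-byte GF(2^8) doubling ("xtime") without leaving the 32-bit register, and rotated combinations of the 2x/4x/8x multiples are folded back in. A minimal plain-Perl sketch of the same word transform, with helper names that are mine rather than the module's (byte order as the register holds it):

	sub xtime32 {				# per-byte GF(2^8) doubling, SWAR style
		my ($w) = @_;
		my $hi = $w & 0x80808080;		# grab the high bit of every byte
		my $d  = ($w << 1) & 0xfefefefe;	# double each byte, drop the carries
		return ($d ^ (($hi - ($hi >> 7)) & 0x1b1b1b1b)) & 0xffffffff;
	}

	sub inv_mix_word {			# what one deckey() invocation stores back
		my ($tp1) = @_;			# one 32-bit round-key word
		my $tp2 = xtime32($tp1);
		my $tp4 = xtime32($tp2);
		my $tp8 = xtime32($tp4);
		my $tp9 = $tp1 ^ $tp8;			#  9*x
		my $tpb = $tp9 ^ $tp2;			# 11*x
		my $tpd = $tp9 ^ $tp4;			# 13*x
		my $tpe = $tp2 ^ $tp4 ^ $tp8;		# 14*x
		my $rot = sub { my ($x,$n) = @_; (($x << $n) | ($x >> (32 - $n))) & 0xffffffff };
		return $tpe ^ $rot->($tp9,8) ^ $rot->($tpb,24) ^ $rot->($tpd,16);
	}

The permute loop further down runs this over round keys 1 through rounds-1 only; the two lea instructions that set up &wparam(2) compute the loop bound key+16*(rounds-1), so the first and last round keys are left untouched.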
1458 2918
1459# int AES_set_decrypt_key(const unsigned char *userKey, const int bits, 2919# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
1460# AES_KEY *key) 2920# AES_KEY *key)
1461&public_label("AES_Td");
1462&public_label("AES_Te");
1463&function_begin_B("AES_set_decrypt_key"); 2921&function_begin_B("AES_set_decrypt_key");
1464 &mov ("eax",&wparam(0)); 2922 &call ("_x86_AES_set_encrypt_key");
1465 &mov ("ecx",&wparam(1));
1466 &mov ("edx",&wparam(2));
1467 &sub ("esp",12);
1468 &mov (&DWP(0,"esp"),"eax");
1469 &mov (&DWP(4,"esp"),"ecx");
1470 &mov (&DWP(8,"esp"),"edx");
1471 &call ("AES_set_encrypt_key");
1472 &add ("esp",12);
1473 &cmp ("eax",0); 2923 &cmp ("eax",0);
1474 &je (&label("proceed")); 2924 &je (&label("proceed"));
1475 &ret (); 2925 &ret ();
@@ -1485,8 +2935,7 @@ sub deckey()
1485 &lea ("ecx",&DWP(0,"","ecx",4)); 2935 &lea ("ecx",&DWP(0,"","ecx",4));
1486 &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk 2936 &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk
1487 2937
1488 &align (4); 2938 &set_label("invert",4); # invert order of chunks
1489 &set_label("invert"); # invert order of chunks
1490 &mov ("eax",&DWP(0,"esi")); 2939 &mov ("eax",&DWP(0,"esi"));
1491 &mov ("ebx",&DWP(4,"esi")); 2940 &mov ("ebx",&DWP(4,"esi"));
1492 &mov ("ecx",&DWP(0,"edi")); 2941 &mov ("ecx",&DWP(0,"edi"));
@@ -1508,26 +2957,24 @@ sub deckey()
1508 &cmp ("esi","edi"); 2957 &cmp ("esi","edi");
1509 &jne (&label("invert")); 2958 &jne (&label("invert"));
1510 2959
1511 &call (&label("pic_point")); 2960 &mov ($key,&wparam(2));
1512 &set_label("pic_point"); 2961 &mov ($acc,&DWP(240,$key)); # pull number of rounds
1513 blindpop("ebp"); 2962 &lea ($acc,&DWP(-2,$acc,$acc));
1514 &lea ("edi",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); 2963 &lea ($acc,&DWP(0,$key,$acc,8));
1515 &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); 2964 &mov (&wparam(2),$acc);
1516 2965
1517 &mov ("esi",&wparam(2)); 2966 &mov ($s0,&DWP(16,$key)); # modulo-scheduled load
1518 &mov ("ecx",&DWP(240,"esi")); # pull number of rounds 2967 &set_label("permute",4); # permute the key schedule
1519 &dec ("ecx"); 2968 &add ($key,16);
1520 &align (4); 2969 &deckey (0,$key,$s0,$s1,$s2,$s3);
1521 &set_label("permute"); # permute the key schedule 2970 &deckey (1,$key,$s1,$s2,$s3,$s0);
1522 &add ("esi",16); 2971 &deckey (2,$key,$s2,$s3,$s0,$s1);
1523 &deckey (0,"esi","ebp","edi"); 2972 &deckey (3,$key,$s3,$s0,$s1,$s2);
1524 &deckey (4,"esi","ebp","edi"); 2973 &cmp ($key,&wparam(2));
1525 &deckey (8,"esi","ebp","edi"); 2974 &jb (&label("permute"));
1526 &deckey (12,"esi","ebp","edi");
1527 &dec ("ecx");
1528 &jnz (&label("permute"));
1529 2975
1530 &xor ("eax","eax"); # return success 2976 &xor ("eax","eax"); # return success
1531&function_end("AES_set_decrypt_key"); 2977&function_end("AES_set_decrypt_key");
2978&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
1532 2979
1533&asm_finish(); 2980&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl
index 15742c1ec5..690244111a 100644
--- a/src/lib/libcrypto/aes/asm/aes-armv4.pl
+++ b/src/lib/libcrypto/aes/asm/aes-armv4.pl
@@ -1024,6 +1024,7 @@ _armv4_AES_decrypt:
1024 mov pc,lr @ return 1024 mov pc,lr @ return
1025.size _armv4_AES_decrypt,.-_armv4_AES_decrypt 1025.size _armv4_AES_decrypt,.-_armv4_AES_decrypt
1026.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>" 1026.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
1027.align 2
1027___ 1028___
1028 1029
1029$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 1030$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl
index ce427655ef..f82c5e1814 100644
--- a/src/lib/libcrypto/aes/asm/aes-ppc.pl
+++ b/src/lib/libcrypto/aes/asm/aes-ppc.pl
@@ -16,6 +16,19 @@
16# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - 16# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact -
17# at 1/3 of ppc_AES_decrypt. 17# at 1/3 of ppc_AES_decrypt.
18 18
19# February 2010
20#
21# Rescheduling instructions to favour Power6 pipeline gives 10%
22# performance improvement on the platform in question (and marginal
23# improvement even on others). It should be noted that Power6 fails
24# to process a byte in 18 cycles, only in 23, because it fails to issue
25# 4 load instructions in two cycles, only in 3. As a result non-compact
26# block subroutines are 25% slower than one would expect. Compact
27# functions scale better, because they have a pure computational part,
28# which scales perfectly with clock frequency. To be specific,
29# ppc_AES_encrypt_compact operates at 42 cycles per byte, while
30# ppc_AES_decrypt_compact - at 55 (in 64-bit build).
31
19$flavour = shift; 32$flavour = shift;
20 33
21if ($flavour =~ /64/) { 34if ($flavour =~ /64/) {
@@ -376,7 +389,7 @@ $code.=<<___;
376 addi $sp,$sp,$FRAME 389 addi $sp,$sp,$FRAME
377 blr 390 blr
378 391
379.align 4 392.align 5
380Lppc_AES_encrypt: 393Lppc_AES_encrypt:
381 lwz $acc00,240($key) 394 lwz $acc00,240($key)
382 lwz $t0,0($key) 395 lwz $t0,0($key)
@@ -397,46 +410,46 @@ Lppc_AES_encrypt:
397Lenc_loop: 410Lenc_loop:
398 rlwinm $acc00,$s0,`32-24+3`,21,28 411 rlwinm $acc00,$s0,`32-24+3`,21,28
399 rlwinm $acc01,$s1,`32-24+3`,21,28 412 rlwinm $acc01,$s1,`32-24+3`,21,28
400 lwz $t0,0($key)
401 lwz $t1,4($key)
402 rlwinm $acc02,$s2,`32-24+3`,21,28 413 rlwinm $acc02,$s2,`32-24+3`,21,28
403 rlwinm $acc03,$s3,`32-24+3`,21,28 414 rlwinm $acc03,$s3,`32-24+3`,21,28
404 lwz $t2,8($key) 415 lwz $t0,0($key)
405 lwz $t3,12($key) 416 lwz $t1,4($key)
406 rlwinm $acc04,$s1,`32-16+3`,21,28 417 rlwinm $acc04,$s1,`32-16+3`,21,28
407 rlwinm $acc05,$s2,`32-16+3`,21,28 418 rlwinm $acc05,$s2,`32-16+3`,21,28
408 lwzx $acc00,$Tbl0,$acc00 419 lwz $t2,8($key)
409 lwzx $acc01,$Tbl0,$acc01 420 lwz $t3,12($key)
410 rlwinm $acc06,$s3,`32-16+3`,21,28 421 rlwinm $acc06,$s3,`32-16+3`,21,28
411 rlwinm $acc07,$s0,`32-16+3`,21,28 422 rlwinm $acc07,$s0,`32-16+3`,21,28
412 lwzx $acc02,$Tbl0,$acc02 423 lwzx $acc00,$Tbl0,$acc00
413 lwzx $acc03,$Tbl0,$acc03 424 lwzx $acc01,$Tbl0,$acc01
414 rlwinm $acc08,$s2,`32-8+3`,21,28 425 rlwinm $acc08,$s2,`32-8+3`,21,28
415 rlwinm $acc09,$s3,`32-8+3`,21,28 426 rlwinm $acc09,$s3,`32-8+3`,21,28
416 lwzx $acc04,$Tbl1,$acc04 427 lwzx $acc02,$Tbl0,$acc02
417 lwzx $acc05,$Tbl1,$acc05 428 lwzx $acc03,$Tbl0,$acc03
418 rlwinm $acc10,$s0,`32-8+3`,21,28 429 rlwinm $acc10,$s0,`32-8+3`,21,28
419 rlwinm $acc11,$s1,`32-8+3`,21,28 430 rlwinm $acc11,$s1,`32-8+3`,21,28
420 lwzx $acc06,$Tbl1,$acc06 431 lwzx $acc04,$Tbl1,$acc04
421 lwzx $acc07,$Tbl1,$acc07 432 lwzx $acc05,$Tbl1,$acc05
422 rlwinm $acc12,$s3,`0+3`,21,28 433 rlwinm $acc12,$s3,`0+3`,21,28
423 rlwinm $acc13,$s0,`0+3`,21,28 434 rlwinm $acc13,$s0,`0+3`,21,28
424 lwzx $acc08,$Tbl2,$acc08 435 lwzx $acc06,$Tbl1,$acc06
425 lwzx $acc09,$Tbl2,$acc09 436 lwzx $acc07,$Tbl1,$acc07
426 rlwinm $acc14,$s1,`0+3`,21,28 437 rlwinm $acc14,$s1,`0+3`,21,28
427 rlwinm $acc15,$s2,`0+3`,21,28 438 rlwinm $acc15,$s2,`0+3`,21,28
428 lwzx $acc10,$Tbl2,$acc10 439 lwzx $acc08,$Tbl2,$acc08
429 lwzx $acc11,$Tbl2,$acc11 440 lwzx $acc09,$Tbl2,$acc09
430 xor $t0,$t0,$acc00 441 xor $t0,$t0,$acc00
431 xor $t1,$t1,$acc01 442 xor $t1,$t1,$acc01
432 lwzx $acc12,$Tbl3,$acc12 443 lwzx $acc10,$Tbl2,$acc10
433 lwzx $acc13,$Tbl3,$acc13 444 lwzx $acc11,$Tbl2,$acc11
434 xor $t2,$t2,$acc02 445 xor $t2,$t2,$acc02
435 xor $t3,$t3,$acc03 446 xor $t3,$t3,$acc03
436 lwzx $acc14,$Tbl3,$acc14 447 lwzx $acc12,$Tbl3,$acc12
437 lwzx $acc15,$Tbl3,$acc15 448 lwzx $acc13,$Tbl3,$acc13
438 xor $t0,$t0,$acc04 449 xor $t0,$t0,$acc04
439 xor $t1,$t1,$acc05 450 xor $t1,$t1,$acc05
451 lwzx $acc14,$Tbl3,$acc14
452 lwzx $acc15,$Tbl3,$acc15
440 xor $t2,$t2,$acc06 453 xor $t2,$t2,$acc06
441 xor $t3,$t3,$acc07 454 xor $t3,$t3,$acc07
442 xor $t0,$t0,$acc08 455 xor $t0,$t0,$acc08
@@ -452,60 +465,60 @@ Lenc_loop:
452 465
453 addi $Tbl2,$Tbl0,2048 466 addi $Tbl2,$Tbl0,2048
454 nop 467 nop
455 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
456 lwz $acc09,`2048+32`($Tbl0)
457 lwz $acc10,`2048+64`($Tbl0)
458 lwz $acc11,`2048+96`($Tbl0)
459 lwz $acc08,`2048+128`($Tbl0)
460 lwz $acc09,`2048+160`($Tbl0)
461 lwz $acc10,`2048+192`($Tbl0)
462 lwz $acc11,`2048+224`($Tbl0)
463 rlwinm $acc00,$s0,`32-24`,24,31
464 rlwinm $acc01,$s1,`32-24`,24,31
465 lwz $t0,0($key) 468 lwz $t0,0($key)
466 lwz $t1,4($key) 469 lwz $t1,4($key)
467 rlwinm $acc02,$s2,`32-24`,24,31 470 rlwinm $acc00,$s0,`32-24`,24,31
468 rlwinm $acc03,$s3,`32-24`,24,31 471 rlwinm $acc01,$s1,`32-24`,24,31
469 lwz $t2,8($key) 472 lwz $t2,8($key)
470 lwz $t3,12($key) 473 lwz $t3,12($key)
474 rlwinm $acc02,$s2,`32-24`,24,31
475 rlwinm $acc03,$s3,`32-24`,24,31
476 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
477 lwz $acc09,`2048+32`($Tbl0)
471 rlwinm $acc04,$s1,`32-16`,24,31 478 rlwinm $acc04,$s1,`32-16`,24,31
472 rlwinm $acc05,$s2,`32-16`,24,31 479 rlwinm $acc05,$s2,`32-16`,24,31
473 lbzx $acc00,$Tbl2,$acc00 480 lwz $acc10,`2048+64`($Tbl0)
474 lbzx $acc01,$Tbl2,$acc01 481 lwz $acc11,`2048+96`($Tbl0)
475 rlwinm $acc06,$s3,`32-16`,24,31 482 rlwinm $acc06,$s3,`32-16`,24,31
476 rlwinm $acc07,$s0,`32-16`,24,31 483 rlwinm $acc07,$s0,`32-16`,24,31
477 lbzx $acc02,$Tbl2,$acc02 484 lwz $acc12,`2048+128`($Tbl0)
478 lbzx $acc03,$Tbl2,$acc03 485 lwz $acc13,`2048+160`($Tbl0)
479 rlwinm $acc08,$s2,`32-8`,24,31 486 rlwinm $acc08,$s2,`32-8`,24,31
480 rlwinm $acc09,$s3,`32-8`,24,31 487 rlwinm $acc09,$s3,`32-8`,24,31
481 lbzx $acc04,$Tbl2,$acc04 488 lwz $acc14,`2048+192`($Tbl0)
482 lbzx $acc05,$Tbl2,$acc05 489 lwz $acc15,`2048+224`($Tbl0)
483 rlwinm $acc10,$s0,`32-8`,24,31 490 rlwinm $acc10,$s0,`32-8`,24,31
484 rlwinm $acc11,$s1,`32-8`,24,31 491 rlwinm $acc11,$s1,`32-8`,24,31
485 lbzx $acc06,$Tbl2,$acc06 492 lbzx $acc00,$Tbl2,$acc00
486 lbzx $acc07,$Tbl2,$acc07 493 lbzx $acc01,$Tbl2,$acc01
487 rlwinm $acc12,$s3,`0`,24,31 494 rlwinm $acc12,$s3,`0`,24,31
488 rlwinm $acc13,$s0,`0`,24,31 495 rlwinm $acc13,$s0,`0`,24,31
489 lbzx $acc08,$Tbl2,$acc08 496 lbzx $acc02,$Tbl2,$acc02
490 lbzx $acc09,$Tbl2,$acc09 497 lbzx $acc03,$Tbl2,$acc03
491 rlwinm $acc14,$s1,`0`,24,31 498 rlwinm $acc14,$s1,`0`,24,31
492 rlwinm $acc15,$s2,`0`,24,31 499 rlwinm $acc15,$s2,`0`,24,31
493 lbzx $acc10,$Tbl2,$acc10 500 lbzx $acc04,$Tbl2,$acc04
494 lbzx $acc11,$Tbl2,$acc11 501 lbzx $acc05,$Tbl2,$acc05
495 rlwinm $s0,$acc00,24,0,7 502 rlwinm $s0,$acc00,24,0,7
496 rlwinm $s1,$acc01,24,0,7 503 rlwinm $s1,$acc01,24,0,7
497 lbzx $acc12,$Tbl2,$acc12 504 lbzx $acc06,$Tbl2,$acc06
498 lbzx $acc13,$Tbl2,$acc13 505 lbzx $acc07,$Tbl2,$acc07
499 rlwinm $s2,$acc02,24,0,7 506 rlwinm $s2,$acc02,24,0,7
500 rlwinm $s3,$acc03,24,0,7 507 rlwinm $s3,$acc03,24,0,7
501 lbzx $acc14,$Tbl2,$acc14 508 lbzx $acc08,$Tbl2,$acc08
502 lbzx $acc15,$Tbl2,$acc15 509 lbzx $acc09,$Tbl2,$acc09
503 rlwimi $s0,$acc04,16,8,15 510 rlwimi $s0,$acc04,16,8,15
504 rlwimi $s1,$acc05,16,8,15 511 rlwimi $s1,$acc05,16,8,15
512 lbzx $acc10,$Tbl2,$acc10
513 lbzx $acc11,$Tbl2,$acc11
505 rlwimi $s2,$acc06,16,8,15 514 rlwimi $s2,$acc06,16,8,15
506 rlwimi $s3,$acc07,16,8,15 515 rlwimi $s3,$acc07,16,8,15
516 lbzx $acc12,$Tbl2,$acc12
517 lbzx $acc13,$Tbl2,$acc13
507 rlwimi $s0,$acc08,8,16,23 518 rlwimi $s0,$acc08,8,16,23
508 rlwimi $s1,$acc09,8,16,23 519 rlwimi $s1,$acc09,8,16,23
520 lbzx $acc14,$Tbl2,$acc14
521 lbzx $acc15,$Tbl2,$acc15
509 rlwimi $s2,$acc10,8,16,23 522 rlwimi $s2,$acc10,8,16,23
510 rlwimi $s3,$acc11,8,16,23 523 rlwimi $s3,$acc11,8,16,23
511 or $s0,$s0,$acc12 524 or $s0,$s0,$acc12
@@ -542,40 +555,40 @@ Lenc_compact_loop:
542 rlwinm $acc01,$s1,`32-24`,24,31 555 rlwinm $acc01,$s1,`32-24`,24,31
543 rlwinm $acc02,$s2,`32-24`,24,31 556 rlwinm $acc02,$s2,`32-24`,24,31
544 rlwinm $acc03,$s3,`32-24`,24,31 557 rlwinm $acc03,$s3,`32-24`,24,31
545 lbzx $acc00,$Tbl1,$acc00
546 lbzx $acc01,$Tbl1,$acc01
547 rlwinm $acc04,$s1,`32-16`,24,31 558 rlwinm $acc04,$s1,`32-16`,24,31
548 rlwinm $acc05,$s2,`32-16`,24,31 559 rlwinm $acc05,$s2,`32-16`,24,31
549 lbzx $acc02,$Tbl1,$acc02
550 lbzx $acc03,$Tbl1,$acc03
551 rlwinm $acc06,$s3,`32-16`,24,31 560 rlwinm $acc06,$s3,`32-16`,24,31
552 rlwinm $acc07,$s0,`32-16`,24,31 561 rlwinm $acc07,$s0,`32-16`,24,31
553 lbzx $acc04,$Tbl1,$acc04 562 lbzx $acc00,$Tbl1,$acc00
554 lbzx $acc05,$Tbl1,$acc05 563 lbzx $acc01,$Tbl1,$acc01
555 rlwinm $acc08,$s2,`32-8`,24,31 564 rlwinm $acc08,$s2,`32-8`,24,31
556 rlwinm $acc09,$s3,`32-8`,24,31 565 rlwinm $acc09,$s3,`32-8`,24,31
557 lbzx $acc06,$Tbl1,$acc06 566 lbzx $acc02,$Tbl1,$acc02
558 lbzx $acc07,$Tbl1,$acc07 567 lbzx $acc03,$Tbl1,$acc03
559 rlwinm $acc10,$s0,`32-8`,24,31 568 rlwinm $acc10,$s0,`32-8`,24,31
560 rlwinm $acc11,$s1,`32-8`,24,31 569 rlwinm $acc11,$s1,`32-8`,24,31
561 lbzx $acc08,$Tbl1,$acc08 570 lbzx $acc04,$Tbl1,$acc04
562 lbzx $acc09,$Tbl1,$acc09 571 lbzx $acc05,$Tbl1,$acc05
563 rlwinm $acc12,$s3,`0`,24,31 572 rlwinm $acc12,$s3,`0`,24,31
564 rlwinm $acc13,$s0,`0`,24,31 573 rlwinm $acc13,$s0,`0`,24,31
565 lbzx $acc10,$Tbl1,$acc10 574 lbzx $acc06,$Tbl1,$acc06
566 lbzx $acc11,$Tbl1,$acc11 575 lbzx $acc07,$Tbl1,$acc07
567 rlwinm $acc14,$s1,`0`,24,31 576 rlwinm $acc14,$s1,`0`,24,31
568 rlwinm $acc15,$s2,`0`,24,31 577 rlwinm $acc15,$s2,`0`,24,31
569 lbzx $acc12,$Tbl1,$acc12 578 lbzx $acc08,$Tbl1,$acc08
570 lbzx $acc13,$Tbl1,$acc13 579 lbzx $acc09,$Tbl1,$acc09
571 rlwinm $s0,$acc00,24,0,7 580 rlwinm $s0,$acc00,24,0,7
572 rlwinm $s1,$acc01,24,0,7 581 rlwinm $s1,$acc01,24,0,7
573 lbzx $acc14,$Tbl1,$acc14 582 lbzx $acc10,$Tbl1,$acc10
574 lbzx $acc15,$Tbl1,$acc15 583 lbzx $acc11,$Tbl1,$acc11
575 rlwinm $s2,$acc02,24,0,7 584 rlwinm $s2,$acc02,24,0,7
576 rlwinm $s3,$acc03,24,0,7 585 rlwinm $s3,$acc03,24,0,7
586 lbzx $acc12,$Tbl1,$acc12
587 lbzx $acc13,$Tbl1,$acc13
577 rlwimi $s0,$acc04,16,8,15 588 rlwimi $s0,$acc04,16,8,15
578 rlwimi $s1,$acc05,16,8,15 589 rlwimi $s1,$acc05,16,8,15
590 lbzx $acc14,$Tbl1,$acc14
591 lbzx $acc15,$Tbl1,$acc15
579 rlwimi $s2,$acc06,16,8,15 592 rlwimi $s2,$acc06,16,8,15
580 rlwimi $s3,$acc07,16,8,15 593 rlwimi $s3,$acc07,16,8,15
581 rlwimi $s0,$acc08,8,16,23 594 rlwimi $s0,$acc08,8,16,23
@@ -725,7 +738,7 @@ Lenc_compact_done:
725 addi $sp,$sp,$FRAME 738 addi $sp,$sp,$FRAME
726 blr 739 blr
727 740
728.align 4 741.align 5
729Lppc_AES_decrypt: 742Lppc_AES_decrypt:
730 lwz $acc00,240($key) 743 lwz $acc00,240($key)
731 lwz $t0,0($key) 744 lwz $t0,0($key)
@@ -746,46 +759,46 @@ Lppc_AES_decrypt:
746Ldec_loop: 759Ldec_loop:
747 rlwinm $acc00,$s0,`32-24+3`,21,28 760 rlwinm $acc00,$s0,`32-24+3`,21,28
748 rlwinm $acc01,$s1,`32-24+3`,21,28 761 rlwinm $acc01,$s1,`32-24+3`,21,28
749 lwz $t0,0($key)
750 lwz $t1,4($key)
751 rlwinm $acc02,$s2,`32-24+3`,21,28 762 rlwinm $acc02,$s2,`32-24+3`,21,28
752 rlwinm $acc03,$s3,`32-24+3`,21,28 763 rlwinm $acc03,$s3,`32-24+3`,21,28
753 lwz $t2,8($key) 764 lwz $t0,0($key)
754 lwz $t3,12($key) 765 lwz $t1,4($key)
755 rlwinm $acc04,$s3,`32-16+3`,21,28 766 rlwinm $acc04,$s3,`32-16+3`,21,28
756 rlwinm $acc05,$s0,`32-16+3`,21,28 767 rlwinm $acc05,$s0,`32-16+3`,21,28
757 lwzx $acc00,$Tbl0,$acc00 768 lwz $t2,8($key)
758 lwzx $acc01,$Tbl0,$acc01 769 lwz $t3,12($key)
759 rlwinm $acc06,$s1,`32-16+3`,21,28 770 rlwinm $acc06,$s1,`32-16+3`,21,28
760 rlwinm $acc07,$s2,`32-16+3`,21,28 771 rlwinm $acc07,$s2,`32-16+3`,21,28
761 lwzx $acc02,$Tbl0,$acc02 772 lwzx $acc00,$Tbl0,$acc00
762 lwzx $acc03,$Tbl0,$acc03 773 lwzx $acc01,$Tbl0,$acc01
763 rlwinm $acc08,$s2,`32-8+3`,21,28 774 rlwinm $acc08,$s2,`32-8+3`,21,28
764 rlwinm $acc09,$s3,`32-8+3`,21,28 775 rlwinm $acc09,$s3,`32-8+3`,21,28
765 lwzx $acc04,$Tbl1,$acc04 776 lwzx $acc02,$Tbl0,$acc02
766 lwzx $acc05,$Tbl1,$acc05 777 lwzx $acc03,$Tbl0,$acc03
767 rlwinm $acc10,$s0,`32-8+3`,21,28 778 rlwinm $acc10,$s0,`32-8+3`,21,28
768 rlwinm $acc11,$s1,`32-8+3`,21,28 779 rlwinm $acc11,$s1,`32-8+3`,21,28
769 lwzx $acc06,$Tbl1,$acc06 780 lwzx $acc04,$Tbl1,$acc04
770 lwzx $acc07,$Tbl1,$acc07 781 lwzx $acc05,$Tbl1,$acc05
771 rlwinm $acc12,$s1,`0+3`,21,28 782 rlwinm $acc12,$s1,`0+3`,21,28
772 rlwinm $acc13,$s2,`0+3`,21,28 783 rlwinm $acc13,$s2,`0+3`,21,28
773 lwzx $acc08,$Tbl2,$acc08 784 lwzx $acc06,$Tbl1,$acc06
774 lwzx $acc09,$Tbl2,$acc09 785 lwzx $acc07,$Tbl1,$acc07
775 rlwinm $acc14,$s3,`0+3`,21,28 786 rlwinm $acc14,$s3,`0+3`,21,28
776 rlwinm $acc15,$s0,`0+3`,21,28 787 rlwinm $acc15,$s0,`0+3`,21,28
777 lwzx $acc10,$Tbl2,$acc10 788 lwzx $acc08,$Tbl2,$acc08
778 lwzx $acc11,$Tbl2,$acc11 789 lwzx $acc09,$Tbl2,$acc09
779 xor $t0,$t0,$acc00 790 xor $t0,$t0,$acc00
780 xor $t1,$t1,$acc01 791 xor $t1,$t1,$acc01
781 lwzx $acc12,$Tbl3,$acc12 792 lwzx $acc10,$Tbl2,$acc10
782 lwzx $acc13,$Tbl3,$acc13 793 lwzx $acc11,$Tbl2,$acc11
783 xor $t2,$t2,$acc02 794 xor $t2,$t2,$acc02
784 xor $t3,$t3,$acc03 795 xor $t3,$t3,$acc03
785 lwzx $acc14,$Tbl3,$acc14 796 lwzx $acc12,$Tbl3,$acc12
786 lwzx $acc15,$Tbl3,$acc15 797 lwzx $acc13,$Tbl3,$acc13
787 xor $t0,$t0,$acc04 798 xor $t0,$t0,$acc04
788 xor $t1,$t1,$acc05 799 xor $t1,$t1,$acc05
800 lwzx $acc14,$Tbl3,$acc14
801 lwzx $acc15,$Tbl3,$acc15
789 xor $t2,$t2,$acc06 802 xor $t2,$t2,$acc06
790 xor $t3,$t3,$acc07 803 xor $t3,$t3,$acc07
791 xor $t0,$t0,$acc08 804 xor $t0,$t0,$acc08
@@ -801,56 +814,56 @@ Ldec_loop:
801 814
802 addi $Tbl2,$Tbl0,2048 815 addi $Tbl2,$Tbl0,2048
803 nop 816 nop
804 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
805 lwz $acc09,`2048+32`($Tbl0)
806 lwz $acc10,`2048+64`($Tbl0)
807 lwz $acc11,`2048+96`($Tbl0)
808 lwz $acc08,`2048+128`($Tbl0)
809 lwz $acc09,`2048+160`($Tbl0)
810 lwz $acc10,`2048+192`($Tbl0)
811 lwz $acc11,`2048+224`($Tbl0)
812 rlwinm $acc00,$s0,`32-24`,24,31
813 rlwinm $acc01,$s1,`32-24`,24,31
814 lwz $t0,0($key) 817 lwz $t0,0($key)
815 lwz $t1,4($key) 818 lwz $t1,4($key)
816 rlwinm $acc02,$s2,`32-24`,24,31 819 rlwinm $acc00,$s0,`32-24`,24,31
817 rlwinm $acc03,$s3,`32-24`,24,31 820 rlwinm $acc01,$s1,`32-24`,24,31
818 lwz $t2,8($key) 821 lwz $t2,8($key)
819 lwz $t3,12($key) 822 lwz $t3,12($key)
823 rlwinm $acc02,$s2,`32-24`,24,31
824 rlwinm $acc03,$s3,`32-24`,24,31
825 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
826 lwz $acc09,`2048+32`($Tbl0)
820 rlwinm $acc04,$s3,`32-16`,24,31 827 rlwinm $acc04,$s3,`32-16`,24,31
821 rlwinm $acc05,$s0,`32-16`,24,31 828 rlwinm $acc05,$s0,`32-16`,24,31
829 lwz $acc10,`2048+64`($Tbl0)
830 lwz $acc11,`2048+96`($Tbl0)
822 lbzx $acc00,$Tbl2,$acc00 831 lbzx $acc00,$Tbl2,$acc00
823 lbzx $acc01,$Tbl2,$acc01 832 lbzx $acc01,$Tbl2,$acc01
833 lwz $acc12,`2048+128`($Tbl0)
834 lwz $acc13,`2048+160`($Tbl0)
824 rlwinm $acc06,$s1,`32-16`,24,31 835 rlwinm $acc06,$s1,`32-16`,24,31
825 rlwinm $acc07,$s2,`32-16`,24,31 836 rlwinm $acc07,$s2,`32-16`,24,31
826 lbzx $acc02,$Tbl2,$acc02 837 lwz $acc14,`2048+192`($Tbl0)
827 lbzx $acc03,$Tbl2,$acc03 838 lwz $acc15,`2048+224`($Tbl0)
828 rlwinm $acc08,$s2,`32-8`,24,31 839 rlwinm $acc08,$s2,`32-8`,24,31
829 rlwinm $acc09,$s3,`32-8`,24,31 840 rlwinm $acc09,$s3,`32-8`,24,31
830 lbzx $acc04,$Tbl2,$acc04 841 lbzx $acc02,$Tbl2,$acc02
831 lbzx $acc05,$Tbl2,$acc05 842 lbzx $acc03,$Tbl2,$acc03
832 rlwinm $acc10,$s0,`32-8`,24,31 843 rlwinm $acc10,$s0,`32-8`,24,31
833 rlwinm $acc11,$s1,`32-8`,24,31 844 rlwinm $acc11,$s1,`32-8`,24,31
834 lbzx $acc06,$Tbl2,$acc06 845 lbzx $acc04,$Tbl2,$acc04
835 lbzx $acc07,$Tbl2,$acc07 846 lbzx $acc05,$Tbl2,$acc05
836 rlwinm $acc12,$s1,`0`,24,31 847 rlwinm $acc12,$s1,`0`,24,31
837 rlwinm $acc13,$s2,`0`,24,31 848 rlwinm $acc13,$s2,`0`,24,31
838 lbzx $acc08,$Tbl2,$acc08 849 lbzx $acc06,$Tbl2,$acc06
839 lbzx $acc09,$Tbl2,$acc09 850 lbzx $acc07,$Tbl2,$acc07
840 rlwinm $acc14,$s3,`0`,24,31 851 rlwinm $acc14,$s3,`0`,24,31
841 rlwinm $acc15,$s0,`0`,24,31 852 rlwinm $acc15,$s0,`0`,24,31
842 lbzx $acc10,$Tbl2,$acc10 853 lbzx $acc08,$Tbl2,$acc08
843 lbzx $acc11,$Tbl2,$acc11 854 lbzx $acc09,$Tbl2,$acc09
844 rlwinm $s0,$acc00,24,0,7 855 rlwinm $s0,$acc00,24,0,7
845 rlwinm $s1,$acc01,24,0,7 856 rlwinm $s1,$acc01,24,0,7
846 lbzx $acc12,$Tbl2,$acc12 857 lbzx $acc10,$Tbl2,$acc10
847 lbzx $acc13,$Tbl2,$acc13 858 lbzx $acc11,$Tbl2,$acc11
848 rlwinm $s2,$acc02,24,0,7 859 rlwinm $s2,$acc02,24,0,7
849 rlwinm $s3,$acc03,24,0,7 860 rlwinm $s3,$acc03,24,0,7
850 lbzx $acc14,$Tbl2,$acc14 861 lbzx $acc12,$Tbl2,$acc12
851 lbzx $acc15,$Tbl2,$acc15 862 lbzx $acc13,$Tbl2,$acc13
852 rlwimi $s0,$acc04,16,8,15 863 rlwimi $s0,$acc04,16,8,15
853 rlwimi $s1,$acc05,16,8,15 864 rlwimi $s1,$acc05,16,8,15
865 lbzx $acc14,$Tbl2,$acc14
866 lbzx $acc15,$Tbl2,$acc15
854 rlwimi $s2,$acc06,16,8,15 867 rlwimi $s2,$acc06,16,8,15
855 rlwimi $s3,$acc07,16,8,15 868 rlwimi $s3,$acc07,16,8,15
856 rlwimi $s0,$acc08,8,16,23 869 rlwimi $s0,$acc08,8,16,23
@@ -897,40 +910,40 @@ Ldec_compact_loop:
897 rlwinm $acc01,$s1,`32-24`,24,31 910 rlwinm $acc01,$s1,`32-24`,24,31
898 rlwinm $acc02,$s2,`32-24`,24,31 911 rlwinm $acc02,$s2,`32-24`,24,31
899 rlwinm $acc03,$s3,`32-24`,24,31 912 rlwinm $acc03,$s3,`32-24`,24,31
900 lbzx $acc00,$Tbl1,$acc00
901 lbzx $acc01,$Tbl1,$acc01
902 rlwinm $acc04,$s3,`32-16`,24,31 913 rlwinm $acc04,$s3,`32-16`,24,31
903 rlwinm $acc05,$s0,`32-16`,24,31 914 rlwinm $acc05,$s0,`32-16`,24,31
904 lbzx $acc02,$Tbl1,$acc02
905 lbzx $acc03,$Tbl1,$acc03
906 rlwinm $acc06,$s1,`32-16`,24,31 915 rlwinm $acc06,$s1,`32-16`,24,31
907 rlwinm $acc07,$s2,`32-16`,24,31 916 rlwinm $acc07,$s2,`32-16`,24,31
908 lbzx $acc04,$Tbl1,$acc04 917 lbzx $acc00,$Tbl1,$acc00
909 lbzx $acc05,$Tbl1,$acc05 918 lbzx $acc01,$Tbl1,$acc01
910 rlwinm $acc08,$s2,`32-8`,24,31 919 rlwinm $acc08,$s2,`32-8`,24,31
911 rlwinm $acc09,$s3,`32-8`,24,31 920 rlwinm $acc09,$s3,`32-8`,24,31
912 lbzx $acc06,$Tbl1,$acc06 921 lbzx $acc02,$Tbl1,$acc02
913 lbzx $acc07,$Tbl1,$acc07 922 lbzx $acc03,$Tbl1,$acc03
914 rlwinm $acc10,$s0,`32-8`,24,31 923 rlwinm $acc10,$s0,`32-8`,24,31
915 rlwinm $acc11,$s1,`32-8`,24,31 924 rlwinm $acc11,$s1,`32-8`,24,31
916 lbzx $acc08,$Tbl1,$acc08 925 lbzx $acc04,$Tbl1,$acc04
917 lbzx $acc09,$Tbl1,$acc09 926 lbzx $acc05,$Tbl1,$acc05
918 rlwinm $acc12,$s1,`0`,24,31 927 rlwinm $acc12,$s1,`0`,24,31
919 rlwinm $acc13,$s2,`0`,24,31 928 rlwinm $acc13,$s2,`0`,24,31
920 lbzx $acc10,$Tbl1,$acc10 929 lbzx $acc06,$Tbl1,$acc06
921 lbzx $acc11,$Tbl1,$acc11 930 lbzx $acc07,$Tbl1,$acc07
922 rlwinm $acc14,$s3,`0`,24,31 931 rlwinm $acc14,$s3,`0`,24,31
923 rlwinm $acc15,$s0,`0`,24,31 932 rlwinm $acc15,$s0,`0`,24,31
924 lbzx $acc12,$Tbl1,$acc12 933 lbzx $acc08,$Tbl1,$acc08
925 lbzx $acc13,$Tbl1,$acc13 934 lbzx $acc09,$Tbl1,$acc09
926 rlwinm $s0,$acc00,24,0,7 935 rlwinm $s0,$acc00,24,0,7
927 rlwinm $s1,$acc01,24,0,7 936 rlwinm $s1,$acc01,24,0,7
928 lbzx $acc14,$Tbl1,$acc14 937 lbzx $acc10,$Tbl1,$acc10
929 lbzx $acc15,$Tbl1,$acc15 938 lbzx $acc11,$Tbl1,$acc11
930 rlwinm $s2,$acc02,24,0,7 939 rlwinm $s2,$acc02,24,0,7
931 rlwinm $s3,$acc03,24,0,7 940 rlwinm $s3,$acc03,24,0,7
941 lbzx $acc12,$Tbl1,$acc12
942 lbzx $acc13,$Tbl1,$acc13
932 rlwimi $s0,$acc04,16,8,15 943 rlwimi $s0,$acc04,16,8,15
933 rlwimi $s1,$acc05,16,8,15 944 rlwimi $s1,$acc05,16,8,15
945 lbzx $acc14,$Tbl1,$acc14
946 lbzx $acc15,$Tbl1,$acc15
934 rlwimi $s2,$acc06,16,8,15 947 rlwimi $s2,$acc06,16,8,15
935 rlwimi $s3,$acc07,16,8,15 948 rlwimi $s3,$acc07,16,8,15
936 rlwimi $s0,$acc08,8,16,23 949 rlwimi $s0,$acc08,8,16,23
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl
index 4b27afd92f..7e01889298 100644
--- a/src/lib/libcrypto/aes/asm/aes-s390x.pl
+++ b/src/lib/libcrypto/aes/asm/aes-s390x.pl
@@ -765,6 +765,11 @@ $code.=<<___ if (!$softonly);
765 srl %r5,6 765 srl %r5,6
766 ar %r5,%r0 766 ar %r5,%r0
767 767
768 larl %r1,OPENSSL_s390xcap_P
769 lg %r0,0(%r1)
770 tmhl %r0,0x4000 # check for message-security assist
771 jz .Lekey_internal
772
768 lghi %r0,0 # query capability vector 773 lghi %r0,0 # query capability vector
769 la %r1,16($sp) 774 la %r1,16($sp)
770 .long 0xb92f0042 # kmc %r4,%r2 775 .long 0xb92f0042 # kmc %r4,%r2
@@ -1323,6 +1328,7 @@ $code.=<<___;
13234: ex $len,0($s1) 13284: ex $len,0($s1)
1324 j .Lcbc_dec_exit 1329 j .Lcbc_dec_exit
1325.size AES_cbc_encrypt,.-AES_cbc_encrypt 1330.size AES_cbc_encrypt,.-AES_cbc_encrypt
1331.comm OPENSSL_s390xcap_P,8,8
1326___ 1332___
1327} 1333}
1328$code.=<<___; 1334$code.=<<___;
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
index f616f1751f..a545e892ae 100755
--- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl
+++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
@@ -2,11 +2,12 @@
2# 2#
3# ==================================================================== 3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary 5# project. The module is, however, dual licensed under OpenSSL and
6# forms are granted according to the OpenSSL license. 6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
7# ==================================================================== 8# ====================================================================
8# 9#
9# Version 1.2. 10# Version 2.1.
10# 11#
11# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on 12# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
12# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version 13# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
@@ -17,17 +18,29 @@
17# 18#
18# Performance in number of cycles per processed byte for 128-bit key: 19# Performance in number of cycles per processed byte for 128-bit key:
19# 20#
20# ECB CBC encrypt 21# ECB encrypt ECB decrypt CBC large chunk
21# AMD64 13.7 13.0(*) 22# AMD64 33 41 13.0
22# EM64T 20.2 18.6(*) 23# EM64T 38 59 18.6(*)
24# Core 2 30 43 14.5(*)
23# 25#
24# (*) CBC benchmarks are better than ECB thanks to custom ABI used 26# (*) with hyper-threading off
25# by the private block encryption function. 27
28$flavour = shift;
29$output = shift;
30if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
31
32$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
33
34$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
35( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
36( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
37die "can't locate x86_64-xlate.pl";
38
39open STDOUT,"| $^X $xlate $flavour $output";
26 40
27$verticalspin=1; # unlike 32-bit version $verticalspin performs 41$verticalspin=1; # unlike 32-bit version $verticalspin performs
28 # ~15% better on both AMD and Intel cores 42 # ~15% better on both AMD and Intel cores
29$output=shift; 43$speed_limit=512; # see aes-586.pl for details
30open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
31 44
32$code=".text\n"; 45$code=".text\n";
33 46
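With the $flavour/$output plumbing added above, the script is now driven as perl aes-x86_64.pl <flavour> [<output>] (for instance perl aes-x86_64.pl elf aes-x86_64.s, flavour and file name being illustrative): the nasm/masm/mingw64 flavours select the Win64 path, and all output is funnelled through perlasm/x86_64-xlate.pl instead of the previously hard-coded ../perlasm location.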
@@ -35,9 +48,9 @@ $s0="%eax";
35$s1="%ebx"; 48$s1="%ebx";
36$s2="%ecx"; 49$s2="%ecx";
37$s3="%edx"; 50$s3="%edx";
38$acc0="%esi"; 51$acc0="%esi"; $mask80="%rsi";
39$acc1="%edi"; 52$acc1="%edi"; $maskfe="%rdi";
40$acc2="%ebp"; 53$acc2="%ebp"; $mask1b="%rbp";
41$inp="%r8"; 54$inp="%r8";
42$out="%r9"; 55$out="%r9";
43$t0="%r10d"; 56$t0="%r10d";
@@ -51,6 +64,8 @@ sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; }
51sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; 64sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
52 $r =~ s/%[er]([sd]i)/%\1l/; 65 $r =~ s/%[er]([sd]i)/%\1l/;
53 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } 66 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
67sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;
68 $r =~ s/%r([0-9]+)/%r\1d/; $r; }
54sub _data_word() 69sub _data_word()
55{ my $i; 70{ my $i;
56 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } 71 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -138,22 +153,17 @@ $code.=<<___;
138 movzb `&lo("$s0")`,$acc0 153 movzb `&lo("$s0")`,$acc0
139 movzb `&lo("$s1")`,$acc1 154 movzb `&lo("$s1")`,$acc1
140 movzb `&lo("$s2")`,$acc2 155 movzb `&lo("$s2")`,$acc2
141 mov 2($sbox,$acc0,8),$t0 156 movzb 2($sbox,$acc0,8),$t0
142 mov 2($sbox,$acc1,8),$t1 157 movzb 2($sbox,$acc1,8),$t1
143 mov 2($sbox,$acc2,8),$t2 158 movzb 2($sbox,$acc2,8),$t2
144
145 and \$0x000000ff,$t0
146 and \$0x000000ff,$t1
147 and \$0x000000ff,$t2
148 159
149 movzb `&lo("$s3")`,$acc0 160 movzb `&lo("$s3")`,$acc0
150 movzb `&hi("$s1")`,$acc1 161 movzb `&hi("$s1")`,$acc1
151 movzb `&hi("$s2")`,$acc2 162 movzb `&hi("$s2")`,$acc2
152 mov 2($sbox,$acc0,8),$t3 163 movzb 2($sbox,$acc0,8),$t3
153 mov 0($sbox,$acc1,8),$acc1 #$t0 164 mov 0($sbox,$acc1,8),$acc1 #$t0
154 mov 0($sbox,$acc2,8),$acc2 #$t1 165 mov 0($sbox,$acc2,8),$acc2 #$t1
155 166
156 and \$0x000000ff,$t3
157 and \$0x0000ff00,$acc1 167 and \$0x0000ff00,$acc1
158 and \$0x0000ff00,$acc2 168 and \$0x0000ff00,$acc2
159 169
@@ -345,6 +355,234 @@ $code.=<<___;
345.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt 355.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
346___ 356___
347 357
358# it's possible to implement this by shifting tN by 8, filling least
359# significant byte with byte load and finally bswap-ing at the end,
360# but such partial register load kills Core 2...
361sub enccompactvert()
362{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
363
364$code.=<<___;
365 movzb `&lo("$s0")`,$t0
366 movzb `&lo("$s1")`,$t1
367 movzb `&lo("$s2")`,$t2
368 movzb ($sbox,$t0,1),$t0
369 movzb ($sbox,$t1,1),$t1
370 movzb ($sbox,$t2,1),$t2
371
372 movzb `&lo("$s3")`,$t3
373 movzb `&hi("$s1")`,$acc0
374 movzb `&hi("$s2")`,$acc1
375 movzb ($sbox,$t3,1),$t3
376 movzb ($sbox,$acc0,1),$t4 #$t0
377 movzb ($sbox,$acc1,1),$t5 #$t1
378
379 movzb `&hi("$s3")`,$acc2
380 movzb `&hi("$s0")`,$acc0
381 shr \$16,$s2
382 movzb ($sbox,$acc2,1),$acc2 #$t2
383 movzb ($sbox,$acc0,1),$acc0 #$t3
384 shr \$16,$s3
385
386 movzb `&lo("$s2")`,$acc1
387 shl \$8,$t4
388 shl \$8,$t5
389 movzb ($sbox,$acc1,1),$acc1 #$t0
390 xor $t4,$t0
391 xor $t5,$t1
392
393 movzb `&lo("$s3")`,$t4
394 shr \$16,$s0
395 shr \$16,$s1
396 movzb `&lo("$s0")`,$t5
397 shl \$8,$acc2
398 shl \$8,$acc0
399 movzb ($sbox,$t4,1),$t4 #$t1
400 movzb ($sbox,$t5,1),$t5 #$t2
401 xor $acc2,$t2
402 xor $acc0,$t3
403
404 movzb `&lo("$s1")`,$acc2
405 movzb `&hi("$s3")`,$acc0
406 shl \$16,$acc1
407 movzb ($sbox,$acc2,1),$acc2 #$t3
408 movzb ($sbox,$acc0,1),$acc0 #$t0
409 xor $acc1,$t0
410
411 movzb `&hi("$s0")`,$acc1
412 shr \$8,$s2
413 shr \$8,$s1
414 movzb ($sbox,$acc1,1),$acc1 #$t1
415 movzb ($sbox,$s2,1),$s3 #$t3
416 movzb ($sbox,$s1,1),$s2 #$t2
417 shl \$16,$t4
418 shl \$16,$t5
419 shl \$16,$acc2
420 xor $t4,$t1
421 xor $t5,$t2
422 xor $acc2,$t3
423
424 shl \$24,$acc0
425 shl \$24,$acc1
426 shl \$24,$s3
427 xor $acc0,$t0
428 shl \$24,$s2
429 xor $acc1,$t1
430 mov $t0,$s0
431 mov $t1,$s1
432 xor $t2,$s2
433 xor $t3,$s3
434___
435}
436
437sub enctransform_ref()
438{ my $sn = shift;
439 my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
440
441$code.=<<___;
442 mov $sn,$acc
443 and \$0x80808080,$acc
444 mov $acc,$tmp
445 shr \$7,$tmp
446 lea ($sn,$sn),$r2
447 sub $tmp,$acc
448 and \$0xfefefefe,$r2
449 and \$0x1b1b1b1b,$acc
450 mov $sn,$tmp
451 xor $acc,$r2
452
453 xor $r2,$sn
454 rol \$24,$sn
455 xor $r2,$sn
456 ror \$16,$tmp
457 xor $tmp,$sn
458 ror \$8,$tmp
459 xor $tmp,$sn
460___
461}
462
463# unlike decrypt case it does not pay off to parallelize enctransform
464sub enctransform()
465{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
466
467$code.=<<___;
468 mov $s0,$acc0
469 mov $s1,$acc1
470 and \$0x80808080,$acc0
471 and \$0x80808080,$acc1
472 mov $acc0,$t0
473 mov $acc1,$t1
474 shr \$7,$t0
475 lea ($s0,$s0),$r20
476 shr \$7,$t1
477 lea ($s1,$s1),$r21
478 sub $t0,$acc0
479 sub $t1,$acc1
480 and \$0xfefefefe,$r20
481 and \$0xfefefefe,$r21
482 and \$0x1b1b1b1b,$acc0
483 and \$0x1b1b1b1b,$acc1
484 mov $s0,$t0
485 mov $s1,$t1
486 xor $acc0,$r20
487 xor $acc1,$r21
488
489 xor $r20,$s0
490 xor $r21,$s1
491 mov $s2,$acc0
492 mov $s3,$acc1
493 rol \$24,$s0
494 rol \$24,$s1
495 and \$0x80808080,$acc0
496 and \$0x80808080,$acc1
497 xor $r20,$s0
498 xor $r21,$s1
499 mov $acc0,$t2
500 mov $acc1,$t3
501 ror \$16,$t0
502 ror \$16,$t1
503 shr \$7,$t2
504 lea ($s2,$s2),$r20
505 xor $t0,$s0
506 xor $t1,$s1
507 shr \$7,$t3
508 lea ($s3,$s3),$r21
509 ror \$8,$t0
510 ror \$8,$t1
511 sub $t2,$acc0
512 sub $t3,$acc1
513 xor $t0,$s0
514 xor $t1,$s1
515
516 and \$0xfefefefe,$r20
517 and \$0xfefefefe,$r21
518 and \$0x1b1b1b1b,$acc0
519 and \$0x1b1b1b1b,$acc1
520 mov $s2,$t2
521 mov $s3,$t3
522 xor $acc0,$r20
523 xor $acc1,$r21
524
525 xor $r20,$s2
526 xor $r21,$s3
527 rol \$24,$s2
528 rol \$24,$s3
529 xor $r20,$s2
530 xor $r21,$s3
531 mov 0($sbox),$acc0 # prefetch Te4
532 ror \$16,$t2
533 ror \$16,$t3
534 mov 64($sbox),$acc1
535 xor $t2,$s2
536 xor $t3,$s3
537 mov 128($sbox),$r20
538 ror \$8,$t2
539 ror \$8,$t3
540 mov 192($sbox),$r21
541 xor $t2,$s2
542 xor $t3,$s3
543___
544}
545
546$code.=<<___;
547.type _x86_64_AES_encrypt_compact,\@abi-omnipotent
548.align 16
549_x86_64_AES_encrypt_compact:
550 lea 128($sbox),$inp # size optimization
551 mov 0-128($inp),$acc1 # prefetch Te4
552 mov 32-128($inp),$acc2
553 mov 64-128($inp),$t0
554 mov 96-128($inp),$t1
555 mov 128-128($inp),$acc1
556 mov 160-128($inp),$acc2
557 mov 192-128($inp),$t0
558 mov 224-128($inp),$t1
559 jmp .Lenc_loop_compact
560.align 16
561.Lenc_loop_compact:
562 xor 0($key),$s0 # xor with key
563 xor 4($key),$s1
564 xor 8($key),$s2
565 xor 12($key),$s3
566 lea 16($key),$key
567___
568 &enccompactvert();
569$code.=<<___;
570 cmp 16(%rsp),$key
571 je .Lenc_compact_done
572___
573 &enctransform();
574$code.=<<___;
575 jmp .Lenc_loop_compact
576.align 16
577.Lenc_compact_done:
578 xor 0($key),$s0
579 xor 4($key),$s1
580 xor 8($key),$s2
581 xor 12($key),$s3
582 .byte 0xf3,0xc3 # rep ret
583.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
584___
585
348# void AES_encrypt (const void *inp,void *out,const AES_KEY *key); 586# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
349$code.=<<___; 587$code.=<<___;
350.globl AES_encrypt 588.globl AES_encrypt
@@ -358,31 +596,57 @@ AES_encrypt:
358 push %r14 596 push %r14
359 push %r15 597 push %r15
360 598
361 mov %rdx,$key 599 # allocate frame "above" key schedule
362 mov %rdi,$inp 600 mov %rsp,%r10
363 mov %rsi,$out 601 lea -63(%rdx),%rcx # %rdx is key argument
364 602 and \$-64,%rsp
365 .picmeup $sbox 603 sub %rsp,%rcx
366 lea AES_Te-.($sbox),$sbox 604 neg %rcx
367 605 and \$0x3c0,%rcx
368 mov 0($inp),$s0 606 sub %rcx,%rsp
369 mov 4($inp),$s1 607 sub \$32,%rsp
370 mov 8($inp),$s2
371 mov 12($inp),$s3
372 608
373 call _x86_64_AES_encrypt 609 mov %rsi,16(%rsp) # save out
610 mov %r10,24(%rsp) # save real stack pointer
611.Lenc_prologue:
374 612
375 mov $s0,0($out) 613 mov %rdx,$key
614 mov 240($key),$rnds # load rounds
615
616 mov 0(%rdi),$s0 # load input vector
617 mov 4(%rdi),$s1
618 mov 8(%rdi),$s2
619 mov 12(%rdi),$s3
620
621 shl \$4,$rnds
622 lea ($key,$rnds),%rbp
623 mov $key,(%rsp) # key schedule
624 mov %rbp,8(%rsp) # end of key schedule
625
626 # pick Te4 copy which can't "overlap" with stack frame or key schedule
627 lea .LAES_Te+2048(%rip),$sbox
628 lea 768(%rsp),%rbp
629 sub $sbox,%rbp
630 and \$0x300,%rbp
631 lea ($sbox,%rbp),$sbox
632
633 call _x86_64_AES_encrypt_compact
634
635 mov 16(%rsp),$out # restore out
636 mov 24(%rsp),%rsi # restore saved stack pointer
637 mov $s0,0($out) # write output vector
376 mov $s1,4($out) 638 mov $s1,4($out)
377 mov $s2,8($out) 639 mov $s2,8($out)
378 mov $s3,12($out) 640 mov $s3,12($out)
379 641
380 pop %r15 642 mov (%rsi),%r15
381 pop %r14 643 mov 8(%rsi),%r14
382 pop %r13 644 mov 16(%rsi),%r13
383 pop %r12 645 mov 24(%rsi),%r12
384 pop %rbp 646 mov 32(%rsi),%rbp
385 pop %rbx 647 mov 40(%rsi),%rbx
648 lea 48(%rsi),%rsp
649.Lenc_epilogue:
386 ret 650 ret
387.size AES_encrypt,.-AES_encrypt 651.size AES_encrypt,.-AES_encrypt
388___ 652___
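Two bits of address arithmetic in the new prologue deserve a restatement. The frame is placed below a 64-byte-aligned %rsp by an extra 0-960 bytes derived from the distance to the key schedule, and the S-box pointer is then bumped to one of four 256-byte-apart Te4 copies (they start 2048 bytes into .LAES_Te; the rcon reference at 1024-128(%rbp) later in the file puts rcon right after them). Per the in-line comments, the point is that neither the scratch frame nor the Te4 copy in use can "overlap" the key schedule. A sketch of the arithmetic only, with hypothetical helper names:

	sub frame_rsp {				# where the prologue leaves %rsp
		my ($key, $rsp) = @_;
		my $aligned = $rsp & ~63;			# and \$-64,%rsp
		my $skew    = ($aligned - ($key - 63)) & 0x3c0;	# bits 6..9 of the distance
		return $aligned - $skew - 32;
	}

	sub pick_te4 {				# which Te4 copy the encrypt path will index
		my ($te4, $rsp) = @_;		# $te4 = .LAES_Te+2048, $rsp = frame_rsp() result
		return $te4 + ((($rsp + 768) - $te4) & 0x300);	# 0, 256, 512 or 768 bytes in
	}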
@@ -453,19 +717,20 @@ sub declastvert()
453{ my $t3="%r8d"; # zaps $inp! 717{ my $t3="%r8d"; # zaps $inp!
454 718
455$code.=<<___; 719$code.=<<___;
720 lea 2048($sbox),$sbox # size optimization
456 movzb `&lo("$s0")`,$acc0 721 movzb `&lo("$s0")`,$acc0
457 movzb `&lo("$s1")`,$acc1 722 movzb `&lo("$s1")`,$acc1
458 movzb `&lo("$s2")`,$acc2 723 movzb `&lo("$s2")`,$acc2
459 movzb 2048($sbox,$acc0,1),$t0 724 movzb ($sbox,$acc0,1),$t0
460 movzb 2048($sbox,$acc1,1),$t1 725 movzb ($sbox,$acc1,1),$t1
461 movzb 2048($sbox,$acc2,1),$t2 726 movzb ($sbox,$acc2,1),$t2
462 727
463 movzb `&lo("$s3")`,$acc0 728 movzb `&lo("$s3")`,$acc0
464 movzb `&hi("$s3")`,$acc1 729 movzb `&hi("$s3")`,$acc1
465 movzb `&hi("$s0")`,$acc2 730 movzb `&hi("$s0")`,$acc2
466 movzb 2048($sbox,$acc0,1),$t3 731 movzb ($sbox,$acc0,1),$t3
467 movzb 2048($sbox,$acc1,1),$acc1 #$t0 732 movzb ($sbox,$acc1,1),$acc1 #$t0
468 movzb 2048($sbox,$acc2,1),$acc2 #$t1 733 movzb ($sbox,$acc2,1),$acc2 #$t1
469 734
470 shl \$8,$acc1 735 shl \$8,$acc1
471 shl \$8,$acc2 736 shl \$8,$acc2
@@ -477,8 +742,8 @@ $code.=<<___;
477 movzb `&hi("$s1")`,$acc0 742 movzb `&hi("$s1")`,$acc0
478 movzb `&hi("$s2")`,$acc1 743 movzb `&hi("$s2")`,$acc1
479 shr \$16,$s0 744 shr \$16,$s0
480 movzb 2048($sbox,$acc0,1),$acc0 #$t2 745 movzb ($sbox,$acc0,1),$acc0 #$t2
481 movzb 2048($sbox,$acc1,1),$acc1 #$t3 746 movzb ($sbox,$acc1,1),$acc1 #$t3
482 747
483 shl \$8,$acc0 748 shl \$8,$acc0
484 shl \$8,$acc1 749 shl \$8,$acc1
@@ -490,9 +755,9 @@ $code.=<<___;
490 movzb `&lo("$s2")`,$acc0 755 movzb `&lo("$s2")`,$acc0
491 movzb `&lo("$s3")`,$acc1 756 movzb `&lo("$s3")`,$acc1
492 movzb `&lo("$s0")`,$acc2 757 movzb `&lo("$s0")`,$acc2
493 movzb 2048($sbox,$acc0,1),$acc0 #$t0 758 movzb ($sbox,$acc0,1),$acc0 #$t0
494 movzb 2048($sbox,$acc1,1),$acc1 #$t1 759 movzb ($sbox,$acc1,1),$acc1 #$t1
495 movzb 2048($sbox,$acc2,1),$acc2 #$t2 760 movzb ($sbox,$acc2,1),$acc2 #$t2
496 761
497 shl \$16,$acc0 762 shl \$16,$acc0
498 shl \$16,$acc1 763 shl \$16,$acc1
@@ -505,9 +770,9 @@ $code.=<<___;
505 movzb `&lo("$s1")`,$acc0 770 movzb `&lo("$s1")`,$acc0
506 movzb `&hi("$s1")`,$acc1 771 movzb `&hi("$s1")`,$acc1
507 movzb `&hi("$s2")`,$acc2 772 movzb `&hi("$s2")`,$acc2
508 movzb 2048($sbox,$acc0,1),$acc0 #$t3 773 movzb ($sbox,$acc0,1),$acc0 #$t3
509 movzb 2048($sbox,$acc1,1),$acc1 #$t0 774 movzb ($sbox,$acc1,1),$acc1 #$t0
510 movzb 2048($sbox,$acc2,1),$acc2 #$t1 775 movzb ($sbox,$acc2,1),$acc2 #$t1
511 776
512 shl \$16,$acc0 777 shl \$16,$acc0
513 shl \$24,$acc1 778 shl \$24,$acc1
@@ -520,8 +785,8 @@ $code.=<<___;
520 movzb `&hi("$s3")`,$acc0 785 movzb `&hi("$s3")`,$acc0
521 movzb `&hi("$s0")`,$acc1 786 movzb `&hi("$s0")`,$acc1
522 mov 16+12($key),$s3 787 mov 16+12($key),$s3
523 movzb 2048($sbox,$acc0,1),$acc0 #$t2 788 movzb ($sbox,$acc0,1),$acc0 #$t2
524 movzb 2048($sbox,$acc1,1),$acc1 #$t3 789 movzb ($sbox,$acc1,1),$acc1 #$t3
525 mov 16+0($key),$s0 790 mov 16+0($key),$s0
526 791
527 shl \$24,$acc0 792 shl \$24,$acc0
@@ -532,6 +797,7 @@ $code.=<<___;
532 797
533 mov 16+4($key),$s1 798 mov 16+4($key),$s1
534 mov 16+8($key),$s2 799 mov 16+8($key),$s2
800 lea -2048($sbox),$sbox
535 xor $t0,$s0 801 xor $t0,$s0
536 xor $t1,$s1 802 xor $t1,$s1
537 xor $t2,$s2 803 xor $t2,$s2
@@ -659,6 +925,260 @@ $code.=<<___;
659.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt 925.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
660___ 926___
661 927
928sub deccompactvert()
929{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
930
931$code.=<<___;
932 movzb `&lo("$s0")`,$t0
933 movzb `&lo("$s1")`,$t1
934 movzb `&lo("$s2")`,$t2
935 movzb ($sbox,$t0,1),$t0
936 movzb ($sbox,$t1,1),$t1
937 movzb ($sbox,$t2,1),$t2
938
939 movzb `&lo("$s3")`,$t3
940 movzb `&hi("$s3")`,$acc0
941 movzb `&hi("$s0")`,$acc1
942 movzb ($sbox,$t3,1),$t3
943 movzb ($sbox,$acc0,1),$t4 #$t0
944 movzb ($sbox,$acc1,1),$t5 #$t1
945
946 movzb `&hi("$s1")`,$acc2
947 movzb `&hi("$s2")`,$acc0
948 shr \$16,$s2
949 movzb ($sbox,$acc2,1),$acc2 #$t2
950 movzb ($sbox,$acc0,1),$acc0 #$t3
951 shr \$16,$s3
952
953 movzb `&lo("$s2")`,$acc1
954 shl \$8,$t4
955 shl \$8,$t5
956 movzb ($sbox,$acc1,1),$acc1 #$t0
957 xor $t4,$t0
958 xor $t5,$t1
959
960 movzb `&lo("$s3")`,$t4
961 shr \$16,$s0
962 shr \$16,$s1
963 movzb `&lo("$s0")`,$t5
964 shl \$8,$acc2
965 shl \$8,$acc0
966 movzb ($sbox,$t4,1),$t4 #$t1
967 movzb ($sbox,$t5,1),$t5 #$t2
968 xor $acc2,$t2
969 xor $acc0,$t3
970
971 movzb `&lo("$s1")`,$acc2
972 movzb `&hi("$s1")`,$acc0
973 shl \$16,$acc1
974 movzb ($sbox,$acc2,1),$acc2 #$t3
975 movzb ($sbox,$acc0,1),$acc0 #$t0
976 xor $acc1,$t0
977
978 movzb `&hi("$s2")`,$acc1
979 shl \$16,$t4
980 shl \$16,$t5
981 movzb ($sbox,$acc1,1),$s1 #$t1
982 xor $t4,$t1
983 xor $t5,$t2
984
985 movzb `&hi("$s3")`,$acc1
986 shr \$8,$s0
987 shl \$16,$acc2
988 movzb ($sbox,$acc1,1),$s2 #$t2
989 movzb ($sbox,$s0,1),$s3 #$t3
990 xor $acc2,$t3
991
992 shl \$24,$acc0
993 shl \$24,$s1
994 shl \$24,$s2
995 xor $acc0,$t0
996 shl \$24,$s3
997 xor $t1,$s1
998 mov $t0,$s0
999 xor $t2,$s2
1000 xor $t3,$s3
1001___
1002}
1003
1004# parallelized version! input is pair of 64-bit values: %rax=s1.s0
1005# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
1006# %ecx=s2 and %edx=s3.
1007sub dectransform()
1008{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
1009 my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
1010 my $prefetch = shift;
1011
1012$code.=<<___;
1013 mov $tp10,$acc0
1014 mov $tp18,$acc8
1015 and $mask80,$acc0
1016 and $mask80,$acc8
1017 mov $acc0,$tp40
1018 mov $acc8,$tp48
1019 shr \$7,$tp40
1020 lea ($tp10,$tp10),$tp20
1021 shr \$7,$tp48
1022 lea ($tp18,$tp18),$tp28
1023 sub $tp40,$acc0
1024 sub $tp48,$acc8
1025 and $maskfe,$tp20
1026 and $maskfe,$tp28
1027 and $mask1b,$acc0
1028 and $mask1b,$acc8
1029 xor $tp20,$acc0
1030 xor $tp28,$acc8
1031 mov $acc0,$tp20
1032 mov $acc8,$tp28
1033
1034 and $mask80,$acc0
1035 and $mask80,$acc8
1036 mov $acc0,$tp80
1037 mov $acc8,$tp88
1038 shr \$7,$tp80
1039 lea ($tp20,$tp20),$tp40
1040 shr \$7,$tp88
1041 lea ($tp28,$tp28),$tp48
1042 sub $tp80,$acc0
1043 sub $tp88,$acc8
1044 and $maskfe,$tp40
1045 and $maskfe,$tp48
1046 and $mask1b,$acc0
1047 and $mask1b,$acc8
1048 xor $tp40,$acc0
1049 xor $tp48,$acc8
1050 mov $acc0,$tp40
1051 mov $acc8,$tp48
1052
1053 and $mask80,$acc0
1054 and $mask80,$acc8
1055 mov $acc0,$tp80
1056 mov $acc8,$tp88
1057 shr \$7,$tp80
1058 xor $tp10,$tp20 # tp2^=tp1
1059 shr \$7,$tp88
1060 xor $tp18,$tp28 # tp2^=tp1
1061 sub $tp80,$acc0
1062 sub $tp88,$acc8
1063 lea ($tp40,$tp40),$tp80
1064 lea ($tp48,$tp48),$tp88
1065 xor $tp10,$tp40 # tp4^=tp1
1066 xor $tp18,$tp48 # tp4^=tp1
1067 and $maskfe,$tp80
1068 and $maskfe,$tp88
1069 and $mask1b,$acc0
1070 and $mask1b,$acc8
1071 xor $acc0,$tp80
1072 xor $acc8,$tp88
1073
1074 xor $tp80,$tp10 # tp1^=tp8
1075 xor $tp88,$tp18 # tp1^=tp8
1076 xor $tp80,$tp20 # tp2^tp1^=tp8
1077 xor $tp88,$tp28 # tp2^tp1^=tp8
1078 mov $tp10,$acc0
1079 mov $tp18,$acc8
1080 xor $tp80,$tp40 # tp4^tp1^=tp8
1081 xor $tp88,$tp48 # tp4^tp1^=tp8
1082 shr \$32,$acc0
1083 shr \$32,$acc8
1084 xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
1085 xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
1086 rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
1087 rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
1088 xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1089 xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1090
1091 rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
1092 rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
1093 xor `&LO("$tp80")`,`&LO("$tp10")`
1094 xor `&LO("$tp88")`,`&LO("$tp18")`
1095 shr \$32,$tp80
1096 shr \$32,$tp88
1097 xor `&LO("$tp80")`,`&LO("$acc0")`
1098 xor `&LO("$tp88")`,`&LO("$acc8")`
1099
1100 mov $tp20,$tp80
1101 mov $tp28,$tp88
1102 shr \$32,$tp80
1103 shr \$32,$tp88
1104 rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
1105 rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
1106 rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
1107 rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
1108 xor `&LO("$tp20")`,`&LO("$tp10")`
1109 xor `&LO("$tp28")`,`&LO("$tp18")`
1110 mov $tp40,$tp20
1111 mov $tp48,$tp28
1112 xor `&LO("$tp80")`,`&LO("$acc0")`
1113 xor `&LO("$tp88")`,`&LO("$acc8")`
1114
1115 `"mov 0($sbox),$mask80" if ($prefetch)`
1116 shr \$32,$tp20
1117 shr \$32,$tp28
1118 `"mov 64($sbox),$maskfe" if ($prefetch)`
1119 rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
1120 rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
1121 `"mov 128($sbox),$mask1b" if ($prefetch)`
1122 rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
1123 rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
1124 `"mov 192($sbox),$tp80" if ($prefetch)`
1125 xor `&LO("$tp40")`,`&LO("$tp10")`
1126 xor `&LO("$tp48")`,`&LO("$tp18")`
1127 `"mov 256($sbox),$tp88" if ($prefetch)`
1128 xor `&LO("$tp20")`,`&LO("$acc0")`
1129 xor `&LO("$tp28")`,`&LO("$acc8")`
1130___
1131}
1132
1133$code.=<<___;
1134.type _x86_64_AES_decrypt_compact,\@abi-omnipotent
1135.align 16
1136_x86_64_AES_decrypt_compact:
1137 lea 128($sbox),$inp # size optimization
1138 mov 0-128($inp),$acc1 # prefetch Td4
1139 mov 32-128($inp),$acc2
1140 mov 64-128($inp),$t0
1141 mov 96-128($inp),$t1
1142 mov 128-128($inp),$acc1
1143 mov 160-128($inp),$acc2
1144 mov 192-128($inp),$t0
1145 mov 224-128($inp),$t1
1146 jmp .Ldec_loop_compact
1147
1148.align 16
1149.Ldec_loop_compact:
1150 xor 0($key),$s0 # xor with key
1151 xor 4($key),$s1
1152 xor 8($key),$s2
1153 xor 12($key),$s3
1154 lea 16($key),$key
1155___
1156 &deccompactvert();
1157$code.=<<___;
1158 cmp 16(%rsp),$key
1159 je .Ldec_compact_done
1160
1161 mov 256+0($sbox),$mask80
1162 shl \$32,%rbx
1163 shl \$32,%rdx
1164 mov 256+8($sbox),$maskfe
1165 or %rbx,%rax
1166 or %rdx,%rcx
1167 mov 256+16($sbox),$mask1b
1168___
1169 &dectransform(1);
1170$code.=<<___;
1171 jmp .Ldec_loop_compact
1172.align 16
1173.Ldec_compact_done:
1174 xor 0($key),$s0
1175 xor 4($key),$s1
1176 xor 8($key),$s2
1177 xor 12($key),$s3
1178 .byte 0xf3,0xc3 # rep ret
1179.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
1180___
1181
662# void AES_decrypt (const void *inp,void *out,const AES_KEY *key); 1182# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
663$code.=<<___; 1183$code.=<<___;
664.globl AES_decrypt 1184.globl AES_decrypt
@@ -672,43 +1192,59 @@ AES_decrypt:
672 push %r14 1192 push %r14
673 push %r15 1193 push %r15
674 1194
675 mov %rdx,$key 1195 # allocate frame "above" key schedule
676 mov %rdi,$inp 1196 mov %rsp,%r10
677 mov %rsi,$out 1197 lea -63(%rdx),%rcx # %rdx is key argument
1198 and \$-64,%rsp
1199 sub %rsp,%rcx
1200 neg %rcx
1201 and \$0x3c0,%rcx
1202 sub %rcx,%rsp
1203 sub \$32,%rsp
1204
1205 mov %rsi,16(%rsp) # save out
1206 mov %r10,24(%rsp) # save real stack pointer
1207.Ldec_prologue:
678 1208
679 .picmeup $sbox 1209 mov %rdx,$key
680 lea AES_Td-.($sbox),$sbox 1210 mov 240($key),$rnds # load rounds
681 1211
682 # prefetch Td4 1212 mov 0(%rdi),$s0 # load input vector
683 lea 2048+128($sbox),$sbox; 1213 mov 4(%rdi),$s1
684 mov 0-128($sbox),$s0 1214 mov 8(%rdi),$s2
685 mov 32-128($sbox),$s1 1215 mov 12(%rdi),$s3
686 mov 64-128($sbox),$s2 1216
687 mov 96-128($sbox),$s3 1217 shl \$4,$rnds
688 mov 128-128($sbox),$s0 1218 lea ($key,$rnds),%rbp
689 mov 160-128($sbox),$s1 1219 mov $key,(%rsp) # key schedule
690 mov 192-128($sbox),$s2 1220 mov %rbp,8(%rsp) # end of key schedule
691 mov 224-128($sbox),$s3 1221
692 lea -2048-128($sbox),$sbox; 1222 # pick Td4 copy which can't "overlap" with stack frame or key schedule
693 1223 lea .LAES_Td+2048(%rip),$sbox
694 mov 0($inp),$s0 1224 lea 768(%rsp),%rbp
695 mov 4($inp),$s1 1225 sub $sbox,%rbp
696 mov 8($inp),$s2 1226 and \$0x300,%rbp
697 mov 12($inp),$s3 1227 lea ($sbox,%rbp),$sbox
698 1228 shr \$3,%rbp # recall "magic" constants!
699 call _x86_64_AES_decrypt 1229 add %rbp,$sbox
700 1230
701 mov $s0,0($out) 1231 call _x86_64_AES_decrypt_compact
1232
1233 mov 16(%rsp),$out # restore out
1234 mov 24(%rsp),%rsi # restore saved stack pointer
1235 mov $s0,0($out) # write output vector
702 mov $s1,4($out) 1236 mov $s1,4($out)
703 mov $s2,8($out) 1237 mov $s2,8($out)
704 mov $s3,12($out) 1238 mov $s3,12($out)
705 1239
706 pop %r15 1240 mov (%rsi),%r15
707 pop %r14 1241 mov 8(%rsi),%r14
708 pop %r13 1242 mov 16(%rsi),%r13
709 pop %r12 1243 mov 24(%rsi),%r12
710 pop %rbp 1244 mov 32(%rsi),%rbp
711 pop %rbx 1245 mov 40(%rsi),%rbx
1246 lea 48(%rsi),%rsp
1247.Ldec_epilogue:
712 ret 1248 ret
713.size AES_decrypt,.-AES_decrypt 1249.size AES_decrypt,.-AES_decrypt
714___ 1250___
@@ -718,27 +1254,26 @@ sub enckey()
718{ 1254{
719$code.=<<___; 1255$code.=<<___;
720 movz %dl,%esi # rk[i]>>0 1256 movz %dl,%esi # rk[i]>>0
721 mov 2(%rbp,%rsi,8),%ebx 1257 movzb -128(%rbp,%rsi),%ebx
722 movz %dh,%esi # rk[i]>>8 1258 movz %dh,%esi # rk[i]>>8
723 and \$0xFF000000,%ebx 1259 shl \$24,%ebx
724 xor %ebx,%eax 1260 xor %ebx,%eax
725 1261
726 mov 2(%rbp,%rsi,8),%ebx 1262 movzb -128(%rbp,%rsi),%ebx
727 shr \$16,%edx 1263 shr \$16,%edx
728 and \$0x000000FF,%ebx
729 movz %dl,%esi # rk[i]>>16 1264 movz %dl,%esi # rk[i]>>16
730 xor %ebx,%eax 1265 xor %ebx,%eax
731 1266
732 mov 0(%rbp,%rsi,8),%ebx 1267 movzb -128(%rbp,%rsi),%ebx
733 movz %dh,%esi # rk[i]>>24 1268 movz %dh,%esi # rk[i]>>24
734 and \$0x0000FF00,%ebx 1269 shl \$8,%ebx
735 xor %ebx,%eax 1270 xor %ebx,%eax
736 1271
737 mov 0(%rbp,%rsi,8),%ebx 1272 movzb -128(%rbp,%rsi),%ebx
738 and \$0x00FF0000,%ebx 1273 shl \$16,%ebx
739 xor %ebx,%eax 1274 xor %ebx,%eax
740 1275
741 xor 2048(%rbp,%rcx,4),%eax # rcon 1276 xor 1024-128(%rbp,%rcx,4),%eax # rcon
742___ 1277___
743} 1278}
744 1279
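The rewritten enckey() now takes its S-box bytes out of the compact 256-byte Te4 at -128(%rbp) and positions them with shifts, instead of masking dword entries out of the 2KB tables, but the value it accumulates into %eax is still the usual SubWord/RotWord/rcon key-expansion step. A plain-Perl sketch of that step, assuming @Te4 holds the 256 AES S-box bytes and keeping the word in the byte order the register uses (expand_word and its arguments are my names):

	our @Te4;				# assumed: the 256-entry AES S-box

	sub expand_word {		# rk[i] = rk[i-Nk] ^ SubWord(RotWord(rk[i-1])) ^ rcon
		my ($prev, $temp, $rcon) = @_;	# $prev = rk[i-Nk], $temp = rk[i-1]
		my $t  = $Te4[ $temp        & 0xff] << 24;	# "rk[i]>>0"  byte, shl 24
		   $t ^= $Te4[($temp >>  8) & 0xff];		# "rk[i]>>8"  byte, no shift
		   $t ^= $Te4[($temp >> 16) & 0xff] <<  8;	# "rk[i]>>16" byte, shl 8
		   $t ^= $Te4[($temp >> 24) & 0xff] << 16;	# "rk[i]>>24" byte, shl 16
		return $prev ^ $t ^ $rcon;	# $rcon = word pulled from 1024-128(%rbp,%rcx,4)
	}

The 256-bit schedule additionally runs a straight SubWord (no rotation, no rcon) over rk[i-1], which is what the non-rotated S-box block a few hunks below computes for rk[12].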
@@ -751,7 +1286,29 @@ $code.=<<___;
751AES_set_encrypt_key: 1286AES_set_encrypt_key:
752 push %rbx 1287 push %rbx
753 push %rbp 1288 push %rbp
1289 push %r12 # redundant, but allows to share
1290 push %r13 # exception handler...
1291 push %r14
1292 push %r15
1293 sub \$8,%rsp
1294.Lenc_key_prologue:
1295
1296 call _x86_64_AES_set_encrypt_key
1297
1298 mov 8(%rsp),%r15
1299 mov 16(%rsp),%r14
1300 mov 24(%rsp),%r13
1301 mov 32(%rsp),%r12
1302 mov 40(%rsp),%rbp
1303 mov 48(%rsp),%rbx
1304 add \$56,%rsp
1305.Lenc_key_epilogue:
1306 ret
1307.size AES_set_encrypt_key,.-AES_set_encrypt_key
754 1308
1309.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
1310.align 16
1311_x86_64_AES_set_encrypt_key:
755 mov %esi,%ecx # %ecx=bits 1312 mov %esi,%ecx # %ecx=bits
756 mov %rdi,%rsi # %rsi=userKey 1313 mov %rdi,%rsi # %rsi=userKey
757 mov %rdx,%rdi # %rdi=key 1314 mov %rdx,%rdi # %rdi=key
@@ -761,8 +1318,18 @@ AES_set_encrypt_key:
761 test \$-1,%rdi 1318 test \$-1,%rdi
762 jz .Lbadpointer 1319 jz .Lbadpointer
763 1320
764 .picmeup %rbp 1321 lea .LAES_Te(%rip),%rbp
765 lea AES_Te-.(%rbp),%rbp 1322 lea 2048+128(%rbp),%rbp
1323
1324 # prefetch Te4
1325 mov 0-128(%rbp),%eax
1326 mov 32-128(%rbp),%ebx
1327 mov 64-128(%rbp),%r8d
1328 mov 96-128(%rbp),%edx
1329 mov 128-128(%rbp),%eax
1330 mov 160-128(%rbp),%ebx
1331 mov 192-128(%rbp),%r8d
1332 mov 224-128(%rbp),%edx
766 1333
767 cmp \$128,%ecx 1334 cmp \$128,%ecx
768 je .L10rounds 1335 je .L10rounds
@@ -774,15 +1341,12 @@ AES_set_encrypt_key:
774 jmp .Lexit 1341 jmp .Lexit
775 1342
776.L10rounds: 1343.L10rounds:
777 mov 0(%rsi),%eax # copy first 4 dwords 1344 mov 0(%rsi),%rax # copy first 4 dwords
778 mov 4(%rsi),%ebx 1345 mov 8(%rsi),%rdx
779 mov 8(%rsi),%ecx 1346 mov %rax,0(%rdi)
780 mov 12(%rsi),%edx 1347 mov %rdx,8(%rdi)
781 mov %eax,0(%rdi)
782 mov %ebx,4(%rdi)
783 mov %ecx,8(%rdi)
784 mov %edx,12(%rdi)
785 1348
1349 shr \$32,%rdx
786 xor %ecx,%ecx 1350 xor %ecx,%ecx
787 jmp .L10shortcut 1351 jmp .L10shortcut
788.align 4 1352.align 4
@@ -810,19 +1374,14 @@ $code.=<<___;
810 jmp .Lexit 1374 jmp .Lexit
811 1375
812.L12rounds: 1376.L12rounds:
813 mov 0(%rsi),%eax # copy first 6 dwords 1377 mov 0(%rsi),%rax # copy first 6 dwords
814 mov 4(%rsi),%ebx 1378 mov 8(%rsi),%rbx
815 mov 8(%rsi),%ecx 1379 mov 16(%rsi),%rdx
816 mov 12(%rsi),%edx 1380 mov %rax,0(%rdi)
817 mov %eax,0(%rdi) 1381 mov %rbx,8(%rdi)
818 mov %ebx,4(%rdi) 1382 mov %rdx,16(%rdi)
819 mov %ecx,8(%rdi) 1383
820 mov %edx,12(%rdi) 1384 shr \$32,%rdx
821 mov 16(%rsi),%ecx
822 mov 20(%rsi),%edx
823 mov %ecx,16(%rdi)
824 mov %edx,20(%rdi)
825
826 xor %ecx,%ecx 1385 xor %ecx,%ecx
827 jmp .L12shortcut 1386 jmp .L12shortcut
828.align 4 1387.align 4
@@ -858,30 +1417,23 @@ $code.=<<___;
858 jmp .Lexit 1417 jmp .Lexit
859 1418
860.L14rounds: 1419.L14rounds:
861 mov 0(%rsi),%eax # copy first 8 dwords 1420 mov 0(%rsi),%rax # copy first 8 dwords
862 mov 4(%rsi),%ebx 1421 mov 8(%rsi),%rbx
863 mov 8(%rsi),%ecx 1422 mov 16(%rsi),%rcx
864 mov 12(%rsi),%edx 1423 mov 24(%rsi),%rdx
865 mov %eax,0(%rdi) 1424 mov %rax,0(%rdi)
866 mov %ebx,4(%rdi) 1425 mov %rbx,8(%rdi)
867 mov %ecx,8(%rdi) 1426 mov %rcx,16(%rdi)
868 mov %edx,12(%rdi) 1427 mov %rdx,24(%rdi)
869 mov 16(%rsi),%eax 1428
870 mov 20(%rsi),%ebx 1429 shr \$32,%rdx
871 mov 24(%rsi),%ecx
872 mov 28(%rsi),%edx
873 mov %eax,16(%rdi)
874 mov %ebx,20(%rdi)
875 mov %ecx,24(%rdi)
876 mov %edx,28(%rdi)
877
878 xor %ecx,%ecx 1430 xor %ecx,%ecx
879 jmp .L14shortcut 1431 jmp .L14shortcut
880.align 4 1432.align 4
881.L14loop: 1433.L14loop:
1434 mov 0(%rdi),%eax # rk[0]
882 mov 28(%rdi),%edx # rk[4] 1435 mov 28(%rdi),%edx # rk[4]
883.L14shortcut: 1436.L14shortcut:
884 mov 0(%rdi),%eax # rk[0]
885___ 1437___
886 &enckey (); 1438 &enckey ();
887$code.=<<___; 1439$code.=<<___;
@@ -900,24 +1452,23 @@ $code.=<<___;
900 mov %eax,%edx 1452 mov %eax,%edx
901 mov 16(%rdi),%eax # rk[4] 1453 mov 16(%rdi),%eax # rk[4]
902 movz %dl,%esi # rk[11]>>0 1454 movz %dl,%esi # rk[11]>>0
903 mov 2(%rbp,%rsi,8),%ebx 1455 movzb -128(%rbp,%rsi),%ebx
904 movz %dh,%esi # rk[11]>>8 1456 movz %dh,%esi # rk[11]>>8
905 and \$0x000000FF,%ebx
906 xor %ebx,%eax 1457 xor %ebx,%eax
907 1458
908 mov 0(%rbp,%rsi,8),%ebx 1459 movzb -128(%rbp,%rsi),%ebx
909 shr \$16,%edx 1460 shr \$16,%edx
910 and \$0x0000FF00,%ebx 1461 shl \$8,%ebx
911 movz %dl,%esi # rk[11]>>16 1462 movz %dl,%esi # rk[11]>>16
912 xor %ebx,%eax 1463 xor %ebx,%eax
913 1464
914 mov 0(%rbp,%rsi,8),%ebx 1465 movzb -128(%rbp,%rsi),%ebx
915 movz %dh,%esi # rk[11]>>24 1466 movz %dh,%esi # rk[11]>>24
916 and \$0x00FF0000,%ebx 1467 shl \$16,%ebx
917 xor %ebx,%eax 1468 xor %ebx,%eax
918 1469
919 mov 2(%rbp,%rsi,8),%ebx 1470 movzb -128(%rbp,%rsi),%ebx
920 and \$0xFF000000,%ebx 1471 shl \$24,%ebx
921 xor %ebx,%eax 1472 xor %ebx,%eax
922 1473
923 mov %eax,48(%rdi) # rk[12] 1474 mov %eax,48(%rdi) # rk[12]
@@ -938,31 +1489,61 @@ $code.=<<___;
938.Lbadpointer: 1489.Lbadpointer:
939 mov \$-1,%rax 1490 mov \$-1,%rax
940.Lexit: 1491.Lexit:
941 pop %rbp 1492 .byte 0xf3,0xc3 # rep ret
942 pop %rbx 1493.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
943 ret
944.size AES_set_encrypt_key,.-AES_set_encrypt_key
945___ 1494___
946 1495
947sub deckey() 1496sub deckey_ref()
948{ my ($i,$ptr,$te,$td) = @_; 1497{ my ($i,$ptr,$te,$td) = @_;
1498 my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
949$code.=<<___; 1499$code.=<<___;
950 mov $i($ptr),%eax 1500 mov $i($ptr),$tp1
951 mov %eax,%edx 1501 mov $tp1,$acc
952 movz %ah,%ebx 1502 and \$0x80808080,$acc
953 shr \$16,%edx 1503 mov $acc,$tp4
954 and \$0xFF,%eax 1504 shr \$7,$tp4
955 movzb 2($te,%rax,8),%rax 1505 lea 0($tp1,$tp1),$tp2
956 movzb 2($te,%rbx,8),%rbx 1506 sub $tp4,$acc
957 mov 0($td,%rax,8),%eax 1507 and \$0xfefefefe,$tp2
958 xor 3($td,%rbx,8),%eax 1508 and \$0x1b1b1b1b,$acc
959 movzb %dh,%ebx 1509 xor $tp2,$acc
960 and \$0xFF,%edx 1510 mov $acc,$tp2
961 movzb 2($te,%rdx,8),%rdx 1511
962 movzb 2($te,%rbx,8),%rbx 1512 and \$0x80808080,$acc
963 xor 2($td,%rdx,8),%eax 1513 mov $acc,$tp8
964 xor 1($td,%rbx,8),%eax 1514 shr \$7,$tp8
965 mov %eax,$i($ptr) 1515 lea 0($tp2,$tp2),$tp4
1516 sub $tp8,$acc
1517 and \$0xfefefefe,$tp4
1518 and \$0x1b1b1b1b,$acc
1519 xor $tp1,$tp2 # tp2^tp1
1520 xor $tp4,$acc
1521 mov $acc,$tp4
1522
1523 and \$0x80808080,$acc
1524 mov $acc,$tp8
1525 shr \$7,$tp8
1526 sub $tp8,$acc
1527 lea 0($tp4,$tp4),$tp8
1528 xor $tp1,$tp4 # tp4^tp1
1529 and \$0xfefefefe,$tp8
1530 and \$0x1b1b1b1b,$acc
1531 xor $acc,$tp8
1532
1533 xor $tp8,$tp1 # tp1^tp8
1534 rol \$8,$tp1 # ROTATE(tp1^tp8,8)
1535 xor $tp8,$tp2 # tp2^tp1^tp8
1536 xor $tp8,$tp4 # tp4^tp1^tp8
1537 xor $tp2,$tp8
1538 xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
1539
1540 xor $tp8,$tp1
1541 rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24)
1542 xor $tp2,$tp1
1543 rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16)
1544 xor $tp4,$tp1
1545
1546 mov $tp1,$i($ptr)
966___ 1547___
967} 1548}
968 1549
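The deckey_ref body above transforms one round-key word for the equivalent inverse cipher: it derives tp2, tp4 and tp8 by doubling every byte of the word in GF(2^8) at once, then combines them with byte rotations to apply InvMixColumns (coefficients 0e, 0b, 0d, 09). A minimal sketch of that packed doubling, using the same three masks and nothing beyond core Perl:

    use strict; use warnings;

    sub xtime4 {
        my ($w) = @_;
        my $hi  = $w & 0x80808080;                    # high bit of every byte
        my $dbl = ($w << 1) & 0xfefefefe;             # per-byte left shift
        my $red = ($hi - ($hi >> 7)) & 0x1b1b1b1b;    # 0x1b wherever a byte overflowed
        return $dbl ^ $red;                           # reduce mod x^8+x^4+x^3+x+1
    }

    my $tp1 = 0x80402010;
    my $tp2 = xtime4($tp1);                           # 0x1b804020
    my $tp4 = xtime4($tp2);
    my $tp8 = xtime4($tp4);
    printf "%08x %08x %08x\n", $tp2, $tp4, $tp8;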
@@ -973,19 +1554,23 @@ $code.=<<___;
973.type AES_set_decrypt_key,\@function,3 1554.type AES_set_decrypt_key,\@function,3
974.align 16 1555.align 16
975AES_set_decrypt_key: 1556AES_set_decrypt_key:
976 push %rdx 1557 push %rbx
977 call AES_set_encrypt_key 1558 push %rbp
978 cmp \$0,%eax 1559 push %r12
979 je .Lproceed 1560 push %r13
980 lea 24(%rsp),%rsp 1561 push %r14
981 ret 1562 push %r15
982.Lproceed: 1563 push %rdx # save key schedule
1564.Ldec_key_prologue:
1565
1566 call _x86_64_AES_set_encrypt_key
983 mov (%rsp),%r8 # restore key schedule 1567 mov (%rsp),%r8 # restore key schedule
984 mov %rbx,(%rsp) 1568 cmp \$0,%eax
1569 jne .Labort
985 1570
986 mov 240(%r8),%ecx # pull number of rounds 1571 mov 240(%r8),%r14d # pull number of rounds
987 xor %rdi,%rdi 1572 xor %rdi,%rdi
988 lea (%rdi,%rcx,4),%rcx 1573 lea (%rdi,%r14d,4),%rcx
989 mov %r8,%rsi 1574 mov %r8,%rsi
990 lea (%r8,%rcx,4),%rdi # pointer to last chunk 1575 lea (%r8,%rcx,4),%rdi # pointer to last chunk
991.align 4 1576.align 4
@@ -1003,27 +1588,39 @@ AES_set_decrypt_key:
1003 cmp %rsi,%rdi 1588 cmp %rsi,%rdi
1004 jne .Linvert 1589 jne .Linvert
1005 1590
1006 .picmeup %r9 1591 lea .LAES_Te+2048+1024(%rip),%rax # rcon
1007 lea AES_Td-.(%r9),%rdi
1008 lea AES_Te-AES_Td(%rdi),%r9
1009 1592
1010 mov %r8,%rsi 1593 mov 40(%rax),$mask80
1011 mov 240(%r8),%ecx # pull number of rounds 1594 mov 48(%rax),$maskfe
1012 sub \$1,%ecx 1595 mov 56(%rax),$mask1b
1596
1597 mov %r8,$key
1598 sub \$1,%r14d
1013.align 4 1599.align 4
1014.Lpermute: 1600.Lpermute:
1015 lea 16(%rsi),%rsi 1601 lea 16($key),$key
1602 mov 0($key),%rax
1603 mov 8($key),%rcx
1016___ 1604___
1017 &deckey (0,"%rsi","%r9","%rdi"); 1605 &dectransform ();
1018 &deckey (4,"%rsi","%r9","%rdi");
1019 &deckey (8,"%rsi","%r9","%rdi");
1020 &deckey (12,"%rsi","%r9","%rdi");
1021$code.=<<___; 1606$code.=<<___;
1022 sub \$1,%ecx 1607 mov %eax,0($key)
1608 mov %ebx,4($key)
1609 mov %ecx,8($key)
1610 mov %edx,12($key)
1611 sub \$1,%r14d
1023 jnz .Lpermute 1612 jnz .Lpermute
1024 1613
1025 xor %rax,%rax 1614 xor %rax,%rax
1026 pop %rbx 1615.Labort:
1616 mov 8(%rsp),%r15
1617 mov 16(%rsp),%r14
1618 mov 24(%rsp),%r13
1619 mov 32(%rsp),%r12
1620 mov 40(%rsp),%rbp
1621 mov 48(%rsp),%rbx
1622 add \$56,%rsp
1623.Ldec_key_epilogue:
1027 ret 1624 ret
1028.size AES_set_decrypt_key,.-AES_set_decrypt_key 1625.size AES_set_decrypt_key,.-AES_set_decrypt_key
1029___ 1626___
@@ -1034,47 +1631,59 @@ ___
1034{ 1631{
1035# stack frame layout 1632# stack frame layout
1036# -8(%rsp) return address 1633# -8(%rsp) return address
1037my $_rsp="0(%rsp)"; # saved %rsp 1634my $keyp="0(%rsp)"; # one to pass as $key
1038my $_len="8(%rsp)"; # copy of 3rd parameter, length 1635my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds])
1039my $_key="16(%rsp)"; # copy of 4th parameter, key 1636my $_rsp="16(%rsp)"; # saved %rsp
1040my $_ivp="24(%rsp)"; # copy of 5th parameter, ivp 1637my $_inp="24(%rsp)"; # copy of 1st parameter, inp
1041my $keyp="32(%rsp)"; # one to pass as $key 1638my $_out="32(%rsp)"; # copy of 2nd parameter, out
1042my $ivec="40(%rsp)"; # ivec[16] 1639my $_len="40(%rsp)"; # copy of 3rd parameter, length
1043my $aes_key="56(%rsp)"; # copy of aes_key 1640my $_key="48(%rsp)"; # copy of 4th parameter, key
1044my $mark="56+240(%rsp)"; # copy of aes_key->rounds 1641my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp
1642my $ivec="64(%rsp)"; # ivec[16]
1643my $aes_key="80(%rsp)"; # copy of aes_key
1644my $mark="80+240(%rsp)"; # copy of aes_key->rounds
1045 1645
1046$code.=<<___; 1646$code.=<<___;
1047.globl AES_cbc_encrypt 1647.globl AES_cbc_encrypt
1048.type AES_cbc_encrypt,\@function,6 1648.type AES_cbc_encrypt,\@function,6
1049.align 16 1649.align 16
1650.extern OPENSSL_ia32cap_P
1050AES_cbc_encrypt: 1651AES_cbc_encrypt:
1051 cmp \$0,%rdx # check length 1652 cmp \$0,%rdx # check length
1052 je .Lcbc_just_ret 1653 je .Lcbc_epilogue
1654 pushfq
1053 push %rbx 1655 push %rbx
1054 push %rbp 1656 push %rbp
1055 push %r12 1657 push %r12
1056 push %r13 1658 push %r13
1057 push %r14 1659 push %r14
1058 push %r15 1660 push %r15
1059 pushfq 1661.Lcbc_prologue:
1662
1060 cld 1663 cld
1061 mov %r9d,%r9d # clear upper half of enc 1664 mov %r9d,%r9d # clear upper half of enc
1062 1665
1063 .picmeup $sbox 1666 lea .LAES_Te(%rip),$sbox
1064.Lcbc_pic_point:
1065
1066 cmp \$0,%r9 1667 cmp \$0,%r9
1067 je .LDECRYPT 1668 jne .Lcbc_picked_te
1068 1669 lea .LAES_Td(%rip),$sbox
1069 lea AES_Te-.Lcbc_pic_point($sbox),$sbox 1670.Lcbc_picked_te:
1671
1672 mov OPENSSL_ia32cap_P(%rip),%r10d
1673 cmp \$$speed_limit,%rdx
1674 jb .Lcbc_slow_prologue
1675 test \$15,%rdx
1676 jnz .Lcbc_slow_prologue
1677 bt \$28,%r10d
1678 jc .Lcbc_slow_prologue
1070 1679
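A hedged restatement of the gate above, as a hypothetical helper: the large-table fast CBC body is taken only for inputs of at least $speed_limit bytes (a variable set earlier in this script), a multiple of 16, on CPUs whose OPENSSL_ia32cap_P word does not have bit 28 (hyper-threading) set; everything else falls through to the compact-S-box slow path below.

    use strict; use warnings;

    sub use_fast_cbc {
        my ($len, $ia32cap, $speed_limit) = @_;
        return $len >= $speed_limit          # cmp $speed_limit,%rdx / jb
            && ($len & 15) == 0              # test $15,%rdx / jnz
            && !($ia32cap & (1 << 28));      # bt $28,%r10d / jc
    }

    print use_fast_cbc(4096, 0x0, 512) ? "fast\n" : "slow\n";   # fast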
1071 # allocate aligned stack frame... 1680 # allocate aligned stack frame...
1072 lea -64-248(%rsp),$key 1681 lea -88-248(%rsp),$key
1073 and \$-64,$key 1682 and \$-64,$key
1074 1683
1075 # ... and make it doesn't alias with AES_Te modulo 4096 1684 # ... and make sure it doesn't alias with AES_T[ed] modulo 4096
1076 mov $sbox,%r10 1685 mov $sbox,%r10
1077 lea 2048($sbox),%r11 1686 lea 2304($sbox),%r11
1078 mov $key,%r12 1687 mov $key,%r12
1079 and \$0xFFF,%r10 # s = $sbox&0xfff 1688 and \$0xFFF,%r10 # s = $sbox&0xfff
1080 and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff 1689 and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff
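The three masked values prepared above feed a compare-and-adjust that sits in unchanged context (the same sequence is still visible in the superseded decrypt-side setup further down): the frame is slid downward so that, modulo 4096, it does not overlay the table image, which now spans 2304 bytes (Te plus the Te4 copies, even though the untouched context comment still says 2048). A sketch of that rule as a hypothetical helper; $framesz in the fall-back branch is an assumption, borrowed from the 320 used by the older path:

    use strict; use warnings;

    sub place_frame {
        my ($sbox, $rsp, $framesz) = @_;
        my $key = ($rsp - 88 - 248) & ~63;   # aligned candidate frame, as above
        my $s = $sbox          & 0xfff;      # s = table start, mod 4096
        my $e = ($sbox + 2304) & 0xfff;      # e = table end,   mod 4096
        my $p = $key           & 0xfff;      # p = frame,       mod 4096
        if ($p >= $e) { $key -= $p - $e }                          # land right after the table image
        else          { $key -= (($p - $s) & 0xfff) + $framesz }   # drop clear below its start
        return $key;
    }

    printf "%x\n", place_frame(0x1000, 0x7fffffffe000, 320);   # page offset 0x900, right past the table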
@@ -1094,22 +1703,27 @@ AES_cbc_encrypt:
1094.Lcbc_te_ok: 1703.Lcbc_te_ok:
1095 1704
1096 xchg %rsp,$key 1705 xchg %rsp,$key
1097 add \$8,%rsp # reserve for return address! 1706 #add \$8,%rsp # reserve for return address!
1098 mov $key,$_rsp # save %rsp 1707 mov $key,$_rsp # save %rsp
1708.Lcbc_fast_body:
1709 mov %rdi,$_inp # save copy of inp
1710 mov %rsi,$_out # save copy of out
1099 mov %rdx,$_len # save copy of len 1711 mov %rdx,$_len # save copy of len
1100 mov %rcx,$_key # save copy of key 1712 mov %rcx,$_key # save copy of key
1101 mov %r8,$_ivp # save copy of ivp 1713 mov %r8,$_ivp # save copy of ivp
1102 movl \$0,$mark # copy of aes_key->rounds = 0; 1714 movl \$0,$mark # copy of aes_key->rounds = 0;
1103 mov %r8,%rbp # rearrange input arguments 1715 mov %r8,%rbp # rearrange input arguments
1716 mov %r9,%rbx
1104 mov %rsi,$out 1717 mov %rsi,$out
1105 mov %rdi,$inp 1718 mov %rdi,$inp
1106 mov %rcx,$key 1719 mov %rcx,$key
1107 1720
1721 mov 240($key),%eax # key->rounds
1108 # do we copy key schedule to stack? 1722 # do we copy key schedule to stack?
1109 mov $key,%r10 1723 mov $key,%r10
1110 sub $sbox,%r10 1724 sub $sbox,%r10
1111 and \$0xfff,%r10 1725 and \$0xfff,%r10
1112 cmp \$2048,%r10 1726 cmp \$2304,%r10
1113 jb .Lcbc_do_ecopy 1727 jb .Lcbc_do_ecopy
1114 cmp \$4096-248,%r10 1728 cmp \$4096-248,%r10
1115 jb .Lcbc_skip_ecopy 1729 jb .Lcbc_skip_ecopy
@@ -1120,12 +1734,11 @@ AES_cbc_encrypt:
1120 lea $aes_key,$key 1734 lea $aes_key,$key
1121 mov \$240/8,%ecx 1735 mov \$240/8,%ecx
1122 .long 0x90A548F3 # rep movsq 1736 .long 0x90A548F3 # rep movsq
1123 mov (%rsi),%eax # copy aes_key->rounds 1737 mov %eax,(%rdi) # copy aes_key->rounds
1124 mov %eax,(%rdi)
1125.Lcbc_skip_ecopy: 1738.Lcbc_skip_ecopy:
1126 mov $key,$keyp # save key pointer 1739 mov $key,$keyp # save key pointer
1127 1740
1128 mov \$16,%ecx 1741 mov \$18,%ecx
1129.align 4 1742.align 4
1130.Lcbc_prefetch_te: 1743.Lcbc_prefetch_te:
1131 mov 0($sbox),%r10 1744 mov 0($sbox),%r10
@@ -1135,184 +1748,77 @@ AES_cbc_encrypt:
1135 lea 128($sbox),$sbox 1748 lea 128($sbox),$sbox
1136 sub \$1,%ecx 1749 sub \$1,%ecx
1137 jnz .Lcbc_prefetch_te 1750 jnz .Lcbc_prefetch_te
1138 sub \$2048,$sbox 1751 lea -2304($sbox),$sbox
1139 1752
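A small sketch of what the prefetch loop above walks: 18 passes, four 64-bit loads 32 bytes apart, advancing 128 bytes per pass (the full load pattern is visible in the removed .Lcbc_prefetch_td block further down). That covers the whole 2304-byte Te + Te4 region; the old loop made 16 passes for the 2048-byte Te alone, which is why the counter changes from 16 to 18 and the rebase from 2048 to 2304.

    use strict; use warnings;

    my $touched = 0;
    for my $pass (0 .. 17) {
        $touched = 128 * $pass + $_ + 8 for (0, 32, 64, 96);   # end of the last qword read
    }
    printf "prefetch reaches offset %d of 2304\n", $touched;   # 2280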
1140 test \$-16,%rdx # check upon length 1753 cmp \$0,%rbx
1141 mov %rdx,%r10 1754 je .LFAST_DECRYPT
1755
1756#----------------------------- ENCRYPT -----------------------------#
1142 mov 0(%rbp),$s0 # load iv 1757 mov 0(%rbp),$s0 # load iv
1143 mov 4(%rbp),$s1 1758 mov 4(%rbp),$s1
1144 mov 8(%rbp),$s2 1759 mov 8(%rbp),$s2
1145 mov 12(%rbp),$s3 1760 mov 12(%rbp),$s3
1146 jz .Lcbc_enc_tail # short input...
1147 1761
1148.align 4 1762.align 4
1149.Lcbc_enc_loop: 1763.Lcbc_fast_enc_loop:
1150 xor 0($inp),$s0 1764 xor 0($inp),$s0
1151 xor 4($inp),$s1 1765 xor 4($inp),$s1
1152 xor 8($inp),$s2 1766 xor 8($inp),$s2
1153 xor 12($inp),$s3 1767 xor 12($inp),$s3
1154 mov $inp,$ivec # if ($verticalspin) save inp
1155
1156 mov $keyp,$key # restore key 1768 mov $keyp,$key # restore key
1769 mov $inp,$_inp # if ($verticalspin) save inp
1770
1157 call _x86_64_AES_encrypt 1771 call _x86_64_AES_encrypt
1158 1772
1159 mov $ivec,$inp # if ($verticalspin) restore inp 1773 mov $_inp,$inp # if ($verticalspin) restore inp
1774 mov $_len,%r10
1160 mov $s0,0($out) 1775 mov $s0,0($out)
1161 mov $s1,4($out) 1776 mov $s1,4($out)
1162 mov $s2,8($out) 1777 mov $s2,8($out)
1163 mov $s3,12($out) 1778 mov $s3,12($out)
1164 1779
1165 mov $_len,%r10
1166 lea 16($inp),$inp 1780 lea 16($inp),$inp
1167 lea 16($out),$out 1781 lea 16($out),$out
1168 sub \$16,%r10 1782 sub \$16,%r10
1169 test \$-16,%r10 1783 test \$-16,%r10
1170 mov %r10,$_len 1784 mov %r10,$_len
1171 jnz .Lcbc_enc_loop 1785 jnz .Lcbc_fast_enc_loop
1172 test \$15,%r10
1173 jnz .Lcbc_enc_tail
1174 mov $_ivp,%rbp # restore ivp 1786 mov $_ivp,%rbp # restore ivp
1175 mov $s0,0(%rbp) # save ivec 1787 mov $s0,0(%rbp) # save ivec
1176 mov $s1,4(%rbp) 1788 mov $s1,4(%rbp)
1177 mov $s2,8(%rbp) 1789 mov $s2,8(%rbp)
1178 mov $s3,12(%rbp) 1790 mov $s3,12(%rbp)
1179 1791
1180.align 4 1792 jmp .Lcbc_fast_cleanup
1181.Lcbc_cleanup: 1793
1182 cmpl \$0,$mark # was the key schedule copied?
1183 lea $aes_key,%rdi
1184 mov $_rsp,%rsp
1185 je .Lcbc_exit
1186 mov \$240/8,%ecx
1187 xor %rax,%rax
1188 .long 0x90AB48F3 # rep stosq
1189.Lcbc_exit:
1190 popfq
1191 pop %r15
1192 pop %r14
1193 pop %r13
1194 pop %r12
1195 pop %rbp
1196 pop %rbx
1197.Lcbc_just_ret:
1198 ret
1199.align 4
1200.Lcbc_enc_tail:
1201 mov %rax,%r11
1202 mov %rcx,%r12
1203 mov %r10,%rcx
1204 mov $inp,%rsi
1205 mov $out,%rdi
1206 .long 0xF689A4F3 # rep movsb
1207 mov \$16,%rcx # zero tail
1208 sub %r10,%rcx
1209 xor %rax,%rax
1210 .long 0xF689AAF3 # rep stosb
1211 mov $out,$inp # this is not a mistake!
1212 movq \$16,$_len # len=16
1213 mov %r11,%rax
1214 mov %r12,%rcx
1215 jmp .Lcbc_enc_loop # one more spin...
1216#----------------------------- DECRYPT -----------------------------# 1794#----------------------------- DECRYPT -----------------------------#
1217.align 16 1795.align 16
1218.LDECRYPT: 1796.LFAST_DECRYPT:
1219 lea AES_Td-.Lcbc_pic_point($sbox),$sbox
1220
1221 # allocate aligned stack frame...
1222 lea -64-248(%rsp),$key
1223 and \$-64,$key
1224
1225 # ... and make it doesn't alias with AES_Td modulo 4096
1226 mov $sbox,%r10
1227 lea 2304($sbox),%r11
1228 mov $key,%r12
1229 and \$0xFFF,%r10 # s = $sbox&0xfff
1230 and \$0xFFF,%r11 # e = ($sbox+2048+256)&0xfff
1231 and \$0xFFF,%r12 # p = %rsp&0xfff
1232
1233 cmp %r11,%r12 # if (p=>e) %rsp =- (p-e);
1234 jb .Lcbc_td_break_out
1235 sub %r11,%r12
1236 sub %r12,$key
1237 jmp .Lcbc_td_ok
1238.Lcbc_td_break_out: # else %rsp -= (p-s)&0xfff + framesz
1239 sub %r10,%r12
1240 and \$0xFFF,%r12
1241 add \$320,%r12
1242 sub %r12,$key
1243.align 4
1244.Lcbc_td_ok:
1245
1246 xchg %rsp,$key
1247 add \$8,%rsp # reserve for return address!
1248 mov $key,$_rsp # save %rsp
1249 mov %rdx,$_len # save copy of len
1250 mov %rcx,$_key # save copy of key
1251 mov %r8,$_ivp # save copy of ivp
1252 movl \$0,$mark # copy of aes_key->rounds = 0;
1253 mov %r8,%rbp # rearrange input arguments
1254 mov %rsi,$out
1255 mov %rdi,$inp
1256 mov %rcx,$key
1257
1258 # do we copy key schedule to stack?
1259 mov $key,%r10
1260 sub $sbox,%r10
1261 and \$0xfff,%r10
1262 cmp \$2304,%r10
1263 jb .Lcbc_do_dcopy
1264 cmp \$4096-248,%r10
1265 jb .Lcbc_skip_dcopy
1266.align 4
1267.Lcbc_do_dcopy:
1268 mov $key,%rsi
1269 lea $aes_key,%rdi
1270 lea $aes_key,$key
1271 mov \$240/8,%ecx
1272 .long 0x90A548F3 # rep movsq
1273 mov (%rsi),%eax # copy aes_key->rounds
1274 mov %eax,(%rdi)
1275.Lcbc_skip_dcopy:
1276 mov $key,$keyp # save key pointer
1277
1278 mov \$18,%ecx
1279.align 4
1280.Lcbc_prefetch_td:
1281 mov 0($sbox),%r10
1282 mov 32($sbox),%r11
1283 mov 64($sbox),%r12
1284 mov 96($sbox),%r13
1285 lea 128($sbox),$sbox
1286 sub \$1,%ecx
1287 jnz .Lcbc_prefetch_td
1288 sub \$2304,$sbox
1289
1290 cmp $inp,$out 1797 cmp $inp,$out
1291 je .Lcbc_dec_in_place 1798 je .Lcbc_fast_dec_in_place
1292 1799
1293 mov %rbp,$ivec 1800 mov %rbp,$ivec
1294.align 4 1801.align 4
1295.Lcbc_dec_loop: 1802.Lcbc_fast_dec_loop:
1296 mov 0($inp),$s0 # read input 1803 mov 0($inp),$s0 # read input
1297 mov 4($inp),$s1 1804 mov 4($inp),$s1
1298 mov 8($inp),$s2 1805 mov 8($inp),$s2
1299 mov 12($inp),$s3 1806 mov 12($inp),$s3
1300 mov $inp,8+$ivec # if ($verticalspin) save inp
1301
1302 mov $keyp,$key # restore key 1807 mov $keyp,$key # restore key
1808 mov $inp,$_inp # if ($verticalspin) save inp
1809
1303 call _x86_64_AES_decrypt 1810 call _x86_64_AES_decrypt
1304 1811
1305 mov $ivec,%rbp # load ivp 1812 mov $ivec,%rbp # load ivp
1306 mov 8+$ivec,$inp # if ($verticalspin) restore inp 1813 mov $_inp,$inp # if ($verticalspin) restore inp
1814 mov $_len,%r10 # load len
1307 xor 0(%rbp),$s0 # xor iv 1815 xor 0(%rbp),$s0 # xor iv
1308 xor 4(%rbp),$s1 1816 xor 4(%rbp),$s1
1309 xor 8(%rbp),$s2 1817 xor 8(%rbp),$s2
1310 xor 12(%rbp),$s3 1818 xor 12(%rbp),$s3
1311 mov $inp,%rbp # current input, next iv 1819 mov $inp,%rbp # current input, next iv
1312 1820
1313 mov $_len,%r10 # load len
1314 sub \$16,%r10 1821 sub \$16,%r10
1315 jc .Lcbc_dec_partial
1316 mov %r10,$_len # update len 1822 mov %r10,$_len # update len
1317 mov %rbp,$ivec # update ivp 1823 mov %rbp,$ivec # update ivp
1318 1824
@@ -1323,81 +1829,281 @@ AES_cbc_encrypt:
1323 1829
1324 lea 16($inp),$inp 1830 lea 16($inp),$inp
1325 lea 16($out),$out 1831 lea 16($out),$out
1326 jnz .Lcbc_dec_loop 1832 jnz .Lcbc_fast_dec_loop
1327.Lcbc_dec_end:
1328 mov $_ivp,%r12 # load user ivp 1833 mov $_ivp,%r12 # load user ivp
1329 mov 0(%rbp),%r10 # load iv 1834 mov 0(%rbp),%r10 # load iv
1330 mov 8(%rbp),%r11 1835 mov 8(%rbp),%r11
1331 mov %r10,0(%r12) # copy back to user 1836 mov %r10,0(%r12) # copy back to user
1332 mov %r11,8(%r12) 1837 mov %r11,8(%r12)
1333 jmp .Lcbc_cleanup 1838 jmp .Lcbc_fast_cleanup
1334
1335.align 4
1336.Lcbc_dec_partial:
1337 mov $s0,0+$ivec # dump output to stack
1338 mov $s1,4+$ivec
1339 mov $s2,8+$ivec
1340 mov $s3,12+$ivec
1341 mov $out,%rdi
1342 lea $ivec,%rsi
1343 mov \$16,%rcx
1344 add %r10,%rcx # number of bytes to copy
1345 .long 0xF689A4F3 # rep movsb
1346 jmp .Lcbc_dec_end
1347 1839
1348.align 16 1840.align 16
1349.Lcbc_dec_in_place: 1841.Lcbc_fast_dec_in_place:
1842 mov 0(%rbp),%r10 # copy iv to stack
1843 mov 8(%rbp),%r11
1844 mov %r10,0+$ivec
1845 mov %r11,8+$ivec
1846.align 4
1847.Lcbc_fast_dec_in_place_loop:
1350 mov 0($inp),$s0 # load input 1848 mov 0($inp),$s0 # load input
1351 mov 4($inp),$s1 1849 mov 4($inp),$s1
1352 mov 8($inp),$s2 1850 mov 8($inp),$s2
1353 mov 12($inp),$s3 1851 mov 12($inp),$s3
1852 mov $keyp,$key # restore key
1853 mov $inp,$_inp # if ($verticalspin) save inp
1354 1854
1355 mov $inp,$ivec # if ($verticalspin) save inp
1356 mov $keyp,$key
1357 call _x86_64_AES_decrypt 1855 call _x86_64_AES_decrypt
1358 1856
1359 mov $ivec,$inp # if ($verticalspin) restore inp 1857 mov $_inp,$inp # if ($verticalspin) restore inp
1360 mov $_ivp,%rbp 1858 mov $_len,%r10
1361 xor 0(%rbp),$s0 1859 xor 0+$ivec,$s0
1362 xor 4(%rbp),$s1 1860 xor 4+$ivec,$s1
1363 xor 8(%rbp),$s2 1861 xor 8+$ivec,$s2
1364 xor 12(%rbp),$s3 1862 xor 12+$ivec,$s3
1863
1864 mov 0($inp),%r11 # load input
1865 mov 8($inp),%r12
1866 sub \$16,%r10
1867 jz .Lcbc_fast_dec_in_place_done
1365 1868
1366 mov 0($inp),%r10 # copy input to iv 1869 mov %r11,0+$ivec # copy input to iv
1367 mov 8($inp),%r11 1870 mov %r12,8+$ivec
1368 mov %r10,0(%rbp)
1369 mov %r11,8(%rbp)
1370 1871
1371 mov $s0,0($out) # save output [zaps input] 1872 mov $s0,0($out) # save output [zaps input]
1372 mov $s1,4($out) 1873 mov $s1,4($out)
1373 mov $s2,8($out) 1874 mov $s2,8($out)
1374 mov $s3,12($out) 1875 mov $s3,12($out)
1375 1876
1376 mov $_len,%rcx
1377 lea 16($inp),$inp 1877 lea 16($inp),$inp
1378 lea 16($out),$out 1878 lea 16($out),$out
1379 sub \$16,%rcx 1879 mov %r10,$_len
1380 jc .Lcbc_dec_in_place_partial 1880 jmp .Lcbc_fast_dec_in_place_loop
1381 mov %rcx,$_len 1881.Lcbc_fast_dec_in_place_done:
1382 jnz .Lcbc_dec_in_place 1882 mov $_ivp,%rdi
1383 jmp .Lcbc_cleanup 1883 mov %r11,0(%rdi) # copy iv back to user
1884 mov %r12,8(%rdi)
1885
1886 mov $s0,0($out) # save output [zaps input]
1887 mov $s1,4($out)
1888 mov $s2,8($out)
1889 mov $s3,12($out)
1384 1890
1385.align 4 1891.align 4
1386.Lcbc_dec_in_place_partial: 1892.Lcbc_fast_cleanup:
1387 # one can argue if this is actually required 1893 cmpl \$0,$mark # was the key schedule copied?
1388 lea ($out,%rcx),%rdi 1894 lea $aes_key,%rdi
1389 lea (%rbp,%rcx),%rsi 1895 je .Lcbc_exit
1390 neg %rcx 1896 mov \$240/8,%ecx
1391 .long 0xF689A4F3 # rep movsb # restore tail 1897 xor %rax,%rax
1392 jmp .Lcbc_cleanup 1898 .long 0x90AB48F3 # rep stosq
1899
1900 jmp .Lcbc_exit
1901
1902#--------------------------- SLOW ROUTINE ---------------------------#
1903.align 16
1904.Lcbc_slow_prologue:
1905 # allocate aligned stack frame...
1906 lea -88(%rsp),%rbp
1907 and \$-64,%rbp
1908 # ... just "above" key schedule
1909 lea -88-63(%rcx),%r10
1910 sub %rbp,%r10
1911 neg %r10
1912 and \$0x3c0,%r10
1913 sub %r10,%rbp
1914
1915 xchg %rsp,%rbp
1916 #add \$8,%rsp # reserve for return address!
1917 mov %rbp,$_rsp # save %rsp
1918.Lcbc_slow_body:
1919 #mov %rdi,$_inp # save copy of inp
1920 #mov %rsi,$_out # save copy of out
1921 #mov %rdx,$_len # save copy of len
1922 #mov %rcx,$_key # save copy of key
1923 mov %r8,$_ivp # save copy of ivp
1924 mov %r8,%rbp # rearrange input arguments
1925 mov %r9,%rbx
1926 mov %rsi,$out
1927 mov %rdi,$inp
1928 mov %rcx,$key
1929 mov %rdx,%r10
1930
1931 mov 240($key),%eax
1932 mov $key,$keyp # save key pointer
1933 shl \$4,%eax
1934 lea ($key,%rax),%rax
1935 mov %rax,$keyend
1936
1937	# pick Te4 copy which can't "overlap" with stack frame or key schedule
1938 lea 2048($sbox),$sbox
1939 lea 768-8(%rsp),%rax
1940 sub $sbox,%rax
1941 and \$0x300,%rax
1942 lea ($sbox,%rax),$sbox
1943
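The four instructions above pick one of the four back-to-back 256-byte Te4 copies that start 2048 bytes into the table, using bits 8 and 9 of the distance to the stack frame, so the compact S-box and the frame/key schedule are less likely to contend for the same L1 lines. (On the decrypt side, .LSLOW_DECRYPT later adds bias/8 on top, the "magic constants" it recalls, because each Td4 copy is followed by a 32-byte mask row and the stride is therefore 288 rather than 256.) A sketch of the selection as a hypothetical helper:

    use strict; use warnings;

    sub pick_compact_sbox {
        my ($table, $rsp) = @_;
        my $sbox = $table + 2048;                       # first compact copy
        my $bias = (($rsp + 768 - 8) - $sbox) & 0x300;  # 0, 256, 512 or 768
        return $sbox + $bias;                           # one of the four copies
    }

    printf "%x\n", pick_compact_sbox(0x400000, 0x7fffffffe000);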
1944 cmp \$0,%rbx
1945 je .LSLOW_DECRYPT
1946
1947#--------------------------- SLOW ENCRYPT ---------------------------#
1948 test \$-16,%r10 # check upon length
1949 mov 0(%rbp),$s0 # load iv
1950 mov 4(%rbp),$s1
1951 mov 8(%rbp),$s2
1952 mov 12(%rbp),$s3
1953 jz .Lcbc_slow_enc_tail # short input...
1954
1955.align 4
1956.Lcbc_slow_enc_loop:
1957 xor 0($inp),$s0
1958 xor 4($inp),$s1
1959 xor 8($inp),$s2
1960 xor 12($inp),$s3
1961 mov $keyp,$key # restore key
1962 mov $inp,$_inp # save inp
1963 mov $out,$_out # save out
1964 mov %r10,$_len # save len
1965
1966 call _x86_64_AES_encrypt_compact
1967
1968 mov $_inp,$inp # restore inp
1969 mov $_out,$out # restore out
1970 mov $_len,%r10 # restore len
1971 mov $s0,0($out)
1972 mov $s1,4($out)
1973 mov $s2,8($out)
1974 mov $s3,12($out)
1975
1976 lea 16($inp),$inp
1977 lea 16($out),$out
1978 sub \$16,%r10
1979 test \$-16,%r10
1980 jnz .Lcbc_slow_enc_loop
1981 test \$15,%r10
1982 jnz .Lcbc_slow_enc_tail
1983 mov $_ivp,%rbp # restore ivp
1984 mov $s0,0(%rbp) # save ivec
1985 mov $s1,4(%rbp)
1986 mov $s2,8(%rbp)
1987 mov $s3,12(%rbp)
1988
1989 jmp .Lcbc_exit
1990
1991.align 4
1992.Lcbc_slow_enc_tail:
1993 mov %rax,%r11
1994 mov %rcx,%r12
1995 mov %r10,%rcx
1996 mov $inp,%rsi
1997 mov $out,%rdi
1998 .long 0x9066A4F3 # rep movsb
1999 mov \$16,%rcx # zero tail
2000 sub %r10,%rcx
2001 xor %rax,%rax
2002 .long 0x9066AAF3 # rep stosb
2003 mov $out,$inp # this is not a mistake!
2004 mov \$16,%r10 # len=16
2005 mov %r11,%rax
2006 mov %r12,%rcx
2007 jmp .Lcbc_slow_enc_loop # one more spin...
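A minimal sketch (a hypothetical helper working on Perl strings rather than the raw rep movsb/stosb above) of the ragged-tail handling: the final partial block is copied into the output buffer, zero-padded to 16 bytes (not PKCS#7 padded), the input pointer is pointed at that buffer, and the loop is run one more time with the length forced to 16.

    use strict; use warnings;

    sub cbc_enc_tail {
        my ($tail, $rem) = @_;                            # $rem = 1..15 leftover bytes
        my $block = substr($tail, 0, $rem) . "\0" x (16 - $rem);
        return ($block, 16);                              # last input block, len = 16
    }

    my ($block, $len) = cbc_enc_tail("abc", 3);
    printf "%s len=%d\n", unpack("H32", $block), $len;    # 616263 then zero padding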
2008#--------------------------- SLOW DECRYPT ---------------------------#
2009.align 16
2010.LSLOW_DECRYPT:
2011 shr \$3,%rax
2012 add %rax,$sbox # recall "magic" constants!
2013
2014 mov 0(%rbp),%r11 # copy iv to stack
2015 mov 8(%rbp),%r12
2016 mov %r11,0+$ivec
2017 mov %r12,8+$ivec
2018
2019.align 4
2020.Lcbc_slow_dec_loop:
2021 mov 0($inp),$s0 # load input
2022 mov 4($inp),$s1
2023 mov 8($inp),$s2
2024 mov 12($inp),$s3
2025 mov $keyp,$key # restore key
2026 mov $inp,$_inp # save inp
2027 mov $out,$_out # save out
2028 mov %r10,$_len # save len
2029
2030 call _x86_64_AES_decrypt_compact
2031
2032 mov $_inp,$inp # restore inp
2033 mov $_out,$out # restore out
2034 mov $_len,%r10
2035 xor 0+$ivec,$s0
2036 xor 4+$ivec,$s1
2037 xor 8+$ivec,$s2
2038 xor 12+$ivec,$s3
2039
2040 mov 0($inp),%r11 # load input
2041 mov 8($inp),%r12
2042 sub \$16,%r10
2043 jc .Lcbc_slow_dec_partial
2044 jz .Lcbc_slow_dec_done
2045
2046 mov %r11,0+$ivec # copy input to iv
2047 mov %r12,8+$ivec
2048
2049 mov $s0,0($out) # save output [can zap input]
2050 mov $s1,4($out)
2051 mov $s2,8($out)
2052 mov $s3,12($out)
2053
2054 lea 16($inp),$inp
2055 lea 16($out),$out
2056 jmp .Lcbc_slow_dec_loop
2057.Lcbc_slow_dec_done:
2058 mov $_ivp,%rdi
2059 mov %r11,0(%rdi) # copy iv back to user
2060 mov %r12,8(%rdi)
2061
2062 mov $s0,0($out) # save output [can zap input]
2063 mov $s1,4($out)
2064 mov $s2,8($out)
2065 mov $s3,12($out)
2066
2067 jmp .Lcbc_exit
2068
2069.align 4
2070.Lcbc_slow_dec_partial:
2071 mov $_ivp,%rdi
2072 mov %r11,0(%rdi) # copy iv back to user
2073 mov %r12,8(%rdi)
2074
2075 mov $s0,0+$ivec # save output to stack
2076 mov $s1,4+$ivec
2077 mov $s2,8+$ivec
2078 mov $s3,12+$ivec
2079
2080 mov $out,%rdi
2081 lea $ivec,%rsi
2082 lea 16(%r10),%rcx
2083 .long 0x9066A4F3 # rep movsb
2084 jmp .Lcbc_exit
2085
2086.align 16
2087.Lcbc_exit:
2088 mov $_rsp,%rsi
2089 mov (%rsi),%r15
2090 mov 8(%rsi),%r14
2091 mov 16(%rsi),%r13
2092 mov 24(%rsi),%r12
2093 mov 32(%rsi),%rbp
2094 mov 40(%rsi),%rbx
2095 lea 48(%rsi),%rsp
2096.Lcbc_popfq:
2097 popfq
2098.Lcbc_epilogue:
2099 ret
1393.size AES_cbc_encrypt,.-AES_cbc_encrypt 2100.size AES_cbc_encrypt,.-AES_cbc_encrypt
1394___ 2101___
1395} 2102}
1396 2103
1397$code.=<<___; 2104$code.=<<___;
1398.globl AES_Te
1399.align 64 2105.align 64
1400AES_Te: 2106.LAES_Te:
1401___ 2107___
1402 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); 2108 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
1403 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); 2109 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
@@ -1463,16 +2169,149 @@ ___
1463 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0); 2169 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
1464 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e); 2170 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
1465 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c); 2171 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
2172
2173#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
2174 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2175 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2176 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2177 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2178 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2179 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2180 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2181 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2182 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2183 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2184 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2185 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2186 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2187 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2188 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2189 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2190 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2191 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2192 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2193 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2194 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2195 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2196 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2197 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2198 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2199 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2200 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2201 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2202 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2203 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2204 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2205 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2206
2207 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2208 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2209 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2210 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2211 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2212 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2213 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2214 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2215 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2216 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2217 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2218 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2219 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2220 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2221 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2222 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2223 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2224 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2225 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2226 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2227 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2228 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2229 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2230 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2231 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2232 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2233 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2234 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2235 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2236 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2237 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2238 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2239
2240 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2241 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2242 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2243 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2244 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2245 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2246 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2247 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2248 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2249 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2250 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2251 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2252 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2253 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2254 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2255 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2256 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2257 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2258 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2259 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2260 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2261 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2262 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2263 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2264 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2265 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2266 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2267 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2268 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2269 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2270 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2271 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2272
2273 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2274 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2275 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2276 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2277 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2278 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2279 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2280 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2281 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2282 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2283 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2284 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2285 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2286 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2287 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2288 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2289 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2290 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2291 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2292 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2293 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2294 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2295 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2296 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2297 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2298 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2299 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2300 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2301 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2302 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2303 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2304 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1466#rcon: 2305#rcon:
1467$code.=<<___; 2306$code.=<<___;
1468 .long 0x00000001, 0x00000002, 0x00000004, 0x00000008 2307 .long 0x00000001, 0x00000002, 0x00000004, 0x00000008
1469 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080 2308 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080
1470 .long 0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0 2309 .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
2310 .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
1471___ 2311___
1472$code.=<<___; 2312$code.=<<___;
1473.globl AES_Td
1474.align 64 2313.align 64
1475AES_Td: 2314.LAES_Td:
1476___ 2315___
1477 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); 2316 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
1478 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); 2317 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
@@ -1538,7 +2377,116 @@ ___
1538 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff); 2377 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
1539 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664); 2378 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
1540 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0); 2379 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
1541#Td4: 2380
2381#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
2382 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2383 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2384 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2385 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2386 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2387 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2388 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2389 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2390 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2391 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2392 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2393 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2394 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2395 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2396 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2397 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2398 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2399 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2400 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2401 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2402 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2403 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2404 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2405 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2406 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2407 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2408 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2409 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2410 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2411 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2412 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2413 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2414$code.=<<___;
2415 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2416 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2417___
2418 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2419 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2420 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2421 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2422 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2423 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2424 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2425 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2426 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2427 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2428 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2429 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2430 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2431 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2432 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2433 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2434 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2435 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2436 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2437 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2438 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2439 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2440 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2441 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2442 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2443 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2444 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2445 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2446 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2447 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2448 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2449 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2450$code.=<<___;
2451 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2452 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2453___
2454 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2455 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2456 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2457 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2458 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2459 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2460 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2461 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2462 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2463 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2464 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2465 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2466 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2467 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2468 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2469 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2470 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2471 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2472 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2473 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2474 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2475 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2476 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2477 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2478 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2479 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2480 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2481 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2482 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2483 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2484 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2485 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2486$code.=<<___;
2487 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2488 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2489___
1542 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38); 2490 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1543 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb); 2491 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1544 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87); 2492 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
@@ -1571,6 +2519,288 @@ ___
1571 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61); 2519 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1572 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26); 2520 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1573 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d); 2521 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2522$code.=<<___;
2523 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2524 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2525.asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2526.align 64
2527___
2528
2529# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2530# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2531if ($win64) {
2532$rec="%rcx";
2533$frame="%rdx";
2534$context="%r8";
2535$disp="%r9";
2536
2537$code.=<<___;
2538.extern __imp_RtlVirtualUnwind
2539.type block_se_handler,\@abi-omnipotent
2540.align 16
2541block_se_handler:
2542 push %rsi
2543 push %rdi
2544 push %rbx
2545 push %rbp
2546 push %r12
2547 push %r13
2548 push %r14
2549 push %r15
2550 pushfq
2551 sub \$64,%rsp
2552
2553 mov 120($context),%rax # pull context->Rax
2554 mov 248($context),%rbx # pull context->Rip
2555
2556 mov 8($disp),%rsi # disp->ImageBase
2557 mov 56($disp),%r11 # disp->HandlerData
2558
2559 mov 0(%r11),%r10d # HandlerData[0]
2560 lea (%rsi,%r10),%r10 # prologue label
2561 cmp %r10,%rbx # context->Rip<prologue label
2562 jb .Lin_block_prologue
2563
2564 mov 152($context),%rax # pull context->Rsp
2565
2566 mov 4(%r11),%r10d # HandlerData[1]
2567 lea (%rsi,%r10),%r10 # epilogue label
2568 cmp %r10,%rbx # context->Rip>=epilogue label
2569 jae .Lin_block_prologue
2570
2571 mov 24(%rax),%rax # pull saved real stack pointer
2572 lea 48(%rax),%rax # adjust...
2573
2574 mov -8(%rax),%rbx
2575 mov -16(%rax),%rbp
2576 mov -24(%rax),%r12
2577 mov -32(%rax),%r13
2578 mov -40(%rax),%r14
2579 mov -48(%rax),%r15
2580 mov %rbx,144($context) # restore context->Rbx
2581 mov %rbp,160($context) # restore context->Rbp
2582 mov %r12,216($context) # restore context->R12
2583 mov %r13,224($context) # restore context->R13
2584 mov %r14,232($context) # restore context->R14
2585 mov %r15,240($context) # restore context->R15
2586
2587.Lin_block_prologue:
2588 mov 8(%rax),%rdi
2589 mov 16(%rax),%rsi
2590 mov %rax,152($context) # restore context->Rsp
2591 mov %rsi,168($context) # restore context->Rsi
2592 mov %rdi,176($context) # restore context->Rdi
2593
2594 jmp .Lcommon_seh_exit
2595.size block_se_handler,.-block_se_handler
2596
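A rough sketch, as a hypothetical helper, of the test that block_se_handler above (and key_se_handler below) performs: the saved registers are restored from the frame only when the faulting RIP lies between the prologue and epilogue labels recorded in HandlerData[], i.e. while the frame actually exists.

    use strict; use warnings;

    sub needs_register_restore {
        my ($rip, $image_base, $prologue_rva, $epilogue_rva) = @_;
        return ($rip >= $image_base + $prologue_rva)     # jb .Lin_*_prologue otherwise
            && ($rip <  $image_base + $epilogue_rva);    # jae .Lin_*_prologue otherwise
    }

    print needs_register_restore(0x140001020, 0x140000000, 0x1000, 0x1100)
        ? "restore saved registers\n" : "leave context as-is\n";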
2597.type key_se_handler,\@abi-omnipotent
2598.align 16
2599key_se_handler:
2600 push %rsi
2601 push %rdi
2602 push %rbx
2603 push %rbp
2604 push %r12
2605 push %r13
2606 push %r14
2607 push %r15
2608 pushfq
2609 sub \$64,%rsp
2610
2611 mov 120($context),%rax # pull context->Rax
2612 mov 248($context),%rbx # pull context->Rip
2613
2614 mov 8($disp),%rsi # disp->ImageBase
2615 mov 56($disp),%r11 # disp->HandlerData
2616
2617 mov 0(%r11),%r10d # HandlerData[0]
2618 lea (%rsi,%r10),%r10 # prologue label
2619 cmp %r10,%rbx # context->Rip<prologue label
2620 jb .Lin_key_prologue
2621
2622 mov 152($context),%rax # pull context->Rsp
2623
2624 mov 4(%r11),%r10d # HandlerData[1]
2625 lea (%rsi,%r10),%r10 # epilogue label
2626 cmp %r10,%rbx # context->Rip>=epilogue label
2627 jae .Lin_key_prologue
2628
2629 lea 56(%rax),%rax
2630
2631 mov -8(%rax),%rbx
2632 mov -16(%rax),%rbp
2633 mov -24(%rax),%r12
2634 mov -32(%rax),%r13
2635 mov -40(%rax),%r14
2636 mov -48(%rax),%r15
2637 mov %rbx,144($context) # restore context->Rbx
2638 mov %rbp,160($context) # restore context->Rbp
2639 mov %r12,216($context) # restore context->R12
2640 mov %r13,224($context) # restore context->R13
2641 mov %r14,232($context) # restore context->R14
2642 mov %r15,240($context) # restore context->R15
2643
2644.Lin_key_prologue:
2645 mov 8(%rax),%rdi
2646 mov 16(%rax),%rsi
2647 mov %rax,152($context) # restore context->Rsp
2648 mov %rsi,168($context) # restore context->Rsi
2649 mov %rdi,176($context) # restore context->Rdi
2650
2651 jmp .Lcommon_seh_exit
2652.size key_se_handler,.-key_se_handler
2653
2654.type cbc_se_handler,\@abi-omnipotent
2655.align 16
2656cbc_se_handler:
2657 push %rsi
2658 push %rdi
2659 push %rbx
2660 push %rbp
2661 push %r12
2662 push %r13
2663 push %r14
2664 push %r15
2665 pushfq
2666 sub \$64,%rsp
2667
2668 mov 120($context),%rax # pull context->Rax
2669 mov 248($context),%rbx # pull context->Rip
2670
2671 lea .Lcbc_prologue(%rip),%r10
2672 cmp %r10,%rbx # context->Rip<.Lcbc_prologue
2673 jb .Lin_cbc_prologue
2674
2675 lea .Lcbc_fast_body(%rip),%r10
2676 cmp %r10,%rbx # context->Rip<.Lcbc_fast_body
2677 jb .Lin_cbc_frame_setup
2678
2679 lea .Lcbc_slow_prologue(%rip),%r10
2680 cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue
2681 jb .Lin_cbc_body
2682
2683 lea .Lcbc_slow_body(%rip),%r10
2684 cmp %r10,%rbx # context->Rip<.Lcbc_slow_body
2685 jb .Lin_cbc_frame_setup
2686
2687.Lin_cbc_body:
2688 mov 152($context),%rax # pull context->Rsp
2689
2690 lea .Lcbc_epilogue(%rip),%r10
2691 cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue
2692 jae .Lin_cbc_prologue
2693
2694 lea 8(%rax),%rax
2695
2696 lea .Lcbc_popfq(%rip),%r10
2697 cmp %r10,%rbx # context->Rip>=.Lcbc_popfq
2698 jae .Lin_cbc_prologue
2699
2700 mov `16-8`(%rax),%rax # biased $_rsp
2701 lea 56(%rax),%rax
2702
2703.Lin_cbc_frame_setup:
2704 mov -16(%rax),%rbx
2705 mov -24(%rax),%rbp
2706 mov -32(%rax),%r12
2707 mov -40(%rax),%r13
2708 mov -48(%rax),%r14
2709 mov -56(%rax),%r15
2710 mov %rbx,144($context) # restore context->Rbx
2711 mov %rbp,160($context) # restore context->Rbp
2712 mov %r12,216($context) # restore context->R12
2713 mov %r13,224($context) # restore context->R13
2714 mov %r14,232($context) # restore context->R14
2715 mov %r15,240($context) # restore context->R15
2716
2717.Lin_cbc_prologue:
2718 mov 8(%rax),%rdi
2719 mov 16(%rax),%rsi
2720 mov %rax,152($context) # restore context->Rsp
2721 mov %rsi,168($context) # restore context->Rsi
2722 mov %rdi,176($context) # restore context->Rdi
2723
2724.Lcommon_seh_exit:
2725
2726 mov 40($disp),%rdi # disp->ContextRecord
2727 mov $context,%rsi # context
2728 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2729 .long 0xa548f3fc # cld; rep movsq
2730
2731 mov $disp,%rsi
2732 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2733 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2734 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2735 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2736 mov 40(%rsi),%r10 # disp->ContextRecord
2737 lea 56(%rsi),%r11 # &disp->HandlerData
2738 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2739 mov %r10,32(%rsp) # arg5
2740 mov %r11,40(%rsp) # arg6
2741 mov %r12,48(%rsp) # arg7
2742 mov %rcx,56(%rsp) # arg8, (NULL)
2743 call *__imp_RtlVirtualUnwind(%rip)
2744
2745 mov \$1,%eax # ExceptionContinueSearch
2746 add \$64,%rsp
2747 popfq
2748 pop %r15
2749 pop %r14
2750 pop %r13
2751 pop %r12
2752 pop %rbp
2753 pop %rbx
2754 pop %rdi
2755 pop %rsi
2756 ret
2757.size cbc_se_handler,.-cbc_se_handler
2758
2759.section .pdata
2760.align 4
2761 .rva .LSEH_begin_AES_encrypt
2762 .rva .LSEH_end_AES_encrypt
2763 .rva .LSEH_info_AES_encrypt
2764
2765 .rva .LSEH_begin_AES_decrypt
2766 .rva .LSEH_end_AES_decrypt
2767 .rva .LSEH_info_AES_decrypt
2768
2769 .rva .LSEH_begin_AES_set_encrypt_key
2770 .rva .LSEH_end_AES_set_encrypt_key
2771 .rva .LSEH_info_AES_set_encrypt_key
2772
2773 .rva .LSEH_begin_AES_set_decrypt_key
2774 .rva .LSEH_end_AES_set_decrypt_key
2775 .rva .LSEH_info_AES_set_decrypt_key
2776
2777 .rva .LSEH_begin_AES_cbc_encrypt
2778 .rva .LSEH_end_AES_cbc_encrypt
2779 .rva .LSEH_info_AES_cbc_encrypt
2780
2781.section .xdata
2782.align 8
2783.LSEH_info_AES_encrypt:
2784 .byte 9,0,0,0
2785 .rva block_se_handler
2786 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
2787.LSEH_info_AES_decrypt:
2788 .byte 9,0,0,0
2789 .rva block_se_handler
2790 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
2791.LSEH_info_AES_set_encrypt_key:
2792 .byte 9,0,0,0
2793 .rva key_se_handler
2794 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
2795.LSEH_info_AES_set_decrypt_key:
2796 .byte 9,0,0,0
2797 .rva key_se_handler
2798 .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
2799.LSEH_info_AES_cbc_encrypt:
2800 .byte 9,0,0,0
2801 .rva cbc_se_handler
2802___
2803}
1574 2804
1575$code =~ s/\`([^\`]*)\`/eval($1)/gem; 2805$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1576 2806