Diffstat (limited to 'src/lib/libcrypto/aes/asm')
-rw-r--r--	src/lib/libcrypto/aes/asm/aes-586.pl	2980
-rw-r--r--	src/lib/libcrypto/aes/asm/aes-armv4.pl	1134
-rw-r--r--	src/lib/libcrypto/aes/asm/aes-ia64.S	1123
-rw-r--r--	src/lib/libcrypto/aes/asm/aes-mips.pl	1611
-rw-r--r--	src/lib/libcrypto/aes/asm/aes-parisc.pl	1021
-rw-r--r--	src/lib/libcrypto/aes/asm/aes-ppc.pl	1365
-rw-r--r--	src/lib/libcrypto/aes/asm/aes-s390x.pl	2254
-rwxr-xr-x	src/lib/libcrypto/aes/asm/aes-sparcv9.pl	1182
-rwxr-xr-x	src/lib/libcrypto/aes/asm/aes-x86_64.pl	2809
-rw-r--r--	src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl	1249
-rw-r--r--	src/lib/libcrypto/aes/asm/aesni-x86.pl	2189
-rw-r--r--	src/lib/libcrypto/aes/asm/aesni-x86_64.pl	992
-rw-r--r--	src/lib/libcrypto/aes/asm/bsaes-x86_64.pl	3044
-rw-r--r--	src/lib/libcrypto/aes/asm/vpaes-x86.pl	903
-rw-r--r--	src/lib/libcrypto/aes/asm/vpaes-x86_64.pl	1206
15 files changed, 25062 insertions, 0 deletions
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl
new file mode 100644
index 0000000000..aab40e6f1c
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-586.pl
@@ -0,0 +1,2980 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# Version 4.3.
11#
12# You might fail to appreciate this module's performance on first
13# try. Compared to the "vanilla" linux-ia32-icc target, i.e. to what is
14# considered *the* best Intel C compiler without -KPIC, performance
15# appears virtually identical... But try to re-configure with shared
16# library support... Aha! Intel compiler "suddenly" lags behind by 30%
17# [on P4, more on others]:-) And compared to the position-independent
18# code generated by GNU C, this code performs *more* than *twice* as
19# fast! Yes, all this buzz about PIC means that unlike other hand-
20# coded implementations, this one was explicitly designed to be safe
21# to use even in a shared library context... This also means that this
22# code isn't necessarily the absolute fastest "ever," because in order
23# to achieve position independence an extra register has to be
24# off-loaded to the stack, which affects the benchmark result.
25#
26# Special note about instruction choice. Do you recall RC4_INT code
27# performing poorly on P4? It might be time to figure out why.
28# RC4_INT code implies effective address calculations in base+offset*4
29# form. The trouble is that offset scaling turned out to be on the
30# critical path... At least eliminating scaling resulted in a 2.8x RC4
31# performance improvement [as you might recall]. As AES code is hungry
32# for scaling too, I [try to] avoid the latter by favoring off-by-2
33# shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
34#
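# To make that concrete, a hedged perlasm-style sketch [illustrative
# only, not the code emitted below]: instead of the conventional
#
#	&movz	($acc,&HB($s));			# byte 1 of $s
#	&mov	($out,&DWP(0,$te,$acc,4));	# base+offset*4, scaled
#
# the off-by-2 variant extracts the byte pre-scaled by 4 and indexes
# without scaling:
#
#	&mov	($acc,$s);
#	&shr	($acc,6);			# byte 1 of $s, times 4
#	&and	($acc,0xFF<<2);
#	&mov	($out,&DWP(0,$te,$acc));	# base+offset, no scaling
#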
35# As was shown by Dean Gaudet <dean@arctic.org>, the above note turned
36# out void. The improvement with off-by-2 shifts was observed on an
37# intermediate implementation, which was spilling yet another register
38# to stack... Final offset*4 code below runs just a tad faster on P4,
39# but exhibits up to 10% improvement on other cores.
40#
41# The second version is a "monolithic" replacement for aes_core.c, which
42# in addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
43# This made it possible to implement a little-endian variant of the
44# algorithm without modifying the base C code. The motivating factor for
45# the undertaken effort was that it appeared that in the tight IA-32
46# register window the little-endian flavor could achieve slightly higher
47# Instruction Level Parallelism, and it indeed resulted in up to 15%
48# better performance on most recent µ-archs...
49#
50# Third version adds AES_cbc_encrypt implementation, which resulted in
51# up to 40% performance improvement of CBC benchmark results. 40% was
52# observed on P4 core, where the "overall" improvement coefficient, i.e.
53# compared to PIC generated by GCC and in CBC mode, was observed to be
54# as large as 4x:-) CBC performance is virtually identical to ECB now
55# and on some platforms even better, e.g. 17.6 "small" cycles/byte on
56# Opteron, because certain function prologues and epilogues are
57# effectively taken out of the loop...
58#
59# Version 3.2 implements compressed tables and prefetch of these tables
60# in CBC[!] mode. The former means that 3/4 of table references are now
61# misaligned, which unfortunately has a negative impact on elder IA-32
62# implementations: Pentium suffered a 30% penalty, PIII 10%.
63#
64# Version 3.3 avoids L1 cache aliasing between stack frame and
65# S-boxes, and 3.4 avoids it even between stack frame and key schedule.
66# The latter is achieved by copying the key schedule to a controlled
67# place on the stack. This unfortunately has rather strong impact on
68# small-block CBC performance, ~2x deterioration on 16-byte blocks compared to 3.3.
69#
70# Version 3.5 checks whether there is L1 cache aliasing between the
71# user-supplied key schedule and the S-boxes, and abstains from copying
72# the former if there is none. This allows the end-user to consciously
73# retain small-block performance by aligning the key schedule suitably.
74#
75# Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
76#
77# Current ECB performance numbers for 128-bit key in CPU cycles per
78# processed byte [measure commonly used by AES benchmarkers] are:
79#
80#			small footprint		fully unrolled
81# P4			24			22
82# AMD K8		20			19
83# PIII		25			23
84# Pentium		81			78
85#
86# Version 3.7 reimplements the outer rounds as "compact," meaning that
87# the first and last rounds reference the compact 256-byte S-box. This
88# means that the first round consumes a lot more CPU cycles and that
89# encrypt and decrypt performance becomes asymmetric: encrypt drops
90# by 10-12%, while decrypt drops by 20-25%:-( The 256-byte S-box is
91# aggressively pre-fetched.
92#
93# Version 4.0 effectively rolls back to 3.6 and instead implements
94# additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
95# which use exclusively the 256-byte S-box. These functions are to be
96# called in modes not concealing plain text, such as ECB, or when
97# we're asked to process a smaller amount of data [or unconditionally
98# on a hyper-threading CPU]. Currently they are called unconditionally
99# from AES_[en|de]crypt, which affects all modes but CBC. The CBC
100# routine still needs to be modified to switch between slower and
101# faster mode when appropriate... But in either case the benchmark
102# landscape changes dramatically and the numbers below are CPU cycles
103# per processed byte for a 128-bit key.
104#
105#			ECB encrypt	ECB decrypt	CBC large chunk
106# P4			56[60]		84[100]		23
107# AMD K8		48[44]		70[79]		18
108# PIII		41[50]		61[91]		24
109# Core 2		32[38]		45[70]		18.5
110# Pentium		120		160		77
111#
112# Version 4.1 switches to compact S-box even in key schedule setup.
113#
114# Version 4.2 prefetches the compact S-box in every SSE round; in other
115# words, every cache-line is *guaranteed* to be accessed within a ~50-
116# cycle window. Why just SSE? Because it's needed on hyper-threading
117# CPUs! Which is also why it's prefetched with a 64-byte stride. The
118# best part is that it has no negative effect on performance:-)
119#
120# Version 4.3 implements a switch between compact and non-compact block
121# functions in AES_cbc_encrypt depending on how much data is to be
122# processed in one stroke.
123#
124######################################################################
125# Timing attacks are classified in two classes: synchronous, when the
126# attacker consciously initiates a cryptographic operation and collects
127# timing data of various character afterwards, and asynchronous, when
128# malicious code is executed on the same CPU simultaneously with AES,
129# instruments itself and performs statistical analysis of this data.
130#
131# As far as synchronous attacks go, the root of the AES timing
132# vulnerability is twofold. Firstly, of the 256 S-box elements at most
133# 160 are referred to in a single 128-bit block operation. Well, in a C
134# implementation with 4 distinct tables it's actually as little as 40
135# references per 256-element table, but anyway... Secondly, even
136# though S-box elements are clustered into a smaller number of cache-
137# lines, smaller than 160 and even 40, it turned out that for certain
138# plain-text pattern[s], or simply put for chosen plain-text and a
139# given key, a few cache-lines remain unaccessed during the block
140# operation. Now, if the attacker can figure out this access pattern,
141# he can deduce the key [or at least part of it]. The natural way to
142# mitigate this kind of attack is to minimize the number of cache-lines
143# in the S-box and/or prefetch them to ensure that every one is
144# accessed, for more uniform timing. But note that *if* the plain-text
145# is concealed in such a way that the input to the block function is
146# distributed *uniformly*, then the attack doesn't apply. Now note that
147# some encryption modes, most notably CBC, do mask the plain-text in
148# this exact way [secure cipher output is distributed uniformly]. Yes,
149# one still might find an input that would reveal information about a
150# given key, but if the number of candidate inputs to be tried is
151# larger than the number of possible key combinations, the attack
152# becomes infeasible. This is why the revised AES_cbc_encrypt "dares"
153# to switch to the larger S-box when a larger chunk of data is to be
154# processed in one stroke. The current size limit of 512 bytes is
155# chosen to provide the same [diminishingly low] probability for a
156# cache-line to remain untouched in a large-chunk operation with the
157# large S-box as for a single block operation with the compact S-box, and surely needs more careful consideration...
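#
# [A rough back-of-envelope sketch of that probability, assuming 64-byte
#  cache-lines and the reference counts quoted above: with the compact
#  256-byte S-box a single block makes ~160 references over 4 lines, so
#  a given line stays cold with probability ~(3/4)^160, i.e. ~1e-20;
#  with a 1KB table at ~40 references per block, a 512-byte [32-block]
#  chunk leaves a given one of its 16 lines cold with probability
#  ~(15/16)^(32*40), i.e. ~1e-36.]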
158#
159# As for asynchronous attacks, there are two flavours: attacker code
160# being interleaved with AES on a hyper-threading CPU at *instruction*
161# level, and two processes time-sharing a single core. As for the
162# latter, there are two vectors. 1. Given that the attacker process
163# has higher priority, yield execution to the process performing AES
164# just before the timer fires off the scheduler, immediately regain
165# control of the CPU and analyze the cache state. For this attack to
166# be efficient the attacker would have to slow the operation down by
167# several *orders* of magnitude, by the ratio of the time slice to the
168# duration of a handful of AES rounds, which is unlikely to go
169# unnoticed. Not to mention that he would also spend correspondingly
170# more time collecting enough statistical data to mount the attack.
171# It's probably appropriate to say that if the adversary reckons this
172# attack is worth the risk of being noticed, you have larger problems
173# than giving him the mere opportunity. In other words the suggested
174# code design expects you to preclude/mitigate this attack through
175# overall system security design. 2. The attacker manages to make his
176# code interrupt-driven. For this kind of attack to be feasible, the
177# interrupt rate has to be high enough, again comparable to the
178# duration of a handful of AES rounds. But is there an interrupt
179# source of such rate? Hardly; not even a 1Gbps NIC generates interrupts at such a raging rate...
180#
181# And now back to the former: the hyper-threading CPU, or more
182# specifically Intel P4. Recall that an asynchronous attack implies
183# that malicious code instruments itself, and naturally instrumentation
184# granularity has to be noticeably finer than the duration of the code
185# path accessing the S-box; given, that is, that all cache-lines are
186# accessed during that time. The current implementation accesses *all*
187# cache-lines within a ~50-cycle window, which is actually *less* than the RDTSC latency on Intel P4!
188
189$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
190push(@INC,"${dir}","${dir}../../perlasm");
191require "x86asm.pl";
192
193&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
194&static_label("AES_Te");
195&static_label("AES_Td");
196
197$s0="eax";
198$s1="ebx";
199$s2="ecx";
200$s3="edx";
201$key="edi";
202$acc="esi";
203$tbl="ebp";
204
205# stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
206# by caller
207$__ra=&DWP(0,"esp"); # return address
208$__s0=&DWP(4,"esp"); # s0 backing store
209$__s1=&DWP(8,"esp"); # s1 backing store
210$__s2=&DWP(12,"esp"); # s2 backing store
211$__s3=&DWP(16,"esp"); # s3 backing store
212$__key=&DWP(20,"esp"); # pointer to key schedule
213$__end=&DWP(24,"esp"); # pointer to end of key schedule
214$__tbl=&DWP(28,"esp"); # %ebp backing store
215
216# stack frame layout in AES_[en|de]crypt routines, which differs from
217# the above by 4 and overlaps by %ebp backing store
218$_tbl=&DWP(24,"esp");
219$_esp=&DWP(28,"esp");
220
221sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
222
223$speed_limit=512; # chunks smaller than $speed_limit are
224 # processed with compact routine in CBC mode
225$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
226 # recent µ-archs], but ~5 times smaller!
227 # I favor compact code to minimize cache
228 # contention and in hope to "collect" 5% back
229 # in real-life applications...
230
231$vertical_spin=0; # shift "vertically" defaults to 0, because of
232 # its proof-of-concept status...
233# Note that there is no decvert(), and the last encryption round is
234# performed with "horizontal" shifts. This is because this "vertical"
235# implementation [one which groups shifts on a given $s[i] to form a
236# "column," unlike the "horizontal" one, which groups shifts on different
237# $s[i] to form a "row"] is work in progress. It was observed to run a
238# few percent faster on Intel cores, but not AMD; on the AMD K8 core it's
239# a whole 12% slower:-( So we face a trade-off... Shall it be resolved
240# some day? Till then the code is considered experimental and by
241# default remains dormant...
242
243sub encvert()
244{ my ($te,@s) = @_;
245 my ($v0,$v1) = ($acc,$key);
246
247 &mov ($v0,$s[3]); # copy s3
248 &mov (&DWP(4,"esp"),$s[2]); # save s2
249 &mov ($v1,$s[0]); # copy s0
250 &mov (&DWP(8,"esp"),$s[1]); # save s1
251
252 &movz ($s[2],&HB($s[0]));
253 &and ($s[0],0xFF);
254 &mov ($s[0],&DWP(0,$te,$s[0],8)); # s0>>0
255 &shr ($v1,16);
256 &mov ($s[3],&DWP(3,$te,$s[2],8)); # s0>>8
257 &movz ($s[1],&HB($v1));
258 &and ($v1,0xFF);
259 &mov ($s[2],&DWP(2,$te,$v1,8)); # s0>>16
260 &mov ($v1,$v0);
261 &mov ($s[1],&DWP(1,$te,$s[1],8)); # s0>>24
262
263 &and ($v0,0xFF);
264 &xor ($s[3],&DWP(0,$te,$v0,8)); # s3>>0
265 &movz ($v0,&HB($v1));
266 &shr ($v1,16);
267 &xor ($s[2],&DWP(3,$te,$v0,8)); # s3>>8
268 &movz ($v0,&HB($v1));
269 &and ($v1,0xFF);
270 &xor ($s[1],&DWP(2,$te,$v1,8)); # s3>>16
271 &mov ($v1,&DWP(4,"esp")); # restore s2
272 &xor ($s[0],&DWP(1,$te,$v0,8)); # s3>>24
273
274 &mov ($v0,$v1);
275 &and ($v1,0xFF);
276 &xor ($s[2],&DWP(0,$te,$v1,8)); # s2>>0
277 &movz ($v1,&HB($v0));
278 &shr ($v0,16);
279 &xor ($s[1],&DWP(3,$te,$v1,8)); # s2>>8
280 &movz ($v1,&HB($v0));
281 &and ($v0,0xFF);
282 &xor ($s[0],&DWP(2,$te,$v0,8)); # s2>>16
283 &mov ($v0,&DWP(8,"esp")); # restore s1
284 &xor ($s[3],&DWP(1,$te,$v1,8)); # s2>>24
285
286 &mov ($v1,$v0);
287 &and ($v0,0xFF);
288 &xor ($s[1],&DWP(0,$te,$v0,8)); # s1>>0
289 &movz ($v0,&HB($v1));
290 &shr ($v1,16);
291 &xor ($s[0],&DWP(3,$te,$v0,8)); # s1>>8
292 &movz ($v0,&HB($v1));
293 &and ($v1,0xFF);
294 &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16
295 &mov ($key,$__key); # reincarnate v1 as key
296 &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24
297}
298
299# Another experimental routine, which features "horizontal spin," but
300# eliminates one reference to stack. Strangely enough runs slower...
301sub enchoriz()
302{ my ($v0,$v1) = ($key,$acc);
303
304 &movz ($v0,&LB($s0)); # 3, 2, 1, 0*
305 &rotr ($s2,8); # 8,11,10, 9
306 &mov ($v1,&DWP(0,$te,$v0,8)); # 0
307 &movz ($v0,&HB($s1)); # 7, 6, 5*, 4
308 &rotr ($s3,16); # 13,12,15,14
309 &xor ($v1,&DWP(3,$te,$v0,8)); # 5
310 &movz ($v0,&HB($s2)); # 8,11,10*, 9
311 &rotr ($s0,16); # 1, 0, 3, 2
312 &xor ($v1,&DWP(2,$te,$v0,8)); # 10
313 &movz ($v0,&HB($s3)); # 13,12,15*,14
314 &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected
315 &mov ($__s0,$v1); # t[0] saved
316
317 &movz ($v0,&LB($s1)); # 7, 6, 5, 4*
318 &shr ($s1,16); # -, -, 7, 6
319 &mov ($v1,&DWP(0,$te,$v0,8)); # 4
320 &movz ($v0,&LB($s3)); # 13,12,15,14*
321 &xor ($v1,&DWP(2,$te,$v0,8)); # 14
322 &movz ($v0,&HB($s0)); # 1, 0, 3*, 2
323 &and ($s3,0xffff0000); # 13,12, -, -
324 &xor ($v1,&DWP(1,$te,$v0,8)); # 3
325 &movz ($v0,&LB($s2)); # 8,11,10, 9*
326 &or ($s3,$s1); # 13,12, 7, 6
327 &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected
328 &mov ($s1,$v1); # s[1]=t[1]
329
330 &movz ($v0,&LB($s0)); # 1, 0, 3, 2*
331 &shr ($s2,16); # -, -, 8,11
332 &mov ($v1,&DWP(2,$te,$v0,8)); # 2
333 &movz ($v0,&HB($s3)); # 13,12, 7*, 6
334 &xor ($v1,&DWP(1,$te,$v0,8)); # 7
335 &movz ($v0,&HB($s2)); # -, -, 8*,11
336 &xor ($v1,&DWP(0,$te,$v0,8)); # 8
337 &mov ($v0,$s3);
338 &shr ($v0,24); # 13
339 &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected
340
341 &movz ($v0,&LB($s2)); # -, -, 8,11*
342 &shr ($s0,24); # 1*
343 &mov ($s2,&DWP(1,$te,$v0,8)); # 11
344 &xor ($s2,&DWP(3,$te,$s0,8)); # 1
345 &mov ($s0,$__s0); # s[0]=t[0]
346 &movz ($v0,&LB($s3)); # 13,12, 7, 6*
347 &shr ($s3,16); # , ,13,12
348 &xor ($s2,&DWP(2,$te,$v0,8)); # 6
349 &mov ($key,$__key); # reincarnate v0 as key
350 &and ($s3,0xff); # , ,13,12*
351 &mov ($s3,&DWP(0,$te,$s3,8)); # 12
352 &xor ($s3,$s2); # s[2]=t[3] collected
353 &mov ($s2,$v1); # s[2]=t[2]
354}
355
356# More experimental code... SSE one... Even though this one eliminates
357# *all* references to stack, it's not faster...
358sub sse_encbody()
359{
360 &movz ($acc,&LB("eax")); # 0
361 &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0
362 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
363 &movz ("edx",&HB("eax")); # 1
364 &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1
365 &shr ("eax",16); # 5, 4
366
367 &movz ($acc,&LB("ebx")); # 10
368 &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10
369 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
370 &movz ($acc,&HB("ebx")); # 11
371 &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11
372 &shr ("ebx",16); # 15,14
373
374 &movz ($acc,&HB("eax")); # 5
375 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5
376 &movq ("mm3",&QWP(16,$key));
377 &movz ($acc,&HB("ebx")); # 15
378 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15
379 &movd ("mm0","ecx"); # t[0] collected
380
381 &movz ($acc,&LB("eax")); # 4
382 &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4
383 &movd ("eax","mm2"); # 7, 6, 3, 2
384 &movz ($acc,&LB("ebx")); # 14
385 &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14
386 &movd ("ebx","mm6"); # 13,12, 9, 8
387
388 &movz ($acc,&HB("eax")); # 3
389 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3
390 &movz ($acc,&HB("ebx")); # 9
391 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9
392 &movd ("mm1","ecx"); # t[1] collected
393
394 &movz ($acc,&LB("eax")); # 2
395 &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2
396 &shr ("eax",16); # 7, 6
397 &punpckldq ("mm0","mm1"); # t[0,1] collected
398 &movz ($acc,&LB("ebx")); # 8
399 &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8
400 &shr ("ebx",16); # 13,12
401
402 &movz ($acc,&HB("eax")); # 7
403 &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7
404 &pxor ("mm0","mm3");
405 &movz ("eax",&LB("eax")); # 6
406 &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6
407 &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
408 &movz ($acc,&HB("ebx")); # 13
409 &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13
410 &xor ("ecx",&DWP(24,$key)); # t[2]
411 &movd ("mm4","ecx"); # t[2] collected
412 &movz ("ebx",&LB("ebx")); # 12
413 &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12
414 &shr ("ecx",16);
415 &movd ("eax","mm1"); # 5, 4, 1, 0
416 &mov ("ebx",&DWP(28,$key)); # t[3]
417 &xor ("ebx","edx");
418 &movd ("mm5","ebx"); # t[3] collected
419 &and ("ebx",0xffff0000);
420 &or ("ebx","ecx");
421
422 &punpckldq ("mm4","mm5"); # t[2,3] collected
423}
424
425######################################################################
426# "Compact" block function
427######################################################################
428
429sub enccompact()
430{ my $Fn = \&mov;
431 while ($#_>5) { pop(@_); $Fn=sub{}; }
432 my ($i,$te,@s)=@_;
433 my $tmp = $key;
434 my $out = $i==3?$s[0]:$acc;
435
436 # $Fn is used in first compact round and its purpose is to
437 # void restoration of some values from stack, so that after
438 # 4xenccompact with extra argument $key value is left there...
439 if ($i==3) { &$Fn ($key,$__key); }##%edx
440 else { &mov ($out,$s[0]); }
441 &and ($out,0xFF);
442 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
443 if ($i==2) { &shr ($s[0],24); }#%ecx[2]
444 &movz ($out,&BP(-128,$te,$out,1));
445
446 if ($i==3) { $tmp=$s[1]; }##%eax
447 &movz ($tmp,&HB($s[1]));
448 &movz ($tmp,&BP(-128,$te,$tmp,1));
449 &shl ($tmp,8);
450 &xor ($out,$tmp);
451
452 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
453 else { &mov ($tmp,$s[2]);
454 &shr ($tmp,16); }
455 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
456 &and ($tmp,0xFF);
457 &movz ($tmp,&BP(-128,$te,$tmp,1));
458 &shl ($tmp,16);
459 &xor ($out,$tmp);
460
461 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
462 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
463 else { &mov ($tmp,$s[3]);
464 &shr ($tmp,24); }
465 &movz ($tmp,&BP(-128,$te,$tmp,1));
466 &shl ($tmp,24);
467 &xor ($out,$tmp);
468 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
469 if ($i==3) { &mov ($s[3],$acc); }
470 &comment();
471}
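
# A hedged pure-Perl sketch, never called, of the value one column of
# enccompact() assembles above; $sbox is assumed to be a ref holding
# the 256 Te4 bytes as integers.
sub _enccompact_column_ref
{ my ($sbox,@s) = @_;
	return	 $sbox->[ $s[0]      & 0xFF]	    ^
		($sbox->[($s[1]>> 8) & 0xFF] <<  8) ^
		($sbox->[($s[2]>>16) & 0xFF] << 16) ^
		($sbox->[($s[3]>>24) & 0xFF] << 24);
}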
472
473sub enctransform()
474{ my @s = ($s0,$s1,$s2,$s3);
475 my $i = shift;
476 my $tmp = $tbl;
477 my $r2 = $key ;
478
479 &mov ($acc,$s[$i]);
480 &and ($acc,0x80808080);
481 &mov ($tmp,$acc);
482 &shr ($tmp,7);
483 &lea ($r2,&DWP(0,$s[$i],$s[$i]));
484 &sub ($acc,$tmp);
485 &and ($r2,0xfefefefe);
486 &and ($acc,0x1b1b1b1b);
487 &mov ($tmp,$s[$i]);
488 &xor ($acc,$r2); # r2
489
490 &xor ($s[$i],$acc); # r0 ^ r2
491 &rotl ($s[$i],24);
492 &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2
493 &rotr ($tmp,16);
494 &xor ($s[$i],$tmp);
495 &rotr ($tmp,8);
496 &xor ($s[$i],$tmp);
497}
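
# Hedged pure-Perl reference, never called: the branch-free GF(2^8)
# doubling of four packed bytes that enctransform() performs above;
# 0x1b is the AES reduction polynomial, folded back into every byte
# whose top bit was set before the shift.
sub _xtime32_ref
{ my ($w) = @_;
	my $hi = $w & 0x80808080;	# top bit of every byte
	return (($w<<1) & 0xfefefefe) ^ (($hi - ($hi>>7)) & 0x1b1b1b1b);
}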
498
499&function_begin_B("_x86_AES_encrypt_compact");
500 # note that caller is expected to allocate stack frame for me!
501 &mov ($__key,$key); # save key
502
503 &xor ($s0,&DWP(0,$key)); # xor with key
504 &xor ($s1,&DWP(4,$key));
505 &xor ($s2,&DWP(8,$key));
506 &xor ($s3,&DWP(12,$key));
507
508 &mov ($acc,&DWP(240,$key)); # load key->rounds
509 &lea ($acc,&DWP(-2,$acc,$acc));
510 &lea ($acc,&DWP(0,$key,$acc,8));
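					# i.e. $acc = $key + 16*(rounds-1),
					# one short of the final round key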
511 &mov ($__end,$acc); # end of key schedule
512
513 # prefetch Te4
514 &mov ($key,&DWP(0-128,$tbl));
515 &mov ($acc,&DWP(32-128,$tbl));
516 &mov ($key,&DWP(64-128,$tbl));
517 &mov ($acc,&DWP(96-128,$tbl));
518 &mov ($key,&DWP(128-128,$tbl));
519 &mov ($acc,&DWP(160-128,$tbl));
520 &mov ($key,&DWP(192-128,$tbl));
521 &mov ($acc,&DWP(224-128,$tbl));
522
523 &set_label("loop",16);
524
525 &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);
526 &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
527 &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
528 &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
529 &enctransform(2);
530 &enctransform(3);
531 &enctransform(0);
532 &enctransform(1);
533 &mov ($key,$__key);
534 &mov ($tbl,$__tbl);
535 &add ($key,16); # advance rd_key
536 &xor ($s0,&DWP(0,$key));
537 &xor ($s1,&DWP(4,$key));
538 &xor ($s2,&DWP(8,$key));
539 &xor ($s3,&DWP(12,$key));
540
541 &cmp ($key,$__end);
542 &mov ($__key,$key);
543 &jb (&label("loop"));
544
545 &enccompact(0,$tbl,$s0,$s1,$s2,$s3);
546 &enccompact(1,$tbl,$s1,$s2,$s3,$s0);
547 &enccompact(2,$tbl,$s2,$s3,$s0,$s1);
548 &enccompact(3,$tbl,$s3,$s0,$s1,$s2);
549
550 &xor ($s0,&DWP(16,$key));
551 &xor ($s1,&DWP(20,$key));
552 &xor ($s2,&DWP(24,$key));
553 &xor ($s3,&DWP(28,$key));
554
555 &ret ();
556&function_end_B("_x86_AES_encrypt_compact");
557
558######################################################################
559# "Compact" SSE block function.
560######################################################################
561#
562# Performance is not actually extraordinary in comparison to pure
563# x86 code. In particular encrypt performance is virtually the same.
564# Decrypt performance on the other hand is 15-20% better on newer
565# µ-archs [but we're thankful for *any* improvement here], and ~50%
566# better on PIII:-) Also on the pro side, this code eliminates
567# redundant references to the stack and thus relieves/minimizes
568# the pressure on the memory bus.
569#
570# MMX register layout lsb
571# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
572# | mm4 | mm0 |
573# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
574# | s3 | s2 | s1 | s0 |
575# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
576# |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
577# +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
578#
579# Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
580# In this terms encryption and decryption "compact" permutation
581# matrices can be depicted as following:
582#
583# encryption lsb # decryption lsb
584# +----++----+----+----+----+ # +----++----+----+----+----+
585# | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 |
586# +----++----+----+----+----+ # +----++----+----+----+----+
587# | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 |
588# +----++----+----+----+----+ # +----++----+----+----+----+
589# | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 |
590# +----++----+----+----+----+ # +----++----+----+----+----+
591# | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 |
592# +----++----+----+----+----+ # +----++----+----+----+----+
593#
594######################################################################
595# Why not xmm registers? Short answer: it was actually tested and
596# was, on the *contrary*, not any faster, most notably on Intel CPUs.
597# Longer answer: the main advantage of using mm registers is that movd
598# latency is lower, especially on Intel P4. While the arithmetic
599# instructions are twice as many, they can be scheduled every cycle,
600# rather than every second one as when operating on xmm registers,
601# so that "arithmetic throughput" remains virtually the same. And
602# finally the code can be executed even on elder SSE-only CPUs:-)
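
# Illustrative helper, never called: the byte-index convention used in
# the diagrams above, i.e. byte N of the state is s[N/4]>>(8*(N%4)).
sub _state_byte_ref
{ my ($s,$n) = @_;		# $s = [s0,s1,s2,s3]
	return ($s->[$n>>2] >> (8*($n&3))) & 0xFF;
}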
603
604sub sse_enccompact()
605{
606 &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
607 &pshufw ("mm5","mm4",0x0d); # 15,14,11,10
608 &movd ("eax","mm1"); # 5, 4, 1, 0
609 &movd ("ebx","mm5"); # 15,14,11,10
610
611 &movz ($acc,&LB("eax")); # 0
612 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
613 &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
614 &movz ("edx",&HB("eax")); # 1
615 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
616 &shl ("edx",8); # 1
617 &shr ("eax",16); # 5, 4
618
619 &movz ($acc,&LB("ebx")); # 10
620 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
621 &shl ($acc,16); # 10
622 &or ("ecx",$acc); # 10
623 &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
624 &movz ($acc,&HB("ebx")); # 11
625 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
626 &shl ($acc,24); # 11
627 &or ("edx",$acc); # 11
628 &shr ("ebx",16); # 15,14
629
630 &movz ($acc,&HB("eax")); # 5
631 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 5
632 &shl ($acc,8); # 5
633 &or ("ecx",$acc); # 5
634 &movz ($acc,&HB("ebx")); # 15
635 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
636 &shl ($acc,24); # 15
637 &or ("ecx",$acc); # 15
638 &movd ("mm0","ecx"); # t[0] collected
639
640 &movz ($acc,&LB("eax")); # 4
641 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4
642 &movd ("eax","mm2"); # 7, 6, 3, 2
643 &movz ($acc,&LB("ebx")); # 14
644 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
645 &shl ($acc,16); # 14
646 &or ("ecx",$acc); # 14
647
648 &movd ("ebx","mm6"); # 13,12, 9, 8
649 &movz ($acc,&HB("eax")); # 3
650 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 3
651 &shl ($acc,24); # 3
652 &or ("ecx",$acc); # 3
653 &movz ($acc,&HB("ebx")); # 9
654 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
655 &shl ($acc,8); # 9
656 &or ("ecx",$acc); # 9
657 &movd ("mm1","ecx"); # t[1] collected
658
659 &movz ($acc,&LB("ebx")); # 8
660 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8
661 &shr ("ebx",16); # 13,12
662 &movz ($acc,&LB("eax")); # 2
663 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
664 &shl ($acc,16); # 2
665 &or ("ecx",$acc); # 2
666 &shr ("eax",16); # 7, 6
667
668 &punpckldq ("mm0","mm1"); # t[0,1] collected
669
670 &movz ($acc,&HB("eax")); # 7
671 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
672 &shl ($acc,24); # 7
673 &or ("ecx",$acc); # 7
674 &and ("eax",0xff); # 6
675 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
676 &shl ("eax",16); # 6
677 &or ("edx","eax"); # 6
678 &movz ($acc,&HB("ebx")); # 13
679 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
680 &shl ($acc,8); # 13
681 &or ("ecx",$acc); # 13
682 &movd ("mm4","ecx"); # t[2] collected
683 &and ("ebx",0xff); # 12
684 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
685 &or ("edx","ebx"); # 12
686 &movd ("mm5","edx"); # t[3] collected
687
688 &punpckldq ("mm4","mm5"); # t[2,3] collected
689}
690
691 if (!$x86only) {
692&function_begin_B("_sse_AES_encrypt_compact");
693 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
694 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
695
696 # note that caller is expected to allocate stack frame for me!
697 &mov ($acc,&DWP(240,$key)); # load key->rounds
698 &lea ($acc,&DWP(-2,$acc,$acc));
699 &lea ($acc,&DWP(0,$key,$acc,8));
700 &mov ($__end,$acc); # end of key schedule
701
702 &mov ($s0,0x1b1b1b1b); # magic constant
703 &mov (&DWP(8,"esp"),$s0);
704 &mov (&DWP(12,"esp"),$s0);
705
706 # prefetch Te4
707 &mov ($s0,&DWP(0-128,$tbl));
708 &mov ($s1,&DWP(32-128,$tbl));
709 &mov ($s2,&DWP(64-128,$tbl));
710 &mov ($s3,&DWP(96-128,$tbl));
711 &mov ($s0,&DWP(128-128,$tbl));
712 &mov ($s1,&DWP(160-128,$tbl));
713 &mov ($s2,&DWP(192-128,$tbl));
714 &mov ($s3,&DWP(224-128,$tbl));
715
716 &set_label("loop",16);
717 &sse_enccompact();
718 &add ($key,16);
719 &cmp ($key,$__end);
720 &ja (&label("out"));
721
722 &movq ("mm2",&QWP(8,"esp"));
723 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
724 &movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0
725 &pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4");
726 &pand ("mm3","mm2"); &pand ("mm7","mm2");
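						# mm3/mm7 now hold 0x1b in
						# every byte whose top bit was
						# set, i.e. the GF(2^8)
						# reduction term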
727 &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)
728 &paddb ("mm0","mm0"); &paddb ("mm4","mm4");
729 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2
730 &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0
731 &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2
732 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16)
733
734 &movq ("mm2","mm3"); &movq ("mm6","mm7");
735 &pslld ("mm3",8); &pslld ("mm7",8);
736 &psrld ("mm2",24); &psrld ("mm6",24);
737 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8
738 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24
739
740 &movq ("mm3","mm1"); &movq ("mm7","mm5");
741 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
742 &psrld ("mm1",8); &psrld ("mm5",8);
743 &mov ($s0,&DWP(0-128,$tbl));
744 &pslld ("mm3",24); &pslld ("mm7",24);
745 &mov ($s1,&DWP(64-128,$tbl));
746 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)<<8
747 &mov ($s2,&DWP(128-128,$tbl));
748 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)>>24
749 &mov ($s3,&DWP(192-128,$tbl));
750
751 &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
752 &jmp (&label("loop"));
753
754 &set_label("out",16);
755 &pxor ("mm0",&QWP(0,$key));
756 &pxor ("mm4",&QWP(8,$key));
757
758 &ret ();
759&function_end_B("_sse_AES_encrypt_compact");
760 }
761
762######################################################################
763# Vanilla block function.
764######################################################################
765
766sub encstep()
767{ my ($i,$te,@s) = @_;
768 my $tmp = $key;
769 my $out = $i==3?$s[0]:$acc;
770
771 # lines marked with #%e?x[i] denote "reordered" instructions...
772 if ($i==3) { &mov ($key,$__key); }##%edx
773 else { &mov ($out,$s[0]);
774 &and ($out,0xFF); }
775 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
776 if ($i==2) { &shr ($s[0],24); }#%ecx[2]
777 &mov ($out,&DWP(0,$te,$out,8));
778
779 if ($i==3) { $tmp=$s[1]; }##%eax
780 &movz ($tmp,&HB($s[1]));
781 &xor ($out,&DWP(3,$te,$tmp,8));
782
783 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
784 else { &mov ($tmp,$s[2]);
785 &shr ($tmp,16); }
786 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
787 &and ($tmp,0xFF);
788 &xor ($out,&DWP(2,$te,$tmp,8));
789
790 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
791 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
792 else { &mov ($tmp,$s[3]);
793 &shr ($tmp,24) }
794 &xor ($out,&DWP(1,$te,$tmp,8));
795 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
796 if ($i==3) { &mov ($s[3],$acc); }
797 &comment();
798}
799
800sub enclast()
801{ my ($i,$te,@s)=@_;
802 my $tmp = $key;
803 my $out = $i==3?$s[0]:$acc;
804
805 if ($i==3) { &mov ($key,$__key); }##%edx
806 else { &mov ($out,$s[0]); }
807 &and ($out,0xFF);
808 if ($i==1) { &shr ($s[0],16); }#%ebx[1]
809 if ($i==2) { &shr ($s[0],24); }#%ecx[2]
810 &mov ($out,&DWP(2,$te,$out,8));
811 &and ($out,0x000000ff);
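					# each 8-byte Te entry is the word
					# [2*S,S,S,3*S] stored twice, so a
					# dword load at offset 2 lands plain
					# S[x] in the low byte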
812
813 if ($i==3) { $tmp=$s[1]; }##%eax
814 &movz ($tmp,&HB($s[1]));
815 &mov ($tmp,&DWP(0,$te,$tmp,8));
816 &and ($tmp,0x0000ff00);
817 &xor ($out,$tmp);
818
819 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
820 else { &mov ($tmp,$s[2]);
821 &shr ($tmp,16); }
822 if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
823 &and ($tmp,0xFF);
824 &mov ($tmp,&DWP(0,$te,$tmp,8));
825 &and ($tmp,0x00ff0000);
826 &xor ($out,$tmp);
827
828 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
829 elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
830 else { &mov ($tmp,$s[3]);
831 &shr ($tmp,24); }
832 &mov ($tmp,&DWP(2,$te,$tmp,8));
833 &and ($tmp,0xff000000);
834 &xor ($out,$tmp);
835 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
836 if ($i==3) { &mov ($s[3],$acc); }
837}
838
839&function_begin_B("_x86_AES_encrypt");
840 if ($vertical_spin) {
841 # I need high parts of volatile registers to be accessible...
842 &exch ($s1="edi",$key="ebx");
843 &mov ($s2="esi",$acc="ecx");
844 }
845
846 # note that caller is expected to allocate stack frame for me!
847 &mov ($__key,$key); # save key
848
849 &xor ($s0,&DWP(0,$key)); # xor with key
850 &xor ($s1,&DWP(4,$key));
851 &xor ($s2,&DWP(8,$key));
852 &xor ($s3,&DWP(12,$key));
853
854 &mov ($acc,&DWP(240,$key)); # load key->rounds
855
856 if ($small_footprint) {
857 &lea ($acc,&DWP(-2,$acc,$acc));
858 &lea ($acc,&DWP(0,$key,$acc,8));
859 &mov ($__end,$acc); # end of key schedule
860
861 &set_label("loop",16);
862 if ($vertical_spin) {
863 &encvert($tbl,$s0,$s1,$s2,$s3);
864 } else {
865 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
866 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
867 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
868 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
869 }
870 &add ($key,16); # advance rd_key
871 &xor ($s0,&DWP(0,$key));
872 &xor ($s1,&DWP(4,$key));
873 &xor ($s2,&DWP(8,$key));
874 &xor ($s3,&DWP(12,$key));
875 &cmp ($key,$__end);
876 &mov ($__key,$key);
877 &jb (&label("loop"));
878 }
879 else {
880 &cmp ($acc,10);
881 &jle (&label("10rounds"));
882 &cmp ($acc,12);
883 &jle (&label("12rounds"));
884
885 &set_label("14rounds",4);
886 for ($i=1;$i<3;$i++) {
887 if ($vertical_spin) {
888 &encvert($tbl,$s0,$s1,$s2,$s3);
889 } else {
890 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
891 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
892 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
893 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
894 }
895 &xor ($s0,&DWP(16*$i+0,$key));
896 &xor ($s1,&DWP(16*$i+4,$key));
897 &xor ($s2,&DWP(16*$i+8,$key));
898 &xor ($s3,&DWP(16*$i+12,$key));
899 }
900 &add ($key,32);
901 &mov ($__key,$key); # advance rd_key
902 &set_label("12rounds",4);
903 for ($i=1;$i<3;$i++) {
904 if ($vertical_spin) {
905 &encvert($tbl,$s0,$s1,$s2,$s3);
906 } else {
907 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
908 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
909 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
910 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
911 }
912 &xor ($s0,&DWP(16*$i+0,$key));
913 &xor ($s1,&DWP(16*$i+4,$key));
914 &xor ($s2,&DWP(16*$i+8,$key));
915 &xor ($s3,&DWP(16*$i+12,$key));
916 }
917 &add ($key,32);
918 &mov ($__key,$key); # advance rd_key
919 &set_label("10rounds",4);
920 for ($i=1;$i<10;$i++) {
921 if ($vertical_spin) {
922 &encvert($tbl,$s0,$s1,$s2,$s3);
923 } else {
924 &encstep(0,$tbl,$s0,$s1,$s2,$s3);
925 &encstep(1,$tbl,$s1,$s2,$s3,$s0);
926 &encstep(2,$tbl,$s2,$s3,$s0,$s1);
927 &encstep(3,$tbl,$s3,$s0,$s1,$s2);
928 }
929 &xor ($s0,&DWP(16*$i+0,$key));
930 &xor ($s1,&DWP(16*$i+4,$key));
931 &xor ($s2,&DWP(16*$i+8,$key));
932 &xor ($s3,&DWP(16*$i+12,$key));
933 }
934 }
935
936 if ($vertical_spin) {
937 # "reincarnate" some registers for "horizontal" spin...
938 &mov ($s1="ebx",$key="edi");
939 &mov ($s2="ecx",$acc="esi");
940 }
941 &enclast(0,$tbl,$s0,$s1,$s2,$s3);
942 &enclast(1,$tbl,$s1,$s2,$s3,$s0);
943 &enclast(2,$tbl,$s2,$s3,$s0,$s1);
944 &enclast(3,$tbl,$s3,$s0,$s1,$s2);
945
946 &add ($key,$small_footprint?16:160);
947 &xor ($s0,&DWP(0,$key));
948 &xor ($s1,&DWP(4,$key));
949 &xor ($s2,&DWP(8,$key));
950 &xor ($s3,&DWP(12,$key));
951
952 &ret ();
953
954&set_label("AES_Te",64); # Yes! I keep it in the code segment!
955 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
956 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
957 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
958 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
959 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
960 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
961 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
962 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
963 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
964 &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
965 &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
966 &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
967 &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
968 &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
969 &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
970 &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
971 &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
972 &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
973 &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
974 &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
975 &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
976 &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
977 &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
978 &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
979 &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
980 &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
981 &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
982 &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
983 &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
984 &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
985 &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
986 &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
987 &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
988 &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
989 &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
990 &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
991 &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
992 &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
993 &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
994 &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
995 &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
996 &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
997 &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
998 &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
999 &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
1000 &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
1001 &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
1002 &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
1003 &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
1004 &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
1005 &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
1006 &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
1007 &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
1008 &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
1009 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
1010 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
1011 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
1012 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
1013 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
1014 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
1015 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
1016 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
1017 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
1018 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
1019
1020#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
1021 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1022 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1023 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1024 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1025 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1026 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1027 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1028 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1029 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1030 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1031 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1032 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1033 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1034 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1035 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1036 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1037 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1038 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1039 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1040 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1041 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1042 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1043 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1044 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1045 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1046 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1047 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1048 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1049 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1050 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1051 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1052 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1053
1054 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1055 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1056 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1057 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1058 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1059 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1060 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1061 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1062 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1063 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1064 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1065 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1066 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1067 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1068 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1069 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1070 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1071 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1072 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1073 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1074 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1075 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1076 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1077 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1078 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1079 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1080 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1081 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1082 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1083 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1084 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1085 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1086
1087 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1088 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1089 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1090 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1091 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1092 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1093 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1094 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1095 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1096 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1097 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1098 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1099 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1100 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1101 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1102 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1103 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1104 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1105 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1106 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1107 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1108 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1109 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1110 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1111 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1112 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1113 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1114 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1115 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1116 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1117 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1118 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1119
1120 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1121 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1122 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1123 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1124 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1125 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1126 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1127 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1128 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1129 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1130 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1131 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1132 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1133 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1134 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1135 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1136 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1137 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1138 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1139 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1140 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1141 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1142 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1143 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1144 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1145 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1146 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1147 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1148 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1149 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1150 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1151 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1152#rcon:
1153 &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
1154 &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
1155 &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
1156 &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
1157&function_end_B("_x86_AES_encrypt");
1158
1159# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
1160&function_begin("AES_encrypt");
1161 &mov ($acc,&wparam(0)); # load inp
1162 &mov ($key,&wparam(2)); # load key
1163
1164 &mov ($s0,"esp");
1165 &sub ("esp",36);
1166 &and ("esp",-64); # align to cache-line
1167
1168 # place stack frame just "above" the key schedule
1169 &lea ($s1,&DWP(-64-63,$key));
1170 &sub ($s1,"esp");
1171 &neg ($s1);
1172 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1173 &sub ("esp",$s1);
1174 &add ("esp",4); # 4 is reserved for caller's return address
1175 &mov ($_esp,$s0); # save stack pointer
1176
1177 &call (&label("pic_point")); # make it PIC!
1178 &set_label("pic_point");
1179 &blindpop($tbl);
1180 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);
1181 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
1182
1183 # pick Te4 copy which can't "overlap" with stack frame or key schedule
1184 &lea ($s1,&DWP(768-4,"esp"));
1185 &sub ($s1,$tbl);
1186 &and ($s1,0x300);
1187 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
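	# $tbl now points 128 bytes into the chosen 256-byte Te4 copy,
	# which is what the &BP(-128,...) bias in the compact routines assumes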
1188
1189 if (!$x86only) {
1190 &bt (&DWP(0,$s0),25); # check for SSE bit
1191 &jnc (&label("x86"));
1192
1193 &movq ("mm0",&QWP(0,$acc));
1194 &movq ("mm4",&QWP(8,$acc));
1195 &call ("_sse_AES_encrypt_compact");
1196 &mov ("esp",$_esp); # restore stack pointer
1197 &mov ($acc,&wparam(1)); # load out
1198 &movq (&QWP(0,$acc),"mm0"); # write output data
1199 &movq (&QWP(8,$acc),"mm4");
1200 &emms ();
1201 &function_end_A();
1202 }
1203 &set_label("x86",16);
1204 &mov ($_tbl,$tbl);
1205 &mov ($s0,&DWP(0,$acc)); # load input data
1206 &mov ($s1,&DWP(4,$acc));
1207 &mov ($s2,&DWP(8,$acc));
1208 &mov ($s3,&DWP(12,$acc));
1209 &call ("_x86_AES_encrypt_compact");
1210 &mov ("esp",$_esp); # restore stack pointer
1211 &mov ($acc,&wparam(1)); # load out
1212 &mov (&DWP(0,$acc),$s0); # write output data
1213 &mov (&DWP(4,$acc),$s1);
1214 &mov (&DWP(8,$acc),$s2);
1215 &mov (&DWP(12,$acc),$s3);
1216&function_end("AES_encrypt");
1217
1218#--------------------------------------------------------------------#
1219
1220######################################################################
1221# "Compact" block function
1222######################################################################
1223
1224sub deccompact()
1225{ my $Fn = \&mov;
1226 while ($#_>5) { pop(@_); $Fn=sub{}; }
1227 my ($i,$td,@s)=@_;
1228 my $tmp = $key;
1229 my $out = $i==3?$s[0]:$acc;
1230
1231 # $Fn is used in first compact round and its purpose is to
1232 # void restoration of some values from stack, so that after
1233 # 4xdeccompact with extra argument $key, $s0 and $s1 values
1234 # are left there...
1235 if($i==3) { &$Fn ($key,$__key); }
1236 else { &mov ($out,$s[0]); }
1237 &and ($out,0xFF);
1238 &movz ($out,&BP(-128,$td,$out,1));
1239
1240 if ($i==3) { $tmp=$s[1]; }
1241 &movz ($tmp,&HB($s[1]));
1242 &movz ($tmp,&BP(-128,$td,$tmp,1));
1243 &shl ($tmp,8);
1244 &xor ($out,$tmp);
1245
1246 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1247 else { &mov ($tmp,$s[2]); }
1248 &shr ($tmp,16);
1249 &and ($tmp,0xFF);
1250 &movz ($tmp,&BP(-128,$td,$tmp,1));
1251 &shl ($tmp,16);
1252 &xor ($out,$tmp);
1253
1254 if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
1255 else { &mov ($tmp,$s[3]); }
1256 &shr ($tmp,24);
1257 &movz ($tmp,&BP(-128,$td,$tmp,1));
1258 &shl ($tmp,24);
1259 &xor ($out,$tmp);
1260 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1261 if ($i==3) { &$Fn ($s[3],$__s0); }
1262}
1263
1264# must be called with 2,3,0,1 as argument sequence!!!
1265sub dectransform()
1266{ my @s = ($s0,$s1,$s2,$s3);
1267 my $i = shift;
1268 my $tmp = $key;
1269 my $tp2 = @s[($i+2)%4]; $tp2 = @s[2] if ($i==1);
1270 my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
1271 my $tp8 = $tbl;
1272
1273 &mov ($acc,$s[$i]);
1274 &and ($acc,0x80808080);
1275 &mov ($tmp,$acc);
1276 &shr ($tmp,7);
1277 &lea ($tp2,&DWP(0,$s[$i],$s[$i]));
1278 &sub ($acc,$tmp);
1279 &and ($tp2,0xfefefefe);
1280 &and ($acc,0x1b1b1b1b);
1281 &xor ($acc,$tp2);
1282 &mov ($tp2,$acc);
1283
1284 &and ($acc,0x80808080);
1285 &mov ($tmp,$acc);
1286 &shr ($tmp,7);
1287 &lea ($tp4,&DWP(0,$tp2,$tp2));
1288 &sub ($acc,$tmp);
1289 &and ($tp4,0xfefefefe);
1290 &and ($acc,0x1b1b1b1b);
1291 &xor ($tp2,$s[$i]); # tp2^tp1
1292 &xor ($acc,$tp4);
1293 &mov ($tp4,$acc);
1294
1295 &and ($acc,0x80808080);
1296 &mov ($tmp,$acc);
1297 &shr ($tmp,7);
1298 &lea ($tp8,&DWP(0,$tp4,$tp4));
1299 &sub ($acc,$tmp);
1300 &and ($tp8,0xfefefefe);
1301 &and ($acc,0x1b1b1b1b);
1302 &xor ($tp4,$s[$i]); # tp4^tp1
1303 &rotl ($s[$i],8); # = ROTATE(tp1,8)
1304 &xor ($tp8,$acc);
1305
1306 &xor ($s[$i],$tp2);
1307 &xor ($tp2,$tp8);
1308 &rotl ($tp2,24);
1309 &xor ($s[$i],$tp4);
1310 &xor ($tp4,$tp8);
1311 &rotl ($tp4,16);
1312 &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
1313 &rotl ($tp8,8);
1314 &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
1315 &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
1316 &mov ($s[0],$__s0) if($i==2); #prefetch $s0
1317 &mov ($s[1],$__s1) if($i==3); #prefetch $s1
1318 &mov ($s[2],$__s2) if($i==1);
1319 &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)
1320
1321 &mov ($s[3],$__s3) if($i==1);
1322 &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);
1323}
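
# Hedged pure-Perl reference, never called: the InvMixColumns word
# transform that dectransform() above builds from repeated doublings.
sub _invmix_word_ref
{ my ($tp1) = @_;
	my $xt	= sub { my $w = shift; my $h = $w & 0x80808080;
			(($w<<1) & 0xfefefefe) ^ (($h - ($h>>7)) & 0x1b1b1b1b) };
	my $rot	= sub { my ($v,$n) = @_; (($v<<$n) | ($v>>(32-$n))) & 0xffffffff };
	my $tp2 = $xt->($tp1);
	my $tp4 = $xt->($tp2);
	my $tp8 = $xt->($tp4);
	return $tp2 ^ $tp4 ^ $tp8			# 0x0e coefficient
	     ^ $rot->($tp1,8) ^ $rot->($tp8,8)		# 0x09
	     ^ $rot->($tp1^$tp2^$tp8,24)		# 0x0b
	     ^ $rot->($tp1^$tp4^$tp8,16);		# 0x0d
}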
1324
1325&function_begin_B("_x86_AES_decrypt_compact");
1326 # note that caller is expected to allocate stack frame for me!
1327 &mov ($__key,$key); # save key
1328
1329 &xor ($s0,&DWP(0,$key)); # xor with key
1330 &xor ($s1,&DWP(4,$key));
1331 &xor ($s2,&DWP(8,$key));
1332 &xor ($s3,&DWP(12,$key));
1333
1334 &mov ($acc,&DWP(240,$key)); # load key->rounds
1335
1336 &lea ($acc,&DWP(-2,$acc,$acc));
1337 &lea ($acc,&DWP(0,$key,$acc,8));
1338 &mov ($__end,$acc); # end of key schedule
1339
1340 # prefetch Td4
1341 &mov ($key,&DWP(0-128,$tbl));
1342 &mov ($acc,&DWP(32-128,$tbl));
1343 &mov ($key,&DWP(64-128,$tbl));
1344 &mov ($acc,&DWP(96-128,$tbl));
1345 &mov ($key,&DWP(128-128,$tbl));
1346 &mov ($acc,&DWP(160-128,$tbl));
1347 &mov ($key,&DWP(192-128,$tbl));
1348 &mov ($acc,&DWP(224-128,$tbl));
1349
1350 &set_label("loop",16);
1351
1352 &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);
1353 &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
1354 &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
1355 &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
1356 &dectransform(2);
1357 &dectransform(3);
1358 &dectransform(0);
1359 &dectransform(1);
1360 &mov ($key,$__key);
1361 &mov ($tbl,$__tbl);
1362 &add ($key,16); # advance rd_key
1363 &xor ($s0,&DWP(0,$key));
1364 &xor ($s1,&DWP(4,$key));
1365 &xor ($s2,&DWP(8,$key));
1366 &xor ($s3,&DWP(12,$key));
1367
1368 &cmp ($key,$__end);
1369 &mov ($__key,$key);
1370 &jb (&label("loop"));
1371
1372 &deccompact(0,$tbl,$s0,$s3,$s2,$s1);
1373 &deccompact(1,$tbl,$s1,$s0,$s3,$s2);
1374 &deccompact(2,$tbl,$s2,$s1,$s0,$s3);
1375 &deccompact(3,$tbl,$s3,$s2,$s1,$s0);
1376
1377 &xor ($s0,&DWP(16,$key));
1378 &xor ($s1,&DWP(20,$key));
1379 &xor ($s2,&DWP(24,$key));
1380 &xor ($s3,&DWP(28,$key));
1381
1382 &ret ();
1383&function_end_B("_x86_AES_decrypt_compact");
1384
1385######################################################################
1386# "Compact" SSE block function.
1387######################################################################
1388
1389sub sse_deccompact()
1390{
1391 &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
1392 &movd ("eax","mm1"); # 7, 6, 1, 0
1393
1394 &pshufw ("mm5","mm4",0x09); # 13,12,11,10
1395 &movz ($acc,&LB("eax")); # 0
1396 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
1397 &movd ("ebx","mm5"); # 13,12,11,10
1398 &movz ("edx",&HB("eax")); # 1
1399 &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
1400 &shl ("edx",8); # 1
1401
1402 &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
1403 &movz ($acc,&LB("ebx")); # 10
1404 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
1405 &shl ($acc,16); # 10
1406 &or ("ecx",$acc); # 10
1407 &shr ("eax",16); # 7, 6
1408 &movz ($acc,&HB("ebx")); # 11
1409 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
1410 &shl ($acc,24); # 11
1411 &or ("edx",$acc); # 11
1412 &shr ("ebx",16); # 13,12
1413
1414 &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
1415 &movz ($acc,&HB("eax")); # 7
1416 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
1417 &shl ($acc,24); # 7
1418 &or ("ecx",$acc); # 7
1419 &movz ($acc,&HB("ebx")); # 13
1420 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
1421 &shl ($acc,8); # 13
1422 &or ("ecx",$acc); # 13
1423 &movd ("mm0","ecx"); # t[0] collected
1424
1425 &movz ($acc,&LB("eax")); # 6
1426 &movd ("eax","mm2"); # 3, 2, 5, 4
1427 &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
1428 &shl ("ecx",16); # 6
1429 &movz ($acc,&LB("ebx")); # 12
1430 &movd ("ebx","mm6"); # 9, 8,15,14
1431 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
1432 &or ("ecx",$acc); # 12
1433
1434 &movz ($acc,&LB("eax")); # 4
1435 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
1436 &or ("edx",$acc); # 4
1437 &movz ($acc,&LB("ebx")); # 14
1438 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
1439 &shl ($acc,16); # 14
1440 &or ("edx",$acc); # 14
1441 &movd ("mm1","edx"); # t[1] collected
1442
1443 &movz ($acc,&HB("eax")); # 5
1444 &movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
1445 &shl ("edx",8); # 5
1446 &movz ($acc,&HB("ebx")); # 15
1447 &shr ("eax",16); # 3, 2
1448 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
1449 &shl ($acc,24); # 15
1450 &or ("edx",$acc); # 15
1451 &shr ("ebx",16); # 9, 8
1452
1453 &punpckldq ("mm0","mm1"); # t[0,1] collected
1454
1455 &movz ($acc,&HB("ebx")); # 9
1456 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
1457 &shl ($acc,8); # 9
1458 &or ("ecx",$acc); # 9
1459 &and ("ebx",0xff); # 8
1460 &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
1461 &or ("edx","ebx"); # 8
1462 &movz ($acc,&LB("eax")); # 2
1463 &movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
1464 &shl ($acc,16); # 2
1465 &or ("edx",$acc); # 2
1466 &movd ("mm4","edx"); # t[2] collected
1467 &movz ("eax",&HB("eax")); # 3
1468 &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
1469 &shl ("eax",24); # 3
1470 &or ("ecx","eax"); # 3
1471 &movd ("mm5","ecx"); # t[3] collected
1472
1473 &punpckldq ("mm4","mm5"); # t[2,3] collected
1474}
1475
1476 if (!$x86only) {
1477&function_begin_B("_sse_AES_decrypt_compact");
1478 &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
1479 &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
1480
1481 # note that caller is expected to allocate stack frame for me!
1482 &mov ($acc,&DWP(240,$key)); # load key->rounds
1483 &lea ($acc,&DWP(-2,$acc,$acc));
1484 &lea ($acc,&DWP(0,$key,$acc,8));
1485 &mov ($__end,$acc); # end of key schedule
1486
1487 &mov ($s0,0x1b1b1b1b); # magic constant
1488 &mov (&DWP(8,"esp"),$s0);
1489 &mov (&DWP(12,"esp"),$s0);
1490
1491 # prefetch Td4
1492 &mov ($s0,&DWP(0-128,$tbl));
1493 &mov ($s1,&DWP(32-128,$tbl));
1494 &mov ($s2,&DWP(64-128,$tbl));
1495 &mov ($s3,&DWP(96-128,$tbl));
1496 &mov ($s0,&DWP(128-128,$tbl));
1497 &mov ($s1,&DWP(160-128,$tbl));
1498 &mov ($s2,&DWP(192-128,$tbl));
1499 &mov ($s3,&DWP(224-128,$tbl));
1500
1501 &set_label("loop",16);
1502 &sse_deccompact();
1503 &add ($key,16);
1504 &cmp ($key,$__end);
1505 &ja (&label("out"));
1506
1507 # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
1508 &movq ("mm3","mm0"); &movq ("mm7","mm4");
1509 &movq ("mm2","mm0",1); &movq ("mm6","mm4",1);
1510 &movq ("mm1","mm0"); &movq ("mm5","mm4");
1511 &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16)
1512 &pslld ("mm2",8); &pslld ("mm6",8);
1513 &psrld ("mm3",8); &psrld ("mm7",8);
1514 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8
1515 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8
1516 &pslld ("mm2",16); &pslld ("mm6",16);
1517 &psrld ("mm3",16); &psrld ("mm7",16);
1518 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24
1519 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24
1520
1521 &movq ("mm3",&QWP(8,"esp"));
1522 &pxor ("mm2","mm2"); &pxor ("mm6","mm6");
1523 &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5");
1524 &pand ("mm2","mm3"); &pand ("mm6","mm3");
1525 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1526 &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2
1527 &movq ("mm3","mm1"); &movq ("mm7","mm5");
1528 &movq ("mm2","mm1"); &movq ("mm6","mm5");
1529 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2
1530 &pslld ("mm3",24); &pslld ("mm7",24);
1531 &psrld ("mm2",8); &psrld ("mm6",8);
1532 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24
1533 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8
1534
1535 &movq ("mm2",&QWP(8,"esp"));
1536 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1537 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1538 &pand ("mm3","mm2"); &pand ("mm7","mm2");
1539 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1540 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
1541 &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
1542 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
1543 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
1544
1545 &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1546 &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1547 &pand ("mm3","mm2"); &pand ("mm7","mm2");
1548 &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1549 &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8
1550 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8
1551 &movq ("mm3","mm1"); &movq ("mm7","mm5");
1552 &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1);
1553 &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)
1554 &pslld ("mm1",8); &pslld ("mm5",8);
1555 &psrld ("mm3",8); &psrld ("mm7",8);
1556 &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
1557 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8
1558 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8
1559 &mov ($s0,&DWP(0-128,$tbl));
1560 &pslld ("mm1",16); &pslld ("mm5",16);
1561 &mov ($s1,&DWP(64-128,$tbl));
1562 &psrld ("mm3",16); &psrld ("mm7",16);
1563 &mov ($s2,&DWP(128-128,$tbl));
1564 &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24
1565 &mov ($s3,&DWP(192-128,$tbl));
1566 &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24
1567
1568 &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
1569 &jmp (&label("loop"));
1570
1571 &set_label("out",16);
1572 &pxor ("mm0",&QWP(0,$key));
1573 &pxor ("mm4",&QWP(8,$key));
1574
1575 &ret ();
1576&function_end_B("_sse_AES_decrypt_compact");
1577 }
1578
1579######################################################################
1580# Vanilla block function.
1581######################################################################
1582
1583sub decstep()
1584{ my ($i,$td,@s) = @_;
1585 my $tmp = $key;
1586 my $out = $i==3?$s[0]:$acc;
1587
1588	# no instructions are reordered, as performance appears
1589	# optimal... or rather all attempts to reorder didn't
1590	# result in better performance [which, by the way, is not a
1591	# bit lower than encryption].
1592 if($i==3) { &mov ($key,$__key); }
1593 else { &mov ($out,$s[0]); }
1594 &and ($out,0xFF);
1595 &mov ($out,&DWP(0,$td,$out,8));
1596
1597 if ($i==3) { $tmp=$s[1]; }
1598 &movz ($tmp,&HB($s[1]));
1599 &xor ($out,&DWP(3,$td,$tmp,8));
1600
1601 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1602 else { &mov ($tmp,$s[2]); }
1603 &shr ($tmp,16);
1604 &and ($tmp,0xFF);
1605 &xor ($out,&DWP(2,$td,$tmp,8));
1606
1607 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
1608 else { &mov ($tmp,$s[3]); }
1609 &shr ($tmp,24);
1610 &xor ($out,&DWP(1,$td,$tmp,8));
1611 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1612 if ($i==3) { &mov ($s[3],$__s0); }
1613 &comment();
1614}
1615
1616sub declast()
1617{ my ($i,$td,@s)=@_;
1618 my $tmp = $key;
1619 my $out = $i==3?$s[0]:$acc;
1620
1621 if($i==0) { &lea ($td,&DWP(2048+128,$td));
1622 &mov ($tmp,&DWP(0-128,$td));
1623 &mov ($acc,&DWP(32-128,$td));
1624 &mov ($tmp,&DWP(64-128,$td));
1625 &mov ($acc,&DWP(96-128,$td));
1626 &mov ($tmp,&DWP(128-128,$td));
1627 &mov ($acc,&DWP(160-128,$td));
1628 &mov ($tmp,&DWP(192-128,$td));
1629 &mov ($acc,&DWP(224-128,$td));
1630 &lea ($td,&DWP(-128,$td)); }
1631 if($i==3) { &mov ($key,$__key); }
1632 else { &mov ($out,$s[0]); }
1633 &and ($out,0xFF);
1634 &movz ($out,&BP(0,$td,$out,1));
1635
1636 if ($i==3) { $tmp=$s[1]; }
1637 &movz ($tmp,&HB($s[1]));
1638 &movz ($tmp,&BP(0,$td,$tmp,1));
1639 &shl ($tmp,8);
1640 &xor ($out,$tmp);
1641
1642 if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1643	else        {	&mov	($tmp,$s[2]);			}
1644 &shr ($tmp,16);
1645 &and ($tmp,0xFF);
1646 &movz ($tmp,&BP(0,$td,$tmp,1));
1647 &shl ($tmp,16);
1648 &xor ($out,$tmp);
1649
1650 if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
1651 else { &mov ($tmp,$s[3]); }
1652 &shr ($tmp,24);
1653 &movz ($tmp,&BP(0,$td,$tmp,1));
1654 &shl ($tmp,24);
1655 &xor ($out,$tmp);
1656 if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1657 if ($i==3) { &mov ($s[3],$__s0);
1658 &lea ($td,&DWP(-2048,$td)); }
1659}
1660
1661&function_begin_B("_x86_AES_decrypt");
1662 # note that caller is expected to allocate stack frame for me!
1663 &mov ($__key,$key); # save key
1664
1665 &xor ($s0,&DWP(0,$key)); # xor with key
1666 &xor ($s1,&DWP(4,$key));
1667 &xor ($s2,&DWP(8,$key));
1668 &xor ($s3,&DWP(12,$key));
1669
1670 &mov ($acc,&DWP(240,$key)); # load key->rounds
1671
1672 if ($small_footprint) {
1673 &lea ($acc,&DWP(-2,$acc,$acc));
1674 &lea ($acc,&DWP(0,$key,$acc,8));
1675 &mov ($__end,$acc); # end of key schedule
1676 &set_label("loop",16);
1677 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1678 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1679 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1680 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1681 &add ($key,16); # advance rd_key
1682 &xor ($s0,&DWP(0,$key));
1683 &xor ($s1,&DWP(4,$key));
1684 &xor ($s2,&DWP(8,$key));
1685 &xor ($s3,&DWP(12,$key));
1686 &cmp ($key,$__end);
1687 &mov ($__key,$key);
1688 &jb (&label("loop"));
1689 }
1690 else {
1691 &cmp ($acc,10);
1692 &jle (&label("10rounds"));
1693 &cmp ($acc,12);
1694 &jle (&label("12rounds"));
1695
1696 &set_label("14rounds",4);
1697 for ($i=1;$i<3;$i++) {
1698 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1699 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1700 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1701 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1702 &xor ($s0,&DWP(16*$i+0,$key));
1703 &xor ($s1,&DWP(16*$i+4,$key));
1704 &xor ($s2,&DWP(16*$i+8,$key));
1705 &xor ($s3,&DWP(16*$i+12,$key));
1706 }
1707 &add ($key,32);
1708 &mov ($__key,$key); # advance rd_key
1709 &set_label("12rounds",4);
1710 for ($i=1;$i<3;$i++) {
1711 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1712 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1713 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1714 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1715 &xor ($s0,&DWP(16*$i+0,$key));
1716 &xor ($s1,&DWP(16*$i+4,$key));
1717 &xor ($s2,&DWP(16*$i+8,$key));
1718 &xor ($s3,&DWP(16*$i+12,$key));
1719 }
1720 &add ($key,32);
1721 &mov ($__key,$key); # advance rd_key
1722 &set_label("10rounds",4);
1723 for ($i=1;$i<10;$i++) {
1724 &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1725 &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1726 &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1727 &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1728 &xor ($s0,&DWP(16*$i+0,$key));
1729 &xor ($s1,&DWP(16*$i+4,$key));
1730 &xor ($s2,&DWP(16*$i+8,$key));
1731 &xor ($s3,&DWP(16*$i+12,$key));
1732 }
1733 }
1734
1735 &declast(0,$tbl,$s0,$s3,$s2,$s1);
1736 &declast(1,$tbl,$s1,$s0,$s3,$s2);
1737 &declast(2,$tbl,$s2,$s1,$s0,$s3);
1738 &declast(3,$tbl,$s3,$s2,$s1,$s0);
1739
1740 &add ($key,$small_footprint?16:160);
1741 &xor ($s0,&DWP(0,$key));
1742 &xor ($s1,&DWP(4,$key));
1743 &xor ($s2,&DWP(8,$key));
1744 &xor ($s3,&DWP(12,$key));
1745
1746 &ret ();
1747
1748&set_label("AES_Td",64); # Yes! I keep it in the code segment!
1749 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
1750 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
1751 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
1752 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
1753 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
1754 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
1755 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
1756 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
1757 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
1758 &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
1759 &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
1760 &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
1761 &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
1762 &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
1763 &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
1764 &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
1765 &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
1766 &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
1767 &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
1768 &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
1769 &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
1770 &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
1771 &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
1772 &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
1773 &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
1774 &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
1775 &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
1776 &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
1777 &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
1778 &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
1779 &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
1780 &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
1781 &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
1782 &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
1783 &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
1784 &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
1785 &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
1786 &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
1787 &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
1788 &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
1789 &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
1790 &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
1791 &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
1792 &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
1793 &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
1794 &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
1795 &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
1796 &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
1797 &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
1798 &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
1799 &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
1800 &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
1801 &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
1802 &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
1803 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
1804 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
1805 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
1806 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
1807 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
1808 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
1809 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
1810 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
1811 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
1812 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
1813
1814#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
1815 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1816 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1817 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1818 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1819 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1820 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1821 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1822 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1823 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1824 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1825 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1826 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1827 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1828 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1829 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1830 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1831 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1832 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1833 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1834 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1835 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1836 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1837 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1838 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1839 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1840 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1841 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1842 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1843 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1844 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1845 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1846 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1847
1848 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1849 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1850 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1851 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1852 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1853 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1854 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1855 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1856 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1857 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1858 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1859 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1860 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1861 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1862 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1863 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1864 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1865 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1866 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1867 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1868 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1869 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1870 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1871 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1872 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1873 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1874 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1875 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1876 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1877 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1878 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1879 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1880
1881 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1882 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1883 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1884 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1885 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1886 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1887 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1888 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1889 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1890 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1891 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1892 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1893 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1894 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1895 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1896 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1897 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1898 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1899 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1900 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1901 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1902 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1903 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1904 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1905 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1906 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1907 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1908 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1909 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1910 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1911 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1912 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1913
1914 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1915 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1916 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1917 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1918 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1919 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1920 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1921 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1922 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1923 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1924 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1925 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1926 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1927 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1928 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1929 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1930 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1931 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1932 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1933 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1934 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1935 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1936 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1937 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1938 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1939 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1940 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1941 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1942 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1943 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1944 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1945 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1946&function_end_B("_x86_AES_decrypt");
1947
1948# void AES_decrypt (const unsigned char *inp, unsigned char *out, const AES_KEY *key);
1949&function_begin("AES_decrypt");
1950 &mov ($acc,&wparam(0)); # load inp
1951 &mov ($key,&wparam(2)); # load key
1952
1953 &mov ($s0,"esp");
1954 &sub ("esp",36);
1955 &and ("esp",-64); # align to cache-line
1956
1957 # place stack frame just "above" the key schedule
1958 &lea ($s1,&DWP(-64-63,$key));
1959 &sub ($s1,"esp");
1960 &neg ($s1);
1961 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1962 &sub ("esp",$s1);
1963 &add ("esp",4); # 4 is reserved for caller's return address
1964 &mov ($_esp,$s0); # save stack pointer
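# Net effect of the arithmetic above: %esp stays 64-byte aligned and
# ($key - %esp) is pinned modulo 1024 up to cache-line granularity, so
# the scratch frame keeps a fixed cache-set relationship to the key
# schedule it is parked next to.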
1965
1966 &call (&label("pic_point")); # make it PIC!
1967 &set_label("pic_point");
1968 &blindpop($tbl);
1969 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
1970 &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
1971
1972 # pick Td4 copy which can't "overlap" with stack frame or key schedule
1973 &lea ($s1,&DWP(768-4,"esp"));
1974 &sub ($s1,$tbl);
1975 &and ($s1,0x300);
1976 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
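# Example of the pick above: the four Td4 copies start 2048, 2304, 2560
# and 2816 bytes into AES_Td, and ((esp+768-4)-AES_Td)&0x300 selects one
# of them; if that difference comes out as, say, 0x200, $tbl lands 128
# bytes into the third copy (the -128 displacements in the compact
# routines undo that bias), chosen so that copy and stack frame cannot
# collide in their low address bits.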
1977
1978 if (!$x86only) {
1979 &bt (&DWP(0,$s0),25); # check for SSE bit
1980 &jnc (&label("x86"));
1981
1982 &movq ("mm0",&QWP(0,$acc));
1983 &movq ("mm4",&QWP(8,$acc));
1984 &call ("_sse_AES_decrypt_compact");
1985 &mov ("esp",$_esp); # restore stack pointer
1986 &mov ($acc,&wparam(1)); # load out
1987 &movq (&QWP(0,$acc),"mm0"); # write output data
1988 &movq (&QWP(8,$acc),"mm4");
1989 &emms ();
1990 &function_end_A();
1991 }
1992 &set_label("x86",16);
1993 &mov ($_tbl,$tbl);
1994 &mov ($s0,&DWP(0,$acc)); # load input data
1995 &mov ($s1,&DWP(4,$acc));
1996 &mov ($s2,&DWP(8,$acc));
1997 &mov ($s3,&DWP(12,$acc));
1998 &call ("_x86_AES_decrypt_compact");
1999 &mov ("esp",$_esp); # restore stack pointer
2000 &mov ($acc,&wparam(1)); # load out
2001 &mov (&DWP(0,$acc),$s0); # write output data
2002 &mov (&DWP(4,$acc),$s1);
2003 &mov (&DWP(8,$acc),$s2);
2004 &mov (&DWP(12,$acc),$s3);
2005&function_end("AES_decrypt");
2006
2007# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
2008# size_t length, const AES_KEY *key,
2009# unsigned char *ivp,const int enc);
2010{
2011# stack frame layout (left: offsets from %esp after the later "add esp,4"; right: from the aligned base)
2012# -4(%esp) # return address 0(%esp)
2013# 0(%esp) # s0 backing store 4(%esp)
2014# 4(%esp) # s1 backing store 8(%esp)
2015# 8(%esp) # s2 backing store 12(%esp)
2016# 12(%esp) # s3 backing store 16(%esp)
2017# 16(%esp) # key backup 20(%esp)
2018# 20(%esp) # end of key schedule 24(%esp)
2019# 24(%esp) # %ebp backup 28(%esp)
2020# 28(%esp) # %esp backup
2021my $_inp=&DWP(32,"esp"); # copy of wparam(0)
2022my $_out=&DWP(36,"esp"); # copy of wparam(1)
2023my $_len=&DWP(40,"esp"); # copy of wparam(2)
2024my $_key=&DWP(44,"esp"); # copy of wparam(3)
2025my $_ivp=&DWP(48,"esp"); # copy of wparam(4)
2026my $_tmp=&DWP(52,"esp"); # volatile variable
2027#
2028my $ivec=&DWP(60,"esp"); # ivec[16]
2029my $aes_key=&DWP(76,"esp"); # copy of aes_key
2030my $mark=&DWP(76+240,"esp"); # copy of aes_key->rounds
2031
2032&function_begin("AES_cbc_encrypt");
2033 &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len
2034 &cmp ($s2,0);
2035 &je (&label("drop_out"));
2036
2037 &call (&label("pic_point")); # make it PIC!
2038 &set_label("pic_point");
2039 &blindpop($tbl);
2040 &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
2041
2042 &cmp (&wparam(5),0);
2043 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
2044 &jne (&label("picked_te"));
2045 &lea ($tbl,&DWP(&label("AES_Td")."-".&label("AES_Te"),$tbl));
2046 &set_label("picked_te");
2047
2048	# one can argue whether this is required
2049 &pushf ();
2050 &cld ();
2051
2052 &cmp ($s2,$speed_limit);
2053 &jb (&label("slow_way"));
2054 &test ($s2,15);
2055 &jnz (&label("slow_way"));
2056 if (!$x86only) {
2057 &bt (&DWP(0,$s0),28); # check for hyper-threading bit
2058 &jc (&label("slow_way"));
2059 }
2060 # pre-allocate aligned stack frame...
2061 &lea ($acc,&DWP(-80-244,"esp"));
2062 &and ($acc,-64);
2063
2064 # ... and make sure it doesn't alias with $tbl modulo 4096
2065 &mov ($s0,$tbl);
2066 &lea ($s1,&DWP(2048+256,$tbl));
2067 &mov ($s3,$acc);
2068 &and ($s0,0xfff); # s = %ebp&0xfff
2069 &and ($s1,0xfff); # e = (%ebp+2048+256)&0xfff
2070 &and ($s3,0xfff); # p = %esp&0xfff
2071
2072	&cmp	($s3,$s1);		# if (p>=e) %esp -= (p-e);
2073 &jb (&label("tbl_break_out"));
2074 &sub ($s3,$s1);
2075 &sub ($acc,$s3);
2076 &jmp (&label("tbl_ok"));
2077 &set_label("tbl_break_out",4); # else %esp -= (p-s)&0xfff + framesz;
2078 &sub ($s3,$s0);
2079 &and ($s3,0xfff);
2080 &add ($s3,384);
2081 &sub ($acc,$s3);
2082 &set_label("tbl_ok",4);
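# Worked example, everything modulo 4096: with s = 0xe00 the table ends
# at e = (0xe00+0x900)&0xfff = 0x700. A frame at p = 0xd80 satisfies
# p >= e and is lowered by p-e = 0x680 to start right at the table's
# end; a frame at p = 0x500 < e is instead lowered by
# ((p-s)&0xfff)+384 = 0x880, clearing the table from below.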
2083
2084 &lea ($s3,&wparam(0)); # obtain pointer to parameter block
2085 &exch ("esp",$acc); # allocate stack frame
2086 &add ("esp",4); # reserve for return address!
2087 &mov ($_tbl,$tbl); # save %ebp
2088 &mov ($_esp,$acc); # save %esp
2089
2090 &mov ($s0,&DWP(0,$s3)); # load inp
2091 &mov ($s1,&DWP(4,$s3)); # load out
2092 #&mov ($s2,&DWP(8,$s3)); # load len
2093 &mov ($key,&DWP(12,$s3)); # load key
2094 &mov ($acc,&DWP(16,$s3)); # load ivp
2095 &mov ($s3,&DWP(20,$s3)); # load enc flag
2096
2097 &mov ($_inp,$s0); # save copy of inp
2098 &mov ($_out,$s1); # save copy of out
2099 &mov ($_len,$s2); # save copy of len
2100 &mov ($_key,$key); # save copy of key
2101 &mov ($_ivp,$acc); # save copy of ivp
2102
2103 &mov ($mark,0); # copy of aes_key->rounds = 0;
2104 # do we copy key schedule to stack?
2105 &mov ($s1 eq "ebx" ? $s1 : "",$key);
2106 &mov ($s2 eq "ecx" ? $s2 : "",244/4);
2107 &sub ($s1,$tbl);
2108 &mov ("esi",$key);
2109 &and ($s1,0xfff);
2110 &lea ("edi",$aes_key);
2111 &cmp ($s1,2048+256);
2112 &jb (&label("do_copy"));
2113 &cmp ($s1,4096-244);
2114 &jb (&label("skip_copy"));
2115 &set_label("do_copy",4);
2116 &mov ($_key,"edi");
2117	&data_word(0xA5F3F689);	# raw bytes 89 F6 F3 A5: mov %esi,%esi; rep movsd
2118 &set_label("skip_copy");
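# In other words, the 244-byte key schedule is copied onto the stack
# whenever its distance from AES_Te modulo 4096 either falls inside the
# 2048+256 bytes of table data or sits so close to the 4K wrap that the
# schedule's tail would; e.g. a distance of 0x300 copies, 0xa00 does
# not, 0xf80 copies again.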
2119
2120 &mov ($key,16);
2121 &set_label("prefetch_tbl",4);
2122 &mov ($s0,&DWP(0,$tbl));
2123 &mov ($s1,&DWP(32,$tbl));
2124 &mov ($s2,&DWP(64,$tbl));
2125 &mov ($acc,&DWP(96,$tbl));
2126 &lea ($tbl,&DWP(128,$tbl));
2127 &sub ($key,1);
2128 &jnz (&label("prefetch_tbl"));
2129 &sub ($tbl,2048);
2130
2131 &mov ($acc,$_inp);
2132 &mov ($key,$_ivp);
2133
2134 &cmp ($s3,0);
2135 &je (&label("fast_decrypt"));
2136
2137#----------------------------- ENCRYPT -----------------------------#
2138 &mov ($s0,&DWP(0,$key)); # load iv
2139 &mov ($s1,&DWP(4,$key));
2140
2141 &set_label("fast_enc_loop",16);
2142 &mov ($s2,&DWP(8,$key));
2143 &mov ($s3,&DWP(12,$key));
2144
2145 &xor ($s0,&DWP(0,$acc)); # xor input data
2146 &xor ($s1,&DWP(4,$acc));
2147 &xor ($s2,&DWP(8,$acc));
2148 &xor ($s3,&DWP(12,$acc));
2149
2150 &mov ($key,$_key); # load key
2151 &call ("_x86_AES_encrypt");
2152
2153 &mov ($acc,$_inp); # load inp
2154 &mov ($key,$_out); # load out
2155
2156 &mov (&DWP(0,$key),$s0); # save output data
2157 &mov (&DWP(4,$key),$s1);
2158 &mov (&DWP(8,$key),$s2);
2159 &mov (&DWP(12,$key),$s3);
2160
2161 &lea ($acc,&DWP(16,$acc)); # advance inp
2162 &mov ($s2,$_len); # load len
2163 &mov ($_inp,$acc); # save inp
2164 &lea ($s3,&DWP(16,$key)); # advance out
2165 &mov ($_out,$s3); # save out
2166 &sub ($s2,16); # decrease len
2167 &mov ($_len,$s2); # save len
2168 &jnz (&label("fast_enc_loop"));
2169 &mov ($acc,$_ivp); # load ivp
2170 &mov ($s2,&DWP(8,$key)); # restore last 2 dwords
2171 &mov ($s3,&DWP(12,$key));
2172 &mov (&DWP(0,$acc),$s0); # save ivec
2173 &mov (&DWP(4,$acc),$s1);
2174 &mov (&DWP(8,$acc),$s2);
2175 &mov (&DWP(12,$acc),$s3);
2176
2177 &cmp ($mark,0); # was the key schedule copied?
2178 &mov ("edi",$_key);
2179 &je (&label("skip_ezero"));
2180 # zero copy of key schedule
2181 &mov ("ecx",240/4);
2182 &xor ("eax","eax");
2183 &align (4);
2184 &data_word(0xABF3F689); # rep stosd
2185	&set_label("skip_ezero");
2186 &mov ("esp",$_esp);
2187 &popf ();
2188 &set_label("drop_out");
2189 &function_end_A();
2190 &pushf (); # kludge, never executed
2191
2192#----------------------------- DECRYPT -----------------------------#
2193&set_label("fast_decrypt",16);
2194
2195 &cmp ($acc,$_out);
2196 &je (&label("fast_dec_in_place")); # in-place processing...
2197
2198 &mov ($_tmp,$key);
2199
2200 &align (4);
2201 &set_label("fast_dec_loop",16);
2202 &mov ($s0,&DWP(0,$acc)); # read input
2203 &mov ($s1,&DWP(4,$acc));
2204 &mov ($s2,&DWP(8,$acc));
2205 &mov ($s3,&DWP(12,$acc));
2206
2207 &mov ($key,$_key); # load key
2208 &call ("_x86_AES_decrypt");
2209
2210 &mov ($key,$_tmp); # load ivp
2211 &mov ($acc,$_len); # load len
2212 &xor ($s0,&DWP(0,$key)); # xor iv
2213 &xor ($s1,&DWP(4,$key));
2214 &xor ($s2,&DWP(8,$key));
2215 &xor ($s3,&DWP(12,$key));
2216
2217 &mov ($key,$_out); # load out
2218 &mov ($acc,$_inp); # load inp
2219
2220 &mov (&DWP(0,$key),$s0); # write output
2221 &mov (&DWP(4,$key),$s1);
2222 &mov (&DWP(8,$key),$s2);
2223 &mov (&DWP(12,$key),$s3);
2224
2225 &mov ($s2,$_len); # load len
2226 &mov ($_tmp,$acc); # save ivp
2227 &lea ($acc,&DWP(16,$acc)); # advance inp
2228 &mov ($_inp,$acc); # save inp
2229 &lea ($key,&DWP(16,$key)); # advance out
2230 &mov ($_out,$key); # save out
2231 &sub ($s2,16); # decrease len
2232 &mov ($_len,$s2); # save len
2233 &jnz (&label("fast_dec_loop"));
2234 &mov ($key,$_tmp); # load temp ivp
2235 &mov ($acc,$_ivp); # load user ivp
2236 &mov ($s0,&DWP(0,$key)); # load iv
2237 &mov ($s1,&DWP(4,$key));
2238 &mov ($s2,&DWP(8,$key));
2239 &mov ($s3,&DWP(12,$key));
2240 &mov (&DWP(0,$acc),$s0); # copy back to user
2241 &mov (&DWP(4,$acc),$s1);
2242 &mov (&DWP(8,$acc),$s2);
2243 &mov (&DWP(12,$acc),$s3);
2244 &jmp (&label("fast_dec_out"));
2245
2246 &set_label("fast_dec_in_place",16);
2247 &set_label("fast_dec_in_place_loop");
2248 &mov ($s0,&DWP(0,$acc)); # read input
2249 &mov ($s1,&DWP(4,$acc));
2250 &mov ($s2,&DWP(8,$acc));
2251 &mov ($s3,&DWP(12,$acc));
2252
2253 &lea ($key,$ivec);
2254 &mov (&DWP(0,$key),$s0); # copy to temp
2255 &mov (&DWP(4,$key),$s1);
2256 &mov (&DWP(8,$key),$s2);
2257 &mov (&DWP(12,$key),$s3);
2258
2259 &mov ($key,$_key); # load key
2260 &call ("_x86_AES_decrypt");
2261
2262 &mov ($key,$_ivp); # load ivp
2263 &mov ($acc,$_out); # load out
2264 &xor ($s0,&DWP(0,$key)); # xor iv
2265 &xor ($s1,&DWP(4,$key));
2266 &xor ($s2,&DWP(8,$key));
2267 &xor ($s3,&DWP(12,$key));
2268
2269 &mov (&DWP(0,$acc),$s0); # write output
2270 &mov (&DWP(4,$acc),$s1);
2271 &mov (&DWP(8,$acc),$s2);
2272 &mov (&DWP(12,$acc),$s3);
2273
2274 &lea ($acc,&DWP(16,$acc)); # advance out
2275 &mov ($_out,$acc); # save out
2276
2277 &lea ($acc,$ivec);
2278 &mov ($s0,&DWP(0,$acc)); # read temp
2279 &mov ($s1,&DWP(4,$acc));
2280 &mov ($s2,&DWP(8,$acc));
2281 &mov ($s3,&DWP(12,$acc));
2282
2283 &mov (&DWP(0,$key),$s0); # copy iv
2284 &mov (&DWP(4,$key),$s1);
2285 &mov (&DWP(8,$key),$s2);
2286 &mov (&DWP(12,$key),$s3);
2287
2288 &mov ($acc,$_inp); # load inp
2289 &mov ($s2,$_len); # load len
2290 &lea ($acc,&DWP(16,$acc)); # advance inp
2291 &mov ($_inp,$acc); # save inp
2292 &sub ($s2,16); # decrease len
2293 &mov ($_len,$s2); # save len
2294 &jnz (&label("fast_dec_in_place_loop"));
2295
2296 &set_label("fast_dec_out",4);
2297 &cmp ($mark,0); # was the key schedule copied?
2298 &mov ("edi",$_key);
2299 &je (&label("skip_dzero"));
2300 # zero copy of key schedule
2301 &mov ("ecx",240/4);
2302 &xor ("eax","eax");
2303 &align (4);
2304 &data_word(0xABF3F689); # rep stosd
2305	&set_label("skip_dzero");
2306 &mov ("esp",$_esp);
2307 &popf ();
2308 &function_end_A();
2309 &pushf (); # kludge, never executed
2310
2311#--------------------------- SLOW ROUTINE ---------------------------#
2312&set_label("slow_way",16);
2313
2314 &mov ($s0,&DWP(0,$s0)) if (!$x86only);# load OPENSSL_ia32cap
2315 &mov ($key,&wparam(3)); # load key
2316
2317 # pre-allocate aligned stack frame...
2318 &lea ($acc,&DWP(-80,"esp"));
2319 &and ($acc,-64);
2320
2321 # ... and make sure it doesn't alias with $key modulo 1024
2322 &lea ($s1,&DWP(-80-63,$key));
2323 &sub ($s1,$acc);
2324 &neg ($s1);
2325 &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
2326 &sub ($acc,$s1);
2327
2328 # pick S-box copy which can't overlap with stack frame or $key
2329 &lea ($s1,&DWP(768,$acc));
2330 &sub ($s1,$tbl);
2331 &and ($s1,0x300);
2332 &lea ($tbl,&DWP(2048+128,$tbl,$s1));
2333
2334 &lea ($s3,&wparam(0)); # pointer to parameter block
2335
2336 &exch ("esp",$acc);
2337 &add ("esp",4); # reserve for return address!
2338 &mov ($_tbl,$tbl); # save %ebp
2339 &mov ($_esp,$acc); # save %esp
2340 &mov ($_tmp,$s0); # save OPENSSL_ia32cap
2341
2342 &mov ($s0,&DWP(0,$s3)); # load inp
2343 &mov ($s1,&DWP(4,$s3)); # load out
2344 #&mov ($s2,&DWP(8,$s3)); # load len
2345 #&mov ($key,&DWP(12,$s3)); # load key
2346 &mov ($acc,&DWP(16,$s3)); # load ivp
2347 &mov ($s3,&DWP(20,$s3)); # load enc flag
2348
2349 &mov ($_inp,$s0); # save copy of inp
2350 &mov ($_out,$s1); # save copy of out
2351 &mov ($_len,$s2); # save copy of len
2352 &mov ($_key,$key); # save copy of key
2353 &mov ($_ivp,$acc); # save copy of ivp
2354
2355 &mov ($key,$acc);
2356 &mov ($acc,$s0);
2357
2358 &cmp ($s3,0);
2359 &je (&label("slow_decrypt"));
2360
2361#--------------------------- SLOW ENCRYPT ---------------------------#
2362 &cmp ($s2,16);
2363 &mov ($s3,$s1);
2364 &jb (&label("slow_enc_tail"));
2365
2366 if (!$x86only) {
2367 &bt ($_tmp,25); # check for SSE bit
2368 &jnc (&label("slow_enc_x86"));
2369
2370 &movq ("mm0",&QWP(0,$key)); # load iv
2371 &movq ("mm4",&QWP(8,$key));
2372
2373 &set_label("slow_enc_loop_sse",16);
2374 &pxor ("mm0",&QWP(0,$acc)); # xor input data
2375 &pxor ("mm4",&QWP(8,$acc));
2376
2377 &mov ($key,$_key);
2378 &call ("_sse_AES_encrypt_compact");
2379
2380 &mov ($acc,$_inp); # load inp
2381 &mov ($key,$_out); # load out
2382 &mov ($s2,$_len); # load len
2383
2384 &movq (&QWP(0,$key),"mm0"); # save output data
2385 &movq (&QWP(8,$key),"mm4");
2386
2387 &lea ($acc,&DWP(16,$acc)); # advance inp
2388 &mov ($_inp,$acc); # save inp
2389 &lea ($s3,&DWP(16,$key)); # advance out
2390 &mov ($_out,$s3); # save out
2391 &sub ($s2,16); # decrease len
2392 &cmp ($s2,16);
2393 &mov ($_len,$s2); # save len
2394 &jae (&label("slow_enc_loop_sse"));
2395 &test ($s2,15);
2396 &jnz (&label("slow_enc_tail"));
2397 &mov ($acc,$_ivp); # load ivp
2398 &movq (&QWP(0,$acc),"mm0"); # save ivec
2399 &movq (&QWP(8,$acc),"mm4");
2400 &emms ();
2401 &mov ("esp",$_esp);
2402 &popf ();
2403 &function_end_A();
2404 &pushf (); # kludge, never executed
2405 }
2406 &set_label("slow_enc_x86",16);
2407 &mov ($s0,&DWP(0,$key)); # load iv
2408 &mov ($s1,&DWP(4,$key));
2409
2410 &set_label("slow_enc_loop_x86",4);
2411 &mov ($s2,&DWP(8,$key));
2412 &mov ($s3,&DWP(12,$key));
2413
2414 &xor ($s0,&DWP(0,$acc)); # xor input data
2415 &xor ($s1,&DWP(4,$acc));
2416 &xor ($s2,&DWP(8,$acc));
2417 &xor ($s3,&DWP(12,$acc));
2418
2419 &mov ($key,$_key); # load key
2420 &call ("_x86_AES_encrypt_compact");
2421
2422 &mov ($acc,$_inp); # load inp
2423 &mov ($key,$_out); # load out
2424
2425 &mov (&DWP(0,$key),$s0); # save output data
2426 &mov (&DWP(4,$key),$s1);
2427 &mov (&DWP(8,$key),$s2);
2428 &mov (&DWP(12,$key),$s3);
2429
2430 &mov ($s2,$_len); # load len
2431 &lea ($acc,&DWP(16,$acc)); # advance inp
2432 &mov ($_inp,$acc); # save inp
2433 &lea ($s3,&DWP(16,$key)); # advance out
2434 &mov ($_out,$s3); # save out
2435 &sub ($s2,16); # decrease len
2436 &cmp ($s2,16);
2437 &mov ($_len,$s2); # save len
2438 &jae (&label("slow_enc_loop_x86"));
2439 &test ($s2,15);
2440 &jnz (&label("slow_enc_tail"));
2441 &mov ($acc,$_ivp); # load ivp
2442 &mov ($s2,&DWP(8,$key)); # restore last dwords
2443 &mov ($s3,&DWP(12,$key));
2444 &mov (&DWP(0,$acc),$s0); # save ivec
2445 &mov (&DWP(4,$acc),$s1);
2446 &mov (&DWP(8,$acc),$s2);
2447 &mov (&DWP(12,$acc),$s3);
2448
2449 &mov ("esp",$_esp);
2450 &popf ();
2451 &function_end_A();
2452 &pushf (); # kludge, never executed
2453
2454 &set_label("slow_enc_tail",16);
2455 &emms () if (!$x86only);
2456 &mov ($key eq "edi"? $key:"",$s3); # load out to edi
2457 &mov ($s1,16);
2458 &sub ($s1,$s2);
2459 &cmp ($key,$acc eq "esi"? $acc:""); # compare with inp
2460 &je (&label("enc_in_place"));
2461 &align (4);
2462 &data_word(0xA4F3F689); # rep movsb # copy input
2463 &jmp (&label("enc_skip_in_place"));
2464 &set_label("enc_in_place");
2465 &lea ($key,&DWP(0,$key,$s2));
2466 &set_label("enc_skip_in_place");
2467 &mov ($s2,$s1);
2468 &xor ($s0,$s0);
2469 &align (4);
2470 &data_word(0xAAF3F689); # rep stosb # zero tail
2471
2472 &mov ($key,$_ivp); # restore ivp
2473 &mov ($acc,$s3); # output as input
2474 &mov ($s0,&DWP(0,$key));
2475 &mov ($s1,&DWP(4,$key));
2476 &mov ($_len,16); # len=16
2477 &jmp (&label("slow_enc_loop_x86")); # one more spin...
2478
2479#--------------------------- SLOW DECRYPT ---------------------------#
2480&set_label("slow_decrypt",16);
2481 if (!$x86only) {
2482 &bt ($_tmp,25); # check for SSE bit
2483 &jnc (&label("slow_dec_loop_x86"));
2484
2485 &set_label("slow_dec_loop_sse",4);
2486 &movq ("mm0",&QWP(0,$acc)); # read input
2487 &movq ("mm4",&QWP(8,$acc));
2488
2489 &mov ($key,$_key);
2490 &call ("_sse_AES_decrypt_compact");
2491
2492 &mov ($acc,$_inp); # load inp
2493 &lea ($s0,$ivec);
2494 &mov ($s1,$_out); # load out
2495 &mov ($s2,$_len); # load len
2496 &mov ($key,$_ivp); # load ivp
2497
2498 &movq ("mm1",&QWP(0,$acc)); # re-read input
2499 &movq ("mm5",&QWP(8,$acc));
2500
2501 &pxor ("mm0",&QWP(0,$key)); # xor iv
2502 &pxor ("mm4",&QWP(8,$key));
2503
2504 &movq (&QWP(0,$key),"mm1"); # copy input to iv
2505 &movq (&QWP(8,$key),"mm5");
2506
2507 &sub ($s2,16); # decrease len
2508 &jc (&label("slow_dec_partial_sse"));
2509
2510 &movq (&QWP(0,$s1),"mm0"); # write output
2511 &movq (&QWP(8,$s1),"mm4");
2512
2513 &lea ($s1,&DWP(16,$s1)); # advance out
2514 &mov ($_out,$s1); # save out
2515 &lea ($acc,&DWP(16,$acc)); # advance inp
2516 &mov ($_inp,$acc); # save inp
2517 &mov ($_len,$s2); # save len
2518 &jnz (&label("slow_dec_loop_sse"));
2519 &emms ();
2520 &mov ("esp",$_esp);
2521 &popf ();
2522 &function_end_A();
2523 &pushf (); # kludge, never executed
2524
2525 &set_label("slow_dec_partial_sse",16);
2526 &movq (&QWP(0,$s0),"mm0"); # save output to temp
2527 &movq (&QWP(8,$s0),"mm4");
2528 &emms ();
2529
2530 &add ($s2 eq "ecx" ? "ecx":"",16);
2531 &mov ("edi",$s1); # out
2532 &mov ("esi",$s0); # temp
2533 &align (4);
2534 &data_word(0xA4F3F689); # rep movsb # copy partial output
2535
2536 &mov ("esp",$_esp);
2537 &popf ();
2538 &function_end_A();
2539 &pushf (); # kludge, never executed
2540 }
2541 &set_label("slow_dec_loop_x86",16);
2542 &mov ($s0,&DWP(0,$acc)); # read input
2543 &mov ($s1,&DWP(4,$acc));
2544 &mov ($s2,&DWP(8,$acc));
2545 &mov ($s3,&DWP(12,$acc));
2546
2547 &lea ($key,$ivec);
2548 &mov (&DWP(0,$key),$s0); # copy to temp
2549 &mov (&DWP(4,$key),$s1);
2550 &mov (&DWP(8,$key),$s2);
2551 &mov (&DWP(12,$key),$s3);
2552
2553 &mov ($key,$_key); # load key
2554 &call ("_x86_AES_decrypt_compact");
2555
2556 &mov ($key,$_ivp); # load ivp
2557 &mov ($acc,$_len); # load len
2558 &xor ($s0,&DWP(0,$key)); # xor iv
2559 &xor ($s1,&DWP(4,$key));
2560 &xor ($s2,&DWP(8,$key));
2561 &xor ($s3,&DWP(12,$key));
2562
2563 &sub ($acc,16);
2564 &jc (&label("slow_dec_partial_x86"));
2565
2566 &mov ($_len,$acc); # save len
2567 &mov ($acc,$_out); # load out
2568
2569 &mov (&DWP(0,$acc),$s0); # write output
2570 &mov (&DWP(4,$acc),$s1);
2571 &mov (&DWP(8,$acc),$s2);
2572 &mov (&DWP(12,$acc),$s3);
2573
2574 &lea ($acc,&DWP(16,$acc)); # advance out
2575 &mov ($_out,$acc); # save out
2576
2577 &lea ($acc,$ivec);
2578 &mov ($s0,&DWP(0,$acc)); # read temp
2579 &mov ($s1,&DWP(4,$acc));
2580 &mov ($s2,&DWP(8,$acc));
2581 &mov ($s3,&DWP(12,$acc));
2582
2583 &mov (&DWP(0,$key),$s0); # copy it to iv
2584 &mov (&DWP(4,$key),$s1);
2585 &mov (&DWP(8,$key),$s2);
2586 &mov (&DWP(12,$key),$s3);
2587
2588 &mov ($acc,$_inp); # load inp
2589 &lea ($acc,&DWP(16,$acc)); # advance inp
2590 &mov ($_inp,$acc); # save inp
2591 &jnz (&label("slow_dec_loop_x86"));
2592 &mov ("esp",$_esp);
2593 &popf ();
2594 &function_end_A();
2595 &pushf (); # kludge, never executed
2596
2597 &set_label("slow_dec_partial_x86",16);
2598 &lea ($acc,$ivec);
2599 &mov (&DWP(0,$acc),$s0); # save output to temp
2600 &mov (&DWP(4,$acc),$s1);
2601 &mov (&DWP(8,$acc),$s2);
2602 &mov (&DWP(12,$acc),$s3);
2603
2604 &mov ($acc,$_inp);
2605 &mov ($s0,&DWP(0,$acc)); # re-read input
2606 &mov ($s1,&DWP(4,$acc));
2607 &mov ($s2,&DWP(8,$acc));
2608 &mov ($s3,&DWP(12,$acc));
2609
2610 &mov (&DWP(0,$key),$s0); # copy it to iv
2611 &mov (&DWP(4,$key),$s1);
2612 &mov (&DWP(8,$key),$s2);
2613 &mov (&DWP(12,$key),$s3);
2614
2615 &mov ("ecx",$_len);
2616 &mov ("edi",$_out);
2617 &lea ("esi",$ivec);
2618 &align (4);
2619 &data_word(0xA4F3F689); # rep movsb # copy partial output
2620
2621 &mov ("esp",$_esp);
2622 &popf ();
2623&function_end("AES_cbc_encrypt");
2624}
2625
2626#------------------------------------------------------------------#
2627
2628sub enckey()
2629{
2630 &movz ("esi",&LB("edx")); # rk[i]>>0
2631 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2632 &movz ("esi",&HB("edx")); # rk[i]>>8
2633 &shl ("ebx",24);
2634 &xor ("eax","ebx");
2635
2636 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2637 &shr ("edx",16);
2638 &movz ("esi",&LB("edx")); # rk[i]>>16
2639 &xor ("eax","ebx");
2640
2641 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2642 &movz ("esi",&HB("edx")); # rk[i]>>24
2643 &shl ("ebx",8);
2644 &xor ("eax","ebx");
2645
2646 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2647 &shl ("ebx",16);
2648 &xor ("eax","ebx");
2649
2650 &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon
2651}
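# In FIPS-197 terms enckey computes eax ^= SubWord(RotWord(edx)) ^ rcon:
# each movz pair is one Te4 S-box lookup (Te4 sits 128 bytes below $tbl,
# hence the -128 displacements, with the rcon table 1024-128 above), and
# the shl amounts -- 24, 0, 8, 16 for bytes 0 through 3 -- move every
# substituted byte one position down, which on a little-endian word is
# exactly RotWord. For the 128-bit schedule this is the familiar
#
#	rk[4] = rk[0] ^ SubWord(RotWord(rk[3])) ^ rcon[i]
#
# step; the 256-bit schedule's even step, inlined in the 14-round loop
# below, applies SubWord without the rotation.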
2652
2653&function_begin("_x86_AES_set_encrypt_key");
2654 &mov ("esi",&wparam(1)); # user supplied key
2655 &mov ("edi",&wparam(3)); # private key schedule
2656
2657	&test	("esi",-1);		# NULL userKey?
2658	&jz	(&label("badpointer"));
2659	&test	("edi",-1);		# NULL key schedule?
2660	&jz	(&label("badpointer"));
2661
2662 &call (&label("pic_point"));
2663 &set_label("pic_point");
2664 &blindpop($tbl);
2665 &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
2666 &lea ($tbl,&DWP(2048+128,$tbl));
2667
2668 # prefetch Te4
2669 &mov ("eax",&DWP(0-128,$tbl));
2670 &mov ("ebx",&DWP(32-128,$tbl));
2671 &mov ("ecx",&DWP(64-128,$tbl));
2672 &mov ("edx",&DWP(96-128,$tbl));
2673 &mov ("eax",&DWP(128-128,$tbl));
2674 &mov ("ebx",&DWP(160-128,$tbl));
2675 &mov ("ecx",&DWP(192-128,$tbl));
2676 &mov ("edx",&DWP(224-128,$tbl));
2677
2678 &mov ("ecx",&wparam(2)); # number of bits in key
2679 &cmp ("ecx",128);
2680 &je (&label("10rounds"));
2681 &cmp ("ecx",192);
2682 &je (&label("12rounds"));
2683 &cmp ("ecx",256);
2684 &je (&label("14rounds"));
2685 &mov ("eax",-2); # invalid number of bits
2686 &jmp (&label("exit"));
2687
2688 &set_label("10rounds");
2689 &mov ("eax",&DWP(0,"esi")); # copy first 4 dwords
2690 &mov ("ebx",&DWP(4,"esi"));
2691 &mov ("ecx",&DWP(8,"esi"));
2692 &mov ("edx",&DWP(12,"esi"));
2693 &mov (&DWP(0,"edi"),"eax");
2694 &mov (&DWP(4,"edi"),"ebx");
2695 &mov (&DWP(8,"edi"),"ecx");
2696 &mov (&DWP(12,"edi"),"edx");
2697
2698 &xor ("ecx","ecx");
2699 &jmp (&label("10shortcut"));
2700
2701 &align (4);
2702 &set_label("10loop");
2703 &mov ("eax",&DWP(0,"edi")); # rk[0]
2704 &mov ("edx",&DWP(12,"edi")); # rk[3]
2705 &set_label("10shortcut");
2706 &enckey ();
2707
2708 &mov (&DWP(16,"edi"),"eax"); # rk[4]
2709 &xor ("eax",&DWP(4,"edi"));
2710 &mov (&DWP(20,"edi"),"eax"); # rk[5]
2711 &xor ("eax",&DWP(8,"edi"));
2712 &mov (&DWP(24,"edi"),"eax"); # rk[6]
2713 &xor ("eax",&DWP(12,"edi"));
2714 &mov (&DWP(28,"edi"),"eax"); # rk[7]
2715 &inc ("ecx");
2716 &add ("edi",16);
2717 &cmp ("ecx",10);
2718 &jl (&label("10loop"));
2719
2720 &mov (&DWP(80,"edi"),10); # setup number of rounds
2721 &xor ("eax","eax");
2722 &jmp (&label("exit"));
2723
2724 &set_label("12rounds");
2725 &mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
2726 &mov ("ebx",&DWP(4,"esi"));
2727 &mov ("ecx",&DWP(8,"esi"));
2728 &mov ("edx",&DWP(12,"esi"));
2729 &mov (&DWP(0,"edi"),"eax");
2730 &mov (&DWP(4,"edi"),"ebx");
2731 &mov (&DWP(8,"edi"),"ecx");
2732 &mov (&DWP(12,"edi"),"edx");
2733 &mov ("ecx",&DWP(16,"esi"));
2734 &mov ("edx",&DWP(20,"esi"));
2735 &mov (&DWP(16,"edi"),"ecx");
2736 &mov (&DWP(20,"edi"),"edx");
2737
2738 &xor ("ecx","ecx");
2739 &jmp (&label("12shortcut"));
2740
2741 &align (4);
2742 &set_label("12loop");
2743 &mov ("eax",&DWP(0,"edi")); # rk[0]
2744 &mov ("edx",&DWP(20,"edi")); # rk[5]
2745 &set_label("12shortcut");
2746 &enckey ();
2747
2748 &mov (&DWP(24,"edi"),"eax"); # rk[6]
2749 &xor ("eax",&DWP(4,"edi"));
2750 &mov (&DWP(28,"edi"),"eax"); # rk[7]
2751 &xor ("eax",&DWP(8,"edi"));
2752 &mov (&DWP(32,"edi"),"eax"); # rk[8]
2753 &xor ("eax",&DWP(12,"edi"));
2754 &mov (&DWP(36,"edi"),"eax"); # rk[9]
2755
2756 &cmp ("ecx",7);
2757 &je (&label("12break"));
2758 &inc ("ecx");
2759
2760 &xor ("eax",&DWP(16,"edi"));
2761 &mov (&DWP(40,"edi"),"eax"); # rk[10]
2762 &xor ("eax",&DWP(20,"edi"));
2763 &mov (&DWP(44,"edi"),"eax"); # rk[11]
2764
2765 &add ("edi",24);
2766 &jmp (&label("12loop"));
2767
2768 &set_label("12break");
2769 &mov (&DWP(72,"edi"),12); # setup number of rounds
2770 &xor ("eax","eax");
2771 &jmp (&label("exit"));
2772
2773 &set_label("14rounds");
2774 &mov ("eax",&DWP(0,"esi")); # copy first 8 dwords
2775 &mov ("ebx",&DWP(4,"esi"));
2776 &mov ("ecx",&DWP(8,"esi"));
2777 &mov ("edx",&DWP(12,"esi"));
2778 &mov (&DWP(0,"edi"),"eax");
2779 &mov (&DWP(4,"edi"),"ebx");
2780 &mov (&DWP(8,"edi"),"ecx");
2781 &mov (&DWP(12,"edi"),"edx");
2782 &mov ("eax",&DWP(16,"esi"));
2783 &mov ("ebx",&DWP(20,"esi"));
2784 &mov ("ecx",&DWP(24,"esi"));
2785 &mov ("edx",&DWP(28,"esi"));
2786 &mov (&DWP(16,"edi"),"eax");
2787 &mov (&DWP(20,"edi"),"ebx");
2788 &mov (&DWP(24,"edi"),"ecx");
2789 &mov (&DWP(28,"edi"),"edx");
2790
2791 &xor ("ecx","ecx");
2792 &jmp (&label("14shortcut"));
2793
2794 &align (4);
2795 &set_label("14loop");
2796 &mov ("edx",&DWP(28,"edi")); # rk[7]
2797 &set_label("14shortcut");
2798 &mov ("eax",&DWP(0,"edi")); # rk[0]
2799
2800 &enckey ();
2801
2802 &mov (&DWP(32,"edi"),"eax"); # rk[8]
2803 &xor ("eax",&DWP(4,"edi"));
2804 &mov (&DWP(36,"edi"),"eax"); # rk[9]
2805 &xor ("eax",&DWP(8,"edi"));
2806 &mov (&DWP(40,"edi"),"eax"); # rk[10]
2807 &xor ("eax",&DWP(12,"edi"));
2808 &mov (&DWP(44,"edi"),"eax"); # rk[11]
2809
2810 &cmp ("ecx",6);
2811 &je (&label("14break"));
2812 &inc ("ecx");
2813
2814 &mov ("edx","eax");
2815 &mov ("eax",&DWP(16,"edi")); # rk[4]
2816 &movz ("esi",&LB("edx")); # rk[11]>>0
2817 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2818 &movz ("esi",&HB("edx")); # rk[11]>>8
2819 &xor ("eax","ebx");
2820
2821 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2822 &shr ("edx",16);
2823 &shl ("ebx",8);
2824 &movz ("esi",&LB("edx")); # rk[11]>>16
2825 &xor ("eax","ebx");
2826
2827 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2828 &movz ("esi",&HB("edx")); # rk[11]>>24
2829 &shl ("ebx",16);
2830 &xor ("eax","ebx");
2831
2832 &movz ("ebx",&BP(-128,$tbl,"esi",1));
2833 &shl ("ebx",24);
2834 &xor ("eax","ebx");
2835
2836 &mov (&DWP(48,"edi"),"eax"); # rk[12]
2837 &xor ("eax",&DWP(20,"edi"));
2838 &mov (&DWP(52,"edi"),"eax"); # rk[13]
2839 &xor ("eax",&DWP(24,"edi"));
2840 &mov (&DWP(56,"edi"),"eax"); # rk[14]
2841 &xor ("eax",&DWP(28,"edi"));
2842 &mov (&DWP(60,"edi"),"eax"); # rk[15]
2843
2844 &add ("edi",32);
2845 &jmp (&label("14loop"));
2846
2847 &set_label("14break");
2848 &mov (&DWP(48,"edi"),14); # setup number of rounds
2849 &xor ("eax","eax");
2850 &jmp (&label("exit"));
2851
2852 &set_label("badpointer");
2853 &mov ("eax",-1);
2854 &set_label("exit");
2855&function_end("_x86_AES_set_encrypt_key");
2856
2857# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
2858# AES_KEY *key)
2859&function_begin_B("AES_set_encrypt_key");
2860 &call ("_x86_AES_set_encrypt_key");
2861 &ret ();
2862&function_end_B("AES_set_encrypt_key");
2863
2864sub deckey()
2865{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
2866 my $tmp = $tbl;
2867
2868 &mov ($acc,$tp1);
2869 &and ($acc,0x80808080);
2870 &mov ($tmp,$acc);
2871 &shr ($tmp,7);
2872 &lea ($tp2,&DWP(0,$tp1,$tp1));
2873 &sub ($acc,$tmp);
2874 &and ($tp2,0xfefefefe);
2875 &and ($acc,0x1b1b1b1b);
2876 &xor ($acc,$tp2);
2877 &mov ($tp2,$acc);
2878
2879 &and ($acc,0x80808080);
2880 &mov ($tmp,$acc);
2881 &shr ($tmp,7);
2882 &lea ($tp4,&DWP(0,$tp2,$tp2));
2883 &sub ($acc,$tmp);
2884 &and ($tp4,0xfefefefe);
2885 &and ($acc,0x1b1b1b1b);
2886 &xor ($tp2,$tp1); # tp2^tp1
2887 &xor ($acc,$tp4);
2888 &mov ($tp4,$acc);
2889
2890 &and ($acc,0x80808080);
2891 &mov ($tmp,$acc);
2892 &shr ($tmp,7);
2893 &lea ($tp8,&DWP(0,$tp4,$tp4));
2894 &xor ($tp4,$tp1); # tp4^tp1
2895 &sub ($acc,$tmp);
2896 &and ($tp8,0xfefefefe);
2897 &and ($acc,0x1b1b1b1b);
2898 &rotl ($tp1,8); # = ROTATE(tp1,8)
2899 &xor ($tp8,$acc);
2900
2901 &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load
2902
2903 &xor ($tp1,$tp2);
2904 &xor ($tp2,$tp8);
2905 &xor ($tp1,$tp4);
2906 &rotl ($tp2,24);
2907 &xor ($tp4,$tp8);
2908 &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
2909 &rotl ($tp4,16);
2910 &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
2911 &rotl ($tp8,8);
2912 &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
2913 &mov ($tp2,$tmp);
2914 &xor ($tp1,$tp8); # ^= ROTATE(tp8,8)
2915
2916 &mov (&DWP(4*$i,$key),$tp1);
2917}
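# deckey is the key-schedule counterpart of dectransform above: the
# identical xtime chain replaces each encryption round-key word rk[i]
# with InvMixColumns(rk[i]) in place, as the equivalent inverse cipher
# construction requires (FIPS-197, section 5.3.5).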
2918
2919# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
2920# AES_KEY *key)
2921&function_begin_B("AES_set_decrypt_key");
2922 &call ("_x86_AES_set_encrypt_key");
2923 &cmp ("eax",0);
2924 &je (&label("proceed"));
2925 &ret ();
2926
2927 &set_label("proceed");
2928 &push ("ebp");
2929 &push ("ebx");
2930 &push ("esi");
2931 &push ("edi");
2932
2933 &mov ("esi",&wparam(2));
2934 &mov ("ecx",&DWP(240,"esi")); # pull number of rounds
2935 &lea ("ecx",&DWP(0,"","ecx",4));
2936 &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk
2937
2938 &set_label("invert",4); # invert order of chunks
2939 &mov ("eax",&DWP(0,"esi"));
2940 &mov ("ebx",&DWP(4,"esi"));
2941 &mov ("ecx",&DWP(0,"edi"));
2942 &mov ("edx",&DWP(4,"edi"));
2943 &mov (&DWP(0,"edi"),"eax");
2944 &mov (&DWP(4,"edi"),"ebx");
2945 &mov (&DWP(0,"esi"),"ecx");
2946 &mov (&DWP(4,"esi"),"edx");
2947 &mov ("eax",&DWP(8,"esi"));
2948 &mov ("ebx",&DWP(12,"esi"));
2949 &mov ("ecx",&DWP(8,"edi"));
2950 &mov ("edx",&DWP(12,"edi"));
2951 &mov (&DWP(8,"edi"),"eax");
2952 &mov (&DWP(12,"edi"),"ebx");
2953 &mov (&DWP(8,"esi"),"ecx");
2954 &mov (&DWP(12,"esi"),"edx");
2955 &add ("esi",16);
2956 &sub ("edi",16);
2957 &cmp ("esi","edi");
2958 &jne (&label("invert"));
2959
2960 &mov ($key,&wparam(2));
2961 &mov ($acc,&DWP(240,$key)); # pull number of rounds
2962 &lea ($acc,&DWP(-2,$acc,$acc));
2963 &lea ($acc,&DWP(0,$key,$acc,8));
2964 &mov (&wparam(2),$acc);
2965
2966 &mov ($s0,&DWP(16,$key)); # modulo-scheduled load
2967 &set_label("permute",4); # permute the key schedule
2968 &add ($key,16);
2969 &deckey (0,$key,$s0,$s1,$s2,$s3);
2970 &deckey (1,$key,$s1,$s2,$s3,$s0);
2971 &deckey (2,$key,$s2,$s3,$s0,$s1);
2972 &deckey (3,$key,$s3,$s0,$s1,$s2);
2973 &cmp ($key,&wparam(2));
2974 &jb (&label("permute"));
2975
2976 &xor ("eax","eax"); # return success
2977&function_end("AES_set_decrypt_key");
2978&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
2979
2980&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl
new file mode 100644
index 0000000000..86b86c4a0f
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-armv4.pl
@@ -0,0 +1,1134 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for ARMv4
11
12# January 2007.
13#
14# Code uses a single 1K S-box and is >2 times faster than code generated
15# by gcc-3.4.1. This is thanks to a unique feature of the ARMv4 ISA, which
16# allows a logical or arithmetic operation to be merged with a shift or
17# rotate in one instruction, emitting the combined result every cycle. The
18# module is endian-neutral. The performance is ~42 cycles/byte for a
19# 128-bit key [on a single-issue XScale PXA250 core].
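#
# [Editorial aside] The merged shift/rotate referred to above is what the
# round loop below builds on: one instruction such as
#
#	eor	s0,s0,i1,ror#24
#
# both rotates the freshly loaded table word (ror#24 == rotate left by 8)
# and XORs it into the accumulator, so each Te lookup costs a single ALU
# slot where compiler-generated code typically needs two instructions.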
20
21# May 2007.
22#
23# AES_set_[en|de]crypt_key is added.
24
25# July 2010.
26#
27# Rescheduling for the dual-issue pipeline resulted in a 12% improvement
28# on the Cortex A8 core and ~25 cycles per byte processed with a 128-bit key.
29
30# February 2011.
31#
32# Profiler-assisted and platform-specific optimization resulted in a 16%
33# improvement on the Cortex A8 core and ~21.5 cycles per byte.
34
35while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
36open STDOUT,">$output";
37
38$s0="r0";
39$s1="r1";
40$s2="r2";
41$s3="r3";
42$t1="r4";
43$t2="r5";
44$t3="r6";
45$i1="r7";
46$i2="r8";
47$i3="r9";
48
49$tbl="r10";
50$key="r11";
51$rounds="r12";
52
53$code=<<___;
54#include "arm_arch.h"
55.text
56.code 32
57
58.type AES_Te,%object
59.align 5
60AES_Te:
61.word 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
62.word 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
63.word 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
64.word 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
65.word 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
66.word 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
67.word 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
68.word 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
69.word 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
70.word 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
71.word 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
72.word 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
73.word 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
74.word 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
75.word 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
76.word 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
77.word 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
78.word 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
79.word 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
80.word 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
81.word 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
82.word 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
83.word 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
84.word 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
85.word 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
86.word 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
87.word 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
88.word 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
89.word 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
90.word 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
91.word 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
92.word 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
93.word 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
94.word 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
95.word 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
96.word 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
97.word 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
98.word 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
99.word 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
100.word 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
101.word 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
102.word 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
103.word 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
104.word 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
105.word 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
106.word 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
107.word 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
108.word 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
109.word 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
110.word 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
111.word 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
112.word 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
113.word 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
114.word 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
115.word 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
116.word 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
117.word 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
118.word 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
119.word 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
120.word 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
121.word 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
122.word 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
123.word 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
124.word 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
125@ Te4[256]
126.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
127.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
128.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
129.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
130.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
131.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
132.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
133.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
134.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
135.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
136.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
137.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
138.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
139.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
140.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
141.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
142.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
143.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
144.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
145.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
146.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
147.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
148.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
149.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
150.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
151.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
152.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
153.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
154.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
155.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
156.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
157.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
158@ rcon[]
159.word 0x01000000, 0x02000000, 0x04000000, 0x08000000
160.word 0x10000000, 0x20000000, 0x40000000, 0x80000000
161.word 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
162.size AES_Te,.-AES_Te
163
164@ void AES_encrypt(const unsigned char *in, unsigned char *out,
165@ const AES_KEY *key) {
166.global AES_encrypt
167.type AES_encrypt,%function
168.align 5
169AES_encrypt:
170 sub r3,pc,#8 @ AES_encrypt
171 stmdb sp!,{r1,r4-r12,lr}
172 mov $rounds,r0 @ inp
173 mov $key,r2
174 sub $tbl,r3,#AES_encrypt-AES_Te @ Te
175#if __ARM_ARCH__<7
176 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
177 ldrb $t1,[$rounds,#2] @ manner...
178 ldrb $t2,[$rounds,#1]
179 ldrb $t3,[$rounds,#0]
180 orr $s0,$s0,$t1,lsl#8
181 ldrb $s1,[$rounds,#7]
182 orr $s0,$s0,$t2,lsl#16
183 ldrb $t1,[$rounds,#6]
184 orr $s0,$s0,$t3,lsl#24
185 ldrb $t2,[$rounds,#5]
186 ldrb $t3,[$rounds,#4]
187 orr $s1,$s1,$t1,lsl#8
188 ldrb $s2,[$rounds,#11]
189 orr $s1,$s1,$t2,lsl#16
190 ldrb $t1,[$rounds,#10]
191 orr $s1,$s1,$t3,lsl#24
192 ldrb $t2,[$rounds,#9]
193 ldrb $t3,[$rounds,#8]
194 orr $s2,$s2,$t1,lsl#8
195 ldrb $s3,[$rounds,#15]
196 orr $s2,$s2,$t2,lsl#16
197 ldrb $t1,[$rounds,#14]
198 orr $s2,$s2,$t3,lsl#24
199 ldrb $t2,[$rounds,#13]
200 ldrb $t3,[$rounds,#12]
201 orr $s3,$s3,$t1,lsl#8
202 orr $s3,$s3,$t2,lsl#16
203 orr $s3,$s3,$t3,lsl#24
204#else
205 ldr $s0,[$rounds,#0]
206 ldr $s1,[$rounds,#4]
207 ldr $s2,[$rounds,#8]
208 ldr $s3,[$rounds,#12]
209#ifdef __ARMEL__
210 rev $s0,$s0
211 rev $s1,$s1
212 rev $s2,$s2
213 rev $s3,$s3
214#endif
215#endif
216 bl _armv4_AES_encrypt
217
218 ldr $rounds,[sp],#4 @ pop out
219#if __ARM_ARCH__>=7
220#ifdef __ARMEL__
221 rev $s0,$s0
222 rev $s1,$s1
223 rev $s2,$s2
224 rev $s3,$s3
225#endif
226 str $s0,[$rounds,#0]
227 str $s1,[$rounds,#4]
228 str $s2,[$rounds,#8]
229 str $s3,[$rounds,#12]
230#else
231 mov $t1,$s0,lsr#24 @ write output in endian-neutral
232 mov $t2,$s0,lsr#16 @ manner...
233 mov $t3,$s0,lsr#8
234 strb $t1,[$rounds,#0]
235 strb $t2,[$rounds,#1]
236 mov $t1,$s1,lsr#24
237 strb $t3,[$rounds,#2]
238 mov $t2,$s1,lsr#16
239 strb $s0,[$rounds,#3]
240 mov $t3,$s1,lsr#8
241 strb $t1,[$rounds,#4]
242 strb $t2,[$rounds,#5]
243 mov $t1,$s2,lsr#24
244 strb $t3,[$rounds,#6]
245 mov $t2,$s2,lsr#16
246 strb $s1,[$rounds,#7]
247 mov $t3,$s2,lsr#8
248 strb $t1,[$rounds,#8]
249 strb $t2,[$rounds,#9]
250 mov $t1,$s3,lsr#24
251 strb $t3,[$rounds,#10]
252 mov $t2,$s3,lsr#16
253 strb $s2,[$rounds,#11]
254 mov $t3,$s3,lsr#8
255 strb $t1,[$rounds,#12]
256 strb $t2,[$rounds,#13]
257 strb $t3,[$rounds,#14]
258 strb $s3,[$rounds,#15]
259#endif
260#if __ARM_ARCH__>=5
261 ldmia sp!,{r4-r12,pc}
262#else
263 ldmia sp!,{r4-r12,lr}
264 tst lr,#1
265 moveq pc,lr @ be binary compatible with V4, yet
266 bx lr @ interoperable with Thumb ISA:-)
267#endif
268.size AES_encrypt,.-AES_encrypt
269
270.type _armv4_AES_encrypt,%function
271.align 2
272_armv4_AES_encrypt:
273 str lr,[sp,#-4]! @ push lr
274 ldmia $key!,{$t1-$i1}
275 eor $s0,$s0,$t1
276 ldr $rounds,[$key,#240-16]
277 eor $s1,$s1,$t2
278 eor $s2,$s2,$t3
279 eor $s3,$s3,$i1
280 sub $rounds,$rounds,#1
281 mov lr,#255
282
283 and $i1,lr,$s0
284 and $i2,lr,$s0,lsr#8
285 and $i3,lr,$s0,lsr#16
286 mov $s0,$s0,lsr#24
287.Lenc_loop:
288 ldr $t1,[$tbl,$i1,lsl#2] @ Te3[s0>>0]
289 and $i1,lr,$s1,lsr#16 @ i0
290 ldr $t2,[$tbl,$i2,lsl#2] @ Te2[s0>>8]
291 and $i2,lr,$s1
292 ldr $t3,[$tbl,$i3,lsl#2] @ Te1[s0>>16]
293 and $i3,lr,$s1,lsr#8
294 ldr $s0,[$tbl,$s0,lsl#2] @ Te0[s0>>24]
295 mov $s1,$s1,lsr#24
296
297 ldr $i1,[$tbl,$i1,lsl#2] @ Te1[s1>>16]
298 ldr $i2,[$tbl,$i2,lsl#2] @ Te3[s1>>0]
299 ldr $i3,[$tbl,$i3,lsl#2] @ Te2[s1>>8]
300 eor $s0,$s0,$i1,ror#8
301 ldr $s1,[$tbl,$s1,lsl#2] @ Te0[s1>>24]
302 and $i1,lr,$s2,lsr#8 @ i0
303 eor $t2,$t2,$i2,ror#8
304 and $i2,lr,$s2,lsr#16 @ i1
305 eor $t3,$t3,$i3,ror#8
306 and $i3,lr,$s2
307 ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
308 eor $s1,$s1,$t1,ror#24
309 ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
310 mov $s2,$s2,lsr#24
311
312 ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
313 eor $s0,$s0,$i1,ror#16
314 ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
315 and $i1,lr,$s3 @ i0
316 eor $s1,$s1,$i2,ror#8
317 and $i2,lr,$s3,lsr#8 @ i1
318 eor $t3,$t3,$i3,ror#16
319 and $i3,lr,$s3,lsr#16 @ i2
320 ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
321 eor $s2,$s2,$t2,ror#16
322 ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
323 mov $s3,$s3,lsr#24
324
325 ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
326 eor $s0,$s0,$i1,ror#24
327 ldr $i1,[$key],#16
328 eor $s1,$s1,$i2,ror#16
329 ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
330 eor $s2,$s2,$i3,ror#8
331 ldr $t1,[$key,#-12]
332 eor $s3,$s3,$t3,ror#8
333
334 ldr $t2,[$key,#-8]
335 eor $s0,$s0,$i1
336 ldr $t3,[$key,#-4]
337 and $i1,lr,$s0
338 eor $s1,$s1,$t1
339 and $i2,lr,$s0,lsr#8
340 eor $s2,$s2,$t2
341 and $i3,lr,$s0,lsr#16
342 eor $s3,$s3,$t3
343 mov $s0,$s0,lsr#24
344
345 subs $rounds,$rounds,#1
346 bne .Lenc_loop
347
348 add $tbl,$tbl,#2
349
350 ldrb $t1,[$tbl,$i1,lsl#2] @ Te4[s0>>0]
351 and $i1,lr,$s1,lsr#16 @ i0
352 ldrb $t2,[$tbl,$i2,lsl#2] @ Te4[s0>>8]
353 and $i2,lr,$s1
354 ldrb $t3,[$tbl,$i3,lsl#2] @ Te4[s0>>16]
355 and $i3,lr,$s1,lsr#8
356 ldrb $s0,[$tbl,$s0,lsl#2] @ Te4[s0>>24]
357 mov $s1,$s1,lsr#24
358
359 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s1>>16]
360 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s1>>0]
361 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s1>>8]
362 eor $s0,$i1,$s0,lsl#8
363 ldrb $s1,[$tbl,$s1,lsl#2] @ Te4[s1>>24]
364 and $i1,lr,$s2,lsr#8 @ i0
365 eor $t2,$i2,$t2,lsl#8
366 and $i2,lr,$s2,lsr#16 @ i1
367 eor $t3,$i3,$t3,lsl#8
368 and $i3,lr,$s2
369 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
370 eor $s1,$t1,$s1,lsl#24
371 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
372 mov $s2,$s2,lsr#24
373
374 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
375 eor $s0,$i1,$s0,lsl#8
376 ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
377 and $i1,lr,$s3 @ i0
378 eor $s1,$s1,$i2,lsl#16
379 and $i2,lr,$s3,lsr#8 @ i1
380 eor $t3,$i3,$t3,lsl#8
381 and $i3,lr,$s3,lsr#16 @ i2
382 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
383 eor $s2,$t2,$s2,lsl#24
384 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
385 mov $s3,$s3,lsr#24
386
387 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
388 eor $s0,$i1,$s0,lsl#8
389 ldr $i1,[$key,#0]
390 ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
391 eor $s1,$s1,$i2,lsl#8
392 ldr $t1,[$key,#4]
393 eor $s2,$s2,$i3,lsl#16
394 ldr $t2,[$key,#8]
395 eor $s3,$t3,$s3,lsl#24
396 ldr $t3,[$key,#12]
397
398 eor $s0,$s0,$i1
399 eor $s1,$s1,$t1
400 eor $s2,$s2,$t2
401 eor $s3,$s3,$t3
402
403 sub $tbl,$tbl,#2
404 ldr pc,[sp],#4 @ pop and return
405.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
406
407.global private_AES_set_encrypt_key
408.type private_AES_set_encrypt_key,%function
409.align 5
410private_AES_set_encrypt_key:
411_armv4_AES_set_encrypt_key:
412 sub r3,pc,#8 @ AES_set_encrypt_key
413 teq r0,#0
414 moveq r0,#-1
415 beq .Labrt
416 teq r2,#0
417 moveq r0,#-1
418 beq .Labrt
419
420 teq r1,#128
421 beq .Lok
422 teq r1,#192
423 beq .Lok
424 teq r1,#256
425 movne r0,#-1
426 bne .Labrt
427
428.Lok: stmdb sp!,{r4-r12,lr}
429 sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
430
431 mov $rounds,r0 @ inp
432 mov lr,r1 @ bits
433 mov $key,r2 @ key
434
435#if __ARM_ARCH__<7
436 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
437 ldrb $t1,[$rounds,#2] @ manner...
438 ldrb $t2,[$rounds,#1]
439 ldrb $t3,[$rounds,#0]
440 orr $s0,$s0,$t1,lsl#8
441 ldrb $s1,[$rounds,#7]
442 orr $s0,$s0,$t2,lsl#16
443 ldrb $t1,[$rounds,#6]
444 orr $s0,$s0,$t3,lsl#24
445 ldrb $t2,[$rounds,#5]
446 ldrb $t3,[$rounds,#4]
447 orr $s1,$s1,$t1,lsl#8
448 ldrb $s2,[$rounds,#11]
449 orr $s1,$s1,$t2,lsl#16
450 ldrb $t1,[$rounds,#10]
451 orr $s1,$s1,$t3,lsl#24
452 ldrb $t2,[$rounds,#9]
453 ldrb $t3,[$rounds,#8]
454 orr $s2,$s2,$t1,lsl#8
455 ldrb $s3,[$rounds,#15]
456 orr $s2,$s2,$t2,lsl#16
457 ldrb $t1,[$rounds,#14]
458 orr $s2,$s2,$t3,lsl#24
459 ldrb $t2,[$rounds,#13]
460 ldrb $t3,[$rounds,#12]
461 orr $s3,$s3,$t1,lsl#8
462 str $s0,[$key],#16
463 orr $s3,$s3,$t2,lsl#16
464 str $s1,[$key,#-12]
465 orr $s3,$s3,$t3,lsl#24
466 str $s2,[$key,#-8]
467 str $s3,[$key,#-4]
468#else
469 ldr $s0,[$rounds,#0]
470 ldr $s1,[$rounds,#4]
471 ldr $s2,[$rounds,#8]
472 ldr $s3,[$rounds,#12]
473#ifdef __ARMEL__
474 rev $s0,$s0
475 rev $s1,$s1
476 rev $s2,$s2
477 rev $s3,$s3
478#endif
479 str $s0,[$key],#16
480 str $s1,[$key,#-12]
481 str $s2,[$key,#-8]
482 str $s3,[$key,#-4]
483#endif
484
485 teq lr,#128
486 bne .Lnot128
487 mov $rounds,#10
488 str $rounds,[$key,#240-16]
489 add $t3,$tbl,#256 @ rcon
490 mov lr,#255
491
492.L128_loop:
493 and $t2,lr,$s3,lsr#24
494 and $i1,lr,$s3,lsr#16
495 ldrb $t2,[$tbl,$t2]
496 and $i2,lr,$s3,lsr#8
497 ldrb $i1,[$tbl,$i1]
498 and $i3,lr,$s3
499 ldrb $i2,[$tbl,$i2]
500 orr $t2,$t2,$i1,lsl#24
501 ldrb $i3,[$tbl,$i3]
502 orr $t2,$t2,$i2,lsl#16
503 ldr $t1,[$t3],#4 @ rcon[i++]
504 orr $t2,$t2,$i3,lsl#8
505 eor $t2,$t2,$t1
506 eor $s0,$s0,$t2 @ rk[4]=rk[0]^...
507 eor $s1,$s1,$s0 @ rk[5]=rk[1]^rk[4]
508 str $s0,[$key],#16
509 eor $s2,$s2,$s1 @ rk[6]=rk[2]^rk[5]
510 str $s1,[$key,#-12]
511 eor $s3,$s3,$s2 @ rk[7]=rk[3]^rk[6]
512 str $s2,[$key,#-8]
513 subs $rounds,$rounds,#1
514 str $s3,[$key,#-4]
515 bne .L128_loop
516 sub r2,$key,#176
517 b .Ldone
518
519.Lnot128:
520#if __ARM_ARCH__<7
521 ldrb $i2,[$rounds,#19]
522 ldrb $t1,[$rounds,#18]
523 ldrb $t2,[$rounds,#17]
524 ldrb $t3,[$rounds,#16]
525 orr $i2,$i2,$t1,lsl#8
526 ldrb $i3,[$rounds,#23]
527 orr $i2,$i2,$t2,lsl#16
528 ldrb $t1,[$rounds,#22]
529 orr $i2,$i2,$t3,lsl#24
530 ldrb $t2,[$rounds,#21]
531 ldrb $t3,[$rounds,#20]
532 orr $i3,$i3,$t1,lsl#8
533 orr $i3,$i3,$t2,lsl#16
534 str $i2,[$key],#8
535 orr $i3,$i3,$t3,lsl#24
536 str $i3,[$key,#-4]
537#else
538 ldr $i2,[$rounds,#16]
539 ldr $i3,[$rounds,#20]
540#ifdef __ARMEL__
541 rev $i2,$i2
542 rev $i3,$i3
543#endif
544 str $i2,[$key],#8
545 str $i3,[$key,#-4]
546#endif
547
548 teq lr,#192
549 bne .Lnot192
550 mov $rounds,#12
551 str $rounds,[$key,#240-24]
552 add $t3,$tbl,#256 @ rcon
553 mov lr,#255
554 mov $rounds,#8
555
556.L192_loop:
557 and $t2,lr,$i3,lsr#24
558 and $i1,lr,$i3,lsr#16
559 ldrb $t2,[$tbl,$t2]
560 and $i2,lr,$i3,lsr#8
561 ldrb $i1,[$tbl,$i1]
562 and $i3,lr,$i3
563 ldrb $i2,[$tbl,$i2]
564 orr $t2,$t2,$i1,lsl#24
565 ldrb $i3,[$tbl,$i3]
566 orr $t2,$t2,$i2,lsl#16
567 ldr $t1,[$t3],#4 @ rcon[i++]
568 orr $t2,$t2,$i3,lsl#8
569 eor $i3,$t2,$t1
570 eor $s0,$s0,$i3 @ rk[6]=rk[0]^...
571 eor $s1,$s1,$s0 @ rk[7]=rk[1]^rk[6]
572 str $s0,[$key],#24
573 eor $s2,$s2,$s1 @ rk[8]=rk[2]^rk[7]
574 str $s1,[$key,#-20]
575 eor $s3,$s3,$s2 @ rk[9]=rk[3]^rk[8]
576 str $s2,[$key,#-16]
577 subs $rounds,$rounds,#1
578 str $s3,[$key,#-12]
579 subeq r2,$key,#216
580 beq .Ldone
581
582 ldr $i1,[$key,#-32]
583 ldr $i2,[$key,#-28]
584 eor $i1,$i1,$s3 @ rk[10]=rk[4]^rk[9]
585 eor $i3,$i2,$i1 @ rk[11]=rk[5]^rk[10]
586 str $i1,[$key,#-8]
587 str $i3,[$key,#-4]
588 b .L192_loop
589
590.Lnot192:
591#if __ARM_ARCH__<7
592 ldrb $i2,[$rounds,#27]
593 ldrb $t1,[$rounds,#26]
594 ldrb $t2,[$rounds,#25]
595 ldrb $t3,[$rounds,#24]
596 orr $i2,$i2,$t1,lsl#8
597 ldrb $i3,[$rounds,#31]
598 orr $i2,$i2,$t2,lsl#16
599 ldrb $t1,[$rounds,#30]
600 orr $i2,$i2,$t3,lsl#24
601 ldrb $t2,[$rounds,#29]
602 ldrb $t3,[$rounds,#28]
603 orr $i3,$i3,$t1,lsl#8
604 orr $i3,$i3,$t2,lsl#16
605 str $i2,[$key],#8
606 orr $i3,$i3,$t3,lsl#24
607 str $i3,[$key,#-4]
608#else
609 ldr $i2,[$rounds,#24]
610 ldr $i3,[$rounds,#28]
611#ifdef __ARMEL__
612 rev $i2,$i2
613 rev $i3,$i3
614#endif
615 str $i2,[$key],#8
616 str $i3,[$key,#-4]
617#endif
618
619 mov $rounds,#14
620 str $rounds,[$key,#240-32]
621 add $t3,$tbl,#256 @ rcon
622 mov lr,#255
623 mov $rounds,#7
624
625.L256_loop:
626 and $t2,lr,$i3,lsr#24
627 and $i1,lr,$i3,lsr#16
628 ldrb $t2,[$tbl,$t2]
629 and $i2,lr,$i3,lsr#8
630 ldrb $i1,[$tbl,$i1]
631 and $i3,lr,$i3
632 ldrb $i2,[$tbl,$i2]
633 orr $t2,$t2,$i1,lsl#24
634 ldrb $i3,[$tbl,$i3]
635 orr $t2,$t2,$i2,lsl#16
636 ldr $t1,[$t3],#4 @ rcon[i++]
637 orr $t2,$t2,$i3,lsl#8
638 eor $i3,$t2,$t1
639 eor $s0,$s0,$i3 @ rk[8]=rk[0]^...
640 eor $s1,$s1,$s0 @ rk[9]=rk[1]^rk[8]
641 str $s0,[$key],#32
642 eor $s2,$s2,$s1 @ rk[10]=rk[2]^rk[9]
643 str $s1,[$key,#-28]
644 eor $s3,$s3,$s2 @ rk[11]=rk[3]^rk[10]
645 str $s2,[$key,#-24]
646 subs $rounds,$rounds,#1
647 str $s3,[$key,#-20]
648 subeq r2,$key,#256
649 beq .Ldone
650
651 and $t2,lr,$s3
652 and $i1,lr,$s3,lsr#8
653 ldrb $t2,[$tbl,$t2]
654 and $i2,lr,$s3,lsr#16
655 ldrb $i1,[$tbl,$i1]
656 and $i3,lr,$s3,lsr#24
657 ldrb $i2,[$tbl,$i2]
658 orr $t2,$t2,$i1,lsl#8
659 ldrb $i3,[$tbl,$i3]
660 orr $t2,$t2,$i2,lsl#16
661 ldr $t1,[$key,#-48]
662 orr $t2,$t2,$i3,lsl#24
663
664 ldr $i1,[$key,#-44]
665 ldr $i2,[$key,#-40]
666 eor $t1,$t1,$t2 @ rk[12]=rk[4]^...
667 ldr $i3,[$key,#-36]
668 eor $i1,$i1,$t1 @ rk[13]=rk[5]^rk[12]
669 str $t1,[$key,#-16]
670 eor $i2,$i2,$i1 @ rk[14]=rk[6]^rk[13]
671 str $i1,[$key,#-12]
672 eor $i3,$i3,$i2 @ rk[15]=rk[7]^rk[14]
673 str $i2,[$key,#-8]
674 str $i3,[$key,#-4]
675 b .L256_loop
676
677.Ldone: mov r0,#0
678 ldmia sp!,{r4-r12,lr}
679.Labrt: tst lr,#1
680 moveq pc,lr @ be binary compatible with V4, yet
681 bx lr @ interoperable with Thumb ISA:-)
682.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
683
684.global private_AES_set_decrypt_key
685.type private_AES_set_decrypt_key,%function
686.align 5
687private_AES_set_decrypt_key:
688 str lr,[sp,#-4]! @ push lr
689 bl _armv4_AES_set_encrypt_key
690 teq r0,#0
691 ldrne lr,[sp],#4 @ pop lr
692 bne .Labrt
693
694 stmdb sp!,{r4-r12}
695
696 ldr $rounds,[r2,#240] @ AES_set_encrypt_key preserves r2,
697 mov $key,r2 @ which is AES_KEY *key
698 mov $i1,r2
699 add $i2,r2,$rounds,lsl#4
700
701.Linv: ldr $s0,[$i1]
702 ldr $s1,[$i1,#4]
703 ldr $s2,[$i1,#8]
704 ldr $s3,[$i1,#12]
705 ldr $t1,[$i2]
706 ldr $t2,[$i2,#4]
707 ldr $t3,[$i2,#8]
708 ldr $i3,[$i2,#12]
709 str $s0,[$i2],#-16
710 str $s1,[$i2,#16+4]
711 str $s2,[$i2,#16+8]
712 str $s3,[$i2,#16+12]
713 str $t1,[$i1],#16
714 str $t2,[$i1,#-12]
715 str $t3,[$i1,#-8]
716 str $i3,[$i1,#-4]
717 teq $i1,$i2
718 bne .Linv
719___
720$mask80=$i1;
721$mask1b=$i2;
722$mask7f=$i3;
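#
# [Editorial sketch] The three masks drive a packed xtime (doubling in
# GF(2^8)) on four bytes at once, which .Lmix below applies three times to
# obtain tp2/tp4/tp8. A hedged Perl equivalent of one doubling step:
#
#	sub _xtime_packed {
#		my $x = shift;
#		my $hi = $x & 0x80808080;		# mask80: high bit of each byte
#		((($x & 0x7f7f7f7f) << 1)		# mask7f: shift the low 7 bits
#		 ^ (($hi - ($hi >> 7)) & 0x1b1b1b1b))	# mask1b: reduce mod x^8+x^4+x^3+x+1
#		& 0xffffffff;
#	}
#
# The sub/and pair turns each 0x80 byte into 0x1b, exactly as the
# "sub $t1,$t1,$t1,lsr#7; and $t1,$t1,$mask1b" sequence below does.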
723$code.=<<___;
724 ldr $s0,[$key,#16]! @ prefetch tp1
725 mov $mask80,#0x80
726 mov $mask1b,#0x1b
727 orr $mask80,$mask80,#0x8000
728 orr $mask1b,$mask1b,#0x1b00
729 orr $mask80,$mask80,$mask80,lsl#16
730 orr $mask1b,$mask1b,$mask1b,lsl#16
731 sub $rounds,$rounds,#1
732 mvn $mask7f,$mask80
733 mov $rounds,$rounds,lsl#2 @ (rounds-1)*4
734
735.Lmix: and $t1,$s0,$mask80
736 and $s1,$s0,$mask7f
737 sub $t1,$t1,$t1,lsr#7
738 and $t1,$t1,$mask1b
739 eor $s1,$t1,$s1,lsl#1 @ tp2
740
741 and $t1,$s1,$mask80
742 and $s2,$s1,$mask7f
743 sub $t1,$t1,$t1,lsr#7
744 and $t1,$t1,$mask1b
745 eor $s2,$t1,$s2,lsl#1 @ tp4
746
747 and $t1,$s2,$mask80
748 and $s3,$s2,$mask7f
749 sub $t1,$t1,$t1,lsr#7
750 and $t1,$t1,$mask1b
751 eor $s3,$t1,$s3,lsl#1 @ tp8
752
753 eor $t1,$s1,$s2
754 eor $t2,$s0,$s3 @ tp9
755 eor $t1,$t1,$s3 @ tpe
756 eor $t1,$t1,$s1,ror#24
757 eor $t1,$t1,$t2,ror#24 @ ^= ROTATE(tpb=tp9^tp2,8)
758 eor $t1,$t1,$s2,ror#16
759 eor $t1,$t1,$t2,ror#16 @ ^= ROTATE(tpd=tp9^tp4,16)
760 eor $t1,$t1,$t2,ror#8 @ ^= ROTATE(tp9,24)
761
762 ldr $s0,[$key,#4] @ prefetch tp1
763 str $t1,[$key],#4
764 subs $rounds,$rounds,#1
765 bne .Lmix
766
767 mov r0,#0
768#if __ARM_ARCH__>=5
769 ldmia sp!,{r4-r12,pc}
770#else
771 ldmia sp!,{r4-r12,lr}
772 tst lr,#1
773 moveq pc,lr @ be binary compatible with V4, yet
774 bx lr @ interoperable with Thumb ISA:-)
775#endif
776.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
777
778.type AES_Td,%object
779.align 5
780AES_Td:
781.word 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
782.word 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
783.word 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
784.word 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
785.word 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
786.word 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
787.word 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
788.word 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
789.word 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
790.word 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
791.word 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
792.word 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
793.word 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
794.word 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
795.word 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
796.word 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
797.word 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
798.word 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
799.word 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
800.word 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
801.word 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
802.word 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
803.word 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
804.word 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
805.word 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
806.word 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
807.word 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
808.word 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
809.word 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
810.word 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
811.word 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
812.word 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
813.word 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
814.word 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
815.word 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
816.word 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
817.word 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
818.word 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
819.word 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
820.word 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
821.word 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
822.word 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
823.word 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
824.word 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
825.word 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
826.word 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
827.word 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
828.word 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
829.word 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
830.word 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
831.word 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
832.word 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
833.word 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
834.word 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
835.word 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
836.word 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
837.word 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
838.word 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
839.word 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
840.word 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
841.word 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
842.word 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
843.word 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
844.word 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
845@ Td4[256]
846.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
847.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
848.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
849.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
850.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
851.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
852.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
853.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
854.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
855.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
856.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
857.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
858.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
859.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
860.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
861.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
862.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
863.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
864.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
865.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
866.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
867.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
868.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
869.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
870.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
871.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
872.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
873.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
874.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
875.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
876.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
877.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
878.size AES_Td,.-AES_Td
879
880@ void AES_decrypt(const unsigned char *in, unsigned char *out,
881@ const AES_KEY *key) {
882.global AES_decrypt
883.type AES_decrypt,%function
884.align 5
885AES_decrypt:
886 sub r3,pc,#8 @ AES_decrypt
887 stmdb sp!,{r1,r4-r12,lr}
888 mov $rounds,r0 @ inp
889 mov $key,r2
890 sub $tbl,r3,#AES_decrypt-AES_Td @ Td
891#if __ARM_ARCH__<7
892 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
893 ldrb $t1,[$rounds,#2] @ manner...
894 ldrb $t2,[$rounds,#1]
895 ldrb $t3,[$rounds,#0]
896 orr $s0,$s0,$t1,lsl#8
897 ldrb $s1,[$rounds,#7]
898 orr $s0,$s0,$t2,lsl#16
899 ldrb $t1,[$rounds,#6]
900 orr $s0,$s0,$t3,lsl#24
901 ldrb $t2,[$rounds,#5]
902 ldrb $t3,[$rounds,#4]
903 orr $s1,$s1,$t1,lsl#8
904 ldrb $s2,[$rounds,#11]
905 orr $s1,$s1,$t2,lsl#16
906 ldrb $t1,[$rounds,#10]
907 orr $s1,$s1,$t3,lsl#24
908 ldrb $t2,[$rounds,#9]
909 ldrb $t3,[$rounds,#8]
910 orr $s2,$s2,$t1,lsl#8
911 ldrb $s3,[$rounds,#15]
912 orr $s2,$s2,$t2,lsl#16
913 ldrb $t1,[$rounds,#14]
914 orr $s2,$s2,$t3,lsl#24
915 ldrb $t2,[$rounds,#13]
916 ldrb $t3,[$rounds,#12]
917 orr $s3,$s3,$t1,lsl#8
918 orr $s3,$s3,$t2,lsl#16
919 orr $s3,$s3,$t3,lsl#24
920#else
921 ldr $s0,[$rounds,#0]
922 ldr $s1,[$rounds,#4]
923 ldr $s2,[$rounds,#8]
924 ldr $s3,[$rounds,#12]
925#ifdef __ARMEL__
926 rev $s0,$s0
927 rev $s1,$s1
928 rev $s2,$s2
929 rev $s3,$s3
930#endif
931#endif
932 bl _armv4_AES_decrypt
933
934 ldr $rounds,[sp],#4 @ pop out
935#if __ARM_ARCH__>=7
936#ifdef __ARMEL__
937 rev $s0,$s0
938 rev $s1,$s1
939 rev $s2,$s2
940 rev $s3,$s3
941#endif
942 str $s0,[$rounds,#0]
943 str $s1,[$rounds,#4]
944 str $s2,[$rounds,#8]
945 str $s3,[$rounds,#12]
946#else
947 mov $t1,$s0,lsr#24 @ write output in endian-neutral
948 mov $t2,$s0,lsr#16 @ manner...
949 mov $t3,$s0,lsr#8
950 strb $t1,[$rounds,#0]
951 strb $t2,[$rounds,#1]
952 mov $t1,$s1,lsr#24
953 strb $t3,[$rounds,#2]
954 mov $t2,$s1,lsr#16
955 strb $s0,[$rounds,#3]
956 mov $t3,$s1,lsr#8
957 strb $t1,[$rounds,#4]
958 strb $t2,[$rounds,#5]
959 mov $t1,$s2,lsr#24
960 strb $t3,[$rounds,#6]
961 mov $t2,$s2,lsr#16
962 strb $s1,[$rounds,#7]
963 mov $t3,$s2,lsr#8
964 strb $t1,[$rounds,#8]
965 strb $t2,[$rounds,#9]
966 mov $t1,$s3,lsr#24
967 strb $t3,[$rounds,#10]
968 mov $t2,$s3,lsr#16
969 strb $s2,[$rounds,#11]
970 mov $t3,$s3,lsr#8
971 strb $t1,[$rounds,#12]
972 strb $t2,[$rounds,#13]
973 strb $t3,[$rounds,#14]
974 strb $s3,[$rounds,#15]
975#endif
976#if __ARM_ARCH__>=5
977 ldmia sp!,{r4-r12,pc}
978#else
979 ldmia sp!,{r4-r12,lr}
980 tst lr,#1
981 moveq pc,lr @ be binary compatible with V4, yet
982 bx lr @ interoperable with Thumb ISA:-)
983#endif
984.size AES_decrypt,.-AES_decrypt
985
986.type _armv4_AES_decrypt,%function
987.align 2
988_armv4_AES_decrypt:
989 str lr,[sp,#-4]! @ push lr
990 ldmia $key!,{$t1-$i1}
991 eor $s0,$s0,$t1
992 ldr $rounds,[$key,#240-16]
993 eor $s1,$s1,$t2
994 eor $s2,$s2,$t3
995 eor $s3,$s3,$i1
996 sub $rounds,$rounds,#1
997 mov lr,#255
998
999 and $i1,lr,$s0,lsr#16
1000 and $i2,lr,$s0,lsr#8
1001 and $i3,lr,$s0
1002 mov $s0,$s0,lsr#24
1003.Ldec_loop:
1004 ldr $t1,[$tbl,$i1,lsl#2] @ Td1[s0>>16]
1005 and $i1,lr,$s1 @ i0
1006 ldr $t2,[$tbl,$i2,lsl#2] @ Td2[s0>>8]
1007 and $i2,lr,$s1,lsr#16
1008 ldr $t3,[$tbl,$i3,lsl#2] @ Td3[s0>>0]
1009 and $i3,lr,$s1,lsr#8
1010 ldr $s0,[$tbl,$s0,lsl#2] @ Td0[s0>>24]
1011 mov $s1,$s1,lsr#24
1012
1013 ldr $i1,[$tbl,$i1,lsl#2] @ Td3[s1>>0]
1014 ldr $i2,[$tbl,$i2,lsl#2] @ Td1[s1>>16]
1015 ldr $i3,[$tbl,$i3,lsl#2] @ Td2[s1>>8]
1016 eor $s0,$s0,$i1,ror#24
1017 ldr $s1,[$tbl,$s1,lsl#2] @ Td0[s1>>24]
1018 and $i1,lr,$s2,lsr#8 @ i0
1019 eor $t2,$i2,$t2,ror#8
1020 and $i2,lr,$s2 @ i1
1021 eor $t3,$i3,$t3,ror#8
1022 and $i3,lr,$s2,lsr#16
1023 ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
1024 eor $s1,$s1,$t1,ror#8
1025 ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
1026 mov $s2,$s2,lsr#24
1027
1028 ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
1029 eor $s0,$s0,$i1,ror#16
1030 ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
1031 and $i1,lr,$s3,lsr#16 @ i0
1032 eor $s1,$s1,$i2,ror#24
1033 and $i2,lr,$s3,lsr#8 @ i1
1034 eor $t3,$i3,$t3,ror#8
1035 and $i3,lr,$s3 @ i2
1036 ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
1037 eor $s2,$s2,$t2,ror#8
1038 ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
1039 mov $s3,$s3,lsr#24
1040
1041 ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
1042 eor $s0,$s0,$i1,ror#8
1043 ldr $i1,[$key],#16
1044 eor $s1,$s1,$i2,ror#16
1045 ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
1046 eor $s2,$s2,$i3,ror#24
1047
1048 ldr $t1,[$key,#-12]
1049 eor $s0,$s0,$i1
1050 ldr $t2,[$key,#-8]
1051 eor $s3,$s3,$t3,ror#8
1052 ldr $t3,[$key,#-4]
1053 and $i1,lr,$s0,lsr#16
1054 eor $s1,$s1,$t1
1055 and $i2,lr,$s0,lsr#8
1056 eor $s2,$s2,$t2
1057 and $i3,lr,$s0
1058 eor $s3,$s3,$t3
1059 mov $s0,$s0,lsr#24
1060
1061 subs $rounds,$rounds,#1
1062 bne .Ldec_loop
1063
1064 add $tbl,$tbl,#1024
1065
1066 ldr $t2,[$tbl,#0] @ prefetch Td4
1067 ldr $t3,[$tbl,#32]
1068 ldr $t1,[$tbl,#64]
1069 ldr $t2,[$tbl,#96]
1070 ldr $t3,[$tbl,#128]
1071 ldr $t1,[$tbl,#160]
1072 ldr $t2,[$tbl,#192]
1073 ldr $t3,[$tbl,#224]
1074
1075 ldrb $s0,[$tbl,$s0] @ Td4[s0>>24]
1076 ldrb $t1,[$tbl,$i1] @ Td4[s0>>16]
1077 and $i1,lr,$s1 @ i0
1078 ldrb $t2,[$tbl,$i2] @ Td4[s0>>8]
1079 and $i2,lr,$s1,lsr#16
1080 ldrb $t3,[$tbl,$i3] @ Td4[s0>>0]
1081 and $i3,lr,$s1,lsr#8
1082
1083 ldrb $i1,[$tbl,$i1] @ Td4[s1>>0]
1084 ldrb $s1,[$tbl,$s1,lsr#24] @ Td4[s1>>24]
1085 ldrb $i2,[$tbl,$i2] @ Td4[s1>>16]
1086 eor $s0,$i1,$s0,lsl#24
1087 ldrb $i3,[$tbl,$i3] @ Td4[s1>>8]
1088 eor $s1,$t1,$s1,lsl#8
1089 and $i1,lr,$s2,lsr#8 @ i0
1090 eor $t2,$t2,$i2,lsl#8
1091 and $i2,lr,$s2 @ i1
1092 ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
1093 eor $t3,$t3,$i3,lsl#8
1094 ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
1095 and $i3,lr,$s2,lsr#16
1096
1097 ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
1098 eor $s0,$s0,$i1,lsl#8
1099 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
1100 eor $s1,$i2,$s1,lsl#16
1101 and $i1,lr,$s3,lsr#16 @ i0
1102 eor $s2,$t2,$s2,lsl#16
1103 and $i2,lr,$s3,lsr#8 @ i1
1104 ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
1105 eor $t3,$t3,$i3,lsl#16
1106 ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
1107 and $i3,lr,$s3 @ i2
1108
1109 ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
1110 ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
1111 eor $s0,$s0,$i1,lsl#16
1112 ldr $i1,[$key,#0]
1113 eor $s1,$s1,$i2,lsl#8
1114 ldr $t1,[$key,#4]
1115 eor $s2,$i3,$s2,lsl#8
1116 ldr $t2,[$key,#8]
1117 eor $s3,$t3,$s3,lsl#24
1118 ldr $t3,[$key,#12]
1119
1120 eor $s0,$s0,$i1
1121 eor $s1,$s1,$t1
1122 eor $s2,$s2,$t2
1123 eor $s3,$s3,$t3
1124
1125 sub $tbl,$tbl,#1024
1126 ldr pc,[sp],#4 @ pop and return
1127.size _armv4_AES_decrypt,.-_armv4_AES_decrypt
1128.asciz "AES for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
1129.align 2
1130___
1131
1132$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
1133print $code;
1134close STDOUT; # enforce flush
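[Editorial note] Each pass of the .L128_loop/.L192_loop/.L256_loop code
above performs one step of the standard AES key schedule,
rk[i] = rk[i-Nk] ^ SubWord(RotWord(rk[i-1])) ^ rcon (plus the extra
SubWord-only step in the 256-bit case, the second half of .L256_loop).
A hedged Perl sketch of the 128-bit step, where sbox() is a hypothetical
helper returning the S-box value of one byte:

	sub expand128_step {
		my ($rk, $rcon) = @_;	# $rk: ref to the last four schedule words
		my $t = $rk->[3];
		my $u = (sbox(($t >> 16) & 0xff) << 24) |	# SubWord(RotWord($t)),
			(sbox(($t >>  8) & 0xff) << 16) |	# assembled just as the
			(sbox( $t        & 0xff) <<  8) |	# ldrb/orr sequence does
			 sbox(($t >> 24) & 0xff);
		$rk->[0] ^= $u ^ $rcon;		# rk[4] = rk[0]^...
		$rk->[1] ^= $rk->[0];		# rk[5] = rk[1]^rk[4]
		$rk->[2] ^= $rk->[1];		# rk[6] = rk[2]^rk[5]
		$rk->[3] ^= $rk->[2];		# rk[7] = rk[3]^rk[6]
	}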
diff --git a/src/lib/libcrypto/aes/asm/aes-ia64.S b/src/lib/libcrypto/aes/asm/aes-ia64.S
new file mode 100644
index 0000000000..7f6c4c3662
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-ia64.S
@@ -0,0 +1,1123 @@
1// ====================================================================
2// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
3// project. Rights for redistribution and usage in source and binary
4// forms are granted according to the OpenSSL license.
5// ====================================================================
6//
7// What's wrong with compiler-generated code? The compiler never uses
8// the variable-count 'shr', which is pairable with the 'extr'/'dep'
9// instructions. It uses 'zxt' instead, which is an I-type, but can be
10// replaced with 'and', which in turn can be assigned to an M-port [there
11// are twice as many M-ports as I-ports on Itanium 2]. By sacrificing a
12// few registers for small constants (255, 24 and 16) to be used with
13// 'shr' and 'and' instructions I can achieve better ILP, Instruction
14// Level Parallelism, and performance. This code outperforms GCC 3.3
15// generated code by over a factor of 2 (two), GCC 3.4 by 70% and HP C
16// by 40%. In the measured best-case scenario, i.e. aligned big-endian
17// input, ECB timing on Itanium 2 is (18 + 13*rounds) ticks per block,
18// or 9.25 CPU cycles per byte for a 128-bit key.
19
20// Version 1.2 mitigates the hazard of cache-timing attacks by
21// a) compressing the S-boxes from 8KB to 2KB+256B, b) scheduling
22// references to the S-boxes for L2 cache latency, and c) prefetching
23// T[ed]4 prior to the last round. As a result, performance dropped to
24// (26 + 15*rounds) ticks per block, or 11 cycles per byte processed
25// with a 128-bit key. This is a ~16% deterioration. For reference, the
26// Itanium 2 L1 cache has a 64-byte line size and the L2 a 128-byte one...
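//
// [Editorial illustration, hedged] The compression works because each
// 32-bit table entry is stored twice in 8 consecutive bytes (note the
// doubled data4 pairs in AES_Te below). A 4-byte load at byte offset k
// into an entry (k being one of the TE0..TE3 offsets defined below)
// therefore yields that entry rotated by 8*k bits, so one 2KB table
// serves all four rotated views that separate Te0..Te3 tables would
// otherwise provide; the trailing 256 one-byte entries (the "+256B")
// play the role of Te4/Td4 in the last round.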
27
28.ident "aes-ia64.S, version 1.2"
29.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
30.explicit
31.text
32
33rk0=r8; rk1=r9;
34
35pfssave=r2;
36lcsave=r10;
37prsave=r3;
38maskff=r11;
39twenty4=r14;
40sixteen=r15;
41
42te00=r16; te11=r17; te22=r18; te33=r19;
43te01=r20; te12=r21; te23=r22; te30=r23;
44te02=r24; te13=r25; te20=r26; te31=r27;
45te03=r28; te10=r29; te21=r30; te32=r31;
46
47// these are rotating...
48t0=r32; s0=r33;
49t1=r34; s1=r35;
50t2=r36; s2=r37;
51t3=r38; s3=r39;
52
53te0=r40; te1=r41; te2=r42; te3=r43;
54
55#if defined(_HPUX_SOURCE) && !defined(_LP64)
56# define ADDP addp4
57#else
58# define ADDP add
59#endif
60
61// Offsets from Te0
62#define TE0 0
63#define TE2 2
64#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
65#define TE1 3
66#define TE3 1
67#else
68#define TE1 1
69#define TE3 3
70#endif
71
72// This implies that AES_KEY comprises 32-bit key schedule elements
73// even on LP64 platforms.
74#ifndef KSZ
75# define KSZ 4
76# define LDKEY ld4
77#endif
78
79.proc _ia64_AES_encrypt#
80// Input: rk0-rk1
81// te0
82// te3 as AES_KEY->rounds!!!
83// s0-s3
84// maskff,twenty4,sixteen
85// Output: r16,r20,r24,r28 as s0-s3
86// Clobber: r16-r31,rk0-rk1,r32-r43
87.align 32
88_ia64_AES_encrypt:
89 .prologue
90 .altrp b6
91 .body
92{ .mmi; alloc r16=ar.pfs,12,0,0,8
93 LDKEY t0=[rk0],2*KSZ
94 mov pr.rot=1<<16 }
95{ .mmi; LDKEY t1=[rk1],2*KSZ
96 add te1=TE1,te0
97 add te3=-3,te3 };;
98{ .mib; LDKEY t2=[rk0],2*KSZ
99 mov ar.ec=2 }
100{ .mib; LDKEY t3=[rk1],2*KSZ
101 add te2=TE2,te0
102 brp.loop.imp .Le_top,.Le_end-16 };;
103
104{ .mmi; xor s0=s0,t0
105 xor s1=s1,t1
106 mov ar.lc=te3 }
107{ .mmi; xor s2=s2,t2
108 xor s3=s3,t3
109 add te3=TE3,te0 };;
110
111.align 32
112.Le_top:
113{ .mmi; (p0) LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
114 (p0) and te33=s3,maskff // 0/0:s3&0xff
115 (p0) extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
116{ .mmi; (p0) LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
117 (p0) and te30=s0,maskff // 0/1:s0&0xff
118 (p0) shr.u te00=s0,twenty4 };; // 0/0:s0>>24
119{ .mmi; (p0) LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
120 (p0) shladd te33=te33,3,te3 // 1/0:te0+s0>>24
121 (p0) extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
122{ .mmi; (p0) LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
123 (p0) shladd te30=te30,3,te3 // 1/1:te3+s0
124 (p0) shr.u te01=s1,twenty4 };; // 1/1:s1>>24
125{ .mmi; (p0) ld4 te33=[te33] // 2/0:te3[s3&0xff]
126 (p0) shladd te22=te22,3,te2 // 2/0:te2+s2>>8&0xff
127 (p0) extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
128{ .mmi; (p0) ld4 te30=[te30] // 2/1:te3[s0]
129 (p0) shladd te23=te23,3,te2 // 2/1:te2+s3>>8
130 (p0) shr.u te02=s2,twenty4 };; // 2/2:s2>>24
131{ .mmi; (p0) ld4 te22=[te22] // 3/0:te2[s2>>8]
132 (p0) shladd te20=te20,3,te2 // 3/2:te2+s0>>8
133 (p0) extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
134{ .mmi; (p0) ld4 te23=[te23] // 3/1:te2[s3>>8]
135 (p0) shladd te00=te00,3,te0 // 3/0:te0+s0>>24
136 (p0) shr.u te03=s3,twenty4 };; // 3/3:s3>>24
137{ .mmi; (p0) ld4 te20=[te20] // 4/2:te2[s0>>8]
138 (p0) shladd te21=te21,3,te2 // 4/3:te3+s2
139 (p0) extr.u te11=s1,16,8 } // 4/0:s1>>16&0xff
140{ .mmi; (p0) ld4 te00=[te00] // 4/0:te0[s0>>24]
141 (p0) shladd te01=te01,3,te0 // 4/1:te0+s1>>24
142 (p0) shr.u te13=s3,sixteen };; // 4/2:s3>>16
143{ .mmi; (p0) ld4 te21=[te21] // 5/3:te2[s1>>8]
144 (p0) shladd te11=te11,3,te1 // 5/0:te1+s1>>16
145 (p0) extr.u te12=s2,16,8 } // 5/1:s2>>16&0xff
146{ .mmi; (p0) ld4 te01=[te01] // 5/1:te0[s1>>24]
147 (p0) shladd te02=te02,3,te0 // 5/2:te0+s2>>24
148 (p0) and te31=s1,maskff };; // 5/2:s1&0xff
149{ .mmi; (p0) ld4 te11=[te11] // 6/0:te1[s1>>16]
150 (p0) shladd te12=te12,3,te1 // 6/1:te1+s2>>16
151 (p0) extr.u te10=s0,16,8 } // 6/3:s0>>16&0xff
152{ .mmi; (p0) ld4 te02=[te02] // 6/2:te0[s2>>24]
153 (p0) shladd te03=te03,3,te0 // 6/3:te1+s0>>16
154 (p0) and te32=s2,maskff };; // 6/3:s2&0xff
155
156{ .mmi; (p0) ld4 te12=[te12] // 7/1:te1[s2>>16]
157 (p0) shladd te31=te31,3,te3 // 7/2:te3+s1&0xff
158 (p0) and te13=te13,maskff} // 7/2:s3>>16&0xff
159{ .mmi; (p0) ld4 te03=[te03] // 7/3:te0[s3>>24]
160 (p0) shladd te32=te32,3,te3 // 7/3:te3+s2
161 (p0) xor t0=t0,te33 };; // 7/0:
162{ .mmi; (p0) ld4 te31=[te31] // 8/2:te3[s1]
163 (p0) shladd te13=te13,3,te1 // 8/2:te1+s3>>16
164 (p0) xor t0=t0,te22 } // 8/0:
165{ .mmi; (p0) ld4 te32=[te32] // 8/3:te3[s2]
166 (p0) shladd te10=te10,3,te1 // 8/3:te1+s0>>16
167 (p0) xor t1=t1,te30 };; // 8/1:
168{ .mmi; (p0) ld4 te13=[te13] // 9/2:te1[s3>>16]
169 (p0) ld4 te10=[te10] // 9/3:te1[s0>>16]
170 (p0) xor t0=t0,te00 };; // 9/0: !L2 scheduling
171{ .mmi; (p0) xor t1=t1,te23 // 10[9]/1:
172 (p0) xor t2=t2,te20 // 10[9]/2:
173 (p0) xor t3=t3,te21 };; // 10[9]/3:
174{ .mmi; (p0) xor t0=t0,te11 // 11[10]/0:done!
175 (p0) xor t1=t1,te01 // 11[10]/1:
176 (p0) xor t2=t2,te02 };; // 11[10]/2: !L2 scheduling
177{ .mmi; (p0) xor t3=t3,te03 // 12[10]/3:
178 (p16) cmp.eq p0,p17=r0,r0 };; // 12[10]/clear (p17)
179{ .mmi; (p0) xor t1=t1,te12 // 13[11]/1:done!
180 (p0) xor t2=t2,te31 // 13[11]/2:
181 (p0) xor t3=t3,te32 } // 13[11]/3:
182{ .mmi; (p17) add te0=2048,te0 // 13[11]/
183 (p17) add te1=2048+64-TE1,te1};; // 13[11]/
184{ .mib; (p0) xor t2=t2,te13 // 14[12]/2:done!
185 (p17) add te2=2048+128-TE2,te2} // 14[12]/
186{ .mib; (p0) xor t3=t3,te10 // 14[12]/3:done!
187 (p17) add te3=2048+192-TE3,te3 // 14[12]/
188 br.ctop.sptk .Le_top };;
189.Le_end:
190
191
192{ .mmi; ld8 te12=[te0] // prefetch Te4
193 ld8 te31=[te1] }
194{ .mmi; ld8 te10=[te2]
195 ld8 te32=[te3] }
196
197{ .mmi; LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
198 and te33=s3,maskff // 0/0:s3&0xff
199 extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
200{ .mmi; LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
201 and te30=s0,maskff // 0/1:s0&0xff
202 shr.u te00=s0,twenty4 };; // 0/0:s0>>24
203{ .mmi; LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
204 add te33=te33,te0 // 1/0:te0+s0>>24
205 extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
206{ .mmi; LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
207 add te30=te30,te0 // 1/1:te0+s0
208 shr.u te01=s1,twenty4 };; // 1/1:s1>>24
209{ .mmi; ld1 te33=[te33] // 2/0:te0[s3&0xff]
210 add te22=te22,te0 // 2/0:te0+s2>>8&0xff
211 extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
212{ .mmi; ld1 te30=[te30] // 2/1:te0[s0]
213 add te23=te23,te0 // 2/1:te0+s3>>8
214 shr.u te02=s2,twenty4 };; // 2/2:s2>>24
215{ .mmi; ld1 te22=[te22] // 3/0:te0[s2>>8]
216 add te20=te20,te0 // 3/2:te0+s0>>8
217 extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
218{ .mmi; ld1 te23=[te23] // 3/1:te0[s3>>8]
219 add te00=te00,te0 // 3/0:te0+s0>>24
220 shr.u te03=s3,twenty4 };; // 3/3:s3>>24
221{ .mmi; ld1 te20=[te20] // 4/2:te0[s0>>8]
222 add te21=te21,te0 // 4/3:te0+s2
223 extr.u te11=s1,16,8 } // 4/0:s1>>16&0xff
224{ .mmi; ld1 te00=[te00] // 4/0:te0[s0>>24]
225 add te01=te01,te0 // 4/1:te0+s1>>24
226 shr.u te13=s3,sixteen };; // 4/2:s3>>16
227{ .mmi; ld1 te21=[te21] // 5/3:te0[s1>>8]
228 add te11=te11,te0 // 5/0:te0+s1>>16
229 extr.u te12=s2,16,8 } // 5/1:s2>>16&0xff
230{ .mmi; ld1 te01=[te01] // 5/1:te0[s1>>24]
231 add te02=te02,te0 // 5/2:te0+s2>>24
232 and te31=s1,maskff };; // 5/2:s1&0xff
233{ .mmi; ld1 te11=[te11] // 6/0:te0[s1>>16]
234 add te12=te12,te0 // 6/1:te0+s2>>16
235 extr.u te10=s0,16,8 } // 6/3:s0>>16&0xff
236{ .mmi; ld1 te02=[te02] // 6/2:te0[s2>>24]
237 add te03=te03,te0 // 6/3:te0+s0>>16
238 and te32=s2,maskff };; // 6/3:s2&0xff
239
240{ .mmi; ld1 te12=[te12] // 7/1:te0[s2>>16]
241 add te31=te31,te0 // 7/2:te0+s1&0xff
242 dep te33=te22,te33,8,8} // 7/0:
243{ .mmi; ld1 te03=[te03] // 7/3:te0[s3>>24]
244 add te32=te32,te0 // 7/3:te0+s2
245 and te13=te13,maskff};; // 7/2:s3>>16&0xff
246{ .mmi; ld1 te31=[te31] // 8/2:te0[s1]
247 add te13=te13,te0 // 8/2:te0+s3>>16
248 dep te30=te23,te30,8,8} // 8/1:
249{ .mmi; ld1 te32=[te32] // 8/3:te0[s2]
250 add te10=te10,te0 // 8/3:te0+s0>>16
251 shl te00=te00,twenty4};; // 8/0:
252{ .mii; ld1 te13=[te13] // 9/2:te0[s3>>16]
253 dep te33=te11,te33,16,8 // 9/0:
254 shl te01=te01,twenty4};; // 9/1:
255{ .mii; ld1 te10=[te10] // 10/3:te0[s0>>16]
256 dep te31=te20,te31,8,8 // 10/2:
257 shl te02=te02,twenty4};; // 10/2:
258{ .mii; xor t0=t0,te33 // 11/0:
259 dep te32=te21,te32,8,8 // 11/3:
260 shl te12=te12,sixteen};; // 11/1:
261{ .mii; xor r16=t0,te00 // 12/0:done!
262 dep te31=te13,te31,16,8 // 12/2:
263 shl te03=te03,twenty4};; // 12/3:
264{ .mmi; xor t1=t1,te01 // 13/1:
265 xor t2=t2,te02 // 13/2:
266 dep te32=te10,te32,16,8};; // 13/3:
267{ .mmi; xor t1=t1,te30 // 14/1:
268 xor r24=t2,te31 // 14/2:done!
269 xor t3=t3,te32 };; // 14/3:
270{ .mib; xor r20=t1,te12 // 15/1:done!
271 xor r28=t3,te03 // 15/3:done!
272 br.ret.sptk b6 };;
273.endp _ia64_AES_encrypt#
274
275// void AES_encrypt (const void *in,void *out,const AES_KEY *key);
276.global AES_encrypt#
277.proc AES_encrypt#
278.align 32
279AES_encrypt:
280 .prologue
281 .save ar.pfs,pfssave
282{ .mmi; alloc pfssave=ar.pfs,3,1,12,0
283 and out0=3,in0
284 mov r3=ip }
285{ .mmi; ADDP in0=0,in0
286 mov loc0=psr.um
287 ADDP out11=KSZ*60,in2 };; // &AES_KEY->rounds
288
289{ .mmi; ld4 out11=[out11] // AES_KEY->rounds
290 add out8=(AES_Te#-AES_encrypt#),r3 // Te0
291 .save pr,prsave
292 mov prsave=pr }
293{ .mmi; rum 1<<3 // clear um.ac
294 .save ar.lc,lcsave
295 mov lcsave=ar.lc };;
296
297 .body
298#if defined(_HPUX_SOURCE) // HPUX is big-endian, cut 15+15 cycles...
299{ .mib; cmp.ne p6,p0=out0,r0
300 add out0=4,in0
301(p6) br.dpnt.many .Le_i_unaligned };;
302
303{ .mmi; ld4 out1=[in0],8 // s0
304 and out9=3,in1
305 mov twenty4=24 }
306{ .mmi; ld4 out3=[out0],8 // s1
307 ADDP rk0=0,in2
308 mov sixteen=16 };;
309{ .mmi; ld4 out5=[in0] // s2
310 cmp.ne p6,p0=out9,r0
311 mov maskff=0xff }
312{ .mmb; ld4 out7=[out0] // s3
313 ADDP rk1=KSZ,in2
314 br.call.sptk.many b6=_ia64_AES_encrypt };;
315
316{ .mib; ADDP in0=4,in1
317 ADDP in1=0,in1
318(p6) br.spnt .Le_o_unaligned };;
319
320{ .mii; mov psr.um=loc0
321 mov ar.pfs=pfssave
322 mov ar.lc=lcsave };;
323{ .mmi; st4 [in1]=r16,8 // s0
324 st4 [in0]=r20,8 // s1
325 mov pr=prsave,0x1ffff };;
326{ .mmb; st4 [in1]=r24 // s2
327 st4 [in0]=r28 // s3
328 br.ret.sptk.many b0 };;
329#endif
330
331.align 32
332.Le_i_unaligned:
333{ .mmi; add out0=1,in0
334 add out2=2,in0
335 add out4=3,in0 };;
336{ .mmi; ld1 r16=[in0],4
337 ld1 r17=[out0],4 }//;;
338{ .mmi; ld1 r18=[out2],4
339 ld1 out1=[out4],4 };; // s0
340{ .mmi; ld1 r20=[in0],4
341 ld1 r21=[out0],4 }//;;
342{ .mmi; ld1 r22=[out2],4
343 ld1 out3=[out4],4 };; // s1
344{ .mmi; ld1 r24=[in0],4
345 ld1 r25=[out0],4 }//;;
346{ .mmi; ld1 r26=[out2],4
347 ld1 out5=[out4],4 };; // s2
348{ .mmi; ld1 r28=[in0]
349 ld1 r29=[out0] }//;;
350{ .mmi; ld1 r30=[out2]
351 ld1 out7=[out4] };; // s3
352
353{ .mii;
354 dep out1=r16,out1,24,8 //;;
355 dep out3=r20,out3,24,8 }//;;
356{ .mii; ADDP rk0=0,in2
357 dep out5=r24,out5,24,8 //;;
358 dep out7=r28,out7,24,8 };;
359{ .mii; ADDP rk1=KSZ,in2
360 dep out1=r17,out1,16,8 //;;
361 dep out3=r21,out3,16,8 }//;;
362{ .mii; mov twenty4=24
363 dep out5=r25,out5,16,8 //;;
364 dep out7=r29,out7,16,8 };;
365{ .mii; mov sixteen=16
366 dep out1=r18,out1,8,8 //;;
367 dep out3=r22,out3,8,8 }//;;
368{ .mii; mov maskff=0xff
369 dep out5=r26,out5,8,8 //;;
370 dep out7=r30,out7,8,8 };;
371
372{ .mib; br.call.sptk.many b6=_ia64_AES_encrypt };;
373
374.Le_o_unaligned:
375{ .mii; ADDP out0=0,in1
376 extr.u r17=r16,8,8 // s0
377 shr.u r19=r16,twenty4 }//;;
378{ .mii; ADDP out1=1,in1
379 extr.u r18=r16,16,8
380 shr.u r23=r20,twenty4 }//;; // s1
381{ .mii; ADDP out2=2,in1
382 extr.u r21=r20,8,8
383 shr.u r22=r20,sixteen }//;;
384{ .mii; ADDP out3=3,in1
385 extr.u r25=r24,8,8 // s2
386 shr.u r27=r24,twenty4 };;
387{ .mii; st1 [out3]=r16,4
388 extr.u r26=r24,16,8
389 shr.u r31=r28,twenty4 }//;; // s3
390{ .mii; st1 [out2]=r17,4
391 extr.u r29=r28,8,8
392 shr.u r30=r28,sixteen }//;;
393
394{ .mmi; st1 [out1]=r18,4
395 st1 [out0]=r19,4 };;
396{ .mmi; st1 [out3]=r20,4
397 st1 [out2]=r21,4 }//;;
398{ .mmi; st1 [out1]=r22,4
399 st1 [out0]=r23,4 };;
400{ .mmi; st1 [out3]=r24,4
401 st1 [out2]=r25,4
402 mov pr=prsave,0x1ffff }//;;
403{ .mmi; st1 [out1]=r26,4
404 st1 [out0]=r27,4
405 mov ar.pfs=pfssave };;
406{ .mmi; st1 [out3]=r28
407 st1 [out2]=r29
408 mov ar.lc=lcsave }//;;
409{ .mmi; st1 [out1]=r30
410 st1 [out0]=r31 }
411{ .mfb; mov psr.um=loc0 // restore user mask
412 br.ret.sptk.many b0 };;
413.endp AES_encrypt#
414
415// *AES_decrypt are autogenerated by the following script:
416#if 0
417#!/usr/bin/env perl
418print "// *AES_decrypt are autogenerated by the following script:\n#if 0\n";
419open(PROG,'<'.$0); while(<PROG>) { print; } close(PROG);
420print "#endif\n";
421while(<>) {
422 $process=1 if (/\.proc\s+_ia64_AES_encrypt/);
423 next if (!$process);
424
425 #s/te00=s0/td00=s0/; s/te00/td00/g;
426 s/te11=s1/td13=s3/; s/te11/td13/g;
427 #s/te22=s2/td22=s2/; s/te22/td22/g;
428 s/te33=s3/td31=s1/; s/te33/td31/g;
429
430 #s/te01=s1/td01=s1/; s/te01/td01/g;
431 s/te12=s2/td10=s0/; s/te12/td10/g;
432 #s/te23=s3/td23=s3/; s/te23/td23/g;
433 s/te30=s0/td32=s2/; s/te30/td32/g;
434
435 #s/te02=s2/td02=s2/; s/te02/td02/g;
436 s/te13=s3/td11=s1/; s/te13/td11/g;
437 #s/te20=s0/td20=s0/; s/te20/td20/g;
438 s/te31=s1/td33=s3/; s/te31/td33/g;
439
440 #s/te03=s3/td03=s3/; s/te03/td03/g;
441 s/te10=s0/td12=s2/; s/te10/td12/g;
442 #s/te21=s1/td21=s1/; s/te21/td21/g;
443 s/te32=s2/td30=s0/; s/te32/td30/g;
444
445 s/td/te/g;
446
447 s/AES_encrypt/AES_decrypt/g;
448 s/\.Le_/.Ld_/g;
449 s/AES_Te#/AES_Td#/g;
450
451 print;
452
453 exit if (/\.endp\s+AES_decrypt/);
454}
455#endif
456.proc _ia64_AES_decrypt#
457// Input: rk0-rk1
458// te0
459// te3 as AES_KEY->rounds!!!
460// s0-s3
461// maskff,twenty4,sixteen
462// Output: r16,r20,r24,r28 as s0-s3
463// Clobber: r16-r31,rk0-rk1,r32-r43
464.align 32
465_ia64_AES_decrypt:
466 .prologue
467 .altrp b6
468 .body
469{ .mmi; alloc r16=ar.pfs,12,0,0,8
470 LDKEY t0=[rk0],2*KSZ
471 mov pr.rot=1<<16 }
472{ .mmi; LDKEY t1=[rk1],2*KSZ
473 add te1=TE1,te0
474 add te3=-3,te3 };;
475{ .mib; LDKEY t2=[rk0],2*KSZ
476 mov ar.ec=2 }
477{ .mib; LDKEY t3=[rk1],2*KSZ
478 add te2=TE2,te0
479 brp.loop.imp .Ld_top,.Ld_end-16 };;
480
481{ .mmi; xor s0=s0,t0
482 xor s1=s1,t1
483 mov ar.lc=te3 }
484{ .mmi; xor s2=s2,t2
485 xor s3=s3,t3
486 add te3=TE3,te0 };;
487
488.align 32
489.Ld_top:
490{ .mmi; (p0) LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
491 (p0) and te31=s1,maskff // 0/0:s3&0xff
492 (p0) extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
493{ .mmi; (p0) LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
494 (p0) and te32=s2,maskff // 0/1:s0&0xff
495 (p0) shr.u te00=s0,twenty4 };; // 0/0:s0>>24
496{ .mmi; (p0) LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
497 (p0) shladd te31=te31,3,te3 // 1/0:te0+s0>>24
498 (p0) extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
499{ .mmi; (p0) LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
500 (p0) shladd te32=te32,3,te3 // 1/1:te3+s0
501 (p0) shr.u te01=s1,twenty4 };; // 1/1:s1>>24
502{ .mmi; (p0) ld4 te31=[te31] // 2/0:te3[s3&0xff]
503 (p0) shladd te22=te22,3,te2 // 2/0:te2+s2>>8&0xff
504 (p0) extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
505{ .mmi; (p0) ld4 te32=[te32] // 2/1:te3[s0]
506 (p0) shladd te23=te23,3,te2 // 2/1:te2+s3>>8
507 (p0) shr.u te02=s2,twenty4 };; // 2/2:s2>>24
508{ .mmi; (p0) ld4 te22=[te22] // 3/0:te2[s2>>8]
509 (p0) shladd te20=te20,3,te2 // 3/2:te2+s0>>8
510 (p0) extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
511{ .mmi; (p0) ld4 te23=[te23] // 3/1:te2[s3>>8]
512 (p0) shladd te00=te00,3,te0 // 3/0:te0+s0>>24
513 (p0) shr.u te03=s3,twenty4 };; // 3/3:s3>>24
514{ .mmi; (p0) ld4 te20=[te20] // 4/2:te2[s0>>8]
515 (p0) shladd te21=te21,3,te2 // 4/3:te3+s2
516 (p0) extr.u te13=s3,16,8 } // 4/0:s1>>16&0xff
517{ .mmi; (p0) ld4 te00=[te00] // 4/0:te0[s0>>24]
518 (p0) shladd te01=te01,3,te0 // 4/1:te0+s1>>24
519 (p0) shr.u te11=s1,sixteen };; // 4/2:s3>>16
520{ .mmi; (p0) ld4 te21=[te21] // 5/3:te2[s1>>8]
521 (p0) shladd te13=te13,3,te1 // 5/0:te1+s1>>16
522 (p0) extr.u te10=s0,16,8 } // 5/1:s2>>16&0xff
523{ .mmi; (p0) ld4 te01=[te01] // 5/1:te0[s1>>24]
524 (p0) shladd te02=te02,3,te0 // 5/2:te0+s2>>24
525 (p0) and te33=s3,maskff };; // 5/2:s1&0xff
526{ .mmi; (p0) ld4 te13=[te13] // 6/0:te1[s1>>16]
527 (p0) shladd te10=te10,3,te1 // 6/1:te1+s2>>16
528 (p0) extr.u te12=s2,16,8 } // 6/3:s0>>16&0xff
529{ .mmi; (p0) ld4 te02=[te02] // 6/2:te0[s2>>24]
530 (p0) shladd te03=te03,3,te0 // 6/3:te1+s0>>16
531 (p0) and te30=s0,maskff };; // 6/3:s2&0xff
532
533{ .mmi; (p0) ld4 te10=[te10] // 7/1:te1[s2>>16]
534 (p0) shladd te33=te33,3,te3 // 7/2:te3+s1&0xff
535 (p0) and te11=te11,maskff} // 7/2:s3>>16&0xff
536{ .mmi; (p0) ld4 te03=[te03] // 7/3:te0[s3>>24]
537 (p0) shladd te30=te30,3,te3 // 7/3:te3+s2
538 (p0) xor t0=t0,te31 };; // 7/0:
539{ .mmi; (p0) ld4 te33=[te33] // 8/2:te3[s1]
540 (p0) shladd te11=te11,3,te1 // 8/2:te1+s3>>16
541 (p0) xor t0=t0,te22 } // 8/0:
542{ .mmi; (p0) ld4 te30=[te30] // 8/3:te3[s2]
543 (p0) shladd te12=te12,3,te1 // 8/3:te1+s0>>16
544 (p0) xor t1=t1,te32 };; // 8/1:
545{ .mmi; (p0) ld4 te11=[te11] // 9/2:te1[s3>>16]
546 (p0) ld4 te12=[te12] // 9/3:te1[s0>>16]
547 (p0) xor t0=t0,te00 };; // 9/0: !L2 scheduling
548{ .mmi; (p0) xor t1=t1,te23 // 10[9]/1:
549 (p0) xor t2=t2,te20 // 10[9]/2:
550 (p0) xor t3=t3,te21 };; // 10[9]/3:
551{ .mmi; (p0) xor t0=t0,te13 // 11[10]/0:done!
552 (p0) xor t1=t1,te01 // 11[10]/1:
553 (p0) xor t2=t2,te02 };; // 11[10]/2: !L2 scheduling
554{ .mmi; (p0) xor t3=t3,te03 // 12[10]/3:
555 (p16) cmp.eq p0,p17=r0,r0 };; // 12[10]/clear (p17)
556{ .mmi; (p0) xor t1=t1,te10 // 13[11]/1:done!
557 (p0) xor t2=t2,te33 // 13[11]/2:
558 (p0) xor t3=t3,te30 } // 13[11]/3:
559{ .mmi; (p17) add te0=2048,te0 // 13[11]/
560 (p17) add te1=2048+64-TE1,te1};; // 13[11]/
561{ .mib; (p0) xor t2=t2,te11 // 14[12]/2:done!
562 (p17) add te2=2048+128-TE2,te2} // 14[12]/
563{ .mib; (p0) xor t3=t3,te12 // 14[12]/3:done!
564 (p17) add te3=2048+192-TE3,te3 // 14[12]/
565 br.ctop.sptk .Ld_top };;
566.Ld_end:
567
568
569{ .mmi; ld8 te10=[te0] // prefetch Td4
570 ld8 te33=[te1] }
571{ .mmi; ld8 te12=[te2]
572 ld8 te30=[te3] }
573
574{ .mmi; LDKEY t0=[rk0],2*KSZ // 0/0:rk[0]
575 and te31=s1,maskff // 0/0:s3&0xff
576 extr.u te22=s2,8,8 } // 0/0:s2>>8&0xff
577{ .mmi; LDKEY t1=[rk1],2*KSZ // 0/1:rk[1]
578 and te32=s2,maskff // 0/1:s0&0xff
579 shr.u te00=s0,twenty4 };; // 0/0:s0>>24
580{ .mmi; LDKEY t2=[rk0],2*KSZ // 1/2:rk[2]
581 add te31=te31,te0 // 1/0:te0+s0>>24
582 extr.u te23=s3,8,8 } // 1/1:s3>>8&0xff
583{ .mmi; LDKEY t3=[rk1],2*KSZ // 1/3:rk[3]
584 add te32=te32,te0 // 1/1:te0+s0
585 shr.u te01=s1,twenty4 };; // 1/1:s1>>24
586{ .mmi; ld1 te31=[te31] // 2/0:te0[s3&0xff]
587 add te22=te22,te0 // 2/0:te0+s2>>8&0xff
588 extr.u te20=s0,8,8 } // 2/2:s0>>8&0xff
589{ .mmi; ld1 te32=[te32] // 2/1:te0[s0]
590 add te23=te23,te0 // 2/1:te0+s3>>8
591 shr.u te02=s2,twenty4 };; // 2/2:s2>>24
592{ .mmi; ld1 te22=[te22] // 3/0:te0[s2>>8]
593 add te20=te20,te0 // 3/2:te0+s0>>8
594 extr.u te21=s1,8,8 } // 3/3:s1>>8&0xff
595{ .mmi; ld1 te23=[te23] // 3/1:te0[s3>>8]
596 add te00=te00,te0 // 3/0:te0+s0>>24
597 shr.u te03=s3,twenty4 };; // 3/3:s3>>24
598{ .mmi; ld1 te20=[te20] // 4/2:te0[s0>>8]
599 add te21=te21,te0 // 4/3:te0+s2
600 extr.u te13=s3,16,8 } // 4/0:s1>>16&0xff
601{ .mmi; ld1 te00=[te00] // 4/0:te0[s0>>24]
602 add te01=te01,te0 // 4/1:te0+s1>>24
603 shr.u te11=s1,sixteen };; // 4/2:s3>>16
604{ .mmi; ld1 te21=[te21] // 5/3:te0[s1>>8]
605 add te13=te13,te0 // 5/0:te0+s1>>16
606 extr.u te10=s0,16,8 } // 5/1:s2>>16&0xff
607{ .mmi; ld1 te01=[te01] // 5/1:te0[s1>>24]
608 add te02=te02,te0 // 5/2:te0+s2>>24
609 and te33=s3,maskff };; // 5/2:s1&0xff
610{ .mmi; ld1 te13=[te13] // 6/0:te0[s1>>16]
611 add te10=te10,te0 // 6/1:te0+s2>>16
612 extr.u te12=s2,16,8 } // 6/3:s0>>16&0xff
613{ .mmi; ld1 te02=[te02] // 6/2:te0[s2>>24]
614 add te03=te03,te0 // 6/3:te0+s0>>16
615 and te30=s0,maskff };; // 6/3:s2&0xff
616
617{ .mmi; ld1 te10=[te10] // 7/1:te0[s2>>16]
618 add te33=te33,te0 // 7/2:te0+s1&0xff
619 dep te31=te22,te31,8,8} // 7/0:
620{ .mmi; ld1 te03=[te03] // 7/3:te0[s3>>24]
621 add te30=te30,te0 // 7/3:te0+s2
622 and te11=te11,maskff};; // 7/2:s3>>16&0xff
623{ .mmi; ld1 te33=[te33] // 8/2:te0[s1]
624 add te11=te11,te0 // 8/2:te0+s3>>16
625 dep te32=te23,te32,8,8} // 8/1:
626{ .mmi; ld1 te30=[te30] // 8/3:te0[s2]
627 add te12=te12,te0 // 8/3:te0+s0>>16
628 shl te00=te00,twenty4};; // 8/0:
629{ .mii; ld1 te11=[te11] // 9/2:te0[s3>>16]
630 dep te31=te13,te31,16,8 // 9/0:
631 shl te01=te01,twenty4};; // 9/1:
632{ .mii; ld1 te12=[te12] // 10/3:te0[s0>>16]
633 dep te33=te20,te33,8,8 // 10/2:
634 shl te02=te02,twenty4};; // 10/2:
635{ .mii; xor t0=t0,te31 // 11/0:
636 dep te30=te21,te30,8,8 // 11/3:
637 shl te10=te10,sixteen};; // 11/1:
638{ .mii; xor r16=t0,te00 // 12/0:done!
639 dep te33=te11,te33,16,8 // 12/2:
640 shl te03=te03,twenty4};; // 12/3:
641{ .mmi; xor t1=t1,te01 // 13/1:
642 xor t2=t2,te02 // 13/2:
643 dep te30=te12,te30,16,8};; // 13/3:
644{ .mmi; xor t1=t1,te32 // 14/1:
645 xor r24=t2,te33 // 14/2:done!
646 xor t3=t3,te30 };; // 14/3:
647{ .mib; xor r20=t1,te10 // 15/1:done!
648 xor r28=t3,te03 // 15/3:done!
649 br.ret.sptk b6 };;
650.endp _ia64_AES_decrypt#
651
652// void AES_decrypt (const void *in,void *out,const AES_KEY *key);
653.global AES_decrypt#
654.proc AES_decrypt#
655.align 32
656AES_decrypt:
657 .prologue
658 .save ar.pfs,pfssave
659{ .mmi; alloc pfssave=ar.pfs,3,1,12,0
660 and out0=3,in0
661 mov r3=ip }
662{ .mmi; ADDP in0=0,in0
663 mov loc0=psr.um
664 ADDP out11=KSZ*60,in2 };; // &AES_KEY->rounds
665
666{ .mmi; ld4 out11=[out11] // AES_KEY->rounds
667	add	out8=(AES_Td#-AES_decrypt#),r3	// Td0
668 .save pr,prsave
669 mov prsave=pr }
670{ .mmi; rum 1<<3 // clear um.ac
671 .save ar.lc,lcsave
672 mov lcsave=ar.lc };;
673
674 .body
675#if defined(_HPUX_SOURCE)	// HP-UX is big-endian; cuts 15+15 cycles...
676{ .mib; cmp.ne p6,p0=out0,r0
677 add out0=4,in0
678(p6) br.dpnt.many .Ld_i_unaligned };;
679
680{ .mmi; ld4 out1=[in0],8 // s0
681 and out9=3,in1
682 mov twenty4=24 }
683{ .mmi; ld4 out3=[out0],8 // s1
684 ADDP rk0=0,in2
685 mov sixteen=16 };;
686{ .mmi; ld4 out5=[in0] // s2
687 cmp.ne p6,p0=out9,r0
688 mov maskff=0xff }
689{ .mmb; ld4 out7=[out0] // s3
690 ADDP rk1=KSZ,in2
691 br.call.sptk.many b6=_ia64_AES_decrypt };;
692
693{ .mib; ADDP in0=4,in1
694 ADDP in1=0,in1
695(p6) br.spnt .Ld_o_unaligned };;
696
697{ .mii; mov psr.um=loc0
698 mov ar.pfs=pfssave
699 mov ar.lc=lcsave };;
700{ .mmi; st4 [in1]=r16,8 // s0
701 st4 [in0]=r20,8 // s1
702 mov pr=prsave,0x1ffff };;
703{ .mmb; st4 [in1]=r24 // s2
704 st4 [in0]=r28 // s3
705 br.ret.sptk.many b0 };;
706#endif
707
708.align 32
709.Ld_i_unaligned:
710{ .mmi; add out0=1,in0
711 add out2=2,in0
712 add out4=3,in0 };;
713{ .mmi; ld1 r16=[in0],4
714 ld1 r17=[out0],4 }//;;
715{ .mmi; ld1 r18=[out2],4
716 ld1 out1=[out4],4 };; // s0
717{ .mmi; ld1 r20=[in0],4
718 ld1 r21=[out0],4 }//;;
719{ .mmi; ld1 r22=[out2],4
720 ld1 out3=[out4],4 };; // s1
721{ .mmi; ld1 r24=[in0],4
722 ld1 r25=[out0],4 }//;;
723{ .mmi; ld1 r26=[out2],4
724 ld1 out5=[out4],4 };; // s2
725{ .mmi; ld1 r28=[in0]
726 ld1 r29=[out0] }//;;
727{ .mmi; ld1 r30=[out2]
728 ld1 out7=[out4] };; // s3
729
730{ .mii;
731 dep out1=r16,out1,24,8 //;;
732 dep out3=r20,out3,24,8 }//;;
733{ .mii; ADDP rk0=0,in2
734 dep out5=r24,out5,24,8 //;;
735 dep out7=r28,out7,24,8 };;
736{ .mii; ADDP rk1=KSZ,in2
737 dep out1=r17,out1,16,8 //;;
738 dep out3=r21,out3,16,8 }//;;
739{ .mii; mov twenty4=24
740 dep out5=r25,out5,16,8 //;;
741 dep out7=r29,out7,16,8 };;
742{ .mii; mov sixteen=16
743 dep out1=r18,out1,8,8 //;;
744 dep out3=r22,out3,8,8 }//;;
745{ .mii; mov maskff=0xff
746 dep out5=r26,out5,8,8 //;;
747 dep out7=r30,out7,8,8 };;
748
749{ .mib; br.call.sptk.many b6=_ia64_AES_decrypt };;
750
751.Ld_o_unaligned:
752{ .mii; ADDP out0=0,in1
753 extr.u r17=r16,8,8 // s0
754 shr.u r19=r16,twenty4 }//;;
755{ .mii; ADDP out1=1,in1
756 extr.u r18=r16,16,8
757 shr.u r23=r20,twenty4 }//;; // s1
758{ .mii; ADDP out2=2,in1
759 extr.u r21=r20,8,8
760 shr.u r22=r20,sixteen }//;;
761{ .mii; ADDP out3=3,in1
762 extr.u r25=r24,8,8 // s2
763 shr.u r27=r24,twenty4 };;
764{ .mii; st1 [out3]=r16,4
765 extr.u r26=r24,16,8
766 shr.u r31=r28,twenty4 }//;; // s3
767{ .mii; st1 [out2]=r17,4
768 extr.u r29=r28,8,8
769 shr.u r30=r28,sixteen }//;;
770
771{ .mmi; st1 [out1]=r18,4
772 st1 [out0]=r19,4 };;
773{ .mmi; st1 [out3]=r20,4
774 st1 [out2]=r21,4 }//;;
775{ .mmi; st1 [out1]=r22,4
776 st1 [out0]=r23,4 };;
777{ .mmi; st1 [out3]=r24,4
778 st1 [out2]=r25,4
779 mov pr=prsave,0x1ffff }//;;
780{ .mmi; st1 [out1]=r26,4
781 st1 [out0]=r27,4
782 mov ar.pfs=pfssave };;
783{ .mmi; st1 [out3]=r28
784 st1 [out2]=r29
785 mov ar.lc=lcsave }//;;
786{ .mmi; st1 [out1]=r30
787 st1 [out0]=r31 }
788{ .mfb; mov psr.um=loc0 // restore user mask
789 br.ret.sptk.many b0 };;
790.endp AES_decrypt#
791
792// leave it in .text segment...
793.align 64
794.global AES_Te#
795.type AES_Te#,@object
796AES_Te: data4 0xc66363a5,0xc66363a5, 0xf87c7c84,0xf87c7c84
797 data4 0xee777799,0xee777799, 0xf67b7b8d,0xf67b7b8d
798 data4 0xfff2f20d,0xfff2f20d, 0xd66b6bbd,0xd66b6bbd
799 data4 0xde6f6fb1,0xde6f6fb1, 0x91c5c554,0x91c5c554
800 data4 0x60303050,0x60303050, 0x02010103,0x02010103
801 data4 0xce6767a9,0xce6767a9, 0x562b2b7d,0x562b2b7d
802 data4 0xe7fefe19,0xe7fefe19, 0xb5d7d762,0xb5d7d762
803 data4 0x4dababe6,0x4dababe6, 0xec76769a,0xec76769a
804 data4 0x8fcaca45,0x8fcaca45, 0x1f82829d,0x1f82829d
805 data4 0x89c9c940,0x89c9c940, 0xfa7d7d87,0xfa7d7d87
806 data4 0xeffafa15,0xeffafa15, 0xb25959eb,0xb25959eb
807 data4 0x8e4747c9,0x8e4747c9, 0xfbf0f00b,0xfbf0f00b
808 data4 0x41adadec,0x41adadec, 0xb3d4d467,0xb3d4d467
809 data4 0x5fa2a2fd,0x5fa2a2fd, 0x45afafea,0x45afafea
810 data4 0x239c9cbf,0x239c9cbf, 0x53a4a4f7,0x53a4a4f7
811 data4 0xe4727296,0xe4727296, 0x9bc0c05b,0x9bc0c05b
812 data4 0x75b7b7c2,0x75b7b7c2, 0xe1fdfd1c,0xe1fdfd1c
813 data4 0x3d9393ae,0x3d9393ae, 0x4c26266a,0x4c26266a
814 data4 0x6c36365a,0x6c36365a, 0x7e3f3f41,0x7e3f3f41
815 data4 0xf5f7f702,0xf5f7f702, 0x83cccc4f,0x83cccc4f
816 data4 0x6834345c,0x6834345c, 0x51a5a5f4,0x51a5a5f4
817 data4 0xd1e5e534,0xd1e5e534, 0xf9f1f108,0xf9f1f108
818 data4 0xe2717193,0xe2717193, 0xabd8d873,0xabd8d873
819 data4 0x62313153,0x62313153, 0x2a15153f,0x2a15153f
820 data4 0x0804040c,0x0804040c, 0x95c7c752,0x95c7c752
821 data4 0x46232365,0x46232365, 0x9dc3c35e,0x9dc3c35e
822 data4 0x30181828,0x30181828, 0x379696a1,0x379696a1
823 data4 0x0a05050f,0x0a05050f, 0x2f9a9ab5,0x2f9a9ab5
824 data4 0x0e070709,0x0e070709, 0x24121236,0x24121236
825 data4 0x1b80809b,0x1b80809b, 0xdfe2e23d,0xdfe2e23d
826 data4 0xcdebeb26,0xcdebeb26, 0x4e272769,0x4e272769
827 data4 0x7fb2b2cd,0x7fb2b2cd, 0xea75759f,0xea75759f
828 data4 0x1209091b,0x1209091b, 0x1d83839e,0x1d83839e
829 data4 0x582c2c74,0x582c2c74, 0x341a1a2e,0x341a1a2e
830 data4 0x361b1b2d,0x361b1b2d, 0xdc6e6eb2,0xdc6e6eb2
831 data4 0xb45a5aee,0xb45a5aee, 0x5ba0a0fb,0x5ba0a0fb
832 data4 0xa45252f6,0xa45252f6, 0x763b3b4d,0x763b3b4d
833 data4 0xb7d6d661,0xb7d6d661, 0x7db3b3ce,0x7db3b3ce
834 data4 0x5229297b,0x5229297b, 0xdde3e33e,0xdde3e33e
835 data4 0x5e2f2f71,0x5e2f2f71, 0x13848497,0x13848497
836 data4 0xa65353f5,0xa65353f5, 0xb9d1d168,0xb9d1d168
837 data4 0x00000000,0x00000000, 0xc1eded2c,0xc1eded2c
838 data4 0x40202060,0x40202060, 0xe3fcfc1f,0xe3fcfc1f
839 data4 0x79b1b1c8,0x79b1b1c8, 0xb65b5bed,0xb65b5bed
840 data4 0xd46a6abe,0xd46a6abe, 0x8dcbcb46,0x8dcbcb46
841 data4 0x67bebed9,0x67bebed9, 0x7239394b,0x7239394b
842 data4 0x944a4ade,0x944a4ade, 0x984c4cd4,0x984c4cd4
843 data4 0xb05858e8,0xb05858e8, 0x85cfcf4a,0x85cfcf4a
844 data4 0xbbd0d06b,0xbbd0d06b, 0xc5efef2a,0xc5efef2a
845 data4 0x4faaaae5,0x4faaaae5, 0xedfbfb16,0xedfbfb16
846 data4 0x864343c5,0x864343c5, 0x9a4d4dd7,0x9a4d4dd7
847 data4 0x66333355,0x66333355, 0x11858594,0x11858594
848 data4 0x8a4545cf,0x8a4545cf, 0xe9f9f910,0xe9f9f910
849 data4 0x04020206,0x04020206, 0xfe7f7f81,0xfe7f7f81
850 data4 0xa05050f0,0xa05050f0, 0x783c3c44,0x783c3c44
851 data4 0x259f9fba,0x259f9fba, 0x4ba8a8e3,0x4ba8a8e3
852 data4 0xa25151f3,0xa25151f3, 0x5da3a3fe,0x5da3a3fe
853 data4 0x804040c0,0x804040c0, 0x058f8f8a,0x058f8f8a
854 data4 0x3f9292ad,0x3f9292ad, 0x219d9dbc,0x219d9dbc
855 data4 0x70383848,0x70383848, 0xf1f5f504,0xf1f5f504
856 data4 0x63bcbcdf,0x63bcbcdf, 0x77b6b6c1,0x77b6b6c1
857 data4 0xafdada75,0xafdada75, 0x42212163,0x42212163
858 data4 0x20101030,0x20101030, 0xe5ffff1a,0xe5ffff1a
859 data4 0xfdf3f30e,0xfdf3f30e, 0xbfd2d26d,0xbfd2d26d
860 data4 0x81cdcd4c,0x81cdcd4c, 0x180c0c14,0x180c0c14
861 data4 0x26131335,0x26131335, 0xc3ecec2f,0xc3ecec2f
862 data4 0xbe5f5fe1,0xbe5f5fe1, 0x359797a2,0x359797a2
863 data4 0x884444cc,0x884444cc, 0x2e171739,0x2e171739
864 data4 0x93c4c457,0x93c4c457, 0x55a7a7f2,0x55a7a7f2
865 data4 0xfc7e7e82,0xfc7e7e82, 0x7a3d3d47,0x7a3d3d47
866 data4 0xc86464ac,0xc86464ac, 0xba5d5de7,0xba5d5de7
867 data4 0x3219192b,0x3219192b, 0xe6737395,0xe6737395
868 data4 0xc06060a0,0xc06060a0, 0x19818198,0x19818198
869 data4 0x9e4f4fd1,0x9e4f4fd1, 0xa3dcdc7f,0xa3dcdc7f
870 data4 0x44222266,0x44222266, 0x542a2a7e,0x542a2a7e
871 data4 0x3b9090ab,0x3b9090ab, 0x0b888883,0x0b888883
872 data4 0x8c4646ca,0x8c4646ca, 0xc7eeee29,0xc7eeee29
873 data4 0x6bb8b8d3,0x6bb8b8d3, 0x2814143c,0x2814143c
874 data4 0xa7dede79,0xa7dede79, 0xbc5e5ee2,0xbc5e5ee2
875 data4 0x160b0b1d,0x160b0b1d, 0xaddbdb76,0xaddbdb76
876 data4 0xdbe0e03b,0xdbe0e03b, 0x64323256,0x64323256
877 data4 0x743a3a4e,0x743a3a4e, 0x140a0a1e,0x140a0a1e
878 data4 0x924949db,0x924949db, 0x0c06060a,0x0c06060a
879 data4 0x4824246c,0x4824246c, 0xb85c5ce4,0xb85c5ce4
880 data4 0x9fc2c25d,0x9fc2c25d, 0xbdd3d36e,0xbdd3d36e
881 data4 0x43acacef,0x43acacef, 0xc46262a6,0xc46262a6
882 data4 0x399191a8,0x399191a8, 0x319595a4,0x319595a4
883 data4 0xd3e4e437,0xd3e4e437, 0xf279798b,0xf279798b
884 data4 0xd5e7e732,0xd5e7e732, 0x8bc8c843,0x8bc8c843
885 data4 0x6e373759,0x6e373759, 0xda6d6db7,0xda6d6db7
886 data4 0x018d8d8c,0x018d8d8c, 0xb1d5d564,0xb1d5d564
887 data4 0x9c4e4ed2,0x9c4e4ed2, 0x49a9a9e0,0x49a9a9e0
888 data4 0xd86c6cb4,0xd86c6cb4, 0xac5656fa,0xac5656fa
889 data4 0xf3f4f407,0xf3f4f407, 0xcfeaea25,0xcfeaea25
890 data4 0xca6565af,0xca6565af, 0xf47a7a8e,0xf47a7a8e
891 data4 0x47aeaee9,0x47aeaee9, 0x10080818,0x10080818
892 data4 0x6fbabad5,0x6fbabad5, 0xf0787888,0xf0787888
893 data4 0x4a25256f,0x4a25256f, 0x5c2e2e72,0x5c2e2e72
894 data4 0x381c1c24,0x381c1c24, 0x57a6a6f1,0x57a6a6f1
895 data4 0x73b4b4c7,0x73b4b4c7, 0x97c6c651,0x97c6c651
896 data4 0xcbe8e823,0xcbe8e823, 0xa1dddd7c,0xa1dddd7c
897 data4 0xe874749c,0xe874749c, 0x3e1f1f21,0x3e1f1f21
898 data4 0x964b4bdd,0x964b4bdd, 0x61bdbddc,0x61bdbddc
899 data4 0x0d8b8b86,0x0d8b8b86, 0x0f8a8a85,0x0f8a8a85
900 data4 0xe0707090,0xe0707090, 0x7c3e3e42,0x7c3e3e42
901 data4 0x71b5b5c4,0x71b5b5c4, 0xcc6666aa,0xcc6666aa
902 data4 0x904848d8,0x904848d8, 0x06030305,0x06030305
903 data4 0xf7f6f601,0xf7f6f601, 0x1c0e0e12,0x1c0e0e12
904 data4 0xc26161a3,0xc26161a3, 0x6a35355f,0x6a35355f
905 data4 0xae5757f9,0xae5757f9, 0x69b9b9d0,0x69b9b9d0
906 data4 0x17868691,0x17868691, 0x99c1c158,0x99c1c158
907 data4 0x3a1d1d27,0x3a1d1d27, 0x279e9eb9,0x279e9eb9
908 data4 0xd9e1e138,0xd9e1e138, 0xebf8f813,0xebf8f813
909 data4 0x2b9898b3,0x2b9898b3, 0x22111133,0x22111133
910 data4 0xd26969bb,0xd26969bb, 0xa9d9d970,0xa9d9d970
911 data4 0x078e8e89,0x078e8e89, 0x339494a7,0x339494a7
912 data4 0x2d9b9bb6,0x2d9b9bb6, 0x3c1e1e22,0x3c1e1e22
913 data4 0x15878792,0x15878792, 0xc9e9e920,0xc9e9e920
914 data4 0x87cece49,0x87cece49, 0xaa5555ff,0xaa5555ff
915 data4 0x50282878,0x50282878, 0xa5dfdf7a,0xa5dfdf7a
916 data4 0x038c8c8f,0x038c8c8f, 0x59a1a1f8,0x59a1a1f8
917 data4 0x09898980,0x09898980, 0x1a0d0d17,0x1a0d0d17
918 data4 0x65bfbfda,0x65bfbfda, 0xd7e6e631,0xd7e6e631
919 data4 0x844242c6,0x844242c6, 0xd06868b8,0xd06868b8
920 data4 0x824141c3,0x824141c3, 0x299999b0,0x299999b0
921 data4 0x5a2d2d77,0x5a2d2d77, 0x1e0f0f11,0x1e0f0f11
922 data4 0x7bb0b0cb,0x7bb0b0cb, 0xa85454fc,0xa85454fc
923 data4 0x6dbbbbd6,0x6dbbbbd6, 0x2c16163a,0x2c16163a
924// Te4:
925 data1 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
926 data1 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
927 data1 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
928 data1 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
929 data1 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
930 data1 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
931 data1 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
932 data1 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
933 data1 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
934 data1 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
935 data1 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
936 data1 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
937 data1 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
938 data1 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
939 data1 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
940 data1 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
941 data1 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
942 data1 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
943 data1 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
944 data1 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
945 data1 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
946 data1 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
947 data1 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
948 data1 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
949 data1 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
950 data1 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
951 data1 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
952 data1 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
953 data1 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
954 data1 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
955 data1 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
956 data1 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
957.size	AES_Te#,2048+256	// HP-UX assembler fails on ".-AES_Te#"
958
959.align 64
960.global AES_Td#
961.type AES_Td#,@object
962AES_Td: data4 0x51f4a750,0x51f4a750, 0x7e416553,0x7e416553
963 data4 0x1a17a4c3,0x1a17a4c3, 0x3a275e96,0x3a275e96
964 data4 0x3bab6bcb,0x3bab6bcb, 0x1f9d45f1,0x1f9d45f1
965 data4 0xacfa58ab,0xacfa58ab, 0x4be30393,0x4be30393
966 data4 0x2030fa55,0x2030fa55, 0xad766df6,0xad766df6
967 data4 0x88cc7691,0x88cc7691, 0xf5024c25,0xf5024c25
968 data4 0x4fe5d7fc,0x4fe5d7fc, 0xc52acbd7,0xc52acbd7
969 data4 0x26354480,0x26354480, 0xb562a38f,0xb562a38f
970 data4 0xdeb15a49,0xdeb15a49, 0x25ba1b67,0x25ba1b67
971 data4 0x45ea0e98,0x45ea0e98, 0x5dfec0e1,0x5dfec0e1
972 data4 0xc32f7502,0xc32f7502, 0x814cf012,0x814cf012
973 data4 0x8d4697a3,0x8d4697a3, 0x6bd3f9c6,0x6bd3f9c6
974 data4 0x038f5fe7,0x038f5fe7, 0x15929c95,0x15929c95
975 data4 0xbf6d7aeb,0xbf6d7aeb, 0x955259da,0x955259da
976 data4 0xd4be832d,0xd4be832d, 0x587421d3,0x587421d3
977 data4 0x49e06929,0x49e06929, 0x8ec9c844,0x8ec9c844
978 data4 0x75c2896a,0x75c2896a, 0xf48e7978,0xf48e7978
979 data4 0x99583e6b,0x99583e6b, 0x27b971dd,0x27b971dd
980 data4 0xbee14fb6,0xbee14fb6, 0xf088ad17,0xf088ad17
981 data4 0xc920ac66,0xc920ac66, 0x7dce3ab4,0x7dce3ab4
982 data4 0x63df4a18,0x63df4a18, 0xe51a3182,0xe51a3182
983 data4 0x97513360,0x97513360, 0x62537f45,0x62537f45
984 data4 0xb16477e0,0xb16477e0, 0xbb6bae84,0xbb6bae84
985 data4 0xfe81a01c,0xfe81a01c, 0xf9082b94,0xf9082b94
986 data4 0x70486858,0x70486858, 0x8f45fd19,0x8f45fd19
987 data4 0x94de6c87,0x94de6c87, 0x527bf8b7,0x527bf8b7
988 data4 0xab73d323,0xab73d323, 0x724b02e2,0x724b02e2
989 data4 0xe31f8f57,0xe31f8f57, 0x6655ab2a,0x6655ab2a
990 data4 0xb2eb2807,0xb2eb2807, 0x2fb5c203,0x2fb5c203
991 data4 0x86c57b9a,0x86c57b9a, 0xd33708a5,0xd33708a5
992 data4 0x302887f2,0x302887f2, 0x23bfa5b2,0x23bfa5b2
993 data4 0x02036aba,0x02036aba, 0xed16825c,0xed16825c
994 data4 0x8acf1c2b,0x8acf1c2b, 0xa779b492,0xa779b492
995 data4 0xf307f2f0,0xf307f2f0, 0x4e69e2a1,0x4e69e2a1
996 data4 0x65daf4cd,0x65daf4cd, 0x0605bed5,0x0605bed5
997 data4 0xd134621f,0xd134621f, 0xc4a6fe8a,0xc4a6fe8a
998 data4 0x342e539d,0x342e539d, 0xa2f355a0,0xa2f355a0
999 data4 0x058ae132,0x058ae132, 0xa4f6eb75,0xa4f6eb75
1000 data4 0x0b83ec39,0x0b83ec39, 0x4060efaa,0x4060efaa
1001 data4 0x5e719f06,0x5e719f06, 0xbd6e1051,0xbd6e1051
1002 data4 0x3e218af9,0x3e218af9, 0x96dd063d,0x96dd063d
1003 data4 0xdd3e05ae,0xdd3e05ae, 0x4de6bd46,0x4de6bd46
1004 data4 0x91548db5,0x91548db5, 0x71c45d05,0x71c45d05
1005 data4 0x0406d46f,0x0406d46f, 0x605015ff,0x605015ff
1006 data4 0x1998fb24,0x1998fb24, 0xd6bde997,0xd6bde997
1007 data4 0x894043cc,0x894043cc, 0x67d99e77,0x67d99e77
1008 data4 0xb0e842bd,0xb0e842bd, 0x07898b88,0x07898b88
1009 data4 0xe7195b38,0xe7195b38, 0x79c8eedb,0x79c8eedb
1010 data4 0xa17c0a47,0xa17c0a47, 0x7c420fe9,0x7c420fe9
1011 data4 0xf8841ec9,0xf8841ec9, 0x00000000,0x00000000
1012 data4 0x09808683,0x09808683, 0x322bed48,0x322bed48
1013 data4 0x1e1170ac,0x1e1170ac, 0x6c5a724e,0x6c5a724e
1014 data4 0xfd0efffb,0xfd0efffb, 0x0f853856,0x0f853856
1015 data4 0x3daed51e,0x3daed51e, 0x362d3927,0x362d3927
1016 data4 0x0a0fd964,0x0a0fd964, 0x685ca621,0x685ca621
1017 data4 0x9b5b54d1,0x9b5b54d1, 0x24362e3a,0x24362e3a
1018 data4 0x0c0a67b1,0x0c0a67b1, 0x9357e70f,0x9357e70f
1019 data4 0xb4ee96d2,0xb4ee96d2, 0x1b9b919e,0x1b9b919e
1020 data4 0x80c0c54f,0x80c0c54f, 0x61dc20a2,0x61dc20a2
1021 data4 0x5a774b69,0x5a774b69, 0x1c121a16,0x1c121a16
1022 data4 0xe293ba0a,0xe293ba0a, 0xc0a02ae5,0xc0a02ae5
1023 data4 0x3c22e043,0x3c22e043, 0x121b171d,0x121b171d
1024 data4 0x0e090d0b,0x0e090d0b, 0xf28bc7ad,0xf28bc7ad
1025 data4 0x2db6a8b9,0x2db6a8b9, 0x141ea9c8,0x141ea9c8
1026 data4 0x57f11985,0x57f11985, 0xaf75074c,0xaf75074c
1027 data4 0xee99ddbb,0xee99ddbb, 0xa37f60fd,0xa37f60fd
1028 data4 0xf701269f,0xf701269f, 0x5c72f5bc,0x5c72f5bc
1029 data4 0x44663bc5,0x44663bc5, 0x5bfb7e34,0x5bfb7e34
1030 data4 0x8b432976,0x8b432976, 0xcb23c6dc,0xcb23c6dc
1031 data4 0xb6edfc68,0xb6edfc68, 0xb8e4f163,0xb8e4f163
1032 data4 0xd731dcca,0xd731dcca, 0x42638510,0x42638510
1033 data4 0x13972240,0x13972240, 0x84c61120,0x84c61120
1034 data4 0x854a247d,0x854a247d, 0xd2bb3df8,0xd2bb3df8
1035 data4 0xaef93211,0xaef93211, 0xc729a16d,0xc729a16d
1036 data4 0x1d9e2f4b,0x1d9e2f4b, 0xdcb230f3,0xdcb230f3
1037 data4 0x0d8652ec,0x0d8652ec, 0x77c1e3d0,0x77c1e3d0
1038 data4 0x2bb3166c,0x2bb3166c, 0xa970b999,0xa970b999
1039 data4 0x119448fa,0x119448fa, 0x47e96422,0x47e96422
1040 data4 0xa8fc8cc4,0xa8fc8cc4, 0xa0f03f1a,0xa0f03f1a
1041 data4 0x567d2cd8,0x567d2cd8, 0x223390ef,0x223390ef
1042 data4 0x87494ec7,0x87494ec7, 0xd938d1c1,0xd938d1c1
1043 data4 0x8ccaa2fe,0x8ccaa2fe, 0x98d40b36,0x98d40b36
1044 data4 0xa6f581cf,0xa6f581cf, 0xa57ade28,0xa57ade28
1045 data4 0xdab78e26,0xdab78e26, 0x3fadbfa4,0x3fadbfa4
1046 data4 0x2c3a9de4,0x2c3a9de4, 0x5078920d,0x5078920d
1047 data4 0x6a5fcc9b,0x6a5fcc9b, 0x547e4662,0x547e4662
1048 data4 0xf68d13c2,0xf68d13c2, 0x90d8b8e8,0x90d8b8e8
1049 data4 0x2e39f75e,0x2e39f75e, 0x82c3aff5,0x82c3aff5
1050 data4 0x9f5d80be,0x9f5d80be, 0x69d0937c,0x69d0937c
1051 data4 0x6fd52da9,0x6fd52da9, 0xcf2512b3,0xcf2512b3
1052 data4 0xc8ac993b,0xc8ac993b, 0x10187da7,0x10187da7
1053 data4 0xe89c636e,0xe89c636e, 0xdb3bbb7b,0xdb3bbb7b
1054 data4 0xcd267809,0xcd267809, 0x6e5918f4,0x6e5918f4
1055 data4 0xec9ab701,0xec9ab701, 0x834f9aa8,0x834f9aa8
1056 data4 0xe6956e65,0xe6956e65, 0xaaffe67e,0xaaffe67e
1057 data4 0x21bccf08,0x21bccf08, 0xef15e8e6,0xef15e8e6
1058 data4 0xbae79bd9,0xbae79bd9, 0x4a6f36ce,0x4a6f36ce
1059 data4 0xea9f09d4,0xea9f09d4, 0x29b07cd6,0x29b07cd6
1060 data4 0x31a4b2af,0x31a4b2af, 0x2a3f2331,0x2a3f2331
1061 data4 0xc6a59430,0xc6a59430, 0x35a266c0,0x35a266c0
1062 data4 0x744ebc37,0x744ebc37, 0xfc82caa6,0xfc82caa6
1063 data4 0xe090d0b0,0xe090d0b0, 0x33a7d815,0x33a7d815
1064 data4 0xf104984a,0xf104984a, 0x41ecdaf7,0x41ecdaf7
1065 data4 0x7fcd500e,0x7fcd500e, 0x1791f62f,0x1791f62f
1066 data4 0x764dd68d,0x764dd68d, 0x43efb04d,0x43efb04d
1067 data4 0xccaa4d54,0xccaa4d54, 0xe49604df,0xe49604df
1068 data4 0x9ed1b5e3,0x9ed1b5e3, 0x4c6a881b,0x4c6a881b
1069 data4 0xc12c1fb8,0xc12c1fb8, 0x4665517f,0x4665517f
1070 data4 0x9d5eea04,0x9d5eea04, 0x018c355d,0x018c355d
1071 data4 0xfa877473,0xfa877473, 0xfb0b412e,0xfb0b412e
1072 data4 0xb3671d5a,0xb3671d5a, 0x92dbd252,0x92dbd252
1073 data4 0xe9105633,0xe9105633, 0x6dd64713,0x6dd64713
1074 data4 0x9ad7618c,0x9ad7618c, 0x37a10c7a,0x37a10c7a
1075 data4 0x59f8148e,0x59f8148e, 0xeb133c89,0xeb133c89
1076 data4 0xcea927ee,0xcea927ee, 0xb761c935,0xb761c935
1077 data4 0xe11ce5ed,0xe11ce5ed, 0x7a47b13c,0x7a47b13c
1078 data4 0x9cd2df59,0x9cd2df59, 0x55f2733f,0x55f2733f
1079 data4 0x1814ce79,0x1814ce79, 0x73c737bf,0x73c737bf
1080 data4 0x53f7cdea,0x53f7cdea, 0x5ffdaa5b,0x5ffdaa5b
1081 data4 0xdf3d6f14,0xdf3d6f14, 0x7844db86,0x7844db86
1082 data4 0xcaaff381,0xcaaff381, 0xb968c43e,0xb968c43e
1083 data4 0x3824342c,0x3824342c, 0xc2a3405f,0xc2a3405f
1084 data4 0x161dc372,0x161dc372, 0xbce2250c,0xbce2250c
1085 data4 0x283c498b,0x283c498b, 0xff0d9541,0xff0d9541
1086 data4 0x39a80171,0x39a80171, 0x080cb3de,0x080cb3de
1087 data4 0xd8b4e49c,0xd8b4e49c, 0x6456c190,0x6456c190
1088 data4 0x7bcb8461,0x7bcb8461, 0xd532b670,0xd532b670
1089 data4 0x486c5c74,0x486c5c74, 0xd0b85742,0xd0b85742
1090// Td4:
1091 data1 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
1092 data1 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
1093 data1 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
1094 data1 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
1095 data1 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
1096 data1 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
1097 data1 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
1098 data1 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
1099 data1 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
1100 data1 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
1101 data1 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
1102 data1 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
1103 data1 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
1104 data1 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
1105 data1 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
1106 data1 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
1107 data1 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
1108 data1 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
1109 data1 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
1110 data1 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
1111 data1 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
1112 data1 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
1113 data1 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
1114 data1 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
1115 data1 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
1116 data1 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
1117 data1 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1118 data1 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1119 data1 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1120 data1 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1121 data1 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1122 data1 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1123.size	AES_Td#,2048+256	// HP-UX assembler fails on ".-AES_Td#"
diff --git a/src/lib/libcrypto/aes/asm/aes-mips.pl b/src/lib/libcrypto/aes/asm/aes-mips.pl
new file mode 100644
index 0000000000..2ce6deffc8
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-mips.pl
@@ -0,0 +1,1611 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for MIPS
11
12# October 2010
13#
14# Code uses a 1K[+256B] S-box and on a single-issue core [such as
15# R5000] spends ~68 cycles per byte processed with a 128-bit key. This
16# is ~16% faster than gcc-generated code, which is not very impressive.
17# But recall that the compressed S-box requires extra processing,
18# namely additional rotations. Rotations are implemented with lwl/lwr
19# pairs, which are normally used for loading unaligned data. Another
20# cool thing about this module is its endian neutrality: it processes
21# data without ever changing byte order...
22
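# To make the compressed-table trick concrete, here is a minimal Perl
# model of one rotated table reference (an illustrative sketch only; it
# is not used by the generator): the byte offset into the 1KB table is
# formed as (s>>14)&0x3fc, i.e. ((s>>16)&0xff)*4, which is what the
# _xtr/and pairs below compute, and Te1[x] equals Te0[x] rotated right
# by 8 bits, which is what the lwl/lwr pairs at offsets 3/2, 2/1 and
# 1/0 reconstruct from the byte-laid-out table.
sub _demo_Te1_ref {
	my ($Te0,$s) = @_;		# $Te0: reference to the 256 Te0 words
	my $off = ($s >> 14) & 0x3fc;	# pre-scaled byte offset
	my $w = $Te0->[$off >> 2];
	return (($w >> 8) | ($w << 24)) & 0xffffffff;	# ror32(Te0[x],8)
}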
23######################################################################
24# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
25# widely used. Then there is a new contender: NUBI. It appears that if
26# one picks the latter, it's possible to arrange code in an ABI-neutral
27# manner. Therefore let's stick to the NUBI register layout:
28#
29($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
30($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
31($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
32($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
33#
34# The return value is placed in $a0. The following coding rules
35# facilitate interoperability:
36#
37# - never ever touch $tp, "thread pointer", former $gp;
38# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
39# old code];
40# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
41#
42# For reference, here is the register layout for N32/64 MIPS ABIs:
43#
44# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
45# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
46# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
47# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
48# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
49#
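# For reference, a typical invocation of this generator looks like the
# following (a sketch only; the real build rules live in the library's
# Makefiles, and the output file name is just an example):
#
#	perl aes-mips.pl o32 aes-mips.S		# 32-bit O32 ABI
#	perl aes-mips.pl 64 aes-mips.S		# 64-bit ABI
#	perl aes-mips.pl nubi32 aes-mips.S	# 32-bit NUBI ABI
#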
50$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
51
52if ($flavour =~ /64|n32/i) {
53 $PTR_ADD="dadd"; # incidentally works even on n32
54 $PTR_SUB="dsub"; # incidentally works even on n32
55 $REG_S="sd";
56 $REG_L="ld";
57 $PTR_SLL="dsll"; # incidentally works even on n32
58 $SZREG=8;
59} else {
60 $PTR_ADD="add";
61 $PTR_SUB="sub";
62 $REG_S="sw";
63 $REG_L="lw";
64 $PTR_SLL="sll";
65 $SZREG=4;
66}
67$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;	# register holding the function
						# address at entry, consumed by
						# .cpload/.cpsetup below
68#
69# <appro@openssl.org>
70#
71######################################################################
72
73# compile-time endianness probe; leaves $big_endian undefined if there
74# is no compiler to ask
75$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0
76	if (defined($ENV{CC}));
77
78if (!defined($big_endian))	# fall back to run-time detection
79{ $big_endian=(unpack('L',pack('N',1))==1); }
80
81while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
82open STDOUT,">$output";
83
84my ($MSB,$LSB)=(0,3); # automatically converted to little-endian
85
86$code.=<<___;
87.text
88#ifdef OPENSSL_FIPSCANISTER
89# include <openssl/fipssyms.h>
90#endif
91
92#if !defined(__vxworks) || defined(__pic__)
93.option pic2
94#endif
95.set noat
96___
97
98{{{
99my $FRAMESIZE=16*$SZREG;
100my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
101
102my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);
103my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
104my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23));
105my ($key0,$cnt)=($gp,$fp);
106
107# instruction ordering is "stolen" from the output of the MIPSpro assembler
108# invoked with -mips3 -O3 arguments...
109$code.=<<___;
110.align 5
111.ent _mips_AES_encrypt
112_mips_AES_encrypt:
113 .frame $sp,0,$ra
114 .set reorder
115 lw $t0,0($key)
116 lw $t1,4($key)
117 lw $t2,8($key)
118 lw $t3,12($key)
119 lw $cnt,240($key)
120 $PTR_ADD $key0,$key,16
121
122 xor $s0,$t0
123 xor $s1,$t1
124 xor $s2,$t2
125 xor $s3,$t3
126
127 sub $cnt,1
128 _xtr $i0,$s1,16-2
129.Loop_enc:
130 _xtr $i1,$s2,16-2
131 _xtr $i2,$s3,16-2
132 _xtr $i3,$s0,16-2
133 and $i0,0x3fc
134 and $i1,0x3fc
135 and $i2,0x3fc
136 and $i3,0x3fc
137 $PTR_ADD $i0,$Tbl
138 $PTR_ADD $i1,$Tbl
139 $PTR_ADD $i2,$Tbl
140 $PTR_ADD $i3,$Tbl
141 lwl $t0,3($i0) # Te1[s1>>16]
142 lwl $t1,3($i1) # Te1[s2>>16]
143 lwl $t2,3($i2) # Te1[s3>>16]
144 lwl $t3,3($i3) # Te1[s0>>16]
145 lwr $t0,2($i0) # Te1[s1>>16]
146 lwr $t1,2($i1) # Te1[s2>>16]
147 lwr $t2,2($i2) # Te1[s3>>16]
148 lwr $t3,2($i3) # Te1[s0>>16]
149
150 _xtr $i0,$s2,8-2
151 _xtr $i1,$s3,8-2
152 _xtr $i2,$s0,8-2
153 _xtr $i3,$s1,8-2
154 and $i0,0x3fc
155 and $i1,0x3fc
156 and $i2,0x3fc
157 and $i3,0x3fc
158 $PTR_ADD $i0,$Tbl
159 $PTR_ADD $i1,$Tbl
160 $PTR_ADD $i2,$Tbl
161 $PTR_ADD $i3,$Tbl
162 lwl $t4,2($i0) # Te2[s2>>8]
163 lwl $t5,2($i1) # Te2[s3>>8]
164 lwl $t6,2($i2) # Te2[s0>>8]
165 lwl $t7,2($i3) # Te2[s1>>8]
166 lwr $t4,1($i0) # Te2[s2>>8]
167 lwr $t5,1($i1) # Te2[s3>>8]
168 lwr $t6,1($i2) # Te2[s0>>8]
169 lwr $t7,1($i3) # Te2[s1>>8]
170
171 _xtr $i0,$s3,0-2
172 _xtr $i1,$s0,0-2
173 _xtr $i2,$s1,0-2
174 _xtr $i3,$s2,0-2
175 and $i0,0x3fc
176 and $i1,0x3fc
177 and $i2,0x3fc
178 and $i3,0x3fc
179 $PTR_ADD $i0,$Tbl
180 $PTR_ADD $i1,$Tbl
181 $PTR_ADD $i2,$Tbl
182 $PTR_ADD $i3,$Tbl
183 lwl $t8,1($i0) # Te3[s3]
184 lwl $t9,1($i1) # Te3[s0]
185 lwl $t10,1($i2) # Te3[s1]
186 lwl $t11,1($i3) # Te3[s2]
187 lwr $t8,0($i0) # Te3[s3]
188 lwr $t9,0($i1) # Te3[s0]
189 lwr $t10,0($i2) # Te3[s1]
190 lwr $t11,0($i3) # Te3[s2]
191
192 _xtr $i0,$s0,24-2
193 _xtr $i1,$s1,24-2
194 _xtr $i2,$s2,24-2
195 _xtr $i3,$s3,24-2
196 and $i0,0x3fc
197 and $i1,0x3fc
198 and $i2,0x3fc
199 and $i3,0x3fc
200 $PTR_ADD $i0,$Tbl
201 $PTR_ADD $i1,$Tbl
202 $PTR_ADD $i2,$Tbl
203 $PTR_ADD $i3,$Tbl
204 xor $t0,$t4
205 xor $t1,$t5
206 xor $t2,$t6
207 xor $t3,$t7
208 lw $t4,0($i0) # Te0[s0>>24]
209 lw $t5,0($i1) # Te0[s1>>24]
210 lw $t6,0($i2) # Te0[s2>>24]
211 lw $t7,0($i3) # Te0[s3>>24]
212
213 lw $s0,0($key0)
214 lw $s1,4($key0)
215 lw $s2,8($key0)
216 lw $s3,12($key0)
217
218 xor $t0,$t8
219 xor $t1,$t9
220 xor $t2,$t10
221 xor $t3,$t11
222
223 xor $t0,$t4
224 xor $t1,$t5
225 xor $t2,$t6
226 xor $t3,$t7
227
228 sub $cnt,1
229 $PTR_ADD $key0,16
230 xor $s0,$t0
231 xor $s1,$t1
232 xor $s2,$t2
233 xor $s3,$t3
234 .set noreorder
235 bnez $cnt,.Loop_enc
236 _xtr $i0,$s1,16-2
237
238 .set reorder
239 _xtr $i1,$s2,16-2
240 _xtr $i2,$s3,16-2
241 _xtr $i3,$s0,16-2
242 and $i0,0x3fc
243 and $i1,0x3fc
244 and $i2,0x3fc
245 and $i3,0x3fc
246 $PTR_ADD $i0,$Tbl
247 $PTR_ADD $i1,$Tbl
248 $PTR_ADD $i2,$Tbl
249 $PTR_ADD $i3,$Tbl
250 lbu $t0,2($i0) # Te4[s1>>16]
251 lbu $t1,2($i1) # Te4[s2>>16]
252 lbu $t2,2($i2) # Te4[s3>>16]
253 lbu $t3,2($i3) # Te4[s0>>16]
254
255 _xtr $i0,$s2,8-2
256 _xtr $i1,$s3,8-2
257 _xtr $i2,$s0,8-2
258 _xtr $i3,$s1,8-2
259 and $i0,0x3fc
260 and $i1,0x3fc
261 and $i2,0x3fc
262 and $i3,0x3fc
263 $PTR_ADD $i0,$Tbl
264 $PTR_ADD $i1,$Tbl
265 $PTR_ADD $i2,$Tbl
266 $PTR_ADD $i3,$Tbl
267 lbu $t4,2($i0) # Te4[s2>>8]
268 lbu $t5,2($i1) # Te4[s3>>8]
269 lbu $t6,2($i2) # Te4[s0>>8]
270 lbu $t7,2($i3) # Te4[s1>>8]
271
272 _xtr $i0,$s0,24-2
273 _xtr $i1,$s1,24-2
274 _xtr $i2,$s2,24-2
275 _xtr $i3,$s3,24-2
276 and $i0,0x3fc
277 and $i1,0x3fc
278 and $i2,0x3fc
279 and $i3,0x3fc
280 $PTR_ADD $i0,$Tbl
281 $PTR_ADD $i1,$Tbl
282 $PTR_ADD $i2,$Tbl
283 $PTR_ADD $i3,$Tbl
284 lbu $t8,2($i0) # Te4[s0>>24]
285 lbu $t9,2($i1) # Te4[s1>>24]
286 lbu $t10,2($i2) # Te4[s2>>24]
287 lbu $t11,2($i3) # Te4[s3>>24]
288
289 _xtr $i0,$s3,0-2
290 _xtr $i1,$s0,0-2
291 _xtr $i2,$s1,0-2
292 _xtr $i3,$s2,0-2
293 and $i0,0x3fc
294 and $i1,0x3fc
295 and $i2,0x3fc
296 and $i3,0x3fc
297
298 _ins $t0,16
299 _ins $t1,16
300 _ins $t2,16
301 _ins $t3,16
302
303 _ins $t4,8
304 _ins $t5,8
305 _ins $t6,8
306 _ins $t7,8
307
308 xor $t0,$t4
309 xor $t1,$t5
310 xor $t2,$t6
311 xor $t3,$t7
312
313 $PTR_ADD $i0,$Tbl
314 $PTR_ADD $i1,$Tbl
315 $PTR_ADD $i2,$Tbl
316 $PTR_ADD $i3,$Tbl
317 lbu $t4,2($i0) # Te4[s3]
318 lbu $t5,2($i1) # Te4[s0]
319 lbu $t6,2($i2) # Te4[s1]
320 lbu $t7,2($i3) # Te4[s2]
321
322 _ins $t8,24
323 _ins $t9,24
324 _ins $t10,24
325 _ins $t11,24
326
327 lw $s0,0($key0)
328 lw $s1,4($key0)
329 lw $s2,8($key0)
330 lw $s3,12($key0)
331
332 xor $t0,$t8
333 xor $t1,$t9
334 xor $t2,$t10
335 xor $t3,$t11
336
337 _ins $t4,0
338 _ins $t5,0
339 _ins $t6,0
340 _ins $t7,0
341
342 xor $t0,$t4
343 xor $t1,$t5
344 xor $t2,$t6
345 xor $t3,$t7
346
347 xor $s0,$t0
348 xor $s1,$t1
349 xor $s2,$t2
350 xor $s3,$t3
351
352 jr $ra
353.end _mips_AES_encrypt
354
355.align 5
356.globl AES_encrypt
357.ent AES_encrypt
358AES_encrypt:
359 .frame $sp,$FRAMESIZE,$ra
360 .mask $SAVED_REGS_MASK,-$SZREG
361 .set noreorder
362___
363$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
364 .cpload $pf
365___
366$code.=<<___;
367 $PTR_SUB $sp,$FRAMESIZE
368 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
369 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
370 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
371 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
372 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
373 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
374 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
375 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
376 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
377 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
378___
379$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
380 $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
381 $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
382 $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
383 $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
384 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
385___
386$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
387 .cplocal $Tbl
388 .cpsetup $pf,$zero,AES_encrypt
389___
390$code.=<<___;
391 .set reorder
392 la $Tbl,AES_Te # PIC-ified 'load address'
393
394 lwl $s0,0+$MSB($inp)
395 lwl $s1,4+$MSB($inp)
396 lwl $s2,8+$MSB($inp)
397 lwl $s3,12+$MSB($inp)
398 lwr $s0,0+$LSB($inp)
399 lwr $s1,4+$LSB($inp)
400 lwr $s2,8+$LSB($inp)
401 lwr $s3,12+$LSB($inp)
402
403 bal _mips_AES_encrypt
404
405 swr $s0,0+$LSB($out)
406 swr $s1,4+$LSB($out)
407 swr $s2,8+$LSB($out)
408 swr $s3,12+$LSB($out)
409 swl $s0,0+$MSB($out)
410 swl $s1,4+$MSB($out)
411 swl $s2,8+$MSB($out)
412 swl $s3,12+$MSB($out)
413
414 .set noreorder
415 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
416 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
417 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
418 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
419 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
420 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
421 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
422 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
423 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
424 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
425___
426$code.=<<___ if ($flavour =~ /nubi/i);
427 $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
428 $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
429 $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
430 $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
431 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
432___
433$code.=<<___;
434 jr $ra
435 $PTR_ADD $sp,$FRAMESIZE
436.end AES_encrypt
437___
438
439$code.=<<___;
440.align 5
441.ent _mips_AES_decrypt
442_mips_AES_decrypt:
443 .frame $sp,0,$ra
444 .set reorder
445 lw $t0,0($key)
446 lw $t1,4($key)
447 lw $t2,8($key)
448 lw $t3,12($key)
449 lw $cnt,240($key)
450 $PTR_ADD $key0,$key,16
451
452 xor $s0,$t0
453 xor $s1,$t1
454 xor $s2,$t2
455 xor $s3,$t3
456
457 sub $cnt,1
458 _xtr $i0,$s3,16-2
459.Loop_dec:
460 _xtr $i1,$s0,16-2
461 _xtr $i2,$s1,16-2
462 _xtr $i3,$s2,16-2
463 and $i0,0x3fc
464 and $i1,0x3fc
465 and $i2,0x3fc
466 and $i3,0x3fc
467 $PTR_ADD $i0,$Tbl
468 $PTR_ADD $i1,$Tbl
469 $PTR_ADD $i2,$Tbl
470 $PTR_ADD $i3,$Tbl
471 lwl $t0,3($i0) # Td1[s3>>16]
472 lwl $t1,3($i1) # Td1[s0>>16]
473 lwl $t2,3($i2) # Td1[s1>>16]
474 lwl $t3,3($i3) # Td1[s2>>16]
475 lwr $t0,2($i0) # Td1[s3>>16]
476 lwr $t1,2($i1) # Td1[s0>>16]
477 lwr $t2,2($i2) # Td1[s1>>16]
478 lwr $t3,2($i3) # Td1[s2>>16]
479
480 _xtr $i0,$s2,8-2
481 _xtr $i1,$s3,8-2
482 _xtr $i2,$s0,8-2
483 _xtr $i3,$s1,8-2
484 and $i0,0x3fc
485 and $i1,0x3fc
486 and $i2,0x3fc
487 and $i3,0x3fc
488 $PTR_ADD $i0,$Tbl
489 $PTR_ADD $i1,$Tbl
490 $PTR_ADD $i2,$Tbl
491 $PTR_ADD $i3,$Tbl
492 lwl $t4,2($i0) # Td2[s2>>8]
493 lwl $t5,2($i1) # Td2[s3>>8]
494 lwl $t6,2($i2) # Td2[s0>>8]
495 lwl $t7,2($i3) # Td2[s1>>8]
496 lwr $t4,1($i0) # Td2[s2>>8]
497 lwr $t5,1($i1) # Td2[s3>>8]
498 lwr $t6,1($i2) # Td2[s0>>8]
499 lwr $t7,1($i3) # Td2[s1>>8]
500
501 _xtr $i0,$s1,0-2
502 _xtr $i1,$s2,0-2
503 _xtr $i2,$s3,0-2
504 _xtr $i3,$s0,0-2
505 and $i0,0x3fc
506 and $i1,0x3fc
507 and $i2,0x3fc
508 and $i3,0x3fc
509 $PTR_ADD $i0,$Tbl
510 $PTR_ADD $i1,$Tbl
511 $PTR_ADD $i2,$Tbl
512 $PTR_ADD $i3,$Tbl
513 lwl $t8,1($i0) # Td3[s1]
514 lwl $t9,1($i1) # Td3[s2]
515 lwl $t10,1($i2) # Td3[s3]
516 lwl $t11,1($i3) # Td3[s0]
517 lwr $t8,0($i0) # Td3[s1]
518 lwr $t9,0($i1) # Td3[s2]
519 lwr $t10,0($i2) # Td3[s3]
520 lwr $t11,0($i3) # Td3[s0]
521
522 _xtr $i0,$s0,24-2
523 _xtr $i1,$s1,24-2
524 _xtr $i2,$s2,24-2
525 _xtr $i3,$s3,24-2
526 and $i0,0x3fc
527 and $i1,0x3fc
528 and $i2,0x3fc
529 and $i3,0x3fc
530 $PTR_ADD $i0,$Tbl
531 $PTR_ADD $i1,$Tbl
532 $PTR_ADD $i2,$Tbl
533 $PTR_ADD $i3,$Tbl
534
535 xor $t0,$t4
536 xor $t1,$t5
537 xor $t2,$t6
538 xor $t3,$t7
539
540
541 lw $t4,0($i0) # Td0[s0>>24]
542 lw $t5,0($i1) # Td0[s1>>24]
543 lw $t6,0($i2) # Td0[s2>>24]
544 lw $t7,0($i3) # Td0[s3>>24]
545
546 lw $s0,0($key0)
547 lw $s1,4($key0)
548 lw $s2,8($key0)
549 lw $s3,12($key0)
550
551 xor $t0,$t8
552 xor $t1,$t9
553 xor $t2,$t10
554 xor $t3,$t11
555
556 xor $t0,$t4
557 xor $t1,$t5
558 xor $t2,$t6
559 xor $t3,$t7
560
561 sub $cnt,1
562 $PTR_ADD $key0,16
563 xor $s0,$t0
564 xor $s1,$t1
565 xor $s2,$t2
566 xor $s3,$t3
567 .set noreorder
568 bnez $cnt,.Loop_dec
569 _xtr $i0,$s3,16-2
570
571 .set reorder
572 lw $t4,1024($Tbl) # prefetch Td4
573 lw $t5,1024+32($Tbl)
574 lw $t6,1024+64($Tbl)
575 lw $t7,1024+96($Tbl)
576 lw $t8,1024+128($Tbl)
577 lw $t9,1024+160($Tbl)
578 lw $t10,1024+192($Tbl)
579 lw $t11,1024+224($Tbl)
580
581 _xtr $i0,$s3,16
582 _xtr $i1,$s0,16
583 _xtr $i2,$s1,16
584 _xtr $i3,$s2,16
585 and $i0,0xff
586 and $i1,0xff
587 and $i2,0xff
588 and $i3,0xff
589 $PTR_ADD $i0,$Tbl
590 $PTR_ADD $i1,$Tbl
591 $PTR_ADD $i2,$Tbl
592 $PTR_ADD $i3,$Tbl
593 lbu $t0,1024($i0) # Td4[s3>>16]
594 lbu $t1,1024($i1) # Td4[s0>>16]
595 lbu $t2,1024($i2) # Td4[s1>>16]
596 lbu $t3,1024($i3) # Td4[s2>>16]
597
598 _xtr $i0,$s2,8
599 _xtr $i1,$s3,8
600 _xtr $i2,$s0,8
601 _xtr $i3,$s1,8
602 and $i0,0xff
603 and $i1,0xff
604 and $i2,0xff
605 and $i3,0xff
606 $PTR_ADD $i0,$Tbl
607 $PTR_ADD $i1,$Tbl
608 $PTR_ADD $i2,$Tbl
609 $PTR_ADD $i3,$Tbl
610 lbu $t4,1024($i0) # Td4[s2>>8]
611 lbu $t5,1024($i1) # Td4[s3>>8]
612 lbu $t6,1024($i2) # Td4[s0>>8]
613 lbu $t7,1024($i3) # Td4[s1>>8]
614
615 _xtr $i0,$s0,24
616 _xtr $i1,$s1,24
617 _xtr $i2,$s2,24
618 _xtr $i3,$s3,24
619 $PTR_ADD $i0,$Tbl
620 $PTR_ADD $i1,$Tbl
621 $PTR_ADD $i2,$Tbl
622 $PTR_ADD $i3,$Tbl
623 lbu $t8,1024($i0) # Td4[s0>>24]
624 lbu $t9,1024($i1) # Td4[s1>>24]
625 lbu $t10,1024($i2) # Td4[s2>>24]
626 lbu $t11,1024($i3) # Td4[s3>>24]
627
628 _xtr $i0,$s1,0
629 _xtr $i1,$s2,0
630 _xtr $i2,$s3,0
631 _xtr $i3,$s0,0
632
633 _ins $t0,16
634 _ins $t1,16
635 _ins $t2,16
636 _ins $t3,16
637
638 _ins $t4,8
639 _ins $t5,8
640 _ins $t6,8
641 _ins $t7,8
642
643 xor $t0,$t4
644 xor $t1,$t5
645 xor $t2,$t6
646 xor $t3,$t7
647
648 $PTR_ADD $i0,$Tbl
649 $PTR_ADD $i1,$Tbl
650 $PTR_ADD $i2,$Tbl
651 $PTR_ADD $i3,$Tbl
652 lbu $t4,1024($i0) # Td4[s1]
653 lbu $t5,1024($i1) # Td4[s2]
654 lbu $t6,1024($i2) # Td4[s3]
655 lbu $t7,1024($i3) # Td4[s0]
656
657 _ins $t8,24
658 _ins $t9,24
659 _ins $t10,24
660 _ins $t11,24
661
662 lw $s0,0($key0)
663 lw $s1,4($key0)
664 lw $s2,8($key0)
665 lw $s3,12($key0)
666
667 _ins $t4,0
668 _ins $t5,0
669 _ins $t6,0
670 _ins $t7,0
671
672
673 xor $t0,$t8
674 xor $t1,$t9
675 xor $t2,$t10
676 xor $t3,$t11
677
678 xor $t0,$t4
679 xor $t1,$t5
680 xor $t2,$t6
681 xor $t3,$t7
682
683 xor $s0,$t0
684 xor $s1,$t1
685 xor $s2,$t2
686 xor $s3,$t3
687
688 jr $ra
689.end _mips_AES_decrypt
690
691.align 5
692.globl AES_decrypt
693.ent AES_decrypt
694AES_decrypt:
695 .frame $sp,$FRAMESIZE,$ra
696 .mask $SAVED_REGS_MASK,-$SZREG
697 .set noreorder
698___
699$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
700 .cpload $pf
701___
702$code.=<<___;
703 $PTR_SUB $sp,$FRAMESIZE
704 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
705 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
706 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
707 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
708 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
709 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
710 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
711 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
712 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
713 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
714___
715$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
716 $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
717 $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
718 $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
719 $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
720 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
721___
722$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
723 .cplocal $Tbl
724 .cpsetup $pf,$zero,AES_decrypt
725___
726$code.=<<___;
727 .set reorder
728 la $Tbl,AES_Td # PIC-ified 'load address'
729
730 lwl $s0,0+$MSB($inp)
731 lwl $s1,4+$MSB($inp)
732 lwl $s2,8+$MSB($inp)
733 lwl $s3,12+$MSB($inp)
734 lwr $s0,0+$LSB($inp)
735 lwr $s1,4+$LSB($inp)
736 lwr $s2,8+$LSB($inp)
737 lwr $s3,12+$LSB($inp)
738
739 bal _mips_AES_decrypt
740
741 swr $s0,0+$LSB($out)
742 swr $s1,4+$LSB($out)
743 swr $s2,8+$LSB($out)
744 swr $s3,12+$LSB($out)
745 swl $s0,0+$MSB($out)
746 swl $s1,4+$MSB($out)
747 swl $s2,8+$MSB($out)
748 swl $s3,12+$MSB($out)
749
750 .set noreorder
751 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
752 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
753 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
754 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
755 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
756 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
757 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
758 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
759 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
760 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
761___
762$code.=<<___ if ($flavour =~ /nubi/i);
763 $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
764 $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
765 $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
766 $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
767 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
768___
769$code.=<<___;
770 jr $ra
771 $PTR_ADD $sp,$FRAMESIZE
772.end AES_decrypt
773___
774}}}
775
776{{{
777my $FRAMESIZE=8*$SZREG;
778my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000;
779
780my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3);
781my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
782my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
783my ($rcon,$cnt)=($gp,$fp);
784
785$code.=<<___;
786.align 5
787.ent _mips_AES_set_encrypt_key
788_mips_AES_set_encrypt_key:
789 .frame $sp,0,$ra
790 .set noreorder
791 beqz $inp,.Lekey_done
792 li $t0,-1
793 beqz $key,.Lekey_done
794 $PTR_ADD $rcon,$Tbl,1024+256
795
796 .set reorder
797 lwl $rk0,0+$MSB($inp) # load 128 bits
798 lwl $rk1,4+$MSB($inp)
799 lwl $rk2,8+$MSB($inp)
800 lwl $rk3,12+$MSB($inp)
801 li $at,128
802 lwr $rk0,0+$LSB($inp)
803 lwr $rk1,4+$LSB($inp)
804 lwr $rk2,8+$LSB($inp)
805 lwr $rk3,12+$LSB($inp)
806 .set noreorder
807 beq $bits,$at,.L128bits
808 li $cnt,10
809
810 .set reorder
811 lwl $rk4,16+$MSB($inp) # load 192 bits
812 lwl $rk5,20+$MSB($inp)
813 li $at,192
814 lwr $rk4,16+$LSB($inp)
815 lwr $rk5,20+$LSB($inp)
816 .set noreorder
817 beq $bits,$at,.L192bits
818 li $cnt,8
819
820 .set reorder
821 lwl $rk6,24+$MSB($inp) # load 256 bits
822 lwl $rk7,28+$MSB($inp)
823 li $at,256
824 lwr $rk6,24+$LSB($inp)
825 lwr $rk7,28+$LSB($inp)
826 .set noreorder
827 beq $bits,$at,.L256bits
828 li $cnt,7
829
830 b .Lekey_done
831 li $t0,-2
832
833.align 4
834.L128bits:
835 .set reorder
836 srl $i0,$rk3,16
837 srl $i1,$rk3,8
838 and $i0,0xff
839 and $i1,0xff
840 and $i2,$rk3,0xff
841 srl $i3,$rk3,24
842 $PTR_ADD $i0,$Tbl
843 $PTR_ADD $i1,$Tbl
844 $PTR_ADD $i2,$Tbl
845 $PTR_ADD $i3,$Tbl
846 lbu $i0,1024($i0)
847 lbu $i1,1024($i1)
848 lbu $i2,1024($i2)
849 lbu $i3,1024($i3)
850
851 sw $rk0,0($key)
852 sw $rk1,4($key)
853 sw $rk2,8($key)
854 sw $rk3,12($key)
855 sub $cnt,1
856 $PTR_ADD $key,16
857
858 _bias $i0,24
859 _bias $i1,16
860 _bias $i2,8
861 _bias $i3,0
862
863 xor $rk0,$i0
864 lw $i0,0($rcon)
865 xor $rk0,$i1
866 xor $rk0,$i2
867 xor $rk0,$i3
868 xor $rk0,$i0
869
870 xor $rk1,$rk0
871 xor $rk2,$rk1
872 xor $rk3,$rk2
873
874 .set noreorder
875 bnez $cnt,.L128bits
876 $PTR_ADD $rcon,4
877
878 sw $rk0,0($key)
879 sw $rk1,4($key)
880 sw $rk2,8($key)
881 li $cnt,10
882 sw $rk3,12($key)
883 li $t0,0
884 sw $cnt,80($key)
885 b .Lekey_done
886 $PTR_SUB $key,10*16
887
888.align 4
889.L192bits:
890 .set reorder
891 srl $i0,$rk5,16
892 srl $i1,$rk5,8
893 and $i0,0xff
894 and $i1,0xff
895 and $i2,$rk5,0xff
896 srl $i3,$rk5,24
897 $PTR_ADD $i0,$Tbl
898 $PTR_ADD $i1,$Tbl
899 $PTR_ADD $i2,$Tbl
900 $PTR_ADD $i3,$Tbl
901 lbu $i0,1024($i0)
902 lbu $i1,1024($i1)
903 lbu $i2,1024($i2)
904 lbu $i3,1024($i3)
905
906 sw $rk0,0($key)
907 sw $rk1,4($key)
908 sw $rk2,8($key)
909 sw $rk3,12($key)
910 sw $rk4,16($key)
911 sw $rk5,20($key)
912 sub $cnt,1
913 $PTR_ADD $key,24
914
915 _bias $i0,24
916 _bias $i1,16
917 _bias $i2,8
918 _bias $i3,0
919
920 xor $rk0,$i0
921 lw $i0,0($rcon)
922 xor $rk0,$i1
923 xor $rk0,$i2
924 xor $rk0,$i3
925 xor $rk0,$i0
926
927 xor $rk1,$rk0
928 xor $rk2,$rk1
929 xor $rk3,$rk2
930 xor $rk4,$rk3
931 xor $rk5,$rk4
932
933 .set noreorder
934 bnez $cnt,.L192bits
935 $PTR_ADD $rcon,4
936
937 sw $rk0,0($key)
938 sw $rk1,4($key)
939 sw $rk2,8($key)
940 li $cnt,12
941 sw $rk3,12($key)
942 li $t0,0
943 sw $cnt,48($key)
944 b .Lekey_done
945 $PTR_SUB $key,12*16
946
947.align 4
948.L256bits:
949 .set reorder
950 srl $i0,$rk7,16
951 srl $i1,$rk7,8
952 and $i0,0xff
953 and $i1,0xff
954 and $i2,$rk7,0xff
955 srl $i3,$rk7,24
956 $PTR_ADD $i0,$Tbl
957 $PTR_ADD $i1,$Tbl
958 $PTR_ADD $i2,$Tbl
959 $PTR_ADD $i3,$Tbl
960 lbu $i0,1024($i0)
961 lbu $i1,1024($i1)
962 lbu $i2,1024($i2)
963 lbu $i3,1024($i3)
964
965 sw $rk0,0($key)
966 sw $rk1,4($key)
967 sw $rk2,8($key)
968 sw $rk3,12($key)
969 sw $rk4,16($key)
970 sw $rk5,20($key)
971 sw $rk6,24($key)
972 sw $rk7,28($key)
973 sub $cnt,1
974
975 _bias $i0,24
976 _bias $i1,16
977 _bias $i2,8
978 _bias $i3,0
979
980 xor $rk0,$i0
981 lw $i0,0($rcon)
982 xor $rk0,$i1
983 xor $rk0,$i2
984 xor $rk0,$i3
985 xor $rk0,$i0
986
987 xor $rk1,$rk0
988 xor $rk2,$rk1
989 xor $rk3,$rk2
990 beqz $cnt,.L256bits_done
991
992 srl $i0,$rk3,24
993 srl $i1,$rk3,16
994 srl $i2,$rk3,8
995 and $i3,$rk3,0xff
996 and $i1,0xff
997 and $i2,0xff
998 $PTR_ADD $i0,$Tbl
999 $PTR_ADD $i1,$Tbl
1000 $PTR_ADD $i2,$Tbl
1001 $PTR_ADD $i3,$Tbl
1002 lbu $i0,1024($i0)
1003 lbu $i1,1024($i1)
1004 lbu $i2,1024($i2)
1005 lbu $i3,1024($i3)
1006 sll $i0,24
1007 sll $i1,16
1008 sll $i2,8
1009
1010 xor $rk4,$i0
1011 xor $rk4,$i1
1012 xor $rk4,$i2
1013 xor $rk4,$i3
1014
1015 xor $rk5,$rk4
1016 xor $rk6,$rk5
1017 xor $rk7,$rk6
1018
1019 $PTR_ADD $key,32
1020 .set noreorder
1021 b .L256bits
1022 $PTR_ADD $rcon,4
1023
1024.L256bits_done:
1025 sw $rk0,32($key)
1026 sw $rk1,36($key)
1027 sw $rk2,40($key)
1028 li $cnt,14
1029 sw $rk3,44($key)
1030 li $t0,0
1031 sw $cnt,48($key)
1032 $PTR_SUB $key,12*16
1033
1034.Lekey_done:
1035 jr $ra
1036 nop
1037.end _mips_AES_set_encrypt_key
1038
1039.globl AES_set_encrypt_key
1040.ent AES_set_encrypt_key
1041AES_set_encrypt_key:
1042 .frame $sp,$FRAMESIZE,$ra
1043 .mask $SAVED_REGS_MASK,-$SZREG
1044 .set noreorder
1045___
1046$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
1047 .cpload $pf
1048___
1049$code.=<<___;
1050 $PTR_SUB $sp,$FRAMESIZE
1051 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
1052 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
1053___
1054$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1055 $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
1056 $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
1057 $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
1058 $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
1059 $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
1060___
1061$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
1062 .cplocal $Tbl
1063 .cpsetup $pf,$zero,AES_set_encrypt_key
1064___
1065$code.=<<___;
1066 .set reorder
1067 la $Tbl,AES_Te # PIC-ified 'load address'
1068
1069 bal _mips_AES_set_encrypt_key
1070
1071 .set noreorder
1072 move $a0,$t0
1073 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
1074 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
1075___
1076$code.=<<___ if ($flavour =~ /nubi/i);
1077	$REG_L	$s3,$FRAMESIZE-3*$SZREG($sp)	# must match prologue save slots
1078	$REG_L	$s2,$FRAMESIZE-4*$SZREG($sp)
1079	$REG_L	$s1,$FRAMESIZE-5*$SZREG($sp)
1080	$REG_L	$s0,$FRAMESIZE-6*$SZREG($sp)
1081	$REG_L	$gp,$FRAMESIZE-7*$SZREG($sp)
1082___
1083$code.=<<___;
1084 jr $ra
1085 $PTR_ADD $sp,$FRAMESIZE
1086.end AES_set_encrypt_key
1087___
1088
1089my ($head,$tail)=($inp,$bits);
1090my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
1091my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
1092$code.=<<___;
1093.align 5
1094.globl AES_set_decrypt_key
1095.ent AES_set_decrypt_key
1096AES_set_decrypt_key:
1097 .frame $sp,$FRAMESIZE,$ra
1098 .mask $SAVED_REGS_MASK,-$SZREG
1099 .set noreorder
1100___
1101$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
1102 .cpload $pf
1103___
1104$code.=<<___;
1105 $PTR_SUB $sp,$FRAMESIZE
1106 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
1107 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
1108___
1109$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1110 $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
1111 $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
1112 $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
1113 $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
1114 $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
1115___
1116$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
1117 .cplocal $Tbl
1118 .cpsetup $pf,$zero,AES_set_decrypt_key
1119___
1120$code.=<<___;
1121 .set reorder
1122 la $Tbl,AES_Te # PIC-ified 'load address'
1123
1124 bal _mips_AES_set_encrypt_key
1125
1126 bltz $t0,.Ldkey_done
1127
1128 sll $at,$cnt,4
1129 $PTR_ADD $head,$key,0
1130 $PTR_ADD $tail,$key,$at
1131.align 4
1132.Lswap:
1133 lw $rk0,0($head)
1134 lw $rk1,4($head)
1135 lw $rk2,8($head)
1136 lw $rk3,12($head)
1137 lw $rk4,0($tail)
1138 lw $rk5,4($tail)
1139 lw $rk6,8($tail)
1140 lw $rk7,12($tail)
1141 sw $rk0,0($tail)
1142 sw $rk1,4($tail)
1143 sw $rk2,8($tail)
1144 sw $rk3,12($tail)
1145 $PTR_ADD $head,16
1146 $PTR_SUB $tail,16
1147 sw $rk4,-16($head)
1148 sw $rk5,-12($head)
1149 sw $rk6,-8($head)
1150 sw $rk7,-4($head)
1151 bne $head,$tail,.Lswap
1152
1153 lw $tp1,16($key) # modulo-scheduled
1154 lui $x80808080,0x8080
1155 sub $cnt,1
1156 or $x80808080,0x8080
1157 sll $cnt,2
1158 $PTR_ADD $key,16
1159 lui $x1b1b1b1b,0x1b1b
1160 nor $x7f7f7f7f,$zero,$x80808080
1161 or $x1b1b1b1b,0x1b1b
1162.align 4
1163.Lmix:
1164 and $m,$tp1,$x80808080
1165 and $tp2,$tp1,$x7f7f7f7f
1166 srl $tp4,$m,7
1167 addu $tp2,$tp2 # tp2<<1
1168 subu $m,$tp4
1169 and $m,$x1b1b1b1b
1170 xor $tp2,$m
1171
1172 and $m,$tp2,$x80808080
1173 and $tp4,$tp2,$x7f7f7f7f
1174 srl $tp8,$m,7
1175 addu $tp4,$tp4 # tp4<<1
1176 subu $m,$tp8
1177 and $m,$x1b1b1b1b
1178 xor $tp4,$m
1179
1180 and $m,$tp4,$x80808080
1181 and $tp8,$tp4,$x7f7f7f7f
1182 srl $tp9,$m,7
1183 addu $tp8,$tp8 # tp8<<1
1184 subu $m,$tp9
1185 and $m,$x1b1b1b1b
1186 xor $tp8,$m
1187
1188 xor $tp9,$tp8,$tp1
1189 xor $tpe,$tp8,$tp4
1190 xor $tpb,$tp9,$tp2
1191 xor $tpd,$tp9,$tp4
1192
1193 _ror $tp1,$tpd,16
1194 xor $tpe,$tp2
1195 _ror $tp2,$tpd,-16
1196 xor $tpe,$tp1
1197 _ror $tp1,$tp9,8
1198 xor $tpe,$tp2
1199 _ror $tp2,$tp9,-24
1200 xor $tpe,$tp1
1201 _ror $tp1,$tpb,24
1202 xor $tpe,$tp2
1203 _ror $tp2,$tpb,-8
1204 xor $tpe,$tp1
1205 lw $tp1,4($key) # modulo-scheduled
1206 xor $tpe,$tp2
1207 sub $cnt,1
1208 sw $tpe,0($key)
1209 $PTR_ADD $key,4
1210 bnez $cnt,.Lmix
1211
1212 li $t0,0
1213.Ldkey_done:
1214 .set noreorder
1215 move $a0,$t0
1216 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
1217 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
1218___
1219$code.=<<___ if ($flavour =~ /nubi/i);
1220	$REG_L	$s3,$FRAMESIZE-3*$SZREG($sp)	# must match prologue save slots
1221	$REG_L	$s2,$FRAMESIZE-4*$SZREG($sp)
1222	$REG_L	$s1,$FRAMESIZE-5*$SZREG($sp)
1223	$REG_L	$s0,$FRAMESIZE-6*$SZREG($sp)
1224	$REG_L	$gp,$FRAMESIZE-7*$SZREG($sp)
1225___
1226$code.=<<___;
1227 jr $ra
1228 $PTR_ADD $sp,$FRAMESIZE
1229.end AES_set_decrypt_key
1230___
1231}}}
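# The .Lmix loop above applies InvMixColumns to the round keys with a
# SWAR trick: the $x80808080/$x7f7f7f7f/$x1b1b1b1b constants let one
# register-wide operation double four GF(2^8) bytes at once, and doubling
# once, twice and three times yields the x2/x4/x8 multiples ($tp2, $tp4,
# $tp8) from which the 9/b/d/e coefficients of InvMixColumns are
# assembled.  A minimal Perl model of the doubling step (illustrative
# only; not used by the generator):
sub _demo_xtime4 {
	my $w = shift;				# four GF(2^8) bytes in one word
	my $m = $w & 0x80808080;		# high bit of every byte
	my $d = ($w & 0x7f7f7f7f) << 1;		# double each byte, carry dropped
	$m = ($m - ($m >> 7)) & 0x1b1b1b1b;	# 0x1b wherever a byte overflowed
	return $d ^ $m;				# reduce mod x^8+x^4+x^3+x+1
}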
1232
1233######################################################################
1234# Tables are kept in endian-neutral manner
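# E.g. the first Te0 entry below is stored as the byte string c6 63 63 a5,
# which a word load sees as 0xc66363a5 on a big-endian core and as
# 0xa56363c6 on a little-endian one; the $MSB/$LSB-parameterized loads and
# the fixed-offset lwl/lwr table references above are arranged so that both
# views produce the same result.  In Perl terms (illustrative only):
#
#	unpack('N',"\xc6\x63\x63\xa5") == 0xc66363a5	# big-endian view
#	unpack('V',"\xc6\x63\x63\xa5") == 0xa56363c6	# little-endian view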
1235$code.=<<___;
1236.rdata
1237.align 6
1238AES_Te:
1239.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0
1240.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
1241.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd
1242.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54
1243.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03
1244.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d
1245.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62
1246.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a
1247.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d
1248.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87
1249.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb
1250.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b
1251.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67
1252.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea
1253.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7
1254.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b
1255.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c
1256.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a
1257.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41
1258.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f
1259.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4
1260.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08
1261.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73
1262.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f
1263.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52
1264.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e
1265.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1
1266.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5
1267.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36
1268.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d
1269.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69
1270.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f
1271.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e
1272.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e
1273.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2
1274.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb
1275.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d
1276.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce
1277.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e
1278.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97
1279.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68
1280.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c
1281.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f
1282.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed
1283.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46
1284.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b
1285.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4
1286.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a
1287.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a
1288.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16
1289.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7
1290.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94
1291.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10
1292.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81
1293.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44
1294.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3
1295.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe
1296.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a
1297.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc
1298.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04
1299.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1
1300.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63
1301.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a
1302.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d
1303.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14
1304.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f
1305.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2
1306.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39
1307.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2
1308.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47
1309.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7
1310.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95
1311.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98
1312.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f
1313.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e
1314.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83
1315.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29
1316.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c
1317.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2
1318.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76
1319.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56
1320.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e
1321.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a
1322.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4
1323.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e
1324.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6
1325.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4
1326.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b
1327.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43
1328.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7
1329.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64
1330.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0
1331.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa
1332.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25
1333.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e
1334.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18
1335.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88
1336.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72
1337.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1
1338.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51
1339.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c
1340.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21
1341.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc
1342.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85
1343.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42
1344.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa
1345.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05
1346.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12
1347.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f
1348.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0
1349.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58
1350.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9
1351.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13
1352.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33
1353.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70
1354.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7
1355.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22
1356.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20
1357.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff
1358.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a
1359.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8
1360.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17
1361.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31
1362.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8
1363.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0
1364.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11
1365.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc
1366.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a
1367
1368.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4
1369.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
1370.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
1371.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
1372.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
1373.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
1374.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
1375.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
1376.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
1377.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
1378.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
1379.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
1380.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
1381.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
1382.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
1383.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
1384.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
1385.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
1386.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
1387.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
1388.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
1389.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
1390.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
1391.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
1392.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
1393.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
1394.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
1395.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
1396.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
1397.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
1398.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
1399.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
1400
1401.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon
1402.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
1403.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
1404.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
1405.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
1406
1407.align 6
1408AES_Td:
1409.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0
1410.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96
1411.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1
1412.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93
1413.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6
1414.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25
1415.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7
1416.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f
1417.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67
1418.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1
1419.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12
1420.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6
1421.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95
1422.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda
1423.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3
1424.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44
1425.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78
1426.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd
1427.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17
1428.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4
1429.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82
1430.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45
1431.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84
1432.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94
1433.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19
1434.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7
1435.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2
1436.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a
1437.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03
1438.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5
1439.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2
1440.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c
1441.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92
1442.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1
1443.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5
1444.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a
1445.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0
1446.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75
1447.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa
1448.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51
1449.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d
1450.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46
1451.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05
1452.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff
1453.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97
1454.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77
1455.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88
1456.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb
1457.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9
1458.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00
1459.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48
1460.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e
1461.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56
1462.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27
1463.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21
1464.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a
1465.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f
1466.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e
1467.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2
1468.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16
1469.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5
1470.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d
1471.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad
1472.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8
1473.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c
1474.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd
1475.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc
1476.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34
1477.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc
1478.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63
1479.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10
1480.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20
1481.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8
1482.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d
1483.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3
1484.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0
1485.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99
1486.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22
1487.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a
1488.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef
1489.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1
1490.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36
1491.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28
1492.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4
1493.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d
1494.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62
1495.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8
1496.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5
1497.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c
1498.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3
1499.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7
1500.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b
1501.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4
1502.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8
1503.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e
1504.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6
1505.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce
1506.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6
1507.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31
1508.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0
1509.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6
1510.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15
1511.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7
1512.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f
1513.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d
1514.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf
1515.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b
1516.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f
1517.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d
1518.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e
1519.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52
1520.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13
1521.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a
1522.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89
1523.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35
1524.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c
1525.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f
1526.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf
1527.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b
1528.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86
1529.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e
1530.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f
1531.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c
1532.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41
1533.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde
1534.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90
1535.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70
1536.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42
1537
1538.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4
1539.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
1540.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
1541.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
1542.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
1543.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
1544.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
1545.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
1546.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
1547.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
1548.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
1549.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
1550.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
1551.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
1552.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
1553.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
1554.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
1555.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
1556.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
1557.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
1558.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
1559.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
1560.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
1561.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
1562.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
1563.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
1564.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1565.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1566.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1567.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1568.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1569.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1570___
1571
1572foreach (split("\n",$code)) {
1573 s/\`([^\`]*)\`/eval $1/ge;
1574
1575	# the made-up instructions _xtr, _ins, _ror and _bias cope
1576	# with byte-order dependencies...
1577 if (/^\s+_/) {
1578 s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/;
1579
1580 s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/
1581 sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
1582 : eval("24-$3"))/e or
1583 s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
1584 sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
1585 : eval("24-$3"))/e or
1586 s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/
1587 sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
1588 : eval("$3*-1"))/e or
1589 s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
1590 sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
1591 : eval("($3-16)&31"))/e;
1592
1593 s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/
1594 sprintf("sll\t$1,$2,$3")/e or
1595 s/srl\s+(\$[0-9]+),(\$[0-9]+),0/
1596 sprintf("and\t$1,$2,0xff")/e or
1597 s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/;
1598 }
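	# A worked example of the rewrites above (registers are illustrative,
	# nothing here is emitted literally): "_xtr $10,$11,16" is already in
	# three-operand form, so the first substitution leaves it alone, and
	# the _xtr rule turns it into "srl $10,$11,16" on a big-endian target
	# or "srl $10,$11,8" (i.e. 24-16) on a little-endian one.  The fix-ups
	# below then rewrite a zero-distance srl as an "and ...,0xff" and a
	# negative-distance srl as an sll, so little-endian "_xtr $10,$11,24"
	# ends up as "and $10,$11,0xff".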
1599
1600	# convert lwl/lwr and swl/swr offsets to little-endian order
1601 if (!$big_endian && /^\s+[sl]w[lr]\s+/) {
1602 s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/
1603 sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or
1604 s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/
1605 sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e;
1606 }
1607
1608 print $_,"\n";
1609}
1610
1611close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-parisc.pl b/src/lib/libcrypto/aes/asm/aes-parisc.pl
new file mode 100644
index 0000000000..c36b6a2270
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-parisc.pl
@@ -0,0 +1,1021 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for PA-RISC.
11#
12# June 2009.
13#
14# The module is a mechanical transliteration of aes-sparcv9.pl, but with
15# a twist: S-boxes are compressed even further, down to 1K+256B. On
16# PA-7100LC performance is ~40% better than gcc 3.2 generated code and
17# is about 33 cycles per byte processed with a 128-bit key. Newer CPUs
18# perform at 16 cycles per byte. It's not faster than code generated
19# by the vendor compiler, but recall that it has compressed S-boxes,
20# which require extra processing.
21#
22# Special thanks to polarhome.com for providing an HP-UX account.
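# How the compression works (a sketch, assuming the usual T-table
# convention; Te0..Te3 are illustrative names): a conventional
# implementation keeps four 1KB lookup tables, where each table is just
# a byte-rotation of the previous one (Te1[x] == Te0[x] rotated by 8
# bits, and so on).  Here only one 1KB table plus the raw 256-byte
# S-box (for the last round) are stored, and the three missing columns
# are recovered at run time by rotating the looked-up word with shd
# (the _ror macro below), trading a few cycles for a 4x smaller table.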
23
24$flavour = shift;
25$output = shift;
26open STDOUT,">$output" or die "can't open $output: $!";
27
28if ($flavour =~ /64/) {
29 $LEVEL ="2.0W";
30 $SIZE_T =8;
31 $FRAME_MARKER =80;
32 $SAVED_RP =16;
33 $PUSH ="std";
34 $PUSHMA ="std,ma";
35 $POP ="ldd";
36 $POPMB ="ldd,mb";
37} else {
38 $LEVEL ="1.0";
39 $SIZE_T =4;
40 $FRAME_MARKER =48;
41 $SAVED_RP =20;
42 $PUSH ="stw";
43 $PUSHMA ="stwm";
44 $POP ="ldw";
45 $POPMB ="ldwm";
46}
47
48$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
49 # [+ argument transfer]
50$inp="%r26"; # arg0
51$out="%r25"; # arg1
52$key="%r24"; # arg2
53
54($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4");
55($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8");
56
57($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7,
58 $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) =
59("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16",
60"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26");
61
62$tbl="%r28";
63$rounds="%r29";
64
65$code=<<___;
66 .LEVEL $LEVEL
67 .SPACE \$TEXT\$
68 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
69
70 .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
71 .ALIGN 64
72AES_encrypt
73 .PROC
74 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
75 .ENTRY
76 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
77 $PUSHMA %r3,$FRAME(%sp)
78 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
79 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
80 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
81 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
82 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
83 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
84 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
85 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
86 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
87 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
88 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
89 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
90 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
91 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
92 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
93
94	blr	%r0,$tbl		; capture the program counter
95	ldi	3,$t0
96L\$enc_pic
97	andcm	$tbl,$t0,$tbl		; mask privilege-level bits off the PC
98	ldo	L\$AES_Te-L\$enc_pic($tbl),$tbl	; PIC-safe address of Te
99
100	and	$inp,$t0,$t0		; misaligned bytes, if any
101	sub	$inp,$t0,$inp		; round input pointer down
102 ldw 0($inp),$s0
103 ldw 4($inp),$s1
104 ldw 8($inp),$s2
105 comib,= 0,$t0,L\$enc_inp_aligned
106 ldw 12($inp),$s3
107
108	sh3addl	$t0,%r0,$t0		; misalignment in bits
109	subi	32,$t0,$t0
110	mtctl	$t0,%cr11		; set %sar, the shift-amount register
111	ldw	16($inp),$t1		; load one extra word
112	vshd	$s0,$s1,$s0		; funnel-shift input into alignment
113	vshd	$s1,$s2,$s1
114	vshd	$s2,$s3,$s2
115	vshd	$s3,$t1,$s3
116
117L\$enc_inp_aligned
118 bl _parisc_AES_encrypt,%r31
119 nop
120
121	extru,<>	$out,31,2,%r0	; nullify the branch if output is unaligned
122 b L\$enc_out_aligned
123 nop
124
125 _srm $s0,24,$acc0
126 _srm $s0,16,$acc1
127 stb $acc0,0($out)
128 _srm $s0,8,$acc2
129 stb $acc1,1($out)
130 _srm $s1,24,$acc4
131 stb $acc2,2($out)
132 _srm $s1,16,$acc5
133 stb $s0,3($out)
134 _srm $s1,8,$acc6
135 stb $acc4,4($out)
136 _srm $s2,24,$acc0
137 stb $acc5,5($out)
138 _srm $s2,16,$acc1
139 stb $acc6,6($out)
140 _srm $s2,8,$acc2
141 stb $s1,7($out)
142 _srm $s3,24,$acc4
143 stb $acc0,8($out)
144 _srm $s3,16,$acc5
145 stb $acc1,9($out)
146 _srm $s3,8,$acc6
147 stb $acc2,10($out)
148 stb $s2,11($out)
149 stb $acc4,12($out)
150 stb $acc5,13($out)
151 stb $acc6,14($out)
152 b L\$enc_done
153 stb $s3,15($out)
154
155L\$enc_out_aligned
156 stw $s0,0($out)
157 stw $s1,4($out)
158 stw $s2,8($out)
159 stw $s3,12($out)
160
161L\$enc_done
162 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
163 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
164 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
165 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
166 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
167 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
168 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
169 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
170 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
171 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
172 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
173 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
174 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
175 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
176 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
177 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
178 bv (%r2)
179 .EXIT
180 $POPMB -$FRAME(%sp),%r3
181 .PROCEND
182
183 .ALIGN 16
184_parisc_AES_encrypt
185 .PROC
186 .CALLINFO MILLICODE
187 .ENTRY
188 ldw 240($key),$rounds
189 ldw 0($key),$t0
190 ldw 4($key),$t1
191 ldw 8($key),$t2
192 _srm $rounds,1,$rounds
193 xor $t0,$s0,$s0
194 ldw 12($key),$t3
195 _srm $s0,24,$acc0
196 xor $t1,$s1,$s1
197 ldw 16($key),$t0
198 _srm $s1,16,$acc1
199 xor $t2,$s2,$s2
200 ldw 20($key),$t1
201 xor $t3,$s3,$s3
202 ldw 24($key),$t2
203 ldw 28($key),$t3
204L\$enc_loop
205 _srm $s2,8,$acc2
206 ldwx,s $acc0($tbl),$acc0
207 _srm $s3,0,$acc3
208 ldwx,s $acc1($tbl),$acc1
209 _srm $s1,24,$acc4
210 ldwx,s $acc2($tbl),$acc2
211 _srm $s2,16,$acc5
212 ldwx,s $acc3($tbl),$acc3
213 _srm $s3,8,$acc6
214 ldwx,s $acc4($tbl),$acc4
215 _srm $s0,0,$acc7
216 ldwx,s $acc5($tbl),$acc5
217 _srm $s2,24,$acc8
218 ldwx,s $acc6($tbl),$acc6
219 _srm $s3,16,$acc9
220 ldwx,s $acc7($tbl),$acc7
221 _srm $s0,8,$acc10
222 ldwx,s $acc8($tbl),$acc8
223 _srm $s1,0,$acc11
224 ldwx,s $acc9($tbl),$acc9
225 _srm $s3,24,$acc12
226 ldwx,s $acc10($tbl),$acc10
227 _srm $s0,16,$acc13
228 ldwx,s $acc11($tbl),$acc11
229 _srm $s1,8,$acc14
230 ldwx,s $acc12($tbl),$acc12
231 _srm $s2,0,$acc15
232 ldwx,s $acc13($tbl),$acc13
233 ldwx,s $acc14($tbl),$acc14
234 ldwx,s $acc15($tbl),$acc15
235 addib,= -1,$rounds,L\$enc_last
236 ldo 32($key),$key
237
238 _ror $acc1,8,$acc1
239 xor $acc0,$t0,$t0
240 ldw 0($key),$s0
241 _ror $acc2,16,$acc2
242 xor $acc1,$t0,$t0
243 ldw 4($key),$s1
244 _ror $acc3,24,$acc3
245 xor $acc2,$t0,$t0
246 ldw 8($key),$s2
247 _ror $acc5,8,$acc5
248 xor $acc3,$t0,$t0
249 ldw 12($key),$s3
250 _ror $acc6,16,$acc6
251 xor $acc4,$t1,$t1
252 _ror $acc7,24,$acc7
253 xor $acc5,$t1,$t1
254 _ror $acc9,8,$acc9
255 xor $acc6,$t1,$t1
256 _ror $acc10,16,$acc10
257 xor $acc7,$t1,$t1
258 _ror $acc11,24,$acc11
259 xor $acc8,$t2,$t2
260 _ror $acc13,8,$acc13
261 xor $acc9,$t2,$t2
262 _ror $acc14,16,$acc14
263 xor $acc10,$t2,$t2
264 _ror $acc15,24,$acc15
265 xor $acc11,$t2,$t2
266 xor $acc12,$acc14,$acc14
267 xor $acc13,$t3,$t3
268 _srm $t0,24,$acc0
269 xor $acc14,$t3,$t3
270 _srm $t1,16,$acc1
271 xor $acc15,$t3,$t3
272
273 _srm $t2,8,$acc2
274 ldwx,s $acc0($tbl),$acc0
275 _srm $t3,0,$acc3
276 ldwx,s $acc1($tbl),$acc1
277 _srm $t1,24,$acc4
278 ldwx,s $acc2($tbl),$acc2
279 _srm $t2,16,$acc5
280 ldwx,s $acc3($tbl),$acc3
281 _srm $t3,8,$acc6
282 ldwx,s $acc4($tbl),$acc4
283 _srm $t0,0,$acc7
284 ldwx,s $acc5($tbl),$acc5
285 _srm $t2,24,$acc8
286 ldwx,s $acc6($tbl),$acc6
287 _srm $t3,16,$acc9
288 ldwx,s $acc7($tbl),$acc7
289 _srm $t0,8,$acc10
290 ldwx,s $acc8($tbl),$acc8
291 _srm $t1,0,$acc11
292 ldwx,s $acc9($tbl),$acc9
293 _srm $t3,24,$acc12
294 ldwx,s $acc10($tbl),$acc10
295 _srm $t0,16,$acc13
296 ldwx,s $acc11($tbl),$acc11
297 _srm $t1,8,$acc14
298 ldwx,s $acc12($tbl),$acc12
299 _srm $t2,0,$acc15
300 ldwx,s $acc13($tbl),$acc13
301 _ror $acc1,8,$acc1
302 ldwx,s $acc14($tbl),$acc14
303
304 _ror $acc2,16,$acc2
305 xor $acc0,$s0,$s0
306 ldwx,s $acc15($tbl),$acc15
307 _ror $acc3,24,$acc3
308 xor $acc1,$s0,$s0
309 ldw 16($key),$t0
310 _ror $acc5,8,$acc5
311 xor $acc2,$s0,$s0
312 ldw 20($key),$t1
313 _ror $acc6,16,$acc6
314 xor $acc3,$s0,$s0
315 ldw 24($key),$t2
316 _ror $acc7,24,$acc7
317 xor $acc4,$s1,$s1
318 ldw 28($key),$t3
319 _ror $acc9,8,$acc9
320 xor $acc5,$s1,$s1
321 ldw 1024+0($tbl),%r0 ; prefetch te4
322 _ror $acc10,16,$acc10
323 xor $acc6,$s1,$s1
324 ldw 1024+32($tbl),%r0 ; prefetch te4
325 _ror $acc11,24,$acc11
326 xor $acc7,$s1,$s1
327 ldw 1024+64($tbl),%r0 ; prefetch te4
328 _ror $acc13,8,$acc13
329 xor $acc8,$s2,$s2
330 ldw 1024+96($tbl),%r0 ; prefetch te4
331 _ror $acc14,16,$acc14
332 xor $acc9,$s2,$s2
333 ldw 1024+128($tbl),%r0 ; prefetch te4
334 _ror $acc15,24,$acc15
335 xor $acc10,$s2,$s2
336 ldw 1024+160($tbl),%r0 ; prefetch te4
337 _srm $s0,24,$acc0
338 xor $acc11,$s2,$s2
339 ldw 1024+192($tbl),%r0 ; prefetch te4
340 xor $acc12,$acc14,$acc14
341 xor $acc13,$s3,$s3
342 ldw 1024+224($tbl),%r0 ; prefetch te4
343 _srm $s1,16,$acc1
344 xor $acc14,$s3,$s3
345 b L\$enc_loop
346 xor $acc15,$s3,$s3
347
348 .ALIGN 16
349L\$enc_last
350	ldo	1024($tbl),$rounds	; repurpose $rounds as Te4 pointer
351 _ror $acc1,8,$acc1
352 xor $acc0,$t0,$t0
353 ldw 0($key),$s0
354 _ror $acc2,16,$acc2
355 xor $acc1,$t0,$t0
356 ldw 4($key),$s1
357 _ror $acc3,24,$acc3
358 xor $acc2,$t0,$t0
359 ldw 8($key),$s2
360 _ror $acc5,8,$acc5
361 xor $acc3,$t0,$t0
362 ldw 12($key),$s3
363 _ror $acc6,16,$acc6
364 xor $acc4,$t1,$t1
365 _ror $acc7,24,$acc7
366 xor $acc5,$t1,$t1
367 _ror $acc9,8,$acc9
368 xor $acc6,$t1,$t1
369 _ror $acc10,16,$acc10
370 xor $acc7,$t1,$t1
371 _ror $acc11,24,$acc11
372 xor $acc8,$t2,$t2
373 _ror $acc13,8,$acc13
374 xor $acc9,$t2,$t2
375 _ror $acc14,16,$acc14
376 xor $acc10,$t2,$t2
377 _ror $acc15,24,$acc15
378 xor $acc11,$t2,$t2
379 xor $acc12,$acc14,$acc14
380 xor $acc13,$t3,$t3
381 _srm $t0,24,$acc0
382 xor $acc14,$t3,$t3
383 _srm $t1,16,$acc1
384 xor $acc15,$t3,$t3
385
386 _srm $t2,8,$acc2
387 ldbx $acc0($rounds),$acc0
388 _srm $t1,24,$acc4
389 ldbx $acc1($rounds),$acc1
390 _srm $t2,16,$acc5
391 _srm $t3,0,$acc3
392 ldbx $acc2($rounds),$acc2
393 ldbx $acc3($rounds),$acc3
394 _srm $t3,8,$acc6
395 ldbx $acc4($rounds),$acc4
396 _srm $t2,24,$acc8
397 ldbx $acc5($rounds),$acc5
398 _srm $t3,16,$acc9
399 _srm $t0,0,$acc7
400 ldbx $acc6($rounds),$acc6
401 ldbx $acc7($rounds),$acc7
402 _srm $t0,8,$acc10
403 ldbx $acc8($rounds),$acc8
404 _srm $t3,24,$acc12
405 ldbx $acc9($rounds),$acc9
406 _srm $t0,16,$acc13
407 _srm $t1,0,$acc11
408 ldbx $acc10($rounds),$acc10
409 _srm $t1,8,$acc14
410 ldbx $acc11($rounds),$acc11
411 ldbx $acc12($rounds),$acc12
412 ldbx $acc13($rounds),$acc13
413 _srm $t2,0,$acc15
414 ldbx $acc14($rounds),$acc14
415
416	dep	$acc0,7,8,$acc3		; gather S-box bytes back into words
417 ldbx $acc15($rounds),$acc15
418 dep $acc4,7,8,$acc7
419 dep $acc1,15,8,$acc3
420 dep $acc5,15,8,$acc7
421 dep $acc2,23,8,$acc3
422 dep $acc6,23,8,$acc7
423 xor $acc3,$s0,$s0
424 xor $acc7,$s1,$s1
425 dep $acc8,7,8,$acc11
426 dep $acc12,7,8,$acc15
427 dep $acc9,15,8,$acc11
428 dep $acc13,15,8,$acc15
429 dep $acc10,23,8,$acc11
430 dep $acc14,23,8,$acc15
431 xor $acc11,$s2,$s2
432
433 bv (%r31)
434 .EXIT
435 xor $acc15,$s3,$s3
436 .PROCEND
437
438 .ALIGN 64
439L\$AES_Te
440 .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
441 .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
442 .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
443 .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
444 .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
445 .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
446 .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
447 .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
448 .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
449 .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
450 .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
451 .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
452 .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
453 .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
454 .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
455 .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
456 .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
457 .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
458 .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
459 .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
460 .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
461 .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
462 .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
463 .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
464 .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
465 .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
466 .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
467 .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
468 .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
469 .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
470 .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
471 .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
472 .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
473 .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
474 .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
475 .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
476 .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
477 .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
478 .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
479 .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
480 .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
481 .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
482 .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
483 .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
484 .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
485 .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
486 .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
487 .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
488 .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
489 .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
490 .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
491 .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
492 .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
493 .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
494 .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
495 .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
496 .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
497 .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
498 .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
499 .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
500 .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
501 .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
502 .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
503 .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
504 .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
505 .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
506 .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
507 .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
508 .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
509 .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
510 .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
511 .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
512 .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
513 .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
514 .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
515 .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
516 .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
517 .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
518 .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
519 .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
520 .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
521 .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
522 .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
523 .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
524 .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
525 .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
526 .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
527 .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
528 .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
529 .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
530 .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
531 .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
532 .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
533 .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
534 .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
535 .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
536___
537
538$code.=<<___;
539 .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
540 .ALIGN 16
541AES_decrypt
542 .PROC
543 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
544 .ENTRY
545 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
546 $PUSHMA %r3,$FRAME(%sp)
547 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
548 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
549 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
550 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
551 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
552 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
553 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
554 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
555 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
556 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
557 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
558 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
559 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
560 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
561 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
562
563	blr	%r0,$tbl		; capture the program counter
564	ldi	3,$t0
565L\$dec_pic
566	andcm	$tbl,$t0,$tbl		; mask privilege-level bits off the PC
567	ldo	L\$AES_Td-L\$dec_pic($tbl),$tbl	; PIC-safe address of Td
568
569 and $inp,$t0,$t0
570 sub $inp,$t0,$inp
571 ldw 0($inp),$s0
572 ldw 4($inp),$s1
573 ldw 8($inp),$s2
574 comib,= 0,$t0,L\$dec_inp_aligned
575 ldw 12($inp),$s3
576
577 sh3addl $t0,%r0,$t0
578 subi 32,$t0,$t0
579 mtctl $t0,%cr11
580 ldw 16($inp),$t1
581 vshd $s0,$s1,$s0
582 vshd $s1,$s2,$s1
583 vshd $s2,$s3,$s2
584 vshd $s3,$t1,$s3
585
586L\$dec_inp_aligned
587 bl _parisc_AES_decrypt,%r31
588 nop
589
590 extru,<> $out,31,2,%r0
591 b L\$dec_out_aligned
592 nop
593
594 _srm $s0,24,$acc0
595 _srm $s0,16,$acc1
596 stb $acc0,0($out)
597 _srm $s0,8,$acc2
598 stb $acc1,1($out)
599 _srm $s1,24,$acc4
600 stb $acc2,2($out)
601 _srm $s1,16,$acc5
602 stb $s0,3($out)
603 _srm $s1,8,$acc6
604 stb $acc4,4($out)
605 _srm $s2,24,$acc0
606 stb $acc5,5($out)
607 _srm $s2,16,$acc1
608 stb $acc6,6($out)
609 _srm $s2,8,$acc2
610 stb $s1,7($out)
611 _srm $s3,24,$acc4
612 stb $acc0,8($out)
613 _srm $s3,16,$acc5
614 stb $acc1,9($out)
615 _srm $s3,8,$acc6
616 stb $acc2,10($out)
617 stb $s2,11($out)
618 stb $acc4,12($out)
619 stb $acc5,13($out)
620 stb $acc6,14($out)
621 b L\$dec_done
622 stb $s3,15($out)
623
624L\$dec_out_aligned
625 stw $s0,0($out)
626 stw $s1,4($out)
627 stw $s2,8($out)
628 stw $s3,12($out)
629
630L\$dec_done
631 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
632 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
633 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
634 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
635 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
636 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
637 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
638 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
639 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
640 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
641 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
642 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
643 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
644 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
645 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
646 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
647 bv (%r2)
648 .EXIT
649 $POPMB -$FRAME(%sp),%r3
650 .PROCEND
651
652 .ALIGN 16
653_parisc_AES_decrypt
654 .PROC
655 .CALLINFO MILLICODE
656 .ENTRY
657 ldw 240($key),$rounds
658 ldw 0($key),$t0
659 ldw 4($key),$t1
660 ldw 8($key),$t2
661 ldw 12($key),$t3
662 _srm $rounds,1,$rounds
663 xor $t0,$s0,$s0
664 ldw 16($key),$t0
665 xor $t1,$s1,$s1
666 ldw 20($key),$t1
667 _srm $s0,24,$acc0
668 xor $t2,$s2,$s2
669 ldw 24($key),$t2
670 xor $t3,$s3,$s3
671 ldw 28($key),$t3
672 _srm $s3,16,$acc1
673L\$dec_loop
674 _srm $s2,8,$acc2
675 ldwx,s $acc0($tbl),$acc0
676 _srm $s1,0,$acc3
677 ldwx,s $acc1($tbl),$acc1
678 _srm $s1,24,$acc4
679 ldwx,s $acc2($tbl),$acc2
680 _srm $s0,16,$acc5
681 ldwx,s $acc3($tbl),$acc3
682 _srm $s3,8,$acc6
683 ldwx,s $acc4($tbl),$acc4
684 _srm $s2,0,$acc7
685 ldwx,s $acc5($tbl),$acc5
686 _srm $s2,24,$acc8
687 ldwx,s $acc6($tbl),$acc6
688 _srm $s1,16,$acc9
689 ldwx,s $acc7($tbl),$acc7
690 _srm $s0,8,$acc10
691 ldwx,s $acc8($tbl),$acc8
692 _srm $s3,0,$acc11
693 ldwx,s $acc9($tbl),$acc9
694 _srm $s3,24,$acc12
695 ldwx,s $acc10($tbl),$acc10
696 _srm $s2,16,$acc13
697 ldwx,s $acc11($tbl),$acc11
698 _srm $s1,8,$acc14
699 ldwx,s $acc12($tbl),$acc12
700 _srm $s0,0,$acc15
701 ldwx,s $acc13($tbl),$acc13
702 ldwx,s $acc14($tbl),$acc14
703 ldwx,s $acc15($tbl),$acc15
704 addib,= -1,$rounds,L\$dec_last
705 ldo 32($key),$key
706
707 _ror $acc1,8,$acc1
708 xor $acc0,$t0,$t0
709 ldw 0($key),$s0
710 _ror $acc2,16,$acc2
711 xor $acc1,$t0,$t0
712 ldw 4($key),$s1
713 _ror $acc3,24,$acc3
714 xor $acc2,$t0,$t0
715 ldw 8($key),$s2
716 _ror $acc5,8,$acc5
717 xor $acc3,$t0,$t0
718 ldw 12($key),$s3
719 _ror $acc6,16,$acc6
720 xor $acc4,$t1,$t1
721 _ror $acc7,24,$acc7
722 xor $acc5,$t1,$t1
723 _ror $acc9,8,$acc9
724 xor $acc6,$t1,$t1
725 _ror $acc10,16,$acc10
726 xor $acc7,$t1,$t1
727 _ror $acc11,24,$acc11
728 xor $acc8,$t2,$t2
729 _ror $acc13,8,$acc13
730 xor $acc9,$t2,$t2
731 _ror $acc14,16,$acc14
732 xor $acc10,$t2,$t2
733 _ror $acc15,24,$acc15
734 xor $acc11,$t2,$t2
735 xor $acc12,$acc14,$acc14
736 xor $acc13,$t3,$t3
737 _srm $t0,24,$acc0
738 xor $acc14,$t3,$t3
739 xor $acc15,$t3,$t3
740 _srm $t3,16,$acc1
741
742 _srm $t2,8,$acc2
743 ldwx,s $acc0($tbl),$acc0
744 _srm $t1,0,$acc3
745 ldwx,s $acc1($tbl),$acc1
746 _srm $t1,24,$acc4
747 ldwx,s $acc2($tbl),$acc2
748 _srm $t0,16,$acc5
749 ldwx,s $acc3($tbl),$acc3
750 _srm $t3,8,$acc6
751 ldwx,s $acc4($tbl),$acc4
752 _srm $t2,0,$acc7
753 ldwx,s $acc5($tbl),$acc5
754 _srm $t2,24,$acc8
755 ldwx,s $acc6($tbl),$acc6
756 _srm $t1,16,$acc9
757 ldwx,s $acc7($tbl),$acc7
758 _srm $t0,8,$acc10
759 ldwx,s $acc8($tbl),$acc8
760 _srm $t3,0,$acc11
761 ldwx,s $acc9($tbl),$acc9
762 _srm $t3,24,$acc12
763 ldwx,s $acc10($tbl),$acc10
764 _srm $t2,16,$acc13
765 ldwx,s $acc11($tbl),$acc11
766 _srm $t1,8,$acc14
767 ldwx,s $acc12($tbl),$acc12
768 _srm $t0,0,$acc15
769 ldwx,s $acc13($tbl),$acc13
770 _ror $acc1,8,$acc1
771 ldwx,s $acc14($tbl),$acc14
772
773 _ror $acc2,16,$acc2
774 xor $acc0,$s0,$s0
775 ldwx,s $acc15($tbl),$acc15
776 _ror $acc3,24,$acc3
777 xor $acc1,$s0,$s0
778 ldw 16($key),$t0
779 _ror $acc5,8,$acc5
780 xor $acc2,$s0,$s0
781 ldw 20($key),$t1
782 _ror $acc6,16,$acc6
783 xor $acc3,$s0,$s0
784 ldw 24($key),$t2
785 _ror $acc7,24,$acc7
786 xor $acc4,$s1,$s1
787 ldw 28($key),$t3
788 _ror $acc9,8,$acc9
789 xor $acc5,$s1,$s1
790 ldw 1024+0($tbl),%r0 ; prefetch td4
791 _ror $acc10,16,$acc10
792 xor $acc6,$s1,$s1
793 ldw 1024+32($tbl),%r0 ; prefetch td4
794 _ror $acc11,24,$acc11
795 xor $acc7,$s1,$s1
796 ldw 1024+64($tbl),%r0 ; prefetch td4
797 _ror $acc13,8,$acc13
798 xor $acc8,$s2,$s2
799 ldw 1024+96($tbl),%r0 ; prefetch td4
800 _ror $acc14,16,$acc14
801 xor $acc9,$s2,$s2
802 ldw 1024+128($tbl),%r0 ; prefetch td4
803 _ror $acc15,24,$acc15
804 xor $acc10,$s2,$s2
805 ldw 1024+160($tbl),%r0 ; prefetch td4
806 _srm $s0,24,$acc0
807 xor $acc11,$s2,$s2
808 ldw 1024+192($tbl),%r0 ; prefetch td4
809 xor $acc12,$acc14,$acc14
810 xor $acc13,$s3,$s3
811 ldw 1024+224($tbl),%r0 ; prefetch td4
812 xor $acc14,$s3,$s3
813 xor $acc15,$s3,$s3
814 b L\$dec_loop
815 _srm $s3,16,$acc1
816
817 .ALIGN 16
818L\$dec_last
819	ldo	1024($tbl),$rounds	; repurpose $rounds as Td4 pointer
820 _ror $acc1,8,$acc1
821 xor $acc0,$t0,$t0
822 ldw 0($key),$s0
823 _ror $acc2,16,$acc2
824 xor $acc1,$t0,$t0
825 ldw 4($key),$s1
826 _ror $acc3,24,$acc3
827 xor $acc2,$t0,$t0
828 ldw 8($key),$s2
829 _ror $acc5,8,$acc5
830 xor $acc3,$t0,$t0
831 ldw 12($key),$s3
832 _ror $acc6,16,$acc6
833 xor $acc4,$t1,$t1
834 _ror $acc7,24,$acc7
835 xor $acc5,$t1,$t1
836 _ror $acc9,8,$acc9
837 xor $acc6,$t1,$t1
838 _ror $acc10,16,$acc10
839 xor $acc7,$t1,$t1
840 _ror $acc11,24,$acc11
841 xor $acc8,$t2,$t2
842 _ror $acc13,8,$acc13
843 xor $acc9,$t2,$t2
844 _ror $acc14,16,$acc14
845 xor $acc10,$t2,$t2
846 _ror $acc15,24,$acc15
847 xor $acc11,$t2,$t2
848 xor $acc12,$acc14,$acc14
849 xor $acc13,$t3,$t3
850 _srm $t0,24,$acc0
851 xor $acc14,$t3,$t3
852 xor $acc15,$t3,$t3
853 _srm $t3,16,$acc1
854
855 _srm $t2,8,$acc2
856 ldbx $acc0($rounds),$acc0
857 _srm $t1,24,$acc4
858 ldbx $acc1($rounds),$acc1
859 _srm $t0,16,$acc5
860 _srm $t1,0,$acc3
861 ldbx $acc2($rounds),$acc2
862 ldbx $acc3($rounds),$acc3
863 _srm $t3,8,$acc6
864 ldbx $acc4($rounds),$acc4
865 _srm $t2,24,$acc8
866 ldbx $acc5($rounds),$acc5
867 _srm $t1,16,$acc9
868 _srm $t2,0,$acc7
869 ldbx $acc6($rounds),$acc6
870 ldbx $acc7($rounds),$acc7
871 _srm $t0,8,$acc10
872 ldbx $acc8($rounds),$acc8
873 _srm $t3,24,$acc12
874 ldbx $acc9($rounds),$acc9
875 _srm $t2,16,$acc13
876 _srm $t3,0,$acc11
877 ldbx $acc10($rounds),$acc10
878 _srm $t1,8,$acc14
879 ldbx $acc11($rounds),$acc11
880 ldbx $acc12($rounds),$acc12
881 ldbx $acc13($rounds),$acc13
882 _srm $t0,0,$acc15
883 ldbx $acc14($rounds),$acc14
884
885 dep $acc0,7,8,$acc3
886 ldbx $acc15($rounds),$acc15
887 dep $acc4,7,8,$acc7
888 dep $acc1,15,8,$acc3
889 dep $acc5,15,8,$acc7
890 dep $acc2,23,8,$acc3
891 dep $acc6,23,8,$acc7
892 xor $acc3,$s0,$s0
893 xor $acc7,$s1,$s1
894 dep $acc8,7,8,$acc11
895 dep $acc12,7,8,$acc15
896 dep $acc9,15,8,$acc11
897 dep $acc13,15,8,$acc15
898 dep $acc10,23,8,$acc11
899 dep $acc14,23,8,$acc15
900 xor $acc11,$s2,$s2
901
902 bv (%r31)
903 .EXIT
904 xor $acc15,$s3,$s3
905 .PROCEND
906
907 .ALIGN 64
908L\$AES_Td
909 .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
910 .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
911 .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
912 .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
913 .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
914 .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
915 .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
916 .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
917 .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
918 .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
919 .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
920 .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
921 .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
922 .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
923 .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
924 .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
925 .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
926 .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
927 .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
928 .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
929 .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
930 .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
931 .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
932 .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
933 .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
934 .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
935 .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
936 .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
937 .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
938 .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
939 .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
940 .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
941 .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
942 .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
943 .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
944 .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
945 .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
946 .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
947 .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
948 .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
949 .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
950 .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
951 .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
952 .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
953 .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
954 .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
955 .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
956 .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
957 .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
958 .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
959 .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
960 .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
961 .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
962 .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
963 .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
964 .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
965 .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
966 .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
967 .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
968 .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
969 .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
970 .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
971 .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
972 .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
973 .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
974 .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
975 .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
976 .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
977 .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
978 .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
979 .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
980 .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
981 .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
982 .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
983 .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
984 .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
985 .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
986 .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
987 .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
988 .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
989 .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
990 .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
991 .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
992 .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
993 .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
994 .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
995 .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
996 .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
997 .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
998 .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
999 .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1000 .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1001 .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1002 .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1003 .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1004 .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1005 .STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
1006___
1007
1008foreach (split("\n",$code)) {
1009 s/\`([^\`]*)\`/eval $1/ge;
1010
1011	# translate the made-up instructions _ror and _srm
1012 s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or
1013
1014 s/_srm(\s+%r[0-9]+),([0-9]+),/
1015 $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
1016 : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
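	# For instance (illustrative only, shown with interpolated
	# registers): "_ror %r10,8,%r10" becomes "shd %r10,%r10,8,%r10",
	# a rotate right by 8, while "_srm %r1,24,%r9" becomes
	# "extru %r1,7,8,%r9" in 32-bit mode (31-24=7: extract the 8-bit
	# field ending at bit 7, i.e. the most significant byte) or
	# "extrd,u %r1,39,8,%r9" in 64-bit mode.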
1017
1018 s/,\*/,/ if ($SIZE_T==4);
1019 print $_,"\n";
1020}
1021close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl
new file mode 100644
index 0000000000..7c52cbe5f9
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-ppc.pl
@@ -0,0 +1,1365 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# Needs more work: key setup, CBC routine...
11#
12# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with a
13# 128-bit key, which is ~40% better than 64-bit code generated by gcc
14# 4.0. But these are not the ones currently used! Their "compact"
15# counterparts are, for security reasons: the much smaller tables leave
16# less of a cache footprint to time. ppc_AES_encrypt_compact runs at 1/2
17# of ppc_AES_encrypt speed, and ppc_AES_decrypt_compact at 1/3 of ppc_AES_decrypt.
18
19# February 2010
20#
21# Rescheduling instructions to favour the Power6 pipeline gave a 10%
22# performance improvement on the platform in question (and a marginal
23# improvement even on others). It should be noted that Power6 fails
24# to process a byte in 18 cycles, taking 23 instead, because it can
25# issue 4 load instructions not in two cycles, but only in 3. As a
26# result the non-compact block subroutines are 25% slower than one
27# would expect. Compact functions scale better, because their work is
28# purely computational and scales perfectly with clock frequency. To
29# be specific, ppc_AES_encrypt_compact operates at 42 cycles per byte,
30# while ppc_AES_decrypt_compact at 55 (in a 64-bit build).
31
32$flavour = shift;
33
34if ($flavour =~ /64/) {
35 $SIZE_T =8;
36 $LRSAVE =2*$SIZE_T;
37 $STU ="stdu";
38 $POP ="ld";
39 $PUSH ="std";
40} elsif ($flavour =~ /32/) {
41 $SIZE_T =4;
42 $LRSAVE =$SIZE_T;
43 $STU ="stwu";
44 $POP ="lwz";
45 $PUSH ="stw";
46} else { die "nonsense $flavour"; }
47
48$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
50( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
51die "can't locate ppc-xlate.pl";
52
53open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
54
55$FRAME=32*$SIZE_T;
56
57sub _data_word()
58{ my $i;
59 while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
60}
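# Note that _data_word emits each table word twice.  A sketch of the
# intent (an inference, not stated in the source): with 8 bytes per
# entry, a 4-byte load taken at byte offset 0..3 within an entry
# returns the same value rotated by 0..24 bits, so the four table
# pointers $Tbl0..$Tbl3 can address pre-rotated views of a single
# table instead of four separate ones.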
61
62$sp="r1";
63$toc="r2";
64$inp="r3";
65$out="r4";
66$key="r5";
67
68$Tbl0="r3";
69$Tbl1="r6";
70$Tbl2="r7";
71$Tbl3="r2";
72
73$s0="r8";
74$s1="r9";
75$s2="r10";
76$s3="r11";
77
78$t0="r12";
79$t1="r13";
80$t2="r14";
81$t3="r15";
82
83$acc00="r16";
84$acc01="r17";
85$acc02="r18";
86$acc03="r19";
87
88$acc04="r20";
89$acc05="r21";
90$acc06="r22";
91$acc07="r23";
92
93$acc08="r24";
94$acc09="r25";
95$acc10="r26";
96$acc11="r27";
97
98$acc12="r28";
99$acc13="r29";
100$acc14="r30";
101$acc15="r31";
102
103# stay away from TLS pointer: r13 in the 64-bit ABI, r2 in the 32-bit one
104if ($SIZE_T==8) { die if ($t1 ne "r13"); $t1="r0"; }
105else { die if ($Tbl3 ne "r2"); $Tbl3=$t0; $t0="r0"; }
106$mask80=$Tbl2;
107$mask1b=$Tbl3;
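# $mask80 and $mask1b will hold the 0x80808080 and 0x1b1b1b1b constants
# used by the compact rounds for GF(2^8) doubling.  A sketch of the
# identity involved (standard AES xtime applied to four bytes at once;
# the exact scheduling in the compact code may differ):
#	xtime(x) = ((x & 0x7f7f7f7f) << 1) ^ (((x & 0x80808080) >> 7) * 0x1b)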
108
109$code.=<<___;
110.machine "any"
111.text
112
113.align 7
114LAES_Te:
115	mflr	r0		; save caller's lr
116	bcl	20,31,\$+4	; branch always to next insn, capturing PC in lr
117	mflr	$Tbl0	; vvvvv "distance" between . and 1st data entry
118	addi	$Tbl0,$Tbl0,`128-8`
119	mtlr	r0		; restore lr
120	blr
121 .long 0
122 .byte 0,12,0x14,0,0,0,0,0
123 .space `64-9*4`
124LAES_Td:
125 mflr r0
126 bcl 20,31,\$+4
127 mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
128 addi $Tbl0,$Tbl0,`128-64-8+2048+256`
129 mtlr r0
130 blr
131 .long 0
132 .byte 0,12,0x14,0,0,0,0,0
133 .space `128-64-9*4`
134___
135&_data_word(
136 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
137 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
138 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
139 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
140 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
141 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
142 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
143 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
144 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
145 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
146 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
147 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
148 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
149 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
150 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
151 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
152 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
153 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
154 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
155 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
156 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
157 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
158 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
159 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
160 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
161 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
162 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
163 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
164 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
165 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
166 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
167 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
168 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
169 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
170 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
171 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
172 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
173 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
174 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
175 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
176 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
177 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
178 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
179 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
180 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
181 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
182 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
183 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
184 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
185 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
186 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
187 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
188 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
189 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
190 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
191 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
192 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
193 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
194 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
195 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
196 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
197 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
198 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
199 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
200$code.=<<___;
201.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
202.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
203.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
204.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
205.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
206.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
207.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
208.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
209.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
210.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
211.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
212.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
213.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
214.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
215.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
216.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
217.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
218.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
219.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
220.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
221.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
222.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
223.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
224.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
225.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
226.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
227.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
228.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
229.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
230.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
231.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
232.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
233___
234&_data_word(
235 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
236 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
237 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
238 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
239 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
240 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
241 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
242 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
243 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
244 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
245 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
246 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
247 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
248 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
249 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
250 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
251 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
252 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
253 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
254 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
255 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
256 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
257 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
258 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
259 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
260 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
261 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
262 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
263 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
264 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
265 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
266 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
267 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
268 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
269 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
270 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
271 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
272 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
273 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
274 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
275 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
276 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
277 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
278 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
279 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
280 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
281 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
282 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
283 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
284 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
285 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
286 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
287 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
288 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
289 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
290 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
291 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
292 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
293 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
294 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
295 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
296 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
297 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
298 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
299$code.=<<___;
300.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
301.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
302.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
303.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
304.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
305.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
306.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
307.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
308.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
309.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
310.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
311.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
312.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
313.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
314.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
315.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
316.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
317.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
318.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
319.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
320.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
321.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
322.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
323.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
324.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
325.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
326.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
327.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
328.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
329.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
330.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
331.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
332
333
334.globl .AES_encrypt
335.align 7
336.AES_encrypt:
337 $STU $sp,-$FRAME($sp)
338 mflr r0
339
340 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
341 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
342 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
343 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
344 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
345 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
346 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
347 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
348 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
349 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
350 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
351 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
352 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
353 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
354 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
355 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
356 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
357 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
358 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
359 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
360 $PUSH r0,`$FRAME+$LRSAVE`($sp)
361
362 andi. $t0,$inp,3
363 andi. $t1,$out,3
364 or. $t0,$t0,$t1
365 bne Lenc_unaligned
366
367Lenc_unaligned_ok:
368 lwz $s0,0($inp)
369 lwz $s1,4($inp)
370 lwz $s2,8($inp)
371 lwz $s3,12($inp)
372 bl LAES_Te
373 bl Lppc_AES_encrypt_compact
374 stw $s0,0($out)
375 stw $s1,4($out)
376 stw $s2,8($out)
377 stw $s3,12($out)
378 b Lenc_done
379
380Lenc_unaligned:
381	subfic	$t0,$inp,4096	# distance to the end of 4KB page
382	subfic	$t1,$out,4096
383	andi.	$t0,$t0,4096-16	# at least 16 bytes left on input page?
384	beq	Lenc_xpage	# input block may cross a page boundary
385	andi.	$t1,$t1,4096-16
386	bne	Lenc_unaligned_ok	# neither block crosses a page, word access is safe
387
388Lenc_xpage:				# process block with byte-sized loads/stores
389 lbz $acc00,0($inp)
390 lbz $acc01,1($inp)
391 lbz $acc02,2($inp)
392 lbz $s0,3($inp)
393 lbz $acc04,4($inp)
394 lbz $acc05,5($inp)
395 lbz $acc06,6($inp)
396 lbz $s1,7($inp)
397 lbz $acc08,8($inp)
398 lbz $acc09,9($inp)
399 lbz $acc10,10($inp)
400 insrwi $s0,$acc00,8,0
401 lbz $s2,11($inp)
402 insrwi $s1,$acc04,8,0
403 lbz $acc12,12($inp)
404 insrwi $s0,$acc01,8,8
405 lbz $acc13,13($inp)
406 insrwi $s1,$acc05,8,8
407 lbz $acc14,14($inp)
408 insrwi $s0,$acc02,8,16
409 lbz $s3,15($inp)
410 insrwi $s1,$acc06,8,16
411 insrwi $s2,$acc08,8,0
412 insrwi $s3,$acc12,8,0
413 insrwi $s2,$acc09,8,8
414 insrwi $s3,$acc13,8,8
415 insrwi $s2,$acc10,8,16
416 insrwi $s3,$acc14,8,16
417
418 bl LAES_Te
419 bl Lppc_AES_encrypt_compact
420
421 extrwi $acc00,$s0,8,0
422 extrwi $acc01,$s0,8,8
423 stb $acc00,0($out)
424 extrwi $acc02,$s0,8,16
425 stb $acc01,1($out)
426 stb $acc02,2($out)
427 extrwi $acc04,$s1,8,0
428 stb $s0,3($out)
429 extrwi $acc05,$s1,8,8
430 stb $acc04,4($out)
431 extrwi $acc06,$s1,8,16
432 stb $acc05,5($out)
433 stb $acc06,6($out)
434 extrwi $acc08,$s2,8,0
435 stb $s1,7($out)
436 extrwi $acc09,$s2,8,8
437 stb $acc08,8($out)
438 extrwi $acc10,$s2,8,16
439 stb $acc09,9($out)
440 stb $acc10,10($out)
441 extrwi $acc12,$s3,8,0
442 stb $s2,11($out)
443 extrwi $acc13,$s3,8,8
444 stb $acc12,12($out)
445 extrwi $acc14,$s3,8,16
446 stb $acc13,13($out)
447 stb $acc14,14($out)
448 stb $s3,15($out)
449
450Lenc_done:
451 $POP r0,`$FRAME+$LRSAVE`($sp)
452 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
453 $POP r13,`$FRAME-$SIZE_T*19`($sp)
454 $POP r14,`$FRAME-$SIZE_T*18`($sp)
455 $POP r15,`$FRAME-$SIZE_T*17`($sp)
456 $POP r16,`$FRAME-$SIZE_T*16`($sp)
457 $POP r17,`$FRAME-$SIZE_T*15`($sp)
458 $POP r18,`$FRAME-$SIZE_T*14`($sp)
459 $POP r19,`$FRAME-$SIZE_T*13`($sp)
460 $POP r20,`$FRAME-$SIZE_T*12`($sp)
461 $POP r21,`$FRAME-$SIZE_T*11`($sp)
462 $POP r22,`$FRAME-$SIZE_T*10`($sp)
463 $POP r23,`$FRAME-$SIZE_T*9`($sp)
464 $POP r24,`$FRAME-$SIZE_T*8`($sp)
465 $POP r25,`$FRAME-$SIZE_T*7`($sp)
466 $POP r26,`$FRAME-$SIZE_T*6`($sp)
467 $POP r27,`$FRAME-$SIZE_T*5`($sp)
468 $POP r28,`$FRAME-$SIZE_T*4`($sp)
469 $POP r29,`$FRAME-$SIZE_T*3`($sp)
470 $POP r30,`$FRAME-$SIZE_T*2`($sp)
471 $POP r31,`$FRAME-$SIZE_T*1`($sp)
472 mtlr r0
473 addi $sp,$sp,$FRAME
474 blr
475 .long 0
476 .byte 0,12,4,1,0x80,18,3,0
477 .long 0
478
479.align 5
480Lppc_AES_encrypt:
481 lwz $acc00,240($key)
482 addi $Tbl1,$Tbl0,3
483 lwz $t0,0($key)
484 addi $Tbl2,$Tbl0,2
485 lwz $t1,4($key)
486 addi $Tbl3,$Tbl0,1
487 lwz $t2,8($key)
488 addi $acc00,$acc00,-1
489 lwz $t3,12($key)
490 addi $key,$key,16
491 xor $s0,$s0,$t0
492 xor $s1,$s1,$t1
493 xor $s2,$s2,$t2
494 xor $s3,$s3,$t3
495 mtctr $acc00
496.align 4
497Lenc_loop:
498 rlwinm $acc00,$s0,`32-24+3`,21,28
499 rlwinm $acc01,$s1,`32-24+3`,21,28
500 rlwinm $acc02,$s2,`32-24+3`,21,28
501 rlwinm $acc03,$s3,`32-24+3`,21,28
502 lwz $t0,0($key)
503 rlwinm $acc04,$s1,`32-16+3`,21,28
504 lwz $t1,4($key)
505 rlwinm $acc05,$s2,`32-16+3`,21,28
506 lwz $t2,8($key)
507 rlwinm $acc06,$s3,`32-16+3`,21,28
508 lwz $t3,12($key)
509 rlwinm $acc07,$s0,`32-16+3`,21,28
510 lwzx $acc00,$Tbl0,$acc00
511 rlwinm $acc08,$s2,`32-8+3`,21,28
512 lwzx $acc01,$Tbl0,$acc01
513 rlwinm $acc09,$s3,`32-8+3`,21,28
514 lwzx $acc02,$Tbl0,$acc02
515 rlwinm $acc10,$s0,`32-8+3`,21,28
516 lwzx $acc03,$Tbl0,$acc03
517 rlwinm $acc11,$s1,`32-8+3`,21,28
518 lwzx $acc04,$Tbl1,$acc04
519 rlwinm $acc12,$s3,`0+3`,21,28
520 lwzx $acc05,$Tbl1,$acc05
521 rlwinm $acc13,$s0,`0+3`,21,28
522 lwzx $acc06,$Tbl1,$acc06
523 rlwinm $acc14,$s1,`0+3`,21,28
524 lwzx $acc07,$Tbl1,$acc07
525 rlwinm $acc15,$s2,`0+3`,21,28
526 lwzx $acc08,$Tbl2,$acc08
527 xor $t0,$t0,$acc00
528 lwzx $acc09,$Tbl2,$acc09
529 xor $t1,$t1,$acc01
530 lwzx $acc10,$Tbl2,$acc10
531 xor $t2,$t2,$acc02
532 lwzx $acc11,$Tbl2,$acc11
533 xor $t3,$t3,$acc03
534 lwzx $acc12,$Tbl3,$acc12
535 xor $t0,$t0,$acc04
536 lwzx $acc13,$Tbl3,$acc13
537 xor $t1,$t1,$acc05
538 lwzx $acc14,$Tbl3,$acc14
539 xor $t2,$t2,$acc06
540 lwzx $acc15,$Tbl3,$acc15
541 xor $t3,$t3,$acc07
542 xor $t0,$t0,$acc08
543 xor $t1,$t1,$acc09
544 xor $t2,$t2,$acc10
545 xor $t3,$t3,$acc11
546 xor $s0,$t0,$acc12
547 xor $s1,$t1,$acc13
548 xor $s2,$t2,$acc14
549 xor $s3,$t3,$acc15
550 addi $key,$key,16
551 bdnz- Lenc_loop
552
553 addi $Tbl2,$Tbl0,2048
554 nop
555 lwz $t0,0($key)
556 rlwinm $acc00,$s0,`32-24`,24,31
557 lwz $t1,4($key)
558 rlwinm $acc01,$s1,`32-24`,24,31
559 lwz $t2,8($key)
560 rlwinm $acc02,$s2,`32-24`,24,31
561 lwz $t3,12($key)
562 rlwinm $acc03,$s3,`32-24`,24,31
563 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
564 rlwinm $acc04,$s1,`32-16`,24,31
565 lwz $acc09,`2048+32`($Tbl0)
566 rlwinm $acc05,$s2,`32-16`,24,31
567 lwz $acc10,`2048+64`($Tbl0)
568 rlwinm $acc06,$s3,`32-16`,24,31
569 lwz $acc11,`2048+96`($Tbl0)
570 rlwinm $acc07,$s0,`32-16`,24,31
571 lwz $acc12,`2048+128`($Tbl0)
572 rlwinm $acc08,$s2,`32-8`,24,31
573 lwz $acc13,`2048+160`($Tbl0)
574 rlwinm $acc09,$s3,`32-8`,24,31
575 lwz $acc14,`2048+192`($Tbl0)
576 rlwinm $acc10,$s0,`32-8`,24,31
577 lwz $acc15,`2048+224`($Tbl0)
578 rlwinm $acc11,$s1,`32-8`,24,31
579 lbzx $acc00,$Tbl2,$acc00
580 rlwinm $acc12,$s3,`0`,24,31
581 lbzx $acc01,$Tbl2,$acc01
582 rlwinm $acc13,$s0,`0`,24,31
583 lbzx $acc02,$Tbl2,$acc02
584 rlwinm $acc14,$s1,`0`,24,31
585 lbzx $acc03,$Tbl2,$acc03
586 rlwinm $acc15,$s2,`0`,24,31
587 lbzx $acc04,$Tbl2,$acc04
588 rlwinm $s0,$acc00,24,0,7
589 lbzx $acc05,$Tbl2,$acc05
590 rlwinm $s1,$acc01,24,0,7
591 lbzx $acc06,$Tbl2,$acc06
592 rlwinm $s2,$acc02,24,0,7
593 lbzx $acc07,$Tbl2,$acc07
594 rlwinm $s3,$acc03,24,0,7
595 lbzx $acc08,$Tbl2,$acc08
596 rlwimi $s0,$acc04,16,8,15
597 lbzx $acc09,$Tbl2,$acc09
598 rlwimi $s1,$acc05,16,8,15
599 lbzx $acc10,$Tbl2,$acc10
600 rlwimi $s2,$acc06,16,8,15
601 lbzx $acc11,$Tbl2,$acc11
602 rlwimi $s3,$acc07,16,8,15
603 lbzx $acc12,$Tbl2,$acc12
604 rlwimi $s0,$acc08,8,16,23
605 lbzx $acc13,$Tbl2,$acc13
606 rlwimi $s1,$acc09,8,16,23
607 lbzx $acc14,$Tbl2,$acc14
608 rlwimi $s2,$acc10,8,16,23
609 lbzx $acc15,$Tbl2,$acc15
610 rlwimi $s3,$acc11,8,16,23
611 or $s0,$s0,$acc12
612 or $s1,$s1,$acc13
613 or $s2,$s2,$acc14
614 or $s3,$s3,$acc15
615 xor $s0,$s0,$t0
616 xor $s1,$s1,$t1
617 xor $s2,$s2,$t2
618 xor $s3,$s3,$t3
619 blr
620 .long 0
621 .byte 0,12,0x14,0,0,0,0,0
622
623.align 4
624Lppc_AES_encrypt_compact:
625 lwz $acc00,240($key)
626 addi $Tbl1,$Tbl0,2048
627 lwz $t0,0($key)
628 lis $mask80,0x8080
629 lwz $t1,4($key)
630 lis $mask1b,0x1b1b
631 lwz $t2,8($key)
632 ori $mask80,$mask80,0x8080
633 lwz $t3,12($key)
634 ori $mask1b,$mask1b,0x1b1b
635 addi $key,$key,16
636 mtctr $acc00
637.align 4
638Lenc_compact_loop:
639 xor $s0,$s0,$t0
640 xor $s1,$s1,$t1
641 rlwinm $acc00,$s0,`32-24`,24,31
642 xor $s2,$s2,$t2
643 rlwinm $acc01,$s1,`32-24`,24,31
644 xor $s3,$s3,$t3
645 rlwinm $acc02,$s2,`32-24`,24,31
646 rlwinm $acc03,$s3,`32-24`,24,31
647 rlwinm $acc04,$s1,`32-16`,24,31
648 rlwinm $acc05,$s2,`32-16`,24,31
649 rlwinm $acc06,$s3,`32-16`,24,31
650 rlwinm $acc07,$s0,`32-16`,24,31
651 lbzx $acc00,$Tbl1,$acc00
652 rlwinm $acc08,$s2,`32-8`,24,31
653 lbzx $acc01,$Tbl1,$acc01
654 rlwinm $acc09,$s3,`32-8`,24,31
655 lbzx $acc02,$Tbl1,$acc02
656 rlwinm $acc10,$s0,`32-8`,24,31
657 lbzx $acc03,$Tbl1,$acc03
658 rlwinm $acc11,$s1,`32-8`,24,31
659 lbzx $acc04,$Tbl1,$acc04
660 rlwinm $acc12,$s3,`0`,24,31
661 lbzx $acc05,$Tbl1,$acc05
662 rlwinm $acc13,$s0,`0`,24,31
663 lbzx $acc06,$Tbl1,$acc06
664 rlwinm $acc14,$s1,`0`,24,31
665 lbzx $acc07,$Tbl1,$acc07
666 rlwinm $acc15,$s2,`0`,24,31
667 lbzx $acc08,$Tbl1,$acc08
668 rlwinm $s0,$acc00,24,0,7
669 lbzx $acc09,$Tbl1,$acc09
670 rlwinm $s1,$acc01,24,0,7
671 lbzx $acc10,$Tbl1,$acc10
672 rlwinm $s2,$acc02,24,0,7
673 lbzx $acc11,$Tbl1,$acc11
674 rlwinm $s3,$acc03,24,0,7
675 lbzx $acc12,$Tbl1,$acc12
676 rlwimi $s0,$acc04,16,8,15
677 lbzx $acc13,$Tbl1,$acc13
678 rlwimi $s1,$acc05,16,8,15
679 lbzx $acc14,$Tbl1,$acc14
680 rlwimi $s2,$acc06,16,8,15
681 lbzx $acc15,$Tbl1,$acc15
682 rlwimi $s3,$acc07,16,8,15
683 rlwimi $s0,$acc08,8,16,23
684 rlwimi $s1,$acc09,8,16,23
685 rlwimi $s2,$acc10,8,16,23
686 rlwimi $s3,$acc11,8,16,23
687 lwz $t0,0($key)
688 or $s0,$s0,$acc12
689 lwz $t1,4($key)
690 or $s1,$s1,$acc13
691 lwz $t2,8($key)
692 or $s2,$s2,$acc14
693 lwz $t3,12($key)
694 or $s3,$s3,$acc15
695
696 addi $key,$key,16
697 bdz Lenc_compact_done
698
699 and $acc00,$s0,$mask80 # r1=r0&0x80808080
700 and $acc01,$s1,$mask80
701 and $acc02,$s2,$mask80
702 and $acc03,$s3,$mask80
703 srwi $acc04,$acc00,7 # r1>>7
704 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
705 srwi $acc05,$acc01,7
706 andc $acc09,$s1,$mask80
707 srwi $acc06,$acc02,7
708 andc $acc10,$s2,$mask80
709 srwi $acc07,$acc03,7
710 andc $acc11,$s3,$mask80
711 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
712 sub $acc01,$acc01,$acc05
713 sub $acc02,$acc02,$acc06
714 sub $acc03,$acc03,$acc07
715 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
716 add $acc09,$acc09,$acc09
717 add $acc10,$acc10,$acc10
718 add $acc11,$acc11,$acc11
719 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
720 and $acc01,$acc01,$mask1b
721 and $acc02,$acc02,$mask1b
722 and $acc03,$acc03,$mask1b
723 xor $acc00,$acc00,$acc08 # r2
724 xor $acc01,$acc01,$acc09
725 rotlwi $acc12,$s0,16 # ROTATE(r0,16)
726 xor $acc02,$acc02,$acc10
727 rotlwi $acc13,$s1,16
728 xor $acc03,$acc03,$acc11
729 rotlwi $acc14,$s2,16
730
731 xor $s0,$s0,$acc00 # r0^r2
732 rotlwi $acc15,$s3,16
733 xor $s1,$s1,$acc01
734 rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
735 xor $s2,$s2,$acc02
736 rotrwi $s1,$s1,24
737 xor $s3,$s3,$acc03
738 rotrwi $s2,$s2,24
739 xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
740 rotrwi $s3,$s3,24
741 xor $s1,$s1,$acc01
742 xor $s2,$s2,$acc02
743 xor $s3,$s3,$acc03
744 rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
745 xor $s0,$s0,$acc12 #
746 rotlwi $acc09,$acc13,8
747 xor $s1,$s1,$acc13
748 rotlwi $acc10,$acc14,8
749 xor $s2,$s2,$acc14
750 rotlwi $acc11,$acc15,8
751 xor $s3,$s3,$acc15
752 xor $s0,$s0,$acc08 #
753 xor $s1,$s1,$acc09
754 xor $s2,$s2,$acc10
755 xor $s3,$s3,$acc11
756
757 b Lenc_compact_loop
758.align 4
759Lenc_compact_done:
760 xor $s0,$s0,$t0
761 xor $s1,$s1,$t1
762 xor $s2,$s2,$t2
763 xor $s3,$s3,$t3
764 blr
765 .long 0
766 .byte 0,12,0x14,0,0,0,0,0
767
768.globl .AES_decrypt
769.align 7
770.AES_decrypt:
771 $STU $sp,-$FRAME($sp)
772 mflr r0
773
774 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
775 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
776 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
777 $PUSH r15,`$FRAME-$SIZE_T*17`($sp)
778 $PUSH r16,`$FRAME-$SIZE_T*16`($sp)
779 $PUSH r17,`$FRAME-$SIZE_T*15`($sp)
780 $PUSH r18,`$FRAME-$SIZE_T*14`($sp)
781 $PUSH r19,`$FRAME-$SIZE_T*13`($sp)
782 $PUSH r20,`$FRAME-$SIZE_T*12`($sp)
783 $PUSH r21,`$FRAME-$SIZE_T*11`($sp)
784 $PUSH r22,`$FRAME-$SIZE_T*10`($sp)
785 $PUSH r23,`$FRAME-$SIZE_T*9`($sp)
786 $PUSH r24,`$FRAME-$SIZE_T*8`($sp)
787 $PUSH r25,`$FRAME-$SIZE_T*7`($sp)
788 $PUSH r26,`$FRAME-$SIZE_T*6`($sp)
789 $PUSH r27,`$FRAME-$SIZE_T*5`($sp)
790 $PUSH r28,`$FRAME-$SIZE_T*4`($sp)
791 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
792 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
793 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
794 $PUSH r0,`$FRAME+$LRSAVE`($sp)
795
796 andi. $t0,$inp,3
797 andi. $t1,$out,3
798 or. $t0,$t0,$t1
799 bne Ldec_unaligned
800
801Ldec_unaligned_ok:
802 lwz $s0,0($inp)
803 lwz $s1,4($inp)
804 lwz $s2,8($inp)
805 lwz $s3,12($inp)
806 bl LAES_Td
807 bl Lppc_AES_decrypt_compact
808 stw $s0,0($out)
809 stw $s1,4($out)
810 stw $s2,8($out)
811 stw $s3,12($out)
812 b Ldec_done
813
814Ldec_unaligned:
815	subfic	$t0,$inp,4096	# distance to the end of 4KB page
816	subfic	$t1,$out,4096
817	andi.	$t0,$t0,4096-16	# at least 16 bytes left on input page?
818	beq	Ldec_xpage	# input block may cross a page boundary
819	andi.	$t1,$t1,4096-16
820	bne	Ldec_unaligned_ok	# neither block crosses a page, word access is safe
821
822Ldec_xpage:				# process block with byte-sized loads/stores
823 lbz $acc00,0($inp)
824 lbz $acc01,1($inp)
825 lbz $acc02,2($inp)
826 lbz $s0,3($inp)
827 lbz $acc04,4($inp)
828 lbz $acc05,5($inp)
829 lbz $acc06,6($inp)
830 lbz $s1,7($inp)
831 lbz $acc08,8($inp)
832 lbz $acc09,9($inp)
833 lbz $acc10,10($inp)
834 insrwi $s0,$acc00,8,0
835 lbz $s2,11($inp)
836 insrwi $s1,$acc04,8,0
837 lbz $acc12,12($inp)
838 insrwi $s0,$acc01,8,8
839 lbz $acc13,13($inp)
840 insrwi $s1,$acc05,8,8
841 lbz $acc14,14($inp)
842 insrwi $s0,$acc02,8,16
843 lbz $s3,15($inp)
844 insrwi $s1,$acc06,8,16
845 insrwi $s2,$acc08,8,0
846 insrwi $s3,$acc12,8,0
847 insrwi $s2,$acc09,8,8
848 insrwi $s3,$acc13,8,8
849 insrwi $s2,$acc10,8,16
850 insrwi $s3,$acc14,8,16
851
852 bl LAES_Td
853 bl Lppc_AES_decrypt_compact
854
855 extrwi $acc00,$s0,8,0
856 extrwi $acc01,$s0,8,8
857 stb $acc00,0($out)
858 extrwi $acc02,$s0,8,16
859 stb $acc01,1($out)
860 stb $acc02,2($out)
861 extrwi $acc04,$s1,8,0
862 stb $s0,3($out)
863 extrwi $acc05,$s1,8,8
864 stb $acc04,4($out)
865 extrwi $acc06,$s1,8,16
866 stb $acc05,5($out)
867 stb $acc06,6($out)
868 extrwi $acc08,$s2,8,0
869 stb $s1,7($out)
870 extrwi $acc09,$s2,8,8
871 stb $acc08,8($out)
872 extrwi $acc10,$s2,8,16
873 stb $acc09,9($out)
874 stb $acc10,10($out)
875 extrwi $acc12,$s3,8,0
876 stb $s2,11($out)
877 extrwi $acc13,$s3,8,8
878 stb $acc12,12($out)
879 extrwi $acc14,$s3,8,16
880 stb $acc13,13($out)
881 stb $acc14,14($out)
882 stb $s3,15($out)
883
884Ldec_done:
885 $POP r0,`$FRAME+$LRSAVE`($sp)
886 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
887 $POP r13,`$FRAME-$SIZE_T*19`($sp)
888 $POP r14,`$FRAME-$SIZE_T*18`($sp)
889 $POP r15,`$FRAME-$SIZE_T*17`($sp)
890 $POP r16,`$FRAME-$SIZE_T*16`($sp)
891 $POP r17,`$FRAME-$SIZE_T*15`($sp)
892 $POP r18,`$FRAME-$SIZE_T*14`($sp)
893 $POP r19,`$FRAME-$SIZE_T*13`($sp)
894 $POP r20,`$FRAME-$SIZE_T*12`($sp)
895 $POP r21,`$FRAME-$SIZE_T*11`($sp)
896 $POP r22,`$FRAME-$SIZE_T*10`($sp)
897 $POP r23,`$FRAME-$SIZE_T*9`($sp)
898 $POP r24,`$FRAME-$SIZE_T*8`($sp)
899 $POP r25,`$FRAME-$SIZE_T*7`($sp)
900 $POP r26,`$FRAME-$SIZE_T*6`($sp)
901 $POP r27,`$FRAME-$SIZE_T*5`($sp)
902 $POP r28,`$FRAME-$SIZE_T*4`($sp)
903 $POP r29,`$FRAME-$SIZE_T*3`($sp)
904 $POP r30,`$FRAME-$SIZE_T*2`($sp)
905 $POP r31,`$FRAME-$SIZE_T*1`($sp)
906 mtlr r0
907 addi $sp,$sp,$FRAME
908 blr
909 .long 0
910 .byte 0,12,4,1,0x80,18,3,0
911 .long 0
912
913.align 5
914Lppc_AES_decrypt:
915 lwz $acc00,240($key)
916 addi $Tbl1,$Tbl0,3
917 lwz $t0,0($key)
918 addi $Tbl2,$Tbl0,2
919 lwz $t1,4($key)
920 addi $Tbl3,$Tbl0,1
921 lwz $t2,8($key)
922 addi $acc00,$acc00,-1
923 lwz $t3,12($key)
924 addi $key,$key,16
925 xor $s0,$s0,$t0
926 xor $s1,$s1,$t1
927 xor $s2,$s2,$t2
928 xor $s3,$s3,$t3
929 mtctr $acc00
930.align 4
931Ldec_loop:
932 rlwinm $acc00,$s0,`32-24+3`,21,28
933 rlwinm $acc01,$s1,`32-24+3`,21,28
934 rlwinm $acc02,$s2,`32-24+3`,21,28
935 rlwinm $acc03,$s3,`32-24+3`,21,28
936 lwz $t0,0($key)
937 rlwinm $acc04,$s3,`32-16+3`,21,28
938 lwz $t1,4($key)
939 rlwinm $acc05,$s0,`32-16+3`,21,28
940 lwz $t2,8($key)
941 rlwinm $acc06,$s1,`32-16+3`,21,28
942 lwz $t3,12($key)
943 rlwinm $acc07,$s2,`32-16+3`,21,28
944 lwzx $acc00,$Tbl0,$acc00
945 rlwinm $acc08,$s2,`32-8+3`,21,28
946 lwzx $acc01,$Tbl0,$acc01
947 rlwinm $acc09,$s3,`32-8+3`,21,28
948 lwzx $acc02,$Tbl0,$acc02
949 rlwinm $acc10,$s0,`32-8+3`,21,28
950 lwzx $acc03,$Tbl0,$acc03
951 rlwinm $acc11,$s1,`32-8+3`,21,28
952 lwzx $acc04,$Tbl1,$acc04
953 rlwinm $acc12,$s1,`0+3`,21,28
954 lwzx $acc05,$Tbl1,$acc05
955 rlwinm $acc13,$s2,`0+3`,21,28
956 lwzx $acc06,$Tbl1,$acc06
957 rlwinm $acc14,$s3,`0+3`,21,28
958 lwzx $acc07,$Tbl1,$acc07
959 rlwinm $acc15,$s0,`0+3`,21,28
960 lwzx $acc08,$Tbl2,$acc08
961 xor $t0,$t0,$acc00
962 lwzx $acc09,$Tbl2,$acc09
963 xor $t1,$t1,$acc01
964 lwzx $acc10,$Tbl2,$acc10
965 xor $t2,$t2,$acc02
966 lwzx $acc11,$Tbl2,$acc11
967 xor $t3,$t3,$acc03
968 lwzx $acc12,$Tbl3,$acc12
969 xor $t0,$t0,$acc04
970 lwzx $acc13,$Tbl3,$acc13
971 xor $t1,$t1,$acc05
972 lwzx $acc14,$Tbl3,$acc14
973 xor $t2,$t2,$acc06
974 lwzx $acc15,$Tbl3,$acc15
975 xor $t3,$t3,$acc07
976 xor $t0,$t0,$acc08
977 xor $t1,$t1,$acc09
978 xor $t2,$t2,$acc10
979 xor $t3,$t3,$acc11
980 xor $s0,$t0,$acc12
981 xor $s1,$t1,$acc13
982 xor $s2,$t2,$acc14
983 xor $s3,$t3,$acc15
984 addi $key,$key,16
985 bdnz- Ldec_loop
986
987 addi $Tbl2,$Tbl0,2048
988 nop
989 lwz $t0,0($key)
990 rlwinm $acc00,$s0,`32-24`,24,31
991 lwz $t1,4($key)
992 rlwinm $acc01,$s1,`32-24`,24,31
993 lwz $t2,8($key)
994 rlwinm $acc02,$s2,`32-24`,24,31
995 lwz $t3,12($key)
996 rlwinm $acc03,$s3,`32-24`,24,31
997 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
998 rlwinm $acc04,$s3,`32-16`,24,31
999 lwz $acc09,`2048+32`($Tbl0)
1000 rlwinm $acc05,$s0,`32-16`,24,31
1001 lwz $acc10,`2048+64`($Tbl0)
1002 lbzx $acc00,$Tbl2,$acc00
1003 lwz $acc11,`2048+96`($Tbl0)
1004 lbzx $acc01,$Tbl2,$acc01
1005 lwz $acc12,`2048+128`($Tbl0)
1006 rlwinm $acc06,$s1,`32-16`,24,31
1007 lwz $acc13,`2048+160`($Tbl0)
1008 rlwinm $acc07,$s2,`32-16`,24,31
1009 lwz $acc14,`2048+192`($Tbl0)
1010 rlwinm $acc08,$s2,`32-8`,24,31
1011 lwz $acc15,`2048+224`($Tbl0)
1012 rlwinm $acc09,$s3,`32-8`,24,31
1013 lbzx $acc02,$Tbl2,$acc02
1014 rlwinm $acc10,$s0,`32-8`,24,31
1015 lbzx $acc03,$Tbl2,$acc03
1016 rlwinm $acc11,$s1,`32-8`,24,31
1017 lbzx $acc04,$Tbl2,$acc04
1018 rlwinm $acc12,$s1,`0`,24,31
1019 lbzx $acc05,$Tbl2,$acc05
1020 rlwinm $acc13,$s2,`0`,24,31
1021 lbzx $acc06,$Tbl2,$acc06
1022 rlwinm $acc14,$s3,`0`,24,31
1023 lbzx $acc07,$Tbl2,$acc07
1024 rlwinm $acc15,$s0,`0`,24,31
1025 lbzx $acc08,$Tbl2,$acc08
1026 rlwinm $s0,$acc00,24,0,7
1027 lbzx $acc09,$Tbl2,$acc09
1028 rlwinm $s1,$acc01,24,0,7
1029 lbzx $acc10,$Tbl2,$acc10
1030 rlwinm $s2,$acc02,24,0,7
1031 lbzx $acc11,$Tbl2,$acc11
1032 rlwinm $s3,$acc03,24,0,7
1033 lbzx $acc12,$Tbl2,$acc12
1034 rlwimi $s0,$acc04,16,8,15
1035 lbzx $acc13,$Tbl2,$acc13
1036 rlwimi $s1,$acc05,16,8,15
1037 lbzx $acc14,$Tbl2,$acc14
1038 rlwimi $s2,$acc06,16,8,15
1039 lbzx $acc15,$Tbl2,$acc15
1040 rlwimi $s3,$acc07,16,8,15
1041 rlwimi $s0,$acc08,8,16,23
1042 rlwimi $s1,$acc09,8,16,23
1043 rlwimi $s2,$acc10,8,16,23
1044 rlwimi $s3,$acc11,8,16,23
1045 or $s0,$s0,$acc12
1046 or $s1,$s1,$acc13
1047 or $s2,$s2,$acc14
1048 or $s3,$s3,$acc15
1049 xor $s0,$s0,$t0
1050 xor $s1,$s1,$t1
1051 xor $s2,$s2,$t2
1052 xor $s3,$s3,$t3
1053 blr
1054 .long 0
1055 .byte 0,12,0x14,0,0,0,0,0
1056
1057.align 4
1058Lppc_AES_decrypt_compact:
1059 lwz $acc00,240($key)
1060 addi $Tbl1,$Tbl0,2048
1061 lwz $t0,0($key)
1062 lis $mask80,0x8080
1063 lwz $t1,4($key)
1064 lis $mask1b,0x1b1b
1065 lwz $t2,8($key)
1066 ori $mask80,$mask80,0x8080
1067 lwz $t3,12($key)
1068 ori $mask1b,$mask1b,0x1b1b
1069 addi $key,$key,16
1070___
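Both Lppc_AES_encrypt_compact and Lppc_AES_decrypt_compact are built around the
same branchless primitive: doubling all four bytes of a state word in GF(2^8)
at once, with the $mask80/$mask1b constants loaded in their prologues. A
minimal stand-alone Perl sketch of that step (xtime4 is a hypothetical name,
not part of this module):

#!/usr/bin/env perl
use strict;
use warnings;

# Double four packed bytes in GF(2^8) without branching: the 0x1b
# reduction is applied exactly to the bytes whose high bit was set.
sub xtime4 {
	my ($r0) = @_;
	my $r1 = $r0 & 0x80808080;			# high bit of each byte
	my $r2 = ($r1 - ($r1 >> 7)) & 0x1b1b1b1b;	# 0x1b per such byte
	(($r0 & 0x7f7f7f7f) << 1) ^ $r2;		# shift, then reduce
}

printf "%08x\n", xtime4(0x80808080);	# 1b1b1b1b
printf "%08x\n", xtime4(0x01020304);	# 02040608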
1071$code.=<<___ if ($SIZE_T==8);
1072 insrdi $mask80,$mask80,32,0
1073 insrdi $mask1b,$mask1b,32,0
1074___
1075$code.=<<___;
1076 mtctr $acc00
1077.align 4
1078Ldec_compact_loop:
1079 xor $s0,$s0,$t0
1080 xor $s1,$s1,$t1
1081 rlwinm $acc00,$s0,`32-24`,24,31
1082 xor $s2,$s2,$t2
1083 rlwinm $acc01,$s1,`32-24`,24,31
1084 xor $s3,$s3,$t3
1085 rlwinm $acc02,$s2,`32-24`,24,31
1086 rlwinm $acc03,$s3,`32-24`,24,31
1087 rlwinm $acc04,$s3,`32-16`,24,31
1088 rlwinm $acc05,$s0,`32-16`,24,31
1089 rlwinm $acc06,$s1,`32-16`,24,31
1090 rlwinm $acc07,$s2,`32-16`,24,31
1091 lbzx $acc00,$Tbl1,$acc00
1092 rlwinm $acc08,$s2,`32-8`,24,31
1093 lbzx $acc01,$Tbl1,$acc01
1094 rlwinm $acc09,$s3,`32-8`,24,31
1095 lbzx $acc02,$Tbl1,$acc02
1096 rlwinm $acc10,$s0,`32-8`,24,31
1097 lbzx $acc03,$Tbl1,$acc03
1098 rlwinm $acc11,$s1,`32-8`,24,31
1099 lbzx $acc04,$Tbl1,$acc04
1100 rlwinm $acc12,$s1,`0`,24,31
1101 lbzx $acc05,$Tbl1,$acc05
1102 rlwinm $acc13,$s2,`0`,24,31
1103 lbzx $acc06,$Tbl1,$acc06
1104 rlwinm $acc14,$s3,`0`,24,31
1105 lbzx $acc07,$Tbl1,$acc07
1106 rlwinm $acc15,$s0,`0`,24,31
1107 lbzx $acc08,$Tbl1,$acc08
1108 rlwinm $s0,$acc00,24,0,7
1109 lbzx $acc09,$Tbl1,$acc09
1110 rlwinm $s1,$acc01,24,0,7
1111 lbzx $acc10,$Tbl1,$acc10
1112 rlwinm $s2,$acc02,24,0,7
1113 lbzx $acc11,$Tbl1,$acc11
1114 rlwinm $s3,$acc03,24,0,7
1115 lbzx $acc12,$Tbl1,$acc12
1116 rlwimi $s0,$acc04,16,8,15
1117 lbzx $acc13,$Tbl1,$acc13
1118 rlwimi $s1,$acc05,16,8,15
1119 lbzx $acc14,$Tbl1,$acc14
1120 rlwimi $s2,$acc06,16,8,15
1121 lbzx $acc15,$Tbl1,$acc15
1122 rlwimi $s3,$acc07,16,8,15
1123 rlwimi $s0,$acc08,8,16,23
1124 rlwimi $s1,$acc09,8,16,23
1125 rlwimi $s2,$acc10,8,16,23
1126 rlwimi $s3,$acc11,8,16,23
1127 lwz $t0,0($key)
1128 or $s0,$s0,$acc12
1129 lwz $t1,4($key)
1130 or $s1,$s1,$acc13
1131 lwz $t2,8($key)
1132 or $s2,$s2,$acc14
1133 lwz $t3,12($key)
1134 or $s3,$s3,$acc15
1135
1136 addi $key,$key,16
1137 bdz Ldec_compact_done
1138___
1139$code.=<<___ if ($SIZE_T==8);
1140 # vectorized permutation improves decrypt performance by 10%
1141 insrdi $s0,$s1,32,0
1142 insrdi $s2,$s3,32,0
1143
1144 and $acc00,$s0,$mask80 # r1=r0&0x80808080
1145 and $acc02,$s2,$mask80
1146 srdi $acc04,$acc00,7 # r1>>7
1147 srdi $acc06,$acc02,7
1148 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
1149 andc $acc10,$s2,$mask80
1150 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
1151 sub $acc02,$acc02,$acc06
1152 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
1153 add $acc10,$acc10,$acc10
1154 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1155 and $acc02,$acc02,$mask1b
1156 xor $acc00,$acc00,$acc08 # r2
1157 xor $acc02,$acc02,$acc10
1158
1159 and $acc04,$acc00,$mask80 # r1=r2&0x80808080
1160 and $acc06,$acc02,$mask80
1161 srdi $acc08,$acc04,7 # r1>>7
1162 srdi $acc10,$acc06,7
1163 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
1164 andc $acc14,$acc02,$mask80
1165 sub $acc04,$acc04,$acc08 # r1-(r1>>7)
1166 sub $acc06,$acc06,$acc10
1167 add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
1168 add $acc14,$acc14,$acc14
1169 and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1170 and $acc06,$acc06,$mask1b
1171 xor $acc04,$acc04,$acc12 # r4
1172 xor $acc06,$acc06,$acc14
1173
1174 and $acc08,$acc04,$mask80 # r1=r4&0x80808080
1175 and $acc10,$acc06,$mask80
1176 srdi $acc12,$acc08,7 # r1>>7
1177 srdi $acc14,$acc10,7
1178 sub $acc08,$acc08,$acc12 # r1-(r1>>7)
1179 sub $acc10,$acc10,$acc14
1180 andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
1181 andc $acc14,$acc06,$mask80
1182 add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
1183 add $acc14,$acc14,$acc14
1184 and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1185 and $acc10,$acc10,$mask1b
1186 xor $acc08,$acc08,$acc12 # r8
1187 xor $acc10,$acc10,$acc14
1188
1189 xor $acc00,$acc00,$s0 # r2^r0
1190 xor $acc02,$acc02,$s2
1191 xor $acc04,$acc04,$s0 # r4^r0
1192 xor $acc06,$acc06,$s2
1193
1194 extrdi $acc01,$acc00,32,0
1195 extrdi $acc03,$acc02,32,0
1196 extrdi $acc05,$acc04,32,0
1197 extrdi $acc07,$acc06,32,0
1198 extrdi $acc09,$acc08,32,0
1199 extrdi $acc11,$acc10,32,0
1200___
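On 64-bit builds ($SIZE_T==8) the block just above first pairs the state words
with insrdi, so the same mask arithmetic, with the masks replicated to 64 bits
by the insrdi pair in the prologue, doubles eight bytes per step instead of
four. A sketch of the widened step, assuming a 64-bit perl (xtime8 is a
hypothetical name):

# eight packed bytes per GF(2^8) doubling, masks replicated to 64 bits
sub xtime8 {
	my ($r0) = @_;
	my $r1 = $r0 & 0x8080808080808080;
	(($r0 & 0x7f7f7f7f7f7f7f7f) << 1)
	    ^ (($r1 - ($r1 >> 7)) & 0x1b1b1b1b1b1b1b1b);
}
printf "%016x\n", xtime8(0x0102030480808080);	# 020406081b1b1b1b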
1201$code.=<<___ if ($SIZE_T==4);
1202 and $acc00,$s0,$mask80 # r1=r0&0x80808080
1203 and $acc01,$s1,$mask80
1204 and $acc02,$s2,$mask80
1205 and $acc03,$s3,$mask80
1206 srwi $acc04,$acc00,7 # r1>>7
1207 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
1208 srwi $acc05,$acc01,7
1209 andc $acc09,$s1,$mask80
1210 srwi $acc06,$acc02,7
1211 andc $acc10,$s2,$mask80
1212 srwi $acc07,$acc03,7
1213 andc $acc11,$s3,$mask80
1214 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
1215 sub $acc01,$acc01,$acc05
1216 sub $acc02,$acc02,$acc06
1217 sub $acc03,$acc03,$acc07
1218 add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
1219 add $acc09,$acc09,$acc09
1220 add $acc10,$acc10,$acc10
1221 add $acc11,$acc11,$acc11
1222 and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1223 and $acc01,$acc01,$mask1b
1224 and $acc02,$acc02,$mask1b
1225 and $acc03,$acc03,$mask1b
1226 xor $acc00,$acc00,$acc08 # r2
1227 xor $acc01,$acc01,$acc09
1228 xor $acc02,$acc02,$acc10
1229 xor $acc03,$acc03,$acc11
1230
1231 and $acc04,$acc00,$mask80 # r1=r2&0x80808080
1232 and $acc05,$acc01,$mask80
1233 and $acc06,$acc02,$mask80
1234 and $acc07,$acc03,$mask80
1235 srwi $acc08,$acc04,7 # r1>>7
1236 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
1237 srwi $acc09,$acc05,7
1238 andc $acc13,$acc01,$mask80
1239 srwi $acc10,$acc06,7
1240 andc $acc14,$acc02,$mask80
1241 srwi $acc11,$acc07,7
1242 andc $acc15,$acc03,$mask80
1243 sub $acc04,$acc04,$acc08 # r1-(r1>>7)
1244 sub $acc05,$acc05,$acc09
1245 sub $acc06,$acc06,$acc10
1246 sub $acc07,$acc07,$acc11
1247 add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
1248 add $acc13,$acc13,$acc13
1249 add $acc14,$acc14,$acc14
1250 add $acc15,$acc15,$acc15
1251 and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1252 and $acc05,$acc05,$mask1b
1253 and $acc06,$acc06,$mask1b
1254 and $acc07,$acc07,$mask1b
1255 xor $acc04,$acc04,$acc12 # r4
1256 xor $acc05,$acc05,$acc13
1257 xor $acc06,$acc06,$acc14
1258 xor $acc07,$acc07,$acc15
1259
1260 and $acc08,$acc04,$mask80 # r1=r4&0x80808080
1261 and $acc09,$acc05,$mask80
1262 srwi $acc12,$acc08,7 # r1>>7
1263 and $acc10,$acc06,$mask80
1264 srwi $acc13,$acc09,7
1265 and $acc11,$acc07,$mask80
1266 srwi $acc14,$acc10,7
1267 sub $acc08,$acc08,$acc12 # r1-(r1>>7)
1268 srwi $acc15,$acc11,7
1269 sub $acc09,$acc09,$acc13
1270 sub $acc10,$acc10,$acc14
1271 sub $acc11,$acc11,$acc15
1272 andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
1273 andc $acc13,$acc05,$mask80
1274 andc $acc14,$acc06,$mask80
1275 andc $acc15,$acc07,$mask80
1276 add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
1277 add $acc13,$acc13,$acc13
1278 add $acc14,$acc14,$acc14
1279 add $acc15,$acc15,$acc15
1280 and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
1281 and $acc09,$acc09,$mask1b
1282 and $acc10,$acc10,$mask1b
1283 and $acc11,$acc11,$mask1b
1284 xor $acc08,$acc08,$acc12 # r8
1285 xor $acc09,$acc09,$acc13
1286 xor $acc10,$acc10,$acc14
1287 xor $acc11,$acc11,$acc15
1288
1289 xor $acc00,$acc00,$s0 # r2^r0
1290 xor $acc01,$acc01,$s1
1291 xor $acc02,$acc02,$s2
1292 xor $acc03,$acc03,$s3
1293 xor $acc04,$acc04,$s0 # r4^r0
1294 xor $acc05,$acc05,$s1
1295 xor $acc06,$acc06,$s2
1296 xor $acc07,$acc07,$s3
1297___
1298$code.=<<___;
1299 rotrwi $s0,$s0,8 # = ROTATE(r0,8)
1300 rotrwi $s1,$s1,8
1301 xor $s0,$s0,$acc00 # ^= r2^r0
1302 rotrwi $s2,$s2,8
1303 xor $s1,$s1,$acc01
1304 rotrwi $s3,$s3,8
1305 xor $s2,$s2,$acc02
1306 xor $s3,$s3,$acc03
1307 xor $acc00,$acc00,$acc08
1308 xor $acc01,$acc01,$acc09
1309 xor $acc02,$acc02,$acc10
1310 xor $acc03,$acc03,$acc11
1311 xor $s0,$s0,$acc04 # ^= r4^r0
1312 rotrwi $acc00,$acc00,24
1313 xor $s1,$s1,$acc05
1314 rotrwi $acc01,$acc01,24
1315 xor $s2,$s2,$acc06
1316 rotrwi $acc02,$acc02,24
1317 xor $s3,$s3,$acc07
1318 rotrwi $acc03,$acc03,24
1319 xor $acc04,$acc04,$acc08
1320 xor $acc05,$acc05,$acc09
1321 xor $acc06,$acc06,$acc10
1322 xor $acc07,$acc07,$acc11
1323 xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
1324 rotrwi $acc04,$acc04,16
1325 xor $s1,$s1,$acc09
1326 rotrwi $acc05,$acc05,16
1327 xor $s2,$s2,$acc10
1328 rotrwi $acc06,$acc06,16
1329 xor $s3,$s3,$acc11
1330 rotrwi $acc07,$acc07,16
1331 xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
1332 rotrwi $acc08,$acc08,8
1333 xor $s1,$s1,$acc01
1334 rotrwi $acc09,$acc09,8
1335 xor $s2,$s2,$acc02
1336 rotrwi $acc10,$acc10,8
1337 xor $s3,$s3,$acc03
1338 rotrwi $acc11,$acc11,8
1339 xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
1340 xor $s1,$s1,$acc05
1341 xor $s2,$s2,$acc06
1342 xor $s3,$s3,$acc07
1343 xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
1344 xor $s1,$s1,$acc09
1345 xor $s2,$s2,$acc10
1346 xor $s3,$s3,$acc11
1347
1348 b Ldec_compact_loop
1349.align 4
1350Ldec_compact_done:
1351 xor $s0,$s0,$t0
1352 xor $s1,$s1,$t1
1353 xor $s2,$s2,$t2
1354 xor $s3,$s3,$t3
1355 blr
1356 .long 0
1357 .byte 0,12,0x14,0,0,0,0,0
1358
1359.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
1360.align 7
1361___
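For reference, the rotate-and-xor tail of Ldec_compact_loop is InvMixColumns in
disguise: with r2, r4 and r8 the successive GF(2^8) doublings of a column word
r0, exactly as annotated in its comments, the net effect per column is

	r0' = (r2^r4^r8) ^ ROTR8(r0^r8) ^ ROTR16(r0^r4^r8) ^ ROTR24(r0^r2^r8)

which is the [0e 0b 0d 09] circulant matrix of FIPS-197. A stand-alone Perl
cross-check, assuming big-endian byte packing (all subroutine names
hypothetical):

#!/usr/bin/env perl
use strict;
use warnings;

sub xtime4 {
	my $r1 = $_[0] & 0x80808080;
	((($_[0] & 0x7f7f7f7f) << 1) ^ (($r1 - ($r1 >> 7)) & 0x1b1b1b1b));
}
sub rotr { (($_[0] >> $_[1]) | ($_[0] << (32 - $_[1]))) & 0xffffffff }

# the word-wise InvMixColumns used by Ldec_compact_loop
sub inv_mix_word {
	my $r0 = shift;
	my $r2 = xtime4($r0); my $r4 = xtime4($r2); my $r8 = xtime4($r4);
	($r2 ^ $r4 ^ $r8) ^ rotr($r0 ^ $r8, 8)
			  ^ rotr($r0 ^ $r4 ^ $r8, 16)
			  ^ rotr($r0 ^ $r2 ^ $r8, 24);
}

# textbook reference: per-byte multiplication by the [0e 0b 0d 09] rows
sub gmul {
	my ($a, $b) = @_;
	my $p = 0;
	for (1 .. 8) {
		$p ^= $a if $b & 1;
		$a = (($a << 1) ^ (($a & 0x80) ? 0x11b : 0)) & 0xff;
		$b >>= 1;
	}
	$p;
}
sub inv_mix_ref {
	my @b = unpack("C4", pack("N", shift));
	my @m = ([14,11,13,9], [9,14,11,13], [13,9,14,11], [11,13,9,14]);
	unpack("N", pack("C4",
	    map { my $r = $m[$_];
		  gmul($b[0], $r->[0]) ^ gmul($b[1], $r->[1]) ^
		  gmul($b[2], $r->[2]) ^ gmul($b[3], $r->[3]) } 0 .. 3));
}

for my $w (0x8e4da1bc, 0x01010101, 0xdeadbeef) {
	printf "%08x -> %08x %s\n", $w, inv_mix_word($w),
	    inv_mix_word($w) == inv_mix_ref($w) ? "ok" : "MISMATCH";
}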
1362
1363$code =~ s/\`([^\`]*)\`/eval $1/gem;
1364print $code;
1365close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl
new file mode 100644
index 0000000000..445a1e6762
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-s390x.pl
@@ -0,0 +1,2254 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for s390x.
11
12# April 2007.
13#
14# Software performance improvement over gcc-generated code is ~70%, and
15# in absolute terms ~73 cycles per byte processed with a 128-bit key.
16# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
17# *strictly* in-order machines, and an issued instruction [in this case
18# a load from memory is the critical one] has to complete before the
19# execution flow proceeds. S-boxes are compressed to 2KB[+256B].
20#
21# As for hardware acceleration support: it's basically a "teaser," as
22# it can and should be improved in several ways. Most notably, support
23# for CBC is not utilized, nor are multiple blocks ever processed.
24# Also, the software key schedule could be postponed till hardware
25# support detection... Performance improvement over assembler is
26# reportedly ~2.5x, but can reach >8x [naturally on larger chunks] if
27# proper support is implemented.
28
29# May 2007.
30#
31# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
32# for 128-bit keys if hardware support is detected.
33
34# January 2009.
35#
36# Add support for hardware AES192/256 and reschedule instructions to
37# minimize/avoid Address Generation Interlock hazard and to favour the
38# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
39# almost 50% on z9. The gain is smaller on z10 because, being dual-
40# issue, z10 makes it impossible to eliminate the interlock condition:
41# the critical path is not long enough. It still spends ~24 cycles per
42# byte processed with a 128-bit key.
43#
44# Unlike the previous version, hardware support detection takes place
45# only at key schedule setup, and the result is denoted in key->rounds.
46# This is done because deferred key setup can't be made MT-safe, at
47# least not for keys longer than 128 bits.
48#
49# Add AES_cbc_encrypt, which gives an incredible performance improvement;
50# it was measured to be ~6.6x. This is less than the previously mentioned
51# 8x because the software implementation was optimized.
52
53# May 2010.
54#
55# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
56# performance improvement over the "generic" counter mode routine that
57# relies on the single-block, also hardware-assisted, AES_encrypt.
58# "Up to" refers to the fact that the exact throughput depends on the
59# current stack frame alignment within a 4KB page. In the worst case
60# you get ~75% of the maximum, but *on average* as much as ~98%, meaning
61# that the worst case is unlikely, like hitting a ravine on a plateau.
62
63# November 2010.
64#
65# Adapt for -m31 build. If the kernel supports what's called the
66# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
67# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
68# legacy application context. The feature is not specific to any
69# particular processor, as long as it's a "z-CPU"; the latter implies
70# that the code remains z/Architecture-specific. On z990 it was measured
71# to perform 2x better than code generated by gcc 4.3.
72
73# December 2010.
74#
75# Add support for z196 "cipher message with counter" instruction.
76# Note however that it's disengaged, because it was measured to
77# perform ~12% worse than vanilla km-based code...
78
79# February 2011.
80#
81# Add AES_xts_[en|de]crypt. This includes support for the z196 km-xts-aes
82# instructions, which deliver ~70% improvement over vanilla km-based code
83# at 8KB block size, and about 37% at 512-byte block size.
84
85$flavour = shift;
86
87if ($flavour =~ /3[12]/) {
88 $SIZE_T=4;
89 $g="";
90} else {
91 $SIZE_T=8;
92 $g="g";
93}
94
95while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
96open STDOUT,">$output";
97
98$softonly=0; # allow hardware support
99
100$t0="%r0"; $mask="%r0";
101$t1="%r1";
102$t2="%r2"; $inp="%r2";
103$t3="%r3"; $out="%r3"; $bits="%r3";
104$key="%r4";
105$i1="%r5";
106$i2="%r6";
107$i3="%r7";
108$s0="%r8";
109$s1="%r9";
110$s2="%r10";
111$s3="%r11";
112$tbl="%r12";
113$rounds="%r13";
114$ra="%r14";
115$sp="%r15";
116
117$stdframe=16*$SIZE_T+4*8;
118
119sub _data_word()
120{ my $i;
121 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
122}
123
124$code=<<___;
125.text
126
127.type AES_Te,\@object
128.align 256
129AES_Te:
130___
131&_data_word(
132 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
133 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
134 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
135 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
136 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
137 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
138 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
139 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
140 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
141 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
142 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
143 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
144 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
145 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
146 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
147 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
148 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
149 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
150 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
151 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
152 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
153 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
154 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
155 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
156 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
157 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
158 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
159 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
160 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
161 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
162 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
163 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
164 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
165 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
166 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
167 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
168 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
169 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
170 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
171 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
172 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
173 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
174 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
175 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
176 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
177 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
178 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
179 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
180 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
181 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
182 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
183 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
184 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
185 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
186 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
187 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
188 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
189 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
190 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
191 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
192 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
193 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
194 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
195 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
196$code.=<<___;
197# Te4[256]
198.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
199.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
200.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
201.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
202.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
203.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
204.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
205.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
206.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
207.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
208.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
209.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
210.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
211.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
212.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
213.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
214.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
215.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
216.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
217.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
218.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
219.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
220.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
221.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
222.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
223.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
224.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
225.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
226.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
227.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
228.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
229.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
230# rcon[]
231.long 0x01000000, 0x02000000, 0x04000000, 0x08000000
232.long 0x10000000, 0x20000000, 0x40000000, 0x80000000
233.long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
234.align 256
235.size AES_Te,.-AES_Te
236
237# void AES_encrypt(const unsigned char *inp, unsigned char *out,
238# const AES_KEY *key) {
239.globl AES_encrypt
240.type AES_encrypt,\@function
241AES_encrypt:
242___
243$code.=<<___ if (!$softonly);
244 l %r0,240($key)
245 lhi %r1,16
246 clr %r0,%r1
247 jl .Lesoft
248
249 la %r1,0($key)
250 #la %r2,0($inp)
251 la %r4,0($out)
252 lghi %r3,16 # single block length
253 .long 0xb92e0042 # km %r4,%r2
254 brc 1,.-4 # can this happen?
255 br %r14
256.align 64
257.Lesoft:
258___
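The .long 0xb92e0042 above hand-encodes the RRE-format "cipher message"
instruction, km %r4,%r2, presumably so the module assembles even where the
mnemonic is unknown. A sketch of the encoding (rre is a hypothetical helper,
not part of this module):

# RRE format: two-byte opcode, one zero byte, then the R1/R2 nibbles
sub rre { my ($op, $r1, $r2) = @_; ($op << 16) | ($r1 << 4) | $r2 }
printf ".long\t0x%08x\t# km\t%%r4,%%r2\n", rre(0xb92e, 4, 2);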
259$code.=<<___;
260 stm${g} %r3,$ra,3*$SIZE_T($sp)
261
262 llgf $s0,0($inp)
263 llgf $s1,4($inp)
264 llgf $s2,8($inp)
265 llgf $s3,12($inp)
266
267 larl $tbl,AES_Te
268 bras $ra,_s390x_AES_encrypt
269
270 l${g} $out,3*$SIZE_T($sp)
271 st $s0,0($out)
272 st $s1,4($out)
273 st $s2,8($out)
274 st $s3,12($out)
275
276 lm${g} %r6,$ra,6*$SIZE_T($sp)
277 br $ra
278.size AES_encrypt,.-AES_encrypt
279
280.type _s390x_AES_encrypt,\@function
281.align 16
282_s390x_AES_encrypt:
283 st${g} $ra,15*$SIZE_T($sp)
284 x $s0,0($key)
285 x $s1,4($key)
286 x $s2,8($key)
287 x $s3,12($key)
288 l $rounds,240($key)
289 llill $mask,`0xff<<3`
290 aghi $rounds,-1
291 j .Lenc_loop
292.align 16
293.Lenc_loop:
294 sllg $t1,$s0,`0+3`
295 srlg $t2,$s0,`8-3`
296 srlg $t3,$s0,`16-3`
297 srl $s0,`24-3`
298 nr $s0,$mask
299 ngr $t1,$mask
300 nr $t2,$mask
301 nr $t3,$mask
302
303 srlg $i1,$s1,`16-3` # i0
304 sllg $i2,$s1,`0+3`
305 srlg $i3,$s1,`8-3`
306 srl $s1,`24-3`
307 nr $i1,$mask
308 nr $s1,$mask
309 ngr $i2,$mask
310 nr $i3,$mask
311
312 l $s0,0($s0,$tbl) # Te0[s0>>24]
313 l $t1,1($t1,$tbl) # Te3[s0>>0]
314 l $t2,2($t2,$tbl) # Te2[s0>>8]
315 l $t3,3($t3,$tbl) # Te1[s0>>16]
316
317 x $s0,3($i1,$tbl) # Te1[s1>>16]
318 l $s1,0($s1,$tbl) # Te0[s1>>24]
319 x $t2,1($i2,$tbl) # Te3[s1>>0]
320 x $t3,2($i3,$tbl) # Te2[s1>>8]
321
322 srlg $i1,$s2,`8-3` # i0
323 srlg $i2,$s2,`16-3` # i1
324 nr $i1,$mask
325 nr $i2,$mask
326 sllg $i3,$s2,`0+3`
327 srl $s2,`24-3`
328 nr $s2,$mask
329 ngr $i3,$mask
330
331 xr $s1,$t1
332 srlg $ra,$s3,`8-3` # i1
333 sllg $t1,$s3,`0+3` # i0
334 nr $ra,$mask
335 la $key,16($key)
336 ngr $t1,$mask
337
338 x $s0,2($i1,$tbl) # Te2[s2>>8]
339 x $s1,3($i2,$tbl) # Te1[s2>>16]
340 l $s2,0($s2,$tbl) # Te0[s2>>24]
341 x $t3,1($i3,$tbl) # Te3[s2>>0]
342
343 srlg $i3,$s3,`16-3` # i2
344 xr $s2,$t2
345 srl $s3,`24-3`
346 nr $i3,$mask
347 nr $s3,$mask
348
349 x $s0,0($key)
350 x $s1,4($key)
351 x $s2,8($key)
352 x $t3,12($key)
353
354 x $s0,1($t1,$tbl) # Te3[s3>>0]
355 x $s1,2($ra,$tbl) # Te2[s3>>8]
356 x $s2,3($i3,$tbl) # Te1[s3>>16]
357 l $s3,0($s3,$tbl) # Te0[s3>>24]
358 xr $s3,$t3
359
360 brct $rounds,.Lenc_loop
361 .align 16
362
363 sllg $t1,$s0,`0+3`
364 srlg $t2,$s0,`8-3`
365 ngr $t1,$mask
366 srlg $t3,$s0,`16-3`
367 srl $s0,`24-3`
368 nr $s0,$mask
369 nr $t2,$mask
370 nr $t3,$mask
371
372 srlg $i1,$s1,`16-3` # i0
373 sllg $i2,$s1,`0+3`
374 ngr $i2,$mask
375 srlg $i3,$s1,`8-3`
376 srl $s1,`24-3`
377 nr $i1,$mask
378 nr $s1,$mask
379 nr $i3,$mask
380
381 llgc $s0,2($s0,$tbl) # Te4[s0>>24]
382 llgc $t1,2($t1,$tbl) # Te4[s0>>0]
383 sll $s0,24
384 llgc $t2,2($t2,$tbl) # Te4[s0>>8]
385 llgc $t3,2($t3,$tbl) # Te4[s0>>16]
386 sll $t2,8
387 sll $t3,16
388
389 llgc $i1,2($i1,$tbl) # Te4[s1>>16]
390 llgc $s1,2($s1,$tbl) # Te4[s1>>24]
391 llgc $i2,2($i2,$tbl) # Te4[s1>>0]
392 llgc $i3,2($i3,$tbl) # Te4[s1>>8]
393 sll $i1,16
394 sll $s1,24
395 sll $i3,8
396 or $s0,$i1
397 or $s1,$t1
398 or $t2,$i2
399 or $t3,$i3
400
401 srlg $i1,$s2,`8-3` # i0
402 srlg $i2,$s2,`16-3` # i1
403 nr $i1,$mask
404 nr $i2,$mask
405 sllg $i3,$s2,`0+3`
406 srl $s2,`24-3`
407 ngr $i3,$mask
408 nr $s2,$mask
409
410 sllg $t1,$s3,`0+3` # i0
411 srlg $ra,$s3,`8-3` # i1
412 ngr $t1,$mask
413
414 llgc $i1,2($i1,$tbl) # Te4[s2>>8]
415 llgc $i2,2($i2,$tbl) # Te4[s2>>16]
416 sll $i1,8
417 llgc $s2,2($s2,$tbl) # Te4[s2>>24]
418 llgc $i3,2($i3,$tbl) # Te4[s2>>0]
419 sll $i2,16
420 nr $ra,$mask
421 sll $s2,24
422 or $s0,$i1
423 or $s1,$i2
424 or $s2,$t2
425 or $t3,$i3
426
427 srlg $i3,$s3,`16-3` # i2
428 srl $s3,`24-3`
429 nr $i3,$mask
430 nr $s3,$mask
431
432 l $t0,16($key)
433 l $t2,20($key)
434
435 llgc $i1,2($t1,$tbl) # Te4[s3>>0]
436 llgc $i2,2($ra,$tbl) # Te4[s3>>8]
437 llgc $i3,2($i3,$tbl) # Te4[s3>>16]
438 llgc $s3,2($s3,$tbl) # Te4[s3>>24]
439 sll $i2,8
440 sll $i3,16
441 sll $s3,24
442 or $s0,$i1
443 or $s1,$i2
444 or $s2,$i3
445 or $s3,$t3
446
447 l${g} $ra,15*$SIZE_T($sp)
448 xr $s0,$t0
449 xr $s1,$t2
450 x $s2,24($key)
451 x $s3,28($key)
452
453 br $ra
454.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
455___
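The displacements 0..3 in the table loads above work because _data_word emits
every AES_Te entry twice (.long x,x) while the byte indices arrive pre-scaled
by 8 (the 0xff<<3 mask). A big-endian 4-byte load at displacement k inside the
8-byte entry then returns ROTL(x,8k), so this single 2KB table stands in for
Te0 (k=0), Te3 (k=1), Te2 (k=2) and Te1 (k=3); together with the 256-byte Te4
that is the "2KB[+256B]" compression mentioned in the header. A stand-alone
Perl illustration of the layout:

#!/usr/bin/env perl
use strict;
use warnings;

my $x = 0xc66363a5;			# AES_Te[0] from the table above
my $entry = pack("N2", $x, $x);		# what ".long x,x" puts in memory
for my $k (0 .. 3) {
	my $loaded = unpack("N", substr($entry, $k, 4));
	printf "displacement %d: %08x\n", $k, $loaded;	# ROTL($x, 8*$k)
}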
456
457$code.=<<___;
458.type AES_Td,\@object
459.align 256
460AES_Td:
461___
462&_data_word(
463 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
464 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
465 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
466 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
467 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
468 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
469 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
470 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
471 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
472 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
473 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
474 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
475 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
476 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
477 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
478 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
479 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
480 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
481 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
482 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
483 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
484 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
485 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
486 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
487 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
488 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
489 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
490 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
491 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
492 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
493 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
494 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
495 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
496 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
497 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
498 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
499 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
500 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
501 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
502 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
503 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
504 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
505 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
506 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
507 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
508 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
509 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
510 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
511 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
512 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
513 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
514 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
515 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
516 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
517 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
518 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
519 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
520 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
521 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
522 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
523 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
524 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
525 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
526 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
527$code.=<<___;
528# Td4[256]
529.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
530.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
531.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
532.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
533.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
534.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
535.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
536.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
537.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
538.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
539.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
540.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
541.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
542.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
543.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
544.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
545.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
546.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
547.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
548.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
549.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
550.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
551.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
552.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
553.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
554.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
555.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
556.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
557.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
558.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
559.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
560.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
561.size AES_Td,.-AES_Td
562
563# void AES_decrypt(const unsigned char *inp, unsigned char *out,
564# const AES_KEY *key) {
565.globl AES_decrypt
566.type AES_decrypt,\@function
567AES_decrypt:
568___
569$code.=<<___ if (!$softonly);
570 l %r0,240($key)
571 lhi %r1,16
572 clr %r0,%r1
573 jl .Ldsoft
574
575 la %r1,0($key)
576 #la %r2,0($inp)
577 la %r4,0($out)
578 lghi %r3,16 # single block length
579 .long 0xb92e0042 # km %r4,%r2
580 brc 1,.-4 # can this happen?
581 br %r14
582.align 64
583.Ldsoft:
584___
585$code.=<<___;
586 stm${g} %r3,$ra,3*$SIZE_T($sp)
587
588 llgf $s0,0($inp)
589 llgf $s1,4($inp)
590 llgf $s2,8($inp)
591 llgf $s3,12($inp)
592
593 larl $tbl,AES_Td
594 bras $ra,_s390x_AES_decrypt
595
596 l${g} $out,3*$SIZE_T($sp)
597 st $s0,0($out)
598 st $s1,4($out)
599 st $s2,8($out)
600 st $s3,12($out)
601
602 lm${g} %r6,$ra,6*$SIZE_T($sp)
603 br $ra
604.size AES_decrypt,.-AES_decrypt
605
606.type _s390x_AES_decrypt,\@function
607.align 16
608_s390x_AES_decrypt:
609 st${g} $ra,15*$SIZE_T($sp)
610 x $s0,0($key)
611 x $s1,4($key)
612 x $s2,8($key)
613 x $s3,12($key)
614 l $rounds,240($key)
615 llill $mask,`0xff<<3`
616 aghi $rounds,-1
617 j .Ldec_loop
618.align 16
619.Ldec_loop:
620 srlg $t1,$s0,`16-3`
621 srlg $t2,$s0,`8-3`
622 sllg $t3,$s0,`0+3`
623 srl $s0,`24-3`
624 nr $s0,$mask
625 nr $t1,$mask
626 nr $t2,$mask
627 ngr $t3,$mask
628
629 sllg $i1,$s1,`0+3` # i0
630 srlg $i2,$s1,`16-3`
631 srlg $i3,$s1,`8-3`
632 srl $s1,`24-3`
633 ngr $i1,$mask
634 nr $s1,$mask
635 nr $i2,$mask
636 nr $i3,$mask
637
638 l $s0,0($s0,$tbl) # Td0[s0>>24]
639 l $t1,3($t1,$tbl) # Td1[s0>>16]
640 l $t2,2($t2,$tbl) # Td2[s0>>8]
641 l $t3,1($t3,$tbl) # Td3[s0>>0]
642
643 x $s0,1($i1,$tbl) # Td3[s1>>0]
644 l $s1,0($s1,$tbl) # Td0[s1>>24]
645 x $t2,3($i2,$tbl) # Td1[s1>>16]
646 x $t3,2($i3,$tbl) # Td2[s1>>8]
647
648 srlg $i1,$s2,`8-3` # i0
649 sllg $i2,$s2,`0+3` # i1
650 srlg $i3,$s2,`16-3`
651 srl $s2,`24-3`
652 nr $i1,$mask
653 ngr $i2,$mask
654 nr $s2,$mask
655 nr $i3,$mask
656
657 xr $s1,$t1
658 srlg $ra,$s3,`8-3` # i1
659 srlg $t1,$s3,`16-3` # i0
660 nr $ra,$mask
661 la $key,16($key)
662 nr $t1,$mask
663
664 x $s0,2($i1,$tbl) # Td2[s2>>8]
665 x $s1,1($i2,$tbl) # Td3[s2>>0]
666 l $s2,0($s2,$tbl) # Td0[s2>>24]
667 x $t3,3($i3,$tbl) # Td1[s2>>16]
668
669 sllg $i3,$s3,`0+3` # i2
670 srl $s3,`24-3`
671 ngr $i3,$mask
672 nr $s3,$mask
673
674 xr $s2,$t2
675 x $s0,0($key)
676 x $s1,4($key)
677 x $s2,8($key)
678 x $t3,12($key)
679
680 x $s0,3($t1,$tbl) # Td1[s3>>16]
681 x $s1,2($ra,$tbl) # Td2[s3>>8]
682 x $s2,1($i3,$tbl) # Td3[s3>>0]
683 l $s3,0($s3,$tbl) # Td0[s3>>24]
684 xr $s3,$t3
685
686 brct $rounds,.Ldec_loop
687 .align 16
688
689 l $t1,`2048+0`($tbl) # prefetch Td4
690 l $t2,`2048+64`($tbl)
691 l $t3,`2048+128`($tbl)
692 l $i1,`2048+192`($tbl)
693 llill $mask,0xff
694
695 srlg $i3,$s0,24 # i0
696 srlg $t1,$s0,16
697 srlg $t2,$s0,8
698 nr $s0,$mask # i3
699 nr $t1,$mask
700
701 srlg $i1,$s1,24
702 nr $t2,$mask
703 srlg $i2,$s1,16
704 srlg $ra,$s1,8
705 nr $s1,$mask # i0
706 nr $i2,$mask
707 nr $ra,$mask
708
709 llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
710 llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
711 llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
712 sll $t1,16
713 llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
714 sllg $s0,$i3,24
715 sll $t2,8
716
717 llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
718 llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
719 llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
720 sll $i1,24
721 llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
722 sll $i2,16
723 sll $i3,8
724 or $s0,$s1
725 or $t1,$i1
726 or $t2,$i2
727 or $t3,$i3
728
729 srlg $i1,$s2,8 # i0
730 srlg $i2,$s2,24
731 srlg $i3,$s2,16
732 nr $s2,$mask # i1
733 nr $i1,$mask
734 nr $i3,$mask
735 llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
736 llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
737 llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
738 llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
739 sll $i1,8
740 sll $i2,24
741 or $s0,$i1
742 sll $i3,16
743 or $t2,$i2
744 or $t3,$i3
745
746 srlg $i1,$s3,16 # i0
747 srlg $i2,$s3,8 # i1
748 srlg $i3,$s3,24
749 nr $s3,$mask # i2
750 nr $i1,$mask
751 nr $i2,$mask
752
753 l${g} $ra,15*$SIZE_T($sp)
754 or $s1,$t1
755 l $t0,16($key)
756 l $t1,20($key)
757
758 llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
759 llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
760 sll $i1,16
761 llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
762 llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
763 sll $i2,8
764 sll $s3,24
765 or $s0,$i1
766 or $s1,$i2
767 or $s2,$t2
768 or $s3,$t3
769
770 xr $s0,$t0
771 xr $s1,$t1
772 x $s2,24($key)
773 x $s3,28($key)
774
775 br $ra
776.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
777___
778
779$code.=<<___;
780# void AES_set_encrypt_key(const unsigned char *in, int bits,
781# AES_KEY *key) {
782.globl private_AES_set_encrypt_key
783.type private_AES_set_encrypt_key,\@function
784.align 16
785private_AES_set_encrypt_key:
786_s390x_AES_set_encrypt_key:
787 lghi $t0,0
788 cl${g}r $inp,$t0
789 je .Lminus1
790 cl${g}r $key,$t0
791 je .Lminus1
792
793 lghi $t0,128
794 clr $bits,$t0
795 je .Lproceed
796 lghi $t0,192
797 clr $bits,$t0
798 je .Lproceed
799 lghi $t0,256
800 clr $bits,$t0
801 je .Lproceed
802 lghi %r2,-2
803 br %r14
804
805.align 16
806.Lproceed:
807___
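In the hardware path that follows, the bits-to-km-code conversion exploits the
fact that the z/Architecture KM function codes for AES-128/192/256 are the
consecutive values 18, 19 and 20, so the code is simply 18 + (bits-128)/64.
The same arithmetic in plain Perl:

for my $bits (128, 192, 256) {
	printf "%d-bit key -> km function code %d\n",
	    $bits, (($bits - 128) >> 6) + 18;
}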
808$code.=<<___ if (!$softonly);
809 # convert bits to km code, [128,192,256]->[18,19,20]
810 lhi %r5,-128
811 lhi %r0,18
812 ar %r5,$bits
813 srl %r5,6
814 ar %r5,%r0
815
816 larl %r1,OPENSSL_s390xcap_P
817 lg %r0,0(%r1)
818 tmhl %r0,0x4000 # check for message-security assist
819 jz .Lekey_internal
820
821 lghi %r0,0 # query capability vector
822 la %r1,16($sp)
823 .long 0xb92f0042 # kmc %r4,%r2
824
825 llihh %r1,0x8000
826 srlg %r1,%r1,0(%r5)
827 ng %r1,16($sp)
828 jz .Lekey_internal
829
830 lmg %r0,%r1,0($inp) # just copy 128 bits...
831 stmg %r0,%r1,0($key)
832 lhi %r0,192
833 cr $bits,%r0
834 jl 1f
835 lg %r1,16($inp)
836 stg %r1,16($key)
837 je 1f
838 lg %r1,24($inp)
839 stg %r1,24($key)
8401: st $bits,236($key) # save bits [for debugging purposes]
841 lgr $t0,%r5
842 st %r5,240($key) # save km code
843 lghi %r2,0
844 br %r14
845___
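For comparison with the .Lekey_internal/.L128_loop software path below, here is
one step of the AES-128 key expansion that the loop performs four bytes at a
time with Te4 lookups, written as a stand-alone Perl sketch. The S-box is
rebuilt from first principles so the snippet runs on its own; all names are
hypothetical, and the expected output is the FIPS-197 appendix A.1 round-1 key.

#!/usr/bin/env perl
use strict;
use warnings;

# Rebuild the AES S-box (GF(2^8) inverse followed by the affine map)
# so the example is self-contained.
my (@sbox, @log, @alog);
{
	my $p = 1;
	for my $i (0 .. 254) {
		$alog[$i] = $p;
		$log[$p] = $i;
		$p ^= (($p << 1) ^ (($p & 0x80) ? 0x11b : 0)) & 0xff; # p *= 3
	}
	for my $x (0 .. 255) {
		my $b = $x ? $alog[(255 - $log[$x]) % 255] : 0;
		my $s = 0x63;
		$s ^= (($b << $_) | ($b >> (8 - $_))) & 0xff for 0 .. 4;
		$sbox[$x] = $s;
	}
}
sub subword {	# S-box applied to each byte of a word
	unpack("N", pack("C4", map { $sbox[$_] } unpack("C4", pack("N", shift))));
}
sub rotword { (($_[0] << 8) | ($_[0] >> 24)) & 0xffffffff }

# One expansion step, rk[4..7] from rk[0..3]; key from FIPS-197 A.1.
my @rk = (0x2b7e1516, 0x28aed2a6, 0xabf71588, 0x09cf4f3c);
my $temp = subword(rotword($rk[3])) ^ 0x01000000;	# ^ rcon[0]
push @rk, $rk[0] ^ $temp;				# rk[4] = rk[0] ^ ...
push @rk, $rk[$_ - 4] ^ $rk[-1] for 5 .. 7;		# rk[n] = rk[n-4] ^ rk[n-1]
printf "rk[%d] = %08x\n", $_, $rk[$_] for 4 .. 7;
# expect a0fafe17 f2c295f2 7a96b943 5935807a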
846$code.=<<___;
847.align 16
848.Lekey_internal:
849 stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
850
851 larl $tbl,AES_Te+2048
852
853 llgf $s0,0($inp)
854 llgf $s1,4($inp)
855 llgf $s2,8($inp)
856 llgf $s3,12($inp)
857 st $s0,0($key)
858 st $s1,4($key)
859 st $s2,8($key)
860 st $s3,12($key)
861 lghi $t0,128
862 cr $bits,$t0
863 jne .Lnot128
864
865 llill $mask,0xff
866 lghi $t3,0 # i=0
867 lghi $rounds,10
868 st $rounds,240($key)
869
870 llgfr $t2,$s3 # temp=rk[3]
871 srlg $i1,$s3,8
872 srlg $i2,$s3,16
873 srlg $i3,$s3,24
874 nr $t2,$mask
875 nr $i1,$mask
876 nr $i2,$mask
877
878.align 16
879.L128_loop:
880 la $t2,0($t2,$tbl)
881 la $i1,0($i1,$tbl)
882 la $i2,0($i2,$tbl)
883 la $i3,0($i3,$tbl)
884 icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
885 icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
886 icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
887 icm $t2,1,0($i3) # Te4[rk[3]>>24]
888 x $t2,256($t3,$tbl) # rcon[i]
889 xr $s0,$t2 # rk[4]=rk[0]^...
890 xr $s1,$s0 # rk[5]=rk[1]^rk[4]
891 xr $s2,$s1 # rk[6]=rk[2]^rk[5]
892 xr $s3,$s2 # rk[7]=rk[3]^rk[6]
893
894 llgfr $t2,$s3 # temp=rk[3]
895 srlg $i1,$s3,8
896 srlg $i2,$s3,16
897 nr $t2,$mask
898 nr $i1,$mask
899 srlg $i3,$s3,24
900 nr $i2,$mask
901
902 st $s0,16($key)
903 st $s1,20($key)
904 st $s2,24($key)
905 st $s3,28($key)
906 la $key,16($key) # key+=4
907 la $t3,4($t3) # i++
908 brct $rounds,.L128_loop
909 lghi $t0,10
910 lghi %r2,0
911 lm${g} %r4,%r13,4*$SIZE_T($sp)
912 br $ra
913
914.align 16
915.Lnot128:
916 llgf $t0,16($inp)
917 llgf $t1,20($inp)
918 st $t0,16($key)
919 st $t1,20($key)
920 lghi $t0,192
921 cr $bits,$t0
922 jne .Lnot192
923
924 llill $mask,0xff
925 lghi $t3,0 # i=0
926 lghi $rounds,12
927 st $rounds,240($key)
928 lghi $rounds,8
929
930 srlg $i1,$t1,8
931 srlg $i2,$t1,16
932 srlg $i3,$t1,24
933 nr $t1,$mask
934 nr $i1,$mask
935 nr $i2,$mask
936
937.align 16
938.L192_loop:
939 la $t1,0($t1,$tbl)
940 la $i1,0($i1,$tbl)
941 la $i2,0($i2,$tbl)
942 la $i3,0($i3,$tbl)
943 icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
944 icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
945 icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
946 icm $t1,1,0($i3) # Te4[rk[5]>>24]
947 x $t1,256($t3,$tbl) # rcon[i]
948 xr $s0,$t1 # rk[6]=rk[0]^...
949 xr $s1,$s0 # rk[7]=rk[1]^rk[6]
950 xr $s2,$s1 # rk[8]=rk[2]^rk[7]
951 xr $s3,$s2 # rk[9]=rk[3]^rk[8]
952
953 st $s0,24($key)
954 st $s1,28($key)
955 st $s2,32($key)
956 st $s3,36($key)
957 brct $rounds,.L192_continue
958 lghi $t0,12
959 lghi %r2,0
960 lm${g} %r4,%r13,4*$SIZE_T($sp)
961 br $ra
962
963.align 16
964.L192_continue:
965 lgr $t1,$s3
966 x $t1,16($key) # rk[10]=rk[4]^rk[9]
967 st $t1,40($key)
968 x $t1,20($key) # rk[11]=rk[5]^rk[10]
969 st $t1,44($key)
970
971 srlg $i1,$t1,8
972 srlg $i2,$t1,16
973 srlg $i3,$t1,24
974 nr $t1,$mask
975 nr $i1,$mask
976 nr $i2,$mask
977
978 la $key,24($key) # key+=6
979 la $t3,4($t3) # i++
980 j .L192_loop
981
982.align 16
983.Lnot192:
984 llgf $t0,24($inp)
985 llgf $t1,28($inp)
986 st $t0,24($key)
987 st $t1,28($key)
988 llill $mask,0xff
989 lghi $t3,0 # i=0
990 lghi $rounds,14
991 st $rounds,240($key)
992 lghi $rounds,7
993
994 srlg $i1,$t1,8
995 srlg $i2,$t1,16
996 srlg $i3,$t1,24
997 nr $t1,$mask
998 nr $i1,$mask
999 nr $i2,$mask
1000
1001.align 16
1002.L256_loop:
1003 la $t1,0($t1,$tbl)
1004 la $i1,0($i1,$tbl)
1005 la $i2,0($i2,$tbl)
1006 la $i3,0($i3,$tbl)
1007 icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
1008 icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
1009 icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
1010 icm $t1,1,0($i3) # Te4[rk[7]>>24]
1011 x $t1,256($t3,$tbl) # rcon[i]
1012 xr $s0,$t1 # rk[8]=rk[0]^...
1013 xr $s1,$s0 # rk[9]=rk[1]^rk[8]
1014 xr $s2,$s1 # rk[10]=rk[2]^rk[9]
1015 xr $s3,$s2 # rk[11]=rk[3]^rk[10]
1016 st $s0,32($key)
1017 st $s1,36($key)
1018 st $s2,40($key)
1019 st $s3,44($key)
1020 brct $rounds,.L256_continue
1021 lghi $t0,14
1022 lghi %r2,0
1023 lm${g} %r4,%r13,4*$SIZE_T($sp)
1024 br $ra
1025
1026.align 16
1027.L256_continue:
1028 lgr $t1,$s3 # temp=rk[11]
1029 srlg $i1,$s3,8
1030 srlg $i2,$s3,16
1031 srlg $i3,$s3,24
1032 nr $t1,$mask
1033 nr $i1,$mask
1034 nr $i2,$mask
1035 la $t1,0($t1,$tbl)
1036 la $i1,0($i1,$tbl)
1037 la $i2,0($i2,$tbl)
1038 la $i3,0($i3,$tbl)
1039 llgc $t1,0($t1) # Te4[rk[11]>>0]
1040 icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
1041 icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
1042 icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
1043 x $t1,16($key) # rk[12]=rk[4]^...
1044 st $t1,48($key)
1045 x $t1,20($key) # rk[13]=rk[5]^rk[12]
1046 st $t1,52($key)
1047 x $t1,24($key) # rk[14]=rk[6]^rk[13]
1048 st $t1,56($key)
1049 x $t1,28($key) # rk[15]=rk[7]^rk[14]
1050 st $t1,60($key)
1051
1052 srlg $i1,$t1,8
1053 srlg $i2,$t1,16
1054 srlg $i3,$t1,24
1055 nr $t1,$mask
1056 nr $i1,$mask
1057 nr $i2,$mask
1058
1059 la $key,32($key) # key+=8
1060 la $t3,4($t3) # i++
1061 j .L256_loop
1062
1063.Lminus1:
1064 lghi %r2,-1
1065 br $ra
1066.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
1067
1068# void AES_set_decrypt_key(const unsigned char *in, int bits,
1069# AES_KEY *key) {
1070.globl private_AES_set_decrypt_key
1071.type private_AES_set_decrypt_key,\@function
1072.align 16
1073private_AES_set_decrypt_key:
1074 #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
1075 st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
1076 bras $ra,_s390x_AES_set_encrypt_key
1077 #l${g} $key,4*$SIZE_T($sp)
1078 l${g} $ra,14*$SIZE_T($sp)
1079 ltgr %r2,%r2
1080 bnzr $ra
1081___
1082$code.=<<___ if (!$softonly);
1083 #l $t0,240($key)
1084 lhi $t1,16
1085 cr $t0,$t1
1086 jl .Lgo
1087 oill $t0,0x80 # set "decrypt" bit
1088 st $t0,240($key)
1089 br $ra
1090___
1091$code.=<<___;
1092.align 16
1093.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
1094 la $i1,0($key)
1095 sllg $i2,$rounds,4
1096 la $i2,0($i2,$key)
1097 srl $rounds,1
1098 lghi $t1,-16
1099
1100.align 16
1101.Linv: lmg $s0,$s1,0($i1)
1102 lmg $s2,$s3,0($i2)
1103 stmg $s0,$s1,0($i2)
1104 stmg $s2,$s3,0($i1)
1105 la $i1,16($i1)
1106 la $i2,0($t1,$i2)
1107 brct $rounds,.Linv
1108___
1109$mask80=$i1;
1110$mask1b=$i2;
1111$maskfe=$i3;
1112$code.=<<___;
1113 llgf $rounds,240($key)
1114 aghi $rounds,-1
1115 sll $rounds,2 # (rounds-1)*4
1116 llilh $mask80,0x8080
1117 llilh $mask1b,0x1b1b
1118 llilh $maskfe,0xfefe
1119 oill $mask80,0x8080
1120 oill $mask1b,0x1b1b
1121 oill $maskfe,0xfefe
1122
1123.align 16
1124.Lmix: l $s0,16($key) # tp1
1125 lr $s1,$s0
1126 ngr $s1,$mask80
1127 srlg $t1,$s1,7
1128 slr $s1,$t1
1129 nr $s1,$mask1b
1130 sllg $t1,$s0,1
1131 nr $t1,$maskfe
1132 xr $s1,$t1 # tp2
1133
1134 lr $s2,$s1
1135 ngr $s2,$mask80
1136 srlg $t1,$s2,7
1137 slr $s2,$t1
1138 nr $s2,$mask1b
1139 sllg $t1,$s1,1
1140 nr $t1,$maskfe
1141 xr $s2,$t1 # tp4
1142
1143 lr $s3,$s2
1144 ngr $s3,$mask80
1145 srlg $t1,$s3,7
1146 slr $s3,$t1
1147 nr $s3,$mask1b
1148 sllg $t1,$s2,1
1149 nr $t1,$maskfe
1150 xr $s3,$t1 # tp8
1151
1152 xr $s1,$s0 # tp2^tp1
1153 xr $s2,$s0 # tp4^tp1
1154 rll $s0,$s0,24 # = ROTATE(tp1,8)
1155 xr $s2,$s3 # ^=tp8
1156 xr $s0,$s1 # ^=tp2^tp1
1157 xr $s1,$s3 # tp2^tp1^tp8
1158 xr $s0,$s2 # ^=tp4^tp1^tp8
1159 rll $s1,$s1,8
1160 rll $s2,$s2,16
1161 xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
1162 rll $s3,$s3,24
1163 xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
1164 xr $s0,$s3 # ^= ROTATE(tp8,8)
1165
1166 st $s0,16($key)
1167 la $key,4($key)
1168 brct $rounds,.Lmix
1169
1170 lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
1171 lghi %r2,0
1172 br $ra
1173.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
1174___
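# The .Lmix loop above is the classic word-wise InvMixColumn applied to
# the encryption round keys. A self-contained sketch of the same math in
# plain Perl, assuming 32-bit big-endian words as in the code above:
sub _xtime4 {				# multiply all four bytes by 2 in GF(2^8)
    my ($x) = @_;
    my $m = $x & 0x80808080;			# msb of every byte
    $m = ($m - ($m >> 7)) & 0x1b1b1b1b;		# 0x80 -> 0x1b per byte
    return ((($x << 1) & 0xfefefefe) ^ $m) & 0xffffffff;
}
sub _rotl { my ($x,$n) = @_; (($x << $n) | ($x >> (32-$n))) & 0xffffffff; }
sub _inv_mix_column {
    my $tp1 = shift;
    my $tp2 = _xtime4($tp1);
    my $tp4 = _xtime4($tp2);
    my $tp8 = _xtime4($tp4);
    return ($tp2 ^ $tp4 ^ $tp8)			# 0x0e on the byte itself
	 ^ _rotl($tp1 ^ $tp2 ^ $tp8,  8)
	 ^ _rotl($tp1 ^ $tp4 ^ $tp8, 16)
	 ^ _rotl($tp1 ^ $tp8,        24);
}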
1175
1176########################################################################
1177# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1178# size_t length, const AES_KEY *key,
1179# unsigned char *ivec, const int enc)
1180{
1181my $inp="%r2";
1182my $out="%r4"; # length and out are swapped
1183my $len="%r3";
1184my $key="%r5";
1185my $ivp="%r6";
1186
1187$code.=<<___;
1188.globl AES_cbc_encrypt
1189.type AES_cbc_encrypt,\@function
1190.align 16
1191AES_cbc_encrypt:
1192 xgr %r3,%r4 # flip %r3 and %r4, out and len
1193 xgr %r4,%r3
1194 xgr %r3,%r4
1195___
1196$code.=<<___ if (!$softonly);
1197 lhi %r0,16
1198 cl %r0,240($key)
1199 jh .Lcbc_software
1200
1201 lg %r0,0($ivp) # copy ivec
1202 lg %r1,8($ivp)
1203 stmg %r0,%r1,16($sp)
1204 lmg %r0,%r1,0($key) # copy key, cover 256 bit
1205 stmg %r0,%r1,32($sp)
1206 lmg %r0,%r1,16($key)
1207 stmg %r0,%r1,48($sp)
1208 l %r0,240($key) # load kmc code
1209 lghi $key,15 # res=len%16, len-=res;
1210 ngr $key,$len
1211 sl${g}r $len,$key
1212 la %r1,16($sp) # parameter block - ivec || key
1213 jz .Lkmc_truncated
1214 .long 0xb92f0042 # kmc %r4,%r2
1215 brc 1,.-4 # pay attention to "partial completion"
1216 ltr $key,$key
1217 jnz .Lkmc_truncated
1218.Lkmc_done:
1219 lmg %r0,%r1,16($sp) # copy ivec to caller
1220 stg %r0,0($ivp)
1221 stg %r1,8($ivp)
1222 br $ra
1223.align 16
1224.Lkmc_truncated:
1225 ahi $key,-1 # it's the way it's encoded in mvc
1226 tmll %r0,0x80
1227 jnz .Lkmc_truncated_dec
1228 lghi %r1,0
1229 stg %r1,16*$SIZE_T($sp)
1230 stg %r1,16*$SIZE_T+8($sp)
1231 bras %r1,1f
1232 mvc 16*$SIZE_T(1,$sp),0($inp)
12331: ex $key,0(%r1)
1234 la %r1,16($sp) # restore parameter block
1235 la $inp,16*$SIZE_T($sp)
1236 lghi $len,16
1237 .long 0xb92f0042 # kmc %r4,%r2
1238 j .Lkmc_done
1239.align 16
1240.Lkmc_truncated_dec:
1241 st${g} $out,4*$SIZE_T($sp)
1242 la $out,16*$SIZE_T($sp)
1243 lghi $len,16
1244 .long 0xb92f0042 # kmc %r4,%r2
1245 l${g} $out,4*$SIZE_T($sp)
1246 bras %r1,2f
1247 mvc 0(1,$out),16*$SIZE_T($sp)
12482: ex $key,0(%r1)
1249 j .Lkmc_done
1250.align 16
1251.Lcbc_software:
1252___
1253$code.=<<___;
1254 stm${g} $key,$ra,5*$SIZE_T($sp)
1255 lhi %r0,0
1256 cl %r0,`$stdframe+$SIZE_T-4`($sp)
1257 je .Lcbc_decrypt
1258
1259 larl $tbl,AES_Te
1260
1261 llgf $s0,0($ivp)
1262 llgf $s1,4($ivp)
1263 llgf $s2,8($ivp)
1264 llgf $s3,12($ivp)
1265
1266 lghi $t0,16
1267 sl${g}r $len,$t0
1268 brc 4,.Lcbc_enc_tail # if borrow
1269.Lcbc_enc_loop:
1270 stm${g} $inp,$out,2*$SIZE_T($sp)
1271 x $s0,0($inp)
1272 x $s1,4($inp)
1273 x $s2,8($inp)
1274 x $s3,12($inp)
1275 lgr %r4,$key
1276
1277 bras $ra,_s390x_AES_encrypt
1278
1279 lm${g} $inp,$key,2*$SIZE_T($sp)
1280 st $s0,0($out)
1281 st $s1,4($out)
1282 st $s2,8($out)
1283 st $s3,12($out)
1284
1285 la $inp,16($inp)
1286 la $out,16($out)
1287 lghi $t0,16
1288 lt${g}r $len,$len
1289 jz .Lcbc_enc_done
1290 sl${g}r $len,$t0
1291 brc 4,.Lcbc_enc_tail # if borrow
1292 j .Lcbc_enc_loop
1293.align 16
1294.Lcbc_enc_done:
1295 l${g} $ivp,6*$SIZE_T($sp)
1296 st $s0,0($ivp)
1297 st $s1,4($ivp)
1298 st $s2,8($ivp)
1299 st $s3,12($ivp)
1300
1301 lm${g} %r7,$ra,7*$SIZE_T($sp)
1302 br $ra
1303
1304.align 16
1305.Lcbc_enc_tail:
1306 aghi $len,15
1307 lghi $t0,0
1308 stg $t0,16*$SIZE_T($sp)
1309 stg $t0,16*$SIZE_T+8($sp)
1310 bras $t1,3f
1311 mvc 16*$SIZE_T(1,$sp),0($inp)
13123: ex $len,0($t1)
1313 lghi $len,0
1314 la $inp,16*$SIZE_T($sp)
1315 j .Lcbc_enc_loop
1316
1317.align 16
1318.Lcbc_decrypt:
1319 larl $tbl,AES_Td
1320
1321 lg $t0,0($ivp)
1322 lg $t1,8($ivp)
1323 stmg $t0,$t1,16*$SIZE_T($sp)
1324
1325.Lcbc_dec_loop:
1326 stm${g} $inp,$out,2*$SIZE_T($sp)
1327 llgf $s0,0($inp)
1328 llgf $s1,4($inp)
1329 llgf $s2,8($inp)
1330 llgf $s3,12($inp)
1331 lgr %r4,$key
1332
1333 bras $ra,_s390x_AES_decrypt
1334
1335 lm${g} $inp,$key,2*$SIZE_T($sp)
1336 sllg $s0,$s0,32
1337 sllg $s2,$s2,32
1338 lr $s0,$s1
1339 lr $s2,$s3
1340
1341 lg $t0,0($inp)
1342 lg $t1,8($inp)
1343 xg $s0,16*$SIZE_T($sp)
1344 xg $s2,16*$SIZE_T+8($sp)
1345 lghi $s1,16
1346 sl${g}r $len,$s1
1347 brc 4,.Lcbc_dec_tail # if borrow
1348 brc 2,.Lcbc_dec_done # if zero
1349 stg $s0,0($out)
1350 stg $s2,8($out)
1351 stmg $t0,$t1,16*$SIZE_T($sp)
1352
1353 la $inp,16($inp)
1354 la $out,16($out)
1355 j .Lcbc_dec_loop
1356
1357.Lcbc_dec_done:
1358 stg $s0,0($out)
1359 stg $s2,8($out)
1360.Lcbc_dec_exit:
1361 lm${g} %r6,$ra,6*$SIZE_T($sp)
1362 stmg $t0,$t1,0($ivp)
1363
1364 br $ra
1365
1366.align 16
1367.Lcbc_dec_tail:
1368 aghi $len,15
1369 stg $s0,16*$SIZE_T($sp)
1370 stg $s2,16*$SIZE_T+8($sp)
1371 bras $s1,4f
1372 mvc 0(1,$out),16*$SIZE_T($sp)
13734: ex $len,0($s1)
1374 j .Lcbc_dec_exit
1375.size AES_cbc_encrypt,.-AES_cbc_encrypt
1376___
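# The software fallback above implements textbook CBC chaining. A short
# sketch of the encrypt direction (aes_block() is a hypothetical stand-in
# for one AES block operation on a 16-byte string):
sub _cbc_encrypt_blocks {
    my ($iv, @blocks) = @_;		# $iv and each block: 16-byte strings
    my @out;
    for my $p (@blocks) {
	$iv = aes_block($iv ^ $p);	# xor with previous ciphertext, encrypt
	push @out, $iv;
    }
    return @out;			# the final $iv goes back to *ivec
}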
1377}
1378########################################################################
1379# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
1380# size_t blocks, const AES_KEY *key,
1381# const unsigned char *ivec)
1382{
1383my $inp="%r2";
1384my $out="%r4"; # blocks and out are swapped
1385my $len="%r3";
1386my $key="%r5"; my $iv0="%r5";
1387my $ivp="%r6";
1388my $fp ="%r7";
1389
1390$code.=<<___;
1391.globl AES_ctr32_encrypt
1392.type AES_ctr32_encrypt,\@function
1393.align 16
1394AES_ctr32_encrypt:
1395 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1396 xgr %r4,%r3
1397 xgr %r3,%r4
1398 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
1399___
1400$code.=<<___ if (!$softonly);
1401 l %r0,240($key)
1402 lhi %r1,16
1403 clr %r0,%r1
1404 jl .Lctr32_software
1405
1406 stm${g} %r6,$s3,6*$SIZE_T($sp)
1407
1408 slgr $out,$inp
1409 la %r1,0($key) # %r1 is permanent copy of $key
1410 lg $iv0,0($ivp) # load ivec
1411 lg $ivp,8($ivp)
1412
1413 # prepare and allocate stack frame at the top of 4K page
1414 # with 1K reserved for eventual signal handling
 1415 	lghi	$s0,-1024-256-16	# guarantee at least 256-byte buffer
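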
1416 lghi $s1,-4096
1417 algr $s0,$sp
1418 lgr $fp,$sp
1419 ngr $s0,$s1 # align at page boundary
1420 slgr $fp,$s0 # total buffer size
1421 lgr $s2,$sp
1422 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1423 slgr $fp,$s1 # deduct reservation to get usable buffer size
 1424 	# buffer size is at least 256 and at most 3072+256-16
1425
1426 la $sp,1024($s0) # alloca
1427 srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
1428 st${g} $s2,0($sp) # back-chain
1429 st${g} $fp,$SIZE_T($sp)
1430
1431 slgr $len,$fp
1432 brc 1,.Lctr32_hw_switch # not zero, no borrow
1433 algr $fp,$len # input is shorter than allocated buffer
1434 lghi $len,0
1435 st${g} $fp,$SIZE_T($sp)
1436
1437.Lctr32_hw_switch:
1438___
1439$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower
1440 larl $s0,OPENSSL_s390xcap_P
1441 lg $s0,8($s0)
1442 tmhh $s0,0x0004 # check for message_security-assist-4
1443 jz .Lctr32_km_loop
1444
1445 llgfr $s0,%r0
1446 lgr $s1,%r1
1447 lghi %r0,0
1448 la %r1,16($sp)
1449 .long 0xb92d2042 # kmctr %r4,%r2,%r2
1450
1451 llihh %r0,0x8000 # check if kmctr supports the function code
1452 srlg %r0,%r0,0($s0)
1453 ng %r0,16($sp)
1454 lgr %r0,$s0
1455 lgr %r1,$s1
1456 jz .Lctr32_km_loop
1457
1458####### kmctr code
1459 algr $out,$inp # restore $out
1460 lgr $s1,$len # $s1 undertakes $len
1461 j .Lctr32_kmctr_loop
1462.align 16
1463.Lctr32_kmctr_loop:
1464 la $s2,16($sp)
1465 lgr $s3,$fp
1466.Lctr32_kmctr_prepare:
1467 stg $iv0,0($s2)
1468 stg $ivp,8($s2)
1469 la $s2,16($s2)
1470 ahi $ivp,1 # 32-bit increment, preserves upper half
1471 brct $s3,.Lctr32_kmctr_prepare
1472
1473 #la $inp,0($inp) # inp
1474 sllg $len,$fp,4 # len
1475 #la $out,0($out) # out
1476 la $s2,16($sp) # iv
1477 .long 0xb92da042 # kmctr $out,$s2,$inp
1478 brc 1,.-4 # pay attention to "partial completion"
1479
1480 slgr $s1,$fp
1481 brc 1,.Lctr32_kmctr_loop # not zero, no borrow
1482 algr $fp,$s1
1483 lghi $s1,0
1484 brc 4+1,.Lctr32_kmctr_loop # not zero
1485
1486 l${g} $sp,0($sp)
1487 lm${g} %r6,$s3,6*$SIZE_T($sp)
1488 br $ra
1489.align 16
1490___
1491$code.=<<___;
1492.Lctr32_km_loop:
1493 la $s2,16($sp)
1494 lgr $s3,$fp
1495.Lctr32_km_prepare:
1496 stg $iv0,0($s2)
1497 stg $ivp,8($s2)
1498 la $s2,16($s2)
1499 ahi $ivp,1 # 32-bit increment, preserves upper half
1500 brct $s3,.Lctr32_km_prepare
1501
1502 la $s0,16($sp) # inp
1503 sllg $s1,$fp,4 # len
1504 la $s2,16($sp) # out
1505 .long 0xb92e00a8 # km %r10,%r8
1506 brc 1,.-4 # pay attention to "partial completion"
1507
1508 la $s2,16($sp)
1509 lgr $s3,$fp
1510 slgr $s2,$inp
1511.Lctr32_km_xor:
1512 lg $s0,0($inp)
1513 lg $s1,8($inp)
1514 xg $s0,0($s2,$inp)
1515 xg $s1,8($s2,$inp)
1516 stg $s0,0($out,$inp)
1517 stg $s1,8($out,$inp)
1518 la $inp,16($inp)
1519 brct $s3,.Lctr32_km_xor
1520
1521 slgr $len,$fp
1522 brc 1,.Lctr32_km_loop # not zero, no borrow
1523 algr $fp,$len
1524 lghi $len,0
1525 brc 4+1,.Lctr32_km_loop # not zero
1526
1527 l${g} $s0,0($sp)
1528 l${g} $s1,$SIZE_T($sp)
1529 la $s2,16($sp)
1530.Lctr32_km_zap:
1531 stg $s0,0($s2)
1532 stg $s0,8($s2)
1533 la $s2,16($s2)
1534 brct $s1,.Lctr32_km_zap
1535
1536 la $sp,0($s0)
1537 lm${g} %r6,$s3,6*$SIZE_T($sp)
1538 br $ra
1539.align 16
1540.Lctr32_software:
1541___
1542$code.=<<___;
1543 stm${g} $key,$ra,5*$SIZE_T($sp)
1544 sl${g}r $inp,$out
1545 larl $tbl,AES_Te
1546 llgf $t1,12($ivp)
1547
1548.Lctr32_loop:
1549 stm${g} $inp,$out,2*$SIZE_T($sp)
1550 llgf $s0,0($ivp)
1551 llgf $s1,4($ivp)
1552 llgf $s2,8($ivp)
1553 lgr $s3,$t1
1554 st $t1,16*$SIZE_T($sp)
1555 lgr %r4,$key
1556
1557 bras $ra,_s390x_AES_encrypt
1558
1559 lm${g} $inp,$ivp,2*$SIZE_T($sp)
1560 llgf $t1,16*$SIZE_T($sp)
1561 x $s0,0($inp,$out)
1562 x $s1,4($inp,$out)
1563 x $s2,8($inp,$out)
1564 x $s3,12($inp,$out)
1565 stm $s0,$s3,0($out)
1566
1567 la $out,16($out)
1568 ahi $t1,1 # 32-bit increment
1569 brct $len,.Lctr32_loop
1570
1571 lm${g} %r6,$ra,6*$SIZE_T($sp)
1572 br $ra
1573.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
1574___
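# The counter stepping in .Lctr32_km_prepare relies on "ahi" touching
# only the low 32 bits of a 64-bit register. A sketch of the same rule
# (assumes a 64-bit perl; the IV is held as two 64-bit halves):
sub _next_counter {
    my ($hi8, $lo8) = @_;
    my $upper = $lo8 & 0xffffffff00000000;		# preserved, as with ahi
    my $ctr   = (($lo8 & 0xffffffff) + 1) & 0xffffffff;# 32-bit wrap-around
    return ($hi8, $upper | $ctr);
}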
1575}
1576
1577########################################################################
1578# void AES_xts_encrypt(const char *inp,char *out,size_t len,
1579# const AES_KEY *key1, const AES_KEY *key2,
1580# const unsigned char iv[16]);
1581#
1582{
1583my $inp="%r2";
1584my $out="%r4"; # len and out are swapped
1585my $len="%r3";
1586my $key1="%r5"; # $i1
1587my $key2="%r6"; # $i2
1588my $fp="%r7"; # $i3
1589my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
1590
1591$code.=<<___;
1592.type _s390x_xts_km,\@function
1593.align 16
1594_s390x_xts_km:
1595___
1596$code.=<<___ if(1);
1597 llgfr $s0,%r0 # put aside the function code
1598 lghi $s1,0x7f
1599 nr $s1,%r0
1600 lghi %r0,0 # query capability vector
1601 la %r1,2*$SIZE_T($sp)
1602 .long 0xb92e0042 # km %r4,%r2
1603 llihh %r1,0x8000
1604 srlg %r1,%r1,32($s1) # check for 32+function code
1605 ng %r1,2*$SIZE_T($sp)
1606 lgr %r0,$s0 # restore the function code
1607 la %r1,0($key1) # restore $key1
1608 jz .Lxts_km_vanilla
1609
1610 lmg $i2,$i3,$tweak($sp) # put aside the tweak value
1611 algr $out,$inp
1612
1613 oill %r0,32 # switch to xts function code
1614 aghi $s1,-18 #
1615 sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
1616 la %r1,$tweak-16($sp)
1617 slgr %r1,$s1 # parameter block position
1618 lmg $s0,$s3,0($key1) # load 256 bits of key material,
1619 stmg $s0,$s3,0(%r1) # and copy it to parameter block.
1620 # yes, it contains junk and overlaps
1621 # with the tweak in 128-bit case.
1622 # it's done to avoid conditional
1623 # branch.
1624 stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
1625
1626 .long 0xb92e0042 # km %r4,%r2
1627 brc 1,.-4 # pay attention to "partial completion"
1628
1629 lrvg $s0,$tweak+0($sp) # load the last tweak
1630 lrvg $s1,$tweak+8($sp)
1631 stmg %r0,%r3,$tweak-32(%r1) # wipe copy of the key
1632
1633 nill %r0,0xffdf # switch back to original function code
1634 la %r1,0($key1) # restore pointer to $key1
1635 slgr $out,$inp
1636
1637 llgc $len,2*$SIZE_T-1($sp)
1638 nill $len,0x0f # $len%=16
1639 br $ra
1640
1641.align 16
1642.Lxts_km_vanilla:
1643___
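# A sketch of the capability probe just above (assumes the caller passes
# the first 8 bytes the km query stored): bit N, counted msb-first,
# reports support for function code N, and the xts codes are the plain
# AES codes plus 32 -- hence the "32+function code" shift.
sub _km_supports {
    my ($mask, $fc) = @_;		# e.g. $fc = 18+32 for xts-aes-128
    return ($mask >> (63 - $fc)) & 1;
}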
1644$code.=<<___;
1645 # prepare and allocate stack frame at the top of 4K page
1646 # with 1K reserved for eventual signal handling
 1647 	lghi	$s0,-1024-256-16	# guarantee at least 256-byte buffer
1648 lghi $s1,-4096
1649 algr $s0,$sp
1650 lgr $fp,$sp
1651 ngr $s0,$s1 # align at page boundary
1652 slgr $fp,$s0 # total buffer size
1653 lgr $s2,$sp
1654 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1655 slgr $fp,$s1 # deduct reservation to get usable buffer size
 1656 	# buffer size is at least 256 and at most 3072+256-16
1657
1658 la $sp,1024($s0) # alloca
1659 nill $fp,0xfff0 # round to 16*n
1660 st${g} $s2,0($sp) # back-chain
1661 nill $len,0xfff0 # redundant
1662 st${g} $fp,$SIZE_T($sp)
1663
1664 slgr $len,$fp
1665 brc 1,.Lxts_km_go # not zero, no borrow
1666 algr $fp,$len # input is shorter than allocated buffer
1667 lghi $len,0
1668 st${g} $fp,$SIZE_T($sp)
1669
1670.Lxts_km_go:
1671 lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
1672 lrvg $s1,$tweak+8($s2)
1673
1674 la $s2,16($sp) # vector of ascending tweak values
1675 slgr $s2,$inp
1676 srlg $s3,$fp,4
1677 j .Lxts_km_start
1678
1679.Lxts_km_loop:
1680 la $s2,16($sp)
1681 slgr $s2,$inp
1682 srlg $s3,$fp,4
1683.Lxts_km_prepare:
1684 lghi $i1,0x87
1685 srag $i2,$s1,63 # broadcast upper bit
1686 ngr $i1,$i2 # rem
1687 srlg $i2,$s0,63 # carry bit from lower half
1688 sllg $s0,$s0,1
1689 sllg $s1,$s1,1
1690 xgr $s0,$i1
1691 ogr $s1,$i2
1692.Lxts_km_start:
1693 lrvgr $i1,$s0 # flip byte order
1694 lrvgr $i2,$s1
1695 stg $i1,0($s2,$inp)
1696 stg $i2,8($s2,$inp)
1697 xg $i1,0($inp)
1698 xg $i2,8($inp)
1699 stg $i1,0($out,$inp)
1700 stg $i2,8($out,$inp)
1701 la $inp,16($inp)
1702 brct $s3,.Lxts_km_prepare
1703
1704 slgr $inp,$fp # rewind $inp
1705 la $s2,0($out,$inp)
1706 lgr $s3,$fp
1707 .long 0xb92e00aa # km $s2,$s2
1708 brc 1,.-4 # pay attention to "partial completion"
1709
1710 la $s2,16($sp)
1711 slgr $s2,$inp
1712 srlg $s3,$fp,4
1713.Lxts_km_xor:
1714 lg $i1,0($out,$inp)
1715 lg $i2,8($out,$inp)
1716 xg $i1,0($s2,$inp)
1717 xg $i2,8($s2,$inp)
1718 stg $i1,0($out,$inp)
1719 stg $i2,8($out,$inp)
1720 la $inp,16($inp)
1721 brct $s3,.Lxts_km_xor
1722
1723 slgr $len,$fp
1724 brc 1,.Lxts_km_loop # not zero, no borrow
1725 algr $fp,$len
1726 lghi $len,0
1727 brc 4+1,.Lxts_km_loop # not zero
1728
1729 l${g} $i1,0($sp) # back-chain
1730 llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
1731 la $i2,16($sp)
1732 srlg $fp,$fp,4
1733.Lxts_km_zap:
1734 stg $i1,0($i2)
1735 stg $i1,8($i2)
1736 la $i2,16($i2)
1737 brct $fp,.Lxts_km_zap
1738
1739 la $sp,0($i1)
1740 llgc $len,2*$SIZE_T-1($i1)
1741 nill $len,0x0f # $len%=16
1742 bzr $ra
1743
1744 # generate one more tweak...
1745 lghi $i1,0x87
1746 srag $i2,$s1,63 # broadcast upper bit
1747 ngr $i1,$i2 # rem
1748 srlg $i2,$s0,63 # carry bit from lower half
1749 sllg $s0,$s0,1
1750 sllg $s1,$s1,1
1751 xgr $s0,$i1
1752 ogr $s1,$i2
1753
1754 ltr $len,$len # clear zero flag
1755 br $ra
1756.size _s390x_xts_km,.-_s390x_xts_km
1757
1758.globl AES_xts_encrypt
1759.type AES_xts_encrypt,\@function
1760.align 16
1761AES_xts_encrypt:
1762 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1763 xgr %r4,%r3
1764 xgr %r3,%r4
1765___
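# Every tweak update in the xts paths above and below is the same
# operation: multiply the 128-bit tweak by x in GF(2^128) modulo
# x^128+x^7+x^2+x+1, which is where the 0x87 constant comes from. A
# sketch on two little-endian 64-bit halves, mirroring the
# srag/ngr/srlg/sllg/xgr/ogr sequence (assumes a 64-bit perl):
sub _xts_double {
    my ($lo, $hi) = @_;
    my $rem   = ($hi >> 63) ? 0x87 : 0;		# reduce when msb falls off
    my $carry = $lo >> 63;			# carry low half -> high half
    $hi = (($hi & 0x7fffffffffffffff) << 1) | $carry;
    $lo = (($lo & 0x7fffffffffffffff) << 1) ^ $rem;
    return ($lo, $hi);
}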
1766$code.=<<___ if ($SIZE_T==4);
1767 llgfr $len,$len
1768___
1769$code.=<<___;
1770 st${g} $len,1*$SIZE_T($sp) # save copy of $len
 1771 	srag	$len,$len,4		# formally wrong, because it extends
 1772 					# the sign bit, but who can afford
 1773 					# asking to process more than 2^63-1
 1774 					# bytes? I use it because it sets the
 1775 					# condition code...
1776 bcr 8,$ra # abort if zero (i.e. less than 16)
1777___
1778$code.=<<___ if (!$softonly);
1779 llgf %r0,240($key2)
1780 lhi %r1,16
1781 clr %r0,%r1
1782 jl .Lxts_enc_software
1783
1784 stm${g} %r6,$s3,6*$SIZE_T($sp)
1785 st${g} $ra,14*$SIZE_T($sp)
1786
1787 sllg $len,$len,4 # $len&=~15
1788 slgr $out,$inp
1789
1790 # generate the tweak value
1791 l${g} $s3,$stdframe($sp) # pointer to iv
1792 la $s2,$tweak($sp)
1793 lmg $s0,$s1,0($s3)
1794 lghi $s3,16
1795 stmg $s0,$s1,0($s2)
1796 la %r1,0($key2) # $key2 is not needed anymore
1797 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1798 brc 1,.-4 # can this happen?
1799
1800 l %r0,240($key1)
1801 la %r1,0($key1) # $key1 is not needed anymore
1802 bras $ra,_s390x_xts_km
1803 jz .Lxts_enc_km_done
1804
1805 aghi $inp,-16 # take one step back
1806 la $i3,0($out,$inp) # put aside real $out
1807.Lxts_enc_km_steal:
1808 llgc $i1,16($inp)
1809 llgc $i2,0($out,$inp)
1810 stc $i1,0($out,$inp)
1811 stc $i2,16($out,$inp)
1812 la $inp,1($inp)
1813 brct $len,.Lxts_enc_km_steal
1814
1815 la $s2,0($i3)
1816 lghi $s3,16
1817 lrvgr $i1,$s0 # flip byte order
1818 lrvgr $i2,$s1
1819 xg $i1,0($s2)
1820 xg $i2,8($s2)
1821 stg $i1,0($s2)
1822 stg $i2,8($s2)
1823 .long 0xb92e00aa # km $s2,$s2
1824 brc 1,.-4 # can this happen?
1825 lrvgr $i1,$s0 # flip byte order
1826 lrvgr $i2,$s1
1827 xg $i1,0($i3)
1828 xg $i2,8($i3)
1829 stg $i1,0($i3)
1830 stg $i2,8($i3)
1831
1832.Lxts_enc_km_done:
1833 l${g} $ra,14*$SIZE_T($sp)
 1834 	stg	$sp,$tweak+0($sp)	# wipe tweak
 1835 	stg	$sp,$tweak+8($sp)
1836 lm${g} %r6,$s3,6*$SIZE_T($sp)
1837 br $ra
1838.align 16
1839.Lxts_enc_software:
1840___
1841$code.=<<___;
1842 stm${g} %r6,$ra,6*$SIZE_T($sp)
1843
1844 slgr $out,$inp
1845
1846 xgr $s0,$s0 # clear upper half
1847 xgr $s1,$s1
1848 lrv $s0,$stdframe+4($sp) # load secno
1849 lrv $s1,$stdframe+0($sp)
1850 xgr $s2,$s2
1851 xgr $s3,$s3
1852 stm${g} %r2,%r5,2*$SIZE_T($sp)
1853 la $key,0($key2)
1854 larl $tbl,AES_Te
1855 bras $ra,_s390x_AES_encrypt # generate the tweak
1856 lm${g} %r2,%r5,2*$SIZE_T($sp)
1857 stm $s0,$s3,$tweak($sp) # save the tweak
1858 j .Lxts_enc_enter
1859
1860.align 16
1861.Lxts_enc_loop:
1862 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1863 lrvg $s3,$tweak+8($sp)
1864 lghi %r1,0x87
1865 srag %r0,$s3,63 # broadcast upper bit
1866 ngr %r1,%r0 # rem
1867 srlg %r0,$s1,63 # carry bit from lower half
1868 sllg $s1,$s1,1
1869 sllg $s3,$s3,1
1870 xgr $s1,%r1
1871 ogr $s3,%r0
1872 lrvgr $s1,$s1 # flip byte order
1873 lrvgr $s3,$s3
1874 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1875 stg $s1,$tweak+0($sp) # save the tweak
1876 llgfr $s1,$s1
1877 srlg $s2,$s3,32
1878 stg $s3,$tweak+8($sp)
1879 llgfr $s3,$s3
1880 la $inp,16($inp) # $inp+=16
1881.Lxts_enc_enter:
1882 x $s0,0($inp) # ^=*($inp)
1883 x $s1,4($inp)
1884 x $s2,8($inp)
1885 x $s3,12($inp)
1886 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
1887 la $key,0($key1)
1888 bras $ra,_s390x_AES_encrypt
1889 lm${g} %r2,%r5,2*$SIZE_T($sp)
1890 x $s0,$tweak+0($sp) # ^=tweak
1891 x $s1,$tweak+4($sp)
1892 x $s2,$tweak+8($sp)
1893 x $s3,$tweak+12($sp)
1894 st $s0,0($out,$inp)
1895 st $s1,4($out,$inp)
1896 st $s2,8($out,$inp)
1897 st $s3,12($out,$inp)
1898 brct${g} $len,.Lxts_enc_loop
1899
1900 llgc $len,`2*$SIZE_T-1`($sp)
1901 nill $len,0x0f # $len%16
1902 jz .Lxts_enc_done
1903
1904 la $i3,0($inp,$out) # put aside real $out
1905.Lxts_enc_steal:
1906 llgc %r0,16($inp)
1907 llgc %r1,0($out,$inp)
1908 stc %r0,0($out,$inp)
1909 stc %r1,16($out,$inp)
1910 la $inp,1($inp)
1911 brct $len,.Lxts_enc_steal
1912 la $out,0($i3) # restore real $out
1913
1914 # generate last tweak...
1915 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1916 lrvg $s3,$tweak+8($sp)
1917 lghi %r1,0x87
1918 srag %r0,$s3,63 # broadcast upper bit
1919 ngr %r1,%r0 # rem
1920 srlg %r0,$s1,63 # carry bit from lower half
1921 sllg $s1,$s1,1
1922 sllg $s3,$s3,1
1923 xgr $s1,%r1
1924 ogr $s3,%r0
1925 lrvgr $s1,$s1 # flip byte order
1926 lrvgr $s3,$s3
1927 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1928 stg $s1,$tweak+0($sp) # save the tweak
1929 llgfr $s1,$s1
1930 srlg $s2,$s3,32
1931 stg $s3,$tweak+8($sp)
1932 llgfr $s3,$s3
1933
 1934 	x	$s0,0($out)		# ^=*(inp)|stolen cipher-text
1935 x $s1,4($out)
1936 x $s2,8($out)
1937 x $s3,12($out)
1938 st${g} $out,4*$SIZE_T($sp)
1939 la $key,0($key1)
1940 bras $ra,_s390x_AES_encrypt
1941 l${g} $out,4*$SIZE_T($sp)
1942 x $s0,`$tweak+0`($sp) # ^=tweak
1943 x $s1,`$tweak+4`($sp)
1944 x $s2,`$tweak+8`($sp)
1945 x $s3,`$tweak+12`($sp)
1946 st $s0,0($out)
1947 st $s1,4($out)
1948 st $s2,8($out)
1949 st $s3,12($out)
1950
1951.Lxts_enc_done:
1952 stg $sp,$tweak+0($sp) # wipe tweak
 1953 	stg	$sp,$tweak+8($sp)
1954 lm${g} %r6,$ra,6*$SIZE_T($sp)
1955 br $ra
1956.size AES_xts_encrypt,.-AES_xts_encrypt
1957___
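# The .Lxts_enc_steal path implements standard xts ciphertext stealing.
# A sketch of the tail handling (enc_block() is a hypothetical stand-in
# for one tweaked block encryption; $cc is the last full ciphertext
# block, $pm the 1..15 trailing plaintext bytes):
sub _xts_enc_tail {
    my ($cc, $pm) = @_;
    my $b    = length($pm);
    my $c_m  = substr($cc, 0, $b);	# short final ciphertext
    my $pp   = $pm . substr($cc, $b);	# P_m || stolen ciphertext tail
    my $c_m1 = enc_block($pp);		# re-encrypt under the next tweak
    return ($c_m1, $c_m);		# full block first, then the short one
}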
1958# void AES_xts_decrypt(const char *inp,char *out,size_t len,
1959# const AES_KEY *key1, const AES_KEY *key2,u64 secno);
1960#
1961$code.=<<___;
1962.globl AES_xts_decrypt
1963.type AES_xts_decrypt,\@function
1964.align 16
1965AES_xts_decrypt:
1966 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1967 xgr %r4,%r3
1968 xgr %r3,%r4
1969___
1970$code.=<<___ if ($SIZE_T==4);
1971 llgfr $len,$len
1972___
1973$code.=<<___;
1974 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1975 aghi $len,-16
 1976 	bcr	4,$ra			# abort if less than zero. Formally
 1977 					# wrong, because $len is unsigned,
1978 # but who can afford asking to
1979 # process more than 2^63-1 bytes?
1980 tmll $len,0x0f
1981 jnz .Lxts_dec_proceed
1982 aghi $len,16
1983.Lxts_dec_proceed:
1984___
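# The length juggling above decides whether the final full block must be
# held back for ciphertext stealing. The same rule as a sketch:
sub _xts_dec_bulk_len {
    my ($len) = @_;
    return undef if $len < 16;		# abort, as the bcr 4,$ra above
    $len -= 16;				# provisionally reserve one block
    $len += 16 if ($len & 15) == 0;	# no residue: nothing to steal
    return $len & ~15;			# bytes for the bulk loop
}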
1985$code.=<<___ if (!$softonly);
1986 llgf %r0,240($key2)
1987 lhi %r1,16
1988 clr %r0,%r1
1989 jl .Lxts_dec_software
1990
1991 stm${g} %r6,$s3,6*$SIZE_T($sp)
1992 st${g} $ra,14*$SIZE_T($sp)
1993
1994 nill $len,0xfff0 # $len&=~15
1995 slgr $out,$inp
1996
1997 # generate the tweak value
1998 l${g} $s3,$stdframe($sp) # pointer to iv
1999 la $s2,$tweak($sp)
2000 lmg $s0,$s1,0($s3)
2001 lghi $s3,16
2002 stmg $s0,$s1,0($s2)
2003 la %r1,0($key2) # $key2 is not needed past this point
2004 .long 0xb92e00aa # km $s2,$s2, generate the tweak
2005 brc 1,.-4 # can this happen?
2006
2007 l %r0,240($key1)
2008 la %r1,0($key1) # $key1 is not needed anymore
2009
2010 ltgr $len,$len
2011 jz .Lxts_dec_km_short
2012 bras $ra,_s390x_xts_km
2013 jz .Lxts_dec_km_done
2014
2015 lrvgr $s2,$s0 # make copy in reverse byte order
2016 lrvgr $s3,$s1
2017 j .Lxts_dec_km_2ndtweak
2018
2019.Lxts_dec_km_short:
2020 llgc $len,`2*$SIZE_T-1`($sp)
2021 nill $len,0x0f # $len%=16
2022 lrvg $s0,$tweak+0($sp) # load the tweak
2023 lrvg $s1,$tweak+8($sp)
2024 lrvgr $s2,$s0 # make copy in reverse byte order
2025 lrvgr $s3,$s1
2026
2027.Lxts_dec_km_2ndtweak:
2028 lghi $i1,0x87
2029 srag $i2,$s1,63 # broadcast upper bit
2030 ngr $i1,$i2 # rem
2031 srlg $i2,$s0,63 # carry bit from lower half
2032 sllg $s0,$s0,1
2033 sllg $s1,$s1,1
2034 xgr $s0,$i1
2035 ogr $s1,$i2
2036 lrvgr $i1,$s0 # flip byte order
2037 lrvgr $i2,$s1
2038
2039 xg $i1,0($inp)
2040 xg $i2,8($inp)
2041 stg $i1,0($out,$inp)
2042 stg $i2,8($out,$inp)
2043 la $i2,0($out,$inp)
2044 lghi $i3,16
2045 .long 0xb92e0066 # km $i2,$i2
2046 brc 1,.-4 # can this happen?
2047 lrvgr $i1,$s0
2048 lrvgr $i2,$s1
2049 xg $i1,0($out,$inp)
2050 xg $i2,8($out,$inp)
2051 stg $i1,0($out,$inp)
2052 stg $i2,8($out,$inp)
2053
2054 la $i3,0($out,$inp) # put aside real $out
2055.Lxts_dec_km_steal:
2056 llgc $i1,16($inp)
2057 llgc $i2,0($out,$inp)
2058 stc $i1,0($out,$inp)
2059 stc $i2,16($out,$inp)
2060 la $inp,1($inp)
2061 brct $len,.Lxts_dec_km_steal
2062
2063 lgr $s0,$s2
2064 lgr $s1,$s3
2065 xg $s0,0($i3)
2066 xg $s1,8($i3)
2067 stg $s0,0($i3)
2068 stg $s1,8($i3)
2069 la $s0,0($i3)
2070 lghi $s1,16
2071 .long 0xb92e0088 # km $s0,$s0
2072 brc 1,.-4 # can this happen?
2073 xg $s2,0($i3)
2074 xg $s3,8($i3)
2075 stg $s2,0($i3)
2076 stg $s3,8($i3)
2077.Lxts_dec_km_done:
2078 l${g} $ra,14*$SIZE_T($sp)
 2079 	stg	$sp,$tweak+0($sp)	# wipe tweak
 2080 	stg	$sp,$tweak+8($sp)
2081 lm${g} %r6,$s3,6*$SIZE_T($sp)
2082 br $ra
2083.align 16
2084.Lxts_dec_software:
2085___
2086$code.=<<___;
2087 stm${g} %r6,$ra,6*$SIZE_T($sp)
2088
2089 srlg $len,$len,4
2090 slgr $out,$inp
2091
2092 xgr $s0,$s0 # clear upper half
2093 xgr $s1,$s1
2094 lrv $s0,$stdframe+4($sp) # load secno
2095 lrv $s1,$stdframe+0($sp)
2096 xgr $s2,$s2
2097 xgr $s3,$s3
2098 stm${g} %r2,%r5,2*$SIZE_T($sp)
2099 la $key,0($key2)
2100 larl $tbl,AES_Te
2101 bras $ra,_s390x_AES_encrypt # generate the tweak
2102 lm${g} %r2,%r5,2*$SIZE_T($sp)
2103 larl $tbl,AES_Td
2104 lt${g}r $len,$len
2105 stm $s0,$s3,$tweak($sp) # save the tweak
2106 jz .Lxts_dec_short
2107 j .Lxts_dec_enter
2108
2109.align 16
2110.Lxts_dec_loop:
2111 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2112 lrvg $s3,$tweak+8($sp)
2113 lghi %r1,0x87
2114 srag %r0,$s3,63 # broadcast upper bit
2115 ngr %r1,%r0 # rem
2116 srlg %r0,$s1,63 # carry bit from lower half
2117 sllg $s1,$s1,1
2118 sllg $s3,$s3,1
2119 xgr $s1,%r1
2120 ogr $s3,%r0
2121 lrvgr $s1,$s1 # flip byte order
2122 lrvgr $s3,$s3
2123 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2124 stg $s1,$tweak+0($sp) # save the tweak
2125 llgfr $s1,$s1
2126 srlg $s2,$s3,32
2127 stg $s3,$tweak+8($sp)
2128 llgfr $s3,$s3
2129.Lxts_dec_enter:
2130 x $s0,0($inp) # tweak^=*(inp)
2131 x $s1,4($inp)
2132 x $s2,8($inp)
2133 x $s3,12($inp)
2134 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
2135 la $key,0($key1)
2136 bras $ra,_s390x_AES_decrypt
2137 lm${g} %r2,%r5,2*$SIZE_T($sp)
2138 x $s0,$tweak+0($sp) # ^=tweak
2139 x $s1,$tweak+4($sp)
2140 x $s2,$tweak+8($sp)
2141 x $s3,$tweak+12($sp)
2142 st $s0,0($out,$inp)
2143 st $s1,4($out,$inp)
2144 st $s2,8($out,$inp)
2145 st $s3,12($out,$inp)
2146 la $inp,16($inp)
2147 brct${g} $len,.Lxts_dec_loop
2148
2149 llgc $len,`2*$SIZE_T-1`($sp)
2150 nill $len,0x0f # $len%16
2151 jz .Lxts_dec_done
2152
2153 # generate pair of tweaks...
2154 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2155 lrvg $s3,$tweak+8($sp)
2156 lghi %r1,0x87
2157 srag %r0,$s3,63 # broadcast upper bit
2158 ngr %r1,%r0 # rem
2159 srlg %r0,$s1,63 # carry bit from lower half
2160 sllg $s1,$s1,1
2161 sllg $s3,$s3,1
2162 xgr $s1,%r1
2163 ogr $s3,%r0
2164 lrvgr $i2,$s1 # flip byte order
2165 lrvgr $i3,$s3
2166 stmg $i2,$i3,$tweak($sp) # save the 1st tweak
2167 j .Lxts_dec_2ndtweak
2168
2169.align 16
2170.Lxts_dec_short:
2171 llgc $len,`2*$SIZE_T-1`($sp)
2172 nill $len,0x0f # $len%16
2173 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2174 lrvg $s3,$tweak+8($sp)
2175.Lxts_dec_2ndtweak:
2176 lghi %r1,0x87
2177 srag %r0,$s3,63 # broadcast upper bit
2178 ngr %r1,%r0 # rem
2179 srlg %r0,$s1,63 # carry bit from lower half
2180 sllg $s1,$s1,1
2181 sllg $s3,$s3,1
2182 xgr $s1,%r1
2183 ogr $s3,%r0
2184 lrvgr $s1,$s1 # flip byte order
2185 lrvgr $s3,$s3
2186 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2187 stg $s1,$tweak-16+0($sp) # save the 2nd tweak
2188 llgfr $s1,$s1
2189 srlg $s2,$s3,32
2190 stg $s3,$tweak-16+8($sp)
2191 llgfr $s3,$s3
2192
2193 x $s0,0($inp) # tweak_the_2nd^=*(inp)
2194 x $s1,4($inp)
2195 x $s2,8($inp)
2196 x $s3,12($inp)
2197 stm${g} %r2,%r3,2*$SIZE_T($sp)
2198 la $key,0($key1)
2199 bras $ra,_s390x_AES_decrypt
2200 lm${g} %r2,%r5,2*$SIZE_T($sp)
2201 x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
2202 x $s1,$tweak-16+4($sp)
2203 x $s2,$tweak-16+8($sp)
2204 x $s3,$tweak-16+12($sp)
2205 st $s0,0($out,$inp)
2206 st $s1,4($out,$inp)
2207 st $s2,8($out,$inp)
2208 st $s3,12($out,$inp)
2209
2210 la $i3,0($out,$inp) # put aside real $out
2211.Lxts_dec_steal:
2212 llgc %r0,16($inp)
2213 llgc %r1,0($out,$inp)
2214 stc %r0,0($out,$inp)
2215 stc %r1,16($out,$inp)
2216 la $inp,1($inp)
2217 brct $len,.Lxts_dec_steal
2218 la $out,0($i3) # restore real $out
2219
2220 lm $s0,$s3,$tweak($sp) # load the 1st tweak
2221 x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
2222 x $s1,4($out)
2223 x $s2,8($out)
2224 x $s3,12($out)
2225 st${g} $out,4*$SIZE_T($sp)
2226 la $key,0($key1)
2227 bras $ra,_s390x_AES_decrypt
2228 l${g} $out,4*$SIZE_T($sp)
2229 x $s0,$tweak+0($sp) # ^=tweak
2230 x $s1,$tweak+4($sp)
2231 x $s2,$tweak+8($sp)
2232 x $s3,$tweak+12($sp)
2233 st $s0,0($out)
2234 st $s1,4($out)
2235 st $s2,8($out)
2236 st $s3,12($out)
2237 stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
2238 stg $sp,$tweak-16+8($sp)
2239.Lxts_dec_done:
2240 stg $sp,$tweak+0($sp) # wipe tweak
 2241 	stg	$sp,$tweak+8($sp)
2242 lm${g} %r6,$ra,6*$SIZE_T($sp)
2243 br $ra
2244.size AES_xts_decrypt,.-AES_xts_decrypt
2245___
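# Decryption steals in the opposite order: the *second* tweak undoes the
# last full ciphertext block first, then the first tweak undoes the
# recombined block. A sketch (dec_block() is a hypothetical stand-in for
# one tweaked block decryption):
sub _xts_dec_tail {
    my ($c_m1, $c_m, $tweak1, $tweak2) = @_;
    my $pp   = dec_block($c_m1, $tweak2);	# tweak #2 goes first
    my $b    = length($c_m);
    my $p_m  = substr($pp, 0, $b);		# short final plaintext
    my $cc   = $c_m . substr($pp, $b);		# rebuild last full block
    my $p_m1 = dec_block($cc, $tweak1);		# then tweak #1
    return ($p_m1, $p_m);
}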
2246}
2247$code.=<<___;
2248.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
2249.comm OPENSSL_s390xcap_P,16,8
2250___
2251
2252$code =~ s/\`([^\`]*)\`/eval $1/gem;
2253print $code;
2254close STDOUT; # force flush
diff --git a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
new file mode 100755
index 0000000000..403c4d1290
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
@@ -0,0 +1,1182 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. Rights for redistribution and usage in source and binary
6# forms are granted according to the OpenSSL license.
7# ====================================================================
8#
9# Version 1.1
10#
 11# The major reason for undertaking this effort was to mitigate the
 12# hazard of cache-timing attacks. This is [currently and initially!]
 13# addressed in two ways. 1. S-boxes are compressed from 5KB to 2KB+256B
 14# each. 2. References to them are scheduled for L2 cache latency, meaning
 15# that the tables don't have to reside in L1 cache. Once again, this
 16# is an initial draft and one should expect more countermeasures to
 17# be implemented...
 18#
 19# Version 1.1 prefetches T[ed]4 in order to mitigate the attack on the
 20# last round.
21#
 22# Even though performance was not the primary goal [on the contrary,
 23# the extra shifts "induced" by the compressed S-box and the longer loop
 24# epilogue "induced" by scheduling for L2 hurt performance], the code
 25# turned out to run in ~23 cycles per processed byte en-/decrypted with
 26# a 128-bit key. This is a pretty good result for code with the
 27# mentioned qualities on an UltraSPARC core. Compared to Sun C
 28# generated code, my encrypt procedure runs just a few percent faster,
 29# while the decrypt one is a whole 50% faster [yes, Sun C failed to
 30# generate an optimal decrypt procedure]. Compared to GNU C generated
 31# code, both procedures are more than 60% faster:-)
32
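# A self-contained sketch of the indexing used throughout this module:
# table entries are 8 bytes (each word is stored twice, see _data_word
# below), so an index is byte*8, obtained with one shift and one mask
# instead of an extract-then-scale pair.
sub _t_index {
    my ($word, $byte) = @_;			# $byte: 0 = most significant
    my $sh = 21 - 8*$byte;			# 21, 13, 5, -3
    return $sh >= 0 ? ($word >> $sh) & 2040	# the srl+and below
		    : ($word << -$sh) & 2040;	# sll+and for the low byte
}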
33$bits=32;
34for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
35if ($bits==64) { $bias=2047; $frame=192; }
36else { $bias=0; $frame=112; }
37$locals=16;
38
39$acc0="%l0";
40$acc1="%o0";
41$acc2="%o1";
42$acc3="%o2";
43
44$acc4="%l1";
45$acc5="%o3";
46$acc6="%o4";
47$acc7="%o5";
48
49$acc8="%l2";
50$acc9="%o7";
51$acc10="%g1";
52$acc11="%g2";
53
54$acc12="%l3";
55$acc13="%g3";
56$acc14="%g4";
57$acc15="%g5";
58
59$t0="%l4";
60$t1="%l5";
61$t2="%l6";
62$t3="%l7";
63
64$s0="%i0";
65$s1="%i1";
66$s2="%i2";
67$s3="%i3";
68$tbl="%i4";
69$key="%i5";
70$rounds="%i7"; # aliases with return address, which is off-loaded to stack
71
72sub _data_word()
73{ my $i;
74 while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
75}
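# Each word is emitted twice so that a single 64-bit ldx of the pair,
# shifted right by 0/8/16/24 bits, leaves every byte rotation of the
# 32-bit entry in the low word; no rotate instructions are needed.
# A sketch (assumes a 64-bit perl):
sub _rotations_via_pair {
    my ($w) = @_;
    my $pair = ($w << 32) | $w;
    return map { ($pair >> $_) & 0xffffffff } (0, 8, 16, 24);
}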
76
77$code.=<<___ if ($bits==64);
78.register %g2,#scratch
79.register %g3,#scratch
80___
81$code.=<<___;
82.section ".text",#alloc,#execinstr
83
84.align 256
85AES_Te:
86___
87&_data_word(
88 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
89 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
90 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
91 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
92 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
93 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
94 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
95 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
96 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
97 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
98 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
99 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
100 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
101 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
102 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
103 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
104 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
105 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
106 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
107 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
108 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
109 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
110 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
111 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
112 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
113 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
114 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
115 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
116 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
117 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
118 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
119 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
120 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
121 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
122 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
123 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
124 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
125 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
126 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
127 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
128 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
129 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
130 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
131 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
132 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
133 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
134 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
135 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
136 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
137 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
138 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
139 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
140 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
141 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
142 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
143 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
144 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
145 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
146 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
147 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
148 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
149 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
150 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
151 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
152$code.=<<___;
153 .byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
154 .byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
155 .byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
156 .byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
157 .byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
158 .byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
159 .byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
160 .byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
161 .byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
162 .byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
163 .byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
164 .byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
165 .byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
166 .byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
167 .byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
168 .byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
169 .byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
170 .byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
171 .byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
172 .byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
173 .byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
174 .byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
175 .byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
176 .byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
177 .byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
178 .byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
179 .byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
180 .byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
181 .byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
182 .byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
183 .byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
184 .byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
185.type AES_Te,#object
186.size AES_Te,(.-AES_Te)
187
188.align 64
189.skip 16
190_sparcv9_AES_encrypt:
191 save %sp,-$frame-$locals,%sp
192 stx %i7,[%sp+$bias+$frame+0] ! off-load return address
193 ld [$key+240],$rounds
194 ld [$key+0],$t0
195 ld [$key+4],$t1 !
196 ld [$key+8],$t2
197 srl $rounds,1,$rounds
198 xor $t0,$s0,$s0
199 ld [$key+12],$t3
200 srl $s0,21,$acc0
201 xor $t1,$s1,$s1
202 ld [$key+16],$t0
203 srl $s1,13,$acc1 !
204 xor $t2,$s2,$s2
205 ld [$key+20],$t1
206 xor $t3,$s3,$s3
207 ld [$key+24],$t2
208 and $acc0,2040,$acc0
209 ld [$key+28],$t3
210 nop
211.Lenc_loop:
212 srl $s2,5,$acc2 !
213 and $acc1,2040,$acc1
214 ldx [$tbl+$acc0],$acc0
215 sll $s3,3,$acc3
216 and $acc2,2040,$acc2
217 ldx [$tbl+$acc1],$acc1
218 srl $s1,21,$acc4
219 and $acc3,2040,$acc3
220 ldx [$tbl+$acc2],$acc2 !
221 srl $s2,13,$acc5
222 and $acc4,2040,$acc4
223 ldx [$tbl+$acc3],$acc3
224 srl $s3,5,$acc6
225 and $acc5,2040,$acc5
226 ldx [$tbl+$acc4],$acc4
227 fmovs %f0,%f0
228 sll $s0,3,$acc7 !
229 and $acc6,2040,$acc6
230 ldx [$tbl+$acc5],$acc5
231 srl $s2,21,$acc8
232 and $acc7,2040,$acc7
233 ldx [$tbl+$acc6],$acc6
234 srl $s3,13,$acc9
235 and $acc8,2040,$acc8
236 ldx [$tbl+$acc7],$acc7 !
237 srl $s0,5,$acc10
238 and $acc9,2040,$acc9
239 ldx [$tbl+$acc8],$acc8
240 sll $s1,3,$acc11
241 and $acc10,2040,$acc10
242 ldx [$tbl+$acc9],$acc9
243 fmovs %f0,%f0
244 srl $s3,21,$acc12 !
245 and $acc11,2040,$acc11
246 ldx [$tbl+$acc10],$acc10
247 srl $s0,13,$acc13
248 and $acc12,2040,$acc12
249 ldx [$tbl+$acc11],$acc11
250 srl $s1,5,$acc14
251 and $acc13,2040,$acc13
252 ldx [$tbl+$acc12],$acc12 !
253 sll $s2,3,$acc15
254 and $acc14,2040,$acc14
255 ldx [$tbl+$acc13],$acc13
256 and $acc15,2040,$acc15
257 add $key,32,$key
258 ldx [$tbl+$acc14],$acc14
259 fmovs %f0,%f0
260 subcc $rounds,1,$rounds !
261 ldx [$tbl+$acc15],$acc15
262 bz,a,pn %icc,.Lenc_last
263 add $tbl,2048,$rounds
264
265 srlx $acc1,8,$acc1
266 xor $acc0,$t0,$t0
267 ld [$key+0],$s0
268 fmovs %f0,%f0
269 srlx $acc2,16,$acc2 !
270 xor $acc1,$t0,$t0
271 ld [$key+4],$s1
272 srlx $acc3,24,$acc3
273 xor $acc2,$t0,$t0
274 ld [$key+8],$s2
275 srlx $acc5,8,$acc5
276 xor $acc3,$t0,$t0
277 ld [$key+12],$s3 !
278 srlx $acc6,16,$acc6
279 xor $acc4,$t1,$t1
280 fmovs %f0,%f0
281 srlx $acc7,24,$acc7
282 xor $acc5,$t1,$t1
283 srlx $acc9,8,$acc9
284 xor $acc6,$t1,$t1
285 srlx $acc10,16,$acc10 !
286 xor $acc7,$t1,$t1
287 srlx $acc11,24,$acc11
288 xor $acc8,$t2,$t2
289 srlx $acc13,8,$acc13
290 xor $acc9,$t2,$t2
291 srlx $acc14,16,$acc14
292 xor $acc10,$t2,$t2
293 srlx $acc15,24,$acc15 !
294 xor $acc11,$t2,$t2
295 xor $acc12,$acc14,$acc14
296 xor $acc13,$t3,$t3
297 srl $t0,21,$acc0
298 xor $acc14,$t3,$t3
299 srl $t1,13,$acc1
300 xor $acc15,$t3,$t3
301
302 and $acc0,2040,$acc0 !
303 srl $t2,5,$acc2
304 and $acc1,2040,$acc1
305 ldx [$tbl+$acc0],$acc0
306 sll $t3,3,$acc3
307 and $acc2,2040,$acc2
308 ldx [$tbl+$acc1],$acc1
309 fmovs %f0,%f0
310 srl $t1,21,$acc4 !
311 and $acc3,2040,$acc3
312 ldx [$tbl+$acc2],$acc2
313 srl $t2,13,$acc5
314 and $acc4,2040,$acc4
315 ldx [$tbl+$acc3],$acc3
316 srl $t3,5,$acc6
317 and $acc5,2040,$acc5
318 ldx [$tbl+$acc4],$acc4 !
319 sll $t0,3,$acc7
320 and $acc6,2040,$acc6
321 ldx [$tbl+$acc5],$acc5
322 srl $t2,21,$acc8
323 and $acc7,2040,$acc7
324 ldx [$tbl+$acc6],$acc6
325 fmovs %f0,%f0
326 srl $t3,13,$acc9 !
327 and $acc8,2040,$acc8
328 ldx [$tbl+$acc7],$acc7
329 srl $t0,5,$acc10
330 and $acc9,2040,$acc9
331 ldx [$tbl+$acc8],$acc8
332 sll $t1,3,$acc11
333 and $acc10,2040,$acc10
334 ldx [$tbl+$acc9],$acc9 !
335 srl $t3,21,$acc12
336 and $acc11,2040,$acc11
337 ldx [$tbl+$acc10],$acc10
338 srl $t0,13,$acc13
339 and $acc12,2040,$acc12
340 ldx [$tbl+$acc11],$acc11
341 fmovs %f0,%f0
342 srl $t1,5,$acc14 !
343 and $acc13,2040,$acc13
344 ldx [$tbl+$acc12],$acc12
345 sll $t2,3,$acc15
346 and $acc14,2040,$acc14
347 ldx [$tbl+$acc13],$acc13
348 srlx $acc1,8,$acc1
349 and $acc15,2040,$acc15
350 ldx [$tbl+$acc14],$acc14 !
351
352 srlx $acc2,16,$acc2
353 xor $acc0,$s0,$s0
354 ldx [$tbl+$acc15],$acc15
355 srlx $acc3,24,$acc3
356 xor $acc1,$s0,$s0
357 ld [$key+16],$t0
358 fmovs %f0,%f0
359 srlx $acc5,8,$acc5 !
360 xor $acc2,$s0,$s0
361 ld [$key+20],$t1
362 srlx $acc6,16,$acc6
363 xor $acc3,$s0,$s0
364 ld [$key+24],$t2
365 srlx $acc7,24,$acc7
366 xor $acc4,$s1,$s1
367 ld [$key+28],$t3 !
368 srlx $acc9,8,$acc9
369 xor $acc5,$s1,$s1
370 ldx [$tbl+2048+0],%g0 ! prefetch te4
371 srlx $acc10,16,$acc10
372 xor $acc6,$s1,$s1
373 ldx [$tbl+2048+32],%g0 ! prefetch te4
374 srlx $acc11,24,$acc11
375 xor $acc7,$s1,$s1
376 ldx [$tbl+2048+64],%g0 ! prefetch te4
377 srlx $acc13,8,$acc13
378 xor $acc8,$s2,$s2
379 ldx [$tbl+2048+96],%g0 ! prefetch te4
380 srlx $acc14,16,$acc14 !
381 xor $acc9,$s2,$s2
382 ldx [$tbl+2048+128],%g0 ! prefetch te4
383 srlx $acc15,24,$acc15
384 xor $acc10,$s2,$s2
385 ldx [$tbl+2048+160],%g0 ! prefetch te4
386 srl $s0,21,$acc0
387 xor $acc11,$s2,$s2
388 ldx [$tbl+2048+192],%g0 ! prefetch te4
389 xor $acc12,$acc14,$acc14
390 xor $acc13,$s3,$s3
391 ldx [$tbl+2048+224],%g0 ! prefetch te4
392 srl $s1,13,$acc1 !
393 xor $acc14,$s3,$s3
394 xor $acc15,$s3,$s3
395 ba .Lenc_loop
396 and $acc0,2040,$acc0
397
398.align 32
399.Lenc_last:
400 srlx $acc1,8,$acc1 !
401 xor $acc0,$t0,$t0
402 ld [$key+0],$s0
403 srlx $acc2,16,$acc2
404 xor $acc1,$t0,$t0
405 ld [$key+4],$s1
406 srlx $acc3,24,$acc3
407 xor $acc2,$t0,$t0
408 ld [$key+8],$s2 !
409 srlx $acc5,8,$acc5
410 xor $acc3,$t0,$t0
411 ld [$key+12],$s3
412 srlx $acc6,16,$acc6
413 xor $acc4,$t1,$t1
414 srlx $acc7,24,$acc7
415 xor $acc5,$t1,$t1
416 srlx $acc9,8,$acc9 !
417 xor $acc6,$t1,$t1
418 srlx $acc10,16,$acc10
419 xor $acc7,$t1,$t1
420 srlx $acc11,24,$acc11
421 xor $acc8,$t2,$t2
422 srlx $acc13,8,$acc13
423 xor $acc9,$t2,$t2
424 srlx $acc14,16,$acc14 !
425 xor $acc10,$t2,$t2
426 srlx $acc15,24,$acc15
427 xor $acc11,$t2,$t2
428 xor $acc12,$acc14,$acc14
429 xor $acc13,$t3,$t3
430 srl $t0,24,$acc0
431 xor $acc14,$t3,$t3
432 srl $t1,16,$acc1 !
433 xor $acc15,$t3,$t3
434
435 srl $t2,8,$acc2
436 and $acc1,255,$acc1
437 ldub [$rounds+$acc0],$acc0
438 srl $t1,24,$acc4
439 and $acc2,255,$acc2
440 ldub [$rounds+$acc1],$acc1
441 srl $t2,16,$acc5 !
442 and $t3,255,$acc3
443 ldub [$rounds+$acc2],$acc2
444 ldub [$rounds+$acc3],$acc3
445 srl $t3,8,$acc6
446 and $acc5,255,$acc5
447 ldub [$rounds+$acc4],$acc4
448 fmovs %f0,%f0
449 srl $t2,24,$acc8 !
450 and $acc6,255,$acc6
451 ldub [$rounds+$acc5],$acc5
452 srl $t3,16,$acc9
453 and $t0,255,$acc7
454 ldub [$rounds+$acc6],$acc6
455 ldub [$rounds+$acc7],$acc7
456 fmovs %f0,%f0
457 srl $t0,8,$acc10 !
458 and $acc9,255,$acc9
459 ldub [$rounds+$acc8],$acc8
460 srl $t3,24,$acc12
461 and $acc10,255,$acc10
462 ldub [$rounds+$acc9],$acc9
463 srl $t0,16,$acc13
464 and $t1,255,$acc11
465 ldub [$rounds+$acc10],$acc10 !
466 srl $t1,8,$acc14
467 and $acc13,255,$acc13
468 ldub [$rounds+$acc11],$acc11
469 ldub [$rounds+$acc12],$acc12
470 and $acc14,255,$acc14
471 ldub [$rounds+$acc13],$acc13
472 and $t2,255,$acc15
473 ldub [$rounds+$acc14],$acc14 !
474
475 sll $acc0,24,$acc0
476 xor $acc3,$s0,$s0
477 ldub [$rounds+$acc15],$acc15
478 sll $acc1,16,$acc1
479 xor $acc0,$s0,$s0
480 ldx [%sp+$bias+$frame+0],%i7 ! restore return address
481 fmovs %f0,%f0
482 sll $acc2,8,$acc2 !
483 xor $acc1,$s0,$s0
484 sll $acc4,24,$acc4
485 xor $acc2,$s0,$s0
486 sll $acc5,16,$acc5
487 xor $acc7,$s1,$s1
488 sll $acc6,8,$acc6
489 xor $acc4,$s1,$s1
490 sll $acc8,24,$acc8 !
491 xor $acc5,$s1,$s1
492 sll $acc9,16,$acc9
493 xor $acc11,$s2,$s2
494 sll $acc10,8,$acc10
495 xor $acc6,$s1,$s1
496 sll $acc12,24,$acc12
497 xor $acc8,$s2,$s2
498 sll $acc13,16,$acc13 !
499 xor $acc9,$s2,$s2
500 sll $acc14,8,$acc14
501 xor $acc10,$s2,$s2
502 xor $acc12,$acc14,$acc14
503 xor $acc13,$s3,$s3
504 xor $acc14,$s3,$s3
505 xor $acc15,$s3,$s3
506
507 ret
508 restore
509.type _sparcv9_AES_encrypt,#function
510.size _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
511
512.align 32
513.globl AES_encrypt
514AES_encrypt:
515 or %o0,%o1,%g1
516 andcc %g1,3,%g0
517 bnz,pn %xcc,.Lunaligned_enc
518 save %sp,-$frame,%sp
519
520 ld [%i0+0],%o0
521 ld [%i0+4],%o1
522 ld [%i0+8],%o2
523 ld [%i0+12],%o3
524
5251: call .+8
526 add %o7,AES_Te-1b,%o4
527 call _sparcv9_AES_encrypt
528 mov %i2,%o5
529
530 st %o0,[%i1+0]
531 st %o1,[%i1+4]
532 st %o2,[%i1+8]
533 st %o3,[%i1+12]
534
535 ret
536 restore
537
538.align 32
539.Lunaligned_enc:
540 ldub [%i0+0],%l0
541 ldub [%i0+1],%l1
542 ldub [%i0+2],%l2
543
544 sll %l0,24,%l0
545 ldub [%i0+3],%l3
546 sll %l1,16,%l1
547 ldub [%i0+4],%l4
548 sll %l2,8,%l2
549 or %l1,%l0,%l0
550 ldub [%i0+5],%l5
551 sll %l4,24,%l4
552 or %l3,%l2,%l2
553 ldub [%i0+6],%l6
554 sll %l5,16,%l5
555 or %l0,%l2,%o0
556 ldub [%i0+7],%l7
557
558 sll %l6,8,%l6
559 or %l5,%l4,%l4
560 ldub [%i0+8],%l0
561 or %l7,%l6,%l6
562 ldub [%i0+9],%l1
563 or %l4,%l6,%o1
564 ldub [%i0+10],%l2
565
566 sll %l0,24,%l0
567 ldub [%i0+11],%l3
568 sll %l1,16,%l1
569 ldub [%i0+12],%l4
570 sll %l2,8,%l2
571 or %l1,%l0,%l0
572 ldub [%i0+13],%l5
573 sll %l4,24,%l4
574 or %l3,%l2,%l2
575 ldub [%i0+14],%l6
576 sll %l5,16,%l5
577 or %l0,%l2,%o2
578 ldub [%i0+15],%l7
579
580 sll %l6,8,%l6
581 or %l5,%l4,%l4
582 or %l7,%l6,%l6
583 or %l4,%l6,%o3
584
5851: call .+8
586 add %o7,AES_Te-1b,%o4
587 call _sparcv9_AES_encrypt
588 mov %i2,%o5
589
590 srl %o0,24,%l0
591 srl %o0,16,%l1
592 stb %l0,[%i1+0]
593 srl %o0,8,%l2
594 stb %l1,[%i1+1]
595 stb %l2,[%i1+2]
596 srl %o1,24,%l4
597 stb %o0,[%i1+3]
598
599 srl %o1,16,%l5
600 stb %l4,[%i1+4]
601 srl %o1,8,%l6
602 stb %l5,[%i1+5]
603 stb %l6,[%i1+6]
604 srl %o2,24,%l0
605 stb %o1,[%i1+7]
606
607 srl %o2,16,%l1
608 stb %l0,[%i1+8]
609 srl %o2,8,%l2
610 stb %l1,[%i1+9]
611 stb %l2,[%i1+10]
612 srl %o3,24,%l4
613 stb %o2,[%i1+11]
614
615 srl %o3,16,%l5
616 stb %l4,[%i1+12]
617 srl %o3,8,%l6
618 stb %l5,[%i1+13]
619 stb %l6,[%i1+14]
620 stb %o3,[%i1+15]
621
622 ret
623 restore
624.type AES_encrypt,#function
625.size AES_encrypt,(.-AES_encrypt)
626
627___
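# SPARC traps on misaligned ld/st, hence the byte-wise assembly and
# disassembly on the .Lunaligned_enc path above. The load half as a
# sketch:
sub _be32_load {
    my ($buf, $off) = @_;		# $buf: byte string
    my @b = unpack "C4", substr($buf, $off, 4);
    return ($b[0] << 24) | ($b[1] << 16) | ($b[2] << 8) | $b[3];
}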
628
629$code.=<<___;
630.align 256
631AES_Td:
632___
633&_data_word(
634 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
635 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
636 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
637 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
638 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
639 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
640 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
641 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
642 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
643 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
644 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
645 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
646 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
647 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
648 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
649 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
650 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
651 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
652 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
653 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
654 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
655 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
656 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
657 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
658 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
659 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
660 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
661 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
662 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
663 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
664 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
665 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
666 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
667 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
668 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
669 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
670 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
671 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
672 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
673 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
674 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
675 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
676 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
677 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
678 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
679 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
680 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
681 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
682 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
683 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
684 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
685 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
686 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
687 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
688 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
689 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
690 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
691 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
692 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
693 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
694 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
695 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
696 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
697 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
698$code.=<<___;
699 .byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
700 .byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
701 .byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
702 .byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
703 .byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
704 .byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
705 .byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
706 .byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
707 .byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
708 .byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
709 .byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
710 .byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
711 .byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
712 .byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
713 .byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
714 .byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
715 .byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
716 .byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
717 .byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
718 .byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
719 .byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
720 .byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
721 .byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
722 .byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
723 .byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
724 .byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
725 .byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
726 .byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
727 .byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
728 .byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
729 .byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
730 .byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
731.type AES_Td,#object
732.size AES_Td,(.-AES_Td)
733
734.align 64
735.skip 16
736_sparcv9_AES_decrypt:
737 save %sp,-$frame-$locals,%sp
738 stx %i7,[%sp+$bias+$frame+0] ! off-load return address
739 ld [$key+240],$rounds
740 ld [$key+0],$t0
741 ld [$key+4],$t1 !
742 ld [$key+8],$t2
743 ld [$key+12],$t3
744 srl $rounds,1,$rounds
745 xor $t0,$s0,$s0
746 ld [$key+16],$t0
747 xor $t1,$s1,$s1
748 ld [$key+20],$t1
749 srl $s0,21,$acc0 !
750 xor $t2,$s2,$s2
751 ld [$key+24],$t2
752 xor $t3,$s3,$s3
753 and $acc0,2040,$acc0
754 ld [$key+28],$t3
755 srl $s3,13,$acc1
756 nop
757.Ldec_loop:
758 srl $s2,5,$acc2 !
759 and $acc1,2040,$acc1
760 ldx [$tbl+$acc0],$acc0
761 sll $s1,3,$acc3
762 and $acc2,2040,$acc2
763 ldx [$tbl+$acc1],$acc1
764 srl $s1,21,$acc4
765 and $acc3,2040,$acc3
766 ldx [$tbl+$acc2],$acc2 !
767 srl $s0,13,$acc5
768 and $acc4,2040,$acc4
769 ldx [$tbl+$acc3],$acc3
770 srl $s3,5,$acc6
771 and $acc5,2040,$acc5
772 ldx [$tbl+$acc4],$acc4
773 fmovs %f0,%f0
774 sll $s2,3,$acc7 !
775 and $acc6,2040,$acc6
776 ldx [$tbl+$acc5],$acc5
777 srl $s2,21,$acc8
778 and $acc7,2040,$acc7
779 ldx [$tbl+$acc6],$acc6
780 srl $s1,13,$acc9
781 and $acc8,2040,$acc8
782 ldx [$tbl+$acc7],$acc7 !
783 srl $s0,5,$acc10
784 and $acc9,2040,$acc9
785 ldx [$tbl+$acc8],$acc8
786 sll $s3,3,$acc11
787 and $acc10,2040,$acc10
788 ldx [$tbl+$acc9],$acc9
789 fmovs %f0,%f0
790 srl $s3,21,$acc12 !
791 and $acc11,2040,$acc11
792 ldx [$tbl+$acc10],$acc10
793 srl $s2,13,$acc13
794 and $acc12,2040,$acc12
795 ldx [$tbl+$acc11],$acc11
796 srl $s1,5,$acc14
797 and $acc13,2040,$acc13
798 ldx [$tbl+$acc12],$acc12 !
799 sll $s0,3,$acc15
800 and $acc14,2040,$acc14
801 ldx [$tbl+$acc13],$acc13
802 and $acc15,2040,$acc15
803 add $key,32,$key
804 ldx [$tbl+$acc14],$acc14
805 fmovs %f0,%f0
806 subcc $rounds,1,$rounds !
807 ldx [$tbl+$acc15],$acc15
808 bz,a,pn %icc,.Ldec_last
809 add $tbl,2048,$rounds
810
811 srlx $acc1,8,$acc1
812 xor $acc0,$t0,$t0
813 ld [$key+0],$s0
814 fmovs %f0,%f0
815 srlx $acc2,16,$acc2 !
816 xor $acc1,$t0,$t0
817 ld [$key+4],$s1
818 srlx $acc3,24,$acc3
819 xor $acc2,$t0,$t0
820 ld [$key+8],$s2
821 srlx $acc5,8,$acc5
822 xor $acc3,$t0,$t0
823 ld [$key+12],$s3 !
824 srlx $acc6,16,$acc6
825 xor $acc4,$t1,$t1
826 fmovs %f0,%f0
827 srlx $acc7,24,$acc7
828 xor $acc5,$t1,$t1
829 srlx $acc9,8,$acc9
830 xor $acc6,$t1,$t1
831 srlx $acc10,16,$acc10 !
832 xor $acc7,$t1,$t1
833 srlx $acc11,24,$acc11
834 xor $acc8,$t2,$t2
835 srlx $acc13,8,$acc13
836 xor $acc9,$t2,$t2
837 srlx $acc14,16,$acc14
838 xor $acc10,$t2,$t2
839 srlx $acc15,24,$acc15 !
840 xor $acc11,$t2,$t2
841 xor $acc12,$acc14,$acc14
842 xor $acc13,$t3,$t3
843 srl $t0,21,$acc0
844 xor $acc14,$t3,$t3
845 xor $acc15,$t3,$t3
846 srl $t3,13,$acc1
847
848 and $acc0,2040,$acc0 !
849 srl $t2,5,$acc2
850 and $acc1,2040,$acc1
851 ldx [$tbl+$acc0],$acc0
852 sll $t1,3,$acc3
853 and $acc2,2040,$acc2
854 ldx [$tbl+$acc1],$acc1
855 fmovs %f0,%f0
856 srl $t1,21,$acc4 !
857 and $acc3,2040,$acc3
858 ldx [$tbl+$acc2],$acc2
859 srl $t0,13,$acc5
860 and $acc4,2040,$acc4
861 ldx [$tbl+$acc3],$acc3
862 srl $t3,5,$acc6
863 and $acc5,2040,$acc5
864 ldx [$tbl+$acc4],$acc4 !
865 sll $t2,3,$acc7
866 and $acc6,2040,$acc6
867 ldx [$tbl+$acc5],$acc5
868 srl $t2,21,$acc8
869 and $acc7,2040,$acc7
870 ldx [$tbl+$acc6],$acc6
871 fmovs %f0,%f0
872 srl $t1,13,$acc9 !
873 and $acc8,2040,$acc8
874 ldx [$tbl+$acc7],$acc7
875 srl $t0,5,$acc10
876 and $acc9,2040,$acc9
877 ldx [$tbl+$acc8],$acc8
878 sll $t3,3,$acc11
879 and $acc10,2040,$acc10
880 ldx [$tbl+$acc9],$acc9 !
881 srl $t3,21,$acc12
882 and $acc11,2040,$acc11
883 ldx [$tbl+$acc10],$acc10
884 srl $t2,13,$acc13
885 and $acc12,2040,$acc12
886 ldx [$tbl+$acc11],$acc11
887 fmovs %f0,%f0
888 srl $t1,5,$acc14 !
889 and $acc13,2040,$acc13
890 ldx [$tbl+$acc12],$acc12
891 sll $t0,3,$acc15
892 and $acc14,2040,$acc14
893 ldx [$tbl+$acc13],$acc13
894 srlx $acc1,8,$acc1
895 and $acc15,2040,$acc15
896 ldx [$tbl+$acc14],$acc14 !
897
898 srlx $acc2,16,$acc2
899 xor $acc0,$s0,$s0
900 ldx [$tbl+$acc15],$acc15
901 srlx $acc3,24,$acc3
902 xor $acc1,$s0,$s0
903 ld [$key+16],$t0
904 fmovs %f0,%f0
905 srlx $acc5,8,$acc5 !
906 xor $acc2,$s0,$s0
907 ld [$key+20],$t1
908 srlx $acc6,16,$acc6
909 xor $acc3,$s0,$s0
910 ld [$key+24],$t2
911 srlx $acc7,24,$acc7
912 xor $acc4,$s1,$s1
913 ld [$key+28],$t3 !
914 srlx $acc9,8,$acc9
915 xor $acc5,$s1,$s1
916 ldx [$tbl+2048+0],%g0 ! prefetch td4
917 srlx $acc10,16,$acc10
918 xor $acc6,$s1,$s1
919 ldx [$tbl+2048+32],%g0 ! prefetch td4
920 srlx $acc11,24,$acc11
921 xor $acc7,$s1,$s1
922 ldx [$tbl+2048+64],%g0 ! prefetch td4
923 srlx $acc13,8,$acc13
924 xor $acc8,$s2,$s2
925 ldx [$tbl+2048+96],%g0 ! prefetch td4
926 srlx $acc14,16,$acc14 !
927 xor $acc9,$s2,$s2
928 ldx [$tbl+2048+128],%g0 ! prefetch td4
929 srlx $acc15,24,$acc15
930 xor $acc10,$s2,$s2
931 ldx [$tbl+2048+160],%g0 ! prefetch td4
932 srl $s0,21,$acc0
933 xor $acc11,$s2,$s2
934 ldx [$tbl+2048+192],%g0 ! prefetch td4
935 xor $acc12,$acc14,$acc14
936 xor $acc13,$s3,$s3
937 ldx [$tbl+2048+224],%g0 ! prefetch td4
938 and $acc0,2040,$acc0 !
939 xor $acc14,$s3,$s3
940 xor $acc15,$s3,$s3
941 ba .Ldec_loop
942 srl $s3,13,$acc1
943
944.align 32
945.Ldec_last:
946 srlx $acc1,8,$acc1 !
947 xor $acc0,$t0,$t0
948 ld [$key+0],$s0
949 srlx $acc2,16,$acc2
950 xor $acc1,$t0,$t0
951 ld [$key+4],$s1
952 srlx $acc3,24,$acc3
953 xor $acc2,$t0,$t0
954 ld [$key+8],$s2 !
955 srlx $acc5,8,$acc5
956 xor $acc3,$t0,$t0
957 ld [$key+12],$s3
958 srlx $acc6,16,$acc6
959 xor $acc4,$t1,$t1
960 srlx $acc7,24,$acc7
961 xor $acc5,$t1,$t1
962 srlx $acc9,8,$acc9 !
963 xor $acc6,$t1,$t1
964 srlx $acc10,16,$acc10
965 xor $acc7,$t1,$t1
966 srlx $acc11,24,$acc11
967 xor $acc8,$t2,$t2
968 srlx $acc13,8,$acc13
969 xor $acc9,$t2,$t2
970 srlx $acc14,16,$acc14 !
971 xor $acc10,$t2,$t2
972 srlx $acc15,24,$acc15
973 xor $acc11,$t2,$t2
974 xor $acc12,$acc14,$acc14
975 xor $acc13,$t3,$t3
976 srl $t0,24,$acc0
977 xor $acc14,$t3,$t3
978 xor $acc15,$t3,$t3 !
979 srl $t3,16,$acc1
980
981 srl $t2,8,$acc2
982 and $acc1,255,$acc1
983 ldub [$rounds+$acc0],$acc0
984 srl $t1,24,$acc4
985 and $acc2,255,$acc2
986 ldub [$rounds+$acc1],$acc1
987 srl $t0,16,$acc5 !
988 and $t1,255,$acc3
989 ldub [$rounds+$acc2],$acc2
990 ldub [$rounds+$acc3],$acc3
991 srl $t3,8,$acc6
992 and $acc5,255,$acc5
993 ldub [$rounds+$acc4],$acc4
994 fmovs %f0,%f0
995 srl $t2,24,$acc8 !
996 and $acc6,255,$acc6
997 ldub [$rounds+$acc5],$acc5
998 srl $t1,16,$acc9
999 and $t2,255,$acc7
1000 ldub [$rounds+$acc6],$acc6
1001 ldub [$rounds+$acc7],$acc7
1002 fmovs %f0,%f0
1003 srl $t0,8,$acc10 !
1004 and $acc9,255,$acc9
1005 ldub [$rounds+$acc8],$acc8
1006 srl $t3,24,$acc12
1007 and $acc10,255,$acc10
1008 ldub [$rounds+$acc9],$acc9
1009 srl $t2,16,$acc13
1010 and $t3,255,$acc11
1011 ldub [$rounds+$acc10],$acc10 !
1012 srl $t1,8,$acc14
1013 and $acc13,255,$acc13
1014 ldub [$rounds+$acc11],$acc11
1015 ldub [$rounds+$acc12],$acc12
1016 and $acc14,255,$acc14
1017 ldub [$rounds+$acc13],$acc13
1018 and $t0,255,$acc15
1019 ldub [$rounds+$acc14],$acc14 !
1020
1021 sll $acc0,24,$acc0
1022 xor $acc3,$s0,$s0
1023 ldub [$rounds+$acc15],$acc15
1024 sll $acc1,16,$acc1
1025 xor $acc0,$s0,$s0
1026 ldx [%sp+$bias+$frame+0],%i7 ! restore return address
1027 fmovs %f0,%f0
1028 sll $acc2,8,$acc2 !
1029 xor $acc1,$s0,$s0
1030 sll $acc4,24,$acc4
1031 xor $acc2,$s0,$s0
1032 sll $acc5,16,$acc5
1033 xor $acc7,$s1,$s1
1034 sll $acc6,8,$acc6
1035 xor $acc4,$s1,$s1
1036 sll $acc8,24,$acc8 !
1037 xor $acc5,$s1,$s1
1038 sll $acc9,16,$acc9
1039 xor $acc11,$s2,$s2
1040 sll $acc10,8,$acc10
1041 xor $acc6,$s1,$s1
1042 sll $acc12,24,$acc12
1043 xor $acc8,$s2,$s2
1044 sll $acc13,16,$acc13 !
1045 xor $acc9,$s2,$s2
1046 sll $acc14,8,$acc14
1047 xor $acc10,$s2,$s2
1048 xor $acc12,$acc14,$acc14
1049 xor $acc13,$s3,$s3
1050 xor $acc14,$s3,$s3
1051 xor $acc15,$s3,$s3
1052
1053 ret
1054 restore
1055.type _sparcv9_AES_decrypt,#function
1056.size _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
1057
1058.align 32
1059.globl AES_decrypt
1060AES_decrypt:
1061 or %o0,%o1,%g1
1062 andcc %g1,3,%g0
1063 bnz,pn %xcc,.Lunaligned_dec
1064 save %sp,-$frame,%sp
1065
1066 ld [%i0+0],%o0
1067 ld [%i0+4],%o1
1068 ld [%i0+8],%o2
1069 ld [%i0+12],%o3
1070
10711: call .+8
1072 add %o7,AES_Td-1b,%o4
1073 call _sparcv9_AES_decrypt
1074 mov %i2,%o5
1075
1076 st %o0,[%i1+0]
1077 st %o1,[%i1+4]
1078 st %o2,[%i1+8]
1079 st %o3,[%i1+12]
1080
1081 ret
1082 restore
1083
1084.align 32
1085.Lunaligned_dec:
1086 ldub [%i0+0],%l0
1087 ldub [%i0+1],%l1
1088 ldub [%i0+2],%l2
1089
1090 sll %l0,24,%l0
1091 ldub [%i0+3],%l3
1092 sll %l1,16,%l1
1093 ldub [%i0+4],%l4
1094 sll %l2,8,%l2
1095 or %l1,%l0,%l0
1096 ldub [%i0+5],%l5
1097 sll %l4,24,%l4
1098 or %l3,%l2,%l2
1099 ldub [%i0+6],%l6
1100 sll %l5,16,%l5
1101 or %l0,%l2,%o0
1102 ldub [%i0+7],%l7
1103
1104 sll %l6,8,%l6
1105 or %l5,%l4,%l4
1106 ldub [%i0+8],%l0
1107 or %l7,%l6,%l6
1108 ldub [%i0+9],%l1
1109 or %l4,%l6,%o1
1110 ldub [%i0+10],%l2
1111
1112 sll %l0,24,%l0
1113 ldub [%i0+11],%l3
1114 sll %l1,16,%l1
1115 ldub [%i0+12],%l4
1116 sll %l2,8,%l2
1117 or %l1,%l0,%l0
1118 ldub [%i0+13],%l5
1119 sll %l4,24,%l4
1120 or %l3,%l2,%l2
1121 ldub [%i0+14],%l6
1122 sll %l5,16,%l5
1123 or %l0,%l2,%o2
1124 ldub [%i0+15],%l7
1125
1126 sll %l6,8,%l6
1127 or %l5,%l4,%l4
1128 or %l7,%l6,%l6
1129 or %l4,%l6,%o3
1130
11311: call .+8
1132 add %o7,AES_Td-1b,%o4
1133 call _sparcv9_AES_decrypt
1134 mov %i2,%o5
1135
1136 srl %o0,24,%l0
1137 srl %o0,16,%l1
1138 stb %l0,[%i1+0]
1139 srl %o0,8,%l2
1140 stb %l1,[%i1+1]
1141 stb %l2,[%i1+2]
1142 srl %o1,24,%l4
1143 stb %o0,[%i1+3]
1144
1145 srl %o1,16,%l5
1146 stb %l4,[%i1+4]
1147 srl %o1,8,%l6
1148 stb %l5,[%i1+5]
1149 stb %l6,[%i1+6]
1150 srl %o2,24,%l0
1151 stb %o1,[%i1+7]
1152
1153 srl %o2,16,%l1
1154 stb %l0,[%i1+8]
1155 srl %o2,8,%l2
1156 stb %l1,[%i1+9]
1157 stb %l2,[%i1+10]
1158 srl %o3,24,%l4
1159 stb %o2,[%i1+11]
1160
1161 srl %o3,16,%l5
1162 stb %l4,[%i1+12]
1163 srl %o3,8,%l6
1164 stb %l5,[%i1+13]
1165 stb %l6,[%i1+14]
1166 stb %o3,[%i1+15]
1167
1168 ret
1169 restore
1170.type AES_decrypt,#function
1171.size AES_decrypt,(.-AES_decrypt)
1172___
1173
1174# fmovs instructions substituting for FP nops were originally added
1175# to meet specific instruction alignment requirements to maximize ILP.
1176# As UltraSPARC T1, a.k.a. Niagara, has a shared FPU, FP nops can have
1177# an undesired effect there, so just omit them and sacrifice a fraction
1178# of a percent in performance...
1179$code =~ s/fmovs.*$//gm;
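# e.g. a generated "	fmovs	%f0,%f0" pad collapses to a bare tab under
# the substitution above; a minimal standalone check (illustrative only,
# not part of the original module):
#
#	my $line = "\tfmovs\t%f0,%f0\n";
#	$line =~ s/fmovs.*$//gm;	# leaves just "\t\n"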
1180
1181print $code;
1182close STDOUT; # ensure flush
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
new file mode 100755
index 0000000000..53e4ef85fd
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
@@ -0,0 +1,2809 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# Version 2.1.
11#
12# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
13# Opteron 240 CPU] plus all the bells-n-whistles from the 32-bit version
14# [you'll notice a lot of resemblance], such as compressed S-boxes
15# in little-endian byte order, prefetch of these tables in CBC mode,
16# as well as avoiding L1 cache aliasing between the stack frame, the key
17# schedule and the already mentioned tables, compressed Td4...
18#
19# Performance in number of cycles per processed byte for 128-bit key:
20#
21# ECB encrypt ECB decrypt CBC large chunk
22# AMD64 33 41 13.0
23# EM64T 38 59 18.6(*)
24# Core 2 30 43 14.5(*)
25#
26# (*) with hyper-threading off
27
28$flavour = shift;
29$output = shift;
30if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
31
32$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
33
34$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
35( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
36( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
37die "can't locate x86_64-xlate.pl";
38
39open STDOUT,"| $^X $xlate $flavour $output";
40
41$verticalspin=1;	# unlike in the 32-bit version, $verticalspin performs
42 # ~15% better on both AMD and Intel cores
43$speed_limit=512; # see aes-586.pl for details
44
45$code=".text\n";
46
47$s0="%eax";
48$s1="%ebx";
49$s2="%ecx";
50$s3="%edx";
51$acc0="%esi"; $mask80="%rsi";
52$acc1="%edi"; $maskfe="%rdi";
53$acc2="%ebp"; $mask1b="%rbp";
54$inp="%r8";
55$out="%r9";
56$t0="%r10d";
57$t1="%r11d";
58$t2="%r12d";
59$rnds="%r13d";
60$sbox="%r14";
61$key="%r15";
62
63sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; }
64sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
65 $r =~ s/%[er]([sd]i)/%\1l/;
66 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
67sub LO() { my $r=shift; $r =~ s/%r([a-z]+)/%e\1/;
68 $r =~ s/%r([0-9]+)/%r\1d/; $r; }
69sub _data_word()
70{ my $i;
71 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
72}
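# each value is emitted twice so that 4-byte loads at byte offsets 1..3
# of an 8-byte entry (e.g. "xor 3($sbox,$acc0,8)") read the same word in
# byte-rotated form, saving separate rotated copies of the tables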
73sub data_word()
74{ my $i;
75 my $last=pop(@_);
76 $code.=".long\t";
77 while(defined($i=shift)) { $code.=sprintf"0x%08x,",$i; }
78 $code.=sprintf"0x%08x\n",$last;
79}
80
81sub data_byte()
82{ my $i;
83 my $last=pop(@_);
84 $code.=".byte\t";
85 while(defined($i=shift)) { $code.=sprintf"0x%02x,",$i&0xff; }
86 $code.=sprintf"0x%02x\n",$last&0xff;
87}
88
89sub encvert()
90{ my $t3="%r8d"; # zaps $inp!
91
92$code.=<<___;
93 # favor 3-way issue Opteron pipeline...
94 movzb `&lo("$s0")`,$acc0
95 movzb `&lo("$s1")`,$acc1
96 movzb `&lo("$s2")`,$acc2
97 mov 0($sbox,$acc0,8),$t0
98 mov 0($sbox,$acc1,8),$t1
99 mov 0($sbox,$acc2,8),$t2
100
101 movzb `&hi("$s1")`,$acc0
102 movzb `&hi("$s2")`,$acc1
103 movzb `&lo("$s3")`,$acc2
104 xor 3($sbox,$acc0,8),$t0
105 xor 3($sbox,$acc1,8),$t1
106 mov 0($sbox,$acc2,8),$t3
107
108 movzb `&hi("$s3")`,$acc0
109 shr \$16,$s2
110 movzb `&hi("$s0")`,$acc2
111 xor 3($sbox,$acc0,8),$t2
112 shr \$16,$s3
113 xor 3($sbox,$acc2,8),$t3
114
115 shr \$16,$s1
116 lea 16($key),$key
117 shr \$16,$s0
118
119 movzb `&lo("$s2")`,$acc0
120 movzb `&lo("$s3")`,$acc1
121 movzb `&lo("$s0")`,$acc2
122 xor 2($sbox,$acc0,8),$t0
123 xor 2($sbox,$acc1,8),$t1
124 xor 2($sbox,$acc2,8),$t2
125
126 movzb `&hi("$s3")`,$acc0
127 movzb `&hi("$s0")`,$acc1
128 movzb `&lo("$s1")`,$acc2
129 xor 1($sbox,$acc0,8),$t0
130 xor 1($sbox,$acc1,8),$t1
131 xor 2($sbox,$acc2,8),$t3
132
133 mov 12($key),$s3
134 movzb `&hi("$s1")`,$acc1
135 movzb `&hi("$s2")`,$acc2
136 mov 0($key),$s0
137 xor 1($sbox,$acc1,8),$t2
138 xor 1($sbox,$acc2,8),$t3
139
140 mov 4($key),$s1
141 mov 8($key),$s2
142 xor $t0,$s0
143 xor $t1,$s1
144 xor $t2,$s2
145 xor $t3,$s3
146___
147}
148
149sub enclastvert()
150{ my $t3="%r8d"; # zaps $inp!
151
152$code.=<<___;
153 movzb `&lo("$s0")`,$acc0
154 movzb `&lo("$s1")`,$acc1
155 movzb `&lo("$s2")`,$acc2
156 movzb 2($sbox,$acc0,8),$t0
157 movzb 2($sbox,$acc1,8),$t1
158 movzb 2($sbox,$acc2,8),$t2
159
160 movzb `&lo("$s3")`,$acc0
161 movzb `&hi("$s1")`,$acc1
162 movzb `&hi("$s2")`,$acc2
163 movzb 2($sbox,$acc0,8),$t3
164 mov 0($sbox,$acc1,8),$acc1 #$t0
165 mov 0($sbox,$acc2,8),$acc2 #$t1
166
167 and \$0x0000ff00,$acc1
168 and \$0x0000ff00,$acc2
169
170 xor $acc1,$t0
171 xor $acc2,$t1
172 shr \$16,$s2
173
174 movzb `&hi("$s3")`,$acc0
175 movzb `&hi("$s0")`,$acc1
176 shr \$16,$s3
177 mov 0($sbox,$acc0,8),$acc0 #$t2
178 mov 0($sbox,$acc1,8),$acc1 #$t3
179
180 and \$0x0000ff00,$acc0
181 and \$0x0000ff00,$acc1
182 shr \$16,$s1
183 xor $acc0,$t2
184 xor $acc1,$t3
185 shr \$16,$s0
186
187 movzb `&lo("$s2")`,$acc0
188 movzb `&lo("$s3")`,$acc1
189 movzb `&lo("$s0")`,$acc2
190 mov 0($sbox,$acc0,8),$acc0 #$t0
191 mov 0($sbox,$acc1,8),$acc1 #$t1
192 mov 0($sbox,$acc2,8),$acc2 #$t2
193
194 and \$0x00ff0000,$acc0
195 and \$0x00ff0000,$acc1
196 and \$0x00ff0000,$acc2
197
198 xor $acc0,$t0
199 xor $acc1,$t1
200 xor $acc2,$t2
201
202 movzb `&lo("$s1")`,$acc0
203 movzb `&hi("$s3")`,$acc1
204 movzb `&hi("$s0")`,$acc2
205 mov 0($sbox,$acc0,8),$acc0 #$t3
206 mov 2($sbox,$acc1,8),$acc1 #$t0
207 mov 2($sbox,$acc2,8),$acc2 #$t1
208
209 and \$0x00ff0000,$acc0
210 and \$0xff000000,$acc1
211 and \$0xff000000,$acc2
212
213 xor $acc0,$t3
214 xor $acc1,$t0
215 xor $acc2,$t1
216
217 movzb `&hi("$s1")`,$acc0
218 movzb `&hi("$s2")`,$acc1
219 mov 16+12($key),$s3
220 mov 2($sbox,$acc0,8),$acc0 #$t2
221 mov 2($sbox,$acc1,8),$acc1 #$t3
222 mov 16+0($key),$s0
223
224 and \$0xff000000,$acc0
225 and \$0xff000000,$acc1
226
227 xor $acc0,$t2
228 xor $acc1,$t3
229
230 mov 16+4($key),$s1
231 mov 16+8($key),$s2
232 xor $t0,$s0
233 xor $t1,$s1
234 xor $t2,$s2
235 xor $t3,$s3
236___
237}
238
239sub encstep()
240{ my ($i,@s) = @_;
241 my $tmp0=$acc0;
242 my $tmp1=$acc1;
243 my $tmp2=$acc2;
244 my $out=($t0,$t1,$t2,$s[0])[$i];
245
246 if ($i==3) {
247 $tmp0=$s[1];
248 $tmp1=$s[2];
249 $tmp2=$s[3];
250 }
251 $code.=" movzb ".&lo($s[0]).",$out\n";
252 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
253 $code.=" lea 16($key),$key\n" if ($i==0);
254
255 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
256 $code.=" mov 0($sbox,$out,8),$out\n";
257
258 $code.=" shr \$16,$tmp1\n";
259 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
260 $code.=" xor 3($sbox,$tmp0,8),$out\n";
261
262 $code.=" movzb ".&lo($tmp1).",$tmp1\n";
263 $code.=" shr \$24,$tmp2\n";
264 $code.=" xor 4*$i($key),$out\n";
265
266 $code.=" xor 2($sbox,$tmp1,8),$out\n";
267 $code.=" xor 1($sbox,$tmp2,8),$out\n";
268
269 $code.=" mov $t0,$s[1]\n" if ($i==3);
270 $code.=" mov $t1,$s[2]\n" if ($i==3);
271 $code.=" mov $t2,$s[3]\n" if ($i==3);
272 $code.="\n";
273}
274
275sub enclast()
276{ my ($i,@s)=@_;
277 my $tmp0=$acc0;
278 my $tmp1=$acc1;
279 my $tmp2=$acc2;
280 my $out=($t0,$t1,$t2,$s[0])[$i];
281
282 if ($i==3) {
283 $tmp0=$s[1];
284 $tmp1=$s[2];
285 $tmp2=$s[3];
286 }
287 $code.=" movzb ".&lo($s[0]).",$out\n";
288 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
289
290 $code.=" mov 2($sbox,$out,8),$out\n";
291 $code.=" shr \$16,$tmp1\n";
292 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
293
294 $code.=" and \$0x000000ff,$out\n";
295 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
296 $code.=" movzb ".&lo($tmp1).",$tmp1\n";
297 $code.=" shr \$24,$tmp2\n";
298
299 $code.=" mov 0($sbox,$tmp0,8),$tmp0\n";
300 $code.=" mov 0($sbox,$tmp1,8),$tmp1\n";
301 $code.=" mov 2($sbox,$tmp2,8),$tmp2\n";
302
303 $code.=" and \$0x0000ff00,$tmp0\n";
304 $code.=" and \$0x00ff0000,$tmp1\n";
305 $code.=" and \$0xff000000,$tmp2\n";
306
307 $code.=" xor $tmp0,$out\n";
308 $code.=" mov $t0,$s[1]\n" if ($i==3);
309 $code.=" xor $tmp1,$out\n";
310 $code.=" mov $t1,$s[2]\n" if ($i==3);
311 $code.=" xor $tmp2,$out\n";
312 $code.=" mov $t2,$s[3]\n" if ($i==3);
313 $code.="\n";
314}
315
316$code.=<<___;
317.type _x86_64_AES_encrypt,\@abi-omnipotent
318.align 16
319_x86_64_AES_encrypt:
320 xor 0($key),$s0 # xor with key
321 xor 4($key),$s1
322 xor 8($key),$s2
323 xor 12($key),$s3
324
325 mov 240($key),$rnds # load key->rounds
326 sub \$1,$rnds
327 jmp .Lenc_loop
328.align 16
329.Lenc_loop:
330___
331 if ($verticalspin) { &encvert(); }
332 else { &encstep(0,$s0,$s1,$s2,$s3);
333 &encstep(1,$s1,$s2,$s3,$s0);
334 &encstep(2,$s2,$s3,$s0,$s1);
335 &encstep(3,$s3,$s0,$s1,$s2);
336 }
337$code.=<<___;
338 sub \$1,$rnds
339 jnz .Lenc_loop
340___
341 if ($verticalspin) { &enclastvert(); }
342 else { &enclast(0,$s0,$s1,$s2,$s3);
343 &enclast(1,$s1,$s2,$s3,$s0);
344 &enclast(2,$s2,$s3,$s0,$s1);
345 &enclast(3,$s3,$s0,$s1,$s2);
346 $code.=<<___;
347 xor 16+0($key),$s0 # xor with key
348 xor 16+4($key),$s1
349 xor 16+8($key),$s2
350 xor 16+12($key),$s3
351___
352 }
353$code.=<<___;
354 .byte 0xf3,0xc3 # rep ret
355.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
356___
357
358# it's possible to implement this by shifting tN by 8, filling the least
359# significant byte with a byte load and finally bswap-ing at the end,
360# but such a partial register load kills Core 2...
361sub enccompactvert()
362{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
363
364$code.=<<___;
365 movzb `&lo("$s0")`,$t0
366 movzb `&lo("$s1")`,$t1
367 movzb `&lo("$s2")`,$t2
368 movzb ($sbox,$t0,1),$t0
369 movzb ($sbox,$t1,1),$t1
370 movzb ($sbox,$t2,1),$t2
371
372 movzb `&lo("$s3")`,$t3
373 movzb `&hi("$s1")`,$acc0
374 movzb `&hi("$s2")`,$acc1
375 movzb ($sbox,$t3,1),$t3
376 movzb ($sbox,$acc0,1),$t4 #$t0
377 movzb ($sbox,$acc1,1),$t5 #$t1
378
379 movzb `&hi("$s3")`,$acc2
380 movzb `&hi("$s0")`,$acc0
381 shr \$16,$s2
382 movzb ($sbox,$acc2,1),$acc2 #$t2
383 movzb ($sbox,$acc0,1),$acc0 #$t3
384 shr \$16,$s3
385
386 movzb `&lo("$s2")`,$acc1
387 shl \$8,$t4
388 shl \$8,$t5
389 movzb ($sbox,$acc1,1),$acc1 #$t0
390 xor $t4,$t0
391 xor $t5,$t1
392
393 movzb `&lo("$s3")`,$t4
394 shr \$16,$s0
395 shr \$16,$s1
396 movzb `&lo("$s0")`,$t5
397 shl \$8,$acc2
398 shl \$8,$acc0
399 movzb ($sbox,$t4,1),$t4 #$t1
400 movzb ($sbox,$t5,1),$t5 #$t2
401 xor $acc2,$t2
402 xor $acc0,$t3
403
404 movzb `&lo("$s1")`,$acc2
405 movzb `&hi("$s3")`,$acc0
406 shl \$16,$acc1
407 movzb ($sbox,$acc2,1),$acc2 #$t3
408 movzb ($sbox,$acc0,1),$acc0 #$t0
409 xor $acc1,$t0
410
411 movzb `&hi("$s0")`,$acc1
412 shr \$8,$s2
413 shr \$8,$s1
414 movzb ($sbox,$acc1,1),$acc1 #$t1
415 movzb ($sbox,$s2,1),$s3 #$t3
416 movzb ($sbox,$s1,1),$s2 #$t2
417 shl \$16,$t4
418 shl \$16,$t5
419 shl \$16,$acc2
420 xor $t4,$t1
421 xor $t5,$t2
422 xor $acc2,$t3
423
424 shl \$24,$acc0
425 shl \$24,$acc1
426 shl \$24,$s3
427 xor $acc0,$t0
428 shl \$24,$s2
429 xor $acc1,$t1
430 mov $t0,$s0
431 mov $t1,$s1
432 xor $t2,$s2
433 xor $t3,$s3
434___
435}
436
437sub enctransform_ref()
438{ my $sn = shift;
439 my ($acc,$r2,$tmp)=("%r8d","%r9d","%r13d");
440
441$code.=<<___;
442 mov $sn,$acc
443 and \$0x80808080,$acc
444 mov $acc,$tmp
445 shr \$7,$tmp
446 lea ($sn,$sn),$r2
447 sub $tmp,$acc
448 and \$0xfefefefe,$r2
449 and \$0x1b1b1b1b,$acc
450 mov $sn,$tmp
451 xor $acc,$r2
452
453 xor $r2,$sn
454 rol \$24,$sn
455 xor $r2,$sn
456 ror \$16,$tmp
457 xor $tmp,$sn
458 ror \$8,$tmp
459 xor $tmp,$sn
460___
461}
462
463# unlike the decrypt case, it does not pay off to parallelize enctransform
464sub enctransform()
465{ my ($t3,$r20,$r21)=($acc2,"%r8d","%r9d");
466
467$code.=<<___;
468 mov $s0,$acc0
469 mov $s1,$acc1
470 and \$0x80808080,$acc0
471 and \$0x80808080,$acc1
472 mov $acc0,$t0
473 mov $acc1,$t1
474 shr \$7,$t0
475 lea ($s0,$s0),$r20
476 shr \$7,$t1
477 lea ($s1,$s1),$r21
478 sub $t0,$acc0
479 sub $t1,$acc1
480 and \$0xfefefefe,$r20
481 and \$0xfefefefe,$r21
482 and \$0x1b1b1b1b,$acc0
483 and \$0x1b1b1b1b,$acc1
484 mov $s0,$t0
485 mov $s1,$t1
486 xor $acc0,$r20
487 xor $acc1,$r21
488
489 xor $r20,$s0
490 xor $r21,$s1
491 mov $s2,$acc0
492 mov $s3,$acc1
493 rol \$24,$s0
494 rol \$24,$s1
495 and \$0x80808080,$acc0
496 and \$0x80808080,$acc1
497 xor $r20,$s0
498 xor $r21,$s1
499 mov $acc0,$t2
500 mov $acc1,$t3
501 ror \$16,$t0
502 ror \$16,$t1
503 shr \$7,$t2
504 lea ($s2,$s2),$r20
505 xor $t0,$s0
506 xor $t1,$s1
507 shr \$7,$t3
508 lea ($s3,$s3),$r21
509 ror \$8,$t0
510 ror \$8,$t1
511 sub $t2,$acc0
512 sub $t3,$acc1
513 xor $t0,$s0
514 xor $t1,$s1
515
516 and \$0xfefefefe,$r20
517 and \$0xfefefefe,$r21
518 and \$0x1b1b1b1b,$acc0
519 and \$0x1b1b1b1b,$acc1
520 mov $s2,$t2
521 mov $s3,$t3
522 xor $acc0,$r20
523 xor $acc1,$r21
524
525 xor $r20,$s2
526 xor $r21,$s3
527 rol \$24,$s2
528 rol \$24,$s3
529 xor $r20,$s2
530 xor $r21,$s3
531 mov 0($sbox),$acc0 # prefetch Te4
532 ror \$16,$t2
533 ror \$16,$t3
534 mov 64($sbox),$acc1
535 xor $t2,$s2
536 xor $t3,$s3
537 mov 128($sbox),$r20
538 ror \$8,$t2
539 ror \$8,$t3
540 mov 192($sbox),$r21
541 xor $t2,$s2
542 xor $t3,$s3
543___
544}
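# The masking dance in enctransform() above is four byte-wise GF(2^8)
# doublings ("xtime") performed in parallel inside a 32-bit register; a
# scalar Perl sketch of one such doubling, illustrative only and never
# called by this module:
sub xtime_packed {
	my ($w) = @_;			# four AES state bytes in one 32-bit word
	my $msb = $w & 0x80808080;	# top bit of every byte
	my $rdc = ($msb - ($msb >> 7)) & 0x1b1b1b1b; # 0x1b where a byte overflows
	return ((($w << 1) & 0xfefefefe) ^ $rdc);    # per-byte doubling mod x^8+x^4+x^3+x+1
}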
545
546$code.=<<___;
547.type _x86_64_AES_encrypt_compact,\@abi-omnipotent
548.align 16
549_x86_64_AES_encrypt_compact:
550 lea 128($sbox),$inp # size optimization
551 mov 0-128($inp),$acc1 # prefetch Te4
552 mov 32-128($inp),$acc2
553 mov 64-128($inp),$t0
554 mov 96-128($inp),$t1
555 mov 128-128($inp),$acc1
556 mov 160-128($inp),$acc2
557 mov 192-128($inp),$t0
558 mov 224-128($inp),$t1
559 jmp .Lenc_loop_compact
560.align 16
561.Lenc_loop_compact:
562 xor 0($key),$s0 # xor with key
563 xor 4($key),$s1
564 xor 8($key),$s2
565 xor 12($key),$s3
566 lea 16($key),$key
567___
568 &enccompactvert();
569$code.=<<___;
570 cmp 16(%rsp),$key
571 je .Lenc_compact_done
572___
573 &enctransform();
574$code.=<<___;
575 jmp .Lenc_loop_compact
576.align 16
577.Lenc_compact_done:
578 xor 0($key),$s0
579 xor 4($key),$s1
580 xor 8($key),$s2
581 xor 12($key),$s3
582 .byte 0xf3,0xc3 # rep ret
583.size _x86_64_AES_encrypt_compact,.-_x86_64_AES_encrypt_compact
584___
585
586# void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
587$code.=<<___;
588.globl AES_encrypt
589.type AES_encrypt,\@function,3
590.align 16
591AES_encrypt:
592 push %rbx
593 push %rbp
594 push %r12
595 push %r13
596 push %r14
597 push %r15
598
599 # allocate frame "above" key schedule
600 mov %rsp,%r10
601 lea -63(%rdx),%rcx # %rdx is key argument
602 and \$-64,%rsp
603 sub %rsp,%rcx
604 neg %rcx
605 and \$0x3c0,%rcx
606 sub %rcx,%rsp
607 sub \$32,%rsp
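	# net effect after the 64-byte alignment: %rsp -= ((%rsp-key+63) &
	# 0x3c0) + 32, placing the frame at a fixed offset (mod 1K) from
	# the key schedule so the two can't alias in L1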
608
609 mov %rsi,16(%rsp) # save out
610 mov %r10,24(%rsp) # save real stack pointer
611.Lenc_prologue:
612
613 mov %rdx,$key
614 mov 240($key),$rnds # load rounds
615
616 mov 0(%rdi),$s0 # load input vector
617 mov 4(%rdi),$s1
618 mov 8(%rdi),$s2
619 mov 12(%rdi),$s3
620
621 shl \$4,$rnds
622 lea ($key,$rnds),%rbp
623 mov $key,(%rsp) # key schedule
624 mov %rbp,8(%rsp) # end of key schedule
625
626 # pick Te4 copy which can't "overlap" with stack frame or key schedule
627 lea .LAES_Te+2048(%rip),$sbox
628 lea 768(%rsp),%rbp
629 sub $sbox,%rbp
630 and \$0x300,%rbp
631 lea ($sbox,%rbp),$sbox
632
633 call _x86_64_AES_encrypt_compact
634
635 mov 16(%rsp),$out # restore out
636 mov 24(%rsp),%rsi # restore saved stack pointer
637 mov $s0,0($out) # write output vector
638 mov $s1,4($out)
639 mov $s2,8($out)
640 mov $s3,12($out)
641
642 mov (%rsi),%r15
643 mov 8(%rsi),%r14
644 mov 16(%rsi),%r13
645 mov 24(%rsi),%r12
646 mov 32(%rsi),%rbp
647 mov 40(%rsi),%rbx
648 lea 48(%rsi),%rsp
649.Lenc_epilogue:
650 ret
651.size AES_encrypt,.-AES_encrypt
652___
653
654#------------------------------------------------------------------#
655
656sub decvert()
657{ my $t3="%r8d"; # zaps $inp!
658
659$code.=<<___;
660 # favor 3-way issue Opteron pipeline...
661 movzb `&lo("$s0")`,$acc0
662 movzb `&lo("$s1")`,$acc1
663 movzb `&lo("$s2")`,$acc2
664 mov 0($sbox,$acc0,8),$t0
665 mov 0($sbox,$acc1,8),$t1
666 mov 0($sbox,$acc2,8),$t2
667
668 movzb `&hi("$s3")`,$acc0
669 movzb `&hi("$s0")`,$acc1
670 movzb `&lo("$s3")`,$acc2
671 xor 3($sbox,$acc0,8),$t0
672 xor 3($sbox,$acc1,8),$t1
673 mov 0($sbox,$acc2,8),$t3
674
675 movzb `&hi("$s1")`,$acc0
676 shr \$16,$s0
677 movzb `&hi("$s2")`,$acc2
678 xor 3($sbox,$acc0,8),$t2
679 shr \$16,$s3
680 xor 3($sbox,$acc2,8),$t3
681
682 shr \$16,$s1
683 lea 16($key),$key
684 shr \$16,$s2
685
686 movzb `&lo("$s2")`,$acc0
687 movzb `&lo("$s3")`,$acc1
688 movzb `&lo("$s0")`,$acc2
689 xor 2($sbox,$acc0,8),$t0
690 xor 2($sbox,$acc1,8),$t1
691 xor 2($sbox,$acc2,8),$t2
692
693 movzb `&hi("$s1")`,$acc0
694 movzb `&hi("$s2")`,$acc1
695 movzb `&lo("$s1")`,$acc2
696 xor 1($sbox,$acc0,8),$t0
697 xor 1($sbox,$acc1,8),$t1
698 xor 2($sbox,$acc2,8),$t3
699
700 movzb `&hi("$s3")`,$acc0
701 mov 12($key),$s3
702 movzb `&hi("$s0")`,$acc2
703 xor 1($sbox,$acc0,8),$t2
704 mov 0($key),$s0
705 xor 1($sbox,$acc2,8),$t3
706
707 xor $t0,$s0
708 mov 4($key),$s1
709 mov 8($key),$s2
710 xor $t2,$s2
711 xor $t1,$s1
712 xor $t3,$s3
713___
714}
715
716sub declastvert()
717{ my $t3="%r8d"; # zaps $inp!
718
719$code.=<<___;
720 lea 2048($sbox),$sbox # size optimization
721 movzb `&lo("$s0")`,$acc0
722 movzb `&lo("$s1")`,$acc1
723 movzb `&lo("$s2")`,$acc2
724 movzb ($sbox,$acc0,1),$t0
725 movzb ($sbox,$acc1,1),$t1
726 movzb ($sbox,$acc2,1),$t2
727
728 movzb `&lo("$s3")`,$acc0
729 movzb `&hi("$s3")`,$acc1
730 movzb `&hi("$s0")`,$acc2
731 movzb ($sbox,$acc0,1),$t3
732 movzb ($sbox,$acc1,1),$acc1 #$t0
733 movzb ($sbox,$acc2,1),$acc2 #$t1
734
735 shl \$8,$acc1
736 shl \$8,$acc2
737
738 xor $acc1,$t0
739 xor $acc2,$t1
740 shr \$16,$s3
741
742 movzb `&hi("$s1")`,$acc0
743 movzb `&hi("$s2")`,$acc1
744 shr \$16,$s0
745 movzb ($sbox,$acc0,1),$acc0 #$t2
746 movzb ($sbox,$acc1,1),$acc1 #$t3
747
748 shl \$8,$acc0
749 shl \$8,$acc1
750 shr \$16,$s1
751 xor $acc0,$t2
752 xor $acc1,$t3
753 shr \$16,$s2
754
755 movzb `&lo("$s2")`,$acc0
756 movzb `&lo("$s3")`,$acc1
757 movzb `&lo("$s0")`,$acc2
758 movzb ($sbox,$acc0,1),$acc0 #$t0
759 movzb ($sbox,$acc1,1),$acc1 #$t1
760 movzb ($sbox,$acc2,1),$acc2 #$t2
761
762 shl \$16,$acc0
763 shl \$16,$acc1
764 shl \$16,$acc2
765
766 xor $acc0,$t0
767 xor $acc1,$t1
768 xor $acc2,$t2
769
770 movzb `&lo("$s1")`,$acc0
771 movzb `&hi("$s1")`,$acc1
772 movzb `&hi("$s2")`,$acc2
773 movzb ($sbox,$acc0,1),$acc0 #$t3
774 movzb ($sbox,$acc1,1),$acc1 #$t0
775 movzb ($sbox,$acc2,1),$acc2 #$t1
776
777 shl \$16,$acc0
778 shl \$24,$acc1
779 shl \$24,$acc2
780
781 xor $acc0,$t3
782 xor $acc1,$t0
783 xor $acc2,$t1
784
785 movzb `&hi("$s3")`,$acc0
786 movzb `&hi("$s0")`,$acc1
787 mov 16+12($key),$s3
788 movzb ($sbox,$acc0,1),$acc0 #$t2
789 movzb ($sbox,$acc1,1),$acc1 #$t3
790 mov 16+0($key),$s0
791
792 shl \$24,$acc0
793 shl \$24,$acc1
794
795 xor $acc0,$t2
796 xor $acc1,$t3
797
798 mov 16+4($key),$s1
799 mov 16+8($key),$s2
800 lea -2048($sbox),$sbox
801 xor $t0,$s0
802 xor $t1,$s1
803 xor $t2,$s2
804 xor $t3,$s3
805___
806}
807
808sub decstep()
809{ my ($i,@s) = @_;
810 my $tmp0=$acc0;
811 my $tmp1=$acc1;
812 my $tmp2=$acc2;
813 my $out=($t0,$t1,$t2,$s[0])[$i];
814
815 $code.=" mov $s[0],$out\n" if ($i!=3);
816 $tmp1=$s[2] if ($i==3);
817 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
818 $code.=" and \$0xFF,$out\n";
819
820 $code.=" mov 0($sbox,$out,8),$out\n";
821 $code.=" shr \$16,$tmp1\n";
822 $tmp2=$s[3] if ($i==3);
823 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
824
825 $tmp0=$s[1] if ($i==3);
826 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
827 $code.=" and \$0xFF,$tmp1\n";
828 $code.=" shr \$24,$tmp2\n";
829
830 $code.=" xor 3($sbox,$tmp0,8),$out\n";
831 $code.=" xor 2($sbox,$tmp1,8),$out\n";
832 $code.=" xor 1($sbox,$tmp2,8),$out\n";
833
834 $code.=" mov $t2,$s[1]\n" if ($i==3);
835 $code.=" mov $t1,$s[2]\n" if ($i==3);
836 $code.=" mov $t0,$s[3]\n" if ($i==3);
837 $code.="\n";
838}
839
840sub declast()
841{ my ($i,@s)=@_;
842 my $tmp0=$acc0;
843 my $tmp1=$acc1;
844 my $tmp2=$acc2;
845 my $out=($t0,$t1,$t2,$s[0])[$i];
846
847 $code.=" mov $s[0],$out\n" if ($i!=3);
848 $tmp1=$s[2] if ($i==3);
849 $code.=" mov $s[2],$tmp1\n" if ($i!=3);
850 $code.=" and \$0xFF,$out\n";
851
852 $code.=" movzb 2048($sbox,$out,1),$out\n";
853 $code.=" shr \$16,$tmp1\n";
854 $tmp2=$s[3] if ($i==3);
855 $code.=" mov $s[3],$tmp2\n" if ($i!=3);
856
857 $tmp0=$s[1] if ($i==3);
858 $code.=" movzb ".&hi($s[1]).",$tmp0\n";
859 $code.=" and \$0xFF,$tmp1\n";
860 $code.=" shr \$24,$tmp2\n";
861
862 $code.=" movzb 2048($sbox,$tmp0,1),$tmp0\n";
863 $code.=" movzb 2048($sbox,$tmp1,1),$tmp1\n";
864 $code.=" movzb 2048($sbox,$tmp2,1),$tmp2\n";
865
866 $code.=" shl \$8,$tmp0\n";
867 $code.=" shl \$16,$tmp1\n";
868 $code.=" shl \$24,$tmp2\n";
869
870 $code.=" xor $tmp0,$out\n";
871 $code.=" mov $t2,$s[1]\n" if ($i==3);
872 $code.=" xor $tmp1,$out\n";
873 $code.=" mov $t1,$s[2]\n" if ($i==3);
874 $code.=" xor $tmp2,$out\n";
875 $code.=" mov $t0,$s[3]\n" if ($i==3);
876 $code.="\n";
877}
878
879$code.=<<___;
880.type _x86_64_AES_decrypt,\@abi-omnipotent
881.align 16
882_x86_64_AES_decrypt:
883 xor 0($key),$s0 # xor with key
884 xor 4($key),$s1
885 xor 8($key),$s2
886 xor 12($key),$s3
887
888 mov 240($key),$rnds # load key->rounds
889 sub \$1,$rnds
890 jmp .Ldec_loop
891.align 16
892.Ldec_loop:
893___
894 if ($verticalspin) { &decvert(); }
895 else { &decstep(0,$s0,$s3,$s2,$s1);
896 &decstep(1,$s1,$s0,$s3,$s2);
897 &decstep(2,$s2,$s1,$s0,$s3);
898 &decstep(3,$s3,$s2,$s1,$s0);
899 $code.=<<___;
900 lea 16($key),$key
901 xor 0($key),$s0 # xor with key
902 xor 4($key),$s1
903 xor 8($key),$s2
904 xor 12($key),$s3
905___
906 }
907$code.=<<___;
908 sub \$1,$rnds
909 jnz .Ldec_loop
910___
911 if ($verticalspin) { &declastvert(); }
912 else { &declast(0,$s0,$s3,$s2,$s1);
913 &declast(1,$s1,$s0,$s3,$s2);
914 &declast(2,$s2,$s1,$s0,$s3);
915 &declast(3,$s3,$s2,$s1,$s0);
916 $code.=<<___;
917 xor 16+0($key),$s0 # xor with key
918 xor 16+4($key),$s1
919 xor 16+8($key),$s2
920 xor 16+12($key),$s3
921___
922 }
923$code.=<<___;
924 .byte 0xf3,0xc3 # rep ret
925.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
926___
927
928sub deccompactvert()
929{ my ($t3,$t4,$t5)=("%r8d","%r9d","%r13d");
930
931$code.=<<___;
932 movzb `&lo("$s0")`,$t0
933 movzb `&lo("$s1")`,$t1
934 movzb `&lo("$s2")`,$t2
935 movzb ($sbox,$t0,1),$t0
936 movzb ($sbox,$t1,1),$t1
937 movzb ($sbox,$t2,1),$t2
938
939 movzb `&lo("$s3")`,$t3
940 movzb `&hi("$s3")`,$acc0
941 movzb `&hi("$s0")`,$acc1
942 movzb ($sbox,$t3,1),$t3
943 movzb ($sbox,$acc0,1),$t4 #$t0
944 movzb ($sbox,$acc1,1),$t5 #$t1
945
946 movzb `&hi("$s1")`,$acc2
947 movzb `&hi("$s2")`,$acc0
948 shr \$16,$s2
949 movzb ($sbox,$acc2,1),$acc2 #$t2
950 movzb ($sbox,$acc0,1),$acc0 #$t3
951 shr \$16,$s3
952
953 movzb `&lo("$s2")`,$acc1
954 shl \$8,$t4
955 shl \$8,$t5
956 movzb ($sbox,$acc1,1),$acc1 #$t0
957 xor $t4,$t0
958 xor $t5,$t1
959
960 movzb `&lo("$s3")`,$t4
961 shr \$16,$s0
962 shr \$16,$s1
963 movzb `&lo("$s0")`,$t5
964 shl \$8,$acc2
965 shl \$8,$acc0
966 movzb ($sbox,$t4,1),$t4 #$t1
967 movzb ($sbox,$t5,1),$t5 #$t2
968 xor $acc2,$t2
969 xor $acc0,$t3
970
971 movzb `&lo("$s1")`,$acc2
972 movzb `&hi("$s1")`,$acc0
973 shl \$16,$acc1
974 movzb ($sbox,$acc2,1),$acc2 #$t3
975 movzb ($sbox,$acc0,1),$acc0 #$t0
976 xor $acc1,$t0
977
978 movzb `&hi("$s2")`,$acc1
979 shl \$16,$t4
980 shl \$16,$t5
981 movzb ($sbox,$acc1,1),$s1 #$t1
982 xor $t4,$t1
983 xor $t5,$t2
984
985 movzb `&hi("$s3")`,$acc1
986 shr \$8,$s0
987 shl \$16,$acc2
988 movzb ($sbox,$acc1,1),$s2 #$t2
989 movzb ($sbox,$s0,1),$s3 #$t3
990 xor $acc2,$t3
991
992 shl \$24,$acc0
993 shl \$24,$s1
994 shl \$24,$s2
995 xor $acc0,$t0
996 shl \$24,$s3
997 xor $t1,$s1
998 mov $t0,$s0
999 xor $t2,$s2
1000 xor $t3,$s3
1001___
1002}
1003
1004# parallelized version! input is a pair of 64-bit values: %rax=s1.s0
1005# and %rcx=s3.s2, output is four 32-bit values in %eax=s0, %ebx=s1,
1006# %ecx=s2 and %edx=s3.
1007sub dectransform()
1008{ my ($tp10,$tp20,$tp40,$tp80,$acc0)=("%rax","%r8", "%r9", "%r10","%rbx");
1009 my ($tp18,$tp28,$tp48,$tp88,$acc8)=("%rcx","%r11","%r12","%r13","%rdx");
1010 my $prefetch = shift;
1011
1012$code.=<<___;
1013 mov $tp10,$acc0
1014 mov $tp18,$acc8
1015 and $mask80,$acc0
1016 and $mask80,$acc8
1017 mov $acc0,$tp40
1018 mov $acc8,$tp48
1019 shr \$7,$tp40
1020 lea ($tp10,$tp10),$tp20
1021 shr \$7,$tp48
1022 lea ($tp18,$tp18),$tp28
1023 sub $tp40,$acc0
1024 sub $tp48,$acc8
1025 and $maskfe,$tp20
1026 and $maskfe,$tp28
1027 and $mask1b,$acc0
1028 and $mask1b,$acc8
1029 xor $tp20,$acc0
1030 xor $tp28,$acc8
1031 mov $acc0,$tp20
1032 mov $acc8,$tp28
1033
1034 and $mask80,$acc0
1035 and $mask80,$acc8
1036 mov $acc0,$tp80
1037 mov $acc8,$tp88
1038 shr \$7,$tp80
1039 lea ($tp20,$tp20),$tp40
1040 shr \$7,$tp88
1041 lea ($tp28,$tp28),$tp48
1042 sub $tp80,$acc0
1043 sub $tp88,$acc8
1044 and $maskfe,$tp40
1045 and $maskfe,$tp48
1046 and $mask1b,$acc0
1047 and $mask1b,$acc8
1048 xor $tp40,$acc0
1049 xor $tp48,$acc8
1050 mov $acc0,$tp40
1051 mov $acc8,$tp48
1052
1053 and $mask80,$acc0
1054 and $mask80,$acc8
1055 mov $acc0,$tp80
1056 mov $acc8,$tp88
1057 shr \$7,$tp80
1058 xor $tp10,$tp20 # tp2^=tp1
1059 shr \$7,$tp88
1060 xor $tp18,$tp28 # tp2^=tp1
1061 sub $tp80,$acc0
1062 sub $tp88,$acc8
1063 lea ($tp40,$tp40),$tp80
1064 lea ($tp48,$tp48),$tp88
1065 xor $tp10,$tp40 # tp4^=tp1
1066 xor $tp18,$tp48 # tp4^=tp1
1067 and $maskfe,$tp80
1068 and $maskfe,$tp88
1069 and $mask1b,$acc0
1070 and $mask1b,$acc8
1071 xor $acc0,$tp80
1072 xor $acc8,$tp88
1073
1074 xor $tp80,$tp10 # tp1^=tp8
1075 xor $tp88,$tp18 # tp1^=tp8
1076 xor $tp80,$tp20 # tp2^tp1^=tp8
1077 xor $tp88,$tp28 # tp2^tp1^=tp8
1078 mov $tp10,$acc0
1079 mov $tp18,$acc8
1080 xor $tp80,$tp40 # tp4^tp1^=tp8
1081 xor $tp88,$tp48 # tp4^tp1^=tp8
1082 shr \$32,$acc0
1083 shr \$32,$acc8
1084 xor $tp20,$tp80 # tp8^=tp8^tp2^tp1=tp2^tp1
1085 xor $tp28,$tp88 # tp8^=tp8^tp2^tp1=tp2^tp1
1086 rol \$8,`&LO("$tp10")` # ROTATE(tp1^tp8,8)
1087 rol \$8,`&LO("$tp18")` # ROTATE(tp1^tp8,8)
1088 xor $tp40,$tp80 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1089 xor $tp48,$tp88 # tp2^tp1^=tp8^tp4^tp1=tp8^tp4^tp2
1090
1091 rol \$8,`&LO("$acc0")` # ROTATE(tp1^tp8,8)
1092 rol \$8,`&LO("$acc8")` # ROTATE(tp1^tp8,8)
1093 xor `&LO("$tp80")`,`&LO("$tp10")`
1094 xor `&LO("$tp88")`,`&LO("$tp18")`
1095 shr \$32,$tp80
1096 shr \$32,$tp88
1097 xor `&LO("$tp80")`,`&LO("$acc0")`
1098 xor `&LO("$tp88")`,`&LO("$acc8")`
1099
1100 mov $tp20,$tp80
1101 mov $tp28,$tp88
1102 shr \$32,$tp80
1103 shr \$32,$tp88
1104 rol \$24,`&LO("$tp20")` # ROTATE(tp2^tp1^tp8,24)
1105 rol \$24,`&LO("$tp28")` # ROTATE(tp2^tp1^tp8,24)
1106 rol \$24,`&LO("$tp80")` # ROTATE(tp2^tp1^tp8,24)
1107 rol \$24,`&LO("$tp88")` # ROTATE(tp2^tp1^tp8,24)
1108 xor `&LO("$tp20")`,`&LO("$tp10")`
1109 xor `&LO("$tp28")`,`&LO("$tp18")`
1110 mov $tp40,$tp20
1111 mov $tp48,$tp28
1112 xor `&LO("$tp80")`,`&LO("$acc0")`
1113 xor `&LO("$tp88")`,`&LO("$acc8")`
1114
1115 `"mov 0($sbox),$mask80" if ($prefetch)`
1116 shr \$32,$tp20
1117 shr \$32,$tp28
1118 `"mov 64($sbox),$maskfe" if ($prefetch)`
1119 rol \$16,`&LO("$tp40")` # ROTATE(tp4^tp1^tp8,16)
1120 rol \$16,`&LO("$tp48")` # ROTATE(tp4^tp1^tp8,16)
1121 `"mov 128($sbox),$mask1b" if ($prefetch)`
1122 rol \$16,`&LO("$tp20")` # ROTATE(tp4^tp1^tp8,16)
1123 rol \$16,`&LO("$tp28")` # ROTATE(tp4^tp1^tp8,16)
1124 `"mov 192($sbox),$tp80" if ($prefetch)`
1125 xor `&LO("$tp40")`,`&LO("$tp10")`
1126 xor `&LO("$tp48")`,`&LO("$tp18")`
1127 `"mov 256($sbox),$tp88" if ($prefetch)`
1128 xor `&LO("$tp20")`,`&LO("$acc0")`
1129 xor `&LO("$tp28")`,`&LO("$acc8")`
1130___
1131}
1132
1133$code.=<<___;
1134.type _x86_64_AES_decrypt_compact,\@abi-omnipotent
1135.align 16
1136_x86_64_AES_decrypt_compact:
1137 lea 128($sbox),$inp # size optimization
1138 mov 0-128($inp),$acc1 # prefetch Td4
1139 mov 32-128($inp),$acc2
1140 mov 64-128($inp),$t0
1141 mov 96-128($inp),$t1
1142 mov 128-128($inp),$acc1
1143 mov 160-128($inp),$acc2
1144 mov 192-128($inp),$t0
1145 mov 224-128($inp),$t1
1146 jmp .Ldec_loop_compact
1147
1148.align 16
1149.Ldec_loop_compact:
1150 xor 0($key),$s0 # xor with key
1151 xor 4($key),$s1
1152 xor 8($key),$s2
1153 xor 12($key),$s3
1154 lea 16($key),$key
1155___
1156 &deccompactvert();
1157$code.=<<___;
1158 cmp 16(%rsp),$key
1159 je .Ldec_compact_done
1160
1161 mov 256+0($sbox),$mask80
1162 shl \$32,%rbx
1163 shl \$32,%rdx
1164 mov 256+8($sbox),$maskfe
1165 or %rbx,%rax
1166 or %rdx,%rcx
1167 mov 256+16($sbox),$mask1b
1168___
1169 &dectransform(1);
1170$code.=<<___;
1171 jmp .Ldec_loop_compact
1172.align 16
1173.Ldec_compact_done:
1174 xor 0($key),$s0
1175 xor 4($key),$s1
1176 xor 8($key),$s2
1177 xor 12($key),$s3
1178 .byte 0xf3,0xc3 # rep ret
1179.size _x86_64_AES_decrypt_compact,.-_x86_64_AES_decrypt_compact
1180___
1181
1182# void AES_decrypt (const void *inp,void *out,const AES_KEY *key);
1183$code.=<<___;
1184.globl AES_decrypt
1185.type AES_decrypt,\@function,3
1186.align 16
1187AES_decrypt:
1188 push %rbx
1189 push %rbp
1190 push %r12
1191 push %r13
1192 push %r14
1193 push %r15
1194
1195 # allocate frame "above" key schedule
1196 mov %rsp,%r10
1197 lea -63(%rdx),%rcx # %rdx is key argument
1198 and \$-64,%rsp
1199 sub %rsp,%rcx
1200 neg %rcx
1201 and \$0x3c0,%rcx
1202 sub %rcx,%rsp
1203 sub \$32,%rsp
1204
1205 mov %rsi,16(%rsp) # save out
1206 mov %r10,24(%rsp) # save real stack pointer
1207.Ldec_prologue:
1208
1209 mov %rdx,$key
1210 mov 240($key),$rnds # load rounds
1211
1212 mov 0(%rdi),$s0 # load input vector
1213 mov 4(%rdi),$s1
1214 mov 8(%rdi),$s2
1215 mov 12(%rdi),$s3
1216
1217 shl \$4,$rnds
1218 lea ($key,$rnds),%rbp
1219 mov $key,(%rsp) # key schedule
1220 mov %rbp,8(%rsp) # end of key schedule
1221
1222 # pick Td4 copy which can't "overlap" with stack frame or key schedule
1223 lea .LAES_Td+2048(%rip),$sbox
1224 lea 768(%rsp),%rbp
1225 sub $sbox,%rbp
1226 and \$0x300,%rbp
1227 lea ($sbox,%rbp),$sbox
1228 shr \$3,%rbp # recall "magic" constants!
1229 add %rbp,$sbox
1230
1231 call _x86_64_AES_decrypt_compact
1232
1233 mov 16(%rsp),$out # restore out
1234 mov 24(%rsp),%rsi # restore saved stack pointer
1235 mov $s0,0($out) # write output vector
1236 mov $s1,4($out)
1237 mov $s2,8($out)
1238 mov $s3,12($out)
1239
1240 mov (%rsi),%r15
1241 mov 8(%rsi),%r14
1242 mov 16(%rsi),%r13
1243 mov 24(%rsi),%r12
1244 mov 32(%rsi),%rbp
1245 mov 40(%rsi),%rbx
1246 lea 48(%rsi),%rsp
1247.Ldec_epilogue:
1248 ret
1249.size AES_decrypt,.-AES_decrypt
1250___
1251#------------------------------------------------------------------#
1252
1253sub enckey()
1254{
1255$code.=<<___;
1256 movz %dl,%esi # rk[i]>>0
1257 movzb -128(%rbp,%rsi),%ebx
1258 movz %dh,%esi # rk[i]>>8
1259 shl \$24,%ebx
1260 xor %ebx,%eax
1261
1262 movzb -128(%rbp,%rsi),%ebx
1263 shr \$16,%edx
1264 movz %dl,%esi # rk[i]>>16
1265 xor %ebx,%eax
1266
1267 movzb -128(%rbp,%rsi),%ebx
1268 movz %dh,%esi # rk[i]>>24
1269 shl \$8,%ebx
1270 xor %ebx,%eax
1271
1272 movzb -128(%rbp,%rsi),%ebx
1273 shl \$16,%ebx
1274 xor %ebx,%eax
1275
1276 xor 1024-128(%rbp,%rcx,4),%eax # rcon
1277___
1278}
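# A scalar sketch of the value &enckey() folds into rk[0], i.e.
# SubWord(RotWord(prev))^rcon (illustrative only and never called; @S
# stands in for the Te4 byte S-box the real code reads at -128(%rbp,...)):
sub expand_word {
	my ($prev, $rcon) = @_;		# previous round-key word, round constant
	my @b = map { ($prev >> 8*$_) & 0xff } (0..3);	# little-endian bytes
	return ($S[$b[1]] | $S[$b[2]] << 8 |		# SubWord(RotWord(prev))
		$S[$b[3]] << 16 | $S[$b[0]] << 24) ^ $rcon;
}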
1279
1280# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
1281# AES_KEY *key)
1282$code.=<<___;
1283.globl AES_set_encrypt_key
1284.type AES_set_encrypt_key,\@function,3
1285.align 16
1286AES_set_encrypt_key:
1287 push %rbx
1288 push %rbp
1289	push	%r12			# redundant, but allows sharing the
1290 push %r13 # exception handler...
1291 push %r14
1292 push %r15
1293 sub \$8,%rsp
1294.Lenc_key_prologue:
1295
1296 call _x86_64_AES_set_encrypt_key
1297
1298 mov 8(%rsp),%r15
1299 mov 16(%rsp),%r14
1300 mov 24(%rsp),%r13
1301 mov 32(%rsp),%r12
1302 mov 40(%rsp),%rbp
1303 mov 48(%rsp),%rbx
1304 add \$56,%rsp
1305.Lenc_key_epilogue:
1306 ret
1307.size AES_set_encrypt_key,.-AES_set_encrypt_key
1308
1309.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
1310.align 16
1311_x86_64_AES_set_encrypt_key:
1312 mov %esi,%ecx # %ecx=bits
1313 mov %rdi,%rsi # %rsi=userKey
1314 mov %rdx,%rdi # %rdi=key
1315
1316 test \$-1,%rsi
1317 jz .Lbadpointer
1318 test \$-1,%rdi
1319 jz .Lbadpointer
1320
1321 lea .LAES_Te(%rip),%rbp
1322 lea 2048+128(%rbp),%rbp
1323
1324 # prefetch Te4
1325 mov 0-128(%rbp),%eax
1326 mov 32-128(%rbp),%ebx
1327 mov 64-128(%rbp),%r8d
1328 mov 96-128(%rbp),%edx
1329 mov 128-128(%rbp),%eax
1330 mov 160-128(%rbp),%ebx
1331 mov 192-128(%rbp),%r8d
1332 mov 224-128(%rbp),%edx
1333
1334 cmp \$128,%ecx
1335 je .L10rounds
1336 cmp \$192,%ecx
1337 je .L12rounds
1338 cmp \$256,%ecx
1339 je .L14rounds
1340 mov \$-2,%rax # invalid number of bits
1341 jmp .Lexit
1342
1343.L10rounds:
1344 mov 0(%rsi),%rax # copy first 4 dwords
1345 mov 8(%rsi),%rdx
1346 mov %rax,0(%rdi)
1347 mov %rdx,8(%rdi)
1348
1349 shr \$32,%rdx
1350 xor %ecx,%ecx
1351 jmp .L10shortcut
1352.align 4
1353.L10loop:
1354 mov 0(%rdi),%eax # rk[0]
1355 mov 12(%rdi),%edx # rk[3]
1356.L10shortcut:
1357___
1358 &enckey ();
1359$code.=<<___;
1360 mov %eax,16(%rdi) # rk[4]
1361 xor 4(%rdi),%eax
1362 mov %eax,20(%rdi) # rk[5]
1363 xor 8(%rdi),%eax
1364 mov %eax,24(%rdi) # rk[6]
1365 xor 12(%rdi),%eax
1366 mov %eax,28(%rdi) # rk[7]
1367 add \$1,%ecx
1368 lea 16(%rdi),%rdi
1369 cmp \$10,%ecx
1370 jl .L10loop
1371
1372 movl \$10,80(%rdi) # setup number of rounds
1373 xor %rax,%rax
1374 jmp .Lexit
1375
1376.L12rounds:
1377 mov 0(%rsi),%rax # copy first 6 dwords
1378 mov 8(%rsi),%rbx
1379 mov 16(%rsi),%rdx
1380 mov %rax,0(%rdi)
1381 mov %rbx,8(%rdi)
1382 mov %rdx,16(%rdi)
1383
1384 shr \$32,%rdx
1385 xor %ecx,%ecx
1386 jmp .L12shortcut
1387.align 4
1388.L12loop:
1389 mov 0(%rdi),%eax # rk[0]
1390 mov 20(%rdi),%edx # rk[5]
1391.L12shortcut:
1392___
1393 &enckey ();
1394$code.=<<___;
1395 mov %eax,24(%rdi) # rk[6]
1396 xor 4(%rdi),%eax
1397 mov %eax,28(%rdi) # rk[7]
1398 xor 8(%rdi),%eax
1399 mov %eax,32(%rdi) # rk[8]
1400 xor 12(%rdi),%eax
1401 mov %eax,36(%rdi) # rk[9]
1402
1403 cmp \$7,%ecx
1404 je .L12break
1405 add \$1,%ecx
1406
1407 xor 16(%rdi),%eax
1408 mov %eax,40(%rdi) # rk[10]
1409 xor 20(%rdi),%eax
1410 mov %eax,44(%rdi) # rk[11]
1411
1412 lea 24(%rdi),%rdi
1413 jmp .L12loop
1414.L12break:
1415 movl \$12,72(%rdi) # setup number of rounds
1416 xor %rax,%rax
1417 jmp .Lexit
1418
1419.L14rounds:
1420 mov 0(%rsi),%rax # copy first 8 dwords
1421 mov 8(%rsi),%rbx
1422 mov 16(%rsi),%rcx
1423 mov 24(%rsi),%rdx
1424 mov %rax,0(%rdi)
1425 mov %rbx,8(%rdi)
1426 mov %rcx,16(%rdi)
1427 mov %rdx,24(%rdi)
1428
1429 shr \$32,%rdx
1430 xor %ecx,%ecx
1431 jmp .L14shortcut
1432.align 4
1433.L14loop:
1434 mov 0(%rdi),%eax # rk[0]
1435 mov 28(%rdi),%edx # rk[4]
1436.L14shortcut:
1437___
1438 &enckey ();
1439$code.=<<___;
1440 mov %eax,32(%rdi) # rk[8]
1441 xor 4(%rdi),%eax
1442 mov %eax,36(%rdi) # rk[9]
1443 xor 8(%rdi),%eax
1444 mov %eax,40(%rdi) # rk[10]
1445 xor 12(%rdi),%eax
1446 mov %eax,44(%rdi) # rk[11]
1447
1448 cmp \$6,%ecx
1449 je .L14break
1450 add \$1,%ecx
1451
1452 mov %eax,%edx
1453 mov 16(%rdi),%eax # rk[4]
1454 movz %dl,%esi # rk[11]>>0
1455 movzb -128(%rbp,%rsi),%ebx
1456 movz %dh,%esi # rk[11]>>8
1457 xor %ebx,%eax
1458
1459 movzb -128(%rbp,%rsi),%ebx
1460 shr \$16,%edx
1461 shl \$8,%ebx
1462 movz %dl,%esi # rk[11]>>16
1463 xor %ebx,%eax
1464
1465 movzb -128(%rbp,%rsi),%ebx
1466 movz %dh,%esi # rk[11]>>24
1467 shl \$16,%ebx
1468 xor %ebx,%eax
1469
1470 movzb -128(%rbp,%rsi),%ebx
1471 shl \$24,%ebx
1472 xor %ebx,%eax
1473
1474 mov %eax,48(%rdi) # rk[12]
1475 xor 20(%rdi),%eax
1476 mov %eax,52(%rdi) # rk[13]
1477 xor 24(%rdi),%eax
1478 mov %eax,56(%rdi) # rk[14]
1479 xor 28(%rdi),%eax
1480 mov %eax,60(%rdi) # rk[15]
1481
1482 lea 32(%rdi),%rdi
1483 jmp .L14loop
1484.L14break:
1485 movl \$14,48(%rdi) # setup number of rounds
1486 xor %rax,%rax
1487 jmp .Lexit
1488
1489.Lbadpointer:
1490 mov \$-1,%rax
1491.Lexit:
1492 .byte 0xf3,0xc3 # rep ret
1493.size _x86_64_AES_set_encrypt_key,.-_x86_64_AES_set_encrypt_key
1494___
1495
1496sub deckey_ref()
1497{ my ($i,$ptr,$te,$td) = @_;
1498 my ($tp1,$tp2,$tp4,$tp8,$acc)=("%eax","%ebx","%edi","%edx","%r8d");
1499$code.=<<___;
1500 mov $i($ptr),$tp1
1501 mov $tp1,$acc
1502 and \$0x80808080,$acc
1503 mov $acc,$tp4
1504 shr \$7,$tp4
1505 lea 0($tp1,$tp1),$tp2
1506 sub $tp4,$acc
1507 and \$0xfefefefe,$tp2
1508 and \$0x1b1b1b1b,$acc
1509 xor $tp2,$acc
1510 mov $acc,$tp2
1511
1512 and \$0x80808080,$acc
1513 mov $acc,$tp8
1514 shr \$7,$tp8
1515 lea 0($tp2,$tp2),$tp4
1516 sub $tp8,$acc
1517 and \$0xfefefefe,$tp4
1518 and \$0x1b1b1b1b,$acc
1519 xor $tp1,$tp2 # tp2^tp1
1520 xor $tp4,$acc
1521 mov $acc,$tp4
1522
1523 and \$0x80808080,$acc
1524 mov $acc,$tp8
1525 shr \$7,$tp8
1526 sub $tp8,$acc
1527 lea 0($tp4,$tp4),$tp8
1528 xor $tp1,$tp4 # tp4^tp1
1529 and \$0xfefefefe,$tp8
1530 and \$0x1b1b1b1b,$acc
1531 xor $acc,$tp8
1532
1533 xor $tp8,$tp1 # tp1^tp8
1534 rol \$8,$tp1 # ROTATE(tp1^tp8,8)
1535 xor $tp8,$tp2 # tp2^tp1^tp8
1536 xor $tp8,$tp4 # tp4^tp1^tp8
1537 xor $tp2,$tp8
1538 xor $tp4,$tp8 # tp8^(tp8^tp4^tp1)^(tp8^tp2^tp1)=tp8^tp4^tp2
1539
1540 xor $tp8,$tp1
1541 rol \$24,$tp2 # ROTATE(tp2^tp1^tp8,24)
1542 xor $tp2,$tp1
1543 rol \$16,$tp4 # ROTATE(tp4^tp1^tp8,16)
1544 xor $tp4,$tp1
1545
1546 mov $tp1,$i($ptr)
1547___
1548}
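# Taken together, the steps above compute InvMixColumns on one round-key
# word; an equivalent scalar Perl sketch, reusing xtime_packed() from the
# encrypt half (illustrative only and never called):
sub ROTATE { my ($x, $n) = @_; (($x << $n) | ($x >> (32 - $n))) & 0xffffffff }
sub inv_mix_column {
	my ($tp1) = @_;
	my $tp2 = xtime_packed($tp1);			# 2*x
	my $tp4 = xtime_packed($tp2);			# 4*x
	my $tp8 = xtime_packed($tp4);			# 8*x
	return (($tp8 ^ $tp4 ^ $tp2)			# "0e" multiple
	    ^ ROTATE($tp1 ^ $tp8, 8)			# "09" multiple, rotated
	    ^ ROTATE($tp2 ^ $tp1 ^ $tp8, 24)		# "0b" multiple, rotated
	    ^ ROTATE($tp4 ^ $tp1 ^ $tp8, 16));		# "0d" multiple, rotated
}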
1549
1550# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
1551# AES_KEY *key)
1552$code.=<<___;
1553.globl AES_set_decrypt_key
1554.type AES_set_decrypt_key,\@function,3
1555.align 16
1556AES_set_decrypt_key:
1557 push %rbx
1558 push %rbp
1559 push %r12
1560 push %r13
1561 push %r14
1562 push %r15
1563 push %rdx # save key schedule
1564.Ldec_key_prologue:
1565
1566 call _x86_64_AES_set_encrypt_key
1567 mov (%rsp),%r8 # restore key schedule
1568 cmp \$0,%eax
1569 jne .Labort
1570
1571 mov 240(%r8),%r14d # pull number of rounds
1572 xor %rdi,%rdi
1573 lea (%rdi,%r14d,4),%rcx
1574 mov %r8,%rsi
1575 lea (%r8,%rcx,4),%rdi # pointer to last chunk
1576.align 4
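	# swap the first and last round keys, walking both pointers inwards
	# one 16-byte round key at a time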
1577.Linvert:
1578 mov 0(%rsi),%rax
1579 mov 8(%rsi),%rbx
1580 mov 0(%rdi),%rcx
1581 mov 8(%rdi),%rdx
1582 mov %rax,0(%rdi)
1583 mov %rbx,8(%rdi)
1584 mov %rcx,0(%rsi)
1585 mov %rdx,8(%rsi)
1586 lea 16(%rsi),%rsi
1587 lea -16(%rdi),%rdi
1588 cmp %rsi,%rdi
1589 jne .Linvert
1590
1591 lea .LAES_Te+2048+1024(%rip),%rax # rcon
1592
1593 mov 40(%rax),$mask80
1594 mov 48(%rax),$maskfe
1595 mov 56(%rax),$mask1b
1596
1597 mov %r8,$key
1598 sub \$1,%r14d
1599.align 4
1600.Lpermute:
1601 lea 16($key),$key
1602 mov 0($key),%rax
1603 mov 8($key),%rcx
1604___
1605 &dectransform ();
1606$code.=<<___;
1607 mov %eax,0($key)
1608 mov %ebx,4($key)
1609 mov %ecx,8($key)
1610 mov %edx,12($key)
1611 sub \$1,%r14d
1612 jnz .Lpermute
1613
1614 xor %rax,%rax
1615.Labort:
1616 mov 8(%rsp),%r15
1617 mov 16(%rsp),%r14
1618 mov 24(%rsp),%r13
1619 mov 32(%rsp),%r12
1620 mov 40(%rsp),%rbp
1621 mov 48(%rsp),%rbx
1622 add \$56,%rsp
1623.Ldec_key_epilogue:
1624 ret
1625.size AES_set_decrypt_key,.-AES_set_decrypt_key
1626___
1627
1628# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
1629# size_t length, const AES_KEY *key,
1630# unsigned char *ivp,const int enc);
1631{
1632# stack frame layout
1633# -8(%rsp) return address
1634my $keyp="0(%rsp)"; # one to pass as $key
1635my $keyend="8(%rsp)"; # &(keyp->rd_key[4*keyp->rounds])
1636my $_rsp="16(%rsp)"; # saved %rsp
1637my $_inp="24(%rsp)"; # copy of 1st parameter, inp
1638my $_out="32(%rsp)"; # copy of 2nd parameter, out
1639my $_len="40(%rsp)"; # copy of 3rd parameter, length
1640my $_key="48(%rsp)"; # copy of 4th parameter, key
1641my $_ivp="56(%rsp)"; # copy of 5th parameter, ivp
1642my $ivec="64(%rsp)"; # ivec[16]
1643my $aes_key="80(%rsp)"; # copy of aes_key
1644my $mark="80+240(%rsp)"; # copy of aes_key->rounds
1645
1646$code.=<<___;
1647.globl AES_cbc_encrypt
1648.type AES_cbc_encrypt,\@function,6
1649.align 16
1650.extern OPENSSL_ia32cap_P
1651AES_cbc_encrypt:
1652 cmp \$0,%rdx # check length
1653 je .Lcbc_epilogue
1654 pushfq
1655 push %rbx
1656 push %rbp
1657 push %r12
1658 push %r13
1659 push %r14
1660 push %r15
1661.Lcbc_prologue:
1662
1663 cld
1664 mov %r9d,%r9d # clear upper half of enc
1665
1666 lea .LAES_Te(%rip),$sbox
1667 cmp \$0,%r9
1668 jne .Lcbc_picked_te
1669 lea .LAES_Td(%rip),$sbox
1670.Lcbc_picked_te:
1671
1672 mov PIC_GOT(OPENSSL_ia32cap_P),%r10d
1673 cmp \$$speed_limit,%rdx
1674 jb .Lcbc_slow_prologue
1675 test \$15,%rdx
1676 jnz .Lcbc_slow_prologue
1677 bt \$28,%r10d
1678 jc .Lcbc_slow_prologue
1679
1680 # allocate aligned stack frame...
1681 lea -88-248(%rsp),$key
1682 and \$-64,$key
1683
1684 # ... and make sure it doesn't alias with AES_T[ed] modulo 4096
1685 mov $sbox,%r10
1686 lea 2304($sbox),%r11
1687 mov $key,%r12
1688 and \$0xFFF,%r10 # s = $sbox&0xfff
1689 and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff
1690 and \$0xFFF,%r12 # p = %rsp&0xfff
1691
1692	cmp	%r11,%r12		# if (p>=e) %rsp -= (p-e);
1693 jb .Lcbc_te_break_out
1694 sub %r11,%r12
1695 sub %r12,$key
1696 jmp .Lcbc_te_ok
1697.Lcbc_te_break_out: # else %rsp -= (p-s)&0xfff + framesz
1698 sub %r10,%r12
1699 and \$0xFFF,%r12
1700 add \$320,%r12
1701 sub %r12,$key
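	# in C-like terms: s = sbox&0xfff, e = (sbox+2304)&0xfff,
	# p = frame&0xfff;  frame -= (p >= e) ? (p - e)
	#                                     : ((p - s) & 0xfff) + 320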
1702.align 4
1703.Lcbc_te_ok:
1704
1705 xchg %rsp,$key
1706 #add \$8,%rsp # reserve for return address!
1707 mov $key,$_rsp # save %rsp
1708.Lcbc_fast_body:
1709 mov %rdi,$_inp # save copy of inp
1710 mov %rsi,$_out # save copy of out
1711 mov %rdx,$_len # save copy of len
1712 mov %rcx,$_key # save copy of key
1713 mov %r8,$_ivp # save copy of ivp
1714 movl \$0,$mark # copy of aes_key->rounds = 0;
1715 mov %r8,%rbp # rearrange input arguments
1716 mov %r9,%rbx
1717 mov %rsi,$out
1718 mov %rdi,$inp
1719 mov %rcx,$key
1720
1721 mov 240($key),%eax # key->rounds
1722 # do we copy key schedule to stack?
1723 mov $key,%r10
1724 sub $sbox,%r10
1725 and \$0xfff,%r10
1726 cmp \$2304,%r10
1727 jb .Lcbc_do_ecopy
1728 cmp \$4096-248,%r10
1729 jb .Lcbc_skip_ecopy
1730.align 4
1731.Lcbc_do_ecopy:
1732 mov $key,%rsi
1733 lea $aes_key,%rdi
1734 lea $aes_key,$key
1735 mov \$240/8,%ecx
1736 .long 0x90A548F3 # rep movsq
1737 mov %eax,(%rdi) # copy aes_key->rounds
1738.Lcbc_skip_ecopy:
1739 mov $key,$keyp # save key pointer
1740
1741 mov \$18,%ecx
1742.align 4
1743.Lcbc_prefetch_te:
1744 mov 0($sbox),%r10
1745 mov 32($sbox),%r11
1746 mov 64($sbox),%r12
1747 mov 96($sbox),%r13
1748 lea 128($sbox),$sbox
1749 sub \$1,%ecx
1750 jnz .Lcbc_prefetch_te
1751 lea -2304($sbox),$sbox
1752
1753 cmp \$0,%rbx
1754 je .LFAST_DECRYPT
1755
1756#----------------------------- ENCRYPT -----------------------------#
1757 mov 0(%rbp),$s0 # load iv
1758 mov 4(%rbp),$s1
1759 mov 8(%rbp),$s2
1760 mov 12(%rbp),$s3
1761
1762.align 4
1763.Lcbc_fast_enc_loop:
1764 xor 0($inp),$s0
1765 xor 4($inp),$s1
1766 xor 8($inp),$s2
1767 xor 12($inp),$s3
1768 mov $keyp,$key # restore key
1769 mov $inp,$_inp # if ($verticalspin) save inp
1770
1771 call _x86_64_AES_encrypt
1772
1773 mov $_inp,$inp # if ($verticalspin) restore inp
1774 mov $_len,%r10
1775 mov $s0,0($out)
1776 mov $s1,4($out)
1777 mov $s2,8($out)
1778 mov $s3,12($out)
1779
1780 lea 16($inp),$inp
1781 lea 16($out),$out
1782 sub \$16,%r10
1783 test \$-16,%r10
1784 mov %r10,$_len
1785 jnz .Lcbc_fast_enc_loop
1786 mov $_ivp,%rbp # restore ivp
1787 mov $s0,0(%rbp) # save ivec
1788 mov $s1,4(%rbp)
1789 mov $s2,8(%rbp)
1790 mov $s3,12(%rbp)
1791
1792 jmp .Lcbc_fast_cleanup
1793
1794#----------------------------- DECRYPT -----------------------------#
1795.align 16
1796.LFAST_DECRYPT:
1797 cmp $inp,$out
1798 je .Lcbc_fast_dec_in_place
1799
1800 mov %rbp,$ivec
1801.align 4
1802.Lcbc_fast_dec_loop:
1803 mov 0($inp),$s0 # read input
1804 mov 4($inp),$s1
1805 mov 8($inp),$s2
1806 mov 12($inp),$s3
1807 mov $keyp,$key # restore key
1808 mov $inp,$_inp # if ($verticalspin) save inp
1809
1810 call _x86_64_AES_decrypt
1811
1812 mov $ivec,%rbp # load ivp
1813 mov $_inp,$inp # if ($verticalspin) restore inp
1814 mov $_len,%r10 # load len
1815 xor 0(%rbp),$s0 # xor iv
1816 xor 4(%rbp),$s1
1817 xor 8(%rbp),$s2
1818 xor 12(%rbp),$s3
1819 mov $inp,%rbp # current input, next iv
1820
1821 sub \$16,%r10
1822 mov %r10,$_len # update len
1823 mov %rbp,$ivec # update ivp
1824
1825 mov $s0,0($out) # write output
1826 mov $s1,4($out)
1827 mov $s2,8($out)
1828 mov $s3,12($out)
1829
1830 lea 16($inp),$inp
1831 lea 16($out),$out
1832 jnz .Lcbc_fast_dec_loop
1833 mov $_ivp,%r12 # load user ivp
1834 mov 0(%rbp),%r10 # load iv
1835 mov 8(%rbp),%r11
1836 mov %r10,0(%r12) # copy back to user
1837 mov %r11,8(%r12)
1838 jmp .Lcbc_fast_cleanup
1839
1840.align 16
1841.Lcbc_fast_dec_in_place:
1842 mov 0(%rbp),%r10 # copy iv to stack
1843 mov 8(%rbp),%r11
1844 mov %r10,0+$ivec
1845 mov %r11,8+$ivec
1846.align 4
1847.Lcbc_fast_dec_in_place_loop:
1848 mov 0($inp),$s0 # load input
1849 mov 4($inp),$s1
1850 mov 8($inp),$s2
1851 mov 12($inp),$s3
1852 mov $keyp,$key # restore key
1853 mov $inp,$_inp # if ($verticalspin) save inp
1854
1855 call _x86_64_AES_decrypt
1856
1857 mov $_inp,$inp # if ($verticalspin) restore inp
1858 mov $_len,%r10
1859 xor 0+$ivec,$s0
1860 xor 4+$ivec,$s1
1861 xor 8+$ivec,$s2
1862 xor 12+$ivec,$s3
1863
1864 mov 0($inp),%r11 # load input
1865 mov 8($inp),%r12
1866 sub \$16,%r10
1867 jz .Lcbc_fast_dec_in_place_done
1868
1869 mov %r11,0+$ivec # copy input to iv
1870 mov %r12,8+$ivec
1871
1872 mov $s0,0($out) # save output [zaps input]
1873 mov $s1,4($out)
1874 mov $s2,8($out)
1875 mov $s3,12($out)
1876
1877 lea 16($inp),$inp
1878 lea 16($out),$out
1879 mov %r10,$_len
1880 jmp .Lcbc_fast_dec_in_place_loop
1881.Lcbc_fast_dec_in_place_done:
1882 mov $_ivp,%rdi
1883 mov %r11,0(%rdi) # copy iv back to user
1884 mov %r12,8(%rdi)
1885
1886 mov $s0,0($out) # save output [zaps input]
1887 mov $s1,4($out)
1888 mov $s2,8($out)
1889 mov $s3,12($out)
1890
1891.align 4
1892.Lcbc_fast_cleanup:
1893 cmpl \$0,$mark # was the key schedule copied?
1894 lea $aes_key,%rdi
1895 je .Lcbc_exit
1896 mov \$240/8,%ecx
1897 xor %rax,%rax
1898 .long 0x90AB48F3 # rep stosq
1899
1900 jmp .Lcbc_exit
1901
1902#--------------------------- SLOW ROUTINE ---------------------------#
1903.align 16
1904.Lcbc_slow_prologue:
1905 # allocate aligned stack frame...
1906 lea -88(%rsp),%rbp
1907 and \$-64,%rbp
1908 # ... just "above" key schedule
1909 lea -88-63(%rcx),%r10
1910 sub %rbp,%r10
1911 neg %r10
1912 and \$0x3c0,%r10
1913 sub %r10,%rbp
1914
1915 xchg %rsp,%rbp
1916 #add \$8,%rsp # reserve for return address!
1917 mov %rbp,$_rsp # save %rsp
1918.Lcbc_slow_body:
1919 #mov %rdi,$_inp # save copy of inp
1920 #mov %rsi,$_out # save copy of out
1921 #mov %rdx,$_len # save copy of len
1922 #mov %rcx,$_key # save copy of key
1923 mov %r8,$_ivp # save copy of ivp
1924 mov %r8,%rbp # rearrange input arguments
1925 mov %r9,%rbx
1926 mov %rsi,$out
1927 mov %rdi,$inp
1928 mov %rcx,$key
1929 mov %rdx,%r10
1930
1931 mov 240($key),%eax
1932 mov $key,$keyp # save key pointer
1933 shl \$4,%eax
1934 lea ($key,%rax),%rax
1935 mov %rax,$keyend
1936
1937	# pick Te4 copy which can't "overlap" with stack frame or key schedule
1938 lea 2048($sbox),$sbox
1939 lea 768-8(%rsp),%rax
1940 sub $sbox,%rax
1941 and \$0x300,%rax
1942 lea ($sbox,%rax),$sbox
1943
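# Restating the address arithmetic above (sketch; the Perl scalars stand
# in for register values): one of the four 256-byte Te4 copies is picked
# from bits 8-9 of the distance between the stack frame and the table,
# so that table loads and frame accesses land in a favourable relative
# position in L1:
#
#	my $te4   = $sbox + 2048;		# first Te4 copy, past Te0..Te3
#	my $delta = (($rsp + 768 - 8) - $te4) & 0x300;
#	$sbox     = $te4 + $delta;		# selected copy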
1944 cmp \$0,%rbx
1945 je .LSLOW_DECRYPT
1946
1947#--------------------------- SLOW ENCRYPT ---------------------------#
1948 test \$-16,%r10 # check upon length
1949 mov 0(%rbp),$s0 # load iv
1950 mov 4(%rbp),$s1
1951 mov 8(%rbp),$s2
1952 mov 12(%rbp),$s3
1953 jz .Lcbc_slow_enc_tail # short input...
1954
1955.align 4
1956.Lcbc_slow_enc_loop:
1957 xor 0($inp),$s0
1958 xor 4($inp),$s1
1959 xor 8($inp),$s2
1960 xor 12($inp),$s3
1961 mov $keyp,$key # restore key
1962 mov $inp,$_inp # save inp
1963 mov $out,$_out # save out
1964 mov %r10,$_len # save len
1965
1966 call _x86_64_AES_encrypt_compact
1967
1968 mov $_inp,$inp # restore inp
1969 mov $_out,$out # restore out
1970 mov $_len,%r10 # restore len
1971 mov $s0,0($out)
1972 mov $s1,4($out)
1973 mov $s2,8($out)
1974 mov $s3,12($out)
1975
1976 lea 16($inp),$inp
1977 lea 16($out),$out
1978 sub \$16,%r10
1979 test \$-16,%r10
1980 jnz .Lcbc_slow_enc_loop
1981 test \$15,%r10
1982 jnz .Lcbc_slow_enc_tail
1983 mov $_ivp,%rbp # restore ivp
1984 mov $s0,0(%rbp) # save ivec
1985 mov $s1,4(%rbp)
1986 mov $s2,8(%rbp)
1987 mov $s3,12(%rbp)
1988
1989 jmp .Lcbc_exit
1990
1991.align 4
1992.Lcbc_slow_enc_tail:
1993 mov %rax,%r11
1994 mov %rcx,%r12
1995 mov %r10,%rcx
1996 mov $inp,%rsi
1997 mov $out,%rdi
1998 .long 0x9066A4F3 # rep movsb
1999 mov \$16,%rcx # zero tail
2000 sub %r10,%rcx
2001 xor %rax,%rax
2002 .long 0x9066AAF3 # rep stosb
2003 mov $out,$inp # this is not a mistake!
2004 mov \$16,%r10 # len=16
2005 mov %r11,%rax
2006 mov %r12,%rcx
2007 jmp .Lcbc_slow_enc_loop # one more spin...
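# Tail handling above, sketched: the last ($len % 16) input bytes are
# copied to the output buffer, zero-padded there to a full 16 bytes, and
# encrypted by one more trip through the loop with $inp aimed at $out
# (hence "this is not a mistake!"; $out_buf/$pos/$tail are hypothetical
# stand-ins):
#
#	substr($out_buf, $pos, 16) = $tail . ("\0" x (16 - length $tail));
#	($inp, $len) = ($out, 16);	# one more spin over the padded block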
2008#--------------------------- SLOW DECRYPT ---------------------------#
2009.align 16
2010.LSLOW_DECRYPT:
2011 shr \$3,%rax
2012 add %rax,$sbox # recall "magic" constants!
2013
2014 mov 0(%rbp),%r11 # copy iv to stack
2015 mov 8(%rbp),%r12
2016 mov %r11,0+$ivec
2017 mov %r12,8+$ivec
2018
2019.align 4
2020.Lcbc_slow_dec_loop:
2021 mov 0($inp),$s0 # load input
2022 mov 4($inp),$s1
2023 mov 8($inp),$s2
2024 mov 12($inp),$s3
2025 mov $keyp,$key # restore key
2026 mov $inp,$_inp # save inp
2027 mov $out,$_out # save out
2028 mov %r10,$_len # save len
2029
2030 call _x86_64_AES_decrypt_compact
2031
2032 mov $_inp,$inp # restore inp
2033 mov $_out,$out # restore out
2034 mov $_len,%r10
2035 xor 0+$ivec,$s0
2036 xor 4+$ivec,$s1
2037 xor 8+$ivec,$s2
2038 xor 12+$ivec,$s3
2039
2040 mov 0($inp),%r11 # load input
2041 mov 8($inp),%r12
2042 sub \$16,%r10
2043 jc .Lcbc_slow_dec_partial
2044 jz .Lcbc_slow_dec_done
2045
2046 mov %r11,0+$ivec # copy input to iv
2047 mov %r12,8+$ivec
2048
2049 mov $s0,0($out) # save output [can zap input]
2050 mov $s1,4($out)
2051 mov $s2,8($out)
2052 mov $s3,12($out)
2053
2054 lea 16($inp),$inp
2055 lea 16($out),$out
2056 jmp .Lcbc_slow_dec_loop
2057.Lcbc_slow_dec_done:
2058 mov $_ivp,%rdi
2059 mov %r11,0(%rdi) # copy iv back to user
2060 mov %r12,8(%rdi)
2061
2062 mov $s0,0($out) # save output [can zap input]
2063 mov $s1,4($out)
2064 mov $s2,8($out)
2065 mov $s3,12($out)
2066
2067 jmp .Lcbc_exit
2068
2069.align 4
2070.Lcbc_slow_dec_partial:
2071 mov $_ivp,%rdi
2072 mov %r11,0(%rdi) # copy iv back to user
2073 mov %r12,8(%rdi)
2074
2075 mov $s0,0+$ivec # save output to stack
2076 mov $s1,4+$ivec
2077 mov $s2,8+$ivec
2078 mov $s3,12+$ivec
2079
2080 mov $out,%rdi
2081 lea $ivec,%rsi
2082 lea 16(%r10),%rcx
2083 .long 0x9066A4F3 # rep movsb
2084 jmp .Lcbc_exit
2085
2086.align 16
2087.Lcbc_exit:
2088 mov $_rsp,%rsi
2089 mov (%rsi),%r15
2090 mov 8(%rsi),%r14
2091 mov 16(%rsi),%r13
2092 mov 24(%rsi),%r12
2093 mov 32(%rsi),%rbp
2094 mov 40(%rsi),%rbx
2095 lea 48(%rsi),%rsp
2096.Lcbc_popfq:
2097 popfq
2098.Lcbc_epilogue:
2099 ret
2100.size AES_cbc_encrypt,.-AES_cbc_encrypt
2101___
2102}
2103
2104$code.=<<___;
2105.align 64
2106.LAES_Te:
2107___
2108 &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
2109 &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
2110 &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
2111 &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
2112 &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
2113 &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
2114 &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
2115 &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
2116 &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
2117 &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
2118 &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
2119 &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
2120 &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
2121 &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
2122 &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
2123 &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
2124 &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
2125 &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
2126 &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
2127 &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
2128 &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
2129 &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
2130 &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
2131 &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
2132 &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
2133 &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
2134 &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
2135 &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
2136 &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
2137 &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
2138 &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
2139 &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
2140 &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
2141 &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
2142 &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
2143 &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
2144 &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
2145 &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
2146 &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
2147 &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
2148 &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
2149 &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
2150 &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
2151 &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
2152 &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
2153 &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
2154 &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
2155 &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
2156 &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
2157 &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
2158 &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
2159 &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
2160 &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
2161 &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
2162 &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
2163 &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
2164 &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
2165 &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
2166 &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
2167 &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
2168 &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
2169 &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
2170 &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
2171 &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
2172
2173#Te4 # four copies of Te4 to choose from to avoid L1 aliasing
2174 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2175 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2176 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2177 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2178 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2179 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2180 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2181 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2182 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2183 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2184 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2185 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2186 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2187 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2188 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2189 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2190 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2191 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2192 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2193 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2194 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2195 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2196 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2197 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2198 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2199 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2200 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2201 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2202 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2203 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2204 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2205 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2206
2207 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2208 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2209 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2210 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2211 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2212 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2213 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2214 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2215 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2216 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2217 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2218 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2219 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2220 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2221 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2222 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2223 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2224 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2225 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2226 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2227 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2228 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2229 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2230 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2231 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2232 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2233 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2234 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2235 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2236 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2237 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2238 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2239
2240 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2241 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2242 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2243 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2244 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2245 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2246 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2247 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2248 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2249 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2250 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2251 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2252 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2253 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2254 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2255 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2256 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2257 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2258 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2259 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2260 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2261 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2262 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2263 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2264 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2265 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2266 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2267 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2268 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2269 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2270 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2271 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2272
2273 &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
2274 &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
2275 &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
2276 &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
2277 &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
2278 &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
2279 &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
2280 &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
2281 &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
2282 &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
2283 &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
2284 &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
2285 &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
2286 &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
2287 &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
2288 &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
2289 &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
2290 &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
2291 &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
2292 &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
2293 &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
2294 &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
2295 &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
2296 &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
2297 &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
2298 &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
2299 &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
2300 &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
2301 &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
2302 &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
2303 &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
2304 &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
2305#rcon:
2306$code.=<<___;
2307 .long 0x00000001, 0x00000002, 0x00000004, 0x00000008
2308 .long 0x00000010, 0x00000020, 0x00000040, 0x00000080
2309 .long 0x0000001b, 0x00000036, 0x80808080, 0x80808080
2310 .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b
2311___
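# The trailing 0x80808080/0xfefefefe/0x1b1b1b1b pairs are not round
# constants but byte masks used by the compact code paths for a 4-way
# GF(2^8) doubling ("xtime") on packed bytes. A scalar sketch of that
# step (not used by the generator itself):
sub xtime4 {
	my ($x) = @_;				# four bytes packed in 32 bits
	my $hi = $x & 0x80808080;		# high bit of every byte
	((($x << 1) & 0xfefefefe) ^ (($hi >> 7) * 0x1b)) & 0xffffffff;
}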
2312$code.=<<___;
2313.align 64
2314.LAES_Td:
2315___
2316 &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
2317 &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
2318 &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
2319 &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
2320 &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
2321 &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
2322 &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
2323 &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
2324 &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
2325 &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
2326 &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
2327 &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
2328 &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
2329 &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
2330 &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
2331 &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
2332 &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
2333 &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
2334 &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
2335 &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
2336 &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
2337 &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
2338 &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
2339 &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
2340 &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
2341 &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
2342 &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
2343 &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
2344 &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
2345 &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
2346 &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
2347 &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
2348 &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
2349 &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
2350 &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
2351 &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
2352 &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
2353 &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
2354 &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
2355 &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
2356 &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
2357 &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
2358 &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
2359 &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
2360 &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
2361 &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
2362 &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
2363 &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
2364 &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
2365 &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
2366 &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
2367 &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
2368 &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
2369 &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
2370 &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
2371 &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
2372 &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
2373 &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
2374 &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
2375 &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
2376 &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
2377 &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
2378 &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
2379 &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
2380
2381#Td4: # four copies of Td4 to choose from to avoid L1 aliasing
2382 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2383 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2384 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2385 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2386 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2387 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2388 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2389 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2390 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2391 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2392 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2393 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2394 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2395 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2396 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2397 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2398 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2399 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2400 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2401 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2402 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2403 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2404 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2405 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2406 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2407 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2408 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2409 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2410 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2411 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2412 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2413 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2414$code.=<<___;
2415 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2416 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2417___
2418 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2419 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2420 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2421 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2422 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2423 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2424 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2425 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2426 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2427 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2428 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2429 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2430 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2431 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2432 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2433 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2434 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2435 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2436 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2437 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2438 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2439 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2440 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2441 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2442 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2443 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2444 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2445 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2446 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2447 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2448 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2449 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2450$code.=<<___;
2451 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2452 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2453___
2454 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2455 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2456 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2457 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2458 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2459 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2460 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2461 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2462 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2463 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2464 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2465 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2466 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2467 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2468 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2469 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2470 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2471 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2472 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2473 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2474 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2475 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2476 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2477 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2478 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2479 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2480 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2481 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2482 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2483 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2484 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2485 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2486$code.=<<___;
2487 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2488 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2489___
2490 &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
2491 &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
2492 &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
2493 &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
2494 &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
2495 &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
2496 &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
2497 &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
2498 &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
2499 &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
2500 &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
2501 &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
2502 &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
2503 &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
2504 &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
2505 &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
2506 &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
2507 &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
2508 &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
2509 &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
2510 &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
2511 &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
2512 &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
2513 &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
2514 &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
2515 &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
2516 &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
2517 &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
2518 &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
2519 &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
2520 &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
2521 &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
2522$code.=<<___;
2523 .long 0x80808080, 0x80808080, 0xfefefefe, 0xfefefefe
2524 .long 0x1b1b1b1b, 0x1b1b1b1b, 0, 0
2525.asciz "AES for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
2526.align 64
2527___
2528
2529# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2530# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2531if ($win64) {
2532$rec="%rcx";
2533$frame="%rdx";
2534$context="%r8";
2535$disp="%r9";
2536
2537$code.=<<___;
2538.extern __imp_RtlVirtualUnwind
2539.type block_se_handler,\@abi-omnipotent
2540.align 16
2541block_se_handler:
2542 push %rsi
2543 push %rdi
2544 push %rbx
2545 push %rbp
2546 push %r12
2547 push %r13
2548 push %r14
2549 push %r15
2550 pushfq
2551 sub \$64,%rsp
2552
2553 mov 120($context),%rax # pull context->Rax
2554 mov 248($context),%rbx # pull context->Rip
2555
2556 mov 8($disp),%rsi # disp->ImageBase
2557 mov 56($disp),%r11 # disp->HandlerData
2558
2559 mov 0(%r11),%r10d # HandlerData[0]
2560 lea (%rsi,%r10),%r10 # prologue label
2561 cmp %r10,%rbx # context->Rip<prologue label
2562 jb .Lin_block_prologue
2563
2564 mov 152($context),%rax # pull context->Rsp
2565
2566 mov 4(%r11),%r10d # HandlerData[1]
2567 lea (%rsi,%r10),%r10 # epilogue label
2568 cmp %r10,%rbx # context->Rip>=epilogue label
2569 jae .Lin_block_prologue
2570
2571 mov 24(%rax),%rax # pull saved real stack pointer
2572 lea 48(%rax),%rax # adjust...
2573
2574 mov -8(%rax),%rbx
2575 mov -16(%rax),%rbp
2576 mov -24(%rax),%r12
2577 mov -32(%rax),%r13
2578 mov -40(%rax),%r14
2579 mov -48(%rax),%r15
2580 mov %rbx,144($context) # restore context->Rbx
2581 mov %rbp,160($context) # restore context->Rbp
2582 mov %r12,216($context) # restore context->R12
2583 mov %r13,224($context) # restore context->R13
2584 mov %r14,232($context) # restore context->R14
2585 mov %r15,240($context) # restore context->R15
2586
2587.Lin_block_prologue:
2588 mov 8(%rax),%rdi
2589 mov 16(%rax),%rsi
2590 mov %rax,152($context) # restore context->Rsp
2591 mov %rsi,168($context) # restore context->Rsi
2592 mov %rdi,176($context) # restore context->Rdi
2593
2594 jmp .Lcommon_seh_exit
2595.size block_se_handler,.-block_se_handler
2596
2597.type key_se_handler,\@abi-omnipotent
2598.align 16
2599key_se_handler:
2600 push %rsi
2601 push %rdi
2602 push %rbx
2603 push %rbp
2604 push %r12
2605 push %r13
2606 push %r14
2607 push %r15
2608 pushfq
2609 sub \$64,%rsp
2610
2611 mov 120($context),%rax # pull context->Rax
2612 mov 248($context),%rbx # pull context->Rip
2613
2614 mov 8($disp),%rsi # disp->ImageBase
2615 mov 56($disp),%r11 # disp->HandlerData
2616
2617 mov 0(%r11),%r10d # HandlerData[0]
2618 lea (%rsi,%r10),%r10 # prologue label
2619 cmp %r10,%rbx # context->Rip<prologue label
2620 jb .Lin_key_prologue
2621
2622 mov 152($context),%rax # pull context->Rsp
2623
2624 mov 4(%r11),%r10d # HandlerData[1]
2625 lea (%rsi,%r10),%r10 # epilogue label
2626 cmp %r10,%rbx # context->Rip>=epilogue label
2627 jae .Lin_key_prologue
2628
2629 lea 56(%rax),%rax
2630
2631 mov -8(%rax),%rbx
2632 mov -16(%rax),%rbp
2633 mov -24(%rax),%r12
2634 mov -32(%rax),%r13
2635 mov -40(%rax),%r14
2636 mov -48(%rax),%r15
2637 mov %rbx,144($context) # restore context->Rbx
2638 mov %rbp,160($context) # restore context->Rbp
2639 mov %r12,216($context) # restore context->R12
2640 mov %r13,224($context) # restore context->R13
2641 mov %r14,232($context) # restore context->R14
2642 mov %r15,240($context) # restore context->R15
2643
2644.Lin_key_prologue:
2645 mov 8(%rax),%rdi
2646 mov 16(%rax),%rsi
2647 mov %rax,152($context) # restore context->Rsp
2648 mov %rsi,168($context) # restore context->Rsi
2649 mov %rdi,176($context) # restore context->Rdi
2650
2651 jmp .Lcommon_seh_exit
2652.size key_se_handler,.-key_se_handler
2653
2654.type cbc_se_handler,\@abi-omnipotent
2655.align 16
2656cbc_se_handler:
2657 push %rsi
2658 push %rdi
2659 push %rbx
2660 push %rbp
2661 push %r12
2662 push %r13
2663 push %r14
2664 push %r15
2665 pushfq
2666 sub \$64,%rsp
2667
2668 mov 120($context),%rax # pull context->Rax
2669 mov 248($context),%rbx # pull context->Rip
2670
2671 lea .Lcbc_prologue(%rip),%r10
2672 cmp %r10,%rbx # context->Rip<.Lcbc_prologue
2673 jb .Lin_cbc_prologue
2674
2675 lea .Lcbc_fast_body(%rip),%r10
2676 cmp %r10,%rbx # context->Rip<.Lcbc_fast_body
2677 jb .Lin_cbc_frame_setup
2678
2679 lea .Lcbc_slow_prologue(%rip),%r10
2680 cmp %r10,%rbx # context->Rip<.Lcbc_slow_prologue
2681 jb .Lin_cbc_body
2682
2683 lea .Lcbc_slow_body(%rip),%r10
2684 cmp %r10,%rbx # context->Rip<.Lcbc_slow_body
2685 jb .Lin_cbc_frame_setup
2686
2687.Lin_cbc_body:
2688 mov 152($context),%rax # pull context->Rsp
2689
2690 lea .Lcbc_epilogue(%rip),%r10
2691 cmp %r10,%rbx # context->Rip>=.Lcbc_epilogue
2692 jae .Lin_cbc_prologue
2693
2694 lea 8(%rax),%rax
2695
2696 lea .Lcbc_popfq(%rip),%r10
2697 cmp %r10,%rbx # context->Rip>=.Lcbc_popfq
2698 jae .Lin_cbc_prologue
2699
2700 mov `16-8`(%rax),%rax # biased $_rsp
2701 lea 56(%rax),%rax
2702
2703.Lin_cbc_frame_setup:
2704 mov -16(%rax),%rbx
2705 mov -24(%rax),%rbp
2706 mov -32(%rax),%r12
2707 mov -40(%rax),%r13
2708 mov -48(%rax),%r14
2709 mov -56(%rax),%r15
2710 mov %rbx,144($context) # restore context->Rbx
2711 mov %rbp,160($context) # restore context->Rbp
2712 mov %r12,216($context) # restore context->R12
2713 mov %r13,224($context) # restore context->R13
2714 mov %r14,232($context) # restore context->R14
2715 mov %r15,240($context) # restore context->R15
2716
2717.Lin_cbc_prologue:
2718 mov 8(%rax),%rdi
2719 mov 16(%rax),%rsi
2720 mov %rax,152($context) # restore context->Rsp
2721 mov %rsi,168($context) # restore context->Rsi
2722 mov %rdi,176($context) # restore context->Rdi
2723
2724.Lcommon_seh_exit:
2725
2726 mov 40($disp),%rdi # disp->ContextRecord
2727 mov $context,%rsi # context
2728 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2729 .long 0xa548f3fc # cld; rep movsq
2730
2731 mov $disp,%rsi
2732 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2733 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2734 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2735 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2736 mov 40(%rsi),%r10 # disp->ContextRecord
2737 lea 56(%rsi),%r11 # &disp->HandlerData
2738 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2739 mov %r10,32(%rsp) # arg5
2740 mov %r11,40(%rsp) # arg6
2741 mov %r12,48(%rsp) # arg7
2742 mov %rcx,56(%rsp) # arg8, (NULL)
2743 call *__imp_RtlVirtualUnwind(%rip)
2744
2745 mov \$1,%eax # ExceptionContinueSearch
2746 add \$64,%rsp
2747 popfq
2748 pop %r15
2749 pop %r14
2750 pop %r13
2751 pop %r12
2752 pop %rbp
2753 pop %rbx
2754 pop %rdi
2755 pop %rsi
2756 ret
2757.size cbc_se_handler,.-cbc_se_handler
2758
2759.section .pdata
2760.align 4
2761 .rva .LSEH_begin_AES_encrypt
2762 .rva .LSEH_end_AES_encrypt
2763 .rva .LSEH_info_AES_encrypt
2764
2765 .rva .LSEH_begin_AES_decrypt
2766 .rva .LSEH_end_AES_decrypt
2767 .rva .LSEH_info_AES_decrypt
2768
2769 .rva .LSEH_begin_AES_set_encrypt_key
2770 .rva .LSEH_end_AES_set_encrypt_key
2771 .rva .LSEH_info_AES_set_encrypt_key
2772
2773 .rva .LSEH_begin_AES_set_decrypt_key
2774 .rva .LSEH_end_AES_set_decrypt_key
2775 .rva .LSEH_info_AES_set_decrypt_key
2776
2777 .rva .LSEH_begin_AES_cbc_encrypt
2778 .rva .LSEH_end_AES_cbc_encrypt
2779 .rva .LSEH_info_AES_cbc_encrypt
2780
2781.section .xdata
2782.align 8
2783.LSEH_info_AES_encrypt:
2784 .byte 9,0,0,0
2785 .rva block_se_handler
2786 .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
2787.LSEH_info_AES_decrypt:
2788 .byte 9,0,0,0
2789 .rva block_se_handler
2790 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
2791.LSEH_info_AES_set_encrypt_key:
2792 .byte 9,0,0,0
2793 .rva key_se_handler
2794 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
2795.LSEH_info_AES_set_decrypt_key:
2796 .byte 9,0,0,0
2797 .rva key_se_handler
2798 .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
2799.LSEH_info_AES_cbc_encrypt:
2800 .byte 9,0,0,0
2801 .rva cbc_se_handler
2802___
2803}
2804
2805$code =~ s/\`([^\`]*)\`/eval($1)/gem;
2806
2807print $code;
2808
2809close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl
new file mode 100644
index 0000000000..c6f6b3334a
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl
@@ -0,0 +1,1249 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# June 2011
11#
12# This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
13# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
14# that since AESNI-CBC encrypt exhibits *very* low instruction-level
15# parallelism, interleaving it with another algorithm allows better
16# utilization of processor resources and thus better performance.
17# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and the
18# AESNI code is woven into it. Below are performance numbers, in
19# cycles per processed byte (lower is better), for standalone AESNI-CBC
20# encrypt, for the sum of the latter and standalone SHA1, and for the
21# "stitched" subroutine:
22#
23# AES-128-CBC +SHA1 stitch gain
24# Westmere 3.77[+5.6] 9.37 6.65 +41%
25# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
26#
27# AES-192-CBC
28# Westmere 4.51 10.11 6.97 +45%
29# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
30#
31# AES-256-CBC
32# Westmere 5.25 10.85 7.25 +50%
33# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
34#
35# (*)	There are two code paths: SSSE3 and AVX. See sha1-586.pl for
36#	background information. The numbers in parentheses above are SSSE3
37#	results collected on an AVX-capable CPU, i.e. they apply to OSes
38#	that don't support AVX.
39#
40# Needless to say, it makes no sense to implement a "stitched"
41# *decrypt* subroutine: *both* AESNI-CBC decrypt and SHA1 already
42# fully utilize available parallelism, so stitching would not give
43# any gain. Well, there might be some, e.g. because of better cache
44# locality... For reference, here are performance results for
45# standalone AESNI-CBC decrypt:
46#
47# AES-128-CBC AES-192-CBC AES-256-CBC
48# Westmere 1.31 1.55 1.80
49# Sandy Bridge 0.93 1.06 1.22
50
51$flavour = shift;
52$output = shift;
53if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
54
55$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
56
57$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
58( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
59( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
60die "can't locate x86_64-xlate.pl";
61
62$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
63 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
64 $1>=2.19);
65$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
66 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
67 $1>=2.09);
68$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
69 `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
70 $1>=10);
71
72open STDOUT,"| $^X $xlate $flavour $output";
73
74# void aesni_cbc_sha1_enc(const void *inp,
75# void *out,
76# size_t length,
77# const AES_KEY *key,
78# unsigned char *iv,
79# SHA_CTX *ctx,
80# const void *in0);
81
82$code.=<<___;
83.text
84.extern OPENSSL_ia32cap_P
85
86.globl aesni_cbc_sha1_enc
87.type aesni_cbc_sha1_enc,\@abi-omnipotent
88.align 16
89aesni_cbc_sha1_enc:
90 # caller should check for SSSE3 and AES-NI bits
91 mov OPENSSL_ia32cap_P+0(%rip),%r10d
92 mov OPENSSL_ia32cap_P+4(%rip),%r11d
93___
94$code.=<<___ if ($avx);
95 and \$`1<<28`,%r11d # mask AVX bit
96 and \$`1<<30`,%r10d # mask "Intel CPU" bit
97 or %r11d,%r10d
98 cmp \$`1<<28|1<<30`,%r10d
99 je aesni_cbc_sha1_enc_avx
100___
101$code.=<<___;
102 jmp aesni_cbc_sha1_enc_ssse3
103 ret
104.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
105___
106
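# The dispatch above, restated (sketch; $cap_w0/$cap_w1 stand for the
# two OPENSSL_ia32cap_P words read into %r10d/%r11d):
#
#	my $use_avx = ($cap_w1 & 1<<28) && ($cap_w0 & 1<<30);
#
# i.e. the AVX path requires both the AVX bit and the "Intel CPU" bit.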
107my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
108
109my $Xi=4;
110my @X=map("%xmm$_",(4..7,0..3));
111my @Tx=map("%xmm$_",(8..10));
112my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
113my @T=("%esi","%edi");
114my $j=0; my $jj=0; my $r=0; my $sn=0;
115my $K_XX_XX="%r11";
116my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
117my @rndkey=("%xmm14","%xmm15");
118
119sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
120{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
121 my $arg = pop;
122 $arg = "\$$arg" if ($arg*1 eq $arg);
123 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
124}
125
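# AUTOLOAD turns any unknown &opcode(dst,src,...) call below into a line
# of AT&T-syntax text: the argument order is reversed and bare numbers
# get a '$' immediate prefix. Usage sketch:
#
#	&pxor ("%xmm0","%xmm1");   # appends "\tpxor\t%xmm1,%xmm0\n" to $code
#	&psrld("%xmm2",31);        # appends "\tpsrld\t\$31,%xmm2\n" to $code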
126my $_rol=sub { &rol(@_) };
127my $_ror=sub { &ror(@_) };
128
129$code.=<<___;
130.type aesni_cbc_sha1_enc_ssse3,\@function,6
131.align 16
132aesni_cbc_sha1_enc_ssse3:
133 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
134 #shr \$6,$len # debugging artefact
135 #jz .Lepilogue_ssse3 # debugging artefact
136 push %rbx
137 push %rbp
138 push %r12
139 push %r13
140 push %r14
141 push %r15
142 lea `-104-($win64?10*16:0)`(%rsp),%rsp
143 #mov $in0,$inp # debugging artefact
144 #lea 64(%rsp),$ctx # debugging artefact
145___
146$code.=<<___ if ($win64);
147 movaps %xmm6,96+0(%rsp)
148 movaps %xmm7,96+16(%rsp)
149 movaps %xmm8,96+32(%rsp)
150 movaps %xmm9,96+48(%rsp)
151 movaps %xmm10,96+64(%rsp)
152 movaps %xmm11,96+80(%rsp)
153 movaps %xmm12,96+96(%rsp)
154 movaps %xmm13,96+112(%rsp)
155 movaps %xmm14,96+128(%rsp)
156 movaps %xmm15,96+144(%rsp)
157.Lprologue_ssse3:
158___
159$code.=<<___;
160 mov $in0,%r12 # reassign arguments
161 mov $out,%r13
162 mov $len,%r14
163 mov $key,%r15
164 movdqu ($ivp),$iv # load IV
165 mov $ivp,88(%rsp) # save $ivp
166___
167my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
168my $rounds="${ivp}d";
169$code.=<<___;
170 shl \$6,$len
171 sub $in0,$out
172 mov 240($key),$rounds
173 add $inp,$len # end of input
174
175 lea K_XX_XX(%rip),$K_XX_XX
176 mov 0($ctx),$A # load context
177 mov 4($ctx),$B
178 mov 8($ctx),$C
179 mov 12($ctx),$D
180 mov $B,@T[0] # magic seed
181 mov 16($ctx),$E
182
183 movdqa 64($K_XX_XX),@X[2] # pbswap mask
184 movdqa 0($K_XX_XX),@Tx[1] # K_00_19
185 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
186 movdqu 16($inp),@X[-3&7]
187 movdqu 32($inp),@X[-2&7]
188 movdqu 48($inp),@X[-1&7]
189 pshufb @X[2],@X[-4&7] # byte swap
190 add \$64,$inp
191 pshufb @X[2],@X[-3&7]
192 pshufb @X[2],@X[-2&7]
193 pshufb @X[2],@X[-1&7]
194 paddd @Tx[1],@X[-4&7] # add K_00_19
195 paddd @Tx[1],@X[-3&7]
196 paddd @Tx[1],@X[-2&7]
197 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
198 psubd @Tx[1],@X[-4&7] # restore X[]
199 movdqa @X[-3&7],16(%rsp)
200 psubd @Tx[1],@X[-3&7]
201 movdqa @X[-2&7],32(%rsp)
202 psubd @Tx[1],@X[-2&7]
203 movups ($key),$rndkey0 # $key[0]
204 movups 16($key),$rndkey[0] # forward reference
205 jmp .Loop_ssse3
206___
207
208my $aesenc=sub {
209 use integer;
210 my ($n,$k)=($r/10,$r%10);
211 if ($k==0) {
212 $code.=<<___;
213 movups `16*$n`($in0),$in # load input
214 xorps $rndkey0,$in
215___
216 $code.=<<___ if ($n);
217 movups $iv,`16*($n-1)`($out,$in0) # write output
218___
219 $code.=<<___;
220 xorps $in,$iv
221 aesenc $rndkey[0],$iv
222 movups `32+16*$k`($key),$rndkey[1]
223___
224 } elsif ($k==9) {
225 $sn++;
226 $code.=<<___;
227 cmp \$11,$rounds
228 jb .Laesenclast$sn
229 movups `32+16*($k+0)`($key),$rndkey[1]
230 aesenc $rndkey[0],$iv
231 movups `32+16*($k+1)`($key),$rndkey[0]
232 aesenc $rndkey[1],$iv
233 je .Laesenclast$sn
234 movups `32+16*($k+2)`($key),$rndkey[1]
235 aesenc $rndkey[0],$iv
236 movups `32+16*($k+3)`($key),$rndkey[0]
237 aesenc $rndkey[1],$iv
238.Laesenclast$sn:
239 aesenclast $rndkey[0],$iv
240 movups 16($key),$rndkey[1] # forward reference
241___
242 } else {
243 $code.=<<___;
244 aesenc $rndkey[0],$iv
245 movups `32+16*$k`($key),$rndkey[1]
246___
247 }
248 $r++; unshift(@rndkey,pop(@rndkey));
249};
250
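# Tail dispatch in the $k==9 arm, restated: the flags set by
# "cmp \$11,$rounds" survive the intervening movups/aesenc, so
#
#	rounds == 10 (AES-128): "jb" taken, aesenclast right away
#	rounds == 14 (AES-256): falls through all four extra aesenc
#
# The stitched cipher is normally only wired up for 128- and 256-bit
# keys, so the middle "je" arm is not taken with standard round counts.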
251sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
252{ use integer;
253 my $body = shift;
254 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
255 my ($a,$b,$c,$d,$e);
256
257 &movdqa (@X[0],@X[-3&7]);
258 eval(shift(@insns));
259 eval(shift(@insns));
260 &movdqa (@Tx[0],@X[-1&7]);
261 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
262 eval(shift(@insns));
263 eval(shift(@insns));
264
265 &paddd (@Tx[1],@X[-1&7]);
266 eval(shift(@insns));
267 eval(shift(@insns));
268 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
269 eval(shift(@insns));
270 eval(shift(@insns));
271 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
272 eval(shift(@insns));
273 eval(shift(@insns));
274
275 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
276 eval(shift(@insns));
277 eval(shift(@insns));
278 eval(shift(@insns));
279 eval(shift(@insns));
280
281 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
282 eval(shift(@insns));
283 eval(shift(@insns));
284 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
285 eval(shift(@insns));
286 eval(shift(@insns));
287
288 &movdqa (@Tx[2],@X[0]);
289 &movdqa (@Tx[0],@X[0]);
290 eval(shift(@insns));
291 eval(shift(@insns));
292 eval(shift(@insns));
293 eval(shift(@insns));
294
295 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
296 &paddd (@X[0],@X[0]);
297 eval(shift(@insns));
298 eval(shift(@insns));
299 eval(shift(@insns));
300 eval(shift(@insns));
301
302 &psrld (@Tx[0],31);
303 eval(shift(@insns));
304 eval(shift(@insns));
305 &movdqa (@Tx[1],@Tx[2]);
306 eval(shift(@insns));
307 eval(shift(@insns));
308
309 &psrld (@Tx[2],30);
310 &por (@X[0],@Tx[0]); # "X[0]"<<<=1
311 eval(shift(@insns));
312 eval(shift(@insns));
313 eval(shift(@insns));
314 eval(shift(@insns));
315
316 &pslld (@Tx[1],2);
317 &pxor (@X[0],@Tx[2]);
318 eval(shift(@insns));
319 eval(shift(@insns));
320 &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
321 eval(shift(@insns));
322 eval(shift(@insns));
323
324 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
325
326 foreach (@insns) { eval; } # remaining instructions [if any]
327
328 $Xi++; push(@X,shift(@X)); # "rotate" X[]
329 push(@Tx,shift(@Tx));
330}
331
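# What the vector code above computes, in scalar terms (sketch):
#
#	X[i] = rol32(X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16], 1),  i = 16..79
#
# four values per call; X[i-3] for the last lane is produced within the
# same vector, which is what the "<<96"/"<<<2" fixup steps handle.
sub rol32 {			# sketch helper, not used by the generator
	my ($x, $n) = @_;
	(($x << $n) | ($x >> (32 - $n))) & 0xffffffff;
}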
332sub Xupdate_ssse3_32_79()
333{ use integer;
334 my $body = shift;
335 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
336 my ($a,$b,$c,$d,$e);
337
338 &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
339 eval(shift(@insns)); # body_20_39
340 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
341 &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
342 eval(shift(@insns));
343 eval(shift(@insns));
344 eval(shift(@insns)); # rol
345
346 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
347 eval(shift(@insns));
348 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
349 if ($Xi%5) {
350 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
351 } else { # ... or load next one
352 &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
353 }
354 &paddd (@Tx[1],@X[-1&7]);
355 eval(shift(@insns)); # ror
356 eval(shift(@insns));
357
358 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
359 eval(shift(@insns)); # body_20_39
360 eval(shift(@insns));
361 eval(shift(@insns));
362 eval(shift(@insns)); # rol
363
364 &movdqa (@Tx[0],@X[0]);
365 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
366 eval(shift(@insns));
367 eval(shift(@insns));
368 eval(shift(@insns)); # ror
369 eval(shift(@insns));
370
371 &pslld (@X[0],2);
372 eval(shift(@insns)); # body_20_39
373 eval(shift(@insns));
374 &psrld (@Tx[0],30);
375 eval(shift(@insns));
376 eval(shift(@insns)); # rol
377 eval(shift(@insns));
378 eval(shift(@insns));
379 eval(shift(@insns)); # ror
380 eval(shift(@insns));
381
382 &por (@X[0],@Tx[0]); # "X[0]"<<<=2
383 eval(shift(@insns)); # body_20_39
384 eval(shift(@insns));
385 &movdqa (@Tx[1],@X[0]) if ($Xi<19);
386 eval(shift(@insns));
387 eval(shift(@insns)); # rol
388 eval(shift(@insns));
389 eval(shift(@insns));
390 eval(shift(@insns)); # rol
391 eval(shift(@insns));
392
393 foreach (@insns) { eval; } # remaining instructions
394
395 $Xi++; push(@X,shift(@X)); # "rotate" X[]
396 push(@Tx,shift(@Tx));
397}
398
399sub Xuplast_ssse3_80()
400{ use integer;
401 my $body = shift;
402 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
403 my ($a,$b,$c,$d,$e);
404
405 eval(shift(@insns));
406 &paddd (@Tx[1],@X[-1&7]);
407 eval(shift(@insns));
408 eval(shift(@insns));
409 eval(shift(@insns));
410 eval(shift(@insns));
411
412 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
413
414 foreach (@insns) { eval; } # remaining instructions
415
416 &cmp ($inp,$len);
417 &je (".Ldone_ssse3");
418
419 unshift(@Tx,pop(@Tx));
420
421 &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
422 &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
423 &movdqu (@X[-4&7],"0($inp)"); # load input
424 &movdqu (@X[-3&7],"16($inp)");
425 &movdqu (@X[-2&7],"32($inp)");
426 &movdqu (@X[-1&7],"48($inp)");
427 &pshufb (@X[-4&7],@X[2]); # byte swap
428 &add ($inp,64);
429
430 $Xi=0;
431}
432
433sub Xloop_ssse3()
434{ use integer;
435 my $body = shift;
436 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
437 my ($a,$b,$c,$d,$e);
438
439 eval(shift(@insns));
440 eval(shift(@insns));
441 &pshufb (@X[($Xi-3)&7],@X[2]);
442 eval(shift(@insns));
443 eval(shift(@insns));
444 &paddd (@X[($Xi-4)&7],@Tx[1]);
445 eval(shift(@insns));
446 eval(shift(@insns));
447 eval(shift(@insns));
448 eval(shift(@insns));
449 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
450 eval(shift(@insns));
451 eval(shift(@insns));
452 &psubd (@X[($Xi-4)&7],@Tx[1]);
453
454 foreach (@insns) { eval; }
455 $Xi++;
456}
457
458sub Xtail_ssse3()
459{ use integer;
460 my $body = shift;
461 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
462 my ($a,$b,$c,$d,$e);
463
464 foreach (@insns) { eval; }
465}
466
467sub body_00_19 () {
468 use integer;
469 my ($k,$n);
470 my @r=(
471 '($a,$b,$c,$d,$e)=@V;'.
472 '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
473 '&xor ($c,$d);',
474 '&mov (@T[1],$a);', # $b in next round
475 '&$_rol ($a,5);',
476 '&and (@T[0],$c);', # ($b&($c^$d))
477 '&xor ($c,$d);', # restore $c
478 '&xor (@T[0],$d);',
479 '&add ($e,$a);',
480 '&$_ror ($b,$j?7:2);', # $b>>>2
481 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
482 );
483 $n = scalar(@r);
484 $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
485 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
486 $jj++;
487 return @r;
488}
489
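# The xor/and/xor pattern in the round body computes the SHA-1 "choose"
# function for rounds 0..19 without a separate NOT; identity check
# (sketch, not used by the generator):
sub ch_direct { my ($b,$c,$d) = @_; (($b & $c) | (~$b & $d)) & 0xffffffff }
sub ch_trick  { my ($b,$c,$d) = @_; ((($c ^ $d) & $b) ^ $d) & 0xffffffff }
# ch_trick() == ch_direct() for all 32-bit inputs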
490sub body_20_39 () {
491 use integer;
492 my ($k,$n);
493 my @r=(
494 '($a,$b,$c,$d,$e)=@V;'.
495 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
496 '&xor (@T[0],$d);', # ($b^$d)
497 '&mov (@T[1],$a);', # $b in next round
498 '&$_rol ($a,5);',
499 '&xor (@T[0],$c);', # ($b^$d^$c)
500 '&add ($e,$a);',
501 '&$_ror ($b,7);', # $b>>>2
502 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
503 );
504 $n = scalar(@r);
505 $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
506 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
507 $jj++;
508 return @r;
509}
510
511sub body_40_59 () {
512 use integer;
513 my ($k,$n);
514 my @r=(
515 '($a,$b,$c,$d,$e)=@V;'.
516 '&mov (@T[1],$c);',
517 '&xor ($c,$d);',
518 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
519 '&and (@T[1],$d);',
520 '&and (@T[0],$c);', # ($b&($c^$d))
521 '&$_ror ($b,7);', # $b>>>2
522 '&add ($e,@T[1]);',
523 '&mov (@T[1],$a);', # $b in next round
524 '&$_rol ($a,5);',
525 '&add ($e,@T[0]);',
526 '&xor ($c,$d);', # restore $c
527 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
528 );
529 $n = scalar(@r);
530 $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
531 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
532 $jj++;
533 return @r;
534}
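# Rounds 40..59 use Maj(b,c,d); the two separate "add"s above work
# because (b & (c ^ d)) and (c & d) never set the same bit, so their sum
# equals their xor, which equals Maj. Identity check (sketch, not used
# by the generator):
sub maj_direct { my ($b,$c,$d) = @_; ($b & $c) ^ ($b & $d) ^ ($c & $d) }
sub maj_split  { my ($b,$c,$d) = @_; (($c ^ $d) & $b) + ($c & $d) }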
535$code.=<<___;
536.align 16
537.Loop_ssse3:
538___
539 &Xupdate_ssse3_16_31(\&body_00_19);
540 &Xupdate_ssse3_16_31(\&body_00_19);
541 &Xupdate_ssse3_16_31(\&body_00_19);
542 &Xupdate_ssse3_16_31(\&body_00_19);
543 &Xupdate_ssse3_32_79(\&body_00_19);
544 &Xupdate_ssse3_32_79(\&body_20_39);
545 &Xupdate_ssse3_32_79(\&body_20_39);
546 &Xupdate_ssse3_32_79(\&body_20_39);
547 &Xupdate_ssse3_32_79(\&body_20_39);
548 &Xupdate_ssse3_32_79(\&body_20_39);
549 &Xupdate_ssse3_32_79(\&body_40_59);
550 &Xupdate_ssse3_32_79(\&body_40_59);
551 &Xupdate_ssse3_32_79(\&body_40_59);
552 &Xupdate_ssse3_32_79(\&body_40_59);
553 &Xupdate_ssse3_32_79(\&body_40_59);
554 &Xupdate_ssse3_32_79(\&body_20_39);
555 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
556
557 $saved_j=$j; @saved_V=@V;
558 $saved_r=$r; @saved_rndkey=@rndkey;
559
560 &Xloop_ssse3(\&body_20_39);
561 &Xloop_ssse3(\&body_20_39);
562 &Xloop_ssse3(\&body_20_39);
563
564$code.=<<___;
565 movups $iv,48($out,$in0) # write output
566 lea 64($in0),$in0
567
568 add 0($ctx),$A # update context
569 add 4($ctx),@T[0]
570 add 8($ctx),$C
571 add 12($ctx),$D
572 mov $A,0($ctx)
573 add 16($ctx),$E
574 mov @T[0],4($ctx)
575 mov @T[0],$B # magic seed
576 mov $C,8($ctx)
577 mov $D,12($ctx)
578 mov $E,16($ctx)
579 jmp .Loop_ssse3
580
581.align 16
582.Ldone_ssse3:
583___
584 $jj=$j=$saved_j; @V=@saved_V;
585 $r=$saved_r; @rndkey=@saved_rndkey;
586
587 &Xtail_ssse3(\&body_20_39);
588 &Xtail_ssse3(\&body_20_39);
589 &Xtail_ssse3(\&body_20_39);
590
591$code.=<<___;
592 movups $iv,48($out,$in0) # write output
593 mov 88(%rsp),$ivp # restore $ivp
594
595 add 0($ctx),$A # update context
596 add 4($ctx),@T[0]
597 add 8($ctx),$C
598 mov $A,0($ctx)
599 add 12($ctx),$D
600 mov @T[0],4($ctx)
601 add 16($ctx),$E
602 mov $C,8($ctx)
603 mov $D,12($ctx)
604 mov $E,16($ctx)
605 movups $iv,($ivp) # write IV
606___
607$code.=<<___ if ($win64);
608 movaps 96+0(%rsp),%xmm6
609 movaps 96+16(%rsp),%xmm7
610 movaps 96+32(%rsp),%xmm8
611 movaps 96+48(%rsp),%xmm9
612 movaps 96+64(%rsp),%xmm10
613 movaps 96+80(%rsp),%xmm11
614 movaps 96+96(%rsp),%xmm12
615 movaps 96+112(%rsp),%xmm13
616 movaps 96+128(%rsp),%xmm14
617 movaps 96+144(%rsp),%xmm15
618___
619$code.=<<___;
620 lea `104+($win64?10*16:0)`(%rsp),%rsi
621 mov 0(%rsi),%r15
622 mov 8(%rsi),%r14
623 mov 16(%rsi),%r13
624 mov 24(%rsi),%r12
625 mov 32(%rsi),%rbp
626 mov 40(%rsi),%rbx
627 lea 48(%rsi),%rsp
628.Lepilogue_ssse3:
629 ret
630.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
631___
632
633$j=$jj=$r=$sn=0;
634
635if ($avx) {
636my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
637
638my $Xi=4;
639my @X=map("%xmm$_",(4..7,0..3));
640my @Tx=map("%xmm$_",(8..10));
641my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
642my @T=("%esi","%edi");
643
644my $_rol=sub { &shld(@_[0],@_) };
645my $_ror=sub { &shrd(@_[0],@_) };
646
647$code.=<<___;
648.type aesni_cbc_sha1_enc_avx,\@function,6
649.align 16
650aesni_cbc_sha1_enc_avx:
651 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
652 #shr \$6,$len # debugging artefact
653 #jz .Lepilogue_avx # debugging artefact
654 push %rbx
655 push %rbp
656 push %r12
657 push %r13
658 push %r14
659 push %r15
660 lea `-104-($win64?10*16:0)`(%rsp),%rsp
661 #mov $in0,$inp # debugging artefact
662 #lea 64(%rsp),$ctx # debugging artefact
663___
664$code.=<<___ if ($win64);
665 movaps %xmm6,96+0(%rsp)
666 movaps %xmm7,96+16(%rsp)
667 movaps %xmm8,96+32(%rsp)
668 movaps %xmm9,96+48(%rsp)
669 movaps %xmm10,96+64(%rsp)
670 movaps %xmm11,96+80(%rsp)
671 movaps %xmm12,96+96(%rsp)
672 movaps %xmm13,96+112(%rsp)
673 movaps %xmm14,96+128(%rsp)
674 movaps %xmm15,96+144(%rsp)
675.Lprologue_avx:
676___
677$code.=<<___;
678 vzeroall
679 mov $in0,%r12 # reassign arguments
680 mov $out,%r13
681 mov $len,%r14
682 mov $key,%r15
683 vmovdqu ($ivp),$iv # load IV
684 mov $ivp,88(%rsp) # save $ivp
685___
686my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
687my $rounds="${ivp}d";
688$code.=<<___;
689 shl \$6,$len
690 sub $in0,$out
691 mov 240($key),$rounds
692	add	\$112,$key		# size optimization: round-key offsets fit in disp8
693 add $inp,$len # end of input
694
695 lea K_XX_XX(%rip),$K_XX_XX
696 mov 0($ctx),$A # load context
697 mov 4($ctx),$B
698 mov 8($ctx),$C
699 mov 12($ctx),$D
700 mov $B,@T[0] # magic seed
701 mov 16($ctx),$E
702
703 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
704 vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
705 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
706 vmovdqu 16($inp),@X[-3&7]
707 vmovdqu 32($inp),@X[-2&7]
708 vmovdqu 48($inp),@X[-1&7]
709 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
710 add \$64,$inp
711 vpshufb @X[2],@X[-3&7],@X[-3&7]
712 vpshufb @X[2],@X[-2&7],@X[-2&7]
713 vpshufb @X[2],@X[-1&7],@X[-1&7]
714 vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
715 vpaddd @Tx[1],@X[-3&7],@X[1]
716 vpaddd @Tx[1],@X[-2&7],@X[2]
717 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
718 vmovdqa @X[1],16(%rsp)
719 vmovdqa @X[2],32(%rsp)
720 vmovups -112($key),$rndkey0 # $key[0]
721 vmovups 16-112($key),$rndkey[0] # forward reference
722 jmp .Loop_avx
723___
724
725my $aesenc=sub {
726 use integer;
727 my ($n,$k)=($r/10,$r%10);
728 if ($k==0) {
729 $code.=<<___;
730 vmovups `16*$n`($in0),$in # load input
731 vxorps $rndkey0,$in,$in
732___
733 $code.=<<___ if ($n);
734 vmovups $iv,`16*($n-1)`($out,$in0) # write output
735___
736 $code.=<<___;
737 vxorps $in,$iv,$iv
738 vaesenc $rndkey[0],$iv,$iv
739 vmovups `32+16*$k-112`($key),$rndkey[1]
740___
741 } elsif ($k==9) {
742 $sn++;
743 $code.=<<___;
744 cmp \$11,$rounds
745 jb .Lvaesenclast$sn
746 vaesenc $rndkey[0],$iv,$iv
747 vmovups `32+16*($k+0)-112`($key),$rndkey[1]
748 vaesenc $rndkey[1],$iv,$iv
749 vmovups `32+16*($k+1)-112`($key),$rndkey[0]
750 je .Lvaesenclast$sn
751 vaesenc $rndkey[0],$iv,$iv
752 vmovups `32+16*($k+2)-112`($key),$rndkey[1]
753 vaesenc $rndkey[1],$iv,$iv
754 vmovups `32+16*($k+3)-112`($key),$rndkey[0]
755.Lvaesenclast$sn:
756 vaesenclast $rndkey[0],$iv,$iv
757 vmovups 16-112($key),$rndkey[1] # forward reference
758___
759 } else {
760 $code.=<<___;
761 vaesenc $rndkey[0],$iv,$iv
762 vmovups `32+16*$k-112`($key),$rndkey[1]
763___
764 }
765 $r++; unshift(@rndkey,pop(@rndkey));
766};
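# For reference: the cmp \$11/jb/je ladder in the $k==9 arm above
# dispatches on the round count stored at 240($key): 10 rounds for
# AES-128, 12 for AES-192, 14 for AES-256. An illustrative helper
# making the mapping explicit; the name is ad hoc and the sub is not
# used by the generator:
sub _demo_aes_rounds {
	my %rounds=(128=>10,192=>12,256=>14);	# key bits => rounds
	return $rounds{$_[0]};			# e.g. _demo_aes_rounds(256)==14
}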
767
768sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
769{ use integer;
770 my $body = shift;
771 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
772 my ($a,$b,$c,$d,$e);
773
774 eval(shift(@insns));
775 eval(shift(@insns));
776 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
777 eval(shift(@insns));
778 eval(shift(@insns));
779
780 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
781 eval(shift(@insns));
782 eval(shift(@insns));
783 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
784 eval(shift(@insns));
785 eval(shift(@insns));
786 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
787 eval(shift(@insns));
788 eval(shift(@insns));
789
790 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
791 eval(shift(@insns));
792 eval(shift(@insns));
793 eval(shift(@insns));
794 eval(shift(@insns));
795
796 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
797 eval(shift(@insns));
798 eval(shift(@insns));
799 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
800 eval(shift(@insns));
801 eval(shift(@insns));
802
803 &vpsrld (@Tx[0],@X[0],31);
804 eval(shift(@insns));
805 eval(shift(@insns));
806 eval(shift(@insns));
807 eval(shift(@insns));
808
809 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
810 &vpaddd (@X[0],@X[0],@X[0]);
811 eval(shift(@insns));
812 eval(shift(@insns));
813 eval(shift(@insns));
814 eval(shift(@insns));
815
816 &vpsrld (@Tx[1],@Tx[2],30);
817 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
818 eval(shift(@insns));
819 eval(shift(@insns));
820 eval(shift(@insns));
821 eval(shift(@insns));
822
823 &vpslld (@Tx[2],@Tx[2],2);
824 &vpxor (@X[0],@X[0],@Tx[1]);
825 eval(shift(@insns));
826 eval(shift(@insns));
827 eval(shift(@insns));
828 eval(shift(@insns));
829
830 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
831 eval(shift(@insns));
832 eval(shift(@insns));
833 &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
834 eval(shift(@insns));
835 eval(shift(@insns));
836
837
838 foreach (@insns) { eval; } # remaining instructions [if any]
839
840 $Xi++; push(@X,shift(@X)); # "rotate" X[]
841 push(@Tx,shift(@Tx));
842}
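# For reference, the scalar SHA-1 message-schedule recurrence that
# Xupdate_avx_16_31 (and _32_79 below) vectorize four words at a time;
# the vpslldq/vpsrld dance above handles the intra-vector dependency
# on "X[-3]". Illustrative only, not used by the generator:
sub _demo_sha1_schedule {
	my @W=@_;				# W[0..15], 32-bit words
	for (my $t=16;$t<80;$t++) {
		my $x=$W[$t-3]^$W[$t-8]^$W[$t-14]^$W[$t-16];
		$W[$t]=(($x<<1)|($x>>31))&0xffffffff;	# rotl(x,1)
	}
	return @W;
}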
843
844sub Xupdate_avx_32_79()
845{ use integer;
846 my $body = shift;
847 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
848 my ($a,$b,$c,$d,$e);
849
850 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
851 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
852 eval(shift(@insns)); # body_20_39
853 eval(shift(@insns));
854 eval(shift(@insns));
855 eval(shift(@insns)); # rol
856
857 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
858 eval(shift(@insns));
859 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
860 if ($Xi%5) {
861	  &vmovdqa	(@Tx[2],@Tx[1]);	# "perpetuate" K_XX_XX...
862 } else { # ... or load next one
863 &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
864 }
865 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
866 eval(shift(@insns)); # ror
867 eval(shift(@insns));
868
869 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
870 eval(shift(@insns)); # body_20_39
871 eval(shift(@insns));
872 eval(shift(@insns));
873 eval(shift(@insns)); # rol
874
875 &vpsrld (@Tx[0],@X[0],30);
876 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
877 eval(shift(@insns));
878 eval(shift(@insns));
879 eval(shift(@insns)); # ror
880 eval(shift(@insns));
881
882 &vpslld (@X[0],@X[0],2);
883 eval(shift(@insns)); # body_20_39
884 eval(shift(@insns));
885 eval(shift(@insns));
886 eval(shift(@insns)); # rol
887 eval(shift(@insns));
888 eval(shift(@insns));
889 eval(shift(@insns)); # ror
890 eval(shift(@insns));
891
892 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
893 eval(shift(@insns)); # body_20_39
894 eval(shift(@insns));
895 &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
896 eval(shift(@insns));
897 eval(shift(@insns)); # rol
898 eval(shift(@insns));
899 eval(shift(@insns));
900 eval(shift(@insns)); # rol
901 eval(shift(@insns));
902
903 foreach (@insns) { eval; } # remaining instructions
904
905 $Xi++; push(@X,shift(@X)); # "rotate" X[]
906 push(@Tx,shift(@Tx));
907}
908
909sub Xuplast_avx_80()
910{ use integer;
911 my $body = shift;
912 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
913 my ($a,$b,$c,$d,$e);
914
915 eval(shift(@insns));
916 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
917 eval(shift(@insns));
918 eval(shift(@insns));
919 eval(shift(@insns));
920 eval(shift(@insns));
921
922 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
923
924 foreach (@insns) { eval; } # remaining instructions
925
926 &cmp ($inp,$len);
927 &je (".Ldone_avx");
928
929 unshift(@Tx,pop(@Tx));
930
931 &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
932 &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
933 &vmovdqu(@X[-4&7],"0($inp)"); # load input
934 &vmovdqu(@X[-3&7],"16($inp)");
935 &vmovdqu(@X[-2&7],"32($inp)");
936 &vmovdqu(@X[-1&7],"48($inp)");
937 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
938 &add ($inp,64);
939
940 $Xi=0;
941}
942
943sub Xloop_avx()
944{ use integer;
945 my $body = shift;
946 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
947 my ($a,$b,$c,$d,$e);
948
949 eval(shift(@insns));
950 eval(shift(@insns));
951 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
952 eval(shift(@insns));
953 eval(shift(@insns));
954 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
955 eval(shift(@insns));
956 eval(shift(@insns));
957 eval(shift(@insns));
958 eval(shift(@insns));
959 &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
960 eval(shift(@insns));
961 eval(shift(@insns));
962
963 foreach (@insns) { eval; }
964 $Xi++;
965}
966
967sub Xtail_avx()
968{ use integer;
969 my $body = shift;
970 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
971 my ($a,$b,$c,$d,$e);
972
973 foreach (@insns) { eval; }
974}
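# For orientation: each X* call below covers 4 of SHA-1's 80 rounds,
# since every @insns batch holds 4 copies of a body_XX_XX round. The
# schedule is 4*Xupdate_16_31 + 12*Xupdate_32_79 + Xuplast + 3*Xloop
# (or 3*Xtail on the last block), i.e. 20 calls x 4 rounds = 80.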
975
976$code.=<<___;
977.align 16
978.Loop_avx:
979___
980 &Xupdate_avx_16_31(\&body_00_19);
981 &Xupdate_avx_16_31(\&body_00_19);
982 &Xupdate_avx_16_31(\&body_00_19);
983 &Xupdate_avx_16_31(\&body_00_19);
984 &Xupdate_avx_32_79(\&body_00_19);
985 &Xupdate_avx_32_79(\&body_20_39);
986 &Xupdate_avx_32_79(\&body_20_39);
987 &Xupdate_avx_32_79(\&body_20_39);
988 &Xupdate_avx_32_79(\&body_20_39);
989 &Xupdate_avx_32_79(\&body_20_39);
990 &Xupdate_avx_32_79(\&body_40_59);
991 &Xupdate_avx_32_79(\&body_40_59);
992 &Xupdate_avx_32_79(\&body_40_59);
993 &Xupdate_avx_32_79(\&body_40_59);
994 &Xupdate_avx_32_79(\&body_40_59);
995 &Xupdate_avx_32_79(\&body_20_39);
996 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
997
998 $saved_j=$j; @saved_V=@V;
999 $saved_r=$r; @saved_rndkey=@rndkey;
1000
1001 &Xloop_avx(\&body_20_39);
1002 &Xloop_avx(\&body_20_39);
1003 &Xloop_avx(\&body_20_39);
1004
1005$code.=<<___;
1006 vmovups $iv,48($out,$in0) # write output
1007 lea 64($in0),$in0
1008
1009 add 0($ctx),$A # update context
1010 add 4($ctx),@T[0]
1011 add 8($ctx),$C
1012 add 12($ctx),$D
1013 mov $A,0($ctx)
1014 add 16($ctx),$E
1015 mov @T[0],4($ctx)
1016 mov @T[0],$B # magic seed
1017 mov $C,8($ctx)
1018 mov $D,12($ctx)
1019 mov $E,16($ctx)
1020 jmp .Loop_avx
1021
1022.align 16
1023.Ldone_avx:
1024___
1025 $jj=$j=$saved_j; @V=@saved_V;
1026 $r=$saved_r; @rndkey=@saved_rndkey;
1027
1028 &Xtail_avx(\&body_20_39);
1029 &Xtail_avx(\&body_20_39);
1030 &Xtail_avx(\&body_20_39);
1031
1032$code.=<<___;
1033 vmovups $iv,48($out,$in0) # write output
1034 mov 88(%rsp),$ivp # restore $ivp
1035
1036 add 0($ctx),$A # update context
1037 add 4($ctx),@T[0]
1038 add 8($ctx),$C
1039 mov $A,0($ctx)
1040 add 12($ctx),$D
1041 mov @T[0],4($ctx)
1042 add 16($ctx),$E
1043 mov $C,8($ctx)
1044 mov $D,12($ctx)
1045 mov $E,16($ctx)
1046 vmovups $iv,($ivp) # write IV
1047 vzeroall
1048___
1049$code.=<<___ if ($win64);
1050 movaps 96+0(%rsp),%xmm6
1051 movaps 96+16(%rsp),%xmm7
1052 movaps 96+32(%rsp),%xmm8
1053 movaps 96+48(%rsp),%xmm9
1054 movaps 96+64(%rsp),%xmm10
1055 movaps 96+80(%rsp),%xmm11
1056 movaps 96+96(%rsp),%xmm12
1057 movaps 96+112(%rsp),%xmm13
1058 movaps 96+128(%rsp),%xmm14
1059 movaps 96+144(%rsp),%xmm15
1060___
1061$code.=<<___;
1062 lea `104+($win64?10*16:0)`(%rsp),%rsi
1063 mov 0(%rsi),%r15
1064 mov 8(%rsi),%r14
1065 mov 16(%rsi),%r13
1066 mov 24(%rsi),%r12
1067 mov 32(%rsi),%rbp
1068 mov 40(%rsi),%rbx
1069 lea 48(%rsi),%rsp
1070.Lepilogue_avx:
1071 ret
1072.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
1073___
1074}
1075$code.=<<___;
1076.align 64
1077K_XX_XX:
1078.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1079.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1080.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1081.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1082.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1083
1084.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1085.align 64
1086___
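# The K_XX_XX round constants above are the standard SHA-1 ones,
# floor(2^30*sqrt(n)) for n=2,3,5,10. An illustrative sanity check,
# not used by the generator:
sub _demo_sha1_K {
	use POSIX qw(floor);
	return map { floor(sqrt($_)*2**30) } (2,3,5,10);
	# 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
}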
1087
1088# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1089# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1090if ($win64) {
1091$rec="%rcx";
1092$frame="%rdx";
1093$context="%r8";
1094$disp="%r9";
1095
1096$code.=<<___;
1097.extern __imp_RtlVirtualUnwind
1098.type ssse3_handler,\@abi-omnipotent
1099.align 16
1100ssse3_handler:
1101 push %rsi
1102 push %rdi
1103 push %rbx
1104 push %rbp
1105 push %r12
1106 push %r13
1107 push %r14
1108 push %r15
1109 pushfq
1110 sub \$64,%rsp
1111
1112 mov 120($context),%rax # pull context->Rax
1113 mov 248($context),%rbx # pull context->Rip
1114
1115 mov 8($disp),%rsi # disp->ImageBase
1116 mov 56($disp),%r11 # disp->HandlerData
1117
1118 mov 0(%r11),%r10d # HandlerData[0]
1119 lea (%rsi,%r10),%r10 # prologue label
1120 cmp %r10,%rbx # context->Rip<prologue label
1121 jb .Lcommon_seh_tail
1122
1123 mov 152($context),%rax # pull context->Rsp
1124
1125 mov 4(%r11),%r10d # HandlerData[1]
1126 lea (%rsi,%r10),%r10 # epilogue label
1127 cmp %r10,%rbx # context->Rip>=epilogue label
1128 jae .Lcommon_seh_tail
1129
1130 lea 96(%rax),%rsi
1131 lea 512($context),%rdi # &context.Xmm6
1132 mov \$20,%ecx
1133 .long 0xa548f3fc # cld; rep movsq
1134 lea `104+10*16`(%rax),%rax # adjust stack pointer
1135
1136 mov 0(%rax),%r15
1137 mov 8(%rax),%r14
1138 mov 16(%rax),%r13
1139 mov 24(%rax),%r12
1140 mov 32(%rax),%rbp
1141 mov 40(%rax),%rbx
1142 lea 48(%rax),%rax
1143 mov %rbx,144($context) # restore context->Rbx
1144 mov %rbp,160($context) # restore context->Rbp
1145 mov %r12,216($context) # restore context->R12
1146 mov %r13,224($context) # restore context->R13
1147 mov %r14,232($context) # restore context->R14
1148 mov %r15,240($context) # restore context->R15
1149
1150.Lcommon_seh_tail:
1151 mov 8(%rax),%rdi
1152 mov 16(%rax),%rsi
1153 mov %rax,152($context) # restore context->Rsp
1154 mov %rsi,168($context) # restore context->Rsi
1155 mov %rdi,176($context) # restore context->Rdi
1156
1157 mov 40($disp),%rdi # disp->ContextRecord
1158 mov $context,%rsi # context
1159 mov \$154,%ecx # sizeof(CONTEXT)
1160 .long 0xa548f3fc # cld; rep movsq
1161
1162 mov $disp,%rsi
1163 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1164 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1165 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1166 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1167 mov 40(%rsi),%r10 # disp->ContextRecord
1168 lea 56(%rsi),%r11 # &disp->HandlerData
1169 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1170 mov %r10,32(%rsp) # arg5
1171 mov %r11,40(%rsp) # arg6
1172 mov %r12,48(%rsp) # arg7
1173 mov %rcx,56(%rsp) # arg8, (NULL)
1174 call *__imp_RtlVirtualUnwind(%rip)
1175
1176 mov \$1,%eax # ExceptionContinueSearch
1177 add \$64,%rsp
1178 popfq
1179 pop %r15
1180 pop %r14
1181 pop %r13
1182 pop %r12
1183 pop %rbp
1184 pop %rbx
1185 pop %rdi
1186 pop %rsi
1187 ret
1188.size ssse3_handler,.-ssse3_handler
1189
1190.section .pdata
1191.align 4
1192 .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3
1193 .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3
1194 .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3
1195___
1196$code.=<<___ if ($avx);
1197 .rva .LSEH_begin_aesni_cbc_sha1_enc_avx
1198 .rva .LSEH_end_aesni_cbc_sha1_enc_avx
1199 .rva .LSEH_info_aesni_cbc_sha1_enc_avx
1200___
1201$code.=<<___;
1202.section .xdata
1203.align 8
1204.LSEH_info_aesni_cbc_sha1_enc_ssse3:
1205 .byte 9,0,0,0
1206 .rva ssse3_handler
1207 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
1208___
1209$code.=<<___ if ($avx);
1210.LSEH_info_aesni_cbc_sha1_enc_avx:
1211 .byte 9,0,0,0
1212 .rva ssse3_handler
1213 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1214___
1215}
1216
1217####################################################################
1218sub rex {
1219 local *opcode=shift;
1220 my ($dst,$src)=@_;
1221 my $rex=0;
1222
1223 $rex|=0x04 if($dst>=8);
1224 $rex|=0x01 if($src>=8);
1225 push @opcode,$rex|0x40 if($rex);
1226}
1227
1228sub aesni {
1229 my $line=shift;
1230 my @opcode=(0x66);
1231
1232 if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1233 my %opcodelet = (
1234 "aesenc" => 0xdc, "aesenclast" => 0xdd
1235 );
1236 return undef if (!defined($opcodelet{$1}));
1237 rex(\@opcode,$3,$2);
1238 push @opcode,0x0f,0x38,$opcodelet{$1};
1239 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
1240 return ".byte\t".join(',',@opcode);
1241 }
1242 return $line;
1243}
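# For example, feeding "aesenc %xmm1,%xmm0" through aesni() yields
# ".byte 0x66,0x0f,0x38,0xdc,0xc1", and "aesenc %xmm9,%xmm10" gains a
# REX prefix: ".byte 0x66,0x45,0x0f,0x38,0xdc,0xd1". The last byte is
# the ModR/M byte 0xc0|(src&7)|((dst&7)<<3); encoding the bytes by
# hand keeps the module assembleable by pre-AES-NI toolchains.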
1244
1245$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1246$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
1247
1248print $code;
1249close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86.pl b/src/lib/libcrypto/aes/asm/aesni-x86.pl
new file mode 100644
index 0000000000..3dc345b585
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aesni-x86.pl
@@ -0,0 +1,2189 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for the Intel AES-NI extension. In
11# the OpenSSL context it's used with the Intel engine, but it can also
12# be used as a drop-in replacement for crypto/aes/asm/aes-586.pl [see
13# below for details].
14#
15# Performance.
16#
17# To start with see corresponding paragraph in aesni-x86_64.pl...
18# Instead of filling a table similar to the one found there I've chosen
19# to summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20# The simplified table below represents 32-bit performance relative
21# to the 64-bit one at every given point. Ratios vary for different
22# encryption modes, hence the interval values.
23#
24# 16-byte 64-byte 256-byte 1-KB 8-KB
25# 53-67% 67-84% 91-94% 95-98% 97-99.5%
26#
27# Lower ratios for smaller block sizes are perfectly understandable,
28# because function call overhead is higher in 32-bit mode. For the
29# largest 8-KB blocks performance is virtually the same: 32-bit code
30# is less than 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
31
32# January 2011
33#
34# See aesni-x86_64.pl for details. Unlike the x86_64 version, this
35# module interleaves at most 6 aes[enc|dec] instructions, because there
36# are not enough registers for 8x interleave [which should be optimal
37# for Sandy Bridge]. Actually, the performance results for the 6x
38# interleave factor presented in aesni-x86_64.pl (except for CTR) are
39# for this module.
40
41# April 2011
42#
43# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44# one byte out of 8KB with a 128-bit key, Sandy Bridge - 1.09.
45
46$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
47			# generates a drop-in replacement for
48 # crypto/aes/asm/aes-586.pl:-)
49$inline=1; # inline _aesni_[en|de]crypt
50
51$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52push(@INC,"${dir}","${dir}../../perlasm");
53require "x86asm.pl";
54
55&asm_init($ARGV[0],$0);
56
57if ($PREFIX eq "aesni") { $movekey=*movups; }
58else { $movekey=*movups; }
59
60$len="eax";
61$rounds="ecx";
62$key="edx";
63$inp="esi";
64$out="edi";
65$rounds_="ebx"; # backup copy for $rounds
66$key_="ebp"; # backup copy for $key
67
68$rndkey0="xmm0";
69$rndkey1="xmm1";
70$inout0="xmm2";
71$inout1="xmm3";
72$inout2="xmm4";
73$inout3="xmm5"; $in1="xmm5";
74$inout4="xmm6"; $in0="xmm6";
75$inout5="xmm7"; $ivec="xmm7";
76
77# AESNI extension
78sub aeskeygenassist
79{ my($dst,$src,$imm)=@_;
80 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
81 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
82}
83sub aescommon
84{ my($opcodelet,$dst,$src)=@_;
85 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
86 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
87}
88sub aesimc { aescommon(0xdb,@_); }
89sub aesenc { aescommon(0xdc,@_); }
90sub aesenclast { aescommon(0xdd,@_); }
91sub aesdec { aescommon(0xde,@_); }
92sub aesdeclast { aescommon(0xdf,@_); }
93
94# Inline version of internal aesni_[en|de]crypt1
95{ my $sn;
96sub aesni_inline_generate1
97{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
98 $sn++;
99
100 &$movekey ($rndkey0,&QWP(0,$key));
101 &$movekey ($rndkey1,&QWP(16,$key));
102 &xorps ($ivec,$rndkey0) if (defined($ivec));
103 &lea ($key,&DWP(32,$key));
104 &xorps ($inout,$ivec) if (defined($ivec));
105 &xorps ($inout,$rndkey0) if (!defined($ivec));
106 &set_label("${p}1_loop_$sn");
107 eval"&aes${p} ($inout,$rndkey1)";
108 &dec ($rounds);
109 &$movekey ($rndkey1,&QWP(0,$key));
110 &lea ($key,&DWP(16,$key));
111 &jnz (&label("${p}1_loop_$sn"));
112 eval"&aes${p}last ($inout,$rndkey1)";
113}}
114
115sub aesni_generate1 # fully unrolled loop
116{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
117
118 &function_begin_B("_aesni_${p}rypt1");
119 &movups ($rndkey0,&QWP(0,$key));
120 &$movekey ($rndkey1,&QWP(0x10,$key));
121 &xorps ($inout,$rndkey0);
122 &$movekey ($rndkey0,&QWP(0x20,$key));
123 &lea ($key,&DWP(0x30,$key));
124 &cmp ($rounds,11);
125 &jb (&label("${p}128"));
126 &lea ($key,&DWP(0x20,$key));
127 &je (&label("${p}192"));
128 &lea ($key,&DWP(0x20,$key));
129 eval"&aes${p} ($inout,$rndkey1)";
130 &$movekey ($rndkey1,&QWP(-0x40,$key));
131 eval"&aes${p} ($inout,$rndkey0)";
132 &$movekey ($rndkey0,&QWP(-0x30,$key));
133 &set_label("${p}192");
134 eval"&aes${p} ($inout,$rndkey1)";
135 &$movekey ($rndkey1,&QWP(-0x20,$key));
136 eval"&aes${p} ($inout,$rndkey0)";
137 &$movekey ($rndkey0,&QWP(-0x10,$key));
138 &set_label("${p}128");
139 eval"&aes${p} ($inout,$rndkey1)";
140 &$movekey ($rndkey1,&QWP(0,$key));
141 eval"&aes${p} ($inout,$rndkey0)";
142 &$movekey ($rndkey0,&QWP(0x10,$key));
143 eval"&aes${p} ($inout,$rndkey1)";
144 &$movekey ($rndkey1,&QWP(0x20,$key));
145 eval"&aes${p} ($inout,$rndkey0)";
146 &$movekey ($rndkey0,&QWP(0x30,$key));
147 eval"&aes${p} ($inout,$rndkey1)";
148 &$movekey ($rndkey1,&QWP(0x40,$key));
149 eval"&aes${p} ($inout,$rndkey0)";
150 &$movekey ($rndkey0,&QWP(0x50,$key));
151 eval"&aes${p} ($inout,$rndkey1)";
152 &$movekey ($rndkey1,&QWP(0x60,$key));
153 eval"&aes${p} ($inout,$rndkey0)";
154 &$movekey ($rndkey0,&QWP(0x70,$key));
155 eval"&aes${p} ($inout,$rndkey1)";
156 eval"&aes${p}last ($inout,$rndkey0)";
157 &ret();
158 &function_end_B("_aesni_${p}rypt1");
159}
160
161# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
162&aesni_generate1("enc") if (!$inline);
163&function_begin_B("${PREFIX}_encrypt");
164 &mov ("eax",&wparam(0));
165 &mov ($key,&wparam(2));
166 &movups ($inout0,&QWP(0,"eax"));
167 &mov ($rounds,&DWP(240,$key));
168 &mov ("eax",&wparam(1));
169 if ($inline)
170 { &aesni_inline_generate1("enc"); }
171 else
172 { &call ("_aesni_encrypt1"); }
173 &movups (&QWP(0,"eax"),$inout0);
174 &ret ();
175&function_end_B("${PREFIX}_encrypt");
176
177# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
178&aesni_generate1("dec") if(!$inline);
179&function_begin_B("${PREFIX}_decrypt");
180 &mov ("eax",&wparam(0));
181 &mov ($key,&wparam(2));
182 &movups ($inout0,&QWP(0,"eax"));
183 &mov ($rounds,&DWP(240,$key));
184 &mov ("eax",&wparam(1));
185 if ($inline)
186 { &aesni_inline_generate1("dec"); }
187 else
188 { &call ("_aesni_decrypt1"); }
189 &movups (&QWP(0,"eax"),$inout0);
190 &ret ();
191&function_end_B("${PREFIX}_decrypt");
192
193# _aesni_[en|de]cryptN are private interfaces, N denotes the interleave
194# factor. Why were 3x subroutines originally used in loops? Even though
195# aes[enc|dec] latency was originally 6, it could be scheduled only
196# every *2nd* cycle. Thus 3x interleave was the one providing optimal
197# utilization, i.e. the subroutine's throughput is virtually the same
198# as that of a non-interleaved subroutine [for up to 3 input blocks].
199# This is why it makes no sense to implement a 2x subroutine.
200# aes[enc|dec] latency in the next processor generation is 8, but the
201# instructions can be scheduled every cycle. Optimal interleave for
202# the new processor is therefore 8x, but it's unfeasible to accommodate
203# it in XMM registers addressable in 32-bit mode, and therefore 6x is
204# used instead...
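# A rough throughput model of the above (illustrative, not used by the
# generator): with latency L and one aes[enc|dec] issued every T cycles,
# Nx interleave spends max(L,N*T) cycles per round over N blocks:
sub _demo_cycles_per_block_round {
	my ($L,$T,$N)=@_;
	my $busy=$N*$T;
	return (($L>$busy)?$L:$busy)/$N;
}
# L=6,T=2: N=1 -> 6, N=3 -> 2, N=6 -> 2	(3x already optimal)
# L=8,T=1: N=3 -> 2.67, N=6 -> 1.33, N=8 -> 1	(8x optimal, 6x used)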
205
206sub aesni_generate3
207{ my $p=shift;
208
209 &function_begin_B("_aesni_${p}rypt3");
210 &$movekey ($rndkey0,&QWP(0,$key));
211 &shr ($rounds,1);
212 &$movekey ($rndkey1,&QWP(16,$key));
213 &lea ($key,&DWP(32,$key));
214 &xorps ($inout0,$rndkey0);
215 &pxor ($inout1,$rndkey0);
216 &pxor ($inout2,$rndkey0);
217 &$movekey ($rndkey0,&QWP(0,$key));
218
219 &set_label("${p}3_loop");
220 eval"&aes${p} ($inout0,$rndkey1)";
221 eval"&aes${p} ($inout1,$rndkey1)";
222 &dec ($rounds);
223 eval"&aes${p} ($inout2,$rndkey1)";
224 &$movekey ($rndkey1,&QWP(16,$key));
225 eval"&aes${p} ($inout0,$rndkey0)";
226 eval"&aes${p} ($inout1,$rndkey0)";
227 &lea ($key,&DWP(32,$key));
228 eval"&aes${p} ($inout2,$rndkey0)";
229 &$movekey ($rndkey0,&QWP(0,$key));
230 &jnz (&label("${p}3_loop"));
231 eval"&aes${p} ($inout0,$rndkey1)";
232 eval"&aes${p} ($inout1,$rndkey1)";
233 eval"&aes${p} ($inout2,$rndkey1)";
234 eval"&aes${p}last ($inout0,$rndkey0)";
235 eval"&aes${p}last ($inout1,$rndkey0)";
236 eval"&aes${p}last ($inout2,$rndkey0)";
237 &ret();
238 &function_end_B("_aesni_${p}rypt3");
239}
240
241# 4x interleave is implemented to improve small block performance,
242# most notably [and naturally] 4-block performance by ~30%. One could
243# argue that 5x should have been implemented as well, but the
244# improvement would be <20%, so it's not worth it...
245sub aesni_generate4
246{ my $p=shift;
247
248 &function_begin_B("_aesni_${p}rypt4");
249 &$movekey ($rndkey0,&QWP(0,$key));
250 &$movekey ($rndkey1,&QWP(16,$key));
251 &shr ($rounds,1);
252 &lea ($key,&DWP(32,$key));
253 &xorps ($inout0,$rndkey0);
254 &pxor ($inout1,$rndkey0);
255 &pxor ($inout2,$rndkey0);
256 &pxor ($inout3,$rndkey0);
257 &$movekey ($rndkey0,&QWP(0,$key));
258
259 &set_label("${p}4_loop");
260 eval"&aes${p} ($inout0,$rndkey1)";
261 eval"&aes${p} ($inout1,$rndkey1)";
262 &dec ($rounds);
263 eval"&aes${p} ($inout2,$rndkey1)";
264 eval"&aes${p} ($inout3,$rndkey1)";
265 &$movekey ($rndkey1,&QWP(16,$key));
266 eval"&aes${p} ($inout0,$rndkey0)";
267 eval"&aes${p} ($inout1,$rndkey0)";
268 &lea ($key,&DWP(32,$key));
269 eval"&aes${p} ($inout2,$rndkey0)";
270 eval"&aes${p} ($inout3,$rndkey0)";
271 &$movekey ($rndkey0,&QWP(0,$key));
272 &jnz (&label("${p}4_loop"));
273
274 eval"&aes${p} ($inout0,$rndkey1)";
275 eval"&aes${p} ($inout1,$rndkey1)";
276 eval"&aes${p} ($inout2,$rndkey1)";
277 eval"&aes${p} ($inout3,$rndkey1)";
278 eval"&aes${p}last ($inout0,$rndkey0)";
279 eval"&aes${p}last ($inout1,$rndkey0)";
280 eval"&aes${p}last ($inout2,$rndkey0)";
281 eval"&aes${p}last ($inout3,$rndkey0)";
282 &ret();
283 &function_end_B("_aesni_${p}rypt4");
284}
285
286sub aesni_generate6
287{ my $p=shift;
288
289 &function_begin_B("_aesni_${p}rypt6");
290 &static_label("_aesni_${p}rypt6_enter");
291 &$movekey ($rndkey0,&QWP(0,$key));
292 &shr ($rounds,1);
293 &$movekey ($rndkey1,&QWP(16,$key));
294 &lea ($key,&DWP(32,$key));
295 &xorps ($inout0,$rndkey0);
296 &pxor ($inout1,$rndkey0); # pxor does better here
297 eval"&aes${p} ($inout0,$rndkey1)";
298 &pxor ($inout2,$rndkey0);
299 eval"&aes${p} ($inout1,$rndkey1)";
300 &pxor ($inout3,$rndkey0);
301 &dec ($rounds);
302 eval"&aes${p} ($inout2,$rndkey1)";
303 &pxor ($inout4,$rndkey0);
304 eval"&aes${p} ($inout3,$rndkey1)";
305 &pxor ($inout5,$rndkey0);
306 eval"&aes${p} ($inout4,$rndkey1)";
307 &$movekey ($rndkey0,&QWP(0,$key));
308 eval"&aes${p} ($inout5,$rndkey1)";
309 &jmp (&label("_aesni_${p}rypt6_enter"));
310
311 &set_label("${p}6_loop",16);
312 eval"&aes${p} ($inout0,$rndkey1)";
313 eval"&aes${p} ($inout1,$rndkey1)";
314 &dec ($rounds);
315 eval"&aes${p} ($inout2,$rndkey1)";
316 eval"&aes${p} ($inout3,$rndkey1)";
317 eval"&aes${p} ($inout4,$rndkey1)";
318 eval"&aes${p} ($inout5,$rndkey1)";
319 &set_label("_aesni_${p}rypt6_enter",16);
320 &$movekey ($rndkey1,&QWP(16,$key));
321 eval"&aes${p} ($inout0,$rndkey0)";
322 eval"&aes${p} ($inout1,$rndkey0)";
323 &lea ($key,&DWP(32,$key));
324 eval"&aes${p} ($inout2,$rndkey0)";
325 eval"&aes${p} ($inout3,$rndkey0)";
326 eval"&aes${p} ($inout4,$rndkey0)";
327 eval"&aes${p} ($inout5,$rndkey0)";
328 &$movekey ($rndkey0,&QWP(0,$key));
329 &jnz (&label("${p}6_loop"));
330
331 eval"&aes${p} ($inout0,$rndkey1)";
332 eval"&aes${p} ($inout1,$rndkey1)";
333 eval"&aes${p} ($inout2,$rndkey1)";
334 eval"&aes${p} ($inout3,$rndkey1)";
335 eval"&aes${p} ($inout4,$rndkey1)";
336 eval"&aes${p} ($inout5,$rndkey1)";
337 eval"&aes${p}last ($inout0,$rndkey0)";
338 eval"&aes${p}last ($inout1,$rndkey0)";
339 eval"&aes${p}last ($inout2,$rndkey0)";
340 eval"&aes${p}last ($inout3,$rndkey0)";
341 eval"&aes${p}last ($inout4,$rndkey0)";
342 eval"&aes${p}last ($inout5,$rndkey0)";
343 &ret();
344 &function_end_B("_aesni_${p}rypt6");
345}
346&aesni_generate3("enc") if ($PREFIX eq "aesni");
347&aesni_generate3("dec");
348&aesni_generate4("enc") if ($PREFIX eq "aesni");
349&aesni_generate4("dec");
350&aesni_generate6("enc") if ($PREFIX eq "aesni");
351&aesni_generate6("dec");
352
353if ($PREFIX eq "aesni") {
354######################################################################
355# void aesni_ecb_encrypt (const void *in, void *out,
356# size_t length, const AES_KEY *key,
357# int enc);
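# The tail of 1..5 blocks left over after the 6x loop below is
# dispatched to the smallest suitable interleave. An illustrative
# helper, not used by the generator:
sub _demo_ecb_tail_width {
	my @w=(1,3,3,4,6);		# blocks left => interleave used
	return $w[$_[0]-1];
	# 2 blocks ride the 3x subroutine with a zeroed third lane,
	# 5 blocks ride the 6x subroutine with a zeroed sixth lane
}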
358&function_begin("aesni_ecb_encrypt");
359 &mov ($inp,&wparam(0));
360 &mov ($out,&wparam(1));
361 &mov ($len,&wparam(2));
362 &mov ($key,&wparam(3));
363 &mov ($rounds_,&wparam(4));
364 &and ($len,-16);
365 &jz (&label("ecb_ret"));
366 &mov ($rounds,&DWP(240,$key));
367 &test ($rounds_,$rounds_);
368 &jz (&label("ecb_decrypt"));
369
370 &mov ($key_,$key); # backup $key
371 &mov ($rounds_,$rounds); # backup $rounds
372 &cmp ($len,0x60);
373 &jb (&label("ecb_enc_tail"));
374
375 &movdqu ($inout0,&QWP(0,$inp));
376 &movdqu ($inout1,&QWP(0x10,$inp));
377 &movdqu ($inout2,&QWP(0x20,$inp));
378 &movdqu ($inout3,&QWP(0x30,$inp));
379 &movdqu ($inout4,&QWP(0x40,$inp));
380 &movdqu ($inout5,&QWP(0x50,$inp));
381 &lea ($inp,&DWP(0x60,$inp));
382 &sub ($len,0x60);
383 &jmp (&label("ecb_enc_loop6_enter"));
384
385&set_label("ecb_enc_loop6",16);
386 &movups (&QWP(0,$out),$inout0);
387 &movdqu ($inout0,&QWP(0,$inp));
388 &movups (&QWP(0x10,$out),$inout1);
389 &movdqu ($inout1,&QWP(0x10,$inp));
390 &movups (&QWP(0x20,$out),$inout2);
391 &movdqu ($inout2,&QWP(0x20,$inp));
392 &movups (&QWP(0x30,$out),$inout3);
393 &movdqu ($inout3,&QWP(0x30,$inp));
394 &movups (&QWP(0x40,$out),$inout4);
395 &movdqu ($inout4,&QWP(0x40,$inp));
396 &movups (&QWP(0x50,$out),$inout5);
397 &lea ($out,&DWP(0x60,$out));
398 &movdqu ($inout5,&QWP(0x50,$inp));
399 &lea ($inp,&DWP(0x60,$inp));
400&set_label("ecb_enc_loop6_enter");
401
402 &call ("_aesni_encrypt6");
403
404 &mov ($key,$key_); # restore $key
405 &mov ($rounds,$rounds_); # restore $rounds
406 &sub ($len,0x60);
407 &jnc (&label("ecb_enc_loop6"));
408
409 &movups (&QWP(0,$out),$inout0);
410 &movups (&QWP(0x10,$out),$inout1);
411 &movups (&QWP(0x20,$out),$inout2);
412 &movups (&QWP(0x30,$out),$inout3);
413 &movups (&QWP(0x40,$out),$inout4);
414 &movups (&QWP(0x50,$out),$inout5);
415 &lea ($out,&DWP(0x60,$out));
416 &add ($len,0x60);
417 &jz (&label("ecb_ret"));
418
419&set_label("ecb_enc_tail");
420 &movups ($inout0,&QWP(0,$inp));
421 &cmp ($len,0x20);
422 &jb (&label("ecb_enc_one"));
423 &movups ($inout1,&QWP(0x10,$inp));
424 &je (&label("ecb_enc_two"));
425 &movups ($inout2,&QWP(0x20,$inp));
426 &cmp ($len,0x40);
427 &jb (&label("ecb_enc_three"));
428 &movups ($inout3,&QWP(0x30,$inp));
429 &je (&label("ecb_enc_four"));
430 &movups ($inout4,&QWP(0x40,$inp));
431 &xorps ($inout5,$inout5);
432 &call ("_aesni_encrypt6");
433 &movups (&QWP(0,$out),$inout0);
434 &movups (&QWP(0x10,$out),$inout1);
435 &movups (&QWP(0x20,$out),$inout2);
436 &movups (&QWP(0x30,$out),$inout3);
437 &movups (&QWP(0x40,$out),$inout4);
438	&jmp	(&label("ecb_ret"));
439
440&set_label("ecb_enc_one",16);
441 if ($inline)
442 { &aesni_inline_generate1("enc"); }
443 else
444 { &call ("_aesni_encrypt1"); }
445 &movups (&QWP(0,$out),$inout0);
446 &jmp (&label("ecb_ret"));
447
448&set_label("ecb_enc_two",16);
449 &xorps ($inout2,$inout2);
450 &call ("_aesni_encrypt3");
451 &movups (&QWP(0,$out),$inout0);
452 &movups (&QWP(0x10,$out),$inout1);
453 &jmp (&label("ecb_ret"));
454
455&set_label("ecb_enc_three",16);
456 &call ("_aesni_encrypt3");
457 &movups (&QWP(0,$out),$inout0);
458 &movups (&QWP(0x10,$out),$inout1);
459 &movups (&QWP(0x20,$out),$inout2);
460 &jmp (&label("ecb_ret"));
461
462&set_label("ecb_enc_four",16);
463 &call ("_aesni_encrypt4");
464 &movups (&QWP(0,$out),$inout0);
465 &movups (&QWP(0x10,$out),$inout1);
466 &movups (&QWP(0x20,$out),$inout2);
467 &movups (&QWP(0x30,$out),$inout3);
468 &jmp (&label("ecb_ret"));
469######################################################################
470&set_label("ecb_decrypt",16);
471 &mov ($key_,$key); # backup $key
472 &mov ($rounds_,$rounds); # backup $rounds
473 &cmp ($len,0x60);
474 &jb (&label("ecb_dec_tail"));
475
476 &movdqu ($inout0,&QWP(0,$inp));
477 &movdqu ($inout1,&QWP(0x10,$inp));
478 &movdqu ($inout2,&QWP(0x20,$inp));
479 &movdqu ($inout3,&QWP(0x30,$inp));
480 &movdqu ($inout4,&QWP(0x40,$inp));
481 &movdqu ($inout5,&QWP(0x50,$inp));
482 &lea ($inp,&DWP(0x60,$inp));
483 &sub ($len,0x60);
484 &jmp (&label("ecb_dec_loop6_enter"));
485
486&set_label("ecb_dec_loop6",16);
487 &movups (&QWP(0,$out),$inout0);
488 &movdqu ($inout0,&QWP(0,$inp));
489 &movups (&QWP(0x10,$out),$inout1);
490 &movdqu ($inout1,&QWP(0x10,$inp));
491 &movups (&QWP(0x20,$out),$inout2);
492 &movdqu ($inout2,&QWP(0x20,$inp));
493 &movups (&QWP(0x30,$out),$inout3);
494 &movdqu ($inout3,&QWP(0x30,$inp));
495 &movups (&QWP(0x40,$out),$inout4);
496 &movdqu ($inout4,&QWP(0x40,$inp));
497 &movups (&QWP(0x50,$out),$inout5);
498 &lea ($out,&DWP(0x60,$out));
499 &movdqu ($inout5,&QWP(0x50,$inp));
500 &lea ($inp,&DWP(0x60,$inp));
501&set_label("ecb_dec_loop6_enter");
502
503 &call ("_aesni_decrypt6");
504
505 &mov ($key,$key_); # restore $key
506 &mov ($rounds,$rounds_); # restore $rounds
507 &sub ($len,0x60);
508 &jnc (&label("ecb_dec_loop6"));
509
510 &movups (&QWP(0,$out),$inout0);
511 &movups (&QWP(0x10,$out),$inout1);
512 &movups (&QWP(0x20,$out),$inout2);
513 &movups (&QWP(0x30,$out),$inout3);
514 &movups (&QWP(0x40,$out),$inout4);
515 &movups (&QWP(0x50,$out),$inout5);
516 &lea ($out,&DWP(0x60,$out));
517 &add ($len,0x60);
518 &jz (&label("ecb_ret"));
519
520&set_label("ecb_dec_tail");
521 &movups ($inout0,&QWP(0,$inp));
522 &cmp ($len,0x20);
523 &jb (&label("ecb_dec_one"));
524 &movups ($inout1,&QWP(0x10,$inp));
525 &je (&label("ecb_dec_two"));
526 &movups ($inout2,&QWP(0x20,$inp));
527 &cmp ($len,0x40);
528 &jb (&label("ecb_dec_three"));
529 &movups ($inout3,&QWP(0x30,$inp));
530 &je (&label("ecb_dec_four"));
531 &movups ($inout4,&QWP(0x40,$inp));
532 &xorps ($inout5,$inout5);
533 &call ("_aesni_decrypt6");
534 &movups (&QWP(0,$out),$inout0);
535 &movups (&QWP(0x10,$out),$inout1);
536 &movups (&QWP(0x20,$out),$inout2);
537 &movups (&QWP(0x30,$out),$inout3);
538 &movups (&QWP(0x40,$out),$inout4);
539 &jmp (&label("ecb_ret"));
540
541&set_label("ecb_dec_one",16);
542 if ($inline)
543 { &aesni_inline_generate1("dec"); }
544 else
545 { &call ("_aesni_decrypt1"); }
546 &movups (&QWP(0,$out),$inout0);
547 &jmp (&label("ecb_ret"));
548
549&set_label("ecb_dec_two",16);
550 &xorps ($inout2,$inout2);
551 &call ("_aesni_decrypt3");
552 &movups (&QWP(0,$out),$inout0);
553 &movups (&QWP(0x10,$out),$inout1);
554 &jmp (&label("ecb_ret"));
555
556&set_label("ecb_dec_three",16);
557 &call ("_aesni_decrypt3");
558 &movups (&QWP(0,$out),$inout0);
559 &movups (&QWP(0x10,$out),$inout1);
560 &movups (&QWP(0x20,$out),$inout2);
561 &jmp (&label("ecb_ret"));
562
563&set_label("ecb_dec_four",16);
564 &call ("_aesni_decrypt4");
565 &movups (&QWP(0,$out),$inout0);
566 &movups (&QWP(0x10,$out),$inout1);
567 &movups (&QWP(0x20,$out),$inout2);
568 &movups (&QWP(0x30,$out),$inout3);
569
570&set_label("ecb_ret");
571&function_end("aesni_ecb_encrypt");
572
573######################################################################
574# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
575# size_t blocks, const AES_KEY *key,
576# const char *ivec,char *cmac);
577#
578# Handles only complete blocks, operates on a 64-bit counter and
579# does not update *ivec! Nor does it finalize the CMAC value
580# (see engine/eng_aesni.c for details)
581#
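# The full byte-reversing pshufb mask composed below lets the code
# increment the big-endian 64-bit counter (the IV's trailing 8 bytes)
# with a single little-endian paddq. An illustrative model, not used by
# the generator; cross-lane carry is dropped just as paddq drops it,
# and a perl with 64-bit integers is assumed for the "Q" template:
sub _demo_ccm64_ctr_inc {
	my $le=reverse shift;			# what pshufb does to ivec
	my @q=unpack("Q<2",$le);		# two little-endian qwords
	$q[0]++;				# paddq with {1,0}
	return scalar reverse pack("Q<2",@q);
}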
582{ my $cmac=$inout1;
583&function_begin("aesni_ccm64_encrypt_blocks");
584 &mov ($inp,&wparam(0));
585 &mov ($out,&wparam(1));
586 &mov ($len,&wparam(2));
587 &mov ($key,&wparam(3));
588 &mov ($rounds_,&wparam(4));
589 &mov ($rounds,&wparam(5));
590 &mov ($key_,"esp");
591 &sub ("esp",60);
592 &and ("esp",-16); # align stack
593 &mov (&DWP(48,"esp"),$key_);
594
595 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
596 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
597 &mov ($rounds,&DWP(240,$key));
598
599 # compose byte-swap control mask for pshufb on stack
600 &mov (&DWP(0,"esp"),0x0c0d0e0f);
601 &mov (&DWP(4,"esp"),0x08090a0b);
602 &mov (&DWP(8,"esp"),0x04050607);
603 &mov (&DWP(12,"esp"),0x00010203);
604
605 # compose counter increment vector on stack
606 &mov ($rounds_,1);
607 &xor ($key_,$key_);
608 &mov (&DWP(16,"esp"),$rounds_);
609 &mov (&DWP(20,"esp"),$key_);
610 &mov (&DWP(24,"esp"),$key_);
611 &mov (&DWP(28,"esp"),$key_);
612
613 &shr ($rounds,1);
614 &lea ($key_,&DWP(0,$key));
615 &movdqa ($inout3,&QWP(0,"esp"));
616 &movdqa ($inout0,$ivec);
617 &mov ($rounds_,$rounds);
618 &pshufb ($ivec,$inout3);
619
620&set_label("ccm64_enc_outer");
621 &$movekey ($rndkey0,&QWP(0,$key_));
622 &mov ($rounds,$rounds_);
623 &movups ($in0,&QWP(0,$inp));
624
625 &xorps ($inout0,$rndkey0);
626 &$movekey ($rndkey1,&QWP(16,$key_));
627 &xorps ($rndkey0,$in0);
628 &lea ($key,&DWP(32,$key_));
629 &xorps ($cmac,$rndkey0); # cmac^=inp
630 &$movekey ($rndkey0,&QWP(0,$key));
631
632&set_label("ccm64_enc2_loop");
633 &aesenc ($inout0,$rndkey1);
634 &dec ($rounds);
635 &aesenc ($cmac,$rndkey1);
636 &$movekey ($rndkey1,&QWP(16,$key));
637 &aesenc ($inout0,$rndkey0);
638 &lea ($key,&DWP(32,$key));
639 &aesenc ($cmac,$rndkey0);
640 &$movekey ($rndkey0,&QWP(0,$key));
641 &jnz (&label("ccm64_enc2_loop"));
642 &aesenc ($inout0,$rndkey1);
643 &aesenc ($cmac,$rndkey1);
644 &paddq ($ivec,&QWP(16,"esp"));
645 &aesenclast ($inout0,$rndkey0);
646 &aesenclast ($cmac,$rndkey0);
647
648 &dec ($len);
649 &lea ($inp,&DWP(16,$inp));
650 &xorps ($in0,$inout0); # inp^=E(ivec)
651 &movdqa ($inout0,$ivec);
652 &movups (&QWP(0,$out),$in0); # save output
653 &lea ($out,&DWP(16,$out));
654 &pshufb ($inout0,$inout3);
655 &jnz (&label("ccm64_enc_outer"));
656
657 &mov ("esp",&DWP(48,"esp"));
658 &mov ($out,&wparam(5));
659 &movups (&QWP(0,$out),$cmac);
660&function_end("aesni_ccm64_encrypt_blocks");
661
662&function_begin("aesni_ccm64_decrypt_blocks");
663 &mov ($inp,&wparam(0));
664 &mov ($out,&wparam(1));
665 &mov ($len,&wparam(2));
666 &mov ($key,&wparam(3));
667 &mov ($rounds_,&wparam(4));
668 &mov ($rounds,&wparam(5));
669 &mov ($key_,"esp");
670 &sub ("esp",60);
671 &and ("esp",-16); # align stack
672 &mov (&DWP(48,"esp"),$key_);
673
674 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
675 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
676 &mov ($rounds,&DWP(240,$key));
677
678 # compose byte-swap control mask for pshufb on stack
679 &mov (&DWP(0,"esp"),0x0c0d0e0f);
680 &mov (&DWP(4,"esp"),0x08090a0b);
681 &mov (&DWP(8,"esp"),0x04050607);
682 &mov (&DWP(12,"esp"),0x00010203);
683
684 # compose counter increment vector on stack
685 &mov ($rounds_,1);
686 &xor ($key_,$key_);
687 &mov (&DWP(16,"esp"),$rounds_);
688 &mov (&DWP(20,"esp"),$key_);
689 &mov (&DWP(24,"esp"),$key_);
690 &mov (&DWP(28,"esp"),$key_);
691
692 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
693 &movdqa ($inout0,$ivec);
694
695 &mov ($key_,$key);
696 &mov ($rounds_,$rounds);
697
698 &pshufb ($ivec,$inout3);
699 if ($inline)
700 { &aesni_inline_generate1("enc"); }
701 else
702 { &call ("_aesni_encrypt1"); }
703 &movups ($in0,&QWP(0,$inp)); # load inp
704 &paddq ($ivec,&QWP(16,"esp"));
705 &lea ($inp,&QWP(16,$inp));
706 &jmp (&label("ccm64_dec_outer"));
707
708&set_label("ccm64_dec_outer",16);
709 &xorps ($in0,$inout0); # inp ^= E(ivec)
710 &movdqa ($inout0,$ivec);
711 &mov ($rounds,$rounds_);
712 &movups (&QWP(0,$out),$in0); # save output
713 &lea ($out,&DWP(16,$out));
714 &pshufb ($inout0,$inout3);
715
716 &sub ($len,1);
717 &jz (&label("ccm64_dec_break"));
718
719 &$movekey ($rndkey0,&QWP(0,$key_));
720 &shr ($rounds,1);
721 &$movekey ($rndkey1,&QWP(16,$key_));
722 &xorps ($in0,$rndkey0);
723 &lea ($key,&DWP(32,$key_));
724 &xorps ($inout0,$rndkey0);
725 &xorps ($cmac,$in0); # cmac^=out
726 &$movekey ($rndkey0,&QWP(0,$key));
727
728&set_label("ccm64_dec2_loop");
729 &aesenc ($inout0,$rndkey1);
730 &dec ($rounds);
731 &aesenc ($cmac,$rndkey1);
732 &$movekey ($rndkey1,&QWP(16,$key));
733 &aesenc ($inout0,$rndkey0);
734 &lea ($key,&DWP(32,$key));
735 &aesenc ($cmac,$rndkey0);
736 &$movekey ($rndkey0,&QWP(0,$key));
737 &jnz (&label("ccm64_dec2_loop"));
738 &movups ($in0,&QWP(0,$inp)); # load inp
739 &paddq ($ivec,&QWP(16,"esp"));
740 &aesenc ($inout0,$rndkey1);
741 &aesenc ($cmac,$rndkey1);
742 &lea ($inp,&QWP(16,$inp));
743 &aesenclast ($inout0,$rndkey0);
744 &aesenclast ($cmac,$rndkey0);
745 &jmp (&label("ccm64_dec_outer"));
746
747&set_label("ccm64_dec_break",16);
748 &mov ($key,$key_);
749 if ($inline)
750 { &aesni_inline_generate1("enc",$cmac,$in0); }
751 else
752 { &call ("_aesni_encrypt1",$cmac); }
753
754 &mov ("esp",&DWP(48,"esp"));
755 &mov ($out,&wparam(5));
756 &movups (&QWP(0,$out),$cmac);
757&function_end("aesni_ccm64_decrypt_blocks");
758}
759
760######################################################################
761# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
762# size_t blocks, const AES_KEY *key,
763# const char *ivec);
764#
765# Handles only complete blocks, operates on a 32-bit counter and
766# does not update *ivec! (see engine/eng_aesni.c for details)
767#
768# stack layout:
769# 0 pshufb mask
770# 16 vector addend: 0,6,6,6
771# 32 counter-less ivec
772# 48 1st triplet of counter vector
773# 64 2nd triplet of counter vector
774# 80 saved %esp
775
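# The counter handling below keeps six per-block counter values as two
# triplets of host-order dwords, refreshed with two paddd per iteration.
# An illustrative model of the values produced, not used by the
# generator:
sub _demo_ctr32_counters {
	my ($ctr,$n)=@_;			# 32-bit counter, block count
	return map { ($ctr+$_)&0xffffffff } (0..$n-1);
}
# each value is then byte-swapped and merged with the counter-less ivec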
776&function_begin("aesni_ctr32_encrypt_blocks");
777 &mov ($inp,&wparam(0));
778 &mov ($out,&wparam(1));
779 &mov ($len,&wparam(2));
780 &mov ($key,&wparam(3));
781 &mov ($rounds_,&wparam(4));
782 &mov ($key_,"esp");
783 &sub ("esp",88);
784 &and ("esp",-16); # align stack
785 &mov (&DWP(80,"esp"),$key_);
786
787 &cmp ($len,1);
788 &je (&label("ctr32_one_shortcut"));
789
790 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
791
792 # compose byte-swap control mask for pshufb on stack
793 &mov (&DWP(0,"esp"),0x0c0d0e0f);
794 &mov (&DWP(4,"esp"),0x08090a0b);
795 &mov (&DWP(8,"esp"),0x04050607);
796 &mov (&DWP(12,"esp"),0x00010203);
797
798 # compose counter increment vector on stack
799 &mov ($rounds,6);
800 &xor ($key_,$key_);
801 &mov (&DWP(16,"esp"),$rounds);
802 &mov (&DWP(20,"esp"),$rounds);
803 &mov (&DWP(24,"esp"),$rounds);
804 &mov (&DWP(28,"esp"),$key_);
805
806 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
807 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
808
809 &mov ($rounds,&DWP(240,$key)); # key->rounds
810
811 # compose 2 vectors of 3x32-bit counters
812 &bswap ($rounds_);
813 &pxor ($rndkey1,$rndkey1);
814 &pxor ($rndkey0,$rndkey0);
815 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
816 &pinsrd ($rndkey1,$rounds_,0);
817 &lea ($key_,&DWP(3,$rounds_));
818 &pinsrd ($rndkey0,$key_,0);
819 &inc ($rounds_);
820 &pinsrd ($rndkey1,$rounds_,1);
821 &inc ($key_);
822 &pinsrd ($rndkey0,$key_,1);
823 &inc ($rounds_);
824 &pinsrd ($rndkey1,$rounds_,2);
825 &inc ($key_);
826 &pinsrd ($rndkey0,$key_,2);
827 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
828 &pshufb ($rndkey1,$inout0); # byte swap
829 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
830 &pshufb ($rndkey0,$inout0); # byte swap
831
832 &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword
833 &pshufd ($inout1,$rndkey1,2<<6);
834 &cmp ($len,6);
835 &jb (&label("ctr32_tail"));
836 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
837 &shr ($rounds,1);
838 &mov ($key_,$key); # backup $key
839 &mov ($rounds_,$rounds); # backup $rounds
840 &sub ($len,6);
841 &jmp (&label("ctr32_loop6"));
842
843&set_label("ctr32_loop6",16);
844 &pshufd ($inout2,$rndkey1,1<<6);
845 &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
846 &pshufd ($inout3,$rndkey0,3<<6);
847 &por ($inout0,$rndkey1); # merge counter-less ivec
848 &pshufd ($inout4,$rndkey0,2<<6);
849 &por ($inout1,$rndkey1);
850 &pshufd ($inout5,$rndkey0,1<<6);
851 &por ($inout2,$rndkey1);
852 &por ($inout3,$rndkey1);
853 &por ($inout4,$rndkey1);
854 &por ($inout5,$rndkey1);
855
856 # inlining _aesni_encrypt6's prologue gives ~4% improvement...
857 &$movekey ($rndkey0,&QWP(0,$key_));
858 &$movekey ($rndkey1,&QWP(16,$key_));
859 &lea ($key,&DWP(32,$key_));
860 &dec ($rounds);
861 &pxor ($inout0,$rndkey0);
862 &pxor ($inout1,$rndkey0);
863 &aesenc ($inout0,$rndkey1);
864 &pxor ($inout2,$rndkey0);
865 &aesenc ($inout1,$rndkey1);
866 &pxor ($inout3,$rndkey0);
867 &aesenc ($inout2,$rndkey1);
868 &pxor ($inout4,$rndkey0);
869 &aesenc ($inout3,$rndkey1);
870 &pxor ($inout5,$rndkey0);
871 &aesenc ($inout4,$rndkey1);
872 &$movekey ($rndkey0,&QWP(0,$key));
873 &aesenc ($inout5,$rndkey1);
874
875 &call (&label("_aesni_encrypt6_enter"));
876
877 &movups ($rndkey1,&QWP(0,$inp));
878 &movups ($rndkey0,&QWP(0x10,$inp));
879 &xorps ($inout0,$rndkey1);
880 &movups ($rndkey1,&QWP(0x20,$inp));
881 &xorps ($inout1,$rndkey0);
882 &movups (&QWP(0,$out),$inout0);
883 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
884 &xorps ($inout2,$rndkey1);
885 &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
886 &movups (&QWP(0x10,$out),$inout1);
887 &movups (&QWP(0x20,$out),$inout2);
888
889 &paddd ($rndkey1,$rndkey0); # 1st triplet increment
890 &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
891 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
892
893 &movups ($inout1,&QWP(0x30,$inp));
894 &movups ($inout2,&QWP(0x40,$inp));
895 &xorps ($inout3,$inout1);
896 &movups ($inout1,&QWP(0x50,$inp));
897 &lea ($inp,&DWP(0x60,$inp));
898 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
899 &pshufb ($rndkey1,$inout0); # byte swap
900 &xorps ($inout4,$inout2);
901 &movups (&QWP(0x30,$out),$inout3);
902 &xorps ($inout5,$inout1);
903 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
904 &pshufb ($rndkey0,$inout0); # byte swap
905 &movups (&QWP(0x40,$out),$inout4);
906 &pshufd ($inout0,$rndkey1,3<<6);
907 &movups (&QWP(0x50,$out),$inout5);
908 &lea ($out,&DWP(0x60,$out));
909
910 &mov ($rounds,$rounds_);
911 &pshufd ($inout1,$rndkey1,2<<6);
912 &sub ($len,6);
913 &jnc (&label("ctr32_loop6"));
914
915 &add ($len,6);
916 &jz (&label("ctr32_ret"));
917 &mov ($key,$key_);
918 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
919	&movdqa	($inout5,&QWP(32,"esp"));	# pull counter-less ivec
920
921&set_label("ctr32_tail");
922 &por ($inout0,$inout5);
923 &cmp ($len,2);
924 &jb (&label("ctr32_one"));
925
926 &pshufd ($inout2,$rndkey1,1<<6);
927 &por ($inout1,$inout5);
928 &je (&label("ctr32_two"));
929
930 &pshufd ($inout3,$rndkey0,3<<6);
931 &por ($inout2,$inout5);
932 &cmp ($len,4);
933 &jb (&label("ctr32_three"));
934
935 &pshufd ($inout4,$rndkey0,2<<6);
936 &por ($inout3,$inout5);
937 &je (&label("ctr32_four"));
938
939 &por ($inout4,$inout5);
940 &call ("_aesni_encrypt6");
941 &movups ($rndkey1,&QWP(0,$inp));
942 &movups ($rndkey0,&QWP(0x10,$inp));
943 &xorps ($inout0,$rndkey1);
944 &movups ($rndkey1,&QWP(0x20,$inp));
945 &xorps ($inout1,$rndkey0);
946 &movups ($rndkey0,&QWP(0x30,$inp));
947 &xorps ($inout2,$rndkey1);
948 &movups ($rndkey1,&QWP(0x40,$inp));
949 &xorps ($inout3,$rndkey0);
950 &movups (&QWP(0,$out),$inout0);
951 &xorps ($inout4,$rndkey1);
952 &movups (&QWP(0x10,$out),$inout1);
953 &movups (&QWP(0x20,$out),$inout2);
954 &movups (&QWP(0x30,$out),$inout3);
955 &movups (&QWP(0x40,$out),$inout4);
956 &jmp (&label("ctr32_ret"));
957
958&set_label("ctr32_one_shortcut",16);
959 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
960 &mov ($rounds,&DWP(240,$key));
961
962&set_label("ctr32_one");
963 if ($inline)
964 { &aesni_inline_generate1("enc"); }
965 else
966 { &call ("_aesni_encrypt1"); }
967 &movups ($in0,&QWP(0,$inp));
968 &xorps ($in0,$inout0);
969 &movups (&QWP(0,$out),$in0);
970 &jmp (&label("ctr32_ret"));
971
972&set_label("ctr32_two",16);
973 &call ("_aesni_encrypt3");
974 &movups ($inout3,&QWP(0,$inp));
975 &movups ($inout4,&QWP(0x10,$inp));
976 &xorps ($inout0,$inout3);
977 &xorps ($inout1,$inout4);
978 &movups (&QWP(0,$out),$inout0);
979 &movups (&QWP(0x10,$out),$inout1);
980 &jmp (&label("ctr32_ret"));
981
982&set_label("ctr32_three",16);
983 &call ("_aesni_encrypt3");
984 &movups ($inout3,&QWP(0,$inp));
985 &movups ($inout4,&QWP(0x10,$inp));
986 &xorps ($inout0,$inout3);
987 &movups ($inout5,&QWP(0x20,$inp));
988 &xorps ($inout1,$inout4);
989 &movups (&QWP(0,$out),$inout0);
990 &xorps ($inout2,$inout5);
991 &movups (&QWP(0x10,$out),$inout1);
992 &movups (&QWP(0x20,$out),$inout2);
993 &jmp (&label("ctr32_ret"));
994
995&set_label("ctr32_four",16);
996 &call ("_aesni_encrypt4");
997 &movups ($inout4,&QWP(0,$inp));
998 &movups ($inout5,&QWP(0x10,$inp));
999 &movups ($rndkey1,&QWP(0x20,$inp));
1000 &xorps ($inout0,$inout4);
1001 &movups ($rndkey0,&QWP(0x30,$inp));
1002 &xorps ($inout1,$inout5);
1003 &movups (&QWP(0,$out),$inout0);
1004 &xorps ($inout2,$rndkey1);
1005 &movups (&QWP(0x10,$out),$inout1);
1006 &xorps ($inout3,$rndkey0);
1007 &movups (&QWP(0x20,$out),$inout2);
1008 &movups (&QWP(0x30,$out),$inout3);
1009
1010&set_label("ctr32_ret");
1011 &mov ("esp",&DWP(80,"esp"));
1012&function_end("aesni_ctr32_encrypt_blocks");
1013
1014######################################################################
1015# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1016#			const AES_KEY *key1, const AES_KEY *key2,
1017# const unsigned char iv[16]);
1018#
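# The tweak schedule below multiplies the tweak by x in GF(2^128) with
# the XTS polynomial x^128+x^7+x^2+x+1; the pshufd/pand/pxor sequence
# both applies the 0x87 reduction and re-inserts the bit-63 carry that
# paddq drops between qword lanes. A byte-wise reference model,
# illustrative and not used by the generator:
sub _demo_xts_double {
	my @t=unpack("C16",shift);		# tweak, little-endian bytes
	my $carry=($t[15]>>7)&1;		# bit shifted out of bit 127
	for (my $i=15;$i>0;$i--) {
		$t[$i]=(($t[$i]<<1)|($t[$i-1]>>7))&0xff;
	}
	$t[0]=(($t[0]<<1)&0xff)^($carry?0x87:0);
	return pack("C16",@t);
}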
1019{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1020
1021&function_begin("aesni_xts_encrypt");
1022 &mov ($key,&wparam(4)); # key2
1023 &mov ($inp,&wparam(5)); # clear-text tweak
1024
1025 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1026 &movups ($inout0,&QWP(0,$inp));
1027 if ($inline)
1028 { &aesni_inline_generate1("enc"); }
1029 else
1030 { &call ("_aesni_encrypt1"); }
1031
1032 &mov ($inp,&wparam(0));
1033 &mov ($out,&wparam(1));
1034 &mov ($len,&wparam(2));
1035 &mov ($key,&wparam(3)); # key1
1036
1037 &mov ($key_,"esp");
1038 &sub ("esp",16*7+8);
1039 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1040 &and ("esp",-16); # align stack
1041
1042 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1043 &mov (&DWP(16*6+4,"esp"),0);
1044 &mov (&DWP(16*6+8,"esp"),1);
1045 &mov (&DWP(16*6+12,"esp"),0);
1046 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1047 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1048
1049 &movdqa ($tweak,$inout0);
1050 &pxor ($twtmp,$twtmp);
1051 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1052 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1053
1054 &and ($len,-16);
1055 &mov ($key_,$key); # backup $key
1056 &mov ($rounds_,$rounds); # backup $rounds
1057 &sub ($len,16*6);
1058 &jc (&label("xts_enc_short"));
1059
1060 &shr ($rounds,1);
1061 &mov ($rounds_,$rounds);
1062 &jmp (&label("xts_enc_loop6"));
1063
1064&set_label("xts_enc_loop6",16);
1065 for ($i=0;$i<4;$i++) {
1066 &pshufd ($twres,$twtmp,0x13);
1067 &pxor ($twtmp,$twtmp);
1068 &movdqa (&QWP(16*$i,"esp"),$tweak);
1069 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1070 &pand ($twres,$twmask); # isolate carry and residue
1071 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1072 &pxor ($tweak,$twres);
1073 }
1074 &pshufd ($inout5,$twtmp,0x13);
1075 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1076 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1077 &$movekey ($rndkey0,&QWP(0,$key_));
1078 &pand ($inout5,$twmask); # isolate carry and residue
1079 &movups ($inout0,&QWP(0,$inp)); # load input
1080 &pxor ($inout5,$tweak);
1081
1082 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1083 &movdqu ($inout1,&QWP(16*1,$inp));
1084 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1085 &movdqu ($inout2,&QWP(16*2,$inp));
1086 &pxor ($inout1,$rndkey0);
1087 &movdqu ($inout3,&QWP(16*3,$inp));
1088 &pxor ($inout2,$rndkey0);
1089 &movdqu ($inout4,&QWP(16*4,$inp));
1090 &pxor ($inout3,$rndkey0);
1091 &movdqu ($rndkey1,&QWP(16*5,$inp));
1092 &pxor ($inout4,$rndkey0);
1093 &lea ($inp,&DWP(16*6,$inp));
1094 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1095 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1096 &pxor ($inout5,$rndkey1);
1097
1098 &$movekey ($rndkey1,&QWP(16,$key_));
1099 &lea ($key,&DWP(32,$key_));
1100 &pxor ($inout1,&QWP(16*1,"esp"));
1101 &aesenc ($inout0,$rndkey1);
1102 &pxor ($inout2,&QWP(16*2,"esp"));
1103 &aesenc ($inout1,$rndkey1);
1104 &pxor ($inout3,&QWP(16*3,"esp"));
1105 &dec ($rounds);
1106 &aesenc ($inout2,$rndkey1);
1107 &pxor ($inout4,&QWP(16*4,"esp"));
1108 &aesenc ($inout3,$rndkey1);
1109 &pxor ($inout5,$rndkey0);
1110 &aesenc ($inout4,$rndkey1);
1111 &$movekey ($rndkey0,&QWP(0,$key));
1112 &aesenc ($inout5,$rndkey1);
1113 &call (&label("_aesni_encrypt6_enter"));
1114
1115 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1116 &pxor ($twtmp,$twtmp);
1117 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1118 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1119 &xorps ($inout1,&QWP(16*1,"esp"));
1120 &movups (&QWP(16*0,$out),$inout0); # write output
1121 &xorps ($inout2,&QWP(16*2,"esp"));
1122 &movups (&QWP(16*1,$out),$inout1);
1123 &xorps ($inout3,&QWP(16*3,"esp"));
1124 &movups (&QWP(16*2,$out),$inout2);
1125 &xorps ($inout4,&QWP(16*4,"esp"));
1126 &movups (&QWP(16*3,$out),$inout3);
1127 &xorps ($inout5,$tweak);
1128 &movups (&QWP(16*4,$out),$inout4);
1129 &pshufd ($twres,$twtmp,0x13);
1130 &movups (&QWP(16*5,$out),$inout5);
1131 &lea ($out,&DWP(16*6,$out));
1132 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1133
1134 &pxor ($twtmp,$twtmp);
1135 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1136 &pand ($twres,$twmask); # isolate carry and residue
1137 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1138 &mov ($rounds,$rounds_); # restore $rounds
1139 &pxor ($tweak,$twres);
1140
1141 &sub ($len,16*6);
1142 &jnc (&label("xts_enc_loop6"));
1143
1144 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
1145 &mov ($key,$key_); # restore $key
1146 &mov ($rounds_,$rounds);
1147
1148&set_label("xts_enc_short");
1149 &add ($len,16*6);
1150 &jz (&label("xts_enc_done6x"));
1151
1152 &movdqa ($inout3,$tweak); # put aside previous tweak
1153 &cmp ($len,0x20);
1154 &jb (&label("xts_enc_one"));
1155
1156 &pshufd ($twres,$twtmp,0x13);
1157 &pxor ($twtmp,$twtmp);
1158 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1159 &pand ($twres,$twmask); # isolate carry and residue
1160 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1161 &pxor ($tweak,$twres);
1162 &je (&label("xts_enc_two"));
1163
1164 &pshufd ($twres,$twtmp,0x13);
1165 &pxor ($twtmp,$twtmp);
1166 &movdqa ($inout4,$tweak); # put aside previous tweak
1167 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1168 &pand ($twres,$twmask); # isolate carry and residue
1169 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1170 &pxor ($tweak,$twres);
1171 &cmp ($len,0x40);
1172 &jb (&label("xts_enc_three"));
1173
1174 &pshufd ($twres,$twtmp,0x13);
1175 &pxor ($twtmp,$twtmp);
1176 &movdqa ($inout5,$tweak); # put aside previous tweak
1177 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1178 &pand ($twres,$twmask); # isolate carry and residue
1179 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1180 &pxor ($tweak,$twres);
1181 &movdqa (&QWP(16*0,"esp"),$inout3);
1182 &movdqa (&QWP(16*1,"esp"),$inout4);
1183 &je (&label("xts_enc_four"));
1184
1185 &movdqa (&QWP(16*2,"esp"),$inout5);
1186 &pshufd ($inout5,$twtmp,0x13);
1187 &movdqa (&QWP(16*3,"esp"),$tweak);
1188 &paddq ($tweak,$tweak); # &psllq($inout0,1);
1189 &pand ($inout5,$twmask); # isolate carry and residue
1190 &pxor ($inout5,$tweak);
1191
1192 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1193 &movdqu ($inout1,&QWP(16*1,$inp));
1194 &movdqu ($inout2,&QWP(16*2,$inp));
1195 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1196 &movdqu ($inout3,&QWP(16*3,$inp));
1197 &pxor ($inout1,&QWP(16*1,"esp"));
1198 &movdqu ($inout4,&QWP(16*4,$inp));
1199 &pxor ($inout2,&QWP(16*2,"esp"));
1200 &lea ($inp,&DWP(16*5,$inp));
1201 &pxor ($inout3,&QWP(16*3,"esp"));
1202 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1203 &pxor ($inout4,$inout5);
1204
1205 &call ("_aesni_encrypt6");
1206
1207 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1208 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1209 &xorps ($inout1,&QWP(16*1,"esp"));
1210 &xorps ($inout2,&QWP(16*2,"esp"));
1211 &movups (&QWP(16*0,$out),$inout0); # write output
1212 &xorps ($inout3,&QWP(16*3,"esp"));
1213 &movups (&QWP(16*1,$out),$inout1);
1214 &xorps ($inout4,$tweak);
1215 &movups (&QWP(16*2,$out),$inout2);
1216 &movups (&QWP(16*3,$out),$inout3);
1217 &movups (&QWP(16*4,$out),$inout4);
1218 &lea ($out,&DWP(16*5,$out));
1219 &jmp (&label("xts_enc_done"));
1220
1221&set_label("xts_enc_one",16);
1222 &movups ($inout0,&QWP(16*0,$inp)); # load input
1223 &lea ($inp,&DWP(16*1,$inp));
1224 &xorps ($inout0,$inout3); # input^=tweak
1225 if ($inline)
1226 { &aesni_inline_generate1("enc"); }
1227 else
1228 { &call ("_aesni_encrypt1"); }
1229 &xorps ($inout0,$inout3); # output^=tweak
1230 &movups (&QWP(16*0,$out),$inout0); # write output
1231 &lea ($out,&DWP(16*1,$out));
1232
1233 &movdqa ($tweak,$inout3); # last tweak
1234 &jmp (&label("xts_enc_done"));
1235
1236&set_label("xts_enc_two",16);
1237 &movaps ($inout4,$tweak); # put aside last tweak
1238
1239 &movups ($inout0,&QWP(16*0,$inp)); # load input
1240 &movups ($inout1,&QWP(16*1,$inp));
1241 &lea ($inp,&DWP(16*2,$inp));
1242 &xorps ($inout0,$inout3); # input^=tweak
1243 &xorps ($inout1,$inout4);
1244 &xorps ($inout2,$inout2);
1245
1246 &call ("_aesni_encrypt3");
1247
1248 &xorps ($inout0,$inout3); # output^=tweak
1249 &xorps ($inout1,$inout4);
1250 &movups (&QWP(16*0,$out),$inout0); # write output
1251 &movups (&QWP(16*1,$out),$inout1);
1252 &lea ($out,&DWP(16*2,$out));
1253
1254 &movdqa ($tweak,$inout4); # last tweak
1255 &jmp (&label("xts_enc_done"));
1256
1257&set_label("xts_enc_three",16);
1258 &movaps ($inout5,$tweak); # put aside last tweak
1259 &movups ($inout0,&QWP(16*0,$inp)); # load input
1260 &movups ($inout1,&QWP(16*1,$inp));
1261 &movups ($inout2,&QWP(16*2,$inp));
1262 &lea ($inp,&DWP(16*3,$inp));
1263 &xorps ($inout0,$inout3); # input^=tweak
1264 &xorps ($inout1,$inout4);
1265 &xorps ($inout2,$inout5);
1266
1267 &call ("_aesni_encrypt3");
1268
1269 &xorps ($inout0,$inout3); # output^=tweak
1270 &xorps ($inout1,$inout4);
1271 &xorps ($inout2,$inout5);
1272 &movups (&QWP(16*0,$out),$inout0); # write output
1273 &movups (&QWP(16*1,$out),$inout1);
1274 &movups (&QWP(16*2,$out),$inout2);
1275 &lea ($out,&DWP(16*3,$out));
1276
1277 &movdqa ($tweak,$inout5); # last tweak
1278 &jmp (&label("xts_enc_done"));
1279
1280&set_label("xts_enc_four",16);
1281 &movaps ($inout4,$tweak); # put aside last tweak
1282
1283 &movups ($inout0,&QWP(16*0,$inp)); # load input
1284 &movups ($inout1,&QWP(16*1,$inp));
1285 &movups ($inout2,&QWP(16*2,$inp));
1286 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1287 &movups ($inout3,&QWP(16*3,$inp));
1288 &lea ($inp,&DWP(16*4,$inp));
1289 &xorps ($inout1,&QWP(16*1,"esp"));
1290 &xorps ($inout2,$inout5);
1291 &xorps ($inout3,$inout4);
1292
1293 &call ("_aesni_encrypt4");
1294
1295 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1296 &xorps ($inout1,&QWP(16*1,"esp"));
1297 &xorps ($inout2,$inout5);
1298 &movups (&QWP(16*0,$out),$inout0); # write output
1299 &xorps ($inout3,$inout4);
1300 &movups (&QWP(16*1,$out),$inout1);
1301 &movups (&QWP(16*2,$out),$inout2);
1302 &movups (&QWP(16*3,$out),$inout3);
1303 &lea ($out,&DWP(16*4,$out));
1304
1305 &movdqa ($tweak,$inout4); # last tweak
1306 &jmp (&label("xts_enc_done"));
1307
1308&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1309 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1310 &and ($len,15);
1311 &jz (&label("xts_enc_ret"));
1312 &movdqa ($inout3,$tweak);
1313 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1314 &jmp (&label("xts_enc_steal"));
1315
1316&set_label("xts_enc_done",16);
1317 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1318 &pxor ($twtmp,$twtmp);
1319 &and ($len,15);
1320 &jz (&label("xts_enc_ret"));
1321
1322 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1323 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1324 &pshufd ($inout3,$twtmp,0x13);
1325 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1326 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1327 &pxor ($inout3,$tweak);
1328
1329&set_label("xts_enc_steal");
1330 &movz ($rounds,&BP(0,$inp));
1331 &movz ($key,&BP(-16,$out));
1332 &lea ($inp,&DWP(1,$inp));
1333 &mov (&BP(-16,$out),&LB($rounds));
1334 &mov (&BP(0,$out),&LB($key));
1335 &lea ($out,&DWP(1,$out));
1336 &sub ($len,1);
1337 &jnz (&label("xts_enc_steal"));
1338
1339 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1340 &mov ($key,$key_); # restore $key
1341 &mov ($rounds,$rounds_); # restore $rounds
1342
1343 &movups ($inout0,&QWP(-16,$out)); # load input
1344 &xorps ($inout0,$inout3); # input^=tweak
1345 if ($inline)
1346 { &aesni_inline_generate1("enc"); }
1347 else
1348 { &call ("_aesni_encrypt1"); }
1349 &xorps ($inout0,$inout3); # output^=tweak
1350 &movups (&QWP(-16,$out),$inout0); # write output
1351
1352&set_label("xts_enc_ret");
1353 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1354&function_end("aesni_xts_encrypt");
1355
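The xts_enc_steal loop above implements XTS ciphertext stealing: the leading bytes of the last full ciphertext block become the final partial block, the plaintext tail takes their place, and the patched block is re-encrypted with the last tweak. A minimal C sketch of the same tail handling (the names and the raw block-cipher callback are illustrative, not part of this file):

    #include <stdint.h>
    #include <string.h>

    /* Placeholder for a raw single-block AES encryption (no tweaking). */
    typedef void (*aes_block_fn)(uint8_t out[16], const uint8_t in[16]);

    /* prev points at the last full ciphertext block already written; the
     * partial block slot is the following `tail` bytes at prev+16. */
    static void xts_steal_tail(uint8_t *prev, const uint8_t *in, size_t tail,
                               const uint8_t tweak[16], aes_block_fn encrypt)
    {
        uint8_t block[16];
        int i;

        memcpy(block, prev, 16);        /* C_{m-1}, last full ciphertext block */
        memcpy(prev + 16, block, tail); /* its first `tail` bytes become C_m */
        memcpy(block, in, tail);        /* steal: overwrite with plaintext tail */
        for (i = 0; i < 16; i++)        /* tweak-XOR, encrypt, tweak-XOR */
            block[i] ^= tweak[i];
        encrypt(block, block);
        for (i = 0; i < 16; i++)
            block[i] ^= tweak[i];
        memcpy(prev, block, 16);        /* re-encrypted block replaces C_{m-1} */
    }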
1356&function_begin("aesni_xts_decrypt");
1357 &mov ($key,&wparam(4)); # key2
1358 &mov ($inp,&wparam(5)); # clear-text tweak
1359
1360 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1361 &movups ($inout0,&QWP(0,$inp));
1362 if ($inline)
1363 { &aesni_inline_generate1("enc"); }
1364 else
1365 { &call ("_aesni_encrypt1"); }
1366
1367 &mov ($inp,&wparam(0));
1368 &mov ($out,&wparam(1));
1369 &mov ($len,&wparam(2));
1370 &mov ($key,&wparam(3)); # key1
1371
1372 &mov ($key_,"esp");
1373 &sub ("esp",16*7+8);
1374 &and ("esp",-16); # align stack
1375
1376 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1377 &test ($len,15);
1378 &setnz (&LB($rounds_));
1379 &shl ($rounds_,4);
1380 &sub ($len,$rounds_);
1381
1382 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1383 &mov (&DWP(16*6+4,"esp"),0);
1384 &mov (&DWP(16*6+8,"esp"),1);
1385 &mov (&DWP(16*6+12,"esp"),0);
1386 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1387 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1388
1389 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1390 &mov ($key_,$key); # backup $key
1391 &mov ($rounds_,$rounds); # backup $rounds
1392
1393 &movdqa ($tweak,$inout0);
1394 &pxor ($twtmp,$twtmp);
1395 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1396 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1397
1398 &and ($len,-16);
1399 &sub ($len,16*6);
1400 &jc (&label("xts_dec_short"));
1401
1402 &shr ($rounds,1);
1403 &mov ($rounds_,$rounds);
1404 &jmp (&label("xts_dec_loop6"));
1405
1406&set_label("xts_dec_loop6",16);
1407 for ($i=0;$i<4;$i++) {
1408 &pshufd ($twres,$twtmp,0x13);
1409 &pxor ($twtmp,$twtmp);
1410 &movdqa (&QWP(16*$i,"esp"),$tweak);
1411 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1412 &pand ($twres,$twmask); # isolate carry and residue
1413 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1414 &pxor ($tweak,$twres);
1415 }
1416 &pshufd ($inout5,$twtmp,0x13);
1417 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1418 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1419 &$movekey ($rndkey0,&QWP(0,$key_));
1420 &pand ($inout5,$twmask); # isolate carry and residue
1421 &movups ($inout0,&QWP(0,$inp)); # load input
1422 &pxor ($inout5,$tweak);
1423
1424 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1425 &movdqu ($inout1,&QWP(16*1,$inp));
1426 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1427 &movdqu ($inout2,&QWP(16*2,$inp));
1428 &pxor ($inout1,$rndkey0);
1429 &movdqu ($inout3,&QWP(16*3,$inp));
1430 &pxor ($inout2,$rndkey0);
1431 &movdqu ($inout4,&QWP(16*4,$inp));
1432 &pxor ($inout3,$rndkey0);
1433 &movdqu ($rndkey1,&QWP(16*5,$inp));
1434 &pxor ($inout4,$rndkey0);
1435 &lea ($inp,&DWP(16*6,$inp));
1436 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1437 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1438 &pxor ($inout5,$rndkey1);
1439
1440 &$movekey ($rndkey1,&QWP(16,$key_));
1441 &lea ($key,&DWP(32,$key_));
1442 &pxor ($inout1,&QWP(16*1,"esp"));
1443 &aesdec ($inout0,$rndkey1);
1444 &pxor ($inout2,&QWP(16*2,"esp"));
1445 &aesdec ($inout1,$rndkey1);
1446 &pxor ($inout3,&QWP(16*3,"esp"));
1447 &dec ($rounds);
1448 &aesdec ($inout2,$rndkey1);
1449 &pxor ($inout4,&QWP(16*4,"esp"));
1450 &aesdec ($inout3,$rndkey1);
1451 &pxor ($inout5,$rndkey0);
1452 &aesdec ($inout4,$rndkey1);
1453 &$movekey ($rndkey0,&QWP(0,$key));
1454 &aesdec ($inout5,$rndkey1);
1455 &call (&label("_aesni_decrypt6_enter"));
1456
1457 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1458 &pxor ($twtmp,$twtmp);
1459 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1460 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1461 &xorps ($inout1,&QWP(16*1,"esp"));
1462 &movups (&QWP(16*0,$out),$inout0); # write output
1463 &xorps ($inout2,&QWP(16*2,"esp"));
1464 &movups (&QWP(16*1,$out),$inout1);
1465 &xorps ($inout3,&QWP(16*3,"esp"));
1466 &movups (&QWP(16*2,$out),$inout2);
1467 &xorps ($inout4,&QWP(16*4,"esp"));
1468 &movups (&QWP(16*3,$out),$inout3);
1469 &xorps ($inout5,$tweak);
1470 &movups (&QWP(16*4,$out),$inout4);
1471 &pshufd ($twres,$twtmp,0x13);
1472 &movups (&QWP(16*5,$out),$inout5);
1473 &lea ($out,&DWP(16*6,$out));
1474 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1475
1476 &pxor ($twtmp,$twtmp);
1477 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1478 &pand ($twres,$twmask); # isolate carry and residue
1479 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1480 &mov ($rounds,$rounds_); # restore $rounds
1481 &pxor ($tweak,$twres);
1482
1483 &sub ($len,16*6);
1484 &jnc (&label("xts_dec_loop6"));
1485
1486 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
1487 &mov ($key,$key_); # restore $key
1488 &mov ($rounds_,$rounds);
1489
1490&set_label("xts_dec_short");
1491 &add ($len,16*6);
1492 &jz (&label("xts_dec_done6x"));
1493
1494 &movdqa ($inout3,$tweak); # put aside previous tweak
1495 &cmp ($len,0x20);
1496 &jb (&label("xts_dec_one"));
1497
1498 &pshufd ($twres,$twtmp,0x13);
1499 &pxor ($twtmp,$twtmp);
1500 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1501 &pand ($twres,$twmask); # isolate carry and residue
1502 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1503 &pxor ($tweak,$twres);
1504 &je (&label("xts_dec_two"));
1505
1506 &pshufd ($twres,$twtmp,0x13);
1507 &pxor ($twtmp,$twtmp);
1508 &movdqa ($inout4,$tweak); # put aside previous tweak
1509 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1510 &pand ($twres,$twmask); # isolate carry and residue
1511 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1512 &pxor ($tweak,$twres);
1513 &cmp ($len,0x40);
1514 &jb (&label("xts_dec_three"));
1515
1516 &pshufd ($twres,$twtmp,0x13);
1517 &pxor ($twtmp,$twtmp);
1518 &movdqa ($inout5,$tweak); # put aside previous tweak
1519 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1520 &pand ($twres,$twmask); # isolate carry and residue
1521 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1522 &pxor ($tweak,$twres);
1523 &movdqa (&QWP(16*0,"esp"),$inout3);
1524 &movdqa (&QWP(16*1,"esp"),$inout4);
1525 &je (&label("xts_dec_four"));
1526
1527 &movdqa (&QWP(16*2,"esp"),$inout5);
1528 &pshufd ($inout5,$twtmp,0x13);
1529 &movdqa (&QWP(16*3,"esp"),$tweak);
1530 &paddq ($tweak,$tweak); # &psllq($inout0,1);
1531 &pand ($inout5,$twmask); # isolate carry and residue
1532 &pxor ($inout5,$tweak);
1533
1534 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1535 &movdqu ($inout1,&QWP(16*1,$inp));
1536 &movdqu ($inout2,&QWP(16*2,$inp));
1537 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1538 &movdqu ($inout3,&QWP(16*3,$inp));
1539 &pxor ($inout1,&QWP(16*1,"esp"));
1540 &movdqu ($inout4,&QWP(16*4,$inp));
1541 &pxor ($inout2,&QWP(16*2,"esp"));
1542 &lea ($inp,&DWP(16*5,$inp));
1543 &pxor ($inout3,&QWP(16*3,"esp"));
1544 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1545 &pxor ($inout4,$inout5);
1546
1547 &call ("_aesni_decrypt6");
1548
1549 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1550 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1551 &xorps ($inout1,&QWP(16*1,"esp"));
1552 &xorps ($inout2,&QWP(16*2,"esp"));
1553 &movups (&QWP(16*0,$out),$inout0); # write output
1554 &xorps ($inout3,&QWP(16*3,"esp"));
1555 &movups (&QWP(16*1,$out),$inout1);
1556 &xorps ($inout4,$tweak);
1557 &movups (&QWP(16*2,$out),$inout2);
1558 &movups (&QWP(16*3,$out),$inout3);
1559 &movups (&QWP(16*4,$out),$inout4);
1560 &lea ($out,&DWP(16*5,$out));
1561 &jmp (&label("xts_dec_done"));
1562
1563&set_label("xts_dec_one",16);
1564 &movups ($inout0,&QWP(16*0,$inp)); # load input
1565 &lea ($inp,&DWP(16*1,$inp));
1566 &xorps ($inout0,$inout3); # input^=tweak
1567 if ($inline)
1568 { &aesni_inline_generate1("dec"); }
1569 else
1570 { &call ("_aesni_decrypt1"); }
1571 &xorps ($inout0,$inout3); # output^=tweak
1572 &movups (&QWP(16*0,$out),$inout0); # write output
1573 &lea ($out,&DWP(16*1,$out));
1574
1575 &movdqa ($tweak,$inout3); # last tweak
1576 &jmp (&label("xts_dec_done"));
1577
1578&set_label("xts_dec_two",16);
1579 &movaps ($inout4,$tweak); # put aside last tweak
1580
1581 &movups ($inout0,&QWP(16*0,$inp)); # load input
1582 &movups ($inout1,&QWP(16*1,$inp));
1583 &lea ($inp,&DWP(16*2,$inp));
1584 &xorps ($inout0,$inout3); # input^=tweak
1585 &xorps ($inout1,$inout4);
1586
1587 &call ("_aesni_decrypt3");
1588
1589 &xorps ($inout0,$inout3); # output^=tweak
1590 &xorps ($inout1,$inout4);
1591 &movups (&QWP(16*0,$out),$inout0); # write output
1592 &movups (&QWP(16*1,$out),$inout1);
1593 &lea ($out,&DWP(16*2,$out));
1594
1595 &movdqa ($tweak,$inout4); # last tweak
1596 &jmp (&label("xts_dec_done"));
1597
1598&set_label("xts_dec_three",16);
1599 &movaps ($inout5,$tweak); # put aside last tweak
1600 &movups ($inout0,&QWP(16*0,$inp)); # load input
1601 &movups ($inout1,&QWP(16*1,$inp));
1602 &movups ($inout2,&QWP(16*2,$inp));
1603 &lea ($inp,&DWP(16*3,$inp));
1604 &xorps ($inout0,$inout3); # input^=tweak
1605 &xorps ($inout1,$inout4);
1606 &xorps ($inout2,$inout5);
1607
1608 &call ("_aesni_decrypt3");
1609
1610 &xorps ($inout0,$inout3); # output^=tweak
1611 &xorps ($inout1,$inout4);
1612 &xorps ($inout2,$inout5);
1613 &movups (&QWP(16*0,$out),$inout0); # write output
1614 &movups (&QWP(16*1,$out),$inout1);
1615 &movups (&QWP(16*2,$out),$inout2);
1616 &lea ($out,&DWP(16*3,$out));
1617
1618 &movdqa ($tweak,$inout5); # last tweak
1619 &jmp (&label("xts_dec_done"));
1620
1621&set_label("xts_dec_four",16);
1622 &movaps ($inout4,$tweak); # put aside last tweak
1623
1624 &movups ($inout0,&QWP(16*0,$inp)); # load input
1625 &movups ($inout1,&QWP(16*1,$inp));
1626 &movups ($inout2,&QWP(16*2,$inp));
1627 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1628 &movups ($inout3,&QWP(16*3,$inp));
1629 &lea ($inp,&DWP(16*4,$inp));
1630 &xorps ($inout1,&QWP(16*1,"esp"));
1631 &xorps ($inout2,$inout5);
1632 &xorps ($inout3,$inout4);
1633
1634 &call ("_aesni_decrypt4");
1635
1636 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1637 &xorps ($inout1,&QWP(16*1,"esp"));
1638 &xorps ($inout2,$inout5);
1639 &movups (&QWP(16*0,$out),$inout0); # write output
1640 &xorps ($inout3,$inout4);
1641 &movups (&QWP(16*1,$out),$inout1);
1642 &movups (&QWP(16*2,$out),$inout2);
1643 &movups (&QWP(16*3,$out),$inout3);
1644 &lea ($out,&DWP(16*4,$out));
1645
1646 &movdqa ($tweak,$inout4); # last tweak
1647 &jmp (&label("xts_dec_done"));
1648
1649&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
1650 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1651 &and ($len,15);
1652 &jz (&label("xts_dec_ret"));
1653 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1654 &jmp (&label("xts_dec_only_one_more"));
1655
1656&set_label("xts_dec_done",16);
1657 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1658 &pxor ($twtmp,$twtmp);
1659 &and ($len,15);
1660 &jz (&label("xts_dec_ret"));
1661
1662 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1663 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1664 &pshufd ($twres,$twtmp,0x13);
1665 &pxor ($twtmp,$twtmp);
1666 &movdqa ($twmask,&QWP(16*6,"esp"));
1667 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1668 &pand ($twres,$twmask); # isolate carry and residue
1669 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1670 &pxor ($tweak,$twres);
1671
1672&set_label("xts_dec_only_one_more");
1673 &pshufd ($inout3,$twtmp,0x13);
1674 &movdqa ($inout4,$tweak); # put aside previous tweak
1675 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1676 &pand ($inout3,$twmask); # isolate carry and residue
1677 &pxor ($inout3,$tweak);
1678
1679 &mov ($key,$key_); # restore $key
1680 &mov ($rounds,$rounds_); # restore $rounds
1681
1682 &movups ($inout0,&QWP(0,$inp)); # load input
1683 &xorps ($inout0,$inout3); # input^=tweak
1684 if ($inline)
1685 { &aesni_inline_generate1("dec"); }
1686 else
1687 { &call ("_aesni_decrypt1"); }
1688 &xorps ($inout0,$inout3); # output^=tweak
1689 &movups (&QWP(0,$out),$inout0); # write output
1690
1691&set_label("xts_dec_steal");
1692 &movz ($rounds,&BP(16,$inp));
1693 &movz ($key,&BP(0,$out));
1694 &lea ($inp,&DWP(1,$inp));
1695 &mov (&BP(0,$out),&LB($rounds));
1696 &mov (&BP(16,$out),&LB($key));
1697 &lea ($out,&DWP(1,$out));
1698 &sub ($len,1);
1699 &jnz (&label("xts_dec_steal"));
1700
1701 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1702 &mov ($key,$key_); # restore $key
1703 &mov ($rounds,$rounds_); # restore $rounds
1704
1705 &movups ($inout0,&QWP(0,$out)); # load input
1706 &xorps ($inout0,$inout4); # input^=tweak
1707 if ($inline)
1708 { &aesni_inline_generate1("dec"); }
1709 else
1710 { &call ("_aesni_decrypt1"); }
1711 &xorps ($inout0,$inout4); # output^=tweak
1712 &movups (&QWP(0,$out),$inout0); # write output
1713
1714&set_label("xts_dec_ret");
1715 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1716&function_end("aesni_xts_decrypt");
1717}
1718}
1719
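Both XTS routines advance the tweak by multiplying it by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1; that is what the recurring pcmpgtd/pshufd/pand/paddq/pxor sequence computes branch-free. pcmpgtd turns each dword's sign bit into an all-ones mask, pshufd 0x13 routes the bit-127 mask to the low dword and the bit-63 mask to dword 2, pand against the (0x87,0,1,0) constant composed above turns those masks into the 0x87 residue and the inter-half carry, paddq shifts both 64-bit halves left by one, and pxor folds the carries back in. A scalar C equivalent (a sketch; assumes a little-endian host, like the asm):

    #include <stdint.h>
    #include <string.h>

    /* Multiply the 128-bit XTS tweak by x: shift left one bit and, if
     * bit 127 fell off, xor the low byte with the residue 0x87. */
    static void xts_mul_x(uint8_t t[16])
    {
        uint64_t lo, hi, carry_lo, carry_hi;

        memcpy(&lo, t, 8);      /* little-endian 64-bit halves */
        memcpy(&hi, t + 8, 8);
        carry_lo = lo >> 63;    /* bit 63 carries into the high half */
        carry_hi = hi >> 63;    /* bit 127 selects the 0x87 residue */
        lo = (lo << 1) ^ (carry_hi * 0x87);
        hi = (hi << 1) | carry_lo;
        memcpy(t, &lo, 8);
        memcpy(t + 8, &hi, 8);
    }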
1720######################################################################
1721# void $PREFIX_cbc_encrypt (const void *inp, void *out,
1722# size_t length, const AES_KEY *key,
1723# unsigned char *ivp,const int enc);
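A hedged usage sketch of this entry point (with $PREFIX at its default "aesni"; the wrapper name and the 128-bit key size are illustrative). The routine writes the final chaining value back through ivp, and both prototypes below are taken from the comments in this file:

    #include <stddef.h>
    #include <openssl/aes.h>        /* AES_KEY */

    int  aesni_set_encrypt_key(const unsigned char *userKey, int bits,
                               AES_KEY *key);
    void aesni_cbc_encrypt(const void *inp, void *out, size_t length,
                           const AES_KEY *key, unsigned char *ivp,
                           const int enc);

    /* Encrypt len bytes (a multiple of 16 for standard CBC) in place;
     * iv is updated to the last ciphertext block for chained calls. */
    static void cbc_encrypt_buf(unsigned char *buf, size_t len,
                                const unsigned char k[16], unsigned char iv[16])
    {
        AES_KEY key;
        aesni_set_encrypt_key(k, 128, &key);           /* expand once */
        aesni_cbc_encrypt(buf, buf, len, &key, iv, 1); /* enc=1: encrypt */
    }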
1724&function_begin("${PREFIX}_cbc_encrypt");
1725 &mov ($inp,&wparam(0));
1726 &mov ($rounds_,"esp");
1727 &mov ($out,&wparam(1));
1728 &sub ($rounds_,24);
1729 &mov ($len,&wparam(2));
1730 &and ($rounds_,-16);
1731 &mov ($key,&wparam(3));
1732 &mov ($key_,&wparam(4));
1733 &test ($len,$len);
1734 &jz (&label("cbc_abort"));
1735
1736 &cmp (&wparam(5),0);
1737 &xchg ($rounds_,"esp"); # alloca
1738 &movups ($ivec,&QWP(0,$key_)); # load IV
1739 &mov ($rounds,&DWP(240,$key));
1740 &mov ($key_,$key); # backup $key
1741 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
1742 &mov ($rounds_,$rounds); # backup $rounds
1743 &je (&label("cbc_decrypt"));
1744
1745 &movaps ($inout0,$ivec);
1746 &cmp ($len,16);
1747 &jb (&label("cbc_enc_tail"));
1748 &sub ($len,16);
1749 &jmp (&label("cbc_enc_loop"));
1750
1751&set_label("cbc_enc_loop",16);
1752 &movups ($ivec,&QWP(0,$inp)); # input actually
1753 &lea ($inp,&DWP(16,$inp));
1754 if ($inline)
1755 { &aesni_inline_generate1("enc",$inout0,$ivec); }
1756 else
1757 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
1758 &mov ($rounds,$rounds_); # restore $rounds
1759 &mov ($key,$key_); # restore $key
1760 &movups (&QWP(0,$out),$inout0); # store output
1761 &lea ($out,&DWP(16,$out));
1762 &sub ($len,16);
1763 &jnc (&label("cbc_enc_loop"));
1764 &add ($len,16);
1765 &jnz (&label("cbc_enc_tail"));
1766 &movaps ($ivec,$inout0);
1767 &jmp (&label("cbc_ret"));
1768
1769&set_label("cbc_enc_tail");
1770 &mov ("ecx",$len); # zaps $rounds
1771 &data_word(0xA4F3F689); # rep movsb
1772 &mov ("ecx",16); # zero tail
1773 &sub ("ecx",$len);
1774 &xor ("eax","eax"); # zaps $len
1775 &data_word(0xAAF3F689); # rep stosb
1776 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
1777 &mov ($rounds,$rounds_); # restore $rounds
1778 &mov ($inp,$out); # $inp and $out are the same
1779 &mov ($key,$key_); # restore $key
1780 &jmp (&label("cbc_enc_loop"));
1781######################################################################
1782&set_label("cbc_decrypt",16);
1783 &cmp ($len,0x50);
1784 &jbe (&label("cbc_dec_tail"));
1785 &movaps (&QWP(0,"esp"),$ivec); # save IV
1786 &sub ($len,0x50);
1787 &jmp (&label("cbc_dec_loop6_enter"));
1788
1789&set_label("cbc_dec_loop6",16);
1790 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
1791 &movups (&QWP(0,$out),$inout5);
1792 &lea ($out,&DWP(0x10,$out));
1793&set_label("cbc_dec_loop6_enter");
1794 &movdqu ($inout0,&QWP(0,$inp));
1795 &movdqu ($inout1,&QWP(0x10,$inp));
1796 &movdqu ($inout2,&QWP(0x20,$inp));
1797 &movdqu ($inout3,&QWP(0x30,$inp));
1798 &movdqu ($inout4,&QWP(0x40,$inp));
1799 &movdqu ($inout5,&QWP(0x50,$inp));
1800
1801 &call ("_aesni_decrypt6");
1802
1803 &movups ($rndkey1,&QWP(0,$inp));
1804 &movups ($rndkey0,&QWP(0x10,$inp));
1805 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
1806 &xorps ($inout1,$rndkey1);
1807 &movups ($rndkey1,&QWP(0x20,$inp));
1808 &xorps ($inout2,$rndkey0);
1809 &movups ($rndkey0,&QWP(0x30,$inp));
1810 &xorps ($inout3,$rndkey1);
1811 &movups ($rndkey1,&QWP(0x40,$inp));
1812 &xorps ($inout4,$rndkey0);
1813 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
1814 &xorps ($inout5,$rndkey1);
1815 &movups (&QWP(0,$out),$inout0);
1816 &movups (&QWP(0x10,$out),$inout1);
1817 &lea ($inp,&DWP(0x60,$inp));
1818 &movups (&QWP(0x20,$out),$inout2);
1819	&mov	($rounds,$rounds_);	# restore $rounds
1820 &movups (&QWP(0x30,$out),$inout3);
1821 &mov ($key,$key_); # restore $key
1822 &movups (&QWP(0x40,$out),$inout4);
1823 &lea ($out,&DWP(0x50,$out));
1824 &sub ($len,0x60);
1825 &ja (&label("cbc_dec_loop6"));
1826
1827 &movaps ($inout0,$inout5);
1828 &movaps ($ivec,$rndkey0);
1829 &add ($len,0x50);
1830 &jle (&label("cbc_dec_tail_collected"));
1831 &movups (&QWP(0,$out),$inout0);
1832 &lea ($out,&DWP(0x10,$out));
1833&set_label("cbc_dec_tail");
1834 &movups ($inout0,&QWP(0,$inp));
1835 &movaps ($in0,$inout0);
1836 &cmp ($len,0x10);
1837 &jbe (&label("cbc_dec_one"));
1838
1839 &movups ($inout1,&QWP(0x10,$inp));
1840 &movaps ($in1,$inout1);
1841 &cmp ($len,0x20);
1842 &jbe (&label("cbc_dec_two"));
1843
1844 &movups ($inout2,&QWP(0x20,$inp));
1845 &cmp ($len,0x30);
1846 &jbe (&label("cbc_dec_three"));
1847
1848 &movups ($inout3,&QWP(0x30,$inp));
1849 &cmp ($len,0x40);
1850 &jbe (&label("cbc_dec_four"));
1851
1852 &movups ($inout4,&QWP(0x40,$inp));
1853 &movaps (&QWP(0,"esp"),$ivec); # save IV
1854 &movups ($inout0,&QWP(0,$inp));
1855 &xorps ($inout5,$inout5);
1856 &call ("_aesni_decrypt6");
1857 &movups ($rndkey1,&QWP(0,$inp));
1858 &movups ($rndkey0,&QWP(0x10,$inp));
1859 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
1860 &xorps ($inout1,$rndkey1);
1861 &movups ($rndkey1,&QWP(0x20,$inp));
1862 &xorps ($inout2,$rndkey0);
1863 &movups ($rndkey0,&QWP(0x30,$inp));
1864 &xorps ($inout3,$rndkey1);
1865 &movups ($ivec,&QWP(0x40,$inp)); # IV
1866 &xorps ($inout4,$rndkey0);
1867 &movups (&QWP(0,$out),$inout0);
1868 &movups (&QWP(0x10,$out),$inout1);
1869 &movups (&QWP(0x20,$out),$inout2);
1870 &movups (&QWP(0x30,$out),$inout3);
1871 &lea ($out,&DWP(0x40,$out));
1872 &movaps ($inout0,$inout4);
1873 &sub ($len,0x50);
1874 &jmp (&label("cbc_dec_tail_collected"));
1875
1876&set_label("cbc_dec_one",16);
1877 if ($inline)
1878 { &aesni_inline_generate1("dec"); }
1879 else
1880 { &call ("_aesni_decrypt1"); }
1881 &xorps ($inout0,$ivec);
1882 &movaps ($ivec,$in0);
1883 &sub ($len,0x10);
1884 &jmp (&label("cbc_dec_tail_collected"));
1885
1886&set_label("cbc_dec_two",16);
1887 &xorps ($inout2,$inout2);
1888 &call ("_aesni_decrypt3");
1889 &xorps ($inout0,$ivec);
1890 &xorps ($inout1,$in0);
1891 &movups (&QWP(0,$out),$inout0);
1892 &movaps ($inout0,$inout1);
1893 &lea ($out,&DWP(0x10,$out));
1894 &movaps ($ivec,$in1);
1895 &sub ($len,0x20);
1896 &jmp (&label("cbc_dec_tail_collected"));
1897
1898&set_label("cbc_dec_three",16);
1899 &call ("_aesni_decrypt3");
1900 &xorps ($inout0,$ivec);
1901 &xorps ($inout1,$in0);
1902 &xorps ($inout2,$in1);
1903 &movups (&QWP(0,$out),$inout0);
1904 &movaps ($inout0,$inout2);
1905 &movups (&QWP(0x10,$out),$inout1);
1906 &lea ($out,&DWP(0x20,$out));
1907 &movups ($ivec,&QWP(0x20,$inp));
1908 &sub ($len,0x30);
1909 &jmp (&label("cbc_dec_tail_collected"));
1910
1911&set_label("cbc_dec_four",16);
1912 &call ("_aesni_decrypt4");
1913 &movups ($rndkey1,&QWP(0x10,$inp));
1914 &movups ($rndkey0,&QWP(0x20,$inp));
1915 &xorps ($inout0,$ivec);
1916 &movups ($ivec,&QWP(0x30,$inp));
1917 &xorps ($inout1,$in0);
1918 &movups (&QWP(0,$out),$inout0);
1919 &xorps ($inout2,$rndkey1);
1920 &movups (&QWP(0x10,$out),$inout1);
1921 &xorps ($inout3,$rndkey0);
1922 &movups (&QWP(0x20,$out),$inout2);
1923 &lea ($out,&DWP(0x30,$out));
1924 &movaps ($inout0,$inout3);
1925 &sub ($len,0x40);
1926
1927&set_label("cbc_dec_tail_collected");
1928 &and ($len,15);
1929 &jnz (&label("cbc_dec_tail_partial"));
1930 &movups (&QWP(0,$out),$inout0);
1931 &jmp (&label("cbc_ret"));
1932
1933&set_label("cbc_dec_tail_partial",16);
1934 &movaps (&QWP(0,"esp"),$inout0);
1935 &mov ("ecx",16);
1936 &mov ($inp,"esp");
1937 &sub ("ecx",$len);
1938 &data_word(0xA4F3F689); # rep movsb
1939
1940&set_label("cbc_ret");
1941 &mov ("esp",&DWP(16,"esp")); # pull original %esp
1942 &mov ($key_,&wparam(4));
1943 &movups (&QWP(0,$key_),$ivec); # output IV
1944&set_label("cbc_abort");
1945&function_end("${PREFIX}_cbc_encrypt");
1946
1947######################################################################
1948# Mechanical port from aesni-x86_64.pl.
1949#
1950# _aesni_set_encrypt_key is private interface,
1951# input:
1952# "eax" const unsigned char *userKey
1953# $rounds int bits
1954# $key AES_KEY *key
1955# output:
1956# "eax" return code
1957#	$rounds	rounds
1958
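The schedule layout this interface fills in matches OpenSSL's AES_KEY: round keys packed back to back with the round count at byte offset 240, which is where the &DWP(240,$key) loads throughout this file come from. For the 128-bit path the arithmetic works out as: $key is advanced by 16 up front, nine key_128 calls advance it another 144 bytes, so the &DWP(80,$key) store lands at 16 + 144 + 80 = 240. Note that, per the "rounds-1" comment in set_decrypt_key below, the value stored is 9/11/13 rather than 10/12/14. A sketch of the assumed layout (the type name is illustrative):

    #include <stdint.h>

    typedef struct {
        uint32_t rd_key[4 * (14 + 1)]; /* up to 15 round keys = 240 bytes */
        int rounds;                    /* byte offset 240; 9/11/13 here */
    } aesni_key_sketch;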
1959&function_begin_B("_aesni_set_encrypt_key");
1960 &test ("eax","eax");
1961 &jz (&label("bad_pointer"));
1962 &test ($key,$key);
1963 &jz (&label("bad_pointer"));
1964
1965 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
1966 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
1967 &lea ($key,&DWP(16,$key));
1968 &cmp ($rounds,256);
1969 &je (&label("14rounds"));
1970 &cmp ($rounds,192);
1971 &je (&label("12rounds"));
1972 &cmp ($rounds,128);
1973 &jne (&label("bad_keybits"));
1974
1975&set_label("10rounds",16);
1976 &mov ($rounds,9);
1977 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
1978 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
1979 &call (&label("key_128_cold"));
1980 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
1981 &call (&label("key_128"));
1982 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
1983 &call (&label("key_128"));
1984 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
1985 &call (&label("key_128"));
1986 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
1987 &call (&label("key_128"));
1988 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
1989 &call (&label("key_128"));
1990 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
1991 &call (&label("key_128"));
1992 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
1993 &call (&label("key_128"));
1994 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
1995 &call (&label("key_128"));
1996 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
1997 &call (&label("key_128"));
1998 &$movekey (&QWP(0,$key),"xmm0");
1999 &mov (&DWP(80,$key),$rounds);
2000 &xor ("eax","eax");
2001 &ret();
2002
2003&set_label("key_128",16);
2004 &$movekey (&QWP(0,$key),"xmm0");
2005 &lea ($key,&DWP(16,$key));
2006&set_label("key_128_cold");
2007 &shufps ("xmm4","xmm0",0b00010000);
2008 &xorps ("xmm0","xmm4");
2009 &shufps ("xmm4","xmm0",0b10001100);
2010 &xorps ("xmm0","xmm4");
2011 &shufps ("xmm1","xmm1",0b11111111); # critical path
2012 &xorps ("xmm0","xmm1");
2013 &ret();
2014
2015&set_label("12rounds",16);
2016 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
2017 &mov ($rounds,11);
2018	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
2019 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
2020 &call (&label("key_192a_cold"));
2021 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
2022 &call (&label("key_192b"));
2023 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
2024 &call (&label("key_192a"));
2025 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
2026 &call (&label("key_192b"));
2027 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
2028 &call (&label("key_192a"));
2029 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
2030 &call (&label("key_192b"));
2031 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
2032 &call (&label("key_192a"));
2033 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
2034 &call (&label("key_192b"));
2035 &$movekey (&QWP(0,$key),"xmm0");
2036 &mov (&DWP(48,$key),$rounds);
2037 &xor ("eax","eax");
2038 &ret();
2039
2040&set_label("key_192a",16);
2041 &$movekey (&QWP(0,$key),"xmm0");
2042 &lea ($key,&DWP(16,$key));
2043&set_label("key_192a_cold",16);
2044 &movaps ("xmm5","xmm2");
2045&set_label("key_192b_warm");
2046 &shufps ("xmm4","xmm0",0b00010000);
2047 &movdqa ("xmm3","xmm2");
2048 &xorps ("xmm0","xmm4");
2049 &shufps ("xmm4","xmm0",0b10001100);
2050 &pslldq ("xmm3",4);
2051 &xorps ("xmm0","xmm4");
2052 &pshufd ("xmm1","xmm1",0b01010101); # critical path
2053 &pxor ("xmm2","xmm3");
2054 &pxor ("xmm0","xmm1");
2055 &pshufd ("xmm3","xmm0",0b11111111);
2056 &pxor ("xmm2","xmm3");
2057 &ret();
2058
2059&set_label("key_192b",16);
2060 &movaps ("xmm3","xmm0");
2061 &shufps ("xmm5","xmm0",0b01000100);
2062 &$movekey (&QWP(0,$key),"xmm5");
2063 &shufps ("xmm3","xmm2",0b01001110);
2064 &$movekey (&QWP(16,$key),"xmm3");
2065 &lea ($key,&DWP(32,$key));
2066 &jmp (&label("key_192b_warm"));
2067
2068&set_label("14rounds",16);
2069 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
2070 &mov ($rounds,13);
2071 &lea ($key,&DWP(16,$key));
2072 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
2073 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
2074 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
2075 &call (&label("key_256a_cold"));
2076 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
2077 &call (&label("key_256b"));
2078 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
2079 &call (&label("key_256a"));
2080 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
2081 &call (&label("key_256b"));
2082 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
2083 &call (&label("key_256a"));
2084 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
2085 &call (&label("key_256b"));
2086 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
2087 &call (&label("key_256a"));
2088 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
2089 &call (&label("key_256b"));
2090 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
2091 &call (&label("key_256a"));
2092 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
2093 &call (&label("key_256b"));
2094 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
2095 &call (&label("key_256a"));
2096 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
2097 &call (&label("key_256b"));
2098 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
2099 &call (&label("key_256a"));
2100 &$movekey (&QWP(0,$key),"xmm0");
2101 &mov (&DWP(16,$key),$rounds);
2102 &xor ("eax","eax");
2103 &ret();
2104
2105&set_label("key_256a",16);
2106 &$movekey (&QWP(0,$key),"xmm2");
2107 &lea ($key,&DWP(16,$key));
2108&set_label("key_256a_cold");
2109 &shufps ("xmm4","xmm0",0b00010000);
2110 &xorps ("xmm0","xmm4");
2111 &shufps ("xmm4","xmm0",0b10001100);
2112 &xorps ("xmm0","xmm4");
2113 &shufps ("xmm1","xmm1",0b11111111); # critical path
2114 &xorps ("xmm0","xmm1");
2115 &ret();
2116
2117&set_label("key_256b",16);
2118 &$movekey (&QWP(0,$key),"xmm0");
2119 &lea ($key,&DWP(16,$key));
2120
2121 &shufps ("xmm4","xmm2",0b00010000);
2122 &xorps ("xmm2","xmm4");
2123 &shufps ("xmm4","xmm2",0b10001100);
2124 &xorps ("xmm2","xmm4");
2125 &shufps ("xmm1","xmm1",0b10101010); # critical path
2126 &xorps ("xmm2","xmm1");
2127 &ret();
2128
2129&set_label("bad_pointer",4);
2130 &mov ("eax",-1);
2131 &ret ();
2132&set_label("bad_keybits",4);
2133 &mov ("eax",-2);
2134 &ret ();
2135&function_end_B("_aesni_set_encrypt_key");
2136
2137# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
2138# AES_KEY *key)
2139&function_begin_B("${PREFIX}_set_encrypt_key");
2140 &mov ("eax",&wparam(0));
2141 &mov ($rounds,&wparam(1));
2142 &mov ($key,&wparam(2));
2143 &call ("_aesni_set_encrypt_key");
2144 &ret ();
2145&function_end_B("${PREFIX}_set_encrypt_key");
2146
2147# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
2148# AES_KEY *key)
2149&function_begin_B("${PREFIX}_set_decrypt_key");
2150 &mov ("eax",&wparam(0));
2151 &mov ($rounds,&wparam(1));
2152 &mov ($key,&wparam(2));
2153 &call ("_aesni_set_encrypt_key");
2154 &mov ($key,&wparam(2));
2155	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
2156 &test ("eax","eax");
2157 &jnz (&label("dec_key_ret"));
2158 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
2159
2160 &$movekey ("xmm0",&QWP(0,$key)); # just swap
2161 &$movekey ("xmm1",&QWP(0,"eax"));
2162 &$movekey (&QWP(0,"eax"),"xmm0");
2163 &$movekey (&QWP(0,$key),"xmm1");
2164 &lea ($key,&DWP(16,$key));
2165 &lea ("eax",&DWP(-16,"eax"));
2166
2167&set_label("dec_key_inverse");
2168 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
2169 &$movekey ("xmm1",&QWP(0,"eax"));
2170 &aesimc ("xmm0","xmm0");
2171 &aesimc ("xmm1","xmm1");
2172 &lea ($key,&DWP(16,$key));
2173 &lea ("eax",&DWP(-16,"eax"));
2174 &$movekey (&QWP(16,"eax"),"xmm0");
2175 &$movekey (&QWP(-16,$key),"xmm1");
2176 &cmp ("eax",$key);
2177 &ja (&label("dec_key_inverse"));
2178
2179 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
2180 &aesimc ("xmm0","xmm0");
2181 &$movekey (&QWP(0,$key),"xmm0");
2182
2183 &xor ("eax","eax"); # return success
2184&set_label("dec_key_ret");
2185 &ret ();
2186&function_end_B("${PREFIX}_set_decrypt_key");
2187&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
2188
2189&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
new file mode 100644
index 0000000000..49e0f4b351
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
@@ -0,0 +1,992 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for the Intel AES-NI extension. In
11# the OpenSSL context it's used with the Intel engine, but it can also
12# be used as a drop-in replacement for crypto/aes/asm/aes-x86_64.pl
13# [see below for details].
14
15$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
16 # generates drop-in replacement for
17 # crypto/aes/asm/aes-x86_64.pl:-)
18
19$flavour = shift;
20$output = shift;
21if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
22
23$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
24
25$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
26( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
27( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
28die "can't locate x86_64-xlate.pl";
29
30open STDOUT,"| $^X $xlate $flavour $output";
31
32$movkey = $PREFIX eq "aesni" ? "movaps" : "movups";
33@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
34 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
35
36$code=".text\n";
37
38$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
39# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
40$inp="%rdi";
41$out="%rsi";
42$len="%rdx";
43$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
44$ivp="%r8"; # cbc
45
46$rnds_="%r10d"; # backup copy for $rounds
47$key_="%r11"; # backup copy for $key
48
49# %xmm register layout
50$inout0="%xmm0"; $inout1="%xmm1";
51$inout2="%xmm2"; $inout3="%xmm3";
52$rndkey0="%xmm4"; $rndkey1="%xmm5";
53
54$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt
55$in1="%xmm8"; $in2="%xmm9";
56
57# Inline version of internal aesni_[en|de]crypt1.
58#
59# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
60# cycles which take care of loop variables...
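For comparison, the same folded loop expressed with compiler intrinsics (a sketch, not what this generator emits; rk[] is the expanded schedule and rounds the usual 10/12/14 count):

    #include <wmmintrin.h>      /* AES-NI intrinsics */

    static __m128i aes_encrypt1(__m128i block, const __m128i *rk, int rounds)
    {
        int i;
        block = _mm_xor_si128(block, rk[0]);            /* pxor whitening */
        for (i = 1; i < rounds; i++)
            block = _mm_aesenc_si128(block, rk[i]);     /* one aesenc per round */
        return _mm_aesenclast_si128(block, rk[rounds]); /* final round */
    }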
61{ my $sn;
62sub aesni_generate1 {
63my ($p,$key,$rounds)=@_;
64++$sn;
65$code.=<<___;
66 $movkey ($key),$rndkey0
67 $movkey 16($key),$rndkey1
68 lea 32($key),$key
69 pxor $rndkey0,$inout0
70.Loop_${p}1_$sn:
71 aes${p} $rndkey1,$inout0
72 dec $rounds
73 $movkey ($key),$rndkey1
74 lea 16($key),$key
75 jnz .Loop_${p}1_$sn # loop body is 16 bytes
76 aes${p}last $rndkey1,$inout0
77___
78}}
79# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
80#
81{ my ($inp,$out,$key) = @_4args;
82
83$code.=<<___;
84.globl ${PREFIX}_encrypt
85.type ${PREFIX}_encrypt,\@abi-omnipotent
86.align 16
87${PREFIX}_encrypt:
88 movups ($inp),$inout0 # load input
89 mov 240($key),$rounds # pull $rounds
90___
91 &aesni_generate1("enc",$key,$rounds);
92$code.=<<___;
93 movups $inout0,($out) # output
94 ret
95.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
96
97.globl ${PREFIX}_decrypt
98.type ${PREFIX}_decrypt,\@abi-omnipotent
99.align 16
100${PREFIX}_decrypt:
101 movups ($inp),$inout0 # load input
102 mov 240($key),$rounds # pull $rounds
103___
104 &aesni_generate1("dec",$key,$rounds);
105$code.=<<___;
106 movups $inout0,($out) # output
107 ret
108.size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
109___
110}
111
112# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
113# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
114# latency is 6, it turned out that it can be scheduled only every
115# *second* cycle. Thus 3x interleave is the one providing optimal
116# utilization, i.e. the subroutine's throughput is virtually the same as
117# that of the non-interleaved subroutine [for up to 3 input blocks].
118# This is why it makes no sense to implement a 2x subroutine. As soon
119# as/if Intel improves throughput by making it possible to schedule
120# the instructions in question *every* cycle, I would have to
121# implement 6x interleave and use it in the loop...
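In other words, with ~6 cycles of latency and an issue rate of one aes[enc|dec] every 2 cycles, three independent blocks are exactly enough to hide the latency (3 x 2 = 6). An intrinsics sketch of the idea (illustrative, not the generated code):

    #include <wmmintrin.h>

    /* Three independent dependency chains keep the AES unit saturated. */
    static void aes_encrypt3(__m128i b[3], const __m128i *rk, int rounds)
    {
        __m128i x0 = _mm_xor_si128(b[0], rk[0]);
        __m128i x1 = _mm_xor_si128(b[1], rk[0]);
        __m128i x2 = _mm_xor_si128(b[2], rk[0]);
        int i;

        for (i = 1; i < rounds; i++) {
            x0 = _mm_aesenc_si128(x0, rk[i]); /* these three do not depend */
            x1 = _mm_aesenc_si128(x1, rk[i]); /* on each other, so they    */
            x2 = _mm_aesenc_si128(x2, rk[i]); /* overlap in the pipeline   */
        }
        b[0] = _mm_aesenclast_si128(x0, rk[rounds]);
        b[1] = _mm_aesenclast_si128(x1, rk[rounds]);
        b[2] = _mm_aesenclast_si128(x2, rk[rounds]);
    }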
122sub aesni_generate3 {
123my $dir=shift;
124# As already mentioned it takes in $key and $rounds, which are *not*
125# preserved. $inout[0-2] is cipher/clear text...
126$code.=<<___;
127.type _aesni_${dir}rypt3,\@abi-omnipotent
128.align 16
129_aesni_${dir}rypt3:
130 $movkey ($key),$rndkey0
131 shr \$1,$rounds
132 $movkey 16($key),$rndkey1
133 lea 32($key),$key
134 pxor $rndkey0,$inout0
135 pxor $rndkey0,$inout1
136 pxor $rndkey0,$inout2
137
138.L${dir}_loop3:
139 aes${dir} $rndkey1,$inout0
140 $movkey ($key),$rndkey0
141 aes${dir} $rndkey1,$inout1
142 dec $rounds
143 aes${dir} $rndkey1,$inout2
144 aes${dir} $rndkey0,$inout0
145 $movkey 16($key),$rndkey1
146 aes${dir} $rndkey0,$inout1
147 lea 32($key),$key
148 aes${dir} $rndkey0,$inout2
149 jnz .L${dir}_loop3
150
151 aes${dir} $rndkey1,$inout0
152 $movkey ($key),$rndkey0
153 aes${dir} $rndkey1,$inout1
154 aes${dir} $rndkey1,$inout2
155 aes${dir}last $rndkey0,$inout0
156 aes${dir}last $rndkey0,$inout1
157 aes${dir}last $rndkey0,$inout2
158 ret
159.size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
160___
161}
162# 4x interleave is implemented to improve small block performance,
163# most notably [and naturally] the 4-block case by ~30%. One can argue that one
164# should have implemented 5x as well, but improvement would be <20%,
165# so it's not worth it...
166sub aesni_generate4 {
167my $dir=shift;
168# As already mentioned it takes in $key and $rounds, which are *not*
169# preserved. $inout[0-3] is cipher/clear text...
170$code.=<<___;
171.type _aesni_${dir}rypt4,\@abi-omnipotent
172.align 16
173_aesni_${dir}rypt4:
174 $movkey ($key),$rndkey0
175 shr \$1,$rounds
176 $movkey 16($key),$rndkey1
177 lea 32($key),$key
178 pxor $rndkey0,$inout0
179 pxor $rndkey0,$inout1
180 pxor $rndkey0,$inout2
181 pxor $rndkey0,$inout3
182
183.L${dir}_loop4:
184 aes${dir} $rndkey1,$inout0
185 $movkey ($key),$rndkey0
186 aes${dir} $rndkey1,$inout1
187 dec $rounds
188 aes${dir} $rndkey1,$inout2
189 aes${dir} $rndkey1,$inout3
190 aes${dir} $rndkey0,$inout0
191 $movkey 16($key),$rndkey1
192 aes${dir} $rndkey0,$inout1
193 lea 32($key),$key
194 aes${dir} $rndkey0,$inout2
195 aes${dir} $rndkey0,$inout3
196 jnz .L${dir}_loop4
197
198 aes${dir} $rndkey1,$inout0
199 $movkey ($key),$rndkey0
200 aes${dir} $rndkey1,$inout1
201 aes${dir} $rndkey1,$inout2
202 aes${dir} $rndkey1,$inout3
203 aes${dir}last $rndkey0,$inout0
204 aes${dir}last $rndkey0,$inout1
205 aes${dir}last $rndkey0,$inout2
206 aes${dir}last $rndkey0,$inout3
207 ret
208.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
209___
210}
211&aesni_generate3("enc") if ($PREFIX eq "aesni");
212&aesni_generate3("dec");
213&aesni_generate4("enc") if ($PREFIX eq "aesni");
214&aesni_generate4("dec");
215
216if ($PREFIX eq "aesni") {
217# void aesni_ecb_encrypt (const void *in, void *out,
218# size_t length, const AES_KEY *key,
219# int enc);
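A hedged usage sketch (the wrapper name is illustrative); note that the routine truncates length to whole 16-byte blocks via the "and \$-16,$len" below, and enc selects between the encrypt and decrypt halves:

    #include <stddef.h>
    #include <openssl/aes.h>

    void aesni_ecb_encrypt(const void *in, void *out, size_t length,
                           const AES_KEY *key, int enc);

    /* Decrypt len bytes in place; dkey must be a *decryption* schedule. */
    static void ecb_decrypt_buf(unsigned char *buf, size_t len,
                                const AES_KEY *dkey)
    {
        aesni_ecb_encrypt(buf, buf, len, dkey, 0); /* enc=0: .Lecb_decrypt */
    }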
220$code.=<<___;
221.globl aesni_ecb_encrypt
222.type aesni_ecb_encrypt,\@function,5
223.align 16
224aesni_ecb_encrypt:
225 cmp \$16,$len # check length
226 jb .Lecb_ret
227
228 mov 240($key),$rounds # pull $rounds
229 and \$-16,$len
230 mov $key,$key_ # backup $key
231 test %r8d,%r8d # 5th argument
232 mov $rounds,$rnds_ # backup $rounds
233 jz .Lecb_decrypt
234#--------------------------- ECB ENCRYPT ------------------------------#
235 sub \$0x40,$len
236 jbe .Lecb_enc_tail
237 jmp .Lecb_enc_loop3
238.align 16
239.Lecb_enc_loop3:
240 movups ($inp),$inout0
241 movups 0x10($inp),$inout1
242 movups 0x20($inp),$inout2
243 call _aesni_encrypt3
244 sub \$0x30,$len
245 lea 0x30($inp),$inp
246 lea 0x30($out),$out
247 movups $inout0,-0x30($out)
248 mov $rnds_,$rounds # restore $rounds
249 movups $inout1,-0x20($out)
250 mov $key_,$key # restore $key
251 movups $inout2,-0x10($out)
252 ja .Lecb_enc_loop3
253
254.Lecb_enc_tail:
255 add \$0x40,$len
256 jz .Lecb_ret
257
258 cmp \$0x10,$len
259 movups ($inp),$inout0
260 je .Lecb_enc_one
261 cmp \$0x20,$len
262 movups 0x10($inp),$inout1
263 je .Lecb_enc_two
264 cmp \$0x30,$len
265 movups 0x20($inp),$inout2
266 je .Lecb_enc_three
267 movups 0x30($inp),$inout3
268 call _aesni_encrypt4
269 movups $inout0,($out)
270 movups $inout1,0x10($out)
271 movups $inout2,0x20($out)
272 movups $inout3,0x30($out)
273 jmp .Lecb_ret
274.align 16
275.Lecb_enc_one:
276___
277 &aesni_generate1("enc",$key,$rounds);
278$code.=<<___;
279 movups $inout0,($out)
280 jmp .Lecb_ret
281.align 16
282.Lecb_enc_two:
283 call _aesni_encrypt3
284 movups $inout0,($out)
285 movups $inout1,0x10($out)
286 jmp .Lecb_ret
287.align 16
288.Lecb_enc_three:
289 call _aesni_encrypt3
290 movups $inout0,($out)
291 movups $inout1,0x10($out)
292 movups $inout2,0x20($out)
293 jmp .Lecb_ret
294#--------------------------- ECB DECRYPT ------------------------------#
295.align 16
296.Lecb_decrypt:
297 sub \$0x40,$len
298 jbe .Lecb_dec_tail
299 jmp .Lecb_dec_loop3
300.align 16
301.Lecb_dec_loop3:
302 movups ($inp),$inout0
303 movups 0x10($inp),$inout1
304 movups 0x20($inp),$inout2
305 call _aesni_decrypt3
306 sub \$0x30,$len
307 lea 0x30($inp),$inp
308 lea 0x30($out),$out
309 movups $inout0,-0x30($out)
310 mov $rnds_,$rounds # restore $rounds
311 movups $inout1,-0x20($out)
312 mov $key_,$key # restore $key
313 movups $inout2,-0x10($out)
314 ja .Lecb_dec_loop3
315
316.Lecb_dec_tail:
317 add \$0x40,$len
318 jz .Lecb_ret
319
320 cmp \$0x10,$len
321 movups ($inp),$inout0
322 je .Lecb_dec_one
323 cmp \$0x20,$len
324 movups 0x10($inp),$inout1
325 je .Lecb_dec_two
326 cmp \$0x30,$len
327 movups 0x20($inp),$inout2
328 je .Lecb_dec_three
329 movups 0x30($inp),$inout3
330 call _aesni_decrypt4
331 movups $inout0,($out)
332 movups $inout1,0x10($out)
333 movups $inout2,0x20($out)
334 movups $inout3,0x30($out)
335 jmp .Lecb_ret
336.align 16
337.Lecb_dec_one:
338___
339 &aesni_generate1("dec",$key,$rounds);
340$code.=<<___;
341 movups $inout0,($out)
342 jmp .Lecb_ret
343.align 16
344.Lecb_dec_two:
345 call _aesni_decrypt3
346 movups $inout0,($out)
347 movups $inout1,0x10($out)
348 jmp .Lecb_ret
349.align 16
350.Lecb_dec_three:
351 call _aesni_decrypt3
352 movups $inout0,($out)
353 movups $inout1,0x10($out)
354 movups $inout2,0x20($out)
355
356.Lecb_ret:
357 ret
358.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
359___
360}
361
362# void $PREFIX_cbc_encrypt (const void *inp, void *out,
363# size_t length, const AES_KEY *key,
364# unsigned char *ivp,const int enc);
365$reserved = $win64?0x40:-0x18; # used in decrypt
366$code.=<<___;
367.globl ${PREFIX}_cbc_encrypt
368.type ${PREFIX}_cbc_encrypt,\@function,6
369.align 16
370${PREFIX}_cbc_encrypt:
371 test $len,$len # check length
372 jz .Lcbc_ret
373
374 mov 240($key),$rnds_ # pull $rounds
375 mov $key,$key_ # backup $key
376 test %r9d,%r9d # 6th argument
377 jz .Lcbc_decrypt
378#--------------------------- CBC ENCRYPT ------------------------------#
379 movups ($ivp),$inout0 # load iv as initial state
380 cmp \$16,$len
381 mov $rnds_,$rounds
382 jb .Lcbc_enc_tail
383 sub \$16,$len
384 jmp .Lcbc_enc_loop
385.align 16
386.Lcbc_enc_loop:
387 movups ($inp),$inout1 # load input
388 lea 16($inp),$inp
389 pxor $inout1,$inout0
390___
391 &aesni_generate1("enc",$key,$rounds);
392$code.=<<___;
393 sub \$16,$len
394 lea 16($out),$out
395 mov $rnds_,$rounds # restore $rounds
396 mov $key_,$key # restore $key
397 movups $inout0,-16($out) # store output
398 jnc .Lcbc_enc_loop
399 add \$16,$len
400 jnz .Lcbc_enc_tail
401 movups $inout0,($ivp)
402 jmp .Lcbc_ret
403
404.Lcbc_enc_tail:
405 mov $len,%rcx # zaps $key
406 xchg $inp,$out # $inp is %rsi and $out is %rdi now
407 .long 0x9066A4F3 # rep movsb
408 mov \$16,%ecx # zero tail
409 sub $len,%rcx
410 xor %eax,%eax
411 .long 0x9066AAF3 # rep stosb
412 lea -16(%rdi),%rdi # rewind $out by 1 block
413 mov $rnds_,$rounds # restore $rounds
414 mov %rdi,%rsi # $inp and $out are the same
415 mov $key_,$key # restore $key
416 xor $len,$len # len=16
417 jmp .Lcbc_enc_loop # one more spin
418#--------------------------- CBC DECRYPT ------------------------------#
419.align 16
420.Lcbc_decrypt:
421___
422$code.=<<___ if ($win64);
423 lea -0x58(%rsp),%rsp
424 movaps %xmm6,(%rsp)
425 movaps %xmm7,0x10(%rsp)
426 movaps %xmm8,0x20(%rsp)
427 movaps %xmm9,0x30(%rsp)
428.Lcbc_decrypt_body:
429___
430$code.=<<___;
431 movups ($ivp),$iv
432 sub \$0x40,$len
433 mov $rnds_,$rounds
434 jbe .Lcbc_dec_tail
435 jmp .Lcbc_dec_loop3
436.align 16
437.Lcbc_dec_loop3:
438 movups ($inp),$inout0
439 movups 0x10($inp),$inout1
440 movups 0x20($inp),$inout2
441 movaps $inout0,$in0
442 movaps $inout1,$in1
443 movaps $inout2,$in2
444 call _aesni_decrypt3
445 sub \$0x30,$len
446 lea 0x30($inp),$inp
447 lea 0x30($out),$out
448 pxor $iv,$inout0
449 pxor $in0,$inout1
450 movaps $in2,$iv
451 pxor $in1,$inout2
452 movups $inout0,-0x30($out)
453 mov $rnds_,$rounds # restore $rounds
454 movups $inout1,-0x20($out)
455 mov $key_,$key # restore $key
456 movups $inout2,-0x10($out)
457 ja .Lcbc_dec_loop3
458
459.Lcbc_dec_tail:
460 add \$0x40,$len
461 movups $iv,($ivp)
462 jz .Lcbc_dec_ret
463
464 movups ($inp),$inout0
465 cmp \$0x10,$len
466 movaps $inout0,$in0
467 jbe .Lcbc_dec_one
468 movups 0x10($inp),$inout1
469 cmp \$0x20,$len
470 movaps $inout1,$in1
471 jbe .Lcbc_dec_two
472 movups 0x20($inp),$inout2
473 cmp \$0x30,$len
474 movaps $inout2,$in2
475 jbe .Lcbc_dec_three
476 movups 0x30($inp),$inout3
477 call _aesni_decrypt4
478 pxor $iv,$inout0
479 movups 0x30($inp),$iv
480 pxor $in0,$inout1
481 movups $inout0,($out)
482 pxor $in1,$inout2
483 movups $inout1,0x10($out)
484 pxor $in2,$inout3
485 movups $inout2,0x20($out)
486 movaps $inout3,$inout0
487 lea 0x30($out),$out
488 jmp .Lcbc_dec_tail_collected
489.align 16
490.Lcbc_dec_one:
491___
492 &aesni_generate1("dec",$key,$rounds);
493$code.=<<___;
494 pxor $iv,$inout0
495 movaps $in0,$iv
496 jmp .Lcbc_dec_tail_collected
497.align 16
498.Lcbc_dec_two:
499 call _aesni_decrypt3
500 pxor $iv,$inout0
501 pxor $in0,$inout1
502 movups $inout0,($out)
503 movaps $in1,$iv
504 movaps $inout1,$inout0
505 lea 0x10($out),$out
506 jmp .Lcbc_dec_tail_collected
507.align 16
508.Lcbc_dec_three:
509 call _aesni_decrypt3
510 pxor $iv,$inout0
511 pxor $in0,$inout1
512 movups $inout0,($out)
513 pxor $in1,$inout2
514 movups $inout1,0x10($out)
515 movaps $in2,$iv
516 movaps $inout2,$inout0
517 lea 0x20($out),$out
518 jmp .Lcbc_dec_tail_collected
519.align 16
520.Lcbc_dec_tail_collected:
521 and \$15,$len
522 movups $iv,($ivp)
523 jnz .Lcbc_dec_tail_partial
524 movups $inout0,($out)
525 jmp .Lcbc_dec_ret
526.Lcbc_dec_tail_partial:
527 movaps $inout0,$reserved(%rsp)
528 mov $out,%rdi
529 mov $len,%rcx
530 lea $reserved(%rsp),%rsi
531 .long 0x9066A4F3 # rep movsb
532
533.Lcbc_dec_ret:
534___
535$code.=<<___ if ($win64);
536 movaps (%rsp),%xmm6
537 movaps 0x10(%rsp),%xmm7
538 movaps 0x20(%rsp),%xmm8
539 movaps 0x30(%rsp),%xmm9
540 lea 0x58(%rsp),%rsp
541___
542$code.=<<___;
543.Lcbc_ret:
544 ret
545.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
546___
547
548# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
549# int bits, AES_KEY *key)
550{ my ($inp,$bits,$key) = @_4args;
551 $bits =~ s/%r/%e/;
552
553$code.=<<___;
554.globl ${PREFIX}_set_decrypt_key
555.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
556.align 16
557${PREFIX}_set_decrypt_key:
558 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
559 call _aesni_set_encrypt_key
560 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
561 test %eax,%eax
562 jnz .Ldec_key_ret
563 lea 16($key,$bits),$inp # points at the end of key schedule
564
565 $movkey ($key),%xmm0 # just swap
566 $movkey ($inp),%xmm1
567 $movkey %xmm0,($inp)
568 $movkey %xmm1,($key)
569 lea 16($key),$key
570 lea -16($inp),$inp
571
572.Ldec_key_inverse:
573 $movkey ($key),%xmm0 # swap and inverse
574 $movkey ($inp),%xmm1
575 aesimc %xmm0,%xmm0
576 aesimc %xmm1,%xmm1
577 lea 16($key),$key
578 lea -16($inp),$inp
579 cmp $key,$inp
580 $movkey %xmm0,16($inp)
581 $movkey %xmm1,-16($key)
582 ja .Ldec_key_inverse
583
584 $movkey ($key),%xmm0 # inverse middle
585 aesimc %xmm0,%xmm0
586 $movkey %xmm0,($inp)
587.Ldec_key_ret:
588 add \$8,%rsp
589 ret
590.LSEH_end_set_decrypt_key:
591.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
592___
593
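What the swap-and-aesimc loop above computes, in intrinsics form (a sketch in standard rk[0..rounds] terms): the decryption schedule is the encryption schedule in reverse order, with InvMixColumns applied to every round key except the two outer ones.

    #include <wmmintrin.h>

    static void make_decrypt_schedule(__m128i dec[], const __m128i enc[],
                                      int rounds)
    {
        int i;

        dec[0] = enc[rounds];  /* outer keys are only swapped */
        for (i = 1; i < rounds; i++)
            dec[i] = _mm_aesimc_si128(enc[rounds - i]); /* InvMixColumns */
        dec[rounds] = enc[0];
    }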
594# This is based on submission by
595#
596# Huang Ying <ying.huang@intel.com>
597# Vinodh Gopal <vinodh.gopal@intel.com>
598# Kahraman Akdemir
599#
600# Aggressively optimized with respect to aeskeygenassist's critical path,
601# and contained entirely in %xmm0-5 to meet the Win64 ABI requirement.
602#
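One AES-128 expansion step in intrinsics form, for comparison (a sketch; the generated code replaces the three pslldq/pxor prefix-xor steps below with two shufps/xorps pairs against the zeroed %xmm4, which is the critical-path optimization referred to above):

    #include <wmmintrin.h>

    static __m128i key_expand_128(__m128i key, __m128i assist)
    {
        assist = _mm_shuffle_epi32(assist, 0xff);         /* broadcast rcon word */
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); /* prefix-xor of the */
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); /* four 32-bit words */
        key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
        return _mm_xor_si128(key, assist);
    }
    /* round 1: k1 = key_expand_128(k0, _mm_aeskeygenassist_si128(k0, 0x01)); */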
603$code.=<<___;
604.globl ${PREFIX}_set_encrypt_key
605.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
606.align 16
607${PREFIX}_set_encrypt_key:
608_aesni_set_encrypt_key:
609 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
610 test $inp,$inp
611 mov \$-1,%rax
612 jz .Lenc_key_ret
613 test $key,$key
614 jz .Lenc_key_ret
615
616 movups ($inp),%xmm0 # pull first 128 bits of *userKey
617 pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0
618 lea 16($key),%rax
619 cmp \$256,$bits
620 je .L14rounds
621 cmp \$192,$bits
622 je .L12rounds
623 cmp \$128,$bits
624 jne .Lbad_keybits
625
626.L10rounds:
627 mov \$9,$bits # 10 rounds for 128-bit key
628 $movkey %xmm0,($key) # round 0
629 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
630 call .Lkey_expansion_128_cold
631 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
632 call .Lkey_expansion_128
633 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
634 call .Lkey_expansion_128
635 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
636 call .Lkey_expansion_128
637 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
638 call .Lkey_expansion_128
639 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
640 call .Lkey_expansion_128
641 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
642 call .Lkey_expansion_128
643 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
644 call .Lkey_expansion_128
645 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
646 call .Lkey_expansion_128
647 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
648 call .Lkey_expansion_128
649 $movkey %xmm0,(%rax)
650 mov $bits,80(%rax) # 240(%rdx)
651 xor %eax,%eax
652 jmp .Lenc_key_ret
653
654.align 16
655.L12rounds:
656 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
657 mov \$11,$bits # 12 rounds for 192
658 $movkey %xmm0,($key) # round 0
659 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
660 call .Lkey_expansion_192a_cold
661 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
662 call .Lkey_expansion_192b
663 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
664 call .Lkey_expansion_192a
665 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
666 call .Lkey_expansion_192b
667 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
668 call .Lkey_expansion_192a
669 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
670 call .Lkey_expansion_192b
671 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
672 call .Lkey_expansion_192a
673 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
674 call .Lkey_expansion_192b
675 $movkey %xmm0,(%rax)
676 mov $bits,48(%rax) # 240(%rdx)
677	xor	%eax,%eax
678 jmp .Lenc_key_ret
679
680.align 16
681.L14rounds:
682	movups	16($inp),%xmm2			# remaining half of *userKey
683 mov \$13,$bits # 14 rounds for 256
684 lea 16(%rax),%rax
685 $movkey %xmm0,($key) # round 0
686 $movkey %xmm2,16($key) # round 1
687 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
688 call .Lkey_expansion_256a_cold
689 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
690 call .Lkey_expansion_256b
691 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
692 call .Lkey_expansion_256a
693 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
694 call .Lkey_expansion_256b
695 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
696 call .Lkey_expansion_256a
697 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
698 call .Lkey_expansion_256b
699 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
700 call .Lkey_expansion_256a
701 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
702 call .Lkey_expansion_256b
703 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
704 call .Lkey_expansion_256a
705 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
706 call .Lkey_expansion_256b
707 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
708 call .Lkey_expansion_256a
709 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
710 call .Lkey_expansion_256b
711 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
712 call .Lkey_expansion_256a
713 $movkey %xmm0,(%rax)
714 mov $bits,16(%rax) # 240(%rdx)
715	xor	%eax,%eax
716 jmp .Lenc_key_ret
717
718.align 16
719.Lbad_keybits:
720 mov \$-2,%rax
721.Lenc_key_ret:
722 add \$8,%rsp
723 ret
724.LSEH_end_set_encrypt_key:
725
726.align 16
727.Lkey_expansion_128:
728 $movkey %xmm0,(%rax)
729 lea 16(%rax),%rax
730.Lkey_expansion_128_cold:
731 shufps \$0b00010000,%xmm0,%xmm4
732 pxor %xmm4, %xmm0
733 shufps \$0b10001100,%xmm0,%xmm4
734 pxor %xmm4, %xmm0
735 pshufd \$0b11111111,%xmm1,%xmm1 # critical path
736 pxor %xmm1,%xmm0
737 ret
738
739.align 16
740.Lkey_expansion_192a:
741 $movkey %xmm0,(%rax)
742 lea 16(%rax),%rax
743.Lkey_expansion_192a_cold:
744 movaps %xmm2, %xmm5
745.Lkey_expansion_192b_warm:
746 shufps \$0b00010000,%xmm0,%xmm4
747 movaps %xmm2,%xmm3
748 pxor %xmm4,%xmm0
749 shufps \$0b10001100,%xmm0,%xmm4
750 pslldq \$4,%xmm3
751 pxor %xmm4,%xmm0
752 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
753 pxor %xmm3,%xmm2
754 pxor %xmm1,%xmm0
755 pshufd \$0b11111111,%xmm0,%xmm3
756 pxor %xmm3,%xmm2
757 ret
758
759.align 16
760.Lkey_expansion_192b:
761 movaps %xmm0,%xmm3
762 shufps \$0b01000100,%xmm0,%xmm5
763 $movkey %xmm5,(%rax)
764 shufps \$0b01001110,%xmm2,%xmm3
765 $movkey %xmm3,16(%rax)
766 lea 32(%rax),%rax
767 jmp .Lkey_expansion_192b_warm
768
769.align 16
770.Lkey_expansion_256a:
771 $movkey %xmm2,(%rax)
772 lea 16(%rax),%rax
773.Lkey_expansion_256a_cold:
774 shufps \$0b00010000,%xmm0,%xmm4
775 pxor %xmm4,%xmm0
776 shufps \$0b10001100,%xmm0,%xmm4
777 pxor %xmm4,%xmm0
778 pshufd \$0b11111111,%xmm1,%xmm1 # critical path
779 pxor %xmm1,%xmm0
780 ret
781
782.align 16
783.Lkey_expansion_256b:
784 $movkey %xmm0,(%rax)
785 lea 16(%rax),%rax
786
787 shufps \$0b00010000,%xmm2,%xmm4
788 pxor %xmm4,%xmm2
789 shufps \$0b10001100,%xmm2,%xmm4
790 pxor %xmm4,%xmm2
791 pshufd \$0b10101010,%xmm1,%xmm1 # critical path
792 pxor %xmm1,%xmm2
793 ret
794.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
795___
796}
797
798$code.=<<___;
799.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
800.align 64
801___
802
803# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
804# CONTEXT *context,DISPATCHER_CONTEXT *disp)
805if ($win64) {
806$rec="%rcx";
807$frame="%rdx";
808$context="%r8";
809$disp="%r9";
810
811$code.=<<___;
812.extern __imp_RtlVirtualUnwind
813.type cbc_se_handler,\@abi-omnipotent
814.align 16
815cbc_se_handler:
816 push %rsi
817 push %rdi
818 push %rbx
819 push %rbp
820 push %r12
821 push %r13
822 push %r14
823 push %r15
824 pushfq
825 sub \$64,%rsp
826
827 mov 152($context),%rax # pull context->Rsp
828 mov 248($context),%rbx # pull context->Rip
829
830 lea .Lcbc_decrypt(%rip),%r10
831 cmp %r10,%rbx # context->Rip<"prologue" label
832 jb .Lin_prologue
833
834 lea .Lcbc_decrypt_body(%rip),%r10
835 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
836 jb .Lrestore_rax
837
838 lea .Lcbc_ret(%rip),%r10
839 cmp %r10,%rbx # context->Rip>="epilogue" label
840 jae .Lin_prologue
841
842 lea 0(%rax),%rsi # top of stack
843 lea 512($context),%rdi # &context.Xmm6
844 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
845 .long 0xa548f3fc # cld; rep movsq
846 lea 0x58(%rax),%rax # adjust stack pointer
847 jmp .Lin_prologue
848
849.Lrestore_rax:
850 mov 120($context),%rax
851.Lin_prologue:
852 mov 8(%rax),%rdi
853 mov 16(%rax),%rsi
854 mov %rax,152($context) # restore context->Rsp
855 mov %rsi,168($context) # restore context->Rsi
856 mov %rdi,176($context) # restore context->Rdi
857
858 jmp .Lcommon_seh_exit
859.size cbc_se_handler,.-cbc_se_handler
860
861.type ecb_se_handler,\@abi-omnipotent
862.align 16
863ecb_se_handler:
864 push %rsi
865 push %rdi
866 push %rbx
867 push %rbp
868 push %r12
869 push %r13
870 push %r14
871 push %r15
872 pushfq
873 sub \$64,%rsp
874
875 mov 152($context),%rax # pull context->Rsp
876 mov 8(%rax),%rdi
877 mov 16(%rax),%rsi
878 mov %rsi,168($context) # restore context->Rsi
879 mov %rdi,176($context) # restore context->Rdi
880
881.Lcommon_seh_exit:
882
883 mov 40($disp),%rdi # disp->ContextRecord
884 mov $context,%rsi # context
885 mov \$154,%ecx # sizeof(CONTEXT)
886 .long 0xa548f3fc # cld; rep movsq
887
888 mov $disp,%rsi
889 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
890 mov 8(%rsi),%rdx # arg2, disp->ImageBase
891 mov 0(%rsi),%r8 # arg3, disp->ControlPc
892 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
893 mov 40(%rsi),%r10 # disp->ContextRecord
894 lea 56(%rsi),%r11 # &disp->HandlerData
895 lea 24(%rsi),%r12 # &disp->EstablisherFrame
896 mov %r10,32(%rsp) # arg5
897 mov %r11,40(%rsp) # arg6
898 mov %r12,48(%rsp) # arg7
899 mov %rcx,56(%rsp) # arg8, (NULL)
900 call *__imp_RtlVirtualUnwind(%rip)
901
902 mov \$1,%eax # ExceptionContinueSearch
903 add \$64,%rsp
904 popfq
905 pop %r15
906 pop %r14
907 pop %r13
908 pop %r12
909 pop %rbp
910 pop %rbx
911 pop %rdi
912 pop %rsi
913 ret
914.size ecb_se_handler,.-ecb_se_handler
915
916.section .pdata
917.align 4
918 .rva .LSEH_begin_${PREFIX}_ecb_encrypt
919 .rva .LSEH_end_${PREFIX}_ecb_encrypt
920 .rva .LSEH_info_ecb
921
922 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
923 .rva .LSEH_end_${PREFIX}_cbc_encrypt
924 .rva .LSEH_info_cbc
925
926 .rva ${PREFIX}_set_decrypt_key
927 .rva .LSEH_end_set_decrypt_key
928 .rva .LSEH_info_key
929
930 .rva ${PREFIX}_set_encrypt_key
931 .rva .LSEH_end_set_encrypt_key
932 .rva .LSEH_info_key
933.section .xdata
934.align 8
935.LSEH_info_ecb:
936 .byte 9,0,0,0
937 .rva ecb_se_handler
938.LSEH_info_cbc:
939 .byte 9,0,0,0
940 .rva cbc_se_handler
941.LSEH_info_key:
942 .byte 0x01,0x04,0x01,0x00
943 .byte 0x04,0x02,0x00,0x00
944___
945}
946
947sub rex {
948 local *opcode=shift;
949 my ($dst,$src)=@_;
950
951 if ($dst>=8 || $src>=8) {
952 $rex=0x40;
953 $rex|=0x04 if($dst>=8);
954 $rex|=0x01 if($src>=8);
955 push @opcode,$rex;
956 }
957}
958
959sub aesni {
960 my $line=shift;
961 my @opcode=(0x66);
962
963 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
964 rex(\@opcode,$4,$3);
965 push @opcode,0x0f,0x3a,0xdf;
966 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
967 my $c=$2;
968 push @opcode,$c=~/^0/?oct($c):$c;
969 return ".byte\t".join(',',@opcode);
970 }
971 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
972 my %opcodelet = (
973 "aesimc" => 0xdb,
974 "aesenc" => 0xdc, "aesenclast" => 0xdd,
975 "aesdec" => 0xde, "aesdeclast" => 0xdf
976 );
977 return undef if (!defined($opcodelet{$1}));
978 rex(\@opcode,$3,$2);
979 push @opcode,0x0f,0x38,$opcodelet{$1};
980 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
981 return ".byte\t".join(',',@opcode);
982 }
983 return $line;
984}
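# For reference: the translator above emits e.g. "aesenc %xmm1,%xmm0"
# as ".byte 0x66,0x0f,0x38,0xdc,0xc1", i.e. the 0x66 prefix, the opcode
# bytes 0x0f,0x38,0xdc, and ModR/M 0xc0|1|(0<<3)=0xc1 placing xmm0 in
# the reg field and xmm1 in r/m, so the output assembles even with
# assemblers that predate the AES-NI mnemonics.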
985
986$code =~ s/\`([^\`]*)\`/eval($1)/gem;
987$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
988
989print $code;
990
991close STDOUT;
992
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
new file mode 100644
index 0000000000..c9c6312fa7
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
@@ -0,0 +1,3044 @@
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode] ###
5### bitsliced implementation for Intel Core 2 processors ###
6### requires support of SSE extensions up to SSSE3 ###
7### Author: Emilia Käsper and Peter Schwabe ###
8### Date: 2009-03-19 ###
9### Public domain ###
10### ###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12### further information. ###
13###################################################################
14#
15# September 2011.
16#
17# Started as a transliteration to "perlasm", the original code has
18# undergone the following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop resulting in >5x size reduction
22# from 12.5KB to 2.2KB;
23# - the above was possible thanks to a mixcolumns() modification that
24#   allowed its output to be fed back to aesenc[last]; this was
25#   achieved at the cost of two additional inter-register moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement key setup subroutine, instead it
28# relies on conversion of "conventional" key schedule as returned
29# by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which made it
31#   possible to skip one shiftrows(), reduce the bit-sliced key
32#   schedule and speed up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38# Emilia's this(*) difference
39#
40# Core 2 9.30 8.69 +7%
41# Nehalem(**) 7.63 6.98 +9%
42# Atom 17.1 17.4 -2%(***)
43#
44# (*)	Comparison is not completely fair, because "this" is ECB,
45#	i.e. no extra processing such as counter value calculation
46#	and xor-ing of input as in Emilia's CTR implementation is
47#	performed. However, the CTR calculations account for no more
48#	than 1% of total time, so the comparison is *rather* fair.
49#
50# (**) Results were collected on Westmere, which is considered to
51# be equivalent to Nehalem for this code.
52#
53# (***)	The slowdown on Atom is rather strange per se, because the
54#	original implementation has a number of 9+-byte instructions,
55#	which are bad for the Atom front-end, and which I eliminated
56#	completely. In an attempt to address the deterioration, sbox()
57#	was tested in the FP SIMD "domain" (movaps instead of movdqa,
58#	xorps instead of pxor, etc.). While it resulted in a nominal 4%
59#	improvement on Atom, it hurt Westmere by more than a 2x factor.
60#
61# As for the key schedule conversion subroutine: the OpenSSL interface
62# relies on per-invocation on-the-fly conversion, which naturally has
63# an impact on performance, especially for short inputs. Conversion
64# time in CPU cycles and its ratio to CPU cycles spent in the 8x block
65# function is:
66#
67# conversion conversion/8x block
68# Core 2 240 0.22
69# Nehalem 180 0.20
70# Atom 430 0.19
71#
72# The ratio values mean that 128-byte blocks will be processed
73# 16-18% slower, 256-byte blocks 9-10% slower, 384-byte blocks 6-7%,
74# etc. Also keep in mind that input sizes not divisible by 128 are
75# *effectively* slower, especially the shortest ones, e.g. consecutive
76# 144-byte blocks are processed 44% slower than one would expect,
77# 272-byte ones 29%, 400-byte ones 22%, etc. Yet, despite all these
78# "shortcomings" it's still faster than the ["hyper-threading-safe"
79# code path in] aes-x86_64.pl on all lengths above 64 bytes...
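# (Back-of-envelope check of the first figures above: for n consecutive
# 128-byte chunks the conversion overhead is ratio/(n+ratio), e.g. on
# Core 2 0.22/1.22 ~= 18%, 0.22/2.22 ~= 10%, 0.22/3.22 ~= 7%, in line
# with the quoted 16-18%, 9-10% and 6-7%.)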
80#
81# October 2011.
82#
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of 4096-byte buffer with 128-bit key is:
85#
86# Core 2 11.0
87# Nehalem 9.16
88# Atom 20.9
89#
90# November 2011.
91#
92# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
93# suboptimal, but XTS is meant to be used with larger blocks...
94#
95# <appro@openssl.org>
96
97$flavour = shift;
98$output = shift;
99if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100
101$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102
103$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106die "can't locate x86_64-xlate.pl";
107
108open STDOUT,"| $^X $xlate $flavour $output";
109
110my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8");
111my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
112my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
113
114{
115my ($key,$rounds,$const)=("%rax","%r10d","%r11");
116
117sub Sbox {
118# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
119# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
120my @b=@_[0..7];
121my @t=@_[8..11];
122my @s=@_[12..15];
123 &InBasisChange (@b);
124 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
125 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
126}
127
128sub InBasisChange {
129# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
130# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
131my @b=@_[0..7];
132$code.=<<___;
133 pxor @b[6], @b[5]
134 pxor @b[1], @b[2]
135 pxor @b[0], @b[3]
136 pxor @b[2], @b[6]
137 pxor @b[0], @b[5]
138
139 pxor @b[3], @b[6]
140 pxor @b[7], @b[3]
141 pxor @b[5], @b[7]
142 pxor @b[4], @b[3]
143 pxor @b[5], @b[4]
144 pxor @b[1], @b[3]
145
146 pxor @b[7], @b[2]
147 pxor @b[5], @b[1]
148___
149}
150
151sub OutBasisChange {
152# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
153# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
154my @b=@_[0..7];
155$code.=<<___;
156 pxor @b[6], @b[0]
157 pxor @b[4], @b[1]
158 pxor @b[0], @b[2]
159 pxor @b[6], @b[4]
160 pxor @b[1], @b[6]
161
162 pxor @b[5], @b[1]
163 pxor @b[3], @b[5]
164 pxor @b[7], @b[3]
165 pxor @b[5], @b[7]
166 pxor @b[5], @b[2]
167
168 pxor @b[7], @b[4]
169___
170}
171
172sub InvSbox {
173# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
174# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
175my @b=@_[0..7];
176my @t=@_[8..11];
177my @s=@_[12..15];
178 &InvInBasisChange (@b);
179 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
180 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
181}
182
183sub InvInBasisChange { # OutBasisChange in reverse
184my @b=@_[5,1,2,6,3,7,0,4];
185$code.=<<___;
186 pxor @b[7], @b[4]
187
188 pxor @b[5], @b[7]
189 pxor @b[5], @b[2]
190 pxor @b[7], @b[3]
191 pxor @b[3], @b[5]
192 pxor @b[5], @b[1]
193
194 pxor @b[1], @b[6]
195 pxor @b[0], @b[2]
196 pxor @b[6], @b[4]
197 pxor @b[6], @b[0]
198 pxor @b[4], @b[1]
199___
200}
201
202sub InvOutBasisChange { # InBasisChange in reverse
203my @b=@_[2,5,7,3,6,1,0,4];
204$code.=<<___;
205 pxor @b[5], @b[1]
206 pxor @b[7], @b[2]
207
208 pxor @b[1], @b[3]
209 pxor @b[5], @b[4]
210 pxor @b[5], @b[7]
211 pxor @b[4], @b[3]
212 pxor @b[0], @b[5]
213 pxor @b[7], @b[3]
214 pxor @b[2], @b[6]
215 pxor @b[1], @b[2]
216 pxor @b[3], @b[6]
217
218 pxor @b[0], @b[3]
219 pxor @b[6], @b[5]
220___
221}
222
223sub Mul_GF4 {
224#;*************************************************************
225#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
226#;*************************************************************
227my ($x0,$x1,$y0,$y1,$t0)=@_;
228$code.=<<___;
229 movdqa $y0, $t0
230 pxor $y1, $t0
231 pand $x0, $t0
232 pxor $x1, $x0
233 pand $y0, $x1
234 pand $y1, $x0
235 pxor $x1, $x0
236 pxor $t0, $x1
237___
238}
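# Scalar view of the dataflow above (each register carries one bit of
# 128 parallel GF(2^2) elements (x1,x0) and (y1,y0)):
#	t   = (y0 ^ y1) & x0
#	x0' = ((x0 ^ x1) & y1) ^ (x1 & y0)
#	x1' = (x1 & y0) ^ t
# i.e. 3 ANDs and 4 XORs plus one move, computed in place with a single
# temporary, matching the "(8)" in the banner comment.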
239
240sub Mul_GF4_N { # not used, see next subroutine
241# multiply and scale by N
242my ($x0,$x1,$y0,$y1,$t0)=@_;
243$code.=<<___;
244 movdqa $y0, $t0
245 pxor $y1, $t0
246 pand $x0, $t0
247 pxor $x1, $x0
248 pand $y0, $x1
249 pand $y1, $x0
250 pxor $x0, $x1
251 pxor $t0, $x0
252___
253}
254
255sub Mul_GF4_N_GF4 {
256# interleaved Mul_GF4_N and Mul_GF4
257my ($x0,$x1,$y0,$y1,$t0,
258 $x2,$x3,$y2,$y3,$t1)=@_;
259$code.=<<___;
260 movdqa $y0, $t0
261 movdqa $y2, $t1
262 pxor $y1, $t0
263 pxor $y3, $t1
264 pand $x0, $t0
265 pand $x2, $t1
266 pxor $x1, $x0
267 pxor $x3, $x2
268 pand $y0, $x1
269 pand $y2, $x3
270 pand $y1, $x0
271 pand $y3, $x2
272 pxor $x0, $x1
273 pxor $x3, $x2
274 pxor $t0, $x0
275 pxor $t1, $x3
276___
277}
278sub Mul_GF16_2 {
279my @x=@_[0..7];
280my @y=@_[8..11];
281my @t=@_[12..15];
282$code.=<<___;
283 movdqa @x[0], @t[0]
284 movdqa @x[1], @t[1]
285___
286 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
287$code.=<<___;
288 pxor @x[2], @t[0]
289 pxor @x[3], @t[1]
290 pxor @y[2], @y[0]
291 pxor @y[3], @y[1]
292___
293 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
294 @x[2], @x[3], @y[2], @y[3], @t[2]);
295$code.=<<___;
296 pxor @t[0], @x[0]
297 pxor @t[0], @x[2]
298 pxor @t[1], @x[1]
299 pxor @t[1], @x[3]
300
301 movdqa @x[4], @t[0]
302 movdqa @x[5], @t[1]
303 pxor @x[6], @t[0]
304 pxor @x[7], @t[1]
305___
306 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
307 @x[6], @x[7], @y[2], @y[3], @t[2]);
308$code.=<<___;
309 pxor @y[2], @y[0]
310 pxor @y[3], @y[1]
311___
312 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
313$code.=<<___;
314 pxor @t[0], @x[4]
315 pxor @t[0], @x[6]
316 pxor @t[1], @x[5]
317 pxor @t[1], @x[7]
318___
319}
320sub Inv_GF256 {
321#;********************************************************************
322#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
323#;********************************************************************
324my @x=@_[0..7];
325my @t=@_[8..11];
326my @s=@_[12..15];
327# direct optimizations from hardware
328$code.=<<___;
329 movdqa @x[4], @t[3]
330 movdqa @x[5], @t[2]
331 movdqa @x[1], @t[1]
332 movdqa @x[7], @s[1]
333 movdqa @x[0], @s[0]
334
335 pxor @x[6], @t[3]
336 pxor @x[7], @t[2]
337 pxor @x[3], @t[1]
338 movdqa @t[3], @s[2]
339 pxor @x[6], @s[1]
340 movdqa @t[2], @t[0]
341 pxor @x[2], @s[0]
342 movdqa @t[3], @s[3]
343
344 por @t[1], @t[2]
345 por @s[0], @t[3]
346 pxor @t[0], @s[3]
347 pand @s[0], @s[2]
348 pxor @t[1], @s[0]
349 pand @t[1], @t[0]
350 pand @s[0], @s[3]
351 movdqa @x[3], @s[0]
352 pxor @x[2], @s[0]
353 pand @s[0], @s[1]
354 pxor @s[1], @t[3]
355 pxor @s[1], @t[2]
356 movdqa @x[4], @s[1]
357 movdqa @x[1], @s[0]
358 pxor @x[5], @s[1]
359 pxor @x[0], @s[0]
360 movdqa @s[1], @t[1]
361 pand @s[0], @s[1]
362 por @s[0], @t[1]
363 pxor @s[1], @t[0]
364 pxor @s[3], @t[3]
365 pxor @s[2], @t[2]
366 pxor @s[3], @t[1]
367 movdqa @x[7], @s[0]
368 pxor @s[2], @t[0]
369 movdqa @x[6], @s[1]
370 pxor @s[2], @t[1]
371 movdqa @x[5], @s[2]
372 pand @x[3], @s[0]
373 movdqa @x[4], @s[3]
374 pand @x[2], @s[1]
375 pand @x[1], @s[2]
376 por @x[0], @s[3]
377 pxor @s[0], @t[3]
378 pxor @s[1], @t[2]
379 pxor @s[2], @t[1]
380 pxor @s[3], @t[0]
381
382 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
383
384 # new smaller inversion
385
386 movdqa @t[3], @s[0]
387 pand @t[1], @t[3]
388 pxor @t[2], @s[0]
389
390 movdqa @t[0], @s[2]
391 movdqa @s[0], @s[3]
392 pxor @t[3], @s[2]
393 pand @s[2], @s[3]
394
395 movdqa @t[1], @s[1]
396 pxor @t[2], @s[3]
397 pxor @t[0], @s[1]
398
399 pxor @t[2], @t[3]
400
401 pand @t[3], @s[1]
402
403 movdqa @s[2], @t[2]
404 pxor @t[0], @s[1]
405
406 pxor @s[1], @t[2]
407 pxor @s[1], @t[1]
408
409 pand @t[0], @t[2]
410
411 pxor @t[2], @s[2]
412 pxor @t[2], @t[1]
413
414 pand @s[3], @s[2]
415
416 pxor @s[0], @s[2]
417___
418# output in s3, s2, s1, t1
419
420# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
421
422# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
423 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
424
425### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
426}
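# In outline, Inv_GF256 is the usual tower-field inversion: view each
# GF(2^8) element as ah*X + al over GF(16) (itself built over GF(2^2));
# then a^-1 = (ah*d)*X + (ah+al)*d with d = (ah^2*L + ah*al + al^2)^-1,
# where L is the extension constant and + is xor.  The and/or/xor
# prelude computes that GF(16) discriminant, the "new smaller
# inversion" inverts it, and Mul_GF16_2 performs the two closing
# GF(16) multiplications.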
427
428# AES linear components
429
430sub ShiftRows {
431my @x=@_[0..7];
432my $mask=pop;
433$code.=<<___;
434 pxor 0x00($key),@x[0]
435 pxor 0x10($key),@x[1]
436 pshufb $mask,@x[0]
437 pxor 0x20($key),@x[2]
438 pshufb $mask,@x[1]
439 pxor 0x30($key),@x[3]
440 pshufb $mask,@x[2]
441 pxor 0x40($key),@x[4]
442 pshufb $mask,@x[3]
443 pxor 0x50($key),@x[5]
444 pshufb $mask,@x[4]
445 pxor 0x60($key),@x[6]
446 pshufb $mask,@x[5]
447 pxor 0x70($key),@x[7]
448 pshufb $mask,@x[6]
449 lea 0x80($key),$key
450 pshufb $mask,@x[7]
451___
452}
453
454sub MixColumns {
455# modified to emit output in order suitable for feeding back to aesenc[last]
456my @x=@_[0..7];
457my @t=@_[8..15];
458$code.=<<___;
459 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
460 pshufd \$0x93, @x[1], @t[1]
461 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
462 pshufd \$0x93, @x[2], @t[2]
463 pxor @t[1], @x[1]
464 pshufd \$0x93, @x[3], @t[3]
465 pxor @t[2], @x[2]
466 pshufd \$0x93, @x[4], @t[4]
467 pxor @t[3], @x[3]
468 pshufd \$0x93, @x[5], @t[5]
469 pxor @t[4], @x[4]
470 pshufd \$0x93, @x[6], @t[6]
471 pxor @t[5], @x[5]
472 pshufd \$0x93, @x[7], @t[7]
473 pxor @t[6], @x[6]
474 pxor @t[7], @x[7]
475
476 pxor @x[0], @t[1]
477 pxor @x[7], @t[0]
478 pxor @x[7], @t[1]
479 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
480 pxor @x[1], @t[2]
481 pshufd \$0x4E, @x[1], @x[1]
482 pxor @x[4], @t[5]
483 pxor @t[0], @x[0]
484 pxor @x[5], @t[6]
485 pxor @t[1], @x[1]
486 pxor @x[3], @t[4]
487 pshufd \$0x4E, @x[4], @t[0]
488 pxor @x[6], @t[7]
489 pshufd \$0x4E, @x[5], @t[1]
490 pxor @x[2], @t[3]
491 pshufd \$0x4E, @x[3], @x[4]
492 pxor @x[7], @t[3]
493 pshufd \$0x4E, @x[7], @x[5]
494 pxor @x[7], @t[4]
495 pshufd \$0x4E, @x[6], @x[3]
496 pxor @t[4], @t[0]
497 pshufd \$0x4E, @x[2], @x[6]
498 pxor @t[5], @t[1]
499
500 pxor @t[3], @x[4]
501 pxor @t[7], @x[5]
502 pxor @t[6], @x[3]
503 movdqa @t[0], @x[2]
504 pxor @t[2], @x[6]
505 movdqa @t[1], @x[7]
506___
507}
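# The shuffle pattern implements the rotation form of MixColumns: with
# rot32/rot64 denoting byte rotation within each column (pshufd 0x93 /
# 0x4e here) and T = X ^ rot32(X), the result is
#	MC(X) = 2*T ^ rot32(X) ^ rot64(T)
# where the multiplication by 2 is free in the bit-sliced domain:
# slice i of 2*T is just T of slice i-1, with slice 7 additionally
# folded into slices 0, 1, 3 and 4 (the 0x1b reduction).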
508
509sub InvMixColumns {
510my @x=@_[0..7];
511my @t=@_[8..15];
512
513$code.=<<___;
514 # multiplication by 0x0e
515 pshufd \$0x93, @x[7], @t[7]
516 movdqa @x[2], @t[2]
517 pxor @x[5], @x[7] # 7 5
518 pxor @x[5], @x[2] # 2 5
519 pshufd \$0x93, @x[0], @t[0]
520 movdqa @x[5], @t[5]
521 pxor @x[0], @x[5] # 5 0 [1]
522 pxor @x[1], @x[0] # 0 1
523 pshufd \$0x93, @x[1], @t[1]
524 pxor @x[2], @x[1] # 1 25
525 pxor @x[6], @x[0] # 01 6 [2]
526 pxor @x[3], @x[1] # 125 3 [4]
527 pshufd \$0x93, @x[3], @t[3]
528 pxor @x[0], @x[2] # 25 016 [3]
529 pxor @x[7], @x[3] # 3 75
530 pxor @x[6], @x[7] # 75 6 [0]
531 pshufd \$0x93, @x[6], @t[6]
532 movdqa @x[4], @t[4]
533 pxor @x[4], @x[6] # 6 4
534 pxor @x[3], @x[4] # 4 375 [6]
535 pxor @x[7], @x[3] # 375 756=36
536 pxor @t[5], @x[6] # 64 5 [7]
537 pxor @t[2], @x[3] # 36 2
538 pxor @t[4], @x[3] # 362 4 [5]
539 pshufd \$0x93, @t[5], @t[5]
540___
541 my @y = @x[7,5,0,2,1,3,4,6];
542$code.=<<___;
543 # multiplication by 0x0b
544 pxor @y[0], @y[1]
545 pxor @t[0], @y[0]
546 pxor @t[1], @y[1]
547 pshufd \$0x93, @t[2], @t[2]
548 pxor @t[5], @y[0]
549 pxor @t[6], @y[1]
550 pxor @t[7], @y[0]
551 pshufd \$0x93, @t[4], @t[4]
552 pxor @t[6], @t[7] # clobber t[7]
553 pxor @y[0], @y[1]
554
555 pxor @t[0], @y[3]
556 pshufd \$0x93, @t[0], @t[0]
557 pxor @t[1], @y[2]
558 pxor @t[1], @y[4]
559 pxor @t[2], @y[2]
560 pshufd \$0x93, @t[1], @t[1]
561 pxor @t[2], @y[3]
562 pxor @t[2], @y[5]
563 pxor @t[7], @y[2]
564 pshufd \$0x93, @t[2], @t[2]
565 pxor @t[3], @y[3]
566 pxor @t[3], @y[6]
567 pxor @t[3], @y[4]
568 pshufd \$0x93, @t[3], @t[3]
569 pxor @t[4], @y[7]
570 pxor @t[4], @y[5]
571 pxor @t[7], @y[7]
572 pxor @t[5], @y[3]
573 pxor @t[4], @y[4]
574 pxor @t[5], @t[7] # clobber t[7] even more
575
576 pxor @t[7], @y[5]
577 pshufd \$0x93, @t[4], @t[4]
578 pxor @t[7], @y[6]
579 pxor @t[7], @y[4]
580
581 pxor @t[5], @t[7]
582 pshufd \$0x93, @t[5], @t[5]
583 pxor @t[6], @t[7] # restore t[7]
584
585 # multiplication by 0x0d
586 pxor @y[7], @y[4]
587 pxor @t[4], @y[7]
588 pshufd \$0x93, @t[6], @t[6]
589 pxor @t[0], @y[2]
590 pxor @t[5], @y[7]
591 pxor @t[2], @y[2]
592 pshufd \$0x93, @t[7], @t[7]
593
594 pxor @y[1], @y[3]
595 pxor @t[1], @y[1]
596 pxor @t[0], @y[0]
597 pxor @t[0], @y[3]
598 pxor @t[5], @y[1]
599 pxor @t[5], @y[0]
600 pxor @t[7], @y[1]
601 pshufd \$0x93, @t[0], @t[0]
602 pxor @t[6], @y[0]
603 pxor @y[1], @y[3]
604 pxor @t[1], @y[4]
605 pshufd \$0x93, @t[1], @t[1]
606
607 pxor @t[7], @y[7]
608 pxor @t[2], @y[4]
609 pxor @t[2], @y[5]
610 pshufd \$0x93, @t[2], @t[2]
611 pxor @t[6], @y[2]
612 pxor @t[3], @t[6] # clobber t[6]
613 pxor @y[7], @y[4]
614 pxor @t[6], @y[3]
615
616 pxor @t[6], @y[6]
617 pxor @t[5], @y[5]
618 pxor @t[4], @y[6]
619 pshufd \$0x93, @t[4], @t[4]
620 pxor @t[6], @y[5]
621 pxor @t[7], @y[6]
622 pxor @t[3], @t[6] # restore t[6]
623
624 pshufd \$0x93, @t[5], @t[5]
625 pshufd \$0x93, @t[6], @t[6]
626 pshufd \$0x93, @t[7], @t[7]
627 pshufd \$0x93, @t[3], @t[3]
628
629 # multiplication by 0x09
630 pxor @y[1], @y[4]
631 pxor @y[1], @t[1] # t[1]=y[1]
632 pxor @t[5], @t[0] # clobber t[0]
633 pxor @t[5], @t[1]
634 pxor @t[0], @y[3]
635 pxor @y[0], @t[0] # t[0]=y[0]
636 pxor @t[6], @t[1]
637 pxor @t[7], @t[6] # clobber t[6]
638 pxor @t[1], @y[4]
639 pxor @t[4], @y[7]
640 pxor @y[4], @t[4] # t[4]=y[4]
641 pxor @t[3], @y[6]
642 pxor @y[3], @t[3] # t[3]=y[3]
643 pxor @t[2], @y[5]
644 pxor @y[2], @t[2] # t[2]=y[2]
645 pxor @t[7], @t[3]
646 pxor @y[5], @t[5] # t[5]=y[5]
647 pxor @t[6], @t[2]
648 pxor @t[6], @t[5]
649 pxor @y[6], @t[6] # t[6]=y[6]
650 pxor @y[7], @t[7] # t[7]=y[7]
651
652 movdqa @t[0],@XMM[0]
653 movdqa @t[1],@XMM[1]
654 movdqa @t[2],@XMM[2]
655 movdqa @t[3],@XMM[3]
656 movdqa @t[4],@XMM[4]
657 movdqa @t[5],@XMM[5]
658 movdqa @t[6],@XMM[6]
659 movdqa @t[7],@XMM[7]
660___
661}
662
663sub aesenc { # not used
664my @b=@_[0..7];
665my @t=@_[8..15];
666$code.=<<___;
667 movdqa 0x30($const),@t[0] # .LSR
668___
669 &ShiftRows (@b,@t[0]);
670 &Sbox (@b,@t);
671 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
672}
673
674sub aesenclast { # not used
675my @b=@_[0..7];
676my @t=@_[8..15];
677$code.=<<___;
678 movdqa 0x40($const),@t[0] # .LSRM0
679___
680 &ShiftRows (@b,@t[0]);
681 &Sbox (@b,@t);
682$code.=<<___;
683 pxor 0x00($key),@b[0]
684 pxor 0x10($key),@b[1]
685 pxor 0x20($key),@b[4]
686 pxor 0x30($key),@b[6]
687 pxor 0x40($key),@b[3]
688 pxor 0x50($key),@b[7]
689 pxor 0x60($key),@b[2]
690 pxor 0x70($key),@b[5]
691___
692}
693
694sub swapmove {
695my ($a,$b,$n,$mask,$t)=@_;
696$code.=<<___;
697 movdqa $b,$t
698 psrlq \$$n,$b
699 pxor $a,$b
700 pand $mask,$b
701 pxor $b,$a
702 psllq \$$n,$b
703 pxor $t,$b
704___
705}
706sub swapmove2x {
707my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
708$code.=<<___;
709 movdqa $b0,$t0
710 psrlq \$$n,$b0
711 movdqa $b1,$t1
712 psrlq \$$n,$b1
713 pxor $a0,$b0
714 pxor $a1,$b1
715 pand $mask,$b0
716 pand $mask,$b1
717 pxor $b0,$a0
718 psllq \$$n,$b0
719 pxor $b1,$a1
720 psllq \$$n,$b1
721 pxor $t0,$b0
722 pxor $t1,$b1
723___
724}
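# Both routines are the classic "delta swap": in scalar form
#	t = ((b >> n) ^ a) & mask;  a ^= t;  b ^= t << n;
# which exchanges the bits of a selected by mask with the bits of b
# selected by mask << n (swapmove2x interleaves two such swaps).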
725
726sub bitslice {
727my @x=reverse(@_[0..7]);
728my ($t0,$t1,$t2,$t3)=@_[8..11];
729$code.=<<___;
730 movdqa 0x00($const),$t0 # .LBS0
731 movdqa 0x10($const),$t1 # .LBS1
732___
733 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
734 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
735$code.=<<___;
736 movdqa 0x20($const),$t0 # .LBS2
737___
738 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
739 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
740
741 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
742 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
743}
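# Three rounds of delta swaps with strides 1, 2 and 4 (masks .LBS0 =
# 0x55..., .LBS1 = 0x33..., .LBS2 = 0x0f...) transpose the 8x8 bit
# matrix spanned by the eight registers, so that on exit register i
# holds bit i of every byte: the layout the Boolean S-box circuit
# above operates on.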
744
745$code.=<<___;
746.text
747
748.extern asm_AES_encrypt
749.extern asm_AES_decrypt
750
751.type _bsaes_encrypt8,\@abi-omnipotent
752.align 64
753_bsaes_encrypt8:
754 lea .LBS0(%rip), $const # constants table
755
756 movdqa ($key), @XMM[9] # round 0 key
757 lea 0x10($key), $key
758 movdqa 0x50($const), @XMM[8] # .LM0SR
759 pxor @XMM[9], @XMM[0] # xor with round0 key
760 pxor @XMM[9], @XMM[1]
761 pshufb @XMM[8], @XMM[0]
762 pxor @XMM[9], @XMM[2]
763 pshufb @XMM[8], @XMM[1]
764 pxor @XMM[9], @XMM[3]
765 pshufb @XMM[8], @XMM[2]
766 pxor @XMM[9], @XMM[4]
767 pshufb @XMM[8], @XMM[3]
768 pxor @XMM[9], @XMM[5]
769 pshufb @XMM[8], @XMM[4]
770 pxor @XMM[9], @XMM[6]
771 pshufb @XMM[8], @XMM[5]
772 pxor @XMM[9], @XMM[7]
773 pshufb @XMM[8], @XMM[6]
774 pshufb @XMM[8], @XMM[7]
775_bsaes_encrypt8_bitslice:
776___
777 &bitslice (@XMM[0..7, 8..11]);
778$code.=<<___;
779 dec $rounds
780 jmp .Lenc_sbox
781.align 16
782.Lenc_loop:
783___
784 &ShiftRows (@XMM[0..7, 8]);
785$code.=".Lenc_sbox:\n";
786 &Sbox (@XMM[0..7, 8..15]);
787$code.=<<___;
788 dec $rounds
789 jl .Lenc_done
790___
791 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
792$code.=<<___;
793 movdqa 0x30($const), @XMM[8] # .LSR
794 jnz .Lenc_loop
795 movdqa 0x40($const), @XMM[8] # .LSRM0
796 jmp .Lenc_loop
797.align 16
798.Lenc_done:
799___
800 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
801 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
802$code.=<<___;
803 movdqa ($key), @XMM[8] # last round key
804 pxor @XMM[8], @XMM[4]
805 pxor @XMM[8], @XMM[6]
806 pxor @XMM[8], @XMM[3]
807 pxor @XMM[8], @XMM[7]
808 pxor @XMM[8], @XMM[2]
809 pxor @XMM[8], @XMM[5]
810 pxor @XMM[8], @XMM[0]
811 pxor @XMM[8], @XMM[1]
812 ret
813.size _bsaes_encrypt8,.-_bsaes_encrypt8
814
815.type _bsaes_decrypt8,\@abi-omnipotent
816.align 64
817_bsaes_decrypt8:
818 lea .LBS0(%rip), $const # constants table
819
820 movdqa ($key), @XMM[9] # round 0 key
821 lea 0x10($key), $key
822 movdqa -0x30($const), @XMM[8] # .LM0ISR
823 pxor @XMM[9], @XMM[0] # xor with round0 key
824 pxor @XMM[9], @XMM[1]
825 pshufb @XMM[8], @XMM[0]
826 pxor @XMM[9], @XMM[2]
827 pshufb @XMM[8], @XMM[1]
828 pxor @XMM[9], @XMM[3]
829 pshufb @XMM[8], @XMM[2]
830 pxor @XMM[9], @XMM[4]
831 pshufb @XMM[8], @XMM[3]
832 pxor @XMM[9], @XMM[5]
833 pshufb @XMM[8], @XMM[4]
834 pxor @XMM[9], @XMM[6]
835 pshufb @XMM[8], @XMM[5]
836 pxor @XMM[9], @XMM[7]
837 pshufb @XMM[8], @XMM[6]
838 pshufb @XMM[8], @XMM[7]
839___
840 &bitslice (@XMM[0..7, 8..11]);
841$code.=<<___;
842 dec $rounds
843 jmp .Ldec_sbox
844.align 16
845.Ldec_loop:
846___
847 &ShiftRows (@XMM[0..7, 8]);
848$code.=".Ldec_sbox:\n";
849 &InvSbox (@XMM[0..7, 8..15]);
850$code.=<<___;
851 dec $rounds
852 jl .Ldec_done
853___
854 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
855$code.=<<___;
856 movdqa -0x10($const), @XMM[8] # .LISR
857 jnz .Ldec_loop
858 movdqa -0x20($const), @XMM[8] # .LISRM0
859 jmp .Ldec_loop
860.align 16
861.Ldec_done:
862___
863 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
864$code.=<<___;
865 movdqa ($key), @XMM[8] # last round key
866 pxor @XMM[8], @XMM[6]
867 pxor @XMM[8], @XMM[4]
868 pxor @XMM[8], @XMM[2]
869 pxor @XMM[8], @XMM[7]
870 pxor @XMM[8], @XMM[3]
871 pxor @XMM[8], @XMM[5]
872 pxor @XMM[8], @XMM[0]
873 pxor @XMM[8], @XMM[1]
874 ret
875.size _bsaes_decrypt8,.-_bsaes_decrypt8
876___
877}
878{
879my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
880
881sub bitslice_key {
882my @x=reverse(@_[0..7]);
883my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
884
885 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
886$code.=<<___;
887 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
888 movdqa @x[0], @x[2]
889 movdqa @x[1], @x[3]
890___
891 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
892
893 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
894$code.=<<___;
895 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
896 movdqa @x[0], @x[4]
897 movdqa @x[2], @x[6]
898 movdqa @x[1], @x[5]
899 movdqa @x[3], @x[7]
900___
901 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
902 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
903}
904
905$code.=<<___;
906.type _bsaes_key_convert,\@abi-omnipotent
907.align 16
908_bsaes_key_convert:
909 lea .Lmasks(%rip), $const
910 movdqu ($inp), %xmm7 # load round 0 key
911 lea 0x10($inp), $inp
912 movdqa 0x00($const), %xmm0 # 0x01...
913 movdqa 0x10($const), %xmm1 # 0x02...
914 movdqa 0x20($const), %xmm2 # 0x04...
915 movdqa 0x30($const), %xmm3 # 0x08...
916 movdqa 0x40($const), %xmm4 # .LM0
917 pcmpeqd %xmm5, %xmm5 # .LNOT
918
919 movdqu ($inp), %xmm6 # load round 1 key
920 movdqa %xmm7, ($out) # save round 0 key
921 lea 0x10($out), $out
922 dec $rounds
923 jmp .Lkey_loop
924.align 16
925.Lkey_loop:
926 pshufb %xmm4, %xmm6 # .LM0
927
928 movdqa %xmm0, %xmm8
929 movdqa %xmm1, %xmm9
930
931 pand %xmm6, %xmm8
932 pand %xmm6, %xmm9
933 movdqa %xmm2, %xmm10
934 pcmpeqb %xmm0, %xmm8
935 psllq \$4, %xmm0 # 0x10...
936 movdqa %xmm3, %xmm11
937 pcmpeqb %xmm1, %xmm9
938 psllq \$4, %xmm1 # 0x20...
939
940 pand %xmm6, %xmm10
941 pand %xmm6, %xmm11
942 movdqa %xmm0, %xmm12
943 pcmpeqb %xmm2, %xmm10
944 psllq \$4, %xmm2 # 0x40...
945 movdqa %xmm1, %xmm13
946 pcmpeqb %xmm3, %xmm11
947 psllq \$4, %xmm3 # 0x80...
948
949 movdqa %xmm2, %xmm14
950 movdqa %xmm3, %xmm15
951 pxor %xmm5, %xmm8 # "pnot"
952 pxor %xmm5, %xmm9
953
954 pand %xmm6, %xmm12
955 pand %xmm6, %xmm13
956 movdqa %xmm8, 0x00($out) # write bit-sliced round key
957 pcmpeqb %xmm0, %xmm12
958 psrlq \$4, %xmm0 # 0x01...
959 movdqa %xmm9, 0x10($out)
960 pcmpeqb %xmm1, %xmm13
961 psrlq \$4, %xmm1 # 0x02...
962 lea 0x10($inp), $inp
963
964 pand %xmm6, %xmm14
965 pand %xmm6, %xmm15
966 movdqa %xmm10, 0x20($out)
967 pcmpeqb %xmm2, %xmm14
968 psrlq \$4, %xmm2 # 0x04...
969 movdqa %xmm11, 0x30($out)
970 pcmpeqb %xmm3, %xmm15
971 psrlq \$4, %xmm3 # 0x08...
972 movdqu ($inp), %xmm6 # load next round key
973
974 pxor %xmm5, %xmm13 # "pnot"
975 pxor %xmm5, %xmm14
976 movdqa %xmm12, 0x40($out)
977 movdqa %xmm13, 0x50($out)
978 movdqa %xmm14, 0x60($out)
979 movdqa %xmm15, 0x70($out)
980 lea 0x80($out),$out
981 dec $rounds
982 jnz .Lkey_loop
983
984 movdqa 0x50($const), %xmm7 # .L63
985 #movdqa %xmm6, ($out) # don't save last round key
986 ret
987.size _bsaes_key_convert,.-_bsaes_key_convert
988___
989}
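# In short: each round key is pshufb'd into bit-sliced byte order and
# every bit position is then expanded into a full 128-bit mask (pand
# with the 0x01/0x02/.../0x80 patterns followed by pcmpeqb yields 0xff
# exactly in the bytes where that bit is set).  Slices 0, 1, 5 and 6
# are complemented ("pnot"), which xors 0x63 into every key byte and
# so absorbs the S-box affine constant that the bit-sliced Sbox()
# omits; the .L63 value left in %xmm7 lets callers patch the last
# round key the same way.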
990
991if (0 && !$win64) {	# the following four functions are an unsupported
992			# interface used for benchmarking...
993$code.=<<___;
994.globl bsaes_enc_key_convert
995.type bsaes_enc_key_convert,\@function,2
996.align 16
997bsaes_enc_key_convert:
998 mov 240($inp),%r10d # pass rounds
999 mov $inp,%rcx # pass key
1000 mov $out,%rax # pass key schedule
1001 call _bsaes_key_convert
1002 pxor %xmm6,%xmm7 # fix up last round key
1003 movdqa %xmm7,(%rax) # save last round key
1004 ret
1005.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1006
1007.globl bsaes_encrypt_128
1008.type bsaes_encrypt_128,\@function,4
1009.align 16
1010bsaes_encrypt_128:
1011.Lenc128_loop:
1012 movdqu 0x00($inp), @XMM[0] # load input
1013 movdqu 0x10($inp), @XMM[1]
1014 movdqu 0x20($inp), @XMM[2]
1015 movdqu 0x30($inp), @XMM[3]
1016 movdqu 0x40($inp), @XMM[4]
1017 movdqu 0x50($inp), @XMM[5]
1018 movdqu 0x60($inp), @XMM[6]
1019 movdqu 0x70($inp), @XMM[7]
1020 mov $key, %rax # pass the $key
1021 lea 0x80($inp), $inp
1022 mov \$10,%r10d
1023
1024 call _bsaes_encrypt8
1025
1026 movdqu @XMM[0], 0x00($out) # write output
1027 movdqu @XMM[1], 0x10($out)
1028 movdqu @XMM[4], 0x20($out)
1029 movdqu @XMM[6], 0x30($out)
1030 movdqu @XMM[3], 0x40($out)
1031 movdqu @XMM[7], 0x50($out)
1032 movdqu @XMM[2], 0x60($out)
1033 movdqu @XMM[5], 0x70($out)
1034 lea 0x80($out), $out
1035 sub \$0x80,$len
1036 ja .Lenc128_loop
1037 ret
1038.size bsaes_encrypt_128,.-bsaes_encrypt_128
1039
1040.globl bsaes_dec_key_convert
1041.type bsaes_dec_key_convert,\@function,2
1042.align 16
1043bsaes_dec_key_convert:
1044 mov 240($inp),%r10d # pass rounds
1045 mov $inp,%rcx # pass key
1046 mov $out,%rax # pass key schedule
1047 call _bsaes_key_convert
1048 pxor ($out),%xmm7 # fix up round 0 key
1049 movdqa %xmm6,(%rax) # save last round key
1050 movdqa %xmm7,($out)
1051 ret
1052.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1053
1054.globl bsaes_decrypt_128
1055.type bsaes_decrypt_128,\@function,4
1056.align 16
1057bsaes_decrypt_128:
1058.Ldec128_loop:
1059 movdqu 0x00($inp), @XMM[0] # load input
1060 movdqu 0x10($inp), @XMM[1]
1061 movdqu 0x20($inp), @XMM[2]
1062 movdqu 0x30($inp), @XMM[3]
1063 movdqu 0x40($inp), @XMM[4]
1064 movdqu 0x50($inp), @XMM[5]
1065 movdqu 0x60($inp), @XMM[6]
1066 movdqu 0x70($inp), @XMM[7]
1067 mov $key, %rax # pass the $key
1068 lea 0x80($inp), $inp
1069 mov \$10,%r10d
1070
1071 call _bsaes_decrypt8
1072
1073 movdqu @XMM[0], 0x00($out) # write output
1074 movdqu @XMM[1], 0x10($out)
1075 movdqu @XMM[6], 0x20($out)
1076 movdqu @XMM[4], 0x30($out)
1077 movdqu @XMM[2], 0x40($out)
1078 movdqu @XMM[7], 0x50($out)
1079 movdqu @XMM[3], 0x60($out)
1080 movdqu @XMM[5], 0x70($out)
1081 lea 0x80($out), $out
1082 sub \$0x80,$len
1083 ja .Ldec128_loop
1084 ret
1085.size bsaes_decrypt_128,.-bsaes_decrypt_128
1086___
1087}
1088{
1089######################################################################
1090#
1091# OpenSSL interface
1092#
1093my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1094 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1095my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1096
1097if ($ecb) {
1098$code.=<<___;
1099.globl bsaes_ecb_encrypt_blocks
1100.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1101.align 16
1102bsaes_ecb_encrypt_blocks:
1103 mov %rsp, %rax
1104.Lecb_enc_prologue:
1105 push %rbp
1106 push %rbx
1107 push %r12
1108 push %r13
1109 push %r14
1110 push %r15
1111 lea -0x48(%rsp),%rsp
1112___
1113$code.=<<___ if ($win64);
1114 lea -0xa0(%rsp), %rsp
1115 movaps %xmm6, 0x40(%rsp)
1116 movaps %xmm7, 0x50(%rsp)
1117 movaps %xmm8, 0x60(%rsp)
1118 movaps %xmm9, 0x70(%rsp)
1119 movaps %xmm10, 0x80(%rsp)
1120 movaps %xmm11, 0x90(%rsp)
1121 movaps %xmm12, 0xa0(%rsp)
1122 movaps %xmm13, 0xb0(%rsp)
1123 movaps %xmm14, 0xc0(%rsp)
1124 movaps %xmm15, 0xd0(%rsp)
1125.Lecb_enc_body:
1126___
1127$code.=<<___;
1128 mov %rsp,%rbp # backup %rsp
1129 mov 240($arg4),%eax # rounds
1130 mov $arg1,$inp # backup arguments
1131 mov $arg2,$out
1132 mov $arg3,$len
1133 mov $arg4,$key
1134 cmp \$8,$arg3
1135 jb .Lecb_enc_short
1136
1137 mov %eax,%ebx # backup rounds
1138 shl \$7,%rax # 128 bytes per inner round key
1139 sub \$`128-32`,%rax # size of bit-sliced key schedule
1140 sub %rax,%rsp
1141 mov %rsp,%rax # pass key schedule
1142 mov $key,%rcx # pass key
1143 mov %ebx,%r10d # pass rounds
1144 call _bsaes_key_convert
1145 pxor %xmm6,%xmm7 # fix up last round key
1146 movdqa %xmm7,(%rax) # save last round key
1147
1148 sub \$8,$len
1149.Lecb_enc_loop:
1150 movdqu 0x00($inp), @XMM[0] # load input
1151 movdqu 0x10($inp), @XMM[1]
1152 movdqu 0x20($inp), @XMM[2]
1153 movdqu 0x30($inp), @XMM[3]
1154 movdqu 0x40($inp), @XMM[4]
1155 movdqu 0x50($inp), @XMM[5]
1156 mov %rsp, %rax # pass key schedule
1157 movdqu 0x60($inp), @XMM[6]
1158 mov %ebx,%r10d # pass rounds
1159 movdqu 0x70($inp), @XMM[7]
1160 lea 0x80($inp), $inp
1161
1162 call _bsaes_encrypt8
1163
1164 movdqu @XMM[0], 0x00($out) # write output
1165 movdqu @XMM[1], 0x10($out)
1166 movdqu @XMM[4], 0x20($out)
1167 movdqu @XMM[6], 0x30($out)
1168 movdqu @XMM[3], 0x40($out)
1169 movdqu @XMM[7], 0x50($out)
1170 movdqu @XMM[2], 0x60($out)
1171 movdqu @XMM[5], 0x70($out)
1172 lea 0x80($out), $out
1173 sub \$8,$len
1174 jnc .Lecb_enc_loop
1175
1176 add \$8,$len
1177 jz .Lecb_enc_done
1178
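	# Tail: fewer than eight blocks remain.  The compare ladder below
	# loads exactly the remaining count of blocks, funnels into a
	# single _bsaes_encrypt8 call (unused registers just hold stale
	# data), and stores only the valid outputs.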
1179 movdqu 0x00($inp), @XMM[0] # load input
1180 mov %rsp, %rax # pass key schedule
1181 mov %ebx,%r10d # pass rounds
1182 cmp \$2,$len
1183 jb .Lecb_enc_one
1184 movdqu 0x10($inp), @XMM[1]
1185 je .Lecb_enc_two
1186 movdqu 0x20($inp), @XMM[2]
1187 cmp \$4,$len
1188 jb .Lecb_enc_three
1189 movdqu 0x30($inp), @XMM[3]
1190 je .Lecb_enc_four
1191 movdqu 0x40($inp), @XMM[4]
1192 cmp \$6,$len
1193 jb .Lecb_enc_five
1194 movdqu 0x50($inp), @XMM[5]
1195 je .Lecb_enc_six
1196 movdqu 0x60($inp), @XMM[6]
1197 call _bsaes_encrypt8
1198 movdqu @XMM[0], 0x00($out) # write output
1199 movdqu @XMM[1], 0x10($out)
1200 movdqu @XMM[4], 0x20($out)
1201 movdqu @XMM[6], 0x30($out)
1202 movdqu @XMM[3], 0x40($out)
1203 movdqu @XMM[7], 0x50($out)
1204 movdqu @XMM[2], 0x60($out)
1205 jmp .Lecb_enc_done
1206.align 16
1207.Lecb_enc_six:
1208 call _bsaes_encrypt8
1209 movdqu @XMM[0], 0x00($out) # write output
1210 movdqu @XMM[1], 0x10($out)
1211 movdqu @XMM[4], 0x20($out)
1212 movdqu @XMM[6], 0x30($out)
1213 movdqu @XMM[3], 0x40($out)
1214 movdqu @XMM[7], 0x50($out)
1215 jmp .Lecb_enc_done
1216.align 16
1217.Lecb_enc_five:
1218 call _bsaes_encrypt8
1219 movdqu @XMM[0], 0x00($out) # write output
1220 movdqu @XMM[1], 0x10($out)
1221 movdqu @XMM[4], 0x20($out)
1222 movdqu @XMM[6], 0x30($out)
1223 movdqu @XMM[3], 0x40($out)
1224 jmp .Lecb_enc_done
1225.align 16
1226.Lecb_enc_four:
1227 call _bsaes_encrypt8
1228 movdqu @XMM[0], 0x00($out) # write output
1229 movdqu @XMM[1], 0x10($out)
1230 movdqu @XMM[4], 0x20($out)
1231 movdqu @XMM[6], 0x30($out)
1232 jmp .Lecb_enc_done
1233.align 16
1234.Lecb_enc_three:
1235 call _bsaes_encrypt8
1236 movdqu @XMM[0], 0x00($out) # write output
1237 movdqu @XMM[1], 0x10($out)
1238 movdqu @XMM[4], 0x20($out)
1239 jmp .Lecb_enc_done
1240.align 16
1241.Lecb_enc_two:
1242 call _bsaes_encrypt8
1243 movdqu @XMM[0], 0x00($out) # write output
1244 movdqu @XMM[1], 0x10($out)
1245 jmp .Lecb_enc_done
1246.align 16
1247.Lecb_enc_one:
1248 call _bsaes_encrypt8
1249 movdqu @XMM[0], 0x00($out) # write output
1250 jmp .Lecb_enc_done
1251.align 16
1252.Lecb_enc_short:
1253 lea ($inp), $arg1
1254 lea ($out), $arg2
1255 lea ($key), $arg3
1256 call asm_AES_encrypt
1257 lea 16($inp), $inp
1258 lea 16($out), $out
1259 dec $len
1260 jnz .Lecb_enc_short
1261
1262.Lecb_enc_done:
1263 lea (%rsp),%rax
1264 pxor %xmm0, %xmm0
1265.Lecb_enc_bzero: # wipe key schedule [if any]
1266 movdqa %xmm0, 0x00(%rax)
1267 movdqa %xmm0, 0x10(%rax)
1268 lea 0x20(%rax), %rax
1269 cmp %rax, %rbp
1270 jb .Lecb_enc_bzero
1271
1272 lea (%rbp),%rsp # restore %rsp
1273___
1274$code.=<<___ if ($win64);
1275 movaps 0x40(%rbp), %xmm6
1276 movaps 0x50(%rbp), %xmm7
1277 movaps 0x60(%rbp), %xmm8
1278 movaps 0x70(%rbp), %xmm9
1279 movaps 0x80(%rbp), %xmm10
1280 movaps 0x90(%rbp), %xmm11
1281 movaps 0xa0(%rbp), %xmm12
1282 movaps 0xb0(%rbp), %xmm13
1283 movaps 0xc0(%rbp), %xmm14
1284 movaps 0xd0(%rbp), %xmm15
1285 lea 0xa0(%rbp), %rsp
1286___
1287$code.=<<___;
1288 mov 0x48(%rsp), %r15
1289 mov 0x50(%rsp), %r14
1290 mov 0x58(%rsp), %r13
1291 mov 0x60(%rsp), %r12
1292 mov 0x68(%rsp), %rbx
1293 mov 0x70(%rsp), %rax
1294 lea 0x78(%rsp), %rsp
1295 mov %rax, %rbp
1296.Lecb_enc_epilogue:
1297 ret
1298.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1299
1300.globl bsaes_ecb_decrypt_blocks
1301.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1302.align 16
1303bsaes_ecb_decrypt_blocks:
1304 mov %rsp, %rax
1305.Lecb_dec_prologue:
1306 push %rbp
1307 push %rbx
1308 push %r12
1309 push %r13
1310 push %r14
1311 push %r15
1312 lea -0x48(%rsp),%rsp
1313___
1314$code.=<<___ if ($win64);
1315 lea -0xa0(%rsp), %rsp
1316 movaps %xmm6, 0x40(%rsp)
1317 movaps %xmm7, 0x50(%rsp)
1318 movaps %xmm8, 0x60(%rsp)
1319 movaps %xmm9, 0x70(%rsp)
1320 movaps %xmm10, 0x80(%rsp)
1321 movaps %xmm11, 0x90(%rsp)
1322 movaps %xmm12, 0xa0(%rsp)
1323 movaps %xmm13, 0xb0(%rsp)
1324 movaps %xmm14, 0xc0(%rsp)
1325 movaps %xmm15, 0xd0(%rsp)
1326.Lecb_dec_body:
1327___
1328$code.=<<___;
1329 mov %rsp,%rbp # backup %rsp
1330 mov 240($arg4),%eax # rounds
1331 mov $arg1,$inp # backup arguments
1332 mov $arg2,$out
1333 mov $arg3,$len
1334 mov $arg4,$key
1335 cmp \$8,$arg3
1336 jb .Lecb_dec_short
1337
1338 mov %eax,%ebx # backup rounds
1339 shl \$7,%rax # 128 bytes per inner round key
1340 sub \$`128-32`,%rax # size of bit-sliced key schedule
1341 sub %rax,%rsp
1342 mov %rsp,%rax # pass key schedule
1343 mov $key,%rcx # pass key
1344 mov %ebx,%r10d # pass rounds
1345 call _bsaes_key_convert
1346	pxor	(%rsp),%xmm7		# fix up round 0 key
1347 movdqa %xmm6,(%rax) # save last round key
1348 movdqa %xmm7,(%rsp)
1349
1350 sub \$8,$len
1351.Lecb_dec_loop:
1352 movdqu 0x00($inp), @XMM[0] # load input
1353 movdqu 0x10($inp), @XMM[1]
1354 movdqu 0x20($inp), @XMM[2]
1355 movdqu 0x30($inp), @XMM[3]
1356 movdqu 0x40($inp), @XMM[4]
1357 movdqu 0x50($inp), @XMM[5]
1358 mov %rsp, %rax # pass key schedule
1359 movdqu 0x60($inp), @XMM[6]
1360 mov %ebx,%r10d # pass rounds
1361 movdqu 0x70($inp), @XMM[7]
1362 lea 0x80($inp), $inp
1363
1364 call _bsaes_decrypt8
1365
1366 movdqu @XMM[0], 0x00($out) # write output
1367 movdqu @XMM[1], 0x10($out)
1368 movdqu @XMM[6], 0x20($out)
1369 movdqu @XMM[4], 0x30($out)
1370 movdqu @XMM[2], 0x40($out)
1371 movdqu @XMM[7], 0x50($out)
1372 movdqu @XMM[3], 0x60($out)
1373 movdqu @XMM[5], 0x70($out)
1374 lea 0x80($out), $out
1375 sub \$8,$len
1376 jnc .Lecb_dec_loop
1377
1378 add \$8,$len
1379 jz .Lecb_dec_done
1380
1381 movdqu 0x00($inp), @XMM[0] # load input
1382 mov %rsp, %rax # pass key schedule
1383 mov %ebx,%r10d # pass rounds
1384 cmp \$2,$len
1385 jb .Lecb_dec_one
1386 movdqu 0x10($inp), @XMM[1]
1387 je .Lecb_dec_two
1388 movdqu 0x20($inp), @XMM[2]
1389 cmp \$4,$len
1390 jb .Lecb_dec_three
1391 movdqu 0x30($inp), @XMM[3]
1392 je .Lecb_dec_four
1393 movdqu 0x40($inp), @XMM[4]
1394 cmp \$6,$len
1395 jb .Lecb_dec_five
1396 movdqu 0x50($inp), @XMM[5]
1397 je .Lecb_dec_six
1398 movdqu 0x60($inp), @XMM[6]
1399 call _bsaes_decrypt8
1400 movdqu @XMM[0], 0x00($out) # write output
1401 movdqu @XMM[1], 0x10($out)
1402 movdqu @XMM[6], 0x20($out)
1403 movdqu @XMM[4], 0x30($out)
1404 movdqu @XMM[2], 0x40($out)
1405 movdqu @XMM[7], 0x50($out)
1406 movdqu @XMM[3], 0x60($out)
1407 jmp .Lecb_dec_done
1408.align 16
1409.Lecb_dec_six:
1410 call _bsaes_decrypt8
1411 movdqu @XMM[0], 0x00($out) # write output
1412 movdqu @XMM[1], 0x10($out)
1413 movdqu @XMM[6], 0x20($out)
1414 movdqu @XMM[4], 0x30($out)
1415 movdqu @XMM[2], 0x40($out)
1416 movdqu @XMM[7], 0x50($out)
1417 jmp .Lecb_dec_done
1418.align 16
1419.Lecb_dec_five:
1420 call _bsaes_decrypt8
1421 movdqu @XMM[0], 0x00($out) # write output
1422 movdqu @XMM[1], 0x10($out)
1423 movdqu @XMM[6], 0x20($out)
1424 movdqu @XMM[4], 0x30($out)
1425 movdqu @XMM[2], 0x40($out)
1426 jmp .Lecb_dec_done
1427.align 16
1428.Lecb_dec_four:
1429 call _bsaes_decrypt8
1430 movdqu @XMM[0], 0x00($out) # write output
1431 movdqu @XMM[1], 0x10($out)
1432 movdqu @XMM[6], 0x20($out)
1433 movdqu @XMM[4], 0x30($out)
1434 jmp .Lecb_dec_done
1435.align 16
1436.Lecb_dec_three:
1437 call _bsaes_decrypt8
1438 movdqu @XMM[0], 0x00($out) # write output
1439 movdqu @XMM[1], 0x10($out)
1440 movdqu @XMM[6], 0x20($out)
1441 jmp .Lecb_dec_done
1442.align 16
1443.Lecb_dec_two:
1444 call _bsaes_decrypt8
1445 movdqu @XMM[0], 0x00($out) # write output
1446 movdqu @XMM[1], 0x10($out)
1447 jmp .Lecb_dec_done
1448.align 16
1449.Lecb_dec_one:
1450 call _bsaes_decrypt8
1451 movdqu @XMM[0], 0x00($out) # write output
1452 jmp .Lecb_dec_done
1453.align 16
1454.Lecb_dec_short:
1455 lea ($inp), $arg1
1456 lea ($out), $arg2
1457 lea ($key), $arg3
1458 call asm_AES_decrypt
1459 lea 16($inp), $inp
1460 lea 16($out), $out
1461 dec $len
1462 jnz .Lecb_dec_short
1463
1464.Lecb_dec_done:
1465 lea (%rsp),%rax
1466 pxor %xmm0, %xmm0
1467.Lecb_dec_bzero: # wipe key schedule [if any]
1468 movdqa %xmm0, 0x00(%rax)
1469 movdqa %xmm0, 0x10(%rax)
1470 lea 0x20(%rax), %rax
1471 cmp %rax, %rbp
1472 jb .Lecb_dec_bzero
1473
1474 lea (%rbp),%rsp # restore %rsp
1475___
1476$code.=<<___ if ($win64);
1477 movaps 0x40(%rbp), %xmm6
1478 movaps 0x50(%rbp), %xmm7
1479 movaps 0x60(%rbp), %xmm8
1480 movaps 0x70(%rbp), %xmm9
1481 movaps 0x80(%rbp), %xmm10
1482 movaps 0x90(%rbp), %xmm11
1483 movaps 0xa0(%rbp), %xmm12
1484 movaps 0xb0(%rbp), %xmm13
1485 movaps 0xc0(%rbp), %xmm14
1486 movaps 0xd0(%rbp), %xmm15
1487 lea 0xa0(%rbp), %rsp
1488___
1489$code.=<<___;
1490 mov 0x48(%rsp), %r15
1491 mov 0x50(%rsp), %r14
1492 mov 0x58(%rsp), %r13
1493 mov 0x60(%rsp), %r12
1494 mov 0x68(%rsp), %rbx
1495 mov 0x70(%rsp), %rax
1496 lea 0x78(%rsp), %rsp
1497 mov %rax, %rbp
1498.Lecb_dec_epilogue:
1499 ret
1500.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1501___
1502}
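# CBC encryption is inherently sequential (each plaintext block is
# xored with the previous ciphertext before encryption), so the 8-way
# bit-sliced path below handles decryption only; encryption (nonzero
# direction flag) and inputs shorter than 128 bytes are handed off to
# asm_AES_cbc_encrypt.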
1503$code.=<<___;
1504.extern asm_AES_cbc_encrypt
1505.globl bsaes_cbc_encrypt
1506.type bsaes_cbc_encrypt,\@abi-omnipotent
1507.align 16
1508bsaes_cbc_encrypt:
1509___
1510$code.=<<___ if ($win64);
1511 mov 48(%rsp),$arg6 # pull direction flag
1512___
1513$code.=<<___;
1514 cmp \$0,$arg6
1515 jne asm_AES_cbc_encrypt
1516 cmp \$128,$arg3
1517 jb asm_AES_cbc_encrypt
1518
1519 mov %rsp, %rax
1520.Lcbc_dec_prologue:
1521 push %rbp
1522 push %rbx
1523 push %r12
1524 push %r13
1525 push %r14
1526 push %r15
1527 lea -0x48(%rsp), %rsp
1528___
1529$code.=<<___ if ($win64);
1530 mov 0xa0(%rsp),$arg5 # pull ivp
1531 lea -0xa0(%rsp), %rsp
1532 movaps %xmm6, 0x40(%rsp)
1533 movaps %xmm7, 0x50(%rsp)
1534 movaps %xmm8, 0x60(%rsp)
1535 movaps %xmm9, 0x70(%rsp)
1536 movaps %xmm10, 0x80(%rsp)
1537 movaps %xmm11, 0x90(%rsp)
1538 movaps %xmm12, 0xa0(%rsp)
1539 movaps %xmm13, 0xb0(%rsp)
1540 movaps %xmm14, 0xc0(%rsp)
1541 movaps %xmm15, 0xd0(%rsp)
1542.Lcbc_dec_body:
1543___
1544$code.=<<___;
1545 mov %rsp, %rbp # backup %rsp
1546 mov 240($arg4), %eax # rounds
1547 mov $arg1, $inp # backup arguments
1548 mov $arg2, $out
1549 mov $arg3, $len
1550 mov $arg4, $key
1551 mov $arg5, %rbx
1552 shr \$4, $len # bytes to blocks
1553
1554 mov %eax, %edx # rounds
1555 shl \$7, %rax # 128 bytes per inner round key
1556 sub \$`128-32`, %rax # size of bit-sliced key schedule
1557 sub %rax, %rsp
1558
1559 mov %rsp, %rax # pass key schedule
1560 mov $key, %rcx # pass key
1561 mov %edx, %r10d # pass rounds
1562 call _bsaes_key_convert
1563	pxor	(%rsp),%xmm7		# fix up round 0 key
1564 movdqa %xmm6,(%rax) # save last round key
1565 movdqa %xmm7,(%rsp)
1566
1567 movdqu (%rbx), @XMM[15] # load IV
1568 sub \$8,$len
1569.Lcbc_dec_loop:
1570 movdqu 0x00($inp), @XMM[0] # load input
1571 movdqu 0x10($inp), @XMM[1]
1572 movdqu 0x20($inp), @XMM[2]
1573 movdqu 0x30($inp), @XMM[3]
1574 movdqu 0x40($inp), @XMM[4]
1575 movdqu 0x50($inp), @XMM[5]
1576 mov %rsp, %rax # pass key schedule
1577 movdqu 0x60($inp), @XMM[6]
1578 mov %edx,%r10d # pass rounds
1579 movdqu 0x70($inp), @XMM[7]
1580 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1581
1582 call _bsaes_decrypt8
1583
1584 pxor 0x20(%rbp), @XMM[0] # ^= IV
1585 movdqu 0x00($inp), @XMM[8] # re-load input
1586 movdqu 0x10($inp), @XMM[9]
1587 pxor @XMM[8], @XMM[1]
1588 movdqu 0x20($inp), @XMM[10]
1589 pxor @XMM[9], @XMM[6]
1590 movdqu 0x30($inp), @XMM[11]
1591 pxor @XMM[10], @XMM[4]
1592 movdqu 0x40($inp), @XMM[12]
1593 pxor @XMM[11], @XMM[2]
1594 movdqu 0x50($inp), @XMM[13]
1595 pxor @XMM[12], @XMM[7]
1596 movdqu 0x60($inp), @XMM[14]
1597 pxor @XMM[13], @XMM[3]
1598 movdqu 0x70($inp), @XMM[15] # IV
1599 pxor @XMM[14], @XMM[5]
1600 movdqu @XMM[0], 0x00($out) # write output
1601 lea 0x80($inp), $inp
1602 movdqu @XMM[1], 0x10($out)
1603 movdqu @XMM[6], 0x20($out)
1604 movdqu @XMM[4], 0x30($out)
1605 movdqu @XMM[2], 0x40($out)
1606 movdqu @XMM[7], 0x50($out)
1607 movdqu @XMM[3], 0x60($out)
1608 movdqu @XMM[5], 0x70($out)
1609 lea 0x80($out), $out
1610 sub \$8,$len
1611 jnc .Lcbc_dec_loop
1612
1613 add \$8,$len
1614 jz .Lcbc_dec_done
1615
1616 movdqu 0x00($inp), @XMM[0] # load input
1617 mov %rsp, %rax # pass key schedule
1618 mov %edx, %r10d # pass rounds
1619 cmp \$2,$len
1620 jb .Lcbc_dec_one
1621 movdqu 0x10($inp), @XMM[1]
1622 je .Lcbc_dec_two
1623 movdqu 0x20($inp), @XMM[2]
1624 cmp \$4,$len
1625 jb .Lcbc_dec_three
1626 movdqu 0x30($inp), @XMM[3]
1627 je .Lcbc_dec_four
1628 movdqu 0x40($inp), @XMM[4]
1629 cmp \$6,$len
1630 jb .Lcbc_dec_five
1631 movdqu 0x50($inp), @XMM[5]
1632 je .Lcbc_dec_six
1633 movdqu 0x60($inp), @XMM[6]
1634 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1635 call _bsaes_decrypt8
1636 pxor 0x20(%rbp), @XMM[0] # ^= IV
1637 movdqu 0x00($inp), @XMM[8] # re-load input
1638 movdqu 0x10($inp), @XMM[9]
1639 pxor @XMM[8], @XMM[1]
1640 movdqu 0x20($inp), @XMM[10]
1641 pxor @XMM[9], @XMM[6]
1642 movdqu 0x30($inp), @XMM[11]
1643 pxor @XMM[10], @XMM[4]
1644 movdqu 0x40($inp), @XMM[12]
1645 pxor @XMM[11], @XMM[2]
1646 movdqu 0x50($inp), @XMM[13]
1647 pxor @XMM[12], @XMM[7]
1648 movdqu 0x60($inp), @XMM[15] # IV
1649 pxor @XMM[13], @XMM[3]
1650 movdqu @XMM[0], 0x00($out) # write output
1651 movdqu @XMM[1], 0x10($out)
1652 movdqu @XMM[6], 0x20($out)
1653 movdqu @XMM[4], 0x30($out)
1654 movdqu @XMM[2], 0x40($out)
1655 movdqu @XMM[7], 0x50($out)
1656 movdqu @XMM[3], 0x60($out)
1657 jmp .Lcbc_dec_done
1658.align 16
1659.Lcbc_dec_six:
1660 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1661 call _bsaes_decrypt8
1662 pxor 0x20(%rbp), @XMM[0] # ^= IV
1663 movdqu 0x00($inp), @XMM[8] # re-load input
1664 movdqu 0x10($inp), @XMM[9]
1665 pxor @XMM[8], @XMM[1]
1666 movdqu 0x20($inp), @XMM[10]
1667 pxor @XMM[9], @XMM[6]
1668 movdqu 0x30($inp), @XMM[11]
1669 pxor @XMM[10], @XMM[4]
1670 movdqu 0x40($inp), @XMM[12]
1671 pxor @XMM[11], @XMM[2]
1672 movdqu 0x50($inp), @XMM[15] # IV
1673 pxor @XMM[12], @XMM[7]
1674 movdqu @XMM[0], 0x00($out) # write output
1675 movdqu @XMM[1], 0x10($out)
1676 movdqu @XMM[6], 0x20($out)
1677 movdqu @XMM[4], 0x30($out)
1678 movdqu @XMM[2], 0x40($out)
1679 movdqu @XMM[7], 0x50($out)
1680 jmp .Lcbc_dec_done
1681.align 16
1682.Lcbc_dec_five:
1683 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1684 call _bsaes_decrypt8
1685 pxor 0x20(%rbp), @XMM[0] # ^= IV
1686 movdqu 0x00($inp), @XMM[8] # re-load input
1687 movdqu 0x10($inp), @XMM[9]
1688 pxor @XMM[8], @XMM[1]
1689 movdqu 0x20($inp), @XMM[10]
1690 pxor @XMM[9], @XMM[6]
1691 movdqu 0x30($inp), @XMM[11]
1692 pxor @XMM[10], @XMM[4]
1693 movdqu 0x40($inp), @XMM[15] # IV
1694 pxor @XMM[11], @XMM[2]
1695 movdqu @XMM[0], 0x00($out) # write output
1696 movdqu @XMM[1], 0x10($out)
1697 movdqu @XMM[6], 0x20($out)
1698 movdqu @XMM[4], 0x30($out)
1699 movdqu @XMM[2], 0x40($out)
1700 jmp .Lcbc_dec_done
1701.align 16
1702.Lcbc_dec_four:
1703 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1704 call _bsaes_decrypt8
1705 pxor 0x20(%rbp), @XMM[0] # ^= IV
1706 movdqu 0x00($inp), @XMM[8] # re-load input
1707 movdqu 0x10($inp), @XMM[9]
1708 pxor @XMM[8], @XMM[1]
1709 movdqu 0x20($inp), @XMM[10]
1710 pxor @XMM[9], @XMM[6]
1711 movdqu 0x30($inp), @XMM[15] # IV
1712 pxor @XMM[10], @XMM[4]
1713 movdqu @XMM[0], 0x00($out) # write output
1714 movdqu @XMM[1], 0x10($out)
1715 movdqu @XMM[6], 0x20($out)
1716 movdqu @XMM[4], 0x30($out)
1717 jmp .Lcbc_dec_done
1718.align 16
1719.Lcbc_dec_three:
1720 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1721 call _bsaes_decrypt8
1722 pxor 0x20(%rbp), @XMM[0] # ^= IV
1723 movdqu 0x00($inp), @XMM[8] # re-load input
1724 movdqu 0x10($inp), @XMM[9]
1725 pxor @XMM[8], @XMM[1]
1726 movdqu 0x20($inp), @XMM[15] # IV
1727 pxor @XMM[9], @XMM[6]
1728 movdqu @XMM[0], 0x00($out) # write output
1729 movdqu @XMM[1], 0x10($out)
1730 movdqu @XMM[6], 0x20($out)
1731 jmp .Lcbc_dec_done
1732.align 16
1733.Lcbc_dec_two:
1734 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1735 call _bsaes_decrypt8
1736 pxor 0x20(%rbp), @XMM[0] # ^= IV
1737 movdqu 0x00($inp), @XMM[8] # re-load input
1738 movdqu 0x10($inp), @XMM[15] # IV
1739 pxor @XMM[8], @XMM[1]
1740 movdqu @XMM[0], 0x00($out) # write output
1741 movdqu @XMM[1], 0x10($out)
1742 jmp .Lcbc_dec_done
1743.align 16
1744.Lcbc_dec_one:
1745 lea ($inp), $arg1
1746 lea 0x20(%rbp), $arg2 # buffer output
1747 lea ($key), $arg3
1748 call asm_AES_decrypt # doesn't touch %xmm
1749 pxor 0x20(%rbp), @XMM[15] # ^= IV
1750 movdqu @XMM[15], ($out) # write output
1751 movdqa @XMM[0], @XMM[15] # IV
1752
1753.Lcbc_dec_done:
1754 movdqu @XMM[15], (%rbx) # return IV
1755 lea (%rsp), %rax
1756 pxor %xmm0, %xmm0
1757.Lcbc_dec_bzero: # wipe key schedule [if any]
1758 movdqa %xmm0, 0x00(%rax)
1759 movdqa %xmm0, 0x10(%rax)
1760 lea 0x20(%rax), %rax
1761 cmp %rax, %rbp
1762 ja .Lcbc_dec_bzero
1763
1764 lea (%rbp),%rsp # restore %rsp
1765___
1766$code.=<<___ if ($win64);
1767 movaps 0x40(%rbp), %xmm6
1768 movaps 0x50(%rbp), %xmm7
1769 movaps 0x60(%rbp), %xmm8
1770 movaps 0x70(%rbp), %xmm9
1771 movaps 0x80(%rbp), %xmm10
1772 movaps 0x90(%rbp), %xmm11
1773 movaps 0xa0(%rbp), %xmm12
1774 movaps 0xb0(%rbp), %xmm13
1775 movaps 0xc0(%rbp), %xmm14
1776 movaps 0xd0(%rbp), %xmm15
1777 lea 0xa0(%rbp), %rsp
1778___
1779$code.=<<___;
1780 mov 0x48(%rsp), %r15
1781 mov 0x50(%rsp), %r14
1782 mov 0x58(%rsp), %r13
1783 mov 0x60(%rsp), %r12
1784 mov 0x68(%rsp), %rbx
1785 mov 0x70(%rsp), %rax
1786 lea 0x78(%rsp), %rsp
1787 mov %rax, %rbp
1788.Lcbc_dec_epilogue:
1789 ret
1790.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1791
1792.globl bsaes_ctr32_encrypt_blocks
1793.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1794.align 16
1795bsaes_ctr32_encrypt_blocks:
1796 mov %rsp, %rax
1797.Lctr_enc_prologue:
1798 push %rbp
1799 push %rbx
1800 push %r12
1801 push %r13
1802 push %r14
1803 push %r15
1804 lea -0x48(%rsp), %rsp
1805___
1806$code.=<<___ if ($win64);
1807 mov 0xa0(%rsp),$arg5 # pull ivp
1808 lea -0xa0(%rsp), %rsp
1809 movaps %xmm6, 0x40(%rsp)
1810 movaps %xmm7, 0x50(%rsp)
1811 movaps %xmm8, 0x60(%rsp)
1812 movaps %xmm9, 0x70(%rsp)
1813 movaps %xmm10, 0x80(%rsp)
1814 movaps %xmm11, 0x90(%rsp)
1815 movaps %xmm12, 0xa0(%rsp)
1816 movaps %xmm13, 0xb0(%rsp)
1817 movaps %xmm14, 0xc0(%rsp)
1818 movaps %xmm15, 0xd0(%rsp)
1819.Lctr_enc_body:
1820___
1821$code.=<<___;
1822 mov %rsp, %rbp # backup %rsp
1823 movdqu ($arg5), %xmm0 # load counter
1824 mov 240($arg4), %eax # rounds
1825 mov $arg1, $inp # backup arguments
1826 mov $arg2, $out
1827 mov $arg3, $len
1828 mov $arg4, $key
1829 movdqa %xmm0, 0x20(%rbp) # copy counter
1830 cmp \$8, $arg3
1831 jb .Lctr_enc_short
1832
1833 mov %eax, %ebx # rounds
1834 shl \$7, %rax # 128 bytes per inner round key
1835 sub \$`128-32`, %rax # size of bit-sliced key schedule
1836 sub %rax, %rsp
1837
1838 mov %rsp, %rax # pass key schedule
1839 mov $key, %rcx # pass key
1840 mov %ebx, %r10d # pass rounds
1841 call _bsaes_key_convert
1842 pxor %xmm6,%xmm7 # fix up last round key
1843 movdqa %xmm7,(%rax) # save last round key
1844
1845 movdqa (%rsp), @XMM[9] # load round0 key
1846 lea .LADD1(%rip), %r11
1847 movdqa 0x20(%rbp), @XMM[0] # counter copy
1848 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1849 pshufb @XMM[8], @XMM[9] # byte swap upper part
1850 pshufb @XMM[8], @XMM[0]
1851 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1852 jmp .Lctr_enc_loop
1853.align 16
1854.Lctr_enc_loop:
1855 movdqa @XMM[0], 0x20(%rbp) # save counter
1856 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1857 movdqa @XMM[0], @XMM[2]
1858 paddd 0x00(%r11), @XMM[1] # .LADD1
1859 movdqa @XMM[0], @XMM[3]
1860 paddd 0x10(%r11), @XMM[2] # .LADD2
1861 movdqa @XMM[0], @XMM[4]
1862 paddd 0x20(%r11), @XMM[3] # .LADD3
1863 movdqa @XMM[0], @XMM[5]
1864 paddd 0x30(%r11), @XMM[4] # .LADD4
1865 movdqa @XMM[0], @XMM[6]
1866 paddd 0x40(%r11), @XMM[5] # .LADD5
1867 movdqa @XMM[0], @XMM[7]
1868 paddd 0x50(%r11), @XMM[6] # .LADD6
1869 paddd 0x60(%r11), @XMM[7] # .LADD7
1870
1871	# Borrow the prologue from _bsaes_encrypt8, taking the opportunity
1872	# to flip byte order in the 32-bit counters
1873 movdqa (%rsp), @XMM[9] # round 0 key
1874 lea 0x10(%rsp), %rax # pass key schedule
1875 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1876 pxor @XMM[9], @XMM[0] # xor with round0 key
1877 pxor @XMM[9], @XMM[1]
1878 pshufb @XMM[8], @XMM[0]
1879 pxor @XMM[9], @XMM[2]
1880 pshufb @XMM[8], @XMM[1]
1881 pxor @XMM[9], @XMM[3]
1882 pshufb @XMM[8], @XMM[2]
1883 pxor @XMM[9], @XMM[4]
1884 pshufb @XMM[8], @XMM[3]
1885 pxor @XMM[9], @XMM[5]
1886 pshufb @XMM[8], @XMM[4]
1887 pxor @XMM[9], @XMM[6]
1888 pshufb @XMM[8], @XMM[5]
1889 pxor @XMM[9], @XMM[7]
1890 pshufb @XMM[8], @XMM[6]
1891 lea .LBS0(%rip), %r11 # constants table
1892 pshufb @XMM[8], @XMM[7]
1893 mov %ebx,%r10d # pass rounds
1894
1895 call _bsaes_encrypt8_bitslice
1896
1897 sub \$8,$len
1898 jc .Lctr_enc_loop_done
1899
1900 movdqu 0x00($inp), @XMM[8] # load input
1901 movdqu 0x10($inp), @XMM[9]
1902 movdqu 0x20($inp), @XMM[10]
1903 movdqu 0x30($inp), @XMM[11]
1904 movdqu 0x40($inp), @XMM[12]
1905 movdqu 0x50($inp), @XMM[13]
1906 movdqu 0x60($inp), @XMM[14]
1907 movdqu 0x70($inp), @XMM[15]
1908 lea 0x80($inp),$inp
1909 pxor @XMM[0], @XMM[8]
1910 movdqa 0x20(%rbp), @XMM[0] # load counter
1911 pxor @XMM[9], @XMM[1]
1912 movdqu @XMM[8], 0x00($out) # write output
1913 pxor @XMM[10], @XMM[4]
1914 movdqu @XMM[1], 0x10($out)
1915 pxor @XMM[11], @XMM[6]
1916 movdqu @XMM[4], 0x20($out)
1917 pxor @XMM[12], @XMM[3]
1918 movdqu @XMM[6], 0x30($out)
1919 pxor @XMM[13], @XMM[7]
1920 movdqu @XMM[3], 0x40($out)
1921 pxor @XMM[14], @XMM[2]
1922 movdqu @XMM[7], 0x50($out)
1923 pxor @XMM[15], @XMM[5]
1924 movdqu @XMM[2], 0x60($out)
1925 lea .LADD1(%rip), %r11
1926 movdqu @XMM[5], 0x70($out)
1927 lea 0x80($out), $out
1928 paddd 0x70(%r11), @XMM[0] # .LADD8
1929 jnz .Lctr_enc_loop
1930
1931 jmp .Lctr_enc_done
1932.align 16
1933.Lctr_enc_loop_done:
1934 add \$8, $len
1935 movdqu 0x00($inp), @XMM[8] # load input
1936 pxor @XMM[8], @XMM[0]
1937 movdqu @XMM[0], 0x00($out) # write output
1938 cmp \$2,$len
1939 jb .Lctr_enc_done
1940 movdqu 0x10($inp), @XMM[9]
1941 pxor @XMM[9], @XMM[1]
1942 movdqu @XMM[1], 0x10($out)
1943 je .Lctr_enc_done
1944 movdqu 0x20($inp), @XMM[10]
1945 pxor @XMM[10], @XMM[4]
1946 movdqu @XMM[4], 0x20($out)
1947 cmp \$4,$len
1948 jb .Lctr_enc_done
1949 movdqu 0x30($inp), @XMM[11]
1950 pxor @XMM[11], @XMM[6]
1951 movdqu @XMM[6], 0x30($out)
1952 je .Lctr_enc_done
1953 movdqu 0x40($inp), @XMM[12]
1954 pxor @XMM[12], @XMM[3]
1955 movdqu @XMM[3], 0x40($out)
1956 cmp \$6,$len
1957 jb .Lctr_enc_done
1958 movdqu 0x50($inp), @XMM[13]
1959 pxor @XMM[13], @XMM[7]
1960 movdqu @XMM[7], 0x50($out)
1961 je .Lctr_enc_done
1962 movdqu 0x60($inp), @XMM[14]
1963 pxor @XMM[14], @XMM[2]
1964 movdqu @XMM[2], 0x60($out)
1965 jmp .Lctr_enc_done
1966
1967.align 16
1968.Lctr_enc_short:
1969 lea 0x20(%rbp), $arg1
1970 lea 0x30(%rbp), $arg2
1971 lea ($key), $arg3
1972 call asm_AES_encrypt
1973 movdqu ($inp), @XMM[1]
1974 lea 16($inp), $inp
1975 mov 0x2c(%rbp), %eax # load 32-bit counter
1976 bswap %eax
1977 pxor 0x30(%rbp), @XMM[1]
1978 inc %eax # increment
1979 movdqu @XMM[1], ($out)
1980 bswap %eax
1981 lea 16($out), $out
1982 mov %eax, 0x2c(%rsp) # save 32-bit counter
1983 dec $len
1984 jnz .Lctr_enc_short
1985
1986.Lctr_enc_done:
1987 lea (%rsp), %rax
1988 pxor %xmm0, %xmm0
1989.Lctr_enc_bzero: # wipe key schedule [if any]
1990 movdqa %xmm0, 0x00(%rax)
1991 movdqa %xmm0, 0x10(%rax)
1992 lea 0x20(%rax), %rax
1993 cmp %rax, %rbp
1994 ja .Lctr_enc_bzero
1995
1996 lea (%rbp),%rsp # restore %rsp
1997___
1998$code.=<<___ if ($win64);
1999 movaps 0x40(%rbp), %xmm6
2000 movaps 0x50(%rbp), %xmm7
2001 movaps 0x60(%rbp), %xmm8
2002 movaps 0x70(%rbp), %xmm9
2003 movaps 0x80(%rbp), %xmm10
2004 movaps 0x90(%rbp), %xmm11
2005 movaps 0xa0(%rbp), %xmm12
2006 movaps 0xb0(%rbp), %xmm13
2007 movaps 0xc0(%rbp), %xmm14
2008 movaps 0xd0(%rbp), %xmm15
2009 lea 0xa0(%rbp), %rsp
2010___
2011$code.=<<___;
2012 mov 0x48(%rsp), %r15
2013 mov 0x50(%rsp), %r14
2014 mov 0x58(%rsp), %r13
2015 mov 0x60(%rsp), %r12
2016 mov 0x68(%rsp), %rbx
2017 mov 0x70(%rsp), %rax
2018 lea 0x78(%rsp), %rsp
2019 mov %rax, %rbp
2020.Lctr_enc_epilogue:
2021 ret
2022.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2023___
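# For reference, the mode implemented above in C terms (a sketch, with
# GETU32/PUTU32 the usual big-endian 32-bit load/store helpers and
# AES_encrypt a one-block primitive):
#
#	while (blocks--) {
#		unsigned char ks[16];
#		AES_encrypt(ivec, ks, key);
#		for (int i = 0; i < 16; i++)
#			*out++ = *in++ ^ ks[i];
#		PUTU32(ivec + 12, GETU32(ivec + 12) + 1);	/* ctr32 */
#	}
#
# Only the low 32 bits of the IV are treated as a counter, which is
# what counter-based modes such as GCM require.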
2024######################################################################
2025# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2026# const AES_KEY *key1, const AES_KEY *key2,
2027# const unsigned char iv[16]);
2028#
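# The tweak chain: the initial tweak is AES_encrypt(iv, key2) ("generate
# initial tweak" below), block i is processed as
#	out = E_key1(in ^ T_i) ^ T_i,	T_{i+1} = T_i * x  in GF(2^128),
# and the paddq/pcmpgtd/pand/pxor sequences around .Lxts_magic implement
# the multiplication by x. A portable C sketch of that update (function
# name is ours, not part of the module):
#
#	static void xts_tweak_double(unsigned char t[16])
#	{
#		unsigned int c, carry = 0;
#		for (int i = 0; i < 16; i++) {	/* shift left by 1 bit */
#			c = t[i] >> 7;
#			t[i] = (unsigned char)((t[i] << 1) | carry);
#			carry = c;
#		}
#		if (carry)		/* reduce: x^128 = x^7+x^2+x+1 */
#			t[0] ^= 0x87;	/* cf. .Lxts_magic */
#	}
#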
2029my ($twmask,$twres,$twtmp)=@XMM[13..15];
2030$code.=<<___;
2031.globl bsaes_xts_encrypt
2032.type bsaes_xts_encrypt,\@abi-omnipotent
2033.align 16
2034bsaes_xts_encrypt:
2035 mov %rsp, %rax
2036.Lxts_enc_prologue:
2037 push %rbp
2038 push %rbx
2039 push %r12
2040 push %r13
2041 push %r14
2042 push %r15
2043 lea -0x48(%rsp), %rsp
2044___
2045$code.=<<___ if ($win64);
2046 mov 0xa0(%rsp),$arg5 # pull key2
2047 mov 0xa8(%rsp),$arg6 # pull ivp
2048 lea -0xa0(%rsp), %rsp
2049 movaps %xmm6, 0x40(%rsp)
2050 movaps %xmm7, 0x50(%rsp)
2051 movaps %xmm8, 0x60(%rsp)
2052 movaps %xmm9, 0x70(%rsp)
2053 movaps %xmm10, 0x80(%rsp)
2054 movaps %xmm11, 0x90(%rsp)
2055 movaps %xmm12, 0xa0(%rsp)
2056 movaps %xmm13, 0xb0(%rsp)
2057 movaps %xmm14, 0xc0(%rsp)
2058 movaps %xmm15, 0xd0(%rsp)
2059.Lxts_enc_body:
2060___
2061$code.=<<___;
2062 mov %rsp, %rbp # backup %rsp
2063 mov $arg1, $inp # backup arguments
2064 mov $arg2, $out
2065 mov $arg3, $len
2066 mov $arg4, $key
2067
2068 lea ($arg6), $arg1
2069 lea 0x20(%rbp), $arg2
2070 lea ($arg5), $arg3
2071 call asm_AES_encrypt # generate initial tweak
2072
2073 mov 240($key), %eax # rounds
2074 mov $len, %rbx # backup $len
2075
2076 mov %eax, %edx # rounds
2077 shl \$7, %rax # 128 bytes per inner round key
2078 sub \$`128-32`, %rax # size of bit-sliced key schedule
2079 sub %rax, %rsp
2080
2081 mov %rsp, %rax # pass key schedule
2082 mov $key, %rcx # pass key
2083 mov %edx, %r10d # pass rounds
2084 call _bsaes_key_convert
2085 pxor %xmm6, %xmm7 # fix up last round key
2086 movdqa %xmm7, (%rax) # save last round key
2087
2088 and \$-16, $len
2089 sub \$0x80, %rsp # place for tweak[8]
2090 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2091
2092 pxor $twtmp, $twtmp
2093 movdqa .Lxts_magic(%rip), $twmask
2094 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2095
2096 sub \$0x80, $len
2097 jc .Lxts_enc_short
2098 jmp .Lxts_enc_loop
2099
2100.align 16
2101.Lxts_enc_loop:
2102___
2103 for ($i=0;$i<7;$i++) {
2104 $code.=<<___;
2105 pshufd \$0x13, $twtmp, $twres
2106 pxor $twtmp, $twtmp
2107 movdqa @XMM[7], @XMM[$i]
2108 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2109 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2110 pand $twmask, $twres # isolate carry and residue
2111 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2112 pxor $twres, @XMM[7]
2113___
2114 $code.=<<___ if ($i>=1);
2115 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2116___
2117 $code.=<<___ if ($i>=2);
2118 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2119___
2120 }
2121$code.=<<___;
2122 movdqu 0x60($inp), @XMM[8+6]
2123 pxor @XMM[8+5], @XMM[5]
2124 movdqu 0x70($inp), @XMM[8+7]
2125 lea 0x80($inp), $inp
2126 movdqa @XMM[7], 0x70(%rsp)
2127 pxor @XMM[8+6], @XMM[6]
2128 lea 0x80(%rsp), %rax # pass key schedule
2129 pxor @XMM[8+7], @XMM[7]
2130 mov %edx, %r10d # pass rounds
2131
2132 call _bsaes_encrypt8
2133
2134 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2135 pxor 0x10(%rsp), @XMM[1]
2136 movdqu @XMM[0], 0x00($out) # write output
2137 pxor 0x20(%rsp), @XMM[4]
2138 movdqu @XMM[1], 0x10($out)
2139 pxor 0x30(%rsp), @XMM[6]
2140 movdqu @XMM[4], 0x20($out)
2141 pxor 0x40(%rsp), @XMM[3]
2142 movdqu @XMM[6], 0x30($out)
2143 pxor 0x50(%rsp), @XMM[7]
2144 movdqu @XMM[3], 0x40($out)
2145 pxor 0x60(%rsp), @XMM[2]
2146 movdqu @XMM[7], 0x50($out)
2147 pxor 0x70(%rsp), @XMM[5]
2148 movdqu @XMM[2], 0x60($out)
2149 movdqu @XMM[5], 0x70($out)
2150 lea 0x80($out), $out
2151
2152 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2153 pxor $twtmp, $twtmp
2154 movdqa .Lxts_magic(%rip), $twmask
2155 pcmpgtd @XMM[7], $twtmp
2156 pshufd \$0x13, $twtmp, $twres
2157 pxor $twtmp, $twtmp
2158 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2159 pand $twmask, $twres # isolate carry and residue
2160 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2161 pxor $twres, @XMM[7]
2162
2163 sub \$0x80,$len
2164 jnc .Lxts_enc_loop
2165
2166.Lxts_enc_short:
2167 add \$0x80, $len
2168 jz .Lxts_enc_done
2169___
2170 for ($i=0;$i<7;$i++) {
2171 $code.=<<___;
2172 pshufd \$0x13, $twtmp, $twres
2173 pxor $twtmp, $twtmp
2174 movdqa @XMM[7], @XMM[$i]
2175 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2176 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2177 pand $twmask, $twres # isolate carry and residue
2178 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2179 pxor $twres, @XMM[7]
2180___
2181 $code.=<<___ if ($i>=1);
2182 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2183 cmp \$`0x10*$i`,$len
2184 je .Lxts_enc_$i
2185___
2186 $code.=<<___ if ($i>=2);
2187 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2188___
2189 }
2190$code.=<<___;
2191 movdqu 0x60($inp), @XMM[8+6]
2192 pxor @XMM[8+5], @XMM[5]
2193 movdqa @XMM[7], 0x70(%rsp)
2194 lea 0x70($inp), $inp
2195 pxor @XMM[8+6], @XMM[6]
2196 lea 0x80(%rsp), %rax # pass key schedule
2197 mov %edx, %r10d # pass rounds
2198
2199 call _bsaes_encrypt8
2200
2201 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2202 pxor 0x10(%rsp), @XMM[1]
2203 movdqu @XMM[0], 0x00($out) # write output
2204 pxor 0x20(%rsp), @XMM[4]
2205 movdqu @XMM[1], 0x10($out)
2206 pxor 0x30(%rsp), @XMM[6]
2207 movdqu @XMM[4], 0x20($out)
2208 pxor 0x40(%rsp), @XMM[3]
2209 movdqu @XMM[6], 0x30($out)
2210 pxor 0x50(%rsp), @XMM[7]
2211 movdqu @XMM[3], 0x40($out)
2212 pxor 0x60(%rsp), @XMM[2]
2213 movdqu @XMM[7], 0x50($out)
2214 movdqu @XMM[2], 0x60($out)
2215 lea 0x70($out), $out
2216
2217 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2218 jmp .Lxts_enc_done
2219.align 16
2220.Lxts_enc_6:
2221 pxor @XMM[8+4], @XMM[4]
2222 lea 0x60($inp), $inp
2223 pxor @XMM[8+5], @XMM[5]
2224 lea 0x80(%rsp), %rax # pass key schedule
2225 mov %edx, %r10d # pass rounds
2226
2227 call _bsaes_encrypt8
2228
2229 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2230 pxor 0x10(%rsp), @XMM[1]
2231 movdqu @XMM[0], 0x00($out) # write output
2232 pxor 0x20(%rsp), @XMM[4]
2233 movdqu @XMM[1], 0x10($out)
2234 pxor 0x30(%rsp), @XMM[6]
2235 movdqu @XMM[4], 0x20($out)
2236 pxor 0x40(%rsp), @XMM[3]
2237 movdqu @XMM[6], 0x30($out)
2238 pxor 0x50(%rsp), @XMM[7]
2239 movdqu @XMM[3], 0x40($out)
2240 movdqu @XMM[7], 0x50($out)
2241 lea 0x60($out), $out
2242
2243 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2244 jmp .Lxts_enc_done
2245.align 16
2246.Lxts_enc_5:
2247 pxor @XMM[8+3], @XMM[3]
2248 lea 0x50($inp), $inp
2249 pxor @XMM[8+4], @XMM[4]
2250 lea 0x80(%rsp), %rax # pass key schedule
2251 mov %edx, %r10d # pass rounds
2252
2253 call _bsaes_encrypt8
2254
2255 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2256 pxor 0x10(%rsp), @XMM[1]
2257 movdqu @XMM[0], 0x00($out) # write output
2258 pxor 0x20(%rsp), @XMM[4]
2259 movdqu @XMM[1], 0x10($out)
2260 pxor 0x30(%rsp), @XMM[6]
2261 movdqu @XMM[4], 0x20($out)
2262 pxor 0x40(%rsp), @XMM[3]
2263 movdqu @XMM[6], 0x30($out)
2264 movdqu @XMM[3], 0x40($out)
2265 lea 0x50($out), $out
2266
2267 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2268 jmp .Lxts_enc_done
2269.align 16
2270.Lxts_enc_4:
2271 pxor @XMM[8+2], @XMM[2]
2272 lea 0x40($inp), $inp
2273 pxor @XMM[8+3], @XMM[3]
2274 lea 0x80(%rsp), %rax # pass key schedule
2275 mov %edx, %r10d # pass rounds
2276
2277 call _bsaes_encrypt8
2278
2279 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2280 pxor 0x10(%rsp), @XMM[1]
2281 movdqu @XMM[0], 0x00($out) # write output
2282 pxor 0x20(%rsp), @XMM[4]
2283 movdqu @XMM[1], 0x10($out)
2284 pxor 0x30(%rsp), @XMM[6]
2285 movdqu @XMM[4], 0x20($out)
2286 movdqu @XMM[6], 0x30($out)
2287 lea 0x40($out), $out
2288
2289 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2290 jmp .Lxts_enc_done
2291.align 16
2292.Lxts_enc_3:
2293 pxor @XMM[8+1], @XMM[1]
2294 lea 0x30($inp), $inp
2295 pxor @XMM[8+2], @XMM[2]
2296 lea 0x80(%rsp), %rax # pass key schedule
2297 mov %edx, %r10d # pass rounds
2298
2299 call _bsaes_encrypt8
2300
2301 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2302 pxor 0x10(%rsp), @XMM[1]
2303 movdqu @XMM[0], 0x00($out) # write output
2304 pxor 0x20(%rsp), @XMM[4]
2305 movdqu @XMM[1], 0x10($out)
2306 movdqu @XMM[4], 0x20($out)
2307 lea 0x30($out), $out
2308
2309 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2310 jmp .Lxts_enc_done
2311.align 16
2312.Lxts_enc_2:
2313 pxor @XMM[8+0], @XMM[0]
2314 lea 0x20($inp), $inp
2315 pxor @XMM[8+1], @XMM[1]
2316 lea 0x80(%rsp), %rax # pass key schedule
2317 mov %edx, %r10d # pass rounds
2318
2319 call _bsaes_encrypt8
2320
2321 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2322 pxor 0x10(%rsp), @XMM[1]
2323 movdqu @XMM[0], 0x00($out) # write output
2324 movdqu @XMM[1], 0x10($out)
2325 lea 0x20($out), $out
2326
2327 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2328 jmp .Lxts_enc_done
2329.align 16
2330.Lxts_enc_1:
2331 pxor @XMM[0], @XMM[8]
2332 lea 0x10($inp), $inp
2333 movdqa @XMM[8], 0x20(%rbp)
2334 lea 0x20(%rbp), $arg1
2335 lea 0x20(%rbp), $arg2
2336 lea ($key), $arg3
2337 call asm_AES_encrypt # doesn't touch %xmm
2338 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2339 #pxor @XMM[8], @XMM[0]
2340 #lea 0x80(%rsp), %rax # pass key schedule
2341 #mov %edx, %r10d # pass rounds
2342 #call _bsaes_encrypt8
2343 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2344 movdqu @XMM[0], 0x00($out) # write output
2345 lea 0x10($out), $out
2346
2347 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2348
2349.Lxts_enc_done:
2350 and \$15, %ebx
2351 jz .Lxts_enc_ret
2352 mov $out, %rdx
2353
2354.Lxts_enc_steal:
2355 movzb ($inp), %eax
2356 movzb -16(%rdx), %ecx
2357 lea 1($inp), $inp
2358 mov %al, -16(%rdx)
2359 mov %cl, 0(%rdx)
2360 lea 1(%rdx), %rdx
2361 sub \$1,%ebx
2362 jnz .Lxts_enc_steal
2363
2364 movdqu -16($out), @XMM[0]
2365 lea 0x20(%rbp), $arg1
2366 pxor @XMM[7], @XMM[0]
2367 lea 0x20(%rbp), $arg2
2368 movdqa @XMM[0], 0x20(%rbp)
2369 lea ($key), $arg3
2370 call asm_AES_encrypt # doesn't touch %xmm
2371 pxor 0x20(%rbp), @XMM[7]
2372 movdqu @XMM[7], -16($out)
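	# (Standard ciphertext stealing: the loop above moved the leading
	#  tail bytes of the last full ciphertext block out to the partial
	#  block's position and replaced them with the remaining plaintext;
	#  re-encrypting that block with the final tweak now yields the
	#  last full ciphertext block.)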
2373
2374.Lxts_enc_ret:
2375 lea (%rsp), %rax
2376 pxor %xmm0, %xmm0
2377.Lxts_enc_bzero: # wipe key schedule [if any]
2378 movdqa %xmm0, 0x00(%rax)
2379 movdqa %xmm0, 0x10(%rax)
2380 lea 0x20(%rax), %rax
2381 cmp %rax, %rbp
2382 ja .Lxts_enc_bzero
2383
2384 lea (%rbp),%rsp # restore %rsp
2385___
2386$code.=<<___ if ($win64);
2387 movaps 0x40(%rbp), %xmm6
2388 movaps 0x50(%rbp), %xmm7
2389 movaps 0x60(%rbp), %xmm8
2390 movaps 0x70(%rbp), %xmm9
2391 movaps 0x80(%rbp), %xmm10
2392 movaps 0x90(%rbp), %xmm11
2393 movaps 0xa0(%rbp), %xmm12
2394 movaps 0xb0(%rbp), %xmm13
2395 movaps 0xc0(%rbp), %xmm14
2396 movaps 0xd0(%rbp), %xmm15
2397 lea 0xa0(%rbp), %rsp
2398___
2399$code.=<<___;
2400 mov 0x48(%rsp), %r15
2401 mov 0x50(%rsp), %r14
2402 mov 0x58(%rsp), %r13
2403 mov 0x60(%rsp), %r12
2404 mov 0x68(%rsp), %rbx
2405 mov 0x70(%rsp), %rax
2406 lea 0x78(%rsp), %rsp
2407 mov %rax, %rbp
2408.Lxts_enc_epilogue:
2409 ret
2410.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2411
2412.globl bsaes_xts_decrypt
2413.type bsaes_xts_decrypt,\@abi-omnipotent
2414.align 16
2415bsaes_xts_decrypt:
2416 mov %rsp, %rax
2417.Lxts_dec_prologue:
2418 push %rbp
2419 push %rbx
2420 push %r12
2421 push %r13
2422 push %r14
2423 push %r15
2424 lea -0x48(%rsp), %rsp
2425___
2426$code.=<<___ if ($win64);
2427 mov 0xa0(%rsp),$arg5 # pull key2
2428 mov 0xa8(%rsp),$arg6 # pull ivp
2429 lea -0xa0(%rsp), %rsp
2430 movaps %xmm6, 0x40(%rsp)
2431 movaps %xmm7, 0x50(%rsp)
2432 movaps %xmm8, 0x60(%rsp)
2433 movaps %xmm9, 0x70(%rsp)
2434 movaps %xmm10, 0x80(%rsp)
2435 movaps %xmm11, 0x90(%rsp)
2436 movaps %xmm12, 0xa0(%rsp)
2437 movaps %xmm13, 0xb0(%rsp)
2438 movaps %xmm14, 0xc0(%rsp)
2439 movaps %xmm15, 0xd0(%rsp)
2440.Lxts_dec_body:
2441___
2442$code.=<<___;
2443 mov %rsp, %rbp # backup %rsp
2444 mov $arg1, $inp # backup arguments
2445 mov $arg2, $out
2446 mov $arg3, $len
2447 mov $arg4, $key
2448
2449 lea ($arg6), $arg1
2450 lea 0x20(%rbp), $arg2
2451 lea ($arg5), $arg3
2452 call asm_AES_encrypt # generate initial tweak
2453
2454 mov 240($key), %eax # rounds
2455 mov $len, %rbx # backup $len
2456
2457 mov %eax, %edx # rounds
2458 shl \$7, %rax # 128 bytes per inner round key
2459 sub \$`128-32`, %rax # size of bit-sliced key schedule
2460 sub %rax, %rsp
2461
2462 mov %rsp, %rax # pass key schedule
2463 mov $key, %rcx # pass key
2464 mov %edx, %r10d # pass rounds
2465 call _bsaes_key_convert
2466 pxor (%rsp), %xmm7 # fix up round 0 key
2467 movdqa %xmm6, (%rax) # save last round key
2468 movdqa %xmm7, (%rsp)
2469
2470 xor %eax, %eax # if ($len%16) len-=16;
2471 and \$-16, $len
2472 test \$15, %ebx
2473 setnz %al
2474 shl \$4, %rax
2475 sub %rax, $len
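	# (One full block is held back whenever the length is not a
	#  multiple of 16: decrypt-side ciphertext stealing must process
	#  the last full block with the *next* tweak before the partial
	#  block can be decrypted with the current one, cf. .Lxts_dec_done.)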
2476
2477 sub \$0x80, %rsp # place for tweak[8]
2478 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2479
2480 pxor $twtmp, $twtmp
2481 movdqa .Lxts_magic(%rip), $twmask
2482 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2483
2484 sub \$0x80, $len
2485 jc .Lxts_dec_short
2486 jmp .Lxts_dec_loop
2487
2488.align 16
2489.Lxts_dec_loop:
2490___
2491 for ($i=0;$i<7;$i++) {
2492 $code.=<<___;
2493 pshufd \$0x13, $twtmp, $twres
2494 pxor $twtmp, $twtmp
2495 movdqa @XMM[7], @XMM[$i]
2496 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2497 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2498 pand $twmask, $twres # isolate carry and residue
2499 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2500 pxor $twres, @XMM[7]
2501___
2502 $code.=<<___ if ($i>=1);
2503 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2504___
2505 $code.=<<___ if ($i>=2);
2506 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2507___
2508 }
2509$code.=<<___;
2510 movdqu 0x60($inp), @XMM[8+6]
2511 pxor @XMM[8+5], @XMM[5]
2512 movdqu 0x70($inp), @XMM[8+7]
2513 lea 0x80($inp), $inp
2514 movdqa @XMM[7], 0x70(%rsp)
2515 pxor @XMM[8+6], @XMM[6]
2516 lea 0x80(%rsp), %rax # pass key schedule
2517 pxor @XMM[8+7], @XMM[7]
2518 mov %edx, %r10d # pass rounds
2519
2520 call _bsaes_decrypt8
2521
2522 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2523 pxor 0x10(%rsp), @XMM[1]
2524 movdqu @XMM[0], 0x00($out) # write output
2525 pxor 0x20(%rsp), @XMM[6]
2526 movdqu @XMM[1], 0x10($out)
2527 pxor 0x30(%rsp), @XMM[4]
2528 movdqu @XMM[6], 0x20($out)
2529 pxor 0x40(%rsp), @XMM[2]
2530 movdqu @XMM[4], 0x30($out)
2531 pxor 0x50(%rsp), @XMM[7]
2532 movdqu @XMM[2], 0x40($out)
2533 pxor 0x60(%rsp), @XMM[3]
2534 movdqu @XMM[7], 0x50($out)
2535 pxor 0x70(%rsp), @XMM[5]
2536 movdqu @XMM[3], 0x60($out)
2537 movdqu @XMM[5], 0x70($out)
2538 lea 0x80($out), $out
2539
2540 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2541 pxor $twtmp, $twtmp
2542 movdqa .Lxts_magic(%rip), $twmask
2543 pcmpgtd @XMM[7], $twtmp
2544 pshufd \$0x13, $twtmp, $twres
2545 pxor $twtmp, $twtmp
2546 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2547 pand $twmask, $twres # isolate carry and residue
2548 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2549 pxor $twres, @XMM[7]
2550
2551 sub \$0x80,$len
2552 jnc .Lxts_dec_loop
2553
2554.Lxts_dec_short:
2555 add \$0x80, $len
2556 jz .Lxts_dec_done
2557___
2558 for ($i=0;$i<7;$i++) {
2559 $code.=<<___;
2560 pshufd \$0x13, $twtmp, $twres
2561 pxor $twtmp, $twtmp
2562 movdqa @XMM[7], @XMM[$i]
2563 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2564 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2565 pand $twmask, $twres # isolate carry and residue
2566 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2567 pxor $twres, @XMM[7]
2568___
2569 $code.=<<___ if ($i>=1);
2570 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2571 cmp \$`0x10*$i`,$len
2572 je .Lxts_dec_$i
2573___
2574 $code.=<<___ if ($i>=2);
2575 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2576___
2577 }
2578$code.=<<___;
2579 movdqu 0x60($inp), @XMM[8+6]
2580 pxor @XMM[8+5], @XMM[5]
2581 movdqa @XMM[7], 0x70(%rsp)
2582 lea 0x70($inp), $inp
2583 pxor @XMM[8+6], @XMM[6]
2584 lea 0x80(%rsp), %rax # pass key schedule
2585 mov %edx, %r10d # pass rounds
2586
2587 call _bsaes_decrypt8
2588
2589 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2590 pxor 0x10(%rsp), @XMM[1]
2591 movdqu @XMM[0], 0x00($out) # write output
2592 pxor 0x20(%rsp), @XMM[6]
2593 movdqu @XMM[1], 0x10($out)
2594 pxor 0x30(%rsp), @XMM[4]
2595 movdqu @XMM[6], 0x20($out)
2596 pxor 0x40(%rsp), @XMM[2]
2597 movdqu @XMM[4], 0x30($out)
2598 pxor 0x50(%rsp), @XMM[7]
2599 movdqu @XMM[2], 0x40($out)
2600 pxor 0x60(%rsp), @XMM[3]
2601 movdqu @XMM[7], 0x50($out)
2602 movdqu @XMM[3], 0x60($out)
2603 lea 0x70($out), $out
2604
2605 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2606 jmp .Lxts_dec_done
2607.align 16
2608.Lxts_dec_6:
2609 pxor @XMM[8+4], @XMM[4]
2610 lea 0x60($inp), $inp
2611 pxor @XMM[8+5], @XMM[5]
2612 lea 0x80(%rsp), %rax # pass key schedule
2613 mov %edx, %r10d # pass rounds
2614
2615 call _bsaes_decrypt8
2616
2617 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2618 pxor 0x10(%rsp), @XMM[1]
2619 movdqu @XMM[0], 0x00($out) # write output
2620 pxor 0x20(%rsp), @XMM[6]
2621 movdqu @XMM[1], 0x10($out)
2622 pxor 0x30(%rsp), @XMM[4]
2623 movdqu @XMM[6], 0x20($out)
2624 pxor 0x40(%rsp), @XMM[2]
2625 movdqu @XMM[4], 0x30($out)
2626 pxor 0x50(%rsp), @XMM[7]
2627 movdqu @XMM[2], 0x40($out)
2628 movdqu @XMM[7], 0x50($out)
2629 lea 0x60($out), $out
2630
2631 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2632 jmp .Lxts_dec_done
2633.align 16
2634.Lxts_dec_5:
2635 pxor @XMM[8+3], @XMM[3]
2636 lea 0x50($inp), $inp
2637 pxor @XMM[8+4], @XMM[4]
2638 lea 0x80(%rsp), %rax # pass key schedule
2639 mov %edx, %r10d # pass rounds
2640
2641 call _bsaes_decrypt8
2642
2643 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2644 pxor 0x10(%rsp), @XMM[1]
2645 movdqu @XMM[0], 0x00($out) # write output
2646 pxor 0x20(%rsp), @XMM[6]
2647 movdqu @XMM[1], 0x10($out)
2648 pxor 0x30(%rsp), @XMM[4]
2649 movdqu @XMM[6], 0x20($out)
2650 pxor 0x40(%rsp), @XMM[2]
2651 movdqu @XMM[4], 0x30($out)
2652 movdqu @XMM[2], 0x40($out)
2653 lea 0x50($out), $out
2654
2655 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2656 jmp .Lxts_dec_done
2657.align 16
2658.Lxts_dec_4:
2659 pxor @XMM[8+2], @XMM[2]
2660 lea 0x40($inp), $inp
2661 pxor @XMM[8+3], @XMM[3]
2662 lea 0x80(%rsp), %rax # pass key schedule
2663 mov %edx, %r10d # pass rounds
2664
2665 call _bsaes_decrypt8
2666
2667 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2668 pxor 0x10(%rsp), @XMM[1]
2669 movdqu @XMM[0], 0x00($out) # write output
2670 pxor 0x20(%rsp), @XMM[6]
2671 movdqu @XMM[1], 0x10($out)
2672 pxor 0x30(%rsp), @XMM[4]
2673 movdqu @XMM[6], 0x20($out)
2674 movdqu @XMM[4], 0x30($out)
2675 lea 0x40($out), $out
2676
2677 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2678 jmp .Lxts_dec_done
2679.align 16
2680.Lxts_dec_3:
2681 pxor @XMM[8+1], @XMM[1]
2682 lea 0x30($inp), $inp
2683 pxor @XMM[8+2], @XMM[2]
2684 lea 0x80(%rsp), %rax # pass key schedule
2685 mov %edx, %r10d # pass rounds
2686
2687 call _bsaes_decrypt8
2688
2689 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2690 pxor 0x10(%rsp), @XMM[1]
2691 movdqu @XMM[0], 0x00($out) # write output
2692 pxor 0x20(%rsp), @XMM[6]
2693 movdqu @XMM[1], 0x10($out)
2694 movdqu @XMM[6], 0x20($out)
2695 lea 0x30($out), $out
2696
2697 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2698 jmp .Lxts_dec_done
2699.align 16
2700.Lxts_dec_2:
2701 pxor @XMM[8+0], @XMM[0]
2702 lea 0x20($inp), $inp
2703 pxor @XMM[8+1], @XMM[1]
2704 lea 0x80(%rsp), %rax # pass key schedule
2705 mov %edx, %r10d # pass rounds
2706
2707 call _bsaes_decrypt8
2708
2709 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2710 pxor 0x10(%rsp), @XMM[1]
2711 movdqu @XMM[0], 0x00($out) # write output
2712 movdqu @XMM[1], 0x10($out)
2713 lea 0x20($out), $out
2714
2715 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2716 jmp .Lxts_dec_done
2717.align 16
2718.Lxts_dec_1:
2719 pxor @XMM[0], @XMM[8]
2720 lea 0x10($inp), $inp
2721 movdqa @XMM[8], 0x20(%rbp)
2722 lea 0x20(%rbp), $arg1
2723 lea 0x20(%rbp), $arg2
2724 lea ($key), $arg3
2725 call asm_AES_decrypt # doesn't touch %xmm
2726 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2727 #pxor @XMM[8], @XMM[0]
2728 #lea 0x80(%rsp), %rax # pass key schedule
2729 #mov %edx, %r10d # pass rounds
2730 #call _bsaes_decrypt8
2731 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2732 movdqu @XMM[0], 0x00($out) # write output
2733 lea 0x10($out), $out
2734
2735 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2736
2737.Lxts_dec_done:
2738 and \$15, %ebx
2739 jz .Lxts_dec_ret
2740
2741 pxor $twtmp, $twtmp
2742 movdqa .Lxts_magic(%rip), $twmask
2743 pcmpgtd @XMM[7], $twtmp
2744 pshufd \$0x13, $twtmp, $twres
2745 movdqa @XMM[7], @XMM[6]
2746 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2747 pand $twmask, $twres # isolate carry and residue
2748 movdqu ($inp), @XMM[0]
2749 pxor $twres, @XMM[7]
2750
2751 lea 0x20(%rbp), $arg1
2752 pxor @XMM[7], @XMM[0]
2753 lea 0x20(%rbp), $arg2
2754 movdqa @XMM[0], 0x20(%rbp)
2755 lea ($key), $arg3
2756 call asm_AES_decrypt # doesn't touch %xmm
2757 pxor 0x20(%rbp), @XMM[7]
2758 mov $out, %rdx
2759 movdqu @XMM[7], ($out)
2760
2761.Lxts_dec_steal:
2762 movzb 16($inp), %eax
2763 movzb (%rdx), %ecx
2764 lea 1($inp), $inp
2765 mov %al, (%rdx)
2766 mov %cl, 16(%rdx)
2767 lea 1(%rdx), %rdx
2768 sub \$1,%ebx
2769 jnz .Lxts_dec_steal
2770
2771 movdqu ($out), @XMM[0]
2772 lea 0x20(%rbp), $arg1
2773 pxor @XMM[6], @XMM[0]
2774 lea 0x20(%rbp), $arg2
2775 movdqa @XMM[0], 0x20(%rbp)
2776 lea ($key), $arg3
2777 call asm_AES_decrypt # doesn't touch %xmm
2778 pxor 0x20(%rbp), @XMM[6]
2779 movdqu @XMM[6], ($out)
2780
2781.Lxts_dec_ret:
2782 lea (%rsp), %rax
2783 pxor %xmm0, %xmm0
2784.Lxts_dec_bzero: # wipe key schedule [if any]
2785 movdqa %xmm0, 0x00(%rax)
2786 movdqa %xmm0, 0x10(%rax)
2787 lea 0x20(%rax), %rax
2788 cmp %rax, %rbp
2789 ja .Lxts_dec_bzero
2790
2791 lea (%rbp),%rsp # restore %rsp
2792___
2793$code.=<<___ if ($win64);
2794 movaps 0x40(%rbp), %xmm6
2795 movaps 0x50(%rbp), %xmm7
2796 movaps 0x60(%rbp), %xmm8
2797 movaps 0x70(%rbp), %xmm9
2798 movaps 0x80(%rbp), %xmm10
2799 movaps 0x90(%rbp), %xmm11
2800 movaps 0xa0(%rbp), %xmm12
2801 movaps 0xb0(%rbp), %xmm13
2802 movaps 0xc0(%rbp), %xmm14
2803 movaps 0xd0(%rbp), %xmm15
2804 lea 0xa0(%rbp), %rsp
2805___
2806$code.=<<___;
2807 mov 0x48(%rsp), %r15
2808 mov 0x50(%rsp), %r14
2809 mov 0x58(%rsp), %r13
2810 mov 0x60(%rsp), %r12
2811 mov 0x68(%rsp), %rbx
2812 mov 0x70(%rsp), %rax
2813 lea 0x78(%rsp), %rsp
2814 mov %rax, %rbp
2815.Lxts_dec_epilogue:
2816 ret
2817.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2818___
2819}
2820$code.=<<___;
2821.type _bsaes_const,\@object
2822.align 64
2823_bsaes_const:
2824.LM0ISR: # InvShiftRows constants
2825 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2826.LISRM0:
2827 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2828.LISR:
2829 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2830.LBS0: # bit-slice constants
2831 .quad 0x5555555555555555, 0x5555555555555555
2832.LBS1:
2833 .quad 0x3333333333333333, 0x3333333333333333
2834.LBS2:
2835 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2836.LSR: # shiftrows constants
2837 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2838.LSRM0:
2839 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2840.LM0SR:
2841 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2842.LSWPUP: # byte-swap upper dword
2843 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2844.LSWPUPM0SR:
2845 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2846.LADD1: # counter increment constants
2847 .quad 0x0000000000000000, 0x0000000100000000
2848.LADD2:
2849 .quad 0x0000000000000000, 0x0000000200000000
2850.LADD3:
2851 .quad 0x0000000000000000, 0x0000000300000000
2852.LADD4:
2853 .quad 0x0000000000000000, 0x0000000400000000
2854.LADD5:
2855 .quad 0x0000000000000000, 0x0000000500000000
2856.LADD6:
2857 .quad 0x0000000000000000, 0x0000000600000000
2858.LADD7:
2859 .quad 0x0000000000000000, 0x0000000700000000
2860.LADD8:
2861 .quad 0x0000000000000000, 0x0000000800000000
2862.Lxts_magic:
2863 .long 0x87,0,1,0
2864.Lmasks:
2865 .quad 0x0101010101010101, 0x0101010101010101
2866 .quad 0x0202020202020202, 0x0202020202020202
2867 .quad 0x0404040404040404, 0x0404040404040404
2868 .quad 0x0808080808080808, 0x0808080808080808
2869.LM0:
2870 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2871.L63:
2872 .quad 0x6363636363636363, 0x6363636363636363
2873.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2874.align 64
2875.size _bsaes_const,.-_bsaes_const
2876___
2877
2878# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2879# CONTEXT *context,DISPATCHER_CONTEXT *disp)
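# In C terms the handler below does roughly the following (an outline
# only, not a literal translation; offsets mirror the code's comments):
#
#	if (prologue <= context->Rip && context->Rip < epilogue) {
#		unsigned char *rbp = (unsigned char *)context->Rbp;
#		memcpy(&context->Xmm6, rbp + 0x40, 10 * 16); /* xmm6-15 */
#		/* reload r15..r12, rbx, rbp saved above rbp + 0xa0    */
#		context->Rsp = (ULONG64)(rbp + 0xa0 + 0x78);
#	}
#	RtlVirtualUnwind(...);	/* hand the frame back to Windows */
#	return ExceptionContinueSearch;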
2880if ($win64) {
2881$rec="%rcx";
2882$frame="%rdx";
2883$context="%r8";
2884$disp="%r9";
2885
2886$code.=<<___;
2887.extern __imp_RtlVirtualUnwind
2888.type se_handler,\@abi-omnipotent
2889.align 16
2890se_handler:
2891 push %rsi
2892 push %rdi
2893 push %rbx
2894 push %rbp
2895 push %r12
2896 push %r13
2897 push %r14
2898 push %r15
2899 pushfq
2900 sub \$64,%rsp
2901
2902 mov 120($context),%rax # pull context->Rax
2903 mov 248($context),%rbx # pull context->Rip
2904
2905 mov 8($disp),%rsi # disp->ImageBase
2906 mov 56($disp),%r11 # disp->HandlerData
2907
2908 mov 0(%r11),%r10d # HandlerData[0]
2909 lea (%rsi,%r10),%r10 # prologue label
2910 cmp %r10,%rbx # context->Rip<prologue label
2911 jb .Lin_prologue
2912
2913 mov 152($context),%rax # pull context->Rsp
2914
2915 mov 4(%r11),%r10d # HandlerData[1]
2916 lea (%rsi,%r10),%r10 # epilogue label
2917 cmp %r10,%rbx # context->Rip>=epilogue label
2918 jae .Lin_prologue
2919
2920 mov 160($context),%rax # pull context->Rbp
2921
2922 lea 0x40(%rax),%rsi # %xmm save area
2923 lea 512($context),%rdi # &context.Xmm6
2924 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2925 .long 0xa548f3fc # cld; rep movsq
2926 lea 0xa0(%rax),%rax # adjust stack pointer
2927
2928 mov 0x70(%rax),%rbp
2929 mov 0x68(%rax),%rbx
2930 mov 0x60(%rax),%r12
2931 mov 0x58(%rax),%r13
2932 mov 0x50(%rax),%r14
2933 mov 0x48(%rax),%r15
2934 lea 0x78(%rax),%rax # adjust stack pointer
2935 mov %rbx,144($context) # restore context->Rbx
2936 mov %rbp,160($context) # restore context->Rbp
2937 mov %r12,216($context) # restore context->R12
2938 mov %r13,224($context) # restore context->R13
2939 mov %r14,232($context) # restore context->R14
2940 mov %r15,240($context) # restore context->R15
2941
2942.Lin_prologue:
2943 mov %rax,152($context) # restore context->Rsp
2944
2945 mov 40($disp),%rdi # disp->ContextRecord
2946 mov $context,%rsi # context
2947 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2948 .long 0xa548f3fc # cld; rep movsq
2949
2950 mov $disp,%rsi
2951 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2952 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2953 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2954 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2955 mov 40(%rsi),%r10 # disp->ContextRecord
2956 lea 56(%rsi),%r11 # &disp->HandlerData
2957 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2958 mov %r10,32(%rsp) # arg5
2959 mov %r11,40(%rsp) # arg6
2960 mov %r12,48(%rsp) # arg7
2961 mov %rcx,56(%rsp) # arg8, (NULL)
2962 call *__imp_RtlVirtualUnwind(%rip)
2963
2964 mov \$1,%eax # ExceptionContinueSearch
2965 add \$64,%rsp
2966 popfq
2967 pop %r15
2968 pop %r14
2969 pop %r13
2970 pop %r12
2971 pop %rbp
2972 pop %rbx
2973 pop %rdi
2974 pop %rsi
2975 ret
2976.size se_handler,.-se_handler
2977
2978.section .pdata
2979.align 4
2980___
2981$code.=<<___ if ($ecb);
2982 .rva .Lecb_enc_prologue
2983 .rva .Lecb_enc_epilogue
2984 .rva .Lecb_enc_info
2985
2986 .rva .Lecb_dec_prologue
2987 .rva .Lecb_dec_epilogue
2988 .rva .Lecb_dec_info
2989___
2990$code.=<<___;
2991 .rva .Lcbc_dec_prologue
2992 .rva .Lcbc_dec_epilogue
2993 .rva .Lcbc_dec_info
2994
2995 .rva .Lctr_enc_prologue
2996 .rva .Lctr_enc_epilogue
2997 .rva .Lctr_enc_info
2998
2999 .rva .Lxts_enc_prologue
3000 .rva .Lxts_enc_epilogue
3001 .rva .Lxts_enc_info
3002
3003 .rva .Lxts_dec_prologue
3004 .rva .Lxts_dec_epilogue
3005 .rva .Lxts_dec_info
3006
3007.section .xdata
3008.align 8
3009___
3010$code.=<<___ if ($ecb);
3011.Lecb_enc_info:
3012 .byte 9,0,0,0
3013 .rva se_handler
3014 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3015.Lecb_dec_info:
3016 .byte 9,0,0,0
3017 .rva se_handler
3018 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3019___
3020$code.=<<___;
3021.Lcbc_dec_info:
3022 .byte 9,0,0,0
3023 .rva se_handler
3024 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3025.Lctr_enc_info:
3026 .byte 9,0,0,0
3027 .rva se_handler
3028 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3029.Lxts_enc_info:
3030 .byte 9,0,0,0
3031 .rva se_handler
3032 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3033.Lxts_dec_info:
3034 .byte 9,0,0,0
3035 .rva se_handler
3036 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3037___
3038}
3039
3040$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3041
3042print $code;
3043
3044close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl
new file mode 100644
index 0000000000..1533e2c304
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/vpaes-x86.pl
@@ -0,0 +1,903 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
13######################################################################
14# September 2011.
15#
16# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
17# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (doesn't have to if called from
19# EVP only). "Drop-in" implies that this module doesn't share key
20# schedule structure with the original, nor does it make assumptions
21# about its alignment...
22#
23# Performance summary. aes-586.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with 128-bit key, and vpaes-x86.pl column - [also
26# large-block CBC] encrypt/decrypt.
27#
28# aes-586.pl vpaes-x86.pl
29#
30# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
31# Nehalem 27.9/40.4/18.1 10.3/12.0
32# Atom 102./119./60.1 64.5/85.3(***)
33#
34# (*)	"Hyper-threading" in this context refers to cache shared among
35#	multiple cores rather than to Intel HTT specifically. As the
36#	vast majority of contemporary cores share cache, the slower code
37#	path is commonplace. In other words, "with-hyper-threading-off"
38#	results are presented mostly for reference purposes.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***) The less impressive improvement on Core 2 and Atom is due to
43#	slow pshufb; it is still a respectable +32%/65% on Core 2 and
44#	+58%/40% on Atom (as implied, over the "hyper-threading-safe"
45#	code path).
46#
47# <appro@openssl.org>
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50push(@INC,"${dir}","${dir}../../perlasm");
51require "x86asm.pl";
52
53&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
54
55$PREFIX="vpaes";
56
57my ($round, $base, $magic, $key, $const, $inp, $out)=
58 ("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
59
60&static_label("_vpaes_consts");
61&static_label("_vpaes_schedule_low_round");
62
63&set_label("_vpaes_consts",64);
64$k_inv=-0x30; # inv, inva
65 &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
66 &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
67
68$k_s0F=-0x10; # s0F
69 &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
70
71$k_ipt=0x00; # input transform (lo, hi)
72 &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
73 &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
74
75$k_sb1=0x20; # sb1u, sb1t
76 &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
77 &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
78$k_sb2=0x40; # sb2u, sb2t
79 &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
80 &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
81$k_sbo=0x60; # sbou, sbot
82 &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
83 &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
84
85$k_mc_forward=0x80; # mc_forward
86 &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
87 &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
88 &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
89 &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
90
91$k_mc_backward=0xc0; # mc_backward
92 &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
93 &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
94 &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
95 &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
96
97$k_sr=0x100; # sr
98 &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
99 &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
100 &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
101 &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
102
103$k_rcon=0x140; # rcon
104 &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
105
106$k_s63=0x150; # s63: all equal to 0x63 transformed
107 &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
108
109$k_opt=0x160; # output transform
110 &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
111 &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
112
113$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
114 &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
115 &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
116##
117## Decryption stuff
118## Key schedule constants
119##
120$k_dksd=0x1a0; # decryption key schedule: invskew x*D
121 &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
122 &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
123$k_dksb=0x1c0; # decryption key schedule: invskew x*B
124 &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
125 &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
126$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
127 &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
128 &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
129$k_dks9=0x200; # decryption key schedule: invskew x*9
130 &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
131 &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
132
133##
134## Decryption stuff
135## Round function constants
136##
137$k_dipt=0x220; # decryption input transform
138 &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
139 &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
140
141$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
142 &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
143 &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
144$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
145 &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
146 &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
147$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
148 &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
149 &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
150$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
151 &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
152 &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
153$k_dsbo=0x2c0; # decryption sbox final output
154 &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
155 &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
156&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
157&align (64);
158
159&function_begin_B("_vpaes_preheat");
160 &add ($const,&DWP(0,"esp"));
161 &movdqa ("xmm7",&QWP($k_inv,$const));
162 &movdqa ("xmm6",&QWP($k_s0F,$const));
163 &ret ();
164&function_end_B("_vpaes_preheat");
165
166##
167## _aes_encrypt_core
168##
169## AES-encrypt %xmm0.
170##
171## Inputs:
172## %xmm0 = input
173## %xmm6-%xmm7 as in _vpaes_preheat
174## (%edx) = scheduled keys
175##
176## Output in %xmm0
177## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
178##
179##
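##
## (For reference: pshufb xmm_t,xmm_i computes, per byte lane j,
##	new_t[j] = (i[j] & 0x80) ? 0 : t[i[j] & 0x0F];
## the core below first splits each state byte into nibbles,
##	lo = b & 0x0F;	hi = (b >> 4) & 0x0F;	/* cf. k_s0F */
## so every 256-entry table of the S-box algebra becomes a pair of
## 16-entry tables, each fitting in a single register.)
##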
180&function_begin_B("_vpaes_encrypt_core");
181 &mov ($magic,16);
182 &mov ($round,&DWP(240,$key));
183	&movdqa	("xmm1","xmm6");
184 &movdqa ("xmm2",&QWP($k_ipt,$const));
185 &pandn ("xmm1","xmm0");
186 &movdqu ("xmm5",&QWP(0,$key));
187 &psrld ("xmm1",4);
188 &pand ("xmm0","xmm6");
189 &pshufb ("xmm2","xmm0");
190 &movdqa ("xmm0",&QWP($k_ipt+16,$const));
191 &pshufb ("xmm0","xmm1");
192 &pxor ("xmm2","xmm5");
193 &pxor ("xmm0","xmm2");
194 &add ($key,16);
195 &lea ($base,&DWP($k_mc_backward,$const));
196 &jmp (&label("enc_entry"));
197
198
199&set_label("enc_loop",16);
200 # middle of middle round
201 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
202 &pshufb ("xmm4","xmm2"); # 4 = sb1u
203 &pxor ("xmm4","xmm5"); # 4 = sb1u + k
204 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
205 &pshufb ("xmm0","xmm3"); # 0 = sb1t
206 &pxor ("xmm0","xmm4"); # 0 = A
207 &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
208 &pshufb ("xmm5","xmm2"); # 4 = sb2u
209 &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
210 &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
211 &pshufb ("xmm2","xmm3"); # 2 = sb2t
212 &pxor ("xmm2","xmm5"); # 2 = 2A
213 &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
214 &movdqa ("xmm3","xmm0"); # 3 = A
215 &pshufb ("xmm0","xmm1"); # 0 = B
216 &add ($key,16); # next key
217 &pxor ("xmm0","xmm2"); # 0 = 2A+B
218 &pshufb ("xmm3","xmm4"); # 3 = D
219 &add ($magic,16); # next mc
220 &pxor ("xmm3","xmm0"); # 3 = 2A+B+D
221 &pshufb ("xmm0","xmm1"); # 0 = 2B+C
222 &and ($magic,0x30); # ... mod 4
223 &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
224 &sub ($round,1); # nr--
225
226&set_label("enc_entry");
227 # top of round
228 &movdqa ("xmm1","xmm6"); # 1 : i
229 &pandn ("xmm1","xmm0"); # 1 = i<<4
230 &psrld ("xmm1",4); # 1 = i
231 &pand ("xmm0","xmm6"); # 0 = k
232 &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
233 &pshufb ("xmm5","xmm0"); # 2 = a/k
234 &pxor ("xmm0","xmm1"); # 0 = j
235 &movdqa ("xmm3","xmm7"); # 3 : 1/i
236 &pshufb ("xmm3","xmm1"); # 3 = 1/i
237 &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
238 &movdqa ("xmm4","xmm7"); # 4 : 1/j
239 &pshufb ("xmm4","xmm0"); # 4 = 1/j
240 &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
241 &movdqa ("xmm2","xmm7"); # 2 : 1/iak
242 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
243 &pxor ("xmm2","xmm0"); # 2 = io
244 &movdqa ("xmm3","xmm7"); # 3 : 1/jak
245 &movdqu ("xmm5",&QWP(0,$key));
246 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
247 &pxor ("xmm3","xmm1"); # 3 = jo
248 &jnz (&label("enc_loop"));
249
250 # middle of last round
251 &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
252 &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
253 &pshufb ("xmm4","xmm2"); # 4 = sbou
254 &pxor ("xmm4","xmm5"); # 4 = sb1u + k
255 &pshufb ("xmm0","xmm3"); # 0 = sb1t
256 &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
257 &pxor ("xmm0","xmm4"); # 0 = A
258 &pshufb ("xmm0","xmm1");
259 &ret ();
260&function_end_B("_vpaes_encrypt_core");
261
262##
263## Decryption core
264##
265## Same API as encryption core.
266##
267&function_begin_B("_vpaes_decrypt_core");
268 &mov ($round,&DWP(240,$key));
269 &lea ($base,&DWP($k_dsbd,$const));
270 &movdqa ("xmm1","xmm6");
271 &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
272 &pandn ("xmm1","xmm0");
273 &mov ($magic,$round);
274	&psrld	("xmm1",4);
275 &movdqu ("xmm5",&QWP(0,$key));
276 &shl ($magic,4);
277 &pand ("xmm0","xmm6");
278 &pshufb ("xmm2","xmm0");
279 &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
280 &xor ($magic,0x30);
281 &pshufb ("xmm0","xmm1");
282 &and ($magic,0x30);
283 &pxor ("xmm2","xmm5");
284 &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
285 &pxor ("xmm0","xmm2");
286 &add ($key,16);
287 &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
288 &jmp (&label("dec_entry"));
289
290&set_label("dec_loop",16);
291##
292## Inverse mix columns
293##
294 &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
295 &pshufb ("xmm4","xmm2"); # 4 = sb9u
296 &pxor ("xmm4","xmm0");
297 &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
298 &pshufb ("xmm0","xmm3"); # 0 = sb9t
299 &pxor ("xmm0","xmm4"); # 0 = ch
300 &add ($key,16); # next round key
301
302 &pshufb ("xmm0","xmm5"); # MC ch
303 &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
304 &pshufb ("xmm4","xmm2"); # 4 = sbdu
305 &pxor ("xmm4","xmm0"); # 4 = ch
306 &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
307 &pshufb ("xmm0","xmm3"); # 0 = sbdt
308 &pxor ("xmm0","xmm4"); # 0 = ch
309 &sub ($round,1); # nr--
310
311 &pshufb ("xmm0","xmm5"); # MC ch
312 &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
313 &pshufb ("xmm4","xmm2"); # 4 = sbbu
314 &pxor ("xmm4","xmm0"); # 4 = ch
315 &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
316 &pshufb ("xmm0","xmm3"); # 0 = sbbt
317 &pxor ("xmm0","xmm4"); # 0 = ch
318
319 &pshufb ("xmm0","xmm5"); # MC ch
320 &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
321 &pshufb ("xmm4","xmm2"); # 4 = sbeu
322 &pxor ("xmm4","xmm0"); # 4 = ch
323 &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
324 &pshufb ("xmm0","xmm3"); # 0 = sbet
325 &pxor ("xmm0","xmm4"); # 0 = ch
326
327 &palignr("xmm5","xmm5",12);
328
329&set_label("dec_entry");
330 # top of round
331 &movdqa ("xmm1","xmm6"); # 1 : i
332 &pandn ("xmm1","xmm0"); # 1 = i<<4
333 &psrld ("xmm1",4); # 1 = i
334 &pand ("xmm0","xmm6"); # 0 = k
335 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
336 &pshufb ("xmm2","xmm0"); # 2 = a/k
337 &pxor ("xmm0","xmm1"); # 0 = j
338 &movdqa ("xmm3","xmm7"); # 3 : 1/i
339 &pshufb ("xmm3","xmm1"); # 3 = 1/i
340 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
341 &movdqa ("xmm4","xmm7"); # 4 : 1/j
342 &pshufb ("xmm4","xmm0"); # 4 = 1/j
343 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
344 &movdqa ("xmm2","xmm7"); # 2 : 1/iak
345 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
346 &pxor ("xmm2","xmm0"); # 2 = io
347 &movdqa ("xmm3","xmm7"); # 3 : 1/jak
348 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
349 &pxor ("xmm3","xmm1"); # 3 = jo
350 &movdqu ("xmm0",&QWP(0,$key));
351 &jnz (&label("dec_loop"));
352
353 # middle of last round
354 &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
355 &pshufb ("xmm4","xmm2"); # 4 = sbou
356 &pxor ("xmm4","xmm0"); # 4 = sb1u + k
357 &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
358 &movdqa ("xmm2",&QWP(0,$magic));
359 &pshufb ("xmm0","xmm3"); # 0 = sb1t
360 &pxor ("xmm0","xmm4"); # 0 = A
361 &pshufb ("xmm0","xmm2");
362 &ret ();
363&function_end_B("_vpaes_decrypt_core");
364
365########################################################
366## ##
367## AES key schedule ##
368## ##
369########################################################
370&function_begin_B("_vpaes_schedule_core");
371 &add ($const,&DWP(0,"esp"));
372 &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
373 &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
374
375 # input transform
376 &movdqa ("xmm3","xmm0");
377 &lea ($base,&DWP($k_ipt,$const));
378 &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
379 &call ("_vpaes_schedule_transform");
380 &movdqa ("xmm7","xmm0");
381
382 &test ($out,$out);
383 &jnz (&label("schedule_am_decrypting"));
384
385 # encrypting, output zeroth round key after transform
386 &movdqu (&QWP(0,$key),"xmm0");
387 &jmp (&label("schedule_go"));
388
389&set_label("schedule_am_decrypting");
390 # decrypting, output zeroth round key after shiftrows
391 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
392 &pshufb ("xmm3","xmm1");
393 &movdqu (&QWP(0,$key),"xmm3");
394 &xor ($magic,0x30);
395
396&set_label("schedule_go");
397 &cmp ($round,192);
398 &ja (&label("schedule_256"));
399 &je (&label("schedule_192"));
400	# 128: fall through
401
402##
403## .schedule_128
404##
405## 128-bit specific part of key schedule.
406##
407## This schedule is really simple, because all its parts
408## are accomplished by the subroutines.
409##
410&set_label("schedule_128");
411 &mov ($round,10);
412
413&set_label("loop_schedule_128");
414 &call ("_vpaes_schedule_round");
415 &dec ($round);
416 &jz (&label("schedule_mangle_last"));
417 &call ("_vpaes_schedule_mangle"); # write output
418 &jmp (&label("loop_schedule_128"));
419
420##
421## .aes_schedule_192
422##
423## 192-bit specific part of key schedule.
424##
425## The main body of this schedule is the same as the 128-bit
426## schedule, but with more smearing. The long, high side is
427## stored in %xmm7 as before, and the short, low side is in
428## the high bits of %xmm6.
429##
430## This schedule is somewhat nastier, however, because each
431## round produces 192 bits of key material, or 1.5 round keys.
432## Therefore, on each cycle we do 2 rounds and produce 3 round
433## keys.
434##
435&set_label("schedule_192",16);
436 &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
437 &call ("_vpaes_schedule_transform"); # input transform
438 &movdqa ("xmm6","xmm0"); # save short part
439 &pxor ("xmm4","xmm4"); # clear 4
440 &movhlps("xmm6","xmm4"); # clobber low side with zeros
441 &mov ($round,4);
442
443&set_label("loop_schedule_192");
444 &call ("_vpaes_schedule_round");
445 &palignr("xmm0","xmm6",8);
446 &call ("_vpaes_schedule_mangle"); # save key n
447 &call ("_vpaes_schedule_192_smear");
448 &call ("_vpaes_schedule_mangle"); # save key n+1
449 &call ("_vpaes_schedule_round");
450 &dec ($round);
451 &jz (&label("schedule_mangle_last"));
452 &call ("_vpaes_schedule_mangle"); # save key n+2
453 &call ("_vpaes_schedule_192_smear");
454 &jmp (&label("loop_schedule_192"));
455
456##
457## .aes_schedule_256
458##
459## 256-bit specific part of key schedule.
460##
461## The structure here is very similar to the 128-bit
462## schedule, but with an additional "low side" in
463## %xmm6. The low side's rounds are the same as the
464## high side's, except no rcon and no rotation.
465##
466&set_label("schedule_256",16);
467 &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
468 &call ("_vpaes_schedule_transform"); # input transform
469 &mov ($round,7);
470
471&set_label("loop_schedule_256");
472 &call ("_vpaes_schedule_mangle"); # output low result
473 &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
474
475 # high round
476 &call ("_vpaes_schedule_round");
477 &dec ($round);
478 &jz (&label("schedule_mangle_last"));
479 &call ("_vpaes_schedule_mangle");
480
481 # low round. swap xmm7 and xmm6
482 &pshufd ("xmm0","xmm0",0xFF);
483 &movdqa (&QWP(20,"esp"),"xmm7");
484 &movdqa ("xmm7","xmm6");
485 &call ("_vpaes_schedule_low_round");
486 &movdqa ("xmm7",&QWP(20,"esp"));
487
488 &jmp (&label("loop_schedule_256"));
489
490##
491## .aes_schedule_mangle_last
492##
493## Mangler for last round of key schedule
494## Mangles %xmm0
495## when encrypting, outputs out(%xmm0) ^ 63
496## when decrypting, outputs unskew(%xmm0)
497##
498## Always called right before return... jumps to cleanup and exits
499##
500&set_label("schedule_mangle_last",16);
501 # schedule last round key from xmm0
502 &lea ($base,&DWP($k_deskew,$const));
503 &test ($out,$out);
504 &jnz (&label("schedule_mangle_last_dec"));
505
506 # encrypting
507 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
508 &pshufb ("xmm0","xmm1"); # output permute
509 &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
510 &add ($key,32);
511
512&set_label("schedule_mangle_last_dec");
513 &add ($key,-16);
514 &pxor ("xmm0",&QWP($k_s63,$const));
515 &call ("_vpaes_schedule_transform"); # output transform
516 &movdqu (&QWP(0,$key),"xmm0"); # save last key
517
518 # cleanup
519 &pxor ("xmm0","xmm0");
520 &pxor ("xmm1","xmm1");
521 &pxor ("xmm2","xmm2");
522 &pxor ("xmm3","xmm3");
523 &pxor ("xmm4","xmm4");
524 &pxor ("xmm5","xmm5");
525 &pxor ("xmm6","xmm6");
526 &pxor ("xmm7","xmm7");
527 &ret ();
528&function_end_B("_vpaes_schedule_core");
529
530##
531## .aes_schedule_192_smear
532##
533## Smear the short, low side in the 192-bit key schedule.
534##
535## Inputs:
536## %xmm7: high side, b a x y
537## %xmm6: low side, d c 0 0
538## %xmm13: 0
539##
540## Outputs:
541## %xmm6: b+c+d b+c 0 0
542## %xmm0: b+c+d b+c b a
543##
544&function_begin_B("_vpaes_schedule_192_smear");
545 &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
546 &pxor ("xmm6","xmm0"); # -> c+d c 0 0
547 &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
548 &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
549 &movdqa ("xmm0","xmm6");
550 &pxor ("xmm1","xmm1");
551 &movhlps("xmm6","xmm1"); # clobber low side with zeros
552 &ret ();
553&function_end_B("_vpaes_schedule_192_smear");
554
555##
556## .aes_schedule_round
557##
558## Runs one main round of the key schedule on %xmm0, %xmm7
559##
560## Specifically, runs subbytes on the high dword of %xmm0
561## then rotates it by one byte and xors into the low dword of
562## %xmm7.
563##
564## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
565## next rcon.
566##
567## Smears the dwords of %xmm7 by xoring the low into the
568## second low, result into third, result into highest.
569##
570## Returns results in %xmm7 = %xmm0.
571## Clobbers %xmm1-%xmm5.
572##
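##
## (The smear is the usual FIPS-197 chaining W[i] = W[i-4] ^ W[i-1]
## done in parallel: with x the 128-bit value holding the previous
## four dwords,
##	x ^= x << 32;	x ^= x << 64;
## leaves (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3) in the four lanes, i.e.
## the running XOR the pslldq/pxor pairs below compute before the
## subbyted word is folded in.)
##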
573&function_begin_B("_vpaes_schedule_round");
574 # extract rcon from xmm8
575 &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
576 &pxor ("xmm1","xmm1");
577 &palignr("xmm1","xmm2",15);
578 &palignr("xmm2","xmm2",15);
579 &pxor ("xmm7","xmm1");
580
581 # rotate
582 &pshufd ("xmm0","xmm0",0xFF);
583 &palignr("xmm0","xmm0",1);
584
585 # fall through...
586 &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
587
588 # low round: same as high round, but no rotation and no rcon.
589&set_label("_vpaes_schedule_low_round");
590 # smear xmm7
591 &movdqa ("xmm1","xmm7");
592 &pslldq ("xmm7",4);
593 &pxor ("xmm7","xmm1");
594 &movdqa ("xmm1","xmm7");
595 &pslldq ("xmm7",8);
596 &pxor ("xmm7","xmm1");
597 &pxor ("xmm7",&QWP($k_s63,$const));
598
599 # subbyte
600 &movdqa ("xmm4",&QWP($k_s0F,$const));
601 &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
602 &movdqa ("xmm1","xmm4");
603 &pandn ("xmm1","xmm0");
604 &psrld ("xmm1",4); # 1 = i
605 &pand ("xmm0","xmm4"); # 0 = k
606 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
607 &pshufb ("xmm2","xmm0"); # 2 = a/k
608 &pxor ("xmm0","xmm1"); # 0 = j
609 &movdqa ("xmm3","xmm5"); # 3 : 1/i
610 &pshufb ("xmm3","xmm1"); # 3 = 1/i
611 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
612 &movdqa ("xmm4","xmm5"); # 4 : 1/j
613 &pshufb ("xmm4","xmm0"); # 4 = 1/j
614 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
615 &movdqa ("xmm2","xmm5"); # 2 : 1/iak
616 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
617 &pxor ("xmm2","xmm0"); # 2 = io
618 &movdqa ("xmm3","xmm5"); # 3 : 1/jak
619 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
620 &pxor ("xmm3","xmm1"); # 3 = jo
621 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
622 &pshufb ("xmm4","xmm2"); # 4 = sbou
623 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
624 &pshufb ("xmm0","xmm3"); # 0 = sb1t
625 &pxor ("xmm0","xmm4"); # 0 = sbox output
626
627 # add in smeared stuff
628 &pxor ("xmm0","xmm7");
629 &movdqa ("xmm7","xmm0");
630 &ret ();
631&function_end_B("_vpaes_schedule_round");
632
633##
634## .aes_schedule_transform
635##
636## Linear-transform %xmm0 according to tables at (%ebx)
637##
638## Output in %xmm0
639## Clobbers %xmm1, %xmm2
640##
641&function_begin_B("_vpaes_schedule_transform");
642 &movdqa ("xmm2",&QWP($k_s0F,$const));
643 &movdqa ("xmm1","xmm2");
644 &pandn ("xmm1","xmm0");
645 &psrld ("xmm1",4);
646 &pand ("xmm0","xmm2");
647 &movdqa ("xmm2",&QWP(0,$base));
648 &pshufb ("xmm2","xmm0");
649 &movdqa ("xmm0",&QWP(16,$base));
650 &pshufb ("xmm0","xmm1");
651 &pxor ("xmm0","xmm2");
652 &ret ();
653&function_end_B("_vpaes_schedule_transform");
654
655##
656## .aes_schedule_mangle
657##
658## Mangle xmm0 from (basis-transformed) standard version
659## to our version.
660##
661## On encrypt,
662## xor with 0x63
663## multiply by circulant 0,1,1,1
664## apply shiftrows transform
665##
666## On decrypt,
667## xor with 0x63
668## multiply by "inverse mixcolumns" circulant E,B,D,9
669## deskew
670## apply shiftrows transform
671##
672##
673## Writes out to (%edx), and increments or decrements it
674## Keeps track of round number mod 4 in %ecx
675## Preserves xmm0
676## Clobbers xmm1-xmm5
677##
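##
## (In C terms the encrypt branch below computes, with r() rotating
## each 4-byte column by one byte via k_mc_forward,
##	t = x ^ k_s63;	out = r(t) ^ r(r(t)) ^ r(r(r(t)));
## which is the multiplication by the circulant 0,1,1,1 noted above.)
##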
678&function_begin_B("_vpaes_schedule_mangle");
679 &movdqa ("xmm4","xmm0"); # save xmm0 for later
680 &movdqa ("xmm5",&QWP($k_mc_forward,$const));
681 &test ($out,$out);
682 &jnz (&label("schedule_mangle_dec"));
683
684 # encrypting
685 &add ($key,16);
686 &pxor ("xmm4",&QWP($k_s63,$const));
687 &pshufb ("xmm4","xmm5");
688 &movdqa ("xmm3","xmm4");
689 &pshufb ("xmm4","xmm5");
690 &pxor ("xmm3","xmm4");
691 &pshufb ("xmm4","xmm5");
692 &pxor ("xmm3","xmm4");
693
694 &jmp (&label("schedule_mangle_both"));
695
696&set_label("schedule_mangle_dec",16);
697 # inverse mix columns
698 &movdqa ("xmm2",&QWP($k_s0F,$const));
699 &lea ($inp,&DWP($k_dksd,$const));
700 &movdqa ("xmm1","xmm2");
701 &pandn ("xmm1","xmm4");
702 &psrld ("xmm1",4); # 1 = hi
703 &pand ("xmm4","xmm2"); # 4 = lo
704
705 &movdqa ("xmm2",&QWP(0,$inp));
706 &pshufb ("xmm2","xmm4");
707 &movdqa ("xmm3",&QWP(0x10,$inp));
708 &pshufb ("xmm3","xmm1");
709 &pxor ("xmm3","xmm2");
710 &pshufb ("xmm3","xmm5");
711
712 &movdqa ("xmm2",&QWP(0x20,$inp));
713 &pshufb ("xmm2","xmm4");
714 &pxor ("xmm2","xmm3");
715 &movdqa ("xmm3",&QWP(0x30,$inp));
716 &pshufb ("xmm3","xmm1");
717 &pxor ("xmm3","xmm2");
718 &pshufb ("xmm3","xmm5");
719
720 &movdqa ("xmm2",&QWP(0x40,$inp));
721 &pshufb ("xmm2","xmm4");
722 &pxor ("xmm2","xmm3");
723 &movdqa ("xmm3",&QWP(0x50,$inp));
724 &pshufb ("xmm3","xmm1");
725 &pxor ("xmm3","xmm2");
726 &pshufb ("xmm3","xmm5");
727
728 &movdqa ("xmm2",&QWP(0x60,$inp));
729 &pshufb ("xmm2","xmm4");
730 &pxor ("xmm2","xmm3");
731 &movdqa ("xmm3",&QWP(0x70,$inp));
732 &pshufb ("xmm3","xmm1");
733 &pxor ("xmm3","xmm2");
734
735 &add ($key,-16);
736
737&set_label("schedule_mangle_both");
738 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
739 &pshufb ("xmm3","xmm1");
740 &add ($magic,-16);
741 &and ($magic,0x30);
742 &movdqu (&QWP(0,$key),"xmm3");
743 &ret ();
744&function_end_B("_vpaes_schedule_mangle");
745
746#
747# Interface to OpenSSL
748#
749&function_begin("${PREFIX}_set_encrypt_key");
750 &mov ($inp,&wparam(0)); # inp
751 &lea ($base,&DWP(-56,"esp"));
752 &mov ($round,&wparam(1)); # bits
753 &and ($base,-16);
754 &mov ($key,&wparam(2)); # key
755 &xchg ($base,"esp"); # alloca
756 &mov (&DWP(48,"esp"),$base);
757
758 &mov ($base,$round);
759 &shr ($base,5);
760 &add ($base,5);
761 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
762 &mov ($magic,0x30);
763 &mov ($out,0);
764
765 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
766 &call ("_vpaes_schedule_core");
767&set_label("pic_point");
768
769 &mov ("esp",&DWP(48,"esp"));
770 &xor ("eax","eax");
771&function_end("${PREFIX}_set_encrypt_key");
772
773&function_begin("${PREFIX}_set_decrypt_key");
774 &mov ($inp,&wparam(0)); # inp
775 &lea ($base,&DWP(-56,"esp"));
776 &mov ($round,&wparam(1)); # bits
777 &and ($base,-16);
778 &mov ($key,&wparam(2)); # key
779 &xchg ($base,"esp"); # alloca
780 &mov (&DWP(48,"esp"),$base);
781
782 &mov ($base,$round);
783 &shr ($base,5);
784 &add ($base,5);
785 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
786 &shl ($base,4);
787 &lea ($key,&DWP(16,$key,$base));
788
789 &mov ($out,1);
790 &mov ($magic,$round);
791 &shr ($magic,1);
792 &and ($magic,32);
793	&xor	($magic,32);	# nbits==192?0:32;
794
795 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
796 &call ("_vpaes_schedule_core");
797&set_label("pic_point");
798
799 &mov ("esp",&DWP(48,"esp"));
800 &xor ("eax","eax");
801&function_end("${PREFIX}_set_decrypt_key");
802
803&function_begin("${PREFIX}_encrypt");
804 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
805 &call ("_vpaes_preheat");
806&set_label("pic_point");
807 &mov ($inp,&wparam(0)); # inp
808 &lea ($base,&DWP(-56,"esp"));
809 &mov ($out,&wparam(1)); # out
810 &and ($base,-16);
811 &mov ($key,&wparam(2)); # key
812 &xchg ($base,"esp"); # alloca
813 &mov (&DWP(48,"esp"),$base);
814
815 &movdqu ("xmm0",&QWP(0,$inp));
816 &call ("_vpaes_encrypt_core");
817 &movdqu (&QWP(0,$out),"xmm0");
818
819 &mov ("esp",&DWP(48,"esp"));
820&function_end("${PREFIX}_encrypt");
821
822&function_begin("${PREFIX}_decrypt");
823 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
824 &call ("_vpaes_preheat");
825&set_label("pic_point");
826 &mov ($inp,&wparam(0)); # inp
827 &lea ($base,&DWP(-56,"esp"));
828 &mov ($out,&wparam(1)); # out
829 &and ($base,-16);
830 &mov ($key,&wparam(2)); # key
831 &xchg ($base,"esp"); # alloca
832 &mov (&DWP(48,"esp"),$base);
833
834 &movdqu ("xmm0",&QWP(0,$inp));
835 &call ("_vpaes_decrypt_core");
836 &movdqu (&QWP(0,$out),"xmm0");
837
838 &mov ("esp",&DWP(48,"esp"));
839&function_end("${PREFIX}_decrypt");
840
841&function_begin("${PREFIX}_cbc_encrypt");
842 &mov ($inp,&wparam(0)); # inp
843 &mov ($out,&wparam(1)); # out
844 &mov ($round,&wparam(2)); # len
845 &mov ($key,&wparam(3)); # key
846 &sub ($round,16);
847 &jc (&label("cbc_abort"));
848 &lea ($base,&DWP(-56,"esp"));
849 &mov ($const,&wparam(4)); # ivp
850 &and ($base,-16);
851 &mov ($magic,&wparam(5)); # enc
852 &xchg ($base,"esp"); # alloca
853 &movdqu ("xmm1",&QWP(0,$const)); # load IV
854 &sub ($out,$inp);
855 &mov (&DWP(48,"esp"),$base);
856
857 &mov (&DWP(0,"esp"),$out); # save out
858	&mov	(&DWP(4,"esp"),$key);	# save key
859 &mov (&DWP(8,"esp"),$const); # save ivp
860 &mov ($out,$round); # $out works as $len
861
862 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
863 &call ("_vpaes_preheat");
864&set_label("pic_point");
865 &cmp ($magic,0);
866 &je (&label("cbc_dec_loop"));
867 &jmp (&label("cbc_enc_loop"));
868
869&set_label("cbc_enc_loop",16);
870 &movdqu ("xmm0",&QWP(0,$inp)); # load input
871 &pxor ("xmm0","xmm1"); # inp^=iv
872 &call ("_vpaes_encrypt_core");
873 &mov ($base,&DWP(0,"esp")); # restore out
874 &mov ($key,&DWP(4,"esp")); # restore key
875 &movdqa ("xmm1","xmm0");
876 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
877 &lea ($inp,&DWP(16,$inp));
878 &sub ($out,16);
879 &jnc (&label("cbc_enc_loop"));
880 &jmp (&label("cbc_done"));
881
882&set_label("cbc_dec_loop",16);
883 &movdqu ("xmm0",&QWP(0,$inp)); # load input
884 &movdqa (&QWP(16,"esp"),"xmm1"); # save IV
885 &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
886 &call ("_vpaes_decrypt_core");
887 &mov ($base,&DWP(0,"esp")); # restore out
888 &mov ($key,&DWP(4,"esp")); # restore key
889 &pxor ("xmm0",&QWP(16,"esp")); # out^=iv
890 &movdqa ("xmm1",&QWP(32,"esp")); # load next IV
891 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
892 &lea ($inp,&DWP(16,$inp));
893 &sub ($out,16);
894 &jnc (&label("cbc_dec_loop"));
895
896&set_label("cbc_done");
897 &mov ($base,&DWP(8,"esp")); # restore ivp
898 &mov ("esp",&DWP(48,"esp"));
899 &movdqu (&QWP(0,$base),"xmm1"); # write IV
900&set_label("cbc_abort");
901&function_end("${PREFIX}_cbc_encrypt");
902
903&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
new file mode 100644
index 0000000000..37998db5e1
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
@@ -0,0 +1,1206 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
13######################################################################
14# September 2011.
15#
16# Interface to OpenSSL as an "almost" drop-in replacement for
17# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (it doesn't have to when called
19# from EVP only). "Drop-in" implies that this module neither shares
20# key schedule structure with the original nor makes assumptions
21# about its alignment...
22#
23# Performance summary. aes-x86_64.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with 128-bit key, and vpaes-x86_64.pl column -
26# [also large-block CBC] encrypt/decrypt.
27#
28# aes-x86_64.pl vpaes-x86_64.pl
29#
30# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
31# Nehalem 30.5/42.2/14.6 9.8/11.8
32# Atom 63.9/79.0/32.1 64.0/84.8(***)
33#
34# (*)	"Hyper-threading" in this context refers to cache shared
35#	among multiple cores rather than specifically to Intel HTT. As
36#	the vast majority of contemporary cores share cache, the
37#	slower code path is commonplace; "with-hyper-threading-off"
38#	results are thus presented mostly for reference purposes.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***)	The less impressive improvement on Core 2 and Atom is due to
43#	slow pshufb; still, it is a respectable +40%/78% improvement on
44#	Core 2 (as implied, over the "hyper-threading-safe" code path).
45#
46# <appro@openssl.org>
47
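#
# For reference, the entry points below are assumed to follow the
# standard AES_* signatures (the cbc prototype is spelled out before
# ${PREFIX}_cbc_encrypt further down):
#
#	int  ${PREFIX}_set_encrypt_key(const unsigned char *userKey,
#					int bits, AES_KEY *key);
#	int  ${PREFIX}_set_decrypt_key(const unsigned char *userKey,
#					int bits, AES_KEY *key);
#	void ${PREFIX}_encrypt(const unsigned char *in,
#					unsigned char *out, const AES_KEY *key);
#	void ${PREFIX}_decrypt(const unsigned char *in,
#					unsigned char *out, const AES_KEY *key);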
48$flavour = shift;
49$output = shift;
50if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
51
52$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53
54$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
56( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
57die "can't locate x86_64-xlate.pl";
58
59open STDOUT,"| $^X $xlate $flavour $output";
60
61$PREFIX="vpaes";
62
63$code.=<<___;
64.text
65
66##
67## _vpaes_encrypt_core
68##
69## AES-encrypt %xmm0.
70##
71## Inputs:
72## %xmm0 = input
73## %xmm9-%xmm15 as in _vpaes_preheat
74## (%rdx) = scheduled keys
75##
76## Output in %xmm0
77## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
78## Preserves %xmm6 - %xmm8 so you get some local vectors
79##
80##
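## A sketch of the SSSE3 trick used throughout (per the vector_aes
## paper): split each byte b into nibbles lo = b & 0x0F and
## hi = b >> 4, then assemble an 8-bit table lookup from two
## 16-entry pshufb lookups as T[b] = Tlo[lo] ^ Thi[hi], 16 bytes
## at a time.  %xmm9 (the 0x0F mask) does the splitting below.
##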
81.type _vpaes_encrypt_core,\@abi-omnipotent
82.align 16
83_vpaes_encrypt_core:
84 mov %rdx, %r9
85 mov \$16, %r11
86 mov 240(%rdx),%eax
87 movdqa %xmm9, %xmm1
88 movdqa .Lk_ipt(%rip), %xmm2 # iptlo
89 pandn %xmm0, %xmm1
90 movdqu (%r9), %xmm5 # round0 key
91 psrld \$4, %xmm1
92 pand %xmm9, %xmm0
93 pshufb %xmm0, %xmm2
94 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
95 pshufb %xmm1, %xmm0
96 pxor %xmm5, %xmm2
97 pxor %xmm2, %xmm0
98 add \$16, %r9
99 lea .Lk_mc_backward(%rip),%r10
100 jmp .Lenc_entry
101
102.align 16
103.Lenc_loop:
104 # middle of middle round
105 movdqa %xmm13, %xmm4 # 4 : sb1u
106 pshufb %xmm2, %xmm4 # 4 = sb1u
107 pxor %xmm5, %xmm4 # 4 = sb1u + k
108 movdqa %xmm12, %xmm0 # 0 : sb1t
109 pshufb %xmm3, %xmm0 # 0 = sb1t
110 pxor %xmm4, %xmm0 # 0 = A
111 movdqa %xmm15, %xmm5 # 4 : sb2u
112 pshufb %xmm2, %xmm5 # 4 = sb2u
113 movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
114 movdqa %xmm14, %xmm2 # 2 : sb2t
115 pshufb %xmm3, %xmm2 # 2 = sb2t
116 pxor %xmm5, %xmm2 # 2 = 2A
117 movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
118 movdqa %xmm0, %xmm3 # 3 = A
119 pshufb %xmm1, %xmm0 # 0 = B
120 add \$16, %r9 # next key
121 pxor %xmm2, %xmm0 # 0 = 2A+B
122 pshufb %xmm4, %xmm3 # 3 = D
123 add \$16, %r11 # next mc
124 pxor %xmm0, %xmm3 # 3 = 2A+B+D
125 pshufb %xmm1, %xmm0 # 0 = 2B+C
126 and \$0x30, %r11 # ... mod 4
127 pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
128 sub \$1,%rax # nr--
129
130.Lenc_entry:
131 # top of round
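	# (Notation sketch: "i"/"j" are nibble shares of the state and
	#  1/x denotes inversion in GF(2^4); the pairs computed here
	#  feed the sbox output tables -- see the vector_aes paper for
	#  the underlying algebra.)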
132 movdqa %xmm9, %xmm1 # 1 : i
133 pandn %xmm0, %xmm1 # 1 = i<<4
134 psrld \$4, %xmm1 # 1 = i
135 pand %xmm9, %xmm0 # 0 = k
136 movdqa %xmm11, %xmm5 # 2 : a/k
137 pshufb %xmm0, %xmm5 # 2 = a/k
138 pxor %xmm1, %xmm0 # 0 = j
139 movdqa %xmm10, %xmm3 # 3 : 1/i
140 pshufb %xmm1, %xmm3 # 3 = 1/i
141 pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
142 movdqa %xmm10, %xmm4 # 4 : 1/j
143 pshufb %xmm0, %xmm4 # 4 = 1/j
144 pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
145 movdqa %xmm10, %xmm2 # 2 : 1/iak
146 pshufb %xmm3, %xmm2 # 2 = 1/iak
147 pxor %xmm0, %xmm2 # 2 = io
148 movdqa %xmm10, %xmm3 # 3 : 1/jak
149 movdqu (%r9), %xmm5
150 pshufb %xmm4, %xmm3 # 3 = 1/jak
151 pxor %xmm1, %xmm3 # 3 = jo
152 jnz .Lenc_loop
153
154 # middle of last round
155 movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
156 movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
157 pshufb %xmm2, %xmm4 # 4 = sbou
158 pxor %xmm5, %xmm4 # 4 = sb1u + k
159 pshufb %xmm3, %xmm0 # 0 = sb1t
160 movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
161 pxor %xmm4, %xmm0 # 0 = A
162 pshufb %xmm1, %xmm0
163 ret
164.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
165
166##
167## Decryption core
168##
169## Same API as encryption core.
170##
171.type _vpaes_decrypt_core,\@abi-omnipotent
172.align 16
173_vpaes_decrypt_core:
174 mov %rdx, %r9 # load key
175 mov 240(%rdx),%eax
176 movdqa %xmm9, %xmm1
177 movdqa .Lk_dipt(%rip), %xmm2 # iptlo
178 pandn %xmm0, %xmm1
179 mov %rax, %r11
180 psrld \$4, %xmm1
181 movdqu (%r9), %xmm5 # round0 key
182 shl \$4, %r11
183 pand %xmm9, %xmm0
184 pshufb %xmm0, %xmm2
185 movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
186 xor \$0x30, %r11
187 lea .Lk_dsbd(%rip),%r10
188 pshufb %xmm1, %xmm0
189 and \$0x30, %r11
190 pxor %xmm5, %xmm2
191 movdqa .Lk_mc_forward+48(%rip), %xmm5
192 pxor %xmm2, %xmm0
193 add \$16, %r9
194 add %r10, %r11
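	# (For reference: %r11 = nr*16 xor'ed/masked down to 0x00..0x30,
	#  rebased onto .Lk_dsbd; -0x160(%r11) at the end then lands in
	#  .Lk_sr[] to undo the nr-dependent shiftrows.)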
195 jmp .Ldec_entry
196
197.align 16
198.Ldec_loop:
199##
200## Inverse mix columns
201##
202 movdqa -0x20(%r10),%xmm4 # 4 : sb9u
203 pshufb %xmm2, %xmm4 # 4 = sb9u
204 pxor %xmm0, %xmm4
205 movdqa -0x10(%r10),%xmm0 # 0 : sb9t
206 pshufb %xmm3, %xmm0 # 0 = sb9t
207 pxor %xmm4, %xmm0 # 0 = ch
208 add \$16, %r9 # next round key
209
210 pshufb %xmm5, %xmm0 # MC ch
211 movdqa 0x00(%r10),%xmm4 # 4 : sbdu
212 pshufb %xmm2, %xmm4 # 4 = sbdu
213 pxor %xmm0, %xmm4 # 4 = ch
214 movdqa 0x10(%r10),%xmm0 # 0 : sbdt
215 pshufb %xmm3, %xmm0 # 0 = sbdt
216 pxor %xmm4, %xmm0 # 0 = ch
217 sub \$1,%rax # nr--
218
219 pshufb %xmm5, %xmm0 # MC ch
220 movdqa 0x20(%r10),%xmm4 # 4 : sbbu
221 pshufb %xmm2, %xmm4 # 4 = sbbu
222 pxor %xmm0, %xmm4 # 4 = ch
223 movdqa 0x30(%r10),%xmm0 # 0 : sbbt
224 pshufb %xmm3, %xmm0 # 0 = sbbt
225 pxor %xmm4, %xmm0 # 0 = ch
226
227 pshufb %xmm5, %xmm0 # MC ch
228 movdqa 0x40(%r10),%xmm4 # 4 : sbeu
229 pshufb %xmm2, %xmm4 # 4 = sbeu
230 pxor %xmm0, %xmm4 # 4 = ch
231 movdqa 0x50(%r10),%xmm0 # 0 : sbet
232 pshufb %xmm3, %xmm0 # 0 = sbet
233 pxor %xmm4, %xmm0 # 0 = ch
234
235	palignr	\$12, %xmm5, %xmm5	# rotate the MC constant
236
237.Ldec_entry:
238 # top of round
239 movdqa %xmm9, %xmm1 # 1 : i
240 pandn %xmm0, %xmm1 # 1 = i<<4
241 psrld \$4, %xmm1 # 1 = i
242 pand %xmm9, %xmm0 # 0 = k
243 movdqa %xmm11, %xmm2 # 2 : a/k
244 pshufb %xmm0, %xmm2 # 2 = a/k
245 pxor %xmm1, %xmm0 # 0 = j
246 movdqa %xmm10, %xmm3 # 3 : 1/i
247 pshufb %xmm1, %xmm3 # 3 = 1/i
248 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
249 movdqa %xmm10, %xmm4 # 4 : 1/j
250 pshufb %xmm0, %xmm4 # 4 = 1/j
251 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
252 movdqa %xmm10, %xmm2 # 2 : 1/iak
253 pshufb %xmm3, %xmm2 # 2 = 1/iak
254 pxor %xmm0, %xmm2 # 2 = io
255 movdqa %xmm10, %xmm3 # 3 : 1/jak
256 pshufb %xmm4, %xmm3 # 3 = 1/jak
257 pxor %xmm1, %xmm3 # 3 = jo
258 movdqu (%r9), %xmm0
259 jnz .Ldec_loop
260
261 # middle of last round
262 movdqa 0x60(%r10), %xmm4 # 3 : sbou
263 pshufb %xmm2, %xmm4 # 4 = sbou
264 pxor %xmm0, %xmm4 # 4 = sb1u + k
265 movdqa 0x70(%r10), %xmm0 # 0 : sbot
266 movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
267 pshufb %xmm3, %xmm0 # 0 = sb1t
268 pxor %xmm4, %xmm0 # 0 = A
269 pshufb %xmm2, %xmm0
270 ret
271.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
272
273########################################################
274## ##
275## AES key schedule ##
276## ##
277########################################################
278.type _vpaes_schedule_core,\@abi-omnipotent
279.align 16
280_vpaes_schedule_core:
281 # rdi = key
282 # rsi = size in bits
283 # rdx = buffer
284 # rcx = direction. 0=encrypt, 1=decrypt
285
286 call _vpaes_preheat # load the tables
287 movdqa .Lk_rcon(%rip), %xmm8 # load rcon
288 movdqu (%rdi), %xmm0 # load key (unaligned)
289
290 # input transform
291 movdqa %xmm0, %xmm3
292 lea .Lk_ipt(%rip), %r11
293 call _vpaes_schedule_transform
294 movdqa %xmm0, %xmm7
295
296 lea .Lk_sr(%rip),%r10
297 test %rcx, %rcx
298 jnz .Lschedule_am_decrypting
299
300 # encrypting, output zeroth round key after transform
301 movdqu %xmm0, (%rdx)
302 jmp .Lschedule_go
303
304.Lschedule_am_decrypting:
305 # decrypting, output zeroth round key after shiftrows
306 movdqa (%r8,%r10),%xmm1
307 pshufb %xmm1, %xmm3
308 movdqu %xmm3, (%rdx)
309 xor \$0x30, %r8
310
311.Lschedule_go:
312 cmp \$192, %esi
313 ja .Lschedule_256
314 je .Lschedule_192
315	# 128: fall through
316
317##
318## .schedule_128
319##
320## 128-bit specific part of key schedule.
321##
322## This schedule is really simple, because all its parts
323## are accomplished by the subroutines.
324##
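## For reference: %esi=10 below yields nine mangled round keys,
## which with the zeroth key and the final key from
## .Lschedule_mangle_last makes the 11 round keys of AES-128.
##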
325.Lschedule_128:
326 mov \$10, %esi
327
328.Loop_schedule_128:
329 call _vpaes_schedule_round
330 dec %rsi
331 jz .Lschedule_mangle_last
332 call _vpaes_schedule_mangle # write output
333 jmp .Loop_schedule_128
334
335##
336## .aes_schedule_192
337##
338## 192-bit specific part of key schedule.
339##
340## The main body of this schedule is the same as the 128-bit
341## schedule, but with more smearing. The long, high side is
342## stored in %xmm7 as before, and the short, low side is in
343## the high bits of %xmm6.
344##
345## This schedule is somewhat nastier, however, because each
346## round produces 192 bits of key material, or 1.5 round keys.
347## Therefore, on each cycle we do 2 rounds and produce 3 round
348## keys.
349##
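## For reference: four cycles of the loop below, together with the
## zeroth key and the final key from .Lschedule_mangle_last, yield
## the 13 round keys AES-192 needs.
##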
350.align 16
351.Lschedule_192:
352 movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
353 call _vpaes_schedule_transform # input transform
354 movdqa %xmm0, %xmm6 # save short part
355 pxor %xmm4, %xmm4 # clear 4
356 movhlps %xmm4, %xmm6 # clobber low side with zeros
357 mov \$4, %esi
358
359.Loop_schedule_192:
360 call _vpaes_schedule_round
361 palignr \$8,%xmm6,%xmm0
362 call _vpaes_schedule_mangle # save key n
363 call _vpaes_schedule_192_smear
364 call _vpaes_schedule_mangle # save key n+1
365 call _vpaes_schedule_round
366 dec %rsi
367 jz .Lschedule_mangle_last
368 call _vpaes_schedule_mangle # save key n+2
369 call _vpaes_schedule_192_smear
370 jmp .Loop_schedule_192
371
372##
373## .aes_schedule_256
374##
375## 256-bit specific part of key schedule.
376##
377## The structure here is very similar to the 128-bit
378## schedule, but with an additional "low side" in
379## %xmm6. The low side's rounds are the same as the
380## high side's, except no rcon and no rotation.
381##
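## For reference: each _vpaes_schedule_mangle call below writes one
## round key; with the zeroth key and the final key from
## .Lschedule_mangle_last this yields the 15 round keys of AES-256.
##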
382.align 16
383.Lschedule_256:
384 movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
385 call _vpaes_schedule_transform # input transform
386 mov \$7, %esi
387
388.Loop_schedule_256:
389 call _vpaes_schedule_mangle # output low result
390 movdqa %xmm0, %xmm6 # save cur_lo in xmm6
391
392 # high round
393 call _vpaes_schedule_round
394 dec %rsi
395 jz .Lschedule_mangle_last
396 call _vpaes_schedule_mangle
397
398 # low round. swap xmm7 and xmm6
399 pshufd \$0xFF, %xmm0, %xmm0
400 movdqa %xmm7, %xmm5
401 movdqa %xmm6, %xmm7
402 call _vpaes_schedule_low_round
403 movdqa %xmm5, %xmm7
404
405 jmp .Loop_schedule_256
406
407
408##
409## .aes_schedule_mangle_last
410##
411## Mangler for last round of key schedule
412## Mangles %xmm0
413## when encrypting, outputs out(%xmm0) ^ 0x63
414## when decrypting, outputs unskew(%xmm0)
415##
416## Always called right before return... jumps to cleanup and exits
417##
418.align 16
419.Lschedule_mangle_last:
420 # schedule last round key from xmm0
421 lea .Lk_deskew(%rip),%r11 # prepare to deskew
422 test %rcx, %rcx
423 jnz .Lschedule_mangle_last_dec
424
425 # encrypting
426 movdqa (%r8,%r10),%xmm1
427 pshufb %xmm1, %xmm0 # output permute
428 lea .Lk_opt(%rip), %r11 # prepare to output transform
429 add \$32, %rdx
430
431.Lschedule_mangle_last_dec:
432 add \$-16, %rdx
433 pxor .Lk_s63(%rip), %xmm0
434 call _vpaes_schedule_transform # output transform
435 movdqu %xmm0, (%rdx) # save last key
436
437 # cleanup
438 pxor %xmm0, %xmm0
439 pxor %xmm1, %xmm1
440 pxor %xmm2, %xmm2
441 pxor %xmm3, %xmm3
442 pxor %xmm4, %xmm4
443 pxor %xmm5, %xmm5
444 pxor %xmm6, %xmm6
445 pxor %xmm7, %xmm7
446 ret
447.size _vpaes_schedule_core,.-_vpaes_schedule_core
448
449##
450## .aes_schedule_192_smear
451##
452## Smear the short, low side in the 192-bit key schedule.
453##
454## Inputs:
455## %xmm7: high side, b a x y
456## %xmm6: low side, d c 0 0
457## %xmm13: 0
458##
459## Outputs:
460## %xmm6: b+c+d b+c 0 0
461## %xmm0: b+c+d b+c b a
462##
463.type _vpaes_schedule_192_smear,\@abi-omnipotent
464.align 16
465_vpaes_schedule_192_smear:
466 pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
467 pxor %xmm0, %xmm6 # -> c+d c 0 0
468 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
469 pxor %xmm0, %xmm6 # -> b+c+d b+c b a
470 movdqa %xmm6, %xmm0
471 pxor %xmm1, %xmm1
472 movhlps %xmm1, %xmm6 # clobber low side with zeros
473 ret
474.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
475
476##
477## .aes_schedule_round
478##
479## Runs one main round of the key schedule on %xmm0, %xmm7
480##
481## Specifically, runs subbytes on the high dword of %xmm0
482## then rotates it by one byte and xors into the low dword of
483## %xmm7.
484##
485## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
486## next rcon.
487##
488## Smears the dwords of %xmm7 by xoring the low into the
489## second low, result into third, result into highest.
490##
491## Returns results in %xmm7 = %xmm0.
492## Clobbers %xmm1-%xmm4, %r11.
493##
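## (This is the standard AES key-expansion step: in effect
##  SubWord(RotWord(w)) ^ rcon on the high word, followed by the
##  xor-smear that derives the remaining words of the round key.)
##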
494.type _vpaes_schedule_round,\@abi-omnipotent
495.align 16
496_vpaes_schedule_round:
497 # extract rcon from xmm8
498 pxor %xmm1, %xmm1
499 palignr \$15, %xmm8, %xmm1
500 palignr \$15, %xmm8, %xmm8
501 pxor %xmm1, %xmm7
502
503 # rotate
504 pshufd \$0xFF, %xmm0, %xmm0
505 palignr \$1, %xmm0, %xmm0
506
507 # fall through...
508
509 # low round: same as high round, but no rotation and no rcon.
510_vpaes_schedule_low_round:
511 # smear xmm7
512 movdqa %xmm7, %xmm1
513 pslldq \$4, %xmm7
514 pxor %xmm1, %xmm7
515 movdqa %xmm7, %xmm1
516 pslldq \$8, %xmm7
517 pxor %xmm1, %xmm7
518 pxor .Lk_s63(%rip), %xmm7
519
520 # subbytes
521 movdqa %xmm9, %xmm1
522 pandn %xmm0, %xmm1
523 psrld \$4, %xmm1 # 1 = i
524 pand %xmm9, %xmm0 # 0 = k
525 movdqa %xmm11, %xmm2 # 2 : a/k
526 pshufb %xmm0, %xmm2 # 2 = a/k
527 pxor %xmm1, %xmm0 # 0 = j
528 movdqa %xmm10, %xmm3 # 3 : 1/i
529 pshufb %xmm1, %xmm3 # 3 = 1/i
530 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
531 movdqa %xmm10, %xmm4 # 4 : 1/j
532 pshufb %xmm0, %xmm4 # 4 = 1/j
533 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
534 movdqa %xmm10, %xmm2 # 2 : 1/iak
535 pshufb %xmm3, %xmm2 # 2 = 1/iak
536 pxor %xmm0, %xmm2 # 2 = io
537 movdqa %xmm10, %xmm3 # 3 : 1/jak
538 pshufb %xmm4, %xmm3 # 3 = 1/jak
539 pxor %xmm1, %xmm3 # 3 = jo
540 movdqa %xmm13, %xmm4 # 4 : sbou
541 pshufb %xmm2, %xmm4 # 4 = sbou
542 movdqa %xmm12, %xmm0 # 0 : sbot
543 pshufb %xmm3, %xmm0 # 0 = sb1t
544 pxor %xmm4, %xmm0 # 0 = sbox output
545
546 # add in smeared stuff
547 pxor %xmm7, %xmm0
548 movdqa %xmm0, %xmm7
549 ret
550.size _vpaes_schedule_round,.-_vpaes_schedule_round
551
552##
553## .aes_schedule_transform
554##
555## Linear-transform %xmm0 according to tables at (%r11)
556##
557## Requires that %xmm9 = 0x0F0F... as in preheat
558## Output in %xmm0
559## Clobbers %xmm1, %xmm2
560##
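## (Same nibble decomposition as elsewhere: per byte x the result
##  is lo_table[x & 0x0F] ^ hi_table[x >> 4].)
##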
561.type _vpaes_schedule_transform,\@abi-omnipotent
562.align 16
563_vpaes_schedule_transform:
564 movdqa %xmm9, %xmm1
565 pandn %xmm0, %xmm1
566 psrld \$4, %xmm1
567 pand %xmm9, %xmm0
568 movdqa (%r11), %xmm2 # lo
569 pshufb %xmm0, %xmm2
570 movdqa 16(%r11), %xmm0 # hi
571 pshufb %xmm1, %xmm0
572 pxor %xmm2, %xmm0
573 ret
574.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
575
576##
577## .aes_schedule_mangle
578##
579## Mangle xmm0 from (basis-transformed) standard version
580## to our version.
581##
582## On encrypt,
583## xor with 0x63
584## multiply by circulant 0,1,1,1
585## apply shiftrows transform
586##
587## On decrypt,
588## xor with 0x63
589## multiply by "inverse mixcolumns" circulant E,B,D,9
590## deskew
591## apply shiftrows transform
592##
593##
594## Writes out to (%rdx), and increments or decrements it
595## Keeps track of round number mod 4 in %r8
596## Preserves xmm0
597## Clobbers xmm1-xmm5
598##
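## Encrypt-path sketch: with R the rotate-one-column permutation
## (.Lk_mc_forward) and y = x ^ .Lk_s63, the code below computes
##	out = R(y) ^ R^2(y) ^ R^3(y)
## which is the multiply-by-circulant(0,1,1,1) described above.
##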
599.type _vpaes_schedule_mangle,\@abi-omnipotent
600.align 16
601_vpaes_schedule_mangle:
602 movdqa %xmm0, %xmm4 # save xmm0 for later
603 movdqa .Lk_mc_forward(%rip),%xmm5
604 test %rcx, %rcx
605 jnz .Lschedule_mangle_dec
606
607 # encrypting
608 add \$16, %rdx
609 pxor .Lk_s63(%rip),%xmm4
610 pshufb %xmm5, %xmm4
611 movdqa %xmm4, %xmm3
612 pshufb %xmm5, %xmm4
613 pxor %xmm4, %xmm3
614 pshufb %xmm5, %xmm4
615 pxor %xmm4, %xmm3
616
617 jmp .Lschedule_mangle_both
618.align 16
619.Lschedule_mangle_dec:
620 # inverse mix columns
621 lea .Lk_dksd(%rip),%r11
622 movdqa %xmm9, %xmm1
623 pandn %xmm4, %xmm1
624 psrld \$4, %xmm1 # 1 = hi
625 pand %xmm9, %xmm4 # 4 = lo
626
627 movdqa 0x00(%r11), %xmm2
628 pshufb %xmm4, %xmm2
629 movdqa 0x10(%r11), %xmm3
630 pshufb %xmm1, %xmm3
631 pxor %xmm2, %xmm3
632 pshufb %xmm5, %xmm3
633
634 movdqa 0x20(%r11), %xmm2
635 pshufb %xmm4, %xmm2
636 pxor %xmm3, %xmm2
637 movdqa 0x30(%r11), %xmm3
638 pshufb %xmm1, %xmm3
639 pxor %xmm2, %xmm3
640 pshufb %xmm5, %xmm3
641
642 movdqa 0x40(%r11), %xmm2
643 pshufb %xmm4, %xmm2
644 pxor %xmm3, %xmm2
645 movdqa 0x50(%r11), %xmm3
646 pshufb %xmm1, %xmm3
647 pxor %xmm2, %xmm3
648 pshufb %xmm5, %xmm3
649
650 movdqa 0x60(%r11), %xmm2
651 pshufb %xmm4, %xmm2
652 pxor %xmm3, %xmm2
653 movdqa 0x70(%r11), %xmm3
654 pshufb %xmm1, %xmm3
655 pxor %xmm2, %xmm3
656
657 add \$-16, %rdx
658
659.Lschedule_mangle_both:
660 movdqa (%r8,%r10),%xmm1
661 pshufb %xmm1,%xmm3
662 add \$-16, %r8
663 and \$0x30, %r8
664 movdqu %xmm3, (%rdx)
665 ret
666.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
667
668#
669# Interface to OpenSSL
670#
671.globl ${PREFIX}_set_encrypt_key
672.type ${PREFIX}_set_encrypt_key,\@function,3
673.align 16
674${PREFIX}_set_encrypt_key:
675___
676$code.=<<___ if ($win64);
677 lea -0xb8(%rsp),%rsp
678 movaps %xmm6,0x10(%rsp)
679 movaps %xmm7,0x20(%rsp)
680 movaps %xmm8,0x30(%rsp)
681 movaps %xmm9,0x40(%rsp)
682 movaps %xmm10,0x50(%rsp)
683 movaps %xmm11,0x60(%rsp)
684 movaps %xmm12,0x70(%rsp)
685 movaps %xmm13,0x80(%rsp)
686 movaps %xmm14,0x90(%rsp)
687 movaps %xmm15,0xa0(%rsp)
688.Lenc_key_body:
689___
690$code.=<<___;
691 mov %esi,%eax
692 shr \$5,%eax
693 add \$5,%eax
694 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
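	# (i.e. 9/11/13 "middle" rounds for 128/192/256-bit keys; see
	#  _vpaes_encrypt_core, which uses this as its loop count.)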
695
696 mov \$0,%ecx
697 mov \$0x30,%r8d
698 call _vpaes_schedule_core
699___
700$code.=<<___ if ($win64);
701 movaps 0x10(%rsp),%xmm6
702 movaps 0x20(%rsp),%xmm7
703 movaps 0x30(%rsp),%xmm8
704 movaps 0x40(%rsp),%xmm9
705 movaps 0x50(%rsp),%xmm10
706 movaps 0x60(%rsp),%xmm11
707 movaps 0x70(%rsp),%xmm12
708 movaps 0x80(%rsp),%xmm13
709 movaps 0x90(%rsp),%xmm14
710 movaps 0xa0(%rsp),%xmm15
711 lea 0xb8(%rsp),%rsp
712.Lenc_key_epilogue:
713___
714$code.=<<___;
715 xor %eax,%eax
716 ret
717.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
718
719.globl ${PREFIX}_set_decrypt_key
720.type ${PREFIX}_set_decrypt_key,\@function,3
721.align 16
722${PREFIX}_set_decrypt_key:
723___
724$code.=<<___ if ($win64);
725 lea -0xb8(%rsp),%rsp
726 movaps %xmm6,0x10(%rsp)
727 movaps %xmm7,0x20(%rsp)
728 movaps %xmm8,0x30(%rsp)
729 movaps %xmm9,0x40(%rsp)
730 movaps %xmm10,0x50(%rsp)
731 movaps %xmm11,0x60(%rsp)
732 movaps %xmm12,0x70(%rsp)
733 movaps %xmm13,0x80(%rsp)
734 movaps %xmm14,0x90(%rsp)
735 movaps %xmm15,0xa0(%rsp)
736.Ldec_key_body:
737___
738$code.=<<___;
739 mov %esi,%eax
740 shr \$5,%eax
741 add \$5,%eax
742 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
743 shl \$4,%eax
744 lea 16(%rdx,%rax),%rdx
745
746 mov \$1,%ecx
747 mov %esi,%r8d
748 shr \$1,%r8d
749 and \$32,%r8d
750 xor \$32,%r8d # nbits==192?0:32
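	# (For reference: 128-bit: 64&32=0 -> 32; 192-bit: 96&32=32 -> 0;
	#  256-bit: 128&32=0 -> 32, matching the comment above.)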
751 call _vpaes_schedule_core
752___
753$code.=<<___ if ($win64);
754 movaps 0x10(%rsp),%xmm6
755 movaps 0x20(%rsp),%xmm7
756 movaps 0x30(%rsp),%xmm8
757 movaps 0x40(%rsp),%xmm9
758 movaps 0x50(%rsp),%xmm10
759 movaps 0x60(%rsp),%xmm11
760 movaps 0x70(%rsp),%xmm12
761 movaps 0x80(%rsp),%xmm13
762 movaps 0x90(%rsp),%xmm14
763 movaps 0xa0(%rsp),%xmm15
764 lea 0xb8(%rsp),%rsp
765.Ldec_key_epilogue:
766___
767$code.=<<___;
768 xor %eax,%eax
769 ret
770.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
771
772.globl ${PREFIX}_encrypt
773.type ${PREFIX}_encrypt,\@function,3
774.align 16
775${PREFIX}_encrypt:
776___
777$code.=<<___ if ($win64);
778 lea -0xb8(%rsp),%rsp
779 movaps %xmm6,0x10(%rsp)
780 movaps %xmm7,0x20(%rsp)
781 movaps %xmm8,0x30(%rsp)
782 movaps %xmm9,0x40(%rsp)
783 movaps %xmm10,0x50(%rsp)
784 movaps %xmm11,0x60(%rsp)
785 movaps %xmm12,0x70(%rsp)
786 movaps %xmm13,0x80(%rsp)
787 movaps %xmm14,0x90(%rsp)
788 movaps %xmm15,0xa0(%rsp)
789.Lenc_body:
790___
791$code.=<<___;
792 movdqu (%rdi),%xmm0
793 call _vpaes_preheat
794 call _vpaes_encrypt_core
795 movdqu %xmm0,(%rsi)
796___
797$code.=<<___ if ($win64);
798 movaps 0x10(%rsp),%xmm6
799 movaps 0x20(%rsp),%xmm7
800 movaps 0x30(%rsp),%xmm8
801 movaps 0x40(%rsp),%xmm9
802 movaps 0x50(%rsp),%xmm10
803 movaps 0x60(%rsp),%xmm11
804 movaps 0x70(%rsp),%xmm12
805 movaps 0x80(%rsp),%xmm13
806 movaps 0x90(%rsp),%xmm14
807 movaps 0xa0(%rsp),%xmm15
808 lea 0xb8(%rsp),%rsp
809.Lenc_epilogue:
810___
811$code.=<<___;
812 ret
813.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
814
815.globl ${PREFIX}_decrypt
816.type ${PREFIX}_decrypt,\@function,3
817.align 16
818${PREFIX}_decrypt:
819___
820$code.=<<___ if ($win64);
821 lea -0xb8(%rsp),%rsp
822 movaps %xmm6,0x10(%rsp)
823 movaps %xmm7,0x20(%rsp)
824 movaps %xmm8,0x30(%rsp)
825 movaps %xmm9,0x40(%rsp)
826 movaps %xmm10,0x50(%rsp)
827 movaps %xmm11,0x60(%rsp)
828 movaps %xmm12,0x70(%rsp)
829 movaps %xmm13,0x80(%rsp)
830 movaps %xmm14,0x90(%rsp)
831 movaps %xmm15,0xa0(%rsp)
832.Ldec_body:
833___
834$code.=<<___;
835 movdqu (%rdi),%xmm0
836 call _vpaes_preheat
837 call _vpaes_decrypt_core
838 movdqu %xmm0,(%rsi)
839___
840$code.=<<___ if ($win64);
841 movaps 0x10(%rsp),%xmm6
842 movaps 0x20(%rsp),%xmm7
843 movaps 0x30(%rsp),%xmm8
844 movaps 0x40(%rsp),%xmm9
845 movaps 0x50(%rsp),%xmm10
846 movaps 0x60(%rsp),%xmm11
847 movaps 0x70(%rsp),%xmm12
848 movaps 0x80(%rsp),%xmm13
849 movaps 0x90(%rsp),%xmm14
850 movaps 0xa0(%rsp),%xmm15
851 lea 0xb8(%rsp),%rsp
852.Ldec_epilogue:
853___
854$code.=<<___;
855 ret
856.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
857___
858{
859my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
860# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
861# size_t length, const AES_KEY *key,
862# unsigned char *ivp,const int enc);
863$code.=<<___;
864.globl ${PREFIX}_cbc_encrypt
865.type ${PREFIX}_cbc_encrypt,\@function,6
866.align 16
867${PREFIX}_cbc_encrypt:
868 xchg $key,$len
869___
870($len,$key)=($key,$len);
871$code.=<<___;
872 sub \$16,$len
873 jc .Lcbc_abort
874___
875$code.=<<___ if ($win64);
876 lea -0xb8(%rsp),%rsp
877 movaps %xmm6,0x10(%rsp)
878 movaps %xmm7,0x20(%rsp)
879 movaps %xmm8,0x30(%rsp)
880 movaps %xmm9,0x40(%rsp)
881 movaps %xmm10,0x50(%rsp)
882 movaps %xmm11,0x60(%rsp)
883 movaps %xmm12,0x70(%rsp)
884 movaps %xmm13,0x80(%rsp)
885 movaps %xmm14,0x90(%rsp)
886 movaps %xmm15,0xa0(%rsp)
887.Lcbc_body:
888___
889$code.=<<___;
890 movdqu ($ivp),%xmm6 # load IV
891 sub $inp,$out
892 call _vpaes_preheat
893 cmp \$0,${enc}d
894 je .Lcbc_dec_loop
895 jmp .Lcbc_enc_loop
896.align 16
897.Lcbc_enc_loop:
898	movdqu	($inp),%xmm0		# load input
899	pxor	%xmm6,%xmm0		# inp ^= iv
900	call	_vpaes_encrypt_core
901	movdqa	%xmm0,%xmm6		# ciphertext is next IV
902	movdqu	%xmm0,($out,$inp)	# write output
903 lea 16($inp),$inp
904 sub \$16,$len
905 jnc .Lcbc_enc_loop
906 jmp .Lcbc_done
907.align 16
908.Lcbc_dec_loop:
909	movdqu	($inp),%xmm0		# load input
910	movdqa	%xmm0,%xmm7		# save ciphertext: it is the next IV
911	call	_vpaes_decrypt_core
912	pxor	%xmm6,%xmm0		# out ^= iv
913	movdqa	%xmm7,%xmm6		# advance IV
914	movdqu	%xmm0,($out,$inp)	# write output
915 lea 16($inp),$inp
916 sub \$16,$len
917 jnc .Lcbc_dec_loop
918.Lcbc_done:
919 movdqu %xmm6,($ivp) # save IV
920___
921$code.=<<___ if ($win64);
922 movaps 0x10(%rsp),%xmm6
923 movaps 0x20(%rsp),%xmm7
924 movaps 0x30(%rsp),%xmm8
925 movaps 0x40(%rsp),%xmm9
926 movaps 0x50(%rsp),%xmm10
927 movaps 0x60(%rsp),%xmm11
928 movaps 0x70(%rsp),%xmm12
929 movaps 0x80(%rsp),%xmm13
930 movaps 0x90(%rsp),%xmm14
931 movaps 0xa0(%rsp),%xmm15
932 lea 0xb8(%rsp),%rsp
933.Lcbc_epilogue:
934___
935$code.=<<___;
936.Lcbc_abort:
937 ret
938.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
939___
940}
941$code.=<<___;
942##
943## _vpaes_preheat
944##
945## Fills register %r10 -> .Lk_s0F (so you can -fPIC)
946## and %xmm9-%xmm15 as specified below.
947##
948.type _vpaes_preheat,\@abi-omnipotent
949.align 16
950_vpaes_preheat:
951 lea .Lk_s0F(%rip), %r10
952 movdqa -0x20(%r10), %xmm10 # .Lk_inv
953 movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
954 movdqa 0x00(%r10), %xmm9 # .Lk_s0F
955 movdqa 0x30(%r10), %xmm13 # .Lk_sb1
956 movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
957 movdqa 0x50(%r10), %xmm15 # .Lk_sb2
958 movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
959 ret
960.size _vpaes_preheat,.-_vpaes_preheat
961########################################################
962## ##
963## Constants ##
964## ##
965########################################################
966.type _vpaes_consts,\@object
967.align 64
968_vpaes_consts:
969.Lk_inv: # inv, inva
970 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
971 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
972
973.Lk_s0F: # s0F
974 .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
975
976.Lk_ipt: # input transform (lo, hi)
977 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
978 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
979
980.Lk_sb1: # sb1u, sb1t
981 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
982 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
983.Lk_sb2: # sb2u, sb2t
984 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
985 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
986.Lk_sbo: # sbou, sbot
987 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
988 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
989
990.Lk_mc_forward: # mc_forward
991 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
992 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
993 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
994 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
995
996.Lk_mc_backward:# mc_backward
997 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
998 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
999 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
1000 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
1001
1002.Lk_sr: # sr
1003 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
1004 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
1005 .quad 0x0F060D040B020900, 0x070E050C030A0108
1006 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
1007
1008.Lk_rcon: # rcon
1009 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
1010
1011.Lk_s63: # s63: all equal to 0x63 transformed
1012 .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
1013
1014.Lk_opt: # output transform
1015 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
1016 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
1017
1018.Lk_deskew: # deskew tables: inverts the sbox's "skew"
1019 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
1020 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
1021
1022##
1023## Decryption stuff
1024## Key schedule constants
1025##
1026.Lk_dksd: # decryption key schedule: invskew x*D
1027 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
1028 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
1029.Lk_dksb: # decryption key schedule: invskew x*B
1030 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
1031 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
1032.Lk_dkse: # decryption key schedule: invskew x*E + 0x63
1033 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
1034 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
1035.Lk_dks9: # decryption key schedule: invskew x*9
1036 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
1037 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
1038
1039##
1040## Decryption stuff
1041## Round function constants
1042##
1043.Lk_dipt: # decryption input transform
1044 .quad 0x0F505B040B545F00, 0x154A411E114E451A
1045 .quad 0x86E383E660056500, 0x12771772F491F194
1046
1047.Lk_dsb9: # decryption sbox output *9*u, *9*t
1048 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
1049 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
1050.Lk_dsbd: # decryption sbox output *D*u, *D*t
1051 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
1052 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
1053.Lk_dsbb: # decryption sbox output *B*u, *B*t
1054 .quad 0xD022649296B44200, 0x602646F6B0F2D404
1055 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
1056.Lk_dsbe: # decryption sbox output *E*u, *E*t
1057 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
1058 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
1059.Lk_dsbo: # decryption sbox final output
1060 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
1061 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
1062.asciz	"Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
1063.align 64
1064.size _vpaes_consts,.-_vpaes_consts
1065___
1066
1067if ($win64) {
1068# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1069# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1070$rec="%rcx";
1071$frame="%rdx";
1072$context="%r8";
1073$disp="%r9";
1074
1075$code.=<<___;
1076.extern __imp_RtlVirtualUnwind
1077.type se_handler,\@abi-omnipotent
1078.align 16
1079se_handler:
1080 push %rsi
1081 push %rdi
1082 push %rbx
1083 push %rbp
1084 push %r12
1085 push %r13
1086 push %r14
1087 push %r15
1088 pushfq
1089 sub \$64,%rsp
1090
1091 mov 120($context),%rax # pull context->Rax
1092 mov 248($context),%rbx # pull context->Rip
1093
1094 mov 8($disp),%rsi # disp->ImageBase
1095 mov 56($disp),%r11 # disp->HandlerData
1096
1097 mov 0(%r11),%r10d # HandlerData[0]
1098 lea (%rsi,%r10),%r10 # prologue label
1099 cmp %r10,%rbx # context->Rip<prologue label
1100 jb .Lin_prologue
1101
1102 mov 152($context),%rax # pull context->Rsp
1103
1104 mov 4(%r11),%r10d # HandlerData[1]
1105 lea (%rsi,%r10),%r10 # epilogue label
1106 cmp %r10,%rbx # context->Rip>=epilogue label
1107 jae .Lin_prologue
1108
1109 lea 16(%rax),%rsi # %xmm save area
1110 lea 512($context),%rdi # &context.Xmm6
1111 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1112 .long 0xa548f3fc # cld; rep movsq
1113 lea 0xb8(%rax),%rax # adjust stack pointer
1114
1115.Lin_prologue:
1116 mov 8(%rax),%rdi
1117 mov 16(%rax),%rsi
1118 mov %rax,152($context) # restore context->Rsp
1119 mov %rsi,168($context) # restore context->Rsi
1120 mov %rdi,176($context) # restore context->Rdi
1121
1122 mov 40($disp),%rdi # disp->ContextRecord
1123 mov $context,%rsi # context
1124 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1125 .long 0xa548f3fc # cld; rep movsq
1126
1127 mov $disp,%rsi
1128 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1129 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1130 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1131 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1132 mov 40(%rsi),%r10 # disp->ContextRecord
1133 lea 56(%rsi),%r11 # &disp->HandlerData
1134 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1135 mov %r10,32(%rsp) # arg5
1136 mov %r11,40(%rsp) # arg6
1137 mov %r12,48(%rsp) # arg7
1138 mov %rcx,56(%rsp) # arg8, (NULL)
1139 call *__imp_RtlVirtualUnwind(%rip)
1140
1141 mov \$1,%eax # ExceptionContinueSearch
1142 add \$64,%rsp
1143 popfq
1144 pop %r15
1145 pop %r14
1146 pop %r13
1147 pop %r12
1148 pop %rbp
1149 pop %rbx
1150 pop %rdi
1151 pop %rsi
1152 ret
1153.size se_handler,.-se_handler
1154
1155.section .pdata
1156.align 4
1157 .rva .LSEH_begin_${PREFIX}_set_encrypt_key
1158 .rva .LSEH_end_${PREFIX}_set_encrypt_key
1159 .rva .LSEH_info_${PREFIX}_set_encrypt_key
1160
1161 .rva .LSEH_begin_${PREFIX}_set_decrypt_key
1162 .rva .LSEH_end_${PREFIX}_set_decrypt_key
1163 .rva .LSEH_info_${PREFIX}_set_decrypt_key
1164
1165 .rva .LSEH_begin_${PREFIX}_encrypt
1166 .rva .LSEH_end_${PREFIX}_encrypt
1167 .rva .LSEH_info_${PREFIX}_encrypt
1168
1169 .rva .LSEH_begin_${PREFIX}_decrypt
1170 .rva .LSEH_end_${PREFIX}_decrypt
1171 .rva .LSEH_info_${PREFIX}_decrypt
1172
1173 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
1174 .rva .LSEH_end_${PREFIX}_cbc_encrypt
1175 .rva .LSEH_info_${PREFIX}_cbc_encrypt
1176
1177.section .xdata
1178.align 8
1179.LSEH_info_${PREFIX}_set_encrypt_key:
1180 .byte 9,0,0,0
1181 .rva se_handler
1182 .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
1183.LSEH_info_${PREFIX}_set_decrypt_key:
1184 .byte 9,0,0,0
1185 .rva se_handler
1186 .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
1187.LSEH_info_${PREFIX}_encrypt:
1188 .byte 9,0,0,0
1189 .rva se_handler
1190 .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
1191.LSEH_info_${PREFIX}_decrypt:
1192 .byte 9,0,0,0
1193 .rva se_handler
1194 .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
1195.LSEH_info_${PREFIX}_cbc_encrypt:
1196 .byte 9,0,0,0
1197 .rva se_handler
1198 .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
1199___
1200}
1201
1202$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1203
1204print $code;
1205
1206close STDOUT;