path: root/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
Diffstat (limited to '')
-rw-r--r--  src/lib/libcrypto/aes/asm/bsaes-x86_64.pl | 3123
1 file changed, 0 insertions(+), 3123 deletions(-)
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
deleted file mode 100644
index c44a338114..0000000000
--- a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
+++ /dev/null
@@ -1,3123 +0,0 @@
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode] ###
5### bitsliced implementation for Intel Core 2 processors ###
6### requires support of SSE extensions up to SSSE3 ###
7### Author: Emilia Käsper and Peter Schwabe ###
8### Date: 2009-03-19 ###
9### Public domain ###
10### ###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12### further information. ###
13###################################################################
14#
15# September 2011.
16#
17# Started as a transliteration to "perlasm", the original code has
18# since undergone the following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop resulting in >5x size reduction
22# from 12.5KB to 2.2KB;
23# - the above was made possible by a mixcolumns() modification that
24# allows feeding its output back to aesenc[last]; this was
25# achieved at the cost of two additional inter-register moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement a key setup subroutine; instead it
28# relies on conversion of the "conventional" key schedule as
29# returned by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which made it
31# possible to skip one shiftrows(), reduce the bit-sliced key
32# schedule and speed up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of 4096-byte buffer with 128-bit key is:
37#
38# Emilia's this(*) difference
39#
40# Core 2 9.30 8.69 +7%
41# Nehalem(**) 7.63 6.98 +9%
42# Atom 17.1 17.4 -2%(***)
43#
44# (*) The comparison is not completely fair, because "this" is ECB,
45# i.e. no extra processing such as counter value calculation
46# and xor-ing of input as in Emilia's CTR implementation is
47# performed. However, the CTR calculations account for no more
48# than 1% of total time, so the comparison is *rather* fair.
49#
50# (**) Results were collected on Westmere, which is considered to
51# be equivalent to Nehalem for this code.
52#
53# (***) The slowdown on Atom is rather strange per se, because the
54# original implementation has a number of 9+-byte instructions,
55# which are bad for the Atom front-end and which I eliminated.
56# In an attempt to address the deterioration, sbox() was tested
57# in the FP SIMD "domain" (movaps instead of movdqa, xorps
58# instead of pxor, etc.). While this gave a nominal 4% gain on
59# Atom, it hurt Westmere by more than a 2x factor.
60#
61# As for the key schedule conversion subroutine: the OpenSSL interface
62# relies on per-invocation on-the-fly conversion. This naturally
63# has an impact on performance, especially for short inputs. Conversion
64# time in CPU cycles and its ratio to CPU cycles spent in the 8x block
65# function is:
66#
67# conversion conversion/8x block
68# Core 2 240 0.22
69# Nehalem 180 0.20
70# Atom 430 0.19
71#
72# The ratio values mean that 128-byte blocks will be processed 16-18%
73# slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, etc.; for N
74# 128-byte chunks the relative slowdown is roughly r/(N+r), where r is
75# the conversion/8x-block ratio above. Also keep in mind that input
76# sizes not divisible by 128 are *effectively* slower, especially the
77# shortest ones, e.g. consecutive 144-byte blocks are processed 44%
78# slower than one would expect, 272-byte - 29%, 400-byte - 22%, etc.
79# Yet, despite all these "shortcomings", it's still faster than the ["hyper-threading-safe" code path in] aes-x86_64.pl on all lengths above 64 bytes...
80#
81# October 2011.
82#
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of 4096-byte buffer with 128-bit key is:
85#
86# Core 2 9.83
87# Nehalem 7.74
88# Atom 19.0
89#
90# November 2011.
91#
92# Add bsaes_xts_[en|de]crypt. Performance on blocks shorter than 80
93# bytes is suboptimal, but XTS is meant to be used with larger blocks...
94#
95# <appro@openssl.org>
96
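# An illustrative sanity check of the conversion-overhead arithmetic in
# the header above (not part of the module; flip the guard to run it
# standalone). It models the relative slowdown of an N*128-byte input as
# r/(N+r), where r is the conversion/8x-block ratio quoted above, and
# reproduces the 16-18%/9-10%/6-7% figures.
if (0) {
	my %ratio = ("Core 2" => 0.22, "Nehalem" => 0.20, "Atom" => 0.19);
	for my $cpu (sort keys %ratio) {
		my $r = $ratio{$cpu};
		for my $n (1..3) {	# 128-, 256-, 384-byte inputs
			printf("%-8s %4d bytes: %4.1f%% slower\n",
			    $cpu, 128*$n, 100*$r/($n+$r));
		}
	}
}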
97$flavour = shift;
98$output = shift;
99if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100
101$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102
103$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106die "can't locate x86_64-xlate.pl";
107
108open OUT,"| \"$^X\" $xlate $flavour $output";
109*STDOUT=*OUT;
110
111my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
112my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
113my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
114
115{
116my ($key,$rounds,$const)=("%rax","%r10d","%r11");
117
118sub Sbox {
119# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
120# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
121my @b=@_[0..7];
122my @t=@_[8..11];
123my @s=@_[12..15];
124 &InBasisChange (@b);
125 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
126 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
127}
128
129sub InBasisChange {
130# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
131# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
132my @b=@_[0..7];
133$code.=<<___;
134 pxor @b[6], @b[5]
135 pxor @b[1], @b[2]
136 pxor @b[0], @b[3]
137 pxor @b[2], @b[6]
138 pxor @b[0], @b[5]
139
140 pxor @b[3], @b[6]
141 pxor @b[7], @b[3]
142 pxor @b[5], @b[7]
143 pxor @b[4], @b[3]
144 pxor @b[5], @b[4]
145 pxor @b[1], @b[3]
146
147 pxor @b[7], @b[2]
148 pxor @b[5], @b[1]
149___
150}
151
152sub OutBasisChange {
153# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
154# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
155my @b=@_[0..7];
156$code.=<<___;
157 pxor @b[6], @b[0]
158 pxor @b[4], @b[1]
159 pxor @b[0], @b[2]
160 pxor @b[6], @b[4]
161 pxor @b[1], @b[6]
162
163 pxor @b[5], @b[1]
164 pxor @b[3], @b[5]
165 pxor @b[7], @b[3]
166 pxor @b[5], @b[7]
167 pxor @b[5], @b[2]
168
169 pxor @b[7], @b[4]
170___
171}
172
173sub InvSbox {
174# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
175# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
176my @b=@_[0..7];
177my @t=@_[8..11];
178my @s=@_[12..15];
179 &InvInBasisChange (@b);
180 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
181 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
182}
183
184sub InvInBasisChange { # OutBasisChange in reverse
185my @b=@_[5,1,2,6,3,7,0,4];
186$code.=<<___
187 pxor @b[7], @b[4]
188
189 pxor @b[5], @b[7]
190 pxor @b[5], @b[2]
191 pxor @b[7], @b[3]
192 pxor @b[3], @b[5]
193 pxor @b[5], @b[1]
194
195 pxor @b[1], @b[6]
196 pxor @b[0], @b[2]
197 pxor @b[6], @b[4]
198 pxor @b[6], @b[0]
199 pxor @b[4], @b[1]
200___
201}
202
203sub InvOutBasisChange { # InBasisChange in reverse
204my @b=@_[2,5,7,3,6,1,0,4];
205$code.=<<___;
206 pxor @b[5], @b[1]
207 pxor @b[7], @b[2]
208
209 pxor @b[1], @b[3]
210 pxor @b[5], @b[4]
211 pxor @b[5], @b[7]
212 pxor @b[4], @b[3]
213 pxor @b[0], @b[5]
214 pxor @b[7], @b[3]
215 pxor @b[2], @b[6]
216 pxor @b[1], @b[2]
217 pxor @b[3], @b[6]
218
219 pxor @b[0], @b[3]
220 pxor @b[6], @b[5]
221___
222}
223
224sub Mul_GF4 {
225#;*************************************************************
226#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
227#;*************************************************************
228my ($x0,$x1,$y0,$y1,$t0)=@_;
229$code.=<<___;
230 movdqa $y0, $t0
231 pxor $y1, $t0
232 pand $x0, $t0
233 pxor $x1, $x0
234 pand $y0, $x1
235 pand $y1, $x0
236 pxor $x1, $x0
237 pxor $t0, $x1
238___
239}
240
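# An illustrative scalar check of the Mul_GF4 gate sequence above (not
# part of the module; flip the guard to run it). It replays the
# pxor/pand network on single bits and verifies that the result behaves
# like GF(2^2) multiplication in a normal-basis representation:
# commutative, with (1,1) acting as identity and no zero divisors.
if (0) {
	my $mul = sub {	# scalar replay of the instruction sequence
		my ($x0,$x1,$y0,$y1) = @_;
		my $t0 = ($y0 ^ $y1) & $x0;			# movdqa/pxor/pand
		return ((($x0 ^ $x1) & $y1) ^ ($x1 & $y0),	# new x0
			($x1 & $y0) ^ $t0);			# new x1
	};
	for my $a (0..3) { for my $b (0..3) {
		my @ab = $mul->($a & 1, $a >> 1, $b & 1, $b >> 1);
		my @ba = $mul->($b & 1, $b >> 1, $a & 1, $a >> 1);
		die "not commutative" if "@ab" ne "@ba";
		die "zero divisor" if $a && $b && !($ab[0] | $ab[1]);
		die "bad identity" if $b == 3 &&
		    ($ab[0] != ($a & 1) || $ab[1] != ($a >> 1));
	}}
	print "Mul_GF4 model OK\n";
}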
241sub Mul_GF4_N { # not used, see next subroutine
242# multiply and scale by N
243my ($x0,$x1,$y0,$y1,$t0)=@_;
244$code.=<<___;
245 movdqa $y0, $t0
246 pxor $y1, $t0
247 pand $x0, $t0
248 pxor $x1, $x0
249 pand $y0, $x1
250 pand $y1, $x0
251 pxor $x0, $x1
252 pxor $t0, $x0
253___
254}
255
256sub Mul_GF4_N_GF4 {
257# interleaved Mul_GF4_N and Mul_GF4
258my ($x0,$x1,$y0,$y1,$t0,
259 $x2,$x3,$y2,$y3,$t1)=@_;
260$code.=<<___;
261 movdqa $y0, $t0
262 movdqa $y2, $t1
263 pxor $y1, $t0
264 pxor $y3, $t1
265 pand $x0, $t0
266 pand $x2, $t1
267 pxor $x1, $x0
268 pxor $x3, $x2
269 pand $y0, $x1
270 pand $y2, $x3
271 pand $y1, $x0
272 pand $y3, $x2
273 pxor $x0, $x1
274 pxor $x3, $x2
275 pxor $t0, $x0
276 pxor $t1, $x3
277___
278}
279sub Mul_GF16_2 {
280my @x=@_[0..7];
281my @y=@_[8..11];
282my @t=@_[12..15];
283$code.=<<___;
284 movdqa @x[0], @t[0]
285 movdqa @x[1], @t[1]
286___
287 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
288$code.=<<___;
289 pxor @x[2], @t[0]
290 pxor @x[3], @t[1]
291 pxor @y[2], @y[0]
292 pxor @y[3], @y[1]
293___
294 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
295 @x[2], @x[3], @y[2], @y[3], @t[2]);
296$code.=<<___;
297 pxor @t[0], @x[0]
298 pxor @t[0], @x[2]
299 pxor @t[1], @x[1]
300 pxor @t[1], @x[3]
301
302 movdqa @x[4], @t[0]
303 movdqa @x[5], @t[1]
304 pxor @x[6], @t[0]
305 pxor @x[7], @t[1]
306___
307 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
308 @x[6], @x[7], @y[2], @y[3], @t[2]);
309$code.=<<___;
310 pxor @y[2], @y[0]
311 pxor @y[3], @y[1]
312___
313 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
314$code.=<<___;
315 pxor @t[0], @x[4]
316 pxor @t[0], @x[6]
317 pxor @t[1], @x[5]
318 pxor @t[1], @x[7]
319___
320}
321sub Inv_GF256 {
322#;********************************************************************
323#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
324#;********************************************************************
325my @x=@_[0..7];
326my @t=@_[8..11];
327my @s=@_[12..15];
328# direct optimizations from hardware
329$code.=<<___;
330 movdqa @x[4], @t[3]
331 movdqa @x[5], @t[2]
332 movdqa @x[1], @t[1]
333 movdqa @x[7], @s[1]
334 movdqa @x[0], @s[0]
335
336 pxor @x[6], @t[3]
337 pxor @x[7], @t[2]
338 pxor @x[3], @t[1]
339 movdqa @t[3], @s[2]
340 pxor @x[6], @s[1]
341 movdqa @t[2], @t[0]
342 pxor @x[2], @s[0]
343 movdqa @t[3], @s[3]
344
345 por @t[1], @t[2]
346 por @s[0], @t[3]
347 pxor @t[0], @s[3]
348 pand @s[0], @s[2]
349 pxor @t[1], @s[0]
350 pand @t[1], @t[0]
351 pand @s[0], @s[3]
352 movdqa @x[3], @s[0]
353 pxor @x[2], @s[0]
354 pand @s[0], @s[1]
355 pxor @s[1], @t[3]
356 pxor @s[1], @t[2]
357 movdqa @x[4], @s[1]
358 movdqa @x[1], @s[0]
359 pxor @x[5], @s[1]
360 pxor @x[0], @s[0]
361 movdqa @s[1], @t[1]
362 pand @s[0], @s[1]
363 por @s[0], @t[1]
364 pxor @s[1], @t[0]
365 pxor @s[3], @t[3]
366 pxor @s[2], @t[2]
367 pxor @s[3], @t[1]
368 movdqa @x[7], @s[0]
369 pxor @s[2], @t[0]
370 movdqa @x[6], @s[1]
371 pxor @s[2], @t[1]
372 movdqa @x[5], @s[2]
373 pand @x[3], @s[0]
374 movdqa @x[4], @s[3]
375 pand @x[2], @s[1]
376 pand @x[1], @s[2]
377 por @x[0], @s[3]
378 pxor @s[0], @t[3]
379 pxor @s[1], @t[2]
380 pxor @s[2], @t[1]
381 pxor @s[3], @t[0]
382
383 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
384
385 # new smaller inversion
386
387 movdqa @t[3], @s[0]
388 pand @t[1], @t[3]
389 pxor @t[2], @s[0]
390
391 movdqa @t[0], @s[2]
392 movdqa @s[0], @s[3]
393 pxor @t[3], @s[2]
394 pand @s[2], @s[3]
395
396 movdqa @t[1], @s[1]
397 pxor @t[2], @s[3]
398 pxor @t[0], @s[1]
399
400 pxor @t[2], @t[3]
401
402 pand @t[3], @s[1]
403
404 movdqa @s[2], @t[2]
405 pxor @t[0], @s[1]
406
407 pxor @s[1], @t[2]
408 pxor @s[1], @t[1]
409
410 pand @t[0], @t[2]
411
412 pxor @t[2], @s[2]
413 pxor @t[2], @t[1]
414
415 pand @s[3], @s[2]
416
417 pxor @s[0], @s[2]
418___
419# output in s3, s2, s1, t1
420
421# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
422
423# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
424 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
425
426### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
427}
428
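# An illustrative reference for what the circuit above computes (not
# part of the module; flip the guard to run it). Sbox() is InBasisChange
# into the tower field, this GF(2^8) inversion, and OutBasisChange back;
# together with the 0x63 constant folded into the converted round keys
# elsewhere in this module, that equals the AES S-box. The snippet
# computes the inverse the slow way, as x^254 with the AES polynomial
# 0x11b, applies the standard affine map and checks two known entries.
if (0) {
	my $gmul = sub {	# GF(2^8) multiply, polynomial 0x11b
		my ($a,$b,$p) = (@_, 0);
		while ($b) {
			$p ^= $a if $b & 1;
			$a = ($a << 1) ^ (($a & 0x80) ? 0x11b : 0);
			$b >>= 1;
		}
		return $p;
	};
	my $rotl = sub { ((($_[0] << $_[1]) | ($_[0] >> (8 - $_[1]))) & 0xff) };
	my $sbox = sub {
		my $x = 1;
		$x = $gmul->($x, $_[0]) for (1..254);	# x = a^254 = a^-1
		return $x ^ $rotl->($x,1) ^ $rotl->($x,2)
		    ^ $rotl->($x,3) ^ $rotl->($x,4) ^ 0x63;
	};
	die unless $sbox->(0x00) == 0x63 && $sbox->(0x53) == 0xed;
	print "Inv_GF256/Sbox reference OK\n";
}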
429# AES linear components
430
431sub ShiftRows {
432my @x=@_[0..7];
433my $mask=pop;
434$code.=<<___;
435 pxor 0x00($key),@x[0]
436 pxor 0x10($key),@x[1]
437 pshufb $mask,@x[0]
438 pxor 0x20($key),@x[2]
439 pshufb $mask,@x[1]
440 pxor 0x30($key),@x[3]
441 pshufb $mask,@x[2]
442 pxor 0x40($key),@x[4]
443 pshufb $mask,@x[3]
444 pxor 0x50($key),@x[5]
445 pshufb $mask,@x[4]
446 pxor 0x60($key),@x[6]
447 pshufb $mask,@x[5]
448 pxor 0x70($key),@x[7]
449 pshufb $mask,@x[6]
450 lea 0x80($key),$key
451 pshufb $mask,@x[7]
452___
453}
454
455sub MixColumns {
456# modified to emit output in order suitable for feeding back to aesenc[last]
457my @x=@_[0..7];
458my @t=@_[8..15];
459my $inv=@_[16]; # optional
460$code.=<<___;
461 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
462 pshufd \$0x93, @x[1], @t[1]
463 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
464 pshufd \$0x93, @x[2], @t[2]
465 pxor @t[1], @x[1]
466 pshufd \$0x93, @x[3], @t[3]
467 pxor @t[2], @x[2]
468 pshufd \$0x93, @x[4], @t[4]
469 pxor @t[3], @x[3]
470 pshufd \$0x93, @x[5], @t[5]
471 pxor @t[4], @x[4]
472 pshufd \$0x93, @x[6], @t[6]
473 pxor @t[5], @x[5]
474 pshufd \$0x93, @x[7], @t[7]
475 pxor @t[6], @x[6]
476 pxor @t[7], @x[7]
477
478 pxor @x[0], @t[1]
479 pxor @x[7], @t[0]
480 pxor @x[7], @t[1]
481 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
482 pxor @x[1], @t[2]
483 pshufd \$0x4E, @x[1], @x[1]
484 pxor @x[4], @t[5]
485 pxor @t[0], @x[0]
486 pxor @x[5], @t[6]
487 pxor @t[1], @x[1]
488 pxor @x[3], @t[4]
489 pshufd \$0x4E, @x[4], @t[0]
490 pxor @x[6], @t[7]
491 pshufd \$0x4E, @x[5], @t[1]
492 pxor @x[2], @t[3]
493 pshufd \$0x4E, @x[3], @x[4]
494 pxor @x[7], @t[3]
495 pshufd \$0x4E, @x[7], @x[5]
496 pxor @x[7], @t[4]
497 pshufd \$0x4E, @x[6], @x[3]
498 pxor @t[4], @t[0]
499 pshufd \$0x4E, @x[2], @x[6]
500 pxor @t[5], @t[1]
501___
502$code.=<<___ if (!$inv);
503 pxor @t[3], @x[4]
504 pxor @t[7], @x[5]
505 pxor @t[6], @x[3]
506 movdqa @t[0], @x[2]
507 pxor @t[2], @x[6]
508 movdqa @t[1], @x[7]
509___
510$code.=<<___ if ($inv);
511 pxor @x[4], @t[3]
512 pxor @t[7], @x[5]
513 pxor @x[3], @t[6]
514 movdqa @t[0], @x[3]
515 pxor @t[2], @x[6]
516 movdqa @t[6], @x[2]
517 movdqa @t[1], @x[7]
518 movdqa @x[6], @x[4]
519 movdqa @t[3], @x[6]
520___
521}
522
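# An illustrative reference check for MixColumns (not part of the
# module; flip the guard to run it). The pshufd-based network above
# evaluates the per-column map b[i] = 2*a[i] ^ 3*a[i+1] ^ a[i+2] ^
# a[i+3] over GF(2^8), just on bit-sliced state; the snippet verifies
# that map on the well-known test column db 13 53 45 -> 8e 4d a1 bc.
if (0) {
	my $xtime = sub { (($_[0] << 1) ^ (($_[0] & 0x80) ? 0x1b : 0)) & 0xff };
	my @a = (0xdb, 0x13, 0x53, 0x45);	# well-known test column
	my @b = map {
		$xtime->($a[$_]) ^ $xtime->($a[($_+1)%4]) ^ $a[($_+1)%4]
		    ^ $a[($_+2)%4] ^ $a[($_+3)%4]	# 2a[i]^3a[i+1]^a[i+2]^a[i+3]
	} (0..3);
	die unless sprintf("%02x%02x%02x%02x", @b) eq "8e4da1bc";
	print "MixColumns reference OK\n";
}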
523sub InvMixColumns_orig {
524my @x=@_[0..7];
525my @t=@_[8..15];
526
527$code.=<<___;
528 # multiplication by 0x0e
529 pshufd \$0x93, @x[7], @t[7]
530 movdqa @x[2], @t[2]
531 pxor @x[5], @x[7] # 7 5
532 pxor @x[5], @x[2] # 2 5
533 pshufd \$0x93, @x[0], @t[0]
534 movdqa @x[5], @t[5]
535 pxor @x[0], @x[5] # 5 0 [1]
536 pxor @x[1], @x[0] # 0 1
537 pshufd \$0x93, @x[1], @t[1]
538 pxor @x[2], @x[1] # 1 25
539 pxor @x[6], @x[0] # 01 6 [2]
540 pxor @x[3], @x[1] # 125 3 [4]
541 pshufd \$0x93, @x[3], @t[3]
542 pxor @x[0], @x[2] # 25 016 [3]
543 pxor @x[7], @x[3] # 3 75
544 pxor @x[6], @x[7] # 75 6 [0]
545 pshufd \$0x93, @x[6], @t[6]
546 movdqa @x[4], @t[4]
547 pxor @x[4], @x[6] # 6 4
548 pxor @x[3], @x[4] # 4 375 [6]
549 pxor @x[7], @x[3] # 375 756=36
550 pxor @t[5], @x[6] # 64 5 [7]
551 pxor @t[2], @x[3] # 36 2
552 pxor @t[4], @x[3] # 362 4 [5]
553 pshufd \$0x93, @t[5], @t[5]
554___
555 my @y = @x[7,5,0,2,1,3,4,6];
556$code.=<<___;
557 # multiplication by 0x0b
558 pxor @y[0], @y[1]
559 pxor @t[0], @y[0]
560 pxor @t[1], @y[1]
561 pshufd \$0x93, @t[2], @t[2]
562 pxor @t[5], @y[0]
563 pxor @t[6], @y[1]
564 pxor @t[7], @y[0]
565 pshufd \$0x93, @t[4], @t[4]
566 pxor @t[6], @t[7] # clobber t[7]
567 pxor @y[0], @y[1]
568
569 pxor @t[0], @y[3]
570 pshufd \$0x93, @t[0], @t[0]
571 pxor @t[1], @y[2]
572 pxor @t[1], @y[4]
573 pxor @t[2], @y[2]
574 pshufd \$0x93, @t[1], @t[1]
575 pxor @t[2], @y[3]
576 pxor @t[2], @y[5]
577 pxor @t[7], @y[2]
578 pshufd \$0x93, @t[2], @t[2]
579 pxor @t[3], @y[3]
580 pxor @t[3], @y[6]
581 pxor @t[3], @y[4]
582 pshufd \$0x93, @t[3], @t[3]
583 pxor @t[4], @y[7]
584 pxor @t[4], @y[5]
585 pxor @t[7], @y[7]
586 pxor @t[5], @y[3]
587 pxor @t[4], @y[4]
588 pxor @t[5], @t[7] # clobber t[7] even more
589
590 pxor @t[7], @y[5]
591 pshufd \$0x93, @t[4], @t[4]
592 pxor @t[7], @y[6]
593 pxor @t[7], @y[4]
594
595 pxor @t[5], @t[7]
596 pshufd \$0x93, @t[5], @t[5]
597 pxor @t[6], @t[7] # restore t[7]
598
599 # multiplication by 0x0d
600 pxor @y[7], @y[4]
601 pxor @t[4], @y[7]
602 pshufd \$0x93, @t[6], @t[6]
603 pxor @t[0], @y[2]
604 pxor @t[5], @y[7]
605 pxor @t[2], @y[2]
606 pshufd \$0x93, @t[7], @t[7]
607
608 pxor @y[1], @y[3]
609 pxor @t[1], @y[1]
610 pxor @t[0], @y[0]
611 pxor @t[0], @y[3]
612 pxor @t[5], @y[1]
613 pxor @t[5], @y[0]
614 pxor @t[7], @y[1]
615 pshufd \$0x93, @t[0], @t[0]
616 pxor @t[6], @y[0]
617 pxor @y[1], @y[3]
618 pxor @t[1], @y[4]
619 pshufd \$0x93, @t[1], @t[1]
620
621 pxor @t[7], @y[7]
622 pxor @t[2], @y[4]
623 pxor @t[2], @y[5]
624 pshufd \$0x93, @t[2], @t[2]
625 pxor @t[6], @y[2]
626 pxor @t[3], @t[6] # clobber t[6]
627 pxor @y[7], @y[4]
628 pxor @t[6], @y[3]
629
630 pxor @t[6], @y[6]
631 pxor @t[5], @y[5]
632 pxor @t[4], @y[6]
633 pshufd \$0x93, @t[4], @t[4]
634 pxor @t[6], @y[5]
635 pxor @t[7], @y[6]
636 pxor @t[3], @t[6] # restore t[6]
637
638 pshufd \$0x93, @t[5], @t[5]
639 pshufd \$0x93, @t[6], @t[6]
640 pshufd \$0x93, @t[7], @t[7]
641 pshufd \$0x93, @t[3], @t[3]
642
643 # multiplication by 0x09
644 pxor @y[1], @y[4]
645 pxor @y[1], @t[1] # t[1]=y[1]
646 pxor @t[5], @t[0] # clobber t[0]
647 pxor @t[5], @t[1]
648 pxor @t[0], @y[3]
649 pxor @y[0], @t[0] # t[0]=y[0]
650 pxor @t[6], @t[1]
651 pxor @t[7], @t[6] # clobber t[6]
652 pxor @t[1], @y[4]
653 pxor @t[4], @y[7]
654 pxor @y[4], @t[4] # t[4]=y[4]
655 pxor @t[3], @y[6]
656 pxor @y[3], @t[3] # t[3]=y[3]
657 pxor @t[2], @y[5]
658 pxor @y[2], @t[2] # t[2]=y[2]
659 pxor @t[7], @t[3]
660 pxor @y[5], @t[5] # t[5]=y[5]
661 pxor @t[6], @t[2]
662 pxor @t[6], @t[5]
663 pxor @y[6], @t[6] # t[6]=y[6]
664 pxor @y[7], @t[7] # t[7]=y[7]
665
666 movdqa @t[0],@XMM[0]
667 movdqa @t[1],@XMM[1]
668 movdqa @t[2],@XMM[2]
669 movdqa @t[3],@XMM[3]
670 movdqa @t[4],@XMM[4]
671 movdqa @t[5],@XMM[5]
672 movdqa @t[6],@XMM[6]
673 movdqa @t[7],@XMM[7]
674___
675}
676
677sub InvMixColumns {
678my @x=@_[0..7];
679my @t=@_[8..15];
680
681# Thanks to Jussi Kivilinna for providing pointer to
682#
683# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
684# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
685# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
686# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
687
688$code.=<<___;
689 # multiplication by 0x05-0x00-0x04-0x00
690 pshufd \$0x4E, @x[0], @t[0]
691 pshufd \$0x4E, @x[6], @t[6]
692 pxor @x[0], @t[0]
693 pshufd \$0x4E, @x[7], @t[7]
694 pxor @x[6], @t[6]
695 pshufd \$0x4E, @x[1], @t[1]
696 pxor @x[7], @t[7]
697 pshufd \$0x4E, @x[2], @t[2]
698 pxor @x[1], @t[1]
699 pshufd \$0x4E, @x[3], @t[3]
700 pxor @x[2], @t[2]
701 pxor @t[6], @x[0]
702 pxor @t[6], @x[1]
703 pshufd \$0x4E, @x[4], @t[4]
704 pxor @x[3], @t[3]
705 pxor @t[0], @x[2]
706 pxor @t[1], @x[3]
707 pshufd \$0x4E, @x[5], @t[5]
708 pxor @x[4], @t[4]
709 pxor @t[7], @x[1]
710 pxor @t[2], @x[4]
711 pxor @x[5], @t[5]
712
713 pxor @t[7], @x[2]
714 pxor @t[6], @x[3]
715 pxor @t[6], @x[4]
716 pxor @t[3], @x[5]
717 pxor @t[4], @x[6]
718 pxor @t[7], @x[4]
719 pxor @t[7], @x[5]
720 pxor @t[5], @x[7]
721___
722 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
723}
724
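# An illustrative check of the factorization above (not part of the
# module; flip the guard to run it). Circulant matrices multiply like
# polynomials modulo x^4+1 with GF(2^8) coefficients, so multiplying the
# MixColumns polynomial 03x^3+01x^2+01x+02 by 04x^2+05 must give the
# InvMixColumns polynomial 0bx^3+0dx^2+09x+0e.
if (0) {
	my $gmul = sub {	# GF(2^8) multiply, polynomial 0x11b
		my ($a,$b,$p) = (@_, 0);
		while ($b) {
			$p ^= $a if $b & 1;
			$a = ($a << 1) ^ (($a & 0x80) ? 0x11b : 0);
			$b >>= 1;
		}
		return $p;
	};
	my @a = (0x02,0x01,0x01,0x03);	# coefficients, lowest first
	my @b = (0x05,0x00,0x04,0x00);
	my @c = (0,0,0,0);
	for my $i (0..3) { for my $j (0..3) {
		$c[($i+$j)%4] ^= $gmul->($a[$i],$b[$j]);	# mod x^4+1
	}}
	die unless sprintf("%02x%02x%02x%02x", @c) eq "0e090d0b";
	print "InvMixColumns factorization OK\n";
}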
725sub aesenc { # not used
726my @b=@_[0..7];
727my @t=@_[8..15];
728$code.=<<___;
729 movdqa 0x30($const),@t[0] # .LSR
730___
731 &ShiftRows (@b,@t[0]);
732 &Sbox (@b,@t);
733 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
734}
735
736sub aesenclast { # not used
737my @b=@_[0..7];
738my @t=@_[8..15];
739$code.=<<___;
740 movdqa 0x40($const),@t[0] # .LSRM0
741___
742 &ShiftRows (@b,@t[0]);
743 &Sbox (@b,@t);
744$code.=<<___
745 pxor 0x00($key),@b[0]
746 pxor 0x10($key),@b[1]
747 pxor 0x20($key),@b[4]
748 pxor 0x30($key),@b[6]
749 pxor 0x40($key),@b[3]
750 pxor 0x50($key),@b[7]
751 pxor 0x60($key),@b[2]
752 pxor 0x70($key),@b[5]
753___
754}
755
756sub swapmove {
757my ($a,$b,$n,$mask,$t)=@_;
758$code.=<<___;
759 movdqa $b,$t
760 psrlq \$$n,$b
761 pxor $a,$b
762 pand $mask,$b
763 pxor $b,$a
764 psllq \$$n,$b
765 pxor $t,$b
766___
767}
768sub swapmove2x {
769my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
770$code.=<<___;
771 movdqa $b0,$t0
772 psrlq \$$n,$b0
773 movdqa $b1,$t1
774 psrlq \$$n,$b1
775 pxor $a0,$b0
776 pxor $a1,$b1
777 pand $mask,$b0
778 pand $mask,$b1
779 pxor $b0,$a0
780 psllq \$$n,$b0
781 pxor $b1,$a1
782 psllq \$$n,$b1
783 pxor $t0,$b0
784 pxor $t1,$b1
785___
786}
787
788sub bitslice {
789my @x=reverse(@_[0..7]);
790my ($t0,$t1,$t2,$t3)=@_[8..11];
791$code.=<<___;
792 movdqa 0x00($const),$t0 # .LBS0
793 movdqa 0x10($const),$t1 # .LBS1
794___
795 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
796 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
797$code.=<<___;
798 movdqa 0x20($const),$t0 # .LBS2
799___
800 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
801 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
802
803 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
804 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
805}
806
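# An illustrative scalar model of the swapmove primitive that bitslice()
# builds on (not part of the module; flip the guard to run it). The
# three passes above (shift 1/2/4 with masks .LBS0/.LBS1/.LBS2) are
# delta-swaps that together transpose the 8x8 bit matrix held across the
# eight registers. The snippet checks, on random bytes, that one
# swapmove trades the bits of $a under $mask for the bits of $b under
# $mask<<$n, and that applying it twice restores the originals.
if (0) {
	my $sm = sub {	# scalar model of swapmove on single bytes
		my ($a, $b, $n, $mask) = @_;
		my $t = (($b >> $n) ^ $a) & $mask;
		return ($a ^ $t, ($b ^ ($t << $n)) & 0xff);
	};
	for (1..1000) {
		my ($a, $b) = (int(rand 256), int(rand 256));
		my ($n, $mask) = @{ ([1,0x55],[2,0x33],[4,0x0f])[int(rand 3)] };
		my ($a2, $b2) = $sm->($a, $b, $n, $mask);
		die unless ($a2 & $mask) == (($b >> $n) & $mask);
		die unless ($b2 & ($mask << $n)) ==
		    (($a << $n) & ($mask << $n) & 0xff);
		my ($a3, $b3) = $sm->($a2, $b2, $n, $mask);
		die unless $a3 == $a && $b3 == $b;	# involution
	}
	print "swapmove model OK\n";
}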
807$code.=<<___;
808.text
809
810.extern asm_AES_encrypt
811.extern asm_AES_decrypt
812
813.type _bsaes_encrypt8,\@abi-omnipotent
814.align 64
815_bsaes_encrypt8:
816 _CET_ENDBR
817 lea .LBS0(%rip), $const # constants table
818
819 movdqa ($key), @XMM[9] # round 0 key
820 lea 0x10($key), $key
821 movdqa 0x50($const), @XMM[8] # .LM0SR
822 pxor @XMM[9], @XMM[0] # xor with round0 key
823 pxor @XMM[9], @XMM[1]
824 pshufb @XMM[8], @XMM[0]
825 pxor @XMM[9], @XMM[2]
826 pshufb @XMM[8], @XMM[1]
827 pxor @XMM[9], @XMM[3]
828 pshufb @XMM[8], @XMM[2]
829 pxor @XMM[9], @XMM[4]
830 pshufb @XMM[8], @XMM[3]
831 pxor @XMM[9], @XMM[5]
832 pshufb @XMM[8], @XMM[4]
833 pxor @XMM[9], @XMM[6]
834 pshufb @XMM[8], @XMM[5]
835 pxor @XMM[9], @XMM[7]
836 pshufb @XMM[8], @XMM[6]
837 pshufb @XMM[8], @XMM[7]
838_bsaes_encrypt8_bitslice:
839___
840 &bitslice (@XMM[0..7, 8..11]);
841$code.=<<___;
842 dec $rounds
843 jmp .Lenc_sbox
844.align 16
845.Lenc_loop:
846___
847 &ShiftRows (@XMM[0..7, 8]);
848$code.=".Lenc_sbox:\n";
849 &Sbox (@XMM[0..7, 8..15]);
850$code.=<<___;
851 dec $rounds
852 jl .Lenc_done
853___
854 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
855$code.=<<___;
856 movdqa 0x30($const), @XMM[8] # .LSR
857 jnz .Lenc_loop
858 movdqa 0x40($const), @XMM[8] # .LSRM0
859 jmp .Lenc_loop
860.align 16
861.Lenc_done:
862___
863 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
864 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
865$code.=<<___;
866 movdqa ($key), @XMM[8] # last round key
867 pxor @XMM[8], @XMM[4]
868 pxor @XMM[8], @XMM[6]
869 pxor @XMM[8], @XMM[3]
870 pxor @XMM[8], @XMM[7]
871 pxor @XMM[8], @XMM[2]
872 pxor @XMM[8], @XMM[5]
873 pxor @XMM[8], @XMM[0]
874 pxor @XMM[8], @XMM[1]
875 ret
876.size _bsaes_encrypt8,.-_bsaes_encrypt8
877
878.type _bsaes_decrypt8,\@abi-omnipotent
879.align 64
880_bsaes_decrypt8:
881 _CET_ENDBR
882 lea .LBS0(%rip), $const # constants table
883
884 movdqa ($key), @XMM[9] # round 0 key
885 lea 0x10($key), $key
886 movdqa -0x30($const), @XMM[8] # .LM0ISR
887 pxor @XMM[9], @XMM[0] # xor with round0 key
888 pxor @XMM[9], @XMM[1]
889 pshufb @XMM[8], @XMM[0]
890 pxor @XMM[9], @XMM[2]
891 pshufb @XMM[8], @XMM[1]
892 pxor @XMM[9], @XMM[3]
893 pshufb @XMM[8], @XMM[2]
894 pxor @XMM[9], @XMM[4]
895 pshufb @XMM[8], @XMM[3]
896 pxor @XMM[9], @XMM[5]
897 pshufb @XMM[8], @XMM[4]
898 pxor @XMM[9], @XMM[6]
899 pshufb @XMM[8], @XMM[5]
900 pxor @XMM[9], @XMM[7]
901 pshufb @XMM[8], @XMM[6]
902 pshufb @XMM[8], @XMM[7]
903___
904 &bitslice (@XMM[0..7, 8..11]);
905$code.=<<___;
906 dec $rounds
907 jmp .Ldec_sbox
908.align 16
909.Ldec_loop:
910___
911 &ShiftRows (@XMM[0..7, 8]);
912$code.=".Ldec_sbox:\n";
913 &InvSbox (@XMM[0..7, 8..15]);
914$code.=<<___;
915 dec $rounds
916 jl .Ldec_done
917___
918 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
919$code.=<<___;
920 movdqa -0x10($const), @XMM[8] # .LISR
921 jnz .Ldec_loop
922 movdqa -0x20($const), @XMM[8] # .LISRM0
923 jmp .Ldec_loop
924.align 16
925.Ldec_done:
926___
927 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
928$code.=<<___;
929 movdqa ($key), @XMM[8] # last round key
930 pxor @XMM[8], @XMM[6]
931 pxor @XMM[8], @XMM[4]
932 pxor @XMM[8], @XMM[2]
933 pxor @XMM[8], @XMM[7]
934 pxor @XMM[8], @XMM[3]
935 pxor @XMM[8], @XMM[5]
936 pxor @XMM[8], @XMM[0]
937 pxor @XMM[8], @XMM[1]
938 ret
939.size _bsaes_decrypt8,.-_bsaes_decrypt8
940___
941}
942{
943my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
944
945sub bitslice_key {
946my @x=reverse(@_[0..7]);
947my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
948
949 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
950$code.=<<___;
951 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
952 movdqa @x[0], @x[2]
953 movdqa @x[1], @x[3]
954___
955 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
956
957 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
958$code.=<<___;
959 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
960 movdqa @x[0], @x[4]
961 movdqa @x[2], @x[6]
962 movdqa @x[1], @x[5]
963 movdqa @x[3], @x[7]
964___
965 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
966 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
967}
968
969$code.=<<___;
970.type _bsaes_key_convert,\@abi-omnipotent
971.align 16
972_bsaes_key_convert:
973 _CET_ENDBR
974 lea .Lmasks(%rip), $const
975 movdqu ($inp), %xmm7 # load round 0 key
976 lea 0x10($inp), $inp
977 movdqa 0x00($const), %xmm0 # 0x01...
978 movdqa 0x10($const), %xmm1 # 0x02...
979 movdqa 0x20($const), %xmm2 # 0x04...
980 movdqa 0x30($const), %xmm3 # 0x08...
981 movdqa 0x40($const), %xmm4 # .LM0
982 pcmpeqd %xmm5, %xmm5 # .LNOT
983
984 movdqu ($inp), %xmm6 # load round 1 key
985 movdqa %xmm7, ($out) # save round 0 key
986 lea 0x10($out), $out
987 dec $rounds
988 jmp .Lkey_loop
989.align 16
990.Lkey_loop:
991 pshufb %xmm4, %xmm6 # .LM0
992
993 movdqa %xmm0, %xmm8
994 movdqa %xmm1, %xmm9
995
996 pand %xmm6, %xmm8
997 pand %xmm6, %xmm9
998 movdqa %xmm2, %xmm10
999 pcmpeqb %xmm0, %xmm8
1000 psllq \$4, %xmm0 # 0x10...
1001 movdqa %xmm3, %xmm11
1002 pcmpeqb %xmm1, %xmm9
1003 psllq \$4, %xmm1 # 0x20...
1004
1005 pand %xmm6, %xmm10
1006 pand %xmm6, %xmm11
1007 movdqa %xmm0, %xmm12
1008 pcmpeqb %xmm2, %xmm10
1009 psllq \$4, %xmm2 # 0x40...
1010 movdqa %xmm1, %xmm13
1011 pcmpeqb %xmm3, %xmm11
1012 psllq \$4, %xmm3 # 0x80...
1013
1014 movdqa %xmm2, %xmm14
1015 movdqa %xmm3, %xmm15
1016 pxor %xmm5, %xmm8 # "pnot"
1017 pxor %xmm5, %xmm9
1018
1019 pand %xmm6, %xmm12
1020 pand %xmm6, %xmm13
1021 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1022 pcmpeqb %xmm0, %xmm12
1023 psrlq \$4, %xmm0 # 0x01...
1024 movdqa %xmm9, 0x10($out)
1025 pcmpeqb %xmm1, %xmm13
1026 psrlq \$4, %xmm1 # 0x02...
1027 lea 0x10($inp), $inp
1028
1029 pand %xmm6, %xmm14
1030 pand %xmm6, %xmm15
1031 movdqa %xmm10, 0x20($out)
1032 pcmpeqb %xmm2, %xmm14
1033 psrlq \$4, %xmm2 # 0x04...
1034 movdqa %xmm11, 0x30($out)
1035 pcmpeqb %xmm3, %xmm15
1036 psrlq \$4, %xmm3 # 0x08...
1037 movdqu ($inp), %xmm6 # load next round key
1038
1039 pxor %xmm5, %xmm13 # "pnot"
1040 pxor %xmm5, %xmm14
1041 movdqa %xmm12, 0x40($out)
1042 movdqa %xmm13, 0x50($out)
1043 movdqa %xmm14, 0x60($out)
1044 movdqa %xmm15, 0x70($out)
1045 lea 0x80($out),$out
1046 dec $rounds
1047 jnz .Lkey_loop
1048
1049 movdqa 0x50($const), %xmm7 # .L63
1050 #movdqa %xmm6, ($out) # don't save last round key
1051 ret
1052.size _bsaes_key_convert,.-_bsaes_key_convert
1053___
1054}
1055
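# An illustrative scalar model of the mask trick in _bsaes_key_convert
# above (not part of the module; flip the guard to run it). For each
# mask m in 0x01,0x02,...,0x80, "pand" then "pcmpeqb" against m turns
# every byte into 0xff if its m-bit was set and 0x00 otherwise, i.e. it
# broadcasts one bit of each key byte across a whole byte lane -- the
# bit-plane layout the loop stores as the bit-sliced round key.
if (0) {
	my $plane = sub {	# one byte lane of one bit plane
		my ($byte, $mask) = @_;
		my $and = $byte & $mask;		# pand
		return $and == $mask ? 0xff : 0x00;	# pcmpeqb
	};
	for my $byte (0x00, 0x53, 0xff) {
		my @planes = map { $plane->($byte, 1 << $_) } (0..7);
		my $back = 0;	# reassemble the byte from its planes
		$back |= (1 << $_) for grep { $planes[$_] } (0..7);
		die unless $back == $byte;
	}
	print "bit-plane extraction OK\n";
}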
1056if (0 && !$win64) { # following four functions are unsupported interface
1057 # used for benchmarking...
1058$code.=<<___;
1059.globl bsaes_enc_key_convert
1060.type bsaes_enc_key_convert,\@function,2
1061.align 16
1062bsaes_enc_key_convert:
1063 _CET_ENDBR
1064 mov 240($inp),%r10d # pass rounds
1065 mov $inp,%rcx # pass key
1066 mov $out,%rax # pass key schedule
1067 call _bsaes_key_convert
1068 pxor %xmm6,%xmm7 # fix up last round key
1069 movdqa %xmm7,(%rax) # save last round key
1070 ret
1071.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1072
1073.globl bsaes_encrypt_128
1074.type bsaes_encrypt_128,\@function,4
1075.align 16
1076bsaes_encrypt_128:
1077 _CET_ENDBR
1078.Lenc128_loop:
1079 movdqu 0x00($inp), @XMM[0] # load input
1080 movdqu 0x10($inp), @XMM[1]
1081 movdqu 0x20($inp), @XMM[2]
1082 movdqu 0x30($inp), @XMM[3]
1083 movdqu 0x40($inp), @XMM[4]
1084 movdqu 0x50($inp), @XMM[5]
1085 movdqu 0x60($inp), @XMM[6]
1086 movdqu 0x70($inp), @XMM[7]
1087 mov $key, %rax # pass the $key
1088 lea 0x80($inp), $inp
1089 mov \$10,%r10d
1090
1091 call _bsaes_encrypt8
1092
1093 movdqu @XMM[0], 0x00($out) # write output
1094 movdqu @XMM[1], 0x10($out)
1095 movdqu @XMM[4], 0x20($out)
1096 movdqu @XMM[6], 0x30($out)
1097 movdqu @XMM[3], 0x40($out)
1098 movdqu @XMM[7], 0x50($out)
1099 movdqu @XMM[2], 0x60($out)
1100 movdqu @XMM[5], 0x70($out)
1101 lea 0x80($out), $out
1102 sub \$0x80,$len
1103 ja .Lenc128_loop
1104 ret
1105.size bsaes_encrypt_128,.-bsaes_encrypt_128
1106
1107.globl bsaes_dec_key_convert
1108.type bsaes_dec_key_convert,\@function,2
1109.align 16
1110bsaes_dec_key_convert:
1111 _CET_ENDBR
1112 mov 240($inp),%r10d # pass rounds
1113 mov $inp,%rcx # pass key
1114 mov $out,%rax # pass key schedule
1115 call _bsaes_key_convert
1116 pxor ($out),%xmm7 # fix up round 0 key
1117 movdqa %xmm6,(%rax) # save last round key
1118 movdqa %xmm7,($out)
1119 ret
1120.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1121
1122.globl bsaes_decrypt_128
1123.type bsaes_decrypt_128,\@function,4
1124.align 16
1125bsaes_decrypt_128:
1126 _CET_ENDBR
1127.Ldec128_loop:
1128 movdqu 0x00($inp), @XMM[0] # load input
1129 movdqu 0x10($inp), @XMM[1]
1130 movdqu 0x20($inp), @XMM[2]
1131 movdqu 0x30($inp), @XMM[3]
1132 movdqu 0x40($inp), @XMM[4]
1133 movdqu 0x50($inp), @XMM[5]
1134 movdqu 0x60($inp), @XMM[6]
1135 movdqu 0x70($inp), @XMM[7]
1136 mov $key, %rax # pass the $key
1137 lea 0x80($inp), $inp
1138 mov \$10,%r10d
1139
1140 call _bsaes_decrypt8
1141
1142 movdqu @XMM[0], 0x00($out) # write output
1143 movdqu @XMM[1], 0x10($out)
1144 movdqu @XMM[6], 0x20($out)
1145 movdqu @XMM[4], 0x30($out)
1146 movdqu @XMM[2], 0x40($out)
1147 movdqu @XMM[7], 0x50($out)
1148 movdqu @XMM[3], 0x60($out)
1149 movdqu @XMM[5], 0x70($out)
1150 lea 0x80($out), $out
1151 sub \$0x80,$len
1152 ja .Ldec128_loop
1153 ret
1154.size bsaes_decrypt_128,.-bsaes_decrypt_128
1155___
1156}
1157{
1158######################################################################
1159#
1160# OpenSSL interface
1161#
1162my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1163 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1164my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1165
1166if ($ecb) {
1167$code.=<<___;
1168.globl bsaes_ecb_encrypt_blocks
1169.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1170.align 16
1171bsaes_ecb_encrypt_blocks:
1172 _CET_ENDBR
1173 mov %rsp, %rax
1174.Lecb_enc_prologue:
1175 push %rbp
1176 push %rbx
1177 push %r12
1178 push %r13
1179 push %r14
1180 push %r15
1181 lea -0x48(%rsp),%rsp
1182___
1183$code.=<<___ if ($win64);
1184 lea -0xa0(%rsp), %rsp
1185 movaps %xmm6, 0x40(%rsp)
1186 movaps %xmm7, 0x50(%rsp)
1187 movaps %xmm8, 0x60(%rsp)
1188 movaps %xmm9, 0x70(%rsp)
1189 movaps %xmm10, 0x80(%rsp)
1190 movaps %xmm11, 0x90(%rsp)
1191 movaps %xmm12, 0xa0(%rsp)
1192 movaps %xmm13, 0xb0(%rsp)
1193 movaps %xmm14, 0xc0(%rsp)
1194 movaps %xmm15, 0xd0(%rsp)
1195.Lecb_enc_body:
1196___
1197$code.=<<___;
1198 mov %rsp,%rbp # backup %rsp
1199 mov 240($arg4),%eax # rounds
1200 mov $arg1,$inp # backup arguments
1201 mov $arg2,$out
1202 mov $arg3,$len
1203 mov $arg4,$key
1204 cmp \$8,$arg3
1205 jb .Lecb_enc_short
1206
1207 mov %eax,%ebx # backup rounds
1208 shl \$7,%rax # 128 bytes per inner round key
1209 sub \$`128-32`,%rax # size of bit-sliced key schedule
1210 sub %rax,%rsp
1211 mov %rsp,%rax # pass key schedule
1212 mov $key,%rcx # pass key
1213 mov %ebx,%r10d # pass rounds
1214 call _bsaes_key_convert
1215 pxor %xmm6,%xmm7 # fix up last round key
1216 movdqa %xmm7,(%rax) # save last round key
1217
1218 sub \$8,$len
1219.Lecb_enc_loop:
1220 movdqu 0x00($inp), @XMM[0] # load input
1221 movdqu 0x10($inp), @XMM[1]
1222 movdqu 0x20($inp), @XMM[2]
1223 movdqu 0x30($inp), @XMM[3]
1224 movdqu 0x40($inp), @XMM[4]
1225 movdqu 0x50($inp), @XMM[5]
1226 mov %rsp, %rax # pass key schedule
1227 movdqu 0x60($inp), @XMM[6]
1228 mov %ebx,%r10d # pass rounds
1229 movdqu 0x70($inp), @XMM[7]
1230 lea 0x80($inp), $inp
1231
1232 call _bsaes_encrypt8
1233
1234 movdqu @XMM[0], 0x00($out) # write output
1235 movdqu @XMM[1], 0x10($out)
1236 movdqu @XMM[4], 0x20($out)
1237 movdqu @XMM[6], 0x30($out)
1238 movdqu @XMM[3], 0x40($out)
1239 movdqu @XMM[7], 0x50($out)
1240 movdqu @XMM[2], 0x60($out)
1241 movdqu @XMM[5], 0x70($out)
1242 lea 0x80($out), $out
1243 sub \$8,$len
1244 jnc .Lecb_enc_loop
1245
1246 add \$8,$len
1247 jz .Lecb_enc_done
1248
1249 movdqu 0x00($inp), @XMM[0] # load input
1250 mov %rsp, %rax # pass key schedule
1251 mov %ebx,%r10d # pass rounds
1252 cmp \$2,$len
1253 jb .Lecb_enc_one
1254 movdqu 0x10($inp), @XMM[1]
1255 je .Lecb_enc_two
1256 movdqu 0x20($inp), @XMM[2]
1257 cmp \$4,$len
1258 jb .Lecb_enc_three
1259 movdqu 0x30($inp), @XMM[3]
1260 je .Lecb_enc_four
1261 movdqu 0x40($inp), @XMM[4]
1262 cmp \$6,$len
1263 jb .Lecb_enc_five
1264 movdqu 0x50($inp), @XMM[5]
1265 je .Lecb_enc_six
1266 movdqu 0x60($inp), @XMM[6]
1267 call _bsaes_encrypt8
1268 movdqu @XMM[0], 0x00($out) # write output
1269 movdqu @XMM[1], 0x10($out)
1270 movdqu @XMM[4], 0x20($out)
1271 movdqu @XMM[6], 0x30($out)
1272 movdqu @XMM[3], 0x40($out)
1273 movdqu @XMM[7], 0x50($out)
1274 movdqu @XMM[2], 0x60($out)
1275 jmp .Lecb_enc_done
1276.align 16
1277.Lecb_enc_six:
1278 call _bsaes_encrypt8
1279 movdqu @XMM[0], 0x00($out) # write output
1280 movdqu @XMM[1], 0x10($out)
1281 movdqu @XMM[4], 0x20($out)
1282 movdqu @XMM[6], 0x30($out)
1283 movdqu @XMM[3], 0x40($out)
1284 movdqu @XMM[7], 0x50($out)
1285 jmp .Lecb_enc_done
1286.align 16
1287.Lecb_enc_five:
1288 call _bsaes_encrypt8
1289 movdqu @XMM[0], 0x00($out) # write output
1290 movdqu @XMM[1], 0x10($out)
1291 movdqu @XMM[4], 0x20($out)
1292 movdqu @XMM[6], 0x30($out)
1293 movdqu @XMM[3], 0x40($out)
1294 jmp .Lecb_enc_done
1295.align 16
1296.Lecb_enc_four:
1297 call _bsaes_encrypt8
1298 movdqu @XMM[0], 0x00($out) # write output
1299 movdqu @XMM[1], 0x10($out)
1300 movdqu @XMM[4], 0x20($out)
1301 movdqu @XMM[6], 0x30($out)
1302 jmp .Lecb_enc_done
1303.align 16
1304.Lecb_enc_three:
1305 call _bsaes_encrypt8
1306 movdqu @XMM[0], 0x00($out) # write output
1307 movdqu @XMM[1], 0x10($out)
1308 movdqu @XMM[4], 0x20($out)
1309 jmp .Lecb_enc_done
1310.align 16
1311.Lecb_enc_two:
1312 call _bsaes_encrypt8
1313 movdqu @XMM[0], 0x00($out) # write output
1314 movdqu @XMM[1], 0x10($out)
1315 jmp .Lecb_enc_done
1316.align 16
1317.Lecb_enc_one:
1318 call _bsaes_encrypt8
1319 movdqu @XMM[0], 0x00($out) # write output
1320 jmp .Lecb_enc_done
1321.align 16
1322.Lecb_enc_short:
1323 lea ($inp), $arg1
1324 lea ($out), $arg2
1325 lea ($key), $arg3
1326 call asm_AES_encrypt
1327 lea 16($inp), $inp
1328 lea 16($out), $out
1329 dec $len
1330 jnz .Lecb_enc_short
1331
1332.Lecb_enc_done:
1333 lea (%rsp),%rax
1334 pxor %xmm0, %xmm0
1335.Lecb_enc_bzero: # wipe key schedule [if any]
1336 movdqa %xmm0, 0x00(%rax)
1337 movdqa %xmm0, 0x10(%rax)
1338 lea 0x20(%rax), %rax
1339 cmp %rax, %rbp
1340 jb .Lecb_enc_bzero
1341
1342 lea (%rbp),%rsp # restore %rsp
1343___
1344$code.=<<___ if ($win64);
1345 movaps 0x40(%rbp), %xmm6
1346 movaps 0x50(%rbp), %xmm7
1347 movaps 0x60(%rbp), %xmm8
1348 movaps 0x70(%rbp), %xmm9
1349 movaps 0x80(%rbp), %xmm10
1350 movaps 0x90(%rbp), %xmm11
1351 movaps 0xa0(%rbp), %xmm12
1352 movaps 0xb0(%rbp), %xmm13
1353 movaps 0xc0(%rbp), %xmm14
1354 movaps 0xd0(%rbp), %xmm15
1355 lea 0xa0(%rbp), %rsp
1356___
1357$code.=<<___;
1358 mov 0x48(%rsp), %r15
1359 mov 0x50(%rsp), %r14
1360 mov 0x58(%rsp), %r13
1361 mov 0x60(%rsp), %r12
1362 mov 0x68(%rsp), %rbx
1363 mov 0x70(%rsp), %rax
1364 lea 0x78(%rsp), %rsp
1365 mov %rax, %rbp
1366.Lecb_enc_epilogue:
1367 ret
1368.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1369
1370.globl bsaes_ecb_decrypt_blocks
1371.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1372.align 16
1373bsaes_ecb_decrypt_blocks:
1374 _CET_ENDBR
1375 mov %rsp, %rax
1376.Lecb_dec_prologue:
1377 push %rbp
1378 push %rbx
1379 push %r12
1380 push %r13
1381 push %r14
1382 push %r15
1383 lea -0x48(%rsp),%rsp
1384___
1385$code.=<<___ if ($win64);
1386 lea -0xa0(%rsp), %rsp
1387 movaps %xmm6, 0x40(%rsp)
1388 movaps %xmm7, 0x50(%rsp)
1389 movaps %xmm8, 0x60(%rsp)
1390 movaps %xmm9, 0x70(%rsp)
1391 movaps %xmm10, 0x80(%rsp)
1392 movaps %xmm11, 0x90(%rsp)
1393 movaps %xmm12, 0xa0(%rsp)
1394 movaps %xmm13, 0xb0(%rsp)
1395 movaps %xmm14, 0xc0(%rsp)
1396 movaps %xmm15, 0xd0(%rsp)
1397.Lecb_dec_body:
1398___
1399$code.=<<___;
1400 mov %rsp,%rbp # backup %rsp
1401 mov 240($arg4),%eax # rounds
1402 mov $arg1,$inp # backup arguments
1403 mov $arg2,$out
1404 mov $arg3,$len
1405 mov $arg4,$key
1406 cmp \$8,$arg3
1407 jb .Lecb_dec_short
1408
1409 mov %eax,%ebx # backup rounds
1410 shl \$7,%rax # 128 bytes per inner round key
1411 sub \$`128-32`,%rax # size of bit-sliced key schedule
1412 sub %rax,%rsp
1413 mov %rsp,%rax # pass key schedule
1414 mov $key,%rcx # pass key
1415 mov %ebx,%r10d # pass rounds
1416 call _bsaes_key_convert
1417 pxor (%rsp),%xmm7 # fix up round 0 key
1418 movdqa %xmm6,(%rax) # save last round key
1419 movdqa %xmm7,(%rsp)
1420
1421 sub \$8,$len
1422.Lecb_dec_loop:
1423 movdqu 0x00($inp), @XMM[0] # load input
1424 movdqu 0x10($inp), @XMM[1]
1425 movdqu 0x20($inp), @XMM[2]
1426 movdqu 0x30($inp), @XMM[3]
1427 movdqu 0x40($inp), @XMM[4]
1428 movdqu 0x50($inp), @XMM[5]
1429 mov %rsp, %rax # pass key schedule
1430 movdqu 0x60($inp), @XMM[6]
1431 mov %ebx,%r10d # pass rounds
1432 movdqu 0x70($inp), @XMM[7]
1433 lea 0x80($inp), $inp
1434
1435 call _bsaes_decrypt8
1436
1437 movdqu @XMM[0], 0x00($out) # write output
1438 movdqu @XMM[1], 0x10($out)
1439 movdqu @XMM[6], 0x20($out)
1440 movdqu @XMM[4], 0x30($out)
1441 movdqu @XMM[2], 0x40($out)
1442 movdqu @XMM[7], 0x50($out)
1443 movdqu @XMM[3], 0x60($out)
1444 movdqu @XMM[5], 0x70($out)
1445 lea 0x80($out), $out
1446 sub \$8,$len
1447 jnc .Lecb_dec_loop
1448
1449 add \$8,$len
1450 jz .Lecb_dec_done
1451
1452 movdqu 0x00($inp), @XMM[0] # load input
1453 mov %rsp, %rax # pass key schedule
1454 mov %ebx,%r10d # pass rounds
1455 cmp \$2,$len
1456 jb .Lecb_dec_one
1457 movdqu 0x10($inp), @XMM[1]
1458 je .Lecb_dec_two
1459 movdqu 0x20($inp), @XMM[2]
1460 cmp \$4,$len
1461 jb .Lecb_dec_three
1462 movdqu 0x30($inp), @XMM[3]
1463 je .Lecb_dec_four
1464 movdqu 0x40($inp), @XMM[4]
1465 cmp \$6,$len
1466 jb .Lecb_dec_five
1467 movdqu 0x50($inp), @XMM[5]
1468 je .Lecb_dec_six
1469 movdqu 0x60($inp), @XMM[6]
1470 call _bsaes_decrypt8
1471 movdqu @XMM[0], 0x00($out) # write output
1472 movdqu @XMM[1], 0x10($out)
1473 movdqu @XMM[6], 0x20($out)
1474 movdqu @XMM[4], 0x30($out)
1475 movdqu @XMM[2], 0x40($out)
1476 movdqu @XMM[7], 0x50($out)
1477 movdqu @XMM[3], 0x60($out)
1478 jmp .Lecb_dec_done
1479.align 16
1480.Lecb_dec_six:
1481 call _bsaes_decrypt8
1482 movdqu @XMM[0], 0x00($out) # write output
1483 movdqu @XMM[1], 0x10($out)
1484 movdqu @XMM[6], 0x20($out)
1485 movdqu @XMM[4], 0x30($out)
1486 movdqu @XMM[2], 0x40($out)
1487 movdqu @XMM[7], 0x50($out)
1488 jmp .Lecb_dec_done
1489.align 16
1490.Lecb_dec_five:
1491 call _bsaes_decrypt8
1492 movdqu @XMM[0], 0x00($out) # write output
1493 movdqu @XMM[1], 0x10($out)
1494 movdqu @XMM[6], 0x20($out)
1495 movdqu @XMM[4], 0x30($out)
1496 movdqu @XMM[2], 0x40($out)
1497 jmp .Lecb_dec_done
1498.align 16
1499.Lecb_dec_four:
1500 call _bsaes_decrypt8
1501 movdqu @XMM[0], 0x00($out) # write output
1502 movdqu @XMM[1], 0x10($out)
1503 movdqu @XMM[6], 0x20($out)
1504 movdqu @XMM[4], 0x30($out)
1505 jmp .Lecb_dec_done
1506.align 16
1507.Lecb_dec_three:
1508 call _bsaes_decrypt8
1509 movdqu @XMM[0], 0x00($out) # write output
1510 movdqu @XMM[1], 0x10($out)
1511 movdqu @XMM[6], 0x20($out)
1512 jmp .Lecb_dec_done
1513.align 16
1514.Lecb_dec_two:
1515 call _bsaes_decrypt8
1516 movdqu @XMM[0], 0x00($out) # write output
1517 movdqu @XMM[1], 0x10($out)
1518 jmp .Lecb_dec_done
1519.align 16
1520.Lecb_dec_one:
1521 call _bsaes_decrypt8
1522 movdqu @XMM[0], 0x00($out) # write output
1523 jmp .Lecb_dec_done
1524.align 16
1525.Lecb_dec_short:
1526 lea ($inp), $arg1
1527 lea ($out), $arg2
1528 lea ($key), $arg3
1529 call asm_AES_decrypt
1530 lea 16($inp), $inp
1531 lea 16($out), $out
1532 dec $len
1533 jnz .Lecb_dec_short
1534
1535.Lecb_dec_done:
1536 lea (%rsp),%rax
1537 pxor %xmm0, %xmm0
1538.Lecb_dec_bzero: # wipe key schedule [if any]
1539 movdqa %xmm0, 0x00(%rax)
1540 movdqa %xmm0, 0x10(%rax)
1541 lea 0x20(%rax), %rax
1542 cmp %rax, %rbp
1543 jb .Lecb_dec_bzero
1544
1545 lea (%rbp),%rsp # restore %rsp
1546___
1547$code.=<<___ if ($win64);
1548 movaps 0x40(%rbp), %xmm6
1549 movaps 0x50(%rbp), %xmm7
1550 movaps 0x60(%rbp), %xmm8
1551 movaps 0x70(%rbp), %xmm9
1552 movaps 0x80(%rbp), %xmm10
1553 movaps 0x90(%rbp), %xmm11
1554 movaps 0xa0(%rbp), %xmm12
1555 movaps 0xb0(%rbp), %xmm13
1556 movaps 0xc0(%rbp), %xmm14
1557 movaps 0xd0(%rbp), %xmm15
1558 lea 0xa0(%rbp), %rsp
1559___
1560$code.=<<___;
1561 mov 0x48(%rsp), %r15
1562 mov 0x50(%rsp), %r14
1563 mov 0x58(%rsp), %r13
1564 mov 0x60(%rsp), %r12
1565 mov 0x68(%rsp), %rbx
1566 mov 0x70(%rsp), %rax
1567 lea 0x78(%rsp), %rsp
1568 mov %rax, %rbp
1569.Lecb_dec_epilogue:
1570 ret
1571.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1572___
1573}
1574$code.=<<___;
1575.extern asm_AES_cbc_encrypt
1576.globl bsaes_cbc_encrypt
1577.type bsaes_cbc_encrypt,\@abi-omnipotent
1578.align 16
1579bsaes_cbc_encrypt:
1580 _CET_ENDBR
1581___
1582$code.=<<___ if ($win64);
1583 mov 48(%rsp),$arg6 # pull direction flag
1584___
1585$code.=<<___;
1586 cmp \$0,$arg6
1587 jne asm_AES_cbc_encrypt
1588 cmp \$128,$arg3
1589 jb asm_AES_cbc_encrypt
1590
1591 mov %rsp, %rax
1592.Lcbc_dec_prologue:
1593 push %rbp
1594 push %rbx
1595 push %r12
1596 push %r13
1597 push %r14
1598 push %r15
1599 lea -0x48(%rsp), %rsp
1600___
1601$code.=<<___ if ($win64);
1602 mov 0xa0(%rsp),$arg5 # pull ivp
1603 lea -0xa0(%rsp), %rsp
1604 movaps %xmm6, 0x40(%rsp)
1605 movaps %xmm7, 0x50(%rsp)
1606 movaps %xmm8, 0x60(%rsp)
1607 movaps %xmm9, 0x70(%rsp)
1608 movaps %xmm10, 0x80(%rsp)
1609 movaps %xmm11, 0x90(%rsp)
1610 movaps %xmm12, 0xa0(%rsp)
1611 movaps %xmm13, 0xb0(%rsp)
1612 movaps %xmm14, 0xc0(%rsp)
1613 movaps %xmm15, 0xd0(%rsp)
1614.Lcbc_dec_body:
1615___
1616$code.=<<___;
1617 mov %rsp, %rbp # backup %rsp
1618 mov 240($arg4), %eax # rounds
1619 mov $arg1, $inp # backup arguments
1620 mov $arg2, $out
1621 mov $arg3, $len
1622 mov $arg4, $key
1623 mov $arg5, %rbx
1624 shr \$4, $len # bytes to blocks
1625
1626 mov %eax, %edx # rounds
1627 shl \$7, %rax # 128 bytes per inner round key
1628 sub \$`128-32`, %rax # size of bit-sliced key schedule
1629 sub %rax, %rsp
1630
1631 mov %rsp, %rax # pass key schedule
1632 mov $key, %rcx # pass key
1633 mov %edx, %r10d # pass rounds
1634 call _bsaes_key_convert
1635 pxor (%rsp),%xmm7 # fix up round 0 key
1636 movdqa %xmm6,(%rax) # save last round key
1637 movdqa %xmm7,(%rsp)
1638
1639 movdqu (%rbx), @XMM[15] # load IV
1640 sub \$8,$len
1641.Lcbc_dec_loop:
1642 movdqu 0x00($inp), @XMM[0] # load input
1643 movdqu 0x10($inp), @XMM[1]
1644 movdqu 0x20($inp), @XMM[2]
1645 movdqu 0x30($inp), @XMM[3]
1646 movdqu 0x40($inp), @XMM[4]
1647 movdqu 0x50($inp), @XMM[5]
1648 mov %rsp, %rax # pass key schedule
1649 movdqu 0x60($inp), @XMM[6]
1650 mov %edx,%r10d # pass rounds
1651 movdqu 0x70($inp), @XMM[7]
1652 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1653
1654 call _bsaes_decrypt8
1655
1656 pxor 0x20(%rbp), @XMM[0] # ^= IV
1657 movdqu 0x00($inp), @XMM[8] # re-load input
1658 movdqu 0x10($inp), @XMM[9]
1659 pxor @XMM[8], @XMM[1]
1660 movdqu 0x20($inp), @XMM[10]
1661 pxor @XMM[9], @XMM[6]
1662 movdqu 0x30($inp), @XMM[11]
1663 pxor @XMM[10], @XMM[4]
1664 movdqu 0x40($inp), @XMM[12]
1665 pxor @XMM[11], @XMM[2]
1666 movdqu 0x50($inp), @XMM[13]
1667 pxor @XMM[12], @XMM[7]
1668 movdqu 0x60($inp), @XMM[14]
1669 pxor @XMM[13], @XMM[3]
1670 movdqu 0x70($inp), @XMM[15] # IV
1671 pxor @XMM[14], @XMM[5]
1672 movdqu @XMM[0], 0x00($out) # write output
1673 lea 0x80($inp), $inp
1674 movdqu @XMM[1], 0x10($out)
1675 movdqu @XMM[6], 0x20($out)
1676 movdqu @XMM[4], 0x30($out)
1677 movdqu @XMM[2], 0x40($out)
1678 movdqu @XMM[7], 0x50($out)
1679 movdqu @XMM[3], 0x60($out)
1680 movdqu @XMM[5], 0x70($out)
1681 lea 0x80($out), $out
1682 sub \$8,$len
1683 jnc .Lcbc_dec_loop
1684
1685 add \$8,$len
1686 jz .Lcbc_dec_done
1687
1688 movdqu 0x00($inp), @XMM[0] # load input
1689 mov %rsp, %rax # pass key schedule
1690 mov %edx, %r10d # pass rounds
1691 cmp \$2,$len
1692 jb .Lcbc_dec_one
1693 movdqu 0x10($inp), @XMM[1]
1694 je .Lcbc_dec_two
1695 movdqu 0x20($inp), @XMM[2]
1696 cmp \$4,$len
1697 jb .Lcbc_dec_three
1698 movdqu 0x30($inp), @XMM[3]
1699 je .Lcbc_dec_four
1700 movdqu 0x40($inp), @XMM[4]
1701 cmp \$6,$len
1702 jb .Lcbc_dec_five
1703 movdqu 0x50($inp), @XMM[5]
1704 je .Lcbc_dec_six
1705 movdqu 0x60($inp), @XMM[6]
1706 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1707 call _bsaes_decrypt8
1708 pxor 0x20(%rbp), @XMM[0] # ^= IV
1709 movdqu 0x00($inp), @XMM[8] # re-load input
1710 movdqu 0x10($inp), @XMM[9]
1711 pxor @XMM[8], @XMM[1]
1712 movdqu 0x20($inp), @XMM[10]
1713 pxor @XMM[9], @XMM[6]
1714 movdqu 0x30($inp), @XMM[11]
1715 pxor @XMM[10], @XMM[4]
1716 movdqu 0x40($inp), @XMM[12]
1717 pxor @XMM[11], @XMM[2]
1718 movdqu 0x50($inp), @XMM[13]
1719 pxor @XMM[12], @XMM[7]
1720 movdqu 0x60($inp), @XMM[15] # IV
1721 pxor @XMM[13], @XMM[3]
1722 movdqu @XMM[0], 0x00($out) # write output
1723 movdqu @XMM[1], 0x10($out)
1724 movdqu @XMM[6], 0x20($out)
1725 movdqu @XMM[4], 0x30($out)
1726 movdqu @XMM[2], 0x40($out)
1727 movdqu @XMM[7], 0x50($out)
1728 movdqu @XMM[3], 0x60($out)
1729 jmp .Lcbc_dec_done
1730.align 16
1731.Lcbc_dec_six:
1732 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1733 call _bsaes_decrypt8
1734 pxor 0x20(%rbp), @XMM[0] # ^= IV
1735 movdqu 0x00($inp), @XMM[8] # re-load input
1736 movdqu 0x10($inp), @XMM[9]
1737 pxor @XMM[8], @XMM[1]
1738 movdqu 0x20($inp), @XMM[10]
1739 pxor @XMM[9], @XMM[6]
1740 movdqu 0x30($inp), @XMM[11]
1741 pxor @XMM[10], @XMM[4]
1742 movdqu 0x40($inp), @XMM[12]
1743 pxor @XMM[11], @XMM[2]
1744 movdqu 0x50($inp), @XMM[15] # IV
1745 pxor @XMM[12], @XMM[7]
1746 movdqu @XMM[0], 0x00($out) # write output
1747 movdqu @XMM[1], 0x10($out)
1748 movdqu @XMM[6], 0x20($out)
1749 movdqu @XMM[4], 0x30($out)
1750 movdqu @XMM[2], 0x40($out)
1751 movdqu @XMM[7], 0x50($out)
1752 jmp .Lcbc_dec_done
1753.align 16
1754.Lcbc_dec_five:
1755 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1756 call _bsaes_decrypt8
1757 pxor 0x20(%rbp), @XMM[0] # ^= IV
1758 movdqu 0x00($inp), @XMM[8] # re-load input
1759 movdqu 0x10($inp), @XMM[9]
1760 pxor @XMM[8], @XMM[1]
1761 movdqu 0x20($inp), @XMM[10]
1762 pxor @XMM[9], @XMM[6]
1763 movdqu 0x30($inp), @XMM[11]
1764 pxor @XMM[10], @XMM[4]
1765 movdqu 0x40($inp), @XMM[15] # IV
1766 pxor @XMM[11], @XMM[2]
1767 movdqu @XMM[0], 0x00($out) # write output
1768 movdqu @XMM[1], 0x10($out)
1769 movdqu @XMM[6], 0x20($out)
1770 movdqu @XMM[4], 0x30($out)
1771 movdqu @XMM[2], 0x40($out)
1772 jmp .Lcbc_dec_done
1773.align 16
1774.Lcbc_dec_four:
1775 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1776 call _bsaes_decrypt8
1777 pxor 0x20(%rbp), @XMM[0] # ^= IV
1778 movdqu 0x00($inp), @XMM[8] # re-load input
1779 movdqu 0x10($inp), @XMM[9]
1780 pxor @XMM[8], @XMM[1]
1781 movdqu 0x20($inp), @XMM[10]
1782 pxor @XMM[9], @XMM[6]
1783 movdqu 0x30($inp), @XMM[15] # IV
1784 pxor @XMM[10], @XMM[4]
1785 movdqu @XMM[0], 0x00($out) # write output
1786 movdqu @XMM[1], 0x10($out)
1787 movdqu @XMM[6], 0x20($out)
1788 movdqu @XMM[4], 0x30($out)
1789 jmp .Lcbc_dec_done
1790.align 16
1791.Lcbc_dec_three:
1792 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1793 call _bsaes_decrypt8
1794 pxor 0x20(%rbp), @XMM[0] # ^= IV
1795 movdqu 0x00($inp), @XMM[8] # re-load input
1796 movdqu 0x10($inp), @XMM[9]
1797 pxor @XMM[8], @XMM[1]
1798 movdqu 0x20($inp), @XMM[15] # IV
1799 pxor @XMM[9], @XMM[6]
1800 movdqu @XMM[0], 0x00($out) # write output
1801 movdqu @XMM[1], 0x10($out)
1802 movdqu @XMM[6], 0x20($out)
1803 jmp .Lcbc_dec_done
1804.align 16
1805.Lcbc_dec_two:
1806 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1807 call _bsaes_decrypt8
1808 pxor 0x20(%rbp), @XMM[0] # ^= IV
1809 movdqu 0x00($inp), @XMM[8] # re-load input
1810 movdqu 0x10($inp), @XMM[15] # IV
1811 pxor @XMM[8], @XMM[1]
1812 movdqu @XMM[0], 0x00($out) # write output
1813 movdqu @XMM[1], 0x10($out)
1814 jmp .Lcbc_dec_done
1815.align 16
1816.Lcbc_dec_one:
1817 lea ($inp), $arg1
1818 lea 0x20(%rbp), $arg2 # buffer output
1819 lea ($key), $arg3
1820 call asm_AES_decrypt # doesn't touch %xmm
1821 pxor 0x20(%rbp), @XMM[15] # ^= IV
1822 movdqu @XMM[15], ($out) # write output
1823 movdqa @XMM[0], @XMM[15] # IV
1824
1825.Lcbc_dec_done:
1826 movdqu @XMM[15], (%rbx) # return IV
1827 lea (%rsp), %rax
1828 pxor %xmm0, %xmm0
1829.Lcbc_dec_bzero: # wipe key schedule [if any]
1830 movdqa %xmm0, 0x00(%rax)
1831 movdqa %xmm0, 0x10(%rax)
1832 lea 0x20(%rax), %rax
1833 cmp %rax, %rbp
1834 ja .Lcbc_dec_bzero
1835
1836 lea (%rbp),%rsp # restore %rsp
1837___
1838$code.=<<___ if ($win64);
1839 movaps 0x40(%rbp), %xmm6
1840 movaps 0x50(%rbp), %xmm7
1841 movaps 0x60(%rbp), %xmm8
1842 movaps 0x70(%rbp), %xmm9
1843 movaps 0x80(%rbp), %xmm10
1844 movaps 0x90(%rbp), %xmm11
1845 movaps 0xa0(%rbp), %xmm12
1846 movaps 0xb0(%rbp), %xmm13
1847 movaps 0xc0(%rbp), %xmm14
1848 movaps 0xd0(%rbp), %xmm15
1849 lea 0xa0(%rbp), %rsp
1850___
1851$code.=<<___;
1852 mov 0x48(%rsp), %r15
1853 mov 0x50(%rsp), %r14
1854 mov 0x58(%rsp), %r13
1855 mov 0x60(%rsp), %r12
1856 mov 0x68(%rsp), %rbx
1857 mov 0x70(%rsp), %rax
1858 lea 0x78(%rsp), %rsp
1859 mov %rax, %rbp
1860.Lcbc_dec_epilogue:
1861 ret
1862.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1863
1864.globl bsaes_ctr32_encrypt_blocks
1865.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1866.align 16
1867bsaes_ctr32_encrypt_blocks:
1868 _CET_ENDBR
1869 mov %rsp, %rax
1870.Lctr_enc_prologue:
1871 push %rbp
1872 push %rbx
1873 push %r12
1874 push %r13
1875 push %r14
1876 push %r15
1877 lea -0x48(%rsp), %rsp
1878___
1879$code.=<<___ if ($win64);
1880 mov 0xa0(%rsp),$arg5 # pull ivp
1881 lea -0xa0(%rsp), %rsp
1882 movaps %xmm6, 0x40(%rsp)
1883 movaps %xmm7, 0x50(%rsp)
1884 movaps %xmm8, 0x60(%rsp)
1885 movaps %xmm9, 0x70(%rsp)
1886 movaps %xmm10, 0x80(%rsp)
1887 movaps %xmm11, 0x90(%rsp)
1888 movaps %xmm12, 0xa0(%rsp)
1889 movaps %xmm13, 0xb0(%rsp)
1890 movaps %xmm14, 0xc0(%rsp)
1891 movaps %xmm15, 0xd0(%rsp)
1892.Lctr_enc_body:
1893___
1894$code.=<<___;
1895 mov %rsp, %rbp # backup %rsp
1896 movdqu ($arg5), %xmm0 # load counter
1897 mov 240($arg4), %eax # rounds
1898 mov $arg1, $inp # backup arguments
1899 mov $arg2, $out
1900 mov $arg3, $len
1901 mov $arg4, $key
1902 movdqa %xmm0, 0x20(%rbp) # copy counter
1903 cmp \$8, $arg3
1904 jb .Lctr_enc_short
1905
1906 mov %eax, %ebx # rounds
1907 shl \$7, %rax # 128 bytes per inner round key
1908 sub \$`128-32`, %rax # size of bit-sliced key schedule
1909 sub %rax, %rsp
1910
1911 mov %rsp, %rax # pass key schedule
1912 mov $key, %rcx # pass key
1913 mov %ebx, %r10d # pass rounds
1914 call _bsaes_key_convert
1915 pxor %xmm6,%xmm7 # fix up last round key
1916 movdqa %xmm7,(%rax) # save last round key
1917
1918 movdqa (%rsp), @XMM[9] # load round0 key
1919 lea .LADD1(%rip), %r11
1920 movdqa 0x20(%rbp), @XMM[0] # counter copy
1921 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1922 pshufb @XMM[8], @XMM[9] # byte swap upper part
1923 pshufb @XMM[8], @XMM[0]
1924 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1925 jmp .Lctr_enc_loop
1926.align 16
1927.Lctr_enc_loop:
1928 movdqa @XMM[0], 0x20(%rbp) # save counter
1929 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1930 movdqa @XMM[0], @XMM[2]
1931 paddd 0x00(%r11), @XMM[1] # .LADD1
1932 movdqa @XMM[0], @XMM[3]
1933 paddd 0x10(%r11), @XMM[2] # .LADD2
1934 movdqa @XMM[0], @XMM[4]
1935 paddd 0x20(%r11), @XMM[3] # .LADD3
1936 movdqa @XMM[0], @XMM[5]
1937 paddd 0x30(%r11), @XMM[4] # .LADD4
1938 movdqa @XMM[0], @XMM[6]
1939 paddd 0x40(%r11), @XMM[5] # .LADD5
1940 movdqa @XMM[0], @XMM[7]
1941 paddd 0x50(%r11), @XMM[6] # .LADD6
1942 paddd 0x60(%r11), @XMM[7] # .LADD7
1943
1944 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1945 # to flip byte order in 32-bit counter
1946 movdqa (%rsp), @XMM[9] # round 0 key
1947 lea 0x10(%rsp), %rax # pass key schedule
1948 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1949 pxor @XMM[9], @XMM[0] # xor with round0 key
1950 pxor @XMM[9], @XMM[1]
1951 pshufb @XMM[8], @XMM[0]
1952 pxor @XMM[9], @XMM[2]
1953 pshufb @XMM[8], @XMM[1]
1954 pxor @XMM[9], @XMM[3]
1955 pshufb @XMM[8], @XMM[2]
1956 pxor @XMM[9], @XMM[4]
1957 pshufb @XMM[8], @XMM[3]
1958 pxor @XMM[9], @XMM[5]
1959 pshufb @XMM[8], @XMM[4]
1960 pxor @XMM[9], @XMM[6]
1961 pshufb @XMM[8], @XMM[5]
1962 pxor @XMM[9], @XMM[7]
1963 pshufb @XMM[8], @XMM[6]
1964 lea .LBS0(%rip), %r11 # constants table
1965 pshufb @XMM[8], @XMM[7]
1966 mov %ebx,%r10d # pass rounds
1967
1968 call _bsaes_encrypt8_bitslice
1969
1970 sub \$8,$len
1971 jc .Lctr_enc_loop_done
1972
1973 movdqu 0x00($inp), @XMM[8] # load input
1974 movdqu 0x10($inp), @XMM[9]
1975 movdqu 0x20($inp), @XMM[10]
1976 movdqu 0x30($inp), @XMM[11]
1977 movdqu 0x40($inp), @XMM[12]
1978 movdqu 0x50($inp), @XMM[13]
1979 movdqu 0x60($inp), @XMM[14]
1980 movdqu 0x70($inp), @XMM[15]
1981 lea 0x80($inp),$inp
1982 pxor @XMM[0], @XMM[8]
1983 movdqa 0x20(%rbp), @XMM[0] # load counter
1984 pxor @XMM[9], @XMM[1]
1985 movdqu @XMM[8], 0x00($out) # write output
1986 pxor @XMM[10], @XMM[4]
1987 movdqu @XMM[1], 0x10($out)
1988 pxor @XMM[11], @XMM[6]
1989 movdqu @XMM[4], 0x20($out)
1990 pxor @XMM[12], @XMM[3]
1991 movdqu @XMM[6], 0x30($out)
1992 pxor @XMM[13], @XMM[7]
1993 movdqu @XMM[3], 0x40($out)
1994 pxor @XMM[14], @XMM[2]
1995 movdqu @XMM[7], 0x50($out)
1996 pxor @XMM[15], @XMM[5]
1997 movdqu @XMM[2], 0x60($out)
1998 lea .LADD1(%rip), %r11
1999 movdqu @XMM[5], 0x70($out)
2000 lea 0x80($out), $out
2001 paddd 0x70(%r11), @XMM[0] # .LADD8
2002 jnz .Lctr_enc_loop
2003
2004 jmp .Lctr_enc_done
2005.align 16
2006.Lctr_enc_loop_done:
2007 add \$8, $len
2008 movdqu 0x00($inp), @XMM[8] # load input
2009 pxor @XMM[8], @XMM[0]
2010 movdqu @XMM[0], 0x00($out) # write output
2011 cmp \$2,$len
2012 jb .Lctr_enc_done
2013 movdqu 0x10($inp), @XMM[9]
2014 pxor @XMM[9], @XMM[1]
2015 movdqu @XMM[1], 0x10($out)
2016 je .Lctr_enc_done
2017 movdqu 0x20($inp), @XMM[10]
2018 pxor @XMM[10], @XMM[4]
2019 movdqu @XMM[4], 0x20($out)
2020 cmp \$4,$len
2021 jb .Lctr_enc_done
2022 movdqu 0x30($inp), @XMM[11]
2023 pxor @XMM[11], @XMM[6]
2024 movdqu @XMM[6], 0x30($out)
2025 je .Lctr_enc_done
2026 movdqu 0x40($inp), @XMM[12]
2027 pxor @XMM[12], @XMM[3]
2028 movdqu @XMM[3], 0x40($out)
2029 cmp \$6,$len
2030 jb .Lctr_enc_done
2031 movdqu 0x50($inp), @XMM[13]
2032 pxor @XMM[13], @XMM[7]
2033 movdqu @XMM[7], 0x50($out)
2034 je .Lctr_enc_done
2035 movdqu 0x60($inp), @XMM[14]
2036 pxor @XMM[14], @XMM[2]
2037 movdqu @XMM[2], 0x60($out)
2038 jmp .Lctr_enc_done
2039
2040.align 16
2041.Lctr_enc_short:
2042 lea 0x20(%rbp), $arg1
2043 lea 0x30(%rbp), $arg2
2044 lea ($key), $arg3
2045 call asm_AES_encrypt
2046 movdqu ($inp), @XMM[1]
2047 lea 16($inp), $inp
2048 mov 0x2c(%rbp), %eax # load 32-bit counter
2049 bswap %eax
2050 pxor 0x30(%rbp), @XMM[1]
2051 inc %eax # increment
2052 movdqu @XMM[1], ($out)
2053 bswap %eax
2054 lea 16($out), $out
2055 mov %eax, 0x2c(%rsp) # save 32-bit counter
2056 dec $len
2057 jnz .Lctr_enc_short
2058
2059.Lctr_enc_done:
2060 lea (%rsp), %rax
2061 pxor %xmm0, %xmm0
2062.Lctr_enc_bzero: # wipe key schedule [if any]
2063 movdqa %xmm0, 0x00(%rax)
2064 movdqa %xmm0, 0x10(%rax)
2065 lea 0x20(%rax), %rax
2066 cmp %rax, %rbp
2067 ja .Lctr_enc_bzero
2068
2069 lea (%rbp),%rsp # restore %rsp
2070___
2071$code.=<<___ if ($win64);
2072 movaps 0x40(%rbp), %xmm6
2073 movaps 0x50(%rbp), %xmm7
2074 movaps 0x60(%rbp), %xmm8
2075 movaps 0x70(%rbp), %xmm9
2076 movaps 0x80(%rbp), %xmm10
2077 movaps 0x90(%rbp), %xmm11
2078 movaps 0xa0(%rbp), %xmm12
2079 movaps 0xb0(%rbp), %xmm13
2080 movaps 0xc0(%rbp), %xmm14
2081 movaps 0xd0(%rbp), %xmm15
2082 lea 0xa0(%rbp), %rsp
2083___
2084$code.=<<___;
2085 mov 0x48(%rsp), %r15
2086 mov 0x50(%rsp), %r14
2087 mov 0x58(%rsp), %r13
2088 mov 0x60(%rsp), %r12
2089 mov 0x68(%rsp), %rbx
2090 mov 0x70(%rsp), %rax
2091 lea 0x78(%rsp), %rsp
2092 mov %rax, %rbp
2093.Lctr_enc_epilogue:
2094 ret
2095.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2096___
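# An illustrative model of the short-input counter update in
# .Lctr_enc_short above (not part of the module; flip the guard to run
# it). The counter block keeps a big-endian 32-bit counter in its last
# four bytes; the code loads it, byte-swaps to host order, increments,
# swaps back and stores. The snippet does the same on a 16-byte string,
# including the wrap modulo 2^32 implied by the "ctr32" interface.
if (0) {
	my $ctr32_inc = sub {	# model of the bswap/inc/bswap sequence
		my $block = shift;	# 16-byte counter block
		my $n = unpack("N", substr($block, 12, 4));	# big-endian load
		substr($block, 12, 4) = pack("N", ($n + 1) & 0xffffffff);
		return $block;
	};
	my $iv = "\x00" x 12 . "\xff\xff\xff\xff";
	die unless $ctr32_inc->($iv) eq "\x00" x 16;	# wraps modulo 2^32
	print "ctr32 increment OK\n";
}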
2097######################################################################
2098# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2099# const AES_KEY *key1, const AES_KEY *key2,
2100# const unsigned char iv[16]);
2101#
2102my ($twmask,$twres,$twtmp)=@XMM[13..15];
2103$arg6=~s/d$//;
2104
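# An illustrative model of the tweak schedule used below (not part of
# the module; flip the guard to run it). Each XTS block is masked with
# tweak*alpha^i in GF(2^128); the pcmpgtd/pshufd/pand/paddq sequence
# below multiplies the tweak by alpha (i.e. by x), which on bytes means:
# shift the little-endian 128-bit value left by one and, if the top bit
# fell off, xor 0x87 into the low byte (what .Lxts_magic encodes).
if (0) {
	my $xts_double = sub {	# multiply tweak by alpha in GF(2^128)
		my @b = unpack("C16", shift);	# little-endian 128-bit value
		my $carry = ($b[15] >> 7) & 1;
		for (my $i = 15; $i > 0; $i--) {
			$b[$i] = (($b[$i] << 1) | ($b[$i-1] >> 7)) & 0xff;
		}
		$b[0] = (($b[0] << 1) & 0xff) ^ ($carry ? 0x87 : 0);
		return pack("C16", @b);
	};
	my $t = "\x80" . "\x00" x 14 . "\x80";	# bits set in both halves
	die unless unpack("H*", $xts_double->($t)) eq "8701" . "00" x 14;
	print "xts tweak doubling OK\n";
}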
2105$code.=<<___;
2106.globl bsaes_xts_encrypt
2107.type bsaes_xts_encrypt,\@abi-omnipotent
2108.align 16
2109bsaes_xts_encrypt:
2110 _CET_ENDBR
2111 mov %rsp, %rax
2112.Lxts_enc_prologue:
2113 push %rbp
2114 push %rbx
2115 push %r12
2116 push %r13
2117 push %r14
2118 push %r15
2119 lea -0x48(%rsp), %rsp
2120___
2121$code.=<<___ if ($win64);
2122 mov 0xa0(%rsp),$arg5 # pull key2
2123 mov 0xa8(%rsp),$arg6 # pull ivp
2124 lea -0xa0(%rsp), %rsp
2125 movaps %xmm6, 0x40(%rsp)
2126 movaps %xmm7, 0x50(%rsp)
2127 movaps %xmm8, 0x60(%rsp)
2128 movaps %xmm9, 0x70(%rsp)
2129 movaps %xmm10, 0x80(%rsp)
2130 movaps %xmm11, 0x90(%rsp)
2131 movaps %xmm12, 0xa0(%rsp)
2132 movaps %xmm13, 0xb0(%rsp)
2133 movaps %xmm14, 0xc0(%rsp)
2134 movaps %xmm15, 0xd0(%rsp)
2135.Lxts_enc_body:
2136___
2137$code.=<<___;
2138 mov %rsp, %rbp # backup %rsp
2139 mov $arg1, $inp # backup arguments
2140 mov $arg2, $out
2141 mov $arg3, $len
2142 mov $arg4, $key
2143
2144 lea ($arg6), $arg1
2145 lea 0x20(%rbp), $arg2
2146 lea ($arg5), $arg3
2147 call asm_AES_encrypt # generate initial tweak
2148
2149 mov 240($key), %eax # rounds
2150 mov $len, %rbx # backup $len
2151
2152 mov %eax, %edx # rounds
2153 shl \$7, %rax # 128 bytes per inner round key
2154 sub \$`128-32`, %rax # size of bit-sliced key schedule
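					# i.e. (rounds-1)*128+2*16 bytes: inner
					# round keys are bit-sliced, first and
					# last are kept in regular form
					# (1184 bytes for 10 rounds)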
2155 sub %rax, %rsp
2156
2157 mov %rsp, %rax # pass key schedule
2158 mov $key, %rcx # pass key
2159 mov %edx, %r10d # pass rounds
2160 call _bsaes_key_convert
2161 pxor %xmm6, %xmm7 # fix up last round key
2162 movdqa %xmm7, (%rax) # save last round key
2163
2164 and \$-16, $len
2165 sub \$0x80, %rsp # place for tweak[8]
2166 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2167
2168 pxor $twtmp, $twtmp
2169 movdqa .Lxts_magic(%rip), $twmask
2170 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2171
2172 sub \$0x80, $len
2173 jc .Lxts_enc_short
2174 jmp .Lxts_enc_loop
2175
2176.align 16
2177.Lxts_enc_loop:
2178___
2179 for ($i=0;$i<7;$i++) {
2180 $code.=<<___;
2181 pshufd \$0x13, $twtmp, $twres
2182 pxor $twtmp, $twtmp
2183 movdqa @XMM[7], @XMM[$i]
2184 movdqa @XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
2185 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2186 pand $twmask, $twres # isolate carry and residue
2187 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2188 pxor $twres, @XMM[7]
2189___
2190 $code.=<<___ if ($i>=1);
2191 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2192___
2193 $code.=<<___ if ($i>=2);
2194 pxor @XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
2195___
2196 }
2197$code.=<<___;
2198 movdqu 0x60($inp), @XMM[8+6]
2199 pxor @XMM[8+5], @XMM[5]
2200 movdqu 0x70($inp), @XMM[8+7]
2201 lea 0x80($inp), $inp
2202 movdqa @XMM[7], 0x70(%rsp)
2203 pxor @XMM[8+6], @XMM[6]
2204 lea 0x80(%rsp), %rax # pass key schedule
2205 pxor @XMM[8+7], @XMM[7]
2206 mov %edx, %r10d # pass rounds
2207
2208 call _bsaes_encrypt8
2209
2210 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2211 pxor 0x10(%rsp), @XMM[1]
2212 movdqu @XMM[0], 0x00($out) # write output
2213 pxor 0x20(%rsp), @XMM[4]
2214 movdqu @XMM[1], 0x10($out)
2215 pxor 0x30(%rsp), @XMM[6]
2216 movdqu @XMM[4], 0x20($out)
2217 pxor 0x40(%rsp), @XMM[3]
2218 movdqu @XMM[6], 0x30($out)
2219 pxor 0x50(%rsp), @XMM[7]
2220 movdqu @XMM[3], 0x40($out)
2221 pxor 0x60(%rsp), @XMM[2]
2222 movdqu @XMM[7], 0x50($out)
2223 pxor 0x70(%rsp), @XMM[5]
2224 movdqu @XMM[2], 0x60($out)
2225 movdqu @XMM[5], 0x70($out)
2226 lea 0x80($out), $out
2227
2228 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2229 pxor $twtmp, $twtmp
2230 movdqa .Lxts_magic(%rip), $twmask
2231 pcmpgtd @XMM[7], $twtmp
2232 pshufd \$0x13, $twtmp, $twres
2233 pxor $twtmp, $twtmp
2234 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2235 pand $twmask, $twres # isolate carry and residue
2236 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2237 pxor $twres, @XMM[7]
2238
2239 sub \$0x80,$len
2240 jnc .Lxts_enc_loop
2241
2242.Lxts_enc_short:
2243 add \$0x80, $len
2244 jz .Lxts_enc_done
2245___
2246 for ($i=0;$i<7;$i++) {
2247 $code.=<<___;
2248 pshufd \$0x13, $twtmp, $twres
2249 pxor $twtmp, $twtmp
2250 movdqa @XMM[7], @XMM[$i]
2251 movdqa @XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
2252 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2253 pand $twmask, $twres # isolate carry and residue
2254 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2255 pxor $twres, @XMM[7]
2256___
2257 $code.=<<___ if ($i>=1);
2258 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2259 cmp \$`0x10*$i`,$len
2260 je .Lxts_enc_$i
2261___
2262 $code.=<<___ if ($i>=2);
2263 pxor @XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
2264___
2265 }
2266$code.=<<___;
2267 movdqu 0x60($inp), @XMM[8+6]
2268 pxor @XMM[8+5], @XMM[5]
2269 movdqa @XMM[7], 0x70(%rsp)
2270 lea 0x70($inp), $inp
2271 pxor @XMM[8+6], @XMM[6]
2272 lea 0x80(%rsp), %rax # pass key schedule
2273 mov %edx, %r10d # pass rounds
2274
2275 call _bsaes_encrypt8
2276
2277 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2278 pxor 0x10(%rsp), @XMM[1]
2279 movdqu @XMM[0], 0x00($out) # write output
2280 pxor 0x20(%rsp), @XMM[4]
2281 movdqu @XMM[1], 0x10($out)
2282 pxor 0x30(%rsp), @XMM[6]
2283 movdqu @XMM[4], 0x20($out)
2284 pxor 0x40(%rsp), @XMM[3]
2285 movdqu @XMM[6], 0x30($out)
2286 pxor 0x50(%rsp), @XMM[7]
2287 movdqu @XMM[3], 0x40($out)
2288 pxor 0x60(%rsp), @XMM[2]
2289 movdqu @XMM[7], 0x50($out)
2290 movdqu @XMM[2], 0x60($out)
2291 lea 0x70($out), $out
2292
2293 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2294 jmp .Lxts_enc_done
2295.align 16
2296.Lxts_enc_6:
2297 pxor @XMM[8+4], @XMM[4]
2298 lea 0x60($inp), $inp
2299 pxor @XMM[8+5], @XMM[5]
2300 lea 0x80(%rsp), %rax # pass key schedule
2301 mov %edx, %r10d # pass rounds
2302
2303 call _bsaes_encrypt8
2304
2305 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2306 pxor 0x10(%rsp), @XMM[1]
2307 movdqu @XMM[0], 0x00($out) # write output
2308 pxor 0x20(%rsp), @XMM[4]
2309 movdqu @XMM[1], 0x10($out)
2310 pxor 0x30(%rsp), @XMM[6]
2311 movdqu @XMM[4], 0x20($out)
2312 pxor 0x40(%rsp), @XMM[3]
2313 movdqu @XMM[6], 0x30($out)
2314 pxor 0x50(%rsp), @XMM[7]
2315 movdqu @XMM[3], 0x40($out)
2316 movdqu @XMM[7], 0x50($out)
2317 lea 0x60($out), $out
2318
2319 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2320 jmp .Lxts_enc_done
2321.align 16
2322.Lxts_enc_5:
2323 pxor @XMM[8+3], @XMM[3]
2324 lea 0x50($inp), $inp
2325 pxor @XMM[8+4], @XMM[4]
2326 lea 0x80(%rsp), %rax # pass key schedule
2327 mov %edx, %r10d # pass rounds
2328
2329 call _bsaes_encrypt8
2330
2331 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2332 pxor 0x10(%rsp), @XMM[1]
2333 movdqu @XMM[0], 0x00($out) # write output
2334 pxor 0x20(%rsp), @XMM[4]
2335 movdqu @XMM[1], 0x10($out)
2336 pxor 0x30(%rsp), @XMM[6]
2337 movdqu @XMM[4], 0x20($out)
2338 pxor 0x40(%rsp), @XMM[3]
2339 movdqu @XMM[6], 0x30($out)
2340 movdqu @XMM[3], 0x40($out)
2341 lea 0x50($out), $out
2342
2343 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2344 jmp .Lxts_enc_done
2345.align 16
2346.Lxts_enc_4:
2347 pxor @XMM[8+2], @XMM[2]
2348 lea 0x40($inp), $inp
2349 pxor @XMM[8+3], @XMM[3]
2350 lea 0x80(%rsp), %rax # pass key schedule
2351 mov %edx, %r10d # pass rounds
2352
2353 call _bsaes_encrypt8
2354
2355 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2356 pxor 0x10(%rsp), @XMM[1]
2357 movdqu @XMM[0], 0x00($out) # write output
2358 pxor 0x20(%rsp), @XMM[4]
2359 movdqu @XMM[1], 0x10($out)
2360 pxor 0x30(%rsp), @XMM[6]
2361 movdqu @XMM[4], 0x20($out)
2362 movdqu @XMM[6], 0x30($out)
2363 lea 0x40($out), $out
2364
2365 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2366 jmp .Lxts_enc_done
2367.align 16
2368.Lxts_enc_3:
2369 pxor @XMM[8+1], @XMM[1]
2370 lea 0x30($inp), $inp
2371 pxor @XMM[8+2], @XMM[2]
2372 lea 0x80(%rsp), %rax # pass key schedule
2373 mov %edx, %r10d # pass rounds
2374
2375 call _bsaes_encrypt8
2376
2377 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2378 pxor 0x10(%rsp), @XMM[1]
2379 movdqu @XMM[0], 0x00($out) # write output
2380 pxor 0x20(%rsp), @XMM[4]
2381 movdqu @XMM[1], 0x10($out)
2382 movdqu @XMM[4], 0x20($out)
2383 lea 0x30($out), $out
2384
2385 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2386 jmp .Lxts_enc_done
2387.align 16
2388.Lxts_enc_2:
2389 pxor @XMM[8+0], @XMM[0]
2390 lea 0x20($inp), $inp
2391 pxor @XMM[8+1], @XMM[1]
2392 lea 0x80(%rsp), %rax # pass key schedule
2393 mov %edx, %r10d # pass rounds
2394
2395 call _bsaes_encrypt8
2396
2397 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2398 pxor 0x10(%rsp), @XMM[1]
2399 movdqu @XMM[0], 0x00($out) # write output
2400 movdqu @XMM[1], 0x10($out)
2401 lea 0x20($out), $out
2402
2403 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2404 jmp .Lxts_enc_done
2405.align 16
2406.Lxts_enc_1:
2407 pxor @XMM[0], @XMM[8]
2408 lea 0x10($inp), $inp
2409 movdqa @XMM[8], 0x20(%rbp)
2410 lea 0x20(%rbp), $arg1
2411 lea 0x20(%rbp), $arg2
2412 lea ($key), $arg3
2413 call asm_AES_encrypt # doesn't touch %xmm
2414 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2415 #pxor @XMM[8], @XMM[0]
2416 #lea 0x80(%rsp), %rax # pass key schedule
2417 #mov %edx, %r10d # pass rounds
2418 #call _bsaes_encrypt8
2419 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2420 movdqu @XMM[0], 0x00($out) # write output
2421 lea 0x10($out), $out
2422
2423 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2424
2425.Lxts_enc_done:
2426 and \$15, %ebx # tail bytes remaining (ciphertext stealing)
2427 jz .Lxts_enc_ret
2428 mov $out, %rdx
2429
2430.Lxts_enc_steal:
2431 movzb ($inp), %eax
2432 movzb -16(%rdx), %ecx
2433 lea 1($inp), $inp
2434 mov %al, -16(%rdx)
2435 mov %cl, 0(%rdx)
2436 lea 1(%rdx), %rdx
2437 sub \$1,%ebx
2438 jnz .Lxts_enc_steal
2439
2440 movdqu -16($out), @XMM[0]
2441 lea 0x20(%rbp), $arg1
2442 pxor @XMM[7], @XMM[0]
2443 lea 0x20(%rbp), $arg2
2444 movdqa @XMM[0], 0x20(%rbp)
2445 lea ($key), $arg3
2446 call asm_AES_encrypt # doesn't touch %xmm
2447 pxor 0x20(%rbp), @XMM[7]
2448 movdqu @XMM[7], -16($out)
2449
2450.Lxts_enc_ret:
2451 lea (%rsp), %rax
2452 pxor %xmm0, %xmm0
2453.Lxts_enc_bzero: # wipe key schedule [if any]
2454 movdqa %xmm0, 0x00(%rax)
2455 movdqa %xmm0, 0x10(%rax)
2456 lea 0x20(%rax), %rax
2457 cmp %rax, %rbp
2458 ja .Lxts_enc_bzero
2459
2460 lea (%rbp),%rsp # restore %rsp
2461___
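
# .Lxts_enc_steal above implements standard XTS ciphertext stealing: the
# leading bytes of the last full ciphertext block become the short tail of
# the output, and the bytes they vacate pad the final partial plaintext
# block, which is then encrypted once more with the saved tweak.  A
# string-level reference model (illustrative only, not used by the
# generator; the helper name is made up):
sub _xts_enc_steal {		# (last full ciphertext block, tail plaintext)
	my ($ct, $tail) = @_;
	my $n = length($tail);			# 1..15 bytes
	my $tail_ct  = substr($ct, 0, $n);	# stolen ciphertext tail
	my $final_in = $tail . substr($ct, $n);	# fed through AES once more
	return ($final_in, $tail_ct);
}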
2462$code.=<<___ if ($win64);
2463 movaps 0x40(%rbp), %xmm6
2464 movaps 0x50(%rbp), %xmm7
2465 movaps 0x60(%rbp), %xmm8
2466 movaps 0x70(%rbp), %xmm9
2467 movaps 0x80(%rbp), %xmm10
2468 movaps 0x90(%rbp), %xmm11
2469 movaps 0xa0(%rbp), %xmm12
2470 movaps 0xb0(%rbp), %xmm13
2471 movaps 0xc0(%rbp), %xmm14
2472 movaps 0xd0(%rbp), %xmm15
2473 lea 0xa0(%rbp), %rsp
2474___
2475$code.=<<___;
2476 mov 0x48(%rsp), %r15
2477 mov 0x50(%rsp), %r14
2478 mov 0x58(%rsp), %r13
2479 mov 0x60(%rsp), %r12
2480 mov 0x68(%rsp), %rbx
2481 mov 0x70(%rsp), %rax
2482 lea 0x78(%rsp), %rsp
2483 mov %rax, %rbp
2484.Lxts_enc_epilogue:
2485 ret
2486.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2487
2488.globl bsaes_xts_decrypt
2489.type bsaes_xts_decrypt,\@abi-omnipotent
2490.align 16
2491bsaes_xts_decrypt:
2492 _CET_ENDBR
2493 mov %rsp, %rax
2494.Lxts_dec_prologue:
2495 push %rbp
2496 push %rbx
2497 push %r12
2498 push %r13
2499 push %r14
2500 push %r15
2501 lea -0x48(%rsp), %rsp
2502___
2503$code.=<<___ if ($win64);
2504 mov 0xa0(%rsp),$arg5 # pull key2
2505 mov 0xa8(%rsp),$arg6 # pull ivp
2506 lea -0xa0(%rsp), %rsp
2507 movaps %xmm6, 0x40(%rsp)
2508 movaps %xmm7, 0x50(%rsp)
2509 movaps %xmm8, 0x60(%rsp)
2510 movaps %xmm9, 0x70(%rsp)
2511 movaps %xmm10, 0x80(%rsp)
2512 movaps %xmm11, 0x90(%rsp)
2513 movaps %xmm12, 0xa0(%rsp)
2514 movaps %xmm13, 0xb0(%rsp)
2515 movaps %xmm14, 0xc0(%rsp)
2516 movaps %xmm15, 0xd0(%rsp)
2517.Lxts_dec_body:
2518___
2519$code.=<<___;
2520 mov %rsp, %rbp # backup %rsp
2521 mov $arg1, $inp # backup arguments
2522 mov $arg2, $out
2523 mov $arg3, $len
2524 mov $arg4, $key
2525
2526 lea ($arg6), $arg1
2527 lea 0x20(%rbp), $arg2
2528 lea ($arg5), $arg3
2529 call asm_AES_encrypt # generate initial tweak
2530
2531 mov 240($key), %eax # rounds
2532 mov $len, %rbx # backup $len
2533
2534 mov %eax, %edx # rounds
2535 shl \$7, %rax # 128 bytes per inner round key
2536 sub \$`128-32`, %rax # size of bit-sliced key schedule
2537 sub %rax, %rsp
2538
2539 mov %rsp, %rax # pass key schedule
2540 mov $key, %rcx # pass key
2541 mov %edx, %r10d # pass rounds
2542 call _bsaes_key_convert
2543 pxor (%rsp), %xmm7 # fix up round 0 key
2544 movdqa %xmm6, (%rax) # save last round key
2545 movdqa %xmm7, (%rsp)
2546
2547 xor %eax, %eax # if ($len%16) len-=16;
2548 and \$-16, $len
2549 test \$15, %ebx
2550 setnz %al
2551 shl \$4, %rax
2552 sub %rax, $len
2553
2554 sub \$0x80, %rsp # place for tweak[8]
2555 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2556
2557 pxor $twtmp, $twtmp
2558 movdqa .Lxts_magic(%rip), $twmask
2559 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2560
2561 sub \$0x80, $len
2562 jc .Lxts_dec_short
2563 jmp .Lxts_dec_loop
2564
2565.align 16
2566.Lxts_dec_loop:
2567___
2568 for ($i=0;$i<7;$i++) {
2569 $code.=<<___;
2570 pshufd \$0x13, $twtmp, $twres
2571 pxor $twtmp, $twtmp
2572 movdqa @XMM[7], @XMM[$i]
2573 movdqa @XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
2574 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2575 pand $twmask, $twres # isolate carry and residue
2576 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2577 pxor $twres, @XMM[7]
2578___
2579 $code.=<<___ if ($i>=1);
2580 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2581___
2582 $code.=<<___ if ($i>=2);
2583 pxor @XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
2584___
2585 }
2586$code.=<<___;
2587 movdqu 0x60($inp), @XMM[8+6]
2588 pxor @XMM[8+5], @XMM[5]
2589 movdqu 0x70($inp), @XMM[8+7]
2590 lea 0x80($inp), $inp
2591 movdqa @XMM[7], 0x70(%rsp)
2592 pxor @XMM[8+6], @XMM[6]
2593 lea 0x80(%rsp), %rax # pass key schedule
2594 pxor @XMM[8+7], @XMM[7]
2595 mov %edx, %r10d # pass rounds
2596
2597 call _bsaes_decrypt8
2598
2599 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2600 pxor 0x10(%rsp), @XMM[1]
2601 movdqu @XMM[0], 0x00($out) # write output
2602 pxor 0x20(%rsp), @XMM[6]
2603 movdqu @XMM[1], 0x10($out)
2604 pxor 0x30(%rsp), @XMM[4]
2605 movdqu @XMM[6], 0x20($out)
2606 pxor 0x40(%rsp), @XMM[2]
2607 movdqu @XMM[4], 0x30($out)
2608 pxor 0x50(%rsp), @XMM[7]
2609 movdqu @XMM[2], 0x40($out)
2610 pxor 0x60(%rsp), @XMM[3]
2611 movdqu @XMM[7], 0x50($out)
2612 pxor 0x70(%rsp), @XMM[5]
2613 movdqu @XMM[3], 0x60($out)
2614 movdqu @XMM[5], 0x70($out)
2615 lea 0x80($out), $out
2616
2617 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2618 pxor $twtmp, $twtmp
2619 movdqa .Lxts_magic(%rip), $twmask
2620 pcmpgtd @XMM[7], $twtmp
2621 pshufd \$0x13, $twtmp, $twres
2622 pxor $twtmp, $twtmp
2623 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2624 pand $twmask, $twres # isolate carry and residue
2625 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2626 pxor $twres, @XMM[7]
2627
2628 sub \$0x80,$len
2629 jnc .Lxts_dec_loop
2630
2631.Lxts_dec_short:
2632 add \$0x80, $len
2633 jz .Lxts_dec_done
2634___
2635 for ($i=0;$i<7;$i++) {
2636 $code.=<<___;
2637 pshufd \$0x13, $twtmp, $twres
2638 pxor $twtmp, $twtmp
2639 movdqa @XMM[7], @XMM[$i]
2640 movdqa @XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
2641 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2642 pand $twmask, $twres # isolate carry and residue
2643 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2644 pxor $twres, @XMM[7]
2645___
2646 $code.=<<___ if ($i>=1);
2647 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2648 cmp \$`0x10*$i`,$len
2649 je .Lxts_dec_$i
2650___
2651 $code.=<<___ if ($i>=2);
2652 pxor @XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
2653___
2654 }
2655$code.=<<___;
2656 movdqu 0x60($inp), @XMM[8+6]
2657 pxor @XMM[8+5], @XMM[5]
2658 movdqa @XMM[7], 0x70(%rsp)
2659 lea 0x70($inp), $inp
2660 pxor @XMM[8+6], @XMM[6]
2661 lea 0x80(%rsp), %rax # pass key schedule
2662 mov %edx, %r10d # pass rounds
2663
2664 call _bsaes_decrypt8
2665
2666 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2667 pxor 0x10(%rsp), @XMM[1]
2668 movdqu @XMM[0], 0x00($out) # write output
2669 pxor 0x20(%rsp), @XMM[6]
2670 movdqu @XMM[1], 0x10($out)
2671 pxor 0x30(%rsp), @XMM[4]
2672 movdqu @XMM[6], 0x20($out)
2673 pxor 0x40(%rsp), @XMM[2]
2674 movdqu @XMM[4], 0x30($out)
2675 pxor 0x50(%rsp), @XMM[7]
2676 movdqu @XMM[2], 0x40($out)
2677 pxor 0x60(%rsp), @XMM[3]
2678 movdqu @XMM[7], 0x50($out)
2679 movdqu @XMM[3], 0x60($out)
2680 lea 0x70($out), $out
2681
2682 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2683 jmp .Lxts_dec_done
2684.align 16
2685.Lxts_dec_6:
2686 pxor @XMM[8+4], @XMM[4]
2687 lea 0x60($inp), $inp
2688 pxor @XMM[8+5], @XMM[5]
2689 lea 0x80(%rsp), %rax # pass key schedule
2690 mov %edx, %r10d # pass rounds
2691
2692 call _bsaes_decrypt8
2693
2694 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2695 pxor 0x10(%rsp), @XMM[1]
2696 movdqu @XMM[0], 0x00($out) # write output
2697 pxor 0x20(%rsp), @XMM[6]
2698 movdqu @XMM[1], 0x10($out)
2699 pxor 0x30(%rsp), @XMM[4]
2700 movdqu @XMM[6], 0x20($out)
2701 pxor 0x40(%rsp), @XMM[2]
2702 movdqu @XMM[4], 0x30($out)
2703 pxor 0x50(%rsp), @XMM[7]
2704 movdqu @XMM[2], 0x40($out)
2705 movdqu @XMM[7], 0x50($out)
2706 lea 0x60($out), $out
2707
2708 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2709 jmp .Lxts_dec_done
2710.align 16
2711.Lxts_dec_5:
2712 pxor @XMM[8+3], @XMM[3]
2713 lea 0x50($inp), $inp
2714 pxor @XMM[8+4], @XMM[4]
2715 lea 0x80(%rsp), %rax # pass key schedule
2716 mov %edx, %r10d # pass rounds
2717
2718 call _bsaes_decrypt8
2719
2720 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2721 pxor 0x10(%rsp), @XMM[1]
2722 movdqu @XMM[0], 0x00($out) # write output
2723 pxor 0x20(%rsp), @XMM[6]
2724 movdqu @XMM[1], 0x10($out)
2725 pxor 0x30(%rsp), @XMM[4]
2726 movdqu @XMM[6], 0x20($out)
2727 pxor 0x40(%rsp), @XMM[2]
2728 movdqu @XMM[4], 0x30($out)
2729 movdqu @XMM[2], 0x40($out)
2730 lea 0x50($out), $out
2731
2732 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2733 jmp .Lxts_dec_done
2734.align 16
2735.Lxts_dec_4:
2736 pxor @XMM[8+2], @XMM[2]
2737 lea 0x40($inp), $inp
2738 pxor @XMM[8+3], @XMM[3]
2739 lea 0x80(%rsp), %rax # pass key schedule
2740 mov %edx, %r10d # pass rounds
2741
2742 call _bsaes_decrypt8
2743
2744 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2745 pxor 0x10(%rsp), @XMM[1]
2746 movdqu @XMM[0], 0x00($out) # write output
2747 pxor 0x20(%rsp), @XMM[6]
2748 movdqu @XMM[1], 0x10($out)
2749 pxor 0x30(%rsp), @XMM[4]
2750 movdqu @XMM[6], 0x20($out)
2751 movdqu @XMM[4], 0x30($out)
2752 lea 0x40($out), $out
2753
2754 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2755 jmp .Lxts_dec_done
2756.align 16
2757.Lxts_dec_3:
2758 pxor @XMM[8+1], @XMM[1]
2759 lea 0x30($inp), $inp
2760 pxor @XMM[8+2], @XMM[2]
2761 lea 0x80(%rsp), %rax # pass key schedule
2762 mov %edx, %r10d # pass rounds
2763
2764 call _bsaes_decrypt8
2765
2766 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2767 pxor 0x10(%rsp), @XMM[1]
2768 movdqu @XMM[0], 0x00($out) # write output
2769 pxor 0x20(%rsp), @XMM[6]
2770 movdqu @XMM[1], 0x10($out)
2771 movdqu @XMM[6], 0x20($out)
2772 lea 0x30($out), $out
2773
2774 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2775 jmp .Lxts_dec_done
2776.align 16
2777.Lxts_dec_2:
2778 pxor @XMM[8+0], @XMM[0]
2779 lea 0x20($inp), $inp
2780 pxor @XMM[8+1], @XMM[1]
2781 lea 0x80(%rsp), %rax # pass key schedule
2782 mov %edx, %r10d # pass rounds
2783
2784 call _bsaes_decrypt8
2785
2786 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2787 pxor 0x10(%rsp), @XMM[1]
2788 movdqu @XMM[0], 0x00($out) # write output
2789 movdqu @XMM[1], 0x10($out)
2790 lea 0x20($out), $out
2791
2792 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2793 jmp .Lxts_dec_done
2794.align 16
2795.Lxts_dec_1:
2796 pxor @XMM[0], @XMM[8]
2797 lea 0x10($inp), $inp
2798 movdqa @XMM[8], 0x20(%rbp)
2799 lea 0x20(%rbp), $arg1
2800 lea 0x20(%rbp), $arg2
2801 lea ($key), $arg3
2802 call asm_AES_decrypt # doesn't touch %xmm
2803 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2804 #pxor @XMM[8], @XMM[0]
2805 #lea 0x80(%rsp), %rax # pass key schedule
2806 #mov %edx, %r10d # pass rounds
2807 #call _bsaes_decrypt8
2808 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2809 movdqu @XMM[0], 0x00($out) # write output
2810 lea 0x10($out), $out
2811
2812 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2813
2814.Lxts_dec_done:
2815 and \$15, %ebx # tail bytes remaining (ciphertext stealing)
2816 jz .Lxts_dec_ret
2817
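					# ciphertext stealing, decrypt side:
					# the last full block is decrypted with
					# the tweak that follows it, the rebuilt
					# partial block with the current one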
2818 pxor $twtmp, $twtmp
2819 movdqa .Lxts_magic(%rip), $twmask
2820 pcmpgtd @XMM[7], $twtmp
2821 pshufd \$0x13, $twtmp, $twres
2822 movdqa @XMM[7], @XMM[6] # keep current tweak for the final block
2823 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2824 pand $twmask, $twres # isolate carry and residue
2825 movdqu ($inp), @XMM[0]
2826 pxor $twres, @XMM[7]
2827
2828 lea 0x20(%rbp), $arg1
2829 pxor @XMM[7], @XMM[0]
2830 lea 0x20(%rbp), $arg2
2831 movdqa @XMM[0], 0x20(%rbp)
2832 lea ($key), $arg3
2833 call asm_AES_decrypt # doesn't touch %xmm
2834 pxor 0x20(%rbp), @XMM[7]
2835 mov $out, %rdx
2836 movdqu @XMM[7], ($out)
2837
2838.Lxts_dec_steal:
2839 movzb 16($inp), %eax
2840 movzb (%rdx), %ecx
2841 lea 1($inp), $inp
2842 mov %al, (%rdx)
2843 mov %cl, 16(%rdx)
2844 lea 1(%rdx), %rdx
2845 sub \$1,%ebx
2846 jnz .Lxts_dec_steal
2847
2848 movdqu ($out), @XMM[0]
2849 lea 0x20(%rbp), $arg1
2850 pxor @XMM[6], @XMM[0]
2851 lea 0x20(%rbp), $arg2
2852 movdqa @XMM[0], 0x20(%rbp)
2853 lea ($key), $arg3
2854 call asm_AES_decrypt # doesn't touch %xmm
2855 pxor 0x20(%rbp), @XMM[6]
2856 movdqu @XMM[6], ($out)
2857
2858.Lxts_dec_ret:
2859 lea (%rsp), %rax
2860 pxor %xmm0, %xmm0
2861.Lxts_dec_bzero: # wipe key schedule [if any]
2862 movdqa %xmm0, 0x00(%rax)
2863 movdqa %xmm0, 0x10(%rax)
2864 lea 0x20(%rax), %rax
2865 cmp %rax, %rbp
2866 ja .Lxts_dec_bzero
2867
2868 lea (%rbp),%rsp # restore %rsp
2869___
2870$code.=<<___ if ($win64);
2871 movaps 0x40(%rbp), %xmm6
2872 movaps 0x50(%rbp), %xmm7
2873 movaps 0x60(%rbp), %xmm8
2874 movaps 0x70(%rbp), %xmm9
2875 movaps 0x80(%rbp), %xmm10
2876 movaps 0x90(%rbp), %xmm11
2877 movaps 0xa0(%rbp), %xmm12
2878 movaps 0xb0(%rbp), %xmm13
2879 movaps 0xc0(%rbp), %xmm14
2880 movaps 0xd0(%rbp), %xmm15
2881 lea 0xa0(%rbp), %rsp
2882___
2883$code.=<<___;
2884 mov 0x48(%rsp), %r15
2885 mov 0x50(%rsp), %r14
2886 mov 0x58(%rsp), %r13
2887 mov 0x60(%rsp), %r12
2888 mov 0x68(%rsp), %rbx
2889 mov 0x70(%rsp), %rax
2890 lea 0x78(%rsp), %rsp
2891 mov %rax, %rbp
2892.Lxts_dec_epilogue:
2893 ret
2894.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2895___
2896}
2897$code.=<<___;
2898.section .rodata
2899.type _bsaes_const,\@object
2900.align 64
2901_bsaes_const:
2902.LM0ISR: # InvShiftRows constants
2903 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2904.LISRM0:
2905 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2906.LISR:
2907 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2908.LBS0: # bit-slice constants
2909 .quad 0x5555555555555555, 0x5555555555555555
2910.LBS1:
2911 .quad 0x3333333333333333, 0x3333333333333333
2912.LBS2:
2913 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2914.LSR: # shiftrows constants
2915 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2916.LSRM0:
2917 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2918.LM0SR:
2919 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2920.LSWPUP: # byte-swap upper dword
2921 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2922.LSWPUPM0SR:
2923 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2924.LADD1: # counter increment constants
2925 .quad 0x0000000000000000, 0x0000000100000000
2926.LADD2:
2927 .quad 0x0000000000000000, 0x0000000200000000
2928.LADD3:
2929 .quad 0x0000000000000000, 0x0000000300000000
2930.LADD4:
2931 .quad 0x0000000000000000, 0x0000000400000000
2932.LADD5:
2933 .quad 0x0000000000000000, 0x0000000500000000
2934.LADD6:
2935 .quad 0x0000000000000000, 0x0000000600000000
2936.LADD7:
2937 .quad 0x0000000000000000, 0x0000000700000000
2938.LADD8:
2939 .quad 0x0000000000000000, 0x0000000800000000
2940.Lxts_magic: # GF(2^128) reduction: x^128 = x^7+x^2+x+1
2941 .long 0x87,0,1,0
2942.Lmasks: # bit-selection masks (key conversion)
2943 .quad 0x0101010101010101, 0x0101010101010101
2944 .quad 0x0202020202020202, 0x0202020202020202
2945 .quad 0x0404040404040404, 0x0404040404040404
2946 .quad 0x0808080808080808, 0x0808080808080808
2947.LM0:
2948 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2949.L63: # 0x63 in every byte (key schedule fix-up)
2950 .quad 0x6363636363636363, 0x6363636363636363
2951.align 64
2952.size _bsaes_const,.-_bsaes_const
2953.text
2954___
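
# .LBS0-.LBS2 above are the classic bit-slicing masks: paired with shifts
# of 1, 2 and 4 they exchange bit groups between registers until each of
# the eight state registers holds one bit plane.  One such exchange step,
# modelled on 64-bit scalars (illustrative only, not used by the
# generator; the helper name is made up):
sub _swapmove {			# returns the updated ($a, $b) pair
	my ($a, $b, $n, $mask) = @_;
	my $t = (($b >> $n) ^ $a) & $mask;	# select the bits that differ
	return ($a ^ $t, $b ^ ($t << $n));	# and exchange them
}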
2955
2956# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2957# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2958if ($win64) {
2959$rec="%rcx";
2960$frame="%rdx";
2961$context="%r8";
2962$disp="%r9";
2963
2964$code.=<<___;
2965.extern __imp_RtlVirtualUnwind
2966.type se_handler,\@abi-omnipotent
2967.align 16
2968se_handler:
2969 _CET_ENDBR
2970 push %rsi
2971 push %rdi
2972 push %rbx
2973 push %rbp
2974 push %r12
2975 push %r13
2976 push %r14
2977 push %r15
2978 pushfq
2979 sub \$64,%rsp
2980
2981 mov 120($context),%rax # pull context->Rax
2982 mov 248($context),%rbx # pull context->Rip
2983
2984 mov 8($disp),%rsi # disp->ImageBase
2985 mov 56($disp),%r11 # disp->HandlerData
2986
2987 mov 0(%r11),%r10d # HandlerData[0]
2988 lea (%rsi,%r10),%r10 # prologue label
2989 cmp %r10,%rbx # context->Rip<prologue label
2990 jb .Lin_prologue
2991
2992 mov 152($context),%rax # pull context->Rsp
2993
2994 mov 4(%r11),%r10d # HandlerData[1]
2995 lea (%rsi,%r10),%r10 # epilogue label
2996 cmp %r10,%rbx # context->Rip>=epilogue label
2997 jae .Lin_prologue
2998
2999 mov 160($context),%rax # pull context->Rbp
3000
3001 lea 0x40(%rax),%rsi # %xmm save area
3002 lea 512($context),%rdi # &context.Xmm6
3003 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
3004 .long 0xa548f3fc # cld; rep movsq
3005 lea 0xa0(%rax),%rax # adjust stack pointer
3006
3007 mov 0x70(%rax),%rbp
3008 mov 0x68(%rax),%rbx
3009 mov 0x60(%rax),%r12
3010 mov 0x58(%rax),%r13
3011 mov 0x50(%rax),%r14
3012 mov 0x48(%rax),%r15
3013 lea 0x78(%rax),%rax # adjust stack pointer
3014 mov %rbx,144($context) # restore context->Rbx
3015 mov %rbp,160($context) # restore context->Rbp
3016 mov %r12,216($context) # restore context->R12
3017 mov %r13,224($context) # restore context->R13
3018 mov %r14,232($context) # restore context->R14
3019 mov %r15,240($context) # restore context->R15
3020
3021.Lin_prologue:
3022 mov %rax,152($context) # restore context->Rsp
3023
3024 mov 40($disp),%rdi # disp->ContextRecord
3025 mov $context,%rsi # context
3026 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3027 .long 0xa548f3fc # cld; rep movsq
3028
3029 mov $disp,%rsi
3030 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3031 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3032 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3033 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3034 mov 40(%rsi),%r10 # disp->ContextRecord
3035 lea 56(%rsi),%r11 # &disp->HandlerData
3036 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3037 mov %r10,32(%rsp) # arg5
3038 mov %r11,40(%rsp) # arg6
3039 mov %r12,48(%rsp) # arg7
3040 mov %rcx,56(%rsp) # arg8, (NULL)
3041 call *__imp_RtlVirtualUnwind(%rip)
3042
3043 mov \$1,%eax # ExceptionContinueSearch
3044 add \$64,%rsp
3045 popfq
3046 pop %r15
3047 pop %r14
3048 pop %r13
3049 pop %r12
3050 pop %rbp
3051 pop %rbx
3052 pop %rdi
3053 pop %rsi
3054 ret
3055.size se_handler,.-se_handler
3056
3057.section .pdata
3058.align 4
3059___
3060$code.=<<___ if ($ecb);
3061 .rva .Lecb_enc_prologue
3062 .rva .Lecb_enc_epilogue
3063 .rva .Lecb_enc_info
3064
3065 .rva .Lecb_dec_prologue
3066 .rva .Lecb_dec_epilogue
3067 .rva .Lecb_dec_info
3068___
3069$code.=<<___;
3070 .rva .Lcbc_dec_prologue
3071 .rva .Lcbc_dec_epilogue
3072 .rva .Lcbc_dec_info
3073
3074 .rva .Lctr_enc_prologue
3075 .rva .Lctr_enc_epilogue
3076 .rva .Lctr_enc_info
3077
3078 .rva .Lxts_enc_prologue
3079 .rva .Lxts_enc_epilogue
3080 .rva .Lxts_enc_info
3081
3082 .rva .Lxts_dec_prologue
3083 .rva .Lxts_dec_epilogue
3084 .rva .Lxts_dec_info
3085
3086.section .xdata
3087.align 8
3088___
3089$code.=<<___ if ($ecb);
3090.Lecb_enc_info:
3091 .byte 9,0,0,0
3092 .rva se_handler
3093 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3094.Lecb_dec_info:
3095 .byte 9,0,0,0
3096 .rva se_handler
3097 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3098___
3099$code.=<<___;
3100.Lcbc_dec_info:
3101 .byte 9,0,0,0
3102 .rva se_handler
3103 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3104.Lctr_enc_info:
3105 .byte 9,0,0,0
3106 .rva se_handler
3107 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3108.Lxts_enc_info:
3109 .byte 9,0,0,0
3110 .rva se_handler
3111 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3112.Lxts_dec_info:
3113 .byte 9,0,0,0
3114 .rva se_handler
3115 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3116___
3117}
3118
3119$code =~ s/\`([^\`]*)\`/eval($1)/gem;	# evaluate compile-time expressions in backticks
3120
3121print $code;
3122
3123close STDOUT or die "error closing STDOUT: $!";