path: root/src/lib/libcrypto/aes/asm
author      djm <>  2012-10-13 21:23:50 +0000
committer   djm <>  2012-10-13 21:23:50 +0000
commit      228cae30b117c2493f69ad3c195341cd6ec8d430 (patch)
tree        29ff00b10d52c0978077c4fd83c33b065bade73e /src/lib/libcrypto/aes/asm
parent      731838c66b52c0ae5888333005b74115a620aa96 (diff)
import OpenSSL-1.0.1c
Diffstat (limited to 'src/lib/libcrypto/aes/asm')
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-586.pl            |   14
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-armv4.pl          |  182
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-mips.pl           | 1611
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-parisc.pl         | 1021
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-ppc.pl            |  444
-rw-r--r--  src/lib/libcrypto/aes/asm/aes-s390x.pl          | 1071
-rwxr-xr-x  src/lib/libcrypto/aes/asm/aes-sparcv9.pl        |    3
-rwxr-xr-x  src/lib/libcrypto/aes/asm/aes-x86_64.pl         |   45
-rw-r--r--  src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl  | 1249
-rw-r--r--  src/lib/libcrypto/aes/asm/aesni-x86.pl          | 2189
-rw-r--r--  src/lib/libcrypto/aes/asm/aesni-x86_64.pl       | 2478
-rw-r--r--  src/lib/libcrypto/aes/asm/bsaes-x86_64.pl       | 3044
-rw-r--r--  src/lib/libcrypto/aes/asm/vpaes-x86.pl          |  903
-rw-r--r--  src/lib/libcrypto/aes/asm/vpaes-x86_64.pl       | 1206
14 files changed, 14982 insertions(+), 478 deletions(-)
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl
index aab40e6f1c..687ed811be 100644
--- a/src/lib/libcrypto/aes/asm/aes-586.pl
+++ b/src/lib/libcrypto/aes/asm/aes-586.pl
@@ -39,7 +39,7 @@
39# but exhibits up to 10% improvement on other cores. 39# but exhibits up to 10% improvement on other cores.
40# 40#
41# Second version is "monolithic" replacement for aes_core.c, which in 41# Second version is "monolithic" replacement for aes_core.c, which in
42# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key. 42# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
43# This made it possible to implement little-endian variant of the 43# This made it possible to implement little-endian variant of the
44# algorithm without modifying the base C code. Motivating factor for 44# algorithm without modifying the base C code. Motivating factor for
45# the undertaken effort was that it appeared that in tight IA-32 45# the undertaken effort was that it appeared that in tight IA-32
@@ -2854,12 +2854,12 @@ sub enckey()
2854 &set_label("exit"); 2854 &set_label("exit");
2855&function_end("_x86_AES_set_encrypt_key"); 2855&function_end("_x86_AES_set_encrypt_key");
2856 2856
2857# int AES_set_encrypt_key(const unsigned char *userKey, const int bits, 2857# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
2858# AES_KEY *key) 2858# AES_KEY *key)
2859&function_begin_B("AES_set_encrypt_key"); 2859&function_begin_B("private_AES_set_encrypt_key");
2860 &call ("_x86_AES_set_encrypt_key"); 2860 &call ("_x86_AES_set_encrypt_key");
2861 &ret (); 2861 &ret ();
2862&function_end_B("AES_set_encrypt_key"); 2862&function_end_B("private_AES_set_encrypt_key");
2863 2863
2864sub deckey() 2864sub deckey()
2865{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; 2865{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
@@ -2916,9 +2916,9 @@ sub deckey()
2916 &mov (&DWP(4*$i,$key),$tp1); 2916 &mov (&DWP(4*$i,$key),$tp1);
2917} 2917}
2918 2918
2919# int AES_set_decrypt_key(const unsigned char *userKey, const int bits, 2919# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
2920# AES_KEY *key) 2920# AES_KEY *key)
2921&function_begin_B("AES_set_decrypt_key"); 2921&function_begin_B("private_AES_set_decrypt_key");
2922 &call ("_x86_AES_set_encrypt_key"); 2922 &call ("_x86_AES_set_encrypt_key");
2923 &cmp ("eax",0); 2923 &cmp ("eax",0);
2924 &je (&label("proceed")); 2924 &je (&label("proceed"));
@@ -2974,7 +2974,7 @@ sub deckey()
2974 &jb (&label("permute")); 2974 &jb (&label("permute"));
2975 2975
2976 &xor ("eax","eax"); # return success 2976 &xor ("eax","eax"); # return success
2977&function_end("AES_set_decrypt_key"); 2977&function_end("private_AES_set_decrypt_key");
2978&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>"); 2978&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
2979 2979
2980&asm_finish(); 2980&asm_finish();
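
The rename above frees the public AES_set_[de|en]crypt_key names for C-level
wrappers, with the assembly exporting only the private_ entry points. A minimal
sketch of that arrangement, assuming a plain pass-through wrapper (the actual
guard logic, e.g. any FIPS-mode checks, lives outside this diff):

#include <openssl/aes.h>

/* Provided by the assembly module above; prototype taken from the
 * comments in aes-586.pl. */
int private_AES_set_encrypt_key(const unsigned char *userKey,
                                const int bits, AES_KEY *key);

/* Public entry point; a plain pass-through here, purely illustrative. */
int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
                        AES_KEY *key)
{
	return private_AES_set_encrypt_key(userKey, bits, key);
}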
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl
index c51ee1fbf6..86b86c4a0f 100644
--- a/src/lib/libcrypto/aes/asm/aes-armv4.pl
+++ b/src/lib/libcrypto/aes/asm/aes-armv4.pl
@@ -27,6 +27,11 @@
27# Rescheduling for dual-issue pipeline resulted in 12% improvement on 27# Rescheduling for dual-issue pipeline resulted in 12% improvement on
28# Cortex A8 core and ~25 cycles per byte processed with 128-bit key. 28# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
29 29
30# February 2011.
31#
32# Profiler-assisted and platform-specific optimization resulted in 16%
33# improvement on Cortex A8 core and ~21.5 cycles per byte.
34
30while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} 35while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
31open STDOUT,">$output"; 36open STDOUT,">$output";
32 37
@@ -46,6 +51,7 @@ $key="r11";
46$rounds="r12"; 51$rounds="r12";
47 52
48$code=<<___; 53$code=<<___;
54#include "arm_arch.h"
49.text 55.text
50.code 32 56.code 32
51 57
@@ -166,7 +172,7 @@ AES_encrypt:
166 mov $rounds,r0 @ inp 172 mov $rounds,r0 @ inp
167 mov $key,r2 173 mov $key,r2
168 sub $tbl,r3,#AES_encrypt-AES_Te @ Te 174 sub $tbl,r3,#AES_encrypt-AES_Te @ Te
169 175#if __ARM_ARCH__<7
170 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral 176 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
171 ldrb $t1,[$rounds,#2] @ manner... 177 ldrb $t1,[$rounds,#2] @ manner...
172 ldrb $t2,[$rounds,#1] 178 ldrb $t2,[$rounds,#1]
@@ -195,10 +201,33 @@ AES_encrypt:
195 orr $s3,$s3,$t1,lsl#8 201 orr $s3,$s3,$t1,lsl#8
196 orr $s3,$s3,$t2,lsl#16 202 orr $s3,$s3,$t2,lsl#16
197 orr $s3,$s3,$t3,lsl#24 203 orr $s3,$s3,$t3,lsl#24
198 204#else
205 ldr $s0,[$rounds,#0]
206 ldr $s1,[$rounds,#4]
207 ldr $s2,[$rounds,#8]
208 ldr $s3,[$rounds,#12]
209#ifdef __ARMEL__
210 rev $s0,$s0
211 rev $s1,$s1
212 rev $s2,$s2
213 rev $s3,$s3
214#endif
215#endif
199 bl _armv4_AES_encrypt 216 bl _armv4_AES_encrypt
200 217
201 ldr $rounds,[sp],#4 @ pop out 218 ldr $rounds,[sp],#4 @ pop out
219#if __ARM_ARCH__>=7
220#ifdef __ARMEL__
221 rev $s0,$s0
222 rev $s1,$s1
223 rev $s2,$s2
224 rev $s3,$s3
225#endif
226 str $s0,[$rounds,#0]
227 str $s1,[$rounds,#4]
228 str $s2,[$rounds,#8]
229 str $s3,[$rounds,#12]
230#else
202 mov $t1,$s0,lsr#24 @ write output in endian-neutral 231 mov $t1,$s0,lsr#24 @ write output in endian-neutral
203 mov $t2,$s0,lsr#16 @ manner... 232 mov $t2,$s0,lsr#16 @ manner...
204 mov $t3,$s0,lsr#8 233 mov $t3,$s0,lsr#8
@@ -227,11 +256,15 @@ AES_encrypt:
227 strb $t2,[$rounds,#13] 256 strb $t2,[$rounds,#13]
228 strb $t3,[$rounds,#14] 257 strb $t3,[$rounds,#14]
229 strb $s3,[$rounds,#15] 258 strb $s3,[$rounds,#15]
230 259#endif
260#if __ARM_ARCH__>=5
261 ldmia sp!,{r4-r12,pc}
262#else
231 ldmia sp!,{r4-r12,lr} 263 ldmia sp!,{r4-r12,lr}
232 tst lr,#1 264 tst lr,#1
233 moveq pc,lr @ be binary compatible with V4, yet 265 moveq pc,lr @ be binary compatible with V4, yet
234 bx lr @ interoperable with Thumb ISA:-) 266 bx lr @ interoperable with Thumb ISA:-)
267#endif
235.size AES_encrypt,.-AES_encrypt 268.size AES_encrypt,.-AES_encrypt
236 269
237.type _armv4_AES_encrypt,%function 270.type _armv4_AES_encrypt,%function
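
For reference, the two input paths the new #if __ARM_ARCH__<7 split selects
between are equivalent to the following C (a sketch; __builtin_bswap32 stands
in for the rev instruction and is a GCC/Clang builtin, not part of this code):

#include <stdint.h>
#include <string.h>

/* Pre-ARMv7 path: assemble the word byte by byte; endian-neutral and
 * safe for unaligned input. */
static uint32_t load_word_v4(const unsigned char *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
}

/* ARMv7+ path: one ldr, then rev on little-endian cores; ARMv7's
 * support for unaligned word loads is what makes this safe. */
static uint32_t load_word_v7(const unsigned char *p)
{
	uint32_t w;
	memcpy(&w, p, sizeof(w));	/* ldr */
#ifdef __ARMEL__
	w = __builtin_bswap32(w);	/* rev */
#endif
	return w;
}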
@@ -271,11 +304,11 @@ _armv4_AES_encrypt:
271 and $i2,lr,$s2,lsr#16 @ i1 304 and $i2,lr,$s2,lsr#16 @ i1
272 eor $t3,$t3,$i3,ror#8 305 eor $t3,$t3,$i3,ror#8
273 and $i3,lr,$s2 306 and $i3,lr,$s2
274 eor $s1,$s1,$t1,ror#24
275 ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] 307 ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
308 eor $s1,$s1,$t1,ror#24
309 ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
276 mov $s2,$s2,lsr#24 310 mov $s2,$s2,lsr#24
277 311
278 ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
279 ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] 312 ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
280 eor $s0,$s0,$i1,ror#16 313 eor $s0,$s0,$i1,ror#16
281 ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24] 314 ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
@@ -284,16 +317,16 @@ _armv4_AES_encrypt:
284 and $i2,lr,$s3,lsr#8 @ i1 317 and $i2,lr,$s3,lsr#8 @ i1
285 eor $t3,$t3,$i3,ror#16 318 eor $t3,$t3,$i3,ror#16
286 and $i3,lr,$s3,lsr#16 @ i2 319 and $i3,lr,$s3,lsr#16 @ i2
287 eor $s2,$s2,$t2,ror#16
288 ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] 320 ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
321 eor $s2,$s2,$t2,ror#16
322 ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
289 mov $s3,$s3,lsr#24 323 mov $s3,$s3,lsr#24
290 324
291 ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
292 ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] 325 ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
293 eor $s0,$s0,$i1,ror#24 326 eor $s0,$s0,$i1,ror#24
294 ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
295 eor $s1,$s1,$i2,ror#16
296 ldr $i1,[$key],#16 327 ldr $i1,[$key],#16
328 eor $s1,$s1,$i2,ror#16
329 ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
297 eor $s2,$s2,$i3,ror#8 330 eor $s2,$s2,$i3,ror#8
298 ldr $t1,[$key,#-12] 331 ldr $t1,[$key,#-12]
299 eor $s3,$s3,$t3,ror#8 332 eor $s3,$s3,$t3,ror#8
@@ -333,11 +366,11 @@ _armv4_AES_encrypt:
333 and $i2,lr,$s2,lsr#16 @ i1 366 and $i2,lr,$s2,lsr#16 @ i1
334 eor $t3,$i3,$t3,lsl#8 367 eor $t3,$i3,$t3,lsl#8
335 and $i3,lr,$s2 368 and $i3,lr,$s2
336 eor $s1,$t1,$s1,lsl#24
337 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] 369 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
370 eor $s1,$t1,$s1,lsl#24
371 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
338 mov $s2,$s2,lsr#24 372 mov $s2,$s2,lsr#24
339 373
340 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
341 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] 374 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
342 eor $s0,$i1,$s0,lsl#8 375 eor $s0,$i1,$s0,lsl#8
343 ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24] 376 ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
@@ -346,15 +379,15 @@ _armv4_AES_encrypt:
346 and $i2,lr,$s3,lsr#8 @ i1 379 and $i2,lr,$s3,lsr#8 @ i1
347 eor $t3,$i3,$t3,lsl#8 380 eor $t3,$i3,$t3,lsl#8
348 and $i3,lr,$s3,lsr#16 @ i2 381 and $i3,lr,$s3,lsr#16 @ i2
349 eor $s2,$t2,$s2,lsl#24
350 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] 382 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
383 eor $s2,$t2,$s2,lsl#24
384 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
351 mov $s3,$s3,lsr#24 385 mov $s3,$s3,lsr#24
352 386
353 ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
354 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] 387 ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
355 eor $s0,$i1,$s0,lsl#8 388 eor $s0,$i1,$s0,lsl#8
356 ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
357 ldr $i1,[$key,#0] 389 ldr $i1,[$key,#0]
390 ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
358 eor $s1,$s1,$i2,lsl#8 391 eor $s1,$s1,$i2,lsl#8
359 ldr $t1,[$key,#4] 392 ldr $t1,[$key,#4]
360 eor $s2,$s2,$i3,lsl#16 393 eor $s2,$s2,$i3,lsl#16
@@ -371,10 +404,11 @@ _armv4_AES_encrypt:
371 ldr pc,[sp],#4 @ pop and return 404 ldr pc,[sp],#4 @ pop and return
372.size _armv4_AES_encrypt,.-_armv4_AES_encrypt 405.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
373 406
374.global AES_set_encrypt_key 407.global private_AES_set_encrypt_key
375.type AES_set_encrypt_key,%function 408.type private_AES_set_encrypt_key,%function
376.align 5 409.align 5
377AES_set_encrypt_key: 410private_AES_set_encrypt_key:
411_armv4_AES_set_encrypt_key:
378 sub r3,pc,#8 @ AES_set_encrypt_key 412 sub r3,pc,#8 @ AES_set_encrypt_key
379 teq r0,#0 413 teq r0,#0
380 moveq r0,#-1 414 moveq r0,#-1
@@ -392,12 +426,13 @@ AES_set_encrypt_key:
392 bne .Labrt 426 bne .Labrt
393 427
394.Lok: stmdb sp!,{r4-r12,lr} 428.Lok: stmdb sp!,{r4-r12,lr}
395 sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4 429 sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
396 430
397 mov $rounds,r0 @ inp 431 mov $rounds,r0 @ inp
398 mov lr,r1 @ bits 432 mov lr,r1 @ bits
399 mov $key,r2 @ key 433 mov $key,r2 @ key
400 434
435#if __ARM_ARCH__<7
401 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral 436 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
402 ldrb $t1,[$rounds,#2] @ manner... 437 ldrb $t1,[$rounds,#2] @ manner...
403 ldrb $t2,[$rounds,#1] 438 ldrb $t2,[$rounds,#1]
@@ -430,6 +465,22 @@ AES_set_encrypt_key:
430 orr $s3,$s3,$t3,lsl#24 465 orr $s3,$s3,$t3,lsl#24
431 str $s2,[$key,#-8] 466 str $s2,[$key,#-8]
432 str $s3,[$key,#-4] 467 str $s3,[$key,#-4]
468#else
469 ldr $s0,[$rounds,#0]
470 ldr $s1,[$rounds,#4]
471 ldr $s2,[$rounds,#8]
472 ldr $s3,[$rounds,#12]
473#ifdef __ARMEL__
474 rev $s0,$s0
475 rev $s1,$s1
476 rev $s2,$s2
477 rev $s3,$s3
478#endif
479 str $s0,[$key],#16
480 str $s1,[$key,#-12]
481 str $s2,[$key,#-8]
482 str $s3,[$key,#-4]
483#endif
433 484
434 teq lr,#128 485 teq lr,#128
435 bne .Lnot128 486 bne .Lnot128
@@ -466,6 +517,7 @@ AES_set_encrypt_key:
466 b .Ldone 517 b .Ldone
467 518
468.Lnot128: 519.Lnot128:
520#if __ARM_ARCH__<7
469 ldrb $i2,[$rounds,#19] 521 ldrb $i2,[$rounds,#19]
470 ldrb $t1,[$rounds,#18] 522 ldrb $t1,[$rounds,#18]
471 ldrb $t2,[$rounds,#17] 523 ldrb $t2,[$rounds,#17]
@@ -482,6 +534,16 @@ AES_set_encrypt_key:
482 str $i2,[$key],#8 534 str $i2,[$key],#8
483 orr $i3,$i3,$t3,lsl#24 535 orr $i3,$i3,$t3,lsl#24
484 str $i3,[$key,#-4] 536 str $i3,[$key,#-4]
537#else
538 ldr $i2,[$rounds,#16]
539 ldr $i3,[$rounds,#20]
540#ifdef __ARMEL__
541 rev $i2,$i2
542 rev $i3,$i3
543#endif
544 str $i2,[$key],#8
545 str $i3,[$key,#-4]
546#endif
485 547
486 teq lr,#192 548 teq lr,#192
487 bne .Lnot192 549 bne .Lnot192
@@ -526,6 +588,7 @@ AES_set_encrypt_key:
526 b .L192_loop 588 b .L192_loop
527 589
528.Lnot192: 590.Lnot192:
591#if __ARM_ARCH__<7
529 ldrb $i2,[$rounds,#27] 592 ldrb $i2,[$rounds,#27]
530 ldrb $t1,[$rounds,#26] 593 ldrb $t1,[$rounds,#26]
531 ldrb $t2,[$rounds,#25] 594 ldrb $t2,[$rounds,#25]
@@ -542,6 +605,16 @@ AES_set_encrypt_key:
542 str $i2,[$key],#8 605 str $i2,[$key],#8
543 orr $i3,$i3,$t3,lsl#24 606 orr $i3,$i3,$t3,lsl#24
544 str $i3,[$key,#-4] 607 str $i3,[$key,#-4]
608#else
609 ldr $i2,[$rounds,#24]
610 ldr $i3,[$rounds,#28]
611#ifdef __ARMEL__
612 rev $i2,$i2
613 rev $i3,$i3
614#endif
615 str $i2,[$key],#8
616 str $i3,[$key,#-4]
617#endif
545 618
546 mov $rounds,#14 619 mov $rounds,#14
547 str $rounds,[$key,#240-32] 620 str $rounds,[$key,#240-32]
@@ -606,14 +679,14 @@ AES_set_encrypt_key:
606.Labrt: tst lr,#1 679.Labrt: tst lr,#1
607 moveq pc,lr @ be binary compatible with V4, yet 680 moveq pc,lr @ be binary compatible with V4, yet
608 bx lr @ interoperable with Thumb ISA:-) 681 bx lr @ interoperable with Thumb ISA:-)
609.size AES_set_encrypt_key,.-AES_set_encrypt_key 682.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
610 683
611.global AES_set_decrypt_key 684.global private_AES_set_decrypt_key
612.type AES_set_decrypt_key,%function 685.type private_AES_set_decrypt_key,%function
613.align 5 686.align 5
614AES_set_decrypt_key: 687private_AES_set_decrypt_key:
615 str lr,[sp,#-4]! @ push lr 688 str lr,[sp,#-4]! @ push lr
616 bl AES_set_encrypt_key 689 bl _armv4_AES_set_encrypt_key
617 teq r0,#0 690 teq r0,#0
618 ldrne lr,[sp],#4 @ pop lr 691 ldrne lr,[sp],#4 @ pop lr
619 bne .Labrt 692 bne .Labrt
@@ -692,11 +765,15 @@ $code.=<<___;
692 bne .Lmix 765 bne .Lmix
693 766
694 mov r0,#0 767 mov r0,#0
768#if __ARM_ARCH__>=5
769 ldmia sp!,{r4-r12,pc}
770#else
695 ldmia sp!,{r4-r12,lr} 771 ldmia sp!,{r4-r12,lr}
696 tst lr,#1 772 tst lr,#1
697 moveq pc,lr @ be binary compatible with V4, yet 773 moveq pc,lr @ be binary compatible with V4, yet
698 bx lr @ interoperable with Thumb ISA:-) 774 bx lr @ interoperable with Thumb ISA:-)
699.size AES_set_decrypt_key,.-AES_set_decrypt_key 775#endif
776.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
700 777
701.type AES_Td,%object 778.type AES_Td,%object
702.align 5 779.align 5
@@ -811,7 +888,7 @@ AES_decrypt:
811 mov $rounds,r0 @ inp 888 mov $rounds,r0 @ inp
812 mov $key,r2 889 mov $key,r2
813 sub $tbl,r3,#AES_decrypt-AES_Td @ Td 890 sub $tbl,r3,#AES_decrypt-AES_Td @ Td
814 891#if __ARM_ARCH__<7
815 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral 892 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
816 ldrb $t1,[$rounds,#2] @ manner... 893 ldrb $t1,[$rounds,#2] @ manner...
817 ldrb $t2,[$rounds,#1] 894 ldrb $t2,[$rounds,#1]
@@ -840,10 +917,33 @@ AES_decrypt:
840 orr $s3,$s3,$t1,lsl#8 917 orr $s3,$s3,$t1,lsl#8
841 orr $s3,$s3,$t2,lsl#16 918 orr $s3,$s3,$t2,lsl#16
842 orr $s3,$s3,$t3,lsl#24 919 orr $s3,$s3,$t3,lsl#24
843 920#else
921 ldr $s0,[$rounds,#0]
922 ldr $s1,[$rounds,#4]
923 ldr $s2,[$rounds,#8]
924 ldr $s3,[$rounds,#12]
925#ifdef __ARMEL__
926 rev $s0,$s0
927 rev $s1,$s1
928 rev $s2,$s2
929 rev $s3,$s3
930#endif
931#endif
844 bl _armv4_AES_decrypt 932 bl _armv4_AES_decrypt
845 933
846 ldr $rounds,[sp],#4 @ pop out 934 ldr $rounds,[sp],#4 @ pop out
935#if __ARM_ARCH__>=7
936#ifdef __ARMEL__
937 rev $s0,$s0
938 rev $s1,$s1
939 rev $s2,$s2
940 rev $s3,$s3
941#endif
942 str $s0,[$rounds,#0]
943 str $s1,[$rounds,#4]
944 str $s2,[$rounds,#8]
945 str $s3,[$rounds,#12]
946#else
847 mov $t1,$s0,lsr#24 @ write output in endian-neutral 947 mov $t1,$s0,lsr#24 @ write output in endian-neutral
848 mov $t2,$s0,lsr#16 @ manner... 948 mov $t2,$s0,lsr#16 @ manner...
849 mov $t3,$s0,lsr#8 949 mov $t3,$s0,lsr#8
@@ -872,11 +972,15 @@ AES_decrypt:
872 strb $t2,[$rounds,#13] 972 strb $t2,[$rounds,#13]
873 strb $t3,[$rounds,#14] 973 strb $t3,[$rounds,#14]
874 strb $s3,[$rounds,#15] 974 strb $s3,[$rounds,#15]
875 975#endif
976#if __ARM_ARCH__>=5
977 ldmia sp!,{r4-r12,pc}
978#else
876 ldmia sp!,{r4-r12,lr} 979 ldmia sp!,{r4-r12,lr}
877 tst lr,#1 980 tst lr,#1
878 moveq pc,lr @ be binary compatible with V4, yet 981 moveq pc,lr @ be binary compatible with V4, yet
879 bx lr @ interoperable with Thumb ISA:-) 982 bx lr @ interoperable with Thumb ISA:-)
983#endif
880.size AES_decrypt,.-AES_decrypt 984.size AES_decrypt,.-AES_decrypt
881 985
882.type _armv4_AES_decrypt,%function 986.type _armv4_AES_decrypt,%function
@@ -916,11 +1020,11 @@ _armv4_AES_decrypt:
916 and $i2,lr,$s2 @ i1 1020 and $i2,lr,$s2 @ i1
917 eor $t3,$i3,$t3,ror#8 1021 eor $t3,$i3,$t3,ror#8
918 and $i3,lr,$s2,lsr#16 1022 and $i3,lr,$s2,lsr#16
919 eor $s1,$s1,$t1,ror#8
920 ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] 1023 ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
1024 eor $s1,$s1,$t1,ror#8
1025 ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
921 mov $s2,$s2,lsr#24 1026 mov $s2,$s2,lsr#24
922 1027
923 ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
924 ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] 1028 ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
925 eor $s0,$s0,$i1,ror#16 1029 eor $s0,$s0,$i1,ror#16
926 ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24] 1030 ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
@@ -929,22 +1033,22 @@ _armv4_AES_decrypt:
929 and $i2,lr,$s3,lsr#8 @ i1 1033 and $i2,lr,$s3,lsr#8 @ i1
930 eor $t3,$i3,$t3,ror#8 1034 eor $t3,$i3,$t3,ror#8
931 and $i3,lr,$s3 @ i2 1035 and $i3,lr,$s3 @ i2
932 eor $s2,$s2,$t2,ror#8
933 ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] 1036 ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
1037 eor $s2,$s2,$t2,ror#8
1038 ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
934 mov $s3,$s3,lsr#24 1039 mov $s3,$s3,lsr#24
935 1040
936 ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
937 ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] 1041 ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
938 eor $s0,$s0,$i1,ror#8 1042 eor $s0,$s0,$i1,ror#8
939 ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24] 1043 ldr $i1,[$key],#16
940 eor $s1,$s1,$i2,ror#16 1044 eor $s1,$s1,$i2,ror#16
1045 ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
941 eor $s2,$s2,$i3,ror#24 1046 eor $s2,$s2,$i3,ror#24
942 ldr $i1,[$key],#16
943 eor $s3,$s3,$t3,ror#8
944 1047
945 ldr $t1,[$key,#-12] 1048 ldr $t1,[$key,#-12]
946 ldr $t2,[$key,#-8]
947 eor $s0,$s0,$i1 1049 eor $s0,$s0,$i1
1050 ldr $t2,[$key,#-8]
1051 eor $s3,$s3,$t3,ror#8
948 ldr $t3,[$key,#-4] 1052 ldr $t3,[$key,#-4]
949 and $i1,lr,$s0,lsr#16 1053 and $i1,lr,$s0,lsr#16
950 eor $s1,$s1,$t1 1054 eor $s1,$s1,$t1
@@ -985,11 +1089,11 @@ _armv4_AES_decrypt:
985 and $i1,lr,$s2,lsr#8 @ i0 1089 and $i1,lr,$s2,lsr#8 @ i0
986 eor $t2,$t2,$i2,lsl#8 1090 eor $t2,$t2,$i2,lsl#8
987 and $i2,lr,$s2 @ i1 1091 and $i2,lr,$s2 @ i1
988 eor $t3,$t3,$i3,lsl#8
989 ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] 1092 ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
1093 eor $t3,$t3,$i3,lsl#8
1094 ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
990 and $i3,lr,$s2,lsr#16 1095 and $i3,lr,$s2,lsr#16
991 1096
992 ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
993 ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] 1097 ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
994 eor $s0,$s0,$i1,lsl#8 1098 eor $s0,$s0,$i1,lsl#8
995 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] 1099 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
@@ -997,11 +1101,11 @@ _armv4_AES_decrypt:
997 and $i1,lr,$s3,lsr#16 @ i0 1101 and $i1,lr,$s3,lsr#16 @ i0
998 eor $s2,$t2,$s2,lsl#16 1102 eor $s2,$t2,$s2,lsl#16
999 and $i2,lr,$s3,lsr#8 @ i1 1103 and $i2,lr,$s3,lsr#8 @ i1
1000 eor $t3,$t3,$i3,lsl#16
1001 ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] 1104 ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
1105 eor $t3,$t3,$i3,lsl#16
1106 ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
1002 and $i3,lr,$s3 @ i2 1107 and $i3,lr,$s3 @ i2
1003 1108
1004 ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
1005 ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] 1109 ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
1006 ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] 1110 ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
1007 eor $s0,$s0,$i1,lsl#16 1111 eor $s0,$s0,$i1,lsl#16
diff --git a/src/lib/libcrypto/aes/asm/aes-mips.pl b/src/lib/libcrypto/aes/asm/aes-mips.pl
new file mode 100644
index 0000000000..2ce6deffc8
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-mips.pl
@@ -0,0 +1,1611 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for MIPS
11
12# October 2010
13#
14# Code uses 1K[+256B] S-box and on single-issue core [such as R5000]
15# spends ~68 cycles per byte processed with 128-bit key. This is ~16%
16# faster than gcc-generated code, which is not very impressive. But
17# recall that compressed S-box requires extra processing, namely
18# additional rotations. Rotations are implemented with lwl/lwr pairs,
18# which are normally used for loading unaligned data. Another cool
20# thing about this module is its endian neutrality, which means that
21# it processes data without ever changing byte order...
22
23######################################################################
24# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
25# widely used. Then there is a new contender: NUBI. It appears that if
26# one picks the latter, it's possible to arrange code in an ABI-neutral
27# manner. Therefore let's stick to the NUBI register layout:
28#
29($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
30($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
31($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
32($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
33#
34# The return value is placed in $a0. Following coding rules facilitate
35# interoperability:
36#
37# - never ever touch $tp, "thread pointer", former $gp;
38# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
39# old code];
40# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
41#
42# For reference here is register layout for N32/64 MIPS ABIs:
43#
44# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
45# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
46# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
47# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
48# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
49#
50$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
51
52if ($flavour =~ /64|n32/i) {
53 $PTR_ADD="dadd"; # incidentally works even on n32
54 $PTR_SUB="dsub"; # incidentally works even on n32
55 $REG_S="sd";
56 $REG_L="ld";
57 $PTR_SLL="dsll"; # incidentally works even on n32
58 $SZREG=8;
59} else {
60 $PTR_ADD="add";
61 $PTR_SUB="sub";
62 $REG_S="sw";
63 $REG_L="lw";
64 $PTR_SLL="sll";
65 $SZREG=4;
66}
67$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
68#
69# <appro@openssl.org>
70#
71######################################################################
72
73$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
74
75for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
76open STDOUT,">$output";
77
78if (!defined($big_endian))
79{ $big_endian=(unpack('L',pack('N',1))==1); }
80
81while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
82open STDOUT,">$output";
83
84my ($MSB,$LSB)=(0,3); # automatically converted to little-endian
85
86$code.=<<___;
87.text
88#ifdef OPENSSL_FIPSCANISTER
89# include <openssl/fipssyms.h>
90#endif
91
92#if !defined(__vxworks) || defined(__pic__)
93.option pic2
94#endif
95.set noat
96___
97
98{{{
99my $FRAMESIZE=16*$SZREG;
100my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
101
102my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);
103my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
104my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23));
105my ($key0,$cnt)=($gp,$fp);
106
107# instruction ordering is "stolen" from the output of the MIPSpro assembler
108# invoked with -mips3 -O3 arguments...
109$code.=<<___;
110.align 5
111.ent _mips_AES_encrypt
112_mips_AES_encrypt:
113 .frame $sp,0,$ra
114 .set reorder
115 lw $t0,0($key)
116 lw $t1,4($key)
117 lw $t2,8($key)
118 lw $t3,12($key)
119 lw $cnt,240($key)
120 $PTR_ADD $key0,$key,16
121
122 xor $s0,$t0
123 xor $s1,$t1
124 xor $s2,$t2
125 xor $s3,$t3
126
127 sub $cnt,1
128 _xtr $i0,$s1,16-2
129.Loop_enc:
130 _xtr $i1,$s2,16-2
131 _xtr $i2,$s3,16-2
132 _xtr $i3,$s0,16-2
133 and $i0,0x3fc
134 and $i1,0x3fc
135 and $i2,0x3fc
136 and $i3,0x3fc
137 $PTR_ADD $i0,$Tbl
138 $PTR_ADD $i1,$Tbl
139 $PTR_ADD $i2,$Tbl
140 $PTR_ADD $i3,$Tbl
141 lwl $t0,3($i0) # Te1[s1>>16]
142 lwl $t1,3($i1) # Te1[s2>>16]
143 lwl $t2,3($i2) # Te1[s3>>16]
144 lwl $t3,3($i3) # Te1[s0>>16]
145 lwr $t0,2($i0) # Te1[s1>>16]
146 lwr $t1,2($i1) # Te1[s2>>16]
147 lwr $t2,2($i2) # Te1[s3>>16]
148 lwr $t3,2($i3) # Te1[s0>>16]
149
150 _xtr $i0,$s2,8-2
151 _xtr $i1,$s3,8-2
152 _xtr $i2,$s0,8-2
153 _xtr $i3,$s1,8-2
154 and $i0,0x3fc
155 and $i1,0x3fc
156 and $i2,0x3fc
157 and $i3,0x3fc
158 $PTR_ADD $i0,$Tbl
159 $PTR_ADD $i1,$Tbl
160 $PTR_ADD $i2,$Tbl
161 $PTR_ADD $i3,$Tbl
162 lwl $t4,2($i0) # Te2[s2>>8]
163 lwl $t5,2($i1) # Te2[s3>>8]
164 lwl $t6,2($i2) # Te2[s0>>8]
165 lwl $t7,2($i3) # Te2[s1>>8]
166 lwr $t4,1($i0) # Te2[s2>>8]
167 lwr $t5,1($i1) # Te2[s3>>8]
168 lwr $t6,1($i2) # Te2[s0>>8]
169 lwr $t7,1($i3) # Te2[s1>>8]
170
171 _xtr $i0,$s3,0-2
172 _xtr $i1,$s0,0-2
173 _xtr $i2,$s1,0-2
174 _xtr $i3,$s2,0-2
175 and $i0,0x3fc
176 and $i1,0x3fc
177 and $i2,0x3fc
178 and $i3,0x3fc
179 $PTR_ADD $i0,$Tbl
180 $PTR_ADD $i1,$Tbl
181 $PTR_ADD $i2,$Tbl
182 $PTR_ADD $i3,$Tbl
183 lwl $t8,1($i0) # Te3[s3]
184 lwl $t9,1($i1) # Te3[s0]
185 lwl $t10,1($i2) # Te3[s1]
186 lwl $t11,1($i3) # Te3[s2]
187 lwr $t8,0($i0) # Te3[s3]
188 lwr $t9,0($i1) # Te3[s0]
189 lwr $t10,0($i2) # Te3[s1]
190 lwr $t11,0($i3) # Te3[s2]
191
192 _xtr $i0,$s0,24-2
193 _xtr $i1,$s1,24-2
194 _xtr $i2,$s2,24-2
195 _xtr $i3,$s3,24-2
196 and $i0,0x3fc
197 and $i1,0x3fc
198 and $i2,0x3fc
199 and $i3,0x3fc
200 $PTR_ADD $i0,$Tbl
201 $PTR_ADD $i1,$Tbl
202 $PTR_ADD $i2,$Tbl
203 $PTR_ADD $i3,$Tbl
204 xor $t0,$t4
205 xor $t1,$t5
206 xor $t2,$t6
207 xor $t3,$t7
208 lw $t4,0($i0) # Te0[s0>>24]
209 lw $t5,0($i1) # Te0[s1>>24]
210 lw $t6,0($i2) # Te0[s2>>24]
211 lw $t7,0($i3) # Te0[s3>>24]
212
213 lw $s0,0($key0)
214 lw $s1,4($key0)
215 lw $s2,8($key0)
216 lw $s3,12($key0)
217
218 xor $t0,$t8
219 xor $t1,$t9
220 xor $t2,$t10
221 xor $t3,$t11
222
223 xor $t0,$t4
224 xor $t1,$t5
225 xor $t2,$t6
226 xor $t3,$t7
227
228 sub $cnt,1
229 $PTR_ADD $key0,16
230 xor $s0,$t0
231 xor $s1,$t1
232 xor $s2,$t2
233 xor $s3,$t3
234 .set noreorder
235 bnez $cnt,.Loop_enc
236 _xtr $i0,$s1,16-2
237
238 .set reorder
239 _xtr $i1,$s2,16-2
240 _xtr $i2,$s3,16-2
241 _xtr $i3,$s0,16-2
242 and $i0,0x3fc
243 and $i1,0x3fc
244 and $i2,0x3fc
245 and $i3,0x3fc
246 $PTR_ADD $i0,$Tbl
247 $PTR_ADD $i1,$Tbl
248 $PTR_ADD $i2,$Tbl
249 $PTR_ADD $i3,$Tbl
250 lbu $t0,2($i0) # Te4[s1>>16]
251 lbu $t1,2($i1) # Te4[s2>>16]
252 lbu $t2,2($i2) # Te4[s3>>16]
253 lbu $t3,2($i3) # Te4[s0>>16]
254
255 _xtr $i0,$s2,8-2
256 _xtr $i1,$s3,8-2
257 _xtr $i2,$s0,8-2
258 _xtr $i3,$s1,8-2
259 and $i0,0x3fc
260 and $i1,0x3fc
261 and $i2,0x3fc
262 and $i3,0x3fc
263 $PTR_ADD $i0,$Tbl
264 $PTR_ADD $i1,$Tbl
265 $PTR_ADD $i2,$Tbl
266 $PTR_ADD $i3,$Tbl
267 lbu $t4,2($i0) # Te4[s2>>8]
268 lbu $t5,2($i1) # Te4[s3>>8]
269 lbu $t6,2($i2) # Te4[s0>>8]
270 lbu $t7,2($i3) # Te4[s1>>8]
271
272 _xtr $i0,$s0,24-2
273 _xtr $i1,$s1,24-2
274 _xtr $i2,$s2,24-2
275 _xtr $i3,$s3,24-2
276 and $i0,0x3fc
277 and $i1,0x3fc
278 and $i2,0x3fc
279 and $i3,0x3fc
280 $PTR_ADD $i0,$Tbl
281 $PTR_ADD $i1,$Tbl
282 $PTR_ADD $i2,$Tbl
283 $PTR_ADD $i3,$Tbl
284 lbu $t8,2($i0) # Te4[s0>>24]
285 lbu $t9,2($i1) # Te4[s1>>24]
286 lbu $t10,2($i2) # Te4[s2>>24]
287 lbu $t11,2($i3) # Te4[s3>>24]
288
289 _xtr $i0,$s3,0-2
290 _xtr $i1,$s0,0-2
291 _xtr $i2,$s1,0-2
292 _xtr $i3,$s2,0-2
293 and $i0,0x3fc
294 and $i1,0x3fc
295 and $i2,0x3fc
296 and $i3,0x3fc
297
298 _ins $t0,16
299 _ins $t1,16
300 _ins $t2,16
301 _ins $t3,16
302
303 _ins $t4,8
304 _ins $t5,8
305 _ins $t6,8
306 _ins $t7,8
307
308 xor $t0,$t4
309 xor $t1,$t5
310 xor $t2,$t6
311 xor $t3,$t7
312
313 $PTR_ADD $i0,$Tbl
314 $PTR_ADD $i1,$Tbl
315 $PTR_ADD $i2,$Tbl
316 $PTR_ADD $i3,$Tbl
317 lbu $t4,2($i0) # Te4[s3]
318 lbu $t5,2($i1) # Te4[s0]
319 lbu $t6,2($i2) # Te4[s1]
320 lbu $t7,2($i3) # Te4[s2]
321
322 _ins $t8,24
323 _ins $t9,24
324 _ins $t10,24
325 _ins $t11,24
326
327 lw $s0,0($key0)
328 lw $s1,4($key0)
329 lw $s2,8($key0)
330 lw $s3,12($key0)
331
332 xor $t0,$t8
333 xor $t1,$t9
334 xor $t2,$t10
335 xor $t3,$t11
336
337 _ins $t4,0
338 _ins $t5,0
339 _ins $t6,0
340 _ins $t7,0
341
342 xor $t0,$t4
343 xor $t1,$t5
344 xor $t2,$t6
345 xor $t3,$t7
346
347 xor $s0,$t0
348 xor $s1,$t1
349 xor $s2,$t2
350 xor $s3,$t3
351
352 jr $ra
353.end _mips_AES_encrypt
354
355.align 5
356.globl AES_encrypt
357.ent AES_encrypt
358AES_encrypt:
359 .frame $sp,$FRAMESIZE,$ra
360 .mask $SAVED_REGS_MASK,-$SZREG
361 .set noreorder
362___
363$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
364 .cpload $pf
365___
366$code.=<<___;
367 $PTR_SUB $sp,$FRAMESIZE
368 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
369 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
370 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
371 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
372 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
373 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
374 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
375 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
376 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
377 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
378___
379$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
380 $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
381 $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
382 $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
383 $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
384 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
385___
386$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
387 .cplocal $Tbl
388 .cpsetup $pf,$zero,AES_encrypt
389___
390$code.=<<___;
391 .set reorder
392 la $Tbl,AES_Te # PIC-ified 'load address'
393
394 lwl $s0,0+$MSB($inp)
395 lwl $s1,4+$MSB($inp)
396 lwl $s2,8+$MSB($inp)
397 lwl $s3,12+$MSB($inp)
398 lwr $s0,0+$LSB($inp)
399 lwr $s1,4+$LSB($inp)
400 lwr $s2,8+$LSB($inp)
401 lwr $s3,12+$LSB($inp)
402
403 bal _mips_AES_encrypt
404
405 swr $s0,0+$LSB($out)
406 swr $s1,4+$LSB($out)
407 swr $s2,8+$LSB($out)
408 swr $s3,12+$LSB($out)
409 swl $s0,0+$MSB($out)
410 swl $s1,4+$MSB($out)
411 swl $s2,8+$MSB($out)
412 swl $s3,12+$MSB($out)
413
414 .set noreorder
415 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
416 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
417 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
418 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
419 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
420 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
421 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
422 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
423 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
424 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
425___
426$code.=<<___ if ($flavour =~ /nubi/i);
427 $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
428 $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
429 $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
430 $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
431 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
432___
433$code.=<<___;
434 jr $ra
435 $PTR_ADD $sp,$FRAMESIZE
436.end AES_encrypt
437___
438
439$code.=<<___;
440.align 5
441.ent _mips_AES_decrypt
442_mips_AES_decrypt:
443 .frame $sp,0,$ra
444 .set reorder
445 lw $t0,0($key)
446 lw $t1,4($key)
447 lw $t2,8($key)
448 lw $t3,12($key)
449 lw $cnt,240($key)
450 $PTR_ADD $key0,$key,16
451
452 xor $s0,$t0
453 xor $s1,$t1
454 xor $s2,$t2
455 xor $s3,$t3
456
457 sub $cnt,1
458 _xtr $i0,$s3,16-2
459.Loop_dec:
460 _xtr $i1,$s0,16-2
461 _xtr $i2,$s1,16-2
462 _xtr $i3,$s2,16-2
463 and $i0,0x3fc
464 and $i1,0x3fc
465 and $i2,0x3fc
466 and $i3,0x3fc
467 $PTR_ADD $i0,$Tbl
468 $PTR_ADD $i1,$Tbl
469 $PTR_ADD $i2,$Tbl
470 $PTR_ADD $i3,$Tbl
471 lwl $t0,3($i0) # Td1[s3>>16]
472 lwl $t1,3($i1) # Td1[s0>>16]
473 lwl $t2,3($i2) # Td1[s1>>16]
474 lwl $t3,3($i3) # Td1[s2>>16]
475 lwr $t0,2($i0) # Td1[s3>>16]
476 lwr $t1,2($i1) # Td1[s0>>16]
477 lwr $t2,2($i2) # Td1[s1>>16]
478 lwr $t3,2($i3) # Td1[s2>>16]
479
480 _xtr $i0,$s2,8-2
481 _xtr $i1,$s3,8-2
482 _xtr $i2,$s0,8-2
483 _xtr $i3,$s1,8-2
484 and $i0,0x3fc
485 and $i1,0x3fc
486 and $i2,0x3fc
487 and $i3,0x3fc
488 $PTR_ADD $i0,$Tbl
489 $PTR_ADD $i1,$Tbl
490 $PTR_ADD $i2,$Tbl
491 $PTR_ADD $i3,$Tbl
492 lwl $t4,2($i0) # Td2[s2>>8]
493 lwl $t5,2($i1) # Td2[s3>>8]
494 lwl $t6,2($i2) # Td2[s0>>8]
495 lwl $t7,2($i3) # Td2[s1>>8]
496 lwr $t4,1($i0) # Td2[s2>>8]
497 lwr $t5,1($i1) # Td2[s3>>8]
498 lwr $t6,1($i2) # Td2[s0>>8]
499 lwr $t7,1($i3) # Td2[s1>>8]
500
501 _xtr $i0,$s1,0-2
502 _xtr $i1,$s2,0-2
503 _xtr $i2,$s3,0-2
504 _xtr $i3,$s0,0-2
505 and $i0,0x3fc
506 and $i1,0x3fc
507 and $i2,0x3fc
508 and $i3,0x3fc
509 $PTR_ADD $i0,$Tbl
510 $PTR_ADD $i1,$Tbl
511 $PTR_ADD $i2,$Tbl
512 $PTR_ADD $i3,$Tbl
513 lwl $t8,1($i0) # Td3[s1]
514 lwl $t9,1($i1) # Td3[s2]
515 lwl $t10,1($i2) # Td3[s3]
516 lwl $t11,1($i3) # Td3[s0]
517 lwr $t8,0($i0) # Td3[s1]
518 lwr $t9,0($i1) # Td3[s2]
519 lwr $t10,0($i2) # Td3[s3]
520 lwr $t11,0($i3) # Td3[s0]
521
522 _xtr $i0,$s0,24-2
523 _xtr $i1,$s1,24-2
524 _xtr $i2,$s2,24-2
525 _xtr $i3,$s3,24-2
526 and $i0,0x3fc
527 and $i1,0x3fc
528 and $i2,0x3fc
529 and $i3,0x3fc
530 $PTR_ADD $i0,$Tbl
531 $PTR_ADD $i1,$Tbl
532 $PTR_ADD $i2,$Tbl
533 $PTR_ADD $i3,$Tbl
534
535 xor $t0,$t4
536 xor $t1,$t5
537 xor $t2,$t6
538 xor $t3,$t7
539
540
541 lw $t4,0($i0) # Td0[s0>>24]
542 lw $t5,0($i1) # Td0[s1>>24]
543 lw $t6,0($i2) # Td0[s2>>24]
544 lw $t7,0($i3) # Td0[s3>>24]
545
546 lw $s0,0($key0)
547 lw $s1,4($key0)
548 lw $s2,8($key0)
549 lw $s3,12($key0)
550
551 xor $t0,$t8
552 xor $t1,$t9
553 xor $t2,$t10
554 xor $t3,$t11
555
556 xor $t0,$t4
557 xor $t1,$t5
558 xor $t2,$t6
559 xor $t3,$t7
560
561 sub $cnt,1
562 $PTR_ADD $key0,16
563 xor $s0,$t0
564 xor $s1,$t1
565 xor $s2,$t2
566 xor $s3,$t3
567 .set noreorder
568 bnez $cnt,.Loop_dec
569 _xtr $i0,$s3,16-2
570
571 .set reorder
572 lw $t4,1024($Tbl) # prefetch Td4
573 lw $t5,1024+32($Tbl)
574 lw $t6,1024+64($Tbl)
575 lw $t7,1024+96($Tbl)
576 lw $t8,1024+128($Tbl)
577 lw $t9,1024+160($Tbl)
578 lw $t10,1024+192($Tbl)
579 lw $t11,1024+224($Tbl)
580
581 _xtr $i0,$s3,16
582 _xtr $i1,$s0,16
583 _xtr $i2,$s1,16
584 _xtr $i3,$s2,16
585 and $i0,0xff
586 and $i1,0xff
587 and $i2,0xff
588 and $i3,0xff
589 $PTR_ADD $i0,$Tbl
590 $PTR_ADD $i1,$Tbl
591 $PTR_ADD $i2,$Tbl
592 $PTR_ADD $i3,$Tbl
593 lbu $t0,1024($i0) # Td4[s3>>16]
594 lbu $t1,1024($i1) # Td4[s0>>16]
595 lbu $t2,1024($i2) # Td4[s1>>16]
596 lbu $t3,1024($i3) # Td4[s2>>16]
597
598 _xtr $i0,$s2,8
599 _xtr $i1,$s3,8
600 _xtr $i2,$s0,8
601 _xtr $i3,$s1,8
602 and $i0,0xff
603 and $i1,0xff
604 and $i2,0xff
605 and $i3,0xff
606 $PTR_ADD $i0,$Tbl
607 $PTR_ADD $i1,$Tbl
608 $PTR_ADD $i2,$Tbl
609 $PTR_ADD $i3,$Tbl
610 lbu $t4,1024($i0) # Td4[s2>>8]
611 lbu $t5,1024($i1) # Td4[s3>>8]
612 lbu $t6,1024($i2) # Td4[s0>>8]
613 lbu $t7,1024($i3) # Td4[s1>>8]
614
615 _xtr $i0,$s0,24
616 _xtr $i1,$s1,24
617 _xtr $i2,$s2,24
618 _xtr $i3,$s3,24
619 $PTR_ADD $i0,$Tbl
620 $PTR_ADD $i1,$Tbl
621 $PTR_ADD $i2,$Tbl
622 $PTR_ADD $i3,$Tbl
623 lbu $t8,1024($i0) # Td4[s0>>24]
624 lbu $t9,1024($i1) # Td4[s1>>24]
625 lbu $t10,1024($i2) # Td4[s2>>24]
626 lbu $t11,1024($i3) # Td4[s3>>24]
627
628 _xtr $i0,$s1,0
629 _xtr $i1,$s2,0
630 _xtr $i2,$s3,0
631 _xtr $i3,$s0,0
632
633 _ins $t0,16
634 _ins $t1,16
635 _ins $t2,16
636 _ins $t3,16
637
638 _ins $t4,8
639 _ins $t5,8
640 _ins $t6,8
641 _ins $t7,8
642
643 xor $t0,$t4
644 xor $t1,$t5
645 xor $t2,$t6
646 xor $t3,$t7
647
648 $PTR_ADD $i0,$Tbl
649 $PTR_ADD $i1,$Tbl
650 $PTR_ADD $i2,$Tbl
651 $PTR_ADD $i3,$Tbl
652 lbu $t4,1024($i0) # Td4[s1]
653 lbu $t5,1024($i1) # Td4[s2]
654 lbu $t6,1024($i2) # Td4[s3]
655 lbu $t7,1024($i3) # Td4[s0]
656
657 _ins $t8,24
658 _ins $t9,24
659 _ins $t10,24
660 _ins $t11,24
661
662 lw $s0,0($key0)
663 lw $s1,4($key0)
664 lw $s2,8($key0)
665 lw $s3,12($key0)
666
667 _ins $t4,0
668 _ins $t5,0
669 _ins $t6,0
670 _ins $t7,0
671
672
673 xor $t0,$t8
674 xor $t1,$t9
675 xor $t2,$t10
676 xor $t3,$t11
677
678 xor $t0,$t4
679 xor $t1,$t5
680 xor $t2,$t6
681 xor $t3,$t7
682
683 xor $s0,$t0
684 xor $s1,$t1
685 xor $s2,$t2
686 xor $s3,$t3
687
688 jr $ra
689.end _mips_AES_decrypt
690
691.align 5
692.globl AES_decrypt
693.ent AES_decrypt
694AES_decrypt:
695 .frame $sp,$FRAMESIZE,$ra
696 .mask $SAVED_REGS_MASK,-$SZREG
697 .set noreorder
698___
699$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
700 .cpload $pf
701___
702$code.=<<___;
703 $PTR_SUB $sp,$FRAMESIZE
704 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
705 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
706 $REG_S $s11,$FRAMESIZE-3*$SZREG($sp)
707 $REG_S $s10,$FRAMESIZE-4*$SZREG($sp)
708 $REG_S $s9,$FRAMESIZE-5*$SZREG($sp)
709 $REG_S $s8,$FRAMESIZE-6*$SZREG($sp)
710 $REG_S $s7,$FRAMESIZE-7*$SZREG($sp)
711 $REG_S $s6,$FRAMESIZE-8*$SZREG($sp)
712 $REG_S $s5,$FRAMESIZE-9*$SZREG($sp)
713 $REG_S $s4,$FRAMESIZE-10*$SZREG($sp)
714___
715$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
716 $REG_S \$15,$FRAMESIZE-11*$SZREG($sp)
717 $REG_S \$14,$FRAMESIZE-12*$SZREG($sp)
718 $REG_S \$13,$FRAMESIZE-13*$SZREG($sp)
719 $REG_S \$12,$FRAMESIZE-14*$SZREG($sp)
720 $REG_S $gp,$FRAMESIZE-15*$SZREG($sp)
721___
722$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
723 .cplocal $Tbl
724 .cpsetup $pf,$zero,AES_decrypt
725___
726$code.=<<___;
727 .set reorder
728 la $Tbl,AES_Td # PIC-ified 'load address'
729
730 lwl $s0,0+$MSB($inp)
731 lwl $s1,4+$MSB($inp)
732 lwl $s2,8+$MSB($inp)
733 lwl $s3,12+$MSB($inp)
734 lwr $s0,0+$LSB($inp)
735 lwr $s1,4+$LSB($inp)
736 lwr $s2,8+$LSB($inp)
737 lwr $s3,12+$LSB($inp)
738
739 bal _mips_AES_decrypt
740
741 swr $s0,0+$LSB($out)
742 swr $s1,4+$LSB($out)
743 swr $s2,8+$LSB($out)
744 swr $s3,12+$LSB($out)
745 swl $s0,0+$MSB($out)
746 swl $s1,4+$MSB($out)
747 swl $s2,8+$MSB($out)
748 swl $s3,12+$MSB($out)
749
750 .set noreorder
751 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
752 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
753 $REG_L $s11,$FRAMESIZE-3*$SZREG($sp)
754 $REG_L $s10,$FRAMESIZE-4*$SZREG($sp)
755 $REG_L $s9,$FRAMESIZE-5*$SZREG($sp)
756 $REG_L $s8,$FRAMESIZE-6*$SZREG($sp)
757 $REG_L $s7,$FRAMESIZE-7*$SZREG($sp)
758 $REG_L $s6,$FRAMESIZE-8*$SZREG($sp)
759 $REG_L $s5,$FRAMESIZE-9*$SZREG($sp)
760 $REG_L $s4,$FRAMESIZE-10*$SZREG($sp)
761___
762$code.=<<___ if ($flavour =~ /nubi/i);
763 $REG_L \$15,$FRAMESIZE-11*$SZREG($sp)
764 $REG_L \$14,$FRAMESIZE-12*$SZREG($sp)
765 $REG_L \$13,$FRAMESIZE-13*$SZREG($sp)
766 $REG_L \$12,$FRAMESIZE-14*$SZREG($sp)
767 $REG_L $gp,$FRAMESIZE-15*$SZREG($sp)
768___
769$code.=<<___;
770 jr $ra
771 $PTR_ADD $sp,$FRAMESIZE
772.end AES_decrypt
773___
774}}}
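
The header comment's 1K[+256B] table claim comes down to this: Te0 is stored
byte-wise (see AES_Te below), and each lwl/lwr pair reads a byte-shifted 4-byte
window of an entry, so the rotated companions Te1/Te2/Te3 are just different
windows into the same table. A rough C rendering of the effect (byte order
shown for a big-endian word view; the rot parameter is my label, not the
module's):

#include <stdint.h>

/* Rotated table lookup: one byte-stored table serves as Te0..Te3.
 * `rot` selects which byte of the stored entry lands in the MSB --
 * the work each lwl/lwr pair does with two partial loads instead of
 * a load plus a rotate. */
static uint32_t te_rotated(const uint8_t *te0, unsigned x, unsigned rot)
{
	const uint8_t *e = te0 + 4 * x;
	uint32_t w = 0;
	for (unsigned k = 0; k < 4; k++)
		w = (w << 8) | e[(k + rot) & 3];
	return w;
}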
775
776{{{
777my $FRAMESIZE=8*$SZREG;
778my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000;
779
780my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3);
781my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
782my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
783my ($rcon,$cnt)=($gp,$fp);
784
785$code.=<<___;
786.align 5
787.ent _mips_AES_set_encrypt_key
788_mips_AES_set_encrypt_key:
789 .frame $sp,0,$ra
790 .set noreorder
791 beqz $inp,.Lekey_done
792 li $t0,-1
793 beqz $key,.Lekey_done
794 $PTR_ADD $rcon,$Tbl,1024+256
795
796 .set reorder
797 lwl $rk0,0+$MSB($inp) # load 128 bits
798 lwl $rk1,4+$MSB($inp)
799 lwl $rk2,8+$MSB($inp)
800 lwl $rk3,12+$MSB($inp)
801 li $at,128
802 lwr $rk0,0+$LSB($inp)
803 lwr $rk1,4+$LSB($inp)
804 lwr $rk2,8+$LSB($inp)
805 lwr $rk3,12+$LSB($inp)
806 .set noreorder
807 beq $bits,$at,.L128bits
808 li $cnt,10
809
810 .set reorder
811 lwl $rk4,16+$MSB($inp) # load 192 bits
812 lwl $rk5,20+$MSB($inp)
813 li $at,192
814 lwr $rk4,16+$LSB($inp)
815 lwr $rk5,20+$LSB($inp)
816 .set noreorder
817 beq $bits,$at,.L192bits
818 li $cnt,8
819
820 .set reorder
821 lwl $rk6,24+$MSB($inp) # load 256 bits
822 lwl $rk7,28+$MSB($inp)
823 li $at,256
824 lwr $rk6,24+$LSB($inp)
825 lwr $rk7,28+$LSB($inp)
826 .set noreorder
827 beq $bits,$at,.L256bits
828 li $cnt,7
829
830 b .Lekey_done
831 li $t0,-2
832
833.align 4
834.L128bits:
835 .set reorder
836 srl $i0,$rk3,16
837 srl $i1,$rk3,8
838 and $i0,0xff
839 and $i1,0xff
840 and $i2,$rk3,0xff
841 srl $i3,$rk3,24
842 $PTR_ADD $i0,$Tbl
843 $PTR_ADD $i1,$Tbl
844 $PTR_ADD $i2,$Tbl
845 $PTR_ADD $i3,$Tbl
846 lbu $i0,1024($i0)
847 lbu $i1,1024($i1)
848 lbu $i2,1024($i2)
849 lbu $i3,1024($i3)
850
851 sw $rk0,0($key)
852 sw $rk1,4($key)
853 sw $rk2,8($key)
854 sw $rk3,12($key)
855 sub $cnt,1
856 $PTR_ADD $key,16
857
858 _bias $i0,24
859 _bias $i1,16
860 _bias $i2,8
861 _bias $i3,0
862
863 xor $rk0,$i0
864 lw $i0,0($rcon)
865 xor $rk0,$i1
866 xor $rk0,$i2
867 xor $rk0,$i3
868 xor $rk0,$i0
869
870 xor $rk1,$rk0
871 xor $rk2,$rk1
872 xor $rk3,$rk2
873
874 .set noreorder
875 bnez $cnt,.L128bits
876 $PTR_ADD $rcon,4
877
878 sw $rk0,0($key)
879 sw $rk1,4($key)
880 sw $rk2,8($key)
881 li $cnt,10
882 sw $rk3,12($key)
883 li $t0,0
884 sw $cnt,80($key)
885 b .Lekey_done
886 $PTR_SUB $key,10*16
887
888.align 4
889.L192bits:
890 .set reorder
891 srl $i0,$rk5,16
892 srl $i1,$rk5,8
893 and $i0,0xff
894 and $i1,0xff
895 and $i2,$rk5,0xff
896 srl $i3,$rk5,24
897 $PTR_ADD $i0,$Tbl
898 $PTR_ADD $i1,$Tbl
899 $PTR_ADD $i2,$Tbl
900 $PTR_ADD $i3,$Tbl
901 lbu $i0,1024($i0)
902 lbu $i1,1024($i1)
903 lbu $i2,1024($i2)
904 lbu $i3,1024($i3)
905
906 sw $rk0,0($key)
907 sw $rk1,4($key)
908 sw $rk2,8($key)
909 sw $rk3,12($key)
910 sw $rk4,16($key)
911 sw $rk5,20($key)
912 sub $cnt,1
913 $PTR_ADD $key,24
914
915 _bias $i0,24
916 _bias $i1,16
917 _bias $i2,8
918 _bias $i3,0
919
920 xor $rk0,$i0
921 lw $i0,0($rcon)
922 xor $rk0,$i1
923 xor $rk0,$i2
924 xor $rk0,$i3
925 xor $rk0,$i0
926
927 xor $rk1,$rk0
928 xor $rk2,$rk1
929 xor $rk3,$rk2
930 xor $rk4,$rk3
931 xor $rk5,$rk4
932
933 .set noreorder
934 bnez $cnt,.L192bits
935 $PTR_ADD $rcon,4
936
937 sw $rk0,0($key)
938 sw $rk1,4($key)
939 sw $rk2,8($key)
940 li $cnt,12
941 sw $rk3,12($key)
942 li $t0,0
943 sw $cnt,48($key)
944 b .Lekey_done
945 $PTR_SUB $key,12*16
946
947.align 4
948.L256bits:
949 .set reorder
950 srl $i0,$rk7,16
951 srl $i1,$rk7,8
952 and $i0,0xff
953 and $i1,0xff
954 and $i2,$rk7,0xff
955 srl $i3,$rk7,24
956 $PTR_ADD $i0,$Tbl
957 $PTR_ADD $i1,$Tbl
958 $PTR_ADD $i2,$Tbl
959 $PTR_ADD $i3,$Tbl
960 lbu $i0,1024($i0)
961 lbu $i1,1024($i1)
962 lbu $i2,1024($i2)
963 lbu $i3,1024($i3)
964
965 sw $rk0,0($key)
966 sw $rk1,4($key)
967 sw $rk2,8($key)
968 sw $rk3,12($key)
969 sw $rk4,16($key)
970 sw $rk5,20($key)
971 sw $rk6,24($key)
972 sw $rk7,28($key)
973 sub $cnt,1
974
975 _bias $i0,24
976 _bias $i1,16
977 _bias $i2,8
978 _bias $i3,0
979
980 xor $rk0,$i0
981 lw $i0,0($rcon)
982 xor $rk0,$i1
983 xor $rk0,$i2
984 xor $rk0,$i3
985 xor $rk0,$i0
986
987 xor $rk1,$rk0
988 xor $rk2,$rk1
989 xor $rk3,$rk2
990 beqz $cnt,.L256bits_done
991
992 srl $i0,$rk3,24
993 srl $i1,$rk3,16
994 srl $i2,$rk3,8
995 and $i3,$rk3,0xff
996 and $i1,0xff
997 and $i2,0xff
998 $PTR_ADD $i0,$Tbl
999 $PTR_ADD $i1,$Tbl
1000 $PTR_ADD $i2,$Tbl
1001 $PTR_ADD $i3,$Tbl
1002 lbu $i0,1024($i0)
1003 lbu $i1,1024($i1)
1004 lbu $i2,1024($i2)
1005 lbu $i3,1024($i3)
1006 sll $i0,24
1007 sll $i1,16
1008 sll $i2,8
1009
1010 xor $rk4,$i0
1011 xor $rk4,$i1
1012 xor $rk4,$i2
1013 xor $rk4,$i3
1014
1015 xor $rk5,$rk4
1016 xor $rk6,$rk5
1017 xor $rk7,$rk6
1018
1019 $PTR_ADD $key,32
1020 .set noreorder
1021 b .L256bits
1022 $PTR_ADD $rcon,4
1023
1024.L256bits_done:
1025 sw $rk0,32($key)
1026 sw $rk1,36($key)
1027 sw $rk2,40($key)
1028 li $cnt,14
1029 sw $rk3,44($key)
1030 li $t0,0
1031 sw $cnt,48($key)
1032 $PTR_SUB $key,12*16
1033
1034.Lekey_done:
1035 jr $ra
1036 nop
1037.end _mips_AES_set_encrypt_key
1038
1039.globl AES_set_encrypt_key
1040.ent AES_set_encrypt_key
1041AES_set_encrypt_key:
1042 .frame $sp,$FRAMESIZE,$ra
1043 .mask $SAVED_REGS_MASK,-$SZREG
1044 .set noreorder
1045___
1046$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
1047 .cpload $pf
1048___
1049$code.=<<___;
1050 $PTR_SUB $sp,$FRAMESIZE
1051 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
1052 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
1053___
1054$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1055 $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
1056 $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
1057 $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
1058 $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
1059 $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
1060___
1061$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
1062 .cplocal $Tbl
1063 .cpsetup $pf,$zero,AES_set_encrypt_key
1064___
1065$code.=<<___;
1066 .set reorder
1067 la $Tbl,AES_Te # PIC-ified 'load address'
1068
1069 bal _mips_AES_set_encrypt_key
1070
1071 .set noreorder
1072 move $a0,$t0
1073 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
1074 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
1075___
1076$code.=<<___ if ($flavour =~ /nubi/i);
1077	$REG_L	$s3,$FRAMESIZE-3*$SZREG($sp)
1078	$REG_L	$s2,$FRAMESIZE-4*$SZREG($sp)
1079	$REG_L	$s1,$FRAMESIZE-5*$SZREG($sp)
1080	$REG_L	$s0,$FRAMESIZE-6*$SZREG($sp)
1081	$REG_L	$gp,$FRAMESIZE-7*$SZREG($sp)
1082___
1083$code.=<<___;
1084 jr $ra
1085 $PTR_ADD $sp,$FRAMESIZE
1086.end AES_set_encrypt_key
1087___
1088
1089my ($head,$tail)=($inp,$bits);
1090my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
1091my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
1092$code.=<<___;
1093.align 5
1094.globl AES_set_decrypt_key
1095.ent AES_set_decrypt_key
1096AES_set_decrypt_key:
1097 .frame $sp,$FRAMESIZE,$ra
1098 .mask $SAVED_REGS_MASK,-$SZREG
1099 .set noreorder
1100___
1101$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification
1102 .cpload $pf
1103___
1104$code.=<<___;
1105 $PTR_SUB $sp,$FRAMESIZE
1106 $REG_S $ra,$FRAMESIZE-1*$SZREG($sp)
1107 $REG_S $fp,$FRAMESIZE-2*$SZREG($sp)
1108___
1109$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
1110 $REG_S $s3,$FRAMESIZE-3*$SZREG($sp)
1111 $REG_S $s2,$FRAMESIZE-4*$SZREG($sp)
1112 $REG_S $s1,$FRAMESIZE-5*$SZREG($sp)
1113 $REG_S $s0,$FRAMESIZE-6*$SZREG($sp)
1114 $REG_S $gp,$FRAMESIZE-7*$SZREG($sp)
1115___
1116$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
1117 .cplocal $Tbl
1118 .cpsetup $pf,$zero,AES_set_decrypt_key
1119___
1120$code.=<<___;
1121 .set reorder
1122 la $Tbl,AES_Te # PIC-ified 'load address'
1123
1124 bal _mips_AES_set_encrypt_key
1125
1126 bltz $t0,.Ldkey_done
1127
1128 sll $at,$cnt,4
1129 $PTR_ADD $head,$key,0
1130 $PTR_ADD $tail,$key,$at
1131.align 4
1132.Lswap:
1133 lw $rk0,0($head)
1134 lw $rk1,4($head)
1135 lw $rk2,8($head)
1136 lw $rk3,12($head)
1137 lw $rk4,0($tail)
1138 lw $rk5,4($tail)
1139 lw $rk6,8($tail)
1140 lw $rk7,12($tail)
1141 sw $rk0,0($tail)
1142 sw $rk1,4($tail)
1143 sw $rk2,8($tail)
1144 sw $rk3,12($tail)
1145 $PTR_ADD $head,16
1146 $PTR_SUB $tail,16
1147 sw $rk4,-16($head)
1148 sw $rk5,-12($head)
1149 sw $rk6,-8($head)
1150 sw $rk7,-4($head)
1151 bne $head,$tail,.Lswap
1152
1153 lw $tp1,16($key) # modulo-scheduled
1154 lui $x80808080,0x8080
1155 sub $cnt,1
1156 or $x80808080,0x8080
1157 sll $cnt,2
1158 $PTR_ADD $key,16
1159 lui $x1b1b1b1b,0x1b1b
1160 nor $x7f7f7f7f,$zero,$x80808080
1161 or $x1b1b1b1b,0x1b1b
1162.align 4
1163.Lmix:
1164 and $m,$tp1,$x80808080
1165 and $tp2,$tp1,$x7f7f7f7f
1166 srl $tp4,$m,7
1167 addu $tp2,$tp2 # tp2<<1
1168 subu $m,$tp4
1169 and $m,$x1b1b1b1b
1170 xor $tp2,$m
1171
1172 and $m,$tp2,$x80808080
1173 and $tp4,$tp2,$x7f7f7f7f
1174 srl $tp8,$m,7
1175 addu $tp4,$tp4 # tp4<<1
1176 subu $m,$tp8
1177 and $m,$x1b1b1b1b
1178 xor $tp4,$m
1179
1180 and $m,$tp4,$x80808080
1181 and $tp8,$tp4,$x7f7f7f7f
1182 srl $tp9,$m,7
1183 addu $tp8,$tp8 # tp8<<1
1184 subu $m,$tp9
1185 and $m,$x1b1b1b1b
1186 xor $tp8,$m
1187
1188 xor $tp9,$tp8,$tp1
1189 xor $tpe,$tp8,$tp4
1190 xor $tpb,$tp9,$tp2
1191 xor $tpd,$tp9,$tp4
1192
1193 _ror $tp1,$tpd,16
1194 xor $tpe,$tp2
1195 _ror $tp2,$tpd,-16
1196 xor $tpe,$tp1
1197 _ror $tp1,$tp9,8
1198 xor $tpe,$tp2
1199 _ror $tp2,$tp9,-24
1200 xor $tpe,$tp1
1201 _ror $tp1,$tpb,24
1202 xor $tpe,$tp2
1203 _ror $tp2,$tpb,-8
1204 xor $tpe,$tp1
1205 lw $tp1,4($key) # modulo-scheduled
1206 xor $tpe,$tp2
1207 sub $cnt,1
1208 sw $tpe,0($key)
1209 $PTR_ADD $key,4
1210 bnez $cnt,.Lmix
1211
1212 li $t0,0
1213.Ldkey_done:
1214 .set noreorder
1215 move $a0,$t0
1216 $REG_L $ra,$FRAMESIZE-1*$SZREG($sp)
1217 $REG_L $fp,$FRAMESIZE-2*$SZREG($sp)
1218___
1219$code.=<<___ if ($flavour =~ /nubi/i);
1220	$REG_L	$s3,$FRAMESIZE-3*$SZREG($sp)
1221	$REG_L	$s2,$FRAMESIZE-4*$SZREG($sp)
1222	$REG_L	$s1,$FRAMESIZE-5*$SZREG($sp)
1223	$REG_L	$s0,$FRAMESIZE-6*$SZREG($sp)
1224	$REG_L	$gp,$FRAMESIZE-7*$SZREG($sp)
1225___
1226$code.=<<___;
1227 jr $ra
1228 $PTR_ADD $sp,$FRAMESIZE
1229.end AES_set_decrypt_key
1230___
1231}}}
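
The .Lmix loop above is the packed InvMixColumns pass that turns an encryption
key schedule into a decryption one. One iteration amounts to the following C,
a direct transcription of the mask arithmetic (rotr32 and the function names
are mine):

#include <stdint.h>

/* Four GF(2^8) doublings at once: exactly the
 * 0x80808080/0x7f7f7f7f/0x1b1b1b1b dance in .Lmix. */
static uint32_t xtime4(uint32_t x)
{
	uint32_t m = x & 0x80808080u;
	return ((x & 0x7f7f7f7fu) << 1) ^ ((m - (m >> 7)) & 0x1b1b1b1bu);
}

static uint32_t rotr32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

/* One .Lmix iteration: InvMixColumns on a single round-key word. */
static uint32_t inv_mix_word(uint32_t tp1)
{
	uint32_t tp2 = xtime4(tp1);
	uint32_t tp4 = xtime4(tp2);
	uint32_t tp8 = xtime4(tp4);
	uint32_t tp9 = tp8 ^ tp1;
	uint32_t tpb = tp9 ^ tp2;
	uint32_t tpd = tp9 ^ tp4;
	uint32_t tpe = tp8 ^ tp4 ^ tp2;

	return tpe ^ rotr32(tpd, 16) ^ rotr32(tp9, 8) ^ rotr32(tpb, 24);
}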
1232
1233######################################################################
1234# Tables are kept in endian-neutral manner
1235$code.=<<___;
1236.rdata
1237.align 6
1238AES_Te:
1239.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0
1240.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d
1241.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd
1242.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54
1243.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03
1244.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d
1245.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62
1246.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a
1247.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d
1248.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87
1249.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb
1250.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b
1251.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67
1252.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea
1253.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7
1254.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b
1255.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c
1256.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a
1257.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41
1258.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f
1259.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4
1260.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08
1261.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73
1262.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f
1263.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52
1264.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e
1265.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1
1266.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5
1267.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36
1268.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d
1269.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69
1270.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f
1271.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e
1272.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e
1273.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2
1274.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb
1275.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d
1276.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce
1277.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e
1278.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97
1279.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68
1280.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c
1281.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f
1282.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed
1283.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46
1284.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b
1285.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4
1286.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a
1287.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a
1288.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16
1289.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7
1290.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94
1291.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10
1292.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81
1293.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44
1294.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3
1295.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe
1296.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a
1297.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc
1298.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04
1299.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1
1300.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63
1301.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a
1302.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d
1303.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14
1304.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f
1305.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2
1306.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39
1307.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2
1308.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47
1309.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7
1310.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95
1311.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98
1312.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f
1313.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e
1314.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83
1315.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29
1316.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c
1317.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2
1318.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76
1319.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56
1320.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e
1321.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a
1322.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4
1323.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e
1324.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6
1325.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4
1326.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b
1327.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43
1328.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7
1329.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64
1330.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0
1331.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa
1332.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25
1333.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e
1334.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18
1335.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88
1336.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72
1337.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1
1338.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51
1339.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c
1340.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21
1341.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc
1342.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85
1343.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42
1344.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa
1345.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05
1346.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12
1347.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f
1348.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0
1349.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58
1350.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9
1351.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13
1352.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33
1353.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70
1354.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7
1355.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22
1356.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20
1357.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff
1358.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a
1359.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8
1360.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17
1361.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31
1362.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8
1363.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0
1364.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11
1365.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc
1366.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a
1367
1368.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4
1369.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
1370.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
1371.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
1372.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
1373.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
1374.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
1375.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
1376.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
1377.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
1378.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
1379.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
1380.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
1381.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
1382.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
1383.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
1384.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
1385.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
1386.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
1387.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
1388.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
1389.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
1390.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
1391.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
1392.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
1393.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
1394.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
1395.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
1396.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
1397.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
1398.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
1399.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
1400
1401.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon
1402.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00
1403.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00
1404.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00
1405.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00
1406
1407.align 6
1408AES_Td:
1409.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0
1410.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96
1411.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1
1412.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93
1413.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6
1414.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25
1415.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7
1416.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f
1417.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67
1418.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1
1419.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12
1420.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6
1421.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95
1422.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda
1423.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3
1424.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44
1425.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78
1426.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd
1427.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17
1428.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4
1429.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82
1430.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45
1431.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84
1432.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94
1433.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19
1434.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7
1435.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2
1436.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a
1437.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03
1438.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5
1439.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2
1440.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c
1441.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92
1442.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1
1443.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5
1444.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a
1445.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0
1446.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75
1447.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa
1448.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51
1449.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d
1450.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46
1451.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05
1452.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff
1453.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97
1454.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77
1455.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88
1456.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb
1457.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9
1458.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00
1459.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48
1460.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e
1461.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56
1462.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27
1463.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21
1464.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a
1465.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f
1466.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e
1467.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2
1468.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16
1469.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5
1470.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d
1471.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad
1472.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8
1473.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c
1474.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd
1475.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc
1476.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34
1477.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc
1478.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63
1479.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10
1480.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20
1481.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8
1482.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d
1483.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3
1484.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0
1485.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99
1486.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22
1487.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a
1488.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef
1489.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1
1490.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36
1491.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28
1492.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4
1493.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d
1494.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62
1495.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8
1496.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5
1497.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c
1498.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3
1499.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7
1500.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b
1501.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4
1502.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8
1503.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e
1504.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6
1505.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce
1506.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6
1507.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31
1508.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0
1509.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6
1510.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15
1511.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7
1512.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f
1513.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d
1514.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf
1515.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b
1516.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f
1517.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d
1518.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e
1519.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52
1520.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13
1521.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a
1522.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89
1523.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35
1524.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c
1525.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f
1526.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf
1527.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b
1528.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86
1529.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e
1530.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f
1531.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c
1532.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41
1533.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde
1534.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90
1535.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70
1536.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42
1537
1538.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4
1539.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
1540.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
1541.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
1542.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
1543.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
1544.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
1545.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
1546.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
1547.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
1548.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
1549.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
1550.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
1551.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
1552.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
1553.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
1554.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
1555.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
1556.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
1557.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
1558.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
1559.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
1560.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
1561.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
1562.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
1563.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
1564.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1565.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1566.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1567.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1568.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1569.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1570___
1571
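
The rcon table near the end of the data above is easy to sanity-check: each round constant is the previous one doubled in GF(2^8), reduced by the AES polynomial 0x11b when the high bit overflows. A standalone Perl sketch (not part of the module) that reproduces the leading byte of each of the ten entries:

	my @rcon = (0x01);
	push @rcon, ($rcon[-1] & 0x80) ? (($rcon[-1] << 1) ^ 0x11b)
	                               : ($rcon[-1] << 1)  for 2 .. 10;
	printf "0x%02X ", $_ for @rcon;  # 0x01 0x02 ... 0x80 0x1B 0x36

Only the first byte of each four-byte group is non-zero; it is XORed into the first byte of the key schedule word once per expansion round.
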
1572foreach (split("\n",$code)) {
1573 s/\`([^\`]*)\`/eval $1/ge;
1574
1575	# made-up instructions _xtr, _ins, _ror and _bias cope
1576	# with byte order dependencies...
1577 if (/^\s+_/) {
1578 s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/;
1579
1580 s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/
1581 sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
1582 : eval("24-$3"))/e or
1583 s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
1584 sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
1585 : eval("24-$3"))/e or
1586 s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/
1587 sprintf("srl\t$1,$2,%d",$big_endian ? eval($3)
1588 : eval("$3*-1"))/e or
1589 s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
1590 sprintf("sll\t$1,$2,%d",$big_endian ? eval($3)
1591 : eval("($3-16)&31"))/e;
1592
1593 s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/
1594 sprintf("sll\t$1,$2,$3")/e or
1595 s/srl\s+(\$[0-9]+),(\$[0-9]+),0/
1596 sprintf("and\t$1,$2,0xff")/e or
1597 s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/;
1598 }
1599
1600 # convert lwl/lwr and swr/swl to little-endian order
1601 if (!$big_endian && /^\s+[sl]w[lr]\s+/) {
1602 s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/
1603 sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or
1604 s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/
1605 sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e;
1606 }
1607
1608 print $_,"\n";
1609}
1610
1611close STDOUT;
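
To see what the byte-order rules above actually emit, here is a condensed, standalone sketch (the sample line and register are made up) that feeds one two-operand _xtr through the same pipeline for both byte orders:

	#!/usr/bin/env perl
	for my $big_endian (1, 0) {
	    local $_ = "\t_xtr\t\$9,24";
	    # duplicate the register operand: "_xtr $9,24" -> "_xtr $9,$9,24"
	    s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/;
	    # _xtr becomes a plain srl; little-endian flips the shift count
	    s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
	        sprintf("srl\t$1,$2,%d",$big_endian ? $3 : 24-$3)/e;
	    # a zero-bit shift degenerates to masking out the low byte
	    s/srl\s+(\$[0-9]+),(\$[0-9]+),0$/sprintf("and\t$1,$2,0xff")/e;
	    print $big_endian ? "big-endian:\t" : "little-endian:\t", $_, "\n";
	}
	# big-endian:     srl $9,$9,24
	# little-endian:  and $9,$9,0xff

The same pattern covers _ins, _ror and _bias: the abstract byte position is fixed once, at translation time, so the round code itself stays endian-neutral.
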
diff --git a/src/lib/libcrypto/aes/asm/aes-parisc.pl b/src/lib/libcrypto/aes/asm/aes-parisc.pl
new file mode 100644
index 0000000000..c36b6a2270
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-parisc.pl
@@ -0,0 +1,1021 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for PA-RISC.
11#
12# June 2009.
13#
14# The module is a mechanical transliteration of aes-sparcv9.pl, but
15# with a twist: S-boxes are compressed even further, down to 1K+256B.
16# On PA-7100LC performance is ~40% better than gcc 3.2 generated code
17# and is about 33 cycles per byte processed with a 128-bit key. Newer
18# CPUs perform at 16 cycles per byte. It's not faster than code
19# generated by the vendor compiler, but recall that it has compressed
20# S-boxes, which require extra processing.
21#
22# Special thanks to polarhome.com for providing HP-UX account.
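
The "1K+256B" figure means one 1KB table of 32-bit words plus the raw 256-byte S-box for the last round (per direction); the other three tables of the classic 4KB layout are byte rotations of the stored one, which is why the round code below follows almost every table load with a _ror. A hedged plain-Perl sketch of one column of a middle round (@Te0 is assumed to hold the 256 words of the L\$AES_Te data below, first entry 0xc66363a5):

	our @Te0;    # assumed: the 256 Te0 words from L$AES_Te
	sub ror32 { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }
	sub enc_column {
	    my ($b0,$b1,$b2,$b3,$rk) = @_;      # a diagonal of state bytes + key word
	    return $rk ^ $Te0[$b0]              # the one table actually stored
	               ^ ror32($Te0[$b1],  8)   # stands in for Te1
	               ^ ror32($Te0[$b2], 16)   # stands in for Te2
	               ^ ror32($Te0[$b3], 24);  # stands in for Te3
	}

Each table dropped costs one rotate per lookup; that is the "extra processing" the paragraph above refers to.
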
23
24$flavour = shift;
25$output = shift;
26open STDOUT,">$output";
27
28if ($flavour =~ /64/) {
29 $LEVEL ="2.0W";
30 $SIZE_T =8;
31 $FRAME_MARKER =80;
32 $SAVED_RP =16;
33 $PUSH ="std";
34 $PUSHMA ="std,ma";
35 $POP ="ldd";
36 $POPMB ="ldd,mb";
37} else {
38 $LEVEL ="1.0";
39 $SIZE_T =4;
40 $FRAME_MARKER =48;
41 $SAVED_RP =20;
42 $PUSH ="stw";
43 $PUSHMA ="stwm";
44 $POP ="ldw";
45 $POPMB ="ldwm";
46}
47
48$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
49 # [+ argument transfer]
50$inp="%r26"; # arg0
51$out="%r25"; # arg1
52$key="%r24"; # arg2
53
54($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4");
55($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8");
56
57($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7,
58 $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) =
59("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16",
60"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26");
61
62$tbl="%r28";
63$rounds="%r29";
64
65$code=<<___;
66 .LEVEL $LEVEL
67 .SPACE \$TEXT\$
68 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
69
70 .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
71 .ALIGN 64
72AES_encrypt
73 .PROC
74 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
75 .ENTRY
76 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
77 $PUSHMA %r3,$FRAME(%sp)
78 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
79 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
80 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
81 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
82 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
83 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
84 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
85 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
86 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
87 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
88 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
89 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
90 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
91 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
92 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
93
94 blr %r0,$tbl
95 ldi 3,$t0
96L\$enc_pic
97 andcm $tbl,$t0,$tbl
98 ldo L\$AES_Te-L\$enc_pic($tbl),$tbl
99
100 and $inp,$t0,$t0
101 sub $inp,$t0,$inp
102 ldw 0($inp),$s0
103 ldw 4($inp),$s1
104 ldw 8($inp),$s2
105 comib,= 0,$t0,L\$enc_inp_aligned
106 ldw 12($inp),$s3
107
108 sh3addl $t0,%r0,$t0
109 subi 32,$t0,$t0
110 mtctl $t0,%cr11
111 ldw 16($inp),$t1
112 vshd $s0,$s1,$s0
113 vshd $s1,$s2,$s1
114 vshd $s2,$s3,$s2
115 vshd $s3,$t1,$s3
116
117L\$enc_inp_aligned
118 bl _parisc_AES_encrypt,%r31
119 nop
120
121 extru,<> $out,31,2,%r0
122 b L\$enc_out_aligned
123 nop
124
125 _srm $s0,24,$acc0
126 _srm $s0,16,$acc1
127 stb $acc0,0($out)
128 _srm $s0,8,$acc2
129 stb $acc1,1($out)
130 _srm $s1,24,$acc4
131 stb $acc2,2($out)
132 _srm $s1,16,$acc5
133 stb $s0,3($out)
134 _srm $s1,8,$acc6
135 stb $acc4,4($out)
136 _srm $s2,24,$acc0
137 stb $acc5,5($out)
138 _srm $s2,16,$acc1
139 stb $acc6,6($out)
140 _srm $s2,8,$acc2
141 stb $s1,7($out)
142 _srm $s3,24,$acc4
143 stb $acc0,8($out)
144 _srm $s3,16,$acc5
145 stb $acc1,9($out)
146 _srm $s3,8,$acc6
147 stb $acc2,10($out)
148 stb $s2,11($out)
149 stb $acc4,12($out)
150 stb $acc5,13($out)
151 stb $acc6,14($out)
152 b L\$enc_done
153 stb $s3,15($out)
154
155L\$enc_out_aligned
156 stw $s0,0($out)
157 stw $s1,4($out)
158 stw $s2,8($out)
159 stw $s3,12($out)
160
161L\$enc_done
162 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
163 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
164 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
165 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
166 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
167 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
168 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
169 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
170 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
171 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
172 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
173 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
174 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
175 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
176 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
177 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
178 bv (%r2)
179 .EXIT
180 $POPMB -$FRAME(%sp),%r3
181 .PROCEND
182
183 .ALIGN 16
184_parisc_AES_encrypt
185 .PROC
186 .CALLINFO MILLICODE
187 .ENTRY
188 ldw 240($key),$rounds
189 ldw 0($key),$t0
190 ldw 4($key),$t1
191 ldw 8($key),$t2
192 _srm $rounds,1,$rounds
193 xor $t0,$s0,$s0
194 ldw 12($key),$t3
195 _srm $s0,24,$acc0
196 xor $t1,$s1,$s1
197 ldw 16($key),$t0
198 _srm $s1,16,$acc1
199 xor $t2,$s2,$s2
200 ldw 20($key),$t1
201 xor $t3,$s3,$s3
202 ldw 24($key),$t2
203 ldw 28($key),$t3
204L\$enc_loop
205 _srm $s2,8,$acc2
206 ldwx,s $acc0($tbl),$acc0
207 _srm $s3,0,$acc3
208 ldwx,s $acc1($tbl),$acc1
209 _srm $s1,24,$acc4
210 ldwx,s $acc2($tbl),$acc2
211 _srm $s2,16,$acc5
212 ldwx,s $acc3($tbl),$acc3
213 _srm $s3,8,$acc6
214 ldwx,s $acc4($tbl),$acc4
215 _srm $s0,0,$acc7
216 ldwx,s $acc5($tbl),$acc5
217 _srm $s2,24,$acc8
218 ldwx,s $acc6($tbl),$acc6
219 _srm $s3,16,$acc9
220 ldwx,s $acc7($tbl),$acc7
221 _srm $s0,8,$acc10
222 ldwx,s $acc8($tbl),$acc8
223 _srm $s1,0,$acc11
224 ldwx,s $acc9($tbl),$acc9
225 _srm $s3,24,$acc12
226 ldwx,s $acc10($tbl),$acc10
227 _srm $s0,16,$acc13
228 ldwx,s $acc11($tbl),$acc11
229 _srm $s1,8,$acc14
230 ldwx,s $acc12($tbl),$acc12
231 _srm $s2,0,$acc15
232 ldwx,s $acc13($tbl),$acc13
233 ldwx,s $acc14($tbl),$acc14
234 ldwx,s $acc15($tbl),$acc15
235 addib,= -1,$rounds,L\$enc_last
236 ldo 32($key),$key
237
238 _ror $acc1,8,$acc1
239 xor $acc0,$t0,$t0
240 ldw 0($key),$s0
241 _ror $acc2,16,$acc2
242 xor $acc1,$t0,$t0
243 ldw 4($key),$s1
244 _ror $acc3,24,$acc3
245 xor $acc2,$t0,$t0
246 ldw 8($key),$s2
247 _ror $acc5,8,$acc5
248 xor $acc3,$t0,$t0
249 ldw 12($key),$s3
250 _ror $acc6,16,$acc6
251 xor $acc4,$t1,$t1
252 _ror $acc7,24,$acc7
253 xor $acc5,$t1,$t1
254 _ror $acc9,8,$acc9
255 xor $acc6,$t1,$t1
256 _ror $acc10,16,$acc10
257 xor $acc7,$t1,$t1
258 _ror $acc11,24,$acc11
259 xor $acc8,$t2,$t2
260 _ror $acc13,8,$acc13
261 xor $acc9,$t2,$t2
262 _ror $acc14,16,$acc14
263 xor $acc10,$t2,$t2
264 _ror $acc15,24,$acc15
265 xor $acc11,$t2,$t2
266 xor $acc12,$acc14,$acc14
267 xor $acc13,$t3,$t3
268 _srm $t0,24,$acc0
269 xor $acc14,$t3,$t3
270 _srm $t1,16,$acc1
271 xor $acc15,$t3,$t3
272
273 _srm $t2,8,$acc2
274 ldwx,s $acc0($tbl),$acc0
275 _srm $t3,0,$acc3
276 ldwx,s $acc1($tbl),$acc1
277 _srm $t1,24,$acc4
278 ldwx,s $acc2($tbl),$acc2
279 _srm $t2,16,$acc5
280 ldwx,s $acc3($tbl),$acc3
281 _srm $t3,8,$acc6
282 ldwx,s $acc4($tbl),$acc4
283 _srm $t0,0,$acc7
284 ldwx,s $acc5($tbl),$acc5
285 _srm $t2,24,$acc8
286 ldwx,s $acc6($tbl),$acc6
287 _srm $t3,16,$acc9
288 ldwx,s $acc7($tbl),$acc7
289 _srm $t0,8,$acc10
290 ldwx,s $acc8($tbl),$acc8
291 _srm $t1,0,$acc11
292 ldwx,s $acc9($tbl),$acc9
293 _srm $t3,24,$acc12
294 ldwx,s $acc10($tbl),$acc10
295 _srm $t0,16,$acc13
296 ldwx,s $acc11($tbl),$acc11
297 _srm $t1,8,$acc14
298 ldwx,s $acc12($tbl),$acc12
299 _srm $t2,0,$acc15
300 ldwx,s $acc13($tbl),$acc13
301 _ror $acc1,8,$acc1
302 ldwx,s $acc14($tbl),$acc14
303
304 _ror $acc2,16,$acc2
305 xor $acc0,$s0,$s0
306 ldwx,s $acc15($tbl),$acc15
307 _ror $acc3,24,$acc3
308 xor $acc1,$s0,$s0
309 ldw 16($key),$t0
310 _ror $acc5,8,$acc5
311 xor $acc2,$s0,$s0
312 ldw 20($key),$t1
313 _ror $acc6,16,$acc6
314 xor $acc3,$s0,$s0
315 ldw 24($key),$t2
316 _ror $acc7,24,$acc7
317 xor $acc4,$s1,$s1
318 ldw 28($key),$t3
319 _ror $acc9,8,$acc9
320 xor $acc5,$s1,$s1
321 ldw 1024+0($tbl),%r0 ; prefetch te4
322 _ror $acc10,16,$acc10
323 xor $acc6,$s1,$s1
324 ldw 1024+32($tbl),%r0 ; prefetch te4
325 _ror $acc11,24,$acc11
326 xor $acc7,$s1,$s1
327 ldw 1024+64($tbl),%r0 ; prefetch te4
328 _ror $acc13,8,$acc13
329 xor $acc8,$s2,$s2
330 ldw 1024+96($tbl),%r0 ; prefetch te4
331 _ror $acc14,16,$acc14
332 xor $acc9,$s2,$s2
333 ldw 1024+128($tbl),%r0 ; prefetch te4
334 _ror $acc15,24,$acc15
335 xor $acc10,$s2,$s2
336 ldw 1024+160($tbl),%r0 ; prefetch te4
337 _srm $s0,24,$acc0
338 xor $acc11,$s2,$s2
339 ldw 1024+192($tbl),%r0 ; prefetch te4
340 xor $acc12,$acc14,$acc14
341 xor $acc13,$s3,$s3
342 ldw 1024+224($tbl),%r0 ; prefetch te4
343 _srm $s1,16,$acc1
344 xor $acc14,$s3,$s3
345 b L\$enc_loop
346 xor $acc15,$s3,$s3
347
348 .ALIGN 16
349L\$enc_last
350 ldo 1024($tbl),$rounds
351 _ror $acc1,8,$acc1
352 xor $acc0,$t0,$t0
353 ldw 0($key),$s0
354 _ror $acc2,16,$acc2
355 xor $acc1,$t0,$t0
356 ldw 4($key),$s1
357 _ror $acc3,24,$acc3
358 xor $acc2,$t0,$t0
359 ldw 8($key),$s2
360 _ror $acc5,8,$acc5
361 xor $acc3,$t0,$t0
362 ldw 12($key),$s3
363 _ror $acc6,16,$acc6
364 xor $acc4,$t1,$t1
365 _ror $acc7,24,$acc7
366 xor $acc5,$t1,$t1
367 _ror $acc9,8,$acc9
368 xor $acc6,$t1,$t1
369 _ror $acc10,16,$acc10
370 xor $acc7,$t1,$t1
371 _ror $acc11,24,$acc11
372 xor $acc8,$t2,$t2
373 _ror $acc13,8,$acc13
374 xor $acc9,$t2,$t2
375 _ror $acc14,16,$acc14
376 xor $acc10,$t2,$t2
377 _ror $acc15,24,$acc15
378 xor $acc11,$t2,$t2
379 xor $acc12,$acc14,$acc14
380 xor $acc13,$t3,$t3
381 _srm $t0,24,$acc0
382 xor $acc14,$t3,$t3
383 _srm $t1,16,$acc1
384 xor $acc15,$t3,$t3
385
386 _srm $t2,8,$acc2
387 ldbx $acc0($rounds),$acc0
388 _srm $t1,24,$acc4
389 ldbx $acc1($rounds),$acc1
390 _srm $t2,16,$acc5
391 _srm $t3,0,$acc3
392 ldbx $acc2($rounds),$acc2
393 ldbx $acc3($rounds),$acc3
394 _srm $t3,8,$acc6
395 ldbx $acc4($rounds),$acc4
396 _srm $t2,24,$acc8
397 ldbx $acc5($rounds),$acc5
398 _srm $t3,16,$acc9
399 _srm $t0,0,$acc7
400 ldbx $acc6($rounds),$acc6
401 ldbx $acc7($rounds),$acc7
402 _srm $t0,8,$acc10
403 ldbx $acc8($rounds),$acc8
404 _srm $t3,24,$acc12
405 ldbx $acc9($rounds),$acc9
406 _srm $t0,16,$acc13
407 _srm $t1,0,$acc11
408 ldbx $acc10($rounds),$acc10
409 _srm $t1,8,$acc14
410 ldbx $acc11($rounds),$acc11
411 ldbx $acc12($rounds),$acc12
412 ldbx $acc13($rounds),$acc13
413 _srm $t2,0,$acc15
414 ldbx $acc14($rounds),$acc14
415
416 dep $acc0,7,8,$acc3
417 ldbx $acc15($rounds),$acc15
418 dep $acc4,7,8,$acc7
419 dep $acc1,15,8,$acc3
420 dep $acc5,15,8,$acc7
421 dep $acc2,23,8,$acc3
422 dep $acc6,23,8,$acc7
423 xor $acc3,$s0,$s0
424 xor $acc7,$s1,$s1
425 dep $acc8,7,8,$acc11
426 dep $acc12,7,8,$acc15
427 dep $acc9,15,8,$acc11
428 dep $acc13,15,8,$acc15
429 dep $acc10,23,8,$acc11
430 dep $acc14,23,8,$acc15
431 xor $acc11,$s2,$s2
432
433 bv (%r31)
434 .EXIT
435 xor $acc15,$s3,$s3
436 .PROCEND
437
438 .ALIGN 64
439L\$AES_Te
440 .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
441 .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
442 .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
443 .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
444 .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
445 .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
446 .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
447 .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
448 .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
449 .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
450 .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
451 .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
452 .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
453 .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
454 .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
455 .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
456 .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
457 .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
458 .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
459 .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
460 .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
461 .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
462 .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
463 .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
464 .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
465 .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
466 .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
467 .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
468 .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
469 .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
470 .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
471 .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
472 .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
473 .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
474 .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
475 .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
476 .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
477 .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
478 .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
479 .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
480 .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
481 .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
482 .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
483 .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
484 .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
485 .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
486 .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
487 .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
488 .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
489 .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
490 .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
491 .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
492 .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
493 .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
494 .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
495 .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
496 .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
497 .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
498 .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
499 .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
500 .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
501 .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
502 .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
503 .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
504 .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
505 .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
506 .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
507 .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
508 .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
509 .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
510 .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
511 .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
512 .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
513 .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
514 .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
515 .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
516 .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
517 .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
518 .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
519 .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
520 .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
521 .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
522 .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
523 .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
524 .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
525 .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
526 .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
527 .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
528 .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
529 .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
530 .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
531 .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
532 .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
533 .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
534 .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
535 .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
536___
537
538$code.=<<___;
539 .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
540 .ALIGN 16
541AES_decrypt
542 .PROC
543 .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
544 .ENTRY
545 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
546 $PUSHMA %r3,$FRAME(%sp)
547 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
548 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
549 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
550 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
551 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
552 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
553 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
554 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
555 $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
556 $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
557 $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
558 $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
559 $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
560 $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
561 $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
562
563 blr %r0,$tbl
564 ldi 3,$t0
565L\$dec_pic
566 andcm $tbl,$t0,$tbl
567 ldo L\$AES_Td-L\$dec_pic($tbl),$tbl
568
569 and $inp,$t0,$t0
570 sub $inp,$t0,$inp
571 ldw 0($inp),$s0
572 ldw 4($inp),$s1
573 ldw 8($inp),$s2
574 comib,= 0,$t0,L\$dec_inp_aligned
575 ldw 12($inp),$s3
576
577 sh3addl $t0,%r0,$t0
578 subi 32,$t0,$t0
579 mtctl $t0,%cr11
580 ldw 16($inp),$t1
581 vshd $s0,$s1,$s0
582 vshd $s1,$s2,$s1
583 vshd $s2,$s3,$s2
584 vshd $s3,$t1,$s3
585
586L\$dec_inp_aligned
587 bl _parisc_AES_decrypt,%r31
588 nop
589
590 extru,<> $out,31,2,%r0
591 b L\$dec_out_aligned
592 nop
593
594 _srm $s0,24,$acc0
595 _srm $s0,16,$acc1
596 stb $acc0,0($out)
597 _srm $s0,8,$acc2
598 stb $acc1,1($out)
599 _srm $s1,24,$acc4
600 stb $acc2,2($out)
601 _srm $s1,16,$acc5
602 stb $s0,3($out)
603 _srm $s1,8,$acc6
604 stb $acc4,4($out)
605 _srm $s2,24,$acc0
606 stb $acc5,5($out)
607 _srm $s2,16,$acc1
608 stb $acc6,6($out)
609 _srm $s2,8,$acc2
610 stb $s1,7($out)
611 _srm $s3,24,$acc4
612 stb $acc0,8($out)
613 _srm $s3,16,$acc5
614 stb $acc1,9($out)
615 _srm $s3,8,$acc6
616 stb $acc2,10($out)
617 stb $s2,11($out)
618 stb $acc4,12($out)
619 stb $acc5,13($out)
620 stb $acc6,14($out)
621 b L\$dec_done
622 stb $s3,15($out)
623
624L\$dec_out_aligned
625 stw $s0,0($out)
626 stw $s1,4($out)
627 stw $s2,8($out)
628 stw $s3,12($out)
629
630L\$dec_done
631 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
632 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
633 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
634 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
635 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
636 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
637 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
638 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
639 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
640 $POP `-$FRAME+9*$SIZE_T`(%sp),%r12
641 $POP `-$FRAME+10*$SIZE_T`(%sp),%r13
642 $POP `-$FRAME+11*$SIZE_T`(%sp),%r14
643 $POP `-$FRAME+12*$SIZE_T`(%sp),%r15
644 $POP `-$FRAME+13*$SIZE_T`(%sp),%r16
645 $POP `-$FRAME+14*$SIZE_T`(%sp),%r17
646 $POP `-$FRAME+15*$SIZE_T`(%sp),%r18
647 bv (%r2)
648 .EXIT
649 $POPMB -$FRAME(%sp),%r3
650 .PROCEND
651
652 .ALIGN 16
653_parisc_AES_decrypt
654 .PROC
655 .CALLINFO MILLICODE
656 .ENTRY
657 ldw 240($key),$rounds
658 ldw 0($key),$t0
659 ldw 4($key),$t1
660 ldw 8($key),$t2
661 ldw 12($key),$t3
662 _srm $rounds,1,$rounds
663 xor $t0,$s0,$s0
664 ldw 16($key),$t0
665 xor $t1,$s1,$s1
666 ldw 20($key),$t1
667 _srm $s0,24,$acc0
668 xor $t2,$s2,$s2
669 ldw 24($key),$t2
670 xor $t3,$s3,$s3
671 ldw 28($key),$t3
672 _srm $s3,16,$acc1
673L\$dec_loop
674 _srm $s2,8,$acc2
675 ldwx,s $acc0($tbl),$acc0
676 _srm $s1,0,$acc3
677 ldwx,s $acc1($tbl),$acc1
678 _srm $s1,24,$acc4
679 ldwx,s $acc2($tbl),$acc2
680 _srm $s0,16,$acc5
681 ldwx,s $acc3($tbl),$acc3
682 _srm $s3,8,$acc6
683 ldwx,s $acc4($tbl),$acc4
684 _srm $s2,0,$acc7
685 ldwx,s $acc5($tbl),$acc5
686 _srm $s2,24,$acc8
687 ldwx,s $acc6($tbl),$acc6
688 _srm $s1,16,$acc9
689 ldwx,s $acc7($tbl),$acc7
690 _srm $s0,8,$acc10
691 ldwx,s $acc8($tbl),$acc8
692 _srm $s3,0,$acc11
693 ldwx,s $acc9($tbl),$acc9
694 _srm $s3,24,$acc12
695 ldwx,s $acc10($tbl),$acc10
696 _srm $s2,16,$acc13
697 ldwx,s $acc11($tbl),$acc11
698 _srm $s1,8,$acc14
699 ldwx,s $acc12($tbl),$acc12
700 _srm $s0,0,$acc15
701 ldwx,s $acc13($tbl),$acc13
702 ldwx,s $acc14($tbl),$acc14
703 ldwx,s $acc15($tbl),$acc15
704 addib,= -1,$rounds,L\$dec_last
705 ldo 32($key),$key
706
707 _ror $acc1,8,$acc1
708 xor $acc0,$t0,$t0
709 ldw 0($key),$s0
710 _ror $acc2,16,$acc2
711 xor $acc1,$t0,$t0
712 ldw 4($key),$s1
713 _ror $acc3,24,$acc3
714 xor $acc2,$t0,$t0
715 ldw 8($key),$s2
716 _ror $acc5,8,$acc5
717 xor $acc3,$t0,$t0
718 ldw 12($key),$s3
719 _ror $acc6,16,$acc6
720 xor $acc4,$t1,$t1
721 _ror $acc7,24,$acc7
722 xor $acc5,$t1,$t1
723 _ror $acc9,8,$acc9
724 xor $acc6,$t1,$t1
725 _ror $acc10,16,$acc10
726 xor $acc7,$t1,$t1
727 _ror $acc11,24,$acc11
728 xor $acc8,$t2,$t2
729 _ror $acc13,8,$acc13
730 xor $acc9,$t2,$t2
731 _ror $acc14,16,$acc14
732 xor $acc10,$t2,$t2
733 _ror $acc15,24,$acc15
734 xor $acc11,$t2,$t2
735 xor $acc12,$acc14,$acc14
736 xor $acc13,$t3,$t3
737 _srm $t0,24,$acc0
738 xor $acc14,$t3,$t3
739 xor $acc15,$t3,$t3
740 _srm $t3,16,$acc1
741
742 _srm $t2,8,$acc2
743 ldwx,s $acc0($tbl),$acc0
744 _srm $t1,0,$acc3
745 ldwx,s $acc1($tbl),$acc1
746 _srm $t1,24,$acc4
747 ldwx,s $acc2($tbl),$acc2
748 _srm $t0,16,$acc5
749 ldwx,s $acc3($tbl),$acc3
750 _srm $t3,8,$acc6
751 ldwx,s $acc4($tbl),$acc4
752 _srm $t2,0,$acc7
753 ldwx,s $acc5($tbl),$acc5
754 _srm $t2,24,$acc8
755 ldwx,s $acc6($tbl),$acc6
756 _srm $t1,16,$acc9
757 ldwx,s $acc7($tbl),$acc7
758 _srm $t0,8,$acc10
759 ldwx,s $acc8($tbl),$acc8
760 _srm $t3,0,$acc11
761 ldwx,s $acc9($tbl),$acc9
762 _srm $t3,24,$acc12
763 ldwx,s $acc10($tbl),$acc10
764 _srm $t2,16,$acc13
765 ldwx,s $acc11($tbl),$acc11
766 _srm $t1,8,$acc14
767 ldwx,s $acc12($tbl),$acc12
768 _srm $t0,0,$acc15
769 ldwx,s $acc13($tbl),$acc13
770 _ror $acc1,8,$acc1
771 ldwx,s $acc14($tbl),$acc14
772
773 _ror $acc2,16,$acc2
774 xor $acc0,$s0,$s0
775 ldwx,s $acc15($tbl),$acc15
776 _ror $acc3,24,$acc3
777 xor $acc1,$s0,$s0
778 ldw 16($key),$t0
779 _ror $acc5,8,$acc5
780 xor $acc2,$s0,$s0
781 ldw 20($key),$t1
782 _ror $acc6,16,$acc6
783 xor $acc3,$s0,$s0
784 ldw 24($key),$t2
785 _ror $acc7,24,$acc7
786 xor $acc4,$s1,$s1
787 ldw 28($key),$t3
788 _ror $acc9,8,$acc9
789 xor $acc5,$s1,$s1
790 ldw 1024+0($tbl),%r0 ; prefetch td4
791 _ror $acc10,16,$acc10
792 xor $acc6,$s1,$s1
793 ldw 1024+32($tbl),%r0 ; prefetch td4
794 _ror $acc11,24,$acc11
795 xor $acc7,$s1,$s1
796 ldw 1024+64($tbl),%r0 ; prefetch td4
797 _ror $acc13,8,$acc13
798 xor $acc8,$s2,$s2
799 ldw 1024+96($tbl),%r0 ; prefetch td4
800 _ror $acc14,16,$acc14
801 xor $acc9,$s2,$s2
802 ldw 1024+128($tbl),%r0 ; prefetch td4
803 _ror $acc15,24,$acc15
804 xor $acc10,$s2,$s2
805 ldw 1024+160($tbl),%r0 ; prefetch td4
806 _srm $s0,24,$acc0
807 xor $acc11,$s2,$s2
808 ldw 1024+192($tbl),%r0 ; prefetch td4
809 xor $acc12,$acc14,$acc14
810 xor $acc13,$s3,$s3
811 ldw 1024+224($tbl),%r0 ; prefetch td4
812 xor $acc14,$s3,$s3
813 xor $acc15,$s3,$s3
814 b L\$dec_loop
815 _srm $s3,16,$acc1
816
817 .ALIGN 16
818L\$dec_last
819 ldo 1024($tbl),$rounds
820 _ror $acc1,8,$acc1
821 xor $acc0,$t0,$t0
822 ldw 0($key),$s0
823 _ror $acc2,16,$acc2
824 xor $acc1,$t0,$t0
825 ldw 4($key),$s1
826 _ror $acc3,24,$acc3
827 xor $acc2,$t0,$t0
828 ldw 8($key),$s2
829 _ror $acc5,8,$acc5
830 xor $acc3,$t0,$t0
831 ldw 12($key),$s3
832 _ror $acc6,16,$acc6
833 xor $acc4,$t1,$t1
834 _ror $acc7,24,$acc7
835 xor $acc5,$t1,$t1
836 _ror $acc9,8,$acc9
837 xor $acc6,$t1,$t1
838 _ror $acc10,16,$acc10
839 xor $acc7,$t1,$t1
840 _ror $acc11,24,$acc11
841 xor $acc8,$t2,$t2
842 _ror $acc13,8,$acc13
843 xor $acc9,$t2,$t2
844 _ror $acc14,16,$acc14
845 xor $acc10,$t2,$t2
846 _ror $acc15,24,$acc15
847 xor $acc11,$t2,$t2
848 xor $acc12,$acc14,$acc14
849 xor $acc13,$t3,$t3
850 _srm $t0,24,$acc0
851 xor $acc14,$t3,$t3
852 xor $acc15,$t3,$t3
853 _srm $t3,16,$acc1
854
855 _srm $t2,8,$acc2
856 ldbx $acc0($rounds),$acc0
857 _srm $t1,24,$acc4
858 ldbx $acc1($rounds),$acc1
859 _srm $t0,16,$acc5
860 _srm $t1,0,$acc3
861 ldbx $acc2($rounds),$acc2
862 ldbx $acc3($rounds),$acc3
863 _srm $t3,8,$acc6
864 ldbx $acc4($rounds),$acc4
865 _srm $t2,24,$acc8
866 ldbx $acc5($rounds),$acc5
867 _srm $t1,16,$acc9
868 _srm $t2,0,$acc7
869 ldbx $acc6($rounds),$acc6
870 ldbx $acc7($rounds),$acc7
871 _srm $t0,8,$acc10
872 ldbx $acc8($rounds),$acc8
873 _srm $t3,24,$acc12
874 ldbx $acc9($rounds),$acc9
875 _srm $t2,16,$acc13
876 _srm $t3,0,$acc11
877 ldbx $acc10($rounds),$acc10
878 _srm $t1,8,$acc14
879 ldbx $acc11($rounds),$acc11
880 ldbx $acc12($rounds),$acc12
881 ldbx $acc13($rounds),$acc13
882 _srm $t0,0,$acc15
883 ldbx $acc14($rounds),$acc14
884
885 dep $acc0,7,8,$acc3
886 ldbx $acc15($rounds),$acc15
887 dep $acc4,7,8,$acc7
888 dep $acc1,15,8,$acc3
889 dep $acc5,15,8,$acc7
890 dep $acc2,23,8,$acc3
891 dep $acc6,23,8,$acc7
892 xor $acc3,$s0,$s0
893 xor $acc7,$s1,$s1
894 dep $acc8,7,8,$acc11
895 dep $acc12,7,8,$acc15
896 dep $acc9,15,8,$acc11
897 dep $acc13,15,8,$acc15
898 dep $acc10,23,8,$acc11
899 dep $acc14,23,8,$acc15
900 xor $acc11,$s2,$s2
901
902 bv (%r31)
903 .EXIT
904 xor $acc15,$s3,$s3
905 .PROCEND
906
907 .ALIGN 64
908L\$AES_Td
909 .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
910 .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
911 .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
912 .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
913 .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
914 .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
915 .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
916 .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
917 .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
918 .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
919 .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
920 .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
921 .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
922 .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
923 .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
924 .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
925 .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
926 .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
927 .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
928 .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
929 .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
930 .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
931 .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
932 .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
933 .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
934 .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
935 .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
936 .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
937 .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
938 .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
939 .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
940 .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
941 .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
942 .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
943 .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
944 .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
945 .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
946 .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
947 .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
948 .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
949 .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
950 .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
951 .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
952 .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
953 .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
954 .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
955 .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
956 .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
957 .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
958 .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
959 .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
960 .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
961 .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
962 .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
963 .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
964 .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
965 .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
966 .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
967 .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
968 .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
969 .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
970 .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
971 .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
972 .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
973 .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
974 .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
975 .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
976 .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
977 .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
978 .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
979 .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
980 .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
981 .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
982 .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
983 .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
984 .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
985 .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
986 .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
987 .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
988 .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
989 .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
990 .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
991 .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
992 .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
993 .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
994 .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
995 .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
996 .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
997 .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
998 .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
999 .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
1000 .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
1001 .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
1002 .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
1003 .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
1004 .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
1005 .STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
1006___
1007
1008foreach (split("\n",$code)) {
1009 s/\`([^\`]*)\`/eval $1/ge;
1010
1011	# translate made-up instructions: _ror, _srm
1012 s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or
1013
1014 s/_srm(\s+%r[0-9]+),([0-9]+),/
1015 $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
1016 : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
1017
1018 s/,\*/,/ if ($SIZE_T==4);
1019 print $_,"\n";
1020}
1021close STDOUT;
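
As in the MIPS module, a standalone sketch (sample operands made up) shows what the two made-up mnemonics expand to under each ABI:

	#!/usr/bin/env perl
	for my $SIZE_T (4, 8) {             # 32-bit "1.0" and 64-bit "2.0W" ABIs
	    for my $line ("\t_ror\t%r5,8,%r5", "\t_srm\t%r5,24,%r9") {
	        local $_ = $line;
	        # _ror x,sa,y -> shd x,x,sa,y   (rotate right by sa)
	        # _srm x,sa,y -> extru/extrd,u  (y = (x >> sa) & 0xff)
	        s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or
	        s/_srm(\s+%r[0-9]+),([0-9]+),/
	            $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
	                       : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
	        printf "%s%s\n", $SIZE_T==4 ? "1.0:\t" : "2.0W:\t", $_;
	    }
	}
	# 1.0:   shd %r5,%r5,8,%r5    extru   %r5,7,8,%r9
	# 2.0W:  shd %r5,%r5,8,%r5    extrd,u %r5,39,8,%r9

PA-RISC numbers bits from the most significant end, so extru %r5,7,8,%r9 extracts the eight bits ending at bit 7, i.e. the top byte: exactly (x >> 24) & 0xff.
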
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl
index f82c5e1814..7c52cbe5f9 100644
--- a/src/lib/libcrypto/aes/asm/aes-ppc.pl
+++ b/src/lib/libcrypto/aes/asm/aes-ppc.pl
@@ -7,7 +7,7 @@
7# details see http://www.openssl.org/~appro/cryptogams/. 7# details see http://www.openssl.org/~appro/cryptogams/.
8# ==================================================================== 8# ====================================================================
9 9
10# Needs more work: key setup, page boundaries, CBC routine... 10# Needs more work: key setup, CBC routine...
11# 11#
12# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with 12# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
13# 128-bit key, which is ~40% better than 64-bit code generated by gcc 13# 128-bit key, which is ~40% better than 64-bit code generated by gcc
@@ -18,7 +18,7 @@
18 18
19# February 2010 19# February 2010
20# 20#
21# Rescheduling instructions to favour Power6 pipeline gives 10% 21# Rescheduling instructions to favour Power6 pipeline gave 10%
22# performance improvement on the platform in question (and marginal 22# performance improvement on the platform in question (and marginal
23# improvement even on others). It should be noted that Power6 fails 23# improvement even on others). It should be noted that Power6 fails
24# to process a byte in 18 cycles, only in 23, because it fails to issue 24# to process a byte in 18 cycles, only in 23, because it fails to issue
@@ -33,11 +33,13 @@ $flavour = shift;
33 33
34if ($flavour =~ /64/) { 34if ($flavour =~ /64/) {
35 $SIZE_T =8; 35 $SIZE_T =8;
36 $LRSAVE =2*$SIZE_T;
36 $STU ="stdu"; 37 $STU ="stdu";
37 $POP ="ld"; 38 $POP ="ld";
38 $PUSH ="std"; 39 $PUSH ="std";
39} elsif ($flavour =~ /32/) { 40} elsif ($flavour =~ /32/) {
40 $SIZE_T =4; 41 $SIZE_T =4;
42 $LRSAVE =$SIZE_T;
41 $STU ="stwu"; 43 $STU ="stwu";
42 $POP ="lwz"; 44 $POP ="lwz";
43 $PUSH ="stw"; 45 $PUSH ="stw";
@@ -116,15 +118,19 @@ LAES_Te:
116 addi $Tbl0,$Tbl0,`128-8` 118 addi $Tbl0,$Tbl0,`128-8`
117 mtlr r0 119 mtlr r0
118 blr 120 blr
119 .space `32-24` 121 .long 0
122 .byte 0,12,0x14,0,0,0,0,0
123 .space `64-9*4`
120LAES_Td: 124LAES_Td:
121 mflr r0 125 mflr r0
122 bcl 20,31,\$+4 126 bcl 20,31,\$+4
123 mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry 127 mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
124 addi $Tbl0,$Tbl0,`128-8-32+2048+256` 128 addi $Tbl0,$Tbl0,`128-64-8+2048+256`
125 mtlr r0 129 mtlr r0
126 blr 130 blr
127 .space `128-32-24` 131 .long 0
132 .byte 0,12,0x14,0,0,0,0,0
133 .space `128-64-9*4`
128___ 134___
129&_data_word( 135&_data_word(
130 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, 136 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
@@ -328,10 +334,9 @@ $code.=<<___;
328.globl .AES_encrypt 334.globl .AES_encrypt
329.align 7 335.align 7
330.AES_encrypt: 336.AES_encrypt:
331 mflr r0
332 $STU $sp,-$FRAME($sp) 337 $STU $sp,-$FRAME($sp)
338 mflr r0
333 339
334 $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
335 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) 340 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
336 $PUSH r13,`$FRAME-$SIZE_T*19`($sp) 341 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
337 $PUSH r14,`$FRAME-$SIZE_T*18`($sp) 342 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@@ -352,7 +357,14 @@ $code.=<<___;
352 $PUSH r29,`$FRAME-$SIZE_T*3`($sp) 357 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
353 $PUSH r30,`$FRAME-$SIZE_T*2`($sp) 358 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
354 $PUSH r31,`$FRAME-$SIZE_T*1`($sp) 359 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
360 $PUSH r0,`$FRAME+$LRSAVE`($sp)
361
362 andi. $t0,$inp,3
363 andi. $t1,$out,3
364 or. $t0,$t0,$t1
365 bne Lenc_unaligned
355 366
367Lenc_unaligned_ok:
356 lwz $s0,0($inp) 368 lwz $s0,0($inp)
357 lwz $s1,4($inp) 369 lwz $s1,4($inp)
358 lwz $s2,8($inp) 370 lwz $s2,8($inp)
@@ -363,8 +375,80 @@ $code.=<<___;
363 stw $s1,4($out) 375 stw $s1,4($out)
364 stw $s2,8($out) 376 stw $s2,8($out)
365 stw $s3,12($out) 377 stw $s3,12($out)
378 b Lenc_done
379
380Lenc_unaligned:
381 subfic $t0,$inp,4096
382 subfic $t1,$out,4096
383 andi. $t0,$t0,4096-16
384 beq Lenc_xpage
385 andi. $t1,$t1,4096-16
386 bne Lenc_unaligned_ok
387
388Lenc_xpage:
389 lbz $acc00,0($inp)
390 lbz $acc01,1($inp)
391 lbz $acc02,2($inp)
392 lbz $s0,3($inp)
393 lbz $acc04,4($inp)
394 lbz $acc05,5($inp)
395 lbz $acc06,6($inp)
396 lbz $s1,7($inp)
397 lbz $acc08,8($inp)
398 lbz $acc09,9($inp)
399 lbz $acc10,10($inp)
400 insrwi $s0,$acc00,8,0
401 lbz $s2,11($inp)
402 insrwi $s1,$acc04,8,0
403 lbz $acc12,12($inp)
404 insrwi $s0,$acc01,8,8
405 lbz $acc13,13($inp)
406 insrwi $s1,$acc05,8,8
407 lbz $acc14,14($inp)
408 insrwi $s0,$acc02,8,16
409 lbz $s3,15($inp)
410 insrwi $s1,$acc06,8,16
411 insrwi $s2,$acc08,8,0
412 insrwi $s3,$acc12,8,0
413 insrwi $s2,$acc09,8,8
414 insrwi $s3,$acc13,8,8
415 insrwi $s2,$acc10,8,16
416 insrwi $s3,$acc14,8,16
417
418 bl LAES_Te
419 bl Lppc_AES_encrypt_compact
420
421 extrwi $acc00,$s0,8,0
422 extrwi $acc01,$s0,8,8
423 stb $acc00,0($out)
424 extrwi $acc02,$s0,8,16
425 stb $acc01,1($out)
426 stb $acc02,2($out)
427 extrwi $acc04,$s1,8,0
428 stb $s0,3($out)
429 extrwi $acc05,$s1,8,8
430 stb $acc04,4($out)
431 extrwi $acc06,$s1,8,16
432 stb $acc05,5($out)
433 stb $acc06,6($out)
434 extrwi $acc08,$s2,8,0
435 stb $s1,7($out)
436 extrwi $acc09,$s2,8,8
437 stb $acc08,8($out)
438 extrwi $acc10,$s2,8,16
439 stb $acc09,9($out)
440 stb $acc10,10($out)
441 extrwi $acc12,$s3,8,0
442 stb $s2,11($out)
443 extrwi $acc13,$s3,8,8
444 stb $acc12,12($out)
445 extrwi $acc14,$s3,8,16
446 stb $acc13,13($out)
447 stb $acc14,14($out)
448 stb $s3,15($out)
366 449
367 $POP r0,`$FRAME-$SIZE_T*21`($sp) 450Lenc_done:
451 $POP r0,`$FRAME+$LRSAVE`($sp)
368 $POP $toc,`$FRAME-$SIZE_T*20`($sp) 452 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
369 $POP r13,`$FRAME-$SIZE_T*19`($sp) 453 $POP r13,`$FRAME-$SIZE_T*19`($sp)
370 $POP r14,`$FRAME-$SIZE_T*18`($sp) 454 $POP r14,`$FRAME-$SIZE_T*18`($sp)
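
The Lenc_unaligned/Lenc_xpage path added in this hunk is what retires the "page boundaries" item from the to-do comment at the top of this diff: when either pointer is unaligned, word accesses are used only if neither 16-byte block runs into the last 16 bytes of a 4KB page (the subfic/andi. tests); otherwise the block is moved byte by byte, with insrwi packing bytes into big-endian words. A hedged plain-Perl sketch of what the lbz/insrwi gather computes (@in is a made-up stand-in for the input block):

	sub gather_be32 {                   # what four lbz + three insrwi build
	    my @b = @_;                     # four bytes, in memory order
	    return ($b[0] << 24) | ($b[1] << 16) | ($b[2] << 8) | $b[3];
	}
	my @in = map { $_ * 17 & 0xff } 0 .. 15;                     # stand-in block
	my @s  = map { gather_be32(@in[4*$_ .. 4*$_+3]) } 0 .. 3;    # $s0..$s3
	printf "0x%08x\n", $_ for @s;       # 0x00112233 0x44556677 ...

The resulting words are identical to what aligned lwz loads would produce, and the extrwi/stb sequence on output is just the inverse scatter.
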
@@ -388,18 +472,21 @@ $code.=<<___;
388 mtlr r0 472 mtlr r0
389 addi $sp,$sp,$FRAME 473 addi $sp,$sp,$FRAME
390 blr 474 blr
475 .long 0
476 .byte 0,12,4,1,0x80,18,3,0
477 .long 0
391 478
392.align 5 479.align 5
393Lppc_AES_encrypt: 480Lppc_AES_encrypt:
394 lwz $acc00,240($key) 481 lwz $acc00,240($key)
395 lwz $t0,0($key)
396 lwz $t1,4($key)
397 lwz $t2,8($key)
398 lwz $t3,12($key)
399 addi $Tbl1,$Tbl0,3 482 addi $Tbl1,$Tbl0,3
483 lwz $t0,0($key)
400 addi $Tbl2,$Tbl0,2 484 addi $Tbl2,$Tbl0,2
485 lwz $t1,4($key)
401 addi $Tbl3,$Tbl0,1 486 addi $Tbl3,$Tbl0,1
487 lwz $t2,8($key)
402 addi $acc00,$acc00,-1 488 addi $acc00,$acc00,-1
489 lwz $t3,12($key)
403 addi $key,$key,16 490 addi $key,$key,16
404 xor $s0,$s0,$t0 491 xor $s0,$s0,$t0
405 xor $s1,$s1,$t1 492 xor $s1,$s1,$t1
@@ -413,44 +500,44 @@ Lenc_loop:
413 rlwinm $acc02,$s2,`32-24+3`,21,28 500 rlwinm $acc02,$s2,`32-24+3`,21,28
414 rlwinm $acc03,$s3,`32-24+3`,21,28 501 rlwinm $acc03,$s3,`32-24+3`,21,28
415 lwz $t0,0($key) 502 lwz $t0,0($key)
416 lwz $t1,4($key)
417 rlwinm $acc04,$s1,`32-16+3`,21,28 503 rlwinm $acc04,$s1,`32-16+3`,21,28
504 lwz $t1,4($key)
418 rlwinm $acc05,$s2,`32-16+3`,21,28 505 rlwinm $acc05,$s2,`32-16+3`,21,28
419 lwz $t2,8($key) 506 lwz $t2,8($key)
420 lwz $t3,12($key)
421 rlwinm $acc06,$s3,`32-16+3`,21,28 507 rlwinm $acc06,$s3,`32-16+3`,21,28
508 lwz $t3,12($key)
422 rlwinm $acc07,$s0,`32-16+3`,21,28 509 rlwinm $acc07,$s0,`32-16+3`,21,28
423 lwzx $acc00,$Tbl0,$acc00 510 lwzx $acc00,$Tbl0,$acc00
424 lwzx $acc01,$Tbl0,$acc01
425 rlwinm $acc08,$s2,`32-8+3`,21,28 511 rlwinm $acc08,$s2,`32-8+3`,21,28
512 lwzx $acc01,$Tbl0,$acc01
426 rlwinm $acc09,$s3,`32-8+3`,21,28 513 rlwinm $acc09,$s3,`32-8+3`,21,28
427 lwzx $acc02,$Tbl0,$acc02 514 lwzx $acc02,$Tbl0,$acc02
428 lwzx $acc03,$Tbl0,$acc03
429 rlwinm $acc10,$s0,`32-8+3`,21,28 515 rlwinm $acc10,$s0,`32-8+3`,21,28
516 lwzx $acc03,$Tbl0,$acc03
430 rlwinm $acc11,$s1,`32-8+3`,21,28 517 rlwinm $acc11,$s1,`32-8+3`,21,28
431 lwzx $acc04,$Tbl1,$acc04 518 lwzx $acc04,$Tbl1,$acc04
432 lwzx $acc05,$Tbl1,$acc05
433 rlwinm $acc12,$s3,`0+3`,21,28 519 rlwinm $acc12,$s3,`0+3`,21,28
520 lwzx $acc05,$Tbl1,$acc05
434 rlwinm $acc13,$s0,`0+3`,21,28 521 rlwinm $acc13,$s0,`0+3`,21,28
435 lwzx $acc06,$Tbl1,$acc06 522 lwzx $acc06,$Tbl1,$acc06
436 lwzx $acc07,$Tbl1,$acc07
437 rlwinm $acc14,$s1,`0+3`,21,28 523 rlwinm $acc14,$s1,`0+3`,21,28
524 lwzx $acc07,$Tbl1,$acc07
438 rlwinm $acc15,$s2,`0+3`,21,28 525 rlwinm $acc15,$s2,`0+3`,21,28
439 lwzx $acc08,$Tbl2,$acc08 526 lwzx $acc08,$Tbl2,$acc08
440 lwzx $acc09,$Tbl2,$acc09
441 xor $t0,$t0,$acc00 527 xor $t0,$t0,$acc00
528 lwzx $acc09,$Tbl2,$acc09
442 xor $t1,$t1,$acc01 529 xor $t1,$t1,$acc01
443 lwzx $acc10,$Tbl2,$acc10 530 lwzx $acc10,$Tbl2,$acc10
444 lwzx $acc11,$Tbl2,$acc11
445 xor $t2,$t2,$acc02 531 xor $t2,$t2,$acc02
532 lwzx $acc11,$Tbl2,$acc11
446 xor $t3,$t3,$acc03 533 xor $t3,$t3,$acc03
447 lwzx $acc12,$Tbl3,$acc12 534 lwzx $acc12,$Tbl3,$acc12
448 lwzx $acc13,$Tbl3,$acc13
449 xor $t0,$t0,$acc04 535 xor $t0,$t0,$acc04
536 lwzx $acc13,$Tbl3,$acc13
450 xor $t1,$t1,$acc05 537 xor $t1,$t1,$acc05
451 lwzx $acc14,$Tbl3,$acc14 538 lwzx $acc14,$Tbl3,$acc14
452 lwzx $acc15,$Tbl3,$acc15
453 xor $t2,$t2,$acc06 539 xor $t2,$t2,$acc06
540 lwzx $acc15,$Tbl3,$acc15
454 xor $t3,$t3,$acc07 541 xor $t3,$t3,$acc07
455 xor $t0,$t0,$acc08 542 xor $t0,$t0,$acc08
456 xor $t1,$t1,$acc09 543 xor $t1,$t1,$acc09
@@ -466,60 +553,60 @@ Lenc_loop:
466 addi $Tbl2,$Tbl0,2048 553 addi $Tbl2,$Tbl0,2048
467 nop 554 nop
468 lwz $t0,0($key) 555 lwz $t0,0($key)
469 lwz $t1,4($key)
470 rlwinm $acc00,$s0,`32-24`,24,31 556 rlwinm $acc00,$s0,`32-24`,24,31
557 lwz $t1,4($key)
471 rlwinm $acc01,$s1,`32-24`,24,31 558 rlwinm $acc01,$s1,`32-24`,24,31
472 lwz $t2,8($key) 559 lwz $t2,8($key)
473 lwz $t3,12($key)
474 rlwinm $acc02,$s2,`32-24`,24,31 560 rlwinm $acc02,$s2,`32-24`,24,31
561 lwz $t3,12($key)
475 rlwinm $acc03,$s3,`32-24`,24,31 562 rlwinm $acc03,$s3,`32-24`,24,31
476 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 563 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
477 lwz $acc09,`2048+32`($Tbl0)
478 rlwinm $acc04,$s1,`32-16`,24,31 564 rlwinm $acc04,$s1,`32-16`,24,31
565 lwz $acc09,`2048+32`($Tbl0)
479 rlwinm $acc05,$s2,`32-16`,24,31 566 rlwinm $acc05,$s2,`32-16`,24,31
480 lwz $acc10,`2048+64`($Tbl0) 567 lwz $acc10,`2048+64`($Tbl0)
481 lwz $acc11,`2048+96`($Tbl0)
482 rlwinm $acc06,$s3,`32-16`,24,31 568 rlwinm $acc06,$s3,`32-16`,24,31
569 lwz $acc11,`2048+96`($Tbl0)
483 rlwinm $acc07,$s0,`32-16`,24,31 570 rlwinm $acc07,$s0,`32-16`,24,31
484 lwz $acc12,`2048+128`($Tbl0) 571 lwz $acc12,`2048+128`($Tbl0)
485 lwz $acc13,`2048+160`($Tbl0)
486 rlwinm $acc08,$s2,`32-8`,24,31 572 rlwinm $acc08,$s2,`32-8`,24,31
573 lwz $acc13,`2048+160`($Tbl0)
487 rlwinm $acc09,$s3,`32-8`,24,31 574 rlwinm $acc09,$s3,`32-8`,24,31
488 lwz $acc14,`2048+192`($Tbl0) 575 lwz $acc14,`2048+192`($Tbl0)
489 lwz $acc15,`2048+224`($Tbl0)
490 rlwinm $acc10,$s0,`32-8`,24,31 576 rlwinm $acc10,$s0,`32-8`,24,31
577 lwz $acc15,`2048+224`($Tbl0)
491 rlwinm $acc11,$s1,`32-8`,24,31 578 rlwinm $acc11,$s1,`32-8`,24,31
492 lbzx $acc00,$Tbl2,$acc00 579 lbzx $acc00,$Tbl2,$acc00
493 lbzx $acc01,$Tbl2,$acc01
494 rlwinm $acc12,$s3,`0`,24,31 580 rlwinm $acc12,$s3,`0`,24,31
581 lbzx $acc01,$Tbl2,$acc01
495 rlwinm $acc13,$s0,`0`,24,31 582 rlwinm $acc13,$s0,`0`,24,31
496 lbzx $acc02,$Tbl2,$acc02 583 lbzx $acc02,$Tbl2,$acc02
497 lbzx $acc03,$Tbl2,$acc03
498 rlwinm $acc14,$s1,`0`,24,31 584 rlwinm $acc14,$s1,`0`,24,31
585 lbzx $acc03,$Tbl2,$acc03
499 rlwinm $acc15,$s2,`0`,24,31 586 rlwinm $acc15,$s2,`0`,24,31
500 lbzx $acc04,$Tbl2,$acc04 587 lbzx $acc04,$Tbl2,$acc04
501 lbzx $acc05,$Tbl2,$acc05
502 rlwinm $s0,$acc00,24,0,7 588 rlwinm $s0,$acc00,24,0,7
589 lbzx $acc05,$Tbl2,$acc05
503 rlwinm $s1,$acc01,24,0,7 590 rlwinm $s1,$acc01,24,0,7
504 lbzx $acc06,$Tbl2,$acc06 591 lbzx $acc06,$Tbl2,$acc06
505 lbzx $acc07,$Tbl2,$acc07
506 rlwinm $s2,$acc02,24,0,7 592 rlwinm $s2,$acc02,24,0,7
593 lbzx $acc07,$Tbl2,$acc07
507 rlwinm $s3,$acc03,24,0,7 594 rlwinm $s3,$acc03,24,0,7
508 lbzx $acc08,$Tbl2,$acc08 595 lbzx $acc08,$Tbl2,$acc08
509 lbzx $acc09,$Tbl2,$acc09
510 rlwimi $s0,$acc04,16,8,15 596 rlwimi $s0,$acc04,16,8,15
597 lbzx $acc09,$Tbl2,$acc09
511 rlwimi $s1,$acc05,16,8,15 598 rlwimi $s1,$acc05,16,8,15
512 lbzx $acc10,$Tbl2,$acc10 599 lbzx $acc10,$Tbl2,$acc10
513 lbzx $acc11,$Tbl2,$acc11
514 rlwimi $s2,$acc06,16,8,15 600 rlwimi $s2,$acc06,16,8,15
601 lbzx $acc11,$Tbl2,$acc11
515 rlwimi $s3,$acc07,16,8,15 602 rlwimi $s3,$acc07,16,8,15
516 lbzx $acc12,$Tbl2,$acc12 603 lbzx $acc12,$Tbl2,$acc12
517 lbzx $acc13,$Tbl2,$acc13
518 rlwimi $s0,$acc08,8,16,23 604 rlwimi $s0,$acc08,8,16,23
605 lbzx $acc13,$Tbl2,$acc13
519 rlwimi $s1,$acc09,8,16,23 606 rlwimi $s1,$acc09,8,16,23
520 lbzx $acc14,$Tbl2,$acc14 607 lbzx $acc14,$Tbl2,$acc14
521 lbzx $acc15,$Tbl2,$acc15
522 rlwimi $s2,$acc10,8,16,23 608 rlwimi $s2,$acc10,8,16,23
609 lbzx $acc15,$Tbl2,$acc15
523 rlwimi $s3,$acc11,8,16,23 610 rlwimi $s3,$acc11,8,16,23
524 or $s0,$s0,$acc12 611 or $s0,$s0,$acc12
525 or $s1,$s1,$acc13 612 or $s1,$s1,$acc13
@@ -530,29 +617,31 @@ Lenc_loop:
530 xor $s2,$s2,$t2 617 xor $s2,$s2,$t2
531 xor $s3,$s3,$t3 618 xor $s3,$s3,$t3
532 blr 619 blr
620 .long 0
621 .byte 0,12,0x14,0,0,0,0,0
533 622
534.align 4 623.align 4
535Lppc_AES_encrypt_compact: 624Lppc_AES_encrypt_compact:
536 lwz $acc00,240($key) 625 lwz $acc00,240($key)
537 lwz $t0,0($key)
538 lwz $t1,4($key)
539 lwz $t2,8($key)
540 lwz $t3,12($key)
541 addi $Tbl1,$Tbl0,2048 626 addi $Tbl1,$Tbl0,2048
627 lwz $t0,0($key)
542 lis $mask80,0x8080 628 lis $mask80,0x8080
629 lwz $t1,4($key)
543 lis $mask1b,0x1b1b 630 lis $mask1b,0x1b1b
544 addi $key,$key,16 631 lwz $t2,8($key)
545 ori $mask80,$mask80,0x8080 632 ori $mask80,$mask80,0x8080
633 lwz $t3,12($key)
546 ori $mask1b,$mask1b,0x1b1b 634 ori $mask1b,$mask1b,0x1b1b
635 addi $key,$key,16
547 mtctr $acc00 636 mtctr $acc00
548.align 4 637.align 4
549Lenc_compact_loop: 638Lenc_compact_loop:
550 xor $s0,$s0,$t0 639 xor $s0,$s0,$t0
551 xor $s1,$s1,$t1 640 xor $s1,$s1,$t1
552 xor $s2,$s2,$t2
553 xor $s3,$s3,$t3
554 rlwinm $acc00,$s0,`32-24`,24,31 641 rlwinm $acc00,$s0,`32-24`,24,31
642 xor $s2,$s2,$t2
555 rlwinm $acc01,$s1,`32-24`,24,31 643 rlwinm $acc01,$s1,`32-24`,24,31
644 xor $s3,$s3,$t3
556 rlwinm $acc02,$s2,`32-24`,24,31 645 rlwinm $acc02,$s2,`32-24`,24,31
557 rlwinm $acc03,$s3,`32-24`,24,31 646 rlwinm $acc03,$s3,`32-24`,24,31
558 rlwinm $acc04,$s1,`32-16`,24,31 647 rlwinm $acc04,$s1,`32-16`,24,31
@@ -560,48 +649,48 @@ Lenc_compact_loop:
560 rlwinm $acc06,$s3,`32-16`,24,31 649 rlwinm $acc06,$s3,`32-16`,24,31
561 rlwinm $acc07,$s0,`32-16`,24,31 650 rlwinm $acc07,$s0,`32-16`,24,31
562 lbzx $acc00,$Tbl1,$acc00 651 lbzx $acc00,$Tbl1,$acc00
563 lbzx $acc01,$Tbl1,$acc01
564 rlwinm $acc08,$s2,`32-8`,24,31 652 rlwinm $acc08,$s2,`32-8`,24,31
653 lbzx $acc01,$Tbl1,$acc01
565 rlwinm $acc09,$s3,`32-8`,24,31 654 rlwinm $acc09,$s3,`32-8`,24,31
566 lbzx $acc02,$Tbl1,$acc02 655 lbzx $acc02,$Tbl1,$acc02
567 lbzx $acc03,$Tbl1,$acc03
568 rlwinm $acc10,$s0,`32-8`,24,31 656 rlwinm $acc10,$s0,`32-8`,24,31
657 lbzx $acc03,$Tbl1,$acc03
569 rlwinm $acc11,$s1,`32-8`,24,31 658 rlwinm $acc11,$s1,`32-8`,24,31
570 lbzx $acc04,$Tbl1,$acc04 659 lbzx $acc04,$Tbl1,$acc04
571 lbzx $acc05,$Tbl1,$acc05
572 rlwinm $acc12,$s3,`0`,24,31 660 rlwinm $acc12,$s3,`0`,24,31
661 lbzx $acc05,$Tbl1,$acc05
573 rlwinm $acc13,$s0,`0`,24,31 662 rlwinm $acc13,$s0,`0`,24,31
574 lbzx $acc06,$Tbl1,$acc06 663 lbzx $acc06,$Tbl1,$acc06
575 lbzx $acc07,$Tbl1,$acc07
576 rlwinm $acc14,$s1,`0`,24,31 664 rlwinm $acc14,$s1,`0`,24,31
665 lbzx $acc07,$Tbl1,$acc07
577 rlwinm $acc15,$s2,`0`,24,31 666 rlwinm $acc15,$s2,`0`,24,31
578 lbzx $acc08,$Tbl1,$acc08 667 lbzx $acc08,$Tbl1,$acc08
579 lbzx $acc09,$Tbl1,$acc09
580 rlwinm $s0,$acc00,24,0,7 668 rlwinm $s0,$acc00,24,0,7
669 lbzx $acc09,$Tbl1,$acc09
581 rlwinm $s1,$acc01,24,0,7 670 rlwinm $s1,$acc01,24,0,7
582 lbzx $acc10,$Tbl1,$acc10 671 lbzx $acc10,$Tbl1,$acc10
583 lbzx $acc11,$Tbl1,$acc11
584 rlwinm $s2,$acc02,24,0,7 672 rlwinm $s2,$acc02,24,0,7
673 lbzx $acc11,$Tbl1,$acc11
585 rlwinm $s3,$acc03,24,0,7 674 rlwinm $s3,$acc03,24,0,7
586 lbzx $acc12,$Tbl1,$acc12 675 lbzx $acc12,$Tbl1,$acc12
587 lbzx $acc13,$Tbl1,$acc13
588 rlwimi $s0,$acc04,16,8,15 676 rlwimi $s0,$acc04,16,8,15
677 lbzx $acc13,$Tbl1,$acc13
589 rlwimi $s1,$acc05,16,8,15 678 rlwimi $s1,$acc05,16,8,15
590 lbzx $acc14,$Tbl1,$acc14 679 lbzx $acc14,$Tbl1,$acc14
591 lbzx $acc15,$Tbl1,$acc15
592 rlwimi $s2,$acc06,16,8,15 680 rlwimi $s2,$acc06,16,8,15
681 lbzx $acc15,$Tbl1,$acc15
593 rlwimi $s3,$acc07,16,8,15 682 rlwimi $s3,$acc07,16,8,15
594 rlwimi $s0,$acc08,8,16,23 683 rlwimi $s0,$acc08,8,16,23
595 rlwimi $s1,$acc09,8,16,23 684 rlwimi $s1,$acc09,8,16,23
596 rlwimi $s2,$acc10,8,16,23 685 rlwimi $s2,$acc10,8,16,23
597 rlwimi $s3,$acc11,8,16,23 686 rlwimi $s3,$acc11,8,16,23
598 lwz $t0,0($key) 687 lwz $t0,0($key)
599 lwz $t1,4($key)
600 or $s0,$s0,$acc12 688 or $s0,$s0,$acc12
689 lwz $t1,4($key)
601 or $s1,$s1,$acc13 690 or $s1,$s1,$acc13
602 lwz $t2,8($key) 691 lwz $t2,8($key)
603 lwz $t3,12($key)
604 or $s2,$s2,$acc14 692 or $s2,$s2,$acc14
693 lwz $t3,12($key)
605 or $s3,$s3,$acc15 694 or $s3,$s3,$acc15
606 695
607 addi $key,$key,16 696 addi $key,$key,16
@@ -612,12 +701,12 @@ Lenc_compact_loop:
612 and $acc02,$s2,$mask80 701 and $acc02,$s2,$mask80
613 and $acc03,$s3,$mask80 702 and $acc03,$s3,$mask80
614 srwi $acc04,$acc00,7 # r1>>7 703 srwi $acc04,$acc00,7 # r1>>7
615 srwi $acc05,$acc01,7
616 srwi $acc06,$acc02,7
617 srwi $acc07,$acc03,7
618 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f 704 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
705 srwi $acc05,$acc01,7
619 andc $acc09,$s1,$mask80 706 andc $acc09,$s1,$mask80
707 srwi $acc06,$acc02,7
620 andc $acc10,$s2,$mask80 708 andc $acc10,$s2,$mask80
709 srwi $acc07,$acc03,7
621 andc $acc11,$s3,$mask80 710 andc $acc11,$s3,$mask80
622 sub $acc00,$acc00,$acc04 # r1-(r1>>7) 711 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
623 sub $acc01,$acc01,$acc05 712 sub $acc01,$acc01,$acc05
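The and/srwi/andc/sub sequence above is the compact loop's four-way GF(2^8) doubling: $mask80 isolates each byte's top bit, r1-(r1>>7) turns every set top bit into 0x7f, and masking with $mask1b selects the AES reduction constant 0x1b exactly in the bytes that carried. A minimal C sketch of the same SWAR trick (xtime4 is our name, not the script's):

    #include <stdint.h>

    /* Double four GF(2^8) elements packed in one 32-bit word. */
    static uint32_t xtime4(uint32_t r)
    {
        uint32_t r1 = r & 0x80808080u;  /* top bit of each byte (the carries) */
        uint32_t r0 = r & 0x7f7f7f7fu;  /* low seven bits of each byte */
        /* r1 - (r1 >> 7) is 0x7f wherever a top bit was set; masked with
         * 0x1b1b1b1b it contributes the reduction polynomial per carry. */
        return (r0 << 1) ^ ((r1 - (r1 >> 7)) & 0x1b1b1b1bu);
    }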
@@ -633,32 +722,32 @@ Lenc_compact_loop:
633 and $acc03,$acc03,$mask1b 722 and $acc03,$acc03,$mask1b
634 xor $acc00,$acc00,$acc08 # r2 723 xor $acc00,$acc00,$acc08 # r2
635 xor $acc01,$acc01,$acc09 724 xor $acc01,$acc01,$acc09
725 rotlwi $acc12,$s0,16 # ROTATE(r0,16)
636 xor $acc02,$acc02,$acc10 726 xor $acc02,$acc02,$acc10
727 rotlwi $acc13,$s1,16
637 xor $acc03,$acc03,$acc11 728 xor $acc03,$acc03,$acc11
729 rotlwi $acc14,$s2,16
638 730
639 rotlwi $acc12,$s0,16 # ROTATE(r0,16)
640 rotlwi $acc13,$s1,16
641 rotlwi $acc14,$s2,16
642 rotlwi $acc15,$s3,16
643 xor $s0,$s0,$acc00 # r0^r2 731 xor $s0,$s0,$acc00 # r0^r2
732 rotlwi $acc15,$s3,16
644 xor $s1,$s1,$acc01 733 xor $s1,$s1,$acc01
645 xor $s2,$s2,$acc02
646 xor $s3,$s3,$acc03
647 rotrwi $s0,$s0,24 # ROTATE(r2^r0,24) 734 rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
735 xor $s2,$s2,$acc02
648 rotrwi $s1,$s1,24 736 rotrwi $s1,$s1,24
737 xor $s3,$s3,$acc03
649 rotrwi $s2,$s2,24 738 rotrwi $s2,$s2,24
650 rotrwi $s3,$s3,24
651 xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2 739 xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
740 rotrwi $s3,$s3,24
652 xor $s1,$s1,$acc01 741 xor $s1,$s1,$acc01
653 xor $s2,$s2,$acc02 742 xor $s2,$s2,$acc02
654 xor $s3,$s3,$acc03 743 xor $s3,$s3,$acc03
655 rotlwi $acc08,$acc12,8 # ROTATE(r0,24) 744 rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
656 rotlwi $acc09,$acc13,8
657 rotlwi $acc10,$acc14,8
658 rotlwi $acc11,$acc15,8
659 xor $s0,$s0,$acc12 # 745 xor $s0,$s0,$acc12 #
746 rotlwi $acc09,$acc13,8
660 xor $s1,$s1,$acc13 747 xor $s1,$s1,$acc13
748 rotlwi $acc10,$acc14,8
661 xor $s2,$s2,$acc14 749 xor $s2,$s2,$acc14
750 rotlwi $acc11,$acc15,8
662 xor $s3,$s3,$acc15 751 xor $s3,$s3,$acc15
663 xor $s0,$s0,$acc08 # 752 xor $s0,$s0,$acc08 #
664 xor $s1,$s1,$acc09 753 xor $s1,$s1,$acc09
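Spelled out in C, the rotate/xor ladder that follows the doubling computes MixColumns on one big-endian column (note that rotrwi x,24 is the same rotation as rotlwi x,8). A sketch reusing the xtime4 helper from above; both helper names are ours:

    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, int n)
    {
        return (x << n) | (x >> (32 - n));
    }

    static uint32_t xtime4(uint32_t r)   /* four-way GF(2^8) doubling, as above */
    {
        uint32_t r1 = r & 0x80808080u, r0 = r & 0x7f7f7f7fu;
        return (r0 << 1) ^ ((r1 - (r1 >> 7)) & 0x1b1b1b1bu);
    }

    /* One MixColumns column, matching the ROTATE(r2^r0,24), ROTATE(r0,16)
     * and ROTATE(r0,24) annotations in Lenc_compact_loop. */
    static uint32_t mixcolumn(uint32_t r0)
    {
        uint32_t r2 = xtime4(r0);                   /* r2 = 2*r0 */
        return r2 ^ rotl32(r0 ^ r2, 8) ^ rotl32(r0, 16) ^ rotl32(r0, 24);
    }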
@@ -673,14 +762,15 @@ Lenc_compact_done:
673 xor $s2,$s2,$t2 762 xor $s2,$s2,$t2
674 xor $s3,$s3,$t3 763 xor $s3,$s3,$t3
675 blr 764 blr
765 .long 0
766 .byte 0,12,0x14,0,0,0,0,0
676 767
677.globl .AES_decrypt 768.globl .AES_decrypt
678.align 7 769.align 7
679.AES_decrypt: 770.AES_decrypt:
680 mflr r0
681 $STU $sp,-$FRAME($sp) 771 $STU $sp,-$FRAME($sp)
772 mflr r0
682 773
683 $PUSH r0,`$FRAME-$SIZE_T*21`($sp)
684 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) 774 $PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
685 $PUSH r13,`$FRAME-$SIZE_T*19`($sp) 775 $PUSH r13,`$FRAME-$SIZE_T*19`($sp)
686 $PUSH r14,`$FRAME-$SIZE_T*18`($sp) 776 $PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@@ -701,7 +791,14 @@ Lenc_compact_done:
701 $PUSH r29,`$FRAME-$SIZE_T*3`($sp) 791 $PUSH r29,`$FRAME-$SIZE_T*3`($sp)
702 $PUSH r30,`$FRAME-$SIZE_T*2`($sp) 792 $PUSH r30,`$FRAME-$SIZE_T*2`($sp)
703 $PUSH r31,`$FRAME-$SIZE_T*1`($sp) 793 $PUSH r31,`$FRAME-$SIZE_T*1`($sp)
794 $PUSH r0,`$FRAME+$LRSAVE`($sp)
704 795
796 andi. $t0,$inp,3
797 andi. $t1,$out,3
798 or. $t0,$t0,$t1
799 bne Ldec_unaligned
800
801Ldec_unaligned_ok:
705 lwz $s0,0($inp) 802 lwz $s0,0($inp)
706 lwz $s1,4($inp) 803 lwz $s1,4($inp)
707 lwz $s2,8($inp) 804 lwz $s2,8($inp)
@@ -712,8 +809,80 @@ Lenc_compact_done:
712 stw $s1,4($out) 809 stw $s1,4($out)
713 stw $s2,8($out) 810 stw $s2,8($out)
714 stw $s3,12($out) 811 stw $s3,12($out)
812 b Ldec_done
813
814Ldec_unaligned:
815 subfic $t0,$inp,4096
816 subfic $t1,$out,4096
817 andi. $t0,$t0,4096-16
818 beq Ldec_xpage
819 andi. $t1,$t1,4096-16
820 bne Ldec_unaligned_ok
821
822Ldec_xpage:
823 lbz $acc00,0($inp)
824 lbz $acc01,1($inp)
825 lbz $acc02,2($inp)
826 lbz $s0,3($inp)
827 lbz $acc04,4($inp)
828 lbz $acc05,5($inp)
829 lbz $acc06,6($inp)
830 lbz $s1,7($inp)
831 lbz $acc08,8($inp)
832 lbz $acc09,9($inp)
833 lbz $acc10,10($inp)
834 insrwi $s0,$acc00,8,0
835 lbz $s2,11($inp)
836 insrwi $s1,$acc04,8,0
837 lbz $acc12,12($inp)
838 insrwi $s0,$acc01,8,8
839 lbz $acc13,13($inp)
840 insrwi $s1,$acc05,8,8
841 lbz $acc14,14($inp)
842 insrwi $s0,$acc02,8,16
843 lbz $s3,15($inp)
844 insrwi $s1,$acc06,8,16
845 insrwi $s2,$acc08,8,0
846 insrwi $s3,$acc12,8,0
847 insrwi $s2,$acc09,8,8
848 insrwi $s3,$acc13,8,8
849 insrwi $s2,$acc10,8,16
850 insrwi $s3,$acc14,8,16
851
852 bl LAES_Td
853 bl Lppc_AES_decrypt_compact
715 854
716 $POP r0,`$FRAME-$SIZE_T*21`($sp) 855 extrwi $acc00,$s0,8,0
856 extrwi $acc01,$s0,8,8
857 stb $acc00,0($out)
858 extrwi $acc02,$s0,8,16
859 stb $acc01,1($out)
860 stb $acc02,2($out)
861 extrwi $acc04,$s1,8,0
862 stb $s0,3($out)
863 extrwi $acc05,$s1,8,8
864 stb $acc04,4($out)
865 extrwi $acc06,$s1,8,16
866 stb $acc05,5($out)
867 stb $acc06,6($out)
868 extrwi $acc08,$s2,8,0
869 stb $s1,7($out)
870 extrwi $acc09,$s2,8,8
871 stb $acc08,8($out)
872 extrwi $acc10,$s2,8,16
873 stb $acc09,9($out)
874 stb $acc10,10($out)
875 extrwi $acc12,$s3,8,0
876 stb $s2,11($out)
877 extrwi $acc13,$s3,8,8
878 stb $acc12,12($out)
879 extrwi $acc14,$s3,8,16
880 stb $acc13,13($out)
881 stb $acc14,14($out)
882 stb $s3,15($out)
883
884Ldec_done:
885 $POP r0,`$FRAME+$LRSAVE`($sp)
717 $POP $toc,`$FRAME-$SIZE_T*20`($sp) 886 $POP $toc,`$FRAME-$SIZE_T*20`($sp)
718 $POP r13,`$FRAME-$SIZE_T*19`($sp) 887 $POP r13,`$FRAME-$SIZE_T*19`($sp)
719 $POP r14,`$FRAME-$SIZE_T*18`($sp) 888 $POP r14,`$FRAME-$SIZE_T*18`($sp)
@@ -737,18 +906,21 @@ Lenc_compact_done:
737 mtlr r0 906 mtlr r0
738 addi $sp,$sp,$FRAME 907 addi $sp,$sp,$FRAME
739 blr 908 blr
909 .long 0
910 .byte 0,12,4,1,0x80,18,3,0
911 .long 0
740 912
741.align 5 913.align 5
742Lppc_AES_decrypt: 914Lppc_AES_decrypt:
743 lwz $acc00,240($key) 915 lwz $acc00,240($key)
744 lwz $t0,0($key)
745 lwz $t1,4($key)
746 lwz $t2,8($key)
747 lwz $t3,12($key)
748 addi $Tbl1,$Tbl0,3 916 addi $Tbl1,$Tbl0,3
917 lwz $t0,0($key)
749 addi $Tbl2,$Tbl0,2 918 addi $Tbl2,$Tbl0,2
919 lwz $t1,4($key)
750 addi $Tbl3,$Tbl0,1 920 addi $Tbl3,$Tbl0,1
921 lwz $t2,8($key)
751 addi $acc00,$acc00,-1 922 addi $acc00,$acc00,-1
923 lwz $t3,12($key)
752 addi $key,$key,16 924 addi $key,$key,16
753 xor $s0,$s0,$t0 925 xor $s0,$s0,$t0
754 xor $s1,$s1,$t1 926 xor $s1,$s1,$t1
@@ -762,44 +934,44 @@ Ldec_loop:
762 rlwinm $acc02,$s2,`32-24+3`,21,28 934 rlwinm $acc02,$s2,`32-24+3`,21,28
763 rlwinm $acc03,$s3,`32-24+3`,21,28 935 rlwinm $acc03,$s3,`32-24+3`,21,28
764 lwz $t0,0($key) 936 lwz $t0,0($key)
765 lwz $t1,4($key)
766 rlwinm $acc04,$s3,`32-16+3`,21,28 937 rlwinm $acc04,$s3,`32-16+3`,21,28
938 lwz $t1,4($key)
767 rlwinm $acc05,$s0,`32-16+3`,21,28 939 rlwinm $acc05,$s0,`32-16+3`,21,28
768 lwz $t2,8($key) 940 lwz $t2,8($key)
769 lwz $t3,12($key)
770 rlwinm $acc06,$s1,`32-16+3`,21,28 941 rlwinm $acc06,$s1,`32-16+3`,21,28
942 lwz $t3,12($key)
771 rlwinm $acc07,$s2,`32-16+3`,21,28 943 rlwinm $acc07,$s2,`32-16+3`,21,28
772 lwzx $acc00,$Tbl0,$acc00 944 lwzx $acc00,$Tbl0,$acc00
773 lwzx $acc01,$Tbl0,$acc01
774 rlwinm $acc08,$s2,`32-8+3`,21,28 945 rlwinm $acc08,$s2,`32-8+3`,21,28
946 lwzx $acc01,$Tbl0,$acc01
775 rlwinm $acc09,$s3,`32-8+3`,21,28 947 rlwinm $acc09,$s3,`32-8+3`,21,28
776 lwzx $acc02,$Tbl0,$acc02 948 lwzx $acc02,$Tbl0,$acc02
777 lwzx $acc03,$Tbl0,$acc03
778 rlwinm $acc10,$s0,`32-8+3`,21,28 949 rlwinm $acc10,$s0,`32-8+3`,21,28
950 lwzx $acc03,$Tbl0,$acc03
779 rlwinm $acc11,$s1,`32-8+3`,21,28 951 rlwinm $acc11,$s1,`32-8+3`,21,28
780 lwzx $acc04,$Tbl1,$acc04 952 lwzx $acc04,$Tbl1,$acc04
781 lwzx $acc05,$Tbl1,$acc05
782 rlwinm $acc12,$s1,`0+3`,21,28 953 rlwinm $acc12,$s1,`0+3`,21,28
954 lwzx $acc05,$Tbl1,$acc05
783 rlwinm $acc13,$s2,`0+3`,21,28 955 rlwinm $acc13,$s2,`0+3`,21,28
784 lwzx $acc06,$Tbl1,$acc06 956 lwzx $acc06,$Tbl1,$acc06
785 lwzx $acc07,$Tbl1,$acc07
786 rlwinm $acc14,$s3,`0+3`,21,28 957 rlwinm $acc14,$s3,`0+3`,21,28
958 lwzx $acc07,$Tbl1,$acc07
787 rlwinm $acc15,$s0,`0+3`,21,28 959 rlwinm $acc15,$s0,`0+3`,21,28
788 lwzx $acc08,$Tbl2,$acc08 960 lwzx $acc08,$Tbl2,$acc08
789 lwzx $acc09,$Tbl2,$acc09
790 xor $t0,$t0,$acc00 961 xor $t0,$t0,$acc00
962 lwzx $acc09,$Tbl2,$acc09
791 xor $t1,$t1,$acc01 963 xor $t1,$t1,$acc01
792 lwzx $acc10,$Tbl2,$acc10 964 lwzx $acc10,$Tbl2,$acc10
793 lwzx $acc11,$Tbl2,$acc11
794 xor $t2,$t2,$acc02 965 xor $t2,$t2,$acc02
966 lwzx $acc11,$Tbl2,$acc11
795 xor $t3,$t3,$acc03 967 xor $t3,$t3,$acc03
796 lwzx $acc12,$Tbl3,$acc12 968 lwzx $acc12,$Tbl3,$acc12
797 lwzx $acc13,$Tbl3,$acc13
798 xor $t0,$t0,$acc04 969 xor $t0,$t0,$acc04
970 lwzx $acc13,$Tbl3,$acc13
799 xor $t1,$t1,$acc05 971 xor $t1,$t1,$acc05
800 lwzx $acc14,$Tbl3,$acc14 972 lwzx $acc14,$Tbl3,$acc14
801 lwzx $acc15,$Tbl3,$acc15
802 xor $t2,$t2,$acc06 973 xor $t2,$t2,$acc06
974 lwzx $acc15,$Tbl3,$acc15
803 xor $t3,$t3,$acc07 975 xor $t3,$t3,$acc07
804 xor $t0,$t0,$acc08 976 xor $t0,$t0,$acc08
805 xor $t1,$t1,$acc09 977 xor $t1,$t1,$acc09
@@ -815,56 +987,56 @@ Ldec_loop:
815 addi $Tbl2,$Tbl0,2048 987 addi $Tbl2,$Tbl0,2048
816 nop 988 nop
817 lwz $t0,0($key) 989 lwz $t0,0($key)
818 lwz $t1,4($key)
819 rlwinm $acc00,$s0,`32-24`,24,31 990 rlwinm $acc00,$s0,`32-24`,24,31
991 lwz $t1,4($key)
820 rlwinm $acc01,$s1,`32-24`,24,31 992 rlwinm $acc01,$s1,`32-24`,24,31
821 lwz $t2,8($key) 993 lwz $t2,8($key)
822 lwz $t3,12($key)
823 rlwinm $acc02,$s2,`32-24`,24,31 994 rlwinm $acc02,$s2,`32-24`,24,31
995 lwz $t3,12($key)
824 rlwinm $acc03,$s3,`32-24`,24,31 996 rlwinm $acc03,$s3,`32-24`,24,31
825 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 997 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
826 lwz $acc09,`2048+32`($Tbl0)
827 rlwinm $acc04,$s3,`32-16`,24,31 998 rlwinm $acc04,$s3,`32-16`,24,31
999 lwz $acc09,`2048+32`($Tbl0)
828 rlwinm $acc05,$s0,`32-16`,24,31 1000 rlwinm $acc05,$s0,`32-16`,24,31
829 lwz $acc10,`2048+64`($Tbl0) 1001 lwz $acc10,`2048+64`($Tbl0)
830 lwz $acc11,`2048+96`($Tbl0)
831 lbzx $acc00,$Tbl2,$acc00 1002 lbzx $acc00,$Tbl2,$acc00
1003 lwz $acc11,`2048+96`($Tbl0)
832 lbzx $acc01,$Tbl2,$acc01 1004 lbzx $acc01,$Tbl2,$acc01
833 lwz $acc12,`2048+128`($Tbl0) 1005 lwz $acc12,`2048+128`($Tbl0)
834 lwz $acc13,`2048+160`($Tbl0)
835 rlwinm $acc06,$s1,`32-16`,24,31 1006 rlwinm $acc06,$s1,`32-16`,24,31
1007 lwz $acc13,`2048+160`($Tbl0)
836 rlwinm $acc07,$s2,`32-16`,24,31 1008 rlwinm $acc07,$s2,`32-16`,24,31
837 lwz $acc14,`2048+192`($Tbl0) 1009 lwz $acc14,`2048+192`($Tbl0)
838 lwz $acc15,`2048+224`($Tbl0)
839 rlwinm $acc08,$s2,`32-8`,24,31 1010 rlwinm $acc08,$s2,`32-8`,24,31
1011 lwz $acc15,`2048+224`($Tbl0)
840 rlwinm $acc09,$s3,`32-8`,24,31 1012 rlwinm $acc09,$s3,`32-8`,24,31
841 lbzx $acc02,$Tbl2,$acc02 1013 lbzx $acc02,$Tbl2,$acc02
842 lbzx $acc03,$Tbl2,$acc03
843 rlwinm $acc10,$s0,`32-8`,24,31 1014 rlwinm $acc10,$s0,`32-8`,24,31
1015 lbzx $acc03,$Tbl2,$acc03
844 rlwinm $acc11,$s1,`32-8`,24,31 1016 rlwinm $acc11,$s1,`32-8`,24,31
845 lbzx $acc04,$Tbl2,$acc04 1017 lbzx $acc04,$Tbl2,$acc04
846 lbzx $acc05,$Tbl2,$acc05
847 rlwinm $acc12,$s1,`0`,24,31 1018 rlwinm $acc12,$s1,`0`,24,31
1019 lbzx $acc05,$Tbl2,$acc05
848 rlwinm $acc13,$s2,`0`,24,31 1020 rlwinm $acc13,$s2,`0`,24,31
849 lbzx $acc06,$Tbl2,$acc06 1021 lbzx $acc06,$Tbl2,$acc06
850 lbzx $acc07,$Tbl2,$acc07
851 rlwinm $acc14,$s3,`0`,24,31 1022 rlwinm $acc14,$s3,`0`,24,31
1023 lbzx $acc07,$Tbl2,$acc07
852 rlwinm $acc15,$s0,`0`,24,31 1024 rlwinm $acc15,$s0,`0`,24,31
853 lbzx $acc08,$Tbl2,$acc08 1025 lbzx $acc08,$Tbl2,$acc08
854 lbzx $acc09,$Tbl2,$acc09
855 rlwinm $s0,$acc00,24,0,7 1026 rlwinm $s0,$acc00,24,0,7
1027 lbzx $acc09,$Tbl2,$acc09
856 rlwinm $s1,$acc01,24,0,7 1028 rlwinm $s1,$acc01,24,0,7
857 lbzx $acc10,$Tbl2,$acc10 1029 lbzx $acc10,$Tbl2,$acc10
858 lbzx $acc11,$Tbl2,$acc11
859 rlwinm $s2,$acc02,24,0,7 1030 rlwinm $s2,$acc02,24,0,7
1031 lbzx $acc11,$Tbl2,$acc11
860 rlwinm $s3,$acc03,24,0,7 1032 rlwinm $s3,$acc03,24,0,7
861 lbzx $acc12,$Tbl2,$acc12 1033 lbzx $acc12,$Tbl2,$acc12
862 lbzx $acc13,$Tbl2,$acc13
863 rlwimi $s0,$acc04,16,8,15 1034 rlwimi $s0,$acc04,16,8,15
1035 lbzx $acc13,$Tbl2,$acc13
864 rlwimi $s1,$acc05,16,8,15 1036 rlwimi $s1,$acc05,16,8,15
865 lbzx $acc14,$Tbl2,$acc14 1037 lbzx $acc14,$Tbl2,$acc14
866 lbzx $acc15,$Tbl2,$acc15
867 rlwimi $s2,$acc06,16,8,15 1038 rlwimi $s2,$acc06,16,8,15
1039 lbzx $acc15,$Tbl2,$acc15
868 rlwimi $s3,$acc07,16,8,15 1040 rlwimi $s3,$acc07,16,8,15
869 rlwimi $s0,$acc08,8,16,23 1041 rlwimi $s0,$acc08,8,16,23
870 rlwimi $s1,$acc09,8,16,23 1042 rlwimi $s1,$acc09,8,16,23
@@ -879,20 +1051,22 @@ Ldec_loop:
879 xor $s2,$s2,$t2 1051 xor $s2,$s2,$t2
880 xor $s3,$s3,$t3 1052 xor $s3,$s3,$t3
881 blr 1053 blr
1054 .long 0
1055 .byte 0,12,0x14,0,0,0,0,0
882 1056
883.align 4 1057.align 4
884Lppc_AES_decrypt_compact: 1058Lppc_AES_decrypt_compact:
885 lwz $acc00,240($key) 1059 lwz $acc00,240($key)
886 lwz $t0,0($key)
887 lwz $t1,4($key)
888 lwz $t2,8($key)
889 lwz $t3,12($key)
890 addi $Tbl1,$Tbl0,2048 1060 addi $Tbl1,$Tbl0,2048
1061 lwz $t0,0($key)
891 lis $mask80,0x8080 1062 lis $mask80,0x8080
1063 lwz $t1,4($key)
892 lis $mask1b,0x1b1b 1064 lis $mask1b,0x1b1b
893 addi $key,$key,16 1065 lwz $t2,8($key)
894 ori $mask80,$mask80,0x8080 1066 ori $mask80,$mask80,0x8080
1067 lwz $t3,12($key)
895 ori $mask1b,$mask1b,0x1b1b 1068 ori $mask1b,$mask1b,0x1b1b
1069 addi $key,$key,16
896___ 1070___
897$code.=<<___ if ($SIZE_T==8); 1071$code.=<<___ if ($SIZE_T==8);
898 insrdi $mask80,$mask80,32,0 1072 insrdi $mask80,$mask80,32,0
@@ -904,10 +1078,10 @@ $code.=<<___;
904Ldec_compact_loop: 1078Ldec_compact_loop:
905 xor $s0,$s0,$t0 1079 xor $s0,$s0,$t0
906 xor $s1,$s1,$t1 1080 xor $s1,$s1,$t1
907 xor $s2,$s2,$t2
908 xor $s3,$s3,$t3
909 rlwinm $acc00,$s0,`32-24`,24,31 1081 rlwinm $acc00,$s0,`32-24`,24,31
1082 xor $s2,$s2,$t2
910 rlwinm $acc01,$s1,`32-24`,24,31 1083 rlwinm $acc01,$s1,`32-24`,24,31
1084 xor $s3,$s3,$t3
911 rlwinm $acc02,$s2,`32-24`,24,31 1085 rlwinm $acc02,$s2,`32-24`,24,31
912 rlwinm $acc03,$s3,`32-24`,24,31 1086 rlwinm $acc03,$s3,`32-24`,24,31
913 rlwinm $acc04,$s3,`32-16`,24,31 1087 rlwinm $acc04,$s3,`32-16`,24,31
@@ -915,48 +1089,48 @@ Ldec_compact_loop:
915 rlwinm $acc06,$s1,`32-16`,24,31 1089 rlwinm $acc06,$s1,`32-16`,24,31
916 rlwinm $acc07,$s2,`32-16`,24,31 1090 rlwinm $acc07,$s2,`32-16`,24,31
917 lbzx $acc00,$Tbl1,$acc00 1091 lbzx $acc00,$Tbl1,$acc00
918 lbzx $acc01,$Tbl1,$acc01
919 rlwinm $acc08,$s2,`32-8`,24,31 1092 rlwinm $acc08,$s2,`32-8`,24,31
1093 lbzx $acc01,$Tbl1,$acc01
920 rlwinm $acc09,$s3,`32-8`,24,31 1094 rlwinm $acc09,$s3,`32-8`,24,31
921 lbzx $acc02,$Tbl1,$acc02 1095 lbzx $acc02,$Tbl1,$acc02
922 lbzx $acc03,$Tbl1,$acc03
923 rlwinm $acc10,$s0,`32-8`,24,31 1096 rlwinm $acc10,$s0,`32-8`,24,31
1097 lbzx $acc03,$Tbl1,$acc03
924 rlwinm $acc11,$s1,`32-8`,24,31 1098 rlwinm $acc11,$s1,`32-8`,24,31
925 lbzx $acc04,$Tbl1,$acc04 1099 lbzx $acc04,$Tbl1,$acc04
926 lbzx $acc05,$Tbl1,$acc05
927 rlwinm $acc12,$s1,`0`,24,31 1100 rlwinm $acc12,$s1,`0`,24,31
1101 lbzx $acc05,$Tbl1,$acc05
928 rlwinm $acc13,$s2,`0`,24,31 1102 rlwinm $acc13,$s2,`0`,24,31
929 lbzx $acc06,$Tbl1,$acc06 1103 lbzx $acc06,$Tbl1,$acc06
930 lbzx $acc07,$Tbl1,$acc07
931 rlwinm $acc14,$s3,`0`,24,31 1104 rlwinm $acc14,$s3,`0`,24,31
1105 lbzx $acc07,$Tbl1,$acc07
932 rlwinm $acc15,$s0,`0`,24,31 1106 rlwinm $acc15,$s0,`0`,24,31
933 lbzx $acc08,$Tbl1,$acc08 1107 lbzx $acc08,$Tbl1,$acc08
934 lbzx $acc09,$Tbl1,$acc09
935 rlwinm $s0,$acc00,24,0,7 1108 rlwinm $s0,$acc00,24,0,7
1109 lbzx $acc09,$Tbl1,$acc09
936 rlwinm $s1,$acc01,24,0,7 1110 rlwinm $s1,$acc01,24,0,7
937 lbzx $acc10,$Tbl1,$acc10 1111 lbzx $acc10,$Tbl1,$acc10
938 lbzx $acc11,$Tbl1,$acc11
939 rlwinm $s2,$acc02,24,0,7 1112 rlwinm $s2,$acc02,24,0,7
1113 lbzx $acc11,$Tbl1,$acc11
940 rlwinm $s3,$acc03,24,0,7 1114 rlwinm $s3,$acc03,24,0,7
941 lbzx $acc12,$Tbl1,$acc12 1115 lbzx $acc12,$Tbl1,$acc12
942 lbzx $acc13,$Tbl1,$acc13
943 rlwimi $s0,$acc04,16,8,15 1116 rlwimi $s0,$acc04,16,8,15
1117 lbzx $acc13,$Tbl1,$acc13
944 rlwimi $s1,$acc05,16,8,15 1118 rlwimi $s1,$acc05,16,8,15
945 lbzx $acc14,$Tbl1,$acc14 1119 lbzx $acc14,$Tbl1,$acc14
946 lbzx $acc15,$Tbl1,$acc15
947 rlwimi $s2,$acc06,16,8,15 1120 rlwimi $s2,$acc06,16,8,15
1121 lbzx $acc15,$Tbl1,$acc15
948 rlwimi $s3,$acc07,16,8,15 1122 rlwimi $s3,$acc07,16,8,15
949 rlwimi $s0,$acc08,8,16,23 1123 rlwimi $s0,$acc08,8,16,23
950 rlwimi $s1,$acc09,8,16,23 1124 rlwimi $s1,$acc09,8,16,23
951 rlwimi $s2,$acc10,8,16,23 1125 rlwimi $s2,$acc10,8,16,23
952 rlwimi $s3,$acc11,8,16,23 1126 rlwimi $s3,$acc11,8,16,23
953 lwz $t0,0($key) 1127 lwz $t0,0($key)
954 lwz $t1,4($key)
955 or $s0,$s0,$acc12 1128 or $s0,$s0,$acc12
1129 lwz $t1,4($key)
956 or $s1,$s1,$acc13 1130 or $s1,$s1,$acc13
957 lwz $t2,8($key) 1131 lwz $t2,8($key)
958 lwz $t3,12($key)
959 or $s2,$s2,$acc14 1132 or $s2,$s2,$acc14
1133 lwz $t3,12($key)
960 or $s3,$s3,$acc15 1134 or $s3,$s3,$acc15
961 1135
962 addi $key,$key,16 1136 addi $key,$key,16
@@ -1030,12 +1204,12 @@ $code.=<<___ if ($SIZE_T==4);
1030 and $acc02,$s2,$mask80 1204 and $acc02,$s2,$mask80
1031 and $acc03,$s3,$mask80 1205 and $acc03,$s3,$mask80
1032 srwi $acc04,$acc00,7 # r1>>7 1206 srwi $acc04,$acc00,7 # r1>>7
1033 srwi $acc05,$acc01,7
1034 srwi $acc06,$acc02,7
1035 srwi $acc07,$acc03,7
1036 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f 1207 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
1208 srwi $acc05,$acc01,7
1037 andc $acc09,$s1,$mask80 1209 andc $acc09,$s1,$mask80
1210 srwi $acc06,$acc02,7
1038 andc $acc10,$s2,$mask80 1211 andc $acc10,$s2,$mask80
1212 srwi $acc07,$acc03,7
1039 andc $acc11,$s3,$mask80 1213 andc $acc11,$s3,$mask80
1040 sub $acc00,$acc00,$acc04 # r1-(r1>>7) 1214 sub $acc00,$acc00,$acc04 # r1-(r1>>7)
1041 sub $acc01,$acc01,$acc05 1215 sub $acc01,$acc01,$acc05
@@ -1059,12 +1233,12 @@ $code.=<<___ if ($SIZE_T==4);
1059 and $acc06,$acc02,$mask80 1233 and $acc06,$acc02,$mask80
1060 and $acc07,$acc03,$mask80 1234 and $acc07,$acc03,$mask80
1061 srwi $acc08,$acc04,7 # r1>>7 1235 srwi $acc08,$acc04,7 # r1>>7
1062 srwi $acc09,$acc05,7
1063 srwi $acc10,$acc06,7
1064 srwi $acc11,$acc07,7
1065 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f 1236 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
1237 srwi $acc09,$acc05,7
1066 andc $acc13,$acc01,$mask80 1238 andc $acc13,$acc01,$mask80
1239 srwi $acc10,$acc06,7
1067 andc $acc14,$acc02,$mask80 1240 andc $acc14,$acc02,$mask80
1241 srwi $acc11,$acc07,7
1068 andc $acc15,$acc03,$mask80 1242 andc $acc15,$acc03,$mask80
1069 sub $acc04,$acc04,$acc08 # r1-(r1>>7) 1243 sub $acc04,$acc04,$acc08 # r1-(r1>>7)
1070 sub $acc05,$acc05,$acc09 1244 sub $acc05,$acc05,$acc09
@@ -1085,13 +1259,13 @@ $code.=<<___ if ($SIZE_T==4);
1085 1259
1086 and $acc08,$acc04,$mask80 # r1=r4&0x80808080 1260 and $acc08,$acc04,$mask80 # r1=r4&0x80808080
1087 and $acc09,$acc05,$mask80 1261 and $acc09,$acc05,$mask80
1088 and $acc10,$acc06,$mask80
1089 and $acc11,$acc07,$mask80
1090 srwi $acc12,$acc08,7 # r1>>7 1262 srwi $acc12,$acc08,7 # r1>>7
1263 and $acc10,$acc06,$mask80
1091 srwi $acc13,$acc09,7 1264 srwi $acc13,$acc09,7
1265 and $acc11,$acc07,$mask80
1092 srwi $acc14,$acc10,7 1266 srwi $acc14,$acc10,7
1093 srwi $acc15,$acc11,7
1094 sub $acc08,$acc08,$acc12 # r1-(r1>>7) 1267 sub $acc08,$acc08,$acc12 # r1-(r1>>7)
1268 srwi $acc15,$acc11,7
1095 sub $acc09,$acc09,$acc13 1269 sub $acc09,$acc09,$acc13
1096 sub $acc10,$acc10,$acc14 1270 sub $acc10,$acc10,$acc14
1097 sub $acc11,$acc11,$acc15 1271 sub $acc11,$acc11,$acc15
@@ -1124,10 +1298,10 @@ ___
1124$code.=<<___; 1298$code.=<<___;
1125 rotrwi $s0,$s0,8 # = ROTATE(r0,8) 1299 rotrwi $s0,$s0,8 # = ROTATE(r0,8)
1126 rotrwi $s1,$s1,8 1300 rotrwi $s1,$s1,8
1127 rotrwi $s2,$s2,8
1128 rotrwi $s3,$s3,8
1129 xor $s0,$s0,$acc00 # ^= r2^r0 1301 xor $s0,$s0,$acc00 # ^= r2^r0
1302 rotrwi $s2,$s2,8
1130 xor $s1,$s1,$acc01 1303 xor $s1,$s1,$acc01
1304 rotrwi $s3,$s3,8
1131 xor $s2,$s2,$acc02 1305 xor $s2,$s2,$acc02
1132 xor $s3,$s3,$acc03 1306 xor $s3,$s3,$acc03
1133 xor $acc00,$acc00,$acc08 1307 xor $acc00,$acc00,$acc08
@@ -1135,32 +1309,32 @@ $code.=<<___;
1135 xor $acc02,$acc02,$acc10 1309 xor $acc02,$acc02,$acc10
1136 xor $acc03,$acc03,$acc11 1310 xor $acc03,$acc03,$acc11
1137 xor $s0,$s0,$acc04 # ^= r4^r0 1311 xor $s0,$s0,$acc04 # ^= r4^r0
1138 xor $s1,$s1,$acc05
1139 xor $s2,$s2,$acc06
1140 xor $s3,$s3,$acc07
1141 rotrwi $acc00,$acc00,24 1312 rotrwi $acc00,$acc00,24
1313 xor $s1,$s1,$acc05
1142 rotrwi $acc01,$acc01,24 1314 rotrwi $acc01,$acc01,24
1315 xor $s2,$s2,$acc06
1143 rotrwi $acc02,$acc02,24 1316 rotrwi $acc02,$acc02,24
1317 xor $s3,$s3,$acc07
1144 rotrwi $acc03,$acc03,24 1318 rotrwi $acc03,$acc03,24
1145 xor $acc04,$acc04,$acc08 1319 xor $acc04,$acc04,$acc08
1146 xor $acc05,$acc05,$acc09 1320 xor $acc05,$acc05,$acc09
1147 xor $acc06,$acc06,$acc10 1321 xor $acc06,$acc06,$acc10
1148 xor $acc07,$acc07,$acc11 1322 xor $acc07,$acc07,$acc11
1149 xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)] 1323 xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
1150 xor $s1,$s1,$acc09
1151 xor $s2,$s2,$acc10
1152 xor $s3,$s3,$acc11
1153 rotrwi $acc04,$acc04,16 1324 rotrwi $acc04,$acc04,16
1325 xor $s1,$s1,$acc09
1154 rotrwi $acc05,$acc05,16 1326 rotrwi $acc05,$acc05,16
1327 xor $s2,$s2,$acc10
1155 rotrwi $acc06,$acc06,16 1328 rotrwi $acc06,$acc06,16
1329 xor $s3,$s3,$acc11
1156 rotrwi $acc07,$acc07,16 1330 rotrwi $acc07,$acc07,16
1157 xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24) 1331 xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
1158 xor $s1,$s1,$acc01
1159 xor $s2,$s2,$acc02
1160 xor $s3,$s3,$acc03
1161 rotrwi $acc08,$acc08,8 1332 rotrwi $acc08,$acc08,8
1333 xor $s1,$s1,$acc01
1162 rotrwi $acc09,$acc09,8 1334 rotrwi $acc09,$acc09,8
1335 xor $s2,$s2,$acc02
1163 rotrwi $acc10,$acc10,8 1336 rotrwi $acc10,$acc10,8
1337 xor $s3,$s3,$acc03
1164 rotrwi $acc11,$acc11,8 1338 rotrwi $acc11,$acc11,8
1165 xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16) 1339 xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
1166 xor $s1,$s1,$acc05 1340 xor $s1,$s1,$acc05
@@ -1179,7 +1353,9 @@ Ldec_compact_done:
1179 xor $s2,$s2,$t2 1353 xor $s2,$s2,$t2
1180 xor $s3,$s3,$t3 1354 xor $s3,$s3,$t3
1181 blr 1355 blr
1182.long 0 1356 .long 0
1357 .byte 0,12,0x14,0,0,0,0,0
1358
1183.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>" 1359.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
1184.align 7 1360.align 7
1185___ 1361___
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl
index 7e01889298..445a1e6762 100644
--- a/src/lib/libcrypto/aes/asm/aes-s390x.pl
+++ b/src/lib/libcrypto/aes/asm/aes-s390x.pl
@@ -44,12 +44,57 @@
44# Unlike previous version hardware support detection takes place only 44# Unlike previous version hardware support detection takes place only
45# at the moment of key schedule setup, which is denoted in key->rounds. 45# at the moment of key schedule setup, which is denoted in key->rounds.
46# This is done, because deferred key setup can't be made MT-safe, not 46# This is done, because deferred key setup can't be made MT-safe, not
47# for key lengthes longer than 128 bits. 47# for keys longer than 128 bits.
48# 48#
49# Add AES_cbc_encrypt, which gives incredible performance improvement, 49# Add AES_cbc_encrypt, which gives incredible performance improvement,
50# it was measured to be ~6.6x. It's less than previously mentioned 8x, 50# it was measured to be ~6.6x. It's less than previously mentioned 8x,
51# because software implementation was optimized. 51# because software implementation was optimized.
52 52
53# May 2010.
54#
55# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
56# performance improvement over the "generic" counter mode routine relying
57# on the single-block, also hardware-assisted, AES_encrypt. "Up to" refers
58# to the fact that the exact throughput depends on the current stack
59# frame alignment within the 4KB page. In the worst case you get ~75% of
60# the maximum, but *on average* it is as much as ~98%, meaning the worst
61# case is unlikely, like hitting a ravine on a plateau.
62
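For reference, this is the contract AES_ctr32_encrypt implements: only the last 32 bits of the counter block (big-endian, ivec[12..15]) are incremented between blocks, while the upper 96 bits stay fixed. A sketch in C, with block_encrypt standing in for the hardware- or software-assisted single-block AES (both names are ours):

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16],
                               const void *key);

    static void ctr32_encrypt(const uint8_t *in, uint8_t *out, size_t blocks,
                              const void *key, uint8_t ivec[16],
                              block128_f block_encrypt)
    {
        uint8_t ks[16];

        while (blocks--) {
            block_encrypt(ivec, ks, key);           /* keystream block */
            for (int i = 0; i < 16; i++)
                out[i] = in[i] ^ ks[i];
            /* 32-bit big-endian increment; upper 96 bits untouched */
            for (int j = 15; j >= 12 && ++ivec[j] == 0; j--)
                ;
            in += 16;
            out += 16;
        }
    }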
63# November 2010.
64#
65# Adapt for the -m31 build. If the kernel supports what's called the
66# "highgprs" feature on Linux [see /proc/cpuinfo], it's possible to use
67# 64-bit instructions and achieve "64-bit" performance even in a 31-bit
68# legacy application context. The feature is not specific to any
69# particular processor, as long as it's a "z-CPU". The latter implies that
70# the code remains z/Architecture specific. On z990 it was measured to
71# perform 2x better than code generated by gcc 4.3.
72
73# December 2010.
74#
75# Add support for z196 "cipher message with counter" instruction.
76# Note however that it's disengaged, because it was measured to
77# perform ~12% worse than vanilla km-based code...
78
79# February 2011.
80#
81# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
82# instructions, which deliver ~70% improvement at 8KB block size over
83# vanilla km-based code, and ~37% at block sizes around 512 bytes.
84
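The XTS code below advances the tweak by multiplying it by x in GF(2^128) modulo x^128+x^7+x^2+x+1, which is the lghi 0x87 / srag / sllg / xgr sequence seen in .Lxts_km_prepare and .Lxts_enc_loop. The same step in C, on the tweak held as two little-endian 64-bit halves (the function name is ours):

    #include <stdint.h>

    static void xts_tweak_double(uint64_t t[2])    /* t[0] low, t[1] high */
    {
        /* an arithmetic shift broadcasts the outgoing top bit, selecting
         * the 0x87 reduction exactly when the multiply overflows */
        uint64_t rem   = (uint64_t)((int64_t)t[1] >> 63) & 0x87;
        uint64_t carry = t[0] >> 63;      /* bit crossing into the high half */

        t[0] = (t[0] << 1) ^ rem;
        t[1] = (t[1] << 1) | carry;
    }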
85$flavour = shift;
86
87if ($flavour =~ /3[12]/) {
88 $SIZE_T=4;
89 $g="";
90} else {
91 $SIZE_T=8;
92 $g="g";
93}
94
95while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
96open STDOUT,">$output";
97
53$softonly=0; # allow hardware support 98$softonly=0; # allow hardware support
54 99
55$t0="%r0"; $mask="%r0"; 100$t0="%r0"; $mask="%r0";
@@ -69,6 +114,8 @@ $rounds="%r13";
69$ra="%r14"; 114$ra="%r14";
70$sp="%r15"; 115$sp="%r15";
71 116
117$stdframe=16*$SIZE_T+4*8;
118
72sub _data_word() 119sub _data_word()
73{ my $i; 120{ my $i;
74 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } 121 while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -210,7 +257,7 @@ $code.=<<___ if (!$softonly);
210.Lesoft: 257.Lesoft:
211___ 258___
212$code.=<<___; 259$code.=<<___;
213 stmg %r3,$ra,24($sp) 260 stm${g} %r3,$ra,3*$SIZE_T($sp)
214 261
215 llgf $s0,0($inp) 262 llgf $s0,0($inp)
216 llgf $s1,4($inp) 263 llgf $s1,4($inp)
@@ -220,20 +267,20 @@ $code.=<<___;
220 larl $tbl,AES_Te 267 larl $tbl,AES_Te
221 bras $ra,_s390x_AES_encrypt 268 bras $ra,_s390x_AES_encrypt
222 269
223 lg $out,24($sp) 270 l${g} $out,3*$SIZE_T($sp)
224 st $s0,0($out) 271 st $s0,0($out)
225 st $s1,4($out) 272 st $s1,4($out)
226 st $s2,8($out) 273 st $s2,8($out)
227 st $s3,12($out) 274 st $s3,12($out)
228 275
229 lmg %r6,$ra,48($sp) 276 lm${g} %r6,$ra,6*$SIZE_T($sp)
230 br $ra 277 br $ra
231.size AES_encrypt,.-AES_encrypt 278.size AES_encrypt,.-AES_encrypt
232 279
233.type _s390x_AES_encrypt,\@function 280.type _s390x_AES_encrypt,\@function
234.align 16 281.align 16
235_s390x_AES_encrypt: 282_s390x_AES_encrypt:
236 stg $ra,152($sp) 283 st${g} $ra,15*$SIZE_T($sp)
237 x $s0,0($key) 284 x $s0,0($key)
238 x $s1,4($key) 285 x $s1,4($key)
239 x $s2,8($key) 286 x $s2,8($key)
@@ -397,7 +444,7 @@ _s390x_AES_encrypt:
397 or $s2,$i3 444 or $s2,$i3
398 or $s3,$t3 445 or $s3,$t3
399 446
400 lg $ra,152($sp) 447 l${g} $ra,15*$SIZE_T($sp)
401 xr $s0,$t0 448 xr $s0,$t0
402 xr $s1,$t2 449 xr $s1,$t2
403 x $s2,24($key) 450 x $s2,24($key)
@@ -536,7 +583,7 @@ $code.=<<___ if (!$softonly);
536.Ldsoft: 583.Ldsoft:
537___ 584___
538$code.=<<___; 585$code.=<<___;
539 stmg %r3,$ra,24($sp) 586 stm${g} %r3,$ra,3*$SIZE_T($sp)
540 587
541 llgf $s0,0($inp) 588 llgf $s0,0($inp)
542 llgf $s1,4($inp) 589 llgf $s1,4($inp)
@@ -546,20 +593,20 @@ $code.=<<___;
546 larl $tbl,AES_Td 593 larl $tbl,AES_Td
547 bras $ra,_s390x_AES_decrypt 594 bras $ra,_s390x_AES_decrypt
548 595
549 lg $out,24($sp) 596 l${g} $out,3*$SIZE_T($sp)
550 st $s0,0($out) 597 st $s0,0($out)
551 st $s1,4($out) 598 st $s1,4($out)
552 st $s2,8($out) 599 st $s2,8($out)
553 st $s3,12($out) 600 st $s3,12($out)
554 601
555 lmg %r6,$ra,48($sp) 602 lm${g} %r6,$ra,6*$SIZE_T($sp)
556 br $ra 603 br $ra
557.size AES_decrypt,.-AES_decrypt 604.size AES_decrypt,.-AES_decrypt
558 605
559.type _s390x_AES_decrypt,\@function 606.type _s390x_AES_decrypt,\@function
560.align 16 607.align 16
561_s390x_AES_decrypt: 608_s390x_AES_decrypt:
562 stg $ra,152($sp) 609 st${g} $ra,15*$SIZE_T($sp)
563 x $s0,0($key) 610 x $s0,0($key)
564 x $s1,4($key) 611 x $s1,4($key)
565 x $s2,8($key) 612 x $s2,8($key)
@@ -703,7 +750,7 @@ _s390x_AES_decrypt:
703 nr $i1,$mask 750 nr $i1,$mask
704 nr $i2,$mask 751 nr $i2,$mask
705 752
706 lg $ra,152($sp) 753 l${g} $ra,15*$SIZE_T($sp)
707 or $s1,$t1 754 or $s1,$t1
708 l $t0,16($key) 755 l $t0,16($key)
709 l $t1,20($key) 756 l $t1,20($key)
@@ -732,14 +779,15 @@ ___
732$code.=<<___; 779$code.=<<___;
733# void AES_set_encrypt_key(const unsigned char *in, int bits, 780# void AES_set_encrypt_key(const unsigned char *in, int bits,
734# AES_KEY *key) { 781# AES_KEY *key) {
735.globl AES_set_encrypt_key 782.globl private_AES_set_encrypt_key
736.type AES_set_encrypt_key,\@function 783.type private_AES_set_encrypt_key,\@function
737.align 16 784.align 16
738AES_set_encrypt_key: 785private_AES_set_encrypt_key:
786_s390x_AES_set_encrypt_key:
739 lghi $t0,0 787 lghi $t0,0
740 clgr $inp,$t0 788 cl${g}r $inp,$t0
741 je .Lminus1 789 je .Lminus1
742 clgr $key,$t0 790 cl${g}r $key,$t0
743 je .Lminus1 791 je .Lminus1
744 792
745 lghi $t0,128 793 lghi $t0,128
@@ -789,7 +837,8 @@ $code.=<<___ if (!$softonly);
789 je 1f 837 je 1f
790 lg %r1,24($inp) 838 lg %r1,24($inp)
791 stg %r1,24($key) 839 stg %r1,24($key)
7921: st $bits,236($key) # save bits 8401: st $bits,236($key) # save bits [for debugging purposes]
841 lgr $t0,%r5
793 st %r5,240($key) # save km code 842 st %r5,240($key) # save km code
794 lghi %r2,0 843 lghi %r2,0
795 br %r14 844 br %r14
@@ -797,7 +846,7 @@ ___
797$code.=<<___; 846$code.=<<___;
798.align 16 847.align 16
799.Lekey_internal: 848.Lekey_internal:
800 stmg %r6,%r13,48($sp) # all non-volatile regs 849 stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
801 850
802 larl $tbl,AES_Te+2048 851 larl $tbl,AES_Te+2048
803 852
@@ -857,8 +906,9 @@ $code.=<<___;
857 la $key,16($key) # key+=4 906 la $key,16($key) # key+=4
858 la $t3,4($t3) # i++ 907 la $t3,4($t3) # i++
859 brct $rounds,.L128_loop 908 brct $rounds,.L128_loop
909 lghi $t0,10
860 lghi %r2,0 910 lghi %r2,0
861 lmg %r6,%r13,48($sp) 911 lm${g} %r4,%r13,4*$SIZE_T($sp)
862 br $ra 912 br $ra
863 913
864.align 16 914.align 16
@@ -905,8 +955,9 @@ $code.=<<___;
905 st $s2,32($key) 955 st $s2,32($key)
906 st $s3,36($key) 956 st $s3,36($key)
907 brct $rounds,.L192_continue 957 brct $rounds,.L192_continue
958 lghi $t0,12
908 lghi %r2,0 959 lghi %r2,0
909 lmg %r6,%r13,48($sp) 960 lm${g} %r4,%r13,4*$SIZE_T($sp)
910 br $ra 961 br $ra
911 962
912.align 16 963.align 16
@@ -967,8 +1018,9 @@ $code.=<<___;
967 st $s2,40($key) 1018 st $s2,40($key)
968 st $s3,44($key) 1019 st $s3,44($key)
969 brct $rounds,.L256_continue 1020 brct $rounds,.L256_continue
1021 lghi $t0,14
970 lghi %r2,0 1022 lghi %r2,0
971 lmg %r6,%r13,48($sp) 1023 lm${g} %r4,%r13,4*$SIZE_T($sp)
972 br $ra 1024 br $ra
973 1025
974.align 16 1026.align 16
@@ -1011,42 +1063,34 @@ $code.=<<___;
1011.Lminus1: 1063.Lminus1:
1012 lghi %r2,-1 1064 lghi %r2,-1
1013 br $ra 1065 br $ra
1014.size AES_set_encrypt_key,.-AES_set_encrypt_key 1066.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
1015 1067
1016# void AES_set_decrypt_key(const unsigned char *in, int bits, 1068# void AES_set_decrypt_key(const unsigned char *in, int bits,
1017# AES_KEY *key) { 1069# AES_KEY *key) {
1018.globl AES_set_decrypt_key 1070.globl private_AES_set_decrypt_key
1019.type AES_set_decrypt_key,\@function 1071.type private_AES_set_decrypt_key,\@function
1020.align 16 1072.align 16
1021AES_set_decrypt_key: 1073private_AES_set_decrypt_key:
1022 stg $key,32($sp) # I rely on AES_set_encrypt_key to 1074 #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
1023 stg $ra,112($sp) # save non-volatile registers! 1075 st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
1024 bras $ra,AES_set_encrypt_key 1076 bras $ra,_s390x_AES_set_encrypt_key
1025 lg $key,32($sp) 1077 #l${g} $key,4*$SIZE_T($sp)
1026 lg $ra,112($sp) 1078 l${g} $ra,14*$SIZE_T($sp)
1027 ltgr %r2,%r2 1079 ltgr %r2,%r2
1028 bnzr $ra 1080 bnzr $ra
1029___ 1081___
1030$code.=<<___ if (!$softonly); 1082$code.=<<___ if (!$softonly);
1031 l $t0,240($key) 1083 #l $t0,240($key)
1032 lhi $t1,16 1084 lhi $t1,16
1033 cr $t0,$t1 1085 cr $t0,$t1
1034 jl .Lgo 1086 jl .Lgo
1035 oill $t0,0x80 # set "decrypt" bit 1087 oill $t0,0x80 # set "decrypt" bit
1036 st $t0,240($key) 1088 st $t0,240($key)
1037 br $ra 1089 br $ra
1038
1039.align 16
1040.Ldkey_internal:
1041 stg $key,32($sp)
1042 stg $ra,40($sp)
1043 bras $ra,.Lekey_internal
1044 lg $key,32($sp)
1045 lg $ra,40($sp)
1046___ 1090___
1047$code.=<<___; 1091$code.=<<___;
1048 1092.align 16
1049.Lgo: llgf $rounds,240($key) 1093.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
1050 la $i1,0($key) 1094 la $i1,0($key)
1051 sllg $i2,$rounds,4 1095 sllg $i2,$rounds,4
1052 la $i2,0($i2,$key) 1096 la $i2,0($i2,$key)
@@ -1123,13 +1167,14 @@ $code.=<<___;
1123 la $key,4($key) 1167 la $key,4($key)
1124 brct $rounds,.Lmix 1168 brct $rounds,.Lmix
1125 1169
1126 lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! 1170 lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
1127 lghi %r2,0 1171 lghi %r2,0
1128 br $ra 1172 br $ra
1129.size AES_set_decrypt_key,.-AES_set_decrypt_key 1173.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
1130___ 1174___
1131 1175
1132#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, 1176########################################################################
1177# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1133# size_t length, const AES_KEY *key, 1178# size_t length, const AES_KEY *key,
1134# unsigned char *ivec, const int enc) 1179# unsigned char *ivec, const int enc)
1135{ 1180{
@@ -1163,7 +1208,7 @@ $code.=<<___ if (!$softonly);
1163 l %r0,240($key) # load kmc code 1208 l %r0,240($key) # load kmc code
1164 lghi $key,15 # res=len%16, len-=res; 1209 lghi $key,15 # res=len%16, len-=res;
1165 ngr $key,$len 1210 ngr $key,$len
1166 slgr $len,$key 1211 sl${g}r $len,$key
1167 la %r1,16($sp) # parameter block - ivec || key 1212 la %r1,16($sp) # parameter block - ivec || key
1168 jz .Lkmc_truncated 1213 jz .Lkmc_truncated
1169 .long 0xb92f0042 # kmc %r4,%r2 1214 .long 0xb92f0042 # kmc %r4,%r2
@@ -1181,34 +1226,34 @@ $code.=<<___ if (!$softonly);
1181 tmll %r0,0x80 1226 tmll %r0,0x80
1182 jnz .Lkmc_truncated_dec 1227 jnz .Lkmc_truncated_dec
1183 lghi %r1,0 1228 lghi %r1,0
1184 stg %r1,128($sp) 1229 stg %r1,16*$SIZE_T($sp)
1185 stg %r1,136($sp) 1230 stg %r1,16*$SIZE_T+8($sp)
1186 bras %r1,1f 1231 bras %r1,1f
1187 mvc 128(1,$sp),0($inp) 1232 mvc 16*$SIZE_T(1,$sp),0($inp)
11881: ex $key,0(%r1) 12331: ex $key,0(%r1)
1189 la %r1,16($sp) # restore parameter block 1234 la %r1,16($sp) # restore parameter block
1190 la $inp,128($sp) 1235 la $inp,16*$SIZE_T($sp)
1191 lghi $len,16 1236 lghi $len,16
1192 .long 0xb92f0042 # kmc %r4,%r2 1237 .long 0xb92f0042 # kmc %r4,%r2
1193 j .Lkmc_done 1238 j .Lkmc_done
1194.align 16 1239.align 16
1195.Lkmc_truncated_dec: 1240.Lkmc_truncated_dec:
1196 stg $out,64($sp) 1241 st${g} $out,4*$SIZE_T($sp)
1197 la $out,128($sp) 1242 la $out,16*$SIZE_T($sp)
1198 lghi $len,16 1243 lghi $len,16
1199 .long 0xb92f0042 # kmc %r4,%r2 1244 .long 0xb92f0042 # kmc %r4,%r2
1200 lg $out,64($sp) 1245 l${g} $out,4*$SIZE_T($sp)
1201 bras %r1,2f 1246 bras %r1,2f
1202 mvc 0(1,$out),128($sp) 1247 mvc 0(1,$out),16*$SIZE_T($sp)
12032: ex $key,0(%r1) 12482: ex $key,0(%r1)
1204 j .Lkmc_done 1249 j .Lkmc_done
1205.align 16 1250.align 16
1206.Lcbc_software: 1251.Lcbc_software:
1207___ 1252___
1208$code.=<<___; 1253$code.=<<___;
1209 stmg $key,$ra,40($sp) 1254 stm${g} $key,$ra,5*$SIZE_T($sp)
1210 lhi %r0,0 1255 lhi %r0,0
1211 cl %r0,164($sp) 1256 cl %r0,`$stdframe+$SIZE_T-4`($sp)
1212 je .Lcbc_decrypt 1257 je .Lcbc_decrypt
1213 1258
1214 larl $tbl,AES_Te 1259 larl $tbl,AES_Te
@@ -1219,10 +1264,10 @@ $code.=<<___;
1219 llgf $s3,12($ivp) 1264 llgf $s3,12($ivp)
1220 1265
1221 lghi $t0,16 1266 lghi $t0,16
1222 slgr $len,$t0 1267 sl${g}r $len,$t0
1223 brc 4,.Lcbc_enc_tail # if borrow 1268 brc 4,.Lcbc_enc_tail # if borrow
1224.Lcbc_enc_loop: 1269.Lcbc_enc_loop:
1225 stmg $inp,$out,16($sp) 1270 stm${g} $inp,$out,2*$SIZE_T($sp)
1226 x $s0,0($inp) 1271 x $s0,0($inp)
1227 x $s1,4($inp) 1272 x $s1,4($inp)
1228 x $s2,8($inp) 1273 x $s2,8($inp)
@@ -1231,7 +1276,7 @@ $code.=<<___;
1231 1276
1232 bras $ra,_s390x_AES_encrypt 1277 bras $ra,_s390x_AES_encrypt
1233 1278
1234 lmg $inp,$key,16($sp) 1279 lm${g} $inp,$key,2*$SIZE_T($sp)
1235 st $s0,0($out) 1280 st $s0,0($out)
1236 st $s1,4($out) 1281 st $s1,4($out)
1237 st $s2,8($out) 1282 st $s2,8($out)
@@ -1240,33 +1285,33 @@ $code.=<<___;
1240 la $inp,16($inp) 1285 la $inp,16($inp)
1241 la $out,16($out) 1286 la $out,16($out)
1242 lghi $t0,16 1287 lghi $t0,16
1243 ltgr $len,$len 1288 lt${g}r $len,$len
1244 jz .Lcbc_enc_done 1289 jz .Lcbc_enc_done
1245 slgr $len,$t0 1290 sl${g}r $len,$t0
1246 brc 4,.Lcbc_enc_tail # if borrow 1291 brc 4,.Lcbc_enc_tail # if borrow
1247 j .Lcbc_enc_loop 1292 j .Lcbc_enc_loop
1248.align 16 1293.align 16
1249.Lcbc_enc_done: 1294.Lcbc_enc_done:
1250 lg $ivp,48($sp) 1295 l${g} $ivp,6*$SIZE_T($sp)
1251 st $s0,0($ivp) 1296 st $s0,0($ivp)
1252 st $s1,4($ivp) 1297 st $s1,4($ivp)
1253 st $s2,8($ivp) 1298 st $s2,8($ivp)
1254 st $s3,12($ivp) 1299 st $s3,12($ivp)
1255 1300
1256 lmg %r7,$ra,56($sp) 1301 lm${g} %r7,$ra,7*$SIZE_T($sp)
1257 br $ra 1302 br $ra
1258 1303
1259.align 16 1304.align 16
1260.Lcbc_enc_tail: 1305.Lcbc_enc_tail:
1261 aghi $len,15 1306 aghi $len,15
1262 lghi $t0,0 1307 lghi $t0,0
1263 stg $t0,128($sp) 1308 stg $t0,16*$SIZE_T($sp)
1264 stg $t0,136($sp) 1309 stg $t0,16*$SIZE_T+8($sp)
1265 bras $t1,3f 1310 bras $t1,3f
1266 mvc 128(1,$sp),0($inp) 1311 mvc 16*$SIZE_T(1,$sp),0($inp)
12673: ex $len,0($t1) 13123: ex $len,0($t1)
1268 lghi $len,0 1313 lghi $len,0
1269 la $inp,128($sp) 1314 la $inp,16*$SIZE_T($sp)
1270 j .Lcbc_enc_loop 1315 j .Lcbc_enc_loop
1271 1316
1272.align 16 1317.align 16
@@ -1275,10 +1320,10 @@ $code.=<<___;
1275 1320
1276 lg $t0,0($ivp) 1321 lg $t0,0($ivp)
1277 lg $t1,8($ivp) 1322 lg $t1,8($ivp)
1278 stmg $t0,$t1,128($sp) 1323 stmg $t0,$t1,16*$SIZE_T($sp)
1279 1324
1280.Lcbc_dec_loop: 1325.Lcbc_dec_loop:
1281 stmg $inp,$out,16($sp) 1326 stm${g} $inp,$out,2*$SIZE_T($sp)
1282 llgf $s0,0($inp) 1327 llgf $s0,0($inp)
1283 llgf $s1,4($inp) 1328 llgf $s1,4($inp)
1284 llgf $s2,8($inp) 1329 llgf $s2,8($inp)
@@ -1287,7 +1332,7 @@ $code.=<<___;
1287 1332
1288 bras $ra,_s390x_AES_decrypt 1333 bras $ra,_s390x_AES_decrypt
1289 1334
1290 lmg $inp,$key,16($sp) 1335 lm${g} $inp,$key,2*$SIZE_T($sp)
1291 sllg $s0,$s0,32 1336 sllg $s0,$s0,32
1292 sllg $s2,$s2,32 1337 sllg $s2,$s2,32
1293 lr $s0,$s1 1338 lr $s0,$s1
@@ -1295,15 +1340,15 @@ $code.=<<___;
1295 1340
1296 lg $t0,0($inp) 1341 lg $t0,0($inp)
1297 lg $t1,8($inp) 1342 lg $t1,8($inp)
1298 xg $s0,128($sp) 1343 xg $s0,16*$SIZE_T($sp)
1299 xg $s2,136($sp) 1344 xg $s2,16*$SIZE_T+8($sp)
1300 lghi $s1,16 1345 lghi $s1,16
1301 slgr $len,$s1 1346 sl${g}r $len,$s1
1302 brc 4,.Lcbc_dec_tail # if borrow 1347 brc 4,.Lcbc_dec_tail # if borrow
1303 brc 2,.Lcbc_dec_done # if zero 1348 brc 2,.Lcbc_dec_done # if zero
1304 stg $s0,0($out) 1349 stg $s0,0($out)
1305 stg $s2,8($out) 1350 stg $s2,8($out)
1306 stmg $t0,$t1,128($sp) 1351 stmg $t0,$t1,16*$SIZE_T($sp)
1307 1352
1308 la $inp,16($inp) 1353 la $inp,16($inp)
1309 la $out,16($out) 1354 la $out,16($out)
@@ -1313,7 +1358,7 @@ $code.=<<___;
1313 stg $s0,0($out) 1358 stg $s0,0($out)
1314 stg $s2,8($out) 1359 stg $s2,8($out)
1315.Lcbc_dec_exit: 1360.Lcbc_dec_exit:
1316 lmg $ivp,$ra,48($sp) 1361 lm${g} %r6,$ra,6*$SIZE_T($sp)
1317 stmg $t0,$t1,0($ivp) 1362 stmg $t0,$t1,0($ivp)
1318 1363
1319 br $ra 1364 br $ra
@@ -1321,19 +1366,889 @@ $code.=<<___;
1321.align 16 1366.align 16
1322.Lcbc_dec_tail: 1367.Lcbc_dec_tail:
1323 aghi $len,15 1368 aghi $len,15
1324 stg $s0,128($sp) 1369 stg $s0,16*$SIZE_T($sp)
1325 stg $s2,136($sp) 1370 stg $s2,16*$SIZE_T+8($sp)
1326 bras $s1,4f 1371 bras $s1,4f
1327 mvc 0(1,$out),128($sp) 1372 mvc 0(1,$out),16*$SIZE_T($sp)
13284: ex $len,0($s1) 13734: ex $len,0($s1)
1329 j .Lcbc_dec_exit 1374 j .Lcbc_dec_exit
1330.size AES_cbc_encrypt,.-AES_cbc_encrypt 1375.size AES_cbc_encrypt,.-AES_cbc_encrypt
1331.comm OPENSSL_s390xcap_P,8,8 1376___
1377}
1378########################################################################
1379# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
1380# size_t blocks, const AES_KEY *key,
1381# const unsigned char *ivec)
1382{
1383my $inp="%r2";
1384my $out="%r4"; # blocks and out are swapped
1385my $len="%r3";
1386my $key="%r5"; my $iv0="%r5";
1387my $ivp="%r6";
1388my $fp ="%r7";
1389
1390$code.=<<___;
1391.globl AES_ctr32_encrypt
1392.type AES_ctr32_encrypt,\@function
1393.align 16
1394AES_ctr32_encrypt:
1395 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1396 xgr %r4,%r3
1397 xgr %r3,%r4
1398 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
1399___
1400$code.=<<___ if (!$softonly);
1401 l %r0,240($key)
1402 lhi %r1,16
1403 clr %r0,%r1
1404 jl .Lctr32_software
1405
1406 stm${g} %r6,$s3,6*$SIZE_T($sp)
1407
1408 slgr $out,$inp
1409 la %r1,0($key) # %r1 is permanent copy of $key
1410 lg $iv0,0($ivp) # load ivec
1411 lg $ivp,8($ivp)
1412
1413 # prepare and allocate stack frame at the top of 4K page
1414	# with 1K reserved for possible signal handling
1415	lghi	$s0,-1024-256-16 # guarantee at least a 256-byte buffer
1416 lghi $s1,-4096
1417 algr $s0,$sp
1418 lgr $fp,$sp
1419 ngr $s0,$s1 # align at page boundary
1420 slgr $fp,$s0 # total buffer size
1421 lgr $s2,$sp
1422 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1423 slgr $fp,$s1 # deduct reservation to get usable buffer size
1424	# buffer size is at least 256 and at most 3072+256-16
1425
1426 la $sp,1024($s0) # alloca
1427 srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
1428 st${g} $s2,0($sp) # back-chain
1429 st${g} $fp,$SIZE_T($sp)
1430
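In C, the frame carving above amounts to the following arithmetic; a sketch only, ignoring the real ABI back-chain bookkeeping (names are ours). Pinning the buffer to a fixed offset within its 4KB page is what makes the km throughput independent of the caller's stack alignment, per the May 2010 note above:

    #include <stddef.h>
    #include <stdint.h>

    /* Carve a scratch buffer at the top of the current 4KB page, keeping
     * 1KB plus slack below it for possible signal handling. */
    static void carve_page_top(uintptr_t sp, uintptr_t *buf, size_t *usable)
    {
        uintptr_t base = (sp - 1024 - 256 - 16) & ~(uintptr_t)0xfff;

        *buf    = base + 1024;                    /* the "alloca" result */
        *usable = (size_t)(sp - base) - (1024 + 16);
    }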
1431 slgr $len,$fp
1432 brc 1,.Lctr32_hw_switch # not zero, no borrow
1433 algr $fp,$len # input is shorter than allocated buffer
1434 lghi $len,0
1435 st${g} $fp,$SIZE_T($sp)
1436
1437.Lctr32_hw_switch:
1438___
1439$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower
1440 larl $s0,OPENSSL_s390xcap_P
1441 lg $s0,8($s0)
1442 tmhh $s0,0x0004 # check for message_security-assist-4
1443 jz .Lctr32_km_loop
1444
1445 llgfr $s0,%r0
1446 lgr $s1,%r1
1447 lghi %r0,0
1448 la %r1,16($sp)
1449 .long 0xb92d2042 # kmctr %r4,%r2,%r2
1450
1451 llihh %r0,0x8000 # check if kmctr supports the function code
1452 srlg %r0,%r0,0($s0)
1453 ng %r0,16($sp)
1454 lgr %r0,$s0
1455 lgr %r1,$s1
1456 jz .Lctr32_km_loop
1457
1458####### kmctr code
1459 algr $out,$inp # restore $out
1460 lgr $s1,$len # $s1 undertakes $len
1461 j .Lctr32_kmctr_loop
1462.align 16
1463.Lctr32_kmctr_loop:
1464 la $s2,16($sp)
1465 lgr $s3,$fp
1466.Lctr32_kmctr_prepare:
1467 stg $iv0,0($s2)
1468 stg $ivp,8($s2)
1469 la $s2,16($s2)
1470 ahi $ivp,1 # 32-bit increment, preserves upper half
1471 brct $s3,.Lctr32_kmctr_prepare
1472
1473 #la $inp,0($inp) # inp
1474 sllg $len,$fp,4 # len
1475 #la $out,0($out) # out
1476 la $s2,16($sp) # iv
1477 .long 0xb92da042 # kmctr $out,$s2,$inp
1478 brc 1,.-4 # pay attention to "partial completion"
1479
1480 slgr $s1,$fp
1481 brc 1,.Lctr32_kmctr_loop # not zero, no borrow
1482 algr $fp,$s1
1483 lghi $s1,0
1484 brc 4+1,.Lctr32_kmctr_loop # not zero
1485
1486 l${g} $sp,0($sp)
1487 lm${g} %r6,$s3,6*$SIZE_T($sp)
1488 br $ra
1489.align 16
1490___
1491$code.=<<___;
1492.Lctr32_km_loop:
1493 la $s2,16($sp)
1494 lgr $s3,$fp
1495.Lctr32_km_prepare:
1496 stg $iv0,0($s2)
1497 stg $ivp,8($s2)
1498 la $s2,16($s2)
1499 ahi $ivp,1 # 32-bit increment, preserves upper half
1500 brct $s3,.Lctr32_km_prepare
1501
1502 la $s0,16($sp) # inp
1503 sllg $s1,$fp,4 # len
1504 la $s2,16($sp) # out
1505 .long 0xb92e00a8 # km %r10,%r8
1506 brc 1,.-4 # pay attention to "partial completion"
1507
1508 la $s2,16($sp)
1509 lgr $s3,$fp
1510 slgr $s2,$inp
1511.Lctr32_km_xor:
1512 lg $s0,0($inp)
1513 lg $s1,8($inp)
1514 xg $s0,0($s2,$inp)
1515 xg $s1,8($s2,$inp)
1516 stg $s0,0($out,$inp)
1517 stg $s1,8($out,$inp)
1518 la $inp,16($inp)
1519 brct $s3,.Lctr32_km_xor
1520
1521 slgr $len,$fp
1522 brc 1,.Lctr32_km_loop # not zero, no borrow
1523 algr $fp,$len
1524 lghi $len,0
1525 brc 4+1,.Lctr32_km_loop # not zero
1526
1527 l${g} $s0,0($sp)
1528 l${g} $s1,$SIZE_T($sp)
1529 la $s2,16($sp)
1530.Lctr32_km_zap:
1531 stg $s0,0($s2)
1532 stg $s0,8($s2)
1533 la $s2,16($s2)
1534 brct $s1,.Lctr32_km_zap
1535
1536 la $sp,0($s0)
1537 lm${g} %r6,$s3,6*$SIZE_T($sp)
1538 br $ra
1539.align 16
1540.Lctr32_software:
1541___
1542$code.=<<___;
1543 stm${g} $key,$ra,5*$SIZE_T($sp)
1544 sl${g}r $inp,$out
1545 larl $tbl,AES_Te
1546 llgf $t1,12($ivp)
1547
1548.Lctr32_loop:
1549 stm${g} $inp,$out,2*$SIZE_T($sp)
1550 llgf $s0,0($ivp)
1551 llgf $s1,4($ivp)
1552 llgf $s2,8($ivp)
1553 lgr $s3,$t1
1554 st $t1,16*$SIZE_T($sp)
1555 lgr %r4,$key
1556
1557 bras $ra,_s390x_AES_encrypt
1558
1559 lm${g} $inp,$ivp,2*$SIZE_T($sp)
1560 llgf $t1,16*$SIZE_T($sp)
1561 x $s0,0($inp,$out)
1562 x $s1,4($inp,$out)
1563 x $s2,8($inp,$out)
1564 x $s3,12($inp,$out)
1565 stm $s0,$s3,0($out)
1566
1567 la $out,16($out)
1568 ahi $t1,1 # 32-bit increment
1569 brct $len,.Lctr32_loop
1570
1571 lm${g} %r6,$ra,6*$SIZE_T($sp)
1572 br $ra
1573.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
1574___
1575}
1576
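Because km only implements ECB, the hardware CTR path above first materializes a run of successive counter blocks in the page-aligned buffer, encrypts them with a single km invocation, and XORs the result into the input. A sketch of that strategy; ecb_encrypt stands in for km and all names are ours:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void ctr32_batch(const uint8_t *in, uint8_t *out, size_t blocks,
                            uint8_t ivec[16], uint8_t *buf /* blocks*16 bytes */,
                            void (*ecb_encrypt)(uint8_t *data, size_t len))
    {
        for (size_t i = 0; i < blocks; i++) {       /* .Lctr32_km_prepare */
            memcpy(buf + 16 * i, ivec, 16);
            for (int j = 15; j >= 12 && ++ivec[j] == 0; j--)
                ;                                   /* 32-bit BE increment */
        }
        ecb_encrypt(buf, blocks * 16);              /* one km over the batch */
        for (size_t i = 0; i < 16 * blocks; i++)    /* .Lctr32_km_xor */
            out[i] = in[i] ^ buf[i];
    }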
1577########################################################################
1578# void AES_xts_encrypt(const char *inp,char *out,size_t len,
1579# const AES_KEY *key1, const AES_KEY *key2,
1580# const unsigned char iv[16]);
1581#
1582{
1583my $inp="%r2";
1584my $out="%r4"; # len and out are swapped
1585my $len="%r3";
1586my $key1="%r5"; # $i1
1587my $key2="%r6"; # $i2
1588my $fp="%r7"; # $i3
1589my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
1590
1591$code.=<<___;
1592.type _s390x_xts_km,\@function
1593.align 16
1594_s390x_xts_km:
1595___
1596$code.=<<___ if(1);
1597 llgfr $s0,%r0 # put aside the function code
1598 lghi $s1,0x7f
1599 nr $s1,%r0
1600 lghi %r0,0 # query capability vector
1601 la %r1,2*$SIZE_T($sp)
1602 .long 0xb92e0042 # km %r4,%r2
1603 llihh %r1,0x8000
1604 srlg %r1,%r1,32($s1) # check for 32+function code
1605 ng %r1,2*$SIZE_T($sp)
1606 lgr %r0,$s0 # restore the function code
1607 la %r1,0($key1) # restore $key1
1608 jz .Lxts_km_vanilla
1609
1610 lmg $i2,$i3,$tweak($sp) # put aside the tweak value
1611 algr $out,$inp
1612
1613 oill %r0,32 # switch to xts function code
1614 aghi $s1,-18 #
1615 sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
1616 la %r1,$tweak-16($sp)
1617 slgr %r1,$s1 # parameter block position
1618 lmg $s0,$s3,0($key1) # load 256 bits of key material,
1619 stmg $s0,$s3,0(%r1) # and copy it to parameter block.
1620 # yes, it contains junk and overlaps
1621	# with the tweak in the 128-bit case.
1622	# it's done to avoid a conditional
1623	# branch.
1624 stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
1625
1626 .long 0xb92e0042 # km %r4,%r2
1627 brc 1,.-4 # pay attention to "partial completion"
1628
1629 lrvg $s0,$tweak+0($sp) # load the last tweak
1630 lrvg $s1,$tweak+8($sp)
1631 stmg %r0,%r3,$tweak-32(%r1) # wipe copy of the key
1632
1633 nill %r0,0xffdf # switch back to original function code
1634 la %r1,0($key1) # restore pointer to $key1
1635 slgr $out,$inp
1636
1637 llgc $len,2*$SIZE_T-1($sp)
1638 nill $len,0x0f # $len%=16
1639 br $ra
1640
1641.align 16
1642.Lxts_km_vanilla:
1643___
1644$code.=<<___;
1645 # prepare and allocate stack frame at the top of 4K page
1646	# with 1K reserved for possible signal handling
1647	lghi	$s0,-1024-256-16 # guarantee at least a 256-byte buffer
1648 lghi $s1,-4096
1649 algr $s0,$sp
1650 lgr $fp,$sp
1651 ngr $s0,$s1 # align at page boundary
1652 slgr $fp,$s0 # total buffer size
1653 lgr $s2,$sp
1654 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1655 slgr $fp,$s1 # deduct reservation to get usable buffer size
1656	# buffer size is at least 256 and at most 3072+256-16
1657
1658 la $sp,1024($s0) # alloca
1659 nill $fp,0xfff0 # round to 16*n
1660 st${g} $s2,0($sp) # back-chain
1661 nill $len,0xfff0 # redundant
1662 st${g} $fp,$SIZE_T($sp)
1663
1664 slgr $len,$fp
1665 brc 1,.Lxts_km_go # not zero, no borrow
1666 algr $fp,$len # input is shorter than allocated buffer
1667 lghi $len,0
1668 st${g} $fp,$SIZE_T($sp)
1669
1670.Lxts_km_go:
1671 lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
1672 lrvg $s1,$tweak+8($s2)
1673
1674 la $s2,16($sp) # vector of ascending tweak values
1675 slgr $s2,$inp
1676 srlg $s3,$fp,4
1677 j .Lxts_km_start
1678
1679.Lxts_km_loop:
1680 la $s2,16($sp)
1681 slgr $s2,$inp
1682 srlg $s3,$fp,4
1683.Lxts_km_prepare:
1684 lghi $i1,0x87
1685 srag $i2,$s1,63 # broadcast upper bit
1686 ngr $i1,$i2 # rem
1687 srlg $i2,$s0,63 # carry bit from lower half
1688 sllg $s0,$s0,1
1689 sllg $s1,$s1,1
1690 xgr $s0,$i1
1691 ogr $s1,$i2
1692.Lxts_km_start:
1693 lrvgr $i1,$s0 # flip byte order
1694 lrvgr $i2,$s1
1695 stg $i1,0($s2,$inp)
1696 stg $i2,8($s2,$inp)
1697 xg $i1,0($inp)
1698 xg $i2,8($inp)
1699 stg $i1,0($out,$inp)
1700 stg $i2,8($out,$inp)
1701 la $inp,16($inp)
1702 brct $s3,.Lxts_km_prepare
1703
1704 slgr $inp,$fp # rewind $inp
1705 la $s2,0($out,$inp)
1706 lgr $s3,$fp
1707 .long 0xb92e00aa # km $s2,$s2
1708 brc 1,.-4 # pay attention to "partial completion"
1709
1710 la $s2,16($sp)
1711 slgr $s2,$inp
1712 srlg $s3,$fp,4
1713.Lxts_km_xor:
1714 lg $i1,0($out,$inp)
1715 lg $i2,8($out,$inp)
1716 xg $i1,0($s2,$inp)
1717 xg $i2,8($s2,$inp)
1718 stg $i1,0($out,$inp)
1719 stg $i2,8($out,$inp)
1720 la $inp,16($inp)
1721 brct $s3,.Lxts_km_xor
1722
1723 slgr $len,$fp
1724 brc 1,.Lxts_km_loop # not zero, no borrow
1725 algr $fp,$len
1726 lghi $len,0
1727 brc 4+1,.Lxts_km_loop # not zero
1728
1729 l${g} $i1,0($sp) # back-chain
1730 llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
1731 la $i2,16($sp)
1732 srlg $fp,$fp,4
1733.Lxts_km_zap:
1734 stg $i1,0($i2)
1735 stg $i1,8($i2)
1736 la $i2,16($i2)
1737 brct $fp,.Lxts_km_zap
1738
1739 la $sp,0($i1)
1740 llgc $len,2*$SIZE_T-1($i1)
1741 nill $len,0x0f # $len%=16
1742 bzr $ra
1743
1744 # generate one more tweak...
1745 lghi $i1,0x87
1746 srag $i2,$s1,63 # broadcast upper bit
1747 ngr $i1,$i2 # rem
1748 srlg $i2,$s0,63 # carry bit from lower half
1749 sllg $s0,$s0,1
1750 sllg $s1,$s1,1
1751 xgr $s0,$i1
1752 ogr $s1,$i2
1753
1754 ltr $len,$len # clear zero flag
1755 br $ra
1756.size _s390x_xts_km,.-_s390x_xts_km
1757
1758.globl AES_xts_encrypt
1759.type AES_xts_encrypt,\@function
1760.align 16
1761AES_xts_encrypt:
1762 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1763 xgr %r4,%r3
1764 xgr %r3,%r4
1765___
1766$code.=<<___ if ($SIZE_T==4);
1767 llgfr $len,$len
1768___
1769$code.=<<___;
1770 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1771	srag	$len,$len,4	# formally wrong, because it sign-
1772				# extends, but who can afford asking
1773 # to process more than 2^63-1 bytes?
1774 # I use it, because it sets condition
1775 # code...
1776 bcr 8,$ra # abort if zero (i.e. less than 16)
1777___
1778$code.=<<___ if (!$softonly);
1779 llgf %r0,240($key2)
1780 lhi %r1,16
1781 clr %r0,%r1
1782 jl .Lxts_enc_software
1783
1784 stm${g} %r6,$s3,6*$SIZE_T($sp)
1785 st${g} $ra,14*$SIZE_T($sp)
1786
1787 sllg $len,$len,4 # $len&=~15
1788 slgr $out,$inp
1789
1790 # generate the tweak value
1791 l${g} $s3,$stdframe($sp) # pointer to iv
1792 la $s2,$tweak($sp)
1793 lmg $s0,$s1,0($s3)
1794 lghi $s3,16
1795 stmg $s0,$s1,0($s2)
1796 la %r1,0($key2) # $key2 is not needed anymore
1797 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1798 brc 1,.-4 # can this happen?
1799
1800 l %r0,240($key1)
1801 la %r1,0($key1) # $key1 is not needed anymore
1802 bras $ra,_s390x_xts_km
1803 jz .Lxts_enc_km_done
1804
1805 aghi $inp,-16 # take one step back
1806 la $i3,0($out,$inp) # put aside real $out
1807.Lxts_enc_km_steal:
1808 llgc $i1,16($inp)
1809 llgc $i2,0($out,$inp)
1810 stc $i1,0($out,$inp)
1811 stc $i2,16($out,$inp)
1812 la $inp,1($inp)
1813 brct $len,.Lxts_enc_km_steal
1814
1815 la $s2,0($i3)
1816 lghi $s3,16
1817 lrvgr $i1,$s0 # flip byte order
1818 lrvgr $i2,$s1
1819 xg $i1,0($s2)
1820 xg $i2,8($s2)
1821 stg $i1,0($s2)
1822 stg $i2,8($s2)
1823 .long 0xb92e00aa # km $s2,$s2
1824 brc 1,.-4 # can this happen?
1825 lrvgr $i1,$s0 # flip byte order
1826 lrvgr $i2,$s1
1827 xg $i1,0($i3)
1828 xg $i2,8($i3)
1829 stg $i1,0($i3)
1830 stg $i2,8($i3)
1831
1832.Lxts_enc_km_done:
1833 l${g} $ra,14*$SIZE_T($sp)
1834 st${g} $sp,$tweak($sp) # wipe tweak
1834	stg	$sp,$tweak+0($sp)	# wipe tweak
1835	stg	$sp,$tweak+8($sp)
1837 br $ra
1838.align 16
1839.Lxts_enc_software:
1840___
1841$code.=<<___;
1842 stm${g} %r6,$ra,6*$SIZE_T($sp)
1843
1844 slgr $out,$inp
1845
1846 xgr $s0,$s0 # clear upper half
1847 xgr $s1,$s1
1848 lrv $s0,$stdframe+4($sp) # load secno
1849 lrv $s1,$stdframe+0($sp)
1850 xgr $s2,$s2
1851 xgr $s3,$s3
1852 stm${g} %r2,%r5,2*$SIZE_T($sp)
1853 la $key,0($key2)
1854 larl $tbl,AES_Te
1855 bras $ra,_s390x_AES_encrypt # generate the tweak
1856 lm${g} %r2,%r5,2*$SIZE_T($sp)
1857 stm $s0,$s3,$tweak($sp) # save the tweak
1858 j .Lxts_enc_enter
1859
1860.align 16
1861.Lxts_enc_loop:
1862 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1863 lrvg $s3,$tweak+8($sp)
1864 lghi %r1,0x87
1865 srag %r0,$s3,63 # broadcast upper bit
1866 ngr %r1,%r0 # rem
1867 srlg %r0,$s1,63 # carry bit from lower half
1868 sllg $s1,$s1,1
1869 sllg $s3,$s3,1
1870 xgr $s1,%r1
1871 ogr $s3,%r0
1872 lrvgr $s1,$s1 # flip byte order
1873 lrvgr $s3,$s3
1874 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1875 stg $s1,$tweak+0($sp) # save the tweak
1876 llgfr $s1,$s1
1877 srlg $s2,$s3,32
1878 stg $s3,$tweak+8($sp)
1879 llgfr $s3,$s3
1880 la $inp,16($inp) # $inp+=16
1881.Lxts_enc_enter:
1882 x $s0,0($inp) # ^=*($inp)
1883 x $s1,4($inp)
1884 x $s2,8($inp)
1885 x $s3,12($inp)
1886 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
1887 la $key,0($key1)
1888 bras $ra,_s390x_AES_encrypt
1889 lm${g} %r2,%r5,2*$SIZE_T($sp)
1890 x $s0,$tweak+0($sp) # ^=tweak
1891 x $s1,$tweak+4($sp)
1892 x $s2,$tweak+8($sp)
1893 x $s3,$tweak+12($sp)
1894 st $s0,0($out,$inp)
1895 st $s1,4($out,$inp)
1896 st $s2,8($out,$inp)
1897 st $s3,12($out,$inp)
1898 brct${g} $len,.Lxts_enc_loop
1899
1900 llgc $len,`2*$SIZE_T-1`($sp)
1901 nill $len,0x0f # $len%16
1902 jz .Lxts_enc_done
1903
1904 la $i3,0($inp,$out) # put aside real $out
1905.Lxts_enc_steal:
1906 llgc %r0,16($inp)
1907 llgc %r1,0($out,$inp)
1908 stc %r0,0($out,$inp)
1909 stc %r1,16($out,$inp)
1910 la $inp,1($inp)
1911 brct $len,.Lxts_enc_steal
1912 la $out,0($i3) # restore real $out
1913
1914 # generate last tweak...
1915 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1916 lrvg $s3,$tweak+8($sp)
1917 lghi %r1,0x87
1918 srag %r0,$s3,63 # broadcast upper bit
1919 ngr %r1,%r0 # rem
1920 srlg %r0,$s1,63 # carry bit from lower half
1921 sllg $s1,$s1,1
1922 sllg $s3,$s3,1
1923 xgr $s1,%r1
1924 ogr $s3,%r0
1925 lrvgr $s1,$s1 # flip byte order
1926 lrvgr $s3,$s3
1927 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1928 stg $s1,$tweak+0($sp) # save the tweak
1929 llgfr $s1,$s1
1930 srlg $s2,$s3,32
1931 stg $s3,$tweak+8($sp)
1932 llgfr $s3,$s3
1933
1934	x	$s0,0($out)		# ^=*(inp)|stolen cipher-text
1935 x $s1,4($out)
1936 x $s2,8($out)
1937 x $s3,12($out)
1938 st${g} $out,4*$SIZE_T($sp)
1939 la $key,0($key1)
1940 bras $ra,_s390x_AES_encrypt
1941 l${g} $out,4*$SIZE_T($sp)
1942 x $s0,`$tweak+0`($sp) # ^=tweak
1943 x $s1,`$tweak+4`($sp)
1944 x $s2,`$tweak+8`($sp)
1945 x $s3,`$tweak+12`($sp)
1946 st $s0,0($out)
1947 st $s1,4($out)
1948 st $s2,8($out)
1949 st $s3,12($out)
1950
1951.Lxts_enc_done:
1952 stg $sp,$tweak+0($sp) # wipe tweak
1953	stg	$sp,$tweak+8($sp)
1954 lm${g} %r6,$ra,6*$SIZE_T($sp)
1955 br $ra
1956.size AES_xts_encrypt,.-AES_xts_encrypt
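
The .Lxts_enc_steal loop above is plain XTS ciphertext stealing: the
leading bytes of the last full ciphertext block are handed out as the
short final block, while the partial plaintext tail takes their place
and the merged block is encrypted once more with the extra tweak. A
rough Perl illustration of just the byte shuffle (hypothetical, on
strings rather than registers):

    # $prev is the last full ciphertext block (16 bytes), $tail the
    # final partial plaintext block
    sub xts_steal {
        my ($prev, $tail) = @_;
        my $n      = length($tail);
        my $stolen = substr($prev, 0, $n);        # becomes final short block
        my $merged = $tail . substr($prev, $n);   # re-encrypted as last block
        return ($merged, $stolen);
    }

    my ($merged, $stolen) = xts_steal("P" x 16, "Q" x 5);
    print "$merged\n$stolen\n";    # QQQQQPPPPPPPPPPP and PPPPP
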
1957___
1958# void AES_xts_decrypt(const char *inp,char *out,size_t len,
1959# const AES_KEY *key1, const AES_KEY *key2,u64 secno);
1960#
1961$code.=<<___;
1962.globl AES_xts_decrypt
1963.type AES_xts_decrypt,\@function
1964.align 16
1965AES_xts_decrypt:
1966 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1967 xgr %r4,%r3
1968 xgr %r3,%r4
1969___
1970$code.=<<___ if ($SIZE_T==4);
1971 llgfr $len,$len
1972___
1973$code.=<<___;
1974 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1975 aghi $len,-16
1976	bcr	4,$ra			# abort if less than zero. formally
1977					# wrong, because $len is unsigned,
1978					# but who can afford processing
1979					# more than 2^63-1 bytes?
1980 tmll $len,0x0f
1981 jnz .Lxts_dec_proceed
1982 aghi $len,16
1983.Lxts_dec_proceed:
1984___
1985$code.=<<___ if (!$softonly);
1986 llgf %r0,240($key2)
1987 lhi %r1,16
1988 clr %r0,%r1
1989 jl .Lxts_dec_software
1990
1991 stm${g} %r6,$s3,6*$SIZE_T($sp)
1992 st${g} $ra,14*$SIZE_T($sp)
1993
1994 nill $len,0xfff0 # $len&=~15
1995 slgr $out,$inp
1996
1997 # generate the tweak value
1998 l${g} $s3,$stdframe($sp) # pointer to iv
1999 la $s2,$tweak($sp)
2000 lmg $s0,$s1,0($s3)
2001 lghi $s3,16
2002 stmg $s0,$s1,0($s2)
2003 la %r1,0($key2) # $key2 is not needed past this point
2004 .long 0xb92e00aa # km $s2,$s2, generate the tweak
2005 brc 1,.-4 # can this happen?
2006
2007 l %r0,240($key1)
2008 la %r1,0($key1) # $key1 is not needed anymore
2009
2010 ltgr $len,$len
2011 jz .Lxts_dec_km_short
2012 bras $ra,_s390x_xts_km
2013 jz .Lxts_dec_km_done
2014
2015 lrvgr $s2,$s0 # make copy in reverse byte order
2016 lrvgr $s3,$s1
2017 j .Lxts_dec_km_2ndtweak
2018
2019.Lxts_dec_km_short:
2020 llgc $len,`2*$SIZE_T-1`($sp)
2021 nill $len,0x0f # $len%=16
2022 lrvg $s0,$tweak+0($sp) # load the tweak
2023 lrvg $s1,$tweak+8($sp)
2024 lrvgr $s2,$s0 # make copy in reverse byte order
2025 lrvgr $s3,$s1
2026
2027.Lxts_dec_km_2ndtweak:
2028 lghi $i1,0x87
2029 srag $i2,$s1,63 # broadcast upper bit
2030 ngr $i1,$i2 # rem
2031 srlg $i2,$s0,63 # carry bit from lower half
2032 sllg $s0,$s0,1
2033 sllg $s1,$s1,1
2034 xgr $s0,$i1
2035 ogr $s1,$i2
2036 lrvgr $i1,$s0 # flip byte order
2037 lrvgr $i2,$s1
2038
2039 xg $i1,0($inp)
2040 xg $i2,8($inp)
2041 stg $i1,0($out,$inp)
2042 stg $i2,8($out,$inp)
2043 la $i2,0($out,$inp)
2044 lghi $i3,16
2045 .long 0xb92e0066 # km $i2,$i2
2046 brc 1,.-4 # can this happen?
2047 lrvgr $i1,$s0
2048 lrvgr $i2,$s1
2049 xg $i1,0($out,$inp)
2050 xg $i2,8($out,$inp)
2051 stg $i1,0($out,$inp)
2052 stg $i2,8($out,$inp)
2053
2054 la $i3,0($out,$inp) # put aside real $out
2055.Lxts_dec_km_steal:
2056 llgc $i1,16($inp)
2057 llgc $i2,0($out,$inp)
2058 stc $i1,0($out,$inp)
2059 stc $i2,16($out,$inp)
2060 la $inp,1($inp)
2061 brct $len,.Lxts_dec_km_steal
2062
2063 lgr $s0,$s2
2064 lgr $s1,$s3
2065 xg $s0,0($i3)
2066 xg $s1,8($i3)
2067 stg $s0,0($i3)
2068 stg $s1,8($i3)
2069 la $s0,0($i3)
2070 lghi $s1,16
2071 .long 0xb92e0088 # km $s0,$s0
2072 brc 1,.-4 # can this happen?
2073 xg $s2,0($i3)
2074 xg $s3,8($i3)
2075 stg $s2,0($i3)
2076 stg $s3,8($i3)
2077.Lxts_dec_km_done:
2078 l${g} $ra,14*$SIZE_T($sp)
2079	stg	$sp,$tweak+0($sp)	# wipe tweak
2080	stg	$sp,$tweak+8($sp)
2081 lm${g} %r6,$s3,6*$SIZE_T($sp)
2082 br $ra
2083.align 16
2084.Lxts_dec_software:
2085___
2086$code.=<<___;
2087 stm${g} %r6,$ra,6*$SIZE_T($sp)
2088
2089 srlg $len,$len,4
2090 slgr $out,$inp
2091
2092 xgr $s0,$s0 # clear upper half
2093 xgr $s1,$s1
2094 lrv $s0,$stdframe+4($sp) # load secno
2095 lrv $s1,$stdframe+0($sp)
2096 xgr $s2,$s2
2097 xgr $s3,$s3
2098 stm${g} %r2,%r5,2*$SIZE_T($sp)
2099 la $key,0($key2)
2100 larl $tbl,AES_Te
2101 bras $ra,_s390x_AES_encrypt # generate the tweak
2102 lm${g} %r2,%r5,2*$SIZE_T($sp)
2103 larl $tbl,AES_Td
2104 lt${g}r $len,$len
2105 stm $s0,$s3,$tweak($sp) # save the tweak
2106 jz .Lxts_dec_short
2107 j .Lxts_dec_enter
2108
2109.align 16
2110.Lxts_dec_loop:
2111 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2112 lrvg $s3,$tweak+8($sp)
2113 lghi %r1,0x87
2114 srag %r0,$s3,63 # broadcast upper bit
2115 ngr %r1,%r0 # rem
2116 srlg %r0,$s1,63 # carry bit from lower half
2117 sllg $s1,$s1,1
2118 sllg $s3,$s3,1
2119 xgr $s1,%r1
2120 ogr $s3,%r0
2121 lrvgr $s1,$s1 # flip byte order
2122 lrvgr $s3,$s3
2123 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2124 stg $s1,$tweak+0($sp) # save the tweak
2125 llgfr $s1,$s1
2126 srlg $s2,$s3,32
2127 stg $s3,$tweak+8($sp)
2128 llgfr $s3,$s3
2129.Lxts_dec_enter:
2130 x $s0,0($inp) # tweak^=*(inp)
2131 x $s1,4($inp)
2132 x $s2,8($inp)
2133 x $s3,12($inp)
2134 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
2135 la $key,0($key1)
2136 bras $ra,_s390x_AES_decrypt
2137 lm${g} %r2,%r5,2*$SIZE_T($sp)
2138 x $s0,$tweak+0($sp) # ^=tweak
2139 x $s1,$tweak+4($sp)
2140 x $s2,$tweak+8($sp)
2141 x $s3,$tweak+12($sp)
2142 st $s0,0($out,$inp)
2143 st $s1,4($out,$inp)
2144 st $s2,8($out,$inp)
2145 st $s3,12($out,$inp)
2146 la $inp,16($inp)
2147 brct${g} $len,.Lxts_dec_loop
2148
2149 llgc $len,`2*$SIZE_T-1`($sp)
2150 nill $len,0x0f # $len%16
2151 jz .Lxts_dec_done
2152
2153 # generate pair of tweaks...
2154 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2155 lrvg $s3,$tweak+8($sp)
2156 lghi %r1,0x87
2157 srag %r0,$s3,63 # broadcast upper bit
2158 ngr %r1,%r0 # rem
2159 srlg %r0,$s1,63 # carry bit from lower half
2160 sllg $s1,$s1,1
2161 sllg $s3,$s3,1
2162 xgr $s1,%r1
2163 ogr $s3,%r0
2164 lrvgr $i2,$s1 # flip byte order
2165 lrvgr $i3,$s3
2166 stmg $i2,$i3,$tweak($sp) # save the 1st tweak
2167 j .Lxts_dec_2ndtweak
2168
2169.align 16
2170.Lxts_dec_short:
2171 llgc $len,`2*$SIZE_T-1`($sp)
2172 nill $len,0x0f # $len%16
2173 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2174 lrvg $s3,$tweak+8($sp)
2175.Lxts_dec_2ndtweak:
2176 lghi %r1,0x87
2177 srag %r0,$s3,63 # broadcast upper bit
2178 ngr %r1,%r0 # rem
2179 srlg %r0,$s1,63 # carry bit from lower half
2180 sllg $s1,$s1,1
2181 sllg $s3,$s3,1
2182 xgr $s1,%r1
2183 ogr $s3,%r0
2184 lrvgr $s1,$s1 # flip byte order
2185 lrvgr $s3,$s3
2186 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2187 stg $s1,$tweak-16+0($sp) # save the 2nd tweak
2188 llgfr $s1,$s1
2189 srlg $s2,$s3,32
2190 stg $s3,$tweak-16+8($sp)
2191 llgfr $s3,$s3
2192
2193 x $s0,0($inp) # tweak_the_2nd^=*(inp)
2194 x $s1,4($inp)
2195 x $s2,8($inp)
2196 x $s3,12($inp)
2197 stm${g} %r2,%r3,2*$SIZE_T($sp)
2198 la $key,0($key1)
2199 bras $ra,_s390x_AES_decrypt
2200 lm${g} %r2,%r5,2*$SIZE_T($sp)
2201 x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
2202 x $s1,$tweak-16+4($sp)
2203 x $s2,$tweak-16+8($sp)
2204 x $s3,$tweak-16+12($sp)
2205 st $s0,0($out,$inp)
2206 st $s1,4($out,$inp)
2207 st $s2,8($out,$inp)
2208 st $s3,12($out,$inp)
2209
2210 la $i3,0($out,$inp) # put aside real $out
2211.Lxts_dec_steal:
2212 llgc %r0,16($inp)
2213 llgc %r1,0($out,$inp)
2214 stc %r0,0($out,$inp)
2215 stc %r1,16($out,$inp)
2216 la $inp,1($inp)
2217 brct $len,.Lxts_dec_steal
2218 la $out,0($i3) # restore real $out
2219
2220 lm $s0,$s3,$tweak($sp) # load the 1st tweak
2221 x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
2222 x $s1,4($out)
2223 x $s2,8($out)
2224 x $s3,12($out)
2225 st${g} $out,4*$SIZE_T($sp)
2226 la $key,0($key1)
2227 bras $ra,_s390x_AES_decrypt
2228 l${g} $out,4*$SIZE_T($sp)
2229 x $s0,$tweak+0($sp) # ^=tweak
2230 x $s1,$tweak+4($sp)
2231 x $s2,$tweak+8($sp)
2232 x $s3,$tweak+12($sp)
2233 st $s0,0($out)
2234 st $s1,4($out)
2235 st $s2,8($out)
2236 st $s3,12($out)
2237 stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
2238 stg $sp,$tweak-16+8($sp)
2239.Lxts_dec_done:
2240 stg $sp,$tweak+0($sp) # wipe tweak
2241	stg	$sp,$tweak+8($sp)
2242 lm${g} %r6,$ra,6*$SIZE_T($sp)
2243 br $ra
2244.size AES_xts_decrypt,.-AES_xts_decrypt
1332___ 2245___
1333} 2246}
1334$code.=<<___; 2247$code.=<<___;
1335.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>" 2248.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
2249.comm OPENSSL_s390xcap_P,16,8
1336___ 2250___
1337 2251
1338$code =~ s/\`([^\`]*)\`/eval $1/gem; 2252$code =~ s/\`([^\`]*)\`/eval $1/gem;
1339print $code; 2253print $code;
2254close STDOUT; # force flush
diff --git a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
index c57b3a2d6d..403c4d1290 100755
--- a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
+++ b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
@@ -1176,6 +1176,7 @@ ___
1176# As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have 1176# As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have
1177# undesired effect, so just omit them and sacrifice some portion of 1177# undesired effect, so just omit them and sacrifice some portion of
1178# percent in performance... 1178# percent in performance...
1179$code =~ s/fmovs.*$//gem; 1179$code =~ s/fmovs.*$//gm;
1180 1180
1181print $code; 1181print $code;
1182close STDOUT; # ensure flush
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
index a545e892ae..48fa857d5b 100755
--- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl
+++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
@@ -588,6 +588,9 @@ $code.=<<___;
588.globl AES_encrypt 588.globl AES_encrypt
589.type AES_encrypt,\@function,3 589.type AES_encrypt,\@function,3
590.align 16 590.align 16
591.globl asm_AES_encrypt
592.hidden asm_AES_encrypt
593asm_AES_encrypt:
591AES_encrypt: 594AES_encrypt:
592 push %rbx 595 push %rbx
593 push %rbp 596 push %rbp
@@ -1184,6 +1187,9 @@ $code.=<<___;
1184.globl AES_decrypt 1187.globl AES_decrypt
1185.type AES_decrypt,\@function,3 1188.type AES_decrypt,\@function,3
1186.align 16 1189.align 16
1190.globl asm_AES_decrypt
1191.hidden asm_AES_decrypt
1192asm_AES_decrypt:
1187AES_decrypt: 1193AES_decrypt:
1188 push %rbx 1194 push %rbx
1189 push %rbp 1195 push %rbp
@@ -1277,13 +1283,13 @@ $code.=<<___;
1277___ 1283___
1278} 1284}
1279 1285
1280# int AES_set_encrypt_key(const unsigned char *userKey, const int bits, 1286# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
1281# AES_KEY *key) 1287# AES_KEY *key)
1282$code.=<<___; 1288$code.=<<___;
1283.globl AES_set_encrypt_key 1289.globl private_AES_set_encrypt_key
1284.type AES_set_encrypt_key,\@function,3 1290.type private_AES_set_encrypt_key,\@function,3
1285.align 16 1291.align 16
1286AES_set_encrypt_key: 1292private_AES_set_encrypt_key:
1287 push %rbx 1293 push %rbx
1288 push %rbp 1294 push %rbp
1289 push %r12 # redundant, but allows to share 1295 push %r12 # redundant, but allows to share
@@ -1304,7 +1310,7 @@ AES_set_encrypt_key:
1304 add \$56,%rsp 1310 add \$56,%rsp
1305.Lenc_key_epilogue: 1311.Lenc_key_epilogue:
1306 ret 1312 ret
1307.size AES_set_encrypt_key,.-AES_set_encrypt_key 1313.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
1308 1314
1309.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent 1315.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
1310.align 16 1316.align 16
@@ -1547,13 +1553,13 @@ $code.=<<___;
1547___ 1553___
1548} 1554}
1549 1555
1550# int AES_set_decrypt_key(const unsigned char *userKey, const int bits, 1556# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
1551# AES_KEY *key) 1557# AES_KEY *key)
1552$code.=<<___; 1558$code.=<<___;
1553.globl AES_set_decrypt_key 1559.globl private_AES_set_decrypt_key
1554.type AES_set_decrypt_key,\@function,3 1560.type private_AES_set_decrypt_key,\@function,3
1555.align 16 1561.align 16
1556AES_set_decrypt_key: 1562private_AES_set_decrypt_key:
1557 push %rbx 1563 push %rbx
1558 push %rbp 1564 push %rbp
1559 push %r12 1565 push %r12
@@ -1622,7 +1628,7 @@ $code.=<<___;
1622 add \$56,%rsp 1628 add \$56,%rsp
1623.Ldec_key_epilogue: 1629.Ldec_key_epilogue:
1624 ret 1630 ret
1625.size AES_set_decrypt_key,.-AES_set_decrypt_key 1631.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
1626___ 1632___
1627 1633
1628# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out, 1634# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
@@ -1648,6 +1654,9 @@ $code.=<<___;
1648.type AES_cbc_encrypt,\@function,6 1654.type AES_cbc_encrypt,\@function,6
1649.align 16 1655.align 16
1650.extern OPENSSL_ia32cap_P 1656.extern OPENSSL_ia32cap_P
1657.globl asm_AES_cbc_encrypt
1658.hidden asm_AES_cbc_encrypt
1659asm_AES_cbc_encrypt:
1651AES_cbc_encrypt: 1660AES_cbc_encrypt:
1652 cmp \$0,%rdx # check length 1661 cmp \$0,%rdx # check length
1653 je .Lcbc_epilogue 1662 je .Lcbc_epilogue
@@ -2766,13 +2775,13 @@ cbc_se_handler:
2766 .rva .LSEH_end_AES_decrypt 2775 .rva .LSEH_end_AES_decrypt
2767 .rva .LSEH_info_AES_decrypt 2776 .rva .LSEH_info_AES_decrypt
2768 2777
2769 .rva .LSEH_begin_AES_set_encrypt_key 2778 .rva .LSEH_begin_private_AES_set_encrypt_key
2770 .rva .LSEH_end_AES_set_encrypt_key 2779 .rva .LSEH_end_private_AES_set_encrypt_key
2771 .rva .LSEH_info_AES_set_encrypt_key 2780 .rva .LSEH_info_private_AES_set_encrypt_key
2772 2781
2773 .rva .LSEH_begin_AES_set_decrypt_key 2782 .rva .LSEH_begin_private_AES_set_decrypt_key
2774 .rva .LSEH_end_AES_set_decrypt_key 2783 .rva .LSEH_end_private_AES_set_decrypt_key
2775 .rva .LSEH_info_AES_set_decrypt_key 2784 .rva .LSEH_info_private_AES_set_decrypt_key
2776 2785
2777 .rva .LSEH_begin_AES_cbc_encrypt 2786 .rva .LSEH_begin_AES_cbc_encrypt
2778 .rva .LSEH_end_AES_cbc_encrypt 2787 .rva .LSEH_end_AES_cbc_encrypt
@@ -2788,11 +2797,11 @@ cbc_se_handler:
2788 .byte 9,0,0,0 2797 .byte 9,0,0,0
2789 .rva block_se_handler 2798 .rva block_se_handler
2790 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] 2799 .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
2791.LSEH_info_AES_set_encrypt_key: 2800.LSEH_info_private_AES_set_encrypt_key:
2792 .byte 9,0,0,0 2801 .byte 9,0,0,0
2793 .rva key_se_handler 2802 .rva key_se_handler
2794 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] 2803 .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
2795.LSEH_info_AES_set_decrypt_key: 2804.LSEH_info_private_AES_set_decrypt_key:
2796 .byte 9,0,0,0 2805 .byte 9,0,0,0
2797 .rva key_se_handler 2806 .rva key_se_handler
2798 .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] 2807 .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]
diff --git a/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl
new file mode 100644
index 0000000000..c6f6b3334a
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl
@@ -0,0 +1,1249 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# June 2011
11#
12# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
13# in http://download.intel.com/design/intarch/papers/323686.pdf, is
14# that since AESNI-CBC encrypt exhibits *very* low instruction-level
15# parallelism, interleaving it with another algorithm allows processor
16# resources to be utilized better, achieving better performance.
17# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
18# AESNI code is woven into it. Below are performance numbers in
19# cycles per processed byte, less is better, for standalone AESNI-CBC
20# encrypt, sum of the latter and standalone SHA1, and "stitched"
21# subroutine:
22#
23# AES-128-CBC +SHA1 stitch gain
24# Westmere 3.77[+5.6] 9.37 6.65 +41%
25# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
26#
27# AES-192-CBC
28# Westmere 4.51 10.11 6.97 +45%
29# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
30#
31# AES-256-CBC
32# Westmere 5.25 10.85 7.25 +50%
33# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
34#
35# (*)	There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
36# background information. Above numbers in parentheses are SSSE3
37# results collected on AVX-capable CPU, i.e. apply on OSes that
38# don't support AVX.
39#
40# Needless to say, it makes no sense to implement a "stitched"
41# *decrypt* subroutine: *both* AESNI-CBC decrypt and SHA1 already
42# fully utilize parallelism, so stitching would not give any gain
43# anyway. Well, there might be some, e.g. because of better cache
44# locality... For reference, here are performance results for
45# standalone AESNI-CBC decrypt:
46#
47# AES-128-CBC AES-192-CBC AES-256-CBC
48# Westmere 1.31 1.55 1.80
49# Sandy Bridge 0.93 1.06 1.22
50
51$flavour = shift;
52$output = shift;
53if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
54
55$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
56
57$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
58( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
59( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
60die "can't locate x86_64-xlate.pl";
61
62$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
63 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
64 $1>=2.19);
65$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
66 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
67 $1>=2.09);
68$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
69 `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
70 $1>=10);
71
72open STDOUT,"| $^X $xlate $flavour $output";
73
74# void aesni_cbc_sha1_enc(const void *inp,
75# void *out,
76# size_t length,
77# const AES_KEY *key,
78# unsigned char *iv,
79# SHA_CTX *ctx,
80# const void *in0);
81
82$code.=<<___;
83.text
84.extern OPENSSL_ia32cap_P
85
86.globl aesni_cbc_sha1_enc
87.type aesni_cbc_sha1_enc,\@abi-omnipotent
88.align 16
89aesni_cbc_sha1_enc:
90 # caller should check for SSSE3 and AES-NI bits
91 mov OPENSSL_ia32cap_P+0(%rip),%r10d
92 mov OPENSSL_ia32cap_P+4(%rip),%r11d
93___
94$code.=<<___ if ($avx);
95 and \$`1<<28`,%r11d # mask AVX bit
96 and \$`1<<30`,%r10d # mask "Intel CPU" bit
97 or %r11d,%r10d
98 cmp \$`1<<28|1<<30`,%r10d
99 je aesni_cbc_sha1_enc_avx
100___
101$code.=<<___;
102 jmp aesni_cbc_sha1_enc_ssse3
103 ret
104.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
105___
106
107my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
108
109my $Xi=4;
110my @X=map("%xmm$_",(4..7,0..3));
111my @Tx=map("%xmm$_",(8..10));
112my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
113my @T=("%esi","%edi");
114my $j=0; my $jj=0; my $r=0; my $sn=0;
115my $K_XX_XX="%r11";
116my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
117my @rndkey=("%xmm14","%xmm15");
118
119sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
120{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
121 my $arg = pop;
122 $arg = "\$$arg" if ($arg*1 eq $arg);
123 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
124}
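
The AUTOLOAD thunk lets the round-body strings call any instruction as
if it were a Perl sub: an undefined sub name is caught and simply
appended to $code as text, with a numeric last argument turned into an
immediate and the operands emitted in reversed (AT&T) order. A
stand-alone illustration of the same trick (hypothetical):

    $code = "";
    sub AUTOLOAD {
        my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
        my $arg = pop;
        $arg = "\$$arg" if ($arg*1 eq $arg);   # numeric => immediate
        $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
    }
    &rol("%eax",5);
    print $code;                               # prints: rol $5,%eax
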
125
126my $_rol=sub { &rol(@_) };
127my $_ror=sub { &ror(@_) };
128
129$code.=<<___;
130.type aesni_cbc_sha1_enc_ssse3,\@function,6
131.align 16
132aesni_cbc_sha1_enc_ssse3:
133 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
134 #shr \$6,$len # debugging artefact
135 #jz .Lepilogue_ssse3 # debugging artefact
136 push %rbx
137 push %rbp
138 push %r12
139 push %r13
140 push %r14
141 push %r15
142 lea `-104-($win64?10*16:0)`(%rsp),%rsp
143 #mov $in0,$inp # debugging artefact
144 #lea 64(%rsp),$ctx # debugging artefact
145___
146$code.=<<___ if ($win64);
147 movaps %xmm6,96+0(%rsp)
148 movaps %xmm7,96+16(%rsp)
149 movaps %xmm8,96+32(%rsp)
150 movaps %xmm9,96+48(%rsp)
151 movaps %xmm10,96+64(%rsp)
152 movaps %xmm11,96+80(%rsp)
153 movaps %xmm12,96+96(%rsp)
154 movaps %xmm13,96+112(%rsp)
155 movaps %xmm14,96+128(%rsp)
156 movaps %xmm15,96+144(%rsp)
157.Lprologue_ssse3:
158___
159$code.=<<___;
160 mov $in0,%r12 # reassign arguments
161 mov $out,%r13
162 mov $len,%r14
163 mov $key,%r15
164 movdqu ($ivp),$iv # load IV
165 mov $ivp,88(%rsp) # save $ivp
166___
167my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
168my $rounds="${ivp}d";
169$code.=<<___;
170 shl \$6,$len
171 sub $in0,$out
172 mov 240($key),$rounds
173 add $inp,$len # end of input
174
175 lea K_XX_XX(%rip),$K_XX_XX
176 mov 0($ctx),$A # load context
177 mov 4($ctx),$B
178 mov 8($ctx),$C
179 mov 12($ctx),$D
180 mov $B,@T[0] # magic seed
181 mov 16($ctx),$E
182
183 movdqa 64($K_XX_XX),@X[2] # pbswap mask
184 movdqa 0($K_XX_XX),@Tx[1] # K_00_19
185 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
186 movdqu 16($inp),@X[-3&7]
187 movdqu 32($inp),@X[-2&7]
188 movdqu 48($inp),@X[-1&7]
189 pshufb @X[2],@X[-4&7] # byte swap
190 add \$64,$inp
191 pshufb @X[2],@X[-3&7]
192 pshufb @X[2],@X[-2&7]
193 pshufb @X[2],@X[-1&7]
194 paddd @Tx[1],@X[-4&7] # add K_00_19
195 paddd @Tx[1],@X[-3&7]
196 paddd @Tx[1],@X[-2&7]
197 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
198 psubd @Tx[1],@X[-4&7] # restore X[]
199 movdqa @X[-3&7],16(%rsp)
200 psubd @Tx[1],@X[-3&7]
201 movdqa @X[-2&7],32(%rsp)
202 psubd @Tx[1],@X[-2&7]
203 movups ($key),$rndkey0 # $key[0]
204 movups 16($key),$rndkey[0] # forward reference
205 jmp .Loop_ssse3
206___
207
208my $aesenc=sub {
209 use integer;
210 my ($n,$k)=($r/10,$r%10);
211 if ($k==0) {
212 $code.=<<___;
213 movups `16*$n`($in0),$in # load input
214 xorps $rndkey0,$in
215___
216 $code.=<<___ if ($n);
217 movups $iv,`16*($n-1)`($out,$in0) # write output
218___
219 $code.=<<___;
220 xorps $in,$iv
221 aesenc $rndkey[0],$iv
222 movups `32+16*$k`($key),$rndkey[1]
223___
224 } elsif ($k==9) {
225 $sn++;
226 $code.=<<___;
227 cmp \$11,$rounds
228 jb .Laesenclast$sn
229 movups `32+16*($k+0)`($key),$rndkey[1]
230 aesenc $rndkey[0],$iv
231 movups `32+16*($k+1)`($key),$rndkey[0]
232 aesenc $rndkey[1],$iv
233 je .Laesenclast$sn
234 movups `32+16*($k+2)`($key),$rndkey[1]
235 aesenc $rndkey[0],$iv
236 movups `32+16*($k+3)`($key),$rndkey[0]
237 aesenc $rndkey[1],$iv
238.Laesenclast$sn:
239 aesenclast $rndkey[0],$iv
240 movups 16($key),$rndkey[1] # forward reference
241___
242 } else {
243 $code.=<<___;
244 aesenc $rndkey[0],$iv
245 movups `32+16*$k`($key),$rndkey[1]
246___
247 }
248 $r++; unshift(@rndkey,pop(@rndkey));
249};
250
251sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
252{ use integer;
253 my $body = shift;
254 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
255 my ($a,$b,$c,$d,$e);
256
257 &movdqa (@X[0],@X[-3&7]);
258 eval(shift(@insns));
259 eval(shift(@insns));
260 &movdqa (@Tx[0],@X[-1&7]);
261 &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]"
262 eval(shift(@insns));
263 eval(shift(@insns));
264
265 &paddd (@Tx[1],@X[-1&7]);
266 eval(shift(@insns));
267 eval(shift(@insns));
268 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
269 eval(shift(@insns));
270 eval(shift(@insns));
271 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
272 eval(shift(@insns));
273 eval(shift(@insns));
274
275 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
276 eval(shift(@insns));
277 eval(shift(@insns));
278 eval(shift(@insns));
279 eval(shift(@insns));
280
281 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
282 eval(shift(@insns));
283 eval(shift(@insns));
284 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
285 eval(shift(@insns));
286 eval(shift(@insns));
287
288 &movdqa (@Tx[2],@X[0]);
289 &movdqa (@Tx[0],@X[0]);
290 eval(shift(@insns));
291 eval(shift(@insns));
292 eval(shift(@insns));
293 eval(shift(@insns));
294
295 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
296 &paddd (@X[0],@X[0]);
297 eval(shift(@insns));
298 eval(shift(@insns));
299 eval(shift(@insns));
300 eval(shift(@insns));
301
302 &psrld (@Tx[0],31);
303 eval(shift(@insns));
304 eval(shift(@insns));
305 &movdqa (@Tx[1],@Tx[2]);
306 eval(shift(@insns));
307 eval(shift(@insns));
308
309 &psrld (@Tx[2],30);
310 &por (@X[0],@Tx[0]); # "X[0]"<<<=1
311 eval(shift(@insns));
312 eval(shift(@insns));
313 eval(shift(@insns));
314 eval(shift(@insns));
315
316 &pslld (@Tx[1],2);
317 &pxor (@X[0],@Tx[2]);
318 eval(shift(@insns));
319 eval(shift(@insns));
320 &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
321 eval(shift(@insns));
322 eval(shift(@insns));
323
324 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
325
326 foreach (@insns) { eval; } # remaining instructions [if any]
327
328 $Xi++; push(@X,shift(@X)); # "rotate" X[]
329 push(@Tx,shift(@Tx));
330}
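
The shuffle above is just the SHA1 message schedule computed four
elements at a time, X[i] = rol(X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16], 1),
with the extra pslldq/psrld dance patching up the intra-vector
dependency (each new element depends on one computed three slots
earlier). A plain scalar Perl reference for one element (hypothetical,
32-bit arithmetic):

    sub rol32 { my ($x, $n) = @_; (($x << $n) | ($x >> (32 - $n))) & 0xffffffff }

    # one schedule element, $i >= 16; @$W holds 32-bit words
    sub sha1_schedule {
        my ($W, $i) = @_;
        return rol32($W->[$i-3] ^ $W->[$i-8] ^ $W->[$i-14] ^ $W->[$i-16], 1);
    }

    my @W = (0 .. 15);                            # toy message block
    push @W, sha1_schedule(\@W, $_) for 16 .. 19;
    print join(" ", @W[16 .. 19]), "\n";
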
331
332sub Xupdate_ssse3_32_79()
333{ use integer;
334 my $body = shift;
335 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
336 my ($a,$b,$c,$d,$e);
337
338 &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
339 eval(shift(@insns)); # body_20_39
340 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
341 &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]"
342 eval(shift(@insns));
343 eval(shift(@insns));
344 eval(shift(@insns)); # rol
345
346 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
347 eval(shift(@insns));
348 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
349 if ($Xi%5) {
350 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
351 } else { # ... or load next one
352 &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
353 }
354 &paddd (@Tx[1],@X[-1&7]);
355 eval(shift(@insns)); # ror
356 eval(shift(@insns));
357
358 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
359 eval(shift(@insns)); # body_20_39
360 eval(shift(@insns));
361 eval(shift(@insns));
362 eval(shift(@insns)); # rol
363
364 &movdqa (@Tx[0],@X[0]);
365 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
366 eval(shift(@insns));
367 eval(shift(@insns));
368 eval(shift(@insns)); # ror
369 eval(shift(@insns));
370
371 &pslld (@X[0],2);
372 eval(shift(@insns)); # body_20_39
373 eval(shift(@insns));
374 &psrld (@Tx[0],30);
375 eval(shift(@insns));
376 eval(shift(@insns)); # rol
377 eval(shift(@insns));
378 eval(shift(@insns));
379 eval(shift(@insns)); # ror
380 eval(shift(@insns));
381
382 &por (@X[0],@Tx[0]); # "X[0]"<<<=2
383 eval(shift(@insns)); # body_20_39
384 eval(shift(@insns));
385 &movdqa (@Tx[1],@X[0]) if ($Xi<19);
386 eval(shift(@insns));
387 eval(shift(@insns)); # rol
388 eval(shift(@insns));
389 eval(shift(@insns));
390 eval(shift(@insns)); # rol
391 eval(shift(@insns));
392
393 foreach (@insns) { eval; } # remaining instructions
394
395 $Xi++; push(@X,shift(@X)); # "rotate" X[]
396 push(@Tx,shift(@Tx));
397}
398
399sub Xuplast_ssse3_80()
400{ use integer;
401 my $body = shift;
402 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
403 my ($a,$b,$c,$d,$e);
404
405 eval(shift(@insns));
406 &paddd (@Tx[1],@X[-1&7]);
407 eval(shift(@insns));
408 eval(shift(@insns));
409 eval(shift(@insns));
410 eval(shift(@insns));
411
412 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
413
414 foreach (@insns) { eval; } # remaining instructions
415
416 &cmp ($inp,$len);
417 &je (".Ldone_ssse3");
418
419 unshift(@Tx,pop(@Tx));
420
421 &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
422 &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19
423 &movdqu (@X[-4&7],"0($inp)"); # load input
424 &movdqu (@X[-3&7],"16($inp)");
425 &movdqu (@X[-2&7],"32($inp)");
426 &movdqu (@X[-1&7],"48($inp)");
427 &pshufb (@X[-4&7],@X[2]); # byte swap
428 &add ($inp,64);
429
430 $Xi=0;
431}
432
433sub Xloop_ssse3()
434{ use integer;
435 my $body = shift;
436 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
437 my ($a,$b,$c,$d,$e);
438
439 eval(shift(@insns));
440 eval(shift(@insns));
441 &pshufb (@X[($Xi-3)&7],@X[2]);
442 eval(shift(@insns));
443 eval(shift(@insns));
444 &paddd (@X[($Xi-4)&7],@Tx[1]);
445 eval(shift(@insns));
446 eval(shift(@insns));
447 eval(shift(@insns));
448 eval(shift(@insns));
449 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
450 eval(shift(@insns));
451 eval(shift(@insns));
452 &psubd (@X[($Xi-4)&7],@Tx[1]);
453
454 foreach (@insns) { eval; }
455 $Xi++;
456}
457
458sub Xtail_ssse3()
459{ use integer;
460 my $body = shift;
461 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
462 my ($a,$b,$c,$d,$e);
463
464 foreach (@insns) { eval; }
465}
466
467sub body_00_19 () {
468 use integer;
469 my ($k,$n);
470 my @r=(
471 '($a,$b,$c,$d,$e)=@V;'.
472 '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer
473 '&xor ($c,$d);',
474 '&mov (@T[1],$a);', # $b in next round
475 '&$_rol ($a,5);',
476 '&and (@T[0],$c);', # ($b&($c^$d))
477 '&xor ($c,$d);', # restore $c
478 '&xor (@T[0],$d);',
479 '&add ($e,$a);',
480 '&$_ror ($b,$j?7:2);', # $b>>>2
481 '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
482 );
483 $n = scalar(@r);
484 $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
485 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
486 $jj++;
487 return @r;
488}
489
490sub body_20_39 () {
491 use integer;
492 my ($k,$n);
493 my @r=(
494 '($a,$b,$c,$d,$e)=@V;'.
495 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
496 '&xor (@T[0],$d);', # ($b^$d)
497 '&mov (@T[1],$a);', # $b in next round
498 '&$_rol ($a,5);',
499 '&xor (@T[0],$c);', # ($b^$d^$c)
500 '&add ($e,$a);',
501 '&$_ror ($b,7);', # $b>>>2
502 '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
503 );
504 $n = scalar(@r);
505 $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds
506 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
507 $jj++;
508 return @r;
509}
510
511sub body_40_59 () {
512 use integer;
513 my ($k,$n);
514 my @r=(
515 '($a,$b,$c,$d,$e)=@V;'.
516 '&mov (@T[1],$c);',
517 '&xor ($c,$d);',
518 '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
519 '&and (@T[1],$d);',
520 '&and (@T[0],$c);', # ($b&($c^$d))
521 '&$_ror ($b,7);', # $b>>>2
522 '&add ($e,@T[1]);',
523 '&mov (@T[1],$a);', # $b in next round
524 '&$_rol ($a,5);',
525 '&add ($e,@T[0]);',
526 '&xor ($c,$d);', # restore $c
527 '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
528 );
529 $n = scalar(@r);
530 $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds
531 @r[$k%$n].='&$aesenc();' if ($jj==$k/$n);
532 $jj++;
533 return @r;
534}
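
The spacing formula in these bodies, $k = (($jj+1)*12/20)*20*$n/12
under "use integer", picks the global instruction index at which the
($jj+1)-th of 12 aesenc calls should land across 20 rounds of $n
instructions each; the $jj==$k/$n guard makes sure each aesenc is
emitted only in its own round body. A quick throwaway check of the
arithmetic (hypothetical, $n=10 instructions as in body_00_19 above):

    use integer;

    my $n = 10;                              # instructions per round body
    for my $jj (0 .. 4) {
        my $k = (($jj+1)*12/20)*20*$n/12;    # global instruction index
        printf "round %d: aesenc at slot %d\n", $jj, $k % $n
            if ($jj == $k/$n);               # emitted only in its own round
    }
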
535$code.=<<___;
536.align 16
537.Loop_ssse3:
538___
539 &Xupdate_ssse3_16_31(\&body_00_19);
540 &Xupdate_ssse3_16_31(\&body_00_19);
541 &Xupdate_ssse3_16_31(\&body_00_19);
542 &Xupdate_ssse3_16_31(\&body_00_19);
543 &Xupdate_ssse3_32_79(\&body_00_19);
544 &Xupdate_ssse3_32_79(\&body_20_39);
545 &Xupdate_ssse3_32_79(\&body_20_39);
546 &Xupdate_ssse3_32_79(\&body_20_39);
547 &Xupdate_ssse3_32_79(\&body_20_39);
548 &Xupdate_ssse3_32_79(\&body_20_39);
549 &Xupdate_ssse3_32_79(\&body_40_59);
550 &Xupdate_ssse3_32_79(\&body_40_59);
551 &Xupdate_ssse3_32_79(\&body_40_59);
552 &Xupdate_ssse3_32_79(\&body_40_59);
553 &Xupdate_ssse3_32_79(\&body_40_59);
554 &Xupdate_ssse3_32_79(\&body_20_39);
555 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
556
557 $saved_j=$j; @saved_V=@V;
558 $saved_r=$r; @saved_rndkey=@rndkey;
559
560 &Xloop_ssse3(\&body_20_39);
561 &Xloop_ssse3(\&body_20_39);
562 &Xloop_ssse3(\&body_20_39);
563
564$code.=<<___;
565 movups $iv,48($out,$in0) # write output
566 lea 64($in0),$in0
567
568 add 0($ctx),$A # update context
569 add 4($ctx),@T[0]
570 add 8($ctx),$C
571 add 12($ctx),$D
572 mov $A,0($ctx)
573 add 16($ctx),$E
574 mov @T[0],4($ctx)
575 mov @T[0],$B # magic seed
576 mov $C,8($ctx)
577 mov $D,12($ctx)
578 mov $E,16($ctx)
579 jmp .Loop_ssse3
580
581.align 16
582.Ldone_ssse3:
583___
584 $jj=$j=$saved_j; @V=@saved_V;
585 $r=$saved_r; @rndkey=@saved_rndkey;
586
587 &Xtail_ssse3(\&body_20_39);
588 &Xtail_ssse3(\&body_20_39);
589 &Xtail_ssse3(\&body_20_39);
590
591$code.=<<___;
592 movups $iv,48($out,$in0) # write output
593 mov 88(%rsp),$ivp # restore $ivp
594
595 add 0($ctx),$A # update context
596 add 4($ctx),@T[0]
597 add 8($ctx),$C
598 mov $A,0($ctx)
599 add 12($ctx),$D
600 mov @T[0],4($ctx)
601 add 16($ctx),$E
602 mov $C,8($ctx)
603 mov $D,12($ctx)
604 mov $E,16($ctx)
605 movups $iv,($ivp) # write IV
606___
607$code.=<<___ if ($win64);
608 movaps 96+0(%rsp),%xmm6
609 movaps 96+16(%rsp),%xmm7
610 movaps 96+32(%rsp),%xmm8
611 movaps 96+48(%rsp),%xmm9
612 movaps 96+64(%rsp),%xmm10
613 movaps 96+80(%rsp),%xmm11
614 movaps 96+96(%rsp),%xmm12
615 movaps 96+112(%rsp),%xmm13
616 movaps 96+128(%rsp),%xmm14
617 movaps 96+144(%rsp),%xmm15
618___
619$code.=<<___;
620 lea `104+($win64?10*16:0)`(%rsp),%rsi
621 mov 0(%rsi),%r15
622 mov 8(%rsi),%r14
623 mov 16(%rsi),%r13
624 mov 24(%rsi),%r12
625 mov 32(%rsi),%rbp
626 mov 40(%rsi),%rbx
627 lea 48(%rsi),%rsp
628.Lepilogue_ssse3:
629 ret
630.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
631___
632
633$j=$jj=$r=$sn=0;
634
635if ($avx) {
636my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
637
638my $Xi=4;
639my @X=map("%xmm$_",(4..7,0..3));
640my @Tx=map("%xmm$_",(8..10));
641my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
642my @T=("%esi","%edi");
643
644my $_rol=sub { &shld(@_[0],@_) };
645my $_ror=sub { &shrd(@_[0],@_) };
646
647$code.=<<___;
648.type aesni_cbc_sha1_enc_avx,\@function,6
649.align 16
650aesni_cbc_sha1_enc_avx:
651 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
652 #shr \$6,$len # debugging artefact
653 #jz .Lepilogue_avx # debugging artefact
654 push %rbx
655 push %rbp
656 push %r12
657 push %r13
658 push %r14
659 push %r15
660 lea `-104-($win64?10*16:0)`(%rsp),%rsp
661 #mov $in0,$inp # debugging artefact
662 #lea 64(%rsp),$ctx # debugging artefact
663___
664$code.=<<___ if ($win64);
665 movaps %xmm6,96+0(%rsp)
666 movaps %xmm7,96+16(%rsp)
667 movaps %xmm8,96+32(%rsp)
668 movaps %xmm9,96+48(%rsp)
669 movaps %xmm10,96+64(%rsp)
670 movaps %xmm11,96+80(%rsp)
671 movaps %xmm12,96+96(%rsp)
672 movaps %xmm13,96+112(%rsp)
673 movaps %xmm14,96+128(%rsp)
674 movaps %xmm15,96+144(%rsp)
675.Lprologue_avx:
676___
677$code.=<<___;
678 vzeroall
679 mov $in0,%r12 # reassign arguments
680 mov $out,%r13
681 mov $len,%r14
682 mov $key,%r15
683 vmovdqu ($ivp),$iv # load IV
684 mov $ivp,88(%rsp) # save $ivp
685___
686my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments
687my $rounds="${ivp}d";
688$code.=<<___;
689 shl \$6,$len
690 sub $in0,$out
691 mov 240($key),$rounds
692 add \$112,$key # size optimization
693 add $inp,$len # end of input
694
695 lea K_XX_XX(%rip),$K_XX_XX
696 mov 0($ctx),$A # load context
697 mov 4($ctx),$B
698 mov 8($ctx),$C
699 mov 12($ctx),$D
700 mov $B,@T[0] # magic seed
701 mov 16($ctx),$E
702
703 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
704 vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19
705 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
706 vmovdqu 16($inp),@X[-3&7]
707 vmovdqu 32($inp),@X[-2&7]
708 vmovdqu 48($inp),@X[-1&7]
709 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
710 add \$64,$inp
711 vpshufb @X[2],@X[-3&7],@X[-3&7]
712 vpshufb @X[2],@X[-2&7],@X[-2&7]
713 vpshufb @X[2],@X[-1&7],@X[-1&7]
714 vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19
715 vpaddd @Tx[1],@X[-3&7],@X[1]
716 vpaddd @Tx[1],@X[-2&7],@X[2]
717 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
718 vmovdqa @X[1],16(%rsp)
719 vmovdqa @X[2],32(%rsp)
720 vmovups -112($key),$rndkey0 # $key[0]
721 vmovups 16-112($key),$rndkey[0] # forward reference
722 jmp .Loop_avx
723___
724
725my $aesenc=sub {
726 use integer;
727 my ($n,$k)=($r/10,$r%10);
728 if ($k==0) {
729 $code.=<<___;
730 vmovups `16*$n`($in0),$in # load input
731 vxorps $rndkey0,$in,$in
732___
733 $code.=<<___ if ($n);
734 vmovups $iv,`16*($n-1)`($out,$in0) # write output
735___
736 $code.=<<___;
737 vxorps $in,$iv,$iv
738 vaesenc $rndkey[0],$iv,$iv
739 vmovups `32+16*$k-112`($key),$rndkey[1]
740___
741 } elsif ($k==9) {
742 $sn++;
743 $code.=<<___;
744 cmp \$11,$rounds
745 jb .Lvaesenclast$sn
746 vaesenc $rndkey[0],$iv,$iv
747 vmovups `32+16*($k+0)-112`($key),$rndkey[1]
748 vaesenc $rndkey[1],$iv,$iv
749 vmovups `32+16*($k+1)-112`($key),$rndkey[0]
750 je .Lvaesenclast$sn
751 vaesenc $rndkey[0],$iv,$iv
752 vmovups `32+16*($k+2)-112`($key),$rndkey[1]
753 vaesenc $rndkey[1],$iv,$iv
754 vmovups `32+16*($k+3)-112`($key),$rndkey[0]
755.Lvaesenclast$sn:
756 vaesenclast $rndkey[0],$iv,$iv
757 vmovups 16-112($key),$rndkey[1] # forward reference
758___
759 } else {
760 $code.=<<___;
761 vaesenc $rndkey[0],$iv,$iv
762 vmovups `32+16*$k-112`($key),$rndkey[1]
763___
764 }
765 $r++; unshift(@rndkey,pop(@rndkey));
766};
767
768sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
769{ use integer;
770 my $body = shift;
771 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
772 my ($a,$b,$c,$d,$e);
773
774 eval(shift(@insns));
775 eval(shift(@insns));
776 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
777 eval(shift(@insns));
778 eval(shift(@insns));
779
780 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
781 eval(shift(@insns));
782 eval(shift(@insns));
783 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
784 eval(shift(@insns));
785 eval(shift(@insns));
786 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
787 eval(shift(@insns));
788 eval(shift(@insns));
789
790 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
791 eval(shift(@insns));
792 eval(shift(@insns));
793 eval(shift(@insns));
794 eval(shift(@insns));
795
796 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
797 eval(shift(@insns));
798 eval(shift(@insns));
799 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
800 eval(shift(@insns));
801 eval(shift(@insns));
802
803 &vpsrld (@Tx[0],@X[0],31);
804 eval(shift(@insns));
805 eval(shift(@insns));
806 eval(shift(@insns));
807 eval(shift(@insns));
808
809 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
810 &vpaddd (@X[0],@X[0],@X[0]);
811 eval(shift(@insns));
812 eval(shift(@insns));
813 eval(shift(@insns));
814 eval(shift(@insns));
815
816 &vpsrld (@Tx[1],@Tx[2],30);
817 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
818 eval(shift(@insns));
819 eval(shift(@insns));
820 eval(shift(@insns));
821 eval(shift(@insns));
822
823 &vpslld (@Tx[2],@Tx[2],2);
824 &vpxor (@X[0],@X[0],@Tx[1]);
825 eval(shift(@insns));
826 eval(shift(@insns));
827 eval(shift(@insns));
828 eval(shift(@insns));
829
830 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
831 eval(shift(@insns));
832 eval(shift(@insns));
833 &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX
834 eval(shift(@insns));
835 eval(shift(@insns));
836
837
838 foreach (@insns) { eval; } # remaining instructions [if any]
839
840 $Xi++; push(@X,shift(@X)); # "rotate" X[]
841 push(@Tx,shift(@Tx));
842}
843
844sub Xupdate_avx_32_79()
845{ use integer;
846 my $body = shift;
847 my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
848 my ($a,$b,$c,$d,$e);
849
850 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
851 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
852 eval(shift(@insns)); # body_20_39
853 eval(shift(@insns));
854 eval(shift(@insns));
855 eval(shift(@insns)); # rol
856
857 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
858 eval(shift(@insns));
859 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
860 if ($Xi%5) {
861 &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
862 } else { # ... or load next one
863 &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
864 }
865 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
866 eval(shift(@insns)); # ror
867 eval(shift(@insns));
868
869 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
870 eval(shift(@insns)); # body_20_39
871 eval(shift(@insns));
872 eval(shift(@insns));
873 eval(shift(@insns)); # rol
874
875 &vpsrld (@Tx[0],@X[0],30);
876 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
877 eval(shift(@insns));
878 eval(shift(@insns));
879 eval(shift(@insns)); # ror
880 eval(shift(@insns));
881
882 &vpslld (@X[0],@X[0],2);
883 eval(shift(@insns)); # body_20_39
884 eval(shift(@insns));
885 eval(shift(@insns));
886 eval(shift(@insns)); # rol
887 eval(shift(@insns));
888 eval(shift(@insns));
889 eval(shift(@insns)); # ror
890 eval(shift(@insns));
891
892 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
893 eval(shift(@insns)); # body_20_39
894 eval(shift(@insns));
895 &vmovdqa (@Tx[1],@X[0]) if ($Xi<19);
896 eval(shift(@insns));
897 eval(shift(@insns)); # rol
898 eval(shift(@insns));
899 eval(shift(@insns));
900 eval(shift(@insns)); # rol
901 eval(shift(@insns));
902
903 foreach (@insns) { eval; } # remaining instructions
904
905 $Xi++; push(@X,shift(@X)); # "rotate" X[]
906 push(@Tx,shift(@Tx));
907}
908
909sub Xuplast_avx_80()
910{ use integer;
911 my $body = shift;
912 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
913 my ($a,$b,$c,$d,$e);
914
915 eval(shift(@insns));
916 &vpaddd (@Tx[1],@Tx[1],@X[-1&7]);
917 eval(shift(@insns));
918 eval(shift(@insns));
919 eval(shift(@insns));
920 eval(shift(@insns));
921
922 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
923
924 foreach (@insns) { eval; } # remaining instructions
925
926 &cmp ($inp,$len);
927 &je (".Ldone_avx");
928
929 unshift(@Tx,pop(@Tx));
930
931 &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
932 &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19
933 &vmovdqu(@X[-4&7],"0($inp)"); # load input
934 &vmovdqu(@X[-3&7],"16($inp)");
935 &vmovdqu(@X[-2&7],"32($inp)");
936 &vmovdqu(@X[-1&7],"48($inp)");
937 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
938 &add ($inp,64);
939
940 $Xi=0;
941}
942
943sub Xloop_avx()
944{ use integer;
945 my $body = shift;
946 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
947 my ($a,$b,$c,$d,$e);
948
949 eval(shift(@insns));
950 eval(shift(@insns));
951 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
952 eval(shift(@insns));
953 eval(shift(@insns));
954 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
955 eval(shift(@insns));
956 eval(shift(@insns));
957 eval(shift(@insns));
958 eval(shift(@insns));
959 &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
960 eval(shift(@insns));
961 eval(shift(@insns));
962
963 foreach (@insns) { eval; }
964 $Xi++;
965}
966
967sub Xtail_avx()
968{ use integer;
969 my $body = shift;
970 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
971 my ($a,$b,$c,$d,$e);
972
973 foreach (@insns) { eval; }
974}
975
976$code.=<<___;
977.align 16
978.Loop_avx:
979___
980 &Xupdate_avx_16_31(\&body_00_19);
981 &Xupdate_avx_16_31(\&body_00_19);
982 &Xupdate_avx_16_31(\&body_00_19);
983 &Xupdate_avx_16_31(\&body_00_19);
984 &Xupdate_avx_32_79(\&body_00_19);
985 &Xupdate_avx_32_79(\&body_20_39);
986 &Xupdate_avx_32_79(\&body_20_39);
987 &Xupdate_avx_32_79(\&body_20_39);
988 &Xupdate_avx_32_79(\&body_20_39);
989 &Xupdate_avx_32_79(\&body_20_39);
990 &Xupdate_avx_32_79(\&body_40_59);
991 &Xupdate_avx_32_79(\&body_40_59);
992 &Xupdate_avx_32_79(\&body_40_59);
993 &Xupdate_avx_32_79(\&body_40_59);
994 &Xupdate_avx_32_79(\&body_40_59);
995 &Xupdate_avx_32_79(\&body_20_39);
996 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
997
998 $saved_j=$j; @saved_V=@V;
999 $saved_r=$r; @saved_rndkey=@rndkey;
1000
1001 &Xloop_avx(\&body_20_39);
1002 &Xloop_avx(\&body_20_39);
1003 &Xloop_avx(\&body_20_39);
1004
1005$code.=<<___;
1006 vmovups $iv,48($out,$in0) # write output
1007 lea 64($in0),$in0
1008
1009 add 0($ctx),$A # update context
1010 add 4($ctx),@T[0]
1011 add 8($ctx),$C
1012 add 12($ctx),$D
1013 mov $A,0($ctx)
1014 add 16($ctx),$E
1015 mov @T[0],4($ctx)
1016 mov @T[0],$B # magic seed
1017 mov $C,8($ctx)
1018 mov $D,12($ctx)
1019 mov $E,16($ctx)
1020 jmp .Loop_avx
1021
1022.align 16
1023.Ldone_avx:
1024___
1025 $jj=$j=$saved_j; @V=@saved_V;
1026 $r=$saved_r; @rndkey=@saved_rndkey;
1027
1028 &Xtail_avx(\&body_20_39);
1029 &Xtail_avx(\&body_20_39);
1030 &Xtail_avx(\&body_20_39);
1031
1032$code.=<<___;
1033 vmovups $iv,48($out,$in0) # write output
1034 mov 88(%rsp),$ivp # restore $ivp
1035
1036 add 0($ctx),$A # update context
1037 add 4($ctx),@T[0]
1038 add 8($ctx),$C
1039 mov $A,0($ctx)
1040 add 12($ctx),$D
1041 mov @T[0],4($ctx)
1042 add 16($ctx),$E
1043 mov $C,8($ctx)
1044 mov $D,12($ctx)
1045 mov $E,16($ctx)
1046 vmovups $iv,($ivp) # write IV
1047 vzeroall
1048___
1049$code.=<<___ if ($win64);
1050 movaps 96+0(%rsp),%xmm6
1051 movaps 96+16(%rsp),%xmm7
1052 movaps 96+32(%rsp),%xmm8
1053 movaps 96+48(%rsp),%xmm9
1054 movaps 96+64(%rsp),%xmm10
1055 movaps 96+80(%rsp),%xmm11
1056 movaps 96+96(%rsp),%xmm12
1057 movaps 96+112(%rsp),%xmm13
1058 movaps 96+128(%rsp),%xmm14
1059 movaps 96+144(%rsp),%xmm15
1060___
1061$code.=<<___;
1062 lea `104+($win64?10*16:0)`(%rsp),%rsi
1063 mov 0(%rsi),%r15
1064 mov 8(%rsi),%r14
1065 mov 16(%rsi),%r13
1066 mov 24(%rsi),%r12
1067 mov 32(%rsi),%rbp
1068 mov 40(%rsi),%rbx
1069 lea 48(%rsi),%rsp
1070.Lepilogue_avx:
1071 ret
1072.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
1073___
1074}
1075$code.=<<___;
1076.align 64
1077K_XX_XX:
1078.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1079.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1080.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1081.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1082.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1083
1084.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1085.align 64
1086___
1087
1088# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1089# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1090if ($win64) {
1091$rec="%rcx";
1092$frame="%rdx";
1093$context="%r8";
1094$disp="%r9";
1095
1096$code.=<<___;
1097.extern __imp_RtlVirtualUnwind
1098.type ssse3_handler,\@abi-omnipotent
1099.align 16
1100ssse3_handler:
1101 push %rsi
1102 push %rdi
1103 push %rbx
1104 push %rbp
1105 push %r12
1106 push %r13
1107 push %r14
1108 push %r15
1109 pushfq
1110 sub \$64,%rsp
1111
1112 mov 120($context),%rax # pull context->Rax
1113 mov 248($context),%rbx # pull context->Rip
1114
1115 mov 8($disp),%rsi # disp->ImageBase
1116 mov 56($disp),%r11 # disp->HandlerData
1117
1118 mov 0(%r11),%r10d # HandlerData[0]
1119 lea (%rsi,%r10),%r10 # prologue label
1120 cmp %r10,%rbx # context->Rip<prologue label
1121 jb .Lcommon_seh_tail
1122
1123 mov 152($context),%rax # pull context->Rsp
1124
1125 mov 4(%r11),%r10d # HandlerData[1]
1126 lea (%rsi,%r10),%r10 # epilogue label
1127 cmp %r10,%rbx # context->Rip>=epilogue label
1128 jae .Lcommon_seh_tail
1129
1130 lea 96(%rax),%rsi
1131 lea 512($context),%rdi # &context.Xmm6
1132 mov \$20,%ecx
1133 .long 0xa548f3fc # cld; rep movsq
1134 lea `104+10*16`(%rax),%rax # adjust stack pointer
1135
1136 mov 0(%rax),%r15
1137 mov 8(%rax),%r14
1138 mov 16(%rax),%r13
1139 mov 24(%rax),%r12
1140 mov 32(%rax),%rbp
1141 mov 40(%rax),%rbx
1142 lea 48(%rax),%rax
1143 mov %rbx,144($context) # restore context->Rbx
1144 mov %rbp,160($context) # restore context->Rbp
1145 mov %r12,216($context) # restore context->R12
1146 mov %r13,224($context) # restore context->R13
1147 mov %r14,232($context) # restore context->R14
1148 mov %r15,240($context) # restore context->R15
1149
1150.Lcommon_seh_tail:
1151 mov 8(%rax),%rdi
1152 mov 16(%rax),%rsi
1153 mov %rax,152($context) # restore context->Rsp
1154 mov %rsi,168($context) # restore context->Rsi
1155 mov %rdi,176($context) # restore context->Rdi
1156
1157 mov 40($disp),%rdi # disp->ContextRecord
1158 mov $context,%rsi # context
1159 mov \$154,%ecx # sizeof(CONTEXT)
1160 .long 0xa548f3fc # cld; rep movsq
1161
1162 mov $disp,%rsi
1163 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1164 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1165 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1166 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1167 mov 40(%rsi),%r10 # disp->ContextRecord
1168 lea 56(%rsi),%r11 # &disp->HandlerData
1169 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1170 mov %r10,32(%rsp) # arg5
1171 mov %r11,40(%rsp) # arg6
1172 mov %r12,48(%rsp) # arg7
1173 mov %rcx,56(%rsp) # arg8, (NULL)
1174 call *__imp_RtlVirtualUnwind(%rip)
1175
1176 mov \$1,%eax # ExceptionContinueSearch
1177 add \$64,%rsp
1178 popfq
1179 pop %r15
1180 pop %r14
1181 pop %r13
1182 pop %r12
1183 pop %rbp
1184 pop %rbx
1185 pop %rdi
1186 pop %rsi
1187 ret
1188.size ssse3_handler,.-ssse3_handler
1189
1190.section .pdata
1191.align 4
1192 .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3
1193 .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3
1194 .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3
1195___
1196$code.=<<___ if ($avx);
1197 .rva .LSEH_begin_aesni_cbc_sha1_enc_avx
1198 .rva .LSEH_end_aesni_cbc_sha1_enc_avx
1199 .rva .LSEH_info_aesni_cbc_sha1_enc_avx
1200___
1201$code.=<<___;
1202.section .xdata
1203.align 8
1204.LSEH_info_aesni_cbc_sha1_enc_ssse3:
1205 .byte 9,0,0,0
1206 .rva ssse3_handler
1207 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
1208___
1209$code.=<<___ if ($avx);
1210.LSEH_info_aesni_cbc_sha1_enc_avx:
1211 .byte 9,0,0,0
1212 .rva ssse3_handler
1213 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1214___
1215}
1216
1217####################################################################
1218sub rex {
1219 local *opcode=shift;
1220 my ($dst,$src)=@_;
1221 my $rex=0;
1222
1223 $rex|=0x04 if($dst>=8);
1224 $rex|=0x01 if($src>=8);
1225 push @opcode,$rex|0x40 if($rex);
1226}
1227
1228sub aesni {
1229 my $line=shift;
1230 my @opcode=(0x66);
1231
1232 if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1233 my %opcodelet = (
1234 "aesenc" => 0xdc, "aesenclast" => 0xdd
1235 );
1236 return undef if (!defined($opcodelet{$1}));
1237 rex(\@opcode,$3,$2);
1238 push @opcode,0x0f,0x38,$opcodelet{$1};
1239 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
1240 return ".byte\t".join(',',@opcode);
1241 }
1242 return $line;
1243}
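
For a concrete (hypothetical) datapoint on the translator above: an
input line "aesenc %xmm15,%xmm14" matches with src=15 and dst=14, so
rex() contributes a 0x45 REX byte and the line comes back as
hard-coded machine code:

    # print aesni("\taesenc\t%xmm15,%xmm14"), "\n";
    # emits:  .byte   102,69,15,56,220,247
    # i.e. 0x66 0x45 0x0f 0x38 0xdc 0xf7: the 66 prefix, REX.RB, the
    # 0f 38 dc aesenc opcode, and ModR/M 0xf7 = xmm14 (reg), xmm15 (r/m)
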
1244
1245$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1246$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
1247
1248print $code;
1249close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86.pl b/src/lib/libcrypto/aes/asm/aesni-x86.pl
new file mode 100644
index 0000000000..3dc345b585
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aesni-x86.pl
@@ -0,0 +1,2189 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# This module implements support for the Intel AES-NI extension. In
11# the OpenSSL context it's used with the Intel engine, but it can also
12# be used as a drop-in replacement for crypto/aes/asm/aes-586.pl [see
13# below for details].
14#
15# Performance.
16#
17# To start with see corresponding paragraph in aesni-x86_64.pl...
18# Instead of filling table similar to one found there I've chosen to
19# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20# The simplified table below represents 32-bit performance relative
21# to 64-bit one in every given point. Ratios vary for different
22# encryption modes, therefore interval values.
23#
24# 16-byte 64-byte 256-byte 1-KB 8-KB
25# 53-67% 67-84% 91-94% 95-98% 97-99.5%
26#
27# Lower ratios for smaller block sizes are perfectly understandable,
28# because function call overhead is higher in 32-bit mode. Largest
29# 8-KB block performance is virtually the same: 32-bit code is less
30# than 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
31
32# January 2011
33#
34# See aesni-x86_64.pl for details. Unlike x86_64 version this module
35# interleaves at most 6 aes[enc|dec] instructions, because there are
36# not enough registers for 8x interleave [which should be optimal for
37# Sandy Bridge]. Actually, performance results for 6x interleave
38# factor presented in aesni-x86_64.pl (except for CTR) are for this
39# module.
40
41# April 2011
42#
43# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
45
46$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
47 # generates drop-in replacement for
48 # crypto/aes/asm/aes-586.pl:-)
49$inline=1; # inline _aesni_[en|de]crypt
50
51$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52push(@INC,"${dir}","${dir}../../perlasm");
53require "x86asm.pl";
54
55&asm_init($ARGV[0],$0);
56
57if ($PREFIX eq "aesni") { $movekey=*movups; }
58else { $movekey=*movups; }
59
60$len="eax";
61$rounds="ecx";
62$key="edx";
63$inp="esi";
64$out="edi";
65$rounds_="ebx"; # backup copy for $rounds
66$key_="ebp"; # backup copy for $key
67
68$rndkey0="xmm0";
69$rndkey1="xmm1";
70$inout0="xmm2";
71$inout1="xmm3";
72$inout2="xmm4";
73$inout3="xmm5"; $in1="xmm5";
74$inout4="xmm6"; $in0="xmm6";
75$inout5="xmm7"; $ivec="xmm7";
76
77# AESNI extension
78sub aeskeygenassist
79{ my($dst,$src,$imm)=@_;
80 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
81 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
82}
83sub aescommon
84{ my($opcodelet,$dst,$src)=@_;
85 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
86 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
87}
88sub aesimc { aescommon(0xdb,@_); }
89sub aesenc { aescommon(0xdc,@_); }
90sub aesenclast { aescommon(0xdd,@_); }
91sub aesdec { aescommon(0xde,@_); }
92sub aesdeclast { aescommon(0xdf,@_); }
93
94# Inline version of internal aesni_[en|de]crypt1
95{ my $sn;
96sub aesni_inline_generate1
97{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
98 $sn++;
99
100 &$movekey ($rndkey0,&QWP(0,$key));
101 &$movekey ($rndkey1,&QWP(16,$key));
102 &xorps ($ivec,$rndkey0) if (defined($ivec));
103 &lea ($key,&DWP(32,$key));
104 &xorps ($inout,$ivec) if (defined($ivec));
105 &xorps ($inout,$rndkey0) if (!defined($ivec));
106 &set_label("${p}1_loop_$sn");
107 eval"&aes${p} ($inout,$rndkey1)";
108 &dec ($rounds);
109 &$movekey ($rndkey1,&QWP(0,$key));
110 &lea ($key,&DWP(16,$key));
111 &jnz (&label("${p}1_loop_$sn"));
112 eval"&aes${p}last ($inout,$rndkey1)";
113}}
114
115sub aesni_generate1 # fully unrolled loop
116{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
117
118 &function_begin_B("_aesni_${p}rypt1");
119 &movups ($rndkey0,&QWP(0,$key));
120 &$movekey ($rndkey1,&QWP(0x10,$key));
121 &xorps ($inout,$rndkey0);
122 &$movekey ($rndkey0,&QWP(0x20,$key));
123 &lea ($key,&DWP(0x30,$key));
124 &cmp ($rounds,11);
125 &jb (&label("${p}128"));
126 &lea ($key,&DWP(0x20,$key));
127 &je (&label("${p}192"));
128 &lea ($key,&DWP(0x20,$key));
129 eval"&aes${p} ($inout,$rndkey1)";
130 &$movekey ($rndkey1,&QWP(-0x40,$key));
131 eval"&aes${p} ($inout,$rndkey0)";
132 &$movekey ($rndkey0,&QWP(-0x30,$key));
133 &set_label("${p}192");
134 eval"&aes${p} ($inout,$rndkey1)";
135 &$movekey ($rndkey1,&QWP(-0x20,$key));
136 eval"&aes${p} ($inout,$rndkey0)";
137 &$movekey ($rndkey0,&QWP(-0x10,$key));
138 &set_label("${p}128");
139 eval"&aes${p} ($inout,$rndkey1)";
140 &$movekey ($rndkey1,&QWP(0,$key));
141 eval"&aes${p} ($inout,$rndkey0)";
142 &$movekey ($rndkey0,&QWP(0x10,$key));
143 eval"&aes${p} ($inout,$rndkey1)";
144 &$movekey ($rndkey1,&QWP(0x20,$key));
145 eval"&aes${p} ($inout,$rndkey0)";
146 &$movekey ($rndkey0,&QWP(0x30,$key));
147 eval"&aes${p} ($inout,$rndkey1)";
148 &$movekey ($rndkey1,&QWP(0x40,$key));
149 eval"&aes${p} ($inout,$rndkey0)";
150 &$movekey ($rndkey0,&QWP(0x50,$key));
151 eval"&aes${p} ($inout,$rndkey1)";
152 &$movekey ($rndkey1,&QWP(0x60,$key));
153 eval"&aes${p} ($inout,$rndkey0)";
154 &$movekey ($rndkey0,&QWP(0x70,$key));
155 eval"&aes${p} ($inout,$rndkey1)";
156 eval"&aes${p}last ($inout,$rndkey0)";
157 &ret();
158 &function_end_B("_aesni_${p}rypt1");
159}
160
161# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
162&aesni_generate1("enc") if (!$inline);
163&function_begin_B("${PREFIX}_encrypt");
164 &mov ("eax",&wparam(0));
165 &mov ($key,&wparam(2));
166 &movups ($inout0,&QWP(0,"eax"));
167 &mov ($rounds,&DWP(240,$key));
168 &mov ("eax",&wparam(1));
169 if ($inline)
170 { &aesni_inline_generate1("enc"); }
171 else
172 { &call ("_aesni_encrypt1"); }
173 &movups (&QWP(0,"eax"),$inout0);
174 &ret ();
175&function_end_B("${PREFIX}_encrypt");
176
177# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
178&aesni_generate1("dec") if(!$inline);
179&function_begin_B("${PREFIX}_decrypt");
180 &mov ("eax",&wparam(0));
181 &mov ($key,&wparam(2));
182 &movups ($inout0,&QWP(0,"eax"));
183 &mov ($rounds,&DWP(240,$key));
184 &mov ("eax",&wparam(1));
185 if ($inline)
186 { &aesni_inline_generate1("dec"); }
187 else
188 { &call ("_aesni_decrypt1"); }
189 &movups (&QWP(0,"eax"),$inout0);
190 &ret ();
191&function_end_B("${PREFIX}_decrypt");
192
193# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
194# factor. Why 3x subroutine were originally used in loops? Even though
195# aes[enc|dec] latency was originally 6, it could be scheduled only
196# every *2nd* cycle. Thus 3x interleave was the one providing optimal
197# utilization, i.e. when subroutine's throughput is virtually same as
198# of non-interleaved subroutine [for number of input blocks up to 3].
199# This is why it makes no sense to implement 2x subroutine.
200# aes[enc|dec] latency in next processor generation is 8, but the
201# instructions can be scheduled every cycle. Optimal interleave for
202# new processor is therefore 8x, but it's unfeasible to accommodate it
203# in XMM registers addreassable in 32-bit mode and therefore 6x is
204# used instead...
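#
# [Illustrative note, not part of the original module] Interleaving
# simply keeps several independent blocks in flight so that one
# block's aes[enc|dec] latency is hidden behind the others'. A minimal
# C-intrinsics sketch of the 3x idea, assuming <wmmintrin.h> and an
# already expanded key schedule rk[0..rounds]:
#
#	b0 = _mm_xor_si128(b0, rk[0]);
#	b1 = _mm_xor_si128(b1, rk[0]);
#	b2 = _mm_xor_si128(b2, rk[0]);
#	for (i = 1; i < rounds; i++) {	/* 3 independent dep. chains */
#		b0 = _mm_aesenc_si128(b0, rk[i]);
#		b1 = _mm_aesenc_si128(b1, rk[i]);
#		b2 = _mm_aesenc_si128(b2, rk[i]);
#	}
#	b0 = _mm_aesenclast_si128(b0, rk[rounds]);
#	b1 = _mm_aesenclast_si128(b1, rk[rounds]);
#	b2 = _mm_aesenclast_si128(b2, rk[rounds]);
#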

sub aesni_generate3
{ my $p=shift;

 &function_begin_B("_aesni_${p}rypt3");
 &$movekey ($rndkey0,&QWP(0,$key));
 &shr ($rounds,1);
 &$movekey ($rndkey1,&QWP(16,$key));
 &lea ($key,&DWP(32,$key));
 &xorps ($inout0,$rndkey0);
 &pxor ($inout1,$rndkey0);
 &pxor ($inout2,$rndkey0);
 &$movekey ($rndkey0,&QWP(0,$key));

 &set_label("${p}3_loop");
 eval"&aes${p} ($inout0,$rndkey1)";
 eval"&aes${p} ($inout1,$rndkey1)";
 &dec ($rounds);
 eval"&aes${p} ($inout2,$rndkey1)";
 &$movekey ($rndkey1,&QWP(16,$key));
 eval"&aes${p} ($inout0,$rndkey0)";
 eval"&aes${p} ($inout1,$rndkey0)";
 &lea ($key,&DWP(32,$key));
 eval"&aes${p} ($inout2,$rndkey0)";
 &$movekey ($rndkey0,&QWP(0,$key));
 &jnz (&label("${p}3_loop"));
 eval"&aes${p} ($inout0,$rndkey1)";
 eval"&aes${p} ($inout1,$rndkey1)";
 eval"&aes${p} ($inout2,$rndkey1)";
 eval"&aes${p}last ($inout0,$rndkey0)";
 eval"&aes${p}last ($inout1,$rndkey0)";
 eval"&aes${p}last ($inout2,$rndkey0)";
 &ret();
 &function_end_B("_aesni_${p}rypt3");
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] the 4-block case by ~30%. One can argue
# that one should have implemented 5x as well, but the improvement
# would be <20%, so it's not worth it...
sub aesni_generate4
{ my $p=shift;

 &function_begin_B("_aesni_${p}rypt4");
 &$movekey ($rndkey0,&QWP(0,$key));
 &$movekey ($rndkey1,&QWP(16,$key));
 &shr ($rounds,1);
 &lea ($key,&DWP(32,$key));
 &xorps ($inout0,$rndkey0);
 &pxor ($inout1,$rndkey0);
 &pxor ($inout2,$rndkey0);
 &pxor ($inout3,$rndkey0);
 &$movekey ($rndkey0,&QWP(0,$key));

 &set_label("${p}4_loop");
 eval"&aes${p} ($inout0,$rndkey1)";
 eval"&aes${p} ($inout1,$rndkey1)";
 &dec ($rounds);
 eval"&aes${p} ($inout2,$rndkey1)";
 eval"&aes${p} ($inout3,$rndkey1)";
 &$movekey ($rndkey1,&QWP(16,$key));
 eval"&aes${p} ($inout0,$rndkey0)";
 eval"&aes${p} ($inout1,$rndkey0)";
 &lea ($key,&DWP(32,$key));
 eval"&aes${p} ($inout2,$rndkey0)";
 eval"&aes${p} ($inout3,$rndkey0)";
 &$movekey ($rndkey0,&QWP(0,$key));
 &jnz (&label("${p}4_loop"));

 eval"&aes${p} ($inout0,$rndkey1)";
 eval"&aes${p} ($inout1,$rndkey1)";
 eval"&aes${p} ($inout2,$rndkey1)";
 eval"&aes${p} ($inout3,$rndkey1)";
 eval"&aes${p}last ($inout0,$rndkey0)";
 eval"&aes${p}last ($inout1,$rndkey0)";
 eval"&aes${p}last ($inout2,$rndkey0)";
 eval"&aes${p}last ($inout3,$rndkey0)";
 &ret();
 &function_end_B("_aesni_${p}rypt4");
}

sub aesni_generate6
{ my $p=shift;

 &function_begin_B("_aesni_${p}rypt6");
 &static_label("_aesni_${p}rypt6_enter");
 &$movekey ($rndkey0,&QWP(0,$key));
 &shr ($rounds,1);
 &$movekey ($rndkey1,&QWP(16,$key));
 &lea ($key,&DWP(32,$key));
 &xorps ($inout0,$rndkey0);
 &pxor ($inout1,$rndkey0); # pxor does better here
 eval"&aes${p} ($inout0,$rndkey1)";
 &pxor ($inout2,$rndkey0);
 eval"&aes${p} ($inout1,$rndkey1)";
 &pxor ($inout3,$rndkey0);
 &dec ($rounds);
 eval"&aes${p} ($inout2,$rndkey1)";
 &pxor ($inout4,$rndkey0);
 eval"&aes${p} ($inout3,$rndkey1)";
 &pxor ($inout5,$rndkey0);
 eval"&aes${p} ($inout4,$rndkey1)";
 &$movekey ($rndkey0,&QWP(0,$key));
 eval"&aes${p} ($inout5,$rndkey1)";
 &jmp (&label("_aesni_${p}rypt6_enter"));

 &set_label("${p}6_loop",16);
 eval"&aes${p} ($inout0,$rndkey1)";
 eval"&aes${p} ($inout1,$rndkey1)";
 &dec ($rounds);
 eval"&aes${p} ($inout2,$rndkey1)";
 eval"&aes${p} ($inout3,$rndkey1)";
 eval"&aes${p} ($inout4,$rndkey1)";
 eval"&aes${p} ($inout5,$rndkey1)";
 &set_label("_aesni_${p}rypt6_enter",16);
 &$movekey ($rndkey1,&QWP(16,$key));
 eval"&aes${p} ($inout0,$rndkey0)";
 eval"&aes${p} ($inout1,$rndkey0)";
 &lea ($key,&DWP(32,$key));
 eval"&aes${p} ($inout2,$rndkey0)";
 eval"&aes${p} ($inout3,$rndkey0)";
 eval"&aes${p} ($inout4,$rndkey0)";
 eval"&aes${p} ($inout5,$rndkey0)";
 &$movekey ($rndkey0,&QWP(0,$key));
 &jnz (&label("${p}6_loop"));

 eval"&aes${p} ($inout0,$rndkey1)";
 eval"&aes${p} ($inout1,$rndkey1)";
 eval"&aes${p} ($inout2,$rndkey1)";
 eval"&aes${p} ($inout3,$rndkey1)";
 eval"&aes${p} ($inout4,$rndkey1)";
 eval"&aes${p} ($inout5,$rndkey1)";
 eval"&aes${p}last ($inout0,$rndkey0)";
 eval"&aes${p}last ($inout1,$rndkey0)";
 eval"&aes${p}last ($inout2,$rndkey0)";
 eval"&aes${p}last ($inout3,$rndkey0)";
 eval"&aes${p}last ($inout4,$rndkey0)";
 eval"&aes${p}last ($inout5,$rndkey0)";
 &ret();
 &function_end_B("_aesni_${p}rypt6");
}
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");

if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
&function_begin("aesni_ecb_encrypt");
 &mov ($inp,&wparam(0));
 &mov ($out,&wparam(1));
 &mov ($len,&wparam(2));
 &mov ($key,&wparam(3));
 &mov ($rounds_,&wparam(4));
 &and ($len,-16);
 &jz (&label("ecb_ret"));
 &mov ($rounds,&DWP(240,$key));
 &test ($rounds_,$rounds_);
 &jz (&label("ecb_decrypt"));

 &mov ($key_,$key); # backup $key
 &mov ($rounds_,$rounds); # backup $rounds
 &cmp ($len,0x60);
 &jb (&label("ecb_enc_tail"));

 &movdqu ($inout0,&QWP(0,$inp));
 &movdqu ($inout1,&QWP(0x10,$inp));
 &movdqu ($inout2,&QWP(0x20,$inp));
 &movdqu ($inout3,&QWP(0x30,$inp));
 &movdqu ($inout4,&QWP(0x40,$inp));
 &movdqu ($inout5,&QWP(0x50,$inp));
 &lea ($inp,&DWP(0x60,$inp));
 &sub ($len,0x60);
 &jmp (&label("ecb_enc_loop6_enter"));

&set_label("ecb_enc_loop6",16);
 &movups (&QWP(0,$out),$inout0);
 &movdqu ($inout0,&QWP(0,$inp));
 &movups (&QWP(0x10,$out),$inout1);
 &movdqu ($inout1,&QWP(0x10,$inp));
 &movups (&QWP(0x20,$out),$inout2);
 &movdqu ($inout2,&QWP(0x20,$inp));
 &movups (&QWP(0x30,$out),$inout3);
 &movdqu ($inout3,&QWP(0x30,$inp));
 &movups (&QWP(0x40,$out),$inout4);
 &movdqu ($inout4,&QWP(0x40,$inp));
 &movups (&QWP(0x50,$out),$inout5);
 &lea ($out,&DWP(0x60,$out));
 &movdqu ($inout5,&QWP(0x50,$inp));
 &lea ($inp,&DWP(0x60,$inp));
&set_label("ecb_enc_loop6_enter");

 &call ("_aesni_encrypt6");

 &mov ($key,$key_); # restore $key
 &mov ($rounds,$rounds_); # restore $rounds
 &sub ($len,0x60);
 &jnc (&label("ecb_enc_loop6"));

 &movups (&QWP(0,$out),$inout0);
 &movups (&QWP(0x10,$out),$inout1);
 &movups (&QWP(0x20,$out),$inout2);
 &movups (&QWP(0x30,$out),$inout3);
 &movups (&QWP(0x40,$out),$inout4);
 &movups (&QWP(0x50,$out),$inout5);
 &lea ($out,&DWP(0x60,$out));
 &add ($len,0x60);
 &jz (&label("ecb_ret"));

&set_label("ecb_enc_tail");
 &movups ($inout0,&QWP(0,$inp));
 &cmp ($len,0x20);
 &jb (&label("ecb_enc_one"));
 &movups ($inout1,&QWP(0x10,$inp));
 &je (&label("ecb_enc_two"));
 &movups ($inout2,&QWP(0x20,$inp));
 &cmp ($len,0x40);
 &jb (&label("ecb_enc_three"));
 &movups ($inout3,&QWP(0x30,$inp));
 &je (&label("ecb_enc_four"));
 &movups ($inout4,&QWP(0x40,$inp));
 &xorps ($inout5,$inout5);
 &call ("_aesni_encrypt6");
 &movups (&QWP(0,$out),$inout0);
 &movups (&QWP(0x10,$out),$inout1);
 &movups (&QWP(0x20,$out),$inout2);
 &movups (&QWP(0x30,$out),$inout3);
 &movups (&QWP(0x40,$out),$inout4);
 &jmp (&label("ecb_ret"));

&set_label("ecb_enc_one",16);
 if ($inline)
 { &aesni_inline_generate1("enc"); }
 else
 { &call ("_aesni_encrypt1"); }
 &movups (&QWP(0,$out),$inout0);
 &jmp (&label("ecb_ret"));

&set_label("ecb_enc_two",16);
 &xorps ($inout2,$inout2);
 &call ("_aesni_encrypt3");
 &movups (&QWP(0,$out),$inout0);
 &movups (&QWP(0x10,$out),$inout1);
 &jmp (&label("ecb_ret"));

&set_label("ecb_enc_three",16);
 &call ("_aesni_encrypt3");
 &movups (&QWP(0,$out),$inout0);
 &movups (&QWP(0x10,$out),$inout1);
 &movups (&QWP(0x20,$out),$inout2);
 &jmp (&label("ecb_ret"));

&set_label("ecb_enc_four",16);
 &call ("_aesni_encrypt4");
 &movups (&QWP(0,$out),$inout0);
 &movups (&QWP(0x10,$out),$inout1);
 &movups (&QWP(0x20,$out),$inout2);
 &movups (&QWP(0x30,$out),$inout3);
 &jmp (&label("ecb_ret"));
######################################################################
&set_label("ecb_decrypt",16);
 &mov ($key_,$key); # backup $key
 &mov ($rounds_,$rounds); # backup $rounds
 &cmp ($len,0x60);
 &jb (&label("ecb_dec_tail"));

 &movdqu ($inout0,&QWP(0,$inp));
 &movdqu ($inout1,&QWP(0x10,$inp));
 &movdqu ($inout2,&QWP(0x20,$inp));
 &movdqu ($inout3,&QWP(0x30,$inp));
 &movdqu ($inout4,&QWP(0x40,$inp));
 &movdqu ($inout5,&QWP(0x50,$inp));
 &lea ($inp,&DWP(0x60,$inp));
 &sub ($len,0x60);
 &jmp (&label("ecb_dec_loop6_enter"));

&set_label("ecb_dec_loop6",16);
 &movups (&QWP(0,$out),$inout0);
 &movdqu ($inout0,&QWP(0,$inp));
 &movups (&QWP(0x10,$out),$inout1);
 &movdqu ($inout1,&QWP(0x10,$inp));
 &movups (&QWP(0x20,$out),$inout2);
 &movdqu ($inout2,&QWP(0x20,$inp));
 &movups (&QWP(0x30,$out),$inout3);
 &movdqu ($inout3,&QWP(0x30,$inp));
 &movups (&QWP(0x40,$out),$inout4);
 &movdqu ($inout4,&QWP(0x40,$inp));
 &movups (&QWP(0x50,$out),$inout5);
 &lea ($out,&DWP(0x60,$out));
 &movdqu ($inout5,&QWP(0x50,$inp));
 &lea ($inp,&DWP(0x60,$inp));
&set_label("ecb_dec_loop6_enter");

 &call ("_aesni_decrypt6");

 &mov ($key,$key_); # restore $key
 &mov ($rounds,$rounds_); # restore $rounds
 &sub ($len,0x60);
 &jnc (&label("ecb_dec_loop6"));

 &movups (&QWP(0,$out),$inout0);
 &movups (&QWP(0x10,$out),$inout1);
 &movups (&QWP(0x20,$out),$inout2);
 &movups (&QWP(0x30,$out),$inout3);
 &movups (&QWP(0x40,$out),$inout4);
 &movups (&QWP(0x50,$out),$inout5);
 &lea ($out,&DWP(0x60,$out));
 &add ($len,0x60);
 &jz (&label("ecb_ret"));

&set_label("ecb_dec_tail");
 &movups ($inout0,&QWP(0,$inp));
 &cmp ($len,0x20);
 &jb (&label("ecb_dec_one"));
 &movups ($inout1,&QWP(0x10,$inp));
 &je (&label("ecb_dec_two"));
 &movups ($inout2,&QWP(0x20,$inp));
 &cmp ($len,0x40);
 &jb (&label("ecb_dec_three"));
 &movups ($inout3,&QWP(0x30,$inp));
 &je (&label("ecb_dec_four"));
 &movups ($inout4,&QWP(0x40,$inp));
 &xorps ($inout5,$inout5);
 &call ("_aesni_decrypt6");
 &movups (&QWP(0,$out),$inout0);
 &movups (&QWP(0x10,$out),$inout1);
 &movups (&QWP(0x20,$out),$inout2);
 &movups (&QWP(0x30,$out),$inout3);
 &movups (&QWP(0x40,$out),$inout4);
 &jmp (&label("ecb_ret"));

&set_label("ecb_dec_one",16);
 if ($inline)
 { &aesni_inline_generate1("dec"); }
 else
 { &call ("_aesni_decrypt1"); }
 &movups (&QWP(0,$out),$inout0);
 &jmp (&label("ecb_ret"));

&set_label("ecb_dec_two",16);
 &xorps ($inout2,$inout2);
 &call ("_aesni_decrypt3");
 &movups (&QWP(0,$out),$inout0);
 &movups (&QWP(0x10,$out),$inout1);
 &jmp (&label("ecb_ret"));

&set_label("ecb_dec_three",16);
 &call ("_aesni_decrypt3");
 &movups (&QWP(0,$out),$inout0);
 &movups (&QWP(0x10,$out),$inout1);
 &movups (&QWP(0x20,$out),$inout2);
 &jmp (&label("ecb_ret"));

&set_label("ecb_dec_four",16);
 &call ("_aesni_decrypt4");
 &movups (&QWP(0,$out),$inout0);
 &movups (&QWP(0x10,$out),$inout1);
 &movups (&QWP(0x20,$out),$inout2);
 &movups (&QWP(0x30,$out),$inout3);

&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");

######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                                       size_t blocks, const AES_KEY *key,
#                                       const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
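# [Illustrative note, not part of the original module] Per 16-byte
# block the routine keeps two chains in flight: the CTR keystream
# block and the CBC-MAC accumulator. Roughly, in C, with AES()
# denoting one full block encryption under *key:
#
#	cmac   = AES(cmac ^ in[i]);	/* CBC-MAC step, over plaintext */
#	out[i] = in[i] ^ AES(ctr);	/* CTR encryption of the block  */
#	ctr64++;			/* only low 64 bits increment   */
#
# (decryption MACs out[i], the recovered plaintext, instead). The two
# AES invocations are independent, hence the 2x interleave below.
#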
{ my $cmac=$inout1;
&function_begin("aesni_ccm64_encrypt_blocks");
 &mov ($inp,&wparam(0));
 &mov ($out,&wparam(1));
 &mov ($len,&wparam(2));
 &mov ($key,&wparam(3));
 &mov ($rounds_,&wparam(4));
 &mov ($rounds,&wparam(5));
 &mov ($key_,"esp");
 &sub ("esp",60);
 &and ("esp",-16); # align stack
 &mov (&DWP(48,"esp"),$key_);

 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
 &mov ($rounds,&DWP(240,$key));

 # compose byte-swap control mask for pshufb on stack
 &mov (&DWP(0,"esp"),0x0c0d0e0f);
 &mov (&DWP(4,"esp"),0x08090a0b);
 &mov (&DWP(8,"esp"),0x04050607);
 &mov (&DWP(12,"esp"),0x00010203);

 # compose counter increment vector on stack
 &mov ($rounds_,1);
 &xor ($key_,$key_);
 &mov (&DWP(16,"esp"),$rounds_);
 &mov (&DWP(20,"esp"),$key_);
 &mov (&DWP(24,"esp"),$key_);
 &mov (&DWP(28,"esp"),$key_);

 &shr ($rounds,1);
 &lea ($key_,&DWP(0,$key));
 &movdqa ($inout3,&QWP(0,"esp"));
 &movdqa ($inout0,$ivec);
 &mov ($rounds_,$rounds);
 &pshufb ($ivec,$inout3);

&set_label("ccm64_enc_outer");
 &$movekey ($rndkey0,&QWP(0,$key_));
 &mov ($rounds,$rounds_);
 &movups ($in0,&QWP(0,$inp));

 &xorps ($inout0,$rndkey0);
 &$movekey ($rndkey1,&QWP(16,$key_));
 &xorps ($rndkey0,$in0);
 &lea ($key,&DWP(32,$key_));
 &xorps ($cmac,$rndkey0); # cmac^=inp
 &$movekey ($rndkey0,&QWP(0,$key));

&set_label("ccm64_enc2_loop");
 &aesenc ($inout0,$rndkey1);
 &dec ($rounds);
 &aesenc ($cmac,$rndkey1);
 &$movekey ($rndkey1,&QWP(16,$key));
 &aesenc ($inout0,$rndkey0);
 &lea ($key,&DWP(32,$key));
 &aesenc ($cmac,$rndkey0);
 &$movekey ($rndkey0,&QWP(0,$key));
 &jnz (&label("ccm64_enc2_loop"));
 &aesenc ($inout0,$rndkey1);
 &aesenc ($cmac,$rndkey1);
 &paddq ($ivec,&QWP(16,"esp"));
 &aesenclast ($inout0,$rndkey0);
 &aesenclast ($cmac,$rndkey0);

 &dec ($len);
 &lea ($inp,&DWP(16,$inp));
 &xorps ($in0,$inout0); # inp^=E(ivec)
 &movdqa ($inout0,$ivec);
 &movups (&QWP(0,$out),$in0); # save output
 &lea ($out,&DWP(16,$out));
 &pshufb ($inout0,$inout3);
 &jnz (&label("ccm64_enc_outer"));

 &mov ("esp",&DWP(48,"esp"));
 &mov ($out,&wparam(5));
 &movups (&QWP(0,$out),$cmac);
&function_end("aesni_ccm64_encrypt_blocks");

&function_begin("aesni_ccm64_decrypt_blocks");
 &mov ($inp,&wparam(0));
 &mov ($out,&wparam(1));
 &mov ($len,&wparam(2));
 &mov ($key,&wparam(3));
 &mov ($rounds_,&wparam(4));
 &mov ($rounds,&wparam(5));
 &mov ($key_,"esp");
 &sub ("esp",60);
 &and ("esp",-16); # align stack
 &mov (&DWP(48,"esp"),$key_);

 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
 &mov ($rounds,&DWP(240,$key));

 # compose byte-swap control mask for pshufb on stack
 &mov (&DWP(0,"esp"),0x0c0d0e0f);
 &mov (&DWP(4,"esp"),0x08090a0b);
 &mov (&DWP(8,"esp"),0x04050607);
 &mov (&DWP(12,"esp"),0x00010203);

 # compose counter increment vector on stack
 &mov ($rounds_,1);
 &xor ($key_,$key_);
 &mov (&DWP(16,"esp"),$rounds_);
 &mov (&DWP(20,"esp"),$key_);
 &mov (&DWP(24,"esp"),$key_);
 &mov (&DWP(28,"esp"),$key_);

 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
 &movdqa ($inout0,$ivec);

 &mov ($key_,$key);
 &mov ($rounds_,$rounds);

 &pshufb ($ivec,$inout3);
 if ($inline)
 { &aesni_inline_generate1("enc"); }
 else
 { &call ("_aesni_encrypt1"); }
 &movups ($in0,&QWP(0,$inp)); # load inp
 &paddq ($ivec,&QWP(16,"esp"));
 &lea ($inp,&QWP(16,$inp));
 &jmp (&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
 &xorps ($in0,$inout0); # inp ^= E(ivec)
 &movdqa ($inout0,$ivec);
 &mov ($rounds,$rounds_);
 &movups (&QWP(0,$out),$in0); # save output
 &lea ($out,&DWP(16,$out));
 &pshufb ($inout0,$inout3);

 &sub ($len,1);
 &jz (&label("ccm64_dec_break"));

 &$movekey ($rndkey0,&QWP(0,$key_));
 &shr ($rounds,1);
 &$movekey ($rndkey1,&QWP(16,$key_));
 &xorps ($in0,$rndkey0);
 &lea ($key,&DWP(32,$key_));
 &xorps ($inout0,$rndkey0);
 &xorps ($cmac,$in0); # cmac^=out
 &$movekey ($rndkey0,&QWP(0,$key));

&set_label("ccm64_dec2_loop");
 &aesenc ($inout0,$rndkey1);
 &dec ($rounds);
 &aesenc ($cmac,$rndkey1);
 &$movekey ($rndkey1,&QWP(16,$key));
 &aesenc ($inout0,$rndkey0);
 &lea ($key,&DWP(32,$key));
 &aesenc ($cmac,$rndkey0);
 &$movekey ($rndkey0,&QWP(0,$key));
 &jnz (&label("ccm64_dec2_loop"));
 &movups ($in0,&QWP(0,$inp)); # load inp
 &paddq ($ivec,&QWP(16,"esp"));
 &aesenc ($inout0,$rndkey1);
 &aesenc ($cmac,$rndkey1);
 &lea ($inp,&QWP(16,$inp));
 &aesenclast ($inout0,$rndkey0);
 &aesenclast ($cmac,$rndkey0);
 &jmp (&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
 &mov ($key,$key_);
 if ($inline)
 { &aesni_inline_generate1("enc",$cmac,$in0); }
 else
 { &call ("_aesni_encrypt1",$cmac); }

 &mov ("esp",&DWP(48,"esp"));
 &mov ($out,&wparam(5));
 &movups (&QWP(0,$out),$cmac);
&function_end("aesni_ccm64_decrypt_blocks");
}

######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                                  size_t blocks, const AES_KEY *key,
#                                  const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see engine/eng_aesni.c for details)
#
# stack layout:
# 0 pshufb mask
# 16 vector addend: 0,6,6,6
# 32 counter-less ivec
# 48 1st triplet of counter vector
# 64 2nd triplet of counter vector
# 80 saved %esp

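# [Illustrative note, not part of the original module] The counter
# occupies the last 4 bytes of the IV, big-endian. With hypothetical
# helpers load32_be()/store32_be(), per block i the routine
# conceptually computes:
#
#	block = ivec;
#	store32_be(block + 12, load32_be(ivec + 12) + i);
#	out[i] = in[i] ^ AES(block);
#
# The pshufb mask byte-swaps the counter words into host order so that
# six counters at a time can be advanced with one paddd by the 6,6,6,0
# addend above, then swaps them back before encryption.
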
&function_begin("aesni_ctr32_encrypt_blocks");
 &mov ($inp,&wparam(0));
 &mov ($out,&wparam(1));
 &mov ($len,&wparam(2));
 &mov ($key,&wparam(3));
 &mov ($rounds_,&wparam(4));
 &mov ($key_,"esp");
 &sub ("esp",88);
 &and ("esp",-16); # align stack
 &mov (&DWP(80,"esp"),$key_);

 &cmp ($len,1);
 &je (&label("ctr32_one_shortcut"));

 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec

 # compose byte-swap control mask for pshufb on stack
 &mov (&DWP(0,"esp"),0x0c0d0e0f);
 &mov (&DWP(4,"esp"),0x08090a0b);
 &mov (&DWP(8,"esp"),0x04050607);
 &mov (&DWP(12,"esp"),0x00010203);

 # compose counter increment vector on stack
 &mov ($rounds,6);
 &xor ($key_,$key_);
 &mov (&DWP(16,"esp"),$rounds);
 &mov (&DWP(20,"esp"),$rounds);
 &mov (&DWP(24,"esp"),$rounds);
 &mov (&DWP(28,"esp"),$key_);

 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter

 &mov ($rounds,&DWP(240,$key)); # key->rounds

 # compose 2 vectors of 3x32-bit counters
 &bswap ($rounds_);
 &pxor ($rndkey1,$rndkey1);
 &pxor ($rndkey0,$rndkey0);
 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
 &pinsrd ($rndkey1,$rounds_,0);
 &lea ($key_,&DWP(3,$rounds_));
 &pinsrd ($rndkey0,$key_,0);
 &inc ($rounds_);
 &pinsrd ($rndkey1,$rounds_,1);
 &inc ($key_);
 &pinsrd ($rndkey0,$key_,1);
 &inc ($rounds_);
 &pinsrd ($rndkey1,$rounds_,2);
 &inc ($key_);
 &pinsrd ($rndkey0,$key_,2);
 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
 &pshufb ($rndkey1,$inout0); # byte swap
 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
 &pshufb ($rndkey0,$inout0); # byte swap

 &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword
 &pshufd ($inout1,$rndkey1,2<<6);
 &cmp ($len,6);
 &jb (&label("ctr32_tail"));
 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
 &shr ($rounds,1);
 &mov ($key_,$key); # backup $key
 &mov ($rounds_,$rounds); # backup $rounds
 &sub ($len,6);
 &jmp (&label("ctr32_loop6"));

&set_label("ctr32_loop6",16);
 &pshufd ($inout2,$rndkey1,1<<6);
 &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
 &pshufd ($inout3,$rndkey0,3<<6);
 &por ($inout0,$rndkey1); # merge counter-less ivec
 &pshufd ($inout4,$rndkey0,2<<6);
 &por ($inout1,$rndkey1);
 &pshufd ($inout5,$rndkey0,1<<6);
 &por ($inout2,$rndkey1);
 &por ($inout3,$rndkey1);
 &por ($inout4,$rndkey1);
 &por ($inout5,$rndkey1);

 # inlining _aesni_encrypt6's prologue gives ~4% improvement...
 &$movekey ($rndkey0,&QWP(0,$key_));
 &$movekey ($rndkey1,&QWP(16,$key_));
 &lea ($key,&DWP(32,$key_));
 &dec ($rounds);
 &pxor ($inout0,$rndkey0);
 &pxor ($inout1,$rndkey0);
 &aesenc ($inout0,$rndkey1);
 &pxor ($inout2,$rndkey0);
 &aesenc ($inout1,$rndkey1);
 &pxor ($inout3,$rndkey0);
 &aesenc ($inout2,$rndkey1);
 &pxor ($inout4,$rndkey0);
 &aesenc ($inout3,$rndkey1);
 &pxor ($inout5,$rndkey0);
 &aesenc ($inout4,$rndkey1);
 &$movekey ($rndkey0,&QWP(0,$key));
 &aesenc ($inout5,$rndkey1);

 &call (&label("_aesni_encrypt6_enter"));

 &movups ($rndkey1,&QWP(0,$inp));
 &movups ($rndkey0,&QWP(0x10,$inp));
 &xorps ($inout0,$rndkey1);
 &movups ($rndkey1,&QWP(0x20,$inp));
 &xorps ($inout1,$rndkey0);
 &movups (&QWP(0,$out),$inout0);
 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
 &xorps ($inout2,$rndkey1);
 &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
 &movups (&QWP(0x10,$out),$inout1);
 &movups (&QWP(0x20,$out),$inout2);

 &paddd ($rndkey1,$rndkey0); # 1st triplet increment
 &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask

 &movups ($inout1,&QWP(0x30,$inp));
 &movups ($inout2,&QWP(0x40,$inp));
 &xorps ($inout3,$inout1);
 &movups ($inout1,&QWP(0x50,$inp));
 &lea ($inp,&DWP(0x60,$inp));
 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
 &pshufb ($rndkey1,$inout0); # byte swap
 &xorps ($inout4,$inout2);
 &movups (&QWP(0x30,$out),$inout3);
 &xorps ($inout5,$inout1);
 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
 &pshufb ($rndkey0,$inout0); # byte swap
 &movups (&QWP(0x40,$out),$inout4);
 &pshufd ($inout0,$rndkey1,3<<6);
 &movups (&QWP(0x50,$out),$inout5);
 &lea ($out,&DWP(0x60,$out));

 &mov ($rounds,$rounds_);
 &pshufd ($inout1,$rndkey1,2<<6);
 &sub ($len,6);
 &jnc (&label("ctr32_loop6"));

 &add ($len,6);
 &jz (&label("ctr32_ret"));
 &mov ($key,$key_);
 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
 &movdqa ($inout5,&QWP(32,"esp")); # pull counter-less ivec

&set_label("ctr32_tail");
 &por ($inout0,$inout5);
 &cmp ($len,2);
 &jb (&label("ctr32_one"));

 &pshufd ($inout2,$rndkey1,1<<6);
 &por ($inout1,$inout5);
 &je (&label("ctr32_two"));

 &pshufd ($inout3,$rndkey0,3<<6);
 &por ($inout2,$inout5);
 &cmp ($len,4);
 &jb (&label("ctr32_three"));

 &pshufd ($inout4,$rndkey0,2<<6);
 &por ($inout3,$inout5);
 &je (&label("ctr32_four"));

 &por ($inout4,$inout5);
 &call ("_aesni_encrypt6");
 &movups ($rndkey1,&QWP(0,$inp));
 &movups ($rndkey0,&QWP(0x10,$inp));
 &xorps ($inout0,$rndkey1);
 &movups ($rndkey1,&QWP(0x20,$inp));
 &xorps ($inout1,$rndkey0);
 &movups ($rndkey0,&QWP(0x30,$inp));
 &xorps ($inout2,$rndkey1);
 &movups ($rndkey1,&QWP(0x40,$inp));
 &xorps ($inout3,$rndkey0);
 &movups (&QWP(0,$out),$inout0);
 &xorps ($inout4,$rndkey1);
 &movups (&QWP(0x10,$out),$inout1);
 &movups (&QWP(0x20,$out),$inout2);
 &movups (&QWP(0x30,$out),$inout3);
 &movups (&QWP(0x40,$out),$inout4);
 &jmp (&label("ctr32_ret"));

&set_label("ctr32_one_shortcut",16);
 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
 &mov ($rounds,&DWP(240,$key));

&set_label("ctr32_one");
 if ($inline)
 { &aesni_inline_generate1("enc"); }
 else
 { &call ("_aesni_encrypt1"); }
 &movups ($in0,&QWP(0,$inp));
 &xorps ($in0,$inout0);
 &movups (&QWP(0,$out),$in0);
 &jmp (&label("ctr32_ret"));

&set_label("ctr32_two",16);
 &call ("_aesni_encrypt3");
 &movups ($inout3,&QWP(0,$inp));
 &movups ($inout4,&QWP(0x10,$inp));
 &xorps ($inout0,$inout3);
 &xorps ($inout1,$inout4);
 &movups (&QWP(0,$out),$inout0);
 &movups (&QWP(0x10,$out),$inout1);
 &jmp (&label("ctr32_ret"));

&set_label("ctr32_three",16);
 &call ("_aesni_encrypt3");
 &movups ($inout3,&QWP(0,$inp));
 &movups ($inout4,&QWP(0x10,$inp));
 &xorps ($inout0,$inout3);
 &movups ($inout5,&QWP(0x20,$inp));
 &xorps ($inout1,$inout4);
 &movups (&QWP(0,$out),$inout0);
 &xorps ($inout2,$inout5);
 &movups (&QWP(0x10,$out),$inout1);
 &movups (&QWP(0x20,$out),$inout2);
 &jmp (&label("ctr32_ret"));

&set_label("ctr32_four",16);
 &call ("_aesni_encrypt4");
 &movups ($inout4,&QWP(0,$inp));
 &movups ($inout5,&QWP(0x10,$inp));
 &movups ($rndkey1,&QWP(0x20,$inp));
 &xorps ($inout0,$inout4);
 &movups ($rndkey0,&QWP(0x30,$inp));
 &xorps ($inout1,$inout5);
 &movups (&QWP(0,$out),$inout0);
 &xorps ($inout2,$rndkey1);
 &movups (&QWP(0x10,$out),$inout1);
 &xorps ($inout3,$rndkey0);
 &movups (&QWP(0x20,$out),$inout2);
 &movups (&QWP(0x30,$out),$inout3);

&set_label("ctr32_ret");
 &mov ("esp",&DWP(80,"esp"));
&function_end("aesni_ctr32_encrypt_blocks");

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#                             const AES_KEY *key1, const AES_KEY *key2,
#                             const unsigned char iv[16]);
#
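# [Illustrative note, not part of the original module] The tweak for
# block i+1 is the tweak for block i multiplied by x in GF(2^128),
# reduction polynomial x^128+x^7+x^2+x+1, i.e. a 128-bit left shift
# with a conditional xor of 0x87 into the lowest byte. In plain C,
# with the tweak as two 64-bit halves:
#
#	carry = hi >> 63;
#	hi    = (hi << 1) | (lo >> 63);
#	lo    = (lo << 1) ^ (carry ? 0x87 : 0);
#
# The SSE code below gets the same effect from paddq [shift each
# 64-bit half left by one], pcmpgtd [broadcast the sign bits] and
# pand with the 0x87/1 magic constant to fold both carries back in.
#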
{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);

&function_begin("aesni_xts_encrypt");
 &mov ($key,&wparam(4)); # key2
 &mov ($inp,&wparam(5)); # clear-text tweak

 &mov ($rounds,&DWP(240,$key)); # key2->rounds
 &movups ($inout0,&QWP(0,$inp));
 if ($inline)
 { &aesni_inline_generate1("enc"); }
 else
 { &call ("_aesni_encrypt1"); }

 &mov ($inp,&wparam(0));
 &mov ($out,&wparam(1));
 &mov ($len,&wparam(2));
 &mov ($key,&wparam(3)); # key1

 &mov ($key_,"esp");
 &sub ("esp",16*7+8);
 &mov ($rounds,&DWP(240,$key)); # key1->rounds
 &and ("esp",-16); # align stack

 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
 &mov (&DWP(16*6+4,"esp"),0);
 &mov (&DWP(16*6+8,"esp"),1);
 &mov (&DWP(16*6+12,"esp"),0);
 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp

 &movdqa ($tweak,$inout0);
 &pxor ($twtmp,$twtmp);
 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
 &pcmpgtd($twtmp,$tweak); # broadcast upper bits

 &and ($len,-16);
 &mov ($key_,$key); # backup $key
 &mov ($rounds_,$rounds); # backup $rounds
 &sub ($len,16*6);
 &jc (&label("xts_enc_short"));

 &shr ($rounds,1);
 &mov ($rounds_,$rounds);
 &jmp (&label("xts_enc_loop6"));

&set_label("xts_enc_loop6",16);
 for ($i=0;$i<4;$i++) {
 &pshufd ($twres,$twtmp,0x13);
 &pxor ($twtmp,$twtmp);
 &movdqa (&QWP(16*$i,"esp"),$tweak);
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($twres,$twmask); # isolate carry and residue
 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
 &pxor ($tweak,$twres);
 }
 &pshufd ($inout5,$twtmp,0x13);
 &movdqa (&QWP(16*$i++,"esp"),$tweak);
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &$movekey ($rndkey0,&QWP(0,$key_));
 &pand ($inout5,$twmask); # isolate carry and residue
 &movups ($inout0,&QWP(0,$inp)); # load input
 &pxor ($inout5,$tweak);

 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
 &movdqu ($inout1,&QWP(16*1,$inp));
 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
 &movdqu ($inout2,&QWP(16*2,$inp));
 &pxor ($inout1,$rndkey0);
 &movdqu ($inout3,&QWP(16*3,$inp));
 &pxor ($inout2,$rndkey0);
 &movdqu ($inout4,&QWP(16*4,$inp));
 &pxor ($inout3,$rndkey0);
 &movdqu ($rndkey1,&QWP(16*5,$inp));
 &pxor ($inout4,$rndkey0);
 &lea ($inp,&DWP(16*6,$inp));
 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
 &pxor ($inout5,$rndkey1);

 &$movekey ($rndkey1,&QWP(16,$key_));
 &lea ($key,&DWP(32,$key_));
 &pxor ($inout1,&QWP(16*1,"esp"));
 &aesenc ($inout0,$rndkey1);
 &pxor ($inout2,&QWP(16*2,"esp"));
 &aesenc ($inout1,$rndkey1);
 &pxor ($inout3,&QWP(16*3,"esp"));
 &dec ($rounds);
 &aesenc ($inout2,$rndkey1);
 &pxor ($inout4,&QWP(16*4,"esp"));
 &aesenc ($inout3,$rndkey1);
 &pxor ($inout5,$rndkey0);
 &aesenc ($inout4,$rndkey1);
 &$movekey ($rndkey0,&QWP(0,$key));
 &aesenc ($inout5,$rndkey1);
 &call (&label("_aesni_encrypt6_enter"));

 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
 &pxor ($twtmp,$twtmp);
 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
 &xorps ($inout1,&QWP(16*1,"esp"));
 &movups (&QWP(16*0,$out),$inout0); # write output
 &xorps ($inout2,&QWP(16*2,"esp"));
 &movups (&QWP(16*1,$out),$inout1);
 &xorps ($inout3,&QWP(16*3,"esp"));
 &movups (&QWP(16*2,$out),$inout2);
 &xorps ($inout4,&QWP(16*4,"esp"));
 &movups (&QWP(16*3,$out),$inout3);
 &xorps ($inout5,$tweak);
 &movups (&QWP(16*4,$out),$inout4);
 &pshufd ($twres,$twtmp,0x13);
 &movups (&QWP(16*5,$out),$inout5);
 &lea ($out,&DWP(16*6,$out));
 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87

 &pxor ($twtmp,$twtmp);
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($twres,$twmask); # isolate carry and residue
 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
 &mov ($rounds,$rounds_); # restore $rounds
 &pxor ($tweak,$twres);

 &sub ($len,16*6);
 &jnc (&label("xts_enc_loop6"));

 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
 &mov ($key,$key_); # restore $key
 &mov ($rounds_,$rounds);

&set_label("xts_enc_short");
 &add ($len,16*6);
 &jz (&label("xts_enc_done6x"));

 &movdqa ($inout3,$tweak); # put aside previous tweak
 &cmp ($len,0x20);
 &jb (&label("xts_enc_one"));

 &pshufd ($twres,$twtmp,0x13);
 &pxor ($twtmp,$twtmp);
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($twres,$twmask); # isolate carry and residue
 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
 &pxor ($tweak,$twres);
 &je (&label("xts_enc_two"));

 &pshufd ($twres,$twtmp,0x13);
 &pxor ($twtmp,$twtmp);
 &movdqa ($inout4,$tweak); # put aside previous tweak
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($twres,$twmask); # isolate carry and residue
 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
 &pxor ($tweak,$twres);
 &cmp ($len,0x40);
 &jb (&label("xts_enc_three"));

 &pshufd ($twres,$twtmp,0x13);
 &pxor ($twtmp,$twtmp);
 &movdqa ($inout5,$tweak); # put aside previous tweak
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($twres,$twmask); # isolate carry and residue
 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
 &pxor ($tweak,$twres);
 &movdqa (&QWP(16*0,"esp"),$inout3);
 &movdqa (&QWP(16*1,"esp"),$inout4);
 &je (&label("xts_enc_four"));

 &movdqa (&QWP(16*2,"esp"),$inout5);
 &pshufd ($inout5,$twtmp,0x13);
 &movdqa (&QWP(16*3,"esp"),$tweak);
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($inout5,$twmask); # isolate carry and residue
 &pxor ($inout5,$tweak);

 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
 &movdqu ($inout1,&QWP(16*1,$inp));
 &movdqu ($inout2,&QWP(16*2,$inp));
 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
 &movdqu ($inout3,&QWP(16*3,$inp));
 &pxor ($inout1,&QWP(16*1,"esp"));
 &movdqu ($inout4,&QWP(16*4,$inp));
 &pxor ($inout2,&QWP(16*2,"esp"));
 &lea ($inp,&DWP(16*5,$inp));
 &pxor ($inout3,&QWP(16*3,"esp"));
 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
 &pxor ($inout4,$inout5);

 &call ("_aesni_encrypt6");

 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
 &xorps ($inout1,&QWP(16*1,"esp"));
 &xorps ($inout2,&QWP(16*2,"esp"));
 &movups (&QWP(16*0,$out),$inout0); # write output
 &xorps ($inout3,&QWP(16*3,"esp"));
 &movups (&QWP(16*1,$out),$inout1);
 &xorps ($inout4,$tweak);
 &movups (&QWP(16*2,$out),$inout2);
 &movups (&QWP(16*3,$out),$inout3);
 &movups (&QWP(16*4,$out),$inout4);
 &lea ($out,&DWP(16*5,$out));
 &jmp (&label("xts_enc_done"));

&set_label("xts_enc_one",16);
 &movups ($inout0,&QWP(16*0,$inp)); # load input
 &lea ($inp,&DWP(16*1,$inp));
 &xorps ($inout0,$inout3); # input^=tweak
 if ($inline)
 { &aesni_inline_generate1("enc"); }
 else
 { &call ("_aesni_encrypt1"); }
 &xorps ($inout0,$inout3); # output^=tweak
 &movups (&QWP(16*0,$out),$inout0); # write output
 &lea ($out,&DWP(16*1,$out));

 &movdqa ($tweak,$inout3); # last tweak
 &jmp (&label("xts_enc_done"));

&set_label("xts_enc_two",16);
 &movaps ($inout4,$tweak); # put aside last tweak

 &movups ($inout0,&QWP(16*0,$inp)); # load input
 &movups ($inout1,&QWP(16*1,$inp));
 &lea ($inp,&DWP(16*2,$inp));
 &xorps ($inout0,$inout3); # input^=tweak
 &xorps ($inout1,$inout4);
 &xorps ($inout2,$inout2);

 &call ("_aesni_encrypt3");

 &xorps ($inout0,$inout3); # output^=tweak
 &xorps ($inout1,$inout4);
 &movups (&QWP(16*0,$out),$inout0); # write output
 &movups (&QWP(16*1,$out),$inout1);
 &lea ($out,&DWP(16*2,$out));

 &movdqa ($tweak,$inout4); # last tweak
 &jmp (&label("xts_enc_done"));

&set_label("xts_enc_three",16);
 &movaps ($inout5,$tweak); # put aside last tweak
 &movups ($inout0,&QWP(16*0,$inp)); # load input
 &movups ($inout1,&QWP(16*1,$inp));
 &movups ($inout2,&QWP(16*2,$inp));
 &lea ($inp,&DWP(16*3,$inp));
 &xorps ($inout0,$inout3); # input^=tweak
 &xorps ($inout1,$inout4);
 &xorps ($inout2,$inout5);

 &call ("_aesni_encrypt3");

 &xorps ($inout0,$inout3); # output^=tweak
 &xorps ($inout1,$inout4);
 &xorps ($inout2,$inout5);
 &movups (&QWP(16*0,$out),$inout0); # write output
 &movups (&QWP(16*1,$out),$inout1);
 &movups (&QWP(16*2,$out),$inout2);
 &lea ($out,&DWP(16*3,$out));

 &movdqa ($tweak,$inout5); # last tweak
 &jmp (&label("xts_enc_done"));

&set_label("xts_enc_four",16);
 &movaps ($inout4,$tweak); # put aside last tweak

 &movups ($inout0,&QWP(16*0,$inp)); # load input
 &movups ($inout1,&QWP(16*1,$inp));
 &movups ($inout2,&QWP(16*2,$inp));
 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
 &movups ($inout3,&QWP(16*3,$inp));
 &lea ($inp,&DWP(16*4,$inp));
 &xorps ($inout1,&QWP(16*1,"esp"));
 &xorps ($inout2,$inout5);
 &xorps ($inout3,$inout4);

 &call ("_aesni_encrypt4");

 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
 &xorps ($inout1,&QWP(16*1,"esp"));
 &xorps ($inout2,$inout5);
 &movups (&QWP(16*0,$out),$inout0); # write output
 &xorps ($inout3,$inout4);
 &movups (&QWP(16*1,$out),$inout1);
 &movups (&QWP(16*2,$out),$inout2);
 &movups (&QWP(16*3,$out),$inout3);
 &lea ($out,&DWP(16*4,$out));

 &movdqa ($tweak,$inout4); # last tweak
 &jmp (&label("xts_enc_done"));

&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
 &and ($len,15);
 &jz (&label("xts_enc_ret"));
 &movdqa ($inout3,$tweak);
 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
 &jmp (&label("xts_enc_steal"));

&set_label("xts_enc_done",16);
 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
 &pxor ($twtmp,$twtmp);
 &and ($len,15);
 &jz (&label("xts_enc_ret"));

 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
 &pshufd ($inout3,$twtmp,0x13);
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
 &pxor ($inout3,$tweak);

&set_label("xts_enc_steal");
 &movz ($rounds,&BP(0,$inp));
 &movz ($key,&BP(-16,$out));
 &lea ($inp,&DWP(1,$inp));
 &mov (&BP(-16,$out),&LB($rounds));
 &mov (&BP(0,$out),&LB($key));
 &lea ($out,&DWP(1,$out));
 &sub ($len,1);
 &jnz (&label("xts_enc_steal"));

 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
 &mov ($key,$key_); # restore $key
 &mov ($rounds,$rounds_); # restore $rounds

 &movups ($inout0,&QWP(-16,$out)); # load input
 &xorps ($inout0,$inout3); # input^=tweak
 if ($inline)
 { &aesni_inline_generate1("enc"); }
 else
 { &call ("_aesni_encrypt1"); }
 &xorps ($inout0,$inout3); # output^=tweak
 &movups (&QWP(-16,$out),$inout0); # write output

&set_label("xts_enc_ret");
 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_encrypt");

&function_begin("aesni_xts_decrypt");
 &mov ($key,&wparam(4)); # key2
 &mov ($inp,&wparam(5)); # clear-text tweak

 &mov ($rounds,&DWP(240,$key)); # key2->rounds
 &movups ($inout0,&QWP(0,$inp));
 if ($inline)
 { &aesni_inline_generate1("enc"); }
 else
 { &call ("_aesni_encrypt1"); }

 &mov ($inp,&wparam(0));
 &mov ($out,&wparam(1));
 &mov ($len,&wparam(2));
 &mov ($key,&wparam(3)); # key1

 &mov ($key_,"esp");
 &sub ("esp",16*7+8);
 &and ("esp",-16); # align stack

 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
 &test ($len,15);
 &setnz (&LB($rounds_));
 &shl ($rounds_,4);
 &sub ($len,$rounds_);

 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
 &mov (&DWP(16*6+4,"esp"),0);
 &mov (&DWP(16*6+8,"esp"),1);
 &mov (&DWP(16*6+12,"esp"),0);
 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp

 &mov ($rounds,&DWP(240,$key)); # key1->rounds
 &mov ($key_,$key); # backup $key
 &mov ($rounds_,$rounds); # backup $rounds

 &movdqa ($tweak,$inout0);
 &pxor ($twtmp,$twtmp);
 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
 &pcmpgtd($twtmp,$tweak); # broadcast upper bits

 &and ($len,-16);
 &sub ($len,16*6);
 &jc (&label("xts_dec_short"));

 &shr ($rounds,1);
 &mov ($rounds_,$rounds);
 &jmp (&label("xts_dec_loop6"));

&set_label("xts_dec_loop6",16);
 for ($i=0;$i<4;$i++) {
 &pshufd ($twres,$twtmp,0x13);
 &pxor ($twtmp,$twtmp);
 &movdqa (&QWP(16*$i,"esp"),$tweak);
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($twres,$twmask); # isolate carry and residue
 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
 &pxor ($tweak,$twres);
 }
 &pshufd ($inout5,$twtmp,0x13);
 &movdqa (&QWP(16*$i++,"esp"),$tweak);
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &$movekey ($rndkey0,&QWP(0,$key_));
 &pand ($inout5,$twmask); # isolate carry and residue
 &movups ($inout0,&QWP(0,$inp)); # load input
 &pxor ($inout5,$tweak);

 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
 &movdqu ($inout1,&QWP(16*1,$inp));
 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
 &movdqu ($inout2,&QWP(16*2,$inp));
 &pxor ($inout1,$rndkey0);
 &movdqu ($inout3,&QWP(16*3,$inp));
 &pxor ($inout2,$rndkey0);
 &movdqu ($inout4,&QWP(16*4,$inp));
 &pxor ($inout3,$rndkey0);
 &movdqu ($rndkey1,&QWP(16*5,$inp));
 &pxor ($inout4,$rndkey0);
 &lea ($inp,&DWP(16*6,$inp));
 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
 &pxor ($inout5,$rndkey1);

 &$movekey ($rndkey1,&QWP(16,$key_));
 &lea ($key,&DWP(32,$key_));
 &pxor ($inout1,&QWP(16*1,"esp"));
 &aesdec ($inout0,$rndkey1);
 &pxor ($inout2,&QWP(16*2,"esp"));
 &aesdec ($inout1,$rndkey1);
 &pxor ($inout3,&QWP(16*3,"esp"));
 &dec ($rounds);
 &aesdec ($inout2,$rndkey1);
 &pxor ($inout4,&QWP(16*4,"esp"));
 &aesdec ($inout3,$rndkey1);
 &pxor ($inout5,$rndkey0);
 &aesdec ($inout4,$rndkey1);
 &$movekey ($rndkey0,&QWP(0,$key));
 &aesdec ($inout5,$rndkey1);
 &call (&label("_aesni_decrypt6_enter"));

 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
 &pxor ($twtmp,$twtmp);
 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
 &xorps ($inout1,&QWP(16*1,"esp"));
 &movups (&QWP(16*0,$out),$inout0); # write output
 &xorps ($inout2,&QWP(16*2,"esp"));
 &movups (&QWP(16*1,$out),$inout1);
 &xorps ($inout3,&QWP(16*3,"esp"));
 &movups (&QWP(16*2,$out),$inout2);
 &xorps ($inout4,&QWP(16*4,"esp"));
 &movups (&QWP(16*3,$out),$inout3);
 &xorps ($inout5,$tweak);
 &movups (&QWP(16*4,$out),$inout4);
 &pshufd ($twres,$twtmp,0x13);
 &movups (&QWP(16*5,$out),$inout5);
 &lea ($out,&DWP(16*6,$out));
 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87

 &pxor ($twtmp,$twtmp);
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($twres,$twmask); # isolate carry and residue
 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
 &mov ($rounds,$rounds_); # restore $rounds
 &pxor ($tweak,$twres);

 &sub ($len,16*6);
 &jnc (&label("xts_dec_loop6"));

 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
 &mov ($key,$key_); # restore $key
 &mov ($rounds_,$rounds);

&set_label("xts_dec_short");
 &add ($len,16*6);
 &jz (&label("xts_dec_done6x"));

 &movdqa ($inout3,$tweak); # put aside previous tweak
 &cmp ($len,0x20);
 &jb (&label("xts_dec_one"));

 &pshufd ($twres,$twtmp,0x13);
 &pxor ($twtmp,$twtmp);
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($twres,$twmask); # isolate carry and residue
 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
 &pxor ($tweak,$twres);
 &je (&label("xts_dec_two"));

 &pshufd ($twres,$twtmp,0x13);
 &pxor ($twtmp,$twtmp);
 &movdqa ($inout4,$tweak); # put aside previous tweak
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($twres,$twmask); # isolate carry and residue
 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
 &pxor ($tweak,$twres);
 &cmp ($len,0x40);
 &jb (&label("xts_dec_three"));

 &pshufd ($twres,$twtmp,0x13);
 &pxor ($twtmp,$twtmp);
 &movdqa ($inout5,$tweak); # put aside previous tweak
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($twres,$twmask); # isolate carry and residue
 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
 &pxor ($tweak,$twres);
 &movdqa (&QWP(16*0,"esp"),$inout3);
 &movdqa (&QWP(16*1,"esp"),$inout4);
 &je (&label("xts_dec_four"));

 &movdqa (&QWP(16*2,"esp"),$inout5);
 &pshufd ($inout5,$twtmp,0x13);
 &movdqa (&QWP(16*3,"esp"),$tweak);
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($inout5,$twmask); # isolate carry and residue
 &pxor ($inout5,$tweak);

 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
 &movdqu ($inout1,&QWP(16*1,$inp));
 &movdqu ($inout2,&QWP(16*2,$inp));
 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
 &movdqu ($inout3,&QWP(16*3,$inp));
 &pxor ($inout1,&QWP(16*1,"esp"));
 &movdqu ($inout4,&QWP(16*4,$inp));
 &pxor ($inout2,&QWP(16*2,"esp"));
 &lea ($inp,&DWP(16*5,$inp));
 &pxor ($inout3,&QWP(16*3,"esp"));
 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
 &pxor ($inout4,$inout5);

 &call ("_aesni_decrypt6");

 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
 &xorps ($inout1,&QWP(16*1,"esp"));
 &xorps ($inout2,&QWP(16*2,"esp"));
 &movups (&QWP(16*0,$out),$inout0); # write output
 &xorps ($inout3,&QWP(16*3,"esp"));
 &movups (&QWP(16*1,$out),$inout1);
 &xorps ($inout4,$tweak);
 &movups (&QWP(16*2,$out),$inout2);
 &movups (&QWP(16*3,$out),$inout3);
 &movups (&QWP(16*4,$out),$inout4);
 &lea ($out,&DWP(16*5,$out));
 &jmp (&label("xts_dec_done"));

&set_label("xts_dec_one",16);
 &movups ($inout0,&QWP(16*0,$inp)); # load input
 &lea ($inp,&DWP(16*1,$inp));
 &xorps ($inout0,$inout3); # input^=tweak
 if ($inline)
 { &aesni_inline_generate1("dec"); }
 else
 { &call ("_aesni_decrypt1"); }
 &xorps ($inout0,$inout3); # output^=tweak
 &movups (&QWP(16*0,$out),$inout0); # write output
 &lea ($out,&DWP(16*1,$out));

 &movdqa ($tweak,$inout3); # last tweak
 &jmp (&label("xts_dec_done"));

&set_label("xts_dec_two",16);
 &movaps ($inout4,$tweak); # put aside last tweak

 &movups ($inout0,&QWP(16*0,$inp)); # load input
 &movups ($inout1,&QWP(16*1,$inp));
 &lea ($inp,&DWP(16*2,$inp));
 &xorps ($inout0,$inout3); # input^=tweak
 &xorps ($inout1,$inout4);

 &call ("_aesni_decrypt3");

 &xorps ($inout0,$inout3); # output^=tweak
 &xorps ($inout1,$inout4);
 &movups (&QWP(16*0,$out),$inout0); # write output
 &movups (&QWP(16*1,$out),$inout1);
 &lea ($out,&DWP(16*2,$out));

 &movdqa ($tweak,$inout4); # last tweak
 &jmp (&label("xts_dec_done"));

&set_label("xts_dec_three",16);
 &movaps ($inout5,$tweak); # put aside last tweak
 &movups ($inout0,&QWP(16*0,$inp)); # load input
 &movups ($inout1,&QWP(16*1,$inp));
 &movups ($inout2,&QWP(16*2,$inp));
 &lea ($inp,&DWP(16*3,$inp));
 &xorps ($inout0,$inout3); # input^=tweak
 &xorps ($inout1,$inout4);
 &xorps ($inout2,$inout5);

 &call ("_aesni_decrypt3");

 &xorps ($inout0,$inout3); # output^=tweak
 &xorps ($inout1,$inout4);
 &xorps ($inout2,$inout5);
 &movups (&QWP(16*0,$out),$inout0); # write output
 &movups (&QWP(16*1,$out),$inout1);
 &movups (&QWP(16*2,$out),$inout2);
 &lea ($out,&DWP(16*3,$out));

 &movdqa ($tweak,$inout5); # last tweak
 &jmp (&label("xts_dec_done"));

&set_label("xts_dec_four",16);
 &movaps ($inout4,$tweak); # put aside last tweak

 &movups ($inout0,&QWP(16*0,$inp)); # load input
 &movups ($inout1,&QWP(16*1,$inp));
 &movups ($inout2,&QWP(16*2,$inp));
 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
 &movups ($inout3,&QWP(16*3,$inp));
 &lea ($inp,&DWP(16*4,$inp));
 &xorps ($inout1,&QWP(16*1,"esp"));
 &xorps ($inout2,$inout5);
 &xorps ($inout3,$inout4);

 &call ("_aesni_decrypt4");

 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
 &xorps ($inout1,&QWP(16*1,"esp"));
 &xorps ($inout2,$inout5);
 &movups (&QWP(16*0,$out),$inout0); # write output
 &xorps ($inout3,$inout4);
 &movups (&QWP(16*1,$out),$inout1);
 &movups (&QWP(16*2,$out),$inout2);
 &movups (&QWP(16*3,$out),$inout3);
 &lea ($out,&DWP(16*4,$out));

 &movdqa ($tweak,$inout4); # last tweak
 &jmp (&label("xts_dec_done"));

&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
 &and ($len,15);
 &jz (&label("xts_dec_ret"));
 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
 &jmp (&label("xts_dec_only_one_more"));

&set_label("xts_dec_done",16);
 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
 &pxor ($twtmp,$twtmp);
 &and ($len,15);
 &jz (&label("xts_dec_ret"));

 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
 &pshufd ($twres,$twtmp,0x13);
 &pxor ($twtmp,$twtmp);
 &movdqa ($twmask,&QWP(16*6,"esp"));
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($twres,$twmask); # isolate carry and residue
 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
 &pxor ($tweak,$twres);

&set_label("xts_dec_only_one_more");
 &pshufd ($inout3,$twtmp,0x13);
 &movdqa ($inout4,$tweak); # put aside previous tweak
 &paddq ($tweak,$tweak); # &psllq($tweak,1);
 &pand ($inout3,$twmask); # isolate carry and residue
 &pxor ($inout3,$tweak);

 &mov ($key,$key_); # restore $key
 &mov ($rounds,$rounds_); # restore $rounds

 &movups ($inout0,&QWP(0,$inp)); # load input
 &xorps ($inout0,$inout3); # input^=tweak
 if ($inline)
 { &aesni_inline_generate1("dec"); }
 else
 { &call ("_aesni_decrypt1"); }
 &xorps ($inout0,$inout3); # output^=tweak
 &movups (&QWP(0,$out),$inout0); # write output

&set_label("xts_dec_steal");
 &movz ($rounds,&BP(16,$inp));
 &movz ($key,&BP(0,$out));
 &lea ($inp,&DWP(1,$inp));
 &mov (&BP(0,$out),&LB($rounds));
 &mov (&BP(16,$out),&LB($key));
 &lea ($out,&DWP(1,$out));
 &sub ($len,1);
 &jnz (&label("xts_dec_steal"));

 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
 &mov ($key,$key_); # restore $key
 &mov ($rounds,$rounds_); # restore $rounds

 &movups ($inout0,&QWP(0,$out)); # load input
 &xorps ($inout0,$inout4); # input^=tweak
 if ($inline)
 { &aesni_inline_generate1("dec"); }
 else
 { &call ("_aesni_decrypt1"); }
 &xorps ($inout0,$inout4); # output^=tweak
 &movups (&QWP(0,$out),$inout0); # write output

&set_label("xts_dec_ret");
 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_decrypt");
}
}

######################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#                           size_t length, const AES_KEY *key,
#                           unsigned char *ivp,const int enc);
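#
# [Illustrative note, not part of the original module] CBC chains each
# block through the previous ciphertext:
#
#	c[i] = AES_enc(p[i] ^ c[i-1]);	/* serial: needs c[i-1] first */
#	p[i] = AES_dec(c[i]) ^ c[i-1];	/* all AES_dec() independent  */
#
# so encryption below runs one block at a time, while decryption can
# pipeline six blocks through _aesni_decrypt6.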
1724&function_begin("${PREFIX}_cbc_encrypt");
1725 &mov ($inp,&wparam(0));
1726 &mov ($rounds_,"esp");
1727 &mov ($out,&wparam(1));
1728 &sub ($rounds_,24);
1729 &mov ($len,&wparam(2));
1730 &and ($rounds_,-16);
1731 &mov ($key,&wparam(3));
1732 &mov ($key_,&wparam(4));
1733 &test ($len,$len);
1734 &jz (&label("cbc_abort"));
1735
1736 &cmp (&wparam(5),0);
1737 &xchg ($rounds_,"esp"); # alloca
1738 &movups ($ivec,&QWP(0,$key_)); # load IV
1739 &mov ($rounds,&DWP(240,$key));
1740 &mov ($key_,$key); # backup $key
1741 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
1742 &mov ($rounds_,$rounds); # backup $rounds
1743 &je (&label("cbc_decrypt"));
1744
1745 &movaps ($inout0,$ivec);
1746 &cmp ($len,16);
1747 &jb (&label("cbc_enc_tail"));
1748 &sub ($len,16);
1749 &jmp (&label("cbc_enc_loop"));
1750
1751&set_label("cbc_enc_loop",16);
1752 &movups ($ivec,&QWP(0,$inp)); # input actually
1753 &lea ($inp,&DWP(16,$inp));
1754 if ($inline)
1755 { &aesni_inline_generate1("enc",$inout0,$ivec); }
1756 else
1757 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
1758 &mov ($rounds,$rounds_); # restore $rounds
1759 &mov ($key,$key_); # restore $key
1760 &movups (&QWP(0,$out),$inout0); # store output
1761 &lea ($out,&DWP(16,$out));
1762 &sub ($len,16);
1763 &jnc (&label("cbc_enc_loop"));
1764 &add ($len,16);
1765 &jnz (&label("cbc_enc_tail"));
1766 &movaps ($ivec,$inout0);
1767 &jmp (&label("cbc_ret"));
1768
1769&set_label("cbc_enc_tail");
1770 &mov ("ecx",$len); # zaps $rounds
1771 &data_word(0xA4F3F689); # rep movsb
1772 &mov ("ecx",16); # zero tail
1773 &sub ("ecx",$len);
1774 &xor ("eax","eax"); # zaps $len
1775 &data_word(0xAAF3F689); # rep stosb
1776 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
1777 &mov ($rounds,$rounds_); # restore $rounds
1778 &mov ($inp,$out); # $inp and $out are the same
1779 &mov ($key,$key_); # restore $key
1780 &jmp (&label("cbc_enc_loop"));
1781######################################################################
1782&set_label("cbc_decrypt",16);
1783 &cmp ($len,0x50);
1784 &jbe (&label("cbc_dec_tail"));
1785 &movaps (&QWP(0,"esp"),$ivec); # save IV
1786 &sub ($len,0x50);
1787 &jmp (&label("cbc_dec_loop6_enter"));
1788
1789&set_label("cbc_dec_loop6",16);
1790 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
1791 &movups (&QWP(0,$out),$inout5);
1792 &lea ($out,&DWP(0x10,$out));
1793&set_label("cbc_dec_loop6_enter");
1794 &movdqu ($inout0,&QWP(0,$inp));
1795 &movdqu ($inout1,&QWP(0x10,$inp));
1796 &movdqu ($inout2,&QWP(0x20,$inp));
1797 &movdqu ($inout3,&QWP(0x30,$inp));
1798 &movdqu ($inout4,&QWP(0x40,$inp));
1799 &movdqu ($inout5,&QWP(0x50,$inp));
1800
1801 &call ("_aesni_decrypt6");
1802
1803 &movups ($rndkey1,&QWP(0,$inp));
1804 &movups ($rndkey0,&QWP(0x10,$inp));
1805 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
1806 &xorps ($inout1,$rndkey1);
1807 &movups ($rndkey1,&QWP(0x20,$inp));
1808 &xorps ($inout2,$rndkey0);
1809 &movups ($rndkey0,&QWP(0x30,$inp));
1810 &xorps ($inout3,$rndkey1);
1811 &movups ($rndkey1,&QWP(0x40,$inp));
1812 &xorps ($inout4,$rndkey0);
1813 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
1814 &xorps ($inout5,$rndkey1);
1815 &movups (&QWP(0,$out),$inout0);
1816 &movups (&QWP(0x10,$out),$inout1);
1817 &lea ($inp,&DWP(0x60,$inp));
1818 &movups (&QWP(0x20,$out),$inout2);
1819 &mov ($rounds,$rounds_) # restore $rounds
1820 &movups (&QWP(0x30,$out),$inout3);
1821 &mov ($key,$key_); # restore $key
1822 &movups (&QWP(0x40,$out),$inout4);
1823 &lea ($out,&DWP(0x50,$out));
1824 &sub ($len,0x60);
1825 &ja (&label("cbc_dec_loop6"));
1826
1827 &movaps ($inout0,$inout5);
1828 &movaps ($ivec,$rndkey0);
1829 &add ($len,0x50);
1830 &jle (&label("cbc_dec_tail_collected"));
1831 &movups (&QWP(0,$out),$inout0);
1832 &lea ($out,&DWP(0x10,$out));
1833&set_label("cbc_dec_tail");
1834 &movups ($inout0,&QWP(0,$inp));
1835 &movaps ($in0,$inout0);
1836 &cmp ($len,0x10);
1837 &jbe (&label("cbc_dec_one"));
1838
1839 &movups ($inout1,&QWP(0x10,$inp));
1840 &movaps ($in1,$inout1);
1841 &cmp ($len,0x20);
1842 &jbe (&label("cbc_dec_two"));
1843
1844 &movups ($inout2,&QWP(0x20,$inp));
1845 &cmp ($len,0x30);
1846 &jbe (&label("cbc_dec_three"));
1847
1848 &movups ($inout3,&QWP(0x30,$inp));
1849 &cmp ($len,0x40);
1850 &jbe (&label("cbc_dec_four"));
1851
1852 &movups ($inout4,&QWP(0x40,$inp));
1853 &movaps (&QWP(0,"esp"),$ivec); # save IV
1854 &movups ($inout0,&QWP(0,$inp));
1855 &xorps ($inout5,$inout5);
1856 &call ("_aesni_decrypt6");
1857 &movups ($rndkey1,&QWP(0,$inp));
1858 &movups ($rndkey0,&QWP(0x10,$inp));
1859 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
1860 &xorps ($inout1,$rndkey1);
1861 &movups ($rndkey1,&QWP(0x20,$inp));
1862 &xorps ($inout2,$rndkey0);
1863 &movups ($rndkey0,&QWP(0x30,$inp));
1864 &xorps ($inout3,$rndkey1);
1865 &movups ($ivec,&QWP(0x40,$inp)); # IV
1866 &xorps ($inout4,$rndkey0);
1867 &movups (&QWP(0,$out),$inout0);
1868 &movups (&QWP(0x10,$out),$inout1);
1869 &movups (&QWP(0x20,$out),$inout2);
1870 &movups (&QWP(0x30,$out),$inout3);
1871 &lea ($out,&DWP(0x40,$out));
1872 &movaps ($inout0,$inout4);
1873 &sub ($len,0x50);
1874 &jmp (&label("cbc_dec_tail_collected"));
1875
1876&set_label("cbc_dec_one",16);
1877 if ($inline)
1878 { &aesni_inline_generate1("dec"); }
1879 else
1880 { &call ("_aesni_decrypt1"); }
1881 &xorps ($inout0,$ivec);
1882 &movaps ($ivec,$in0);
1883 &sub ($len,0x10);
1884 &jmp (&label("cbc_dec_tail_collected"));
1885
1886&set_label("cbc_dec_two",16);
1887 &xorps ($inout2,$inout2);
1888 &call ("_aesni_decrypt3");
1889 &xorps ($inout0,$ivec);
1890 &xorps ($inout1,$in0);
1891 &movups (&QWP(0,$out),$inout0);
1892 &movaps ($inout0,$inout1);
1893 &lea ($out,&DWP(0x10,$out));
1894 &movaps ($ivec,$in1);
1895 &sub ($len,0x20);
1896 &jmp (&label("cbc_dec_tail_collected"));
1897
1898&set_label("cbc_dec_three",16);
1899 &call ("_aesni_decrypt3");
1900 &xorps ($inout0,$ivec);
1901 &xorps ($inout1,$in0);
1902 &xorps ($inout2,$in1);
1903 &movups (&QWP(0,$out),$inout0);
1904 &movaps ($inout0,$inout2);
1905 &movups (&QWP(0x10,$out),$inout1);
1906 &lea ($out,&DWP(0x20,$out));
1907 &movups ($ivec,&QWP(0x20,$inp));
1908 &sub ($len,0x30);
1909 &jmp (&label("cbc_dec_tail_collected"));
1910
1911&set_label("cbc_dec_four",16);
1912 &call ("_aesni_decrypt4");
1913 &movups ($rndkey1,&QWP(0x10,$inp));
1914 &movups ($rndkey0,&QWP(0x20,$inp));
1915 &xorps ($inout0,$ivec);
1916 &movups ($ivec,&QWP(0x30,$inp));
1917 &xorps ($inout1,$in0);
1918 &movups (&QWP(0,$out),$inout0);
1919 &xorps ($inout2,$rndkey1);
1920 &movups (&QWP(0x10,$out),$inout1);
1921 &xorps ($inout3,$rndkey0);
1922 &movups (&QWP(0x20,$out),$inout2);
1923 &lea ($out,&DWP(0x30,$out));
1924 &movaps ($inout0,$inout3);
1925 &sub ($len,0x40);
1926
1927&set_label("cbc_dec_tail_collected");
1928 &and ($len,15);
1929 &jnz (&label("cbc_dec_tail_partial"));
1930 &movups (&QWP(0,$out),$inout0);
1931 &jmp (&label("cbc_ret"));
1932
1933&set_label("cbc_dec_tail_partial",16);
1934 &movaps (&QWP(0,"esp"),$inout0);
1935 &mov ("ecx",16);
1936 &mov ($inp,"esp");
1937 &sub ("ecx",$len);
1938 &data_word(0xA4F3F689); # rep movsb
1939
1940&set_label("cbc_ret");
1941 &mov ("esp",&DWP(16,"esp")); # pull original %esp
1942 &mov ($key_,&wparam(4));
1943 &movups (&QWP(0,$key_),$ivec); # output IV
1944&set_label("cbc_abort");
1945&function_end("${PREFIX}_cbc_encrypt");
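To keep the control flow above readable, here is a minimal C sketch of the same CBC encrypt chaining with AES-NI intrinsics; rk[] (a pre-expanded key schedule) and nr (the true round count, 10/12/14) are assumed inputs and the helper names are hypothetical, not this module's interface. It makes plain why encryption is serial: each output block becomes the next block's IV. Decryption has no such dependency, which is why the cbc_decrypt path above can work on six blocks at a time.

/* Minimal CBC-encrypt sketch; requires a compiler with -maes. */
#include <wmmintrin.h>
#include <stdint.h>
#include <stddef.h>

static __m128i aes_enc_block(__m128i b, const __m128i rk[], int nr)
{
    b = _mm_xor_si128(b, rk[0]);              /* round 0 whitening */
    for (int i = 1; i < nr; i++)
        b = _mm_aesenc_si128(b, rk[i]);       /* rounds 1..nr-1 */
    return _mm_aesenclast_si128(b, rk[nr]);   /* final round */
}

static __m128i cbc_encrypt_sketch(const uint8_t *in, uint8_t *out,
                                  size_t blocks, const __m128i rk[],
                                  int nr, __m128i iv)
{
    for (size_t i = 0; i < blocks; i++) {
        __m128i p = _mm_loadu_si128((const __m128i *)(in + 16 * i));
        iv = aes_enc_block(_mm_xor_si128(p, iv), rk, nr);  /* chain */
        _mm_storeu_si128((__m128i *)(out + 16 * i), iv);
    }
    return iv;  /* written back through wparam(4), cf. cbc_ret above */
}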
1946
1947######################################################################
1948# Mechanical port from aesni-x86_64.pl.
1949#
1950# _aesni_set_encrypt_key is private interface,
1951# input:
1952# "eax" const unsigned char *userKey
1953# $rounds int bits
1954# $key AES_KEY *key
1955# output:
1956# "eax" return code
1957# $rounds rounds
1958
1959&function_begin_B("_aesni_set_encrypt_key");
1960 &test ("eax","eax");
1961 &jz (&label("bad_pointer"));
1962 &test ($key,$key);
1963 &jz (&label("bad_pointer"));
1964
1965 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
1966 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
1967 &lea ($key,&DWP(16,$key));
1968 &cmp ($rounds,256);
1969 &je (&label("14rounds"));
1970 &cmp ($rounds,192);
1971 &je (&label("12rounds"));
1972 &cmp ($rounds,128);
1973 &jne (&label("bad_keybits"));
1974
1975&set_label("10rounds",16);
1976 &mov ($rounds,9);
1977 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
1978 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
1979 &call (&label("key_128_cold"));
1980 &aeskeygenassist("xmm1","xmm0",0x02); # round 2
1981 &call (&label("key_128"));
1982 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
1983 &call (&label("key_128"));
1984 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
1985 &call (&label("key_128"));
1986 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
1987 &call (&label("key_128"));
1988 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
1989 &call (&label("key_128"));
1990 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
1991 &call (&label("key_128"));
1992 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
1993 &call (&label("key_128"));
1994 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
1995 &call (&label("key_128"));
1996 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
1997 &call (&label("key_128"));
1998 &$movekey (&QWP(0,$key),"xmm0");
1999 &mov (&DWP(80,$key),$rounds);
2000 &xor ("eax","eax");
2001 &ret();
2002
2003&set_label("key_128",16);
2004 &$movekey (&QWP(0,$key),"xmm0");
2005 &lea ($key,&DWP(16,$key));
2006&set_label("key_128_cold");
2007 &shufps ("xmm4","xmm0",0b00010000);
2008 &xorps ("xmm0","xmm4");
2009 &shufps ("xmm4","xmm0",0b10001100);
2010 &xorps ("xmm0","xmm4");
2011 &shufps ("xmm1","xmm1",0b11111111); # critical path
2012 &xorps ("xmm0","xmm1");
2013 &ret();
2014
2015&set_label("12rounds",16);
2016 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
2017 &mov ($rounds,11);
2018 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
2019 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
2020 &call (&label("key_192a_cold"));
2021 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
2022 &call (&label("key_192b"));
2023 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
2024 &call (&label("key_192a"));
2025 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
2026 &call (&label("key_192b"));
2027 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
2028 &call (&label("key_192a"));
2029 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
2030 &call (&label("key_192b"));
2031 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
2032 &call (&label("key_192a"));
2033 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
2034 &call (&label("key_192b"));
2035 &$movekey (&QWP(0,$key),"xmm0");
2036 &mov (&DWP(48,$key),$rounds);
2037 &xor ("eax","eax");
2038 &ret();
2039
2040&set_label("key_192a",16);
2041 &$movekey (&QWP(0,$key),"xmm0");
2042 &lea ($key,&DWP(16,$key));
2043&set_label("key_192a_cold",16);
2044 &movaps ("xmm5","xmm2");
2045&set_label("key_192b_warm");
2046 &shufps ("xmm4","xmm0",0b00010000);
2047 &movdqa ("xmm3","xmm2");
2048 &xorps ("xmm0","xmm4");
2049 &shufps ("xmm4","xmm0",0b10001100);
2050 &pslldq ("xmm3",4);
2051 &xorps ("xmm0","xmm4");
2052 &pshufd ("xmm1","xmm1",0b01010101); # critical path
2053 &pxor ("xmm2","xmm3");
2054 &pxor ("xmm0","xmm1");
2055 &pshufd ("xmm3","xmm0",0b11111111);
2056 &pxor ("xmm2","xmm3");
2057 &ret();
2058
2059&set_label("key_192b",16);
2060 &movaps ("xmm3","xmm0");
2061 &shufps ("xmm5","xmm0",0b01000100);
2062 &$movekey (&QWP(0,$key),"xmm5");
2063 &shufps ("xmm3","xmm2",0b01001110);
2064 &$movekey (&QWP(16,$key),"xmm3");
2065 &lea ($key,&DWP(32,$key));
2066 &jmp (&label("key_192b_warm"));
2067
2068&set_label("14rounds",16);
2069 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
2070 &mov ($rounds,13);
2071 &lea ($key,&DWP(16,$key));
2072 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
2073 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
2074 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
2075 &call (&label("key_256a_cold"));
2076 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
2077 &call (&label("key_256b"));
2078 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
2079 &call (&label("key_256a"));
2080 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
2081 &call (&label("key_256b"));
2082 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
2083 &call (&label("key_256a"));
2084 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
2085 &call (&label("key_256b"));
2086 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
2087 &call (&label("key_256a"));
2088 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
2089 &call (&label("key_256b"));
2090 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
2091 &call (&label("key_256a"));
2092 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
2093 &call (&label("key_256b"));
2094 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
2095 &call (&label("key_256a"));
2096 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
2097 &call (&label("key_256b"));
2098 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
2099 &call (&label("key_256a"));
2100 &$movekey (&QWP(0,$key),"xmm0");
2101 &mov (&DWP(16,$key),$rounds);
2102 &xor ("eax","eax");
2103 &ret();
2104
2105&set_label("key_256a",16);
2106 &$movekey (&QWP(0,$key),"xmm2");
2107 &lea ($key,&DWP(16,$key));
2108&set_label("key_256a_cold");
2109 &shufps ("xmm4","xmm0",0b00010000);
2110 &xorps ("xmm0","xmm4");
2111 &shufps ("xmm4","xmm0",0b10001100);
2112 &xorps ("xmm0","xmm4");
2113 &shufps ("xmm1","xmm1",0b11111111); # critical path
2114 &xorps ("xmm0","xmm1");
2115 &ret();
2116
2117&set_label("key_256b",16);
2118 &$movekey (&QWP(0,$key),"xmm0");
2119 &lea ($key,&DWP(16,$key));
2120
2121 &shufps ("xmm4","xmm2",0b00010000);
2122 &xorps ("xmm2","xmm4");
2123 &shufps ("xmm4","xmm2",0b10001100);
2124 &xorps ("xmm2","xmm4");
2125 &shufps ("xmm1","xmm1",0b10101010); # critical path
2126 &xorps ("xmm2","xmm1");
2127 &ret();
2128
2129&set_label("bad_pointer",4);
2130 &mov ("eax",-1);
2131 &ret ();
2132&set_label("bad_keybits",4);
2133 &mov ("eax",-2);
2134 &ret ();
2135&function_end_B("_aesni_set_encrypt_key");
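The key_128/key_128_cold ladder above is the standard aeskeygenassist expansion; here is a C sketch of one step under the same convention (the shufps/xorps pairs in the asm compute the same running xor without pslldq). Names are hypothetical.

/* One AES-128 expansion step, equivalent to key_128/key_128_cold:
 * aeskeygenassist supplies SubWord/RotWord/Rcon of the previous last
 * word; the fold computes the running xor of the previous round key's
 * words. */
#include <wmmintrin.h>

static __m128i aes128_expand_step(__m128i key, __m128i assist)
{
    __m128i t = _mm_slli_si128(key, 4);
    assist = _mm_shuffle_epi32(assist, 0xff);  /* broadcast top word */
    key = _mm_xor_si128(key, t);
    t = _mm_slli_si128(t, 4);
    key = _mm_xor_si128(key, t);
    t = _mm_slli_si128(t, 4);
    key = _mm_xor_si128(key, t);               /* prefix xors of words */
    return _mm_xor_si128(key, assist);
}

/* e.g. round key 1 from round key 0:
 *   rk1 = aes128_expand_step(rk0, _mm_aeskeygenassist_si128(rk0, 0x01));
 * with rcon stepping 0x01,0x02,...,0x1b,0x36 as in the ladder above. */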
2136
2137# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
2138# AES_KEY *key)
2139&function_begin_B("${PREFIX}_set_encrypt_key");
2140 &mov ("eax",&wparam(0));
2141 &mov ($rounds,&wparam(1));
2142 &mov ($key,&wparam(2));
2143 &call ("_aesni_set_encrypt_key");
2144 &ret ();
2145&function_end_B("${PREFIX}_set_encrypt_key");
2146
2147# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
2148# AES_KEY *key)
2149&function_begin_B("${PREFIX}_set_decrypt_key");
2150 &mov ("eax",&wparam(0));
2151 &mov ($rounds,&wparam(1));
2152 &mov ($key,&wparam(2));
2153 &call ("_aesni_set_encrypt_key");
2154 &mov ($key,&wparam(2));
2155 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
2156 &test ("eax","eax");
2157 &jnz (&label("dec_key_ret"));
2158 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
2159
2160 &$movekey ("xmm0",&QWP(0,$key)); # just swap
2161 &$movekey ("xmm1",&QWP(0,"eax"));
2162 &$movekey (&QWP(0,"eax"),"xmm0");
2163 &$movekey (&QWP(0,$key),"xmm1");
2164 &lea ($key,&DWP(16,$key));
2165 &lea ("eax",&DWP(-16,"eax"));
2166
2167&set_label("dec_key_inverse");
2168 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
2169 &$movekey ("xmm1",&QWP(0,"eax"));
2170 &aesimc ("xmm0","xmm0");
2171 &aesimc ("xmm1","xmm1");
2172 &lea ($key,&DWP(16,$key));
2173 &lea ("eax",&DWP(-16,"eax"));
2174 &$movekey (&QWP(16,"eax"),"xmm0");
2175 &$movekey (&QWP(-16,$key),"xmm1");
2176 &cmp ("eax",$key);
2177 &ja (&label("dec_key_inverse"));
2178
2179 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
2180 &aesimc ("xmm0","xmm0");
2181 &$movekey (&QWP(0,$key),"xmm0");
2182
2183 &xor ("eax","eax"); # return success
2184&set_label("dec_key_ret");
2185 &ret ();
2186&function_end_B("${PREFIX}_set_decrypt_key");
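What set_decrypt_key does above, sketched in C: aesdec wants the equivalent-inverse schedule, i.e. the encryption round keys in reverse order with InvMixColumns (aesimc) applied to all but the outermost two. The asm performs the same transform in place by swapping from both ends toward the middle; the out-of-place version below is just easier to read (hypothetical names, nr being the true round count).

#include <wmmintrin.h>

static void make_dec_schedule(const __m128i enc[], __m128i dec[], int nr)
{
    dec[0]  = enc[nr];                          /* last becomes first */
    for (int i = 1; i < nr; i++)
        dec[i] = _mm_aesimc_si128(enc[nr - i]); /* inverse middle */
    dec[nr] = enc[0];                           /* first becomes last */
}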
2187&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
2188
2189&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
index 49e0f4b351..499f3b3f42 100644
--- a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
+++ b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
@@ -11,6 +11,151 @@
11# OpenSSL context it's used with Intel engine, but can also be used as 11# OpenSSL context it's used with Intel engine, but can also be used as
12# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for 12# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
13# details]. 13# details].
14#
15# Performance.
16#
17# Given the aes(enc|dec) instructions' latency, asymptotic performance
18# for non-parallelizable modes such as CBC encrypt is 3.75 cycles per
19# byte processed with a 128-bit key. And given their throughput,
20# asymptotic performance for parallelizable modes is 1.25 cycles per
21# byte. Being an asymptotic limit, it's not something you commonly
22# achieve in reality, but how close does one get? Below are results
23# collected for different modes and block sizes. Pairs of numbers are for en-/
24# decryption.
25#
26# 16-byte 64-byte 256-byte 1-KB 8-KB
27# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
28# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
29# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
30# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
31# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
32# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
33#
34# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
35# that the otherwise-used 'openssl speed -evp aes-128-??? -engine aesni
36# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
37# The results were collected with specially crafted speed.c benchmark
38# in order to compare them with results reported in "Intel Advanced
39# Encryption Standard (AES) New Instruction Set" White Paper Revision
40# 3.0 dated May 2010. All above results are consistently better. This
41# module also provides better performance for block sizes smaller than
42# 128 bytes at points *not* represented in the above table.
43#
44# Looking at the results for the 8-KB buffer.
45#
46# CFB and OFB results are far from the limit, because the implementation
47# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
48# single-block aesni_encrypt, which is not the most optimal way to go.
49# The CBC encrypt result is unexpectedly high and there is no documented
50# explanation for it. Seemingly there is a small penalty for feeding
51# the result back to the AES unit the way it's done in CBC mode. There
52# is nothing one can do and the result appears optimal. The CCM result
53# is identical to CBC, because CBC-MAC is essentially CBC encrypt
54# without saving output. CCM CTR "stays invisible," because it's neatly
55# interleaved with CBC-MAC. This provides ~30% improvement over a
56# "straightforward" CCM implementation with CTR and CBC-MAC performed
57# disjointly. Parallelizable modes practically achieve the theoretical
58# limit.
59#
60# Looking at how results vary with buffer size.
61#
62# Curves are practically saturated at 1-KB buffer size. In most cases
63# "256-byte" performance is >95% and "64-byte" is ~90% of the "8-KB" one.
64# The CTR curve doesn't follow this pattern and is the slowest-changing
65# one, with the "256-byte" result being 87% of "8-KB." This is because
66# the overhead in CTR mode is the most computationally intensive.
67# Small-block CCM decrypt is slower than encrypt, because the first CTR
68# and last CBC-MAC iterations can't be interleaved.
69#
70# Results for 192- and 256-bit keys.
71#
72# EVP-free results were observed to scale perfectly with the number of
73# rounds for larger block sizes, i.e. the 192-bit result being 10/12
74# times lower and the 256-bit one 10/14. Well, in the CBC encrypt case
75# differences are a tad smaller, because the above-mentioned penalty
76# biases all results by the same constant value. In a similar way,
77# function call overhead affects small-block performance, as well as
78# OFB and CFB results. Differences are not large; the most common
79# coefficients are 10/11.7 and 10/13.4 (as opposed to 10/12.0 and
80# 10/14.0), but one observes even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
81
82# January 2011
83#
84# While the Westmere processor features 6-cycle latency for aes[enc|dec]
85# instructions, which can be scheduled every second cycle, Sandy
86# Bridge spends 8 cycles per instruction, but it can schedule them
87# every cycle. This means that code targeting Westmere would perform
88# suboptimally on Sandy Bridge. Therefore this update.
89#
90# In addition, non-parallelizable CBC encrypt (as well as CCM) is
91# optimized. Relative improvement might appear modest, 8% on Westmere,
92# but in absolute terms it's 3.77 cycles per byte encrypted with
93# a 128-bit key on Westmere, and 5.07 on Sandy Bridge. These numbers
94# should be compared to asymptotic limits of 3.75 for Westmere and
95# 5.00 for Sandy Bridge. Actually, the fact that they get this close
96# to asymptotic limits is quite amazing. Indeed, the limit is
97# calculated as latency times number of rounds, 10 for 128-bit key,
98# and divided by 16, the number of bytes in block, or in other words
99# it accounts *solely* for aesenc instructions. But there are extra
100# instructions, and numbers so close to the asymptotic limits mean
101# that it's as if it takes as little as *one* additional cycle to
102# execute all of them. How is it possible? It is possible thanks to
103# out-of-order execution logic, which manages to overlap post-
104# processing of previous block, things like saving the output, with
105# actual encryption of current block, as well as pre-processing of
106# current block, things like fetching input and xor-ing it with
107# 0-round element of the key schedule, with actual encryption of
108# previous block. Keep this in mind...
109#
110# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
111# performance is achieved by interleaving instructions working on
112# independent blocks. In that case the asymptotic limit for such modes
113# can be obtained by dividing the above-mentioned numbers by the AES
114# instructions' interleave factor. Westmere can execute at most 3
115# instructions at a time, meaning that the optimal interleave factor is
116# 3, and that's where the "magic" number of 1.25 comes from. "Optimal
117# interleave factor" means that increasing the interleave factor does
118# not improve performance. The formula has proven to reflect reality
119# pretty well on Westmere... Sandy Bridge on the other hand can
120# execute up to 8 AES instructions at a time, so how does varying the
121# interleave factor affect the performance? Here is a table for ECB
122# (numbers are cycles per byte processed with a 128-bit key):
123#
124# instruction interleave factor 3x 6x 8x
125# theoretical asymptotic limit 1.67 0.83 0.625
126# measured performance for 8KB block 1.05 0.86 0.84
127#
128# "as if" interleave factor 4.7x 5.8x 6.0x
129#
130# Further data for other parallelizable modes:
131#
132# CBC decrypt 1.16 0.93 0.93
133# CTR 1.14 0.91 n/a
134#
135# Well, given the 3x column it's probably inappropriate to call the
136# limit asymptotic if it can be surpassed, isn't it? What happens there?
137# Rewind to the CBC paragraph for the answer. Yes, out-of-order
138# execution magic is responsible for this. The processor overlaps not
139# only the additional instructions with AES ones, but even AES
140# instructions processing adjacent triplets of independent blocks. In
141# the 6x case the additional instructions still claim a disproportionately
142# small amount of additional cycles, but in the 8x case the instruction
143# count must be a tad too high for the out-of-order logic to cope with,
144# and the AES unit remains underutilized... As you can see, 8x interleave
145# is hardly justifiable, so there's no need to feel bad that 32-bit
146# aesni-x86.pl utilizes 6x interleave because of limited register bank capacity.
147#
148# Higher interleave factors do have a negative impact on Westmere
149# performance. While for ECB mode it's a negligible ~1.5%, other
150# parallelizable modes perform ~5% worse, which is outweighed by ~25%
151# improvement on Sandy Bridge. To balance the regression on Westmere,
152# CTR mode was implemented with a 6x aesenc interleave factor.
153
154# April 2011
155#
156# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
157# one byte out of 8KB with a 128-bit key, Sandy Bridge 0.97. Just as
158# in CTR mode, the AES instruction interleave factor was chosen to be 6x.
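The limits quoted in both notes follow from one line of arithmetic; a throwaway C rendering, with all inputs taken from the commentary above:

/* Asymptotic cycles-per-byte limits quoted above: instruction latency
 * times round count, divided by the 16-byte block size, and further by
 * the interleave factor for parallelizable modes. */
static double aes_cpb_limit(double latency, int rounds, int interleave)
{
    return latency * rounds / 16.0 / interleave;
}
/* aes_cpb_limit(6, 10, 1) == 3.75   Westmere, CBC encrypt
 * aes_cpb_limit(8, 10, 1) == 5.00   Sandy Bridge, CBC encrypt
 * aes_cpb_limit(6, 10, 3) == 1.25   Westmere, parallelizable modes */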
14 159
15$PREFIX="aesni"; # if $PREFIX is set to "AES", the script 160$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
16 # generates drop-in replacement for 161 # generates drop-in replacement for
@@ -29,7 +174,7 @@ die "can't locate x86_64-xlate.pl";
29 174
30open STDOUT,"| $^X $xlate $flavour $output"; 175open STDOUT,"| $^X $xlate $flavour $output";
31 176
32$movkey = $PREFIX eq "aesni" ? "movaps" : "movups"; 177$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
33@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order 178@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
34 ("%rdi","%rsi","%rdx","%rcx"); # Unix order 179 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
35 180
@@ -41,18 +186,20 @@ $inp="%rdi";
41$out="%rsi"; 186$out="%rsi";
42$len="%rdx"; 187$len="%rdx";
43$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! 188$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
44$ivp="%r8"; # cbc 189$ivp="%r8"; # cbc, ctr, ...
45 190
46$rnds_="%r10d"; # backup copy for $rounds 191$rnds_="%r10d"; # backup copy for $rounds
47$key_="%r11"; # backup copy for $key 192$key_="%r11"; # backup copy for $key
48 193
49# %xmm register layout 194# %xmm register layout
50$inout0="%xmm0"; $inout1="%xmm1"; 195$rndkey0="%xmm0"; $rndkey1="%xmm1";
51$inout2="%xmm2"; $inout3="%xmm3"; 196$inout0="%xmm2"; $inout1="%xmm3";
52$rndkey0="%xmm4"; $rndkey1="%xmm5"; 197$inout2="%xmm4"; $inout3="%xmm5";
53 198$inout4="%xmm6"; $inout5="%xmm7";
54$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt 199$inout6="%xmm8"; $inout7="%xmm9";
55$in1="%xmm8"; $in2="%xmm9"; 200
201$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
202$in0="%xmm8"; $iv="%xmm9";
56 203
57# Inline version of internal aesni_[en|de]crypt1. 204# Inline version of internal aesni_[en|de]crypt1.
58# 205#
@@ -60,20 +207,29 @@ $in1="%xmm8"; $in2="%xmm9";
60# cycles which take care of loop variables... 207# cycles which take care of loop variables...
61{ my $sn; 208{ my $sn;
62sub aesni_generate1 { 209sub aesni_generate1 {
63my ($p,$key,$rounds)=@_; 210my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
64++$sn; 211++$sn;
65$code.=<<___; 212$code.=<<___;
66 $movkey ($key),$rndkey0 213 $movkey ($key),$rndkey0
67 $movkey 16($key),$rndkey1 214 $movkey 16($key),$rndkey1
215___
216$code.=<<___ if (defined($ivec));
217 xorps $rndkey0,$ivec
218 lea 32($key),$key
219 xorps $ivec,$inout
220___
221$code.=<<___ if (!defined($ivec));
68 lea 32($key),$key 222 lea 32($key),$key
69 pxor $rndkey0,$inout0 223 xorps $rndkey0,$inout
224___
225$code.=<<___;
70.Loop_${p}1_$sn: 226.Loop_${p}1_$sn:
71 aes${p} $rndkey1,$inout0 227 aes${p} $rndkey1,$inout
72 dec $rounds 228 dec $rounds
73 $movkey ($key),$rndkey1 229 $movkey ($key),$rndkey1
74 lea 16($key),$key 230 lea 16($key),$key
75 jnz .Loop_${p}1_$sn # loop body is 16 bytes 231 jnz .Loop_${p}1_$sn # loop body is 16 bytes
76 aes${p}last $rndkey1,$inout0 232 aes${p}last $rndkey1,$inout
77___ 233___
78}} 234}}
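A C rendering of the loop this generator emits may help; note that the rounds value it consumes is what this module stores at key->rounds, i.e. one less than the AES round count (the 10/12/14rounds paths in aesni-x86.pl above store 9/11/13). rk[] is the expanded schedule; names are hypothetical.

#include <wmmintrin.h>

static __m128i aesni_encrypt1(__m128i b, const __m128i *rk, int rounds)
{
    b = _mm_xor_si128(b, rk[0]);            /* xorps $rndkey0,$inout */
    do {                                    /* .Loop_enc1_N */
        b = _mm_aesenc_si128(b, *++rk);     /* aes${p} $rndkey1,$inout */
    } while (--rounds);                     /* dec $rounds; jnz */
    return _mm_aesenclast_si128(b, rk[1]);  /* aes${p}last */
}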
79# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); 235# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
@@ -86,7 +242,7 @@ $code.=<<___;
86.align 16 242.align 16
87${PREFIX}_encrypt: 243${PREFIX}_encrypt:
88 movups ($inp),$inout0 # load input 244 movups ($inp),$inout0 # load input
89 mov 240($key),$rounds # pull $rounds 245 mov 240($key),$rounds # key->rounds
90___ 246___
91 &aesni_generate1("enc",$key,$rounds); 247 &aesni_generate1("enc",$key,$rounds);
92$code.=<<___; 248$code.=<<___;
@@ -99,7 +255,7 @@ $code.=<<___;
99.align 16 255.align 16
100${PREFIX}_decrypt: 256${PREFIX}_decrypt:
101 movups ($inp),$inout0 # load input 257 movups ($inp),$inout0 # load input
102 mov 240($key),$rounds # pull $rounds 258 mov 240($key),$rounds # key->rounds
103___ 259___
104 &aesni_generate1("dec",$key,$rounds); 260 &aesni_generate1("dec",$key,$rounds);
105$code.=<<___; 261$code.=<<___;
@@ -109,16 +265,16 @@ $code.=<<___;
109___ 265___
110} 266}
111 267
112# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave 268# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
113# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] 269# factor. Why was the 3x subroutine originally used in loops? Even though
114# latency is 6, it turned out that it can be scheduled only every 270# aes[enc|dec] latency was originally 6, it could be scheduled only
115# *second* cycle. Thus 3x interleave is the one providing optimal 271# every *2nd* cycle. Thus 3x interleave was the one providing optimal
116# utilization, i.e. when subroutine's throughput is virtually same as 272# utilization, i.e. when subroutine's throughput is virtually same as
117# of non-interleaved subroutine [for number of input blocks up to 3]. 273# of non-interleaved subroutine [for number of input blocks up to 3].
118# This is why it makes no sense to implement 2x subroutine. As soon 274# This is why it makes no sense to implement 2x subroutine.
119# as/if Intel improves throughput by making it possible to schedule 275# aes[enc|dec] latency in the next processor generation is 8, but the
120# the instructions in question *every* cycles I would have to 276# instructions can be scheduled every cycle. The optimal interleave for
121# implement 6x interleave and use it in loop... 277# the new processor is therefore 8x...
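The point of the interleave factor, sketched in C for the 3x case (rk[] and nr as in the earlier sketches, nr being the true round count): the three aesenc chains are independent, so their latencies overlap. The 6x and 8x subroutines below are the same idea with wider arrays.

#include <wmmintrin.h>

static void aesni_encrypt3_sketch(__m128i b[3], const __m128i rk[], int nr)
{
    for (int j = 0; j < 3; j++)
        b[j] = _mm_xor_si128(b[j], rk[0]);
    for (int i = 1; i < nr; i++)
        for (int j = 0; j < 3; j++)          /* independent dependency chains */
            b[j] = _mm_aesenc_si128(b[j], rk[i]);
    for (int j = 0; j < 3; j++)
        b[j] = _mm_aesenclast_si128(b[j], rk[nr]);
}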
122sub aesni_generate3 { 278sub aesni_generate3 {
123my $dir=shift; 279my $dir=shift;
124# As already mentioned it takes in $key and $rounds, which are *not* 280# As already mentioned it takes in $key and $rounds, which are *not*
@@ -131,25 +287,25 @@ _aesni_${dir}rypt3:
131 shr \$1,$rounds 287 shr \$1,$rounds
132 $movkey 16($key),$rndkey1 288 $movkey 16($key),$rndkey1
133 lea 32($key),$key 289 lea 32($key),$key
134 pxor $rndkey0,$inout0 290 xorps $rndkey0,$inout0
135 pxor $rndkey0,$inout1 291 xorps $rndkey0,$inout1
136 pxor $rndkey0,$inout2 292 xorps $rndkey0,$inout2
293 $movkey ($key),$rndkey0
137 294
138.L${dir}_loop3: 295.L${dir}_loop3:
139 aes${dir} $rndkey1,$inout0 296 aes${dir} $rndkey1,$inout0
140 $movkey ($key),$rndkey0
141 aes${dir} $rndkey1,$inout1 297 aes${dir} $rndkey1,$inout1
142 dec $rounds 298 dec $rounds
143 aes${dir} $rndkey1,$inout2 299 aes${dir} $rndkey1,$inout2
144 aes${dir} $rndkey0,$inout0
145 $movkey 16($key),$rndkey1 300 $movkey 16($key),$rndkey1
301 aes${dir} $rndkey0,$inout0
146 aes${dir} $rndkey0,$inout1 302 aes${dir} $rndkey0,$inout1
147 lea 32($key),$key 303 lea 32($key),$key
148 aes${dir} $rndkey0,$inout2 304 aes${dir} $rndkey0,$inout2
305 $movkey ($key),$rndkey0
149 jnz .L${dir}_loop3 306 jnz .L${dir}_loop3
150 307
151 aes${dir} $rndkey1,$inout0 308 aes${dir} $rndkey1,$inout0
152 $movkey ($key),$rndkey0
153 aes${dir} $rndkey1,$inout1 309 aes${dir} $rndkey1,$inout1
154 aes${dir} $rndkey1,$inout2 310 aes${dir} $rndkey1,$inout2
155 aes${dir}last $rndkey0,$inout0 311 aes${dir}last $rndkey0,$inout0
@@ -175,28 +331,28 @@ _aesni_${dir}rypt4:
175 shr \$1,$rounds 331 shr \$1,$rounds
176 $movkey 16($key),$rndkey1 332 $movkey 16($key),$rndkey1
177 lea 32($key),$key 333 lea 32($key),$key
178 pxor $rndkey0,$inout0 334 xorps $rndkey0,$inout0
179 pxor $rndkey0,$inout1 335 xorps $rndkey0,$inout1
180 pxor $rndkey0,$inout2 336 xorps $rndkey0,$inout2
181 pxor $rndkey0,$inout3 337 xorps $rndkey0,$inout3
338 $movkey ($key),$rndkey0
182 339
183.L${dir}_loop4: 340.L${dir}_loop4:
184 aes${dir} $rndkey1,$inout0 341 aes${dir} $rndkey1,$inout0
185 $movkey ($key),$rndkey0
186 aes${dir} $rndkey1,$inout1 342 aes${dir} $rndkey1,$inout1
187 dec $rounds 343 dec $rounds
188 aes${dir} $rndkey1,$inout2 344 aes${dir} $rndkey1,$inout2
189 aes${dir} $rndkey1,$inout3 345 aes${dir} $rndkey1,$inout3
190 aes${dir} $rndkey0,$inout0
191 $movkey 16($key),$rndkey1 346 $movkey 16($key),$rndkey1
347 aes${dir} $rndkey0,$inout0
192 aes${dir} $rndkey0,$inout1 348 aes${dir} $rndkey0,$inout1
193 lea 32($key),$key 349 lea 32($key),$key
194 aes${dir} $rndkey0,$inout2 350 aes${dir} $rndkey0,$inout2
195 aes${dir} $rndkey0,$inout3 351 aes${dir} $rndkey0,$inout3
352 $movkey ($key),$rndkey0
196 jnz .L${dir}_loop4 353 jnz .L${dir}_loop4
197 354
198 aes${dir} $rndkey1,$inout0 355 aes${dir} $rndkey1,$inout0
199 $movkey ($key),$rndkey0
200 aes${dir} $rndkey1,$inout1 356 aes${dir} $rndkey1,$inout1
201 aes${dir} $rndkey1,$inout2 357 aes${dir} $rndkey1,$inout2
202 aes${dir} $rndkey1,$inout3 358 aes${dir} $rndkey1,$inout3
@@ -208,12 +364,158 @@ _aesni_${dir}rypt4:
208.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 364.size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
209___ 365___
210} 366}
367sub aesni_generate6 {
368my $dir=shift;
369# As already mentioned it takes in $key and $rounds, which are *not*
370# preserved. $inout[0-5] is cipher/clear text...
371$code.=<<___;
372.type _aesni_${dir}rypt6,\@abi-omnipotent
373.align 16
374_aesni_${dir}rypt6:
375 $movkey ($key),$rndkey0
376 shr \$1,$rounds
377 $movkey 16($key),$rndkey1
378 lea 32($key),$key
379 xorps $rndkey0,$inout0
380 pxor $rndkey0,$inout1
381 aes${dir} $rndkey1,$inout0
382 pxor $rndkey0,$inout2
383 aes${dir} $rndkey1,$inout1
384 pxor $rndkey0,$inout3
385 aes${dir} $rndkey1,$inout2
386 pxor $rndkey0,$inout4
387 aes${dir} $rndkey1,$inout3
388 pxor $rndkey0,$inout5
389 dec $rounds
390 aes${dir} $rndkey1,$inout4
391 $movkey ($key),$rndkey0
392 aes${dir} $rndkey1,$inout5
393 jmp .L${dir}_loop6_enter
394.align 16
395.L${dir}_loop6:
396 aes${dir} $rndkey1,$inout0
397 aes${dir} $rndkey1,$inout1
398 dec $rounds
399 aes${dir} $rndkey1,$inout2
400 aes${dir} $rndkey1,$inout3
401 aes${dir} $rndkey1,$inout4
402 aes${dir} $rndkey1,$inout5
403.L${dir}_loop6_enter: # happens to be 16-byte aligned
404 $movkey 16($key),$rndkey1
405 aes${dir} $rndkey0,$inout0
406 aes${dir} $rndkey0,$inout1
407 lea 32($key),$key
408 aes${dir} $rndkey0,$inout2
409 aes${dir} $rndkey0,$inout3
410 aes${dir} $rndkey0,$inout4
411 aes${dir} $rndkey0,$inout5
412 $movkey ($key),$rndkey0
413 jnz .L${dir}_loop6
414
415 aes${dir} $rndkey1,$inout0
416 aes${dir} $rndkey1,$inout1
417 aes${dir} $rndkey1,$inout2
418 aes${dir} $rndkey1,$inout3
419 aes${dir} $rndkey1,$inout4
420 aes${dir} $rndkey1,$inout5
421 aes${dir}last $rndkey0,$inout0
422 aes${dir}last $rndkey0,$inout1
423 aes${dir}last $rndkey0,$inout2
424 aes${dir}last $rndkey0,$inout3
425 aes${dir}last $rndkey0,$inout4
426 aes${dir}last $rndkey0,$inout5
427 ret
428.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
429___
430}
431sub aesni_generate8 {
432my $dir=shift;
433# As already mentioned it takes in $key and $rounds, which are *not*
434# preserved. $inout[0-7] is cipher/clear text...
435$code.=<<___;
436.type _aesni_${dir}rypt8,\@abi-omnipotent
437.align 16
438_aesni_${dir}rypt8:
439 $movkey ($key),$rndkey0
440 shr \$1,$rounds
441 $movkey 16($key),$rndkey1
442 lea 32($key),$key
443 xorps $rndkey0,$inout0
444 xorps $rndkey0,$inout1
445 aes${dir} $rndkey1,$inout0
446 pxor $rndkey0,$inout2
447 aes${dir} $rndkey1,$inout1
448 pxor $rndkey0,$inout3
449 aes${dir} $rndkey1,$inout2
450 pxor $rndkey0,$inout4
451 aes${dir} $rndkey1,$inout3
452 pxor $rndkey0,$inout5
453 dec $rounds
454 aes${dir} $rndkey1,$inout4
455 pxor $rndkey0,$inout6
456 aes${dir} $rndkey1,$inout5
457 pxor $rndkey0,$inout7
458 $movkey ($key),$rndkey0
459 aes${dir} $rndkey1,$inout6
460 aes${dir} $rndkey1,$inout7
461 $movkey 16($key),$rndkey1
462 jmp .L${dir}_loop8_enter
463.align 16
464.L${dir}_loop8:
465 aes${dir} $rndkey1,$inout0
466 aes${dir} $rndkey1,$inout1
467 dec $rounds
468 aes${dir} $rndkey1,$inout2
469 aes${dir} $rndkey1,$inout3
470 aes${dir} $rndkey1,$inout4
471 aes${dir} $rndkey1,$inout5
472 aes${dir} $rndkey1,$inout6
473 aes${dir} $rndkey1,$inout7
474 $movkey 16($key),$rndkey1
475.L${dir}_loop8_enter: # happens to be 16-byte aligned
476 aes${dir} $rndkey0,$inout0
477 aes${dir} $rndkey0,$inout1
478 lea 32($key),$key
479 aes${dir} $rndkey0,$inout2
480 aes${dir} $rndkey0,$inout3
481 aes${dir} $rndkey0,$inout4
482 aes${dir} $rndkey0,$inout5
483 aes${dir} $rndkey0,$inout6
484 aes${dir} $rndkey0,$inout7
485 $movkey ($key),$rndkey0
486 jnz .L${dir}_loop8
487
488 aes${dir} $rndkey1,$inout0
489 aes${dir} $rndkey1,$inout1
490 aes${dir} $rndkey1,$inout2
491 aes${dir} $rndkey1,$inout3
492 aes${dir} $rndkey1,$inout4
493 aes${dir} $rndkey1,$inout5
494 aes${dir} $rndkey1,$inout6
495 aes${dir} $rndkey1,$inout7
496 aes${dir}last $rndkey0,$inout0
497 aes${dir}last $rndkey0,$inout1
498 aes${dir}last $rndkey0,$inout2
499 aes${dir}last $rndkey0,$inout3
500 aes${dir}last $rndkey0,$inout4
501 aes${dir}last $rndkey0,$inout5
502 aes${dir}last $rndkey0,$inout6
503 aes${dir}last $rndkey0,$inout7
504 ret
505.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
506___
507}
211&aesni_generate3("enc") if ($PREFIX eq "aesni"); 508&aesni_generate3("enc") if ($PREFIX eq "aesni");
212&aesni_generate3("dec"); 509&aesni_generate3("dec");
213&aesni_generate4("enc") if ($PREFIX eq "aesni"); 510&aesni_generate4("enc") if ($PREFIX eq "aesni");
214&aesni_generate4("dec"); 511&aesni_generate4("dec");
512&aesni_generate6("enc") if ($PREFIX eq "aesni");
513&aesni_generate6("dec");
514&aesni_generate8("enc") if ($PREFIX eq "aesni");
515&aesni_generate8("dec");
215 516
216if ($PREFIX eq "aesni") { 517if ($PREFIX eq "aesni") {
518########################################################################
217# void aesni_ecb_encrypt (const void *in, void *out, 519# void aesni_ecb_encrypt (const void *in, void *out,
218# size_t length, const AES_KEY *key, 520# size_t length, const AES_KEY *key,
219# int enc); 521# int enc);
@@ -222,54 +524,98 @@ $code.=<<___;
222.type aesni_ecb_encrypt,\@function,5 524.type aesni_ecb_encrypt,\@function,5
223.align 16 525.align 16
224aesni_ecb_encrypt: 526aesni_ecb_encrypt:
225 cmp \$16,$len # check length
226 jb .Lecb_ret
227
228 mov 240($key),$rounds # pull $rounds
229 and \$-16,$len 527 and \$-16,$len
528 jz .Lecb_ret
529
530 mov 240($key),$rounds # key->rounds
531 $movkey ($key),$rndkey0
230 mov $key,$key_ # backup $key 532 mov $key,$key_ # backup $key
231 test %r8d,%r8d # 5th argument
232 mov $rounds,$rnds_ # backup $rounds 533 mov $rounds,$rnds_ # backup $rounds
534 test %r8d,%r8d # 5th argument
233 jz .Lecb_decrypt 535 jz .Lecb_decrypt
234#--------------------------- ECB ENCRYPT ------------------------------# 536#--------------------------- ECB ENCRYPT ------------------------------#
235 sub \$0x40,$len 537 cmp \$0x80,$len
236 jbe .Lecb_enc_tail 538 jb .Lecb_enc_tail
237 jmp .Lecb_enc_loop3 539
540 movdqu ($inp),$inout0
541 movdqu 0x10($inp),$inout1
542 movdqu 0x20($inp),$inout2
543 movdqu 0x30($inp),$inout3
544 movdqu 0x40($inp),$inout4
545 movdqu 0x50($inp),$inout5
546 movdqu 0x60($inp),$inout6
547 movdqu 0x70($inp),$inout7
548 lea 0x80($inp),$inp
549 sub \$0x80,$len
550 jmp .Lecb_enc_loop8_enter
238.align 16 551.align 16
239.Lecb_enc_loop3: 552.Lecb_enc_loop8:
240 movups ($inp),$inout0 553 movups $inout0,($out)
241 movups 0x10($inp),$inout1
242 movups 0x20($inp),$inout2
243 call _aesni_encrypt3
244 sub \$0x30,$len
245 lea 0x30($inp),$inp
246 lea 0x30($out),$out
247 movups $inout0,-0x30($out)
248 mov $rnds_,$rounds # restore $rounds
249 movups $inout1,-0x20($out)
250 mov $key_,$key # restore $key 554 mov $key_,$key # restore $key
251 movups $inout2,-0x10($out) 555 movdqu ($inp),$inout0
252 ja .Lecb_enc_loop3 556 mov $rnds_,$rounds # restore $rounds
557 movups $inout1,0x10($out)
558 movdqu 0x10($inp),$inout1
559 movups $inout2,0x20($out)
560 movdqu 0x20($inp),$inout2
561 movups $inout3,0x30($out)
562 movdqu 0x30($inp),$inout3
563 movups $inout4,0x40($out)
564 movdqu 0x40($inp),$inout4
565 movups $inout5,0x50($out)
566 movdqu 0x50($inp),$inout5
567 movups $inout6,0x60($out)
568 movdqu 0x60($inp),$inout6
569 movups $inout7,0x70($out)
570 lea 0x80($out),$out
571 movdqu 0x70($inp),$inout7
572 lea 0x80($inp),$inp
573.Lecb_enc_loop8_enter:
574
575 call _aesni_encrypt8
576
577 sub \$0x80,$len
578 jnc .Lecb_enc_loop8
253 579
254.Lecb_enc_tail: 580 movups $inout0,($out)
255 add \$0x40,$len 581 mov $key_,$key # restore $key
582 movups $inout1,0x10($out)
583 mov $rnds_,$rounds # restore $rounds
584 movups $inout2,0x20($out)
585 movups $inout3,0x30($out)
586 movups $inout4,0x40($out)
587 movups $inout5,0x50($out)
588 movups $inout6,0x60($out)
589 movups $inout7,0x70($out)
590 lea 0x80($out),$out
591 add \$0x80,$len
256 jz .Lecb_ret 592 jz .Lecb_ret
257 593
258 cmp \$0x10,$len 594.Lecb_enc_tail:
259 movups ($inp),$inout0 595 movups ($inp),$inout0
260 je .Lecb_enc_one
261 cmp \$0x20,$len 596 cmp \$0x20,$len
597 jb .Lecb_enc_one
262 movups 0x10($inp),$inout1 598 movups 0x10($inp),$inout1
263 je .Lecb_enc_two 599 je .Lecb_enc_two
264 cmp \$0x30,$len
265 movups 0x20($inp),$inout2 600 movups 0x20($inp),$inout2
266 je .Lecb_enc_three 601 cmp \$0x40,$len
602 jb .Lecb_enc_three
267 movups 0x30($inp),$inout3 603 movups 0x30($inp),$inout3
268 call _aesni_encrypt4 604 je .Lecb_enc_four
605 movups 0x40($inp),$inout4
606 cmp \$0x60,$len
607 jb .Lecb_enc_five
608 movups 0x50($inp),$inout5
609 je .Lecb_enc_six
610 movdqu 0x60($inp),$inout6
611 call _aesni_encrypt8
269 movups $inout0,($out) 612 movups $inout0,($out)
270 movups $inout1,0x10($out) 613 movups $inout1,0x10($out)
271 movups $inout2,0x20($out) 614 movups $inout2,0x20($out)
272 movups $inout3,0x30($out) 615 movups $inout3,0x30($out)
616 movups $inout4,0x40($out)
617 movups $inout5,0x50($out)
618 movups $inout6,0x60($out)
273 jmp .Lecb_ret 619 jmp .Lecb_ret
274.align 16 620.align 16
275.Lecb_enc_one: 621.Lecb_enc_one:
@@ -280,6 +626,7 @@ $code.=<<___;
280 jmp .Lecb_ret 626 jmp .Lecb_ret
281.align 16 627.align 16
282.Lecb_enc_two: 628.Lecb_enc_two:
629 xorps $inout2,$inout2
283 call _aesni_encrypt3 630 call _aesni_encrypt3
284 movups $inout0,($out) 631 movups $inout0,($out)
285 movups $inout1,0x10($out) 632 movups $inout1,0x10($out)
@@ -291,47 +638,121 @@ $code.=<<___;
291 movups $inout1,0x10($out) 638 movups $inout1,0x10($out)
292 movups $inout2,0x20($out) 639 movups $inout2,0x20($out)
293 jmp .Lecb_ret 640 jmp .Lecb_ret
641.align 16
642.Lecb_enc_four:
643 call _aesni_encrypt4
644 movups $inout0,($out)
645 movups $inout1,0x10($out)
646 movups $inout2,0x20($out)
647 movups $inout3,0x30($out)
648 jmp .Lecb_ret
649.align 16
650.Lecb_enc_five:
651 xorps $inout5,$inout5
652 call _aesni_encrypt6
653 movups $inout0,($out)
654 movups $inout1,0x10($out)
655 movups $inout2,0x20($out)
656 movups $inout3,0x30($out)
657 movups $inout4,0x40($out)
658 jmp .Lecb_ret
659.align 16
660.Lecb_enc_six:
661 call _aesni_encrypt6
662 movups $inout0,($out)
663 movups $inout1,0x10($out)
664 movups $inout2,0x20($out)
665 movups $inout3,0x30($out)
666 movups $inout4,0x40($out)
667 movups $inout5,0x50($out)
668 jmp .Lecb_ret
294 #--------------------------- ECB DECRYPT ------------------------------# 669 #--------------------------- ECB DECRYPT ------------------------------#
295.align 16 670.align 16
296.Lecb_decrypt: 671.Lecb_decrypt:
297 sub \$0x40,$len 672 cmp \$0x80,$len
298 jbe .Lecb_dec_tail 673 jb .Lecb_dec_tail
299 jmp .Lecb_dec_loop3 674
675 movdqu ($inp),$inout0
676 movdqu 0x10($inp),$inout1
677 movdqu 0x20($inp),$inout2
678 movdqu 0x30($inp),$inout3
679 movdqu 0x40($inp),$inout4
680 movdqu 0x50($inp),$inout5
681 movdqu 0x60($inp),$inout6
682 movdqu 0x70($inp),$inout7
683 lea 0x80($inp),$inp
684 sub \$0x80,$len
685 jmp .Lecb_dec_loop8_enter
300.align 16 686.align 16
301.Lecb_dec_loop3: 687.Lecb_dec_loop8:
302 movups ($inp),$inout0 688 movups $inout0,($out)
303 movups 0x10($inp),$inout1
304 movups 0x20($inp),$inout2
305 call _aesni_decrypt3
306 sub \$0x30,$len
307 lea 0x30($inp),$inp
308 lea 0x30($out),$out
309 movups $inout0,-0x30($out)
310 mov $rnds_,$rounds # restore $rounds
311 movups $inout1,-0x20($out)
312 mov $key_,$key # restore $key 689 mov $key_,$key # restore $key
313 movups $inout2,-0x10($out) 690 movdqu ($inp),$inout0
314 ja .Lecb_dec_loop3 691 mov $rnds_,$rounds # restore $rounds
692 movups $inout1,0x10($out)
693 movdqu 0x10($inp),$inout1
694 movups $inout2,0x20($out)
695 movdqu 0x20($inp),$inout2
696 movups $inout3,0x30($out)
697 movdqu 0x30($inp),$inout3
698 movups $inout4,0x40($out)
699 movdqu 0x40($inp),$inout4
700 movups $inout5,0x50($out)
701 movdqu 0x50($inp),$inout5
702 movups $inout6,0x60($out)
703 movdqu 0x60($inp),$inout6
704 movups $inout7,0x70($out)
705 lea 0x80($out),$out
706 movdqu 0x70($inp),$inout7
707 lea 0x80($inp),$inp
708.Lecb_dec_loop8_enter:
709
710 call _aesni_decrypt8
711
712 $movkey ($key_),$rndkey0
713 sub \$0x80,$len
714 jnc .Lecb_dec_loop8
315 715
316.Lecb_dec_tail: 716 movups $inout0,($out)
317 add \$0x40,$len 717 mov $key_,$key # restore $key
718 movups $inout1,0x10($out)
719 mov $rnds_,$rounds # restore $rounds
720 movups $inout2,0x20($out)
721 movups $inout3,0x30($out)
722 movups $inout4,0x40($out)
723 movups $inout5,0x50($out)
724 movups $inout6,0x60($out)
725 movups $inout7,0x70($out)
726 lea 0x80($out),$out
727 add \$0x80,$len
318 jz .Lecb_ret 728 jz .Lecb_ret
319 729
320 cmp \$0x10,$len 730.Lecb_dec_tail:
321 movups ($inp),$inout0 731 movups ($inp),$inout0
322 je .Lecb_dec_one
323 cmp \$0x20,$len 732 cmp \$0x20,$len
733 jb .Lecb_dec_one
324 movups 0x10($inp),$inout1 734 movups 0x10($inp),$inout1
325 je .Lecb_dec_two 735 je .Lecb_dec_two
326 cmp \$0x30,$len
327 movups 0x20($inp),$inout2 736 movups 0x20($inp),$inout2
328 je .Lecb_dec_three 737 cmp \$0x40,$len
738 jb .Lecb_dec_three
329 movups 0x30($inp),$inout3 739 movups 0x30($inp),$inout3
330 call _aesni_decrypt4 740 je .Lecb_dec_four
741 movups 0x40($inp),$inout4
742 cmp \$0x60,$len
743 jb .Lecb_dec_five
744 movups 0x50($inp),$inout5
745 je .Lecb_dec_six
746 movups 0x60($inp),$inout6
747 $movkey ($key),$rndkey0
748 call _aesni_decrypt8
331 movups $inout0,($out) 749 movups $inout0,($out)
332 movups $inout1,0x10($out) 750 movups $inout1,0x10($out)
333 movups $inout2,0x20($out) 751 movups $inout2,0x20($out)
334 movups $inout3,0x30($out) 752 movups $inout3,0x30($out)
753 movups $inout4,0x40($out)
754 movups $inout5,0x50($out)
755 movups $inout6,0x60($out)
335 jmp .Lecb_ret 756 jmp .Lecb_ret
336.align 16 757.align 16
337.Lecb_dec_one: 758.Lecb_dec_one:
@@ -342,6 +763,7 @@ $code.=<<___;
342 jmp .Lecb_ret 763 jmp .Lecb_ret
343.align 16 764.align 16
344.Lecb_dec_two: 765.Lecb_dec_two:
766 xorps $inout2,$inout2
345 call _aesni_decrypt3 767 call _aesni_decrypt3
346 movups $inout0,($out) 768 movups $inout0,($out)
347 movups $inout1,0x10($out) 769 movups $inout1,0x10($out)
@@ -352,17 +774,1353 @@ $code.=<<___;
352 movups $inout0,($out) 774 movups $inout0,($out)
353 movups $inout1,0x10($out) 775 movups $inout1,0x10($out)
354 movups $inout2,0x20($out) 776 movups $inout2,0x20($out)
777 jmp .Lecb_ret
778.align 16
779.Lecb_dec_four:
780 call _aesni_decrypt4
781 movups $inout0,($out)
782 movups $inout1,0x10($out)
783 movups $inout2,0x20($out)
784 movups $inout3,0x30($out)
785 jmp .Lecb_ret
786.align 16
787.Lecb_dec_five:
788 xorps $inout5,$inout5
789 call _aesni_decrypt6
790 movups $inout0,($out)
791 movups $inout1,0x10($out)
792 movups $inout2,0x20($out)
793 movups $inout3,0x30($out)
794 movups $inout4,0x40($out)
795 jmp .Lecb_ret
796.align 16
797.Lecb_dec_six:
798 call _aesni_decrypt6
799 movups $inout0,($out)
800 movups $inout1,0x10($out)
801 movups $inout2,0x20($out)
802 movups $inout3,0x30($out)
803 movups $inout4,0x40($out)
804 movups $inout5,0x50($out)
355 805
356.Lecb_ret: 806.Lecb_ret:
357 ret 807 ret
358.size aesni_ecb_encrypt,.-aesni_ecb_encrypt 808.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
359___ 809___
810
811{
812######################################################################
813# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
814# size_t blocks, const AES_KEY *key,
815# const char *ivec,char *cmac);
816#
817# Handles only complete blocks, operates on a 64-bit counter and
818# does not update *ivec! Nor does it finalize CMAC value
819# (see engine/eng_aesni.c for details)
820#
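The per-block CCM64 work, sketched in C and assuming the aes_enc_block() helper from the CBC sketch earlier (nr is the true round count): one CTR encryption and one CBC-MAC absorption per block, sharing the key schedule. The asm below interleaves the two aesenc chains in a single loop, which the performance commentary above credits with ~30% over doing them disjointly. Counter byte order is glossed over here; the real code keeps the counter byte-swapped so a plain paddq can increment it.

#include <wmmintrin.h>
#include <stdint.h>
#include <stddef.h>

static __m128i ccm64_encrypt_sketch(const uint8_t *in, uint8_t *out,
                                    size_t blocks, const __m128i rk[],
                                    int nr, __m128i ctr, __m128i cmac)
{
    const __m128i one = _mm_set_epi32(0, 0, 0, 1);  /* low qword += 1 */
    for (size_t i = 0; i < blocks; i++) {
        __m128i ks = aes_enc_block(ctr, rk, nr);            /* E(K, ctr) */
        __m128i m  = _mm_loadu_si128((const __m128i *)(in + 16 * i));
        cmac = aes_enc_block(_mm_xor_si128(cmac, m), rk, nr); /* CBC-MAC */
        _mm_storeu_si128((__m128i *)(out + 16 * i), _mm_xor_si128(m, ks));
        ctr = _mm_add_epi64(ctr, one);       /* 64-bit counter bump */
    }
    return cmac;  /* not finalized, per the comment above */
}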
821{
822my $cmac="%r9"; # 6th argument
823
824my $increment="%xmm6";
825my $bswap_mask="%xmm7";
826
827$code.=<<___;
828.globl aesni_ccm64_encrypt_blocks
829.type aesni_ccm64_encrypt_blocks,\@function,6
830.align 16
831aesni_ccm64_encrypt_blocks:
832___
833$code.=<<___ if ($win64);
834 lea -0x58(%rsp),%rsp
835 movaps %xmm6,(%rsp)
836 movaps %xmm7,0x10(%rsp)
837 movaps %xmm8,0x20(%rsp)
838 movaps %xmm9,0x30(%rsp)
839.Lccm64_enc_body:
840___
841$code.=<<___;
842 mov 240($key),$rounds # key->rounds
843 movdqu ($ivp),$iv
844 movdqa .Lincrement64(%rip),$increment
845 movdqa .Lbswap_mask(%rip),$bswap_mask
846
847 shr \$1,$rounds
848 lea 0($key),$key_
849 movdqu ($cmac),$inout1
850 movdqa $iv,$inout0
851 mov $rounds,$rnds_
852 pshufb $bswap_mask,$iv
853 jmp .Lccm64_enc_outer
854.align 16
855.Lccm64_enc_outer:
856 $movkey ($key_),$rndkey0
857 mov $rnds_,$rounds
858 movups ($inp),$in0 # load inp
859
860 xorps $rndkey0,$inout0 # counter
861 $movkey 16($key_),$rndkey1
862 xorps $in0,$rndkey0
863 lea 32($key_),$key
864 xorps $rndkey0,$inout1 # cmac^=inp
865 $movkey ($key),$rndkey0
866
867.Lccm64_enc2_loop:
868 aesenc $rndkey1,$inout0
869 dec $rounds
870 aesenc $rndkey1,$inout1
871 $movkey 16($key),$rndkey1
872 aesenc $rndkey0,$inout0
873 lea 32($key),$key
874 aesenc $rndkey0,$inout1
875 $movkey 0($key),$rndkey0
876 jnz .Lccm64_enc2_loop
877 aesenc $rndkey1,$inout0
878 aesenc $rndkey1,$inout1
879 paddq $increment,$iv
880 aesenclast $rndkey0,$inout0
881 aesenclast $rndkey0,$inout1
882
883 dec $len
884 lea 16($inp),$inp
885 xorps $inout0,$in0 # inp ^= E(iv)
886 movdqa $iv,$inout0
887 movups $in0,($out) # save output
888 lea 16($out),$out
889 pshufb $bswap_mask,$inout0
890 jnz .Lccm64_enc_outer
891
892 movups $inout1,($cmac)
893___
894$code.=<<___ if ($win64);
895 movaps (%rsp),%xmm6
896 movaps 0x10(%rsp),%xmm7
897 movaps 0x20(%rsp),%xmm8
898 movaps 0x30(%rsp),%xmm9
899 lea 0x58(%rsp),%rsp
900.Lccm64_enc_ret:
901___
902$code.=<<___;
903 ret
904.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
905___
906######################################################################
907$code.=<<___;
908.globl aesni_ccm64_decrypt_blocks
909.type aesni_ccm64_decrypt_blocks,\@function,6
910.align 16
911aesni_ccm64_decrypt_blocks:
912___
913$code.=<<___ if ($win64);
914 lea -0x58(%rsp),%rsp
915 movaps %xmm6,(%rsp)
916 movaps %xmm7,0x10(%rsp)
917 movaps %xmm8,0x20(%rsp)
918 movaps %xmm9,0x30(%rsp)
919.Lccm64_dec_body:
920___
921$code.=<<___;
922 mov 240($key),$rounds # key->rounds
923 movups ($ivp),$iv
924 movdqu ($cmac),$inout1
925 movdqa .Lincrement64(%rip),$increment
926 movdqa .Lbswap_mask(%rip),$bswap_mask
927
928 movaps $iv,$inout0
929 mov $rounds,$rnds_
930 mov $key,$key_
931 pshufb $bswap_mask,$iv
932___
933 &aesni_generate1("enc",$key,$rounds);
934$code.=<<___;
935 movups ($inp),$in0 # load inp
936 paddq $increment,$iv
937 lea 16($inp),$inp
938 jmp .Lccm64_dec_outer
939.align 16
940.Lccm64_dec_outer:
941 xorps $inout0,$in0 # inp ^= E(iv)
942 movdqa $iv,$inout0
943 mov $rnds_,$rounds
944 movups $in0,($out) # save output
945 lea 16($out),$out
946 pshufb $bswap_mask,$inout0
947
948 sub \$1,$len
949 jz .Lccm64_dec_break
950
951 $movkey ($key_),$rndkey0
952 shr \$1,$rounds
953 $movkey 16($key_),$rndkey1
954 xorps $rndkey0,$in0
955 lea 32($key_),$key
956 xorps $rndkey0,$inout0
957 xorps $in0,$inout1 # cmac^=out
958 $movkey ($key),$rndkey0
959
960.Lccm64_dec2_loop:
961 aesenc $rndkey1,$inout0
962 dec $rounds
963 aesenc $rndkey1,$inout1
964 $movkey 16($key),$rndkey1
965 aesenc $rndkey0,$inout0
966 lea 32($key),$key
967 aesenc $rndkey0,$inout1
968 $movkey 0($key),$rndkey0
969 jnz .Lccm64_dec2_loop
970 movups ($inp),$in0 # load inp
971 paddq $increment,$iv
972 aesenc $rndkey1,$inout0
973 aesenc $rndkey1,$inout1
974 lea 16($inp),$inp
975 aesenclast $rndkey0,$inout0
976 aesenclast $rndkey0,$inout1
977 jmp .Lccm64_dec_outer
978
979.align 16
980.Lccm64_dec_break:
981 #xorps $in0,$inout1 # cmac^=out
982___
983 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
984$code.=<<___;
985 movups $inout1,($cmac)
986___
987$code.=<<___ if ($win64);
988 movaps (%rsp),%xmm6
989 movaps 0x10(%rsp),%xmm7
990 movaps 0x20(%rsp),%xmm8
991 movaps 0x30(%rsp),%xmm9
992 lea 0x58(%rsp),%rsp
993.Lccm64_dec_ret:
994___
995$code.=<<___;
996 ret
997.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
998___
999}
1000######################################################################
1001# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1002# size_t blocks, const AES_KEY *key,
1003# const char *ivec);
1004#
1005# Handles only complete blocks, operates on a 32-bit counter and
1006# does not update *ivec! (see engine/eng_aesni.c for details)
1007#
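The 32-bit counter handling, sketched in C with the SSE4.1 intrinsics matching the pextrd/pinsrd used below; the asm additionally keeps pre-byte-swapped counter vectors so plain paddd can step six counters at once. Helper names are hypothetical.

/* Only the last dword of the IV is a (big-endian) counter: pull it
 * out, increment as an integer, and put it back. */
#include <smmintrin.h>   /* SSE4.1: _mm_extract_epi32/_mm_insert_epi32 */
#include <stdint.h>

static uint32_t bswap32(uint32_t x)
{
    return (x >> 24) | ((x >> 8) & 0xff00) |
           ((x << 8) & 0xff0000) | (x << 24);
}

static __m128i ctr32_next(__m128i ivec)
{
    uint32_t c = bswap32((uint32_t)_mm_extract_epi32(ivec, 3));
    return _mm_insert_epi32(ivec, (int)bswap32(c + 1), 3);
}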
1008{
1009my $reserved = $win64?0:-0x28;
1010my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
1011my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
1012my $bswap_mask="%xmm15";
1013
1014$code.=<<___;
1015.globl aesni_ctr32_encrypt_blocks
1016.type aesni_ctr32_encrypt_blocks,\@function,5
1017.align 16
1018aesni_ctr32_encrypt_blocks:
1019___
1020$code.=<<___ if ($win64);
1021 lea -0xc8(%rsp),%rsp
1022 movaps %xmm6,0x20(%rsp)
1023 movaps %xmm7,0x30(%rsp)
1024 movaps %xmm8,0x40(%rsp)
1025 movaps %xmm9,0x50(%rsp)
1026 movaps %xmm10,0x60(%rsp)
1027 movaps %xmm11,0x70(%rsp)
1028 movaps %xmm12,0x80(%rsp)
1029 movaps %xmm13,0x90(%rsp)
1030 movaps %xmm14,0xa0(%rsp)
1031 movaps %xmm15,0xb0(%rsp)
1032.Lctr32_body:
1033___
1034$code.=<<___;
1035 cmp \$1,$len
1036 je .Lctr32_one_shortcut
1037
1038 movdqu ($ivp),$ivec
1039 movdqa .Lbswap_mask(%rip),$bswap_mask
1040 xor $rounds,$rounds
1041 pextrd \$3,$ivec,$rnds_ # pull 32-bit counter
1042 pinsrd \$3,$rounds,$ivec # wipe 32-bit counter
1043
1044 mov 240($key),$rounds # key->rounds
1045 bswap $rnds_
1046 pxor $iv0,$iv0 # vector of 3 32-bit counters
1047 pxor $iv1,$iv1 # vector of 3 32-bit counters
1048 pinsrd \$0,$rnds_,$iv0
1049 lea 3($rnds_),$key_
1050 pinsrd \$0,$key_,$iv1
1051 inc $rnds_
1052 pinsrd \$1,$rnds_,$iv0
1053 inc $key_
1054 pinsrd \$1,$key_,$iv1
1055 inc $rnds_
1056 pinsrd \$2,$rnds_,$iv0
1057 inc $key_
1058 pinsrd \$2,$key_,$iv1
1059 movdqa $iv0,$reserved(%rsp)
1060 pshufb $bswap_mask,$iv0
1061 movdqa $iv1,`$reserved+0x10`(%rsp)
1062 pshufb $bswap_mask,$iv1
1063
1064 pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword
1065 pshufd \$`2<<6`,$iv0,$inout1
1066 pshufd \$`1<<6`,$iv0,$inout2
1067 cmp \$6,$len
1068 jb .Lctr32_tail
1069 shr \$1,$rounds
1070 mov $key,$key_ # backup $key
1071 mov $rounds,$rnds_ # backup $rounds
1072 sub \$6,$len
1073 jmp .Lctr32_loop6
1074
1075.align 16
1076.Lctr32_loop6:
1077 pshufd \$`3<<6`,$iv1,$inout3
1078 por $ivec,$inout0 # merge counter-less ivec
1079 $movkey ($key_),$rndkey0
1080 pshufd \$`2<<6`,$iv1,$inout4
1081 por $ivec,$inout1
1082 $movkey 16($key_),$rndkey1
1083 pshufd \$`1<<6`,$iv1,$inout5
1084 por $ivec,$inout2
1085 por $ivec,$inout3
1086 xorps $rndkey0,$inout0
1087 por $ivec,$inout4
1088 por $ivec,$inout5
1089
1090 # inline _aesni_encrypt6 and interleave last rounds
1091 # with own code...
1092
1093 pxor $rndkey0,$inout1
1094 aesenc $rndkey1,$inout0
1095 lea 32($key_),$key
1096 pxor $rndkey0,$inout2
1097 aesenc $rndkey1,$inout1
1098 movdqa .Lincrement32(%rip),$iv1
1099 pxor $rndkey0,$inout3
1100 aesenc $rndkey1,$inout2
1101 movdqa $reserved(%rsp),$iv0
1102 pxor $rndkey0,$inout4
1103 aesenc $rndkey1,$inout3
1104 pxor $rndkey0,$inout5
1105 $movkey ($key),$rndkey0
1106 dec $rounds
1107 aesenc $rndkey1,$inout4
1108 aesenc $rndkey1,$inout5
1109 jmp .Lctr32_enc_loop6_enter
1110.align 16
1111.Lctr32_enc_loop6:
1112 aesenc $rndkey1,$inout0
1113 aesenc $rndkey1,$inout1
1114 dec $rounds
1115 aesenc $rndkey1,$inout2
1116 aesenc $rndkey1,$inout3
1117 aesenc $rndkey1,$inout4
1118 aesenc $rndkey1,$inout5
1119.Lctr32_enc_loop6_enter:
1120 $movkey 16($key),$rndkey1
1121 aesenc $rndkey0,$inout0
1122 aesenc $rndkey0,$inout1
1123 lea 32($key),$key
1124 aesenc $rndkey0,$inout2
1125 aesenc $rndkey0,$inout3
1126 aesenc $rndkey0,$inout4
1127 aesenc $rndkey0,$inout5
1128 $movkey ($key),$rndkey0
1129 jnz .Lctr32_enc_loop6
1130
1131 aesenc $rndkey1,$inout0
1132 paddd $iv1,$iv0 # increment counter vector
1133 aesenc $rndkey1,$inout1
1134 paddd `$reserved+0x10`(%rsp),$iv1
1135 aesenc $rndkey1,$inout2
1136 movdqa $iv0,$reserved(%rsp) # save counter vector
1137 aesenc $rndkey1,$inout3
1138 movdqa $iv1,`$reserved+0x10`(%rsp)
1139 aesenc $rndkey1,$inout4
1140 pshufb $bswap_mask,$iv0 # byte swap
1141 aesenc $rndkey1,$inout5
1142 pshufb $bswap_mask,$iv1
1143
1144 aesenclast $rndkey0,$inout0
1145 movups ($inp),$in0 # load input
1146 aesenclast $rndkey0,$inout1
1147 movups 0x10($inp),$in1
1148 aesenclast $rndkey0,$inout2
1149 movups 0x20($inp),$in2
1150 aesenclast $rndkey0,$inout3
1151 movups 0x30($inp),$in3
1152 aesenclast $rndkey0,$inout4
1153 movups 0x40($inp),$rndkey1
1154 aesenclast $rndkey0,$inout5
1155 movups 0x50($inp),$rndkey0
1156 lea 0x60($inp),$inp
1157
1158 xorps $inout0,$in0 # xor
1159 pshufd \$`3<<6`,$iv0,$inout0
1160 xorps $inout1,$in1
1161 pshufd \$`2<<6`,$iv0,$inout1
1162 movups $in0,($out) # store output
1163 xorps $inout2,$in2
1164 pshufd \$`1<<6`,$iv0,$inout2
1165 movups $in1,0x10($out)
1166 xorps $inout3,$in3
1167 movups $in2,0x20($out)
1168 xorps $inout4,$rndkey1
1169 movups $in3,0x30($out)
1170 xorps $inout5,$rndkey0
1171 movups $rndkey1,0x40($out)
1172 movups $rndkey0,0x50($out)
1173 lea 0x60($out),$out
1174 mov $rnds_,$rounds
1175 sub \$6,$len
1176 jnc .Lctr32_loop6
1177
1178 add \$6,$len
1179 jz .Lctr32_done
1180 mov $key_,$key # restore $key
1181 lea 1($rounds,$rounds),$rounds # restore original value
1182
1183.Lctr32_tail:
1184 por $ivec,$inout0
1185 movups ($inp),$in0
1186 cmp \$2,$len
1187 jb .Lctr32_one
1188
1189 por $ivec,$inout1
1190 movups 0x10($inp),$in1
1191 je .Lctr32_two
1192
1193 pshufd \$`3<<6`,$iv1,$inout3
1194 por $ivec,$inout2
1195 movups 0x20($inp),$in2
1196 cmp \$4,$len
1197 jb .Lctr32_three
1198
1199 pshufd \$`2<<6`,$iv1,$inout4
1200 por $ivec,$inout3
1201 movups 0x30($inp),$in3
1202 je .Lctr32_four
1203
1204 por $ivec,$inout4
1205 xorps $inout5,$inout5
1206
1207 call _aesni_encrypt6
1208
1209 movups 0x40($inp),$rndkey1
1210 xorps $inout0,$in0
1211 xorps $inout1,$in1
1212 movups $in0,($out)
1213 xorps $inout2,$in2
1214 movups $in1,0x10($out)
1215 xorps $inout3,$in3
1216 movups $in2,0x20($out)
1217 xorps $inout4,$rndkey1
1218 movups $in3,0x30($out)
1219 movups $rndkey1,0x40($out)
1220 jmp .Lctr32_done
1221
1222.align 16
1223.Lctr32_one_shortcut:
1224 movups ($ivp),$inout0
1225 movups ($inp),$in0
1226 mov 240($key),$rounds # key->rounds
1227.Lctr32_one:
1228___
1229 &aesni_generate1("enc",$key,$rounds);
1230$code.=<<___;
1231 xorps $inout0,$in0
1232 movups $in0,($out)
1233 jmp .Lctr32_done
1234
1235.align 16
1236.Lctr32_two:
1237 xorps $inout2,$inout2
1238 call _aesni_encrypt3
1239 xorps $inout0,$in0
1240 xorps $inout1,$in1
1241 movups $in0,($out)
1242 movups $in1,0x10($out)
1243 jmp .Lctr32_done
1244
1245.align 16
1246.Lctr32_three:
1247 call _aesni_encrypt3
1248 xorps $inout0,$in0
1249 xorps $inout1,$in1
1250 movups $in0,($out)
1251 xorps $inout2,$in2
1252 movups $in1,0x10($out)
1253 movups $in2,0x20($out)
1254 jmp .Lctr32_done
1255
1256.align 16
1257.Lctr32_four:
1258 call _aesni_encrypt4
1259 xorps $inout0,$in0
1260 xorps $inout1,$in1
1261 movups $in0,($out)
1262 xorps $inout2,$in2
1263 movups $in1,0x10($out)
1264 xorps $inout3,$in3
1265 movups $in2,0x20($out)
1266 movups $in3,0x30($out)
1267
1268.Lctr32_done:
1269___
1270$code.=<<___ if ($win64);
1271 movaps 0x20(%rsp),%xmm6
1272 movaps 0x30(%rsp),%xmm7
1273 movaps 0x40(%rsp),%xmm8
1274 movaps 0x50(%rsp),%xmm9
1275 movaps 0x60(%rsp),%xmm10
1276 movaps 0x70(%rsp),%xmm11
1277 movaps 0x80(%rsp),%xmm12
1278 movaps 0x90(%rsp),%xmm13
1279 movaps 0xa0(%rsp),%xmm14
1280 movaps 0xb0(%rsp),%xmm15
1281 lea 0xc8(%rsp),%rsp
1282.Lctr32_ret:
1283___
1284$code.=<<___;
1285 ret
1286.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1287___
360} 1288}
361 1289
1290######################################################################
1291# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1292# const AES_KEY *key1, const AES_KEY *key2,
1293# const unsigned char iv[16]);
1294#
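The heart of the routine is tweak generation; here is a C sketch of one GF(2^128) doubling, mirroring the pcmpgtd/pshufd/pand/paddq/pxor sequence used below and assuming .Lxts_magic holds the dwords {0x87,0,1,0}: shift the 128-bit tweak left one bit and fold the carry out of bit 127 back in as the residue 0x87.

#include <emmintrin.h>

static __m128i xts_double(__m128i tweak)
{
    const __m128i magic = _mm_set_epi32(0, 1, 0, 0x87);
    __m128i carry = _mm_cmpgt_epi32(_mm_setzero_si128(), tweak); /* sign masks */
    carry = _mm_shuffle_epi32(carry, 0x13);  /* route bit63/bit127 carries */
    carry = _mm_and_si128(carry, magic);     /* isolate carry and residue */
    tweak = _mm_add_epi64(tweak, tweak);     /* psllq $1 on each qword */
    return _mm_xor_si128(tweak, carry);
}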
1295{
1296my @tweak=map("%xmm$_",(10..15));
1297my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1298my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1299my $frame_size = 0x68 + ($win64?160:0);
1300
1301$code.=<<___;
1302.globl aesni_xts_encrypt
1303.type aesni_xts_encrypt,\@function,6
1304.align 16
1305aesni_xts_encrypt:
1306 lea -$frame_size(%rsp),%rsp
1307___
1308$code.=<<___ if ($win64);
1309 movaps %xmm6,0x60(%rsp)
1310 movaps %xmm7,0x70(%rsp)
1311 movaps %xmm8,0x80(%rsp)
1312 movaps %xmm9,0x90(%rsp)
1313 movaps %xmm10,0xa0(%rsp)
1314 movaps %xmm11,0xb0(%rsp)
1315 movaps %xmm12,0xc0(%rsp)
1316 movaps %xmm13,0xd0(%rsp)
1317 movaps %xmm14,0xe0(%rsp)
1318 movaps %xmm15,0xf0(%rsp)
1319.Lxts_enc_body:
1320___
1321$code.=<<___;
1322 movups ($ivp),@tweak[5] # load clear-text tweak
1323 mov 240(%r8),$rounds # key2->rounds
1324 mov 240($key),$rnds_ # key1->rounds
1325___
1326 # generate the tweak
1327 &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
1328$code.=<<___;
1329 mov $key,$key_ # backup $key
1330 mov $rnds_,$rounds # backup $rounds
1331 mov $len,$len_ # backup $len
1332 and \$-16,$len
1333
1334 movdqa .Lxts_magic(%rip),$twmask
1335 pxor $twtmp,$twtmp
1336 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1337___
1338 for ($i=0;$i<4;$i++) {
1339 $code.=<<___;
1340 pshufd \$0x13,$twtmp,$twres
1341 pxor $twtmp,$twtmp
1342 movdqa @tweak[5],@tweak[$i]
1343 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1344 pand $twmask,$twres # isolate carry and residue
1345	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1346 pxor $twres,@tweak[5]
1347___
1348 }
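The instruction pattern in this loop is a branch-free rendering of the tweak doubling: pcmpgtd against zero broadcasts each dword's sign bit, pshufd \$0x13 routes the bit-63 and bit-127 masks into the lanes they carry into (bit 63 into bit 64, bit 127 into the 0x87 residue at bit 0), and .Lxts_magic masks everything else off. A hedged SSE2-intrinsics sketch of the same computation:

#include <emmintrin.h>

/* Branch-free T = T * x, mirroring the pcmpgtd/pshufd/paddq/pand/pxor idiom. */
static __m128i xts_double_sse2(__m128i tweak)
{
    const __m128i magic = _mm_set_epi32(0, 1, 0, 0x87);        /* .Lxts_magic */
    __m128i hi  = _mm_cmpgt_epi32(_mm_setzero_si128(), tweak); /* sign masks */
    __m128i res = _mm_shuffle_epi32(hi, 0x13);                 /* route carries */
    tweak = _mm_add_epi64(tweak, tweak);                       /* psllq $1 via paddq */
    res   = _mm_and_si128(res, magic);                         /* carry + residue */
    return _mm_xor_si128(tweak, res);
}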
1349$code.=<<___;
1350 sub \$16*6,$len
1351 jc .Lxts_enc_short
1352
1353 shr \$1,$rounds
1354 sub \$1,$rounds
1355 mov $rounds,$rnds_
1356 jmp .Lxts_enc_grandloop
1357
1358.align 16
1359.Lxts_enc_grandloop:
1360 pshufd \$0x13,$twtmp,$twres
1361 movdqa @tweak[5],@tweak[4]
1362 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1363 movdqu `16*0`($inp),$inout0 # load input
1364 pand $twmask,$twres # isolate carry and residue
1365 movdqu `16*1`($inp),$inout1
1366 pxor $twres,@tweak[5]
1367
1368 movdqu `16*2`($inp),$inout2
1369 pxor @tweak[0],$inout0 # input^=tweak
1370 movdqu `16*3`($inp),$inout3
1371 pxor @tweak[1],$inout1
1372 movdqu `16*4`($inp),$inout4
1373 pxor @tweak[2],$inout2
1374 movdqu `16*5`($inp),$inout5
1375 lea `16*6`($inp),$inp
1376 pxor @tweak[3],$inout3
1377 $movkey ($key_),$rndkey0
1378 pxor @tweak[4],$inout4
1379 pxor @tweak[5],$inout5
1380
1381 # inline _aesni_encrypt6 and interleave first and last rounds
1382 # with own code...
1383 $movkey 16($key_),$rndkey1
1384 pxor $rndkey0,$inout0
1385 pxor $rndkey0,$inout1
1386 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
1387 aesenc $rndkey1,$inout0
1388 lea 32($key_),$key
1389 pxor $rndkey0,$inout2
1390 movdqa @tweak[1],`16*1`(%rsp)
1391 aesenc $rndkey1,$inout1
1392 pxor $rndkey0,$inout3
1393 movdqa @tweak[2],`16*2`(%rsp)
1394 aesenc $rndkey1,$inout2
1395 pxor $rndkey0,$inout4
1396 movdqa @tweak[3],`16*3`(%rsp)
1397 aesenc $rndkey1,$inout3
1398 pxor $rndkey0,$inout5
1399 $movkey ($key),$rndkey0
1400 dec $rounds
1401 movdqa @tweak[4],`16*4`(%rsp)
1402 aesenc $rndkey1,$inout4
1403 movdqa @tweak[5],`16*5`(%rsp)
1404 aesenc $rndkey1,$inout5
1405 pxor $twtmp,$twtmp
1406 pcmpgtd @tweak[5],$twtmp
1407 jmp .Lxts_enc_loop6_enter
1408
1409.align 16
1410.Lxts_enc_loop6:
1411 aesenc $rndkey1,$inout0
1412 aesenc $rndkey1,$inout1
1413 dec $rounds
1414 aesenc $rndkey1,$inout2
1415 aesenc $rndkey1,$inout3
1416 aesenc $rndkey1,$inout4
1417 aesenc $rndkey1,$inout5
1418.Lxts_enc_loop6_enter:
1419 $movkey 16($key),$rndkey1
1420 aesenc $rndkey0,$inout0
1421 aesenc $rndkey0,$inout1
1422 lea 32($key),$key
1423 aesenc $rndkey0,$inout2
1424 aesenc $rndkey0,$inout3
1425 aesenc $rndkey0,$inout4
1426 aesenc $rndkey0,$inout5
1427 $movkey ($key),$rndkey0
1428 jnz .Lxts_enc_loop6
1429
1430 pshufd \$0x13,$twtmp,$twres
1431 pxor $twtmp,$twtmp
1432 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1433 aesenc $rndkey1,$inout0
1434 pand $twmask,$twres # isolate carry and residue
1435 aesenc $rndkey1,$inout1
1436 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1437 aesenc $rndkey1,$inout2
1438 pxor $twres,@tweak[5]
1439 aesenc $rndkey1,$inout3
1440 aesenc $rndkey1,$inout4
1441 aesenc $rndkey1,$inout5
1442 $movkey 16($key),$rndkey1
1443
1444 pshufd \$0x13,$twtmp,$twres
1445 pxor $twtmp,$twtmp
1446 movdqa @tweak[5],@tweak[0]
1447 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1448 aesenc $rndkey0,$inout0
1449 pand $twmask,$twres # isolate carry and residue
1450 aesenc $rndkey0,$inout1
1451	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1452 aesenc $rndkey0,$inout2
1453 pxor $twres,@tweak[5]
1454 aesenc $rndkey0,$inout3
1455 aesenc $rndkey0,$inout4
1456 aesenc $rndkey0,$inout5
1457 $movkey 32($key),$rndkey0
1458
1459 pshufd \$0x13,$twtmp,$twres
1460 pxor $twtmp,$twtmp
1461 movdqa @tweak[5],@tweak[1]
1462 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1463 aesenc $rndkey1,$inout0
1464 pand $twmask,$twres # isolate carry and residue
1465 aesenc $rndkey1,$inout1
1466	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1467 aesenc $rndkey1,$inout2
1468 pxor $twres,@tweak[5]
1469 aesenc $rndkey1,$inout3
1470 aesenc $rndkey1,$inout4
1471 aesenc $rndkey1,$inout5
1472
1473 pshufd \$0x13,$twtmp,$twres
1474 pxor $twtmp,$twtmp
1475 movdqa @tweak[5],@tweak[2]
1476 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1477 aesenclast $rndkey0,$inout0
1478 pand $twmask,$twres # isolate carry and residue
1479 aesenclast $rndkey0,$inout1
1480	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1481 aesenclast $rndkey0,$inout2
1482 pxor $twres,@tweak[5]
1483 aesenclast $rndkey0,$inout3
1484 aesenclast $rndkey0,$inout4
1485 aesenclast $rndkey0,$inout5
1486
1487 pshufd \$0x13,$twtmp,$twres
1488 pxor $twtmp,$twtmp
1489 movdqa @tweak[5],@tweak[3]
1490 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1491 xorps `16*0`(%rsp),$inout0 # output^=tweak
1492 pand $twmask,$twres # isolate carry and residue
1493 xorps `16*1`(%rsp),$inout1
1494	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1495 pxor $twres,@tweak[5]
1496
1497 xorps `16*2`(%rsp),$inout2
1498 movups $inout0,`16*0`($out) # write output
1499 xorps `16*3`(%rsp),$inout3
1500 movups $inout1,`16*1`($out)
1501 xorps `16*4`(%rsp),$inout4
1502 movups $inout2,`16*2`($out)
1503 xorps `16*5`(%rsp),$inout5
1504 movups $inout3,`16*3`($out)
1505 mov $rnds_,$rounds # restore $rounds
1506 movups $inout4,`16*4`($out)
1507 movups $inout5,`16*5`($out)
1508 lea `16*6`($out),$out
1509 sub \$16*6,$len
1510 jnc .Lxts_enc_grandloop
1511
1512 lea 3($rounds,$rounds),$rounds # restore original value
1513 mov $key_,$key # restore $key
1514 mov $rounds,$rnds_ # backup $rounds
1515
1516.Lxts_enc_short:
1517 add \$16*6,$len
1518 jz .Lxts_enc_done
1519
1520 cmp \$0x20,$len
1521 jb .Lxts_enc_one
1522 je .Lxts_enc_two
1523
1524 cmp \$0x40,$len
1525 jb .Lxts_enc_three
1526 je .Lxts_enc_four
1527
1528 pshufd \$0x13,$twtmp,$twres
1529 movdqa @tweak[5],@tweak[4]
1530 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1531 movdqu ($inp),$inout0
1532 pand $twmask,$twres # isolate carry and residue
1533 movdqu 16*1($inp),$inout1
1534 pxor $twres,@tweak[5]
1535
1536 movdqu 16*2($inp),$inout2
1537 pxor @tweak[0],$inout0
1538 movdqu 16*3($inp),$inout3
1539 pxor @tweak[1],$inout1
1540 movdqu 16*4($inp),$inout4
1541 lea 16*5($inp),$inp
1542 pxor @tweak[2],$inout2
1543 pxor @tweak[3],$inout3
1544 pxor @tweak[4],$inout4
1545
1546 call _aesni_encrypt6
1547
1548 xorps @tweak[0],$inout0
1549 movdqa @tweak[5],@tweak[0]
1550 xorps @tweak[1],$inout1
1551 xorps @tweak[2],$inout2
1552 movdqu $inout0,($out)
1553 xorps @tweak[3],$inout3
1554 movdqu $inout1,16*1($out)
1555 xorps @tweak[4],$inout4
1556 movdqu $inout2,16*2($out)
1557 movdqu $inout3,16*3($out)
1558 movdqu $inout4,16*4($out)
1559 lea 16*5($out),$out
1560 jmp .Lxts_enc_done
1561
1562.align 16
1563.Lxts_enc_one:
1564 movups ($inp),$inout0
1565 lea 16*1($inp),$inp
1566 xorps @tweak[0],$inout0
1567___
1568 &aesni_generate1("enc",$key,$rounds);
1569$code.=<<___;
1570 xorps @tweak[0],$inout0
1571 movdqa @tweak[1],@tweak[0]
1572 movups $inout0,($out)
1573 lea 16*1($out),$out
1574 jmp .Lxts_enc_done
1575
1576.align 16
1577.Lxts_enc_two:
1578 movups ($inp),$inout0
1579 movups 16($inp),$inout1
1580 lea 32($inp),$inp
1581 xorps @tweak[0],$inout0
1582 xorps @tweak[1],$inout1
1583
1584 call _aesni_encrypt3
1585
1586 xorps @tweak[0],$inout0
1587 movdqa @tweak[2],@tweak[0]
1588 xorps @tweak[1],$inout1
1589 movups $inout0,($out)
1590 movups $inout1,16*1($out)
1591 lea 16*2($out),$out
1592 jmp .Lxts_enc_done
1593
1594.align 16
1595.Lxts_enc_three:
1596 movups ($inp),$inout0
1597 movups 16*1($inp),$inout1
1598 movups 16*2($inp),$inout2
1599 lea 16*3($inp),$inp
1600 xorps @tweak[0],$inout0
1601 xorps @tweak[1],$inout1
1602 xorps @tweak[2],$inout2
1603
1604 call _aesni_encrypt3
1605
1606 xorps @tweak[0],$inout0
1607 movdqa @tweak[3],@tweak[0]
1608 xorps @tweak[1],$inout1
1609 xorps @tweak[2],$inout2
1610 movups $inout0,($out)
1611 movups $inout1,16*1($out)
1612 movups $inout2,16*2($out)
1613 lea 16*3($out),$out
1614 jmp .Lxts_enc_done
1615
1616.align 16
1617.Lxts_enc_four:
1618 movups ($inp),$inout0
1619 movups 16*1($inp),$inout1
1620 movups 16*2($inp),$inout2
1621 xorps @tweak[0],$inout0
1622 movups 16*3($inp),$inout3
1623 lea 16*4($inp),$inp
1624 xorps @tweak[1],$inout1
1625 xorps @tweak[2],$inout2
1626 xorps @tweak[3],$inout3
1627
1628 call _aesni_encrypt4
1629
1630 xorps @tweak[0],$inout0
1631 movdqa @tweak[5],@tweak[0]
1632 xorps @tweak[1],$inout1
1633 xorps @tweak[2],$inout2
1634 movups $inout0,($out)
1635 xorps @tweak[3],$inout3
1636 movups $inout1,16*1($out)
1637 movups $inout2,16*2($out)
1638 movups $inout3,16*3($out)
1639 lea 16*4($out),$out
1640 jmp .Lxts_enc_done
1641
1642.align 16
1643.Lxts_enc_done:
1644 and \$15,$len_
1645 jz .Lxts_enc_ret
1646 mov $len_,$len
1647
1648.Lxts_enc_steal:
1649 movzb ($inp),%eax # borrow $rounds ...
1650 movzb -16($out),%ecx # ... and $key
1651 lea 1($inp),$inp
1652 mov %al,-16($out)
1653 mov %cl,0($out)
1654 lea 1($out),$out
1655 sub \$1,$len
1656 jnz .Lxts_enc_steal
1657
1658 sub $len_,$out # rewind $out
1659 mov $key_,$key # restore $key
1660 mov $rnds_,$rounds # restore $rounds
1661
1662 movups -16($out),$inout0
1663 xorps @tweak[0],$inout0
1664___
1665 &aesni_generate1("enc",$key,$rounds);
1666$code.=<<___;
1667 xorps @tweak[0],$inout0
1668 movups $inout0,-16($out)
1669
1670.Lxts_enc_ret:
1671___
1672$code.=<<___ if ($win64);
1673 movaps 0x60(%rsp),%xmm6
1674 movaps 0x70(%rsp),%xmm7
1675 movaps 0x80(%rsp),%xmm8
1676 movaps 0x90(%rsp),%xmm9
1677 movaps 0xa0(%rsp),%xmm10
1678 movaps 0xb0(%rsp),%xmm11
1679 movaps 0xc0(%rsp),%xmm12
1680 movaps 0xd0(%rsp),%xmm13
1681 movaps 0xe0(%rsp),%xmm14
1682 movaps 0xf0(%rsp),%xmm15
1683___
1684$code.=<<___;
1685 lea $frame_size(%rsp),%rsp
1686.Lxts_enc_epilogue:
1687 ret
1688.size aesni_xts_encrypt,.-aesni_xts_encrypt
1689___
1690
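The .Lxts_enc_steal loop above implements standard XTS ciphertext stealing for a trailing partial block: the leading bytes of the last full ciphertext block become the short final output, and that block, re-spliced with the leftover plaintext bytes, is encrypted once more under the next tweak. A hedged C sketch under the same assumptions as the reference code earlier:

#include <stddef.h>
#include <string.h>
#include <openssl/aes.h>

/* out points just past the last full output block; tail is 1..15 bytes.
 * t is the tweak for the final (stolen) block; xts_double() as above. */
static void xts_enc_steal_ref(const unsigned char *in, unsigned char *out,
                              size_t tail, const unsigned char t[16],
                              const AES_KEY *key1)
{
    unsigned char x[16];
    memcpy(x, out - 16, 16);           /* C_{m-1}, already written */
    for (size_t i = 0; i < tail; i++) {
        out[i] = x[i];                 /* steal its head as the short block */
        x[i]   = in[i];                /* splice in the leftover plaintext */
    }
    for (int i = 0; i < 16; i++) x[i] ^= t[i];
    AES_encrypt(x, x, key1);
    for (int i = 0; i < 16; i++) out[i - 16] = x[i] ^ t[i];
}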
1691$code.=<<___;
1692.globl aesni_xts_decrypt
1693.type aesni_xts_decrypt,\@function,6
1694.align 16
1695aesni_xts_decrypt:
1696 lea -$frame_size(%rsp),%rsp
1697___
1698$code.=<<___ if ($win64);
1699 movaps %xmm6,0x60(%rsp)
1700 movaps %xmm7,0x70(%rsp)
1701 movaps %xmm8,0x80(%rsp)
1702 movaps %xmm9,0x90(%rsp)
1703 movaps %xmm10,0xa0(%rsp)
1704 movaps %xmm11,0xb0(%rsp)
1705 movaps %xmm12,0xc0(%rsp)
1706 movaps %xmm13,0xd0(%rsp)
1707 movaps %xmm14,0xe0(%rsp)
1708 movaps %xmm15,0xf0(%rsp)
1709.Lxts_dec_body:
1710___
1711$code.=<<___;
1712 movups ($ivp),@tweak[5] # load clear-text tweak
1713 mov 240($key2),$rounds # key2->rounds
1714 mov 240($key),$rnds_ # key1->rounds
1715___
1716 # generate the tweak
1717 &aesni_generate1("enc",$key2,$rounds,@tweak[5]);
1718$code.=<<___;
1719 xor %eax,%eax # if ($len%16) len-=16;
1720 test \$15,$len
1721 setnz %al
1722 shl \$4,%rax
1723 sub %rax,$len
1724
1725 mov $key,$key_ # backup $key
1726 mov $rnds_,$rounds # backup $rounds
1727 mov $len,$len_ # backup $len
1728 and \$-16,$len
1729
1730 movdqa .Lxts_magic(%rip),$twmask
1731 pxor $twtmp,$twtmp
1732 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1733___
1734 for ($i=0;$i<4;$i++) {
1735 $code.=<<___;
1736 pshufd \$0x13,$twtmp,$twres
1737 pxor $twtmp,$twtmp
1738 movdqa @tweak[5],@tweak[$i]
1739 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1740 pand $twmask,$twres # isolate carry and residue
1741	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1742 pxor $twres,@tweak[5]
1743___
1744 }
1745$code.=<<___;
1746 sub \$16*6,$len
1747 jc .Lxts_dec_short
1748
1749 shr \$1,$rounds
1750 sub \$1,$rounds
1751 mov $rounds,$rnds_
1752 jmp .Lxts_dec_grandloop
1753
1754.align 16
1755.Lxts_dec_grandloop:
1756 pshufd \$0x13,$twtmp,$twres
1757 movdqa @tweak[5],@tweak[4]
1758 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1759 movdqu `16*0`($inp),$inout0 # load input
1760 pand $twmask,$twres # isolate carry and residue
1761 movdqu `16*1`($inp),$inout1
1762 pxor $twres,@tweak[5]
1763
1764 movdqu `16*2`($inp),$inout2
1765 pxor @tweak[0],$inout0 # input^=tweak
1766 movdqu `16*3`($inp),$inout3
1767 pxor @tweak[1],$inout1
1768 movdqu `16*4`($inp),$inout4
1769 pxor @tweak[2],$inout2
1770 movdqu `16*5`($inp),$inout5
1771 lea `16*6`($inp),$inp
1772 pxor @tweak[3],$inout3
1773 $movkey ($key_),$rndkey0
1774 pxor @tweak[4],$inout4
1775 pxor @tweak[5],$inout5
1776
1777 # inline _aesni_decrypt6 and interleave first and last rounds
1778 # with own code...
1779 $movkey 16($key_),$rndkey1
1780 pxor $rndkey0,$inout0
1781 pxor $rndkey0,$inout1
1782 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
1783 aesdec $rndkey1,$inout0
1784 lea 32($key_),$key
1785 pxor $rndkey0,$inout2
1786 movdqa @tweak[1],`16*1`(%rsp)
1787 aesdec $rndkey1,$inout1
1788 pxor $rndkey0,$inout3
1789 movdqa @tweak[2],`16*2`(%rsp)
1790 aesdec $rndkey1,$inout2
1791 pxor $rndkey0,$inout4
1792 movdqa @tweak[3],`16*3`(%rsp)
1793 aesdec $rndkey1,$inout3
1794 pxor $rndkey0,$inout5
1795 $movkey ($key),$rndkey0
1796 dec $rounds
1797 movdqa @tweak[4],`16*4`(%rsp)
1798 aesdec $rndkey1,$inout4
1799 movdqa @tweak[5],`16*5`(%rsp)
1800 aesdec $rndkey1,$inout5
1801 pxor $twtmp,$twtmp
1802 pcmpgtd @tweak[5],$twtmp
1803 jmp .Lxts_dec_loop6_enter
1804
1805.align 16
1806.Lxts_dec_loop6:
1807 aesdec $rndkey1,$inout0
1808 aesdec $rndkey1,$inout1
1809 dec $rounds
1810 aesdec $rndkey1,$inout2
1811 aesdec $rndkey1,$inout3
1812 aesdec $rndkey1,$inout4
1813 aesdec $rndkey1,$inout5
1814.Lxts_dec_loop6_enter:
1815 $movkey 16($key),$rndkey1
1816 aesdec $rndkey0,$inout0
1817 aesdec $rndkey0,$inout1
1818 lea 32($key),$key
1819 aesdec $rndkey0,$inout2
1820 aesdec $rndkey0,$inout3
1821 aesdec $rndkey0,$inout4
1822 aesdec $rndkey0,$inout5
1823 $movkey ($key),$rndkey0
1824 jnz .Lxts_dec_loop6
1825
1826 pshufd \$0x13,$twtmp,$twres
1827 pxor $twtmp,$twtmp
1828 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1829 aesdec $rndkey1,$inout0
1830 pand $twmask,$twres # isolate carry and residue
1831 aesdec $rndkey1,$inout1
1832 pcmpgtd @tweak[5],$twtmp # broadcast upper bits
1833 aesdec $rndkey1,$inout2
1834 pxor $twres,@tweak[5]
1835 aesdec $rndkey1,$inout3
1836 aesdec $rndkey1,$inout4
1837 aesdec $rndkey1,$inout5
1838 $movkey 16($key),$rndkey1
1839
1840 pshufd \$0x13,$twtmp,$twres
1841 pxor $twtmp,$twtmp
1842 movdqa @tweak[5],@tweak[0]
1843 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1844 aesdec $rndkey0,$inout0
1845 pand $twmask,$twres # isolate carry and residue
1846 aesdec $rndkey0,$inout1
1847	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1848 aesdec $rndkey0,$inout2
1849 pxor $twres,@tweak[5]
1850 aesdec $rndkey0,$inout3
1851 aesdec $rndkey0,$inout4
1852 aesdec $rndkey0,$inout5
1853 $movkey 32($key),$rndkey0
1854
1855 pshufd \$0x13,$twtmp,$twres
1856 pxor $twtmp,$twtmp
1857 movdqa @tweak[5],@tweak[1]
1858 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1859 aesdec $rndkey1,$inout0
1860 pand $twmask,$twres # isolate carry and residue
1861 aesdec $rndkey1,$inout1
1862	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1863 aesdec $rndkey1,$inout2
1864 pxor $twres,@tweak[5]
1865 aesdec $rndkey1,$inout3
1866 aesdec $rndkey1,$inout4
1867 aesdec $rndkey1,$inout5
1868
1869 pshufd \$0x13,$twtmp,$twres
1870 pxor $twtmp,$twtmp
1871 movdqa @tweak[5],@tweak[2]
1872 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1873 aesdeclast $rndkey0,$inout0
1874 pand $twmask,$twres # isolate carry and residue
1875 aesdeclast $rndkey0,$inout1
1876	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1877 aesdeclast $rndkey0,$inout2
1878 pxor $twres,@tweak[5]
1879 aesdeclast $rndkey0,$inout3
1880 aesdeclast $rndkey0,$inout4
1881 aesdeclast $rndkey0,$inout5
1882
1883 pshufd \$0x13,$twtmp,$twres
1884 pxor $twtmp,$twtmp
1885 movdqa @tweak[5],@tweak[3]
1886 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1887 xorps `16*0`(%rsp),$inout0 # output^=tweak
1888 pand $twmask,$twres # isolate carry and residue
1889 xorps `16*1`(%rsp),$inout1
1890	pcmpgtd	@tweak[5],$twtmp	# broadcast upper bits
1891 pxor $twres,@tweak[5]
1892
1893 xorps `16*2`(%rsp),$inout2
1894 movups $inout0,`16*0`($out) # write output
1895 xorps `16*3`(%rsp),$inout3
1896 movups $inout1,`16*1`($out)
1897 xorps `16*4`(%rsp),$inout4
1898 movups $inout2,`16*2`($out)
1899 xorps `16*5`(%rsp),$inout5
1900 movups $inout3,`16*3`($out)
1901 mov $rnds_,$rounds # restore $rounds
1902 movups $inout4,`16*4`($out)
1903 movups $inout5,`16*5`($out)
1904 lea `16*6`($out),$out
1905 sub \$16*6,$len
1906 jnc .Lxts_dec_grandloop
1907
1908 lea 3($rounds,$rounds),$rounds # restore original value
1909 mov $key_,$key # restore $key
1910 mov $rounds,$rnds_ # backup $rounds
1911
1912.Lxts_dec_short:
1913 add \$16*6,$len
1914 jz .Lxts_dec_done
1915
1916 cmp \$0x20,$len
1917 jb .Lxts_dec_one
1918 je .Lxts_dec_two
1919
1920 cmp \$0x40,$len
1921 jb .Lxts_dec_three
1922 je .Lxts_dec_four
1923
1924 pshufd \$0x13,$twtmp,$twres
1925 movdqa @tweak[5],@tweak[4]
1926 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1927 movdqu ($inp),$inout0
1928 pand $twmask,$twres # isolate carry and residue
1929 movdqu 16*1($inp),$inout1
1930 pxor $twres,@tweak[5]
1931
1932 movdqu 16*2($inp),$inout2
1933 pxor @tweak[0],$inout0
1934 movdqu 16*3($inp),$inout3
1935 pxor @tweak[1],$inout1
1936 movdqu 16*4($inp),$inout4
1937 lea 16*5($inp),$inp
1938 pxor @tweak[2],$inout2
1939 pxor @tweak[3],$inout3
1940 pxor @tweak[4],$inout4
1941
1942 call _aesni_decrypt6
1943
1944 xorps @tweak[0],$inout0
1945 xorps @tweak[1],$inout1
1946 xorps @tweak[2],$inout2
1947 movdqu $inout0,($out)
1948 xorps @tweak[3],$inout3
1949 movdqu $inout1,16*1($out)
1950 xorps @tweak[4],$inout4
1951 movdqu $inout2,16*2($out)
1952 pxor $twtmp,$twtmp
1953 movdqu $inout3,16*3($out)
1954 pcmpgtd @tweak[5],$twtmp
1955 movdqu $inout4,16*4($out)
1956 lea 16*5($out),$out
1957 pshufd \$0x13,$twtmp,@tweak[1] # $twres
1958 and \$15,$len_
1959 jz .Lxts_dec_ret
1960
1961 movdqa @tweak[5],@tweak[0]
1962 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
1963 pand $twmask,@tweak[1] # isolate carry and residue
1964 pxor @tweak[5],@tweak[1]
1965 jmp .Lxts_dec_done2
1966
1967.align 16
1968.Lxts_dec_one:
1969 movups ($inp),$inout0
1970 lea 16*1($inp),$inp
1971 xorps @tweak[0],$inout0
1972___
1973 &aesni_generate1("dec",$key,$rounds);
1974$code.=<<___;
1975 xorps @tweak[0],$inout0
1976 movdqa @tweak[1],@tweak[0]
1977 movups $inout0,($out)
1978 movdqa @tweak[2],@tweak[1]
1979 lea 16*1($out),$out
1980 jmp .Lxts_dec_done
1981
1982.align 16
1983.Lxts_dec_two:
1984 movups ($inp),$inout0
1985 movups 16($inp),$inout1
1986 lea 32($inp),$inp
1987 xorps @tweak[0],$inout0
1988 xorps @tweak[1],$inout1
1989
1990 call _aesni_decrypt3
1991
1992 xorps @tweak[0],$inout0
1993 movdqa @tweak[2],@tweak[0]
1994 xorps @tweak[1],$inout1
1995 movdqa @tweak[3],@tweak[1]
1996 movups $inout0,($out)
1997 movups $inout1,16*1($out)
1998 lea 16*2($out),$out
1999 jmp .Lxts_dec_done
2000
2001.align 16
2002.Lxts_dec_three:
2003 movups ($inp),$inout0
2004 movups 16*1($inp),$inout1
2005 movups 16*2($inp),$inout2
2006 lea 16*3($inp),$inp
2007 xorps @tweak[0],$inout0
2008 xorps @tweak[1],$inout1
2009 xorps @tweak[2],$inout2
2010
2011 call _aesni_decrypt3
2012
2013 xorps @tweak[0],$inout0
2014 movdqa @tweak[3],@tweak[0]
2015 xorps @tweak[1],$inout1
2016 movdqa @tweak[5],@tweak[1]
2017 xorps @tweak[2],$inout2
2018 movups $inout0,($out)
2019 movups $inout1,16*1($out)
2020 movups $inout2,16*2($out)
2021 lea 16*3($out),$out
2022 jmp .Lxts_dec_done
2023
2024.align 16
2025.Lxts_dec_four:
2026 pshufd \$0x13,$twtmp,$twres
2027 movdqa @tweak[5],@tweak[4]
2028 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2029 movups ($inp),$inout0
2030 pand $twmask,$twres # isolate carry and residue
2031 movups 16*1($inp),$inout1
2032 pxor $twres,@tweak[5]
2033
2034 movups 16*2($inp),$inout2
2035 xorps @tweak[0],$inout0
2036 movups 16*3($inp),$inout3
2037 lea 16*4($inp),$inp
2038 xorps @tweak[1],$inout1
2039 xorps @tweak[2],$inout2
2040 xorps @tweak[3],$inout3
2041
2042 call _aesni_decrypt4
2043
2044 xorps @tweak[0],$inout0
2045 movdqa @tweak[4],@tweak[0]
2046 xorps @tweak[1],$inout1
2047 movdqa @tweak[5],@tweak[1]
2048 xorps @tweak[2],$inout2
2049 movups $inout0,($out)
2050 xorps @tweak[3],$inout3
2051 movups $inout1,16*1($out)
2052 movups $inout2,16*2($out)
2053 movups $inout3,16*3($out)
2054 lea 16*4($out),$out
2055 jmp .Lxts_dec_done
2056
2057.align 16
2058.Lxts_dec_done:
2059 and \$15,$len_
2060 jz .Lxts_dec_ret
2061.Lxts_dec_done2:
2062 mov $len_,$len
2063 mov $key_,$key # restore $key
2064 mov $rnds_,$rounds # restore $rounds
2065
2066 movups ($inp),$inout0
2067 xorps @tweak[1],$inout0
2068___
2069 &aesni_generate1("dec",$key,$rounds);
2070$code.=<<___;
2071 xorps @tweak[1],$inout0
2072 movups $inout0,($out)
2073
2074.Lxts_dec_steal:
2075 movzb 16($inp),%eax # borrow $rounds ...
2076 movzb ($out),%ecx # ... and $key
2077 lea 1($inp),$inp
2078 mov %al,($out)
2079 mov %cl,16($out)
2080 lea 1($out),$out
2081 sub \$1,$len
2082 jnz .Lxts_dec_steal
2083
2084 sub $len_,$out # rewind $out
2085 mov $key_,$key # restore $key
2086 mov $rnds_,$rounds # restore $rounds
2087
2088 movups ($out),$inout0
2089 xorps @tweak[0],$inout0
2090___
2091 &aesni_generate1("dec",$key,$rounds);
2092$code.=<<___;
2093 xorps @tweak[0],$inout0
2094 movups $inout0,($out)
2095
2096.Lxts_dec_ret:
2097___
2098$code.=<<___ if ($win64);
2099 movaps 0x60(%rsp),%xmm6
2100 movaps 0x70(%rsp),%xmm7
2101 movaps 0x80(%rsp),%xmm8
2102 movaps 0x90(%rsp),%xmm9
2103 movaps 0xa0(%rsp),%xmm10
2104 movaps 0xb0(%rsp),%xmm11
2105 movaps 0xc0(%rsp),%xmm12
2106 movaps 0xd0(%rsp),%xmm13
2107 movaps 0xe0(%rsp),%xmm14
2108 movaps 0xf0(%rsp),%xmm15
2109___
2110$code.=<<___;
2111 lea $frame_size(%rsp),%rsp
2112.Lxts_dec_epilogue:
2113 ret
2114.size aesni_xts_decrypt,.-aesni_xts_decrypt
2115___
2116} }}
2117
2118########################################################################
362# void $PREFIX_cbc_encrypt (const void *inp, void *out, 2119# void $PREFIX_cbc_encrypt (const void *inp, void *out,
363# size_t length, const AES_KEY *key, 2120# size_t length, const AES_KEY *key,
364# unsigned char *ivp,const int enc); 2121# unsigned char *ivp,const int enc);
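The assembly below keeps the textbook chaining but pipelines up to eight blocks in the decrypt path, where the blocks are independent of one another. As a reference point, a hedged one-block-at-a-time equivalent built on OpenSSL's AES_encrypt/AES_decrypt:

#include <stddef.h>
#include <string.h>
#include <openssl/aes.h>

static void cbc_ref(const unsigned char *in, unsigned char *out,
                    size_t len, const AES_KEY *key,
                    unsigned char ivec[16], int enc)
{
    unsigned char iv[16], c[16];
    memcpy(iv, ivec, 16);
    for (; len >= 16; len -= 16, in += 16, out += 16) {
        if (enc) {
            for (int i = 0; i < 16; i++) c[i] = in[i] ^ iv[i];
            AES_encrypt(c, out, key);        /* C_i = E_K(P_i ^ C_{i-1}) */
            memcpy(iv, out, 16);
        } else {
            memcpy(c, in, 16);               /* save C_i (in may equal out) */
            AES_decrypt(in, out, key);       /* P_i = D_K(C_i) ^ C_{i-1} */
            for (int i = 0; i < 16; i++) out[i] ^= iv[i];
            memcpy(iv, c, 16);
        }
    }
    memcpy(ivec, iv, 16);                    /* hand back the chaining value */
}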
365$reserved = $win64?0x40:-0x18; # used in decrypt 2122{
2123my $reserved = $win64?0x40:-0x18; # used in decrypt
366$code.=<<___; 2124$code.=<<___;
367.globl ${PREFIX}_cbc_encrypt 2125.globl ${PREFIX}_cbc_encrypt
368.type ${PREFIX}_cbc_encrypt,\@function,6 2126.type ${PREFIX}_cbc_encrypt,\@function,6
@@ -371,30 +2129,30 @@ ${PREFIX}_cbc_encrypt:
371 test $len,$len # check length 2129 test $len,$len # check length
372 jz .Lcbc_ret 2130 jz .Lcbc_ret
373 2131
374 mov 240($key),$rnds_ # pull $rounds 2132 mov 240($key),$rnds_ # key->rounds
375 mov $key,$key_ # backup $key 2133 mov $key,$key_ # backup $key
376 test %r9d,%r9d # 6th argument 2134 test %r9d,%r9d # 6th argument
377 jz .Lcbc_decrypt 2135 jz .Lcbc_decrypt
378#--------------------------- CBC ENCRYPT ------------------------------# 2136#--------------------------- CBC ENCRYPT ------------------------------#
379 movups ($ivp),$inout0 # load iv as initial state 2137 movups ($ivp),$inout0 # load iv as initial state
380 cmp \$16,$len
381 mov $rnds_,$rounds 2138 mov $rnds_,$rounds
2139 cmp \$16,$len
382 jb .Lcbc_enc_tail 2140 jb .Lcbc_enc_tail
383 sub \$16,$len 2141 sub \$16,$len
384 jmp .Lcbc_enc_loop 2142 jmp .Lcbc_enc_loop
385.align 16 2143.align 16
386.Lcbc_enc_loop: 2144.Lcbc_enc_loop:
387 movups ($inp),$inout1 # load input 2145 movups ($inp),$inout1 # load input
388 lea 16($inp),$inp 2146 lea 16($inp),$inp
389 pxor $inout1,$inout0 2147 #xorps $inout1,$inout0
390___ 2148___
391 &aesni_generate1("enc",$key,$rounds); 2149 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
392$code.=<<___; 2150$code.=<<___;
393 sub \$16,$len
394 lea 16($out),$out
395 mov $rnds_,$rounds # restore $rounds 2151 mov $rnds_,$rounds # restore $rounds
396 mov $key_,$key # restore $key 2152 mov $key_,$key # restore $key
397 movups $inout0,-16($out) # store output 2153 movups $inout0,0($out) # store output
2154 lea 16($out),$out
2155 sub \$16,$len
398 jnc .Lcbc_enc_loop 2156 jnc .Lcbc_enc_loop
399 add \$16,$len 2157 add \$16,$len
400 jnz .Lcbc_enc_tail 2158 jnz .Lcbc_enc_tail
@@ -429,92 +2187,238 @@ $code.=<<___ if ($win64);
429___ 2187___
430$code.=<<___; 2188$code.=<<___;
431 movups ($ivp),$iv 2189 movups ($ivp),$iv
432 sub \$0x40,$len
433 mov $rnds_,$rounds 2190 mov $rnds_,$rounds
2191 cmp \$0x70,$len
434 jbe .Lcbc_dec_tail 2192 jbe .Lcbc_dec_tail
435 jmp .Lcbc_dec_loop3 2193 shr \$1,$rnds_
436.align 16 2194 sub \$0x70,$len
437.Lcbc_dec_loop3: 2195 mov $rnds_,$rounds
438 movups ($inp),$inout0 2196 movaps $iv,$reserved(%rsp)
2197 jmp .Lcbc_dec_loop8_enter
2198.align 16
2199.Lcbc_dec_loop8:
2200 movaps $rndkey0,$reserved(%rsp) # save IV
2201 movups $inout7,($out)
2202 lea 0x10($out),$out
2203.Lcbc_dec_loop8_enter:
2204 $movkey ($key),$rndkey0
2205 movups ($inp),$inout0 # load input
439 movups 0x10($inp),$inout1 2206 movups 0x10($inp),$inout1
440 movups 0x20($inp),$inout2 2207 $movkey 16($key),$rndkey1
441 movaps $inout0,$in0
442 movaps $inout1,$in1
443 movaps $inout2,$in2
444 call _aesni_decrypt3
445 sub \$0x30,$len
446 lea 0x30($inp),$inp
447 lea 0x30($out),$out
448 pxor $iv,$inout0
449 pxor $in0,$inout1
450 movaps $in2,$iv
451 pxor $in1,$inout2
452 movups $inout0,-0x30($out)
453 mov $rnds_,$rounds # restore $rounds
454 movups $inout1,-0x20($out)
455 mov $key_,$key # restore $key
456 movups $inout2,-0x10($out)
457 ja .Lcbc_dec_loop3
458 2208
459.Lcbc_dec_tail: 2209 lea 32($key),$key
460 add \$0x40,$len 2210 movdqu 0x20($inp),$inout2
461 movups $iv,($ivp) 2211 xorps $rndkey0,$inout0
462 jz .Lcbc_dec_ret 2212 movdqu 0x30($inp),$inout3
2213 xorps $rndkey0,$inout1
2214 movdqu 0x40($inp),$inout4
2215 aesdec $rndkey1,$inout0
2216 pxor $rndkey0,$inout2
2217 movdqu 0x50($inp),$inout5
2218 aesdec $rndkey1,$inout1
2219 pxor $rndkey0,$inout3
2220 movdqu 0x60($inp),$inout6
2221 aesdec $rndkey1,$inout2
2222 pxor $rndkey0,$inout4
2223 movdqu 0x70($inp),$inout7
2224 aesdec $rndkey1,$inout3
2225 pxor $rndkey0,$inout5
2226 dec $rounds
2227 aesdec $rndkey1,$inout4
2228 pxor $rndkey0,$inout6
2229 aesdec $rndkey1,$inout5
2230 pxor $rndkey0,$inout7
2231 $movkey ($key),$rndkey0
2232 aesdec $rndkey1,$inout6
2233 aesdec $rndkey1,$inout7
2234 $movkey 16($key),$rndkey1
463 2235
2236 call .Ldec_loop8_enter
2237
2238 movups ($inp),$rndkey1 # re-load input
2239 movups 0x10($inp),$rndkey0
2240 xorps $reserved(%rsp),$inout0 # ^= IV
2241 xorps $rndkey1,$inout1
2242 movups 0x20($inp),$rndkey1
2243 xorps $rndkey0,$inout2
2244 movups 0x30($inp),$rndkey0
2245 xorps $rndkey1,$inout3
2246 movups 0x40($inp),$rndkey1
2247 xorps $rndkey0,$inout4
2248 movups 0x50($inp),$rndkey0
2249 xorps $rndkey1,$inout5
2250 movups 0x60($inp),$rndkey1
2251 xorps $rndkey0,$inout6
2252 movups 0x70($inp),$rndkey0 # IV
2253 xorps $rndkey1,$inout7
2254 movups $inout0,($out)
2255 movups $inout1,0x10($out)
2256 movups $inout2,0x20($out)
2257 movups $inout3,0x30($out)
2258 mov $rnds_,$rounds # restore $rounds
2259 movups $inout4,0x40($out)
2260 mov $key_,$key # restore $key
2261 movups $inout5,0x50($out)
2262 lea 0x80($inp),$inp
2263 movups $inout6,0x60($out)
2264 lea 0x70($out),$out
2265 sub \$0x80,$len
2266 ja .Lcbc_dec_loop8
2267
2268 movaps $inout7,$inout0
2269 movaps $rndkey0,$iv
2270 add \$0x70,$len
2271 jle .Lcbc_dec_tail_collected
2272 movups $inout0,($out)
2273 lea 1($rnds_,$rnds_),$rounds
2274 lea 0x10($out),$out
2275.Lcbc_dec_tail:
464 movups ($inp),$inout0 2276 movups ($inp),$inout0
465 cmp \$0x10,$len
466 movaps $inout0,$in0 2277 movaps $inout0,$in0
2278 cmp \$0x10,$len
467 jbe .Lcbc_dec_one 2279 jbe .Lcbc_dec_one
2280
468 movups 0x10($inp),$inout1 2281 movups 0x10($inp),$inout1
469 cmp \$0x20,$len
470 movaps $inout1,$in1 2282 movaps $inout1,$in1
2283 cmp \$0x20,$len
471 jbe .Lcbc_dec_two 2284 jbe .Lcbc_dec_two
2285
472 movups 0x20($inp),$inout2 2286 movups 0x20($inp),$inout2
473 cmp \$0x30,$len
474 movaps $inout2,$in2 2287 movaps $inout2,$in2
2288 cmp \$0x30,$len
475 jbe .Lcbc_dec_three 2289 jbe .Lcbc_dec_three
2290
476 movups 0x30($inp),$inout3 2291 movups 0x30($inp),$inout3
477 call _aesni_decrypt4 2292 cmp \$0x40,$len
478 pxor $iv,$inout0 2293 jbe .Lcbc_dec_four
479 movups 0x30($inp),$iv 2294
480 pxor $in0,$inout1 2295 movups 0x40($inp),$inout4
2296 cmp \$0x50,$len
2297 jbe .Lcbc_dec_five
2298
2299 movups 0x50($inp),$inout5
2300 cmp \$0x60,$len
2301 jbe .Lcbc_dec_six
2302
2303 movups 0x60($inp),$inout6
2304 movaps $iv,$reserved(%rsp) # save IV
2305 call _aesni_decrypt8
2306 movups ($inp),$rndkey1
2307 movups 0x10($inp),$rndkey0
2308 xorps $reserved(%rsp),$inout0 # ^= IV
2309 xorps $rndkey1,$inout1
2310 movups 0x20($inp),$rndkey1
2311 xorps $rndkey0,$inout2
2312 movups 0x30($inp),$rndkey0
2313 xorps $rndkey1,$inout3
2314 movups 0x40($inp),$rndkey1
2315 xorps $rndkey0,$inout4
2316 movups 0x50($inp),$rndkey0
2317 xorps $rndkey1,$inout5
2318 movups 0x60($inp),$iv # IV
2319 xorps $rndkey0,$inout6
481 movups $inout0,($out) 2320 movups $inout0,($out)
482 pxor $in1,$inout2
483 movups $inout1,0x10($out) 2321 movups $inout1,0x10($out)
484 pxor $in2,$inout3
485 movups $inout2,0x20($out) 2322 movups $inout2,0x20($out)
486 movaps $inout3,$inout0 2323 movups $inout3,0x30($out)
487 lea 0x30($out),$out 2324 movups $inout4,0x40($out)
2325 movups $inout5,0x50($out)
2326 lea 0x60($out),$out
2327 movaps $inout6,$inout0
2328 sub \$0x70,$len
488 jmp .Lcbc_dec_tail_collected 2329 jmp .Lcbc_dec_tail_collected
489.align 16 2330.align 16
490.Lcbc_dec_one: 2331.Lcbc_dec_one:
491___ 2332___
492 &aesni_generate1("dec",$key,$rounds); 2333 &aesni_generate1("dec",$key,$rounds);
493$code.=<<___; 2334$code.=<<___;
494 pxor $iv,$inout0 2335 xorps $iv,$inout0
495 movaps $in0,$iv 2336 movaps $in0,$iv
2337 sub \$0x10,$len
496 jmp .Lcbc_dec_tail_collected 2338 jmp .Lcbc_dec_tail_collected
497.align 16 2339.align 16
498.Lcbc_dec_two: 2340.Lcbc_dec_two:
2341 xorps $inout2,$inout2
499 call _aesni_decrypt3 2342 call _aesni_decrypt3
500 pxor $iv,$inout0 2343 xorps $iv,$inout0
501 pxor $in0,$inout1 2344 xorps $in0,$inout1
502 movups $inout0,($out) 2345 movups $inout0,($out)
503 movaps $in1,$iv 2346 movaps $in1,$iv
504 movaps $inout1,$inout0 2347 movaps $inout1,$inout0
505 lea 0x10($out),$out 2348 lea 0x10($out),$out
2349 sub \$0x20,$len
506 jmp .Lcbc_dec_tail_collected 2350 jmp .Lcbc_dec_tail_collected
507.align 16 2351.align 16
508.Lcbc_dec_three: 2352.Lcbc_dec_three:
509 call _aesni_decrypt3 2353 call _aesni_decrypt3
510 pxor $iv,$inout0 2354 xorps $iv,$inout0
511 pxor $in0,$inout1 2355 xorps $in0,$inout1
512 movups $inout0,($out) 2356 movups $inout0,($out)
513 pxor $in1,$inout2 2357 xorps $in1,$inout2
514 movups $inout1,0x10($out) 2358 movups $inout1,0x10($out)
515 movaps $in2,$iv 2359 movaps $in2,$iv
516 movaps $inout2,$inout0 2360 movaps $inout2,$inout0
517 lea 0x20($out),$out 2361 lea 0x20($out),$out
2362 sub \$0x30,$len
2363 jmp .Lcbc_dec_tail_collected
2364.align 16
2365.Lcbc_dec_four:
2366 call _aesni_decrypt4
2367 xorps $iv,$inout0
2368 movups 0x30($inp),$iv
2369 xorps $in0,$inout1
2370 movups $inout0,($out)
2371 xorps $in1,$inout2
2372 movups $inout1,0x10($out)
2373 xorps $in2,$inout3
2374 movups $inout2,0x20($out)
2375 movaps $inout3,$inout0
2376 lea 0x30($out),$out
2377 sub \$0x40,$len
2378 jmp .Lcbc_dec_tail_collected
2379.align 16
2380.Lcbc_dec_five:
2381 xorps $inout5,$inout5
2382 call _aesni_decrypt6
2383 movups 0x10($inp),$rndkey1
2384 movups 0x20($inp),$rndkey0
2385 xorps $iv,$inout0
2386 xorps $in0,$inout1
2387 xorps $rndkey1,$inout2
2388 movups 0x30($inp),$rndkey1
2389 xorps $rndkey0,$inout3
2390 movups 0x40($inp),$iv
2391 xorps $rndkey1,$inout4
2392 movups $inout0,($out)
2393 movups $inout1,0x10($out)
2394 movups $inout2,0x20($out)
2395 movups $inout3,0x30($out)
2396 lea 0x40($out),$out
2397 movaps $inout4,$inout0
2398 sub \$0x50,$len
2399 jmp .Lcbc_dec_tail_collected
2400.align 16
2401.Lcbc_dec_six:
2402 call _aesni_decrypt6
2403 movups 0x10($inp),$rndkey1
2404 movups 0x20($inp),$rndkey0
2405 xorps $iv,$inout0
2406 xorps $in0,$inout1
2407 xorps $rndkey1,$inout2
2408 movups 0x30($inp),$rndkey1
2409 xorps $rndkey0,$inout3
2410 movups 0x40($inp),$rndkey0
2411 xorps $rndkey1,$inout4
2412 movups 0x50($inp),$iv
2413 xorps $rndkey0,$inout5
2414 movups $inout0,($out)
2415 movups $inout1,0x10($out)
2416 movups $inout2,0x20($out)
2417 movups $inout3,0x30($out)
2418 movups $inout4,0x40($out)
2419 lea 0x50($out),$out
2420 movaps $inout5,$inout0
2421 sub \$0x60,$len
518 jmp .Lcbc_dec_tail_collected 2422 jmp .Lcbc_dec_tail_collected
519.align 16 2423.align 16
520.Lcbc_dec_tail_collected: 2424.Lcbc_dec_tail_collected:
@@ -523,10 +2427,12 @@ $code.=<<___;
523 jnz .Lcbc_dec_tail_partial 2427 jnz .Lcbc_dec_tail_partial
524 movups $inout0,($out) 2428 movups $inout0,($out)
525 jmp .Lcbc_dec_ret 2429 jmp .Lcbc_dec_ret
2430.align 16
526.Lcbc_dec_tail_partial: 2431.Lcbc_dec_tail_partial:
527 movaps $inout0,$reserved(%rsp) 2432 movaps $inout0,$reserved(%rsp)
2433 mov \$16,%rcx
528 mov $out,%rdi 2434 mov $out,%rdi
529 mov $len,%rcx 2435 sub $len,%rcx
530 lea $reserved(%rsp),%rsi 2436 lea $reserved(%rsp),%rsi
531 .long 0x9066A4F3 # rep movsb 2437 .long 0x9066A4F3 # rep movsb
532 2438
@@ -544,7 +2450,7 @@ $code.=<<___;
544 ret 2450 ret
545.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt 2451.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
546___ 2452___
547 2453}
548# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey, 2454# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
549# int bits, AES_KEY *key) 2455# int bits, AES_KEY *key)
550{ my ($inp,$bits,$key) = @_4args; 2456{ my ($inp,$bits,$key) = @_4args;
@@ -556,7 +2462,7 @@ $code.=<<___;
556.align 16 2462.align 16
557${PREFIX}_set_decrypt_key: 2463${PREFIX}_set_decrypt_key:
558 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 2464 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
559 call _aesni_set_encrypt_key 2465 call __aesni_set_encrypt_key
560 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key 2466 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
561 test %eax,%eax 2467 test %eax,%eax
562 jnz .Ldec_key_ret 2468 jnz .Ldec_key_ret
@@ -576,9 +2482,9 @@ ${PREFIX}_set_decrypt_key:
576 aesimc %xmm1,%xmm1 2482 aesimc %xmm1,%xmm1
577 lea 16($key),$key 2483 lea 16($key),$key
578 lea -16($inp),$inp 2484 lea -16($inp),$inp
579 cmp $key,$inp
580 $movkey %xmm0,16($inp) 2485 $movkey %xmm0,16($inp)
581 $movkey %xmm1,-16($key) 2486 $movkey %xmm1,-16($key)
2487 cmp $key,$inp
582 ja .Ldec_key_inverse 2488 ja .Ldec_key_inverse
583 2489
584 $movkey ($key),%xmm0 # inverse middle 2490 $movkey ($key),%xmm0 # inverse middle
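What the .Ldec_key_inverse loop computes, expressed with AES-NI intrinsics: the decrypt schedule for the equivalent inverse cipher is the encrypt schedule in reverse order, with aesimc (InvMixColumns) applied to every round key except the two outermost. A hedged sketch, not this file's actual interface:

#include <wmmintrin.h>                 /* AES-NI intrinsics */

/* rk[0..rounds] holds the encrypt schedule; invert it in place. */
static void invert_key_schedule(__m128i rk[], int rounds)
{
    __m128i t = rk[0];                 /* outermost keys swap untouched */
    rk[0] = rk[rounds];
    rk[rounds] = t;
    for (int i = 1, j = rounds - 1; i < j; i++, j--) {
        __m128i a = _mm_aesimc_si128(rk[i]);
        __m128i b = _mm_aesimc_si128(rk[j]);
        rk[i] = b;                     /* swap and inverse, pairwise inward */
        rk[j] = a;
    }
    rk[rounds / 2] = _mm_aesimc_si128(rk[rounds / 2]);   /* inverse middle */
}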
@@ -605,16 +2511,16 @@ $code.=<<___;
605.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent 2511.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
606.align 16 2512.align 16
607${PREFIX}_set_encrypt_key: 2513${PREFIX}_set_encrypt_key:
608_aesni_set_encrypt_key: 2514__aesni_set_encrypt_key:
609 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 2515 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
610 test $inp,$inp
611 mov \$-1,%rax 2516 mov \$-1,%rax
2517 test $inp,$inp
612 jz .Lenc_key_ret 2518 jz .Lenc_key_ret
613 test $key,$key 2519 test $key,$key
614 jz .Lenc_key_ret 2520 jz .Lenc_key_ret
615 2521
616 movups ($inp),%xmm0 # pull first 128 bits of *userKey 2522 movups ($inp),%xmm0 # pull first 128 bits of *userKey
617 pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0 2523 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
618 lea 16($key),%rax 2524 lea 16($key),%rax
619 cmp \$256,$bits 2525 cmp \$256,$bits
620 je .L14rounds 2526 je .L14rounds
@@ -729,11 +2635,11 @@ _aesni_set_encrypt_key:
729 lea 16(%rax),%rax 2635 lea 16(%rax),%rax
730.Lkey_expansion_128_cold: 2636.Lkey_expansion_128_cold:
731 shufps \$0b00010000,%xmm0,%xmm4 2637 shufps \$0b00010000,%xmm0,%xmm4
732 pxor %xmm4, %xmm0 2638 xorps %xmm4, %xmm0
733 shufps \$0b10001100,%xmm0,%xmm4 2639 shufps \$0b10001100,%xmm0,%xmm4
734 pxor %xmm4, %xmm0 2640 xorps %xmm4, %xmm0
735 pshufd \$0b11111111,%xmm1,%xmm1 # critical path 2641 shufps \$0b11111111,%xmm1,%xmm1 # critical path
736 pxor %xmm1,%xmm0 2642 xorps %xmm1,%xmm0
737 ret 2643 ret
738 2644
739.align 16 2645.align 16
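The shufps pair in .Lkey_expansion_128_cold performs the same word folding that the textbook pslldq/pxor sequence does, and the final shufps \$0b11111111 broadcasts the aeskeygenassist word carrying SubWord(RotWord(w3))^rcon. A hedged sketch of the commonly seen intrinsics rendering of one AES-128 expansion step:

#include <wmmintrin.h>

/* key: previous round key; assist: _mm_aeskeygenassist_si128(key, rcon)
 * (rcon must be an immediate constant at each call site). */
static __m128i expand_step128(__m128i key, __m128i assist)
{
    assist = _mm_shuffle_epi32(assist, 0xff);           /* broadcast f = SubWord(RotWord(w3))^rcon */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));   /* three folds give the */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));   /* prefix XOR of the    */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));   /* four schedule words  */
    return _mm_xor_si128(key, assist);                  /* w[i] = prefix ^ f    */
}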
@@ -744,11 +2650,11 @@ _aesni_set_encrypt_key:
744 movaps %xmm2, %xmm5 2650 movaps %xmm2, %xmm5
745.Lkey_expansion_192b_warm: 2651.Lkey_expansion_192b_warm:
746 shufps \$0b00010000,%xmm0,%xmm4 2652 shufps \$0b00010000,%xmm0,%xmm4
747 movaps %xmm2,%xmm3 2653 movdqa %xmm2,%xmm3
748 pxor %xmm4,%xmm0 2654 xorps %xmm4,%xmm0
749 shufps \$0b10001100,%xmm0,%xmm4 2655 shufps \$0b10001100,%xmm0,%xmm4
750 pslldq \$4,%xmm3 2656 pslldq \$4,%xmm3
751 pxor %xmm4,%xmm0 2657 xorps %xmm4,%xmm0
752 pshufd \$0b01010101,%xmm1,%xmm1 # critical path 2658 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
753 pxor %xmm3,%xmm2 2659 pxor %xmm3,%xmm2
754 pxor %xmm1,%xmm0 2660 pxor %xmm1,%xmm0
@@ -772,11 +2678,11 @@ _aesni_set_encrypt_key:
772 lea 16(%rax),%rax 2678 lea 16(%rax),%rax
773.Lkey_expansion_256a_cold: 2679.Lkey_expansion_256a_cold:
774 shufps \$0b00010000,%xmm0,%xmm4 2680 shufps \$0b00010000,%xmm0,%xmm4
775 pxor %xmm4,%xmm0 2681 xorps %xmm4,%xmm0
776 shufps \$0b10001100,%xmm0,%xmm4 2682 shufps \$0b10001100,%xmm0,%xmm4
777 pxor %xmm4,%xmm0 2683 xorps %xmm4,%xmm0
778 pshufd \$0b11111111,%xmm1,%xmm1 # critical path 2684 shufps \$0b11111111,%xmm1,%xmm1 # critical path
779 pxor %xmm1,%xmm0 2685 xorps %xmm1,%xmm0
780 ret 2686 ret
781 2687
782.align 16 2688.align 16
@@ -785,17 +2691,28 @@ _aesni_set_encrypt_key:
785 lea 16(%rax),%rax 2691 lea 16(%rax),%rax
786 2692
787 shufps \$0b00010000,%xmm2,%xmm4 2693 shufps \$0b00010000,%xmm2,%xmm4
788 pxor %xmm4,%xmm2 2694 xorps %xmm4,%xmm2
789 shufps \$0b10001100,%xmm2,%xmm4 2695 shufps \$0b10001100,%xmm2,%xmm4
790 pxor %xmm4,%xmm2 2696 xorps %xmm4,%xmm2
791 pshufd \$0b10101010,%xmm1,%xmm1 # critical path 2697 shufps \$0b10101010,%xmm1,%xmm1 # critical path
792 pxor %xmm1,%xmm2 2698 xorps %xmm1,%xmm2
793 ret 2699 ret
794.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key 2700.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
2701.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
795___ 2702___
796} 2703}
797 2704
798$code.=<<___; 2705$code.=<<___;
2706.align 64
2707.Lbswap_mask:
2708 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
2709.Lincrement32:
2710 .long 6,6,6,0
2711.Lincrement64:
2712 .long 1,0,0,0
2713.Lxts_magic:
2714 .long 0x87,0,1,0
2715
799.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>" 2716.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
800.align 64 2717.align 64
801___ 2718___
@@ -810,9 +2727,11 @@ $disp="%r9";
810 2727
811$code.=<<___; 2728$code.=<<___;
812.extern __imp_RtlVirtualUnwind 2729.extern __imp_RtlVirtualUnwind
813.type cbc_se_handler,\@abi-omnipotent 2730___
2731$code.=<<___ if ($PREFIX eq "aesni");
2732.type ecb_se_handler,\@abi-omnipotent
814.align 16 2733.align 16
815cbc_se_handler: 2734ecb_se_handler:
816 push %rsi 2735 push %rsi
817 push %rdi 2736 push %rdi
818 push %rbx 2737 push %rbx
@@ -825,42 +2744,132 @@ cbc_se_handler:
825 sub \$64,%rsp 2744 sub \$64,%rsp
826 2745
827 mov 152($context),%rax # pull context->Rsp 2746 mov 152($context),%rax # pull context->Rsp
2747
2748 jmp .Lcommon_seh_tail
2749.size ecb_se_handler,.-ecb_se_handler
2750
2751.type ccm64_se_handler,\@abi-omnipotent
2752.align 16
2753ccm64_se_handler:
2754 push %rsi
2755 push %rdi
2756 push %rbx
2757 push %rbp
2758 push %r12
2759 push %r13
2760 push %r14
2761 push %r15
2762 pushfq
2763 sub \$64,%rsp
2764
2765 mov 120($context),%rax # pull context->Rax
828 mov 248($context),%rbx # pull context->Rip 2766 mov 248($context),%rbx # pull context->Rip
829 2767
830 lea .Lcbc_decrypt(%rip),%r10 2768 mov 8($disp),%rsi # disp->ImageBase
831 cmp %r10,%rbx # context->Rip<"prologue" label 2769 mov 56($disp),%r11 # disp->HandlerData
832 jb .Lin_prologue
833 2770
834 lea .Lcbc_decrypt_body(%rip),%r10 2771 mov 0(%r11),%r10d # HandlerData[0]
835 cmp %r10,%rbx # context->Rip<cbc_decrypt_body 2772 lea (%rsi,%r10),%r10 # prologue label
836 jb .Lrestore_rax 2773 cmp %r10,%rbx # context->Rip<prologue label
2774 jb .Lcommon_seh_tail
837 2775
838 lea .Lcbc_ret(%rip),%r10 2776 mov 152($context),%rax # pull context->Rsp
839 cmp %r10,%rbx # context->Rip>="epilogue" label
840 jae .Lin_prologue
841 2777
842 lea 0(%rax),%rsi # top of stack 2778 mov 4(%r11),%r10d # HandlerData[1]
2779 lea (%rsi,%r10),%r10 # epilogue label
2780 cmp %r10,%rbx # context->Rip>=epilogue label
2781 jae .Lcommon_seh_tail
2782
2783 lea 0(%rax),%rsi # %xmm save area
843 lea 512($context),%rdi # &context.Xmm6 2784 lea 512($context),%rdi # &context.Xmm6
844 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) 2785 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
845 .long 0xa548f3fc # cld; rep movsq 2786 .long 0xa548f3fc # cld; rep movsq
846 lea 0x58(%rax),%rax # adjust stack pointer 2787 lea 0x58(%rax),%rax # adjust stack pointer
847 jmp .Lin_prologue
848 2788
849.Lrestore_rax: 2789 jmp .Lcommon_seh_tail
850 mov 120($context),%rax 2790.size ccm64_se_handler,.-ccm64_se_handler
851.Lin_prologue:
852 mov 8(%rax),%rdi
853 mov 16(%rax),%rsi
854 mov %rax,152($context) # restore context->Rsp
855 mov %rsi,168($context) # restore context->Rsi
856 mov %rdi,176($context) # restore context->Rdi
857 2791
858 jmp .Lcommon_seh_exit 2792.type ctr32_se_handler,\@abi-omnipotent
859.size cbc_se_handler,.-cbc_se_handler 2793.align 16
2794ctr32_se_handler:
2795 push %rsi
2796 push %rdi
2797 push %rbx
2798 push %rbp
2799 push %r12
2800 push %r13
2801 push %r14
2802 push %r15
2803 pushfq
2804 sub \$64,%rsp
860 2805
861.type ecb_se_handler,\@abi-omnipotent 2806 mov 120($context),%rax # pull context->Rax
2807 mov 248($context),%rbx # pull context->Rip
2808
2809 lea .Lctr32_body(%rip),%r10
2810 cmp %r10,%rbx # context->Rip<"prologue" label
2811 jb .Lcommon_seh_tail
2812
2813 mov 152($context),%rax # pull context->Rsp
2814
2815 lea .Lctr32_ret(%rip),%r10
2816 cmp %r10,%rbx
2817 jae .Lcommon_seh_tail
2818
2819 lea 0x20(%rax),%rsi # %xmm save area
2820 lea 512($context),%rdi # &context.Xmm6
2821 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2822 .long 0xa548f3fc # cld; rep movsq
2823 lea 0xc8(%rax),%rax # adjust stack pointer
2824
2825 jmp .Lcommon_seh_tail
2826.size ctr32_se_handler,.-ctr32_se_handler
2827
2828.type xts_se_handler,\@abi-omnipotent
862.align 16 2829.align 16
863ecb_se_handler: 2830xts_se_handler:
2831 push %rsi
2832 push %rdi
2833 push %rbx
2834 push %rbp
2835 push %r12
2836 push %r13
2837 push %r14
2838 push %r15
2839 pushfq
2840 sub \$64,%rsp
2841
2842 mov 120($context),%rax # pull context->Rax
2843 mov 248($context),%rbx # pull context->Rip
2844
2845 mov 8($disp),%rsi # disp->ImageBase
2846 mov 56($disp),%r11 # disp->HandlerData
2847
2848 mov 0(%r11),%r10d # HandlerData[0]
2849	lea	(%rsi,%r10),%r10	# prologue label
2850 cmp %r10,%rbx # context->Rip<prologue label
2851 jb .Lcommon_seh_tail
2852
2853 mov 152($context),%rax # pull context->Rsp
2854
2855 mov 4(%r11),%r10d # HandlerData[1]
2856 lea (%rsi,%r10),%r10 # epilogue label
2857 cmp %r10,%rbx # context->Rip>=epilogue label
2858 jae .Lcommon_seh_tail
2859
2860 lea 0x60(%rax),%rsi # %xmm save area
2861	lea	512($context),%rdi	# &context.Xmm6
2862 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2863 .long 0xa548f3fc # cld; rep movsq
2864 lea 0x68+160(%rax),%rax # adjust stack pointer
2865
2866 jmp .Lcommon_seh_tail
2867.size xts_se_handler,.-xts_se_handler
2868___
2869$code.=<<___;
2870.type cbc_se_handler,\@abi-omnipotent
2871.align 16
2872cbc_se_handler:
864 push %rsi 2873 push %rsi
865 push %rdi 2874 push %rdi
866 push %rbx 2875 push %rbx
@@ -873,13 +2882,37 @@ ecb_se_handler:
873 sub \$64,%rsp 2882 sub \$64,%rsp
874 2883
875 mov 152($context),%rax # pull context->Rsp 2884 mov 152($context),%rax # pull context->Rsp
2885 mov 248($context),%rbx # pull context->Rip
2886
2887 lea .Lcbc_decrypt(%rip),%r10
2888 cmp %r10,%rbx # context->Rip<"prologue" label
2889 jb .Lcommon_seh_tail
2890
2891 lea .Lcbc_decrypt_body(%rip),%r10
2892 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
2893 jb .Lrestore_cbc_rax
2894
2895 lea .Lcbc_ret(%rip),%r10
2896 cmp %r10,%rbx # context->Rip>="epilogue" label
2897 jae .Lcommon_seh_tail
2898
2899 lea 0(%rax),%rsi # top of stack
2900 lea 512($context),%rdi # &context.Xmm6
2901 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
2902 .long 0xa548f3fc # cld; rep movsq
2903 lea 0x58(%rax),%rax # adjust stack pointer
2904 jmp .Lcommon_seh_tail
2905
2906.Lrestore_cbc_rax:
2907 mov 120($context),%rax
2908
2909.Lcommon_seh_tail:
876 mov 8(%rax),%rdi 2910 mov 8(%rax),%rdi
877 mov 16(%rax),%rsi 2911 mov 16(%rax),%rsi
2912 mov %rax,152($context) # restore context->Rsp
878 mov %rsi,168($context) # restore context->Rsi 2913 mov %rsi,168($context) # restore context->Rsi
879 mov %rdi,176($context) # restore context->Rdi 2914 mov %rdi,176($context) # restore context->Rdi
880 2915
881.Lcommon_seh_exit:
882
883 mov 40($disp),%rdi # disp->ContextRecord 2916 mov 40($disp),%rdi # disp->ContextRecord
884 mov $context,%rsi # context 2917 mov $context,%rsi # context
885 mov \$154,%ecx # sizeof(CONTEXT) 2918 mov \$154,%ecx # sizeof(CONTEXT)
@@ -915,10 +2948,33 @@ ecb_se_handler:
915 2948
916.section .pdata 2949.section .pdata
917.align 4 2950.align 4
918 .rva .LSEH_begin_${PREFIX}_ecb_encrypt 2951___
919 .rva .LSEH_end_${PREFIX}_ecb_encrypt 2952$code.=<<___ if ($PREFIX eq "aesni");
2953 .rva .LSEH_begin_aesni_ecb_encrypt
2954 .rva .LSEH_end_aesni_ecb_encrypt
920 .rva .LSEH_info_ecb 2955 .rva .LSEH_info_ecb
921 2956
2957 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
2958 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
2959 .rva .LSEH_info_ccm64_enc
2960
2961 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
2962 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
2963 .rva .LSEH_info_ccm64_dec
2964
2965 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
2966 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
2967 .rva .LSEH_info_ctr32
2968
2969 .rva .LSEH_begin_aesni_xts_encrypt
2970 .rva .LSEH_end_aesni_xts_encrypt
2971 .rva .LSEH_info_xts_enc
2972
2973 .rva .LSEH_begin_aesni_xts_decrypt
2974 .rva .LSEH_end_aesni_xts_decrypt
2975 .rva .LSEH_info_xts_dec
2976___
2977$code.=<<___;
922 .rva .LSEH_begin_${PREFIX}_cbc_encrypt 2978 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
923 .rva .LSEH_end_${PREFIX}_cbc_encrypt 2979 .rva .LSEH_end_${PREFIX}_cbc_encrypt
924 .rva .LSEH_info_cbc 2980 .rva .LSEH_info_cbc
@@ -932,28 +2988,49 @@ ecb_se_handler:
932 .rva .LSEH_info_key 2988 .rva .LSEH_info_key
933.section .xdata 2989.section .xdata
934.align 8 2990.align 8
2991___
2992$code.=<<___ if ($PREFIX eq "aesni");
935.LSEH_info_ecb: 2993.LSEH_info_ecb:
936 .byte 9,0,0,0 2994 .byte 9,0,0,0
937 .rva ecb_se_handler 2995 .rva ecb_se_handler
2996.LSEH_info_ccm64_enc:
2997 .byte 9,0,0,0
2998 .rva ccm64_se_handler
2999 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
3000.LSEH_info_ccm64_dec:
3001 .byte 9,0,0,0
3002 .rva ccm64_se_handler
3003 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
3004.LSEH_info_ctr32:
3005 .byte 9,0,0,0
3006 .rva ctr32_se_handler
3007.LSEH_info_xts_enc:
3008 .byte 9,0,0,0
3009 .rva xts_se_handler
3010 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3011.LSEH_info_xts_dec:
3012 .byte 9,0,0,0
3013 .rva xts_se_handler
3014 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3015___
3016$code.=<<___;
938.LSEH_info_cbc: 3017.LSEH_info_cbc:
939 .byte 9,0,0,0 3018 .byte 9,0,0,0
940 .rva cbc_se_handler 3019 .rva cbc_se_handler
941.LSEH_info_key: 3020.LSEH_info_key:
942 .byte 0x01,0x04,0x01,0x00 3021 .byte 0x01,0x04,0x01,0x00
943 .byte 0x04,0x02,0x00,0x00 3022 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
944___ 3023___
945} 3024}
946 3025
947sub rex { 3026sub rex {
948 local *opcode=shift; 3027 local *opcode=shift;
949 my ($dst,$src)=@_; 3028 my ($dst,$src)=@_;
950 3029 my $rex=0;
951 if ($dst>=8 || $src>=8) { 3030
952 $rex=0x40; 3031 $rex|=0x04 if($dst>=8);
953 $rex|=0x04 if($dst>=8); 3032 $rex|=0x01 if($src>=8);
954 $rex|=0x01 if($src>=8); 3033 push @opcode,$rex|0x40 if($rex);
955 push @opcode,$rex;
956 }
957} 3034}
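# For reference: the REX prefix byte is 0x40|W<<3|R<<2|X<<1|B. In the
# hand-encoded AES-NI instructions below only R (extending ModRM.reg, the
# destination) and B (extending ModRM.rm, the source) can be set, and the
# prefix is emitted only when one of them is. E.g. aesenc %xmm9,%xmm10
# needs 0x40|0x04|0x01 = 0x45, while aesenc %xmm1,%xmm2 needs no prefix.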
958 3035
959sub aesni { 3036sub aesni {
@@ -989,4 +3066,3 @@ $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
989print $code; 3066print $code;
990 3067
991close STDOUT; 3068close STDOUT;
992
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
new file mode 100644
index 0000000000..c9c6312fa7
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
@@ -0,0 +1,3044 @@
1#!/usr/bin/env perl
2
3###################################################################
4### AES-128 [originally in CTR mode] ###
5### bitsliced implementation for Intel Core 2 processors ###
6### requires support of SSE extensions up to SSSE3 ###
7### Author: Emilia Käsper and Peter Schwabe ###
8### Date: 2009-03-19 ###
9### Public domain ###
10### ###
11### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12### further information. ###
13###################################################################
14#
15# September 2011.
16#
17# Started as a transliteration of the original code to "perlasm", the
18# module has undergone the following changes:
19#
20# - code was made position-independent;
21# - rounds were folded into a loop, resulting in a >5x size reduction
22#   from 12.5KB to 2.2KB;
23# - the above was possible thanks to a mixcolumns() modification that
24#   allows its output to be fed back to aesenc[last]; this was
25#   achieved at the cost of two additional inter-register moves;
26# - some instruction reordering and interleaving;
27# - this module doesn't implement a key setup subroutine; instead it
28#   relies on conversion of the "conventional" key schedule as returned
29#   by AES_set_encrypt_key (see discussion below);
30# - first and last round keys are treated differently, which made it
31#   possible to skip one shiftrows(), reduce the bit-sliced key
32#   schedule, and speed up conversion by 22%;
33# - support for 192- and 256-bit keys was added;
34#
35# Resulting performance in CPU cycles spent to encrypt one byte out
36# of a 4096-byte buffer with a 128-bit key is:
37#
38# Emilia's this(*) difference
39#
40# Core 2 9.30 8.69 +7%
41# Nehalem(**) 7.63 6.98 +9%
42# Atom 17.1 17.4 -2%(***)
43#
44# (*) Comparison is not completely fair, because "this" is ECB,
45#	i.e. no extra processing such as counter value calculation
46#	and xor-ing of the input, as in Emilia's CTR implementation,
47#	is performed. However, the CTR calculations account for no
48#	more than 1% of total time, so the comparison is *rather* fair.
49#
50# (**) Results were collected on Westmere, which is considered to
51# be equivalent to Nehalem for this code.
52#
53# (***)	The slowdown on Atom is rather strange per se, because the
54#	original implementation has a number of 9+-byte instructions,
55#	which are bad for the Atom front-end, and which I eliminated
56#	completely. In an attempt to address the deterioration, sbox()
57#	was tested in the FP SIMD "domain" (movaps instead of movdqa,
58#	xorps instead of pxor, etc.). While this yielded a nominal 4%
59#	improvement on Atom, it hurt Westmere by more than a 2x factor.
60#
61# As for the key schedule conversion subroutine: the OpenSSL interface
62# relies on per-invocation on-the-fly conversion. This naturally has an
63# impact on performance, especially for short inputs. Conversion time
64# in CPU cycles, and its ratio to the CPU cycles spent in the 8x block
65# function, is:
66#
67# conversion conversion/8x block
68# Core 2 240 0.22
69# Nehalem 180 0.20
70# Atom 430 0.19
71#
72# The ratio values mean that 128-byte blocks will be processed
73# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
74# etc. Then keep in mind that input sizes not divisible by 128 are
75# *effectively* slower, especially the shortest ones, e.g. consecutive
76# 144-byte blocks are processed 44% slower than one would expect,
77# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings",
78# it's still faster than [the "hyper-threading-safe" code path in]
79# aes-x86_64.pl on all lengths above 64 bytes...
80#
81# October 2011.
82#
83# Add decryption procedure. Performance in CPU cycles spent to decrypt
84# one byte out of a 4096-byte buffer with a 128-bit key is:
85#
86# Core 2 11.0
87# Nehalem 9.16
88# Atom 20.9
89#
90# November 2011.
91#
92# Add bsaes_xts_[en|de]crypt. Performance on blocks shorter than 80
93# bytes is suboptimal, but XTS is meant to be used with larger blocks...
94#
95# <appro@openssl.org>
96
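# (To see where the 16-18% figure for 128-byte inputs comes from: with
# conversion ratio r, a single 128-byte batch costs roughly (1 + r)
# block-times, so the overhead fraction is r/(1 + r), i.e.
# 0.22/1.22 = ~18% on Core 2 and 0.19/1.19 = ~16% on Atom.)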
97$flavour = shift;
98$output = shift;
99if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
100
101$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
102
103$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
104( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
105( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
106die "can't locate x86_64-xlate.pl";
107
108open STDOUT,"| $^X $xlate $flavour $output";
109
110my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
111my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
112my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
113
114{
115my ($key,$rounds,$const)=("%rax","%r10d","%r11");
116
117sub Sbox {
118# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
119# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
120my @b=@_[0..7];
121my @t=@_[8..11];
122my @s=@_[12..15];
123 &InBasisChange (@b);
124 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
125 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
126}
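# In the bitsliced representation eight %xmm registers hold eight 16-byte
# blocks transposed: @b[i] carries bit i of all 128 bytes. Every pxor/pand/
# por in the S-box circuit therefore evaluates 128 byte positions at once,
# with no table lookups and hence no data-dependent memory accesses.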
127
128sub InBasisChange {
129# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
130# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
131my @b=@_[0..7];
132$code.=<<___;
133 pxor @b[6], @b[5]
134 pxor @b[1], @b[2]
135 pxor @b[0], @b[3]
136 pxor @b[2], @b[6]
137 pxor @b[0], @b[5]
138
139 pxor @b[3], @b[6]
140 pxor @b[7], @b[3]
141 pxor @b[5], @b[7]
142 pxor @b[4], @b[3]
143 pxor @b[5], @b[4]
144 pxor @b[1], @b[3]
145
146 pxor @b[7], @b[2]
147 pxor @b[5], @b[1]
148___
149}
150
151sub OutBasisChange {
152# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
153# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
154my @b=@_[0..7];
155$code.=<<___;
156 pxor @b[6], @b[0]
157 pxor @b[4], @b[1]
158 pxor @b[0], @b[2]
159 pxor @b[6], @b[4]
160 pxor @b[1], @b[6]
161
162 pxor @b[5], @b[1]
163 pxor @b[3], @b[5]
164 pxor @b[7], @b[3]
165 pxor @b[5], @b[7]
166 pxor @b[5], @b[2]
167
168 pxor @b[7], @b[4]
169___
170}
171
172sub InvSbox {
173# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
174# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
175my @b=@_[0..7];
176my @t=@_[8..11];
177my @s=@_[12..15];
178 &InvInBasisChange (@b);
179 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
180 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
181}
182
183sub InvInBasisChange { # OutBasisChange in reverse
184my @b=@_[5,1,2,6,3,7,0,4];
185$code.=<<___
186 pxor @b[7], @b[4]
187
188 pxor @b[5], @b[7]
189 pxor @b[5], @b[2]
190 pxor @b[7], @b[3]
191 pxor @b[3], @b[5]
192 pxor @b[5], @b[1]
193
194 pxor @b[1], @b[6]
195 pxor @b[0], @b[2]
196 pxor @b[6], @b[4]
197 pxor @b[6], @b[0]
198 pxor @b[4], @b[1]
199___
200}
201
202sub InvOutBasisChange { # InBasisChange in reverse
203my @b=@_[2,5,7,3,6,1,0,4];
204$code.=<<___;
205 pxor @b[5], @b[1]
206 pxor @b[7], @b[2]
207
208 pxor @b[1], @b[3]
209 pxor @b[5], @b[4]
210 pxor @b[5], @b[7]
211 pxor @b[4], @b[3]
212 pxor @b[0], @b[5]
213 pxor @b[7], @b[3]
214 pxor @b[2], @b[6]
215 pxor @b[1], @b[2]
216 pxor @b[3], @b[6]
217
218 pxor @b[0], @b[3]
219 pxor @b[6], @b[5]
220___
221}
222
223sub Mul_GF4 {
224#;*************************************************************
225#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
226#;*************************************************************
227my ($x0,$x1,$y0,$y1,$t0)=@_;
228$code.=<<___;
229 movdqa $y0, $t0
230 pxor $y1, $t0
231 pand $x0, $t0
232 pxor $x1, $x0
233 pand $y0, $x1
234 pand $y1, $x0
235 pxor $x1, $x0
236 pxor $t0, $x1
237___
238}
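# GF(2^8) inversion, the nonlinear core of the S-box, is computed in a
# tower of fields: each element is split into a pair over GF(2^4), and
# each of those into a pair over GF(2^2), so the whole inversion reduces
# to the AND/XOR networks in Mul_GF4*, Mul_GF16_2 and Inv_GF256 below.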
239
240sub Mul_GF4_N { # not used, see next subroutine
241# multiply and scale by N
242my ($x0,$x1,$y0,$y1,$t0)=@_;
243$code.=<<___;
244 movdqa $y0, $t0
245 pxor $y1, $t0
246 pand $x0, $t0
247 pxor $x1, $x0
248 pand $y0, $x1
249 pand $y1, $x0
250 pxor $x0, $x1
251 pxor $t0, $x0
252___
253}
254
255sub Mul_GF4_N_GF4 {
256# interleaved Mul_GF4_N and Mul_GF4
257my ($x0,$x1,$y0,$y1,$t0,
258 $x2,$x3,$y2,$y3,$t1)=@_;
259$code.=<<___;
260 movdqa $y0, $t0
261 movdqa $y2, $t1
262 pxor $y1, $t0
263 pxor $y3, $t1
264 pand $x0, $t0
265 pand $x2, $t1
266 pxor $x1, $x0
267 pxor $x3, $x2
268 pand $y0, $x1
269 pand $y2, $x3
270 pand $y1, $x0
271 pand $y3, $x2
272 pxor $x0, $x1
273 pxor $x3, $x2
274 pxor $t0, $x0
275 pxor $t1, $x3
276___
277}
278sub Mul_GF16_2 {
279my @x=@_[0..7];
280my @y=@_[8..11];
281my @t=@_[12..15];
282$code.=<<___;
283 movdqa @x[0], @t[0]
284 movdqa @x[1], @t[1]
285___
286 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
287$code.=<<___;
288 pxor @x[2], @t[0]
289 pxor @x[3], @t[1]
290 pxor @y[2], @y[0]
291 pxor @y[3], @y[1]
292___
293 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
294 @x[2], @x[3], @y[2], @y[3], @t[2]);
295$code.=<<___;
296 pxor @t[0], @x[0]
297 pxor @t[0], @x[2]
298 pxor @t[1], @x[1]
299 pxor @t[1], @x[3]
300
301 movdqa @x[4], @t[0]
302 movdqa @x[5], @t[1]
303 pxor @x[6], @t[0]
304 pxor @x[7], @t[1]
305___
306 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
307 @x[6], @x[7], @y[2], @y[3], @t[2]);
308$code.=<<___;
309 pxor @y[2], @y[0]
310 pxor @y[3], @y[1]
311___
312 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
313$code.=<<___;
314 pxor @t[0], @x[4]
315 pxor @t[0], @x[6]
316 pxor @t[1], @x[5]
317 pxor @t[1], @x[7]
318___
319}
320sub Inv_GF256 {
321#;********************************************************************
322#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
323#;********************************************************************
324my @x=@_[0..7];
325my @t=@_[8..11];
326my @s=@_[12..15];
327# direct optimizations carried over from the hardware implementation
328$code.=<<___;
329 movdqa @x[4], @t[3]
330 movdqa @x[5], @t[2]
331 movdqa @x[1], @t[1]
332 movdqa @x[7], @s[1]
333 movdqa @x[0], @s[0]
334
335 pxor @x[6], @t[3]
336 pxor @x[7], @t[2]
337 pxor @x[3], @t[1]
338 movdqa @t[3], @s[2]
339 pxor @x[6], @s[1]
340 movdqa @t[2], @t[0]
341 pxor @x[2], @s[0]
342 movdqa @t[3], @s[3]
343
344 por @t[1], @t[2]
345 por @s[0], @t[3]
346 pxor @t[0], @s[3]
347 pand @s[0], @s[2]
348 pxor @t[1], @s[0]
349 pand @t[1], @t[0]
350 pand @s[0], @s[3]
351 movdqa @x[3], @s[0]
352 pxor @x[2], @s[0]
353 pand @s[0], @s[1]
354 pxor @s[1], @t[3]
355 pxor @s[1], @t[2]
356 movdqa @x[4], @s[1]
357 movdqa @x[1], @s[0]
358 pxor @x[5], @s[1]
359 pxor @x[0], @s[0]
360 movdqa @s[1], @t[1]
361 pand @s[0], @s[1]
362 por @s[0], @t[1]
363 pxor @s[1], @t[0]
364 pxor @s[3], @t[3]
365 pxor @s[2], @t[2]
366 pxor @s[3], @t[1]
367 movdqa @x[7], @s[0]
368 pxor @s[2], @t[0]
369 movdqa @x[6], @s[1]
370 pxor @s[2], @t[1]
371 movdqa @x[5], @s[2]
372 pand @x[3], @s[0]
373 movdqa @x[4], @s[3]
374 pand @x[2], @s[1]
375 pand @x[1], @s[2]
376 por @x[0], @s[3]
377 pxor @s[0], @t[3]
378 pxor @s[1], @t[2]
379 pxor @s[2], @t[1]
380 pxor @s[3], @t[0]
381
382 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
383
384 # new smaller inversion
385
386 movdqa @t[3], @s[0]
387 pand @t[1], @t[3]
388 pxor @t[2], @s[0]
389
390 movdqa @t[0], @s[2]
391 movdqa @s[0], @s[3]
392 pxor @t[3], @s[2]
393 pand @s[2], @s[3]
394
395 movdqa @t[1], @s[1]
396 pxor @t[2], @s[3]
397 pxor @t[0], @s[1]
398
399 pxor @t[2], @t[3]
400
401 pand @t[3], @s[1]
402
403 movdqa @s[2], @t[2]
404 pxor @t[0], @s[1]
405
406 pxor @s[1], @t[2]
407 pxor @s[1], @t[1]
408
409 pand @t[0], @t[2]
410
411 pxor @t[2], @s[2]
412 pxor @t[2], @t[1]
413
414 pand @s[3], @s[2]
415
416 pxor @s[0], @s[2]
417___
418# output in s3, s2, s1, t1
419
420# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
421
422# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
423 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
424
425### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
426}
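# In a nutshell, Inv_GF256 inverts all S-box inputs at once via the
# tower-field decomposition GF(2^8) ~ GF((2^4)^2) ~ GF(((2^2)^2)^2):
# the block above folds the eight input bit planes down to a GF(2^4)
# element, the inlined "new smaller inversion" inverts it, and the
# Mul_GF16_2 call multiplies the result back up to GF(2^8). The "(144)"
# in the header presumably counts the gates of the hardware circuit
# this sequence was derived from.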
427
428# AES linear components
429
430sub ShiftRows {
431my @x=@_[0..7];
432my $mask=pop;
433$code.=<<___;
434 pxor 0x00($key),@x[0]
435 pxor 0x10($key),@x[1]
436 pshufb $mask,@x[0]
437 pxor 0x20($key),@x[2]
438 pshufb $mask,@x[1]
439 pxor 0x30($key),@x[3]
440 pshufb $mask,@x[2]
441 pxor 0x40($key),@x[4]
442 pshufb $mask,@x[3]
443 pxor 0x50($key),@x[5]
444 pshufb $mask,@x[4]
445 pxor 0x60($key),@x[6]
446 pshufb $mask,@x[5]
447 pxor 0x70($key),@x[7]
448 pshufb $mask,@x[6]
449 lea 0x80($key),$key
450 pshufb $mask,@x[7]
451___
452}
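# In the bitsliced representation each xmm register holds one bit plane
# of all 128 state bytes (8 blocks x 16 bytes), so the pxor with the
# pre-expanded round key above is AddRoundKey, and one pshufb per
# register performs ShiftRows for all eight blocks at once; the byte
# permutation is identical for every bit plane, hence the single $mask.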
453
454sub MixColumns {
455# modified to emit output in order suitable for feeding back to aesenc[last]
456my @x=@_[0..7];
457my @t=@_[8..15];
458$code.=<<___;
459 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
460 pshufd \$0x93, @x[1], @t[1]
461 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
462 pshufd \$0x93, @x[2], @t[2]
463 pxor @t[1], @x[1]
464 pshufd \$0x93, @x[3], @t[3]
465 pxor @t[2], @x[2]
466 pshufd \$0x93, @x[4], @t[4]
467 pxor @t[3], @x[3]
468 pshufd \$0x93, @x[5], @t[5]
469 pxor @t[4], @x[4]
470 pshufd \$0x93, @x[6], @t[6]
471 pxor @t[5], @x[5]
472 pshufd \$0x93, @x[7], @t[7]
473 pxor @t[6], @x[6]
474 pxor @t[7], @x[7]
475
476 pxor @x[0], @t[1]
477 pxor @x[7], @t[0]
478 pxor @x[7], @t[1]
479 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
480 pxor @x[1], @t[2]
481 pshufd \$0x4E, @x[1], @x[1]
482 pxor @x[4], @t[5]
483 pxor @t[0], @x[0]
484 pxor @x[5], @t[6]
485 pxor @t[1], @x[1]
486 pxor @x[3], @t[4]
487 pshufd \$0x4E, @x[4], @t[0]
488 pxor @x[6], @t[7]
489 pshufd \$0x4E, @x[5], @t[1]
490 pxor @x[2], @t[3]
491 pshufd \$0x4E, @x[3], @x[4]
492 pxor @x[7], @t[3]
493 pshufd \$0x4E, @x[7], @x[5]
494 pxor @x[7], @t[4]
495 pshufd \$0x4E, @x[6], @x[3]
496 pxor @t[4], @t[0]
497 pshufd \$0x4E, @x[2], @x[6]
498 pxor @t[5], @t[1]
499
500 pxor @t[3], @x[4]
501 pxor @t[7], @x[5]
502 pxor @t[6], @x[3]
503 movdqa @t[0], @x[2]
504 pxor @t[2], @x[6]
505 movdqa @t[1], @x[7]
506___
507}
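# On bitsliced rows MixColumns needs no GF multiplications at all: per
# bit plane it reduces to XORs and rotations of the four 32-bit column
# words, hence the pshufd 0x93 (rotate dwords by one) and pshufd 0x4E
# (swap halves) shuffles above.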
508
509sub InvMixColumns {
510my @x=@_[0..7];
511my @t=@_[8..15];
512
513$code.=<<___;
514 # multiplication by 0x0e
515 pshufd \$0x93, @x[7], @t[7]
516 movdqa @x[2], @t[2]
517 pxor @x[5], @x[7] # 7 5
518 pxor @x[5], @x[2] # 2 5
519 pshufd \$0x93, @x[0], @t[0]
520 movdqa @x[5], @t[5]
521 pxor @x[0], @x[5] # 5 0 [1]
522 pxor @x[1], @x[0] # 0 1
523 pshufd \$0x93, @x[1], @t[1]
524 pxor @x[2], @x[1] # 1 25
525 pxor @x[6], @x[0] # 01 6 [2]
526 pxor @x[3], @x[1] # 125 3 [4]
527 pshufd \$0x93, @x[3], @t[3]
528 pxor @x[0], @x[2] # 25 016 [3]
529 pxor @x[7], @x[3] # 3 75
530 pxor @x[6], @x[7] # 75 6 [0]
531 pshufd \$0x93, @x[6], @t[6]
532 movdqa @x[4], @t[4]
533 pxor @x[4], @x[6] # 6 4
534 pxor @x[3], @x[4] # 4 375 [6]
535 pxor @x[7], @x[3] # 375 756=36
536 pxor @t[5], @x[6] # 64 5 [7]
537 pxor @t[2], @x[3] # 36 2
538 pxor @t[4], @x[3] # 362 4 [5]
539 pshufd \$0x93, @t[5], @t[5]
540___
541 my @y = @x[7,5,0,2,1,3,4,6];
542$code.=<<___;
543 # multiplication by 0x0b
544 pxor @y[0], @y[1]
545 pxor @t[0], @y[0]
546 pxor @t[1], @y[1]
547 pshufd \$0x93, @t[2], @t[2]
548 pxor @t[5], @y[0]
549 pxor @t[6], @y[1]
550 pxor @t[7], @y[0]
551 pshufd \$0x93, @t[4], @t[4]
552 pxor @t[6], @t[7] # clobber t[7]
553 pxor @y[0], @y[1]
554
555 pxor @t[0], @y[3]
556 pshufd \$0x93, @t[0], @t[0]
557 pxor @t[1], @y[2]
558 pxor @t[1], @y[4]
559 pxor @t[2], @y[2]
560 pshufd \$0x93, @t[1], @t[1]
561 pxor @t[2], @y[3]
562 pxor @t[2], @y[5]
563 pxor @t[7], @y[2]
564 pshufd \$0x93, @t[2], @t[2]
565 pxor @t[3], @y[3]
566 pxor @t[3], @y[6]
567 pxor @t[3], @y[4]
568 pshufd \$0x93, @t[3], @t[3]
569 pxor @t[4], @y[7]
570 pxor @t[4], @y[5]
571 pxor @t[7], @y[7]
572 pxor @t[5], @y[3]
573 pxor @t[4], @y[4]
574 pxor @t[5], @t[7] # clobber t[7] even more
575
576 pxor @t[7], @y[5]
577 pshufd \$0x93, @t[4], @t[4]
578 pxor @t[7], @y[6]
579 pxor @t[7], @y[4]
580
581 pxor @t[5], @t[7]
582 pshufd \$0x93, @t[5], @t[5]
583 pxor @t[6], @t[7] # restore t[7]
584
585 # multiplication by 0x0d
586 pxor @y[7], @y[4]
587 pxor @t[4], @y[7]
588 pshufd \$0x93, @t[6], @t[6]
589 pxor @t[0], @y[2]
590 pxor @t[5], @y[7]
591 pxor @t[2], @y[2]
592 pshufd \$0x93, @t[7], @t[7]
593
594 pxor @y[1], @y[3]
595 pxor @t[1], @y[1]
596 pxor @t[0], @y[0]
597 pxor @t[0], @y[3]
598 pxor @t[5], @y[1]
599 pxor @t[5], @y[0]
600 pxor @t[7], @y[1]
601 pshufd \$0x93, @t[0], @t[0]
602 pxor @t[6], @y[0]
603 pxor @y[1], @y[3]
604 pxor @t[1], @y[4]
605 pshufd \$0x93, @t[1], @t[1]
606
607 pxor @t[7], @y[7]
608 pxor @t[2], @y[4]
609 pxor @t[2], @y[5]
610 pshufd \$0x93, @t[2], @t[2]
611 pxor @t[6], @y[2]
612 pxor @t[3], @t[6] # clobber t[6]
613 pxor @y[7], @y[4]
614 pxor @t[6], @y[3]
615
616 pxor @t[6], @y[6]
617 pxor @t[5], @y[5]
618 pxor @t[4], @y[6]
619 pshufd \$0x93, @t[4], @t[4]
620 pxor @t[6], @y[5]
621 pxor @t[7], @y[6]
622 pxor @t[3], @t[6] # restore t[6]
623
624 pshufd \$0x93, @t[5], @t[5]
625 pshufd \$0x93, @t[6], @t[6]
626 pshufd \$0x93, @t[7], @t[7]
627 pshufd \$0x93, @t[3], @t[3]
628
629 # multiplication by 0x09
630 pxor @y[1], @y[4]
631 pxor @y[1], @t[1] # t[1]=y[1]
632 pxor @t[5], @t[0] # clobber t[0]
633 pxor @t[5], @t[1]
634 pxor @t[0], @y[3]
635 pxor @y[0], @t[0] # t[0]=y[0]
636 pxor @t[6], @t[1]
637 pxor @t[7], @t[6] # clobber t[6]
638 pxor @t[1], @y[4]
639 pxor @t[4], @y[7]
640 pxor @y[4], @t[4] # t[4]=y[4]
641 pxor @t[3], @y[6]
642 pxor @y[3], @t[3] # t[3]=y[3]
643 pxor @t[2], @y[5]
644 pxor @y[2], @t[2] # t[2]=y[2]
645 pxor @t[7], @t[3]
646 pxor @y[5], @t[5] # t[5]=y[5]
647 pxor @t[6], @t[2]
648 pxor @t[6], @t[5]
649 pxor @y[6], @t[6] # t[6]=y[6]
650 pxor @y[7], @t[7] # t[7]=y[7]
651
652 movdqa @t[0],@XMM[0]
653 movdqa @t[1],@XMM[1]
654 movdqa @t[2],@XMM[2]
655 movdqa @t[3],@XMM[3]
656 movdqa @t[4],@XMM[4]
657 movdqa @t[5],@XMM[5]
658 movdqa @t[6],@XMM[6]
659 movdqa @t[7],@XMM[7]
660___
661}
662
663sub aesenc { # not used
664my @b=@_[0..7];
665my @t=@_[8..15];
666$code.=<<___;
667 movdqa 0x30($const),@t[0] # .LSR
668___
669 &ShiftRows (@b,@t[0]);
670 &Sbox (@b,@t);
671 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
672}
673
674sub aesenclast { # not used
675my @b=@_[0..7];
676my @t=@_[8..15];
677$code.=<<___;
678 movdqa 0x40($const),@t[0] # .LSRM0
679___
680 &ShiftRows (@b,@t[0]);
681 &Sbox (@b,@t);
682$code.=<<___;
683 pxor 0x00($key),@b[0]
684 pxor 0x10($key),@b[1]
685 pxor 0x20($key),@b[4]
686 pxor 0x30($key),@b[6]
687 pxor 0x40($key),@b[3]
688 pxor 0x50($key),@b[7]
689 pxor 0x60($key),@b[2]
690 pxor 0x70($key),@b[5]
691___
692}
693
694sub swapmove {
695my ($a,$b,$n,$mask,$t)=@_;
696$code.=<<___;
697 movdqa $b,$t
698 psrlq \$$n,$b
699 pxor $a,$b
700 pand $mask,$b
701 pxor $b,$a
702 psllq \$$n,$b
703 pxor $t,$b
704___
705}
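# swapmove is the classic SWAPMOVE/delta-swap primitive; in scalar form:
#
#	$t = (($b >> $n) ^ $a) & $mask;
#	$a ^= $t;
#	$b ^= $t << $n;
#
# i.e. the bits of $a selected by $mask are exchanged with the bits of
# $b selected by ($mask << $n); swapmove2x below is simply two of these
# interleaved for better scheduling.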
706sub swapmove2x {
707my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
708$code.=<<___;
709 movdqa $b0,$t0
710 psrlq \$$n,$b0
711 movdqa $b1,$t1
712 psrlq \$$n,$b1
713 pxor $a0,$b0
714 pxor $a1,$b1
715 pand $mask,$b0
716 pand $mask,$b1
717 pxor $b0,$a0
718 psllq \$$n,$b0
719 pxor $b1,$a1
720 psllq \$$n,$b1
721 pxor $t0,$b0
722 pxor $t1,$b1
723___
724}
725
726sub bitslice {
727my @x=reverse(@_[0..7]);
728my ($t0,$t1,$t2,$t3)=@_[8..11];
729$code.=<<___;
730 movdqa 0x00($const),$t0 # .LBS0
731 movdqa 0x10($const),$t1 # .LBS1
732___
733 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
734 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
735$code.=<<___;
736 movdqa 0x20($const),$t0 # .LBS2
737___
738 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
739 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
740
741 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
742 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
743}
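# Three swapmove passes with strides 1, 2 and 4 transpose the 8x8 bit
# matrix formed by corresponding bytes of the eight registers: on exit
# register i holds bit plane i of the whole eight-block batch, which is
# the form the Sbox/InvSbox circuits above operate on.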
744
745$code.=<<___;
746.text
747
748.extern asm_AES_encrypt
749.extern asm_AES_decrypt
750
751.type _bsaes_encrypt8,\@abi-omnipotent
752.align 64
753_bsaes_encrypt8:
754 lea .LBS0(%rip), $const # constants table
755
756 movdqa ($key), @XMM[9] # round 0 key
757 lea 0x10($key), $key
758 movdqa 0x50($const), @XMM[8] # .LM0SR
759 pxor @XMM[9], @XMM[0] # xor with round0 key
760 pxor @XMM[9], @XMM[1]
761 pshufb @XMM[8], @XMM[0]
762 pxor @XMM[9], @XMM[2]
763 pshufb @XMM[8], @XMM[1]
764 pxor @XMM[9], @XMM[3]
765 pshufb @XMM[8], @XMM[2]
766 pxor @XMM[9], @XMM[4]
767 pshufb @XMM[8], @XMM[3]
768 pxor @XMM[9], @XMM[5]
769 pshufb @XMM[8], @XMM[4]
770 pxor @XMM[9], @XMM[6]
771 pshufb @XMM[8], @XMM[5]
772 pxor @XMM[9], @XMM[7]
773 pshufb @XMM[8], @XMM[6]
774 pshufb @XMM[8], @XMM[7]
775_bsaes_encrypt8_bitslice:
776___
777 &bitslice (@XMM[0..7, 8..11]);
778$code.=<<___;
779 dec $rounds
780 jmp .Lenc_sbox
781.align 16
782.Lenc_loop:
783___
784 &ShiftRows (@XMM[0..7, 8]);
785$code.=".Lenc_sbox:\n";
786 &Sbox (@XMM[0..7, 8..15]);
787$code.=<<___;
788 dec $rounds
789 jl .Lenc_done
790___
791 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
792$code.=<<___;
793 movdqa 0x30($const), @XMM[8] # .LSR
794 jnz .Lenc_loop
795 movdqa 0x40($const), @XMM[8] # .LSRM0
796 jmp .Lenc_loop
797.align 16
798.Lenc_done:
799___
800 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
801 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
802$code.=<<___;
803 movdqa ($key), @XMM[8] # last round key
804 pxor @XMM[8], @XMM[4]
805 pxor @XMM[8], @XMM[6]
806 pxor @XMM[8], @XMM[3]
807 pxor @XMM[8], @XMM[7]
808 pxor @XMM[8], @XMM[2]
809 pxor @XMM[8], @XMM[5]
810 pxor @XMM[8], @XMM[0]
811 pxor @XMM[8], @XMM[1]
812 ret
813.size _bsaes_encrypt8,.-_bsaes_encrypt8
814
815.type _bsaes_decrypt8,\@abi-omnipotent
816.align 64
817_bsaes_decrypt8:
818 lea .LBS0(%rip), $const # constants table
819
820 movdqa ($key), @XMM[9] # round 0 key
821 lea 0x10($key), $key
822 movdqa -0x30($const), @XMM[8] # .LM0ISR
823 pxor @XMM[9], @XMM[0] # xor with round0 key
824 pxor @XMM[9], @XMM[1]
825 pshufb @XMM[8], @XMM[0]
826 pxor @XMM[9], @XMM[2]
827 pshufb @XMM[8], @XMM[1]
828 pxor @XMM[9], @XMM[3]
829 pshufb @XMM[8], @XMM[2]
830 pxor @XMM[9], @XMM[4]
831 pshufb @XMM[8], @XMM[3]
832 pxor @XMM[9], @XMM[5]
833 pshufb @XMM[8], @XMM[4]
834 pxor @XMM[9], @XMM[6]
835 pshufb @XMM[8], @XMM[5]
836 pxor @XMM[9], @XMM[7]
837 pshufb @XMM[8], @XMM[6]
838 pshufb @XMM[8], @XMM[7]
839___
840 &bitslice (@XMM[0..7, 8..11]);
841$code.=<<___;
842 dec $rounds
843 jmp .Ldec_sbox
844.align 16
845.Ldec_loop:
846___
847 &ShiftRows (@XMM[0..7, 8]);
848$code.=".Ldec_sbox:\n";
849 &InvSbox (@XMM[0..7, 8..15]);
850$code.=<<___;
851 dec $rounds
852 jl .Ldec_done
853___
854 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
855$code.=<<___;
856 movdqa -0x10($const), @XMM[8] # .LISR
857 jnz .Ldec_loop
858 movdqa -0x20($const), @XMM[8] # .LISRM0
859 jmp .Ldec_loop
860.align 16
861.Ldec_done:
862___
863 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
864$code.=<<___;
865 movdqa ($key), @XMM[8] # last round key
866 pxor @XMM[8], @XMM[6]
867 pxor @XMM[8], @XMM[4]
868 pxor @XMM[8], @XMM[2]
869 pxor @XMM[8], @XMM[7]
870 pxor @XMM[8], @XMM[3]
871 pxor @XMM[8], @XMM[5]
872 pxor @XMM[8], @XMM[0]
873 pxor @XMM[8], @XMM[1]
874 ret
875.size _bsaes_decrypt8,.-_bsaes_decrypt8
876___
877}
878{
879my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
880
881sub bitslice_key {
882my @x=reverse(@_[0..7]);
883my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
884
885 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
886$code.=<<___;
887 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
888 movdqa @x[0], @x[2]
889 movdqa @x[1], @x[3]
890___
891 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
892
893 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
894$code.=<<___;
895 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
896 movdqa @x[0], @x[4]
897 movdqa @x[2], @x[6]
898 movdqa @x[1], @x[5]
899 movdqa @x[3], @x[7]
900___
901 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
902 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
903}
904
905$code.=<<___;
906.type _bsaes_key_convert,\@abi-omnipotent
907.align 16
908_bsaes_key_convert:
909 lea .Lmasks(%rip), $const
910 movdqu ($inp), %xmm7 # load round 0 key
911 lea 0x10($inp), $inp
912 movdqa 0x00($const), %xmm0 # 0x01...
913 movdqa 0x10($const), %xmm1 # 0x02...
914 movdqa 0x20($const), %xmm2 # 0x04...
915 movdqa 0x30($const), %xmm3 # 0x08...
916 movdqa 0x40($const), %xmm4 # .LM0
917 pcmpeqd %xmm5, %xmm5 # .LNOT
918
919 movdqu ($inp), %xmm6 # load round 1 key
920 movdqa %xmm7, ($out) # save round 0 key
921 lea 0x10($out), $out
922 dec $rounds
923 jmp .Lkey_loop
924.align 16
925.Lkey_loop:
926 pshufb %xmm4, %xmm6 # .LM0
927
928 movdqa %xmm0, %xmm8
929 movdqa %xmm1, %xmm9
930
931 pand %xmm6, %xmm8
932 pand %xmm6, %xmm9
933 movdqa %xmm2, %xmm10
934 pcmpeqb %xmm0, %xmm8
935 psllq \$4, %xmm0 # 0x10...
936 movdqa %xmm3, %xmm11
937 pcmpeqb %xmm1, %xmm9
938 psllq \$4, %xmm1 # 0x20...
939
940 pand %xmm6, %xmm10
941 pand %xmm6, %xmm11
942 movdqa %xmm0, %xmm12
943 pcmpeqb %xmm2, %xmm10
944 psllq \$4, %xmm2 # 0x40...
945 movdqa %xmm1, %xmm13
946 pcmpeqb %xmm3, %xmm11
947 psllq \$4, %xmm3 # 0x80...
948
949 movdqa %xmm2, %xmm14
950 movdqa %xmm3, %xmm15
951 pxor %xmm5, %xmm8 # "pnot"
952 pxor %xmm5, %xmm9
953
954 pand %xmm6, %xmm12
955 pand %xmm6, %xmm13
956 movdqa %xmm8, 0x00($out) # write bit-sliced round key
957 pcmpeqb %xmm0, %xmm12
958 psrlq \$4, %xmm0 # 0x01...
959 movdqa %xmm9, 0x10($out)
960 pcmpeqb %xmm1, %xmm13
961 psrlq \$4, %xmm1 # 0x02...
962 lea 0x10($inp), $inp
963
964 pand %xmm6, %xmm14
965 pand %xmm6, %xmm15
966 movdqa %xmm10, 0x20($out)
967 pcmpeqb %xmm2, %xmm14
968 psrlq \$4, %xmm2 # 0x04...
969 movdqa %xmm11, 0x30($out)
970 pcmpeqb %xmm3, %xmm15
971 psrlq \$4, %xmm3 # 0x08...
972 movdqu ($inp), %xmm6 # load next round key
973
974 pxor %xmm5, %xmm13 # "pnot"
975 pxor %xmm5, %xmm14
976 movdqa %xmm12, 0x40($out)
977 movdqa %xmm13, 0x50($out)
978 movdqa %xmm14, 0x60($out)
979 movdqa %xmm15, 0x70($out)
980 lea 0x80($out),$out
981 dec $rounds
982 jnz .Lkey_loop
983
984 movdqa 0x50($const), %xmm7 # .L63
985 #movdqa %xmm6, ($out) # don't save last round key
986 ret
987.size _bsaes_key_convert,.-_bsaes_key_convert
988___
989}
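# _bsaes_key_convert expands every inner round key into eight 128-bit
# bit planes: pshufb with .LM0 reorders the key bytes to match the
# bitsliced state, pand/pcmpeqb turn each of the eight bit positions
# into an all-zeros/all-ones mask, and planes 0, 1, 5 and 6 are
# complemented (the "pnot"), which pre-folds the S-box affine constant
# 0x63 into the schedule. .L63 is returned in %xmm7 so the caller can
# fold the missing 0x63 into the last round key (encryption) or the
# round 0 key (decryption).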
990
991if (0 && !$win64) {	# the following four functions are an unsupported
992			# interface used only for benchmarking...
993$code.=<<___;
994.globl bsaes_enc_key_convert
995.type bsaes_enc_key_convert,\@function,2
996.align 16
997bsaes_enc_key_convert:
998 mov 240($inp),%r10d # pass rounds
999 mov $inp,%rcx # pass key
1000 mov $out,%rax # pass key schedule
1001 call _bsaes_key_convert
1002 pxor %xmm6,%xmm7 # fix up last round key
1003 movdqa %xmm7,(%rax) # save last round key
1004 ret
1005.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1006
1007.globl bsaes_encrypt_128
1008.type bsaes_encrypt_128,\@function,4
1009.align 16
1010bsaes_encrypt_128:
1011.Lenc128_loop:
1012 movdqu 0x00($inp), @XMM[0] # load input
1013 movdqu 0x10($inp), @XMM[1]
1014 movdqu 0x20($inp), @XMM[2]
1015 movdqu 0x30($inp), @XMM[3]
1016 movdqu 0x40($inp), @XMM[4]
1017 movdqu 0x50($inp), @XMM[5]
1018 movdqu 0x60($inp), @XMM[6]
1019 movdqu 0x70($inp), @XMM[7]
1020 mov $key, %rax # pass the $key
1021 lea 0x80($inp), $inp
1022 mov \$10,%r10d
1023
1024 call _bsaes_encrypt8
1025
1026 movdqu @XMM[0], 0x00($out) # write output
1027 movdqu @XMM[1], 0x10($out)
1028 movdqu @XMM[4], 0x20($out)
1029 movdqu @XMM[6], 0x30($out)
1030 movdqu @XMM[3], 0x40($out)
1031 movdqu @XMM[7], 0x50($out)
1032 movdqu @XMM[2], 0x60($out)
1033 movdqu @XMM[5], 0x70($out)
1034 lea 0x80($out), $out
1035 sub \$0x80,$len
1036 ja .Lenc128_loop
1037 ret
1038.size bsaes_encrypt_128,.-bsaes_encrypt_128
1039
1040.globl bsaes_dec_key_convert
1041.type bsaes_dec_key_convert,\@function,2
1042.align 16
1043bsaes_dec_key_convert:
1044 mov 240($inp),%r10d # pass rounds
1045 mov $inp,%rcx # pass key
1046 mov $out,%rax # pass key schedule
1047 call _bsaes_key_convert
1048 pxor ($out),%xmm7 # fix up round 0 key
1049 movdqa %xmm6,(%rax) # save last round key
1050 movdqa %xmm7,($out)
1051 ret
1052.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1053
1054.globl bsaes_decrypt_128
1055.type bsaes_decrypt_128,\@function,4
1056.align 16
1057bsaes_decrypt_128:
1058.Ldec128_loop:
1059 movdqu 0x00($inp), @XMM[0] # load input
1060 movdqu 0x10($inp), @XMM[1]
1061 movdqu 0x20($inp), @XMM[2]
1062 movdqu 0x30($inp), @XMM[3]
1063 movdqu 0x40($inp), @XMM[4]
1064 movdqu 0x50($inp), @XMM[5]
1065 movdqu 0x60($inp), @XMM[6]
1066 movdqu 0x70($inp), @XMM[7]
1067 mov $key, %rax # pass the $key
1068 lea 0x80($inp), $inp
1069 mov \$10,%r10d
1070
1071 call _bsaes_decrypt8
1072
1073 movdqu @XMM[0], 0x00($out) # write output
1074 movdqu @XMM[1], 0x10($out)
1075 movdqu @XMM[6], 0x20($out)
1076 movdqu @XMM[4], 0x30($out)
1077 movdqu @XMM[2], 0x40($out)
1078 movdqu @XMM[7], 0x50($out)
1079 movdqu @XMM[3], 0x60($out)
1080 movdqu @XMM[5], 0x70($out)
1081 lea 0x80($out), $out
1082 sub \$0x80,$len
1083 ja .Ldec128_loop
1084 ret
1085.size bsaes_decrypt_128,.-bsaes_decrypt_128
1086___
1087}
1088{
1089######################################################################
1090#
1091# OpenSSL interface
1092#
1093my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1094 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1095my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1096
1097if ($ecb) {
1098$code.=<<___;
1099.globl bsaes_ecb_encrypt_blocks
1100.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1101.align 16
1102bsaes_ecb_encrypt_blocks:
1103 mov %rsp, %rax
1104.Lecb_enc_prologue:
1105 push %rbp
1106 push %rbx
1107 push %r12
1108 push %r13
1109 push %r14
1110 push %r15
1111 lea -0x48(%rsp),%rsp
1112___
1113$code.=<<___ if ($win64);
1114 lea -0xa0(%rsp), %rsp
1115 movaps %xmm6, 0x40(%rsp)
1116 movaps %xmm7, 0x50(%rsp)
1117 movaps %xmm8, 0x60(%rsp)
1118 movaps %xmm9, 0x70(%rsp)
1119 movaps %xmm10, 0x80(%rsp)
1120 movaps %xmm11, 0x90(%rsp)
1121 movaps %xmm12, 0xa0(%rsp)
1122 movaps %xmm13, 0xb0(%rsp)
1123 movaps %xmm14, 0xc0(%rsp)
1124 movaps %xmm15, 0xd0(%rsp)
1125.Lecb_enc_body:
1126___
1127$code.=<<___;
1128 mov %rsp,%rbp # backup %rsp
1129 mov 240($arg4),%eax # rounds
1130 mov $arg1,$inp # backup arguments
1131 mov $arg2,$out
1132 mov $arg3,$len
1133 mov $arg4,$key
1134 cmp \$8,$arg3
1135 jb .Lecb_enc_short
1136
1137 mov %eax,%ebx # backup rounds
1138 shl \$7,%rax # 128 bytes per inner round key
1139 sub \$`128-32`,%rax # size of bit-sliced key schedule
1140 sub %rax,%rsp
1141 mov %rsp,%rax # pass key schedule
1142 mov $key,%rcx # pass key
1143 mov %ebx,%r10d # pass rounds
1144 call _bsaes_key_convert
1145 pxor %xmm6,%xmm7 # fix up last round key
1146 movdqa %xmm7,(%rax) # save last round key
1147
1148 sub \$8,$len
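	# The main loop below encrypts eight blocks per iteration; a tail
	# of 1..7 blocks is dispatched further down but still runs the
	# full eight-wide kernel, with the surplus registers as don't-cares.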
1149.Lecb_enc_loop:
1150 movdqu 0x00($inp), @XMM[0] # load input
1151 movdqu 0x10($inp), @XMM[1]
1152 movdqu 0x20($inp), @XMM[2]
1153 movdqu 0x30($inp), @XMM[3]
1154 movdqu 0x40($inp), @XMM[4]
1155 movdqu 0x50($inp), @XMM[5]
1156 mov %rsp, %rax # pass key schedule
1157 movdqu 0x60($inp), @XMM[6]
1158 mov %ebx,%r10d # pass rounds
1159 movdqu 0x70($inp), @XMM[7]
1160 lea 0x80($inp), $inp
1161
1162 call _bsaes_encrypt8
1163
1164 movdqu @XMM[0], 0x00($out) # write output
1165 movdqu @XMM[1], 0x10($out)
1166 movdqu @XMM[4], 0x20($out)
1167 movdqu @XMM[6], 0x30($out)
1168 movdqu @XMM[3], 0x40($out)
1169 movdqu @XMM[7], 0x50($out)
1170 movdqu @XMM[2], 0x60($out)
1171 movdqu @XMM[5], 0x70($out)
1172 lea 0x80($out), $out
1173 sub \$8,$len
1174 jnc .Lecb_enc_loop
1175
1176 add \$8,$len
1177 jz .Lecb_enc_done
1178
1179 movdqu 0x00($inp), @XMM[0] # load input
1180 mov %rsp, %rax # pass key schedule
1181 mov %ebx,%r10d # pass rounds
1182 cmp \$2,$len
1183 jb .Lecb_enc_one
1184 movdqu 0x10($inp), @XMM[1]
1185 je .Lecb_enc_two
1186 movdqu 0x20($inp), @XMM[2]
1187 cmp \$4,$len
1188 jb .Lecb_enc_three
1189 movdqu 0x30($inp), @XMM[3]
1190 je .Lecb_enc_four
1191 movdqu 0x40($inp), @XMM[4]
1192 cmp \$6,$len
1193 jb .Lecb_enc_five
1194 movdqu 0x50($inp), @XMM[5]
1195 je .Lecb_enc_six
1196 movdqu 0x60($inp), @XMM[6]
1197 call _bsaes_encrypt8
1198 movdqu @XMM[0], 0x00($out) # write output
1199 movdqu @XMM[1], 0x10($out)
1200 movdqu @XMM[4], 0x20($out)
1201 movdqu @XMM[6], 0x30($out)
1202 movdqu @XMM[3], 0x40($out)
1203 movdqu @XMM[7], 0x50($out)
1204 movdqu @XMM[2], 0x60($out)
1205 jmp .Lecb_enc_done
1206.align 16
1207.Lecb_enc_six:
1208 call _bsaes_encrypt8
1209 movdqu @XMM[0], 0x00($out) # write output
1210 movdqu @XMM[1], 0x10($out)
1211 movdqu @XMM[4], 0x20($out)
1212 movdqu @XMM[6], 0x30($out)
1213 movdqu @XMM[3], 0x40($out)
1214 movdqu @XMM[7], 0x50($out)
1215 jmp .Lecb_enc_done
1216.align 16
1217.Lecb_enc_five:
1218 call _bsaes_encrypt8
1219 movdqu @XMM[0], 0x00($out) # write output
1220 movdqu @XMM[1], 0x10($out)
1221 movdqu @XMM[4], 0x20($out)
1222 movdqu @XMM[6], 0x30($out)
1223 movdqu @XMM[3], 0x40($out)
1224 jmp .Lecb_enc_done
1225.align 16
1226.Lecb_enc_four:
1227 call _bsaes_encrypt8
1228 movdqu @XMM[0], 0x00($out) # write output
1229 movdqu @XMM[1], 0x10($out)
1230 movdqu @XMM[4], 0x20($out)
1231 movdqu @XMM[6], 0x30($out)
1232 jmp .Lecb_enc_done
1233.align 16
1234.Lecb_enc_three:
1235 call _bsaes_encrypt8
1236 movdqu @XMM[0], 0x00($out) # write output
1237 movdqu @XMM[1], 0x10($out)
1238 movdqu @XMM[4], 0x20($out)
1239 jmp .Lecb_enc_done
1240.align 16
1241.Lecb_enc_two:
1242 call _bsaes_encrypt8
1243 movdqu @XMM[0], 0x00($out) # write output
1244 movdqu @XMM[1], 0x10($out)
1245 jmp .Lecb_enc_done
1246.align 16
1247.Lecb_enc_one:
1248 call _bsaes_encrypt8
1249 movdqu @XMM[0], 0x00($out) # write output
1250 jmp .Lecb_enc_done
1251.align 16
1252.Lecb_enc_short:
1253 lea ($inp), $arg1
1254 lea ($out), $arg2
1255 lea ($key), $arg3
1256 call asm_AES_encrypt
1257 lea 16($inp), $inp
1258 lea 16($out), $out
1259 dec $len
1260 jnz .Lecb_enc_short
1261
1262.Lecb_enc_done:
1263 lea (%rsp),%rax
1264 pxor %xmm0, %xmm0
1265.Lecb_enc_bzero: # wipe key schedule [if any]
1266 movdqa %xmm0, 0x00(%rax)
1267 movdqa %xmm0, 0x10(%rax)
1268 lea 0x20(%rax), %rax
1269 cmp %rax, %rbp
1270 jb .Lecb_enc_bzero
1271
1272 lea (%rbp),%rsp # restore %rsp
1273___
1274$code.=<<___ if ($win64);
1275 movaps 0x40(%rbp), %xmm6
1276 movaps 0x50(%rbp), %xmm7
1277 movaps 0x60(%rbp), %xmm8
1278 movaps 0x70(%rbp), %xmm9
1279 movaps 0x80(%rbp), %xmm10
1280 movaps 0x90(%rbp), %xmm11
1281 movaps 0xa0(%rbp), %xmm12
1282 movaps 0xb0(%rbp), %xmm13
1283 movaps 0xc0(%rbp), %xmm14
1284 movaps 0xd0(%rbp), %xmm15
1285 lea 0xa0(%rbp), %rsp
1286___
1287$code.=<<___;
1288 mov 0x48(%rsp), %r15
1289 mov 0x50(%rsp), %r14
1290 mov 0x58(%rsp), %r13
1291 mov 0x60(%rsp), %r12
1292 mov 0x68(%rsp), %rbx
1293 mov 0x70(%rsp), %rax
1294 lea 0x78(%rsp), %rsp
1295 mov %rax, %rbp
1296.Lecb_enc_epilogue:
1297 ret
1298.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1299
1300.globl bsaes_ecb_decrypt_blocks
1301.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1302.align 16
1303bsaes_ecb_decrypt_blocks:
1304 mov %rsp, %rax
1305.Lecb_dec_prologue:
1306 push %rbp
1307 push %rbx
1308 push %r12
1309 push %r13
1310 push %r14
1311 push %r15
1312 lea -0x48(%rsp),%rsp
1313___
1314$code.=<<___ if ($win64);
1315 lea -0xa0(%rsp), %rsp
1316 movaps %xmm6, 0x40(%rsp)
1317 movaps %xmm7, 0x50(%rsp)
1318 movaps %xmm8, 0x60(%rsp)
1319 movaps %xmm9, 0x70(%rsp)
1320 movaps %xmm10, 0x80(%rsp)
1321 movaps %xmm11, 0x90(%rsp)
1322 movaps %xmm12, 0xa0(%rsp)
1323 movaps %xmm13, 0xb0(%rsp)
1324 movaps %xmm14, 0xc0(%rsp)
1325 movaps %xmm15, 0xd0(%rsp)
1326.Lecb_dec_body:
1327___
1328$code.=<<___;
1329 mov %rsp,%rbp # backup %rsp
1330 mov 240($arg4),%eax # rounds
1331 mov $arg1,$inp # backup arguments
1332 mov $arg2,$out
1333 mov $arg3,$len
1334 mov $arg4,$key
1335 cmp \$8,$arg3
1336 jb .Lecb_dec_short
1337
1338 mov %eax,%ebx # backup rounds
1339 shl \$7,%rax # 128 bytes per inner round key
1340 sub \$`128-32`,%rax # size of bit-sliced key schedule
1341 sub %rax,%rsp
1342 mov %rsp,%rax # pass key schedule
1343 mov $key,%rcx # pass key
1344 mov %ebx,%r10d # pass rounds
1345 call _bsaes_key_convert
1346	pxor	(%rsp),%xmm7		# fix up round 0 key
1347 movdqa %xmm6,(%rax) # save last round key
1348 movdqa %xmm7,(%rsp)
1349
1350 sub \$8,$len
1351.Lecb_dec_loop:
1352 movdqu 0x00($inp), @XMM[0] # load input
1353 movdqu 0x10($inp), @XMM[1]
1354 movdqu 0x20($inp), @XMM[2]
1355 movdqu 0x30($inp), @XMM[3]
1356 movdqu 0x40($inp), @XMM[4]
1357 movdqu 0x50($inp), @XMM[5]
1358 mov %rsp, %rax # pass key schedule
1359 movdqu 0x60($inp), @XMM[6]
1360 mov %ebx,%r10d # pass rounds
1361 movdqu 0x70($inp), @XMM[7]
1362 lea 0x80($inp), $inp
1363
1364 call _bsaes_decrypt8
1365
1366 movdqu @XMM[0], 0x00($out) # write output
1367 movdqu @XMM[1], 0x10($out)
1368 movdqu @XMM[6], 0x20($out)
1369 movdqu @XMM[4], 0x30($out)
1370 movdqu @XMM[2], 0x40($out)
1371 movdqu @XMM[7], 0x50($out)
1372 movdqu @XMM[3], 0x60($out)
1373 movdqu @XMM[5], 0x70($out)
1374 lea 0x80($out), $out
1375 sub \$8,$len
1376 jnc .Lecb_dec_loop
1377
1378 add \$8,$len
1379 jz .Lecb_dec_done
1380
1381 movdqu 0x00($inp), @XMM[0] # load input
1382 mov %rsp, %rax # pass key schedule
1383 mov %ebx,%r10d # pass rounds
1384 cmp \$2,$len
1385 jb .Lecb_dec_one
1386 movdqu 0x10($inp), @XMM[1]
1387 je .Lecb_dec_two
1388 movdqu 0x20($inp), @XMM[2]
1389 cmp \$4,$len
1390 jb .Lecb_dec_three
1391 movdqu 0x30($inp), @XMM[3]
1392 je .Lecb_dec_four
1393 movdqu 0x40($inp), @XMM[4]
1394 cmp \$6,$len
1395 jb .Lecb_dec_five
1396 movdqu 0x50($inp), @XMM[5]
1397 je .Lecb_dec_six
1398 movdqu 0x60($inp), @XMM[6]
1399 call _bsaes_decrypt8
1400 movdqu @XMM[0], 0x00($out) # write output
1401 movdqu @XMM[1], 0x10($out)
1402 movdqu @XMM[6], 0x20($out)
1403 movdqu @XMM[4], 0x30($out)
1404 movdqu @XMM[2], 0x40($out)
1405 movdqu @XMM[7], 0x50($out)
1406 movdqu @XMM[3], 0x60($out)
1407 jmp .Lecb_dec_done
1408.align 16
1409.Lecb_dec_six:
1410 call _bsaes_decrypt8
1411 movdqu @XMM[0], 0x00($out) # write output
1412 movdqu @XMM[1], 0x10($out)
1413 movdqu @XMM[6], 0x20($out)
1414 movdqu @XMM[4], 0x30($out)
1415 movdqu @XMM[2], 0x40($out)
1416 movdqu @XMM[7], 0x50($out)
1417 jmp .Lecb_dec_done
1418.align 16
1419.Lecb_dec_five:
1420 call _bsaes_decrypt8
1421 movdqu @XMM[0], 0x00($out) # write output
1422 movdqu @XMM[1], 0x10($out)
1423 movdqu @XMM[6], 0x20($out)
1424 movdqu @XMM[4], 0x30($out)
1425 movdqu @XMM[2], 0x40($out)
1426 jmp .Lecb_dec_done
1427.align 16
1428.Lecb_dec_four:
1429 call _bsaes_decrypt8
1430 movdqu @XMM[0], 0x00($out) # write output
1431 movdqu @XMM[1], 0x10($out)
1432 movdqu @XMM[6], 0x20($out)
1433 movdqu @XMM[4], 0x30($out)
1434 jmp .Lecb_dec_done
1435.align 16
1436.Lecb_dec_three:
1437 call _bsaes_decrypt8
1438 movdqu @XMM[0], 0x00($out) # write output
1439 movdqu @XMM[1], 0x10($out)
1440 movdqu @XMM[6], 0x20($out)
1441 jmp .Lecb_dec_done
1442.align 16
1443.Lecb_dec_two:
1444 call _bsaes_decrypt8
1445 movdqu @XMM[0], 0x00($out) # write output
1446 movdqu @XMM[1], 0x10($out)
1447 jmp .Lecb_dec_done
1448.align 16
1449.Lecb_dec_one:
1450 call _bsaes_decrypt8
1451 movdqu @XMM[0], 0x00($out) # write output
1452 jmp .Lecb_dec_done
1453.align 16
1454.Lecb_dec_short:
1455 lea ($inp), $arg1
1456 lea ($out), $arg2
1457 lea ($key), $arg3
1458 call asm_AES_decrypt
1459 lea 16($inp), $inp
1460 lea 16($out), $out
1461 dec $len
1462 jnz .Lecb_dec_short
1463
1464.Lecb_dec_done:
1465 lea (%rsp),%rax
1466 pxor %xmm0, %xmm0
1467.Lecb_dec_bzero: # wipe key schedule [if any]
1468 movdqa %xmm0, 0x00(%rax)
1469 movdqa %xmm0, 0x10(%rax)
1470 lea 0x20(%rax), %rax
1471 cmp %rax, %rbp
1472 jb .Lecb_dec_bzero
1473
1474 lea (%rbp),%rsp # restore %rsp
1475___
1476$code.=<<___ if ($win64);
1477 movaps 0x40(%rbp), %xmm6
1478 movaps 0x50(%rbp), %xmm7
1479 movaps 0x60(%rbp), %xmm8
1480 movaps 0x70(%rbp), %xmm9
1481 movaps 0x80(%rbp), %xmm10
1482 movaps 0x90(%rbp), %xmm11
1483 movaps 0xa0(%rbp), %xmm12
1484 movaps 0xb0(%rbp), %xmm13
1485 movaps 0xc0(%rbp), %xmm14
1486 movaps 0xd0(%rbp), %xmm15
1487 lea 0xa0(%rbp), %rsp
1488___
1489$code.=<<___;
1490 mov 0x48(%rsp), %r15
1491 mov 0x50(%rsp), %r14
1492 mov 0x58(%rsp), %r13
1493 mov 0x60(%rsp), %r12
1494 mov 0x68(%rsp), %rbx
1495 mov 0x70(%rsp), %rax
1496 lea 0x78(%rsp), %rsp
1497 mov %rax, %rbp
1498.Lecb_dec_epilogue:
1499 ret
1500.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1501___
1502}
1503$code.=<<___;
1504.extern asm_AES_cbc_encrypt
1505.globl bsaes_cbc_encrypt
1506.type bsaes_cbc_encrypt,\@abi-omnipotent
1507.align 16
1508bsaes_cbc_encrypt:
1509___
1510$code.=<<___ if ($win64);
1511 mov 48(%rsp),$arg6 # pull direction flag
1512___
1513$code.=<<___;
1514	cmp	\$0,$arg6		# encrypting?
1515	jne	asm_AES_cbc_encrypt	# bit-slicing helps CBC decryption only
1516	cmp	\$128,$arg3
1517	jb	asm_AES_cbc_encrypt	# <8 blocks, conversion is not worth it
1518
1519 mov %rsp, %rax
1520.Lcbc_dec_prologue:
1521 push %rbp
1522 push %rbx
1523 push %r12
1524 push %r13
1525 push %r14
1526 push %r15
1527 lea -0x48(%rsp), %rsp
1528___
1529$code.=<<___ if ($win64);
1530 mov 0xa0(%rsp),$arg5 # pull ivp
1531 lea -0xa0(%rsp), %rsp
1532 movaps %xmm6, 0x40(%rsp)
1533 movaps %xmm7, 0x50(%rsp)
1534 movaps %xmm8, 0x60(%rsp)
1535 movaps %xmm9, 0x70(%rsp)
1536 movaps %xmm10, 0x80(%rsp)
1537 movaps %xmm11, 0x90(%rsp)
1538 movaps %xmm12, 0xa0(%rsp)
1539 movaps %xmm13, 0xb0(%rsp)
1540 movaps %xmm14, 0xc0(%rsp)
1541 movaps %xmm15, 0xd0(%rsp)
1542.Lcbc_dec_body:
1543___
1544$code.=<<___;
1545 mov %rsp, %rbp # backup %rsp
1546 mov 240($arg4), %eax # rounds
1547 mov $arg1, $inp # backup arguments
1548 mov $arg2, $out
1549 mov $arg3, $len
1550 mov $arg4, $key
1551 mov $arg5, %rbx
1552 shr \$4, $len # bytes to blocks
1553
1554 mov %eax, %edx # rounds
1555 shl \$7, %rax # 128 bytes per inner round key
1556 sub \$`128-32`, %rax # size of bit-sliced key schedule
1557 sub %rax, %rsp
1558
1559 mov %rsp, %rax # pass key schedule
1560 mov $key, %rcx # pass key
1561 mov %edx, %r10d # pass rounds
1562 call _bsaes_key_convert
1563	pxor	(%rsp),%xmm7		# fix up round 0 key
1564 movdqa %xmm6,(%rax) # save last round key
1565 movdqa %xmm7,(%rsp)
1566
1567 movdqu (%rbx), @XMM[15] # load IV
1568 sub \$8,$len
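	# CBC decryption parallelizes: eight blocks are decrypted in one
	# go, then the first is XORed with the chaining IV and each other
	# one with the preceding ciphertext, re-loaded from the input so
	# that in-place operation (input == output) still works.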
1569.Lcbc_dec_loop:
1570 movdqu 0x00($inp), @XMM[0] # load input
1571 movdqu 0x10($inp), @XMM[1]
1572 movdqu 0x20($inp), @XMM[2]
1573 movdqu 0x30($inp), @XMM[3]
1574 movdqu 0x40($inp), @XMM[4]
1575 movdqu 0x50($inp), @XMM[5]
1576 mov %rsp, %rax # pass key schedule
1577 movdqu 0x60($inp), @XMM[6]
1578 mov %edx,%r10d # pass rounds
1579 movdqu 0x70($inp), @XMM[7]
1580 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1581
1582 call _bsaes_decrypt8
1583
1584 pxor 0x20(%rbp), @XMM[0] # ^= IV
1585 movdqu 0x00($inp), @XMM[8] # re-load input
1586 movdqu 0x10($inp), @XMM[9]
1587 pxor @XMM[8], @XMM[1]
1588 movdqu 0x20($inp), @XMM[10]
1589 pxor @XMM[9], @XMM[6]
1590 movdqu 0x30($inp), @XMM[11]
1591 pxor @XMM[10], @XMM[4]
1592 movdqu 0x40($inp), @XMM[12]
1593 pxor @XMM[11], @XMM[2]
1594 movdqu 0x50($inp), @XMM[13]
1595 pxor @XMM[12], @XMM[7]
1596 movdqu 0x60($inp), @XMM[14]
1597 pxor @XMM[13], @XMM[3]
1598 movdqu 0x70($inp), @XMM[15] # IV
1599 pxor @XMM[14], @XMM[5]
1600 movdqu @XMM[0], 0x00($out) # write output
1601 lea 0x80($inp), $inp
1602 movdqu @XMM[1], 0x10($out)
1603 movdqu @XMM[6], 0x20($out)
1604 movdqu @XMM[4], 0x30($out)
1605 movdqu @XMM[2], 0x40($out)
1606 movdqu @XMM[7], 0x50($out)
1607 movdqu @XMM[3], 0x60($out)
1608 movdqu @XMM[5], 0x70($out)
1609 lea 0x80($out), $out
1610 sub \$8,$len
1611 jnc .Lcbc_dec_loop
1612
1613 add \$8,$len
1614 jz .Lcbc_dec_done
1615
1616 movdqu 0x00($inp), @XMM[0] # load input
1617 mov %rsp, %rax # pass key schedule
1618 mov %edx, %r10d # pass rounds
1619 cmp \$2,$len
1620 jb .Lcbc_dec_one
1621 movdqu 0x10($inp), @XMM[1]
1622 je .Lcbc_dec_two
1623 movdqu 0x20($inp), @XMM[2]
1624 cmp \$4,$len
1625 jb .Lcbc_dec_three
1626 movdqu 0x30($inp), @XMM[3]
1627 je .Lcbc_dec_four
1628 movdqu 0x40($inp), @XMM[4]
1629 cmp \$6,$len
1630 jb .Lcbc_dec_five
1631 movdqu 0x50($inp), @XMM[5]
1632 je .Lcbc_dec_six
1633 movdqu 0x60($inp), @XMM[6]
1634 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1635 call _bsaes_decrypt8
1636 pxor 0x20(%rbp), @XMM[0] # ^= IV
1637 movdqu 0x00($inp), @XMM[8] # re-load input
1638 movdqu 0x10($inp), @XMM[9]
1639 pxor @XMM[8], @XMM[1]
1640 movdqu 0x20($inp), @XMM[10]
1641 pxor @XMM[9], @XMM[6]
1642 movdqu 0x30($inp), @XMM[11]
1643 pxor @XMM[10], @XMM[4]
1644 movdqu 0x40($inp), @XMM[12]
1645 pxor @XMM[11], @XMM[2]
1646 movdqu 0x50($inp), @XMM[13]
1647 pxor @XMM[12], @XMM[7]
1648 movdqu 0x60($inp), @XMM[15] # IV
1649 pxor @XMM[13], @XMM[3]
1650 movdqu @XMM[0], 0x00($out) # write output
1651 movdqu @XMM[1], 0x10($out)
1652 movdqu @XMM[6], 0x20($out)
1653 movdqu @XMM[4], 0x30($out)
1654 movdqu @XMM[2], 0x40($out)
1655 movdqu @XMM[7], 0x50($out)
1656 movdqu @XMM[3], 0x60($out)
1657 jmp .Lcbc_dec_done
1658.align 16
1659.Lcbc_dec_six:
1660 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1661 call _bsaes_decrypt8
1662 pxor 0x20(%rbp), @XMM[0] # ^= IV
1663 movdqu 0x00($inp), @XMM[8] # re-load input
1664 movdqu 0x10($inp), @XMM[9]
1665 pxor @XMM[8], @XMM[1]
1666 movdqu 0x20($inp), @XMM[10]
1667 pxor @XMM[9], @XMM[6]
1668 movdqu 0x30($inp), @XMM[11]
1669 pxor @XMM[10], @XMM[4]
1670 movdqu 0x40($inp), @XMM[12]
1671 pxor @XMM[11], @XMM[2]
1672 movdqu 0x50($inp), @XMM[15] # IV
1673 pxor @XMM[12], @XMM[7]
1674 movdqu @XMM[0], 0x00($out) # write output
1675 movdqu @XMM[1], 0x10($out)
1676 movdqu @XMM[6], 0x20($out)
1677 movdqu @XMM[4], 0x30($out)
1678 movdqu @XMM[2], 0x40($out)
1679 movdqu @XMM[7], 0x50($out)
1680 jmp .Lcbc_dec_done
1681.align 16
1682.Lcbc_dec_five:
1683 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1684 call _bsaes_decrypt8
1685 pxor 0x20(%rbp), @XMM[0] # ^= IV
1686 movdqu 0x00($inp), @XMM[8] # re-load input
1687 movdqu 0x10($inp), @XMM[9]
1688 pxor @XMM[8], @XMM[1]
1689 movdqu 0x20($inp), @XMM[10]
1690 pxor @XMM[9], @XMM[6]
1691 movdqu 0x30($inp), @XMM[11]
1692 pxor @XMM[10], @XMM[4]
1693 movdqu 0x40($inp), @XMM[15] # IV
1694 pxor @XMM[11], @XMM[2]
1695 movdqu @XMM[0], 0x00($out) # write output
1696 movdqu @XMM[1], 0x10($out)
1697 movdqu @XMM[6], 0x20($out)
1698 movdqu @XMM[4], 0x30($out)
1699 movdqu @XMM[2], 0x40($out)
1700 jmp .Lcbc_dec_done
1701.align 16
1702.Lcbc_dec_four:
1703 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1704 call _bsaes_decrypt8
1705 pxor 0x20(%rbp), @XMM[0] # ^= IV
1706 movdqu 0x00($inp), @XMM[8] # re-load input
1707 movdqu 0x10($inp), @XMM[9]
1708 pxor @XMM[8], @XMM[1]
1709 movdqu 0x20($inp), @XMM[10]
1710 pxor @XMM[9], @XMM[6]
1711 movdqu 0x30($inp), @XMM[15] # IV
1712 pxor @XMM[10], @XMM[4]
1713 movdqu @XMM[0], 0x00($out) # write output
1714 movdqu @XMM[1], 0x10($out)
1715 movdqu @XMM[6], 0x20($out)
1716 movdqu @XMM[4], 0x30($out)
1717 jmp .Lcbc_dec_done
1718.align 16
1719.Lcbc_dec_three:
1720 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1721 call _bsaes_decrypt8
1722 pxor 0x20(%rbp), @XMM[0] # ^= IV
1723 movdqu 0x00($inp), @XMM[8] # re-load input
1724 movdqu 0x10($inp), @XMM[9]
1725 pxor @XMM[8], @XMM[1]
1726 movdqu 0x20($inp), @XMM[15] # IV
1727 pxor @XMM[9], @XMM[6]
1728 movdqu @XMM[0], 0x00($out) # write output
1729 movdqu @XMM[1], 0x10($out)
1730 movdqu @XMM[6], 0x20($out)
1731 jmp .Lcbc_dec_done
1732.align 16
1733.Lcbc_dec_two:
1734 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1735 call _bsaes_decrypt8
1736 pxor 0x20(%rbp), @XMM[0] # ^= IV
1737 movdqu 0x00($inp), @XMM[8] # re-load input
1738 movdqu 0x10($inp), @XMM[15] # IV
1739 pxor @XMM[8], @XMM[1]
1740 movdqu @XMM[0], 0x00($out) # write output
1741 movdqu @XMM[1], 0x10($out)
1742 jmp .Lcbc_dec_done
1743.align 16
1744.Lcbc_dec_one:
1745 lea ($inp), $arg1
1746 lea 0x20(%rbp), $arg2 # buffer output
1747 lea ($key), $arg3
1748 call asm_AES_decrypt # doesn't touch %xmm
1749 pxor 0x20(%rbp), @XMM[15] # ^= IV
1750 movdqu @XMM[15], ($out) # write output
1751 movdqa @XMM[0], @XMM[15] # IV
1752
1753.Lcbc_dec_done:
1754 movdqu @XMM[15], (%rbx) # return IV
1755 lea (%rsp), %rax
1756 pxor %xmm0, %xmm0
1757.Lcbc_dec_bzero: # wipe key schedule [if any]
1758 movdqa %xmm0, 0x00(%rax)
1759 movdqa %xmm0, 0x10(%rax)
1760 lea 0x20(%rax), %rax
1761 cmp %rax, %rbp
1762 ja .Lcbc_dec_bzero
1763
1764 lea (%rbp),%rsp # restore %rsp
1765___
1766$code.=<<___ if ($win64);
1767 movaps 0x40(%rbp), %xmm6
1768 movaps 0x50(%rbp), %xmm7
1769 movaps 0x60(%rbp), %xmm8
1770 movaps 0x70(%rbp), %xmm9
1771 movaps 0x80(%rbp), %xmm10
1772 movaps 0x90(%rbp), %xmm11
1773 movaps 0xa0(%rbp), %xmm12
1774 movaps 0xb0(%rbp), %xmm13
1775 movaps 0xc0(%rbp), %xmm14
1776 movaps 0xd0(%rbp), %xmm15
1777 lea 0xa0(%rbp), %rsp
1778___
1779$code.=<<___;
1780 mov 0x48(%rsp), %r15
1781 mov 0x50(%rsp), %r14
1782 mov 0x58(%rsp), %r13
1783 mov 0x60(%rsp), %r12
1784 mov 0x68(%rsp), %rbx
1785 mov 0x70(%rsp), %rax
1786 lea 0x78(%rsp), %rsp
1787 mov %rax, %rbp
1788.Lcbc_dec_epilogue:
1789 ret
1790.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1791
1792.globl bsaes_ctr32_encrypt_blocks
1793.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1794.align 16
1795bsaes_ctr32_encrypt_blocks:
1796 mov %rsp, %rax
1797.Lctr_enc_prologue:
1798 push %rbp
1799 push %rbx
1800 push %r12
1801 push %r13
1802 push %r14
1803 push %r15
1804 lea -0x48(%rsp), %rsp
1805___
1806$code.=<<___ if ($win64);
1807 mov 0xa0(%rsp),$arg5 # pull ivp
1808 lea -0xa0(%rsp), %rsp
1809 movaps %xmm6, 0x40(%rsp)
1810 movaps %xmm7, 0x50(%rsp)
1811 movaps %xmm8, 0x60(%rsp)
1812 movaps %xmm9, 0x70(%rsp)
1813 movaps %xmm10, 0x80(%rsp)
1814 movaps %xmm11, 0x90(%rsp)
1815 movaps %xmm12, 0xa0(%rsp)
1816 movaps %xmm13, 0xb0(%rsp)
1817 movaps %xmm14, 0xc0(%rsp)
1818 movaps %xmm15, 0xd0(%rsp)
1819.Lctr_enc_body:
1820___
1821$code.=<<___;
1822 mov %rsp, %rbp # backup %rsp
1823 movdqu ($arg5), %xmm0 # load counter
1824 mov 240($arg4), %eax # rounds
1825 mov $arg1, $inp # backup arguments
1826 mov $arg2, $out
1827 mov $arg3, $len
1828 mov $arg4, $key
1829 movdqa %xmm0, 0x20(%rbp) # copy counter
1830 cmp \$8, $arg3
1831 jb .Lctr_enc_short
1832
1833 mov %eax, %ebx # rounds
1834 shl \$7, %rax # 128 bytes per inner round key
1835 sub \$`128-32`, %rax # size of bit-sliced key schedule
1836 sub %rax, %rsp
1837
1838 mov %rsp, %rax # pass key schedule
1839 mov $key, %rcx # pass key
1840 mov %ebx, %r10d # pass rounds
1841 call _bsaes_key_convert
1842 pxor %xmm6,%xmm7 # fix up last round key
1843 movdqa %xmm7,(%rax) # save last round key
1844
1845 movdqa (%rsp), @XMM[9] # load round0 key
1846 lea .LADD1(%rip), %r11
1847 movdqa 0x20(%rbp), @XMM[0] # counter copy
1848 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1849 pshufb @XMM[8], @XMM[9] # byte swap upper part
1850 pshufb @XMM[8], @XMM[0]
1851 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1852 jmp .Lctr_enc_loop
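
	# The counter and the round 0 key were byte-swapped above (.LSWPUP)
	# so that the eight per-block counter values can be produced with
	# plain paddd; the .LSWPUPM0SR shuffle inside the loop then, in
	# effect, swaps the lanes back and applies the usual M0SR load
	# permutation in a single pshufb.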
1853.align 16
1854.Lctr_enc_loop:
1855 movdqa @XMM[0], 0x20(%rbp) # save counter
1856 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1857 movdqa @XMM[0], @XMM[2]
1858 paddd 0x00(%r11), @XMM[1] # .LADD1
1859 movdqa @XMM[0], @XMM[3]
1860 paddd 0x10(%r11), @XMM[2] # .LADD2
1861 movdqa @XMM[0], @XMM[4]
1862 paddd 0x20(%r11), @XMM[3] # .LADD3
1863 movdqa @XMM[0], @XMM[5]
1864 paddd 0x30(%r11), @XMM[4] # .LADD4
1865 movdqa @XMM[0], @XMM[6]
1866 paddd 0x40(%r11), @XMM[5] # .LADD5
1867 movdqa @XMM[0], @XMM[7]
1868 paddd 0x50(%r11), @XMM[6] # .LADD6
1869 paddd 0x60(%r11), @XMM[7] # .LADD7
1870
1871	# Borrow the prologue from _bsaes_encrypt8, using the opportunity
1872	# to flip the byte order of the 32-bit counter words
1873 movdqa (%rsp), @XMM[9] # round 0 key
1874 lea 0x10(%rsp), %rax # pass key schedule
1875 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1876 pxor @XMM[9], @XMM[0] # xor with round0 key
1877 pxor @XMM[9], @XMM[1]
1878 pshufb @XMM[8], @XMM[0]
1879 pxor @XMM[9], @XMM[2]
1880 pshufb @XMM[8], @XMM[1]
1881 pxor @XMM[9], @XMM[3]
1882 pshufb @XMM[8], @XMM[2]
1883 pxor @XMM[9], @XMM[4]
1884 pshufb @XMM[8], @XMM[3]
1885 pxor @XMM[9], @XMM[5]
1886 pshufb @XMM[8], @XMM[4]
1887 pxor @XMM[9], @XMM[6]
1888 pshufb @XMM[8], @XMM[5]
1889 pxor @XMM[9], @XMM[7]
1890 pshufb @XMM[8], @XMM[6]
1891 lea .LBS0(%rip), %r11 # constants table
1892 pshufb @XMM[8], @XMM[7]
1893 mov %ebx,%r10d # pass rounds
1894
1895 call _bsaes_encrypt8_bitslice
1896
1897 sub \$8,$len
1898 jc .Lctr_enc_loop_done
1899
1900 movdqu 0x00($inp), @XMM[8] # load input
1901 movdqu 0x10($inp), @XMM[9]
1902 movdqu 0x20($inp), @XMM[10]
1903 movdqu 0x30($inp), @XMM[11]
1904 movdqu 0x40($inp), @XMM[12]
1905 movdqu 0x50($inp), @XMM[13]
1906 movdqu 0x60($inp), @XMM[14]
1907 movdqu 0x70($inp), @XMM[15]
1908 lea 0x80($inp),$inp
1909 pxor @XMM[0], @XMM[8]
1910 movdqa 0x20(%rbp), @XMM[0] # load counter
1911 pxor @XMM[9], @XMM[1]
1912 movdqu @XMM[8], 0x00($out) # write output
1913 pxor @XMM[10], @XMM[4]
1914 movdqu @XMM[1], 0x10($out)
1915 pxor @XMM[11], @XMM[6]
1916 movdqu @XMM[4], 0x20($out)
1917 pxor @XMM[12], @XMM[3]
1918 movdqu @XMM[6], 0x30($out)
1919 pxor @XMM[13], @XMM[7]
1920 movdqu @XMM[3], 0x40($out)
1921 pxor @XMM[14], @XMM[2]
1922 movdqu @XMM[7], 0x50($out)
1923 pxor @XMM[15], @XMM[5]
1924 movdqu @XMM[2], 0x60($out)
1925 lea .LADD1(%rip), %r11
1926 movdqu @XMM[5], 0x70($out)
1927 lea 0x80($out), $out
1928 paddd 0x70(%r11), @XMM[0] # .LADD8
1929 jnz .Lctr_enc_loop
1930
1931 jmp .Lctr_enc_done
1932.align 16
1933.Lctr_enc_loop_done:
1934 add \$8, $len
1935 movdqu 0x00($inp), @XMM[8] # load input
1936 pxor @XMM[8], @XMM[0]
1937 movdqu @XMM[0], 0x00($out) # write output
1938 cmp \$2,$len
1939 jb .Lctr_enc_done
1940 movdqu 0x10($inp), @XMM[9]
1941 pxor @XMM[9], @XMM[1]
1942 movdqu @XMM[1], 0x10($out)
1943 je .Lctr_enc_done
1944 movdqu 0x20($inp), @XMM[10]
1945 pxor @XMM[10], @XMM[4]
1946 movdqu @XMM[4], 0x20($out)
1947 cmp \$4,$len
1948 jb .Lctr_enc_done
1949 movdqu 0x30($inp), @XMM[11]
1950 pxor @XMM[11], @XMM[6]
1951 movdqu @XMM[6], 0x30($out)
1952 je .Lctr_enc_done
1953 movdqu 0x40($inp), @XMM[12]
1954 pxor @XMM[12], @XMM[3]
1955 movdqu @XMM[3], 0x40($out)
1956 cmp \$6,$len
1957 jb .Lctr_enc_done
1958 movdqu 0x50($inp), @XMM[13]
1959 pxor @XMM[13], @XMM[7]
1960 movdqu @XMM[7], 0x50($out)
1961 je .Lctr_enc_done
1962 movdqu 0x60($inp), @XMM[14]
1963 pxor @XMM[14], @XMM[2]
1964 movdqu @XMM[2], 0x60($out)
1965 jmp .Lctr_enc_done
1966
1967.align 16
1968.Lctr_enc_short:
1969 lea 0x20(%rbp), $arg1
1970 lea 0x30(%rbp), $arg2
1971 lea ($key), $arg3
1972 call asm_AES_encrypt
1973 movdqu ($inp), @XMM[1]
1974 lea 16($inp), $inp
1975 mov 0x2c(%rbp), %eax # load 32-bit counter
1976 bswap %eax
1977 pxor 0x30(%rbp), @XMM[1]
1978 inc %eax # increment
1979 movdqu @XMM[1], ($out)
1980 bswap %eax
1981 lea 16($out), $out
1982	mov	%eax, 0x2c(%rsp)	# save 32-bit counter (%rsp==%rbp on this path)
1983 dec $len
1984 jnz .Lctr_enc_short
1985
1986.Lctr_enc_done:
1987 lea (%rsp), %rax
1988 pxor %xmm0, %xmm0
1989.Lctr_enc_bzero: # wipe key schedule [if any]
1990 movdqa %xmm0, 0x00(%rax)
1991 movdqa %xmm0, 0x10(%rax)
1992 lea 0x20(%rax), %rax
1993 cmp %rax, %rbp
1994 ja .Lctr_enc_bzero
1995
1996 lea (%rbp),%rsp # restore %rsp
1997___
1998$code.=<<___ if ($win64);
1999 movaps 0x40(%rbp), %xmm6
2000 movaps 0x50(%rbp), %xmm7
2001 movaps 0x60(%rbp), %xmm8
2002 movaps 0x70(%rbp), %xmm9
2003 movaps 0x80(%rbp), %xmm10
2004 movaps 0x90(%rbp), %xmm11
2005 movaps 0xa0(%rbp), %xmm12
2006 movaps 0xb0(%rbp), %xmm13
2007 movaps 0xc0(%rbp), %xmm14
2008 movaps 0xd0(%rbp), %xmm15
2009 lea 0xa0(%rbp), %rsp
2010___
2011$code.=<<___;
2012 mov 0x48(%rsp), %r15
2013 mov 0x50(%rsp), %r14
2014 mov 0x58(%rsp), %r13
2015 mov 0x60(%rsp), %r12
2016 mov 0x68(%rsp), %rbx
2017 mov 0x70(%rsp), %rax
2018 lea 0x78(%rsp), %rsp
2019 mov %rax, %rbp
2020.Lctr_enc_epilogue:
2021 ret
2022.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2023___
2024######################################################################
2025# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2026# const AES_KEY *key1, const AES_KEY *key2,
2027# const unsigned char iv[16]);
2028#
2029my ($twmask,$twres,$twtmp)=@XMM[13..15];
2030$code.=<<___;
2031.globl bsaes_xts_encrypt
2032.type bsaes_xts_encrypt,\@abi-omnipotent
2033.align 16
2034bsaes_xts_encrypt:
2035 mov %rsp, %rax
2036.Lxts_enc_prologue:
2037 push %rbp
2038 push %rbx
2039 push %r12
2040 push %r13
2041 push %r14
2042 push %r15
2043 lea -0x48(%rsp), %rsp
2044___
2045$code.=<<___ if ($win64);
2046 mov 0xa0(%rsp),$arg5 # pull key2
2047 mov 0xa8(%rsp),$arg6 # pull ivp
2048 lea -0xa0(%rsp), %rsp
2049 movaps %xmm6, 0x40(%rsp)
2050 movaps %xmm7, 0x50(%rsp)
2051 movaps %xmm8, 0x60(%rsp)
2052 movaps %xmm9, 0x70(%rsp)
2053 movaps %xmm10, 0x80(%rsp)
2054 movaps %xmm11, 0x90(%rsp)
2055 movaps %xmm12, 0xa0(%rsp)
2056 movaps %xmm13, 0xb0(%rsp)
2057 movaps %xmm14, 0xc0(%rsp)
2058 movaps %xmm15, 0xd0(%rsp)
2059.Lxts_enc_body:
2060___
2061$code.=<<___;
2062 mov %rsp, %rbp # backup %rsp
2063 mov $arg1, $inp # backup arguments
2064 mov $arg2, $out
2065 mov $arg3, $len
2066 mov $arg4, $key
2067
2068 lea ($arg6), $arg1
2069 lea 0x20(%rbp), $arg2
2070 lea ($arg5), $arg3
2071 call asm_AES_encrypt # generate initial tweak
2072
2073 mov 240($key), %eax # rounds
2074 mov $len, %rbx # backup $len
2075
2076 mov %eax, %edx # rounds
2077 shl \$7, %rax # 128 bytes per inner round key
2078 sub \$`128-32`, %rax # size of bit-sliced key schedule
2079 sub %rax, %rsp
2080
2081 mov %rsp, %rax # pass key schedule
2082 mov $key, %rcx # pass key
2083 mov %edx, %r10d # pass rounds
2084 call _bsaes_key_convert
2085 pxor %xmm6, %xmm7 # fix up last round key
2086 movdqa %xmm7, (%rax) # save last round key
2087
2088 and \$-16, $len
2089 sub \$0x80, %rsp # place for tweak[8]
2090 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2091
2092 pxor $twtmp, $twtmp
2093 movdqa .Lxts_magic(%rip), $twmask
2094 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2095
2096 sub \$0x80, $len
2097 jc .Lxts_enc_short
2098 jmp .Lxts_enc_loop
2099
2100.align 16
2101.Lxts_enc_loop:
2102___
2103 for ($i=0;$i<7;$i++) {
2104 $code.=<<___;
2105 pshufd \$0x13, $twtmp, $twres
2106 pxor $twtmp, $twtmp
2107 movdqa @XMM[7], @XMM[$i]
2108	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
2109 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2110 pand $twmask, $twres # isolate carry and residue
2111 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2112 pxor $twres, @XMM[7]
2113___
2114 $code.=<<___ if ($i>=1);
2115 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2116___
2117 $code.=<<___ if ($i>=2);
2118	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
2119___
2120 }
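# Each iteration above doubles the 128-bit tweak in GF(2^128): paddq
# shifts both 64-bit halves left by one, while the pcmpgtd/pshufd/pand
# sequence rebuilds the two carry bits, feeding the carry out of the
# low half into the high half and reducing the carry out of the high
# half with the XTS polynomial x^128+x^7+x^2+x+1 (the 0x87 behind
# .Lxts_magic).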
2121$code.=<<___;
2122 movdqu 0x60($inp), @XMM[8+6]
2123 pxor @XMM[8+5], @XMM[5]
2124 movdqu 0x70($inp), @XMM[8+7]
2125 lea 0x80($inp), $inp
2126 movdqa @XMM[7], 0x70(%rsp)
2127 pxor @XMM[8+6], @XMM[6]
2128 lea 0x80(%rsp), %rax # pass key schedule
2129 pxor @XMM[8+7], @XMM[7]
2130 mov %edx, %r10d # pass rounds
2131
2132 call _bsaes_encrypt8
2133
2134 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2135 pxor 0x10(%rsp), @XMM[1]
2136 movdqu @XMM[0], 0x00($out) # write output
2137 pxor 0x20(%rsp), @XMM[4]
2138 movdqu @XMM[1], 0x10($out)
2139 pxor 0x30(%rsp), @XMM[6]
2140 movdqu @XMM[4], 0x20($out)
2141 pxor 0x40(%rsp), @XMM[3]
2142 movdqu @XMM[6], 0x30($out)
2143 pxor 0x50(%rsp), @XMM[7]
2144 movdqu @XMM[3], 0x40($out)
2145 pxor 0x60(%rsp), @XMM[2]
2146 movdqu @XMM[7], 0x50($out)
2147 pxor 0x70(%rsp), @XMM[5]
2148 movdqu @XMM[2], 0x60($out)
2149 movdqu @XMM[5], 0x70($out)
2150 lea 0x80($out), $out
2151
2152 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2153 pxor $twtmp, $twtmp
2154 movdqa .Lxts_magic(%rip), $twmask
2155 pcmpgtd @XMM[7], $twtmp
2156 pshufd \$0x13, $twtmp, $twres
2157 pxor $twtmp, $twtmp
2158 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2159 pand $twmask, $twres # isolate carry and residue
2160 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2161 pxor $twres, @XMM[7]
2162
2163 sub \$0x80,$len
2164 jnc .Lxts_enc_loop
2165
2166.Lxts_enc_short:
2167 add \$0x80, $len
2168 jz .Lxts_enc_done
2169___
2170 for ($i=0;$i<7;$i++) {
2171 $code.=<<___;
2172 pshufd \$0x13, $twtmp, $twres
2173 pxor $twtmp, $twtmp
2174 movdqa @XMM[7], @XMM[$i]
2175	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
2176 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2177 pand $twmask, $twres # isolate carry and residue
2178 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2179 pxor $twres, @XMM[7]
2180___
2181 $code.=<<___ if ($i>=1);
2182 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2183 cmp \$`0x10*$i`,$len
2184 je .Lxts_enc_$i
2185___
2186 $code.=<<___ if ($i>=2);
2187	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
2188___
2189 }
2190$code.=<<___;
2191 movdqu 0x60($inp), @XMM[8+6]
2192 pxor @XMM[8+5], @XMM[5]
2193 movdqa @XMM[7], 0x70(%rsp)
2194 lea 0x70($inp), $inp
2195 pxor @XMM[8+6], @XMM[6]
2196 lea 0x80(%rsp), %rax # pass key schedule
2197 mov %edx, %r10d # pass rounds
2198
2199 call _bsaes_encrypt8
2200
2201 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2202 pxor 0x10(%rsp), @XMM[1]
2203 movdqu @XMM[0], 0x00($out) # write output
2204 pxor 0x20(%rsp), @XMM[4]
2205 movdqu @XMM[1], 0x10($out)
2206 pxor 0x30(%rsp), @XMM[6]
2207 movdqu @XMM[4], 0x20($out)
2208 pxor 0x40(%rsp), @XMM[3]
2209 movdqu @XMM[6], 0x30($out)
2210 pxor 0x50(%rsp), @XMM[7]
2211 movdqu @XMM[3], 0x40($out)
2212 pxor 0x60(%rsp), @XMM[2]
2213 movdqu @XMM[7], 0x50($out)
2214 movdqu @XMM[2], 0x60($out)
2215 lea 0x70($out), $out
2216
2217 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2218 jmp .Lxts_enc_done
2219.align 16
2220.Lxts_enc_6:
2221 pxor @XMM[8+4], @XMM[4]
2222 lea 0x60($inp), $inp
2223 pxor @XMM[8+5], @XMM[5]
2224 lea 0x80(%rsp), %rax # pass key schedule
2225 mov %edx, %r10d # pass rounds
2226
2227 call _bsaes_encrypt8
2228
2229 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2230 pxor 0x10(%rsp), @XMM[1]
2231 movdqu @XMM[0], 0x00($out) # write output
2232 pxor 0x20(%rsp), @XMM[4]
2233 movdqu @XMM[1], 0x10($out)
2234 pxor 0x30(%rsp), @XMM[6]
2235 movdqu @XMM[4], 0x20($out)
2236 pxor 0x40(%rsp), @XMM[3]
2237 movdqu @XMM[6], 0x30($out)
2238 pxor 0x50(%rsp), @XMM[7]
2239 movdqu @XMM[3], 0x40($out)
2240 movdqu @XMM[7], 0x50($out)
2241 lea 0x60($out), $out
2242
2243 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2244 jmp .Lxts_enc_done
2245.align 16
2246.Lxts_enc_5:
2247 pxor @XMM[8+3], @XMM[3]
2248 lea 0x50($inp), $inp
2249 pxor @XMM[8+4], @XMM[4]
2250 lea 0x80(%rsp), %rax # pass key schedule
2251 mov %edx, %r10d # pass rounds
2252
2253 call _bsaes_encrypt8
2254
2255 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2256 pxor 0x10(%rsp), @XMM[1]
2257 movdqu @XMM[0], 0x00($out) # write output
2258 pxor 0x20(%rsp), @XMM[4]
2259 movdqu @XMM[1], 0x10($out)
2260 pxor 0x30(%rsp), @XMM[6]
2261 movdqu @XMM[4], 0x20($out)
2262 pxor 0x40(%rsp), @XMM[3]
2263 movdqu @XMM[6], 0x30($out)
2264 movdqu @XMM[3], 0x40($out)
2265 lea 0x50($out), $out
2266
2267 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2268 jmp .Lxts_enc_done
2269.align 16
2270.Lxts_enc_4:
2271 pxor @XMM[8+2], @XMM[2]
2272 lea 0x40($inp), $inp
2273 pxor @XMM[8+3], @XMM[3]
2274 lea 0x80(%rsp), %rax # pass key schedule
2275 mov %edx, %r10d # pass rounds
2276
2277 call _bsaes_encrypt8
2278
2279 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2280 pxor 0x10(%rsp), @XMM[1]
2281 movdqu @XMM[0], 0x00($out) # write output
2282 pxor 0x20(%rsp), @XMM[4]
2283 movdqu @XMM[1], 0x10($out)
2284 pxor 0x30(%rsp), @XMM[6]
2285 movdqu @XMM[4], 0x20($out)
2286 movdqu @XMM[6], 0x30($out)
2287 lea 0x40($out), $out
2288
2289 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2290 jmp .Lxts_enc_done
2291.align 16
2292.Lxts_enc_3:
2293 pxor @XMM[8+1], @XMM[1]
2294 lea 0x30($inp), $inp
2295 pxor @XMM[8+2], @XMM[2]
2296 lea 0x80(%rsp), %rax # pass key schedule
2297 mov %edx, %r10d # pass rounds
2298
2299 call _bsaes_encrypt8
2300
2301 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2302 pxor 0x10(%rsp), @XMM[1]
2303 movdqu @XMM[0], 0x00($out) # write output
2304 pxor 0x20(%rsp), @XMM[4]
2305 movdqu @XMM[1], 0x10($out)
2306 movdqu @XMM[4], 0x20($out)
2307 lea 0x30($out), $out
2308
2309 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2310 jmp .Lxts_enc_done
2311.align 16
2312.Lxts_enc_2:
2313 pxor @XMM[8+0], @XMM[0]
2314 lea 0x20($inp), $inp
2315 pxor @XMM[8+1], @XMM[1]
2316 lea 0x80(%rsp), %rax # pass key schedule
2317 mov %edx, %r10d # pass rounds
2318
2319 call _bsaes_encrypt8
2320
2321 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2322 pxor 0x10(%rsp), @XMM[1]
2323 movdqu @XMM[0], 0x00($out) # write output
2324 movdqu @XMM[1], 0x10($out)
2325 lea 0x20($out), $out
2326
2327 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2328 jmp .Lxts_enc_done
2329.align 16
2330.Lxts_enc_1:
2331 pxor @XMM[0], @XMM[8]
2332 lea 0x10($inp), $inp
2333 movdqa @XMM[8], 0x20(%rbp)
2334 lea 0x20(%rbp), $arg1
2335 lea 0x20(%rbp), $arg2
2336 lea ($key), $arg3
2337 call asm_AES_encrypt # doesn't touch %xmm
2338 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2339 #pxor @XMM[8], @XMM[0]
2340 #lea 0x80(%rsp), %rax # pass key schedule
2341 #mov %edx, %r10d # pass rounds
2342 #call _bsaes_encrypt8
2343 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2344 movdqu @XMM[0], 0x00($out) # write output
2345 lea 0x10($out), $out
2346
2347 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2348
2349.Lxts_enc_done:
2350 and \$15, %ebx
2351 jz .Lxts_enc_ret
2352 mov $out, %rdx
2353
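	# Ciphertext stealing for the partial last block: the loop below
	# swaps the tail plaintext bytes with the leading bytes of the
	# final full ciphertext block, and the stitched block is then
	# encrypted once more under the last tweak.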
2354.Lxts_enc_steal:
2355 movzb ($inp), %eax
2356 movzb -16(%rdx), %ecx
2357 lea 1($inp), $inp
2358 mov %al, -16(%rdx)
2359 mov %cl, 0(%rdx)
2360 lea 1(%rdx), %rdx
2361 sub \$1,%ebx
2362 jnz .Lxts_enc_steal
2363
2364 movdqu -16($out), @XMM[0]
2365 lea 0x20(%rbp), $arg1
2366 pxor @XMM[7], @XMM[0]
2367 lea 0x20(%rbp), $arg2
2368 movdqa @XMM[0], 0x20(%rbp)
2369 lea ($key), $arg3
2370 call asm_AES_encrypt # doesn't touch %xmm
2371 pxor 0x20(%rbp), @XMM[7]
2372 movdqu @XMM[7], -16($out)
2373
2374.Lxts_enc_ret:
2375 lea (%rsp), %rax
2376 pxor %xmm0, %xmm0
2377.Lxts_enc_bzero: # wipe key schedule [if any]
2378 movdqa %xmm0, 0x00(%rax)
2379 movdqa %xmm0, 0x10(%rax)
2380 lea 0x20(%rax), %rax
2381 cmp %rax, %rbp
2382 ja .Lxts_enc_bzero
2383
2384 lea (%rbp),%rsp # restore %rsp
2385___
2386$code.=<<___ if ($win64);
2387 movaps 0x40(%rbp), %xmm6
2388 movaps 0x50(%rbp), %xmm7
2389 movaps 0x60(%rbp), %xmm8
2390 movaps 0x70(%rbp), %xmm9
2391 movaps 0x80(%rbp), %xmm10
2392 movaps 0x90(%rbp), %xmm11
2393 movaps 0xa0(%rbp), %xmm12
2394 movaps 0xb0(%rbp), %xmm13
2395 movaps 0xc0(%rbp), %xmm14
2396 movaps 0xd0(%rbp), %xmm15
2397 lea 0xa0(%rbp), %rsp
2398___
2399$code.=<<___;
2400 mov 0x48(%rsp), %r15
2401 mov 0x50(%rsp), %r14
2402 mov 0x58(%rsp), %r13
2403 mov 0x60(%rsp), %r12
2404 mov 0x68(%rsp), %rbx
2405 mov 0x70(%rsp), %rax
2406 lea 0x78(%rsp), %rsp
2407 mov %rax, %rbp
2408.Lxts_enc_epilogue:
2409 ret
2410.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2411
2412.globl bsaes_xts_decrypt
2413.type bsaes_xts_decrypt,\@abi-omnipotent
2414.align 16
2415bsaes_xts_decrypt:
2416 mov %rsp, %rax
2417.Lxts_dec_prologue:
2418 push %rbp
2419 push %rbx
2420 push %r12
2421 push %r13
2422 push %r14
2423 push %r15
2424 lea -0x48(%rsp), %rsp
2425___
2426$code.=<<___ if ($win64);
2427 mov 0xa0(%rsp),$arg5 # pull key2
2428 mov 0xa8(%rsp),$arg6 # pull ivp
2429 lea -0xa0(%rsp), %rsp
2430 movaps %xmm6, 0x40(%rsp)
2431 movaps %xmm7, 0x50(%rsp)
2432 movaps %xmm8, 0x60(%rsp)
2433 movaps %xmm9, 0x70(%rsp)
2434 movaps %xmm10, 0x80(%rsp)
2435 movaps %xmm11, 0x90(%rsp)
2436 movaps %xmm12, 0xa0(%rsp)
2437 movaps %xmm13, 0xb0(%rsp)
2438 movaps %xmm14, 0xc0(%rsp)
2439 movaps %xmm15, 0xd0(%rsp)
2440.Lxts_dec_body:
2441___
2442$code.=<<___;
2443 mov %rsp, %rbp # backup %rsp
2444 mov $arg1, $inp # backup arguments
2445 mov $arg2, $out
2446 mov $arg3, $len
2447 mov $arg4, $key
2448
2449 lea ($arg6), $arg1
2450 lea 0x20(%rbp), $arg2
2451 lea ($arg5), $arg3
2452 call asm_AES_encrypt # generate initial tweak
2453
2454 mov 240($key), %eax # rounds
2455 mov $len, %rbx # backup $len
2456
2457 mov %eax, %edx # rounds
2458 shl \$7, %rax # 128 bytes per inner round key
2459 sub \$`128-32`, %rax # size of bit-sliced key schedule
2460 sub %rax, %rsp
2461
2462 mov %rsp, %rax # pass key schedule
2463 mov $key, %rcx # pass key
2464 mov %edx, %r10d # pass rounds
2465 call _bsaes_key_convert
2466 pxor (%rsp), %xmm7 # fix up round 0 key
2467 movdqa %xmm6, (%rax) # save last round key
2468 movdqa %xmm7, (%rsp)
2469
2470 xor %eax, %eax # if ($len%16) len-=16;
2471 and \$-16, $len
2472 test \$15, %ebx
2473 setnz %al
2474 shl \$4, %rax
2475 sub %rax, $len
2476
2477 sub \$0x80, %rsp # place for tweak[8]
2478 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2479
2480 pxor $twtmp, $twtmp
2481 movdqa .Lxts_magic(%rip), $twmask
2482 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2483
2484 sub \$0x80, $len
2485 jc .Lxts_dec_short
2486 jmp .Lxts_dec_loop
2487
2488.align 16
2489.Lxts_dec_loop:
2490___
2491 for ($i=0;$i<7;$i++) {
2492 $code.=<<___;
2493 pshufd \$0x13, $twtmp, $twres
2494 pxor $twtmp, $twtmp
2495 movdqa @XMM[7], @XMM[$i]
2496 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2497 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2498 pand $twmask, $twres # isolate carry and residue
2499 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2500 pxor $twres, @XMM[7]
2501___
2502 $code.=<<___ if ($i>=1);
2503 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2504___
2505 $code.=<<___ if ($i>=2);
2506 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2507___
2508 }
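# The pcmpgtd/pshufd/pand/paddq/pxor sequence above multiplies the
# 128-bit tweak by x in GF(2^128) modulo x^128+x^7+x^2+x+1: paddq
# shifts both 64-bit halves left by one, while the masked, shuffled
# sign bits re-inject the inter-qword carry and the 0x87 residue from
# .Lxts_magic. A plain-Perl sketch of the same update, assuming the
# tweak is held as a 16-byte little-endian string (reference only,
# not used by the generated code):
sub xts_mul_x {
	my @t = unpack("C16", shift);		# little-endian tweak bytes
	my $carry = 0;
	for my $i (0 .. 15) {			# shift 128-bit value left by 1
		my $b = $t[$i];
		$t[$i] = (($b << 1) | $carry) & 0xff;
		$carry = $b >> 7;
	}
	$t[0] ^= 0x87 if $carry;		# reduce: x^128 = x^7+x^2+x+1
	return pack("C16", @t);
}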
2509$code.=<<___;
2510 movdqu 0x60($inp), @XMM[8+6]
2511 pxor @XMM[8+5], @XMM[5]
2512 movdqu 0x70($inp), @XMM[8+7]
2513 lea 0x80($inp), $inp
2514 movdqa @XMM[7], 0x70(%rsp)
2515 pxor @XMM[8+6], @XMM[6]
2516 lea 0x80(%rsp), %rax # pass key schedule
2517 pxor @XMM[8+7], @XMM[7]
2518 mov %edx, %r10d # pass rounds
2519
2520 call _bsaes_decrypt8
2521
2522 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2523 pxor 0x10(%rsp), @XMM[1]
2524 movdqu @XMM[0], 0x00($out) # write output
2525 pxor 0x20(%rsp), @XMM[6]
2526 movdqu @XMM[1], 0x10($out)
2527 pxor 0x30(%rsp), @XMM[4]
2528 movdqu @XMM[6], 0x20($out)
2529 pxor 0x40(%rsp), @XMM[2]
2530 movdqu @XMM[4], 0x30($out)
2531 pxor 0x50(%rsp), @XMM[7]
2532 movdqu @XMM[2], 0x40($out)
2533 pxor 0x60(%rsp), @XMM[3]
2534 movdqu @XMM[7], 0x50($out)
2535 pxor 0x70(%rsp), @XMM[5]
2536 movdqu @XMM[3], 0x60($out)
2537 movdqu @XMM[5], 0x70($out)
2538 lea 0x80($out), $out
2539
2540 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2541 pxor $twtmp, $twtmp
2542 movdqa .Lxts_magic(%rip), $twmask
2543 pcmpgtd @XMM[7], $twtmp
2544 pshufd \$0x13, $twtmp, $twres
2545 pxor $twtmp, $twtmp
2546 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2547 pand $twmask, $twres # isolate carry and residue
2548 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2549 pxor $twres, @XMM[7]
2550
2551 sub \$0x80,$len
2552 jnc .Lxts_dec_loop
2553
2554.Lxts_dec_short:
2555 add \$0x80, $len
2556 jz .Lxts_dec_done
2557___
2558 for ($i=0;$i<7;$i++) {
2559 $code.=<<___;
2560 pshufd \$0x13, $twtmp, $twres
2561 pxor $twtmp, $twtmp
2562 movdqa @XMM[7], @XMM[$i]
2563 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2564 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2565 pand $twmask, $twres # isolate carry and residue
2566 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2567 pxor $twres, @XMM[7]
2568___
2569 $code.=<<___ if ($i>=1);
2570 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2571 cmp \$`0x10*$i`,$len
2572 je .Lxts_dec_$i
2573___
2574 $code.=<<___ if ($i>=2);
2575 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2576___
2577 }
2578$code.=<<___;
2579 movdqu 0x60($inp), @XMM[8+6]
2580 pxor @XMM[8+5], @XMM[5]
2581 movdqa @XMM[7], 0x70(%rsp)
2582 lea 0x70($inp), $inp
2583 pxor @XMM[8+6], @XMM[6]
2584 lea 0x80(%rsp), %rax # pass key schedule
2585 mov %edx, %r10d # pass rounds
2586
2587 call _bsaes_decrypt8
2588
2589 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2590 pxor 0x10(%rsp), @XMM[1]
2591 movdqu @XMM[0], 0x00($out) # write output
2592 pxor 0x20(%rsp), @XMM[6]
2593 movdqu @XMM[1], 0x10($out)
2594 pxor 0x30(%rsp), @XMM[4]
2595 movdqu @XMM[6], 0x20($out)
2596 pxor 0x40(%rsp), @XMM[2]
2597 movdqu @XMM[4], 0x30($out)
2598 pxor 0x50(%rsp), @XMM[7]
2599 movdqu @XMM[2], 0x40($out)
2600 pxor 0x60(%rsp), @XMM[3]
2601 movdqu @XMM[7], 0x50($out)
2602 movdqu @XMM[3], 0x60($out)
2603 lea 0x70($out), $out
2604
2605 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2606 jmp .Lxts_dec_done
2607.align 16
2608.Lxts_dec_6:
2609 pxor @XMM[8+4], @XMM[4]
2610 lea 0x60($inp), $inp
2611 pxor @XMM[8+5], @XMM[5]
2612 lea 0x80(%rsp), %rax # pass key schedule
2613 mov %edx, %r10d # pass rounds
2614
2615 call _bsaes_decrypt8
2616
2617 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2618 pxor 0x10(%rsp), @XMM[1]
2619 movdqu @XMM[0], 0x00($out) # write output
2620 pxor 0x20(%rsp), @XMM[6]
2621 movdqu @XMM[1], 0x10($out)
2622 pxor 0x30(%rsp), @XMM[4]
2623 movdqu @XMM[6], 0x20($out)
2624 pxor 0x40(%rsp), @XMM[2]
2625 movdqu @XMM[4], 0x30($out)
2626 pxor 0x50(%rsp), @XMM[7]
2627 movdqu @XMM[2], 0x40($out)
2628 movdqu @XMM[7], 0x50($out)
2629 lea 0x60($out), $out
2630
2631 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2632 jmp .Lxts_dec_done
2633.align 16
2634.Lxts_dec_5:
2635 pxor @XMM[8+3], @XMM[3]
2636 lea 0x50($inp), $inp
2637 pxor @XMM[8+4], @XMM[4]
2638 lea 0x80(%rsp), %rax # pass key schedule
2639 mov %edx, %r10d # pass rounds
2640
2641 call _bsaes_decrypt8
2642
2643 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2644 pxor 0x10(%rsp), @XMM[1]
2645 movdqu @XMM[0], 0x00($out) # write output
2646 pxor 0x20(%rsp), @XMM[6]
2647 movdqu @XMM[1], 0x10($out)
2648 pxor 0x30(%rsp), @XMM[4]
2649 movdqu @XMM[6], 0x20($out)
2650 pxor 0x40(%rsp), @XMM[2]
2651 movdqu @XMM[4], 0x30($out)
2652 movdqu @XMM[2], 0x40($out)
2653 lea 0x50($out), $out
2654
2655 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2656 jmp .Lxts_dec_done
2657.align 16
2658.Lxts_dec_4:
2659 pxor @XMM[8+2], @XMM[2]
2660 lea 0x40($inp), $inp
2661 pxor @XMM[8+3], @XMM[3]
2662 lea 0x80(%rsp), %rax # pass key schedule
2663 mov %edx, %r10d # pass rounds
2664
2665 call _bsaes_decrypt8
2666
2667 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2668 pxor 0x10(%rsp), @XMM[1]
2669 movdqu @XMM[0], 0x00($out) # write output
2670 pxor 0x20(%rsp), @XMM[6]
2671 movdqu @XMM[1], 0x10($out)
2672 pxor 0x30(%rsp), @XMM[4]
2673 movdqu @XMM[6], 0x20($out)
2674 movdqu @XMM[4], 0x30($out)
2675 lea 0x40($out), $out
2676
2677 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2678 jmp .Lxts_dec_done
2679.align 16
2680.Lxts_dec_3:
2681 pxor @XMM[8+1], @XMM[1]
2682 lea 0x30($inp), $inp
2683 pxor @XMM[8+2], @XMM[2]
2684 lea 0x80(%rsp), %rax # pass key schedule
2685 mov %edx, %r10d # pass rounds
2686
2687 call _bsaes_decrypt8
2688
2689 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2690 pxor 0x10(%rsp), @XMM[1]
2691 movdqu @XMM[0], 0x00($out) # write output
2692 pxor 0x20(%rsp), @XMM[6]
2693 movdqu @XMM[1], 0x10($out)
2694 movdqu @XMM[6], 0x20($out)
2695 lea 0x30($out), $out
2696
2697 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2698 jmp .Lxts_dec_done
2699.align 16
2700.Lxts_dec_2:
2701 pxor @XMM[8+0], @XMM[0]
2702 lea 0x20($inp), $inp
2703 pxor @XMM[8+1], @XMM[1]
2704 lea 0x80(%rsp), %rax # pass key schedule
2705 mov %edx, %r10d # pass rounds
2706
2707 call _bsaes_decrypt8
2708
2709 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2710 pxor 0x10(%rsp), @XMM[1]
2711 movdqu @XMM[0], 0x00($out) # write output
2712 movdqu @XMM[1], 0x10($out)
2713 lea 0x20($out), $out
2714
2715 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2716 jmp .Lxts_dec_done
2717.align 16
2718.Lxts_dec_1:
2719 pxor @XMM[0], @XMM[8]
2720 lea 0x10($inp), $inp
2721 movdqa @XMM[8], 0x20(%rbp)
2722 lea 0x20(%rbp), $arg1
2723 lea 0x20(%rbp), $arg2
2724 lea ($key), $arg3
2725 call asm_AES_decrypt # doesn't touch %xmm
2726 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2727 #pxor @XMM[8], @XMM[0]
2728 #lea 0x80(%rsp), %rax # pass key schedule
2729 #mov %edx, %r10d # pass rounds
2730 #call _bsaes_decrypt8
2731 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2732 movdqu @XMM[0], 0x00($out) # write output
2733 lea 0x10($out), $out
2734
2735 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2736
2737.Lxts_dec_done:
2738 and \$15, %ebx
2739 jz .Lxts_dec_ret
2740
2741 pxor $twtmp, $twtmp
2742 movdqa .Lxts_magic(%rip), $twmask
2743 pcmpgtd @XMM[7], $twtmp
2744 pshufd \$0x13, $twtmp, $twres
2745 movdqa @XMM[7], @XMM[6]
2746 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2747 pand $twmask, $twres # isolate carry and residue
2748 movdqu ($inp), @XMM[0]
2749 pxor $twres, @XMM[7]
2750
2751 lea 0x20(%rbp), $arg1
2752 pxor @XMM[7], @XMM[0]
2753 lea 0x20(%rbp), $arg2
2754 movdqa @XMM[0], 0x20(%rbp)
2755 lea ($key), $arg3
2756 call asm_AES_decrypt # doesn't touch %xmm
2757 pxor 0x20(%rbp), @XMM[7]
2758 mov $out, %rdx
2759 movdqu @XMM[7], ($out)
2760
2761.Lxts_dec_steal:
2762 movzb 16($inp), %eax
2763 movzb (%rdx), %ecx
2764 lea 1($inp), $inp
2765 mov %al, (%rdx)
2766 mov %cl, 16(%rdx)
2767 lea 1(%rdx), %rdx
2768 sub \$1,%ebx
2769 jnz .Lxts_dec_steal
2770
2771 movdqu ($out), @XMM[0]
2772 lea 0x20(%rbp), $arg1
2773 pxor @XMM[6], @XMM[0]
2774 lea 0x20(%rbp), $arg2
2775 movdqa @XMM[0], 0x20(%rbp)
2776 lea ($key), $arg3
2777 call asm_AES_decrypt # doesn't touch %xmm
2778 pxor 0x20(%rbp), @XMM[6]
2779 movdqu @XMM[6], ($out)
2780
2781.Lxts_dec_ret:
2782 lea (%rsp), %rax
2783 pxor %xmm0, %xmm0
2784.Lxts_dec_bzero: # wipe key schedule [if any]
2785 movdqa %xmm0, 0x00(%rax)
2786 movdqa %xmm0, 0x10(%rax)
2787 lea 0x20(%rax), %rax
2788 cmp %rax, %rbp
2789 ja .Lxts_dec_bzero
2790
2791 lea (%rbp),%rsp # restore %rsp
2792___
2793$code.=<<___ if ($win64);
2794 movaps 0x40(%rbp), %xmm6
2795 movaps 0x50(%rbp), %xmm7
2796 movaps 0x60(%rbp), %xmm8
2797 movaps 0x70(%rbp), %xmm9
2798 movaps 0x80(%rbp), %xmm10
2799 movaps 0x90(%rbp), %xmm11
2800 movaps 0xa0(%rbp), %xmm12
2801 movaps 0xb0(%rbp), %xmm13
2802 movaps 0xc0(%rbp), %xmm14
2803 movaps 0xd0(%rbp), %xmm15
2804 lea 0xa0(%rbp), %rsp
2805___
2806$code.=<<___;
2807 mov 0x48(%rsp), %r15
2808 mov 0x50(%rsp), %r14
2809 mov 0x58(%rsp), %r13
2810 mov 0x60(%rsp), %r12
2811 mov 0x68(%rsp), %rbx
2812 mov 0x70(%rsp), %rax
2813 lea 0x78(%rsp), %rsp
2814 mov %rax, %rbp
2815.Lxts_dec_epilogue:
2816 ret
2817.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
2818___
2819}
2820$code.=<<___;
2821.type _bsaes_const,\@object
2822.align 64
2823_bsaes_const:
2824.LM0ISR: # InvShiftRows constants
2825 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2826.LISRM0:
2827 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2828.LISR:
2829 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2830.LBS0: # bit-slice constants
2831 .quad 0x5555555555555555, 0x5555555555555555
2832.LBS1:
2833 .quad 0x3333333333333333, 0x3333333333333333
2834.LBS2:
2835 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2836.LSR: # shiftrows constants
2837 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2838.LSRM0:
2839 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2840.LM0SR:
2841 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2842.LSWPUP: # byte-swap upper dword
2843 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2844.LSWPUPM0SR:
2845 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2846.LADD1: # counter increment constants
2847 .quad 0x0000000000000000, 0x0000000100000000
2848.LADD2:
2849 .quad 0x0000000000000000, 0x0000000200000000
2850.LADD3:
2851 .quad 0x0000000000000000, 0x0000000300000000
2852.LADD4:
2853 .quad 0x0000000000000000, 0x0000000400000000
2854.LADD5:
2855 .quad 0x0000000000000000, 0x0000000500000000
2856.LADD6:
2857 .quad 0x0000000000000000, 0x0000000600000000
2858.LADD7:
2859 .quad 0x0000000000000000, 0x0000000700000000
2860.LADD8:
2861 .quad 0x0000000000000000, 0x0000000800000000
2862.Lxts_magic:
2863 .long 0x87,0,1,0
2864.Lmasks:
2865 .quad 0x0101010101010101, 0x0101010101010101
2866 .quad 0x0202020202020202, 0x0202020202020202
2867 .quad 0x0404040404040404, 0x0404040404040404
2868 .quad 0x0808080808080808, 0x0808080808080808
2869.LM0:
2870 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2871.L63:
2872 .quad 0x6363636363636363, 0x6363636363636363
2873.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2874.align 64
2875.size _bsaes_const,.-_bsaes_const
2876___
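# The .LBS0/.LBS1/.LBS2 masks (0x55..., 0x33..., 0x0f...) are the
# classic "swapmove" masks with which _bsaes_encrypt8/_bsaes_decrypt8
# transpose eight 128-bit rows into bit-sliced form. A scalar Perl
# sketch of one swapmove exchange (the generated code performs the
# same operation on pairs of %xmm registers):
sub swapmove {
	my ($a, $b, $n, $m) = @_;
	my $t = (($a >> $n) ^ $b) & $m;		# where the bit groups differ
	return ($a ^ ($t << $n), $b ^ $t);	# swap them in both words
}
# e.g. swapmove($a, $b, 1, 0x5555555555555555) exchanges the even bits
# of $b with the odd bits of $a (64-bit perl assumed).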
2877
2878# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2879# CONTEXT *context,DISPATCHER_CONTEXT *disp)
2880if ($win64) {
2881$rec="%rcx";
2882$frame="%rdx";
2883$context="%r8";
2884$disp="%r9";
2885
2886$code.=<<___;
2887.extern __imp_RtlVirtualUnwind
2888.type se_handler,\@abi-omnipotent
2889.align 16
2890se_handler:
2891 push %rsi
2892 push %rdi
2893 push %rbx
2894 push %rbp
2895 push %r12
2896 push %r13
2897 push %r14
2898 push %r15
2899 pushfq
2900 sub \$64,%rsp
2901
2902 mov 120($context),%rax # pull context->Rax
2903 mov 248($context),%rbx # pull context->Rip
2904
2905 mov 8($disp),%rsi # disp->ImageBase
2906 mov 56($disp),%r11 # disp->HandlerData
2907
2908 mov 0(%r11),%r10d # HandlerData[0]
2909 lea (%rsi,%r10),%r10 # prologue label
2910 cmp %r10,%rbx # context->Rip<prologue label
2911 jb .Lin_prologue
2912
2913 mov 152($context),%rax # pull context->Rsp
2914
2915 mov 4(%r11),%r10d # HandlerData[1]
2916 lea (%rsi,%r10),%r10 # epilogue label
2917 cmp %r10,%rbx # context->Rip>=epilogue label
2918 jae .Lin_prologue
2919
2920 mov 160($context),%rax # pull context->Rbp
2921
2922 lea 0x40(%rax),%rsi # %xmm save area
2923 lea 512($context),%rdi # &context.Xmm6
2924 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2925 .long 0xa548f3fc # cld; rep movsq
2926 lea 0xa0(%rax),%rax # adjust stack pointer
2927
2928 mov 0x70(%rax),%rbp
2929 mov 0x68(%rax),%rbx
2930 mov 0x60(%rax),%r12
2931 mov 0x58(%rax),%r13
2932 mov 0x50(%rax),%r14
2933 mov 0x48(%rax),%r15
2934 lea 0x78(%rax),%rax # adjust stack pointer
2935 mov %rbx,144($context) # restore context->Rbx
2936 mov %rbp,160($context) # restore context->Rbp
2937 mov %r12,216($context) # restore context->R12
2938 mov %r13,224($context) # restore context->R13
2939 mov %r14,232($context) # restore context->R14
2940 mov %r15,240($context) # restore context->R15
2941
2942.Lin_prologue:
2943 mov %rax,152($context) # restore context->Rsp
2944
2945 mov 40($disp),%rdi # disp->ContextRecord
2946 mov $context,%rsi # context
2947 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
2948 .long 0xa548f3fc # cld; rep movsq
2949
2950 mov $disp,%rsi
2951 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2952 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2953 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2954 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2955 mov 40(%rsi),%r10 # disp->ContextRecord
2956 lea 56(%rsi),%r11 # &disp->HandlerData
2957 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2958 mov %r10,32(%rsp) # arg5
2959 mov %r11,40(%rsp) # arg6
2960 mov %r12,48(%rsp) # arg7
2961 mov %rcx,56(%rsp) # arg8, (NULL)
2962 call *__imp_RtlVirtualUnwind(%rip)
2963
2964 mov \$1,%eax # ExceptionContinueSearch
2965 add \$64,%rsp
2966 popfq
2967 pop %r15
2968 pop %r14
2969 pop %r13
2970 pop %r12
2971 pop %rbp
2972 pop %rbx
2973 pop %rdi
2974 pop %rsi
2975 ret
2976.size se_handler,.-se_handler
2977
2978.section .pdata
2979.align 4
2980___
2981$code.=<<___ if ($ecb);
2982 .rva .Lecb_enc_prologue
2983 .rva .Lecb_enc_epilogue
2984 .rva .Lecb_enc_info
2985
2986 .rva .Lecb_dec_prologue
2987 .rva .Lecb_dec_epilogue
2988 .rva .Lecb_dec_info
2989___
2990$code.=<<___;
2991 .rva .Lcbc_dec_prologue
2992 .rva .Lcbc_dec_epilogue
2993 .rva .Lcbc_dec_info
2994
2995 .rva .Lctr_enc_prologue
2996 .rva .Lctr_enc_epilogue
2997 .rva .Lctr_enc_info
2998
2999 .rva .Lxts_enc_prologue
3000 .rva .Lxts_enc_epilogue
3001 .rva .Lxts_enc_info
3002
3003 .rva .Lxts_dec_prologue
3004 .rva .Lxts_dec_epilogue
3005 .rva .Lxts_dec_info
3006
3007.section .xdata
3008.align 8
3009___
3010$code.=<<___ if ($ecb);
3011.Lecb_enc_info:
3012 .byte 9,0,0,0
3013 .rva se_handler
3014 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3015.Lecb_dec_info:
3016 .byte 9,0,0,0
3017 .rva se_handler
3018 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3019___
3020$code.=<<___;
3021.Lcbc_dec_info:
3022 .byte 9,0,0,0
3023 .rva se_handler
3024 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3025.Lctr_enc_info:
3026 .byte 9,0,0,0
3027 .rva se_handler
3028 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3029.Lxts_enc_info:
3030 .byte 9,0,0,0
3031 .rva se_handler
3032 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3033.Lxts_dec_info:
3034 .byte 9,0,0,0
3035 .rva se_handler
3036 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3037___
3038}
3039
3040$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3041
3042print $code;
3043
3044close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl
new file mode 100644
index 0000000000..1533e2c304
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/vpaes-x86.pl
@@ -0,0 +1,903 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
13######################################################################
14# September 2011.
15#
16# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
17# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (doesn't have to if called from
19# EVP only). "Drop-in" implies that this module doesn't share key
20# schedule structure with the original, nor does it make assumptions
21# about its alignment...
22#
23# Performance summary. The aes-586.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with a 128-bit key; the vpaes-x86.pl column lists
26# [also large-block CBC] encrypt/decrypt results.
27#
28# aes-586.pl vpaes-x86.pl
29#
30# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***)
31# Nehalem 27.9/40.4/18.1 10.3/12.0
32# Atom 102./119./60.1 64.5/85.3(***)
33#
34# (*)	"Hyper-threading" in this context refers to cache shared among
35#	multiple cores rather than to Intel HTT specifically. As the vast
36#	majority of contemporary cores share cache, the slower code path
37#	is commonplace. In other words, "with-hyper-threading-off"
38#	results are presented mostly for reference purposes.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***)	The less impressive improvement on Core 2 and Atom is due to
43#	slow pshufb; still, it is a respectable +32%/65% improvement on
44#	Core 2 and +58%/40% on Atom (as implied, over the
45#	"hyper-threading-safe" code path).
46#
47# <appro@openssl.org>
48
49$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50push(@INC,"${dir}","${dir}../../perlasm");
51require "x86asm.pl";
52
53&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
54
55$PREFIX="vpaes";
56
57my ($round, $base, $magic, $key, $const, $inp, $out)=
58 ("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
59
60&static_label("_vpaes_consts");
61&static_label("_vpaes_schedule_low_round");
62
63&set_label("_vpaes_consts",64);
64$k_inv=-0x30; # inv, inva
65 &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
66 &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
67
68$k_s0F=-0x10; # s0F
69 &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
70
71$k_ipt=0x00; # input transform (lo, hi)
72 &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
73 &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
74
75$k_sb1=0x20; # sb1u, sb1t
76 &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
77 &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
78$k_sb2=0x40; # sb2u, sb2t
79 &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
80 &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
81$k_sbo=0x60; # sbou, sbot
82 &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
83 &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
84
85$k_mc_forward=0x80; # mc_forward
86 &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
87 &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
88 &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
89 &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
90
91$k_mc_backward=0xc0; # mc_backward
92 &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
93 &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
94 &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
95 &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
96
97$k_sr=0x100; # sr
98 &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
99 &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
100 &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
101 &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
102
103$k_rcon=0x140; # rcon
104 &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
105
106$k_s63=0x150; # s63: all equal to 0x63 transformed
107 &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
108
109$k_opt=0x160; # output transform
110 &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
111 &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
112
113$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
114 &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
115 &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
116##
117## Decryption stuff
118## Key schedule constants
119##
120$k_dksd=0x1a0; # decryption key schedule: invskew x*D
121 &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
122 &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
123$k_dksb=0x1c0; # decryption key schedule: invskew x*B
124 &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
125 &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
126$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
127 &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
128 &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
129$k_dks9=0x200; # decryption key schedule: invskew x*9
130 &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
131 &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
132
133##
134## Decryption stuff
135## Round function constants
136##
137$k_dipt=0x220; # decryption input transform
138 &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
139 &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
140
141$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
142 &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
143 &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
144$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
145 &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
146 &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
147$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
148 &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
149 &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
150$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
151 &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
152 &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
153$k_dsbo=0x2c0; # decryption sbox final output
154 &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
155 &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
156&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
157&align (64);
158
159&function_begin_B("_vpaes_preheat");
160 &add ($const,&DWP(0,"esp"));
161 &movdqa ("xmm7",&QWP($k_inv,$const));
162 &movdqa ("xmm6",&QWP($k_s0F,$const));
163 &ret ();
164&function_end_B("_vpaes_preheat");
165
166##
167## _aes_encrypt_core
168##
169## AES-encrypt %xmm0.
170##
171## Inputs:
172## %xmm0 = input
173## %xmm6-%xmm7 as in _vpaes_preheat
174## (%edx) = scheduled keys
175##
176## Output in %xmm0
177## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
178##
179##
180&function_begin_B("_vpaes_encrypt_core");
181 &mov ($magic,16);
182 &mov ($round,&DWP(240,$key));
183	&movdqa	("xmm1","xmm6");
184 &movdqa ("xmm2",&QWP($k_ipt,$const));
185 &pandn ("xmm1","xmm0");
186 &movdqu ("xmm5",&QWP(0,$key));
187 &psrld ("xmm1",4);
188 &pand ("xmm0","xmm6");
189 &pshufb ("xmm2","xmm0");
190 &movdqa ("xmm0",&QWP($k_ipt+16,$const));
191 &pshufb ("xmm0","xmm1");
192 &pxor ("xmm2","xmm5");
193 &pxor ("xmm0","xmm2");
194 &add ($key,16);
195 &lea ($base,&DWP($k_mc_backward,$const));
196 &jmp (&label("enc_entry"));
197
198
199&set_label("enc_loop",16);
200 # middle of middle round
201 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
202 &pshufb ("xmm4","xmm2"); # 4 = sb1u
203 &pxor ("xmm4","xmm5"); # 4 = sb1u + k
204 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
205 &pshufb ("xmm0","xmm3"); # 0 = sb1t
206 &pxor ("xmm0","xmm4"); # 0 = A
207 &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
208 &pshufb ("xmm5","xmm2"); # 4 = sb2u
209 &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
210 &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
211 &pshufb ("xmm2","xmm3"); # 2 = sb2t
212 &pxor ("xmm2","xmm5"); # 2 = 2A
213 &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
214 &movdqa ("xmm3","xmm0"); # 3 = A
215 &pshufb ("xmm0","xmm1"); # 0 = B
216 &add ($key,16); # next key
217 &pxor ("xmm0","xmm2"); # 0 = 2A+B
218 &pshufb ("xmm3","xmm4"); # 3 = D
219 &add ($magic,16); # next mc
220 &pxor ("xmm3","xmm0"); # 3 = 2A+B+D
221 &pshufb ("xmm0","xmm1"); # 0 = 2B+C
222 &and ($magic,0x30); # ... mod 4
223 &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
224 &sub ($round,1); # nr--
225
226&set_label("enc_entry");
227 # top of round
228 &movdqa ("xmm1","xmm6"); # 1 : i
229 &pandn ("xmm1","xmm0"); # 1 = i<<4
230 &psrld ("xmm1",4); # 1 = i
231 &pand ("xmm0","xmm6"); # 0 = k
232 &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
233 &pshufb ("xmm5","xmm0"); # 2 = a/k
234 &pxor ("xmm0","xmm1"); # 0 = j
235 &movdqa ("xmm3","xmm7"); # 3 : 1/i
236 &pshufb ("xmm3","xmm1"); # 3 = 1/i
237 &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
238 &movdqa ("xmm4","xmm7"); # 4 : 1/j
239 &pshufb ("xmm4","xmm0"); # 4 = 1/j
240 &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
241 &movdqa ("xmm2","xmm7"); # 2 : 1/iak
242 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
243 &pxor ("xmm2","xmm0"); # 2 = io
244 &movdqa ("xmm3","xmm7"); # 3 : 1/jak
245 &movdqu ("xmm5",&QWP(0,$key));
246 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
247 &pxor ("xmm3","xmm1"); # 3 = jo
248 &jnz (&label("enc_loop"));
249
250 # middle of last round
251 &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
252 &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
253 &pshufb ("xmm4","xmm2"); # 4 = sbou
254 &pxor ("xmm4","xmm5"); # 4 = sb1u + k
255 &pshufb ("xmm0","xmm3"); # 0 = sb1t
256 &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
257 &pxor ("xmm0","xmm4"); # 0 = A
258 &pshufb ("xmm0","xmm1");
259 &ret ();
260&function_end_B("_vpaes_encrypt_core");
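# The nibble split in enc_entry (pandn/psrld/pand against the 0x0F
# mask) is the heart of the vpaes approach: each 16-entry pshufb table
# is indexed by one nibble, so a GF(2)-linear byte map such as the
# k_ipt/k_opt transforms evaluates as tlo[x & 15] ^ thi[x >> 4], and
# the nonlinear inversion is chained from the same kind of lookups
# through the k_inv tables. A scalar Perl sketch with hypothetical
# table refs $tlo/$thi:
sub nibble_lookup {
	my ($byte, $tlo, $thi) = @_;
	my $lo = $byte & 0x0f;			# "0 = k" in the comments above
	my $hi = ($byte >> 4) & 0x0f;		# "1 = i"
	return $tlo->[$lo] ^ $thi->[$hi];	# combine the half-lookups
}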
261
262##
263## Decryption core
264##
265## Same API as encryption core.
266##
267&function_begin_B("_vpaes_decrypt_core");
268 &mov ($round,&DWP(240,$key));
269 &lea ($base,&DWP($k_dsbd,$const));
270 &movdqa ("xmm1","xmm6");
271 &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
272 &pandn ("xmm1","xmm0");
273 &mov ($magic,$round);
274	&psrld	("xmm1",4);
275 &movdqu ("xmm5",&QWP(0,$key));
276 &shl ($magic,4);
277 &pand ("xmm0","xmm6");
278 &pshufb ("xmm2","xmm0");
279 &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
280 &xor ($magic,0x30);
281 &pshufb ("xmm0","xmm1");
282 &and ($magic,0x30);
283 &pxor ("xmm2","xmm5");
284 &movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
285 &pxor ("xmm0","xmm2");
286 &add ($key,16);
287 &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
288 &jmp (&label("dec_entry"));
289
290&set_label("dec_loop",16);
291##
292## Inverse mix columns
293##
294 &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
295 &pshufb ("xmm4","xmm2"); # 4 = sb9u
296 &pxor ("xmm4","xmm0");
297 &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t
298 &pshufb ("xmm0","xmm3"); # 0 = sb9t
299 &pxor ("xmm0","xmm4"); # 0 = ch
300 &add ($key,16); # next round key
301
302 &pshufb ("xmm0","xmm5"); # MC ch
303 &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
304 &pshufb ("xmm4","xmm2"); # 4 = sbdu
305 &pxor ("xmm4","xmm0"); # 4 = ch
306 &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt
307 &pshufb ("xmm0","xmm3"); # 0 = sbdt
308 &pxor ("xmm0","xmm4"); # 0 = ch
309 &sub ($round,1); # nr--
310
311 &pshufb ("xmm0","xmm5"); # MC ch
312 &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
313 &pshufb ("xmm4","xmm2"); # 4 = sbbu
314 &pxor ("xmm4","xmm0"); # 4 = ch
315 &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt
316 &pshufb ("xmm0","xmm3"); # 0 = sbbt
317 &pxor ("xmm0","xmm4"); # 0 = ch
318
319 &pshufb ("xmm0","xmm5"); # MC ch
320 &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
321 &pshufb ("xmm4","xmm2"); # 4 = sbeu
322 &pxor ("xmm4","xmm0"); # 4 = ch
323 &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet
324 &pshufb ("xmm0","xmm3"); # 0 = sbet
325 &pxor ("xmm0","xmm4"); # 0 = ch
326
327 &palignr("xmm5","xmm5",12);
328
329&set_label("dec_entry");
330 # top of round
331 &movdqa ("xmm1","xmm6"); # 1 : i
332 &pandn ("xmm1","xmm0"); # 1 = i<<4
333 &psrld ("xmm1",4); # 1 = i
334 &pand ("xmm0","xmm6"); # 0 = k
335 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
336 &pshufb ("xmm2","xmm0"); # 2 = a/k
337 &pxor ("xmm0","xmm1"); # 0 = j
338 &movdqa ("xmm3","xmm7"); # 3 : 1/i
339 &pshufb ("xmm3","xmm1"); # 3 = 1/i
340 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
341 &movdqa ("xmm4","xmm7"); # 4 : 1/j
342 &pshufb ("xmm4","xmm0"); # 4 = 1/j
343 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
344 &movdqa ("xmm2","xmm7"); # 2 : 1/iak
345 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
346 &pxor ("xmm2","xmm0"); # 2 = io
347 &movdqa ("xmm3","xmm7"); # 3 : 1/jak
348 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
349 &pxor ("xmm3","xmm1"); # 3 = jo
350 &movdqu ("xmm0",&QWP(0,$key));
351 &jnz (&label("dec_loop"));
352
353 # middle of last round
354 &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
355 &pshufb ("xmm4","xmm2"); # 4 = sbou
356 &pxor ("xmm4","xmm0"); # 4 = sb1u + k
357 &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
358 &movdqa ("xmm2",&QWP(0,$magic));
359 &pshufb ("xmm0","xmm3"); # 0 = sb1t
360 &pxor ("xmm0","xmm4"); # 0 = A
361 &pshufb ("xmm0","xmm2");
362 &ret ();
363&function_end_B("_vpaes_decrypt_core");
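# The dsb9/dsbd/dsbb/dsbe pairs walked by dec_loop are the inverse
# S-box output premultiplied by 9, D, B and E (see the constants
# above); 0x0e,0x0b,0x0d,0x09 are exactly the InvMixColumns
# coefficients, and the rotating xmm5 shuffle supplies the column
# rotation. A plain-Perl GF(2^8) reference for one column, for
# checking the arithmetic only:
sub gfmul {
	my ($a, $b) = @_;
	my $r = 0;
	for (1 .. 8) {
		$r ^= $a if $b & 1;
		$a = (($a << 1) ^ (($a & 0x80) ? 0x1b : 0)) & 0xff;
		$b >>= 1;
	}
	return $r;
}
my @col = (0xdb, 0x13, 0x53, 0x45);		# sample state column
my @invmixed = map {
	my $i = $_;
	gfmul(0x0e, $col[$i])        ^ gfmul(0x0b, $col[($i+1)%4]) ^
	gfmul(0x0d, $col[($i+2)%4])  ^ gfmul(0x09, $col[($i+3)%4]);
} 0 .. 3;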
364
365########################################################
366## ##
367## AES key schedule ##
368## ##
369########################################################
370&function_begin_B("_vpaes_schedule_core");
371 &add ($const,&DWP(0,"esp"));
372 &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
373 &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
374
375 # input transform
376 &movdqa ("xmm3","xmm0");
377 &lea ($base,&DWP($k_ipt,$const));
378 &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
379 &call ("_vpaes_schedule_transform");
380 &movdqa ("xmm7","xmm0");
381
382 &test ($out,$out);
383 &jnz (&label("schedule_am_decrypting"));
384
385 # encrypting, output zeroth round key after transform
386 &movdqu (&QWP(0,$key),"xmm0");
387 &jmp (&label("schedule_go"));
388
389&set_label("schedule_am_decrypting");
390 # decrypting, output zeroth round key after shiftrows
391 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
392 &pshufb ("xmm3","xmm1");
393 &movdqu (&QWP(0,$key),"xmm3");
394 &xor ($magic,0x30);
395
396&set_label("schedule_go");
397 &cmp ($round,192);
398 &ja (&label("schedule_256"));
399 &je (&label("schedule_192"));
400	# 128: fall through
401
402##
403## .schedule_128
404##
405## 128-bit specific part of key schedule.
406##
407## This schedule is really simple, because all its parts
408## are accomplished by the subroutines.
409##
410&set_label("schedule_128");
411 &mov ($round,10);
412
413&set_label("loop_schedule_128");
414 &call ("_vpaes_schedule_round");
415 &dec ($round);
416 &jz (&label("schedule_mangle_last"));
417 &call ("_vpaes_schedule_mangle"); # write output
418 &jmp (&label("loop_schedule_128"));
419
420##
421## .aes_schedule_192
422##
423## 192-bit specific part of key schedule.
424##
425## The main body of this schedule is the same as the 128-bit
426## schedule, but with more smearing. The long, high side is
427## stored in %xmm7 as before, and the short, low side is in
428## the high bits of %xmm6.
429##
430## This schedule is somewhat nastier, however, because each
431## round produces 192 bits of key material, or 1.5 round keys.
432## Therefore, on each cycle we do 2 rounds and produce 3 round
433## keys.
434##
435&set_label("schedule_192",16);
436 &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
437 &call ("_vpaes_schedule_transform"); # input transform
438 &movdqa ("xmm6","xmm0"); # save short part
439 &pxor ("xmm4","xmm4"); # clear 4
440 &movhlps("xmm6","xmm4"); # clobber low side with zeros
441 &mov ($round,4);
442
443&set_label("loop_schedule_192");
444 &call ("_vpaes_schedule_round");
445 &palignr("xmm0","xmm6",8);
446 &call ("_vpaes_schedule_mangle"); # save key n
447 &call ("_vpaes_schedule_192_smear");
448 &call ("_vpaes_schedule_mangle"); # save key n+1
449 &call ("_vpaes_schedule_round");
450 &dec ($round);
451 &jz (&label("schedule_mangle_last"));
452 &call ("_vpaes_schedule_mangle"); # save key n+2
453 &call ("_vpaes_schedule_192_smear");
454 &jmp (&label("loop_schedule_192"));
455
456##
457## .aes_schedule_256
458##
459## 256-bit specific part of key schedule.
460##
461## The structure here is very similar to the 128-bit
462## schedule, but with an additional "low side" in
463## %xmm6. The low side's rounds are the same as the
464## high side's, except no rcon and no rotation.
465##
466&set_label("schedule_256",16);
467 &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
468 &call ("_vpaes_schedule_transform"); # input transform
469 &mov ($round,7);
470
471&set_label("loop_schedule_256");
472 &call ("_vpaes_schedule_mangle"); # output low result
473 &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
474
475 # high round
476 &call ("_vpaes_schedule_round");
477 &dec ($round);
478 &jz (&label("schedule_mangle_last"));
479 &call ("_vpaes_schedule_mangle");
480
481 # low round. swap xmm7 and xmm6
482 &pshufd ("xmm0","xmm0",0xFF);
483 &movdqa (&QWP(20,"esp"),"xmm7");
484 &movdqa ("xmm7","xmm6");
485 &call ("_vpaes_schedule_low_round");
486 &movdqa ("xmm7",&QWP(20,"esp"));
487
488 &jmp (&label("loop_schedule_256"));
489
490##
491## .aes_schedule_mangle_last
492##
493## Mangler for last round of key schedule
494## Mangles %xmm0
495## when encrypting, outputs out(%xmm0) ^ 63
496## when decrypting, outputs unskew(%xmm0)
497##
498## Always called right before return... jumps to cleanup and exits
499##
500&set_label("schedule_mangle_last",16);
501 # schedule last round key from xmm0
502 &lea ($base,&DWP($k_deskew,$const));
503 &test ($out,$out);
504 &jnz (&label("schedule_mangle_last_dec"));
505
506 # encrypting
507 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
508 &pshufb ("xmm0","xmm1"); # output permute
509 &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
510 &add ($key,32);
511
512&set_label("schedule_mangle_last_dec");
513 &add ($key,-16);
514 &pxor ("xmm0",&QWP($k_s63,$const));
515 &call ("_vpaes_schedule_transform"); # output transform
516 &movdqu (&QWP(0,$key),"xmm0"); # save last key
517
518 # cleanup
519 &pxor ("xmm0","xmm0");
520 &pxor ("xmm1","xmm1");
521 &pxor ("xmm2","xmm2");
522 &pxor ("xmm3","xmm3");
523 &pxor ("xmm4","xmm4");
524 &pxor ("xmm5","xmm5");
525 &pxor ("xmm6","xmm6");
526 &pxor ("xmm7","xmm7");
527 &ret ();
528&function_end_B("_vpaes_schedule_core");
529
530##
531## .aes_schedule_192_smear
532##
533## Smear the short, low side in the 192-bit key schedule.
534##
535## Inputs:
536## %xmm7: high side, b a x y
537## %xmm6: low side, d c 0 0
538##	a zero vector (%xmm13 in the 64-bit original; this port zeroes %xmm1 inline)
539##
540## Outputs:
541## %xmm6: b+c+d b+c 0 0
542## %xmm0: b+c+d b+c b a
543##
544&function_begin_B("_vpaes_schedule_192_smear");
545 &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0
546 &pxor ("xmm6","xmm0"); # -> c+d c 0 0
547 &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
548 &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
549 &movdqa ("xmm0","xmm6");
550 &pxor ("xmm1","xmm1");
551 &movhlps("xmm6","xmm1"); # clobber low side with zeros
552 &ret ();
553&function_end_B("_vpaes_schedule_192_smear");
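# A dword-level model of the smear, following the b/a/x/y naming in
# the comments above (lanes listed high..low, "+" is XOR; sketch
# only):
sub smear192 {
	my ($hi, $lo) = @_;			# [b,a,x,y], [d,c,0,0]
	my @t = ($lo->[1], 0, 0, 0);		# pshufd 0x80: -> c 0 0 0
	my @m = map { $lo->[$_] ^ $t[$_] } 0 .. 3;	  # -> c+d c 0 0
	my @u = ($hi->[0], $hi->[0], $hi->[0], $hi->[1]); # 0xFE: b b b a
	my @x = map { $m[$_] ^ $u[$_] } 0 .. 3;		  # -> b+c+d b+c b a
	return ([$x[0], $x[1], 0, 0], [@x]);	# (new %xmm6, %xmm0)
}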
554
555##
556## .aes_schedule_round
557##
558## Runs one main round of the key schedule on %xmm0, %xmm7
559##
560## Specifically, runs subbytes on the high dword of %xmm0
561## then rotates it by one byte and xors into the low dword of
562## %xmm7.
563##
564## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
565## next rcon.
566##
567## Smears the dwords of %xmm7 by xoring the low into the
568## second low, result into third, result into highest.
569##
570## Returns results in %xmm7 = %xmm0.
571## Clobbers %xmm1-%xmm5.
572##
573&function_begin_B("_vpaes_schedule_round");
574 # extract rcon from xmm8
575 &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
576 &pxor ("xmm1","xmm1");
577 &palignr("xmm1","xmm2",15);
578 &palignr("xmm2","xmm2",15);
579 &pxor ("xmm7","xmm1");
580
581 # rotate
582 &pshufd ("xmm0","xmm0",0xFF);
583 &palignr("xmm0","xmm0",1);
584
585 # fall through...
586 &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
587
588 # low round: same as high round, but no rotation and no rcon.
589&set_label("_vpaes_schedule_low_round");
590 # smear xmm7
591 &movdqa ("xmm1","xmm7");
592 &pslldq ("xmm7",4);
593 &pxor ("xmm7","xmm1");
594 &movdqa ("xmm1","xmm7");
595 &pslldq ("xmm7",8);
596 &pxor ("xmm7","xmm1");
597 &pxor ("xmm7",&QWP($k_s63,$const));
598
599 # subbyte
600 &movdqa ("xmm4",&QWP($k_s0F,$const));
601 &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
602 &movdqa ("xmm1","xmm4");
603 &pandn ("xmm1","xmm0");
604 &psrld ("xmm1",4); # 1 = i
605 &pand ("xmm0","xmm4"); # 0 = k
606 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
607 &pshufb ("xmm2","xmm0"); # 2 = a/k
608 &pxor ("xmm0","xmm1"); # 0 = j
609 &movdqa ("xmm3","xmm5"); # 3 : 1/i
610 &pshufb ("xmm3","xmm1"); # 3 = 1/i
611 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
612 &movdqa ("xmm4","xmm5"); # 4 : 1/j
613 &pshufb ("xmm4","xmm0"); # 4 = 1/j
614 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
615 &movdqa ("xmm2","xmm5"); # 2 : 1/iak
616 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
617 &pxor ("xmm2","xmm0"); # 2 = io
618 &movdqa ("xmm3","xmm5"); # 3 : 1/jak
619 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
620 &pxor ("xmm3","xmm1"); # 3 = jo
621 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
622 &pshufb ("xmm4","xmm2"); # 4 = sbou
623 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
624 &pshufb ("xmm0","xmm3"); # 0 = sb1t
625 &pxor ("xmm0","xmm4"); # 0 = sbox output
626
627 # add in smeared stuff
628 &pxor ("xmm0","xmm7");
629 &movdqa ("xmm7","xmm0");
630 &ret ();
631&function_end_B("_vpaes_schedule_round");
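# A word-level model of the xmm7 smear performed by the two
# pslldq/pxor pairs: each 32-bit word accumulates the XOR of all lower
# words, i.e. the running XOR of the FIPS-197 key expansion (sketch
# only; the subbyte section then folds in the S-boxed, rotated high
# word and the rcon):
my @w = (0x01020304, 0x05060708, 0x090a0b0c, 0x0d0e0f10);  # low..high
$w[$_] ^= $w[$_ - 1] for 1 .. 3;		# prefix-XOR smear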
632
633##
634## .aes_schedule_transform
635##
636## Linear-transform %xmm0 according to tables at (%ebx)
637##
638## Output in %xmm0
639## Clobbers %xmm1, %xmm2
640##
641&function_begin_B("_vpaes_schedule_transform");
642 &movdqa ("xmm2",&QWP($k_s0F,$const));
643 &movdqa ("xmm1","xmm2");
644 &pandn ("xmm1","xmm0");
645 &psrld ("xmm1",4);
646 &pand ("xmm0","xmm2");
647 &movdqa ("xmm2",&QWP(0,$base));
648 &pshufb ("xmm2","xmm0");
649 &movdqa ("xmm0",&QWP(16,$base));
650 &pshufb ("xmm0","xmm1");
651 &pxor ("xmm0","xmm2");
652 &ret ();
653&function_end_B("_vpaes_schedule_transform");
654
655##
656## .aes_schedule_mangle
657##
658## Mangle xmm0 from (basis-transformed) standard version
659## to our version.
660##
661## On encrypt,
662## xor with 0x63
663## multiply by circulant 0,1,1,1
664## apply shiftrows transform
665##
666## On decrypt,
667## xor with 0x63
668## multiply by "inverse mixcolumns" circulant E,B,D,9
669## deskew
670## apply shiftrows transform
671##
672##
673## Writes out to (%edx), and increments or decrements it
674## Keeps track of round number mod 4 in %ecx
675## Preserves xmm0
676## Clobbers xmm1-xmm5
677##
678&function_begin_B("_vpaes_schedule_mangle");
679 &movdqa ("xmm4","xmm0"); # save xmm0 for later
680 &movdqa ("xmm5",&QWP($k_mc_forward,$const));
681 &test ($out,$out);
682 &jnz (&label("schedule_mangle_dec"));
683
684 # encrypting
685 &add ($key,16);
686 &pxor ("xmm4",&QWP($k_s63,$const));
687 &pshufb ("xmm4","xmm5");
688 &movdqa ("xmm3","xmm4");
689 &pshufb ("xmm4","xmm5");
690 &pxor ("xmm3","xmm4");
691 &pshufb ("xmm4","xmm5");
692 &pxor ("xmm3","xmm4");
693
694 &jmp (&label("schedule_mangle_both"));
695
696&set_label("schedule_mangle_dec",16);
697 # inverse mix columns
698 &movdqa ("xmm2",&QWP($k_s0F,$const));
699 &lea ($inp,&DWP($k_dksd,$const));
700 &movdqa ("xmm1","xmm2");
701 &pandn ("xmm1","xmm4");
702 &psrld ("xmm1",4); # 1 = hi
703 &pand ("xmm4","xmm2"); # 4 = lo
704
705 &movdqa ("xmm2",&QWP(0,$inp));
706 &pshufb ("xmm2","xmm4");
707 &movdqa ("xmm3",&QWP(0x10,$inp));
708 &pshufb ("xmm3","xmm1");
709 &pxor ("xmm3","xmm2");
710 &pshufb ("xmm3","xmm5");
711
712 &movdqa ("xmm2",&QWP(0x20,$inp));
713 &pshufb ("xmm2","xmm4");
714 &pxor ("xmm2","xmm3");
715 &movdqa ("xmm3",&QWP(0x30,$inp));
716 &pshufb ("xmm3","xmm1");
717 &pxor ("xmm3","xmm2");
718 &pshufb ("xmm3","xmm5");
719
720 &movdqa ("xmm2",&QWP(0x40,$inp));
721 &pshufb ("xmm2","xmm4");
722 &pxor ("xmm2","xmm3");
723 &movdqa ("xmm3",&QWP(0x50,$inp));
724 &pshufb ("xmm3","xmm1");
725 &pxor ("xmm3","xmm2");
726 &pshufb ("xmm3","xmm5");
727
728 &movdqa ("xmm2",&QWP(0x60,$inp));
729 &pshufb ("xmm2","xmm4");
730 &pxor ("xmm2","xmm3");
731 &movdqa ("xmm3",&QWP(0x70,$inp));
732 &pshufb ("xmm3","xmm1");
733 &pxor ("xmm3","xmm2");
734
735 &add ($key,-16);
736
737&set_label("schedule_mangle_both");
738 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
739 &pshufb ("xmm3","xmm1");
740 &add ($magic,-16);
741 &and ($magic,0x30);
742 &movdqu (&QWP(0,$key),"xmm3");
743 &ret ();
744&function_end_B("_vpaes_schedule_mangle");
745
746#
747# Interface to OpenSSL
748#
749&function_begin("${PREFIX}_set_encrypt_key");
750 &mov ($inp,&wparam(0)); # inp
751 &lea ($base,&DWP(-56,"esp"));
752 &mov ($round,&wparam(1)); # bits
753 &and ($base,-16);
754 &mov ($key,&wparam(2)); # key
755 &xchg ($base,"esp"); # alloca
756 &mov (&DWP(48,"esp"),$base);
757
758 &mov ($base,$round);
759 &shr ($base,5);
760 &add ($base,5);
761 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
762 &mov ($magic,0x30);
763 &mov ($out,0);
764
765 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
766 &call ("_vpaes_schedule_core");
767&set_label("pic_point");
768
769 &mov ("esp",&DWP(48,"esp"));
770 &xor ("eax","eax");
771&function_end("${PREFIX}_set_encrypt_key");
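# Note the stored count (bits>>5)+5 is 9/11/13, one less than the
# nominal 10/12/14: the zeroth AddRoundKey is folded into the input
# transform and the final round is peeled off after enc_loop/dec_loop,
# so only the middle rounds are counted down. A quick check:
my %vpaes_rounds = map { ($_, ($_ >> 5) + 5) } (128, 192, 256);	# 9,11,13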
772
773&function_begin("${PREFIX}_set_decrypt_key");
774 &mov ($inp,&wparam(0)); # inp
775 &lea ($base,&DWP(-56,"esp"));
776 &mov ($round,&wparam(1)); # bits
777 &and ($base,-16);
778 &mov ($key,&wparam(2)); # key
779 &xchg ($base,"esp"); # alloca
780 &mov (&DWP(48,"esp"),$base);
781
782 &mov ($base,$round);
783 &shr ($base,5);
784 &add ($base,5);
785 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
786 &shl ($base,4);
787 &lea ($key,&DWP(16,$key,$base));
788
789 &mov ($out,1);
790 &mov ($magic,$round);
791 &shr ($magic,1);
792 &and ($magic,32);
793	&xor	($magic,32);		# nbits==192?0:32;
794
795 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
796 &call ("_vpaes_schedule_core");
797&set_label("pic_point");
798
799 &mov ("esp",&DWP(48,"esp"));
800 &xor ("eax","eax");
801&function_end("${PREFIX}_set_decrypt_key");
802
803&function_begin("${PREFIX}_encrypt");
804 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
805 &call ("_vpaes_preheat");
806&set_label("pic_point");
807 &mov ($inp,&wparam(0)); # inp
808 &lea ($base,&DWP(-56,"esp"));
809 &mov ($out,&wparam(1)); # out
810 &and ($base,-16);
811 &mov ($key,&wparam(2)); # key
812 &xchg ($base,"esp"); # alloca
813 &mov (&DWP(48,"esp"),$base);
814
815 &movdqu ("xmm0",&QWP(0,$inp));
816 &call ("_vpaes_encrypt_core");
817 &movdqu (&QWP(0,$out),"xmm0");
818
819 &mov ("esp",&DWP(48,"esp"));
820&function_end("${PREFIX}_encrypt");
821
822&function_begin("${PREFIX}_decrypt");
823 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
824 &call ("_vpaes_preheat");
825&set_label("pic_point");
826 &mov ($inp,&wparam(0)); # inp
827 &lea ($base,&DWP(-56,"esp"));
828 &mov ($out,&wparam(1)); # out
829 &and ($base,-16);
830 &mov ($key,&wparam(2)); # key
831 &xchg ($base,"esp"); # alloca
832 &mov (&DWP(48,"esp"),$base);
833
834 &movdqu ("xmm0",&QWP(0,$inp));
835 &call ("_vpaes_decrypt_core");
836 &movdqu (&QWP(0,$out),"xmm0");
837
838 &mov ("esp",&DWP(48,"esp"));
839&function_end("${PREFIX}_decrypt");
840
841&function_begin("${PREFIX}_cbc_encrypt");
842 &mov ($inp,&wparam(0)); # inp
843 &mov ($out,&wparam(1)); # out
844 &mov ($round,&wparam(2)); # len
845 &mov ($key,&wparam(3)); # key
846 &sub ($round,16);
847 &jc (&label("cbc_abort"));
848 &lea ($base,&DWP(-56,"esp"));
849 &mov ($const,&wparam(4)); # ivp
850 &and ($base,-16);
851 &mov ($magic,&wparam(5)); # enc
852 &xchg ($base,"esp"); # alloca
853 &movdqu ("xmm1",&QWP(0,$const)); # load IV
854 &sub ($out,$inp);
855 &mov (&DWP(48,"esp"),$base);
856
857 &mov (&DWP(0,"esp"),$out); # save out
858	&mov	(&DWP(4,"esp"),$key);	# save key
859 &mov (&DWP(8,"esp"),$const); # save ivp
860 &mov ($out,$round); # $out works as $len
861
862 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
863 &call ("_vpaes_preheat");
864&set_label("pic_point");
865 &cmp ($magic,0);
866 &je (&label("cbc_dec_loop"));
867 &jmp (&label("cbc_enc_loop"));
868
869&set_label("cbc_enc_loop",16);
870 &movdqu ("xmm0",&QWP(0,$inp)); # load input
871 &pxor ("xmm0","xmm1"); # inp^=iv
872 &call ("_vpaes_encrypt_core");
873 &mov ($base,&DWP(0,"esp")); # restore out
874 &mov ($key,&DWP(4,"esp")); # restore key
875 &movdqa ("xmm1","xmm0");
876 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
877 &lea ($inp,&DWP(16,$inp));
878 &sub ($out,16);
879 &jnc (&label("cbc_enc_loop"));
880 &jmp (&label("cbc_done"));
881
882&set_label("cbc_dec_loop",16);
883 &movdqu ("xmm0",&QWP(0,$inp)); # load input
884 &movdqa (&QWP(16,"esp"),"xmm1"); # save IV
885 &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
886 &call ("_vpaes_decrypt_core");
887 &mov ($base,&DWP(0,"esp")); # restore out
888 &mov ($key,&DWP(4,"esp")); # restore key
889 &pxor ("xmm0",&QWP(16,"esp")); # out^=iv
890 &movdqa ("xmm1",&QWP(32,"esp")); # load next IV
891 &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
892 &lea ($inp,&DWP(16,$inp));
893 &sub ($out,16);
894 &jnc (&label("cbc_dec_loop"));
895
896&set_label("cbc_done");
897 &mov ($base,&DWP(8,"esp")); # restore ivp
898 &mov ("esp",&DWP(48,"esp"));
899 &movdqu (&QWP(0,$base),"xmm1"); # write IV
900&set_label("cbc_abort");
901&function_end("${PREFIX}_cbc_encrypt");
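# A block-level model of the two loops above. Note cbc_dec_loop saves
# the raw input as the next IV *before* decrypting, which is what
# keeps in-place operation safe. $enc/$dec are hypothetical one-block
# coderefs standing in for the _vpaes_*_core routines:
sub cbc_encrypt_blocks {
	my ($enc, $iv, @blocks) = @_;
	my @out;
	for my $p (@blocks) {
		$iv = $enc->($p ^ $iv);		# inp ^= iv, then encrypt
		push @out, $iv;
	}
	return ($iv, @out);			# final IV is written to ivp
}
sub cbc_decrypt_blocks {
	my ($dec, $iv, @blocks) = @_;
	my @out;
	for my $c (@blocks) {
		my $next = $c;			# "save future IV"
		push @out, $dec->($c) ^ $iv;	# out ^= iv
		$iv = $next;
	}
	return ($iv, @out);
}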
902
903&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
new file mode 100644
index 0000000000..37998db5e1
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
@@ -0,0 +1,1206 @@
1#!/usr/bin/env perl
2
3######################################################################
4## Constant-time SSSE3 AES core implementation.
5## version 0.1
6##
7## By Mike Hamburg (Stanford University), 2009
8## Public domain.
9##
10## For details see http://shiftleft.org/papers/vector_aes/ and
11## http://crypto.stanford.edu/vpaes/.
12
13######################################################################
14# September 2011.
15#
16# Interface to OpenSSL as "almost" drop-in replacement for
17# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
18# doesn't handle partial vectors (doesn't have to if called from
19# EVP only). "Drop-in" implies that this module doesn't share key
20# schedule structure with the original nor does it make assumption
21# about its alignment...
22#
23# Performance summary. The aes-x86_64.pl column lists large-block CBC
24# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25# byte processed with a 128-bit key; the vpaes-x86_64.pl column lists
26# [also large-block CBC] encrypt/decrypt results.
27#
28# aes-x86_64.pl vpaes-x86_64.pl
29#
30# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
31# Nehalem 30.5/42.2/14.6 9.8/11.8
32# Atom 63.9/79.0/32.1 64.0/84.8(***)
33#
34# (*)	"Hyper-threading" in this context refers to cache shared among
35#	multiple cores rather than to Intel HTT specifically. As the vast
36#	majority of contemporary cores share cache, the slower code path
37#	is commonplace. In other words, "with-hyper-threading-off"
38#	results are presented mostly for reference purposes.
39#
40# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
41#
42# (***)	The less impressive improvement on Core 2 and Atom is due to
43#	slow pshufb; still, it is a respectable +40%/78% improvement on
44#	Core 2 (as implied, over the "hyper-threading-safe" code path).
45#
46# <appro@openssl.org>
47
48$flavour = shift;
49$output = shift;
50if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
51
52$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53
54$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
55( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
56( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
57die "can't locate x86_64-xlate.pl";
58
59open STDOUT,"| $^X $xlate $flavour $output";
60
61$PREFIX="vpaes";
62
63$code.=<<___;
64.text
65
66##
67## _aes_encrypt_core
68##
69## AES-encrypt %xmm0.
70##
71## Inputs:
72## %xmm0 = input
73## %xmm9-%xmm15 as in _vpaes_preheat
74## (%rdx) = scheduled keys
75##
76## Output in %xmm0
77## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
78## Preserves %xmm6 - %xmm8 so you get some local vectors
79##
80##
81.type _vpaes_encrypt_core,\@abi-omnipotent
82.align 16
83_vpaes_encrypt_core:
84 mov %rdx, %r9
85 mov \$16, %r11
86 mov 240(%rdx),%eax
87 movdqa %xmm9, %xmm1
88 movdqa .Lk_ipt(%rip), %xmm2 # iptlo
89 pandn %xmm0, %xmm1
90 movdqu (%r9), %xmm5 # round0 key
91 psrld \$4, %xmm1
92 pand %xmm9, %xmm0
93 pshufb %xmm0, %xmm2
94 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
95 pshufb %xmm1, %xmm0
96 pxor %xmm5, %xmm2
97 pxor %xmm2, %xmm0
98 add \$16, %r9
99 lea .Lk_mc_backward(%rip),%r10
100 jmp .Lenc_entry
101
102.align 16
103.Lenc_loop:
104 # middle of middle round
105 movdqa %xmm13, %xmm4 # 4 : sb1u
106 pshufb %xmm2, %xmm4 # 4 = sb1u
107 pxor %xmm5, %xmm4 # 4 = sb1u + k
108 movdqa %xmm12, %xmm0 # 0 : sb1t
109 pshufb %xmm3, %xmm0 # 0 = sb1t
110 pxor %xmm4, %xmm0 # 0 = A
111 movdqa %xmm15, %xmm5 # 4 : sb2u
112 pshufb %xmm2, %xmm5 # 4 = sb2u
113 movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
114 movdqa %xmm14, %xmm2 # 2 : sb2t
115 pshufb %xmm3, %xmm2 # 2 = sb2t
116 pxor %xmm5, %xmm2 # 2 = 2A
117 movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
118 movdqa %xmm0, %xmm3 # 3 = A
119 pshufb %xmm1, %xmm0 # 0 = B
120 add \$16, %r9 # next key
121 pxor %xmm2, %xmm0 # 0 = 2A+B
122 pshufb %xmm4, %xmm3 # 3 = D
123 add \$16, %r11 # next mc
124 pxor %xmm0, %xmm3 # 3 = 2A+B+D
125 pshufb %xmm1, %xmm0 # 0 = 2B+C
126 and \$0x30, %r11 # ... mod 4
127 pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
128 sub \$1,%rax # nr--
129
130.Lenc_entry:
131 # top of round
132 movdqa %xmm9, %xmm1 # 1 : i
133 pandn %xmm0, %xmm1 # 1 = i<<4
134 psrld \$4, %xmm1 # 1 = i
135 pand %xmm9, %xmm0 # 0 = k
136 movdqa %xmm11, %xmm5 # 2 : a/k
137 pshufb %xmm0, %xmm5 # 2 = a/k
138 pxor %xmm1, %xmm0 # 0 = j
139 movdqa %xmm10, %xmm3 # 3 : 1/i
140 pshufb %xmm1, %xmm3 # 3 = 1/i
141 pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
142 movdqa %xmm10, %xmm4 # 4 : 1/j
143 pshufb %xmm0, %xmm4 # 4 = 1/j
144 pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
145 movdqa %xmm10, %xmm2 # 2 : 1/iak
146 pshufb %xmm3, %xmm2 # 2 = 1/iak
147 pxor %xmm0, %xmm2 # 2 = io
148 movdqa %xmm10, %xmm3 # 3 : 1/jak
149 movdqu (%r9), %xmm5
150 pshufb %xmm4, %xmm3 # 3 = 1/jak
151 pxor %xmm1, %xmm3 # 3 = jo
152 jnz .Lenc_loop
153
154 # middle of last round
155 movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
156 movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
157 pshufb %xmm2, %xmm4 # 4 = sbou
158 pxor %xmm5, %xmm4 # 4 = sb1u + k
159 pshufb %xmm3, %xmm0 # 0 = sb1t
160 movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
161 pxor %xmm4, %xmm0 # 0 = A
162 pshufb %xmm1, %xmm0
163 ret
164.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
165
166##
167## Decryption core
168##
169## Same API as encryption core.
170##
171.type _vpaes_decrypt_core,\@abi-omnipotent
172.align 16
173_vpaes_decrypt_core:
174 mov %rdx, %r9 # load key
175 mov 240(%rdx),%eax
176 movdqa %xmm9, %xmm1
177 movdqa .Lk_dipt(%rip), %xmm2 # iptlo
178 pandn %xmm0, %xmm1
179 mov %rax, %r11
180 psrld \$4, %xmm1
181 movdqu (%r9), %xmm5 # round0 key
182 shl \$4, %r11
183 pand %xmm9, %xmm0
184 pshufb %xmm0, %xmm2
185 movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
186 xor \$0x30, %r11
187 lea .Lk_dsbd(%rip),%r10
188 pshufb %xmm1, %xmm0
189 and \$0x30, %r11
190 pxor %xmm5, %xmm2
191 movdqa .Lk_mc_forward+48(%rip), %xmm5
192 pxor %xmm2, %xmm0
193 add \$16, %r9
194 add %r10, %r11
195 jmp .Ldec_entry
196
197.align 16
198.Ldec_loop:
199##
200## Inverse mix columns
201##
202 movdqa -0x20(%r10),%xmm4 # 4 : sb9u
203 pshufb %xmm2, %xmm4 # 4 = sb9u
204 pxor %xmm0, %xmm4
205 movdqa -0x10(%r10),%xmm0 # 0 : sb9t
206 pshufb %xmm3, %xmm0 # 0 = sb9t
207 pxor %xmm4, %xmm0 # 0 = ch
208 add \$16, %r9 # next round key
209
210 pshufb %xmm5, %xmm0 # MC ch
211 movdqa 0x00(%r10),%xmm4 # 4 : sbdu
212 pshufb %xmm2, %xmm4 # 4 = sbdu
213 pxor %xmm0, %xmm4 # 4 = ch
214 movdqa 0x10(%r10),%xmm0 # 0 : sbdt
215 pshufb %xmm3, %xmm0 # 0 = sbdt
216 pxor %xmm4, %xmm0 # 0 = ch
217 sub \$1,%rax # nr--
218
219 pshufb %xmm5, %xmm0 # MC ch
220 movdqa 0x20(%r10),%xmm4 # 4 : sbbu
221 pshufb %xmm2, %xmm4 # 4 = sbbu
222 pxor %xmm0, %xmm4 # 4 = ch
223 movdqa 0x30(%r10),%xmm0 # 0 : sbbt
224 pshufb %xmm3, %xmm0 # 0 = sbbt
225 pxor %xmm4, %xmm0 # 0 = ch
226
227 pshufb %xmm5, %xmm0 # MC ch
228 movdqa 0x40(%r10),%xmm4 # 4 : sbeu
229 pshufb %xmm2, %xmm4 # 4 = sbeu
230 pxor %xmm0, %xmm4 # 4 = ch
231 movdqa 0x50(%r10),%xmm0 # 0 : sbet
232 pshufb %xmm3, %xmm0 # 0 = sbet
233 pxor %xmm4, %xmm0 # 0 = ch
234
235 palignr \$12, %xmm5, %xmm5
236
237.Ldec_entry:
238 # top of round
239 movdqa %xmm9, %xmm1 # 1 : i
240 pandn %xmm0, %xmm1 # 1 = i<<4
241 psrld \$4, %xmm1 # 1 = i
242 pand %xmm9, %xmm0 # 0 = k
243 movdqa %xmm11, %xmm2 # 2 : a/k
244 pshufb %xmm0, %xmm2 # 2 = a/k
245 pxor %xmm1, %xmm0 # 0 = j
246 movdqa %xmm10, %xmm3 # 3 : 1/i
247 pshufb %xmm1, %xmm3 # 3 = 1/i
248 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
249 movdqa %xmm10, %xmm4 # 4 : 1/j
250 pshufb %xmm0, %xmm4 # 4 = 1/j
251 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
252 movdqa %xmm10, %xmm2 # 2 : 1/iak
253 pshufb %xmm3, %xmm2 # 2 = 1/iak
254 pxor %xmm0, %xmm2 # 2 = io
255 movdqa %xmm10, %xmm3 # 3 : 1/jak
256 pshufb %xmm4, %xmm3 # 3 = 1/jak
257 pxor %xmm1, %xmm3 # 3 = jo
258 movdqu (%r9), %xmm0
259 jnz .Ldec_loop
260
261 # middle of last round
262 movdqa 0x60(%r10), %xmm4 # 3 : sbou
263 pshufb %xmm2, %xmm4 # 4 = sbou
264 pxor %xmm0, %xmm4 # 4 = sb1u + k
265 movdqa 0x70(%r10), %xmm0 # 0 : sbot
266 movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
267 pshufb %xmm3, %xmm0 # 0 = sb1t
268 pxor %xmm4, %xmm0 # 0 = A
269 pshufb %xmm2, %xmm0
270 ret
271.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
272
273########################################################
274## ##
275## AES key schedule ##
276## ##
277########################################################
278.type _vpaes_schedule_core,\@abi-omnipotent
279.align 16
280_vpaes_schedule_core:
281 # rdi = key
282 # rsi = size in bits
283 # rdx = buffer
284 # rcx = direction. 0=encrypt, 1=decrypt
285
286 call _vpaes_preheat # load the tables
287 movdqa .Lk_rcon(%rip), %xmm8 # load rcon
288 movdqu (%rdi), %xmm0 # load key (unaligned)
289
290 # input transform
291 movdqa %xmm0, %xmm3
292 lea .Lk_ipt(%rip), %r11
293 call _vpaes_schedule_transform
294 movdqa %xmm0, %xmm7
295
296 lea .Lk_sr(%rip),%r10
297 test %rcx, %rcx
298 jnz .Lschedule_am_decrypting
299
300 # encrypting, output zeroth round key after transform
301 movdqu %xmm0, (%rdx)
302 jmp .Lschedule_go
303
304.Lschedule_am_decrypting:
305 # decrypting, output zeroth round key after shiftrows
306 movdqa (%r8,%r10),%xmm1
307 pshufb %xmm1, %xmm3
308 movdqu %xmm3, (%rdx)
309 xor \$0x30, %r8
310
311.Lschedule_go:
312 cmp \$192, %esi
313 ja .Lschedule_256
314 je .Lschedule_192
315	# 128: fall through
316
317##
318## .schedule_128
319##
320## 128-bit specific part of key schedule.
321##
322## This schedule is really simple, because all its parts
323## are accomplished by the subroutines.
324##
325.Lschedule_128:
326 mov \$10, %esi
327
328.Loop_schedule_128:
329 call _vpaes_schedule_round
330 dec %rsi
331 jz .Lschedule_mangle_last
332 call _vpaes_schedule_mangle # write output
333 jmp .Loop_schedule_128
334
335##
336## .aes_schedule_192
337##
338## 192-bit specific part of key schedule.
339##
340## The main body of this schedule is the same as the 128-bit
341## schedule, but with more smearing. The long, high side is
342## stored in %xmm7 as before, and the short, low side is in
343## the high bits of %xmm6.
344##
345## This schedule is somewhat nastier, however, because each
346## round produces 192 bits of key material, or 1.5 round keys.
347## Therefore, on each cycle we do 2 rounds and produce 3 round
348## keys.
349##
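## (Editorial note, worked count: AES-192 has nr = 12, i.e. 13 round
## keys = 208 bytes. The zeroth key was stored before .Lschedule_go;
## each of the 4 loop trips below runs _vpaes_schedule_round twice and
## _vpaes_schedule_mangle three times -- only twice on the final trip,
## which exits at the jz -- so 3+3+3+2 = 11 mangled keys, and
## .Lschedule_mangle_last emits the 13th: 1+11+1 = 13.)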
350.align 16
351.Lschedule_192:
352 movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
353 call _vpaes_schedule_transform # input transform
354 movdqa %xmm0, %xmm6 # save short part
355 pxor %xmm4, %xmm4 # clear 4
356 movhlps %xmm4, %xmm6 # clobber low side with zeros
357 mov \$4, %esi
358
359.Loop_schedule_192:
360 call _vpaes_schedule_round
361 palignr \$8,%xmm6,%xmm0
362 call _vpaes_schedule_mangle # save key n
363 call _vpaes_schedule_192_smear
364 call _vpaes_schedule_mangle # save key n+1
365 call _vpaes_schedule_round
366 dec %rsi
367 jz .Lschedule_mangle_last
368 call _vpaes_schedule_mangle # save key n+2
369 call _vpaes_schedule_192_smear
370 jmp .Loop_schedule_192
371
372##
373## .aes_schedule_256
374##
375## 256-bit specific part of key schedule.
376##
377## The structure here is very similar to the 128-bit
378## schedule, but with an additional "low side" in
379## %xmm6. The low side's rounds are the same as the
380## high side's, except no rcon and no rotation.
381##
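## (Editorial note, worked count: AES-256 has nr = 14, i.e. 15 round
## keys = 240 bytes. After the zeroth key, each of the first 6 loop
## trips writes two keys -- the low result at the top of the loop and
## the high round's result -- and the 7th trip writes its low result,
## then exits to .Lschedule_mangle_last: 1 + 6*2 + 1 + 1 = 15.)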
382.align 16
383.Lschedule_256:
384 movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
385 call _vpaes_schedule_transform # input transform
386 mov \$7, %esi
387
388.Loop_schedule_256:
389 call _vpaes_schedule_mangle # output low result
390 movdqa %xmm0, %xmm6 # save cur_lo in xmm6
391
392 # high round
393 call _vpaes_schedule_round
394 dec %rsi
395 jz .Lschedule_mangle_last
396 call _vpaes_schedule_mangle
397
398 # low round. swap xmm7 and xmm6
399 pshufd \$0xFF, %xmm0, %xmm0
400 movdqa %xmm7, %xmm5
401 movdqa %xmm6, %xmm7
402 call _vpaes_schedule_low_round
403 movdqa %xmm5, %xmm7
404
405 jmp .Loop_schedule_256
406
407
408##
409## .aes_schedule_mangle_last
410##
411## Mangler for last round of key schedule
412## Mangles %xmm0
413## when encrypting, outputs out(%xmm0) ^ 0x63
414## when decrypting, outputs unskew(%xmm0)
415##
416## Always called right before return... jumps to cleanup and exits
417##
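## (Editorial note on the pointer bookkeeping below: the encrypt path
## first advances %rdx by 32 and both paths then add -16, so the last
## key lands 16 bytes past the previous one when encrypting and 16
## bytes before it when decrypting -- the decryption schedule is laid
## out in reverse. .Lk_s63 is the sbox affine constant 0x63 as it
## appears in the transformed basis.)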
418.align 16
419.Lschedule_mangle_last:
420 # schedule last round key from xmm0
421 lea .Lk_deskew(%rip),%r11 # prepare to deskew
422 test %rcx, %rcx
423 jnz .Lschedule_mangle_last_dec
424
425 # encrypting
426 movdqa (%r8,%r10),%xmm1
427 pshufb %xmm1, %xmm0 # output permute
428 lea .Lk_opt(%rip), %r11 # prepare to output transform
429 add \$32, %rdx
430
431.Lschedule_mangle_last_dec:
432 add \$-16, %rdx
433 pxor .Lk_s63(%rip), %xmm0
434 call _vpaes_schedule_transform # output transform
435 movdqu %xmm0, (%rdx) # save last key
436
437 # cleanup
438 pxor %xmm0, %xmm0
439 pxor %xmm1, %xmm1
440 pxor %xmm2, %xmm2
441 pxor %xmm3, %xmm3
442 pxor %xmm4, %xmm4
443 pxor %xmm5, %xmm5
444 pxor %xmm6, %xmm6
445 pxor %xmm7, %xmm7
446 ret
447.size _vpaes_schedule_core,.-_vpaes_schedule_core
448
449##
450## .aes_schedule_192_smear
451##
452## Smear the short, low side in the 192-bit key schedule.
453##
454## Inputs:
455## %xmm7: high side, b a x y
456## %xmm6: low side, d c 0 0
457##  (%xmm1 is used as scratch and zeroed internally)
458##
459## Outputs:
460## %xmm6: b+c+d b+c 0 0
461## %xmm0: b+c+d b+c b a
462##
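## (Editorial sketch: the same smear with the xmm registers modeled as
## uint32_t lanes, low lane first, so xmm6 = {0,0,c,d} and
## xmm7 = {y,x,a,b}:
##
##	x6[3] ^= x6[2];     /* pshufd 0x80 + pxor: d ^= c     */
##	x6[0] ^= x7[2];     /* pshufd 0xFE + pxor: xor in the */
##	x6[1] ^= x7[3];     /* lanes {a,b,b,b}                */
##	x6[2] ^= x7[3];
##	x6[3] ^= x7[3];
##	/* xmm0 takes all four lanes; xmm6's low two lanes are */
##	/* then re-zeroed by the movhlps.                      */
## )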
463.type _vpaes_schedule_192_smear,\@abi-omnipotent
464.align 16
465_vpaes_schedule_192_smear:
466 pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
467 pxor %xmm0, %xmm6 # -> c+d c 0 0
468 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
469 pxor %xmm0, %xmm6 # -> b+c+d b+c b a
470 movdqa %xmm6, %xmm0
471 pxor %xmm1, %xmm1
472 movhlps %xmm1, %xmm6 # clobber low side with zeros
473 ret
474.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
475
476##
477## .aes_schedule_round
478##
479## Runs one main round of the key schedule on %xmm0, %xmm7
480##
481## Specifically, runs subbytes on the high dword of %xmm0
482## then rotates it by one byte and xors into the low dword of
483## %xmm7.
484##
485## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
486## next rcon.
487##
488## Smears the dwords of %xmm7 by xoring the low into the
489## second low, result into third, result into highest.
490##
491## Returns results in %xmm7 = %xmm0.
492## Clobbers %xmm1-%xmm4, %r11.
493##
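## (Editorial sketch: the FIPS-197 expansion step this routine mirrors
## for Nk = 4, with SubWord applied in vpaes' transformed basis;
## SubWord, RotWord, Rcon and W[] as in the standard:
##
##	t = SubWord(RotWord(W[i-1])) ^ Rcon[i/Nk];
##	W[i]   = W[i-Nk]   ^ t;
##	W[i+1] = W[i-Nk+1] ^ W[i];        /* the pslldq/pxor "smear" */
##	W[i+2] = W[i-Nk+2] ^ W[i+1];
##	W[i+3] = W[i-Nk+3] ^ W[i+2];
## )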
494.type _vpaes_schedule_round,\@abi-omnipotent
495.align 16
496_vpaes_schedule_round:
497 # extract rcon from xmm8
498 pxor %xmm1, %xmm1
499 palignr \$15, %xmm8, %xmm1
500 palignr \$15, %xmm8, %xmm8
501 pxor %xmm1, %xmm7
502
503 # rotate
504 pshufd \$0xFF, %xmm0, %xmm0
505 palignr \$1, %xmm0, %xmm0
506
507 # fall through...
508
509 # low round: same as high round, but no rotation and no rcon.
510_vpaes_schedule_low_round:
511 # smear xmm7
512 movdqa %xmm7, %xmm1
513 pslldq \$4, %xmm7
514 pxor %xmm1, %xmm7
515 movdqa %xmm7, %xmm1
516 pslldq \$8, %xmm7
517 pxor %xmm1, %xmm7
518 pxor .Lk_s63(%rip), %xmm7
519
520 # subbytes
521 movdqa %xmm9, %xmm1
522 pandn %xmm0, %xmm1
523 psrld \$4, %xmm1 # 1 = i
524 pand %xmm9, %xmm0 # 0 = k
525 movdqa %xmm11, %xmm2 # 2 : a/k
526 pshufb %xmm0, %xmm2 # 2 = a/k
527 pxor %xmm1, %xmm0 # 0 = j
528 movdqa %xmm10, %xmm3 # 3 : 1/i
529 pshufb %xmm1, %xmm3 # 3 = 1/i
530 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
531 movdqa %xmm10, %xmm4 # 4 : 1/j
532 pshufb %xmm0, %xmm4 # 4 = 1/j
533 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
534 movdqa %xmm10, %xmm2 # 2 : 1/iak
535 pshufb %xmm3, %xmm2 # 2 = 1/iak
536 pxor %xmm0, %xmm2 # 2 = io
537 movdqa %xmm10, %xmm3 # 3 : 1/jak
538 pshufb %xmm4, %xmm3 # 3 = 1/jak
539 pxor %xmm1, %xmm3 # 3 = jo
540 movdqa %xmm13, %xmm4 # 4 : sbou
541 pshufb %xmm2, %xmm4 # 4 = sbou
542 movdqa %xmm12, %xmm0 # 0 : sbot
543 pshufb %xmm3, %xmm0 # 0 = sb1t
544 pxor %xmm4, %xmm0 # 0 = sbox output
545
546 # add in smeared stuff
547 pxor %xmm7, %xmm0
548 movdqa %xmm0, %xmm7
549 ret
550.size _vpaes_schedule_round,.-_vpaes_schedule_round
551
552##
553## .aes_schedule_transform
554##
555## Linear-transform %xmm0 according to tables at (%r11)
556##
557## Requires that %xmm9 = 0x0F0F... as in preheat
558## Output in %xmm0
559## Clobbers %xmm1, %xmm2
560##
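## (Editorial sketch: the per-byte effect, with lo[]/hi[] naming the
## two 16-entry tables at 0(%r11) and 16(%r11):
##
##	unsigned char transform(unsigned char x,
##	                        const unsigned char lo[16],
##	                        const unsigned char hi[16])
##	{
##	    return lo[x & 0x0F] ^ hi[x >> 4];
##	}
## )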
561.type _vpaes_schedule_transform,\@abi-omnipotent
562.align 16
563_vpaes_schedule_transform:
564 movdqa %xmm9, %xmm1
565 pandn %xmm0, %xmm1
566 psrld \$4, %xmm1
567 pand %xmm9, %xmm0
568 movdqa (%r11), %xmm2 # lo
569 pshufb %xmm0, %xmm2
570 movdqa 16(%r11), %xmm0 # hi
571 pshufb %xmm1, %xmm0
572 pxor %xmm2, %xmm0
573 ret
574.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
575
576##
577## .aes_schedule_mangle
578##
579## Mangle xmm0 from (basis-transformed) standard version
580## to our version.
581##
582## On encrypt,
583## xor with 0x63
584## multiply by circulant 0,1,1,1
585## apply shiftrows transform
586##
587## On decrypt,
588## xor with 0x63
589## multiply by "inverse mixcolumns" circulant E,B,D,9
590## deskew
591## apply shiftrows transform
592##
593##
594## Writes out to (%rdx), and increments or decrements it
595## Keeps track of round number mod 4 in %r8
596## Preserves xmm0
597## Clobbers xmm1-xmm5
598##
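## (Editorial note: "multiply by circulant 0,1,1,1" appears below as
## the xor of three successively .Lk_mc_forward-rotated copies of the
## s63-masked key, i.e. out = rot(x) ^ rot^2(x) ^ rot^3(x); the
## identity term has coefficient 0, so the unrotated copy is dropped.)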
599.type _vpaes_schedule_mangle,\@abi-omnipotent
600.align 16
601_vpaes_schedule_mangle:
602 movdqa %xmm0, %xmm4 # save xmm0 for later
603 movdqa .Lk_mc_forward(%rip),%xmm5
604 test %rcx, %rcx
605 jnz .Lschedule_mangle_dec
606
607 # encrypting
608 add \$16, %rdx
609 pxor .Lk_s63(%rip),%xmm4
610 pshufb %xmm5, %xmm4
611 movdqa %xmm4, %xmm3
612 pshufb %xmm5, %xmm4
613 pxor %xmm4, %xmm3
614 pshufb %xmm5, %xmm4
615 pxor %xmm4, %xmm3
616
617 jmp .Lschedule_mangle_both
618.align 16
619.Lschedule_mangle_dec:
620 # inverse mix columns
621 lea .Lk_dksd(%rip),%r11
622 movdqa %xmm9, %xmm1
623 pandn %xmm4, %xmm1
624 psrld \$4, %xmm1 # 1 = hi
625 pand %xmm9, %xmm4 # 4 = lo
626
627 movdqa 0x00(%r11), %xmm2
628 pshufb %xmm4, %xmm2
629 movdqa 0x10(%r11), %xmm3
630 pshufb %xmm1, %xmm3
631 pxor %xmm2, %xmm3
632 pshufb %xmm5, %xmm3
633
634 movdqa 0x20(%r11), %xmm2
635 pshufb %xmm4, %xmm2
636 pxor %xmm3, %xmm2
637 movdqa 0x30(%r11), %xmm3
638 pshufb %xmm1, %xmm3
639 pxor %xmm2, %xmm3
640 pshufb %xmm5, %xmm3
641
642 movdqa 0x40(%r11), %xmm2
643 pshufb %xmm4, %xmm2
644 pxor %xmm3, %xmm2
645 movdqa 0x50(%r11), %xmm3
646 pshufb %xmm1, %xmm3
647 pxor %xmm2, %xmm3
648 pshufb %xmm5, %xmm3
649
650 movdqa 0x60(%r11), %xmm2
651 pshufb %xmm4, %xmm2
652 pxor %xmm3, %xmm2
653 movdqa 0x70(%r11), %xmm3
654 pshufb %xmm1, %xmm3
655 pxor %xmm2, %xmm3
656
657 add \$-16, %rdx
658
659.Lschedule_mangle_both:
660 movdqa (%r8,%r10),%xmm1
661 pshufb %xmm1,%xmm3
662 add \$-16, %r8
663 and \$0x30, %r8
664 movdqu %xmm3, (%rdx)
665 ret
666.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
667
668#
669# Interface to OpenSSL
670#
671.globl ${PREFIX}_set_encrypt_key
672.type ${PREFIX}_set_encrypt_key,\@function,3
673.align 16
674${PREFIX}_set_encrypt_key:
675___
676$code.=<<___ if ($win64);
677 lea -0xb8(%rsp),%rsp
678 movaps %xmm6,0x10(%rsp)
679 movaps %xmm7,0x20(%rsp)
680 movaps %xmm8,0x30(%rsp)
681 movaps %xmm9,0x40(%rsp)
682 movaps %xmm10,0x50(%rsp)
683 movaps %xmm11,0x60(%rsp)
684 movaps %xmm12,0x70(%rsp)
685 movaps %xmm13,0x80(%rsp)
686 movaps %xmm14,0x90(%rsp)
687 movaps %xmm15,0xa0(%rsp)
688.Lenc_key_body:
689___
690$code.=<<___;
691 mov %esi,%eax
692 shr \$5,%eax
693 add \$5,%eax
694 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
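	# (Editorial, worked values: 128/32+5 = 9, 192/32+5 = 11,
	# 256/32+5 = 13 -- one less than the usual 10/12/14, since
	# the vpaes cores run that many looped middle rounds and
	# handle the final round separately.)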
695
696 mov \$0,%ecx
697 mov \$0x30,%r8d
698 call _vpaes_schedule_core
699___
700$code.=<<___ if ($win64);
701 movaps 0x10(%rsp),%xmm6
702 movaps 0x20(%rsp),%xmm7
703 movaps 0x30(%rsp),%xmm8
704 movaps 0x40(%rsp),%xmm9
705 movaps 0x50(%rsp),%xmm10
706 movaps 0x60(%rsp),%xmm11
707 movaps 0x70(%rsp),%xmm12
708 movaps 0x80(%rsp),%xmm13
709 movaps 0x90(%rsp),%xmm14
710 movaps 0xa0(%rsp),%xmm15
711 lea 0xb8(%rsp),%rsp
712.Lenc_key_epilogue:
713___
714$code.=<<___;
715 xor %eax,%eax
716 ret
717.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
718
719.globl ${PREFIX}_set_decrypt_key
720.type ${PREFIX}_set_decrypt_key,\@function,3
721.align 16
722${PREFIX}_set_decrypt_key:
723___
724$code.=<<___ if ($win64);
725 lea -0xb8(%rsp),%rsp
726 movaps %xmm6,0x10(%rsp)
727 movaps %xmm7,0x20(%rsp)
728 movaps %xmm8,0x30(%rsp)
729 movaps %xmm9,0x40(%rsp)
730 movaps %xmm10,0x50(%rsp)
731 movaps %xmm11,0x60(%rsp)
732 movaps %xmm12,0x70(%rsp)
733 movaps %xmm13,0x80(%rsp)
734 movaps %xmm14,0x90(%rsp)
735 movaps %xmm15,0xa0(%rsp)
736.Ldec_key_body:
737___
738$code.=<<___;
739 mov %esi,%eax
740 shr \$5,%eax
741 add \$5,%eax
742 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
743 shl \$4,%eax
744 lea 16(%rdx,%rax),%rdx
745
746 mov \$1,%ecx
747 mov %esi,%r8d
748 shr \$1,%r8d
749 and \$32,%r8d
750 xor \$32,%r8d # nbits==192?0:32
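	# (Editorial, worked values: 128>>1 = 64,  &32 -> 0,  ^32 -> 32
	#                            192>>1 = 96,  &32 -> 32, ^32 -> 0
	#                            256>>1 = 128, &32 -> 0,  ^32 -> 32)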
751 call _vpaes_schedule_core
752___
753$code.=<<___ if ($win64);
754 movaps 0x10(%rsp),%xmm6
755 movaps 0x20(%rsp),%xmm7
756 movaps 0x30(%rsp),%xmm8
757 movaps 0x40(%rsp),%xmm9
758 movaps 0x50(%rsp),%xmm10
759 movaps 0x60(%rsp),%xmm11
760 movaps 0x70(%rsp),%xmm12
761 movaps 0x80(%rsp),%xmm13
762 movaps 0x90(%rsp),%xmm14
763 movaps 0xa0(%rsp),%xmm15
764 lea 0xb8(%rsp),%rsp
765.Ldec_key_epilogue:
766___
767$code.=<<___;
768 xor %eax,%eax
769 ret
770.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
771
772.globl ${PREFIX}_encrypt
773.type ${PREFIX}_encrypt,\@function,3
774.align 16
775${PREFIX}_encrypt:
776___
777$code.=<<___ if ($win64);
778 lea -0xb8(%rsp),%rsp
779 movaps %xmm6,0x10(%rsp)
780 movaps %xmm7,0x20(%rsp)
781 movaps %xmm8,0x30(%rsp)
782 movaps %xmm9,0x40(%rsp)
783 movaps %xmm10,0x50(%rsp)
784 movaps %xmm11,0x60(%rsp)
785 movaps %xmm12,0x70(%rsp)
786 movaps %xmm13,0x80(%rsp)
787 movaps %xmm14,0x90(%rsp)
788 movaps %xmm15,0xa0(%rsp)
789.Lenc_body:
790___
791$code.=<<___;
792 movdqu (%rdi),%xmm0
793 call _vpaes_preheat
794 call _vpaes_encrypt_core
795 movdqu %xmm0,(%rsi)
796___
797$code.=<<___ if ($win64);
798 movaps 0x10(%rsp),%xmm6
799 movaps 0x20(%rsp),%xmm7
800 movaps 0x30(%rsp),%xmm8
801 movaps 0x40(%rsp),%xmm9
802 movaps 0x50(%rsp),%xmm10
803 movaps 0x60(%rsp),%xmm11
804 movaps 0x70(%rsp),%xmm12
805 movaps 0x80(%rsp),%xmm13
806 movaps 0x90(%rsp),%xmm14
807 movaps 0xa0(%rsp),%xmm15
808 lea 0xb8(%rsp),%rsp
809.Lenc_epilogue:
810___
811$code.=<<___;
812 ret
813.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
814
815.globl ${PREFIX}_decrypt
816.type ${PREFIX}_decrypt,\@function,3
817.align 16
818${PREFIX}_decrypt:
819___
820$code.=<<___ if ($win64);
821 lea -0xb8(%rsp),%rsp
822 movaps %xmm6,0x10(%rsp)
823 movaps %xmm7,0x20(%rsp)
824 movaps %xmm8,0x30(%rsp)
825 movaps %xmm9,0x40(%rsp)
826 movaps %xmm10,0x50(%rsp)
827 movaps %xmm11,0x60(%rsp)
828 movaps %xmm12,0x70(%rsp)
829 movaps %xmm13,0x80(%rsp)
830 movaps %xmm14,0x90(%rsp)
831 movaps %xmm15,0xa0(%rsp)
832.Ldec_body:
833___
834$code.=<<___;
835 movdqu (%rdi),%xmm0
836 call _vpaes_preheat
837 call _vpaes_decrypt_core
838 movdqu %xmm0,(%rsi)
839___
840$code.=<<___ if ($win64);
841 movaps 0x10(%rsp),%xmm6
842 movaps 0x20(%rsp),%xmm7
843 movaps 0x30(%rsp),%xmm8
844 movaps 0x40(%rsp),%xmm9
845 movaps 0x50(%rsp),%xmm10
846 movaps 0x60(%rsp),%xmm11
847 movaps 0x70(%rsp),%xmm12
848 movaps 0x80(%rsp),%xmm13
849 movaps 0x90(%rsp),%xmm14
850 movaps 0xa0(%rsp),%xmm15
851 lea 0xb8(%rsp),%rsp
852.Ldec_epilogue:
853___
854$code.=<<___;
855 ret
856.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
857___
858{
859my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
860# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
861# size_t length, const AES_KEY *key,
862# unsigned char *ivp,const int enc);
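# (Editorial note: the loops below implement the standard CBC
# recurrences, with the running IV kept in %xmm6 and written back on
# exit:
#	encrypt: C[i] = E_K(P[i] ^ C[i-1]),  C[-1] = IV
#	decrypt: P[i] = D_K(C[i]) ^ C[i-1],  C[-1] = IV )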
863$code.=<<___;
864.globl ${PREFIX}_cbc_encrypt
865.type ${PREFIX}_cbc_encrypt,\@function,6
866.align 16
867${PREFIX}_cbc_encrypt:
868 xchg $key,$len
869___
870($len,$key)=($key,$len);
871$code.=<<___;
872 sub \$16,$len
873 jc .Lcbc_abort
874___
875$code.=<<___ if ($win64);
876 lea -0xb8(%rsp),%rsp
877 movaps %xmm6,0x10(%rsp)
878 movaps %xmm7,0x20(%rsp)
879 movaps %xmm8,0x30(%rsp)
880 movaps %xmm9,0x40(%rsp)
881 movaps %xmm10,0x50(%rsp)
882 movaps %xmm11,0x60(%rsp)
883 movaps %xmm12,0x70(%rsp)
884 movaps %xmm13,0x80(%rsp)
885 movaps %xmm14,0x90(%rsp)
886 movaps %xmm15,0xa0(%rsp)
887.Lcbc_body:
888___
889$code.=<<___;
890 movdqu ($ivp),%xmm6 # load IV
891 sub $inp,$out
892 call _vpaes_preheat
893 cmp \$0,${enc}d
894 je .Lcbc_dec_loop
895 jmp .Lcbc_enc_loop
896.align 16
897.Lcbc_enc_loop:
898 movdqu ($inp),%xmm0
899 pxor %xmm6,%xmm0
900 call _vpaes_encrypt_core
901 movdqa %xmm0,%xmm6
902 movdqu %xmm0,($out,$inp)
903 lea 16($inp),$inp
904 sub \$16,$len
905 jnc .Lcbc_enc_loop
906 jmp .Lcbc_done
907.align 16
908.Lcbc_dec_loop:
909 movdqu ($inp),%xmm0
910 movdqa %xmm0,%xmm7
911 call _vpaes_decrypt_core
912 pxor %xmm6,%xmm0
913 movdqa %xmm7,%xmm6
914 movdqu %xmm0,($out,$inp)
915 lea 16($inp),$inp
916 sub \$16,$len
917 jnc .Lcbc_dec_loop
918.Lcbc_done:
919 movdqu %xmm6,($ivp) # save IV
920___
921$code.=<<___ if ($win64);
922 movaps 0x10(%rsp),%xmm6
923 movaps 0x20(%rsp),%xmm7
924 movaps 0x30(%rsp),%xmm8
925 movaps 0x40(%rsp),%xmm9
926 movaps 0x50(%rsp),%xmm10
927 movaps 0x60(%rsp),%xmm11
928 movaps 0x70(%rsp),%xmm12
929 movaps 0x80(%rsp),%xmm13
930 movaps 0x90(%rsp),%xmm14
931 movaps 0xa0(%rsp),%xmm15
932 lea 0xb8(%rsp),%rsp
933.Lcbc_epilogue:
934___
935$code.=<<___;
936.Lcbc_abort:
937 ret
938.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
939___
940}
941$code.=<<___;
942##
943## _vpaes_preheat
944##
945## Fills register %r10 -> .Lk_s0F (the _vpaes_consts area, so you can -fPIC)
946## and %xmm9-%xmm15 as specified below.
947##
948.type _vpaes_preheat,\@abi-omnipotent
949.align 16
950_vpaes_preheat:
951 lea .Lk_s0F(%rip), %r10
952 movdqa -0x20(%r10), %xmm10 # .Lk_inv
953 movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
954 movdqa 0x00(%r10), %xmm9 # .Lk_s0F
955 movdqa 0x30(%r10), %xmm13 # .Lk_sb1
956 movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
957 movdqa 0x50(%r10), %xmm15 # .Lk_sb2
958 movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
959 ret
960.size _vpaes_preheat,.-_vpaes_preheat
961########################################################
962## ##
963## Constants ##
964## ##
965########################################################
966.type _vpaes_consts,\@object
967.align 64
968_vpaes_consts:
969.Lk_inv: # inv, inva
970 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
971 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
972
973.Lk_s0F: # s0F
974 .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
975
976.Lk_ipt: # input transform (lo, hi)
977 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
978 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
979
980.Lk_sb1: # sb1u, sb1t
981 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
982 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
983.Lk_sb2: # sb2u, sb2t
984 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
985 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
986.Lk_sbo: # sbou, sbot
987 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
988 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
989
990.Lk_mc_forward: # mc_forward
991 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
992 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
993 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
994 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
995
996.Lk_mc_backward:# mc_backward
997 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
998 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
999 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
1000 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
1001
1002.Lk_sr: # sr
1003 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
1004 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
1005 .quad 0x0F060D040B020900, 0x070E050C030A0108
1006 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
1007
1008.Lk_rcon: # rcon
1009 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
1010
1011.Lk_s63: # s63: all equal to 0x63 transformed
1012 .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
1013
1014.Lk_opt: # output transform
1015 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
1016 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
1017
1018.Lk_deskew: # deskew tables: inverts the sbox's "skew"
1019 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
1020 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
1021
1022##
1023## Decryption stuff
1024## Key schedule constants
1025##
1026.Lk_dksd: # decryption key schedule: invskew x*D
1027 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
1028 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
1029.Lk_dksb: # decryption key schedule: invskew x*B
1030 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
1031 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
1032.Lk_dkse: # decryption key schedule: invskew x*E + 0x63
1033 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
1034 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
1035.Lk_dks9: # decryption key schedule: invskew x*9
1036 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
1037 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
1038
1039##
1040## Decryption stuff
1041## Round function constants
1042##
1043.Lk_dipt: # decryption input transform
1044 .quad 0x0F505B040B545F00, 0x154A411E114E451A
1045 .quad 0x86E383E660056500, 0x12771772F491F194
1046
1047.Lk_dsb9: # decryption sbox output *9*u, *9*t
1048 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
1049 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
1050.Lk_dsbd: # decryption sbox output *D*u, *D*t
1051 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
1052 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
1053.Lk_dsbb: # decryption sbox output *B*u, *B*t
1054 .quad 0xD022649296B44200, 0x602646F6B0F2D404
1055 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
1056.Lk_dsbe: # decryption sbox output *E*u, *E*t
1057 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
1058 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
1059.Lk_dsbo: # decryption sbox final output
1060 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
1061 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
1062.asciz	"Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
1063.align 64
1064.size _vpaes_consts,.-_vpaes_consts
1065___
1066
1067if ($win64) {
1068# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1069# CONTEXT *context,DISPATCHER_CONTEXT *disp)
1070$rec="%rcx";
1071$frame="%rdx";
1072$context="%r8";
1073$disp="%r9";
1074
1075$code.=<<___;
1076.extern __imp_RtlVirtualUnwind
1077.type se_handler,\@abi-omnipotent
1078.align 16
1079se_handler:
1080 push %rsi
1081 push %rdi
1082 push %rbx
1083 push %rbp
1084 push %r12
1085 push %r13
1086 push %r14
1087 push %r15
1088 pushfq
1089 sub \$64,%rsp
1090
1091 mov 120($context),%rax # pull context->Rax
1092 mov 248($context),%rbx # pull context->Rip
1093
1094 mov 8($disp),%rsi # disp->ImageBase
1095 mov 56($disp),%r11 # disp->HandlerData
1096
1097 mov 0(%r11),%r10d # HandlerData[0]
1098 lea (%rsi,%r10),%r10 # prologue label
1099 cmp %r10,%rbx # context->Rip<prologue label
1100 jb .Lin_prologue
1101
1102 mov 152($context),%rax # pull context->Rsp
1103
1104 mov 4(%r11),%r10d # HandlerData[1]
1105 lea (%rsi,%r10),%r10 # epilogue label
1106 cmp %r10,%rbx # context->Rip>=epilogue label
1107 jae .Lin_prologue
1108
1109 lea 16(%rax),%rsi # %xmm save area
1110 lea 512($context),%rdi # &context.Xmm6
1111 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1112 .long 0xa548f3fc # cld; rep movsq
1113 lea 0xb8(%rax),%rax # adjust stack pointer
1114
1115.Lin_prologue:
1116 mov 8(%rax),%rdi
1117 mov 16(%rax),%rsi
1118 mov %rax,152($context) # restore context->Rsp
1119 mov %rsi,168($context) # restore context->Rsi
1120 mov %rdi,176($context) # restore context->Rdi
1121
1122 mov 40($disp),%rdi # disp->ContextRecord
1123 mov $context,%rsi # context
1124 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1125 .long 0xa548f3fc # cld; rep movsq
1126
1127 mov $disp,%rsi
1128 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1129 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1130 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1131 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1132 mov 40(%rsi),%r10 # disp->ContextRecord
1133 lea 56(%rsi),%r11 # &disp->HandlerData
1134 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1135 mov %r10,32(%rsp) # arg5
1136 mov %r11,40(%rsp) # arg6
1137 mov %r12,48(%rsp) # arg7
1138 mov %rcx,56(%rsp) # arg8, (NULL)
1139 call *__imp_RtlVirtualUnwind(%rip)
1140
1141 mov \$1,%eax # ExceptionContinueSearch
1142 add \$64,%rsp
1143 popfq
1144 pop %r15
1145 pop %r14
1146 pop %r13
1147 pop %r12
1148 pop %rbp
1149 pop %rbx
1150 pop %rdi
1151 pop %rsi
1152 ret
1153.size se_handler,.-se_handler
1154
1155.section .pdata
1156.align 4
1157 .rva .LSEH_begin_${PREFIX}_set_encrypt_key
1158 .rva .LSEH_end_${PREFIX}_set_encrypt_key
1159 .rva .LSEH_info_${PREFIX}_set_encrypt_key
1160
1161 .rva .LSEH_begin_${PREFIX}_set_decrypt_key
1162 .rva .LSEH_end_${PREFIX}_set_decrypt_key
1163 .rva .LSEH_info_${PREFIX}_set_decrypt_key
1164
1165 .rva .LSEH_begin_${PREFIX}_encrypt
1166 .rva .LSEH_end_${PREFIX}_encrypt
1167 .rva .LSEH_info_${PREFIX}_encrypt
1168
1169 .rva .LSEH_begin_${PREFIX}_decrypt
1170 .rva .LSEH_end_${PREFIX}_decrypt
1171 .rva .LSEH_info_${PREFIX}_decrypt
1172
1173 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
1174 .rva .LSEH_end_${PREFIX}_cbc_encrypt
1175 .rva .LSEH_info_${PREFIX}_cbc_encrypt
1176
1177.section .xdata
1178.align 8
1179.LSEH_info_${PREFIX}_set_encrypt_key:
1180 .byte 9,0,0,0
1181 .rva se_handler
1182 .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
1183.LSEH_info_${PREFIX}_set_decrypt_key:
1184 .byte 9,0,0,0
1185 .rva se_handler
1186 .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
1187.LSEH_info_${PREFIX}_encrypt:
1188 .byte 9,0,0,0
1189 .rva se_handler
1190 .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
1191.LSEH_info_${PREFIX}_decrypt:
1192 .byte 9,0,0,0
1193 .rva se_handler
1194 .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
1195.LSEH_info_${PREFIX}_cbc_encrypt:
1196 .byte 9,0,0,0
1197 .rva se_handler
1198 .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
1199___
1200}
1201
1202$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1203
1204print $code;
1205
1206close STDOUT;