Diffstat:
-rw-r--r--	src/lib/libcrypto/aes/asm/aes-s390x.pl	1071
1 file changed, 993 insertions(+), 78 deletions(-)
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl
index 7e01889298..445a1e6762 100644
--- a/src/lib/libcrypto/aes/asm/aes-s390x.pl
+++ b/src/lib/libcrypto/aes/asm/aes-s390x.pl
@@ -44,12 +44,57 @@
 # Unlike previous version hardware support detection takes place only
 # at the moment of key schedule setup, which is denoted in key->rounds.
 # This is done, because deferred key setup can't be made MT-safe, not
-# for key lengthes longer than 128 bits.
+# for keys longer than 128 bits.
 #
 # Add AES_cbc_encrypt, which gives incredible performance improvement,
 # it was measured to be ~6.6x. It's less than previously mentioned 8x,
 # because software implementation was optimized.
 
+# May 2010.
+#
+# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
+# performance improvement over "generic" counter mode routine relying
+# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
+# to the fact that exact throughput value depends on current stack
+# frame alignment within 4KB page. In worst case you get ~75% of the
+# maximum, but *on average* it would be as much as ~98%. Meaning that
+# the worst case is unlikely, it's like hitting a ravine on a plateau.
+
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2x better than code generated by gcc 4.3.
+
+# December 2010.
+#
+# Add support for z196 "cipher message with counter" instruction.
+# Note however that it's disengaged, because it was measured to
+# perform ~12% worse than vanilla km-based code...
+
+# February 2011.
+#
+# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
+# instructions, which deliver ~70% improvement at 8KB block size over
+# vanilla km-based code, and ~37% at 512-byte block size.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
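The $SIZE_T/$g pair introduced above is what lets the rest of this patch emit one instruction stream for both ABIs: interpolating ${g} turns "st"/"l"/"lm" into the 64-bit "stg"/"lg"/"lmg" forms, and $SIZE_T scales every stack-slot offset. A minimal standalone sketch of that mechanism (the sample instruction and flavour string are illustrative, not from the patch):

    # Sketch: how ${g}/$SIZE_T select 31-bit vs 64-bit instruction forms.
    my $flavour = "linux64";                 # assumption: e.g. "linux32" or "linux64"
    my ($SIZE_T, $g) = ($flavour =~ /3[12]/) ? (4, "") : (8, "g");
    my $code = "\tst${g}\t%r14,14*$SIZE_T(%r15)\n";
    print $code;   # "stg %r14,14*8(%r15)" on 64-bit; the assembler folds 14*8
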
 $softonly=0;	# allow hardware support
 
 $t0="%r0";	$mask="%r0";
@@ -69,6 +114,8 @@ $rounds="%r13";
 $ra="%r14";
 $sp="%r15";
 
+$stdframe=16*$SIZE_T+4*8;
+
 sub _data_word()
 { my $i;
 	while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -210,7 +257,7 @@ $code.=<<___ if (!$softonly);
 .Lesoft:
 ___
 $code.=<<___;
-	stmg	%r3,$ra,24($sp)
+	stm${g}	%r3,$ra,3*$SIZE_T($sp)
 
 	llgf	$s0,0($inp)
 	llgf	$s1,4($inp)
@@ -220,20 +267,20 @@ $code.=<<___;
 	larl	$tbl,AES_Te
 	bras	$ra,_s390x_AES_encrypt
 
-	lg	$out,24($sp)
+	l${g}	$out,3*$SIZE_T($sp)
 	st	$s0,0($out)
 	st	$s1,4($out)
 	st	$s2,8($out)
 	st	$s3,12($out)
 
-	lmg	%r6,$ra,48($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
 	br	$ra
 .size	AES_encrypt,.-AES_encrypt
 
 .type	_s390x_AES_encrypt,\@function
 .align	16
 _s390x_AES_encrypt:
-	stg	$ra,152($sp)
+	st${g}	$ra,15*$SIZE_T($sp)
 	x	$s0,0($key)
 	x	$s1,4($key)
 	x	$s2,8($key)
@@ -397,7 +444,7 @@ _s390x_AES_encrypt:
 	or	$s2,$i3
 	or	$s3,$t3
 
-	lg	$ra,152($sp)
+	l${g}	$ra,15*$SIZE_T($sp)
 	xr	$s0,$t0
 	xr	$s1,$t2
 	x	$s2,24($key)
@@ -536,7 +583,7 @@ $code.=<<___ if (!$softonly);
 .Ldsoft:
 ___
 $code.=<<___;
-	stmg	%r3,$ra,24($sp)
+	stm${g}	%r3,$ra,3*$SIZE_T($sp)
 
 	llgf	$s0,0($inp)
 	llgf	$s1,4($inp)
@@ -546,20 +593,20 @@ $code.=<<___;
 	larl	$tbl,AES_Td
 	bras	$ra,_s390x_AES_decrypt
 
-	lg	$out,24($sp)
+	l${g}	$out,3*$SIZE_T($sp)
 	st	$s0,0($out)
 	st	$s1,4($out)
 	st	$s2,8($out)
 	st	$s3,12($out)
 
-	lmg	%r6,$ra,48($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
 	br	$ra
 .size	AES_decrypt,.-AES_decrypt
 
 .type	_s390x_AES_decrypt,\@function
 .align	16
 _s390x_AES_decrypt:
-	stg	$ra,152($sp)
+	st${g}	$ra,15*$SIZE_T($sp)
 	x	$s0,0($key)
 	x	$s1,4($key)
 	x	$s2,8($key)
@@ -703,7 +750,7 @@ _s390x_AES_decrypt:
 	nr	$i1,$mask
 	nr	$i2,$mask
 
-	lg	$ra,152($sp)
+	l${g}	$ra,15*$SIZE_T($sp)
 	or	$s1,$t1
 	l	$t0,16($key)
 	l	$t1,20($key)
@@ -732,14 +779,15 @@ ___
 $code.=<<___;
 # void AES_set_encrypt_key(const unsigned char *in, int bits,
 #		 AES_KEY *key) {
-.globl	AES_set_encrypt_key
-.type	AES_set_encrypt_key,\@function
+.globl	private_AES_set_encrypt_key
+.type	private_AES_set_encrypt_key,\@function
 .align	16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_s390x_AES_set_encrypt_key:
 	lghi	$t0,0
-	clgr	$inp,$t0
+	cl${g}r	$inp,$t0
 	je	.Lminus1
-	clgr	$key,$t0
+	cl${g}r	$key,$t0
 	je	.Lminus1
 
 	lghi	$t0,128
@@ -789,7 +837,8 @@ $code.=<<___ if (!$softonly);
 	je	1f
 	lg	%r1,24($inp)
 	stg	%r1,24($key)
-1:	st	$bits,236($key)	# save bits
+1:	st	$bits,236($key)	# save bits [for debugging purposes]
+	lgr	$t0,%r5
 	st	%r5,240($key)	# save km code
 	lghi	%r2,0
 	br	%r14
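For reference, key->rounds at offset 240 does double duty here: on the software path it holds the round count, on the hardware path the km function code probed in %r5 above. A hedged sketch of that mapping, assuming the documented z/Architecture KM function codes (18/19/20 for AES-128/192/256) and the 0x80 "decipher" modifier bit set later by oill:

    # Sketch (assumption: standard KM-AES function codes):
    my %km_fc  = (128 => 18, 192 => 19, 256 => 20);  # what "st %r5,240($key)" records
    my %rounds = (128 => 10, 192 => 12, 256 => 14);  # software fallback path
    my $bits = 128;
    my $fc = $km_fc{$bits};
    $fc |= 0x80;            # decrypt: the "oill $t0,0x80" modifier bit
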
@@ -797,7 +846,7 @@ ___
797$code.=<<___; 846$code.=<<___;
798.align 16 847.align 16
799.Lekey_internal: 848.Lekey_internal:
800 stmg %r6,%r13,48($sp) # all non-volatile regs 849 stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
801 850
802 larl $tbl,AES_Te+2048 851 larl $tbl,AES_Te+2048
803 852
@@ -857,8 +906,9 @@ $code.=<<___;
857 la $key,16($key) # key+=4 906 la $key,16($key) # key+=4
858 la $t3,4($t3) # i++ 907 la $t3,4($t3) # i++
859 brct $rounds,.L128_loop 908 brct $rounds,.L128_loop
909 lghi $t0,10
860 lghi %r2,0 910 lghi %r2,0
861 lmg %r6,%r13,48($sp) 911 lm${g} %r4,%r13,4*$SIZE_T($sp)
862 br $ra 912 br $ra
863 913
864.align 16 914.align 16
@@ -905,8 +955,9 @@ $code.=<<___;
905 st $s2,32($key) 955 st $s2,32($key)
906 st $s3,36($key) 956 st $s3,36($key)
907 brct $rounds,.L192_continue 957 brct $rounds,.L192_continue
958 lghi $t0,12
908 lghi %r2,0 959 lghi %r2,0
909 lmg %r6,%r13,48($sp) 960 lm${g} %r4,%r13,4*$SIZE_T($sp)
910 br $ra 961 br $ra
911 962
912.align 16 963.align 16
@@ -967,8 +1018,9 @@ $code.=<<___;
967 st $s2,40($key) 1018 st $s2,40($key)
968 st $s3,44($key) 1019 st $s3,44($key)
969 brct $rounds,.L256_continue 1020 brct $rounds,.L256_continue
1021 lghi $t0,14
970 lghi %r2,0 1022 lghi %r2,0
971 lmg %r6,%r13,48($sp) 1023 lm${g} %r4,%r13,4*$SIZE_T($sp)
972 br $ra 1024 br $ra
973 1025
974.align 16 1026.align 16
@@ -1011,42 +1063,34 @@ $code.=<<___;
1011.Lminus1: 1063.Lminus1:
1012 lghi %r2,-1 1064 lghi %r2,-1
1013 br $ra 1065 br $ra
1014.size AES_set_encrypt_key,.-AES_set_encrypt_key 1066.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
1015 1067
1016# void AES_set_decrypt_key(const unsigned char *in, int bits, 1068# void AES_set_decrypt_key(const unsigned char *in, int bits,
1017# AES_KEY *key) { 1069# AES_KEY *key) {
1018.globl AES_set_decrypt_key 1070.globl private_AES_set_decrypt_key
1019.type AES_set_decrypt_key,\@function 1071.type private_AES_set_decrypt_key,\@function
1020.align 16 1072.align 16
1021AES_set_decrypt_key: 1073private_AES_set_decrypt_key:
1022 stg $key,32($sp) # I rely on AES_set_encrypt_key to 1074 #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
1023 stg $ra,112($sp) # save non-volatile registers! 1075 st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
1024 bras $ra,AES_set_encrypt_key 1076 bras $ra,_s390x_AES_set_encrypt_key
1025 lg $key,32($sp) 1077 #l${g} $key,4*$SIZE_T($sp)
1026 lg $ra,112($sp) 1078 l${g} $ra,14*$SIZE_T($sp)
1027 ltgr %r2,%r2 1079 ltgr %r2,%r2
1028 bnzr $ra 1080 bnzr $ra
1029___ 1081___
1030$code.=<<___ if (!$softonly); 1082$code.=<<___ if (!$softonly);
1031 l $t0,240($key) 1083 #l $t0,240($key)
1032 lhi $t1,16 1084 lhi $t1,16
1033 cr $t0,$t1 1085 cr $t0,$t1
1034 jl .Lgo 1086 jl .Lgo
1035 oill $t0,0x80 # set "decrypt" bit 1087 oill $t0,0x80 # set "decrypt" bit
1036 st $t0,240($key) 1088 st $t0,240($key)
1037 br $ra 1089 br $ra
1038
1039.align 16
1040.Ldkey_internal:
1041 stg $key,32($sp)
1042 stg $ra,40($sp)
1043 bras $ra,.Lekey_internal
1044 lg $key,32($sp)
1045 lg $ra,40($sp)
1046___ 1090___
1047$code.=<<___; 1091$code.=<<___;
1048 1092.align 16
1049.Lgo: llgf $rounds,240($key) 1093.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
1050 la $i1,0($key) 1094 la $i1,0($key)
1051 sllg $i2,$rounds,4 1095 sllg $i2,$rounds,4
1052 la $i2,0($i2,$key) 1096 la $i2,0($i2,$key)
@@ -1123,13 +1167,14 @@ $code.=<<___;
1123 la $key,4($key) 1167 la $key,4($key)
1124 brct $rounds,.Lmix 1168 brct $rounds,.Lmix
1125 1169
1126 lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! 1170 lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
1127 lghi %r2,0 1171 lghi %r2,0
1128 br $ra 1172 br $ra
1129.size AES_set_decrypt_key,.-AES_set_decrypt_key 1173.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
1130___ 1174___
1131 1175
1132#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, 1176########################################################################
1177# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1133# size_t length, const AES_KEY *key, 1178# size_t length, const AES_KEY *key,
1134# unsigned char *ivec, const int enc) 1179# unsigned char *ivec, const int enc)
1135{ 1180{
@@ -1163,7 +1208,7 @@ $code.=<<___ if (!$softonly);
1163 l %r0,240($key) # load kmc code 1208 l %r0,240($key) # load kmc code
1164 lghi $key,15 # res=len%16, len-=res; 1209 lghi $key,15 # res=len%16, len-=res;
1165 ngr $key,$len 1210 ngr $key,$len
1166 slgr $len,$key 1211 sl${g}r $len,$key
1167 la %r1,16($sp) # parameter block - ivec || key 1212 la %r1,16($sp) # parameter block - ivec || key
1168 jz .Lkmc_truncated 1213 jz .Lkmc_truncated
1169 .long 0xb92f0042 # kmc %r4,%r2 1214 .long 0xb92f0042 # kmc %r4,%r2
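The hard-coded .long above is kmc %r4,%r2 spelled out numerically, presumably because assemblers of the day lacked the mnemonic. kmc is an RRE-format instruction: a 16-bit opcode, eight zero bits, then the r1 and r2 register fields. A small sketch of how such words can be built (rre is a hypothetical helper, not part of the script):

    # Sketch: forming the RRE-format opcodes that appear as .long here.
    sub rre { my ($op,$r1,$r2) = @_; sprintf ".long\t0x%04x00%x%x", $op, $r1, $r2; }
    print rre(0xb92f,4,2), "\n";   # ".long 0xb92f0042" == kmc %r4,%r2
    print rre(0xb92e,4,2), "\n";   # ".long 0xb92e0042" == km  %r4,%r2
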
@@ -1181,34 +1226,34 @@ $code.=<<___ if (!$softonly);
 	tmll	%r0,0x80
 	jnz	.Lkmc_truncated_dec
 	lghi	%r1,0
-	stg	%r1,128($sp)
-	stg	%r1,136($sp)
+	stg	%r1,16*$SIZE_T($sp)
+	stg	%r1,16*$SIZE_T+8($sp)
 	bras	%r1,1f
-	mvc	128(1,$sp),0($inp)
+	mvc	16*$SIZE_T(1,$sp),0($inp)
 1:	ex	$key,0(%r1)
 	la	%r1,16($sp)	# restore parameter block
-	la	$inp,128($sp)
+	la	$inp,16*$SIZE_T($sp)
 	lghi	$len,16
 	.long	0xb92f0042	# kmc %r4,%r2
 	j	.Lkmc_done
 .align	16
 .Lkmc_truncated_dec:
-	stg	$out,64($sp)
-	la	$out,128($sp)
+	st${g}	$out,4*$SIZE_T($sp)
+	la	$out,16*$SIZE_T($sp)
 	lghi	$len,16
 	.long	0xb92f0042	# kmc %r4,%r2
-	lg	$out,64($sp)
+	l${g}	$out,4*$SIZE_T($sp)
 	bras	%r1,2f
-	mvc	0(1,$out),128($sp)
+	mvc	0(1,$out),16*$SIZE_T($sp)
 2:	ex	$key,0(%r1)
 	j	.Lkmc_done
 .align	16
 .Lcbc_software:
 ___
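The truncated-block paths above rely on the EXECUTE idiom: bras drops the address of an mvc template (with a length field of 1) into %r1, and "ex reg,0(%r1)" runs that mvc with its length byte OR-ed from the low byte of the register, giving a copy whose length is decided at run time. Under that reading, a rough Perl equivalent of what the executed mvc does (ex_mvc is a hypothetical stand-in):

    # Sketch: EXECUTE with an mvc template.  mvc encodes bytes-1 in its
    # length field, and EX ORs in the register's low byte, so $r picks
    # how much actually gets copied.
    sub ex_mvc {
        my ($dst, $src, $r) = @_;    # $r: the value left in the EX register
        substr($$dst, 0, $r + 1) = substr($$src, 0, $r + 1);
    }
    my ($buf, $in) = ("\0" x 16, "ABCDEFGHIJKLMNOP");
    ex_mvc(\$buf, \$in, 7);          # copies 8 bytes into the zeroed buffer
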
 $code.=<<___;
-	stmg	$key,$ra,40($sp)
+	stm${g}	$key,$ra,5*$SIZE_T($sp)
 	lhi	%r0,0
-	cl	%r0,164($sp)
+	cl	%r0,`$stdframe+$SIZE_T-4`($sp)
 	je	.Lcbc_decrypt
 
 	larl	$tbl,AES_Te
@@ -1219,10 +1264,10 @@ $code.=<<___;
 	llgf	$s3,12($ivp)
 
 	lghi	$t0,16
-	slgr	$len,$t0
+	sl${g}r	$len,$t0
 	brc	4,.Lcbc_enc_tail	# if borrow
 .Lcbc_enc_loop:
-	stmg	$inp,$out,16($sp)
+	stm${g}	$inp,$out,2*$SIZE_T($sp)
 	x	$s0,0($inp)
 	x	$s1,4($inp)
 	x	$s2,8($inp)
@@ -1231,7 +1276,7 @@ $code.=<<___;
 
 	bras	$ra,_s390x_AES_encrypt
 
-	lmg	$inp,$key,16($sp)
+	lm${g}	$inp,$key,2*$SIZE_T($sp)
 	st	$s0,0($out)
 	st	$s1,4($out)
 	st	$s2,8($out)
@@ -1240,33 +1285,33 @@ $code.=<<___;
 	la	$inp,16($inp)
 	la	$out,16($out)
 	lghi	$t0,16
-	ltgr	$len,$len
+	lt${g}r	$len,$len
 	jz	.Lcbc_enc_done
-	slgr	$len,$t0
+	sl${g}r	$len,$t0
 	brc	4,.Lcbc_enc_tail	# if borrow
 	j	.Lcbc_enc_loop
 .align	16
 .Lcbc_enc_done:
-	lg	$ivp,48($sp)
+	l${g}	$ivp,6*$SIZE_T($sp)
 	st	$s0,0($ivp)
 	st	$s1,4($ivp)
 	st	$s2,8($ivp)
 	st	$s3,12($ivp)
 
-	lmg	%r7,$ra,56($sp)
+	lm${g}	%r7,$ra,7*$SIZE_T($sp)
 	br	$ra
 
 .align	16
 .Lcbc_enc_tail:
 	aghi	$len,15
 	lghi	$t0,0
-	stg	$t0,128($sp)
-	stg	$t0,136($sp)
+	stg	$t0,16*$SIZE_T($sp)
+	stg	$t0,16*$SIZE_T+8($sp)
 	bras	$t1,3f
-	mvc	128(1,$sp),0($inp)
+	mvc	16*$SIZE_T(1,$sp),0($inp)
 3:	ex	$len,0($t1)
 	lghi	$len,0
-	la	$inp,128($sp)
+	la	$inp,16*$SIZE_T($sp)
 	j	.Lcbc_enc_loop
 
 .align	16
@@ -1275,10 +1320,10 @@ $code.=<<___;
 
 	lg	$t0,0($ivp)
 	lg	$t1,8($ivp)
-	stmg	$t0,$t1,128($sp)
+	stmg	$t0,$t1,16*$SIZE_T($sp)
 
 .Lcbc_dec_loop:
-	stmg	$inp,$out,16($sp)
+	stm${g}	$inp,$out,2*$SIZE_T($sp)
 	llgf	$s0,0($inp)
 	llgf	$s1,4($inp)
 	llgf	$s2,8($inp)
@@ -1287,7 +1332,7 @@ $code.=<<___;
 
 	bras	$ra,_s390x_AES_decrypt
 
-	lmg	$inp,$key,16($sp)
+	lm${g}	$inp,$key,2*$SIZE_T($sp)
 	sllg	$s0,$s0,32
 	sllg	$s2,$s2,32
 	lr	$s0,$s1
@@ -1295,15 +1340,15 @@ $code.=<<___;
 
 	lg	$t0,0($inp)
 	lg	$t1,8($inp)
-	xg	$s0,128($sp)
-	xg	$s2,136($sp)
+	xg	$s0,16*$SIZE_T($sp)
+	xg	$s2,16*$SIZE_T+8($sp)
 	lghi	$s1,16
-	slgr	$len,$s1
+	sl${g}r	$len,$s1
 	brc	4,.Lcbc_dec_tail	# if borrow
 	brc	2,.Lcbc_dec_done	# if zero
 	stg	$s0,0($out)
 	stg	$s2,8($out)
-	stmg	$t0,$t1,128($sp)
+	stmg	$t0,$t1,16*$SIZE_T($sp)
 
 	la	$inp,16($inp)
 	la	$out,16($out)
@@ -1313,7 +1358,7 @@ $code.=<<___;
 	stg	$s0,0($out)
 	stg	$s2,8($out)
 .Lcbc_dec_exit:
-	lmg	$ivp,$ra,48($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
 	stmg	$t0,$t1,0($ivp)
 
 	br	$ra
@@ -1321,19 +1366,889 @@ $code.=<<___;
 .align	16
 .Lcbc_dec_tail:
 	aghi	$len,15
-	stg	$s0,128($sp)
-	stg	$s2,136($sp)
+	stg	$s0,16*$SIZE_T($sp)
+	stg	$s2,16*$SIZE_T+8($sp)
 	bras	$s1,4f
-	mvc	0(1,$out),128($sp)
+	mvc	0(1,$out),16*$SIZE_T($sp)
 4:	ex	$len,0($s1)
 	j	.Lcbc_dec_exit
 .size	AES_cbc_encrypt,.-AES_cbc_encrypt
-.comm	OPENSSL_s390xcap_P,8,8
+___
+}
+########################################################################
+# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+#			size_t blocks, const AES_KEY *key,
+#			const unsigned char *ivec)
+{
+my $inp="%r2";
+my $out="%r4";	# blocks and out are swapped
+my $len="%r3";
+my $key="%r5";	my $iv0="%r5";
+my $ivp="%r6";
+my $fp ="%r7";
+
+$code.=<<___;
+.globl	AES_ctr32_encrypt
+.type	AES_ctr32_encrypt,\@function
+.align	16
+AES_ctr32_encrypt:
+	xgr	%r3,%r4		# flip %r3 and %r4, $out and $len
+	xgr	%r4,%r3
+	xgr	%r3,%r4
+	llgfr	$len,$len	# safe in ctr32 subroutine even in 64-bit case
+___
+$code.=<<___ if (!$softonly);
+	l	%r0,240($key)
+	lhi	%r1,16
+	clr	%r0,%r1
+	jl	.Lctr32_software
+
+	stm${g}	%r6,$s3,6*$SIZE_T($sp)
+
+	slgr	$out,$inp
+	la	%r1,0($key)	# %r1 is permanent copy of $key
+	lg	$iv0,0($ivp)	# load ivec
+	lg	$ivp,8($ivp)
+
+	# prepare and allocate stack frame at the top of 4K page
+	# with 1K reserved for eventual signal handling
+	lghi	$s0,-1024-256-16	# guarantee at least 256-bytes buffer
+	lghi	$s1,-4096
+	algr	$s0,$sp
+	lgr	$fp,$sp
+	ngr	$s0,$s1		# align at page boundary
+	slgr	$fp,$s0		# total buffer size
+	lgr	$s2,$sp
+	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
+	slgr	$fp,$s1		# deduct reservation to get usable buffer size
+	# buffer size is at least 256 and at most 3072+256-16
+
+	la	$sp,1024($s0)	# alloca
+	srlg	$fp,$fp,4	# convert bytes to blocks, minimum 16
+	st${g}	$s2,0($sp)	# back-chain
+	st${g}	$fp,$SIZE_T($sp)
+
+	slgr	$len,$fp
+	brc	1,.Lctr32_hw_switch	# not zero, no borrow
+	algr	$fp,$len	# input is shorter than allocated buffer
+	lghi	$len,0
+	st${g}	$fp,$SIZE_T($sp)
+
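The fixed-point arithmetic above carves the buffer out so that it ends at a 4KB page boundary, which is what makes km throughput insensitive to where the caller's frame happened to land (cf. the "up to 4.3x" note at the top of the file). A sketch of the same computation in plain Perl, with a hypothetical starting stack pointer:

    # Sketch of the page-aligned alloca (hypothetical $sp value):
    my $sp = 0x7ffe_9a90;
    my $s0 = ($sp - (1024 + 256 + 16)) & ~4095;   # algr/ngr: align down to page
    my $total  = $sp - $s0;                       # "slgr $fp,$s0"
    my $usable = $total - (1024 + 16);            # reserve 1KB+16 for signals
    my $new_sp = $s0 + 1024;                      # "la $sp,1024($s0)", the alloca
    my $blocks = $usable >> 4;                    # bytes -> 16-byte blocks
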
+.Lctr32_hw_switch:
+___
+$code.=<<___ if (0);	######### kmctr code was measured to be ~12% slower
+	larl	$s0,OPENSSL_s390xcap_P
+	lg	$s0,8($s0)
+	tmhh	$s0,0x0004	# check for message_security-assist-4
+	jz	.Lctr32_km_loop
+
+	llgfr	$s0,%r0
+	lgr	$s1,%r1
+	lghi	%r0,0
+	la	%r1,16($sp)
+	.long	0xb92d2042	# kmctr %r4,%r2,%r2
+
+	llihh	%r0,0x8000	# check if kmctr supports the function code
+	srlg	%r0,%r0,0($s0)
+	ng	%r0,16($sp)
+	lgr	%r0,$s0
+	lgr	%r1,$s1
+	jz	.Lctr32_km_loop
+
+####### kmctr code
+	algr	$out,$inp	# restore $out
+	lgr	$s1,$len	# $s1 undertakes $len
+	j	.Lctr32_kmctr_loop
+.align	16
+.Lctr32_kmctr_loop:
+	la	$s2,16($sp)
+	lgr	$s3,$fp
+.Lctr32_kmctr_prepare:
+	stg	$iv0,0($s2)
+	stg	$ivp,8($s2)
+	la	$s2,16($s2)
+	ahi	$ivp,1		# 32-bit increment, preserves upper half
+	brct	$s3,.Lctr32_kmctr_prepare
+
+	#la	$inp,0($inp)	# inp
+	sllg	$len,$fp,4	# len
+	#la	$out,0($out)	# out
+	la	$s2,16($sp)	# iv
+	.long	0xb92da042	# kmctr $out,$s2,$inp
+	brc	1,.-4		# pay attention to "partial completion"
+
+	slgr	$s1,$fp
+	brc	1,.Lctr32_kmctr_loop	# not zero, no borrow
+	algr	$fp,$s1
+	lghi	$s1,0
+	brc	4+1,.Lctr32_kmctr_loop	# not zero
+
+	l${g}	$sp,0($sp)
+	lm${g}	%r6,$s3,6*$SIZE_T($sp)
+	br	$ra
+.align	16
+___
+$code.=<<___;
+.Lctr32_km_loop:
+	la	$s2,16($sp)
+	lgr	$s3,$fp
+.Lctr32_km_prepare:
+	stg	$iv0,0($s2)
+	stg	$ivp,8($s2)
+	la	$s2,16($s2)
+	ahi	$ivp,1		# 32-bit increment, preserves upper half
+	brct	$s3,.Lctr32_km_prepare
+
+	la	$s0,16($sp)	# inp
+	sllg	$s1,$fp,4	# len
+	la	$s2,16($sp)	# out
+	.long	0xb92e00a8	# km %r10,%r8
+	brc	1,.-4		# pay attention to "partial completion"
+
+	la	$s2,16($sp)
+	lgr	$s3,$fp
+	slgr	$s2,$inp
+.Lctr32_km_xor:
+	lg	$s0,0($inp)
+	lg	$s1,8($inp)
+	xg	$s0,0($s2,$inp)
+	xg	$s1,8($s2,$inp)
+	stg	$s0,0($out,$inp)
+	stg	$s1,8($out,$inp)
+	la	$inp,16($inp)
+	brct	$s3,.Lctr32_km_xor
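Since km only does ECB, the loop above builds CTR out of it: .Lctr32_km_prepare materializes a vector of counter blocks on the stack (incrementing only the low 32 bits of the IV), one km call encrypts the whole vector in place, and .Lctr32_km_xor folds the resulting keystream into the input. A compact sketch of that composition, where aes_ecb is a stand-in for the km invocation and not a function in this file:

    # Sketch: CTR mode out of an ECB-only primitive.
    sub ctr32_chunk {
        my ($aes_ecb, $iv_hi12, $iv_lo32, $in) = @_;  # iv: 12-byte prefix + counter
        my $pad = "";
        for my $i (0 .. length($in)/16 - 1) {
            $pad .= $iv_hi12 . pack("N", $iv_lo32 + $i);  # low 32 bits only, wraps
        }
        return $aes_ecb->($pad) ^ $in;                    # the .Lctr32_km_xor loop
    }
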
+
+	slgr	$len,$fp
+	brc	1,.Lctr32_km_loop	# not zero, no borrow
+	algr	$fp,$len
+	lghi	$len,0
+	brc	4+1,.Lctr32_km_loop	# not zero
+
+	l${g}	$s0,0($sp)
+	l${g}	$s1,$SIZE_T($sp)
+	la	$s2,16($sp)
+.Lctr32_km_zap:
+	stg	$s0,0($s2)
+	stg	$s0,8($s2)
+	la	$s2,16($s2)
+	brct	$s1,.Lctr32_km_zap
+
+	la	$sp,0($s0)
+	lm${g}	%r6,$s3,6*$SIZE_T($sp)
+	br	$ra
+.align	16
+.Lctr32_software:
+___
+$code.=<<___;
+	stm${g}	$key,$ra,5*$SIZE_T($sp)
+	sl${g}r	$inp,$out
+	larl	$tbl,AES_Te
+	llgf	$t1,12($ivp)
+
+.Lctr32_loop:
+	stm${g}	$inp,$out,2*$SIZE_T($sp)
+	llgf	$s0,0($ivp)
+	llgf	$s1,4($ivp)
+	llgf	$s2,8($ivp)
+	lgr	$s3,$t1
+	st	$t1,16*$SIZE_T($sp)
+	lgr	%r4,$key
+
+	bras	$ra,_s390x_AES_encrypt
+
+	lm${g}	$inp,$ivp,2*$SIZE_T($sp)
+	llgf	$t1,16*$SIZE_T($sp)
+	x	$s0,0($inp,$out)
+	x	$s1,4($inp,$out)
+	x	$s2,8($inp,$out)
+	x	$s3,12($inp,$out)
+	stm	$s0,$s3,0($out)
+
+	la	$out,16($out)
+	ahi	$t1,1		# 32-bit increment
+	brct	$len,.Lctr32_loop
+
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
+	br	$ra
+.size	AES_ctr32_encrypt,.-AES_ctr32_encrypt
+___
+}
+
+########################################################################
+# void AES_xts_encrypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2,
+#	const unsigned char iv[16]);
+#
+{
+my $inp="%r2";
+my $out="%r4";	# len and out are swapped
+my $len="%r3";
+my $key1="%r5";	# $i1
+my $key2="%r6";	# $i2
+my $fp="%r7";	# $i3
+my $tweak=16*$SIZE_T+16;	# or $stdframe-16, bottom of the frame...
+
+$code.=<<___;
+.type	_s390x_xts_km,\@function
+.align	16
+_s390x_xts_km:
+___
+$code.=<<___ if(1);
+	llgfr	$s0,%r0			# put aside the function code
+	lghi	$s1,0x7f
+	nr	$s1,%r0
+	lghi	%r0,0			# query capability vector
+	la	%r1,2*$SIZE_T($sp)
+	.long	0xb92e0042	# km %r4,%r2
+	llihh	%r1,0x8000
+	srlg	%r1,%r1,32($s1)		# check for 32+function code
+	ng	%r1,2*$SIZE_T($sp)
+	lgr	%r0,$s0			# restore the function code
+	la	%r1,0($key1)		# restore $key1
+	jz	.Lxts_km_vanilla
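The probe above issues km with function code 0, which stores a 128-bit capability vector at the parameter-block address; bit (32+fc) being set means the corresponding km-xts-aes code (base code plus 32, e.g. 18+32=50 for AES-128) is available. A sketch of the bit test that the llihh/srlg/ng sequence performs, assuming $status is the 16-byte vector from the query:

    # Sketch: test bit (32+fc) of the km capability vector (big-endian
    # bit numbering: bit 0 is the MSB of the first byte).
    sub km_supports_xts {
        my ($status, $fc) = @_;        # $status: 16 bytes stored by km fc 0
        my $bit = 32 + ($fc & 0x7f);   # the "nr $s1,%r0" masking above
        return (ord(substr($status, $bit >> 3, 1)) >> (7 - ($bit & 7))) & 1;
    }
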
+
+	lmg	$i2,$i3,$tweak($sp)	# put aside the tweak value
+	algr	$out,$inp
+
+	oill	%r0,32			# switch to xts function code
+	aghi	$s1,-18			#
+	sllg	$s1,$s1,3		# (function code - 18)*8, 0 or 16
+	la	%r1,$tweak-16($sp)
+	slgr	%r1,$s1			# parameter block position
+	lmg	$s0,$s3,0($key1)	# load 256 bits of key material,
+	stmg	$s0,$s3,0(%r1)		# and copy it to parameter block.
+					# yes, it contains junk and overlaps
+					# with the tweak in 128-bit case.
+					# it's done to avoid conditional
+					# branch.
+	stmg	$i2,$i3,$tweak($sp)	# "re-seat" the tweak value
+
+	.long	0xb92e0042	# km %r4,%r2
+	brc	1,.-4		# pay attention to "partial completion"
+
+	lrvg	$s0,$tweak+0($sp)	# load the last tweak
+	lrvg	$s1,$tweak+8($sp)
+	stmg	%r0,%r3,$tweak-32(%r1)	# wipe copy of the key
+
+	nill	%r0,0xffdf		# switch back to original function code
+	la	%r1,0($key1)		# restore pointer to $key1
+	slgr	$out,$inp
+
+	llgc	$len,2*$SIZE_T-1($sp)
+	nill	$len,0x0f		# $len%=16
+	br	$ra
+
+.align	16
+.Lxts_km_vanilla:
+___
+$code.=<<___;
+	# prepare and allocate stack frame at the top of 4K page
+	# with 1K reserved for eventual signal handling
+	lghi	$s0,-1024-256-16	# guarantee at least 256-bytes buffer
+	lghi	$s1,-4096
+	algr	$s0,$sp
+	lgr	$fp,$sp
+	ngr	$s0,$s1			# align at page boundary
+	slgr	$fp,$s0			# total buffer size
+	lgr	$s2,$sp
+	lghi	$s1,1024+16		# sl[g]fi is extended-immediate facility
+	slgr	$fp,$s1			# deduct reservation to get usable buffer size
+	# buffer size is at least 256 and at most 3072+256-16
+
+	la	$sp,1024($s0)		# alloca
+	nill	$fp,0xfff0		# round to 16*n
+	st${g}	$s2,0($sp)		# back-chain
+	nill	$len,0xfff0		# redundant
+	st${g}	$fp,$SIZE_T($sp)
+
+	slgr	$len,$fp
+	brc	1,.Lxts_km_go		# not zero, no borrow
+	algr	$fp,$len		# input is shorter than allocated buffer
+	lghi	$len,0
+	st${g}	$fp,$SIZE_T($sp)
+
+.Lxts_km_go:
+	lrvg	$s0,$tweak+0($s2)	# load the tweak value in little-endian
+	lrvg	$s1,$tweak+8($s2)
+
+	la	$s2,16($sp)		# vector of ascending tweak values
+	slgr	$s2,$inp
+	srlg	$s3,$fp,4
+	j	.Lxts_km_start
+
+.Lxts_km_loop:
+	la	$s2,16($sp)
+	slgr	$s2,$inp
+	srlg	$s3,$fp,4
+.Lxts_km_prepare:
+	lghi	$i1,0x87
+	srag	$i2,$s1,63		# broadcast upper bit
+	ngr	$i1,$i2			# rem
+	srlg	$i2,$s0,63		# carry bit from lower half
+	sllg	$s0,$s0,1
+	sllg	$s1,$s1,1
+	xgr	$s0,$i1
+	ogr	$s1,$i2
+.Lxts_km_start:
+	lrvgr	$i1,$s0			# flip byte order
+	lrvgr	$i2,$s1
+	stg	$i1,0($s2,$inp)
+	stg	$i2,8($s2,$inp)
+	xg	$i1,0($inp)
+	xg	$i2,8($inp)
+	stg	$i1,0($out,$inp)
+	stg	$i2,8($out,$inp)
+	la	$inp,16($inp)
+	brct	$s3,.Lxts_km_prepare
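Each pass through .Lxts_km_prepare advances the tweak by one multiplication by x in GF(2^128): shift the 128-bit value left one bit and, if the top bit fell out, fold it back in as the reduction constant 0x87 (from x^128 = x^7 + x^2 + x + 1). The tweak is kept byte-reversed so plain 64-bit shifts work; lrvgr flips it back. A sketch of the update, assuming the two little-endian 64-bit halves:

    # Sketch: one GF(2^128) doubling of the XTS tweak, matching the
    # srag/ngr/srlg/sllg/xgr/ogr sequence above.
    sub xts_double {
        my ($lo, $hi) = @_;
        my $rem   = ($hi >> 63) ? 0x87 : 0;   # broadcast top bit, mask with 0x87
        my $carry = ($lo >> 63) & 1;          # carry out of the low half
        $lo = (($lo << 1) ^ $rem)   & 0xffff_ffff_ffff_ffff;
        $hi = (($hi << 1) | $carry) & 0xffff_ffff_ffff_ffff;
        return ($lo, $hi);
    }
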
+
+	slgr	$inp,$fp		# rewind $inp
+	la	$s2,0($out,$inp)
+	lgr	$s3,$fp
+	.long	0xb92e00aa	# km $s2,$s2
+	brc	1,.-4		# pay attention to "partial completion"
+
+	la	$s2,16($sp)
+	slgr	$s2,$inp
+	srlg	$s3,$fp,4
+.Lxts_km_xor:
+	lg	$i1,0($out,$inp)
+	lg	$i2,8($out,$inp)
+	xg	$i1,0($s2,$inp)
+	xg	$i2,8($s2,$inp)
+	stg	$i1,0($out,$inp)
+	stg	$i2,8($out,$inp)
+	la	$inp,16($inp)
+	brct	$s3,.Lxts_km_xor
+
+	slgr	$len,$fp
+	brc	1,.Lxts_km_loop		# not zero, no borrow
+	algr	$fp,$len
+	lghi	$len,0
+	brc	4+1,.Lxts_km_loop	# not zero
+
+	l${g}	$i1,0($sp)		# back-chain
+	llgf	$fp,`2*$SIZE_T-4`($sp)	# bytes used
+	la	$i2,16($sp)
+	srlg	$fp,$fp,4
+.Lxts_km_zap:
+	stg	$i1,0($i2)
+	stg	$i1,8($i2)
+	la	$i2,16($i2)
+	brct	$fp,.Lxts_km_zap
+
+	la	$sp,0($i1)
+	llgc	$len,2*$SIZE_T-1($i1)
+	nill	$len,0x0f		# $len%=16
+	bzr	$ra
+
+	# generate one more tweak...
+	lghi	$i1,0x87
+	srag	$i2,$s1,63		# broadcast upper bit
+	ngr	$i1,$i2			# rem
+	srlg	$i2,$s0,63		# carry bit from lower half
+	sllg	$s0,$s0,1
+	sllg	$s1,$s1,1
+	xgr	$s0,$i1
+	ogr	$s1,$i2
+
+	ltr	$len,$len		# clear zero flag
+	br	$ra
+.size	_s390x_xts_km,.-_s390x_xts_km
+
+.globl	AES_xts_encrypt
+.type	AES_xts_encrypt,\@function
+.align	16
+AES_xts_encrypt:
+	xgr	%r3,%r4		# flip %r3 and %r4, $out and $len
+	xgr	%r4,%r3
+	xgr	%r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+	llgfr	$len,$len
+___
+$code.=<<___;
+	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
+	srag	$len,$len,4		# formally wrong, because it expands
+					# sign byte, but who can afford asking
+					# to process more than 2^63-1 bytes?
+					# I use it, because it sets condition
+					# code...
+	bcr	8,$ra			# abort if zero (i.e. less than 16)
+___
+$code.=<<___ if (!$softonly);
+	llgf	%r0,240($key2)
+	lhi	%r1,16
+	clr	%r0,%r1
+	jl	.Lxts_enc_software
+
+	stm${g}	%r6,$s3,6*$SIZE_T($sp)
+	st${g}	$ra,14*$SIZE_T($sp)
+
+	sllg	$len,$len,4		# $len&=~15
+	slgr	$out,$inp
+
+	# generate the tweak value
+	l${g}	$s3,$stdframe($sp)	# pointer to iv
+	la	$s2,$tweak($sp)
+	lmg	$s0,$s1,0($s3)
+	lghi	$s3,16
+	stmg	$s0,$s1,0($s2)
+	la	%r1,0($key2)		# $key2 is not needed anymore
+	.long	0xb92e00aa	# km $s2,$s2, generate the tweak
+	brc	1,.-4		# can this happen?
+
+	l	%r0,240($key1)
+	la	%r1,0($key1)		# $key1 is not needed anymore
+	bras	$ra,_s390x_xts_km
+	jz	.Lxts_enc_km_done
+
+	aghi	$inp,-16		# take one step back
+	la	$i3,0($out,$inp)	# put aside real $out
+.Lxts_enc_km_steal:
+	llgc	$i1,16($inp)
+	llgc	$i2,0($out,$inp)
+	stc	$i1,0($out,$inp)
+	stc	$i2,16($out,$inp)
+	la	$inp,1($inp)
+	brct	$len,.Lxts_enc_km_steal
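The steal loop above is standard XTS ciphertext stealing for a trailing partial block: the first $len bytes of the already-written final full ciphertext block become the short output block, the corresponding plaintext tail bytes are patched in their place, and the patched block is then re-encrypted under one more tweak. A sketch of just the byte shuffle, with hypothetical scalars instead of the in-place register walk:

    # Sketch: the XTS ciphertext-stealing swap (.Lxts_enc_km_steal).
    sub xts_steal {
        my ($last_ct, $tail_pt) = @_;           # 16-byte block, $len tail bytes
        my $r = length($tail_pt);
        my $short_ct = substr($last_ct, 0, $r); # becomes the final short block
        substr($last_ct, 0, $r) = $tail_pt;     # patched block, encrypted again
        return ($last_ct, $short_ct);
    }
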
+
+	la	$s2,0($i3)
+	lghi	$s3,16
+	lrvgr	$i1,$s0			# flip byte order
+	lrvgr	$i2,$s1
+	xg	$i1,0($s2)
+	xg	$i2,8($s2)
+	stg	$i1,0($s2)
+	stg	$i2,8($s2)
+	.long	0xb92e00aa	# km $s2,$s2
+	brc	1,.-4		# can this happen?
+	lrvgr	$i1,$s0			# flip byte order
+	lrvgr	$i2,$s1
+	xg	$i1,0($i3)
+	xg	$i2,8($i3)
+	stg	$i1,0($i3)
+	stg	$i2,8($i3)
+
+.Lxts_enc_km_done:
+	l${g}	$ra,14*$SIZE_T($sp)
+	st${g}	$sp,$tweak($sp)		# wipe tweak
+	st${g}	$sp,$tweak+8($sp)
+	lm${g}	%r6,$s3,6*$SIZE_T($sp)
+	br	$ra
+.align	16
+.Lxts_enc_software:
+___
+$code.=<<___;
+	stm${g}	%r6,$ra,6*$SIZE_T($sp)
+
+	slgr	$out,$inp
+
+	xgr	$s0,$s0			# clear upper half
+	xgr	$s1,$s1
+	lrv	$s0,$stdframe+4($sp)	# load secno
+	lrv	$s1,$stdframe+0($sp)
+	xgr	$s2,$s2
+	xgr	$s3,$s3
+	stm${g}	%r2,%r5,2*$SIZE_T($sp)
+	la	$key,0($key2)
+	larl	$tbl,AES_Te
+	bras	$ra,_s390x_AES_encrypt	# generate the tweak
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	stm	$s0,$s3,$tweak($sp)	# save the tweak
+	j	.Lxts_enc_enter
+
+.align	16
+.Lxts_enc_loop:
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$s1,$s1			# flip byte order
+	lrvgr	$s3,$s3
+	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
+	stg	$s1,$tweak+0($sp)	# save the tweak
+	llgfr	$s1,$s1
+	srlg	$s2,$s3,32
+	stg	$s3,$tweak+8($sp)
+	llgfr	$s3,$s3
+	la	$inp,16($inp)		# $inp+=16
+.Lxts_enc_enter:
+	x	$s0,0($inp)		# ^=*($inp)
+	x	$s1,4($inp)
+	x	$s2,8($inp)
+	x	$s3,12($inp)
+	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_encrypt
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	x	$s0,$tweak+0($sp)	# ^=tweak
+	x	$s1,$tweak+4($sp)
+	x	$s2,$tweak+8($sp)
+	x	$s3,$tweak+12($sp)
+	st	$s0,0($out,$inp)
+	st	$s1,4($out,$inp)
+	st	$s2,8($out,$inp)
+	st	$s3,12($out,$inp)
+	brct${g}	$len,.Lxts_enc_loop
+
+	llgc	$len,`2*$SIZE_T-1`($sp)
+	nill	$len,0x0f		# $len%16
+	jz	.Lxts_enc_done
+
+	la	$i3,0($inp,$out)	# put aside real $out
+.Lxts_enc_steal:
+	llgc	%r0,16($inp)
+	llgc	%r1,0($out,$inp)
+	stc	%r0,0($out,$inp)
+	stc	%r1,16($out,$inp)
+	la	$inp,1($inp)
+	brct	$len,.Lxts_enc_steal
+	la	$out,0($i3)		# restore real $out
+
+	# generate last tweak...
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$s1,$s1			# flip byte order
+	lrvgr	$s3,$s3
+	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
+	stg	$s1,$tweak+0($sp)	# save the tweak
+	llgfr	$s1,$s1
+	srlg	$s2,$s3,32
+	stg	$s3,$tweak+8($sp)
+	llgfr	$s3,$s3
+
+	x	$s0,0($out)		# ^=*(inp)|stolen cipher-text
+	x	$s1,4($out)
+	x	$s2,8($out)
+	x	$s3,12($out)
+	st${g}	$out,4*$SIZE_T($sp)
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_encrypt
+	l${g}	$out,4*$SIZE_T($sp)
+	x	$s0,`$tweak+0`($sp)	# ^=tweak
+	x	$s1,`$tweak+4`($sp)
+	x	$s2,`$tweak+8`($sp)
+	x	$s3,`$tweak+12`($sp)
+	st	$s0,0($out)
+	st	$s1,4($out)
+	st	$s2,8($out)
+	st	$s3,12($out)
+
+.Lxts_enc_done:
+	stg	$sp,$tweak+0($sp)	# wipe tweak
+	stg	$sp,$tweak+8($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
+	br	$ra
+.size	AES_xts_encrypt,.-AES_xts_encrypt
+___
+# void AES_xts_decrypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2,u64 secno);
+#
+$code.=<<___;
+.globl	AES_xts_decrypt
+.type	AES_xts_decrypt,\@function
+.align	16
+AES_xts_decrypt:
+	xgr	%r3,%r4		# flip %r3 and %r4, $out and $len
+	xgr	%r4,%r3
+	xgr	%r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+	llgfr	$len,$len
+___
+$code.=<<___;
+	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
+	aghi	$len,-16
+	bcr	4,$ra			# abort if less than zero. formally
+					# wrong, because $len is unsigned,
+					# but who can afford asking to
+					# process more than 2^63-1 bytes?
+	tmll	$len,0x0f
+	jnz	.Lxts_dec_proceed
+	aghi	$len,16
+.Lxts_dec_proceed:
+___
+$code.=<<___ if (!$softonly);
+	llgf	%r0,240($key2)
+	lhi	%r1,16
+	clr	%r0,%r1
+	jl	.Lxts_dec_software
+
+	stm${g}	%r6,$s3,6*$SIZE_T($sp)
+	st${g}	$ra,14*$SIZE_T($sp)
+
+	nill	$len,0xfff0		# $len&=~15
+	slgr	$out,$inp
+
+	# generate the tweak value
+	l${g}	$s3,$stdframe($sp)	# pointer to iv
+	la	$s2,$tweak($sp)
+	lmg	$s0,$s1,0($s3)
+	lghi	$s3,16
+	stmg	$s0,$s1,0($s2)
+	la	%r1,0($key2)		# $key2 is not needed past this point
+	.long	0xb92e00aa	# km $s2,$s2, generate the tweak
+	brc	1,.-4		# can this happen?
+
+	l	%r0,240($key1)
+	la	%r1,0($key1)		# $key1 is not needed anymore
+
+	ltgr	$len,$len
+	jz	.Lxts_dec_km_short
+	bras	$ra,_s390x_xts_km
+	jz	.Lxts_dec_km_done
+
+	lrvgr	$s2,$s0			# make copy in reverse byte order
+	lrvgr	$s3,$s1
+	j	.Lxts_dec_km_2ndtweak
+
+.Lxts_dec_km_short:
+	llgc	$len,`2*$SIZE_T-1`($sp)
+	nill	$len,0x0f		# $len%=16
+	lrvg	$s0,$tweak+0($sp)	# load the tweak
+	lrvg	$s1,$tweak+8($sp)
+	lrvgr	$s2,$s0			# make copy in reverse byte order
+	lrvgr	$s3,$s1
+
+.Lxts_dec_km_2ndtweak:
+	lghi	$i1,0x87
+	srag	$i2,$s1,63		# broadcast upper bit
+	ngr	$i1,$i2			# rem
+	srlg	$i2,$s0,63		# carry bit from lower half
+	sllg	$s0,$s0,1
+	sllg	$s1,$s1,1
+	xgr	$s0,$i1
+	ogr	$s1,$i2
+	lrvgr	$i1,$s0			# flip byte order
+	lrvgr	$i2,$s1
+
+	xg	$i1,0($inp)
+	xg	$i2,8($inp)
+	stg	$i1,0($out,$inp)
+	stg	$i2,8($out,$inp)
+	la	$i2,0($out,$inp)
+	lghi	$i3,16
+	.long	0xb92e0066	# km $i2,$i2
+	brc	1,.-4		# can this happen?
+	lrvgr	$i1,$s0
+	lrvgr	$i2,$s1
+	xg	$i1,0($out,$inp)
+	xg	$i2,8($out,$inp)
+	stg	$i1,0($out,$inp)
+	stg	$i2,8($out,$inp)
+
+	la	$i3,0($out,$inp)	# put aside real $out
+.Lxts_dec_km_steal:
+	llgc	$i1,16($inp)
+	llgc	$i2,0($out,$inp)
+	stc	$i1,0($out,$inp)
+	stc	$i2,16($out,$inp)
+	la	$inp,1($inp)
+	brct	$len,.Lxts_dec_km_steal
+
+	lgr	$s0,$s2
+	lgr	$s1,$s3
+	xg	$s0,0($i3)
+	xg	$s1,8($i3)
+	stg	$s0,0($i3)
+	stg	$s1,8($i3)
+	la	$s0,0($i3)
+	lghi	$s1,16
+	.long	0xb92e0088	# km $s0,$s0
+	brc	1,.-4		# can this happen?
+	xg	$s2,0($i3)
+	xg	$s3,8($i3)
+	stg	$s2,0($i3)
+	stg	$s3,8($i3)
+.Lxts_dec_km_done:
+	l${g}	$ra,14*$SIZE_T($sp)
+	st${g}	$sp,$tweak($sp)		# wipe tweak
+	st${g}	$sp,$tweak+8($sp)
+	lm${g}	%r6,$s3,6*$SIZE_T($sp)
+	br	$ra
+.align	16
+.Lxts_dec_software:
+___
+$code.=<<___;
+	stm${g}	%r6,$ra,6*$SIZE_T($sp)
+
+	srlg	$len,$len,4
+	slgr	$out,$inp
+
+	xgr	$s0,$s0			# clear upper half
+	xgr	$s1,$s1
+	lrv	$s0,$stdframe+4($sp)	# load secno
+	lrv	$s1,$stdframe+0($sp)
+	xgr	$s2,$s2
+	xgr	$s3,$s3
+	stm${g}	%r2,%r5,2*$SIZE_T($sp)
+	la	$key,0($key2)
+	larl	$tbl,AES_Te
+	bras	$ra,_s390x_AES_encrypt	# generate the tweak
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	larl	$tbl,AES_Td
+	lt${g}r	$len,$len
+	stm	$s0,$s3,$tweak($sp)	# save the tweak
+	jz	.Lxts_dec_short
+	j	.Lxts_dec_enter
+
+.align	16
+.Lxts_dec_loop:
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$s1,$s1			# flip byte order
+	lrvgr	$s3,$s3
+	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
+	stg	$s1,$tweak+0($sp)	# save the tweak
+	llgfr	$s1,$s1
+	srlg	$s2,$s3,32
+	stg	$s3,$tweak+8($sp)
+	llgfr	$s3,$s3
+.Lxts_dec_enter:
+	x	$s0,0($inp)		# tweak^=*(inp)
+	x	$s1,4($inp)
+	x	$s2,8($inp)
+	x	$s3,12($inp)
+	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_decrypt
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	x	$s0,$tweak+0($sp)	# ^=tweak
+	x	$s1,$tweak+4($sp)
+	x	$s2,$tweak+8($sp)
+	x	$s3,$tweak+12($sp)
+	st	$s0,0($out,$inp)
+	st	$s1,4($out,$inp)
+	st	$s2,8($out,$inp)
+	st	$s3,12($out,$inp)
+	la	$inp,16($inp)
+	brct${g}	$len,.Lxts_dec_loop
+
+	llgc	$len,`2*$SIZE_T-1`($sp)
+	nill	$len,0x0f		# $len%16
+	jz	.Lxts_dec_done
+
+	# generate pair of tweaks...
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$i2,$s1			# flip byte order
+	lrvgr	$i3,$s3
+	stmg	$i2,$i3,$tweak($sp)	# save the 1st tweak
+	j	.Lxts_dec_2ndtweak
+
+.align	16
+.Lxts_dec_short:
+	llgc	$len,`2*$SIZE_T-1`($sp)
+	nill	$len,0x0f		# $len%16
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+.Lxts_dec_2ndtweak:
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$s1,$s1			# flip byte order
+	lrvgr	$s3,$s3
+	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
+	stg	$s1,$tweak-16+0($sp)	# save the 2nd tweak
+	llgfr	$s1,$s1
+	srlg	$s2,$s3,32
+	stg	$s3,$tweak-16+8($sp)
+	llgfr	$s3,$s3
+
+	x	$s0,0($inp)		# tweak_the_2nd^=*(inp)
+	x	$s1,4($inp)
+	x	$s2,8($inp)
+	x	$s3,12($inp)
+	stm${g}	%r2,%r3,2*$SIZE_T($sp)
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_decrypt
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	x	$s0,$tweak-16+0($sp)	# ^=tweak_the_2nd
+	x	$s1,$tweak-16+4($sp)
+	x	$s2,$tweak-16+8($sp)
+	x	$s3,$tweak-16+12($sp)
+	st	$s0,0($out,$inp)
+	st	$s1,4($out,$inp)
+	st	$s2,8($out,$inp)
+	st	$s3,12($out,$inp)
+
+	la	$i3,0($out,$inp)	# put aside real $out
+.Lxts_dec_steal:
+	llgc	%r0,16($inp)
+	llgc	%r1,0($out,$inp)
+	stc	%r0,0($out,$inp)
+	stc	%r1,16($out,$inp)
+	la	$inp,1($inp)
+	brct	$len,.Lxts_dec_steal
+	la	$out,0($i3)		# restore real $out
+
+	lm	$s0,$s3,$tweak($sp)	# load the 1st tweak
+	x	$s0,0($out)		# tweak^=*(inp)|stolen cipher-text
+	x	$s1,4($out)
+	x	$s2,8($out)
+	x	$s3,12($out)
+	st${g}	$out,4*$SIZE_T($sp)
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_decrypt
+	l${g}	$out,4*$SIZE_T($sp)
+	x	$s0,$tweak+0($sp)	# ^=tweak
+	x	$s1,$tweak+4($sp)
+	x	$s2,$tweak+8($sp)
+	x	$s3,$tweak+12($sp)
+	st	$s0,0($out)
+	st	$s1,4($out)
+	st	$s2,8($out)
+	st	$s3,12($out)
+	stg	$sp,$tweak-16+0($sp)	# wipe 2nd tweak
+	stg	$sp,$tweak-16+8($sp)
+.Lxts_dec_done:
+	stg	$sp,$tweak+0($sp)	# wipe tweak
+	stg	$sp,$tweak+8($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
+	br	$ra
+.size	AES_xts_decrypt,.-AES_xts_decrypt
 ___
 }
 $code.=<<___;
 .string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+.comm	OPENSSL_s390xcap_P,16,8
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 print $code;
+close STDOUT;	# force flush
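The final substitution deserves a note: anything wrapped in backticks inside $code is evaluated as Perl at generation time, which is how offsets like `$stdframe+$SIZE_T-4` become plain constants in the emitted assembly. A self-contained sketch (the sample instruction is illustrative):

    # Sketch: the backtick pass, reproducing the substitution above.
    my ($SIZE_T, $stdframe) = (8, 16*8 + 4*8);    # 64-bit flavour values
    my $demo = "\tcl\t%r0,\`\$stdframe+\$SIZE_T-4\`(%r15)\n";
    $demo =~ s/\`([^\`]*)\`/eval $1/gem;
    print $demo;    # "cl %r0,164(%r15)" - the constant the old code hard-coded
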