summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjsing <>2016-09-04 14:31:29 +0000
committerjsing <>2016-09-04 14:31:29 +0000
commite38c58272a121e2bc9a785ec4001bbc802d68f66 (patch)
tree492fd2a4355d8592de425463d194374bdc85aa0a
parenta9cbed3be03a99c87e2b07b16b511e65a90bf800 (diff)
downloadopenbsd-e38c58272a121e2bc9a785ec4001bbc802d68f66.tar.gz
openbsd-e38c58272a121e2bc9a785ec4001bbc802d68f66.tar.bz2
openbsd-e38c58272a121e2bc9a785ec4001bbc802d68f66.zip
Less S390.
ok deraadt@
-rw-r--r--src/lib/libcrypto/aes/asm/aes-s390x.pl2237
-rw-r--r--src/lib/libcrypto/bn/asm/s390x-gf2m.pl221
-rw-r--r--src/lib/libcrypto/bn/asm/s390x-mont.pl277
-rwxr-xr-xsrc/lib/libcrypto/bn/asm/s390x.S678
-rw-r--r--src/lib/libcrypto/camellia/camellia.c7
-rw-r--r--src/lib/libcrypto/md32_common.h16
-rw-r--r--src/lib/libcrypto/modes/asm/ghash-s390x.pl262
-rw-r--r--src/lib/libcrypto/rc4/asm/rc4-s390x.pl234
-rw-r--r--src/lib/libcrypto/s390xcap.c43
-rw-r--r--src/lib/libcrypto/s390xcpuid.S55
-rw-r--r--src/lib/libcrypto/sha/asm/sha1-s390x.pl246
-rw-r--r--src/lib/libcrypto/sha/asm/sha512-s390x.pl322
12 files changed, 2 insertions, 4596 deletions
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl
deleted file mode 100644
index 71d5b55077..0000000000
--- a/src/lib/libcrypto/aes/asm/aes-s390x.pl
+++ /dev/null
@@ -1,2237 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# AES for s390x.
11
12# April 2007.
13#
14# Software performance improvement over gcc-generated code is ~70% and
15# in absolute terms is ~73 cycles per byte processed with 128-bit key.
16# You're likely to exclaim "why so slow?" Keep in mind that z-CPUs are
17# *strictly* in-order execution and issued instruction [in this case
18# load value from memory is critical] has to complete before execution
19# flow proceeds. S-boxes are compressed to 2KB[+256B].
20#
21# As for hardware acceleration support. It's basically a "teaser," as
22# it can and should be improved in several ways. Most notably support
23# for CBC is not utilized, nor multiple blocks are ever processed.
24# Then software key schedule can be postponed till hardware support
25# detection... Performance improvement over assembler is reportedly
26# ~2.5x, but can reach >8x [naturally on larger chunks] if proper
27# support is implemented.
28
29# May 2007.
30#
31# Implement AES_set_[en|de]crypt_key. Key schedule setup is avoided
32# for 128-bit keys, if hardware support is detected.
33
34# January 2009.
35#
36# Add support for hardware AES192/256 and reschedule instructions to
37# minimize/avoid Address Generation Interlock hazard and to favour
38# dual-issue z10 pipeline. This gave ~25% improvement on z10 and
39# almost 50% on z9. The gain is smaller on z10, because being dual-
40# issue z10 makes it impossible to eliminate the interlock condition:
41# critical path is not long enough. Yet it spends ~24 cycles per byte
42# processed with 128-bit key.
43#
44# Unlike previous version hardware support detection takes place only
45# at the moment of key schedule setup, which is denoted in key->rounds.
46# This is done, because deferred key setup can't be made MT-safe, not
47# for keys longer than 128 bits.
48#
49# Add AES_cbc_encrypt, which gives incredible performance improvement,
50# it was measured to be ~6.6x. It's less than previously mentioned 8x,
51# because software implementation was optimized.
52
53# May 2010.
54#
55# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
56# performance improvement over "generic" counter mode routine relying
57# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
58# to the fact that exact throughput value depends on current stack
59# frame alignment within 4KB page. In worst case you get ~75% of the
60# maximum, but *on average* it would be as much as ~98%. Meaning that
61# worst case is unlikely, it's like hitting a ravine on a plateau.
62
63# November 2010.
64#
65# Adapt for -m31 build. If kernel supports what's called "highgprs"
66# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
67# instructions and achieve "64-bit" performance even in 31-bit legacy
68# application context. The feature is not specific to any particular
69# processor, as long as it's "z-CPU". Latter implies that the code
70# remains z/Architecture specific. On z990 it was measured to perform
71# 2x better than code generated by gcc 4.3.
72
73# December 2010.
74#
75# Add support for z196 "cipher message with counter" instruction.
76# Note however that it's disengaged, because it was measured to
77# perform ~12% worse than vanilla km-based code...
78
79# February 2011.
80#
81# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
82# instructions, which deliver ~70% improvement at 8KB block size over
83# vanilla km-based code, 37% - at most like 512-bytes block size.
84
# The first command-line argument names the ABI flavour. A flavour containing
# "31" or "32" selects 4-byte pointers and unsuffixed mnemonics; any other
# value (including none) selects 8-byte pointers and the "g" suffix that the
# templates below splice into mnemonics such as stm${g} / lm${g} / l${g}.
$flavour = shift;

($SIZE_T, $g) = ($flavour =~ /3[12]/) ? (4, "") : (8, "g");
94
# Consume arguments until one looks like a bare "name.ext" file name (word
# characters and dashes, a dot, an extension; no directory part). That
# argument becomes the output file; $output ends up false if none matches.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Send the generated assembly to that file. The three-argument form keeps the
# name from ever being parsed as part of the open mode, and a failed open now
# aborts the run instead of being silently ignored. With no output argument
# STDOUT is left exactly as the caller provided it.
if (defined($output) && $output ne "") {
    open(STDOUT, ">", $output) or die "can't open $output: $!";
}
97
# 0 keeps the hardware-assisted code paths in the output; the templates below
# emit them only "if (!$softonly)".
$softonly = 0;

# Symbolic register names used by the assembly templates. Several names share
# one register on purpose: the argument names ($inp, $out, $bits) overlay the
# temporaries, and $mask overlays $t0.
$t0 = $mask = "%r0";
$t1 = "%r1";
$t2 = $inp = "%r2";
$t3 = $out = $bits = "%r3";
($key, $i1, $i2, $i3)     = ("%r4",  "%r5",  "%r6",  "%r7");
($s0,  $s1, $s2, $s3)     = ("%r8",  "%r9",  "%r10", "%r11");
($tbl, $rounds, $ra, $sp) = ("%r12", "%r13", "%r14", "%r15");

# Frame size: sixteen pointer-sized slots plus 4*8 further bytes.
$stdframe = 4*8 + 16*$SIZE_T;
118
# Append table entries to $code. Each argument is emitted as one ".long"
# directive holding the 32-bit value twice, i.e. an 8-byte table slot; the
# round code indexes the tables with offsets scaled by 8 and adds a byte
# displacement of 0..3 to read differently aligned views of the same entry.
# Arguments are consumed left to right and processing stops at the first
# undefined one. No return value is meant to be used.
#
# The sub is declared without a prototype: it takes a list, and the former
# empty "()" prototype only went unnoticed because callers use the "&" sigil.
sub _data_word
{   my $word;
    while (defined($word = shift)) {
        $code .= sprintf(".long\t0x%08x,0x%08x\n", $word, $word);
    }
}
123
124$code=<<___;
125.text
126
127.type AES_Te,\@object
128.align 256
129AES_Te:
130___
131&_data_word(
132 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
133 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
134 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
135 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
136 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
137 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
138 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
139 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
140 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
141 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
142 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
143 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
144 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
145 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
146 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
147 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
148 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
149 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
150 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
151 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
152 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
153 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
154 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
155 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
156 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
157 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
158 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
159 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
160 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
161 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
162 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
163 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
164 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
165 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
166 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
167 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
168 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
169 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
170 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
171 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
172 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
173 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
174 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
175 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
176 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
177 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
178 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
179 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
180 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
181 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
182 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
183 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
184 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
185 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
186 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
187 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
188 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
189 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
190 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
191 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
192 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
193 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
194 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
195 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
196$code.=<<___;
197# Te4[256]
198.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
199.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
200.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
201.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
202.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
203.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
204.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
205.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
206.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
207.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
208.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
209.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
210.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
211.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
212.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
213.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
214.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
215.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
216.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
217.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
218.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
219.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
220.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
221.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
222.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
223.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
224.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
225.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
226.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
227.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
228.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
229.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
230# rcon[]
231.long 0x01000000, 0x02000000, 0x04000000, 0x08000000
232.long 0x10000000, 0x20000000, 0x40000000, 0x80000000
233.long 0x1B000000, 0x36000000, 0, 0, 0, 0, 0, 0
234.align 256
235.size AES_Te,.-AES_Te
236
237# void AES_encrypt(const unsigned char *inp, unsigned char *out,
238# const AES_KEY *key) {
239.globl AES_encrypt
240.type AES_encrypt,\@function
241AES_encrypt:
242___
# AES_encrypt, hardware fast path; emitted only when $softonly is 0.
# The emitted code loads the word at offset 240 of the key structure and
# compares it with 16. Smaller values branch to the table-driven code at
# .Lesoft. Larger values are the "km code" that AES_set_encrypt_key stores at
# that offset; the value stays in %r0 for the hand-encoded KM instruction.
# Register setup before the instruction: %r1 = key, %r2 = input (already
# there as the first argument), %r4 = output, %r3 = 16 (one block).
# "brc 1,.-4" jumps back to the instruction when that condition is raised.
# NOTE(review): presumably the partial-completion condition code - confirm
# against the KM instruction description.
243$code.=<<___ if (!$softonly);
244 l %r0,240($key)
245 lhi %r1,16
246 clr %r0,%r1
247 jl .Lesoft
248
249 la %r1,0($key)
250 #la %r2,0($inp)
251 la %r4,0($out)
252 lghi %r3,16 # single block length
253 .long 0xb92e0042 # km %r4,%r2
254 brc 1,.-4 # can this happen?
255 br %r14
256.align 64
257.Lesoft:
258___
259$code.=<<___;
260 stm${g} %r3,$ra,3*$SIZE_T($sp)
261
262 llgf $s0,0($inp)
263 llgf $s1,4($inp)
264 llgf $s2,8($inp)
265 llgf $s3,12($inp)
266
267 larl $tbl,AES_Te
268 bras $ra,_s390x_AES_encrypt
269
270 l${g} $out,3*$SIZE_T($sp)
271 st $s0,0($out)
272 st $s1,4($out)
273 st $s2,8($out)
274 st $s3,12($out)
275
276 lm${g} %r6,$ra,6*$SIZE_T($sp)
277 br $ra
278.size AES_encrypt,.-AES_encrypt
279
280.type _s390x_AES_encrypt,\@function
281.align 16
282_s390x_AES_encrypt:
283 st${g} $ra,15*$SIZE_T($sp)
284 x $s0,0($key)
285 x $s1,4($key)
286 x $s2,8($key)
287 x $s3,12($key)
288 l $rounds,240($key)
289 llill $mask,`0xff<<3`
290 aghi $rounds,-1
291 j .Lenc_loop
292.align 16
293.Lenc_loop:
294 sllg $t1,$s0,`0+3`
295 srlg $t2,$s0,`8-3`
296 srlg $t3,$s0,`16-3`
297 srl $s0,`24-3`
298 nr $s0,$mask
299 ngr $t1,$mask
300 nr $t2,$mask
301 nr $t3,$mask
302
303 srlg $i1,$s1,`16-3` # i0
304 sllg $i2,$s1,`0+3`
305 srlg $i3,$s1,`8-3`
306 srl $s1,`24-3`
307 nr $i1,$mask
308 nr $s1,$mask
309 ngr $i2,$mask
310 nr $i3,$mask
311
312 l $s0,0($s0,$tbl) # Te0[s0>>24]
313 l $t1,1($t1,$tbl) # Te3[s0>>0]
314 l $t2,2($t2,$tbl) # Te2[s0>>8]
315 l $t3,3($t3,$tbl) # Te1[s0>>16]
316
317 x $s0,3($i1,$tbl) # Te1[s1>>16]
318 l $s1,0($s1,$tbl) # Te0[s1>>24]
319 x $t2,1($i2,$tbl) # Te3[s1>>0]
320 x $t3,2($i3,$tbl) # Te2[s1>>8]
321
322 srlg $i1,$s2,`8-3` # i0
323 srlg $i2,$s2,`16-3` # i1
324 nr $i1,$mask
325 nr $i2,$mask
326 sllg $i3,$s2,`0+3`
327 srl $s2,`24-3`
328 nr $s2,$mask
329 ngr $i3,$mask
330
331 xr $s1,$t1
332 srlg $ra,$s3,`8-3` # i1
333 sllg $t1,$s3,`0+3` # i0
334 nr $ra,$mask
335 la $key,16($key)
336 ngr $t1,$mask
337
338 x $s0,2($i1,$tbl) # Te2[s2>>8]
339 x $s1,3($i2,$tbl) # Te1[s2>>16]
340 l $s2,0($s2,$tbl) # Te0[s2>>24]
341 x $t3,1($i3,$tbl) # Te3[s2>>0]
342
343 srlg $i3,$s3,`16-3` # i2
344 xr $s2,$t2
345 srl $s3,`24-3`
346 nr $i3,$mask
347 nr $s3,$mask
348
349 x $s0,0($key)
350 x $s1,4($key)
351 x $s2,8($key)
352 x $t3,12($key)
353
354 x $s0,1($t1,$tbl) # Te3[s3>>0]
355 x $s1,2($ra,$tbl) # Te2[s3>>8]
356 x $s2,3($i3,$tbl) # Te1[s3>>16]
357 l $s3,0($s3,$tbl) # Te0[s3>>24]
358 xr $s3,$t3
359
360 brct $rounds,.Lenc_loop
361 .align 16
362
363 sllg $t1,$s0,`0+3`
364 srlg $t2,$s0,`8-3`
365 ngr $t1,$mask
366 srlg $t3,$s0,`16-3`
367 srl $s0,`24-3`
368 nr $s0,$mask
369 nr $t2,$mask
370 nr $t3,$mask
371
372 srlg $i1,$s1,`16-3` # i0
373 sllg $i2,$s1,`0+3`
374 ngr $i2,$mask
375 srlg $i3,$s1,`8-3`
376 srl $s1,`24-3`
377 nr $i1,$mask
378 nr $s1,$mask
379 nr $i3,$mask
380
381 llgc $s0,2($s0,$tbl) # Te4[s0>>24]
382 llgc $t1,2($t1,$tbl) # Te4[s0>>0]
383 sll $s0,24
384 llgc $t2,2($t2,$tbl) # Te4[s0>>8]
385 llgc $t3,2($t3,$tbl) # Te4[s0>>16]
386 sll $t2,8
387 sll $t3,16
388
389 llgc $i1,2($i1,$tbl) # Te4[s1>>16]
390 llgc $s1,2($s1,$tbl) # Te4[s1>>24]
391 llgc $i2,2($i2,$tbl) # Te4[s1>>0]
392 llgc $i3,2($i3,$tbl) # Te4[s1>>8]
393 sll $i1,16
394 sll $s1,24
395 sll $i3,8
396 or $s0,$i1
397 or $s1,$t1
398 or $t2,$i2
399 or $t3,$i3
400
401 srlg $i1,$s2,`8-3` # i0
402 srlg $i2,$s2,`16-3` # i1
403 nr $i1,$mask
404 nr $i2,$mask
405 sllg $i3,$s2,`0+3`
406 srl $s2,`24-3`
407 ngr $i3,$mask
408 nr $s2,$mask
409
410 sllg $t1,$s3,`0+3` # i0
411 srlg $ra,$s3,`8-3` # i1
412 ngr $t1,$mask
413
414 llgc $i1,2($i1,$tbl) # Te4[s2>>8]
415 llgc $i2,2($i2,$tbl) # Te4[s2>>16]
416 sll $i1,8
417 llgc $s2,2($s2,$tbl) # Te4[s2>>24]
418 llgc $i3,2($i3,$tbl) # Te4[s2>>0]
419 sll $i2,16
420 nr $ra,$mask
421 sll $s2,24
422 or $s0,$i1
423 or $s1,$i2
424 or $s2,$t2
425 or $t3,$i3
426
427 srlg $i3,$s3,`16-3` # i2
428 srl $s3,`24-3`
429 nr $i3,$mask
430 nr $s3,$mask
431
432 l $t0,16($key)
433 l $t2,20($key)
434
435 llgc $i1,2($t1,$tbl) # Te4[s3>>0]
436 llgc $i2,2($ra,$tbl) # Te4[s3>>8]
437 llgc $i3,2($i3,$tbl) # Te4[s3>>16]
438 llgc $s3,2($s3,$tbl) # Te4[s3>>24]
439 sll $i2,8
440 sll $i3,16
441 sll $s3,24
442 or $s0,$i1
443 or $s1,$i2
444 or $s2,$i3
445 or $s3,$t3
446
447 l${g} $ra,15*$SIZE_T($sp)
448 xr $s0,$t0
449 xr $s1,$t2
450 x $s2,24($key)
451 x $s3,28($key)
452
453 br $ra
454.size _s390x_AES_encrypt,.-_s390x_AES_encrypt
455___
456
457$code.=<<___;
458.type AES_Td,\@object
459.align 256
460AES_Td:
461___
462&_data_word(
463 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
464 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
465 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
466 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
467 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
468 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
469 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
470 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
471 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
472 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
473 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
474 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
475 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
476 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
477 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
478 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
479 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
480 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
481 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
482 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
483 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
484 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
485 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
486 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
487 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
488 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
489 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
490 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
491 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
492 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
493 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
494 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
495 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
496 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
497 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
498 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
499 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
500 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
501 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
502 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
503 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
504 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
505 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
506 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
507 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
508 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
509 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
510 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
511 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
512 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
513 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
514 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
515 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
516 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
517 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
518 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
519 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
520 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
521 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
522 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
523 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
524 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
525 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
526 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
527$code.=<<___;
528# Td4[256]
529.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
530.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
531.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
532.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
533.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
534.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
535.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
536.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
537.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
538.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
539.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
540.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
541.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
542.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
543.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
544.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
545.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
546.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
547.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
548.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
549.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
550.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
551.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
552.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
553.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
554.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
555.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
556.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
557.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
558.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
559.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
560.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
561.size AES_Td,.-AES_Td
562
563# void AES_decrypt(const unsigned char *inp, unsigned char *out,
564# const AES_KEY *key) {
565.globl AES_decrypt
566.type AES_decrypt,\@function
567AES_decrypt:
568___
# AES_decrypt, hardware fast path; emitted only when $softonly is 0.
# Mirrors the AES_encrypt fast path: the word at offset 240 of the key
# structure is compared with 16, smaller values branch to the table-driven
# code at .Ldsoft, anything else is left in %r0 as the function code for the
# hand-encoded KM instruction. For decryption keys AES_set_decrypt_key has
# already OR-ed the 0x80 "decrypt" bit into that stored code.
# Register setup before the instruction: %r1 = key, %r2 = input (already
# there as the first argument), %r4 = output, %r3 = 16 (one block).
569$code.=<<___ if (!$softonly);
570 l %r0,240($key)
571 lhi %r1,16
572 clr %r0,%r1
573 jl .Ldsoft
574
575 la %r1,0($key)
576 #la %r2,0($inp)
577 la %r4,0($out)
578 lghi %r3,16 # single block length
579 .long 0xb92e0042 # km %r4,%r2
580 brc 1,.-4 # can this happen?
581 br %r14
582.align 64
583.Ldsoft:
584___
585$code.=<<___;
586 stm${g} %r3,$ra,3*$SIZE_T($sp)
587
588 llgf $s0,0($inp)
589 llgf $s1,4($inp)
590 llgf $s2,8($inp)
591 llgf $s3,12($inp)
592
593 larl $tbl,AES_Td
594 bras $ra,_s390x_AES_decrypt
595
596 l${g} $out,3*$SIZE_T($sp)
597 st $s0,0($out)
598 st $s1,4($out)
599 st $s2,8($out)
600 st $s3,12($out)
601
602 lm${g} %r6,$ra,6*$SIZE_T($sp)
603 br $ra
604.size AES_decrypt,.-AES_decrypt
605
606.type _s390x_AES_decrypt,\@function
607.align 16
608_s390x_AES_decrypt:
609 st${g} $ra,15*$SIZE_T($sp)
610 x $s0,0($key)
611 x $s1,4($key)
612 x $s2,8($key)
613 x $s3,12($key)
614 l $rounds,240($key)
615 llill $mask,`0xff<<3`
616 aghi $rounds,-1
617 j .Ldec_loop
618.align 16
619.Ldec_loop:
620 srlg $t1,$s0,`16-3`
621 srlg $t2,$s0,`8-3`
622 sllg $t3,$s0,`0+3`
623 srl $s0,`24-3`
624 nr $s0,$mask
625 nr $t1,$mask
626 nr $t2,$mask
627 ngr $t3,$mask
628
629 sllg $i1,$s1,`0+3` # i0
630 srlg $i2,$s1,`16-3`
631 srlg $i3,$s1,`8-3`
632 srl $s1,`24-3`
633 ngr $i1,$mask
634 nr $s1,$mask
635 nr $i2,$mask
636 nr $i3,$mask
637
638 l $s0,0($s0,$tbl) # Td0[s0>>24]
639 l $t1,3($t1,$tbl) # Td1[s0>>16]
640 l $t2,2($t2,$tbl) # Td2[s0>>8]
641 l $t3,1($t3,$tbl) # Td3[s0>>0]
642
643 x $s0,1($i1,$tbl) # Td3[s1>>0]
644 l $s1,0($s1,$tbl) # Td0[s1>>24]
645 x $t2,3($i2,$tbl) # Td1[s1>>16]
646 x $t3,2($i3,$tbl) # Td2[s1>>8]
647
648 srlg $i1,$s2,`8-3` # i0
649 sllg $i2,$s2,`0+3` # i1
650 srlg $i3,$s2,`16-3`
651 srl $s2,`24-3`
652 nr $i1,$mask
653 ngr $i2,$mask
654 nr $s2,$mask
655 nr $i3,$mask
656
657 xr $s1,$t1
658 srlg $ra,$s3,`8-3` # i1
659 srlg $t1,$s3,`16-3` # i0
660 nr $ra,$mask
661 la $key,16($key)
662 nr $t1,$mask
663
664 x $s0,2($i1,$tbl) # Td2[s2>>8]
665 x $s1,1($i2,$tbl) # Td3[s2>>0]
666 l $s2,0($s2,$tbl) # Td0[s2>>24]
667 x $t3,3($i3,$tbl) # Td1[s2>>16]
668
669 sllg $i3,$s3,`0+3` # i2
670 srl $s3,`24-3`
671 ngr $i3,$mask
672 nr $s3,$mask
673
674 xr $s2,$t2
675 x $s0,0($key)
676 x $s1,4($key)
677 x $s2,8($key)
678 x $t3,12($key)
679
680 x $s0,3($t1,$tbl) # Td1[s3>>16]
681 x $s1,2($ra,$tbl) # Td2[s3>>8]
682 x $s2,1($i3,$tbl) # Td3[s3>>0]
683 l $s3,0($s3,$tbl) # Td0[s3>>24]
684 xr $s3,$t3
685
686 brct $rounds,.Ldec_loop
687 .align 16
688
689 l $t1,`2048+0`($tbl) # prefetch Td4
690 l $t2,`2048+64`($tbl)
691 l $t3,`2048+128`($tbl)
692 l $i1,`2048+192`($tbl)
693 llill $mask,0xff
694
695 srlg $i3,$s0,24 # i0
696 srlg $t1,$s0,16
697 srlg $t2,$s0,8
698 nr $s0,$mask # i3
699 nr $t1,$mask
700
701 srlg $i1,$s1,24
702 nr $t2,$mask
703 srlg $i2,$s1,16
704 srlg $ra,$s1,8
705 nr $s1,$mask # i0
706 nr $i2,$mask
707 nr $ra,$mask
708
709 llgc $i3,2048($i3,$tbl) # Td4[s0>>24]
710 llgc $t1,2048($t1,$tbl) # Td4[s0>>16]
711 llgc $t2,2048($t2,$tbl) # Td4[s0>>8]
712 sll $t1,16
713 llgc $t3,2048($s0,$tbl) # Td4[s0>>0]
714 sllg $s0,$i3,24
715 sll $t2,8
716
717 llgc $s1,2048($s1,$tbl) # Td4[s1>>0]
718 llgc $i1,2048($i1,$tbl) # Td4[s1>>24]
719 llgc $i2,2048($i2,$tbl) # Td4[s1>>16]
720 sll $i1,24
721 llgc $i3,2048($ra,$tbl) # Td4[s1>>8]
722 sll $i2,16
723 sll $i3,8
724 or $s0,$s1
725 or $t1,$i1
726 or $t2,$i2
727 or $t3,$i3
728
729 srlg $i1,$s2,8 # i0
730 srlg $i2,$s2,24
731 srlg $i3,$s2,16
732 nr $s2,$mask # i1
733 nr $i1,$mask
734 nr $i3,$mask
735 llgc $i1,2048($i1,$tbl) # Td4[s2>>8]
736 llgc $s1,2048($s2,$tbl) # Td4[s2>>0]
737 llgc $i2,2048($i2,$tbl) # Td4[s2>>24]
738 llgc $i3,2048($i3,$tbl) # Td4[s2>>16]
739 sll $i1,8
740 sll $i2,24
741 or $s0,$i1
742 sll $i3,16
743 or $t2,$i2
744 or $t3,$i3
745
746 srlg $i1,$s3,16 # i0
747 srlg $i2,$s3,8 # i1
748 srlg $i3,$s3,24
749 nr $s3,$mask # i2
750 nr $i1,$mask
751 nr $i2,$mask
752
753 l${g} $ra,15*$SIZE_T($sp)
754 or $s1,$t1
755 l $t0,16($key)
756 l $t1,20($key)
757
758 llgc $i1,2048($i1,$tbl) # Td4[s3>>16]
759 llgc $i2,2048($i2,$tbl) # Td4[s3>>8]
760 sll $i1,16
761 llgc $s2,2048($s3,$tbl) # Td4[s3>>0]
762 llgc $s3,2048($i3,$tbl) # Td4[s3>>24]
763 sll $i2,8
764 sll $s3,24
765 or $s0,$i1
766 or $s1,$i2
767 or $s2,$t2
768 or $s3,$t3
769
770 xr $s0,$t0
771 xr $s1,$t1
772 x $s2,24($key)
773 x $s3,28($key)
774
775 br $ra
776.size _s390x_AES_decrypt,.-_s390x_AES_decrypt
777___
778
# Entry point and argument validation for AES_set_encrypt_key. The same
# address is also labelled _s390x_AES_set_encrypt_key, which is the label
# AES_set_decrypt_key calls.
# Emitted checks, in order:
#   - input pointer or key pointer equal to zero: branch to .Lminus1, which
#     returns -1 in %r2;
#   - bits equal to 128, 192 or 256: continue at .Lproceed;
#   - any other bit length: return -2 in %r2.
# Note that the C prototype quoted inside the emitted assembly says "void"
# although the code does return a status in %r2.
779$code.=<<___;
780# void AES_set_encrypt_key(const unsigned char *in, int bits,
781# AES_KEY *key) {
782.globl AES_set_encrypt_key
783.type AES_set_encrypt_key,\@function
784.align 16
785AES_set_encrypt_key:
786_s390x_AES_set_encrypt_key:
787 lghi $t0,0
788 cl${g}r $inp,$t0
789 je .Lminus1
790 cl${g}r $key,$t0
791 je .Lminus1
792
793 lghi $t0,128
794 clr $bits,$t0
795 je .Lproceed
796 lghi $t0,192
797 clr $bits,$t0
798 je .Lproceed
799 lghi $t0,256
800 clr $bits,$t0
801 je .Lproceed
802 lghi %r2,-2
803 br %r14
804
805.align 16
806.Lproceed:
807___
# AES_set_encrypt_key, hardware key setup; emitted only when $softonly is 0.
# Steps performed by the emitted code:
#   1. Derive the function code from the bit length: ((bits-128)>>6)+18,
#      giving 18, 19 or 20 in %r5.
#   2. Load the first doubleword of OPENSSL_s390xcap_P and test it with
#      mask 0x4000; if the bit is clear, fall back to .Lekey_internal.
#   3. Issue the hand-encoded instruction with function code 0 (query),
#      with the result block written at 16(%r15).
#   4. Build a single-bit mask (0x8000 in the top halfword, shifted right by
#      the function code) and AND it with the first doubleword of the query
#      result; if the bit is clear, fall back to .Lekey_internal.
#   5. Otherwise copy the raw key instead of expanding it: 16 bytes always,
#      8 more when bits >= 192, and a further 8 when bits > 192. The "je 1f"
#      reuses the condition code of the earlier "cr" against 192.
#   6. Store bits at offset 236 and the function code at offset 240 of the
#      key structure, leave a copy of the code in $t0 (AES_set_decrypt_key
#      reads it from there), and return 0 in %r2.
808$code.=<<___ if (!$softonly);
809 # convert bits to km code, [128,192,256]->[18,19,20]
810 lhi %r5,-128
811 lhi %r0,18
812 ar %r5,$bits
813 srl %r5,6
814 ar %r5,%r0
815
816 larl %r1,OPENSSL_s390xcap_P
817 lg %r0,0(%r1)
818 tmhl %r0,0x4000 # check for message-security assist
819 jz .Lekey_internal
820
821 lghi %r0,0 # query capability vector
822 la %r1,16($sp)
823 .long 0xb92f0042 # kmc %r4,%r2
824
825 llihh %r1,0x8000
826 srlg %r1,%r1,0(%r5)
827 ng %r1,16($sp)
828 jz .Lekey_internal
829
830 lmg %r0,%r1,0($inp) # just copy 128 bits...
831 stmg %r0,%r1,0($key)
832 lhi %r0,192
833 cr $bits,%r0
834 jl 1f
835 lg %r1,16($inp)
836 stg %r1,16($key)
837 je 1f
838 lg %r1,24($inp)
839 stg %r1,24($key)
8401: st $bits,236($key) # save bits [for debugging purposes]
841 lgr $t0,%r5
842 st %r5,240($key) # save km code
843 lghi %r2,0
844 br %r14
845___
846$code.=<<___;
847.align 16
848.Lekey_internal:
849 stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key
850
851 larl $tbl,AES_Te+2048
852
853 llgf $s0,0($inp)
854 llgf $s1,4($inp)
855 llgf $s2,8($inp)
856 llgf $s3,12($inp)
857 st $s0,0($key)
858 st $s1,4($key)
859 st $s2,8($key)
860 st $s3,12($key)
861 lghi $t0,128
862 cr $bits,$t0
863 jne .Lnot128
864
865 llill $mask,0xff
866 lghi $t3,0 # i=0
867 lghi $rounds,10
868 st $rounds,240($key)
869
870 llgfr $t2,$s3 # temp=rk[3]
871 srlg $i1,$s3,8
872 srlg $i2,$s3,16
873 srlg $i3,$s3,24
874 nr $t2,$mask
875 nr $i1,$mask
876 nr $i2,$mask
877
878.align 16
879.L128_loop:
880 la $t2,0($t2,$tbl)
881 la $i1,0($i1,$tbl)
882 la $i2,0($i2,$tbl)
883 la $i3,0($i3,$tbl)
884 icm $t2,2,0($t2) # Te4[rk[3]>>0]<<8
885 icm $t2,4,0($i1) # Te4[rk[3]>>8]<<16
886 icm $t2,8,0($i2) # Te4[rk[3]>>16]<<24
887 icm $t2,1,0($i3) # Te4[rk[3]>>24]
888 x $t2,256($t3,$tbl) # rcon[i]
889 xr $s0,$t2 # rk[4]=rk[0]^...
890 xr $s1,$s0 # rk[5]=rk[1]^rk[4]
891 xr $s2,$s1 # rk[6]=rk[2]^rk[5]
892 xr $s3,$s2 # rk[7]=rk[3]^rk[6]
893
894 llgfr $t2,$s3 # temp=rk[3]
895 srlg $i1,$s3,8
896 srlg $i2,$s3,16
897 nr $t2,$mask
898 nr $i1,$mask
899 srlg $i3,$s3,24
900 nr $i2,$mask
901
902 st $s0,16($key)
903 st $s1,20($key)
904 st $s2,24($key)
905 st $s3,28($key)
906 la $key,16($key) # key+=4
907 la $t3,4($t3) # i++
908 brct $rounds,.L128_loop
909 lghi $t0,10
910 lghi %r2,0
911 lm${g} %r4,%r13,4*$SIZE_T($sp)
912 br $ra
913
914.align 16
915.Lnot128:
916 llgf $t0,16($inp)
917 llgf $t1,20($inp)
918 st $t0,16($key)
919 st $t1,20($key)
920 lghi $t0,192
921 cr $bits,$t0
922 jne .Lnot192
923
924 llill $mask,0xff
925 lghi $t3,0 # i=0
926 lghi $rounds,12
927 st $rounds,240($key)
928 lghi $rounds,8
929
930 srlg $i1,$t1,8
931 srlg $i2,$t1,16
932 srlg $i3,$t1,24
933 nr $t1,$mask
934 nr $i1,$mask
935 nr $i2,$mask
936
937.align 16
938.L192_loop:
939 la $t1,0($t1,$tbl)
940 la $i1,0($i1,$tbl)
941 la $i2,0($i2,$tbl)
942 la $i3,0($i3,$tbl)
943 icm $t1,2,0($t1) # Te4[rk[5]>>0]<<8
944 icm $t1,4,0($i1) # Te4[rk[5]>>8]<<16
945 icm $t1,8,0($i2) # Te4[rk[5]>>16]<<24
946 icm $t1,1,0($i3) # Te4[rk[5]>>24]
947 x $t1,256($t3,$tbl) # rcon[i]
948 xr $s0,$t1 # rk[6]=rk[0]^...
949 xr $s1,$s0 # rk[7]=rk[1]^rk[6]
950 xr $s2,$s1 # rk[8]=rk[2]^rk[7]
951 xr $s3,$s2 # rk[9]=rk[3]^rk[8]
952
953 st $s0,24($key)
954 st $s1,28($key)
955 st $s2,32($key)
956 st $s3,36($key)
957 brct $rounds,.L192_continue
958 lghi $t0,12
959 lghi %r2,0
960 lm${g} %r4,%r13,4*$SIZE_T($sp)
961 br $ra
962
963.align 16
964.L192_continue:
965 lgr $t1,$s3
966 x $t1,16($key) # rk[10]=rk[4]^rk[9]
967 st $t1,40($key)
968 x $t1,20($key) # rk[11]=rk[5]^rk[10]
969 st $t1,44($key)
970
971 srlg $i1,$t1,8
972 srlg $i2,$t1,16
973 srlg $i3,$t1,24
974 nr $t1,$mask
975 nr $i1,$mask
976 nr $i2,$mask
977
978 la $key,24($key) # key+=6
979 la $t3,4($t3) # i++
980 j .L192_loop
981
982.align 16
983.Lnot192:
984 llgf $t0,24($inp)
985 llgf $t1,28($inp)
986 st $t0,24($key)
987 st $t1,28($key)
988 llill $mask,0xff
989 lghi $t3,0 # i=0
990 lghi $rounds,14
991 st $rounds,240($key)
992 lghi $rounds,7
993
994 srlg $i1,$t1,8
995 srlg $i2,$t1,16
996 srlg $i3,$t1,24
997 nr $t1,$mask
998 nr $i1,$mask
999 nr $i2,$mask
1000
1001.align 16
1002.L256_loop:
1003 la $t1,0($t1,$tbl)
1004 la $i1,0($i1,$tbl)
1005 la $i2,0($i2,$tbl)
1006 la $i3,0($i3,$tbl)
1007 icm $t1,2,0($t1) # Te4[rk[7]>>0]<<8
1008 icm $t1,4,0($i1) # Te4[rk[7]>>8]<<16
1009 icm $t1,8,0($i2) # Te4[rk[7]>>16]<<24
1010 icm $t1,1,0($i3) # Te4[rk[7]>>24]
1011 x $t1,256($t3,$tbl) # rcon[i]
1012 xr $s0,$t1 # rk[8]=rk[0]^...
1013 xr $s1,$s0 # rk[9]=rk[1]^rk[8]
1014 xr $s2,$s1 # rk[10]=rk[2]^rk[9]
1015 xr $s3,$s2 # rk[11]=rk[3]^rk[10]
1016 st $s0,32($key)
1017 st $s1,36($key)
1018 st $s2,40($key)
1019 st $s3,44($key)
1020 brct $rounds,.L256_continue
1021 lghi $t0,14
1022 lghi %r2,0
1023 lm${g} %r4,%r13,4*$SIZE_T($sp)
1024 br $ra
1025
1026.align 16
1027.L256_continue:
1028 lgr $t1,$s3 # temp=rk[11]
1029 srlg $i1,$s3,8
1030 srlg $i2,$s3,16
1031 srlg $i3,$s3,24
1032 nr $t1,$mask
1033 nr $i1,$mask
1034 nr $i2,$mask
1035 la $t1,0($t1,$tbl)
1036 la $i1,0($i1,$tbl)
1037 la $i2,0($i2,$tbl)
1038 la $i3,0($i3,$tbl)
1039 llgc $t1,0($t1) # Te4[rk[11]>>0]
1040 icm $t1,2,0($i1) # Te4[rk[11]>>8]<<8
1041 icm $t1,4,0($i2) # Te4[rk[11]>>16]<<16
1042 icm $t1,8,0($i3) # Te4[rk[11]>>24]<<24
1043 x $t1,16($key) # rk[12]=rk[4]^...
1044 st $t1,48($key)
1045 x $t1,20($key) # rk[13]=rk[5]^rk[12]
1046 st $t1,52($key)
1047 x $t1,24($key) # rk[14]=rk[6]^rk[13]
1048 st $t1,56($key)
1049 x $t1,28($key) # rk[15]=rk[7]^rk[14]
1050 st $t1,60($key)
1051
1052 srlg $i1,$t1,8
1053 srlg $i2,$t1,16
1054 srlg $i3,$t1,24
1055 nr $t1,$mask
1056 nr $i1,$mask
1057 nr $i2,$mask
1058
1059 la $key,32($key) # key+=8
1060 la $t3,4($t3) # i++
1061 j .L256_loop
1062
1063.Lminus1:
1064 lghi %r2,-1
1065 br $ra
1066.size AES_set_encrypt_key,.-AES_set_encrypt_key
1067
1068# void AES_set_decrypt_key(const unsigned char *in, int bits,
1069# AES_KEY *key) {
# Runs _s390x_AES_set_encrypt_key first and hands a non-zero result back
# unchanged. Otherwise the schedule is converted in place: .Linv swaps the
# 16-byte round keys end-for-end, then .Lmix rewrites (rounds-1)*4 32-bit
# words starting 16 bytes into the schedule.
1070.globl AES_set_decrypt_key
1071.type AES_set_decrypt_key,\@function
1072.align 16
1073AES_set_decrypt_key:
1074 #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
1075 st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
1076 bras $ra,_s390x_AES_set_encrypt_key
1077 #l${g} $key,4*$SIZE_T($sp)
1078 l${g} $ra,14*$SIZE_T($sp)
1079 ltgr %r2,%r2 # test the status left in %r2
1080 bnzr $ra # non-zero: return it to the caller as is
1081___
1082$code.=<<___ if (!$softonly);
1083 #l $t0,240($key)
1084 lhi $t1,16
1085 cr $t0,$t1 # value below 16: convert the schedule in software
1086 jl .Lgo
1087 oill $t0,0x80 # set "decrypt" bit
1088 st $t0,240($key) # otherwise only store the flagged value and return
1089 br $ra
1090___
1091$code.=<<___;
1092.align 16
1093.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key)
1094 la $i1,0($key) # front pointer: first round key
1095 sllg $i2,$rounds,4
1096 la $i2,0($i2,$key) # back pointer: 16*rounds bytes further
1097 srl $rounds,1 # rounds/2 swaps
1098 lghi $t1,-16 # step used to walk the back pointer down
1099
1100.align 16
1101.Linv: lmg $s0,$s1,0($i1) # swap the 16 bytes at the front and back pointers
1102 lmg $s2,$s3,0($i2)
1103 stmg $s0,$s1,0($i2)
1104 stmg $s2,$s3,0($i1)
1105 la $i1,16($i1)
1106 la $i2,0($t1,$i2)
1107 brct $rounds,.Linv
1108___
1109$mask80=$i1; # the .Linv pointers are no longer needed; reuse their registers as masks
1110$mask1b=$i2;
1111$maskfe=$i3;
1112$code.=<<___;
1113 llgf $rounds,240($key)
1114 aghi $rounds,-1
1115 sll $rounds,2 # (rounds-1)*4
1116 llilh $mask80,0x8080 # each mask is built in two 16-bit halves: llilh, then oill
1117 llilh $mask1b,0x1b1b
1118 llilh $maskfe,0xfefe
1119 oill $mask80,0x8080
1120 oill $mask1b,0x1b1b
1121 oill $maskfe,0xfefe
1122
1123.align 16
1124.Lmix: l $s0,16($key) # tp1
1125 lr $s1,$s0
1126 ngr $s1,$mask80 # isolate the top bit of every byte
1127 srlg $t1,$s1,7
1128 slr $s1,$t1 # 0x80 becomes 0x7f in bytes that had the top bit set
1129 nr $s1,$mask1b # ...which leaves 0x1b in exactly those bytes
1130 sllg $t1,$s0,1
1131 nr $t1,$maskfe # per-byte left shift by one
1132 xr $s1,$t1 # tp2
1133
1134 lr $s2,$s1
1135 ngr $s2,$mask80
1136 srlg $t1,$s2,7
1137 slr $s2,$t1
1138 nr $s2,$mask1b
1139 sllg $t1,$s1,1
1140 nr $t1,$maskfe
1141 xr $s2,$t1 # tp4
1142
1143 lr $s3,$s2
1144 ngr $s3,$mask80
1145 srlg $t1,$s3,7
1146 slr $s3,$t1
1147 nr $s3,$mask1b
1148 sllg $t1,$s2,1
1149 nr $t1,$maskfe
1150 xr $s3,$t1 # tp8
1151
1152 xr $s1,$s0 # tp2^tp1
1153 xr $s2,$s0 # tp4^tp1
1154 rll $s0,$s0,24 # = ROTATE(tp1,8)
1155 xr $s2,$s3 # ^=tp8
1156 xr $s0,$s1 # ^=tp2^tp1
1157 xr $s1,$s3 # tp2^tp1^tp8
1158 xr $s0,$s2 # ^=tp4^tp1^tp8
1159 rll $s1,$s1,8
1160 rll $s2,$s2,16
1161 xr $s0,$s1 # ^= ROTATE(tp8^tp2^tp1,24)
1162 rll $s3,$s3,24
1163 xr $s0,$s2 # ^= ROTATE(tp8^tp4^tp1,16)
1164 xr $s0,$s3 # ^= ROTATE(tp8,8)
1165
1166 st $s0,16($key) # write the transformed word back in place
1167 la $key,4($key) # advance to the next 32-bit word
1168 brct $rounds,.Lmix
1169
1170 lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
1171 lghi %r2,0
1172 br $ra
1173.size AES_set_decrypt_key,.-AES_set_decrypt_key
1174___
1175
1176########################################################################
1177# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
1178# size_t length, const AES_KEY *key,
1179# unsigned char *ivec, const int enc)
# Entry point plus the kmc-based path. The kmc path is emitted only when
# $softonly is false and is taken at run time when the word at offset 240 of
# the key is 16 or more; otherwise control goes to .Lcbc_software.
1180{
1181my $inp="%r2";
1182my $out="%r4"; # length and out are swapped
1183my $len="%r3";
1184my $key="%r5";
1185my $ivp="%r6";
1186
1187$code.=<<___;
1188.globl AES_cbc_encrypt
1189.type AES_cbc_encrypt,\@function
1190.align 16
1191AES_cbc_encrypt:
1192 xgr %r3,%r4 # flip %r3 and %r4, out and len
1193 xgr %r4,%r3
1194 xgr %r3,%r4
1195___
1196$code.=<<___ if (!$softonly);
1197 lhi %r0,16
1198 cl %r0,240($key)
1199 jh .Lcbc_software # 16 is above the word at 240($key): software path
1200
1201 lg %r0,0($ivp) # copy ivec
1202 lg %r1,8($ivp)
1203 stmg %r0,%r1,16($sp)
1204 lmg %r0,%r1,0($key) # copy key, cover 256 bit
1205 stmg %r0,%r1,32($sp)
1206 lmg %r0,%r1,16($key)
1207 stmg %r0,%r1,48($sp)
1208 l %r0,240($key) # load kmc code
1209 lghi $key,15 # res=len%16, len-=res;
1210 ngr $key,$len
1211 sl${g}r $len,$key
1212 la %r1,16($sp) # parameter block - ivec || key
1213 jz .Lkmc_truncated
1214 .long 0xb92f0042 # kmc %r4,%r2
1215 brc 1,.-4 # pay attention to "partial completion"
1216 ltr $key,$key # any residue left over?
1217 jnz .Lkmc_truncated
1218.Lkmc_done:
1219 lmg %r0,%r1,16($sp) # copy ivec to caller
1220 stg %r0,0($ivp)
1221 stg %r1,8($ivp)
1222 br $ra
1223.align 16
1224.Lkmc_truncated:
1225 ahi $key,-1 # it's the way it's encoded in mvc
1226 tmll %r0,0x80 # 0x80 bit set in the kmc code: handle the tail as decrypt
1227 jnz .Lkmc_truncated_dec
1228 lghi %r1,0 # zero a 16-byte scratch block
1229 stg %r1,16*$SIZE_T($sp)
1230 stg %r1,16*$SIZE_T+8($sp)
1231 bras %r1,1f
1232 mvc 16*$SIZE_T(1,$sp),0($inp)
12331: ex $key,0(%r1) # run the mvc above with the residue length: input tail into scratch
1234 la %r1,16($sp) # restore parameter block
1235 la $inp,16*$SIZE_T($sp)
1236 lghi $len,16 # process the zero-padded scratch as one full block
1237 .long 0xb92f0042 # kmc %r4,%r2
1238 j .Lkmc_done
1239.align 16
1240.Lkmc_truncated_dec:
1241 st${g} $out,4*$SIZE_T($sp) # park the real output pointer
1242 la $out,16*$SIZE_T($sp) # and process one block into the scratch area
1243 lghi $len,16
1244 .long 0xb92f0042 # kmc %r4,%r2
1245 l${g} $out,4*$SIZE_T($sp)
1246 bras %r1,2f
1247 mvc 0(1,$out),16*$SIZE_T($sp)
12482: ex $key,0(%r1) # copy only the residue bytes from scratch to the output
1249 j .Lkmc_done
1250.align 16
1251.Lcbc_software:
1252___
1253$code.=<<___; # table-driven CBC path: encrypt loop first, decrypt loop after .Lcbc_decrypt
1254 stm${g} $key,$ra,5*$SIZE_T($sp)
1255 lhi %r0,0
1256 cl %r0,`$stdframe+$SIZE_T-4`($sp) # presumably the enc argument passed on the stack - confirm
1257 je .Lcbc_decrypt # zero: decrypt
1258
1259 larl $tbl,AES_Te
1260
1261 llgf $s0,0($ivp) # chaining value starts as ivec
1262 llgf $s1,4($ivp)
1263 llgf $s2,8($ivp)
1264 llgf $s3,12($ivp)
1265
1266 lghi $t0,16
1267 sl${g}r $len,$t0
1268 brc 4,.Lcbc_enc_tail # if borrow
1269.Lcbc_enc_loop:
1270 stm${g} $inp,$out,2*$SIZE_T($sp)
1271 x $s0,0($inp) # chaining value ^= input block
1272 x $s1,4($inp)
1273 x $s2,8($inp)
1274 x $s3,12($inp)
1275 lgr %r4,$key
1276
1277 bras $ra,_s390x_AES_encrypt
1278
1279 lm${g} $inp,$key,2*$SIZE_T($sp)
1280 st $s0,0($out) # store the result; it stays in registers as the next chaining value
1281 st $s1,4($out)
1282 st $s2,8($out)
1283 st $s3,12($out)
1284
1285 la $inp,16($inp)
1286 la $out,16($out)
1287 lghi $t0,16
1288 lt${g}r $len,$len # nothing left?
1289 jz .Lcbc_enc_done
1290 sl${g}r $len,$t0
1291 brc 4,.Lcbc_enc_tail # if borrow
1292 j .Lcbc_enc_loop
1293.align 16
1294.Lcbc_enc_done:
1295 l${g} $ivp,6*$SIZE_T($sp) # reload the ivec pointer saved on entry
1296 st $s0,0($ivp) # hand the last output block back as ivec
1297 st $s1,4($ivp)
1298 st $s2,8($ivp)
1299 st $s3,12($ivp)
1300
1301 lm${g} %r7,$ra,7*$SIZE_T($sp)
1302 br $ra
1303
1304.align 16
1305.Lcbc_enc_tail:
1306 aghi $len,15 # len was remaining-16; now remaining-1, the length code for mvc
1307 lghi $t0,0
1308 stg $t0,16*$SIZE_T($sp) # zero a 16-byte scratch block
1309 stg $t0,16*$SIZE_T+8($sp)
1310 bras $t1,3f
1311 mvc 16*$SIZE_T(1,$sp),0($inp)
13123: ex $len,0($t1) # run the mvc above: copy the partial block into scratch
1313 lghi $len,0 # the padded block is the last one
1314 la $inp,16*$SIZE_T($sp)
1315 j .Lcbc_enc_loop
1316
1317.align 16
1318.Lcbc_decrypt:
1319 larl $tbl,AES_Td
1320
1321 lg $t0,0($ivp)
1322 lg $t1,8($ivp)
1323 stmg $t0,$t1,16*$SIZE_T($sp) # keep the running ivec in the scratch slot
1324
1325.Lcbc_dec_loop:
1326 stm${g} $inp,$out,2*$SIZE_T($sp)
1327 llgf $s0,0($inp)
1328 llgf $s1,4($inp)
1329 llgf $s2,8($inp)
1330 llgf $s3,12($inp)
1331 lgr %r4,$key
1332
1333 bras $ra,_s390x_AES_decrypt
1334
1335 lm${g} $inp,$key,2*$SIZE_T($sp)
1336 sllg $s0,$s0,32 # pack the four 32-bit results into two 64-bit registers
1337 sllg $s2,$s2,32
1338 lr $s0,$s1
1339 lr $s2,$s3
1340
1341 lg $t0,0($inp) # this input block becomes the next ivec
1342 lg $t1,8($inp)
1343 xg $s0,16*$SIZE_T($sp) # decrypted block ^ current ivec
1344 xg $s2,16*$SIZE_T+8($sp)
1345 lghi $s1,16
1346 sl${g}r $len,$s1
1347 brc 4,.Lcbc_dec_tail # if borrow
1348 brc 2,.Lcbc_dec_done # if zero
1349 stg $s0,0($out)
1350 stg $s2,8($out)
1351 stmg $t0,$t1,16*$SIZE_T($sp)
1352
1353 la $inp,16($inp)
1354 la $out,16($out)
1355 j .Lcbc_dec_loop
1356
1357.Lcbc_dec_done:
1358 stg $s0,0($out)
1359 stg $s2,8($out)
1360.Lcbc_dec_exit:
1361 lm${g} %r6,$ra,6*$SIZE_T($sp)
1362 stmg $t0,$t1,0($ivp) # return the last input block as ivec
1363
1364 br $ra
1365
1366.align 16
1367.Lcbc_dec_tail:
1368 aghi $len,15 # len was remaining-16; now remaining-1, the length code for mvc
1369 stg $s0,16*$SIZE_T($sp)
1370 stg $s2,16*$SIZE_T+8($sp)
1371 bras $s1,4f
1372 mvc 0(1,$out),16*$SIZE_T($sp)
13734: ex $len,0($s1) # run the mvc above: copy only the remaining bytes to the output
1374 j .Lcbc_dec_exit
1375.size AES_cbc_encrypt,.-AES_cbc_encrypt
1376___
1377}
1378########################################################################
1379# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
1380# size_t blocks, const AES_KEY *key,
1381# const unsigned char *ivec)
# Entry point, frame allocation for the hardware path, and a kmctr variant
# that is compiled out with "if (0)".
1382{
1383my $inp="%r2";
1384my $out="%r4"; # blocks and out are swapped
1385my $len="%r3";
1386my $key="%r5"; my $iv0="%r5";
1387my $ivp="%r6";
1388my $fp ="%r7";
1389
1390$code.=<<___;
1391.globl AES_ctr32_encrypt
1392.type AES_ctr32_encrypt,\@function
1393.align 16
1394AES_ctr32_encrypt:
1395 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1396 xgr %r4,%r3
1397 xgr %r3,%r4
1398 llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
1399___
1400$code.=<<___ if (!$softonly);
1401 l %r0,240($key)
1402 lhi %r1,16
1403 clr %r0,%r1
1404 jl .Lctr32_software # word at 240($key) below 16: software path
1405
1406 stm${g} %r6,$s3,6*$SIZE_T($sp)
1407
1408 slgr $out,$inp # keep out-inp, so 0($out,$inp) addresses the output
1409 la %r1,0($key) # %r1 is permanent copy of $key
1410 lg $iv0,0($ivp) # load ivec
1411 lg $ivp,8($ivp)
1412
1413 # prepare and allocate stack frame at the top of 4K page
1414 # with 1K reserved for eventual signal handling
1415 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1416 lghi $s1,-4096
1417 algr $s0,$sp
1418 lgr $fp,$sp
1419 ngr $s0,$s1 # align at page boundary
1420 slgr $fp,$s0 # total buffer size
1421 lgr $s2,$sp
1422 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1423 slgr $fp,$s1 # deduct reservation to get usable buffer size
1424 # buffer size is at least 256 and at most 3072+256-16
1425
1426 la $sp,1024($s0) # alloca
1427 srlg $fp,$fp,4 # convert bytes to blocks, minimum 16
1428 st${g} $s2,0($sp) # back-chain
1429 st${g} $fp,$SIZE_T($sp) # remember the buffer size in blocks
1430
1431 slgr $len,$fp
1432 brc 1,.Lctr32_hw_switch # not zero, no borrow
1433 algr $fp,$len # input is shorter than allocated buffer
1434 lghi $len,0
1435 st${g} $fp,$SIZE_T($sp)
1436
1437.Lctr32_hw_switch:
1438___
1439$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower
1440 larl $s0,OPENSSL_s390xcap_P
1441 lg $s0,8($s0)
1442 tmhh $s0,0x0004 # check for message_security-assist-4
1443 jz .Lctr32_km_loop
1444
1445 llgfr $s0,%r0
1446 lgr $s1,%r1
1447 lghi %r0,0
1448 la %r1,16($sp)
1449 .long 0xb92d2042 # kmctr %r4,%r2,%r2
1450
1451 llihh %r0,0x8000 # check if kmctr supports the function code
1452 srlg %r0,%r0,0($s0)
1453 ng %r0,16($sp)
1454 lgr %r0,$s0
1455 lgr %r1,$s1
1456 jz .Lctr32_km_loop
1457
1458####### kmctr code
1459 algr $out,$inp # restore $out
1460 lgr $s1,$len # $s1 undertakes $len
1461 j .Lctr32_kmctr_loop
1462.align 16
1463.Lctr32_kmctr_loop:
1464 la $s2,16($sp)
1465 lgr $s3,$fp
1466.Lctr32_kmctr_prepare:
1467 stg $iv0,0($s2)
1468 stg $ivp,8($s2)
1469 la $s2,16($s2)
1470 ahi $ivp,1 # 32-bit increment, preserves upper half
1471 brct $s3,.Lctr32_kmctr_prepare
1472
1473 #la $inp,0($inp) # inp
1474 sllg $len,$fp,4 # len
1475 #la $out,0($out) # out
1476 la $s2,16($sp) # iv
1477 .long 0xb92da042 # kmctr $out,$s2,$inp
1478 brc 1,.-4 # pay attention to "partial completion"
1479
1480 slgr $s1,$fp
1481 brc 1,.Lctr32_kmctr_loop # not zero, no borrow
1482 algr $fp,$s1
1483 lghi $s1,0
1484 brc 4+1,.Lctr32_kmctr_loop # not zero
1485
1486 l${g} $sp,0($sp)
1487 lm${g} %r6,$s3,6*$SIZE_T($sp)
1488 br $ra
1489.align 16
1490___
1491$code.=<<___ if (!$softonly); # hardware-only: the frame, counter registers and the branch to .Lctr32_software exist only in !$softonly builds
1492.Lctr32_km_loop:
1493 la $s2,16($sp)
1494 lgr $s3,$fp
1495.Lctr32_km_prepare:
1496 stg $iv0,0($s2) # fill the buffer with consecutive counter blocks
1497 stg $ivp,8($s2)
1498 la $s2,16($s2)
1499 ahi $ivp,1 # 32-bit increment, preserves upper half
1500 brct $s3,.Lctr32_km_prepare
1501
1502 la $s0,16($sp) # inp
1503 sllg $s1,$fp,4 # len
1504 la $s2,16($sp) # out
1505 .long 0xb92e00a8 # km %r10,%r8
1506 brc 1,.-4 # pay attention to "partial completion"
1507
1508 la $s2,16($sp)
1509 lgr $s3,$fp
1510 slgr $s2,$inp # buffer minus inp, so 0($s2,$inp) walks the encrypted counters
1511.Lctr32_km_xor:
1512 lg $s0,0($inp) # output = input ^ encrypted counter block
1513 lg $s1,8($inp)
1514 xg $s0,0($s2,$inp)
1515 xg $s1,8($s2,$inp)
1516 stg $s0,0($out,$inp)
1517 stg $s1,8($out,$inp)
1518 la $inp,16($inp)
1519 brct $s3,.Lctr32_km_xor
1520
1521 slgr $len,$fp
1522 brc 1,.Lctr32_km_loop # not zero, no borrow
1523 algr $fp,$len
1524 lghi $len,0
1525 brc 4+1,.Lctr32_km_loop # not zero
1526
1527 l${g} $s0,0($sp) # back-chain stored at allocation time
1528 l${g} $s1,$SIZE_T($sp) # block count stored next to it
1529 la $s2,16($sp)
1530.Lctr32_km_zap:
1531 stg $s0,0($s2) # overwrite the used buffer with the back-chain value
1532 stg $s0,8($s2)
1533 la $s2,16($s2)
1534 brct $s1,.Lctr32_km_zap
1535
1536 la $sp,0($s0) # release the frame
1537 lm${g} %r6,$s3,6*$SIZE_T($sp)
1538 br $ra
1539.align 16
1540.Lctr32_software:
1541___
1542$code.=<<___; # table-driven CTR path: one block per iteration
1543 stm${g} $key,$ra,5*$SIZE_T($sp)
1544 sl${g}r $inp,$out # keep inp-out, so 0($inp,$out) addresses the input
1545 larl $tbl,AES_Te
1546 llgf $t1,12($ivp) # last 32-bit word of ivec is the running counter
1547
1548.Lctr32_loop:
1549 stm${g} $inp,$out,2*$SIZE_T($sp)
1550 llgf $s0,0($ivp)
1551 llgf $s1,4($ivp)
1552 llgf $s2,8($ivp)
1553 lgr $s3,$t1 # counter block = first 12 ivec bytes plus the counter
1554 st $t1,16*$SIZE_T($sp) # keep the counter across the call
1555 lgr %r4,$key
1556
1557 bras $ra,_s390x_AES_encrypt
1558
1559 lm${g} $inp,$ivp,2*$SIZE_T($sp)
1560 llgf $t1,16*$SIZE_T($sp)
1561 x $s0,0($inp,$out) # encrypted counter block ^ input
1562 x $s1,4($inp,$out)
1563 x $s2,8($inp,$out)
1564 x $s3,12($inp,$out)
1565 stm $s0,$s3,0($out)
1566
1567 la $out,16($out)
1568 ahi $t1,1 # 32-bit increment
1569 brct $len,.Lctr32_loop
1570
1571 lm${g} %r6,$ra,6*$SIZE_T($sp)
1572 br $ra
1573.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
1574___
1575}
1576
1577########################################################################
1578# void AES_xts_encrypt(const char *inp,char *out,size_t len,
1579# const AES_KEY *key1, const AES_KEY *key2,
1580# const unsigned char iv[16]);
1581#
# The scope opened below holds the internal helper _s390x_xts_km as well as
# AES_xts_encrypt and AES_xts_decrypt. The helper is entered with the
# function code in %r0, the key pointer in %r1 and the tweak on the stack;
# it returns the length residue (len%16) and leaves the condition code zero
# when there is none.
1582{
1583my $inp="%r2";
1584my $out="%r4"; # len and out are swapped
1585my $len="%r3";
1586my $key1="%r5"; # $i1
1587my $key2="%r6"; # $i2
1588my $fp="%r7"; # $i3
1589my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
1590
1591$code.=<<___;
1592.type _s390x_xts_km,\@function
1593.align 16
1594_s390x_xts_km:
1595___
1596$code.=<<___ if(1);
1597 llgfr $s0,%r0 # put aside the function code
1598 lghi $s1,0x7f
1599 nr $s1,%r0
1600 lghi %r0,0 # query capability vector
1601 la %r1,$tweak-16($sp)
1602 .long 0xb92e0042 # km %r4,%r2
1603 llihh %r1,0x8000
1604 srlg %r1,%r1,32($s1) # check for 32+function code
1605 ng %r1,$tweak-16($sp)
1606 lgr %r0,$s0 # restore the function code
1607 la %r1,0($key1) # restore $key1
1608 jz .Lxts_km_vanilla # capability bit clear: use the plain km path
1609
1610 lmg $i2,$i3,$tweak($sp) # put aside the tweak value
1611 algr $out,$inp
1612
1613 oill %r0,32 # switch to xts function code
1614 aghi $s1,-18 #
1615 sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
1616 la %r1,$tweak-16($sp)
1617 slgr %r1,$s1 # parameter block position
1618 lmg $s0,$s3,0($key1) # load 256 bits of key material,
1619 stmg $s0,$s3,0(%r1) # and copy it to parameter block.
1620 # yes, it contains junk and overlaps
1621 # with the tweak in 128-bit case.
1622 # it's done to avoid conditional
1623 # branch.
1624 stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
1625
1626 .long 0xb92e0042 # km %r4,%r2
1627 brc 1,.-4 # pay attention to "partial completion"
1628
1629 lrvg $s0,$tweak+0($sp) # load the last tweak
1630 lrvg $s1,$tweak+8($sp)
1631 stmg %r0,%r3,$tweak-32($sp) # wipe copy of the key
1632
1633 nill %r0,0xffdf # switch back to original function code
1634 la %r1,0($key1) # restore pointer to $key1
1635 slgr $out,$inp
1636
1637 llgc $len,2*$SIZE_T-1($sp) # low byte of the length copy saved by the caller
1638 nill $len,0x0f # $len%=16
1639 br $ra # condition code comes from the nill above
1640
1641.align 16
1642.Lxts_km_vanilla:
1643___
1644$code.=<<___; # plain-km path: build a vector of tweaks, xor, km in bulk, xor again
1645 # prepare and allocate stack frame at the top of 4K page
1646 # with 1K reserved for eventual signal handling
1647 lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
1648 lghi $s1,-4096
1649 algr $s0,$sp
1650 lgr $fp,$sp
1651 ngr $s0,$s1 # align at page boundary
1652 slgr $fp,$s0 # total buffer size
1653 lgr $s2,$sp
1654 lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
1655 slgr $fp,$s1 # deduct reservation to get usable buffer size
1656 # buffer size is at least 256 and at most 3072+256-16
1657
1658 la $sp,1024($s0) # alloca
1659 nill $fp,0xfff0 # round to 16*n
1660 st${g} $s2,0($sp) # back-chain
1661 nill $len,0xfff0 # redundant
1662 st${g} $fp,$SIZE_T($sp)
1663
1664 slgr $len,$fp
1665 brc 1,.Lxts_km_go # not zero, no borrow
1666 algr $fp,$len # input is shorter than allocated buffer
1667 lghi $len,0
1668 st${g} $fp,$SIZE_T($sp)
1669
1670.Lxts_km_go:
1671 lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
1672 lrvg $s1,$tweak+8($s2)
1673
1674 la $s2,16($sp) # vector of ascending tweak values
1675 slgr $s2,$inp
1676 srlg $s3,$fp,4
1677 j .Lxts_km_start
1678
1679.Lxts_km_loop:
1680 la $s2,16($sp)
1681 slgr $s2,$inp
1682 srlg $s3,$fp,4
1683.Lxts_km_prepare:
1684 lghi $i1,0x87 # double the 128-bit tweak, folding in 0x87 when the top bit was set
1685 srag $i2,$s1,63 # broadcast upper bit
1686 ngr $i1,$i2 # rem
1687 algr $s0,$s0
1688 alcgr $s1,$s1
1689 xgr $s0,$i1
1690.Lxts_km_start:
1691 lrvgr $i1,$s0 # flip byte order
1692 lrvgr $i2,$s1
1693 stg $i1,0($s2,$inp) # remember this tweak in the vector
1694 stg $i2,8($s2,$inp)
1695 xg $i1,0($inp) # input ^ tweak goes to the output area
1696 xg $i2,8($inp)
1697 stg $i1,0($out,$inp)
1698 stg $i2,8($out,$inp)
1699 la $inp,16($inp)
1700 brct $s3,.Lxts_km_prepare
1701
1702 slgr $inp,$fp # rewind $inp
1703 la $s2,0($out,$inp)
1704 lgr $s3,$fp
1705 .long 0xb92e00aa # km $s2,$s2
1706 brc 1,.-4 # pay attention to "partial completion"
1707
1708 la $s2,16($sp)
1709 slgr $s2,$inp
1710 srlg $s3,$fp,4
1711.Lxts_km_xor:
1712 lg $i1,0($out,$inp) # second pass: output ^= tweak taken from the vector
1713 lg $i2,8($out,$inp)
1714 xg $i1,0($s2,$inp)
1715 xg $i2,8($s2,$inp)
1716 stg $i1,0($out,$inp)
1717 stg $i2,8($out,$inp)
1718 la $inp,16($inp)
1719 brct $s3,.Lxts_km_xor
1720
1721 slgr $len,$fp
1722 brc 1,.Lxts_km_loop # not zero, no borrow
1723 algr $fp,$len
1724 lghi $len,0
1725 brc 4+1,.Lxts_km_loop # not zero
1726
1727 l${g} $i1,0($sp) # back-chain
1728 llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
1729 la $i2,16($sp)
1730 srlg $fp,$fp,4
1731.Lxts_km_zap:
1732 stg $i1,0($i2) # overwrite the tweak vector with the back-chain value
1733 stg $i1,8($i2)
1734 la $i2,16($i2)
1735 brct $fp,.Lxts_km_zap
1736
1737 la $sp,0($i1) # release the frame
1738 llgc $len,2*$SIZE_T-1($i1) # low byte of the length copy saved by the caller
1739 nill $len,0x0f # $len%=16
1740 bzr $ra # no residue: return with the zero condition
1741
1742 # generate one more tweak...
1743 lghi $i1,0x87
1744 srag $i2,$s1,63 # broadcast upper bit
1745 ngr $i1,$i2 # rem
1746 algr $s0,$s0
1747 alcgr $s1,$s1
1748 xgr $s0,$i1
1749
1750 ltr $len,$len # clear zero flag
1751 br $ra
1752.size _s390x_xts_km,.-_s390x_xts_km
1753
1754.globl AES_xts_encrypt
1755.type AES_xts_encrypt,\@function
1756.align 16
1757AES_xts_encrypt:
1758 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1759 xgr %r4,%r3
1760 xgr %r3,%r4
1761___
1762$code.=<<___ if ($SIZE_T==4);
1763 llgfr $len,$len
1764___
1765$code.=<<___;
1766 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1767 srag $len,$len,4 # formally wrong, because it expands
1768 # sign byte, but who can afford asking
1769 # to process more than 2^63-1 bytes?
1770 # I use it, because it sets condition
1771 # code...
1772 bcr 8,$ra # abort if zero (i.e. less than 16)
1773___
1774$code.=<<___ if (!$softonly);
1775 llgf %r0,240($key2)
1776 lhi %r1,16
1777 clr %r0,%r1
1778 jl .Lxts_enc_software # word at 240($key2) below 16: software path
1779
1780 st${g} $ra,5*$SIZE_T($sp)
1781 stm${g} %r6,$s3,6*$SIZE_T($sp)
1782
1783 sllg $len,$len,4 # $len&=~15
1784 slgr $out,$inp # keep out-inp, so 0($out,$inp) addresses the output
1785
1786 # generate the tweak value
1787 l${g} $s3,$stdframe($sp) # pointer to iv
1788 la $s2,$tweak($sp)
1789 lmg $s0,$s1,0($s3)
1790 lghi $s3,16
1791 stmg $s0,$s1,0($s2)
1792 la %r1,0($key2) # $key2 is not needed anymore
1793 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1794 brc 1,.-4 # can this happen?
1795
1796 l %r0,240($key1)
1797 la %r1,0($key1) # $key1 is not needed anymore
1798 bras $ra,_s390x_xts_km
1799 jz .Lxts_enc_km_done # zero condition from the helper: no partial block
1800
1801 aghi $inp,-16 # take one step back
1802 la $i3,0($out,$inp) # put aside real $out
1803.Lxts_enc_km_steal:
1804 llgc $i1,16($inp) # ciphertext stealing, one byte per iteration:
1805 llgc $i2,0($out,$inp) # the tail input byte goes into the last full block,
1806 stc $i1,0($out,$inp) # the byte it displaces becomes the partial output
1807 stc $i2,16($out,$inp)
1808 la $inp,1($inp)
1809 brct $len,.Lxts_enc_km_steal
1810
1811 la $s2,0($i3)
1812 lghi $s3,16
1813 lrvgr $i1,$s0 # flip byte order
1814 lrvgr $i2,$s1
1815 xg $i1,0($s2)
1816 xg $i2,8($s2)
1817 stg $i1,0($s2)
1818 stg $i2,8($s2)
1819 .long 0xb92e00aa # km $s2,$s2
1820 brc 1,.-4 # can this happen?
1821 lrvgr $i1,$s0 # flip byte order
1822 lrvgr $i2,$s1
1823 xg $i1,0($i3)
1824 xg $i2,8($i3)
1825 stg $i1,0($i3)
1826 stg $i2,8($i3)
1827
1828.Lxts_enc_km_done:
1829 stg $sp,$tweak+0($sp) # wipe tweak
1830 stg $sp,$tweak+8($sp)
1831 l${g} $ra,5*$SIZE_T($sp)
1832 lm${g} %r6,$s3,6*$SIZE_T($sp)
1833 br $ra
1834.align 16
1835.Lxts_enc_software:
1836___
1837$code.=<<___; # table-driven XTS encryption: tweak from key2, blocks under key1, optional stealing
1838 stm${g} %r6,$ra,6*$SIZE_T($sp)
1839
1840 slgr $out,$inp # keep out-inp, so 0($out,$inp) addresses the output
1841
1842 l${g} $s3,$stdframe($sp) # ivp
1843 llgf $s0,0($s3) # load iv
1844 llgf $s1,4($s3)
1845 llgf $s2,8($s3)
1846 llgf $s3,12($s3)
1847 stm${g} %r2,%r5,2*$SIZE_T($sp)
1848 la $key,0($key2)
1849 larl $tbl,AES_Te
1850 bras $ra,_s390x_AES_encrypt # generate the tweak
1851 lm${g} %r2,%r5,2*$SIZE_T($sp)
1852 stm $s0,$s3,$tweak($sp) # save the tweak
1853 j .Lxts_enc_enter
1854
1855.align 16
1856.Lxts_enc_loop:
1857 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1858 lrvg $s3,$tweak+8($sp)
1859 lghi %r1,0x87 # double the 128-bit tweak, folding in 0x87 when the top bit was set
1860 srag %r0,$s3,63 # broadcast upper bit
1861 ngr %r1,%r0 # rem
1862 algr $s1,$s1
1863 alcgr $s3,$s3
1864 xgr $s1,%r1
1865 lrvgr $s1,$s1 # flip byte order
1866 lrvgr $s3,$s3
1867 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1868 stg $s1,$tweak+0($sp) # save the tweak
1869 llgfr $s1,$s1
1870 srlg $s2,$s3,32
1871 stg $s3,$tweak+8($sp)
1872 llgfr $s3,$s3
1873 la $inp,16($inp) # $inp+=16
1874.Lxts_enc_enter:
1875 x $s0,0($inp) # ^=*($inp)
1876 x $s1,4($inp)
1877 x $s2,8($inp)
1878 x $s3,12($inp)
1879 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
1880 la $key,0($key1)
1881 bras $ra,_s390x_AES_encrypt
1882 lm${g} %r2,%r5,2*$SIZE_T($sp)
1883 x $s0,$tweak+0($sp) # ^=tweak
1884 x $s1,$tweak+4($sp)
1885 x $s2,$tweak+8($sp)
1886 x $s3,$tweak+12($sp)
1887 st $s0,0($out,$inp)
1888 st $s1,4($out,$inp)
1889 st $s2,8($out,$inp)
1890 st $s3,12($out,$inp)
1891 brct${g} $len,.Lxts_enc_loop
1892
1893 llgc $len,`2*$SIZE_T-1`($sp) # low byte of the length copy saved on entry
1894 nill $len,0x0f # $len%16
1895 jz .Lxts_enc_done
1896
1897 la $i3,0($inp,$out) # put aside real $out
1898.Lxts_enc_steal:
1899 llgc %r0,16($inp) # ciphertext stealing, one byte per iteration:
1900 llgc %r1,0($out,$inp) # the tail input byte goes into the last full block,
1901 stc %r0,0($out,$inp) # the byte it displaces becomes the partial output
1902 stc %r1,16($out,$inp)
1903 la $inp,1($inp)
1904 brct $len,.Lxts_enc_steal
1905 la $out,0($i3) # restore real $out
1906
1907 # generate last tweak...
1908 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
1909 lrvg $s3,$tweak+8($sp)
1910 lghi %r1,0x87
1911 srag %r0,$s3,63 # broadcast upper bit
1912 ngr %r1,%r0 # rem
1913 algr $s1,$s1
1914 alcgr $s3,$s3
1915 xgr $s1,%r1
1916 lrvgr $s1,$s1 # flip byte order
1917 lrvgr $s3,$s3
1918 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
1919 stg $s1,$tweak+0($sp) # save the tweak
1920 llgfr $s1,$s1
1921 srlg $s2,$s3,32
1922 stg $s3,$tweak+8($sp)
1923 llgfr $s3,$s3
1924
1925 x $s0,0($out) # ^=*(inp)|stolen cipher-text
1926 x $s1,4($out)
1927 x $s2,8($out)
1928 x $s3,12($out)
1929 st${g} $out,4*$SIZE_T($sp)
1930 la $key,0($key1)
1931 bras $ra,_s390x_AES_encrypt
1932 l${g} $out,4*$SIZE_T($sp)
1933 x $s0,`$tweak+0`($sp) # ^=tweak
1934 x $s1,`$tweak+4`($sp)
1935 x $s2,`$tweak+8`($sp)
1936 x $s3,`$tweak+12`($sp)
1937 st $s0,0($out)
1938 st $s1,4($out)
1939 st $s2,8($out)
1940 st $s3,12($out)
1941
1942.Lxts_enc_done:
1943 stg $sp,$tweak+0($sp) # wipe tweak
1944 stg $sp,$tweak+8($sp) # ...both 8-byte halves of it
1945 lm${g} %r6,$ra,6*$SIZE_T($sp) # restore the registers saved on entry
1946 br $ra
1947.size AES_xts_encrypt,.-AES_xts_encrypt
1948___
1949# void AES_xts_decrypt(const char *inp,char *out,size_t len,
1950# const AES_KEY *key1, const AES_KEY *key2,
1951# const unsigned char iv[16]);
1952#
# Entry point plus the km-based path. Unlike the encrypt entry, one full
# block is held back whenever the length is not a multiple of 16, because
# the stealing step needs two tweaks.
1953$code.=<<___;
1954.globl AES_xts_decrypt
1955.type AES_xts_decrypt,\@function
1956.align 16
1957AES_xts_decrypt:
1958 xgr %r3,%r4 # flip %r3 and %r4, $out and $len
1959 xgr %r4,%r3
1960 xgr %r3,%r4
1961___
1962$code.=<<___ if ($SIZE_T==4);
1963 llgfr $len,$len
1964___
1965$code.=<<___;
1966 st${g} $len,1*$SIZE_T($sp) # save copy of $len
1967 aghi $len,-16
1968 bcr 4,$ra # abort if less than zero. formally
1969 # wrong, because $len is unsigned,
1970 # but who can afford asking to
1971 # process more than 2^63-1 bytes?
1972 tmll $len,0x0f
1973 jnz .Lxts_dec_proceed # residue present: keep the length reduced by 16
1974 aghi $len,16 # multiple of 16: undo the subtraction
1975.Lxts_dec_proceed:
1976___
1977$code.=<<___ if (!$softonly);
1978 llgf %r0,240($key2)
1979 lhi %r1,16
1980 clr %r0,%r1
1981 jl .Lxts_dec_software # word at 240($key2) below 16: software path
1982
1983 st${g} $ra,5*$SIZE_T($sp)
1984 stm${g} %r6,$s3,6*$SIZE_T($sp)
1985
1986 nill $len,0xfff0 # $len&=~15
1987 slgr $out,$inp # keep out-inp, so 0($out,$inp) addresses the output
1988
1989 # generate the tweak value
1990 l${g} $s3,$stdframe($sp) # pointer to iv
1991 la $s2,$tweak($sp)
1992 lmg $s0,$s1,0($s3)
1993 lghi $s3,16
1994 stmg $s0,$s1,0($s2)
1995 la %r1,0($key2) # $key2 is not needed past this point
1996 .long 0xb92e00aa # km $s2,$s2, generate the tweak
1997 brc 1,.-4 # can this happen?
1998
1999 l %r0,240($key1)
2000 la %r1,0($key1) # $key1 is not needed anymore
2001
2002 ltgr $len,$len
2003 jz .Lxts_dec_km_short # no full block to run through the helper
2004 bras $ra,_s390x_xts_km
2005 jz .Lxts_dec_km_done # zero condition from the helper: no partial block
2006
2007 lrvgr $s2,$s0 # make copy in reverse byte order
2008 lrvgr $s3,$s1
2009 j .Lxts_dec_km_2ndtweak
2010
2011.Lxts_dec_km_short:
2012 llgc $len,`2*$SIZE_T-1`($sp)
2013 nill $len,0x0f # $len%=16
2014 lrvg $s0,$tweak+0($sp) # load the tweak
2015 lrvg $s1,$tweak+8($sp)
2016 lrvgr $s2,$s0 # make copy in reverse byte order
2017 lrvgr $s3,$s1
2018
2019.Lxts_dec_km_2ndtweak:
2020 lghi $i1,0x87 # double the 128-bit tweak, folding in 0x87 when the top bit was set
2021 srag $i2,$s1,63 # broadcast upper bit
2022 ngr $i1,$i2 # rem
2023 algr $s0,$s0
2024 alcgr $s1,$s1
2025 xgr $s0,$i1
2026 lrvgr $i1,$s0 # flip byte order
2027 lrvgr $i2,$s1
2028
2029 xg $i1,0($inp)
2030 xg $i2,8($inp)
2031 stg $i1,0($out,$inp)
2032 stg $i2,8($out,$inp)
2033 la $i2,0($out,$inp)
2034 lghi $i3,16
2035 .long 0xb92e0066 # km $i2,$i2
2036 brc 1,.-4 # can this happen?
2037 lrvgr $i1,$s0
2038 lrvgr $i2,$s1
2039 xg $i1,0($out,$inp)
2040 xg $i2,8($out,$inp)
2041 stg $i1,0($out,$inp)
2042 stg $i2,8($out,$inp)
2043
2044 la $i3,0($out,$inp) # put aside real $out
2045.Lxts_dec_km_steal:
2046 llgc $i1,16($inp) # ciphertext stealing, one byte per iteration:
2047 llgc $i2,0($out,$inp) # the tail input byte goes into the block just produced,
2048 stc $i1,0($out,$inp) # the byte it displaces becomes the partial output
2049 stc $i2,16($out,$inp)
2050 la $inp,1($inp)
2051 brct $len,.Lxts_dec_km_steal
2052
2053 lgr $s0,$s2 # back to the copy of the earlier tweak
2054 lgr $s1,$s3
2055 xg $s0,0($i3)
2056 xg $s1,8($i3)
2057 stg $s0,0($i3)
2058 stg $s1,8($i3)
2059 la $s0,0($i3)
2060 lghi $s1,16
2061 .long 0xb92e0088 # km $s0,$s0
2062 brc 1,.-4 # can this happen?
2063 xg $s2,0($i3)
2064 xg $s3,8($i3)
2065 stg $s2,0($i3)
2066 stg $s3,8($i3)
2067.Lxts_dec_km_done:
2068 stg $sp,$tweak+0($sp) # wipe tweak
2069 stg $sp,$tweak+8($sp)
2070 l${g} $ra,5*$SIZE_T($sp)
2071 lm${g} %r6,$s3,6*$SIZE_T($sp)
2072 br $ra
2073.align 16
2074.Lxts_dec_software:
2075___
2076$code.=<<___; # table-driven XTS decryption: tweak via AES_Te under key2, blocks via AES_Td under key1
2077 stm${g} %r6,$ra,6*$SIZE_T($sp)
2078
2079 srlg $len,$len,4 # bytes to whole blocks
2080 slgr $out,$inp # keep out-inp, so 0($out,$inp) addresses the output
2081
2082 l${g} $s3,$stdframe($sp) # ivp
2083 llgf $s0,0($s3) # load iv
2084 llgf $s1,4($s3)
2085 llgf $s2,8($s3)
2086 llgf $s3,12($s3)
2087 stm${g} %r2,%r5,2*$SIZE_T($sp)
2088 la $key,0($key2)
2089 larl $tbl,AES_Te
2090 bras $ra,_s390x_AES_encrypt # generate the tweak
2091 lm${g} %r2,%r5,2*$SIZE_T($sp)
2092 larl $tbl,AES_Td
2093 lt${g}r $len,$len
2094 stm $s0,$s3,$tweak($sp) # save the tweak
2095 jz .Lxts_dec_short # no whole block to process before the stealing step
2096 j .Lxts_dec_enter
2097
2098.align 16
2099.Lxts_dec_loop:
2100 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2101 lrvg $s3,$tweak+8($sp)
2102 lghi %r1,0x87 # double the 128-bit tweak, folding in 0x87 when the top bit was set
2103 srag %r0,$s3,63 # broadcast upper bit
2104 ngr %r1,%r0 # rem
2105 algr $s1,$s1
2106 alcgr $s3,$s3
2107 xgr $s1,%r1
2108 lrvgr $s1,$s1 # flip byte order
2109 lrvgr $s3,$s3
2110 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2111 stg $s1,$tweak+0($sp) # save the tweak
2112 llgfr $s1,$s1
2113 srlg $s2,$s3,32
2114 stg $s3,$tweak+8($sp)
2115 llgfr $s3,$s3
2116.Lxts_dec_enter:
2117 x $s0,0($inp) # tweak^=*(inp)
2118 x $s1,4($inp)
2119 x $s2,8($inp)
2120 x $s3,12($inp)
2121 stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
2122 la $key,0($key1)
2123 bras $ra,_s390x_AES_decrypt
2124 lm${g} %r2,%r5,2*$SIZE_T($sp)
2125 x $s0,$tweak+0($sp) # ^=tweak
2126 x $s1,$tweak+4($sp)
2127 x $s2,$tweak+8($sp)
2128 x $s3,$tweak+12($sp)
2129 st $s0,0($out,$inp)
2130 st $s1,4($out,$inp)
2131 st $s2,8($out,$inp)
2132 st $s3,12($out,$inp)
2133 la $inp,16($inp)
2134 brct${g} $len,.Lxts_dec_loop
2135
2136 llgc $len,`2*$SIZE_T-1`($sp) # low byte of the length copy saved on entry
2137 nill $len,0x0f # $len%16
2138 jz .Lxts_dec_done
2139
2140 # generate pair of tweaks...
2141 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2142 lrvg $s3,$tweak+8($sp)
2143 lghi %r1,0x87
2144 srag %r0,$s3,63 # broadcast upper bit
2145 ngr %r1,%r0 # rem
2146 algr $s1,$s1
2147 alcgr $s3,$s3
2148 xgr $s1,%r1
2149 lrvgr $i2,$s1 # flip byte order
2150 lrvgr $i3,$s3
2151 stmg $i2,$i3,$tweak($sp) # save the 1st tweak
2152 j .Lxts_dec_2ndtweak
2153
2154.align 16
2155.Lxts_dec_short:
2156 llgc $len,`2*$SIZE_T-1`($sp)
2157 nill $len,0x0f # $len%16
2158 lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
2159 lrvg $s3,$tweak+8($sp)
2160.Lxts_dec_2ndtweak:
2161 lghi %r1,0x87
2162 srag %r0,$s3,63 # broadcast upper bit
2163 ngr %r1,%r0 # rem
2164 algr $s1,$s1
2165 alcgr $s3,$s3
2166 xgr $s1,%r1
2167 lrvgr $s1,$s1 # flip byte order
2168 lrvgr $s3,$s3
2169 srlg $s0,$s1,32 # smash the tweak to 4x32-bits
2170 stg $s1,$tweak-16+0($sp) # save the 2nd tweak
2171 llgfr $s1,$s1
2172 srlg $s2,$s3,32
2173 stg $s3,$tweak-16+8($sp)
2174 llgfr $s3,$s3
2175
2176 x $s0,0($inp) # tweak_the_2nd^=*(inp)
2177 x $s1,4($inp)
2178 x $s2,8($inp)
2179 x $s3,12($inp)
2180 stm${g} %r2,%r3,2*$SIZE_T($sp)
2181 la $key,0($key1)
2182 bras $ra,_s390x_AES_decrypt
2183 lm${g} %r2,%r5,2*$SIZE_T($sp)
2184 x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
2185 x $s1,$tweak-16+4($sp)
2186 x $s2,$tweak-16+8($sp)
2187 x $s3,$tweak-16+12($sp)
2188 st $s0,0($out,$inp)
2189 st $s1,4($out,$inp)
2190 st $s2,8($out,$inp)
2191 st $s3,12($out,$inp)
2192
2193 la $i3,0($out,$inp) # put aside real $out
2194.Lxts_dec_steal:
2195 llgc %r0,16($inp) # ciphertext stealing, one byte per iteration:
2196 llgc %r1,0($out,$inp) # the tail input byte goes into the block just produced,
2197 stc %r0,0($out,$inp) # the byte it displaces becomes the partial output
2198 stc %r1,16($out,$inp)
2199 la $inp,1($inp)
2200 brct $len,.Lxts_dec_steal
2201 la $out,0($i3) # restore real $out
2202
2203 lm $s0,$s3,$tweak($sp) # load the 1st tweak
2204 x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
2205 x $s1,4($out)
2206 x $s2,8($out)
2207 x $s3,12($out)
2208 st${g} $out,4*$SIZE_T($sp)
2209 la $key,0($key1)
2210 bras $ra,_s390x_AES_decrypt
2211 l${g} $out,4*$SIZE_T($sp)
2212 x $s0,$tweak+0($sp) # ^=tweak
2213 x $s1,$tweak+4($sp)
2214 x $s2,$tweak+8($sp)
2215 x $s3,$tweak+12($sp)
2216 st $s0,0($out)
2217 st $s1,4($out)
2218 st $s2,8($out)
2219 st $s3,12($out)
2220 stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
2221 stg $sp,$tweak-16+8($sp)
2222.Lxts_dec_done:
2223 stg $sp,$tweak+0($sp) # wipe tweak
2224 stg $sp,$tweak+8($sp) # ...both 8-byte halves of it
2225 lm${g} %r6,$ra,6*$SIZE_T($sp) # restore the registers saved on entry
2226 br $ra
2227.size AES_xts_decrypt,.-AES_xts_decrypt
2228___
2229}
2230$code.=<<___; # trailer: identification string and the 16-byte, 8-byte-aligned common symbol
2231.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
2232.comm OPENSSL_s390xcap_P,16,8
2233___
2234
# Replace every backquoted span in the generated text with the result of
# evaluating it as a Perl expression, then emit the text.
2235$code =~ s/\`([^\`]*)\`/eval $1/gem;
2236print $code;
2237close STDOUT; # force flush
diff --git a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl
deleted file mode 100644
index cd9f13eca2..0000000000
--- a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl
+++ /dev/null
@@ -1,221 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# May 2011
11#
12# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
13# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
14# the time being... gcc 4.3 appeared to generate poor code, therefore
15# the effort. And indeed, the module delivers 55%-90%(*) improvement
16# on heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
17# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
18# This is for 64-bit build. In 32-bit "highgprs" case improvement is
19# even higher, for example on z990 it was measured 80%-150%. ECDSA
20# sign is modest 9%-12% faster. Keep in mind that these coefficients
21# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
22# burnt in it...
23#
24# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
25# so that improvement coefficients can vary from one specific
26# setup to another.
27
28$flavour = shift; # first argument selects the ABI flavour
29
30if ($flavour =~ /3[12]/) { # flavour containing "31" or "32": 4-byte size_t, plain (non-"g") load/store mnemonics
31 $SIZE_T=4;
32 $g="";
33} else { # otherwise: 8-byte size_t, "g"-suffixed load/store mnemonics
34 $SIZE_T=8;
35 $g="g";
36}
37
38while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} # skip arguments until one looks like "name.ext"
39open STDOUT,">",$output or die "can't open $output: $!" if defined($output) && length($output); # no output name: keep writing to the inherited STDOUT
40
41$stdframe=16*$SIZE_T+4*8;
42
43$rp="%r2";
44$a1="%r3";
45$a0="%r4";
46$b1="%r5";
47$b0="%r6";
48
49$ra="%r14";
50$sp="%r15";
51
52@T=("%r0","%r1");
53@i=("%r12","%r13");
54
55($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
56($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
57
58$code.=<<___;
59.text
60
61.type _mul_1x1,\@function
62.align 16
63_mul_1x1:
64 lgr $a1,$a
65 sllg $a2,$a,1
66 sllg $a4,$a,2
67 sllg $a8,$a,3
68
69 srag $lo,$a1,63 # broadcast 63rd bit
70 nihh $a1,0x1fff
71 srag @i[0],$a2,63 # broadcast 62nd bit
72 nihh $a2,0x3fff
73 srag @i[1],$a4,63 # broadcast 61st bit
74 nihh $a4,0x7fff
75 ngr $lo,$b
76 ngr @i[0],$b
77 ngr @i[1],$b
78
79 lghi @T[0],0
80 lgr $a12,$a1
81 stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
82 xgr $a12,$a2
83 stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
84 lgr $a48,$a4
85 stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
86 xgr $a48,$a8
87 stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
88 xgr $a1,$a4
89
90 stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
91 xgr $a2,$a4
92 stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
93 xgr $a12,$a4
94 stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
95 xgr $a1,$a48
96 stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
97 xgr $a2,$a48
98
99 stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
100 xgr $a12,$a48
101 stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
102 xgr $a1,$a4
103 stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
104 xgr $a2,$a4
105 stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
106
107 xgr $a12,$a4
108 stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
109 srlg $hi,$lo,1
110 stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
111 sllg $lo,$lo,63
112 stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
113 srlg @T[0],@i[0],2
114 stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
115
116 lghi $mask,`0xf<<3`
117 sllg $a1,@i[0],62
118 sllg @i[0],$b,3
119 srlg @T[1],@i[1],3
120 ngr @i[0],$mask
121 sllg $a2,@i[1],61
122 srlg @i[1],$b,4-3
123 xgr $hi,@T[0]
124 ngr @i[1],$mask
125 xgr $lo,$a1
126 xgr $hi,@T[1]
127 xgr $lo,$a2
128
129 xg $lo,$stdframe(@i[0],$sp)
130 srlg @i[0],$b,8-3
131 ngr @i[0],$mask
132___
133for($n=1;$n<14;$n++) {
134$code.=<<___;
135 lg @T[1],$stdframe(@i[1],$sp)
136 srlg @i[1],$b,`($n+2)*4`-3
137 sllg @T[0],@T[1],`$n*4`
138 ngr @i[1],$mask
139 srlg @T[1],@T[1],`64-$n*4`
140 xgr $lo,@T[0]
141 xgr $hi,@T[1]
142___
143 push(@i,shift(@i)); push(@T,shift(@T));
144}
145$code.=<<___;
146 lg @T[1],$stdframe(@i[1],$sp)
147 sllg @T[0],@T[1],`$n*4`
148 srlg @T[1],@T[1],`64-$n*4`
149 xgr $lo,@T[0]
150 xgr $hi,@T[1]
151
152 lg @T[0],$stdframe(@i[0],$sp)
153 sllg @T[1],@T[0],`($n+1)*4`
154 srlg @T[0],@T[0],`64-($n+1)*4`
155 xgr $lo,@T[1]
156 xgr $hi,@T[0]
157
158 br $ra
159.size _mul_1x1,.-_mul_1x1
160
161.globl bn_GF2m_mul_2x2
162.type bn_GF2m_mul_2x2,\@function
163.align 16
164bn_GF2m_mul_2x2:
165 stm${g} %r3,%r15,3*$SIZE_T($sp)
166
167 lghi %r1,-$stdframe-128
168 la %r0,0($sp)
169 la $sp,0(%r1,$sp) # alloca
170 st${g} %r0,0($sp) # back chain
171___
172if ($SIZE_T==8) {
173my @r=map("%r$_",(6..9));
174$code.=<<___;
175 bras $ra,_mul_1x1 # a1·b1
176 stmg $lo,$hi,16($rp)
177
178 lg $a,`$stdframe+128+4*$SIZE_T`($sp)
179 lg $b,`$stdframe+128+6*$SIZE_T`($sp)
180 bras $ra,_mul_1x1 # a0·b0
181 stmg $lo,$hi,0($rp)
182
183 lg $a,`$stdframe+128+3*$SIZE_T`($sp)
184 lg $b,`$stdframe+128+5*$SIZE_T`($sp)
185 xg $a,`$stdframe+128+4*$SIZE_T`($sp)
186 xg $b,`$stdframe+128+6*$SIZE_T`($sp)
187 bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
188 lmg @r[0],@r[3],0($rp)
189
190 xgr $lo,$hi
191 xgr $hi,@r[1]
192 xgr $lo,@r[0]
193 xgr $hi,@r[2]
194 xgr $lo,@r[3]
195 xgr $hi,@r[3]
196 xgr $lo,$hi
197 stg $hi,16($rp)
198 stg $lo,8($rp)
199___
200} else {
201$code.=<<___;
202 sllg %r3,%r3,32
203 sllg %r5,%r5,32
204 or %r3,%r4
205 or %r5,%r6
206 bras $ra,_mul_1x1
207 rllg $lo,$lo,32
208 rllg $hi,$hi,32
209 stmg $lo,$hi,0($rp)
210___
211}
212$code.=<<___;
213 lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
214 br $ra
215.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
216.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
217___
218
219$code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace each backquoted expression in the generated text with its evaluated value
220print $code;
221close STDOUT or die "error closing STDOUT: $!"; # a failed flush would leave a truncated output file
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl
deleted file mode 100644
index 9fd64e81ee..0000000000
--- a/src/lib/libcrypto/bn/asm/s390x-mont.pl
+++ /dev/null
@@ -1,277 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# April 2007.
11#
12# Performance improvement over vanilla C code varies from 85% to 45%
13# depending on key length and benchmark. Unfortunately in this context
14# these are not very impressive results [for code that utilizes "wide"
15# 64x64=128-bit multiplication, which is not commonly available to C
16# programmers], at least hand-coded bn_asm.c replacement is known to
17# provide 30-40% better results for longest keys. Well, on a second
18# thought it's not very surprising, because z-CPUs are single-issue
19# and _strictly_ in-order execution, while bn_mul_mont is more or less
20# dependent on CPU ability to pipe-line instructions and have several
21# of them "in-flight" at the same time. I mean while other methods,
22# for example Karatsuba, aim to minimize amount of multiplications at
23# the cost of other operations increase, bn_mul_mont aims to neatly
24# "overlap" multiplications and the other operations [and on most
25# platforms even minimize the amount of the other operations, in
26# particular references to memory]. But it's possible to improve this
27# module performance by implementing dedicated squaring code-path and
28# possibly by unrolling loops...
29
30# January 2009.
31#
32# Reschedule to minimize/avoid Address Generation Interlock hazard,
33# make inner loops counter-based.
34
35# November 2010.
36#
37# Adapt for -m31 build. If kernel supports what's called "highgprs"
38# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
39# instructions and achieve "64-bit" performance even in 31-bit legacy
40# application context. The feature is not specific to any particular
41# processor, as long as it's "z-CPU". Latter implies that the code
42# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
43# is achieved by swapping words after 64-bit loads, follow _dswap-s.
44# On z990 it was measured to perform 2.6-2.2 times better than
45# compiler-generated code, less for longer keys...
46
47$flavour = shift; # first argument selects the ABI flavour
48
49if ($flavour =~ /3[12]/) { # flavour containing "31" or "32": 4-byte size_t, plain (non-"g") load/store mnemonics
50 $SIZE_T=4;
51 $g="";
52} else { # otherwise: 8-byte size_t, "g"-suffixed load/store mnemonics
53 $SIZE_T=8;
54 $g="g";
55}
56
57while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} # skip arguments until one looks like "name.ext"
58open STDOUT,">",$output or die "can't open $output: $!" if defined($output) && length($output); # no output name: keep writing to the inherited STDOUT
59
60$stdframe=16*$SIZE_T+4*8;
61
62$mn0="%r0";
63$num="%r1";
64
65# int bn_mul_mont(
66$rp="%r2"; # BN_ULONG *rp,
67$ap="%r3"; # const BN_ULONG *ap,
68$bp="%r4"; # const BN_ULONG *bp,
69$np="%r5"; # const BN_ULONG *np,
70$n0="%r6"; # const BN_ULONG *n0,
71#$num="160(%r15)" # int num);
72
73$bi="%r2"; # zaps rp
74$j="%r7";
75
76$ahi="%r8";
77$alo="%r9";
78$nhi="%r10";
79$nlo="%r11";
80$AHI="%r12";
81$NHI="%r13";
82$count="%r14";
83$sp="%r15";
84
85$code.=<<___;
86.text
87.globl bn_mul_mont
88.type bn_mul_mont,\@function
89bn_mul_mont:
90 lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
91 sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
92 la $bp,0($num,$bp)
93
94 st${g} %r2,2*$SIZE_T($sp)
95
96 cghi $num,16 #
97 lghi %r2,0 #
98 blr %r14 # if($num<16) return 0;
99___
100$code.=<<___ if ($flavour =~ /3[12]/);
101 tmll $num,4
102 bnzr %r14 # if ($num&1) return 0;
103___
104$code.=<<___ if ($flavour !~ /3[12]/);
105 cghi $num,96 #
106 bhr %r14 # if($num>96) return 0;
107___
108$code.=<<___;
109 stm${g} %r3,%r15,3*$SIZE_T($sp)
110
111 lghi $rp,-$stdframe-8 # leave room for carry bit
112 lcgr $j,$num # -$num
113 lgr %r0,$sp
114 la $rp,0($rp,$sp)
115 la $sp,0($j,$rp) # alloca
116 st${g} %r0,0($sp) # back chain
117
118 sra $num,3 # restore $num
119 la $bp,0($j,$bp) # restore $bp
120 ahi $num,-1 # adjust $num for inner loop
121 lg $n0,0($n0) # pull n0
122 _dswap $n0
123
124 lg $bi,0($bp)
125 _dswap $bi
126 lg $alo,0($ap)
127 _dswap $alo
128 mlgr $ahi,$bi # ap[0]*bp[0]
129 lgr $AHI,$ahi
130
131 lgr $mn0,$alo # "tp[0]"*n0
132 msgr $mn0,$n0
133
134 lg $nlo,0($np) #
135 _dswap $nlo
136 mlgr $nhi,$mn0 # np[0]*m1
137 algr $nlo,$alo # +="tp[0]"
138 lghi $NHI,0
139 alcgr $NHI,$nhi
140
141 la $j,8(%r0) # j=1
142 lr $count,$num
143
144.align 16
145.L1st:
146 lg $alo,0($j,$ap)
147 _dswap $alo
148 mlgr $ahi,$bi # ap[j]*bp[0]
149 algr $alo,$AHI
150 lghi $AHI,0
151 alcgr $AHI,$ahi
152
153 lg $nlo,0($j,$np)
154 _dswap $nlo
155 mlgr $nhi,$mn0 # np[j]*m1
156 algr $nlo,$NHI
157 lghi $NHI,0
158 alcgr $nhi,$NHI # +="tp[j]"
159 algr $nlo,$alo
160 alcgr $NHI,$nhi
161
162 stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
163 la $j,8($j) # j++
164 brct $count,.L1st
165
166 algr $NHI,$AHI
167 lghi $AHI,0
168 alcgr $AHI,$AHI # upmost overflow bit
169 stg $NHI,$stdframe-8($j,$sp)
170 stg $AHI,$stdframe($j,$sp)
171 la $bp,8($bp) # bp++
172
173.Louter:
174 lg $bi,0($bp) # bp[i]
175 _dswap $bi
176 lg $alo,0($ap)
177 _dswap $alo
178 mlgr $ahi,$bi # ap[0]*bp[i]
179 alg $alo,$stdframe($sp) # +=tp[0]
180 lghi $AHI,0
181 alcgr $AHI,$ahi
182
183 lgr $mn0,$alo
184 msgr $mn0,$n0 # tp[0]*n0
185
186 lg $nlo,0($np) # np[0]
187 _dswap $nlo
188 mlgr $nhi,$mn0 # np[0]*m1
189 algr $nlo,$alo # +="tp[0]"
190 lghi $NHI,0
191 alcgr $NHI,$nhi
192
193 la $j,8(%r0) # j=1
194 lr $count,$num
195
196.align 16
197.Linner:
198 lg $alo,0($j,$ap)
199 _dswap $alo
200 mlgr $ahi,$bi # ap[j]*bp[i]
201 algr $alo,$AHI
202 lghi $AHI,0
203 alcgr $ahi,$AHI
204 alg $alo,$stdframe($j,$sp)# +=tp[j]
205 alcgr $AHI,$ahi
206
207 lg $nlo,0($j,$np)
208 _dswap $nlo
209 mlgr $nhi,$mn0 # np[j]*m1
210 algr $nlo,$NHI
211 lghi $NHI,0
212 alcgr $nhi,$NHI
213 algr $nlo,$alo # +="tp[j]"
214 alcgr $NHI,$nhi
215
216 stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
217 la $j,8($j) # j++
218 brct $count,.Linner
219
220 algr $NHI,$AHI
221 lghi $AHI,0
222 alcgr $AHI,$AHI
223 alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
224 lghi $ahi,0
225 alcgr $AHI,$ahi # new upmost overflow bit
226 stg $NHI,$stdframe-8($j,$sp)
227 stg $AHI,$stdframe($j,$sp)
228
229 la $bp,8($bp) # bp++
230 cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
231 jne .Louter
232
233 l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
234 la $ap,$stdframe($sp)
235 ahi $num,1 # restore $num, incidentally clears "borrow"
236
237 la $j,0(%r0)
238 lr $count,$num
239.Lsub: lg $alo,0($j,$ap)
240 lg $nlo,0($j,$np)
241 _dswap $nlo
242 slbgr $alo,$nlo
243 stg $alo,0($j,$rp)
244 la $j,8($j)
245 brct $count,.Lsub
246 lghi $ahi,0
247 slbgr $AHI,$ahi # handle upmost carry
248
249 ngr $ap,$AHI
250 lghi $np,-1
251 xgr $np,$AHI
252 ngr $np,$rp
253 ogr $ap,$np # ap=borrow?tp:rp
254
255 la $j,0(%r0)
256 lgr $count,$num
257.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
258 _dswap $alo
259 stg $j,$stdframe($j,$sp) # zap tp
260 stg $alo,0($j,$rp)
261 la $j,8($j)
262 brct $count,.Lcopy
263
264 la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
265 lm${g} %r6,%r15,0(%r1)
266 lghi %r2,1 # signal "processed"
267 br %r14
268.size bn_mul_mont,.-bn_mul_mont
269.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
270___
271
272foreach (split("\n",$code)) { # post-process the generated text one line at a time
273 s/\`([^\`]*)\`/eval $1/ge; # replace backquoted expressions with their evaluated value
274 s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e; # _dswap pseudo-op: 32-bit rotate when SIZE_T==4, otherwise replaced by an empty string
275 print $_,"\n";
276}
277close STDOUT or die "error closing STDOUT: $!"; # a failed flush would leave a truncated output file
diff --git a/src/lib/libcrypto/bn/asm/s390x.S b/src/lib/libcrypto/bn/asm/s390x.S
deleted file mode 100755
index 43fcb79bc0..0000000000
--- a/src/lib/libcrypto/bn/asm/s390x.S
+++ /dev/null
@@ -1,678 +0,0 @@
1.ident "s390x.S, version 1.1"
2// ====================================================================
3// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
4// project.
5//
6// Rights for redistribution and usage in source and binary forms are
7// granted according to the OpenSSL license. Warranty of any kind is
8// disclaimed.
9// ====================================================================
10
11.text
12
13#define zero %r0
14
15// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
16.globl bn_mul_add_words
17.type bn_mul_add_words,@function
18.align 4
19bn_mul_add_words:
20 lghi zero,0 // zero = 0
21 la %r1,0(%r2) // put rp aside
22 lghi %r2,0 // i=0;
23 ltgfr %r4,%r4
24 bler %r14 // if (len<=0) return 0;
25
26 stmg %r6,%r10,48(%r15)
27 lghi %r10,3
28 lghi %r8,0 // carry = 0
29 nr %r10,%r4 // len%4
30 sra %r4,2 // cnt=len/4
31 jz .Loop1_madd // carry is incidentally cleared if branch taken
32 algr zero,zero // clear carry
33
34.Loop4_madd:
35 lg %r7,0(%r2,%r3) // ap[i]
36 mlgr %r6,%r5 // *=w
37 alcgr %r7,%r8 // +=carry
38 alcgr %r6,zero
39 alg %r7,0(%r2,%r1) // +=rp[i]
40 stg %r7,0(%r2,%r1) // rp[i]=
41
42 lg %r9,8(%r2,%r3)
43 mlgr %r8,%r5
44 alcgr %r9,%r6
45 alcgr %r8,zero
46 alg %r9,8(%r2,%r1)
47 stg %r9,8(%r2,%r1)
48
49 lg %r7,16(%r2,%r3)
50 mlgr %r6,%r5
51 alcgr %r7,%r8
52 alcgr %r6,zero
53 alg %r7,16(%r2,%r1)
54 stg %r7,16(%r2,%r1)
55
56 lg %r9,24(%r2,%r3)
57 mlgr %r8,%r5
58 alcgr %r9,%r6
59 alcgr %r8,zero
60 alg %r9,24(%r2,%r1)
61 stg %r9,24(%r2,%r1)
62
63 la %r2,32(%r2) // i+=4
64 brct %r4,.Loop4_madd
65
66 la %r10,1(%r10) // see if len%4 is zero ...
67 brct %r10,.Loop1_madd // without touching condition code:-)
68
69.Lend_madd:
70 alcgr %r8,zero // collect carry bit
71 lgr %r2,%r8
72 lmg %r6,%r10,48(%r15)
73 br %r14
74
75.Loop1_madd:
76 lg %r7,0(%r2,%r3) // ap[i]
77 mlgr %r6,%r5 // *=w
78 alcgr %r7,%r8 // +=carry
79 alcgr %r6,zero
80 alg %r7,0(%r2,%r1) // +=rp[i]
81 stg %r7,0(%r2,%r1) // rp[i]=
82
83 lgr %r8,%r6
84 la %r2,8(%r2) // i++
85 brct %r10,.Loop1_madd
86
87 j .Lend_madd
88.size bn_mul_add_words,.-bn_mul_add_words
89
90// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
91.globl bn_mul_words
92.type bn_mul_words,@function
93.align 4
94bn_mul_words:
95 lghi zero,0 // zero = 0
96 la %r1,0(%r2) // put rp aside
97 lghi %r2,0 // i=0;
98 ltgfr %r4,%r4
99 bler %r14 // if (len<=0) return 0;
100
101 stmg %r6,%r10,48(%r15)
102 lghi %r10,3
103 lghi %r8,0 // carry = 0
104 nr %r10,%r4 // len%4
105 sra %r4,2 // cnt=len/4
106 jz .Loop1_mul // carry is incidentally cleared if branch taken
107 algr zero,zero // clear carry
108
109.Loop4_mul:
110 lg %r7,0(%r2,%r3) // ap[i]
111 mlgr %r6,%r5 // *=w
112 alcgr %r7,%r8 // +=carry
113 stg %r7,0(%r2,%r1) // rp[i]=
114
115 lg %r9,8(%r2,%r3)
116 mlgr %r8,%r5
117 alcgr %r9,%r6
118 stg %r9,8(%r2,%r1)
119
120 lg %r7,16(%r2,%r3)
121 mlgr %r6,%r5
122 alcgr %r7,%r8
123 stg %r7,16(%r2,%r1)
124
125 lg %r9,24(%r2,%r3)
126 mlgr %r8,%r5
127 alcgr %r9,%r6
128 stg %r9,24(%r2,%r1)
129
130 la %r2,32(%r2) // i+=4
131 brct %r4,.Loop4_mul
132
133 la %r10,1(%r10) // see if len%4 is zero ...
134 brct %r10,.Loop1_mul // without touching condition code:-)
135
136.Lend_mul:
137 alcgr %r8,zero // collect carry bit
138 lgr %r2,%r8
139 lmg %r6,%r10,48(%r15)
140 br %r14
141
142.Loop1_mul:
143 lg %r7,0(%r2,%r3) // ap[i]
144 mlgr %r6,%r5 // *=w
145 alcgr %r7,%r8 // +=carry
146 stg %r7,0(%r2,%r1) // rp[i]=
147
148 lgr %r8,%r6
149 la %r2,8(%r2) // i++
150 brct %r10,.Loop1_mul
151
152 j .Lend_mul
153.size bn_mul_words,.-bn_mul_words
154
155// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4)
156.globl bn_sqr_words
157.type bn_sqr_words,@function
158.align 4
159bn_sqr_words:
160 ltgfr %r4,%r4 // sign-extend the int count and set the condition code
161 bler %r14 // if (len<=0) return;
162
163 stmg %r6,%r7,48(%r15) // save %r6/%r7, used below as the product register pair
164 srag %r1,%r4,2 // cnt=len/4
165 jz .Loop1_sqr // fewer than four words: only the one-word loop runs
166
167.Loop4_sqr: // four input words per iteration, two output words each
168 lg %r7,0(%r3) // load input word 0
169 mlgr %r6,%r7 // square it; NOTE(review): per z/Architecture MLGR the 128-bit product is %r6:%r7 (high:low) - confirm
170 stg %r7,0(%r2) // store %r7 at output word 0
171 stg %r6,8(%r2) // store %r6 at output word 1
172
173 lg %r7,8(%r3) // input word 1
174 mlgr %r6,%r7
175 stg %r7,16(%r2)
176 stg %r6,24(%r2)
177
178 lg %r7,16(%r3) // input word 2
179 mlgr %r6,%r7
180 stg %r7,32(%r2)
181 stg %r6,40(%r2)
182
183 lg %r7,24(%r3) // input word 3
184 mlgr %r6,%r7
185 stg %r7,48(%r2)
186 stg %r6,56(%r2)
187
188 la %r3,32(%r3) // advance input pointer by 4 words
189 la %r2,64(%r2) // advance output pointer by 8 words
190 brct %r1,.Loop4_sqr // loop while --cnt != 0
191
192 lghi %r1,3
193 nr %r4,%r1 // cnt=len%4
194 jz .Lend_sqr // no leftover words
195
196.Loop1_sqr: // one input word per iteration
197 lg %r7,0(%r3)
198 mlgr %r6,%r7
199 stg %r7,0(%r2)
200 stg %r6,8(%r2)
201
202 la %r3,8(%r3) // next input word
203 la %r2,16(%r2) // next pair of output words
204 brct %r4,.Loop1_sqr // loop while --cnt != 0
205
206.Lend_sqr:
207 lmg %r6,%r7,48(%r15) // restore the registers saved on entry
208 br %r14
209.size bn_sqr_words,.-bn_sqr_words
210
211// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
212.globl bn_div_words
213.type bn_div_words,@function
214.align 4
215bn_div_words:
216 dlgr %r2,%r4 // divide the pair %r2:%r3 (h:l) by %r4 (d); NOTE(review): per z/Architecture DLGR the remainder ends up in %r2 and the quotient in %r3 - confirm
217 lgr %r2,%r3 // return value is taken from %r3 (presumably the quotient)
218 br %r14
219.size bn_div_words,.-bn_div_words
220
221// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
222.globl bn_add_words
223.type bn_add_words,@function
224.align 4
225bn_add_words:
226 la %r1,0(%r2) // put rp aside
227 lghi %r2,0 // i=0
228 ltgfr %r5,%r5
229 bler %r14 // if (len<=0) return 0;
230
231 stg %r6,48(%r15)
232 lghi %r6,3
233 nr %r6,%r5 // len%4
234 sra %r5,2 // len/4, use sra because it sets condition code
235 jz .Loop1_add // carry is incidentally cleared if branch taken
236 algr %r2,%r2 // clear carry
237
238.Loop4_add:
239 lg %r0,0(%r2,%r3)
240 alcg %r0,0(%r2,%r4)
241 stg %r0,0(%r2,%r1)
242 lg %r0,8(%r2,%r3)
243 alcg %r0,8(%r2,%r4)
244 stg %r0,8(%r2,%r1)
245 lg %r0,16(%r2,%r3)
246 alcg %r0,16(%r2,%r4)
247 stg %r0,16(%r2,%r1)
248 lg %r0,24(%r2,%r3)
249 alcg %r0,24(%r2,%r4)
250 stg %r0,24(%r2,%r1)
251
252 la %r2,32(%r2) // i+=4
253 brct %r5,.Loop4_add
254
255 la %r6,1(%r6) // see if len%4 is zero ...
256 brct %r6,.Loop1_add // without touching condition code:-)
257
258.Lexit_add:
259 lghi %r2,0
260 alcgr %r2,%r2
261 lg %r6,48(%r15)
262 br %r14
263
264.Loop1_add:
265 lg %r0,0(%r2,%r3)
266 alcg %r0,0(%r2,%r4)
267 stg %r0,0(%r2,%r1)
268
269 la %r2,8(%r2) // i++
270 brct %r6,.Loop1_add
271
272 j .Lexit_add
273.size bn_add_words,.-bn_add_words
274
275// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
276.globl bn_sub_words
277.type bn_sub_words,@function
278.align 4
279bn_sub_words:
280 la %r1,0(%r2) // put rp aside
281 lghi %r2,0 // i=0
282 ltgfr %r5,%r5
283 bler %r14 // if (len<=0) return 0;
284
285 stg %r6,48(%r15)
286 lghi %r6,3
287 nr %r6,%r5 // len%4
288 sra %r5,2 // len/4, use sra because it sets condition code
289 jnz .Loop4_sub // borrow is incidentally cleared if branch taken
290 slgr %r2,%r2 // clear borrow
291
292.Loop1_sub:
293 lg %r0,0(%r2,%r3)
294 slbg %r0,0(%r2,%r4)
295 stg %r0,0(%r2,%r1)
296
297 la %r2,8(%r2) // i++
298 brct %r6,.Loop1_sub
299 j .Lexit_sub
300
301.Loop4_sub:
302 lg %r0,0(%r2,%r3)
303 slbg %r0,0(%r2,%r4)
304 stg %r0,0(%r2,%r1)
305 lg %r0,8(%r2,%r3)
306 slbg %r0,8(%r2,%r4)
307 stg %r0,8(%r2,%r1)
308 lg %r0,16(%r2,%r3)
309 slbg %r0,16(%r2,%r4)
310 stg %r0,16(%r2,%r1)
311 lg %r0,24(%r2,%r3)
312 slbg %r0,24(%r2,%r4)
313 stg %r0,24(%r2,%r1)
314
315 la %r2,32(%r2) // i+=4
316 brct %r5,.Loop4_sub
317
318 la %r6,1(%r6) // see if len%4 is zero ...
319 brct %r6,.Loop1_sub // without touching condition code:-)
320
321.Lexit_sub:
322 lghi %r2,0
323 slbgr %r2,%r2
324 lcgr %r2,%r2
325 lg %r6,48(%r15)
326 br %r14
327.size bn_sub_words,.-bn_sub_words
328
329#define c1 %r1
330#define c2 %r5
331#define c3 %r8
332
333#define mul_add_c(ai,bi,c1,c2,c3) \
334 lg %r7,ai*8(%r3); \
335 mlg %r6,bi*8(%r4); \
336 algr c1,%r7; \
337 alcgr c2,%r6; \
338 alcgr c3,zero
339
340// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
341.globl bn_mul_comba8
342.type bn_mul_comba8,@function
343.align 4
344bn_mul_comba8:
345 stmg %r6,%r8,48(%r15)
346
347 lghi c1,0
348 lghi c2,0
349 lghi c3,0
350 lghi zero,0
351
352 mul_add_c(0,0,c1,c2,c3);
353 stg c1,0*8(%r2)
354 lghi c1,0
355
356 mul_add_c(0,1,c2,c3,c1);
357 mul_add_c(1,0,c2,c3,c1);
358 stg c2,1*8(%r2)
359 lghi c2,0
360
361 mul_add_c(2,0,c3,c1,c2);
362 mul_add_c(1,1,c3,c1,c2);
363 mul_add_c(0,2,c3,c1,c2);
364 stg c3,2*8(%r2)
365 lghi c3,0
366
367 mul_add_c(0,3,c1,c2,c3);
368 mul_add_c(1,2,c1,c2,c3);
369 mul_add_c(2,1,c1,c2,c3);
370 mul_add_c(3,0,c1,c2,c3);
371 stg c1,3*8(%r2)
372 lghi c1,0
373
374 mul_add_c(4,0,c2,c3,c1);
375 mul_add_c(3,1,c2,c3,c1);
376 mul_add_c(2,2,c2,c3,c1);
377 mul_add_c(1,3,c2,c3,c1);
378 mul_add_c(0,4,c2,c3,c1);
379 stg c2,4*8(%r2)
380 lghi c2,0
381
382 mul_add_c(0,5,c3,c1,c2);
383 mul_add_c(1,4,c3,c1,c2);
384 mul_add_c(2,3,c3,c1,c2);
385 mul_add_c(3,2,c3,c1,c2);
386 mul_add_c(4,1,c3,c1,c2);
387 mul_add_c(5,0,c3,c1,c2);
388 stg c3,5*8(%r2)
389 lghi c3,0
390
391 mul_add_c(6,0,c1,c2,c3);
392 mul_add_c(5,1,c1,c2,c3);
393 mul_add_c(4,2,c1,c2,c3);
394 mul_add_c(3,3,c1,c2,c3);
395 mul_add_c(2,4,c1,c2,c3);
396 mul_add_c(1,5,c1,c2,c3);
397 mul_add_c(0,6,c1,c2,c3);
398 stg c1,6*8(%r2)
399 lghi c1,0
400
401 mul_add_c(0,7,c2,c3,c1);
402 mul_add_c(1,6,c2,c3,c1);
403 mul_add_c(2,5,c2,c3,c1);
404 mul_add_c(3,4,c2,c3,c1);
405 mul_add_c(4,3,c2,c3,c1);
406 mul_add_c(5,2,c2,c3,c1);
407 mul_add_c(6,1,c2,c3,c1);
408 mul_add_c(7,0,c2,c3,c1);
409 stg c2,7*8(%r2)
410 lghi c2,0
411
412 mul_add_c(7,1,c3,c1,c2);
413 mul_add_c(6,2,c3,c1,c2);
414 mul_add_c(5,3,c3,c1,c2);
415 mul_add_c(4,4,c3,c1,c2);
416 mul_add_c(3,5,c3,c1,c2);
417 mul_add_c(2,6,c3,c1,c2);
418 mul_add_c(1,7,c3,c1,c2);
419 stg c3,8*8(%r2)
420 lghi c3,0
421
422 mul_add_c(2,7,c1,c2,c3);
423 mul_add_c(3,6,c1,c2,c3);
424 mul_add_c(4,5,c1,c2,c3);
425 mul_add_c(5,4,c1,c2,c3);
426 mul_add_c(6,3,c1,c2,c3);
427 mul_add_c(7,2,c1,c2,c3);
428 stg c1,9*8(%r2)
429 lghi c1,0
430
431 mul_add_c(7,3,c2,c3,c1);
432 mul_add_c(6,4,c2,c3,c1);
433 mul_add_c(5,5,c2,c3,c1);
434 mul_add_c(4,6,c2,c3,c1);
435 mul_add_c(3,7,c2,c3,c1);
436 stg c2,10*8(%r2)
437 lghi c2,0
438
439 mul_add_c(4,7,c3,c1,c2);
440 mul_add_c(5,6,c3,c1,c2);
441 mul_add_c(6,5,c3,c1,c2);
442 mul_add_c(7,4,c3,c1,c2);
443 stg c3,11*8(%r2)
444 lghi c3,0
445
446 mul_add_c(7,5,c1,c2,c3);
447 mul_add_c(6,6,c1,c2,c3);
448 mul_add_c(5,7,c1,c2,c3);
449 stg c1,12*8(%r2)
450 lghi c1,0
451
452
453 mul_add_c(6,7,c2,c3,c1);
454 mul_add_c(7,6,c2,c3,c1);
455 stg c2,13*8(%r2)
456 lghi c2,0
457
458 mul_add_c(7,7,c3,c1,c2);
459 stg c3,14*8(%r2)
460 stg c1,15*8(%r2)
461
462 lmg %r6,%r8,48(%r15)
463 br %r14
464.size bn_mul_comba8,.-bn_mul_comba8
465
466// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
467.globl bn_mul_comba4
468.type bn_mul_comba4,@function
469.align 4
470bn_mul_comba4:
471 stmg %r6,%r8,48(%r15) // save %r6-%r8: %r6/%r7 are used by mul_add_c, %r8 is c3
472
473 lghi c1,0
474 lghi c2,0
475 lghi c3,0
476 lghi zero,0
477
478 mul_add_c(0,0,c1,c2,c3);
479 stg c1,0*8(%r2) // r[0]: store through the result pointer %r2, not the input pointer %r3
480 lghi c1,0
481
482 mul_add_c(0,1,c2,c3,c1);
483 mul_add_c(1,0,c2,c3,c1);
484 stg c2,1*8(%r2) // r[1]
485 lghi c2,0
486
487 mul_add_c(2,0,c3,c1,c2);
488 mul_add_c(1,1,c3,c1,c2);
489 mul_add_c(0,2,c3,c1,c2);
490 stg c3,2*8(%r2) // r[2]
491 lghi c3,0
492
493 mul_add_c(0,3,c1,c2,c3);
494 mul_add_c(1,2,c1,c2,c3);
495 mul_add_c(2,1,c1,c2,c3);
496 mul_add_c(3,0,c1,c2,c3);
497 stg c1,3*8(%r2) // r[3]
498 lghi c1,0
499
500 mul_add_c(3,1,c2,c3,c1);
501 mul_add_c(2,2,c2,c3,c1);
502 mul_add_c(1,3,c2,c3,c1);
503 stg c2,4*8(%r2) // r[4]
504 lghi c2,0
505
506 mul_add_c(2,3,c3,c1,c2);
507 mul_add_c(3,2,c3,c1,c2);
508 stg c3,5*8(%r2) // r[5]
509 lghi c3,0
510
511 mul_add_c(3,3,c1,c2,c3);
512 stg c1,6*8(%r2) // r[6]
513 stg c2,7*8(%r2) // r[7]
514
515 lmg %r6,%r8,48(%r15) // restore %r6-%r8 (a store here would leave the caller's registers clobbered)
516 br %r14
517.size bn_mul_comba4,.-bn_mul_comba4
518
519#define sqr_add_c(ai,c1,c2,c3) \
520 lg %r7,ai*8(%r3); \
521 mlgr %r6,%r7; \
522 algr c1,%r7; \
523 alcgr c2,%r6; \
524 alcgr c3,zero
525
526#define sqr_add_c2(ai,aj,c1,c2,c3) \
527 lg %r7,ai*8(%r3); \
528 mlg %r6,aj*8(%r3); \
529 algr c1,%r7; \
530 alcgr c2,%r6; \
531 alcgr c3,zero; \
532 algr c1,%r7; \
533 alcgr c2,%r6; \
534 alcgr c3,zero
535
536// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
537.globl bn_sqr_comba8
538.type bn_sqr_comba8,@function
539.align 4
540bn_sqr_comba8:
541 stmg %r6,%r8,48(%r15)
542
543 lghi c1,0
544 lghi c2,0
545 lghi c3,0
546 lghi zero,0
547
548 sqr_add_c(0,c1,c2,c3);
549 stg c1,0*8(%r2)
550 lghi c1,0
551
552 sqr_add_c2(1,0,c2,c3,c1);
553 stg c2,1*8(%r2)
554 lghi c2,0
555
556 sqr_add_c(1,c3,c1,c2);
557 sqr_add_c2(2,0,c3,c1,c2);
558 stg c3,2*8(%r2)
559 lghi c3,0
560
561 sqr_add_c2(3,0,c1,c2,c3);
562 sqr_add_c2(2,1,c1,c2,c3);
563 stg c1,3*8(%r2)
564 lghi c1,0
565
566 sqr_add_c(2,c2,c3,c1);
567 sqr_add_c2(3,1,c2,c3,c1);
568 sqr_add_c2(4,0,c2,c3,c1);
569 stg c2,4*8(%r2)
570 lghi c2,0
571
572 sqr_add_c2(5,0,c3,c1,c2);
573 sqr_add_c2(4,1,c3,c1,c2);
574 sqr_add_c2(3,2,c3,c1,c2);
575 stg c3,5*8(%r2)
576 lghi c3,0
577
578 sqr_add_c(3,c1,c2,c3);
579 sqr_add_c2(4,2,c1,c2,c3);
580 sqr_add_c2(5,1,c1,c2,c3);
581 sqr_add_c2(6,0,c1,c2,c3);
582 stg c1,6*8(%r2)
583 lghi c1,0
584
585 sqr_add_c2(7,0,c2,c3,c1);
586 sqr_add_c2(6,1,c2,c3,c1);
587 sqr_add_c2(5,2,c2,c3,c1);
588 sqr_add_c2(4,3,c2,c3,c1);
589 stg c2,7*8(%r2)
590 lghi c2,0
591
592 sqr_add_c(4,c3,c1,c2);
593 sqr_add_c2(5,3,c3,c1,c2);
594 sqr_add_c2(6,2,c3,c1,c2);
595 sqr_add_c2(7,1,c3,c1,c2);
596 stg c3,8*8(%r2)
597 lghi c3,0
598
599 sqr_add_c2(7,2,c1,c2,c3);
600 sqr_add_c2(6,3,c1,c2,c3);
601 sqr_add_c2(5,4,c1,c2,c3);
602 stg c1,9*8(%r2)
603 lghi c1,0
604
605 sqr_add_c(5,c2,c3,c1);
606 sqr_add_c2(6,4,c2,c3,c1);
607 sqr_add_c2(7,3,c2,c3,c1);
608 stg c2,10*8(%r2)
609 lghi c2,0
610
611 sqr_add_c2(7,4,c3,c1,c2);
612 sqr_add_c2(6,5,c3,c1,c2);
613 stg c3,11*8(%r2)
614 lghi c3,0
615
616 sqr_add_c(6,c1,c2,c3);
617 sqr_add_c2(7,5,c1,c2,c3);
618 stg c1,12*8(%r2)
619 lghi c1,0
620
621 sqr_add_c2(7,6,c2,c3,c1);
622 stg c2,13*8(%r2)
623 lghi c2,0
624
625 sqr_add_c(7,c3,c1,c2);
626 stg c3,14*8(%r2)
627 stg c1,15*8(%r2)
628
629 lmg %r6,%r8,48(%r15)
630 br %r14
631.size bn_sqr_comba8,.-bn_sqr_comba8
632
633// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
634.globl bn_sqr_comba4
635.type bn_sqr_comba4,@function
636.align 4
637bn_sqr_comba4:
638 stmg %r6,%r8,48(%r15)
639
640 lghi c1,0
641 lghi c2,0
642 lghi c3,0
643 lghi zero,0
644
645 sqr_add_c(0,c1,c2,c3);
646 stg c1,0*8(%r2)
647 lghi c1,0
648
649 sqr_add_c2(1,0,c2,c3,c1);
650 stg c2,1*8(%r2)
651 lghi c2,0
652
653 sqr_add_c(1,c3,c1,c2);
654 sqr_add_c2(2,0,c3,c1,c2);
655 stg c3,2*8(%r2)
656 lghi c3,0
657
658 sqr_add_c2(3,0,c1,c2,c3);
659 sqr_add_c2(2,1,c1,c2,c3);
660 stg c1,3*8(%r2)
661 lghi c1,0
662
663 sqr_add_c(2,c2,c3,c1);
664 sqr_add_c2(3,1,c2,c3,c1);
665 stg c2,4*8(%r2)
666 lghi c2,0
667
668 sqr_add_c2(3,2,c3,c1,c2);
669 stg c3,5*8(%r2)
670 lghi c3,0
671
672 sqr_add_c(3,c1,c2,c3);
673 stg c1,6*8(%r2)
674 stg c2,7*8(%r2)
675
676 lmg %r6,%r8,48(%r15)
677 br %r14
678.size bn_sqr_comba4,.-bn_sqr_comba4
diff --git a/src/lib/libcrypto/camellia/camellia.c b/src/lib/libcrypto/camellia/camellia.c
index cb577798a8..5f754ff78b 100644
--- a/src/lib/libcrypto/camellia/camellia.c
+++ b/src/lib/libcrypto/camellia/camellia.c
@@ -1,4 +1,4 @@
1/* $OpenBSD: camellia.c,v 1.10 2014/11/19 11:37:52 bcook Exp $ */ 1/* $OpenBSD: camellia.c,v 1.11 2016/09/04 14:31:29 jsing Exp $ */
2/* ==================================================================== 2/* ====================================================================
3 * Copyright 2006 NTT (Nippon Telegraph and Telephone Corporation) . 3 * Copyright 2006 NTT (Nippon Telegraph and Telephone Corporation) .
4 * ALL RIGHTS RESERVED. 4 * ALL RIGHTS RESERVED.
@@ -101,11 +101,6 @@
101 defined(__powerpc) || defined(__ppc__) || defined(__powerpc64__) 101 defined(__powerpc) || defined(__ppc__) || defined(__powerpc64__)
102# define LeftRotate(x,s) ({u32 ret; asm ("rlwinm %0,%1,%2,0,31":"=r"(ret):"r"(x),"I"(s)); ret; }) 102# define LeftRotate(x,s) ({u32 ret; asm ("rlwinm %0,%1,%2,0,31":"=r"(ret):"r"(x),"I"(s)); ret; })
103# define RightRotate(x,s) LeftRotate(x,(32-s)) 103# define RightRotate(x,s) LeftRotate(x,(32-s))
104# elif defined(__s390x__)
105# define LeftRotate(x,s) ({u32 ret; asm ("rll %0,%1,%2":"=r"(ret):"r"(x),"I"(s)); ret; })
106# define RightRotate(x,s) LeftRotate(x,(32-s))
107# define GETU32(p) (*(u32 *)(p))
108# define PUTU32(p,v) (*(u32 *)(p)=(v))
109# endif 104# endif
110# endif 105# endif
111#endif 106#endif
diff --git a/src/lib/libcrypto/md32_common.h b/src/lib/libcrypto/md32_common.h
index 7b6a354478..bad34d22d6 100644
--- a/src/lib/libcrypto/md32_common.h
+++ b/src/lib/libcrypto/md32_common.h
@@ -1,4 +1,4 @@
1/* $OpenBSD: md32_common.h,v 1.20 2014/11/09 19:08:24 miod Exp $ */ 1/* $OpenBSD: md32_common.h,v 1.21 2016/09/04 14:31:29 jsing Exp $ */
2/* ==================================================================== 2/* ====================================================================
3 * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved. 3 * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved.
4 * 4 *
@@ -168,10 +168,6 @@ static inline uint32_t ROTATE(uint32_t a, uint32_t n)
168 *((unsigned int *)(c))=r; (c)+=4; }) 168 *((unsigned int *)(c))=r; (c)+=4; })
169# endif 169# endif
170#endif 170#endif
171#if defined(__s390__) || defined(__s390x__)
172# define HOST_c2l(c,l) ((l)=*((const unsigned int *)(c)), (c)+=4)
173# define HOST_l2c(l,c) (*((unsigned int *)(c))=(l), (c)+=4)
174#endif
175 171
176#ifndef HOST_c2l 172#ifndef HOST_c2l
177#define HOST_c2l(c,l) do {l =(((unsigned long)(*((c)++)))<<24); \ 173#define HOST_c2l(c,l) do {l =(((unsigned long)(*((c)++)))<<24); \
@@ -190,16 +186,6 @@ static inline uint32_t ROTATE(uint32_t a, uint32_t n)
190 186
191#elif defined(DATA_ORDER_IS_LITTLE_ENDIAN) 187#elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
192 188
193#if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
194# if defined(__s390x__)
195# define HOST_c2l(c,l) ({ asm ("lrv %0,%1" \
196 :"=d"(l) :"m"(*(const unsigned int *)(c)));\
197 (c)+=4; })
198# define HOST_l2c(l,c) ({ asm ("strv %1,%0" \
199 :"=m"(*(unsigned int *)(c)) :"d"(l));\
200 (c)+=4; })
201# endif
202#endif
203#if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__) 189#if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
204# define HOST_c2l(c,l) ((l)=*((const unsigned int *)(c)), (c)+=4) 190# define HOST_c2l(c,l) ((l)=*((const unsigned int *)(c)), (c)+=4)
205# define HOST_l2c(l,c) (*((unsigned int *)(c))=(l), (c)+=4) 191# define HOST_l2c(l,c) (*((unsigned int *)(c))=(l), (c)+=4)
diff --git a/src/lib/libcrypto/modes/asm/ghash-s390x.pl b/src/lib/libcrypto/modes/asm/ghash-s390x.pl
deleted file mode 100644
index 6a40d5d89c..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-s390x.pl
+++ /dev/null
@@ -1,262 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# September 2010.
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Performance
15# was measured to be ~18 cycles per processed byte on z10, which is
16# almost 40% better than gcc-generated code. It should be noted that
17# 18 cycles is a worse result than expected: the loop is scheduled for
18# 12 and the result should be close to 12. In the absence of instruction-
19# level profiling data it's impossible to tell why...
20
21# November 2010.
22#
23# Adapt for -m31 build. If the kernel supports what's called the "highgprs"
24# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
25# instructions and achieve "64-bit" performance even in 31-bit legacy
26# application context. The feature is not specific to any particular
27# processor, as long as it's "z-CPU". The latter implies that the code
28# remains z/Architecture specific. On z990 it was measured to perform
29# 2.8x better than 32-bit code generated by gcc 4.3.
30
31# March 2011.
32#
33# Support for hardware KIMD-GHASH is verified to produce correct
34# result and therefore is engaged. On z196 it was measured to process
35# 8KB buffer ~7 times faster than software implementation. It's not as
36# impressive for smaller buffer sizes and for smallest 16-bytes buffer
37# it's actually almost 2 times slower. Which is the reason why
38# KIMD-GHASH is not used in gcm_gmult_4bit.
39
# Command-line handling and register map for the GHASH template below.
# The first argument is the build flavour; a flavour matching /3[12]/
# selects 4-byte pointers and an empty instruction suffix, anything else
# selects 8-byte pointers and the "g" suffix (used as stm${g}/lm${g}).
40$flavour = shift;
41
42if ($flavour =~ /3[12]/) {
43 $SIZE_T=4;
44 $g="";
45} else {
46 $SIZE_T=8;
47 $g="g";
48}
49
# Consume further arguments until one looks like "name.ext" (or the
# arguments run out) and redirect STDOUT to it.
# NOTE(review): the result of open is not checked.
50while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
51open STDOUT,">$output";
52
# When 0, the KIMD hardware path is emitted at the top of gcm_ghash_4bit.
53$softonly=0;
54
# Two 64-bit halves of the value being multiplied/accumulated.
55$Zhi="%r0";
56$Zlo="%r1";
57
58$Xi="%r2"; # argument block
59$Htbl="%r3";
60$inp="%r4";
61$len="%r5";
62
63$rem0="%r6"; # variables
64$rem1="%r7";
65$nlo="%r8";
66$nhi="%r9";
67$xi="%r10";
68$cnt="%r11";
69$tmp="%r12";
70$x78="%r13"; # loaded with the mask 0xf<<3 by the generated code
71$rem_4bit="%r14"; # loaded with the address of the rem_4bit table
72
73$sp="%r15";
74
75$code.=<<___;
76.text
77
78.globl gcm_gmult_4bit
79.align 32
80gcm_gmult_4bit:
81___
82$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
83 larl %r1,OPENSSL_s390xcap_P
84 lg %r0,0(%r1)
85 tmhl %r0,0x4000 # check for message-security-assist
86 jz .Lsoft_gmult
87 lghi %r0,0
88 la %r1,16($sp)
89 .long 0xb93e0004 # kimd %r0,%r4
90 lg %r1,24($sp)
91 tmhh %r1,0x4000 # check for function 65
92 jz .Lsoft_gmult
93 stg %r0,16($sp) # arrange 16 bytes of zero input
94 stg %r0,24($sp)
95 lghi %r0,65 # function 65
96 la %r1,0($Xi) # H lies right after Xi in gcm128_context
97 la $inp,16($sp)
98 lghi $len,16
99 .long 0xb93e0004 # kimd %r0,$inp
100 brc 1,.-4 # pay attention to "partial completion"
101 br %r14
102.align 32
103.Lsoft_gmult:
104___
105$code.=<<___;
106 stm${g} %r6,%r14,6*$SIZE_T($sp)
107
108 aghi $Xi,-1
109 lghi $len,1
110 lghi $x78,`0xf<<3`
111 larl $rem_4bit,rem_4bit
112
113 lg $Zlo,8+1($Xi) # Xi
114 j .Lgmult_shortcut
115.type gcm_gmult_4bit,\@function
116.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
117
118.globl gcm_ghash_4bit
119.align 32
120gcm_ghash_4bit:
121___
122$code.=<<___ if(!$softonly);
123 larl %r1,OPENSSL_s390xcap_P
124 lg %r0,0(%r1)
125 tmhl %r0,0x4000 # check for message-security-assist
126 jz .Lsoft_ghash
127 lghi %r0,0
128 la %r1,16($sp)
129 .long 0xb93e0004 # kimd %r0,%r4
130 lg %r1,24($sp)
131 tmhh %r1,0x4000 # check for function 65
132 jz .Lsoft_ghash
133 lghi %r0,65 # function 65
134 la %r1,0($Xi) # H lies right after Xi in gcm128_context
135 .long 0xb93e0004 # kimd %r0,$inp
136 brc 1,.-4 # pay attention to "partial completion"
137 br %r14
138.align 32
139.Lsoft_ghash:
140___
141$code.=<<___ if ($flavour =~ /3[12]/);
142 llgfr $len,$len
143___
144$code.=<<___;
145 stm${g} %r6,%r14,6*$SIZE_T($sp)
146
147 aghi $Xi,-1
148 srlg $len,$len,4
149 lghi $x78,`0xf<<3`
150 larl $rem_4bit,rem_4bit
151
152 lg $Zlo,8+1($Xi) # Xi
153 lg $Zhi,0+1($Xi)
154 lghi $tmp,0
155.Louter:
156 xg $Zhi,0($inp) # Xi ^= inp
157 xg $Zlo,8($inp)
158 xgr $Zhi,$tmp
159 stg $Zlo,8+1($Xi)
160 stg $Zhi,0+1($Xi)
161
162.Lgmult_shortcut:
163 lghi $tmp,0xf0
164 sllg $nlo,$Zlo,4
165 srlg $xi,$Zlo,8 # extract second byte
166 ngr $nlo,$tmp
167 lgr $nhi,$Zlo
168 lghi $cnt,14
169 ngr $nhi,$tmp
170
171 lg $Zlo,8($nlo,$Htbl)
172 lg $Zhi,0($nlo,$Htbl)
173
174 sllg $nlo,$xi,4
175 sllg $rem0,$Zlo,3
176 ngr $nlo,$tmp
177 ngr $rem0,$x78
178 ngr $xi,$tmp
179
180 sllg $tmp,$Zhi,60
181 srlg $Zlo,$Zlo,4
182 srlg $Zhi,$Zhi,4
183 xg $Zlo,8($nhi,$Htbl)
184 xg $Zhi,0($nhi,$Htbl)
185 lgr $nhi,$xi
186 sllg $rem1,$Zlo,3
187 xgr $Zlo,$tmp
188 ngr $rem1,$x78
189 j .Lghash_inner
190.align 16
191.Lghash_inner:
192 srlg $Zlo,$Zlo,4
193 sllg $tmp,$Zhi,60
194 xg $Zlo,8($nlo,$Htbl)
195 srlg $Zhi,$Zhi,4
196 llgc $xi,0($cnt,$Xi)
197 xg $Zhi,0($nlo,$Htbl)
198 sllg $nlo,$xi,4
199 xg $Zhi,0($rem0,$rem_4bit)
200 nill $nlo,0xf0
201 sllg $rem0,$Zlo,3
202 xgr $Zlo,$tmp
203 ngr $rem0,$x78
204 nill $xi,0xf0
205
206 sllg $tmp,$Zhi,60
207 srlg $Zlo,$Zlo,4
208 srlg $Zhi,$Zhi,4
209 xg $Zlo,8($nhi,$Htbl)
210 xg $Zhi,0($nhi,$Htbl)
211 lgr $nhi,$xi
212 xg $Zhi,0($rem1,$rem_4bit)
213 sllg $rem1,$Zlo,3
214 xgr $Zlo,$tmp
215 ngr $rem1,$x78
216 brct $cnt,.Lghash_inner
217
218 sllg $tmp,$Zhi,60
219 srlg $Zlo,$Zlo,4
220 srlg $Zhi,$Zhi,4
221 xg $Zlo,8($nlo,$Htbl)
222 xg $Zhi,0($nlo,$Htbl)
223 sllg $xi,$Zlo,3
224 xg $Zhi,0($rem0,$rem_4bit)
225 xgr $Zlo,$tmp
226 ngr $xi,$x78
227
228 sllg $tmp,$Zhi,60
229 srlg $Zlo,$Zlo,4
230 srlg $Zhi,$Zhi,4
231 xg $Zlo,8($nhi,$Htbl)
232 xg $Zhi,0($nhi,$Htbl)
233 xgr $Zlo,$tmp
234 xg $Zhi,0($rem1,$rem_4bit)
235
236 lg $tmp,0($xi,$rem_4bit)
237 la $inp,16($inp)
238 sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
239 brctg $len,.Louter
240
241 xgr $Zhi,$tmp
242 stg $Zlo,8+1($Xi)
243 stg $Zhi,0+1($Xi)
244 lm${g} %r6,%r14,6*$SIZE_T($sp)
245 br %r14
246.type gcm_ghash_4bit,\@function
247.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
248
249.align 64
250rem_4bit:
251 .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
252 .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
253 .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
254 .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
255.type rem_4bit,\@object
256.size rem_4bit,(.-rem_4bit)
257.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
258___
259
# Replace every backtick-quoted expression in the template (for example
# `0xf<<3`) with the result of evaluating it as Perl, then emit the
# finished assembly on STDOUT.
260$code =~ s/\`([^\`]*)\`/eval $1/gem;
261print $code;
262close STDOUT;
diff --git a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl
deleted file mode 100644
index 1aa754820c..0000000000
--- a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl
+++ /dev/null
@@ -1,234 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# February 2009
11#
12# Performance is 2x of gcc 3.4.6 on z10. Coding "secret" is to
13# "cluster" Address Generation Interlocks, so that one pipeline stall
14# resolves several dependencies.
15
16# November 2010.
17#
18# Adapt for -m31 build. If the kernel supports what's called the "highgprs"
19# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
20# instructions and achieve "64-bit" performance even in 31-bit legacy
21# application context. The feature is not specific to any particular
22# processor, as long as it's "z-CPU". The latter implies that the code
23# remains z/Architecture specific. On z990 it was measured to perform
24# 50% better than code generated by gcc 4.3.
25
# Command-line handling shared by the RC4 templates below.
# A flavour matching /3[12]/ selects 4-byte pointers and an empty
# instruction suffix; anything else selects 8-byte pointers and the "g"
# suffix (used as stm${g}/lm${g} when saving and restoring registers).
26$flavour = shift;
27
28if ($flavour =~ /3[12]/) {
29 $SIZE_T=4;
30 $g="";
31} else {
32 $SIZE_T=8;
33 $g="g";
34}
35
# Consume further arguments until one looks like "name.ext" (or the
# arguments run out) and redirect STDOUT to it.
# NOTE(review): the result of open is not checked.
36while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
37open STDOUT,">$output";
38
39$rp="%r14"; # register used by the generated "br" to return
40$sp="%r15"; # base register for the register save area
41$code=<<___;
42.text
43
44___
45
46# void RC4(RC4_KEY *key,size_t len,const void *inp,void *out)
47{
48$acc="%r0";
49$cnt="%r1";
50$key="%r2";
51$len="%r3";
52$inp="%r4";
53$out="%r5";
54
55@XX=("%r6","%r7");
56@TX=("%r8","%r9");
57$YY="%r10";
58$TY="%r11";
59
60$code.=<<___;
61.globl RC4
62.type RC4,\@function
63.align 64
64RC4:
65 stm${g} %r6,%r11,6*$SIZE_T($sp)
66___
67$code.=<<___ if ($flavour =~ /3[12]/);
68 llgfr $len,$len
69___
70$code.=<<___;
71 llgc $XX[0],0($key)
72 llgc $YY,1($key)
73 la $XX[0],1($XX[0])
74 nill $XX[0],0xff
75 srlg $cnt,$len,3
76 ltgr $cnt,$cnt
77 llgc $TX[0],2($XX[0],$key)
78 jz .Lshort
79 j .Loop8
80
81.align 64
82.Loop8:
83___
# Unroll eight RC4 steps into the body of .Loop8. Each pass appends the
# code for one step and then rotates @TX and @XX, so consecutive steps
# alternate between the two registers of each pair.
84for ($i=0;$i<8;$i++) {
# Head of every step: YY = (YY + TX[0]) masked with 255, and
# XX[1] = (XX[0] + 1) masked with 255.
85$code.=<<___;
86 la $YY,0($YY,$TX[0]) # $i
87 nill $YY,255
88 la $XX[1],1($XX[0])
89 nill $XX[1],255
90___
# The byte selected by the previous step's TY is gathered here: step 1
# starts $acc with it, steps 2..7 shift $acc left by 8 and insert it.
# Step 0 gathers nothing; the code emitted after this loop inserts the
# byte belonging to the last step.
91$code.=<<___ if ($i==1);
92 llgc $acc,2($TY,$key)
93___
94$code.=<<___ if ($i>1);
95 sllg $acc,$acc,8
96 ic $acc,2($TY,$key)
97___
# Swap the table bytes at indices XX[0] and YY, preload TX[1] from index
# XX[1] (it is set to TX[0] instead when XX[1] equals YY), and compute
# TY = (TY + TX[0]) masked with 255 for the next gather.
98$code.=<<___;
99 llgc $TY,2($YY,$key)
100 stc $TX[0],2($YY,$key)
101 llgc $TX[1],2($XX[1],$key)
102 stc $TY,2($XX[0],$key)
103 cr $XX[1],$YY
104 jne .Lcmov$i
105 la $TX[1],0($TX[0])
106.Lcmov$i:
107 la $TY,0($TY,$TX[0])
108 nill $TY,255
109___
110push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
111}
112
113$code.=<<___;
114 lg $TX[1],0($inp)
115 sllg $acc,$acc,8
116 la $inp,8($inp)
117 ic $acc,2($TY,$key)
118 xgr $acc,$TX[1]
119 stg $acc,0($out)
120 la $out,8($out)
121 brctg $cnt,.Loop8
122
123.Lshort:
124 lghi $acc,7
125 ngr $len,$acc
126 jz .Lexit
127 j .Loop1
128
129.align 16
130.Loop1:
131 la $YY,0($YY,$TX[0])
132 nill $YY,255
133 llgc $TY,2($YY,$key)
134 stc $TX[0],2($YY,$key)
135 stc $TY,2($XX[0],$key)
136 ar $TY,$TX[0]
137 ahi $XX[0],1
138 nill $TY,255
139 nill $XX[0],255
140 llgc $acc,0($inp)
141 la $inp,1($inp)
142 llgc $TY,2($TY,$key)
143 llgc $TX[0],2($XX[0],$key)
144 xr $acc,$TY
145 stc $acc,0($out)
146 la $out,1($out)
147 brct $len,.Loop1
148
149.Lexit:
150 ahi $XX[0],-1
151 stc $XX[0],0($key)
152 stc $YY,1($key)
153 lm${g} %r6,%r11,6*$SIZE_T($sp)
154 br $rp
155.size RC4,.-RC4
156.string "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
157
158___
159}
160
161# void RC4_set_key(RC4_KEY *key,unsigned int len,const void *inp)
162{
163$cnt="%r0";
164$idx="%r1";
165$key="%r2";
166$len="%r3";
167$inp="%r4";
168$acc="%r5";
169$dat="%r6";
170$ikey="%r7";
171$iinp="%r8";
172
173$code.=<<___;
174.globl RC4_set_key
175.type RC4_set_key,\@function
176.align 64
177RC4_set_key:
178 stm${g} %r6,%r8,6*$SIZE_T($sp)
179 lhi $cnt,256
180 la $idx,0(%r0)
181 sth $idx,0($key)
182.align 4
183.L1stloop:
184 stc $idx,2($idx,$key)
185 la $idx,1($idx)
186 brct $cnt,.L1stloop
187
188 lghi $ikey,-256
189 lr $cnt,$len
190 la $iinp,0(%r0)
191 la $idx,0(%r0)
192.align 16
193.L2ndloop:
194 llgc $acc,2+256($ikey,$key)
195 llgc $dat,0($iinp,$inp)
196 la $idx,0($idx,$acc)
197 la $ikey,1($ikey)
198 la $idx,0($idx,$dat)
199 nill $idx,255
200 la $iinp,1($iinp)
201 tml $ikey,255
202 llgc $dat,2($idx,$key)
203 stc $dat,2+256-1($ikey,$key)
204 stc $acc,2($idx,$key)
205 jz .Ldone
206 brct $cnt,.L2ndloop
207 lr $cnt,$len
208 la $iinp,0(%r0)
209 j .L2ndloop
210.Ldone:
211 lm${g} %r6,%r8,6*$SIZE_T($sp)
212 br $rp
213.size RC4_set_key,.-RC4_set_key
214
215___
216}
217
218# const char *RC4_options()
219$code.=<<___;
220.globl RC4_options
221.type RC4_options,\@function
222.align 16
223RC4_options:
224 larl %r2,.Loptions
225 br %r14
226.size RC4_options,.-RC4_options
227.section .rodata
228.Loptions:
229.align 8
230.string "rc4(8x,char)"
231___
232
# Emit the accumulated template on STDOUT; no post-processing pass is
# applied to $code in this script.
233print $code;
234close STDOUT; # force flush
diff --git a/src/lib/libcrypto/s390xcap.c b/src/lib/libcrypto/s390xcap.c
deleted file mode 100644
index 6fc60f27f2..0000000000
--- a/src/lib/libcrypto/s390xcap.c
+++ /dev/null
@@ -1,43 +0,0 @@
1/* $OpenBSD: s390xcap.c,v 1.3 2014/06/12 15:49:27 deraadt Exp $ */
2#include <stdio.h>
3#include <stdlib.h>
4#include <string.h>
5#include <setjmp.h>
6#include <signal.h>
7
8extern unsigned long OPENSSL_s390xcap_P[];
9
/* Jump buffer armed by OPENSSL_cpuid_setup() before it probes the CPU. */
10static sigjmp_buf ill_jmp;
/*
 * Signal handler installed for SIGILL during the probe: abandons the
 * faulting call by jumping back to the sigsetjmp() point, passing the
 * signal number as the sigsetjmp() return value.
 */
11static void ill_handler (int sig)
12{
13 siglongjmp(ill_jmp, sig);
14}
15
16unsigned long OPENSSL_s390x_facilities(void);
17
/*
 * One-time CPU capability probe.
 *
 * Returns immediately if OPENSSL_s390xcap_P[0] is already non-zero.
 * Otherwise it marks the word with its most significant bit, temporarily
 * installs ill_handler for SIGILL with every other signal except SIGTRAP
 * blocked, calls OPENSSL_s390x_facilities() under sigsetjmp() protection,
 * and finally restores the previous SIGILL action and signal mask.
 */
18void
19OPENSSL_cpuid_setup(void)
20{
21 sigset_t oset;
22 struct sigaction ill_act, oact;
23
 /* Already probed (or probe in progress): nothing to do. */
24 if (OPENSSL_s390xcap_P[0])
25 return;
26
 /* Set the top bit so the early return above triggers on later calls. */
27 OPENSSL_s390xcap_P[0] = 1UL << (8*sizeof(unsigned long) - 1);
28
 /* Block everything but SIGILL and SIGTRAP while the probe runs. */
29 memset(&ill_act, 0, sizeof(ill_act));
30 ill_act.sa_handler = ill_handler;
31 sigfillset(&ill_act.sa_mask);
32 sigdelset(&ill_act.sa_mask, SIGILL);
33 sigdelset(&ill_act.sa_mask, SIGTRAP);
34 sigprocmask(SIG_SETMASK, &ill_act.sa_mask, &oset);
35 sigaction (SIGILL, &ill_act, &oact);
36
37 /* protection against missing store-facility-list-extended */
 /* A SIGILL in the call below lands back here with a non-zero value. */
38 if (sigsetjmp(ill_jmp, 1) == 0)
39 OPENSSL_s390x_facilities();
40
 /* Put the caller's SIGILL disposition and signal mask back. */
41 sigaction (SIGILL, &oact, NULL);
42 sigprocmask(SIG_SETMASK, &oset, NULL);
43}
diff --git a/src/lib/libcrypto/s390xcpuid.S b/src/lib/libcrypto/s390xcpuid.S
deleted file mode 100644
index 25adb646c4..0000000000
--- a/src/lib/libcrypto/s390xcpuid.S
+++ /dev/null
@@ -1,55 +0,0 @@
1.text
2
3.globl OPENSSL_s390x_facilities
4.type OPENSSL_s390x_facilities,@function
5.align 16
6OPENSSL_s390x_facilities: # fill OPENSSL_s390xcap_P via stfle and return its first doubleword in %r2
7 lghi %r0,0 # %r0 = 0 for the first stfle attempt
8 larl %r2,OPENSSL_s390xcap_P # %r2 = address of the capability vector
9 stg %r0,8(%r2) # zero the second doubleword of the vector
10 .long 0xb2b02000 # stfle 0(%r2)
11 brc 8,.Ldone # condition code 0: skip the retry
12 lghi %r0,1 # otherwise retry with %r0 = 1
13 .long 0xb2b02000 # stfle 0(%r2)
14.Ldone:
15 lg %r2,0(%r2) # return value = first doubleword of the vector
16 br %r14 # return to caller
17.size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities
18
19.globl OPENSSL_atomic_add
20.type OPENSSL_atomic_add,@function
21.align 16
22OPENSSL_atomic_add: # add %r3 to the 32-bit word at 0(%r2) with a compare-and-swap retry loop
23 l %r1,0(%r2) # %r1 = current value of the word
24.Lspin: lr %r0,%r1 # %r0 = copy of the value last seen
25 ar %r0,%r3 # %r0 = candidate new value
26 cs %r1,%r0,0(%r2) # compare and swap: store %r0 if the word still equals %r1
27 brc 4,.Lspin # condition code 1: swap did not happen, try again
28 lgfr %r2,%r0 # OpenSSL expects the new value
29 br %r14 # return to caller
30.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
31
32.globl OPENSSL_wipe_cpu
33.type OPENSSL_wipe_cpu,@function
34.align 16
35OPENSSL_wipe_cpu: # clear %r0, %r1, %r3, %r4 and %f0-%f7; return %r15 in %r2
36 xgr %r0,%r0 # XOR with itself leaves zero
37 xgr %r1,%r1
38 lgr %r2,%r15 # return value = current %r15
39 xgr %r3,%r3
40 xgr %r4,%r4
41 lzdr %f0 # load zero into each floating-point register
42 lzdr %f1
43 lzdr %f2
44 lzdr %f3
45 lzdr %f4
46 lzdr %f5
47 lzdr %f6
48 lzdr %f7
49 br %r14 # return to caller
50.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
51
52.section .init
53 brasl %r14,OPENSSL_cpuid_setup
54
55.comm OPENSSL_s390xcap_P,16,8
diff --git a/src/lib/libcrypto/sha/asm/sha1-s390x.pl b/src/lib/libcrypto/sha/asm/sha1-s390x.pl
deleted file mode 100644
index 9193dda45e..0000000000
--- a/src/lib/libcrypto/sha/asm/sha1-s390x.pl
+++ /dev/null
@@ -1,246 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA1 block procedure for s390x.
11
12# April 2007.
13#
14# Performance is >30% better than gcc 3.3 generated code. But the real
15# twist is that SHA1 hardware support is detected and utilized. In
16# which case performance can reach further >4.5x for larger chunks.
17
18# January 2009.
19#
20# Optimize Xupdate for amount of memory references and reschedule
21# instructions to favour dual-issue z10 pipeline. On z10 hardware is
22# "only" ~2.3x faster than software.
23
24# November 2010.
25#
26# Adapt for -m31 build. If the kernel supports what's called the "highgprs"
27# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
28# instructions and achieve "64-bit" performance even in 31-bit legacy
29# application context. The feature is not specific to any particular
30# processor, as long as it's "z-CPU". The latter implies that the code
31# remains z/Architecture specific.
32
# Configuration, command-line handling and register map for the SHA1
# template. A false $kimdfunc would omit the hardware (kimd) path that is
# emitted at the top of sha1_block_data_order.
33$kimdfunc=1; # magic function code for kimd instruction
34
35$flavour = shift;
36
# A flavour matching /3[12]/ selects 4-byte pointers and an empty
# instruction suffix; anything else selects 8-byte pointers and "g".
37if ($flavour =~ /3[12]/) {
38 $SIZE_T=4;
39 $g="";
40} else {
41 $SIZE_T=8;
42 $g="g";
43}
44
# Consume further arguments until one looks like "name.ext" (or the
# arguments run out) and redirect STDOUT to it.
# NOTE(review): the result of open is not checked.
45while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
46open STDOUT,">$output";
47
# $K names the constant register used by the round bodies; it starts as
# $K_00_39 and the main program switches it to $K_40_79 before round 40.
48$K_00_39="%r0"; $K=$K_00_39;
49$K_40_79="%r1";
# $prefetch shares %r2 with $ctx; the generated code saves $ctx on the
# stack on entry and reloads it after the rounds.
50$ctx="%r2"; $prefetch="%r2";
51$inp="%r3";
52$len="%r4";
53
54$A="%r5";
55$B="%r6";
56$C="%r7";
57$D="%r8";
58$E="%r9"; @V=($A,$B,$C,$D,$E);
59$t0="%r10";
60$t1="%r11";
# Rotating set of registers used by Xupdate for message-schedule words.
61@X=("%r12","%r13","%r14");
62$sp="%r15";
63
# $stdframe is the offset of the 16 four-byte schedule slots addressed by
# Xupdate; $frame is the total amount subtracted from the stack pointer.
64$stdframe=16*$SIZE_T+4*8;
65$frame=$stdframe+16*4;
66
# Xupdate($i): append to $code the message-schedule work for round $i.
#   $i == 15      emits a warm-up load of the slot at $stdframe into
#                 $prefetch and copies $X[2] into $X[0].
#   odd $i        returns after that; nothing else is emitted and @X is
#                 left as it is.
#   even $i < 16  loads 8 input bytes at offset $i*4 into $X[0] and puts
#                 the copy rotated by 32 bits into $X[1].
#   even $i >= 16 XORs $X[0] with $prefetch (both the old value and the
#                 one reloaded from slot ($i+2)%16) and with slot
#                 ($i+8)%16, then applies "rll ...,1" to each 32-bit half
#                 using rllg ...,32 to swap the halves in between.
#   even $i <= 70 stores $X[0] into slot $i%16 of the frame.
# On even rounds @X is finally rotated so its last element comes first.
67sub Xupdate {
68my $i=shift;
69
70$code.=<<___ if ($i==15);
71 lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up
72 lr $X[0],$X[2]
73___
74return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle
75$code.=<<___ if ($i<16);
76 lg $X[0],`$i*4`($inp) ### Xload($i)
77 rllg $X[1],$X[0],32
78___
79$code.=<<___ if ($i>=16);
80 xgr $X[0],$prefetch ### Xupdate($i)
81 lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp)
82 xg $X[0],`$stdframe+4*(($i+8)%16)`($sp)
83 xgr $X[0],$prefetch
84 rll $X[0],$X[0],1
85 rllg $X[1],$X[0],32
86 rll $X[1],$X[1],1
87 rllg $X[0],$X[1],32
88 lr $X[2],$X[1] # feedback
89___
90$code.=<<___ if ($i<=70);
91 stg $X[0],`$stdframe+4*($i%16)`($sp)
92___
93unshift(@X,pop(@X));
94}
95
# BODY_00_19($i,$a,$b,$c,$d,$e): append the code for one round in the
# range 0..19. The emitted instructions add to $e: the constant in $K,
# $a rotated left by 5, the schedule word in $xi, and
# (($d ^ $c) & $b) ^ $d; $b is rotated left by 30 in place.
96sub BODY_00_19 {
97my ($i,$a,$b,$c,$d,$e)=@_;
# Captured before the call below, which may rotate @X.
98my $xi=$X[1];
99
100 &Xupdate($i);
101$code.=<<___;
102 alr $e,$K ### $i
103 rll $t1,$a,5
104 lr $t0,$d
105 xr $t0,$c
106 alr $e,$t1
107 nr $t0,$b
108 alr $e,$xi
109 xr $t0,$d
110 rll $b,$b,30
111 alr $e,$t0
112___
113}
114
# BODY_20_39($i,$a,$b,$c,$d,$e): append the code for one round that uses
# the XOR function (the main program calls it for rounds 20..39 and
# 60..79). The emitted instructions add to $e: the constant in $K, $a
# rotated left by 5, the schedule word in $xi, and $b ^ $c ^ $d; $b is
# rotated left by 30 in place.
115sub BODY_20_39 {
116my ($i,$a,$b,$c,$d,$e)=@_;
# Captured before the call below, which may rotate @X.
117my $xi=$X[1];
118
119 &Xupdate($i);
120$code.=<<___;
121 alr $e,$K ### $i
122 rll $t1,$a,5
123 lr $t0,$b
124 alr $e,$t1
125 xr $t0,$c
126 alr $e,$xi
127 xr $t0,$d
128 rll $b,$b,30
129 alr $e,$t0
130___
131}
132
# BODY_40_59($i,$a,$b,$c,$d,$e): append the code for one round in the
# range 40..59. The emitted instructions add to $e: the constant in $K,
# $a rotated left by 5, the schedule word in $xi, and
# (($b | $c) & $d) | ($b & $c); $b is rotated left by 30 in place.
# $t1 is reused: first for the rotated $a, then for ($b & $c).
133sub BODY_40_59 {
134my ($i,$a,$b,$c,$d,$e)=@_;
# Captured before the call below, which may rotate @X.
135my $xi=$X[1];
136
137 &Xupdate($i);
138$code.=<<___;
139 alr $e,$K ### $i
140 rll $t1,$a,5
141 lr $t0,$b
142 alr $e,$t1
143 or $t0,$c
144 lr $t1,$b
145 nr $t0,$d
146 nr $t1,$c
147 alr $e,$xi
148 or $t0,$t1
149 rll $b,$b,30
150 alr $e,$t0
151___
152}
153
154$code.=<<___;
155.text
156.align 64
157.type Ktable,\@object
158Ktable: .long 0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6
159 .skip 48 #.long 0,0,0,0,0,0,0,0,0,0,0,0
160.size Ktable,.-Ktable
161.globl sha1_block_data_order
162.type sha1_block_data_order,\@function
163sha1_block_data_order:
164___
165$code.=<<___ if ($kimdfunc);
166 larl %r1,OPENSSL_s390xcap_P
167 lg %r0,0(%r1)
168 tmhl %r0,0x4000 # check for message-security assist
169 jz .Lsoftware
170 lghi %r0,0
171 la %r1,`2*$SIZE_T`($sp)
172 .long 0xb93e0002 # kimd %r0,%r2
173 lg %r0,`2*$SIZE_T`($sp)
174 tmhh %r0,`0x8000>>$kimdfunc`
175 jz .Lsoftware
176 lghi %r0,$kimdfunc
177 lgr %r1,$ctx
178 lgr %r2,$inp
179 sllg %r3,$len,6
180 .long 0xb93e0002 # kimd %r0,%r2
181 brc 1,.-4 # pay attention to "partial completion"
182 br %r14
183.align 16
184.Lsoftware:
185___
186$code.=<<___;
187 lghi %r1,-$frame
188 st${g} $ctx,`2*$SIZE_T`($sp)
189 stm${g} %r6,%r15,`6*$SIZE_T`($sp)
190 lgr %r0,$sp
191 la $sp,0(%r1,$sp)
192 st${g} %r0,0($sp)
193
194 larl $t0,Ktable
195 llgf $A,0($ctx)
196 llgf $B,4($ctx)
197 llgf $C,8($ctx)
198 llgf $D,12($ctx)
199 llgf $E,16($ctx)
200
201 lg $K_00_39,0($t0)
202 lg $K_40_79,8($t0)
203
204.Lloop:
205 rllg $K_00_39,$K_00_39,32
206___
207for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
208$code.=<<___;
209 rllg $K_00_39,$K_00_39,32
210___
211for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
212$code.=<<___; $K=$K_40_79;
213 rllg $K_40_79,$K_40_79,32
214___
215for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
216$code.=<<___;
217 rllg $K_40_79,$K_40_79,32
218___
219for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
220$code.=<<___;
221
222 l${g} $ctx,`$frame+2*$SIZE_T`($sp)
223 la $inp,64($inp)
224 al $A,0($ctx)
225 al $B,4($ctx)
226 al $C,8($ctx)
227 al $D,12($ctx)
228 al $E,16($ctx)
229 st $A,0($ctx)
230 st $B,4($ctx)
231 st $C,8($ctx)
232 st $D,12($ctx)
233 st $E,16($ctx)
234 brct${g} $len,.Lloop
235
236 lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
237 br %r14
238.size sha1_block_data_order,.-sha1_block_data_order
239.string "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
240.comm OPENSSL_s390xcap_P,16,8
241___
242
# Replace every backtick-quoted expression in the template (for example
# `2*$SIZE_T` after interpolation) with the result of evaluating it as
# Perl, then emit the finished assembly on STDOUT.
243$code =~ s/\`([^\`]*)\`/eval $1/gem;
244
245print $code;
246close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-s390x.pl b/src/lib/libcrypto/sha/asm/sha512-s390x.pl
deleted file mode 100644
index 079a3fc78a..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-s390x.pl
+++ /dev/null
@@ -1,322 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# SHA256/512 block procedures for s390x.
11
12# April 2007.
13#
14# sha256_block_data_order is reportedly >3 times faster than gcc 3.3
15# generated code (must be a bug in compiler, as improvement is
16# "pathologically" high, in particular in comparison to other SHA
17# modules). But the real twist is that it detects if hardware support
18# for SHA256 is available and in such case utilizes it. Then the
19# performance can reach >6.5x of assembler one for larger chunks.
20#
21# sha512_block_data_order is ~70% faster than gcc 3.3 generated code.
22
23# January 2009.
24#
25# Add support for hardware SHA512 and reschedule instructions to
26# favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
27# than software.
28
29# November 2010.
30#
31# Adapt for -m31 build. If the kernel supports what's called the "highgprs"
32# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
33# instructions and achieve "64-bit" performance even in 31-bit legacy
34# application context. The feature is not specific to any particular
35# processor, as long as it's "z-CPU". The latter implies that the code
36# remains z/Architecture specific. On z900 SHA256 was measured to
37# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3.
38
# Command-line handling, register map and per-variant parameters for the
# SHA256/SHA512 template. A flavour matching /3[12]/ selects 4-byte
# pointers and an empty instruction suffix; anything else selects 8-byte
# pointers and the "g" suffix.
39$flavour = shift;
40
41if ($flavour =~ /3[12]/) {
42 $SIZE_T=4;
43 $g="";
44} else {
45 $SIZE_T=8;
46 $g="g";
47}
48
49$t0="%r0";
50$t1="%r1";
# $t2 shares %r2 with $ctx; the generated code saves $ctx on the stack on
# entry and reloads it after the rounds.
51$ctx="%r2"; $t2="%r2";
52$inp="%r3";
53$len="%r4"; # used as index in inner loop
54
55$A="%r5";
56$B="%r6";
57$C="%r7";
58$D="%r8";
59$E="%r9";
60$F="%r10";
61$G="%r11";
62$H="%r12"; @V=($A,$B,$C,$D,$E,$F,$G,$H);
63$tbl="%r13";
64$T1="%r14";
65$sp="%r15";
66
# Consume further arguments until one looks like "name.ext" (or the
# arguments run out) and redirect STDOUT to it.
# NOTE(review): the result of open is not checked.
67while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
68open STDOUT,">$output";
69
# The variant is chosen from the output file name: a name containing
# "512" generates SHA512, anything else generates SHA256. The
# Sigma/sigma arrays are used as rotate-left and shift counts by the
# round bodies.
70if ($output =~ /512/) {
71 $label="512";
72 $SZ=8;
73 $LD="lg"; # load from memory
74 $ST="stg"; # store to memory
75 $ADD="alg"; # add with memory operand
76 $ROT="rllg"; # rotate left
77 $SHR="srlg"; # logical right shift [operands are fixed up by the srlg substitution at the end of the script]
78 @Sigma0=(25,30,36);
79 @Sigma1=(23,46,50);
80 @sigma0=(56,63, 7);
81 @sigma1=( 3,45, 6);
82 $rounds=80;
83 $kimdfunc=3; # 0 means unknown/unsupported/unimplemented/disabled
84} else {
85 $label="256";
86 $SZ=4;
87 $LD="llgf"; # load from memory
88 $ST="st"; # store to memory
89 $ADD="al"; # add with memory operand
90 $ROT="rll"; # rotate left
91 $SHR="srl"; # logical right shift
92 @Sigma0=(10,19,30);
93 @Sigma1=( 7,21,26);
94 @sigma0=(14,25, 3);
95 @sigma1=(13,15,10);
96 $rounds=64;
97 $kimdfunc=2; # magic function code for kimd instruction
98}
# Names of the generated function and constant table, and the stack
# layout: 16 schedule slots of $SZ bytes start at offset $stdframe.
99$Func="sha${label}_block_data_order";
100$Table="K${label}";
101$stdframe=16*$SIZE_T+4*8;
102$frame=$stdframe+16*$SZ;
# BODY_00_15($i,$a,...,$h): append the code for one compression round.
# For $i<16 the message word is first loaded from the input at offset
# $i*$SZ into $T1. The round then stores $T1 in schedule slot $i%16,
# accumulates h, Sigma1(e), Ch(e,f,g) and the table constant at
# $i*$SZ($len,$tbl) into $T1, adds $T1 to $d, and leaves
# Sigma0(a)+T1+Maj(a,b,c) in $h.
# Note: the lexical $g declared here is a working-register name and hides
# the file-level $g instruction suffix inside this sub.
104sub BODY_00_15 {
105my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
106
107$code.=<<___ if ($i<16);
108 $LD $T1,`$i*$SZ`($inp) ### $i
109___
110$code.=<<___;
111 $ROT $t0,$e,$Sigma1[0]
112 $ROT $t1,$e,$Sigma1[1]
113 lgr $t2,$f
114 xgr $t0,$t1
115 $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
116 xgr $t2,$g
117 $ST $T1,`$stdframe+$SZ*($i%16)`($sp)
118 xgr $t0,$t1 # Sigma1(e)
119 algr $T1,$h # T1+=h
120 ngr $t2,$e
121 lgr $t1,$a
122 algr $T1,$t0 # T1+=Sigma1(e)
123 $ROT $h,$a,$Sigma0[0]
124 xgr $t2,$g # Ch(e,f,g)
125 $ADD $T1,`$i*$SZ`($len,$tbl) # T1+=K[i]
126 $ROT $t0,$a,$Sigma0[1]
127 algr $T1,$t2 # T1+=Ch(e,f,g)
128 ogr $t1,$b
129 xgr $h,$t0
130 lgr $t2,$a
131 ngr $t1,$c
132 $ROT $t0,$t0,`$Sigma0[2]-$Sigma0[1]`
133 xgr $h,$t0 # h=Sigma0(a)
134 ngr $t2,$b
135 algr $h,$T1 # h+=T1
136 ogr $t2,$t1 # Maj(a,b,c)
137 algr $d,$T1 # d+=T1
138 algr $h,$t2 # h+=Maj(a,b,c)
139___
140}
# BODY_16_XX($i,$a,...,$h): append the code for one round with $i>=16.
# It first computes the new schedule word in $T1 from the stack slots
# ($i+1)%16, ($i+14)%16, $i%16 and ($i+9)%16 as
# sigma0(X[i+1]) + X[i] + X[i+9] + sigma1(X[i+14]), then hands the same
# arguments to BODY_00_15, which stores $T1 back into slot $i%16 and
# performs the round (its input load is skipped because $i>=16).
# The two-operand "$SHR reg,count" form is valid as written for srl; for
# srlg it relies on the substitution applied to $code at the end of the
# script.
142sub BODY_16_XX {
143my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
144
145$code.=<<___;
146 $LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i
147 $LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
148 $ROT $t0,$T1,$sigma0[0]
149 $SHR $T1,$sigma0[2]
150 $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]`
151 xgr $T1,$t0
152 $ROT $t0,$t1,$sigma1[0]
153 xgr $T1,$t2 # sigma0(X[i+1])
154 $SHR $t1,$sigma1[2]
155 $ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i]
156 xgr $t1,$t0
157 $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]`
158 $ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9]
159 xgr $t1,$t0 # sigma1(X[i+14])
160 algr $T1,$t1 # +=sigma1(X[i+14])
161___
162 &BODY_00_15(@_);
163}
165$code.=<<___;
166.text
167.align 64
168.type $Table,\@object
169$Table:
170___
171$code.=<<___ if ($SZ==4);
172 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
173 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
174 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
175 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
176 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
177 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
178 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
179 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
180 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
181 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
182 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
183 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
184 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
185 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
186 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
187 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
188___
189$code.=<<___ if ($SZ==8);
190 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
191 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
192 .quad 0x3956c25bf348b538,0x59f111f1b605d019
193 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
194 .quad 0xd807aa98a3030242,0x12835b0145706fbe
195 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
196 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
197 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
198 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
199 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
200 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
201 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
202 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
203 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
204 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
205 .quad 0x06ca6351e003826f,0x142929670a0e6e70
206 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
207 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
208 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
209 .quad 0x81c2c92e47edaee6,0x92722c851482353b
210 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
211 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
212 .quad 0xd192e819d6ef5218,0xd69906245565a910
213 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
214 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
215 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
216 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
217 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
218 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
219 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
220 .quad 0x90befffa23631e28,0xa4506cebde82bde9
221 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
222 .quad 0xca273eceea26619c,0xd186b8c721c0c207
223 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
224 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
225 .quad 0x113f9804bef90dae,0x1b710b35131c471b
226 .quad 0x28db77f523047d84,0x32caab7b40c72493
227 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
228 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
229 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
230___
231$code.=<<___;
232.size $Table,.-$Table
233.globl $Func
234.type $Func,\@function
235$Func:
236 sllg $len,$len,`log(16*$SZ)/log(2)`
237___
238$code.=<<___ if ($kimdfunc);
239 larl %r1,OPENSSL_s390xcap_P
240 lg %r0,0(%r1)
241 tmhl %r0,0x4000 # check for message-security assist
242 jz .Lsoftware
243 lghi %r0,0
244 la %r1,`2*$SIZE_T`($sp)
245 .long 0xb93e0002 # kimd %r0,%r2
246 lg %r0,`2*$SIZE_T`($sp)
247 tmhh %r0,`0x8000>>$kimdfunc`
248 jz .Lsoftware
249 lghi %r0,$kimdfunc
250 lgr %r1,$ctx
251 lgr %r2,$inp
252 lgr %r3,$len
253 .long 0xb93e0002 # kimd %r0,%r2
254 brc 1,.-4 # pay attention to "partial completion"
255 br %r14
256.align 16
257.Lsoftware:
258___
259$code.=<<___;
260 lghi %r1,-$frame
261 la $len,0($len,$inp)
262 stm${g} $ctx,%r15,`2*$SIZE_T`($sp)
263 lgr %r0,$sp
264 la $sp,0(%r1,$sp)
265 st${g} %r0,0($sp)
266
267 larl $tbl,$Table
268 $LD $A,`0*$SZ`($ctx)
269 $LD $B,`1*$SZ`($ctx)
270 $LD $C,`2*$SZ`($ctx)
271 $LD $D,`3*$SZ`($ctx)
272 $LD $E,`4*$SZ`($ctx)
273 $LD $F,`5*$SZ`($ctx)
274 $LD $G,`6*$SZ`($ctx)
275 $LD $H,`7*$SZ`($ctx)
276
277.Lloop:
278 lghi $len,0
279___
280for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
281$code.=".Lrounds_16_xx:\n";
282for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
283$code.=<<___;
284 aghi $len,`16*$SZ`
285 lghi $t0,`($rounds-16)*$SZ`
286 clgr $len,$t0
287 jne .Lrounds_16_xx
288
289 l${g} $ctx,`$frame+2*$SIZE_T`($sp)
290 la $inp,`16*$SZ`($inp)
291 $ADD $A,`0*$SZ`($ctx)
292 $ADD $B,`1*$SZ`($ctx)
293 $ADD $C,`2*$SZ`($ctx)
294 $ADD $D,`3*$SZ`($ctx)
295 $ADD $E,`4*$SZ`($ctx)
296 $ADD $F,`5*$SZ`($ctx)
297 $ADD $G,`6*$SZ`($ctx)
298 $ADD $H,`7*$SZ`($ctx)
299 $ST $A,`0*$SZ`($ctx)
300 $ST $B,`1*$SZ`($ctx)
301 $ST $C,`2*$SZ`($ctx)
302 $ST $D,`3*$SZ`($ctx)
303 $ST $E,`4*$SZ`($ctx)
304 $ST $F,`5*$SZ`($ctx)
305 $ST $G,`6*$SZ`($ctx)
306 $ST $H,`7*$SZ`($ctx)
307 cl${g} $inp,`$frame+4*$SIZE_T`($sp)
308 jne .Lloop
309
310 lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
311 br %r14
312.size $Func,.-$Func
313.string "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
314.comm OPENSSL_s390xcap_P,16,8
315___
316
# Post-process the template and emit it. First every backtick-quoted
# expression is replaced by the result of evaluating it as Perl.
317$code =~ s/\`([^\`]*)\`/eval $1/gem;
318# unlike 32-bit shift 64-bit one takes three arguments
# Rewrites "srlg %rN," as "srlg %rN,%rN," so the two-operand shifts
# written via $SHR get the register repeated as an extra operand.
319$code =~ s/(srlg\s+)(%r[0-9]+),/$1$2,$2,/gm;
320
321print $code;
322close STDOUT;